I am trying to replicate MATLAB's fft functionality, which performs a row-by-row (or column-by-column) FFT of a matrix. Each row would be one batch in the cuFFT plan.
I can get it working using cufftExecC2C (the commented-out part of the code below works), but not with cufftExecR2C. My code uses cufftPlan1d, but ideally I want to implement it using cufftPlanMany.
I am wondering what I'm doing wrong, and whether there is a better way of doing this. Thank you.
// linker -> input -> additional dependencies -> add 'cufft.lib'
// VC++ Directories -> include directories - > add 'C:\ProgramData\NVIDIA Corporation\CUDA Samples\v6.0\common\inc'
#include <stdio.h>
#include <stdlib.h>
#include <cufft.h>
#include <cuda_runtime.h>
#include <iostream>
#define NX 6
#define NY 5
void printArray(float *my_array);
void printComplexArray(float2 *my_array);
int main(){
/************************************************************ C2C ************************************************************/
/*  
    float2 *initial_array = (float2 *)malloc(sizeof(float2) * NX * NY);
    for (int h = 0; h < NX; h++){
        for (int w = 0; w < NY; w++){
            initial_array[NY * h + w].x = 0;
            initial_array[NY * h + w].y = 0;
        }
    }
    initial_array[NY*3 + 0].x = 1;
    initial_array[NY*5 + 0].x = 1;
    printComplexArray(initial_array);
    float2 *transformed_array= (float2 *)malloc(sizeof(float2) * NX * NY);
    cufftComplex *gpu_initial_array;
    cufftComplex *gpu_transformed_array;
    cudaMalloc((void **)&gpu_initial_array, NX*NY*sizeof(cufftComplex));
    cudaMalloc((void **)&gpu_transformed_array, NX*NY*sizeof(cufftComplex));
    cudaMemcpy(gpu_initial_array, initial_array, NX*NY*sizeof(float2), cudaMemcpyHostToDevice);
    cufftHandle plan;
    cufftPlan1d(&plan, NY, CUFFT_C2C, NX);
    cufftExecC2C(plan, gpu_initial_array, gpu_transformed_array, CUFFT_FORWARD);
    cudaMemcpy(transformed_array, gpu_transformed_array, NX*NY*sizeof(cufftComplex), cudaMemcpyDeviceToHost);
    printComplexArray(transformed_array);
*/
/************************************************************ C2C ************************************************************/
/************************************************************ R2C ************************************************************/
    float *initial_array = (float *)malloc(sizeof(float) * NX * NY);
    for (int h = 0; h < NX; h++){
        for (int w = 0; w < NY; w++)
            initial_array[NY * h + w] = 0;
    }
    initial_array[NY*3 + 0] = 1;
    printArray(initial_array);
    float2 *transformed_array= (float2 *)malloc(sizeof(float2) * (NY/2+1) * NX);
    cufftReal *gpu_initial_array;
    cufftComplex *gpu_transformed_array;
    cudaMalloc((void **)&gpu_initial_array, NX*NY*sizeof(cufftReal));
    cudaMalloc((void **)&gpu_transformed_array, (NY/2+1)*NX*sizeof(cufftComplex));
    cudaMemcpy(gpu_initial_array, initial_array, NX*NY*sizeof(float), cudaMemcpyHostToDevice);
    cufftHandle plan;
    cufftPlan1d(&plan, NY, CUFFT_R2C, NX);
    //                       ***** cufftPlanMany *****
    //int n[2] = {NX, NY};
    //cufftPlanMany(&plan,1,n,NULL,1,0,NULL,1,0,CUFFT_R2C,NX);
    cufftExecR2C(plan, gpu_initial_array, gpu_transformed_array);
    cudaMemcpy(transformed_array, gpu_transformed_array, NX*(NY/2+1)*sizeof(cufftComplex), cudaMemcpyDeviceToHost);
    printComplexArray(transformed_array);
/************************************************************ R2C ************************************************************/
    cufftDestroy(plan);
    free(initial_array);
    free(transformed_array);
    cudaFree(gpu_initial_array);
    cudaFree(gpu_transformed_array);
    std::system("pause");
    return 0;
}
// Writes a row-major NX x NY matrix of floats to std::cout.
// Each matrix row goes on its own line with every value followed by " | ";
// a blank line is emitted after the last row.
// my_array must point to at least NX * NY readable floats.
void printArray(float *my_array){
    const float *row = my_array;
    for (int r = 0; r != NX; ++r, row += NY){
        for (int c = 0; c != NY; ++c){
            std::cout << row[c] << " | ";
        }
        std::cout << std::endl;
    }
    std::cout << std::endl;
}
// Writes a row-major NX x NY matrix of float2 values to std::cout.
// Each element is printed as "<x> + <y> | ", one matrix row per line,
// followed by a blank line after the last row.
// my_array must point to at least NX * NY readable float2 elements; the row
// stride is fixed at NY.
void printComplexArray(float2 *my_array){
    const float2 *row = my_array;
    for (int r = 0; r != NX; ++r, row += NY){
        for (int c = 0; c != NY; ++c){
            const float2 &value = row[c];
            std::cout << value.x << " + " << value.y << " | ";
        }
        std::cout << std::endl;
    }
    std::cout << std::endl;
}
 
     
    