I have implemented a CUDA version of the inverse discrete cosine transform (IDCT) by "translating" the MATLAB built-in function idct.m into CUDA:
- My implementation is cuIDCT.cu; it works when m = n and both m and n are even numbers. (A sketch of the recipe it follows is given right after this list.)
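For reference, the per-column recipe the kernels in cuIDCT.cu implement (for even n) is roughly the MATLAB sketch below. This is my paraphrase of the idea, not a verbatim copy of idct.m, and the function name idct_even_sketch is made up for illustration:

function a = idct_even_sketch(b)
% Sketch only: IDCT of one real column b whose length n is even,
% computed with a single length-n inverse FFT.
n        = numel(b);
b        = b(:);                                    % work on a column vector
ww       = sqrt(2*n) * exp(1i*pi*(0:n-1)'/(2*n));   % complex weights
ww(1)    = ww(1) / sqrt(2);                         % precorrection of the DC weight
y        = ifft(ww .* b);                           % MATLAB's ifft already divides by n
a        = zeros(n, 1);
a(1:2:n) = real(y(1:n/2));                          % even output indices (0-based view)
a(2:2:n) = real(y(n:-1:n/2+1));                     % odd output indices, taken in reverse
end

cuIDCT.cu applies the same three steps (weighting, inverse FFT, reordering) to all m columns at once using a batched cuFFT transform; the division by n that ifft performs implicitly is done in a separate kernel.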
cuIDCT.cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cufft.h>
#include <cuComplex.h>
// round up n/m
inline int iDivUp(int n, int m)
{
    return (n + m - 1) / m;
}
typedef cufftComplex complex;
#define PI 3.1415926535897932384626433832795028841971693993751
__global__
    void idct_ComputeWeightsKernel(const int n, complex *ww)
{
    const int pos = threadIdx.x + blockIdx.x * blockDim.x;
    if (pos >= n) return;
    ww[pos].x =  sqrtf(2*n) * cosf(pos*PI/(2*n));
    ww[pos].y =  sqrtf(2*n) * sinf(pos*PI/(2*n));
}
__global__
    void idct_ComputeEvenKernel(const float *b, const int n, const int m, complex *ww, complex *y)
{
    const int ix = threadIdx.x + blockIdx.x * blockDim.x;
    const int iy = threadIdx.y + blockIdx.y * blockDim.y;
    if (ix >= n || iy >= m) return;
    const int pos = ix + iy*n;
    // Compute precorrection factor
    ww[0].x = ww[0].x / sqrtf(2);
    ww[0].y = ww[0].y / sqrtf(2);
    y[iy + ix*m].x = ww[iy].x * b[pos];
    y[iy + ix*m].y = ww[iy].y * b[pos];
}
__global__
    void Reordering_a0_Kernel(complex *y, const int n, const int m, complex *yy)
{
    const int ix = threadIdx.x + blockIdx.x * blockDim.x;
    const int iy = threadIdx.y + blockIdx.y * blockDim.y;
    if (ix >= n || iy >= m) return;
    const int pos = ix + iy*n;
    yy[iy + ix*n].x = y[pos].x / (float) n;
    yy[iy + ix*n].y = y[pos].y / (float) n;
}
__global__
    void Reordering_a_Kernel(complex *yy, const int n, const int m, float *a)
{
    const int ix = threadIdx.x + blockIdx.x * blockDim.x;
    const int iy = threadIdx.y + blockIdx.y * blockDim.y;
    if (ix >= n || iy >= m) return;
    const int pos = ix + iy*n;
    // Re-order elements of each column according to equations (5.93) and (5.94) in Jain
    if (iy < n/2)
    {
        a[ix + 2*iy*n]     = yy[pos].x;
        a[ix + (2*iy+1)*n] = yy[ix + (m-iy-1)*n].x;
    }
}
/**
* a = idct(b), where a is of size [n m].
* @param h_in  input array b
* @param n     first dimension of a
* @param m     second dimension of a
* @param h_out output array a
*/
void cuIDCT(float *h_in, int n, int m, float *h_out) // a is of size [n m]
{
    const int data_size   = n * m * sizeof(float);
    // device memory allocation
    float *d_in, *d_out;
    cudaMalloc(&d_in, data_size);
    cudaMalloc(&d_out, data_size);
    // transfer data from host to device
    cudaMemcpy(d_in, h_in, data_size, cudaMemcpyHostToDevice);
    // compute IDCT using CUDA
    // begin============================================
    // Compute weights
    complex *ww;
    cudaMalloc(&ww, n*sizeof(complex));
    dim3 threads(256);
    dim3 blocks(iDivUp(n, threads.x));
    idct_ComputeWeightsKernel<<<blocks, threads>>>(n, ww);
    complex *y;
    complex *yy;
    cufftHandle plan;
    dim3 threads1(32, 6);
    dim3 blocks2(iDivUp(n,   threads1.x), iDivUp(m, threads1.y)); // for even case
    int Length[1] = {m}; // for each IFFT, the length is m
    cudaMalloc(&y,  n*m*sizeof(complex));
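    // idct_ComputeEvenKernel multiplies each input sample by its complex weight and
    // writes it transposed, so every length-m signal is contiguous for the batched IFFT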
    idct_ComputeEvenKernel<<<blocks2, threads1>>>(d_in, n, m, ww, y);
    cufftPlanMany(&plan, 1, Length, 
                 Length, 1, m, 
                 Length, 1, m, CUFFT_C2C, n);
    cufftExecC2C(plan, y, y, CUFFT_INVERSE); // y is of size [n m]
    cudaMalloc(&yy,  n*m*sizeof(complex));
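    // cuFFT's inverse transform is unnormalized, so the next kernel divides by n
    // to account for the 1/n factor that MATLAB's ifft applies implicitly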
    Reordering_a0_Kernel<<<blocks2, threads1>>>(y, n, m, yy);
    Reordering_a_Kernel<<<blocks2, threads1>>>(yy, n, m, d_out);
    // end============================================
    // transfer result from device to host
    cudaMemcpy(h_out, d_out, data_size, cudaMemcpyDeviceToHost);
    // cleanup
    cufftDestroy(plan);
    cudaFree(ww);
    cudaFree(y);
    cudaFree(yy);
    cudaFree(d_in);
    cudaFree(d_out);
}
Then I compared the result of my CUDA IDCT (i.e. cuIDCT.cu) against MATLAB's idct.m using the following code:
- a test function, main.cpp, and
- a MATLAB main function, main.m, which reads the result from CUDA and compares it against MATLAB.
main.cpp
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <helper_functions.h>
#include <stdlib.h>
#include <stdio.h>
// N must equal M, and both must be even numbers
#define N 256
#define M 256
void WriteDataFile(const char *name, int w, int h, const float *in, const float *out)
{
    FILE *stream;
    stream = fopen(name, "wb");
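    // 202021.25f is the sanity-check tag used by Middlebury .flo files; it is
    // followed by the width, the height, and interleaved in/out values per element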
    float data = 202021.25f;
    fwrite(&data, sizeof(float), 1, stream);
    fwrite(&w,    sizeof(w), 1, stream);
    fwrite(&h,    sizeof(h), 1, stream);
    for (int i = 0; i < h; i++)
        for (int j = 0; j < w; j++)
        {
            const int pos = j + i * h;
            fwrite(in  + pos, sizeof(float),  1, stream);
            fwrite(out + pos, sizeof(float), 1, stream);
        }
    fclose(stream);
}
void cuIDCT(float *b, int n, int m, float *a);
int main()
{
    // host memory allocation
    float *h_in   = new float [N * M];
    float *h_out  = new float [N * M];
    float *h_temp = new float [N * M];
    // input data initialization
    for (int i = 0; i < N * M; i++)
    {
        h_in[i]   = (float)rand()/(float)RAND_MAX;
        h_out[i]  = h_in[i];
        h_temp[i] = h_in[i];
    }
    // enable exactly one of the two test cases below (comment out the other)
    // test case 1: use cuIDCT.cu once
    // cuIDCT(h_in, N, M, h_out);
    // test case 2: iteratively use cuIDCT.cu
    for (int i = 0; i < 4; i++)
    {
        if (i % 2 == 0)
            cuIDCT(h_out, N, M, h_temp);
        else
            cuIDCT(h_temp, N, M, h_out);
    }
    // write data, for further visualization using MATLAB
    WriteDataFile("test.flo", N, M, h_in, h_out);
    // cleanup
    delete [] h_in;
    delete [] h_out;
    delete [] h_temp;
    cudaDeviceReset();
}
main.m
clc;clear;
% read
[h_in, h_out] = read_data('test.flo');
% MATLAB result, for test case 1, comment the for-loop
matlab_out = h_in;
for i = 1:4
    matlab_out = idct(matlab_out);
end
% compare
err = matlab_out - h_out;
% show
figure(1);
subplot(221);   imshow(h_in,  []);       title('h\_in');        colorbar
subplot(222);   imshow(h_out, []);       title('h\_out');       colorbar
subplot(223);   imshow(matlab_out, []);  title('matlab\_out');  colorbar
subplot(224);   imshow(err,   []);       title('error map');    colorbar
disp(['maximum error between CUDA and MATLAB is ' ...
        num2str(max(max(abs(err))))])
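Note that main.m relies on a helper read_data that is not shown here. A minimal sketch that matches the format WriteDataFile produces (tag, width, height, then interleaved in/out floats) might look like the following; the actual helper may differ, for instance in how it reshapes the data:

function [h_in, h_out] = read_data(filename)
% Sketch of a reader for the file written by WriteDataFile in main.cpp.
fid  = fopen(filename, 'rb');
tag  = fread(fid, 1, 'single');          % sanity-check tag (202021.25), unused here
w    = fread(fid, 1, 'int32');
h    = fread(fid, 1, 'int32');
data = fread(fid, 2*w*h, 'single');      % interleaved [in, out] pairs per element
fclose(fid);
h_in  = reshape(data(1:2:end), [w h]).'; % assumes row-major order in the file
h_out = reshape(data(2:2:end), [w h]).';
end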
I ran the code in Visual Studio 2012 (VS 11.0) on Windows 7 with an Nvidia Tesla K20c GPU, using CUDA Toolkit 7.5; my MATLAB version is R2015b.
My test steps:
For test case 1 (un-comment test case 1 and comment out test case 2):
1. Run main.cpp.
2. Run main.m in MATLAB.
3. Repeat steps 1 and 2 (without any change, just re-run the code).
I repeated step 3 20 times. The output is unchanged every run, and the result in main.m is:
The maximum error is 7.7152e-07.
For test case 2 (un-comment test case 2 and comment out test case 1):
1. Run main.cpp.
2. Run main.m in MATLAB.
3. Repeat steps 1 and 2 (without any change, just re-run the code).
I repeated step 3 20 times. The output changes from run to run, and the results in main.m are (not enough reputation to post all images; only a wrong case is shown below):
(image: one of the wrong results of test case 2)
The maximum error is 0.45341 (2 times), 0.44898 (1 time), 0.26186 (1 time), 0.26301 (1 time), and 9.5716e-07 (15 times).
From the test results, my conclusions are:
- From test case 1: cuIDCT.cu is numerically correct (error ~10^-7) with respect to idct.m.
- From test case 2: using cuIDCT.cu repeatedly leads to unstable results, i.e. the output changes every time the code is re-run and is sometimes numerically wrong (error ~0.1).
My question:
From test case 1 we know that cuIDCT.cu is numerically consistent with idct.m. But why does calling cuIDCT.cu repeatedly produce a different result each time the code is re-run?
Any help or suggestions are highly appreciated.