CUDA matrix transpose with shared memory

Question

I need to implement a matrix transpose function on a GPU using shared memory. I have done it in a simple way without shared memory which works fine and also an attempt with SM. But unfortunately the calculation is not correct and I can not figure out why. A complete working example can be found here and at the bottom of this question.

EDIT 1

I further know that the first index of the result where I have a wrong value is index 32 (of the flatted matrix, so matr[0][32] in a two dimensional fashion).

If there are any more information neede I will prvide them gladly.

A short extract from the entire code which resembles the not working function is listed below:

__global__ void notSoNaivaTransKernel(float *matrB, float *matrA, const int width,
    const int height, const int nreps)
{
    __shared__ float tile[TILE_DIM][TILE_DIM + 1];
    int blockIdx_y = blockIdx.x;
    int blockIdx_x = (blockIdx.x + blockIdx.y) % gridDim.x;
    int xIndex = blockIdx_x * TILE_DIM + threadIdx.x;
    int yIndex = blockIdx_y * TILE_DIM + threadIdx.y;
    int index_in = xIndex + (yIndex)* width;

    xIndex = blockIdx_y * TILE_DIM + threadIdx.x;
    yIndex = blockIdx_x * TILE_DIM + threadIdx.y;
    int index_out = xIndex + (yIndex)* height;

    int r, i;
#pragma unroll
    for (r = 0; r < nreps; r++)
    {
#pragma unroll
        for (i = 0; i < TILE_DIM; i += BLOCK_ROWS)
            tile[threadIdx.y + i][threadIdx.x] = matrA[index_in + i * width];

        __syncthreads();

#pragma unroll
        for (i = 0; i < TILE_DIM; i += BLOCK_ROWS)
            if (index_in + i * width < width * height)
               matrB[index_out + i * height] = tile[threadIdx.x][threadIdx.y + i];
    }
}

The output looks like this:

Avg. CPU Transpose Time: 0.106048 ms, Bandwidth: 3.771873 GB/s

Avg. GPU Naive Trans Time: 0.009871 ms, bandwidth: 40.520836 GB/s
    Correct: 50000, Wrong: 0

Avg. GPU Trans with SM Time: 0.007598 ms, bandwidth: 52.643482 GB/s
    Correct: 12352, Wrong: 37648

Here is the full working example. I stripped most of the unnecessary code from it so it is less stuffed:

#include "cuda_runtime.h"
#include "device_functions.h"
#include "device_launch_parameters.h"

#include <chrono>
#include <time.h>
#include <stdio.h>
#include <stdlib.h>

#define TILE_DIM 32
#define BLOCK_ROWS 8
#define BLOCK_COLS 32

cudaError_t matrMagicCuda(float *matrB, float *matrA, const int width, const int height, const int nreps, const int operation);
void cpuMatrTrans(float *matrB, float *matrA, const int width, const int height, const int nreps);
__global__ void naiveTransKernel(float *matrB, float *matrA, const int width, const int height, const int nreps);
__global__ void notSoNaivaTransKernel(float *matrB, float *matrA, const int width, const int height, const int nreps);

int main()
{
    int i, width, height, nreps, size, wrong, correct;
    double cpuTime, cpuBandwidth;
    cudaError_t cudaStatus;

    float *matrA, *matrATC, *matrATG, *matrAC;

    srand(time(NULL));

    nreps = 10000;
    width = 500;
    height = 100;
    size = width * height;

    matrA = (float*)malloc(size * sizeof(float)); // matrix A
    matrAC = (float*)malloc(size * sizeof(float)); // matrix A copied
    matrATC = (float*)malloc(size * sizeof(float)); // matrix A transposed by CPU
    matrATG = (float*)malloc(size * sizeof(float)); // matrix A transposed by GPU

    for (i = 0; i < size; i++)
    {
        matrA[i] = (float)i;
    }

    auto start = std::chrono::high_resolution_clock::now();

    //CPU Transpose
    cpuMatrTrans(matrATC, matrA, width, height, nreps);

    auto end = std::chrono::high_resolution_clock::now();

    std::chrono::duration<double> diff = end - start;
    cpuTime = (diff.count() * 1000) / nreps;
    cpuBandwidth = (sizeof(float) * size * 2) / (cpuTime * 1000000);//scaling from ms to s and B to GB doen implicitly, shortened in fraction, times two for read and write
    printf("Avg. CPU Transpose Time: %f ms, Bandwidth: %f GB/s\n\n", cpuTime, cpuBandwidth);

    correct = 0;
    wrong = 0;

    //Naive transpose
    matrMagicCuda(matrATG, matrA, width, height, nreps, 1);

    //Check if calc was correct
    for (i = 0; i < size; i++)
    {
        if (matrATC[i] != matrATG[i])
        {
            /*printf("ERROR - %d - ATC:%f - ATG:%f\n\n", i, matrATC[i], matrATG[i]);
            return;*/
            wrong++;
        }
        else
        {
            correct++;
        }
    }

    printf("\tCorrect: %d, Wrong: %d\n\n", correct, wrong);
    correct = 0;
    wrong = 0;

    //Transpose with shared memory
    matrMagicCuda(matrATG, matrA, width, height, nreps, 2);

    //Check if calc was correct
    for (i = 0; i < size; i++)
    {
        if (matrATC[i] != matrATG[i])
        {
            /*printf("ERROR - %d - ATC:%f - ATG:%f\n\n", i, matrATC[i], matrATG[i]);
            return;*/
            wrong++;
        }
        else
        {
            correct++;
        }
    }

    //printf("\tTranspose with SM on GPU was executed correctly.\n\n");
    printf("\tCorrect: %d, Wrong: %d\n\n", correct, wrong);
    correct = 0;
    wrong = 0;

    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaDeviceReset failed!\n");
        return 1;
    }

    return 0;
}

cudaError_t matrMagicCuda(float *matrB, float *matrA, const int width, const int height, const int nreps, const int operation)
{
    float elapsed = 0;
    float *dev_matrA = 0;
    float *dev_matrB = 0;
    cudaError_t cudaStatus;
    dim3 dim_grid, dim_block;
    double gpuBandwidth;

    int size = width * height;

    dim_block.x = TILE_DIM;
    dim_block.y = BLOCK_ROWS;
    dim_block.z = 1;

    dim_grid.x = (width + TILE_DIM - 1) / TILE_DIM;
    dim_grid.y = (height + TILE_DIM - 1) / TILE_DIM;
    dim_grid.z = 1;

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    // Allocate GPU buffers for three matrix
    cudaStatus = cudaMalloc((void**)&dev_matrA, size * sizeof(float));
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_matrB, size * sizeof(float));
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    // Copy input matrix from host memory to GPU buffers.
    cudaStatus = cudaMemcpy(dev_matrA, matrA, size * sizeof(float), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    switch (operation)
    {
        case(1):
        {
            cudaEventRecord(start);
            // Launch a kernel on the GPU with one thread for each element.
            naiveTransKernel << <dim_grid, dim_block >> >(dev_matrB, dev_matrA, width, height, nreps);

            cudaEventRecord(stop);
            cudaEventSynchronize(stop);

            cudaEventElapsedTime(&elapsed, start, stop);
            cudaEventDestroy(start);
            cudaEventDestroy(stop);

            elapsed /= nreps;

            gpuBandwidth = (sizeof(float) * size * 2) / (elapsed * 1000000);//scaling from ms to s and B to GB doen implicitly, shortened in fraction, times two for read and write
            printf("Avg. GPU Naive Trans Time: %f ms, bandwidth: %f GB/s\n", elapsed, gpuBandwidth);

            break;
        }

        case(2):
        {
            cudaEventRecord(start);
            // Launch a kernel on the GPU with one thread for each element.
            notSoNaivaTransKernel << <dim_grid, dim_block >> >(dev_matrB, dev_matrA, width, height, nreps);

            cudaEventRecord(stop);
            cudaEventSynchronize(stop);

            cudaEventElapsedTime(&elapsed, start, stop);
            cudaEventDestroy(start);
            cudaEventDestroy(stop);

            elapsed /= nreps;

            gpuBandwidth = (sizeof(float) * size * 2) / (elapsed * 1000000);//scaling from ms to s and B to GB doen implicitly, shortened in fraction, times two for read and write
            printf("Avg. GPU Trans with SM Time: %f ms, bandwidth: %f GB/s\n", elapsed, gpuBandwidth);

            break;
        }

    default:
        printf("No matching opcode was found.\n");
    }

    // Check for any errors launching the kernel
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "Kernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during the launch.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching Kernel!\n", cudaStatus);
        goto Error;
    }

    // Copy output matrix from GPU buffer to host memory.
    cudaStatus = cudaMemcpy(matrB, dev_matrB, size * sizeof(float), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

Error:
    cudaFree(dev_matrB);
    cudaFree(dev_matrA);

    return cudaStatus;
}

void cpuMatrTrans(float *matrB, float *matrA, const int width, const int height, const int nreps)
{
    int i, j, r;

#pragma unroll
    for (r = 0; r < nreps; r++)
#pragma unroll
        for (i = 0; i < height; i++)
#pragma unroll
            for (j = 0; j < width; j++)
                matrB[j * height + i] = matrA[i * width + j];
}

__global__ void naiveTransKernel(float *matrB, float *matrA, const int width, const int height, const int nreps)
{
    int i, r;
    int row = blockIdx.x * TILE_DIM + threadIdx.x;
    int col = blockIdx.y * TILE_DIM + threadIdx.y;
    int index_in = row + width * col;
    int index_out = col + height * row;

#pragma unroll
    for (r = 0; r < nreps; r++)
#pragma unroll
        for (i = 0; i < TILE_DIM; i += BLOCK_ROWS)
            if (index_in + i * width < width * height)
                matrB[index_out + i] = matrA[index_in + i * width];
}

__global__ void notSoNaivaTransKernel(float *matrB, float *matrA, const int width, const int height, const int nreps)
{
    __shared__ float tile[TILE_DIM][TILE_DIM + 1];
    int blockIdx_y = blockIdx.x;
    int blockIdx_x = (blockIdx.x + blockIdx.y) % gridDim.x;
    int xIndex = blockIdx_x * TILE_DIM + threadIdx.x;
    int yIndex = blockIdx_y * TILE_DIM + threadIdx.y;
    int index_in = xIndex + (yIndex)* width;

    xIndex = blockIdx_y * TILE_DIM + threadIdx.x;
    yIndex = blockIdx_x * TILE_DIM + threadIdx.y;
    int index_out = xIndex + (yIndex)* height;

    int r, i;
#pragma unroll
    for (r = 0; r < nreps; r++)
    {
#pragma unroll
        for (i = 0; i < TILE_DIM; i += BLOCK_ROWS)
            tile[threadIdx.y + i][threadIdx.x] = matrA[index_in + i * width];

        __syncthreads();

#pragma unroll
        for (i = 0; i < TILE_DIM; i += BLOCK_ROWS)
            if (index_in + i * width < width * height)
               matrB[index_out + i * height] = tile[threadIdx.x][threadIdx.y + i];
    }
}

apparently you want to be able to do non-square matrix transpose? Or only square matrix transpose? You should include an [mcve] *in your question*, not in an external link. — Robert Crovella, Nov 22 '16 at 18:10
Yes, I want to do the transpose on non square matrixes as well. For the sake of readability I placed it outside but as you wish I will add it to the question as well. — JRsz, Nov 22 '16 at 19:55
Your code is making out-of-bounds accesses. Any time you are having trouble with a CUDA code, you should use [proper cuda error checking](http://stackoverflow.com/questions/14038589) and run your code with `cuda-memcheck`. Even if you don't understand the error output, it will be useful for others trying to help you. Also note that while I understand this is probably a learning exercise, for serious work its recommended that you use a library routine like cublas [geam](http://docs.nvidia.com/cuda/cublas/index.html#cublas-lt-t-gt-geam). — Robert Crovella, Nov 23 '16 at 02:05

Robert Crovella · Accepted Answer · 2020-07-06T22:03:47.170

There were a number of issues with this code. I'm not sure I will be able to cover all of them.

Possibly the most important issue was your lack of (and lack of understanding of) a proper 2D thread check. Your algorithm creates a grid of threads that is larger in both dimensions than the problem size. This creates logical threads outside of the dimension of your matrix, in both dimensions.

You have attempted to create a 2D thread check like this:

        if (index_in + i * width < width * height)

This will not work. Suppose I have a 3x3 matrix and a 4x4 grid of threads. The thread at (3,0) is clearly out-of-bounds for your matrix, but would pass your 2D thread check.

In this situation, a proper thread check must test each dimension separately, not as a product.

Note that this logical error exists in your "naive" transpose kernel as well, and you can confirm that if you run your code with cuda-memcheck. It will indicate out-of-bounds access errors in that kernel, even though it appears to work correctly.

There were various other problems as well. Most of these had to do with indexing in your shared memory kernel. It's not clear to me that you understood the necessary indexing manipulation for a shared memory transpose. There are two separate indexing transposes that we must do in this case:

Transpose the block (tile) indices
Transpose the thread indices

Transposition of the thread indices is done upon reading/writing shared memory. You have this correctly accounted for with a reversal of the use of threadIdx.x and threadIdx.y for read/write of shared memory. But as near as I can tell, your index generation for the reversal of block indices (which reversal is used during the reading/writing to global memory) was broken. That was the other major issue to be sorted out.

The following code has these and some other issues fixed, and appears to work correctly for me:

$ cat t33.cu    
#include <chrono>
#include <time.h>
#include <stdio.h>
#include <stdlib.h>

#define TILE_DIM 32
#define BLOCK_ROWS 8
#define BLOCK_COLS 32

cudaError_t matrMagicCuda(float *matrB, float *matrA, const int width, const int height, const int nreps, const int operation);
void cpuMatrTrans(float *matrB, float *matrA, const int width, const int height, const int nreps);
__global__ void naiveTransKernel(float *matrB, float *matrA, const int width, const int height, const int nreps);
__global__ void notSoNaivaTransKernel(float *matrB, float *matrA, const int width, const int height, const int nreps);

int main()
{
    int i, width, height, nreps, size, wrong, correct;
    double cpuTime, cpuBandwidth;
    cudaError_t cudaStatus;

    float *matrA, *matrATC, *matrATG, *matrAC;

    srand(time(NULL));

    nreps = 10000;
    width = 500;
    height = 100;


    size = width * height;

    matrA = (float*)malloc(size * sizeof(float)); // matrix A
    matrAC = (float*)malloc(size * sizeof(float)); // matrix A copied
    matrATC = (float*)malloc(size * sizeof(float)); // matrix A transposed by CPU
    matrATG = (float*)malloc(size * sizeof(float)); // matrix A transposed by GPU

    for (i = 0; i < size; i++)
    {
        matrA[i] = (float)i;
    }

    auto start = std::chrono::high_resolution_clock::now();

    //CPU Transpose
    cpuMatrTrans(matrATC, matrA, width, height, nreps);

    auto end = std::chrono::high_resolution_clock::now();

    std::chrono::duration<double> diff = end - start;
    cpuTime = (diff.count() * 1000) / nreps;
    cpuBandwidth = (sizeof(float) * size * 2) / (cpuTime * 1000000);//scaling from ms to s and B to GB doen implicitly, shortened in fraction, times two for read and write
    printf("Avg. CPU Transpose Time: %f ms, Bandwidth: %f GB/s\n\n", cpuTime, cpuBandwidth);

    correct = 0;
    wrong = 0;

    //Naive transpose
    memset(matrATG, 0, size*sizeof(float));
    matrMagicCuda(matrATG, matrA, width, height, nreps, 1);

    //Check if calc was correct
    for (i = 0; i < size; i++)
    {
        if (matrATC[i] != matrATG[i])
        {
            /*printf("ERROR - %d - ATC:%f - ATG:%f\n\n", i, matrATC[i], matrATG[i]);
            return;*/
            wrong++;
        }
        else
        {
            correct++;
        }
    }

    printf("\tCorrect: %d, Wrong: %d\n\n", correct, wrong);
    correct = 0;
    wrong = 0;

    //Transpose with shared memory
    memset(matrATG, 0, size*sizeof(float));
    matrMagicCuda(matrATG, matrA, width, height, nreps, 2);

    //Check if calc was correct
    for (i = 0; i < size; i++)
    {
        if (matrATC[i] != matrATG[i])
        {
            /*printf("ERROR - %d - ATC:%f - ATG:%f\n\n", i, matrATC[i], matrATG[i]);
            return;*/
            wrong++;
        }
        else
        {
            correct++;
        }
    }

    //printf("\tTranspose with SM on GPU was executed correctly.\n\n");
    printf("\tCorrect: %d, Wrong: %d\n\n", correct, wrong);
    correct = 0;
    wrong = 0;

    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaDeviceReset failed!\n");
        return 1;
    }

    return 0;
}

cudaError_t matrMagicCuda(float *matrB, float *matrA, const int width, const int height, const int nreps, const int operation)
{
    float elapsed = 0;
    float *dev_matrA = 0;
    float *dev_matrB = 0;
    cudaError_t cudaStatus;
    dim3 dim_grid, dim_block;
    double gpuBandwidth;

    int size = width * height;

    dim_block.x = TILE_DIM;
    dim_block.y = BLOCK_ROWS;
    dim_block.z = 1;

    dim_grid.x = (width + TILE_DIM - 1) / TILE_DIM;
    dim_grid.y = (height + TILE_DIM - 1) / TILE_DIM;
    dim_grid.z = 1;

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    // Allocate GPU buffers for three matrix
    cudaStatus = cudaMalloc((void**)&dev_matrA, size * sizeof(float));
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_matrB, size * sizeof(float));
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    // Copy input matrix from host memory to GPU buffers.
    cudaStatus = cudaMemcpy(dev_matrA, matrA, size * sizeof(float), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }
    cudaMemset(dev_matrB, 0, size * sizeof(float));
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    switch (operation)
    {
        case(1):
        {
            cudaEventRecord(start);
            // Launch a kernel on the GPU with one thread for each element.
            naiveTransKernel << <dim_grid, dim_block >> >(dev_matrB, dev_matrA, width, height, nreps);

            cudaEventRecord(stop);
            cudaEventSynchronize(stop);

            cudaEventElapsedTime(&elapsed, start, stop);
            cudaEventDestroy(start);
            cudaEventDestroy(stop);

            elapsed /= nreps;

            gpuBandwidth = (sizeof(float) * size * 2) / (elapsed * 1000000);//scaling from ms to s and B to GB doen implicitly, shortened in fraction, times two for read and write
            printf("Avg. GPU Naive Trans Time: %f ms, bandwidth: %f GB/s\n", elapsed, gpuBandwidth);

            break;
        }

        case(2):
        {
            cudaEventRecord(start);
            // Launch a kernel on the GPU with one thread for each element.
            notSoNaivaTransKernel << <dim_grid, dim_block >> >(dev_matrB, dev_matrA, width, height, nreps);

            cudaEventRecord(stop);
            cudaEventSynchronize(stop);

            cudaEventElapsedTime(&elapsed, start, stop);
            cudaEventDestroy(start);
            cudaEventDestroy(stop);

            elapsed /= nreps;

            gpuBandwidth = (sizeof(float) * size * 2) / (elapsed * 1000000);//scaling from ms to s and B to GB doen implicitly, shortened in fraction, times two for read and write
            printf("Avg. GPU Trans with SM Time: %f ms, bandwidth: %f GB/s\n", elapsed, gpuBandwidth);

            break;
        }

    default:
        printf("No matching opcode was found.\n");
    }

    // Check for any errors launching the kernel
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "Kernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during the launch.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching Kernel!\n", cudaStatus);
        goto Error;
    }

    // Copy output matrix from GPU buffer to host memory.
    cudaStatus = cudaMemcpy(matrB, dev_matrB, size * sizeof(float), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

Error:
    cudaFree(dev_matrB);
    cudaFree(dev_matrA);

    return cudaStatus;
}

void cpuMatrTrans(float *matrB, float *matrA, const int width, const int height, const int nreps)
{
    int i, j, r;

#pragma unroll
    for (r = 0; r < nreps; r++)
#pragma unroll
        for (i = 0; i < height; i++)
#pragma unroll
            for (j = 0; j < width; j++)
                matrB[j * height + i] = matrA[i * width + j];
}

__global__ void naiveTransKernel(float *matrB, float *matrA, const int width, const int height, const int nreps)
{
    int i, r;
    int col = blockIdx.x * TILE_DIM + threadIdx.x;
    int row = blockIdx.y * TILE_DIM + threadIdx.y;
    int index_in = col + width * row;
    int index_out = row + height * col;

#pragma unroll
    for (r = 0; r < nreps; r++)
#pragma unroll
        for (i = 0; i < TILE_DIM; i += BLOCK_ROWS)
            if ((row+i<height) && (col < width))
                matrB[index_out + i] = matrA[index_in + i * width];
}

__global__ void notSoNaivaTransKernel(float *matrB, float *matrA, const int width, const int height, const int nreps)
{
    __shared__ float tile[TILE_DIM][TILE_DIM + 1];
    int ciIndex = blockIdx.x * TILE_DIM + threadIdx.x;
    int riIndex = blockIdx.y * TILE_DIM + threadIdx.y;
    int coIndex = blockIdx.y * TILE_DIM + threadIdx.x;
    int roIndex = blockIdx.x * TILE_DIM + threadIdx.y;
    int index_in = ciIndex + (riIndex)* width;
    int index_out = coIndex + (roIndex)* height;


    int r, i;
#pragma unroll
    for (r = 0; r < nreps; r++)
    {
#pragma unroll
        for (i = 0; i < TILE_DIM; i += BLOCK_ROWS)
            if ((ciIndex<width) && (riIndex+i < height))
              tile[threadIdx.y + i][threadIdx.x] = matrA[index_in + i * width];
        __syncthreads();

#pragma unroll
        for (i = 0; i < TILE_DIM; i += BLOCK_ROWS)
            if ((coIndex<height) && (roIndex+i < width))
               matrB[index_out + i*height] = tile[threadIdx.x][threadIdx.y + i];
        __syncthreads();
    }
}
$ nvcc -std=c++11 -arch=sm_61 -o t33 t33.cu
t33.cu(25): warning: variable "matrAC" was set but never used

t33.cu(25): warning: variable "matrAC" was set but never used

$ cuda-memcheck ./t33
========= CUDA-MEMCHECK
Avg. CPU Transpose Time: 0.143087 ms, Bandwidth: 2.795509 GB/s

Avg. GPU Naive Trans Time: 0.028587 ms, bandwidth: 13.992195 GB/s
        Correct: 50000, Wrong: 0

Avg. GPU Trans with SM Time: 0.040328 ms, bandwidth: 9.918678 GB/s
        Correct: 50000, Wrong: 0

========= ERROR SUMMARY: 0 errors
$ ./t33
Avg. CPU Transpose Time: 0.140469 ms, Bandwidth: 2.847594 GB/s

Avg. GPU Naive Trans Time: 0.003828 ms, bandwidth: 104.505440 GB/s
        Correct: 50000, Wrong: 0

Avg. GPU Trans with SM Time: 0.000715 ms, bandwidth: 559.206604 GB/s
        Correct: 50000, Wrong: 0

$

Note: The code attempts to measure bandwidth. However, you should be aware that the measured bandwidth here is affected by the cache bandwidth. Your matrix sizes (500x100 = 200Kbytes each for input and output) here are easily small enough to fit in the L2 cache on most GPUs. This fact, coupled with the fact that you are running the same transpose multiple times (nreps) means that much of the work is operating directly out of the L2 cache. Therefore, in the "optimized" case above, we see a reported bandwidth number that greatly exceeds the available memory bandwidth of the GPU (this case happens to be a Pascal Titan X, so about ~340GB/s of main memory bandwidth available). This is due to the fact that this measurement includes some benefit from the L2 cache, whose bandwidth is at least twice as high as main memory bandwidth. You can eliminate this effect by using a significantly larger matrix size and/or reducing nreps to 1.

You are correct this is for a lecture and I am very new and as you already have realized not familiar enough with the indexing. Your solution works and I am extremely grateful for your help. I will try to understand every line of it so in the future I will not make these mistakes again. On a sidenote, my algorithm is from our lecturer :/ Do you have a good reference at hand that can help me understand the exact index, block and grid layout. If the link you provided addresses this forget about it. I will definitly read it! And BTW nice GPU you got there! A Matrix 5000x1000 crashes my driver :( — JRsz, Nov 23 '16 at 09:22
The best reference I can point you to is the one I already linked in my answer. However, for simplicity of presentation, it covers the case where you have a square matrix and also where the matrix dimensions are evenly divisible by the tile dimensions. However many aspects of your presented algorithm seem to be similar to that one, even down to the use of the `BLOCK_ROWS` literal, and the particular kernel loop method that has each thread perform the transpose for several members within the tile. So I imagine that your lecturer is familiar with the blog I already linked. — Robert Crovella, Nov 23 '16 at 09:29
Regarding the driver crash, it seems evident you are working on a windows platform. For windows GPUs that are in WDDM mode (probably the case in your test), the GPU kernel execution is limited to about 2 seconds by the WDDM TDR watchdog. It's possible to work around this if you wish (google "WDDM TDR cuda") but I think it is the most likely reason for your driver crash. The larger matrix transpose, coupled with the large number of repetitions, is causing the kernel to take more than 2 seconds (it might be the naive kernel also). So you could also try to work around this by reducing `nreps`. — Robert Crovella, Nov 23 '16 at 09:51

CUDA matrix transpose with shared memory

1 Answers1