I hope to pick some items(not all of them) from the input data in CUDA.
My input array d_in size is 53*53 which is (sorry it is long):
$abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz
z$abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxy
yz$abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwx
xyz$abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvw
wxyz$abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuv
vwxyz$abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstu
uvwxyz$abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrst
tuvwxyz$abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrs
stuvwxyz$abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqr
rstuvwxyz$abcdefghijklmnopqrstuvwxyzabcdefghijklmnopq
qrstuvwxyz$abcdefghijklmnopqrstuvwxyzabcdefghijklmnop
pqrstuvwxyz$abcdefghijklmnopqrstuvwxyzabcdefghijklmno
opqrstuvwxyz$abcdefghijklmnopqrstuvwxyzabcdefghijklmn
nopqrstuvwxyz$abcdefghijklmnopqrstuvwxyzabcdefghijklm
mnopqrstuvwxyz$abcdefghijklmnopqrstuvwxyzabcdefghijkl
lmnopqrstuvwxyz$abcdefghijklmnopqrstuvwxyzabcdefghijk
klmnopqrstuvwxyz$abcdefghijklmnopqrstuvwxyzabcdefghij
jklmnopqrstuvwxyz$abcdefghijklmnopqrstuvwxyzabcdefghi
ijklmnopqrstuvwxyz$abcdefghijklmnopqrstuvwxyzabcdefgh
hijklmnopqrstuvwxyz$abcdefghijklmnopqrstuvwxyzabcdefg
ghijklmnopqrstuvwxyz$abcdefghijklmnopqrstuvwxyzabcdef
fghijklmnopqrstuvwxyz$abcdefghijklmnopqrstuvwxyzabcde
efghijklmnopqrstuvwxyz$abcdefghijklmnopqrstuvwxyzabcd
defghijklmnopqrstuvwxyz$abcdefghijklmnopqrstuvwxyzabc
cdefghijklmnopqrstuvwxyz$abcdefghijklmnopqrstuvwxyzab
bcdefghijklmnopqrstuvwxyz$abcdefghijklmnopqrstuvwxyza
abcdefghijklmnopqrstuvwxyz$abcdefghijklmnopqrstuvwxyz
zabcdefghijklmnopqrstuvwxyz$abcdefghijklmnopqrstuvwxy
yzabcdefghijklmnopqrstuvwxyz$abcdefghijklmnopqrstuvwx
xyzabcdefghijklmnopqrstuvwxyz$abcdefghijklmnopqrstuvw
wxyzabcdefghijklmnopqrstuvwxyz$abcdefghijklmnopqrstuv
vwxyzabcdefghijklmnopqrstuvwxyz$abcdefghijklmnopqrstu
uvwxyzabcdefghijklmnopqrstuvwxyz$abcdefghijklmnopqrst
tuvwxyzabcdefghijklmnopqrstuvwxyz$abcdefghijklmnopqrs
stuvwxyzabcdefghijklmnopqrstuvwxyz$abcdefghijklmnopqr
rstuvwxyzabcdefghijklmnopqrstuvwxyz$abcdefghijklmnopq
qrstuvwxyzabcdefghijklmnopqrstuvwxyz$abcdefghijklmnop
pqrstuvwxyzabcdefghijklmnopqrstuvwxyz$abcdefghijklmno
opqrstuvwxyzabcdefghijklmnopqrstuvwxyz$abcdefghijklmn
nopqrstuvwxyzabcdefghijklmnopqrstuvwxyz$abcdefghijklm
mnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz$abcdefghijkl
lmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz$abcdefghijk
klmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz$abcdefghij
jklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz$abcdefghi
ijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz$abcdefgh
hijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz$abcdefg
ghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz$abcdef
fghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz$abcde
efghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz$abcd
defghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz$abc
cdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz$ab
bcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz$a
abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz$
and I want to pick the last item of each row from the input to my output d_out. In this way, the output size should be 53. Here is my code. For coping data of preSort tod_in and temp to d_out, allocating memory for two pointers and launching kernel.
//variables declared
const int ARRAY_BYTES_IN = CAPACITY * sizeof(char);
const int ARRAY_BYTES_ST = CAPACITY * CAPACITY * sizeof(char);
const int CAPACITY = 53;
char preSort[CAPACITY * CAPACITY];
char temp[CAPACITY];
void getLast(){
     //two pointers
     char* d_in;
     char* d_out;
     //allocate gpu memory     
     cudaMalloc(&d_in, ARRAY_BYTES_ST);
     cudaMalloc(&d_out, ARRAY_BYTES_IN);
     //transfer input into gpu
     cudaMemcpy(d_in, preSort, ARRAY_BYTES_ST, cudaMemcpyHostToDevice);
     int size = CAPACITY*CAPACITY;
     int blockSize = 1024;
     int numbBlock = (size + blockSize - 1) / blockSize;
     //Launch the kernel
     DoGetLast<<<numbBlock, blockSize>>>(d_out, d_in);
     //Copy back to the  host
     cudaMemcpy(temp, d_out, ARRAY_BYTES_IN, cudaMemcpyDeviceToHost);
     cudaFree(d_in);
     cudaFree(d_out);
}
the GPU kernel is
__global__ void DoGetLast(char* d_out, char* d_in){
     int CAP = 53*53;
     int idx = blockDim.x * blockIdx.x + threadIdx.x;
     char f;
     //get the output trmo the input, It's a 1-D array actually, so pick 
     //only one character through every 53 characters from d_in
     if(idx % CAP == (CAP - 1)){
          f = d_in[idx];
          d_out[idx] = f;
     }
}
In the main, I only call the getLast() method, and using a loop for showing the output.I hope the output will look like:
zyxwvutsrqponmlkjihgfedcbazyxwvutsrqponmlkjihgfedcba$
However, I only got one letter output which is only z in my output.
Anyone can tell the problems in my code? and give a help?
 
     
     
    