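The range of threadIdx is bounded by the maximum number of threads per block, which is architecture-dependent, so you cannot just launch a block with as many threads as you like.
The following code works up to arraySize == 1024 on my system, but from arraySize == 1025 onwards the kernel launch fails and I get undefined values: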
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
using namespace std;
// Element-wise copy kernel: each thread copies the single int located at its
// own threadIdx.x from src to dst.
//
// Indexing uses threadIdx.x only (blockIdx is not consulted) and there is no
// bounds check, so the caller must launch no more threads per block than
// there are elements in src and dst.
__global__ void gpucopy( int* src, int* dst )
{
  const int tid = threadIdx.x;   // position of this thread inside its block
  const int value = src[tid];
  dst[tid] = value;
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when err is cudaSuccess, false otherwise.
static bool cudaOk( cudaError_t err, const char* what )
{
  if( err == cudaSuccess )
    return true;
  cerr << "CUDA error in " << what << ": " << cudaGetErrorString(err) << endl;
  return false;
}

// Fills a host array, copies it to the device, duplicates it on the device
// with a single-block gpucopy launch (one thread per element), copies the
// duplicate back and compares it with the original.
// Returns 0 when every CUDA call succeeded, 1 otherwise.
int main()
{
  const int arraySize = 500; // >= 1025 will fail on my system!
  const size_t bytes = arraySize*sizeof(int);

  int* data1 = new int[arraySize];
  int* data2 = new int[arraySize](); // zero-initialised so it never holds indeterminate values
  for(int i=0; i<arraySize; i++)
    data1[i] = 2*i;

  int* dev_data1 = NULL;
  int* dev_data2 = NULL;

  // Allocate both device buffers and copy data1 to the device.
  bool ok = cudaOk( cudaMalloc(&dev_data1, bytes), "cudaMalloc(dev_data1)" )
         && cudaOk( cudaMalloc(&dev_data2, bytes), "cudaMalloc(dev_data2)" )
         && cudaOk( cudaMemcpy(dev_data1, data1, bytes, cudaMemcpyHostToDevice ),
                    "cudaMemcpy(host to device)" );

  if( ok )
  {
    // copy dev_data1 to dev_data2 with gpu
    gpucopy<<<1, arraySize>>>( dev_data1, dev_data2 );
    // A kernel launch returns nothing. An invalid configuration (for example
    // more threads per block than the device supports) means the kernel never
    // runs, and the only way to see that is to query the last error.
    ok = cudaOk( cudaGetLastError(), "gpucopy launch" )
      // copy dev_data2 to data2; this blocking copy also surfaces errors
      // raised while the kernel was executing
      && cudaOk( cudaMemcpy(data2, dev_data2, bytes, cudaMemcpyDeviceToHost ),
                 "cudaMemcpy(device to host)" );
  }

  // Only compare when data2 really holds the kernel's output.
  if( ok )
  {
    for(int i=0; i<arraySize; i++)
      if(data2[i] != data1[i])
        cout << "Error: data is different - data2[" << i << "] is " << data2[i] << endl;
  }

  // Release device and host memory on every path (cudaFree(NULL) is a no-op).
  cudaFree(dev_data1);
  cudaFree(dev_data2);
  delete[] data1;
  delete[] data2;

  return ok ? 0 : 1;
}
You can find this limit either in the documentation or by running the deviceQuery sample program from the SDK (it prints the maxThreadsPerBlock field returned by cudaGetDeviceProperties()):
C:\ProgramData\NVIDIA Corporation\NVIDIA GPU Computing SDK 4.1\C\bin\win64\Release\deviceQuery.exe Starting...
 CUDA Device Query (Runtime API) version (CUDART static linking)
Found 2 CUDA Capable device(s)
Device 0: "Tesla C2050"
  CUDA Driver Version / Runtime Version          4.2 / 4.1
  CUDA Capability Major/Minor version number:    2.0
  Total amount of global memory:                 2688 MBytes (2818572288 bytes)
  (14) Multiprocessors x (32) CUDA Cores/MP:     448 CUDA Cores
  GPU Clock Speed:                               1.15 GHz
  Memory Clock rate:                             1500.00 Mhz
  Memory Bus Width:                              384-bit
  L2 Cache Size:                                 786432 bytes
  Max Texture Dimension Size (x,y,z)             1D=(65536), 2D=(65536,65535), 3D=(2048,2048,2048)
  Max Layered Texture Size (dim) x layers        1D=(16384) x 2048, 2D=(16384,16384) x 2048
  Total amount of constant memory:               65536 bytes
  Total amount of shared memory per block:       49152 bytes
  Total number of registers available per block: 32768
  Warp size:                                     32
  Maximum number of threads per block:           1024 <-----