I have a program which (for now) calculates values of two functions in random points on GPU , sends these values back to host, and then visualizes them. This is what I get, some nice semi-random points:
 Now, if I modify my kernel code, and add the local array initalization code at the very end,
Now, if I modify my kernel code, and add the local array initalization code at the very end, 
__global__ void optymalize(curandState * state, float* testPoints)
{
int ind=blockDim.x*blockIdx.x+threadIdx.x;
int step=blockDim.x*gridDim.x; 
for(int i=ind*2;i<NOF*TEST_POINTS;i+=step*2)
{   
    float* x=generateX(state);
    testPoints[i]=ZDT_f1(x);
    testPoints[i+1]=ZDT_f2(x);
}
//works fine with 'new'
//float* test_array=new float[2];
float test_array[2]={1.0f,2.0f};    
}
I get something like this everytime:

Does anyone know the cause of this behavior? All the drawn points are computed BEFORE test_array is initialized, yet they are affected by it. It doesn't happen when I initialize test_array before the 'for' loop.
Host/device code:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "curand_kernel.h"  
#include "device_functions.h"
#include <random>
#include <iostream>
#include <time.h>
#include <fstream>
using namespace std;
#define XSIZE 5
#define TEST_POINTS 100
#define NOF 2
#define BLOCK_COUNT 64
#define THR_COUNT 128
#define POINTS_PER_THREAD (NOF*TEST_POINTS+THR_COUNT*BLOCK_COUNT-1)/(THR_COUNT*BLOCK_COUNT)
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=false)
{
   if (code != cudaSuccess) 
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}
__device__ float g(float* x)
{
    float tmp=1;
    for(int i=1;i<XSIZE;i++)
        tmp*=x[i];
    return 1+9*(tmp/(XSIZE-1));
}
__device__ float ZDT_f1(float* x)
{
    return x[0];
}
__device__ float ZDT_f2(float* x)
{
    float gp=g(x);
    return gp*(1-sqrtf(x[0]/gp));
}
__device__ bool oneDominatesTwo(float* x1, float* x2)
{
    for(int i=0;i<XSIZE;i++)
        if(x1[i]>=x2[i])
            return false;
    return true;
}
__device__ float* generateX(curandState* globalState)
{
    int ind = threadIdx.x;
    float x[XSIZE];
    for(int i=0;i<XSIZE;i++)
        x[i]=curand_uniform(&globalState[ind]);
    return x;
}
__global__ void setup_kernel ( curandState * state, unsigned long seed )
{
    int id = blockDim.x*blockIdx.x+threadIdx.x;
    curand_init ( seed, id, 0, &state[id] );
}
__global__ void optymalize(curandState * state, float* testPoints)
{
    int ind=blockDim.x*blockIdx.x+threadIdx.x;
    int step=blockDim.x*gridDim.x; 
    for(int i=ind*2;i<NOF*TEST_POINTS;i+=step*2)
    {   
        float* x=generateX(state);
        testPoints[i]=ZDT_f1(x);
        testPoints[i+1]=ZDT_f2(x);
    }
    __syncthreads();
    //float* test_array=new float[2];
    //test_array[0]=1.0f;
    //test_array[1]=1.0f;
    float test_array[2]={1.0f,1.0f};    
}
void saveResultToFile(float* result)
{
    ofstream resultFile;
    resultFile.open ("result.txt");
    for(unsigned int i=0;i<NOF*TEST_POINTS;i+=NOF)
    {
        resultFile << result[i] << " "<<result[i+1]<<"\n";
    }
    resultFile.close();
}
int main()
{
    float* dev_fPoints;
    float* fPoints=new float[NOF*TEST_POINTS];
    gpuErrchk(cudaMalloc((void**)&dev_fPoints, NOF * TEST_POINTS * sizeof(float)));
    curandState* devStates;
    gpuErrchk(cudaMalloc(&devStates,THR_COUNT*sizeof(curandState)));
    cudaEvent_t start;
    gpuErrchk(cudaEventCreate(&start));
    cudaEvent_t stop;
    gpuErrchk(cudaEventCreate(&stop));
    gpuErrchk(cudaThreadSetLimit(cudaLimitMallocHeapSize, 128*1024*1024));
    gpuErrchk(cudaEventRecord(start, NULL));
    setup_kernel<<<BLOCK_COUNT, THR_COUNT>>>(devStates,unsigned(time(NULL)));
    gpuErrchk(cudaDeviceSynchronize());
    gpuErrchk(cudaGetLastError());
    optymalize<<<BLOCK_COUNT,THR_COUNT>>>(devStates, dev_fPoints);
    gpuErrchk(cudaDeviceSynchronize());
    gpuErrchk(cudaGetLastError());
    gpuErrchk(cudaMemcpy(fPoints, dev_fPoints, NOF * TEST_POINTS * sizeof(float), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaEventRecord(stop, NULL));
    gpuErrchk(cudaEventSynchronize(stop));
    float msecTotal = 0.0f;
    cudaEventElapsedTime(&msecTotal, start, stop);
    cout<<"Kernel execution time: "<<msecTotal<< "ms"<<endl;
    saveResultToFile(fPoints);
    system("start pythonw  plot_data.py result.txt");
    cudaFree(dev_fPoints);
    cudaFree(devStates);
    system("pause");
    return 0;
}
Plot script code:
import matplotlib.pyplot as plt;
import sys;
if len(sys.argv)<2:
    print("Usage: python PlotScript <filename>");
    sys.exit(0);
path=sys.argv[1];
x=[]
y=[]
with open(path,"r") as f:
    for line in f:
        vals=line.strip().split(" ");
        x.append(vals[0]);
        y.append(vals[1]);
plt.plot(x,y,'ro')
plt.show();
 
     
    