I have a simple task, that I can't seem to solve. I got 2 unidimensional arrays (called vectors) consisting of 10 elements. Each element of the array contains a random positive number. The goal is to use CUDA to calculate the sum of those 2 arrays of each index number (in other words: Vector Sum[0] = Vector A[0] + Vector B[0], then the same with 1,2...10)
Here is my code (kernel.cu). I know I am using the "float-anything" variable names for integer data types. That's because I initially planned to do it on float data types but I could not get the project working at all as a result of data type incompatibilities. Correct me if it's actually possible using float data types for this.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <iostream>
__global__ void vecAdd_kernel(int *floatAr1gpu, int *floatAr2gpu, int *floatSumGPU, int The_N){
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < The_N) floatSumGPU[i] = floatAr1gpu[i] + floatAr2gpu[i];
}
int main() 
{
    const unsigned int arraySize = 10;
    int floatArray1[arraySize];
    int floatArray2[arraySize];
    int *floatAr1gpu = 0;
    int *floatAr2gpu = 0;
    int floatSum[arraySize];
    int *floatSumGPU = 0;
    for (int c = 0; c < arraySize; c++) {
        floatArray1[c] = (rand() % 10)+1;
        floatArray2[c] = (rand() % 10)+1;
    }
    //Put the data into the GPU now
    //                      V--- This is allocating GPU memory under that name and Variable
    cudaMalloc((void **)&floatArray1, sizeof(float)*arraySize);
    cudaMalloc((void **)&floatArray2, sizeof(float)*arraySize);
    cudaMalloc((void **)&floatSum, sizeof(float)*arraySize);
    //           CPU Memory    GPU Mem       Array size              Method
    cudaMemcpy(floatArray1, floatAr1gpu, sizeof(float)*arraySize, cudaMemcpyHostToDevice);
    cudaMemcpy(floatArray2, floatAr2gpu, sizeof(float)*arraySize, cudaMemcpyHostToDevice);
    // execute
    //         grid size, block size
    vecAdd_kernel << < 1, arraySize >> > (floatArray1, floatArray2, floatSum, arraySize);
    //Copy data back from GPU to RAM
    //          GPU Memory   CPU Mem       Array size               Method
    cudaMemcpy(floatSumGPU, floatSum, sizeof(float)*arraySize, cudaMemcpyDeviceToHost);
    // clean up
    cudaFree(floatArray1);
    cudaFree(floatArray2);
    cudaFree(floatSum);
    for (int cc = 0; cc < arraySize; cc++) {
        std::cout << "Result of array number " << cc << " = " << floatSum[cc] << std::endl;
    }
    std::cout << "Done. Press any key to exit." << std::endl;
    char key = std::cin.get();
    return 0;
}
This is what I get as a result: Program result
This is what I want to achieve (using CUDA): Program result
What's wrong with the code? I placed a break-point to check that array here: array contents
 
    