I wrote a few kernel function and wonder how many miliseconds to process these functions.
using namespace std;
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#define N 8000
void fillArray(int *data, int count) {
    for (int i = 0; i < count; i++)
        data[i] = rand() % 100;
}
__global__ void add(int* a, int *b) {
    int add = 0;
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N) {
        add = a[tid] + b[tid];
    }
}
__global__ void subtract(int* a, int *b) {
    int subtract = 0;
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N) {
        subtract = a[tid] - b[tid];
    }
}
__global__ void multiply(int* a, int *b) {
    int multiply = 0;
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N) {
        multiply = a[tid] * b[tid];
    }
}
__global__ void divide(int* a, int *b) {
    int divide = 0;
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N) {
        divide = a[tid] / b[tid];
    }
}
__global__ void modu(int* a, int *b) {
    int modulus = 0;
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N) {
        modulus = a[tid] % b[tid];
    }
}
__global__ void neg(int *data) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N) {
        data[tid] = -data[tid];
    }
}
float duration(int *devA, int *devB, int blocksPerGrid, int threadsPerBlock) {
    cudaEvent_t start, stop;
    float elapsedTime;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);
    add<<<blocksPerGrid, threadsPerBlock>>>(devA, devB);
    subtract<<<blocksPerGrid, threadsPerBlock>>>(devA, devB);
    multiply<<<blocksPerGrid, threadsPerBlock>>>(devA, devB);
    divide<<<blocksPerGrid, threadsPerBlock>>>(devA, devB);
    modu<<<blocksPerGrid, threadsPerBlock>>>(devA, devB);
    neg<<<blocksPerGrid, threadsPerBlock>>>(devA);
    neg<<<blocksPerGrid, threadsPerBlock>>>(devB);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsedTime, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    return elapsedTime;
}
int main(void) {
    int a[N], b[N];
    float dur = 0;
    int *devA, *devB;
    cudaMalloc((void**) &devA, N * sizeof(int));
    cudaMalloc((void**) &devB, N * sizeof(int));
    fillArray(a, N);
    fillArray(b, N);
    cudaMemcpy(devA, a, N * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(devA, b, N * sizeof(int), cudaMemcpyHostToDevice);
    dur = duration(a, b, N, 1);
    cout << "Global memory version:\n";
    cout << "Process completed in " << dur;
    cout << " for a data set of " << N << " integers.";
    return 0;
}
Milisecond always return zero. Why? What I'm missing here? If a i remove the neg functions from the duration duration function. It returns 0.15687 ms. I think it is a small number to process these functions. whats wrong with that program?
After edit, I did this:
using namespace std;
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
const int N = 8000;
void fillArray(int *data, int count) {
    for (int i = 0; i < count; i++)
        data[i] = rand() % 100;
}
__global__ void add(int* a, int *b, int *c) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N) {
        c[tid] = a[tid] + b[tid];
    }
}
__global__ void subtract(int* a, int *b, int *c) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N) {
        c[tid] = a[tid] - b[tid];
    }
}
__global__ void multiply(int* a, int *b, int *c) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N) {
        c[tid] = a[tid] * b[tid];
    }
}
__global__ void divide(int* a, int *b, int *c) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N) {
        c[tid] = a[tid] / b[tid];
    }
}
__global__ void modu(int* a, int *b, int *c) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N) {
        c[tid] = a[tid] % b[tid];
    }
}
__global__ void neg(int *data, int *c) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N) {
        c[tid] = -data[tid];
    }
}
float duration(int *devA, int *devB, int *devC, int blocksPerGrid, int threadsPerBlock) {
    cudaEvent_t start, stop;
    float elapsedTime;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);
    double hArrayC[N];
    add<<<blocksPerGrid, threadsPerBlock>>>(devA, devB,devC);
    cudaMemcpy(hArrayC,devC,N*sizeof(int),cudaMemcpyDeviceToHost);
    subtract<<<blocksPerGrid, threadsPerBlock>>>(devA, devB,devC);
    cudaMemcpy(hArrayC,devC,N*sizeof(int),cudaMemcpyDeviceToHost);
    multiply<<<blocksPerGrid, threadsPerBlock>>>(devA, devB,devC);
    cudaMemcpy(hArrayC,devC,N*sizeof(int),cudaMemcpyDeviceToHost);
    divide<<<blocksPerGrid, threadsPerBlock>>>(devA, devB,devC);
    cudaMemcpy(hArrayC,devC,N*sizeof(int),cudaMemcpyDeviceToHost);
    modu<<<blocksPerGrid, threadsPerBlock>>>(devA, devB,devC);
    cudaMemcpy(hArrayC,devC,N*sizeof(int),cudaMemcpyDeviceToHost);
    neg<<<blocksPerGrid, threadsPerBlock>>>(devA,devC);
    cudaMemcpy(hArrayC,devC,N*sizeof(int),cudaMemcpyDeviceToHost);
    neg<<<blocksPerGrid, threadsPerBlock>>>(devB,devC);
    cudaMemcpy(hArrayC,devC,N*sizeof(int),cudaMemcpyDeviceToHost);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsedTime, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    return elapsedTime;
}
int main(void) {
    int a[N], b[N],c[N];
    float dur = 0;
    int *devA, *devB,*devC;
    cudaMalloc((void**) &devA, N * sizeof(int));
    cudaMalloc((void**) &devB, N * sizeof(int));
    cudaMalloc((void**) &devC, N * sizeof(int));
    fillArray(a, N);
    fillArray(b, N);
    cudaMemcpy(devA, a, N * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(devB, b, N * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(devC, c, N * sizeof(int), cudaMemcpyHostToDevice);
    dur = duration(devA, devB, devC,N, 1);
    cout << "Global memory version:\n";
    cout << "Process completed in " << dur;
    cout << " for a data set of " << N << " integers.";
    cudaFree(devA);
    cudaFree(devB);
    return 0;
}
 
     
     
     
    