I am trying to measure the GPU performance difference between allocating memory with malloc inside a kernel and using storage pre-allocated on the host with cudaMalloc. To do this, I have two kernel functions, one that uses in-kernel malloc and one that uses a pre-allocated array, and I time the execution of each repeatedly.
The problem is that the first execution of each kernel takes between 400 and 2,500 microseconds, but all subsequent runs take about 15 to 30 microseconds.
Is this behavior expected, or am I witnessing some sort of carryover effect from previous runs? If this is carryover, what can I do to prevent it?
I have tried adding a kernel that zeros out all memory on the GPU between each timed test run to eliminate any carryover, but nothing changed. I have also tried reversing the order in which I run the tests, and that has no effect on relative or absolute execution times.
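If the slow first run is just one-time CUDA context or module initialization, I assume an explicit warm-up before the timed region would absorb it. This is only an untested sketch of what I mean (warmup and warmupDevice are names I made up):

__global__ void warmup() {}

void warmupDevice() {
    cudaFree(0);             // forces context creation
    warmup<<<1, 1>>>();      // forces first-launch setup for this module
    cudaDeviceSynchronize(); // wait until initialization has finished
}

Here is the full test code: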
#include <cstdio>
#include <cuda_runtime.h>

const int TEST_SIZE = 1000;

struct node {
    node* next;
    int data;
};

// forward declarations (definitions below)
int staticTest();
int dynamicTest();
void memClear();
int main() {
    int numTests = 5;
    for (int i = 0; i < numTests; ++i) {
        memClear();
        staticTest();
        memClear();
        dynamicTest();
    }
    return 0;
}
__global__ void staticMalloc(int* sum) {
    // fixed-size array of nodes in thread-local memory
    node head[TEST_SIZE];
    // initialize nodes
    for (int j = 0; j < TEST_SIZE; j++) {
        // assign values (no per-node allocation needed)
        head[j].next = NULL;
        head[j].data = j;
    }
    // verify creation by adding up values
    int total = 0;
    for (int j = 0; j < TEST_SIZE; j++) {
        total += head[j].data;
    }
    sum[0] = total;
}
/**
 * Host function that times a kernel using statically allocated storage.
 */
int staticTest() {
    int expectedValue = 0;
    for (int i = 0; i < TEST_SIZE; ++i) {
        expectedValue += i;
    }
    // host output vector
    int* h_sum = new int[1];
    h_sum[0] = -1;
    // device output vector
    int* d_sum;
    // vector size
    size_t bytes = sizeof(int);
    // allocate memory on device
    cudaMalloc(&d_sum, bytes);
    // only use 1 CUDA thread
    dim3 blocksize(1, 1, 1), gridsize(1, 1, 1);
    Timer runTimer; // my own host-side timer class (definition omitted)
    int runTime = 0;
    // time the static-allocation kernel
    runTimer.start();
    staticMalloc<<<gridsize, blocksize>>>(d_sum);
    cudaDeviceSynchronize(); // launches are asynchronous; wait before stopping the timer
    runTime += runTimer.lap();
    h_sum[0] = 0;
    cudaMemcpy(h_sum, d_sum, bytes, cudaMemcpyDeviceToHost);
    // verify the kernel ran by checking the sum it produced
    if (h_sum[0] != expectedValue) {
        printf("staticTest: expected %d, got %d\n", expectedValue, h_sum[0]);
    }
    cudaFree(d_sum);
    delete[] h_sum;
    return 0;
}
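In case my host-side Timer is part of the problem, I also considered timing with CUDA events instead, which I believe would look roughly like this (a sketch reusing the names from staticTest, not yet wired into the tests above):

cudaEvent_t startEv, stopEv;
cudaEventCreate(&startEv);
cudaEventCreate(&stopEv);
cudaEventRecord(startEv);
staticMalloc<<<gridsize, blocksize>>>(d_sum);
cudaEventRecord(stopEv);
cudaEventSynchronize(stopEv); // block until the kernel has finished
float ms = 0.0f;
cudaEventElapsedTime(&ms, startEv, stopEv); // elapsed time in milliseconds
cudaEventDestroy(startEv);
cudaEventDestroy(stopEv);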
__global__ void dynamicMalloc(int* sum) {
    // start a linked list
    node* headPtr = (node*) malloc(sizeof(node));
    headPtr->data = 0;
    headPtr->next = NULL;
    node* curPtr = headPtr;
    // add nodes to test in-kernel malloc
    for (int j = 1; j < TEST_SIZE; j++) {
        // allocate a node & assign values
        node* nodePtr = (node*) malloc(sizeof(node));
        nodePtr->data = j;
        nodePtr->next = NULL;
        // add it to the linked list
        curPtr->next = nodePtr;
        curPtr = nodePtr;
    }
    // verify creation by adding up values
    curPtr = headPtr;
    int total = 0;
    while (curPtr != NULL) {
        // accumulate the current value, then advance
        total += curPtr->data;
        curPtr = curPtr->next;
        // free the node we just visited
        free(headPtr);
        headPtr = curPtr;
    }
    sum[0] = total;
}
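If it is relevant: I believe in-kernel malloc draws from a fixed-size device heap (8 MB by default), separate from storage obtained via cudaMalloc. My understanding is that the heap can be enlarged with cudaDeviceSetLimit before launching any kernel that uses malloc; something like this (64 MB is an arbitrary example size):

cudaDeviceSetLimit(cudaLimitMallocHeapSize, 64 * 1024 * 1024);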
/**
 * Host function that times a kernel using in-kernel dynamic allocation.
 */
int dynamicTest() {
    // host output vector
    int* h_sum = new int[1];
    h_sum[0] = -1;
    // device output vector
    int* d_sum;
    // vector size
    size_t bytes = sizeof(int);
    // allocate memory on device
    cudaMalloc(&d_sum, bytes);
    // only use 1 CUDA thread
    dim3 blocksize(1, 1, 1), gridsize(1, 1, 1);
    Timer runTimer; // my own host-side timer class (definition omitted)
    int runTime = 0;
    // time the dynamic-allocation kernel
    runTimer.start();
    dynamicMalloc<<<gridsize, blocksize>>>(d_sum);
    cudaDeviceSynchronize(); // wait for the kernel before stopping the timer
    runTime += runTimer.lap();
    h_sum[0] = 0;
    cudaMemcpy(h_sum, d_sum, bytes, cudaMemcpyDeviceToHost);
    cudaFree(d_sum);
    delete[] h_sum;
    return 0;
}
__global__ void clearMemory(char *zeros) {
    int i = threadIdx.x + blockDim.x * blockIdx.x;
    zeros[i] = 0;
}
void memClear() {
    char* zeros[1024] = { NULL }; // device pointers, 4 MB each
    for (int i = 0; i < 1024; ++i) {
        // grab device memory 4 MB at a time, stopping when allocation fails
        if (cudaMalloc((void**) &zeros[i], 4 * 1024 * 1024) != cudaSuccess) {
            zeros[i] = NULL;
            break;
        }
        // one thread per byte: 4096 blocks of 1024 threads
        // (blocks are capped at 1024 threads)
        clearMemory<<<4 * 1024, 1024>>>(zeros[i]);
    }
    for (int i = 0; i < 1024; ++i) {
        cudaFree(zeros[i]); // cudaFree(NULL) is a harmless no-op
    }
}
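None of the CUDA calls above check their return status, so I was also planning to wrap them in a small macro of my own (sketch below) to rule out a hidden error skewing the timings:

#define CUDA_CHECK(call)                                         \
    do {                                                         \
        cudaError_t err = (call);                                \
        if (err != cudaSuccess) {                                \
            printf("CUDA error %s at %s:%d\n",                   \
                   cudaGetErrorString(err), __FILE__, __LINE__); \
        }                                                        \
    } while (0)

// usage: CUDA_CHECK(cudaMalloc(&d_sum, bytes));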