I'm trying to implement string matching program with CUDA in C and I have th following issue.
When I set 1 block and 1 thread per block the result for pattern dfh is 2. That's correct, but when I increase the blocks the result is 4.
The text file is:
ffskdfhksdjhfksdfksjdfhksdhfksjdhfkjer654yrkhjkfgjhdsrtrhkjchgkjthyoirthygfnbkjgkjdhykhkjchgkjfdhsfykhkbhkjfghkfgjy
This is my code:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cuda.h>
__global__ void string_matching(char *buffer, char *pattern, int match_size, int pattern_size, int *result){
    int tid, i;
    __shared__ int local_matches;
    if(threadIdx.x == 0) local_matches = 0;
    __syncthreads();
    for(tid=blockIdx.x*blockDim.x+threadIdx.x; tid<match_size; tid+=blockDim.x){
        for (i = 0; i < pattern_size && pattern[i] == buffer[i + tid]; ++i);
        if(i >= pattern_size){
            atomicAdd(&local_matches, 1);
        }
    }
    __syncthreads();
    if(threadIdx.x == 0) 
        atomicAdd(result, local_matches);
}
int main(int argc, char *argv[]){
    FILE *pFile;
    long file_size, match_size, pattern_size;
    char * buffer;
    char * filename, *pattern;
    size_t result;
    int *match, total_matches;
    //CUDA variables
    int blocks, threads_per_block;
    int *result_dev;
    char *buffer_dev, *pattern_dev;
    float total_time, comp_time;
    cudaEvent_t total_start, total_stop, comp_start, comp_stop;
    cudaEventCreate(&total_start);
    cudaEventCreate(&total_stop);
    cudaEventCreate(&comp_start);
    cudaEventCreate(&comp_stop);
    if (argc != 5) {
        printf ("Usage : %s <file_name> <string> <blocks> <threads_per_block>\n", argv[0]);
        return 1;
    }
    filename = argv[1];
    pattern = argv[2];
    blocks = strtol(argv[3], NULL, 10);
    threads_per_block = strtol(argv[4], NULL, 10);
    
    pFile = fopen ( filename , "rb" );
    if (pFile==NULL) {printf ("File error\n"); return 2;}
    // obtain file size:
    fseek (pFile , 0 , SEEK_END);
    file_size = ftell (pFile);
    rewind (pFile);
    printf("file size is %ld\n", file_size);
    
    // allocate memory to contain the file:
    buffer = (char*) malloc (sizeof(char)*file_size);
    if (buffer == NULL) {printf ("Memory error\n"); return 3;}
    // copy the file into the buffer:
    result = fread (buffer,1,file_size,pFile);
    if (result != file_size) {printf ("Reading error\n"); return 4;} 
    
    pattern_size = strlen(pattern);
    match_size = file_size - pattern_size + 1;
    
    match = (int *) malloc (sizeof(int)*match_size);
    if (match == NULL) {printf ("Malloc error\n"); return 5;}
    cudaMalloc((void **)&result_dev, sizeof(int));
    cudaMalloc((void **)&buffer_dev, file_size*sizeof(char));
    cudaMalloc((void **)&pattern_dev, pattern_size*sizeof(char));
    cudaEventRecord(total_start);
    cudaEventRecord(comp_start);
    cudaMemcpy(buffer_dev, buffer, file_size*sizeof(char), cudaMemcpyHostToDevice);
    cudaMemcpy(pattern_dev, pattern, pattern_size*sizeof(char), cudaMemcpyHostToDevice);
    string_matching<<<blocks, threads_per_block>>>(buffer_dev, pattern_dev, match_size, pattern_size, result_dev);
    cudaThreadSynchronize();
    cudaEventRecord(comp_stop);
    cudaEventSynchronize(comp_stop);
    cudaEventElapsedTime(&comp_time, comp_start, comp_stop);
    cudaMemcpy(&total_matches, result_dev, sizeof(int), cudaMemcpyDeviceToHost);
    cudaEventRecord(total_stop);
    cudaEventSynchronize(total_stop);
    cudaEventElapsedTime(&total_time, total_start, total_stop);
    cudaFree(result_dev);
    cudaFree(buffer_dev);
    cudaFree(pattern_dev);
    fclose (pFile);
    free (buffer);
    //Print result
    printf("Total matches: %d\n", total_matches);
    printf("\n\n\nN: %d, Blocks: %d, Threads: %d\n", file_size, blocks, blocks*threads_per_block);
    printf("Total time (ms): %.3f\n", total_time);
    printf("Kernel time (ms): %.3f\n", comp_time);
    printf("Data transfer time(ms): %.3f\n\n\n", total_time-comp_time);
}
 
     
    