I'm running this code to sort big array of IPs using thrust stable_sort and user defined operator to compare the IPs. this code is working for arrays less than 50000 IPs, but I got a memory error for big arrays. here is the code I used:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <stdio.h>
#include <time.h>
#include <device_functions.h>
template<typename T>
struct vector_less
{
    typedef T first_argument_type;
    typedef T second_argument_type;
    typedef bool result_type;
    __host__ __device__ bool operator()(const T &lhs, const T &rhs) const {
        if (lhs[0] == rhs[0])
        if (lhs[1] == rhs[1])
        if (lhs[2] == rhs[2])
            return lhs[3] < rhs[3];
        else
            return lhs[2] < rhs[2];
        else
            return lhs[1] < rhs[1];
        else
            return lhs[0] < rhs[0];
    }
}; 
__global__ void prepare_ips_list(unsigned char ** dev_sorted_Ips, unsigned char * ip_b1, unsigned char * ip_b2, unsigned char * ip_b3, unsigned char * ip_b4, unsigned int searchedIpsSize)
{
    int thread = threadIdx.x + blockIdx.x * blockDim.x;
    if (thread < searchedIpsSize)
    {
        dev_sorted_Ips[thread] = new unsigned char[4];
        dev_sorted_Ips[thread][0] = ip_b1[thread];
        dev_sorted_Ips[thread][1] = ip_b2[thread];
        dev_sorted_Ips[thread][2] = ip_b3[thread];
        dev_sorted_Ips[thread][3] = ip_b4[thread];
    }
}
int main()
{
    const int size = 1000000;
    unsigned char * ip_b1 = new unsigned char[size];
    unsigned char * ip_b2 = new unsigned char[size];;
    unsigned char * ip_b3 = new unsigned char[size];;
    unsigned char * ip_b4 = new unsigned char[size];;
    unsigned char * dev_ip_b1;
    unsigned char * dev_ip_b2;
    unsigned char * dev_ip_b3;
    unsigned char * dev_ip_b4;
    unsigned char ** dev_sortedIps;
    for (int i = 0; i < size; i++)
    {
        ip_b1[i] = rand() % 240;
        ip_b2[i] = rand() % 240;
        ip_b3[i] = rand() % 240;
        ip_b4[i] = rand() % 240;
    }
    cudaError_t cudaStatus;
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
        goto Error;
    }
    cudaStatus = cudaMalloc((void**)&dev_ip_b1, size * sizeof(unsigned char));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    cudaStatus = cudaMemcpy(dev_ip_b1, ip_b1, size * sizeof(unsigned char), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }
    cudaStatus = cudaMalloc((void**)&dev_ip_b2, size * sizeof(unsigned char));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    cudaStatus = cudaMemcpy(dev_ip_b2, ip_b2, size * sizeof(unsigned char), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }
    cudaStatus = cudaMalloc((void**)&dev_ip_b3, size * sizeof(unsigned char));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    cudaStatus = cudaMemcpy(dev_ip_b3, ip_b3, size * sizeof(unsigned char), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }
    cudaStatus = cudaMalloc((void**)&dev_ip_b4, size * sizeof(unsigned char));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    cudaStatus = cudaMemcpy(dev_ip_b4, ip_b4, size * sizeof(unsigned char), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }
    cudaStatus = cudaMalloc((void**)&dev_sortedIps, size * sizeof(unsigned char *));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    int resetThreads = size;
    int resetBlocks = 1;
    if (size > 1024)
    {
        resetThreads = 1024;
        resetBlocks = size / 1024;
        if (size % 1024 > 0)
            resetBlocks++;
    }
    prepare_ips_list << <resetBlocks, resetThreads >> >(dev_sortedIps, dev_ip_b1, dev_ip_b2, dev_ip_b3, dev_ip_b4, size);
    thrust::device_ptr<unsigned char *> sorted_list_ptr1(dev_sortedIps);
    thrust::stable_sort(sorted_list_ptr1, sorted_list_ptr1 + size, vector_less<unsigned char *>());
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }
    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during the launch.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching !\n", cudaStatus);
        goto Error;
    }
    return 0;
Error:
    cudaFree(dev_ip_b1);
    cudaFree(dev_ip_b2);
    cudaFree(dev_ip_b3);
    cudaFree(dev_ip_b4);
    cudaFree(dev_sortedIps);
}
the error I got is : Microsoft C++ exception: thrust::system::system_error at memory location
how to solve this problem with big arrays? should I use another technique to achieve this sorting such as dividing and sorting for the parts then merging?
 
     
     
    