I currently have to run a nested loop as follows:
// Visit every index pair (i, j) with i < j; the inner index starts just past i.
for(int i = 0; i < N; i++){
    for(int j = i+1; j <= N; j++){
        compute(...)//placeholder for the per-pair calculation
    }
}
I've tried keeping the outer loop on the CPU and running the inner loop on the GPU, but this results in too many memory accesses. Is there any other way to do it, for example with thrust::reduce_by_key?
The whole program is here:
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/functional.h>
#include <thrust/generate.h>
#include <thrust/sort.h>
#include <thrust/binary_search.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/random.h>
#include <thrust/transform_reduce.h>
#include <thrust/tuple.h>
#include <cmath>
#include <ctime>
#include <iostream>
#include <iomanip>
// Number of random points generated; also the bound of the outer loop in main.
#define N 1000000
// A 2D point stored as a pair of floats (first coordinate, second coordinate).
typedef thrust::tuple<float, float> Point;
// Produce the next pseudo-random Point with both coordinates in [0,1).
// The engine (seeded with 12345) and the distribution are function-local
// statics, so successive calls continue one deterministic sequence.
Point make_point(void)
{
  static thrust::default_random_engine engine(12345);
  static thrust::uniform_real_distribution<float> unit_interval(0.0f, 1.0f);
  // Draw the two coordinates in separate statements so the first draw is
  // always the first coordinate and the second draw the second.
  const float first_coord = unit_interval(engine);
  const float second_coord = unit_interval(engine);
  return Point(first_coord, second_coord);
}
// Unary functor mapping a point to -1/d, where d is the Euclidean distance
// between that point and the reference point (x, y) given at construction.
//
// The result is -inf when the argument coincides with the reference point
// (d == 0), so callers should not feed the reference point itself.
struct sqrt_dis: public thrust::unary_function<Point, double>
{
  // Coordinates of the reference point.
  float x, y;
  sqrt_dis(float _x, float _y): x(_x), y(_y){}
  // a: the point whose distance to (x, y) is measured.
  // Returns -1/d as a double, matching the result type declared in the base.
  // The call is const and keeps all scratch values local, so invoking it
  // never modifies the functor.
  __host__ __device__
  double operator()(Point a) const
  {
    // Widen before subtracting so the squared distance is formed in double.
    const double dx = static_cast<double>(thrust::get<0>(a)) - x;
    const double dy = static_cast<double>(thrust::get<1>(a)) - y;
    return -1.0 / sqrt(dx*dx + dy*dy);
  }
};
int main(void) {
  clock_t t1, t2;
  double result;
  t1 = clock();
  // allocate some random points in the unit square on the host
  thrust::host_vector<Point> h_points(N);
  thrust::generate(h_points.begin(), h_points.end(), make_point);
  // transfer to device
  thrust::device_vector<Point> points = h_points;
  thrust::plus<double> binary_op;
  float init = 0;
  for(int i = 0; i < N; i++){
    Point tmp_i = points[i];
    float x = thrust::get<0>(tmp_i);
    float y = thrust::get<1>(tmp_i);
    result += thrust::transform_reduce(points.begin()+i,\
                                       points.end(),sqrt_dis(x,y),\
                                       init,binary_op);
    std::cout<<"result"<<i<<": "<<result<<std::endl;
  }
  t2 = clock()-t1;
  std::cout<<"result: ";
  std::cout.precision(10);
  std::cout<< result <<std::endl;
  std::cout<<"run time: "<<t2/CLOCKS_PER_SEC<<"s"<<std::endl;
  return 0;
 }