I have the following problem (keep in mind that I am fairly new to programming with CUDA).
I have a class called vec3f that is just like the float3 data type but with overloaded operators, and other vector functions.  These functions are prefixed with __device__ __host__.  Then, in my kernel I do a nested for loop over block_x and block_y indices and do something like,
//set up shared memory block
extern __shared__ vec3f share[];
vec3f *sh_pos = share;
vec3f *sh_velocity = &sh_pos[blockDim.x*blockDim.y];
sh_pos[blockDim.x * threadIdx.x + threadIdx.y] = oldParticles[index].position();
sh_velocity[blockDim.x * threadIdx.x + threadIdx.y] = oldParticles[index].velocity();
__syncthreads();
In the above code, oldParticles is a pointer to a class called particles that is being passed to the kernel.  OldParticles is actually an underlying pointer of a thrust::device_vector<particle> (I'm not sure if this has something to do with it).  Everything compiles okay but when I run I get the error
libc++abi.dylib: terminate called throwing an exception
Abort trap: 6
Thanks for the replies. I think the error had to do with me not allocating room for the arguments being passed to my kernel. Doing the following in my host code fixed this error,
particle* particle_ptrs[2];
particle_ptrs[0] = thrust::raw_pointer_cast(&d_old_particles[0]);
particle_ptrs[1] = thrust::raw_pointer_cast(&d_new_particles[0]);
CUDA_SAFE_CALL( cudaMalloc( (void**)&particle_ptrs[0], max_particles * sizeof(particle) ) );
CUDA_SAFE_CALL( cudaMalloc( (void**)&particle_ptrs[1], max_particles * sizeof(particle) ) );
The kernel call is then,
force_kernel<<< grid,block,sharedMemSize  >>>(particle_ptrs[0],particle_ptrs[1],time_step);
The issue that I am having now seems to be that I can't get data copied back to the host from the device. I think this has to do with me not being familiar with Thrust.
I'm doing a series of copies as follows,
//make a host vector assume this is initialized
thrust::host_vector<particle> h_particles;
thrust::device_vector<particle> d_old_particles, d_new_particles;
d_old_particles = h_particles;
//launch kernel as shown above 
//with thrust vectors having been casted into their underlying pointers
//particle_ptrs[1] gets modified and so shouldnt d_new_particles?
//copy back
h_particles = d_new_particles;
So I guess my question is, can I modify a Thrust device_vector in a kernel (in this case particle_pters[0]) save the modification to another Thrust device_vector in the kernel (in this case particle_pters[1]) and then once I exit from the kernel, copy that to a host_vector?
I still can't get this to work. I made a shorter example where I am having the same problem,
#include <iostream>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include "vec3f.h"
const int BLOCK_SIZE = 8;
const int max_particles = 64;
const float dt = 0.01;
using namespace std;
//particle class
class particle {
public:
  particle() : 
    _velocity(vec3f(0,0,0)), _position(vec3f(0,0,0)), _density(0.0) {
  };
  particle(const vec3f& pos, const vec3f& vel) :
    _position(pos), _velocity(vel), _density(0.0) {
  };
  vec3f _velocity;
  vec3f _position;
  float _density;
};
//forward declaration of kernel func
__global__ void kernel_func(particle* old_parts, particle* new_parts, float dt);
//global thrust vectors
thrust::host_vector<particle> h_parts;
thrust::device_vector<particle> old_parts, new_parts;
particle* particle_ptrs[2];
int main() {
  //load host vector
  for (int i =0; i<max_particles; i++) {
    h_parts.push_back(particle(vec3f(0.5,0.5,0.5),vec3f(10,10,10)));
  }
  particle_ptrs[0] = thrust::raw_pointer_cast(&old_parts[0]);
  particle_ptrs[1] = thrust::raw_pointer_cast(&new_parts[0]);
  cudaMalloc( (void**)&particle_ptrs[0], max_particles * sizeof(particle) );
  cudaMalloc( (void**)&particle_ptrs[1], max_particles * sizeof(particle) );
  //copy host particles to old device particles...
  old_parts = h_parts;
  //kernel block and grid dimensions
  dim3 block(BLOCK_SIZE,BLOCK_SIZE,1);
  dim3 grid(int(sqrt(float(max_particles) / (float(block.x*block.y)))), int(sqrt(float(max_particles) / (float(block.x*block.y)))), 1);
  kernel_func<<<block,grid>>>(particle_ptrs[0],particle_ptrs[1],dt);
  //copy new device particles back to host particles
  h_parts = new_parts;
  for (int i =0; i<max_particles; i++) {
    particle temp1 = h_parts[i];
    cout << temp1._position << endl;
  }  
  //delete thrust device vectors
  old_parts.clear();
  old_parts.shrink_to_fit();
  new_parts.clear();
  new_parts.shrink_to_fit();
  return 0;
}
//kernel function
__global__ void kernel_func(particle* old_parts, particle* new_parts, float dt) {
  unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
  //get array position for 2d grid...
  unsigned int arr_pos = y*blockDim.x*gridDim.x + x;
  new_parts[arr_pos]._velocity = old_parts[arr_pos]._velocity * 10.0 * dt;
  new_parts[arr_pos]._position = old_parts[arr_pos]._position * 10.0 * dt;
  new_parts[arr_pos]._density = old_parts[arr_pos]._density * 10.0 * dt;
}
So the host_vector has an initial position of (0.5,0.5,0.5) for all 64 particles.  Then the kernel attempts to multiply that by 10 to give (5,5,5) as the position for all particles.  But I don't see this when I cout the data.  It is still just (0.5,0.5,0.5).  Is there a problem with how I am allocating memory?  Is there a problem with the lines:
  //copy new device particles back to host particles
  h_parts = new_parts;
What could be the issue? Thank you.
 
     
     
    