I have this code that I parallelized using OpenMP that seems to run slower than the serial version. Here's the relevant fragment of the code:
Out_props ion_out;
#pragma omp parallel for firstprivate(Egx,Egy,vi_inlet,dt,xmin,xmax,ymin,ymax,qmi,dy,Nx) private(ion_out)
for (int i=0;i<Np;i++)
{
ion_out = ApplyReflectionBC(dt,Nx,xmin,xmax,ymin,ymax,qmi,dy,vi_inlet,Egx,Egy,xi_i[2*i],xi_i[1+2*i],vi_i[2*i],vi_i[1+2*i]);
xi_o[1-1+2*i]=ion_out.xout;
xi_o[2-1+2*i]=ion_out.yout;
vi_o[1-1+2*i]=ion_out.vxout;
vi_o[2-1+2*i]=ion_out.vyout;
}
Here outprops is just a structure with 4 members of the double type. The ApplyReflectionBC functions (given below) just applies some operations for each i. All these operations are completely independent of each other. Egx and Egy are 60x60 matrices defined prior to entering this loop and vi_inlet is 2x1 vector. I've tried making ion_out a matrix of size Np to further increase independence, but that seems to make no difference. Everything else inside firstprivate is a double type defined prior to entering this loop.
I'd really appreciate any insights into why this might be running many times slower than the serial version. Thank you!
Out_props ApplyReflectionBC(double dt,int Nx,double xmin,double xmax,double ymin, double ymax,double qmp, double dy, double *vp_inlet,double *Egx,double *Egy, double xpx,double xpy,double vpx,double vpy)
{
Out_props part_out;
double Lgy=ymax-ymin;
double xp_inp[2]={xpx,xpy};
double vp_inp[2]={vpx,vpy};
double xp_out[2];
double vp_out[2];
struct vector
{
double x;
double y;
}vnmf,Ep,xnmf;
if((xp_inp[1-1]>xmin) && (xp_inp[1-1]<xmax) && (xp_inp[2-1]<ymin)) //ONLY below lower wall
{
xp_out[1-1]=xp_inp[1-1];
xp_out[2-1]=ymin;
vp_out[1-1]=vp_inp[1-1];
vp_out[2-1]=-vp_inp[2-1];
}
else if((xp_inp[1-1]<xmin) || (xp_inp[1-1]>xmax) || (xp_inp[2-1]>ymax))
{//Simple Boris Push
xnmf.x=xmin;
xnmf.y=ymin+Lgy*rand()/RAND_MAX;
vnmf.x=vp_inlet[0];
vnmf.y=vp_inlet[1];
//Find E field at x,y
double yjp=ymin+dy*floor((xnmf.y-ymin)/(1.0*dy));
double yjp1p=yjp+dy;
int kp=(yjp-ymin)/dy;
int kpp1=kp+1;
double ylg=xnmf.y-yjp;
double wjk=1.0*(dy-ylg)/(1.0*dy);
double wjkp1=1.0*ylg/(1.0*dy);
Ep.x=wjk*Egx[Nx*kp]+wjkp1*Egx[Nx*kpp1];
Ep.y=wjk*Egy[Nx*kp]+wjkp1*Egy[Nx*kpp1];
do
{
double f=1.0*rand()/RAND_MAX;
xp_out[1-1]=xnmf.x+f*dt*(vnmf.x+qmp*Ep.x*f*dt/2.0);
xp_out[2-1]=xnmf.y+f*dt*(vnmf.y+qmp*Ep.y*f*dt/2.0);
vp_out[1-1]=vnmf.x+qmp*Ep.x*(f-0.5)*dt;
vp_out[2-1]=vnmf.y+qmp*Ep.y*(f-0.5)*dt;
} while((xp_out[1-1]<xmin) || (xp_out[1-1]>xmax) || (xp_out[2-1]<ymin) || (xp_out[2-1]>ymax));
}
else
{
xp_out[1-1]=xp_inp[1-1];
xp_out[2-1]=xp_inp[2-1];
vp_out[1-1]=vp_inp[1-1];
vp_out[2-1]=vp_inp[2-1];
}
part_out.xout=xp_out[0];
part_out.yout=xp_out[1];
part_out.vxout=vp_out[0];
part_out.vyout=vp_out[1];
return part_out;
}