I have tried to use a polynomial class in CUDA. The class definition is as follow:
template<int Degree>
class Polynomial{
    public:
        float coefficients[Degree+1];
};
template<int Degree>
class StartingPolynomial{
    public:
        Polynomial<Degree> p;
        float start;
};
template<int Degree>
class PPolynomial{
    public:
        size_t polyCount;
        StartingPolynomial<Degree>* polys;
};
The output of PPolynomial class is something like:
[-0.750000,-0.250000]   1.5000 x^0 +4.0000 x^1 +2.6667 x^2 
[-0.250000,0.250000]    1.0000 x^0 +0.0000 x^1 -5.3333 x^2 
[0.250000,0.750000] 1.5000 x^0 -4.0000 x^1 +2.6667 x^2 
[0.750000,Infinity] 0.0000 x^0 +0.0000 x^1 +0.0000 x^2 
I write a __device__ function try to change the data member in __global__ function. But I failed and got warned that is an illegal access.
ERROR: /home/davidxu/CLionProjects/Practice/main.cu:973,code:700,reason:an illegal memory access was encountered
I write a test program and I still can't figure out how to change the data member.
Program:
#include "cuda.h"
#include "cstdio"
#include "cuda_runtime.h"
template<int Degree>
class Polynomial{
    public:
        float coefficients[Degree+1];
};
template<int Degree>
class StartingPolynomial{
    public:
        Polynomial<Degree> p;
        float start;
};
template<int Degree>
class PPolynomial{
    public:
        size_t polyCount;
        StartingPolynomial<Degree>* polys;
};
template<int Degree>
__device__ void scale(PPolynomial<Degree> *pp,const float& scale){
    for(int i=0;i<pp->polyCount;++i){
        printf("change start\n");
        printf("start is %f\n",pp->polys[i].start);
        atomicExch(&pp->polys[i].start,scale*pp->polys[i].start);
        printf("start ok\n");
        float s2=1.0;
        for(int j=0;j<=Degree;++j){
            printf("change polys\n");
            pp->polys[i].p.coefficients[j]*=s2;
            printf("polys ok\n");
            s2/=scale;
        }
    }
}
__global__ void test(PPolynomial<2> *pp){
    scale(pp,0.5);
}
int main(){
    PPolynomial<2> pp;
    pp.polyCount=2;
    pp.polys=(StartingPolynomial<2>*)malloc(sizeof(StartingPolynomial<2>)*pp.polyCount);
    pp.polys[0].start=-1;
    /* pp.polys.p.coefficients[0]=1; */
    pp.polys[0].start=1;
    /* pp.polys.p.coefficients[0]=2; */
    PPolynomial<2> *pd=NULL;
    cudaMalloc((PPolynomial<2>**)&pd,sizeof(pp));
    cudaMemcpy(pd,&pp,sizeof(pp),cudaMemcpyHostToDevice);
    test<<<1,1>>>(pd);
    cudaDeviceSynchronize();
}
output:
change start
I try to pass the pram to __global__ function by input the address of device variable, but the kernel function seems to be dead at last. That makes me really confused.
How can I solve my problem?
Edit 1:
I find out that the pointer pd->polys points to host address(Although pd is a pointer to device address).Is there an elegant way to copy the whole PPolynomial object from host to device?
