I'm writing a CUDA kernel scheduler. The scheduler gets a vector of Task pointers and brings them to execution. The pointers point to KernelTask objects of different type parameters to support kernels with arbitrary parameters. 
There's a CPU version of the Scheduler and a GPU version. The CPU version works just fine. It calls the virtual function Task::start to execute a Kernel. The GPU version has three problems:
- Virtual functions are not allowed in CUDA. How can I avoid them without down casting?
- std::get is a host function. Is there a way to implement std::get myself for the GPU?
- (Low priority) Because KernelTaskobjects are of diffrent size I copy all of them seperatly withcopyToGPU(). Is there a way for batch copying?
Here is the code:
// see http://stackoverflow.com/questions/7858817/unpacking-a-tuple-to-call-a-matching-function-pointer
template<int ...>
struct seq { };
template<int N, int ...S>
struct gens : gens<N-1, N-1, S...> { };
template<int ...S>
struct gens<0, S...> {
  typedef seq<S...> type;
};
class Task {
private:
    bool visited;
    bool reached;
protected:
    std::vector<std::shared_ptr<Task>> dependsOn;
    Task();
public:
    Task **d_dependsOn = NULL;
    int d_dependsOnSize;
    Task *d_self = NULL;
    int streamId;
    int id;
    cudaStream_t stream;
    virtual void copyToGPU() = 0;
    virtual void start() = 0;
    virtual void d_start() = 0;
    virtual ~Task() {}
    void init();
    void addDependency(std::shared_ptr<Task> t);
    cudaStream_t dfs();
};
template<typename... Args>
class KernelTask : public Task {
private:
    std::tuple<Args...> params;
    dim3 threads;
    dim3 blocks;
    void (*kfp)(Args...);
    template<int ...S>
    void callFunc(seq<S...>) {
        // inserting task into stream
        this->kfp<<<this->blocks, this->threads, 0, this->stream>>>(std::get<S>(params) ...);
        checkCudaErrors(cudaGetLastError());
        if (DEBUG) printf("Task %d: Inserting Task in Stream.\n", this->id);
    }
    template<int ...S>
    __device__ void d_callFunc(seq<S...>) {
        // inserting task into stream
        this->kfp<<<this->blocks, this->threads, 0, this->stream>>>(std::get<S>(params) ...);
        if (DEBUG) printf("Task %d: Inserting Task in Stream.\n", this->id);
    }
    KernelTask(int id, void (*kfp)(Args...), std::tuple<Args...> params, dim3 threads, dim3 blocks);
public:
    ~KernelTask();
    void copyToGPU();
    void start() override {
        callFunc(typename gens<sizeof...(Args)>::type());
    }
    __device__ void d_start() override {
        d_callFunc(typename gens<sizeof...(Args)>::type());
    }
    static std::shared_ptr<KernelTask<Args...>> create(int id, void (*kfp)(Args...), std::tuple<Args...> params, dim3 threads, dim3 blocks);
};
class Scheduler {
private:
    std::vector<std::shared_ptr<Task>> tasks;
public:
    Scheduler(std::vector<std::shared_ptr<Task>> &tasks) {
        this->tasks = tasks;
    }
    void runCPUScheduler();
    void runGPUScheduler();
};
EDIT:
(1) Virtual Functions in CUDA: I get a Warp Illegal Address exception in scheduler in the following example:
struct Base {
    __host__ __device__ virtual void start() = 0;
    virtual ~Base() {}
};
struct Derived : Base {
    __host__ __device__ void start() override {
        printf("In start\n");
    }
};
__global__ void scheduler(Base *c) {
    c->start();
}
int main(int argc, char **argv) {
    Base *c = new Derived();
    Base *d_c;
    checkCudaErrors(cudaMalloc(&d_c, sizeof(Derived)));
    checkCudaErrors(cudaMemcpy(d_c, c, sizeof(Derived), cudaMemcpyHostToDevice));
    c->start();
    scheduler<<<1,1>>>(d_c);
    checkCudaErrors(cudaFree(d_c));
    return 0;
}
(2) thrust::tuple works fine.
(3) I'm open to suggestions.
(4) How do I pass a kernel function pointer to a kernel? I get a Warp Misaligned Address exception in the following example:
__global__ void baz(int a, int b) {
    printf("%d + %d = %d\n", a, b, a+b);
}
void schedulerHost(void (*kfp)(int, int)) {
    kfp<<<1,1>>>(1,2);
}
__global__ void schedulerDevice(void (*kfp)(int, int)) {
    kfp<<<1,1>>>(1,2);
}
int main(int argc, char **argv) {
    schedulerHost(&baz);
    schedulerDevice<<<1,1>>>(&baz);
    return 0;
}
 
    