I have a C++ project
Is it possible to create a class for CUDA stuff that lives for the lifetime of the program?
Sure.  I'm not sure what the extern "C" stuff has to do with a C++ project.  CUDA is a C++-based language, so CUDA code can be organized into ordinary C++ classes.
Here is an example:
$ cat t1901.cu
#include <iostream>
#include <vector>
// Element-wise update: Y[i] += X[i] * (*scale) for every i in [0, N).
// Each thread handles the single element selected by its flattened 1D index;
// threads whose index is at or beyond N do nothing.
// X, Y and scale are dereferenced inside the kernel.
__global__ void my_work(int N, double *X, double *Y, double *scale){
  const int gid = threadIdx.x + blockDim.x * blockIdx.x;
  if (gid >= N)
    return;
  Y[gid] += X[gid] * scale[0];
}
// Keeps one double ("persistent") resident in device memory for as long as the
// object exists, and uses it as the scale factor in doStuff().
class stuffHandler
{
  double persistent;
  double *dev_persistent = NULL;
  // Prints a diagnostic for a failed CUDA runtime call.
  // Returns true when `status` is cudaSuccess, false otherwise.
  static bool succeeded(cudaError_t status, const char *what){
    if (status == cudaSuccess) return true;
    std::cerr << "stuffHandler: " << what << " failed: "
              << cudaGetErrorString(status) << std::endl;
    return false;}
  public:
    // Allocates the device copy of persistent_ and uploads the value.
    // If either step fails, the object ends up without a device buffer and
    // doStuff() becomes a no-op.
    stuffHandler(double persistent_) : persistent(persistent_) {
      if (!succeeded(cudaMalloc(&dev_persistent, sizeof(double)), "cudaMalloc(dev_persistent)")) {
        dev_persistent = NULL;
        return;}
      if (!succeeded(cudaMemcpy(dev_persistent, &persistent, sizeof(double), cudaMemcpyHostToDevice),
                     "cudaMemcpy(dev_persistent)")) {
        cudaFree(dev_persistent);
        dev_persistent = NULL;}}
    // The destructor frees dev_persistent, so a copied object would free the
    // same pointer a second time. Copying is therefore disabled.
    stuffHandler(const stuffHandler &) = delete;
    stuffHandler &operator=(const stuffHandler &) = delete;
    // Computes arg2[i] += arg1[i] * persistent for i in [0, N) on the GPU.
    // arg1 and arg2 are host arrays holding at least N doubles.
    // arg2 is left untouched if any step before the copy-back fails.
    void doStuff(int N, double *arg1, double *arg2){
      // N == 0 would request a zero-block grid, which is an invalid launch.
      if (N <= 0 || !dev_persistent) return;
      const size_t bytes = static_cast<size_t>(N)*sizeof(double);
      double *d_arg1 = NULL, *d_arg2 = NULL;
      bool good = succeeded(cudaMalloc(&d_arg1, bytes), "cudaMalloc(d_arg1)")
               && succeeded(cudaMalloc(&d_arg2, bytes), "cudaMalloc(d_arg2)")
               && succeeded(cudaMemcpy(d_arg1, arg1, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(d_arg1)")
               && succeeded(cudaMemcpy(d_arg2, arg2, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(d_arg2)");
      if (good) {
        // (N-1)/256+1 is the same ceil-division as (N+255)/256 for N > 0,
        // without overflowing int when N is close to INT_MAX.
        my_work<<<(N-1)/256+1, 256>>>(N, d_arg1, d_arg2, dev_persistent);
        good = succeeded(cudaGetLastError(), "my_work launch")
            && succeeded(cudaMemcpy(arg2, d_arg2, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(arg2)");
      }
      // Runs on every path so the temporaries never leak; cudaFree(NULL) is a no-op.
      cudaFree(d_arg1);
      cudaFree(d_arg2);}
    // Releases the device copy of the scale factor, if one was allocated.
    ~stuffHandler(){if (dev_persistent) cudaFree(dev_persistent);}
};
// Demo driver: builds a handler with scale 1.5, runs doStuff on two
// four-element vectors (filled with 0.1 and 0.2), and prints the first
// element of the updated second vector.
int main(){
  const int my_N = 4;
  const double scale = 1.5;
  stuffHandler stuff_handler(scale);
  std::vector<double> v1(my_N, 0.1);
  std::vector<double> v2(my_N, 0.2);
  double *input = v1.data();
  double *in_out = v2.data();
  stuff_handler.doStuff(my_N, input, in_out);
  std::cout << v2.front() << std::endl;
}
$ nvcc -o t1901 t1901.cu
$ compute-sanitizer ./t1901
========= COMPUTE-SANITIZER
0.35
========= ERROR SUMMARY: 0 errors
$
Responding to a question in the comments, you could rearrange the above as follows for a typical multi-module project implementation:
$ cat t1901.h
// Host-side handle around a single double that is also kept at the address
// stored in dev_persistent. The member functions are declared here and
// defined in a separate translation unit.
class stuffHandler
{
  double persistent;              // host copy of the value
  double *dev_persistent = NULL;  // released by the destructor
  public:
    // Creates the handler for the given value.
    stuffHandler(double persistent_);
    // The destructor releases dev_persistent, so a copy of the object would
    // release the same pointer twice. Copying is therefore disabled.
    stuffHandler(const stuffHandler &) = delete;
    stuffHandler &operator=(const stuffHandler &) = delete;
    // Processes N elements: arg1 is an input array, arg2 is updated in place.
    void doStuff(int N, double *arg1, double *arg2);
    ~stuffHandler();
};
$ cat t1901.cu
#include "t1901.h"
// Adds X[i] scaled by *scale into Y[i] for each i below N.
// The element index is the thread's flattened 1D position in the grid;
// threads past the end of the data skip the update.
__global__ void my_work(int N, double *X, double *Y, double *scale){
  const int elem = blockDim.x*blockIdx.x + threadIdx.x;
  const bool in_range = elem < N;
  if (in_range) {
    Y[elem] += X[elem] * (*scale);
  }
}
// Stores persistent_ on the host, allocates room for one double on the
// device, and copies the value there.
stuffHandler::stuffHandler(double persistent_) : persistent(persistent_) {
      const size_t one_double = sizeof(double);
      cudaMalloc(&dev_persistent, one_double);
      cudaMemcpy(dev_persistent, &persistent, one_double, cudaMemcpyHostToDevice);
}
// Computes arg2[i] += arg1[i] * (value at dev_persistent) for i in [0, N).
// arg1 and arg2 are host arrays holding at least N doubles. Both are staged
// in temporary device buffers, and arg2 is overwritten with the result.
// Does nothing when N <= 0. arg2 is left untouched if an allocation, an
// upload, or the kernel launch fails.
void stuffHandler::doStuff(int N, double *arg1, double *arg2){
      // N == 0 would request a zero-block grid, which is an invalid launch.
      if (N <= 0) return;
      const size_t bytes = static_cast<size_t>(N)*sizeof(double);
      double *d_arg1 = NULL, *d_arg2 = NULL;
      const bool staged = cudaMalloc(&d_arg1, bytes) == cudaSuccess
                       && cudaMalloc(&d_arg2, bytes) == cudaSuccess
                       && cudaMemcpy(d_arg1, arg1, bytes, cudaMemcpyHostToDevice) == cudaSuccess
                       && cudaMemcpy(d_arg2, arg2, bytes, cudaMemcpyHostToDevice) == cudaSuccess;
      if (staged) {
        // (N-1)/256+1 is the same ceil-division as (N+255)/256 for N > 0,
        // without overflowing int when N is close to INT_MAX.
        my_work<<<(N-1)/256+1, 256>>>(N, d_arg1, d_arg2, dev_persistent);
        // Only copy back when the launch itself was accepted.
        if (cudaGetLastError() == cudaSuccess)
          cudaMemcpy(arg2, d_arg2, bytes, cudaMemcpyDeviceToHost);
      }
      // Runs on every path so the temporaries never leak; cudaFree(NULL) is a no-op.
      cudaFree(d_arg1);
      cudaFree(d_arg2);}
// Frees the device buffer if one is held.
stuffHandler::~stuffHandler(){
      if (!dev_persistent) return;
      cudaFree(dev_persistent);
}
$ cat main.cpp
#include <iostream>
#include <vector>
#include "t1901.h"
// Demo driver for the multi-module build: creates a handler with scale 1.5,
// applies it to two four-element vectors, and prints element 0 of the result.
int main(){
  const int my_N = 4;
  const double scale = 1.5;
  stuffHandler stuff_handler(scale);
  std::vector<double> v1(my_N, 0.1);
  std::vector<double> v2(my_N, 0.2);
  stuff_handler.doStuff(my_N, &v1[0], &v2[0]);
  const double first = v2[0];
  std::cout << first << std::endl;
}
$ nvcc -o t1901 t1901.cu main.cpp
$ compute-sanitizer ./t1901
========= COMPUTE-SANITIZER
0.35
========= ERROR SUMMARY: 0 errors
$
If you wanted to break the compilation and linking steps apart, you could do:
nvcc -c t1901.cu
g++ -c main.cpp
g++ main.o t1901.o -o test -L/usr/local/cuda/lib64 -lcudart
Or, in an MPI project, replace g++ in the commands above with mpicxx.
After some additional discussion in the comments: contrary to your question title and first sentence, you actually have a C project, not C++ (you want to do the final link with mpicc, which is a C compiler).
In that case we could lay out the above code somewhat differently, and refer to some instructions here for getting all of our C++ linking in order.  Here is another example:
$ cat t1902.h
/* C-callable entry points. When this header is compiled as C++ the
 * declarations get C linkage; when compiled as C they are plain prototypes. */
#ifndef T1902_H
#define T1902_H
#ifdef __cplusplus
extern "C" {
#endif
/* Sets up the handler with the given scale factor. */
void C_init(double scale);
/* Processes N elements: arg1 is an input array, arg2 is updated in place. */
void C_doStuff(int N, double *arg1, double *arg2);
/* Tears the handler down. Declared with (void) so that C callers get a real
 * prototype; an empty parameter list in C leaves the arguments unchecked. */
void C_end(void);
#ifdef __cplusplus
}
#endif
#endif /* T1902_H */
$ cat t1902.cu
#include <iostream>
#include <vector>
// Per-element scaled accumulate: Y[i] = Y[i] + X[i] * (*scale), i in [0, N).
// Indexing is 1D (x dimension only); threads indexed at or above N return early.
__global__ void my_work(int N, double *X, double *Y, double *scale){
  const int i = threadIdx.x + blockDim.x*blockIdx.x;
  if (!(i < N))
    return;
  double *dst = Y + i;
  *dst += X[i] * (*scale);
}
// Keeps one double ("persistent") resident in device memory and uses it as
// the scale factor in doStuff(). The device buffer is released by finish()
// or, failing that, by the destructor.
class stuffHandler
{
  double persistent;
  double *dev_persistent = NULL;
  // Prints a diagnostic for a failed CUDA runtime call.
  // Returns true when `status` is cudaSuccess, false otherwise.
  static bool succeeded(cudaError_t status, const char *what){
    if (status == cudaSuccess) return true;
    std::cerr << "stuffHandler: " << what << " failed: "
              << cudaGetErrorString(status) << std::endl;
    return false;}
  public:
    // Allocates the device copy of persistent_ and uploads the value.
    // If either step fails, the object ends up without a device buffer and
    // doStuff() becomes a no-op.
    stuffHandler(double persistent_) : persistent(persistent_) {
      if (!succeeded(cudaMalloc(&dev_persistent, sizeof(double)), "cudaMalloc(dev_persistent)")) {
        dev_persistent = NULL;
        return;}
      if (!succeeded(cudaMemcpy(dev_persistent, &persistent, sizeof(double), cudaMemcpyHostToDevice),
                     "cudaMemcpy(dev_persistent)")) {
        cudaFree(dev_persistent);
        dev_persistent = NULL;}}
    // The object owns dev_persistent; a copy would free the same pointer twice.
    stuffHandler(const stuffHandler &) = delete;
    stuffHandler &operator=(const stuffHandler &) = delete;
    // Computes arg2[i] += arg1[i] * persistent for i in [0, N) on the GPU.
    // arg1 and arg2 are host arrays holding at least N doubles.
    // arg2 is left untouched if any step before the copy-back fails.
    void doStuff(int N, double *arg1, double *arg2){
      // N == 0 would request a zero-block grid, which is an invalid launch.
      if (N <= 0 || !dev_persistent) return;
      const size_t bytes = static_cast<size_t>(N)*sizeof(double);
      double *d_arg1 = NULL, *d_arg2 = NULL;
      bool good = succeeded(cudaMalloc(&d_arg1, bytes), "cudaMalloc(d_arg1)")
               && succeeded(cudaMalloc(&d_arg2, bytes), "cudaMalloc(d_arg2)")
               && succeeded(cudaMemcpy(d_arg1, arg1, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(d_arg1)")
               && succeeded(cudaMemcpy(d_arg2, arg2, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(d_arg2)");
      if (good) {
        // (N-1)/256+1 is the same ceil-division as (N+255)/256 for N > 0,
        // without overflowing int when N is close to INT_MAX.
        my_work<<<(N-1)/256+1, 256>>>(N, d_arg1, d_arg2, dev_persistent);
        good = succeeded(cudaGetLastError(), "my_work launch")
            && succeeded(cudaMemcpy(arg2, d_arg2, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(arg2)");
      }
      // Runs on every path so the temporaries never leak; cudaFree(NULL) is a no-op.
      cudaFree(d_arg1);
      cudaFree(d_arg2);}
    // Releases the device buffer. The pointer is cleared afterwards, so a
    // second call (or the destructor running later) does not free it again.
    void finish(){
      if (dev_persistent) cudaFree(dev_persistent);
      dev_persistent = NULL;}
    // Covers the case where the object is destroyed without finish() having been called.
    ~stuffHandler(){finish();}
};
// The one handler instance shared by the C_* entry points; null while no
// handler exists.
stuffHandler *stuff_handler = nullptr;
// C-linkage wrapper around stuffHandler::doStuff.
// Silently does nothing when no handler currently exists.
extern "C" void C_doStuff(int N, double *arg1, double *arg2){
  if (!stuff_handler)
    return;
  stuff_handler->doStuff(N, arg1, arg2);
}
// C-linkage teardown: calls finish() on the current handler, deletes it, and
// resets the global pointer to NULL. Safe to call when no handler exists.
extern "C" void C_end(){
  stuffHandler *current = stuff_handler;
  stuff_handler = NULL;
  if (!current)
    return;
  current->finish();
  delete current;
}
// C-linkage setup: replaces any existing handler with a fresh one built from
// `scale`.
extern "C" void C_init(double scale){
  // C_end() is a no-op when there is no handler, so it can be called unconditionally.
  C_end();
  stuff_handler = new stuffHandler(scale);
}
$ cat main.c
#include <stdio.h>
#include <stdlib.h>
#include "t1902.h"
/* Demo driver for the C build: fills two four-element arrays with 0.1 and
 * 0.2, runs them through C_doStuff with scale 1.5, and prints the first
 * element of the updated second array.
 * Returns EXIT_FAILURE if the host arrays cannot be allocated. */
int main(){
  int i,my_N = 4;
  double scale = 1.5;
  double *d1 = malloc(my_N*sizeof(double));
  double *d2 = malloc(my_N*sizeof(double));
  /* Bail out before touching the arrays if either allocation failed. */
  if (d1 == NULL || d2 == NULL) {
    fprintf(stderr, "out of memory\n");
    free(d1);  /* free(NULL) is a no-op */
    free(d2);
    return EXIT_FAILURE;}
  C_init(scale);
  for (i=0; i < my_N; i++) {
    d1[i] = 0.1;
    d2[i] = 0.2;}
  C_doStuff(my_N, d1, d2);
  printf("%f\n", d2[0]);
  C_end();
  free(d1);
  free(d2);
  return 0;
}
$ nvcc -c t1902.cu
$ gcc -c main.c
$ gcc -o test main.o t1902.o -L/usr/local/cuda/lib64 -lcudart_static -lculibos -lpthread -lrt -ldl -lstdc++
$ compute-sanitizer ./test
========= COMPUTE-SANITIZER
0.350000
========= ERROR SUMMARY: 0 errors
$
In the compilation sequence above, it should be possible to replace gcc with mpicc.