From the NVIDIA documentation, when PTX, CUBIN or FATBIN is generated, the host code gets discarded from the file. Now I have my host code (main.cu) and the device code (shared.cu). When compiling each file to *.o using the nvcc option nvcc -c  main.cu shared.cu or even with nvcc -dc  main.cu shared.cu and linking them with the option nvcc -link  main.o shared.o, I can generate the executable. But when shared.cu is compiled to shared.cubin and further to *.o, then the linking fails with an error tmpxft_00001253_00000000-4_main.cudafe1.cpp:(.text+0x150): undefined reference to <KERNEL FUNCTION>
Here I wonder shared.cu contains only device code and even if the host code is removed why the linking should fail.
The source code files are main.cu
#include <stdio.h>
#include <cuda_runtime_api.h>
#include <cuda_runtime.h>
#include <cuda.h>
#include "shared.h"
 int main()
{
        int a[5]={1,2,3,4,5};
        int b[5]={1,1,1,1,1};
        int c[5];
        int i;
        int *dev_a;
        int *dev_b;
        int *dev_c;
        cudaMalloc( (void**)&dev_a, 5*sizeof(int) );
        cudaMalloc( (void**)&dev_b, 5*sizeof(int) );
        cudaMalloc( (void**)&dev_c, 5*sizeof(int) );
        cudaMemcpy(dev_a, a , 5 * sizeof(int), cudaMemcpyHostToDevice);
        cudaMemcpy(dev_b, b , 5 * sizeof(int), cudaMemcpyHostToDevice);
        add<<<1,5>>>(dev_a,dev_b,dev_c);
        cudaMemcpy(&c,dev_c,5*sizeof(int),cudaMemcpyDeviceToHost);
        for(i = 0; i < 5; i++ )
        {
                printf("a[%d] + b[%d] = %d\n",i,i,c[i]);
        }
        cudaFree( dev_a);
        cudaFree( dev_b);
        cudaFree( dev_c);
        return 0;
}
shared.cu
#include<stdio.h>
__global__  void add(int *dev_a, int *dev_b, int *dev_c){
        //allocate shared memory
        __shared__ int a_shared[5];
        __shared__ int b_shared[5];
        __shared__ int c_shared[5];
        {
                //get data in shared memory
                a_shared[threadIdx.x]=dev_a[threadIdx.x];
                __syncthreads();
                b_shared[threadIdx.x]=dev_b[threadIdx.x];
                __syncthreads();
                //perform the addition in the shared memory space
                c_shared[threadIdx.x]= a_shared[threadIdx.x] + b_shared[threadIdx.x];
                __syncthreads();
                //shift data back to global memory
                dev_c[threadIdx.x]=c_shared[threadIdx.x];
                __syncthreads();
        }
}
shared.h
#ifndef header
#define header
extern __global__  void add(int *dev_a, int *dev_b, int *dev_c);
#endif
 
    