I'm learning how to use multi GPU for my CUDA application. I tried out a simple program which successfully ran on a system having two Tesla C2070. But when I tried to run the same program on a different system having a Tesla K40c and a Tesla C2070, it shows a segmentation fault. What might be the problem? I'm sure that there is no problem with the code. Is there any settings to be done in the environment? I have attached my code here for your reference.
#include <stdio.h>
#include "device_launch_parameters.h"
#include "cuda_runtime_api.h"
__global__ void testA(int *a)
{
   int i = blockIdx.x * blockDim.x + threadIdx.x;
   a[i] = a[i] * 2;
}
int main()
{
   int *ai, *bi, *ao, *bo;
   int iter;
   cudaStream_t streamA, streamB;
   cudaSetDevice(0);
   cudaStreamCreate(&streamA);
   cudaMalloc((void**)&ao, 10 * sizeof(int));
   cudaHostAlloc((void**)&ai, 10 * sizeof(int), cudaHostAllocMapped);
   for(iter=0; iter<10; iter++)
   {
       ai[iter] = iter+1;
   }
   cudaSetDevice(1);
   cudaStreamCreate(&streamB);
   cudaMalloc((void**)&bo, 10 * sizeof(int));
   cudaHostAlloc((void**)&bi, 10 * sizeof(int), cudaHostAllocMapped);
   for(iter=0; iter<10; iter++)
   {
       bi[iter] = iter+11;
   }
   cudaSetDevice(0);
   cudaMemcpyAsync(ao, ai, 10 * sizeof(int), cudaMemcpyHostToDevice, streamA);
   testA<<<1, 10, 0, streamA>>>(ao);
   cudaMemcpyAsync(ai, ao, 10 * sizeof(int), cudaMemcpyDeviceToHost, streamA);
   cudaSetDevice(1);
   cudaMemcpyAsync(bo, bi, 10 * sizeof(int), cudaMemcpyHostToDevice, streamB);
   testA<<<1, 10, 0, streamB>>>(bo);
   cudaMemcpyAsync(bi, bo, 10 * sizeof(int), cudaMemcpyDeviceToHost, streamB);
   cudaSetDevice(0);
   cudaStreamSynchronize(streamA);
   cudaSetDevice(1);
   cudaStreamSynchronize(streamB);
   printf("%d %d %d %d %d\n",ai[0],ai[1],ai[2],ai[3],ai[4]);
   printf("%d %d %d %d %d\n",bi[0],bi[1],bi[2],bi[3],bi[4]);
   return 0;
}
The segmentation fault occurs when bi array is initialized inside the for loop, which means the memory is not allocated for bi.
 
    