Here is a demo. The kernel does not overlap with the preceding cudaMemcpyAsync, even though the two are issued in different streams.
#include <iostream>
#include <cuda_runtime.h>
// Minimal kernel: the single thread whose flattened global index is 0 prints
// a fixed message; every other thread returns without doing anything.
// Takes no arguments and touches no memory.
__global__ void warmUp(){
    const int globalIdx = threadIdx.x + blockDim.x*blockIdx.x;
    if(globalIdx != 0){
        return;  // only one thread in the whole grid prints
    }
    printf("warm up!");
}
// Busy-work kernel: the thread with global index 0 sums (i >> 1) for
// i in [0, 1000000) into a 64-bit accumulator and prints the result.
// All other threads skip the work entirely. Takes no arguments.
__global__ void kernel(){
    int Id = blockIdx.x*blockDim.x+threadIdx.x;
    if(Id == 0){
        long long x = 0;
        for(int i=0; i<1000000; i++){
            x += i>>1;
        }
        // x is a long long, so the conversion must be %lld; the previous %d
        // did not match the argument type and could print a wrong value.
        printf("kernel!%lld\n", x);
    }
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char* what){
    if(status == cudaSuccess){
        return true;
    }
    std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
    return false;
}

// Demo driver: issues a host-to-device cudaMemcpyAsync of 10^7 ints from
// pinned host memory on stream1, then launches `kernel` on stream2, so a
// profiler can show whether the two overlap.
// Returns 0 on success, 1 if any CUDA call reported an error.
int main(){
    // Enable this launch to make `kernel` no longer the first kernel call.
    //warmUp<<<1,32>>>();
    int *data = 0, *data_dev = 0;
    // Integer constant instead of pow(10, 7): no floating-point round trip.
    const size_t dataSize = 10000000;
    const size_t dataBytes = dataSize*sizeof(int);
    cudaStream_t stream1 = 0, stream2 = 0;

    // Short-circuit chain: the first failing call stops everything after it.
    // The copy is still issued before the kernel launch, as in the original.
    bool ok = cudaOk(cudaMallocHost(&data, dataBytes), "cudaMallocHost")
           && cudaOk(cudaMalloc(&data_dev, dataBytes), "cudaMalloc")
           && cudaOk(cudaStreamCreate(&stream1), "cudaStreamCreate(stream1)")
           && cudaOk(cudaStreamCreate(&stream2), "cudaStreamCreate(stream2)")
           && cudaOk(cudaMemcpyAsync(data_dev, data, dataBytes, cudaMemcpyHostToDevice, stream1),
                     "cudaMemcpyAsync");
    if(ok){
        kernel<<<1, 32, 0, stream2>>>();
        // A launch does not return a status; query it explicitly.
        ok = cudaOk(cudaGetLastError(), "kernel launch");
    }
    // Wait for both streams before tearing down. Without this, main returned
    // while the copy and kernel could still be in flight and the device-side
    // printf output was not guaranteed to be flushed.
    ok = cudaOk(cudaDeviceSynchronize(), "cudaDeviceSynchronize") && ok;

    // Release whatever was successfully created.
    if(stream2){ cudaStreamDestroy(stream2); }
    if(stream1){ cudaStreamDestroy(stream1); }
    if(data_dev){ cudaFree(data_dev); }
    if(data){ cudaFreeHost(data); }
    return ok ? 0 : 1;
}
After some experimentation, I found that this happens because it is the first kernel launch in the program.
If I uncomment warmUp<<<1,32>>>();, the Visual Profiler shows that the copy and the kernel overlap!
Why?
 
    