i think an array can be allocated on gpu ex. __device__ int device_array[100]; without using cudaMalloc as the lenght is known. But when i run the following code some irrelevant numbers are displayed. I examined a popular book for cuda and all examples in there uses cudaMalloc. A fixed size array can be used like this or it must be allocated with cudaMalloc?
__device__ int device_array[100];
__global__ void kernel() {
    device_array[blockIdx.x] = blockIdx.x;
}
void call_kernel( int *host_array ) {
    kernel<<<100,1>>>();
    cudaMemcpy( host_array, device_array, 100 * sizeof( int ), cudaMemcpyDeviceToHost );
}
int main() {
    int host_array[100];
    call_kernel( host_array );
    for ( int i = 0; i < 100; i++ )
        cout << host_array[i] << endl;
}
 
    