I am currently going through the tutorial examples on http://code.google.com/p/stanford-cs193g-sp2010/ to learn CUDA. The code which demostrates __global__ functions is given below. It simply creates two arrays, one on the CPU and one on the GPU, populates the GPU array with the number 7 and copies the GPU array data into the CPU array. 
#include <stdlib.h>
#include <stdio.h>
__global__ void kernel(int *array)
{
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  array[index] = 7;
}
int main(void)
{
  int num_elements = 256;
  int num_bytes = num_elements * sizeof(int);
  // pointers to host & device arrays
  int *device_array = 0;
  int *host_array = 0;
  // malloc a host array
  host_array = (int*)malloc(num_bytes);
  // cudaMalloc a device array
  cudaMalloc((void**)&device_array, num_bytes);
  int block_size = 128;
  int grid_size = num_elements / block_size;
  kernel<<<grid_size,block_size>>>(device_array);
  // download and inspect the result on the host:
  cudaMemcpy(host_array, device_array, num_bytes, cudaMemcpyDeviceToHost);
  // print out the result element by element
  for(int i=0; i < num_elements; ++i)
  {
    printf("%d ", host_array[i]);
  }
  // deallocate memory
  free(host_array);
  cudaFree(device_array);
} 
My question is why have they worded the cudaMalloc((void**)&device_array, num_bytes); statement with a double pointer? Even here definition of cudamalloc() on says the first argument is a double pointer.  
Why not simply return a pointer to the beginning of the allocated memory on the GPU, just like the malloc function does on the CPU?

