Benchmark C code easily
#include <time.h>
int main(void) {
  clock_t start_time = clock();
  // code or function to benchmark
  double elapsed_time = (double)(clock() - start_time) / CLOCKS_PER_SEC;
  printf("Done in %f seconds\n", elapsed_time);
}
Easy benchmark of multi-threaded C code
If you want to benchmark multithreaded program you first need to take a closer look at clock:
Description
The clock() function returns an approximation of processor time
used by the program.
Return value
The value returned is the CPU time used so far as a clock_t; to
get the number of seconds used, divide by CLOCKS_PER_SEC.  If the
processor time used is not available or its value cannot be
represented, the function returns the value (clock_t)(-1)
Hence it is very important to divide your elapsed_time by the number of threads in order to get the execution time of your function:
#include <time.h>
#include <omp.h>
#define THREADS_NB omp_get_max_threads()
#pragma omp parallel for private(i) num_threads(THREADS_NB)
clock_t start_time = clock();
// code or function to benchmark
double elapsed_time = (double)(clock() - start_time) / CLOCKS_PER_SEC;
printf("Done in %f seconds\n", elapsed_time / THREADS_NB); // divide by THREADS_NB!
Example
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <time.h>
#include <omp.h>
#define N 20000
#define THREADS_NB omp_get_max_threads()
void init_arrays(double *a, double *b) {
  memset(a, 0, sizeof(a));
  memset(b, 0, sizeof(b));
  for (int i = 0; i < N; i++) {
    a[i] += 1.0;
    b[i] += 1.0;
  }
}
double func2(double i, double j) {
  double res = 0.0;
  while (i / j > 0.0) {
    res += i / j;
    i -= 0.1;
    j -= 0.000003;
  }
  return res;
}
double single_thread(double *a, double *b) {
  double res = 0;
  int i, j;
  for (i = 0; i < N; i++) {
    for (j = 0; j < N; j++) {
      if (i == j) continue;
      res += func2(a[i], b[j]);
    }
  }
  return res;
}
double multi_threads(double *a, double *b) {
  double res = 0;
  int i, j;
  #pragma omp parallel for private(j) num_threads(THREADS_NB) reduction(+:res)
  for (i = 0; i < N; i++) {
    for (j = 0; j < N; j++) {
      if (i == j) continue;
      res += func2(a[i], b[j]);
    }
  }
  return res;
}
int main(void) {
  double *a, *b;
  a = (double *)calloc(N, sizeof(double));
  b = (double *)calloc(N, sizeof(double));
  init_arrays(a, b);
  clock_t start_time = clock();
  double res = single_thread(a, b);
  double elapsed_time = (double)(clock() - start_time) / CLOCKS_PER_SEC;
  printf("Default:  Done with %f in %f sd\n", res, elapsed_time);
  start_time = clock();
  res = multi_threads(a, b);
  elapsed_time = (double)(clock() - start_time) / CLOCKS_PER_SEC;
  printf("With OMP: Done with %f in %f sd\n", res, elapsed_time / THREADS_NB);
}
Compile with:
gcc -O3 multithread_benchmark.c -fopenmp && time ./a.out
Output:
Default:  Done with 2199909813.614555 in 4.909633 sd
With OMP: Done with 2199909799.377532 in 1.708831 sd
real    0m6.703s (from time function)