I needed something similar for a case where I didn't have any profiling tools, but I wanted to count how many threads were inside a particular block of code, as well as the amount of time (ticks) spent in that block of code by each thread. In this case every block needed a unique static variable accessible to all threads, and I needed to reference that variable later to increment it (I used a logging API rather than printf in the actual code, but this works as well). At first I thought I was very clever by doing the following:
/*
 * First attempt at PROF_START: opens a block, declares a static entry
 * counter, takes a starting tick reading with times(), and increments the
 * counter through the pointer ptc.  The block opened here is not closed in
 * this snippet, and `end` is declared but not used by it.
 *
 * NOTE(review): in an object-like macro the `##` operator pastes its
 * operands as written, without expanding them, so every expansion yields
 * the same identifier (entry_count___FUNCTION____LINE__) rather than a
 * name unique per function and line.
 *
 * NOTE(review): the counter is declared `static volatile int`, but ptc is a
 * plain `int *`, so the initialization discards the volatile qualifier.
 */
#define PROF_START { \
    static volatile int entry_count##___FUNCTION__##__LINE__ = 0; int *ptc = &entry_count##___FUNCTION__##__LINE__; \
    clock_t start, end; \
    start = times(0); \
    (*ptc)++;
But then I realized this is just silly: the C compiler will simply do this for you, as long as each "static" declaration is in its own block:
#include <stdio.h>
#include <sys/times.h>
/*
 * PROF_START / PROF_END -- bracket a region of code to be profiled.
 *
 * PROF_START opens a new block and PROF_END closes it, so the two must be
 * used as a matched pair inside one function.  Control must not leave the
 * region early (return, break, continue, goto): PROF_END would be skipped
 * and the counter would never be decremented.
 *
 * Because each expansion declares its `static int entry_count` inside its
 * own block, every profiled region gets a distinct counter object without
 * any name mangling.  The counter is shared by every activation of that
 * region (recursive calls, other threads), so while inside the region it
 * holds the number of activations currently between PROF_START and
 * PROF_END.
 *
 * PROF_END prints "[function:line] TIMER: ticks:count", where ticks is the
 * difference between the two times() readings and count is the value of
 * entry_count before this activation leaves.
 *
 * The increment and decrement are ordinary, non-atomic operations; with
 * concurrent threads the count is only approximate.
 *
 * The names entry_count, start, end and prof_tms are visible to (and may
 * shadow identifiers used by) the code placed between the two macros.
 */
#define PROF_START { \
    static int entry_count = 0; \
    struct tms prof_tms; /* times() needs a real buffer to be portable */ \
    clock_t start, end; \
    start = times(&prof_tms); \
    entry_count++;

#define PROF_END \
    end = times(&prof_tms); \
    /* clock_t has no printf conversion of its own: convert for %ld */ \
    printf("[%s:%d] TIMER: %ld:%d\n", __func__, __LINE__, \
           (long)(end - start), entry_count); \
    entry_count--; \
    }
Note the opening brace in PROF_START and the matching closing brace in PROF_END. This isn't strictly thread-safe, but for my profiling purposes I could assume the increment and decrement operations were atomic. Here's a recursion sample which uses the macros:
/* Number of nodes in the demo list that main() builds. */
#define ITEM_COUNT 5
/* Singly linked list node used by the list-reversal demo. */
struct node {
   int data;            /* payload; main() stores the node's index here */
   struct node *next;   /* following node, or 0 for the last node */
 };
/*
 * Reverse a singly linked list in place, iteratively.
 *
 * head: address of the list's head pointer.  On return *head points at the
 *       node that used to be last.  A null head, or an empty list
 *       (*head == 0), is left untouched.
 *
 * The original first node stays in `current` for the whole loop: each pass
 * unlinks the node that follows it and pushes that node onto the front of
 * the list, until nothing follows `current` any more.  Every pass is
 * wrapped in PROF_START/PROF_END.
 */
void revsort(struct node **head)
{
  struct node *current;
  struct node *next_item;

  /* Nothing to reverse; also avoids dereferencing a null node below. */
  if (!head || !*head)
  {
    return;
  }

  current = *head;
  while (current->next)
  {
PROF_START
    next_item = current->next;        /* node to move                  */
    current->next = next_item->next;  /* unlink it from behind current */
    next_item->next = *head;          /* ...and push it on the front   */
    *head = next_item;
PROF_END
  }
}
/*
 * Reverse a singly linked list in place, recursively.
 *
 * head: address of the list's head pointer.  On return *head points at the
 *       node that used to be last.  A null head, or an empty list
 *       (*head == 0), is left untouched.
 *
 * The first node is detached, the remainder is reversed by the recursive
 * call, and the detached node is then appended after `next_item`, which is
 * the last node of the reversed remainder.  The profiled region spans the
 * recursive call, so the region's entry_count reflects the recursion depth
 * when each PROF_END runs.
 */
void rrevsort(struct node **head)
{
  struct node *current;
  struct node *next_item;

  /*
   * Guard before PROF_START: returning from inside the profiled region
   * would skip PROF_END.
   */
  if (!head || !*head)
  {
    return;
  }

  current = *head;
  next_item = current->next;
PROF_START
  current->next = NULL;       /* current becomes the new tail */
  if (next_item)
  {
    *head = next_item;
    rrevsort(head);           /* reverse everything after current */
    next_item->next = current;
  }
PROF_END
}
printnode(struct node *head)
{
  if (head)
  {
    printf("%d ", head->data);
    printnode(head->next);
  }
  else
    printf("\n");
}
/*
 * Demo driver: builds an ITEM_COUNT-node list on the stack with data
 * 0..ITEM_COUNT-1, prints it, reverses it with revsort(), prints it,
 * reverses it again with rrevsort(), and prints it once more.
 *
 * Returns 0.
 */
int main(void)
{
  struct node node_list[ITEM_COUNT];
  struct node *head = &node_list[0];
  int i;

  /* Link every node except the last to its successor, profiling each pass. */
  for (i = 0; i < ITEM_COUNT - 1; i++)
  {
PROF_START
      node_list[i].data = i;
      node_list[i].next = &node_list[i+1];
PROF_END
  }
  /* i == ITEM_COUNT - 1 here: fill in the last node and terminate the list. */
  node_list[i].data = i;
  node_list[i].next = 0;

  printf("before\n");
  printnode(head);

  revsort(&head);
  printf("after\n");
  printnode(head);

  rrevsort(&head);
  /*
   * NOTE(review): this label reads "before" -- presumably because the
   * second reversal restores the original order; confirm it is intended.
   */
  printf("before\n");
  printnode(head);

  return 0;
}
Extra hint: the above program is a common interview question. Excerpt from "nm -A", showing the separately numbered entry_count symbols the compiler generated:
macro:0804a034 b entry_count.1715
macro:0804a030 b entry_count.1739
macro:0804a028 b entry_count.1768
macro:0804a02c b entry_count.1775