One possible straightforward approach:
$ cat t105.cu
#include <cstdio>
// Device-side result slot: the kernel stores into it when USE_DISPLAY is not
// defined, and main() reads it back with cudaMemcpyFromSymbol.
__device__ unsigned r = 0;
// Divides p by q, rounding the quotient up, for a q that is a power of two.
// The division is performed as a right shift by log2(q), where log2(q) is
// derived from the position of the lowest set bit of q (__ffs / __ffsll
// report a 1-based position, and 0 when no bit is set).
// q == 0 is not special-cased: the shift count is then q-1 with q == 0, and
// the returned value is not meaningful.
template <typename T>
__device__ T pdqru(T p, T q){
  // bias the dividend by q-1 so that the shift below rounds up; this sum can
  // wrap around when p is close to the largest value of T
  T p1 = p +  (q-1);
  // choose the intrinsic that matches the width of T; q is reused to hold the
  // 1-based bit position, so from here on q-1 is the shift count
  if (sizeof(T) == 8)
    q = __ffsll(q);
  else
    q = __ffs(q);
  // p1<p means the biased sum wrapped: fall back to floor(p/q)+1. A wrap can
  // only occur when p is not a multiple of q, so adding 1 is the correct
  // rounding in that case. Otherwise shift the biased value directly.
  return (p1<p)?((p>>(q-1))+1) :(p1 >> (q-1));
}
// Test kernel for pdqru. It uses no thread/block indexing, so every launched
// thread performs the same work.
// With USE_DISPLAY defined it prints pdqru results for several
// dividend/divisor combinations; otherwise it stores pdqru(p, q) in the
// __device__ variable r.
__global__ void test(unsigned p, unsigned q){
#ifdef USE_DISPLAY
  // extra operands for the printed cases: a second power-of-two divisor,
  // zero (used both as divisor and as dividend), and 2^31
  unsigned q2 = 16;
  unsigned z = 0;
  unsigned l = 1U<<31;
  printf("result %u/%u = %u\n", p, q, pdqru(p, q));
  printf("result %u/%u = %u\n", p, q2, pdqru(p, q2));
  // zero divisor: pdqru does not handle this, the printed value is not meaningful
  printf("result %u/%u = %u\n", p, z, pdqru(p, z));
  printf("result %u/%u = %u\n", z, q, pdqru(z, q));
  printf("result %u/%u = %u\n", l, q, pdqru(l, q));
  printf("result %u/%u = %u\n", q, l, pdqru(q, l));
  printf("result %u/%u = %u\n", l, l, pdqru(l, l));
  printf("result %u/%u = %u\n", q, q, pdqru(q, q));
#else
  // plain store of the result, so the generated code contains little besides
  // the pdqru computation itself
  r = pdqru(p, q);
#endif
}
// Launches the test kernel with a single thread (dividend 32767, divisor 32),
// copies the device variable r back to the host and prints it.
// Returns 0 on success, 1 if the launch or the copy reports a CUDA error.
int main(){
  unsigned h_r;
  test<<<1,1>>>(32767, 32);
  // a kernel launch returns no status itself; launch failures are only
  // visible through cudaGetLastError()
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess){
    fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(err));
    return 1;
  }
  // the copy is ordered after the kernel, so an error raised while the kernel
  // was executing is reported by this call as well; checking it also avoids
  // printing an uninitialized h_r
  err = cudaMemcpyFromSymbol(&h_r, r, sizeof(unsigned));
  if (err != cudaSuccess){
    fprintf(stderr, "cudaMemcpyFromSymbol failed: %s\n", cudaGetErrorString(err));
    return 1;
  }
  printf("result = %u\n", h_r);
  return 0;
}
$ nvcc -arch=sm_61 -o t105 t105.cu
$ cuobjdump -sass ./t105
Fatbin elf code:
================
arch = sm_61
code version = [1,7]
producer = <unknown>
host = linux
compile_size = 64bit
        code for sm_61
Fatbin elf code:
================
arch = sm_61
code version = [1,7]
producer = cuda
host = linux
compile_size = 64bit
        code for sm_61
                Function : _Z4testjj
        .headerflags    @"EF_CUDA_SM61 EF_CUDA_PTX_SM(EF_CUDA_SM61)"
                                                                         /* 0x001fc800fec007f6 */
        /*0008*/                   MOV R1, c[0x0][0x20];                 /* 0x4c98078000870001 */
        /*0010*/                   IADD R0, RZ, -c[0x0][0x144];          /* 0x4c1100000517ff00 */
        /*0018*/                   LOP.AND R0, R0, c[0x0][0x144];        /* 0x4c47000005170000 */
                                                                         /* 0x005fd401fe20003d */
        /*0028*/                   FLO.U32 R2, R0;                       /* 0x5c30000000070002 */
        /*0030*/                   MOV R0, c[0x0][0x144];                /* 0x4c98078005170000 */
        /*0038*/                   IADD32I R3, -R2, 0x1f;                /* 0x1d00000001f70203 */
                                                                         /* 0x001fd000fc2007f1 */
        /*0048*/                   IADD32I R0, R0, -0x1;                 /* 0x1c0ffffffff70000 */
        /*0050*/                   MOV R2, c[0x0][0x140];                /* 0x4c98078005070002 */
        /*0058*/                   IADD32I R4, -R3, 0x1f;                /* 0x1d00000001f70304 */
                                                                         /* 0x001fd800fe2007f6 */
        /*0068*/                   IADD R5, R0, c[0x0][0x140];           /* 0x4c10000005070005 */
        /*0070*/                   ISETP.LT.U32.AND P0, PT, R5, R0, PT;  /* 0x5b62038000070507 */
        /*0078*/                   SHR.U32 R0, R2, R4;                   /* 0x5c28000000470200 */
                                                                         /* 0x001fd000fc2007f1 */
        /*0088*/                   IADD32I R0, R0, 0x1;                  /* 0x1c00000000170000 */
        /*0090*/                   MOV32I R2, 0x0;                       /* 0x010000000007f002 */
        /*0098*/                   MOV32I R3, 0x0;                       /* 0x010000000007f003 */
                                                                         /* 0x001ffc001e2007f2 */
        /*00a8*/              @!P0 SHR.U32 R0, R5, R4;                   /* 0x5c28000000480500 */
        /*00b0*/                   STG.E [R2], R0;                       /* 0xeedc200000070200 */
        /*00b8*/                   EXIT;                                 /* 0xe30000000007000f */
                                                                         /* 0x001f8000fc0007ff */
        /*00c8*/                   BRA 0xc0;                             /* 0xe2400fffff07000f */
        /*00d0*/                   NOP;                                  /* 0x50b0000000070f00 */
        /*00d8*/                   NOP;                                  /* 0x50b0000000070f00 */
                                                                         /* 0x001f8000fc0007e0 */
        /*00e8*/                   NOP;                                  /* 0x50b0000000070f00 */
        /*00f0*/                   NOP;                                  /* 0x50b0000000070f00 */
        /*00f8*/                   NOP;                                  /* 0x50b0000000070f00 */
                ..........................
Fatbin ptx code:
================
arch = sm_61
code version = [5,0]
producer = cuda
host = linux
compile_size = 64bit
compressed
$ nvcc -arch=sm_61 -o t105 t105.cu -DUSE_DISPLAY
$ cuda-memcheck ./t105
========= CUDA-MEMCHECK
result 32767/32 = 1024
result 32767/16 = 2048
result 32767/0 = 2048
result 0/32 = 0
result 2147483648/32 = 67108864
result 32/2147483648 = 1
result 2147483648/2147483648 = 1
result 32/32 = 1
result = 0
========= ERROR SUMMARY: 0 errors
$
Approximately 14 SASS instructions for the 32-bit case, to get the answer into R0.  It produces spurious results for the divide-by-zero case.
For comparison, the equivalent test case and SASS for the approach from the other answer (a mask-based rounding correction) look like this:
$ cat t106.cu
#include <cstdio>
#include <cstdint>
// Device-side result slot: the kernel stores into it when USE_DISPLAY is not
// defined, and main() reads it back with cudaMemcpyFromSymbol.
__device__ unsigned r = 0;
// Width-dispatching wrapper around the CUDA find-first-set intrinsics.
// Only the primary template is declared here; the uint32_t and uint64_t
// specializations below provide the definitions.
template <typename T> __device__ int find_first_set(T x);
// 32-bit operand: forwards to __ffs
template <> __device__ int find_first_set<uint32_t>(uint32_t x) { return __ffs(x);   }
// 64-bit operand: forwards to __ffsll
template <> __device__ int find_first_set<uint64_t>(uint64_t x) { return __ffsll(x); }
// Returns find_first_set(x) - 1 converted to T, i.e. the 0-based index of the
// lowest set bit of x; for a power-of-two x this is log2(x).
// NOTE(review): for x == 0 the intrinsics are documented to return 0, which
// would make this -1 converted to T -- confirm callers never pass 0.
template <typename T>  __device__ T lg(T x) { return find_first_set(x) - 1; }
// Divides dividend by divisor, rounding the quotient up, for a divisor that
// is a power of two. The result is floor(dividend / divisor), computed as a
// right shift, plus a 0/1 correction that is 1 when the remainder is nonzero.
// A zero divisor is not special-cased.
template <typename T>
__device__ T pdqru(T dividend, T divisor)
{
    // shift count: log2 of the (power-of-two) divisor
    auto log_2_of_divisor = lg(divisor);
    // divisor - 1 selects the low bits of the dividend, i.e. the remainder
    auto mask = divisor - 1;
    // remainder + (divisor - 1) reaches divisor exactly when the remainder is
    // nonzero, so shifting it down yields 1 in that case and 0 otherwise
    auto correction_for_rounding_up = ((dividend & mask) + mask) >> log_2_of_divisor;
    return (dividend >> log_2_of_divisor) + correction_for_rounding_up;
}
// Test kernel for pdqru. It uses no thread/block indexing, so every launched
// thread performs the same work.
// With USE_DISPLAY defined it prints pdqru results for several
// dividend/divisor combinations; otherwise it stores pdqru(p, q) in the
// __device__ variable r.
__global__ void test(unsigned p, unsigned q){
#ifdef USE_DISPLAY
  // extra operands for the printed cases: a second power-of-two divisor,
  // zero (used both as divisor and as dividend), and 2^31
  unsigned q2 = 16;
  unsigned z = 0;
  unsigned l = 1U<<31;
  printf("result %u/%u = %u\n", p, q, pdqru(p, q));
  printf("result %u/%u = %u\n", p, q2, pdqru(p, q2));
  // zero divisor: pdqru does not special-case this input
  printf("result %u/%u = %u\n", p, z, pdqru(p, z));
  printf("result %u/%u = %u\n", z, q, pdqru(z, q));
  printf("result %u/%u = %u\n", l, q, pdqru(l, q));
  printf("result %u/%u = %u\n", q, l, pdqru(q, l));
  printf("result %u/%u = %u\n", l, l, pdqru(l, l));
  printf("result %u/%u = %u\n", q, q, pdqru(q, q));
#else
  // plain store of the result, so the generated code contains little besides
  // the pdqru computation itself
  r = pdqru(p, q);
#endif
}
// Launches the test kernel with a single thread (dividend 32767, divisor 32),
// copies the device variable r back to the host and prints it.
// Returns 0 on success, 1 if the launch or the copy reports a CUDA error.
int main(){
  unsigned h_r;
  test<<<1,1>>>(32767, 32);
  // a kernel launch returns no status itself; launch failures are only
  // visible through cudaGetLastError()
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess){
    fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(err));
    return 1;
  }
  // the copy is ordered after the kernel, so an error raised while the kernel
  // was executing is reported by this call as well; checking it also avoids
  // printing an uninitialized h_r
  err = cudaMemcpyFromSymbol(&h_r, r, sizeof(unsigned));
  if (err != cudaSuccess){
    fprintf(stderr, "cudaMemcpyFromSymbol failed: %s\n", cudaGetErrorString(err));
    return 1;
  }
  printf("result = %u\n", h_r);
  return 0;
}
$ nvcc -std=c++11  -arch=sm_61 -o t106 t106.cu
$ cuobjdump -sass t106
Fatbin elf code:
================
arch = sm_61
code version = [1,7]
producer = <unknown>
host = linux
compile_size = 64bit
        code for sm_61
Fatbin elf code:
================
arch = sm_61
code version = [1,7]
producer = cuda
host = linux
compile_size = 64bit
        code for sm_61
                Function : _Z4testjj
        .headerflags    @"EF_CUDA_SM61 EF_CUDA_PTX_SM(EF_CUDA_SM61)"
                                                                   /* 0x001fd400fe2007f6 */
        /*0008*/                   MOV R1, c[0x0][0x20];           /* 0x4c98078000870001 */
        /*0010*/                   IADD R0, RZ, -c[0x0][0x144];    /* 0x4c1100000517ff00 */
        /*0018*/                   MOV R2, c[0x0][0x144];          /* 0x4c98078005170002 */
                                                                   /* 0x003fc40007a007f2 */
        /*0028*/                   LOP.AND R0, R0, c[0x0][0x144];  /* 0x4c47000005170000 */
        /*0030*/                   FLO.U32 R3, R0;                 /* 0x5c30000000070003 */
        /*0038*/                   IADD32I R0, R2, -0x1;           /* 0x1c0ffffffff70200 */
                                                                   /* 0x001fc400fcc017f5 */
        /*0048*/                   IADD32I R3, -R3, 0x1f;          /* 0x1d00000001f70303 */
        /*0050*/                   LOP.AND R2, R0, c[0x0][0x140];  /* 0x4c47000005070002 */
        /*0058*/                   IADD R2, R0, R2;                /* 0x5c10000000270002 */
                                                                   /* 0x001fd000fe2007f1 */
        /*0068*/                   IADD32I R0, -R3, 0x1f;          /* 0x1d00000001f70300 */
        /*0070*/                   MOV R3, c[0x0][0x140];          /* 0x4c98078005070003 */
        /*0078*/                   MOV32I R6, 0x0;                 /* 0x010000000007f006 */
                                                                   /* 0x001fc400fc2407f1 */
        /*0088*/                   SHR.U32 R4, R2, R0.reuse;       /* 0x5c28000000070204 */
        /*0090*/                   SHR.U32 R5, R3, R0;             /* 0x5c28000000070305 */
        /*0098*/                   MOV R2, R6;                     /* 0x5c98078000670002 */
                                                                   /* 0x0003c400fe4007f4 */
        /*00a8*/                   MOV32I R3, 0x0;                 /* 0x010000000007f003 */
        /*00b0*/                   IADD R0, R4, R5;                /* 0x5c10000000570400 */
        /*00b8*/                   STG.E [R2], R0;                 /* 0xeedc200000070200 */
                                                                   /* 0x001f8000ffe007ff */
        /*00c8*/                   EXIT;                           /* 0xe30000000007000f */
        /*00d0*/                   BRA 0xd0;                       /* 0xe2400fffff87000f */
        /*00d8*/                   NOP;                            /* 0x50b0000000070f00 */
                                                                   /* 0x001f8000fc0007e0 */
        /*00e8*/                   NOP;                            /* 0x50b0000000070f00 */
        /*00f0*/                   NOP;                            /* 0x50b0000000070f00 */
        /*00f8*/                   NOP;                            /* 0x50b0000000070f00 */
                ..........................
Fatbin ptx code:
================
arch = sm_61
code version = [5,0]
producer = cuda
host = linux
compile_size = 64bit
compressed
$
which appears to be 1 instruction longer, by my count.