To make saolof's answer concrete, the following is an implementation using the CLMUL instruction set, interleaving two pairs of uint32_ts per call:
#include <immintrin.h>
#include <stdint.h>
typedef struct {
  uint32_t x;
  uint32_t y;
} uint32_2;
static inline void interleave_clmul(uint32_2 *input, uint64_t *out) {
  __m128i xy = _mm_load_si128((const __m128i *)input);
  xy = _mm_shuffle_epi32(xy, 0b11011000);
  // Two carryless multiplies
  __m128i p2 = _mm_clmulepi64_si128(xy, xy, 0x11);
  __m128i p1 = _mm_clmulepi64_si128(xy, xy, 0x00);
  // Bitwise interleave the results
  p2 = _mm_slli_epi16(p2, 1);
  __m128i p3 = _mm_or_si128(p1, p2);
  _mm_storeu_si128((__m128i *)out, p3);
}
That compiles down to the following:
interleave_clmul(uint32_2*, unsigned long*):
        vpshufd         xmm0, xmmword ptr [rdi], 216    # xmm0 = mem[0,2,1,3]
        vpclmulqdq      xmm1, xmm0, xmm0, 17
        vpclmulqdq      xmm0, xmm0, xmm0, 0
        vpaddw          xmm1, xmm1, xmm1
        vpor            xmm0, xmm0, xmm1
        vmovdqu         xmmword ptr [rsi], xmm0
        ret
Replace _mm_load_si128 with _mm_loadu_si128 if your data is not aligned--unaligned loads aren't that much slower on x86 anyway. This system is faster than the corresponding implementation with pdep instructions in terms of throughput.
Naive
Total rdtsc: 1295559857, iterations: 1000, count: 10000
clmul-based
Total rdtsc: 17751716, iterations: 1000, count: 10000
pdep-based
Total rdtsc: 26123417, iterations: 1000, count: 10000
pdep-based unrolled
Total rdtsc: 24281811, iterations: 1000, count: 10000
Turbo boost was disabled; CPU is a 1.60 GHz base clock Kaby Lake. Results seem consistent across runs.  (Results on other architectures would be nice.) The (somewhat messy) testing code:
#include <stdio.h>
#include <inttypes.h>
#include <string.h>
#include <stdlib.h>
#include <immintrin.h>
// rdtscp
#include <x86intrin.h>
typedef struct uint32_2 {
    uint32_t x;
    uint32_t y;
} uint32_2;
uint32_2* generate_pairs(const int count) {
    uint32_2* p = malloc(count * sizeof(uint32_2));
    uint32_t r = 401923091;
#define RNG r *= 52308420; r += 2304;
    for (int i = 0; i < count; ++i) {
        p[i].x = r;
        RNG RNG
        p[i].y = r;
        RNG RNG
    }
    return p;
}
void interleave_naive(uint64_t* dst, uint32_2* src, int count) {
    for (int i = 0; i < count; ++i) {
        struct uint32_2 s = src[i];
        uint32_t x = s.x, y = s.y;
        uint64_t result = 0;
        for (int k = 0; k < 32; ++k) {
            if (x & ((uint32_t)1 << k)) {
                result |= (uint64_t)1 << (2 * k);
            }
            if (y & ((uint32_t)1 << k)) {
                result |= (uint64_t)1 << (2 * k + 1);
            }
        }
        dst[i] = result;
    }
}
void interleave_pdep(uint64_t* dst, uint32_2* src, int count) {
    for (int i = 0; i < count; ++i) {
        struct uint32_2 s = src[i];
        uint32_t x = s.x, y = s.y;
        uint64_t result = _pdep_u64(x, 0x5555555555555555) | _pdep_u64(y, 0xaaaaaaaaaaaaaaaa);
        dst[i] = result;
    }
}
void interleave_pdep_unrolled(uint64_t* dst, uint32_2* src, int count) {
    for (int i = 0; i < count; i += 2) {
        struct uint32_2 s1 = src[i];
        struct uint32_2 s2 = src[i + 1];
        uint32_t x1 = s1.x, y1 = s1.y;
        uint32_t x2 = s2.x, y2 = s2.y;
        uint64_t result1 = _pdep_u64(x1, 0x5555555555555555) | _pdep_u64(y1, 0xaaaaaaaaaaaaaaaa);
        uint64_t result2 = _pdep_u64(x2, 0x5555555555555555) | _pdep_u64(y2, 0xaaaaaaaaaaaaaaaa);
        dst[i] = result1;
        dst[i + 1] = result2;
    }
}
void interleave_clmul(uint64_t* dst, uint32_2* src, int count) {
    uint32_2* end = src + count;
    uint64_t* out = dst;
    for (uint32_2* p = src; p < end; p += 2, out += 2) {
        __m128i xy = _mm_load_si128((const __m128i *) p);
        xy = _mm_shuffle_epi32(xy, 0b11011000);
        __m128i p2 = _mm_clmulepi64_si128(xy, xy, 0x11);
        __m128i p1 = _mm_clmulepi64_si128(xy, xy, 0x00);
        p2 = _mm_slli_epi16(p2, 1);
        __m128i p3 = _mm_or_si128(p1, p2);
        _mm_store_si128((__m128i *)out, p3);
    }
}
#define ITERATIONS 1000
void time_inv(uint32_2* pairs, int count, void (*interleave) (uint64_t*, uint32_2*, int)) {
    uint64_t* result = malloc(count * sizeof(uint64_t));
    uint64_t* reference_result = malloc(count * sizeof(uint64_t));
    interleave_naive(reference_result, pairs, count);
    // Induce page faults
    memset(result, 0, count * sizeof(uint64_t));
    unsigned _;
    int64_t start_rdtsc = __rdtscp(&_);
    for (int i = 0; i < ITERATIONS; ++i) {
        interleave(result, pairs, count);
    }
    int64_t end_rdtsc = __rdtscp(&_);
    for (int i = 0; i < count; ++i) {
        if (reference_result[i] != result[i]) {
            fprintf(stderr, "Incorrect value at index %d; got %" PRIx64 ", should be %" PRIx64 "\n", i, result[i], reference_result[i]);
            abort();
        }
    }
    printf("Total rdtsc: %" PRId64 ", iterations: %d, count: %d\n", end_rdtsc - start_rdtsc, ITERATIONS, count);
    free(result);
}
int main() {
    const int count = 10000;
    uint32_2* pairs = generate_pairs(count);
    printf("Naive\n");
    time_inv(pairs, count, interleave_naive);
    printf("clmul-based\n");
    time_inv(pairs, count, interleave_clmul);
    printf("pdep-based\n");
    time_inv(pairs, count, interleave_pdep);
    printf("pdep-based unrolled\n");
    time_inv(pairs, count, interleave_pdep_unrolled);
    free(pairs);
}
Compile with gcc bleh.c -o bleh -O2 -march=native.
Perf stats for each implementation are below. CLMUL seems to do 1.5c/pair, bottlenecking on port 5 contention by 2 pclmulqdq and 1 vpshufd, while the pdep implementations bottleneck on port 1, on which pdep executes, resulting in 2c/pair:
clmul-based
Total rdtsc: 1774895925, total milliseconds: 985.575000, iterations: 100000, count: 10000
 Performance counter stats for './interleave':
       556,602,052      uops_dispatched_port.port_0                                     (49.87%)
     1,556,592,314      cycles                                                        (49.86%)
       469,021,017      uops_dispatched_port.port_1                                     (49.86%)
       472,968,452      uops_dispatched_port.port_2                                     (50.08%)
       519,804,531      uops_dispatched_port.port_3                                     (50.13%)
       499,980,587      uops_dispatched_port.port_4                                     (50.14%)
     1,509,928,584      uops_dispatched_port.port_5                                     (50.14%)
     1,484,649,884      uops_dispatched_port.port_6                                     (49.92%)
pdep-based
Total rdtsc: 2588637876, total milliseconds: 1438.065000, iterations: 100000, count: 10000
 Performance counter stats for './interleave':
       745,844,862      uops_dispatched_port.port_0                                     (50.02%)
     2,289,048,624      cycles                                                        (50.02%)
     2,033,116,738      uops_dispatched_port.port_1                                     (50.02%)
     1,508,870,090      uops_dispatched_port.port_2                                     (50.02%)
     1,498,920,409      uops_dispatched_port.port_3                                     (49.98%)
     1,056,089,339      uops_dispatched_port.port_4                                     (49.98%)
       843,399,033      uops_dispatched_port.port_5                                     (49.98%)
     1,414,062,891      uops_dispatched_port.port_6                                     (49.98%)
pdep-based unrolled
Total rdtsc: 2387027127, total milliseconds: 1325.857000, iterations: 100000, count: 10000
 Performance counter stats for './interleave':
       532,577,450      uops_dispatched_port.port_0                                     (49.64%)
     2,099,782,071      cycles                                                        (49.94%)
     2,004,347,972      uops_dispatched_port.port_1                                     (50.24%)
     1,532,203,395      uops_dispatched_port.port_2                                     (50.54%)
     1,467,988,364      uops_dispatched_port.port_3                                     (50.36%)
     1,701,095,132      uops_dispatched_port.port_4                                     (50.06%)
       543,597,866      uops_dispatched_port.port_5                                     (49.76%)
       930,460,812      uops_dispatched_port.port_6                                     (49.46%)