I am writing a raycaster, and I am trying to speed it up by making lookup tables for my most commonly called trig functions, namely sin, cos, and tan. This first snippet is my table lookup code. In order to avoid making a lookup table for each, I am just making one sin table, and defining cos(x) as sin(half_pi - x) and tan(x) as sin(x) / cos(x).
#include <math.h>
#include <time.h>
#include <stdio.h>
#include <stdlib.h>
const float two_pi = M_PI * 2, half_pi = M_PI / 2;
typedef struct {
    int fn_type, num_vals;
    double* vals, step;
} TrigTable;
static TrigTable sin_table;
TrigTable init_trig_table(const int fn_type, const int num_vals) {
    double (*trig_fn) (double), period;
    switch (fn_type) {
        case 0: trig_fn = sin, period = two_pi; break;
        case 1: trig_fn = cos, period = two_pi; break;
        case 2: trig_fn = tan, period = M_PI; break;
    }
    TrigTable table = {fn_type, num_vals,
        calloc(num_vals, sizeof(double)), period / num_vals};
    for (double x = 0; x < period; x += table.step)
        table.vals[(int) round(x / table.step)] = trig_fn(x);
    return table;
}
double _lookup(const TrigTable table, const double x) {
    return table.vals[(int) round(x / table.step)];
}
double lookup_sin(double x) {
    const double orig_x = x;
    if (x < 0) x = -x;
    if (x > two_pi) x = fmod(x, two_pi);
    const double result = _lookup(sin_table, x);
    return orig_x < 0 ? -result : result;
}
double lookup_cos(double x) {
    return lookup_sin(half_pi - x);
}
double lookup_tan(double x) {
    return lookup_sin(x) / lookup_cos(x);
}
Here is how I went about benchmarking my code: my function for the current time in milliseconds is from here. The problem arises here: when timing my lookup_sin vs math.h's sin, my variant takes around three times longer: Table time vs default: 328 ms, 108 ms.
Here is the timing for cos:
Table time vs default: 332 ms, 109 ms
Here is the timing for tan:
Table time vs default: 715 ms, 153 ms
What makes my code so much slower? I would think that precomputing sin values would greatly accelerate my code. Perhaps it's the fmod in the lookup_sin function? Please provide whatever insight that you have. I am compiling with clang with no optimizations enabled, so that the calls to each trig function are not removed (I am ignoring the return value).
const int64_t millis() {
    struct timespec now;
    timespec_get(&now, TIME_UTC);
    return ((int64_t) now.tv_sec) * 1000 + ((int64_t) now.tv_nsec) / 1000000;
}
const int64_t benchmark(double (*trig_fn) (double)) {
    const int64_t before = millis();
    for (double i = 0; i < 10000; i += 0.001)
        trig_fn(i);
    return millis() - before;
}
int main() {
    sin_table = init_trig_table(0, 15000);
    const int64_t table_time = benchmark(lookup_sin), default_time = benchmark(sin);
    printf("Table time vs default: %lld ms, %lld ms\n", table_time, default_time);
    free(sin_table.vals);
}
 
    