Here's an example. It sets up a full terabyte mapping that is initially inaccessible (PROT_NONE). You, the programmer, maintain a window that can only extend and move upwards in memory. The example program uses a one and a half gigabyte window, advancing it in steps of 1,023,739,137 bytes (mapping_use() makes sure the available pages cover at least the desired region), and actually modifies every page in every window, just to be sure.
#define _GNU_SOURCE
#define _POSIX_C_SOURCE 200809L
#include <stdlib.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <errno.h>
#include <string.h>
#include <stdio.h>
/* Bookkeeping for one mapping and the accessible window inside it.
 * All three pointers refer to the same mapping; page is the granularity
 * the window boundaries are rounded to.
 */
typedef struct mapping {
    unsigned char *head; /* Start of currently accessible region */
    unsigned char *tail; /* End of currently accessible region */
    unsigned char *ends; /* End of region */
    size_t         page; /* Page size of this mapping */
} mapping;
/* Discard mapping.
 *
 * Unmaps everything from the current head up to the end of the region
 * and resets the descriptor to its empty state (NULL pointers, page 0).
 * A NULL descriptor, or one whose end is not above its head, is left
 * untouched. The result of munmap() is not examined.
 */
void mapping_free(mapping *const m)
{
    if (!m || m->ends <= m->head) {
        return;
    }

    const size_t remaining = (size_t)(m->ends - m->head);

    munmap(m->head, remaining);

    m->page = 0;
    m->ends = NULL;
    m->tail = NULL;
    m->head = NULL;
}
/* Move the accessible part up in memory, to cover at least [from..to).
 *
 * The window can only grow or move upwards:
 *   - the new head is 'from' rounded down to a page boundary, clamped to
 *     [m->head, m->ends]; pages below the new head are unmapped;
 *   - the new tail is 'to' rounded up to a page boundary, clamped to
 *     [new head, m->ends]; the tail is never lowered.
 *
 * Newly covered pages are made readable and writable, using mprotect()
 * if USE_MPROTECT is defined at compile time, and a fixed anonymous
 * mmap() otherwise.
 *
 * Returns 0 on success. On failure returns a nonzero errno value (also
 * stored in errno): EINVAL if m is NULL or describes an empty mapping,
 * ENOMEM if the mmap() call fails, or whatever munmap()/mprotect() set.
 * After a failure the descriptor reflects only the steps that succeeded.
 */
int mapping_use(mapping *const m, void *const from, void *const to)
{
    if (!m || m->ends <= m->head) {
        return errno = EINVAL;
    }

    /* New head: 'from' rounded DOWN to a page boundary, never below the
     * current head and never beyond the end of the region. */
    unsigned char *const head = ((unsigned char *)from <= m->head) ? m->head :
                                ((unsigned char *)from >= m->ends) ? m->ends :
                                m->head + m->page * (size_t)(((size_t)((unsigned char *)from - m->head)) / m->page);

    /* New tail: 'to' rounded UP to a page boundary, never below the new
     * head and never beyond the end of the region. */
    unsigned char *const tail = ((unsigned char *)to <= head) ? head :
                                ((unsigned char *)to >= m->ends) ? m->ends :
                                m->head + m->page * (size_t)(((size_t)((unsigned char *)to - m->head) + m->page - 1) / m->page);

    if (head > m->head) {
        /* Drop the pages that fell out of the bottom of the window. */
        if (munmap(m->head, (size_t)(head - m->head)) == -1) {
            return errno;
        }
        m->head = head;

        /* If the window jumped past the old tail, everything up to the
         * new head has just been unmapped. Pull the tail up so the code
         * below neither re-maps those pages (which mapping_free() would
         * never release) nor mprotect()s an unmapped range. */
        if (m->tail < head) {
            m->tail = head;
        }
    }

    if (tail > m->tail) {
#ifdef USE_MPROTECT
        if (mprotect(m->tail, (size_t)(tail - m->tail), PROT_READ | PROT_WRITE) == -1) {
            return errno;
        }
#else
        void *result;

        /* MAP_FIXED replaces the pages already at this address with
         * fresh read/write anonymous pages. */
        do {
            result = mmap(m->tail, (size_t)(tail - m->tail), PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_FIXED | MAP_PRIVATE | MAP_NORESERVE, -1, (off_t)0);
        } while (result == MAP_FAILED && errno == EINTR);
        if (result == MAP_FAILED) {
            return errno = ENOMEM;
        }
#endif
        m->tail = tail;
    }

    return 0;
}
/* Initialize a mapping.
 *
 * Creates a private anonymous PROT_NONE mapping of at least 'size' bytes
 * (rounded up to a whole number of pages) and records it in *m with an
 * empty accessible window (head == tail == start of the mapping).
 *
 * Returns 0 and sets errno to 0 on success. On failure returns a nonzero
 * errno value: EINVAL for a NULL descriptor or zero size, ENOTSUP if the
 * page size cannot be obtained as a positive size_t, or the errno left
 * by the failed mmap(). On failure after the argument check, *m holds
 * NULL pointers and page 0.
 */
int mapping_create(mapping *const m, const size_t size)
{
    if (m == NULL || size == 0) {
        return errno = EINVAL;
    }

    /* Begin with an empty descriptor. */
    m->page = 0;
    m->ends = NULL;
    m->tail = NULL;
    m->head = NULL;

    /* Obtain the default page size; it must be positive and must survive
     * the conversion to size_t and back unchanged. */
    const long reported = sysconf(_SC_PAGESIZE);
    const size_t pagesize = (size_t)reported;
    if (reported < 1L || (long)pagesize != reported) {
        return errno = ENOTSUP;
    }

    /* Round the requested size up to the next multiple of the page size. */
    const size_t excess = size % pagesize;
    const size_t length = excess ? size + pagesize - excess : size;

    /* Create the mapping. errno is preset to ENOTSUP before every attempt
     * (presumably as a fallback code; the reason is not stated here). */
    void *addr;
    do {
        errno = ENOTSUP;
        addr = mmap(NULL, length, PROT_NONE, MAP_ANONYMOUS | MAP_PRIVATE | MAP_NORESERVE, -1, (off_t)0);
    } while (addr == MAP_FAILED && errno == EINTR);
    if (addr == MAP_FAILED) {
        return errno;
    }

    /* Success: the window starts out empty at the bottom of the region. */
    m->head = addr;
    m->tail = addr;
    m->ends = (unsigned char *)addr + length;
    m->page = pagesize;

    errno = 0;
    return 0;
}
/* Modify bytes spread across the 'size'-byte region starting at 'ptr'.
 *
 * Increments the first byte, the last byte, and the byte at every
 * multiple of 2048 from 2048 up to (size / 2048 - 1) * 2048. For a
 * one-byte region the first and last byte coincide, so that byte is
 * incremented twice. Does nothing if ptr is NULL or size is 0.
 */
static void memtouch(void *const ptr, const size_t size)
{
    if (!ptr || size == 0) {
        return;
    }

    unsigned char *mem = (unsigned char *)ptr;
    const size_t step = 2048;
    const size_t chunks = size / step;

    /* For regions shorter than one step, chunks is 0; subtracting 1
     * unconditionally would wrap to SIZE_MAX and send the loop far
     * beyond the end of the region. */
    size_t n = (chunks > 0) ? chunks - 1 : 0;

    mem[0]++;
    mem[size - 1]++;

    while (n-- > 0) {
        mem += step;
        mem[0]++;
    }
}
/* Demonstration driver: reserve a 1024^4-byte mapping, then slide a
 * 1,500,000,000-byte window upwards through it in 1,023,739,137-byte
 * steps, touching the memory of every window. Progress is printed to
 * standard output; exits with EXIT_FAILURE as soon as a step fails.
 */
int main(void)
{
    const size_t size = (size_t)1024 * (size_t)1024 * (size_t)1024 * (size_t)1024;
    const size_t need = (size_t)1500000000UL;
    const size_t step = (size_t)1023739137UL;
    mapping map;

    if (mapping_create(&map, size) != 0) {
        fprintf(stderr, "Cannot create a %zu-byte mapping: %m.\n", size);
        return EXIT_FAILURE;
    }

    printf("Have a %zu-byte mapping at %p to %p.\n", size, (void *)map.head, (void *)map.ends);
    fflush(stdout);

    /* Remember the original start; map.head moves as the window advances. */
    unsigned char *const base = map.head;
    const size_t last = size - need;
    size_t offset = 0;

    while (offset <= last) {
        unsigned char *const lo = base + offset;
        unsigned char *const hi = lo + need;

        printf("Requesting %p to %p .. ", (void *)lo, (void *)hi);
        fflush(stdout);

        if (mapping_use(&map, lo, hi) != 0) {
            printf("Failed (%m).\n");
            fflush(stdout);
            return EXIT_FAILURE;
        }

        printf("received %p to %p.\n", (void *)map.head, (void *)map.tail);
        fflush(stdout);

        /* Write into the window so the pages really get used. */
        memtouch(lo, need);

        offset += step;
    }

    mapping_free(&map);
    return EXIT_SUCCESS;
}
The approach is twofold. First, an inaccessible (PROT_NONE) mapping is created to reserve the necessary contiguous virtual address space. If we omitted this step, a malloc() call or similar could acquire pages within this range, which would defeat the entire purpose: a single terabyte-long mapping.
Second, when the accessible window extends into the region, either mprotect() (if USE_MPROTECT is defined), or mmap() is used to make the required pages accessible. Pages no longer needed are completely unmapped.
Compile and run using
gcc -Wall -Wextra -std=c99 example.c -o example
time ./example
or, to use mmap() only once and mprotect() to move the window,
gcc -DUSE_MPROTECT=1 -Wall -Wextra -std=c99 example.c -o example
time ./example
Note that you probably don't want to run the test if you don't have at least 4GB of physical RAM.
On this particular machine (i5-4200U laptop with 4GB of RAM, 3.13.0-62-generic kernel on Ubuntu x86_64), quick testing didn't show any kind of performance difference between mprotect() and mmap(), in execution speed or resident set size.
If anyone bothers to compile and run the above, and finds that one of them has a repeatable benefit/drawback (resident set size or time used), I'd very much like to know about it. Please also state the kernel and CPU you used.
I'm not sure which details I should expand on, since this is pretty straightforward, really, and the Linux man pages project man 2 mmap and man 2 mprotect pages are quite descriptive. If you have any questions on this approach or program, I'd be happy to try and elaborate.