Since I was unable to find a tool that works like dd but preserves the sparseness of the input in its output, I wrote a small Python script that does exactly that:
#!/usr/bin/env python
import subprocess
import os
import sys
def main():
infd = os.open(sys.argv[1], os.O_RDONLY)
inlength = os.lseek(infd, 0, os.SEEK_END)
outfd = os.open(sys.argv[2], os.O_CREAT | os.O_WRONLY)
outlength = os.lseek(outfd, 0, os.SEEK_END)
offset = int(sys.argv[3])
curr = 0
while True:
try:
data = os.lseek(infd, curr, os.SEEK_DATA)
except OSError:
# no more data
break
try:
hole = os.lseek(infd, data, os.SEEK_HOLE)
except OSError:
# no hole afterwards, copy until EOF
hole = inlength
print(
f"copying range {data}..{hole} ({100*hole/inlength:.2f}%)", file=sys.stderr
)
os.copy_file_range(
infd, outfd, hole - data, offset_src=data, offset_dst=data + offset
)
curr = hole
if outlength < inlength + offset:
os.truncate(outfd, inlength + offset)
os.close(infd)
os.close(outfd)
# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
According to du my final system images now only take 3.7 G of disk space instead of the full 5.1 G that the image file is sized at.
For my 3 G ext4 filesystems, the main loop above only iterates around 500 times, so most of the time is spent in copy_file_range() rather than in Python. I'm therefore not worried that the code is not written in C, although porting it to C would probably be trivial.
EDIT
Okay, here is a version written in C:
#define _GNU_SOURCE
#define _LARGEFILE64_SOURCE
#include <sys/types.h>
#include <fcntl.h>
#include <unistd.h>
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
/*
 * Usage: prog infile outfile [offset]
 *
 * Copies the data extents of infile (as reported by SEEK_DATA/SEEK_HOLE)
 * into outfile, each at its original position plus the optional byte
 * offset (default 0). Ranges that are holes in the input are never written.
 * If outfile ends up shorter than the input length plus offset, it is
 * extended to that length with ftruncate().
 *
 * Exits with EXIT_FAILURE and a diagnostic on stderr if any step fails.
 */
int main(int argc, char *argv[]) {
    if (argc != 3 && argc != 4) {
        fprintf(stderr, "Usage: %s infile outfile [offset]\n", argv[0]);
        exit(EXIT_FAILURE);
    }

    /* Parse the offset first so a bad argument does not create outfile. */
    long long offset = 0;
    if (argc == 4) {
        char *end = NULL;
        /* strtoll() only sets errno on failure, so it must be cleared
         * beforehand or a stale value would be reported as an error. */
        errno = 0;
        offset = strtoll(argv[3], &end, 10);
        if (errno != 0) {
            perror("strtoll");
            exit(EXIT_FAILURE);
        }
        if (end == argv[3] || *end != '\0') {
            fprintf(stderr, "invalid offset: %s\n", argv[3]);
            exit(EXIT_FAILURE);
        }
    }

    int infd = open(argv[1], O_RDONLY);
    if (infd == -1) {
        perror("open");
        exit(EXIT_FAILURE);
    }
    off64_t inlength = lseek64(infd, 0, SEEK_END);
    if (inlength == -1) {
        perror("lseek64");
        exit(EXIT_FAILURE);
    }

    /* O_CREAT requires the mode argument; without it the permission bits
     * of a newly created file are indeterminate. */
    int outfd = open(argv[2], O_CREAT | O_WRONLY, 0666);
    if (outfd == -1) {
        perror("open");
        exit(EXIT_FAILURE);
    }
    off64_t outlength = lseek64(outfd, 0, SEEK_END);
    if (outlength == -1) {
        perror("lseek64");
        exit(EXIT_FAILURE);
    }

    off64_t curr = 0;
    while (true) {
        off64_t data = lseek64(infd, curr, SEEK_DATA);
        if (data == -1) {
            /* ENXIO: no data at or after curr. Anything else is a real
             * error and must not be mistaken for the end of the data. */
            if (errno != ENXIO) {
                perror("lseek64");
                exit(EXIT_FAILURE);
            }
            break;
        }
        off64_t hole = lseek64(infd, data, SEEK_HOLE);
        if (hole == -1) {
            if (errno != ENXIO) {
                perror("lseek64");
                exit(EXIT_FAILURE);
            }
            /* no hole afterwards, copy until EOF */
            hole = inlength;
        }
        off64_t off_out = data + offset;
        /* copy_file_range() may copy fewer bytes than requested. It
         * advances both offsets by the amount copied, so loop until the
         * whole extent has been transferred. */
        while (data < hole) {
            ssize_t ret = copy_file_range(infd, &data, outfd, &off_out,
                                          (size_t)(hole - data), 0);
            if (ret == -1) {
                perror("copy_file_range");
                exit(EXIT_FAILURE);
            }
            if (ret == 0) {
                /* A zero-byte copy would otherwise loop forever. */
                fprintf(stderr, "copy_file_range: unexpected end of input\n");
                exit(EXIT_FAILURE);
            }
        }
        curr = hole;
    }

    /* A trailing hole in the input is never written, so extend the output
     * to the expected total length if it is still too short. */
    if (outlength < inlength + offset) {
        int ret = ftruncate(outfd, inlength + offset);
        if (ret == -1) {
            perror("ftruncate");
            exit(EXIT_FAILURE);
        }
    }

    close(infd);
    /* Deferred write errors can surface at close() time. */
    if (close(outfd) == -1) {
        perror("close");
        exit(EXIT_FAILURE);
    }
    exit(EXIT_SUCCESS);
}