I have a problem with C++ and memory. Here's the pseudocode:
main.cpp
#include <iostream>
#include "seq.h"
int main(int argc, char *argv[]) {
    // Command-line arguments are not used by this reproduction.
    (void)argc;
    (void)argv;
    // SnpSite's constructor takes a non-const char*, and since C++11 a string
    // literal no longer converts to char*. Hand it a writable copy instead.
    char input_path[] = "/mnt/c/Users/manht/Downloads/s_typhi_wong_holt.aln.gz";
    SnpSite snp_site(input_path);
    snp_site.test(); // run the first time
    snp_site.test(); // run the second time
    return 0;
}
seq.h
#include "file_handler.h"
#include <stdio.h>
// Small driver around FileHandler: remembers an input path and, on each
// test() call, opens it, reads one sample and closes it again.
class SnpSite {
private:
    // Path given to the constructor; passed to fh.open() by test().
    string inputfile;
    // Reader used by test() to open, parse and close the input file.
    FileHandler fh;
public:
    // Stores _inputfile as the path to read.
    SnpSite(char* _inputfile);
    // Declared here; no definition is visible in this excerpt.
    int is_unknown(char base);
    // Opens the input, reads the next sample name and sequence, then closes.
    void test();
};
seq.cpp
#include "seq.h"
/*
    Remember the path of the file that test() will read.
    _inputfile: NUL-terminated path; a null pointer is stored as an empty path.

    `fh` is default-constructed implicitly. The former `fh = FileHandler()`
    only copy-assigned a temporary whose read buffer was never filled in,
    so it is dropped.
*/
SnpSite::SnpSite(char* _inputfile)
    : inputfile(_inputfile ? _inputfile : "") {
}
void SnpSite::test() {
    string sample_name, seq;
    this->fh.open(this->inputfile.c_str());
    this->fh.assign_next_sample_to(&sample_name, &seq);
    this->fh.close();
}
file_handler.h
#ifndef SEQ_H_
#include <zlib.h>
#include <utility>
#include <ctype.h>
#include "my_string.h"
#include <string>
using namespace std;
#define SEQ_H_
// Predicate type used by FileHandler::get_until(): returns true when byte `c`
// should stop the scan; `delimiter` is the code that get_until() was given.
typedef bool (*match_func)(int c, int delimiter);
// Buffered reader over a zlib gzFile that extracts sample names and sequences.
class FileHandler {
private:
    // zlib handle assigned by open() and passed to gzclose() by close().
    gzFile file;
    char buffer[2048]; // Static allocation for better performance.
    // buffer_start: cursor into `buffer`.
    // buffer_end: value returned by the most recent gzread() into `buffer`.
    int buffer_start, buffer_end;
    // Set once gzread() returns 0; cleared by open().
    bool eof;
    // Appends bytes to *s until a byte matching `delimiter` is found.
    void get_until(int delimiter, string *s);
public:
    // Starts with an empty buffer (both cursors at -1) and eof cleared.
    FileHandler();
    // NOTE(review): declared, but no definition is visible in this excerpt and
    // `buffer` has a fixed size of 2048 - confirm whether this is still needed.
    FileHandler(int _buffer_size);
    // Opens `filename` with gzopen() and clears the eof flag.
    void open(const char* filename);
    // Closes the current handle with gzclose().
    void close();
    // Clears *name and *seq, then fills them from the next sample in the file.
    void assign_next_sample_to(string *name, string *seq);
    // Advances the cursor by one byte (refilling the buffer when it runs out)
    // and returns that byte.
    int next_char();
    // Returns the eof flag.
    bool is_eof();
};
#endif
file_handler.cpp
#include "file_handler.h"
/*
    Build a handler with no file attached and an empty read buffer.
    `file` is value-initialised to a null handle so it never holds an
    indeterminate pointer before open() is called. The cursors start at -1,
    which makes the first next_char() call refill the buffer.
*/
FileHandler::FileHandler()
    : file(),
      buffer_start(-1),
      buffer_end(-1),
      eof(false) {
}
void FileHandler::open(const char* filename) {
    file = gzopen(filename, "r");
    eof = false;
}
void FileHandler::close() {
    /*
        Close the current gz handle, if any, and forget it.
        Clearing `file` afterwards makes a second close() in a row a no-op
        instead of passing an already released handle to gzclose().
    */
    if (file) {
        gzclose(file);
        file = 0;
    }
}
int FileHandler::next_char() {
    /*
        Advance the cursor (buffer_start) by one byte and return that byte
        as a value in 0..255. Returns -1 once no more data can be read.

        buffer_start is the index of the byte returned last, so a refill is
        due as soon as the *next* index would fall outside the valid data.
        The previous test (buffer_start >= buffer_end) was one step late and
        returned buffer[buffer_end], a byte past the data just read.
    */
    if (buffer_start + 1 >= buffer_end) {
        buffer_end = gzread(file, buffer, static_cast<unsigned>(sizeof(buffer)));
        buffer_start = -1;
        if (buffer_end <= 0) {
            // 0 means end of file and a negative value means a read error;
            // neither leaves anything to return.
            buffer_end = 0;
            buffer_start = 0; // same cursor position the old code ended on
            eof = true;
            return -1;
        }
    }
    // Go through unsigned char so bytes >= 0x80 are never mistaken for -1.
    return static_cast<unsigned char>(buffer[++buffer_start]);
}
bool FileHandler::is_eof() {
    return eof;
}
// Reserved delimiter codes for FileHandler::get_until(). Any delimiter value
// greater than SEP_MAX is compared against the data as a literal character.
#define SEP_SPACE 0 // any isspace() character: ' ', \t, \n, \v, \f, \r
#define SEP_TAB   1 // any isspace() character except ' '
#define SEP_LINE  2 // end of line: matched on "\n" only, so a "\r" before it is kept in the data
#define SEP_MAX   2 // highest reserved code
// list of function to compare c and delimiter, need exactly 2 arguments.
// True when c is a whitespace character according to isspace().
// The value is reduced to unsigned char first: c comes from a plain char
// buffer and can be negative, which isspace() does not accept.
// delimter is unused.
bool match_space(int c, int delimter) {
    (void)delimter;
    return isspace(static_cast<unsigned char>(c)) != 0;
}
// True when c is a whitespace character other than a plain space.
// The value is reduced to unsigned char first: c comes from a plain char
// buffer and can be negative, which isspace() does not accept.
// delimter is unused.
bool match_tab(int c, int delimter) {
    (void)delimter;
    if (c == ' ') return false;
    return isspace(static_cast<unsigned char>(c)) != 0;
}
// True only for the line-feed character; delimter is unused.
bool match_newline(int c, int delimter) {
    (void)delimter;
    const bool is_line_feed = ('\n' == c);
    return is_line_feed;
}
// True when c equals the delimiter value exactly.
bool match_char(int c, int delimter) {
    if (c != delimter) {
        return false;
    }
    return true;
}
// Never matches: with this predicate a scan only stops when the data runs out.
bool no_match(int c, int delimiter) {
    (void)c;
    (void)delimiter;
    return false;
}
// end list.
void FileHandler::get_until(int delimiter, string *s) {
    /*
        Append bytes to *s, starting with the byte under the cursor
        (buffer[buffer_start]) and stopping just before the first byte that
        matches `delimiter`. The buffer is refilled from `file` as needed.

        delimiter: SEP_SPACE, SEP_TAB or SEP_LINE select a character class;
                   a value above SEP_MAX is matched as a literal character;
                   any other value never matches (reads to the end of input).
        s:         destination string; bytes are appended, never cleared.

        On return the cursor rests on the matching byte (it is not consumed).
        If the input ends first, or gzread() reports an error, eof is set.

        Every buffer access is bounds-checked before the byte is examined.
        The previous version tested buffer[i] first, so it could read
        buffer[-1] or buffer[buffer_end], and a stale byte past the valid
        data could end the scan early.
    */
    match_func match; // function to check if a char match delimiter
    switch (delimiter) {
        case SEP_SPACE:
            match = match_space;
            break;
        case SEP_TAB:
            match = match_tab;
            break;
        case SEP_LINE:
            match = match_newline;
            break;
        default:
            if (delimiter > SEP_MAX) match = match_char;
            else match = no_match;
    }
    // A cursor parked just before the buffer (index -1) resumes at byte 0.
    if (buffer_start < 0) buffer_start = 0;
    for (;;) {
        if (buffer_start >= buffer_end) {
            // Buffer exhausted: fetch the next chunk.
            buffer_end = gzread(file, buffer, static_cast<unsigned>(sizeof(buffer)));
            buffer_start = 0;
            if (buffer_end <= 0) {
                // 0 is end of file, negative is a read error. Stop in both
                // cases instead of retrying forever on an error.
                buffer_end = 0;
                eof = true;
                break;
            }
        }
        int i = buffer_start;
        while (i < buffer_end && !match(buffer[i], delimiter)) i++;
        s->append(buffer + buffer_start, i - buffer_start);
        buffer_start = i;
        if (i < buffer_end) break; // stopped on a matching byte
    }
}
/*
    Read the next sample from the file into *name and *seq.
    Both strings are cleared first. Reading stops at the marker that starts
    the next record ('>' or '@') or at a '+' line, so quality scores of a
    QUAL/FASTQ record are not read.
*/
void FileHandler::assign_next_sample_to(string *name, string *seq) {
    name->clear();
    seq->clear();

    // Skip forward until a record marker is under the cursor.
    while (!eof) {
        const int ch = next_char();
        if (ch == '>' || ch == '@') break;
    }
    // NOTE(review): the cursor is still on the marker here, so the marker
    // appears to be stored as the first character of *name - confirm intended.
    get_until(SEP_SPACE, name);

    // Collect sequence lines until the next marker, a '+' line, or end of input.
    while (!eof) {
        const int ch = next_char();
        if (ch == '>' || ch == '@' || ch == '+') break;
        if (ch != '\n') {
            get_until(SEP_LINE, seq); // append the rest of this line
        }
    }
    // Step back one byte so the next call sees the marker again.
    --buffer_start;
}
I don't use any dynamic allocation, and when I traced memory usage by PID in htop, I found something that I can't explain:
- The first time I call test():
- At the beginning of the function, my process uses 6168 KBytes.
- At the end of the function, my process uses 13998 KBytes.
 
- The second time I call test():
- At the beginning of the function, my process uses 6304 KBytes.
- At the end of the function, my process uses 21664 KBytes.
 
In both cases the length of the seq variable is 4809037 and the length of sample_name is 11. I don't understand why the memory usage differs so much between the two calls. I hope someone can find the cause and explain it to me; it would help me a lot. Thanks.
 
    