The following minimal example of calling a Python function from C++ has a memory leak on my system:
script.py:
import tensorflow
def foo(param):
    return "something"
main.cpp:
#include "python3.5/Python.h"
#include <iostream>
#include <string>
int main()
{
    Py_Initialize();
    PyRun_SimpleString("import sys");
    PyRun_SimpleString("if not hasattr(sys,'argv'): sys.argv = ['']");
    PyRun_SimpleString("sys.path.append('./')");
    PyObject* moduleName = PyUnicode_FromString("script");
    PyObject* pModule = PyImport_Import(moduleName);
    PyObject* fooFunc = PyObject_GetAttrString(pModule, "foo");
    PyObject* param = PyUnicode_FromString("dummy");
    PyObject* args = PyTuple_Pack(1, param);
    PyObject* result = PyObject_CallObject(fooFunc, args);
    Py_CLEAR(result);
    Py_CLEAR(args);
    Py_CLEAR(param);
    Py_CLEAR(fooFunc);
    Py_CLEAR(pModule);
    Py_CLEAR(moduleName);
    Py_Finalize();
}
compiled with
g++ -std=c++11 main.cpp $(python3-config --cflags) $(python3-config --ldflags) -o main
and run with valgrind
valgrind --leak-check=yes ./main
produces the following summary
LEAK SUMMARY:
==24155==    definitely lost: 161,840 bytes in 103 blocks
==24155==    indirectly lost: 33 bytes in 2 blocks
==24155==      possibly lost: 184,791 bytes in 132 blocks
==24155==    still reachable: 14,067,324 bytes in 130,118 blocks
==24155==                       of which reachable via heuristic:
==24155==                         stdstring          : 2,273,096 bytes in 43,865 blocks
==24155==         suppressed: 0 bytes in 0 blocks
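For brevity the program above does not check any of the return values. A version with the usual checks would look roughly like the sketch below (the valgrind summary above is from the unchecked program exactly as shown):
// Sketch: same call sequence as above, but with error checks added
// ("script", "foo" and "dummy" as before; only the checks and the
// PyErr_Print() calls are new).
#include "python3.5/Python.h"

int main()
{
    Py_Initialize();
    PyRun_SimpleString("import sys");
    PyRun_SimpleString("if not hasattr(sys,'argv'): sys.argv = ['']");
    PyRun_SimpleString("sys.path.append('./')");

    PyObject* moduleName = PyUnicode_FromString("script");
    PyObject* pModule = PyImport_Import(moduleName);
    if (pModule == nullptr)
    {
        PyErr_Print();                       // import failed (e.g. script.py not found)
        Py_CLEAR(moduleName);
        Py_Finalize();
        return 1;
    }

    PyObject* fooFunc = PyObject_GetAttrString(pModule, "foo");
    if (fooFunc == nullptr)
    {
        PyErr_Print();                       // attribute "foo" does not exist
    }
    else if (PyCallable_Check(fooFunc))
    {
        PyObject* param = PyUnicode_FromString("dummy");
        PyObject* args = PyTuple_Pack(1, param);
        PyObject* result = PyObject_CallObject(fooFunc, args);
        if (result == nullptr)
            PyErr_Print();                   // foo() raised an exception
        Py_CLEAR(result);
        Py_CLEAR(args);
        Py_CLEAR(param);
    }
    Py_CLEAR(fooFunc);
    Py_CLEAR(pModule);
    Py_CLEAR(moduleName);
    Py_Finalize();
}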
I'm using Linux Mint 18.2 Sonya, g++ 5.4.0, Python 3.5.2 and TensorFlow 1.4.1.
Removing the import tensorflow line makes the leak disappear. Is this a bug in TensorFlow, or did I do something wrong? (I expect the latter.)
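To narrow it down, I think the smallest possible variant would be to drop script.py entirely and only perform the import from C++, roughly like this (a sketch; I have not run valgrind on this exact version):
// Sketch: import TensorFlow and nothing else, to isolate the import itself.
#include "python3.5/Python.h"

int main()
{
    Py_Initialize();
    PyRun_SimpleString("import sys");
    PyRun_SimpleString("if not hasattr(sys,'argv'): sys.argv = ['']");
    PyRun_SimpleString("import tensorflow");   // no script.py, no function call
    Py_Finalize();
}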
Additionally, when I create a Keras layer in Python
#script.py
from keras.layers import Input
def foo(param):
    a = Input(shape=(32,))
    return "str"
and run the call to Python from C++ repeatedly
//main.cpp
#include "python3.5/Python.h"
#include <iostream>
#include <string>
int main()
{
    Py_Initialize();
    PyRun_SimpleString("import sys");
    PyRun_SimpleString("if not hasattr(sys,'argv'): sys.argv = ['']");
    PyRun_SimpleString("sys.path.append('./')");
    PyObject* moduleName = PyUnicode_FromString("script");
    PyObject* pModule = PyImport_Import(moduleName);
    for (int i = 0; i < 10000000; ++i)
    {
        std::cout << i << std::endl;
        PyObject* fooFunc = PyObject_GetAttrString(pModule, "foo");
        PyObject* param = PyUnicode_FromString("dummy");
        PyObject* args = PyTuple_Pack(1, param);
        PyObject* result = PyObject_CallObject(fooFunc, args);
        Py_CLEAR(result);
        Py_CLEAR(args);
        Py_CLEAR(param);
        Py_CLEAR(fooFunc);
    }
    Py_CLEAR(pModule);
    Py_CLEAR(moduleName);
    Py_Finalize();
}
the memory consumption of the application grows continuously and without bound while it runs.
So I suspect there is something fundamentally wrong with the way I call the Python function from C++, but what is it?
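One thing I'm unsure about is whether I need to clear Keras'/TensorFlow's global graph between calls. A variant of the loop that does this would look roughly like the sketch below; keras.backend.clear_session() is an existing Keras function, but I don't know whether calling it here is an actual fix or just hides the problem:
// Sketch: same loop as above, but clearing Keras' global state after each call.
#include "python3.5/Python.h"
#include <iostream>

int main()
{
    Py_Initialize();
    PyRun_SimpleString("import sys");
    PyRun_SimpleString("if not hasattr(sys,'argv'): sys.argv = ['']");
    PyRun_SimpleString("sys.path.append('./')");
    PyObject* moduleName = PyUnicode_FromString("script");
    PyObject* pModule = PyImport_Import(moduleName);
    for (int i = 0; i < 10000000; ++i)
    {
        std::cout << i << std::endl;
        PyObject* fooFunc = PyObject_GetAttrString(pModule, "foo");
        PyObject* param = PyUnicode_FromString("dummy");
        PyObject* args = PyTuple_Pack(1, param);
        PyObject* result = PyObject_CallObject(fooFunc, args);
        Py_CLEAR(result);
        Py_CLEAR(args);
        Py_CLEAR(param);
        Py_CLEAR(fooFunc);
        // Drop the default TF graph that each Input() call adds nodes to.
        PyRun_SimpleString("import keras.backend as K; K.clear_session()");
    }
    Py_CLEAR(pModule);
    Py_CLEAR(moduleName);
    Py_Finalize();
}
Even if that turned out to help with the growing memory use, it presumably would not explain the valgrind leak from the first example, which appears as soon as tensorflow is imported.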
