(Continued from a previous question)
I am trying to deploy a google dataflow job to run it as a cron job on the google app engine, following the method described here.
I have an DataFlow script (written in python) in a pipelines/script.py folder. Running this script locally (using the Apache Beam DirectRunner) or on google cloud (using the DataFlowRunner) works properly. But when deploying the job to run it periodically on the app engine, the job raises the following error when executed:
(4cb822d7f796239a): Traceback (most recent call last):   File
"/usr/local/lib/python2.7/dist-packages/dataflow_worker/batchworker.py",
line 582, in do_work
    work_executor.execute()   File "/usr/local/lib/python2.7/dist-packages/dataflow_worker/executor.py",
line 166, in execute
    op.start()   File "apache_beam/runners/worker/operations.py", line 294, in apache_beam.runners.worker.operations.DoOperation.start
(apache_beam/runners/worker/operations.c:10607)
    def start(self):   File "apache_beam/runners/worker/operations.py", line 295, in
apache_beam.runners.worker.operations.DoOperation.start
(apache_beam/runners/worker/operations.c:10501)
    with self.scoped_start_state:   File "apache_beam/runners/worker/operations.py", line 300, in
apache_beam.runners.worker.operations.DoOperation.start
(apache_beam/runners/worker/operations.c:9702)
    pickler.loads(self.spec.serialized_fn))   File "/usr/local/lib/python2.7/dist-
packages/apache_beam/internal/pickler.py", line 225, in loads
    return dill.loads(s)   File "/usr/local/lib/python2.7/dist-packages/dill/dill.py", line 277, in
loads
    return load(file)   File "/usr/local/lib/python2.7/dist-packages/dill/dill.py", line 266, in
load
    obj = pik.load()   File "/usr/lib/python2.7/pickle.py", line 858, in load
    dispatch[key](self)   File "/usr/lib/python2.7/pickle.py", line 1090, in load_global
    klass = self.find_class(module, name)   File "/usr/local/lib/python2.7/dist-packages/dill/dill.py", line 423, in
find_class
    return StockUnpickler.find_class(self, module, name)   File "/usr/lib/python2.7/pickle.py", line 1124, in find_class
    __import__(module) ImportError: No module named pipelines.spanner_backup
This is the stack trace visible when directly accessing the job in the dataflow panel of the google cloud console. However, if I click on "Stack Traces" to see the error stack trace from the "Stackdriver Error Reporting" panel, I see the following trace:
Traceback (most recent call last):
  File "/usr/local/lib/python2.7/dist-packages/dataflow_worker/batchworker.py", line 738, in run
    work, execution_context, env=self.environment)
  File "/usr/local/lib/python2.7/dist-packages/dataflow_worker/workitem.py", line 130, in get_work_items
    work_item_proto.sourceOperationTask.split)
  File "/usr/local/lib/python2.7/dist-packages/dataflow_worker/workercustomsources.py", line 142, in __init__
    source_spec[names.SERIALIZED_SOURCE_KEY]['value'])
  File "/usr/local/lib/python2.7/dist-packages/apache_beam/internal/pickler.py", line 225, in loads
    return dill.loads(s)
  File "/usr/local/lib/python2.7/dist-packages/dill/dill.py", line 277, in loads
    return load(file)
  File "/usr/local/lib/python2.7/dist-packages/dill/dill.py", line 266, in load
    obj = pik.load()
  File "/usr/lib/python2.7/pickle.py", line 858, in load
    dispatch[key](self)
  File "/usr/lib/python2.7/pickle.py", line 1090, in load_global
    klass = self.find_class(module, name)
  File "/usr/local/lib/python2.7/dist-packages/dill/dill.py", line 423, in find_class
    return StockUnpickler.find_class(self, module, name)
  File "/usr/lib/python2.7/pickle.py", line 1124, in find_class
    __import__(module)
ImportError: No module named spanner.client
Suggesting some import error when sharing things between workers? Google Spanner should be properly installed though.
I am using:
Flask==0.12.2 
apache-beam[gcp]==2.1.1 
gunicorn==19.7.1 
gevent==1.2.1
google-cloud-dataflow==2.1.1 
google-cloud-spanner==0.26
Am I missing something ?
Edit: My setup.py is the following: (as described here, corresponding github link with comments here)
from distutils.command.build import build as _build
import subprocess
import setuptools
class build(_build):  # pylint: disable=invalid-name
  sub_commands = _build.sub_commands + [('CustomCommands', None)]
CUSTOM_COMMANDS = [
    ['echo', 'Custom command worked!']]
class CustomCommands(setuptools.Command):
  """A setuptools Command class able to run arbitrary commands."""
  def initialize_options(self):
    pass
  def finalize_options(self):
    pass
  def RunCustomCommand(self, command_list):
    print 'Running command: %s' % command_list
    p = subprocess.Popen(
        command_list,
        stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    # Can use communicate(input='y\n'.encode()) if the command run requires
    # some confirmation.
    stdout_data, _ = p.communicate()
    print 'Command output: %s' % stdout_data
    if p.returncode != 0:
      raise RuntimeError(
          'Command %s failed: exit code: %s' % (command_list, p.returncode))
  def run(self):
    for command in CUSTOM_COMMANDS:
      self.RunCustomCommand(command)
REQUIRED_PACKAGES = ["Flask==0.12.2",
                        "apache-beam[gcp]==2.1.1",
                        "gunicorn==19.7.1",
                        "gevent==1.2.1",
                        "google-cloud-dataflow==2.1.1",
                        "google-cloud-spanner==0.26"
                    ]
setuptools.setup(
    name='dataflow_python_pipeline',
    version='1.0.0',
    description='DataFlow Python Pipeline',
    install_requires=REQUIRED_PACKAGES,
    packages=setuptools.find_packages(),
    cmdclass={
        'build': build,
        'CustomCommands': CustomCommands,
        }
    )