I'm trying to run a Scrapy spider from a Telegram bot using the python-telegram-bot API wrapper. With the code below, I can successfully execute the spider and forward the scraped results to the bot, but only ONCE per run of the script. When I attempt to re-execute the spider via the bot (Telegram bot command), I get the error twisted.internet.error.ReactorNotRestartable.
from twisted.internet import reactor
from scrapy import cmdline
from telegram.ext import Updater, CommandHandler, MessageHandler, Filters, RegexHandler
import logging
import os
import ConfigParser
import json
import textwrap
from MIS.spiders.moodle_spider import MySpider
from scrapy.utils.project import get_project_settings
from scrapy.crawler import CrawlerRunner, CrawlerProcess
from scrapy.utils.log import configure_logging
# Read the bot credentials from an INI file. The path is relative to the
# current working directory, so the script must be started from the matching
# directory.
# NOTE: RawConfigParser.read() silently skips files it cannot find; a missing
# file only shows up as an error from config.get() below.
config = ConfigParser.RawConfigParser()
config.read('./spiders/creds.ini')
TOKEN = config.get('BOT', 'TOKEN')
# Webhook-only settings, disabled because the bot uses polling instead.
#APP_NAME = config.get('BOT', 'APP_NAME')
#PORT = int(os.environ.get('PORT', '5000'))
# The Updater receives Telegram updates and hands them to its dispatcher.
updater = Updater(TOKEN)
# Setting Webhook (disabled alternative to updater.start_polling())
#updater.start_webhook(listen="0.0.0.0",
# port=PORT,
# url_path=TOKEN)
#updater.bot.setWebhook(APP_NAME + TOKEN)
# Log at INFO level with timestamp, logger name and level on every record.
logging.basicConfig(format='%(asctime)s -# %(name)s - %(levelname)s - %(message)s',level=logging.INFO)
# Handlers are registered on this dispatcher further down.
dispatcher = updater.dispatcher
# Real stuff
def doesntRun(bot, update):
#process = CrawlerProcess(get_project_settings())
#process.crawl(MySpider)
#process.start()
############
configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
runner = CrawlerRunner({
'FEED_FORMAT' : 'json',
'FEED_URI' : 'output.json'
})
d = runner.crawl(MySpider)
d.addBoth(lambda _: reactor.stop())
reactor.run(installSignalHandlers=0) # the script will block here until the crawling is finished
#reactor.stop()
with open("./output.json", 'r') as file:
contents = file.read()
a_r = json.loads(contents)
AM = a_r[0]['AM']
...
...
message_template = textwrap.dedent("""
AM: {AM}
...
""")
messageContent = message_template.format(AM=AM, ...)
#print messageContent
bot.sendMessage(chat_id=update.message.chat_id, text=messageContent)
#reactor.stop()
# Handlers
# Route the /doesntRun bot command to the doesntRun callback.
test_handler = CommandHandler('doesntRun', doesntRun)
# Dispatchers
dispatcher.add_handler(test_handler)
# Start fetching updates by polling, then keep the main thread blocked in
# idle() so the script stays alive until it is interrupted.
updater.start_polling()
updater.idle()
I'm using the crawling code from the Scrapy docs: https://doc.scrapy.org/en/latest/topics/practices.html#run-scrapy-from-a-script
The example from the docs looks like this:
from twisted.internet import reactor
import scrapy
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
class MySpider(scrapy.Spider):
    # Your spider definition (placeholder; the docs example leaves it empty)
    ...
# Use a short "LEVEL: message" format for the log output.
configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
runner = CrawlerRunner()
# crawl() schedules the spider and returns a Deferred for the crawl.
d = runner.crawl(MySpider)
# Stop the reactor whether the crawl succeeded or failed.
d.addBoth(lambda _: reactor.stop())
# NOTE: a stopped Twisted reactor cannot be started again in the same process,
# so this pattern fits one-shot scripts rather than repeated invocations.
reactor.run() # the script will block here until the crawling is finished