I made this code which it's reading the UpdatedUrls.tmp but this file contains only one url.
The code do somethings like scrap website looking for emails, but if i put 2 address or more in this file, the code dont work. The emails.txt be empty.
I need put a loop or change something to get two or more emails from urls.
Content of UpdatedUrls.tmp:
https://mobissom.com.br/contato/
I need working with:
https://mobissom.com.br/contato/
https://www.site2.com
https://www.site3.com
The code is here:
import re 
import requests 
from urllib.parse import urlsplit 
from collections import deque 
from bs4 import BeautifulSoup 
import pandas as pd 
with open("updatedUrls.tmp", "r") as smails:
    original_url = smails.readlines()
    original_url = ''.join(original_url)
# to save urls to be scraped
unscraped = deque([original_url])
# to save scraped urls
scraped = set()
# to save fetched emails
emails = set()
while len(unscraped):
    # move unsraped_url to scraped_urls set
    url = unscraped.popleft()  # popleft(): Remove and return an element from the left side of the deque
    scraped.add(url)
    parts = urlsplit(url)
    base_url = "{0.scheme}://{0.netloc}".format(parts)
    if '/' in parts.path:
        path = url[:url.rfind('/') + 1]
    else:
        path = url
    print("Crawling URL %s" % url)
    try:
        response = requests.get(url)
    except (requests.exceptions.MissingSchema, requests.exceptions.ConnectionError):
        continue
    new_emails = set(re.findall(r"[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.com", response.text, re.I))
    emails.update(new_emails)
    soup = BeautifulSoup(response.text, 'lxml')
    for anchor in soup.find_all("a"):
        if "href" in anchor.attrs:
            link = anchor.attrs["href"]
        else:
            link = ''
            if link.startswith('/'):
                link = base_url + link
            elif not link.startswith('http'):
                link = path + link
            if not link.endswith(".gz"):
                if not link in unscraped and not link in scraped:
                    unscraped.append(link)
df = pd.DataFrame(emails, columns=None)
df.to_csv('email.txt', index=False)
with open('email.txt', 'r') as fin:
    data = fin.read().splitlines(True)
with open('email.txt', 'w') as fout:
    fout.writelines(data[1:])
 
    