I have the following soup:
<a href="some_url">next</a>

From this I want to extract the href, "some_url", as well as the whole list of pages that are listed on this page: https://www.catholic-hierarchy.org/diocese/laa.html
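For a tag like that, the href is just an attribute lookup. A minimal sketch, assuming "next" is the anchor's exact text:

a = soup.find("a", string="next")  # the <a> tag from the snippet above
some_url = a["href"]               # -> "some_url"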
Note: there are a whole lot of links to sub-pages which I need to parse as well. At the moment I am getting all the data out of them: dioceses, URLs, descriptions, contact data, etc.
The example below grabs all diocese URLs, gets some info about each of them, and creates the final dataframe. To speed up the process, multiprocessing.Pool is used:
But wait: how do I get this scraper running without multiprocessing? I want to run it in Colab, so I need to get rid of the multiprocessing feature (a sequential sketch follows after the script below). How can I achieve this?
import requests
import pandas as pd
from bs4 import BeautifulSoup
from multiprocessing import Pool

def get_dioceses_urls(section_url):
    """Collect all diocese URLs of one section, following the [Next Page] links."""
    dioceses_urls = set()
    while True:
        print(section_url)
        soup = BeautifulSoup(
            requests.get(section_url, headers=headers).content, "lxml"
        )
        for a in soup.select('ul a[href^="d"]'):
            dioceses_urls.add(
                "https://www.catholic-hierarchy.org/diocese/" + a["href"]
            )
        # is there Next Page button?
        next_page = soup.select_one('a:has(img[alt="[Next Page]"])')
        if next_page:
            section_url = (
                "https://www.catholic-hierarchy.org/diocese/"
                + next_page["href"]
            )
        else:
            break
    return dioceses_urls

def get_diocese_info(url):
    """Scrape the titles and the key/value jurisdiction details of one diocese page."""
    print(url)
    soup = BeautifulSoup(requests.get(url, headers=headers).content, "html5lib")
    data = {
        "Title 1": soup.h1.get_text(strip=True),
        "Title 2": soup.h2.get_text(strip=True),
        "Title 3": soup.h3.get_text(strip=True) if soup.h3 else "-",
        "URL": url,
    }
    li = soup.find(
        lambda tag: tag.name == "li"
        and "type of jurisdiction:" in tag.text.lower()
        and tag.find() is None
    )
    if li:
        for l in li.find_previous("ul").find_all("li"):
            t = l.get_text(strip=True, separator=" ")
            if ":" in t:
                k, v = t.split(":", maxsplit=1)
                data[k.strip()] = v.strip()
    # get other info about the diocese
    # ...
    return data

if __name__ == "__main__":
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:99.0) Gecko/20100101 Firefox/99.0"
    }
    # get main sections:
    url = "https://www.catholic-hierarchy.org/diocese/laa.html"
    soup = BeautifulSoup(
        requests.get(url, headers=headers).content, "html.parser"
    )
    main_sections = [url]
    for a in soup.select("a[target='_parent']"):
        main_sections.append(
            "https://www.catholic-hierarchy.org/diocese/" + a["href"]
        )
    all_data, dioceses_urls = [], set()
    with Pool() as pool:
        # get all dioceses urls:
        for urls in pool.imap_unordered(get_dioceses_urls, main_sections):
            dioceses_urls.update(urls)
        # get info about all dioceses:
        for info in pool.imap_unordered(get_diocese_info, dioceses_urls):
            all_data.append(info)
    # create dataframe from the info about dioceses
    df = pd.DataFrame(all_data).sort_values("Title 1")
    # save it to csv file
    df.to_csv("data.csv", index=False)
    print(df.head().to_markdown())
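To answer the multiprocessing question: both Pool loops only fan the two functions out over a list of inputs, so the whole with Pool() as pool: block can be replaced by two plain for loops, with everything else unchanged. A minimal sequential sketch of that block; expect it to run noticeably slower, since pages are now fetched one at a time:

    all_data, dioceses_urls = [], set()
    # get all dioceses urls, one section at a time:
    for section_url in main_sections:
        dioceses_urls.update(get_dioceses_urls(section_url))
    # get info about all dioceses, one page at a time:
    for diocese_url in dioceses_urls:
        all_data.append(get_diocese_info(diocese_url))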
Update: here is what I get back if I run the script on Colab:
https://www.catholic-hierarchy.org/diocese/laa.html
https://www.catholic-hierarchy.org/diocese/lab.html
---------------------------------------------------------------------------
RemoteTraceback                           Traceback (most recent call last)
RemoteTraceback: 
"""
Traceback (most recent call last):
  File "/usr/lib/python3.7/multiprocessing/pool.py", line 121, in worker
    result = (True, func(*args, **kwds))
  File "<ipython-input-1-f5ea34a0190f>", line 21, in get_dioceses_urls
    next_page = soup.select_one('a:has(img[alt="[Next Page]"])')
  File "/usr/local/lib/python3.7/dist-packages/bs4/element.py", line 1403, in select_one
    value = self.select(selector, limit=1)
  File "/usr/local/lib/python3.7/dist-packages/bs4/element.py", line 1528, in select
    'Only the following pseudo-classes are implemented: nth-of-type.')
NotImplementedError: Only the following pseudo-classes are implemented: nth-of-type.
"""
The above exception was the direct cause of the following exception:
NotImplementedError                       Traceback (most recent call last)
<ipython-input-1-f5ea34a0190f> in <module>
     81     with Pool() as pool:
     82         # get all dioceses urls:
---> 83         for urls in pool.imap_unordered(get_dioceses_urls, main_sections):
     84             dioceses_urls.update(urls)
     85 
/usr/lib/python3.7/multiprocessing/pool.py in next(self, timeout)
    746         if success:
    747             return value
--> 748         raise value
    749 
    750     __next__ = next                    # XXX
NotImplementedError: Only the following pseudo-classes are implemented: nth-of-type.
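The NotImplementedError is not about multiprocessing at all: the beautifulsoup4 version preinstalled in that Colab image predates 4.7.0, which is when the soupsieve-based CSS selectors (including :has()) were added. Upgrading should make the original selector work (pip install -U beautifulsoup4 soupsieve). Alternatively, the next-page lookup can be rewritten with plain find calls, which old bs4 versions also support — a minimal sketch of a drop-in replacement for that line:

        # equivalent of soup.select_one('a:has(img[alt="[Next Page]"])'):
        img = soup.find("img", alt="[Next Page]")
        next_page = img.find_parent("a") if img else None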