I am trying to write a scraper class that reads Wikipedia data into a JSON file. It needs to read tables, extract the links from each table's first column, retrieve information from those links, and write the result out as JSON. The problem is that I keep getting errors such as `'NoneType' object has no attribute 'find_all'` when I try to extract the table data from the table rows. The class worked perfectly on one page I tested it with, but it struggles on another, even though its other methods work fine on both pages. I also cannot figure out where the IndexError comes from.
I have tried iterating over the rows, over the tables, and over the soup object itself to pull the data out of the first column. Depending on the approach, I get one of the following results:

- `'NoneType' object has no attribute 'find_all'`
- `'str' object has no attribute 'find_all'` (when I iterate through the rows)
- an empty list
- only the first name
- `IndexError: list index out of range`
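As far as I can tell, the `NoneType` message appears when `find()` matches nothing and I then call `find_all()` on its result, and the IndexError when I index into an empty `find_all()` result. A minimal illustration of both (the row markup here is made up, not copied from the failing page):

    from bs4 import BeautifulSoup

    # a row whose first cell is a <th> instead of a <td> (invented markup)
    row = BeautifulSoup('<table><tr><th>India</th></tr></table>', 'lxml').tr

    print(row.find_all('td'))   # [] -> row.find_all('td')[0] raises IndexError
    print(row.find('td'))       # None -> calling .find_all() on it raises the NoneType error

Here is the full class: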
import requests 
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import re
import lxml
import wikipedia
import json
import html5lib
class wiki2json():
    def __init__(self, url):
        self.url = url
    def urlretrieve(url):
        web = requests.get(url)
        text = web.text
        soup = BeautifulSoup(text, 'lxml')
        return soup
    def get_title(url):
        title = wiki2json.urlretrieve(url).find('h1', {'class': 'firstHeading'}).get_text()
        return title
    def get_content(url):
        page = wikipedia.page(wiki2json.get_title(url))
        content = page.content
        return content
    def get_links(url):
        links = []
        for link in wiki2json.urlretrieve(url).find_all('a', {'href' : re.compile('^https://|^http://')}):
            links.append(link)
        return links
    def get_tables(url):
        tables = wiki2json.urlretrieve(url).find_all('table', {'class' : ['sortable', 'plainrowheaders']})
        return tables
    def get_tablelinks(url):
        anchor_table = []
        for table in wiki2json.get_tables(url):
            tablelink = wiki2json.urlretrieve(url).find_all('a')
            anchor_table.append(tablelink)
        return anchor_table
    def get_tableheads(url):
        for table in wiki2json.get_tables(url):
            theads = table.find_all('th')
            heads = [thead.text.strip() for thead in theads]
        return heads
    def get_tablerows(url):
        # hard-coded index: assumes the page has at least three <table> elements,
        # otherwise find_all('table')[2] raises IndexError
        all_tables2 = wiki2json.urlretrieve(url).find_all('table')[2]
        rows = all_tables2.find_all('tr')
        return rows[1:]
# wiki2json.get_tablerows('https://en.wikipedia.org/wiki/List_of_Presidents_of_India')    
    def get_tablenames(url):
        first_column = []
        names = []
        for row in wiki2json.get_tablerows(url):
            # find_all('td')[0] raises IndexError on rows that have no <td> cells
            first_column.append(row.find_all('td')[0])
            names.append([first.text.strip('\n') for first in first_column])
        return names
# wiki2json.get_tablenames('https://en.wikipedia.org/wiki/List_of_Presidents_of_India')
wiki2json.get_tablenames('https://simple.wikipedia.org/wiki/List_of_countries')
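For context, this is roughly the behaviour I am trying to get out of get_tablenames, written as a standalone sketch rather than a real fix; the `wikitable` class selector, the `get_first_column_names` name and the th-or-td handling are my own guesses about what the pages need, not something I have verified on every page:

    import requests
    from bs4 import BeautifulSoup

    def get_first_column_names(url):
        """Return the text of the first cell of every row, skipping rows without cells."""
        soup = BeautifulSoup(requests.get(url).text, 'lxml')
        names = []
        for table in soup.find_all('table', {'class': 'wikitable'}):   # assumed class
            for row in table.find_all('tr'):
                cell = row.find(['th', 'td'])   # first cell, header or data
                if cell is None:                # skip rows with no cells at all
                    continue
                text = cell.get_text(strip=True)
                if text:
                    names.append(text)
        return names

    print(get_first_column_names('https://simple.wikipedia.org/wiki/List_of_countries'))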