My goal is to collect as many profile links as possible on Khan Academy, and then scrape specific data from each of these profiles into a CSV file.
Here is my script: it gets the profile links, scrapes the specific data from each profile, and stores everything in a CSV file.
from bs4 import BeautifulSoup
from requests_html import HTMLSession
import re

session = HTMLSession()
r = session.get('https://www.khanacademy.org/computing/computer-science/algorithms/intro-to-algorithms/v/what-are-algorithms')
# render the JavaScript so the discussion section (and its profile links) is in the DOM
r.html.render(sleep=5)
soup = BeautifulSoup(r.html.html, 'html.parser')
#find the profile links
profiles = soup.find_all(href=re.compile("/profile/kaid"))
profile_list=[]
for link in profiles:
    href = link['href']
    # hrefs look like '/profile/kaid_.../discussion'; drop the trailing 'discussion'
    final_profile_link = 'https://www.khanacademy.org' + href[:-10]
    profile_list.append(final_profile_link)
#create the csv file
filename = "khanscraptry1.csv"
f = open(filename, "w")
headers = "link, date_joined, points, videos, questions, votes, answers, flags, project_request, project_replies, comments, tips_thx, last_date\n"
f.write(headers)
#for each profile link, scrape the specific data and store them into the csv
for link in profile_list:
    print("Scraping", link)
    session = HTMLSession()
    r = session.get(link)
    r.html.render(sleep=5)
    soup = BeautifulSoup(r.html.html, 'html.parser')
    user_info_table = soup.find('table', class_='user-statistics-table')
    if user_info_table is not None:
        # the statistics table has three rows: date joined, energy points, videos completed
        dates, points, videos = [tr.find_all('td')[1].text for tr in user_info_table.find_all('tr')]
    else:
        dates = points = videos = 'NA'
    user_socio_table = soup.find_all('div', class_='discussion-stat')
    data = {}
    for stat in user_socio_table:
        # each stat renders as '<number> <span>label</span>'
        category = stat.find('span')
        category_text = category.text.strip()
        number = category.previous_sibling.strip()
        data[category_text] = number
    full_data_keys = ['questions', 'votes', 'answers', 'flags raised', 'project help requests', 'project help replies', 'comments', 'tips and thanks']
    # note: 'answers' may appear as the singular 'answer' when the count is 1, which currently yields NA
    for header_value in full_data_keys:
        if header_value not in data:
            data[header_value] = 'NA'
    user_calendar = soup.find('div', class_='streak-calendar-scroll-container')
    if user_calendar is not None:
        last_activity = user_calendar.find('span', class_='streak-cell filled')
        try:
            # the date of the activity is stored in the cell's title attribute
            last_activity_date = last_activity['title']
        except TypeError:
            # no filled cell found (last_activity is None)
            last_activity_date = 'NA'
    else:
        last_activity_date = 'NA'
    row = [link, dates, points.replace(",", ""), videos, data['questions'], data['votes'], data['answers'], data['flags raised'], data['project help requests'], data['project help replies'], data['comments'], data['tips and thanks'], last_activity_date]
    f.write(",".join(row) + "\n")
f.close()
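For reference, here is a minimal, self-contained check of the number/label extraction above. The sample markup is only my assumption of what Khan Academy renders for each discussion stat:

from bs4 import BeautifulSoup
# assumed markup: the count is a bare text node followed by a <span> label
sample = BeautifulSoup('<div class="discussion-stat">12 <span>questions</span></div>', 'html.parser')
span = sample.find('div', class_='discussion-stat').find('span')
print(span.text.strip())              # questions
print(span.previous_sibling.strip())  # 12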
This first script should work fine. Now, my problem is that it only found about 40 profile links: print(len(profile_list)) returns 40.
If I could click the Show more button (on https://www.khanacademy.org/computing/computer-science/algorithms/intro-to-algorithms/v/what-are-algorithms), I would get more profile links (and thus more profiles to scrape).
This second script clicks the Show more button repeatedly until none is left:
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException

driver = webdriver.Chrome()  # watch out, change if you are not using Chrome
driver.get("https://www.khanacademy.org/computing/computer-science/algorithms/intro-to-algorithms/v/what-are-algorithms")
driver.implicitly_wait(10)

def showmore(driver):
    while True:
        try:
            driver.implicitly_wait(5)
            # the class name is the auto-generated one on the Show more button
            showmore_button = driver.find_element_by_class_name("button_1eqj1ga-o_O-shared_1t8r4tr-o_O-default_9fm203")
            showmore_button.click()
        except NoSuchElementException:
            break

showmore(driver)
This second script should also work fine.
My question is: how can I merge these two scripts? How can I make BeautifulSoup, Selenium, and Requests work together?
In other words: how can I use the second script to get the fully expanded page and then feed that page into the first script?
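Here is a minimal sketch of how I imagine the merge could look. It is untested and assumes driver.page_source reflects the DOM after all the clicks; the selectors are reused from the two scripts above:

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
import re

driver = webdriver.Chrome()
driver.get("https://www.khanacademy.org/computing/computer-science/algorithms/intro-to-algorithms/v/what-are-algorithms")
driver.implicitly_wait(10)

# click Show more until the button disappears (the loop from the second script)
while True:
    try:
        driver.find_element_by_class_name("button_1eqj1ga-o_O-shared_1t8r4tr-o_O-default_9fm203").click()
    except NoSuchElementException:
        break

# hand the fully expanded DOM to BeautifulSoup instead of requests_html
soup = BeautifulSoup(driver.page_source, 'html.parser')
driver.quit()

profiles = soup.find_all(href=re.compile("/profile/kaid"))
# ...then build profile_list and scrape each profile exactly as in the first script

Is that roughly the right way to wire them together?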