I'm currently web-scraping about 10,000 products from a retail website (e.g., Amazon) every day to keep track of price history.
Scraping consists of two parts. First, I collect the "listings" of products, where I get basic information such as the product name, price, ID, and URL of each listing, and save that to a pandas DataFrame. Second, using each product's URL, I collect more detailed information about the product and attach it one by one to the other columns of the DataFrame. I have no issue with the first part (it takes under 10 minutes), but the second part usually takes more than 15 hours to complete.
Below is sample code. It is not the real thing, but the actual code is just a longer version of it.
import re
import time

import requests
import pandas as pd
from user_agent import generate_user_agent

df_dic = {
    "product_name": ['product1', 'product2', 'product3', 'product4', 'product5'],
    "product_price": ['500', '800', '300', '700', '1000'],
    "product_id": ['1000', '1001', '1002', '1003', '1004'],
    "product_url": ['url1', 'url2', 'url3', 'url4', 'url5'],
}
# df is the data scraped in the first part
df = pd.DataFrame(df_dic)

# Placeholder columns for the detailed characteristics
for i in range(1, 11):
    df[f'product_character{i}'] = ""

# Below is the second part, where detailed product characteristics
# (more than 50 in the real code) are attached to the DataFrame
for i_url in df['product_url']:
    try:
        mask = df['product_url'] == i_url
        product_id = df.loc[mask, 'product_id'].iloc[0]
        params = {'productSeq': product_id}
        headers = {'User-Agent': generate_user_agent(device_type='smartphone', navigator='chrome')}

        baseline_url = 'https://www.something.com'
        html = requests.post(baseline_url, headers=headers, params=params).text
        time.sleep(0.3)
        for i in range(1, 6):
            df.loc[mask, f'product_character{i}'] = re.findall(rf'var product_character{i} = "(.+?)";', html)[0]

        baseline_url_2 = 'https://www.something_2.com'
        html = requests.post(baseline_url_2, headers=headers, params=params).text
        time.sleep(0.3)
        for i in range(6, 11):
            df.loc[mask, f'product_character{i}'] = re.findall(rf'var product_character{i} = "(.+?)";', html)[0]
    except Exception as e:
        print(f"Some error happened at {i_url}")
        print(e)
        continue

filename = 'site_date'
df.to_pickle(f'{filename}.pkl')
I used try/except because some products get sold out while I'm scraping, or do not show some of the characteristics, which causes errors. I'm only using requests.get and requests.post, not Selenium. My question is: how can I use multiprocessing (or multithreading) in Python to make my code run faster? Each product needs two sequential requests plus 0.6 seconds of sleep, so the time adds up quickly with 10,000 products. I read that ThreadPoolExecutor from the concurrent.futures library can help, but I have no idea how to implement it in my case.
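For context, the general pattern from the concurrent.futures documentation looks roughly like this (a minimal sketch only; fetch_one, the URL list, and the worker count are placeholders, not my real code):

import concurrent.futures
import requests

def fetch_one(url):
    # Placeholder worker: download one page and return its text
    return requests.get(url, timeout=10).text

urls = ['url1', 'url2', 'url3']
with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
    # Map each future back to its URL so errors can be reported per page
    future_to_url = {executor.submit(fetch_one, u): u for u in urls}
    for future in concurrent.futures.as_completed(future_to_url):
        url = future_to_url[future]
        try:
            page = future.result()
            print(url, len(page))
        except Exception as exc:
            print(f'{url} failed: {exc}')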
Any help, different approaches, or comments would be greatly appreciated. Thank you for your time and consideration in reading this.
Updated: 2022-07-02
Thanks to @GohKohHan's comment, I was able to apply multithreading to my code, and it now takes almost 1/max_workers of the time the previous code took. I know my code is not perfect, and a lot of people here are far better at Python than I am, so I hope you will point out any mistakes or things that need to be improved in the following code.
import re
import time
import concurrent.futures

import requests
import pandas as pd
from user_agent import generate_user_agent

start_time = time.time()

# df is the DataFrame produced in the first part
urls = df['product_url']

def return_dataframe(i_url):
    try:
        mask = df['product_url'] == i_url
        product_id = df.loc[mask, 'product_id'].iloc[0]
        params = {'productSeq': product_id}
        headers = {'User-Agent': generate_user_agent(device_type='smartphone', navigator='chrome')}

        row = {'unique': i_url}

        baseline_url = 'https://www.something.com'
        html = requests.post(baseline_url, headers=headers, params=params).text
        time.sleep(0.3)
        for i in range(1, 6):
            row[f'product_character{i}'] = re.findall(rf'var product_character{i} = "(.+?)";', html)[0]

        baseline_url_2 = 'https://www.something_2.com'
        html = requests.post(baseline_url_2, headers=headers, params=params).text
        time.sleep(0.3)
        for i in range(6, 11):
            row[f'product_character{i}'] = re.findall(rf'var product_character{i} = "(.+?)";', html)[0]

        # Return a one-row DataFrame for this product
        return pd.DataFrame([row])
    except Exception as e:
        print(f"Some error happened at {i_url}")
        print(e)
        return None  # pd.concat silently drops None entries

detailed_agg = []
with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
    future_to_url = {executor.submit(return_dataframe, i_url): i_url for i_url in urls}
    for future in concurrent.futures.as_completed(future_to_url):
        try:
            detailed_agg.append(future.result())
        except Exception as exc:
            print(f'{future_to_url[future]} generated an exception: {exc}')

df_detailed = pd.concat(detailed_agg)
df_agg = pd.merge(df, df_detailed, how='left', left_on='product_url', right_on='unique')

# How long did it take?
print("--- %.1f minutes ---" % ((time.time() - start_time) / 60))
Any suggestions would be greatly appreciated!