I'm using Python Requests to scrape data from LinkedIn.com, but I ran into an issue with CSRF authentication.
LinkedIn.com puts its CSRF-related values in the HTML page, where they are easy to find, so I followed this Stack Overflow question and wrote this script:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import time
import json
# Log in to LinkedIn through a real Chrome session and save the resulting
# browser cookies to cookies.txt as JSON.
driver = webdriver.Chrome()
try:
    wait = WebDriverWait(driver, 20)
    url = 'https://www.linkedin.com/login/zh-cn?fromSignIn=true&trk=guest_homepage-basic_nav-header-signin'
    driver.get(url)

    username = wait.until(EC.presence_of_element_located((By.ID, 'username')))
    password = wait.until(EC.presence_of_element_located((By.ID, 'password')))
    submit = wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="app__container"]/main/div[2]/form/div[3]/button')))

    # find_element(By.XPATH, ...) replaces find_element_by_xpath, which was
    # removed in Selenium 4.
    loginCsrfParam = driver.find_element(By.XPATH, '//*[@id="app__container"]/main/div[2]/form/input[9]')
    print(loginCsrfParam.get_attribute('value'))

    username.send_keys('username@xxx.com')
    password.send_keys('password')
    time.sleep(1)

    # Remember where we are so we can tell when the login navigation happened.
    login_page_url = driver.current_url
    submit.click()

    # Reading the cookies right after click() can capture only the pre-login
    # cookies; wait until the browser has left the login page first. Raises
    # TimeoutException (and saves nothing) if the page never changes.
    wait.until(EC.url_changes(login_page_url))

    cookies = driver.get_cookies()
    with open("cookies.txt", "w") as fp:
        json.dump(cookies, fp)
finally:
    # Always close the browser, even when a wait above times out.
    driver.quit()
import requests
from requests.cookies import RequestsCookieJar
import json
from lxml import etree
# Reuse the cookies saved in cookies.txt in a requests Session, read the hidden
# CSRF fields from the login page, submit the login form and print the feed page.
LINKEDIN_USERNAME = 'username@xxx.com'
LINKEDIN_PASSWORD = 'password'
HOMEPAGE_URL = 'https://www.linkedin.com/login/zh-cn?fromSignIn=true&trk=guest_homepage-basic_nav-header-signin'
LOGIN_URL = 'https://www.linkedin.com/checkpoint/lg/login-submit'

jar = RequestsCookieJar()
with open("cookies.txt", "r") as fp:
    cookies = json.load(fp)
for cookie in cookies:
    # Keep each cookie's domain and path: without them, cookies that share a
    # name but belong to different domains overwrite each other in the jar.
    jar.set(
        cookie['name'],
        cookie['value'],
        domain=cookie.get('domain', ''),
        path=cookie.get('path', '/'),
    )

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_0_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.67 Safari/537.36',
    'Referer': 'https://www.linkedin.com/login/zh-cn?fromSignIn=true&trk=guest_homepage-basic_nav-header-signin',
    'Connection': 'keep-alive',
}

client = requests.Session()
# update() extends the Session's default headers; assigning a plain dict
# would discard them (Accept, Accept-Encoding, ...).
client.headers.update(headers)
# The session jar is sent with every request, so the individual calls below
# do not need their own cookies= argument.
client.cookies = jar

r = client.get(HOMEPAGE_URL)
html = r.content
h = etree.HTML(html)
csrfToken = h.xpath('//*[@id="app__container"]/main/div[2]/form/input[1]/@value')  # csrfToken
sIdString = h.xpath('//*[@id="app__container"]/main/div[2]/form/input[3]/@value')  # sIdString
loginCsrfParam = h.xpath('//*[@id="app__container"]/main/div[2]/form/input[9]/@value')  # loginCsrfParam

# xpath() returns an empty list when the node is absent; fail with a clear
# message instead of a bare IndexError when the expected form is not there.
for field_name, values in (
    ('csrfToken', csrfToken),
    ('sIdString', sIdString),
    ('loginCsrfParam', loginCsrfParam),
):
    if not values:
        raise RuntimeError(
            'Login form field %r not found in %s (HTTP %s)'
            % (field_name, HOMEPAGE_URL, r.status_code)
        )

login_information = {
    'session_key': LINKEDIN_USERNAME,
    'session_password': LINKEDIN_PASSWORD,
    'loginCsrfParam': loginCsrfParam[0],
    'csrfToken': csrfToken[0],
    'sIdString': sIdString[0],
    'parentPageKey': 'd_checkpoint_lg_consumerLogin',
    'trk': 'guest_homepage-basic_nav-header-signin',
    'controlId': 'd_checkpoint_lg_consumerLogin-login_submit_button',
    'loginFlow': 'REMEMBER_ME_OPTIN',
}

p = client.post(LOGIN_URL, data=login_information)
req = client.get('https://www.linkedin.com/feed')
print(req.text)
However, the script didn't work. `print(req.text)` didn't return the logged-in LinkedIn page; instead, the returned HTML looks like this:
My questions are below:
Is it possible to use requests to log in to LinkedIn.com? I googled a lot and could not find an example that still works today.
If possible, what's wrong with my code?
Apart from CSRF, does LinkedIn.com use other methods to prevent web scraping?
Thanks!