To extract the image URL for both thumbnail, and original size URL you need to:
- Find all
<script> tags.
- Match and extract image URLs via
regex.
- Iterate over found matches and decode them.
Code and full example in the online IDE (it's straightforward, try to read it slowly):
import requests, lxml, re, json
from bs4 import BeautifulSoup
headers = {
"User-Agent":
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
}
params = {
"q": "pexels cat",
"tbm": "isch",
"hl": "en",
"ijn": "0",
}
html = requests.get("https://www.google.com/search", params=params, headers=headers)
soup = BeautifulSoup(html.text, 'lxml')
def get_images_data():
# this steps could be refactored to a more compact
all_script_tags = soup.select('script')
# # https://regex101.com/r/48UZhY/4
matched_images_data = ''.join(re.findall(r"AF_initDataCallback\(([^<]+)\);", str(all_script_tags)))
# https://kodlogs.com/34776/json-decoder-jsondecodeerror-expecting-property-name-enclosed-in-double-quotes
# if you try to json.loads() without json.dumps it will throw an error:
# "Expecting property name enclosed in double quotes"
matched_images_data_fix = json.dumps(matched_images_data)
matched_images_data_json = json.loads(matched_images_data_fix)
# https://regex101.com/r/pdZOnW/3
matched_google_image_data = re.findall(r'\[\"GRID_STATE0\",null,\[\[1,\[0,\".*?\",(.*),\"All\",', matched_images_data_json)
# https://regex101.com/r/NnRg27/1
matched_google_images_thumbnails = ', '.join(
re.findall(r'\[\"(https\:\/\/encrypted-tbn0\.gstatic\.com\/images\?.*?)\",\d+,\d+\]',
str(matched_google_image_data))).split(', ')
print('Google Image Thumbnails:') # in order
for fixed_google_image_thumbnail in matched_google_images_thumbnails:
# https://stackoverflow.com/a/4004439/15164646 comment by Frédéric Hamidi
google_image_thumbnail_not_fixed = bytes(fixed_google_image_thumbnail, 'ascii').decode('unicode-escape')
# after first decoding, Unicode characters are still present. After the second iteration, they were decoded.
google_image_thumbnail = bytes(google_image_thumbnail_not_fixed, 'ascii').decode('unicode-escape')
print(google_image_thumbnail)
# removing previously matched thumbnails for easier full resolution image matches.
removed_matched_google_images_thumbnails = re.sub(
r'\[\"(https\:\/\/encrypted-tbn0\.gstatic\.com\/images\?.*?)\",\d+,\d+\]', '', str(matched_google_image_data))
# https://regex101.com/r/fXjfb1/4
# https://stackoverflow.com/a/19821774/15164646
matched_google_full_resolution_images = re.findall(r"(?:'|,),\[\"(https:|http.*?)\",\d+,\d+\]",
removed_matched_google_images_thumbnails)
print('\nGoogle Full Resolution Images:') # in order
for fixed_full_res_image in matched_google_full_resolution_images:
# https://stackoverflow.com/a/4004439/15164646 comment by Frédéric Hamidi
original_size_img_not_fixed = bytes(fixed_full_res_image, 'ascii').decode('unicode-escape')
original_size_img = bytes(original_size_img_not_fixed, 'ascii').decode('unicode-escape')
print(original_size_img)
get_images_data()
--------------
'''
Google Image Thumbnails:
https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSb48h3zks_bf6y7HnZGyGPn3s2TAHKKm_7kzxufi5nzbouJcQderHqoEoOZ4SpOuPDjfw&usqp=CAU
...
Google Full Resolution Images:
https://images.pexels.com/photos/1170986/pexels-photo-1170986.jpeg?auto=compress&cs=tinysrgb&dpr=1&w=500
...
'''
Alternatively, you can do it as well by using Google Images API from SerpApi. It's a paid API with a free plan.
Essentially, one of the main differences is that you don't need to dig into the source code of the page and extract something via regex, all that needs to be done is to iterate over structured JSON string and get the data you want. Check out the playground.
Code to integrate:
import os, json # json for pretty output
from serpapi import GoogleSearch
def get_google_images():
params = {
"api_key": os.getenv("API_KEY"),
"engine": "google",
"q": "pexels cat",
"tbm": "isch"
}
search = GoogleSearch(params)
results = search.get_dict()
print(json.dumps(results['images_results'], indent=2, ensure_ascii=False))
get_google_images()
----------
'''
...
{
"position": 60, # img number
"thumbnail": "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRt-tXSZMBNLLX8MhavbBNkKmjJ7wNXxtdr5Q&usqp=CAU",
"source": "pexels.com",
"title": "1,000+ Best Cats Videos · 100% Free Download · Pexels Stock Videos",
"link": "https://www.pexels.com/search/videos/cats/",
"original": "https://images.pexels.com/videos/855282/free-video-855282.jpg?auto=compress&cs=tinysrgb&dpr=1&w=500",
"is_product": false
}
...
'''
P.S - I wrote a more in-depth blog post with GIFs and screenshots about how to scrape Google Images.
Disclaimer, I work for SerpApi.