How can I get the UPC of a product from Best Buy using Scrapy?

Question

hi there

I need to scrape Best Buy. I am currently using Scrapy and was able to get most of the data I need; however, I ran into problems trying to get the specifications section, which is where the UPC is. I was able to get the features, but I am not able to grab the data from that part.

I would really appreciate your help. This is my code:

from scrapy import Spider
from bestbuy_spider.items import BestbuyProductItem
from scrapy import Request
import re
import json


class Bestbuy2Spider(Spider):
    """Crawl a Best Buy search-results listing and yield one item per product.

    Flow: ``parse`` reads the result counter on the start page and schedules
    the numbered result pages; ``parse_product_list`` follows every product
    link found on a result page; ``parse_product`` builds a
    ``BestbuyProductItem`` from the product page.
    """

    name = 'bestbuy2'
    # Scrapy expects bare domain names here, not URLs
    # (so not 'https://www.bestbuy.com').
    allowed_domains = ['bestbuy.com']
    # Alternative start URLs kept for reference:
    # https://www.bestbuy.com/site/searchpage.jsp?cp=1&searchType=search&browsedCategory=pcmcat209400050001&ks=960&sp=-bestsellingsort%20skuidsaas&sc=Global&list=y&usc=All%20Categories&type=page&id=pcat17071&iht=n&nrp=15&seeAll=&st=categoryid%24pcmcat209400050001&qp=carrier_facet%3DCarrier~Verizon
    # https://www.bestbuy.com/site/laptop-computers/all-laptops/pcmcat138500050001.c?id=pcmcat138500050001
    start_urls = ['https://www.bestbuy.com/site/searchpage.jsp?id=pcat17071&qp=storepickupstores_facet%3DStore%20Availability%20-%20In%20Store%20Pickup~237&st=%2A']

    # Numbered result page; ``{}`` is replaced by the 1-based page number.
    _page_url_template = 'https://www.bestbuy.com/site/searchpage.jsp?cp={}&id=pcat17071&qp=storepickupstores_facet%3DStore%20Availability%20-%20In%20Store%20Pickup~237&st=%2A'

    # Maximum number of result pages to crawl. The default of 1 keeps the
    # crawl limited to the first page; set to None to crawl every page.
    max_pages = 1

    def parse(self, response):
        """Read the result counter and request the numbered result pages.

        The counter text is expected to contain exactly three integers; the
        second is used as the page size and the third as the total number of
        results (presumably "first-last of total" -- confirm against the
        live page).

        Yields:
            Request: one per result page, handled by ``parse_product_list``.
        """
        text = response.xpath('//div[@class="left-side"]/span/text()').extract_first()
        if not text:
            self.logger.warning('Result counter not found on %s', response.url)
            return

        # Drop thousands separators first: with them, r'\d+' would split one
        # number into two matches and break the three-number expectation.
        numbers = [int(n) for n in re.findall(r'\d+', text.replace(',', ''))]
        if len(numbers) != 3 or numbers[1] <= 0:
            self.logger.warning(
                'Unexpected result counter %r on %s', text, response.url)
            return
        _, items_page, total = numbers

        # Ceiling division so a final, partially filled page is included.
        num_pages = -(-total // items_page)

        page_numbers = range(1, num_pages + 1)
        if self.max_pages is not None:
            # int() because spider arguments given on the command line
            # arrive as strings.
            page_numbers = page_numbers[:int(self.max_pages)]

        for page in page_numbers:
            # product list page
            yield Request(url=self._page_url_template.format(page),
                          callback=self.parse_product_list)

    def parse_product_list(self, response):
        """Follow the product link of every entry on a result page.

        Yields:
            Request: one per product link, handled by ``parse_product``.
        """
        for row in response.xpath('//ol[@class="sku-item-list"]/li'):
            href = row.xpath('.//div[@class="sku-title"]/h4/a/@href').extract_first()
            if not href:
                # List entries without a title link cannot be followed.
                continue
            # urljoin resolves relative hrefs against the page URL and
            # leaves absolute ones untouched.
            product_url = response.urljoin(href)
            self.logger.debug('Product URL: %s', product_url)
            yield Request(url=product_url, callback=self.parse_product)

    def parse_product(self, response):
        """Build a ``BestbuyProductItem`` from a product page.

        Only ``product`` and ``Location`` are populated; the selectors for
        the remaining fields are kept commented out below.

        Yields:
            BestbuyProductItem: the scraped product.
        """
        item = BestbuyProductItem(
            product=response.xpath('//div[@class="sku-title"]/h1/text()').extract_first(),
            #color = response.xpath('li[@class="image selected"]/div/a/@title').extract_first(),
            #skuId = response.xpath('//div[@class="sku product-data"]/span[2]/text()').extract_first(),
            #price = response.xpath('//div[@class="priceView-hero-price priceView-customer-price"]/span[1]/text()').extract_first(),
            #model = response.xpath('//div[@class="model product-data"]/span[2]/text()').extract_first(),
            #main_image = response.xpath('//img[@class="primary-image"]/@src').extract_first(),
            #images = response.xpath('//*[@class="thumbnail-list"]//img/@src').extract(),
            #description = response.xpath('//div[@class="long-description-container body-copy "]//div/text()').extract(),
            #features = response.xpath('//div[@class="list-row"]/p/text()').extract(),
            #regular_price = response.xpath('//div[@class="pricing-price__regular-price"]/text()').extract_first(),
            Location=response.xpath('//div[@class="fulfillment-fulfillment-summary"]//div/p[1]/span/text()').extract(),
        )
        yield item

score 0 · Accepted Answer · answered Apr 24 '22 at 12:10

Looking at the source of one product page (https://www.bestbuy.com/site/sony-65-class-bravia-xr-x95j-4k-uhd-smart-google-tv/6459306.p?skuId=6459306), I noticed there is a JSON object with a gtin13 field (the UPC code you're looking for). It should be easy to parse it with the json module and get what you need.

{
 "@context":"http://schema.org/",
 "@type":"Product",
 "name":"Sony - 65\" class BRAVIA XR X95J 4K UHD Smart Google TV",
 "image":"https://pisces.bbystatic.com/image2/BestBuy_US/images/products/6459/6459306_sd.jpg",
 "url":"https://www.bestbuy.com/site/sony-65-class-bravia-xr-x95j-4k-uhd-smart-google-tv/6459306.p?skuId=6459306",
 "description":"Shop Sony 65\" class BRAVIA XR X95J 4K UHD Smart Google TV at Best Buy. Find low everyday prices and buy online for delivery or in-store pick-up. Price Match Guarantee.",
 "sku":"6459306",
 "gtin13":"0027242921818",
 "model":"XR65X95J",
 "width":{
 "@type":"http://schema.org/QuantitativeValue",
  "unitCode":"INH",
  "value":"56.87"
 },
 "color":"Black",
 "brand":{
  "@type":"Brand",
  "name":"Sony"
 },
 "aggregateRating":{
  "@type":"AggregateRating",
  "ratingValue":"4.7",
  "reviewCount":"221"
 },
 "offers":{
   "@type":"AggregateOffer",
   "priceCurrency":"USD",
   "seller":{
     "@type":"Organization",
     "name":"Best Buy"
  },
  "lowPrice":"1184.99",
  "highPrice":"1499.99",
  "offercount":5,
  "offers":[
     {
        "@type":"Offer",
        "priceCurrency":"USD",
        "price":"1499.99",
        "availability":"http://schema.org/InStock",
        "itemCondition":"http://schema.org/NewCondition",
        "description":"New"
     },
     {
        "@type":"Offer",
        "priceCurrency":"USD",
        "price":"1319.99",
        "itemCondition":"http://schema.org/UsedCondition",
        "description":"Open-Box Excellent - Certified"
     },
     {
        "@type":"Offer",
        "priceCurrency":"USD",
        "price":"1274.99",
        "itemCondition":"http://schema.org/UsedCondition",
        "description":"Open-Box Excellent"
     },
     {
        "@type":"Offer",
        "priceCurrency":"USD",
        "price":"1229.99",
        "itemCondition":"http://schema.org/UsedCondition",
        "description":"Open-Box Satisfactory"
     },
     {
        "@type":"Offer",
        "priceCurrency":"USD",
        "price":"1184.99",
        "itemCondition":"http://schema.org/UsedCondition",
        "description":"Open-Box Fair"
     }
  ]
  }
 }

I was able to get the code accessing the data; however, the problem is that not all products have the data at the same XPath location. In some of them it changes, and I am getting errors because of that. What would be a better approach? Currently this is what I am using to get the info: data = json.loads(response.xpath('//html/body/div[3]/main/div[2]/div/div[1]/div[7]/div/div[7]/script/text()').extract_first()) — Jonathan JC, Apr 27 '22 at 17:24
What I suggest is to keep the XPath as simple as it can be: if the "upc" field is always present in the JSON of the products (and only there), you can use the following XPath to be sure you're extracting the right JSON: response.xpath('//script[contains(text(), "upc")]/text()').extract()[0] — Pierluigi Vinciguerra, Apr 29 '22 at 17:19

How can i get the UPC of a product from bestbuy using scrapy

1 Answers1