So I tried to apply the examples in this link but my output is strange:
My item:
class Artiste(scrapy.Item):
url = scrapy.Field()
nom = scrapy.Field()
styles = scrapy.Field()
My Scrapy class:
class AnnuSpider(scrapy.Spider):
name = "annu"
start_urls = [
'https://www.livetonight.fr/groupe-musique-dj',
]
def parse(self, response):
doc = Artiste()
for artiste in response.css('.card-musician'):
details_partial_link = artiste.css('a::attr(href)').get()
doc['nom'] = artiste.css('.card-musician-title-wrapper').xpath('normalize-space(./h4/text())').get()
doc['url'] = details_partial_link
details_link = response.urljoin(details_partial_link)
request = scrapy.Request(details_link, callback=self.parse_details)
request.meta['item'] = doc
print "NOM", doc['nom']
yield request
def parse_details(self, response):
doc = response.meta['item']
doc['styles'] = response.css('.show-overview-info').xpath('normalize-space(./p/text())')[0].get()
return doc
So, instead of giving me 21 lines with each having their own nom,url,stylesI get 21 lines with the same (which is the last of the list) nomand urland the right styles.
Here is the full output:
[
{"url": "/groupe-musique-dj/4123-remi-dugue-trio", "nom": "R\u00e9mi Dugu\u00e9 Trio", "styles": "Jazz / Folk / Rock"},
{"url": "/groupe-musique-dj/4123-remi-dugue-trio", "nom": "R\u00e9mi Dugu\u00e9 Trio", "styles": "Rock / Pop / Folk"},
{"url": "/groupe-musique-dj/4123-remi-dugue-trio", "nom": "R\u00e9mi Dugu\u00e9 Trio", "styles": "Soul / Pop / Funk"},
{"url": "/groupe-musique-dj/4123-remi-dugue-trio", "nom": "R\u00e9mi Dugu\u00e9 Trio", "styles": "Soul / Pop / Funk"},
{"url": "/groupe-musique-dj/4123-remi-dugue-trio", "nom": "R\u00e9mi Dugu\u00e9 Trio", "styles": "Jazz / Pop"},
{"url": "/groupe-musique-dj/4123-remi-dugue-trio", "nom": "R\u00e9mi Dugu\u00e9 Trio", "styles": "Pop / Rock / Funk"},
{"url": "/groupe-musique-dj/4123-remi-dugue-trio", "nom": "R\u00e9mi Dugu\u00e9 Trio", "styles": "Pop / Rock / Jazz"},
{"url": "/groupe-musique-dj/4123-remi-dugue-trio", "nom": "R\u00e9mi Dugu\u00e9 Trio", "styles": "Rock / Pop / Funk"},
{"url": "/groupe-musique-dj/4123-remi-dugue-trio", "nom": "R\u00e9mi Dugu\u00e9 Trio", "styles": "Rock / Pop / Funk"},
{"url": "/groupe-musique-dj/4123-remi-dugue-trio", "nom": "R\u00e9mi Dugu\u00e9 Trio", "styles": "Jazz"},
{"url": "/groupe-musique-dj/4123-remi-dugue-trio", "nom": "R\u00e9mi Dugu\u00e9 Trio", "styles": "Rock / Blues / Soul"},
{"url": "/groupe-musique-dj/4123-remi-dugue-trio", "nom": "R\u00e9mi Dugu\u00e9 Trio", "styles": "Rock / Blues / Soul"},
{"url": "/groupe-musique-dj/4123-remi-dugue-trio", "nom": "R\u00e9mi Dugu\u00e9 Trio", "styles": "Funk / Soul / Pop"},
{"url": "/groupe-musique-dj/4123-remi-dugue-trio", "nom": "R\u00e9mi Dugu\u00e9 Trio", "styles": "Pop / Folk / Soul"},
{"url": "/groupe-musique-dj/4123-remi-dugue-trio", "nom": "R\u00e9mi Dugu\u00e9 Trio", "styles": "Pop / Jazz / Funk"},
{"url": "/groupe-musique-dj/4123-remi-dugue-trio", "nom": "R\u00e9mi Dugu\u00e9 Trio", "styles": "Pop / Jazz / Funk"},
{"url": "/groupe-musique-dj/4123-remi-dugue-trio", "nom": "R\u00e9mi Dugu\u00e9 Trio", "styles": "Jazz / Swing / Musique du monde"},
{"url": "/groupe-musique-dj/4123-remi-dugue-trio", "nom": "R\u00e9mi Dugu\u00e9 Trio", "styles": "Guinguette / Swing"},
{"url": "/groupe-musique-dj/4123-remi-dugue-trio", "nom": "R\u00e9mi Dugu\u00e9 Trio", "styles": "Guinguette / Swing"},
{"url": "/groupe-musique-dj/4123-remi-dugue-trio", "nom": "R\u00e9mi Dugu\u00e9 Trio", "styles": "Jazz / Swing / Pop"},
{"url": "/groupe-musique-dj/4123-remi-dugue-trio", "nom": "R\u00e9mi Dugu\u00e9 Trio", "styles": "Pop / Funk / Dj"}
]
What's strange to me is the fact that if I take out the request, my output is perfect. Like with this code:
class AnnuSpider(scrapy.Spider):
name = "annu"
start_urls = [
'https://www.livetonight.fr/groupe-musique-dj',
]
def parse(self, response):
doc = Artiste()
for artiste in response.css('.card-musician'):
details_partial_link = artiste.css('a::attr(href)').get()
doc['nom'] = artiste.css('.card-musician-title-wrapper').xpath('normalize-space(./h4/text())').get()
doc['url'] = details_partial_link
details_link = response.urljoin(details_partial_link)
yield doc