The data extraction in python scrapy-selenium traverses not one extraction at a time, but one at a time.

-sharp -*- coding: utf-8 -*-
import json

import scrapy
from scrapy import Request

from cosmetics.items import CosmeticsItem


class CosSpider(scrapy.Spider):
    name = "cos"
    -sharp allowed_domains = ["www.jd.com"]
    -sharp start_urls = ["https://search.jd.com/Search?keyword=%E5%8F%A3%E7%BA%A2&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&stock=1&page=1&s=54&click=0"]

    def start_requests(self):
        star_url = "https://search.jd.com/"
        yield Request(url=star_url, callback=self.parse, meta={"data": "0"})

    def parse(self, response):
        item = CosmeticsItem()
        ul_list = response.css("-sharpJ_goodsList > ul > li")
        page_next = response.css("-sharpJ_bottomPage > span.p-num > a.pn-next")
        print("ul_list is :::::::", ul_list)

        """
        for li in ul_list:
            item = CosItemLoader(item=CosItem(), response=li)
            with open("rule.json", "r") as f:
                data = json.load(f)
                for keys in data:
                    item.add_xpath(keys, data[keys])
            -sharp item.add_xpath("img", ".//div[@class="p-img"]/a/img/@href")
            -sharp item.add_xpath("price", ".//div[@class="p-price"]//i/text()")
            -sharp item.add_xpath("name", ".//div[@class="p-name p-name-type-2"]//em/text()")
            -sharp item.add_xpath("commit_counts", ".//div[@class="p-commit"]//a/text()")
            -sharp item.add_xpath("shop", ".//div[@class="p-shop"]/span/a/text()")
                    yield item.load_item()
        """
        for l in ul_list:
            -sharpal = ul.extract()
            print("ul is :::::", l.extract())
            img     = "https:" + l.xpath("//div[@class="p-img"]/a/img/@src").extract()[0]
            price   = l.xpath("//div[@class="p-price"]//i/text()").extract()
            name    = l.xpath("//div[@class="p-name p-name-type-2"]//em/text()").extract()[0].strip(" ")
            commits = l.xpath("//div[@class="p-commit"]//a/text()").extract()[0]
            shop    = l.xpath("//div[@class="p-shop"]/span/a/text()").extract()[0]
            for field in item.fields.keys():
                item[field] = eval(field)
            yield item
        if len(page_next) > 0:
            yield Request(url=response.url, callback=self.parse, dont_filter=True, meta={"data": "2"})
        else:
            print("")

first paste the code in spider, traverse the extracted li list, and then use xpath to retrieve data instead of traversing one by one, but directly take them all out at a time. What"s going on here?

Why not traverse one by one, but fetch it all at once? How to solve this situation? Ask the boss to solve it!

Dec.27,2021

/ / div [@ class= "p-price"] / / i/text ()'). Extract () is missing a dot

Menu