How does scrapy download pictures and classify them?

problem description

cannot put an atlas in the same directory while downloading http://www.umei.cc/p/gaoqing/..

the environmental background of the problems and what methods you have tried

tried a lot of methods on the Internet, but could not solve

related codes

/ / Please paste the code text below (do not replace the code with pictures)

-sharpcoding:utf-8
import random
import re
import urllib2
from urllib import urlopen

import requests
import logging

import time
from bs4 import BeautifulSoup,Comment
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from z2.items import Z2Item
from scrapy.http import Request

logging.basicConfig(
    level=logging.INFO,
    format=
    "%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s",
    datefmt="%a, %d %b %Y %H:%M:%S",
    filename="cataline.log",
    filemode="w")

class Spider(CrawlSpider):
    name = "z2"
    img_urls = []
    allowed_domains = ["www.umei.cc"]
    start_urls = ["http://www.umei.cc/p/gaoqing/rihan/"]
    -sharp rules = (
    -sharp     Rule(LinkExtractor(allow=("http://www.umei.cc/p/gaoqing/rihan/\d{1,6}.htm",), deny=("http://www.umei.cc/p/gaoqing/rihan/\d{1,6}_\d{1,6}.htm")),
    -sharp          callback="parse_z2_info", follow=True),
    -sharp )

    def start_requests(self):
        yield Request(url="http://www.umei.cc/p/gaoqing/rihan/",
                      callback=self.parse_z2_key)

    def parse_z2_key(self, response):
        soup = BeautifulSoup(response.body, "lxml")
        content = soup.find("div", attrs={"class": "TypeList"})
        -sharp logging.debug(content)
        for link in content.findAll("a", attrs={"href": re.compile( r"(.*)(/rihan/)(\d{1,6})(.htm)"), "class": "TypeBigPics"}):
            logging.debug(link["href"])
            yield Request(url=link["href"],
                          callback=self.parse_z2_info)
            break

    def parse_z2_info(self, response):
        soup = BeautifulSoup(response.body, "lxml")
        item = Z2Item()
        -sharp html
        for element in soup(text=lambda text: isinstance(text, Comment)):
            element.extract()

        -sharp script
        [s.extract() for s in soup("script")]

        -sharp b
        [s.extract() for s in soup("b")]


        ArticleDesc = soup.find("p", attrs={"class": "ArticleDesc"})
        logging.debug(ArticleDesc.get_text())

        Pages = soup.find("div", attrs={"class": "NewPages"}).find("li")
        pageCounts = filter(str.isdigit, Pages.get_text().encode("gbk"))
        -sharp 
        -sharp logging.debug(re.findall(r"\d+\.?\d*", Pages.get_text())[0])

        -sharp 
        -sharp logging.debug(Pages.get_text()[1:-3])

        -sharp 
        logging.debug(filter(str.isdigit, Pages.get_text().encode("gbk")))

        -sharp img = soup.find("div", attrs={"class": "ImageBody"}).find("img")
        -sharp url = img.attrs["src"]
        -sharp self.img_urls.append(url)
        -sharp logging.debug(self.img_urls)

        item["name"] = re.match(".*/(\d+)", response.url).group(1)
        logging.debug(item["name"])

        -sharp image_urls = []
        -sharp item["image_urls"] = image_urls
        sourceUrl = response.url[0:-4]
        -sharp logging.debug(sourceUrl)
        for i in xrange(1, int(pageCounts) + 1):
            nextUrl = sourceUrl + "_" + str(i) + ".htm"
            -sharp logging.debug(nextUrl)
            yield  Request(url=nextUrl,callback=self.parse_z2_single_img)
        item["image_urls"] = self.img_urls
        yield item


    def parse_z2_single_img(self, response):
        soup = BeautifulSoup(response.body, "lxml")
        img = soup.find("div", attrs={"class": "ImageBody"}).find("img")
        url = img.attrs["src"]
        self.img_urls.append(url)






what result do you expect? What is the error message actually seen?

Mar.25,2021

final solution: extract URL as the identity folder. For the same set of pictures, the url prefix is the same. You can use the URL name as the prefix

.
Menu