Scrapy crawled the picture and reported an error (IOError: cannot identify image file).

recently learn crawler, use selenium to crawl the home page picture of the website, test to get the url, of the picture, but always report the following error, can not find the reason, hope the hero to help!

the code is as follows:
item.py

import scrapy

class JiandanItem(scrapy.Item):
    -sharp define the fields for your item here like:
    -sharp name = scrapy.Field()
    image_urls = scrapy.Field()-sharp
    images = scrapy.Field()

pipeline.py

import os
import urllib
import scrapy
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline
from PIL import Image
from jiandan import settings

class JiandanPipeline(ImagesPipeline)

    def get_media_requests(self, item, info):
        for image_url in item["image_urls"]:
            print(image_url)
            yield scrapy.Request(image_url)


    def item_completed(self, results, item, info):
        image_paths = [x["path"] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        return item

middlewares.py

import scrapy
from selenium import webdriver
from scrapy.http import HtmlResponse
import time

class PageMiddleware(object):
    def process_request(self, request, spider):
        if request.meta.has_key("PhantomJS"):
            return
        else:
            driver = webdriver.PhantomJS()
            driver.get(request.url)
            time.sleep(1)
            content = driver.page_source.encode("utf-8")
            -sharp print(content)
            driver.quit()

            return HtmlResponse(request.url, encoding="utf-8",body=content, request=request)

jiandanSpider.py

import scrapy
from jiandan.items import JiandanItem

from scrapy.crawler import CrawlerProcess

class jiandanSpider(scrapy.Spider):
    name = "jiandan"
    allowed_domains = ["http://www.172mn.com/"]
    start_urls = ["http://www.172mn.com/"]

    def parse(self, response):
        item = JiandanItem()
        item["image_urls"] = response.xpath("//li//img/@src").extract()
        yield item

settings.py

BOT_NAME = "jiandan"
SPIDER_MODULES = ["jiandan.spiders"]
NEWSPIDER_MODULE = "jiandan.spiders"
HTTPERROR_ALLOWED_CODES = [403]
ROBOTSTXT_OBEY = False
DEFAULT_REQUEST_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.10 Safari/537.36",
    "Accept" : "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "Accept-Language" : "zh-CN,zh;q=0.9",
}
DOWNLOADER_MIDDLEWARES = {
   "jiandan.middlewares.PageMiddleware": 543,
}
ITEM_PIPELINES = {
    "jiandan.pipelines.JiandanPipeline": 1,
}
IMAGES_STORE = "/home/python/Desktop/"
DOWNLOAD_DELAY = 3
IMAGES_THUMBS = {
    "small": (50, 50),
    "big": (200, 200),
}

error message:

https://img.codeshelper.com/upload/img/2021/03/13/e1b20hwhbgg7616.jpg
2018-05-21 22:33:58 [selenium.webdriver.remote.remote_connection] DEBUG: POST http://127.0.0.1:34265/wd/hub/session {"desiredCapabilities": {"platform": "ANY", "browserName": "phantomjs", "version": "", "javascriptEnabled": true}}
2018-05-21 22:33:58 [selenium.webdriver.remote.remote_connection] DEBUG: Finished Request
2018-05-21 22:33:58 [selenium.webdriver.remote.remote_connection] DEBUG: POST http://127.0.0.1:34265/wd/hub/session/ff87b910-5d03-11e8-8234-c5368f3096da/url {"url": "https://img.codeshelper.com/upload/img/2021/03/13/e1b20hwhbgg7616.jpg", "sessionId": "ff87b910-5d03-11e8-8234-c5368f3096da"}
2018-05-21 22:34:00 [selenium.webdriver.remote.remote_connection] DEBUG: Finished Request
2018-05-21 22:34:01 [selenium.webdriver.remote.remote_connection] DEBUG: GET http://127.0.0.1:34265/wd/hub/session/ff87b910-5d03-11e8-8234-c5368f3096da/source {"sessionId": "ff87b910-5d03-11e8-8234-c5368f3096da"}
2018-05-21 22:34:01 [selenium.webdriver.remote.remote_connection] DEBUG: Finished Request
2018-05-21 22:34:01 [selenium.webdriver.remote.remote_connection] DEBUG: DELETE http://127.0.0.1:34265/wd/hub/session/ff87b910-5d03-11e8-8234-c5368f3096da {"sessionId": "ff87b910-5d03-11e8-8234-c5368f3096da"}
2018-05-21 22:34:01 [selenium.webdriver.remote.remote_connection] DEBUG: Finished Request
2018-05-21 22:34:01 [scrapy] DEBUG: Crawled (200) <GET https://img.codeshelper.com/upload/img/2021/03/13/e1b20hwhbgg7616.jpg> (referer: None)
2018-05-21 22:34:01 [scrapy] DEBUG: File (downloaded): Downloaded file from <GET https://img.codeshelper.com/upload/img/2021/03/13/e1b20hwhbgg7616.jpg> referred in <None>
2018-05-21 22:34:01 [scrapy] ERROR: File (unknown-error): Error processing file from <GET https://img.codeshelper.com/upload/img/2021/03/13/e1b20hwhbgg7616.jpg> referred in <None>
Traceback (most recent call last):
  File "/usr/local/lib/python2.7/dist-packages/scrapy/pipelines/files.py", line 355, in media_downloaded
    checksum = self.file_downloaded(response, request, info)
  File "/usr/local/lib/python2.7/dist-packages/scrapy/pipelines/images.py", line 95, in file_downloaded
    return self.image_downloaded(response, request, info)
  File "/usr/local/lib/python2.7/dist-packages/scrapy/pipelines/images.py", line 99, in image_downloaded
    for path, image, buf in self.get_images(response, request, info):
  File "/usr/local/lib/python2.7/dist-packages/scrapy/pipelines/images.py", line 112, in get_images
    orig_image = Image.open(BytesIO(response.body))
  File "/usr/local/lib/python2.7/dist-packages/PIL/Image.py", line 2590, in open
    % (filename if filename else fp))
IOError: cannot identify image file <cStringIO.StringI object at 0x7f7c38d2fcf0>
Mar.13,2021
Menu