Why can Scrapy crawl content from Dianping's city home page, but gets nothing when crawling by area?

As shown in the figures below, when the page is the food section for the whole city, for example the Xi'an food URL "http://www.dianping.com/xian/ch10", the data is crawled normally (figure 1). But as soon as an area filter is added to the URL, nothing comes back.

[figure 2: the same crawl against the area-filtered URL "http://www.dianping.com/xian/..." returns no data]

Could someone take a look at what went wrong? Thank you very much.

I tried fetching the area URL with scrapy shell, but what view(response) opened was an empty document, not the web page.
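Roughly, the shell checks were the following (a sketch; the point is to confirm the body really is empty rather than just mis-parsed):

scrapy shell "http://www.dianping.com/xian/ch10/r8915/"
>>> response.status    # anything other than 200 points at blocking rather than parsing
>>> len(response.text) # a near-zero length matches the empty page from view(response)
>>> response.xpath('//div[@class="tit"]').extract_first()  # None means the listing markup never arrived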

Here is my code:
dianping.py

import scrapy

from ..items import DianpingItem


class DianpingSpider(scrapy.Spider):
    name = "dianping"
    start_urls = ["http://www.dianping.com/xian/ch10/r8915/"]

    def parse(self, response):
        title_list = response.xpath('//div[@class="tit"]/a[1]/h4/text()').extract()
        level_list = response.xpath('//div[@class="comment"]/span[1]/@title').extract()
        comment_list = response.xpath('//div[@class="comment"]/a[1]/b/text()').extract()
        price_list = response.xpath('//div[@class="comment"]/a[2]/b/text()').extract()
        kouwei_list = response.xpath('//span[@class="comment-list"]/span[1]/b/text()').extract()
        huanjing_list = response.xpath('//span[@class="comment-list"]/span[2]/b/text()').extract()
        # span[3], not span[2]: the original line repeated the huanjing XPath
        fuwu_list = response.xpath('//span[@class="comment-list"]/span[3]/b/text()').extract()
        caixi_list = response.xpath('//div[@class="tag-addr"]/a[1]/span/text()').extract()
        area_list = response.xpath('//div[@class="tag-addr"]/a[2]/span/text()').extract()
        address_list = response.xpath('//div[@class="tag-addr"]//span[@class="addr"]/text()').extract()
        recommend_list = response.xpath('//div[@class="tit"]/a[1]/h4/text()').extract()
        # recommend_list2 = recommend_list1[0].xpath("string(.)").extract()
        # recommend_list = [item.replace(" ", "").replace("\n", "|") for item in recommend_list2]
        rows = zip(title_list, level_list, comment_list, price_list, kouwei_list,
                   huanjing_list, fuwu_list, caixi_list, area_list, address_list,
                   recommend_list)
        for i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11 in rows:
            # Create a fresh item per shop; reusing one instance across
            # yields makes every queued item point at the same data.
            Dianping = DianpingItem()
            Dianping["title"] = i1
            Dianping["level"] = i2
            Dianping["comment"] = i3
            Dianping["price"] = i4
            Dianping["kouwei"] = i5
            Dianping["huanjing"] = i6
            Dianping["fuwu"] = i7
            Dianping["caixi"] = i8
            Dianping["area"] = i9
            Dianping["address"] = i10
            Dianping["recommend"] = i11
            yield Dianping
        next_pages = response.xpath('//div[@class="page"]/a[@class="next"]/@href').extract()
        if next_pages:
            # urljoin handles relative hrefs such as "/xian/ch10/r8915p2"
            yield scrapy.Request(response.urljoin(next_pages[0]), callback=self.parse)
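As a side note, zipping eleven parallel lists misaligns every later field as soon as one value is missing for a shop. A per-shop variant of parse avoids that; the sketch below assumes Dianping's list items sit under //div[@id="shop-all-list"]//li, which is my guess at the markup, not something verified here. It drops in place of the parse method above, with the same imports and item class.

def parse(self, response):
    # One selector context per shop, so a missing field becomes None for
    # that shop instead of shifting every later column of the zip.
    for shop in response.xpath('//div[@id="shop-all-list"]//li'):
        item = DianpingItem()
        item["title"] = shop.xpath('.//div[@class="tit"]/a[1]/h4/text()').extract_first()
        item["level"] = shop.xpath('.//div[@class="comment"]/span[1]/@title').extract_first()
        item["comment"] = shop.xpath('.//div[@class="comment"]/a[1]/b/text()').extract_first()
        item["price"] = shop.xpath('.//div[@class="comment"]/a[2]/b/text()').extract_first()
        item["kouwei"] = shop.xpath('.//span[@class="comment-list"]/span[1]/b/text()').extract_first()
        item["huanjing"] = shop.xpath('.//span[@class="comment-list"]/span[2]/b/text()').extract_first()
        item["fuwu"] = shop.xpath('.//span[@class="comment-list"]/span[3]/b/text()').extract_first()
        item["caixi"] = shop.xpath('.//div[@class="tag-addr"]/a[1]/span/text()').extract_first()
        item["area"] = shop.xpath('.//div[@class="tag-addr"]/a[2]/span/text()').extract_first()
        item["address"] = shop.xpath('.//span[@class="addr"]/text()').extract_first()
        item["recommend"] = shop.xpath('.//div[@class="tit"]/a[1]/h4/text()').extract_first()
        yield item
    next_page = response.xpath('//div[@class="page"]/a[@class="next"]/@href').extract_first()
    if next_page:
        yield scrapy.Request(response.urljoin(next_page), callback=self.parse)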

settings.py

# -*- coding: utf-8 -*-

# Scrapy settings for dianping project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = "dianping"

SPIDER_MODULES = ["dianping.spiders"]
NEWSPIDER_MODULE = "dianping.spiders"
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36"
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = "dianping (+http://www.yourdomain.com)"

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 1

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
COOKIES_ENABLED = True

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
    "Accept-Encoding": "gzip, deflate"
}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    "dianping.middlewares.DianpingSpiderMiddleware": 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    "dianping.middlewares.DianpingDownloaderMiddleware": 543,
#}
#DOWNLOADER_MIDDLEWARES = {
#    "scrapy.contrib.downloadermiddleware.httpproxy.HttpProxyMiddleware": 543,
#    "dianping.middlewares.ProxyMiddleWare": 125,
#    "dianping.middlewares.DianpingDownloaderMiddleware": 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    "scrapy.extensions.telnet.TelnetConsole": None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    "dianping.pipelines.DianpingPipeline": 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = "httpcache"
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"

Set ROBOTSTXT_OBEY = False and try again.
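For context (my reading, worth checking against the crawl log): with ROBOTSTXT_OBEY = True, Scrapy downloads http://www.dianping.com/robots.txt before anything else and silently drops every request the rules disallow, logging "Forbidden by robots.txt". If Dianping's robots.txt disallows the area paths such as /xian/ch10/r8915/, the spider never gets a response at all, which matches the empty page from view(response).

# settings.py
ROBOTSTXT_OBEY = False  # stop filtering requests through the site's robots.txt rules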
