Please tell me why my crawler produces a blank file.


I want to crawl the shop names and addresses from Dianping.

import requests
from bs4 import BeautifulSoup
import sys
import random
import time

base_url = "https://www.dianping.com/sear."
deep = 5

def get_html(url):
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()  # raise an HTTPError for 4xx/5xx responses
        r.encoding = "utf-8"
        return r.text
    except requests.RequestException:
        # returning a plain string means the caller will quietly parse
        # "Something Wrong" as HTML and find nothing in it
        return "Something Wrong"

def get_content(url):
    lists_ = []
    html = get_html(url)
    soup = BeautifulSoup(html, "lxml")
    # a single-space class filter, kept from the original; if the real class
    # name was lost, this matches no <li> at all and lists_ stays empty
    info = soup.find_all("li", {"class": " "})
    for i in info:
        list_ = {}
        try:
            # {"h4": " "} filters on an attribute named h4, not an <h4> tag,
            # so find() may return None and the except below hides the error
            list_["store_name"] = i.find("a", {"h4": " "}).text.strip()
            list_["url"] = i.find("a", {"target": "_blank"})["href"]
            list_["addr"] = i.find("span", {"class": "addr"}).text.strip()
            lists_.append(list_)
        except (AttributeError, TypeError):
            print("Something Wrong")
    return lists_
    

def outfill(lists_):
    with open("JBS_Store.txt", "a+", encoding="utf-8") as f:
        for list_ in lists_:
            f.write("Name:{}\t Addr:{}\t URL:{}\n".format(
                list_["store_name"], list_["addr"], list_["url"]))
    print("Saved {} records".format(len(lists_)))
    

def main(base_url, deep):
    url_list = []
    for i in range(0, deep):
        url_list.append(base_url + "/p" + str(i))
    print("Built {} page URLs".format(len(url_list)))

    for url in url_list:
        content = get_content(url)
        outfill(content)
    print("Done")

if __name__ == "__main__":
    main(base_url, deep)

Nov. 14, 2021

The main problem is still the HEADERS part of the HTTP request: Dianping checks the Cookie as an anti-crawling measure, so a crawler that sends no Cookie is dropped outright.

And even with a Cookie added to the code above, it is still difficult to get it running.
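A minimal sketch of that fix, assuming you copy a valid Cookie from your own browser session on dianping.com. The header values below are placeholders, not working credentials:

import requests

# Placeholder headers: the Cookie string is hypothetical; copy a real one
# from your browser's developer tools after visiting dianping.com
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/95.0 Safari/537.36",
    "Cookie": "PASTE_YOUR_BROWSER_COOKIE_HERE",
}

def get_html(url):
    try:
        # pass the headers on every request so the session looks like a browser
        r = requests.get(url, headers=HEADERS, timeout=30)
        r.raise_for_status()
        r.encoding = "utf-8"
        return r.text
    except requests.RequestException as e:
        print("Request failed:", e)
        return ""

If the output file is still empty after this, printing r.status_code and the first few hundred characters of r.text will show whether Dianping returned real search results or its verification page.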

  
