There is a bug in my crawler code. Could someone help me see what the problem is?

Here is the code:

# coding=utf-8
from bs4 import BeautifulSoup
import requests
from time import sleep

# City abbreviation used by fang.com and the entry page of its new-house list
City_Name = "qd"
page = "http://newhouse.{0}.fang.com/house/s".format(City_Name)

# Download_Newitem_List(): read the total project count from the entry page
# and write the link of every list page to <city>_list_link.txt
def Download_Newitem_List(url, try_num=2):
    global City_Name
    print("Fetching:", url)
    try:
        all_html = requests.get(url, timeout=10)
    except Exception as e:
        print("Request failed:", e)
        all_html = None
        if try_num > 0:
            if hasattr(e, "code") and 500 <= e.code < 600:
                return Download_Newitem_List(url, try_num - 1)
    all_html.encoding = "gb18030"
    soup = BeautifulSoup(all_html.text, "html5lib")
    # Total project count, shown in parentheses next to the "all" tab
    Item_Total = soup.find("a", id="allUrl").find("span").text.replace("(", "").replace(")", "")
    # 20 projects per page; add one page for any remainder
    # (482 projects -> 24 full pages + 1 partial page = 25 pages)
    if (int(Item_Total) % 20) > 0:
        Page_Num = (int(Item_Total) // 20) + 1
    else:
        Page_Num = (int(Item_Total) // 20)

    with open("{0}_list_link.txt".format(City_Name), "w", encoding="utf-8") as f:
        for i in range(1, Page_Num + 1):
            New_Page_Link = "http://newhouse.{0}.fang.com/house/s/b9{1}".format(City_Name, i)
            print(New_Page_Link)
            print(New_Page_Link, file=f)

# Download_item_link(City): visit each list page and extract every project link
def Download_item_link(City):
    with open("{0}_list_link.txt".format(City), "r", encoding="utf-8") as f:
        # print(f.readlines())
        for line in f.readlines():
            print("List page:", line)
            sleep(2)
            try:
                all_html = requests.get(line, timeout=10)
                all_html.encoding = "gb18030"
                # print(all_html.text)
            except Exception as e:
                print("Request failed:", e)
                # if try_num > 0:
                #     if hasattr(e, "code") and 500 <= e.code < 600:
                #         return Download_Newitem_List(url, try_num - 1)
            soup = BeautifulSoup(all_html.text, "html5lib")
            master_html = soup.find_all("div", class_="nlcd_name")
            with open("{0}_Newall_link.txt".format(City), "w", encoding="utf-8") as d:
                for link in master_html:
                    # print(link.get_text().rstrip() + ":" + link.a["href"].rstrip())
                    print(link.a["href"].rstrip(), file=d)

Download_Newitem_List(page)
Download_item_link("qd")

The above code can be run directly in an IDE.
Take Qingdao (qd) as an example: the total of 482 real-estate projects is extracted correctly, and so are the 25 list-page links. But when I use the Download_item_link() function to extract the link of each project in the list, something goes wrong: qd_Newall_link.txt should contain 482 links, yet somehow it only has 20. I've been thinking about it for a long time and can't figure out what the problem is.
I hope some expert can help me take a look.

Mar. 20, 2021

Open the output file in append mode: in Download_item_link, change 'w' to 'a'. Mode 'w' truncates the file every time it is reopened inside the page loop, so each page's links overwrite the previous page's and only the last page's ~20 links remain.
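A minimal sketch of the effect (the file name demo.txt is hypothetical, purely to illustrate how mode "w" truncates on every reopen):

for i in range(3):
    with open("demo.txt", "w", encoding="utf-8") as d:
        print("page", i, file=d)
# demo.txt now contains only "page 2": each reopen with "w" wiped the file

Opening the file once before the loop (or reopening it with mode "a") preserves every page's links.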


Also change requests.get(line) to requests.get(line.strip()). Each line read from the file ends with a trailing newline, which corrupts the URL, so strip it before requesting.
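Putting both fixes together, a minimal sketch of the corrected function under the same page-parsing assumptions as the original code (opening the output file once before the loop has the same effect as append mode, without leaving stale links from a previous run):

from bs4 import BeautifulSoup
import requests
from time import sleep

def Download_item_link(City):
    # Open the output file once, before the loop, so pages accumulate
    # instead of overwriting each other ('w' inside the loop truncates)
    with open("{0}_list_link.txt".format(City), "r", encoding="utf-8") as f, \
         open("{0}_Newall_link.txt".format(City), "w", encoding="utf-8") as d:
        for line in f:
            url = line.strip()  # drop the trailing newline before requesting
            print("List page:", url)
            sleep(2)
            try:
                all_html = requests.get(url, timeout=10)
                all_html.encoding = "gb18030"
            except Exception as e:
                print("Request failed:", e)
                continue  # skip failed pages instead of parsing a stale response
            soup = BeautifulSoup(all_html.text, "html5lib")
            for link in soup.find_all("div", class_="nlcd_name"):
                print(link.a["href"].rstrip(), file=d)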
