My Lagou (拉勾网) scraper always reports an error.

import requests
from config.ip_pool import get_ip
from middlewares import *
import json, random, time
from lxml import etree
from fake_useragent import UserAgent
import yt_common.factory
import re

class Lagou:

    def __init__(self):
        self.ua = UserAgent()
        self.http = yt_common.factory.Factory.get_instance("project")

    def get_content(self):
    cookies_str = "user_trace_token=20180909010719-4eb82332-59f2-4979-b7ba-4a96de35eb40; _ga=GA1.2.1153938840.1536426437; LGUID=20180909010720-a5755fe0-b389-11e8-8ccd-525400f775ce; _qddaz=QD.wx1cg9.ftx1wj.jnl51m1t; JSESSIONID=ABAAABAAADEAAFIE6475DE07CCCE2D0833999916DC6AED6; utm_source=m_cf_seo_ald_wap; fromsite=""; TG-TRACK-CODE=jobs_similar; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2216628753458944-0ec2370b38b4a8-163b6953-1296000-16628753459972%22%2C%22%24device_id%22%3A%2216628753458944-0ec2370b38b4a8-163b6953-1296000-16628753459972%22%2C%22props%22%3A%7B%22%24latest_utm_source%22%3A%22m_cf_cpt_baidu_pc%22%2C%22%24latest_traffic_source_type%22%3A%22%E8%87%AA%E7%84%B6%E6%90%9C%E7%B4%A2%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22https%3A%2F%2Fwww.baidu.com%2Flink%22%2C%22%24latest_referrer_host%22%3A%22www.baidu.com%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC%22%7D%7D; showExpriedIndex=1; showExpriedCompanyHome=1; showExpriedMyPublish=1; hasDeliver=0; index_location_city=%E5%8C%97%E4%BA%AC; WEBTJ-ID=12252018%2C161612-167e46f847741d-04ce0d97f54b0f-163b6953-1296000-167e46f8478f5f; _gid=GA1.2.1277196703.1545725773; X_HTTP_TOKEN=3dec5bde9264a1350e562709684512ea; LG_LOGIN_USER_ID=aa0676d165159370bc5d629d9b5a41215c2b10b329a917bb; _putrc=73B45C3A2AAE9C2E; login=true; unick=%E6%8B%89%E5%8B%BE%E7%94%A8%E6%88%B76572; _gat=1; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1545816032,1545873968,1545878834,1545904413; LGSID=20181227175333-45c3f23b-09bd-11e9-b129-525400f775ce; PRE_UTM=m_cf_cpt_baidu_pc; PRE_HOST=www.baidu.com; PRE_SITE=https%3A%2F%2Fwww.baidu.com%2Fs%3Fwd%3D%25E6%258B%2589%25E5%258B%25BE%25E7%25BD%2591%26rsv_spt%3D1%26rsv_iqid%3D0xfb7d4ab90001af54%26issp%3D1%26f%3D8%26rsv_bp%3D1%26rsv_idx%3D2%26ie%3Dutf-8%26rqlang%3Dcn%26tn%3Dbaiduhome_pg%26rsv_enter%3D1%26oq%3Dnohup%252520%2525E5%2525A4%252584%2525E7%252590%252586%26rsv_t%3D2a73cVwL843%252Ba5Ai2lBIHgKBBA9Hf58WCmSNIrhGhaXjOjWtQO46%252Fa1hW5BKfpVlE%252BnB%26inputT%3D4637%26rsv_pq%3Dbb8ccaa20001742c%26rsv_sug3%3D70%26rsv_sug1%3D54%26rsv_sug7%3D100%26bs%3Dnohup%2520%25E5%25A4%2584%25E7%2590%2586; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2Flp%2Fhtml%2Fcommon.html%3Futm_source%3Dm_cf_cpt_baidu_pc; gate_login_token=fa8999aa6d617649ff083782230eac8ba8c9cc1520ae502f; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1545904416; LGRID=20181227175336-475b23dd-09bd-11e9-ad84-5254005c3644"
    headers = {
        "User-Agent": self.ua.random,
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language":"en-US,en;q=0.9",
         "Cache-Control": "no-cache",
         "Pragma": "no-cache",
        "Cookie": cookies_str,
        "X-Anit-Forge-Code": "0",
        "Connection": "keep-alive",
        "X-Anit-Forge-Token": "None",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Host": "www.lagou.com"}
    header = {"Upgrade-Insecure-Requests": "1", "Host": "www.lagou.com", "User-Agent": self.ua.random,
              "Cookie": cookies_str, "Accept-Encoding": "gzip, deflate, br", "Connection": "keep-alive",
              "Cache-Control": "max-age=0", "Accept-Language": "en-US,en;q=0.9",
              "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8","Accept-Language": "en-US,en;q=0.9","Cache-Control": "no-cache","Pragma": "no-cache"}

    url_list = ["https://www.lagou.com/gongsi/0-2-0-0.json", "https://www.lagou.com/gongsi/0-1-0-0.json",
                "https://www.lagou.com/gongsi/0-3-0-0.json"]
    for url in url_list:
        headers["Referer"] = re.findall("(.*?)\.json", url)[0]
        header["Referer"] = re.findall("(.*?)\.json", url)[0]
        for i in range(20, 0,-1):

            print("%d" % i)
            print("%s" % url)
            form_data = {"first": "false", "pn": {}, "sortField": "0", "havemark": "0".format(i)}
                try:
                    response = requests.post(url=url, headers=headers, data=form_data,
                                             proxies=get_ip())
                    time.sleep(random.randint(10, 20))
                except Exception as e:
                    # retry once with a fresh proxy from the pool
                    print(e)
                    response = requests.post(url=url, headers=headers, data=form_data,
                                             proxies=get_ip())
                print(response.text)

                json_data = json.loads(response.text)

                for j in range(0, len(json_data["result"])):
                    companyId = json_data["result"][j]["companyId"]
                    companyFullName = json_data["result"][j]["companyFullName"]
                    companyShortName = json_data["result"][j]["companyShortName"]
                    companyLogoLink = "https://www.lgstatic.com/thumbnail_300x300/" + str(
                        json_data["result"][j]["companyLogo"])
                    companyFeatures = json_data["result"][j]["companyFeatures"]
                    companyLink = "https://www.lagou.com/gongsi/" + str(json_data["result"][j]["companyId"]) + ".html"
                    companyCity = json_data["result"][j]["city"]
                    companySize = json_data["result"][j]["companySize"]
                    financeStage = json_data["result"][j]["financeStage"]
                    # likely meant to swap spaces for commas; the original replace("", ",")
                    # would insert a comma between every single character
                    industryField = json_data["result"][j]["industryField"].replace(" ", ",")

                    try:
                        res = requests.get(companyLink, headers=header,
                                           proxies=get_ip())
                        time.sleep(random.randint(10, 30))
                    except Exception as e:
                        print(e)
                        res = requests.get(companyLink, headers=header,
                                           proxies=get_ip())
                    print(companyLink)
                    print(res.url)

                    teamInfo = {}
                    # parse the page once and reuse the tree; the class selectors need
                    # single quotes inside the double-quoted Python strings
                    tree = etree.HTML(res.text)
                    companyLink = tree.xpath("//div[@class='company_main']/h1/a/@href")[0]
                    print("%s" % companyLink)
                    companyAddress = tree.xpath("//p[@class='mlist_li_desc']/text()")
                    companyAddress = [ad.strip() for ad in companyAddress]

                    name = tree.xpath("//p[@class='item_manager_name']/span/text()")

                    instro = tree.xpath("//div[@class='item_manager_content']/p/text()"
                                        "|//div[@class='item_manager_content']/text()")

                    title = tree.xpath("//p[@class='item_manager_title']/text()")
                    print(name)
                    print(instro)
                    print(title)

                    # k instead of i: the original reused i and shadowed the page counter
                    for k in range(0, len(name)):
                        teamInfo.setdefault(str(k), {}).setdefault("name", name[k])
                        teamInfo.setdefault(str(k), {}).setdefault("title", title[k])
                        if len(instro) != 0:
                            teamInfo.setdefault(str(k), {}).setdefault("instro", "".join(instro[k].split()))
                        else:
                            teamInfo.setdefault(str(k), {}).setdefault("instro", "")

                    # ensure_ascii=False keeps the Chinese text intact without the fragile
                    # encode("utf-8").decode("unicode_escape") round-trip
                    data = json.dumps({"companyId": companyId, "companyFullName": companyFullName,
                                       "companyShortName": companyShortName, "companyLogoLink": companyLogoLink,
                                       "companyFeature": companyFeatures,
                                       "company_link": companyLink,
                                       "companyCity": companyCity,
                                       "companySize": companySize,
                                       "financeStage": financeStage,
                                       "industryField": industryField,
                                       "companyAddress": companyAddress,
                                       "companyTeam": teamInfo}, ensure_ascii=False)

                    lagou_response = self.http.set_post().http_send("/spider/source/save-lagou", {"data": data})
                    print(lagou_response)
                    time.sleep(random.randint(3, 5))
                    print("\n")

if name ="_ _ main__":

pass

This is my scraper for Lagou (拉勾网). Whenever I crawl its pages I run into anti-scraping measures, such as 502 gateway errors.
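For what it's worth, a 502 here usually means the gateway rejected the request rather than a real server fault, so the single blind retry in the code above often just burns another proxy. One common mitigation is to retry with exponential backoff, rotating both the proxy and the User-Agent on every attempt. Below is a minimal sketch of that idea; it reuses get_ip() from the config.ip_pool module already imported above, while fetch_with_backoff, max_tries, and the sleep schedule are illustrative assumptions of mine, not anything Lagou documents.

import random
import time

import requests
from fake_useragent import UserAgent

from config.ip_pool import get_ip  # the same proxy pool the spider already uses

ua = UserAgent()

def fetch_with_backoff(url, headers, data=None, max_tries=5):
    # POST when form data is given, GET otherwise; retry on 5xx or network errors.
    # max_tries and the backoff schedule are illustrative, not tuned values.
    for attempt in range(max_tries):
        headers["User-Agent"] = ua.random  # fresh UA on every attempt
        try:
            if data is None:
                resp = requests.get(url, headers=headers, proxies=get_ip(), timeout=15)
            else:
                resp = requests.post(url, headers=headers, data=data, proxies=get_ip(), timeout=15)
            if resp.status_code < 500:  # a 502/503 from the gateway: try a new proxy
                return resp
            print("attempt %d: HTTP %d from %s" % (attempt + 1, resp.status_code, url))
        except requests.RequestException as e:
            print("attempt %d failed: %s" % (attempt + 1, e))
        # exponential backoff plus jitter so retries do not arrive in lockstep
        time.sleep(2 ** attempt + random.uniform(0, 3))
    raise RuntimeError("gave up on %s after %d tries" % (url, max_tries))

With a helper like this, both try/except blocks in get_content() collapse to single calls such as response = fetch_with_backoff(url, headers, data=form_data).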
