As shown in the picture, response.text is not a web page code.

as shown in the figure,
clipboard.png
:
clipboard.png

at first it was IndexError: list index out of range
I found that the response.text was all the same, but I couldn"t find the problem.
the specific code is as follows. Thank you.

import json
from urllib.parse import urlencode
import re
import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException

def get_page_index (offset,keyword):

data = {
    "offset": offset,
    "format": "json",
    "keyword": keyword,
    "autoload": "true",
    "count": "20",
    "cur_tab": 3
}
url = "https://www.toutiao.com/search_content/?"+urlencode(data)
try:
    response = requests.get(url)
    if response.status_code == 200:
        return response.text
    return None
except RequestException:
    print("")
    return None

def parse_page_index (html):

data = json.loads(html)
if data and "data" in data.keys():
    for item in data.get("data"):
        yield item.get("article_url")

def get_page_detail (url):

try:
    response = requests.get(url)
    if response.status_code == 200:
        return response.text
    return None
except RequestException:
    print("",url)
    return None

def parse_page_detail (html):

soup = BeautifulSoup(html,"lxml")
title = soup.select("title")[0].get_text()
print(title)
images_pattern=re.compile("gallery: JSON.parse (.*?);",re.S)
result = re.search(images_pattern,html)
if result:
    print(result.group(1))

def main ():

html = get_page_index(0,"")
for url in parse_page_index(html):
    html = get_page_detail(url)
    if html:
        parse_page_detail(html)

if _ _ name__=="__main__":

main()

Jun.28,2021

check that the request header headers, is probably illegal.


after testing, adding a header can solve


when it is possible that the headline doesn't like crawlers, so it crawls back

.
Menu