[python rookie] asked questions about exception handling when writing crawler code

The

code is as follows:

-sharp -*- coding:utf-8 -*-
from urllib.request import urlopen
import bs4
import webbrowser
import requests

html_list = [] -sharp
html_list_txt = [] -sharp
movie_list = [] -sharp
-sharpmovie_total = {} -sharp

-sharpdef add_movie():


html = "https://movie.douban.com/top250"
html_list.append(html)
-sharphtml2 = requests.get("https://movie.douban.com/top250")
-sharpwebbrowser.open(html)
-sharpwebbrowser.open(html)
html_txt = (urlopen(html)).read()
-sharphtml_txt = (requests.get(html)).text

bsObj = bs4.BeautifulSoup(html_txt, "html.parser")
print("---1---")
html_div = bsObj.find("div",{"class":"paginator"})
print("---2---")
html_a = html_div.findAll("a")
print("---3---")
for html_a_temp in html_a:
-sharp    print(type(html_a_temp))
    -sharpi = 2
   -sharpwhile i <= 9:
    -sharpprint(html_a_temp.get_text())
    -sharpif html_a_temp.get_text != ">":
        html_href = html_a_temp.attrs["href"]
        html_href = "https://movie.douban.com/top250" + html_href
        html_list.append(html_href)
        -sharpi += 1
    
print("---4---")
html_list = list(set(html_list))
print(len(html_list))
-sharpprint(html_set)
-sharpprint(len(html_set))
-sharpprint(set(html_list))
-sharpprint(type(""))

-sharp
for html_list_temp in html_list:
    """"""
    html_read = bs4.BeautifulSoup(urlopen(html_list_temp).read(), "html.parser")
    html_list_txt.append(html_read)

for html_page in html_list_txt:
    name_div_list = html_page.findAll("div",{"class":"info"})
    for name_div_temp in name_div_list:
        movie_total = {} -sharp
        name_div_inside = name_div_temp.findAll("div")

        movie_name = name_div_inside[0].a.span.get_text()  -sharp
        name_div_star = name_div_temp.find("div",{"class":"star"})
        name_div_star_span = name_div_star.findAll("span")
        movie_score = name_div_star_span[1].get_text()  -sharp
        movie_number = name_div_star_span[3].get_text()  -sharp
        -sharp -----------
        try:
            movie_introduction = name_div_temp.find("span",{"class":"inq"}).get_text()-sharp
        except AttributeError:
            print("")
        print(movie_introduction)
        -sharpname_span_inq = name_div_temp.findAll("p")[1].span.get_text()
        -sharpmovie_introduction = name_span_inq

        -sharpname_span_inq = name_div_temp.find("span",{"class":"inq"})
        -sharpmovie_introduction = name_span_inq.get_text()  -sharp
        movie_total["name"] = movie_name
        movie_total["score"] = movie_score
        movie_total["number"] = movie_number
        movie_total["introduction"] = movie_introduction
        movie_list.append(movie_total)
print(movie_list)

"""
        name_div_inside_span_list = name_div_inside[1].div.findAll("span")
        for name_div_inside_span_temp in name_div_inside_span_list:
            movie_score = name_div_inside[1].div.span[1].get_text() -sharp
            movie_number = name_div_inside[1].div.span[3].get_text() -sharp
"""
-sharpmovie_total[name] = movie_name
"""
    name_div_list = html_page.findAll("div",{"class":"hd"})
    for name_div_temp in name_div_list:
        movie_name = name_div_temp.a.span.get_text()
        movie_name_list.append(movie_name)
"""
-sharpprint(movie_name_list)

the running result is as follows:

clipboard.png

clipboard.png

clipboard.png

Mar.03,2021

try: the colon here is full width

Menu