[python rookie] made a crawler, but the things crawled out could not be stored.

questions are as follows:

-sharp -*- coding:utf-8 -*-
from urllib.request import urlopen
import bs4
import webbrowser
import requests

html_list = [] -sharp
html_list_txt = [] -sharp
movie_list = [] -sharp
movie_total = {} -sharp

html = "https://movie.douban.com/top250"
html_list.append(html)
-sharphtml2 = requests.get("https://movie.douban.com/top250")
-sharpwebbrowser.open(html)
-sharpwebbrowser.open(html)
html_txt = (urlopen(html)).read()
-sharphtml_txt = (requests.get(html)).text

bsObj = bs4.BeautifulSoup(html_txt, "html.parser")
print("---1---")
html_div = bsObj.find("div",{"class":"paginator"})
print("---2---")
html_a = html_div.findAll("a")
print("---3---")
for html_a_temp in html_a:
-sharp    print(type(html_a_temp))
    -sharpi = 2
   -sharpwhile i <= 9:
    -sharpprint(html_a_temp.get_text())
    -sharpif html_a_temp.get_text != ">":
        html_href = html_a_temp.attrs["href"]
        html_href = "https://movie.douban.com/top250" + html_href
        html_list.append(html_href)
        -sharpi += 1
    
print("---4---")
html_list = list(set(html_list))
print(len(html_list))
-sharpprint(html_set)
-sharpprint(len(html_set))
-sharpprint(set(html_list))
-sharpprint(type(""))

-sharp
for html_list_temp in html_list:
    """"""
    html_read = bs4.BeautifulSoup(urlopen(html_list_temp).read(), "html.parser")
    html_list_txt.append(html_read)

for html_page in html_list_txt:
    name_div_list = html_page.findAll("div",{"class":"info"})
    for name_div_temp in name_div_list:
        name_div_inside = name_div_temp.findAll("div")

        movie_name = name_div_inside[0].a.span.get_text()  -sharp
        name_div_star = name_div_temp.find("div",{"class":"star"})
        name_div_star_span = name_div_star.findAll("span")
        movie_score = name_div_star_span[1].get_text()  -sharp
        movie_number = name_div_star_span[3].get_text()  -sharp
        -sharp -----------
        -sharpname_span_inq = name_div_temp.findAll("p")[1].span.get_text()
        -sharpmovie_introduction = name_span_inq

        -sharpname_span_inq = name_div_temp.find("span",{"class":"inq"})
        print("----ttt----")
        -sharpmovie_introduction = name_span_inq.get_text()  -sharp
        movie_total["name"] = movie_name
        movie_total["score"] = movie_score
        movie_total["number"] = movie_number
        -sharpmovie_total["introduction"] = movie_introduction
        movie_list.append(movie_total)
print(movie_list)

"""
        name_div_inside_span_list = name_div_inside[1].div.findAll("span")
        for name_div_inside_span_temp in name_div_inside_span_list:
            movie_score = name_div_inside[1].div.span[1].get_text() -sharp
            movie_number = name_div_inside[1].div.span[3].get_text() -sharp
"""
-sharpmovie_total[name] = movie_name
"""
    name_div_list = html_page.findAll("div",{"class":"hd"})
    for name_div_temp in name_div_list:
        movie_name = name_div_temp.a.span.get_text()
        movie_name_list.append(movie_name)
"""
-sharpprint(movie_name_list)

the running result is as follows:

clipboard.png

I am still testing this crawler, and there are still some other problems that need to be solved.
but what I want to ask now is, when I climb Douban TOP250 movies, why do I grab one movie at a time, put it in the list, and finally print out all the same movie?
how can I change the code to make these movies a different movie?

Mar.03,2021

because your movie_total is always the same object, you keep modifying the same object, and the end result is that your movie_list all references the same object, and each time you modify movie_total will overwrite the previous object. The solution is to move the declaration of movie_total into the body of the loop, and reinitialize movie_total to an empty dictionary for each loop.

Menu