Python multithreaded crawler: queue.Queue problem.

The idea is to first construct the URL list all_url, and then:

for i in range(0, len(all_url)):
    urlqueue.put(all_url[i])
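
For reference, a minimal, self-contained sketch of that filling step with placeholder URLs (just for illustration; the real list is read from url.txt):

from queue import Queue

all_url = ["http://example.com/a", "http://example.com/b"]  # placeholder URLs for illustration

urlqueue = Queue()
for i in range(0, len(all_url)):
    urlqueue.put(all_url[i])

print(urlqueue.qsize())  # 2, both URLs are now in the queue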

Then each get() can pull a URL from the queue.
The problem is that the range cannot be written as 0 to the list length;
doing so raises IndexError: list index out of range.

There is nothing wrong with the list itself; it is not empty.

And if the list length is 2000, only range(0, 1000) runs without errors,
which is troublesome.
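
A minimal sketch that reproduces the same symptom with a plain list, in case it helps pin down where the IndexError comes from (the numbers are placeholders):

items = list(range(2000))        # stand-in for a 2000-item URL list

for i in range(0, len(items)):   # len(items) is evaluated once, as 2000
    x = items[i]
    del items[i]                 # the list shrinks by one each pass

# IndexError: list index out of range is raised once i reaches 1000,
# because only 1000 elements are left at that point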

Here is the code:
import requests
from lxml import html
import time
import threading
from queue import Queue

class Spider(threading.Thread):

    def __init__(self, name, urlqueue):
        super().__init__()
        self.name = name
        self.urlqueue = urlqueue

    def run(self):

        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.4094.1 Safari/537.36"
        }

        print("Starting thread: " + self.name)

        while not self.urlqueue.empty():
            try:
                url = self.urlqueue.get()
                rep = requests.get(url, headers=headers, timeout=5)
                time.sleep(1)
                if rep.status_code == 200:
                    self.parse(rep)
                    print(url + " done")

            except Exception as e:
                print("Failed: " + url + ", error:", e)

        print("Exiting thread: " + self.name)

    def parse(self, rep):

        con = rep.content
        sel = html.fromstring(con)
        title = sel.xpath("//div[@class='titmain']/h1/text()")
        title = str(title).replace("]", "").replace("[", "").replace("'", "").replace(",", "").replace(r"\r\n", "").replace('"', "").replace(" ", "").replace(r"\xa0", "").replace("?", "").replace("/", "").replace(r"\u3000", " ")
        date = sel.xpath("//div[@class='texttit_m1']/p/text()")
        date = str(date).replace("]", "").replace("[", "").replace("'", "").replace(r"\u3000", " ")
        if len(date) > 20:
            file_name = title + ".txt"
            a = open(file_name, "w+", encoding="utf-8")
            a.write("\n" + str(title) + "\n" + "\n" + str(date))
            print(file_name + " written")
            a.close()

if __name__ == "__main__":

    with open("url.txt") as f:
        data = f.readline()
        james = data.strip().split(",")  # the URLs are stored on one line, separated by commas

    all_url = []
    for jame in james:
        a = eval(jame)
        all_url.append(a)

    print(len(all_url))
    start = time.time()
    urlqueue = Queue()
    threadNum = 3  # number of worker threads

    for i in range(0, 1468):
        urlqueue.put(all_url[i])  # put the URL into the queue
        del all_url[i]            # and remove it from the list

    threads = []

    for i in range(1, threadNum + 1):
        thread = Spider("Thread-" + str(i), urlqueue)
        thread.start()
        threads.append(thread)

    for thread in threads:
        thread.join()

    with open("url.txt", "w+") as b:
        b.write("\n".join([str(all_url)]))
        b.write("\n" + "=" * 50 + "\n")
        print("Remaining URLs written back to url.txt")

    end = time.time()
    print("-------------------------------")
    print("Total time: {}".format(end - start))

In addition, the URLs are read from the txt file; I don't know how to upload it here, but the final all_url list is definitely fine.
