请在下方输入要搜索的题目:

请分别使用多线程和协程两种技术爬取豆瓣电影排行榜的电影名称和评分信息,其网址为:https://movie.douban.com/top250。

请分别使用多线程和协程两种技术爬取豆瓣电影排行榜的电影名称和评分信息,其网址为:https://movie.douban.com/top250。

发布时间:2025-08-11 12:45:42
推荐参考答案 ( 由 快搜搜题库 官方老师解答 )
联系客服
# Answer
# ---- Multithreaded version ----
import requests  # BUG FIX: original had "importrequests" (missing space)
import time
from queue import Queue
from lxml import etree
import threading


class Douban(object):
    """Scrape movie titles and ratings from the Douban Top 250 chart,
    fetching each results page in its own worker thread."""

    def __init__(self):
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"}
        self.base_url = "https://movie.douban.com/top250?start="
        # Top 250 is paginated 25 items per page: start=0, 25, ..., 225.
        self.url_list = [self.base_url + str(page) for page in range(0, 225 + 1, 25)]
        # Thread-safe queue used to collect (rating, title) results.
        self.data_queue = Queue()
        self.count = 0

    def send_request(self, url):
        """Download one results page and hand its HTML to the parser."""
        print("[INFO]: 正在抓取" + url)
        html = requests.get(url, headers=self.headers).content
        # Pause 1 second per request to be polite to the server.
        time.sleep(1)
        self.parse_page(html)

    def parse_page(self, html):
        """Extract every movie's title and rating from a page's HTML
        and push "rating<TAB>title" strings onto the result queue."""
        html_obj = etree.HTML(html)
        node_list = html_obj.xpath("//div[@class='info']")
        for node in node_list:
            # Movie title (first <span> inside the heading link).
            title = node.xpath("./div[@class='hd']/a/span[1]/text()")[0]
            # Movie rating.
            score = node.xpath(".//span[@class='rating_num']/text()")[0]
            # NOTE(review): += on shared state from multiple threads is not
            # atomic in general; acceptable here for a demo counter.
            self.count += 1
            self.data_queue.put(score + "\t" + title)

    def start_work(self):
        """Spawn one thread per page URL, wait for all of them,
        then print every queued result and the total count."""
        thread_list = []
        for url in self.url_list:
            # BUG FIX: original had "args = " with no value (SyntaxError);
            # the thread target needs its URL as a one-element tuple.
            thread = threading.Thread(target=self.send_request, args=(url,))
            # Start the thread running its task.
            thread.start()
            # Remember it so we can join later.
            thread_list.append(thread)
        # Block the main thread until every worker has finished.
        for thread in thread_list:
            thread.join()
        while not self.data_queue.empty():
            print(self.data_queue.get())
        print(self.count)


if __name__ == "__main__":
    douban = Douban()
    douban.start_work()


# ---- Coroutine (gevent) version ----
import requests
import time
from queue import Queue
from lxml import etree
import gevent
from gevent import monkey

# Patch Python's networking libraries at startup so blocking network
# calls yield to gevent's event loop and run asynchronously.
monkey.patch_all()


class Douban(object):
    """Scrape movie titles and ratings from the Douban Top 250 chart,
    fetching each results page in its own gevent greenlet."""

    def __init__(self):
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"}
        self.base_url = "https://movie.douban.com/top250?start="
        # Top 250 is paginated 25 items per page: start=0, 25, ..., 225.
        self.url_list = [self.base_url + str(page) for page in range(0, 225 + 1, 25)]
        # Queue used to collect (rating, title) results.
        self.data_queue = Queue()
        self.count = 0

    def send_request(self, url):
        """Download one results page and hand its HTML to the parser."""
        print("[INFO]: 正在抓取" + url)
        html = requests.get(url, headers=self.headers).content
        # Pause 1 second per request to be polite to the server.
        time.sleep(1)
        self.parse_page(html)

    def parse_page(self, html):
        """Extract every movie's title and rating from a page's HTML
        and push "rating<TAB>title" strings onto the result queue."""
        html_obj = etree.HTML(html)
        node_list = html_obj.xpath("//div[@class='info']")
        for node in node_list:
            # Movie title (first <span> inside the heading link).
            title = node.xpath("./div[@class='hd']/a/span[1]/text()")[0]
            # Movie rating.
            score = node.xpath(".//span[@class='rating_num']/text()")[0]
            self.count += 1
            self.data_queue.put(score + "\t" + title)

    def start_work(self):
        """Spawn one greenlet per page URL, wait for all of them,
        then print every queued result and the total count."""
        job_list = []
        for url in self.url_list:
            # Create a coroutine task object for this page.
            job = gevent.spawn(self.send_request, url)
            # Keep every task so joinall() can wait on them.
            job_list.append(job)
        # joinall() schedules all greenlets and blocks until they finish.
        gevent.joinall(job_list)
        while not self.data_queue.empty():
            print(self.data_queue.get())
        print(self.count)


if __name__ == "__main__":
    douban = Douban()
    douban.start_work()
专业技术学习
专业技术学习
搜搜题库系统