答案:多线程版本: importrequests import time from queue import Queue
from lxml import etree
import threading
class Douban(object):
def __init__(self):
self.headers = {"User-Agent" : "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"}
self.base_url = "https://movie.douban.com/top250?start="
self.url_list = [ self.base_url + str(page) for page in range(0, 225 + 1, 25)]
# 创建保存数据的队列
self.data_queue = Queue()
self.count = 0
def send_request(self, url):
print("[INFO]: 正在抓取" + url)
html = requests.get(url, headers = self.headers).content
# 每次请求间隔1秒
time.sleep(1)
self.parse_page(html)
def parse_page(self, html):
html_obj = etree.HTML(html)
node_list = html_obj.xpath("//div[@class='info']")
for node in node_list:
# 电影标题
title = node.xpath("./div[@class='hd']/a/span[1]/text()")[0]
# 电影评分
score = node.xpath(".//span[@class='rating_num']/text()")[0]
self.count += 1
self.data_queue.put(score + "\t" + title)
def start_work(self):
# 单线程:
"""
for url in self.url_list:
self.send_request(url)
"""
thread_list = []
for url in self.url_list:
# 创建一个线程对象
thread = threading.Thread(target = self.send_request, args = )
# 启动线程,执行任务
thread.start()
# 将当前线程对象存到列表
thread_list.append(thread)
# 让主线程等待,所有子线程执行结束,再执行后面的代码
for thread in thread_list:
thread.join()
while not self.data_queue.empty():
print(self.data_queue.get())
print(self.count)
if __name__ == "__main__":
douban = Douban()
douban.start_work() 协程版本: import requests
import time
from queue import Queue
from lxml import etree
import gevent
from gevent import monkey
monkey.patch_all()
# 在程序执行的时候,将Python的网络库打个补丁。在执行网络操作的时候,按异步的方式执行。
class Douban(object):
def __init__(self):
self.headers = {"User-Agent" : "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"}
self.base_url = "https://movie.douban.com/top250?start="
self.url_list = [ self.base_url + str(page) for page in range(0, 225 + 1, 25)]
# 创建保存数据的队列
self.data_queue = Queue()
self.count = 0
def send_request(self, url):
print("[INFO]: 正在抓取" + url)
html = requests.get(url, headers = self.headers).content
# 每次请求间隔1秒
time.sleep(1)
self.parse_page(html)
def parse_page(self, html):
html_obj = etree.HTML(html)
node_list = html_obj.xpath("//div[@class='info']")
for node in node_list:
# 电影标题
title = node.xpath("./div[@class='hd']/a/span[1]/text()")[0]
# 电影评分
score = node.xpath(".//span[@class='rating_num']/text()")[0]
self.count += 1
self.data_queue.put(score + "\t" + title)
def start_work(self):
job_list = []
for url in self.url_list:
# 创建一个协程任务对象
job = gevent.spawn(self.send_request, url)
# 保存所有的协程任务
job_list.append(job)
# joinall()接收一个列表,将列表里的所有协程任务添加到任务队列里执行
gevent.joinall(job_list)
while not self.data_queue.empty():
print(self.data_queue.get())
print(self.count)
if __name__ == "__main__":
douban = Douban()
douban.start_work()