Preface
As is well known, having multiple Python threads access the same resource is error-prone, and threads complete in no guaranteed order. Normally we use a lock to decide who gets access, but even with a lock, the nondeterministic ordering means we cannot guarantee that chapters land in the txt file in the right order. To solve both problems, this post uses a thread pool together with a PriorityQueue: when we fire off several hundred requests at once, each task is assigned its own priority value, and those values decide which chapter gets written first.

In my test, 1000 threads crawled about 1,400 chapters in roughly 10 seconds, measured from program start until the file finished writing, which is very fast. I also tried a single-threaded version, but it was far too slow to sit through; by my estimate it would take at least a bit over two minutes.
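To make the ordering fix concrete, here is a minimal, self-contained sketch (not part of the crawler itself): items go into a PriorityQueue in scrambled order but always come out sorted by their priority value.

from queue import PriorityQueue

q = PriorityQueue()
# put chapters in deliberately scrambled order; the first tuple element is the priority
for job in [(2, 'chapter 3'), (0, 'chapter 1'), (1, 'chapter 2')]:
    q.put(job)
while not q.empty():
    print(q.get())  # (0, 'chapter 1'), then (1, 'chapter 2'), then (2, 'chapter 3')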
Code Implementation
First, a look at the libraries we import:
import requests
import re
from lxml import etree
from queue import PriorityQueue
from concurrent.futures import ThreadPoolExecutor, as_completed
import time
class Spider():
    url = 'http://www.shuquge.com/txt/73523/index.html'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
    }

    def get_page_urls(self):
        # fetch the index page, letting requests detect the site's encoding
        rsp = requests.get(self.url, headers=self.headers)
        rsp.encoding = rsp.apparent_encoding
        html = etree.HTML(rsp.content)
        # the book title sits in the page header
        title = html.xpath('/html/body/div[4]/div[2]/h2/text()')[0]
        # collect the relative chapter links and turn them into absolute URLs
        links = html.xpath('//dd/a/@href')
        links = [self.url.replace("index.html", "") + i for i in links]
        return links, title
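A quick way to sanity-check this first stage (the exact values depend on the live site, so treat the expected output as illustrative):

spider = Spider()
links, title = spider.get_page_urls()
print(title)       # the book title taken from the page's <h2>
print(len(links))  # number of chapter links found on the index page
print(links[0])    # an absolute chapter URL under http://www.shuquge.com/txt/73523/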
class PageJob():
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
    }

    def __init__(self, priority, url):
        self.priority = priority
        self.url = url
        self.GetContent()  # download and parse right away, inside the worker thread

    def __lt__(self, other):
        # PriorityQueue orders jobs with <, so a lower number means an earlier chapter
        return self.priority < other.priority

    def GetContent(self):
        rsp = requests.get(self.url, headers=self.headers)
        if rsp.status_code == 503:
            # the site throttles bursts of requests; wait a second and retry once
            time.sleep(1)
            rsp = requests.get(self.url, headers=self.headers)
        html = etree.HTML(rsp.content)
        title = html.xpath('//h1/text()')[0]
        # drop the last three text nodes, which are site boilerplate rather than chapter text
        content = html.xpath('//div[@id="content"]/text()')[:-3]
        # remove bare carriage-return entries and the full-width indent,
        # then normalize the remaining line breaks
        while '\r' in content:
            content.remove('\r')
        content = [re.sub('\xa0\xa0\xa0\xa0', '', i) for i in content]
        content = [re.sub('\r', '\n', i) for i in content]
        self.title = '\n\n' + title + '\n\n'
        self.content = content
        print(title)
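The key detail in PageJob is `__lt__`. PriorityQueue is backed by a heap that orders items with the `<` operator, so defining `__lt__` is what lets us queue whole job objects rather than bare numbers. A stripped-down illustration of the same idea (a toy class, not the crawler's):

class Job:
    def __init__(self, priority):
        self.priority = priority

    def __lt__(self, other):
        # delegate comparison to the priority number
        return self.priority < other.priority

print(Job(1) < Job(2))  # True, so a PriorityQueue can heap-sort Job objects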
def PutPageJob(para):
    # runs inside a worker thread: build the PageJob (which downloads the
    # chapter) and drop it into the shared priority queue
    q, i, links = para
    q.put(PageJob(i, links[i]))
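A side note on the tuple: it is only needed because the call site below passes a single argument. ThreadPoolExecutor.submit forwards positional arguments on its own, so an equivalent version without the wrapper tuple would be:

def PutPageJob(q, i, links):
    q.put(PageJob(i, links[i]))

# called as: t.submit(PutPageJob, q, i, links)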
if __name__ == '__main__':
    start_time = time.time()
    spider = Spider()
    links, title = spider.get_page_urls()
    q = PriorityQueue()
    # skip the first 12 links, which duplicate the newest chapters at the top of the index page
    links = links[12:]
    with ThreadPoolExecutor(max_workers=1000) as t:  # a pool of up to 1000 worker threads
        obj_list = []
        for i in range(len(links)):
            para = (q, i, links)
            p = t.submit(PutPageJob, para)
            obj_list.append(p)
        for future in as_completed(obj_list):
            future.result()  # surfaces any exception raised inside a worker
    # all jobs are done; drain the queue, which yields chapters in priority order
    # (the text/ directory must already exist)
    with open('text/{}.txt'.format(title), 'a', encoding='utf-8') as f:
        while not q.empty():
            next_job = q.get()  # lowest priority value, i.e. the earliest chapter, first
            f.write(next_job.title)
            f.writelines(next_job.content)
    print('Elapsed time:', time.time() - start_time)
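One caveat: with up to 1000 workers hammering the same host, the single 503 retry in GetContent may not always be enough. If you hit persistent errors, a small retry helper with a growing delay is an easy upgrade; the sketch below is my own suggestion (the fetch name and its parameters are illustrative, not from the original code):

def fetch(url, headers, retries=3, backoff=1.0):
    # retry on 5xx responses, sleeping a little longer each time
    for attempt in range(retries):
        rsp = requests.get(url, headers=headers)
        if rsp.status_code < 500:
            return rsp
        time.sleep(backoff * (attempt + 1))
    return rsp  # the last response, even if it still failed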