Python crawler: optimization details for multi-threaded, multi-page scraping of all of a blogger's works on a site
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2020/12/1 18:33
# @Author : huni
# @File : 爬壁纸(多线程+多页爬取).py
# @Software: PyCharm
from threading import Thread  # threading support
from queue import Queue       # thread-safe queue
import requests
from lxml import etree
import os
class CrawlInfo(Thread):
    # Override the constructor
    def __init__(self, url_queue, html_queue):
        Thread.__init__(self)
        # Keep references to the two shared queues
        self.url_queue = url_queue
        self.html_queue = html_queue

    # Override the run method
    def run(self):
        # Crawling code
        headers = {
            'Referer': 'http://www.xiannvku.com/',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.93 Safari/537.36'
        }
        while not self.url_queue.empty():          # keep crawling while the URL queue is not empty
            url = self.url_queue.get()             # take one URL from the queue
            response = requests.get(url=url, headers=headers)
            if response.status_code == 200:
                self.html_queue.put(response.text) # on success, push the HTML into the HTML queue
class ParseInfo(Thread):
    def __init__(self, html_queue):
        Thread.__init__(self)
        self.html_queue = html_queue

    # Override the run method
    def run(self):
        s = requests.Session()
        s.keep_alive = False  # intended to avoid keeping idle connections open
        headers = {
            'Referer': 'http://www.xiannvku.com/',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36'
        }
        while not self.html_queue.empty():
            e = etree.HTML(self.html_queue.get())  # take one HTML document from the queue and parse it with etree
            # The parsing below can be adapted to whatever you want to scrape (jokes, audio, video, ...);
            # you only need to change base_url before running.
            li_list = e.xpath('//ul[@class="img"]/li')
            for li in li_list:
                detailurl = li.xpath('./a[1]/@href')[0]
                detail_page = requests.get(detailurl, headers=headers).text
                detail_tree = etree.HTML(detail_page)
                # the second-to-last link in the pager holds the total page count
                pagenum = int(detail_tree.xpath('//div[@id="pages"]/a')[-2].xpath('./text()')[0])
                title = detail_tree.xpath('//title/text()')[0]
                title_path = search_path + f'/{title}'
                if not os.path.exists(title_path):
                    os.mkdir(title_path)
                for j in range(1, pagenum + 1):
                    rep = str(j) + '.html'
                    href = detailurl.replace(detailurl.split('-')[-1], rep)
                    page = requests.get(url=href, headers=headers).text
                    tree = etree.HTML(page)
                    img_list = tree.xpath('//div[@class="content"]/center/img')
                    for img in img_list:
                        src = img.xpath('./@src')[0]
                        jpgname = src.split('/')[-1]
                        jpgpath = title_path + '/' + jpgname
                        jpgdata = requests.get(url=src, headers=headers).content
                        with open(jpgpath, 'wb') as fp:
                            fp.write(jpgdata)
                        print(jpgname, 'saved!')
if __name__ == '__main__':
    headers = {
        'Referer': 'http://www.xiannvku.com/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36'
    }
    # Two containers shared between the threads: one queue of URLs, one queue of HTML pages
    url_queue = Queue()
    html_queue = Queue()

    key = 'XXX'  # the search keyword; swap in any other keyword you like
    base_url = 'http://www.xiannvku.com/index.php/pic/search'
    paradata = {
        'key': key
    }
    search_page = requests.post(url=base_url, headers=headers, data=paradata).text
    search_tree = etree.HTML(search_page)
    search_num = search_tree.xpath('//div[@class="text-c"]/a[1]/text()')[0]
    print('Search results:', search_num)

    search_path = './xxx' + f'/{key}'
    if not os.path.exists(search_path):
        os.makedirs(search_path)  # create the parent directory too if it does not exist yet

    # 28 results per page, so work out how many result pages to enqueue
    girl_page_num = (int(search_num.replace('条', '')) // 28) + 1
    for i in range(1, girl_page_num + 1):
        every_url = f'http://www.xiannvku.com/pic/search?key={key}&page={i}'
        url_queue.put(every_url)

    crawl_list = []  # create 100 crawl threads and keep them in a list
    for i in range(100):
        Crawl = CrawlInfo(url_queue, html_queue)
        crawl_list.append(Crawl)
        Crawl.start()
    for crawl in crawl_list:
        # wait for every crawl thread to finish, so the HTML queue is fully
        # populated before the parser threads start consuming it
        crawl.join()

    parse_list = []
    for i in range(100):
        parse = ParseInfo(html_queue)
        parse_list.append(parse)
        parse.start()
    for parse in parse_list:
        parse.join()
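One detail worth tightening when you run 100 workers against one queue: the pattern `while not queue.empty(): queue.get()` can race, because another thread may drain the last item between the `empty()` check and the `get()`, leaving a worker blocked forever. A minimal sketch of a race-free worker loop, assuming the same `url_queue`/`html_queue` setup as above (the `crawl_worker` function name is mine, not part of the original script):

from queue import Queue, Empty
from threading import Thread
import requests

def crawl_worker(url_queue: Queue, html_queue: Queue, headers: dict):
    """Drain url_queue safely: exit as soon as the queue is really empty."""
    while True:
        try:
            url = url_queue.get_nowait()   # never blocks; raises Empty when drained
        except Empty:
            break                          # nothing left, the thread exits cleanly
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            html_queue.put(response.text)
        url_queue.task_done()              # pair each get with task_done so url_queue.join() works

The same `get_nowait()`/`Empty` pattern applies to the parsing threads; it replaces the `join()`-then-start sequencing with a loop that simply stops when there is no more work.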
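Another small optimization: `ParseInfo.run` creates a `requests.Session` but never sends anything through it, so every detail page and every image still opens a fresh connection via `requests.get`. A hedged sketch of the intended reuse, one session per thread (the `download_image` helper name is my own, not from the original script):

import requests

def download_image(session: requests.Session, src: str, jpgpath: str, headers: dict):
    # Reusing the session keeps the TCP connection alive across downloads from the same host
    resp = session.get(src, headers=headers, timeout=10)
    resp.raise_for_status()
    with open(jpgpath, 'wb') as fp:
        fp.write(resp.content)

# Inside ParseInfo.run(), create the session once at the top:
#     session = requests.Session()
# and then call download_image(session, src, jpgpath, headers) in the inner loop.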