Python crawler: a self-written project for scraping a website's image collections (a multi-threaded, multi-page wallpaper crawler)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2020/11/14 18:33
# @Author  : huni
# @File    : 爬壁纸(多线程+多页爬取).py
# @Software: PyCharm
from threading import Thread          # multithreading support
from queue import Queue               # thread-safe queue
from fake_useragent import UserAgent  # fakes request headers; optional and unused here, the headers below are written by hand
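# A minimal sketch (assuming the fake_useragent package is installed and importable as above)
# of how UserAgent could replace the hand-written User-Agent strings further down;
# UserAgent().random returns a random browser UA string on each access:
#   ua = UserAgent()
#   headers = {'Referer': 'https://www.mzitu.com', 'User-Agent': ua.random}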
import requests
from lxml import etree
import os
class CrawlInfo(Thread):
    # override the constructor
    def __init__(self, url_queue, html_queue):
        Thread.__init__(self)
        # keep references to the two queues as instance attributes
        self.url_queue = url_queue
        self.html_queue = html_queue

    # override the run method
    def run(self):
        # crawler code
        headers = {
            'Referer': 'https://www.mzitu.com',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.93 Safari/537.36'
        }
        while not self.url_queue.empty():           # keep crawling while the url queue is not empty
            url = self.url_queue.get()              # take one url from the queue
            response = requests.get(url=url, headers=headers)
            if response.status_code == 200:
                self.html_queue.put(response.text)  # on success, put the html text into the html queue
class ParseInfo(Thread):
    def __init__(self, html_queue):
        Thread.__init__(self)
        self.html_queue = html_queue

    # override the run method
    def run(self):
        head = {
            'Referer': 'https://www.mzitu.com/212698/1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36'
        }
        while not self.html_queue.empty():
            e = etree.HTML(self.html_queue.get())  # take one html text from the queue and parse it with etree
            # the parsing below can be adapted to whatever you want to scrape (jokes, audio, video, etc.);
            # just change base_url in __main__ before running
            li_list = e.xpath('//*[@id="pins"]/li')
            girls_list = []
            for li in li_list:
                li_href = li.xpath('./a/@href')[0]
                girls_list.append(li_href)
            for s_url in girls_list:
                page_text = requests.get(url=s_url, headers=head).text
                tree = etree.HTML(page_text)
                page_start = int(tree.xpath('/html/body/div[2]/div[1]/div[4]/span[1]/text()')[0])
                page_end = int(tree.xpath('/html/body/div[2]/div[1]/div[4]/a[5]/span/text()')[0])
                title = ''.join(tree.xpath('/html/body/div[2]/div[1]/h2/text()'))
                title_path = './文件夹/%s' % title
                if not os.path.exists(title_path):
                    os.makedirs(title_path)  # makedirs also creates the parent folder if it is missing
                base_url = s_url + '/' + '{}'
                for i in range(page_start, page_end + 1):
                    new_url = base_url.format(i)
                    heada = {
                        'Referer': new_url,
                        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36'
                    }
                    # a try/except around the block below could be added to skip pages that fail to download
                    page_new_text = requests.get(url=new_url, headers=heada).text
                    tree1 = etree.HTML(page_new_text)
                    src = tree1.xpath('/html/body/div[2]/div[1]/div[3]/p/a/img/@src')[0]
                    jpg_data = requests.get(url=src, headers=head).content
                    jpg_name = src.split('/')[-1]  # the file name is everything after the last '/' in the image link
                    jpg_path = title_path + '/' + jpg_name
                    with open(jpg_path, 'wb') as fp:
                        fp.write(jpg_data)
                    print(jpg_name, 'download finished')
if __name__ == '__main__':
    # create the containers that hold the urls and the downloaded html texts: two queues
    url_queue = Queue()
    html_queue = Queue()
    base_url = 'https://www.mzitu.com/page/{}/'  # url template with a placeholder
    for i in range(1, 11):
        new_url = base_url.format(i)  # fill the placeholder with the page number
        url_queue.put(new_url)

    crawl_list = []  # create ten crawler threads and add them to a thread list
    for i in range(10):
        Crawl = CrawlInfo(url_queue, html_queue)
        crawl_list.append(Crawl)
        Crawl.start()
    for crawl in crawl_list:
        # wait here: only after the url queue has been fully crawled is the html queue handed to the parsing threads
        crawl.join()

    parse_list = []
    for i in range(10):
        parse = ParseInfo(html_queue)
        parse_list.append(parse)
        parse.start()
    for parse in parse_list:
        parse.join()
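One caveat on the queue handling: with ten threads each running "while not queue.empty(): queue.get()", a queue can be emptied by another thread between the empty() check and the get(), and the late thread then blocks forever on get(). Below is a minimal sketch of a safer drain loop using get_nowait(); the drain function and the print call are illustrative, not part of the script above.

    from queue import Empty

    def drain(queue):
        # get_nowait() raises queue.Empty instead of blocking when the queue is exhausted,
        # so every worker thread exits its loop cleanly no matter who took the last item
        while True:
            try:
                item = queue.get_nowait()
            except Empty:
                break
            print(item)  # process the item here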