爬虫篇|爬虫实战（十）

前言：

对于爬虫还有一点小知识

fake_useragent的使用

fake_useragent第三方库，来实现随机请求头的设置；

安装 ---> pip3 install fake-useragent

查看useragent ---> http://fake-useragent.herokuapp.com/browsers/

 from?fake_useragent?import?UserAgent ua?=?UserAgent() print(ua.ie)???#随机打印ie浏览器任意版本 print(ua.firefox)?#随机打印firefox浏览器任意版本 print(ua.chrome)??#随机打印chrome浏览器任意版本 print(ua.random)??#随机打印任意厂家的浏览器

Queue模块

主要有以下成员函数：

Queue.empty()：判断消息队列是否为空，返回True或False。结果不可靠：在多线程环境下，返回之后队列的状态可能已经被其他线程改变。

Queue.not_empty：注意它并不是公开的成员函数，而是Queue内部使用的条件变量（threading.Condition）；判断消息队列是否为非空请使用 not Queue.empty()。同上不可靠。

Queue.full()：类似上边，判断消息队列是否满。

Queue.put(item, block=True, timeout=None)：往消息队列中存放消息。block可以控制是否阻塞，timeout指定阻塞时候的等待时间。如果不阻塞且队列已满，或者阻塞等待超时，会抛出 queue.Full 异常。

Queue.get(block=True, timeout=None)：获取一个消息，其他同put；如果不阻塞且队列为空，或者阻塞等待超时，会抛出 queue.Empty 异常。

爬虫练习

目标：爬取毛豆新车的数据，开线程使用队列大量的爬取

https://www.maodou.com/car/list/all/ （链接）

要点进去继续爬取，这是爬虫最常见的方式，也是必须会的爬虫，对于这种方法，一般用框架使用的多

就是把车的全部信息扒下来

导入对应的模块

import threading
from queue import Empty, Queue
from threading import Thread

import requests
from fake_useragent import UserAgent
from lxml import etree

将每辆车的url用列表储存起来

 def?page_url(base_url): ????headers?=?{ ????????'User-Agent':?ua.random, ????} ????page?=?'1' ????url_list?=?[] ????while?True: ????????url?=?base_url?%?page ????????print(url) ????????#?解码 ????????html?=?requests.get(url,?headers=headers).content.decode('utf-8') ????????#?遍历 ????????page?=?str(int(page)?+?1) ????????tree?=?etree.HTML(html) ????????a_list?=?tree.xpath('//div[@class="list-wrap?clearfix"]/a/@href') ????????for?a?in?a_list: ????????????url_list.append(a) ????????if?len(a_list)?==?0: ????????????break ????return?url_list

用队列将每页的url储存起来

 get_queue?=?Queue() class?Crawl_MD(Thread): ????def?__init__(self,?url_queue): ????????#?类的写法 ????????super(Crawl_MD,?self).__init__() ????????self.url_queue?=?url_queue ????def?run(self): ????????while?True: ????????????if?self.url_queue.empty(): ????????????????break ????????????try: ????????????????url?=?self.url_queue.get(block=False) ????????????????self.get_request(url) ????????????except?Exception?as?e: ????????????????print(e) ????def?get_request(self,?url): ????????headers?=?{ ????????????'User-Agent':?ua.random, ????????} ????????response?=?requests.get(url,?headers=headers).content.decode('utf-8') ????????get_queue.put(response)

最后在详细页把需要的信息一个一个匹对下来

 num?=?1 class?Customer_MD(Thread): ????def?run(self): ????????while?True: ????????????if?get_queue.empty()?and?flag: ????????????????break ????????????try: ????????????????response?=?get_queue.get(block=False) ????????????????self.get_data(response) ????????????except?Exception?as?e: ????????????????print(e) ????def?get_none(self,?word): ????????if?len(word)?>?0: ????????????return?word[0] ????????else: ????????????return?'' ????def?get_data(self,?response): ????????tree?=?etree.HTML(response) ????????title?=?tree.xpath('//h2[@class="banner-tit"]/text()') ????????img?=?tree.xpath('//div[@class="slider"]//li[1]/img/@src') ????????soufu?=?tree.xpath('//div[@class="sy-yf"]//p[@class="sy-num"]/text()') ????????yuegong?=?tree.xpath('//div[@class="sy-yf"]/div[2]/p[@class="yf-num?sy-num"]/text()') ????????firm_money?=?tree.xpath('//p[@class="price?"]/text()') ????????peizhi?=?tree.xpath('//ul[@class="config-detail"]//p/text()') ????????PZ?=?{} ????????for?i,?j?in?zip(peizhi[::2],?peizhi[1::2]): ????????????PZ[i]?=?j ????????#?print(title,?img,?soufu,?yuegong,?firm_money,?peizhi) ????????data?=?{ ????????????'title':?self.get_none(title), ????????????'img':?self.get_none(img), ????????????'首付':?''.join(soufu).replace('???',?'|'), ????????????'月供':?''.join(yuegong).replace('??',?'|'), ????????????'firm_money':?self.get_none(firm_money), ????????????'配置':?PZ ????????} ????????print(data) ????????global?num ????????word?=?[{"num":?num},?{'data':?data}] ????????if?lock.acquire(): ????????????with?open('data.txt',?'a')?as?f: ????????????????f.write(str(word)?+?'\n') ????????????????num?+=?1 ????????????????lock.release()

开爬

 if?__name__?==?'__main__': ????#?创建队列用于储存翻页url ????get_queue?=?Queue() ????ua?=?UserAgent() ????#?用来做标识 ????flag?=?False ????#?每辆车详细页的url ????list?=?page_url('https://HdhCmsTestmaodou测试数据/car/list/all/pg%s') ????#?创建队列用于爬取数据 ????url_queue?=?Queue() ????#?翻页的url列表 ????crawl_list?=?[] ????#?每辆车的url ????customer_list?=?[] ????#?锁起来 ????lock?=?threading.Lock() ????#?详细页的url的队列 ????[url_queue.put(i)?for?i?in?list] ????#?开三个线程来爬翻页的url ????for?cre?in?range(3): ????????crawl?=?Crawl_MD(url_queue) ????????crawl.start() ????????crawl_list.append(crawl) ????#?开三个线程来爬数据 ????for?cus?in?range(3): ????????customer?=?Customer_MD() ????????customer.start() ????????customer_list.append(customer) ????#?释放锁 ????[i.join()?for?i?in?crawl_list] ????#?如果分页的队列可能为空 ????flag?=?True ????#?释放锁 ????[a.join()?for?a?in?customer_list]

爬取结果

总结：

对于此类爬虫，一般使用的都是scrapy和pyspider框架，但我觉得能不使用框架最好不使用框架

在公众号回复【毛豆】,获得本文代码

--END--

最后，祝有所学习，有所成长

回复【 1024 】获取学习资料

转发，好看支持一下，感谢

你的转发，就是对我最大的支持

查看更多关于爬虫篇|爬虫实战（十）的详细内容...

声明：本文来自网络，不代表【好得很程序员自学网】立场，转载请注明出处：http://haodehen.cn/did163290

更新时间：2023-02-10 阅读：61次