现在很多网站都是异步加载的方式加载数据,大部分都是json数据,如果不知道数据的传递过程,一些参数理不清头绪的话,又想要获取数据,那就比较难搞了,尤其是对于本渣渣级选手而言。
(此处原文疑为截图,转载时已丢失)
目标网址
https://www.keyshot.com/gallery/
需求
获取图片信息,需高清大图
?
经过简单浏览器抓包调试,可以获取到一些信息!
(此处原文疑为抓包截图,转载时已丢失)
不想努力了,想了两个笨方法,好在数据量不大!
枚举法获取图片地址,爬取图片
1.枚举获取图片地址
代码示例
# Probe every four-digit gallery id. This fragment is meant to run inside a
# method of the crawler class, which is why it reads self.random_headers.
for i in range(10000):
    # Zero-pad to four digits so the id matches names such as
    # keyshot-gallery-0003.jpg.
    image_id = f"{i:04d}"
    print(image_id)
    url = f"https://www.keyshot.com/wp-content/uploads/2016/06/keyshot-gallery-{image_id}.jpg"
    try:
        # Without a timeout a single stalled connection would hang the scan.
        response = requests.get(url, headers=self.random_headers, timeout=8)
    except requests.RequestException:
        # A network error on one id should not stop the enumeration.
        continue
    # A requests.Response is truthy only when the status code is below 400.
    if response:
        print("存在图片!")
图片链接:
https://www.keyshot.com/wp-content/uploads/2016/06/keyshot-gallery-0003.jpg
可以看到id与图片链接是存在关系的,所以,对于id进行迭代,同时进行了if判断!
2.图片下载
代码示例
????def?save_img(self,?img_url,?img_name,?path):
????????os.makedirs(f'{path}/',?exist_ok=True)
????????print("开始下载图片!")
????????print(f">>>?开始保存?{img_name}?图片")
????????r?=?requests.get(img_url,?headers=self.random_headers,timeout=8)
????????with?open(f'{path}/{img_name}.jpg',?'wb')?as?f:
????????????f.write(r.content)
????????print(f">>>?保存?{img_name}?图片成功")
这里需要注意的是:timeout=8 这个参数一定要加上,尤其是请求国外网站时,否则请求很容易一直卡住!
完整代码
# -*- coding: UTF-8 -*-
#微信:huguo00289
import requests
import random,os
class Httprequest(object):
    """Base class that provides request headers with a rotating User-Agent."""

    # Desktop browser User-Agent strings to choose from. Two entries used to
    # carry a browser label fused onto their end ("Chrome 17.0",
    # "Firefox 4.0.1"); those labels are not part of the header value.
    ua_list = [
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
        'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11',
    ]

    @property  # exposed as an attribute: a new random choice on every access
    def random_headers(self):
        """Return a headers dict whose User-Agent is picked at random from ``ua_list``."""
        return {
            'User-Agent': random.choice(self.ua_list)
        }
class Get_imgs(Httprequest):
    """Brute-force the gallery's numbered image URLs and save those that exist."""

    def __init__(self):
        # Directory the downloaded images are written to.
        self.path = "key"

    def getimgs(self, start=0, stop=10000):
        """Probe gallery ids in ``range(start, stop)`` and save every hit.

        Args:
            start: First numeric id to try (inclusive).
            stop: Id to stop before (exclusive).
        """
        for number in range(start, stop):
            # Zero-pad to four digits so the id matches names such as
            # keyshot-gallery-0003.jpg.
            image_id = f"{number:04d}"
            print(image_id)
            url = f"https://www.keyshot.com/wp-content/uploads/2016/06/keyshot-gallery-{image_id}.jpg"
            try:
                # Without a timeout a single stalled connection hangs the scan.
                response = requests.get(url, headers=self.random_headers, timeout=8)
            except requests.RequestException as exc:
                # One unreachable id must not abort the whole enumeration.
                print(f"请求失败,跳过 {image_id}:{exc}")
                continue
            # A requests.Response is truthy only for status codes below 400.
            if response:
                print("存在图片!")
                # Hand over the body already fetched so it is not downloaded twice.
                self.save_img(url, image_id, self.path, content=response.content)

    def save_img(self, img_url, img_name, path, content=None):
        """Write an image to ``{path}/{img_name}.jpg``.

        Args:
            img_url: Address of the image; fetched only when ``content`` is None.
            img_name: File name (without extension) to save the image under.
            path: Target directory; created if it does not exist yet.
            content: Optional image bytes that were already downloaded.

        Nothing is written when the download answers with an error status.
        """
        os.makedirs(f'{path}/', exist_ok=True)
        print("开始下载图片!")
        print(f">>> 开始保存 {img_name} 图片")
        if content is None:
            r = requests.get(img_url, headers=self.random_headers, timeout=8)
            if not r.ok:
                # Do not store an HTML error page under a .jpg name.
                print(f">>> 保存 {img_name} 图片失败,状态码:{r.status_code}")
                return
            content = r.content
        with open(f'{path}/{img_name}.jpg', 'wb') as f:
            f.write(content)
        print(f">>> 保存 {img_name} 图片成功")
if __name__ == '__main__':
    # Start the brute-force crawl only when the file is run as a script.
    spider = Get_imgs()
    spider.getimgs()
手动获取json数据包,爬取图片
1.正则获取图片地址
代码示例
# Pull every href value out of the captured response text (``datas``) and keep
# only the links that point at the gallery's own host; this drops entries such
# as href="javascript:void(0);".
zeimg = r'href="(.+?)"'
imgs = re.findall(zeimg, str(datas), re.S)
img_urls = [img for img in imgs if "www.keyshot.com" in img]
2.多线程下载图片,这里使用了线程池技术
代码示例
def main():
    """Download every image returned by ``get_imgs`` using a thread pool."""
    img_urls = get_imgs()
    try:
        # No argument: the pool size defaults to the number of CPU cores.
        # The with-block releases the worker threads even if a download raises.
        with ThreadPool() as pool:
            # map() blocks until every URL has been processed.
            pool.map(save_img, img_urls)
        print("采集所有图片完成!")
    except Exception as exc:
        # Anything raised by a worker surfaces here, so report the real cause.
        print(f"Error: image download failed: {exc}")
完整代码
#keyshot图片采集
# -*- coding: UTF-8 -*-
#微信:huguo00289
import requests,re,os,random
from multiprocessing.dummy import Pool as ThreadPool
# Response payload copied by hand from the browser's network panel; it is the
# default input of get_imgs(). Line breaks inside the markup come from the
# original paste and are kept as they were.
_SAMPLE_RESPONSE = """
    
    
    data: "<li id="eg-2-post-id-333312" data-skin="keyshot-gallery" class="filterall filter-entertainment eg-keyshot-gallery-wrapper eg-post-id-333312 eg-newli" data-date="1352027697" data-title="dmitrij-le">     <div class="esg-media-cover-wrapper"> <div class="esg-entry-media"><img src="https://www.keyshot.com/wp-content/plugins/essential-grid/public/assets/images/300x200transparent.png" data-lazythumb="https://www.keyshot.com/wp-content/uploads/2016/06/keyshot-gallery-0095-700x1121-25x25.jpg" data-no-lazy="1" data-lazysrc="https://www.keyshot.com/wp-content/uploads/2016/06/keyshot-gallery-0095-700x1121.jpg" alt="" width="700" height="1121"></div>              <div class="esg-entry-cover esg-transition" data-delay="0" data-duration="deafult" data-clickable="on" data-transition="esg-fade">  <a class="eg-invisiblebutton  esgbox" href="https://www.keyshot.com/wp-content/uploads/2016/06/keyshot-gallery-0095.jpg"  data-width="923"  data-height="1478" ></a>                <div class="esg-overlay esg-transition eg-keyshot-gallery-container" data-delay="0" data-duration="default" data-transition="esg-fade"></div>      <div class="esg-center eg-post-333312 eg-keyshot-gallery-element-14-a esg-transition" data-delay="0" data-duration="default" data-transition="esg-slideup"><a class="eg-keyshot-gallery-element-14 eg-post-333312" href="javascript:void(0);" target="_self"></a></div>               <div class="esg-center eg-keyshot-gallery-element-8 esg-none esg-clear" style="height: 5px; visibility: hidden;"></div>     <div class="esg-center eg-post-333312 eg-keyshot-gallery-element-3 esg-transition" data-delay="0.1" data-duration="default" data-transition="esg-flipup">Dmitrij Leppée</div>               <div class="esg-center eg-keyshot-gallery-element-9 esg-none esg-clear" style="height: 5px; visibility: hidden;"></div>               <div class="esg-center eg-keyshot-gallery-element-11 esg-none esg-clear" style="height: 5px; visibility: hidden;"></div>  
          </div>    </div>  </li> <li id="eg-2-post-id-333248" data-skin="keyshot-gallery" class="filterall filter-jewelry eg-keyshot-gallery-wrapper eg-post-id-333248 eg-newli" data-date="1351522438" data-title="tiho-ramov">     <div class="esg-media-cover-wrapper"> <div class="esg-entry-media"><img src="https://www.keyshot.com/wp-content/plugins/essential-grid/public/assets/images/300x200transparent.png" data-lazythumb="https://www.keyshot.com/wp-content/uploads/2016/06/keyshot-gallery-0073-700x321-25x25.png" data-no-lazy="1" data-lazysrc="https://www.keyshot.com/wp-content/uploads/2016/06/keyshot-gallery-0073-700x321.png" alt="" width="700" height="321"></div>              <div class="esg-entry-cover esg-transition" data-delay="0" data-duration="deafult" data-clickable="on" data-transition="esg-fade">  <a class="eg-invisiblebutton  esgbox" href="https://www.keyshot.com/wp-content/uploads/2016/06/keyshot-gallery-0073.png"  data-width="1000"  data-height="458" ></a>                <div class="esg-overlay esg-transition eg-keyshot-gallery-container" data-delay="0" data-duration="default" data-transition="esg-fade"></div>      <div class="esg-center eg-post-333248 eg-keyshot-gallery-element-14-a esg-transition" data-delay="0" data-duration="default" data-transition="esg-slideup"><a class="eg-keyshot-gallery-element-14 eg-post-333248" href="javascript:void(0);" target="_self"></a></div>               <div class="esg-center eg-keyshot-gallery-element-8 esg-none esg-clear" style="height: 5px; visibility: hidden;"></div>     <div class="esg-center eg-post-333248 eg-keyshot-gallery-element-3 esg-transition" data-delay="0.1" data-duration="default" data-transition="esg-flipup">Tiho Ramovic</div>               <div class="esg-center eg-keyshot-gallery-element-9 esg-none esg-clear" style="height: 5px; visibility: hidden;"></div>               <div class="esg-center eg-keyshot-gallery-element-11 esg-none esg-clear" style="height: 5px; visibility: hidde
n;"></div>            </div>    </div>  </li> <li id="eg-2-post-id-333308" data-skin="keyshot-gallery" class="filterall filter-entertainment eg-keyshot-gallery-wrapper eg-post-id-333308 eg-newli" data-date="1349780210" data-title="vitaly-bul">     <div class="esg-media-cover-wrapper"> <div class="esg-entry-media"><img src="https://www.keyshot.com/wp-content/plugins/essential-grid/public/assets/images/300x200transparent.png" data-lazythumb="https://www.keyshot.com/wp-content/uploads/2012/10/keyshot-gallery-0113-700x1020-25x25.jpg" data-no-lazy="1" data-lazysrc="https://www.keyshot.com/wp-content/uploads/2012/10/keyshot-gallery-0113-700x1020.jpg" alt="" width="700" height="1020"></div>              <div class="esg-entry-cover esg-transition" data-delay="0" data-duration="deafult" data-clickable="on" data-transition="esg-fade">  <a class="eg-invisiblebutton  esgbox" href="https://www.keyshot.com/wp-content/uploads/2012/10/keyshot-gallery-0113.jpg"  data-width="961"  data-height="1400" ></a>                <div class="esg-overlay esg-transition eg-keyshot-gallery-container" data-delay="0" data-duration="default" data-transition="esg-fade"></div>      <div class="esg-center eg-post-333308 eg-keyshot-gallery-element-14-a esg-transition" data-delay="0" data-duration="default" data-transition="esg-slideup"><a class="eg-keyshot-gallery-element-14 eg-post-333308" href="javascript:void(0);" target="_self"></a></div>               <div class="esg-center eg-keyshot-gallery-element-8 esg-none esg-clear" style="height: 5px; visibility: hidden;"></div>     <div class="esg-center eg-post-333308 eg-keyshot-gallery-element-3 esg-transition" data-delay="0.1" data-duration="default" data-transition="esg-flipup">Vitaly Bulgarov</div>               <div class="esg-center eg-keyshot-gallery-element-9 esg-none esg-clear" style="height: 5px; visibility: hidden;"></div>               <div class="esg-center eg-keyshot-gallery-element-11 esg-none esg-clear" style="height
: 5px; visibility: hidden;"></div>            </div>    </div>  </li> <li id="eg-2-post-id-333310" data-skin="keyshot-gallery" class="filterall filter-entertainment eg-keyshot-gallery-wrapper eg-post-id-333310 eg-newli" data-date="1345460494" data-title="maarten-ve">     <div class="esg-media-cover-wrapper"> <div class="esg-entry-media"><img src="https://www.keyshot.com/wp-content/plugins/essential-grid/public/assets/images/300x200transparent.png" data-lazythumb="https://www.keyshot.com/wp-content/uploads/2016/06/keyshot-gallery-0094-700x1017-25x25.jpg" data-no-lazy="1" data-lazysrc="https://www.keyshot.com/wp-content/uploads/2016/06/keyshot-gallery-0094-700x1017.jpg" alt="" width="700" height="1017"></div>              <div class="esg-entry-cover esg-transition" data-delay="0" data-duration="deafult" data-clickable="on" data-transition="esg-fade">  <a class="eg-invisiblebutton  esgbox" href="https://www.keyshot.com/wp-content/uploads/2016/06/keyshot-gallery-0094.jpg"  data-width="1321"  data-height="1920" ></a>                <div class="esg-overlay esg-transition eg-keyshot-gallery-container" data-delay="0" data-duration="default" data-transition="esg-fade"></div>      <div class="esg-center eg-post-333310 eg-keyshot-gallery-element-14-a esg-transition" data-delay="0" data-duration="default" data-transition="esg-slideup"><a class="eg-keyshot-gallery-element-14 eg-post-333310" href="javascript:void(0);" target="_self"></a></div>               <div class="esg-center eg-keyshot-gallery-element-8 esg-none esg-clear" style="height: 5px; visibility: hidden;"></div>     <div class="esg-center eg-post-333310 eg-keyshot-gallery-element-3 esg-transition" data-delay="0.1" data-duration="default" data-transition="esg-flipup">Maarten Verhoeven</div>               <div class="esg-center eg-keyshot-gallery-element-9 esg-none esg-clear" style="height: 5px; visibility: hidden;"></div>               <div class="esg-center eg-keyshot-gallery-element-11 esg-no
ne esg-clear" style="height: 5px; visibility: hidden;"></div>            </div>    </div>  </li> <li id="eg-2-post-id-333207" data-skin="keyshot-gallery" class="filterall filter-engineering eg-keyshot-gallery-wrapper eg-post-id-333207 eg-newli" data-date="1334153155" data-title="philippe-v">     <div class="esg-media-cover-wrapper"> <div class="esg-entry-media"><img src="https://www.keyshot.com/wp-content/plugins/essential-grid/public/assets/images/300x200transparent.png" data-lazythumb="https://www.keyshot.com/wp-content/uploads/2016/06/keyshot-gallery-0054-700x394-25x25.jpg" data-no-lazy="1" data-lazysrc="https://www.keyshot.com/wp-content/uploads/2016/06/keyshot-gallery-0054-700x394.jpg" alt="" width="700" height="394"></div>              <div class="esg-entry-cover esg-transition" data-delay="0" data-duration="deafult" data-clickable="on" data-transition="esg-fade">  <a class="eg-invisiblebutton  esgbox" href="https://www.keyshot.com/wp-content/uploads/2016/06/keyshot-gallery-0054.jpg"  data-width="1280"  data-height="720" ></a>                <div class="esg-overlay esg-transition eg-keyshot-gallery-container" data-delay="0" data-duration="default" data-transition="esg-fade"></div>      <div class="esg-center eg-post-333207 eg-keyshot-gallery-element-14-a esg-transition" data-delay="0" data-duration="default" data-transition="esg-slideup"><a class="eg-keyshot-gallery-element-14 eg-post-333207" href="javascript:void(0);" target="_self"></a></div>               <div class="esg-center eg-keyshot-gallery-element-8 esg-none esg-clear" style="height: 5px; visibility: hidden;"></div>     <div class="esg-center eg-post-333207 eg-keyshot-gallery-element-3 esg-transition" data-delay="0.1" data-duration="default" data-transition="esg-flipup">Philippe Vanagt</div>               <div class="esg-center eg-keyshot-gallery-element-9 esg-none esg-clear" style="height: 5px; visibility: hidden;"></div>               <div class="esg-center eg-keyshot-galler
y-element-11 esg-none esg-clear" style="height: 5px; visibility: hidden;"></div>            </div>    </div>  </li> "
    message: ""
    success: true
    
    
"""


def get_imgs(datas=None):
    """Extract the full-size image links from a captured gallery response.

    Args:
        datas: Response text to scan. When omitted, the payload embedded in
            ``_SAMPLE_RESPONSE`` is used.

    Returns:
        A list of the href values that contain the gallery host, in the order
        they appear in the text.
    """
    if datas is None:
        datas = _SAMPLE_RESPONSE
    # re.S lets "." cross the line breaks that the pasted payload contains.
    zeimg = r'href="(.+?)"'
    imgs = re.findall(zeimg, str(datas), re.S)
    # Keep only links on the gallery host; this drops href="javascript:void(0);".
    img_urls = [img for img in imgs if "www.keyshot.com" in img]
    print(len(img_urls))
    return img_urls
#下载图片
def save_img(img_url):
    """Download one image into the ``key`` directory.

    The file keeps the last path segment of ``img_url`` as its name. Failures
    are reported and skipped rather than raised, because this function runs as
    a thread-pool worker and one bad URL should not abort the other downloads.

    Args:
        img_url: Address of the image to fetch.
    """
    path = "key"
    # Desktop browser User-Agent strings; one is picked at random per request.
    ua_list = [
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
        'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11',
    ]
    # exist_ok makes this safe when several workers create the directory at once.
    os.makedirs(f'{path}/', exist_ok=True)
    img_name = img_url.split('/')[-1]
    print("开始下载图片!")
    print(f">>> 开始保存 {img_name} 图片")
    try:
        # The timeout keeps a stalled connection from blocking a worker forever.
        r = requests.get(img_url, headers={'User-Agent': random.choice(ua_list)}, timeout=8)
    except requests.RequestException as exc:
        print(f">>> 保存 {img_name} 图片失败:{exc}")
        return
    if not r.ok:
        # Do not store an HTML error page under an image file name.
        print(f">>> 保存 {img_name} 图片失败,状态码:{r.status_code}")
        return
    with open(f'{path}/{img_name}', 'wb') as f:
        f.write(r.content)
    print(f">>> 保存 {img_name} 图片成功")
def main():
    """Download every image returned by ``get_imgs`` using a thread pool."""
    img_urls = get_imgs()
    try:
        # No argument: the pool size defaults to the number of CPU cores.
        # The with-block releases the worker threads even if a download raises.
        with ThreadPool() as pool:
            # map() blocks until every URL has been processed.
            pool.map(save_img, img_urls)
        print("采集所有图片完成!")
    except Exception as exc:
        # Anything raised by a worker surfaces here, so report the real cause.
        print(f"Error: image download failed: {exc}")
if __name__ == '__main__':
    # Run the threaded download only when the file is executed as a script.
    main()
????
?? ? ?
微信公众号:二爷记
不定时分享python源码及工具
查看更多关于low版爬虫脚本,Python简单图片爬虫案例的详细内容...
声明:本文来自网络,不代表【好得很程序员自学网】立场,转载请注明出处:http://haodehen.cn/did126117