Duitang (www.duitang.com) is an image and wallpaper site with anti-scraping measures: the JSON it returns is often badly garbled, so only part of the data can be crawled and many images are missing. This post uses Python to scrape and download its images, as a worked example of combining multiprocessing with multithreading.
Entry URL
GET request and parameters
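The search endpoint is a plain GET request. Below is a minimal sketch (not from the original post) of the same call built with the requests params argument; the endpoint and parameter names (kw, type, include_fields, _type, start) are taken from the URL used in the scripts further down, the original URL also appends a `_` cache-busting value, and the keyword and offset here are only illustrative. The full scripts additionally send a random User-Agent and a cookie to get past the anti-scraping checks.

# A sketch of the search request using requests' params dict; parameter names
# come from the URL used in the scraper below, the values are placeholders.
import requests

API = 'https://www.duitang.com/napi/blog/list/by_search/'

params = {
    'kw': '按钮',          # search keyword
    'type': 'feed',
    'include_fields': ('top_comments,is_root,source_link,item,buyable,root_id,'
                       'status,like_count,like_id,sender,album,reply_count,'
                       'favorite_blog_id'),
    '_type': '',
    'start': 24,            # result offset; the site pages in steps of 24
}

resp = requests.get(API, params=params, timeout=8)
print(resp.status_code)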
JSON data
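Only a small slice of the response is actually used. The outline below is reconstructed from the parsing code in the scripts that follow (data.object_list[*].album.id and album.covers[0]); the concrete values are placeholders, and real responses carry many more fields per record.

# Hand-written sample reduced to the keys the scraper reads; values are placeholders.
sample = {
    'data': {
        'object_list': [
            {'album': {'id': 86921437,
                       'covers': ['https://example.com/uploads/cover.jpg']}},
            # ...up to 24 entries per page
        ]
    }
}

for item in sample['data']['object_list']:
    album = item['album']
    print(album['covers'][0], album['id'])   # cover URL, and album id used as the file name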
Run output
Single-threaded version
# -*- coding: utf-8 -*-
# www.duitang.com
# 20200603 by WX:huguo00289
from fake_useragent import UserAgent
import urllib.parse
import requests, time, os, json


def ua():
    ua = UserAgent()
    headers = {'User-Agent': ua.random,
               'Cookie': 'sessionid=ef6912ba-38d9-4b6e-a3d9-8d6526805f07; js=1; Hm_lvt_d8276dcc8bdfef6bb9d5bc9e3bcfcaf4=1590492733,1591182385; Hm_lpvt_d8276dcc8bdfef6bb9d5bc9e3bcfcaf4=1591182414'}
    # headers = {'User-Agent': ua.random}
    return headers


def get_imgs(i, keyword):
    kd = urllib.parse.quote(keyword)
    url = (f"https://www.duitang.com/napi/blog/list/by_search/?kw={kd}&type=feed"
           f"&include_fields=top_comments%2Cis_root%2Csource_link%2Citem%2Cbuyable%2Croot_id"
           f"%2Cstatus%2Clike_count%2Clike_id%2Csender%2Calbum%2Creply_count%2Cfavorite_blog_id"
           f"&_type=&start={24 * i}&_=159118241418{i}")
    html = requests.get(url, headers=ua(), timeout=8).content.decode('utf-8')
    time.sleep(1)
    datas = json.loads(html)
    object_lists = datas['data']['object_list']
    print(len(object_lists))
    for object_list in object_lists:
        print(object_list)
        img_url = object_list['album']['covers'][0]   # first cover image URL
        img_name = '%s%s' % (object_list['album']['id'], os.path.splitext(img_url)[1])
        print(img_url, img_name)
        down_img(img_url, img_name, keyword)


def down_img(img_url, img_name, keyword):
    os.makedirs(f'{keyword}/', exist_ok=True)   # create the download directory
    r = requests.get(img_url, headers=ua(), timeout=5)
    with open(f'{keyword}/{img_name}', 'wb') as f:
        f.write(r.content)
    print(f'>>>Saved image {img_name}!')


def main(keyword):
    for i in range(1, 10):
        print(f'>>>Crawling images on page {i}')
        get_imgs(i, keyword)
    print('Image collection finished!')


if __name__ == '__main__':
    main("按钮")
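Because the returned JSON is often garbled, json.loads() or a missing key can abort the whole run. Below is a minimal defensive variant of get_imgs(); it is a sketch rather than part of the original script, reuses the ua() and down_img() helpers defined above, and simply skips pages or records it cannot parse.

# Defensive variant (sketch): skip garbled pages/records instead of crashing.
# Assumes ua() and down_img() from the script above are already defined.
import urllib.parse
import requests, time, os, json


def get_imgs_safe(i, keyword):
    kd = urllib.parse.quote(keyword)
    url = (f"https://www.duitang.com/napi/blog/list/by_search/?kw={kd}&type=feed"
           f"&include_fields=top_comments%2Cis_root%2Csource_link%2Citem%2Cbuyable%2Croot_id"
           f"%2Cstatus%2Clike_count%2Clike_id%2Csender%2Calbum%2Creply_count%2Cfavorite_blog_id"
           f"&_type=&start={24 * i}&_=159118241418{i}")
    try:
        html = requests.get(url, headers=ua(), timeout=8).content.decode('utf-8')
        object_lists = json.loads(html)['data']['object_list']   # garbled JSON or missing keys raise here
    except (requests.RequestException, ValueError, KeyError) as e:
        print(f'>>>Skipping page {i}: {e}')
        return
    for obj in object_lists:
        try:
            img_url = obj['album']['covers'][0]
            img_name = '%s%s' % (obj['album']['id'], os.path.splitext(img_url)[1])
            down_img(img_url, img_name, keyword)
        except (KeyError, IndexError, requests.RequestException) as e:
            print(f'>>>Skipping record: {e}')
    time.sleep(1)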
Multi-threaded and multi-process version
# -*- coding: utf-8 -*-
# www.duitang.com
# 20200603 by WX:huguo00289
from fake_useragent import UserAgent
import urllib.parse
import requests, time, os, json
import threading         # multithreading
import multiprocessing   # multiprocessing


def ua():
    ua = UserAgent()
    headers = {'User-Agent': ua.random,
               'Cookie': 'sessionid=ef6912ba-38d9-4b6e-a3d9-8d6526805f07; js=1; Hm_lvt_d8276dcc8bdfef6bb9d5bc9e3bcfcaf4=1590492733,1591182385; Hm_lpvt_d8276dcc8bdfef6bb9d5bc9e3bcfcaf4=1591182414'}
    # headers = {'User-Agent': ua.random}
    return headers


def get_imgs(num, keyword):
    kd = urllib.parse.quote(keyword)
    print(f'>>>Crawling images on page {num}')
    url = (f"https://www.duitang.com/napi/blog/list/by_search/?kw={kd}&type=feed"
           f"&include_fields=top_comments%2Cis_root%2Csource_link%2Citem%2Cbuyable%2Croot_id"
           f"%2Cstatus%2Clike_count%2Clike_id%2Csender%2Calbum%2Creply_count%2Cfavorite_blog_id"
           f"&_type=&start={24 * num}&_=159118241418{num}")
    html = requests.get(url, headers=ua(), timeout=8).content.decode('utf-8')
    time.sleep(1)
    datas = json.loads(html)
    object_lists = datas['data']['object_list']
    print(len(object_lists))
    threads = []
    for object_list in object_lists:
        print(object_list)
        img_url = object_list['album']['covers'][0]
        img_name = '%s%s' % (object_list['album']['id'], os.path.splitext(img_url)[1])
        print(img_url, img_name)
        t = threading.Thread(target=down_img, args=(img_url, img_name, keyword))   # one thread per image
        threads.append(t)
    for i in threads:
        i.start()
    for i in threads:
        i.join()
    print(num, 'is ok')


def down_img(img_url, img_name, keyword):
    os.makedirs(f'{keyword}/', exist_ok=True)   # create the download directory
    r = requests.get(img_url, headers=ua(), timeout=5)
    with open(f'{keyword}/{img_name}', 'wb') as f:
        f.write(r.content)
    print(f'>>>Saved image {img_name}!')


# single process
def main(keyword):
    for i in range(1, 10):
        get_imgs(i, keyword)
    print('Image collection finished!')


# multiprocessing
def maindjc(keyword):
    pool = multiprocessing.Pool(processes=4)   # pool of 4 worker processes, one page per task
    for i in range(1, 10):
        pool.apply_async(func=get_imgs, args=(i, keyword))
    pool.close()
    pool.join()
    print('Image collection finished!')


if __name__ == '__main__':
    maindjc("美女")
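As an alternative to the manual Thread list and multiprocessing.Pool above, the same page-level/image-level split can be expressed with concurrent.futures. This is a sketch rather than part of the original post; it reuses get_imgs() from the script above, so each worker process still starts one thread per image download.

# Sketch: the same fan-out via concurrent.futures (not from the original post).
# Assumes get_imgs() from the threaded script above is defined in this module.
from concurrent.futures import ProcessPoolExecutor


def main_futures(keyword, pages=10, workers=4):
    with ProcessPoolExecutor(max_workers=workers) as pool:
        futures = [pool.submit(get_imgs, i, keyword) for i in range(1, pages)]
        for f in futures:
            f.result()   # surface any exception raised in the worker
    print('Image collection finished!')


if __name__ == '__main__':
    main_futures("美女")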
Reference:
[Python crawler] Jandan.net OOXX image crawler (2): downloading images with multithreading and multiprocessing
https://tendcode.com/article/jiandan-meizi-spider-2/