Scraping WeChat Official Account content is rather quirky: the request parameters, and in particular the POST parameters, take real time to work out. What is collected here is the content under a topic tag (话题标签), and pdfkit is used to print the collected content out as PDF.
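For readers who have not used pdfkit before: it shells out to the wkhtmltopdf binary, so you point the configuration at that executable and hand it an HTML string. A minimal sketch, assuming wkhtmltopdf sits at the Windows path used throughout this post (adjust to your own install):

import pdfkit

# Path to the wkhtmltopdf binary; machine-specific, change to your own install.
config = pdfkit.configuration(
    wkhtmltopdf=r'D:\wkhtmltox-0.12.5-1.mxe-cross-win64\wkhtmltox\bin\wkhtmltopdf.exe')

# Any HTML string works; declaring UTF-8 keeps Chinese text intact in the PDF.
html = '<html><head><meta charset="UTF-8"></head><body><h1>测试</h1></body></html>'
pdfkit.from_string(html, 'test.pdf', configuration=config)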
Two versions are implemented here. The first fetches the page directly over HTTP; the real address, i.e. the POST endpoint, also carries quite a few parameters that I never worked out, so it only retrieves part of the content, which is not ideal. The second version uses a headless browser to visit the page directly, grab the rendered page source, and parse out the content we want.
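One caveat before the code: version 2 below actually launches Chrome through Selenium without any headless flags, so a visible browser window opens. To run it truly headless you would pass Chrome options along yourself, roughly like this (a sketch against the Selenium 3 API used below; the chromedriver path is the one from the script and is machine-specific):

from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument('--headless')     # render without a visible window
options.add_argument('--disable-gpu')  # commonly added on Windows

# Same driver path as the script below; change it to wherever yours lives.
browser = webdriver.Chrome(
    executable_path=r'C:\Users\Administrator\Desktop\chromedriver_win32\chromedriver.exe',
    options=options)
browser.get('https://mp.weixin.qq.com/mp/homepage?__biz=MzA4NjQ3MDk4OA==&hid=5&sn=573b1b806f9ebf63171a56ee2936b883&devicetype=android-29&version=27001239&lang=zh_CN&nettype=WIFI&a=&session_us=gh_7d55ab2d943f&wx_header=1&fontScale=100&from=timeline&isappinstalled=0&scene=1&subscene=2&clicktime=1594602258&enterid=1594602258&ascene=14')
print(len(browser.page_source))  # source of the rendered page, after JS has run
browser.quit()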
This noob has been pretty lazy lately: all the code below is old, ready-made stuff, copied over, lightly tweaked, and put straight to use!
Version 1:
# -*- coding: UTF-8 -*-
# Fetch WeChat Official Account articles and print them to PDF
# by WeChat: huguo00289
# Target: https://mp.weixin.qq.com/mp/homepage?__biz=MzA4NjQ3MDk4OA==&hid=5&sn=573b1b806f9ebf63171a56ee2936b883&devicetype=android-29&version=27001239&lang=zh_CN&nettype=WIFI&a=&session_us=gh_7d55ab2d943f&wx_header=1&fontScale=100&from=timeline&isappinstalled=0&scene=1&subscene=2&clicktime=1594602258&enterid=1594602258&ascene=14
import os
import re

import requests
import pdfkit
from fake_useragent import UserAgent

config = pdfkit.configuration(
    wkhtmltopdf=r'D:\wkhtmltox-0.12.5-1.mxe-cross-win64\wkhtmltox\bin\wkhtmltopdf.exe')


class Du():
    def __init__(self, furl):
        ua = UserAgent()
        self.headers = {
            "User-Agent": ua.random,  # random UA on every run
        }
        self.url = furl

    def get_urls(self):
        # The homepage embeds its article list in a "var data={...}" JS blob;
        # pull the blob out, then pull every "link" field out of the blob.
        response = requests.get(self.url, headers=self.headers, timeout=8)
        html = response.content.decode('utf-8')
        req = re.findall(r'var data={(.+?)if', html, re.S)[0]
        urls = re.findall(r',"link":"(.+?)",', req, re.S)
        urls = set(urls)  # deduplicate
        print(len(urls))
        return urls

    def get_content(self, url, category):
        response = requests.get(url, headers=self.headers, timeout=8)
        print(response.status_code)
        html = response.content.decode('utf-8')
        req = re.findall(r'<div id="img-content" class="rich_media_wrp">(.+?)var first_sceen__time', html, re.S)[0]
        # Title
        h1 = re.findall(r'<h2 class="rich_media_title" id="activity-name">(.+?)</h2>', req, re.S)[0]
        h1 = h1.strip()
        pattern = r"[\/\\\:\*\?\"\<\>\|]"
        h1 = re.sub(pattern, "_", h1)  # replace characters Windows forbids in filenames
        print(h1)
        # Body
        detail = re.findall(
            r'<div class="rich_media_content " id="js_content" style="visibility: hidden;">(.+?)<script nonce=".+?" type="text/javascript">',
            req, re.S)[0]
        data = f'<h1>{h1}</h1>\n{detail}'
        self.dypdf(h1, data, category)
        return data

    def dypdf(self, h1, data, category):
        datas = f'<html><head><meta charset="UTF-8"></head><body>{data}</body></html>'
        print("Printing to PDF ...")
        pdfkit.from_string(datas, f'{category}/{h1}.pdf', configuration=config)
        print("PDF saved.")


if __name__ == '__main__':
    furl = "https://mp.weixin.qq.com/mp/homepage?__biz=MzA4NjQ3MDk4OA==&hid=5&sn=573b1b806f9ebf63171a56ee2936b883&devicetype=android-29&version=27001239&lang=zh_CN&nettype=WIFI&a=&session_us=gh_7d55ab2d943f&wx_header=1&fontScale=100&from=timeline&isappinstalled=0&scene=1&subscene=2&clicktime=1594602258&enterid=1594602258&ascene=14"
    category = "潘通色卡(电子版)"  # output folder, named after the topic tag
    datas = ''
    os.makedirs(f'{category}/', exist_ok=True)
    spider = Du(furl)
    urls = spider.get_urls()
    for url in urls:
        print(f">> crawling {url} ..")
        try:
            data = spider.get_content(url, category)
            datas = '%s%s%s' % (datas, '\n', data)  # only append articles that were actually fetched
        except Exception as e:
            print(f"crawl error: {e}")
    # Also emit one combined PDF for the whole topic tag
    spider.dypdf(category, datas, category)
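A note on the design: every extraction step in version 1 is a regular expression pinned to the exact markup WeChat served at the time (the var data={ blob, the first_sceen__time marker, the js_content div), so any template change on WeChat's side makes the findall calls come back empty and the [0] index throw. That fragility, together with the partial content mentioned above, is presumably what motivates the browser-based version 2.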
Version 2:
# -*- coding: UTF-8 -*-
# Fetch WeChat Official Account articles and print them to PDF (browser version)
# by WeChat: huguo00289
import os
import re
import time

import pdfkit
from bs4 import BeautifulSoup
from selenium import webdriver

config = pdfkit.configuration(
    wkhtmltopdf=r'D:\wkhtmltox-0.12.5-1.mxe-cross-win64\wkhtmltox\bin\wkhtmltopdf.exe')


class wx():
    def __init__(self, furl):
        self.url = furl
        self.chrome_driver = r'C:\Users\Administrator\Desktop\chromedriver_win32\chromedriver.exe'  # chromedriver location
        self.browser = webdriver.Chrome(executable_path=self.chrome_driver)

    def get_urls(self):
        urls = []
        self.browser.get(self.url)
        hrefs = self.browser.find_elements_by_xpath(
            "//div[@class='article_list']/a[@class='list_item js_post']")
        for href in hrefs:
            url = href.get_attribute('href')
            urls.append(url)
        print(len(urls))
        return urls

    def get_content(self, url, category):
        self.browser.get(url)
        time.sleep(5)  # give the page time to finish rendering
        # page_source returns the DOM after JavaScript has run
        pageSource = self.browser.page_source
        soup = BeautifulSoup(pageSource, 'lxml')
        # Title
        h1 = re.findall(r'<h2 class="rich_media_title" id="activity-name">(.+?)</h2>', pageSource, re.S)[0]
        h1 = h1.strip()
        pattern = r"[\/\\\:\*\?\"\<\>\|]"
        h1 = re.sub(pattern, "_", h1)  # replace characters Windows forbids in filenames
        print(h1)
        # Body
        detail = soup.find('div', class_="rich_media_content")
        detail = str(detail)
        # Strip the "follow us" banner that this account prepends to every article
        del_text = """<p class="" style="margin-top: -1px; max-width: 100%; font-family: 微软雅黑; white-space: normal; min-height: 40px; visibility: visible; height: 40px; line-height: 40px; border-radius: 10px; text-align: center; box-shadow: rgb(190, 190, 190) 0px 3px 5px; color: rgb(255, 255, 255); box-sizing: border-box !important; word-wrap: break-word !important; background-image: none; background-attachment: scroll; background-color: rgb(245, 143, 198); background-position: 0% 0%; background-repeat: repeat;"><strong class="" style="max-width: 100%; box-sizing: border-box !important; word-wrap: break-word !important;"><span style="max-width: 100%; font-size: 14px; box-sizing: border-box !important; word-wrap: break-word !important;">↑ 点击上方<span style="max-width: 100%; box-sizing: border-box !important; word-wrap: break-word !important;">“染整百科”</span>关注我们</span></strong></p>"""
        detail = detail.replace(del_text, '')
        data = f'<h1>{h1}</h1>\n{detail}'
        self.dypdf(h1, data, category)
        return data

    def dypdf(self, h1, data, category):
        datas = f'<html><head><meta charset="UTF-8"></head><body>{data}</body></html>'
        print("Printing to PDF ...")
        pdfkit.from_string(datas, f'{category}/{h1}.pdf', configuration=config)
        print("PDF saved.")

    def quit(self):
        self.browser.quit()


if __name__ == '__main__':
    furl = "https://mp.weixin.qq.com/mp/homepage?__biz=MzA4NjQ3MDk4OA==&hid=5&sn=573b1b806f9ebf63171a56ee2936b883&devicetype=android-29&version=27001239&lang=zh_CN&nettype=WIFI&a=&session_us=gh_7d55ab2d943f&wx_header=1&fontScale=100&from=timeline&isappinstalled=0&scene=1&subscene=2&clicktime=1594602258&enterid=1594602258&ascene=14"
    category = "潘通色卡(电子版)"  # output folder, named after the topic tag
    datas = ''
    os.makedirs(f'{category}/', exist_ok=True)
    spider = wx(furl)
    urls = spider.get_urls()
    for url in urls:
        print(f">> crawling {url} ..")
        try:
            data = spider.get_content(url, category)
            datas = '%s%s%s' % (datas, '\n', data)  # only append articles that were actually fetched
        except Exception as e:
            print(f"crawl error: {e}")
    spider.quit()
    # Also emit one combined PDF for the whole topic tag
    spider.dypdf(category, datas, category)
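A compatibility note: version 2 is written against the Selenium 3 API. Selenium 4 removed both the executable_path argument and the find_elements_by_xpath family, so on a current install the equivalent calls look roughly like this (same driver-path assumption as above):

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

# Selenium 4: the driver path moves into a Service object ...
service = Service(r'C:\Users\Administrator\Desktop\chromedriver_win32\chromedriver.exe')
browser = webdriver.Chrome(service=service)
# ... and element lookups go through find_elements(By.XPATH, ...)
hrefs = browser.find_elements(
    By.XPATH, "//div[@class='article_list']/a[@class='list_item js_post']")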
The code above is for reference only; if it resembles anyone else's, that is definitely this noob copying!
WeChat Official Account: 二爷记
Python source code and tools shared from time to time