import requests,re
from lxml import etree
# Base URL that every page address in this script is built from.
# NOTE(review): the host "HdhCmsTest23us.so" looks mangled (possibly rewritten
# by the site this snippet was copied from) -- confirm the real host before use.
start_url = 'https://HdhCmsTest23us.so/files/article/html'
# Index page whose chapter links are collected below.
url = start_url + '/10/10839/index.html'
# A timeout keeps the script from hanging forever on an unresponsive server.
response = requests.get(url, timeout=30).text
# Numeric page ids captured from every matching href on the index page.
# Raw strings avoid invalid-escape warnings, and re.escape() makes the dots
# of the URL match literally instead of matching any character.
numbers_list = re.findall(
    r'\w\shref="' + re.escape(start_url) + r'/10/10839/(\d+)\.html',
    response,
)
# URL and XPath handed to pares() to pull the text nodes used as headings.
x1 = url
y1 = '//*[@class="L"]//text()'
# Output file; the writer functions open it in append mode.
novel_name = '剑来.txt'
def pares(x, y, timeout=30):
    """Download page *x* and return the result of XPath query *y* on it.

    Args:
        x: URL of the page to fetch.
        y: XPath expression evaluated against the parsed HTML.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` can block indefinitely.

    Returns:
        Whatever ``xpath`` yields for *y*.

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    page = requests.get(x, timeout=timeout)
    # Decode with the encoding detected from the response body rather than
    # the one requests picked from the headers.
    page.encoding = page.apparent_encoding
    tree = etree.HTML(page.text)
    return tree.xpath(y)
def writecontext(lines=None, path=None):
    """Append every text fragment of a chapter to the novel file.

    Each fragment is converted with ``str`` before writing and is also
    echoed to stdout.

    Args:
        lines: Iterable of fragments to write. Defaults to the module-level
            ``do``.
        path: File to append to. Defaults to the module-level
            ``novel_name``.
    """
    if lines is None:
        lines = do
    if path is None:
        path = novel_name
    # Open the file once for the whole chapter instead of once per fragment.
    with open(path, 'a', encoding='utf-8') as f:
        for i in lines:
            f.write(str(i))
            print(i)
def writetitle():
    """Append the current chapter heading, then the chapter body, to the file.

    Reads the module-level ``o`` (the heading text) and ``novel_name`` (the
    output path), then calls ``writecontext()`` to write the body.
    """
    with open(novel_name, 'a', encoding='utf-8') as f:
        # Two leading newlines separate the heading from the previous chapter.
        f.write("\n\n" + o + "\n")
    print(o)
    writecontext()
# Text nodes matched by y1 on the index page; entry i is used as the heading
# of chapter i.
# NOTE(review): this assumes doo and numbers_list are aligned one-to-one --
# confirm against the real page.
doo = pares(x1, y1)
# Upper bound on the number of chapters downloaded in one run.
max_chapters = 10000
y2 = '//*[@id="contents"]/text()'
# Stop when either list runs out: the previous fixed ``while e < 10000`` loop
# indexed past the end of the lists and raised IndexError.
for number, o in zip(numbers_list[:max_chapters], doo):
    x2 = start_url + "/10/10839/{}.html".format(number)
    # ``do`` and ``o`` stay module-level because writetitle() and
    # writecontext() read them as globals.
    do = pares(x2, y2)
    writetitle()
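# Source: see the article "python---requests scraping of Dingdian novels
# (顶点小说)" for more details.
# Disclaimer: this article comes from the internet and does not represent the
# views of 好得很程序员自学网; when reposting, please credit the source:
# http://haodehen.cn/did170731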