好得很程序员自学网

<tfoot draggable='sEl'></tfoot>

Python爬取小说

import os
import re

import requests

# Example chapter URL: https://www.17k.com/chapter/263899/5856183.html
# NOTE(review): the host was restored from the scrape-mangled
# 'HdhCmsTest17k测试数据' -- confirm it is the intended site.
def get_toc(html, base_url='https://www.17k.com'):
    """Return the absolute URL of every chapter linked from a contents page.

    The chapter links are taken from the part of the page that lies between
    the ``class="tit">正文`` heading and the ``BAIDU_banner_bottom`` marker.

    Args:
        html: Decoded HTML text of the table-of-contents page.
        base_url: Scheme and host prepended to each ``href`` found.

    Returns:
        A list of absolute chapter URLs in page order. The list is empty when
        the table-of-contents block is not present in ``html``.
    """
    print('get url')
    toc_blocks = re.findall(
        r'class="tit">正文(.*?)BAIDU_banner_bottom', html, re.S)
    if not toc_blocks:
        # Page layout changed or the wrong page was fetched: nothing to list.
        return []
    hrefs = re.findall(r'href="(.*?)"', toc_blocks[0], re.S)
    # The final href inside the block is deliberately skipped, as in the
    # original script -- presumably it is not a chapter link; TODO confirm.
    return [base_url + href for href in hrefs[:-1]]

# Extract the chapter title and the chapter text, then write them to disk.
def get_article(html):
    """Parse one chapter page and hand its title and text to ``save``.

    The title is the content of the first ``<h1>`` element. The text is
    everything between ``class="p">`` and the ``<p class="copy ">`` element,
    with ``<p>``/``</p>`` tags, spaces and tabs removed.

    Args:
        html: Decoded HTML text of a single chapter page.

    Raises:
        ValueError: If the title or the text block cannot be found in
            ``html``.
    """
    print('get chapter and text')
    title_match = re.search(r'<h1>(.*?)</h1>', html, re.S)
    body_match = re.search(r'class="p">(.*?)<p class="copy ">', html, re.S)
    if title_match is None or body_match is None:
        # Fail with a readable message instead of "'NoneType' object has no
        # attribute 'group'".
        raise ValueError('chapter title or text block not found in page')

    # The title becomes a file name, so drop surrounding whitespace/newlines.
    chapter_name = title_match.group(1).strip()
    text_content = body_match.group(1).replace('<p>', '').replace('</p>', '')
    # Remove the long runs of spaces and tabs left over from the HTML layout.
    save(chapter_name, re.sub(r'[ \t]', '', text_content))

# Write one chapter to a UTF-8 text file on the local disk.
def save(chapter, article, file_path=r'C:\Users\coremail\Desktop\爬虫\仙剑四'):
    """Write ``article`` to ``<file_path>/<chapter>.txt`` and return the path.

    Characters that are not allowed in Windows file names, plus tabs and
    line breaks, are replaced with ``_`` so that any chapter title can be
    used as a file name. The target directory is created when missing, and
    an existing file with the same name is overwritten.

    Args:
        chapter: Chapter title, used as the file name without extension.
        article: Chapter text to write.
        file_path: Directory that receives the file.

    Returns:
        The full path of the file that was written.
    """
    safe_name = re.sub(r'[\\/:*?"<>|\r\n\t]', '_', chapter).strip()
    if not safe_name:
        # A blank title would otherwise produce a file called just ".txt".
        safe_name = 'untitled'
    os.makedirs(file_path, exist_ok=True)
    file_name = os.path.join(file_path, safe_name + '.txt')
    with open(file_name, 'w', encoding='utf-8') as f:
        f.write(article)
    return file_name

def _fetch_html(page_url, timeout=10):
    """Download ``page_url`` and return its body decoded as UTF-8.

    A timeout is always passed so that a stalled connection cannot hang the
    script, and HTTP error statuses raise instead of being parsed as if they
    were a chapter page.
    """
    response = requests.get(page_url, timeout=timeout)
    response.raise_for_status()
    return response.content.decode('UTF-8')


# Table-of-contents page of the novel "仙剑四".
# NOTE(review): the host was restored from the scrape-mangled
# 'HdhCmsTest17k测试数据' -- confirm it is the intended site.
url = 'https://www.17k.com/list/263899.html'
htmlContent = _fetch_html(url)

url_list = get_toc(htmlContent)
for novel_url in url_list:
    print(novel_url)
    try:
        get_article(_fetch_html(novel_url))
    except Exception as e:
        # Best-effort download: report the failing chapter and carry on
        # with the remaining ones.
        print('failed:', novel_url, e)

# To try a single chapter, call get_article(_fetch_html(<chapter url>)).
print('over')

查看更多关于Python爬取小说的详细内容...

  阅读:33次