"""Download every chapter of a novel and save each one as a UTF-8 text file.

The script fetches the table-of-contents page, collects the chapter links,
then downloads each chapter, strips its paragraph markup and writes it to
``SAVE_DIR`` as ``<chapter title>.txt``.
"""

import os
import re

# NOTE(review): this host does not look like a real domain; it was probably
# altered by the web page the script was copied from (possibly
# https://www.17k.com) -- confirm the real site before running.
# Example chapter page: https://HdhCmsTest17k测试数据/chapter/263899/5856183.html
START_URL = 'https://HdhCmsTest17k测试数据'

# Table-of-contents page of the novel being downloaded.
TOC_URL = 'https://HdhCmsTest17k测试数据/list/263899.html'

# Directory the chapter files are written to by default.
SAVE_DIR = r'C:\Users\coremail\Desktop\爬虫\仙剑四'

# Seconds to wait for the server before giving up on a request.
REQUEST_TIMEOUT = 10

# Characters that are not allowed in Windows file names, plus line breaks/tabs.
_UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\r\n\t]')


def get_toc(html):
    """Collect the URL of every chapter linked from the table of contents.

    Args:
        html: Decoded HTML of the table-of-contents page.

    Returns:
        A list of chapter URLs in page order, each one ``START_URL`` followed
        by the link's ``href``. The list is empty when the chapter block (the
        text between ``class="tit">正文`` and ``BAIDU_banner_bottom``) is not
        found in *html*.
    """
    print('get url')
    toc_match = re.search(
        'class="tit">正文(.*?)BAIDU_banner_bottom', html, re.S)
    if toc_match is None:
        return []
    hrefs = re.findall('href="(.*?)"', toc_match.group(1), re.S)
    # The last link in the block is skipped, exactly as the original script
    # did -- presumably it is not a chapter link; TODO confirm.
    return [START_URL + href for href in hrefs[:-1]]


def parse_article(html):
    """Extract the chapter title and cleaned text from a chapter page.

    Args:
        html: Decoded HTML of a single chapter page.

    Returns:
        A ``(title, text)`` tuple. *title* is the content of the first
        ``<h1>`` element; *text* is the chapter body with ``<p>``/``</p>``
        tags, spaces and tabs removed.

    Raises:
        ValueError: If the title or the text block is missing from *html*.
    """
    title_match = re.search('<h1>(.*?)</h1>', html, re.S)
    body_match = re.search('class="p">(.*?)<p class="copy ">', html, re.S)
    if title_match is None or body_match is None:
        raise ValueError('chapter title or text block not found in page')
    text = body_match.group(1).replace('<p>', '').replace('</p>', '')
    # The page pads its paragraphs with long runs of spaces/tabs; drop them.
    return title_match.group(1), re.sub('[ \t]', '', text)


def get_article(html, directory=SAVE_DIR):
    """Parse one chapter page and save its text to disk.

    Args:
        html: Decoded HTML of a single chapter page.
        directory: Directory to write the chapter file into.

    Returns:
        The path of the file that was written.

    Raises:
        ValueError: If the title or the text block is missing from *html*.
    """
    print('get chapter and text')
    chapter_name, text = parse_article(html)
    return save(chapter_name, text, directory)


def save(chapter, article, directory=SAVE_DIR):
    """Write one chapter to ``<directory>/<chapter>.txt`` encoded as UTF-8.

    Args:
        chapter: Chapter title, used as the file name. Surrounding whitespace
            is stripped and characters that are invalid in Windows file names
            are replaced with ``_``.
        article: Chapter text to write.
        directory: Target directory; created if it does not exist.

    Returns:
        The path of the file that was written.
    """
    safe_name = _UNSAFE_FILENAME_CHARS.sub('_', chapter.strip())
    file_name = os.path.join(directory, safe_name + '.txt')
    os.makedirs(directory, exist_ok=True)
    with open(file_name, 'w', encoding='utf-8') as f:
        f.write(article)
    return file_name


def fetch_html(url):
    """Download *url* and return the response body decoded as UTF-8.

    Raises:
        requests.RequestException: On connection errors, timeouts or a
            non-success HTTP status.
    """
    # Imported here so the parsing and saving helpers above stay importable
    # without the third-party package or network access.
    import requests

    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return response.content.decode('UTF-8')


def main():
    """Download every chapter listed on the table-of-contents page."""
    url_list = get_toc(fetch_html(TOC_URL))
    for novel_url in url_list:
        print(novel_url)
        try:
            get_article(fetch_html(novel_url))
        except Exception as e:
            # Deliberately best-effort: report the failure and carry on with
            # the next chapter instead of aborting the whole download.
            print(e)
    print('over')


if __name__ == '__main__':
    main()
# 声明:本文来自网络,不代表【好得很程序员自学网】立场,转载请注明出处:http://haodehen.cn/did162319