好得很程序员自学网

<tfoot draggable='sEl'></tfoot>

Python之爬虫-猫眼电影

Python之爬虫-猫眼电影

#!/usr/bin/env python
# coding: utf-8

import json
import re
# Maoyan has anti-scraping measures and stops responding when requests arrive
# too quickly, so `time` is used to add a delay between page fetches.
import time

import requests
from requests.exceptions import RequestException


def get_one_page(url, timeout=10):
    """Download ``url`` and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response text when the server answers with HTTP 200; ``None``
        for any other status code or when the request raises a
        ``RequestException`` (which includes timeouts).
    """
    # Send a desktop Chrome User-Agent string with the request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36(KHTML, like Gecko) '
                      'Chrome/52.0.2743.116 Safari/537.36',
    }
    try:
        # requests has no default timeout; without one a server that never
        # answers would block this call forever.
        response = requests.get(url, headers=headers, timeout=timeout)
    except RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None


def parse_one_page(html):
    """Extract the film entries from one page of board HTML.

    Args:
        html: Page markup as a string. ``None`` or an empty string yields
            nothing, so a failed download can be passed straight through.

    Yields:
        One dict per matched ``<dd>`` element with the string-valued keys
        ``index``, ``title``, ``actor``, ``time`` and ``score``.
    """
    if not html:
        # get_one_page() returns None on failure; re would raise TypeError on it.
        return
    pattern = re.compile(
        '<dd>.*?board-index.*?>(.*?)</i>.*?name.*?a.*?>(.*?)</a>.*?star.*?>(.*?)</p>.*?releasetime.*?>(.*?)</p>.*?'
        'integer.*?>(.*?)</i>.*?fraction.*?>(.*?)</i>.*?</dd>',
        re.S)  # re.S lets '.' match the newlines inside each <dd> element
    for item in pattern.findall(html):
        yield {
            'index': item[0],
            'title': item[1],
            # Drop the first 3 characters after trimming
            # (presumably the cast label prefix -- confirm against the live page).
            'actor': item[2].strip()[3:],
            # Drop the first 5 characters after trimming
            # (presumably the release-date label prefix -- confirm against the live page).
            'time': item[3].strip()[5:],
            # The score is split across an "integer" tag and a "fraction" tag.
            'score': item[4] + item[5],
        }


def write_to_file(content):
    """Append ``content`` to ``result.txt`` as one JSON document per line.

    Args:
        content: A JSON-serializable object (``main`` passes one film dict).
    """
    with open('result.txt', 'a', encoding='utf-8') as f:
        # ensure_ascii=False keeps non-ASCII text readable instead of
        # escaping it as \uXXXX sequences.
        f.write(json.dumps(content, ensure_ascii=False) + '\n')


def main(offset):
    """Fetch one page of the Maoyan board, then print and save its entries.

    Args:
        offset: Value for the ``offset`` query parameter selecting the page.
    """
    url = 'http://maoyan.com/board/4?offset=' + str(offset)
    html = get_one_page(url)
    if html is None:
        # The fetch failed or returned a non-200 status; skip this page
        # instead of handing None to the parser.
        print('Failed to fetch ' + url + '; skipping this page')
        return
    for item in parse_one_page(html):
        print(item)
        write_to_file(item)


if __name__ == '__main__':
    # Walk the board page by page using offsets 0, 10, ..., 90.
    for i in range(10):
        main(offset=i * 10)
        # Pause between requests: the site stops responding when it is
        # hit too quickly.
        time.sleep(1)

查看更多关于Python之爬虫-猫眼电影的详细内容...

  阅读:31次