Python之爬虫-猫眼电影
#!/usr/bin/env python
# coding: utf-8
"""Scrape the Maoyan TOP100 movie board and append each entry to result.txt.

Each board page is downloaded, parsed with a regular expression, and every
movie found is printed and appended to ``result.txt`` as one JSON line.
"""
import json
import re
import time

# NOTE: ``requests`` is imported inside get_one_page() so that the parsing and
# file-writing helpers stay importable without the third-party package.

# Seconds to wait for the server before giving up on a request; without a
# timeout a stalled connection would block the script forever.
REQUEST_TIMEOUT = 10

# Browser-like headers sent with every request.
_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36(KHTML, like Gecko) '
                  'Chrome/52.0.2743.116 Safari/537.36',
}

# One match per <dd> entry. Groups, in order: board index, title, star text,
# release-time text, integer part of the score, fractional part of the score.
# Compiled once at import time; re.S lets "." span the newlines inside <dd>.
_MOVIE_PATTERN = re.compile(
    r'<dd>.*?board-index.*?>(.*?)</i>.*?name.*?a.*?>(.*?)</a>.*?star.*?>(.*?)</p>.*?releasetime.*?>(.*?)</p>.*?'
    r'integer.*?>(.*?)</i>.*?fraction.*?>(.*?)</i>.*?</dd>',
    re.S)


def get_one_page(url):
    """Download *url* and return the response body as text.

    Returns ``None`` when the request raises a ``RequestException``
    (including a timeout) or when the status code is not 200.
    """
    import requests
    from requests.exceptions import RequestException

    try:
        response = requests.get(url, headers=_HEADERS, timeout=REQUEST_TIMEOUT)
    except RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None


def parse_one_page(html):
    """Yield one dict per movie entry found in *html*.

    Each dict has the keys ``index``, ``title``, ``actor``, ``time`` and
    ``score``. Nothing is yielded when *html* is ``None`` or empty, so the
    result of a failed download can be passed in safely.
    """
    if not html:
        return
    for item in _MOVIE_PATTERN.findall(html):
        yield {
            'index': item[0],
            'title': item[1],
            # Drops the first 3 characters of the stripped star text --
            # presumably a 3-character "starring" label; verify against the page.
            'actor': item[2].strip()[3:],
            # Drops the first 5 characters of the stripped release-time text --
            # presumably a 5-character "release time" label; verify against the page.
            'time': item[3].strip()[5:],
            # The score is split across two tags on the page; join the parts.
            'score': item[4] + item[5],
        }


def write_to_file(content):
    """Append *content* to result.txt as a single JSON line (UTF-8)."""
    with open('result.txt', 'a', encoding='utf-8') as f:
        # ensure_ascii=False keeps non-ASCII text readable in the output file.
        f.write(json.dumps(content, ensure_ascii=False) + '\n')


def main(offset):
    """Fetch the board page starting at *offset*, then print and save its movies."""
    url = 'http://maoyan.com/board/4?offset=' + str(offset)
    html = get_one_page(url)
    if html is None:
        # The download failed or was rejected; skip this page instead of crashing.
        print('Failed to fetch ' + url)
        return
    for item in parse_one_page(html):
        print(item)
        write_to_file(item)


if __name__ == '__main__':
    # Ten pages, with offsets 0, 10, ..., 90.
    for i in range(10):
        main(offset=i * 10)
        # Maoyan has anti-scraping measures: requests sent too quickly get no
        # response, so pause between pages.
        time.sleep(1)
声明:本文来自网络,不代表【好得很程序员自学网】立场,转载请注明出处:http://haodehen.cn/did127443