正则表达式
search //匹配第一次遇到符合规则的
匹配IP地址
import re
re.search(r'(([01]{0,1}\d{0,1}\d|2[0-4]\d|25[0-5])\.){3}([01]{0,1}\d{0,1}\d|2[0-4]\d|25[0-5])', '192.168.1.1')
findall:匹配所有,把所有匹配到的字符放到一个列表中并返回
>>> p = re.compile('\d+')
>>> p.findall('3只小甲鱼,15条腿,多出的3条在哪里?')
['3', '15', '3']
编译正则表达式:re.compile()
>>> import re
>>> p = re.compile('[a-z]+')
>>> p
re.compile('[a-z]+')
>>> p.match("")
>>> print(p.match(""))
None
>>> m = p.match('fishc')
>>> m
<_sre.SRE_Match object; span=(0, 5), match='fishc'>
方法:
group()  返回匹配的字符串
start()  返回匹配的开始位置
end()    返回匹配的结束位置
span()   返回一个元组表示匹配位置(开始, 结束)
>>> m.group()
'fishc'
>>> m.start()
0
>>> m.end()
5
>>> m.span()
(0, 5)
编译标志,详情请查看http://bbs.fishc.com/thread-57207-1-1.html
设置了编译标志符
charref = re.compile(r"""
    &[#]                # 开始数字引用
    (
        0[0-7]+         # 八进制格式
        | [0-9]+        # 十进制格式
        | x[0-9a-fA-F]+ # 十六进制格式
    )
    ;                   # 结尾分号
""", re.VERBOSE)
未设置编译标志符
charref = re.compile("&#(0[0-7]+|[0-9]+|x[0-9a-fA-F]+);")
示例1:从代理网站上爬IP地址
import urllib.request
import re
def open_url(url):
    """Fetch *url* and return the page body decoded as UTF-8 text.

    A browser-like User-Agent header is attached so that sites which
    reject the default urllib agent still answer.
    """
    req = urllib.request.Request(url)
    req.add_header('User-Agent',
                   'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                   '(KHTML, like Gecko) Chrome/48.0.2564.97 Safari/537.36')
    page = urllib.request.urlopen(req)   # send the request with the header
    html = page.read().decode('utf-8')   # raw bytes -> str
    return html
????
????def?get_img(html):
????????p?=?r'(?:(?:[0,1]?\d?\d|2[0-4]\d|25[0-5])\.){3}(?:[0,1]?\d?\d|2[0-4]\d|25[0-5])'?#匹配IP地址
????????iplist?=?re.findall(p,html)
????????for?each?in?iplist:
????????????print(each)
????
if __name__ == '__main__':
    # Proxy-list site to scrape IP addresses from.
    # NOTE(review): URL reconstructed from a mangled source — confirm host.
    url = 'http://www.xicidaili.com/'
    get_img(open_url(url))
?
?
示例2:爬取妹纸图
import urllib.request
import os
import re
def save_imgs(folder, img_addrs):
    """Download every URL in *img_addrs* into the current directory.

    The file name is the last path segment of each URL.  *folder* is not
    used here (the caller has already chdir'd into it) but is kept for
    interface compatibility.
    """
    for each in img_addrs:
        filename = each.split('/')[-1]
        with open(filename, 'wb') as f:
            f.write(url_open(each))
    # (Removed leftover debug prints `print(1)` / `print(2)` from the
    # original.)
def url_open(url):
    """Return the raw bytes at *url*, requested with a browser User-Agent."""
    req = urllib.request.Request(url)
    req.add_header('User-Agent',
                   'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 '
                   '(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36')
    # BUG FIX: the original called urlopen(url) with the bare URL, so the
    # Request object — and its User-Agent header — was never actually used.
    response = urllib.request.urlopen(req)
    return response.read()
def get_page(url):
    """Return the current page number of the gallery as a string.

    The number is scraped from the first ``[NNNN]``-style token on the
    page (1 to 4 digits in square brackets).
    """
    html = url_open(url).decode('utf-8')
    # Capture the digits with a group instead of the original manual
    # bracket-slicing (a[1:len(a) - 1]).
    m = re.search(r'\[(\d{1,4})\]', html)
    return m.group(1)
def find_imgs(url):
    """Return the list of .jpg URLs referenced by ``img src=`` tags at *url*.

    Addresses containing 'lanya' are skipped.  The page uses
    protocol-relative addresses (``//host/...``), so ``http:`` is
    prefixed to each result.
    """
    html = url_open(url).decode('utf-8')
    img_addrs = []
    a = html.find('img src=')
    while a != -1:
        # Look for the closing '.jpg' within 255 chars of the tag start.
        b = html.find('.jpg', a, a + 255)
        if b != -1:
            addr = html[a + 9:b + 4]   # a+9 skips past  img src="
            if 'lanya' not in addr:
                img_addrs.append('http:' + addr)
        else:
            b = a + 9   # no .jpg nearby: resume scanning just past this tag
        a = html.find('img src=', b)
    print(img_addrs)
    return img_addrs
def download_mm(folder='ooxx', pages=4, star=0):
    """Download *pages* pages of images into *folder*.

    star: page number to start from; 0 means start at the newest page
    number reported by the site.
    """
    # BUG FIX: os.mkdir() raised FileExistsError on every re-run; makedirs
    # with exist_ok=True makes the script restartable.
    os.makedirs(folder, exist_ok=True)
    os.chdir(folder)
    url = 'http://jandan.net/ooxx/'    # girl-picture board
    # url = 'http://jandan.net/pic/'   # random-picture board
    if star != 0:
        page_num = star
    else:
        page_num = int(get_page(url))
    for i in range(pages):
        page_num -= 1
        page_url = url + 'page-' + str(page_num) + '#comments'
        img_addrs = find_imgs(page_url)
        save_imgs(folder, img_addrs)
        print(page_url)
if __name__ == '__main__':
    # Run with the defaults: 4 pages into ./ooxx, starting at the newest page.
    download_mm()
?
null
查看更多关于python爬虫入门(2)re模块-正则表达式的详细内容...
声明:本文来自网络,不代表【好得很程序员自学网】立场,转载请注明出处:http://haodehen.cn/did126963