新版字符截取函数
return r
def get_part2(html,start,end,start_count=1,end_count=1,start_re=False,end_re=False):
    """
    Return the text located between the `start` and `end` markers.

    Neither marker is included in the result. The `start` marker is looked
    up in `html` with `find_tag_idx`; the `end` marker is then looked up
    only in the text that follows the `start` match.

    `start_count` / `end_count` and `start_re` / `end_re` are forwarded to
    `find_tag_idx` as its count argument and `use_re` flag (presumably
    "use the N-th occurrence" and "treat the marker as a regular
    expression" -- confirm against `find_tag_idx`).

    Returns "" when either marker is reported as not found (first element
    of the `find_tag_idx` result equal to -1).

    Example call:
        html = "abcabc"
        get_part2(html, "a", "c", start_count=2)
    """
    start_pos, start_stop = find_tag_idx(html, start, start_count, use_re=start_re)
    if start_pos == -1:
        return ""

    # Search for the end marker only in what follows the start match.
    remainder = html[start_stop:]
    end_pos, _end_stop = find_tag_idx(remainder, end, end_count, use_re=end_re)
    if end_pos == -1:
        return ""
    return remainder[:end_pos]
def filter_tags(html,tags=["em","dd","input","h1","h2","h3","br","a","b","span","strong","p","hr","strong","p","hr","font","div","td","tr","img","form","table"]):
    """
    Strip the opening and closing markup of the given tags from `html`.

    Only the tags themselves (``<tag ...>`` and ``</tag>``) are removed; the
    text between them is kept. Matching is case-insensitive.

    Parameters:
        html: the markup to clean.
        tags: iterable of tag names. Each name is interpolated into a
            regular expression without escaping. The default list is only
            read, never modified.

    Returns:
        `html` with every occurrence of the listed tags removed.
    """
    result = html
    for elem in tags:
        # The (?![\w-]) lookahead requires the tag name to end right there,
        # so "b" no longer strips <br>/<body>, "a" no longer strips <abbr>,
        # "p" no longer strips <pre>, and so on.
        result = re.sub(r"(?i)<%s(?![\w-])[\s\S]*?>" % elem, "", result)
        result = re.sub(r"(?i)</ *%s(?![\w-])[\s\S]*?>" % elem, "", result)
    return result
def filter_comment(html):
    """Return `html` with every ``<!-- ... -->`` comment removed, including multi-line ones."""
    # [\s\S] matches newlines too; the lazy quantifier stops at the first "-->".
    comment_pattern = re.compile(r"<!--[\s\S]*?-->")
    return comment_pattern.sub('', html)
def filter_characters(html,tags=["¥"," ","]",":"]):
    """
    Remove every occurrence of each string in `tags` from `html`.

    The strings are removed one after another, in list order, using plain
    (non-regex) substring replacement. The default list is only read.
    """
    cleaned = html
    for unwanted in tags:
        cleaned = cleaned.replace(unwanted, "")
    return cleaned
def filter_int(html):
    """
    Extract the digits of `html` and return them as a normalized integer string.

    All non-digit characters are dropped, the remaining digits are joined
    and parsed as one integer (so leading zeros disappear and a minus sign
    is ignored).

    Returns:
        The integer as a string, or "0" when `html` contains no digits.
    """
    digits = re.sub(r"[^\d]+", '', html)
    try:
        return str(int(digits))
    except ValueError:
        # Only a failed parse (e.g. no digits at all) maps to "0"; anything
        # else, such as KeyboardInterrupt, is allowed to propagate.
        return "0"
def filter_price(html):
    """
    Extract a decimal number from `html` and return it as a float string.

    Every character other than a digit or "." is dropped; what remains is
    parsed with `float`.

    Returns:
        ``str(float(...))`` of the remaining text, or "0" when that text is
        not a valid number (empty, a lone ".", several dots, ...).
    """
    cleaned = re.sub(r"[^\d\.]+", '', html)
    try:
        return str(float(cleaned))
    except ValueError:
        # Only a failed parse maps to "0"; anything else, such as
        # KeyboardInterrupt, is allowed to propagate.
        return "0"
def _(u):
    """Return `u` as a `unicode` object; non-unicode input is decoded as UTF-8."""
    return u if isinstance(u, unicode) else unicode(u, "utf8")
def gen_xml(url):
    """
    Write the contents of the outer-scope ``dict`` mapping to an XML file.

    The file is ``<g_xmls_folder>/<url2filename(url)>.xml`` and has the
    shape ``<add><doc><field name="KEY">VALUE</field>...</doc></add>``
    (this looks like a Solr update document -- confirm). It starts with a
    UTF-8 BOM followed by the document serialized with ``toxml("utf-8")``.

    NOTE: ``dict`` is not a parameter or local here; it is a name from an
    enclosing/module scope that shadows the builtin. This function mutates
    it: entries whose value is None or a blank string are deleted before
    serialization.

    Parameters:
        url: passed to `url2filename` to derive the output file name.

    Side effects:
        Mutates the outer ``dict``, writes the XML file, prints a message.
    """
    import codecs

    xml_filename = os.path.join(g_xmls_folder, url2filename(url) + ".xml")
    xml = minidom.Document()
    add = xml.createElement("add")
    xml.appendChild(add)
    doc = xml.createElement("doc")
    add.appendChild(doc)

    def c(na, va):
        "create field node"
        field = xml.createElement("field")
        field.setAttribute("name", na)
        field.appendChild(xml.createTextNode(va))
        doc.appendChild(field)

    # Drop empty entries. In Python 2, keys() returns a list snapshot, so
    # deleting from the mapping while looping over it is safe.
    string_types = (types.StringType, types.UnicodeType)
    for k in dict.keys():
        value = dict[k]
        if value is None or (isinstance(value, string_types) and value.strip() == ""):
            del dict[k]

    for k, v in dict.iteritems():
        # Numeric values such as stock are also emitted as text.
        # NOTE(review): str(v) on a non-ASCII unicode value would raise
        # UnicodeEncodeError under Python 2 -- confirm values are safe here.
        c(k, str(v))

    # The context manager closes the file even if a write raises.
    with codecs.open(xml_filename, "w") as f:
        f.write(codecs.BOM_UTF8)
        f.write(xml.toxml("utf-8"))
    print("生成文件%s" % xml_filename)