<tfoot draggable='sEl'></tfoot>

最终用于html截取，同python版本

最终用于html截取，同python版本

   return r
def get_part2(html,start,end,start_count=1,end_count=1,start_re=False,end_re=False,reverseDirection=False):
    """
    Return the text of ``html`` lying between ``start`` and ``end``,
    excluding the two markers themselves.

    Example from the original author::

        html = "abcabc"
        get_part2(html, "a", "c", start_count=2)

    Parameters:
        html: text to cut a piece out of.
        start, end: opening and closing markers.
        start_count, end_count: forwarded to the index helpers as the
            occurrence count for the respective marker.
        start_re, end_re: forwarded to the index helpers as ``use_re``.
        reverseDirection: when false, ``start`` is located first with
            ``find_tag_idx`` and ``end`` is then searched for in the text
            that follows it.  When true, ``end`` is located first with
            ``rfind_tag_idx`` and ``start`` is then searched for, also with
            ``rfind_tag_idx``, in the text preceding ``end``.

    Returns:
        The enclosed text, or ``""`` when either helper reports ``-1`` as
        the first element of its result.

    NOTE(review): assumes ``find_tag_idx`` / ``rfind_tag_idx`` return a
    ``(match_start, match_end)`` index pair with ``match_start == -1`` on
    failure -- confirm against their definitions.
    NOTE(review): the original docstring said that in reverse mode ``start``
    is still counted from the left, yet the code looks it up with
    ``rfind_tag_idx``; confirm which behaviour is intended.
    """
    if not reverseDirection:
        start_begin, start_stop = find_tag_idx(html, start, start_count, use_re=start_re)
        if start_begin == -1:
            return ""
        # Only the text after the start marker is searched for the end marker.
        remainder = html[start_stop:]
        end_begin, end_stop = find_tag_idx(remainder, end, end_count, use_re=end_re)
        if end_begin == -1:
            return ""
        return remainder[:end_begin]

    # Reverse mode: pin down the end marker first, then look for the start
    # marker in whatever precedes it.
    end_begin, end_stop = rfind_tag_idx(html, end, end_count, use_re=end_re)
    if end_begin == -1:
        return ""
    leading = html[:end_begin]
    start_begin, start_stop = rfind_tag_idx(leading, start, start_count, use_re=start_re)
    if start_begin == -1:
        return ""
    return leading[start_stop:]
def filter_tags(html, tags=("em", "dd", "input", "h1", "h2", "h3", "br", "a", "b", "span", "strong", "p", "hr", "font", "div", "td", "tr", "img", "form", "table")):
    """
    Strip the opening and closing markup of the given tags from ``html``,
    keeping the text between them.

    Parameters:
        html: the markup to clean.
        tags: iterable of tag names to remove.  Matching is
            case-insensitive.  Names are interpolated into the regular
            expression unescaped.

    Returns:
        ``html`` with every ``<tag ...>`` and ``</tag>`` for the listed
        names removed.
    """
    result = html
    for elem in tags:
        # \b keeps "b" from swallowing "<br>" or "<body>"; [^>]* consumes any
        # attributes (and a trailing "/") up to the closing bracket.
        result = re.sub(r"(?i)<%s\b[^>]*>" % elem, "", result)
        # Closing tag, tolerating stray whitespace around the name.
        result = re.sub(r"(?i)</\s*%s\s*>" % elem, "", result)
    return result
def filter_comment(html):
    """
    Remove HTML comments (``<!-- ... -->``) from ``html``.

    The match is non-greedy and spans newlines, so each comment is removed
    individually and the text between two comments is preserved.
    """
    return re.sub(r"<!--[\s\S]*?-->", "", html)
def filter_characters(html, tags=("￥", " ", "]", "：")):
    """
    Delete every occurrence of each string in ``tags`` from ``html``.

    Parameters:
        html: the text to clean.
        tags: iterable of substrings to remove; removal is applied in the
            order given.

    Returns:
        The cleaned text.
    """
    for tag in tags:
        html = html.replace(tag, "")
    return html
def filter_int(html):
    """
    Extract an integer from ``html`` and return it as a string.

    Every non-digit character is dropped and the remaining digits are read
    as a single number, so ``"a1b2"`` yields ``"12"``.  Leading zeros are
    lost through the ``int`` round-trip.

    Returns:
        The integer as a string, or ``"0"`` when ``html`` contains no digits.
    """
    digits = re.sub(r"\D+", "", html)
    try:
        return str(int(digits))
    except ValueError:
        # No digits were left to parse.
        return "0"
def filter_price(html):
    """
    Extract a decimal number from ``html`` and return it as a string.

    Every character other than a digit or ``.`` is dropped and the remainder
    is parsed with ``float``.

    Returns:
        ``str(float(...))`` of the remaining characters (so ``"12"`` becomes
        ``"12.0"``), or ``"0"`` when they do not form a valid number, for
        example when nothing is left or more than one ``.`` remains.
    """
    number = re.sub(r"[^\d.]+", "", html)
    try:
        return str(float(number))
    except ValueError:
        # Empty remainder or malformed number such as "1.2.3".
        return "0"
def _(u):
    """
    Return ``u`` as a text (unicode) string.

    Values that are already text are returned unchanged; anything else is
    decoded as UTF-8.
    """
    # type(u"") is ``unicode`` on Python 2 and ``str`` on Python 3, so the
    # helper does not depend on the Python 2-only ``unicode`` builtin.
    text_type = type(u"")
    if isinstance(u, text_type):
        return u
    return text_type(u, "utf8")
def gen_xml(url):
    """
    Write the current field mapping to an XML file named after ``url``.

    The output has the shape
    ``<add><doc><field name="...">value</field>...</doc></add>`` and is
    written, preceded by a UTF-8 byte-order mark, to
    ``<g_xmls_folder>/<url2filename(url)>.xml``.

    Entries whose value is ``None`` or a blank string are deleted from the
    mapping before the document is built; this mutates the mapping itself.

    NOTE(review): the name ``dict`` used below must refer to a module-level
    mapping that shadows the builtin -- it is not a parameter of this
    function; confirm where it is populated.
    NOTE(review): the element layout looks like a Solr-style update
    document -- confirm.
    """
    xml_filename = os.path.join(g_xmls_folder, url2filename(url) + ".xml")
    xml = minidom.Document()
    add = xml.createElement("add")
    xml.appendChild(add)
    doc = xml.createElement("doc")
    add.appendChild(doc)

    def c(na, va):
        "create field node"
        field = xml.createElement("field")
        field.setAttribute("name", na)
        field.appendChild(xml.createTextNode(va))
        doc.appendChild(field)

    # Iterate over a snapshot of the keys so entries can be deleted safely.
    for k in list(dict.keys()):
        value = dict[k]
        is_text = isinstance(value, (types.StringType, types.UnicodeType))
        if value is None or (is_text and value.strip() == ""):
            del dict[k]

    for k, v in dict.items():
        # Numeric fields such as stock are also stored as strings.
        # NOTE(review): on Python 2, str() raises UnicodeEncodeError for
        # unicode values containing non-ASCII characters -- confirm inputs.
        c(k, str(v))

    import codecs
    f = codecs.open(xml_filename, "w")
    try:
        f.write(codecs.BOM_UTF8)
        f.write(xml.toxml("utf-8"))
    finally:
        # Close the handle even if serialising or writing fails.
        f.close()
    print("生成文件%s" % xml_filename)

查看更多关于最终用于html截取，同python版本的详细内容...

声明：本文来自网络，不代表【好得很程序员自学网】立场，转载请注明出处：http://haodehen.cn/did43320

更新时间：2022-09-24 阅读：41次

上一篇：爬虫新方法

下一篇：ruby查找字符串