好得很程序员自学网

<tfoot draggable='sEl'></tfoot>

新版字符截取函数

新版字符截取函数

    return r

            

def get_part2(html,start,end,start_count=1,end_count=1,start_re=False,end_re=False):
    """
    Return the part of ``html`` between ``start`` and ``end``.

    The ``start`` and ``end`` markers themselves are not included in the
    result.  The ``end`` marker is searched for only in the text that
    follows the located ``start`` marker.  An empty string is returned when
    either marker cannot be located.

    Locating is delegated to ``find_tag_idx``.
    NOTE(review): ``find_tag_idx`` is not visible here; this code relies on
    it returning a ``(begin, stop)`` index pair with ``begin == -1`` on
    failure, and ``start_count`` / ``end_count`` presumably select the n-th
    occurrence while ``start_re`` / ``end_re`` switch to regex matching --
    confirm against its definition.

    Example (Python 2)::

        html="abcabc"
        print get_part2(html,"a","c",start_count=2)
    """
    start_pos, start_stop = find_tag_idx(html, start, start_count, use_re=start_re)
    if start_pos == -1:
        return ""

    # Only the text after the start marker is searched for the end marker.
    remainder = html[start_stop:]
    end_pos, end_stop = find_tag_idx(remainder, end, end_count, use_re=end_re)
    if end_pos == -1:
        return ""

    return remainder[:end_pos]

    

def filter_tags(html,tags=["em","dd","input","h1","h2","h3","br","a","b","span","strong","p","hr","strong","p","hr","font","div","td","tr","img","form","table"]):
    """Strip the opening and closing markup of the listed HTML tags.

    Only the tags themselves (``<tag ...>`` and ``</tag>``) are removed; the
    text between them is kept.  Matching is case-insensitive.

    Args:
        html: Text containing HTML markup.
        tags: Tag names to strip.  Each name is interpolated into a regular
            expression unescaped, so regex fragments keep working.  The
            default list is only read, never mutated.

    Returns:
        ``html`` with the markup of every listed tag removed.
    """
    result=html
    for elem in tags:
        # The lookahead forces the tag name to end right there, so stripping
        # "b" no longer also strips <body>, and "p" no longer strips <pre>.
        result=re.sub(r"(?i)<%s(?=[\s/>])[\s\S]*?>"%elem,"",result)
        result=re.sub(r"(?i)</ *%s(?=[\s>])[\s\S]*?>"%elem,"",result)
    return result

def filter_comment(html):
    """Return ``html`` with every ``<!-- ... -->`` comment removed.

    A comment may span several lines; each one ends at the first ``-->``
    that follows its ``<!--``.
    """
    comment_pattern = re.compile(r"<!--[\s\S]*?-->")
    return comment_pattern.sub('', html)

def filter_characters(html,tags=["¥"," ","]",":"]):
    """Delete every occurrence of each string in ``tags`` from ``html``.

    The strings are removed one after another, in the order given, and the
    cleaned text is returned.  The default list is only read, never mutated.
    """
    cleaned = html
    for unwanted in tags:
        cleaned = cleaned.replace(unwanted, "")
    return cleaned

def filter_int(html):
    """Extract the digits of ``html`` and return them as a normalised integer string.

    Every non-digit character is dropped and the remaining digits are joined
    into one number, so ``"a1b2"`` yields ``"12"``.  Leading zeros are
    removed by the ``int`` round-trip, and a minus sign is discarded like any
    other non-digit.

    Returns:
        The integer as a string, or ``"0"`` when ``html`` holds no digits.
    """
    digits = re.sub(r"\D+", '', html)
    try:
        return str(int(digits))
    except ValueError:
        # int() rejects the empty string left behind when there were no digits.
        return "0"

def filter_price(html):
    """Extract a decimal number from ``html`` and return it as a float string.

    Everything except digits and ``.`` is dropped, and what remains is parsed
    with ``float``.

    Returns:
        ``str(float(...))`` of the remaining characters (for example
        ``"12.5"`` or ``"100.0"``), or ``"0"`` when they do not form a valid
        number (nothing left, a lone ``.``, or more than one ``.``).
    """
    number_text = re.sub(r"[^\d.]+", '', html)
    try:
        return str(float(number_text))
    except ValueError:
        # float() rejects "", "." and strings such as "1.2.3".
        return "0"

def _(u):
    """Return ``u`` as a ``unicode`` object (Python 2).

    A value that is already ``unicode`` is returned unchanged; anything else
    is passed to ``unicode(u, "utf8")``, i.e. decoded as UTF-8.
    """
    if isinstance(u,unicode):
        return u
    return unicode(u,"utf8")

def gen_xml(url):
    """Write the collected fields to an XML file named after ``url``.

    The output path is ``g_xmls_folder`` joined with ``url2filename(url)``
    plus ``".xml"``.  The document has the shape
    ``<add><doc><field name="...">value</field>...</doc></add>``, with one
    ``<field>`` per remaining entry of ``dict``.

    NOTE(review): ``dict`` is not defined in this block.  It is presumably a
    module-level mapping of field name -> value that shadows the builtin;
    confirm against the rest of the file.

    Side effects:
        * Entries of ``dict`` whose value is ``None`` or a blank string are
          deleted from it before the document is built.
        * The file is written as a UTF-8 BOM followed by the UTF-8 encoded
          XML, and a confirmation line is printed.

    Args:
        url: Value used only to derive the output file name.
    """
    xml_filename=os.path.join(g_xmls_folder,url2filename(url)+".xml")
    xml=minidom.Document()
    add=xml.createElement("add")
    xml.appendChild(add)
    doc=xml.createElement("doc")
    add.appendChild(doc)

    def c(na,va):
        "Append a <field name=na> element with text va to the <doc> node."
        field=xml.createElement("field")
        field.setAttribute("name",na)
        field.appendChild(xml.createTextNode(va))
        doc.appendChild(field)

    # Iterate over a snapshot of the keys because entries are deleted below.
    for k in list(dict.keys()):
        v=dict[k]
        is_text=isinstance(v,(types.StringType,types.UnicodeType))
        if v is None or (is_text and v.strip()==""):
            del dict[k]

    for k,v in dict.iteritems():
        # Numeric fields such as stock are also stored as strings.
        c(k,str(v))

    import codecs
    f=codecs.open(xml_filename,"w")
    try:
        f.write(codecs.BOM_UTF8)
        f.write(xml.toxml("utf-8"))
    finally:
        # Close the handle even when a write fails.
        f.close()
    # A single parenthesised expression prints identically under Python 2.
    print("生成文件%s"%xml_filename)

查看更多关于新版字符截取函数的详细内容...

  阅读:31次