def cjk_error(e):
if not isinstance(e, UnicodeDecodeError):
raise TypeError("don't know how to handle %r" % exc)
if exc.end + 1 > len(exc.object):
raise TypeError('unknown codec ,the object too short!')
ch1 = ord(exc.object[exc.start:exc.end])
newpos = exc.end + 1
ch2 = ord(exc.object[exc.start + 1:newpos])
sk = exc.object[exc.start:newpos]
if 0x81<=ch1<=0xFE and (0x40<=ch2<=0x7E or 0x7E<=ch2<=0xFE): # GBK
return (unicode(sk,'cp936'), newpos)
if 0x81<=ch1<=0xFE and (0x40<=ch2<=0x7E or 0xA1<=ch2<=0xFE): # BIG5
return (unicode(sk,'big5'), newpos)
raise TypeError('unknown codec !')
codecs.register_error("cjk_replace", cjk_replace) a = "你" # utf8编码:'\xe4\xbd\xa0' c = unicode(a[:2],'gbk') # 正常返回 c = unicode(a, 'gbk') # UnicodeDecodeError 。错误发生在第三个字节
import codec
def cjk_replace(e):
if not isinstance(e, UnicodeDecodeError):
raise TypeError("invalid exception type %s" e)
src = e.encoding
if src in ('gbk','gb18030', 'big5'):
beg = e.start - 2
if beg >= 0:
try:
return unicode(e.object[beg:e.end], 'utf8'), e.end + 1
except:
pass
if exc.end + 1 > len(exc.object):
raise TypeError('unknown codec ,the object too short!')
ch1 = ord(exc.object[exc.start:exc.end])
newpos = exc.end + 1
ch2 = ord(exc.object[exc.start + 1:newpos])
sk = exc.object[exc.start:newpos]
if src != 'gbk' and 0x81<=ch1<=0xFE and (0x40<=ch2<=0x7E or 0x7E<=ch2<=0xFE): # GBK
return (unicode(sk,'cp936'), newpos)
if src != 'big5' and 0x81<=ch1<=0xFE and (0x40<=ch2<=0x7E or 0xA1<=ch2<=0xFE): # BIG5
return (unicode(sk,'big5'), newpos)
raise TypeError('unknown codec !')
codecs.register_error("cjk_replace", cjk_replace) 当然,这个逻辑其实还是不够严谨的。虽然对于这种混合编码这种畸形活处理有点较真儿。
不过既然python提供这样的能力,大家可以一起来讨论下,我们怎么可以做的更好?
以上就是python黑魔法之编码转换方法的详细内容,更多请关注Gxl网其它相关文章!
查看更多关于python黑魔法之编码转换方法的详细内容...
声明:本文来自网络,不代表【好得很程序员自学网】立场,转载请注明出处:http://haodehen.cn/did85642