# Encoding Unicode Data for XML and HTML




# From Python Cookbook. Recipe 1.23 - Encoding Unicode Data for XML and HTML.

# Problem: Want to encode Unicode text for output in HTML, or some other XML
# application, using a limited but popular encoding such as ASCII or latin-1

def encode_for_xml(unicode_data, encoding='ascii'):
    """Encode Unicode text to bytes suitable for XML output.

    Characters that *encoding* cannot represent are emitted as numeric
    character references (for example ``&#8364;``) through the built-in
    ``xmlcharrefreplace`` error handler.

    Args:
        unicode_data: The Unicode text to encode.
        encoding: Name of the target encoding; defaults to ``'ascii'``.

    Returns:
        The encoded byte string.
    """
    error_policy = 'xmlcharrefreplace'
    encoded = unicode_data.encode(encoding, error_policy)
    return encoded

# If you prefer to use HTML's symbolic entity references instead, you need to
# define and register a customized encoding error handler.

import codecs
from htmlentitydefs import codepoint2name

def html_replace(exc):
    """Codec error handler that substitutes HTML entity references.

    Each character in the unencodable span ``exc.object[exc.start:exc.end]``
    is replaced by its symbolic entity (``&eacute;``) when ``codepoint2name``
    knows one, and by a numeric character reference (``&#20013;``) otherwise,
    so characters without a named entity no longer abort with ``KeyError``.

    Args:
        exc: The ``UnicodeEncodeError`` or ``UnicodeTranslateError`` passed
            in by the codec machinery.

    Returns:
        A ``(replacement, resume_position)`` tuple, as required of error
        handlers registered with ``codecs.register_error``.

    Raises:
        TypeError: If *exc* is any other kind of exception.
    """
    if not isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
        # Exception instances have no __name__; report the class name.
        raise TypeError("can't handle %s" % type(exc).__name__)

    pieces = []
    for char in exc.object[exc.start:exc.end]:
        codepoint = ord(char)
        entity_name = codepoint2name.get(codepoint)
        if entity_name is not None:
            pieces.append(u'&%s;' % entity_name)
        else:
            # No symbolic name exists for this code point.
            pieces.append(u'&#%d;' % codepoint)
    return u''.join(pieces), exc.end


# Register the handler so that encode(..., 'html_replace') can find it by name.
codecs.register_error('html_replace', html_replace)


def encode_for_html(unicode_data, encoding='ascii'):
    """Encode Unicode text to bytes using the ``'html_replace'`` handler.

    The encode call names ``'html_replace'`` as its error handler, so a
    handler must be registered under that name (via
    ``codecs.register_error``) before text containing characters that
    *encoding* cannot represent is passed in.

    Args:
        unicode_data: The Unicode text to encode.
        encoding: Name of the target encoding; defaults to ``'ascii'``.

    Returns:
        The encoded byte string.
    """
    handler_name = 'html_replace'
    encoded = unicode_data.encode(encoding, handler_name)
    return encoded

if __name__ == '__main__':
    # Demo: encode a small HTML fragment containing non-ASCII characters.
    # The triple-quoted literal is closed at column 0 so that no extra
    # trailing whitespace becomes part of the text.
    data = u'''\
            <title>Encoding Test</title>
            <p>accented characters:
            <li>\xe0 (a + grave)
            <li>\xe7 (c + cedilla)
            <li>\xe9 (e + acute)
            <li>\xa3 (British pound)
            <li>\u20ac (Euro)
            <li>\u221e (Infinity)
'''
    # Alternative output using numeric character references:
    #print(encode_for_xml(data))
    # Call form of print is valid on both Python 2 and Python 3.
    print(encode_for_html(data))
# Run this
# Comments by Disqus