Twisted3

Question

Fetch a web page with Twisted and parse its HTML.

Solution

# -*- coding: utf8 -*-
import cStringIO as StringIO

from twisted.internet import reactor
from twisted.web.client import getPage
from twisted.python.util import println
from lxml import etree

def parseHtml(html):
    """Parse HTML source into an lxml tree.

    html -- the page source; it is wrapped in an in-memory file object
            because etree.parse reads from a file-like source.

    Returns the tree built by etree.parse using an HTMLParser configured
    with encoding='utf8'.
    """
    source = StringIO.StringIO(html)
    return etree.parse(source, etree.HTMLParser(encoding='utf8'))

def extractTitle(tree):
    """Pass the parsed tree through unchanged.

    Despite its name, this callback does not currently extract anything:
    it returns the very object it was given.
    """
    # NOTE(review): title extraction appears to have been disabled on
    # purpose; the previous logic was roughly
    #     unicode(tree.xpath("//title/text()")[0])
    # Confirm whether it should be restored before relying on the name.
    result = tree
    return result

# Start the asynchronous fetch; getPage returns a Deferred that the
# callbacks below are attached to.
d = getPage('http://www.uthcode.com')
# Each callback receives the return value of the previous one.
d.addCallback(parseHtml)
d.addCallback(extractTitle)
# addBoth: print the final result on success, or the Failure on error.
d.addBoth(println)
# Stop the reactor once the result (or failure) has been printed; without
# this, reactor.run() below never returns and the script hangs.
d.addBoth(lambda _: reactor.stop())

reactor.run()
Run this
Comments by Disqus