August 11th 2009

No citations please, we're British

import re
import urllib

from lxml import etree
from BeautifulSoup import BeautifulSoup

# Rewrite the feed so that link text no longer ends in " [n]" markers and the
# last <ul> of every post is removed, then print the resulting XML to stdout.

FEED_URL = 'http://etbe.coker.com.au/feed/'

# A trailing marker such as " [3]" at the very end of a text node.
CITATION_SUFFIX = re.compile(r' \[\d+\]$')

# Close the HTTP response as soon as the document is parsed instead of
# leaving the handle open for the rest of the run.
response = urllib.urlopen(FEED_URL)
try:
    tree = etree.parse(response)
finally:
    response.close()

for item in tree.xpath('//*[name()="content:encoded"]'):
    if item.text is None:
        # An empty element has no markup to rewrite.
        continue

    soup = BeautifulSoup(item.text)

    for link in soup.findAll('a'):
        # Walk a snapshot: replaceWith() edits link.contents in place.
        for node in list(link.contents):
            # Only text nodes can be handed to the regex; child tags
            # (e.g. an image wrapped in the link) are left untouched.
            if isinstance(node, basestring):
                node.replaceWith(CITATION_SUFFIX.sub('', node))

    # Drop the final <ul> of the post, if there is one
    # (presumably the list of cited URLs -- confirm against the feed).
    lists = soup.findAll('ul')
    if lists:
        lists[-1].extract()

    item.text = unicode(soup)

# Single-argument print(...) behaves identically to the print statement
# under Python 2.
print(etree.tostring(tree))



You can subscribe to new posts via email or RSS.