August 11th 2009

No citations please, we're British

import re
import urllib

from lxml import etree
from BeautifulSoup import BeautifulSoup

# Strip trailing citation markers such as " [12]" from the text of every
# link inside each <content:encoded> element of a feed, then print the
# rewritten feed XML to stdout.

# Feed to clean up.
# NOTE(review): the URL is empty in this copy of the script -- fill in the
# real feed address before running.
FEED_URL = ''

# A citation marker (" [digits]") at the very end of a text node.
CITATION_RE = re.compile(r' \[\d+\]$')

tree = etree.parse(urllib.urlopen(FEED_URL))

for item in tree.xpath('//*[name()="content:encoded"]'):
    if not item.text:
        # Empty element: there is no markup to parse or rewrite.
        continue

    soup = BeautifulSoup(item.text)

    for link in soup.findAll('a'):
        # Walk a snapshot of the children, since replaceWith() swaps nodes
        # in the link's own child list while we are looping.
        for x in list(link.contents):
            # Kids and their damn hypertext
            if not isinstance(x, basestring):
                # Nested tags (e.g. <img>) are not strings; handing them to
                # the regex would raise TypeError.  Text nodes are assumed
                # to be unicode subclasses (BeautifulSoup 3) -- TODO confirm
                # if the parser library is ever upgraded.
                continue

            stripped = CITATION_RE.sub('', x)
            if stripped != x:
                x.replaceWith(stripped)

    item.text = unicode(soup)

# Parenthesised form prints the same single value under Python 2.
print(etree.tostring(tree))

You can subscribe to new posts via email or RSS.