I tried to import UK parliamentary debates into R, but it seems that Hansard reports are too large for R. R is also very poor at handling different character encodings, so I gave up on R and wrote an importer in Python. The Python script imports the XML into a MySQL database.
#!/usr/bin/python # -*- coding: utf-8 -*- from __future__ import division import os, sys, string, re, time, datetime import xml.etree.ElementTree as ET import MySQLdb as MySQL import HTMLParser as HTML def outputConsole(values): timestamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') print(timestamp + ' ' + ' '.join(values)) def getEid(node): if('id' in node.attrib): return node.attrib['id'] else: return '' def getDate(node): if('id' in node.attrib): return node.attrib['id'].split('/')[2][0:10] else: return '' def getSid(node): if('speakerid' in node.attrib and len(node.attrib['speakerid'].split('/')) == 3): #print(node.attrib['speakerid']) return node.attrib['speakerid'].split('/')[2] else: return 0 def getSpeaker(node): if('speakername' in node.attrib): return node.attrib['speakername'].encode('utf-8') else: return '' def getTime(node): if('time' in node.attrib): parts = node.attrib['time'].split(':') return '%02d:%02d:%02d' % (int(parts[0]), int(parts[1]), int(parts[2])) else: return '00:00:00' def getText(node): texts = [] for p in speech.findall('p'): if p.text != None and len(p.text) > 0: texts.append(p.text.encode('utf-8')) return(' | '.join(texts)) def execute(query): try: cur.execute(query) except MySQL.Error, e: print('Query error: ' + query + str(e)) if __name__ == '__main__': html = HTML.HTMLParser() db = MySQL.connect(host="localhost", user="username", passwd="password", db="immigration", charset='utf8') db.autocommit(True) cur = db.cursor() xmlDir = '/home/kohei/Documents/UK immigration dictionary/UK Parlimentary debates/scrapedxml/debates' if os.path.isdir(xmlDir) == False: outputConsole(['Directory does not exist', imageDir]) sys.exit() xmlFiles = [ xmlDir + '/' + xmlFile for xmlFile in os.listdir(xmlDir) if os.path.isfile(xmlDir + '/' + xmlFile) ] execute("TRUNCATE `debate`") #print(xmlFiles) for xmlFile in xmlFiles: outputConsole(['Import', xmlFile]) doc = ET.parse(xmlFile, parser=None) for speech in doc.findall('speech'): eid = 
getEid(speech) date = getDate(speech) time = getTime(speech) sid = getSid(speech) speaker = db.escape_string(getSpeaker(speech)) text = db.escape_string(html.unescape(getText(speech))) #print(db.escape_string(text) + '\n') query = "INSERT IGNORE INTO `debate` (`eid`, `date`, `time`, `sid`, `speaker`, `text`) VALUES ('%s', '%s', '%s', '%s', '%s', '%s')" % (eid, date, time, sid, speaker, text) execute(query) sys.exit() db.close()