I tried to import UK parliamentary debates into R, but it seems that Hansard reports are too large for R. R is also very poor at handling different character encodings, so I gave up on R and wrote an importer in Python. The Python script imports the XML into a MySQL database.
#!/usr/bin/python
# -*- coding: utf-8 -*-
from __future__ import division
import os, sys, string, re, time, datetime
import xml.etree.ElementTree as ET
import MySQLdb as MySQL
import HTMLParser as HTML
def outputConsole(values):
    """Print the given sequence of strings on one line, prefixed with a timestamp."""
    stamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    message = ' '.join(values)
    print(stamp + ' ' + message)
def getEid(node):
    """Return the element's 'id' attribute, or '' when the attribute is absent."""
    return node.attrib.get('id', '')
def getDate(node):
    """Extract the YYYY-MM-DD date embedded in the third '/'-segment of the
    element's 'id' attribute; return '' when there is no 'id'."""
    eid = node.attrib.get('id')
    if eid is None:
        return ''
    # e.g. 'uk.org.publicwhip/debate/2010-01-05a.1.0' -> '2010-01-05'
    return eid.split('/')[2][:10]
def getSid(node):
    """Return the last component of a three-part 'speakerid' attribute
    (e.g. 'uk.org.publicwhip/member/12345' -> '12345'); otherwise 0."""
    sid = node.attrib.get('speakerid', '')
    parts = sid.split('/')
    if sid and len(parts) == 3:
        return parts[2]
    return 0
def getSpeaker(node):
    """Return the 'speakername' attribute encoded as UTF-8 bytes, or '' when missing."""
    name = node.attrib.get('speakername')
    if name is None:
        return ''
    return name.encode('utf-8')
def getTime(node):
    """Return the 'time' attribute normalised to zero-padded HH:MM:SS;
    default to '00:00:00' when the attribute is absent."""
    raw = node.attrib.get('time')
    if raw is None:
        return '00:00:00'
    fields = raw.split(':')
    return '%02d:%02d:%02d' % (int(fields[0]), int(fields[1]), int(fields[2]))
def getText(node):
    """Join the UTF-8 encoded text of the node's <p> children with ' | '.

    Bug fix: the original body iterated the module-global ``speech``
    instead of the ``node`` parameter, so the argument was silently
    ignored (and the function raised NameError outside the main loop).
    """
    texts = []
    for p in node.findall('p'):
        # skip empty or text-less paragraphs
        if p.text is not None and len(p.text) > 0:
            texts.append(p.text.encode('utf-8'))
    return ' | '.join(texts)
def execute(query):
    """Run *query* on the module-level cursor ``cur``; on failure, log the
    query and the error instead of raising (best-effort import loop).

    Fix: the original used the Python-2-only ``except MySQL.Error, e``
    form, a syntax error under Python 3; ``except ... as e`` is valid on
    Python 2.6+ and Python 3.
    """
    try:
        cur.execute(query)
    except MySQL.Error as e:
        print('Query error: ' + query + str(e))
if __name__ == '__main__':
    html = HTML.HTMLParser()
    db = MySQL.connect(host="localhost", user="username", passwd="password", db="immigration", charset='utf8')
    db.autocommit(True)
    cur = db.cursor()
    xmlDir = '/home/kohei/Documents/UK immigration dictionary/UK Parlimentary debates/scrapedxml/debates'
    if not os.path.isdir(xmlDir):
        # Bug fix: the original passed the undefined name `imageDir` here,
        # which raised NameError instead of printing the message.
        outputConsole(['Directory does not exist', xmlDir])
        sys.exit()
    xmlFiles = [xmlDir + '/' + xmlFile for xmlFile in os.listdir(xmlDir)
                if os.path.isfile(xmlDir + '/' + xmlFile)]
    execute("TRUNCATE `debate`")
    for xmlFile in xmlFiles:
        outputConsole(['Import', xmlFile])
        doc = ET.parse(xmlFile, parser=None)
        for speech in doc.findall('speech'):
            eid = getEid(speech)
            date = getDate(speech)
            # renamed from `time` to avoid shadowing the imported `time` module
            speechTime = getTime(speech)
            sid = getSid(speech)
            speaker = db.escape_string(getSpeaker(speech))
            text = db.escape_string(html.unescape(getText(speech)))
            query = "INSERT IGNORE INTO `debate` (`eid`, `date`, `time`, `sid`, `speaker`, `text`) VALUES ('%s', '%s', '%s', '%s', '%s', '%s')" % (eid, date, speechTime, sid, speaker, text)
            execute(query)
    # Bug fix: the original called sys.exit() before db.close(), so the
    # connection was never closed. (If that sys.exit() was a deliberate
    # debug stop after the first file, re-add it inside the loop.)
    db.close()
