I tried to import UK parliamentary debates into R, but it seems that Hansard reports are too large for R. R is also very poor at handling different character encodings, so I gave up on R and wrote an importer in Python. The Python script imports the XML into a MySQL database.
#!/usr/bin/python # -*- coding: utf-8 -*- from __future__ import division import os, sys, string, re, time, datetime import xml.etree.ElementTree as ET import MySQLdb as MySQL import HTMLParser as HTML def outputConsole(values): timestamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') print(timestamp + ' ' + ' '.join(values)) def getEid(node): if('id' in node.attrib): return node.attrib['id'] else: return '' def getDate(node): if('id' in node.attrib): return node.attrib['id'].split('/')[2][0:10] else: return '' def getSid(node): if('speakerid' in node.attrib and len(node.attrib['speakerid'].split('/')) == 3): #print(node.attrib['speakerid']) return node.attrib['speakerid'].split('/')[2] else: return 0 def getSpeaker(node): if('speakername' in node.attrib): return node.attrib['speakername'].encode('utf-8') else: return '' def getTime(node): if('time' in node.attrib): parts = node.attrib['time'].split(':') return '%02d:%02d:%02d' % (int(parts[0]), int(parts[1]), int(parts[2])) else: return '00:00:00' def getText(node): texts = [] for p in speech.findall('p'): if p.text != None and len(p.text) > 0: texts.append(p.text.encode('utf-8')) return(' | '.join(texts)) def execute(query): try: cur.execute(query) except MySQL.Error, e: print('Query error: ' + query + str(e)) if __name__ == '__main__': html = HTML.HTMLParser() db = MySQL.connect(host="localhost", user="username", passwd="password", db="immigration", charset='utf8') db.autocommit(True) cur = db.cursor() xmlDir = '/home/kohei/Documents/UK immigration dictionary/UK Parlimentary debates/scrapedxml/debates' if os.path.isdir(xmlDir) == False: outputConsole(['Directory does not exist', imageDir]) sys.exit() xmlFiles = [ xmlDir + '/' + xmlFile for xmlFile in os.listdir(xmlDir) if os.path.isfile(xmlDir + '/' + xmlFile) ] execute("TRUNCATE `debate`") #print(xmlFiles) for xmlFile in xmlFiles: outputConsole(['Import', xmlFile]) doc = ET.parse(xmlFile, parser=None) for speech in doc.findall('speech'): eid = 
getEid(speech) date = getDate(speech) time = getTime(speech) sid = getSid(speech) speaker = db.escape_string(getSpeaker(speech)) text = db.escape_string(html.unescape(getText(speech))) #print(db.escape_string(text) + '\n') query = "INSERT IGNORE INTO `debate` (`eid`, `date`, `time`, `sid`, `speaker`, `text`) VALUES ('%s', '%s', '%s', '%s', '%s', '%s')" % (eid, date, time, sid, speaker, text) execute(query) sys.exit() db.close()