Blog Archives

Learning to Link With Wikipedia – II

I’m done with most of the preprocessing. Feel free to tell me how crappy my code is; just be polite, otherwise I’ll probably cry. Writing the output to disk takes ages; that’s the bottleneck. The whole thing is a bit of a hack job, though I must say I used to write worse code.
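If the write really is the slow part, one usual suspect in code like this is building the entire output with repeated string concatenation: each string = string + ... copies the whole accumulated string, so the total cost grows quadratically. A quick sketch of the difference, with str.join as the standard fix (the timings are illustrative and machine-dependent):

import time

chunks = ["word "] * 100000

start = time.time()
s = ""
for c in chunks:
    s = s + c  # copies the whole accumulated string on every iteration
print "concat:", time.time() - start

start = time.time()
s = "".join(chunks)  # linear: one pass, one allocation
print "join:  ", time.time() - start

That’s why main() below joins each article’s line and writes it out immediately instead of accumulating one giant string.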

And you can use this code if you like.

import xml.dom.minidom
import re

class xmlMine:

    def __init__(self):
        # per-instance state: stopwords and a map from titles to link words
        self.stopWordDict = {'': 1}   # dictionary of stopwords
        self.titleArticleDict = {}    # maps each title to its article's link words
        print "instantiated"

    def getStopwords(self, stopWordFile):
        #loads stopwords from file into memory
        stopWordObj = open(stopWordFile)
        for stopWord in stopWordObj.readlines():
            stopWord = stopWord.replace("\n", "")
            self.stopWordDict[stopWord] = 1
        stopWordObj.close()

    def cleanTitle(self, title):
        #removes non-ascii characters from a title
        return "".join([x for x in title if ord(x) < 128])

    def extractLinksFromText(self, textContent):
        textContent = "]] " + textContent
        textContent = textContent.replace("\n", " ")  #remove linebreaks
        textContent = textContent.replace("'", "")    #remove quotes. they mess up the regexes.

        #remove regions in wiki pages where looking for links is meaningless
        #(case-insensitive, since headings like "See also" vary in capitalisation)
        deadSections = ["References", "See Also", "External links", "Sources",
                        "Notes", "Notes and references", "Gallery"]
        for section in deadSections:
            refs = re.compile("==[\s]*" + section + "[\s]*==.+", re.IGNORECASE)
            textContent = refs.sub(" ", textContent)

        #remove wikitables as well
        refs = re.compile("\{\|[\s]*class=\"wikitable\".+?\|\}")
        textContent = refs.sub(" ", textContent)

        textContent = textContent + "[["

        #remove stuff that's not enclosed in [[ ]]
        brackets = re.compile("\]\].*?\[\[")
        textContent = brackets.sub("]] [[", textContent)
        wordList = textContent.split("]] [[")  #and keep only the link texts, sans brackets

        newWordList = []

        for word in wordList:
            word = word.lower()  #convert to lowercase
            #for piped links, drop the part before | and keep the display text
            altText = re.compile(".*?\|")
            word = altText.sub("", word)
            #replace numbers and punctuation by spaces
            numbr = re.compile("\d")
            word = numbr.sub(" ", word)
            punct = re.compile("\W")
            word = punct.sub(" ", word)

            #if spaces were added, one link may now be two or more words
            newWords = word.split(" ")

            for newWord in newWords:
                #crude stemming: drop a trailing s after a consonant
                trailingS = re.compile("^(.*[bcdfghjklmnpqrtvwxyz])s$")
                if trailingS.match(newWord) is not None:
                    newWord = newWord[:-1]
                if newWord not in self.stopWordDict:  #remove stopwords
                    if len(newWord) > 2:  #no point keeping too-short words
                        newWordList.append(newWord)
        return newWordList

    def extractTextFromXml(self, xmlFileName):
        #extracts the <title> and <text> fields from the xml file
        #and processes both
        xmlFile = xml.dom.minidom.parse(xmlFileName)
        root = xmlFile.getElementsByTagName("mediawiki")
        for mediaWiki in root:
            pageList = mediaWiki.getElementsByTagName("page")
            for page in pageList:
                titleWords = ""
                text = []
                textNodes = page.getElementsByTagName("text")
                for textNode in textNodes:
                    #guard against empty <text/> elements with no child nodes
                    if textNode.childNodes and textNode.childNodes[0].nodeType == textNode.TEXT_NODE:
                        text = self.extractLinksFromText(textNode.childNodes[0].data)
                titleNodes = page.getElementsByTagName("title")
                for titleNode in titleNodes:
                    if titleNode.childNodes and titleNode.childNodes[0].nodeType == titleNode.TEXT_NODE:
                        titleWords = self.cleanTitle(titleNode.childNodes[0].data)
                self.titleArticleDict[titleWords] = text

def main():
    a = xmlMine()
    a.getStopwords("stopwords.txt")
    a.extractTextFromXml("Wikipedia-20090505185206.xml")
    opFile = open("links.txt", "w")
    for article in a.titleArticleDict.keys():
        linkList = a.titleArticleDict[article]
        #one line per article: the title, a colon, then comma-separated link words.
        #joining and writing per article avoids the quadratic string build-up.
        line = str(article) + ":" + ",".join(str(link) for link in linkList) + "\n"
        opFile.write(line.encode('utf-8'))
    opFile.close()

if __name__ == "__main__":
    main()
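The output format is one line per article: the title, a colon, then the article’s link words separated by commas. A tiny hypothetical reader, if you want to load links.txt back in:

# hypothetical reader for the links.txt written above
for line in open("links.txt"):
    title, sep, words = line.rstrip("\n").partition(":")
    # note: a title that itself contains ':' would confuse this simple format
    print title, "->", words.split(",")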

Learning to Link with Wikipedia – I

I hope to maintain a log of the project I’m working on for my Data Mining course this quarter. I find blogging makes me feel more accountable on a day-to-day basis, and I could really use any help that comes my way on this.

So now to the problem:

Identifying which terms in a Wikipedia article need to be linked to other articles.

I have a dataset to work with. It comes with label information and the words present in each document, and I’m now trying to extract which of those words are linked.
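The core of that is just pulling out whatever sits inside [[ ]] in the raw wiki markup; everything else is cleanup. A minimal sketch, with a made-up snippet of markup:

import re

markup = "In [[computer science]], a [[linked list|list]] is a data structure."
links = re.findall(r"\[\[(.*?)\]\]", markup)
# for piped links, keep the display text after the |
targets = [link.split("|")[-1] for link in links]
print targets  # prints ['computer science', 'list']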

So, yeah, still stuck in preprocessing.

I’ll post the python script after I’m done with it. Which should happen in the next few hours. Till then, I’m offline 🙂