Blog Archives

Learning to Link With Wikipedia – II

I’m done with most of the pre-processing. Feel free to tell me how crappy my code is. Just be polite, otherwise I’ll probably cry. The script takes ages to write its output to disk; that’s the bottleneck. It’s a bit of a hack job, though I must say I used to write worse code.

And you can use this code if you like.

import xml.dom.minidom
import re

class xmlMine:
    """Extract the words used in wiki links from a MediaWiki XML export.

    For every ``<page>`` in the export, the ``<title>`` is reduced to its
    ASCII characters and mapped (in ``titleArticleDict``) to the list of
    normalised words found inside the ``[[...]]`` links of the page text.
    """

    # Class-level defaults. They stay here so ``xmlMine.stopWordDict`` keeps
    # working, but ``__init__`` gives each instance its own copy.
    stopWordDict = {'': 1}  # dictionary of stopwords ('' drops empty tokens)

    titleArticleDict = {}  # hashmap of titles mapped to articles.

    # Headings after which looking for links is meaningless.
    _SECTION_HEADINGS = (
        "References",
        "See Also",
        "External links",
        "Sources",
        "Notes",
        "Notes and references",
        "Gallery",
    )
    # Line breaks are replaced by spaces before this is applied, so the
    # trailing ``.+`` consumes everything from the heading to the end of
    # the text. Case-insensitive so "See also" is matched as well as
    # "See Also".
    _TRAILING_SECTIONS = re.compile(
        r"==[\s]*(?:" + "|".join(_SECTION_HEADINGS) + r")[\s]*==.+",
        re.IGNORECASE,
    )
    _WIKITABLE = re.compile(r'\{\|[\s]*class="wikitable".+?\|\}')
    _BETWEEN_LINKS = re.compile(r"\]\].*?\[\[")  # text outside [[...]]
    _ALT_TEXT = re.compile(r".*?\|")  # everything up to and including a "|"
    _NON_LETTER = re.compile(r"[\d\W]")  # digits and punctuation
    _TRAILING_S = re.compile(r"^(.*[bcdfghjklmnpqrtvwxyz])(s)$")

    def __init__(self):
        """Create a miner with its own stop-word and title dictionaries."""
        # Copy the class-level defaults so that loading stop words or
        # articles into one instance does not leak into every other one.
        self.stopWordDict = dict(self.stopWordDict)
        self.titleArticleDict = dict(self.titleArticleDict)

    def xmlMine(self):
        """Print ``instantiated``.

        This is an ordinary method, not a constructor; it is kept so that
        existing explicit calls continue to work.
        """
        print("instantiated")

    def getStopwords(self, stopWordFile):
        """Load stop words, one per line, from ``stopWordFile``.

        Each line (without its line terminator) becomes a key of
        ``self.stopWordDict``.
        """
        with open(stopWordFile) as stopWordObj:
            for line in stopWordObj:
                self.stopWordDict[line.rstrip("\r\n")] = 1

    def cleanTitle(self, title):
        """Return ``title`` with all non-ASCII characters removed."""
        return "".join([x for x in title if ord(x) < 128])

    def extractLinksFromText(self, textContent):
        """Return the normalised words found inside ``[[...]]`` links.

        The text is flattened to one line, trailing sections such as
        "References" and wikitables are dropped, and the content of every
        remaining link is lower-cased, reduced to the part after the last
        ``|``, and split into words on digits and punctuation. A trailing
        "s" that follows a consonant is removed; stop words and words of
        fewer than three characters are discarded.
        """
        # Sentinel "]] " at the front (and "[[" at the back, below) lets a
        # single regex strip the text before the first and after the last
        # link in the same way as the text between two links.
        textContent = "]] " + textContent
        textContent = textContent.replace("\n", " ")  # remove linebreaks
        # remove quotes. they mess up the regexes.
        textContent = textContent.replace("'", "")

        # remove regions in wiki pages where looking for links is meaningless
        textContent = self._TRAILING_SECTIONS.sub(" ", textContent)
        textContent = self._WIKITABLE.sub(" ", textContent)

        textContent = textContent + "[["

        # remove stuff that's not enclosed in [[]], then keep only the
        # link contents without the brackets
        textContent = self._BETWEEN_LINKS.sub("]] [[", textContent)
        linkContents = textContent.split("]] [[")

        newWordList = []
        for link in linkContents:
            # lower-case and remove the part before "|"
            link = self._ALT_TEXT.sub("", link.lower())
            # replace numbers and punctuation by spaces so that the link
            # falls apart into separate words
            link = self._NON_LETTER.sub(" ", link)

            for token in link.split(" "):
                # remove trailing s after consonant
                if self._TRAILING_S.match(token) is not None:
                    token = token[:-1]
                # no point of too-short words; also remove stopwords
                if len(token) > 2 and token not in self.stopWordDict:
                    newWordList.append(token)
        return newWordList

    @staticmethod
    def _firstTextChild(node):
        """Return the data of ``node``'s first child if it is a text node.

        Returns ``None`` when the element is empty or starts with a
        non-text child, instead of raising ``IndexError``.
        """
        children = node.childNodes
        if children and children[0].nodeType == node.TEXT_NODE:
            return children[0].data
        return None

    def extractTextFromXml(self, xmlFileName):
        """Parse ``xmlFileName`` and fill ``self.titleArticleDict``.

        For each ``<page>``, the cleaned ``<title>`` is mapped to the link
        words extracted from the page's ``<text>``. A page whose text
        element is empty is mapped to an empty list. When a page contains
        several ``<text>`` elements, the last non-empty one wins.
        """
        document = xml.dom.minidom.parse(xmlFileName)
        for mediaWiki in document.getElementsByTagName("mediawiki"):
            for page in mediaWiki.getElementsByTagName("page"):
                text = []
                for textNode in page.getElementsByTagName("text"):
                    content = self._firstTextChild(textNode)
                    if content is not None:
                        text = self.extractLinksFromText(content)

                for titleNode in page.getElementsByTagName("title"):
                    content = self._firstTextChild(titleNode)
                    if content is not None:
                        titleWords = self.cleanTitle(content)
                        self.titleArticleDict[titleWords] = text

def main():
    """Mine the Wikipedia export and write one line per article.

    Loads stop words from ``stopwords.txt``, extracts the link words of
    every page in ``Wikipedia-20090505185206.xml`` and writes them to
    ``links.txt`` as ``title:word1,word2,...`` lines encoded as UTF-8.
    """
    import io  # local import: io.open takes an encoding on Python 2 and 3

    miner = xmlMine()
    miner.getStopwords("stopwords.txt")
    miner.extractTextFromXml("Wikipedia-20090505185206.xml")

    # The context manager guarantees the output is flushed and closed.
    with io.open("links.txt", "w", encoding="utf-8") as opFile:
        for article, linkList in miner.titleArticleDict.items():
            # Build each line on its own. Accumulating one ever-growing
            # string across articles would either re-write all earlier
            # articles on every iteration or run them together on a single
            # line. Joining also avoids having to strip a trailing comma.
            opFile.write(u"%s:%s\n" % (article, u",".join(linkList)))

# Run the extraction only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
 main()
%d bloggers like this: