diff --git a/scripts/import_wikipedia.py b/scripts/import_wikipedia.py index b316fd9..d46a2ba 100755 --- a/scripts/import_wikipedia.py +++ b/scripts/import_wikipedia.py @@ -1,81 +1,663 @@ -#!/usr/bin/env python3 -# Before you start, run: -# -> pip install clint -import argparse -import sqlite3 +#!/usr/bin/python +# -*- coding: utf-8 -*- +# +# ============================================================================= +# Version: 2.7 (Oct 14, 2013) +# Author: Giuseppe Attardi (attardi@di.unipi.it), University of Pisa +# Antonio Fuschetto (fuschett@di.unipi.it), University of Pisa +# +# Contributors: +# Leonardo Souza (lsouza@amtera.com.br) +# Juan Manuel Caicedo (juan@cavorite.com) +# Humberto Pereira (begini@gmail.com) +# Siegfried-A. Gevatter (siegfried@gevatter.com) +# Pedro Assis (pedroh2306@gmail.com) +# Sean Gallagher (stgallag@gmail.com) +# +# ============================================================================= +# Copyright (c) 2009. Giuseppe Attardi (attardi@di.unipi.it). +# ============================================================================= +# This file is part of Tanl. +# +# Tanl is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License, version 3, +# as published by the Free Software Foundation. +# +# Tanl is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# ============================================================================= + +"""Wikipedia Extractor: +Extracts and cleans text from Wikipedia database dump and stores output in the +watsonsim database. + +Usage: + import_wikipedia.py [options] + +Options: + -c, --compress : compress output files using bzip + -b, --bytes= n[KM] : put specified bytes per output file (default 500K) + -B, --base= URL : base URL for the Wikipedia pages + -l, --link : preserve links + -n NS, --ns NS : accepted namespaces (separated by commas) + -o, --output= dir : place output files in specified directory (default + current) + -s, --sections : preserve sections + -h, --help : display this help and exit +""" + +import sys +import gc +import getopt +import urllib +import re import bz2 -import code -from threading import Thread -from clint.textui import progress -from lxml import etree -parser = argparse.ArgumentParser(description="Import TREC data into sqlite3") -parser.add_argument("-t", "--table", default="documents", help="SQL table to dump into") -parser.add_argument("db", help="SQLite database") -parser.add_argument("xmlbz2", help="Input bzipped wikipedia xml file") -args = parser.parse_args() - - - -def ns(tag): - # Convenience method to tack on the namespace. - # TODO: Find a better way - return "{http://www.mediawiki.org/xml/export-0.8/}" + tag - -# This can be one line but it gets really long... -gen = bz2.BZ2File(args.xmlbz2) -gen = etree.iterparse(gen, tag=ns("page")) -gen = enumerate(gen) -gen = progress.bar(gen, label="Importing Wikipedia..", width=50, expected_size=4465000) - -pages = [] -redirects = [] -background = None - -def commit(page_index, pages, redirects): - db = sqlite3.connect(args.db) - db.executescript(""" - -- Finish commits faster: truncate instead of delete. 
- pragma journal_mode = WAL; - -- Don't wait for the disk - pragma synchronous = OFF; - -- Enabling foreign keys means we would have to pend many redirects - -- That would take too much memory - pragma foreign_keys = OFF; - """) - db.executemany("insert or replace into redirect_{table}(source_title, target_docid, target_title) values (?, ?, ?);".format(table=args.table), redirects) - #db.executemany("insert into {table} (docno, title, text, source) values (?,?,?,'wikipedia');".format(table=args.table), pages) - #if not (page_index % 100000): - # db.execute("insert into search_{table}(search_{table}) values ('merge=200,8');".format(table=args.table)) # Clean search trees a bit - db.commit() - db.close() - -for page_index, (event, page) in gen: - redirect = page.xpath("*[local-name()='redirect']/@title") - title = page.findtext(ns("title")).strip() - if redirect: - redirect = redirect[0].strip() - redirects.append([ - title, # Source title - # Target docid - "wikipedia-full-text-{target_title}".format(target_title=redirect), - redirect # Target title - ]) - #else: - # pages.append([ - # "wikipedia-full-text-{title}".format(title=title), # docno - # title, - # page.findtext(ns("revision") + "/" + ns("text")) # text - # ]) - if not (page_index % 10000): - if background: - background.join() - background = Thread(target=commit, args=(page_index,pages,redirects)) - background.start() - pages = [] - redirects = [] - page.clear() - del page - -# Clean the tree the last time. -# It takes a long time and we don't do that many searches anyway. -# db.execute("insert into search_{table}(search_{table}) values ('optimize');".format(table=args.table)) +import os.path +from htmlentitydefs import name2codepoint + +import sqlite3 +import nltk +import random +from collections import Counter + +### PARAMS #################################################################### + +# This is obtained from the dump itself +prefix = None + +## +# Whether to preseve links in output +# +keepLinks = False + +## +# Whether to transform sections into HTML +# +keepSections = False + +## +# Recognize only these namespaces +# w: Internal links to the Wikipedia +# wiktionary: Wiki dictionry +# wikt: shortcut for Wikctionry +# +acceptedNamespaces = set(['w', 'wiktionary', 'wikt']) + +## +# Drop these elements from article text +# +discardElements = set([ + 'gallery', 'timeline', 'noinclude', 'pre', + 'table', 'tr', 'td', 'th', 'caption', + 'form', 'input', 'select', 'option', 'textarea', + 'ul', 'li', 'ol', 'dl', 'dt', 'dd', 'menu', 'dir', + 'ref', 'references', 'img', 'imagemap', 'source' + ]) + +#========================================================================= +# +# MediaWiki Markup Grammar + +# Template = "{{" [ "msg:" | "msgnw:" ] PageName { "|" [ ParameterName "=" AnyText | AnyText ] } "}}" ; +# Extension = "<" ? extension ? ">" AnyText "" ; +# NoWiki = "" | "" ( InlineText | BlockText ) "" ; +# Parameter = "{{{" ParameterName { Parameter } [ "|" { AnyText | Parameter } ] "}}}" ; +# Comment = "" | "" ; +# +# ParameterName = ? uppercase, lowercase, numbers, no spaces, some special chars ? 
; +# +#=========================================================================== + +# Program version +version = '2.7' + +def open_database(database_filename): + db_conn = sqlite3.connect(database_filename) + db_cursor = db_conn.cursor() + db_cursor.execute("PRAGMA synchronous=off;") + db_cursor.execute("PRAGMA busy_timeout=10000000;") + return db_conn + +##### Main function ########################################################### +def WikiDocument(id, title, text, source_tag, db): + paragraph_index = 0 + outlinks = [] + for paragraph in compact(clean(outlinks, text)): + # Cut out the section headers. They don't help for text search really. + # And we are not advanced enough to get anything from it if we tried. + if paragraph.count(' '): + content_id = random.getrandbits(63) + db.execute("INSERT INTO content (id, text) VALUES (?, ?);", + (content_id, paragraph)) + db.execute("INSERT INTO meta (id, title, source, paragraph, reference) " + "VALUES (?, ?, ?, ?, ?);", + (content_id, title, source_tag, paragraph_index, "WDB:%i" % content_id)) + + paragraph_index += 1 + + link_counts = Counter(outlinks) + db.executemany("INSERT INTO wiki_links(tag, source, target, count) VALUES" + " (?, ?, ?, ?);", + [(source_tag, title, target, count) + for target, count in link_counts.items()]) + if random.random() < 0.001: db.commit() + +def get_url(id, prefix): + return "%s?curid=%s" % (prefix, id) + +#------------------------------------------------------------------------------ + +selfClosingTags = [ 'br', 'hr', 'nobr', 'ref', 'references' ] + +# handle 'a' separately, depending on keepLinks +ignoredTags = [ + 'b', 'big', 'blockquote', 'center', 'cite', 'div', 'em', + 'font', 'h1', 'h2', 'h3', 'h4', 'hiero', 'i', 'kbd', 'nowiki', + 'p', 'plaintext', 's', 'small', 'span', 'strike', 'strong', + 'sub', 'sup', 'tt', 'u', 'var', +] + +placeholder_tags = {'math':'formula', 'code':'codice'} + +## +# Normalize title +def normalizeTitle(title): + # remove leading whitespace and underscores + title = title.strip(' _') + # replace sequences of whitespace and underscore chars with a single space + title = re.compile(r'[\s_]+').sub(' ', title) + + m = re.compile(r'([^:]*):(\s*)(\S(?:.*))').match(title) + if m: + prefix = m.group(1) + if m.group(2): + optionalWhitespace = ' ' + else: + optionalWhitespace = '' + rest = m.group(3) + + ns = prefix.capitalize() + if ns in acceptedNamespaces: + # If the prefix designates a known namespace, then it might be + # followed by optional whitespace that should be removed to get + # the canonical page name + # (e.g., "Category: Births" should become "Category:Births"). + title = ns + ":" + rest.capitalize() + else: + # No namespace, just capitalize first letter. + # If the part before the colon is not a known namespace, then we must + # not remove the space after the colon (if any), e.g., + # "3001: The_Final_Odyssey" != "3001:The_Final_Odyssey". + # However, to get the canonical page name we must contract multiple + # spaces into one, because + # "3001: The_Final_Odyssey" != "3001: The_Final_Odyssey". + title = prefix.capitalize() + ":" + optionalWhitespace + rest + else: + # no namespace, just capitalize first letter + title = title.capitalize(); + return title + +## +# Removes HTML or XML character references and entities from a text string. +# +# @param text The HTML (or XML) source text. +# @return The plain text, as a Unicode string, if necessary. 
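+# Example (illustrative only):
+#   unescape("&amp;")  -> u"&"       (named entity)
+#   unescape("&#931;") -> u"\u03a3"  (decimal character reference)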
+ +def unescape(text): + def fixup(m): + text = m.group(0) + code = m.group(1) + try: + if text[1] == "#": # character reference + if text[2] == "x": + return unichr(int(code[1:], 16)) + else: + return unichr(int(code)) + else: # named entity + return unichr(name2codepoint[code]) + except: + return text # leave as is + + return re.sub("&#?(\w+);", fixup, text) + +# Match HTML comments +comment = re.compile(r'', re.DOTALL) + +# Match elements to ignore +discard_element_patterns = [] +for tag in discardElements: + pattern = re.compile(r'<\s*%s\b[^>]*>.*?<\s*/\s*%s>' % (tag, tag), re.DOTALL | re.IGNORECASE) + discard_element_patterns.append(pattern) + +# Match ignored tags +ignored_tag_patterns = [] +def ignoreTag(tag): + left = re.compile(r'<\s*%s\b[^>]*>' % tag, re.IGNORECASE) + right = re.compile(r'<\s*/\s*%s>' % tag, re.IGNORECASE) + ignored_tag_patterns.append((left, right)) + +for tag in ignoredTags: + ignoreTag(tag) + +# Match selfClosing HTML tags +selfClosing_tag_patterns = [] +for tag in selfClosingTags: + pattern = re.compile(r'<\s*%s\b[^/]*/\s*>' % tag, re.DOTALL | re.IGNORECASE) + selfClosing_tag_patterns.append(pattern) + +# Match HTML placeholder tags +placeholder_tag_patterns = [] +for tag, repl in placeholder_tags.items(): + pattern = re.compile(r'<\s*%s(\s*| [^>]+?)>.*?<\s*/\s*%s\s*>' % (tag, tag), re.DOTALL | re.IGNORECASE) + placeholder_tag_patterns.append((pattern, repl)) + +# Match preformatted lines +preformatted = re.compile(r'^ .*?$', re.MULTILINE) + +# Match external links (space separates second optional parameter) +externalLink = re.compile(r'\[\w+.*? (.*?)\]') +externalLinkNoAnchor = re.compile(r'\[\w+[&\]]*\]') + +# Matches bold/italic +bold_italic = re.compile(r"'''''([^']*?)'''''") +bold = re.compile(r"'''(.*?)'''") +italic_quote = re.compile(r"''\"(.*?)\"''") +italic = re.compile(r"''([^']*)''") +quote_quote = re.compile(r'""(.*?)""') + +# Matches space +spaces = re.compile(r' {2,}') + +# Matches dots +dots = re.compile(r'\.{4,}') + +# Matches one specific Wiktionary template +wiktLink = re.compile(r'\{\{m\|[^|]+\|([^\}|]+)[^\}]+\}\}') +wiktComp = re.compile(r'\{\{term\|([^\|]+)\|[^\}]+\}\}') + +# A matching function for nested expressions, e.g. namespaces and tables. 
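+# Example (illustrative only):
+#   dropNested("a {{cite {{nested}} x}} b", r"{{", r"}}") -> "a  b"
+# i.e. the whole outermost {{...}} block is removed, nested braces and all.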
+def dropNested(text, openDelim, closeDelim): + openRE = re.compile(openDelim) + closeRE = re.compile(closeDelim) + # partition text in separate blocks { } { } + matches = [] # pairs (s, e) for each partition + nest = 0 # nesting level + start = openRE.search(text, 0) + if not start: + return text + end = closeRE.search(text, start.end()) + next = start + while end: + next = openRE.search(text, next.end()) + if not next: # termination + while nest: # close all pending + nest -=1 + end0 = closeRE.search(text, end.end()) + if end0: + end = end0 + else: + break + matches.append((start.start(), end.end())) + break + while end.end() < next.start(): + # { } { + if nest: + nest -= 1 + # try closing more + last = end.end() + end = closeRE.search(text, end.end()) + if not end: # unbalanced + if matches: + span = (matches[0][0], last) + else: + span = (start.start(), last) + matches = [span] + break + else: + matches.append((start.start(), end.end())) + # advance start, find next close + start = next + end = closeRE.search(text, next.end()) + break # { } + if next != start: + # { { } + nest += 1 + # collect text outside partitions + res = '' + start = 0 + for s, e in matches: + res += text[start:s] + start = e + res += text[start:] + return res + +def dropSpans(matches, text): + """Drop from text the blocks identified in matches""" + matches.sort() + res = '' + start = 0 + for s, e in matches: + res += text[start:s] + start = e + res += text[start:] + return res + +# Match interwiki links, | separates parameters. +# First parameter is displayed, also trailing concatenated text included +# ini display, e.g. s for plural). +# +# Can be nested [[File:..|..[[..]]..|..]], [[Category:...]], etc. +# We first expand inner ones, than remove enclosing ones. +# +wikiLink = re.compile(r'\[\[([^[]*?)(?:\|([^[]*?))?\]\](\w*)') + +parametrizedLink = re.compile(r'\[\[.*?\]\]') + +# Function applied to wikiLinks + +# linkpairs: [(String, String)] +def get_tag_maker(linkpairs): + def make_anchor_tag(match): + global keepLinks + link = match.group(1) + colon = link.find(':') + if colon > 0 and link[:colon] not in acceptedNamespaces: + return '' + trail = match.group(3) + anchor = match.group(2) + if not anchor: + anchor = link + anchor += trail + linkpairs.append(anchor) + + if keepLinks: + return '%s' % (link, anchor) + else: + return anchor + return make_anchor_tag + +def clean(outlinks, text): + # Make an exception to the no-templates rule + text = wiktLink.sub(r'\1', text) + text = wiktComp.sub(r'\1', text) + + # FIXME: templates should be expanded + # Drop transclusions (template, parser functions) + # See: http://www.mediawiki.org/wiki/Help:Templates + text = dropNested(text, r'{{', r'}}') + + # Drop tables + text = dropNested(text, r'{\|', r'\|}') + + # Expand links + text = wikiLink.sub(get_tag_maker(outlinks), text) + # Drop all remaining ones + text = parametrizedLink.sub('', text) + + # Handle external links + text = externalLink.sub(r'\1', text) + text = externalLinkNoAnchor.sub('', text) + + # Handle bold/italic/quote + text = bold_italic.sub(r'\1', text) + text = bold.sub(r'\1', text) + text = italic_quote.sub(r'"\1"', text) + text = italic.sub(r'"\1"', text) + text = quote_quote.sub(r'\1', text) + text = text.replace("'''", '').replace("''", '"') + + ################ Process HTML ############### + + # turn into HTML + text = unescape(text) + # do it again (&nbsp;) + text = unescape(text) + + # Collect spans + + matches = [] + # Drop HTML comments + for m in comment.finditer(text): + 
matches.append((m.start(), m.end())) + + # Drop self-closing tags + for pattern in selfClosing_tag_patterns: + for m in pattern.finditer(text): + matches.append((m.start(), m.end())) + + # Drop ignored tags + for left, right in ignored_tag_patterns: + for m in left.finditer(text): + matches.append((m.start(), m.end())) + for m in right.finditer(text): + matches.append((m.start(), m.end())) + + # Bulk remove all spans + text = dropSpans(matches, text) + + # Cannot use dropSpan on these since they may be nested + # Drop discarded elements + for pattern in discard_element_patterns: + text = pattern.sub('', text) + + # Expand placeholders + for pattern, placeholder in placeholder_tag_patterns: + index = 1 + for match in pattern.finditer(text): + text = text.replace(match.group(), '%s_%d' % (placeholder, index)) + index += 1 + + text = text.replace('<<', u'«').replace('>>', u'»') + + ############################################# + + # Drop preformatted + # This can't be done before since it may remove tags + text = preformatted.sub('', text) + + # Cleanup text + text = text.replace('\t', ' ') + text = spaces.sub(' ', text) + text = dots.sub('...', text) + text = re.sub(u' (,:\.\)\]»)', r'\1', text) + text = re.sub(u'(\[\(«) ', r'\1', text) + text = re.sub(r'\n\W+?\n', '\n', text) # lines with only punctuations + text = text.replace(',,', ',').replace(',.', '.') + return text + +section = re.compile(r'(==+)\s*(.*?)\s*\1') + +def compact(text): + """Deal with headers, lists, empty sections, residuals of tables""" + page = [] # list of paragraph + headers = {} # Headers for unfilled sections + emptySection = False # empty sections are discarded + inList = False # whether opened
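+    # Example (illustrative only), with keepSections off:
+    #   compact("== History ==\nFounded in 1854.")
+    # yields ["History.", "Founded in 1854."]: the pending header is only
+    # emitted once a non-empty body line follows it.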
    + + for line in text.split('\n'): + + if not line: + continue + # Handle section titles + m = section.match(line) + if m: + title = m.group(2) + lev = len(m.group(1)) + if keepSections: + page.append("%s" % (lev, title, lev)) + if title and title[-1] not in '!?': + title += '.' + headers[lev] = title + # drop previous headers + for i in headers.keys(): + if i > lev: + del headers[i] + emptySection = True + continue + # Handle page title + if line.startswith('++'): + title = line[2:-2] + if title: + if title[-1] not in '!?': + title += '.' + page.append(title) + # handle lists + elif line[0] in '*#:;': + if keepSections: + page.append("
  • %s
  • " % line[1:]) + else: + continue + # Drop residuals of lists + elif line[0] in '{|' or line[-1] in '}': + continue + # Drop irrelevant lines + elif (line[0] == '(' and line[-1] == ')') or line.strip('.-') == '': + continue + elif len(headers): + items = headers.items() + items.sort() + for (i, v) in items: + page.append(v) + headers.clear() + page.append(line) # first line + emptySection = False + elif not emptySection: + page.append(line) + + return page + +def handle_unicode(entity): + numeric_code = int(entity[2:-1]) + if numeric_code >= 0x10000: return '' + return unichr(numeric_code) + +#------------------------------------------------------------------------------ + +### READER ################################################################### + +tagRE = re.compile(r'(.*?)<(/?\w+)[^>]*>(?:([^<]*)(<.*?>)?)?') + +def process_data(input, source_tag, database): + global prefix + + page = [] + id = None + inText = False + redirect = False + for line in input: + line = line.decode('utf-8') + tag = '' + if '<' in line: + m = tagRE.search(line) + if m: + tag = m.group(2) + if tag == 'page': + page = [] + redirect = False + elif tag == 'id' and not id: + id = m.group(3) + elif tag == 'title': + title = m.group(3) + elif tag == 'redirect': + redirect = True + elif tag == 'text': + inText = True + line = line[m.start(3):m.end(3)] + '\n' + page.append(line) + if m.lastindex == 4: # open-close + inText = False + elif tag == '/text': + if m.group(1): + page.append(m.group(1) + '\n') + inText = False + elif inText: + page.append(line) + elif tag == '/page': + colon = title.find(':') + if (colon < 0 or title[:colon] in acceptedNamespaces) and \ + not redirect: + print id, title.encode('utf-8') + sys.stdout.flush() + WikiDocument(id, title, ''.join(page), source_tag, database) + id = None + page = [] + elif tag == 'base': + # discover prefix from the xml dump file + # /mediawiki/siteinfo/base + base = m.group(3) + prefix = base[:base.rfind("/")] + database.commit() + +### CL INTERFACE ############################################################ + +def show_help(): + print >> sys.stdout, __doc__, + +def show_usage(script_name): + print >> sys.stderr, 'Usage: %s [options]' % script_name + +## +# Minimum size of output files +minFileSize = 200 * 1024 + +def main(): + global keepLinks, keepSections, prefix, acceptedNamespaces + script_name = os.path.basename(sys.argv[0]) + + try: + long_opts = ['help', 'basename=', 'links', 'ns=', 'sections', 'version'] + opts, args = getopt.gnu_getopt(sys.argv[1:], 'hln:B:sv', long_opts) + except getopt.GetoptError: + show_usage(script_name) + sys.exit(1) + + compress = False + file_size = 500 * 1024 + output_dir = '.' 
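+    # Typical invocation (illustrative; adjust names and paths to your setup):
+    #   bzcat enwiki-pages-articles.xml.bz2 | ./import_wikipedia.py wiki data/watsonsim.db
+    # The dump is read from stdin; the first positional argument is the source
+    # tag stored with every paragraph, the second is the SQLite database to fill.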
+ + for opt, arg in opts: + if opt in ('-h', '--help'): + show_help() + sys.exit() + elif opt in ('-l', '--links'): + keepLinks = True + elif opt in ('-s', '--sections'): + keepSections = True + elif opt in ('-B', '--base'): + prefix = arg + elif opt in ('-n', '--ns'): + acceptedNamespaces = set(arg.split(',')) + elif opt in ('-v', '--version'): + print 'WikiExtractor.py version:', version + sys.exit(0) + + if len(args) != 2: + show_usage(script_name) + sys.exit(4) + else: + source_tag = args[0] + database_filename = args[1] + + if not os.path.isdir(output_dir): + try: + os.makedirs(output_dir) + except: + print >> sys.stderr, 'Could not create: ', output_dir + return + + if not keepLinks: + ignoreTag('a') + + process_data( + sys.stdin, + source_tag, + open_database(database_filename) + ) + +if __name__ == '__main__': + main() diff --git a/src/main/java/scripts/ParallelStats.java b/src/main/java/scripts/ParallelStats.java new file mode 100644 index 0000000..b2277b3 --- /dev/null +++ b/src/main/java/scripts/ParallelStats.java @@ -0,0 +1,254 @@ +package scripts; + +import static org.junit.Assert.fail; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStreamReader; +import java.sql.SQLException; +import java.util.ArrayList; +import java.util.Date; +import java.util.List; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.TimeUnit; +import java.util.logging.Level; +import java.util.logging.Logger; + +import org.apache.http.NameValuePair; +import org.apache.http.client.fluent.Form; +import org.apache.http.client.fluent.Request; +import org.eclipse.jgit.lib.Repository; +import org.eclipse.jgit.storage.file.FileRepositoryBuilder; + +import uncc2014watsonsim.Answer; +import uncc2014watsonsim.DBQuestionSource; +import uncc2014watsonsim.DefaultPipeline; +import uncc2014watsonsim.Question; +import uncc2014watsonsim.StringUtils; + +/** + * + * @author Phani Rahul + */ +public class ParallelStats { + + /** + * @param args the command line arguments + * @throws Exception + */ + public static void main(String[] args) throws Exception { + // Oversubscribing makes scheduling the CPU-scheduler's problem + ExecutorService pool = Executors.newFixedThreadPool(50); + for (int i=0; i < 5000; i += 100) { + pool.execute(new SingleTrainingResult(i)); + } + pool.shutdown(); + + try { + pool.awaitTermination(2, TimeUnit.DAYS); + } catch (InterruptedException ex) { + Logger.getLogger(ParallelStats.class.getName()).log(Level.SEVERE, null, ex); + } + System.out.println("Done."); + } +} + +class SingleTrainingResult extends Thread { + int offset; + + public SingleTrainingResult(int offset) { + this.offset = offset; + } + + public void run() { + String sql = String.format("INNER JOIN cache ON query = question GROUP BY question LIMIT 100 OFFSET %d", offset); + try { + new StatsGenerator("IBL with redirects (v2)", sql).run(); + } catch (SQLException e) { + e.printStackTrace(); + fail("Database missing, invalid, or out of date. Check that you " + + "have the latest version."); + } + } +} + +/** + * This private class runs all the kinds of statistics in the background. + *

 + * It measures:
 + * 1. Overall (top) accuracy
 + * 2. Top-3 accuracy
 + * 3. Mean Reciprocal Rank (MRR), aka mean inverse rank.
 + *    It is only calculated on questions where the correct answer was one
 + *    of the candidate answers. Thus, Scorers and the Learner should use
 + *    MRR as a guide, looking to approach 1.0 (see the worked example below).
 + * 4. Availability, aka binary recall.
 + *    This is more of an issue with the Searchers, which should strive for
 + *    high binary recall. Precision eventually comes into play too, but it is
 + *    not calculated, because the intention is for Scorers to improve it
 + *    instead of filtering answers out early in the Searchers.
 + * 5. A histogram of accuracy by confidence. In theory, it should be more
 + *    accurate when it is more confident. That has not yet panned out.
 + * 6. Miscellaneous facts like the Git commit, for later reference.
    + *
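 + * A worked example of the rank bookkeeping (illustrative numbers only):
 + * if the correct answer comes back at rank 0 (the top slot) for one question
 + * and at rank 3 for another, onCorrectAnswer adds 1/1 + 1/4 = 1.25 to
 + * total_inverse_rank and counts both questions toward "available".
 + *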

    + * It also prints out a number each time it finishes a question, simply to + * relieve some of the boredom of watching it calculate. Expect to see: 0 1 2 + * 3 ... + * + * There is only one method to call, which is basically just a procedure. But + * internally there are several private functions to aid organization. + * + * @author Phani Rahul + * @author Sean Gallagher + */ +class StatsGenerator { + String dataset; + private DBQuestionSource questionsource; + // correct[n] =def= number of correct answers at rank n + int[] correct = new int[100]; + int available = 0; + double total_inverse_rank = 0; + int total_answers = 0; + + double runtime; + int[] conf_correct = new int[100]; + int[] conf_hist = new int[100]; + + /** + * Generate statistics on a specific set of questions + * + * To understand the query, see {@link DBQuestionSource}. + * @param dataset What to name the result when it is posted online. + * @param question_query The SQL filters for the questions. + * @throws Exception + */ + public StatsGenerator(String dataset, String question_query) throws SQLException { + this.dataset = dataset; + questionsource = new DBQuestionSource(question_query); + } + + /** Measure how accurate the top question is as a histogram across confidence */ + private void calculateConfidenceHistogram(Question question) { + if (question.size() >= 1) { + // Supposing there is at least one answer + Answer a = question.get(0); + // Clamp to [0, 99] + int bin = (int)(a.score() * 99); + bin = Math.max(0, Math.min(bin, 99)); + if(a.equals(question.answer)) conf_correct[bin]++; + conf_hist[bin]++; + } + } + + /** Callback for every correct answer */ + public void onCorrectAnswer(Question question, Answer candidate, int rank) { + total_inverse_rank += 1 / ((double)rank + 1); + available++; + // Clamp the rank to 100. Past that we don't have a histogram. + correct[rank < 100 ? rank : 99]++; + } + + /** Send Statistics to the server */ + private void report() { + + // At worst, give an empty branch and commit + String branch = "", commit = ""; + if (System.getenv("TRAVIS_BRANCH") != null) { + // Use CI information if possible. + branch = System.getenv("TRAVIS_BRANCH"); + commit = System.getenv("TRAVIS_COMMIT"); + } else { + // Otherwise take a stab at it ourselves. + try { + Repository repo = new FileRepositoryBuilder() + .readEnvironment() + .findGitDir() + .build(); + commit = repo + .resolve("HEAD") + .abbreviate(10) + .name(); + if (commit == null) { + commit = ""; + System.err.println("Problem finding git repository.\n" + + "Resulting stats will be missing information."); + } + branch = repo.getBranch(); + } catch (IOException ex) { + // Well at least we tried. 
+ } + } + // Generate report + List response = Form.form() + .add("run[branch]", branch) + .add("run[commit_hash]", commit.substring(0, 10)) + .add("run[dataset]", dataset) + .add("run[top]", String.valueOf(correct[0])) + .add("run[top3]", String.valueOf(correct[0] + correct[1] + correct[2])) + .add("run[available]", String.valueOf(available)) + .add("run[rank]", String.valueOf(total_inverse_rank)) + .add("run[total_questions]", String.valueOf(questionsource.size())) + .add("run[total_answers]", String.valueOf(total_answers)) + .add("run[confidence_histogram]", StringUtils.join(conf_hist, " ")) + .add("run[confidence_correct_histogram]", StringUtils.join(conf_correct, " ")) + .add("run[runtime]", String.valueOf(runtime)) + .build(); + try { + Request.Post("http://watsonsim.herokuapp.com/runs.json").bodyForm(response).execute(); + } catch (IOException e) { + System.err.println("Error uploading stats. Ignoring. " + + "Details follow."); + e.printStackTrace(); + } + + + System.out.println("" + correct[0] + " of " + questionsource.size() + " correct"); + System.out.println("" + available + " of " + questionsource.size() + " could have been"); + System.out.println("Mean Inverse Rank " + total_inverse_rank); + } + + + /** Run statistics, then upload to the server */ + public void run() { + long start_time = System.nanoTime(); + + + System.out.println("Asking Questions"); + for (int i=0; i passages, + Map scores, + String candidate_text) { + this.passages = passages; + this.scores = scores; + this.candidate_text = candidate_text; + } + /** * Create an Answer with one implicitly defined Passage */ @@ -153,9 +163,35 @@ public int compareTo(Answer other) { } /** Change this Answer to include all the information of another - * TODO: What should we do to merge scores? 
*/ - public void merge(Answer other) { - passages.addAll(other.passages); + * HACK: We average the scores but we should probably use a + * pluggable binary operator*/ + public static Answer merge(List others) { + Map scores = new HashMap<>(); + List passages = new ArrayList<>(); + String candidate_text; + + // Merge all the passages + for (Answer other : others) + passages.addAll(other.passages); + + // Merge the scores + Set all_score_names = new HashSet<>(); + for (Answer other : others) all_score_names.addAll(other.scores.keySet()); + /// Just average them for now - THIS IS A HACK + for (String score_name : all_score_names) { + double total=0; + for (Answer other : others) { + Double score = other.scores.get(score_name); + if (score != null) total += score; + } + scores.put(score_name, total / others.size()); + } + + // Pick the first candidate answer + candidate_text = others.get(0).candidate_text; + + // Now make an answer from it + return new Answer(passages, scores, candidate_text); } diff --git a/src/main/java/uncc2014watsonsim/Database.java b/src/main/java/uncc2014watsonsim/Database.java new file mode 100644 index 0000000..49b0b8f --- /dev/null +++ b/src/main/java/uncc2014watsonsim/Database.java @@ -0,0 +1,72 @@ +package uncc2014watsonsim; + +import java.sql.Connection; +import java.sql.DriverManager; +import java.sql.PreparedStatement; +import java.sql.ResultSet; +import java.sql.SQLException; +import java.util.Arrays; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Properties; +import java.util.Set; + +public class Database { + private Connection conn; + + public Database() { + try { + Class.forName("org.sqlite.JDBC"); + Properties props = new Properties(); + props.put("busy_timeout", "30000"); + conn = DriverManager.getConnection("jdbc:sqlite:data/watsonsim.db", props); + conn.createStatement().execute("PRAGMA journal_mode = WAL;"); + conn.createStatement().execute("PRAGMA synchronous = OFF;"); + // JDBC's SQLite uses autocommit (So commit() is redundant) + // Furthermore, close() is a no-op as long as the results are commit()'d + + if (!sanityCheck()) { + System.out.println(String.format("Warning: Database missing or malformed.")); + } + } catch (SQLException | ClassNotFoundException e2) { + e2.printStackTrace(); + throw new RuntimeException("Can't run without a database."); + } + } + + /** Simple wrapper for creating an SQL statement */ + public PreparedStatement prep(String sql) { + PreparedStatement ps; + try { + ps = conn.prepareStatement(sql); + ps.setFetchSize(100); + } catch (SQLException e) { + e.printStackTrace(); + throw new RuntimeException("Can't prepare an SQL statement \"" + sql + "\""); + } + return ps; + } + + /** Check that the SQLite DB we opened contains the right tables + * You would do this rather than check if the file exists because SQLite + * creates the file implicitly and it simply has no contents. 
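 + * For example, opening a path that does not exist silently creates an
 + * empty database file; sanityCheck() then returns false because
 + * sqlite_master lists none of the expected tables.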
+ * */ + public boolean sanityCheck() { + Set existent_tables = new HashSet(); + try { + ResultSet sql = prep("select tbl_name from sqlite_master;").executeQuery(); + while (sql.next()) { + existent_tables.add(sql.getString("tbl_name")); + } + } catch (SQLException e) { + // There was a problem executing the query + return false; + } + + return existent_tables.containsAll(Arrays.asList(new String[]{ + "meta", "content", "redirects", "questions", "results", "cache" + })); + } + +} diff --git a/src/main/java/uncc2014watsonsim/researchers/Merge.java b/src/main/java/uncc2014watsonsim/researchers/Merge.java index 9def38b..b12a87f 100644 --- a/src/main/java/uncc2014watsonsim/researchers/Merge.java +++ b/src/main/java/uncc2014watsonsim/researchers/Merge.java @@ -1,5 +1,8 @@ package uncc2014watsonsim.researchers; +import java.util.ArrayList; +import java.util.List; + import uncc2014watsonsim.Answer; import uncc2014watsonsim.Question; @@ -7,19 +10,43 @@ public class Merge extends Researcher { @Override /** Call merge on any two answers with the same title */ public void question(Question q) { - // The left cursor moves right - for (int first_ai=0; first_aifirst_ai; second_ai--) { - Answer first_a = q.get(first_ai); - Answer second_a = q.get(second_ai); - // Merge if necessary - //TODO: This uses more or less exact matching. We should do better. - if (second_a.matches(first_a)) { - first_a.merge(second_a); - q.remove(second_ai); + List> answer_blocks = new ArrayList<>(); + // Arrange the answers into blocks + for (Answer original : q) { + List target = null; + for (List block : answer_blocks) { + for (Answer example : block) { + // Look through the examples in this topic + // If it matches, choose to put it in this block and quit. + if (original.matches(example)) { + target = block; + break; + } + } + // Found a good option. break again + if (target != null) { + break; } } + if (target == null) { + // Make a new topic for this answer + List new_block = new ArrayList<>(); + new_block.add(original); + answer_blocks.add(new_block); + } else { + // Use the old topic + target.add(original); + } + } + + // Merge the blocks + q.clear(); + for (List block : answer_blocks) { + if (block.size() > 1) { + q.add(Answer.merge(block)); + } else { + q.add(block.get(0)); + } } } diff --git a/src/main/java/uncc2014watsonsim/researchers/RedirectSynonyms.java b/src/main/java/uncc2014watsonsim/researchers/RedirectSynonyms.java index 86d17ed..7f981d3 100644 --- a/src/main/java/uncc2014watsonsim/researchers/RedirectSynonyms.java +++ b/src/main/java/uncc2014watsonsim/researchers/RedirectSynonyms.java @@ -3,6 +3,8 @@ import java.sql.PreparedStatement; import java.sql.ResultSet; import java.sql.SQLException; +import java.util.ArrayList; +import java.util.List; import uncc2014watsonsim.Answer; import uncc2014watsonsim.Question; @@ -17,22 +19,25 @@ public class RedirectSynonyms extends Researcher { SQLiteDB db = new SQLiteDB("sources"); + PreparedStatement s = db.prep( + "SELECT source from wiki_redirects where target = ? 
collate nocase;"); + @Override - public void answer(Question q, Answer a) { - PreparedStatement s = db.prep("select source_title from redirect_documents inner join documents on docno=target_docid where title = ?;"); + public void question(Question q) { + List prev_answers = new ArrayList<>(); + prev_answers.addAll(q); - try { - s.setString(1, a.candidate_text); - ResultSet results = s.executeQuery(); - while (results.next()) { - Answer new_a = new Answer(results.getString("source_title")); - new_a.passages.addAll(a.passages); - q.add(new_a); + for (Answer a : prev_answers) { + try { + s.setString(1, a.candidate_text); + ResultSet results = s.executeQuery(); + while (results.next()) { + q.add(new Answer(a.passages, a.scores, results.getString("source"))); + } + } catch (SQLException e) { + // Just don't make any synonyms. + return; } - } catch (SQLException e) { - // Just don't make any synonyms. - return; } - } } diff --git a/src/main/java/uncc2014watsonsim/researchers/WekaTee.java b/src/main/java/uncc2014watsonsim/researchers/WekaTee.java index e738bc1..4bef617 100644 --- a/src/main/java/uncc2014watsonsim/researchers/WekaTee.java +++ b/src/main/java/uncc2014watsonsim/researchers/WekaTee.java @@ -17,19 +17,17 @@ /** Pipe Answer scores to an ARFF file for Weka */ public class WekaTee extends Researcher { - private Instances data; - public WekaTee() { - FastVector attributes = new FastVector(); - // Answer score names - for (String name : Score.answer_score_names) - attributes.addElement(new Attribute(name)); - // Passage score names - for (int passage_i=0; passage_i schema = new TreeSet<>(); + // But in memory there is no schema because it changes + List> dataset = new ArrayList<>(); + + + // Make every run unique, but overwrite between questions + // This way, you still get /something/ if you interrupt it + private Date start_time = new Date(); @Override public void research(Question q) { @@ -44,12 +42,34 @@ public void question(Question q) { } } - @Override - public void complete() { + /** + * Export the current dataset to an Arff file + */ + private void exportToFile() { + FastVector attributes = new FastVector(); + // Answer score names + for (String name: schema) + attributes.addElement(new Attribute(name)); + Instances data = new Instances("Watsonsim captured question stream", attributes, 0); + + // Fill in all the rows in sorted order, then export. + int schema_len = schema.size(); + for (Map dataset_row : dataset) { + double[] row = new double[schema_len]; + Arrays.fill(row, Double.NaN); + int col_idx = 0; + for (String column : schema) { + Double value = dataset_row.get(column); + row[col_idx++] = value == null ? Double.NaN : value; + } + data.add(new Instance(1.0, row)); + } + + // Save the results to a file ArffSaver saver = new ArffSaver(); saver.setInstances(data); try { - saver.setFile(new File("data/weka-log.arff")); + saver.setFile(new File("data/weka-log." + start_time + Thread.currentThread().getId() + ".arff")); saver.writeBatch(); } catch (IOException e) { System.out.println("Failed to write Weka log. Ignoring.");
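The WekaTee change above buffers answer scores as plain maps and keeps a running, sorted union of score names, so new score columns can appear at any point in a run; only at export time are the maps flattened into fixed-width rows, with Double.NaN standing in for scores an answer never received. A minimal standalone sketch of that flattening step (score names and class name are illustrative; no Weka dependency):

    import java.util.*;

    /** Illustrative only: mirrors WekaTee's sparse-map-to-dense-row step. */
    class SparseRowSketch {
        public static void main(String[] args) {
            // Sorted union of every score name seen so far (WekaTee keeps this in a TreeSet).
            SortedSet<String> schema = new TreeSet<>(Arrays.asList("SKIP_BIGRAM", "WORD_PROXIMITY"));
            // One buffered answer: scores it never received are simply absent from the map.
            Map<String, Double> answer = new HashMap<>();
            answer.put("WORD_PROXIMITY", 0.8);

            // Flatten into one column per schema entry, NaN where the score is missing.
            double[] row = new double[schema.size()];
            int col = 0;
            for (String name : schema) {
                Double value = answer.get(name);
                row[col++] = (value == null) ? Double.NaN : value;
            }
            System.out.println(Arrays.toString(row)); // [NaN, 0.8]
        }
    }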