diff --git a/scripts/import_wikipedia.py b/scripts/import_wikipedia.py index b316fd9..d46a2ba 100755 --- a/scripts/import_wikipedia.py +++ b/scripts/import_wikipedia.py @@ -1,81 +1,663 @@ -#!/usr/bin/env python3 -# Before you start, run: -# -> pip install clint -import argparse -import sqlite3 +#!/usr/bin/python +# -*- coding: utf-8 -*- +# +# ============================================================================= +# Version: 2.7 (Oct 14, 2013) +# Author: Giuseppe Attardi (attardi@di.unipi.it), University of Pisa +# Antonio Fuschetto (fuschett@di.unipi.it), University of Pisa +# +# Contributors: +# Leonardo Souza (lsouza@amtera.com.br) +# Juan Manuel Caicedo (juan@cavorite.com) +# Humberto Pereira (begini@gmail.com) +# Siegfried-A. Gevatter (siegfried@gevatter.com) +# Pedro Assis (pedroh2306@gmail.com) +# Sean Gallagher (stgallag@gmail.com) +# +# ============================================================================= +# Copyright (c) 2009. Giuseppe Attardi (attardi@di.unipi.it). +# ============================================================================= +# This file is part of Tanl. +# +# Tanl is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License, version 3, +# as published by the Free Software Foundation. +# +# Tanl is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# ============================================================================= + +"""Wikipedia Extractor: +Extracts and cleans text from Wikipedia database dump and stores output in the +watsonsim database. + +Usage: + import_wikipedia.py [options] + +Options: + -c, --compress : compress output files using bzip + -b, --bytes= n[KM] : put specified bytes per output file (default 500K) + -B, --base= URL : base URL for the Wikipedia pages + -l, --link : preserve links + -n NS, --ns NS : accepted namespaces (separated by commas) + -o, --output= dir : place output files in specified directory (default + current) + -s, --sections : preserve sections + -h, --help : display this help and exit +""" + +import sys +import gc +import getopt +import urllib +import re import bz2 -import code -from threading import Thread -from clint.textui import progress -from lxml import etree -parser = argparse.ArgumentParser(description="Import TREC data into sqlite3") -parser.add_argument("-t", "--table", default="documents", help="SQL table to dump into") -parser.add_argument("db", help="SQLite database") -parser.add_argument("xmlbz2", help="Input bzipped wikipedia xml file") -args = parser.parse_args() - - - -def ns(tag): - # Convenience method to tack on the namespace. - # TODO: Find a better way - return "{http://www.mediawiki.org/xml/export-0.8/}" + tag - -# This can be one line but it gets really long... -gen = bz2.BZ2File(args.xmlbz2) -gen = etree.iterparse(gen, tag=ns("page")) -gen = enumerate(gen) -gen = progress.bar(gen, label="Importing Wikipedia..", width=50, expected_size=4465000) - -pages = [] -redirects = [] -background = None - -def commit(page_index, pages, redirects): - db = sqlite3.connect(args.db) - db.executescript(""" - -- Finish commits faster: truncate instead of delete. 
- pragma journal_mode = WAL; - -- Don't wait for the disk - pragma synchronous = OFF; - -- Enabling foreign keys means we would have to pend many redirects - -- That would take too much memory - pragma foreign_keys = OFF; - """) - db.executemany("insert or replace into redirect_{table}(source_title, target_docid, target_title) values (?, ?, ?);".format(table=args.table), redirects) - #db.executemany("insert into {table} (docno, title, text, source) values (?,?,?,'wikipedia');".format(table=args.table), pages) - #if not (page_index % 100000): - # db.execute("insert into search_{table}(search_{table}) values ('merge=200,8');".format(table=args.table)) # Clean search trees a bit - db.commit() - db.close() - -for page_index, (event, page) in gen: - redirect = page.xpath("*[local-name()='redirect']/@title") - title = page.findtext(ns("title")).strip() - if redirect: - redirect = redirect[0].strip() - redirects.append([ - title, # Source title - # Target docid - "wikipedia-full-text-{target_title}".format(target_title=redirect), - redirect # Target title - ]) - #else: - # pages.append([ - # "wikipedia-full-text-{title}".format(title=title), # docno - # title, - # page.findtext(ns("revision") + "/" + ns("text")) # text - # ]) - if not (page_index % 10000): - if background: - background.join() - background = Thread(target=commit, args=(page_index,pages,redirects)) - background.start() - pages = [] - redirects = [] - page.clear() - del page - -# Clean the tree the last time. -# It takes a long time and we don't do that many searches anyway. -# db.execute("insert into search_{table}(search_{table}) values ('optimize');".format(table=args.table)) +import os.path +from htmlentitydefs import name2codepoint + +import sqlite3 +import nltk +import random +from collections import Counter + +### PARAMS #################################################################### + +# This is obtained from the dump itself +prefix = None + +## +# Whether to preseve links in output +# +keepLinks = False + +## +# Whether to transform sections into HTML +# +keepSections = False + +## +# Recognize only these namespaces +# w: Internal links to the Wikipedia +# wiktionary: Wiki dictionry +# wikt: shortcut for Wikctionry +# +acceptedNamespaces = set(['w', 'wiktionary', 'wikt']) + +## +# Drop these elements from article text +# +discardElements = set([ + 'gallery', 'timeline', 'noinclude', 'pre', + 'table', 'tr', 'td', 'th', 'caption', + 'form', 'input', 'select', 'option', 'textarea', + 'ul', 'li', 'ol', 'dl', 'dt', 'dd', 'menu', 'dir', + 'ref', 'references', 'img', 'imagemap', 'source' + ]) + +#========================================================================= +# +# MediaWiki Markup Grammar + +# Template = "{{" [ "msg:" | "msgnw:" ] PageName { "|" [ ParameterName "=" AnyText | AnyText ] } "}}" ; +# Extension = "<" ? extension ? ">" AnyText "" ; +# NoWiki = "" | "" ( InlineText | BlockText ) "" ; +# Parameter = "{{{" ParameterName { Parameter } [ "|" { AnyText | Parameter } ] "}}}" ; +# Comment = "" | "" ; +# +# ParameterName = ? uppercase, lowercase, numbers, no spaces, some special chars ? 
; +# +#=========================================================================== + +# Program version +version = '2.7' + +def open_database(database_filename): + db_conn = sqlite3.connect(database_filename) + db_cursor = db_conn.cursor() + db_cursor.execute("PRAGMA synchronous=off;") + db_cursor.execute("PRAGMA busy_timeout=10000000;") + return db_conn + +##### Main function ########################################################### +def WikiDocument(id, title, text, source_tag, db): + paragraph_index = 0 + outlinks = [] + for paragraph in compact(clean(outlinks, text)): + # Cut out the section headers. They don't help for text search really. + # And we are not advanced enough to get anything from it if we tried. + if paragraph.count(' '): + content_id = random.getrandbits(63) + db.execute("INSERT INTO content (id, text) VALUES (?, ?);", + (content_id, paragraph)) + db.execute("INSERT INTO meta (id, title, source, paragraph, reference) " + "VALUES (?, ?, ?, ?, ?);", + (content_id, title, source_tag, paragraph_index, "WDB:%i" % content_id)) + + paragraph_index += 1 + + link_counts = Counter(outlinks) + db.executemany("INSERT INTO wiki_links(tag, source, target, count) VALUES" + " (?, ?, ?, ?);", + [(source_tag, title, target, count) + for target, count in link_counts.items()]) + if random.random() < 0.001: db.commit() + +def get_url(id, prefix): + return "%s?curid=%s" % (prefix, id) + +#------------------------------------------------------------------------------ + +selfClosingTags = [ 'br', 'hr', 'nobr', 'ref', 'references' ] + +# handle 'a' separately, depending on keepLinks +ignoredTags = [ + 'b', 'big', 'blockquote', 'center', 'cite', 'div', 'em', + 'font', 'h1', 'h2', 'h3', 'h4', 'hiero', 'i', 'kbd', 'nowiki', + 'p', 'plaintext', 's', 'small', 'span', 'strike', 'strong', + 'sub', 'sup', 'tt', 'u', 'var', +] + +placeholder_tags = {'math':'formula', 'code':'codice'} + +## +# Normalize title +def normalizeTitle(title): + # remove leading whitespace and underscores + title = title.strip(' _') + # replace sequences of whitespace and underscore chars with a single space + title = re.compile(r'[\s_]+').sub(' ', title) + + m = re.compile(r'([^:]*):(\s*)(\S(?:.*))').match(title) + if m: + prefix = m.group(1) + if m.group(2): + optionalWhitespace = ' ' + else: + optionalWhitespace = '' + rest = m.group(3) + + ns = prefix.capitalize() + if ns in acceptedNamespaces: + # If the prefix designates a known namespace, then it might be + # followed by optional whitespace that should be removed to get + # the canonical page name + # (e.g., "Category: Births" should become "Category:Births"). + title = ns + ":" + rest.capitalize() + else: + # No namespace, just capitalize first letter. + # If the part before the colon is not a known namespace, then we must + # not remove the space after the colon (if any), e.g., + # "3001: The_Final_Odyssey" != "3001:The_Final_Odyssey". + # However, to get the canonical page name we must contract multiple + # spaces into one, because + # "3001: The_Final_Odyssey" != "3001: The_Final_Odyssey". + title = prefix.capitalize() + ":" + optionalWhitespace + rest + else: + # no namespace, just capitalize first letter + title = title.capitalize(); + return title + +## +# Removes HTML or XML character references and entities from a text string. +# +# @param text The HTML (or XML) source text. +# @return The plain text, as a Unicode string, if necessary. 
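+# Example (illustrative only):
+#   unescape("&amp;")  -> u"&"       (named entity)
+#   unescape("&#931;") -> u"\u03a3"  (decimal character reference)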
+ +def unescape(text): + def fixup(m): + text = m.group(0) + code = m.group(1) + try: + if text[1] == "#": # character reference + if text[2] == "x": + return unichr(int(code[1:], 16)) + else: + return unichr(int(code)) + else: # named entity + return unichr(name2codepoint[code]) + except: + return text # leave as is + + return re.sub("&#?(\w+);", fixup, text) + +# Match HTML comments +comment = re.compile(r'', re.DOTALL) + +# Match elements to ignore +discard_element_patterns = [] +for tag in discardElements: + pattern = re.compile(r'<\s*%s\b[^>]*>.*?<\s*/\s*%s>' % (tag, tag), re.DOTALL | re.IGNORECASE) + discard_element_patterns.append(pattern) + +# Match ignored tags +ignored_tag_patterns = [] +def ignoreTag(tag): + left = re.compile(r'<\s*%s\b[^>]*>' % tag, re.IGNORECASE) + right = re.compile(r'<\s*/\s*%s>' % tag, re.IGNORECASE) + ignored_tag_patterns.append((left, right)) + +for tag in ignoredTags: + ignoreTag(tag) + +# Match selfClosing HTML tags +selfClosing_tag_patterns = [] +for tag in selfClosingTags: + pattern = re.compile(r'<\s*%s\b[^/]*/\s*>' % tag, re.DOTALL | re.IGNORECASE) + selfClosing_tag_patterns.append(pattern) + +# Match HTML placeholder tags +placeholder_tag_patterns = [] +for tag, repl in placeholder_tags.items(): + pattern = re.compile(r'<\s*%s(\s*| [^>]+?)>.*?<\s*/\s*%s\s*>' % (tag, tag), re.DOTALL | re.IGNORECASE) + placeholder_tag_patterns.append((pattern, repl)) + +# Match preformatted lines +preformatted = re.compile(r'^ .*?$', re.MULTILINE) + +# Match external links (space separates second optional parameter) +externalLink = re.compile(r'\[\w+.*? (.*?)\]') +externalLinkNoAnchor = re.compile(r'\[\w+[&\]]*\]') + +# Matches bold/italic +bold_italic = re.compile(r"'''''([^']*?)'''''") +bold = re.compile(r"'''(.*?)'''") +italic_quote = re.compile(r"''\"(.*?)\"''") +italic = re.compile(r"''([^']*)''") +quote_quote = re.compile(r'""(.*?)""') + +# Matches space +spaces = re.compile(r' {2,}') + +# Matches dots +dots = re.compile(r'\.{4,}') + +# Matches one specific Wiktionary template +wiktLink = re.compile(r'\{\{m\|[^|]+\|([^\}|]+)[^\}]+\}\}') +wiktComp = re.compile(r'\{\{term\|([^\|]+)\|[^\}]+\}\}') + +# A matching function for nested expressions, e.g. namespaces and tables. 
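+# Example (illustrative only):
+#   dropNested("a {{cite {{nested}} x}} b", r"{{", r"}}") -> "a  b"
+# i.e. the whole outermost {{...}} block is removed, nested braces and all.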
+def dropNested(text, openDelim, closeDelim): + openRE = re.compile(openDelim) + closeRE = re.compile(closeDelim) + # partition text in separate blocks { } { } + matches = [] # pairs (s, e) for each partition + nest = 0 # nesting level + start = openRE.search(text, 0) + if not start: + return text + end = closeRE.search(text, start.end()) + next = start + while end: + next = openRE.search(text, next.end()) + if not next: # termination + while nest: # close all pending + nest -=1 + end0 = closeRE.search(text, end.end()) + if end0: + end = end0 + else: + break + matches.append((start.start(), end.end())) + break + while end.end() < next.start(): + # { } { + if nest: + nest -= 1 + # try closing more + last = end.end() + end = closeRE.search(text, end.end()) + if not end: # unbalanced + if matches: + span = (matches[0][0], last) + else: + span = (start.start(), last) + matches = [span] + break + else: + matches.append((start.start(), end.end())) + # advance start, find next close + start = next + end = closeRE.search(text, next.end()) + break # { } + if next != start: + # { { } + nest += 1 + # collect text outside partitions + res = '' + start = 0 + for s, e in matches: + res += text[start:s] + start = e + res += text[start:] + return res + +def dropSpans(matches, text): + """Drop from text the blocks identified in matches""" + matches.sort() + res = '' + start = 0 + for s, e in matches: + res += text[start:s] + start = e + res += text[start:] + return res + +# Match interwiki links, | separates parameters. +# First parameter is displayed, also trailing concatenated text included +# ini display, e.g. s for plural). +# +# Can be nested [[File:..|..[[..]]..|..]], [[Category:...]], etc. +# We first expand inner ones, than remove enclosing ones. +# +wikiLink = re.compile(r'\[\[([^[]*?)(?:\|([^[]*?))?\]\](\w*)') + +parametrizedLink = re.compile(r'\[\[.*?\]\]') + +# Function applied to wikiLinks + +# linkpairs: [(String, String)] +def get_tag_maker(linkpairs): + def make_anchor_tag(match): + global keepLinks + link = match.group(1) + colon = link.find(':') + if colon > 0 and link[:colon] not in acceptedNamespaces: + return '' + trail = match.group(3) + anchor = match.group(2) + if not anchor: + anchor = link + anchor += trail + linkpairs.append(anchor) + + if keepLinks: + return '%s' % (link, anchor) + else: + return anchor + return make_anchor_tag + +def clean(outlinks, text): + # Make an exception to the no-templates rule + text = wiktLink.sub(r'\1', text) + text = wiktComp.sub(r'\1', text) + + # FIXME: templates should be expanded + # Drop transclusions (template, parser functions) + # See: http://www.mediawiki.org/wiki/Help:Templates + text = dropNested(text, r'{{', r'}}') + + # Drop tables + text = dropNested(text, r'{\|', r'\|}') + + # Expand links + text = wikiLink.sub(get_tag_maker(outlinks), text) + # Drop all remaining ones + text = parametrizedLink.sub('', text) + + # Handle external links + text = externalLink.sub(r'\1', text) + text = externalLinkNoAnchor.sub('', text) + + # Handle bold/italic/quote + text = bold_italic.sub(r'\1', text) + text = bold.sub(r'\1', text) + text = italic_quote.sub(r'"\1"', text) + text = italic.sub(r'"\1"', text) + text = quote_quote.sub(r'\1', text) + text = text.replace("'''", '').replace("''", '"') + + ################ Process HTML ############### + + # turn into HTML + text = unescape(text) + # do it again (&nbsp;) + text = unescape(text) + + # Collect spans + + matches = [] + # Drop HTML comments + for m in comment.finditer(text): + 
matches.append((m.start(), m.end())) + + # Drop self-closing tags + for pattern in selfClosing_tag_patterns: + for m in pattern.finditer(text): + matches.append((m.start(), m.end())) + + # Drop ignored tags + for left, right in ignored_tag_patterns: + for m in left.finditer(text): + matches.append((m.start(), m.end())) + for m in right.finditer(text): + matches.append((m.start(), m.end())) + + # Bulk remove all spans + text = dropSpans(matches, text) + + # Cannot use dropSpan on these since they may be nested + # Drop discarded elements + for pattern in discard_element_patterns: + text = pattern.sub('', text) + + # Expand placeholders + for pattern, placeholder in placeholder_tag_patterns: + index = 1 + for match in pattern.finditer(text): + text = text.replace(match.group(), '%s_%d' % (placeholder, index)) + index += 1 + + text = text.replace('<<', u'«').replace('>>', u'»') + + ############################################# + + # Drop preformatted + # This can't be done before since it may remove tags + text = preformatted.sub('', text) + + # Cleanup text + text = text.replace('\t', ' ') + text = spaces.sub(' ', text) + text = dots.sub('...', text) + text = re.sub(u' (,:\.\)\]»)', r'\1', text) + text = re.sub(u'(\[\(«) ', r'\1', text) + text = re.sub(r'\n\W+?\n', '\n', text) # lines with only punctuations + text = text.replace(',,', ',').replace(',.', '.') + return text + +section = re.compile(r'(==+)\s*(.*?)\s*\1') + +def compact(text): + """Deal with headers, lists, empty sections, residuals of tables""" + page = [] # list of paragraph + headers = {} # Headers for unfilled sections + emptySection = False # empty sections are discarded + inList = False # whether opened
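+    # Example (illustrative only), with keepSections off:
+    #   compact("== History ==\nFounded in 1854.")
+    # yields ["History.", "Founded in 1854."]: the pending header is only
+    # emitted once a non-empty body line follows it.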
    + + for line in text.split('\n'): + + if not line: + continue + # Handle section titles + m = section.match(line) + if m: + title = m.group(2) + lev = len(m.group(1)) + if keepSections: + page.append("%s" % (lev, title, lev)) + if title and title[-1] not in '!?': + title += '.' + headers[lev] = title + # drop previous headers + for i in headers.keys(): + if i > lev: + del headers[i] + emptySection = True + continue + # Handle page title + if line.startswith('++'): + title = line[2:-2] + if title: + if title[-1] not in '!?': + title += '.' + page.append(title) + # handle lists + elif line[0] in '*#:;': + if keepSections: + page.append("
  • %s
  • " % line[1:]) + else: + continue + # Drop residuals of lists + elif line[0] in '{|' or line[-1] in '}': + continue + # Drop irrelevant lines + elif (line[0] == '(' and line[-1] == ')') or line.strip('.-') == '': + continue + elif len(headers): + items = headers.items() + items.sort() + for (i, v) in items: + page.append(v) + headers.clear() + page.append(line) # first line + emptySection = False + elif not emptySection: + page.append(line) + + return page + +def handle_unicode(entity): + numeric_code = int(entity[2:-1]) + if numeric_code >= 0x10000: return '' + return unichr(numeric_code) + +#------------------------------------------------------------------------------ + +### READER ################################################################### + +tagRE = re.compile(r'(.*?)<(/?\w+)[^>]*>(?:([^<]*)(<.*?>)?)?') + +def process_data(input, source_tag, database): + global prefix + + page = [] + id = None + inText = False + redirect = False + for line in input: + line = line.decode('utf-8') + tag = '' + if '<' in line: + m = tagRE.search(line) + if m: + tag = m.group(2) + if tag == 'page': + page = [] + redirect = False + elif tag == 'id' and not id: + id = m.group(3) + elif tag == 'title': + title = m.group(3) + elif tag == 'redirect': + redirect = True + elif tag == 'text': + inText = True + line = line[m.start(3):m.end(3)] + '\n' + page.append(line) + if m.lastindex == 4: # open-close + inText = False + elif tag == '/text': + if m.group(1): + page.append(m.group(1) + '\n') + inText = False + elif inText: + page.append(line) + elif tag == '/page': + colon = title.find(':') + if (colon < 0 or title[:colon] in acceptedNamespaces) and \ + not redirect: + print id, title.encode('utf-8') + sys.stdout.flush() + WikiDocument(id, title, ''.join(page), source_tag, database) + id = None + page = [] + elif tag == 'base': + # discover prefix from the xml dump file + # /mediawiki/siteinfo/base + base = m.group(3) + prefix = base[:base.rfind("/")] + database.commit() + +### CL INTERFACE ############################################################ + +def show_help(): + print >> sys.stdout, __doc__, + +def show_usage(script_name): + print >> sys.stderr, 'Usage: %s [options]' % script_name + +## +# Minimum size of output files +minFileSize = 200 * 1024 + +def main(): + global keepLinks, keepSections, prefix, acceptedNamespaces + script_name = os.path.basename(sys.argv[0]) + + try: + long_opts = ['help', 'basename=', 'links', 'ns=', 'sections', 'version'] + opts, args = getopt.gnu_getopt(sys.argv[1:], 'hln:B:sv', long_opts) + except getopt.GetoptError: + show_usage(script_name) + sys.exit(1) + + compress = False + file_size = 500 * 1024 + output_dir = '.' 
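+    # Typical invocation (illustrative; adjust names and paths to your setup):
+    #   bzcat enwiki-pages-articles.xml.bz2 | ./import_wikipedia.py wiki data/watsonsim.db
+    # The dump is read from stdin; the first positional argument is the source
+    # tag stored with every paragraph, the second is the SQLite database to fill.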
+ + for opt, arg in opts: + if opt in ('-h', '--help'): + show_help() + sys.exit() + elif opt in ('-l', '--links'): + keepLinks = True + elif opt in ('-s', '--sections'): + keepSections = True + elif opt in ('-B', '--base'): + prefix = arg + elif opt in ('-n', '--ns'): + acceptedNamespaces = set(arg.split(',')) + elif opt in ('-v', '--version'): + print 'WikiExtractor.py version:', version + sys.exit(0) + + if len(args) != 2: + show_usage(script_name) + sys.exit(4) + else: + source_tag = args[0] + database_filename = args[1] + + if not os.path.isdir(output_dir): + try: + os.makedirs(output_dir) + except: + print >> sys.stderr, 'Could not create: ', output_dir + return + + if not keepLinks: + ignoreTag('a') + + process_data( + sys.stdin, + source_tag, + open_database(database_filename) + ) + +if __name__ == '__main__': + main() diff --git a/src/main/java/scripts/ParallelStats.java b/src/main/java/scripts/ParallelStats.java new file mode 100644 index 0000000..b2277b3 --- /dev/null +++ b/src/main/java/scripts/ParallelStats.java @@ -0,0 +1,254 @@ +package scripts; + +import static org.junit.Assert.fail; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStreamReader; +import java.sql.SQLException; +import java.util.ArrayList; +import java.util.Date; +import java.util.List; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.TimeUnit; +import java.util.logging.Level; +import java.util.logging.Logger; + +import org.apache.http.NameValuePair; +import org.apache.http.client.fluent.Form; +import org.apache.http.client.fluent.Request; +import org.eclipse.jgit.lib.Repository; +import org.eclipse.jgit.storage.file.FileRepositoryBuilder; + +import uncc2014watsonsim.Answer; +import uncc2014watsonsim.DBQuestionSource; +import uncc2014watsonsim.DefaultPipeline; +import uncc2014watsonsim.Question; +import uncc2014watsonsim.StringUtils; + +/** + * + * @author Phani Rahul + */ +public class ParallelStats { + + /** + * @param args the command line arguments + * @throws Exception + */ + public static void main(String[] args) throws Exception { + // Oversubscribing makes scheduling the CPU-scheduler's problem + ExecutorService pool = Executors.newFixedThreadPool(50); + for (int i=0; i < 5000; i += 100) { + pool.execute(new SingleTrainingResult(i)); + } + pool.shutdown(); + + try { + pool.awaitTermination(2, TimeUnit.DAYS); + } catch (InterruptedException ex) { + Logger.getLogger(ParallelStats.class.getName()).log(Level.SEVERE, null, ex); + } + System.out.println("Done."); + } +} + +class SingleTrainingResult extends Thread { + int offset; + + public SingleTrainingResult(int offset) { + this.offset = offset; + } + + public void run() { + String sql = String.format("INNER JOIN cache ON query = question GROUP BY question LIMIT 100 OFFSET %d", offset); + try { + new StatsGenerator("IBL with redirects (v2)", sql).run(); + } catch (SQLException e) { + e.printStackTrace(); + fail("Database missing, invalid, or out of date. Check that you " + + "have the latest version."); + } + } +} + +/** + * This private class runs all the kinds of statistics in the background. + *

 + * It measures:
 + * 1. Overall (top) accuracy
 + * 2. Top-3 accuracy
 + * 3. Mean Reciprocal Rank (MRR), aka mean inverse rank.
 + *    It is only calculated on questions where the correct answer was one
 + *    of the candidate answers. Thus, Scorers and the Learner should use
 + *    MRR as a guide, looking to approach 1.0 (see the worked example below).
 + * 4. Availability, aka binary recall.
 + *    This is more of an issue with the Searchers, which should strive for
 + *    high binary recall. Precision eventually comes into play too, but it is
 + *    not calculated, because the intention is for Scorers to improve it
 + *    instead of filtering answers out early in the Searchers.
 + * 5. A histogram of accuracy by confidence. In theory, it should be more
 + *    accurate when it is more confident. That has not yet panned out.
 + * 6. Miscellaneous facts like the Git commit, for later reference.
    + *
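 + * A worked example of the rank bookkeeping (illustrative numbers only):
 + * if the correct answer comes back at rank 0 (the top slot) for one question
 + * and at rank 3 for another, onCorrectAnswer adds 1/1 + 1/4 = 1.25 to
 + * total_inverse_rank and counts both questions toward "available".
 + *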

    + * It also prints out a number each time it finishes a question, simply to + * relieve some of the boredom of watching it calculate. Expect to see: 0 1 2 + * 3 ... + * + * There is only one method to call, which is basically just a procedure. But + * internally there are several private functions to aid organization. + * + * @author Phani Rahul + * @author Sean Gallagher + */ +class StatsGenerator { + String dataset; + private DBQuestionSource questionsource; + // correct[n] =def= number of correct answers at rank n + int[] correct = new int[100]; + int available = 0; + double total_inverse_rank = 0; + int total_answers = 0; + + double runtime; + int[] conf_correct = new int[100]; + int[] conf_hist = new int[100]; + + /** + * Generate statistics on a specific set of questions + * + * To understand the query, see {@link DBQuestionSource}. + * @param dataset What to name the result when it is posted online. + * @param question_query The SQL filters for the questions. + * @throws Exception + */ + public StatsGenerator(String dataset, String question_query) throws SQLException { + this.dataset = dataset; + questionsource = new DBQuestionSource(question_query); + } + + /** Measure how accurate the top question is as a histogram across confidence */ + private void calculateConfidenceHistogram(Question question) { + if (question.size() >= 1) { + // Supposing there is at least one answer + Answer a = question.get(0); + // Clamp to [0, 99] + int bin = (int)(a.score() * 99); + bin = Math.max(0, Math.min(bin, 99)); + if(a.equals(question.answer)) conf_correct[bin]++; + conf_hist[bin]++; + } + } + + /** Callback for every correct answer */ + public void onCorrectAnswer(Question question, Answer candidate, int rank) { + total_inverse_rank += 1 / ((double)rank + 1); + available++; + // Clamp the rank to 100. Past that we don't have a histogram. + correct[rank < 100 ? rank : 99]++; + } + + /** Send Statistics to the server */ + private void report() { + + // At worst, give an empty branch and commit + String branch = "", commit = ""; + if (System.getenv("TRAVIS_BRANCH") != null) { + // Use CI information if possible. + branch = System.getenv("TRAVIS_BRANCH"); + commit = System.getenv("TRAVIS_COMMIT"); + } else { + // Otherwise take a stab at it ourselves. + try { + Repository repo = new FileRepositoryBuilder() + .readEnvironment() + .findGitDir() + .build(); + commit = repo + .resolve("HEAD") + .abbreviate(10) + .name(); + if (commit == null) { + commit = ""; + System.err.println("Problem finding git repository.\n" + + "Resulting stats will be missing information."); + } + branch = repo.getBranch(); + } catch (IOException ex) { + // Well at least we tried. 
+ } + } + // Generate report + List response = Form.form() + .add("run[branch]", branch) + .add("run[commit_hash]", commit.substring(0, 10)) + .add("run[dataset]", dataset) + .add("run[top]", String.valueOf(correct[0])) + .add("run[top3]", String.valueOf(correct[0] + correct[1] + correct[2])) + .add("run[available]", String.valueOf(available)) + .add("run[rank]", String.valueOf(total_inverse_rank)) + .add("run[total_questions]", String.valueOf(questionsource.size())) + .add("run[total_answers]", String.valueOf(total_answers)) + .add("run[confidence_histogram]", StringUtils.join(conf_hist, " ")) + .add("run[confidence_correct_histogram]", StringUtils.join(conf_correct, " ")) + .add("run[runtime]", String.valueOf(runtime)) + .build(); + try { + Request.Post("http://watsonsim.herokuapp.com/runs.json").bodyForm(response).execute(); + } catch (IOException e) { + System.err.println("Error uploading stats. Ignoring. " + + "Details follow."); + e.printStackTrace(); + } + + + System.out.println("" + correct[0] + " of " + questionsource.size() + " correct"); + System.out.println("" + available + " of " + questionsource.size() + " could have been"); + System.out.println("Mean Inverse Rank " + total_inverse_rank); + } + + + /** Run statistics, then upload to the server */ + public void run() { + long start_time = System.nanoTime(); + + + System.out.println("Asking Questions"); + for (int i=0; i passages, + Map scores, + String candidate_text) { + this.passages = passages; + this.scores = scores; + this.candidate_text = candidate_text; + } + /** * Create an Answer with one implicitly defined Passage */ @@ -153,9 +163,35 @@ public int compareTo(Answer other) { } /** Change this Answer to include all the information of another - * TODO: What should we do to merge scores? 
*/ - public void merge(Answer other) { - passages.addAll(other.passages); + * HACK: We average the scores but we should probably use a + * pluggable binary operator*/ + public static Answer merge(List others) { + Map scores = new HashMap<>(); + List passages = new ArrayList<>(); + String candidate_text; + + // Merge all the passages + for (Answer other : others) + passages.addAll(other.passages); + + // Merge the scores + Set all_score_names = new HashSet<>(); + for (Answer other : others) all_score_names.addAll(other.scores.keySet()); + /// Just average them for now - THIS IS A HACK + for (String score_name : all_score_names) { + double total=0; + for (Answer other : others) { + Double score = other.scores.get(score_name); + if (score != null) total += score; + } + scores.put(score_name, total / others.size()); + } + + // Pick the first candidate answer + candidate_text = others.get(0).candidate_text; + + // Now make an answer from it + return new Answer(passages, scores, candidate_text); } diff --git a/src/main/java/uncc2014watsonsim/Database.java b/src/main/java/uncc2014watsonsim/Database.java new file mode 100644 index 0000000..49b0b8f --- /dev/null +++ b/src/main/java/uncc2014watsonsim/Database.java @@ -0,0 +1,72 @@ +package uncc2014watsonsim; + +import java.sql.Connection; +import java.sql.DriverManager; +import java.sql.PreparedStatement; +import java.sql.ResultSet; +import java.sql.SQLException; +import java.util.Arrays; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Properties; +import java.util.Set; + +public class Database { + private Connection conn; + + public Database() { + try { + Class.forName("org.sqlite.JDBC"); + Properties props = new Properties(); + props.put("busy_timeout", "30000"); + conn = DriverManager.getConnection("jdbc:sqlite:data/watsonsim.db", props); + conn.createStatement().execute("PRAGMA journal_mode = WAL;"); + conn.createStatement().execute("PRAGMA synchronous = OFF;"); + // JDBC's SQLite uses autocommit (So commit() is redundant) + // Furthermore, close() is a no-op as long as the results are commit()'d + + if (!sanityCheck()) { + System.out.println(String.format("Warning: Database missing or malformed.")); + } + } catch (SQLException | ClassNotFoundException e2) { + e2.printStackTrace(); + throw new RuntimeException("Can't run without a database."); + } + } + + /** Simple wrapper for creating an SQL statement */ + public PreparedStatement prep(String sql) { + PreparedStatement ps; + try { + ps = conn.prepareStatement(sql); + ps.setFetchSize(100); + } catch (SQLException e) { + e.printStackTrace(); + throw new RuntimeException("Can't prepare an SQL statement \"" + sql + "\""); + } + return ps; + } + + /** Check that the SQLite DB we opened contains the right tables + * You would do this rather than check if the file exists because SQLite + * creates the file implicitly and it simply has no contents. 
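 + * For example, opening a path that does not exist silently creates an
 + * empty database file; sanityCheck() then returns false because
 + * sqlite_master lists none of the expected tables.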
+ * */ + public boolean sanityCheck() { + Set existent_tables = new HashSet(); + try { + ResultSet sql = prep("select tbl_name from sqlite_master;").executeQuery(); + while (sql.next()) { + existent_tables.add(sql.getString("tbl_name")); + } + } catch (SQLException e) { + // There was a problem executing the query + return false; + } + + return existent_tables.containsAll(Arrays.asList(new String[]{ + "meta", "content", "redirects", "questions", "results", "cache" + })); + } + +} diff --git a/src/main/java/uncc2014watsonsim/researchers/Merge.java b/src/main/java/uncc2014watsonsim/researchers/Merge.java index 9def38b..b12a87f 100644 --- a/src/main/java/uncc2014watsonsim/researchers/Merge.java +++ b/src/main/java/uncc2014watsonsim/researchers/Merge.java @@ -1,5 +1,8 @@ package uncc2014watsonsim.researchers; +import java.util.ArrayList; +import java.util.List; + import uncc2014watsonsim.Answer; import uncc2014watsonsim.Question; @@ -7,19 +10,43 @@ public class Merge extends Researcher { @Override /** Call merge on any two answers with the same title */ public void question(Question q) { - // The left cursor moves right - for (int first_ai=0; first_aifirst_ai; second_ai--) { - Answer first_a = q.get(first_ai); - Answer second_a = q.get(second_ai); - // Merge if necessary - //TODO: This uses more or less exact matching. We should do better. - if (second_a.matches(first_a)) { - first_a.merge(second_a); - q.remove(second_ai); + List> answer_blocks = new ArrayList<>(); + // Arrange the answers into blocks + for (Answer original : q) { + List target = null; + for (List block : answer_blocks) { + for (Answer example : block) { + // Look through the examples in this topic + // If it matches, choose to put it in this block and quit. + if (original.matches(example)) { + target = block; + break; + } + } + // Found a good option. break again + if (target != null) { + break; } } + if (target == null) { + // Make a new topic for this answer + List new_block = new ArrayList<>(); + new_block.add(original); + answer_blocks.add(new_block); + } else { + // Use the old topic + target.add(original); + } + } + + // Merge the blocks + q.clear(); + for (List block : answer_blocks) { + if (block.size() > 1) { + q.add(Answer.merge(block)); + } else { + q.add(block.get(0)); + } } } diff --git a/src/main/java/uncc2014watsonsim/researchers/RedirectSynonyms.java b/src/main/java/uncc2014watsonsim/researchers/RedirectSynonyms.java index 86d17ed..7f981d3 100644 --- a/src/main/java/uncc2014watsonsim/researchers/RedirectSynonyms.java +++ b/src/main/java/uncc2014watsonsim/researchers/RedirectSynonyms.java @@ -3,6 +3,8 @@ import java.sql.PreparedStatement; import java.sql.ResultSet; import java.sql.SQLException; +import java.util.ArrayList; +import java.util.List; import uncc2014watsonsim.Answer; import uncc2014watsonsim.Question; @@ -17,22 +19,25 @@ public class RedirectSynonyms extends Researcher { SQLiteDB db = new SQLiteDB("sources"); + PreparedStatement s = db.prep( + "SELECT source from wiki_redirects where target = ? 
collate nocase;"); + @Override - public void answer(Question q, Answer a) { - PreparedStatement s = db.prep("select source_title from redirect_documents inner join documents on docno=target_docid where title = ?;"); + public void question(Question q) { + List prev_answers = new ArrayList<>(); + prev_answers.addAll(q); - try { - s.setString(1, a.candidate_text); - ResultSet results = s.executeQuery(); - while (results.next()) { - Answer new_a = new Answer(results.getString("source_title")); - new_a.passages.addAll(a.passages); - q.add(new_a); + for (Answer a : prev_answers) { + try { + s.setString(1, a.candidate_text); + ResultSet results = s.executeQuery(); + while (results.next()) { + q.add(new Answer(a.passages, a.scores, results.getString("source"))); + } + } catch (SQLException e) { + // Just don't make any synonyms. + return; } - } catch (SQLException e) { - // Just don't make any synonyms. - return; } - } } diff --git a/src/main/java/uncc2014watsonsim/researchers/WekaTee.java b/src/main/java/uncc2014watsonsim/researchers/WekaTee.java index e738bc1..4bef617 100644 --- a/src/main/java/uncc2014watsonsim/researchers/WekaTee.java +++ b/src/main/java/uncc2014watsonsim/researchers/WekaTee.java @@ -17,19 +17,17 @@ /** Pipe Answer scores to an ARFF file for Weka */ public class WekaTee extends Researcher { - private Instances data; - public WekaTee() { - FastVector attributes = new FastVector(); - // Answer score names - for (String name : Score.answer_score_names) - attributes.addElement(new Attribute(name)); - // Passage score names - for (int passage_i=0; passage_i schema = new TreeSet<>(); + // But in memory there is no schema because it changes + List> dataset = new ArrayList<>(); + + + // Make every run unique, but overwrite between questions + // This way, you still get /something/ if you interrupt it + private Date start_time = new Date(); @Override public void research(Question q) { @@ -44,12 +42,34 @@ public void question(Question q) { } } - @Override - public void complete() { + /** + * Export the current dataset to an Arff file + */ + private void exportToFile() { + FastVector attributes = new FastVector(); + // Answer score names + for (String name: schema) + attributes.addElement(new Attribute(name)); + Instances data = new Instances("Watsonsim captured question stream", attributes, 0); + + // Fill in all the rows in sorted order, then export. + int schema_len = schema.size(); + for (Map dataset_row : dataset) { + double[] row = new double[schema_len]; + Arrays.fill(row, Double.NaN); + int col_idx = 0; + for (String column : schema) { + Double value = dataset_row.get(column); + row[col_idx++] = value == null ? Double.NaN : value; + } + data.add(new Instance(1.0, row)); + } + + // Save the results to a file ArffSaver saver = new ArffSaver(); saver.setInstances(data); try { - saver.setFile(new File("data/weka-log.arff")); + saver.setFile(new File("data/weka-log." + start_time + Thread.currentThread().getId() + ".arff")); saver.writeBatch(); } catch (IOException e) { System.out.println("Failed to write Weka log. Ignoring.");
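The WekaTee change above buffers answer scores as plain maps and keeps a running, sorted union of score names, so new score columns can appear at any point in a run; only at export time are the maps flattened into fixed-width rows, with Double.NaN standing in for scores an answer never received. A minimal standalone sketch of that flattening step (score names and class name are illustrative; no Weka dependency):

    import java.util.*;

    /** Illustrative only: mirrors WekaTee's sparse-map-to-dense-row step. */
    class SparseRowSketch {
        public static void main(String[] args) {
            // Sorted union of every score name seen so far (WekaTee keeps this in a TreeSet).
            SortedSet<String> schema = new TreeSet<>(Arrays.asList("SKIP_BIGRAM", "WORD_PROXIMITY"));
            // One buffered answer: scores it never received are simply absent from the map.
            Map<String, Double> answer = new HashMap<>();
            answer.put("WORD_PROXIMITY", 0.8);

            // Flatten into one column per schema entry, NaN where the score is missing.
            double[] row = new double[schema.size()];
            int col = 0;
            for (String name : schema) {
                Double value = answer.get(name);
                row[col++] = (value == null) ? Double.NaN : value;
            }
            System.out.println(Arrays.toString(row)); // [NaN, 0.8]
        }
    }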