+
+ for line in text.split('\n'):
+
+ if not line:
+ continue
+ # Handle section titles
+ m = section.match(line)
+ if m:
+ title = m.group(2)
+ lev = len(m.group(1))
+ if keepSections:
+ page.append("%s" % (lev, title, lev))
+ if title and title[-1] not in '!?':
+ title += '.'
+ headers[lev] = title
+ # drop previous headers
+ for i in headers.keys():
+ if i > lev:
+ del headers[i]
+ emptySection = True
+ continue
+ # Handle page title
+ if line.startswith('++'):
+ title = line[2:-2]
+ if title:
+ if title[-1] not in '!?':
+ title += '.'
+ page.append(title)
+ # handle lists
+ elif line[0] in '*#:;':
+ if keepSections:
+ page.append("- %s
" % line[1:])
+ else:
+ continue
+ # Drop residuals of lists
+ elif line[0] in '{|' or line[-1] in '}':
+ continue
+ # Drop irrelevant lines
+ elif (line[0] == '(' and line[-1] == ')') or line.strip('.-') == '':
+ continue
+ elif len(headers):
+ items = headers.items()
+ items.sort()
+ for (i, v) in items:
+ page.append(v)
+ headers.clear()
+ page.append(line) # first line
+ emptySection = False
+ elif not emptySection:
+ page.append(line)
+
+ return page
+
+def handle_unicode(entity):
+ # Convert a decimal numeric character reference such as "&#931;" to the
+ # corresponding Unicode character. entity[2:-1] strips the leading "&#"
+ # and the trailing ";" (decimal form only; hex "&#x..." would raise
+ # ValueError here).
+ numeric_code = int(entity[2:-1])
+ # Characters beyond the Basic Multilingual Plane are dropped —
+ # presumably because unichr() is limited to 0xFFFF on narrow Python 2
+ # builds; TODO confirm intent.
+ if numeric_code >= 0x10000: return ''
+ return unichr(numeric_code)
+
+#------------------------------------------------------------------------------
+
+### READER ###################################################################
+
+# Matches one XML tag occurrence per line of the dump:
+#   group(1) = text before the tag,
+#   group(2) = tag name (with leading '/' for closing tags),
+#   group(3) = text following the opening tag on the same line,
+#   group(4) = an optional second tag on the same line (used by the
+#              reader to detect a one-line <text>...</text> element,
+#              via m.lastindex == 4).
+tagRE = re.compile(r'(.*?)<(/?\w+)[^>]*>(?:([^<]*)(<.*?>)?)?')
+
+def process_data(input, source_tag, database):
+ # Stream a MediaWiki XML dump from `input` line by line (regex-based,
+ # no XML parser), accumulate each <page>'s id/title/text, and hand
+ # every accepted, non-redirect page to WikiDocument().
+ # NOTE(review): the surrounding diff has collapsed Python indentation;
+ # the nesting is read from the logic, not from the whitespace shown.
+ global prefix
+
+ page = []
+ id = None
+ inText = False
+ redirect = False
+ for line in input:
+ # dump files are UTF-8 encoded bytes
+ line = line.decode('utf-8')
+ tag = ''
+ if '<' in line:
+ m = tagRE.search(line)
+ if m:
+ tag = m.group(2)
+ if tag == 'page':
+ # start of a new page: reset accumulated state
+ page = []
+ redirect = False
+ elif tag == 'id' and not id:
+ # keep only the first <id> of the page (presumably the page id
+ # rather than a revision/contributor id — verify against schema)
+ id = m.group(3)
+ elif tag == 'title':
+ title = m.group(3)
+ elif tag == 'redirect':
+ redirect = True
+ elif tag == 'text':
+ inText = True
+ # keep only the content after the opening <text> tag
+ line = line[m.start(3):m.end(3)] + '\n'
+ page.append(line)
+ if m.lastindex == 4: # open-close
+ inText = False
+ elif tag == '/text':
+ # closing tag: group(1) is any text preceding </text>
+ if m.group(1):
+ page.append(m.group(1) + '\n')
+ inText = False
+ elif inText:
+ # a line with a '<' inside the text body, not a structural tag
+ page.append(line)
+ elif tag == '/page':
+ # end of page: emit unless it is a redirect or its namespace
+ # (the part of the title before ':') is not accepted.
+ # NOTE(review): assumes <title> always precedes </page>;
+ # `title` would be undefined otherwise.
+ colon = title.find(':')
+ if (colon < 0 or title[:colon] in acceptedNamespaces) and \
+ not redirect:
+ print id, title.encode('utf-8')
+ sys.stdout.flush()
+ WikiDocument(id, title, ''.join(page), source_tag, database)
+ id = None
+ page = []
+ elif tag == 'base':
+ # discover prefix from the xml dump file
+ # /mediawiki/siteinfo/base
+ base = m.group(3)
+ prefix = base[:base.rfind("/")]
+ # single commit after the whole dump has been processed
+ database.commit()
+
+### CL INTERFACE ############################################################
+
+def show_help():
+ # Print the module docstring (the usage text) to stdout; the trailing
+ # comma suppresses the extra newline of the Python 2 print statement.
+ print >> sys.stdout, __doc__,
+
+def show_usage(script_name):
+ # One-line usage hint on stderr; full option help is in show_help().
+ print >> sys.stderr, 'Usage: %s [options]' % script_name
+
+##
+# Minimum size of output files
+minFileSize = 200 * 1024
+
+def main():
+ # Command-line entry point: parse options, then stream the XML dump
+ # from stdin into the database via process_data().
+ global keepLinks, keepSections, prefix, acceptedNamespaces
+ script_name = os.path.basename(sys.argv[0])
+
+ try:
+ long_opts = ['help', 'basename=', 'links', 'ns=', 'sections', 'version']
+ opts, args = getopt.gnu_getopt(sys.argv[1:], 'hln:B:sv', long_opts)
+ except getopt.GetoptError:
+ show_usage(script_name)
+ sys.exit(1)
+
+ # NOTE(review): `compress` and `file_size` are set but never used in
+ # this chunk.
+ compress = False
+ file_size = 500 * 1024
+ output_dir = '.'
+
+ for opt, arg in opts:
+ if opt in ('-h', '--help'):
+ show_help()
+ sys.exit()
+ elif opt in ('-l', '--links'):
+ keepLinks = True
+ elif opt in ('-s', '--sections'):
+ keepSections = True
+ # NOTE(review): long option is declared as 'basename=' above, so
+ # getopt yields '--basename', and this '--base' test never matches
+ # the long form; only '-B' works. Probable bug.
+ elif opt in ('-B', '--base'):
+ prefix = arg
+ elif opt in ('-n', '--ns'):
+ acceptedNamespaces = set(arg.split(','))
+ elif opt in ('-v', '--version'):
+ print 'WikiExtractor.py version:', version
+ sys.exit(0)
+
+ # exactly two positional arguments: the source tag and the DB path
+ if len(args) != 2:
+ show_usage(script_name)
+ sys.exit(4)
+ else:
+ source_tag = args[0]
+ database_filename = args[1]
+
+ if not os.path.isdir(output_dir):
+ try:
+ os.makedirs(output_dir)
+ except:
+ print >> sys.stderr, 'Could not create: ', output_dir
+ return
+
+ # unless links are kept, strip <a> tags during extraction
+ if not keepLinks:
+ ignoreTag('a')
+
+ process_data(
+ sys.stdin,
+ source_tag,
+ open_database(database_filename)
+ )
+
+if __name__ == '__main__':
+ main()
diff --git a/src/main/java/scripts/ParallelStats.java b/src/main/java/scripts/ParallelStats.java
new file mode 100644
index 0000000..b2277b3
--- /dev/null
+++ b/src/main/java/scripts/ParallelStats.java
@@ -0,0 +1,254 @@
+package scripts;
+
+import static org.junit.Assert.fail;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.sql.SQLException;
+import java.util.ArrayList;
+import java.util.Date;
+import java.util.List;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.TimeUnit;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+
+import org.apache.http.NameValuePair;
+import org.apache.http.client.fluent.Form;
+import org.apache.http.client.fluent.Request;
+import org.eclipse.jgit.lib.Repository;
+import org.eclipse.jgit.storage.file.FileRepositoryBuilder;
+
+import uncc2014watsonsim.Answer;
+import uncc2014watsonsim.DBQuestionSource;
+import uncc2014watsonsim.DefaultPipeline;
+import uncc2014watsonsim.Question;
+import uncc2014watsonsim.StringUtils;
+
+/**
+ *
+ * @author Phani Rahul
+ */
+public class ParallelStats {
+
+ /**
+ * Fan out StatsGenerator runs over a fixed-size thread pool.
+ * Offsets 0, 100, ..., 4900 give 50 batches of 100 questions —
+ * exactly one task per pool thread.
+ * @param args the command line arguments (unused)
+ * @throws Exception
+ */
+ public static void main(String[] args) throws Exception {
+ // Oversubscribing makes scheduling the CPU-scheduler's problem
+ ExecutorService pool = Executors.newFixedThreadPool(50);
+ for (int i=0; i < 5000; i += 100) {
+ pool.execute(new SingleTrainingResult(i));
+ }
+ // no new tasks; existing ones run to completion
+ pool.shutdown();
+
+ try {
+ // block (up to 2 days) until every batch has finished
+ pool.awaitTermination(2, TimeUnit.DAYS);
+ } catch (InterruptedException ex) {
+ Logger.getLogger(ParallelStats.class.getName()).log(Level.SEVERE, null, ex);
+ }
+ System.out.println("Done.");
+ }
+}
+
+// One batch of 100 questions, starting at `offset`, run through
+// StatsGenerator.
+// NOTE(review): extends Thread but is only ever submitted to an
+// ExecutorService (pool.execute), so implementing Runnable would be the
+// cleaner contract.
+class SingleTrainingResult extends Thread {
+ int offset;
+
+ public SingleTrainingResult(int offset) {
+ this.offset = offset;
+ }
+
+ public void run() {
+ // offset is an int formatted with %d, so no SQL-injection exposure here
+ String sql = String.format("INNER JOIN cache ON query = question GROUP BY question LIMIT 100 OFFSET %d", offset);
+ try {
+ new StatsGenerator("IBL with redirects (v2)", sql).run();
+ } catch (SQLException e) {
+ e.printStackTrace();
+ // NOTE(review): JUnit's fail() outside a test runner throws an
+ // AssertionError from this worker thread; consider a plain
+ // RuntimeException or logging instead.
+ fail("Database missing, invalid, or out of date. Check that you "
+ + "have the latest version.");
+ }
+ }
+}
+
+/**
+ * This private class runs all the kinds of statistics in the background.
+ *
+ * It measures:
+ * 1. Overall (top) accuracy
+ * 2. Top-3 accuracy
+ * 3. Mean Reciprocal Rank (MRR), aka mean inverse rank.
+ * It is only calculated on questions where the correct answer was one
+ * of the candidate answers. Thus, Scorers and the Learner should use
+ * MRR as a guide, looking to approach 1.0.
+ * 4. Availability, aka binary recall.
+ * This is more of an issue with the Searchers, which should strive for
+ * high binary recall. Precision eventually comes in to play too but is
+ * not calculated because the intention is for scorers to improve it
+ * instead of filtering it out early in Searchers. Still, it comes into
+ * play.
+ * 5. A histogram of accuracy by confidence. In theory, it should be more
+ * accurate when it is more confident. That has not yet panned out.
+ * 6. Miscellaneous facts like the Git commit, for later reference.
+ *
+ * It also prints out a number each time it finishes a question, simply to
+ * relieve some of the boredom of watching it calculate. Expect to see: 0 1 2
+ * 3 ...
+ *
+ * There is only one method to call, which is basically just a procedure. But
+ * internally there are several private functions to aid organization.
+ *
+ * @author Phani Rahul
+ * @author Sean Gallagher
+ */
+class StatsGenerator {
+ String dataset;
+ private DBQuestionSource questionsource;
+ // correct[n] =def= number of correct answers at rank n
+ int[] correct = new int[100];
+ int available = 0;
+ double total_inverse_rank = 0;
+ int total_answers = 0;
+
+ double runtime;
+ int[] conf_correct = new int[100];
+ int[] conf_hist = new int[100];
+
+ /**
+ * Generate statistics on a specific set of questions
+ *
+ * To understand the query, see {@link DBQuestionSource}.
+ * @param dataset What to name the result when it is posted online.
+ * @param question_query The SQL filters for the questions.
+ * @throws Exception
+ */
+ public StatsGenerator(String dataset, String question_query) throws SQLException {
+ this.dataset = dataset;
+ questionsource = new DBQuestionSource(question_query);
+ }
+
+ /** Measure how accurate the top question is as a histogram across confidence */
+ private void calculateConfidenceHistogram(Question question) {
+ if (question.size() >= 1) {
+ // Supposing there is at least one answer
+ Answer a = question.get(0);
+ // Map the top answer's score into one of 100 bins; scores are
+ // presumably in [0, 1] (score * 99), but the clamp below makes
+ // out-of-range scores safe. Clamp to [0, 99]
+ int bin = (int)(a.score() * 99);
+ bin = Math.max(0, Math.min(bin, 99));
+ // count both "answers at this confidence" and "correct at this
+ // confidence", so accuracy-per-bin is conf_correct/conf_hist
+ if(a.equals(question.answer)) conf_correct[bin]++;
+ conf_hist[bin]++;
+ }
+ }
+
+ /** Callback for every correct answer */
+ public void onCorrectAnswer(Question question, Answer candidate, int rank) {
+ // rank is 0-based: the top answer contributes 1/1, rank 1 gives 1/2, ...
+ total_inverse_rank += 1 / ((double)rank + 1);
+ // this question was answerable: the correct answer was a candidate
+ available++;
+ // Clamp the rank to 100. Past that we don't have a histogram.
+ correct[rank < 100 ? rank : 99]++;
+ }
+
+ /** Send Statistics to the server */
+ private void report() {
+
+ // At worst, give an empty branch and commit
+ String branch = "", commit = "";
+ if (System.getenv("TRAVIS_BRANCH") != null) {
+ // Use CI information if possible.
+ branch = System.getenv("TRAVIS_BRANCH");
+ commit = System.getenv("TRAVIS_COMMIT");
+ } else {
+ // Otherwise take a stab at it ourselves.
+ try {
+ Repository repo = new FileRepositoryBuilder()
+ .readEnvironment()
+ .findGitDir()
+ .build();
+ // NOTE(review): if resolve("HEAD") returns null (no repo),
+ // .abbreviate(10) throws NPE before the null check below ever
+ // runs — the check guards the wrong spot.
+ commit = repo
+ .resolve("HEAD")
+ .abbreviate(10)
+ .name();
+ if (commit == null) {
+ commit = "";
+ System.err.println("Problem finding git repository.\n"
+ + "Resulting stats will be missing information.");
+ }
+ branch = repo.getBranch();
+ } catch (IOException ex) {
+ // Well at least we tried.
+ }
+ }
+ // Generate report
+ // NOTE(review): commit.substring(0, 10) throws
+ // StringIndexOutOfBoundsException when commit is "" (the fallback
+ // above) or shorter than 10 characters.
+ List response = Form.form()
+ .add("run[branch]", branch)
+ .add("run[commit_hash]", commit.substring(0, 10))
+ .add("run[dataset]", dataset)
+ .add("run[top]", String.valueOf(correct[0]))
+ .add("run[top3]", String.valueOf(correct[0] + correct[1] + correct[2]))
+ .add("run[available]", String.valueOf(available))
+ .add("run[rank]", String.valueOf(total_inverse_rank))
+ .add("run[total_questions]", String.valueOf(questionsource.size()))
+ .add("run[total_answers]", String.valueOf(total_answers))
+ .add("run[confidence_histogram]", StringUtils.join(conf_hist, " "))
+ .add("run[confidence_correct_histogram]", StringUtils.join(conf_correct, " "))
+ .add("run[runtime]", String.valueOf(runtime))
+ .build();
+ try {
+ // Upload is best-effort: failures are logged and ignored so the
+ // local console summary below still prints.
+ Request.Post("http://watsonsim.herokuapp.com/runs.json").bodyForm(response).execute();
+ } catch (IOException e) {
+ System.err.println("Error uploading stats. Ignoring. "
+ + "Details follow.");
+ e.printStackTrace();
+ }
+
+
+ System.out.println("" + correct[0] + " of " + questionsource.size() + " correct");
+ System.out.println("" + available + " of " + questionsource.size() + " could have been");
+ System.out.println("Mean Inverse Rank " + total_inverse_rank);
+ }
+
+
+ /** Run statistics, then upload to the server */
+ public void run() {
+ long start_time = System.nanoTime();
+
+
+ System.out.println("Asking Questions");
+ for (int i=0; i passages,
+ Map scores,
+ String candidate_text) {
+ this.passages = passages;
+ this.scores = scores;
+ this.candidate_text = candidate_text;
+ }
+
/**
* Create an Answer with one implicitly defined Passage
*/
@@ -153,9 +163,35 @@ public int compareTo(Answer other) {
}
/** Change this Answer to include all the information of another
- * TODO: What should we do to merge scores? */
- public void merge(Answer other) {
- passages.addAll(other.passages);
+ * HACK: We average the scores but we should probably use a
+ * pluggable binary operator*/
+ // Combine several equivalent Answers into one: union of passages,
+ // per-score-name average of scores, candidate text taken from the
+ // first element.
+ // NOTE(review): generic type parameters appear stripped by the diff;
+ // presumably List<Answer>, Map<String, Double>, List<Passage>.
+ // NOTE(review): others.get(0) throws IndexOutOfBoundsException on an
+ // empty list — callers must pass at least one Answer.
+ public static Answer merge(List others) {
+ Map scores = new HashMap<>();
+ List passages = new ArrayList<>();
+ String candidate_text;
+
+ // Merge all the passages
+ for (Answer other : others)
+ passages.addAll(other.passages);
+
+ // Merge the scores
+ Set all_score_names = new HashSet<>();
+ for (Answer other : others) all_score_names.addAll(other.scores.keySet());
+ /// Just average them for now - THIS IS A HACK
+ // An answer missing a score contributes nothing to `total` but is
+ // still counted in the divisor, i.e. missing scores average as 0.
+ for (String score_name : all_score_names) {
+ double total=0;
+ for (Answer other : others) {
+ Double score = other.scores.get(score_name);
+ if (score != null) total += score;
+ }
+ scores.put(score_name, total / others.size());
+ }
+
+ // Pick the first candidate answer
+ candidate_text = others.get(0).candidate_text;
+
+ // Now make an answer from it
+ return new Answer(passages, scores, candidate_text);
}
diff --git a/src/main/java/uncc2014watsonsim/Database.java b/src/main/java/uncc2014watsonsim/Database.java
new file mode 100644
index 0000000..49b0b8f
--- /dev/null
+++ b/src/main/java/uncc2014watsonsim/Database.java
@@ -0,0 +1,72 @@
+package uncc2014watsonsim;
+
+import java.sql.Connection;
+import java.sql.DriverManager;
+import java.sql.PreparedStatement;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Properties;
+import java.util.Set;
+
+// Thin wrapper around a JDBC connection to the local SQLite database at
+// data/watsonsim.db: opens it, tunes pragmas, and hands out prepared
+// statements.
+public class Database {
+ private Connection conn;
+
+ public Database() {
+ try {
+ // ensure the SQLite JDBC driver is registered
+ Class.forName("org.sqlite.JDBC");
+ Properties props = new Properties();
+ // wait up to 30s on a locked database instead of failing fast
+ props.put("busy_timeout", "30000");
+ conn = DriverManager.getConnection("jdbc:sqlite:data/watsonsim.db", props);
+ // WAL allows concurrent readers; synchronous=OFF trades
+ // durability on crash for write speed
+ conn.createStatement().execute("PRAGMA journal_mode = WAL;");
+ conn.createStatement().execute("PRAGMA synchronous = OFF;");
+ // JDBC's SQLite uses autocommit (So commit() is redundant)
+ // Furthermore, close() is a no-op as long as the results are commit()'d
+
+ if (!sanityCheck()) {
+ System.out.println(String.format("Warning: Database missing or malformed."));
+ }
+ } catch (SQLException | ClassNotFoundException e2) {
+ e2.printStackTrace();
+ // the application cannot do anything useful without its data
+ throw new RuntimeException("Can't run without a database.");
+ }
+ }
+
+ /** Simple wrapper for creating an SQL statement */
+ public PreparedStatement prep(String sql) {
+ PreparedStatement ps;
+ try {
+ ps = conn.prepareStatement(sql);
+ // stream large result sets in batches of 100 rows
+ ps.setFetchSize(100);
+ } catch (SQLException e) {
+ e.printStackTrace();
+ throw new RuntimeException("Can't prepare an SQL statement \"" + sql + "\"");
+ }
+ return ps;
+ }
+
+ /** Check that the SQLite DB we opened contains the right tables
+ * You would do this rather than check if the file exists because SQLite
+ * creates the file implicitly and it simply has no contents.
+ * */
+ public boolean sanityCheck() {
+ Set existent_tables = new HashSet();
+ try {
+ // sqlite_master lists every table in the database
+ ResultSet sql = prep("select tbl_name from sqlite_master;").executeQuery();
+ while (sql.next()) {
+ existent_tables.add(sql.getString("tbl_name"));
+ }
+ } catch (SQLException e) {
+ // There was a problem executing the query
+ return false;
+ }
+
+ // every expected table must be present
+ return existent_tables.containsAll(Arrays.asList(new String[]{
+ "meta", "content", "redirects", "questions", "results", "cache"
+ }));
+ }
+
+}
diff --git a/src/main/java/uncc2014watsonsim/researchers/Merge.java b/src/main/java/uncc2014watsonsim/researchers/Merge.java
index 9def38b..b12a87f 100644
--- a/src/main/java/uncc2014watsonsim/researchers/Merge.java
+++ b/src/main/java/uncc2014watsonsim/researchers/Merge.java
@@ -1,5 +1,8 @@
package uncc2014watsonsim.researchers;
+import java.util.ArrayList;
+import java.util.List;
+
import uncc2014watsonsim.Answer;
import uncc2014watsonsim.Question;
@@ -7,19 +10,43 @@ public class Merge extends Researcher {
@Override
/** Call merge on any two answers with the same title */
// New approach: instead of the old pairwise in-place merge (the removed
// "-" lines below, which the diff has garbled — L-equivalent of the old
// nested-cursor loop is fused onto one line), group all matching
// answers into blocks first, then merge each block once.
public void question(Question q) {
- // The left cursor moves right
- for (int first_ai=0; first_aifirst_ai; second_ai--) {
- Answer first_a = q.get(first_ai);
- Answer second_a = q.get(second_ai);
- // Merge if necessary
- //TODO: This uses more or less exact matching. We should do better.
- if (second_a.matches(first_a)) {
- first_a.merge(second_a);
- q.remove(second_ai);
+ // NOTE(review): generics stripped by the diff — presumably
+ // List<List<Answer>> answer_blocks.
+ List> answer_blocks = new ArrayList<>();
+ // Arrange the answers into blocks
+ for (Answer original : q) {
+ List target = null;
+ for (List block : answer_blocks) {
+ for (Answer example : block) {
+ // Look through the examples in this topic
+ // If it matches, choose to put it in this block and quit.
+ if (original.matches(example)) {
+ target = block;
+ break;
+ }
+ }
+ // Found a good option. break again
+ if (target != null) {
+ break;
}
}
+ if (target == null) {
+ // Make a new topic for this answer
+ List new_block = new ArrayList<>();
+ new_block.add(original);
+ answer_blocks.add(new_block);
+ } else {
+ // Use the old topic
+ target.add(original);
+ }
+ }
+
+ // Merge the blocks
+ // Rebuild q: singleton blocks pass through, larger blocks collapse
+ // to a single merged Answer (see Answer.merge).
+ q.clear();
+ for (List block : answer_blocks) {
+ if (block.size() > 1) {
+ q.add(Answer.merge(block));
+ } else {
+ q.add(block.get(0));
+ }
}
}
diff --git a/src/main/java/uncc2014watsonsim/researchers/RedirectSynonyms.java b/src/main/java/uncc2014watsonsim/researchers/RedirectSynonyms.java
index 86d17ed..7f981d3 100644
--- a/src/main/java/uncc2014watsonsim/researchers/RedirectSynonyms.java
+++ b/src/main/java/uncc2014watsonsim/researchers/RedirectSynonyms.java
@@ -3,6 +3,8 @@
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
+import java.util.ArrayList;
+import java.util.List;
import uncc2014watsonsim.Answer;
import uncc2014watsonsim.Question;
@@ -17,22 +19,25 @@
// Adds Wikipedia-redirect titles as synonym Answers for each candidate.
public class RedirectSynonyms extends Researcher {
 SQLiteDB db = new SQLiteDB("sources");
+ // Statement is now prepared once per researcher instead of once per
+ // answer (the removed per-call prep below).
+ // NOTE(review): a shared PreparedStatement is not safe if this
+ // researcher is ever invoked concurrently — confirm single-threaded use.
+ PreparedStatement s = db.prep(
+ "SELECT source from wiki_redirects where target = ? collate nocase;");
+
 @Override
- public void answer(Question q, Answer a) {
- PreparedStatement s = db.prep("select source_title from redirect_documents inner join documents on docno=target_docid where title = ?;");
+ public void question(Question q) {
+ // Iterate over a snapshot of q so that adding synonym Answers to q
+ // below cannot invalidate the iteration.
+ List prev_answers = new ArrayList<>();
+ prev_answers.addAll(q);
- try {
- s.setString(1, a.candidate_text);
- ResultSet results = s.executeQuery();
- while (results.next()) {
- Answer new_a = new Answer(results.getString("source_title"));
- new_a.passages.addAll(a.passages);
- q.add(new_a);
+ for (Answer a : prev_answers) {
+ try {
+ s.setString(1, a.candidate_text);
+ ResultSet results = s.executeQuery();
+ while (results.next()) {
+ // each redirect source becomes a new Answer sharing this
+ // answer's passages and scores
+ q.add(new Answer(a.passages, a.scores, results.getString("source")));
+ }
+ } catch (SQLException e) {
+ // Just don't make any synonyms.
+ return;
 }
- } catch (SQLException e) {
- // Just don't make any synonyms.
- return;
 }
-
 }
}
diff --git a/src/main/java/uncc2014watsonsim/researchers/WekaTee.java b/src/main/java/uncc2014watsonsim/researchers/WekaTee.java
index e738bc1..4bef617 100644
--- a/src/main/java/uncc2014watsonsim/researchers/WekaTee.java
+++ b/src/main/java/uncc2014watsonsim/researchers/WekaTee.java
@@ -17,19 +17,17 @@
/** Pipe Answer scores to an ARFF file for Weka */
public class WekaTee extends Researcher {
- private Instances data;
- public WekaTee() {
- FastVector attributes = new FastVector();
- // Answer score names
- for (String name : Score.answer_score_names)
- attributes.addElement(new Attribute(name));
- // Passage score names
- for (int passage_i=0; passage_i schema = new TreeSet<>();
+ // But in memory there is no schema because it changes
+ List