(Mostly) finish and create tests for the SCNLP similarity score
SeanTater committed Nov 13, 2014
1 parent 9a29cb9 commit 376c307
Showing 2 changed files with 72 additions and 148 deletions.
180 changes: 34 additions & 146 deletions src/main/java/uncc2014watsonsim/scorers/CoreNLPSentenceSimilarity.java
@@ -56,14 +56,17 @@
*/

public class CoreNLPSentenceSimilarity extends PassageScorer {
public CoreNLPSentenceSimilarity() {
}

public Tree parseToTree(String text) {
Properties props;
StanfordCoreNLP pipeline;
public CoreNLPSentenceSimilarity() {
// creates a StanfordCoreNLP object, with POS tagging, lemmatization, NER, parsing, and coreference resolution
Properties props = new Properties();
props = new Properties();
props.put("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref");
StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
pipeline = new StanfordCoreNLP(props);
}

public List<Tree> parseToTrees(String text) {

// create an empty Annotation just with the given text
Annotation document = new Annotation(text);
@@ -74,160 +77,45 @@ public Tree parseToTree(String text) {
// these are all the sentences in this document
// a CoreMap is essentially a Map that uses class objects as keys and has values with custom types
List<CoreMap> sentences = document.get(SentencesAnnotation.class);
List<Tree> trees = new ArrayList<>();

for(CoreMap sentence: sentences) {
// this is the parse tree of the current sentence
return sentence.get(TreeAnnotation.class);
trees.add(sentence.get(TreeAnnotation.class));
}
return Tree.valueOf("");
}
/** Turn one tokenized sentence into one top-ranked parse tree. */
public Parse parseSentence(List<String> tokens) {
//StringTokenizer st = new StringTokenizer(tks[i]);
//There are several tokenizers available. SimpleTokenizer works best
System.out.print(";");
String sent= StringUtils.join(tokens," ");
return ParserTool.parseLine(sent,parser, 1)[0];
}

/** Turn a tokenized paragraph into a list of parses */
public List<Parse> parseParagraph(List<List<String>> paragraph) {
//find sentences, tokenize each, parse each, return top parse for each
List<Parse> results = new ArrayList<>(paragraph.size());
for (List<String> sentence : paragraph) {
//StringTokenizer st = new StringTokenizer(tks[i]);
//There are several tokenizers available. SimpleTokenizer works best
results.add(parseSentence(sentence));
}
return results;
}

/** Tokenize a paragraph into sentences, then into words. */
public List<List<String>> tokenizeParagraph(String paragraph) {
List<List<String>> results = new ArrayList<>();
// Find sentences, tokenize each, parse each, return top parse for each
for (String unsplit_sentence : sentenceDetector.sentDetect(paragraph)) {
results.add(Arrays.asList(
SimpleTokenizer.INSTANCE.tokenize(unsplit_sentence)
));
}
return results;
}

/** Enumerate all of the child parses of a parse tree */
public List<Parse> getAllChildren(List<Parse> parses){
List<Parse> doneChildren = new ArrayList<>(parses.size());
Deque<Parse> nextChildren = new ArrayDeque<>(100);
nextChildren.addAll(parses);
while (!nextChildren.isEmpty()) {
Parse child = nextChildren.remove();
doneChildren.add(child);
nextChildren.addAll(Arrays.asList(child.getChildren()));
}
return doneChildren;
}

/** Enumerate all the child parses of a single-root parse tree */
private List<Parse> getAllChildren(Parse parse){
List<Parse> p = new ArrayList<>(1);
p.add(parse);
return getAllChildren(p);
}

/** Compute the number of matches between two sets of parses
* where a match means same label over the same string
* @param pa1 One Parse forest
* @param pa2 Another parse forest
* @param verbose Whether to print progress to stdout
* @return score
*/
public double compareParseChunks(List<Parse> pa1, List<Parse> pa2, boolean verbose){

HashSet<String> bag1 = new HashSet<>();
HashSet<String> bag2 = new HashSet<>();

for (Parse p : pa1) {
bag1.add(p.getCoveredText()+"\n"+p.getLabel());
}
for (Parse p : pa2) {
bag2.add(p.getCoveredText()+"\n"+p.getLabel());
}

bag2.retainAll(bag1);
return bag2.size();
return trees;
}

/**
* Flatten a paragraph into a set of unique tokens
* @param paragraph
* @return the flattened set
* Score the similarity of two sentences according to
* sum([ len(x) | x of X, y of Y, if x == y ])
* where X and Y are the sets of subtrees of the parses of s1 and s2.
* @param s1 one string to parse (may contain several sentences)
* @param s2 the other string to compare against
* @return the summed sizes of the parse trees the two strings share
*/
public HashSet<String> flatten(List<List<String>> paragraph) {
HashSet<String> results = new HashSet<>();
for (List<String> sentence : paragraph)
for (String word : sentence)
results.add(word.toLowerCase());
return results;
public double scorePhrases(String s1, String s2) {
List<Tree> t1 = parseToTrees(s1);
List<Tree> t2 = parseToTrees(s2);
HashSet<Tree> common_subtrees = new HashSet<>();
common_subtrees.addAll(t1);
common_subtrees.retainAll(t2);

double score = 0.0;
for (Tree x : common_subtrees) {
// x.getLeaves().size() may also be a good idea.
// I don't have any intuition for which may be better.
score += x.size();
}
return score;
}


/** Generare a normalized score.
/** Generate a simple score based on scorePhrases.
*
*/
//TODO divide by passage length containing the matches, not the full passage length
public double scorePassage(Question q, Answer a, Passage p) {
boolean verbose = true;

// Tokenize the text, necessary for simple and NLP searches
List<List<String>> ca_sentences = tokenizeParagraph(a.candidate_text);
List<List<String>> q_sentences = tokenizeParagraph(q.getRaw_text());
List<List<String>> passage_sentences = tokenizeParagraph(p.getText());

// Run NLP on the question and candidate answer
List<Parse> ca_children = getAllChildren(parseParagraph(ca_sentences));
List<Parse> q_children = getAllChildren(parseParagraph(q_sentences));
List<Parse> p_children = new ArrayList<>();

// Speedup: Look for these tokens before running NLP
HashSet<String> target_tokens = flatten(ca_sentences);
//target_tokens.addAll(flatten(q_sentences));
// Free stop filtering (costs no more than what we were
// already doing)
target_tokens.removeAll(Arrays.asList(new String[]{
"i", "me", "you", "he", "she", "him", "they", "them",
"his", "her", "hers", "my", "mine", "your", "yours", "their", "theirs",
"of", "a", "the",
"and", "or", "not", "but",
"this", "that", "these", "those",
"on", "in", "from", "to", "over", "under", "with", "by", "for",
"without", "beside", "between",
"has", "have", "had", "will", "would", "gets", "get", "got",
"be", "am", "been", "was", "were", "being", "is",
".", ",", ":", ";", "[", "{", "}", "]", "(", ")", "<", ">",
"?", "/", "\\", "-", "_", "=", "+", "~", "`", "@", "#", "$",
"%", "^", "&", "*"
}));

for (List<String> sentence : passage_sentences) {
// Does it have the right tokens?
for (String word : sentence) {
if (target_tokens.contains(word.toLowerCase())) {
// Found a common word. Now compare the sentences.
p_children.addAll(getAllChildren(parseSentence(sentence)));
break;
}
}
}

double q_score = compareParseChunks(
q_children,
p_children,
verbose);
double ca_score = compareParseChunks(
ca_children,
p_children,
verbose);
return q_score*ca_score/p.getText().length();
return scorePhrases(p.getText(), a.candidate_text);
}
}
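
As a quick orientation to the reworked class, here is a minimal, standalone sketch of how the new methods fit together (assuming Stanford CoreNLP and its models are on the classpath; the class name SimilarityDemo is illustrative only and not part of this commit):

    import java.util.List;

    import edu.stanford.nlp.trees.Tree;
    import uncc2014watsonsim.scorers.CoreNLPSentenceSimilarity;

    public class SimilarityDemo {
        public static void main(String[] args) {
            // The constructor now builds the CoreNLP pipeline once, so construct one scorer and reuse it.
            CoreNLPSentenceSimilarity scorer = new CoreNLPSentenceSimilarity();

            // parseToTrees returns one parse Tree per detected sentence.
            List<Tree> trees = scorer.parseToTrees("I am tall. You are short.");
            System.out.println(trees.size() + " sentences parsed");

            // scorePhrases sums the sizes of the parse trees the two strings have in common.
            double score = scorer.scorePhrases("My goat knows the bowling score.",
                                               "Michael rowed the boat ashore.");
            System.out.println("similarity = " + score);
        }
    }

Loading the full annotator stack (including ner and dcoref) is slow, so moving the pipeline construction into the constructor, as this commit does, makes reusing a single scorer instance the natural pattern.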

40 changes: 38 additions & 2 deletions src/test/java/uncc2014watsonsim/CoreNLPSentenceSimilarityTest.java
@@ -2,17 +2,53 @@

import static org.junit.Assert.*;

import java.util.ArrayList;

import org.junit.Test;

import edu.stanford.nlp.trees.Tree;
import uncc2014watsonsim.scorers.CoreNLPSentenceSimilarity;

public class CoreNLPSentenceSimilarityTest {

@Test
public void testParseToTree() {
CoreNLPSentenceSimilarity scorer = new CoreNLPSentenceSimilarity();
assertEquals(scorer.parseToTree(""), null);
assertEquals(scorer.parseToTree("Example"), null);

// Empty case
assertEquals(new ArrayList<>(), scorer.parseToTrees(""));
// Simple case
assertEquals(Tree.valueOf("(ROOT (NP (NN Example)))"), scorer.parseToTrees("Example").get(0));
// Challenging case
// fails: "Buffalo buffalo Buffalo buffalo buffalo buffalo Buffalo buffalo."
// succeeds, or at least it looks generally right to me:
assertEquals(Tree.valueOf("(ROOT (S (NP (NNP Niel) (NNP Armstrong)) "
+ "(VP (VBD was) (NP (DT the) (JJ first) (NN man)"
+ "(S (VP (TO to) (VP (VB walk) "
+ "(PP (IN on) (NP (DT the) (NN moon)))))))) (. .)))"),
scorer.parseToTrees("Niel Armstrong was the first man to walk on the moon.").get(0));

assertEquals(
Tree.valueOf("(ROOT (S (NP (PRP I)) (VP (VBP am) (ADJP (JJ tall))) (. .)))"),
scorer.parseToTrees("I am tall. You are short.").get(0));
assertEquals(
Tree.valueOf("(ROOT (S (NP (PRP You)) (VP (VBP are) (ADJP (JJ short))) (. .)))"),
scorer.parseToTrees("I am tall. You are short.").get(1));

}

@Test
public void testScorePhrases() {
CoreNLPSentenceSimilarity scorer = new CoreNLPSentenceSimilarity();
assertEquals(
6.0,
scorer.scorePhrases("My goat knows the bowling score.", "Michael rowed the boat ashore."),
0.01
);
assertEquals(
28.0,
scorer.scorePhrases("A tisket, a tasket, a green and yellow basket.", "A tisket, a tasket, what color is my basket?"),
0.01
);
}
}
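
If the expected values in testScorePhrases ever need to be recomputed (for instance, if scorePhrases switches from Tree.size() to getLeaves().size(), as the comment in that method suggests it might), a throwaway snippet along these lines shows what each parsed sentence contributes; TreeSizeInspector is a hypothetical name, not part of this commit:

    import java.util.List;

    import edu.stanford.nlp.trees.Tree;
    import uncc2014watsonsim.scorers.CoreNLPSentenceSimilarity;

    public class TreeSizeInspector {
        public static void main(String[] args) {
            CoreNLPSentenceSimilarity scorer = new CoreNLPSentenceSimilarity();
            List<Tree> trees = scorer.parseToTrees(
                    "A tisket, a tasket, a green and yellow basket.");
            for (Tree t : trees) {
                // Tree.size() counts every node in the tree; getLeaves() returns only the token leaves.
                System.out.println("nodes=" + t.size()
                        + " leaves=" + t.getLeaves().size());
                t.pennPrint();
            }
        }
    }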
