# terms/terms_teach.py

import prodigy
from prodigy.components.db import connect
from prodigy.components.sorters import Probability
from prodigy.util import split_string, set_hashes
import spacy
from spacy.tokens import Doc
from typing import List


# Recipe decorator with argument annotations: (description, argument type,
# shortcut, type / converter function called on value before it's passed to
# the function). Descriptions are also shown when typing --help.
@prodigy.recipe(
    "terms.teach",
    dataset=("The dataset to use", "positional", None, str),
    vectors=("Loadable spaCy model with word vectors", "positional", None, str),
    seeds=("One or more comma-separated seed terms", "option", "o", split_string),
)
def terms_teach(dataset: str, vectors: str, seeds: List[str]):
    """
    Bootstrap a terminology list with word vectors and seed terms. Prodigy
    will suggest similar terms based on the word vectors, and update the
    target vector accordingly.

    dataset: Name of the dataset the annotations are saved to. If it already
        exists in the database, the seed terms are added to it as accepted
        examples.
    vectors: Name or path of a spaCy model that can be loaded with
        spacy.load() and provides word vectors.
    seeds: Seed terms used to build the initial "accept" document.

    Returns the recipe components: the annotation interface ("view_id"), the
    dataset name, the scored stream and the update callback.
    """
    # NOTE(review): presumably the option arrives as None when -o is omitted
    # (confirm against Prodigy's argument handling). Normalise to a list so
    # the comprehension below and the Doc constructor always get a sequence.
    seeds = list(seeds) if seeds else []

    # Connect to the database using the settings from prodigy.json and add the
    # seed terms to the dataset
    DB = connect()
    if dataset and dataset in DB:
        seed_tasks = [set_hashes({"text": s, "answer": "accept"}) for s in seeds]
        DB.add_examples(seed_tasks, datasets=[dataset])

    # Load the spaCy model with vectors
    nlp = spacy.load(vectors)

    # Create two Doc objects for the accepted and rejected terms
    accept_doc = Doc(nlp.vocab, words=seeds)
    reject_doc = Doc(nlp.vocab, words=[])
    # Running accept/reject tally maintained by update(); nothing in this
    # recipe reads it back.
    score = 0

    def predict(term):
        """Score a term given the current accept_doc and reject_doc."""
        if len(accept_doc) == 0 and len(reject_doc) == 0:
            # No evidence either way yet
            return 0.5
        # Use spaCy's .similarity() method to compare the term to the
        # accepted and rejected Doc. A Doc that is empty or has a zero vector
        # norm contributes a score of 0.0, and negative similarities are
        # clamped to 0.0.
        if len(accept_doc) and accept_doc.vector_norm != 0.0:
            accept_score = max(term.similarity(accept_doc), 0.0)
        else:
            accept_score = 0.0
        if len(reject_doc) and reject_doc.vector_norm != 0.0:
            reject_score = max(term.similarity(reject_doc), 0.0)
        else:
            reject_score = 0.0
        # The constant 0.2 keeps the denominator strictly positive
        term_score = accept_score / (accept_score + reject_score + 0.2)
        return max(term_score, 0.0)

    def update(answers):
        """Fold newly received annotations into accept_doc and reject_doc."""
        # Called whenever Prodigy receives new annotations
        nonlocal accept_doc, reject_doc, score
        accept_words = [t.text for t in accept_doc]
        reject_words = [t.text for t in reject_doc]
        for answer in answers:
            # Increase or decrease score depending on answer and update
            # list of accepted and rejected terms. Any other answer value
            # is left out of both lists.
            if answer["answer"] == "accept":
                score += 1
                accept_words.append(answer["text"])
            elif answer["answer"] == "reject":
                score -= 1
                reject_words.append(answer["text"])
        # Rebuild the target documents from the extended word lists and
        # rebind the closure variables that predict() and score_stream() read
        accept_doc = Doc(nlp.vocab, words=accept_words)
        reject_doc = Doc(nlp.vocab, words=reject_words)

    def score_stream(stream):
        """Yield (score, example) tuples for candidate terms, best first."""
        # Get all lexemes in the vocab and score them
        lexemes = [lex for lex in stream if lex.is_alpha and lex.is_lower]
        while True:
            # Drop everything that has been accepted or rejected so far, as
            # well as lexemes without a usable vector
            seen = {w.orth for w in accept_doc}
            seen.update(w.orth for w in reject_doc)
            lexemes = [w for w in lexemes if w.orth not in seen and w.vector_norm]
            if not lexemes:
                # Nothing left to suggest: end the stream instead of looping
                # forever without ever yielding
                return
            by_score = [(predict(lex), lex) for lex in lexemes]
            by_score.sort(reverse=True)
            for _, term in by_score:
                # Re-score at yield time, since update() may have replaced
                # the accept/reject docs after the list was sorted
                term_score = predict(term)
                # Return (score, example) tuples for the scored terms
                yield term_score, {"text": term.text, "meta": {"score": term_score}}

    # Sort the scored vocab by probability and return examples
    stream = Probability(score_stream(nlp.vocab))

    return {
        "view_id": "text",  # Annotation interface to use
        "dataset": dataset,  # Name of dataset to save annotations
        "stream": stream,  # Incoming stream of examples
        "update": update,  # Update callback, called with answers
    }