Fine-tuning a transformer SpanCat pipeline on a small corpus - a best practice inquiry #13398
Sifortin started this conversation in Help: Best practices
Hi there,
I am currently trying to fine-tune a RoBERTa model on a SpanCat task with ~20 labels over a corpus of roughly one thousand documents (80% training / 20% validation).
While we are getting relevant results (an average F1 score of 0.65 across all labels), I am finding it hard to pinpoint what could be refined to improve them. To give more context: we currently have 1,000 documents extracted with Tesseract OCR, manually labelled for the fields we want to recognise, and then converted into a spaCy dataset. We expect precision and recall to grow with the size of the dataset, but as a paper I read pointed out, they may not be directly correlated.
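For reference, our conversion step looks roughly like the sketch below. The annotation format, example text, and file name are illustrative rather than our exact pipeline; the key parts are writing the gold spans into a named span group and serialising with `DocBin`:

```python
import spacy
from spacy.tokens import DocBin

# Illustrative annotation format: (text, [(start_char, end_char, label), ...])
annotations = [
    ("Invoice total: 1,250.00 EUR", [(15, 23, "AMOUNT"), (24, 27, "CURRENCY")]),
    # ... one entry per OCR'd document
]

nlp = spacy.blank("en")  # tokenizer only; the transformer is added at training time
doc_bin = DocBin()

for text, span_offsets in annotations:
    doc = nlp.make_doc(text)
    spans = []
    for start, end, label in span_offsets:
        # alignment_mode="expand" rescues spans whose character offsets fall
        # inside a token, which happens easily with noisy OCR text
        span = doc.char_span(start, end, label=label, alignment_mode="expand")
        if span is not None:
            spans.append(span)
    # SpanCat reads gold spans from a named span group; "sc" is the default key
    doc.spans["sc"] = spans
    doc_bin.add(doc)

doc_bin.to_disk("train.spacy")
```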
There is also the question of the config file we used: we essentially did not deviate from the standard SpanCat config (apart from using spancat_singlelabel), which may or may not be affecting our results.
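For example, we kept the default n-gram suggester, which (as far as we understand) only proposes spans up to a fixed token length, so gold spans longer than the largest n-gram size could be silently capping our recall. A minimal sketch of the component setup, using spaCy's documented defaults rather than our exact values:

```python
import spacy

nlp = spacy.blank("en")

# spancat_singlelabel only scores spans the suggester proposes, so gold spans
# longer than max(sizes) tokens are unreachable regardless of training time
nlp.add_pipe(
    "spancat_singlelabel",
    config={
        "spans_key": "sc",
        "suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]},
    },
)
```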
Some fine-tuning-focused papers I read talked about training for more epochs with more steps per epoch, but how does that translate into spaCy?
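From what we can tell, spaCy has no explicit epochs-times-steps loop; training length is governed by the [training] block of the config. A sketch of the settings we believe are relevant (the file name is illustrative and the values are just spaCy's defaults nudged upward):

```python
from spacy.util import load_config

config = load_config("config.cfg")

# spaCy streams batches until one of these limits is hit, so "training longer"
# means raising max_steps (optimizer updates) and/or max_epochs
config["training"]["max_epochs"] = 0        # 0 = no epoch limit, rely on max_steps
config["training"]["max_steps"] = 40000     # default is 20000
config["training"]["patience"] = 3200       # steps without eval improvement before stopping
config["training"]["eval_frequency"] = 200  # how often the dev set is scored

config.to_disk("config_long.cfg")

# Equivalently, values can be overridden on the CLI without editing the file:
#   python -m spacy train config.cfg --training.max_steps 40000
```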
Thanks and kind regards.
Here is our config file: