Incorrect tokenization with a custom Tibetan tokenizer #13671
ykyogoku asked this question in Help: Other Questions
I trained a pipeline (tok2vec, morphologizer) for Tibetan using a custom tokenizer based on botok. However, the trained pipeline mis-tokenizes certain Tibetan sentences. For example, the following sentence is segmented incorrectly:
དེ་ནི་སྙན་ངག་གསར་རྩོམ་ལ་འཇུག་པའི་སྤྱིའི་ཐབས་ཚུལ་ཡང་ཡིན།
(transliteration: de ni snyan ngag gsar rtsom la 'jug pa'i spyi'i thabs tshul yang yin |)
The words la (Eng. "to") and 'jug pa (Eng. "apply/applied") should be separated: they are always separate in the training dataset, and botok, the Tibetan tokenizer integrated into the pipeline, tokenizes this sentence correctly on its own. Similarly, de (Eng. "that") and ni (topic particle), which should also be separated, are not split correctly.
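To make the mismatch concrete, a minimal comparison of the two segmentations looks roughly like this (the model path is a placeholder, and the WordTokenizer call follows botok's documented API rather than my exact wrapper):

```python
# Compare botok's segmentation with the trained spaCy pipeline's tokens.
# The model path is a placeholder; WordTokenizer usage follows botok's docs.
import spacy
from botok import WordTokenizer

text = "དེ་ནི་སྙན་ངག་གསར་རྩོམ་ལ་འཇུག་པའི་སྤྱིའི་ཐབས་ཚུལ་ཡང་ཡིན།"

# Reference segmentation straight from botok
wt = WordTokenizer()
botok_tokens = [t.text for t in wt.tokenize(text)]

# Segmentation produced by the trained pipeline
nlp = spacy.load("./output/model-best")  # placeholder path
spacy_tokens = [t.text for t in nlp(text)]

print("botok:", botok_tokens)
print("spaCy:", spacy_tokens)
```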
I wonder if something went wrong in the spaCy training process. Can anyone help identify the cause? I checked similar issues, such as this one, but I could not find one that matches this specific case.
My Environment
I tested pipelines trained in different environments:
spaCy 3.6.x and spaCy 3.2.x (both under Python 3.7.x)
The configuration file is as follows:
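(Only the [nlp] block is reproduced here as a sketch; the registered tokenizer name botok_tokenizer and the language code are placeholders standing in for my actual values.)

```ini
# Sketch of the [nlp] block only; the registry name and language code are placeholders.
[nlp]
lang = "bo"
pipeline = ["tok2vec","morphologizer"]
tokenizer = {"@tokenizers":"botok_tokenizer"}
```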