Skip to content

Commit

Permalink
Lemma deduplication helper on merge lexicon job
Browse files Browse the repository at this point in the history
  • Loading branch information
Icemole committed Oct 27, 2023
1 parent dee0609 commit ba03170
Showing 1 changed file with 12 additions and 1 deletion.
13 changes: 12 additions & 1 deletion lexicon/modification.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,16 +130,22 @@ class MergeLexiconJob(Job):
will create a new lexicon that might be incompatible with previously generated alignments.
"""

def __init__(self, bliss_lexica, sort_phonemes=False, sort_lemmata=False, compressed=True):
__sis_hash_exclude__ = {"deduplicate_lemmata": False}

def __init__(
self, bliss_lexica, sort_phonemes=False, sort_lemmata=False, compressed=True, deduplicate_lemmata=False
):
"""
:param list[Path] bliss_lexica: list of bliss lexicon files (plain or gz)
:param bool sort_phonemes: sort phoneme inventory alphabetically
:param bool sort_lemmata: sort lemmata alphabetically based on first orth entry
:param bool compressed: compress final lexicon
:param bool deduplicate_lemmata: whether to deduplicate lemmata; only applied when sort_lemmata=True
"""
self.lexica = bliss_lexica
self.sort_phonemes = sort_phonemes
self.sort_lemmata = sort_lemmata
self.deduplicate_lemmata = deduplicate_lemmata

self.out_bliss_lexicon = self.output_path("lexicon.xml.gz" if compressed else "lexicon.xml")

Expand Down Expand Up @@ -178,7 +184,12 @@ def run(self):
for lemma in lex.lemmata:
# sort by first orth entry
orth_key = lemma.orth[0] if lemma.orth else ""
if self.deduplicate_lemmata:
# skip the lemma if it equals the first lemma already stored under this orth key
if len(lemma_dict[orth_key]) > 0 and lemma == lemma_dict[orth_key][0]:
continue
lemma_dict[orth_key].append(lemma)
print(lemma_dict)
merged_lex.lemmata = list(itertools.chain(*[lemma_dict[key] for key in sorted(lemma_dict.keys())]))
else:
for lex in lexica:
Expand Down

0 comments on commit ba03170

Please sign in to comment.