From 284da97ee70bf8288c3abbbf1b0675ac88cf693e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nahuel=20Unai=20Rosell=C3=B3=20Beneitez?= Date: Mon, 9 Dec 2024 09:33:02 +0000 Subject: [PATCH 1/4] Add left/right context orth Also add elements to segment constructor --- lib/corpus.py | 45 ++++++++++++++++++++++++++++++++++++--------- 1 file changed, 36 insertions(+), 9 deletions(-) diff --git a/lib/corpus.py b/lib/corpus.py index 076cb048..8bbd98db 100644 --- a/lib/corpus.py +++ b/lib/corpus.py @@ -116,7 +116,7 @@ def startElement(self, name: str, attrs: Dict[str, str]): def endElement(self, name: str): e = self.elements[-1] - if name == "orth": + if name == "orth" or name == "left-context-orth" or name == "right-context-orth": assert isinstance(e, Segment) # we do some processing of the text that goes into the orth tag to get a nicer formating, some corpora may have # multiline content in the orth tag, but to keep it that way might not be consistent with the indentation during @@ -124,7 +124,7 @@ def endElement(self, name: str): text = self.chars.strip() text = re.sub(" +", " ", text) text = re.sub("\n", "", text) - e.orth = text + setattr(e, name.replace("-", "_"), text) elif isinstance(e, Speaker) and name != "speaker-description": # we allow all sorts of elements within a speaker description e.attribs[name] = self.chars.strip() @@ -356,15 +356,38 @@ def get_segment_mapping(self) -> Dict[str, Segment]: class Segment(NamedEntity): - def __init__(self): + def __init__( + self, + start: float = 0.0, + end: float = 0.0, + track: Optional[int] = None, + orth: Optional[str] = None, + left_context_orth: Optional[str] = None, + right_context_orth: Optional[str] = None, + speaker_name: Optional[str] = None, + recording: Optional[Recording] = None, + ): + """ + :param start: Segment start. + :param end: Segment end. + :param track: Segment track/channel. + :param orth: Segment text. + :param left_context_orth: Optional left context when aligning (specific to RASR). 
+ :param right_context_orth: Optional right context when aligning (specific to RASR). + :param speaker_name: Speaker name. + :param recording: Recording in which the segment is embedded. + """ super().__init__() - self.start = 0.0 - self.end = 0.0 - self.track: Optional[int] = None - self.orth: Optional[str] = None - self.speaker_name: Optional[str] = None - self.recording: Optional[Recording] = None + self.start = start + self.end = end + self.track = track + self.orth = orth + self.left_context_orth = left_context_orth + self.right_context_orth = right_context_orth + self.speaker_name = speaker_name + + self.recording = recording def fullname(self) -> str: return self.recording.fullname() + "/" + self.name @@ -384,6 +407,10 @@ def dump(self, out: TextIO, indentation: str = ""): out.write('%s \n' % (indentation, self.speaker_name)) if self.orth is not None: out.write("%s %s \n" % (indentation, saxutils.escape(self.orth))) + if self.left_context_orth is not None: + out.write("%s %s \n" % (indentation, saxutils.escape(self.left_context_orth))) + if self.right_context_orth is not None: + out.write("%s %s \n" % (indentation, saxutils.escape(self.right_context_orth))) if has_child_element: out.write("%s\n" % indentation) else: From 35d6eda41c46ed202372c2b5e763d1ab4dcb8c9d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nahuel=20Unai=20Rosell=C3=B3=20Beneitez?= Date: Mon, 9 Dec 2024 09:45:31 +0000 Subject: [PATCH 2/4] Black --- lib/corpus.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/lib/corpus.py b/lib/corpus.py index 8bbd98db..ccc35cbb 100644 --- a/lib/corpus.py +++ b/lib/corpus.py @@ -372,8 +372,8 @@ def __init__( :param end: Segment end. :param track: Segment track/channel. :param orth: Segment text. - :param left_context_orth: Optional left context when aligning (specific to RASR). - :param right_context_orth: Optional right context when aligning (specific to RASR). 
+ :param left_context_orth: Optional left context when aligning (specific for RASR alignment). + :param right_context_orth: Optional right context when aligning (specific for RASR alignment). :param speaker_name: Speaker name. :param recording: Recording in which the segment is embedded. """ @@ -408,9 +408,15 @@ def dump(self, out: TextIO, indentation: str = ""): if self.orth is not None: out.write("%s %s \n" % (indentation, saxutils.escape(self.orth))) if self.left_context_orth is not None: - out.write("%s %s \n" % (indentation, saxutils.escape(self.left_context_orth))) + out.write( + "%s %s \n" + % (indentation, saxutils.escape(self.left_context_orth)) + ) if self.right_context_orth is not None: - out.write("%s %s \n" % (indentation, saxutils.escape(self.right_context_orth))) + out.write( + "%s %s \n" + % (indentation, saxutils.escape(self.right_context_orth)) + ) if has_child_element: out.write("%s\n" % indentation) else: From 7fa6ab8e05493a8ff300f68a5676b8b2315dca58 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nahuel=20Unai=20Rosell=C3=B3=20Beneitez?= <31628502+Icemole@users.noreply.github.com> Date: Mon, 9 Dec 2024 15:04:54 +0100 Subject: [PATCH 3/4] Allow only kwargs for Segment init Co-authored-by: Albert Zeyer --- lib/corpus.py | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/corpus.py b/lib/corpus.py index ccc35cbb..40ee486b 100644 --- a/lib/corpus.py +++ b/lib/corpus.py @@ -358,6 +358,7 @@ def get_segment_mapping(self) -> Dict[str, Segment]: class Segment(NamedEntity): def __init__( self, + *, start: float = 0.0, end: float = 0.0, track: Optional[int] = None, From dcc19aa49a5d1fff06173c8027d4f303995d0748 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nahuel=20Unai=20Rosell=C3=B3=20Beneitez?= <31628502+Icemole@users.noreply.github.com> Date: Mon, 9 Dec 2024 15:36:36 +0100 Subject: [PATCH 4/4] Set inclusion instead of comparison via or Co-authored-by: Eugen Beck --- lib/corpus.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/corpus.py 
b/lib/corpus.py
index 40ee486b..11c9b861 100644
--- a/lib/corpus.py
+++ b/lib/corpus.py
@@ -116,7 +116,7 @@ def startElement(self, name: str, attrs: Dict[str, str]):
 
     def endElement(self, name: str):
         e = self.elements[-1]
-        if name == "orth" or name == "left-context-orth" or name == "right-context-orth":
+        if name in {"orth", "left-context-orth", "right-context-orth"}:
             assert isinstance(e, Segment)
             # we do some processing of the text that goes into the orth tag to get a nicer formating, some corpora may have
             # multiline content in the orth tag, but to keep it that way might not be consistent with the indentation during