From 284da97ee70bf8288c3abbbf1b0675ac88cf693e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nahuel=20Unai=20Rosell=C3=B3=20Beneitez?=
 <nbeneitez@apptek.com>
Date: Mon, 9 Dec 2024 09:33:02 +0000
Subject: [PATCH 1/4] Add left/right context orth

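Also let the Segment constructor accept its fields (start, end, track,
orth, left/right context orth, speaker_name, recording) as arguments.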
---
 lib/corpus.py | 45 ++++++++++++++++++++++++++++++++++++---------
 1 file changed, 36 insertions(+), 9 deletions(-)

diff --git a/lib/corpus.py b/lib/corpus.py
index 076cb048..8bbd98db 100644
--- a/lib/corpus.py
+++ b/lib/corpus.py
@@ -116,7 +116,7 @@ def startElement(self, name: str, attrs: Dict[str, str]):
     def endElement(self, name: str):
         e = self.elements[-1]
 
-        if name == "orth":
+        if name == "orth" or name == "left-context-orth" or name == "right-context-orth":
             assert isinstance(e, Segment)
             # we do some processing of the text that goes into the orth tag to get a nicer formating, some corpora may have
             # multiline content in the orth tag, but to keep it that way might not be consistent with the indentation during
@@ -124,7 +124,7 @@ def endElement(self, name: str):
             text = self.chars.strip()
             text = re.sub(" +", " ", text)
             text = re.sub("\n", "", text)
-            e.orth = text
+            setattr(e, name.replace("-", "_"), text)
         elif isinstance(e, Speaker) and name != "speaker-description":
             # we allow all sorts of elements within a speaker description
             e.attribs[name] = self.chars.strip()
@@ -356,15 +356,38 @@ def get_segment_mapping(self) -> Dict[str, Segment]:
 
 
 class Segment(NamedEntity):
-    def __init__(self):
+    def __init__(
+        self,
+        start: float = 0.0,
+        end: float = 0.0,
+        track: Optional[int] = None,
+        orth: Optional[str] = None,
+        left_context_orth: Optional[str] = None,
+        right_context_orth: Optional[str] = None,
+        speaker_name: Optional[str] = None,
+        recording: Optional[Recording] = None,
+    ):
+        """
+        :param start: Segment start.
+        :param end: Segment end.
+        :param track: Segment track/channel.
+        :param orth: Segment text.
+        :param left_context_orth: Optional left context when aligning (specific to RASR).
+        :param right_context_orth: Optional right context when aligning (specific to RASR).
+        :param speaker_name: Speaker name.
+        :param recording: Recording in which the segment is embedded.
+        """
         super().__init__()
-        self.start = 0.0
-        self.end = 0.0
-        self.track: Optional[int] = None
-        self.orth: Optional[str] = None
-        self.speaker_name: Optional[str] = None
 
-        self.recording: Optional[Recording] = None
+        self.start = start
+        self.end = end
+        self.track = track
+        self.orth = orth
+        self.left_context_orth = left_context_orth
+        self.right_context_orth = right_context_orth
+        self.speaker_name = speaker_name
+
+        self.recording = recording
 
     def fullname(self) -> str:
         return self.recording.fullname() + "/" + self.name
@@ -384,6 +407,10 @@ def dump(self, out: TextIO, indentation: str = ""):
             out.write('%s  <speaker name="%s"/>\n' % (indentation, self.speaker_name))
         if self.orth is not None:
             out.write("%s  <orth> %s </orth>\n" % (indentation, saxutils.escape(self.orth)))
+        if self.left_context_orth is not None:
+            out.write("%s  <left-context-orth> %s </left-context-orth>\n" % (indentation, saxutils.escape(self.left_context_orth)))
+        if self.right_context_orth is not None:
+            out.write("%s  <right-context-orth> %s </right-context-orth>\n" % (indentation, saxutils.escape(self.right_context_orth)))
         if has_child_element:
             out.write("%s</segment>\n" % indentation)
         else:

From 35d6eda41c46ed202372c2b5e763d1ab4dcb8c9d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nahuel=20Unai=20Rosell=C3=B3=20Beneitez?=
 <nbeneitez@apptek.com>
Date: Mon, 9 Dec 2024 09:45:31 +0000
Subject: [PATCH 2/4] Apply black formatting

Also reword the left/right context orth docstring entries.

---
 lib/corpus.py | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/lib/corpus.py b/lib/corpus.py
index 8bbd98db..ccc35cbb 100644
--- a/lib/corpus.py
+++ b/lib/corpus.py
@@ -372,8 +372,8 @@ def __init__(
         :param end: Segment end.
         :param track: Segment track/channel.
         :param orth: Segment text.
-        :param left_context_orth: Optional left context when aligning (specific to RASR).
-        :param right_context_orth: Optional right context when aligning (specific to RASR).
+        :param left_context_orth: Optional left context when aligning (specific for RASR alignment).
+        :param right_context_orth: Optional right context when aligning (specific for RASR alignment).
         :param speaker_name: Speaker name.
         :param recording: Recording in which the segment is embedded.
         """
@@ -408,9 +408,15 @@ def dump(self, out: TextIO, indentation: str = ""):
         if self.orth is not None:
             out.write("%s  <orth> %s </orth>\n" % (indentation, saxutils.escape(self.orth)))
         if self.left_context_orth is not None:
-            out.write("%s  <left-context-orth> %s </left-context-orth>\n" % (indentation, saxutils.escape(self.left_context_orth)))
+            out.write(
+                "%s  <left-context-orth> %s </left-context-orth>\n"
+                % (indentation, saxutils.escape(self.left_context_orth))
+            )
         if self.right_context_orth is not None:
-            out.write("%s  <right-context-orth> %s </right-context-orth>\n" % (indentation, saxutils.escape(self.right_context_orth)))
+            out.write(
+                "%s  <right-context-orth> %s </right-context-orth>\n"
+                % (indentation, saxutils.escape(self.right_context_orth))
+            )
         if has_child_element:
             out.write("%s</segment>\n" % indentation)
         else:

From 7fa6ab8e05493a8ff300f68a5676b8b2315dca58 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nahuel=20Unai=20Rosell=C3=B3=20Beneitez?=
 <31628502+Icemole@users.noreply.github.com>
Date: Mon, 9 Dec 2024 15:04:54 +0100
Subject: [PATCH 3/4] Allow only kwargs for Segment init

Co-authored-by: Albert Zeyer <zeyer@cs.rwth-aachen.de>
---
 lib/corpus.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lib/corpus.py b/lib/corpus.py
index ccc35cbb..40ee486b 100644
--- a/lib/corpus.py
+++ b/lib/corpus.py
@@ -358,6 +358,7 @@ def get_segment_mapping(self) -> Dict[str, Segment]:
 class Segment(NamedEntity):
     def __init__(
         self,
+        *,
         start: float = 0.0,
         end: float = 0.0,
         track: Optional[int] = None,

From dcc19aa49a5d1fff06173c8027d4f303995d0748 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nahuel=20Unai=20Rosell=C3=B3=20Beneitez?=
 <31628502+Icemole@users.noreply.github.com>
Date: Mon, 9 Dec 2024 15:36:36 +0100
Subject: [PATCH 4/4] Use set membership instead of chained "or" comparisons

Co-authored-by: Eugen Beck <curufinwe@users.noreply.github.com>
---
 lib/corpus.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/corpus.py b/lib/corpus.py
index 40ee486b..11c9b861 100644
--- a/lib/corpus.py
+++ b/lib/corpus.py
@@ -116,7 +116,7 @@ def startElement(self, name: str, attrs: Dict[str, str]):
     def endElement(self, name: str):
         e = self.elements[-1]
 
-        if name == "orth" or name == "left-context-orth" or name == "right-context-orth":
+        if name in {"orth", "left-context-orth", "right-context-orth"}:
             assert isinstance(e, Segment)
             # we do some processing of the text that goes into the orth tag to get a nicer formating, some corpora may have
             # multiline content in the orth tag, but to keep it that way might not be consistent with the indentation during