diff --git a/tests/job_tests/text/__init__.py b/tests/job_tests/text/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/job_tests/text/files/input_file.txt b/tests/job_tests/text/files/input_file.txt
new file mode 100644
index 00000000..8449831a
--- /dev/null
+++ b/tests/job_tests/text/files/input_file.txt
@@ -0,0 +1,4 @@
+This is an input file for testing. Sentence 1.
+Sentence 2.
+Sentence 3.
+Sentence 4.
diff --git a/tests/job_tests/text/files/out.head.1.txt b/tests/job_tests/text/files/out.head.1.txt
new file mode 100644
index 00000000..36d0307a
--- /dev/null
+++ b/tests/job_tests/text/files/out.head.1.txt
@@ -0,0 +1 @@
+This is an input file for testing. Sentence 1.
diff --git a/tests/job_tests/text/files/out.head.2.txt b/tests/job_tests/text/files/out.head.2.txt
new file mode 100644
index 00000000..2a289dad
--- /dev/null
+++ b/tests/job_tests/text/files/out.head.2.txt
@@ -0,0 +1,2 @@
+This is an input file for testing. Sentence 1.
+Sentence 2.
diff --git a/tests/job_tests/text/files/out.head.2.txt.gz b/tests/job_tests/text/files/out.head.2.txt.gz
new file mode 100644
index 00000000..9f84cdc5
Binary files /dev/null and b/tests/job_tests/text/files/out.head.2.txt.gz differ
diff --git a/tests/job_tests/text/files/out.tail.2.txt b/tests/job_tests/text/files/out.tail.2.txt
new file mode 100644
index 00000000..ca15bd09
--- /dev/null
+++ b/tests/job_tests/text/files/out.tail.2.txt
@@ -0,0 +1,2 @@
+Sentence 3.
+Sentence 4.
diff --git a/tests/job_tests/text/files/out.tail.2.txt.gz b/tests/job_tests/text/files/out.tail.2.txt.gz
new file mode 100644
index 00000000..619c4d8c
Binary files /dev/null and b/tests/job_tests/text/files/out.tail.2.txt.gz differ
diff --git a/tests/job_tests/text/files/out.tail.3.txt b/tests/job_tests/text/files/out.tail.3.txt
new file mode 100644
index 00000000..364de786
--- /dev/null
+++ b/tests/job_tests/text/files/out.tail.3.txt
@@ -0,0 +1,3 @@
+Sentence 2.
+Sentence 3.
+Sentence 4.
diff --git a/tests/job_tests/text/test_processing.py b/tests/job_tests/text/test_processing.py
new file mode 100644
index 00000000..030ed7ae
--- /dev/null
+++ b/tests/job_tests/text/test_processing.py
@@ -0,0 +1,57 @@
+import filecmp
+import tempfile
+from sisyphus import setup_path
+
+from i6_core.text.processing import HeadJob, TailJob
+
+Path = setup_path(__package__)
+
+
+def test_head_job():
+    with tempfile.TemporaryDirectory() as tmpdir:
+        from sisyphus import gs
+
+        gs.WORK_DIR = tmpdir
+
+        text_file = Path("files/input_file.txt")
+
+        cases = [
+            (None, 0.25, Path("files/out.head.1.txt"), False),
+            (None, 0.5, Path("files/out.head.2.txt"), False),
+            (None, 0.6, Path("files/out.head.2.txt"), False),
+            (2, None, Path("files/out.head.2.txt"), False),
+            (1, None, Path("files/out.head.1.txt"), False),
+            (None, 0.5, Path("files/out.head.2.txt.gz"), True),
+        ]
+
+        for num_lines, ratio, reference_file, zip_output in cases:
+            job = HeadJob(text_file=text_file, ratio=ratio, num_lines=num_lines, zip_output=zip_output)
+
+            job._sis_setup_directory()
+            job.run()
+
+            assert filecmp.cmp(job.out.get_path(), reference_file.get_path(), shallow=False)
+
+
+def test_tail_job():
+    with tempfile.TemporaryDirectory() as tmpdir:
+        from sisyphus import gs
+
+        gs.WORK_DIR = tmpdir
+
+        text_file = Path("files/input_file.txt")
+
+        cases = [
+            (None, 0.5, Path("files/out.tail.2.txt"), False),
+            (None, 0.75, Path("files/out.tail.3.txt"), False),
+            (2, None, Path("files/out.tail.2.txt"), False),
+            (2, None, Path("files/out.tail.2.txt.gz"), True),
+        ]
+
+        for num_lines, ratio, reference_file, zip_output in cases:
+            job = TailJob(text_file=text_file, ratio=ratio, num_lines=num_lines, zip_output=zip_output)
+
+            job._sis_setup_directory()
+            job.run()
+
+            assert filecmp.cmp(job.out.get_path(), reference_file.get_path(), shallow=False)
diff --git a/text/processing.py b/text/processing.py
index ef49c6e6..a1c3f4c4 100644
--- a/text/processing.py
+++ b/text/processing.py
@@ -185,7 +185,7 @@ def __init__(self, text_file, num_lines=None, ratio=None, zip_output=True):
         self.ratio = ratio
         self.zip_output = zip_output
 
-        self.out = self.output_path("out.gz")
+        self.out = self.output_path("out.gz") if self.zip_output else self.output_path("out")
         self.length = self.output_var("length")
 
     def tasks(self):
@@ -202,7 +202,7 @@ def run(self):
         if self.ratio:
             assert not self.num_lines
             length = int(self.sh("zcat -f {text_file} | wc -l", True))
-            self.lines = int(length * self.ratio)
+            self.num_lines = int(length * self.ratio)
 
         pipeline = "zcat -f {text_file} | head -n {num_lines}"
         if self.zip_output:
@@ -223,11 +223,17 @@ class TailJob(HeadJob):
 
     def run(self):
         if self.ratio:
-            assert not self.lines
+            assert not self.num_lines
             length = int(self.sh("zcat -f {text_file} | wc -l", True))
-            self.lines = int(length * self.ratio)
+            self.num_lines = int(length * self.ratio)
+
+        pipeline = "zcat -f {text_file} | tail -n {num_lines}"
+        if self.zip_output:
+            pipeline += " | gzip"
+        pipeline += " > {out}"
 
-        self.sh("zcat -f {text_file} | tail -n {num_lines} | gzip > {out}")
+        self.sh(pipeline)
+        self.length.set(self.num_lines)
 
 
 class SetDifferenceJob(Job):