Skip to content

Commit

Permalink
Fix jobs and add tests
Browse files Browse the repository at this point in the history
  • Loading branch information
jairsan committed Dec 4, 2023
1 parent db17195 commit cdfee9e
Show file tree
Hide file tree
Showing 10 changed files with 80 additions and 5 deletions.
Empty file.
4 changes: 4 additions & 0 deletions tests/job_tests/text/files/input_file.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
This is an input file for testing. Sentence 1.
Sentence 2.
Sentence 3.
Sentence 4.
1 change: 1 addition & 0 deletions tests/job_tests/text/files/out.head.1.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is an input file for testing. Sentence 1.
2 changes: 2 additions & 0 deletions tests/job_tests/text/files/out.head.2.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
This is an input file for testing. Sentence 1.
Sentence 2.
Binary file added tests/job_tests/text/files/out.head.2.txt.gz
Binary file not shown.
2 changes: 2 additions & 0 deletions tests/job_tests/text/files/out.tail.2.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Sentence 3.
Sentence 4.
Binary file added tests/job_tests/text/files/out.tail.2.txt.gz
Binary file not shown.
3 changes: 3 additions & 0 deletions tests/job_tests/text/files/out.tail.3.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Sentence 2.
Sentence 3.
Sentence 4.
57 changes: 57 additions & 0 deletions tests/job_tests/text/test_processing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
import filecmp
import tempfile
from sisyphus import setup_path

from i6_core.text.processing import HeadJob, TailJob

# Path factory bound to this test package; the tests below use it to refer to
# the fixture files under "files/" (presumably resolved relative to this
# package's directory -- confirm against sisyphus' setup_path).
Path = setup_path(__package__)


def test_head_job():
    """Run HeadJob on a 4-line fixture and compare against reference outputs.

    Each case is ``(num_lines, ratio, reference_file, zip_output)``; exactly one
    of ``num_lines`` / ``ratio`` is given per case. Ratio cases rely on the job
    truncating ``length * ratio`` to an int (e.g. 0.6 * 4 -> 2 lines).

    The global sisyphus work directory is redirected to a temporary directory
    for the duration of the test and restored afterwards so that other tests
    do not see a path that has already been deleted.
    """
    import gzip

    from sisyphus import gs

    previous_work_dir = gs.WORK_DIR
    with tempfile.TemporaryDirectory() as tmpdir:
        gs.WORK_DIR = tmpdir
        try:
            text_file = Path("files/input_file.txt")

            cases = [
                (None, 0.25, Path("files/out.head.1.txt"), False),
                (None, 0.5, Path("files/out.head.2.txt"), False),
                (None, 0.6, Path("files/out.head.2.txt"), False),
                (2, None, Path("files/out.head.2.txt"), False),
                (1, None, Path("files/out.head.1.txt"), False),
                (None, 0.5, Path("files/out.head.2.txt.gz"), True),
            ]

            for num_lines, ratio, reference_file, zip_output in cases:
                job = HeadJob(text_file=text_file, ratio=ratio, num_lines=num_lines, zip_output=zip_output)

                job._sis_setup_directory()
                job.run()

                produced_path = job.out.get_path()
                expected_path = reference_file.get_path()

                if zip_output:
                    # Compare the decompressed payload: raw gzip bytes depend on
                    # the gzip implementation (header fields, compression level),
                    # so a byte-wise comparison is needlessly brittle.
                    with gzip.open(produced_path, "rb") as produced, gzip.open(expected_path, "rb") as expected:
                        matches = produced.read() == expected.read()
                else:
                    matches = filecmp.cmp(produced_path, expected_path, shallow=False)

                assert matches, (
                    f"HeadJob output differs from {expected_path} "
                    f"(num_lines={num_lines}, ratio={ratio}, zip_output={zip_output})"
                )
        finally:
            # Do not leak the (soon to be removed) temp dir into global settings.
            gs.WORK_DIR = previous_work_dir


def test_tail_job():
    """Run TailJob on a 4-line fixture and compare against reference outputs.

    Each case is ``(num_lines, ratio, reference_file, zip_output)``; exactly one
    of ``num_lines`` / ``ratio`` is given per case. Ratio cases rely on the job
    truncating ``length * ratio`` to an int (e.g. 0.75 * 4 -> 3 lines).

    The global sisyphus work directory is redirected to a temporary directory
    for the duration of the test and restored afterwards so that other tests
    do not see a path that has already been deleted.
    """
    import gzip

    from sisyphus import gs

    previous_work_dir = gs.WORK_DIR
    with tempfile.TemporaryDirectory() as tmpdir:
        gs.WORK_DIR = tmpdir
        try:
            text_file = Path("files/input_file.txt")

            cases = [
                (None, 0.5, Path("files/out.tail.2.txt"), False),
                (None, 0.75, Path("files/out.tail.3.txt"), False),
                (2, None, Path("files/out.tail.2.txt"), False),
                (2, None, Path("files/out.tail.2.txt.gz"), True),
            ]

            for num_lines, ratio, reference_file, zip_output in cases:
                job = TailJob(text_file=text_file, ratio=ratio, num_lines=num_lines, zip_output=zip_output)

                job._sis_setup_directory()
                job.run()

                produced_path = job.out.get_path()
                expected_path = reference_file.get_path()

                if zip_output:
                    # Compare the decompressed payload: raw gzip bytes depend on
                    # the gzip implementation (header fields, compression level),
                    # so a byte-wise comparison is needlessly brittle.
                    with gzip.open(produced_path, "rb") as produced, gzip.open(expected_path, "rb") as expected:
                        matches = produced.read() == expected.read()
                else:
                    matches = filecmp.cmp(produced_path, expected_path, shallow=False)

                assert matches, (
                    f"TailJob output differs from {expected_path} "
                    f"(num_lines={num_lines}, ratio={ratio}, zip_output={zip_output})"
                )
        finally:
            # Do not leak the (soon to be removed) temp dir into global settings.
            gs.WORK_DIR = previous_work_dir
16 changes: 11 additions & 5 deletions text/processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ def __init__(self, text_file, num_lines=None, ratio=None, zip_output=True):
self.ratio = ratio
self.zip_output = zip_output

self.out = self.output_path("out.gz")
self.out = self.output_path("out.gz") if self.zip_output else self.output_path("out")
self.length = self.output_var("length")

def tasks(self):
Expand All @@ -202,7 +202,7 @@ def run(self):
if self.ratio:
assert not self.num_lines
length = int(self.sh("zcat -f {text_file} | wc -l", True))
self.lines = int(length * self.ratio)
self.num_lines = int(length * self.ratio)

pipeline = "zcat -f {text_file} | head -n {num_lines}"
if self.zip_output:
Expand All @@ -223,11 +223,17 @@ class TailJob(HeadJob):

def run(self):
if self.ratio:
assert not self.lines
assert not self.num_lines
length = int(self.sh("zcat -f {text_file} | wc -l", True))
self.lines = int(length * self.ratio)
self.num_lines = int(length * self.ratio)

pipeline = "zcat -f {text_file} | tail -n {num_lines}"
if self.zip_output:
pipeline += " | gzip"
pipeline += " > {out}"

self.sh("zcat -f {text_file} | tail -n {num_lines} | gzip > {out}")
self.sh(pipeline)
self.length.set(self.num_lines)


class SetDifferenceJob(Job):
Expand Down

0 comments on commit cdfee9e

Please sign in to comment.