diff --git a/scripts/helper_scripts/unipept-database-rs/src/bin/taxa-by-chunk.rs b/scripts/helper_scripts/unipept-database-rs/src/bin/taxa-by-chunk.rs new file mode 100644 index 00000000..64a20225 --- /dev/null +++ b/scripts/helper_scripts/unipept-database-rs/src/bin/taxa-by-chunk.rs @@ -0,0 +1,101 @@ +use std::fs::{read_dir, File}; +use std::io::{BufRead, BufWriter, Write}; +use std::path::{Path, PathBuf}; + +use anyhow::{Context, Result}; +use clap::Parser; +use regex::Regex; + +use unipept_database::utils::files::open_sin; + +fn main() -> Result<()> { + let args = Cli::parse(); + + let mut all_taxa: Vec = Vec::new(); + + let reader = open_sin(); + + // Read all taxa ids from stdin + for line in reader.lines() { + let line = line.context("Error reading line from stdin")?; + + // Ignore empty lines + if line.trim().is_empty() { + continue; + } + + let taxa_id: u64 = line + .trim() + .parse() + .with_context(|| format!("Error parsing {line} as an integer"))?; + all_taxa.push(taxa_id); + } + + let chunk_file_regex = Regex::new(r"unipept\..*\.gz").context("Error creating regex")?; + + for entry in read_dir(&args.chunk_dir).context("Error reading chunk directory")? { + let entry = entry.context("Error reading entry from chunk directory")?; + let path = entry.path(); + if !path.is_file() { + continue; + } + + let base_name = match path.file_name() { + None => { + continue; + } + Some(n) => n.to_str().context("Error creating string from file path")?, + }; + + if !chunk_file_regex.is_match(base_name) { + continue; + } + + // Parse the taxa range out of the filename + let replaced_name = base_name.replace("unipept.", "").replace(".chunk.gz", ""); + let range = replaced_name.split_once('-'); + let range = range.with_context(|| format!("Unable to split {replaced_name} on '-'"))?; + let start: u64 = range + .0 + .parse() + .with_context(|| format!("Error parsing {} as an integer", range.0))?; + let end: u64 = range + .1 + .parse() + .with_context(|| format!("Error parsing {} as an integer", range.1))?; + + let matching_taxa: Vec<&u64> = all_taxa + .iter() + .filter(|&t| start <= *t && *t <= end) + .collect(); + + // Write matches to a temporary output file + if !matching_taxa.is_empty() { + let mapped_taxa: Vec = + matching_taxa.iter().map(|&t| format!("\t{t}$")).collect(); + let joined_taxa = mapped_taxa.join("\n"); + + let temp_file_path = Path::new(&args.temp_dir).join(format!("{base_name}.pattern")); + let temp_file = + File::create(&temp_file_path).context("Error creating temporary pattern file")?; + let mut writer = BufWriter::new(temp_file); + write!(&mut writer, "{joined_taxa}",) + .context("Error writing to temporary pattern file")?; + + // The two unwraps here can't be handled using the ? operator + println!("{}", temp_file_path.into_os_string().into_string().unwrap()); + println!("{}", path.into_os_string().into_string().unwrap()); + } + } + + Ok(()) +} + +#[derive(Parser, Debug)] +struct Cli { + #[clap(long)] + chunk_dir: PathBuf, + + #[clap(long)] + temp_dir: PathBuf, +} diff --git a/scripts/helper_scripts/unipept-database-rs/src/bin/write-to-chunk.rs b/scripts/helper_scripts/unipept-database-rs/src/bin/write-to-chunk.rs new file mode 100644 index 00000000..43bdaba2 --- /dev/null +++ b/scripts/helper_scripts/unipept-database-rs/src/bin/write-to-chunk.rs @@ -0,0 +1,86 @@ +use std::fs::File; +use std::io::{BufRead, BufWriter, Write}; +use std::path::{Path, PathBuf}; + +use anyhow::{Context, Result}; +use clap::Parser; + +use unipept_database::utils::files::open_sin; + +fn main() -> Result<()> { + let args = Cli::parse(); + + let mut file_streams: Vec> = Vec::with_capacity(TAXA_BOUNDS.len()); + + // Create writers for the output files + for (idx, bound) in TAXA_BOUNDS.iter().take(TAXA_BOUNDS.len() - 1).enumerate() { + let next = TAXA_BOUNDS[idx + 1]; + let file_name = format!("unipept.{bound}-{next}.chunk"); + let file_path = Path::new(&args.output_dir).join(file_name); + let file_handler = File::create(file_path) + .with_context(|| format!("Unable to create output file {bound}-{next}"))?; + let writer = BufWriter::new(file_handler); + file_streams.push(writer); + } + + let mut reader = open_sin(); + + // First read the header + let mut header: String = String::new(); + reader + .read_line(&mut header) + .context("Error reading header from stdin")?; + write_header(&args.output_dir, header)?; + + // Then the rest of the data + for line in reader.lines() { + let line = line.context("Error reading line from stdin")?; + + if args.verbose { + eprintln!("INFO VERBOSE: writing line to chunk: {line}"); + } + + let spl: Vec<&str> = line.split('\t').collect(); + let taxon_id = spl[8] + .trim() + .parse::() + .with_context(|| format!("Error parsing {} as an integer", spl[8]))?; + + // Find the index of this taxon id in the array + // Note that this can be sped up using binary search (see Python's bisect.bisect_left), + // but this tool is near-instant so we favour readability + let mut index: usize = 0; + while taxon_id > TAXA_BOUNDS[index] { + index += 1; + } + + writeln!(&mut file_streams[index - 1], "{line}").context("Error writing to output file")?; + } + + Ok(()) +} + +#[derive(Parser, Debug)] +struct Cli { + #[clap(short, long)] + output_dir: PathBuf, + #[clap(short, long)] + verbose: bool, +} + +const TAXA_BOUNDS: [usize; 45] = [ + 0, 550, 1352, 3047, 5580, 8663, 11676, 32473, 40214, 52774, 66656, 86630, 116960, 162147, + 210225, 267979, 334819, 408172, 470868, 570509, 673318, 881260, 1046115, 1136135, 1227077, + 1300307, 1410620, 1519492, 1650438, 1756149, 1820614, 1871070, 1898104, 1922217, 1978231, + 2024617, 2026757, 2035430, 2070414, 2202732, 2382165, 2527964, 2601669, 2706029, 10000000, +]; + +fn write_header(output_dir: &PathBuf, header: String) -> Result<()> { + let file_path = Path::new(output_dir).join("db.header"); + let file_handler = File::create(file_path).context("Unable to create header output file")?; + let mut writer = BufWriter::new(file_handler); + + write!(&mut writer, "{}", header).context("Error writing header")?; + + Ok(()) +}