From e90e7ccd2e4b5c3459e8de250b6b91ef9d2e21b4 Mon Sep 17 00:00:00 2001 From: stijndcl Date: Fri, 8 Mar 2024 13:14:38 +0100 Subject: [PATCH 1/2] Fix changes from Pieter --- scripts/build_database.sh | 7 +++++-- .../unipept-database-rs/src/bin/dat-parser.rs | 5 ++--- .../unipept-database-rs/src/bin/xml-parser.rs | 9 ++++----- .../unipept-database-rs/src/dat_parser/entry.rs | 5 ++--- .../helper_scripts/unipept-database-rs/src/lib.rs | 1 - .../unipept-database-rs/src/uniprot/mod.rs | 15 --------------- 6 files changed, 13 insertions(+), 29 deletions(-) delete mode 100644 scripts/helper_scripts/unipept-database-rs/src/uniprot/mod.rs diff --git a/scripts/build_database.sh b/scripts/build_database.sh index 20fea74..eb7e0a2 100755 --- a/scripts/build_database.sh +++ b/scripts/build_database.sh @@ -83,7 +83,7 @@ END # This function removes all temporary files that have been created by this script. clean() { # Clean contents of temporary directory -# rm -rf "$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT" + rm -rf "$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT" export TMPDIR="$OLD_TMPDIR" } @@ -365,7 +365,8 @@ download_and_convert_all_sources() { DB_TYPES_ARRAY=($DB_TYPES) DB_SOURCES_ARRAY=($DB_SOURCES) - IFS="$OLDIFS" + # Set IFS to newline to properly split the $CHUNKS variable for folders with newlines + IFS=$"\n" while [[ "$IDX" -ne "${#DB_TYPES_ARRAY}" ]] && [[ -n $(echo "${DB_TYPES_ARRAY[$IDX]}" | sed "s/\s//g") ]] do @@ -462,6 +463,8 @@ download_and_convert_all_sources() { IDX=$((IDX + 1)) done + + IFS="$OLDIFS" } filter_sources_by_taxa() { diff --git a/scripts/helper_scripts/unipept-database-rs/src/bin/dat-parser.rs b/scripts/helper_scripts/unipept-database-rs/src/bin/dat-parser.rs index 007716d..7e9e3c6 100644 --- a/scripts/helper_scripts/unipept-database-rs/src/bin/dat-parser.rs +++ b/scripts/helper_scripts/unipept-database-rs/src/bin/dat-parser.rs @@ -2,7 +2,6 @@ use anyhow::{Context, Result}; use clap::Parser; use unipept_database::dat_parser::uniprot_dat_parser; use unipept_database::dat_parser::utils::write_header; -use unipept_database::uniprot::UniprotType; use unipept_database::utils::files::open_sin; @@ -24,8 +23,8 @@ fn main() -> Result<()> { #[derive(Parser, Debug)] struct Cli { - #[clap(value_enum, short = 't', long, default_value_t = UniprotType::Swissprot)] - db_type: UniprotType, + #[clap(short = 't', long, default_value = "swissprot")] + db_type: String, #[clap(long, default_value_t = 0)] threads: usize, } diff --git a/scripts/helper_scripts/unipept-database-rs/src/bin/xml-parser.rs b/scripts/helper_scripts/unipept-database-rs/src/bin/xml-parser.rs index 9e102cb..b10ae09 100644 --- a/scripts/helper_scripts/unipept-database-rs/src/bin/xml-parser.rs +++ b/scripts/helper_scripts/unipept-database-rs/src/bin/xml-parser.rs @@ -4,7 +4,6 @@ use std::num::NonZeroUsize; use anyhow::{Context, Result}; use clap::Parser; use smartstring::{LazyCompact, SmartString}; -use unipept_database::uniprot::UniprotType; use uniprot::uniprot::{SequentialParser, ThreadedParser}; use unipept_database::utils::files::open_sin; @@ -50,8 +49,8 @@ type SmartStr = SmartString; // Parse a Uniprot XML file and convert it into a TSV-file #[derive(Parser, Debug)] struct Cli { - #[clap(value_enum, short = 't', long, default_value_t = UniprotType::Swissprot)] - uniprot_type: UniprotType, + #[clap(short = 't', long, default_value = "swissprot")] + uniprot_type: String, #[clap(long, default_value_t = 0)] threads: u32, #[clap(short, long, default_value_t = false)] @@ -123,7 +122,7 @@ fn parse_name(entry: &uniprot::uniprot::Entry) -> SmartStr { } /// Write a single UniProt entry to stdout -fn write_entry(entry: &uniprot::uniprot::Entry, db_type: &UniprotType, verbose: bool) { +fn write_entry(entry: &uniprot::uniprot::Entry, db_type: &str, verbose: bool) { let accession_number: SmartStr = entry.accessions[0].clone(); let sequence: SmartStr = entry.sequence.value.clone(); @@ -165,7 +164,7 @@ fn write_entry(entry: &uniprot::uniprot::Entry, db_type: &UniprotType, verbose: SmartStr::from(ec_references.join(";")), SmartStr::from(go_references.join(";")), SmartStr::from(ip_references.join(";")), - SmartStr::from(db_type.to_str()), + SmartStr::from(db_type), taxon_id, ]; diff --git a/scripts/helper_scripts/unipept-database-rs/src/dat_parser/entry.rs b/scripts/helper_scripts/unipept-database-rs/src/dat_parser/entry.rs index f4e291d..f155b5d 100644 --- a/scripts/helper_scripts/unipept-database-rs/src/dat_parser/entry.rs +++ b/scripts/helper_scripts/unipept-database-rs/src/dat_parser/entry.rs @@ -1,4 +1,3 @@ -use crate::uniprot::UniprotType; use anyhow::Context; use std::collections::HashSet; @@ -56,7 +55,7 @@ impl UniProtDATEntry { } /// Write an entry to stdout - pub fn write(&self, db_type: &UniprotType) { + pub fn write(&self, db_type: &str) { if self.name.is_empty() { eprintln!( "Could not find a name for entry AC-{}", @@ -73,7 +72,7 @@ impl UniProtDATEntry { self.ec_references.join(";"), self.go_references.join(";"), self.ip_references.join(";"), - db_type.to_str(), + db_type, self.taxon_id ) } diff --git a/scripts/helper_scripts/unipept-database-rs/src/lib.rs b/scripts/helper_scripts/unipept-database-rs/src/lib.rs index 497dd64..9b309f5 100644 --- a/scripts/helper_scripts/unipept-database-rs/src/lib.rs +++ b/scripts/helper_scripts/unipept-database-rs/src/lib.rs @@ -2,5 +2,4 @@ pub mod calculate_lcas; pub mod dat_parser; pub mod taxons_lineages; pub mod taxons_uniprots_tables; -pub mod uniprot; pub mod utils; diff --git a/scripts/helper_scripts/unipept-database-rs/src/uniprot/mod.rs b/scripts/helper_scripts/unipept-database-rs/src/uniprot/mod.rs deleted file mode 100644 index ae293ec..0000000 --- a/scripts/helper_scripts/unipept-database-rs/src/uniprot/mod.rs +++ /dev/null @@ -1,15 +0,0 @@ -/// Enum for the different kinds of databases -#[derive(clap::ValueEnum, Clone, Debug)] -pub enum UniprotType { - Swissprot, - Trembl, -} - -impl UniprotType { - pub fn to_str(&self) -> &str { - match self { - UniprotType::Swissprot => "swissprot", - UniprotType::Trembl => "trembl", - } - } -} From d9630630d537c727275606ebf9aee3223395dfac Mon Sep 17 00:00:00 2001 From: stijndcl Date: Fri, 8 Mar 2024 13:17:30 +0100 Subject: [PATCH 2/2] Fix ifs quoting --- scripts/build_database.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/build_database.sh b/scripts/build_database.sh index eb7e0a2..6c421d8 100755 --- a/scripts/build_database.sh +++ b/scripts/build_database.sh @@ -366,7 +366,7 @@ download_and_convert_all_sources() { DB_SOURCES_ARRAY=($DB_SOURCES) # Set IFS to newline to properly split the $CHUNKS variable for folders with newlines - IFS=$"\n" + IFS=$'\n' while [[ "$IDX" -ne "${#DB_TYPES_ARRAY}" ]] && [[ -n $(echo "${DB_TYPES_ARRAY[$IDX]}" | sed "s/\s//g") ]] do