From e90e7ccd2e4b5c3459e8de250b6b91ef9d2e21b4 Mon Sep 17 00:00:00 2001
From: stijndcl <declercq.stijn@outlook.com>
Date: Fri, 8 Mar 2024 13:14:38 +0100
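Subject: [PATCH 1/2] Fix changes from Pieter

Re-enable removal of the temporary directory in clean(). Set IFS to a
newline before the source-processing loop in
download_and_convert_all_sources() and restore the old IFS only after the
loop has finished. Pass the database type to the DAT and XML parsers as a
plain string and remove the now-unused UniprotType enum.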

---
 scripts/build_database.sh                         |  7 +++++--
 .../unipept-database-rs/src/bin/dat-parser.rs     |  5 ++---
 .../unipept-database-rs/src/bin/xml-parser.rs     |  9 ++++-----
 .../unipept-database-rs/src/dat_parser/entry.rs   |  5 ++---
 .../helper_scripts/unipept-database-rs/src/lib.rs |  1 -
 .../unipept-database-rs/src/uniprot/mod.rs        | 15 ---------------
 6 files changed, 13 insertions(+), 29 deletions(-)
 delete mode 100644 scripts/helper_scripts/unipept-database-rs/src/uniprot/mod.rs
diff --git a/scripts/build_database.sh b/scripts/build_database.sh
index 20fea74..eb7e0a2 100755
--- a/scripts/build_database.sh
+++ b/scripts/build_database.sh
@@ -83,7 +83,7 @@ END
 # This function removes all temporary files that have been created by this script.
 clean() {
 	# Clean contents of temporary directory
-#	rm -rf "$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT"
+	rm -rf "$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT"
 	export TMPDIR="$OLD_TMPDIR"
 }
 
@@ -365,7 +365,8 @@ download_and_convert_all_sources() {
   DB_TYPES_ARRAY=($DB_TYPES)
   DB_SOURCES_ARRAY=($DB_SOURCES)
 
-  IFS="$OLDIFS"
+  # Set IFS to newline to properly split the $CHUNKS variable for folders with newlines
+  IFS=$"\n"
 
   while [[ "$IDX" -ne "${#DB_TYPES_ARRAY}" ]] && [[ -n $(echo "${DB_TYPES_ARRAY[$IDX]}" | sed "s/\s//g") ]]
   do
@@ -462,6 +463,8 @@ download_and_convert_all_sources() {
 
     IDX=$((IDX + 1))
   done
+
+  IFS="$OLDIFS"
 }
 
 filter_sources_by_taxa() {
diff --git a/scripts/helper_scripts/unipept-database-rs/src/bin/dat-parser.rs b/scripts/helper_scripts/unipept-database-rs/src/bin/dat-parser.rs
index 007716d..7e9e3c6 100644
--- a/scripts/helper_scripts/unipept-database-rs/src/bin/dat-parser.rs
+++ b/scripts/helper_scripts/unipept-database-rs/src/bin/dat-parser.rs
@@ -2,7 +2,6 @@ use anyhow::{Context, Result};
 use clap::Parser;
 use unipept_database::dat_parser::uniprot_dat_parser;
 use unipept_database::dat_parser::utils::write_header;
-use unipept_database::uniprot::UniprotType;
 
 use unipept_database::utils::files::open_sin;
 
@@ -24,8 +23,8 @@ fn main() -> Result<()> {
 
 #[derive(Parser, Debug)]
 struct Cli {
-    #[clap(value_enum, short = 't', long, default_value_t = UniprotType::Swissprot)]
-    db_type: UniprotType,
+    #[clap(short = 't', long, default_value = "swissprot")]
+    db_type: String,
     #[clap(long, default_value_t = 0)]
     threads: usize,
 }
diff --git a/scripts/helper_scripts/unipept-database-rs/src/bin/xml-parser.rs b/scripts/helper_scripts/unipept-database-rs/src/bin/xml-parser.rs
index 9e102cb..b10ae09 100644
--- a/scripts/helper_scripts/unipept-database-rs/src/bin/xml-parser.rs
+++ b/scripts/helper_scripts/unipept-database-rs/src/bin/xml-parser.rs
@@ -4,7 +4,6 @@ use std::num::NonZeroUsize;
 use anyhow::{Context, Result};
 use clap::Parser;
 use smartstring::{LazyCompact, SmartString};
-use unipept_database::uniprot::UniprotType;
 use uniprot::uniprot::{SequentialParser, ThreadedParser};
 
 use unipept_database::utils::files::open_sin;
@@ -50,8 +49,8 @@ type SmartStr = SmartString<LazyCompact>;
 // Parse a Uniprot XML file and convert it into a TSV-file
 #[derive(Parser, Debug)]
 struct Cli {
-    #[clap(value_enum, short = 't', long, default_value_t = UniprotType::Swissprot)]
-    uniprot_type: UniprotType,
+    #[clap(short = 't', long, default_value = "swissprot")]
+    uniprot_type: String,
     #[clap(long, default_value_t = 0)]
     threads: u32,
     #[clap(short, long, default_value_t = false)]
@@ -123,7 +122,7 @@ fn parse_name(entry: &uniprot::uniprot::Entry) -> SmartStr {
 }
 
 /// Write a single UniProt entry to stdout
-fn write_entry(entry: &uniprot::uniprot::Entry, db_type: &UniprotType, verbose: bool) {
+fn write_entry(entry: &uniprot::uniprot::Entry, db_type: &str, verbose: bool) {
     let accession_number: SmartStr = entry.accessions[0].clone();
     let sequence: SmartStr = entry.sequence.value.clone();
 
@@ -165,7 +164,7 @@ fn write_entry(entry: &uniprot::uniprot::Entry, db_type: &UniprotType, verbose:
         SmartStr::from(ec_references.join(";")),
         SmartStr::from(go_references.join(";")),
         SmartStr::from(ip_references.join(";")),
-        SmartStr::from(db_type.to_str()),
+        SmartStr::from(db_type),
         taxon_id,
     ];
 
diff --git a/scripts/helper_scripts/unipept-database-rs/src/dat_parser/entry.rs b/scripts/helper_scripts/unipept-database-rs/src/dat_parser/entry.rs
index f4e291d..f155b5d 100644
--- a/scripts/helper_scripts/unipept-database-rs/src/dat_parser/entry.rs
+++ b/scripts/helper_scripts/unipept-database-rs/src/dat_parser/entry.rs
@@ -1,4 +1,3 @@
-use crate::uniprot::UniprotType;
 use anyhow::Context;
 use std::collections::HashSet;
 
@@ -56,7 +55,7 @@ impl UniProtDATEntry {
     }
 
     /// Write an entry to stdout
-    pub fn write(&self, db_type: &UniprotType) {
+    pub fn write(&self, db_type: &str) {
         if self.name.is_empty() {
             eprintln!(
                 "Could not find a name for entry AC-{}",
@@ -73,7 +72,7 @@ impl UniProtDATEntry {
             self.ec_references.join(";"),
             self.go_references.join(";"),
             self.ip_references.join(";"),
-            db_type.to_str(),
+            db_type,
             self.taxon_id
         )
     }
diff --git a/scripts/helper_scripts/unipept-database-rs/src/lib.rs b/scripts/helper_scripts/unipept-database-rs/src/lib.rs
index 497dd64..9b309f5 100644
--- a/scripts/helper_scripts/unipept-database-rs/src/lib.rs
+++ b/scripts/helper_scripts/unipept-database-rs/src/lib.rs
@@ -2,5 +2,4 @@ pub mod calculate_lcas;
 pub mod dat_parser;
 pub mod taxons_lineages;
 pub mod taxons_uniprots_tables;
-pub mod uniprot;
 pub mod utils;
diff --git a/scripts/helper_scripts/unipept-database-rs/src/uniprot/mod.rs b/scripts/helper_scripts/unipept-database-rs/src/uniprot/mod.rs
deleted file mode 100644
index ae293ec..0000000
--- a/scripts/helper_scripts/unipept-database-rs/src/uniprot/mod.rs
+++ /dev/null
@@ -1,15 +0,0 @@
-/// Enum for the different kinds of databases
-#[derive(clap::ValueEnum, Clone, Debug)]
-pub enum UniprotType {
-    Swissprot,
-    Trembl,
-}
-
-impl UniprotType {
-    pub fn to_str(&self) -> &str {
-        match self {
-            UniprotType::Swissprot => "swissprot",
-            UniprotType::Trembl => "trembl",
-        }
-    }
-}

From d9630630d537c727275606ebf9aee3223395dfac Mon Sep 17 00:00:00 2001
From: stijndcl <declercq.stijn@outlook.com>
Date: Fri, 8 Mar 2024 13:17:30 +0100
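Subject: [PATCH 2/2] Fix IFS quoting

$"\n" is locale-translation quoting and leaves a literal backslash and "n"
in IFS; use ANSI-C quoting ($'\n') so that IFS is an actual newline.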

---
 scripts/build_database.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/build_database.sh b/scripts/build_database.sh
index eb7e0a2..6c421d8 100755
--- a/scripts/build_database.sh
+++ b/scripts/build_database.sh
@@ -366,7 +366,7 @@ download_and_convert_all_sources() {
   DB_SOURCES_ARRAY=($DB_SOURCES)
 
   # Set IFS to newline to properly split the $CHUNKS variable for folders with newlines
-  IFS=$"\n"
+  IFS=$'\n'
 
   while [[ "$IDX" -ne "${#DB_TYPES_ARRAY}" ]] && [[ -n $(echo "${DB_TYPES_ARRAY[$IDX]}" | sed "s/\s//g") ]]
   do