From df80f075d49a42e69ec72db0cf5580250f5b82c4 Mon Sep 17 00:00:00 2001 From: Kim Soo Hyun Date: Sun, 25 Aug 2024 18:49:01 +0900 Subject: [PATCH 1/2] Change createdb.cpp so that it takes in a ".txt" file containing paths of different fasta files --- src/util/createdb.cpp | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) mode change 100644 => 100755 src/util/createdb.cpp diff --git a/src/util/createdb.cpp b/src/util/createdb.cpp old mode 100644 new mode 100755 index 18d2fc7d0..440b2ed06 --- a/src/util/createdb.cpp +++ b/src/util/createdb.cpp @@ -19,6 +19,28 @@ int createdb(int argc, const char **argv, const Command& command) { std::vector filenames(par.filenames); std::string dataFile = filenames.back(); filenames.pop_back(); + if (Util::endsWith(".txt", filenames[0])) { + if (filenames.size() > 1) { + Debug(Debug::ERROR) << "Only one txt file can be given\n"; + EXIT(EXIT_FAILURE); + } + std::string tsv = filenames.back(); + filenames.pop_back(); + + FILE* file = FileUtil::openFileOrDie(tsv.c_str(), "r", true); + char* line = NULL; + size_t len = 0; + ssize_t read; + while ((read = getline(&line, &len, file)) != -1) { + if (line[read - 1] == '\n') { + line[read - 1] = '\0'; + read--; + } + filenames.push_back(line); + } + free(line); + fclose(file); + } for (size_t i = 0; i < filenames.size(); i++) { if (FileUtil::directoryExists(filenames[i].c_str()) == true) { From 3617ea7153b54f7131366660d570d5c68053d58a Mon Sep 17 00:00:00 2001 From: Kim Soo Hyun Date: Mon, 26 Aug 2024 16:11:29 +0900 Subject: [PATCH 2/2] Change filepath format from txt to tsv. Also add explanations of its usage to createdb and easy-search in MMseqsBase.cpp. 
--- src/MMseqsBase.cpp | 9 ++++++++- src/util/createdb.cpp | 4 ++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/src/MMseqsBase.cpp b/src/MMseqsBase.cpp index 8325f0d4a..c252ade41 100644 --- a/src/MMseqsBase.cpp +++ b/src/MMseqsBase.cpp @@ -11,6 +11,10 @@ std::vector baseCommands = { "Sensitive homology search", "# Search multiple FASTA against FASTA (like BLASTP, TBLASTN, BLASTX, BLASTN --search-type 3, TBLASTX --search-type 2)\n" "mmseqs easy-search examples/QUERY.fasta examples/QUERY.fasta examples/DB.fasta result.m8 tmp\n\n" + "# Search multiple query fasta files against target fasta files using a tsv file containing filepaths\n" + "echo -e \"dir1/QUERY1.fasta\\ndir2/QUERY2.fasta\" > examples/queries.tsv\n" + "echo -e \"dir3/TARGET1.fasta\\ndir4/TARGET2.fasta\" > examples/targets.tsv\n" + "mmseqs easy-search examples/queries.tsv examples/targets.tsv result.m8 tmp\n\n" "# Iterative profile search from stdin (like PSI-BLAST)\n" "cat examples/QUERY.fasta | mmseqs easy-search stdin examples/DB.fasta result.m8 tmp --num-iterations 2\n\n" "# Profile search against small databases (e.g. PFAM, eggNOG)\n" @@ -125,7 +129,10 @@ std::vector baseCommands = { "# Create a seqDB from stdin\n" "cat seq.fasta | mmseqs createdb stdin sequenceDB\n\n" "# Create a seqDB by indexing existing FASTA/Q (for single line fasta entries only)\n" - "mmseqs createdb seq.fasta sequenceDB --createdb-mode 1\n", + "mmseqs createdb seq.fasta sequenceDB --createdb-mode 1\n\n" + "# Create a seqDB from a tsv file containing filepaths of multiple FASTA files in each line\n" + "echo -e \"dir1/bacteria.fasta\\ndir2/archea.fasta.gz\" > filepaths.tsv\n" + "mmseqs createdb filepaths.tsv sequenceDB\n", "Martin Steinegger ", " ... 
| ", CITATION_MMSEQS2, {{"fast[a|q]File[.gz|bz2]|stdin", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA | DbType::VARIADIC, &DbValidator::flatfileStdinAndGeneric }, diff --git a/src/util/createdb.cpp b/src/util/createdb.cpp index 440b2ed06..e1d4ead3d 100755 --- a/src/util/createdb.cpp +++ b/src/util/createdb.cpp @@ -19,9 +19,9 @@ int createdb(int argc, const char **argv, const Command& command) { std::vector filenames(par.filenames); std::string dataFile = filenames.back(); filenames.pop_back(); - if (Util::endsWith(".txt", filenames[0])) { + if (Util::endsWith(".tsv", filenames[0])) { if (filenames.size() > 1) { - Debug(Debug::ERROR) << "Only one txt file can be given\n"; + Debug(Debug::ERROR) << "Only one tsv file can be given\n"; EXIT(EXIT_FAILURE); } std::string tsv = filenames.back();