diff --git a/README.md b/README.md index 4c1bf3223..28a521f03 100644 --- a/README.md +++ b/README.md @@ -86,6 +86,8 @@ $ dorado basecaller hac pod5s/ --resume-from incomplete.bam > calls.bam ### DNA Adapter and primer trimming +The dorado software can detect and remove any adapter and/or primer sequences from the beginning and end of DNA reads. Note that if you intend to demux the reads at some later time, trimming adapters and primers may result in some portions of the flanking regions of the barcodes being removed, which could negatively impact demuxing. + #### In-line with basecalling By default, `dorado basecaller` will attempt to detect any adapter or primer sequences at the beginning and ending of reads, and remove them from the output sequence. @@ -99,6 +101,8 @@ The `--trim` option takes as its argument one of the following values: * `adapters` This will result in any detected adapters being trimmed, but primers will not be trimmed, and if barcoding is enabled then barcodes will not be trimmed either. * `none` This is the same as using the --no-trim option. Nothing will be trimmed. +If adapter/primer trimming is done in-line with basecalling in combination with demuxing, then the software will automatically make sure that the trimming of adapters and primers does not interfere with the demuxing process. However, if you intend to do demuxing later as a separate step, then it is recommended that you disable adapter/primer trimming when basecalling with the `--no-trim` option, to insure that any barcode sequences remain completely intact in the reads. + #### Trimming existing datasets Existing basecalled datasets can be scanned for adapter and/or primer sequences at either end, and trim any such found sequences. To do this, run: @@ -111,6 +115,8 @@ $ dorado trim > trimmed.bam The `--no-trim-primers` option can be used to prevent the trimming of primer sequences. In this case only adapter sequences will be trimmed. +If it is also your intention to demux the data, then it is recommended that you do that before trimming any adapters and primers, as trimming adapters and primers first may result in the demux software being unable to classify the barcodes properly. + ### RNA Adapter trimming Adapters for RNA002 and RNA004 kits are automatically trimmed during basecalling. However, unlike in DNA, the RNA adapter cannot be trimmed post-basecalling. diff --git a/dorado/cli/basecaller.cpp b/dorado/cli/basecaller.cpp index 94cea9c15..7dcf7437c 100644 --- a/dorado/cli/basecaller.cpp +++ b/dorado/cli/basecaller.cpp @@ -165,17 +165,17 @@ void setup(std::vector args, {current_sink_node}, std::thread::hardware_concurrency(), is_rna_model(model_config), 1000); } + if (adapter_trimming_enabled) { + current_sink_node = pipeline_desc.add_node( + {current_sink_node}, thread_allocations.adapter_threads, !adapter_no_trim, + !primer_no_trim); + } if (barcode_enabled) { current_sink_node = pipeline_desc.add_node( {current_sink_node}, thread_allocations.barcoder_threads, barcode_kits, barcode_both_ends, barcode_no_trim, std::move(allowed_barcodes), std::move(custom_kit), std::move(custom_seqs)); } - if (adapter_trimming_enabled) { - current_sink_node = pipeline_desc.add_node( - {current_sink_node}, thread_allocations.adapter_threads, !adapter_no_trim, - !primer_no_trim); - } current_sink_node = pipeline_desc.add_node( {current_sink_node}, min_qscore, default_parameters.min_sequence_length, std::unordered_set{}, thread_allocations.read_filter_threads); diff --git a/dorado/demux/AdapterDetector.cpp b/dorado/demux/AdapterDetector.cpp index 2c9d1f6ac..279a19dbf 100644 --- a/dorado/demux/AdapterDetector.cpp +++ b/dorado/demux/AdapterDetector.cpp @@ -61,9 +61,7 @@ const std::vector primers = { {"PCS110_forward", "TCGCCTACCGTGACAAGAAAGTTGTCGGTGTCTTTGTGACTTGCCTGTCGCTCTATCTTCAGAGGAGAGTCCGCCGCCCGCAAGTTT"}, {"PCS110_reverse", "ATCGCCTACCGTGACAAGAAAGTTGTCGGTGTCTTTGTGTTTCTGTTGGTGCTGATATTGCTTT"}, - // Not included because it is too similar to RBK barcode flank - // {"RAD", "GCTTGGGTGTTTAACCGTTTTCGCATTTATCGTGAAACGCTTTCGCGTTTTTCGTGCGCCGCTTCA"} -}; + {"RAD", "GCTTGGGTGTTTAACCGTTTTCGCATTTATCGTGAAACGCTTTCGCGTTTTTCGTGCGCCGCTTCA"}}; } // namespace @@ -208,5 +206,80 @@ AdapterScoreResult AdapterDetector::detect(const std::string& seq, return result; } +void AdapterDetector::check_and_update_barcoding(SimplexRead& read, + std::pair& trim_interval) { + // If barcoding has been done, we may need to make some adjustments. + if (!read.read_common.barcoding_result) { + return; + } + int post_barcode_seq_len = int(read.read_common.pre_trim_seq_length); + if (read.read_common.barcode_trim_interval != std::pair(0, 0)) { + post_barcode_seq_len = read.read_common.barcode_trim_interval.second - + read.read_common.barcode_trim_interval.first; + } + bool front_barcode_trimmed = (read.read_common.barcode_trim_interval.first > 0); + bool rear_barcode_trimmed = (read.read_common.barcode_trim_interval.second > 0 && + read.read_common.barcode_trim_interval.second < + int(read.read_common.pre_trim_seq_length)); + + if (trim_interval.first > 0) { + // An adapter or primer was found at the beginning of the read. + // If any barcodes were found, their position details will need to be updated + // so that they refer to the position in the trimmed read. If the barcode + // overlaps the region we are planning to trim, then this probably means that + // the barcode was misidentified as a primer, so we should not trim it. + if (read.read_common.barcoding_result) { + auto& barcode_result = *read.read_common.barcoding_result; + if (barcode_result.barcode_name != "unclassified") { + if (front_barcode_trimmed) { + // We've already trimmed a front barcode. Adapters and primers do not appear after barcodes, so + // we should ignore this. + trim_interval.first = 0; + } else { + if (barcode_result.top_barcode_pos != std::pair(-1, -1)) { + // We have detected, but not trimmed, a front barcode. + if (barcode_result.top_barcode_pos.first < trim_interval.first) { + // We've misidentified the barcode as a primer. Ignore it. + trim_interval.first = 0; + } else { + // Update the position to correspond to the trimmed sequence. + barcode_result.top_barcode_pos.first -= trim_interval.first; + barcode_result.top_barcode_pos.second -= trim_interval.first; + } + } + if (barcode_result.bottom_barcode_pos != std::pair(-1, -1) && + !rear_barcode_trimmed) { + // We have detected a rear barcode, and have not trimmed barcodes, so we need to update + // the rear barcode position to correspond to the sequence which has now had a front adapter + // and/or primer trimmed from it. + barcode_result.bottom_barcode_pos.first -= trim_interval.first; + barcode_result.bottom_barcode_pos.second -= trim_interval.first; + } + } + } + } + } + if (trim_interval.second > 0 && trim_interval.second != post_barcode_seq_len) { + // An adapter or primer was found at the end of the read. + // This does not require any barcode positions to be updated, but if the + // barcode overlaps the region we are planning to trim, then this probably + // means that the barcode was misidentified as a primer, so we should not + // trim it. + if (read.read_common.barcoding_result) { + auto& barcode_result = *read.read_common.barcoding_result; + if (barcode_result.barcode_name != "unclassified") { + if (rear_barcode_trimmed) { + // We've already trimmed a rear barcode. Adapters and primers do not appear before rear barcodes, + // so we should ignore this. + trim_interval.second = post_barcode_seq_len; + } else if (barcode_result.bottom_barcode_pos.second > trim_interval.second) { + // We've misidentified the rear barcode as a primer. Ignore it. + trim_interval.second = post_barcode_seq_len; + } + } + } + } +} + } // namespace demux } // namespace dorado diff --git a/dorado/demux/AdapterDetector.h b/dorado/demux/AdapterDetector.h index 36f751df1..aa45f3df8 100644 --- a/dorado/demux/AdapterDetector.h +++ b/dorado/demux/AdapterDetector.h @@ -1,4 +1,5 @@ #pragma once +#include "read_pipeline/ReadPipeline.h" #include "utils/stats.h" #include "utils/types.h" @@ -28,6 +29,8 @@ class AdapterDetector { const std::vector& get_adapter_sequences() const; const std::vector& get_primer_sequences() const; + static void check_and_update_barcoding(SimplexRead& read, std::pair& trim_interval); + private: enum QueryType { ADAPTER, PRIMER }; diff --git a/dorado/read_pipeline/AdapterDetectorNode.cpp b/dorado/read_pipeline/AdapterDetectorNode.cpp index 512a02544..5bbbfaedd 100644 --- a/dorado/read_pipeline/AdapterDetectorNode.cpp +++ b/dorado/read_pipeline/AdapterDetectorNode.cpp @@ -135,6 +135,7 @@ void AdapterDetectorNode::process_read(SimplexRead& read) { trim_interval.first, trim_interval.second, read.read_common.seq); return; } + demux::AdapterDetector::check_and_update_barcoding(read, trim_interval); Trimmer::trim_sequence(read, trim_interval); read.read_common.adapter_trim_interval = trim_interval; } diff --git a/dorado/read_pipeline/ReadPipeline.h b/dorado/read_pipeline/ReadPipeline.h index 568fd2c7d..adad50457 100644 --- a/dorado/read_pipeline/ReadPipeline.h +++ b/dorado/read_pipeline/ReadPipeline.h @@ -56,7 +56,7 @@ class ReadCommon { std::shared_ptr adapter_info; std::shared_ptr barcoding_info; - std::shared_ptr barcoding_result; + std::shared_ptr barcoding_result; std::size_t pre_trim_seq_length{}; std::pair adapter_trim_interval{}; std::pair barcode_trim_interval{}; diff --git a/tests/AdapterDetectorTest.cpp b/tests/AdapterDetectorTest.cpp index a45befe49..7ee41ac0b 100644 --- a/tests/AdapterDetectorTest.cpp +++ b/tests/AdapterDetectorTest.cpp @@ -2,6 +2,7 @@ #include "MessageSinkUtils.h" #include "TestUtils.h" +#include "demux/Trimmer.h" #include "read_pipeline/AdapterDetectorNode.h" #include "read_pipeline/HtsReader.h" #include "utils/bam_utils.h" @@ -108,6 +109,135 @@ TEST_CASE("AdapterDetector: test primer detection", TEST_GROUP) { } } +void detect_and_trim(SimplexRead& read) { + demux::AdapterDetector detector; + auto seqlen = int(read.read_common.seq.length()); + std::pair adapter_trim_interval = {0, seqlen}; + std::pair primer_trim_interval = {0, seqlen}; + + auto adapter_res = detector.find_adapters(read.read_common.seq); + adapter_trim_interval = Trimmer::determine_trim_interval(adapter_res, seqlen); + auto primer_res = detector.find_primers(read.read_common.seq); + primer_trim_interval = Trimmer::determine_trim_interval(primer_res, seqlen); + std::pair trim_interval = adapter_trim_interval; + trim_interval.first = std::max(trim_interval.first, primer_trim_interval.first); + trim_interval.second = std::min(trim_interval.second, primer_trim_interval.second); + CHECK(trim_interval.first < trim_interval.second); + demux::AdapterDetector::check_and_update_barcoding(read, trim_interval); + Trimmer::trim_sequence(read, trim_interval); + read.read_common.adapter_trim_interval = trim_interval; +} + +TEST_CASE( + "AdapterDetector: check trimming when adapter/primer trimming is combined with barcode " + "detection.", + TEST_GROUP) { + using Catch::Matchers::Equals; + + const std::string seq = std::string(200, 'A'); + demux::AdapterDetector detector; + const auto& adapters = detector.get_adapter_sequences(); + const auto& primers = detector.get_primer_sequences(); + const auto& front_adapter = adapters[1].sequence; + const auto& front_primer = primers[2].sequence; + const auto& rear_adapter = adapters[1].sequence_rev; + const auto& rear_primer = primers[2].sequence_rev; + std::string front_barcode = "CCCCCCCCCC"; + std::string rear_barcode = "GGGGGGGGGG"; + const int stride = 6; + + { + // Test case where barcode detection has been done, but barcodes were not trimmed. + // Make sure that barcode results are updated to reflect their position in the sequence + // after the front adapter and primer have been trimmed. + SimplexRead read; + read.read_common.seq = front_adapter + front_primer + front_barcode + seq + rear_barcode + + rear_primer + rear_adapter; + read.read_common.qstring = std::string(read.read_common.seq.length(), '!'); + read.read_common.read_id = "read_id"; + read.read_common.model_stride = stride; + read.read_common.num_trimmed_samples = 0; + read.read_common.pre_trim_seq_length = read.read_common.seq.length(); + + std::vector moves; + for (size_t i = 0; i < read.read_common.seq.length(); i++) { + moves.push_back(1); + moves.push_back(0); + } + read.read_common.moves = moves; + read.read_common.raw_data = at::zeros(moves.size() * stride); + + const auto flank_size = front_adapter.length() + front_primer.length(); + const int additional_trimmed_samples = + int(stride * 2 * flank_size); // * 2 is because we have 2 moves per base + + // Add in barcoding information. + read.read_common.barcoding_result = std::make_shared(); + auto& barcode_results = *read.read_common.barcoding_result; + barcode_results.barcode_name = "fake_barcode"; + int front_barcode_start = int(front_adapter.length() + front_primer.length()); + int front_barcode_end = front_barcode_start + int(front_barcode.length()); + int rear_barcode_start = front_barcode_end + int(seq.length()); + int rear_barcode_end = rear_barcode_start + int(rear_barcode.length()); + barcode_results.top_barcode_pos = {front_barcode_start, front_barcode_end}; + barcode_results.bottom_barcode_pos = {rear_barcode_start, rear_barcode_end}; + + detect_and_trim(read); + std::string expected_trimmed_seq = front_barcode + seq + rear_barcode; + CHECK(read.read_common.seq == expected_trimmed_seq); + CHECK(read.read_common.num_trimmed_samples == uint64_t(additional_trimmed_samples)); + int expected_front_barcode_start = 0; + int expected_front_barcode_end = int(front_barcode.length()); + int expected_rear_barcode_start = expected_front_barcode_end + int(seq.length()); + int expected_rear_barcode_end = expected_rear_barcode_start + int(rear_barcode.length()); + CHECK(barcode_results.top_barcode_pos == + std::pair(expected_front_barcode_start, expected_front_barcode_end)); + CHECK(barcode_results.bottom_barcode_pos == + std::pair(expected_rear_barcode_start, expected_rear_barcode_end)); + } + { + // Test case where barcode detection has been done, but barcodes were not trimmed. + // In this case the detected adapter/primer overlaps what was detected as the barcode. + // The code should therefore not trim anything. + SimplexRead read; + read.read_common.seq = front_adapter + front_primer + seq + rear_primer + rear_adapter; + read.read_common.qstring = std::string(read.read_common.seq.length(), '!'); + read.read_common.read_id = "read_id"; + read.read_common.model_stride = stride; + read.read_common.num_trimmed_samples = 0; + read.read_common.pre_trim_seq_length = read.read_common.seq.length(); + + std::vector moves; + for (size_t i = 0; i < read.read_common.seq.length(); i++) { + moves.push_back(1); + moves.push_back(0); + } + read.read_common.moves = moves; + read.read_common.raw_data = at::zeros(moves.size() * stride); + + // Add in barcoding information. + read.read_common.barcoding_result = std::make_shared(); + auto& barcode_results = *read.read_common.barcoding_result; + barcode_results.barcode_name = "fake_barcode"; + int front_barcode_start = 5; + int front_barcode_end = 15; + int rear_barcode_start = + int(front_adapter.length() + front_primer.length() + seq.length()) + 5; + int rear_barcode_end = rear_barcode_start + 10; + barcode_results.top_barcode_pos = {front_barcode_start, front_barcode_end}; + barcode_results.bottom_barcode_pos = {rear_barcode_start, rear_barcode_end}; + + std::string expected_trimmed_seq = read.read_common.seq; // Nothing should get trimmed. + detect_and_trim(read); + CHECK(read.read_common.seq == expected_trimmed_seq); + CHECK(read.read_common.num_trimmed_samples == uint64_t(0)); + CHECK(barcode_results.top_barcode_pos == + std::pair(front_barcode_start, front_barcode_end)); + CHECK(barcode_results.bottom_barcode_pos == + std::pair(rear_barcode_start, rear_barcode_end)); + } +} + TEST_CASE( "AdapterDetectorNode: check read messages are correctly updated after adapter/primer " "detection and trimming",