Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: junctions extractor count overlapping read pairs once #196

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ include(TestHelper)

#versioning stuff
set (regtools_VERSION_MAJOR 1)
set (regtools_VERSION_MINOR 0)
set (regtools_VERSION_MINOR 1)
set (regtools_VERSION_PATCH 0)

configure_file (
Expand Down
24 changes: 22 additions & 2 deletions src/cis-splice-effects/cis_splice_effects_identifier.cc
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,13 @@ void CisSpliceEffectsIdentifier::usage(ostream& out) {
out << "\t\t" << "-t STR\tTag used in bam to label strand. [XS]" << endl;
out << "\t\t" << "-a INT\tMinimum anchor length. Junctions which satisfy a minimum \n"
<< "\t\t\t " << "anchor length on both sides are reported. [8]" << endl;
out << "\t\t" << "-A INT\tMinimum read anchor length. Reads which satisfy a minimum \n"
<< "\t\t\t " << "anchor length on both sides 'support' a junction. [0]" << endl;
out << "\t\t" << "-m INT\tMinimum intron length. [70]" << endl;
out << "\t\t" << "-M INT\tMaximum intron length. [500000]" << endl;
out << "\t\t" << "-f INT\tOnly use alignments where all flag bits set here are set. [0]" << endl;
out << "\t\t" << "-F INT\tOnly use alignments where no flag bits set here are set. [0]" << endl;
out << "\t\t" << "-q INT\tOnly use alignments with this mapping quality or above. [0]" << endl;
out << "\t\t" << "-w INT\tWindow size in b.p to identify splicing events in.\n"
<< "\t\t\t " << "The tool identifies events in variant.start +/- w basepairs.\n"
<< "\t\t\t " << "Default behaviour is to look at the window between previous and next exons." << endl;
Expand Down Expand Up @@ -113,7 +118,7 @@ void CisSpliceEffectsIdentifier::parse_options(int argc, char* argv[]) {
optind = 1; //Reset before parsing again.
stringstream help_ss;
char c;
while((c = getopt(argc, argv, "o:w:v:j:e:Ei:ISht:s:a:m:M:b:C")) != -1) {
while((c = getopt(argc, argv, "o:w:v:j:e:Ei:ISht:s:a:A:m:M:f:F:q:b:C")) != -1) {
switch(c) {
case 'o':
output_file_ = string(optarg);
Expand Down Expand Up @@ -164,12 +169,24 @@ void CisSpliceEffectsIdentifier::parse_options(int argc, char* argv[]) {
case 'a':
min_anchor_length_ = atoi(optarg);
break;
case 'A':
min_read_anchor_length_ = atoi(optarg);
break;
case 'm':
min_intron_length_ = atoi(optarg);
break;
case 'M':
max_intron_length_ = atoi(optarg);
break;
case 'f':
require_flags_ = atoi(optarg);
break;
case 'F':
filter_flags_ = atoi(optarg);
break;
case 'q':
min_map_qual_ = atoi(optarg);
break;
case 'b':
output_barcodes_file_ = string(optarg);
break;
Expand Down Expand Up @@ -285,7 +302,10 @@ void CisSpliceEffectsIdentifier::identify() {
} else {
ref_to_pass = "NA";
}
JunctionsExtractor je1(bam_, variant_region, strandness_, strand_tag_, min_anchor_length_, min_intron_length_, max_intron_length_, ref_to_pass);
JunctionsExtractor je1(bam_, variant_region, strandness_,
strand_tag_, min_anchor_length_, min_read_anchor_length_,
min_intron_length_, max_intron_length_,
filter_flags_, require_flags_, min_map_qual_, ref_to_pass);
je1.identify_junctions_from_BAM();
vector<Junction> junctions = je1.get_all_junctions();
//Add all the junctions to the unique set
Expand Down
15 changes: 13 additions & 2 deletions src/cis-splice-effects/cis_splice_effects_identifier.h
Original file line number Diff line number Diff line change
Expand Up @@ -87,13 +87,20 @@ class CisSpliceEffectsIdentifier {
//tag used in BAM to denote strand, default "XS"
string strand_tag_;
//Minimum anchor length for junctions
//Junctions need atleast this many bp overlap
// on both ends.
//Junctions need at least this many bp overlap on both ends.
uint32_t min_anchor_length_;
//Reads need at least this many bp overlap on both ends to support junction.
uint32_t min_read_anchor_length_;
//Minimum length of an intron, i.e min junction width
uint32_t min_intron_length_;
//Maximum length of an intron, i.e max junction width
uint32_t max_intron_length_;
//filter reads containing any of these flags
uint16_t filter_flags_;
// filter reads not containing all of these flags
uint16_t require_flags_;
// filter reads below the minimum mapping quality
uint8_t min_map_qual_;
//whether to override strand of extracted junctions using intron-motif method
bool override_strand_with_canonical_intron_motif_;
public:
Expand All @@ -112,8 +119,12 @@ class CisSpliceEffectsIdentifier {
strandness_(-1),
strand_tag_("XS"),
min_anchor_length_(8),
min_read_anchor_length_(0),
min_intron_length_(70),
max_intron_length_(500000),
filter_flags_(0),
require_flags_(0),
min_map_qual_(0),
override_strand_with_canonical_intron_motif_(false) {}
//Destructor
~CisSpliceEffectsIdentifier() {
Expand Down
111 changes: 78 additions & 33 deletions src/junctions/junctions_extractor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -43,20 +43,32 @@ int JunctionsExtractor::parse_options(int argc, char *argv[]) {
optind = 1; //Reset before parsing again.
int c;
stringstream help_ss;
while((c = getopt(argc, argv, "ha:m:M:o:r:t:s:b:")) != -1) {
while((c = getopt(argc, argv, "ha:A:m:M:f:F:q:o:r:t:s:b:")) != -1) {
switch(c) {
case 'h':
usage(help_ss);
throw common::cmdline_help_exception(help_ss.str());
case 'a':
min_anchor_length_ = atoi(optarg);
break;
case 'A':
min_read_anchor_length_ = atoi(optarg);
break;
case 'm':
min_intron_length_ = atoi(optarg);
break;
case 'M':
max_intron_length_ = atoi(optarg);
break;
case 'f':
require_flags_ = atoi(optarg);
break;
case 'F':
filter_flags_ = atoi(optarg);
break;
case 'q':
min_map_qual_ = atoi(optarg);
break;
case 'o':
output_file_ = string(optarg);
break;
Expand Down Expand Up @@ -108,10 +120,18 @@ int JunctionsExtractor::parse_options(int argc, char *argv[]) {
throw runtime_error("Strandness mode 'intron-motif' requires a fasta file!\n\n");
}
}
if ( (require_flags_ & filter_flags_) != 0) {
usage();
throw runtime_error("No overlap allowed between '-f' and '-F' options (same flag filtered and required)!\n\n");
}

cerr << "Minimum junction anchor length: " << min_anchor_length_ << endl;
cerr << "Minimum read anchor length: " << min_read_anchor_length_ << endl;
cerr << "Minimum intron length: " << min_intron_length_ << endl;
cerr << "Maximum intron length: " << max_intron_length_ << endl;
cerr << "Require alignment flags: " << require_flags_ << endl;
cerr << "Filter alignment flags: " << filter_flags_ << endl;
cerr << "Minimum alignment mapping quality: " << int(min_map_qual_) << endl;
cerr << "Alignment: " << bam_ << endl;
cerr << "Output file: " << output_file_ << endl;
if (output_barcodes_file_ != "NA"){
Expand All @@ -128,8 +148,13 @@ int JunctionsExtractor::usage(ostream& out) {
out << "Options:" << endl;
out << "\t\t" << "-a INT\tMinimum anchor length. Junctions which satisfy a minimum \n"
<< "\t\t\t " << "anchor length on both sides are reported. [8]" << endl;
out << "\t\t" << "-A INT\tMinimum read anchor length. Reads which satisfy a minimum \n"
<< "\t\t\t " << "anchor length on both sides 'support' a junction. [0]" << endl;
out << "\t\t" << "-m INT\tMinimum intron length. [70]" << endl;
out << "\t\t" << "-M INT\tMaximum intron length. [500000]" << endl;
out << "\t\t" << "-f INT\tOnly use alignments where all flag bits set here are set. [0]" << endl;
out << "\t\t" << "-F INT\tOnly use alignments where no flag bits set here are set. [0]" << endl;
out << "\t\t" << "-q INT\tOnly use alignments with this mapping quality or above. [0]" << endl;
out << "\t\t" << "-o FILE\tThe file to write output to. [STDOUT]" << endl;
out << "\t\t" << "-r STR\tThe region to identify junctions \n"
<< "\t\t\t " << "in \"chr:start-end\" format. Entire BAM by default." << endl;
Expand All @@ -156,12 +181,19 @@ string JunctionsExtractor::get_new_junction_name() {
return name_ss.str();
}

//Do some basic qc on the junction
//Update if junction passes QC based on current read alignment
bool JunctionsExtractor::junction_qc(Junction &j1) {
// don't add support for junction if intron is wrong size
if(j1.end - j1.start < min_intron_length_ ||
j1.end - j1.start > max_intron_length_) {
return false;
}

// don't add support for junction if read isn't sufficiently anchored
if(j1.start - j1.thick_start < min_read_anchor_length_) return false;
if(j1.thick_end - j1.end < min_read_anchor_length_) return false;

// add support, update if this junction is sufficiently anchored
if(j1.start - j1.thick_start >= min_anchor_length_)
j1.has_left_min_anchor = true;
if(j1.thick_end - j1.end >= min_anchor_length_)
Expand All @@ -170,7 +202,8 @@ bool JunctionsExtractor::junction_qc(Junction &j1) {
}

//Add a junction to the junctions map
//The read_count field is the number of reads supporting the junction.
//The read_count field is the number of reads supporting the junction,
// and reads is a set of the read names supporting the junction
int JunctionsExtractor::add_junction(Junction j1) {
//Check junction_qc
if(!junction_qc(j1)) {
Expand All @@ -193,44 +226,46 @@ int JunctionsExtractor::add_junction(Junction j1) {
}
string key = j1.chrom + string(":") + start + "-" + end + ":" + strand_proxy;

//Check if new junction
//add new junction
if(!junctions_.count(key)) {
j1.name = get_new_junction_name();
j1.read_count = 1;
j1.score = common::num_to_str(j1.read_count);
} else { //existing junction
Junction j0 = junctions_[key];

junctions_[key] = j1;

} else { //update existing junction

if (output_barcodes_file_ != "NA"){
unordered_map<string, int>::const_iterator it = j0.barcodes.find(j1.barcodes.begin()->first);

if (it != j0.barcodes.end()) {// barcode exists already
j1.barcodes = j0.barcodes;
j1.barcodes[it->first]++;
// NOTE: Junction j1 will always have exactly one barcode,
// that of the current alignment; i.e. j1.barcodes = {<barcode>: 1}
auto it = junctions_[key].barcodes.find(j1.barcodes.begin()->first);
if (it != junctions_[key].barcodes.end()) {// barcode exists already
junctions_[key].barcodes[it->first]++;
} else {
// this block is where the slowness happens - not sure if it's the instantiation or the insertion
// well, tried to get around instantion by just inserting into j0 but that made it like another 2x slower so I don't think it's that
pair<string, int> tmp_barcode = *j1.barcodes.begin();
j1.barcodes = j0.barcodes;
j1.barcodes.insert(tmp_barcode);
junctions_[key].barcodes[it->first] = 1;
}
}
//increment read count
j1.read_count = j0.read_count + 1;
j1.score = common::num_to_str(j1.read_count);
//Keep the same name
j1.name = j0.name;
// following barcodes convention, Junction j1 has one read,
// that of the current alignment; i.e. j1.reads = {<read_name>}
string read_name = *j1.reads.begin();
//avoid counting the same paired-end read twice
//if both segments overlap junction
if (!junctions_[key].reads.count(read_name)) {
junctions_[key].reads.insert(read_name);
junctions_[key].read_count++;
junctions_[key].score = common::num_to_str(junctions_[key].read_count);
}
//Check if thick starts are any better
if(j0.thick_start < j1.thick_start)
j1.thick_start = j0.thick_start;
if(j0.thick_end > j1.thick_end)
j1.thick_end = j0.thick_end;
//preserve min anchor information
j1.has_left_min_anchor = j1.has_left_min_anchor || j0.has_left_min_anchor;
j1.has_right_min_anchor = j1.has_right_min_anchor || j0.has_right_min_anchor;
if(j1.thick_start < junctions_[key].thick_start)
junctions_[key].thick_start = j1.thick_start;
if(j1.thick_end > junctions_[key].thick_end)
junctions_[key].thick_end = j1.thick_end;
//update min anchor information
junctions_[key].has_left_min_anchor =
junctions_[key].has_left_min_anchor || j1.has_left_min_anchor;
junctions_[key].has_right_min_anchor =
junctions_[key].has_right_min_anchor || j1.has_right_min_anchor;
}
//Add junction and check anchor while printing.
junctions_[key] = j1;
return 0;
}

Expand Down Expand Up @@ -358,7 +393,7 @@ void JunctionsExtractor::set_junction_strand(bam1_t *aln, Junction& j1, string i
return;
}

//Get the the barcode
//Get the barcode
void JunctionsExtractor::set_junction_barcode(bam1_t *aln, Junction& j1) {
uint8_t *p = bam_aux_get(aln, barcode_tag_.c_str());
if(p != NULL) {
Expand All @@ -373,6 +408,14 @@ void JunctionsExtractor::set_junction_barcode(bam1_t *aln, Junction& j1) {
}
}

//Filters alignments based on filtering flags and mapping quality
bool JunctionsExtractor::filter_alignment(bam_hdr_t *header, bam1_t *aln) {
if ((aln->core.flag & filter_flags_) != 0) return true;
if ((aln->core.flag | require_flags_) != aln->core.flag) return true;
if (aln->core.qual < min_map_qual_) return true;
return false;
}

//Parse junctions from the read and store in junction map
int JunctionsExtractor::parse_alignment_into_junctions(bam_hdr_t *header, bam1_t *aln) {
int n_cigar = aln->core.n_cigar;
Expand All @@ -386,8 +429,9 @@ int JunctionsExtractor::parse_alignment_into_junctions(bam_hdr_t *header, bam1_t

Junction j1;
j1.chrom = chr;
j1.thick_start = read_pos; // start pos of read
j1.start = read_pos; //maintain start pos of junction
j1.thick_start = read_pos;
j1.reads.insert(bam_get_qname(aln));
string intron_motif;

if (output_barcodes_file_ != "NA"){
Expand Down Expand Up @@ -523,6 +567,7 @@ int JunctionsExtractor::identify_junctions_from_BAM() {
//Initiate the alignment record
bam1_t *aln = bam_init1();
while(sam_itr_next(in, iter, aln) >= 0) {
if (filter_alignment(header, aln)) continue;
parse_alignment_into_junctions(header, aln);
}
hts_itr_destroy(iter);
Expand Down
Loading