-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'master' of github.com:EnvGen/toolbox
- Loading branch information
Showing
6 changed files
with
425 additions
and
0 deletions.
There are no files selected for viewing
71 changes: 71 additions & 0 deletions
71
scripts/convenience/construct_ena_sequencing_runs_table.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
#!/usr/bin/env python | ||
"""construct_ena_sequencing_runs_table.py | ||
Based on a folder with uploaded files and a template, construct table ready to be submitted. | ||
This script is not general but is very niched to the NGI/Uppmax scenario. | ||
Ideally the user is expected to copy this script and edit it to suit the users needs. | ||
""" | ||
|
||
import argparse | ||
import sys | ||
import os | ||
import glob | ||
from os.path import join as opj | ||
import pandas as pd | ||
from collections import defaultdict | ||
import gzip | ||
|
||
# Need to fetch file name and link it to sample id | ||
# Fetch the md5sum already calculated | ||
|
||
def main(args): | ||
md5sum_df = pd.read_table(args.md5_summary, sep=',', header=None, names=['file_name', 'md5sum'], index_col=0) | ||
|
||
insert_sizes = pd.read_table(args.insert_size, index_col=0) | ||
info_d = {} | ||
|
||
for sample_dir in args.sample_dirs: | ||
for R1_run_file in glob.glob(opj(sample_dir, "*", "*_R1*.fastq.gz")): | ||
R1_file_name=os.path.basename(R1_run_file) | ||
sample_name="_".join(R1_file_name.split('_')[0:2]) | ||
run_id="_".join(R1_file_name.split('_')[0:4]) | ||
run_info = {} | ||
run_info['sample_accession'] = sample_name | ||
run_info['library_name'] = run_id | ||
is_series = insert_sizes.ix[sample_name]['Avg. FS'] | ||
try: | ||
run_info['insert_size'] = int(round(is_series)) | ||
except TypeError: | ||
run_info['insert_size'] = int(round(insert_sizes[insert_sizes['Lib QC'] == 'PASSED'].ix[sample_name]['Avg. FS'])) | ||
|
||
run_info['forward_file_name'] = R1_file_name | ||
run_info['forward_file_md5'] = md5sum_df.loc[R1_file_name]['md5sum'] | ||
R2_file_name = R1_file_name.replace("R1", "R2") | ||
run_info['reverse_file_name'] = R2_file_name | ||
run_info['reverse_file_md5'] = md5sum_df.loc[R2_file_name]['md5sum'] | ||
run_info['library_source'] = 'METAGENOMIC' | ||
run_info['library_selection'] = 'RANDOM' | ||
run_info['library_strategy'] = 'WGS' | ||
run_info['library_construction_protocol'] = 'Rubicon Thruplex' | ||
run_info['instrument_model'] = 'Illumina HiSeq 2500' | ||
run_info['file_type'] = 'fastq' | ||
run_info['library_layout'] = 'PAIRED' | ||
|
||
|
||
info_d[run_id] = run_info | ||
all_columns_sorted = ['sample_accession', 'library_name', 'library_source', 'insert_size', \ | ||
'library_selection', 'library_strategy', 'library_construction_protocol', 'instrument_model', \ | ||
'file_type', 'library_layout', 'insert_size', \ | ||
'forward_file_name', 'forward_file_md5', 'reverse_file_name', 'reverse_file_md5'] | ||
|
||
df = pd.DataFrame.from_dict(info_d, orient='index') | ||
df[all_columns_sorted].to_csv(sys.stdout, index=False, sep='\t', header=True) | ||
|
||
if __name__ == "__main__": | ||
parser = argparse.ArgumentParser(description=__doc__) | ||
parser.add_argument("md5_summary", help="Table (csv) with md5sum values for all files") | ||
parser.add_argument("insert_size", help="Table with insert size per sample") | ||
parser.add_argument("sample_dirs", nargs='*', help="Directories where read files are located in subdirs") | ||
args = parser.parse_args() | ||
|
||
main(args) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
# -n option disables auto-logon | ||
FILE_SUFFIX="fastq.gz" | ||
for upload_dir in "$@" | ||
do | ||
ftp -n webin.ebi.ac.uk <<End-Of-Session | ||
user $ENA_USER $ENA_PASS | ||
binary | ||
prompt | ||
lcd "$upload_dir" | ||
!ls *"$FILE_SUFFIX" | ||
mput *"$FILE_SUFFIX" | ||
bye | ||
End-Of-Session | ||
done |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,116 @@ | ||
#!/usr/bin/env python | ||
usage="""Checks the status for a subset of reads within a bam file. | ||
Three categories: | ||
Matching | ||
Soft clipped | ||
Not mapping | ||
""" | ||
|
||
import argparse | ||
import sys | ||
import re | ||
|
||
soft_clip_re = re.compile('([0-9]+)S') | ||
matched_re = re.compile('([0-9]+)M') | ||
|
||
left_clipped_re = re.compile('^([0-9]+)S') | ||
|
||
nr_mismatched_re = re.compile('XM:i:([0-9]+)') | ||
|
||
def parse_cigar(cigar): | ||
soft_clipped_matches = soft_clip_re.findall(cigar) | ||
matched_matches = matched_re.findall(cigar) | ||
soft_clipped_bases = sum([int(x) for x in soft_clipped_matches]) | ||
matched_bases = sum([int(x) for x in matched_matches]) | ||
|
||
left_clipped_matches = left_clipped_re.findall(cigar) | ||
left_clipped_bases = sum([int(x) for x in left_clipped_matches]) | ||
|
||
return soft_clipped_bases, matched_bases, left_clipped_bases | ||
|
||
|
||
""" | ||
chimeric or not | ||
soft clipped (> 20S) or not | ||
overlapping edge or not | ||
mapped or not | ||
soft clipped not soft clipped overlapping edge not overlapping edge overlapping edge and soft clipped mapped not mapped | ||
chimeric: | ||
Non-chimeric: | ||
""" | ||
|
||
def print_stats(name, soft_clipped_set, overlapping_set, mapped_set, complete_set): | ||
"soft clipped not soft clipped overlapping edge not overlapping edge overlapping edge and soft clipped mapped not mapped" | ||
|
||
print("{},{},{},{},{},{},{},{}".format(name, len(soft_clipped_set), len(complete_set - soft_clipped_set), len(overlapping_set), len(complete_set - overlapping_set), len(overlapping_set & soft_clipped_set), len(mapped_set), len(complete_set - mapped_set))) | ||
|
||
|
||
def main(args): | ||
chim_reads = set() | ||
for line in open(args.subset_reads, 'r'): | ||
read = line.strip() | ||
chim_reads.add(read) | ||
|
||
mapped_reads = set() | ||
soft_clipped_reads = set() | ||
overlapping_edge_reads = set() | ||
all_reads = set() | ||
contig_lens = {} | ||
|
||
contig_len_re = re.compile('SN:(.*) *LN:([0-9]*)$') | ||
|
||
# Read the input sam files | ||
for line in sys.stdin: | ||
line = line.strip() | ||
|
||
# Check if in header | ||
if line[0] == '@': | ||
if line[0:3] == '@SQ': | ||
contig, contig_len = contig_len_re.findall(line)[0] | ||
contig_lens[contig] = contig_len | ||
else: | ||
# Not in header | ||
|
||
line_split = line.split('\t') | ||
if (int(line_split[1]) / 128) == 1: | ||
read_id = line_split[0] + "_2" | ||
else: | ||
read_id = line_split[0] + "_1" | ||
|
||
all_reads.add(read_id) | ||
|
||
mapping_bit = (((((int(line_split[1]) % 128) % 64) % 32) % 16) % 8) / 4 | ||
if mapping_bit != 1: | ||
mapped_reads.add(read_id) | ||
cigar = line_split[5] | ||
|
||
soft_clipped, matching, left_clipped_bases = parse_cigar(cigar) | ||
if soft_clipped > 20: | ||
soft_clipped_reads.add(read_id) | ||
|
||
read_length = len(line_split[9]) | ||
|
||
contig, mapping_start_pos = line_split[2], line_split[7] | ||
|
||
if int(mapping_start_pos) - left_clipped_bases < 0: | ||
overlapping_edge_reads.add(read_id) | ||
|
||
if int(mapping_start_pos) + read_length > contig_lens[contig]: | ||
overlapping_edge_reads.add(read_id) | ||
|
||
# Aim to print> | ||
# "soft clipped not soft clipped overlapping edge not overlapping edge overlapping edge and soft clipped mapped not mapped" | ||
print("name,soft clipped, not soft clipped,overlapping edge,not overlapping edge,overlapping edge and soft clipped,mapped,not mapped") | ||
print_stats("chimeric_reads", soft_clipped_reads & chim_reads, overlapping_edge_reads & chim_reads, mapped_reads & chim_reads, chim_reads) | ||
print_stats("other_reads", soft_clipped_reads - chim_reads, overlapping_edge_reads - chim_reads, mapped_reads - chim_reads, all_reads - chim_reads) | ||
|
||
|
||
if __name__ == "__main__": | ||
parser = argparse.ArgumentParser(description=usage) | ||
parser.add_argument('subset_reads') | ||
args = parser.parse_args() | ||
|
||
main(args) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
#!/usr/bin/env python | ||
usage="""Filter potential chimeric reads to remove correctly none-clipped. | ||
""" | ||
|
||
import argparse | ||
import sys | ||
import re | ||
|
||
soft_clip_re = re.compile('([0-9]+)S') | ||
matched_re = re.compile('([0-9]+)M') | ||
|
||
nr_mismatched_re = re.compile('XM:i:([0-9]+)') | ||
|
||
def parse_cigar(cigar): | ||
soft_clipped_matches = soft_clip_re.findall(cigar) | ||
matched_matches = matched_re.findall(cigar) | ||
soft_clipped_bases = sum([int(x) for x in soft_clipped_matches]) | ||
matched_bases = sum([int(x) for x in matched_matches]) | ||
|
||
return soft_clipped_bases, matched_bases | ||
|
||
def main(args): | ||
all_reads = set() | ||
for line in open(args.potential_chimera, 'r'): | ||
read = line.strip() | ||
all_reads.add(read) | ||
|
||
# Read the input sam files | ||
for line in sys.stdin: | ||
line = line.strip() | ||
|
||
# Check if in header | ||
if line[0] != '@': | ||
# Not in header | ||
|
||
# Ignore reads which are note potential chimeras | ||
# Read is most likely not chimeric | ||
line_split = line.split('\t') | ||
if (int(line_split[1]) / 128) == 1: | ||
read_id = line_split[0] + "_2" | ||
else: | ||
read_id = line_split[0] + "_1" | ||
|
||
if read_id in all_reads: | ||
cigar = line_split[5] | ||
# Matching min 95% of bases | ||
soft_clipped, matching = parse_cigar(cigar) | ||
|
||
read_length = len(line_split[9]) | ||
mismatches = int(nr_mismatched_re.findall(line)[0]) | ||
|
||
if (matching > read_length*0.95) and (int(mismatches) < 5): | ||
all_reads.remove(read_id) | ||
|
||
print("\n".join(list(all_reads))) | ||
|
||
if __name__ == "__main__": | ||
parser = argparse.ArgumentParser(description=usage) | ||
parser.add_argument('potential_chimera') | ||
args = parser.parse_args() | ||
|
||
main(args) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
#!/usr/bin/env python | ||
usage="""Prints reads which are potential chimeras. | ||
These are determined as being soft clipped, not on the edge | ||
of contigs and at leat 50% mapped with max 2 mismatches. | ||
Input SAM with header. | ||
""" | ||
|
||
import argparse | ||
import sys | ||
import re | ||
|
||
soft_clip_re = re.compile('([0-9]+)S') | ||
matched_re = re.compile('([0-9]+)M') | ||
|
||
nr_mismatched_re = re.compile('XM:i:([0-9]+)') | ||
|
||
def parse_cigar(cigar): | ||
soft_clipped_matches = soft_clip_re.findall(cigar) | ||
matched_matches = matched_re.findall(cigar) | ||
soft_clipped_bases = sum([int(x) for x in soft_clipped_matches]) | ||
matched_bases = sum([int(x) for x in matched_matches]) | ||
|
||
return soft_clipped_bases, matched_bases | ||
|
||
def main(): | ||
contig_len_re = re.compile('SN:(.*) *LN:([0-9]*)$') | ||
contig_lens = {} | ||
|
||
# Read the input sam file | ||
for line in sys.stdin: | ||
line = line.strip() | ||
|
||
# Check if in header | ||
if line[0] == '@': | ||
if line[0:3] == '@SQ': | ||
contig, contig_len = contig_len_re.findall(line)[0] | ||
contig_lens[contig] = contig_len | ||
else: | ||
# Not in header | ||
line_split = line.split('\t') | ||
cigar = line_split[5] | ||
# Soft clipped at least 20 bases | ||
if 'S' in cigar: | ||
soft_clipped, matching = parse_cigar(cigar) | ||
if soft_clipped > 20: | ||
# Matching region should be > 50% of read | ||
read_length = len(line_split[9]) | ||
if matching > read_length/2.0: | ||
contig, mapping_start_pos = line_split[2], line_split[7] | ||
|
||
# mapping should not be on the edges | ||
if mapping_start_pos > 300 and int(contig_lens[contig]) - int(mapping_start_pos) > 300: | ||
# check nr_mismatches | ||
mismatches = int(nr_mismatched_re.findall(line)[0]) | ||
if int(mismatches) < 2: | ||
# check which read | ||
if (int(line_split[1]) / 128) == 1: | ||
read_nr = "2" | ||
else: | ||
read_nr = "1" | ||
|
||
print(line_split[0]+"_" + read_nr) | ||
|
||
|
||
if __name__ == "__main__": | ||
parser = argparse.ArgumentParser(description=usage) | ||
parser.parse_args() | ||
main() |
Oops, something went wrong.