Skip to content

Commit

Permalink
Merge branch 'master' of github.com:EnvGen/toolbox
Browse files Browse the repository at this point in the history
  • Loading branch information
alneberg committed Nov 15, 2018
2 parents 69044a2 + 4ebf7f1 commit 4edc939
Show file tree
Hide file tree
Showing 6 changed files with 425 additions and 0 deletions.
71 changes: 71 additions & 0 deletions scripts/convenience/construct_ena_sequencing_runs_table.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
#!/usr/bin/env python
"""construct_ena_sequencing_runs_table.py
Based on a folder with uploaded files and a template, construct table ready to be submitted.
This script is not general but is very niched to the NGI/Uppmax scenario.
Ideally the user is expected to copy this script and edit it to suit the users needs.
"""

import argparse
import sys
import os
import glob
from os.path import join as opj
import pandas as pd
from collections import defaultdict
import gzip

# Need to fetch file name and link it to sample id
# Fetch the md5sum already calculated

def main(args):
md5sum_df = pd.read_table(args.md5_summary, sep=',', header=None, names=['file_name', 'md5sum'], index_col=0)

insert_sizes = pd.read_table(args.insert_size, index_col=0)
info_d = {}

for sample_dir in args.sample_dirs:
for R1_run_file in glob.glob(opj(sample_dir, "*", "*_R1*.fastq.gz")):
R1_file_name=os.path.basename(R1_run_file)
sample_name="_".join(R1_file_name.split('_')[0:2])
run_id="_".join(R1_file_name.split('_')[0:4])
run_info = {}
run_info['sample_accession'] = sample_name
run_info['library_name'] = run_id
is_series = insert_sizes.ix[sample_name]['Avg. FS']
try:
run_info['insert_size'] = int(round(is_series))
except TypeError:
run_info['insert_size'] = int(round(insert_sizes[insert_sizes['Lib QC'] == 'PASSED'].ix[sample_name]['Avg. FS']))

run_info['forward_file_name'] = R1_file_name
run_info['forward_file_md5'] = md5sum_df.loc[R1_file_name]['md5sum']
R2_file_name = R1_file_name.replace("R1", "R2")
run_info['reverse_file_name'] = R2_file_name
run_info['reverse_file_md5'] = md5sum_df.loc[R2_file_name]['md5sum']
run_info['library_source'] = 'METAGENOMIC'
run_info['library_selection'] = 'RANDOM'
run_info['library_strategy'] = 'WGS'
run_info['library_construction_protocol'] = 'Rubicon Thruplex'
run_info['instrument_model'] = 'Illumina HiSeq 2500'
run_info['file_type'] = 'fastq'
run_info['library_layout'] = 'PAIRED'


info_d[run_id] = run_info
all_columns_sorted = ['sample_accession', 'library_name', 'library_source', 'insert_size', \
'library_selection', 'library_strategy', 'library_construction_protocol', 'instrument_model', \
'file_type', 'library_layout', 'insert_size', \
'forward_file_name', 'forward_file_md5', 'reverse_file_name', 'reverse_file_md5']

df = pd.DataFrame.from_dict(info_d, orient='index')
df[all_columns_sorted].to_csv(sys.stdout, index=False, sep='\t', header=True)

if __name__ == "__main__":
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("md5_summary", help="Table (csv) with md5sum values for all files")
parser.add_argument("insert_size", help="Table with insert size per sample")
parser.add_argument("sample_dirs", nargs='*', help="Directories where read files are located in subdirs")
args = parser.parse_args()

main(args)
14 changes: 14 additions & 0 deletions scripts/convenience/upload_to_ena_ftp.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# -n option disables auto-logon
FILE_SUFFIX="fastq.gz"
for upload_dir in "$@"
do
ftp -n webin.ebi.ac.uk <<End-Of-Session
user $ENA_USER $ENA_PASS
binary
prompt
lcd "$upload_dir"
!ls *"$FILE_SUFFIX"
mput *"$FILE_SUFFIX"
bye
End-Of-Session
done
116 changes: 116 additions & 0 deletions scripts/sag_vs_mag/check_status_in_mapping.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
#!/usr/bin/env python
usage="""Checks the status for a subset of reads within a bam file.
Three categories:
Matching
Soft clipped
Not mapping
"""

import argparse
import sys
import re

soft_clip_re = re.compile('([0-9]+)S')
matched_re = re.compile('([0-9]+)M')

left_clipped_re = re.compile('^([0-9]+)S')

nr_mismatched_re = re.compile('XM:i:([0-9]+)')

def parse_cigar(cigar):
soft_clipped_matches = soft_clip_re.findall(cigar)
matched_matches = matched_re.findall(cigar)
soft_clipped_bases = sum([int(x) for x in soft_clipped_matches])
matched_bases = sum([int(x) for x in matched_matches])

left_clipped_matches = left_clipped_re.findall(cigar)
left_clipped_bases = sum([int(x) for x in left_clipped_matches])

return soft_clipped_bases, matched_bases, left_clipped_bases


"""
chimeric or not
soft clipped (> 20S) or not
overlapping edge or not
mapped or not
soft clipped not soft clipped overlapping edge not overlapping edge overlapping edge and soft clipped mapped not mapped
chimeric:
Non-chimeric:
"""

def print_stats(name, soft_clipped_set, overlapping_set, mapped_set, complete_set):
"soft clipped not soft clipped overlapping edge not overlapping edge overlapping edge and soft clipped mapped not mapped"

print("{},{},{},{},{},{},{},{}".format(name, len(soft_clipped_set), len(complete_set - soft_clipped_set), len(overlapping_set), len(complete_set - overlapping_set), len(overlapping_set & soft_clipped_set), len(mapped_set), len(complete_set - mapped_set)))


def main(args):
chim_reads = set()
for line in open(args.subset_reads, 'r'):
read = line.strip()
chim_reads.add(read)

mapped_reads = set()
soft_clipped_reads = set()
overlapping_edge_reads = set()
all_reads = set()
contig_lens = {}

contig_len_re = re.compile('SN:(.*) *LN:([0-9]*)$')

# Read the input sam files
for line in sys.stdin:
line = line.strip()

# Check if in header
if line[0] == '@':
if line[0:3] == '@SQ':
contig, contig_len = contig_len_re.findall(line)[0]
contig_lens[contig] = contig_len
else:
# Not in header

line_split = line.split('\t')
if (int(line_split[1]) / 128) == 1:
read_id = line_split[0] + "_2"
else:
read_id = line_split[0] + "_1"

all_reads.add(read_id)

mapping_bit = (((((int(line_split[1]) % 128) % 64) % 32) % 16) % 8) / 4
if mapping_bit != 1:
mapped_reads.add(read_id)
cigar = line_split[5]

soft_clipped, matching, left_clipped_bases = parse_cigar(cigar)
if soft_clipped > 20:
soft_clipped_reads.add(read_id)

read_length = len(line_split[9])

contig, mapping_start_pos = line_split[2], line_split[7]

if int(mapping_start_pos) - left_clipped_bases < 0:
overlapping_edge_reads.add(read_id)

if int(mapping_start_pos) + read_length > contig_lens[contig]:
overlapping_edge_reads.add(read_id)

# Aim to print>
# "soft clipped not soft clipped overlapping edge not overlapping edge overlapping edge and soft clipped mapped not mapped"
print("name,soft clipped, not soft clipped,overlapping edge,not overlapping edge,overlapping edge and soft clipped,mapped,not mapped")
print_stats("chimeric_reads", soft_clipped_reads & chim_reads, overlapping_edge_reads & chim_reads, mapped_reads & chim_reads, chim_reads)
print_stats("other_reads", soft_clipped_reads - chim_reads, overlapping_edge_reads - chim_reads, mapped_reads - chim_reads, all_reads - chim_reads)


if __name__ == "__main__":
parser = argparse.ArgumentParser(description=usage)
parser.add_argument('subset_reads')
args = parser.parse_args()

main(args)
62 changes: 62 additions & 0 deletions scripts/sag_vs_mag/filter_out_none_clipped.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
#!/usr/bin/env python
usage="""Filter potential chimeric reads to remove correctly none-clipped.
"""

import argparse
import sys
import re

soft_clip_re = re.compile('([0-9]+)S')
matched_re = re.compile('([0-9]+)M')

nr_mismatched_re = re.compile('XM:i:([0-9]+)')

def parse_cigar(cigar):
soft_clipped_matches = soft_clip_re.findall(cigar)
matched_matches = matched_re.findall(cigar)
soft_clipped_bases = sum([int(x) for x in soft_clipped_matches])
matched_bases = sum([int(x) for x in matched_matches])

return soft_clipped_bases, matched_bases

def main(args):
all_reads = set()
for line in open(args.potential_chimera, 'r'):
read = line.strip()
all_reads.add(read)

# Read the input sam files
for line in sys.stdin:
line = line.strip()

# Check if in header
if line[0] != '@':
# Not in header

# Ignore reads which are note potential chimeras
# Read is most likely not chimeric
line_split = line.split('\t')
if (int(line_split[1]) / 128) == 1:
read_id = line_split[0] + "_2"
else:
read_id = line_split[0] + "_1"

if read_id in all_reads:
cigar = line_split[5]
# Matching min 95% of bases
soft_clipped, matching = parse_cigar(cigar)

read_length = len(line_split[9])
mismatches = int(nr_mismatched_re.findall(line)[0])

if (matching > read_length*0.95) and (int(mismatches) < 5):
all_reads.remove(read_id)

print("\n".join(list(all_reads)))

if __name__ == "__main__":
parser = argparse.ArgumentParser(description=usage)
parser.add_argument('potential_chimera')
args = parser.parse_args()

main(args)
70 changes: 70 additions & 0 deletions scripts/sag_vs_mag/find_candidate_chimeric_reads.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
#!/usr/bin/env python
usage="""Prints reads which are potential chimeras.
These are determined as being soft clipped, not on the edge
of contigs and at leat 50% mapped with max 2 mismatches.
Input SAM with header.
"""

import argparse
import sys
import re

soft_clip_re = re.compile('([0-9]+)S')
matched_re = re.compile('([0-9]+)M')

nr_mismatched_re = re.compile('XM:i:([0-9]+)')

def parse_cigar(cigar):
soft_clipped_matches = soft_clip_re.findall(cigar)
matched_matches = matched_re.findall(cigar)
soft_clipped_bases = sum([int(x) for x in soft_clipped_matches])
matched_bases = sum([int(x) for x in matched_matches])

return soft_clipped_bases, matched_bases

def main():
contig_len_re = re.compile('SN:(.*) *LN:([0-9]*)$')
contig_lens = {}

# Read the input sam file
for line in sys.stdin:
line = line.strip()

# Check if in header
if line[0] == '@':
if line[0:3] == '@SQ':
contig, contig_len = contig_len_re.findall(line)[0]
contig_lens[contig] = contig_len
else:
# Not in header
line_split = line.split('\t')
cigar = line_split[5]
# Soft clipped at least 20 bases
if 'S' in cigar:
soft_clipped, matching = parse_cigar(cigar)
if soft_clipped > 20:
# Matching region should be > 50% of read
read_length = len(line_split[9])
if matching > read_length/2.0:
contig, mapping_start_pos = line_split[2], line_split[7]

# mapping should not be on the edges
if mapping_start_pos > 300 and int(contig_lens[contig]) - int(mapping_start_pos) > 300:
# check nr_mismatches
mismatches = int(nr_mismatched_re.findall(line)[0])
if int(mismatches) < 2:
# check which read
if (int(line_split[1]) / 128) == 1:
read_nr = "2"
else:
read_nr = "1"

print(line_split[0]+"_" + read_nr)


if __name__ == "__main__":
parser = argparse.ArgumentParser(description=usage)
parser.parse_args()
main()
Loading

0 comments on commit 4edc939

Please sign in to comment.