-
Notifications
You must be signed in to change notification settings - Fork 0
/
tcrvseqfuncs.py
127 lines (112 loc) · 5.33 KB
/
tcrvseqfuncs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
"""
Module for TCRseq V and C region counts analysis for paired-end fastq data
@author:David Redmond (email [email protected])
"""
import sys, os, commands, csv, commands, operator, tempfile, subprocess, numpy
from itertools import groupby, count
from collections import Counter
from Bio.Blast import NCBIXML
#count reads in fastq files
def count_total_reads(myFastq1,myFastq2):
result=int(commands.getoutput("zcat %s | wc -l" % myFastq1))
result+=int(commands.getoutput("zcat %s | wc -l" % myFastq2))
return result/4
def is_empty(any_structure):
if any_structure:
return False
else:
return True
# gunzip fastq files
def gunzip_fastq(myFastq):
command="gunzip %s" % myFastq
os.system(command)
# gzip fastq files
def gzip_fastq(myFastq):
command="gzip -1 %s" % myFastq
os.system(command)
# Prepare fq files in format for blast mapping
def blast_fq_format(myFastq,outFastq):
#command="sed '3~4d;4~4d;s/@/>/g' "+myFastq+" > "+outFastq
command="zcat "+myFastq+" | sed '3~4d;4~4d;s/@/>/g' > "+outFastq
os.system(command)
#split fasta file into temporary files of 10k lines
def tempfile_split(filename, temp_dir, chunk=10**4):
fns={}
with open(filename, 'r') as datafile:
groups = groupby(datafile, key=lambda k, line=count(): next(line) // chunk)
for k, group in groups:
with tempfile.NamedTemporaryFile(delete=False,
dir=temp_dir,prefix='{}_'.format(str(k))) as outfile:
outfile.write(''.join(group))
fns[k]=outfile.name
return fns
def blastall_v_regions(myFastq1,myFastq2,myRef,outputfile,eVal,blastallDir):
os.system("rm "+outputfile)
fns={}
chunk=10**4
with open(myFastq1, 'r') as datafile1:
groups = groupby(datafile1, key=lambda k, line=count(): next(line) // chunk)
for k, group in groups:
with tempfile.NamedTemporaryFile(delete=False,
dir=tempfile.mkdtemp(),prefix='{}_'.format(str(k))) as outfile:
outfile.write(''.join(group))
fns[k]=outfile.name
blastn_cline = blastallDir+"blastall -p blastn -o "+str(outfile.name)+".blast.out -i "+str(outfile.name)+" -d "+myRef+" -e "+str(eVal)+" -m 8 -b 1"
os.system(blastn_cline+" > /dev/null 2>&1")
os.system("cat "+str(outfile.name)+".blast.out >> "+outputfile)
os.remove(str(outfile.name)+".blast.out")
os.remove(str(outfile.name))
testvar=commands.getstatusoutput("dirname "+str(outfile.name))
os.system("rm -r "+testvar[1])
fns={}
with open(myFastq2, 'r') as datafile2:
groups = groupby(datafile2, key=lambda k, line=count(): next(line) // chunk)
for k, group in groups:
with tempfile.NamedTemporaryFile(delete=False,
dir=tempfile.mkdtemp(),prefix='{}_'.format(str(k))) as outfile:
outfile.write(''.join(group))
fns[k]=outfile.name
blastn_cline = blastallDir+"blastall -p blastn -o "+str(outfile.name)+".blast.out -i "+str(outfile.name)+" -d "+myRef+" -e "+str(eVal)+" -m 8 -b 1"
os.system(blastn_cline+" > /dev/null 2>&1")
os.system("cat "+str(outfile.name)+".blast.out >> "+outputfile)
os.remove(str(outfile.name)+".blast.out")
os.remove(str(outfile.name))
testvar=commands.getstatusoutput("dirname "+str(outfile.name))
os.system("rm -r "+testvar[1])
def listToStringWithoutBrackets(list1):
return str(list1).replace('[','').replace(']','')
def run_seqtk(inputList,inputFastq1,inputFastq2,outputFastq,seqTkDir):
command1=seqTkDir+"seqtk subseq "+inputFastq1+" "+inputList+" >> "+outputFastq
command2=seqTkDir+"seqtk subseq "+inputFastq2+" "+inputList+" >> "+outputFastq
os.system(command1)
os.system(command2)
def return_counts(blastHitsFile,outName,fastq1,fastq2,minBlastAlignedLength):
myHits=[]
with open(blastHitsFile) as f:
reader = csv.reader(f, delimiter="\t")
for row in reader:
if(int(row[3])>=minBlastAlignedLength):
myHits.append(row[1])
gene_table={}
gene_table=Counter(myHits)
perc_table={}
for gene in gene_table:
perc_table[gene]=float(gene_table[gene])/float(sum(gene_table.values()))
gene_table_output = { k: [ gene_table[k], perc_table[k] ] for k in gene_table }
sorted_gto = sorted(gene_table_output.items(), key=operator.itemgetter(1),reverse=True)
f1=open(outName+".counts.txt", 'w+')
for item in sorted_gto:
print >>f1, item[0],",",listToStringWithoutBrackets(item[1])
f1.close()
def return_fastq_counts(myFastq1,myFastq2,outfile):
f1=open(outfile, 'w+')
print >>f1, count_total_reads(myFastq1,myFastq2)
f1.close()
def return_fastq_median_read_lengths(myFastq1,outfile,lengthScript):
cmd="zcat "+myFastq1+" | perl "+lengthScript+" - > "+outfile
os.system(cmd)
cmd="zcat "+myFastq1+" | perl "+lengthScript+" -"
return(int(os.popen(cmd).read()))
def get_variable_regions(myFastq1,myFastq2,myRef,outName,eVal,minBlastAlignedLength,blastallDir):
blastall_v_regions(myFastq1,myFastq2,myRef,outName+".matches.txt",eVal,blastallDir)
return_counts(outName+".matches.txt",outName,myFastq1,myFastq2,minBlastAlignedLength)