-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathrpoB_gene_extractor.py
66 lines (48 loc) · 1.86 KB
/
rpoB_gene_extractor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
#This script was written by Vincent Appiah
#This program extracts the sequence of rpoB gene from genbank files
#Don't forget to like my github page (vappiah) and my youtube channel(Bioinformatics Coach)
#You are free to use, modify and distribute this script
#I am not responsible for any anything that happens as a result of using this script
#Before using this script , make sure you are OK with the codes
from Bio import SeqIO
import os
from glob import glob
import sys
gene_of_interest='rpoB'
file_directory=sys.argv[1]
file_names=glob('%s/*.gb'%file_directory)
sequences=[]
def extract_sequence(genbank_file,gene_of_interest):
'''This script will extract rpoB gene sequences from genbank files
All the extracted sequences will be placed into a single multi-fasta file.
The multi-fasta file will be found in the same directory as your genbank files'''
gb_obj=SeqIO.read(genbank_file,'gb')
genes=[]
for feature in gb_obj.features:
if feature.type=='gene':
genes.append(feature)
hits=[]
for gene in genes:
if 'gene' in gene.qualifiers.keys():
if gene_of_interest == gene.qualifiers['gene'][0]:
hits.append(gene)
if len(hits)==0:
return None
else:
rpoB=hits[0]
extracted_sequence=rpoB.extract(gb_obj)
return extracted_sequence
for file_ in file_names:
sequence=extract_sequence(file_,gene_of_interest)
name=os.path.split(file_)[-1]
name=name.strip('.gb')
if not sequence:
print('gene not found for sample %s'%name)
else:
print('gene found in sample %s'%name)
sequence.id=name
sequence.description=''
sequences.append(sequence)
outputfile='%s/rpoB.fasta'%file_directory
SeqIO.write(sequences,outputfile,'fasta')
print('file has been generated in the sequence directory')