-
Notifications
You must be signed in to change notification settings - Fork 0
/
create_geneList.py
55 lines (45 loc) · 2.4 KB
/
create_geneList.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
'''
Created: E. Reichenberger
Date: 3.7.2016
Purpose: To extract all gene_product names from GenBank files and write them to a new text file. Output file should have the following format: Gene_product\tFeature_type\tCategory\n. #NOTE: Category will be an empty column.
'''
import Bio
import os
import sys
from Bio import GenBank
from Bio import SeqIO
#######################################################DEFINITIONS################################################################
# Punctuation left behind when a list of qualifier values is passed through
# str(): square brackets plus single and double quotes.
_LIST_REPR_CHARS = str.maketrans('', '', '[]\'"')


def strip_it(string_name):
    """Remove list-repr punctuation from a string and trim leading whitespace.

    Every '[', ']', single quote and double quote is deleted wherever it
    occurs in the string (not only at the ends), then leading whitespace is
    stripped. Trailing whitespace is left untouched.

    Args:
        string_name: The text to clean, e.g. "['DNA polymerase']".

    Returns:
        The cleaned string, e.g. "DNA polymerase".
    """
    # One translate() pass replaces four chained replace() calls.
    return string_name.translate(_LIST_REPR_CHARS).lstrip()
#---------------------------------------------------------------------------------------------------------------------------------
######################################################### File List ##############################################################
# Each non-blank line of fileList.txt is taken as the path of one file to scan.
fileList = []
with open('fileList.txt', 'r') as inputFile:
    for line in inputFile:
        # strip() also removes a '\r' left by Windows line endings.
        path = line.strip()
        # Skip blank lines (e.g. a trailing newline at end of file); an empty
        # entry would otherwise be passed to open('') when the files are read.
        if path:
            fileList.append(path)

geneCorpus = {}  # maps gene product name -> feature type it was found on
#---------------------------------------------------------------------------------------------------------------------------------
############################Searchable Genbank Files & write qualifying features to output file####################################
# Walk every feature of every record in every listed GenBank file and record
# each distinct product name together with the type of the feature carrying it.
for f in fileList:
    with open(f, 'r') as handle:
        for record in SeqIO.parse(handle, 'genbank'):
            for feature in record.features:
                products = feature.qualifiers.get('product')
                # Features without a /product qualifier have no gene product;
                # skipping them keeps a literal "None" row out of the corpus.
                if not products:
                    continue
                # str() of the qualifier value followed by strip_it() turns
                # "['name']" into "name".
                # NOTE: .lower() may be needed later to find unique gene products.
                product_name = strip_it(str(products))
                # The first feature type seen for a product name is kept.
                if product_name not in geneCorpus:
                    geneCorpus[product_name] = feature.type
#---------------------------------------------------------------------------------------------------------------------------------
############################Create Corpus File ####################################
# Make sure the output directory exists; open() does not create it.
if not os.path.isdir('Output'):
    os.makedirs('Output')

# Write a tab-separated table: a header row, then one row per gene product.
with open('Output/gene_corpus.txt', 'w') as outputFile:
    outputFile.write('\t'.join(['Gene_product', 'Feature_type', 'Category']) + '\n')
    for product, product_feature_type in geneCorpus.items():
        # The trailing empty field is the (intentionally blank) Category
        # column, so every row has the same three columns as the header.
        outputFile.write('\t'.join([product, product_feature_type, '']) + '\n')
#---------------------------------------------------------------------------------------------------------------------------------