-
Notifications
You must be signed in to change notification settings - Fork 11
/
fix_bedfile_genome_boundaries.py
executable file
·55 lines (40 loc) · 1.47 KB
/
fix_bedfile_genome_boundaries.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
#!/usr/bin/env python
import csv
import os
import sys
# Suffix of the chromosome sizes file; getChrFile builds the file name as
# "<assembly>.<EXTENSION>".
EXTENSION = "chromSizes"
# Root folder used by getChrFile as the fallback location
# ("<root>/<assembly>/<assembly>.<EXTENSION>") when the GENOMES environment
# variable is unset or does not lead to an existing file.
DEFAULT_CHROMSIZES_LOCATION = "/data/groups/lab_bock/shared/resources/genomes"
def getChrSizes(chrmFile):
    """
    Read a tab-delimited chromosome sizes file into a dictionary.

    Each non-blank line must hold the chromosome name in its first column
    and the chromosome length in its second column; any further columns are
    ignored. Blank or whitespace-only lines (for example a trailing empty
    line) are skipped instead of raising IndexError.

    :param chrmFile: path to the chromosome sizes file.
    :returns: dictionary mapping chromosome name (str) to its length (int).
    :raises IndexError: if a non-blank line has fewer than two columns.
    :raises ValueError: if the second column cannot be parsed as an integer.
    """
    chrmSizes = {}
    with open(chrmFile, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Nothing to parse; splitting an empty string would yield a
                # single empty field and fail on the length column below.
                continue
            row = line.split('\t')
            chrmSizes[str(row[0])] = int(row[1])
    return chrmSizes
def getChrFile(assembly):
    """
    Return the path of the chromosome sizes file for a genome assembly.

    The file is named "<assembly>.<EXTENSION>" and is expected inside a
    folder named after the assembly. The folder given by the GENOMES
    environment variable is tried first; if that variable is unset, or the
    file does not exist under it, the path under
    DEFAULT_CHROMSIZES_LOCATION is returned instead (without checking that
    it exists).

    :param assembly: genome assembly name used for both folder and file name.
    :returns: path to the chromosome sizes file as a string.
    """
    basename = "{}.{}".format(assembly, EXTENSION)
    fallback = os.path.join(DEFAULT_CHROMSIZES_LOCATION, assembly, basename)

    genomes_root = os.getenv("GENOMES")
    if genomes_root is None:
        # No environment override configured.
        return fallback

    candidate = os.path.join(genomes_root, assembly, basename)
    if os.path.exists(candidate):
        return candidate
    return fallback
# Filter tab-delimited BED intervals read from stdin, writing to stdout only
# the rows that lie on a chromosome known for the given assembly and that
# pass the boundary checks below. The assembly name is the first argument.
if len(sys.argv) < 2:
    sys.exit(
        "usage: {} <genome_assembly> < in.bed > out.bed".format(sys.argv[0]))

genome = sys.argv[1]
chrSizesFilepath = getChrFile(genome)
chrms = getChrSizes(chrSizesFilepath)  # chromosome name -> length

wr = csv.writer(sys.stdout, delimiter='\t', lineterminator='\n')

# iter(readline, '') hands the reader one line at a time until EOF.
for row in csv.reader(iter(sys.stdin.readline, ''), delimiter='\t'):
    if not row:
        # csv.reader yields an empty list for a blank line; indexing it
        # below would raise IndexError.
        continue

    chrm = row[0]
    start = int(row[1])
    end = int(row[2])

    if chrm not in chrms:  # skip chromosomes absent from the sizes file
        continue

    # NOTE(review): BED start coordinates are 0-based, so `start >= 1` also
    # drops intervals beginning at position 0 - confirm this is intended.
    if start >= 1 and end <= chrms[chrm] and start < end:
        wr.writerow(row)