Skip to content

Commit

Permalink
Merge pull request #8 from Integrative-Transcriptomics/redevelopment
Browse files Browse the repository at this point in the history
Redevelopment
  • Loading branch information
mwittep authored Mar 31, 2023
2 parents a12386f + 26cc992 commit 8321ceb
Show file tree
Hide file tree
Showing 179 changed files with 585,150 additions and 1,612 deletions.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
11 changes: 11 additions & 0 deletions Analysis/Resolution_Evaluation/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Resolution Evaluation

This directory contains the evaluation of the resolution algorithm.

The `generateUnresolvedFiles.py` subscript generates two SNP tables, a ground truth SNP table containing only resolved bases and a SNP table containing the same rows as the ground truth SNP table but with some unresolved bases instead of resolved bases.

The `evaluationResolution.py` subscript evaluates the performance of the resolution by comparing the resolved SNP table after the resolution with the ground truth and unresolved SNP table.

The `resolutionComparison.py` script is the script that needs to be executed to run the resolution performance as it combines the `generateUnresolvedFiles.py` script and the `evaluationResolution.py` script with CLASSICOv2.

The images visualize the results of the unresolved base resolution for the *Treponema pallidum* and *Mycobacterium leprae* datasets. The results with index 1 are from the first analysis in which only one base per row was replaced with an unresolved base and with different amounts of duplications of each row (7/14/21 or 10/20/30). The results with index 2 are from the second analysis in which multiple bases per row were replaced with unresolved bases and the results with index 3 are from the third analysis in which entire clades were replaced with unresolved bases.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
61 changes: 61 additions & 0 deletions Analysis/Resolution_Evaluation/evaluateResolution.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# ==========================================================
# This script evaluates the resolution of unresolved bases by
# outputting correctly resolved, incorrectly resolved and
# unresolved bases after resolution
# ==========================================================

# input:
# 1. path for SNP table containing ground truth
# 2. path for SNP table after resolution
# 3. path for modified SNP table with unresolved bases

# read input arguments
import sys
input = sys.argv
truth = open(input[1], 'r')
resolved = open(input[2], 'r')
unresolved = open(input[3], 'r')

# initialize counts
correct_resolved = 0
incorrect_resolved = 0
unresolved_resolved = 0

# read lines for each file
for line_truth in truth.readlines():
line_resolved = resolved.readline()
line_unresolved = unresolved.readline()
line_truth = line_truth[:len(line_truth) - 1]
line_resolved = line_resolved[:len(line_resolved) - 1]
line_unresolved = line_unresolved[:len(line_unresolved) - 1]

# split lines into columns
split_line_truth = line_truth.split('\t')
split_line_resolved = line_resolved.split('\t')
split_line_unresolved = line_unresolved.split('\t')

# check that the number of columns is equal for all three files
if (len(split_line_resolved) != len(split_line_truth) or len(split_line_truth) != len(split_line_unresolved) or len(split_line_resolved) != len(split_line_unresolved)):
print("Warning: Number of columns not equal!")
print(len(split_line_resolved) + ":" + len(split_line_truth) + ":" + len(split_line_unresolved))
break

# iterate over all SNPs (columns) that belong to samples (exclude position and reference)
for count in range(2, len(split_line_unresolved)):
# if the unresolved file contains a N this N should be resolved by the algorithm
if split_line_unresolved[count] == 'N':
# if the resolved file also contains a N, the base remained unresolved -> increase unresolved count
if split_line_resolved[count] == 'N':
unresolved_resolved += 1
else:
# if value in the groung truth file is equal to the resolved file the resolution was correct -> increase correct resolution count
if split_line_truth[count] in split_line_resolved[count]:
correct_resolved += 1
# otherwise it was incorrect -> increase incorrect resolution count
else:
incorrect_resolved += 1

# print number of incorrect, correct and unresolved bases
print('Incorrect Resolution: ' + str(incorrect_resolved))
print('Correct Resolution: ' + str(correct_resolved))
print('Unresolved Bases: ' + str(unresolved_resolved))
241 changes: 241 additions & 0 deletions Analysis/Resolution_Evaluation/generateUnresolvedFiles.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,241 @@
import sys
import random
import copy
import re
import itertools

# ===========================================================
# This script generates a SNP table containing unresolved bases and a corresponding
# ground truth SNP table from an input SNP table for the resolution evaluation
# ===========================================================

# Function that maps a line from the SNP table to a strucuture:
# In comparison to the line from a SNP table, the structure
# contains numbers instead of letters where each number corresponds
# to a letter. The first letter that occurs in a line gets the number
# 1, the second gets number 2, ...
# Example:
# The line A T A G G A would get the structure 1 2 1 3 3 1.

def line_to_structure(line):
mapping = {}
splits = line.split('\t')
count = 0
structure = ""
for split in splits[1:]:
if split not in mapping:
mapping[split] = count
count += 1
structure += str(mapping[split]) + "\t"
return structure

# read input arguments
# 1. path of original SNP table
# 2. path of generated unresolved SNP table
# 3. path of generated ground truth SNP table
# 4. number of repetitions for each position
# 5. number of unresolved clades per row
# 6. path of corresponding Newick tree file
# 7. size of unresolved clades

input = sys.argv
# read SNP table
file = open(input[1], 'r')

# initialize arrays that save the structures that are already seen / used in the SNP table
seen_structures = []
# and the row index of the first occurance of each structure
not_seen_ind = []

# get sample names from SNP table
line = file.readline()
line = line[:len(line) - 1]
names = line.split('\t')

# write sample names to new unresolved file and ground truth file
new_unresolved = open(input[2], 'w')
new_unresolved.write(line + "\n")
new_truth = open(input[3], 'w')
new_truth.write(line + "\n")

# read parameters that influence the output files
num_repetitions = int(input[4])
number_ns = int(input[5])
clade_size = int(input[7])

# read newick string
newick = str(open(input[6], 'r').readlines())

# for third analysis
# compute clades with the specified clade_size
# method: find small clades of size 2 and try to extend them to a specific clade size
if clade_size > 1:

# regex to get any two leaf nodes that are siblings (i.e. clades of size 2)
my_regex = "\([^():,]+:[.\d]+,[^():,]+:[.\d]+\)"
reg = re.compile(my_regex)
matches = reg.finditer(newick)
clades = []

# try to extend these clades further
for m in matches:
left_bracket_count = 1
right_bracket_count = 1
next_left = m.start()
next_right = m.end()

# as long as the clade size is not reached, the following steps are repeated
while(left_bracket_count < clade_size - 1):

# extend to left and right
next_left -= 1
next_right += 1

# find position of opening bracket that belongs to parent of current clades root
while(not(newick[next_left] == '(' and left_bracket_count == right_bracket_count)):
if newick[next_left] == ')':
right_bracket_count += 1
elif newick[next_left] == '(':
left_bracket_count += 1
next_left -= 1
left_bracket_count += 1

# find position of closing bracket that belongs to parent of current clades root
while(not(newick[next_right] == ')' and right_bracket_count + 1 == left_bracket_count)):
if newick[next_right] == ')':
right_bracket_count += 1
elif newick[next_right] == '(':
left_bracket_count += 1
next_right += 1
right_bracket_count += 1

# check that the size of the clade is not exceeded and that the clade is not already detected
if left_bracket_count == clade_size -1:
if newick[next_left:next_right + 1] not in clades:
clades.append(newick[next_left:next_right + 1])


# read all lines of original SNP table and generate unresolved SNP table and ground truth
line_counter = 1
for line in file.readlines():
# copy old line for ground truth file
oldline = copy.copy(line)

# check if there are any N's in the line
line = line[:len(line) - 1]
count = 0
splits = line.split('\t')
for split in splits[2:]:
if split == 'N':
count += 1

# only lines without any N's are used
if count == 0:

# only consider SNP structure which are not already used
curr_structure = line_to_structure(line)
if curr_structure not in seen_structures:
seen_structures.append(curr_structure)
not_seen_ind.append(line_counter)
count_N = 0
list_randint = []

# 1st case: single bases should be replaced with N's
if clade_size == 1:

# construct multiple unresolved lines per position
while count_N < num_repetitions:

# compute random positions of unresolved bases
randInts = random.sample(range(2,len(splits)),number_ns)
is_same = False

# 1st criterion: check if same pattern is already constructed for this position
for entry in list_randint:
if sorted(randInts) == sorted(entry):
is_same = True

# 2nd criterion: for multiple unresolved bases per row check if there are any two unresolved bases that are siblings
if len(randInts) > 1:
for combination in itertools.combinations(randInts, 2):
my_regex = rf"\({names[combination[0]]}:[.\d]+,{names[combination[1]]}:[.\d]+\)"
if re.search(my_regex, str(newick)) != None:
is_same = True

# don't consider lines that fulfil any of the two criterions that are checked above
if is_same:
continue
# --------------------------------------------------
# otherwise
# write the old line to the ground truth file
new_truth.write(oldline)

# generate a new line with unresolved bases at the above computed positions
previouses = {}
for randInt in randInts:
# save resolved bases in dictionary before replacing them
previouses[randInt] = splits[randInt]
splits[randInt] = 'N'
newline = '\t'.join(splits) + '\n'

# write new line to unresolved file
new_unresolved.write(newline)
list_randint.append(randInts)

# reset replaced bases with saved resolved bases before replacement
for randInt in randInts:
splits[randInt] = previouses[randInt]
count_N += 1

# 2nd case: entire clades should be replaced with unresolved bases
else:
if len(clades) > 0:

# get random clade out of clades with specified size
if len(clades) > 1:
randInt = random.randrange(0,len(clades) - 1)
else:
randInt = 0
clade = clades[randInt]

# find all leaf nodes of that clade
my_regex2 = "[^():,]+:[.\d]+"
matches2 = re.findall(my_regex2, clade)
list_indices = []
for match2 in matches2:
match = match2.split(':')
count = 0
for name in names:
if match[0] == name:
# save index of the sample of the current node
list_indices.append(count)
count += 1

# write the old line to the ground truth file
new_truth.write(oldline)

# generate a new line with unresolved bases at the above computed positions
previouses = {}
for ind in list_indices:
# save resolved bases in dictionary before replacing them
previouses[ind] = splits[ind]
splits[ind] = 'N'
newline = '\t'.join(splits) + '\n'

# write new line to unresolved file
new_unresolved.write(newline)

# reset replaced bases with saved resolved bases before replacement
for ind in list_indices:
splits[ind] = previouses[ind]
else:
print('No clade of this size in the tree')

line_counter += 1

# print output
print('different structures:', len(not_seen_ind))
if clade_size > 1:
print('possible clades:', len(clades))


Loading

0 comments on commit 8321ceb

Please sign in to comment.