Merge pull request #8 from Integrative-Transcriptomics/redevelopment

Redevelopment
Integrative-Transcriptomics · Mar 31, 2023 · 8321ceb · 8321ceb
2 parents a12386f + 26cc992
commit 8321ceb
Show file tree

Hide file tree

Showing 179 changed files with 585,150 additions and 1,612 deletions.
diff --git a/Analysis/Resolution_Evaluation/LepraResolution1_14.png b/Analysis/Resolution_Evaluation/LepraResolution1_14.png
diff --git a/Analysis/Resolution_Evaluation/LepraResolution1_21.png b/Analysis/Resolution_Evaluation/LepraResolution1_21.png
diff --git a/Analysis/Resolution_Evaluation/LepraResolution1_7.png b/Analysis/Resolution_Evaluation/LepraResolution1_7.png
diff --git a/Analysis/Resolution_Evaluation/LepraResolution2.png b/Analysis/Resolution_Evaluation/LepraResolution2.png
diff --git a/Analysis/Resolution_Evaluation/LepraResolution3.png b/Analysis/Resolution_Evaluation/LepraResolution3.png
diff --git a/Analysis/Resolution_Evaluation/README.md b/Analysis/Resolution_Evaluation/README.md
@@ -0,0 +1,11 @@
+# Resolution Evaluation
+
+This directory contains the evaluation of the resolution algorithm. 
+
+The `generateUnresolvedFiles.py` subscript generates two SNP tables, a ground truth SNP table containing only resolved bases and a SNP table containing the same rows as the ground truth SNP table but with some unresolved bases instead of resolved bases.
+
+The `evaluationResolution.py` subscript evaluates the performance of the resolution by comparing the resolved SNP table after the resolution with the ground truth and unresolved SNP table.
+
+The `resolutionComparison.py` script is the script that needs to be executed to run the resolution performance as it combines the `generateUnresolvedFiles.py` script and the `evaluationResolution.py` script with CLASSICOv2.
+
+The images visualize the results of the unresolved base resolution for the *Treponema pallidum* and *Mycobacterium leprae* datasets. The results with index 1 are from the first analysis in which only one base per row was replaced with an unresolved base and with different amounts of duplications of each row (7/14/21 or 10/20/30). The results with index 2 are from the second analysis in which multiple bases per row were replaced with unresolved bases and the results with index 3 are from the third analysis in which entire clades were replaced with unresolved bases.
diff --git a/Analysis/Resolution_Evaluation/TreponemaResolution1_10.png b/Analysis/Resolution_Evaluation/TreponemaResolution1_10.png
diff --git a/Analysis/Resolution_Evaluation/TreponemaResolution1_20.png b/Analysis/Resolution_Evaluation/TreponemaResolution1_20.png
diff --git a/Analysis/Resolution_Evaluation/TreponemaResolution1_30.png b/Analysis/Resolution_Evaluation/TreponemaResolution1_30.png
diff --git a/Analysis/Resolution_Evaluation/TreponemaResolution2.png b/Analysis/Resolution_Evaluation/TreponemaResolution2.png
diff --git a/Analysis/Resolution_Evaluation/TreponemaResolution3.png b/Analysis/Resolution_Evaluation/TreponemaResolution3.png
diff --git a/Analysis/Resolution_Evaluation/evaluateResolution.py b/Analysis/Resolution_Evaluation/evaluateResolution.py
@@ -0,0 +1,61 @@
+# ==========================================================
+# This script evaluates the resolution of unresolved bases by 
+# outputting correctly resolved, incorrectly resolved and 
+# unresolved bases after resolution
+# ==========================================================
+
+# input:
+# 1. path for SNP table containing ground truth
+# 2. path for SNP table after resolution
+# 3. path for modified SNP table with unresolved bases
+
+# read input arguments
+import sys
+input = sys.argv
+truth = open(input[1], 'r')
+resolved = open(input[2], 'r')
+unresolved = open(input[3], 'r')
+
+# initialize counts
+correct_resolved = 0
+incorrect_resolved = 0
+unresolved_resolved = 0
+
+# read lines for each file
+for line_truth in truth.readlines():
+    line_resolved = resolved.readline()
+    line_unresolved = unresolved.readline()
+    line_truth = line_truth[:len(line_truth) - 1]
+    line_resolved = line_resolved[:len(line_resolved) - 1]
+    line_unresolved = line_unresolved[:len(line_unresolved) - 1]
+
+    # split lines into columns
+    split_line_truth = line_truth.split('\t')
+    split_line_resolved = line_resolved.split('\t')
+    split_line_unresolved = line_unresolved.split('\t')
+
+    # check that the number of columns is equal for all three files
+    if (len(split_line_resolved) != len(split_line_truth) or len(split_line_truth) != len(split_line_unresolved) or len(split_line_resolved) != len(split_line_unresolved)):
+        print("Warning: Number of columns not equal!")
+        print(len(split_line_resolved) + ":" + len(split_line_truth) + ":" + len(split_line_unresolved))
+        break
+
+    # iterate over all SNPs (columns) that belong to samples (exclude position and reference)
+    for count in range(2, len(split_line_unresolved)):
+        # if the unresolved file contains a N this N should be resolved by the algorithm
+        if split_line_unresolved[count] == 'N':
+            # if the resolved file also contains a N, the base remained unresolved -> increase unresolved count
+            if split_line_resolved[count] == 'N':
+                unresolved_resolved += 1
+            else:
+                # if value in the groung truth file is equal to the resolved file the resolution was correct -> increase correct resolution count
+                if split_line_truth[count] in split_line_resolved[count]:
+                    correct_resolved += 1
+                # otherwise it was incorrect -> increase incorrect resolution count
+                else:
+                    incorrect_resolved += 1
+
+# print number of incorrect, correct and unresolved bases
+print('Incorrect Resolution: ' + str(incorrect_resolved))
+print('Correct Resolution: ' + str(correct_resolved))
+print('Unresolved Bases: ' + str(unresolved_resolved))
diff --git a/Analysis/Resolution_Evaluation/generateUnresolvedFiles.py b/Analysis/Resolution_Evaluation/generateUnresolvedFiles.py
@@ -0,0 +1,241 @@
+import sys
+import random
+import copy
+import re
+import itertools
+
+# ===========================================================
+# This script generates a SNP table containing unresolved bases and a corresponding
+# ground truth SNP table from an input SNP table for the resolution evaluation
+# ===========================================================
+
+# Function that maps a line from the SNP table to a strucuture:
+# In comparison to the line from a SNP table, the structure 
+# contains numbers instead of letters where each number corresponds
+# to a letter. The first letter that occurs in a line gets the number
+# 1, the second gets number 2, ...
+# Example: 
+# The line A T A G G A would get the structure 1 2 1 3 3 1.
+
+def line_to_structure(line):
+    mapping = {}
+    splits = line.split('\t')
+    count = 0
+    structure = ""
+    for split in splits[1:]:
+        if split not in mapping:
+            mapping[split] = count
+            count += 1
+        structure += str(mapping[split]) + "\t"
+    return structure
+
+# read input arguments
+# 1. path of original SNP table
+# 2. path of generated unresolved SNP table
+# 3. path of generated ground truth SNP table
+# 4. number of repetitions for each position
+# 5. number of unresolved clades per row 
+# 6. path of corresponding Newick tree file
+# 7. size of unresolved clades
+
+input = sys.argv
+# read SNP table
+file = open(input[1], 'r')
+
+# initialize arrays that save the structures that are already seen / used in the SNP table 
+seen_structures = []
+# and the row index of the first occurance of each structure
+not_seen_ind = []
+
+# get sample names from SNP table
+line = file.readline()
+line = line[:len(line) - 1]
+names = line.split('\t')
+
+# write sample names to new unresolved file and ground truth file
+new_unresolved = open(input[2], 'w')
+new_unresolved.write(line + "\n")
+new_truth = open(input[3], 'w')
+new_truth.write(line + "\n")
+
+# read parameters that influence the output files
+num_repetitions = int(input[4])
+number_ns = int(input[5])
+clade_size = int(input[7])
+
+# read newick string
+newick = str(open(input[6], 'r').readlines())
+
+# for third analysis
+# compute clades with the specified clade_size
+# method: find small clades of size 2 and try to extend them to a specific clade size
+if clade_size > 1:
+
+    # regex to get any two leaf nodes that are siblings (i.e. clades of size 2)
+    my_regex =  "\([^():,]+:[.\d]+,[^():,]+:[.\d]+\)"
+    reg = re.compile(my_regex)
+    matches = reg.finditer(newick)
+    clades = []
+
+    # try to extend these clades further
+    for m in matches:
+        left_bracket_count = 1
+        right_bracket_count = 1
+        next_left = m.start()
+        next_right = m.end()
+
+        # as long as the clade size is not reached, the following steps are repeated
+        while(left_bracket_count < clade_size - 1):
+
+            # extend to left and right
+            next_left -= 1
+            next_right += 1
+
+            # find position of opening bracket that belongs to parent of current clades root
+            while(not(newick[next_left] == '(' and left_bracket_count == right_bracket_count)):                                
+                if newick[next_left] == ')':
+                    right_bracket_count += 1
+                elif newick[next_left] == '(':
+                    left_bracket_count += 1
+                next_left -= 1
+            left_bracket_count += 1
+
+            # find position of closing bracket that belongs to parent of current clades root
+            while(not(newick[next_right] == ')' and right_bracket_count + 1 == left_bracket_count)):                                
+                if newick[next_right] == ')':
+                    right_bracket_count += 1
+                elif newick[next_right] == '(':
+                    left_bracket_count += 1
+                next_right += 1
+            right_bracket_count += 1
+
+        # check that the size of the clade is not exceeded and that the clade is not already detected
+        if left_bracket_count == clade_size -1:
+            if newick[next_left:next_right + 1] not in clades:
+                clades.append(newick[next_left:next_right + 1])
+
+
+# read all lines of original SNP table and generate unresolved SNP table and ground truth
+line_counter = 1
+for line in file.readlines():
+    # copy old line for ground truth file
+    oldline = copy.copy(line)
+
+    # check if there are any N's in the line
+    line = line[:len(line) - 1]
+    count = 0
+    splits = line.split('\t')
+    for split in splits[2:]:
+        if split == 'N':
+            count += 1
+
+    # only lines without any N's are used
+    if count == 0:
+
+        # only consider SNP structure which are not already used
+        curr_structure = line_to_structure(line)
+        if curr_structure not in seen_structures:
+            seen_structures.append(curr_structure)
+            not_seen_ind.append(line_counter)
+            count_N = 0
+            list_randint = []
+
+            # 1st case: single bases should be replaced with N's 
+            if clade_size == 1:
+
+                # construct multiple unresolved lines per position
+                while count_N < num_repetitions:
+
+                    # compute random positions of unresolved bases
+                    randInts = random.sample(range(2,len(splits)),number_ns)
+                    is_same = False
+
+                    # 1st criterion: check if same pattern is already constructed for this position
+                    for entry in list_randint:
+                        if sorted(randInts) == sorted(entry):
+                            is_same = True
+
+                    # 2nd criterion: for multiple unresolved bases per row check if there are any two unresolved bases that are siblings
+                    if len(randInts) > 1:
+                        for combination in itertools.combinations(randInts, 2):
+                            my_regex = rf"\({names[combination[0]]}:[.\d]+,{names[combination[1]]}:[.\d]+\)"
+                            if re.search(my_regex, str(newick)) != None:
+                                is_same = True
+
+                    # don't consider lines that fulfil any of the two criterions that are checked above
+                    if is_same:
+                        continue
+                    # --------------------------------------------------
+                    # otherwise
+                    # write the old line to the ground truth file
+                    new_truth.write(oldline)
+
+                    # generate a new line with unresolved bases at the above computed positions
+                    previouses = {}
+                    for randInt in randInts:
+                        # save resolved bases in dictionary before replacing them
+                        previouses[randInt] = splits[randInt]
+                        splits[randInt] = 'N'
+                    newline = '\t'.join(splits) + '\n'
+
+                    # write new line to unresolved file
+                    new_unresolved.write(newline)
+                    list_randint.append(randInts)
+
+                    # reset replaced bases with saved resolved bases before replacement
+                    for randInt in randInts:
+                        splits[randInt] = previouses[randInt]
+                    count_N += 1
+
+            # 2nd case: entire clades should be replaced with unresolved bases
+            else:
+                if len(clades) > 0:
+
+                    # get random clade out of clades with specified size
+                    if len(clades) > 1:
+                        randInt = random.randrange(0,len(clades) - 1)
+                    else:
+                        randInt = 0
+                    clade = clades[randInt]
+
+                    # find all leaf nodes of that clade
+                    my_regex2 = "[^():,]+:[.\d]+"
+                    matches2 = re.findall(my_regex2, clade)
+                    list_indices = []
+                    for match2 in matches2:
+                        match = match2.split(':')
+                        count = 0
+                        for name in names:
+                            if match[0] == name:
+                                # save index of the sample of the current node
+                                list_indices.append(count)
+                            count += 1
+
+                    # write the old line to the ground truth file
+                    new_truth.write(oldline)
+
+                    # generate a new line with unresolved bases at the above computed positions
+                    previouses = {}
+                    for ind in list_indices:
+                        # save resolved bases in dictionary before replacing them
+                        previouses[ind] = splits[ind]
+                        splits[ind] = 'N'
+                    newline = '\t'.join(splits) + '\n'
+
+                    # write new line to unresolved file
+                    new_unresolved.write(newline)
+
+                    # reset replaced bases with saved resolved bases before replacement
+                    for ind in list_indices:
+                        splits[ind] = previouses[ind]
+                else:
+                    print('No clade of this size in the tree')
+
+    line_counter += 1
+
+# print output
+print('different structures:', len(not_seen_ind))
+if clade_size > 1:
+    print('possible clades:', len(clades))
+
+