diff --git a/bin/deduplicate_annotations.py b/bin/deduplicate_annotations.py old mode 100644 new mode 100755 diff --git a/bin/helper.py b/bin/helper.py index eea690c..947ab83 100644 --- a/bin/helper.py +++ b/bin/helper.py @@ -5,6 +5,7 @@ from matplotlib.colors import LogNorm import pandas as pd + def parse_fasta(fasta_file): """Parses a fasta file and returns a dictionary of segment names and sequences. @@ -27,24 +28,25 @@ def parse_fasta(fasta_file): with open(fasta_file) as file: # read genome file for line in file: # parse genome file if line.startswith(">"): # parse header - #* if there is a header already, we store the current sequence - #* to this header. + # * if there is a header already, we store the current sequence + # * to this header. if header: fasta_dict[header] = seq - #* then we flush the sequence + # * then we flush the sequence seq = "" - #* and parse the new header + # * and parse the new header header = line.strip()[1:] else: - #* if no header is detected, we append the new line - #* to the current sequence + # * if no header is detected, we append the new line + # * to the current sequence seq += line.strip() - #* after the last line, we have to store - #* the last sequence as well. Since no new line with ">" occurs, we - #* do this manually + # * after the last line, we have to store + # * the last sequence as well. Since no new line with ">" occurs, we + # * do this manually fasta_dict[header] = seq return fasta_dict + def make_combination_array(genome_dict, intra_only=False): """ Creates a dictionary of numpy array of all possible genome segment combinations. @@ -72,21 +74,24 @@ def make_combination_array(genome_dict, intra_only=False): # * the iterator to a list. Actually, we also could just put the iterator in the for loop. # * should work as well. Is a tad more memory efficient. if intra_only: - segment_combinations = list(itertools.combinations_with_replacement(segments, 2)) + segment_combinations = list( + itertools.combinations_with_replacement(segments, 2) + ) segment_combinations = [ segment_combination for segment_combination in segment_combinations if segment_combination[0] == segment_combination[1] ] else: - segment_combinations = list(itertools.combinations_with_replacement(segments, 2)) + segment_combinations = list( + itertools.combinations_with_replacement(segments, 2) + ) segment_combinations = [ segment_combination for segment_combination in segment_combinations if segment_combination[0] != segment_combination[1] ] - for segment_combination in segment_combinations: # for segment_combination in itertools.combinations_with_replacement(segments,2): # * this should work as well combination_arrays[segment_combination] = np.zeros( @@ -98,38 +103,6 @@ def make_combination_array(genome_dict, intra_only=False): return combination_arrays -def parse_annotation_table(annotation_table): - """ - Parses an annotation table and returns a dictionary of segment names and annotations. - The annotation table must have the following columns: id,segment01,start01,end01,segment02,start02,end02 - - Parameters - ---------- - annotation_table : str - Path to annotation table. - - Returns - ------- - annotation_dict : dict - Dictionary of segment names and annotations. - - """ - annotation_dict = {} - with open(annotation_table) as file: - for line in file: - if not line.startswith("id"): - line = line.strip().split("\t") - annotation_dict[line[0]] = { - "segment01": line[1], - "start01": int(line[2]), - "end01": int(line[3]), - "segment02": line[4], - "start02": int(line[5]), - "end02": int(line[6]), - } - return annotation_dict - - def positive_to_negative_strand_point(genome_dict, segment, position): """Transpose a point from the positive to the negative strand. @@ -147,6 +120,7 @@ def positive_to_negative_strand_point(genome_dict, segment, position): # Get the position on the negative strand return aLen - position + 1 + def negative_to_positive_strand(genome_dict, aSeq, cai, caj, bSeq, cbi, cbj): """Transpose the region of interest from the negative to the positive strand. @@ -178,6 +152,7 @@ def negative_to_positive_strand(genome_dict, aSeq, cai, caj, bSeq, cbi, cbj): return aSeq, cai_pos, caj_pos, bSeq, cbi_pos, cbj_pos + def positive_to_negative_strand(genome_dict, aSeq, cai, caj, bSeq, cbi, cbj): """Transpose the region of interest from the positive to the negative strand. @@ -238,12 +213,10 @@ def parse_annotation_table(annotation_table): "end02", ] regions = pd.read_csv(annotation_table, names=header) - # add id column + # add id column regions["id"] = regions.index elif annotation_table.lower().endswith(".tsv"): regions = pd.read_csv(annotation_table, sep="\t") else: - raise ValueError( - "Annotation table must be either a csv file or a xlsx file." - ) - return regions \ No newline at end of file + raise ValueError("Annotation table must be either a csv file or a xlsx file.") + return regions diff --git a/bin/make_circos_files.py b/bin/make_circos_files.py index 05a5eec..610b92c 100755 --- a/bin/make_circos_files.py +++ b/bin/make_circos_files.py @@ -448,7 +448,10 @@ def main(): DESeq2_results = DESeq2_results.rename(columns={"Unnamed: 0": "id"}) # Read annotation table - annotation_table = hp.parse_annotation_table(annotation_table) + annotation_table = pd.read_csv(annotation_table, header=0, index_col=0, sep="\t") + + # Remove rows in DESeq2 results that are not in the annotation table + DESeq2_results = DESeq2_results[DESeq2_results["id"].isin(annotation_table.index)] # Parse genome segment names and sequences genome_dict = hp.parse_fasta(genome_file) diff --git a/main.nf b/main.nf index 17ca846..0c58186 100644 --- a/main.nf +++ b/main.nf @@ -73,6 +73,7 @@ include { fillArrays; mergeArrays } from './modules/handle_arrays.nf' include { plotHeatmaps as plotHeatmapsRaw } from './modules/data_visualization.nf' include { plotHeatmaps as plotHeatmapsMerged } from './modules/data_visualization.nf' include { plotHeatmapsAnnotated } from './modules/data_visualization.nf' +include { plotHeatmapsAnnotated as plotHeatmapsAnnotatedDedup } from './modules/data_visualization.nf' include { annotateArrays; mergeAnnotations } from './modules/annotate_interactions.nf' // differential analysis include { generateCountTables; mergeCountTables; runDESeq2 } from './modules/differential_analysis.nf' @@ -214,6 +215,7 @@ workflow { ) // Generate count tables + annotated_trns_ch.view() count_tables_ch = generateCountTables( annotated_trns_ch ) merged_count_tables_ch = mergeCountTables( count_tables_ch @@ -232,10 +234,17 @@ workflow { // Deduplicate annotations deduplicate_annotations_input_ch = merged_count_tables_all_ch // group_name, merged_count_table .combine( mergeAnnotations.out ) // merged_annotations - .map( it -> [ it[0], it[2], it[3] ] ) // group name, count table, annotations - deduplicate_annotations_input_ch.view() + .map( it -> [ it[0], it[2], it[1] ] ) // group name, count table, annotations + deduplicate_annotations_input_ch deduplicateAnnotations( deduplicate_annotations_input_ch ) + // Plot deduplicated annotations on the heatmaps + dedup_heatmaps_ch = annotated_arrays_ch + .map( it -> [ it[0], it[1], it[2], it[3] ] ) // sample name, genome, array, annotations + .combine( deduplicateAnnotations.out ) + dedup_heatmaps_ch.view() + plotHeatmapsAnnotatedDedup( dedup_heatmaps_ch ) + // Run differential analysis with DESeq2 samples_input_ch = Channel .fromPath( params.comparisons, checkIfExists: true ) @@ -244,6 +253,7 @@ workflow { .map( it -> [ it[1], it[0], it[2] ] ) .combine( merged_count_tables_ch, by: 0 ) .map( it -> [ it[1], it[2], it[0], it[3] ] ) + .view() runDESeq2( samples_input_ch ) @@ -271,7 +281,7 @@ workflow { .map( it -> [ it[0], it[2], it[1] ] ) .combine( deduplicateAnnotations.out ) } - + circos_deseq2_ch.view() // Create circos tables makeCircosTable_deseq2( circos_deseq2_ch ) makeCircosTable_count_table( circos_count_table_ch ) diff --git a/modules/annotate_interactions.nf b/modules/annotate_interactions.nf index faf92aa..a02f936 100644 --- a/modules/annotate_interactions.nf +++ b/modules/annotate_interactions.nf @@ -50,7 +50,7 @@ process deduplicateAnnotations { tuple val(group), path(annotation_table), path(count_table) output: - tuple path("deduplicated_annotations/deduplicated_annotation_table.tsv"), path("deduplicated_annotations/deduplicated_count_table.tsv") + tuple val(group), path("deduplicated_annotations/annotation_table_deduplicated.tsv"), path("deduplicated_annotations/count_table_deduplicated.tsv") publishDir "${params.output}/06-annotations" , mode: 'copy' diff --git a/modules/data_visualization.nf b/modules/data_visualization.nf index 96d2480..f7714c3 100644 --- a/modules/data_visualization.nf +++ b/modules/data_visualization.nf @@ -50,68 +50,14 @@ process makeCircosTable_count_table { label 'RNAswarm_small' input: - tuple val(genome_name), path(genome), path(count_table), path(annotation_table) + tuple val(genome_name), path(genome), path(genome_count_table), val(group_name), path(annotation_table), path(global_count_table) output: tuple val(genome_name), path("${genome_name}_circos") script: """ - make_circos_files.py -c ${count_table} -a ${annotation_table} -g ${genome} -o ${genome_name}_circos - """ -} - -/************************************************************************* -* make circos table for count_table results -*************************************************************************/ -process makeCircosTable_count_table_30 { - label 'RNAswarm_small' - - input: - tuple val(genome_name), path(genome), path(count_table), path(annotation_table) - - output: - tuple val(genome_name), path("${genome_name}_circos_30") - - script: - """ - make_circos_files.py -c ${count_table} -a ${annotation_table} -g ${genome} -o ${genome_name}_circos_30 --number_of_top_hits 30 - """ -} - -/************************************************************************* -* make circos table for count_table results -*************************************************************************/ -process makeCircosTable_count_table_40 { - label 'RNAswarm_small' - - input: - tuple val(genome_name), path(genome), path(count_table), path(annotation_table) - - output: - tuple val(genome_name), path("${genome_name}_circos_40") - - script: - """ - make_circos_files.py -c ${count_table} -a ${annotation_table} -g ${genome} -o ${genome_name}_circos_40 --number_of_top_hits 40 - """ -} - -/************************************************************************* -* make circos table for count_table results -*************************************************************************/ -process makeCircosTable_count_table_50 { - label 'RNAswarm' - - input: - tuple val(genome_name), path(genome), path(count_table), path(annotation_table) - - output: - tuple val(genome_name), path("${genome_name}_circos_50") - - script: - """ - make_circos_files.py -c ${count_table} -a ${annotation_table} -g ${genome} -o ${genome_name}_circos_50 --number_of_top_hits 50 + make_circos_files.py -c ${genome_count_table} -a ${annotation_table} -g ${genome} -o ${genome_name}_circos """ } @@ -122,7 +68,7 @@ process makeCircosTable_deseq2 { label 'RNAswarm_small' input: - tuple val(genome_name_01), path(genome_01), val(genome_name_02), path(genome_02), path(results_DESeq2), path(annotation_table) + tuple val(genome_name_01), path(genome_01), val(genome_name_02), path(genome_02), path(results_DESeq2), val(group_name), path(annotation_table), path(global_count_table) output: tuple val(genome_name_01), val(genome_name_02), path("${genome_name_01}_${genome_name_02}_circos") diff --git a/modules/differential_analysis.nf b/modules/differential_analysis.nf index fda83b4..997bbd5 100644 --- a/modules/differential_analysis.nf +++ b/modules/differential_analysis.nf @@ -19,6 +19,7 @@ process generateCountTables { """ } + /************************************************************************* # Merge count tables *************************************************************************/