diff --git a/kegg_charter.py b/kegg_charter.py deleted file mode 100644 index 47c9918..0000000 --- a/kegg_charter.py +++ /dev/null @@ -1,618 +0,0 @@ -#!/usr/bin/env python - -from PIL import Image -from argparse import ArgumentParser -import numpy as np -import os -import pandas as pd -from pathlib import Path -from re import search -from subprocess import run -import sys -from io import StringIO -from time import time, gmtime, strftime, sleep -from Bio.KEGG.REST import kegg_link, kegg_list, kegg_get -from Bio.KEGG.KGML import KGML_parser -from matplotlib import pyplot as plt, colors, cm -from tqdm import tqdm -from glob import glob - -from kegg_pathway_map import KEGGPathwayMap - -__version__ = "0.2.5" - - -def get_arguments(): - parser = ArgumentParser( - description="""KEGGCharter - A tool for representing genomic potential and transcriptomic expression into - KEGG pathways""", - epilog="Input file must be specified.") - parser.add_argument("-o", "--output", type=str, help="Output directory", - default='KEGGCharter_results') - parser.add_argument("-rd", "--resources-directory", type=str, default=sys.path[0], - help="Directory for storing KGML and CSV files.") - parser.add_argument("-mm", "--metabolic-maps", type=str, - help="IDs of metabolic maps to output", - default=','.join(KEGGCharter_prokaryotic_maps())) - parser.add_argument("-gcol", "--genomic-columns", type=str, - help="Names of columns with genomic identification") - parser.add_argument("-tcol", "--transcriptomic-columns", type=str, - help="Names of columns with transcriptomics quantification") - parser.add_argument("-tls", "--taxa-list", type=str, - help="List of taxa to represent in genomic potential charts (comma separated)") # TODO - must be tested - parser.add_argument("-not", "--number-of-taxa", type=str, default='10', - help="Number of taxa to represent in genomic potential charts (comma separated)") - parser.add_argument("-keggc", "--kegg-column", type=str, help="Column with KEGG IDs.") - parser.add_argument("-koc", "--ko-column", type=str, help="Column with KOs.") - parser.add_argument("-ecc", "--ec-column", type=str, help="Column with EC numbers.") - parser.add_argument("-iq", "--input-quantification", action="store_true", - help="If input table has no quantification, will create a mock quantification column") - parser.add_argument("-it", "--input-taxonomy", type=str, - help="If no taxonomy column exists and there is only one taxon in question.") - # TODO - test this argument without UniProt shenanigans - parser.add_argument("-tc", "--taxa-column", type=str, default='Taxonomic lineage (GENUS)', - help="Column with the taxa designations to represent with KEGGChart") - parser.add_argument("--resume", action="store_true", default=False, - help="Data inputed has already been analyzed by KEGGCharter.") - parser.add_argument('-v', '--version', action='version', version='KEGGCharter ' + __version__) - - required_named = parser.add_argument_group('required named arguments') - required_named.add_argument("-f", "--file", type=str, required=True, - help="TSV or EXCEL table with information to chart") - - special_functions = parser.add_argument_group('Special functions') - special_functions.add_argument( - "--show-available-maps", action="store_true", default=False, - help="Outputs KEGG maps IDs and descriptions to the console (so you may pick the ones you want!)") - - args = parser.parse_args() - - args.output = args.output.rstrip('/') - - for directory in [args.output]: - if not os.path.isdir(directory): - Path(directory).mkdir(parents=True, exist_ok=True) - print('Created ' + directory) - - return args - - -def timed_message(message): - print(strftime("%Y-%m-%d %H:%M:%S", gmtime()) + ': ' + message) - - -def read_input(file): - if file.endswith('.xlsx'): - return pd.read_excel(file) - return pd.read_csv(file, sep='\t') - - -def expand_by_list_column(df, column='Pathway'): - if len(df) == 0: - return pd.DataFrame() - lens = [len(item) for item in df[column]] - dictionary = dict() - for col in df.columns: - dictionary[col] = np.repeat(df[col].values, lens) - dictionary[column] = np.concatenate(df[column].values) - return pd.DataFrame(dictionary) - - -def most_abundant_taxa(data, columns, taxa_column, number_of_taxa=10): - """ - Calculates top genus from samples - :param data: data - :param columns: list of mg columns to consider for quantification of genus abundance - :param taxa_column: da column with da taxa - :param number_of_taxa: number of top genus to return - :return: list of top genus - """ - data = data.groupby(taxa_column)[columns].sum().sum(axis=1).sort_values(ascending=False) - if number_of_taxa > len(data.index.tolist()): - number_of_taxa = len(data.index.tolist()) - return data.index.tolist()[:number_of_taxa] - - -def taxa_colors(hex_values=None, ncolor=1): - """ - Creates list of hex colors to be used, using matplotlib or using custom colors - :param hex_values: list of hex colors - :param ncolor: int indicating the ammount of hex colors that should be created - :return: returns list with hex color codes - """ - if not hex_values: # if no colors are given creates a list of hex colors with ncolor from matplotlib discrete colormaps - color_scheme = (cm.get_cmap('Pastel2', 8) if ncolor <= 8 - else cm.get_cmap("Set3", 12) if ncolor <= 12 - else cm.get_cmap("rainbow", ncolor)) # if ncolor > 12 a continuous colormap is used instead - return [colors.to_hex(color_scheme(i)) for i in range(ncolor)] - - for hex_value in hex_values: - if not search(r'^#(?:[0-9a-fA-F]{3}){1,2}$', hex_value): - sys.exit(Exception("Colors aren't valid hex codes")) - return hex_values # has validated hex values and returns the original list - - -# Conversion functions -def id2id(input_ids, column, output_column, output_ids_type, step=150, desc=''): - """ - Converts KEGG_ID genes to Ortholog KO ID from KEGG - :param kegg_ids: (list) - KEGG ID genes - :param kegg_column: (str) - name of column to return - :param step: (int) - will convert "step" KEGG IDs at a time - :return: (list) - (list,list) - KEGG ID genes converted and ko IDs - """ - result = pd.DataFrame(columns=[column, output_column]) - for i in tqdm(range(0, len(input_ids), step), desc=desc): - j = min(i + step, len(input_ids)) - try: - result = pd.concat([result, pd.read_csv( - kegg_link(output_ids_type, input_ids[i:j]), sep='\t', names=[column, output_column])]) - except: - print(f'IDs conversion broke at index: {i}') - if output_ids_type == 'ko': - result[output_column] = result[output_column].apply(lambda x: x.strip('ko:')) - result[column] = result[column].apply(lambda x: x.strip('ec:')) - elif output_ids_type == 'enzyme': - result[column] = result[column].apply(lambda x: x.strip('ko:')) - result[output_column] = result[output_column].apply(lambda x: x.strip('ec:')) - return result - - -def ids_interconversion(data, column, ids_type='kegg'): - ids = list(set(data[data[column].notnull()][column])) - base_desc = 'Converting %i %s to %s through the KEGG API' - if ids_type == 'kegg': - # sometimes Kegg IDs come as mfc:BRM9_0145;mfi:DSM1535_1468; (e.g. from UniProt IDs mapping). - # Should be no problem since both IDs likely will always represent same functions - trimmed_ids = [ide.split(';')[0] for ide in ids] - relational = pd.DataFrame([ids, trimmed_ids]).transpose() - new_ids = id2id(trimmed_ids, column, 'KO (KEGGCharter)', 'ko', desc=base_desc % (len(ids), 'KEGG IDs', 'KOs')) - new_ids = pd.merge(new_ids, relational, left_on=column, right_on=1) # mcj:MCON_3003; mcj:MCON_3003 - del new_ids[column] # mcj:MCON_3003 K07486 mcj:MCON_3003; mcj:MCON_3003 - del new_ids[1] - new_ids.columns = ['KO (KEGGCharter)', column] - elif ids_type == 'ko': - new_ids = id2id( - ids, column, 'EC number (KEGGCharter)', 'enzyme', desc=base_desc % (len(ids), 'KOs', 'EC numbers')) - elif ids_type == 'ec': - ids = [f'ec:{ide}' for ide in ids] - new_ids = id2id(ids, column, 'KO (KEGGCharter)', 'ko', desc=base_desc % (len(ids), 'EC numbers', 'KOs')) - return pd.merge(data, new_ids, on=column, how='left') - - -def get_cross_references(data, kegg_column=None, ko_column=None, ec_column=None): - # KEGG ID to KO -> if KO column is not set, KEGGCharter will get them through the KEGG API - if kegg_column: - data = ids_interconversion(data, column=kegg_column, ids_type='kegg') - data = ids_interconversion(data, column='KO (KEGGCharter)', ids_type='ko') - if ko_column: - data = ids_interconversion(data, column=ko_column, ids_type='ko') - data = ids_interconversion(data, column='EC number (KEGGCharter)', ids_type='ec') - if ec_column: - data = ids_interconversion(data, column=ec_column, ids_type='ec') - data = ids_interconversion(data, column='KO (KEGGCharter)', ids_type='ko') - if not (kegg_column or ko_column or ec_column): - exit('Need to specify a column with either KEGG IDs, KOs or EC numbers!') - return data - - -def condense_data(data, main_column): - onlykos = data[data['KO (KEGGCharter)'].notnull() & (data['EC number (KEGGCharter)'].isnull())] - onlykos = onlykos.groupby(main_column).agg({'KO (KEGGCharter)': lambda x: ','.join(set(x))}).reset_index() - onlykos['EC number (KEGGCharter)'] = [np.nan] * len(onlykos) - wecs = data[data['EC number (KEGGCharter)'].notnull()] - wecs = wecs.groupby(main_column).agg({'KO (KEGGCharter)': lambda x: ','.join(set(x)), - 'EC number (KEGGCharter)': lambda x: ','.join(set(x))}).reset_index() - del data['KO (KEGGCharter)'] - del data['EC number (KEGGCharter)'] - newinfo = pd.concat([onlykos, wecs]) - return pd.merge(data, newinfo, on=main_column, how='left').drop_duplicates() - - -def prepare_data_for_charting(data, ko_column='KO (KEGGCharter)', ko_from_uniprot=False): - if ko_from_uniprot: - # If coming from UniProt ID mapping, KOs will be in the form "KXXXXX;" - data[ko_column] = data[ko_column].apply(lambda x: x.rstrip(';') if type(x) != float else x) - - # Expand KOs column if some elements are in the form KO1,KO2,... - data[ko_column] = [ko.split(',') if type(ko) != float else ko for ko in data[ko_column]] - nokos = data[data[ko_column].isnull()] - wkos = data[data[ko_column].notnull()] - wkos = expand_by_list_column(wkos, column=ko_column) - data = pd.concat([wkos, nokos]) - return data - - -# Get metabolic maps from KEGG Pathway -def KEGGCharter_prokaryotic_maps(file=sys.path[0] + '/KEGGCharter_prokaryotic_maps.txt'): - return open(file).read().split('\n') - - -def KEGG_metabolic_maps(): - """ - Creates a dic with all specific kegg pathway maps and their description - :return: pandas.DataFrame with Map ID as index and maps names as - sole column - """ - maps = pd.read_csv(StringIO(kegg_list("pathway").read()), sep='\t', names=['Map ID', 'Description']) - maps['Map ID'] = [ide.split('map')[-1] for ide in maps['Map ID']] - return maps - - -def add_blank_space(image_pil, width, height, image_mode='RGB'): - """ - Resizes an image with white background, keeping image size ratio - :param image_pil: PIL.Image - image to be resized - :param width: int - width of final image - :param height: int - heigth of final image - :param image_mode: str - image mode of image (RGBA, RGB, ...) - """ - ratio_w = width / image_pil.width - ratio_h = height / image_pil.height - if ratio_w < ratio_h: - # It must be fixed by width - resize_width = width - resize_height = round(ratio_w * image_pil.height) - else: - # Fixed by height - resize_width = round(ratio_h * image_pil.width) - resize_height = height - image_resize = image_pil.resize((resize_width, resize_height), - Image.ANTIALIAS) - background = Image.new('RGB', (width, height), (255, 255, 255, 255)) - offset = (round((width - resize_width) / 2), - round((height - resize_height) / 2)) - background.paste(image_resize, offset) - return background.convert(image_mode) - - -def resize_image(image_pil, ratio=None, width=None, height=None): - """ - Resizes an image with alteration to image size ratio - :param ratio: int - ratio of resize - how bigger or smaller will the output be? - :param image_pil: PIL.Image - image to be resized - :param width: int - width of final image - :param height: int - heigth of final image - """ - if ratio: - return image_pil.resize((image_pil.width * ratio, - image_pil.height * ratio), Image.ANTIALIAS) - elif width and height: - return image_pil.resize((width, height), Image.ANTIALIAS) - else: - return None - - -def pdf2png(pdf_filename): - """ - Converts a pdf file to a png file, RGB format - name changes, .pdf to .png - :param pdf_filename: str - filename of PDF file - """ - bash_command = f'pdftoppm {pdf_filename} {pdf_filename.split(".pdf")[0]} -png' - run(bash_command.split()) - os.rename(pdf_filename.replace('.pdf', '-1.png'), pdf_filename.replace('.pdf', '.png')) - - -def add_legend(kegg_map_file, legend_file, output): - """ - Merges the two files - KEGG metabolic map and respective legend - into one file - :param kegg_map_file: str - filename of PDF kegg metabolic map - :param legend_file: str - filename of PNG legend - """ - pdf2png(kegg_map_file) - imgs = [Image.open(file) for file in [kegg_map_file.replace('.pdf', '.png'), legend_file]] - imgs[0] = imgs[0].convert( - 'RGB') # KEGG Maps are converted to RGB by pdftoppm, dunno if converting to RGBA adds any transparency - imgs[1] = resize_image(imgs[1], ratio=5) - imgs[1] = add_blank_space(imgs[1], imgs[1].width, imgs[0].height) - imgs_comb = np.hstack([np.asarray(i) for i in imgs]) - - # save that beautiful picture - imgs_comb = Image.fromarray(imgs_comb) - imgs_comb.save(output) - for file in [kegg_map_file, kegg_map_file.replace('.pdf', '.png'), legend_file]: - os.remove(file) - - -def create_potential_legend(colors, labels, filename): - """ - Draws the color to taxa labels of genomic potential representations - :param colors: list - list of colors of the different taxa - :param labels: list - list of taxa corresponding to the colors - :param filename: string - filename to output - """ - f = lambda m, c: plt.plot([], [], marker=m, color=c, ls="none")[0] - handles = [f("s", color) for color in colors] - legend = plt.legend(handles, labels, loc=3, framealpha=1, frameon=True) - fig = legend.figure - fig.canvas.draw() - # The bbox manipulation removes the axis - bbox = legend.get_window_extent() - bbox = bbox.from_extents(*(bbox.extents + np.array([-2, -2, 2, 2]))) - bbox = bbox.transformed(fig.dpi_scale_trans.inverted()) - - fig.savefig(filename, dpi="figure", bbox_inches=bbox) - - -def genomic_potential_taxa(kegg_pathway_map, data, samples, dic_colors, ko_column, - taxa_column='Taxonomic lineage (GENUS)', output_basename=None): - """ - Represents the genomic potential of the dataset for a certain taxa level, - by coloring each taxon with a unique color - :param data: pandas.DataFrame with data already processed by KEGGPathway - :param samples: list of str column names of the dataset correspoding to - expression values - :param genera: list of genus to represent - :param number_of_taxa: int representing the number of diferent taxa to - be represented in the maps, in case the taxa are not specified - :param level_of_taxa: str - taxonomic level to represent - SPECIES, - SUPERKINGDOM, ... - :param output_basename: str - basename for map outputs - :param maxshared: int - maximum number of different taxa to represent - in a single map box - """ - # for every taxon, check all boxes it is in, and save that info to box2taxon - box2taxon = dict() - for taxon in dic_colors.keys(): - df = data[data[taxa_column] == taxon][samples + [ko_column]] - df = df[df.any(axis=1)] - for ortholog in df[ko_column]: - if ortholog in kegg_pathway_map.ko_boxes.keys(): - for box in kegg_pathway_map.ko_boxes[ortholog]: - if box in box2taxon.keys(): - if taxon not in box2taxon[box]: - box2taxon[box].append(taxon) - else: - box2taxon[box] = [taxon] - - # for every box with KOs identified from the most abundant taxa, sub-boxes are created with colours of the - # corresponding taxa - kegg_pathway_map.pathway_box_list(box2taxon, dic_colors) - - # boxes with KOs identified but not from the most abundant taxa are still identified - df = data[samples + [ko_column]] - df = df[df.any(axis=1)] - grey_boxes = list() - for ortholog in df[ko_column]: - if ortholog in kegg_pathway_map.ko_boxes.keys(): - for box in kegg_pathway_map.ko_boxes[ortholog]: - if box not in box2taxon.keys() and box not in grey_boxes: - grey_boxes.append(box) - kegg_pathway_map.grey_boxes(grey_boxes) - - name = kegg_pathway_map.pathway.name.split(':')[-1] - name_pdf = f'{output_basename}_{name}.pdf' - kegg_pathway_map.to_pdf(name_pdf) - - # a new taxon: "Other taxa" - if len(grey_boxes) > 0: - dic_colors["Other taxa"] = "#7c7272" - - # TODO - legend should be ajusted for the maps - otherwise, makes no sense to have one legend for each map - - # they all become the same, except for "Other taxa" - create_potential_legend(dic_colors.values(), dic_colors.keys(), - name_pdf.replace('.pdf', '_legend.png')) - - add_legend(name_pdf, name_pdf.replace('.pdf', '_legend.png'), - name_pdf.replace(name + '.pdf', kegg_pathway_map.pathway.title.replace('/', '|') + '.png')) - - -def differential_colorbar(dataframe, filename): - fig_size = (2, 3) - mpb = plt.pcolormesh(dataframe, cmap='coolwarm') - fig, ax = plt.subplots(figsize=fig_size) - plt.colorbar(mpb, ax=ax) - ax.remove() - plt.savefig(filename, bbox_inches='tight') - - -def differential_expression_sample(kegg_pathway_map, data, samples, ko_column, - output_basename=None, log=True): - """ - Represents in small heatmaps the expression levels of each sample on the - dataset present in the given pathway map. The values can be transford to - a log10 scale - :param data: pandas.DataFrame with data already processed by KEGGPathway - :param samples: list - column names of the dataset corresponding to - expression values - :param output_folder: string - name of folder to store pdfs - :param log: bol - convert the expression values to logarithmic scale? - """ - df = data.groupby(ko_column)[samples + [ko_column]].sum() - df = df[df.any(axis=1)] - df['Boxes'] = [ - kegg_pathway_map.ko_boxes[ko] if ko in kegg_pathway_map.ko_boxes.keys() else np.nan for ko in df.index] - df = df[df['Boxes'].notnull()] - df = expand_by_list_column(df, column='Boxes') - if len(df) == 0: - return 1 - df = df.groupby('Boxes')[samples].sum() - - kegg_pathway_map.pathway_boxes_differential(df, log) - - name = kegg_pathway_map.pathway.name.split(':')[-1] - name_pdf = f'{output_basename}_{name}.pdf' - kegg_pathway_map.to_pdf(name_pdf) - - differential_colorbar(df, name_pdf.replace(".pdf", '_legend.png')) - - add_legend(name_pdf, name_pdf.replace('.pdf', '_legend.png'), - name_pdf.replace(name + '.pdf', kegg_pathway_map.pathway.title.replace('/', '|') + '.png')) - - return 0 - - -def write_kgml(mmap, out_dir): - data = kegg_get("ko" + mmap, "kgml").read() - with open(f'{out_dir}/map{mmap}.xml', 'w') as f: - if len(data) > 1: - f.write(data) - - -def write_kgmls(mmaps, out_dir, max_tries=3): - print(f'[{len(mmaps)}] maps inputted') - maps_done = [filename.split('/map')[-1].rstrip('.xml') for filename in glob(f'{out_dir}/*.xml')] - print(f'[{len(maps_done)}] KGMLs already obtained') - mmaps = [map for map in mmaps if map not in maps_done] - i = 1 - for mmap in mmaps: - print(f'[{i}/{len(mmaps)}] Obtaining KGML for: {mmap}') - tries = 0 - done = False - while tries < max_tries and not done: - try: - write_kgml(mmap, out_dir) - done = True - print(f'[{mmap}]: success') - except: - if os.path.isfile(f'{out_dir}/map{mmap}.xml'): - os.remove(f'{out_dir}/map{mmap}.xml') - tries += 1 - print(f'[{mmap}]: failure') - i += 1 - - -def set_text_boxes_kgml(kgml_filename, desc=''): - handler = KGML_parser.read(open(kgml_filename)) - # Set text in boxes to EC numbers - with open(kgml_filename.replace('xml', 'csv'), 'w') as f: - for ortholog_rec in tqdm(handler.orthologs, desc=desc): - lines = list() - kos = ortholog_rec.name.split() - lines += kegg_link("enzyme", kos).read().split('\n') - ecs = [line.split('\t')[1] for line in lines if len(line) > 0] - f.write(f'{",".join(ecs)}\n') - - -def set_text_boxes_kgmls(mmaps, out_dir, max_tries=3): - maps_done = [filename.split('/map')[-1].rstrip('.csv') for filename in glob(f'{out_dir}/*.csv')] - print(f"[{len(maps_done)}] maps already have boxes' text set") - mmaps = [map for map in mmaps if map not in maps_done] - - i = 1 - for mmap in mmaps: - tries = 0 - done = False - while tries < max_tries and not done: - try: - set_text_boxes_kgml( - f'{out_dir}/map{mmap}.xml', desc=f'[{i}/{len(mmaps)}] Getting EC numbers for: {mmap}') - done = True - except: - print(f'\nFailed for map. Attempt: {tries + 1}') - if os.path.isfile(f'{out_dir}/map{mmap}.csv'): - os.remove(f'{out_dir}/map{mmap}.csv') - tries += 1 - sleep(10) - i += 1 - - -def chart_map(kgml_filename, ec_list, data, output=None, ko_column=None, taxa_column=None, dic_colors=None, - genomic_columns=None, transcriptomic_columns=None): - - if genomic_columns: # when not set is None - mmap = KGML_parser.read(open(kgml_filename)) - kegg_pathway_map = KEGGPathwayMap(pathway=mmap, ec_list=ec_list) - genomic_potential_taxa( - kegg_pathway_map, data, genomic_columns, dic_colors, ko_column, taxa_column=taxa_column, - output_basename=f'{output}/potential') - if transcriptomic_columns: # when not set is None - mmap = KGML_parser.read(open(kgml_filename)) - kegg_pathway_map = KEGGPathwayMap(pathway=mmap, ec_list=ec_list) - differential_expression_sample( - kegg_pathway_map, data, transcriptomic_columns, ko_column, output_basename=f'{output}/differential', - log=False) - plt.close() - - -def main(): - # Read input - args = get_arguments() - timed_message('Arguments valid.') - - if args.show_available_maps: - sys.exit(KEGG_metabolic_maps().to_string()) - data = read_input(args.file) - timed_message('Data successfully read.') - - if not args.resume: - data = get_cross_references( - data, kegg_column=args.kegg_column, ko_column=args.ko_column, ec_column=args.ec_column) - - main_column = (args.kegg_column if args.kegg_column is not None else - args.ko_column if args.ko_column is not None else args.ec_column) - data = condense_data(data, main_column) - - data.to_csv(f'{args.output}/KEGGCharter_results.tsv', sep='\t', index=False) - timed_message(f'Results saved to {args.output}/KEGGCharter_results.tsv') - - ko_column = 'KO (KEGGCharter)' # TODO - set ko_column to user defined value - - data = prepare_data_for_charting(data, ko_column=ko_column) - - if args.input_quantification: - data['Quantification (KEGGCharter)'] = [1] * len(data) - args.genomic_columns = 'Quantification (KEGGCharter)' - - if args.input_taxonomy: - data['Taxon (KEGGCharter)'] = [args.input_taxonomy] * len(data) - args.taxa_column = 'Taxon (KEGGCharter)' - args.taxa_list = args.input_taxonomy - - # Begin dat chart magic - metabolic_maps = args.metabolic_maps.split(',') - timed_message(f'Creating KEGG Pathway representations for {len(metabolic_maps)} metabolic pathways.') - write_kgmls(metabolic_maps, args.resources_directory) - set_text_boxes_kgmls(metabolic_maps, args.resources_directory) - - # Set colours for taxa if MG data is present - if args.genomic_columns: - args.genomic_columns = args.genomic_columns.split(',') - if args.taxa_list is None: - taxa = most_abundant_taxa(data, args.genomic_columns, args.taxa_column, - number_of_taxa=int(args.number_of_taxa)) - else: - taxa = args.taxa_list.split(',') - - colors = taxa_colors(ncolor=len(taxa)) - dic_colors = {taxa[i]: colors[i] for i in range(len(taxa))} - else: - dic_colors = None - - if args.transcriptomic_columns: - args.transcriptomic_columns = args.transcriptomic_columns.split(',') - - for i in range(len(metabolic_maps)): - pathway = KGML_parser.read(open(f'{args.resources_directory}/map{metabolic_maps[i]}.xml')) - - with open(f'{args.resources_directory}/map{metabolic_maps[i]}.csv') as f: - ec_list = f.read().split('\n') - - if len(pathway.orthologs) != len(ec_list) - 1: # -1 because of newline at the end - print(f'Number of ECs was different from the number of orthologs for map [{metabolic_maps[i]}]. ' - f'Will retrieve the right files now.') - - write_kgml(metabolic_maps[i], args.resources_directory) - set_text_boxes_kgml(f'{args.resources_directory}/map{metabolic_maps[i]}.xml') - - with open(f'{args.resources_directory}/map{metabolic_maps[i]}.csv') as f: - ec_list = f.read().split('\n') - - timed_message(f'[{i + 1}/{len(metabolic_maps)}] {pathway.title}') - chart_map(f'{args.resources_directory}/map{metabolic_maps[i]}.xml', ec_list, data, output=args.output, ko_column=ko_column, - taxa_column=args.taxa_column, dic_colors=dic_colors, - genomic_columns=args.genomic_columns, transcriptomic_columns=args.transcriptomic_columns) - - # TODO - implement multiprocessing for map generation? - ''' - with multiprocessing.Pool() as p: - p.starmap(chart_map, [(handler, data, args.output, ko_column, ec_column, args.taxa_column, dic_colors, - args.genomic_columns, args.transcriptomic_columns, - f'{args.output}/failed_maps.txt') for handler in pathways]) - ''' - - -if __name__ == '__main__': - start_time = time() - main() - print(f'KEGGCharter analysis finished in {strftime("%Hh%Mm%Ss", gmtime(time() - start_time))}') diff --git a/kegg_pathway_map.py b/kegg_pathway_map.py deleted file mode 100644 index 2dc78f8..0000000 --- a/kegg_pathway_map.py +++ /dev/null @@ -1,225 +0,0 @@ -#!/usr/bin/env python - -from Bio.KEGG.KGML import KGML_parser, KGML_pathway -from Bio.Graphics.KGML_vis import KGMLCanvas -from matplotlib import colors, cm -from time import sleep - - -def set_bgcolor(pathway_element, color): - """ - Sets graphic element background color - :param pathway_element: kegg pathway xml element object - :param color: color to be used in rgb - """ - pathway_element.graphics[0].bgcolor = color - - -def set_fgcolor(pathway_element, color): - """ - Sets graphic element contour color - :param pathway_element: kegg pathway xml element object - :param color: color to be used in rgb - """ - pathway_element.graphics[0].fgcolor = color - - -def conv_value_rgb(value, colormap, norm): - """ - Normalizes values in a vector and transforms it into corresponding - hex color using given colormap - :param value: numpy vector from dataframe apply function with expression - values - :param colormap: matplotlib colormap object - :param norm: matplotlib Normalize or LogNorm object - :return: returns hex colors in vector format - """ - return value.apply(norm).apply(colormap) - - -def conv_rgb_hex(rgb): - """ - converts rgb into hex color code in vector - :param rgb: rgb value in vector resulting from pandas dataframe apply - :return: vector with converted hex color code - """ - return rgb.apply(colors.to_hex) - - -def create_tile_box(record): - """ - Create box graphical element in pathway to draw the box countour and - give the correct name - :param record: graphical element to be created - """ - newrecord = KGML_pathway.Graphics(record) - newrecord.name = record.graphics[0].name - newrecord.type = "rectangle" - newrecord.width = record.graphics[0].width - newrecord.height = record.graphics[0].height - newrecord.y = record.graphics[0].y - newrecord.x = record.graphics[0].x - newrecord.bgcolor = "#FFFFFF00" - newrecord.fgcolor = "#000000" - record.graphics.append(newrecord) - record.graphics[0].bgcolor = "#FFFFFF00" - record.graphics[0].fgcolor = "#FFFFFF00" - record.graphics[0].name = "" - - -def create_box_heatmap(rec_old, nrboxes, i, paired=True): - """ - Helper function for creating heatmap, draws one expression value in its - correct position on the bigger parent box - :param rec_old: graphical element object to be used as reference - :param nrboxes: int nr of boxes to be drawed - :param i: int internal number of movements of the box given by the for loop - :param paired: boolean is number of samples a pair? - :return: graphical element object - """ - if rec_old.graphics[0].width is None: # it happens, guess it is because it has no KOs - return 1 - movement_steps = rec_old.graphics[0].width / (nrboxes * (2 if paired else 1)) - newrecord = KGML_pathway.Graphics(rec_old) - newrecord.name = "" - newrecord.type = "rectangle" - adjustment_factor = 1.3 if nrboxes > 2 else 1.1 if nrboxes > 1 else 1 # sub-boxes width, adjusted by a factor that experimentally fixed well in the representations - newrecord.width = movement_steps * adjustment_factor * (2 if paired else 1) - newrecord.height = rec_old.graphics[0].height - newrecord.y = rec_old.graphics[0].y - newrecord.x = (i * movement_steps) + rec_old.graphics[0].x - newrecord.fgcolor = "#FFFFFF00" - return newrecord - - -class KEGGPathwayMap: - """ - This class retrieves and manipulates KEGG metabolic maps from KEGG Pathway - """ - - def __init__(self, pathway, ec_list): - """ - Initialize object - :param data: pd.DataFrame - data from MOSCA analysis - :param kgml_file: (str) - KGML filename - """ - self.pathway = pathway - self.set_pathway(ec_list) - - def set_pathway(self, ec_list): - """ - Set pathway with Kegg Pathway ID - """ - self.ko_boxes = dict() - for i in range(len(self.pathway.orthologs)): - set_bgcolor(self.pathway.orthologs[i], "#ffffff") # set all boxes to white - # self.set_fgcolor(self.pathway.orthologs[i], "#ffffff") # This might be helpful in the future, if an additional layer of marking is needed - orthologs_in_box = [ide[3:] for ide in self.pathway.orthologs[ - i].name.split()] # 'ko:K16157 ko:K16158 ko:K16159' -> ['K16157', 'K16158', 'K16159'] - for ortholog in orthologs_in_box: - if ortholog not in self.ko_boxes.keys(): - self.ko_boxes[ortholog] = list() - self.ko_boxes[ortholog].append(i) # {'K16157':[0,13,432], 'K16158':[4,13,545]} - - # Set name as EC number - ecs = ec_list[i].split(',') - if len(ecs) > 0: - self.pathway.orthologs[i].graphics[0].name = max(set(ecs), key=ecs.count).upper() - else: - self.pathway.orthologs[i].graphics[0].name = orthologs_in_box[0] - - ############################################################################ - #### Operations #### - ############################################################################ - - def to_pdf(self, filename, imagemap=True, orthologs=True, compounds=True, maps=True, reactions=True): - """ - Prints current pathway to PDF file - :param filename: (str) - PDF filename - :param imagemap: (bol) - Print imagemap - :param orthologs: (bol) - Print orthologs - :param compounds: (bol) - Print compounds - :param maps: (bol) - Print maps - :param reactions: (bol) - Print reactions ??? - :return: creates PDF file with current pathway - """ - # TODO - check reactions parameter - drawn = False - while not drawn: - try: - KGMLCanvas(self.pathway, - import_imagemap=imagemap, - label_orthologs=orthologs, - label_compounds=compounds, - label_maps=maps, - label_reaction_entries=reactions).draw(filename) - drawn = True - except: - print('Draw failed! Waiting 10 secs...') - sleep(10) - - def pathway_box_list(self, taxa_in_box, dic_colors, maxshared=10): - """ - Represents items in the pathway map - :param taxa_in_box: dict - {box : list of taxa in box} - :param taxa: list - of taxa to be represented and given a specific color - :param maxshared: int - maximum number of taxa sharing one box - :param color: list of costum colors to be used to color the elements - """ - for box in taxa_in_box.keys(): - nrboxes = len(taxa_in_box[box]) - if nrboxes > maxshared: - nrboxes = maxshared - - paired = True if nrboxes % 2 == 0 else False - for i in range(nrboxes): - newrecord = create_box_heatmap( - self.pathway.orthologs[box], nrboxes, i * 2 - (nrboxes - 1) if paired else i - int(nrboxes / 2), # if nrboxes = 8, i * 2 - (nrboxes - 1) = -7,-5,-3,-1,1,3,5,7 # if nrboxes = 9, i - int(nrboxes / 2) = -4,-3,-2,-1,0,1,2,3,4 - paired=paired) - if newrecord != 1: - newrecord.bgcolor = dic_colors[taxa_in_box[box][i]] - self.pathway.orthologs[box].graphics.append(newrecord) - if self.pathway.orthologs[box].graphics[0].width is not None: # TODO - should check more deeply why sometimes width is None - create_tile_box(self.pathway.orthologs[box]) - - def pathway_boxes_differential(self, dataframe, log=True, colormap="coolwarm"): - """ - Represents expression values present in a dataframe in the - pathway map - :param dataframe: pandas DataFrame with each column representing a sample - and index corresponding to int list index of the ortholog element in the - pathway - :param log: bol providing the option for a log transformation of data - :param colormap: str representing a costum matplotlib colormap to be used - """ - - if log: - norm = cm.colors.LogNorm(vmin=dataframe.min().min(), vmax=dataframe.max().max()) - else: - norm = cm.colors.Normalize(vmin=dataframe.min().min(), vmax=dataframe.max().max()) - - colormap = cm.get_cmap(colormap) - dataframe = dataframe.apply(conv_value_rgb, args=(colormap, norm)) # TODO - Doesn't work if using log - dataframe = dataframe.apply(conv_rgb_hex) - - dataframe = dataframe[dataframe.columns.tolist()] - - nrboxes = len(dataframe.columns.tolist()) # number of samples - - for box in dataframe.index.tolist(): - colors = dataframe.loc[box].tolist() - paired = True if nrboxes % 2 == 0 else False - for i in range(nrboxes): - newrecord = create_box_heatmap( - self.pathway.orthologs[box], nrboxes, i * 2 - (nrboxes - 1) if paired else i - int(nrboxes / 2), # if nrboxes = 8, i * 2 - (nrboxes - 1) = -7,-5,-3,-1,1,3,5,7; if nrboxes = 9, i - int(nrboxes / 2) = -4,-3,-2,-1,0,1,2,3,4 - paired=paired) - if newrecord != 1: # TODO - assess why sometimes get 1 - newrecord.bgcolor = colors[i] - self.pathway.orthologs[box].graphics.append(newrecord) - if self.pathway.orthologs[box].graphics[0].width is not None: # TODO - should check more deeply why sometimes width is None - create_tile_box(self.pathway.orthologs[box]) - - def grey_boxes(self, box_list): - for i in box_list: - set_bgcolor(self.pathway.orthologs[i], "#7c7272") - set_fgcolor(self.pathway.orthologs[i], "#7c7272") diff --git a/keggcharter.py b/keggcharter.py new file mode 100644 index 0000000..94b44f3 --- /dev/null +++ b/keggcharter.py @@ -0,0 +1,467 @@ +#!/usr/bin/env python + +from argparse import ArgumentParser +import numpy as np +import os +import pandas as pd +from pathlib import Path +from subprocess import run +import sys +from io import StringIO +from time import time, gmtime, strftime, sleep +from Bio.KEGG.REST import kegg_link, kegg_list, kegg_get +from Bio.KEGG.KGML import KGML_parser +from matplotlib import pyplot as plt +from tqdm import tqdm +from glob import glob +import json + +from keggpathway_map import KEGGPathwayMap, expand_by_list_column + +__version__ = "0.3.0" + + +def get_arguments(): + parser = ArgumentParser( + description="""KEGGCharter - A tool for representing genomic potential and transcriptomic expression into + KEGG pathways""", + epilog="Input file must be specified.") + parser.add_argument("-o", "--output", type=str, help="Output directory", + default='KEGGCharter_results') + parser.add_argument("-rd", "--resources-directory", type=str, default=sys.path[0], + help="Directory for storing KGML and CSV files.") + parser.add_argument("-mm", "--metabolic-maps", type=str, + help="IDs of metabolic maps to output", + default=','.join(keggcharter_prokaryotic_maps())) + parser.add_argument("-gcol", "--genomic-columns", type=str, + help="Names of columns with genomic identification") + parser.add_argument("-tcol", "--transcriptomic-columns", type=str, + help="Names of columns with transcriptomics quantification") + parser.add_argument("-tls", "--taxa-list", type=str, + help="List of taxa to represent in genomic potential charts (comma separated)") # TODO - must be tested + parser.add_argument("-not", "--number-of-taxa", type=str, default='10', + help="Number of taxa to represent in genomic potential charts (comma separated)") + parser.add_argument("-keggc", "--kegg-column", type=str, help="Column with KEGG IDs.") + parser.add_argument("-koc", "--ko-column", type=str, help="Column with KOs.") + parser.add_argument("-ecc", "--ec-column", type=str, help="Column with EC numbers.") + parser.add_argument("-iq", "--input-quantification", action="store_true", + help="If input table has no quantification, will create a mock quantification column") + parser.add_argument("-it", "--input-taxonomy", type=str, + help="If no taxonomy column exists and there is only one taxon in question.") + # TODO - test this argument without UniProt shenanigans + parser.add_argument("-tc", "--taxa-column", type=str, default='Taxonomic lineage (GENUS)', + help="Column with the taxa designations to represent with KEGGCharter") + parser.add_argument("--resume", action="store_true", default=False, + help="If data inputed has already been analyzed by KEGGCharter.") + parser.add_argument('-v', '--version', action='version', version='KEGGCharter ' + __version__) + + required_named = parser.add_argument_group('required named arguments') + required_named.add_argument("-f", "--file", type=str, required=True, + help="TSV or EXCEL table with information to chart") + + special_functions = parser.add_argument_group('Special functions') + special_functions.add_argument( + "--show-available-maps", action="store_true", default=False, + help="Outputs KEGG maps IDs and descriptions to the console (so you may pick the ones you want!)") + + args = parser.parse_args() + + args.output = args.output.rstrip('/') + + for directory in [args.output]: + if not os.path.isdir(directory): + Path(directory).mkdir(parents=True, exist_ok=True) + print('Created ' + directory) + + return args + + +def timed_message(message): + print(strftime("%Y-%m-%d %H:%M:%S", gmtime()) + ': ' + message) + + +def run_command(bash_command, print_command=True, stdout=None, stderr=None): + if print_command: + print(bash_command) + run(bash_command.split(), stdout=stdout, stderr=stderr) + + +def download_organism(directory): + if not os.path.isfile(f"{directory}/organism"): + run_command(f'wget http://rest.kegg.jp/list/organism -P {directory}') + + +def read_input_file(file): + if file.endswith('.xlsx'): + return pd.read_excel(file) + return pd.read_csv(file, sep='\t') + + +def further_information(data, kegg_column=None, ko_column=None, ec_column=None): + data = get_cross_references(data, kegg_column=kegg_column, ko_column=ko_column, ec_column=ec_column) + main_column = kegg_column if kegg_column is not None else ko_column if ko_column is not None else ec_column + data = condense_data(data, main_column) + return data, main_column + + +# Conversion functions +def id2id(input_ids, column, output_column, output_ids_type, step=150, desc=''): + """ + Converts KEGG_ID genes to Ortholog KO ID from KEGG + :param input_ids: (list) - IDs to convert + :param column: (str) - name of column with IDs to convert (used for merging DFs) + :param output_column: (str) - name of column to return + :param output_ids_type: (str) - database to convert IDs to + :param step: (int) - will convert "step" KEGG IDs at a time + :param desc: (str) - string to output to tqdm progressbar + :return: (list) - (list,list) - KEGG ID genes converted and ko IDs + """ + result = pd.DataFrame(columns=[column, output_column]) + for i in tqdm(range(0, len(input_ids), step), desc=desc): + j = min(i + step, len(input_ids)) + try: + result = pd.concat([result, pd.read_csv( + kegg_link(output_ids_type, input_ids[i:j]), sep='\t', names=[column, output_column])]) + except: + print(f'IDs conversion broke at index: {i}') + if output_ids_type == 'ko': + result[output_column] = result[output_column].apply(lambda x: x.strip('ko:')) + result[column] = result[column].apply(lambda x: x.strip('ec:')) + elif output_ids_type == 'enzyme': + result[column] = result[column].apply(lambda x: x.strip('ko:')) + result[output_column] = result[output_column].apply(lambda x: x.strip('ec:')) + return result + + +def ids_interconversion(data, column, ids_type='kegg'): + ids = list(set(data[data[column].notnull()][column])) + base_desc = 'Converting %i %s to %s through the KEGG API' + if ids_type == 'kegg': + # sometimes Kegg IDs come as mfc:BRM9_0145;mfi:DSM1535_1468; (e.g. from UniProt IDs mapping). + # Should be no problem since both IDs likely will always represent same functions + trimmed_ids = [ide.split(';')[0] for ide in ids] + relational = pd.DataFrame([ids, trimmed_ids]).transpose() + new_ids = id2id(trimmed_ids, column, 'KO (KEGGCharter)', 'ko', desc=base_desc % (len(ids), 'KEGG IDs', 'KOs')) + new_ids = pd.merge(new_ids, relational, left_on=column, right_on=1) # mcj:MCON_3003; mcj:MCON_3003 + del new_ids[column] # mcj:MCON_3003 K07486 mcj:MCON_3003; mcj:MCON_3003 + del new_ids[1] + new_ids.columns = ['KO (KEGGCharter)', column] + elif ids_type == 'ko': + new_ids = id2id( + ids, column, 'EC number (KEGGCharter)', 'enzyme', desc=base_desc % (len(ids), 'KOs', 'EC numbers')) + else: # ids_type == 'ec': + ids = [f'ec:{ide}' for ide in ids] + new_ids = id2id(ids, column, 'KO (KEGGCharter)', 'ko', desc=base_desc % (len(ids), 'EC numbers', 'KOs')) + return pd.merge(data, new_ids, on=column, how='left') + + +def get_cross_references(data, kegg_column=None, ko_column=None, ec_column=None): + # KEGG ID to KO -> if KO column is not set, KEGGCharter will get them through the KEGG API + if kegg_column: + data = ids_interconversion(data, column=kegg_column, ids_type='kegg') + data = ids_interconversion(data, column='KO (KEGGCharter)', ids_type='ko') + if ko_column: + data = ids_interconversion(data, column=ko_column, ids_type='ko') + data = ids_interconversion(data, column='EC number (KEGGCharter)', ids_type='ec') + if ec_column: + data = ids_interconversion(data, column=ec_column, ids_type='ec') + data = ids_interconversion(data, column='KO (KEGGCharter)', ids_type='ko') + if not (kegg_column or ko_column or ec_column): + exit('Need to specify a column with either KEGG IDs, KOs or EC numbers!') + return data + + +def condense_data(data, main_column): + onlykos = data[data['KO (KEGGCharter)'].notnull() & (data['EC number (KEGGCharter)'].isnull())] + onlykos = onlykos.groupby(main_column).agg({'KO (KEGGCharter)': lambda x: ','.join(set(x))}).reset_index() + onlykos['EC number (KEGGCharter)'] = [np.nan] * len(onlykos) + wecs = data[data['EC number (KEGGCharter)'].notnull()] + wecs = wecs.groupby(main_column).agg({'KO (KEGGCharter)': lambda x: ','.join(set(x)), + 'EC number (KEGGCharter)': lambda x: ','.join(set(x))}).reset_index() + del data['KO (KEGGCharter)'] + del data['EC number (KEGGCharter)'] + return pd.merge(data, pd.concat([onlykos, wecs]), on=main_column, how='left').drop_duplicates() + + +def prepare_data_for_charting(data, ko_column='KO (KEGGCharter)', ko_from_uniprot=False): + nokos = data[data[ko_column].isnull()] + wkos = data[data[ko_column].notnull()].reset_index(drop=True) + if ko_from_uniprot: + # If coming from UniProt ID mapping, KOs will be in the form "KXXXXX;" + wkos[ko_column] = wkos[ko_column].apply(lambda x: x.rstrip(';')) + + # Expand KOs column if some elements are in the form KO1,KO2,... + wkos[ko_column] = wkos[ko_column].apply(lambda x: x.split(',')) + wkos = expand_by_list_column(wkos, column=ko_column) + data = pd.concat([wkos, nokos]) + return data + + +# Get metabolic maps from KEGG Pathway +def keggcharter_prokaryotic_maps(file=f'{sys.path[0]}/KEGGCharter_prokaryotic_maps.txt'): + return open(file).read().split('\n') + + +def kegg_metabolic_maps(): + """ + Creates a dic with all specific kegg pathway maps and their description + :return: pandas.DataFrame with Map ID as index and maps names as + sole column + """ + maps = pd.read_csv(StringIO(kegg_list("pathway").read()), sep='\t', names=['Map ID', 'Description']) + maps['Map ID'] = [ide.split('map')[-1] for ide in maps['Map ID']] + return maps + + +def write_kgml(mmap, output, organism='ko'): + data = kegg_get(f"{organism}{mmap}", "kgml").read() + with open(output, 'w') as f: + if len(data) > 1: + f.write(data) + return KGML_parser.read(data) + return None + + +def write_kgmls(mmaps, out_dir, max_tries=3, org='ko'): + print(f'[{len(mmaps)}] maps inputted for org [{org}]') + maps_done = [filename.split('/')[-1].rstrip('.xml').lstrip(org) for filename in glob(f'{out_dir}/{org}*.xml')] + mmap_to_orthologs = { + name: [orth.id for orth in KGML_parser.read(open(f'{out_dir}/{org}{name}.xml')).orthologs] + for name in maps_done} # maps already done will have their orthologs put in + print(f'[{len(maps_done)}] KGMLs already obtained for org [{org}]') + mmaps = [map for map in mmaps if map not in maps_done] + i = 1 + for mmap in mmaps: + print(f'[{i}/{len(mmaps)}] Obtaining KGML for map [{org}{mmap}]') + tries = 0 + done = False + while tries < max_tries and not done: + # try: + orthologs = [orth.id for orth in write_kgml(mmap, f'{out_dir}/{org}{mmap}.xml', organism=org).orthologs] + mmap_to_orthologs[mmap] = orthologs + done = True + print(f'[{org}{mmap}]: success') + ''' + except: + if os.path.isfile(f'{out_dir}/{org}{mmap}.xml'): + os.remove(f'{out_dir}/{org}{mmap}.xml') + tries += 1 + print(f'[{org}{mmap}]: failure') + ''' + i += 1 + return mmap_to_orthologs + + +def set_text_boxes_kgml(kgml_filename, desc=''): + handler = KGML_parser.read(open(kgml_filename)) + # Set text in boxes to EC numbers + with open(kgml_filename.replace('xml', 'csv'), 'w') as f: + for ortholog_rec in tqdm(handler.orthologs, desc=desc): + lines = list() + kos = ortholog_rec.name.split() + lines += kegg_link("enzyme", kos).read().split('\n') + ecs = [line.split('\t')[1] for line in lines if len(line) > 0] + if len(ecs) > 0: + f.write(f'{",".join(ecs)}\n') + else: + f.write(f'{",".join(kos)}\n') + + +# TODO - deprecated? +def set_text_boxes_kgmls(mmaps, out_dir, max_tries=3, org='ko'): + maps_done = [filename.split('/')[-1].rstrip('.csv') for filename in glob(f'{out_dir}/{org}*.csv')] + print(f"[{len(maps_done)}] maps already have boxes' text set") + mmaps = [map for map in mmaps if map not in maps_done] + i = 1 + for mmap in mmaps: + tries = 0 + done = False + while tries < max_tries and not done: + try: + set_text_boxes_kgml( + f'{out_dir}/{org}{mmap}.xml', desc=f"[{i}/{len(mmaps)}] Getting boxes' labels for map [{mmap}]") + done = True + except: + print(f'Failed for map. Attempt: {tries + 1}') + if os.path.isfile(f'{out_dir}/map{mmap}.csv'): + os.remove(f'{out_dir}/map{mmap}.csv') + tries += 1 + sleep(10) + i += 1 + + +def taxon2prefix(taxon_name, organism_df): + """ + :param taxon_name: str - e.g. Pongo abelii (Sumatran orangutan) + :param organism_df: pandas.DataFrame - organism file, index = taxa names, column names = KEGG prefix, ... + :returns str - KEGG prefix of taxon name + """ + if taxon_name == 'ko': + return 'ko' + if taxon_name in organism_df.index: # exactly the same + return organism_df.loc[taxon_name][0] + if taxon_name.split(' (')[0] in organism_df.index: # Homo sapiens (human) -> Homo sapiens + return organism_df.loc[taxon_name.split(' (')[0]][0] + df = organism_df.reset_index() + possible_prefixes = df[df.name.str.contains(taxon_name)].prefix.tolist() + if len(possible_prefixes) > 0: + return df[df.name.str.contains(taxon_name)].prefix.tolist()[0] # select the first strain + print(f'[{taxon_name}] was not found in taxon to KEGG prefix conversion!') + return None + + +def get_taxon_maps(kegg_prefix): + if kegg_prefix is None: + return [] + df = pd.read_csv(StringIO(kegg_list("pathway", kegg_prefix).read()), sep='\t', header=None) + return df[0].apply(lambda x: x.split(kegg_prefix)[1]).tolist() + + +def parse_organism(file): + return pd.read_csv(file, sep='\t', usecols=[1, 2], header=None, index_col=1, names=['prefix', 'name']) + + +def download_resources(data, resources_directory, taxa_column, metabolic_maps): + download_organism(resources_directory) + taxa = ['ko'] + list(set(data[data[taxa_column].notnull()][taxa_column])) + taxa_df = parse_organism(f'{resources_directory}/organism') + i = 1 + taxon_to_mmap_to_orthologs = dict() # {'Keratinibaculum paraultunense' : {'00190': ['1', '2']}} + for taxon in taxa: + print(f'[{i}/{len(taxa)}] Getting information for taxon [{taxon}]') + kegg_prefix = taxon2prefix(taxon, taxa_df) + if kegg_prefix is not None: + taxon_mmaps = get_taxon_maps(kegg_prefix) + taxon_mmaps = [mmap for mmap in taxon_mmaps if mmap in metabolic_maps] # select only inputted maps + taxon_to_mmap_to_orthologs[taxon] = write_kgmls( + taxon_mmaps, resources_directory, org=kegg_prefix) + else: + taxon_to_mmap_to_orthologs[taxon] = dict() + i += 1 + with open(f'{resources_directory}/taxon_to_mmap_to_orthologs.json', 'w') as f: + json.dump(taxon_to_mmap_to_orthologs, f) + return taxon_to_mmap_to_orthologs + + +def get_mmaps2taxa(taxon_to_mmap_to_orthologs): + mmaps2taxa = dict() + for org, mmaps2orthologs in taxon_to_mmap_to_orthologs.items(): + for mmap in mmaps2orthologs.keys(): + if mmap in mmaps2taxa.keys(): + mmaps2taxa[mmap].append(org) + else: + mmaps2taxa[mmap] = [org] + return mmaps2taxa + + +def chart_map(kgml_filename, ec_list, data, taxon_to_mmap_to_orthologs, mmaps2taxa, output=None, ko_column=None, + taxa_column=None, + genomic_columns=None, transcriptomic_columns=None, mmap2taxa=None): + if genomic_columns: # when not set is None + mmap = KGML_parser.read(open(kgml_filename)) + kegg_pathway_map = KEGGPathwayMap(pathway=mmap, ec_list=ec_list) + kegg_pathway_map.genomic_potential_taxa( + data, genomic_columns, ko_column, taxon_to_mmap_to_orthologs, mmaps2taxa, taxa_column=taxa_column, + output_basename=f'{output}/potential') + if transcriptomic_columns: # when not set is None + mmap = KGML_parser.read(open(kgml_filename)) + kegg_pathway_map = KEGGPathwayMap(pathway=mmap, ec_list=ec_list) + kegg_pathway_map.differential_expression_sample( + data, transcriptomic_columns, ko_column, taxon_to_mmap_to_orthologs, mmaps2taxa, + output_basename=f'{output}/differential', + log=False, taxa_column=taxa_column) + plt.close() + + +def get_pathway_and_ec_list(rd, mmap): + download = True + if os.path.isfile(f'{rd}/ko{mmap}.xml') and os.path.isfile(f'{rd}/ko{mmap}.csv'): + pathway = KGML_parser.read(open(f'{rd}/ko{mmap}.xml')) + with open(f'{rd}/ko{mmap}.csv') as f: + ec_list = f.read().split('\n') + if len(pathway.orthologs) == len(ec_list) - 1: # -1 because of newline at the end + download = False + else: + print(f'Lengths of orthologs in KGML and labels for corresponding boxes do not match for map [ko{mmap}]!') + else: + print(f'Some resources were not found for map [ko{mmap}]! Going to download them') + if download: + try: + write_kgml(mmap, f'{rd}/ko{mmap}.xml') + print(f'Got KGML for map [ko{mmap}]') + set_text_boxes_kgml(f'{rd}/ko{mmap}.xml', desc=f"Getting boxes' labels for map [ko{mmap}]") + pathway = KGML_parser.read(open(f'{rd}/ko{mmap}.xml')) + with open(f'{rd}/ko{mmap}.csv') as f: + ec_list = f.read().split('\n') + except: # may be 404 not found, but also connection timed out, this way everything works + print(f'Could not download resources for [ko{mmap}]!') + return None, None + return pathway, ec_list + + +def read_input(): + args = get_arguments() + timed_message('Arguments valid.') + if args.show_available_maps: + sys.exit(kegg_metabolic_maps().to_string()) + data = read_input_file(args.file) + timed_message('Data successfully read.') + return args, data + + +def main(): + args, data = read_input() + + if not args.resume: + data, main_column = further_information( + data, kegg_column=args.kegg_column, ko_column=args.ko_column, ec_column=args.ec_column) + data.to_csv(f'{args.output}/KEGGCharter_results.tsv', sep='\t', index=False) + timed_message(f'Results saved to {args.output}/KEGGCharter_results.tsv') + + ko_column = 'KO (KEGGCharter)' # TODO - set ko_column to user defined value + data = prepare_data_for_charting(data, ko_column=ko_column) + + if args.input_quantification: + data['Quantification (KEGGCharter)'] = [1] * len(data) + args.genomic_columns = 'Quantification (KEGGCharter)' + + if args.input_taxonomy: + data['Taxon (KEGGCharter)'] = [args.input_taxonomy] * len(data) + args.taxa_column = 'Taxon (KEGGCharter)' + args.taxa_list = args.input_taxonomy + + metabolic_maps = args.metabolic_maps.split(',') + args.genomic_columns = args.genomic_columns.split(',') + if args.transcriptomic_columns: + args.transcriptomic_columns = args.transcriptomic_columns.split(',') + + taxon_to_mmap_to_orthologs = download_resources(data, args.resources_directory, args.taxa_column, metabolic_maps) + mmaps2taxa = get_mmaps2taxa(taxon_to_mmap_to_orthologs) # '00190': ['Keratinibaculum paraultunense'] + timed_message(f'Creating KEGG Pathway representations for {len(metabolic_maps)} metabolic pathways.') + for i in range(len(metabolic_maps)): + pathway, ec_list = get_pathway_and_ec_list(args.resources_directory, metabolic_maps[i]) + if pathway is not None and ec_list is not None and metabolic_maps[i] in mmaps2taxa: + timed_message(f'[{i + 1}/{len(metabolic_maps)}] {pathway.title}') + #pathway_handler = KEGGPathwayMap(pathway, ec_list) + chart_map( + f'{args.resources_directory}/ko{metabolic_maps[i]}.xml', ec_list, data, taxon_to_mmap_to_orthologs, + mmaps2taxa, output=args.output, + ko_column=ko_column, taxa_column=args.taxa_column, + genomic_columns=args.genomic_columns, transcriptomic_columns=args.transcriptomic_columns) + else: + print(f'Analysis of map {metabolic_maps[i]} failed!') + i += 1 + + # TODO - implement multiprocessing for map generation? + ''' + with multiprocessing.Pool() as p: + p.starmap(chart_map, [(handler, data, args.output, ko_column, ec_column, args.taxa_column, dic_colors, + args.genomic_columns, args.transcriptomic_columns, + f'{args.output}/failed_maps.txt') for handler in pathways]) + ''' + + +if __name__ == '__main__': + start_time = time() + main() + print(f'KEGGCharter analysis finished in {strftime("%Hh%Mm%Ss", gmtime(time() - start_time))}') diff --git a/keggpathway_map.py b/keggpathway_map.py new file mode 100644 index 0000000..d401ab2 --- /dev/null +++ b/keggpathway_map.py @@ -0,0 +1,480 @@ +#!/usr/bin/env python + +from Bio.KEGG.KGML import KGML_pathway +from Bio.Graphics.KGML_vis import KGMLCanvas +from PIL import Image +import numpy as np +import os +from subprocess import run +from matplotlib import pyplot as plt, colors, cm +import pandas as pd +from re import search +import sys + + +def set_bgcolor(pathway_element, color): + """ + Sets graphic element background color + :param pathway_element: kegg pathway xml element object + :param color: color to be used in rgb + """ + pathway_element.graphics[0].bgcolor = color + + +def set_fgcolor(pathway_element, color): + """ + Sets graphic element contour color + :param pathway_element: kegg pathway xml element object + :param color: color to be used in rgb + """ + pathway_element.graphics[0].fgcolor = color + + +def conv_value_rgb(value, colormap, norm): + """ + Normalizes values in a vector and transforms it into corresponding + hex color using given colormap + :param value: numpy vector from dataframe apply function with expression + values + :param colormap: matplotlib colormap object + :param norm: matplotlib Normalize or LogNorm object + :return: returns hex colors in vector format + """ + return value.apply(norm).apply(colormap) + + +def conv_rgb_hex(rgb): + """ + converts rgb into hex color code in vector + :param rgb: rgb value in vector resulting from pandas dataframe apply + :return: vector with converted hex color code + """ + return rgb.apply(colors.to_hex) + + +def create_tile_box(record): + """ + Create box graphical element in pathway to draw the box countour and + give the correct name + :param record: graphical element to be created + """ + newrecord = KGML_pathway.Graphics(record) + newrecord.name = record.graphics[0].name + newrecord.type = "rectangle" + newrecord.width = record.graphics[0].width + newrecord.height = record.graphics[0].height + newrecord.y = record.graphics[0].y + newrecord.x = record.graphics[0].x + newrecord.bgcolor = "#FFFFFF00" + newrecord.fgcolor = "#000000" + record.graphics.append(newrecord) + record.graphics[0].bgcolor = "#FFFFFF00" + record.graphics[0].fgcolor = "#FFFFFF00" + record.graphics[0].name = "" + + +def create_box_heatmap(rec_old, nrboxes, i, paired=True): + """ + Helper function for creating heatmap, draws one expression value in its + correct position on the bigger parent box + :param rec_old: graphical element object to be used as reference + :param nrboxes: int nr of boxes to be drawed + :param i: int internal number of movements of the box given by the for loop + :param paired: boolean is number of samples a pair? + :return: graphical element object + """ + if rec_old.graphics[0].width is None: # it happens, guess it is because it has no KOs + return 1 + movement_steps = rec_old.graphics[0].width / (nrboxes * (2 if paired else 1)) + newrecord = KGML_pathway.Graphics(rec_old) + newrecord.name = "" + newrecord.type = "rectangle" + adjustment_factor = 1.3 if nrboxes > 2 else 1.1 if nrboxes > 1 else 1 # sub-boxes width, adjusted by a factor that experimentally fixed well in the representations + newrecord.width = movement_steps * adjustment_factor * (2 if paired else 1) + newrecord.height = rec_old.graphics[0].height + newrecord.y = rec_old.graphics[0].y + newrecord.x = (i * movement_steps) + rec_old.graphics[0].x + newrecord.fgcolor = "#FFFFFF00" + return newrecord + + +def pdf2png(pdf_filename): + """ + Converts a pdf file to a png file, RGB format - name changes, .pdf to .png + :param pdf_filename: str - filename of PDF file + """ + bash_command = f'pdftoppm {pdf_filename} {pdf_filename.split(".pdf")[0]} -png' + run(bash_command.split()) + os.rename(pdf_filename.replace('.pdf', '-1.png'), pdf_filename.replace('.pdf', '.png')) + + +def resize_image(image_pil, ratio=None, width=None, height=None): + """ + Resizes an image with alteration to image size ratio + :param ratio: int - ratio of resize - how bigger or smaller will the output be? + :param image_pil: PIL.Image - image to be resized + :param width: int - width of final image + :param height: int - heigth of final image + """ + if ratio: + return image_pil.resize((image_pil.width * ratio, + image_pil.height * ratio), Image.ANTIALIAS) + elif width and height: + return image_pil.resize((width, height), Image.ANTIALIAS) + else: + return None + + +def add_blank_space(image_pil, width, height, image_mode='RGB'): + """ + Resizes an image with white background, keeping image size ratio + :param image_pil: PIL.Image - image to be resized + :param width: int - width of final image + :param height: int - heigth of final image + :param image_mode: str - image mode of image (RGBA, RGB, ...) + """ + ratio_w = width / image_pil.width + ratio_h = height / image_pil.height + if ratio_w < ratio_h: + # It must be fixed by width + resize_width = width + resize_height = round(ratio_w * image_pil.height) + else: + # Fixed by height + resize_width = round(ratio_h * image_pil.width) + resize_height = height + image_resize = image_pil.resize((resize_width, resize_height), + Image.ANTIALIAS) + background = Image.new('RGB', (width, height), (255, 255, 255, 255)) + offset = (round((width - resize_width) / 2), + round((height - resize_height) / 2)) + background.paste(image_resize, offset) + return background.convert(image_mode) + + +def expand_by_list_column(df, column='Pathway'): + if len(df) == 0: + return pd.DataFrame() + lens = [len(item) for item in df[column]] + dictionary = dict() + for col in df.columns: + dictionary[col] = np.repeat(df[col].values, lens) + dictionary[column] = np.concatenate(df[column].values) + return pd.DataFrame(dictionary) + + +def taxa_colors(hex_values=None, ncolor=1): + """ + Creates list of hex colors to be used, using matplotlib or using custom colors + :param hex_values: list of hex colors + :param ncolor: int indicating the ammount of hex colors that should be created + :return: returns list with hex color codes + """ + if not hex_values: # if no colors are given creates a list of discrete hex colors + color_scheme = ( + cm.get_cmap('Pastel2', 8) if ncolor <= 8 + else cm.get_cmap("Set3", 12) if ncolor <= 12 + else cm.get_cmap("rainbow", ncolor)) # if ncolor > 12 a continuous colormap is used instead + return [colors.to_hex(color_scheme(i)) for i in range(ncolor)] + + for hex_value in hex_values: + if not search(r'^#(?:[0-9a-fA-F]{3}){1,2}$', hex_value): + sys.exit(Exception("Colors aren't valid hex codes")) + return hex_values # has validated hex values and returns the original list + + +class KEGGPathwayMap: + """ + This class retrieves and manipulates KEGG metabolic maps from KEGG Pathway + """ + + def __init__(self, pathway, ec_list): + """ + Initialize object + :param pathway: (Bio.KEGG.KGML.KGML_pathway.Pathway) + :param ec_list: (list) - list from ECs CSV + """ + self.pathway = pathway + self.set_pathway(ec_list) + + def __getattr__(self, item): + m = getattr(self.pathway, item, None) + if m is None: + raise AttributeError + return m + + def set_pathway(self, ec_list): + """ + Set pathway with Kegg Pathway ID + """ + self.ko_boxes = dict() + for i in range(len(self.orthologs)): + set_bgcolor(self.orthologs[i], "#ffffff") # set all boxes to white + # self.set_fgcolor(self.pathway.orthologs[i], "#ffffff") # This might be helpful in the future, if an additional layer of marking is needed + orthologs_in_box = [ide[3:] for ide in self.orthologs[ + i].name.split()] # 'ko:K16157 ko:K16158 ko:K16159' -> ['K16157', 'K16158', 'K16159'] + for ortholog in orthologs_in_box: + if ortholog not in self.ko_boxes.keys(): + self.ko_boxes[ortholog] = list() + self.ko_boxes[ortholog].append(i) # {'K16157':[0,13,432], 'K16158':[4,13,545]} + + # Set name as EC number + ecs = ec_list[i].split(',') + if len(ecs) > 0: + self.orthologs[i].graphics[0].name = max(set(ecs), key=ecs.count).upper() + else: + self.orthologs[i].graphics[0].name = orthologs_in_box[0] + + ############################################################################ + #### Operations #### + ############################################################################ + + def to_pdf(self, filename, imagemap=True, orthologs=True, compounds=True, maps=True, reactions=True): + """ + Prints current pathway to PDF file + :param filename: (str) - PDF filename + :param imagemap: (bol) - Print imagemap + :param orthologs: (bol) - Print orthologs + :param compounds: (bol) - Print compounds + :param maps: (bol) - Print maps + :param reactions: (bol) - Print reactions ??? + :return: creates PDF file with current pathway + """ + # TODO - check reactions parameter + try: + KGMLCanvas(self, + import_imagemap=imagemap, + label_orthologs=orthologs, + label_compounds=compounds, + label_maps=maps, + label_reaction_entries=reactions).draw(filename) + except PermissionError: + exit(f'Was forbidden to write to {filename}. Check your permissions for that folder/filename') + + def pathway_box_list(self, taxa_in_box, dic_colors, maxshared=10): + """ + Represents items in the pathway map + :param taxa_in_box: dict - {box : list of taxa in box} + :param dic_colors: list - of colors to assign to taxa + :param maxshared: int - maximum number of taxa sharing one box + """ + for box in taxa_in_box.keys(): + nrboxes = len(taxa_in_box[box]) + if nrboxes > maxshared: + nrboxes = maxshared + + paired = True if nrboxes % 2 == 0 else False + for i in range(nrboxes): + newrecord = create_box_heatmap( + self.orthologs[box], nrboxes, i * 2 - (nrboxes - 1) if paired else i - int(nrboxes / 2), # if nrboxes = 8, i * 2 - (nrboxes - 1) = -7,-5,-3,-1,1,3,5,7 # if nrboxes = 9, i - int(nrboxes / 2) = -4,-3,-2,-1,0,1,2,3,4 + paired=paired) + if newrecord != 1: + newrecord.bgcolor = dic_colors[taxa_in_box[box][i]] + self.orthologs[box].graphics.append(newrecord) + if self.orthologs[box].graphics[0].width is not None: # TODO - should check more deeply why sometimes width is None + create_tile_box(self.orthologs[box]) + + def pathway_boxes_differential(self, dataframe, log=True, colormap="coolwarm"): + """ + Represents expression values present in a dataframe in the + pathway map + :param dataframe: pandas DataFrame with each column representing a sample + and index corresponding to int list index of the ortholog element in the + pathway + :param log: bol providing the option for a log transformation of data + :param colormap: str representing a costum matplotlib colormap to be used + """ + + if log: + norm = cm.colors.LogNorm(vmin=dataframe.min().min(), vmax=dataframe.max().max()) + else: + norm = cm.colors.Normalize(vmin=dataframe.min().min(), vmax=dataframe.max().max()) + + colormap = cm.get_cmap(colormap) + dataframe = dataframe.apply(conv_value_rgb, args=(colormap, norm)) # TODO - Doesn't work if using log + dataframe = dataframe.apply(conv_rgb_hex) + + dataframe = dataframe[dataframe.columns.tolist()] + + nrboxes = len(dataframe.columns.tolist()) # number of samples + + for box in dataframe.index.tolist(): + colors = dataframe.loc[box].tolist() + paired = True if nrboxes % 2 == 0 else False + for i in range(nrboxes): + newrecord = create_box_heatmap( + self.orthologs[box], nrboxes, i * 2 - (nrboxes - 1) if paired else i - int(nrboxes / 2), # if nrboxes = 8, i * 2 - (nrboxes - 1) = -7,-5,-3,-1,1,3,5,7; if nrboxes = 9, i - int(nrboxes / 2) = -4,-3,-2,-1,0,1,2,3,4 + paired=paired) + if newrecord != 1: # TODO - assess why sometimes get 1 + newrecord.bgcolor = colors[i] + self.orthologs[box].graphics.append(newrecord) + if self.orthologs[box].graphics[0].width is not None: # TODO - should check more deeply why sometimes width is None + create_tile_box(self.orthologs[box]) + + def grey_boxes(self, box_list): + for i in box_list: + set_bgcolor(self.orthologs[i], "#7c7272") + set_fgcolor(self.orthologs[i], "#7c7272") + + def create_potential_legend(self, colors, labels, filename): + """ + Draws the color to taxa labels of genomic potential representations + :param colors: list - list of colors of the different taxa + :param labels: list - list of taxa corresponding to the colors + :param filename: string - filename to output + """ + handles = [plt.plot([], [], marker="s", color=color, ls="none")[0] for color in colors] + legend = plt.legend(handles, labels, loc=3, framealpha=1, frameon=True) + fig = legend.figure + fig.canvas.draw() + # The bbox manipulation removes the axis + bbox = legend.get_window_extent() + bbox = bbox.from_extents(*(bbox.extents + np.array([-2, -2, 2, 2]))) + bbox = bbox.transformed(fig.dpi_scale_trans.inverted()) + + fig.savefig(filename, dpi="figure", bbox_inches=bbox) + + def most_abundant_taxa(self, data, columns, taxa_column, number_of_taxa=10): + """ + Calculates top genus from samples + :param data: data + :param columns: list of mg columns to consider for quantification of genus abundance + :param taxa_column: da column with da taxa + :param number_of_taxa: number of top genus to return + :return: list of top genus + """ + data = data.groupby(taxa_column)[columns].sum().sum(axis=1).sort_values(ascending=False) + if number_of_taxa > len(data.index.tolist()): + number_of_taxa = len(data.index.tolist()) + return data.index.tolist()[:number_of_taxa] + + def genomic_potential_taxa( + self, data, samples, ko_column, taxon_to_mmap_to_orthologs, mmaps2taxa, + taxa_column='Taxonomic lineage (GENUS)', output_basename=None, + number_of_taxa=10): + """ + Represents the genomic potential of the dataset for a certain taxa level, + by coloring each taxon with a unique color + :param data: pandas.DataFrame with data already processed by KEGGPathway + :param samples: list of str column names of the dataset correspoding to + expression values + :param mmap2taxa: dict - of taxa to color + :param ko_column: str - column with KOs + :param taxa_column: str - column with taxonomic classification + :param output_basename: str - basename for map outputs + :param number_of_taxa: int - number of most abundant taxa to represent in each map + """ + # for every taxon, check all boxes it is in, and save that info to box2taxon + box2taxon = dict() + data = data[data[taxa_column].isin(mmaps2taxa[self.name.split('ko')[1]])] + taxa = self.most_abundant_taxa(data, samples, taxa_column, number_of_taxa=number_of_taxa) + taxonomy_colors = taxa_colors(ncolor=len(taxa)) + dic_colors = {taxa[i]: taxonomy_colors[i] for i in range(len(taxa))} + for taxon in dic_colors.keys(): + df = data[data[taxa_column] == taxon][samples + [ko_column]] + df = df[df.any(axis=1)] + for ortholog in df[ko_column]: + if ortholog in self.ko_boxes.keys(): + for box in self.ko_boxes[ortholog]: + if box in box2taxon.keys() and box in \ + taxon_to_mmap_to_orthologs[taxon][self.name.split('ko')[1]]: + if taxon not in box2taxon[box]: + box2taxon[box].append(taxon) + else: + box2taxon[box] = [taxon] + + # for every box with KOs identified from the most abundant taxa, sub-boxes are created with colours of the + # corresponding taxa + self.pathway_box_list(box2taxon, dic_colors) + # boxes with KOs identified but not from the most abundant taxa are still identified + df = data[samples + [ko_column]] + df = df[df.any(axis=1)] + grey_boxes = list() + for ortholog in df[ko_column]: + if ortholog in self.ko_boxes.keys(): + for box in self.ko_boxes[ortholog]: + if box not in box2taxon.keys() and box not in grey_boxes: + grey_boxes.append(box) + self.grey_boxes(grey_boxes) + + name = self.name.split(':')[-1] + name_pdf = f'{output_basename}_{name}.pdf' + self.to_pdf(name_pdf) + + # a new taxon: "Other taxa" + if len(grey_boxes) > 0: + dic_colors["Other taxa"] = "#7c7272" + + # TODO - legend should be ajusted for the maps - otherwise, makes no sense to have one legend for each map - + # they all become the same, except for "Other taxa" + self.create_potential_legend( + dic_colors.values(), dic_colors.keys(), name_pdf.replace('.pdf', '_legend.png')) + + self.add_legend( + name_pdf, name_pdf.replace('.pdf', '_legend.png'), + name_pdf.replace(name + '.pdf', self.title.replace('/', '|') + '.png')) + + def differential_colorbar(self, dataframe, filename): + fig_size = (2, 3) + mpb = plt.pcolormesh(dataframe, cmap='coolwarm') + fig, ax = plt.subplots(figsize=fig_size) + plt.colorbar(mpb, ax=ax) + ax.remove() + plt.savefig(filename, bbox_inches='tight') + + def differential_expression_sample( + self, data, samples, ko_column, taxon_to_mmap_to_orthologs, mmaps2taxa, + taxa_column='Taxonomic lineage (GENUS)', output_basename=None, log=True): + """ + Represents in small heatmaps the expression levels of each sample on the + dataset present in the given pathway map. The values can be transford to + a log10 scale + :param data: pandas.DataFrame with data already processed by KEGGPathway + :param samples: list - column names of the dataset corresponding to expression values + :param ko_column: str - column with KOs to represent + :param output_basename: string - basename of outputs + :param log: bol - convert the expression values to logarithmic scale? + """ + data = data[data[taxa_column].isin(mmaps2taxa[self.name.split('ko')[1]])] + df = data.groupby(ko_column)[samples + [ko_column]].sum() + df = df[df.any(axis=1)] + df['Boxes'] = [self.ko_boxes[ko] if ko in self.ko_boxes.keys() else np.nan for ko in df.index] + df = df[df['Boxes'].notnull()] + df = expand_by_list_column(df, column='Boxes') + if len(df) == 0: + return 1 + df = df.groupby('Boxes')[samples].sum() + + self.pathway_boxes_differential(df, log) + + name = self.name.split(':')[-1] + name_pdf = f'{output_basename}_{name}.pdf' + self.to_pdf(name_pdf) + + self.differential_colorbar(df, name_pdf.replace(".pdf", '_legend.png')) + + self.add_legend( + name_pdf, name_pdf.replace('.pdf', '_legend.png'), + name_pdf.replace(name + '.pdf', self.title.replace('/', '|') + '.png')) + + return 0 + + def add_legend(self, kegg_map_file, legend_file, output): + """ + Merges the two files - KEGG metabolic map and respective legend - into one file + :param kegg_map_file: str - filename of PDF kegg metabolic map + :param legend_file: str - filename of PNG legend + """ + pdf2png(kegg_map_file) + imgs = [Image.open(file) for file in [kegg_map_file.replace('.pdf', '.png'), legend_file]] + imgs[0] = imgs[0].convert( + 'RGB') # KEGG Maps are converted to RGB by pdftoppm, dunno if converting to RGBA adds any transparency + imgs[1] = resize_image(imgs[1], ratio=5) + imgs[1] = add_blank_space(imgs[1], imgs[1].width, imgs[0].height) + imgs_comb = np.hstack([np.asarray(i) for i in imgs]) + + # save that beautiful picture + imgs_comb = Image.fromarray(imgs_comb) + imgs_comb.save(output) + for file in [kegg_map_file, kegg_map_file.replace('.pdf', '.png'), legend_file]: + os.remove(file) diff --git a/meta.yaml b/meta.yaml index ec8dd41..295e042 100644 --- a/meta.yaml +++ b/meta.yaml @@ -18,8 +18,8 @@ build: mkdir -p "${dir}" cp *.py KEGGCharter_prokaryotic_maps.txt "${dir}/" mkdir -p "${PREFIX}/bin" - chmod +x "${dir}/kegg_charter.py" - ln -s "${dir}/kegg_charter.py" "${PREFIX}/bin/" + chmod +x "${dir}/keggcharter.py" + ln -s "${dir}/keggcharter.py" "${PREFIX}/bin/" requirements: run: @@ -34,7 +34,7 @@ requirements: test: commands: - - kegg_charter.py -v + - keggcharter.py -v about: home: https://github.com/iquasere/KEGGCharter