From 49c94897b0b5dbd3333cd73df78b1cf35761da04 Mon Sep 17 00:00:00 2001
From: Randy Creasi
Date: Thu, 9 Jan 2025 09:51:52 -0500
Subject: [PATCH] remove bad file

you can tell it's bad because of the -bad extension
---
 .../utils.py-bad                              | 232 ------------------
 1 file changed, 232 deletions(-)
 delete mode 100644 data-prep-pipeline/data_prep_pipeline/get_release_model_and_model_condition_files/utils.py-bad

diff --git a/data-prep-pipeline/data_prep_pipeline/get_release_model_and_model_condition_files/utils.py-bad b/data-prep-pipeline/data_prep_pipeline/get_release_model_and_model_condition_files/utils.py-bad
deleted file mode 100644
index e97f4052..00000000
--- a/data-prep-pipeline/data_prep_pipeline/get_release_model_and_model_condition_files/utils.py-bad
+++ /dev/null
@@ -1,232 +0,0 @@
-from gumbo_rest_client import Client
-from taigapy import create_taiga_client_v3
-import pandas as pd
-from .files_to_check_models_to_exclude import quarterly_files_to_check
-import io
-import yaml
-import re
-from tqdm import tqdm
-
-
-def get_second_part_of_string(string: str) -> str:
-    """
-    This function takes a string and returns the second part of the string if there is a dot, otherwise returns the whole string.
-
-    Args:
-        string (str): String
-
-    Returns:
-        str: Second part of the string if there is a dot, otherwise returns the whole string.
-    """
-    if "." in string:
-        return string.split(".")[1]
-    else:
-        return string
-
-
-def filter_empty_strings(list_with_empty_strings: list) -> list:
-    """
-    This function takes a list and returns a new list with the empty strings removed.
-
-    Args:
-        list_with_empty_strings (list): List with empty strings
-
-    Returns:
-        list: List with empty strings removed
-    """
-    return [item for item in list_with_empty_strings if item]
-
-
-def multi_join(gumbo_client : Client, primary_df: pd.DataFrame, join_instructions: list) -> pd.DataFrame:
-    """
-    Perform multiple joins on a DataFrame given a list of join instructions.
-
-    Args:
-        primary_df (pd.DataFrame): Primary DataFrame
-        join_instructions (list): List of join instructions. Each instruction should be a
-            list with the following format:
-                [df_to_join, join_field, index_field]
-
-    Returns:
-        pd.DataFrame: Resulting DataFrame after performing multiple joins
-    """
-
-    result_df = primary_df
-    for instruction in join_instructions:
-        assert (
-            len(instruction.split(", ")) % 3 == 0
-        ), "Join instruction must have 3 parts in [df_to_join, join_field, index_field] format"
-        secondary_df_name = instruction.split(", ")[0]
-        joined_on = instruction.split(", ")[1]
-        index_field = instruction.split(", ")[2]
-        secondary_df = gumbo_client.get(secondary_df_name)
-        result_df = result_df.join(
-            secondary_df.set_index(index_field),
-            on=joined_on,
-            how="left",
-            lsuffix=(secondary_df_name + "."),
-        )
-
-    return result_df
-
-
-def gumbo_df_preprocessing(
-    gumbo_client: Client,
-    data_dictionary_table_df: pd.DataFrame, gumbo_table_df
-) -> pd.DataFrame:
-    """
-    This function takes a DataFrame containing the data dictionary for a single table and returns a DataFrame containing the data from the Gumbo table.
-
-    Args:
-        data_dictionary_table_df (pd.DataFrame): DataFrame containing the data dictionary for a single table
-
-    Returns:
-        pd.DataFrame: DataFrame containing the filtered data from the Gumbo table
-    """
-    # Get the column names from the 'gumbo_column_name' column and remove the first part of the string if there is a dot
-    gumbo_column_names = data_dictionary_table_df["gumbo_column_name"].tolist()
-    gumbo_column_names = [get_second_part_of_string(s) for s in gumbo_column_names]
-
-    # Get the column names from the 'release_column_name' column
-    release_column_names = data_dictionary_table_df["release_column_name"].tolist()
-
-    # Get the join instructions from the 'joined_on' column and filter out empty strings
-    join_instructions = data_dictionary_table_df["joined_on"].tolist()
-    join_instructions = filter_empty_strings(join_instructions)
-
-    # Perform multiple joins on a DataFrame given a list of join instructions
-    df = multi_join(gumbo_client, gumbo_table_df, join_instructions)
-
-    missing_columns = set(gumbo_column_names) - set(df.columns)
-    assert (
-        len(missing_columns) == 0
-    ), f"Joined dataframe has missing columns: {', '.join(missing_columns)}"
-
-    # Filter out the columns that are not in the 'gumbo_column_names' list
-    df = df[gumbo_column_names]
-
-    # Rename the columns to release_column_names
-    df.columns = release_column_names
-
-    return df
-
-
-def get_model_ids(quarterly_release_dataset_id: str) -> set():
-    """
-    Get the model ids from the quarterly datasets
-    Args:
-        quarterly_release_dataset_id (str): Dataset id of the quarterly release
-    Returns:
-        set: Set of model ids
-    """
-
-    taiga_client_v3 = create_taiga_client_v3()
-
-    taiga_ids_to_check = {
-        f"{quarterly_release_dataset_id}/{f}": column
-        for f, column in quarterly_files_to_check.items()
-    }
-
-    model_ids = set()
-    for taiga_id, column in tqdm(
-        taiga_ids_to_check.items(), desc="Processing Taiga IDs", unit="id"
-    ):
-        print("taiga_id", taiga_id)
-        df = taiga_client_v3.get(taiga_id)
-        if re.match("ACH-[0-9]*", str(df.index[0])):
-            model_ids.update(set(df.index))
-        elif re.match("ACH-[0-9]*", str(df.columns[0])):
-            model_ids.update(set(df.columns))
-        else:
-            assert column in df.columns, f"Missing {column} in {df.columns}"
-            model_ids.update(set(df[column]))
-
-    return model_ids
-
-
-def get_model_condition_ids(quarterly_release_dataset_id: str) -> set():
-    """
-    Get the model condition ids from the quarterly release dataset
-
-    Args:
-        quarterly_release_dataset_id (str): Dataset id of the quarterly release
-
-    Returns:
-        set: Set of model condition ids
-    """
-
-    taiga_client_v3 = create_taiga_client_v3()
-
-    omics_profiles_taiga_id = f"{quarterly_release_dataset_id}/OmicsProfiles"
-    screen_sequence_map_taiga_id = f"{quarterly_release_dataset_id}/ScreenSequenceMap"
-
-    print(f"Importing omics_profiles_taiga_id: {omics_profiles_taiga_id}")
-    omics_profiles = taiga_client_v3.get(omics_profiles_taiga_id)
-
-    print(f"Importing screen_sequence_map_taiga_id: {screen_sequence_map_taiga_id}")
-    screen_sequence_map = taiga_client_v3.get(screen_sequence_map_taiga_id)
-
-    model_condition_ids = set(omics_profiles["ModelCondition"]) | set(
-        screen_sequence_map["ModelConditionID"]
-    )
-    return model_condition_ids
-
-
-fields_to_merge = ["files"]
-def _merge_downloads_yaml(a, b):
-    dest = dict(a)
-    collisions = []
-    for k, v in b.items():
-        if k in fields_to_merge:
-            dest[k] = dest.get(k, []) + v
-        else:
-            if k in dest:
-                collisions.append(k)
-            dest[k] = v
-    assert len(collisions) == 0, f"The following fields collided: {collisions}"
-    return dest
-import os.path
-
-def _read_downloads_yaml(yaml_filename):
-    with open(yaml_filename, "rt") as fd:
-        yaml_content = yaml.safe_load(fd)
-
-    parent_dir = os.path.dirname(yaml_filename)
-    for filename in yaml_content.get("include", []):
-        parsed = _read_downloads_yaml(os.path.join(parent_dir, filename))
-        yaml_content = _merge_downloads_yaml(yaml_content, parsed)
-    return yaml_content
-
-def _update_description(yaml_filename: str, filename: str, description: str):
-    yaml_content = _read_downloads_yaml(yaml_filename)
-
-    files_by_name = {file["name"]: file for file in yaml_content["files"]}
-    files_by_name[filename]["description"] = description
-
-    with open(yaml_filename, "wt") as fd:
-        yaml.dump(yaml_content, fd)
-
-
-def update_readme_content(
-    yaml_filename: str, data_dictionary_df: pd.DataFrame, filename: str, prologue: str
-) -> None:
-    """
-    This function takes a DataFrame containing the data dictionary and updates README yaml.
-
-    Args:
-        yaml_filename: path to YAML file to update
-        data_dictionary_df (pd.DataFrame): DataFrame containing the data dictionary
-        filename: Name of the file whose description is being updated
-    """
-    # Get the column names from the 'release_column_name' column
-    column_names = data_dictionary_df["release_column_name"].tolist()
-
-    # Get the column descriptions from the 'column_description' column
-    column_description = data_dictionary_df["column_description"].tolist()
-
-    buffer = io.StringIO()
-    buffer.write(prologue + "\n\n")
-    for name, desc in zip(column_names, column_description):
-        buffer.write(f"- {name}: {desc}\n\n")
-
-    _update_description(yaml_filename, filename, buffer.getvalue())