Commit 547da24: WIP

NilsWinter committed Nov 22, 2024 (1 parent: aac9fd7)

Showing 12 changed files with 339 additions and 65 deletions.
4 changes: 3 additions & 1 deletion .gitignore
@@ -162,4 +162,6 @@ cython_debug/
docs

examples/tmp
-examples/simulated_data
+examples/simulated_data
+
+poetry.lock
42 changes: 40 additions & 2 deletions cpm/reporting/plots/plots.py
@@ -8,6 +8,44 @@


def bar_plot(df, selected_metric, results_folder):
cmap = sns.color_palette("muted", n_colors=3)
flatui = [ "#ba6523", "#31559e", "#cc3727"]
cmap = sns.color_palette(flatui)

# Dropdown for metric selection

fig = sns.FacetGrid(df[df['model'].isin(['connectome', 'covariates', 'residuals', 'full'])],
row='model', col='network', hue='network',
aspect=3, height=2 / 2.54,
col_order = ["positive", "negative", "both"],
row_order=['covariates', 'connectome','full', 'residuals'],
sharex='col', sharey=False, palette=cmap)

# then we add the densities kdeplots
fig.map(sns.boxplot, selected_metric, "model")

#fig.map(sns.kdeplot, selected_metric,
# bw_adjust=1, clip_on=False,
# fill=True, alpha=1, linewidth=1)

#fig.map(sns.violinplot, selected_metric, "model", inner = None, split = True, hue=1, hue_order=[1, 2],
# density_norm = 'count', dodge = True, fill = True)
#fig.map(sns.boxplot, selected_metric, "model")#, dodge=True, hue=1, hue_order=[2, 1])

fig.set_titles(template='')
fig.set(ylabel=None)
for ax,m in zip(fig.axes[0,:], ["positive", "negative", "both"]):
ax.set_title(m, fontweight='bold', fontsize=8)
for ax,l in zip(fig.axes[:,0], ['covariates', 'connectome','full', 'residuals']):
ax.set_ylabel(l, fontweight='bold', fontsize=8, rotation=0, ha='right', va='center')
fig.set(yticks=[])

fig.fig.subplots_adjust(hspace=0.2, wspace=0.15)
if selected_metric == 'pearson_score' or selected_metric == 'spearman_score':
for ax in fig.axes.ravel():
ax.set_xlim(-0.5, 1)
fig.map(plt.axvline, x=0, color='black', linewidth=0.5, zorder=-1)
"""
cmap = sns.color_palette("muted", n_colors=4)
# Dropdown for metric selection
@@ -35,9 +73,9 @@ def bar_plot(df, selected_metric, results_folder):
plt.tight_layout()
if selected_metric == 'pearson_score' or selected_metric == 'spearman_score':
axes[1].set_ylim(-0.5, 1)

"""
plot_name = os.path.join(results_folder, "plots", 'point_plot.png')
-    sns.despine(offset=1, trim=True)
+    #sns.despine(offset=1, trim=True)
fig.savefig(plot_name, dpi=300)
return plot_name, fig

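For orientation, here is a minimal, self-contained sketch of the long-form data that bar_plot expects (columns model, network, and a metric such as pearson_score) and a simplified version of its model-by-network box layout. The dummy DataFrame and the use of catplot are illustrative assumptions, not the exact FacetGrid construction from the diff above.

# Illustrative only: fake cv_results-style data with the columns bar_plot relies on.
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

rng = np.random.default_rng(42)
models = ['covariates', 'connectome', 'full', 'residuals']
networks = ['positive', 'negative', 'both']
df = pd.DataFrame([{'model': m, 'network': n, 'pearson_score': rng.normal(0.3, 0.1)}
                   for m in models for n in networks for _ in range(50)])

# Simplified stand-in for the FacetGrid above: one column per network,
# one box per model, shared metric axis.
grid = sns.catplot(data=df, x='pearson_score', y='model', col='network',
                   col_order=networks, order=models, kind='box',
                   height=2.5, aspect=1.2)
grid.set_titles(col_template='{col_name}')
plt.show()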
8 changes: 7 additions & 1 deletion cpm/reporting/report.py
@@ -17,8 +17,14 @@ def main():
#st.session_state['results_directory'] = '/spm-data/vault-data3/mmll/projects/cpm_macs_example/haushaltsnetto'
#st.session_state['results_directory'] = '/spm-data/vault-data3/mmll/projects/cpm_macs_example/haushaltsnetto_partial'
#st.session_state['results_directory'] = '/spm-data/vault-data3/mmll/projects/cpm_macs_example/haushaltsnetto_05_fdr'
#st.session_state[
# 'results_directory'] = '/home/nwinter/PycharmProjects/cpm_python/examples/tmp/simulated_data_no_no_link/'
#st.session_state[
# 'results_directory'] = '/home/nwinter/PycharmProjects/cpm_python/examples/tmp/macs_fsozu_partial/'
#st.session_state[
# 'results_directory'] = '/home/nwinter/PycharmProjects/cpm_python/examples/tmp/macs_age_partial/'
st.session_state[
-        'results_directory'] = '/home/nwinter/PycharmProjects/cpm_python/examples/tmp/simulated_data_no_no_link/'
+        'results_directory'] = '/home/nwinter/PycharmProjects/cpm_python/examples/tmp/macs_IQ_partial/'

st.session_state['df'] = pd.read_csv(os.path.join(st.session_state.results_directory, 'cv_results.csv'))
st.session_state['df_main'] = load_results_from_folder(st.session_state.results_directory, 'cv_results_mean_std.csv')
2 changes: 1 addition & 1 deletion cpm/reporting/results.py
@@ -23,4 +23,4 @@

with st.container(border=False):
#st.plotly_chart(fig, use_container_width=True)
-    st.image(plot_name)
+    st.image(plot_name, width=600)
8 changes: 4 additions & 4 deletions cpm/simulate_data.py
@@ -109,15 +109,15 @@ def simulate_regression_data_2(n_samples: int = 500,
y = np.copy(y_rand)

if link_type == 'no_link':
-        z = 1 * y_rand + z_rand
+        z = 0.4 * y_rand + z_rand
X[:, :n_informative_features] = x_rand[:, :n_informative_features] + z
elif link_type == 'no_no_link':
-        z = 0.8 * y_rand + z_rand
+        z = 0.4 * y_rand + z_rand
elif link_type == 'direct_link':
-        z = 0.8 * y_rand + z_rand
+        z = 0.4 * y_rand + z_rand
X[:, :n_informative_features] = x_rand[:, :n_informative_features] + (0.2 * (noise * y_rand) + y_rand) + z
elif link_type == 'weak_link':
-        z = 0.3 * y_rand + z_rand
+        z = 0.4 * y_rand + z_rand
X[:, :n_informative_features] = x_rand[:, :n_informative_features] + (0.8 * (noise * y_rand) + y_rand) + z
else:
raise NotImplemented(f'Link type {link_type} not implemented')
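To illustrate what unifying the confound coupling at 0.4 does, here is a hedged, single-feature sketch of the four link types. Variable names mirror simulate_regression_data_2, but the standard-normal generators, the single feature, and the multiplicative noise term are assumptions made for this example only.

import numpy as np

rng = np.random.default_rng(0)
n = 5000
y_rand = rng.normal(size=n)   # outcome signal
z_rand = rng.normal(size=n)   # confound-specific noise
x_rand = rng.normal(size=n)   # one "edge" feature, for illustration
noise = rng.normal(size=n)    # assumed stand-in for the generator's noise term

z = 0.4 * y_rand + z_rand     # same confound-outcome coupling in every scenario after this commit

links = {
    'no_link':     x_rand + z,                                      # edge driven by the confound only
    'no_no_link':  x_rand,                                          # edge independent of outcome and confound
    'direct_link': x_rand + (0.2 * (noise * y_rand) + y_rand) + z,  # edge tracks the outcome, little extra noise
    'weak_link':   x_rand + (0.8 * (noise * y_rand) + y_rand) + z,  # same structure, more multiplicative noise
}
for name, x in links.items():
    print(f"{name:12s} corr(edge, y) = {np.corrcoef(x, y_rand)[0, 1]:+.2f}, "
          f"corr(edge, confound) = {np.corrcoef(x, z)[0, 1]:+.2f}")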
31 changes: 17 additions & 14 deletions examples/example_simulated_data2.py
@@ -5,25 +5,28 @@
from cpm.edge_selection import PThreshold, UnivariateEdgeSelection


-link_types = [#'no_link',
-              #'no_no_link',
+link_types = ['no_link',
+              'no_no_link',
'direct_link',
'weak_link'
]
+edge_statistics = ['pearson', 'pearson_partial']
+results_folder = '/spm-data/vault-data3/mmll/projects/confound_corrected_cpm/results'

for link in link_types:
-    X, y, covariates = simulate_regression_data_2(n_features=1225, n_informative_features=50, link_type=link)
+    for edge_statistic in edge_statistics:
+        X, y, covariates = simulate_regression_data_2(n_features=1225, n_informative_features=50, link_type=link)

-    univariate_edge_selection = UnivariateEdgeSelection(edge_statistic=['pearson'],
-                                                        edge_selection=[PThreshold(threshold=[0.05],
-                                                                                   correction=[None])])
-    cpm = CPMRegression(results_directory=f'./tmp/simulated_data_{link}',
-                        cv=RepeatedKFold(n_splits=10, n_repeats=5, random_state=42),
-                        edge_selection=univariate_edge_selection,
-                        #cv_edge_selection=ShuffleSplit(n_splits=1, test_size=0.2, random_state=42),
-                        add_edge_filter=True,
-                        n_permutations=2)
-    cpm.estimate(X=X, y=y, covariates=covariates)
+        univariate_edge_selection = UnivariateEdgeSelection(edge_statistic=[edge_statistic],
+                                                            edge_selection=[PThreshold(threshold=[0.05],
+                                                                                       correction=['fdr_by'])])
+        cpm = CPMRegression(results_directory=f'{results_folder}/simulated_data_{link}_{edge_statistic}',
+                            cv=RepeatedKFold(n_splits=10, n_repeats=5, random_state=42),
+                            edge_selection=univariate_edge_selection,
+                            #cv_edge_selection=ShuffleSplit(n_splits=1, test_size=0.2, random_state=42),
+                            add_edge_filter=True,
+                            n_permutations=2)
+        cpm.estimate(X=X, y=y, covariates=covariates)

-#cpm._calculate_permutation_results('./tmp/example_simulated_data2')
+#cpm._calculate_permutation_results('./tmp/example_simulated_data2')

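For reference, the rewritten loop should leave one results directory per link type and edge statistic, each containing the cv_results.csv that examples/plot_simulation_results.py reads back in. A small sketch of the expected layout, assuming the runs complete:

import os

results_folder = '/spm-data/vault-data3/mmll/projects/confound_corrected_cpm/results'
link_types = ['no_link', 'no_no_link', 'direct_link', 'weak_link']
edge_statistics = ['pearson', 'pearson_partial']

# Eight expected output folders written by CPMRegression in the loop above.
for link in link_types:
    for edge_statistic in edge_statistics:
        print(os.path.join(results_folder, f'simulated_data_{link}_{edge_statistic}', 'cv_results.csv'))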
49 changes: 49 additions & 0 deletions examples/human_connectome_project.py
@@ -0,0 +1,49 @@
import os
import pandas as pd
import numpy as np

# Define the folder containing the .txt files
folder_path = ('/spm-data/vault-data3/mmll/data/HCP/100_nodes_times_series/3T_HCP1200_MSMAll_d100_ts2')

# Initialize a list to store correlation matrices
correlation_matrices = []

# Initialize a list to store subject IDs
subject_ids = []

file_list = [f for f in os.listdir(folder_path) if f.endswith('.txt')]

# Sort the filenames by subject ID (assuming subject IDs are in the filename before .txt)
file_list.sort() # This will sort alphabetically, which works if subject IDs are formatted consistently

# Loop through all files in the specified folder
for filename in file_list:
print(filename)
# Construct the full file path
file_path = os.path.join(folder_path, filename)

# Read the text file into a DataFrame
df = pd.read_csv(file_path, sep='\s+', header=None)

# Calculate the correlation matrix
correlation_matrix = df.corr()

# Append the correlation matrix to the list
correlation_matrices.append(correlation_matrix.values)

# Extract subject ID from the filename (remove the .txt extension)
subject_id = filename[:-4] # Exclude the last 4 characters ('.txt')
subject_ids.append(subject_id)

# Convert the list of correlation matrices into a 3D numpy array
correlation_array = np.array(correlation_matrices)

# Create a DataFrame for subject IDs
subject_ids_df = pd.DataFrame(subject_ids, columns=['Subject_ID'])

# Save the numpy array and DataFrame to disk
np.save('/spm-data/vault-data3/mmll/data/HCP/100_nodes_times_series/functional_connectivity.npy', correlation_array) # Save as .npy file
subject_ids_df.to_csv('/spm-data/vault-data3/mmll/data/HCP/100_nodes_times_series/subject_ids.csv', index=False) # Save as .csv file


print()
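A hedged follow-up sketch that is not part of the commit: the simulated examples use 1225 = 50 * 49 / 2 features, which corresponds to vectorized upper-triangle edges, so a plausible next step is to load the saved array and flatten each 100 x 100 matrix the same way. Paths are taken from the script above; the shapes in the comments are expectations, not verified output.

import os
import numpy as np
import pandas as pd

base = '/spm-data/vault-data3/mmll/data/HCP/100_nodes_times_series'
fc = np.load(os.path.join(base, 'functional_connectivity.npy'))   # expected shape (n_subjects, 100, 100)
ids = pd.read_csv(os.path.join(base, 'subject_ids.csv'))

iu = np.triu_indices(fc.shape[1], k=1)   # upper triangle, diagonal excluded
X = fc[:, iu[0], iu[1]]                  # expected shape (n_subjects, 4950) for 100 nodes
print(X.shape, len(ids))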
34 changes: 24 additions & 10 deletions examples/macs.py
@@ -14,35 +14,49 @@
df = pd.read_csv('/spm-data/vault-data3/mmll/projects/cpm_macs_example/datahub/AnalysisReady/hc/DTI_fractional_anisotropy/subjects.csv',
na_values=-99)

#X = X[df['Group'] == 1]
#df = df[df['Group'] == 1]
+X = X[df['Group'] <= 1]
+df = df[df['Group'] <= 1]


-X = X[~df['CTQ_Sum'].isna()]
-df = df[~df['CTQ_Sum'].isna()]
+#X = X[~df['CTQ_Sum'].isna()]
+#df = df[~df['CTQ_Sum'].isna()]
+X = X[~df['IQ'].isna()]
+df = df[~df['IQ'].isna()]

#X = X[~df['NEOFFI_Neurotizismus'].isna()]
#df = df[~df['NEOFFI_Neurotizismus'].isna()]

#X = X[~df['FSozU_Sum'].isna()]
#df = df[~df['FSozU_Sum'].isna()]

#X = X[~df['BDI_Sum'].isna()]
#df = df[~df['BDI_Sum'].isna()]
covs = df[['Alter', 'Geschlecht', 'Site']].to_numpy()
#covs = df[['Geschlecht', 'Site']].to_numpy()
#covs = df[['Geschlecht']].to_numpy()
#y = df['BDI_Sum'].to_numpy()
-y = df['CTQ_Sum'].to_numpy()
#y = df['FSozU_Sum'].to_numpy()
#y = df['Alter'].to_numpy()
+y = df['IQ'].to_numpy()

#y = df['NEOFFI_Neurotizismus'].to_numpy()
#y = df['CTQ_Sum'].to_numpy()
#covs = df[['Geschlecht']].to_numpy()
#y = df['Alter'].to_numpy()


from cpm.edge_selection import PThreshold, UnivariateEdgeSelection
-p_threshold = PThreshold(threshold=[0.05], correction=[None])
+p_threshold = PThreshold(threshold=[0.1, 0.05, 0.01, 0.005, 0.001], correction=[None])
univariate_edge_selection = UnivariateEdgeSelection(edge_statistic=['pearson_partial'],
edge_selection=[p_threshold])

-cpm = CPMRegression(results_directory='./tmp/macs_ctq',
+cpm = CPMRegression(results_directory='./tmp/macs_IQ_partial',
cv=KFold(n_splits=10, shuffle=True, random_state=42),
edge_selection=univariate_edge_selection,
#cv_edge_selection=KFold(n_splits=2, shuffle=True, random_state=42),
cv_edge_selection=KFold(n_splits=10, shuffle=True, random_state=42),
add_edge_filter=True,
-                    n_permutations=100)
+                    n_permutations=20)
results = cpm.estimate(X=X, y=y, covariates=covs)
#print(results)
-cpm._calculate_permutation_results()
+#cpm._calculate_permutation_results()

44 changes: 44 additions & 0 deletions examples/monte_carlo.py
@@ -0,0 +1,44 @@
import numpy as np
import matplotlib.pyplot as plt


def simulate_scenario(scenario, n=1000, noise_level=0.1):
np.random.seed(42) # For reproducibility
if scenario == 1:
# Scenario 1: x and y associated, z independent
x = np.random.normal(0, 1, n)
y = 2 * x + np.random.normal(0, noise_level, n)
z = np.random.normal(0, 1, n) # Independent of x and y
elif scenario == 2:
# Scenario 2: x and y spurious due to z
z = np.random.normal(0, 1, n)
x = 2 * z + np.random.normal(0, noise_level, n)
y = -3 * z + np.random.normal(0, noise_level, n)
elif scenario == 3:
# Scenario 3: x and y associated; z partially explains it
z = np.random.normal(0, 1, n)
x = 2 * z + np.random.normal(0, noise_level, n)
y = 3 * x - 1.5 * z + np.random.normal(0, noise_level, n)
else:
raise ValueError("Invalid scenario. Choose 1, 2, or 3.")
return x, y, z


# Simulate and plot results for each scenario
for scenario in range(1, 4):
x, y, z = simulate_scenario(scenario, noise_level=2)

print(f"Scenario {scenario} correlations:")
print(f" Corr(x, y): {np.corrcoef(x, y)[0, 1]:.2f}")
print(f" Corr(x, z): {np.corrcoef(x, z)[0, 1]:.2f}")
print(f" Corr(y, z): {np.corrcoef(y, z)[0, 1]:.2f}")

plt.figure()
plt.scatter(x, y, alpha=0.5, label="x vs y")
plt.scatter(x, z, alpha=0.5, label="x vs z")
plt.scatter(y, z, alpha=0.5, label="y vs z")
plt.legend()
plt.title(f"Scenario {scenario}")
plt.xlabel("Variable")
plt.ylabel("Variable")
plt.show()
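A hedged companion check that is not part of the script: the partial correlation of x and y given z is what separates the spurious association in scenario 2 from the genuine ones in scenarios 1 and 3, which is the point the confound-corrected models build on. partial_corr below is a simple residualization helper written only for this sketch; run it after the definitions above.

import numpy as np


def partial_corr(x, y, z):
    """Correlation of x and y after regressing z (plus an intercept) out of both."""
    design = np.column_stack([np.ones_like(z), z])
    x_res = x - design @ np.linalg.lstsq(design, x, rcond=None)[0]
    y_res = y - design @ np.linalg.lstsq(design, y, rcond=None)[0]
    return np.corrcoef(x_res, y_res)[0, 1]


for scenario in range(1, 4):
    x, y, z = simulate_scenario(scenario, noise_level=2)  # from the script above
    print(f"Scenario {scenario}: corr(x, y) = {np.corrcoef(x, y)[0, 1]:+.2f}, "
          f"partial corr(x, y | z) = {partial_corr(x, y, z):+.2f}")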
15 changes: 15 additions & 0 deletions examples/monte_carlo_tutorial.py
@@ -0,0 +1,15 @@
import numpy as np


assoc_strength = 0.5

x = np.random.normal(0, 1, 1000)
e = np.random.normal(0, 1, 1000)
y = assoc_strength * x + e

print(f"var({assoc_strength}*x)={np.var(assoc_strength * x)}")
print(f"var(noise)={np.var(e)}")
print(f"var(y)={np.var(y)}")

print(f"r={np.corrcoef(x, y)[0, 1]:.2f}")
print(f"Explained variance: {np.square(np.corrcoef(x, y)[0, 1]):.2f}")
71 changes: 39 additions & 32 deletions examples/plot_simulation_results.py
@@ -5,35 +5,42 @@
import pandas as pd


results_folder = "/home/nwinter/PycharmProjects/cpm_python/examples/tmp/"

results = {"brain not associated with y\n brain not associated with confound\n confound associated with y": os.path.join(results_folder, "simulated_data_no_no_link"),
"brain not associated with y\n brain associated with confound\n confound associated with y": os.path.join(results_folder, "simulated_data_no_link"),
"brain weakly associated with y\n brain associated with confound\n confound associated with y": os.path.join(results_folder, "simulated_data_weak_link"),
"brain strongly associated with y\n brain associated with confound\n confound associated with y": os.path.join(results_folder, "simulated_data_direct_link"),

}

dfs = []
for link_type, folder in results.items():
df = pd.read_csv(os.path.join(folder, 'cv_results.csv'))
df['link_type'] = link_type
dfs.append(df)

concatenated_df = pd.concat(dfs, ignore_index=True)

concatenated_df = concatenated_df[concatenated_df['network'] == 'both']

concatenated_df['model'] = concatenated_df['model'].replace({"covariates": "confound only", "full": "connectome + confound",
"connectome": "connectome only"})

g = sns.FacetGrid(concatenated_df, col="link_type", margin_titles=True, despine=True,
height=2.5)
g.map(plt.axvline, x=0, color='grey', linewidth=0.5, zorder=-1)
g.map(sns.violinplot, "pearson_score", "model", inner=None, split=True, hue=1, hue_order=[1, 2],
density_norm='count', dodge=True, palette="Blues_r")
g.map(sns.boxplot, "pearson_score", "model", dodge=True, hue=1, hue_order=[2, 1])
g.set_titles(col_template="{col_name}", size=7)
g.set_xlabels("Pearson correlation", size=8)
plt.show()
print()
results_folder = "/spm-data/vault-data3/mmll/projects/confound_corrected_cpm/results"

edge_statistics = ['pearson', 'pearson_partial']


for edge_statistic in edge_statistics:
results = {"brain not associated with y\n brain not associated with confound\n confound associated with y": os.path.join(
results_folder, f"simulated_data_no_no_link_{edge_statistic}"),
"brain not associated with y\n brain associated with confound\n confound associated with y": os.path.join(
results_folder, f"simulated_data_no_link_{edge_statistic}"),
"brain weakly associated with y\n brain associated with confound\n confound associated with y": os.path.join(
results_folder, f"simulated_data_weak_link_{edge_statistic}"),
"brain strongly associated with y\n brain associated with confound\n confound associated with y": os.path.join(
results_folder, f"simulated_data_direct_link_{edge_statistic}")}
dfs = []
for link_type, folder in results.items():
df = pd.read_csv(os.path.join(folder, 'cv_results.csv'))
df['link_type'] = link_type
dfs.append(df)

concatenated_df = pd.concat(dfs, ignore_index=True)

concatenated_df = concatenated_df[concatenated_df['network'] == 'both']

concatenated_df['model'] = concatenated_df['model'].replace({"covariates": "confound only", "full": "connectome + confound",
"connectome": "connectome only"})


g = sns.FacetGrid(concatenated_df, col="link_type", margin_titles=True, despine=True,
height=2.5, hue="model")
g.map(plt.axvline, x=0, color='grey', linewidth=0.5, zorder=-1)
g.map(sns.violinplot, "pearson_score", "model", inner=None, split=True,# hue="model",# hue_order=[1, 2],
density_norm='count', dodge=True, palette="Blues_r", fill=True)
#g.map(sns.boxplot, "pearson_score", "model", dodge=True, #hue=1, hue_order=[2, 1]
# )
g.set_titles(col_template="{col_name}", size=7)
g.set_xlabels("Pearson correlation", size=8)
plt.show()
print()