diff --git a/.gitignore b/.gitignore index da12acc..e8b9312 100644 --- a/.gitignore +++ b/.gitignore @@ -162,4 +162,6 @@ cython_debug/ docs examples/tmp -examples/simulated_data \ No newline at end of file +examples/simulated_data + +poetry.lock \ No newline at end of file diff --git a/cpm/reporting/plots/plots.py b/cpm/reporting/plots/plots.py index e6f8fba..c460bb1 100644 --- a/cpm/reporting/plots/plots.py +++ b/cpm/reporting/plots/plots.py @@ -8,6 +8,44 @@ def bar_plot(df, selected_metric, results_folder): + cmap = sns.color_palette("muted", n_colors=3) + flatui = [ "#ba6523", "#31559e", "#cc3727"] + cmap = sns.color_palette(flatui) + + # Dropdown for metric selection + + fig = sns.FacetGrid(df[df['model'].isin(['connectome', 'covariates', 'residuals', 'full'])], + row='model', col='network', hue='network', + aspect=3, height=2 / 2.54, + col_order = ["positive", "negative", "both"], + row_order=['covariates', 'connectome','full', 'residuals'], + sharex='col', sharey=False, palette=cmap) + + # then we add the densities kdeplots + fig.map(sns.boxplot, selected_metric, "model") + + #fig.map(sns.kdeplot, selected_metric, + # bw_adjust=1, clip_on=False, + # fill=True, alpha=1, linewidth=1) + + #fig.map(sns.violinplot, selected_metric, "model", inner = None, split = True, hue=1, hue_order=[1, 2], + # density_norm = 'count', dodge = True, fill = True) + #fig.map(sns.boxplot, selected_metric, "model")#, dodge=True, hue=1, hue_order=[2, 1]) + + fig.set_titles(template='') + fig.set(ylabel=None) + for ax,m in zip(fig.axes[0,:], ["positive", "negative", "both"]): + ax.set_title(m, fontweight='bold', fontsize=8) + for ax,l in zip(fig.axes[:,0], ['covariates', 'connectome','full', 'residuals']): + ax.set_ylabel(l, fontweight='bold', fontsize=8, rotation=0, ha='right', va='center') + fig.set(yticks=[]) + + fig.fig.subplots_adjust(hspace=0.2, wspace=0.15) + if selected_metric == 'pearson_score' or selected_metric == 'spearman_score': + for ax in fig.axes.ravel(): + ax.set_xlim(-0.5, 1) + fig.map(plt.axvline, x=0, color='black', linewidth=0.5, zorder=-1) + """ cmap = sns.color_palette("muted", n_colors=4) # Dropdown for metric selection @@ -35,9 +73,9 @@ def bar_plot(df, selected_metric, results_folder): plt.tight_layout() if selected_metric == 'pearson_score' or selected_metric == 'spearman_score': axes[1].set_ylim(-0.5, 1) - + """ plot_name = os.path.join(results_folder, "plots", 'point_plot.png') - sns.despine(offset=1, trim=True) + #sns.despine(offset=1, trim=True) fig.savefig(plot_name, dpi=300) return plot_name, fig diff --git a/cpm/reporting/report.py b/cpm/reporting/report.py index cb33fb7..b241747 100644 --- a/cpm/reporting/report.py +++ b/cpm/reporting/report.py @@ -17,8 +17,14 @@ def main(): #st.session_state['results_directory'] = '/spm-data/vault-data3/mmll/projects/cpm_macs_example/haushaltsnetto' #st.session_state['results_directory'] = '/spm-data/vault-data3/mmll/projects/cpm_macs_example/haushaltsnetto_partial' #st.session_state['results_directory'] = '/spm-data/vault-data3/mmll/projects/cpm_macs_example/haushaltsnetto_05_fdr' + #st.session_state[ + # 'results_directory'] = '/home/nwinter/PycharmProjects/cpm_python/examples/tmp/simulated_data_no_no_link/' + #st.session_state[ + # 'results_directory'] = '/home/nwinter/PycharmProjects/cpm_python/examples/tmp/macs_fsozu_partial/' + #st.session_state[ + # 'results_directory'] = '/home/nwinter/PycharmProjects/cpm_python/examples/tmp/macs_age_partial/' st.session_state[ - 'results_directory'] = '/home/nwinter/PycharmProjects/cpm_python/examples/tmp/simulated_data_no_no_link/' + 'results_directory'] = '/home/nwinter/PycharmProjects/cpm_python/examples/tmp/macs_IQ_partial/' st.session_state['df'] = pd.read_csv(os.path.join(st.session_state.results_directory, 'cv_results.csv')) st.session_state['df_main'] = load_results_from_folder(st.session_state.results_directory, 'cv_results_mean_std.csv') diff --git a/cpm/reporting/results.py b/cpm/reporting/results.py index bc86341..58e256a 100644 --- a/cpm/reporting/results.py +++ b/cpm/reporting/results.py @@ -23,4 +23,4 @@ with st.container(border=False): #st.plotly_chart(fig, use_container_width=True) - st.image(plot_name) + st.image(plot_name, width=600) diff --git a/cpm/simulate_data.py b/cpm/simulate_data.py index 7f5a709..563c39e 100644 --- a/cpm/simulate_data.py +++ b/cpm/simulate_data.py @@ -109,15 +109,15 @@ def simulate_regression_data_2(n_samples: int = 500, y = np.copy(y_rand) if link_type == 'no_link': - z = 1 * y_rand + z_rand + z = 0.4 * y_rand + z_rand X[:, :n_informative_features] = x_rand[:, :n_informative_features] + z elif link_type == 'no_no_link': - z = 0.8 * y_rand + z_rand + z = 0.4 * y_rand + z_rand elif link_type == 'direct_link': - z = 0.8 * y_rand + z_rand + z = 0.4 * y_rand + z_rand X[:, :n_informative_features] = x_rand[:, :n_informative_features] + (0.2 * (noise * y_rand) + y_rand) + z elif link_type == 'weak_link': - z = 0.3 * y_rand + z_rand + z = 0.4 * y_rand + z_rand X[:, :n_informative_features] = x_rand[:, :n_informative_features] + (0.8 * (noise * y_rand) + y_rand) + z else: raise NotImplemented(f'Link type {link_type} not implemented') diff --git a/examples/example_simulated_data2.py b/examples/example_simulated_data2.py index 6cd0655..ae5d7ab 100644 --- a/examples/example_simulated_data2.py +++ b/examples/example_simulated_data2.py @@ -5,25 +5,28 @@ from cpm.edge_selection import PThreshold, UnivariateEdgeSelection -link_types = [#'no_link', - #'no_no_link', +link_types = ['no_link', + 'no_no_link', 'direct_link', 'weak_link' ] +edge_statistics = ['pearson', 'pearson_partial'] +results_folder = '/spm-data/vault-data3/mmll/projects/confound_corrected_cpm/results' for link in link_types: - X, y, covariates = simulate_regression_data_2(n_features=1225, n_informative_features=50, link_type=link) + for edge_statistic in edge_statistics: + X, y, covariates = simulate_regression_data_2(n_features=1225, n_informative_features=50, link_type=link) - univariate_edge_selection = UnivariateEdgeSelection(edge_statistic=['pearson'], - edge_selection=[PThreshold(threshold=[0.05], - correction=[None])]) - cpm = CPMRegression(results_directory=f'./tmp/simulated_data_{link}', - cv=RepeatedKFold(n_splits=10, n_repeats=5, random_state=42), - edge_selection=univariate_edge_selection, - #cv_edge_selection=ShuffleSplit(n_splits=1, test_size=0.2, random_state=42), - add_edge_filter=True, - n_permutations=2) - cpm.estimate(X=X, y=y, covariates=covariates) + univariate_edge_selection = UnivariateEdgeSelection(edge_statistic=[edge_statistic], + edge_selection=[PThreshold(threshold=[0.05], + correction=['fdr_by'])]) + cpm = CPMRegression(results_directory=f'{results_folder}/simulated_data_{link}_{edge_statistic}', + cv=RepeatedKFold(n_splits=10, n_repeats=5, random_state=42), + edge_selection=univariate_edge_selection, + #cv_edge_selection=ShuffleSplit(n_splits=1, test_size=0.2, random_state=42), + add_edge_filter=True, + n_permutations=2) + cpm.estimate(X=X, y=y, covariates=covariates) - #cpm._calculate_permutation_results('./tmp/example_simulated_data2') + #cpm._calculate_permutation_results('./tmp/example_simulated_data2') diff --git a/examples/human_connectome_project.py b/examples/human_connectome_project.py new file mode 100644 index 0000000..d7b3da8 --- /dev/null +++ b/examples/human_connectome_project.py @@ -0,0 +1,49 @@ +import os +import pandas as pd +import numpy as np + +# Define the folder containing the .txt files +folder_path = ('/spm-data/vault-data3/mmll/data/HCP/100_nodes_times_series/3T_HCP1200_MSMAll_d100_ts2') + +# Initialize a list to store correlation matrices +correlation_matrices = [] + +# Initialize a list to store subject IDs +subject_ids = [] + +file_list = [f for f in os.listdir(folder_path) if f.endswith('.txt')] + +# Sort the filenames by subject ID (assuming subject IDs are in the filename before .txt) +file_list.sort() # This will sort alphabetically, which works if subject IDs are formatted consistently + +# Loop through all files in the specified folder +for filename in file_list: + print(filename) + # Construct the full file path + file_path = os.path.join(folder_path, filename) + + # Read the text file into a DataFrame + df = pd.read_csv(file_path, sep='\s+', header=None) + + # Calculate the correlation matrix + correlation_matrix = df.corr() + + # Append the correlation matrix to the list + correlation_matrices.append(correlation_matrix.values) + + # Extract subject ID from the filename (remove the .txt extension) + subject_id = filename[:-4] # Exclude the last 4 characters ('.txt') + subject_ids.append(subject_id) + +# Convert the list of correlation matrices into a 3D numpy array +correlation_array = np.array(correlation_matrices) + +# Create a DataFrame for subject IDs +subject_ids_df = pd.DataFrame(subject_ids, columns=['Subject_ID']) + +# Save the numpy array and DataFrame to disk +np.save('/spm-data/vault-data3/mmll/data/HCP/100_nodes_times_series/functional_connectivity.npy', correlation_array) # Save as .npy file +subject_ids_df.to_csv('/spm-data/vault-data3/mmll/data/HCP/100_nodes_times_series/subject_ids.csv', index=False) # Save as .csv file + + +print() \ No newline at end of file diff --git a/examples/macs.py b/examples/macs.py index 5611449..748e189 100644 --- a/examples/macs.py +++ b/examples/macs.py @@ -14,35 +14,49 @@ df = pd.read_csv('/spm-data/vault-data3/mmll/projects/cpm_macs_example/datahub/AnalysisReady/hc/DTI_fractional_anisotropy/subjects.csv', na_values=-99) -#X = X[df['Group'] == 1] -#df = df[df['Group'] == 1] +X = X[df['Group'] <= 1] +df = df[df['Group'] <= 1] -X = X[~df['CTQ_Sum'].isna()] -df = df[~df['CTQ_Sum'].isna()] +#X = X[~df['CTQ_Sum'].isna()] +#df = df[~df['CTQ_Sum'].isna()] +X = X[~df['IQ'].isna()] +df = df[~df['IQ'].isna()] + +#X = X[~df['NEOFFI_Neurotizismus'].isna()] +#df = df[~df['NEOFFI_Neurotizismus'].isna()] + +#X = X[~df['FSozU_Sum'].isna()] +#df = df[~df['FSozU_Sum'].isna()] #X = X[~df['BDI_Sum'].isna()] #df = df[~df['BDI_Sum'].isna()] covs = df[['Alter', 'Geschlecht', 'Site']].to_numpy() +#covs = df[['Geschlecht', 'Site']].to_numpy() #covs = df[['Geschlecht']].to_numpy() #y = df['BDI_Sum'].to_numpy() -y = df['CTQ_Sum'].to_numpy() +#y = df['FSozU_Sum'].to_numpy() +#y = df['Alter'].to_numpy() +y = df['IQ'].to_numpy() + +#y = df['NEOFFI_Neurotizismus'].to_numpy() +#y = df['CTQ_Sum'].to_numpy() #covs = df[['Geschlecht']].to_numpy() #y = df['Alter'].to_numpy() from cpm.edge_selection import PThreshold, UnivariateEdgeSelection -p_threshold = PThreshold(threshold=[0.05], correction=[None]) +p_threshold = PThreshold(threshold=[0.1, 0.05, 0.01, 0.005, 0.001], correction=[None]) univariate_edge_selection = UnivariateEdgeSelection(edge_statistic=['pearson_partial'], edge_selection=[p_threshold]) -cpm = CPMRegression(results_directory='./tmp/macs_ctq', +cpm = CPMRegression(results_directory='./tmp/macs_IQ_partial', cv=KFold(n_splits=10, shuffle=True, random_state=42), edge_selection=univariate_edge_selection, - #cv_edge_selection=KFold(n_splits=2, shuffle=True, random_state=42), + cv_edge_selection=KFold(n_splits=10, shuffle=True, random_state=42), add_edge_filter=True, - n_permutations=100) + n_permutations=20) results = cpm.estimate(X=X, y=y, covariates=covs) #print(results) -cpm._calculate_permutation_results() +#cpm._calculate_permutation_results() diff --git a/examples/monte_carlo.py b/examples/monte_carlo.py new file mode 100644 index 0000000..70cf628 --- /dev/null +++ b/examples/monte_carlo.py @@ -0,0 +1,44 @@ +import numpy as np +import matplotlib.pyplot as plt + + +def simulate_scenario(scenario, n=1000, noise_level=0.1): + np.random.seed(42) # For reproducibility + if scenario == 1: + # Scenario 1: x and y associated, z independent + x = np.random.normal(0, 1, n) + y = 2 * x + np.random.normal(0, noise_level, n) + z = np.random.normal(0, 1, n) # Independent of x and y + elif scenario == 2: + # Scenario 2: x and y spurious due to z + z = np.random.normal(0, 1, n) + x = 2 * z + np.random.normal(0, noise_level, n) + y = -3 * z + np.random.normal(0, noise_level, n) + elif scenario == 3: + # Scenario 3: x and y associated; z partially explains it + z = np.random.normal(0, 1, n) + x = 2 * z + np.random.normal(0, noise_level, n) + y = 3 * x - 1.5 * z + np.random.normal(0, noise_level, n) + else: + raise ValueError("Invalid scenario. Choose 1, 2, or 3.") + return x, y, z + + +# Simulate and plot results for each scenario +for scenario in range(1, 4): + x, y, z = simulate_scenario(scenario, noise_level=2) + + print(f"Scenario {scenario} correlations:") + print(f" Corr(x, y): {np.corrcoef(x, y)[0, 1]:.2f}") + print(f" Corr(x, z): {np.corrcoef(x, z)[0, 1]:.2f}") + print(f" Corr(y, z): {np.corrcoef(y, z)[0, 1]:.2f}") + + plt.figure() + plt.scatter(x, y, alpha=0.5, label="x vs y") + plt.scatter(x, z, alpha=0.5, label="x vs z") + plt.scatter(y, z, alpha=0.5, label="y vs z") + plt.legend() + plt.title(f"Scenario {scenario}") + plt.xlabel("Variable") + plt.ylabel("Variable") + plt.show() diff --git a/examples/monte_carlo_tutorial.py b/examples/monte_carlo_tutorial.py new file mode 100644 index 0000000..5ee1c54 --- /dev/null +++ b/examples/monte_carlo_tutorial.py @@ -0,0 +1,15 @@ +import numpy as np + + +assoc_strength = 0.5 + +x = np.random.normal(0, 1, 1000) +e = np.random.normal(0, 1, 1000) +y = assoc_strength * x + e + +print(f"var({assoc_strength}*x)={np.var(assoc_strength * x)}") +print(f"var(noise)={np.var(e)}") +print(f"var(y)={np.var(y)}") + +print(f"r={np.corrcoef(x, y)[0, 1]:.2f}") +print(f"Explained variance: {np.square(np.corrcoef(x, y)[0, 1]):.2f}") \ No newline at end of file diff --git a/examples/plot_simulation_results.py b/examples/plot_simulation_results.py index bbac2fe..6cab143 100644 --- a/examples/plot_simulation_results.py +++ b/examples/plot_simulation_results.py @@ -5,35 +5,42 @@ import pandas as pd -results_folder = "/home/nwinter/PycharmProjects/cpm_python/examples/tmp/" - -results = {"brain not associated with y\n brain not associated with confound\n confound associated with y": os.path.join(results_folder, "simulated_data_no_no_link"), - "brain not associated with y\n brain associated with confound\n confound associated with y": os.path.join(results_folder, "simulated_data_no_link"), - "brain weakly associated with y\n brain associated with confound\n confound associated with y": os.path.join(results_folder, "simulated_data_weak_link"), - "brain strongly associated with y\n brain associated with confound\n confound associated with y": os.path.join(results_folder, "simulated_data_direct_link"), - - } - -dfs = [] -for link_type, folder in results.items(): - df = pd.read_csv(os.path.join(folder, 'cv_results.csv')) - df['link_type'] = link_type - dfs.append(df) - -concatenated_df = pd.concat(dfs, ignore_index=True) - -concatenated_df = concatenated_df[concatenated_df['network'] == 'both'] - -concatenated_df['model'] = concatenated_df['model'].replace({"covariates": "confound only", "full": "connectome + confound", - "connectome": "connectome only"}) - -g = sns.FacetGrid(concatenated_df, col="link_type", margin_titles=True, despine=True, - height=2.5) -g.map(plt.axvline, x=0, color='grey', linewidth=0.5, zorder=-1) -g.map(sns.violinplot, "pearson_score", "model", inner=None, split=True, hue=1, hue_order=[1, 2], - density_norm='count', dodge=True, palette="Blues_r") -g.map(sns.boxplot, "pearson_score", "model", dodge=True, hue=1, hue_order=[2, 1]) -g.set_titles(col_template="{col_name}", size=7) -g.set_xlabels("Pearson correlation", size=8) -plt.show() -print() +results_folder = "/spm-data/vault-data3/mmll/projects/confound_corrected_cpm/results" + +edge_statistics = ['pearson', 'pearson_partial'] + + +for edge_statistic in edge_statistics: + results = {"brain not associated with y\n brain not associated with confound\n confound associated with y": os.path.join( + results_folder, f"simulated_data_no_no_link_{edge_statistic}"), + "brain not associated with y\n brain associated with confound\n confound associated with y": os.path.join( + results_folder, f"simulated_data_no_link_{edge_statistic}"), + "brain weakly associated with y\n brain associated with confound\n confound associated with y": os.path.join( + results_folder, f"simulated_data_weak_link_{edge_statistic}"), + "brain strongly associated with y\n brain associated with confound\n confound associated with y": os.path.join( + results_folder, f"simulated_data_direct_link_{edge_statistic}")} + dfs = [] + for link_type, folder in results.items(): + df = pd.read_csv(os.path.join(folder, 'cv_results.csv')) + df['link_type'] = link_type + dfs.append(df) + + concatenated_df = pd.concat(dfs, ignore_index=True) + + concatenated_df = concatenated_df[concatenated_df['network'] == 'both'] + + concatenated_df['model'] = concatenated_df['model'].replace({"covariates": "confound only", "full": "connectome + confound", + "connectome": "connectome only"}) + + + g = sns.FacetGrid(concatenated_df, col="link_type", margin_titles=True, despine=True, + height=2.5, hue="model") + g.map(plt.axvline, x=0, color='grey', linewidth=0.5, zorder=-1) + g.map(sns.violinplot, "pearson_score", "model", inner=None, split=True,# hue="model",# hue_order=[1, 2], + density_norm='count', dodge=True, palette="Blues_r", fill=True) + #g.map(sns.boxplot, "pearson_score", "model", dodge=True, #hue=1, hue_order=[2, 1] + # ) + g.set_titles(col_template="{col_name}", size=7) + g.set_xlabels("Pearson correlation", size=8) + plt.show() + print() diff --git a/examples/simulate_data_3.py b/examples/simulate_data_3.py new file mode 100644 index 0000000..6a6a40e --- /dev/null +++ b/examples/simulate_data_3.py @@ -0,0 +1,96 @@ +from sklearn.linear_model import LinearRegression +from sklearn.metrics import r2_score +import numpy as np +import matplotlib.pyplot as plt + + +def generate_scenario_data(scenario, n_samples=1000, n_features=10, n_informative=3, noise_level=0.1): + np.random.seed(42) # For reproducibility + + # Generate feature matrix X + X = np.random.normal(0, 1, (n_samples, n_features)) + + # Generate ground truth coefficients for X's influence on z and y + z_coefficients = np.zeros(n_features) + y_coefficients = np.zeros(n_features) + + if scenario == 1: + # Scenario 1: y and z are influenced by X independently + z_coefficients[:n_informative] = np.linspace(1, 0.1, n_informative) # Decreasing influence + y_coefficients[:n_informative] = np.linspace(1, 0.1, n_informative) # Decreasing influence + z = np.dot(X, z_coefficients) + np.random.normal(0, noise_level, n_samples) + y = np.dot(X, y_coefficients) + np.random.normal(0, noise_level, n_samples) + + elif scenario == 2: + # Scenario 2: z influences both X and y, inducing spurious association + z = np.random.normal(0, 1, n_samples) + z_coefficients[:n_informative] = np.linspace(1, 0.1, n_informative) + y = 2 * z + np.random.normal(0, noise_level, n_samples) + for i in range(n_features): + X[:, i] = z * z_coefficients[i] + np.random.normal(0, noise_level, n_samples) + + elif scenario == 3: + # Scenario 3: y is influenced by both X and z, with z partially mediating + z_coefficients[:n_informative] = np.linspace(1, 0.1, n_informative) + y_coefficients[:n_informative] = np.linspace(1, 0.1, n_informative) + z = np.dot(X, z_coefficients) + np.random.normal(0, noise_level, n_samples) + y = np.dot(X, y_coefficients) + 0.5 * z + np.random.normal(0, noise_level, n_samples) + elif scenario == 4: + y_coefficients[:n_informative] = np.linspace(1, 0.1, n_informative) + z = np.random.normal(0, noise_level, n_samples) + y = np.dot(X, y_coefficients) + np.random.normal(0, noise_level, n_samples) + + else: + raise ValueError("Invalid scenario. Choose 1, 2, or 3.") + + return X, y, z + + +def calculate_explained_variance(X, target): + # Fit a linear regression model + model = LinearRegression() + model.fit(X, target) + + # Predict the target using X + target_pred = model.predict(X) + + # Calculate R^2 (explained variance) + r2 = r2_score(target, target_pred) + return r2 + + +def calculate_explained_variance_y_given_z(y, z): + # Fit a linear regression model + model = LinearRegression() + z = z.reshape(-1, 1) # Reshape z to fit as a single feature + model.fit(z, y) + y_pred = model.predict(z) + r2 = r2_score(y, y_pred) + return r2 + + +# Generate data for each scenario and calculate explained variances +for scenario in range(1, 5): + X, y, z = generate_scenario_data(scenario, noise_level=1) + + # Calculate explained variances + explained_variance_X_y = calculate_explained_variance(X, y) + explained_variance_X_z = calculate_explained_variance(X, z) + explained_variance_y_given_z = calculate_explained_variance_y_given_z(y, z) + + print(f"Scenario {scenario}:") + print(f" Explained Variance (R^2) of X for y: {explained_variance_X_y:.2f}") + print(f" Explained Variance (R^2) of X for z: {explained_variance_X_z:.2f}") + print(f" Explained Variance (R^2) of y for z: {explained_variance_y_given_z:.2f}") + print(f" Corr(X[:, 0], y): {np.corrcoef(X[:, 0], y)[0, 1]:.2f}") + print(f" Corr(X[:, 0], z): {np.corrcoef(X[:, 0], z)[0, 1]:.2f}") + print(f" Corr(y, z): {np.corrcoef(y, z)[0, 1]:.2f}") + + # Plot y vs z for visualization + plt.figure() + plt.scatter(y, z, alpha=0.5, label="y vs z") + plt.title(f"Scenario {scenario}: y vs z") + plt.xlabel("y") + plt.ylabel("z") + plt.legend() + plt.show()