ROSMAP (Alzheimer’s Disease) Biomarker Discovery¶
Cohort Summary:
Stage |
Methylation |
mRNA |
miRNA |
Clinical |
|---|---|---|---|---|
Raw (features x samples) |
420,132 x 741 |
55,889 x 642 |
309 x 704 |
18 x 3584 |
Data Aligned (samples x features) |
521 x 420,132 |
521 x 55,889 |
521 x 309 |
521 x 17 |
Variance top 100k (Methylation only) |
521 x 100,000 |
- |
- |
- |
Anova-F & Random Forest Intersection k=8000 |
521 x 772 |
521 x 2054 |
- |
- |
After feature selection |
521 x 772 |
521 x 2054 |
521 x 309 |
521 x 17 |
Final feature selection (Variance k=300) |
521 x 300 |
521 x 300 |
521 x 300 |
521 x 17 |
Target Definition: Multi-class cognitive diagnosis derived from clinical cogdx scores. No Cognitive Impairment (NCI, n=168), Mild Cognitive Impairment (MCI, n=134), and Dementia (n=219).
Data Source: AD Knowledge Portal ROSMAP
!pip install synapseclient pandas bioneuralnet
# torch CPU
!pip install --no-cache-dir torch==2.5.1+cpu --index-url https://download.pytorch.org/whl/cpu
# torch_geometric
!pip install --no-cache-dir pyg-lib torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric -f https://data.pyg.org/whl/torch-2.5.1+cpu.html
# tunning for DPMON
!pip install --no-cache-dir "ray[tune]==2.46.0"
import synapseclient
TOKEN = "your_synapse_token"
try:
syn = synapseclient.login(authToken=TOKEN)
print(f"Login successful! User: {syn.getUserProfile().userName}")
except Exception as e:
print("Login failed. Check that you copied the full token.")
raise e
# you should recieve a Welcome + Login Successful message
import pandas as pd
try:
syn = synapseclient.login(authToken=TOKEN)
print(f"Login successful! User: {syn.getUserProfile().userName}")
except Exception as e:
print("Login failed. Check that you copied the full token.")
raise e
files_to_download = {
"ROSMAP_assay_RNAseq_metadata.csv": {"id": "syn21088596", "ver": 5},
"ROSMAP_assay_methylationArray_metadata.csv": {"id": "syn23569437", "ver": 1},
"ROSMAP_assay_miRNAarray_nanostring_metadata.csv": {"id": "syn23569439", "ver": 4},
"ROSMAP_arrayMethylation_imputed.tsv.gz": {"id": "syn3168763", "ver": 2},
"ROSMAP_clinical.csv": {"id": "syn3191087", "ver": 11},
"ROSMAP_clinical_codebook.pdf": {"id": "syn3191090", "ver": 2},
"ROSMAP_arraymiRNA.gct": {"id": "syn3387327", "ver": 1},
"ROSMAP_RNAseq_FPKM_gene.tsv": {"id": "syn3505720", "ver": 2},
"ROSMAP_biospecimen_metadata.csv": {"id": "syn21323366", "ver": 1},
"ROSMAP_IDkey.csv": {"id": "syn3382527", "ver": 9},
}
downloaded_files = {}
print("Starting download")
for filename, info in files_to_download.items():
print(f"Downloading {filename}")
entity = syn.get(info["id"], version=info["ver"])
downloaded_files[filename] = entity.path
print("\nAll files downloaded")
# clean and transpose the ROSMAP data
rna_path = downloaded_files["ROSMAP_RNAseq_FPKM_gene.tsv"]
rna_meta_path = downloaded_files["ROSMAP_assay_RNAseq_metadata.csv"]
# get the valid specimenIDs
df_meta = pd.read_csv(rna_meta_path)
print(f"Metadata Columns: {df_meta.columns.tolist()}")
# we use a set for faster lookup of valid IDs
valid_specimens = set(df_meta['specimenID'].astype(str))
print(f"Found {len(valid_specimens)} valid specimenIDs in metadata")
df_rna = pd.read_csv(rna_path, sep='\t')
# setting gene ID as index
if 'gene_id' in df_rna.columns:
df_rna = df_rna.set_index('gene_id')
if 'tracking_id' in df_rna.columns:
df_rna = df_rna.drop(columns=['tracking_id'])
# RNA Header ex: "123_456_0"
# Metadata ID ex: "123_456"
valid_cols = []
renamed_cols = {}
for col in df_rna.columns:
cleaned_id = col.rsplit('_', 1)[0]
if cleaned_id in valid_specimens:
valid_cols.append(col)
renamed_cols[col] = cleaned_id
print(f"Matched {len(valid_cols)} columns from RNA file to metadata")
df_rna = df_rna[valid_cols]
df_rna = df_rna.rename(columns=renamed_cols)
df_rna = df_rna.T
print(f"Final Cleaned RNA shape: {df_rna.shape}")
display(df_rna.head())
# Should see:
#Found 3196 valid specimenIDs in metadata
#Matched 640 columns from RNA file to metadata
#Final Cleaned RNA shape: (640, 55889)
# we perform the same steps for Methylation
meth_path = downloaded_files["ROSMAP_arrayMethylation_imputed.tsv.gz"]
meth_meta_path = downloaded_files["ROSMAP_assay_methylationArray_metadata.csv"]
df_meta_meth = pd.read_csv(meth_meta_path)
valid_meth_ids = set(df_meta_meth['specimenID'].astype(str))
print(f"Found {len(valid_meth_ids)} valid specimenIDs in Methylation metadata")
df_meth = pd.read_csv(meth_path, sep='\t', low_memory=False)
df_meth = df_meth.set_index(df_meth.columns[0])
print(f"Raw Meth shape: {df_meth.shape}")
valid_cols = []
for c in df_meth.columns:
if c in valid_meth_ids:
valid_cols.append(c)
print(f"Matched {len(valid_cols)} columns from Meth file to metadata")
df_meth = df_meth[valid_cols]
df_meth = df_meth.T
print(f"Final Cleaned Meth shape: {df_meth.shape}")
display(df_meth.head())
#Found 748 valid specimenIDs in Methylation metadata
#Raw Meth shape: (420132, 740)
#Matched 740 columns from Meth file to metadata
#Final Cleaned Meth shape: (740, 420132)
# Lastly we do the same for mirna
mirna_path = downloaded_files["ROSMAP_arraymiRNA.gct"]
mirna_meta_path = downloaded_files["ROSMAP_assay_miRNAarray_nanostring_metadata.csv"]
df_meta = pd.read_csv(mirna_meta_path)
id_map = dict(zip(df_meta['mirna_id'], df_meta['specimenID']))
with open(mirna_path, 'r') as f:
first_line = f.readline()
df_mirna = pd.read_csv(mirna_path, sep='\t', skiprows=2)
print(f"Columns found: {list(df_mirna.columns[:5])}")
if 'Name' in df_mirna.columns:
df_mirna = df_mirna.set_index('Name')
elif 'ProbeID' in df_mirna.columns:
df_mirna = df_mirna.set_index('ProbeID')
else:
print(f"not match found")
if 'Description' in df_mirna.columns:
df_mirna = df_mirna.drop(columns=['Description'])
valid_cols = [c for c in df_mirna.columns if c in id_map]
print(f"Matched {len(valid_cols)} sample columns")
df_mirna = df_mirna[valid_cols]
df_mirna = df_mirna.rename(columns=id_map)
df_mirna = df_mirna.T
print(f"Final Cleaned miRNA shape: {df_mirna.shape}")
display(df_mirna.head())
#Should see:
#Matched 525 sample columns
#Final Cleaned miRNA shape: (525, 309)
common_ids = df_rna.index.intersection(df_mirna.index)
print(f"RNA samples: {df_rna.shape[0]}")
print(f"miRNA samples: {df_mirna.shape[0]}")
print(f"Overlapping samples: {len(common_ids)}")
df_rna_aligned = df_rna.loc[common_ids]
df_mirna_aligned = df_mirna.loc[common_ids]
df_rna_aligned = df_rna_aligned.sort_index()
df_mirna_aligned = df_mirna_aligned.sort_index()
print(f"Aligned RNA Shape: {df_rna_aligned.shape}")
print(f"Aligned miRNA Shape: {df_mirna_aligned.shape}")
# verify they are identical
if df_rna_aligned.index.equals(df_mirna_aligned.index):
print("Success: Indices are perfectly aligned")
else:
print("Warning: Indices do not match perfectly")
#RNA samples: 640
#miRNA samples: 525
#Overlapping samples: 525
#Aligned RNA Shape: (527, 55889)
#Aligned miRNA Shape: (525, 309)
#Warning: Indices do not match perfectly
dupes = df_rna_aligned.index[df_rna_aligned.index.duplicated()]
print(f"IDs with multiple entries in RNA: {dupes.tolist()}")
df_rna_aligned = df_rna_aligned.groupby(level=0).mean()
print(f"\nAligned RNA Shape: {df_rna_aligned.shape}")
print(f"Aligned miRNA Shape: {df_mirna_aligned.shape}")
if df_rna_aligned.shape[0] == df_mirna_aligned.shape[0]:
print("Perfect match 525 vs 525")
#IDs with multiple entries in RNA: ['xxxxx', 'xxxxxx']
#Aligned RNA Shape: (525, 55889)
#Aligned miRNA Shape: (525, 309)
#Perfect match 525 vs 525
df_bio = pd.read_csv(downloaded_files["ROSMAP_biospecimen_metadata.csv"])
# load metadata
df_meta_rna = pd.read_csv(downloaded_files["ROSMAP_assay_RNAseq_metadata.csv"])
df_meta_meth = pd.read_csv(downloaded_files["ROSMAP_assay_methylationArray_metadata.csv"])
df_meta_mir = pd.read_csv(downloaded_files["ROSMAP_assay_miRNAarray_nanostring_metadata.csv"])
# keep just specimenID + individualID from biospecimen
bio_key = df_bio[["specimenID", "individualID"]].drop_duplicates()
# attach ID to each df
meta_rna_with_ind = df_meta_rna.merge(bio_key, on="specimenID", how="left")
meta_meth_with_ind = df_meta_meth.merge(bio_key, on="specimenID", how="left")
meta_mir_with_ind = df_meta_mir.merge(bio_key, on="specimenID", how="left")
rna_id_to_ind = (meta_rna_with_ind.drop_duplicates("specimenID").set_index("specimenID")["individualID"])
meth_id_to_ind = (meta_meth_with_ind.drop_duplicates("specimenID").set_index("specimenID")["individualID"])
mir_id_to_ind = (meta_mir_with_ind.drop_duplicates("specimenID").set_index("specimenID")["individualID"])
# RNA
rna_ind = df_rna_aligned.copy()
rna_ind["individualID"] = rna_id_to_ind.reindex(rna_ind.index).values
rna_ind = rna_ind.dropna(subset=["individualID"]).set_index("individualID")
# miRNA
mirna_ind = df_mirna_aligned.copy()
mirna_ind["individualID"] = mir_id_to_ind.reindex(mirna_ind.index).values
mirna_ind = mirna_ind.dropna(subset=["individualID"]).set_index("individualID")
# Methylation: df_meth.index is methylation specimenID
meth_ind = df_meth.copy()
meth_ind["individualID"] = meth_id_to_ind.reindex(meth_ind.index).values
meth_ind = meth_ind.dropna(subset=["individualID"]).set_index("individualID")
common_ids = sorted(
set(rna_ind.index) &
set(mirna_ind.index) &
set(meth_ind.index)
)
rna_final = rna_ind.loc[common_ids]
mirna_final = mirna_ind.loc[common_ids]
meth_final = meth_ind.loc[common_ids]
print(f"RNA shape: {rna_final.shape}")
print(f"miRNA shape: {mirna_final.shape}")
print(f"Meth shape: {meth_final.shape}")
# should see
#RNA shape: (521, 55889)
#miRNA shape: (521, 309)
#Meth shape: (521, 420132)
display(meth_final.head(2))
display(rna_final.head(2))
display(meth_final.head(2))
df_clin = pd.read_csv(downloaded_files["ROSMAP_clinical.csv"])
df_clin_ind = df_clin.set_index("individualID").loc[common_ids]
df_clin_ind.shape
# this should show a DF of shape (521,17)
df_clin = pd.read_csv(downloaded_files["ROSMAP_clinical.csv"])
df_clin_ind = df_clin.set_index("individualID")
common_ids_all = sorted(set(common_ids) & set(df_clin_ind.index))
rna_final = rna_final.loc[common_ids_all]
mirna_final = mirna_final.loc[common_ids_all]
meth_final = meth_final.loc[common_ids_all]
clin_final = df_clin_ind.loc[common_ids_all]
print(f"RNA shape: {rna_final.shape}")
print(f"miRNA shape: {mirna_final.shape}")
print(f"Meth shape: {meth_final.shape}")
print(f"Clinical shape: {clin_final.shape}")
# should see:
#RNA shape: (521, 55889)
#miRNA shape: (521, 309)
#Meth shape: (521, 420132)
#Clinical shape: (521, 17)
display(clin_final.head())
clin_final["cogdx"].value_counts()
# should see:
# cogdx
# 4.0 182
# 1.0 168
# 2.0 125
# 5.0 25
# 6.0 12
# 3.0 9
# Name: count, dtype: int64
from bioneuralnet import utils
# This is a computationally expensive step: DO IT ONCE, then save result.
print("\nConverting meth beta-values to M-values")
X_meth_m = utils.m_transform(meth_final, eps=1e-6)
print("Selecting top 100,000 by variance")
X_meth_100k = utils.variance_threshold(X_meth_m, k=100000)
print(f"X_meth_100k shape: {X_meth_100k.shape}")
# free big stuff if memory is limited
del meth_final, X_meth_m
import numpy as np
labeled_patients = clin_final.index[clin_final["cogdx"].notna()]
print("Patients with non-missing cogdx:", len(labeled_patients))
common_labeled_patients = (
set(labeled_patients)
& set(X_meth_100k.index)
& set(rna_final.index)
& set(mirna_final.index)
)
common_labeled_patients = sorted(common_labeled_patients)
print("Final labeled patients used for modeling:", len(common_labeled_patients))
X_meth = X_meth_100k.loc[common_labeled_patients]
X_rna = rna_final.loc[common_labeled_patients]
X_mirna = mirna_final.loc[common_labeled_patients]
clinical_aligned = clin_final.loc[common_labeled_patients]
cogdx_series = clinical_aligned["cogdx"]
def cogdx_to_3class(x):
if x == 1.0:
return 0
elif x in (2.0, 3.0):
return 1
elif x in (4.0, 5.0, 6.0):
return 2
else:
return np.nan
target = cogdx_series.map(cogdx_to_3class)
valid_mask = target.notna()
if not valid_mask.all():
print(f"Dropping patients with invalid cogdx codes: {(~valid_mask).sum()}")
final_patients = target.index[valid_mask]
X_meth = X_meth.loc[final_patients]
X_rna = X_rna.loc[final_patients]
X_mirna = X_mirna.loc[final_patients]
clinical_aligned = clinical_aligned.loc[final_patients]
target = target.loc[final_patients]
else:
final_patients = list(target.index)
target = target.astype("Int64")
y_df = pd.DataFrame(target.values, index=final_patients, columns=["target"])
# check index is aligned
assert X_meth.index.equals(X_rna.index)
assert X_meth.index.equals(X_mirna.index)
assert X_meth.index.equals(clinical_aligned.index)
assert X_meth.index.equals(y_df.index)
# same NAME for index in all dfs
idx_name = X_meth.index.name or "individualID"
X_meth.index.name = idx_name
X_rna.index.name = idx_name
X_mirna.index.name = idx_name
clinical_aligned.index.name = idx_name
y_df.index.name = idx_name
print("\nFinal shapes (aligned on same patients + index name):")
print(f"X_meth: {X_meth.shape}")
print(f"X_rna: {X_rna.shape}")
print(f"X_mirna: {X_mirna.shape}")
print(f"clinical:{clinical_aligned.shape}")
print(f"y_df: {y_df.shape}")
print("\nTarget value counts (0=NCI, 1=MCI, 2=Dementia):")
print(y_df['target'].value_counts())
Verify the following output before proceeding:¶
Final shapes (aligned on same patients + index name):
X_meth: (521, 100000)
X_rna: (521, 55889)
X_mirna: (521, 309)
clinical: (521, 17)
y_df: (521, 1)
Target value counts (0=NCI, 1=MCI, 2=Dementia):
2: 219
0: 168
1: 134
Name: count, dtype: Int64
# optional save in case we restart kernel
# X_meth.to_parquet("X_meth_final.parquet")
# X_rna.to_parquet("X_rna_final.parquet")
# X_mirna.to_parquet("X_mirna_final.parquet")
# clinical_aligned.to_parquet("X_clinical_final.parquet")
# y_df.to_parquet("y_df_final.parquet")
# if saved then data can be loaded this way
# X_meth = pd.read_parquet("X_meth_final.parquet")
# X_rna = pd.read_parquet("X_rna_final.parquet")
# X_mirna = pd.read_parquet("X_mirna_final.parquet")
# clinical = pd.read_parquet("X_clinical_final.parquet")
# y_df = pd.read_parquet("y_df_final.parquet")
from bioneuralnet import utils
# Use the 3-class target as a Series
Y_series = y_df["target"].astype(int)
print("\nMethylation: ANOVA F-test")
meth_af = utils.top_anova_f_features(
X_meth,
Y_series,
max_features=8000
)
print("Methylation: RandomForest")
meth_rf = utils.importance_rf(
X_meth,
Y_series,
top_k=8000
)
meth_anova_set = set(meth_af.columns)
meth_rf_set = set(meth_rf.columns)
meth_inter_cols = sorted(meth_anova_set & meth_rf_set)
print(f"Methylation intersection size: {len(meth_inter_cols)}")
# methylation final shape after feature selection
X_meth_fs = X_meth[meth_inter_cols]
print(f"X_meth_fs shape: {X_meth_fs.shape}")
print("\nRNA: ANOVA F-test")
rna_af = utils.top_anova_f_features(
X_rna,
Y_series,
max_features=8000
)
print("RNA: RandomForest")
rna_rf = utils.importance_rf(
X_rna,
Y_series,
top_k=8000
)
rna_anova_set = set(rna_af.columns)
rna_rf_set = set(rna_rf.columns)
rna_inter_cols = sorted(rna_anova_set & rna_rf_set)
print(f"RNA intersection size: {len(rna_inter_cols)}")
X_rna_fs = X_rna[rna_inter_cols]
print(f"X_rna_fs shape: {X_rna_fs.shape}")
# At this stage we have the following shapes
# X_meth: (521, 772)
# X_rna : (521, 2054)
# X_mirna: (521, 309)
# clinical: (521, 17)
# Y_labels: (521, 1)
from bioneuralnet.utils import preprocess_clinical
clinical_for_model = preprocess_clinical(
clinical_aligned,
# internal variance feature selection
top_k=8,
scale=False,
ignore_columns=["projid", "dcfdx_lv", "cogdx", "braaksc", "ceradsc", "Study", "age_first_ad_dx"],
nan_threshold=0.5)
import re
def clean_bn_name(col: str) -> str:
col = str(col)
node_clean = re.sub(r"[^0-9a-zA-Z_]", ".", col)
if not node_clean[0].isalpha():
node_clean = "X" + node_clean
return node_clean
X_meth_clean = X_meth_fs.copy()
meth_cols = []
for c in X_meth_clean.columns:
meth_cols.append(clean_bn_name(c))
X_meth_clean.columns = meth_cols
# 2. Process X_rna_clean
X_rna_clean = X_rna_fs.copy()
rna_cols = []
for c in X_rna_clean.columns:
rna_cols.append(clean_bn_name(c))
X_rna_clean.columns = rna_cols
# 3. Process X_mirna_clean
X_mirna_clean = X_mirna.copy()
mirna_cols = []
for c in X_mirna_clean.columns:
mirna_cols.append(clean_bn_name(c))
X_mirna_clean.columns = mirna_cols
omics_list = [X_meth_clean, X_rna_clean, X_mirna_clean]
omics_for_graph = pd.concat(omics_list, axis=1)
print(omics_for_graph.shape)
# Save everything in case kernel crashes
omics_for_graph.to_parquet("omics_for_graph.parquet")
X_meth_clean.to_parquet("X_meth_clean.parquet")
X_rna_clean.to_parquet("X_rna_clean.parquet")
X_mirna_clean.to_parquet("X_mirna_clean.parquet")
clinical_for_model.to_parquet("clinical_for_model.parquet")
y_df.to_parquet("y_df.parquet")
print(f"Selecting top 300 features from Methylation")
X_meth_clean3 = utils.variance_threshold(X_meth_clean, k=300)
print(f"Selecting top 300 features from RNA")
X_rna_clean3 = utils.variance_threshold(X_rna_clean, k=300)
print(f"Selecting top 300 features from miRNA")
X_mirna_clean3 = utils.variance_threshold(X_mirna_clean, k=300)
omics_for_graph_selected = pd.concat([X_meth_clean3, X_rna_clean3, X_mirna_clean3], axis=1)
new_total_features = X_meth_clean3.shape[1] + X_rna_clean3.shape[1] + X_mirna_clean3.shape[1]
from bioneuralnet.clustering import HybridLouvain
from bioneuralnet.network import similarity_network
similarity_10 = similarity_network(omics_for_graph_selected, k=10)
SEED = 1883
utils.set_seed(SEED)
# NCI vs rest
y_binary = (y_df["target"] == 0).astype(int)
hl = HybridLouvain(
G=similarity_10,
B=omics_for_graph,
Y=y_binary,
k_L=0.2,
k_P=0.8,
max_iter=5,
seed=SEED,
)
subnetworks = hl.run(as_dfs=True)
from bioneuralnet.metrics import plot_network
nci_submodule = subnetworks[3]
print(f"NCI vs Rest submodule: {nci_submodule.shape[1]} features")
nci_mapping = plot_network(nci_submodule, weight_threshold=.45, show_labels=True, show_edge_weights=False)
print(nci_mapping)
from bioneuralnet.network import NetworkAnalyzer
nci_submodule_50 = subnetworks[1]
nci_submodule_25 = subnetworks[2]
nci50_net = NetworkAnalyzer(nci_submodule_50)
nci50_net.basic_statistics()
nci25_net = NetworkAnalyzer(nci_submodule_25)
nci25_net.basic_statistics()
from bioneuralnet import DPMON
y_binary = (y_df["target"] == 0).astype(int)
# no tunning: using default parameters. ONly goal is to generate emebddings
dpmon_params_base = {
"adjacency_matrix": similarity_10,
"omics_list": omics_for_graph_selected,
"phenotype_data": y_binary,
"phenotype_col": "target",
"clinical_data": clinical_for_model,
}
dpmon_obj = DPMON(**dpmon_params_base)
_, _, embeddings = dpmon_obj.run()
# .detach().cpu().numpy() to get the raw numbers out of pytorch
GAT_embeddings_df = pd.DataFrame(
embeddings.detach().cpu().numpy(),
index=similarity_10.columns
)
display(GAT_embeddings_df.iloc[:5])
from bioneuralnet.metrics import plot_embeddings
node_labels = []
GAT_embeddings_array = GAT_embeddings_df.values
feature_names = GAT_embeddings_df.index
dna_feats = set(X_meth_clean3.columns)
rna_feats = set(X_rna_clean3.columns)
mirna_feats = set(X_mirna_clean3.columns)
for feat in feature_names:
if feat in dna_feats:
# 1 for DNA Methylation
node_labels.append(1)
elif feat in rna_feats:
# 2 for RNA
node_labels.append(2)
elif feat in mirna_feats:
# 3 for miRNA
node_labels.append(3)
else:
# 0 for everything else, just in case
node_labels.append(0)
# our plotting function needs an array to squeeze the embedings into 2Dspace
node_labels = np.array(node_labels)
plot_embeddings(GAT_embeddings_array, node_labels, legend_labels=["Background", "NCI_Meth", "NCI_RNA", "NCI_miRNA"])
References:¶
AD Knowledge Portal. Available from: AD Knowledge Portal. Data generated from postmortem brain tissue provided by the Religious Orders Study and Rush Memory and Aging Project (ROSMAP) cohort at Rush Alzheimer’s Disease Center, Rush University Medical Center, Chicago. Supported by Cure Alzheimer’s Fund and NIH grants AG058002, AG062377, NS110453, NS115064, AG062335, AG074003, NS127187, MH119509, HG008155, RF1AG062377, RF1AG054321, R01AG054012, and GM087237.