Source code for bioneuralnet.metrics.correlation

import numpy as np
import pandas as pd
from typing import Tuple

from scipy.stats import pearsonr
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

from bioneuralnet.utils.logger import get_logger
logger = get_logger(__name__)


[docs]
def omics_correlation(omics: pd.DataFrame, pheno: pd.DataFrame) -> Tuple[float, float]:
    """Computes the Pearson correlation between the first principal component of omics data and a phenotype.

    Args:

        omics (pd.DataFrame): Omics data with rows as samples and columns as features.
        pheno (pd.DataFrame): Phenotype data. Expected to have a single column.

    Returns:

        Tuple[float, float]: Pearson correlation coefficient and p-value.
    """
    target = pheno.squeeze()

    logger.info("Computing Pearson correlation coefficient.")

    if omics.empty or target.empty:
        logger.error("Omics data and phenotype must not be empty.")
        raise ValueError("Omics data and phenotype must not be empty.")

    if omics.shape[0] != len(target):
        logger.error("Number of rows in omics data and phenotype must be the same.")
        raise ValueError("Omics data and phenotype must have the same length.")

    scaler = StandardScaler()
    scaled = scaler.fit_transform(omics)
    pca = PCA(n_components=1)
    pc1 = pca.fit_transform(scaled).flatten()
    target_values = target.values

    corr, pvalue = pearsonr(pc1, target_values)

    return corr, pvalue



[docs]
def cluster_pca_correlation(cluster_df: pd.DataFrame, pheno: pd.DataFrame) -> tuple:
    """Computes the Pearson correlation coefficient between PC1 of a cluster and phenotype.

    Args:

        cluster_df (pd.DataFrame): DataFrame representing a cluster of samples.
        pheno (pd.DataFrame): DataFrame representing the phenotype.

    Returns:

        tuple: (cluster_size, correlation) or (size, None) if correlation fails.
    """
    cluster_size = cluster_df.shape[1]

    if cluster_size < 2:
        logger.info(f"Cluster with size {cluster_size} skipped (not enough features).")
        return (cluster_size, None)

    subset = cluster_df.fillna(0)

    if subset.var().sum() == 0:
        logger.warning("Cluster skipped: all features have zero variance.")
        return (cluster_size, None)

    try:
        pca = PCA(n_components=1)
        pc1 = pca.fit_transform(subset)
        pc1_series = pd.Series(pc1.flatten(), index=subset.index, name="PC1")

        pheno_series = pheno.iloc[:, 0]
        pc1_series, pheno_series = pc1_series.align(pheno_series, join="inner")

        if len(pc1_series) < 3:
            logger.warning("Not enough data points for Pearson correlation.")
            return (cluster_size, None)

        corr, _ = pearsonr(pc1_series, pheno_series)

    except Exception as e:
        logger.error(f"Error computing correlation: {e}")
        corr = None

    return (cluster_size, corr)



[docs]
def cluster_correlation(louvain_cluster: pd.DataFrame) -> pd.DataFrame:
    """Builds a new correlation network from an extracted cluster/subnetwork.

    This function is often used as an intermediate step between HybridLouvain and plotting.
    It allows the subnetwork to reveal its internal topological structure by computing the Pearson correlation of node connectivity profiles.

    Args:

        louvain_cluster (pd.DataFrame): Represents an induced subnetwork.

    Returns:

        pd.DataFrame: Adjacency matrix.
    """
    adjacency_matrix = louvain_cluster.corr(method="pearson")
    vals = adjacency_matrix.to_numpy().copy()
    np.fill_diagonal(vals, 0)
    adjacency_matrix = pd.DataFrame(vals, index=adjacency_matrix.index, columns=adjacency_matrix.columns)
    adjacency_matrix = adjacency_matrix.fillna(0)

    return adjacency_matrix