# Source code for bioneuralnet.network.generate

import torch
import numpy as np
import pandas as pd
from typing import Optional
import torch.nn.functional as F

from ..utils.logger import get_logger
logger = get_logger(__name__)


# [docs]
def similarity_network(X: pd.DataFrame, k: int = 15, metric: str = "cosine", mutual: bool = False, per_node: bool = True, self_loops: bool = False, normalize: bool = True) -> pd.DataFrame:
    """Build a k-nearest neighbors similarity graph between the columns (features) of X.

    X is transposed so that every feature becomes a node. Pairwise similarities are computed with
    cosine similarity or with a Gaussian kernel on squared Euclidean distances. The dense similarity
    matrix is then sparsified (per-node top-k or one global cutoff), optionally restricted to mutual
    edges, optionally given self-loops, and optionally row-normalized.

    Args:

        X (pd.DataFrame): Input data of shape (N, D) where N is the number of samples and D is the number of features.
        k (int): Number of neighbors to keep per node, or approximate neighbors per node when using a global cutoff. Values are clamped to the range [0, D - 1].
        metric (str): Similarity metric, compared case-insensitively. "cosine" uses cosine similarity; any other value (e.g. "euclidean") uses exp(-d^2 / (median(d^2) + 1e-8)).
        mutual (bool): If True, retain only edges where i is in the kNN of j and j is in the kNN of i.
        per_node (bool): If True, apply kNN per node; if False, apply a global threshold to keep approximately k edges per node.
        self_loops (bool): If True, set the diagonal of the adjacency matrix to 1.
        normalize (bool): If True, L1-normalize every row so non-empty rows sum to 1.

    Returns:

        pd.DataFrame: Adjacency matrix of shape (D, D) indexed by the column labels of X.

    Raises:

        TypeError: If X is not a pandas DataFrame.
        ValueError: If the generated graph does not have shape (D, D).

    """
    if not isinstance(X, pd.DataFrame):
        raise TypeError("X must be a pandas.DataFrame")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Features become the graph nodes, so work on the transposed frame.
    features = X.T
    nodes = features.index
    number_of_omics = len(nodes)
    x_torch = torch.tensor(features.values, dtype=torch.float32, device=device)

    N = x_torch.size(0)
    # A node has at most N - 1 neighbors; never let k go negative.
    k = max(0, min(k, N - 1))

    # Full similarity matrix. The metric is documented as case-insensitive.
    if str(metric).lower() == "cosine":
        X_normalized = F.normalize(x_torch, p=2, dim=1)
        S = torch.mm(X_normalized, X_normalized.t())
    else:
        D2 = torch.cdist(x_torch, x_torch).pow(2)
        median_d2 = D2.median()
        S = torch.exp(-D2 / (median_d2 + 1e-8))

    mask = torch.zeros(N, N, dtype=torch.bool, device=device)
    if per_node:
        if k > 0:
            # Exclude self explicitly instead of assuming it is the first top-k hit:
            # ties (duplicate features) or zero vectors can rank another node first.
            candidates = S.clone()
            candidates.fill_diagonal_(float("-inf"))
            _, index = torch.topk(candidates, k=k, dim=1)
            mask.scatter_(1, index, True)
    else:
        # Global cutoff over off-diagonal entries only. The cutoff is the
        # (len - k*N)-th smallest value, so roughly the k*N largest entries survive
        # (the inclusive comparison and ties may keep a few more).
        k_global = min(k * N, N * N - N)
        if k_global > 0:
            flat_off_diag = S[~torch.eye(N, dtype=torch.bool, device=device)].reshape(-1)
            kth = max(flat_off_diag.numel() - k_global, 1)
            threshold = torch.kthvalue(flat_off_diag, kth).values
            mask = S >= threshold
            mask.fill_diagonal_(False)

    # Mutual pruning: keep an edge only if it exists in both directions.
    if mutual:
        mask = torch.logical_and(mask, mask.t())

    # Apply the mask, then add self-loops.
    A = S * mask.float()
    if self_loops:
        A.fill_diagonal_(1.0)

    # Row normalization (rows that are all zero stay all zero).
    if normalize:
        A = F.normalize(A, p=1, dim=1)

    final_graph = pd.DataFrame(A.cpu().numpy(), index=nodes, columns=nodes)

    if final_graph.shape != (number_of_omics, number_of_omics):
        logger.info(
            "Please make sure your input X follows the description: "
            "A DataFrame (N, D) where N (rows) is the number of subjects/samples "
            "and D (columns) represents the multi-omics features."
        )
        raise ValueError(
            f"Generated graph shape {final_graph.shape} does not match expected "
            f"shape ({number_of_omics}, {number_of_omics})."
        )

    return final_graph



# [docs]
def correlation_network(X: pd.DataFrame, k: Optional[int] = 15, method: str = "pearson", signed: bool = True, normalize: bool = True, mutual: bool = False, per_node: bool = True, threshold: Optional[float] = None, self_loops: bool = False) -> pd.DataFrame:
    """Build a correlation-based graph between the columns (features) of X.

    X is transposed so that every feature becomes a node. Pairwise correlations are computed, mapped
    to similarity scores in [0, 1], and then sparsified with per-node kNN or a global cutoff (or left
    fully connected). Mutual pruning, self-loops, and row-normalization are applied afterwards.

    Args:

        X (pd.DataFrame): Input data of shape (N, D) where N is the number of samples and D is the number of features.
        k (int | None): Number of neighbors for sparsification; when per_node is True this is per node (clamped to [0, D - 1]), otherwise used to approximate k*D edges globally, and if None with threshold=None a fully connected graph is returned subject to self_loops.
        method (str): Correlation method, compared case-insensitively; "spearman" correlates ranks, any other value uses Pearson correlation.
        signed (bool): If True, use signed correlations mapped to [0, 1] via (C + 1)/2; if False, use absolute correlations in [0, 1].
        normalize (bool): If True, L1-normalize every row of the adjacency; if False, keep raw similarity weights.
        mutual (bool): If True, retain only edges that are present in both directions (i->j and j->i).
        per_node (bool): If True, apply kNN per node (threshold is then ignored); if False, use a global cutoff determined by k or threshold.
        threshold (float | None): Similarity cutoff; when provided and per_node is False, overrides the k-based global cutoff.
        self_loops (bool): If True, set the diagonal of the adjacency matrix to 1.

    Returns:

        pd.DataFrame: Adjacency matrix of shape (D, D) indexed by the column labels of X.

    Raises:

        TypeError: If X is not a pandas DataFrame.
        ValueError: If per_node is True, threshold is given and k is None, or if the generated graph does not have shape (D, D).

    """
    if not isinstance(X, pd.DataFrame):
        raise TypeError("X must be a pandas.DataFrame")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Features become the graph nodes, so work on the transposed frame.
    features = X.T
    nodes = features.index
    number_of_omics = len(nodes)
    x_torch = torch.tensor(features.values, dtype=torch.float32, device=device)

    N = x_torch.size(0)

    # Rank transform for Spearman, otherwise mean-center for Pearson.
    # NOTE(review): the double argsort assigns ordinal ranks, so tied values get
    # distinct ranks rather than the average rank used by textbook Spearman.
    if str(method).lower() == "spearman":
        x_ranked = x_torch.argsort(dim=1).argsort(dim=1).float()
        x_correlation = x_ranked - x_ranked.mean(dim=1, keepdim=True)
    else:
        x_correlation = x_torch - x_torch.mean(dim=1, keepdim=True)

    num = torch.mm(x_correlation, x_correlation.t())
    sum_sq = (x_correlation ** 2).sum(dim=1, keepdim=True)
    # Clamp guards against division by zero for constant (zero-variance) features.
    denom = torch.sqrt(torch.mm(sum_sq, sum_sq.t())).clamp(min=1e-8)
    C = num / denom

    # Map correlations to a similarity matrix S in [0, 1].
    if signed:
        S = (C + 1) / 2
    else:
        S = C.abs()

    mask = torch.zeros(N, N, dtype=torch.bool, device=device)

    if k is None and threshold is None:
        # Fully connected mode; the diagonal is handled by the self_loops step below.
        mask = ~torch.eye(N, dtype=torch.bool, device=device)

    elif per_node:
        if k is None:
            raise ValueError("k must be an integer when per_node is True.")
        k_neighbors = max(0, min(k, N - 1))
        if k_neighbors > 0:
            # Exclude self explicitly instead of assuming it is the first top-k hit:
            # a constant feature has C_ii = 0, and perfectly correlated features tie with self.
            candidates = S.clone()
            candidates.fill_diagonal_(float("-inf"))
            _, index = torch.topk(candidates, k=k_neighbors, dim=1)
            mask.scatter_(1, index, True)

    elif threshold is not None:
        # Explicit cutoff mode.
        mask = S >= threshold
        mask.fill_diagonal_(False)

    else:
        # k-based global cutoff over off-diagonal entries.
        k_global = min(k * N, N * N - N)
        if k_global > 0:
            flat_off_diag = S[~torch.eye(N, dtype=torch.bool, device=device)].reshape(-1)
            # kthvalue is 1-indexed: when k*N covers every possible edge the index
            # would be 0, so clamp to 1 (the smallest value, i.e. keep everything).
            kth = max(flat_off_diag.numel() - k_global, 1)
            thresh_val = torch.kthvalue(flat_off_diag, kth).values
            mask = S >= thresh_val
            mask.fill_diagonal_(False)

    if mutual:
        mask = torch.logical_and(mask, mask.t())

    W = S * mask.float()
    if self_loops:
        W.fill_diagonal_(1.0)

    # Row normalization (rows that are all zero stay all zero).
    if normalize:
        W = F.normalize(W, p=1, dim=1)

    final_graph = pd.DataFrame(W.cpu().numpy(), index=nodes, columns=nodes)

    if final_graph.shape != (number_of_omics, number_of_omics):
        logger.info(
            "Please make sure your input X follows the description: "
            "A DataFrame (N, D) where N (rows) is the number of subjects/samples "
            "and D (columns) represents the multi-omics features."
        )
        raise ValueError(
            f"Generated graph shape {final_graph.shape} does not match expected "
            f"shape ({number_of_omics}, {number_of_omics})."
        )

    return final_graph



# [docs]
def threshold_network(X: pd.DataFrame, b: float = 6.0, k: int = 15, mutual: bool = False, self_loops: bool = False, normalize: bool = True) -> pd.DataFrame:
    """Build a soft-thresholded kNN co-expression graph, similar to WGCNA-style networks.

    X is transposed so that every feature becomes a node. Absolute Pearson correlations between
    features are raised to the power b, a per-node kNN mask keeps the top-k neighbors, and the result
    is optionally restricted to mutual neighbors, given self-loops, and row-normalized.

    Args:

        X (pd.DataFrame): Input data of shape (N, D) where N is the number of samples and D is the number of features.
        b (float): Soft-threshold exponent applied to absolute correlations.
        k (int): Number of neighbors to keep per node in the kNN graph; clamped to the range [0, D - 1].
        mutual (bool): If True, retain only edges where i and j are mutual kNN neighbors.
        self_loops (bool): If True, set the diagonal of the adjacency matrix to 1.
        normalize (bool): If True, L1-normalize every row so non-empty rows sum to 1.

    Returns:

        pd.DataFrame: Adjacency matrix of shape (D, D) indexed by the column labels of X.

    Raises:

        TypeError: If X is not a pandas DataFrame.
        ValueError: If the generated graph does not have shape (D, D).

    """
    if not isinstance(X, pd.DataFrame):
        raise TypeError("X must be a pandas.DataFrame")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Features become the graph nodes, so work on the transposed frame.
    features = X.T
    nodes = features.index
    number_of_omics = len(nodes)
    x_torch = torch.tensor(features.values, dtype=torch.float32, device=device)

    N = x_torch.size(0)

    # Pearson correlation matrix.
    Xc = x_torch - x_torch.mean(dim=1, keepdim=True)
    num = torch.mm(Xc, Xc.t())
    sum_sq = (Xc ** 2).sum(dim=1, keepdim=True)
    # Clamp guards against division by zero for constant (zero-variance) features.
    denom = torch.sqrt(torch.mm(sum_sq, sum_sq.t())).clamp(min=1e-8)

    C = num / denom
    S = C.abs().pow(b)

    # torch.topk raises when asked for more entries than a row has, so cap k at
    # the number of possible neighbors.
    k_neighbors = max(0, min(k, N - 1))

    mask = torch.zeros(N, N, dtype=torch.bool, device=device)
    if k_neighbors > 0:
        # Exclude self explicitly instead of assuming it is the first top-k hit:
        # a constant feature has S_ii = 0, and perfectly correlated features tie with self.
        candidates = S.clone()
        candidates.fill_diagonal_(float("-inf"))
        _, index = torch.topk(candidates, k=k_neighbors, dim=1)
        mask.scatter_(1, index, True)

    if mutual:
        mask = torch.logical_and(mask, mask.t())

    W = S * mask.float()
    if self_loops:
        W.fill_diagonal_(1.0)

    # Row normalization (rows that are all zero stay all zero).
    if normalize:
        W = F.normalize(W, p=1, dim=1)

    final_graph = pd.DataFrame(W.cpu().numpy(), index=nodes, columns=nodes)

    if final_graph.shape != (number_of_omics, number_of_omics):
        logger.info(
            "Please make sure your input X follows the description: "
            "A DataFrame (N, D) where N (rows) is the number of subjects/samples "
            "and D (columns) represents the multi-omics features."
        )
        raise ValueError(
            f"Generated graph shape {final_graph.shape} does not match expected "
            f"shape ({number_of_omics}, {number_of_omics})."
        )

    return final_graph



# [docs]
def gaussian_knn_network(X: pd.DataFrame, k: int = 15, sigma: Optional[float] = None, mutual: bool = False, self_loops: bool = True, normalize: bool = True) -> pd.DataFrame:
    """Build a Gaussian (RBF) kNN similarity graph between the columns (features) of X.

    X is transposed so that every feature becomes a node. Squared Euclidean distances between
    features are converted to similarities with exp(-d^2 / (2 * sigma)). The graph is sparsified by
    keeping the top-k neighbors per node, optionally restricted to mutual neighbors, with optional
    self-loops and row-normalization.

    Args:

        X (pd.DataFrame): Input data of shape (N, D) where N is the number of samples and D is the number of features.
        k (int): Number of neighbors to keep per node in the kNN graph; clamped to the range [0, D - 1].
        sigma (float | None): Kernel scale that divides the squared distances (as 2 * sigma), so it should be positive; if None, the median of all pairwise squared distances is used, floored at 1e-8.
        mutual (bool): If True, retain only edges where i and j are mutual kNN neighbors.
        self_loops (bool): If True, set the diagonal of the adjacency matrix to 1.
        normalize (bool): If True, L1-normalize every row so non-empty rows sum to 1.

    Returns:

        pd.DataFrame: Adjacency matrix of shape (D, D) indexed by the column labels of X.

    Raises:

        TypeError: If X is not a pandas DataFrame.
        ValueError: If the generated graph does not have shape (D, D).

    """
    if not isinstance(X, pd.DataFrame):
        raise TypeError("X must be a pandas.DataFrame")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Features become the graph nodes, so work on the transposed frame.
    features = X.T
    nodes = features.index
    number_of_omics = len(nodes)
    x_torch = torch.tensor(features.values, dtype=torch.float32, device=device)

    N = x_torch.size(0)

    # Pairwise squared distances.
    D2 = torch.cdist(x_torch, x_torch).pow(2)

    if sigma is None:
        # The median includes the zero diagonal and can itself be 0 (e.g. with two
        # features); floor it so the kernel never divides by zero and yields NaN.
        sigma = max(D2.median().item(), 1e-8)

    # Gaussian kernel.
    S = torch.exp(-D2 / (2 * sigma))

    # torch.topk raises when asked for more entries than a row has, so cap k at
    # the number of possible neighbors.
    k_neighbors = max(0, min(k, N - 1))

    # kNN mask.
    mask = torch.zeros(N, N, dtype=torch.bool, device=device)
    if k_neighbors > 0:
        # Exclude self explicitly instead of assuming it is the first top-k hit:
        # duplicate features also have similarity 1 and can tie with self.
        candidates = S.clone()
        candidates.fill_diagonal_(float("-inf"))
        _, index = torch.topk(candidates, k=k_neighbors, dim=1)
        mask.scatter_(1, index, True)

    if mutual:
        mask = torch.logical_and(mask, mask.t())

    # Apply the mask, then add self-loops.
    W = S * mask.float()
    if self_loops:
        W.fill_diagonal_(1.0)

    # Row normalization (rows that are all zero stay all zero).
    if normalize:
        W = F.normalize(W, p=1, dim=1)

    final_graph = pd.DataFrame(W.cpu().numpy(), index=nodes, columns=nodes)

    if final_graph.shape != (number_of_omics, number_of_omics):
        logger.info(
            "Please make sure your input X follows the description: "
            "A DataFrame (N, D) where N (rows) is the number of subjects/samples "
            "and D (columns) represents the multi-omics features."
        )
        raise ValueError(
            f"Generated graph shape {final_graph.shape} does not match expected "
            f"shape ({number_of_omics}, {number_of_omics})."
        )

    return final_graph