# Source code for bioneuralnet.external_tools.extract_CVfold

import os
import shutil
import subprocess
import pandas as pd
from pathlib import Path


# [docs]
def load_r_export_folds(base_path: str, num_omics: int, k: int = 5) -> dict:
    """Loads the specific SmCCNet directory structure exported from R.

    This function iterates through the cross-validation fold directories (fold_1, fold_2, etc.)
    and loads the associated omics CSV files and phenotype data into NumPy arrays.

    Inside each 'fold_N' directory the following files are read:
    'X_train_Omics_<j>.csv' and 'X_test_Omics_<j>.csv' for j in 1..num_omics, plus
    'Y_train.csv' and 'Y_test.csv'. Every file is parsed with the default
    `pandas.read_csv` settings; for the Y files only the first column is kept.

    Args:

        base_path (str): The base directory containing the 'fold_N' subdirectories.
        num_omics (int): The number of omics data blocks to load per fold.
        k (int): The number of cross-validation folds to load. Defaults to 5.

    Returns:

        dict: A dictionary where keys are fold names (e.g., 'fold_1') and values are
        dictionaries containing 'X_train' (list of numpy arrays), 'X_test' (list of numpy arrays),
        'Y_train' (numpy array), and 'Y_test' (numpy array).

    Raises:

        FileNotFoundError: If a required fold directory or CSV file cannot be found.
        This includes the case where 'fold_N' exists but is not a directory.

    """
    folddata = {}
    print(f"Loading R-exported folds from: {base_path}")

    for i in range(1, k + 1):
        fold_key = f"fold_{i}"
        fold_dir = os.path.join(base_path, fold_key)

        # Use isdir rather than exists: a regular file named 'fold_N' must be
        # reported as a missing fold directory here, instead of slipping through
        # and failing later with NotADirectoryError while opening the CSVs.
        if not os.path.isdir(fold_dir):
            raise FileNotFoundError(f"Could not find directory: {fold_dir}")

        x_train_list = []
        x_test_list = []

        # Omics blocks are numbered from 1 in the exported file names.
        for omics_idx in range(1, num_omics + 1):
            xtrain_path = os.path.join(fold_dir, f"X_train_Omics_{omics_idx}.csv")
            xtest_path = os.path.join(fold_dir, f"X_test_Omics_{omics_idx}.csv")

            x_train_list.append(pd.read_csv(xtrain_path).to_numpy())
            x_test_list.append(pd.read_csv(xtest_path).to_numpy())

        ytrain_path = os.path.join(fold_dir, "Y_train.csv")
        ytest_path = os.path.join(fold_dir, "Y_test.csv")

        # Only the first column of each Y file is used as the phenotype vector.
        y_train = pd.read_csv(ytrain_path).iloc[:, 0].to_numpy()
        y_test = pd.read_csv(ytest_path).iloc[:, 0].to_numpy()

        folddata[fold_key] = {
            "X_train": x_train_list,
            "X_test": x_test_list,
            "Y_train": y_train,
            "Y_test": y_test,
        }

    print(f"Successfully loaded {k} folds.")
    return folddata




# [docs]
def extract_and_load_folds(output_path: str, num_omics: int = 3, k: int = 5) -> dict:
    """Converts .Rdata fold files to CSVs with an R script and loads the result.

    The companion script 'extract_CVfold.R', located next to this module, is run
    through 'Rscript' with the resolved `output_path` as its only argument. The
    script is expected to turn 'CVFold.Rdata' and 'globalNetwork.Rdata' into a
    directory tree of CSV files. When it exits with code 0, the folds are read
    from the 'CV_Export' subdirectory of `output_path` via `load_r_export_folds`.

    Anything the R process writes to stdout or stderr is echoed with `print`.

    Args:

        output_path (str): The target directory containing the source .Rdata files.
        num_omics (int): The number of omics data blocks to process. Defaults to 3.
        k (int): The number of cross-validation folds. Defaults to 5.

    Returns:

        dict: A dictionary containing the parsed cross-validation fold data.

    Raises:

        EnvironmentError: If 'Rscript' is not found in the system path.
        FileNotFoundError: If the required 'extract_CVfold.R' script is missing.
        RuntimeError: If the R script execution fails and returns a non-zero exit code.

    """
    rscript_exe = shutil.which("Rscript")
    if rscript_exe is None:
        raise EnvironmentError("Rscript not found in system path.")

    data_dir = Path(output_path).resolve()

    # The R helper ships alongside this Python module.
    r_script = Path(__file__).parent.joinpath("extract_CVfold.R").resolve()
    if not r_script.exists():
        raise FileNotFoundError(f"Missing required R script: {r_script}")

    cmd = [rscript_exe, str(r_script), str(data_dir)]
    print(f"Running Rscript command: {' '.join(cmd)}")

    result = subprocess.run(cmd, capture_output=True, text=True)
    succeeded = result.returncode == 0

    if result.stdout:
        print(f"Rscript stdout:\n{result.stdout}")

    # stderr on a successful run is labelled as messages rather than as an error.
    if result.stderr:
        print(
            f"Rscript messages:\n{result.stderr}"
            if succeeded
            else f"Rscript stderr:\n{result.stderr}"
        )

    if not succeeded:
        raise RuntimeError(f"R conversion failed with return code: {result.returncode}")

    return load_r_export_folds(
        base_path=os.path.join(str(data_dir), "CV_Export"),
        num_omics=num_omics,
        k=k,
    )