Source code for bioneuralnet.external_tools.extract_CVfold

import os
import shutil
import subprocess
import pandas as pd
from pathlib import Path

def load_r_export_folds(base_path: str, num_omics: int, k: int = 5) -> dict:
    """Load the per-fold CSV files of an R (SmCCNet) cross-validation export.

    For every fold ``i`` in ``1..k`` the directory ``<base_path>/fold_<i>`` is
    read. Inside each fold directory the following files are expected:

    * ``X_train_Omics_<j>.csv`` and ``X_test_Omics_<j>.csv`` for every
      ``j`` in ``1..num_omics``
    * ``Y_train.csv`` and ``Y_test.csv``

    All files are parsed with ``pandas.read_csv`` using its defaults, so the
    first row of each file is treated as a header. Only the first column of
    the Y files is kept.

    Args:
        base_path (str): Directory containing the ``fold_<i>`` subdirectories.
        num_omics (int): Number of omics blocks to load per fold.
        k (int): Number of cross-validation folds to load. Defaults to 5.

    Returns:
        dict: Maps each fold name (e.g. ``'fold_1'``) to a dictionary with the
        keys ``'X_train'`` and ``'X_test'`` (lists of 2-D NumPy arrays, one per
        omics block, in block order) and ``'Y_train'`` and ``'Y_test'``
        (1-D NumPy arrays). The dictionary is empty when ``k`` is less than 1.

    Raises:
        FileNotFoundError: If a fold directory is missing (or the path exists
            but is not a directory), or if a required CSV file is missing.
    """
    folddata = {}
    print(f"Loading R-exported folds from: {base_path}")

    for i in range(1, k + 1):
        fold_key = f"fold_{i}"
        fold_dir = os.path.join(base_path, fold_key)

        # isdir, not exists: a regular file named "fold_<i>" must be rejected
        # here with the documented error instead of failing later inside
        # read_csv with an unrelated NotADirectoryError.
        if not os.path.isdir(fold_dir):
            raise FileNotFoundError(f"Could not find directory: {fold_dir}")

        x_train_list = []
        x_test_list = []
        # Train and test are read pairwise per omics block so that a missing
        # file is reported for the lowest-numbered incomplete block.
        for omics_idx in range(1, num_omics + 1):
            xtrain_path = os.path.join(fold_dir, f"X_train_Omics_{omics_idx}.csv")
            xtest_path = os.path.join(fold_dir, f"X_test_Omics_{omics_idx}.csv")
            x_train_list.append(pd.read_csv(xtrain_path).to_numpy())
            x_test_list.append(pd.read_csv(xtest_path).to_numpy())

        ytrain_path = os.path.join(fold_dir, "Y_train.csv")
        ytest_path = os.path.join(fold_dir, "Y_test.csv")
        # Only the first column of the phenotype files is used.
        y_train = pd.read_csv(ytrain_path).iloc[:, 0].to_numpy()
        y_test = pd.read_csv(ytest_path).iloc[:, 0].to_numpy()

        folddata[fold_key] = {
            "X_train": x_train_list,
            "X_test": x_test_list,
            "Y_train": y_train,
            "Y_test": y_test,
        }

    print(f"Successfully loaded {k} folds.")
    return folddata
def extract_and_load_folds(output_path: str, num_omics: int = 3, k: int = 5) -> dict:
    """Run the bundled R extraction script, then load the CSV folds it wrote.

    The function locates ``Rscript`` on the system path and invokes the
    ``extract_CVfold.R`` script that sits next to this module, passing the
    resolved ``output_path`` as its single argument. The script's stdout and
    stderr are captured and echoed. On success the folds are loaded from the
    ``CV_Export`` subdirectory of ``output_path`` via `load_r_export_folds`.

    Note: the R script itself is not part of this module; it is expected to
    convert the .Rdata fold files found in ``output_path`` into the
    ``CV_Export/fold_<i>`` CSV layout that `load_r_export_folds` reads.

    Args:
        output_path (str): Directory handed to the R script; the export is
            read back from ``<output_path>/CV_Export``.
        num_omics (int): Number of omics blocks to load per fold. Defaults to 3.
        k (int): Number of cross-validation folds to load. Defaults to 5.

    Returns:
        dict: The fold dictionary produced by `load_r_export_folds`.

    Raises:
        EnvironmentError: If ``Rscript`` is not found in the system path.
        FileNotFoundError: If ``extract_CVfold.R`` is missing next to this module.
        RuntimeError: If the R script exits with a non-zero return code.
    """
    rscript_exe = shutil.which("Rscript")
    if rscript_exe is None:
        raise EnvironmentError("Rscript not found in system path.")

    data_dir = Path(output_path).resolve()
    r_script = Path(__file__).parent.joinpath("extract_CVfold.R").resolve()
    if not r_script.exists():
        raise FileNotFoundError(f"Missing required R script: {r_script}")

    command = [rscript_exe, str(r_script), str(data_dir)]
    print(f"Running Rscript command: {' '.join(command)}")
    result = subprocess.run(command, capture_output=True, text=True)
    succeeded = result.returncode == 0

    if result.stdout:
        print(f"Rscript stdout:\n{result.stdout}")

    # stderr output from a successful run is reported as informational
    # messages; only a failing run labels it as stderr.
    if result.stderr and succeeded:
        print(f"Rscript messages:\n{result.stderr}")
    elif result.stderr:
        print(f"Rscript stderr:\n{result.stderr}")

    if not succeeded:
        raise RuntimeError(f"R conversion failed with return code: {result.returncode}")

    return load_r_export_folds(
        base_path=os.path.join(str(data_dir), "CV_Export"),
        num_omics=num_omics,
        k=k,
    )