Source code for aiqclib.prepare.step6_split_dataset.dataset_all

"""
This module defines the `SplitDataSetAll` class, which is responsible for partitioning
feature data into training and test sets. It provides functionality for random sampling,
k-fold cross-validation index assignment, and column cleanup for Copernicus CTD datasets.
"""

from typing import Optional, Dict

import numpy as np
import polars as pl

from aiqclib.common.base.config_base import ConfigBase
from aiqclib.prepare.step6_split_dataset.split_base import SplitDataSetBase


class SplitDataSetAll(SplitDataSetBase):
    """
    A subclass of :class:`SplitDataSetBase` that splits feature data into
    training and test sets for Copernicus CTD data.

    This class performs the following tasks:

    - Randomly samples a fraction of the positive rows (``label == 1``) and,
      independently, the same fraction of the negative rows (``label == 0``)
      for the test set.
    - Keeps every row that was not sampled as the training set.
    - Assigns k-fold indices to the training set rows, separately for
      positive and negative rows.
    - Drops working columns that are not required for subsequent analysis.

    .. note::

       Positive and negative rows are handled independently in this class;
       ``pair_id`` is not used to match them. It is only removed as a working
       column by :meth:`drop_columns`.
    """

    expected_class_name: str = "SplitDataSetAll"

    def __init__(
        self,
        config: ConfigBase,
        target_features: Optional[Dict[str, pl.DataFrame]] = None,
    ) -> None:
        """
        Initialize the dataset splitting class with configuration and target features.

        :param config: A dataset configuration object that specifies paths,
                       test-set fraction, and k-fold details.
        :type config: :class:`aiqclib.common.base.config_base.ConfigBase`
        :param target_features: A dictionary mapping target names to Polars
                                DataFrames containing extracted features.
                                Defaults to None.
        :type target_features: Optional[Dict[str, pl.DataFrame]]
        """
        super().__init__(config=config, target_features=target_features)

        #: Working column names that :meth:`drop_columns` removes from the
        #: training and test sets.
        self.drop_col_names = [
            "profile_id",
            "pair_id",
        ]

    def split_test_set(self, target_name: str) -> None:
        """
        Split the specified target's DataFrame into training and test sets.

        1. A random fraction of rows labeled 1 (positive) is sampled.
        2. The same fraction of rows labeled 0 (negative) is sampled
           independently of the positive sample.
        3. Both samples are stacked (positives first) to form the test set.
        4. The rows of each label that were not sampled (anti-join on
           ``row_id``) are stacked (positives first) to form the training set.

        Both resulting frames have ``row_id`` moved to the first column. The
        sampling is not seeded here, so repeated calls give different splits.

        :param target_name: The target name identifying which DataFrame in
                            :attr:`target_features` to split.
        :type target_name: str
        """
        test_set_fraction = self.get_test_set_fraction()
        features = self.target_features[target_name]
        positives = features.filter(pl.col("label") == 1)
        negatives = features.filter(pl.col("label") == 0)

        pos_test_set = positives.sample(fraction=test_set_fraction, shuffle=True)
        neg_test_set = negatives.sample(fraction=test_set_fraction, shuffle=True)

        test_set = pos_test_set.vstack(neg_test_set)
        self.test_sets[target_name] = test_set.select(
            ["row_id", pl.all().exclude("row_id")]
        )

        # Anti-joins keep exactly the rows whose row_id was not sampled above.
        pos_training_set = positives.join(pos_test_set, on="row_id", how="anti")
        neg_training_set = negatives.join(neg_test_set, on="row_id", how="anti")

        training_set = pos_training_set.vstack(neg_training_set)
        self.training_sets[target_name] = training_set.select(
            ["row_id", pl.all().exclude("row_id")]
        )

    @staticmethod
    def _balanced_fold_ids(n_rows: int, k_fold: int) -> np.ndarray:
        """
        Build a shuffled integer array of fold ids (1..k_fold) of length ``n_rows``.

        Every fold receives ``n_rows // k_fold`` ids; the ``n_rows % k_fold``
        leftover ids go to distinct, randomly chosen folds so that fold sizes
        never differ by more than one.

        :param n_rows: Number of rows that need a fold id.
        :type n_rows: int
        :param k_fold: Number of folds.
        :type k_fold: int
        :return: Integer array of fold ids in random order.
        :rtype: numpy.ndarray
        """
        fold_labels = np.arange(1, k_fold + 1)
        # np.repeat on an integer array stays integer even when the repeat
        # count is 0; building from an empty Python list would yield float64
        # and silently turn the whole k_fold column into floats.
        base = np.repeat(fold_labels, n_rows // k_fold)
        # The remainder is always < k_fold, so sampling without replacement is
        # valid and guarantees no fold gets more than one extra row.
        extra = np.random.choice(fold_labels, n_rows % k_fold, replace=False)
        fold_ids = np.concatenate([base, extra])
        np.random.shuffle(fold_ids)
        return fold_ids

    def add_k_fold(self, target_name: str) -> None:
        """
        Assign a k-fold identifier to each row in the training set for cross-validation.

        1. Rows labeled 1 (positive) are distributed as evenly as possible
           across the configured number of folds, in random order.
        2. Rows labeled 0 (negative) are distributed the same way,
           independently of the positive rows.
        3. Both groups are stacked (positives first) and the columns
           ``k_fold``, ``row_id``, ``platform_code``, ``profile_no`` and
           ``observation_no`` are moved to the front.

        :param target_name: The target name identifying the training set
                            within :attr:`training_sets`.
        :type target_name: str
        """
        k_fold = self.get_k_fold()
        current = self.training_sets[target_name]

        pos_training_set = current.filter(pl.col("label") == 1)
        neg_training_set = current.filter(pl.col("label") == 0)

        pos_training_set = pos_training_set.with_columns(
            pl.Series(
                "k_fold",
                self._balanced_fold_ids(pos_training_set.shape[0], k_fold),
            )
        )
        neg_training_set = neg_training_set.with_columns(
            pl.Series(
                "k_fold",
                self._balanced_fold_ids(neg_training_set.shape[0], k_fold),
            )
        )

        training_set = pos_training_set.vstack(neg_training_set)

        # Reorder columns by selecting them explicitly.
        cols_to_front = [
            "k_fold",
            "row_id",
            "platform_code",
            "profile_no",
            "observation_no",
        ]
        self.training_sets[target_name] = training_set.select(
            cols_to_front + [pl.all().exclude(cols_to_front)]
        )

    def drop_columns(self, target_name: str) -> None:
        """
        Remove the working columns listed in :attr:`drop_col_names` from both
        the training and test sets of the given target.

        :param target_name: The target name identifying which training and
                            test sets to modify.
        :type target_name: str
        """
        self.training_sets[target_name] = self.training_sets[target_name].drop(
            self.drop_col_names
        )
        self.test_sets[target_name] = self.test_sets[target_name].drop(
            self.drop_col_names
        )