Source code for aiqclib.train.step2_validate_model.kfold_validation_suite

"""
This module provides the KFoldValidationSuite class, an implementation of
k-fold cross-validation tailored for validating multiple ML algorithms
simultaneously via the ModelSuite class.
"""

from typing import Optional, List, Dict
import copy

import polars as pl

from aiqclib.common.base.config_base import ConfigBase
from aiqclib.train.step2_validate_model.validate_base import ValidationBase



[docs]
class KFoldValidationSuite(ValidationBase):
    """
    A subclass of :class:`ValidationBase` that performs k-fold cross-validation
    on training sets across multiple machine learning methods provided by a
    model suite (e.g., :class:`ModelSuite`).

    For every method exposed through ``base_model.method_objs``, one model is
    built and tested per fold. Results are accumulated under composite keys of
    the form ``{method}_{target_name}`` so that outputs are stored and saved
    uniquely per method.

    The configured base model must expose a truthy ``multi`` attribute;
    otherwise construction raises :class:`ValueError`.
    """

    # Class-name identifier for this validation implementation.
    # NOTE(review): presumably compared against the class name given in the
    # config by the base class -- confirm in ValidationBase.
    expected_class_name: str = "KFoldValidationSuite"

    def __init__(
        self,
        config: ConfigBase,
        training_sets: Optional[Dict[str, pl.DataFrame]] = None,
    ) -> None:
        """
        Set up k-fold validation for a multi-method model suite.

        After the parent initializer has run, the base model is checked for
        the ``multi`` flag, the output file-name patterns are redefined so
        that they carry a ``{method}`` placeholder, and SHAP is switched off
        on the base model.

        :param config: A training configuration object containing
                       model parameters, file paths, and other
                       validation settings.
        :type config: ConfigBase
        :param training_sets: Mapping from target name to a Polars DataFrame
                              of labeled data. Each DataFrame must contain a
                              column named ``k_fold``. Defaults to None.
        :type training_sets: Optional[Dict[str, pl.DataFrame]]
        :raises ValueError: If the configured base model does not have the
                            ``multi`` flag set to True.
        """
        super().__init__(config=config, training_sets=training_sets)

        # Only suite-style base models (those flagged ``multi``) are accepted.
        is_suite = getattr(self.base_model, "multi", False)
        if not is_suite:
            raise ValueError(
                "KFoldValidationSuite requires a base model with 'multi=True' "
                "(e.g., ModelSuite), but received a standard model class."
            )

        # File-name patterns that carry both the method and the target name.
        self.default_file_names: Dict[str, str] = dict(
            report="validation_report_{method}_{target_name}.tsv",
            model_scores="model_scores_{method}_{target_name}.parquet",
            metric_plot="metric_plots_{method}_{target_name}.svg",
        )

        # Ask the config for the per-target file names of every output type
        # (report, model_scores, metric_plot). The returned names still hold
        # the {method} placeholder; ``validate`` substitutes it per method.
        per_output_names: Dict[str, Dict[str, str]] = {}
        for output_kind, pattern in self.default_file_names.items():
            per_output_names[output_kind] = self.config.get_target_file_names(
                step_name="validate", default_file_name=pattern
            )
        self.output_file_names: Dict[str, Dict[str, str]] = per_output_names

        #: The default number of folds if none is specified in the config.
        self.default_k_fold: int = 10

        # Columns removed from the data before it is handed to a model.
        self.drop_cols: List[str] = [
            "k_fold",
            "row_id",
            "platform_code",
            "profile_no",
            "observation_no",
        ]

        # SHAP is disabled on the base model for validation runs.
        self.base_model.set_enable_shap(False)


[docs]
    def get_k_fold(self) -> int:
        """
        Return the number of folds used for cross-validation.

        The value is read from the ``k_fold`` entry of the ``validate`` step
        parameters in the config. When that entry is missing or falsy
        (e.g. ``None`` or ``0``), :attr:`default_k_fold` is returned instead.

        :return: The number of folds for k-fold cross-validation.
        :rtype: int
        """
        validate_params = self.config.get_step_params("validate")
        configured = validate_params.get("k_fold", self.default_k_fold)
        if not configured:
            # Present but empty/zero: fall back to the default.
            return self.default_k_fold
        return configured



[docs]
    def validate(self, target_name: str) -> None:
        """
        Run k-fold cross-validation for one target across every method held
        by the base model.

        For each entry of ``base_model.method_objs``:
          1. A deep copy of the method object is made for every fold.
          2. The copy is built on all rows whose ``k_fold`` value differs
             from the current fold number (folds are numbered from 1).
          3. The copy is tested on the rows belonging to the current fold.
          4. The per-fold reports (and model scores, when present) are
             concatenated and stored under the composite key
             ``{method}_{target_name}``.
          5. ``output_file_names`` receives an entry for the composite key
             in which the ``{method}`` placeholder has been substituted.

        :param target_name: The identifier for which target dataset to validate.
        :type target_name: str
        """
        n_folds: int = self.get_k_fold()

        for method_name, method_obj in self.base_model.method_objs.items():
            # Lower-cased short name; the dict key is used when the method
            # object has no ``short_name`` attribute.
            method_tag = getattr(method_obj, "short_name", method_name).lower()

            # Composite key (e.g. "xgb_temp") under which all results of this
            # method/target pair are stored.
            comp_key = f"{method_tag}_{target_name}"

            self.models[comp_key] = []
            fold_reports: List[pl.DataFrame] = []
            fold_scores: List[pl.DataFrame] = []

            for fold_id in range(1, n_folds + 1):
                # Each fold works on its own independent copy of the method.
                fold_model = copy.deepcopy(method_obj)
                fold_model.k = fold_id

                # Build on every row outside the current fold.
                train_rows = self.training_sets[target_name].filter(
                    pl.col("k_fold") != fold_id
                )
                fold_model.training_set = train_rows.drop(self.drop_cols)
                fold_model.build()
                self.models[comp_key].append(fold_model)

                # Test on the rows of the current fold.
                held_out_rows = self.training_sets[target_name].filter(
                    pl.col("k_fold") == fold_id
                )
                fold_model.test_set = held_out_rows.drop(self.drop_cols)
                fold_model.test()
                fold_reports.append(fold_model.report)

                fold_score = fold_model.model_score
                if fold_score is not None:
                    fold_scores.append(fold_score)

            # Aggregate the per-fold results under the composite key.
            self.reports[comp_key] = pl.concat(fold_reports)
            if fold_scores:
                self.model_scores[comp_key] = pl.concat(fold_scores)

            # The entry stored under ``target_name`` keeps its {method}
            # placeholder; a resolved copy is added under the composite key.
            for output_kind in ("report", "model_scores", "metric_plot"):
                names_for_kind = self.output_file_names[output_kind]
                names_for_kind[comp_key] = names_for_kind[target_name].replace(
                    "{method}", method_tag
                )