# Source code for aiqclib.classify.step6_classify_dataset.dataset_all

"""
This module defines the ClassifyAll class, a specialized implementation
of BuildModelBase designed for applying classification models across
multiple targets. It manages configuration, output file naming, and the
collection of per-target results for a classification workflow; its
model-building hooks are placeholders because no training happens here.
"""

from typing import Optional, Dict

import polars as pl

from aiqclib.common.base.config_base import ConfigBase
from aiqclib.train.step4_build_model.build_model_base import BuildModelBase



[docs]
class ClassifyAll(BuildModelBase):
    """
    Run per-target models against classification data sets and collect
    the outcome of each run.

    This is a :class:`BuildModelBase` subclass for the ``"classify"`` step.
    No training takes place here: :meth:`build` and :meth:`build_final_model`
    are intentionally empty, and :meth:`test` is the only method doing work.

    :attr:`expected_class_name` is ``"ClassifyAll"``; it must match the
    ``base_class`` entry of the YAML configuration when the class is
    instantiated through that framework.
    """

    expected_class_name: str = "ClassifyAll"

    def __init__(
        self,
        config: ConfigBase,
        test_sets: Optional[Dict[str, pl.DataFrame]] = None,
    ) -> None:
        """
        Set up file-name templates, resolved output names and column lists.

        .. warning::
           The parent :class:`BuildModelBase` is always initialised with
           ``training_sets=None``; this class never receives training data
           through its constructor.

        :param config: Configuration object; it is forwarded to the parent
                       class and queried for target-specific file names.
        :type config: ConfigBase
        :param test_sets: Data to classify, keyed by target name, one Polars
                          DataFrame per target. Defaults to None.
        :type test_sets: Optional[Dict[str, pl.DataFrame]]
        """
        super().__init__(
            config=config,
            training_sets=None,
            test_sets=test_sets,
            step_name="classify",
        )

        #: File-name templates for every kind of classification output;
        #: ``{target_name}`` is a placeholder for the target.
        self.default_file_names: Dict[str, str] = {
            "report": "classify_report_{target_name}.tsv",
            "prediction": "classify_prediction_{target_name}.parquet",
            "model_scores": "classify_model_scores_{target_name}.parquet",
            "shap_value": "classify_shap_values_{target_name}.parquet",
            "metric_plot": "classify_metric_plots_{target_name}.svg",
        }
        #: File-name template for the stored model of a target.
        self.default_model_file_name: str = "model_{target_name}.joblib"

        #: For each output kind in :attr:`default_file_names` (``"report"``,
        #: ``"prediction"``, ...), the file names that the configuration
        #: resolves for the ``"classify"`` step.
        resolved_outputs: Dict[str, Dict[str, str]] = {}
        for output_kind, template in self.default_file_names.items():
            resolved_outputs[output_kind] = self.config.get_target_file_names(
                "classify", template
            )
        self.output_file_names: Dict[str, Dict[str, str]] = resolved_outputs

        #: Model file names resolved by the configuration for the ``"model"``
        #: step, requested with ``use_dataset_folder=False``.
        self.model_file_names: Dict[str, str] = self.config.get_target_file_names(
            step_name="model",
            default_file_name=self.default_model_file_name,
            use_dataset_folder=False,
        )

        #: Identifier columns removed from a test set before it is handed
        #: to the model.
        self.drop_cols = ["row_id", "platform_code", "profile_no", "observation_no"]

        #: Columns taken from the original test set and placed in front of
        #: the predictions: the identifier columns followed by ``"label"``.
        self.test_cols = [*self.drop_cols, "label"]

    def build(self, target_name: str) -> None:  # pragma: no cover
        """
        Do nothing; classification does not train a model.

        :param target_name: The name of the target variable (unused).
        :type target_name: str
        """

    def build_final_model(self, target_name: str) -> None:  # pragma: no cover
        """
        Do nothing; classification does not train a final model.

        :param target_name: The name of the target variable (unused).
        :type target_name: str
        """

    def test(self, target_name: str) -> None:
        """
        Run the model of one target on its test set and record the results.

        The method proceeds as follows:

          1. Makes ``models[target_name]`` the current :attr:`base_model`.
          2. Clears the model's ``model_score`` so that repeated calls do
             not accumulate duplicated scores.
          3. Copies the target's skip-evaluation setting from the
             configuration onto the model.
          4. Hands the model ``test_sets[target_name]`` without the columns
             listed in :attr:`drop_cols` and calls the model's ``test()``.
          5. Stores the model's ``model_score`` in ``model_scores`` under
             the target name, even when it is ``None``.
          6. Stores the model's ``shap_values`` in ``shap_values`` only when
             they are not ``None``.
          7. Horizontally concatenates the :attr:`test_cols` columns of the
             original test set with the model's predictions and stores the
             result in ``predictions``.
          8. Stores the model's ``report`` in ``reports``.

        The ``models``, ``model_scores``, ``shap_values``, ``predictions``
        and ``reports`` containers are not created in this class; presumably
        they are provided by :class:`BuildModelBase` (confirm there).

        NOTE(review): :attr:`test_cols` always contains ``"label"``. Confirm
        that test sets of label-free targets still carry that column,
        otherwise the column selection in step 7 cannot succeed.

        :param target_name: The target variable name, used as the key into
                            ``models`` and ``test_sets`` and for all stored
                            results.
        :type target_name: str
        """
        self.base_model = self.models[target_name]
        model = self.base_model

        # Start from a clean score table so a second run does not duplicate rows.
        model.model_score = None

        # Label-free targets skip label creation / performance evaluation.
        model.skip_evaluation = self.config.get_skip_evaluation(target_name)

        full_test_set = self.test_sets[target_name]
        model.test_set = full_test_set.drop(self.drop_cols)
        model.test()

        # Register the target unconditionally (the value may be None for
        # label-free targets) so that downstream writers/plots see a stable
        # key set and can skip None entries.
        self.model_scores[target_name] = model.model_score

        shap = model.shap_values
        if shap is not None:
            self.shap_values[target_name] = shap

        predicted = model.predictions
        identifiers = full_test_set.select(self.test_cols)
        self.predictions[target_name] = pl.concat(
            [identifiers, predicted], how="horizontal"
        )

        self.reports[target_name] = model.report