Source code for aiqclib.classify.step5_extract_features.dataset_all

"""
This module provides the ExtractDataSetAll class, which extracts features
from Copernicus CTD (Conductivity, Temperature, and Depth) datasets.
The class inherits from ExtractFeatureBase and relies on the supplied
configuration to resolve the data targets and their output file names.
"""

from typing import Optional, Dict

import polars as pl

from aiqclib.common.base.config_base import ConfigBase
from aiqclib.prepare.step5_extract_features.extract_base import ExtractFeatureBase



[docs]
class ExtractDataSetAll(ExtractFeatureBase):
    """
    Concrete feature extractor for complete Copernicus CTD datasets.

    The class derives from :class:`ExtractFeatureBase` and only adds the
    pieces that are specific to this extractor: the class identifier, the
    normalization role, the output file naming template, and the list of
    columns to drop.

    :cvar expected_class_name: Identifier used to match this class against
                               the configuration.
    :vartype expected_class_name: str
    :cvar normalization_role: Role marker for normalization; set to
                              ``"apply"`` for this class.
    :vartype normalization_role: str
    """

    expected_class_name: str = "ExtractDataSetAll"

    #: Normalization values are not re-derived from data at classification
    #: time; they are loaded from the file written during preparation.
    normalization_role: str = "apply"

    def __init__(
        self,
        config: ConfigBase,
        input_data: Optional[pl.DataFrame] = None,
        selected_profiles: Optional[pl.DataFrame] = None,
        selected_rows: Optional[Dict[str, pl.DataFrame]] = None,
        summary_stats: Optional[pl.DataFrame] = None,
    ) -> None:
        """
        Set up the extractor from a configuration and optional input frames.

        All arguments are forwarded unchanged to the base class initializer;
        afterwards the file naming template, the resolved output file names,
        and the drop-column list are stored on the instance.

        :param config: Configuration instance supplying parameters and paths.
        :type config: ConfigBase
        :param input_data: Raw input data as a Polars DataFrame.
        :type input_data: Optional[pl.DataFrame]
        :param selected_profiles: Metadata or IDs of the selected profiles.
        :type selected_profiles: Optional[pl.DataFrame]
        :param selected_rows: Per-target DataFrames of selected rows, keyed by target name.
        :type selected_rows: Optional[Dict[str, pl.DataFrame]]
        :param summary_stats: Statistics used for normalization or scaling.
        :type summary_stats: Optional[pl.DataFrame]
        """
        # Collect the constructor arguments once and hand them to the base
        # class by keyword, so the call does not depend on parameter order.
        base_arguments = {
            "config": config,
            "input_data": input_data,
            "selected_profiles": selected_profiles,
            "selected_rows": selected_rows,
            "summary_stats": summary_stats,
        }
        super().__init__(**base_arguments)

        #: Fallback template for exported feature file names.
        self.default_file_name: str = "extracted_features_classify_{target_name}.parquet"

        #: Output file name per target, as resolved by the configuration
        #: for the "extract" step.
        resolved_names: Dict[str, str] = self.config.get_target_file_names(
            step_name="extract",
            default_file_name=self.default_file_name,
        )
        self.output_file_names: Dict[str, str] = resolved_names

        #: Names of the columns to exclude or drop during extraction.
        self.drop_col_names = ["profile_id", "pair_id"]