Source code for aiqclib.prepare.features.basic_values

"""
This module provides the BasicValues class for extracting target
value observations from Polars DataFrames.

It extends FeatureBase and is designed for specific data processing needs,
such as those encountered with Copernicus CTD data.
"""

from typing import Optional, Dict

import polars as pl

from aiqclib.common.base.feature_base import FeatureBase
from aiqclib.common.utils.normalization import is_scaling_type, scale_flat_columns



[docs]
class BasicValues(FeatureBase):
    """
    A feature-extraction class for retrieving target values
    from Copernicus CTD data, extending :class:`FeatureBase`.

    For the rows stored under :attr:`target_name` in :attr:`selected_rows`,
    :meth:`extract_features` builds :attr:`features` by joining every column
    listed in ``feature_info["col_names"]`` from :attr:`filtered_input`,
    matching on ``platform_code``, ``profile_no`` and ``observation_no``.
    Those three key columns are dropped again once all joins are done.

    :meth:`scale_first` optionally normalizes :attr:`filtered_input` before
    extraction; :meth:`scale_second` is a placeholder that does nothing.
    """

    def __init__(
        self,
        target_name: Optional[str] = None,
        feature_info: Optional[Dict] = None,
        selected_profiles: Optional[pl.DataFrame] = None,
        filtered_input: Optional[pl.DataFrame] = None,
        selected_rows: Optional[Dict[str, pl.DataFrame]] = None,
        summary_stats: Optional[pl.DataFrame] = None,
    ) -> None:
        """
        Set up a BasicValues extractor.

        Every argument is handed unchanged to :class:`FeatureBase`; this
        subclass only adds two private attributes, both starting as
        :obj:`None`.

        :param target_name: Key into :attr:`selected_rows` naming the target
                            whose rows the features are built for.
                            Defaults to :obj:`None`.
        :type target_name: Optional[str]
        :param feature_info: Feature configuration. The methods of this class
                             read ``"col_names"`` (columns to extract),
                             ``"stats_set"`` (its ``"type"`` entry selects the
                             normalization) and ``"stats"`` (values passed to
                             the scaling helper). Defaults to :obj:`None`.
        :type feature_info: Optional[Dict]
        :param selected_profiles: A Polars DataFrame of selected profiles.
                                  It is not read by any method of this class.
                                  Defaults to :obj:`None`.
        :type selected_profiles: Optional[pl.DataFrame]
        :param filtered_input: A Polars DataFrame holding the observed
                               variables from which feature columns are
                               taken. Defaults to :obj:`None`.
        :type filtered_input: Optional[pl.DataFrame]
        :param selected_rows: Mapping from target name to the DataFrame of
                              rows selected for that target.
                              Defaults to :obj:`None`.
        :type selected_rows: Optional[Dict[str, pl.DataFrame]]
        :param summary_stats: A Polars DataFrame of summary statistics
                              (unused in this subclass). Defaults to :obj:`None`.
        :type summary_stats: Optional[pl.DataFrame]
        """
        # Collect the constructor arguments once so the hand-off to the base
        # class is a single keyword expansion.
        base_kwargs = {
            "target_name": target_name,
            "feature_info": feature_info,
            "selected_profiles": selected_profiles,
            "filtered_input": filtered_input,
            "selected_rows": selected_rows,
            "summary_stats": summary_stats,
        }
        super().__init__(**base_kwargs)

        # NOTE(review): neither attribute is read by the methods visible in
        # this class; presumably kept for parity with sibling feature
        # classes - confirm before removing.
        self._expanded_observations: Optional[pl.DataFrame] = None
        self._feature_wide: Optional[pl.DataFrame] = None


[docs]
    def extract_features(self) -> None:
        """
        Build the feature table in :attr:`features` for the current target.

        The work happens in three stages:

          1. :meth:`_init_features` creates the base table holding
             ``row_id`` plus the key columns ``platform_code``,
             ``profile_no`` and ``observation_no``.
          2. :meth:`_add_features` is called once per entry of
             ``feature_info["col_names"]``, in order, joining that column
             from :attr:`filtered_input` onto the table.
          3. :meth:`_clean_features` removes the three key columns, which
             are only needed for the joins.

        :raises KeyError: If ``feature_info`` has no ``"col_names"`` entry.
        """
        self._init_features()

        requested_columns = self.feature_info["col_names"]
        for requested in requested_columns:
            self._add_features(requested)

        self._clean_features()


    def _init_features(self) -> None:
        """
        Start :attr:`features` from the rows selected for the current target.

        Looks up ``selected_rows[target_name]`` and keeps only ``row_id``
        and the three key columns later used for joining.
        """
        base_columns = ["row_id", "platform_code", "profile_no", "observation_no"]
        target_rows = self.selected_rows[self.target_name]
        self.features = target_rows.select(base_columns)

    def _add_features(self, col_name: str) -> None:
        """
        Attach one column of :attr:`filtered_input` to :attr:`features`.

        The column is looked up by the key triple ``platform_code``,
        ``profile_no`` and ``observation_no`` and the joined result replaces
        :attr:`features`.

        :param col_name: Name of the column in :attr:`filtered_input` to
                         append as a feature.
        :type col_name: str
        """
        join_keys = ["platform_code", "profile_no", "observation_no"]

        # Narrow the right-hand side to the keys plus the requested column
        # so that nothing else is carried into the feature table.
        wanted = [pl.col(name) for name in (*join_keys, col_name)]
        lookup = self.filtered_input.select(wanted)

        # NOTE(review): no ``how`` is passed, so Polars' default join
        # strategy applies (documented as "inner"), meaning rows of
        # ``features`` without a match would be dropped - confirm that this
        # is intended. ``maintain_order="left"`` keeps the row order of
        # ``features``.
        self.features = self.features.join(
            lookup,
            on=join_keys,
            maintain_order="left",
        )

    def _clean_features(self) -> None:
        """
        Remove the join-key columns from :attr:`features`.

        ``platform_code``, ``profile_no`` and ``observation_no`` are only
        needed while columns are being joined, so they are dropped here.
        """
        join_only_columns = ["platform_code", "profile_no", "observation_no"]
        self.features = self.features.drop(join_only_columns)


[docs]
    def scale_first(self) -> None:
        """
        Optionally normalize :attr:`filtered_input` before feature extraction.

        The normalization type is read from
        ``feature_info["stats_set"]["type"]`` and falls back to ``"raw"``
        when either level is missing. Scaling is applied only when
        ``is_scaling_type`` accepts that type *and* ``feature_info["stats"]``
        is present and non-empty; otherwise :attr:`filtered_input` is left
        as it is.

        The transformation is delegated to ``scale_flat_columns``, whose
        result replaces :attr:`filtered_input`.

        NOTE(review): the earlier documentation lists ``min_max`` /
        ``auto_min_max`` (min-max scaling) and ``standard`` (standard
        scaling) as the scaling types; verify against the helper module.
        """
        stats_set = self.feature_info.get("stats_set", {})
        normalization = stats_set.get("type", "raw")

        # Guard 1: nothing to do for non-scaling types such as the default.
        if not is_scaling_type(normalization):
            return

        # Guard 2: scaling needs statistics; a missing or empty entry skips it.
        stats = self.feature_info.get("stats")
        if not stats:
            return

        self.filtered_input = scale_flat_columns(
            self.filtered_input, stats, normalization
        )



[docs]
    def scale_second(self) -> None:
        """
        Placeholder for a scaling step that runs after feature extraction.

        Nothing is done here at present; the method exists so that a
        post-extraction scaling or normalization step can be added later
        without changing the calling sequence.
        """
        return None  # pragma: no cover