Source code for aiqclib.prepare.features.profile_summary

"""
This module provides the ProfileSummaryStats class, which is responsible for
extracting and scaling statistical features from Polars DataFrames by merging
row-level data with summary statistics.
"""

from typing import Optional, Dict

import polars as pl

from aiqclib.common.base.feature_base import FeatureBase
from aiqclib.common.utils.normalization import is_scaling_type, scale_nested_columns


[docs] class ProfileSummaryStats(FeatureBase): """ A feature-extraction class that combines row references from :attr:`selected_rows` with summary statistics from :attr:`summary_stats`. It constructs columns of summarized metrics (e.g., min, max) for specified variables and optionally applies scaling. This class inherits from :class:`FeatureBase`, which provides a generic framework for feature extraction, including placeholders for multi-stage scaling. """ def __init__( self, target_name: Optional[str] = None, feature_info: Optional[Dict] = None, selected_profiles: Optional[pl.DataFrame] = None, filtered_input: Optional[pl.DataFrame] = None, selected_rows: Optional[Dict[str, pl.DataFrame]] = None, summary_stats: Optional[pl.DataFrame] = None, ) -> None: """ Initialize the profile summary stats feature extractor. :param target_name: The name of the target used to lookup corresponding rows in :attr:`selected_rows`. :type target_name: Optional[str] :param feature_info: A dictionary specifying feature parameters and stats. Example structure: .. code-block:: python { "stats": { "temp": { "min": {"min": 0.0, "max": 30.0}, "mean": {"min": 0.0, "max": 30.0} } }, "col_names": ["temp"], "summary_stats_names": ["min", "mean"], "stats_set": {"type": "min_max"} } :type feature_info: Optional[Dict] :param selected_profiles: A Polars DataFrame of selected profiles. :type selected_profiles: Optional[pl.DataFrame] :param filtered_input: A Polars DataFrame of potentially filtered input data. :type filtered_input: Optional[pl.DataFrame] :param selected_rows: A dictionary of DataFrames keyed by target names, containing rows for which features are extracted. :type selected_rows: Optional[Dict[str, pl.DataFrame]] :param summary_stats: A Polars DataFrame of summary statistics keyed by (platform_code, profile_no, variable). :type summary_stats: Optional[pl.DataFrame] """ super().__init__( target_name=target_name, feature_info=feature_info, selected_profiles=selected_profiles, filtered_input=filtered_input, selected_rows=selected_rows, summary_stats=summary_stats, )
[docs] def extract_features(self) -> None: """ Traverse the :attr:`feature_info` structure to assemble columns from :attr:`summary_stats`, merging them into :attr:`features`. Steps: 1. Initialize :attr:`features` via :meth:`_filter_selected_rows_cols`. 2. Join metrics from :attr:`summary_stats` for each variable/metric pair. 3. Remove join keys (platform_code, profile_no) from the final result. """ self._filter_selected_rows_cols() variables_and_metrics = [ (variable_name, metric_name) for variable_name in self.feature_info["col_names"] for metric_name in self.feature_info["summary_stats_names"] ] for variable_name, metric_name in variables_and_metrics: self._extract_single_summary(variable_name, metric_name) self.features = self.features.drop(["platform_code", "profile_no"])
def _filter_selected_rows_cols(self) -> None: """ Initialize :attr:`features` by selecting the essential columns from :attr:`selected_rows[target_name]`. """ self.features = self.selected_rows[self.target_name].select( ["row_id", "platform_code", "profile_no"] ) def _extract_single_summary(self, variable_name: str, metric_name: str) -> None: """ Join a single summary statistic from :attr:`summary_stats` onto :attr:`features`. :param variable_name: The variable category key (e.g., "temp", "psal"). :type variable_name: str :param metric_name: The specific metric key (e.g., "min", "mean", "max"). :type metric_name: str """ self.features = self.features.join( self.summary_stats.filter(pl.col("variable") == variable_name).select( pl.col("platform_code"), pl.col("profile_no"), pl.col(metric_name).alias(f"{variable_name}_{metric_name}"), ), on=["platform_code", "profile_no"], maintain_order="left", )
[docs] def scale_first(self) -> None: """ An initial scaling hook (unimplemented). """ pass # pragma: no cover
[docs] def scale_second(self) -> None: """ Scale the newly joined summary statistics based on :attr:`feature_info`. Transforms columns named ``{variable}_{metric}`` according to the normalization type in ``feature_info["stats_set"]["type"]``: ``min_max``/``auto_min_max`` apply min-max scaling and ``standard`` applies standard scaling, using the nested values in :attr:`feature_info["stats"]`. ``raw`` leaves the columns unchanged. """ stats_type = self.feature_info.get("stats_set", {}).get("type", "raw") if is_scaling_type(stats_type) and self.feature_info.get("stats"): self.features = scale_nested_columns( self.features, self.feature_info["stats"], stats_type )