"""
This module provides the BasicValues class for extracting target
value observations from Polars DataFrames.
The class extends FeatureBase and is designed for specific data processing
needs, such as those encountered with Copernicus CTD data.
"""
from typing import Optional, Dict
import polars as pl
from aiqclib.common.base.feature_base import FeatureBase
from aiqclib.common.utils.normalization import is_scaling_type, scale_flat_columns
class BasicValues(FeatureBase):
    """
    A feature-extraction class for retrieving target values
    from Copernicus CTD data, extending :class:`FeatureBase`.

    The methods defined here start :attr:`features` from the identifier
    columns of the target's selected rows, join raw columns from
    :attr:`filtered_input` onto it, drop the join keys afterwards, and
    optionally scale :attr:`filtered_input` before extraction.
    """

    def __init__(
        self,
        target_name: Optional[str] = None,
        feature_info: Optional[Dict] = None,
        selected_profiles: Optional[pl.DataFrame] = None,
        filtered_input: Optional[pl.DataFrame] = None,
        selected_rows: Optional[Dict[str, pl.DataFrame]] = None,
        summary_stats: Optional[pl.DataFrame] = None,
    ) -> None:
        """
        Initialize an instance of BasicValues.

        All arguments are forwarded unchanged to :class:`FeatureBase`.

        :param target_name: The key identifying which target's rows to extract
                            features for from :attr:`selected_rows`.
                            Defaults to :obj:`None`.
        :type target_name: Optional[str]
        :param feature_info: A dictionary containing feature-related parameters.
                             The methods defined here read the optional
                             ``"stats_set"`` mapping (whose ``"type"`` entry names
                             the normalization type) and the ``"stats"`` entry
                             holding the scaling statistics. Other entries
                             (e.g. ``"flank_up"``) are not read by the methods
                             defined here. Defaults to :obj:`None`.
        :type feature_info: Optional[Dict]
        :param selected_profiles: A Polars DataFrame with selected profiles, typically
                                  used for further merges or lookups.
                                  Defaults to :obj:`None`.
        :type selected_profiles: Optional[pl.DataFrame]
        :param filtered_input: A potentially filtered Polars DataFrame containing
                               full observed variables. Defaults to :obj:`None`.
        :type filtered_input: Optional[pl.DataFrame]
        :param selected_rows: A dictionary mapping target names to their respective
                              DataFrames of relevant rows. Defaults to :obj:`None`.
        :type selected_rows: Optional[Dict[str, pl.DataFrame]]
        :param summary_stats: A Polars DataFrame of summary statistics
                              (unused in this subclass). Defaults to :obj:`None`.
        :type summary_stats: Optional[pl.DataFrame]
        """
        super().__init__(
            target_name=target_name,
            feature_info=feature_info,
            selected_profiles=selected_profiles,
            filtered_input=filtered_input,
            selected_rows=selected_rows,
            summary_stats=summary_stats,
        )
        # Placeholders only: no method defined in this class assigns or reads them.
        self._expanded_observations: Optional[pl.DataFrame] = None
        self._feature_wide: Optional[pl.DataFrame] = None

    def _init_features(self) -> None:
        """
        Initialize :attr:`features` by selecting core identifying columns
        from the DataFrame specified by :attr:`selected_rows[target_name]`.

        The selected columns are ``row_id`` plus the three join keys
        ``platform_code``, ``profile_no`` and ``observation_no``.
        """
        self.features = self.selected_rows[self.target_name].select(
            ["row_id", "platform_code", "profile_no", "observation_no"]
        )

    def _add_features(self, col_name: str) -> None:
        """
        Join the specified column from :attr:`filtered_input` onto :attr:`features`
        using common identifier columns.

        The join uses Polars' default join strategy on ``platform_code``,
        ``profile_no`` and ``observation_no``.

        :param col_name: The name of the column from :attr:`filtered_input` to add
                         as a new feature.
        :type col_name: str
        """
        self.features = self.features.join(
            (
                self.filtered_input.select(
                    pl.col("platform_code"),
                    pl.col("profile_no"),
                    pl.col("observation_no"),
                    pl.col(col_name),
                )
            ),
            on=["platform_code", "profile_no", "observation_no"],
            # Ask Polars to keep the row order of the left frame (features).
            maintain_order="left",
        )

    def _clean_features(self) -> None:
        """
        Drop intermediate columns from :attr:`features` that are no longer
        needed after feature extraction and joining.

        The dropped columns are the join keys ``platform_code``,
        ``profile_no`` and ``observation_no``.
        """
        self.features = self.features.drop(
            ["platform_code", "profile_no", "observation_no"]
        )

    def scale_first(self) -> None:
        """
        Apply a pre-feature-extraction scaling step on :attr:`filtered_input`.

        This normalizes each relevant raw input column according to the
        normalization type declared in ``feature_info["stats_set"]["type"]``:
        ``min_max``/``auto_min_max`` apply min-max scaling and ``standard``
        applies standard scaling, both using the values in
        :attr:`feature_info["stats"]`. ``raw`` leaves the data untouched.

        The type defaults to ``raw`` when :attr:`feature_info` is :obj:`None`
        or has no usable ``"stats_set"`` entry. Nothing is scaled unless
        ``is_scaling_type`` accepts the type and ``"stats"`` is non-empty;
        otherwise :attr:`filtered_input` is replaced by the result of
        ``scale_flat_columns``.
        """
        # feature_info defaults to None in __init__, and "stats_set" may be
        # present but None; treat both like a missing entry instead of
        # failing with AttributeError on ``.get``.
        info = self.feature_info or {}
        stats_set = info.get("stats_set") or {}
        stats_type = stats_set.get("type", "raw")
        stats = info.get("stats")
        if is_scaling_type(stats_type) and stats:
            self.filtered_input = scale_flat_columns(
                self.filtered_input, stats, stats_type
            )

    def scale_second(self) -> None:
        """
        Apply a post-feature-extraction scaling step if needed.

        This method is currently unimplemented but retains its placeholder
        for potential future additions of scaling or normalization
        after features have been pivoted and expanded.
        """
        pass  # pragma: no cover