"""
This module defines a feature extraction class, DayOfYearFeat,
that calculates the day of the year from timestamps.
It is designed to be part of a larger feature engineering pipeline,
extending the FeatureBase class to derive temporal features,
specifically the day-of-year, and optionally apply a sinusoidal
transformation for cyclical encoding.
"""
from typing import Optional, Dict
import numpy as np
import polars as pl
from aiqclib.common.base.feature_base import FeatureBase
class DayOfYearFeat(FeatureBase):
    """
    A feature-extraction class that derives day-of-year features
    from Copernicus CTD data.

    The feature is based on the ``profile_timestamp`` column and is stored in
    a ``day_of_year`` column of :attr:`features`. The scaling methods defined
    here only rescale an existing ``day_of_year`` column; they expect
    :attr:`features` to be populated before they are called.

    NOTE(review): the step that actually builds ``day_of_year`` from
    ``profile_timestamp`` is not among the methods visible in this class
    body -- confirm where :attr:`features` is created.
    """

    # Divisor of the cyclical encoding: ``(day_of_year - 1) / 364`` is the
    # fraction of a full turn, so day 1 maps to angle 0 and day 365 to 2*pi.
    _CYCLE_DIVISOR = 364

    def __init__(
        self,
        target_name: Optional[str] = None,
        feature_info: Optional[Dict] = None,
        selected_profiles: Optional[pl.DataFrame] = None,
        filtered_input: Optional[pl.DataFrame] = None,
        selected_rows: Optional[Dict[str, pl.DataFrame]] = None,
        summary_stats: Optional[pl.DataFrame] = None,
    ) -> None:
        """
        Initialize the day-of-year feature extraction process.

        All arguments are forwarded unchanged, by keyword, to
        :class:`FeatureBase`.

        :param target_name: The name of the target variable used to index
                            :attr:`selected_rows`. Defaults to None.
        :type target_name: Optional[str]
        :param feature_info: A dictionary describing feature parameters,
                             which may include a "convert" key
                             ("sine" or "cosine") selecting the cyclical
                             transformation applied by :meth:`scale_second`.
                             Defaults to None.
        :type feature_info: Optional[Dict]
        :param selected_profiles: A Polars DataFrame containing a subset
                                  of profiles relevant to feature extraction.
                                  Defaults to None.
        :type selected_profiles: Optional[pl.DataFrame]
        :param filtered_input: (Unused in this feature class) A filtered Polars
                               DataFrame of input data for advanced merging.
                               Defaults to None.
        :type filtered_input: Optional[pl.DataFrame]
        :param selected_rows: A dictionary of target-specific DataFrames, each
                              containing rows relevant to that target.
                              Defaults to None.
        :type selected_rows: Optional[Dict[str, pl.DataFrame]]
        :param summary_stats: (Unused in this feature class) A Polars DataFrame
                              containing statistical information for potential
                              scaling. Defaults to None.
        :type summary_stats: Optional[pl.DataFrame]
        :return: None
        :rtype: None
        """
        super().__init__(
            target_name=target_name,
            feature_info=feature_info,
            selected_profiles=selected_profiles,
            filtered_input=filtered_input,
            selected_rows=selected_rows,
            summary_stats=summary_stats,
        )

    def scale_first(self) -> None:
        """
        (Optional) Perform the initial scaling step.

        This is intentionally a no-op: no transformation is applied to the
        day-of-year values at this stage. It can be extended for outlier
        removal or other domain-specific logic.

        :return: None
        :rtype: None
        """
        pass  # pragma: no cover

    def scale_second(self) -> None:
        """
        Optionally apply a sine or cosine transformation to the day-of-year values.

        If :attr:`feature_info` is not None and contains a ``"convert"`` key
        whose value is ``"sine"`` or ``"cosine"``, the matching method
        (:meth:`convert_sine` or :meth:`convert_cosine`) is called, mapping
        each day-of-year value to a cyclical feature in the range [0, 1].

        In every other case (no :attr:`feature_info`, no ``"convert"`` key,
        or an unrecognised value) the features are left untouched and no
        error is raised.

        :return: None
        :rtype: None
        """
        dispatcher = {"sine": self.convert_sine, "cosine": self.convert_cosine}

        info = self.feature_info
        if info is None or "convert" not in info:
            return

        # Unknown conversion names are deliberately ignored (best-effort).
        converter = dispatcher.get(info.get("convert"))
        if converter is not None:
            converter()

    @staticmethod
    def _day_of_year_angle() -> pl.Expr:
        """
        Build the Polars expression for the angle (in radians) of each day.

        The expression is ``(day_of_year - 1) * 2 * pi / 364`` and is shared
        by :meth:`convert_sine` and :meth:`convert_cosine` so both encodings
        always use the same phase and period.

        :return: An expression rooted at the ``day_of_year`` column.
        :rtype: pl.Expr
        """
        # Operator order is kept exactly as in the original formula so the
        # floating-point results are bit-for-bit unchanged.
        return (
            (pl.col("day_of_year") - 1)
            * 2
            * np.pi
            / DayOfYearFeat._CYCLE_DIVISOR
        )

    def convert_sine(self) -> None:
        r"""
        Apply a sine transformation to the day-of-year values.

        The transformation formula used is:

        .. math::

            day\_of\_year_{transformed} =
            \frac{\sin((day\_of\_year - 1) \cdot 2 \cdot \pi / 364) + 1}{2}

        The expression carries no alias, so the result replaces the existing
        ``day_of_year`` column of :attr:`features`.

        NOTE(review): assuming ``day_of_year`` is 1-based, day 1 and day 365
        map to the same angle (0 and 2*pi) and day 366 of a leap year goes
        slightly past a full turn -- confirm this period is intended.

        :return: None
        :rtype: None
        """
        self.features = self.features.with_columns(
            (self._day_of_year_angle().sin() + 1) / 2
        )

    def convert_cosine(self) -> None:
        r"""
        Apply a cosine transformation to the day-of-year values.

        The transformation formula used is:

        .. math::

            day\_of\_year_{transformed} =
            \frac{\cos((day\_of\_year - 1) \cdot 2 \cdot \pi / 364) + 1}{2}

        The expression carries no alias, so the result replaces the existing
        ``day_of_year`` column of :attr:`features`.

        NOTE(review): assuming ``day_of_year`` is 1-based, day 1 and day 365
        map to the same angle (0 and 2*pi) and day 366 of a leap year goes
        slightly past a full turn -- confirm this period is intended.

        :return: None
        :rtype: None
        """
        self.features = self.features.with_columns(
            (self._day_of_year_angle().cos() + 1) / 2
        )