Source code for aiqclib.classify.step5_extract_features.dataset_all
"""
This module provides the ExtractDataSetAll class, which is designed for extracting
features from Copernicus CTD (Conductivity, Temperature, and Depth) datasets.
It inherits from ExtractFeatureBase and utilizes a configuration-driven approach
to define data targets and output paths.
"""
from typing import Optional, Dict
import polars as pl
from aiqclib.common.base.config_base import ConfigBase
from aiqclib.prepare.step5_extract_features.extract_base import ExtractFeatureBase
[docs]
class ExtractDataSetAll(ExtractFeatureBase):
    """
    Classification-stage feature extractor for Copernicus CTD data.

    Concrete subclass of :class:`ExtractFeatureBase`. The only things defined
    here are the configuration identifier, the normalization role, the output
    file naming template and the columns to drop; everything else comes from
    the base class.

    :cvar expected_class_name: Identifier matched against the class name
                               given in the configuration settings.
    :vartype expected_class_name: str
    :cvar normalization_role: Set to ``"apply"``: at classification time the
                              normalization values are loaded from the file
                              written during preparation instead of being
                              re-derived from the data.
    :vartype normalization_role: str
    """

    expected_class_name: str = "ExtractDataSetAll"

    #: ``"apply"`` means normalization values are read from the file produced
    #: by the preparation stage, not recomputed from the data being classified.
    normalization_role: str = "apply"

    def __init__(
        self,
        config: ConfigBase,
        input_data: Optional[pl.DataFrame] = None,
        selected_profiles: Optional[pl.DataFrame] = None,
        selected_rows: Optional[Dict[str, pl.DataFrame]] = None,
        summary_stats: Optional[pl.DataFrame] = None,
    ) -> None:
        """
        Set up the extractor from a configuration and optional input frames.

        All arguments are forwarded unchanged (by keyword) to
        :class:`ExtractFeatureBase`; afterwards the file naming template, the
        per-target output file names and the drop-column list are set.

        :param config: Configuration instance supplying parameters and paths.
        :type config: ConfigBase
        :param input_data: Raw input data as a Polars DataFrame.
        :type input_data: Optional[pl.DataFrame]
        :param selected_profiles: DataFrame describing the selected profiles.
        :type selected_profiles: Optional[pl.DataFrame]
        :param selected_rows: Mapping from target name to the DataFrame of
                              rows selected for that target.
        :type selected_rows: Optional[Dict[str, pl.DataFrame]]
        :param summary_stats: DataFrame of statistics used for normalization
                              or scaling.
        :type summary_stats: Optional[pl.DataFrame]
        """
        super().__init__(
            config=config,
            input_data=input_data,
            selected_profiles=selected_profiles,
            selected_rows=selected_rows,
            summary_stats=summary_stats,
        )

        #: Template for exported feature file names; ``{target_name}`` is the
        #: placeholder for the target.
        self.default_file_name: str = "extracted_features_classify_{target_name}.parquet"

        # Ask the configuration for the "extract" step's file name per target,
        # handing it the template above as the default.
        resolved_names = self.config.get_target_file_names(
            step_name="extract",
            default_file_name=self.default_file_name,
        )
        #: Mapping of target name to its resolved output file path.
        self.output_file_names: Dict[str, str] = resolved_names

        #: Columns to be excluded or dropped during extraction.
        self.drop_col_names = ["profile_id", "pair_id"]