# Source code for aiqclib.prepare.step3_select_profiles.dataset_all

"""
Module for selecting and labeling oceanographic profiles based on QC flags.

This module defines the :class:`SelectDataSetAll` class, which identifies
"bad" (positive) and "good" (negative) profiles based on Quality Control (QC)
criteria and prepares a labeled dataset for machine learning applications.
"""

import operator
from functools import reduce
from typing import Optional, List

import polars as pl

from aiqclib.common.base.config_base import ConfigBase
from aiqclib.prepare.step3_select_profiles.select_base import ProfileSelectionBase


class SelectDataSetAll(ProfileSelectionBase):
    """
    Selects positive/negative profiles from Copernicus CTD data.

    This class implements a strategy for labeling oceanographic profiles as
    "positive" (bad) or "negative" (good) based on their quality control (QC)
    flags.

    :ivar expected_class_name: The expected name of the class for config
        validation.
    :vartype expected_class_name: str
    :ivar pos_profile_df: DataFrame containing positively-labeled profiles.
        ``None`` until :meth:`select_positive_profiles` has run.
    :vartype pos_profile_df: Optional[polars.DataFrame]
    :ivar neg_profile_df: DataFrame containing negatively-labeled profiles.
        ``None`` until :meth:`select_negative_profiles` has run.
    :vartype neg_profile_df: Optional[polars.DataFrame]
    :ivar key_col_names: Column names used as unique identifiers for profiles.
    :vartype key_col_names: List[str]
    """

    expected_class_name: str = "SelectDataSetAll"

    def __init__(
        self, config: ConfigBase, input_data: Optional[pl.DataFrame] = None
    ) -> None:
        """
        Initialize the selection and labeling process.

        :param config: The configuration object containing paths and QC flag
            definitions.
        :type config: aiqclib.common.base.config_base.ConfigBase
        :param input_data: A Polars DataFrame containing the full set of
            profiles.
        :type input_data: Optional[polars.DataFrame]
        """
        super().__init__(config=config, input_data=input_data)

        # Both frames stay None until the matching select_* method is called.
        self.pos_profile_df: Optional[pl.DataFrame] = None
        self.neg_profile_df: Optional[pl.DataFrame] = None

        # One distinct combination of these columns identifies one profile.
        self.key_col_names: List[str] = [
            "platform_code",
            "profile_no",
            "profile_timestamp",
            "longitude",
            "latitude",
        ]

    def select_positive_profiles(self) -> None:
        """
        Select profiles with "bad" QC flags.

        A profile is considered "positive" if any of its measurements have a
        QC flag defined as a positive flag in the configuration. For each
        target, the flag column is named by ``param["flag"]`` and the positive
        values by ``param["pos_flag_values"]`` (``[4]`` when the key is
        absent).

        Results are stored in :attr:`pos_profile_df` with a 1-based
        ``profile_id``, ``neg_profile_id`` set to 0 and ``label`` set to 1.
        """
        # Row-level predicate: the row is "bad" for at least one target.
        conditions = reduce(
            operator.or_,
            [
                pl.col(param["flag"]).is_in(param.get("pos_flag_values", [4]))
                for param in self.config.get_target_dict().values()
            ],
        )

        self.pos_profile_df = (
            self.input_data.filter(conditions)
            .select(self.key_col_names)
            # Collapse the matching observation rows to one row per profile.
            .unique()
            .sort(["platform_code", "profile_no"])
            .with_row_index("profile_id", offset=1)
            .with_columns(
                pl.lit(0, dtype=pl.UInt32).alias("neg_profile_id"),
                pl.lit(1, dtype=pl.UInt32).alias("label"),
            )
        )

    def select_negative_profiles(self) -> None:
        """
        Select profiles with consistently "good" QC flags.

        A profile is considered "negative" if, for every monitored parameter,
        no measurement carries a "bad" flag (``pos_flag_values``, default
        ``[4]``) and at least one measurement carries a "good" flag
        (``neg_flag_values``, default ``[1]``).

        ``profile_id`` numbering continues after the last positive profile,
        so the positive set must exist first. If :attr:`pos_profile_df` has
        not been computed yet, :meth:`select_positive_profiles` is invoked
        here rather than failing on a ``None`` attribute.

        Results are stored in :attr:`neg_profile_df` with ``neg_profile_id``
        and ``label`` both set to 0.
        """
        # The ID offset below depends on the number of positive profiles.
        if self.pos_profile_df is None:
            self.select_positive_profiles()

        # Per-profile predicate built from aggregations (`.any()`); it is
        # evaluated per group through `.over(...)` in the filter below.
        exprs = reduce(
            operator.and_,
            [
                (
                    ~pl.col(param["flag"])
                    .is_in(param.get("pos_flag_values", [4]))
                    .any()
                )
                & (
                    pl.col(param["flag"])
                    .is_in(param.get("neg_flag_values", [1]))
                    .any()
                )
                for param in self.config.get_target_dict().values()
            ],
        )

        self.neg_profile_df = (
            self.input_data.filter(exprs.over(self.key_col_names))
            .select(self.key_col_names)
            .unique()
            .sort(["platform_code", "profile_no"])
            # Continue numbering where the positive profiles stopped.
            .with_row_index("profile_id", offset=self.pos_profile_df.shape[0] + 1)
            .with_columns(
                pl.lit(0, dtype=pl.UInt32).alias("neg_profile_id"),
                pl.lit(0, dtype=pl.UInt32).alias("label"),
            )
        )

    def label_profiles(self) -> None:
        """
        Execute the full profile selection and labeling workflow.

        Orchestrates the identification of positive and negative profiles and
        vstacks them (positives first) into the :attr:`selected_profiles`
        attribute.
        """
        self.select_positive_profiles()
        self.select_negative_profiles()
        self.selected_profiles = self.pos_profile_df.vstack(self.neg_profile_df)