Source code for aiqclib.common.config.yaml_templates

"""
Module providing YAML templates for dataset preparation, training,
and classification configurations. These templates can be customized
to fit various data pipeline requirements.
"""


def _get_dataset_path_info_sets() -> str:
    """
    Retrieves a YAML template string for dataset path information sets.

    This template defines common, input, and split path configurations for dataset preparation.

    :returns: A string containing the YAML template for path information sets.
    :rtype: str
    """
    return """
---
path_info_sets:
  - name: data_set_1
    common:
      base_path: /path/to/data # EDIT: Root output directory
    input:
      base_path: /path/to/input # EDIT: Directory with input files
      step_folder_name: ""
    split:
      step_folder_name: training

"""


def _get_dataset_target_sets() -> str:
    """
    Retrieves a YAML template string for dataset target variable sets.

    This template specifies variables to be processed along with their positive
    and negative quality flag values.

    :returns: A string containing the YAML template for target sets.
    :rtype: str
    """
    return """
target_sets:
  - name: target_set_1
    variables:
      - name: temp
        flag: temp_qc
        pos_flag_values: [ 4, 6, 7 ]
        neg_flag_values: [ 1 ]
      - name: psal
        flag: psal_qc
        pos_flag_values: [ 4, 6, 7 ]
        neg_flag_values: [ 1 ]
      - name: pres
        flag: pres_qc
        pos_flag_values: [ 4, 6, 7 ]
        neg_flag_values: [ 1 ]
        
"""


def _get_dataset_summary_stats_sets() -> str:
    """
    Retrieves a YAML template string for dataset summary statistics sets.

    This template defines sets of column names for which summary statistics
    (e.g., location, profile summary stats, basic values) will be calculated.

    :returns: A string containing the YAML template for summary statistics sets.
    :rtype: str
    """
    return """
summary_stats_sets:
  - name: summary_stats_set_1
    stats:
      - name: location
        col_names: [ longitude, latitude ]
      - name: profile_summary_stats
        col_names: [ temp, psal, pres ]
      - name: basic_values3
        col_names: [ temp, psal, pres ]

"""


def _get_dataset_feature_sets() -> str:
    """
    Retrieves a YAML template string for dataset feature sets.

    This template lists named sets of features that will be used in the
    dataset preparation process, such as location, day of year, profile
    summary stats, basic values, and flank features.

    :returns: A string containing the YAML template for feature sets.
    :rtype: str
    """
    return """
feature_sets:
  - name: feature_set_1
    features:
      - location
      - day_of_year
      - profile_summary_stats
      - basic_values
      - flank_up
      - flank_down

"""


def _get_dataset_feature_param_sets() -> str:
    """
    Retrieves a YAML template string for dataset feature parameter sets.

    This template defines detailed parameters for each feature, including
    statistics type, column names, conversion methods (e.g., cosine for day_of_year),
    and summary statistics names.

    :returns: A string containing the YAML template for feature parameter sets.
    :rtype: str
    """
    return """
feature_param_sets:
  - name: feature_set_1_param_set_1
    params:
      - feature: location
        stats_set: { type: raw }
        col_names: [ longitude, latitude ]
      - feature: day_of_year
        convert: cosine
        col_names: [ profile_timestamp ]
      - feature: profile_summary_stats
        stats_set: { type: raw }
        col_names: [ temp, psal, pres ]
        summary_stats_names: [ mean, median, sd, pct25, pct75 ]
      - feature: basic_values
        stats_set: { type: raw }
        col_names: [ temp, psal, pres ]
      - feature: flank_up
        flank_up: 5
        stats_set: { type: raw }
        col_names: [ temp, psal, pres ]
      - feature: flank_down
        flank_down: 5
        stats_set: { type: raw }
        col_names: [ temp, psal, pres ]

"""


def _get_dataset_feature_param_sets_full() -> str:
    """
    Retrieves a YAML template string for dataset feature parameter sets with full normalization.

    This template defines detailed parameters for each feature, specifying
    normalization types (e.g., min_max) and associated statistics sets.

    :returns: A string containing the YAML template for full feature parameter sets.
    :rtype: str
    """
    return """
feature_param_sets:
  - name: feature_set_1_param_set_1
    params:
      - feature: location
        stats_set: { type: min_max, name: location }
        col_names: [ longitude, latitude ]
      - feature: day_of_year
        convert: cosine
        col_names: [ profile_timestamp ]
      - feature: profile_summary_stats
        stats_set: { type: min_max, name: profile_summary_stats }
        col_names: [ temp, psal, pres ]
        summary_stats_names: [ mean, median, sd, pct25, pct75 ]
      - feature: basic_values
        stats_set: { type: min_max, name: basic_values3 }
        col_names: [ temp, psal, pres ]
      - feature: flank_up
        flank_up: 5
        stats_set: { type: min_max, name: basic_values3 }
        col_names: [ temp, psal, pres ]
      - feature: flank_down
        flank_down: 5
        stats_set: { type: min_max, name: basic_values3 }
        col_names: [ temp, psal, pres ]

"""


def _get_dataset_feature_stats_sets() -> str:
    """
    Retrieves a YAML template string for dataset feature statistics sets.

    This template is typically used to define methods and parameters for
    feature normalization or other statistical transformations. In this
    basic version, it's an empty placeholder.

    :returns: A string containing the YAML template for feature statistics sets.
    :rtype: str
    """
    return """
feature_stats_sets:
  - name: feature_set_1_stats_set_1

"""


def _get_dataset_feature_stats_sets_full() -> str:
    """
    Retrieves a YAML template string for dataset feature statistics sets with full normalization details.

    This template defines explicit min-max statistics for various features
    (e.g., longitude, latitude, temperature, salinity, pressure) used for normalization.

    :returns: A string containing the YAML template for full feature statistics sets.
    :rtype: str
    """
    return """
feature_stats_sets:
  - name: feature_set_1_stats_set_1
    min_max:
      - name: location
        stats: { longitude: { min: 14.5, max: 23.5 },
                 latitude: { min: 55, max: 66 } }
      - name: profile_summary_stats
        stats: { temp: { mean: { min: 0, max: 12.5 },
                         median: { min: 0, max: 15 },
                         sd: { min: 0, max: 6.5 },
                         pct25: { min: 0, max: 12 },
                         pct75: { min: 1, max: 19 } },
                 psal: { mean: { min: 2.9, max: 12 },
                         median: { min: 2.9, max: 12 },
                         sd: { min: 0, max: 4 },
                         pct25: { min: 2.5, max: 8.5 },
                         pct75: { min: 3, max: 16 } },
                 pres: { mean: { min: 24, max: 105 },
                         median: { min: 24, max: 105 },
                         sd: { min: 13, max: 60 },
                         pct25: { min: 12, max: 53 },
                         pct75: { min: 35, max: 156 } } }
      - name: basic_values3
        stats: { temp: { min: 0, max: 20 },
                 psal: { min: 0, max: 20 },
                 pres: { min: 0, max: 200 } }

"""


def _get_dataset_step_class_sets() -> str:
    """
    Retrieves a YAML template string for dataset step class sets.

    This template maps each step in the dataset preparation pipeline (e.g.,
    input, summary, select, locate, extract, split) to its corresponding
    Python class name.

    :returns: A string containing the YAML template for step class sets.
    :rtype: str
    """
    return """
step_class_sets:
  - name: data_set_step_set_1
    steps:
      input: InputDataSetA
      summary: SummaryDataSetA
      select: SelectDataSetA
      locate: LocateDataSetA
      extract: ExtractDataSetA
      split: SplitDataSetA

"""


def _get_dataset_step_class_sets_all() -> str:
    """
    Retrieves a YAML template string for dataset step class sets with 'All' variants.

    This template maps each step in the dataset preparation pipeline (e.g.,
    input, summary, select, locate, extract, split) to its 'All' variant
    Python class name, indicating a broader application or default behavior.

    :returns: A string containing the YAML template for 'All' step class sets.
    :rtype: str
    """
    return """
step_class_sets:
  - name: data_set_step_set_1
    steps:
      input: InputDataSetA
      summary: SummaryDataSetA
      select: SelectDataSetAll
      locate: LocateDataSetAll
      extract: ExtractDataSetA
      split: SplitDataSetAll

"""


def _get_dataset_step_param_sets() -> str:
    """
    Retrieves a YAML template string for dataset step parameter sets.

    This template defines optional parameters for each step in the dataset
    preparation pipeline, such as input filtering, select ratio, locate neighbors,
    and split fractions or k-fold values.

    :returns: A string containing the YAML template for step parameter sets.
    :rtype: str
    """
    return """
step_param_sets:
  - name: data_set_param_set_1
    steps:
      input: { sub_steps: { rename_columns: false,
                            filter_rows: true },
               rename_dict: { },
               filter_method_dict: { remove_years: [ 2023 ],
                                     keep_years: [ ] } }
      summary: { }
      select: { neg_pos_ratio: 5 }
      locate: { neighbor_n: 5 }
      extract: { }
      split: { test_set_fraction: 0.1,
               k_fold: 5 }

"""


def _get_dataset_step_param_sets_all() -> str:
    """
    Retrieves a YAML template string for dataset step parameter sets with 'All' variants.

    This template defines optional parameters for each step in the dataset
    preparation pipeline, specifically designed for 'All' step variants,
    often implying default or less specific configurations.

    :returns: A string containing the YAML template for 'All' step parameter sets.
    :rtype: str
    """
    return """
step_param_sets:
  - name: data_set_param_set_1
    steps:
      input: { sub_steps: { rename_columns: false,
                            filter_rows: true },
               rename_dict: { },
               filter_method_dict: { remove_years: [ 2023 ],
                                     keep_years: [ ] } }
      summary: { }
      select: { }
      locate: { }
      extract: { }
      split: { test_set_fraction: 0.1,
               k_fold: 5 }

"""


def _get_dataset_data_sets() -> str:
    """
    Retrieves a YAML template string for defining individual data sets.

    This template specifies configurations for a particular dataset, including
    its folder and input file names, and references to other configuration
    sets (e.g., path info, target, summary stats, features, step classes, and step parameters).

    :returns: A string containing the YAML template for data sets.
    :rtype: str
    """
    return """
data_sets:
  - name: dataset_0001  # EDIT: Your data set name
    dataset_folder_name: dataset_0001  # EDIT: Your output folder
    input_file_name: nrt_cora_bo_4.parquet # EDIT: Your input filename
    path_info: data_set_1
    target_set: target_set_1
    summary_stats_set: summary_stats_set_1
    feature_set: feature_set_1
    feature_param_set: feature_set_1_param_set_1
    feature_stats_set: feature_set_1_stats_set_1
    step_class_set: data_set_step_set_1
    step_param_set: data_set_param_set_1

"""


def get_config_data_set_template() -> str:
    """
    Retrieve a YAML template string for dataset preparation configurations.

    The template is the concatenation, in this order, of the sections:

    - ``path_info_sets``: common, input, and split paths.
    - ``target_sets``: which variables to process and their flags.
    - ``summary_stats_sets``: summary statistics definitions.
    - ``feature_sets``: named sets of feature extraction modules.
    - ``feature_param_sets``: parameters for each feature (basic version).
    - ``feature_stats_sets``: statistics used for normalization (basic
      version).
    - ``step_class_sets``: classes for each preparation step (input,
      summary, select, locate, extract, split).
    - ``step_param_sets``: parameters for the preparation steps.
    - ``data_sets``: dataset folders, files, and the configuration sets
      they refer to (e.g., ``step_class_set``, ``step_param_set``).

    :returns: A string containing the YAML template.
    :rtype: str
    """
    sections = (
        _get_dataset_path_info_sets(),
        _get_dataset_target_sets(),
        _get_dataset_summary_stats_sets(),
        _get_dataset_feature_sets(),
        _get_dataset_feature_param_sets(),
        _get_dataset_feature_stats_sets(),
        _get_dataset_step_class_sets(),
        _get_dataset_step_param_sets(),
        _get_dataset_data_sets(),
    )
    return "".join(sections)
def get_config_data_set_full_template() -> str:
    """
    Retrieve a YAML template string for dataset preparation configurations with normalization.

    The template is the concatenation, in this order, of the sections:

    - ``path_info_sets``: common, input, and split paths.
    - ``target_sets``: which variables to process and their flags.
    - ``summary_stats_sets``: summary statistics definitions.
    - ``feature_sets``: named sets of feature extraction modules.
    - ``feature_param_sets``: parameters for each feature (full version).
    - ``feature_stats_sets``: statistics used for normalization (full
      version).
    - ``step_class_sets``: classes for each preparation step (input,
      summary, select, locate, extract, split).
    - ``step_param_sets``: parameters for the preparation steps.
    - ``data_sets``: dataset folders, files, and the configuration sets
      they refer to (e.g., ``step_class_set``, ``step_param_set``).

    :returns: A string containing the YAML template.
    :rtype: str
    """
    sections = (
        _get_dataset_path_info_sets(),
        _get_dataset_target_sets(),
        _get_dataset_summary_stats_sets(),
        _get_dataset_feature_sets(),
        _get_dataset_feature_param_sets_full(),
        _get_dataset_feature_stats_sets_full(),
        _get_dataset_step_class_sets(),
        _get_dataset_step_param_sets(),
        _get_dataset_data_sets(),
    )
    return "".join(sections)
def get_config_data_set_all_template() -> str:
    """
    Retrieve a YAML template string for dataset preparation configurations with 'All' step variants.

    The template is the concatenation, in this order, of the sections:

    - ``path_info_sets``: common, input, and split paths.
    - ``target_sets``: which variables to process and their flags.
    - ``summary_stats_sets``: summary statistics definitions.
    - ``feature_sets``: named sets of feature extraction modules.
    - ``feature_param_sets``: parameters for each feature (basic version).
    - ``feature_stats_sets``: statistics used for normalization (basic
      version).
    - ``step_class_sets``: classes for each preparation step (input,
      summary, select, locate, extract, split), in the 'All' variant.
    - ``step_param_sets``: parameters for the preparation steps, in the
      'All' variant.
    - ``data_sets``: dataset folders, files, and the configuration sets
      they refer to (e.g., ``step_class_set``, ``step_param_set``).

    :returns: A string containing the YAML template.
    :rtype: str
    """
    sections = (
        _get_dataset_path_info_sets(),
        _get_dataset_target_sets(),
        _get_dataset_summary_stats_sets(),
        _get_dataset_feature_sets(),
        _get_dataset_feature_param_sets(),
        _get_dataset_feature_stats_sets(),
        _get_dataset_step_class_sets_all(),
        _get_dataset_step_param_sets_all(),
        _get_dataset_data_sets(),
    )
    return "".join(sections)
def get_config_train_set_template() -> str:
    """
    Retrieve a YAML template string for training configurations.

    This template includes:

    - ``path_info_sets``: the common base path and the folder name of the
      input step.
    - ``target_sets``: variables and associated flags for training.
    - ``step_class_sets``: mapping of each step (input, validate, model,
      build) to the corresponding Python class name.
    - ``step_param_sets``: optional parameters for each training step.
    - ``training_sets``: the dataset folder, the ``path_info`` used, the
      target set, and which ``step_class_set`` and ``step_param_set`` apply.

    :returns: A string containing the YAML template.
    :rtype: str
    """
    # NOTE(review): the line breaks and indentation of this literal were
    # restored from a flattened copy, following the layout of the dataset
    # templates in this module - confirm blank lines and comment spacing
    # against the upstream file.
    yaml_template = """
---
path_info_sets:
  - name: data_set_1
    common:
      base_path: /path/to/data # EDIT: Root output directory
    input:
      step_folder_name: training

target_sets:
  - name: target_set_1
    variables:
      - name: temp
        flag: temp_qc
        pos_flag_values: [ 4, 6, 7 ]
        neg_flag_values: [ 1 ]
      - name: psal
        flag: psal_qc
        pos_flag_values: [ 4, 6, 7 ]
        neg_flag_values: [ 1 ]
      - name: pres
        flag: pres_qc
        pos_flag_values: [ 4, 6, 7 ]
        neg_flag_values: [ 1 ]

step_class_sets:
  - name: training_step_set_1
    steps:
      input: InputTrainingSetA
      validate: KFoldValidation
      model: XGBoost
      build: BuildModel

step_param_sets:
  - name: training_param_set_1
    steps:
      input: { }
      validate: { k_fold: 5 }
      model: { calculate_shap: False, model_params: { scale_pos_weight: 200, n_jobs: -1 } }
      build: { }

training_sets:
  - name: training_0001 # EDIT: Your training name
    dataset_folder_name: dataset_0001 # EDIT: Your output folder
    path_info: data_set_1
    target_set: target_set_1
    step_class_set: training_step_set_1
    step_param_set: training_param_set_1
"""
    return yaml_template
def _get_classify_path_info_sets() -> str: """ Retrieves a YAML template string for classification path information sets. This template defines common, input, model, and concatenation path configurations for the classification process. :returns: A string containing the YAML template for classification path info sets. :rtype: str """ return """ --- path_info_sets: - name: data_set_1 common: base_path: /path/to/data # EDIT: Root output directory input: base_path: /path/to/input # EDIT: Directory with input files step_folder_name: "" model: base_path: /path/to/data/dataset_0001 # EDIT: Directory with model files step_folder_name: "model" concat: step_folder_name: classify # EDIT: Directory with classification results """ def _get_classify_step_class_sets() -> str: """ Retrieves a YAML template string for classification step class sets. This template maps each step in the classification pipeline (e.g., input, summary, select, locate, extract, model, classify, concat) to its corresponding Python class name, typically using 'All' variants. :returns: A string containing the YAML template for classification step class sets. :rtype: str """ return """ step_class_sets: - name: data_set_step_set_1 steps: input: InputDataSetAll summary: SummaryDataSetAll select: SelectDataSetAll locate: LocateDataSetAll extract: ExtractDataSetAll model: XGBoost classify: ClassifyAll concat: ConcatDataSetAll """ def _get_classify_step_param_sets() -> str: """ Retrieves a YAML template string for classification step parameter sets. This template defines optional parameters for each step in the classification pipeline, such as input filtering rules, and general empty parameters for other steps like summary, select, locate, extract, model, classify, and concat. :returns: A string containing the YAML template for classification step parameter sets. 
:rtype: str """ return """ step_param_sets: - name: data_set_param_set_1 steps: input: { sub_steps: { rename_columns: false, filter_rows: true }, rename_dict: { }, filter_method_dict: { remove_years: [ ], keep_years: [ 2023 ] } } summary: { } select: { } locate: { } extract: { } model: { calculate_shap: False } classify: { } concat: { } """ def _get_classification_sets() -> str: """ Retrieves a YAML template string for defining individual classification sets. This template specifies configurations for a particular classification run, including its folder and input file names, and references to other configuration sets (e.g., path info, target, summary stats, features, step classes, and step parameters). :returns: A string containing the YAML template for classification sets. :rtype: str """ return """ classification_sets: - name: classification_0001 # EDIT: Your classification name dataset_folder_name: dataset_0001 # EDIT: Your output folder input_file_name: nrt_cora_bo_4.parquet # EDIT: Your input filename path_info: data_set_1 target_set: target_set_1 summary_stats_set: summary_stats_set_1 feature_set: feature_set_1 feature_param_set: feature_set_1_param_set_1 feature_stats_set: feature_set_1_stats_set_1 step_class_set: data_set_step_set_1 step_param_set: data_set_param_set_1 """
def get_config_classify_set_template() -> str:
    """
    Retrieve a YAML template string for classification configurations.

    The template is the concatenation, in this order, of the sections:

    - ``path_info_sets``: common, input, model, and concatenation paths.
    - ``target_sets``: which variables to process and their flags.
    - ``summary_stats_sets``: summary statistics definitions.
    - ``feature_sets``: named sets of feature extraction modules.
    - ``feature_param_sets``: parameters for each feature (basic version).
    - ``feature_stats_sets``: statistics used for normalization (basic
      version).
    - ``step_class_sets``: classes for each classification step (input,
      summary, select, locate, extract, model, classify, concat).
    - ``step_param_sets``: parameters for the classification steps.
    - ``classification_sets``: dataset folders, files, and the
      configuration sets they refer to (e.g., ``step_class_set``,
      ``step_param_set``).

    :returns: A string containing the YAML template.
    :rtype: str
    """
    sections = (
        _get_classify_path_info_sets(),
        _get_dataset_target_sets(),
        _get_dataset_summary_stats_sets(),
        _get_dataset_feature_sets(),
        _get_dataset_feature_param_sets(),
        _get_dataset_feature_stats_sets(),
        _get_classify_step_class_sets(),
        _get_classify_step_param_sets(),
        _get_classification_sets(),
    )
    return "".join(sections)
def get_config_classify_set_full_template() -> str:
    """
    Retrieve a YAML template string for classification configurations with normalization.

    The template is the concatenation, in this order, of the sections:

    - ``path_info_sets``: common, input, model, and concatenation paths.
    - ``target_sets``: which variables to process and their flags.
    - ``summary_stats_sets``: summary statistics definitions.
    - ``feature_sets``: named sets of feature extraction modules.
    - ``feature_param_sets``: parameters for each feature (full version).
    - ``feature_stats_sets``: statistics used for normalization (full
      version).
    - ``step_class_sets``: classes for each classification step (input,
      summary, select, locate, extract, model, classify, concat).
    - ``step_param_sets``: parameters for the classification steps.
    - ``classification_sets``: dataset folders, files, and the
      configuration sets they refer to (e.g., ``step_class_set``,
      ``step_param_set``).

    :returns: A string containing the YAML template.
    :rtype: str
    """
    sections = (
        _get_classify_path_info_sets(),
        _get_dataset_target_sets(),
        _get_dataset_summary_stats_sets(),
        _get_dataset_feature_sets(),
        _get_dataset_feature_param_sets_full(),
        _get_dataset_feature_stats_sets_full(),
        _get_classify_step_class_sets(),
        _get_classify_step_param_sets(),
        _get_classification_sets(),
    )
    return "".join(sections)