# Source code for aiqclib.interface.prepare

"""
Data Preparation Pipeline Orchestrator

This module orchestrates the creation of a training dataset by sequentially
loading and processing data through multiple preparation steps. It defines
the `create_training_dataset` function, which acts as the main entry point
for initiating the multi-stage data pipeline, from raw input to final
training and validation datasets.
"""

from aiqclib.common.base.config_base import ConfigBase
from aiqclib.common.loader.dataset_loader import (
    load_step1_input_dataset,
    load_step2_summary_dataset,
    load_step3_select_dataset,
    load_step4_locate_dataset,
    load_step5_extract_dataset,
    load_step6_split_dataset,
)


def create_training_dataset(config: ConfigBase) -> None:
    """
    Run the six dataset-preparation steps in order.

    Each step obtains its stage object from the matching
    ``load_stepN_*_dataset`` loader, invokes the stage's processing
    method, and (from step 2 onwards) calls the stage's ``write_*``
    method. Results of earlier stages are handed to later loaders:

    1. Input: ``read_input_data()`` populates ``input_data``.
    2. Summary: ``calculate_stats()`` then ``write_summary_stats()``;
       built from ``input_data``.
    3. Select: ``label_profiles()`` then ``write_selected_profiles()``;
       built from ``input_data``.
    4. Locate: ``process_targets()`` then ``write_selected_rows()``;
       built from ``input_data`` and ``selected_profiles``.
    5. Extract: ``process_targets()`` then ``write_target_features()``;
       built from ``input_data``, ``selected_profiles``,
       ``selected_rows`` and ``summary_stats``.
    6. Split: ``process_targets()`` then ``write_data_sets()``;
       built from ``target_features``.

    :param config: Configuration object passed unchanged to every
                   step loader.
    :type config: aiqclib.common.base.config_base.ConfigBase
    :return: None. The function is called for its side effects only.
    :rtype: None

    :Example:

    .. code-block:: python

        from aiqclib.common.base.config_base import ConfigBase

        cfg = ConfigBase(...)
        create_training_dataset(cfg)
    """
    # Step 1 -- raw input.
    input_stage = load_step1_input_dataset(config)
    input_stage.read_input_data()

    # Step 2 -- summary statistics over the input data.
    summary_stage = load_step2_summary_dataset(config, input_stage.input_data)
    summary_stage.calculate_stats()
    summary_stage.write_summary_stats()

    # Step 3 -- profile labelling / selection.
    select_stage = load_step3_select_dataset(config, input_stage.input_data)
    select_stage.label_profiles()
    select_stage.write_selected_profiles()

    # Step 4 -- target rows within the selected profiles.
    locate_stage = load_step4_locate_dataset(
        config,
        input_stage.input_data,
        select_stage.selected_profiles,
    )
    locate_stage.process_targets()
    locate_stage.write_selected_rows()

    # Step 5 -- features, using the outputs of steps 1-4.
    extract_stage = load_step5_extract_dataset(
        config,
        input_stage.input_data,
        select_stage.selected_profiles,
        locate_stage.selected_rows,
        summary_stage.summary_stats,
    )
    extract_stage.process_targets()
    extract_stage.write_target_features()

    # Step 6 -- final split of the extracted features.
    split_stage = load_step6_split_dataset(config, extract_stage.target_features)
    split_stage.process_targets()
    split_stage.write_data_sets()