"""Utilities for generating and formatting summary statistics.
This module provides high-level functions to calculate and display summary
statistics for a given dataset file. It uses a predefined configuration
template to process the data, compute statistics at both global and
per-profile levels, and format the results for human-readable output.
"""
import io
import os
import pprint
from typing import List, Dict
import polars as pl
from aiqclib.common.config.dataset_config import DataSetConfig
from aiqclib.common.loader.dataset_loader import load_step1_input_dataset
from aiqclib.common.loader.dataset_loader import load_step2_summary_dataset
[docs]
def get_summary_stats(input_file: str, summary_type: str) -> pl.DataFrame:
    """Calculate and retrieve summary statistics from a dataset file.

    The function points the built-in ``template:data_sets`` configuration
    (entry ``dataset_0001``) at ``input_file``, reads the input data, computes
    the statistics, builds both the global and the per-profile summaries, and
    returns the one selected by ``summary_type``.

    Both arguments are validated before any configuration or data is loaded,
    so an invalid call fails immediately instead of after the full
    computation has already run.

    :param input_file: The path to the input dataset file (e.g., a TSV or Parquet file).
    :type input_file: str
    :param summary_type: The type of summary to return. Supported values are
                         "profiles" (for per-profile stats) and "all" (for global stats).
    :type summary_type: str
    :raises FileNotFoundError: If the ``input_file`` does not exist.
    :raises ValueError: If the ``summary_type`` is not a supported value.
    :return: A Polars DataFrame containing the requested summary statistics.
    :rtype: polars.DataFrame
    """
    # Single source of truth: supported summary type -> name of the attribute
    # on the summary dataset object that holds the corresponding result.
    summary_attributes = {
        "profiles": "summary_stats_profile",
        "all": "summary_stats_observation",
    }

    # Cheap argument checks first. The file check keeps precedence over the
    # summary-type check, matching the order in which errors were raised before.
    if not os.path.exists(input_file):
        raise FileNotFoundError(f"File '{input_file}' does not exist.")
    if summary_type not in summary_attributes:
        raise ValueError(f"Summary type {summary_type} is not supported.")

    # Redirect the template configuration to the caller's file: the directory
    # part becomes the input base path, the file name the input file name.
    config = DataSetConfig("template:data_sets")
    config.select("dataset_0001")
    config.data["path_info"]["input"]["base_path"] = os.path.dirname(input_file)
    config.data["input_file_name"] = os.path.basename(input_file)

    ds_input = load_step1_input_dataset(config)
    ds_input.read_input_data()

    ds_summary = load_step2_summary_dataset(config, ds_input.input_data)
    ds_summary.calculate_stats()
    # Both summaries are always built, regardless of which one is returned.
    ds_summary.create_summary_stats_observation()
    ds_summary.create_summary_stats_profile()

    return getattr(ds_summary, summary_attributes[summary_type])
def _format_with_stats_column(
    df: pl.DataFrame, variables: List[str], summary_stats: List[str]
) -> Dict:
    """Format a Polars DataFrame with a 'stats' column into a nested dictionary.

    Each row of ``df`` is expected to describe one statistic (e.g., mean, sd)
    of one variable. Rows are filtered by ``variables`` and ``summary_stats``
    and collected into a nested dictionary of the form
    ``{variable: {statistic: {"min": value, "max": value}}}``, with the
    values rounded to two decimals. A missing (``None``) "min" or "max" is
    kept as ``None`` instead of being rounded.

    :param df: The input Polars DataFrame. Expected to contain columns
               like "variable", "stats", "min", and "max".
    :type df: polars.DataFrame
    :param variables: A list of variable names to include in the output.
                      If the list is empty, all variables found in the DataFrame
                      will be included.
    :type variables: List[str]
    :param summary_stats: A list of specific statistic names (e.g., "mean", "sd", "pct25")
                          to include in the output. Only statistics present in this list
                          and in the DataFrame's "stats" column will be processed.
    :type summary_stats: List[str]
    :return: A nested dictionary representing the formatted statistics.
             Keys are variable names, and nested keys are statistic names.
    :rtype: Dict
    """

    def _rounded(value):
        # ``round(None, 2)`` raises TypeError; pass missing values through.
        return None if value is None else round(value, 2)

    # Build the lookup sets once instead of scanning the lists for every row.
    wanted_stats = set(summary_stats)
    # An empty ``variables`` list means "no variable filter".
    wanted_variables = set(variables) if variables else None

    stats_dict: Dict = {}
    for row in df.iter_rows(named=True):
        if row["stats"] not in wanted_stats:
            continue
        if wanted_variables is not None and row["variable"] not in wanted_variables:
            continue
        stats_dict.setdefault(row["variable"], {})[row["stats"]] = {
            "min": _rounded(row["min"]),
            "max": _rounded(row["max"]),
        }
    return stats_dict
def _format_without_stats_column(
    df: pl.DataFrame, variables: List[str], _: List[str]
) -> Dict:
    """Format a Polars DataFrame without a 'stats' column into a dictionary.

    Each row of ``df`` is expected to hold the "min" and "max" summary of a
    single variable, typically the 'all' (global) statistics that have no
    'stats' sub-column. Rows are filtered by ``variables`` and the values are
    rounded to two decimals. A missing (``None``) "min" or "max" is kept as
    ``None`` instead of being rounded. If a variable appears in more than one
    row, the last row wins.

    The third parameter `_` is intentionally ignored to maintain a consistent
    signature with :func:`_format_with_stats_column` when called by
    :func:`format_summary_stats`.

    :param df: The input Polars DataFrame. Expected to contain columns
               like "variable", "min", and "max".
    :type df: polars.DataFrame
    :param variables: A list of variable names to include in the output.
                      If the list is empty, all variables found in the DataFrame
                      will be included.
    :type variables: List[str]
    :param _: An unused parameter (conventionally denoted by an underscore)
              to maintain signature consistency with other formatting functions.
              This parameter is ignored in the processing logic.
    :type _: List[str]
    :return: A dictionary where keys are variable names and values are
             dictionaries containing "min" and "max" statistics for that variable.
    :rtype: Dict
    """

    def _rounded(value):
        # ``round(None, 2)`` raises TypeError; pass missing values through.
        return None if value is None else round(value, 2)

    # An empty ``variables`` list means "no variable filter"; otherwise build
    # the lookup set once instead of scanning the list for every row.
    wanted_variables = set(variables) if variables else None

    stats_dict: Dict = {}
    for row in df.iter_rows(named=True):
        if wanted_variables is not None and row["variable"] not in wanted_variables:
            continue
        stats_dict[row["variable"]] = {
            "min": _rounded(row["min"]),
            "max": _rounded(row["max"]),
        }
    return stats_dict