Source code for aiqclib.common.utils.file

"""
This module provides utility functions for reading various file formats into Polars DataFrames.

It supports common data formats like Parquet, TSV (tab-separated values), and CSV
(comma-separated values), including their gzipped versions, and allows for automatic
file type inference based on file extensions.
"""

import os
from typing import Dict, Any, Optional

import polars as pl


def read_input_file(
    input_file: str,
    file_type: Optional[str] = None,
    options: Optional[Dict[str, Any]] = None,
) -> pl.DataFrame:
    """
    Read an input file into a Polars DataFrame.

    Supported formats are Parquet, TSV and CSV; the ``.gz`` variants of TSV
    and CSV are handed to ``pl.read_csv`` unchanged, which is relied upon to
    handle the compression.

    :param input_file: The full path to the file to be read.
    :type input_file: str
    :param file_type: The file format. Must be one of:

        - "parquet"
        - "tsv"
        - "tsv.gz"
        - "csv"
        - "csv.gz"

        The value is matched case-insensitively and surrounding whitespace is
        ignored. If set to None or an empty string, the file type is inferred
        from the file extension. Defaults to None.
    :type file_type: Optional[str]
    :param options: A dictionary of additional keyword arguments to pass to
        the Polars reading function (e.g., "has_header",
        "infer_schema_length"). For TSV input the separator defaults to a tab;
        a "separator" entry supplied here takes precedence over that default.
        The dictionary itself is never modified. Defaults to None.
    :type options: Optional[Dict[str, Any]]
    :raises FileNotFoundError: If the specified ``input_file`` does not exist.
    :raises ValueError: If the file type cannot be inferred or is not supported.
    :returns: A Polars DataFrame containing the contents of the file.
    :rtype: pl.DataFrame

    Example usage (assuming the files exist)::

        df = read_input_file("data.parquet")
        df2 = read_input_file(
            "data.tsv.gz", file_type="tsv.gz", options={"has_header": True}
        )
    """
    if not os.path.exists(input_file):
        raise FileNotFoundError(f"File '{input_file}' does not exist.")

    # Work on a copy: a default is added below for TSV input and the caller's
    # dictionary must not be changed as a side effect.
    read_options: Dict[str, Any] = dict(options) if options else {}

    # Inference is case-insensitive (the filename is lower-cased), so treat an
    # explicit file_type the same way instead of rejecting e.g. "CSV".
    kind = file_type.strip().lower() if isinstance(file_type, str) else file_type

    # Infer file type based on file extension if not provided.
    if not kind:
        filename = os.path.basename(input_file).lower()
        # Each supported type name is exactly its extension without the dot.
        for suffix in (".parquet", ".tsv.gz", ".tsv", ".csv.gz", ".csv"):
            if filename.endswith(suffix):
                kind = suffix[1:]
                break
        else:
            raise ValueError(
                "Could not infer file type automatically. "
                "Please specify 'file_type' explicitly."
            )

    # Read the file using the appropriate Polars function.
    if kind == "parquet":
        return pl.read_parquet(input_file, **read_options)
    if kind in ("tsv", "tsv.gz"):
        # Passing separator="\t" next to **options raised TypeError whenever
        # options already contained "separator"; apply it as a default instead.
        read_options.setdefault("separator", "\t")
        return pl.read_csv(input_file, **read_options)
    if kind in ("csv", "csv.gz"):
        return pl.read_csv(input_file, **read_options)

    raise ValueError(
        f"Unsupported file_type '{file_type}'. Must be one of: "
        "'parquet', 'tsv', 'tsv.gz', 'csv', 'csv.gz'."
    )