Source code for mth5.tables.fc_table

# -*- coding: utf-8 -*-
"""
Tabulate Fourier coefficients stored in an MTH5 file.

This module provides a small utility for summarizing Fourier-coefficient
datasets (e.g., `FCChannel`) into a structured table and exporting
to a convenient `pandas.DataFrame` for querying and analysis.

Notes
-----
- A basic test for this module exists under
    ``mth5/tests/version_1/test_fcs.py``.
- The table is populated by traversing the HDF5 hierarchy and collecting
    entries for datasets labeled with the attribute ``mth5_type='FCChannel'``.

"""

from __future__ import annotations

import h5py
import numpy as np

# =============================================================================
# Imports
# =============================================================================
import pandas as pd

from mth5 import FC_DTYPE
from mth5.tables import MTH5Table


# =============================================================================


[docs] class FCSummaryTable(MTH5Table): """ Summary table for Fourier coefficients. This class wraps an HDF5 dataset that stores a summary of Fourier coefficient datasets and provides convenience functions such as `summarize()` (to populate the table) and `to_dataframe()` (to export entries). Examples -------- Populate and export a summary from an existing MTH5 file:: >>> import h5py >>> from mth5.tables.fc_table import FCSummaryTable >>> f = h5py.File('example.mth5', 'r') >>> # Assume the summary dataset already exists at this path >>> table_ds = f['Exchange']['FC_Summary'] >>> fc_table = FCSummaryTable(table_ds) >>> fc_table.summarize() # walk the file and fill entries >>> df = fc_table.to_dataframe() >>> df.head() """ def __init__(self, hdf5_dataset: h5py.Dataset) -> None: super().__init__(hdf5_dataset, FC_DTYPE)
[docs] def to_dataframe(self) -> pd.DataFrame: """ Convert the table to a `pandas.DataFrame` for easier querying. Returns ------- pandas.DataFrame A dataframe with decoded string columns and parsed start/end timestamps. Examples -------- Export to a dataframe and filter by component:: >>> df = fc_table.to_dataframe() >>> df[df.component == 'ex'] """ if self.array is None: raise ValueError("Summary table dataset is not initialized.") df = pd.DataFrame(self.array[()]) for key in [ "survey", "station", "run", "component", "measurement_type", "units", ]: setattr(df, key, getattr(df, key).str.decode("utf-8")) try: df.start = pd.to_datetime(df.start.str.decode("utf-8"), format="mixed") df.end = pd.to_datetime(df.end.str.decode("utf-8"), format="mixed") except ValueError: df.start = pd.to_datetime(df.start.str.decode("utf-8")) df.end = pd.to_datetime(df.end.str.decode("utf-8")) return df
[docs] def summarize(self) -> None: """ Populate the summary table by traversing the HDF5 hierarchy. The traversal searches for datasets with attribute ``mth5_type == 'FCChannel'`` and adds a corresponding summary row for each. Returns ------- None Notes ----- - If the table contains rows from a different OS/encoding, row insertion can raise a `ValueError`. A warning is logged and processing continues for subsequent rows. Examples -------- Refresh the table entries:: >>> fc_table.clear_table() >>> fc_table.summarize() """ def recursive_get_fc_entry( group: h5py.Group | h5py.File | h5py.Dataset, ) -> None: """Recursively collect FC summary entries from the hierarchy.""" if isinstance(group, (h5py.Group, h5py.File)): for key, node in group.items(): recursive_get_fc_entry(node) elif isinstance(group, h5py.Dataset): try: ch_type = group.attrs["mth5_type"] if ch_type in [ "FCChannel", ]: fc_entry = _get_fc_entry(group) try: self.add_row(fc_entry) except ValueError as error: msg = ( f"{error}. " "it is possible that the OS that made the table is not the OS operating on it." ) self.logger.warning(msg) except KeyError: pass self.clear_table() # self.fc_entries = [] if self.array is None or getattr(self.array, "parent", None) is None: raise ValueError("Summary table dataset parent is not available.") parent = self.array.parent # Allow Mock objects and dictionaries for testing, in addition to h5py types if not ( isinstance(parent, (h5py.Group, h5py.File, h5py.Dataset)) or hasattr(parent, "items") or isinstance(parent, dict) ): raise TypeError("Unexpected parent type for summary dataset.") recursive_get_fc_entry(parent)
# for row in self.fc_entries: # try: # self.add_row(row) # except Exception as ee: # msg = f"Failed due to unknown exception {e}" # self.logger.warning(msg) # return def _get_fc_entry( group: h5py.Dataset, dtype: np.dtype | None = FC_DTYPE, ) -> np.ndarray: """ Build a single FC summary table row. Parameters ---------- group : h5py._hl.dataset.Dataset The HDF5 dataset representing a Fourier-coefficient channel (i.e., with attribute ``mth5_type='FCChannel'``). dtype : numpy.dtype, optional The dtype describing the summary table schema. Defaults to :data:`mth5.FC_DTYPE`. Returns ------- numpy.ndarray A 1-row structured array matching the summary table schema. Examples -------- Create a row for an existing FC dataset:: >>> fc_ds = f['Survey']['station']['run']['FC']['ex'] >>> row = _get_fc_entry(fc_ds) >>> row.dtype == FC_DTYPE True """ def _as_bytes(value: object) -> bytes: try: if isinstance(value, np.ndarray): value = value.item() if value.shape == () else value[0] except Exception: pass if isinstance(value, bytes): return value return str(value).encode("utf-8") fc_entry = np.array( [ ( _as_bytes( group.parent.parent.parent.parent.parent.parent.attrs["id"] ), # get survey from FCChannel _as_bytes( group.parent.parent.parent.parent.attrs["id"] ), # get station from FCChannel group.parent.parent.attrs["id"], # get run from FCChannel group.parent.attrs[ "decimation_level" ], # get decimation_level from FCChannel group.parent.parent.parent.parent.attrs["location.latitude"], group.parent.parent.parent.parent.attrs["location.longitude"], group.parent.parent.parent.parent.attrs["location.elevation"], group.attrs["component"], group.attrs["time_period.start"], group.attrs["time_period.end"], group.size, group.attrs["sample_rate_window_step"], group.attrs["mth5_type"], # group.attrs["measurement_azimuth"], # DO NOT go to the time series to access this info # group.attrs["measurement_tilt"], # the time series may not be in the mth5 # TODO: add azimuth and tilt on FCChannel creation group.attrs["units"], group.ref, group.parent.ref, group.parent.parent.ref, group.parent.parent.parent.parent.ref, ) ], dtype=dtype, ) return fc_entry