Source code for mth5.groups.standards

# -*- coding: utf-8 -*-
"""
Created on Wed Dec 23 17:05:33 2020

:copyright:
    Jared Peacock (jpeacock@usgs.gov)

:license: MIT

"""

# =============================================================================
# Imports
# =============================================================================
from __future__ import annotations

import inspect
from typing import Any, Optional

import numpy as np
from mt_metadata import timeseries
from mt_metadata.base import BaseDict
from mt_metadata.timeseries import filters
from mt_metadata.utils.summarize import summarize_standards
from mt_metadata.utils.validators import validate_attribute

from mth5 import STANDARDS_DTYPE
from mth5.groups.base import BaseGroup
from mth5.tables import MTH5Table
from mth5.utils.exceptions import MTH5TableError


def _classes_by_name(module):
    """Return a ``{class name: class}`` mapping of the classes found in *module*."""
    return dict(inspect.getmembers(module, inspect.isclass))


# Lookup tables used to instantiate metadata classes by name:
# one for ``mt_metadata.timeseries`` and one for its ``filters`` subpackage.
ts_classes = _classes_by_name(timeseries)
flt_classes = _classes_by_name(filters)
# =============================================================================
# Summarize standards
# =============================================================================
def summarize_metadata_standards() -> BaseDict:
    """
    Summarize metadata standards into a dictionary.

    Instantiates each supported timeseries metadata class and each supported
    filter class with default arguments, and adds a copy of the instance's
    ``_attr_dict`` to a single :class:`BaseDict` under a per-class key.

    Returns
    -------
    BaseDict
        Dictionary holding the attribute definitions of the Survey, Station,
        Run, Electric, Magnetic and Auxiliary classes (keyed by the lower-case
        name) and of the Coefficient, FIR, FrequencyResponseTable, PoleZero
        and TimeDelay filter classes (keyed by ``validate_attribute`` applied
        to the ``<Name>Filter`` class name).

    Notes
    -----
    Each ``_attr_dict`` is copied before it is added so that the class
    definitions themselves are not modified when entries are added to the
    summary.

    Examples
    --------
    >>> standards = summarize_metadata_standards()
    """
    timeseries_levels = (
        "survey",
        "station",
        "run",
        "electric",
        "magnetic",
        "auxiliary",
    )
    filter_prefixes = (
        "Coefficient",
        "FIR",
        "FrequencyResponseTable",
        "PoleZero",
        "TimeDelay",
    )

    standards = BaseDict()

    # Timeseries classes are looked up by their capitalized level name.
    for level in timeseries_levels:
        instance = ts_classes[level.capitalize()]()
        standards.add_dict(instance._attr_dict.copy(), level)

    # Filter classes are all named "<Prefix>Filter".
    for prefix in filter_prefixes:
        class_name = prefix + "Filter"
        instance = flt_classes[class_name]()
        standards.add_dict(instance._attr_dict.copy(), validate_attribute(class_name))

    return standards
# =============================================================================
# Standards Group
# =============================================================================
class StandardsGroup(BaseGroup):
    """
    Container for metadata standards documentation stored in the HDF5 file.

    Stores the metadata standards in a summary table (an HDF5 dataset named
    ``summary`` inside this group) so that users can look up what a metadata
    attribute means directly from the file without external documentation.

    Attributes
    ----------
    summary_table : MTH5Table
        The standards summary table wrapped with ``STANDARDS_DTYPE``.

    Notes
    -----
    The default modules summarized by :meth:`get_standards_summary` are
    listed in ``self._modules``.

    Examples
    --------
    >>> with MTH5('survey.mth5') as mth5_obj:
    ...     standards = mth5_obj.standards_group
    ...     summary = standards.summary_table
    ...     print(summary.array.dtype.names)

    Get information about a specific attribute:

    >>> standards.get_attribute_information('survey.release_license')
    """

    def __init__(self, group: Any, **kwargs: Any) -> None:
        """
        Initialize StandardsGroup.

        Parameters
        ----------
        group : h5py.Group
            HDF5 group to manage standards data.
        **kwargs : Any
            Additional keyword arguments passed to BaseGroup.
        """
        super().__init__(group, **kwargs)

        # Parameters used to create the summary dataset in initialize_group.
        self._defaults_summary_attrs = {
            "name": "summary",
            "max_shape": (1000,),
            "dtype": STANDARDS_DTYPE,
        }

        # Default modules passed to summarize_standards.
        self._modules = [
            "common",
            "timeseries",
            "timeseries.filters",
            "transfer_functions.tf",
            "features",
            "features.weights",
            "processing",
            "processing.fourier_coefficients",
            "processing.aurora",
        ]

    @property
    def summary_table(self) -> MTH5Table:
        """Standards summary table; a new wrapper is built on every access."""
        return self._get_summary_table()

    def _get_summary_table(self) -> MTH5Table:
        """
        Get the standards summary table from HDF5.

        Returns
        -------
        MTH5Table
            The MTH5Table object wrapping the ``summary`` dataset of this
            group with ``STANDARDS_DTYPE``.
        """
        return MTH5Table(self.hdf5_group["summary"], STANDARDS_DTYPE)

    def get_attribute_information(self, attribute_name: str) -> None:
        """
        Print detailed information about a metadata attribute.

        Looks up ``attribute_name`` in the ``attribute`` column of the
        summary table and prints every remaining column of the matching row
        as ``name: value``, one per line, below a header with the attribute
        name.

        Parameters
        ----------
        attribute_name : str
            Name of the attribute to describe
            (e.g., 'survey.release_license').

        Raises
        ------
        MTH5TableError
            If the attribute is not found in the standards summary table.

        Examples
        --------
        >>> standards = mth5_obj.standards_group
        >>> standards.get_attribute_information('survey.release_license')
        """
        # Build the table wrapper once and reuse it for lookup and read.
        table = self.summary_table
        find = table.locate("attribute", attribute_name)
        if len(find) == 0:
            msg = f"Could not find {attribute_name} in standards."
            self.logger.error(msg)
            raise MTH5TableError(msg)

        meta_item = table.array[find]
        lines = ["", attribute_name, "-" * (len(attribute_name) + 4)]
        # Skip the first column: it is the attribute name already printed.
        for name, value in zip(meta_item.dtype.names[1:], meta_item.item()[1:]):
            if isinstance(value, (bytes, np.bytes_)):
                value = value.decode()
            label = f"{name}:"
            lines.append(f"\t{label:<14} {value}")
        print("\n".join(lines))

    def summary_table_from_dict(self, summary_dict: dict[str, Any]) -> None:
        """
        Populate summary table from a dictionary of metadata standards.

        Each ``key: v_dict`` item becomes one row: ``key`` fills the first
        column and ``v_dict`` is indexed by the remaining column names of the
        summary table dtype.

        Parameters
        ----------
        summary_dict : dict[str, Any]
            Dictionary of metadata standards. Keys are attribute names,
            values are dictionaries containing an entry for every column of
            the summary table after the first.

        Raises
        ------
        KeyError
            If a value dictionary lacks one of the table's column names.

        Notes
        -----
        Processes dictionary values:

        - Lists are converted to comma-separated strings (empty list -> "")
        - None values become empty strings

        An empty ``summary_dict`` adds no rows and is not an error.

        TODO
        ----
        Adapt method to accept pandas.DataFrame as alternative input.

        Examples
        --------
        >>> standards = StandardsGroup(group)
        >>> metadata = summarize_metadata_standards()
        >>> standards.summary_table_from_dict(metadata)
        """
        # Build the wrapper once instead of on every column access.
        table = self.summary_table
        table_dtype = table.dtype
        column_names = table_dtype.names[1:]

        n_rows = 0
        for key, v_dict in summary_dict.items():
            row = [key]
            for column in column_names:
                value = v_dict[column]
                if value is None:
                    value = ""
                elif isinstance(value, list):
                    value = ",".join(f"{item}" for item in value)
                row.append(value)

            table.add_row(np.array([tuple(row)], table_dtype))
            n_rows += 1

        self.logger.debug(f"Added {n_rows} rows to Standards Group")

    def get_standards_summary(self, modules: Optional[list[str]] = None) -> np.ndarray:
        """
        Get standards for specified metadata modules.

        Calls ``summarize_standards`` once per module with
        ``output_type="array"`` and ``dtype=STANDARDS_DTYPE`` and
        concatenates the results.

        Parameters
        ----------
        modules : list[str], optional
            List of module names to include (e.g., 'timeseries').
            If None, uses the default modules stored in ``self._modules``.
            Default is None.

        Returns
        -------
        np.ndarray
            Concatenation of the per-module arrays, in the order given.

        Examples
        --------
        >>> standards = StandardsGroup(group)
        >>> ts_standards = standards.get_standards_summary(['timeseries'])

        Get all default modules:

        >>> all_standards = standards.get_standards_summary()
        """
        if modules is None:
            modules = self._modules

        summaries = [
            summarize_standards(module, output_type="array", dtype=STANDARDS_DTYPE)
            for module in modules
        ]
        return np.concatenate(summaries)

    def summary_table_from_array(self, array: np.ndarray) -> None:
        """
        Populate summary table from a numpy structured array.

        Parameters
        ----------
        array : np.ndarray
            Structured numpy array (expected to match STANDARDS_DTYPE).
            Each element is added to the summary table as one row.

        Notes
        -----
        Iterates over the elements with ``np.nditer`` and adds them
        sequentially with ``add_row()``. An empty array adds no rows;
        it is handled up front because ``np.nditer`` refuses zero-sized
        operands by default.

        Examples
        --------
        >>> standards = StandardsGroup(group)
        >>> standards_array = standards.get_standards_summary()
        >>> standards.summary_table_from_array(standards_array)
        """
        if np.size(array) == 0:
            self.logger.debug("Added 0 rows to Standards Group")
            return

        summary_table = self._get_summary_table()
        n_rows = 0
        for row in np.nditer(array):
            summary_table.add_row(row)
            n_rows += 1

        self.logger.debug(f"Added {n_rows} rows to Standards Group")

    def initialize_group(self) -> None:
        """
        Initialize the standards group and create the summary table.

        Notes
        -----
        Initialization process:

        1. Creates an empty, resizable ``summary`` dataset using the name,
           maximum shape and dtype stored in ``_defaults_summary_attrs``.
           ``dataset_options`` are forwarded to ``create_dataset`` only when
           a compression is configured.
        2. Sets the dataset attributes ``type``, ``last_updated`` and
           ``reference``.
        3. Populates the table with the standards of all default modules.
        4. Writes the group metadata via ``write_metadata()``.

        Examples
        --------
        >>> standards_group.initialize_group()
        >>> summary_table = standards_group.summary_table
        """
        defaults = self._defaults_summary_attrs

        # Only forward the dataset options when compression is requested.
        if self.dataset_options["compression"] is None:
            extra_options = {}
        else:
            extra_options = self.dataset_options

        summary_dataset = self.hdf5_group.create_dataset(
            defaults["name"],
            (0,),
            maxshape=defaults["max_shape"],
            dtype=defaults["dtype"],
            **extra_options,
        )

        # NOTE(review): "date_time" is stored as a literal string, which looks
        # like a placeholder for a timestamp -- confirm the intended value.
        summary_dataset.attrs.update(
            {
                "type": "summary table",
                "last_updated": "date_time",
                "reference": summary_dataset.ref,
            }
        )

        self.logger.debug(
            f"Created {defaults['name']} table with "
            f"max_shape = {defaults['max_shape']}, "
            f"dtype={defaults['dtype']}"
        )
        # Explicit "+" so that only "; " is used as the join separator.
        self.logger.debug(
            "used options: "
            + "; ".join([f"{k} = {v}" for k, v in self.dataset_options.items()])
        )

        self.summary_table_from_array(self.get_standards_summary())
        self.write_metadata()