Source code for mth5.io.metronix.metronix_collection

# -*- coding: utf-8 -*-
"""
Metronix collection utilities for managing ATSS files.

This module provides classes for collecting and managing Metronix ATSS
(Audio Time Series System) files and creating pandas DataFrames with
metadata for processing workflows.

Classes
-------
MetronixCollection
    Collection class for managing Metronix ATSS files

Created on Fri Nov 22 13:22:44 2024

@author: jpeacock
"""

# =============================================================================
# Imports
# =============================================================================

from pathlib import Path
from typing import Any, Union

import pandas as pd

from mth5.io.collection import Collection
from mth5.io.metronix import ATSS


# =============================================================================


[docs] class MetronixCollection(Collection): """ Collection class for managing Metronix ATSS files. This class extends the base Collection class to handle Metronix ATSS (Audio Time Series System) files and their associated JSON metadata files. It provides functionality to create pandas DataFrames with comprehensive metadata for processing workflows. Parameters ---------- file_path : Union[str, Path, None], optional Path to directory containing Metronix ATSS files, by default None **kwargs Additional keyword arguments passed to parent Collection class Attributes ---------- file_ext : list[str] List of file extensions to search for (["atss"]) Examples -------- >>> from mth5.io.metronix import MetronixCollection >>> collection = MetronixCollection("/path/to/metronix/files") >>> df = collection.to_dataframe(sample_rates=[128, 256]) """ def __init__(self, file_path: Union[str, Path, None] = None, **kwargs: Any) -> None: super().__init__(file_path=file_path, **kwargs)
[docs] self.file_ext: list[str] = ["atss"]
[docs] def to_dataframe( self, sample_rates: list[int] = [128], run_name_zeros: int = 0, calibration_path: Union[str, Path, None] = None, ) -> pd.DataFrame: """ Create DataFrame for Metronix timeseries ATSS + JSON file sets. Processes all ATSS files in the collection directory, extracts metadata, and creates a comprehensive pandas DataFrame with information about each channel including timing, location, and instrument details. Parameters ---------- sample_rates : list[int], optional List of sample rates to include in Hz, by default [128] run_name_zeros : int, optional Number of zeros for zero-padding run names. If 0, run names are unchanged. If > 0, run names are formatted as 'sr{sample_rate}_{run_number:0{zeros}d}', by default 0 calibration_path : Union[str, Path, None], optional Path to calibration files (currently unused), by default None Returns ------- pd.DataFrame DataFrame with columns: - survey: Survey ID - station: Station ID - run: Run ID - start: Start time (datetime) - end: End time (datetime) - channel_id: Channel number - component: Component name (ex, ey, hx, hy, hz) - fn: File path - sample_rate: Sample rate in Hz - file_size: File size in bytes - n_samples: Number of samples - sequence_number: Sequence number (always 0) - dipole: Dipole length (always 0) - coil_number: Coil serial number (magnetic channels only) - latitude: Latitude in decimal degrees - longitude: Longitude in decimal degrees - elevation: Elevation in meters - instrument_id: Instrument/system number - calibration_fn: Calibration file path (always None) Examples -------- >>> collection = MetronixCollection("/path/to/files") >>> df = collection.to_dataframe(sample_rates=[128, 256]) >>> df = collection.to_dataframe(run_name_zeros=4) # Zero-pad run names """ entries = [] for atss_fn in set(self.get_files(self.file_ext)): atss_obj = ATSS(atss_fn) if not atss_obj.sample_rate in sample_rates: continue ch_metadata = atss_obj.channel_metadata entry = self.get_empty_entry_dict() entry["survey"] = atss_obj.survey_id entry["station"] = atss_obj.station_id entry["run"] = atss_obj.run_id entry["start"] = ch_metadata.time_period.start entry["end"] = ch_metadata.time_period.end entry["channel_id"] = atss_obj.channel_number entry["component"] = atss_obj.component entry["fn"] = atss_fn entry["sample_rate"] = ch_metadata.sample_rate entry["file_size"] = atss_obj.file_size entry["n_samples"] = atss_obj.n_samples entry["sequence_number"] = 0 entry["dipole"] = 0 if ch_metadata.type in ["magnetic"]: entry["coil_number"] = ch_metadata.sensor.id entry["latitude"] = ch_metadata.location.latitude entry["longitude"] = ch_metadata.location.longitude entry["elevation"] = ch_metadata.location.elevation else: entry["coil_number"] = None entry["latitude"] = ch_metadata.positive.latitude entry["longitude"] = ch_metadata.positive.longitude entry["elevation"] = ch_metadata.positive.elevation entry["instrument_id"] = atss_obj.system_number entry["calibration_fn"] = None entries.append(entry) # make pandas dataframe and set data types df = self._sort_df(self._set_df_dtypes(pd.DataFrame(entries)), run_name_zeros) return df
[docs] def assign_run_names(self, df: pd.DataFrame, zeros: int = 0) -> pd.DataFrame: """ Assign formatted run names based on sample rate and run number. If zeros is 0, run names are unchanged. Otherwise, run names are formatted as 'sr{sample_rate}_{run_number:0{zeros}d}' where the run number is extracted from the original run name after the first underscore. Parameters ---------- df : pd.DataFrame DataFrame containing run information with 'run' and 'sample_rate' columns zeros : int, optional Number of zeros for zero-padding run numbers. If 0, run names are unchanged, by default 0 Returns ------- pd.DataFrame DataFrame with updated run names Examples -------- >>> df = pd.DataFrame({ ... 'run': ['run_1', 'run_2'], ... 'sample_rate': [128, 256] ... }) >>> collection = MetronixCollection() >>> result = collection.assign_run_names(df, zeros=3) >>> print(result['run'].tolist()) ['sr128_001', 'sr256_002'] Notes ----- The method expects run names to be in format 'prefix_number' where 'number' can be extracted and converted to an integer for formatting. """ if zeros == 0: return df for row in df.itertuples(): df.loc[ row.Index, "run" ] = f"sr{row.sample_rate:.0f}_{int(row.run.split('_')[1]):0{zeros}}" return df