# Source code for mth5.io.metronix.metronix_collection

# -*- coding: utf-8 -*-
"""
Metronix collection utilities for managing ATSS files.

This module provides classes for collecting and managing Metronix ATSS
(Audio Time Series System) files and creating pandas DataFrames with
metadata for processing workflows.

Classes
-------
MetronixCollection
    Collection class for managing Metronix ATSS files

Created on Fri Nov 22 13:22:44 2024

@author: jpeacock
"""

# =============================================================================
# Imports
# =============================================================================

from pathlib import Path
from typing import Any, Union

import pandas as pd

from mth5.io.collection import Collection
from mth5.io.metronix import ATSS


# =============================================================================



# [docs]  (Sphinx "view source" link marker left over from HTML extraction)
class MetronixCollection(Collection):
    """
    Collection class for managing Metronix ATSS files.

    Extends the base :class:`Collection` class to gather Metronix ATSS
    time-series files found under ``file_path`` and summarize one row of
    metadata per file (channel) in a pandas DataFrame for downstream
    processing workflows.

    Parameters
    ----------
    file_path : Union[str, Path, None], optional
        Path to directory containing Metronix ATSS files, by default None
    **kwargs
        Additional keyword arguments passed to parent Collection class

    Attributes
    ----------
    file_ext : list[str]
        List of file extensions to search for (["atss"])

    Examples
    --------
    >>> from mth5.io.metronix import MetronixCollection
    >>> collection = MetronixCollection("/path/to/metronix/files")
    >>> df = collection.to_dataframe(sample_rates=[128, 256])
    """

    def __init__(self, file_path: Union[str, Path, None] = None, **kwargs: Any) -> None:
        super().__init__(file_path=file_path, **kwargs)

        # Extension(s) handed to ``get_files`` when building the dataframe.
        self.file_ext: list[str] = ["atss"]

    def to_dataframe(
        self,
        sample_rates: Union[list[int], None] = None,
        run_name_zeros: int = 0,
        calibration_path: Union[str, Path, None] = None,
    ) -> pd.DataFrame:
        """
        Create DataFrame for Metronix timeseries ATSS + JSON file sets.

        Every ATSS file returned by ``get_files`` is opened with
        :class:`ATSS`; files whose sample rate is not requested are skipped
        and each remaining file contributes one row describing its channel.

        Parameters
        ----------
        sample_rates : list[int], optional
            Sample rates (Hz) to include. ``None`` (the default) selects
            ``[128]``. A single number is also accepted and treated as a
            one-element list.
        run_name_zeros : int, optional
            Number of zeros for zero-padding run names, forwarded to
            ``_sort_df``. If 0, run names are unchanged. If > 0, run names
            are formatted as 'sr{sample_rate}_{run_number:0{zeros}d}',
            by default 0
        calibration_path : Union[str, Path, None], optional
            Path to calibration files (currently unused), by default None

        Returns
        -------
        pd.DataFrame
            DataFrame with columns:
            - survey: Survey ID
            - station: Station ID
            - run: Run ID
            - start: Start time
            - end: End time
            - channel_id: Channel number
            - component: Component name
            - fn: File path
            - sample_rate: Sample rate
            - file_size: File size
            - n_samples: Number of samples
            - sequence_number: Sequence number (always 0)
            - dipole: Dipole length (always 0)
            - coil_number: Sensor id (magnetic channels only, else None)
            - latitude: Latitude (channel location for magnetic channels,
              positive-electrode location otherwise)
            - longitude: Longitude (same source as latitude)
            - elevation: Elevation (same source as latitude)
            - instrument_id: Instrument/system number
            - calibration_fn: Calibration file path (always None)

        Examples
        --------
        >>> collection = MetronixCollection("/path/to/files")
        >>> df = collection.to_dataframe(sample_rates=[128, 256])
        >>> df = collection.to_dataframe(run_name_zeros=4)  # Zero-pad run names
        """
        # ``None`` sentinel instead of a mutable list default; the effective
        # default is still [128].
        if sample_rates is None:
            sample_rates = [128]
        elif isinstance(sample_rates, (int, float)):
            # Tolerate a bare number so ``sample_rates=128`` does not fail
            # in the membership test below.
            sample_rates = [sample_rates]

        entries = []
        # ``set`` drops duplicate paths; row order is settled later by
        # ``_sort_df``.
        for atss_fn in set(self.get_files(self.file_ext)):
            atss_obj = ATSS(atss_fn)
            if atss_obj.sample_rate not in sample_rates:
                continue
            ch_metadata = atss_obj.channel_metadata

            entry = self.get_empty_entry_dict()
            entry["survey"] = atss_obj.survey_id
            entry["station"] = atss_obj.station_id
            entry["run"] = atss_obj.run_id
            entry["start"] = ch_metadata.time_period.start
            entry["end"] = ch_metadata.time_period.end
            entry["channel_id"] = atss_obj.channel_number
            entry["component"] = atss_obj.component
            entry["fn"] = atss_fn
            entry["sample_rate"] = ch_metadata.sample_rate
            entry["file_size"] = atss_obj.file_size
            entry["n_samples"] = atss_obj.n_samples
            entry["sequence_number"] = 0
            entry["dipole"] = 0

            # Magnetic channels carry a sensor id and a single location;
            # every other channel type takes its coordinates from the
            # positive electrode.
            if ch_metadata.type == "magnetic":
                entry["coil_number"] = ch_metadata.sensor.id
                position = ch_metadata.location
            else:
                entry["coil_number"] = None
                position = ch_metadata.positive
            entry["latitude"] = position.latitude
            entry["longitude"] = position.longitude
            entry["elevation"] = position.elevation

            entry["instrument_id"] = atss_obj.system_number
            entry["calibration_fn"] = None
            entries.append(entry)

        # make pandas dataframe and set data types
        # NOTE(review): when no file matches, an empty, column-less frame is
        # handed to the base-class helpers, exactly as before -- confirm they
        # cope with that case.
        df = self._sort_df(self._set_df_dtypes(pd.DataFrame(entries)), run_name_zeros)

        return df

    def assign_run_names(self, df: pd.DataFrame, zeros: int = 0) -> pd.DataFrame:
        """
        Assign formatted run names based on sample rate and run number.

        If zeros is 0, run names are unchanged. Otherwise each run name is
        rewritten as 'sr{sample_rate}_{run_number:0{zeros}d}', where the run
        number is the integer found between the first and second underscore
        of the original run name (``run.split('_')[1]``).

        Parameters
        ----------
        df : pd.DataFrame
            DataFrame containing run information with 'run' and 'sample_rate'
            columns. It is modified in place.
        zeros : int, optional
            Minimum width of the zero-padded run number. If 0, run names
            are unchanged, by default 0

        Returns
        -------
        pd.DataFrame
            The same DataFrame object, with updated run names

        Raises
        ------
        IndexError
            If a run name contains no underscore.
        ValueError
            If the token after the first underscore is not an integer.

        Examples
        --------
        >>> df = pd.DataFrame({
        ...     'run': ['run_1', 'run_2'],
        ...     'sample_rate': [128, 256]
        ... })
        >>> collection = MetronixCollection()
        >>> result = collection.assign_run_names(df, zeros=3)
        >>> print(result['run'].tolist())
        ['sr128_001', 'sr256_002']

        Notes
        -----
        The method expects run names to be in format 'prefix_number' where
        'number' can be extracted and converted to an integer for formatting.
        """
        if zeros == 0 or df.empty:
            return df

        # Build all names first and assign the column in one positional step.
        # Label-based per-row assignment (``df.loc[label, "run"]``) would
        # overwrite every row sharing a duplicated index label, and would
        # leave ``df`` half-updated if a later run name failed to parse.
        new_names = [
            f"sr{sample_rate:.0f}_{int(run.split('_')[1]):0{zeros}}"
            for run, sample_rate in zip(df["run"], df["sample_rate"])
        ]
        # Reuse the frame's own index and the column's dtype so the
        # assignment is purely positional and does not change the dtype.
        df["run"] = pd.Series(new_names, index=df.index, dtype=df["run"].dtype)
        return df