# Source code for mth5.io.nims.nims_collection

# -*- coding: utf-8 -*-
"""
NIMS Collection
===============

Collection of NIMS binary files combined into runs for magnetotelluric data processing.

Created on Wed Aug 31 10:32:44 2022

@author: jpeacock
"""

# =============================================================================
# Imports
# =============================================================================
from __future__ import annotations

from pathlib import Path
from typing import Any, Union

import pandas as pd

from mth5.io.collection import Collection
from mth5.io.nims import NIMS


# =============================================================================


class NIMSCollection(Collection):
    """
    Collection of NIMS binary files into runs.

    This class organizes the NIMS binary files found under a directory into
    a single :class:`pandas.DataFrame` (one row per file) that can be used
    for further magnetotelluric data processing.

    Parameters
    ----------
    file_path : str or Path, optional
        Path to the directory containing NIMS binary files.
    **kwargs : dict
        Additional keyword arguments passed to the parent Collection class.

    Attributes
    ----------
    file_ext : str
        File extension for NIMS binary files ('bin').
    survey_id : str
        Survey identifier, defaults to 'mt'.

    Examples
    --------
    >>> from mth5.io.nims import NIMSCollection
    >>> nc = NIMSCollection(r"/path/to/nims/station")
    >>> nc.survey_id = "mt001"
    >>> df = nc.to_dataframe()

    See Also
    --------
    mth5.io.collection.Collection : Base collection class
    mth5.io.nims.NIMS : NIMS file reader
    """

    # Component names written (comma-joined) into the ``component`` column
    # whenever an entry does not already provide one.
    _COMPONENTS = ("hx", "hy", "hz", "ex", "ey", "temperature")

    def __init__(self, file_path: Union[str, Path, None] = None, **kwargs: Any) -> None:
        """
        Initialize NIMSCollection instance.

        Parameters
        ----------
        file_path : str or Path, optional
            Path to the directory containing NIMS binary files.
        **kwargs : dict
            Additional keyword arguments passed to the parent Collection class.
        """
        super().__init__(file_path=file_path, **kwargs)

        # Set after the parent initializer so these values win over anything
        # the parent (or ``kwargs``) may have assigned.
        self.file_ext: str = "bin"
        self.survey_id: str = "mt"

    def to_dataframe(
        self,
        # NOTE: the list default is kept for interface compatibility; it is
        # never mutated (the argument is not used at all in this method).
        sample_rates: Union[int, list[int]] = [1],
        run_name_zeros: int = 2,
        calibration_path: Union[str, Path, None] = None,
    ) -> pd.DataFrame:
        """
        Create a DataFrame with one row per NIMS binary file in the collection.

        Every file returned by ``get_files`` for the configured extension is
        opened with :class:`NIMS`, its header is read, and the header values
        are copied into an entry dictionary. The entries are then turned into
        a DataFrame, default values are filled in, and the result is passed
        through ``_set_df_dtypes`` and ``_sort_df``.

        Parameters
        ----------
        sample_rates : int or list of int, default [1]
            Present for interface consistency only; it is not used and all
            files are processed regardless of their sample rate.
        run_name_zeros : int, default 2
            Zero-padding width forwarded to ``_sort_df`` for run names.
        calibration_path : str or Path, optional
            Present for interface consistency only; it is not used.

        Returns
        -------
        pd.DataFrame
            DataFrame with (at least) the columns:

            - survey : ``self.survey_id``
            - station : station read from the NIMS header
            - run : run identifier read from the NIMS header
            - start : start time (written as an ISO string)
            - end : end time (written as an ISO string)
            - fn : file path
            - sample_rate : sample rate read from the NIMS header
            - file_size : file size reported by the NIMS reader
            - n_samples : number of samples reported by the NIMS reader
            - dipole : ``[ex_length, ey_length]``
            - channel_id : filled with 1 where missing
            - sequence_number : filled with 0 where missing
            - component : comma-separated component list where missing
            - instrument_id : filled with 'NIMS' where missing

            Final dtypes and ordering are whatever ``_set_df_dtypes`` and
            ``_sort_df`` of the parent class produce.

        Examples
        --------
        >>> from mth5.io.nims import NIMSCollection
        >>> nc = NIMSCollection("/path/to/nims/station")
        >>> df = nc.to_dataframe(run_name_zeros=3)
        >>> print(df[['station', 'run', 'start', 'sample_rate']])
        """
        # Search for the extension as given plus its lower/upper-case forms.
        # ``dict.fromkeys`` drops duplicates (e.g. "bin" == "bin".lower())
        # while keeping the original order, so no pattern is searched twice.
        extensions = list(
            dict.fromkeys(
                [self.file_ext, self.file_ext.lower(), self.file_ext.upper()]
            )
        )

        entries = []
        for fn in self.get_files(extensions):
            nims_obj = NIMS(fn)
            # Only the header is needed to describe the file.
            nims_obj.read_header()

            entry = self.get_empty_entry_dict()
            entry["survey"] = self.survey_id
            entry["station"] = nims_obj.station
            entry["run"] = nims_obj.run_id
            entry["start"] = nims_obj.start_time.isoformat()
            entry["end"] = nims_obj.end_time.isoformat()
            entry["fn"] = fn
            entry["sample_rate"] = nims_obj.sample_rate
            entry["file_size"] = nims_obj.file_size
            entry["n_samples"] = nims_obj.n_samples
            entry["dipole"] = [nims_obj.ex_length, nims_obj.ey_length]
            entries.append(entry)

        df = pd.DataFrame(entries)

        # With no entries the frame has no columns at all. Build an empty
        # frame with the expected columns so the default filling and dtype
        # enforcement below still work.
        if df.empty:
            expected_cols = [
                "survey",
                "station",
                "run",
                "start",
                "end",
                "fn",
                "sample_rate",
                "file_size",
                "n_samples",
                "dipole",
                "channel_id",
                "sequence_number",
                "component",
                "instrument_id",
            ]
            df = pd.DataFrame(columns=expected_cols)

        df = self._fill_default_columns(df)

        return self._sort_df(self._set_df_dtypes(df), run_name_zeros)

    def _fill_default_columns(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Ensure the constant-valued columns exist and contain no missing values.

        Parameters
        ----------
        df : pd.DataFrame
            Frame of file entries; modified in place.

        Returns
        -------
        pd.DataFrame
            The same frame with ``channel_id``, ``sequence_number``,
            ``component`` and ``instrument_id`` created or filled.
        """
        numeric_defaults = {"channel_id": 1, "sequence_number": 0}
        text_defaults = {
            "component": ",".join(self._COMPONENTS),
            "instrument_id": "NIMS",
        }

        for column, default in numeric_defaults.items():
            if column not in df.columns:
                df[column] = default
            else:
                # Coerce to numeric before filling to avoid pandas' future
                # downcast warnings on object columns.
                df.loc[:, column] = (
                    pd.to_numeric(df.loc[:, column], errors="coerce")
                    .fillna(default)
                    .astype("int64")
                )

        for column, default in text_defaults.items():
            if column not in df.columns:
                df[column] = default
            else:
                df.loc[:, column] = df.loc[:, column].fillna(default)

        return df

    def assign_run_names(self, df: pd.DataFrame, zeros: int = 2) -> pd.DataFrame:
        """
        Assign run names and sequence numbers to DataFrame entries by station.

        Within each station the rows are visited in order of ``start``. Every
        row receives its 1-based position as ``sequence_number``. Rows whose
        ``run`` is missing additionally receive a generated run name of the
        form ``sr{sample_rate}_{position}``, with the position zero-padded to
        ``zeros`` digits.

        Parameters
        ----------
        df : pd.DataFrame
            DataFrame with the columns 'station', 'start', 'run' and
            'sample_rate'. It is modified in place.
        zeros : int, default 2
            Zero-padding width of the number in generated run names
            (e.g. ``zeros=2`` gives '01', '02', ...).

        Returns
        -------
        pd.DataFrame
            The input DataFrame with updated 'run' and 'sequence_number'
            columns.

        Notes
        -----
        - Existing (non-missing) run names are preserved.
        - A run counts as missing when it is ``None``, ``NaN`` or ``pd.NA``.
        - The number in a generated name is the row's position among all of
          the station's files, not only among the unnamed ones.

        Examples
        --------
        >>> import pandas as pd
        >>> from mth5.io.nims import NIMSCollection
        >>> # Assuming df has columns: station, start, run, sample_rate
        >>> nc = NIMSCollection()
        >>> df_updated = nc.assign_run_names(df, zeros=3)
        """
        for station in df.station.unique():
            # Sorting returns a copy, so writing to ``df`` inside the loop
            # does not disturb the iteration.
            station_rows = df[df.station == station].sort_values("start")
            for count, row in enumerate(station_rows.itertuples(), start=1):
                # ``pd.isna`` also catches NaN / pd.NA, which a plain
                # ``is None`` check would let through unnamed.
                if pd.isna(row.run):
                    df.loc[row.Index, "run"] = f"sr{row.sample_rate}_{count:0{zeros}}"
                df.loc[row.Index, "sequence_number"] = count

        return df