# Source code for mth5.io.nims.nims_collection

# -*- coding: utf-8 -*-
"""
NIMS Collection
===============

Collection of NIMS binary files combined into runs for magnetotelluric data processing.

Created on Wed Aug 31 10:32:44 2022

@author: jpeacock
"""

# =============================================================================
# Imports
# =============================================================================
from __future__ import annotations

from pathlib import Path
from typing import Any, Union

import pandas as pd

from mth5.io.collection import Collection
from mth5.io.nims import NIMS


# =============================================================================


class NIMSCollection(Collection):
    """
    Collection of NIMS binary files into runs.

    This class provides functionality for organizing and processing multiple NIMS
    binary files into a structured format for magnetotelluric data analysis.

    Parameters
    ----------
    file_path : str or Path, optional
        Path to the directory containing NIMS binary files.
    **kwargs : dict
        Additional keyword arguments passed to the parent Collection class.

    Attributes
    ----------
    file_ext : str
        File extension for NIMS binary files ('bin').
    survey_id : str
        Survey identifier, defaults to 'mt'.

    Examples
    --------
    >>> from mth5.io.nims import NIMSCollection
    >>> nc = NIMSCollection(r"/path/to/nims/station")
    >>> nc.survey_id = "mt001"
    >>> df = nc.to_dataframe()

    See Also
    --------
    mth5.io.collection.Collection : Base collection class
    mth5.io.nims.NIMS : NIMS file reader
    """

    def __init__(self, file_path: Union[str, Path, None] = None, **kwargs: Any) -> None:
        """
        Initialize NIMSCollection instance.

        Parameters
        ----------
        file_path : str or Path, optional
            Path to the directory containing NIMS binary files.
        **kwargs : dict
            Additional keyword arguments passed to the parent Collection class.
        """
        super().__init__(file_path=file_path, **kwargs)

        # NIMS-specific defaults, assigned after the parent initializer has run.
        self.file_ext: str = "bin"
        self.survey_id: str = "mt"

    def to_dataframe(
        self,
        sample_rates: Union[int, list[int]] = [1],
        run_name_zeros: int = 2,
        calibration_path: Union[str, Path, None] = None,
    ) -> pd.DataFrame:
        """
        Create a DataFrame of each NIMS binary file in the collection directory.

        Every file returned by ``get_files`` for the configured extension is
        opened with :class:`NIMS`, its header is read, and one row of metadata
        is recorded.

        Parameters
        ----------
        sample_rates : int or list of int, default [1]
            Present for interface consistency only; this method never reads it,
            so files are included regardless of their sample rate.
        run_name_zeros : int, default 2
            Zero-padding width forwarded to ``_sort_df``.
        calibration_path : str or Path, optional
            Present for interface consistency only; not used by this method.

        Returns
        -------
        pd.DataFrame
            One row per NIMS file. Columns filled in by this method:

            - survey : ``self.survey_id``
            - station : station name from the NIMS header
            - run : run identifier from the NIMS header
            - start : start time in ISO format
            - end : end time in ISO format
            - fn : file path
            - sample_rate : sampling rate
            - file_size : file size
            - n_samples : number of samples
            - dipole : electric dipole lengths ``[ex_length, ey_length]``
            - channel_id : defaults to 1 where missing
            - sequence_number : defaults to 0 where missing (the parent's
              ``_sort_df`` may reassign it -- not visible from this class)
            - component : defaults to 'hx,hy,hz,ex,ey,temperature'
            - instrument_id : defaults to 'NIMS'

        Notes
        -----
        If no files are found, an empty DataFrame with the columns listed above
        is built before dtype enforcement and sorting.

        Examples
        --------
        >>> from mth5.io.nims import NIMSCollection
        >>> nc = NIMSCollection("/path/to/nims/station")
        >>> df = nc.to_dataframe(run_name_zeros=3)
        >>> print(df[['station', 'run', 'start', 'sample_rate']])
        """
        # NOTE: the list default on ``sample_rates`` is kept for interface
        # compatibility; it is never read or mutated in this method.

        # De-duplicate while preserving order: for the default "bin" the plain
        # and lower-case variants are identical.
        extensions = list(
            dict.fromkeys(
                [self.file_ext, self.file_ext.lower(), self.file_ext.upper()]
            )
        )

        entries = []
        for fn in self.get_files(extensions):
            nims_obj = NIMS(fn)
            nims_obj.read_header()

            entry = self.get_empty_entry_dict()
            entry["survey"] = self.survey_id
            entry["station"] = nims_obj.station
            entry["run"] = nims_obj.run_id
            entry["start"] = nims_obj.start_time.isoformat()
            entry["end"] = nims_obj.end_time.isoformat()
            entry["fn"] = fn
            entry["sample_rate"] = nims_obj.sample_rate
            entry["file_size"] = nims_obj.file_size
            entry["n_samples"] = nims_obj.n_samples
            entry["dipole"] = [nims_obj.ex_length, nims_obj.ey_length]
            entries.append(entry)

        df = pd.DataFrame(entries)

        # With no entries the frame has no columns at all; give it the expected
        # ones so the default filling and dtype enforcement below do not raise.
        if df.empty:
            expected_cols = [
                "survey",
                "station",
                "run",
                "start",
                "end",
                "fn",
                "sample_rate",
                "file_size",
                "n_samples",
                "dipole",
                "channel_id",
                "sequence_number",
                "component",
                "instrument_id",
            ]
            df = pd.DataFrame(columns=expected_cols)

        # Integer columns: coerce to numeric first so that filling the missing
        # values does not trigger pandas' object-downcast warning.
        for column, default in (("channel_id", 1), ("sequence_number", 0)):
            if column not in df.columns:
                df[column] = default
            else:
                df.loc[:, column] = (
                    pd.to_numeric(df.loc[:, column], errors="coerce")
                    .fillna(default)
                    .astype("int64")
                )

        # String columns: only fill where the value is missing.
        components = ",".join(["hx", "hy", "hz", "ex", "ey", "temperature"])
        for column, default in (
            ("component", components),
            ("instrument_id", "NIMS"),
        ):
            if column not in df.columns:
                df[column] = default
            else:
                df.loc[:, column] = df.loc[:, column].fillna(default)

        df = self._sort_df(self._set_df_dtypes(df), run_name_zeros)

        return df

    def assign_run_names(self, df: pd.DataFrame, zeros: int = 2) -> pd.DataFrame:
        """
        Assign run names and sequence numbers to DataFrame entries by station.

        Rows are visited per station in order of ``start``. Each row gets a
        1-based ``sequence_number``; rows whose ``run`` is missing also get a
        run name of the form ``sr{sample_rate}_{count:0{zeros}}``, where
        ``count`` is that same per-station position.

        Parameters
        ----------
        df : pd.DataFrame
            DataFrame with at least the columns 'station', 'start', 'run' and
            'sample_rate'. It is modified in place.
        zeros : int, default 2
            Zero-padding width of the counter in generated run names
            (e.g. ``zeros=2`` gives '01', '02', ...).

        Returns
        -------
        pd.DataFrame
            The same DataFrame object, with 'run' and 'sequence_number' updated.

        Notes
        -----
        - Existing (non-missing) run names are preserved.
        - A run name is treated as missing when ``pd.isna`` is true for it,
          i.e. ``None``, ``NaN``, ``pd.NA`` or ``NaT``.
        - The counter advances for every row of a station, including rows that
          keep their existing run name, so generated names reflect the row's
          chronological position within the station.

        Examples
        --------
        >>> import pandas as pd
        >>> from mth5.io.nims import NIMSCollection
        >>> df = pd.DataFrame(
        ...     {
        ...         "station": ["mt01", "mt01"],
        ...         "start": ["2020-01-01T00:00:00", "2020-01-02T00:00:00"],
        ...         "run": [None, None],
        ...         "sample_rate": [8, 8],
        ...     }
        ... )
        >>> nc = NIMSCollection()
        >>> df_updated = nc.assign_run_names(df, zeros=3)
        >>> print(df_updated['run'].tolist())
        ['sr8_001', 'sr8_002']
        """
        for station in df.station.unique():
            station_rows = df[df.station == station].sort_values("start")
            for count, row in enumerate(station_rows.itertuples(), start=1):
                # pd.isna also catches NaN / pd.NA, which pandas may substitute
                # for None; an identity check against None would miss them.
                if pd.isna(row.run):
                    df.loc[row.Index, "run"] = f"sr{row.sample_rate}_{count:0{zeros}}"
                df.loc[row.Index, "sequence_number"] = count

        return df