# Source code for mth5.io.lemi.lemi_collection

# -*- coding: utf-8 -*-
"""
LEMI 424 Collection
====================

Collection of TXT files combined into runs

Created on Wed Aug 31 10:32:44 2022

@author: jpeacock
"""

import pathlib
from pathlib import Path
from typing import List

# =============================================================================
# Imports
# =============================================================================
import pandas as pd

from mth5.io.collection import Collection
from mth5.io.lemi import LEMI424


# =============================================================================


class LEMICollection(Collection):
    """
    Collection of LEMI 424 files into runs based on start and end times.

    Will assign the run name as 'sr1_{index:0{zeros}}' --> 'sr1_0001' for
    `zeros` = 4.

    Notes
    -----
    This class assumes that the given file path contains a single
    LEMI station. If you want to do multiple stations merge the returned
    data frames.

    LEMI data comes with little metadata about the station or survey,
    therefore you should assign `station_id` and `survey_id`.

    Parameters
    ----------
    file_path : str or pathlib.Path, optional
        Full path to single station LEMI424 directory, by default None
    file_ext : list of str, optional
        Extension of LEMI424 files, by default ["txt", "TXT"]
    **kwargs
        Additional keyword arguments passed to parent Collection class

    Attributes
    ----------
    station_id : str
        Station identification string, defaults to "mt001"
    survey_id : str
        Survey identification string, defaults to "mt"
    calibration_dict : dict
        Mapping of component name to calibration file path; empty until
        :meth:`to_dataframe` populates it from :meth:`get_calibrations`.

    Examples
    --------
    >>> from mth5.io.lemi import LEMICollection
    >>> lc = LEMICollection(r"/path/to/single/lemi/station")
    >>> lc.station_id = "mt001"
    >>> lc.survey_id = "test_survey"
    >>> run_dict = lc.get_runs(1)

    """

    def __init__(
        self,
        file_path: str | pathlib.Path | None = None,
        file_ext: List[str] | None = None,
        **kwargs,
    ) -> None:
        # Build the default inside the call so no list is shared between
        # instances.
        if file_ext is None:
            file_ext = ["txt", "TXT"]
        super().__init__(file_path=file_path, file_ext=file_ext, **kwargs)

        self.station_id = "mt001"
        self.survey_id = "mt"
        self.calibration_dict = {}

    def get_calibrations(self, calibration_path: str | Path) -> dict:
        """
        Get calibration dictionary for LEMI424 files.

        This assumes that the calibrations files are in JSON format and
        named as 'LEMI-424-<component>.json'. The directory is searched
        recursively for ``*.json`` files.

        Parameters
        ----------
        calibration_path : str or pathlib.Path
            Path to calibration files

        Returns
        -------
        dict
            Calibration dictionary for LEMI424 files, keyed by the
            component parsed from the file name with the file path as
            value. Empty if no JSON files are found.

        Examples
        --------
        >>> from mth5.io.lemi import LEMICollection
        >>> lc = LEMICollection("/path/to/single/lemi/station")
        >>> cal_dict = lc.get_calibrations(Path("/path/to/calibrations"))

        """
        calibration_path = Path(calibration_path)
        calibration_dict = {}
        # Sorted so that, if two files map to the same component, the
        # winner does not depend on filesystem listing order.
        for fn in sorted(calibration_path.rglob("*.json")):
            # Component is the text after the last "-" in the stem, up to
            # the first "." (if any).
            comp = fn.stem.split("-")[-1].split(".", 1)[0]
            calibration_dict[comp] = fn
        return calibration_dict

    def to_dataframe(
        self,
        sample_rates: int | List[int] | None = None,
        run_name_zeros: int = 4,
        calibration_path: str | Path | None = None,
    ) -> pd.DataFrame:
        """
        Create a data frame of each TXT file in a given directory.

        Notes
        -----
        This assumes the given directory contains a single station.

        Every row gets `channel_id` = 1, `sequence_number` = 0 and
        `instrument_id` = "LEMI424". As a side effect
        `self.calibration_dict` is refreshed from `calibration_path`.

        Parameters
        ----------
        sample_rates : int or list of int, optional
            Sample rate to get, will always be 1 for LEMI data,
            by default [1]. Accepted for interface compatibility; the
            value is not used to filter files here.
        run_name_zeros : int, optional
            Number of zeros to assign to the run name, by default 4
        calibration_path : str or pathlib.Path, optional
            Path to calibration files, by default None which searches
            `self.file_path`

        Returns
        -------
        pd.DataFrame
            DataFrame with information of each TXT file in the given
            directory, or an empty DataFrame if no files are found

        Examples
        --------
        >>> from mth5.io.lemi import LEMICollection
        >>> lc = LEMICollection("/path/to/single/lemi/station")
        >>> lemi_df = lc.to_dataframe()

        """
        if sample_rates is None:
            sample_rates = [1]

        if calibration_path is None:
            calibration_path = Path(self.file_path)

        self.calibration_dict = self.get_calibrations(calibration_path)
        if not self.calibration_dict:
            self.logger.warning(
                f"No calibration files found in {calibration_path}, "
                "proceeding without calibrations."
            )

        entries = []
        for fn in self.get_files(self.file_ext):
            lemi_obj = LEMI424(fn)
            # NOTE(review): n_samples is deliberately read *before*
            # read_metadata(); presumably reading metadata changes what
            # n_samples reports -- confirm against LEMI424 before
            # reordering these two lines.
            n_samples = int(lemi_obj.n_samples or 0)
            lemi_obj.read_metadata()

            entry = self.get_empty_entry_dict()
            entry["survey"] = self.survey_id
            entry["station"] = self.station_id
            entry["start"] = lemi_obj.start.isoformat() if lemi_obj.start else ""
            entry["end"] = lemi_obj.end.isoformat() if lemi_obj.end else ""
            entry["component"] = ",".join(
                lemi_obj.run_metadata.channels_recorded_all
            )
            entry["fn"] = fn
            entry["sample_rate"] = lemi_obj.sample_rate
            entry["file_size"] = lemi_obj.file_size
            entry["n_samples"] = n_samples

            entries.append(entry)

        # make pandas dataframe and set data types
        if not entries:
            self.logger.warning("No entries found for LEMI collection")
            return pd.DataFrame()

        df = pd.DataFrame(entries)
        df.loc[:, "channel_id"] = 1
        df.loc[:, "sequence_number"] = 0
        df.loc[:, "instrument_id"] = "LEMI424"

        df = self._sort_df(self._set_df_dtypes(df), run_name_zeros)

        return df

    def assign_run_names(self, df: pd.DataFrame, zeros: int = 4) -> pd.DataFrame:
        """
        Assign run names based on start and end times.

        Rows are visited in order. A row stays in the current run when it
        starts exactly one sample period after the previous row ended;
        otherwise a new run is started. Run names are assigned as
        'sr1_{run_number:0{zeros}}' (LEMI data is always 1 Hz). The
        `run` column of `df` is modified in place.

        Parameters
        ----------
        df : pd.DataFrame
            DataFrame with the columns `start`, `end` and `sample_rate`,
            ordered by time
        zeros : int, optional
            Number of zeros in run name, by default 4

        Returns
        -------
        pd.DataFrame
            The same DataFrame with run names assigned

        """
        count = 1
        # Track the first row by position rather than by index label, so a
        # frame whose index does not start at 0 still works.
        previous_end = None
        for row in df.itertuples():
            if previous_end is not None:
                gap = (row.start - previous_end).total_seconds()
                # One sample period is 1 / sample_rate seconds, i.e.
                # gap * sample_rate == 1. A missing timestamp gives NaN
                # here, which compares unequal and starts a new run.
                if gap * row.sample_rate != 1:
                    count += 1
            df.loc[row.Index, "run"] = f"sr1_{count:0{zeros}}"
            previous_end = row.end

        return df