"""
This module contains the RunSummary class.
This is a helper class that summarizes the Runs in an mth5.
TODO: This class and methods could be replaced by methods in MTH5.
Functionality of RunSummary()
1. User can get a list of local_station options, which correspond to unique pairs
of values: (survey, station)
2. User can see all possible ways of processing the data:
- one list per (survey, station) pair in the run_summary
Some of the following functionalities may end up in KernelDataset:
3. User can select local_station
-this can trigger a reduction of runs to only those that are from the local station
and simultaneous runs at other stations
4. Given a local station, a list of possible reference stations can be generated
5. Given a remote reference station, a list of all relevant runs, truncated to
maximize coverage of the local station runs is generated
6. Given such a "restricted run list", runs can be dropped
7. Time interval endpoints can be changed
Development Notes:
TODO: consider adding methods:
- drop_runs_shorter_than: removes short runs from summary
- fill_gaps_by_time_interval: allows runs to be merged if gaps between
are short
- fill_gaps_by_run_names: allows runs to be merged if gaps between are
short
TODO: Consider whether this should return a copy or modify in-place when
querying the df.
"""
# =============================================================================
# Imports
# =============================================================================
import copy
from typing import Optional, Union
import pandas as pd
from loguru import logger
import mth5
from mth5.processing import MINI_SUMMARY_COLUMNS, RUN_SUMMARY_COLUMNS
from mth5.utils.helpers import initialize_mth5
# =============================================================================
class RunSummary:
    """Class to contain a run-summary table from one or more mth5s.

    The table is held as a ``pandas.DataFrame`` in the ``df`` property; the
    ``df`` setter requires every column named in ``RUN_SUMMARY_COLUMNS``.

    WIP: For the full MMT case this may need modification to a channel based
    summary.
    """

    def __init__(
        self,
        input_dict: Optional[dict] = None,
        df: Optional[pd.DataFrame] = None,
    ):
        """Constructor.

        Parameters
        ----------
        input_dict : Optional[dict], optional
            Stored on the instance as ``_input_dict``; it is not read by any
            method of this class. By default, None.
        df : Optional[pd.DataFrame], optional
            Run-summary table, validated by the ``df`` setter.
            By default, None.

        Raises
        ------
        TypeError
            If ``df`` is neither None nor a ``pandas.DataFrame``.
        ValueError
            If ``df`` lacks any column listed in ``RUN_SUMMARY_COLUMNS``.
        """
        self.column_dtypes = [str, str, pd.Timestamp, pd.Timestamp]
        self._input_dict = input_dict
        self.df = df
        self._mini_summary_columns = MINI_SUMMARY_COLUMNS

    def __str__(self):
        """Return the mini summary rendered as text.

        When no dataframe has been set (the constructor default) a short
        placeholder is returned instead of raising, so that printing or
        inspecting an empty RunSummary is always safe.
        """
        if self.df is None:
            return f"{type(self).__name__}(df=None)"
        return str(self.mini_summary)

    def __repr__(self):
        """Return the same text as ``__str__``."""
        return self.__str__()

    @property
    def df(self) -> Optional[pd.DataFrame]:
        """The run-summary dataframe, or None if it has not been set."""
        return self._df

    @df.setter
    def df(self, value: Optional[pd.DataFrame]):
        """Make sure the data frame is set properly with proper column names.

        Parameters
        ----------
        value : Optional[pd.DataFrame]
            The new run-summary table. ``None`` clears the stored dataframe.

        Raises
        ------
        TypeError
            If ``value`` is neither None nor a ``pandas.DataFrame``.
        ValueError
            If ``value`` lacks any column listed in ``RUN_SUMMARY_COLUMNS``.
        """
        if value is None:
            self._df = None
            return

        if not isinstance(value, pd.DataFrame):
            msg = f"Need to set df with a Pandas.DataFrame not type({type(value)})"
            logger.error(msg)
            raise TypeError(msg)

        # Collect every missing column so the error reports all of them at once.
        need_columns = [
            col for col in RUN_SUMMARY_COLUMNS if col not in value.columns
        ]
        if need_columns:
            msg = f"DataFrame needs columns {', '.join(need_columns)}"
            logger.error(msg)
            raise ValueError(msg)

        self._df = value

    def clone(self):
        """Return a deep copy of this RunSummary.

        2022-10-20:
        Cloning may be causing issues with extra instances of open h5 files ...
        """
        return copy.deepcopy(self)

    def from_mth5s(self, mth5_list) -> None:
        """Iterates over mth5s in list and creates one big dataframe
        summarizing the runs.

        The dataframe is produced by ``extract_run_summaries_from_mth5s``
        (defined outside this class) and assigned to ``self.df``, so it is
        subject to the ``df`` setter's validation. Nothing is returned.

        Parameters
        ----------
        mth5_list :
            Passed through unchanged to ``extract_run_summaries_from_mth5s``.
        """
        self.df = extract_run_summaries_from_mth5s(mth5_list)

    def _warn_no_data_runs(self) -> None:
        """Log a warning for every row whose ``has_data`` is False.

        If at least one such row exists, an info message pointing at
        ``drop_no_data_rows`` is logged afterwards.
        """
        # Compare by value rather than using ``~`` so that a non-boolean
        # ``has_data`` column is not bit-inverted.
        no_data_rows = self.df[self.df.has_data == False]  # noqa: E712
        if no_data_rows.empty:
            return

        for row in no_data_rows.itertuples():
            logger.warning(
                f"Found no data run in row {row.Index}: "
                f"survey: {row.survey}, station: {row.station}, run: {row.run}"
            )
        logger.info("To drop no data runs use `drop_no_data_rows`")

    @property
    def mini_summary(self) -> pd.DataFrame:
        """Shows the dataframe with only a few columns for readability.

        The columns shown are those in ``self._mini_summary_columns``.
        """
        return self.df[self._mini_summary_columns]

    @property
    def print_mini_summary(self) -> None:
        """Calls minisummary through logger so it is formatted.

        This is a property, so the logging happens on attribute access
        (``run_summary.print_mini_summary``); the value obtained is None.
        """
        logger.info(self.mini_summary)

    def drop_no_data_rows(self) -> None:
        """Drops rows marked `has_data` = False and resets the index of self.df.

        ``self.df`` is replaced by the filtered table; nothing is returned.
        """
        # Build the re-indexed frame in one expression instead of calling an
        # inplace method on a filtered slice.
        self.df = self.df[self.df.has_data].reset_index(drop=True)

    def set_sample_rate(
        self, sample_rate: float, inplace: bool = False
    ) -> Optional["RunSummary"]:
        """Set the sample rate so that the run summary represents all runs for
        a single sample rate.

        Parameters
        ----------
        sample_rate : float
            Only rows whose ``sample_rate`` equals this value are kept.
        inplace : bool, optional
            If True, filter ``self.df`` and return None. If False, leave this
            object untouched and return a filtered clone. By default, False.

        Returns
        -------
        Optional[RunSummary]
            A new, filtered RunSummary when ``inplace`` is False, else None.

        Raises
        ------
        ValueError
            If ``sample_rate`` does not occur in the ``sample_rate`` column.
        """
        if sample_rate not in self.df.sample_rate.values:
            msg = (
                f"Sample rate {sample_rate} is not in RunSummary. Unique "
                f"values are {self.df.sample_rate.unique()}"
            )
            logger.error(msg)
            raise ValueError(msg)

        if inplace:
            self.df = self.df[self.df.sample_rate == sample_rate]
            return None

        new_rs = self.clone()
        new_rs.df = new_rs.df[new_rs.df.sample_rate == sample_rate]
        return new_rs
### this can be deprecated now
# def extract_run_summary_from_mth5(mth5_obj, summary_type: Optional[str] = "run"):
# """Given a single mth5 object, get the channel_summary and compress it to a
# run_summary.
# Development Notes:
# TODO: Move this into MTH5 or replace with MTH5 built-in run_summary method.
# Parameters
# ----------
# mth5_obj : mth5.mth5.MTH5
# The initialized mth5 object that will be interrogated.
# summary_type : Optional[str], optional
# One of ["run", "channel"]. Returns a run summary or a channel summary. By default, "run".
# Returns
# -------
# out_df : pd.Dataframe
# Table summarizing the available runs in the input mth5_obj.
# """
# if summary_type == "run":
# out_df = mth5_obj.run_summary
# else:
# out_df = mth5_obj.channel_summary.to_dataframe()
# out_df["mth5_path"] = str(mth5_obj.filename)
# return out_df