# -*- coding: utf-8 -*-
"""
Created on Tue May 11 15:31:31 2021

:copyright: 
    Jared Peacock (jpeacock@usgs.gov)

:license: MIT

"""
# =============================================================================
# Imports
# =============================================================================
from pathlib import Path
from io import StringIO
import warnings
from copy import deepcopy

# suppress the FutureWarning from pandas about the datetime parser.
warnings.simplefilter(action="ignore", category=FutureWarning)

import pandas as pd
import logging
from datetime import datetime

from mth5.timeseries import ChannelTS, RunTS
from mt_metadata.timeseries import Station, Run, Electric, Magnetic, Auxiliary
from mt_metadata.utils.mttime import MTime

# =============================================================================
def lemi_date_parser(year, month, day, hour, minute, second):
    """
    Convenience function to combine the date-time columns that are output
    by a LEMI into a single column.

    Assumes UTC.

    :param year: year
    :type year: int
    :param month: month
    :type month: int
    :param day: day of the month
    :type day: int
    :param hour: hour in 24 hr format
    :type hour: int
    :param minute: minutes in the hour
    :type minute: int
    :param second: seconds in the minute
    :type second: int
    :return: date and time as a single value
    :rtype: :class:`pandas.Timestamp`
    """

    return pd.to_datetime(
        datetime(
            int(year),
            int(month),
            int(day),
            int(hour),
            int(minute),
            int(second),
        )
    )
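
# A minimal usage sketch of `lemi_date_parser`; the values below are
# illustrative, not taken from a real file:
#
#   >>> lemi_date_parser(2021, 5, 11, 15, 31, 31)
#   Timestamp('2021-05-11 15:31:31')
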
def lemi_position_parser(position):
    """
    Convenience function to parse a LEMI location value into decimal degrees
    (unsigned; use the hemisphere for the sign).

    .. note:: The format of the location is odd in that the LEMI multiplies
     the value by 100 to pack degrees and decimal minutes into a single
     floating point value --> {degrees}{minutes.decimal_minutes}.  For
     example 40.50166 degrees would be represented as 4030.1
     (40 degrees, 30.1 minutes).

    :param position: latitude or longitude value from a LEMI424 file
    :type position: string or float
    :return: unsigned location in decimal degrees
    :rtype: float
    """
    pos = f"{float(position) / 100}".split(".")
    degrees = int(pos[0])
    decimals = float(f"{pos[1][0:2]}.{pos[1][2:]}") / 60
    location = degrees + decimals
    return location
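
# A quick sketch of the packed-position conversion described above; the
# value is illustrative:
#
#   >>> lemi_position_parser(4030.1)   # 40 degrees, 30.1 minutes
#   # -> approximately 40.501667 decimal degrees (unsigned)
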
def lemi_hemisphere_parser(hemisphere):
    """
    Convert a hemisphere letter into a sign [-1, 1].  Assumes the prime
    meridian is 0.

    :param hemisphere: hemisphere string [ 'N' | 'S' | 'E' | 'W' ]
    :type hemisphere: string
    :return: unity with a sign for the given hemisphere
    :rtype: signed integer
    """

    if hemisphere in ["S", "W"]:
        return -1
    return 1
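
# Sketch of combining the two parsers into a signed decimal-degree value,
# e.g. a southern-hemisphere latitude (illustrative values):
#
#   >>> lemi_position_parser(4030.1) * lemi_hemisphere_parser("S")
#   # -> approximately -40.501667 decimal degrees
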
class LEMI424:
    """
    Read in a LEMI424 file.  This is a placeholder until IRIS finalizes
    their reader.

    :param fn: full path to LEMI424 file
    :type fn: :class:`pathlib.Path` or string
    :param sample_rate: sample rate of the file, default is 1.0
    :type sample_rate: float
    :param chunk_size: chunk size for pandas to use; does not change reading
     time much for a single day file.  Default is 8640
    :type chunk_size: integer
    :param file_column_names: column names of the LEMI424 file
    :type file_column_names: list of strings
    :param dtypes: data types for each column
    :type dtypes: dictionary with keys of column names and values of data
     types
    :param data_column_names: same as file_column_names with an added column
     for date, which is the combined date and time columns
    :type data_column_names: list of strings

    :LEMI424 File Column Names:

        - **year**
        - **month**
        - **day**
        - **hour**
        - **minute**
        - **second**
        - **bx**
        - **by**
        - **bz**
        - **temperature_e**
        - **temperature_h**
        - **e1**
        - **e2**
        - **e3**
        - **e4**
        - **battery**
        - **elevation**
        - **latitude**
        - **lat_hemisphere**
        - **longitude**
        - **lon_hemisphere**
        - **n_satellites**
        - **gps_fix**
        - **time_diff**

    :Data Column Names:

        - **date**
        - **bx**
        - **by**
        - **bz**
        - **temperature_e**
        - **temperature_h**
        - **e1**
        - **e2**
        - **e3**
        - **e4**
        - **battery**
        - **elevation**
        - **latitude**
        - **lat_hemisphere**
        - **longitude**
        - **lon_hemisphere**
        - **n_satellites**
        - **gps_fix**
        - **time_diff**

    """

    def __init__(self, fn=None, **kwargs):
        self.logger = logging.getLogger(
            f"{__name__}.{self.__class__.__name__}"
        )
        self.fn = fn
        self.sample_rate = 1.0
        self.chunk_size = 8640
        self.data = None
        self.file_column_names = [
            "year",
            "month",
            "day",
            "hour",
            "minute",
            "second",
            "bx",
            "by",
            "bz",
            "temperature_e",
            "temperature_h",
            "e1",
            "e2",
            "e3",
            "e4",
            "battery",
            "elevation",
            "latitude",
            "lat_hemisphere",
            "longitude",
            "lon_hemisphere",
            "n_satellites",
            "gps_fix",
            "time_diff",
        ]

        self.dtypes = dict(
            [
                ("year", int),
                ("month", int),
                ("day", int),
                ("hour", int),
                ("minute", int),
                ("second", int),
                ("bx", float),
                ("by", float),
                ("bz", float),
                ("temperature_e", float),
                ("temperature_h", float),
                ("e1", float),
                ("e2", float),
                ("e3", float),
                ("e4", float),
                ("battery", float),
                ("elevation", float),
                ("n_satellites", int),
                ("gps_fix", int),
                ("time_diff", float),
            ]
        )

        self.data_column_names = ["date"] + self.file_column_names[6:]

    def __str__(self):
        lines = ["LEMI 424 data", "-" * 20]
        lines.append(f"start: {self.start.isoformat()}")
        lines.append(f"end: {self.end.isoformat()}")
        lines.append(f"N samples: {self.n_samples}")
        lines.append(f"latitude: {self.latitude} (degrees)")
        lines.append(f"longitude: {self.longitude} (degrees)")
        lines.append(f"elevation: {self.elevation} m")

        return "\n".join(lines)

    def __repr__(self):
        return self.__str__()

    def __add__(self, other):
        """
        Can append other LEMI424 objects together as long as the start and
        end times match up.
        """

        if not self._has_data():
            raise ValueError("Data is None, cannot append to. Read a file first.")

        if isinstance(other, LEMI424):
            new = LEMI424()
            new.__dict__.update(self.__dict__)
            new.data = pd.concat([new.data, other.data])
            return new

        elif isinstance(other, pd.DataFrame):
            if not other.columns.equals(self.data.columns):
                raise ValueError("DataFrame columns are not the same.")
            new = LEMI424()
            new.__dict__.update(self.__dict__)
            new.data = pd.concat([new.data, other])
            return new

        else:
            raise ValueError(f"Cannot add {type(other)} to LEMI424.")

    @property
    def data(self):
        """Data represented as a :class:`pandas.DataFrame` with data column
        names"""
        return self._data

    @data.setter
    def data(self, data):
        """
        Make sure that if data is set it is a pandas data frame with the
        proper column names
        """
        if data is None:
            self._data = None

        elif isinstance(data, pd.DataFrame):
            if len(data.columns) == len(self.file_column_names):
                if data.columns.to_list() != self.file_column_names:
                    raise ValueError(
                        "Column names are not the same. "
                        "Check LEMI424.file_column_names for accepted names"
                    )
            elif len(data.columns) == len(self.data_column_names):
                if data.columns.to_list() != self.data_column_names:
                    raise ValueError(
                        "Column names are not the same. "
                        "Check LEMI424.data_column_names for accepted names"
                    )
            self._data = data

    def _has_data(self):
        """check to see if has data or not"""
        if self.data is not None:
            return True
        return False

    @property
    def fn(self):
        """full path to LEMI424 file"""
        return self._fn

    @fn.setter
    def fn(self, value):
        if value is not None:
            value = Path(value)
            if not value.exists():
                raise IOError(f"Could not find {value}")
        self._fn = value

    @property
    def file_size(self):
        """size of file in bytes"""
        if self.fn is not None:
            return self.fn.stat().st_size

    @property
    def start(self):
        """start time of data collection in the LEMI424 file"""
        if self._has_data():
            return MTime(self.data.index[0])

    @property
    def end(self):
        """end time of data collection in the LEMI424 file"""
        if self._has_data():
            return MTime(self.data.index[-1])

    @property
    def latitude(self):
        """median latitude where data have been collected in the LEMI424
        file"""
        if self._has_data():
            return (
                self.data.latitude.median()
                * self.data.lat_hemisphere.median()
            )

    @property
    def longitude(self):
        """median longitude where data have been collected in the LEMI424
        file"""
        if self._has_data():
            return (
                self.data.longitude.median()
                * self.data.lon_hemisphere.median()
            )

    @property
    def elevation(self):
        """median elevation where data have been collected in the LEMI424
        file"""
        if self._has_data():
            return self.data.elevation.median()

    @property
    def n_samples(self):
        """number of samples in the file"""
        if self._has_data():
            return self.data.shape[0]
        elif self.fn is not None and self.fn.exists():
            # estimate from the file size; a LEMI424 row is ~152 bytes
            return round(self.fn.stat().st_size / 152.0)

    @property
    def gps_lock(self):
        """has GPS lock"""
        if self._has_data():
            return self.data.gps_fix.values

    @property
    def station_metadata(self):
        """station metadata as :class:`mt_metadata.timeseries.Station`"""
        s = Station()
        if self._has_data():
            s.location.latitude = self.latitude
            s.location.longitude = self.longitude
            s.location.elevation = self.elevation
            s.time_period.start = self.start
            s.time_period.end = self.end
        s.add_run(self.run_metadata)
        return s

    @property
    def run_metadata(self):
        """run metadata as :class:`mt_metadata.timeseries.Run`"""
        r = Run()
        r.id = "a"
        r.sample_rate = self.sample_rate
        r.data_logger.model = "LEMI424"
        r.data_logger.manufacturer = "LEMI"
        if self._has_data():
            r.data_logger.power_source.voltage.start = self.data.battery.max()
            r.data_logger.power_source.voltage.end = self.data.battery.min()
            r.time_period.start = self.start
            r.time_period.end = self.end
        for ch_aux in ["temperature_e", "temperature_h"]:
            r.add_channel(Auxiliary(component=ch_aux))
        for ch_e in ["e1", "e2"]:
            r.add_channel(Electric(component=ch_e))
        for ch_h in ["bx", "by", "bz"]:
            r.add_channel(Magnetic(component=ch_h))
        return r
    def read(self, fn=None, fast=True):
        """
        Read a LEMI424 file using pandas.  The `fast` way reads in the first
        and last lines to get the start and end times and build a time
        index, then reads in the data while skipping the date-time parsing.
        It checks that the expected number of points is correct; if not, it
        falls back to the slower way, which uses the date-time parser to
        ensure any time gaps are respected.

        :param fn: full path to file, defaults to None.  Uses LEMI424.fn
         if not provided
        :type fn: string or :class:`pathlib.Path`, optional
        :param fast: read the fast way (True) or not (False)
        :type fast: boolean, optional
        :return: None, populates LEMI424.data
        """

        if fn is not None:
            self.fn = fn

        if not self.fn.exists():
            msg = "Could not find file %s"
            self.logger.error(msg, self.fn)
            raise IOError(msg % self.fn)

        if fast:
            try:
                self.read_metadata()
                data = pd.read_csv(
                    self.fn,
                    delimiter=r"\s+",
                    names=self.file_column_names,
                    dtype=self.dtypes,
                    usecols=(
                        "bx",
                        "by",
                        "bz",
                        "temperature_e",
                        "temperature_h",
                        "e1",
                        "e2",
                        "e3",
                        "e4",
                        "battery",
                        "elevation",
                        "latitude",
                        "lat_hemisphere",
                        "longitude",
                        "lon_hemisphere",
                        "n_satellites",
                        "gps_fix",
                        "time_diff",
                    ),
                    converters={
                        "latitude": lemi_position_parser,
                        "longitude": lemi_position_parser,
                        "lat_hemisphere": lemi_hemisphere_parser,
                        "lon_hemisphere": lemi_hemisphere_parser,
                    },
                )
                # 1 Hz data --> one time stamp every 1,000,000,000 ns
                time_index = pd.date_range(
                    start=self.start.iso_no_tz,
                    end=self.end.iso_no_tz,
                    freq="1000000000N",
                )
                if time_index.size != data.shape[0]:
                    raise ValueError(
                        "Missing a time stamp, use read with fast=False"
                    )
                data.index = time_index
                self.data = data
                return
            except ValueError:
                self.logger.warning(
                    "Data is missing a time stamp, reading in slow mode"
                )

        # read in chunks; this doesn't really speed things up much as most
        # of the compute time is used in the date-time parsing.
        if self.n_samples > self.chunk_size:
            st = MTime().now()
            dfs = list(
                pd.read_csv(
                    self.fn,
                    delimiter=r"\s+",
                    names=self.file_column_names,
                    dtype=self.dtypes,
                    parse_dates={
                        "date": [
                            "year",
                            "month",
                            "day",
                            "hour",
                            "minute",
                            "second",
                        ]
                    },
                    date_parser=lemi_date_parser,
                    converters={
                        "latitude": lemi_position_parser,
                        "longitude": lemi_position_parser,
                        "lat_hemisphere": lemi_hemisphere_parser,
                        "lon_hemisphere": lemi_hemisphere_parser,
                    },
                    index_col="date",
                    chunksize=self.chunk_size,
                )
            )
            self.data = pd.concat(dfs)
            et = MTime().now()
            self.logger.debug(
                f"Reading {self.fn.name} took {et - st:.2f} seconds"
            )

        else:
            st = MTime().now()
            self.data = pd.read_csv(
                self.fn,
                delimiter=r"\s+",
                names=self.file_column_names,
                dtype=self.dtypes,
                parse_dates={
                    "date": [
                        "year",
                        "month",
                        "day",
                        "hour",
                        "minute",
                        "second",
                    ]
                },
                date_parser=lemi_date_parser,
                converters={
                    "latitude": lemi_position_parser,
                    "longitude": lemi_position_parser,
                    "lat_hemisphere": lemi_hemisphere_parser,
                    "lon_hemisphere": lemi_hemisphere_parser,
                },
                index_col="date",
            )
            et = MTime().now()
            self.logger.debug(
                f"Reading {self.fn.name} took {et - st:.2f} seconds"
            )
    def read_metadata(self):
        """
        Read only the first and last rows to get the important metadata to
        use in the collection.
        """

        with open(self.fn) as fid:
            first_line = fid.readline()
            for line in fid:
                pass  # iterate to the end of the file
            last_line = line

        lines = StringIO(f"{first_line}\n{last_line}")

        self.data = pd.read_csv(
            lines,
            delimiter=r"\s+",
            names=self.file_column_names,
            dtype=self.dtypes,
            parse_dates={
                "date": [
                    "year",
                    "month",
                    "day",
                    "hour",
                    "minute",
                    "second",
                ]
            },
            date_parser=lemi_date_parser,
            converters={
                "latitude": lemi_position_parser,
                "longitude": lemi_position_parser,
                "lat_hemisphere": lemi_hemisphere_parser,
                "lon_hemisphere": lemi_hemisphere_parser,
            },
            index_col="date",
        )
    def to_run_ts(self, fn=None, e_channels=["e1", "e2"]):
        """
        Create a :class:`mth5.timeseries.RunTS` object from the data

        :param fn: full path to file, defaults to None.  Will use LEMI424.fn
         if None.
        :type fn: string or :class:`pathlib.Path`, optional
        :param e_channels: columns for the electric channels to use,
         defaults to ["e1", "e2"]
        :type e_channels: list of strings, optional
        :return: RunTS object
        :rtype: :class:`mth5.timeseries.RunTS`
        """
        ch_list = []
        for comp in (
            ["bx", "by", "bz"]
            + e_channels
            + ["temperature_e", "temperature_h"]
        ):
            if comp[0] in ["h", "b"]:
                ch = ChannelTS("magnetic")
            elif comp[0] in ["e"]:
                ch = ChannelTS("electric")
            else:
                ch = ChannelTS("auxiliary")

            ch.sample_rate = self.sample_rate
            ch.start = self.start
            ch.ts = self.data[comp].values
            ch.component = comp
            ch_list.append(ch)

        return RunTS(
            array_list=ch_list,
            station_metadata=self.station_metadata,
            run_metadata=self.run_metadata,
        )
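
# A minimal usage sketch of the LEMI424 class; the file path below is
# hypothetical:
#
#   >>> lemi_obj = LEMI424(fn=r"/data/station_01/lemi424_day_1.TXT")
#   >>> lemi_obj.read(fast=True)
#   >>> print(lemi_obj)                 # start/end, location, N samples
#   >>> run_ts = lemi_obj.to_run_ts(e_channels=["e1", "e2"])
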
# =============================================================================
# define the reader
# =============================================================================
def read_lemi424(
    fn, e_channels=["e1", "e2"], fast=True, logger_file_handler=None
):
    """
    Read a LEMI 424 TXT file.

    :param fn: input file name, or a list of file names to be read into a
     single run
    :type fn: string or Path, or a list of them
    :param e_channels: a list of electric channels to read,
     defaults to ["e1", "e2"]
    :type e_channels: list of strings, optional
    :param fast: read the fast way (True) or not (False), defaults to True
    :type fast: boolean, optional
    :param logger_file_handler: handler to add to the object's logger,
     defaults to None
    :return: A RunTS object with appropriate metadata
    :rtype: :class:`mth5.timeseries.RunTS`
    """

    if not isinstance(fn, (list, tuple)):
        fn = [fn]

    txt_obj = LEMI424(fn[0])
    txt_obj.read(fast=fast)
    if logger_file_handler:
        txt_obj.logger.addHandler(logger_file_handler)

    # read a list of files into a single run, skipping the first file,
    # which has already been read
    if len(fn) > 1:
        for txt_file in fn[1:]:
            other = LEMI424(txt_file)
            other.read(fast=fast)
            txt_obj += other

    return txt_obj.to_run_ts(e_channels=e_channels)
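
# A minimal usage sketch of `read_lemi424` for a multi-file run; the paths
# below are hypothetical:
#
#   >>> from mth5.io.lemi.lemi424 import read_lemi424
#   >>> run_ts = read_lemi424(
#   ...     [r"/data/lemi424_day_1.TXT", r"/data/lemi424_day_2.TXT"],
#   ...     e_channels=["e1", "e2"],
#   ...     fast=True,
#   ... )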