Source code for leiap.time

"""
This file contains functions related to survey search time calculations
"""


import pandas as _pd


#######################################################################################################################


def clean_datetimes(df, dt_col='DataDate'):
    """Filter datetimes and correct for timezone issues

    Parameters
    ----------
    df : pandas DataFrame
        DataFrame of point observations. It is modified in place (a
        'dt_adj' column is added) and also returned.
    dt_col : str
        Column name for datetime data

    Returns
    -------
    df : pandas DataFrame
        Identical to input DataFrame with an added 'dt_adj' column
        representing datetimes filtered and corrected (see Notes). The
        original `dt_col` column is left untouched.

    Notes
    -----
    1. Points collected in 2014 did not have their times recorded (only
       dates); they are assigned a time of 00:00:00 by the database. In the
       new `dt_adj` column, they are assigned NaT type. If you want to access
       just the dates, you can do so with the original `dt_col` (`DataDate`
       by default).
    2. When points are downloaded from handheld GPS devices, their times are
       converted to the timezone of the laptop on which they are downloaded.
       Before we realized this, a lot of points were uploaded on machines set
       to U.S. Pacific Time. As a result, some times need to be adjusted by
       9 hours.
    """
    # Only datetimes later than 2014 carry a usable time of day; everything
    # else becomes NaT in the adjusted column.
    df['dt_adj'] = df[dt_col].where(df[dt_col].dt.year > 2014)

    # Apply the timezone fix to the derived column rather than to `dt_col`.
    # Correcting `dt_col` would leave 'dt_adj' uncorrected and would overwrite
    # the raw datetimes (including the date-only 2014 records).
    df = correct_timezone(df, 'dt_adj')
    return df
#######################################################################################################################
def correct_timezone(df, dt_col='DataDate'):
    """Account for some timezone issues

    Any datetime whose time of day is not strictly between 06:30:00 and
    21:30:00 is pushed forward by 9 hours; all other values are kept as is.

    Parameters
    ----------
    df : pandas DataFrame
        DataFrame of point observations. The `dt_col` column is overwritten
        in place.
    dt_col : str
        Column name for datetime data

    Returns
    -------
    df : pandas DataFrame
        The same DataFrame object, with datetimes fixed so that they all
        range from 06:30:00-21:30:00. In reality, the latest times are
        approx 15:00:00
    """
    stamps = df[dt_col]
    clock = stamps.dt.time

    window_open = _pd.Timestamp('06:30:00').time()
    window_close = _pd.Timestamp('21:30:00').time()

    # Strict comparisons: a value exactly on either bound counts as outside.
    inside_window = (clock > window_open) & (clock < window_close)

    # Replacement values for everything outside the plausible working window.
    shifted = stamps + _pd.DateOffset(hours=9)

    df[dt_col] = stamps.where(inside_window, shifted)
    return df
#######################################################################################################################
def calc_search_time(df, dt_col='dt_adj', warn='enable'):
    """Calculate search times in seconds

    Points are grouped by field and surveyor and ordered by `dt_col` within
    each group. Every point then gets the elapsed time and the straight-line
    distance from the previous point of the same group.

    Parameters
    ----------
    df : pandas DataFrame of points
        Must have columns 'FieldNumber', 'SurveyorName', 'SurveyPointId',
        'Easting', 'Northing' and the column named by `dt_col`.
    dt_col : str
        Column name for datetime data
    warn : {'enable', 'disable'}
        Specify whether or not you want the generic warning message.

    Returns
    -------
    df : pandas DataFrame
        Identical to input DataFrame with added 'search_time' and 'dist'
        columns representing search time in seconds, and distance from the
        previous point (in the units of 'Easting'/'Northing', documented
        as meters). The first point of every field/surveyor group has NaN
        in both columns. An empty input yields an empty result with the two
        columns present.

    Notes
    -----
    This is a naive calculation that doesn't discard any times. You will
    want to filter values further before using in any interpretively
    meaningful way.

    Rows whose 'FieldNumber' or 'SurveyorName' is missing belong to no group
    (pandas drops NaN group keys by default) and receive NaN values.
    """
    import math
    import warnings

    if warn == 'enable':
        warnings.warn(
            'FYI: Search times calculated in a naive way (e.g., including unrealistic values and NaN values). '
            'Consider further filtering before using `search_time` in calculations.',
            stacklevel=2,
        )

    group_keys = ['FieldNumber', 'SurveyorName']

    # Multi-index frame: (field, surveyor, point id), sorted so that ties in
    # the datetime ordering below fall back to point-id order.
    pts_ix = df.set_index(group_keys + ['SurveyPointId']).sort_index()

    pieces = []  # one small frame per field/surveyor pair, indexed by SurveyPointId
    groups = pts_ix.groupby(level=group_keys) if len(pts_ix) else ()
    for _, group in groups:
        # Drop the group levels so only SurveyPointId remains as the index,
        # then order the points chronologically (stable sort => deterministic).
        ordered = (group
                   .reset_index(level=group_keys, drop=True)
                   .sort_values(dt_col, kind='mergesort'))

        # Euclidean distance between consecutive points.
        sq_dist = ordered['Easting'].diff() ** 2 + ordered['Northing'].diff() ** 2

        pieces.append(_pd.DataFrame({
            'search_time': ordered[dt_col].diff().dt.total_seconds(),
            'dist': sq_dist.map(math.sqrt),
        }))

    if not pieces:
        # Nothing to concatenate (empty input or no usable group keys):
        # return the points with all-NaN result columns instead of crashing.
        return df.assign(
            search_time=_pd.Series(index=df.index, dtype='float64'),
            dist=_pd.Series(index=df.index, dtype='float64'),
        )

    # Attach the per-point results back onto the original rows by point id.
    return df.join(_pd.concat(pieces), on='SurveyPointId')
#######################################################################################################################
def filter_times(df, t_col='search_time', t_lim=(1, 900), dist_col='dist', dist_lim=(1, 20)):
    """Put time and distance restrictions on the time data.

    Parameters
    ----------
    df : pandas DataFrame
        Dataset of points
    t_col : str
        Name of column with time info
    t_lim : tuple
        Min and max limits (inclusive) for time
    dist_col : str
        Name of column with distance info
    dist_lim : tuple
        Min and max limits (inclusive) for distance

    Returns
    -------
    filtered : pandas DataFrame
        A subset of the original DataFrame filtered according to the input
        parameters, ordered from the longest to the shortest time
    """
    times = df[t_col]
    dists = df[dist_col]

    has_time = times.notna()                                 # times not N/A
    time_ok = times.between(t_lim[0], t_lim[1])              # time limits
    dist_ok = dists.between(dist_lim[0], dist_lim[1])        # distance limits

    keep = has_time & time_ok & dist_ok
    return df[keep].sort_values(t_col, ascending=False)      # sort by time
#######################################################################################################################