Source code for leiap.io

"""
This file contains functions related to I/O from the database
"""

import pandas as _pd
from leiap.time import *


#######################################################################################################################


def get_credentials(credentials_path='credentials.json'):
    """Access database credentials from JSON file

    Parameters
    ----------
    credentials_path : str
        Location of the credentials JSON file

    Returns
    -------
    credentials : dict
        Dictionary containing the database connection details
    """
    import json
    with open(credentials_path, 'r') as f:
        credentials = json.load(f)
    return credentials
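
# Illustrative sketch (not part of the library): the keys read by connect2db()
# below imply a credentials.json shaped roughly like this; the values are
# placeholders, not real connection details.
#
#     {
#         "database": {
#             "server_name": "<server>",
#             "db_name": "<database>",
#             "user": "<username>",
#             "password": "<password>"
#         }
#     }
#
#     creds = get_credentials('credentials.json')
#     creds['database']['db_name']
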
#######################################################################################################################
def connect2db(driver='{ODBC Driver 17 for SQL Server}', **kwargs):
    """Open a connection to the database

    Parameters
    ----------
    driver : str, optional
        Database driver needed to connect
    **kwargs
        Optional arguments that are passed to get_credentials()

    Returns
    -------
    connection : MS SQL database connection
    """
    import pyodbc
    from sys import platform

    if platform in ("darwin", "linux", "linux2"):
        driver = "libmsodbcsql.17.dylib"  # may not be necessary for all macOS/Linux users

    credentials = get_credentials(**kwargs)
    connection = pyodbc.connect('DRIVER=' + driver +
                                ';SERVER=' + credentials['database']['server_name'] +
                                ';DATABASE=' + credentials['database']['db_name'] +
                                ';UID=' + credentials['database']['user'] +
                                ';PWD=' + credentials['database']['password'])
    return connection
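
# Illustrative usage sketch (assumes a working ODBC driver and a valid
# credentials file; 'my_credentials.json' is a hypothetical path):
#
#     conn = connect2db()                                        # default driver and credentials path
#     conn = connect2db(credentials_path='my_credentials.json')  # kwargs are forwarded to get_credentials()
#     conn.close()
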
#######################################################################################################################
def db_query(query_text, **kwargs):
    """Send any SQL query to the database

    Parameters
    ----------
    query_text : str
        Full SQL query to pass to the database
    **kwargs
        Optional arguments that are passed to get_credentials()

    Returns
    -------
    df : pandas DataFrame
        DataFrame of query results
    """
    conn = connect2db(**kwargs)
    df = _pd.read_sql(query_text, conn)
    conn.close()
    return df
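
# Illustrative usage sketch (the query below is an arbitrary example against
# the SurveyPoint table used elsewhere in this module):
#
#     df = db_query("SELECT SurveyPointId, Northing, Easting FROM SurveyPoint")
#     df.head()
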
#######################################################################################################################
def get_points(years=None, **kwargs):
    """Load a DataFrame of points

    Parameters
    ----------
    years : list
        List of desired years; can be strings or integers
    **kwargs
        Optional arguments that are passed to get_credentials()

    Returns
    -------
    points_df : pandas DataFrame
        DataFrame of all points
    """
    query = """SELECT SurveyPoint.*, Field.FieldNumber, Surveyor.SurveyorName
               FROM SurveyPoint
               LEFT JOIN Field ON SurveyPoint.FieldId=Field.FieldId
               LEFT JOIN Surveyor ON SurveyPoint.SurveyorId=Surveyor.SurveyorId
            """
    points_df = db_query(query, **kwargs)
    points_df = points_df.drop(columns=['SherdCount', 'tempFixIDs'])
    if years:
        points_df = points_df[points_df.DataDate.dt.year.isin(years)]
    return points_df
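
# Illustrative usage sketch (years are matched against DataDate.dt.year;
# 2017 is an arbitrary example year):
#
#     pts = get_points()                    # all survey points
#     pts_2017 = get_points(years=[2017])   # only points recorded in 2017
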
#######################################################################################################################
def get_points_simple(**kwargs):
    """Load a DataFrame of points with the most typical query

    Parameters
    ----------
    **kwargs
        Optional arguments that are passed to get_credentials()

    Returns
    -------
    points_df : pandas DataFrame
        DataFrame of all points
    """
    import warnings
    warnings.warn(
        """get_points_simple() is no longer preferred.
        Use get_points() with appropriate parameters instead.""",
        DeprecationWarning)
    query = """SELECT SurveyPoint.*, Field.FieldNumber, Surveyor.SurveyorName
               FROM SurveyPoint
               LEFT JOIN Field ON SurveyPoint.FieldId=Field.FieldId
               LEFT JOIN Surveyor ON SurveyPoint.SurveyorId=Surveyor.SurveyorId
            """
    points_df = db_query(query, **kwargs)
    return points_df
#######################################################################################################################
def get_points_by_year(years, **kwargs):
    """Load a DataFrame of points with the most typical query for specified year(s)

    Parameters
    ----------
    years : list
        List of desired years; can be strings or integers
    **kwargs
        Optional arguments that are passed to get_credentials()

    Returns
    -------
    points_df : pandas DataFrame
        DataFrame of all points for specified year(s)
    """
    import warnings
    warnings.warn(
        """get_points_by_year() is no longer preferred.
        Use get_points() with appropriate parameters instead.""",
        DeprecationWarning)
    points_df = get_points_simple(**kwargs)
    points_df = points_df[points_df.DataDate.dt.year.isin(years)]
    return points_df
#######################################################################################################################
def get_artifacts(sections=['base'], years=None, include_discards=False, **kwargs):
    """Load a DataFrame of artifacts

    Parameters
    ----------
    sections : list of some set of {'all', 'base', 'metrics', 'classify', 'production',
                                    'tile_brick', 'waretypes', 'vesselparts', 'macro_fabric'}
        Sections to include in the output DataFrame. Each section refers to a group of column names.
    years : list
        List of desired years; can be strings or integers
    include_discards : bool
        If True, return all records, even artifacts marked as Discarded.
    **kwargs
        Optional arguments that are passed to get_credentials()

    Returns
    -------
    artifacts_df : pandas DataFrame
        DataFrame of all artifacts
    """
    # rebuild the list rather than calling sections.remove('base') so the caller's
    # list (and the mutable default) is never modified; base columns are always included
    sections = [s for s in sections if s != 'base']
    if 'all' in sections:
        sections = ['metrics', 'classify', 'production', 'tile_brick',
                    'waretypes', 'vesselparts', 'macro_fabric']

    base = ['SherdId', 'FieldId', 'SurveyorId', 'PointId', 'SherdNum', 'SurveyPointId',
            'ChangedDate', 'FieldNumber', 'Northing', 'Easting', 'SurveyorName']
    col_grps = {'metrics': ['Length', 'Width', 'Thickness', 'Weight'],
                'classify': ['MaterialTypeName', 'ManufactureName', 'FabricTypeName', 'Form', 'Note',
                             'VesselPartOther', 'WareTypeOther'],
                'production': ['Chronology', 'CultureOther', 'RegionOther', 'EnteredDate',
                               'EarlyChrono', 'LateChrono', 'Catalan'],
                'tile_brick': ['TileType', 'TileIsStamped', 'TileIsCurved', 'BrickIsStamped'],
                'macro_fabric': ['SherdCondition', 'SurfaceTexture', 'SurfTextureOther', 'SurfaceCondition',
                                 'SurvCondOther', 'SurfaceTreatExt', 'STEOther', 'SurfaceTreatInt', 'STIOther',
                                 'HardnessSurface', 'HardnessCore', 'FiringCore', 'ColorExt', 'ColorInt',
                                 'ColorCore', 'InclusionOther', 'DomInclusion', 'DomInclusionOther',
                                 'InclusionSorting', 'InclusionShape', 'InclusionDensity', 'InclusionSize',
                                 'InclusionLargestSize', 'InclusionTexture']
                }

    query = """SELECT Sherd.*, FabricType.*, Field.FieldNumber,
               SurveyPoint.Northing, SurveyPoint.Easting,
               Surveyor.SurveyorName, ManufactureMethod.ManufactureName
               FROM Sherd
               LEFT JOIN Field ON Sherd.FieldId=Field.FieldId
               LEFT JOIN FabricType ON Sherd.FabricType=FabricType.FabricTypeId
               LEFT JOIN SurveyPoint ON SurveyPoint.SurveyPointId = Sherd.SurveyPointId
               LEFT JOIN Surveyor ON SurveyPoint.SurveyorId=Surveyor.SurveyorId
               LEFT JOIN ManufactureMethod ON Sherd.ManufactureMethod=ManufactureMethod.ManufactureID
            """
    artifacts_df = db_query(query, **kwargs)
    if include_discards is False:
        artifacts_df = artifacts_df[artifacts_df['FabricTypeName'] != 'Discarded']

    cols = list(base)
    for s in sections:
        if s in col_grps.keys():
            cols += col_grps[s]
    artifacts_df = artifacts_df[cols]

    if 'waretypes' in sections:
        waretypes = db_query("""SELECT swt.SherdId, wt.WareTypeName
                                FROM SherdWareType AS swt
                                LEFT JOIN WareType AS wt ON swt.WareTypeId=wt.WareTypeId
                                LEFT JOIN Sherd ON Sherd.SherdId=swt.SherdId
                             """)
        dummies = _pd.get_dummies(waretypes['WareTypeName']).rename(  # one hot encoding
            columns={'other': 'other_waretype', 'unknown': 'unknown_waretype'})
        waretypes = waretypes.merge(dummies, how='left', left_index=True, right_index=True)  # attach to original
        waretypes = waretypes.groupby('SherdId').sum()  # groupby to collapse duplicate SherdIds
        artifacts_df = artifacts_df.merge(waretypes, how='left', left_on='SherdId', right_index=True)  # merge with df

    if 'vesselparts' in sections:
        vessel_parts = db_query("""SELECT svp.SherdId, vp.VesselPartName
                                   FROM SherdVesselPart AS svp
                                   LEFT JOIN VesselPart AS vp ON svp.VesselPartId=vp.VesselPartId
                                   LEFT JOIN Sherd ON Sherd.SherdId=svp.SherdId
                                """)
        dummies = _pd.get_dummies(vessel_parts['VesselPartName']).rename(  # one hot encoding
            columns={'other': 'other_vesselpart', 'unknown': 'unknown_vesselpart'})
        vessel_parts = vessel_parts.merge(dummies, how='left', left_index=True, right_index=True)  # attach to original
        vessel_parts = vessel_parts.groupby('SherdId').sum()  # groupby to collapse duplicate SherdIds
        artifacts_df = artifacts_df.merge(vessel_parts, how='left', left_on='SherdId', right_index=True)  # merge with df

    if years:
        artifacts_df = artifacts_df[artifacts_df.ChangedDate.dt.year.isin(years)]
    return artifacts_df
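
# Illustrative usage sketch (section names come from the docstring above;
# 2017 is an arbitrary example year):
#
#     arts = get_artifacts()                                            # base columns only
#     arts = get_artifacts(sections=['base', 'metrics', 'waretypes'])   # add measurements and ware-type dummies
#     arts = get_artifacts(sections=['all'], years=[2017], include_discards=True)
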
#######################################################################################################################
def get_artifacts_simple(include_discards=False, **kwargs):
    """Load a DataFrame of artifacts with the most typical query

    Parameters
    ----------
    include_discards : bool
        If True, return all records, even artifacts marked as Discarded.
    **kwargs
        Optional arguments that are passed to get_credentials()

    Returns
    -------
    artifacts_df : pandas DataFrame
        DataFrame of all artifacts
    """
    import warnings
    warnings.warn(
        """get_artifacts_simple() is no longer preferred.
        Use get_artifacts() with appropriate parameters instead.""",
        DeprecationWarning)
    query = """SELECT Sherd.*, FabricType.*, Field.FieldNumber,
               SurveyPoint.Northing, SurveyPoint.Easting, Surveyor.SurveyorName
               FROM Sherd
               LEFT JOIN Field ON Sherd.FieldId=Field.FieldId
               LEFT JOIN FabricType ON Sherd.FabricType=FabricType.FabricTypeId
               LEFT JOIN SurveyPoint ON SurveyPoint.SurveyPointId = Sherd.SurveyPointId
               LEFT JOIN Surveyor ON SurveyPoint.SurveyorId=Surveyor.SurveyorId
               LEFT JOIN ManufactureMethod ON Sherd.ManufactureMethod=ManufactureMethod.ManufactureID
            """
    artifacts_df = db_query(query, **kwargs)
    if include_discards is False:
        artifacts_df = artifacts_df[artifacts_df['FabricTypeName'] != 'Discarded']
    return artifacts_df
#######################################################################################################################
def get_artifacts_by_year(years, discards=False, **kwargs):
    """Load a DataFrame of artifacts with the most typical query for specified year(s)

    Parameters
    ----------
    years : list
        List of desired years; can be strings or integers
    discards : bool
        If True, return all records, even artifacts marked as Discarded.
    **kwargs
        Optional arguments that are passed to get_credentials()

    Returns
    -------
    artifacts_df : pandas DataFrame
        DataFrame of all artifacts for specified year(s)
    """
    import warnings
    warnings.warn(
        """get_artifacts_by_year() is no longer preferred.
        Use get_artifacts() with appropriate parameters instead.""",
        DeprecationWarning)
    artifacts_df = get_artifacts_simple(include_discards=discards, **kwargs)
    artifacts_df = artifacts_df[artifacts_df.ChangedDate.dt.year.isin(years)]
    return artifacts_df
#######################################################################################################################
def get_productions_simple(**kwargs):
    """Load a DataFrame of productions with the most typical query

    Parameters
    ----------
    **kwargs
        Optional arguments that are passed to get_credentials()

    Returns
    -------
    prods_df : pandas DataFrame
        DataFrame of all productions
    """
    query = """SELECT * FROM FabricType """
    prods_df = db_query(query, **kwargs)
    return prods_df
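
# Illustrative usage sketch: the full FabricType lookup table as a DataFrame.
#
#     prods = get_productions_simple()
#     prods.columns
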
#######################################################################################################################
def get_production_cts_wts(**kwargs):
    """Load a DataFrame of all points with columns for counts and weights of all productions

    Parameters
    ----------
    **kwargs
        Optional arguments that are passed to get_credentials()

    Returns
    -------
    cts_wts : pandas DataFrame
        DataFrame of all points with counts and weights for all productions

    Notes
    -----
    Also pulls in some non-vessel artifact types (e.g., tile, brick, other construction material)
    """
    artifacts = get_artifacts(sections=['metrics', 'classify', 'production', 'tile_brick'], **kwargs)
    points = get_points(**kwargs)

    # For artifacts without a Production (i.e., tiles, bricks, etc), use their MaterialType as their Production.
    # If MaterialType is Tile, use TileType ('Tegula' or 'Imbrex')
    artifacts['Production'] = artifacts['FabricTypeName'].where(
        ~artifacts['FabricTypeName'].isnull(),
        artifacts['MaterialTypeName'].where(artifacts['TileType'].isnull(), artifacts['TileType']))
    artifacts['Production'] = artifacts['Production'].where(
        ~((artifacts.Note.str.contains('signinum')) | (artifacts.Note.str.contains('Signinum'))),
        'Opus signinum')

    # summarize artifacts by point
    art_cts = artifacts.groupby(['SurveyPointId', 'Production']).agg({'Production': 'size',
                                                                      'Weight': 'sum'}).unstack()

    # merge Production (counts) and Weight (weights) with the points df
    # do this in two steps so that we can append the _ct and _wt suffixes to columns
    cts = _pd.merge(points, art_cts['Production'], how='left', left_on='SurveyPointId', right_index=True)
    cts_wts = _pd.merge(cts, art_cts['Weight'], how='left',
                        left_on='SurveyPointId', right_index=True, suffixes=('_ct', '_wt'))
    return cts_wts
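
# Illustrative usage sketch: one row per point, with per-production count and
# weight columns created by the ('_ct', '_wt') suffixes in the merge above.
#
#     cts_wts = get_production_cts_wts()
#     [c for c in cts_wts.columns if c.endswith('_ct')]   # count column for each production
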
#######################################################################################################################
def get_points_times(warn='enable', **kwargs):
    """Load a DataFrame of points with datetimes cleaned and search times calculated

    Parameters
    ----------
    warn : {'enable', 'disable'}
        Argument passed to the `calc_search_time()` function specifying whether or not to print
        a generic warning message.
    **kwargs
        Optional arguments that are passed to get_credentials()

    Returns
    -------
    pts : pandas DataFrame
        DataFrame of all points with adjusted datetimes and search times
    """
    pts = calc_search_time(clean_datetimes(get_points(**kwargs)), warn=warn)
    return pts
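
# Illustrative usage sketch (clean_datetimes() and calc_search_time() come from
# leiap.time via the star import at the top of this module):
#
#     pts = get_points_times(warn='disable')   # suppress the generic warning message
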
#######################################################################################################################