Source code for leiap.io

"""
This file contains functions related to I/O from the database
"""

import pandas as _pd
from leiap.time import *


#######################################################################################################################


[docs]def get_credentials(credentials_path='credentials.json'):
    """Access database credentials from JSON file

    Parameters
    ----------
    credentials_path : str
        Location of the credentials JSON file

    Returns
    -------
    credentials : dict
        Dictionary containing the database connection details
    """
    import json
    with open(credentials_path, 'r') as f:
        credentials = json.load(f)

    return credentials


#######################################################################################################################


[docs]def connect2db(driver='{ODBC Driver 17 for SQL Server}', **kwargs):
    """Open a connection to the database
    
    Parameters
    ----------
    driver : str, optional
        Database driver needed to connect
    **kwargs
        Optional arguments that are passed to get_credentials()

    Returns
    -------
    connection : MS SQL database connection
    """
    import pyodbc
    from sys import platform

    if platform == "darwin" or platform == "linux" or platform == "linux2":
        driver = "libmsodbcsql.17.dylib"  # may not be necessary for all macOS/Linux users

    credentials = get_credentials(**kwargs)

    connection = pyodbc.connect('DRIVER=' + driver +
                                ';SERVER=' + credentials['database']['server_name'] +
                                ';DATABASE=' + credentials['database']['db_name'] +
                                ';UID=' + credentials['database']['user'] +
                                ';PWD=' + credentials['database']['password'])

    return connection


#######################################################################################################################


[docs]def db_query(query_text, **kwargs):
    """Send any SQL query to the database
    
    Parameters
    ----------
    query_text : str
        Full SQL query to pass to the database
    **kwargs
        Optional arguments that are passed to get_credentials()

    Returns
    -------
    df : pandas DataFrame
        DataFrame of query results
    """
    conn = connect2db(**kwargs)
    df = _pd.read_sql(query_text, conn)
    conn.close()
    return df


#######################################################################################################################


[docs]def get_points(years=None, **kwargs):
    """Load a DataFrame of points

    Parameters
    ----------
    years : list
        List of desired years; can be strings or integers
    **kwargs
        Optional arguments that are passed to get_credentials()

    Returns
    -------
    points_df : pandas DataFrame
        DataFrame of all points
    """
    query = """SELECT SurveyPoint.*, Field.FieldNumber, Surveyor.SurveyorName 
               FROM SurveyPoint 
               LEFT JOIN Field ON SurveyPoint.FieldId=Field.FieldId
               LEFT JOIN Surveyor ON SurveyPoint.SurveyorId=Surveyor.SurveyorId
               """
    points_df = db_query(query, **kwargs)
    points_df = points_df.drop(columns=['SherdCount', 'tempFixIDs'])

    if years:
        points_df = points_df[points_df.DataDate.dt.year.isin(years)]

    return points_df


#######################################################################################################################


[docs]def get_points_simple(**kwargs):
    """Load a DataFrame of points with the most typical query

    Parameters
    ----------
    **kwargs
        Optional arguments that are passed to get_credentials()

    Returns
    -------
    points_df : pandas DataFrame
        DataFrame of all points
    """
    import warnings
    warnings.warn(
        """get_points_simple() is no longer preferred. Use get_points() with appropriate parameters instead.""",
        DeprecationWarning)

    query = """SELECT SurveyPoint.*, Field.FieldNumber, Surveyor.SurveyorName 
               FROM SurveyPoint 
               LEFT JOIN Field ON SurveyPoint.FieldId=Field.FieldId
               LEFT JOIN Surveyor ON SurveyPoint.SurveyorId=Surveyor.SurveyorId
               """
    points_df = db_query(query, **kwargs)
    return points_df


#######################################################################################################################


[docs]def get_points_by_year(years, **kwargs):
    """Load a DataFrame of points with the most typical query for specified year(s)

    Parameters
    ----------
    years : list
        List of desired years; can be strings or integers
    **kwargs
        Optional arguments that are passed to get_credentials()

    Returns
    -------
    points_df : pandas DataFrame
        DataFrame of all points for specified year(s)
    """
    import warnings
    warnings.warn(
        """get_points_by_year() is no longer preferred. Use get_points() with appropriate parameters instead.""",
        DeprecationWarning)

    points_df = get_points_simple(**kwargs)
    points_df = points_df[points_df.DataDate.dt.year.isin(years)]
    return points_df


#######################################################################################################################


[docs]def get_artifacts(sections=['base'], years=None, include_discards=False, **kwargs):
    """Load a DataFrame of artifacts

    Parameters
    ----------
    sections : list of some set of
        {'all', 'base', 'metrics', 'classify', 'production', 'tile_brick', 'waretypes', 'vesselparts', 'macro_fabric'}
        Sections to include in the output DataFrame. Each section refers to a group of column names.
    years : list
        List of desired years; can be strings or integers
    include_discards : bool
        If True, return all records, even artifacts marked as Discarded.
    **kwargs
        Optional arguments that are passed to get_credentials()

    Returns
    -------
    artifacts_df : pandas DataFrame
        DataFrame of all artifacts

    """
    if 'base' in sections:
        sections.remove('base')
    if 'all' in sections:
        sections = ['metrics', 'classify', 'production', 'tile_brick', 'waretypes', 'vesselparts', 'macro_fabric']

    base = ['SherdId', 'FieldId', 'SurveyorId', 'PointId', 'SherdNum', 'SurveyPointId',
            'ChangedDate', 'FieldNumber', 'Northing', 'Easting', 'SurveyorName']

    col_grps = {'metrics': ['Length', 'Width', 'Thickness', 'Weight'],
                'classify': ['MaterialTypeName', 'ManufactureName', 'FabricTypeName', 'Form', 'Note',
                             'VesselPartOther', 'WareTypeOther'],
                'production': ['Chronology', 'CultureOther', 'RegionOther', 'EnteredDate', 'EarlyChrono',
                               'LateChrono', 'Catalan'],
                'tile_brick': ['TileType', 'TileIsStamped', 'TileIsCurved', 'BrickIsStamped'],
                'macro_fabric': ['SherdCondition', 'SurfaceTexture', 'SurfTextureOther', 'SurfaceCondition',
                                 'SurvCondOther', 'SurfaceTreatExt', 'STEOther', 'SurfaceTreatInt', 'STIOther',
                                 'HardnessSurface', 'HardnessCore', 'FiringCore', 'ColorExt', 'ColorInt',
                                 'ColorCore', 'InclusionOther', 'DomInclusion', 'DomInclusionOther',
                                 'InclusionSorting', 'InclusionShape', 'InclusionDensity', 'InclusionSize',
                                 'InclusionLargestSize', 'InclusionTexture']
            }

    query = """SELECT Sherd.*, FabricType.*, Field.FieldNumber, SurveyPoint.Northing, SurveyPoint.Easting, 
                      Surveyor.SurveyorName, ManufactureMethod.ManufactureName
               FROM Sherd
               LEFT JOIN Field ON Sherd.FieldId=Field.FieldId
               LEFT JOIN FabricType ON Sherd.FabricType=FabricType.FabricTypeId
               LEFT JOIN SurveyPoint ON SurveyPoint.SurveyPointId = Sherd.SurveyPointId
               LEFT JOIN Surveyor ON SurveyPoint.SurveyorId=Surveyor.SurveyorId
               LEFT JOIN ManufactureMethod ON Sherd.ManufactureMethod=ManufactureMethod.ManufactureID
               """
    artifacts_df = db_query(query, **kwargs)

    if include_discards is False:
        artifacts_df = artifacts_df[artifacts_df['FabricTypeName'] != 'Discarded']

    cols = base
    for s in sections:
        if s in col_grps.keys():
            cols += col_grps[s]

    artifacts_df = artifacts_df[cols]

    if 'waretypes' in sections:
        waretypes = db_query("""SELECT swt.SherdId, wt.WareTypeName
                                     FROM SherdWareType AS swt
                                     LEFT JOIN WareType AS wt ON swt.WareTypeId=wt.WareTypeId
                                     LEFT JOIN Sherd ON Sherd.SherdId=swt.SherdId
                                     """)

        dummies = _pd.get_dummies(waretypes['WareTypeName']).rename(  # one hot encoding
            columns={'other': 'other_waretype', 'unknown': 'unknown_waretype'})
        waretypes = waretypes.merge(dummies, how='left', left_index=True, right_index=True)  # attach to original
        waretypes = waretypes.groupby('SherdId').sum()  # groupby to collapse duplicate SherdIds
        artifacts_df = artifacts_df.merge(waretypes, how='left', left_on='SherdId', right_index=True)  # merge with df

    if 'vesselparts' in sections:
        vessel_parts = db_query("""SELECT svp.SherdId, vp.VesselPartName
                                     FROM SherdVesselPart AS svp
                                     LEFT JOIN VesselPart AS vp ON svp.VesselPartId=vp.VesselPartId
                                     LEFT JOIN Sherd ON Sherd.SherdId=svp.SherdId
                                     """)

        dummies = _pd.get_dummies(vessel_parts['VesselPartName']).rename(
            columns={'other': 'other_vesselpart', 'unknown': 'unknown_vesselpart'})  # one hot encoding
        vessel_parts = vessel_parts.merge(dummies, how='left', left_index=True, right_index=True)  # attach to original
        vessel_parts = vessel_parts.groupby('SherdId').sum()  # groupby to collapse duplicate SherdIds
        artifacts_df = artifacts_df.merge(vessel_parts, how='left', left_on='SherdId',
                                          right_index=True)  # merge with df

    if years:
        artifacts_df = artifacts_df[artifacts_df.ChangedDate.dt.year.isin(years)]

    return artifacts_df


#######################################################################################################################


[docs]def get_artifacts_simple(include_discards=False, **kwargs):
    """Load a DataFrame of artifacts with the most typical query
    
    Parameters
    ----------
    include_discards : bool
        If True, return all records, even artifacts marked as Discarded.
    **kwargs
        Optional arguments that are passed to get_credentials()
    
    Returns
    -------
    artifacts_df : pandas DataFrame
        DataFrame of all artifacts

    """
    import warnings
    warnings.warn(
        """get_artifacts_simple() is no longer preferred. Use get_artifacts() with appropriate parameters instead.""", DeprecationWarning)
    query = """SELECT Sherd.*, FabricType.*, Field.FieldNumber, SurveyPoint.Northing, SurveyPoint.Easting, 
                      Surveyor.SurveyorName 
               FROM Sherd
               LEFT JOIN Field ON Sherd.FieldId=Field.FieldId
               LEFT JOIN FabricType ON Sherd.FabricType=FabricType.FabricTypeId
               LEFT JOIN SurveyPoint ON SurveyPoint.SurveyPointId = Sherd.SurveyPointId
               LEFT JOIN Surveyor ON SurveyPoint.SurveyorId=Surveyor.SurveyorId
               LEFT JOIN ManufactureMethod ON Sherd.ManufactureMethod=ManufactureMethod.ManufactureID
               """
    artifacts_df = db_query(query, **kwargs)
    if include_discards is False:
        artifacts_df = artifacts_df[artifacts_df['FabricTypeName'] != 'Discarded']

    return artifacts_df


#######################################################################################################################


[docs]def get_artifacts_by_year(years, discards=False, **kwargs):
    """Load a DataFrame of artifacts with the most typical query for specified year(s)
    
    Parameters
    ----------
    years : list
        List of desired years; can be strings or integers
    discards : bool
        If True, return all records, even artifacts marked as Discarded.    
    **kwargs
        Optional arguments that are passed to get_credentials()

    Returns
    -------
    artifacts_df : pandas DataFrame
        DataFrame of all artifacts for specified year(s)
        
    """
    import warnings
    warnings.warn(
        """get_artifacts_by_year() is no longer preferred. Use get_artifacts() with appropriate parameters instead.""",
        DeprecationWarning)
    artifacts_df = get_artifacts_simple(include_discards=discards, **kwargs)
    artifacts_df = artifacts_df[artifacts_df.ChangedDate.dt.year.isin(years)]
    return artifacts_df


#######################################################################################################################


[docs]def get_productions_simple(**kwargs):
    """Load a DataFrame of productions with the most typical query

    Parameters
    ----------
    **kwargs
        Optional arguments that are passed to get_credentials()

    Returns
    -------
    prods_df : pandas DataFrame
        DataFrame of all productions
    """
    query = """SELECT *
               FROM FabricType
               """
    prods_df = db_query(query, **kwargs)
    return prods_df


#######################################################################################################################


[docs]def get_production_cts_wts(**kwargs):
    """Load a DataFrame of all points with columns for counts and weights of all productions

    Parameters
    ----------
    **kwargs
        Optional arguments that are passed to get_credentials()

    Returns
    -------
    cts_wts : pandas DataFrame
        DataFrame of all points with counts and weights for all productions
        
    Notes
    -----
    Also pulls in some non-vessel artifact types (e.g., tile, brick, other construction material)
    
    """
    artifacts = get_artifacts(sections=['metrics', 'classify', 'production', 'tile_brick'], **kwargs)
    points = get_points(**kwargs)
    
    # For artifacts without a Production (i.e., tiles, bricks, etc), use their MaterialType as their Production. If
    # MaterialType is Tile, use TileType ('Tegula' or 'Imbrex')
    artifacts['Production'] = artifacts['FabricTypeName'].where(~artifacts['FabricTypeName'].isnull(),
                                                                artifacts['MaterialTypeName'].where(
                                                                    artifacts['TileType'].isnull(),
                                                                    artifacts['TileType']))

    artifacts['Production'] = artifacts['Production'].where(
        ~((artifacts.Note.str.contains('signinum')) | (artifacts.Note.str.contains('Signinum'))),
        'Opus signinum')

    # summarize artifacts by point
    art_cts = artifacts.groupby(['SurveyPointId', 'Production']).agg({'Production': 'size', 'Weight': 'sum'}).unstack()
    
    # merge Production (counts) and Weight (weights) with the points df
    # do this in two steps so that we can append the _ct and _wt suffixes to columns
    cts = _pd.merge(points, art_cts['Production'], how='left', left_on='SurveyPointId', right_index=True)
    cts_wts = _pd.merge(cts, art_cts['Weight'], how='left', left_on='SurveyPointId', right_index=True,
                        suffixes=('_ct', '_wt'))
    return cts_wts


#######################################################################################################################


[docs]def get_points_times(warn='enable', **kwargs):
    """Load a DataFrame of points with datetimes cleaned and search times calculated
    
    Parameters
    ----------
    warn : {'enable', 'disable'}
        Argument passed to the `calc_search_time()` function specifying whether or to print a generic warning message.
    **kwargs
        Optional arguments that are passed to get_credentials()
    
    Returns
    -------
    pts : pandas DataFrame
        DataFrame of all points with adjusted datetimes and search times
    """
    pts = calc_search_time(clean_datetimes(get_points(**kwargs)), warn=warn)
    return pts


#######################################################################################################################