Source code for leiap.report

"""
This file contains functions related to the annual report
"""


import pandas as _pd
# import numpy as _np
# import altair as _alt


#######################################################################################################################


def fields_summary_table(points, artifacts, fid_col='geo_field', save_excel=False,
                         excel_file='field_counts.xlsx', excel_sheet='field_data'):
    """Build the per-field summary table (Spanish column headings).

    Parameters
    ----------
    points : DataFrame
        pandas DataFrame of survey points
    artifacts : DataFrame
        pandas DataFrame of artifacts
    fid_col : str, optional
        Column holding the field identifier that both inputs are grouped by
    save_excel : bool, optional
        When True, the resulting table is also written to an Excel sheet
    excel_file : str, optional
        Filename for the Excel output
    excel_sheet : str, optional
        Sheet name for the Excel output

    Returns
    -------
    fields_data : DataFrame
        One row per field identifier with the point, artifact and
        positive-point counts

    Notes
    -----
    The table is in the form:

    | Polígono | Parcela | Subparcela | Id     | Núm. Pts. | Núm. Frags. | Pos. Pts. |
    | -------- | ------- | ---------- | ------ | --------- | ----------- | --------- |
    | 03       | 027     | 0          | 030270 | 351       | 39          | 26        |
    | 16       | 092     | a          | 16092a | 105       | 58          | 51        |

    `Polígono`, `Parcela` and `Subparcela` are sliced out of `Id` as
    characters 0-1, 2-4 and 5 respectively.
    """
    count_cols = ['Núm. Pts.', 'Núm. Frags.', 'Pos. Pts.']

    # one single-column frame per statistic, each indexed by field identifier
    per_field_counts = [
        points_per_group(points, grouper_col=fid_col, ct_col=count_cols[0]),
        artifacts_per_group(artifacts, grouper_col=fid_col, ct_col=count_cols[1]),
        pos_points_per_group(artifacts, grouper_col=fid_col,
                             pt_id_col='SurveyPointId', ct_col=count_cols[2]),
    ]

    # outer join on the field identifier; a field missing from one input gets 0
    summary = _pd.concat(per_field_counts, axis=1).fillna(0)

    # expose the field identifier as a regular `Id` column
    summary = summary.rename_axis('Id').reset_index()

    # fillna() leaves floats behind, so cast the counts back to integers
    summary[count_cols] = summary[count_cols].astype(int)

    # split the identifier into its positional components
    field_id = summary['Id']
    summary['Polígono'] = field_id.str[0:2]
    summary['Parcela'] = field_id.str[2:5]
    summary['Subparcela'] = field_id.str[5]

    # final column order
    fields_data = summary[['Polígono', 'Parcela', 'Subparcela', 'Id'] + count_cols]

    if save_excel:
        fields_data.to_excel(excel_file, sheet_name=excel_sheet)

    return fields_data
#######################################################################################################################
def points_per_group(points, grouper_col, ct_col='Núm. Pts.'):
    """Count how many point rows fall in each group.

    Parameters
    ----------
    points : pandas DataFrame
        Point observations
    grouper_col : str
        Column in `points` to group by, usually 'geo_field' or similar
    ct_col : str, optional
        Name given to the column holding the counts

    Returns
    -------
    df : pandas DataFrame
        Single-column frame of row counts, indexed by the distinct values
        of `grouper_col`; the index is named 'Id'
    """
    group_sizes = points.groupby(grouper_col).size()
    counts = group_sizes.to_frame(name=ct_col)
    # the index is always labelled 'Id', whatever the grouping column was called
    return counts.rename_axis('Id')
#######################################################################################################################
def artifacts_per_group(artifacts, grouper_col, ct_col='Núm. Frags.'):
    """Count how many artifact rows fall in each group.

    Parameters
    ----------
    artifacts : pandas DataFrame
        Artifact observations
    grouper_col : str
        Column in `artifacts` to group by, usually 'geo_field' or similar
    ct_col : str, optional
        Name given to the column holding the counts

    Returns
    -------
    df : pandas DataFrame
        Single-column frame of row counts, indexed by the distinct values
        of `grouper_col` (the index keeps the name `grouper_col`)
    """
    group_sizes = artifacts.groupby(grouper_col).size()
    return group_sizes.to_frame(name=ct_col)
#######################################################################################################################
def pos_points_per_group(artifacts, grouper_col, pt_id_col='SurveyPointId', ct_col='Pos. Pts.'):
    """Count, per group, the distinct points that have at least one artifact.

    Parameters
    ----------
    artifacts : pandas DataFrame
        Artifact observations
    grouper_col : str
        Column in `artifacts` to group by, usually 'geo_field' or similar
    pt_id_col : str, optional
        Column holding the point identifier, usually 'SurveyPointId'
    ct_col : str, optional
        Name given to the column holding the counts

    Returns
    -------
    df : pandas DataFrame
        Single-column frame indexed by `grouper_col`, giving the number of
        distinct (`grouper_col`, `pt_id_col`) pairs found in `artifacts`
    """
    # first pass: one entry per (group, point) pair that occurs in the data
    pair_sizes = artifacts.groupby([grouper_col, pt_id_col]).size()
    # second pass: how many such pairs each group has
    points_with_finds = pair_sizes.groupby(level=grouper_col).size()
    return points_with_finds.to_frame(name=ct_col)
#######################################################################################################################
def time_span_chart(data):
    """Create a chart showing all productions, their counts, their percentages, and their time spans

    Each row of `data` becomes one horizontal bar running from `EarlyChrono`
    to `LateChrono`. Rows are ordered by `EarlyChrono`, the bar thickness
    encodes the `pct` bin, and the y tick label is "<Catalan> - <count>".
    Dashed vertical lines and rotated labels mark the time periods.

    Parameters
    ----------
    data : pandas DataFrame
        Data containing columns `['Catalan', 'EarlyChrono', 'LateChrono', 'count', 'pct']`

    Returns
    -------
    fig: matplotlib Figure
        The newly created figure; it is not saved or closed here.

    Notes
    -----
    `sns.set` and `sns.set_context` are called on every invocation, which
    changes the global seaborn/matplotlib styling for the session.
    """

    # plotting libraries are imported lazily, inside the function
    import matplotlib.pyplot as plt
    from matplotlib import collections as mc
    from matplotlib.lines import Line2D
    import seaborn as sns

    def make_proxy(zvalue, scalar_mappable, **kwargs):
        """Return a black proxy line used as a legend handle.

        Only `**kwargs` (e.g. `linewidth`) reach the `Line2D`; `zvalue` and
        `scalar_mappable` are accepted but never used in the body.
        """
        COLOR = 'black'
        return Line2D([0, 1], [0, 1], color=COLOR, solid_capstyle='butt', **kwargs)

    # sort_values/assign return new frames, so the columns added below are
    # written to a local copy rather than to the caller's DataFrame
    data = data.sort_values(by='EarlyChrono', ascending=False)
    # order_y is the 1-based row position on the y-axis after sorting
    data = data.assign(order_y=[i+1 for i in range(data.shape[0])])

    # create tuples of the form (x_start, order_y) and (x_end, order_y) for each production
    data['start_pt'] = list(zip(data['EarlyChrono'], data['order_y']))
    data['end_pt'] = list(zip(data['LateChrono'], data['order_y']))

    # create label for production type and count (Catalan)
    data['ylabel'] = data['Catalan'].map(str) + ' - ' + data['count'].map(str)

    # make list of lists of coordinates: one [start_pt, end_pt] segment per production
    field_lines = [list(a) for a in zip(data['start_pt'], data['end_pt'])]

    # items needed for legend construction
    LW_BINS = [0, 10, 25, 50, 75, 90, 100]  # bins for line width
    LW_LABELS = [3, 6, 9, 12, 15, 18]  # line widths

    # convert percentages to line widths based on bin values
    # NOTE(review): pd.cut bins are right-closed by default, so a pct of exactly 0
    # (or above 100) would map to NaN -- confirm such values cannot occur
    data['lw'] = _pd.cut(data['pct'], bins=LW_BINS, labels=LW_LABELS)
    # cut() returns a categorical; convert the labels back to plain numbers
    data['lw'] = _pd.to_numeric(data['lw'])

    # lines for each production with linewidths as determined above
    lc = mc.LineCollection(field_lines, color='black', linewidths=list(data['lw']))

    # start and end values for x-axis (negative = BC, positive = AD)
    START = -1600
    END = 2001

    # create thin gray lines that stretch horizontally across the whole plot
    data['gray_start'] = START
    data['gray_end'] = END
    data['gray_start_pt'] = list(zip(data['gray_start'], data['order_y']))
    data['gray_end_pt'] = list(zip(data['gray_end'], data['order_y']))
    # the comprehension variable `B` is scoped to the comprehension and does not
    # interfere with the `B` margin constant assigned further down
    gray_lines = [list(B) for B in zip(data['gray_start_pt'], data['gray_end_pt'])]

    # horizontal gray lines
    lc2 = mc.LineCollection(gray_lines, color='gray', linewidth=0.5)

    # units for plot creation; t='top', b='bottom'
    HEIGHT_UNIT = 0.15
    T = 1.0
    B = 0.7  # inch

    # x-ticks
    INTERVAL = 400  # x-axis tick interval
    xticks = [x for x in range(START, END, INTERVAL)]  # create ticks

    # x-values for vertical lines to be added representing time period boundaries
    VERT_LINES = [-850, -550, -123, 455, 533, 902, 1229, 1492, 1789]

    # time period labels
    # each value is [x position as a fraction of the axis width, y position in
    # axes coordinates, rotation angle]
    # pos = abs(start-val)/float(abs(end-start))
    TLABEL_V = 1.01
    TLABEL_ANG = 90
    PERIOD_LABELS = {'Navetiforme': [abs(START-(-1225))/float(abs(END-START)), TLABEL_V, TLABEL_ANG],
                     'Talaiòtic': [abs(START-(-700))/float(abs(END-START)), TLABEL_V, TLABEL_ANG],
                     'Posttalaiòtic': [abs(START-(-336.5))/float(abs(END-START)), TLABEL_V, TLABEL_ANG],
                     'Romana': [abs(START-(166))/float(abs(END-START)), TLABEL_V, TLABEL_ANG],
                     'Vàndala': [abs(START-(500))/float(abs(END-START)), TLABEL_V, TLABEL_ANG],
                     'Bizantina': [abs(START-(717.5))/float(abs(END-START)), TLABEL_V, TLABEL_ANG],
                     'Àndalusina': [abs(START-(1065.5))/float(abs(END-START)), TLABEL_V, TLABEL_ANG],
                     'Medieval\nCristiana': [abs(START-(1360.5))/float(abs(END-START)), TLABEL_V, TLABEL_ANG],
                     'Moderna': [abs(START-(1640.5))/float(abs(END-START)), TLABEL_V, TLABEL_ANG],
                     'Contemporània': [abs(START-(2000))/float(abs(END-START)), TLABEL_V, TLABEL_ANG]}

    # create plot and set properties
    sns.set(style="ticks")
    sns.set_context("notebook")
    # figure height grows with the number of rows; T and B are added as fixed margins
    height = HEIGHT_UNIT*(data.shape[0]+1)+T+B
    fig = plt.figure(figsize=(6.5, height))
    ax = fig.add_subplot(111)
    ax.set_ylim(0, data.shape[0]+0.5)
    # express the fixed margins as fractions of the total figure height
    fig.subplots_adjust(bottom=B/height, top=1-T/height, left=0.45, right=0.95)
    # the gray guide lines are added before the production bars
    ax.add_collection(lc2)
    ax.add_collection(lc)
    ax.set_xlim(left=START, right=END)
    ax.set_xticks(xticks)
    ax.xaxis.set_ticks_position('bottom')
    ax.tick_params(axis='x', labelsize=8)
    sns.despine(left=True)
    ax.set_yticks(data['order_y'])
    ax.set(yticklabels=data['ylabel'])
    ax.tick_params(axis='y', length=0, labelsize=8)

    # place time period vertical lines from list of x values
    # NOTE(review): plt.axvline targets pyplot's *current* axes, presumably `ax`
    # because it was created just above -- confirm
    for vline in VERT_LINES:
        plt.axvline(x=vline, ls='dashed', lw=0.5, color='gray')

    # place time period labels (positions are in axes coordinates via transAxes)
    for period, val in PERIOD_LABELS.items():
        ax.text(x=val[0], y=val[1], s=period, color='gray', fontsize=8, horizontalalignment='center',
                verticalalignment='bottom', rotation=val[2], transform=ax.transAxes)

    # legend: one proxy line per line width, labelled with the matching pct bin
    proxies = [make_proxy(item, lc, linewidth=item) for item in LW_LABELS]
    leg = ax.legend(proxies, ['0-10%', '10-25%', '25-50%', '50-75%', '75-90%', '90-100%'],
                    bbox_to_anchor=(0.05, 0.0), bbox_transform=fig.transFigure, loc='lower left', ncol=6,
                    labelspacing=3.0, handlelength=4.0, handletextpad=0.5, markerfirst=False,
                    columnspacing=0.5, frameon=False, fontsize=8)

    for txt in leg.get_texts():
        txt.set_ha("left")  # horizontal alignment of text item
        # txt.set_x(0)  # x-position
        # txt.set_y()  # y-position

    return fig
#######################################################################################################################
def write_excel_table(unit, data, writer):
    """Write one formatted production table as a worksheet of `writer`.

    Parameters
    ----------
    unit : str
        Name of field or other data group; `str(unit)` is used as the sheet name
    data : pandas `DataFrame`
        Table holding at least the columns `Catalan`, `count` and `pct`
    writer : `ExcelWriter`
        Open Excel writer; its `book` must provide `add_format`
        (the xlsxwriter engine)

    Returns
    -------
    None
    """
    FIRST_COL_WIDTH = 40
    sheet_name = str(unit)

    # keep only the three report columns and give them their printed headings
    table = data.loc[:, ['Catalan', 'count', 'pct']].rename(
        columns={'Catalan': 'Producció', 'count': 'Núm', 'pct': '%'})

    # body rows start on the second row; the header row is written by hand below
    table.to_excel(writer, sheet_name=sheet_name, index=False, startrow=1, header=False)

    book = writer.book
    sheet = writer.sheets[sheet_name]

    def arial10(**extra):
        """Register a new Arial 10 pt cell format carrying the extra properties."""
        props = {'font_name': 'Arial', 'font_size': 10}
        props.update(extra)
        return book.add_format(props)

    # header: every heading centred, bold, with a bottom border ...
    centred_header = arial10(bold=True, align='center', bottom=2)
    for col_idx, heading in enumerate(table.columns.values):
        sheet.write(0, col_idx, heading, centred_header)

    # ... then the first heading is written again, right-aligned like its column
    right_header = arial10(bold=True, align='right', bottom=2)
    sheet.write(0, 0, 'Producció', right_header)

    # column formats: names right-aligned and wide, counts centred,
    # percentages centred with one decimal
    names_format = arial10(align='right')
    sheet.set_column('A:A', FIRST_COL_WIDTH, names_format)

    counts_format = arial10(align='center')
    sheet.set_column('B:B', None, counts_format)

    pct_format = arial10(align='center', num_format='0.0')
    sheet.set_column('C:C', None, pct_format)
#######################################################################################################################
def make_report_tables(artifacts, group_col, output_folder, fname='field_tables.xlsx'):
    """Create and save an Excel file with one sheet per group in `group_col`

    Each sheet lists the productions (Catalan names) found in that group with
    their artifact count and percentage, sorted by descending count.

    Parameters
    ----------
    artifacts : pandas `DataFrame`
        Artifact observations. Must contain `group_col` and the columns
        `Catalan`, `MaterialTypeName`, `TileType` and `Note`.
        The frame is not modified.
    group_col : str
        Name of column that contains group info
    output_folder : str
        Folder to save the final Excel
    fname : str, optional
        Name of the Excel file (the default is 'field_tables.xlsx')

    Returns
    -------
    None
    """
    import os

    from .constants import get_misc_types

    # where no Catalan name exists, map the Catalan value from the misc dictionary to the MaterialTypeName
    # e.g., 'brick' --> 'maó', 'tile' --> 'teula'; a non-null TileType takes precedence over that mapping
    misc = get_misc_types(lang='both')
    fallback = artifacts['MaterialTypeName'].map(misc).where(
        artifacts['TileType'].isnull(), artifacts['TileType'])
    # build the names in a separate Series so the caller's DataFrame is left untouched
    catalan = artifacts['Catalan'].where(artifacts['Catalan'].notnull(), fallback)

    # any note mentioning 'signinum'/'Signinum' overrides the name; missing notes count as no match
    is_signinum = artifacts['Note'].str.contains('[sS]igninum', na=False)
    catalan = catalan.mask(is_signinum, 'Opus signinum')

    artifacts_sub = artifacts[[group_col]].assign(Catalan=catalan)

    out_path = os.path.join(output_folder, fname)

    # the context manager saves and closes the workbook, also when a sheet fails
    # (ExcelWriter.save() no longer exists in current pandas)
    with _pd.ExcelWriter(out_path, engine='xlsxwriter') as writer:
        # group by the scalar column name so `unit` is the plain group value;
        # a one-element list makes newer pandas yield 1-tuples as keys
        for unit, data in artifacts_sub.groupby(group_col):
            # rows whose Catalan name is still missing are left out of the counts
            table_group = data.groupby('Catalan').size().reset_index(name='count')
            table_group['pct'] = (
                table_group['count'] / table_group['count'].sum() * 100).round(decimals=2)
            table_group = table_group.sort_values(by='count', ascending=False)
            write_excel_table(unit, table_group, writer)

    print(f'Excel file saved to {out_path}')
#######################################################################################################################
def make_report_span_charts(artifacts, group_col, output_folder, file_prefix=''):
    """Create and save production span charts, one PNG per group in `group_col`

    Parameters
    ----------
    artifacts : pandas `DataFrame`
        Artifact observations. Must contain `group_col` and the columns
        `Catalan`, `MaterialTypeName`, `TileType`, `Note`, `EarlyChrono` and
        `LateChrono`. The frame is not modified.
    group_col : str
        Name of column that contains group info
    output_folder : str
        Folder to save the final image
    file_prefix : str, optional
        Extra text to add to the beginning of the file name (the default is an empty string `''`,
        which adds no text). All file names will end with the group value (e.g., the field number)

    Returns
    -------
    None
    """
    import os

    import matplotlib.pyplot as plt

    from .constants import get_misc_types

    # where no Catalan name exists, map the Catalan value from the misc dictionary to the MaterialTypeName
    # e.g., 'brick' --> 'maó', 'tile' --> 'teula'; a non-null TileType takes precedence over that mapping
    misc = get_misc_types(lang='both')
    fallback = artifacts['MaterialTypeName'].map(misc).where(
        artifacts['TileType'].isnull(), artifacts['TileType'])
    # build the names in a separate Series so the caller's DataFrame is left untouched
    catalan = artifacts['Catalan'].where(artifacts['Catalan'].notnull(), fallback)

    # any note mentioning 'signinum'/'Signinum' overrides the name; missing notes count as no match
    is_signinum = artifacts['Note'].str.contains('[sS]igninum', na=False)
    catalan = catalan.mask(is_signinum, 'Opus signinum')

    artifacts_sub = artifacts[[group_col, 'EarlyChrono', 'LateChrono']].assign(Catalan=catalan)

    n_saved = 0
    # group by the scalar column name so `unit` is the plain group value;
    # a one-element list makes newer pandas yield 1-tuples as keys
    for unit, data in artifacts_sub.groupby(group_col):
        # one row per (production, early date, late date); rows missing any of the three are dropped
        span_group = data.groupby(
            ['Catalan', 'EarlyChrono', 'LateChrono']).size().reset_index(name='count')
        span_group['pct'] = (span_group['count'] / span_group['count'].sum() * 100).round(decimals=2)

        # save and close exactly the figure that was created for this group,
        # leaving any other open figures alone
        fig = time_span_chart(span_group)
        image_name = f'{file_prefix}{unit}.png'
        fig.savefig(os.path.join(output_folder, image_name), dpi=300)
        print(f'{image_name} saved')
        plt.close(fig)
        n_saved += 1

    print(f'{n_saved} image file(s) saved to {output_folder}')