"""
This file contains functions related to the annual report
"""
import pandas as _pd
# import numpy as _np
# import altair as _alt
#######################################################################################################################
[docs]def fields_summary_table(points, artifacts, fid_col='geo_field',
save_excel=False, excel_file='field_counts.xlsx', excel_sheet='field_data'):
"""Create table (in Spanish) with basic stats by field
Parameters
----------
points : DataFrame
pandas DataFrame of survey points
artifacts : DataFrame
pandas DataFrame of artifacts
fid_col: str, optional
Column containing the field identifier to use for grouping
save_excel : bool, optional
Specify whether to write DataFrame to an Excel sheet
excel_file : str, optional
Filename for Excel
excel_sheet : str, optional
Sheet name for Excel
Returns
-------
fields_data : DataFrame
DataFrame version of the created table
Notes
-----
The table is in the form:
| Polígono | Parcela | Subparcela | Id | Núm. Pts. | Núm. Frags. | Pos. Pts. |
| -------- | ------- | ---------- | ------ | --------- | ----------- | --------- |
| 03 | 027 | 0 | 030270 | 351 | 39 | 26 |
| 16 | 092 | a | 16092a | 105 | 58 | 51 |
"""
# count number of points per geographic field
fields = points_per_group(points, grouper_col=fid_col, ct_col='Núm. Pts.')
# count number of artifacts per field
artifact_cts = artifacts_per_group(
artifacts, grouper_col=fid_col, ct_col='Núm. Frags.')
# count number of points with artifacts per field
pos_pts = pos_points_per_group(
artifacts, grouper_col=fid_col, pt_id_col='SurveyPointId', ct_col='Pos. Pts.')
# concatenate the three dataframes created above with an outer join
fields_data = _pd.concat([fields, artifact_cts, pos_pts], axis=1).fillna(0)
fields_data.reset_index(inplace=True)
fields_data.rename(columns={'index': 'Id'}, inplace=True)
# set counts columns to proper integer datatypes
fields_data[['Núm. Pts.', 'Núm. Frags.', 'Pos. Pts.']] = fields_data[
['Núm. Pts.', 'Núm. Frags.', 'Pos. Pts.']].astype(int)
# split up the `Id` column into various components
fields_data['Polígono'] = fields_data['Id'].str[0:2]
fields_data['Parcela'] = fields_data['Id'].str[2:5]
fields_data['Subparcela'] = fields_data['Id'].str[5]
# reorder columns
fields_data = fields_data[[
'Polígono', 'Parcela', 'Subparcela', 'Id', 'Núm. Pts.', 'Núm. Frags.', 'Pos. Pts.']]
if save_excel:
# save to Excel sheet
fields_data.to_excel(excel_file, sheet_name=excel_sheet)
return fields_data
#######################################################################################################################
[docs]def points_per_group(points, grouper_col, ct_col='Núm. Pts.'):
"""Count the points in the given group
Parameters
----------
points : pandas DataFrame
Point observations
grouper_col : str
Column in `points` that you want to use to group, usually 'geo_field' or similar
ct_col : str, optional
Name to give the new column
Returns
-------
df : pandas DataFrame
Points grouped and counted according to the grouper_col
"""
df = points.groupby(grouper_col).size().reset_index(
name=ct_col).set_index(grouper_col)
df.index.rename('Id', inplace=True)
return df
#######################################################################################################################
[docs]def artifacts_per_group(artifacts, grouper_col, ct_col='Núm. Frags.'):
"""Count the artifacts in the given group
Parameters
----------
artifacts : pandas DataFrame
Artifact observations
grouper_col : str
Column in `artifacts` that you want to use to group, usually 'geo_field' or similar
ct_col : str, optional
Name to give the new column
Returns
-------
df : pandas DataFrame
Artifacts grouped and counted according to the grouper_col
"""
df = artifacts.groupby(grouper_col).size().reset_index(
name=ct_col).set_index(grouper_col)
return df
#######################################################################################################################
[docs]def pos_points_per_group(artifacts, grouper_col, pt_id_col='SurveyPointId', ct_col='Pos. Pts.'):
"""Count the number of points with artifacts in the given group
Parameters
----------
artifacts : pandas DataFrame
Artifact observations
grouper_col : str
Column in `artifacts` that you want to use to group, usually 'geo_field' or similar
pt_id_col : str, optional
Column that contains unique point identifier, usually 'SurveyPointId'
ct_col : str, optional
Name to give the new column
Returns
-------
df : pandas DataFrame
Number of points with artifacts grouped and counted according to the grouper_col
"""
df = artifacts.groupby([grouper_col, pt_id_col]).size(
).reset_index().set_index(grouper_col)
df = df.groupby(grouper_col).size().reset_index(
name=ct_col).set_index(grouper_col)
return df
#######################################################################################################################
[docs]def time_span_chart(data):
"""Create a chart showing all productions, their counts, their percentages, and their time spans
Parameters
----------
data : pandas DataFrame
Data containing columns `['Catalan', 'EarlyChrono', 'LateChrono', 'count', 'pct']`
Returns
-------
fig: matplotlib Figure
"""
import matplotlib.pyplot as plt
from matplotlib import collections as mc
from matplotlib.lines import Line2D
import seaborn as sns
def make_proxy(zvalue, scalar_mappable, **kwargs):
"""Helper function for creating the legend
"""
COLOR = 'black'
return Line2D([0, 1], [0, 1], color=COLOR, solid_capstyle='butt', **kwargs)
data = data.sort_values(by='EarlyChrono', ascending=False)
data = data.assign(order_y=[i+1 for i in range(data.shape[0])])
# create tuples of the form (x_start, order_y) and (x_end, order_y)] for each production
data['start_pt'] = list(zip(data['EarlyChrono'], data['order_y']))
data['end_pt'] = list(zip(data['LateChrono'], data['order_y']))
# create label for production type and count (Catalan)
data['ylabel'] = data['Catalan'].map(str) + ' - ' + data['count'].map(str)
# make list of lists of coordinates
field_lines = [list(a) for a in zip(data['start_pt'], data['end_pt'])]
# items needed for legend construction
LW_BINS = [0, 10, 25, 50, 75, 90, 100] # bins for line width
LW_LABELS = [3, 6, 9, 12, 15, 18] # line widths
# convert percentages to line widths based on bin values
data['lw'] = _pd.cut(data['pct'], bins=LW_BINS, labels=LW_LABELS)
data['lw'] = _pd.to_numeric(data['lw'])
# lines for each production with linewidths as determined above
lc = mc.LineCollection(field_lines, color='black',
linewidths=list(data['lw']))
# start and end values for x-axis (negative = BC, positive = AD)
START = -1600
END = 2001
# create thin gray lines that stretch horizontally across the whole plot
data['gray_start'] = START
data['gray_end'] = END
data['gray_start_pt'] = list(zip(data['gray_start'], data['order_y']))
data['gray_end_pt'] = list(zip(data['gray_end'], data['order_y']))
gray_lines = [list(B)
for B in zip(data['gray_start_pt'], data['gray_end_pt'])]
# horizontal gray lines
lc2 = mc.LineCollection(gray_lines, color='gray', linewidth=0.5)
# units for plot creation; t='top', b='bottom'
HEIGHT_UNIT = 0.15
T = 1.0
B = 0.7 # inch
# x-ticks
INTERVAL = 400 # x-axis tick interval
xticks = [x for x in range(START, END, INTERVAL)] # create ticks
# x-values for vertical lines to be added representing time period boundaries
VERT_LINES = [-850, -550, -123, 455, 533, 902, 1229, 1492, 1789]
# time period labels
# pos = abs(start-val)/float(abs(end-start))
TLABEL_V = 1.01
TLABEL_ANG = 90
PERIOD_LABELS = {'Navetiforme': [abs(START-(-1225))/float(abs(END-START)), TLABEL_V, TLABEL_ANG],
'Talaiòtic': [abs(START-(-700))/float(abs(END-START)), TLABEL_V, TLABEL_ANG],
'Posttalaiòtic': [abs(START-(-336.5))/float(abs(END-START)), TLABEL_V, TLABEL_ANG],
'Romana': [abs(START-(166))/float(abs(END-START)), TLABEL_V, TLABEL_ANG],
'Vàndala': [abs(START-(500))/float(abs(END-START)), TLABEL_V, TLABEL_ANG],
'Bizantina': [abs(START-(717.5))/float(abs(END-START)), TLABEL_V, TLABEL_ANG],
'Àndalusina': [abs(START-(1065.5))/float(abs(END-START)), TLABEL_V, TLABEL_ANG],
'Medieval\nCristiana': [abs(START-(1360.5))/float(abs(END-START)), TLABEL_V, TLABEL_ANG],
'Moderna': [abs(START-(1640.5))/float(abs(END-START)), TLABEL_V, TLABEL_ANG],
'Contemporània': [abs(START-(2000))/float(abs(END-START)), TLABEL_V, TLABEL_ANG]}
# create plot and set properties
sns.set(style="ticks")
sns.set_context("notebook")
height = HEIGHT_UNIT*(data.shape[0]+1)+T+B
fig = plt.figure(figsize=(6.5, height))
ax = fig.add_subplot(111)
ax.set_ylim(0, data.shape[0]+0.5)
fig.subplots_adjust(bottom=B/height, top=1-T/height, left=0.45, right=0.95)
ax.add_collection(lc2)
ax.add_collection(lc)
ax.set_xlim(left=START, right=END)
ax.set_xticks(xticks)
ax.xaxis.set_ticks_position('bottom')
ax.tick_params(axis='x', labelsize=8)
sns.despine(left=True)
ax.set_yticks(data['order_y'])
ax.set(yticklabels=data['ylabel'])
ax.tick_params(axis='y', length=0, labelsize=8)
# place time period vertical lines from list of x values
for vline in VERT_LINES:
plt.axvline(x=vline, ls='dashed', lw=0.5, color='gray')
# place time period labels
for period, val in PERIOD_LABELS.items():
ax.text(x=val[0], y=val[1], s=period, color='gray', fontsize=8, horizontalalignment='center', verticalalignment='bottom',
rotation=val[2], transform=ax.transAxes)
# legend
proxies = [make_proxy(item, lc, linewidth=item) for item in LW_LABELS]
leg = ax.legend(proxies, ['0-10%', '10-25%', '25-50%', '50-75%', '75-90%', '90-100%'], bbox_to_anchor=(0.05, 0.0),
bbox_transform=fig.transFigure, loc='lower left', ncol=6, labelspacing=3.0, handlelength=4.0, handletextpad=0.5, markerfirst=False,
columnspacing=0.5, frameon=False, fontsize=8)
for txt in leg.get_texts():
txt.set_ha("left") # horizontal alignment of text item
# txt.set_x(0) # x-position
# txt.set_y() # y-position
return fig
#######################################################################################################################
[docs]def write_excel_table(unit, data, writer):
"""Format data and add as worksheet to Excel output
Parameters
----------
unit : str
Name of field or other data group; will be used as Sheet name in Excel
data : pandas `DataFrame`
Data to write to the file
writer : `ExcelWriter`
Excel file
Returns
-------
None
"""
table_data = data.loc[:, ['Catalan', 'count', 'pct']].rename(
columns={'Catalan': 'Producció', 'count': 'Núm', 'pct': '%'})
MAX_LEN = 40
table_data.to_excel(writer, sheet_name=str(
unit), index=False, startrow=1, header=False)
workbook = writer.book
worksheet = writer.sheets[str(unit)]
# header
header_format = workbook.add_format(
{'bold': True, 'align': 'center', 'font_name': 'Arial', 'font_size': 10, 'bottom': 2})
for col_num, value in enumerate(table_data.columns.values):
worksheet.write(0, col_num, value, header_format)
header_format2 = workbook.add_format(
{'bold': True, 'align': 'right', 'font_name': 'Arial', 'font_size': 10, 'bottom': 2})
worksheet.write(0, 0, 'Producció', header_format2)
colA_format = workbook.add_format(
{'align': 'right', 'font_name': 'Arial', 'font_size': 10})
worksheet.set_column('A:A', MAX_LEN, colA_format)
colB_format = workbook.add_format(
{'align': 'center', 'font_name': 'Arial', 'font_size': 10})
worksheet.set_column('B:B', None, colB_format)
colC_format = workbook.add_format(
{'align': 'center', 'num_format': '0.0', 'font_name': 'Arial', 'font_size': 10})
worksheet.set_column('C:C', None, colC_format)
#######################################################################################################################
[docs]def make_report_tables(artifacts, group_col, output_folder, fname='field_tables.xlsx'):
"""Create and save Excel file with a Sheet for each group in `groups`
Parameters
----------
artifacts : pandas `DataFrame`
group_col : str
Name of column that contains group info
output_folder : str
Folder to save the final Excel
fname : str, optional
Name of the Excel file (the default is 'field_tables.xlsx')
Returns
-------
None
"""
from .constants import get_misc_types
# where no Catalan name exists, map the Catalan value from the misc dictionary to the MaterialTypeName
# e.g., 'brick' --> 'maó', 'tile' --> 'teula'
misc = get_misc_types(lang='both')
artifacts['Catalan'] = artifacts['Catalan'].where(~artifacts['Catalan'].isnull(),
artifacts['MaterialTypeName'].map(misc).where(
artifacts['TileType'].isnull(),
artifacts['TileType']))
artifacts['Catalan'] = artifacts['Catalan'].where(
~((artifacts['Note'].str.contains('signinum')) |
(artifacts['Note'].str.contains('Signinum'))),
'Opus signinum')
artifacts_sub = artifacts.loc[:, [group_col, 'Catalan']]
groups = artifacts_sub.groupby([group_col])
writer = _pd.ExcelWriter(f'{output_folder}{fname}', engine='xlsxwriter')
for unit, data in groups:
table_group = data.groupby(['Catalan'])['Catalan'].count().rename(
columns={'Catalan': 'count'}).reset_index().rename(columns={0: 'count'})
table_group['pct'] = (
table_group['count'] / table_group['count'].sum() * 100).round(decimals=2)
table_group = table_group.sort_values(by='count', ascending=False)
write_excel_table(unit, table_group, writer)
writer.save()
print(f'Excel file saved to {output_folder}{fname}')
#######################################################################################################################
[docs]def make_report_span_charts(artifacts, group_col, output_folder, file_prefix=''):
# def make_report_span_charts(groups, group_col, output_folder, file_prefix=''):
"""Create and save production span charts
Parameters
----------
artifacts : pandas `DataFrame`
group_col : str
Name of column that contains group info
output_folder : str
Folder to save the final image
file_prefix : str, optional
Extra text to add to the beginning of the file name (the default is an empty string `''`, which adds no text). All file names will end with the `unit` from `groups` (e.g., the field number)
Returns
-------
None
"""
import matplotlib.pyplot as plt
from .constants import get_misc_types
# where no Catalan name exists, map the Catalan value from the misc dictionary to the MaterialTypeName
# e.g., 'brick' --> 'maó', 'tile' --> 'teula'
misc = get_misc_types(lang='both')
artifacts['Catalan'] = artifacts['Catalan'].where(~artifacts['Catalan'].isnull(),
artifacts['MaterialTypeName'].map(misc).where(
artifacts['TileType'].isnull(),
artifacts['TileType']))
artifacts['Catalan'] = artifacts['Catalan'].where(
~((artifacts['Note'].str.contains('signinum')) |
(artifacts['Note'].str.contains('Signinum'))),
'Opus signinum')
artifacts_sub = artifacts.loc[:, [group_col,
'Catalan', 'EarlyChrono', 'LateChrono']]
groups = artifacts_sub.groupby([group_col])
for unit, data in groups:
span_group = data.groupby(['Catalan', 'EarlyChrono', 'LateChrono'])['Catalan'].count().rename(columns={'Catalan': 'count'}).reset_index(
).rename(columns={0: 'count'})
span_group['pct'] = (span_group['count'] /
span_group['count'].sum() * 100).round(decimals=2)
time_span_chart(span_group)
plt.savefig(f'{output_folder}{file_prefix}{unit}.png', dpi=300)
print(f'{file_prefix}{unit}.png saved')
plt.close('all')
print(f'{len(groups)} image file(s) saved to {output_folder}')