first commit
This commit is contained in:
14
section_1_graph_library_module/pyproject.toml
Normal file
14
section_1_graph_library_module/pyproject.toml
Normal file
@@ -0,0 +1,14 @@
|
||||
# scripts/graph_generation_library/pyproject.toml
|
||||
[build-system]
|
||||
requires = ["setuptools", "wheel"]
|
||||
build-backend = "setuptools.build_meta"
|
||||
|
||||
[project]
|
||||
name = "section_1_graph_library_module"
|
||||
version = "0.1.0"
|
||||
description = "Internal PASBDC graph creation functions used to generate figures for the network wide desk reviews."
|
||||
|
||||
[tool.setuptools]
|
||||
packages = ["section_1_graph_library_module"]
|
||||
|
||||
|
||||
@@ -0,0 +1,4 @@
|
||||
Metadata-Version: 2.4
|
||||
Name: section_1_graph_library_module
|
||||
Version: 0.1.0
|
||||
Summary: Internal PASBDC graph creation functions used to generate figures for the network wide desk reviews.
|
||||
@@ -0,0 +1,12 @@
|
||||
pyproject.toml
|
||||
section_1_graph_library_module/__init__.py
|
||||
section_1_graph_library_module/counselling_interval_analysis.py
|
||||
section_1_graph_library_module/funding_analysis.py
|
||||
section_1_graph_library_module/naics_census_analysis.py
|
||||
section_1_graph_library_module/nbs_analysis.py
|
||||
section_1_graph_library_module/satisfaction_survey_analysis.py
|
||||
section_1_graph_library_module/trainings_analysis.py
|
||||
section_1_graph_library_module.egg-info/PKG-INFO
|
||||
section_1_graph_library_module.egg-info/SOURCES.txt
|
||||
section_1_graph_library_module.egg-info/dependency_links.txt
|
||||
section_1_graph_library_module.egg-info/top_level.txt
|
||||
@@ -0,0 +1 @@
|
||||
|
||||
@@ -0,0 +1 @@
|
||||
section_1_graph_library_module
|
||||
@@ -0,0 +1,82 @@
|
||||
from .naics_census_analysis import (
|
||||
make_census_naics_chart,
|
||||
make_client_census_comparison_graph,
|
||||
make_county_heatmap
|
||||
)
|
||||
|
||||
from .funding_analysis import (
|
||||
make_funding_attribution_network_wide,
|
||||
make_funding_attribution_rate_chart,
|
||||
make_theoretical_funding_attribution_rate_chart,
|
||||
make_funding_director_confirmed_graph,
|
||||
)
|
||||
|
||||
from .satisfaction_survey_analysis import (
|
||||
make_survey_response_count_graph,
|
||||
make_average_survey_score_graph,
|
||||
make_responses_per_client_graph,
|
||||
make_nps_graph
|
||||
)
|
||||
|
||||
from .trainings_analysis import (
|
||||
StatChartVariants,
|
||||
make_network_trainings_count_statistics_charts,
|
||||
make_attendee_bins_statistics_charts,
|
||||
make_primary_training_topic_statistics_charts,
|
||||
make_center_attendee_statistics_charts,
|
||||
make_center_event_count_charts,
|
||||
make_center_attendee_range_charts,
|
||||
make_primary_training_topic_pie_charts,
|
||||
build_total_trainings_count_chart,
|
||||
build_total_trainings_percent_chart,
|
||||
build_no_first_steps_count_chart,
|
||||
build_no_first_steps_percent_chart,
|
||||
build_no_first_no_pre_count_chart,
|
||||
build_no_first_no_pre_percent_chart,
|
||||
build_first_pre_only_count_chart,
|
||||
build_first_pre_only_percent_chart,
|
||||
build_ondemand_count_chart,
|
||||
build_ondemand_percent_chart,
|
||||
build_ondemand_no_first_count_chart,
|
||||
build_ondemand_no_first_percent_chart,
|
||||
build_ondemand_no_first_no_pre_count_chart,
|
||||
build_ondemand_no_first_no_pre_percent_chart,
|
||||
)
|
||||
|
||||
from .nbs_analysis import (
|
||||
make_nbs_attribution_network_wide,
|
||||
make_attribution_rate_chart,
|
||||
make_theoretical_attribution_rate_chart,
|
||||
make_director_confirmed_graph,
|
||||
)
|
||||
|
||||
from .counselling_interval_analysis import (
|
||||
make_interval_snapshot_chart
|
||||
)
|
||||
|
||||
# Public API of the package. Every name re-exported by the imports above is
# listed here so that star-imports, documentation tools and linters agree on
# what the package exposes (the build_* chart builders were previously
# imported but not declared).
__all__ = [
    # naics_census_analysis
    'make_census_naics_chart',
    'make_client_census_comparison_graph',
    'make_county_heatmap',
    # funding_analysis
    'make_funding_attribution_network_wide',
    'make_funding_attribution_rate_chart',
    'make_theoretical_funding_attribution_rate_chart',
    'make_funding_director_confirmed_graph',
    # satisfaction_survey_analysis
    'make_survey_response_count_graph',
    'make_average_survey_score_graph',
    'make_responses_per_client_graph',
    'make_nps_graph',
    # trainings_analysis
    'make_network_trainings_count_statistics_charts',
    'make_attendee_bins_statistics_charts',
    'make_primary_training_topic_statistics_charts',
    'make_center_attendee_statistics_charts',
    'make_center_event_count_charts',
    'make_center_attendee_range_charts',
    'make_primary_training_topic_pie_charts',
    'build_total_trainings_count_chart',
    'build_total_trainings_percent_chart',
    'build_no_first_steps_count_chart',
    'build_no_first_steps_percent_chart',
    'build_no_first_no_pre_count_chart',
    'build_no_first_no_pre_percent_chart',
    'build_first_pre_only_count_chart',
    'build_first_pre_only_percent_chart',
    'build_ondemand_count_chart',
    'build_ondemand_percent_chart',
    'build_ondemand_no_first_count_chart',
    'build_ondemand_no_first_percent_chart',
    'build_ondemand_no_first_no_pre_count_chart',
    'build_ondemand_no_first_no_pre_percent_chart',
    # nbs_analysis
    'make_nbs_attribution_network_wide',
    'make_attribution_rate_chart',
    'make_theoretical_attribution_rate_chart',
    'make_director_confirmed_graph',
    'StatChartVariants',
    # counselling_interval_analysis
    'make_interval_snapshot_chart',
]
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,76 @@
|
||||
# FILE: counselling_interval_analysis.py
|
||||
# CREATED: 12/31/25
|
||||
# AUTHOR: Vincent Allen
|
||||
# CONTACT: vincent@vtallen.com valle276@live.kutztown.edu
|
||||
# PURPOSE:
|
||||
|
||||
# This file contains graph generation functions to visualize the data from the PASBDC Date Interval Snapshot
|
||||
# scorecard in neoserra
|
||||
|
||||
# Third party libraries
|
||||
from pandas.core.indexes.base import JoinHow
|
||||
import plotly.graph_objects as go
|
||||
import plotly.express as px
|
||||
import pandas as pd
|
||||
|
||||
# Python modules
|
||||
from enum import Enum
|
||||
from typing import Dict, List
|
||||
|
||||
# Custom packages
|
||||
from constants_module import NEOSERRA_COLUMNS, OUT_COLUMNS
|
||||
|
||||
def make_interval_snapshot_chart(
        df:pd.DataFrame,
        title:str,
        fiscal_year_tag:str,
        col_interval_data_value:str=NEOSERRA_COLUMNS.interval_data_value,
        col_neo_center:str=NEOSERRA_COLUMNS.center,
        ):
    """
    :param df: The input dataset (one row per record to be averaged)
    :param title: The title of the chart
    :param fiscal_year_tag: The fiscal year label appended to the title
    :param col_interval_data_value: The column whose mean is taken per center
    :param col_neo_center: The column in the dataset that contains the center
    :return: go.Figure - The constructed plotly graph

    :description:
        Draws one bar per center showing the mean of col_interval_data_value,
        annotates the number of rows in df, and overlays a dashed horizontal
        line at the mean of col_interval_data_value over the whole dataset.
    """
    # Mean over every row of the input, used for the dashed reference line
    network_average = df[col_interval_data_value].mean()

    # Per-center means, flattened back into a two column frame for plotly
    center_means = df.groupby(col_neo_center)[col_interval_data_value].mean()
    center_means_df = center_means.reset_index(name=col_interval_data_value) #pyright:ignore

    chart_title = f"{title} {fiscal_year_tag}"
    fig = px.bar(
        center_means_df,
        x=col_neo_center,
        y=col_interval_data_value,
        text=col_interval_data_value,
        title=chart_title,
        width=1400,
        height=1000,
        text_auto='.1f' #pyright:ignore
    )

    fig.update_traces(marker_color='#73e0c6')
    fig.update_layout(
        font_family="Futura",
        title_font_family="Futura",
        yaxis_title='Days'
    )

    # Row count of the input, pinned just above the top-left of the plot area
    row_count_note = dict(
        xref='paper', yref='paper',
        x=0.0, y=1.01,
        showarrow=False,
        text=f"Total clients: {df.shape[0]}"
    )
    fig.add_annotation(**row_count_note)

    fig.add_hline(
        y=network_average,
        line_dash="dash",
        line_color="#004649",
        annotation_text=f"Network Average: {network_average:.1f}",
        annotation_position="top right"
    )

    return fig
|
||||
@@ -0,0 +1,377 @@
|
||||
# FILE: funding_analysis.py
|
||||
# CREATED: 12/23/25
|
||||
# AUTHOR: Vincent Allen
|
||||
# CONTACT: vincent@vtallen.com valle276@live.kutztown.edu
|
||||
# PURPOSE:
|
||||
|
||||
# Contains the functions used to generate the plotly graphs for the capital funding analysis in the network wide desk reviews.
|
||||
|
||||
# external libraries
|
||||
import plotly.express as px
|
||||
import plotly.graph_objects as go
|
||||
import pandas as pd
|
||||
|
||||
#python modules
|
||||
|
||||
# Custom modules
|
||||
from constants_module import NEOSERRA_COLUMNS, OUT_COLUMNS
|
||||
|
||||
|
||||
def make_funding_attribution_network_wide(
        funding_df: pd.DataFrame,
        fiscal_year:str,
        title: str = "Capital Funding Attributions Per Center",
        network_label: str = "Network Wide",
        graph_note: str = "<b>NOTE: Documentation levels were determined as follows.</b><br><br>"
                          "<b>Documented:</br>Will be submitted to Nexus as long as 'Director Verified is checked'</b></br>There is a non-blank, non-'Requested on eCenter' attribution source</br>AND Affirmation Statement was non-blank</br></br>"
                          "<b>Affirmation Statement Missing:</br>Will NOT be submitted to Nexus</b></br>Attribution source is non-blank, non-'Requested on eCenter'</br>BUT affirmation statement was blank.</br></br>"
                          "<b>Not Documented:</br>Will NOT be submitted to Nexus</b></br>There is a non-blank, non-'Requested on eCenter' attribution source </br>with a value in the affirmation column. If the attribution source is</br>eCenter,then no value is required in the affirmation column.",
        col_neo_center: str = NEOSERRA_COLUMNS.center,
        col_documentation_level: str = OUT_COLUMNS.milestone_documentation_level
        ) -> go.Figure:
    """
    parameters:
        funding_df:pd.DataFrame - The capital funding analysis data (one row per milestone)
        fiscal_year:str - The fiscal year label to place at the end of the title
        title:str - The title to place on the graph
        network_label:str - Label substituted for "Network Wide"/"Network" in the title, or
                            prefixed to the title when it does not contain "Network". The default
                            value "Network Wide" leaves the title untouched.
        graph_note:str - The note placed to the right of the graph explaining how documentation
                         levels were derived. Pass an empty string to omit the note.
        col_neo_center:str - The column of the funding_df at which the center can be found
        col_documentation_level:str - The column of the funding_df at which the documentation level for the milestone can be found

    returns: go.Figure - The constructed figure object

    description:
        Counts the rows of funding_df per (center, documentation level) pair and draws the counts
        as one bar per center, colored by documentation level.
    """
    # Local import: the module header does not import re
    import re

    display_title = title
    if network_label != "Network Wide":
        if "Network" in title:
            # Single pass substitution. Chaining two str.replace calls would
            # substitute again inside the freshly inserted label whenever the
            # label itself contains the word "Network". A callable replacement
            # keeps backslashes in the label literal.
            display_title = re.sub(r"Network Wide|Network", lambda _match: network_label, title)
        else:
            display_title = f"{network_label} {title}"

    funding_agg_df = funding_df.groupby([col_neo_center, col_documentation_level]).size().reset_index(name='Count') #pyright:ignore
    funding_agg_df = funding_agg_df.sort_values(col_neo_center)

    # Fixed legend / stacking order of the documentation levels
    desired_order = ["Documented", "Affirmation Missing", "Not Documented"]

    fig = px.bar(
        funding_agg_df,
        x=col_neo_center,
        y='Count',
        color=col_documentation_level,
        text='Count',
        color_discrete_map={"Documented":"#71bf44", "Affirmation Missing":"#ffba31", "Not Documented":"#004649"},
        category_orders={col_documentation_level: desired_order}
    )

    fig.update_traces(
        textposition='inside',
        textfont_size=12
    )

    fig.update_layout(
        xaxis_title='Center',
        yaxis_title='Attribution Counts',
        title=f"{display_title} {fiscal_year}",
        height=700,
        width=1500,
    )

    # Truthiness check so that None, like "", simply omits the note
    if graph_note:
        # Widen the right margin to make room for the explanatory note
        fig.update_layout(margin=dict(r=470))
        fig.add_annotation(x=1.49, y=-0.1, xref='paper', yref='paper', showarrow=False, align='left',
                           text=graph_note)

    return fig
|
||||
|
||||
|
||||
def make_funding_attribution_rate_chart(
        funding_df: pd.DataFrame,
        fiscal_year: str,
        source_data_export_path: str = "",
        documented_tag: str = OUT_COLUMNS.val_documented,
        col_neo_center: str = NEOSERRA_COLUMNS.center,
        col_documentation_level: str = OUT_COLUMNS.milestone_documentation_level
        ) -> go.Figure:
    """
    parameters:
        funding_df:pd.DataFrame - The capital funding analysis data (one row per milestone)
        fiscal_year:str - The fiscal year label placed at the end of the graph title
        source_data_export_path:str - If a csv path is provided, the derived per center dataset
                                      (including a "Network Total" row) is written there
        documented_tag:str - The documentation level value that counts a milestone as documented
        col_neo_center:str - The column of the dataset containing the center for a milestone
        col_documentation_level:str - The column containing the documentation level assigned to a milestone

    returns: go.Figure

    description:
        Builds a bar graph of the share of funding milestones per center whose documentation
        level equals documented_tag, with a dashed line marking the share across all rows of
        funding_df.
    """
    # Milestone counts per (center, documentation level)
    level_counts = (
        funding_df
        .groupby([col_neo_center, col_documentation_level])
        .size()
        .reset_index(name='Count') #pyright:ignore
    )

    # Denominator: all milestones per center. Numerator: only the documented ones.
    documented_rows = level_counts[level_counts[col_documentation_level] == documented_tag]
    per_center_df = pd.DataFrame({
        'Total': level_counts.groupby(col_neo_center)['Count'].sum(),
        'Documented Count': documented_rows.groupby(col_neo_center)['Count'].sum()
    })

    # Centers without any documented milestone come out of the alignment as NaN
    per_center_df['Documented Count'] = per_center_df['Documented Count'].fillna(0)
    per_center_df['Percent Documented'] = per_center_df['Documented Count'] / per_center_df['Total']
    per_center_df = per_center_df.reset_index()

    # Network wide figures, appended as an extra row of the derived dataset
    milestone_total = funding_df.shape[0]
    documented_total = per_center_df['Documented Count'].sum()
    network_rate = documented_total / milestone_total
    network_row_df = pd.DataFrame({
        col_neo_center: ["Network Total"],
        'Total' : [milestone_total],
        'Documented Count': [documented_total],
        'Percent Documented': [network_rate]
    })
    derived_df = pd.concat([per_center_df, network_row_df], ignore_index=True)

    # The export is optional
    if source_data_export_path:
        derived_df.to_csv(source_data_export_path, index=False)

    # Bars show the centers only; the network row becomes the reference line
    centers_only_df = derived_df[derived_df[col_neo_center] != 'Network Total']
    fig = px.bar(
        centers_only_df,
        x=col_neo_center,
        y='Percent Documented',
        text='Percent Documented',
        color_discrete_sequence=['#71bf44']
    )

    network_rows = derived_df[derived_df[col_neo_center].isin(['Network Total'])]
    net_total = network_rows['Percent Documented'].iloc[0] #pyright:ignore
    fig.add_hline(
        y=net_total,
        line_dash="dash",
        line_color="#004649",
        annotation_text=f"Network Total: {net_total * 100:.1f}%",
        annotation_position="top left",
        annotation=dict(
            xref="paper",
            x=1.02,
            xanchor="left",
        )
    )

    fig.update_traces(
        textposition='inside',
        textfont_size=12,
        texttemplate="%{text:.0%}"
    )

    fig.update_layout(
        xaxis_title='Center',
        yaxis_title='Documented Percentage',
        yaxis_tickformat='.0%',
        title=f'Capital Funding Attribution Rates Per Center {fiscal_year}',
        height=700,
        width=1500,
        margin=dict(r=150)
    )

    return fig
|
||||
|
||||
|
||||
def make_theoretical_funding_attribution_rate_chart(
        funding_df: pd.DataFrame,
        fiscal_year: str,
        title: str = 'Documented Percentage if All Funding Milestones With an Attribution Source had an Affirmation',
        source_data_export_path: str = "",
        documented_tag: str = OUT_COLUMNS.val_documented,
        affirmation_missing_tag: str = OUT_COLUMNS.val_affirmation_missing,
        col_neo_center: str = NEOSERRA_COLUMNS.center,
        col_documentation_level: str = OUT_COLUMNS.milestone_documentation_level
        ) -> go.Figure:
    """
    parameters:
        funding_df:pd.DataFrame - The capital funding analysis dataframe (one row per milestone)
        fiscal_year:str - The fiscal year label placed at the end of the graph title
        title:str - The title to place on the graph
        source_data_export_path:str - The path to save the intermediate dataset that produced the graph (if provided)
        documented_tag:str - The documentation level value that counts a milestone as documented
        affirmation_missing_tag:str - The documentation level value of milestones that only lack an
                                      affirmation; these are counted as documented by this chart
        col_neo_center:str - The column of the dataset that determines the center
        col_documentation_level:str - The column of the dataset containing the documentation level

    returns: go.Figure - The constructed figure object

    description:
        Generates a bar chart of the per center share of milestones whose documentation level is
        either documented_tag or affirmation_missing_tag, i.e. what the documented rate would be
        if every milestone that is only missing its affirmation had one. A dashed line marks the
        same share across all rows of funding_df.
    """
    funding_temp_agg = funding_df.groupby([col_neo_center, col_documentation_level]).size().reset_index(name='Count') #pyright:ignore

    # Keep only the two levels that count towards the theoretical rate
    funding_combined_and_wrong_spot_df = funding_temp_agg[funding_temp_agg[col_documentation_level].isin(
        [documented_tag, affirmation_missing_tag]
    )]

    funding_documented_agg_df = funding_combined_and_wrong_spot_df.groupby(col_neo_center).agg(
        Documentation_Levels_Combined=(col_documentation_level, ','.join),
        Documented_Count=('Count', 'sum')
    ).reset_index()

    funding_total_agg_df = funding_temp_agg.groupby(col_neo_center).agg(
        Grand_Total_Count=('Count', 'sum')
    ).reset_index()

    # Outer merge so centers without any qualifying milestone are kept
    funding_final_df = pd.merge(
        funding_documented_agg_df,
        funding_total_agg_df,
        on=col_neo_center,
        how='outer'
    )

    funding_final_df['Documented_Count'] = funding_final_df['Documented_Count'].fillna(0).astype(int)
    # This is a text column, so the placeholder is the string "0" rather than
    # the integer 0: the exported CSV is unchanged, but the column no longer
    # mixes int and str values.
    # NOTE(review): string-backed pandas dtypes may reject a non-string fill value - confirm.
    funding_final_df['Documentation_Levels_Combined'] = funding_final_df['Documentation_Levels_Combined'].fillna("0")

    funding_final_df['Percent_Documented'] = funding_final_df['Documented_Count'] / funding_final_df['Grand_Total_Count']

    # Network wide figures, appended as an extra row of the derived dataset
    total_funding_milestones = funding_df.shape[0]
    funding_total_group = funding_final_df['Documented_Count'].sum()
    funding_network_total = funding_total_group / total_funding_milestones

    funding_network_total_count = pd.DataFrame({
        col_neo_center: ["Network Total"],
        'Grand_Total_Count' : [total_funding_milestones],
        'Documented_Count': [funding_total_group],
        'Percent_Documented': [funding_network_total]
    })

    funding_final_df = pd.concat([funding_final_df, funding_network_total_count], ignore_index=True)

    # The export is optional
    if source_data_export_path:
        funding_final_df.to_csv(source_data_export_path, index=False)

    # Bars show the centers only; the network row becomes the reference line
    fig = px.bar(
        funding_final_df[funding_final_df[col_neo_center] != 'Network Total'],
        x=col_neo_center,
        y='Percent_Documented',
        text='Percent_Documented',
        color_discrete_sequence=['#71bf44']
    )

    # Network total
    net_total = funding_final_df[funding_final_df[col_neo_center].isin(['Network Total'])]['Percent_Documented'].iloc[0] #pyright:ignore
    fig.add_hline(y=net_total,
                  line_dash="dash",
                  line_color="#004649",
                  annotation_text=f"Network Total: {net_total * 100:.1f}%",
                  annotation_position="top right",
                  annotation=dict(
                      xref="paper",
                      x=1.02,
                      xanchor="left",
                  )
    )

    fig.update_traces(
        textposition='inside',
        textfont_size=12,
        texttemplate="%{text:.0%}"
    )

    fig.update_layout(
        xaxis_title='Center',
        yaxis_title='Documented and Affirmation Missing Percentage',
        yaxis_tickformat='.0%',
        title=f"{title} {fiscal_year}",
        height=700,
        width=1500,
        margin=dict(r=150)
    )

    return fig
|
||||
|
||||
|
||||
def make_funding_director_confirmed_graph(
        funding_df: pd.DataFrame,
        fiscal_year: str,
        title: str = 'Percentage of Director Confirmed Capital Funding Attributions Per Center',
        source_data_export_path: str = "",
        col_neo_center: str = NEOSERRA_COLUMNS.center,
        col_neo_attribution_source: str = NEOSERRA_COLUMNS.milestone_attribution_source
        ) -> go.Figure:
    """
    parameters:
        funding_df:pd.DataFrame - The capital funding analysis data (one row per milestone)
        fiscal_year:str - The fiscal year label placed at the end of the graph title
        title:str - The title to place on the graph
        source_data_export_path:str - The path + filename to save the intermediate calculation dataset to (if provided)
        col_neo_center:str - The column in the dataset to consider the center column
        col_neo_attribution_source:str - The column of the dataset that contains the attribution source for a milestone

    returns: go.Figure - The generated graph figure

    description:
        Generates a bar graph of the share of each center's funding milestones whose attribution
        source contains the text "Director".
    """
    # Rows whose attribution source mentions "Director"; missing sources count as no match
    director_mask = funding_df[col_neo_attribution_source].str.contains("Director", na=False)

    summary_df = pd.DataFrame({
        'Total Count': funding_df.groupby(col_neo_center).size(),
        'Director Count': funding_df[director_mask].groupby(col_neo_center).size()
    })

    # Centers without a director confirmed milestone come out of the alignment as NaN
    summary_df['Director Count'] = summary_df['Director Count'].fillna(0).astype(int)
    summary_df['Percent Director'] = summary_df['Director Count'] / summary_df['Total Count']
    summary_df = summary_df.reset_index()

    # The export is optional
    if source_data_export_path:
        summary_df.to_csv(source_data_export_path, index=False)

    fig = px.bar(
        summary_df,
        x=col_neo_center,
        y='Percent Director',
        text='Percent Director',
        color_discrete_sequence=['#ffba31']
    )

    bar_text_style = dict(
        textposition='inside',
        textfont_size=12,
        texttemplate="%{text:.0%}",
    )
    fig.update_traces(**bar_text_style)

    fig.update_layout(
        xaxis_title='Center',
        yaxis_title='Percent of Director Confirmed Attributions',
        yaxis_tickformat='.0%',
        title=f'{title} {fiscal_year}',
        height=700,
        width=1500,
        margin=dict(r=150)
    )

    return fig
|
||||
@@ -0,0 +1,357 @@
|
||||
# FILE: naics_census_analysis.py
|
||||
# CREATED: 12/19/25
|
||||
# AUTHOR: Vincent Allen
|
||||
# CONTACT: vincent@vtallen.com valle276@live.kutztown.edu
|
||||
# PURPOSE:
|
||||
|
||||
# Contains functions used to generate the figures for section 1.1 of the all centers desk review
|
||||
|
||||
# This section specifically contains an analysis of PASBDC client NAICs codes compared to the proportions
|
||||
# found in the US census data for PA.
|
||||
|
||||
# It also visualizes which counties have PASBDC client profiles with missing NAICs codes
|
||||
|
||||
# Third party libraries
|
||||
import plotly.graph_objects as go
|
||||
import plotly.express as px
|
||||
import pandas as pd
|
||||
import geopandas as gpd
|
||||
import certifi
|
||||
|
||||
# Python modules
|
||||
import textwrap
|
||||
from typing import List
|
||||
import ssl
|
||||
import urllib.request
|
||||
# Custom packages
|
||||
from constants_module import NEOSERRA_COLUMNS, OUT_COLUMNS
|
||||
|
||||
|
||||
def make_census_naics_chart(
        naics_df:pd.DataFrame,
        naics_column_name:str=OUT_COLUMNS.unified_naics,
        label_column_name:str=OUT_COLUMNS.naics_label,
        census_data_column_name:str=OUT_COLUMNS.census_pct) -> go.Figure:
    """
    parameters:
        naics_df:pd.DataFrame - The input dataframe containing NAICS codes, descriptions of the codes, and a percentage per code
        naics_column_name:str - The column at which the NAICs codes can be found in naics_df
        label_column_name:str - The column at which the NAICs code descriptions can be found in naics_df
        census_data_column_name:str - The column at which the percentage can be found

    description:
        Produces a three column plotly table: the NAICS code, its description (wrapped at
        45 characters per line), and the percentage formatted with one decimal place and a
        trailing "%".

        The percentage is printed as given, so it should be out of 100 not out of 1
        (99.34 not 0.9934)

    returns: go.Figure - The constructed figure object
    """
    max_label_chars = 45  # Adjust this width as needed

    def _wrap_label(label):
        # plotly tables break lines on <br>
        return '<br>'.join(textwrap.wrap(label, width=max_label_chars))

    code_cells = naics_df[naics_column_name]
    label_cells = [_wrap_label(label) for label in naics_df[label_column_name]]
    percentage_cells = [f"{value:.1f}%" for value in naics_df[census_data_column_name]]

    table = go.Table(
        header=dict(
            values=["Sector", "Definition", "PA Census Percentage"],
            fill_color='#6ebe4d'
        ),
        cells=dict(
            values=[code_cells, label_cells, percentage_cells],
            align=['left', 'left', 'right'],
            font=dict(size=11, color='black'),
            fill_color='#bad4d6'
        ),
        columnwidth=[0.15, 0.5, 0.15],
    )

    fig = go.Figure(data=[table])
    fig.update_layout(
        height=575,
        width=500,
        margin=dict(l=10, r=10, t=10, b=10),
        font_family="Futura",
        title=None
    )

    return fig
|
||||
|
||||
def make_client_census_comparison_graph(
        naics_df:pd.DataFrame,
        client_df:pd.DataFrame,
        title:str="Comparison between PA Census NAICS code distribution and PASBDC client NAICs distribution FY 25",
        naics_df_naics_code_column_name:str=OUT_COLUMNS.unified_naics,
        naics_df_naics_label_column_name:str=OUT_COLUMNS.naics_label,
        naics_df_census_percentage_column_name:str=OUT_COLUMNS.census_pct,
        client_df_naics2_column_name:str=OUT_COLUMNS.naics_2,
        client_df_census_percentage:str=OUT_COLUMNS.pa_naics_pct,
        client_df_pasbdc_percentage:str=OUT_COLUMNS.pasbdc_pct
        ) -> go.Figure:
    """
    parameters:
        naics_df:pd.DataFrame - The dataframe containing NAICs codes and their descriptions
        client_df:pd.DataFrame - The dataframe containing PASBDC client data. It is not modified.
        title:str - The title to give the graph
        naics_df_naics_code_column_name:str - The column of naics_df at which 2 digit naics codes (or "low-high" ranges such as "31-33") can be found
        naics_df_naics_label_column_name:str - The column of naics_df at which the naics code descriptions can be found
        naics_df_census_percentage_column_name:str - The column of naics_df at which census percentages can be found (must be in the form 99.34 not 0.9934)
        client_df_naics2_column_name:str - The column of client_df where 2 digit naics codes can be found
        client_df_census_percentage:str - The column of client_df where the census data about what share of businesses have that code in PA
        client_df_pasbdc_percentage:str - The column of client_df where the percentage of PASBDC businesses with that naics code can be found

    returns:
        go.Figure - The constructed figure object

    description:
        Constructs a bar chart displaying what percentage of PASBDC businesses fall into each NAICS code.
        Overlayed on top of this is a line graph that displays the percentage of that NAICS code within the
        census data (looked up from naics_df).

        NAICS codes 31, 32, and 33 are combined in the final graph

    NOTE(review): the column names are now taken from the parameters everywhere. Earlier revisions
    hard-coded "NAICS_2" and "PASBDC NAICs Code Percentage" in places; these presumably equal the
    defaults from constants_module - confirm.
    """
    # -------------------- Prepare NAICS mapping --------------------------------
    # code (int) -> industry label and PA census percentage; a "low-high" code
    # is expanded so every code in the range points at the same values
    naics_mapping = {}
    for raw_code, label, census_pct in zip(
            naics_df[naics_df_naics_code_column_name],
            naics_df[naics_df_naics_label_column_name],
            naics_df[naics_df_census_percentage_column_name]):
        bounds = str(raw_code).split('-')
        if len(bounds) == 2:
            codes = range(int(bounds[0]), int(bounds[1]) + 1)
        else:
            codes = [int(raw_code)]
        for code in codes:
            naics_mapping[code] = {
                "Industry": label,
                "PA_Percentage": float(census_pct)
            }

    # -------------------- Clean and Aggregate Client Data -----------------------
    # Work on a copy of the needed columns so the caller's dataframe is left untouched
    clients = client_df[[
        client_df_naics2_column_name,
        client_df_pasbdc_percentage,
        client_df_census_percentage
    ]].copy()
    clients[client_df_naics2_column_name] = clients[client_df_naics2_column_name].astype(float).astype(int)

    agg_df = (
        clients
        .groupby(client_df_naics2_column_name, as_index=False)
        .agg({
            client_df_pasbdc_percentage: "mean",
            client_df_census_percentage: "mean"
        })
        .sort_values(client_df_naics2_column_name)
    ) #pyright:ignore

    # -------------------- Combine NAICS 31, 32, 33 ------------------------------
    # Create a new column to group manufacturing codes
    agg_df["NAICS_Combined"] = agg_df[client_df_naics2_column_name].apply(
        lambda x: "31-33" if x in [31, 32, 33] else str(x)
    )

    # Aggregate the combined codes
    combined_df = agg_df.groupby("NAICS_Combined", as_index=False).agg({
        client_df_pasbdc_percentage: "sum",  # Sum the percentages
        client_df_census_percentage: "sum",
        client_df_naics2_column_name: "first"  # Keep first code for mapping
    })

    is_manufacturing = combined_df["NAICS_Combined"] == "31-33"

    # Point the combined row at code 31 so the mapping lookups below resolve
    combined_df.loc[is_manufacturing, client_df_naics2_column_name] = 31

    # Add industry name from mapping
    combined_df["Industry"] = combined_df[client_df_naics2_column_name].map(#pyright:ignore
        lambda x: naics_mapping.get(x, {}).get("Industry", "Unknown")
    )

    # For the combined manufacturing row, use a custom label
    combined_df.loc[is_manufacturing, "Industry"] = "Manufacturing"

    # PA percentages come from the mapping (looked up once per row, so the
    # 31-33 value is not summed three times)
    combined_df["PA_Mapped_Percentage"] = combined_df[client_df_naics2_column_name].map(#pyright:ignore
        lambda x: naics_mapping.get(x, {}).get("PA_Percentage", None)
    ) #pyright:ignore

    # Combine label for x-axis
    combined_df["NAICS_Label"] = combined_df["NAICS_Combined"].astype(str) + " - " + combined_df["Industry"].astype(str) #pyright:ignore

    # Sort to maintain logical order
    combined_df = combined_df.sort_values("NAICS_Combined") #pyright:ignore

    # -------------------- Plot PASBDC Bars --------------------------------------
    fig = px.bar(
        combined_df,
        x="NAICS_Label",
        y=client_df_pasbdc_percentage,
        text=client_df_pasbdc_percentage,
        width=1500,
        height=1000,
        title=title
    )
    fig.update_traces(marker_color="#6ebe4d")
    fig.update_traces(
        name="PASBDC Percentage",
        texttemplate="%{text:.1f}%",
        textposition="outside",
        textfont_size=14
    )

    # -------------------- Overlay PA census line --------------------------------
    fig.add_trace(go.Scatter(
        x=combined_df["NAICS_Label"],
        y=combined_df["PA_Mapped_Percentage"],
        name="PA Census Percentage",
        mode="lines+markers",
        line=dict(color="#27323a"),
        marker=dict(size=8)
    ))

    # Leave headroom above the tallest bar / point for the outside text labels
    max_y = combined_df[[client_df_pasbdc_percentage, "PA_Mapped_Percentage"]].max().max()
    if pd.isna(max_y):
        max_y = 0.0

    fig.update_layout(
        bargap=0.1,
        barmode="group",
        yaxis=dict(
            range=[0, max_y * 1.15],
            title="Percentage of PASBDC Businesses",
            tickformat=".0f"
        ),
        xaxis=dict(title="Industry (NAICS Code)"),
        legend=dict(x=0.02, y=0.98)
    )

    return fig
|
||||
|
||||
def make_county_heatmap(
    county_stats_df:pd.DataFrame,
    value_column:str,
    title:str,
    tick_suffix:str="%",
    color_continuous_scale:List[str]=['#cde0c3','#b4e09a', '#6dafb2', '#499699', '#2d797c', '#256e70', '#156264', '#094f51', '#004649', '#003234', '#002f30', '#002223', '#001111']
):
    """
    parameters:
        county_stats_df:pd.DataFrame - The dataframe to visualize. The function reads the columns 'fips' (string county
            FIPS codes), 'County', 'Total Clients', 'Missing NAICS' and the column named by value_column
        value_column:str - The value you want to visualize on the state heat map
        title:str - The title of the map
        tick_suffix:str - The suffix to place after the numbers on the color bar ticks
        color_continuous_scale:List[str] - Used to set the color gradient of the key

    returns:
        go.Figure - The constructed figure object

    description:
        Creates a heatmap of all of the counties within PA based on an input dataframe and a selected
        value column. County shapes are downloaded from the plotly datasets repository on every call.
        The color range is fixed to 0-100, and a footnote reports the share of client profiles that are
        missing NAICS codes across all rows of county_stats_df.
    """
    # This file has the FIPS codes and the county shape (geometry)
    geojson_url = 'https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json'

    # NOTE: certificate verification is switched off for the download because of SSL problems seen with
    # Python 3.14 on Windows. This weakens transport security, so the override is limited to the download
    # itself and the previous context factory is always put back afterwards.
    previous_https_context = ssl._create_default_https_context
    ssl._create_default_https_context = ssl._create_unverified_context
    try:
        gdf = gpd.read_file(geojson_url)
    finally:
        ssl._create_default_https_context = previous_https_context

    # We merge your data *onto* the geospatial data
    merged_gdf = gdf.merge(county_stats_df, left_on='id', right_on='fips')

    # Filter for Pennsylvania (FIPS prefix '42')
    pa_gdf = merged_gdf[merged_gdf['fips'].str.startswith('42')].copy()

    # Calculate the centroid for each county's geometry, used to position the county name labels
    # Note: This might warn about CRS, but for plotting it's usually fine.
    centroids = pa_gdf.geometry.centroid
    pa_gdf['lon'] = centroids.x
    pa_gdf['lat'] = centroids.y

    # Create the text label (county name only, the value was removed for readability)
    # Plotly accepts html styling in text labels
    pa_gdf['label_text'] = '<span style="color:white; font-size:10px;">' + pa_gdf['County'].astype(str) + "</span>"

    # Create the Base Choropleth Map
    fig = px.choropleth(
        pa_gdf,
        geojson=pa_gdf,
        locations='fips',
        featureidkey='properties.fips',
        color=value_column,  # Column for color
        hover_name='County',
        hover_data={value_column: ':.1f', 'fips': False},
        color_continuous_scale=color_continuous_scale,
        range_color=[0, 100]
    )

    # Add a text-only scatter layer that writes each county name at its centroid
    fig.add_trace(
        go.Scattergeo(
            lat=pa_gdf['lat'],
            lon=pa_gdf['lon'],
            text=pa_gdf['label_text'],
            mode='text',
            hoverinfo='none'
        )
    )

    fig.update_geos(
        fitbounds="locations",
        visible=False,
        projection_scale=60,
        center={"lat": pa_gdf['lat'].mean(), "lon": pa_gdf['lon'].mean()}
    )

    fig.update_coloraxes(
        colorbar=dict(
            title=value_column,
            ticksuffix=tick_suffix,
        )
    )

    # Set the layout for saving an image
    fig.update_layout(
        title=dict(
            text=title,
            font=dict(size=22, color='black'),
            x=0.5,  # center the title horizontally (0=left, 1=right)
            xanchor='center',
            yanchor='top'
        ),
        margin={"r":0,"t":80,"l":0,"b":0},
        width=2000,
        height=1200,
    )

    # Derive the network wide missing naics code value
    total_clients = county_stats_df['Total Clients'].sum()
    missing_naics_total = county_stats_df['Missing NAICS'].sum()

    # Guard against an empty / all-zero client total so the footnote never shows a nan or inf percentage
    if total_clients:
        missing_naics_percent = round((missing_naics_total/total_clients)*100, 2)
    else:
        missing_naics_percent = 0.0

    fig.add_annotation(
        x=0.0,
        y=-0.0,
        xref="paper",
        yref="paper",
        text=f"Network Wide, {missing_naics_percent}% of client profiles are missing NAICS codes.",
        showarrow=False,
    )

    return fig
|
||||
@@ -0,0 +1,381 @@
|
||||
# FILE: nbs_analysis.py
|
||||
# CREATED: 12/23/25
|
||||
# AUTHOR: Vincent Allen
|
||||
# CONTACT: vincent@vtallen.com valle276@live.kutztown.edu
|
||||
# PURPOSE:
|
||||
|
||||
# Contains the functions used to generate the plotly graphs for the milestone analysis in the network wide desk reviews.
|
||||
|
||||
# external libraries
|
||||
import plotly.express as px
|
||||
import plotly.graph_objects as go
|
||||
import pandas as pd
|
||||
|
||||
#python modules
|
||||
|
||||
# Custom modules
|
||||
from constants_module import NEOSERRA_COLUMNS, OUT_COLUMNS
|
||||
|
||||
def make_nbs_attribution_network_wide(
    nbs_df:pd.DataFrame,
    title:str="New Business Start Attributions Per Center FY 25",
    network_label:str="Network",
    graph_note:str="<b>NOTE:</b>Documentation levels were determined as follows.<br><br>"
    "<b>Documented: </br>Will be submitted to Nexus as long as 'Director Verified' is checked</b></br>There is a non-blank, non-'Requested on eCenter' attribution source </br>AND Affirmation Statement was non-blank<br>\tNOTE: If the attribution source is eCenter, no affirmation is required.<br><br>"
    "<b>Affirmation Statement Missing:</br>Will NOT be submitted to Nexus</b></br>Attribution source is non-blank, non-'Requested on eCenter'</br>BUT affirmation statement was blank.</br></br>"
    "<b>Not Documented:</br>Will NOT be submitted to Nexus</b></br>There is a non-blank, non-'Requested on eCenter' attribution source </br>with a value in the affirmation column. If the attribution <br>source is eCenter, then no value is required in the affirmation column.",
    col_neo_center:str=NEOSERRA_COLUMNS.center,
    col_documentation_level:str=OUT_COLUMNS.milestone_documentation_level
):
    """
    parameters:
        nbs_df:pd.DataFrame - The new business starts analysis data (one row per milestone)
        title:str - The title to place on the graph
        network_label:str - When this is anything other than 'Network', it replaces the word 'Network' in the
            title, or is prefixed to the title if the title does not contain 'Network'
        graph_note:str - The note placed to the right of the graph explaining how documentation levels were
            derived. Pass an empty string to leave the note (and its extra right margin) off
        col_neo_center:str - The column of the nbs_df at which the center can be found
        col_documentation_level:str - The column of the nbs_df at which the documentation level for the milestone can be found

    returns: go.Figure - The constructed figure object

    description:
        Counts the milestones per center and documentation level, then draws those counts network wide as a
        stacked bar graph with one bar per center.
    """
    # Work out the title to show, swapping in the custom network label when one was given
    if network_label == "Network":
        display_title = title
    elif "Network" in title:
        display_title = title.replace("Network", network_label)
    else:
        display_title = f"{network_label} {title}"

    # One row per (center, documentation level) pair holding the number of milestones in it
    level_counts = nbs_df.groupby([col_neo_center, col_documentation_level]).size().reset_index(name='Count') #pyright:ignore
    level_counts = level_counts.sort_values(col_neo_center)

    # Fixed stacking order and colors so every graph reads the same way
    level_order = ["Documented", "Affirmation Missing", "Not Documented"]
    level_colors = {
        "Documented": "#71bf44",
        "Affirmation Missing": "#ffba31",
        "Not Documented": "#004649",
    }

    fig = px.bar(
        level_counts,
        x=col_neo_center,
        y='Count',
        color=col_documentation_level,
        text='Count',
        color_discrete_map=level_colors,
        category_orders={col_documentation_level: level_order}
    )

    fig.update_traces(textposition='inside', textfont_size=12)

    fig.update_layout(
        xaxis_title='Center',
        yaxis_title='Attribution Counts',
        title=display_title,
        height=700,
        width=1500,
        xaxis={'categoryorder': 'category ascending'}
    )

    # Nothing more to do when the caller asked for no explanatory note
    if graph_note == "":
        return fig

    # Make room on the right hand side and place the note there
    fig.update_layout(margin=dict(r=470))
    fig.add_annotation(
        x=1.49,
        y=0,
        xref='paper',
        yref='paper',
        showarrow=False,
        align='left',
        text=graph_note
    )

    return fig
|
||||
|
||||
def make_attribution_rate_chart(
    nbs_df:pd.DataFrame,
    source_data_export_path:str="",
    fiscalyear:str="",
    documented_tag:str=OUT_COLUMNS.val_documented,
    col_neo_center:str=NEOSERRA_COLUMNS.center,
    col_documentation_level:str=OUT_COLUMNS.milestone_documentation_level
):
    """
    parameters:
        nbs_df:pd.DataFrame - The new business starts analysis data (one row per milestone)
        source_data_export_path:str - If a csv path is provided, the intermediate dataset will be exported there
        fiscalyear:str - Text appended to the end of the graph title (for example 'FY 25')
        documented_tag:str - The documentation level value that counts a milestone as documented
        col_neo_center:str - The column of the dataset containing the center for a milestone
        col_documentation_level:str - The column containing the documentation level assigned to a milestone

    returns: go.Figure

    description:
        Builds a bar graph of the share of NBS milestones per center whose documentation level equals
        documented_tag, with a dashed line for the network wide share. The intermediate dataset
        (center, Total, Documented Count, Percent Documented, plus a 'Network Total' row) can be exported
        by providing a valid CSV path + filename to the source_data_export_path parameter.
    """
    network_row_label = "Network Total"

    # Milestone counts for every (center, documentation level) pair
    level_counts = nbs_df.groupby([col_neo_center, col_documentation_level]).size().reset_index(name='Count') #pyright:ignore

    # Per center: every milestone (the denominator) and only the documented ones (the numerator)
    per_center_total = level_counts.groupby(col_neo_center)['Count'].sum()
    is_documented = level_counts[col_documentation_level] == documented_tag
    per_center_documented = level_counts[is_documented].groupby(col_neo_center)['Count'].sum()

    # Both series are indexed by center, so pandas lines the rows up for us
    combined_df = pd.DataFrame({'Total': per_center_total, 'Documented Count': per_center_documented})

    # Centers without any documented milestone come through as nan, treat them as zero
    combined_df['Documented Count'] = combined_df['Documented Count'].fillna(0)
    combined_df['Percent Documented'] = combined_df['Documented Count'] / combined_df['Total']
    combined_df = combined_df.reset_index()

    # Network wide figures, appended as one extra row so they are part of the export
    milestone_count = nbs_df.shape[0]
    documented_count = combined_df['Documented Count'].sum()
    network_row = pd.DataFrame({
        col_neo_center: [network_row_label],
        'Total': [milestone_count],
        'Documented Count': [documented_count],
        'Percent Documented': [documented_count / milestone_count]
    })
    combined_df = pd.concat([combined_df, network_row], ignore_index=True)

    # Save the derived dataset only if the user wants it
    if source_data_export_path:
        combined_df.to_csv(source_data_export_path, index=False)

    # Split the network row away from the centers: centers become bars, the network row becomes a line
    is_network_row = combined_df[col_neo_center].isin([network_row_label])
    net_total = combined_df[is_network_row]['Percent Documented'].iloc[0] #pyright:ignore

    fig = px.bar(
        combined_df[~is_network_row],
        x=col_neo_center,
        y='Percent Documented',
        text='Percent Documented',
        color_discrete_sequence=['#71bf44']
    )

    # Dashed network total line, labelled just outside the right edge of the plot
    fig.add_hline(
        y=net_total,
        line_dash="dash",
        line_color="#004649",
        annotation_text=f"Network Total: {net_total * 100:.1f}%",
        annotation_position="top right",
        annotation=dict(xref="paper", x=1.02, xanchor="left")
    )

    fig.update_traces(textposition='inside', textfont_size=12, texttemplate="%{text:.0%}")

    fig.update_layout(
        xaxis_title='Center',
        yaxis_title='Documented Percentage',
        title=f'New Business Start Attribution Rates Per Center {fiscalyear}',
        yaxis_tickformat='.0%',
        height=700,
        width=1500,
        margin=dict(r=150)
    )

    return fig
|
||||
|
||||
|
||||
def make_theoretical_attribution_rate_chart(
    nbs_df:pd.DataFrame,
    title:str='Documented Percentage if All NBS Milestones With an Attribution Source had an Affirmation FY 25',
    source_data_export_path:str="",
    documented_tag:str=OUT_COLUMNS.val_documented,
    affirmation_missing_tag:str=OUT_COLUMNS.val_affirmation_missing,
    col_neo_center:str=NEOSERRA_COLUMNS.center,
    col_documentation_level:str=OUT_COLUMNS.milestone_documentation_level
) -> go.Figure:
    """
    parameters:
        nbs_df:pd.DataFrame - The new business starts analysis dataframe (one row per milestone)
        title:str - The title to place on the graph
        source_data_export_path:str - The path to save the intermediate dataset that produced the graph (if provided)
        documented_tag:str - The documentation level value that tells the function to consider a milestone documented
        affirmation_missing_tag:str - The documentation level value for milestones whose affirmation is missing.
            These are counted as if they were documented
        col_neo_center:str - The column of the dataset that determines the center
        col_documentation_level:str - The column of the dataset containing the documentation level

    returns: go.Figure - The constructed figure object

    description:
        Generates a bar chart that displays what the documentation rate of each center would be if every
        milestone tagged with affirmation_missing_tag were counted as documented, together with a dashed line for the
        network wide rate.
    """
    # group by center and documentation level, then count the number of milestones for each grouping
    level_counts = nbs_df.groupby([col_neo_center, col_documentation_level]).size().reset_index(name='Count') #pyright:ignore

    # Select only the milestones that are documented or are only missing their affirmation
    counted_as_documented = level_counts[level_counts[col_documentation_level].isin(
        [documented_tag, affirmation_missing_tag]
    )]

    # Regroup by center, concatenating the documentation levels that were merged (for traceability)
    # and summing up the counts that fall into each group
    documented_agg_df = counted_as_documented.groupby(col_neo_center).agg(
        Documentation_Levels_Combined=(col_documentation_level, ','.join),
        Documented_Count=('Count', 'sum')
    ).reset_index()

    # Count up the milestones regardless of their documentation level
    total_agg_df = level_counts.groupby(col_neo_center).agg(
        Grand_Total_Count=('Count', 'sum')
    ).reset_index()

    # Merge the "good" results with the grand total. The outer join keeps centers that have no
    # milestone in either of the selected documentation levels
    final_df = pd.merge(
        documented_agg_df,
        total_agg_df,
        on=col_neo_center,
        how='outer'
    )

    # Fill nan values so calculations do not fail
    final_df['Documented_Count'] = final_df['Documented_Count'].fillna(0).astype(int)
    # NOTE(review): this fills a text column with the integer 0, which mixes types in the export.
    # Kept as is so existing exports do not change - confirm whether an empty string is preferred.
    final_df['Documentation_Levels_Combined'] = final_df['Documentation_Levels_Combined'].fillna(0)

    # Calculate the documentation percentage for each center
    final_df['Percent_Documented'] = final_df['Documented_Count'] / final_df['Grand_Total_Count']

    total_nbs_milestones = nbs_df.shape[0]
    total_group = final_df['Documented_Count'].sum()
    network_total = total_group / total_nbs_milestones

    # Determine the network total values, then merge it with the rest of the centers.
    # The row is keyed by col_neo_center (not a hard coded 'Center') so that it lands in the same
    # column as the centers whatever that column is called, otherwise the lookup below finds nothing.
    network_total_count = pd.DataFrame({
        col_neo_center: ["Network Total"],
        'Grand_Total_Count' : [total_nbs_milestones],
        'Documented_Count': [total_group],
        'Percent_Documented': [network_total]
    })

    final_df = pd.concat([final_df, network_total_count], ignore_index=True)

    if source_data_export_path:
        final_df.to_csv(source_data_export_path, index=False)

    # Centers are drawn as bars, the network total row is drawn as a line instead
    is_network_row = final_df[col_neo_center].isin(['Network Total'])
    net_total = final_df[is_network_row]['Percent_Documented'].iloc[0] #pyright: ignore

    fig = px.bar(
        final_df[~is_network_row],
        x=col_neo_center,
        y='Percent_Documented',
        text='Percent_Documented',
        color_discrete_sequence=['#71bf44']
    )

    # Network total, labelled just outside the right edge of the plot
    fig.add_hline(
        y=net_total,
        line_dash="dash",
        line_color="#004649",
        annotation_text=f"Network Total: {net_total * 100:.1f}%",
        annotation_position="top right",
        annotation=dict(
            xref="paper",
            x=1.02,
            xanchor="left",
        )
    )

    fig.update_traces(
        textposition='inside',
        textfont_size=12,
        texttemplate="%{text:.0%}"
    )

    fig.update_layout(
        xaxis_title='Center',
        yaxis_title='Documented and Affirmation Missing Percentage',
        yaxis_tickformat='.0%',
        title=title,
        height=700,
        width=1500,
        margin=dict(r=150)
    )

    return fig
|
||||
|
||||
def make_director_confirmed_graph(
    nbs_df:pd.DataFrame,
    title:str='Percentage of Director Confirmed NBS Attributions Per Center FY 25',
    source_data_export_path:str="",
    col_neo_center:str=NEOSERRA_COLUMNS.center,
    col_neo_attribution_source:str=NEOSERRA_COLUMNS.milestone_attribution_source
) -> go.Figure:
    """
    parameters:
        nbs_df:pd.DataFrame - The new business starts analysis data (one row per milestone)
        title:str - The title to place on the graph
        source_data_export_path:str - The path + filename to save the intermediate calculation dataset to (if provided)
        col_neo_center:str - The column in the dataset to consider the center column
        col_neo_attribution_source:str - The column of the dataset that contains the attribution source for a milestone

    returns: go.Figure - The generated graph figure

    description:
        Generates a bar graph displaying, for each center, the share of milestones whose attribution source
        contains the word "director" (in any letter case).
    """
    # A milestone is director confirmed when its attribution source mentions "director".
    # Blank attribution sources never match.
    is_director = nbs_df[col_neo_attribution_source].str.contains("director", case=False, na=False)

    # Milestones per center: all of them, and only the director confirmed ones
    per_center_total = nbs_df.groupby(col_neo_center).size()
    per_center_director = nbs_df[is_director].groupby(col_neo_center).size()

    # Both series are indexed by center, so they are joined on it automatically
    combined_df = pd.DataFrame({
        'Total Count': per_center_total,
        'Director Count': per_center_director
    })

    # Centers without a director confirmed milestone come through as nan, count them as zero
    combined_df['Director Count'] = combined_df['Director Count'].fillna(0).astype(int)
    combined_df['Percent Director'] = combined_df['Director Count'] / combined_df['Total Count']
    combined_df = combined_df.reset_index()

    if source_data_export_path:
        combined_df.to_csv(source_data_export_path, index=False)

    fig = px.bar(
        combined_df,
        x=col_neo_center,
        y='Percent Director',
        text='Percent Director',
        color_discrete_sequence=['#ffba31']
    )

    fig.update_traces(textposition='inside', textfont_size=12, texttemplate="%{text:.0%}")

    fig.update_layout(
        xaxis_title='Center',
        yaxis_title='Percent of Director Confirmed Attributions',
        yaxis_tickformat='.0%',
        title=title,
        height=700,
        width=1500
    )

    return fig
|
||||
@@ -0,0 +1,254 @@
|
||||
# FILE: satisfaction_survey_analysis.py
|
||||
# CREATED: 12/23/25
|
||||
# AUTHOR: Vincent Allen
|
||||
# CONTACT: vincent@vtallen.com valle276@live.kutztown.edu
|
||||
# PURPOSE:
|
||||
|
||||
# Contains the functions used to generate the plotly graphs for the client satisfaction survey analysis.
|
||||
|
||||
# external libraries
|
||||
import plotly.express as px
|
||||
import plotly.graph_objects as go
|
||||
import pandas as pd
|
||||
|
||||
#python modules
|
||||
|
||||
# Custom modules
|
||||
from constants_module import NEOSERRA_COLUMNS, OUT_COLUMNS
|
||||
|
||||
|
||||
def make_survey_response_count_graph(
    survey_df: pd.DataFrame,
    title: str = 'Client Satisfaction Survey Responses Per Center FY 25',
    source_data_export_path: str = "",
    col_neo_center: str = NEOSERRA_COLUMNS.center
) -> go.Figure:
    """
    parameters:
        survey_df:pd.DataFrame - The raw survey data (one row per response)
        title:str - The title to place on the graph
        source_data_export_path:str - If a csv path is provided, the per center counts will be exported there
        col_neo_center:str - The column of the dataset containing the center name

    returns: go.Figure - The generated graph figure

    description:
        Generates a bar graph of the number of survey responses received per center, with the
        total number of responses written above the top left corner of the plot.
    """
    # One row per center with the number of responses it received
    responses_per_center = survey_df.groupby(col_neo_center).size().reset_index(name='Responses') # pyright: ignore

    if source_data_export_path:
        responses_per_center.to_csv(source_data_export_path, index=False)

    fig = px.bar(
        responses_per_center,
        x=col_neo_center,
        y='Responses',
        text='Responses',
        height=500
    )

    # Write the sum of all responses just above the plot area
    grand_total = responses_per_center['Responses'].sum()
    fig.add_annotation(
        xref='paper',
        yref='paper',
        x=0.0,
        y=1.03,
        showarrow=False,
        text=f"{grand_total} total responses"
    )

    fig.update_layout(
        xaxis_title='Center',
        yaxis_title='Survey Responses',
        title=title,
        height=700,
        width=1500,
    )
    fig.update_traces(showlegend=False, marker_color="#71bf44")

    return fig
|
||||
|
||||
|
||||
def make_average_survey_score_graph(
    survey_df: pd.DataFrame,
    title: str = 'Average Score FY 25 - How likely is it that you would recommend the SBDC to a friend or colleague? (1-10 scale)',
    source_data_export_path: str = "",
    col_neo_center: str = NEOSERRA_COLUMNS.center,
    col_score: str = NEOSERRA_COLUMNS.satisfaction_score
) -> go.Figure:
    """
    parameters:
        survey_df:pd.DataFrame - The raw survey data (one row per response). It is not modified
        title:str - The title to place on the graph
        source_data_export_path:str - If a csv path is provided, the per center averages will be exported there
        col_neo_center:str - The column of the dataset containing the center name
        col_score:str - The column containing the satisfaction score (1-10). Values may be numbers or text that
            starts with the score (only the leading one or two digits are used)

    returns: go.Figure - The generated graph figure

    raises:
        ValueError - If a non-blank answer does not start with a one or two digit score

    description:
        Generates a bar graph displaying the average satisfaction score (Question 1) for each center,
        along with a network-wide average line. Blank answers are left out of the averages.
    """
    # Clean up the answers: keep only the leading one or two digits of every answer.
    # Going through text handles plain numbers, float typed columns ("9.0") and labelled answers alike,
    # and blank answers simply stay blank instead of crashing the conversion.
    local_df = survey_df.copy()
    raw_scores = local_df[col_score]
    parsed_scores = pd.to_numeric(
        raw_scores.astype(str).str.extract(r"^\s*(\d{1,2})", expand=False),
        errors="coerce"
    ).astype(float)

    # An answer that is present but has no leading score is a data problem, so report it loudly
    unparseable = raw_scores.notna() & parsed_scores.isna()
    if unparseable.any():
        raise ValueError(
            f"Column '{col_score}' contains {int(unparseable.sum())} answer(s) that do not start with a score, "
            f"e.g. {raw_scores[unparseable].iloc[0]!r}"
        )
    local_df[col_score] = parsed_scores

    # Blank scores are skipped by mean()
    average_q1_score = local_df.groupby(col_neo_center)[col_score].mean().reset_index()
    network_wide_q1_score = local_df[col_score].mean()

    if source_data_export_path:
        average_q1_score.to_csv(source_data_export_path, index=False)

    fig = px.bar(average_q1_score, x=col_neo_center, y=col_score, text=col_score)

    fig.update_layout(
        xaxis_title='Center',
        yaxis_title='Average',
        title=title,
        height=700,
        width=1500,
    )

    # Add a network wide value. The label is pinned at y=9.5 rather than on the line itself
    fig.add_hline(
        y=network_wide_q1_score,
        line_dash="dash",
        line_color="#73e0c6",
        annotation_text=f"Network Total: {network_wide_q1_score:.1f}",
        annotation_position="top right",
        annotation_y=9.5
    )

    fig.update_traces(
        showlegend=False,
        marker_color="#197f60",
        texttemplate='%{text:.1f}'
    )

    return fig
|
||||
|
||||
|
||||
def make_responses_per_client_graph(
    survey_df: pd.DataFrame,
    client_list_df: pd.DataFrame,
    title: str = 'Survey Responses Per 100 Clients Served FY 25',
    source_data_export_path: str = "",
    col_neo_center: str = NEOSERRA_COLUMNS.center
) -> go.Figure:
    """
    parameters:
        survey_df:pd.DataFrame - The raw survey data (one row per response)
        client_list_df:pd.DataFrame - The dataset containing the list of clients served (one row per client)
        title:str - The title to place on the graph
        source_data_export_path:str - If a csv path is provided, the intermediate dataset will be exported there
        col_neo_center:str - The column, present in both datasets, containing the center name

    returns: go.Figure - The generated graph figure

    description:
        Generates a bar graph displaying the number of survey responses received per 100 clients served.
        Responses and clients are each counted per center and matched on the center name. Only centers
        that have at least one survey response appear on the graph.
    """
    # Count rows per center in each dataset
    response_counts = survey_df.groupby(col_neo_center).size().reset_index(name='Responses') #pyright: ignore
    client_counts = client_list_df.groupby(col_neo_center).size().reset_index(name='Client Count') #pyright:ignore

    # Left join: keep every center with responses, even if it is missing from the client list
    display_df = response_counts.merge(client_counts, on=col_neo_center, how='left')

    # Responses per client, scaled up to "per 100 clients"
    responses_per_client = display_df['Responses'] / display_df['Client Count']
    display_df['Per Client Served'] = responses_per_client * 100

    if source_data_export_path:
        display_df.to_csv(source_data_export_path, index=False)

    fig = px.bar(
        display_df,
        x=col_neo_center,
        y='Per Client Served',
        text='Per Client Served',
        height=500
    )

    fig.update_layout(
        xaxis_title='Center',
        yaxis_title='Survey Responses Per 100 Clients Served',
        title=title,
        height=700,
        width=1500,
    )
    fig.update_traces(showlegend=False, marker_color="#71bf44", texttemplate="%{text:.1f}")

    return fig
|
||||
|
||||
|
||||
def make_nps_graph(
    survey_df: pd.DataFrame,
    title: str = "Net Promoter Score (NPS) By Center FY 25",
    source_data_export_path: str = "",
    col_neo_center: str = NEOSERRA_COLUMNS.center,
    col_score: str = NEOSERRA_COLUMNS.satisfaction_score
) -> go.Figure:
    """
    parameters:
        survey_df:pd.DataFrame - The raw survey data (one row per response). It is not modified
        title:str - The title to place on the graph
        source_data_export_path:str - If a csv path is provided, the per center dataset will be exported there
        col_neo_center:str - The column of the dataset containing the center name
        col_score:str - The column containing the satisfaction score (1-10). Values may be numbers or text that
            starts with the score (only the leading one or two digits are used)

    returns: go.Figure - The generated graph figure

    raises:
        ValueError - If a non-blank answer does not start with a one or two digit score

    description:
        Generates a bar graph displaying the Net Promoter Score (NPS) for each center.
        Includes a line indicating the Network-wide NPS.

        NPS = (% of respondents scoring 9-10) - (% of respondents scoring 6 or less). The percentages are
        taken over every respondent who gave a score, which includes the passive scores (7-8). Blank
        answers are ignored. A group without any scored answers gets an NPS of 0.
    """
    def nps_breakdown(scores):
        # Returns (detractor count, promoter count, NPS) for one group of scores
        answered = scores.dropna()
        detractors = int((answered <= 6).sum())
        promoters = int((answered >= 9).sum())
        respondents = len(answered)
        if respondents > 0:
            # Passives (7-8) are in the denominator but in neither count
            nps = ((promoters - detractors) / respondents) * 100
        else:
            nps = 0
        return detractors, promoters, nps

    # Clean up the answers: keep only the leading one or two digits of every answer.
    # Going through text handles plain numbers, float typed columns ("9.0") and labelled answers alike,
    # and blank answers simply stay blank instead of crashing the conversion.
    local_df = survey_df.copy()
    raw_scores = local_df[col_score]
    parsed_scores = pd.to_numeric(
        raw_scores.astype(str).str.extract(r"^\s*(\d{1,2})", expand=False),
        errors="coerce"
    ).astype(float)

    # An answer that is present but has no leading score is a data problem, so report it loudly
    unparseable = raw_scores.notna() & parsed_scores.isna()
    if unparseable.any():
        raise ValueError(
            f"Column '{col_score}' contains {int(unparseable.sum())} answer(s) that do not start with a score, "
            f"e.g. {raw_scores[unparseable].iloc[0]!r}"
        )
    local_df[col_score] = parsed_scores

    # Calculating the network wide NPS
    _, _, network_nps = nps_breakdown(local_df[col_score])

    # Calculate per center. Rows are collected first and turned into a dataframe once,
    # instead of growing a dataframe one concat at a time
    nps_rows = []
    for name, group in local_df[[col_neo_center, col_score]].groupby(col_neo_center):
        detractors_count, promoters_count, nps = nps_breakdown(group[col_score])
        nps_rows.append({
            col_neo_center: name,
            "Detractors": detractors_count,
            "Promoters": promoters_count,
            "NPS": nps
        })

    # Naming the columns keeps them in place even when there are no centers at all
    nps_df = pd.DataFrame(nps_rows, columns=[col_neo_center, "Detractors", "Promoters", "NPS"])

    if source_data_export_path:
        nps_df.to_csv(source_data_export_path, index=False)

    fig = px.bar(nps_df, x=col_neo_center, y='NPS', text='NPS', title=title, height=600, width=1250)

    fig.update_traces(showlegend=False, marker_color="#73e0c6", texttemplate="%{text:.1f}")

    fig.add_hline(
        y=network_nps,
        line_dash="dash",
        line_color="#004649",
        annotation_text=f"Network NPS: {network_nps:.1f}",
        annotation_position="bottom right",
    )

    fig.add_annotation(xref='paper', yref='paper',
                       x=0.0, y=1.08,
                       showarrow=False,
                       text=f'NOTE: NPS is calculated as the difference between promoter responses (9 or 10) and the % of detractor responses (1-6).<br> Participents are responding to the question "How likely is it that you would recommend the SBDC to a friend or colleague? (1-10 scale)"',
                       align='left')

    return fig
|
||||
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user