first commit
This commit is contained in:
305
section_1_graph_export_module/trainings_analysis_script.py
Normal file
305
section_1_graph_export_module/trainings_analysis_script.py
Normal file
@@ -0,0 +1,305 @@
|
||||
# FILE: trainings_analysis_script.py
|
||||
# CREATED: 12/26/25
|
||||
# AUTHOR: Vincent Allen
|
||||
# PURPOSE: Full pipeline script: Cleans raw data, generates statistics, and creates analysis graphs per funding group.
|
||||
|
||||
# Standard library and third-party imports
|
||||
import pandas as pd
|
||||
import sys
|
||||
import os.path
|
||||
import argparse
|
||||
import json
|
||||
|
||||
# Custom modules
|
||||
from section_1_datasets_module import ( #pyright:ignore
|
||||
generate_cleaned_trainings_dataset,
|
||||
generate_center_trainings_count_statistics
|
||||
)
|
||||
|
||||
|
||||
from section_1_graph_library_module import ( # pyright:ignore
|
||||
make_attendee_bins_statistics_charts,
|
||||
make_primary_training_topic_statistics_charts,
|
||||
make_center_attendee_statistics_charts,
|
||||
make_center_event_count_charts,
|
||||
make_center_attendee_range_charts,
|
||||
make_primary_training_topic_pie_charts,
|
||||
make_network_trainings_count_statistics_charts
|
||||
)
|
||||
|
||||
from shared_tools_module import save_variant_charts, csv_url_to_dataframe
|
||||
from constants_module import NEOSERRA_COLUMNS, OUT_COLUMNS
|
||||
|
||||
# Funding groups used for the analysis when no --funding flag is supplied.
DEFAULT_FUNDING_GROUPS = [
    "Core Services",
    "LEXNET",
    "PDA",
    "NAP",
]
|
||||
|
||||
# Report identifier handed to save_variant_charts as its `report` argument.
REPORT_NAME = 'trainingsreport'
|
||||
|
||||
def parse_args(argv=None):
    """Build the command-line interface and parse the supplied arguments.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse reads ``sys.argv[1:]``, which is what
            the script entry point relies on.

    Returns:
        argparse.Namespace: Parsed options. Exactly one of ``inputcsv`` and
        ``exportmoduleurl`` is set (the other is ``None``); ``funding`` is
        ``None`` unless at least one ``--funding`` flag was given.

    Raises:
        SystemExit: Raised by argparse when required options are missing or
            both dataset options are supplied at once.
    """
    parser = argparse.ArgumentParser(description="Clean Data and Generate Trainings Analysis Graphs")

    # --- INPUT/OUTPUT ---
    # The dataset comes from exactly one place: a local CSV or an export URL.
    dataset_group = parser.add_mutually_exclusive_group(required=True)
    dataset_group.add_argument("--inputcsv",
                               type=str,
                               help="The path to the RAW Neoserra trainings CSV export.")
    dataset_group.add_argument("--exportmoduleurl",
                               type=str,
                               help="The Neoserra export module url for the Trainings data export")

    parser.add_argument("--outpath",
                        type=str,
                        required=True,
                        help="The base directory path to place generated files into.")

    parser.add_argument("--cleanedfilename",
                        type=str,
                        default="cleaned_trainings_data.csv",
                        help="The name to give the intermediate cleaned dataset.")

    parser.add_argument("--mapping",
                        type=str,
                        required=False,
                        help="Path to a JSON file to override default column names mappings.")

    parser.add_argument("--fiscalyear",
                        type=str,
                        default="FY25",
                        help="The label for the fiscal year to appear in graph titles.")

    # --- BASE FILENAMES (Prefixes) ---
    parser.add_argument("--name_stats", type=str, default="center-statistics",
                        help="Base filename for the detailed center statistics charts.")

    parser.add_argument("--name_bins", type=str, default="attendee-bins",
                        help="Base filename for attendee bins charts.")

    parser.add_argument("--name_topics", type=str, default="training-topics",
                        help="Base filename for primary topic charts.")

    parser.add_argument("--name_center_attendees", type=str, default="center-attendees",
                        help="Base filename for center attendee statistics charts.")

    parser.add_argument("--name_event_counts", type=str, default="event-counts",
                        help="Base filename for center event count charts.")

    parser.add_argument("--name_ranges", type=str, default="attendee-ranges",
                        help="Base filename for center attendee range stacked bars.")

    parser.add_argument("--name_pie", type=str, default="topics-pie",
                        help="Base filename for network wide topic pie charts.")

    # NOTE: The default is deliberately None. With action='append', argparse
    # appends user-supplied values to whatever list is given as the default
    # instead of replacing it, so the caller must substitute the real default
    # list when this attribute comes back as None.
    parser.add_argument("--funding",
                        action='append',
                        required=False,
                        default=None,
                        help="Which funding groups should be included in the final analysis")

    return parser.parse_args(argv)
|
||||
|
||||
# Script entry point: load the raw trainings export, clean it, build the
# per-center statistics table, then render and save every chart family.
# Any failure in the setup/cleaning/statistics stages prints a message and
# exits with status 1.
if __name__ == "__main__":
    args = parse_args()

    # The --funding option defaults to None (see parse_args), so the real
    # default list is substituted here when the user passed no --funding flag.
    if args.funding is None:
        args.funding = DEFAULT_FUNDING_GROUPS

    # Handle optional JSON mapping override
    if args.mapping:
        try:
            # Opening the file first makes a missing or unreadable path fail
            # here, before either column set is modified. The mapping helpers
            # are handed the path itself, not the open handle.
            # NOTE(review): presumably apply_json_mapping reads the file on
            # its own - confirm against constants_module.
            with open(args.mapping, 'r'):
                NEOSERRA_COLUMNS.apply_json_mapping(args.mapping)
                OUT_COLUMNS.apply_json_mapping(args.mapping)
        except Exception as e:
            print(f'Failed to load user column configuration JSON file, got={e}')
            sys.exit(1)

    # Ensure output directory exists. exist_ok=True avoids the race between
    # checking for the directory and creating it, and still raises OSError if
    # the path exists but is not a directory.
    try:
        os.makedirs(args.outpath, exist_ok=True)
    except OSError as e:
        print(f"Error creating output directory: {e}")
        sys.exit(1)

    # DATA CLEANING
    # Only one of inputcsv / exportmoduleurl is set, so report whichever is in
    # use. The URL itself is not echoed in case it embeds access credentials.
    source_label = args.inputcsv if args.inputcsv else "the Neoserra export module URL"
    print(f"Loading and Cleaning data from {source_label}...\n")
    try:
        if args.inputcsv:
            trainings_df = pd.read_csv(args.inputcsv)
        elif args.exportmoduleurl:
            trainings_df = csv_url_to_dataframe(args.exportmoduleurl)
        else:
            raise RuntimeError("No dataset defined, an inputcsv or exportmoduleurl is required. This should not be possible unless you have changed the code.")

        # Filter for reportable records only.
        # This will fail with a KeyError if the column is missing, as required.
        trainings_df = trainings_df[trainings_df[NEOSERRA_COLUMNS.reportable] == 1]

        trainings_df = generate_cleaned_trainings_dataset(
            trainings_df,
            funding_sources=args.funding,
            col_neo_event_title=NEOSERRA_COLUMNS.event_title,
            col_neo_primary_topic=NEOSERRA_COLUMNS.primary_training_topic,
            col_neo_training_topics=NEOSERRA_COLUMNS.training_topics,
            col_neo_center=NEOSERRA_COLUMNS.center,
            col_is_preplanning=OUT_COLUMNS.is_preplanning,
            col_neo_attendees_total=NEOSERRA_COLUMNS.attendees_total,
            col_out_attendees_range=OUT_COLUMNS.attendees_range,
        )

        # Save cleaned data
        clean_out = os.path.join(args.outpath, args.cleanedfilename)
        trainings_df.to_csv(clean_out, index=False)
        print(f"Cleaned dataset saved to {clean_out}\n")

    except Exception as e:
        print(f"Failed to clean input CSV: {e}")
        sys.exit(1)

    # Values for "First Steps"
    first_steps_values = ['First Steps', 'Next Steps']

    # Generate Statistics DataFrame for this group ---
    # This is required for Graph Set 1 (Center Statistics)
    try:
        stats_df = generate_center_trainings_count_statistics(
            full_df=trainings_df,
            # The "selected" subset is the events that recorded zero attendees.
            filtered_df=trainings_df[trainings_df[NEOSERRA_COLUMNS.attendees_total] == 0],
            funding_source_group=args.funding,
            col_primary_topic=NEOSERRA_COLUMNS.primary_training_topic,
            col_center=NEOSERRA_COLUMNS.center,
            col_funding_source=NEOSERRA_COLUMNS.funding_source,
            col_attendees_total=NEOSERRA_COLUMNS.attendees_total,
            col_is_preplanning=OUT_COLUMNS.is_preplanning
        )
    except Exception:
        # Imported locally so this block does not depend on a new file-level import.
        import traceback

        print("Error generating statistics dataframe input dataset")
        # BaseException.with_traceback() requires a traceback argument and
        # returns the exception, so it cannot be used to display the stack;
        # format_exc() renders the active exception with its traceback.
        print(traceback.format_exc())
        sys.exit(1)

    # Generate Graphs
    print("Starting graph generation...\n")

    # 1. Center Statistics Charts (Uses stats_df)
    print("  Generating Center Statistics charts...")
    stats_figs = make_network_trainings_count_statistics_charts(
        funding_group_df=stats_df,
        filter_description_tag="With 0 Attendees",
        fiscal_year_tag=args.fiscalyear,
        # Columns in stats_df are generated with fixed names by
        # generate_center_trainings_count_statistics, so we rely on the
        # defaults in the graph function or pass them if needed.
        # The cleaning library outputs standard names like 'Selected Events',
        # 'Percent Selected Events' which match the defaults of the graph library.
    )
    save_variant_charts(
        chart_dict=stats_figs,
        base_path=args.outpath,
        report=REPORT_NAME,
        chart_type=args.name_stats
    )

    # 2. Attendee Bins Charts (Uses cleaned trainings_df)
    print("  Generating Attendee Bins charts...")
    bins_figs = make_attendee_bins_statistics_charts(
        trainings_df,
        center="Network Wide",
        fiscal_year_tag=args.fiscalyear,
        first_steps_vals=first_steps_values,
        preplanning_val=OUT_COLUMNS.val_preplanning,
        col_neo_attendees_total=NEOSERRA_COLUMNS.attendees_total,
        col_attendees_range=OUT_COLUMNS.attendees_range,
        col_neo_primary_topic=NEOSERRA_COLUMNS.primary_training_topic
    )
    save_variant_charts(
        chart_dict=bins_figs,
        base_path=args.outpath,
        report=REPORT_NAME,
        chart_type=args.name_bins
    )

    # 3. Primary Training Topic Charts
    print("  Generating Primary Topic charts...")
    topic_figs = make_primary_training_topic_statistics_charts(
        trainings_df,
        center="Network Wide",
        fiscal_year_tag=args.fiscalyear,
        first_steps_vals=first_steps_values,
        col_neo_primary_topic=NEOSERRA_COLUMNS.primary_training_topic,
        col_neo_attendees_total=NEOSERRA_COLUMNS.attendees_total
    )
    save_variant_charts(
        chart_dict=topic_figs,
        base_path=args.outpath,
        report=REPORT_NAME,
        chart_type=args.name_topics
    )

    # 4. Center Attendee Statistics
    print("  Generating Center Attendee Stats charts...")
    center_att_figs = make_center_attendee_statistics_charts(
        trainings_df,
        fiscal_year_tag=args.fiscalyear,
        col_neo_center=NEOSERRA_COLUMNS.center,
        col_neo_attendees_total=NEOSERRA_COLUMNS.attendees_total,
        col_neo_primary_topic=NEOSERRA_COLUMNS.primary_training_topic,
        first_steps_vals=first_steps_values,
        preplanning_val=OUT_COLUMNS.val_preplanning
    )
    save_variant_charts(
        chart_dict=center_att_figs,
        base_path=args.outpath,
        report=REPORT_NAME,
        chart_type=args.name_center_attendees
    )

    # 5. Center Event Counts
    print("  Generating Center Event Count charts...")
    event_count_figs = make_center_event_count_charts(
        trainings_df,
        fiscal_year_tag=args.fiscalyear,
        col_neo_center=NEOSERRA_COLUMNS.center,
        col_neo_attendees_total=NEOSERRA_COLUMNS.attendees_total,
        col_neo_primary_topic=NEOSERRA_COLUMNS.primary_training_topic,
        first_steps_vals=first_steps_values,
        preplanning_val=OUT_COLUMNS.val_preplanning
    )
    save_variant_charts(
        chart_dict=event_count_figs,
        base_path=args.outpath,
        report=REPORT_NAME,
        chart_type=args.name_event_counts
    )

    # 6. Center Attendee Range (Stacked Bars)
    print("  Generating Center Attendee Range charts...")
    range_figs = make_center_attendee_range_charts(
        trainings_df,
        fiscal_year_tag=args.fiscalyear,
        col_neo_center=NEOSERRA_COLUMNS.center,
        col_attendees_range=OUT_COLUMNS.attendees_range,
        col_neo_training_id=NEOSERRA_COLUMNS.training_id,
        col_neo_primary_topic=NEOSERRA_COLUMNS.primary_training_topic,
        first_steps_vals=first_steps_values,
        preplanning_val=OUT_COLUMNS.val_preplanning
    )
    save_variant_charts(
        chart_dict=range_figs,
        base_path=args.outpath,
        report=REPORT_NAME,
        chart_type=args.name_ranges
    )

    # 7. Network Wide Topic Pie Charts
    print("  Generating Topic Pie charts...")
    pie_figs = make_primary_training_topic_pie_charts(
        trainings_df,
        center="Network Wide",
        fiscal_year_tag=args.fiscalyear,
        first_steps_vals=first_steps_values,
        col_neo_attendees_total=NEOSERRA_COLUMNS.attendees_total,
        col_neo_primary_topic=NEOSERRA_COLUMNS.primary_training_topic
    )
    save_variant_charts(
        chart_dict=pie_figs,
        base_path=args.outpath,
        report=REPORT_NAME,
        chart_type=args.name_pie
    )

    print("\nDONE! All charts generated for all funding groups.")
|
||||
Reference in New Issue
Block a user