# FILE: pasbdc_trainings_analysis_script.py
# CREATED: 12/26/25
# AUTHOR: Vincent Allen
# PURPOSE: Full pipeline script: Cleans raw data, generates statistics, and
#          creates analysis graphs per funding group.

# Standard library
import argparse
import json
import os.path
import sys
import traceback

# Third party libraries
import pandas as pd

# Custom modules
from section_1_datasets_module import (  # pyright:ignore
    generate_cleaned_trainings_dataset,
    generate_center_trainings_count_statistics
)
from section_1_graph_library_module import (  # pyright:ignore
    make_attendee_bins_statistics_charts,
    make_primary_training_topic_statistics_charts,
    make_center_attendee_statistics_charts,
    make_center_event_count_charts,
    make_center_attendee_range_charts,
    make_primary_training_topic_pie_charts,
    make_network_trainings_count_statistics_charts
)
from shared_tools_module import save_variant_charts, csv_url_to_dataframe
from constants_module import NEOSERRA_COLUMNS, OUT_COLUMNS

DEFAULT_FUNDING_GROUPS = ['Core Services', 'LEXNET', 'PDA', 'NAP']
REPORT_NAME = "trainingsreport"


def parse_args() -> argparse.Namespace:
    """Parse the command line arguments for the trainings analysis pipeline.

    Exactly one of ``--inputcsv`` / ``--exportmoduleurl`` must be supplied
    (they form a required mutually exclusive group), and ``--outpath`` is
    always required.

    Returns:
        The parsed arguments. ``funding`` is ``None`` when the user passed no
        ``--funding`` flag; the caller substitutes ``DEFAULT_FUNDING_GROUPS``.
    """
    parser = argparse.ArgumentParser(description="Clean Data and Generate Trainings Analysis Graphs")
    dataset_group = parser.add_mutually_exclusive_group(required=True)

    # --- INPUT/OUTPUT ---
    dataset_group.add_argument("--inputcsv", type=str,
                               help="The path to the RAW Neoserra trainings CSV export.")
    dataset_group.add_argument("--exportmoduleurl", type=str,
                               help="The Neoserra export module url for the Trainings data export")
    parser.add_argument("--outpath", type=str, required=True,
                        help="The base directory path to place generated files into.")
    parser.add_argument("--cleanedfilename", type=str, default="cleaned_trainings_data.csv",
                        help="The name to give the intermediate cleaned dataset.")
    parser.add_argument("--mapping", type=str, required=False,
                        help="Path to a JSON file to override default column names mappings.")
    parser.add_argument("--fiscalyear", type=str, default="FY25",
                        help="The label for the fiscal year to appear in graph titles.")

    # --- BASE FILENAMES (Prefixes) ---
    parser.add_argument("--name_stats", type=str, default="center-statistics",
                        help="Base filename for the detailed center statistics charts.")
    parser.add_argument("--name_bins", type=str, default="attendee-bins",
                        help="Base filename for attendee bins charts.")
    parser.add_argument("--name_topics", type=str, default="training-topics",
                        help="Base filename for primary topic charts.")
    parser.add_argument("--name_center_attendees", type=str, default="center-attendees",
                        help="Base filename for center attendee statistics charts.")
    parser.add_argument("--name_event_counts", type=str, default="event-counts",
                        help="Base filename for center event count charts.")
    parser.add_argument("--name_ranges", type=str, default="attendee-ranges",
                        help="Base filename for center attendee range stacked bars.")
    parser.add_argument("--name_pie", type=str, default="topics-pie",
                        help="Base filename for network wide topic pie charts.")

    # NOTE: The default is deliberately None and is filled in by the caller.
    # With action='append', argparse appends user supplied values to whatever
    # list is given as the default instead of replacing it, so the real default
    # list cannot be provided here.
    parser.add_argument("--funding", action='append', required=False, default=None,
                        help="Which funding groups should be included in the final analysis")

    return parser.parse_args()


def main() -> None:
    """Run the full pipeline: clean the data, build statistics, save charts.

    Every failure path prints a message and exits the process with status 1.
    """
    args = parse_args()
    if args.funding is None:
        args.funding = DEFAULT_FUNDING_GROUPS

    # Handle optional JSON mapping override
    if args.mapping:
        try:
            # Opening the file is only an early readability check; the column
            # objects are handed the path itself, not the open handle.
            with open(args.mapping, 'r'):
                NEOSERRA_COLUMNS.apply_json_mapping(args.mapping)
                OUT_COLUMNS.apply_json_mapping(args.mapping)
        except Exception as e:
            print(f'Failed to load user column configuration JSON file, got={e}')
            sys.exit(1)

    # Ensure output directory exists. exist_ok=True avoids the check-then-create
    # race and still raises (an OSError subclass) when the path exists but is
    # not a directory.
    try:
        os.makedirs(args.outpath, exist_ok=True)
    except OSError as e:
        print(f"Error creating output directory: {e}")
        sys.exit(1)

    # DATA CLEANING
    # Name the source actually in use. The export URL itself is not echoed
    # because it may embed access credentials (assumption - TODO confirm).
    source_label = args.inputcsv if args.inputcsv else "the export module URL"
    print(f"Loading and Cleaning data from {source_label}...\n")
    try:
        if args.inputcsv:
            trainings_df = pd.read_csv(args.inputcsv)
        elif args.exportmoduleurl:
            trainings_df = csv_url_to_dataframe(args.exportmoduleurl)
        else:
            raise RuntimeError("No dataset defined, an inputcsv or exportmoduleurl is required. This should not be possible unless you have changed the code.")

        # Filter for reportable records only.
        # This will fail with a KeyError if the column is missing, as required.
        trainings_df = trainings_df[trainings_df[NEOSERRA_COLUMNS.reportable] == 1]

        trainings_df = generate_cleaned_trainings_dataset(
            trainings_df,
            funding_sources=args.funding,
            col_neo_event_title=NEOSERRA_COLUMNS.event_title,
            col_neo_primary_topic=NEOSERRA_COLUMNS.primary_training_topic,
            col_neo_training_topics=NEOSERRA_COLUMNS.training_topics,
            col_neo_center=NEOSERRA_COLUMNS.center,
            col_is_preplanning=OUT_COLUMNS.is_preplanning,
            col_neo_attendees_total=NEOSERRA_COLUMNS.attendees_total,
            col_out_attendees_range=OUT_COLUMNS.attendees_range,
        )

        # Save cleaned data
        clean_out = os.path.join(args.outpath, args.cleanedfilename)
        trainings_df.to_csv(clean_out, index=False)
        print(f"Cleaned dataset saved to {clean_out}\n")
    except Exception as e:
        print(f"Failed to clean input CSV: {e}")
        sys.exit(1)

    # Values for "First Steps"
    first_steps_values = ['First Steps', 'Next Steps']

    # Generate Statistics DataFrame for this group
    # This is required for Graph Set 1 (Center Statistics)
    try:
        stats_df = generate_center_trainings_count_statistics(
            full_df=trainings_df,
            filtered_df=trainings_df[trainings_df[NEOSERRA_COLUMNS.attendees_total] == 0],
            funding_source_group=args.funding,
            col_primary_topic=NEOSERRA_COLUMNS.primary_training_topic,
            col_center=NEOSERRA_COLUMNS.center,
            col_funding_source=NEOSERRA_COLUMNS.funding_source,
            col_attendees_total=NEOSERRA_COLUMNS.attendees_total,
            col_is_preplanning=OUT_COLUMNS.is_preplanning
        )
    except Exception:
        print("Error generating statistics dataframe input dataset")
        # BaseException.with_traceback() requires a traceback argument, so the
        # previous no-argument call raised TypeError inside this handler and
        # hid the real error. format_exc() renders the active exception.
        print(traceback.format_exc())
        sys.exit(1)

    # Generate Graphs
    print("Starting graph generation...\n")

    # Each entry is (progress message, chart builder, base filename). Builders
    # are lambdas so that no chart is generated before its progress message is
    # printed and charts are produced and saved strictly in listed order.
    chart_jobs = [
        # 1. Center Statistics Charts (Uses stats_df)
        # Columns in stats_df are generated with fixed names by generate_center_statistics
        # so we rely on the defaults in the graph function or pass them if needed.
        # The cleaning library outputs standard names like 'Selected Events', 'Percent Selected Events'
        # which match the defaults of the graph library.
        (
            " Generating Center Statistics charts...",
            lambda: make_network_trainings_count_statistics_charts(
                funding_group_df=stats_df,
                filter_description_tag="With 0 Attendees",
                fiscal_year_tag=args.fiscalyear,
            ),
            args.name_stats,
        ),
        # 2. Attendee Bins Charts (Uses cleaned trainings_df)
        (
            " Generating Attendee Bins charts...",
            lambda: make_attendee_bins_statistics_charts(
                trainings_df,
                center="Network Wide",
                fiscal_year_tag=args.fiscalyear,
                first_steps_vals=first_steps_values,
                preplanning_val=OUT_COLUMNS.val_preplanning,
                col_neo_attendees_total=NEOSERRA_COLUMNS.attendees_total,
                col_attendees_range=OUT_COLUMNS.attendees_range,
                col_neo_primary_topic=NEOSERRA_COLUMNS.primary_training_topic
            ),
            args.name_bins,
        ),
        # 3. Primary Training Topic Charts
        (
            " Generating Primary Topic charts...",
            lambda: make_primary_training_topic_statistics_charts(
                trainings_df,
                center="Network Wide",
                fiscal_year_tag=args.fiscalyear,
                first_steps_vals=first_steps_values,
                col_neo_primary_topic=NEOSERRA_COLUMNS.primary_training_topic,
                col_neo_attendees_total=NEOSERRA_COLUMNS.attendees_total
            ),
            args.name_topics,
        ),
        # 4. Center Attendee Statistics
        (
            " Generating Center Attendee Stats charts...",
            lambda: make_center_attendee_statistics_charts(
                trainings_df,
                fiscal_year_tag=args.fiscalyear,
                col_neo_center=NEOSERRA_COLUMNS.center,
                col_neo_attendees_total=NEOSERRA_COLUMNS.attendees_total,
                col_neo_primary_topic=NEOSERRA_COLUMNS.primary_training_topic,
                first_steps_vals=first_steps_values,
                preplanning_val=OUT_COLUMNS.val_preplanning
            ),
            args.name_center_attendees,
        ),
        # 5. Center Event Counts
        (
            " Generating Center Event Count charts...",
            lambda: make_center_event_count_charts(
                trainings_df,
                fiscal_year_tag=args.fiscalyear,
                col_neo_center=NEOSERRA_COLUMNS.center,
                col_neo_attendees_total=NEOSERRA_COLUMNS.attendees_total,
                col_neo_primary_topic=NEOSERRA_COLUMNS.primary_training_topic,
                first_steps_vals=first_steps_values,
                preplanning_val=OUT_COLUMNS.val_preplanning
            ),
            args.name_event_counts,
        ),
        # 6. Center Attendee Range (Stacked Bars)
        (
            " Generating Center Attendee Range charts...",
            lambda: make_center_attendee_range_charts(
                trainings_df,
                fiscal_year_tag=args.fiscalyear,
                col_neo_center=NEOSERRA_COLUMNS.center,
                col_attendees_range=OUT_COLUMNS.attendees_range,
                col_neo_training_id=NEOSERRA_COLUMNS.training_id,
                col_neo_primary_topic=NEOSERRA_COLUMNS.primary_training_topic,
                first_steps_vals=first_steps_values,
                preplanning_val=OUT_COLUMNS.val_preplanning
            ),
            args.name_ranges,
        ),
        # 7. Network Wide Topic Pie Charts
        (
            " Generating Topic Pie charts...",
            lambda: make_primary_training_topic_pie_charts(
                trainings_df,
                center="Network Wide",
                fiscal_year_tag=args.fiscalyear,
                first_steps_vals=first_steps_values,
                col_neo_attendees_total=NEOSERRA_COLUMNS.attendees_total,
                col_neo_primary_topic=NEOSERRA_COLUMNS.primary_training_topic
            ),
            args.name_pie,
        ),
    ]

    for progress_message, build_charts, base_filename in chart_jobs:
        print(progress_message)
        save_variant_charts(
            chart_dict=build_charts(),
            base_path=args.outpath,
            report=REPORT_NAME,
            chart_type=base_filename
        )

    print("\nDONE! All charts generated for all funding groups.")


if __name__ == "__main__":
    main()