# FILE: pasbdc_trainings_analysis_script.py
# CREATED: 12/26/25
# AUTHOR: Vincent Allen
# PURPOSE: Full pipeline script: Cleans raw data, generates statistics, and creates analysis
#          graphs per funding group for the center specific section 3

# Standard library
import sys
import os.path
import argparse
import json

# Third party libraries
import pandas as pd

# Custom modules
# 1. The Cleaning Library
from section_1_datasets_module import (  # pyright:ignore
    generate_cleaned_trainings_dataset,
)

# 2. The Graphing Library
from section_1_graph_library_module import (  # pyright:ignore
    make_attendee_bins_statistics_charts,
    make_primary_training_topic_statistics_charts,
    make_primary_training_topic_pie_charts,
)
from shared_tools_module import save_variant_charts, csv_url_to_dataframe
from constants_module import NEOSERRA_COLUMNS, OUT_COLUMNS

# Funding sources analysed when the user passes no --funding argument.
DEFAULT_FUNDING_GROUPS = ['Core Services', 'LEXNET', 'PDA', 'NAP']


def parse_args() -> argparse.Namespace:
    """Parse the command line arguments for the trainings analysis pipeline.

    Exactly one of ``--inputcsv`` / ``--exportmoduleurl`` must be supplied, and
    ``--outpath`` is required. ``--funding`` may be repeated; it is left as
    ``None`` when omitted so the caller can substitute the default list.

    Returns:
        The populated ``argparse.Namespace``.
    """
    parser = argparse.ArgumentParser(
        description="Clean Data and Generate Trainings Analysis Graphs"
    )
    inputdata_group = parser.add_mutually_exclusive_group(required=True)

    # --- INPUT/OUTPUT ---
    inputdata_group.add_argument(
        "--inputcsv",
        type=str,
        help="The path to the RAW Neoserra trainings CSV export.",
    )
    inputdata_group.add_argument(
        "--exportmoduleurl",
        type=str,
        help="The URL to the export module for this script",
    )
    parser.add_argument(
        "--outpath",
        type=str,
        required=True,
        help="The base directory path to place generated files into.",
    )
    parser.add_argument(
        "--cleanedfilename",
        type=str,
        default="cleaned_trainings_data.csv",
        help="The name to give the intermediate cleaned dataset.",
    )
    parser.add_argument(
        "--mapping",
        type=str,
        required=False,
        help="Path to a JSON file to override default column names mappings.",
    )
    parser.add_argument(
        "--fiscalyear",
        type=str,
        default="FY25",
        help="The label for the fiscal year to appear in graph titles.",
    )
    parser.add_argument(
        "--reportname",
        type=str,
        default="trainingsreport",
        help="The report name used in the output file names to allow identification "
             "via ImageRegistry. The center name will be appended to the end of this "
             "to keep the number of images grabbed by the ImageRegistry smaller.",
    )

    # Base Filenames (Prefixes)
    parser.add_argument(
        "--name_bins",
        type=str,
        default="attendee-bins",
        help="Base filename for attendee bins charts.",
    )
    parser.add_argument(
        "--name_topics",
        type=str,
        default="training-topics",
        help="Base filename for primary topic charts.",
    )
    parser.add_argument(
        "--name_pie",
        type=str,
        default="topics-pie",
        help="Base filename for network wide topic pie charts.",
    )

    # NOTE: The default is deliberately None and is filled in by the caller.
    # With action='append', argparse appends user supplied values onto whatever
    # list is given as the default instead of replacing it, so a list default
    # could never be overridden from the command line.
    parser.add_argument(
        "--funding",
        action='append',
        required=False,
        default=None,
        help="A funding source to include in the analysis. Repeat the argument once "
             "per funding source. When omitted, the following funding sources are "
             "used: " + ", ".join(DEFAULT_FUNDING_GROUPS) + ".",
    )
    return parser.parse_args()


def _apply_column_mapping(mapping_path: str) -> None:
    """Apply the user supplied JSON column mapping, exiting with status 1 on failure."""
    try:
        # NOTE(review): the opened handle is never read; the path itself is what
        # gets handed to apply_json_mapping. The open is kept so an unreadable
        # file is still reported here - confirm whether it can be dropped.
        with open(mapping_path, 'r'):
            NEOSERRA_COLUMNS.apply_json_mapping(mapping_path)
            OUT_COLUMNS.apply_json_mapping(mapping_path)
    except Exception as e:
        print(f'Failed to load user column configuration JSON file, got={e}')
        sys.exit(1)


def _ensure_output_directory(outpath: str) -> None:
    """Create the output directory if needed, exiting with status 1 on failure."""
    try:
        # exist_ok avoids the check-then-create race and also reports a clear
        # error when the path exists but is not a directory.
        os.makedirs(outpath, exist_ok=True)
    except OSError as e:
        print(f"Error creating output directory: {e}")
        sys.exit(1)


def _load_and_clean_trainings(args: argparse.Namespace) -> pd.DataFrame:
    """Load the raw trainings data, clean it, and save the cleaned CSV.

    Reads from ``args.inputcsv`` when set, otherwise from ``args.exportmoduleurl``.
    Only rows whose reportable column equals 1 are passed on to cleaning. The
    cleaned dataset is written to ``args.outpath`` / ``args.cleanedfilename``.
    Any failure is printed and terminates the script with status 1.

    Returns:
        The cleaned trainings DataFrame.
    """
    # Report whichever source was actually supplied (CSV path or export URL).
    data_source = args.inputcsv or args.exportmoduleurl
    print(f"Loading and Cleaning data from {data_source}...\n")

    try:
        if args.inputcsv:
            trainings_df = pd.read_csv(args.inputcsv)
        elif args.exportmoduleurl:
            trainings_df = csv_url_to_dataframe(args.exportmoduleurl)
        else:
            raise RuntimeError("A data source for this script has not been defined")

        # Filter for reportable records only.
        # This will fail with a KeyError if the column is missing, as required.
        trainings_df = trainings_df[trainings_df[NEOSERRA_COLUMNS.reportable] == 1]

        trainings_df = generate_cleaned_trainings_dataset(
            trainings_df,
            col_neo_event_title=NEOSERRA_COLUMNS.event_title,
            col_neo_primary_topic=NEOSERRA_COLUMNS.primary_training_topic,
            col_neo_training_topics=NEOSERRA_COLUMNS.training_topics,
            col_neo_center=NEOSERRA_COLUMNS.center,
            col_is_preplanning=OUT_COLUMNS.is_preplanning,
            col_neo_attendees_total=NEOSERRA_COLUMNS.attendees_total,
            col_out_attendees_range=OUT_COLUMNS.attendees_range,
        )

        # Save cleaned data
        clean_out = os.path.join(args.outpath, args.cleanedfilename)
        trainings_df.to_csv(str(clean_out), index=False)
        print(f"Cleaned dataset saved to {clean_out}\n")
    except Exception as e:
        print(f"Failed to clean input CSV: {e}")
        sys.exit(1)

    return trainings_df


def _generate_center_charts(center_trainings_df, center, args, first_steps_values) -> None:
    """Generate and save the attendee bins, primary topic and topic pie charts for one center."""
    report_name = f'{args.reportname}{center}'

    # Attendee Bins Charts (Uses cleaned trainings_df)
    print(" Generating Attendee Bins charts...")
    bins_figs = make_attendee_bins_statistics_charts(
        center_trainings_df,
        center=center,
        fiscal_year_tag=args.fiscalyear,
        first_steps_vals=first_steps_values,
        preplanning_val=OUT_COLUMNS.val_preplanning,
        col_neo_attendees_total=NEOSERRA_COLUMNS.attendees_total,
        col_attendees_range=OUT_COLUMNS.attendees_range,
        col_neo_primary_topic=NEOSERRA_COLUMNS.primary_training_topic,
    )
    save_variant_charts(
        chart_dict=bins_figs,
        base_path=args.outpath,
        report=report_name,
        chart_type=args.name_bins,
    )

    # Primary Topic Charts
    print(" Generating Primary Topic charts...")
    topic_figs = make_primary_training_topic_statistics_charts(
        center_trainings_df,
        center=center,
        fiscal_year_tag=args.fiscalyear,
        first_steps_vals=first_steps_values,
        col_neo_primary_topic=NEOSERRA_COLUMNS.primary_training_topic,
        col_neo_attendees_total=NEOSERRA_COLUMNS.attendees_total,
    )
    save_variant_charts(
        chart_dict=topic_figs,
        base_path=args.outpath,
        report=report_name,
        chart_type=args.name_topics,
    )

    # Topic Pie Charts
    print(" Generating Topic Pie charts...")
    pie_figs = make_primary_training_topic_pie_charts(
        center_trainings_df,
        center=center,
        fiscal_year_tag=args.fiscalyear,
        first_steps_vals=first_steps_values,
        col_neo_attendees_total=NEOSERRA_COLUMNS.attendees_total,
        col_neo_primary_topic=NEOSERRA_COLUMNS.primary_training_topic,
    )
    save_variant_charts(
        chart_dict=pie_figs,
        base_path=args.outpath,
        report=report_name,
        chart_type=args.name_pie,
    )


def main() -> None:
    """Run the full pipeline: clean the data, then chart each center found in it."""
    args = parse_args()
    if args.funding is None:
        # Copy so later changes to args.funding cannot alter the module default.
        args.funding = list(DEFAULT_FUNDING_GROUPS)

    # Handle optional JSON mapping override
    if args.mapping:
        _apply_column_mapping(args.mapping)

    # Ensure output directory exists
    _ensure_output_directory(args.outpath)

    # DATA CLEANING
    trainings_df = _load_and_clean_trainings(args)

    # Filter the dataset for only the funding sources we care about.
    # This happens after the cleaned CSV is saved, so that file is not
    # restricted by funding source.
    trainings_df = trainings_df[
        trainings_df[NEOSERRA_COLUMNS.funding_source].isin(args.funding)
    ]

    # Values for "First Steps"
    first_steps_values = ['First Steps', 'Next Steps']

    # Find all centers that we have in this data set. Missing (NaN) centers are
    # skipped: an equality filter against NaN matches no rows, so they would
    # only ever produce charts from an empty frame.
    unique_centers = trainings_df[NEOSERRA_COLUMNS.center].dropna().unique().tolist()

    for center in unique_centers:
        center_trainings_df = trainings_df[trainings_df[NEOSERRA_COLUMNS.center] == center]
        print(f"Starting generation for center {center}...\n")
        _generate_center_charts(center_trainings_df, center, args, first_steps_values)

    print("\nDONE!")


if __name__ == "__main__":
    main()