first commit

2026-05-21 08:40:24 -04:00
commit b084545275
711 changed files with 3659856 additions and 0 deletions
--- a/section_3_graph_export_module/trainings_analysis_script.py
+++ b/section_3_graph_export_module/trainings_analysis_script.py
@@ -0,0 +1,211 @@
+# FILE: trainings_analysis_script.py
+# CREATED: 12/26/25
+# AUTHOR: Vincent Allen
+# PURPOSE: Full pipeline script: Cleans raw data, generates statistics, and creates analysis graphs per funding group for the center specific section 3
+
+# Standard library and third party libraries
+import pandas as pd
+import sys
+import os.path
+import argparse
+import json
+
+# Custom modules
+# 1. The Cleaning Library
+from section_1_datasets_module import ( #pyright:ignore
+    generate_cleaned_trainings_dataset,
+)
+
+# 2. The Graphing Library
+from section_1_graph_library_module import ( # pyright:ignore
+    make_attendee_bins_statistics_charts,
+    make_primary_training_topic_statistics_charts,
+    make_primary_training_topic_pie_charts,
+)
+
+from shared_tools_module import save_variant_charts, csv_url_to_dataframe
+from constants_module import NEOSERRA_COLUMNS, OUT_COLUMNS
+
+
+# Funding sources kept in the analysis when no --funding flag is given on the command line.
+DEFAULT_FUNDING_GROUPS = ['Core Services', 'LEXNET', 'PDA', 'NAP']
+
+def parse_args():
+    """Build the command line interface for this script and parse sys.argv.
+
+    Exactly one data source (--inputcsv or --exportmoduleurl) and --outpath
+    are required; every other option has a default.
+
+    Returns:
+        argparse.Namespace with the attributes inputcsv, exportmoduleurl,
+        outpath, cleanedfilename, mapping, fiscalyear, reportname, name_bins,
+        name_topics, name_pie and funding. ``funding`` is None when the flag
+        was never given, otherwise a list with one entry per occurrence.
+    """
+    parser = argparse.ArgumentParser(description="Clean Data and Generate Trainings Analysis Graphs")
+
+    # The raw data comes from either a local CSV or an export URL, never both.
+    inputdata_group = parser.add_mutually_exclusive_group(required=True)
+
+    # --- INPUT/OUTPUT ---
+    inputdata_group.add_argument("--inputcsv",
+                        type=str,
+                        help="The path to the RAW Neoserra trainings CSV export.")
+    inputdata_group.add_argument("--exportmoduleurl", type=str, help="The URL to the export module for this script")
+
+    parser.add_argument("--outpath",
+                        type=str,
+                        required=True,
+                        help="The base directory path to place generated files into.")
+
+    parser.add_argument("--cleanedfilename",
+                        type=str,
+                        default="cleaned_trainings_data.csv",
+                        help="The name to give the intermediate cleaned dataset.")
+
+    parser.add_argument("--mapping",
+                        type=str,
+                        required=False,
+                        help="Path to a JSON file to override default column names mappings.")
+
+    parser.add_argument("--fiscalyear",
+                        type=str,
+                        default="FY25",
+                        help="The label for the fiscal year to appear in graph titles.")
+
+    parser.add_argument("--reportname",
+                        type=str,
+                        default="trainingsreport",
+                        help="The report name used in the output file names to allow identification via ImageRegistry. The center name will be appended to the end of this to keep the number of images grabbed by the ImageRegistry smaller.")
+
+    # Base Filenames (Prefixes)
+    parser.add_argument("--name_bins", type=str, default="attendee-bins",
+                        help="Base filename for attendee bins charts.")
+
+    parser.add_argument("--name_topics", type=str, default="training-topics",
+                        help="Base filename for primary topic charts.")
+
+    parser.add_argument("--name_pie", type=str, default="topics-pie",
+                        help="Base filename for network wide topic pie charts.")
+
+    # NOTE: The default stays None on purpose. With action='append', argparse
+    # does not replace a list default with the user's values; it appends the
+    # user's values to it. The caller must therefore substitute the real
+    # default list itself when this attribute comes back as None.
+    parser.add_argument("--funding",
+                        action='append',
+                        required=False,
+                        default=None,
+                        help="Funding source to include in the analysis. Repeat the flag to include several; when omitted, the script's default funding groups are used.")
+
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    # Script entry point: parse the CLI, clean the raw export, save the
+    # cleaned dataset, then build and save three chart sets for every center
+    # present in the funding-filtered data.
+    args = parse_args()
+
+    # --funding uses action='append' with a None default, so the real default
+    # list has to be substituted here.
+    if args.funding is None:
+        args.funding = DEFAULT_FUNDING_GROUPS
+
+    # Handle optional JSON mapping override
+    if args.mapping:
+        try:
+            # Opening the file first makes a missing/unreadable path fail
+            # here; the handle itself is unused because apply_json_mapping
+            # is handed the path.
+            with open(args.mapping, 'r'):
+                NEOSERRA_COLUMNS.apply_json_mapping(args.mapping)
+                OUT_COLUMNS.apply_json_mapping(args.mapping)
+        except Exception as e:
+            print(f'Failed to load user column configuration JSON file, got={e}')
+            sys.exit(1)
+
+    # Ensure output directory exists. exist_ok avoids the check-then-create
+    # race and reports a clear error if the path exists but is not a directory.
+    try:
+        os.makedirs(args.outpath, exist_ok=True)
+    except OSError as e:
+        print(f"Error creating output directory: {e}")
+        sys.exit(1)
+
+    # DATA CLEANING
+    # Report whichever source was actually supplied (CSV path or export URL).
+    data_source = args.inputcsv or args.exportmoduleurl
+    print(f"Loading and Cleaning data from {data_source}...\n")
+    try:
+        if args.inputcsv:
+            trainings_df = pd.read_csv(args.inputcsv)
+        elif args.exportmoduleurl:
+            trainings_df = csv_url_to_dataframe(args.exportmoduleurl)
+        else:
+            # Only reachable when the chosen source argument is an empty string.
+            raise RuntimeError("A data source for this script has not been defined")
+
+        # Filter for reportable records only.
+        # A missing column raises KeyError, which the handler below turns
+        # into an error message and a non-zero exit.
+        trainings_df = trainings_df[trainings_df[NEOSERRA_COLUMNS.reportable] == 1]
+
+        trainings_df = generate_cleaned_trainings_dataset(
+            trainings_df,
+            col_neo_event_title=NEOSERRA_COLUMNS.event_title,
+            col_neo_primary_topic=NEOSERRA_COLUMNS.primary_training_topic,
+            col_neo_training_topics=NEOSERRA_COLUMNS.training_topics,
+            col_neo_center=NEOSERRA_COLUMNS.center,
+            col_is_preplanning=OUT_COLUMNS.is_preplanning,
+            col_neo_attendees_total=NEOSERRA_COLUMNS.attendees_total,
+            col_out_attendees_range=OUT_COLUMNS.attendees_range,
+        )
+
+        # Save cleaned data (written before the funding filter is applied)
+        clean_out = os.path.join(args.outpath, args.cleanedfilename)
+        trainings_df.to_csv(str(clean_out), index=False)
+        print(f"Cleaned dataset saved to {clean_out}\n")
+
+    except Exception as e:
+        print(f"Failed to clean input CSV: {e}")
+        sys.exit(1)
+
+    # Filter the dataset for only the funding sources we care about
+    trainings_df = trainings_df[trainings_df[NEOSERRA_COLUMNS.funding_source].isin(args.funding)]
+
+    # Values for "First Steps"
+    first_steps_values = ['First Steps', 'Next Steps']
+
+    # Find all centers that we have in this data set
+    unique_centers = trainings_df[NEOSERRA_COLUMNS.center].unique().tolist()
+
+    for center in unique_centers:
+        center_trainings_df = trainings_df[(trainings_df[NEOSERRA_COLUMNS.center] == center)]
+        print(f"Starting generation for center {center}...\n")
+
+        # Every chart set of a center is saved under the same report tag.
+        report_tag = f'{args.reportname}{center}'
+
+        # Attendee Bins Charts
+        print("   Generating Attendee Bins charts...")
+        bins_figs = make_attendee_bins_statistics_charts(
+            center_trainings_df,
+            center=center,
+            fiscal_year_tag=args.fiscalyear,
+            first_steps_vals=first_steps_values,
+            preplanning_val=OUT_COLUMNS.val_preplanning,
+            col_neo_attendees_total=NEOSERRA_COLUMNS.attendees_total,
+            col_attendees_range=OUT_COLUMNS.attendees_range,
+            col_neo_primary_topic=NEOSERRA_COLUMNS.primary_training_topic
+        )
+        save_variant_charts(
+            chart_dict=bins_figs,
+            base_path=args.outpath,
+            report=report_tag,
+            chart_type=args.name_bins)
+
+        # Primary Topic Charts
+        print("   Generating Primary Topic charts...")
+        topic_figs = make_primary_training_topic_statistics_charts(
+            center_trainings_df,
+            center=center,
+            fiscal_year_tag=args.fiscalyear,
+            first_steps_vals=first_steps_values,
+            col_neo_primary_topic=NEOSERRA_COLUMNS.primary_training_topic,
+            col_neo_attendees_total=NEOSERRA_COLUMNS.attendees_total
+        )
+        save_variant_charts(
+            chart_dict=topic_figs,
+            base_path=args.outpath,
+            report=report_tag,
+            chart_type=args.name_topics)
+
+        # Topic Pie Charts
+        print("   Generating Topic Pie charts...")
+        pie_figs = make_primary_training_topic_pie_charts(
+            center_trainings_df,
+            center=center,
+            fiscal_year_tag=args.fiscalyear,
+            first_steps_vals=first_steps_values,
+            col_neo_attendees_total=NEOSERRA_COLUMNS.attendees_total,
+            col_neo_primary_topic=NEOSERRA_COLUMNS.primary_training_topic
+        )
+        save_variant_charts(
+            chart_dict=pie_figs,
+            base_path=args.outpath,
+            report=report_tag,
+            chart_type=args.name_pie
+        )
+
+    # Printed once, after every center has been processed.
+    print("\nDONE!")