first commit

2026-05-21 08:40:24 -04:00
commit b084545275
711 changed files with 3659856 additions and 0 deletions
--- a/section_3_graph_export_module/trainings_analysis_script.py
+++ b/section_3_graph_export_module/trainings_analysis_script.py
@@ -0,0 +1,211 @@
+# FILE: trainings_analysis_script.py
+# CREATED: 12/26/25
+# AUTHOR: Vincent Allen
+# PURPOSE: Full pipeline script: cleans the raw data, saves the cleaned dataset, and creates trainings analysis graphs per center (limited to the selected funding groups) for the center-specific section 3
+
+# Third party libraries
+import pandas as pd
+import sys
+import os.path
+import argparse
+import json
+
+# Custom modules
+# 1. The Cleaning Library
+from section_1_datasets_module import ( #pyright:ignore
+    generate_cleaned_trainings_dataset,
+)
+
+# 2. The Graphing Library
+from section_1_graph_library_module import ( # pyright:ignore
+    make_attendee_bins_statistics_charts,
+    make_primary_training_topic_statistics_charts,
+    make_primary_training_topic_pie_charts,
+)
+
+from shared_tools_module import save_variant_charts, csv_url_to_dataframe
+from constants_module import NEOSERRA_COLUMNS, OUT_COLUMNS
+
+# Funding sources used to filter the dataset when no --funding argument is passed on the command line
+DEFAULT_FUNDING_GROUPS = ['Core Services', 'LEXNET', 'PDA', 'NAP']
+
+def parse_args():
+    """Define and parse the command line options; funding stays None unless --funding is passed at least once."""
+    parser = argparse.ArgumentParser(description="Clean Data and Generate Trainings Analysis Graphs")
+
+    inputdata_group = parser.add_mutually_exclusive_group(required=True)
+
+    # --- INPUT/OUTPUT --- (exactly one of the two data sources below must be given)
+    inputdata_group.add_argument("--inputcsv",
+                                 type=str,
+                                 help="The path to the RAW Neoserra trainings CSV export.")
+    inputdata_group.add_argument("--exportmoduleurl", type=str, help="The URL to the export module for this script")
+
+    parser.add_argument("--outpath",
+                        type=str,
+                        required=True,
+                        help="The base directory path to place generated files into.")
+
+    parser.add_argument("--cleanedfilename",
+                        type=str,
+                        default="cleaned_trainings_data.csv",
+                        help="The name to give the intermediate cleaned dataset.")
+
+    parser.add_argument("--mapping",
+                        type=str,
+                        required=False,
+                        help="Path to a JSON file to override default column names mappings.")
+
+    parser.add_argument("--fiscalyear",
+                        type=str,
+                        default="FY25",
+                        help="The label for the fiscal year to appear in graph titles.")
+
+    parser.add_argument("--reportname",
+                        type=str,
+                        default="trainingsreport",
+                        help="The report name used in the output file names to allow identification via ImageRegistry. The center name will be appended to the end of this to keep the number of images grabbed by the ImageRegistry smaller.")
+
+    # Base Filenames (Prefixes)
+    parser.add_argument("--name_bins", type=str, default="attendee-bins",
+                        help="Base filename for attendee bins charts.")
+
+    parser.add_argument("--name_topics", type=str, default="training-topics",
+                        help="Base filename for primary topic charts.")
+
+    parser.add_argument("--name_pie", type=str, default="topics-pie",
+                        help="Base filename for network wide topic pie charts.")
+
+    # NOTE: The default stays None and the caller substitutes the default funding groups afterwards. With
+    # action='append' argparse appends user values onto a list default instead of replacing it.
+    parser.add_argument("--funding",
+                        action='append',
+                        required=False,
+                        default=None,
+                        help="A funding source to include in the analysis. Repeat the argument to include several funding sources; when omitted, the script's default funding groups are used.")
+
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    # No --funding flags were given: fall back to the module-level default funding groups
+    if args.funding is None:
+        args.funding = DEFAULT_FUNDING_GROUPS
+
+    # Optional JSON mapping override. NOTE(review): the opened handle is unused; both calls receive the path.
+    if args.mapping:
+        try:
+            with open(args.mapping, 'r') as f:
+                NEOSERRA_COLUMNS.apply_json_mapping(args.mapping)
+                OUT_COLUMNS.apply_json_mapping(args.mapping)
+        except Exception as e:
+            print(f'Failed to load user column configuration JSON file, got={e}')
+            sys.exit(1)
+
+    # Ensure output directory exists
+    if not os.path.exists(args.outpath):
+        try:
+            os.makedirs(args.outpath)
+        except OSError as e:
+            print(f"Error creating output directory: {e}")
+            sys.exit(1)
+
+    # DATA CLEANING (the message names whichever of the two data sources was supplied)
+    print(f"Loading and Cleaning data from {args.inputcsv or args.exportmoduleurl}...\n")
+    try:
+        if args.inputcsv:
+            trainings_df = pd.read_csv(args.inputcsv)
+        elif args.exportmoduleurl:
+            trainings_df = csv_url_to_dataframe(args.exportmoduleurl)
+        else:
+            raise RuntimeError("A data source for this script has not been defined")
+
+        # Filter for reportable records only.
+        # This will fail with a KeyError if the column is missing, as required.
+        trainings_df = trainings_df[trainings_df[NEOSERRA_COLUMNS.reportable] == 1]
+
+        trainings_df = generate_cleaned_trainings_dataset(
+            trainings_df,
+            col_neo_event_title=NEOSERRA_COLUMNS.event_title,
+            col_neo_primary_topic=NEOSERRA_COLUMNS.primary_training_topic,
+            col_neo_training_topics=NEOSERRA_COLUMNS.training_topics,
+            col_neo_center=NEOSERRA_COLUMNS.center,
+            col_is_preplanning=OUT_COLUMNS.is_preplanning,
+            col_neo_attendees_total=NEOSERRA_COLUMNS.attendees_total,
+            col_out_attendees_range=OUT_COLUMNS.attendees_range,
+        )
+
+        # Save cleaned data (written before the funding filter below, so it holds every reportable record)
+        clean_out = os.path.join(args.outpath, args.cleanedfilename)
+        trainings_df.to_csv(str(clean_out), index=False)
+        print(f"Cleaned dataset saved to {clean_out}\n")
+
+    except Exception as e:
+        print(f"Failed to clean input CSV: {e}")
+        sys.exit(1)
+
+    # Filter the dataset for only the funding sources we care about
+    trainings_df = trainings_df[trainings_df[NEOSERRA_COLUMNS.funding_source].isin(args.funding)]
+
+    # Values for "First Steps"
+    first_steps_values = ['First Steps', 'Next Steps']
+
+    # Find all centers that we have in this data set
+    unique_centers = trainings_df[NEOSERRA_COLUMNS.center].unique().tolist()
+
+    for center in unique_centers:
+        center_trainings_df = trainings_df[(trainings_df[NEOSERRA_COLUMNS.center] == center)]
+        print(f"Starting generation for center {center}...\n")
+
+        # Attendee Bins Charts (built from this center's slice of the cleaned, funding-filtered data)
+        print("   Generating Attendee Bins charts...")
+        bins_figs = make_attendee_bins_statistics_charts(
+            center_trainings_df,
+            center=center,
+            fiscal_year_tag=args.fiscalyear,
+            first_steps_vals=first_steps_values,
+            preplanning_val=OUT_COLUMNS.val_preplanning,
+            col_neo_attendees_total=NEOSERRA_COLUMNS.attendees_total,
+            col_attendees_range=OUT_COLUMNS.attendees_range,
+            col_neo_primary_topic=NEOSERRA_COLUMNS.primary_training_topic
+        )
+        save_variant_charts(
+            chart_dict=bins_figs,
+            base_path=args.outpath,
+            report=f'{args.reportname}{center}',
+            chart_type=args.name_bins)
+
+        print("   Generating Primary Topic charts...")
+        topic_figs = make_primary_training_topic_statistics_charts(
+            center_trainings_df,
+            center=center,
+            fiscal_year_tag=args.fiscalyear,
+            first_steps_vals=first_steps_values,
+            col_neo_primary_topic=NEOSERRA_COLUMNS.primary_training_topic,
+            col_neo_attendees_total=NEOSERRA_COLUMNS.attendees_total
+        )
+        save_variant_charts(
+            chart_dict=topic_figs,
+            base_path=args.outpath,
+            report=f'{args.reportname}{center}',
+            chart_type=args.name_topics)
+
+        # Topic Pie Charts
+        print("   Generating Topic Pie charts...")
+        pie_figs = make_primary_training_topic_pie_charts(
+            center_trainings_df,
+            center=center,
+            fiscal_year_tag=args.fiscalyear,
+            first_steps_vals=first_steps_values,
+            col_neo_attendees_total=NEOSERRA_COLUMNS.attendees_total,
+            col_neo_primary_topic=NEOSERRA_COLUMNS.primary_training_topic
+        )
+        save_variant_charts(
+            chart_dict=pie_figs,
+            base_path=args.outpath,
+            report=f'{args.reportname}{center}',
+            chart_type=args.name_pie
+        )
+
+    print("\nDONE!")
--- a/section_3_graph_export_module/trainings_topic_per_center_script.py
+++ b/section_3_graph_export_module/trainings_topic_per_center_script.py
@@ -0,0 +1,165 @@
+# FILE: trainings_topic_per_center_script.py
+# CREATED: 12/26/25
+# AUTHOR: Vincent Allen
+# PURPOSE: Script to generate per-center training topic analysis graphs across funding groups.
+
+# Third party libraries
+import pandas as pd
+import sys
+import os.path
+import argparse
+import json
+import re
+
+from shared_tools_module import save_variant_charts, csv_url_to_dataframe
+from section_1_datasets_module import generate_cleaned_trainings_dataset
+from section_3_graph_library_module import make_center_topic_analysis
+from constants_module import NEOSERRA_COLUMNS, OUT_COLUMNS
+
+
+# The default funding groups defined in the requirements; used when no --funding argument is passed
+DEFAULT_FUNDING_GROUPS = ['Core Services', 'LEXNET', 'PDA', 'NAP']
+
+def parse_args():
+    """Define and parse the command line options; funding stays None unless --funding is passed at least once."""
+    parser = argparse.ArgumentParser(description="Generate Center-Specific Topic Analysis Graphs")
+
+    datasource_group = parser.add_mutually_exclusive_group(required=True)
+    # --- INPUT/OUTPUT --- (exactly one of the two data sources below must be given)
+    datasource_group.add_argument("--inputcsv",
+                                  type=str,
+                                  help="The path to the raw Neoserra trainings CSV.")
+    datasource_group.add_argument("--exportmoduleurl", type=str, help="The url to the export module created for this script.")
+
+    parser.add_argument("--outpath",
+                        type=str,
+                        required=True,
+                        help="The base directory path to place generated files into.")
+
+    parser.add_argument("--mapping",
+                        type=str,
+                        required=False,
+                        help="Path to a JSON file to override default column names mappings.")
+
+    parser.add_argument("--fiscalyear",
+                        type=str,
+                        default="FY25",
+                        help="The label for the fiscal year to appear in graph titles.")
+
+    parser.add_argument("--reportname", type=str, default="trainingsreport",
+                        help="Prefix for the output files. Used by ImageRegistry in later parts of the pipeline to find the correct report's images")
+
+    parser.add_argument("--chartcategorybase", type=str, default="topicanalysis", help="The chart category to use for this graph. It will be the second part of the filename and will have the center name for the chart appended to this value in the filename.")
+
+    # NOTE: The default stays None and the caller substitutes the default funding groups afterwards. With
+    # action='append' argparse appends user values onto a list default instead of replacing it.
+    parser.add_argument("--funding",
+                        action='append',
+                        required=False,
+                        default=None,
+                        help="A funding source to include in the analysis. Repeat the argument to include several funding sources; when omitted, the script's default funding groups are used.")
+
+    return parser.parse_args()
+
+def sanitize_filename(name: str) -> str:
+    """
+    Build a lowercase, filename-safe token from free text such as a center name.
+
+    Each space becomes an underscore; every remaining character that is not
+    a word character or a hyphen is dropped before the result is lowercased.
+    """
+    underscored = name.replace(" ", "_")
+    kept_only_safe = re.sub(r'(?u)[^-\w]', '', underscored)
+    return kept_only_safe.lower()
+
+if __name__ == "__main__":
+    args = parse_args()
+    # No --funding flags were given: fall back to the module-level default funding groups
+    if args.funding is None:
+        args.funding = DEFAULT_FUNDING_GROUPS
+
+    # Optional JSON mapping override. NOTE(review): the opened handle is unused; both calls receive the path.
+    if args.mapping:
+        try:
+            with open(args.mapping, 'r') as f:
+                NEOSERRA_COLUMNS.apply_json_mapping(args.mapping)
+                OUT_COLUMNS.apply_json_mapping(args.mapping)
+        except Exception as e:
+            print(f'Failed to load user column configuration JSON file, got={e}')
+            sys.exit(1)
+
+    # Ensure output directory exists
+    if not os.path.exists(args.outpath):
+        try:
+            os.makedirs(args.outpath)
+        except OSError as e:
+            print(f"Error creating output directory: {e}")
+            sys.exit(1)
+
+    print(f"Loading input data from {args.inputcsv or args.exportmoduleurl}...\n")
+    try:
+        if args.inputcsv:
+            trainings_df = pd.read_csv(args.inputcsv)
+        elif args.exportmoduleurl:
+            trainings_df = csv_url_to_dataframe(args.exportmoduleurl)
+        else:
+            raise RuntimeError("A datasource was not defined in the script.")
+    except Exception as e:
+        print(f"Failed to read input CSV: {e}")
+        sys.exit(1)
+
+    # Filter for reportable records only.
+    # This will fail with a KeyError if the column is missing, as required.
+    trainings_df = trainings_df[trainings_df[NEOSERRA_COLUMNS.reportable] == 1]
+
+    print("Cleaning input data...")
+    trainings_df = generate_cleaned_trainings_dataset(
+        trainings_df,
+        col_neo_event_title = NEOSERRA_COLUMNS.event_title,
+        col_neo_primary_topic = NEOSERRA_COLUMNS.primary_training_topic,
+        col_neo_training_topics = NEOSERRA_COLUMNS.training_topics,
+        col_neo_center = NEOSERRA_COLUMNS.center,
+        col_is_preplanning = OUT_COLUMNS.is_preplanning,
+        col_neo_attendees_total = NEOSERRA_COLUMNS.attendees_total,
+        col_out_attendees_range = OUT_COLUMNS.attendees_range,
+    )
+
+    print("Starting generation for center specific charts\n")
+
+    # Filter data by funding group.
+    # The library function is handed this dataframe as-is, so rows outside the
+    # selected funding groups must be removed *before* it is passed in.
+    filtered_df = trainings_df[trainings_df[NEOSERRA_COLUMNS.funding_source].isin(args.funding)].copy()
+
+    if filtered_df.empty:
+        print(f"No data found for Funding Groups {args.funding}.")
+
+    # Identify Centers in this filtered subset
+    centers_in_group = filtered_df[NEOSERRA_COLUMNS.center].unique() #pyright:ignore
+    print(f"\tFound {len(centers_in_group)} centers with data in this funding group: {args.funding}")
+    print(f"\tCenters: {centers_in_group}")
+
+    # Chart each center. NOTE(review): center_name goes into the filename raw; sanitize_filename is never applied - confirm.
+    for center_name in centers_in_group:
+        print(f'\tGenerating center {center_name}')
+        try:
+            # Call library function
+            center_figs = make_center_topic_analysis(
+                trainings_df=filtered_df,
+                center_name=center_name,
+                fiscal_year_tag=args.fiscalyear,
+                col_neo_center=NEOSERRA_COLUMNS.center,
+                col_neo_primary_topic=NEOSERRA_COLUMNS.primary_training_topic
+            )
+
+            save_variant_charts(
+                chart_dict=center_figs,
+                base_path=args.outpath,
+                report=f"{args.reportname}",
+                chart_type=f"{args.chartcategorybase}{center_name}"
+            )
+
+        except Exception as e:
+            print(f"      Error processing center '{center_name}': {e}")
+
+    print("\nDONE! All center topic analysis charts generated.")