Files
testing123/section_3_graph_export_module/trainings_analysis_script.py
2026-05-21 08:40:24 -04:00

211 lines
8.4 KiB
Python

# FILE: pasbdc_trainings_analysis_script.py
# CREATED: 12/26/25
# AUTHOR: Vincent Allen
# PURPOSE: Full pipeline script: cleans raw data, filters it to the requested funding groups, and generates statistics and analysis graphs per center for the center-specific section 3
# Third party libraries
import pandas as pd
import sys
import os.path
import argparse
import json
# Custom modules
# 1. The Cleaning Library
from section_1_datasets_module import ( #pyright:ignore
generate_cleaned_trainings_dataset,
)
# 2. The Graphing Library
from section_1_graph_library_module import ( # pyright:ignore
make_attendee_bins_statistics_charts,
make_primary_training_topic_statistics_charts,
make_primary_training_topic_pie_charts,
)
from shared_tools_module import save_variant_charts, csv_url_to_dataframe
from constants_module import NEOSERRA_COLUMNS, OUT_COLUMNS
# Funding sources the dataset is filtered to when no --funding argument is supplied.
DEFAULT_FUNDING_GROUPS = ['Core Services', 'LEXNET', 'PDA', 'NAP']
def parse_args(argv=None) -> argparse.Namespace:
    """Build the command line parser for this script and parse the arguments.

    Args:
        argv: Optional list of argument strings to parse. When None (the
            default) argparse reads the arguments from ``sys.argv``.

    Returns:
        The parsed arguments. ``funding`` is None when no ``--funding``
        argument was given; the caller is expected to substitute its own
        default in that case.

    Raises:
        SystemExit: Raised by argparse on invalid arguments or ``--help``.
    """
    parser = argparse.ArgumentParser(description="Clean Data and Generate Trainings Analysis Graphs")

    # --- INPUT/OUTPUT ---
    # Exactly one data source must be provided: a local CSV or an export URL.
    inputdata_group = parser.add_mutually_exclusive_group(required=True)
    inputdata_group.add_argument("--inputcsv",
                                 type=str,
                                 help="The path to the RAW Neoserra trainings CSV export.")
    inputdata_group.add_argument("--exportmoduleurl",
                                 type=str,
                                 help="The URL to the export module for this script")

    parser.add_argument("--outpath",
                        type=str,
                        required=True,
                        help="The base directory path to place generated files into.")
    parser.add_argument("--cleanedfilename",
                        type=str,
                        default="cleaned_trainings_data.csv",
                        help="The name to give the intermediate cleaned dataset.")
    parser.add_argument("--mapping",
                        type=str,
                        required=False,
                        help="Path to a JSON file to override default column names mappings.")
    parser.add_argument("--fiscalyear",
                        type=str,
                        default="FY25",
                        help="The label for the fiscal year to appear in graph titles.")
    parser.add_argument("--reportname",
                        type=str,
                        default="trainingsreport",
                        help="The report name used in the output file names to allow identification via ImageRegistry. The center name will be appended to the end of this to keep the number of images grabbed by the ImageRegistry smaller.")

    # Base Filenames (Prefixes)
    parser.add_argument("--name_bins", type=str, default="attendee-bins",
                        help="Base filename for attendee bins charts.")
    parser.add_argument("--name_topics", type=str, default="training-topics",
                        help="Base filename for primary topic charts.")
    parser.add_argument("--name_pie", type=str, default="topics-pie",
                        help="Base filename for network wide topic pie charts.")

    # NOTE: The default is deliberately None. With action='append', argparse
    # appends user-supplied values to a list default instead of replacing it,
    # so the real default list has to be applied by the caller after parsing.
    parser.add_argument("--funding",
                        action='append',
                        required=False,
                        default=None,
                        help="A funding source to include in the analysis. Repeat the --funding argument to include several funding sources. When omitted, the script's default funding groups are used.")

    return parser.parse_args(argv)
if __name__ == "__main__":
    # Script entry point: parse the CLI options, clean the raw trainings
    # export, save the cleaned dataset, then render and save each chart
    # family once per center found in the funding-filtered data.
    args = parse_args()

    # --funding uses action='append' with a None default, so the default
    # funding groups are substituted here rather than inside argparse.
    if args.funding is None:
        args.funding = DEFAULT_FUNDING_GROUPS

    # Handle optional JSON mapping override
    if args.mapping:
        try:
            # NOTE(review): the opened handle is not used; apply_json_mapping is
            # given the path. The open() is kept so an unreadable path still
            # fails here — confirm whether apply_json_mapping needs it.
            with open(args.mapping, 'r'):
                NEOSERRA_COLUMNS.apply_json_mapping(args.mapping)
                OUT_COLUMNS.apply_json_mapping(args.mapping)
        except Exception as e:
            print(f'Failed to load user column configuration JSON file, got={e}')
            sys.exit(1)

    # Ensure output directory exists. exist_ok=True avoids the check-then-create
    # race and reports an error if the path exists but is not a directory.
    try:
        os.makedirs(args.outpath, exist_ok=True)
    except OSError as e:
        print(f"Error creating output directory: {e}")
        sys.exit(1)

    # DATA CLEANING
    # Report whichever source was actually supplied (CSV path or export URL).
    data_source = args.inputcsv or args.exportmoduleurl
    print(f"Loading and Cleaning data from {data_source}...\n")
    try:
        if args.inputcsv:
            trainings_df = pd.read_csv(args.inputcsv)
        elif args.exportmoduleurl:
            trainings_df = csv_url_to_dataframe(args.exportmoduleurl)
        else:
            raise RuntimeError("A data source for this script has not been defined")

        # Filter for reportable records only.
        # This will fail with a KeyError if the column is missing, as required.
        trainings_df = trainings_df[trainings_df[NEOSERRA_COLUMNS.reportable] == 1]

        trainings_df = generate_cleaned_trainings_dataset(
            trainings_df,
            col_neo_event_title=NEOSERRA_COLUMNS.event_title,
            col_neo_primary_topic=NEOSERRA_COLUMNS.primary_training_topic,
            col_neo_training_topics=NEOSERRA_COLUMNS.training_topics,
            col_neo_center=NEOSERRA_COLUMNS.center,
            col_is_preplanning=OUT_COLUMNS.is_preplanning,
            col_neo_attendees_total=NEOSERRA_COLUMNS.attendees_total,
            col_out_attendees_range=OUT_COLUMNS.attendees_range,
        )

        # Save cleaned data (written before the funding filter below is applied)
        clean_out = os.path.join(args.outpath, args.cleanedfilename)
        trainings_df.to_csv(str(clean_out), index=False)
        print(f"Cleaned dataset saved to {clean_out}\n")
    except Exception as e:
        print(f"Failed to clean input CSV: {e}")
        sys.exit(1)

    # Filter the dataset for only the funding sources we care about
    trainings_df = trainings_df[trainings_df[NEOSERRA_COLUMNS.funding_source].isin(args.funding)]

    # Values for "First Steps"
    first_steps_values = ['First Steps', 'Next Steps']

    # Find all centers that we have in this data set
    unique_centers = trainings_df[NEOSERRA_COLUMNS.center].unique().tolist()

    for center in unique_centers:
        center_trainings_df = trainings_df[(trainings_df[NEOSERRA_COLUMNS.center] == center)]
        # The report name with the center appended is shared by every chart
        # family saved for this center.
        center_report = f'{args.reportname}{center}'
        print(f"Starting generation for center {center}...\n")

        # Attendee Bins Charts (uses the cleaned, per-center data)
        print(" Generating Attendee Bins charts...")
        bins_figs = make_attendee_bins_statistics_charts(
            center_trainings_df,
            center=center,
            fiscal_year_tag=args.fiscalyear,
            first_steps_vals=first_steps_values,
            preplanning_val=OUT_COLUMNS.val_preplanning,
            col_neo_attendees_total=NEOSERRA_COLUMNS.attendees_total,
            col_attendees_range=OUT_COLUMNS.attendees_range,
            col_neo_primary_topic=NEOSERRA_COLUMNS.primary_training_topic,
        )
        save_variant_charts(
            chart_dict=bins_figs,
            base_path=args.outpath,
            report=center_report,
            chart_type=args.name_bins,
        )

        # Primary Topic Charts
        print(" Generating Primary Topic charts...")
        topic_figs = make_primary_training_topic_statistics_charts(
            center_trainings_df,
            center=center,
            fiscal_year_tag=args.fiscalyear,
            first_steps_vals=first_steps_values,
            col_neo_primary_topic=NEOSERRA_COLUMNS.primary_training_topic,
            col_neo_attendees_total=NEOSERRA_COLUMNS.attendees_total,
        )
        save_variant_charts(
            chart_dict=topic_figs,
            base_path=args.outpath,
            report=center_report,
            chart_type=args.name_topics,
        )

        # Topic Pie Charts
        print(" Generating Topic Pie charts...")
        pie_figs = make_primary_training_topic_pie_charts(
            center_trainings_df,
            center=center,
            fiscal_year_tag=args.fiscalyear,
            first_steps_vals=first_steps_values,
            col_neo_attendees_total=NEOSERRA_COLUMNS.attendees_total,
            col_neo_primary_topic=NEOSERRA_COLUMNS.primary_training_topic,
        )
        save_variant_charts(
            chart_dict=pie_figs,
            base_path=args.outpath,
            report=center_report,
            chart_type=args.name_pie,
        )

    print("\nDONE!")