Files
testing123/section_1_graph_export_module/trainings_analysis_script.py
2026-05-21 08:40:24 -04:00

306 lines
12 KiB
Python

# FILE: pasbdc_trainings_analysis_script.py
# CREATED: 12/26/25
# AUTHOR: Vincent Allen
# PURPOSE: Full pipeline script: Cleans raw data, generates statistics, and creates analysis graphs per funding group.
# Standard library and third party libraries
import pandas as pd
import sys
import os.path
import argparse
import json
# Custom modules
from section_1_datasets_module import ( #pyright:ignore
generate_cleaned_trainings_dataset,
generate_center_trainings_count_statistics
)
from section_1_graph_library_module import ( # pyright:ignore
make_attendee_bins_statistics_charts,
make_primary_training_topic_statistics_charts,
make_center_attendee_statistics_charts,
make_center_event_count_charts,
make_center_attendee_range_charts,
make_primary_training_topic_pie_charts,
make_network_trainings_count_statistics_charts
)
from shared_tools_module import save_variant_charts, csv_url_to_dataframe
from constants_module import NEOSERRA_COLUMNS, OUT_COLUMNS
DEFAULT_FUNDING_GROUPS = ['Core Services', 'LEXNET', 'PDA', 'NAP']
REPORT_NAME = "trainingsreport"
def parse_args(argv=None):
    """Build the command line parser and parse the arguments.

    Args:
        argv: Optional list of argument strings to parse. When None (the
            default) argparse reads the process arguments, which keeps the
            original no-argument call working unchanged.

    Returns:
        The argparse.Namespace of parsed options. ``funding`` is None when no
        ``--funding`` flag was given; the caller substitutes the default
        funding groups in that case.
    """
    parser = argparse.ArgumentParser(description="Clean Data and Generate Trainings Analysis Graphs")

    # Exactly one data source must be given: a local CSV or an export URL.
    dataset_group = parser.add_mutually_exclusive_group(required=True)

    # --- INPUT/OUTPUT ---
    dataset_group.add_argument("--inputcsv",
                               type=str,
                               help="The path to the RAW Neoserra trainings CSV export.")
    dataset_group.add_argument("--exportmoduleurl",
                               type=str,
                               help="The Neoserra export module url for the Trainings data export")
    parser.add_argument("--outpath",
                        type=str,
                        required=True,
                        help="The base directory path to place generated files into.")
    parser.add_argument("--cleanedfilename",
                        type=str,
                        default="cleaned_trainings_data.csv",
                        help="The name to give the intermediate cleaned dataset.")
    parser.add_argument("--mapping",
                        type=str,
                        required=False,
                        help="Path to a JSON file to override default column names mappings.")
    parser.add_argument("--fiscalyear",
                        type=str,
                        default="FY25",
                        help="The label for the fiscal year to appear in graph titles.")

    # --- BASE FILENAMES (Prefixes) ---
    parser.add_argument("--name_stats", type=str, default="center-statistics",
                        help="Base filename for the detailed center statistics charts.")
    parser.add_argument("--name_bins", type=str, default="attendee-bins",
                        help="Base filename for attendee bins charts.")
    parser.add_argument("--name_topics", type=str, default="training-topics",
                        help="Base filename for primary topic charts.")
    parser.add_argument("--name_center_attendees", type=str, default="center-attendees",
                        help="Base filename for center attendee statistics charts.")
    parser.add_argument("--name_event_counts", type=str, default="event-counts",
                        help="Base filename for center event count charts.")
    parser.add_argument("--name_ranges", type=str, default="attendee-ranges",
                        help="Base filename for center attendee range stacked bars.")
    parser.add_argument("--name_pie", type=str, default="topics-pie",
                        help="Base filename for network wide topic pie charts.")

    # NOTE: The default is deliberately None. With action='append', argparse
    # appends user supplied values to whatever list is passed as the default
    # instead of replacing it, so the real default list has to be applied by
    # the caller after parsing when this comes back as None.
    parser.add_argument("--funding",
                        action='append',
                        required=False,
                        default=None,
                        help="Which funding groups should be included in the final analysis")

    return parser.parse_args(argv)
if __name__ == "__main__":
    # Script entry point: parse options, clean the raw trainings export, build
    # the center statistics table, then render and save each chart set.

    # Imported here because it is only needed for error reporting below.
    import traceback

    args = parse_args()

    # The parser leaves funding as None when no --funding flag was given, so
    # the default groups are substituted here.
    if args.funding is None:
        args.funding = DEFAULT_FUNDING_GROUPS

    # Handle optional JSON mapping override
    if args.mapping:
        try:
            # Opening the file fails fast with an OSError when the path is
            # missing or unreadable; the mapping helpers receive the path.
            with open(args.mapping, 'r'):
                NEOSERRA_COLUMNS.apply_json_mapping(args.mapping)
                OUT_COLUMNS.apply_json_mapping(args.mapping)
        except Exception as e:
            print(f'Failed to load user column configuration JSON file, got={e}')
            sys.exit(1)

    # Ensure output directory exists. exist_ok avoids the check-then-create
    # race and still raises OSError if the path exists but is not a directory.
    try:
        os.makedirs(args.outpath, exist_ok=True)
    except OSError as e:
        print(f"Error creating output directory: {e}")
        sys.exit(1)

    # DATA CLEANING
    # Only one of inputcsv / exportmoduleurl is set. The URL itself is not
    # echoed to the console in case it carries credentials.
    data_source = args.inputcsv if args.inputcsv else "the export module URL"
    print(f"Loading and Cleaning data from {data_source}...\n")
    try:
        if args.inputcsv:
            trainings_df = pd.read_csv(args.inputcsv)
        elif args.exportmoduleurl:
            trainings_df = csv_url_to_dataframe(args.exportmoduleurl)
        else:
            raise RuntimeError("No dataset defined, an inputcsv or exportmoduleurl is required. This should not be possible unless you have changed the code.")

        # Filter for reportable records only.
        # A missing column raises a KeyError here, which is reported by the
        # handler below before the script exits.
        trainings_df = trainings_df[trainings_df[NEOSERRA_COLUMNS.reportable] == 1]

        trainings_df = generate_cleaned_trainings_dataset(
            trainings_df,
            funding_sources=args.funding,
            col_neo_event_title=NEOSERRA_COLUMNS.event_title,
            col_neo_primary_topic=NEOSERRA_COLUMNS.primary_training_topic,
            col_neo_training_topics=NEOSERRA_COLUMNS.training_topics,
            col_neo_center=NEOSERRA_COLUMNS.center,
            col_is_preplanning=OUT_COLUMNS.is_preplanning,
            col_neo_attendees_total=NEOSERRA_COLUMNS.attendees_total,
            col_out_attendees_range=OUT_COLUMNS.attendees_range,
        )

        # Save cleaned data
        clean_out = os.path.join(args.outpath, args.cleanedfilename)
        trainings_df.to_csv(clean_out, index=False)
        print(f"Cleaned dataset saved to {clean_out}\n")
    except Exception as e:
        print(f"Failed to clean input CSV: {e}")
        sys.exit(1)

    # Values for "First Steps"
    first_steps_values = ['First Steps', 'Next Steps']

    # Generate the statistics DataFrame.
    # This is required for Graph Set 1 (Center Statistics)
    try:
        stats_df = generate_center_trainings_count_statistics(
            full_df=trainings_df,
            # The "selected" subset is the events that recorded zero attendees.
            filtered_df=trainings_df[trainings_df[NEOSERRA_COLUMNS.attendees_total] == 0],
            funding_source_group=args.funding,
            col_primary_topic=NEOSERRA_COLUMNS.primary_training_topic,
            col_center=NEOSERRA_COLUMNS.center,
            col_funding_source=NEOSERRA_COLUMNS.funding_source,
            col_attendees_total=NEOSERRA_COLUMNS.attendees_total,
            col_is_preplanning=OUT_COLUMNS.is_preplanning
        )
    except Exception:
        print("Error generating statistics dataframe input dataset")
        # format_exc() renders the active exception with its traceback.
        print(traceback.format_exc())
        sys.exit(1)

    # Generate Graphs
    print("Starting graph generation...\n")

    # 1. Center Statistics Charts (Uses stats_df)
    print(" Generating Center Statistics charts...")
    stats_figs = make_network_trainings_count_statistics_charts(
        funding_group_df=stats_df,
        filter_description_tag="With 0 Attendees",
        fiscal_year_tag=args.fiscalyear,
        # Columns in stats_df are generated with fixed names by
        # generate_center_trainings_count_statistics, so we rely on the
        # defaults in the graph function or pass them if needed.
        # The cleaning library outputs standard names like 'Selected Events', 'Percent Selected Events'
        # which match the defaults of the graph library.
    )
    save_variant_charts(
        chart_dict=stats_figs,
        base_path=args.outpath,
        report=REPORT_NAME,
        chart_type=args.name_stats
    )

    # 2. Attendee Bins Charts (Uses cleaned trainings_df)
    print(" Generating Attendee Bins charts...")
    bins_figs = make_attendee_bins_statistics_charts(
        trainings_df,
        center="Network Wide",
        fiscal_year_tag=args.fiscalyear,
        first_steps_vals=first_steps_values,
        preplanning_val=OUT_COLUMNS.val_preplanning,
        col_neo_attendees_total=NEOSERRA_COLUMNS.attendees_total,
        col_attendees_range=OUT_COLUMNS.attendees_range,
        col_neo_primary_topic=NEOSERRA_COLUMNS.primary_training_topic
    )
    save_variant_charts(
        chart_dict=bins_figs,
        base_path=args.outpath,
        report=REPORT_NAME,
        chart_type=args.name_bins
    )

    # 3. Primary Training Topic Charts
    print(" Generating Primary Topic charts...")
    topic_figs = make_primary_training_topic_statistics_charts(
        trainings_df,
        center="Network Wide",
        fiscal_year_tag=args.fiscalyear,
        first_steps_vals=first_steps_values,
        col_neo_primary_topic=NEOSERRA_COLUMNS.primary_training_topic,
        col_neo_attendees_total=NEOSERRA_COLUMNS.attendees_total
    )
    save_variant_charts(
        chart_dict=topic_figs,
        base_path=args.outpath,
        report=REPORT_NAME,
        chart_type=args.name_topics
    )

    # 4. Center Attendee Statistics
    print(" Generating Center Attendee Stats charts...")
    center_att_figs = make_center_attendee_statistics_charts(
        trainings_df,
        fiscal_year_tag=args.fiscalyear,
        col_neo_center=NEOSERRA_COLUMNS.center,
        col_neo_attendees_total=NEOSERRA_COLUMNS.attendees_total,
        col_neo_primary_topic=NEOSERRA_COLUMNS.primary_training_topic,
        first_steps_vals=first_steps_values,
        preplanning_val=OUT_COLUMNS.val_preplanning
    )
    save_variant_charts(
        chart_dict=center_att_figs,
        base_path=args.outpath,
        report=REPORT_NAME,
        chart_type=args.name_center_attendees
    )

    # 5. Center Event Counts
    print(" Generating Center Event Count charts...")
    event_count_figs = make_center_event_count_charts(
        trainings_df,
        fiscal_year_tag=args.fiscalyear,
        col_neo_center=NEOSERRA_COLUMNS.center,
        col_neo_attendees_total=NEOSERRA_COLUMNS.attendees_total,
        col_neo_primary_topic=NEOSERRA_COLUMNS.primary_training_topic,
        first_steps_vals=first_steps_values,
        preplanning_val=OUT_COLUMNS.val_preplanning
    )
    save_variant_charts(
        chart_dict=event_count_figs,
        base_path=args.outpath,
        report=REPORT_NAME,
        chart_type=args.name_event_counts
    )

    # 6. Center Attendee Range (Stacked Bars)
    print(" Generating Center Attendee Range charts...")
    range_figs = make_center_attendee_range_charts(
        trainings_df,
        fiscal_year_tag=args.fiscalyear,
        col_neo_center=NEOSERRA_COLUMNS.center,
        col_attendees_range=OUT_COLUMNS.attendees_range,
        col_neo_training_id=NEOSERRA_COLUMNS.training_id,
        col_neo_primary_topic=NEOSERRA_COLUMNS.primary_training_topic,
        first_steps_vals=first_steps_values,
        preplanning_val=OUT_COLUMNS.val_preplanning
    )
    save_variant_charts(
        chart_dict=range_figs,
        base_path=args.outpath,
        report=REPORT_NAME,
        chart_type=args.name_ranges
    )

    # 7. Network Wide Topic Pie Charts
    print(" Generating Topic Pie charts...")
    pie_figs = make_primary_training_topic_pie_charts(
        trainings_df,
        center="Network Wide",
        fiscal_year_tag=args.fiscalyear,
        first_steps_vals=first_steps_values,
        col_neo_attendees_total=NEOSERRA_COLUMNS.attendees_total,
        col_neo_primary_topic=NEOSERRA_COLUMNS.primary_training_topic
    )
    save_variant_charts(
        chart_dict=pie_figs,
        base_path=args.outpath,
        report=REPORT_NAME,
        chart_type=args.name_pie
    )

    print("\nDONE! All charts generated for all funding groups.")