first commit

This commit is contained in:
2026-05-21 08:40:24 -04:00
commit b084545275
711 changed files with 3659856 additions and 0 deletions

View File

@@ -0,0 +1,211 @@
# FILE: pasbdc_trainings_analysis_script.py
# CREATED: 12/26/25
# AUTHOR: Vincent Allen
# PURPOSE: Full pipeline script: Cleans raw data, generates statistics, and creates analysis graphs per funding group for the center specific section 3
# Third party libraries
import pandas as pd
import sys
import os.path
import argparse
import json
# Custom modules
# 1. The Cleaning Library
from section_1_datasets_module import ( #pyright:ignore
generate_cleaned_trainings_dataset,
)
# 2. The Graphing Library
from section_1_graph_library_module import ( # pyright:ignore
make_attendee_bins_statistics_charts,
make_primary_training_topic_statistics_charts,
make_primary_training_topic_pie_charts,
)
from shared_tools_module import save_variant_charts, csv_url_to_dataframe
from constants_module import NEOSERRA_COLUMNS, OUT_COLUMNS
# Funding groups used by the main block when no --funding argument is supplied.
DEFAULT_FUNDING_GROUPS = ['Core Services', 'LEXNET', 'PDA', 'NAP']
def parse_args() -> argparse.Namespace:
    """
    Build the command line interface for the trainings analysis script and parse sys.argv.

    Exactly one data source (--inputcsv or --exportmoduleurl) must be supplied,
    together with --outpath. Every other option has a default.

    Returns:
        argparse.Namespace: The parsed arguments. `funding` is None when no
        --funding flag was given; the caller substitutes its own default list.
    """
    parser = argparse.ArgumentParser(description="Clean Data and Generate Trainings Analysis Graphs")
    inputdata_group = parser.add_mutually_exclusive_group(required=True)

    # --- INPUT/OUTPUT ---
    inputdata_group.add_argument("--inputcsv",
                                 type=str,
                                 help="The path to the RAW Neoserra trainings CSV export.")
    inputdata_group.add_argument("--exportmoduleurl",
                                 type=str,
                                 help="The URL to the export module for this script")
    parser.add_argument("--outpath",
                        type=str,
                        required=True,
                        help="The base directory path to place generated files into.")
    parser.add_argument("--cleanedfilename",
                        type=str,
                        default="cleaned_trainings_data.csv",
                        help="The name to give the intermediate cleaned dataset.")
    parser.add_argument("--mapping",
                        type=str,
                        required=False,
                        help="Path to a JSON file to override default column names mappings.")
    parser.add_argument("--fiscalyear",
                        type=str,
                        default="FY25",
                        help="The label for the fiscal year to appear in graph titles.")
    parser.add_argument("--reportname",
                        type=str,
                        default="trainingsreport",
                        help="The report name used in the output file names to allow identification via ImageRegistry. The center name will be appended to the end of this to keep the number of images grabbed by the ImageRegistry smaller.")

    # Base Filenames (Prefixes)
    parser.add_argument("--name_bins", type=str, default="attendee-bins",
                        help="Base filename for attendee bins charts.")
    parser.add_argument("--name_topics", type=str, default="training-topics",
                        help="Base filename for primary topic charts.")
    parser.add_argument("--name_pie", type=str, default="topics-pie",
                        help="Base filename for network wide topic pie charts.")

    # NOTE: The default is deliberately None. With action='append', argparse appends the
    # user's values to whatever list is passed as `default` instead of replacing it, so the
    # real default list has to be substituted by the caller after parsing.
    parser.add_argument("--funding",
                        action='append',
                        required=False,
                        default=None,
                        help="A funding source to include in the analysis. Repeat the argument to include several funding sources; when omitted, the script's default funding groups are used.")
    return parser.parse_args()
if __name__ == "__main__":
    # Pipeline: parse arguments -> load raw data -> clean and save it -> filter by
    # funding source -> generate and save three chart families for every center.
    args = parse_args()

    # The --funding default is None (see parse_args); substitute the real default list here.
    if args.funding is None:
        args.funding = DEFAULT_FUNDING_GROUPS

    # Handle optional JSON mapping override
    if args.mapping:
        try:
            # Opening the file first surfaces a missing or unreadable path before any
            # mapping is applied; the mapping helpers themselves are given the path.
            with open(args.mapping, 'r'):
                NEOSERRA_COLUMNS.apply_json_mapping(args.mapping)
                OUT_COLUMNS.apply_json_mapping(args.mapping)
        except Exception as e:
            print(f'Failed to load user column configuration JSON file, got={e}')
            sys.exit(1)

    # Ensure output directory exists. exist_ok avoids the check-then-create race and
    # makes an --outpath that points at an existing regular file fail here, up front.
    try:
        os.makedirs(args.outpath, exist_ok=True)
    except OSError as e:
        print(f"Error creating output directory: {e}")
        sys.exit(1)

    # DATA CLEANING
    # Only one of the two sources is set (mutually exclusive group), so report whichever it is.
    data_source = args.inputcsv or args.exportmoduleurl
    print(f"Loading and Cleaning data from {data_source}...\n")
    try:
        if args.inputcsv:
            trainings_df = pd.read_csv(args.inputcsv)
        elif args.exportmoduleurl:
            trainings_df = csv_url_to_dataframe(args.exportmoduleurl)
        else:
            raise RuntimeError("A data source for this script has not been defined")

        # Filter for reportable records only.
        # This will fail with a KeyError if the column is missing, as required.
        trainings_df = trainings_df[trainings_df[NEOSERRA_COLUMNS.reportable] == 1]

        trainings_df = generate_cleaned_trainings_dataset(
            trainings_df,
            col_neo_event_title=NEOSERRA_COLUMNS.event_title,
            col_neo_primary_topic=NEOSERRA_COLUMNS.primary_training_topic,
            col_neo_training_topics=NEOSERRA_COLUMNS.training_topics,
            col_neo_center=NEOSERRA_COLUMNS.center,
            col_is_preplanning=OUT_COLUMNS.is_preplanning,
            col_neo_attendees_total=NEOSERRA_COLUMNS.attendees_total,
            col_out_attendees_range=OUT_COLUMNS.attendees_range,
        )

        # Save cleaned data (written before the funding filter below is applied)
        clean_out = os.path.join(args.outpath, args.cleanedfilename)
        trainings_df.to_csv(str(clean_out), index=False)
        print(f"Cleaned dataset saved to {clean_out}\n")
    except Exception as e:
        print(f"Failed to clean input CSV: {e}")
        sys.exit(1)

    # Filter the dataset for only the funding sources we care about
    trainings_df = trainings_df[trainings_df[NEOSERRA_COLUMNS.funding_source].isin(args.funding)]

    # Values for "First Steps"
    first_steps_values = ['First Steps', 'Next Steps']

    # Find all centers that we have in this data set
    unique_centers = trainings_df[NEOSERRA_COLUMNS.center].unique().tolist()

    for center in unique_centers:
        center_trainings_df = trainings_df[(trainings_df[NEOSERRA_COLUMNS.center] == center)]
        # Every chart family for this center is saved under the same report identifier.
        center_report = f'{args.reportname}{center}'
        print(f"Starting generation for center {center}...\n")

        # Attendee Bins Charts (Uses cleaned trainings_df)
        print(" Generating Attendee Bins charts...")
        bins_figs = make_attendee_bins_statistics_charts(
            center_trainings_df,
            center=center,
            fiscal_year_tag=args.fiscalyear,
            first_steps_vals=first_steps_values,
            preplanning_val=OUT_COLUMNS.val_preplanning,
            col_neo_attendees_total=NEOSERRA_COLUMNS.attendees_total,
            col_attendees_range=OUT_COLUMNS.attendees_range,
            col_neo_primary_topic=NEOSERRA_COLUMNS.primary_training_topic
        )
        save_variant_charts(
            chart_dict=bins_figs,
            base_path=args.outpath,
            report=center_report,
            chart_type=args.name_bins)

        # Primary Topic Charts
        print(" Generating Primary Topic charts...")
        topic_figs = make_primary_training_topic_statistics_charts(
            center_trainings_df,
            center=center,
            fiscal_year_tag=args.fiscalyear,
            first_steps_vals=first_steps_values,
            col_neo_primary_topic=NEOSERRA_COLUMNS.primary_training_topic,
            col_neo_attendees_total=NEOSERRA_COLUMNS.attendees_total
        )
        save_variant_charts(
            chart_dict=topic_figs,
            base_path=args.outpath,
            report=center_report,
            chart_type=args.name_topics)

        # Topic Pie Charts
        print(" Generating Topic Pie charts...")
        pie_figs = make_primary_training_topic_pie_charts(
            center_trainings_df,
            center=center,
            fiscal_year_tag=args.fiscalyear,
            first_steps_vals=first_steps_values,
            col_neo_attendees_total=NEOSERRA_COLUMNS.attendees_total,
            col_neo_primary_topic=NEOSERRA_COLUMNS.primary_training_topic
        )
        save_variant_charts(
            chart_dict=pie_figs,
            base_path=args.outpath,
            report=center_report,
            chart_type=args.name_pie
        )

    print("\nDONE!")

View File

@@ -0,0 +1,165 @@
# FILE: pasbdc_center_topic_analysis_script.py
# CREATED: 12/26/25
# AUTHOR: Vincent Allen
# PURPOSE: Script to generate per-center training topic analysis graphs across funding groups.
# Third party libraries
import pandas as pd
import sys
import os.path
import argparse
import json
import re
from shared_tools_module import save_variant_charts, csv_url_to_dataframe
from section_1_datasets_module import generate_cleaned_trainings_dataset
from section_3_graph_library_module import make_center_topic_analysis
from constants_module import NEOSERRA_COLUMNS, OUT_COLUMNS
# The default funding groups defined in the requirements.
# Used by the main block when no --funding argument is supplied.
DEFAULT_FUNDING_GROUPS = ['Core Services', 'LEXNET', 'PDA', 'NAP']
def parse_args() -> argparse.Namespace:
    """
    Build the command line interface for the center topic analysis script and parse sys.argv.

    Exactly one data source (--inputcsv or --exportmoduleurl) must be supplied,
    together with --outpath. Every other option has a default.

    Returns:
        argparse.Namespace: The parsed arguments. `funding` is None when no
        --funding flag was given; the caller substitutes its own default list.
    """
    parser = argparse.ArgumentParser(description="Generate Center-Specific Topic Analysis Graphs")
    datasource_group = parser.add_mutually_exclusive_group(required=True)

    # --- INPUT/OUTPUT ---
    datasource_group.add_argument("--inputcsv",
                                  type=str,
                                  help="The path to the raw Neoserra trainings CSV.")
    datasource_group.add_argument("--exportmoduleurl",
                                  type=str,
                                  help="The url to the export module created for this script.")
    parser.add_argument("--outpath",
                        type=str,
                        required=True,
                        help="The base directory path to place generated files into.")
    parser.add_argument("--mapping",
                        type=str,
                        required=False,
                        help="Path to a JSON file to override default column names mappings.")
    parser.add_argument("--fiscalyear",
                        type=str,
                        default="FY25",
                        help="The label for the fiscal year to appear in graph titles.")
    parser.add_argument("--reportname", type=str, default="trainingsreport",
                        help="Prefix for the output files. Used by ImageRegistry in later parts of the pipeline to find the correct report's images")
    parser.add_argument("--chartcategorybase", type=str, default="topicanalysis",
                        help="The chart category to use for this graph. It will be the second part of the filename and will have the center name for the chart appended to this value in the filename.")

    # NOTE: The default is deliberately None. With action='append', argparse appends the
    # user's values to whatever list is passed as `default` instead of replacing it, so the
    # real default list has to be substituted by the caller after parsing.
    parser.add_argument("--funding",
                        action='append',
                        required=False,
                        default=None,
                        help="A funding source to include in the analysis. Repeat the argument to include several funding sources; when omitted, the script's default funding groups are used.")
    return parser.parse_args()
def sanitize_filename(name: str) -> str:
    """
    Turn an arbitrary label (for example a center name) into a lowercase filename token.

    Each space is swapped for an underscore first; afterwards every character that
    is neither a word character (letter, digit, underscore) nor a hyphen is dropped.
    """
    # Splitting on a single space and re-joining with "_" maps every space to one underscore.
    underscored = "_".join(name.split(" "))
    # Strip anything outside the hyphen / word-character whitelist.
    whitelisted = re.sub(r'(?u)[^-\w]', '', underscored)
    return whitelisted.lower()
if __name__ == "__main__":
    # Pipeline: parse arguments -> load raw data -> keep reportable rows -> clean ->
    # filter by funding source -> generate and save topic analysis charts per center.
    args = parse_args()

    # The --funding default is None (see parse_args); substitute the real default list here.
    if args.funding is None:
        args.funding = DEFAULT_FUNDING_GROUPS

    # Handle optional JSON mapping override
    if args.mapping:
        try:
            # Opening the file first surfaces a missing or unreadable path before any
            # mapping is applied; the mapping helpers themselves are given the path.
            with open(args.mapping, 'r'):
                NEOSERRA_COLUMNS.apply_json_mapping(args.mapping)
                OUT_COLUMNS.apply_json_mapping(args.mapping)
        except Exception as e:
            print(f'Failed to load user column configuration JSON file, got={e}')
            sys.exit(1)

    # Ensure output directory exists. exist_ok avoids the check-then-create race and
    # makes an --outpath that points at an existing regular file fail here, up front.
    try:
        os.makedirs(args.outpath, exist_ok=True)
    except OSError as e:
        print(f"Error creating output directory: {e}")
        sys.exit(1)

    # Only one of the two sources is set (mutually exclusive group), so report whichever it is.
    data_source = args.inputcsv or args.exportmoduleurl
    print(f"Loading input data from {data_source}...\n")
    try:
        if args.inputcsv:
            trainings_df = pd.read_csv(args.inputcsv)
        elif args.exportmoduleurl:
            trainings_df = csv_url_to_dataframe(args.exportmoduleurl)
        else:
            raise RuntimeError("A datasource was not defined in the script.")
    except Exception as e:
        print(f"Failed to read input CSV: {e}")
        sys.exit(1)

    # Filter for reportable records only.
    # This will fail with a KeyError if the column is missing, as required.
    trainings_df = trainings_df[trainings_df[NEOSERRA_COLUMNS.reportable] == 1]

    print("Cleaning input data...")
    trainings_df = generate_cleaned_trainings_dataset(
        trainings_df,
        col_neo_event_title=NEOSERRA_COLUMNS.event_title,
        col_neo_primary_topic=NEOSERRA_COLUMNS.primary_training_topic,
        col_neo_training_topics=NEOSERRA_COLUMNS.training_topics,
        col_neo_center=NEOSERRA_COLUMNS.center,
        col_is_preplanning=OUT_COLUMNS.is_preplanning,
        col_neo_attendees_total=NEOSERRA_COLUMNS.attendees_total,
        col_out_attendees_range=OUT_COLUMNS.attendees_range,
    )

    print("Starting generation for center specific charts\n")

    # Filter Data by Funding Group
    # The library function calculates stats based on whatever dataframe it receives.
    # To get stats specific to the selected funding groups, we must filter *before* passing.
    filtered_df = trainings_df[trainings_df[NEOSERRA_COLUMNS.funding_source].isin(args.funding)].copy()
    if filtered_df.empty:
        # Not fatal: the center list below is then empty and no charts are produced.
        print(f"No data found for Funding Groups {args.funding}.")

    # Identify Centers in this filtered subset
    centers_in_group = filtered_df[NEOSERRA_COLUMNS.center].unique()  # pyright:ignore
    print(f"\tFound {len(centers_in_group)} centers with data in this funding group: {args.funding}")
    print(f"\tCenters: {centers_in_group}")

    # Iterate through Centers and make charts for each. A failure for one center is
    # reported and remembered, and the remaining centers are still processed.
    failed_centers = []
    for center_name in centers_in_group:
        print(f'\tGenerating center {center_name}')
        try:
            # Call library function
            center_figs = make_center_topic_analysis(
                trainings_df=filtered_df,
                center_name=center_name,
                fiscal_year_tag=args.fiscalyear,
                col_neo_center=NEOSERRA_COLUMNS.center,
                col_neo_primary_topic=NEOSERRA_COLUMNS.primary_training_topic
            )
            # NOTE(review): center_name goes into the file name unmodified; the
            # sanitize_filename() helper in this file is not applied here. Confirm what
            # downstream image lookups expect before changing the naming.
            save_variant_charts(
                chart_dict=center_figs,
                base_path=args.outpath,
                report=f"{args.reportname}",
                chart_type=f"{args.chartcategorybase}{center_name}"
            )
        except Exception as e:
            print(f" Error processing center '{center_name}': {e}")
            failed_centers.append(center_name)

    # Do not claim full success when some centers failed. The exit status is left
    # unchanged so callers that only look at the return code behave as before.
    if failed_centers:
        print(f"\nDONE with errors. {len(failed_centers)} of {len(centers_in_group)} centers failed: {failed_centers}")
    else:
        print("\nDONE! All center topic analysis charts generated.")