first commit

This commit is contained in:
2026-05-21 08:40:24 -04:00
commit b084545275
711 changed files with 3659856 additions and 0 deletions

View File

@@ -0,0 +1,14 @@
# scripts/dataset/pyproject.toml
[build-system]
requires = ["setuptools>=64.0.0", "wheel"]
build-backend = "setuptools.build_meta"
[project]
name = "section_1_datasets_module"
version = "0.1.0"
description = "Internal dataset generation scripts for the PASBDC network wide desk reviews."
[tool.setuptools]
packages = ["section_1_datasets_module"]

View File

@@ -0,0 +1,4 @@
Metadata-Version: 2.4
Name: section_1_datasets_module
Version: 0.1.0
Summary: Internal dataset generation scripts for the PASBDC network wide desk reviews.

View File

@@ -0,0 +1,11 @@
pyproject.toml
section_1_datasets_module/__init__.py
section_1_datasets_module/make_client_list_dataset.py
section_1_datasets_module/make_county_naics_dataset.py
section_1_datasets_module/make_nps_dataset.py
section_1_datasets_module/make_satisfaction_survey_dataset.py
section_1_datasets_module/make_trainings_dataset.py
section_1_datasets_module.egg-info/PKG-INFO
section_1_datasets_module.egg-info/SOURCES.txt
section_1_datasets_module.egg-info/dependency_links.txt
section_1_datasets_module.egg-info/top_level.txt

View File

@@ -0,0 +1 @@
section_1_datasets_module

View File

@@ -0,0 +1,20 @@
# scripts/dataset/__init__.py
from .make_client_list_dataset import generate_client_list_dataset, get_pa_naics_data, get_bls_naics11_data, get_bls_naics92_data, create_naics_census_percentage_table
from .make_county_naics_dataset import make_county_naics_dataset
from .make_satisfaction_survey_dataset import make_survey_dataset
from .make_trainings_dataset import generate_cleaned_trainings_dataset, generate_center_trainings_count_statistics
# Public names exported by `from <package> import *`.
# create_naics_census_percentage_table is imported above from
# make_client_list_dataset, so it is listed here with the other helpers.
__all__ = [
    'generate_client_list_dataset',
    'make_county_naics_dataset',
    'make_survey_dataset',
    'generate_cleaned_trainings_dataset',
    'generate_center_trainings_count_statistics',
    'get_pa_naics_data',
    'get_bls_naics92_data',
    'get_bls_naics11_data',
    'create_naics_census_percentage_table',
]

View File

@@ -0,0 +1,536 @@
# FILE: make_client_list_dataset.py
# CREATED: 12/16/25
# AUTHOR: Vincent Allen
# CONTACT: vincent@vtallen.com valle276@live.kutztown.edu
# PURPOSE:
# Script was created using an export from this neoserra link: https://pasbdc.neoserra.com/clients?__formid=3&remove=&savename=&sort=CLIENT_ID&sortdir=ASC&expr=&field_1=REVIEWID&opt_1=13539873&field_2=&sortdir=ASC
# If you are someone in the future troubleshooting this, start there and see what changed between then and your current data.
# I tried to make things as modular as I can, you might be able to modify some constants and get it to work. All the math should still be good
# External libraries:
from typing import Dict, Tuple
import pandas as pd
import plotly.express as px
import requests
import numpy as np
import socket
import urllib
import urllib3.util.connection as urllib3_cn
urllib3_cn.allowed_gai_family = lambda: socket.AF_INET
# Python modules:
import argparse
import json
import sys
from pathlib import Path
import io
# Custom libraries:
# These libraries need to be installed from their git submodules into the venv used to execute this script
# pip3 install -e <path to library folder>
# If I'm gone and you do not know how to do this,
# just drag the python files into this folder and import the needed functions from those files and it should work. Or just copy the functions into this file
from pasbdc_data_cleaning import clean_center_name, tag_county_out_of_state # pyright:ignore
from constants_module import NEOSERRA_COLUMNS, OUT_COLUMNS
def get_pa_naics_data(year:str="2022", ucgid:str='0400000US42', timeout:float=60.0) -> pd.DataFrame:
    """
    parameters:
        year:str - The census year to get data from
        ucgid:str - The Uniform Census Geography Identifier to obtain data for (defaults to PA)
        timeout:float - Seconds to wait on the census API before the request is abandoned
    returns:
        pd.DataFrame - A dataframe of the returned data table. An empty dataframe is returned when the
        API answers with a status other than 200 or with an empty table.
    raises:
        requests.exceptions.RequestException - When the request itself fails (connection error, timeout, ...)
    description:
        Queries the census API to obtain NAICS data for the state of PA. Parameters must be from a valid census year with a valid geography
        identifier.
    """
    # NOTE(review): the group name EC2200BASIC is fixed while the year is a parameter;
    # other census years presumably need a different group name - confirm before changing `year`.
    api_url = f'https://api.census.gov/data/{year}/ecnbasic?get=group(EC2200BASIC)&NAICS{year}=pseudo(N0200.00)&ucgid={ucgid}'
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    try:
        # Without a timeout a stalled connection would block the script forever
        response = requests.get(api_url, headers=request_headers, timeout=timeout)
        if response.status_code != 200:
            print(f"Census API returned HTTP {response.status_code}; returning an empty dataframe.")
            return pd.DataFrame()
        table_data = response.json()
    except requests.exceptions.RequestException:
        print("Failed to get NAICS census data from the API. Verify that the url is still correct and that your parameters are from a valid census year.")
        raise

    if not table_data:
        return pd.DataFrame()

    # The first entry of the payload is used as the column names, the rest as data rows
    column_names = table_data[0]
    rows = table_data[1:]
    return pd.DataFrame(rows, columns=column_names)
def get_bls_naics92_data(year:str="2022", area_code:str="42000", timeout:float=60.0) -> pd.DataFrame:
    """
    parameters:
        year:str - The census year to obtain bls data for
        area_code:str - The state code to obtain data for. See BLS API docs for valid values
        timeout:float - Seconds to wait on the BLS server before the request is abandoned
    returns:
        pd.DataFrame - A dataframe of the bls data on government institutions in PA
    description:
        This function queries the BLS api for their census data on the number of public administration organizations in PA.
        This allows us to show accurate NAICS data as this is not included in the main census data.
        This function uses an http stream instead of plain requests to get the data. If you try any other way the script will get blocked
        as a bot.
    """
    # `import urllib` alone does not load the `request` submodule; import it explicitly
    # so this function does not depend on another library having imported it first.
    import urllib.request

    api_url=f"http://data.bls.gov/cew/data/api/{year}/a/area/{area_code}.csv"
    req = urllib.request.Request(
        api_url,
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
    )
    # The context manager closes the stream even when read() raises
    with urllib.request.urlopen(req, timeout=timeout) as http_stream:
        raw_csv = http_stream.read()
    return pd.read_csv(io.StringIO(raw_csv.decode('UTF-8')))
def get_bls_naics11_data(
    api_key:str="72A4453A-E4CB-3D1C-BF42-03B6FAF9E7E6",
    source_desc:str="CENSUS",
    sector_desc:str="ECONOMICS",
    group_desc:str="FARMS & LAND & ASSETS",
    commodity_desc:str="FARM OPERATIONS",
    unit_desc:str="OPERATIONS",
    agg_level:str="STATE",
    state_alpha:str="PA",
    domain_desc:str="TOTAL",
    short_desc:str="FARM OPERATIONS - NUMBER OF OPERATIONS",
    year:str="2022",
    timeout:float=60.0
) -> pd.DataFrame:
    """
    parameters:
        please only change the api_key if you expect this one to work.
        api_key:str - The USDA quickstats API key. May need to be changed, you can get a new one
        source_desc:str="CENSUS",
        sector_desc:str="ECONOMICS",
        group_desc:str="FARMS & LAND & ASSETS",
        commodity_desc:str="FARM OPERATIONS",
        unit_desc:str="OPERATIONS",
        agg_level:str="STATE" - sent to the API as the agg_level_desc query parameter
        state_alpha:str="PA",
        domain_desc:str="TOTAL",
        short_desc:str="FARM OPERATIONS - NUMBER OF OPERATIONS",
        year:str="2022"
        timeout:float - Seconds to wait on the API before the request is abandoned
    returns:
        pd.DataFrame - The number of farm operations data returned as a pandas dataframe
    raises:
        Exception - When the API answers with a status other than 200
    description:
        Queries the USDA census data for the number of farm operations in PA. Please do not change the parameters except for the year (to a valid census year)
        and the API key.
        Only the default combination of parameters has been validated against the API.
    """
    # NOTE(review): a real-looking API key is committed as the default value; consider
    # supplying it from the command line or environment and rotating this one.

    # `import urllib` alone does not load the `parse` submodule; import what is needed here.
    from urllib.parse import urlencode

    # urlencode escapes every value (spaces -> '+', '&' -> '%26'), so values other than the
    # defaults cannot break the query string. For the defaults the URL is unchanged.
    query = urlencode({
        "key": api_key,
        "source_desc": source_desc,
        "sector_desc": sector_desc,
        "group_desc": group_desc,
        "commodity_desc": commodity_desc,
        "unit_desc": unit_desc,
        "agg_level_desc": agg_level,
        "state_alpha": state_alpha,
        "domain_desc": domain_desc,
        "short_desc": short_desc,
        "year": year,
        "format": "csv",
    })
    api_url = f"https://quickstats.nass.usda.gov/api/api_GET/?{query}"
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    # Without a timeout a stalled connection would block the script forever
    result = requests.get(api_url, headers=request_headers, timeout=timeout)
    if result.status_code != 200:
        raise Exception(f"Could not query quickstats API, check the url! (HTTP {result.status_code})")
    return pd.read_csv(io.StringIO(result.text))
def parse_args(argv=None):
    """
    parameters:
        argv - Optional list of argument strings to parse. When None (the default) argparse
               reads the arguments from sys.argv, exactly as before.
    returns: argparse.Namespace - The parsed command line arguments from the argparse module
    description:
        Uses the python argparse module to parse the command line arguments for this application
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-c", "--censusyear",
                        type=str,
                        default="2022",
                        required=False,
                        help='The census year to use to obtain NAICS code data. Must be a valid census year.')
    parser.add_argument("-o", "--out",
                        type=str,
                        default="pasbdc_cleaned_client_data.csv",
                        required=False,
                        help="The filename to write the output dataset to."
                        )
    parser.add_argument("-t", "--tableout",
                        type=str,
                        default="",
                        required=False,
                        help="The filename to write the output NAICs census data to."
                        )
    parser.add_argument("-u", "--usdaapikey",
                        default="72A4453A-E4CB-3D1C-BF42-03B6FAF9E7E6",
                        required=False,
                        help="The API key for the USDA statistics API")
    parser.add_argument("-i", "--inputcsv",
                        required=True,
                        help="The filename of the input Neoserra client list data obtained from the active clients scorecard with the following columns: Client ID,Client,Last Counseling,Center,Physical Address County,Physical Address ZIP Code,Primary NAICS,NAICs. Columns must be named as given, or the cleaning script will not work. If you need custom columns, set them with the command line arguments")
    parser.add_argument("-m", "--mapping",
                        type=str,
                        required=False,
                        help="Path to a JSON file to override default column names mappings.")
    return parser.parse_args(argv)
def create_naics_census_percentage_table(
    df_naics_census:pd.DataFrame,
    df_naics_11:pd.DataFrame,
    df_naics_92:pd.DataFrame,
    col_bls_industry:str = OUT_COLUMNS.bls_industry,
    col_bls_estab:str = OUT_COLUMNS.bls_estab,
    col_usda_value:str = OUT_COLUMNS.usda_value,
    col_unified_naics:str = OUT_COLUMNS.unified_naics,
    col_census_estab:str = OUT_COLUMNS.census_estab,
    col_census_pct:str = OUT_COLUMNS.census_pct,
    col_naics_label:str= OUT_COLUMNS.naics_label,
    col_census_naics:str= OUT_COLUMNS.census_naics
) -> pd.DataFrame:
    """
    parameters:
        df_naics_census:pd.DataFrame - USA NAICS Census data for a region (not modified by this function)
        df_naics_11:pd.DataFrame - USDA data; the value in col_usda_value of its first row is used as the NAICS 11 count
        df_naics_92:pd.DataFrame - BLS data; col_bls_estab is summed over the rows whose col_bls_industry is "92"
        col_bls_industry:str - Name of the BLS industry code column
        col_bls_estab:str - Name of the BLS establishment count column
        col_usda_value:str - Name of the USDA value column
        col_unified_naics:str - Name of the unified NAICS code column
        col_census_estab:str - Name of the census establishment count column
        col_census_pct:str - Name of the output percentage column
        col_naics_label:str - Name of the column holding the NAICS code description
        col_census_naics:str - The column in the census NAICS data that contains the NAICS code
    returns: pd.DataFrame - The constructed NAICS industry table dataframe
    description:
        Takes in the NAICS census data from the US census API together with the NAICS 92 and NAICS 11 source data
        and produces a data table containing columns with a NAICS code, a description of that code, and the
        percentage that that code makes up of the businesses in the census
    """
    # Work on a copy so the caller's dataframe is not altered as a side effect
    census_df = df_naics_census.copy()

    # Ensure that the establishment count column is an int
    census_df[col_census_estab] = census_df[col_census_estab].map(int)

    # astype(str) keeps the comparison working even if the industry column was not read as text
    is_naics_92 = df_naics_92[col_bls_industry].astype(str).str.strip() == "92"
    naics_92_count = df_naics_92[is_naics_92][col_bls_estab].sum()

    # Get the count of all naics 11 organizations (the value may contain thousands separators)
    naics_11_count = int(str(df_naics_11.iloc[0][col_usda_value]).replace(",", ""))
    total_establishments = census_df[col_census_estab].sum() + naics_92_count + naics_11_count

    # The census data may contain the NAICS column twice (selecting it then yields a DataFrame);
    # take the first occurrence in that case, or the column itself when it is unique.
    naics_codes = census_df[col_census_naics]
    if isinstance(naics_codes, pd.DataFrame):
        naics_codes = naics_codes.iloc[:, 0]
    census_df[col_unified_naics] = naics_codes

    # Group the records by their NAICS code, then aggregate the sums for the groups
    naics_df = census_df.groupby(
        [col_unified_naics]
    ).agg({col_census_estab: "sum", col_naics_label: "first"}).reset_index()

    # Rows for the sectors that are not part of the census data
    naics_92_data = pd.DataFrame({
        col_unified_naics: ["92"],
        col_naics_label: ["Government Institutions (local, state, and federal)"],
        col_census_estab: [naics_92_count]
    })
    naics_11_data = pd.DataFrame({
        col_unified_naics: ["11"],
        col_naics_label: ["Farm Operations"],
        col_census_estab: [naics_11_count]
    })
    # So we have a sentinel value that means no value. NAICS 0 does not exist
    missing_data = pd.DataFrame({
        col_unified_naics: ["0"],
        col_naics_label: ["Missing NAICS value"],
        col_census_estab: [0]
    })
    naics_df = pd.concat([naics_df, naics_92_data, naics_11_data, missing_data], ignore_index=True)
    naics_df = naics_df.sort_values(by=col_unified_naics, ascending=False)
    naics_df[col_census_pct] = (naics_df[col_census_estab] / total_establishments) * 100
    # Final ordering: largest share first
    naics_df = naics_df.sort_values(by=[col_census_pct], ascending=False)
    return naics_df
def tag_two_digit_naics(
    row,
    col_neo_naics:str = NEOSERRA_COLUMNS.naics,
    col_neo_primary_naics:str = NEOSERRA_COLUMNS.primary_naics,
    col_naics_2:str = OUT_COLUMNS.naics_2,
    bypass_secondary_naics_list:bool = False,
):
    """
    parameters:
        row: pd.Series - The row to operate on
        col_neo_naics:str - Name of the Neoserra NAICS column
        col_neo_primary_naics:str - Name of the Neoserra Primary NAICS column
        col_naics_2:str - Name of the output 2-digit NAICS column
        bypass_secondary_naics_list:bool = False - Only look at the primary NAICS column and never read col_neo_naics
    returns:
        pd.Series - The same row with col_naics_2 set
    description:
        For use with the .apply method with axis=1.
        Writes the first two digits of the row's NAICS code into col_naics_2. The primary NAICS value is
        preferred; the secondary value is used only when the primary one is missing. When no value is
        available np.nan is written instead.
    """
    def _leading_two_digits(code):
        return int(str(code)[:2])

    # Candidates in order of preference
    if bypass_secondary_naics_list:
        candidates = [row[col_neo_primary_naics]]
    else:
        secondary_code = row[col_neo_naics]
        candidates = [row[col_neo_primary_naics], secondary_code]

    present = [code for code in candidates if not pd.isna(code)]
    if present:
        row[col_naics_2] = _leading_two_digits(present[0])
    else:
        row[col_naics_2] = np.nan
    return row
def tag_naics_percentatge(
    row,
    naics_mapping:Dict[int, float],
    col_naics_2:str = OUT_COLUMNS.naics_2,
    col_pa_naics_pct:str = OUT_COLUMNS.pa_naics_pct
):
    """
    parameters:
        row:pd.Series or row like object - The row of data to tag
        naics_mapping:Dict[int, float] - The mapping of naics code to the associated census percentage value
        col_naics_2:str - Name of the 2-digit NAICS column
        col_pa_naics_pct:str - Name of the output PA census percentage column
    returns:
        pd.Series - The row, with col_pa_naics_pct set when a percentage could be looked up
    description:
        Looks up the row's 2-digit NAICS code in naics_mapping and stores the result in col_pa_naics_pct.
        When the lookup fails the row is returned unchanged; a message is printed unless the code is
        missing or is 11 or 92.
    """
    try:
        row[col_pa_naics_pct] = naics_mapping[int(row[col_naics_2])]
    except Exception:
        code = row[col_naics_2]
        # Missing codes are expected and skipped without a message
        if pd.isna(code):
            return row
        # Codes 11 and 92 are skipped without a message as well
        if int(code) in (11, 92):
            return row
        print(type(code), code, "did not have an associated percentage")
    return row
def tag_pasbdc_percentage(
    series,
    naics_value_counts:Dict[int, int],
    total_clients:int,
    col_naics_2:str = OUT_COLUMNS.naics_2,
    col_pasbdc_pct:str = OUT_COLUMNS.pasbdc_pct
):
    """
    parameters:
        series - The row of data to operate on
        naics_value_counts:Dict[int, int] - The mapping of NAICS value to the count of that naics value within the Neoserra client data
        total_clients:int - The total number of clients in the neoserra client data
        col_naics_2:str - Name of the 2-digit NAICS column
        col_pasbdc_pct:str - Name of the output PASBDC percentage column
    returns:
        The row; col_pasbdc_pct is set only when the row has a NAICS value
    description:
        Stores the share (in percent) of clients that have the same 2-digit NAICS code as this row.
    """
    naics_code = series[col_naics_2]
    # Rows without a NAICS code are passed through untouched
    if pd.isna(naics_code):
        return series
    share_of_clients = int(naics_value_counts[naics_code]) / total_clients
    series[col_pasbdc_pct] = share_of_clients * 100
    return series
# NOTE: The triple-quoted string below is disabled code kept for reference only.
# As a bare string expression it has no effect at runtime. It contains the same three
# data-fetching steps (NAICS 11, NAICS 92, census NAICS) that the __main__ block performs.
'''
print("Obtaining NAICS 11 data.")
# get pa naics 11 data
df_naics_11 = pd.DataFrame()
try:
df_naics_11 = get_bls_naics11_data(
api_key=usda_api_key,
year=census_year)
except Exception as e:
print("Failed to get naics 11 data from the USDA, check your API key and internet connection.")
raise e
print("Obtaining NAICS 92 data.")
# Get PA naics 92 data
df_naics_92 = pd.DataFrame()
try:
df_naics_92 = get_bls_naics92_data(year=census_year)
except Exception as e:
print("Failed to get naics 92 data from the BLS, check your internet connection and the census year.")
raise e
print("Obtaining PA census NAICS data.")
df_naics_census = pd.DataFrame()
try:
df_naics_census = get_pa_naics_data(year=census_year)
except Exception as e:
print("Failed to obtain census naics data from the census api. Check API parameters and your internet connection and try again.")
raise e
'''
def generate_client_list_dataset(
    naics_df:pd.DataFrame,
    df_client_list:pd.DataFrame,
    col_unified_naics:str = OUT_COLUMNS.unified_naics,
    col_census_pct:str = OUT_COLUMNS.census_pct,
    col_naics_2:str = OUT_COLUMNS.naics_2,
    col_pa_naics_pct:str = OUT_COLUMNS.pa_naics_pct,
    col_pasbdc_pct:str = OUT_COLUMNS.pasbdc_pct,
    col_neo_primary_naics:str = NEOSERRA_COLUMNS.primary_naics,
    col_neo_naics:str = NEOSERRA_COLUMNS.naics,
    bypass_secondary_naics_list:bool = False
) -> pd.DataFrame:
    """
    parameters:
        naics_df:pd.DataFrame - The NAICS percentage table (one row per NAICS code or code range such as "31-33")
        df_client_list:pd.DataFrame - The Neoserra client list data (not modified by this function)
        col_unified_naics:str - Column name for unified NAICS code
        col_census_pct:str - Column name for Census percentage
        col_naics_2:str - Column name for 2-digit NAICS
        col_pa_naics_pct:str - Column name for PA NAICS percentage
        col_pasbdc_pct:str - Column name for PASBDC NAICS percentage
        col_neo_primary_naics:str - Column name for Neoserra Primary NAICS
        col_neo_naics:str - Column name for Neoserra NAICS
        bypass_secondary_naics_list:bool - Only use the primary NAICS column and never read col_neo_naics
    returns:
        pd.DataFrame - The client list tagged with the 2-digit NAICS code, the census percentage of that
        code, the share of clients with that code, and the result of tag_county_out_of_state
    description:
        Generates the dataset using parameterized column names. Clients without a usable NAICS code
        get the sentinel code 0.
    """
    # Work on a copy so the caller's dataframe is not altered as a side effect
    df_client_list = df_client_list.copy()

    def _as_text_or_nan(column):
        # Normalise to text while keeping missing values as real NaN
        return column.astype(str).replace(['nan', 'None'], np.nan)

    naics_source = _as_text_or_nan(df_client_list[col_neo_primary_naics])
    if not bypass_secondary_naics_list:
        # Fall back to the secondary NAICS column where the primary one is missing
        naics_source = naics_source.fillna(_as_text_or_nan(df_client_list[col_neo_naics]))
    df_client_list[col_naics_2] = pd.to_numeric(naics_source.str[:2], errors='coerce').fillna(0)

    # Build the lookup from 2-digit NAICS code to census percentage.
    # A label of the form "a-b" covers every code from a to b inclusive.
    naics_mapping = {0: 0.0}
    for code_label, census_pct in zip(naics_df[col_unified_naics], naics_df[col_census_pct]):
        bounds = str(code_label).split('-')
        if len(bounds) == 2:
            for code in range(int(bounds[0]), int(bounds[1]) + 1):
                naics_mapping[code] = float(census_pct)
        else:
            naics_mapping[int(code_label)] = float(census_pct)
    df_client_list[col_pa_naics_pct] = df_client_list[col_naics_2].map(naics_mapping)

    naics_value_counts = df_client_list[col_naics_2].value_counts() # pyright: ignore
    total_clients = naics_value_counts.sum()
    if total_clients > 0:
        df_client_list[col_pasbdc_pct] = (df_client_list[col_naics_2].map(naics_value_counts) / total_clients) * 100
    else:
        # Keep the output schema stable even when there are no clients to count
        df_client_list[col_pasbdc_pct] = np.nan

    # =============================
    # MISC attributes
    # =============================
    # Tag each row with in or out of state based on its county
    df_client_list = df_client_list.apply(tag_county_out_of_state, axis=1)
    return df_client_list #pyright: ignore
# Command line entry point: fetch the three reference datasets, build the NAICS
# percentage table, tag the client list with it and write the results to disk.
if __name__ == "__main__":
    args = parse_args()
    if args.mapping:
        NEOSERRA_COLUMNS.apply_json_mapping(args.mapping)
        OUT_COLUMNS.apply_json_mapping(args.mapping)

    print("Obtaining NAICS 11 data.")
    # get pa naics 11 data
    try:
        df_naics_11 = get_bls_naics11_data(
            api_key=args.usdaapikey,
            year=args.censusyear)
    except Exception:
        print("Failed to get naics 11 data from the USDA, check your API key and internet connection.")
        raise

    print("Obtaining NAICS 92 data.")
    # Get PA naics 92 data
    try:
        df_naics_92 = get_bls_naics92_data(year=args.censusyear)
    except Exception:
        print("Failed to get naics 92 data from the BLS, check your internet connection and the census year.")
        raise

    print("Obtaining PA census NAICS data.")
    try:
        df_naics_census = get_pa_naics_data(year=args.censusyear)
    except Exception:
        print(
            "Failed to obtain census naics data from the census api. Check API parameters and your internet connection and try again.")
        raise

    df_client_list = pd.read_csv(args.inputcsv)

    # Column names are passed explicitly because the functions' default arguments were
    # bound when the module was loaded, i.e. before any JSON mapping was applied above.
    # The BLS/USDA column names are passed for the same reason.
    naics_df = create_naics_census_percentage_table(
        df_naics_census=df_naics_census,
        df_naics_92=df_naics_92,
        df_naics_11=df_naics_11,
        col_bls_industry=OUT_COLUMNS.bls_industry,
        col_bls_estab=OUT_COLUMNS.bls_estab,
        col_usda_value=OUT_COLUMNS.usda_value,
        col_unified_naics=OUT_COLUMNS.unified_naics,
        col_census_estab=OUT_COLUMNS.census_estab,
        col_census_pct=OUT_COLUMNS.census_pct,
        col_census_naics=OUT_COLUMNS.census_naics,
        col_naics_label=OUT_COLUMNS.naics_label
    )
    client_list_df = generate_client_list_dataset(
        naics_df=naics_df,
        df_client_list=df_client_list,
        col_unified_naics=OUT_COLUMNS.unified_naics,
        col_census_pct=OUT_COLUMNS.census_pct,
        col_naics_2=OUT_COLUMNS.naics_2,
        col_pa_naics_pct=OUT_COLUMNS.pa_naics_pct,
        col_pasbdc_pct=OUT_COLUMNS.pasbdc_pct,
        col_neo_primary_naics=NEOSERRA_COLUMNS.primary_naics,
        col_neo_naics=NEOSERRA_COLUMNS.naics,
    )
    client_list_df.to_csv(args.out, index=False)
    # The NAICS table is only written when an output filename was given
    if args.tableout:
        naics_df.to_csv(args.tableout, index=False)

View File

@@ -0,0 +1,222 @@
# FILE: make_county_naics_dataset.py
# CREATED: 12/17/25
# AUTHOR: Vincent Allen
# CONTACT: vincent@vtallen.com valle276@live.kutztown.edu
# PURPOSE:
# Takes in the clients list from the client dataset generator and derives information about missing NAICs codes per county from that
# data to track which counties are not properly entering their NAICS data into client profiles
# python modules
import argparse
import json
import sys
from typing import Dict
# external libraries
import addfips
import pandas as pd
import numpy as np
# custom libraries
from .make_client_list_dataset import generate_client_list_dataset, get_pa_naics_data, get_bls_naics92_data, get_bls_naics11_data, create_naics_census_percentage_table
from constants_module import NEOSERRA_COLUMNS, OUT_COLUMNS
def parse_args(argv=None):
    """
    parameters:
        argv - Optional list of argument strings to parse. When None (the default) argparse
               reads the arguments from sys.argv, exactly as before.
    returns: argparse.Namespace - The parsed command line arguments
    description:
        Defines and parses the command line arguments of the county NAICS dataset script.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--inputcsv",
                        required=True,
                        type=str,
                        help='The raw PASBDC client list dataset csv file.')
    parser.add_argument("-o", "--out",
                        default="bycounty_naics_dataset.csv",
                        type=str,
                        help="The path to save the output CSV file to")
    parser.add_argument("-m", "--mapping",
                        required=False,
                        default="",
                        type=str,
                        help="A .json file to override the column mappings used by this program. See documentation for meanings.")
    parser.add_argument("-c", "--censusyear",
                        type=str,
                        default="2022",
                        required=False,
                        help='The census year to use to obtain NAICS code data. Must be a valid census year.')
    parser.add_argument("-u", "--usdaapikey",
                        default="72A4453A-E4CB-3D1C-BF42-03B6FAF9E7E6",
                        required=False,
                        help="The API key for the USDA statistics API")
    return parser.parse_args(argv)
def make_county_naics_dataset(
    client_naics_df: pd.DataFrame,
    col_out_county: str = OUT_COLUMNS.county,
    col_out_fips: str = OUT_COLUMNS.fips,
    col_out_unique: str = OUT_COLUMNS.unique_valid_naics,
    col_out_missing: str = OUT_COLUMNS.missing_naics,
    col_out_total: str = OUT_COLUMNS.total_clients,
    col_out_pct_missing: str = OUT_COLUMNS.pct_missing_naics,
    col_neo_county: str = NEOSERRA_COLUMNS.physical_address_county,
    col_naics_2: str = OUT_COLUMNS.naics_2,
    col_out_of_state: str = OUT_COLUMNS.county_out_of_state
) -> pd.DataFrame:
    """
    parameters:
        client_naics_df:pd.DataFrame - The cleaned client dataframe
        col_out_county:str - Output column name for County
        col_out_fips:str - Output column name for FIPS code
        col_out_unique:str - Output column name for count of unique NAICS codes
        col_out_missing:str - Output column name for count of missing NAICS codes
        col_out_total:str - Output column name for total clients
        col_out_pct_missing:str - Output column name for percentage of missing NAICS
        col_neo_county:str - Input column name for Physical Address County
        col_naics_2:str - Input column name for 2-digit NAICS code
        col_out_of_state:str - Input column name for out of state flag
    returns:
        pd.DataFrame - A dataframe containing NAICS statistics by county
    raises:
        Exception - If the per-row FIPS pass unexpectedly collapses the result into a Series
    description:
        Generates a dataset summarizing NAICS code completeness by county. Only counties that occur on
        rows not flagged as out of state are reported. A NAICS code of 0 counts as missing.
    """
    county_naics_groups = client_naics_df.groupby(col_neo_county)
    # Counts per (county, NAICS code), including the 0 sentinel
    naics_value_counts = county_naics_groups[col_naics_2].value_counts()
    # .size() gets the *total* number of clients in each county
    total_clients_by_county = county_naics_groups.size()

    in_state_rows = client_naics_df[client_naics_df[col_out_of_state].eq(False)]
    total_unique_counties = in_state_rows[col_neo_county].unique()

    county_stats_list = []
    af = addfips.AddFIPS()
    for county in total_unique_counties:
        # Use .get() to avoid errors if a county has no NAICS counts at all
        county_counts_series = naics_value_counts.get(county, pd.Series(dtype=float))
        missing_naics_count = 0
        unique_valid_naics_count = len(county_counts_series) #pyright:ignore
        if 0 in county_counts_series.index: # pyright: ignore
            missing_naics_count = county_counts_series.loc[0] # pyright:ignore
            # The 0 sentinel is not a valid code, so it does not count as a unique code
            unique_valid_naics_count -= 1

        # NOTE(review): groupby presumably drops NaN county keys, in which case a NaN county
        # would raise KeyError here - confirm such rows are always flagged out of state upstream.
        total_records = total_clients_by_county.loc[county]

        fips = np.nan
        if not pd.isna(county):
            fips = af.get_county_fips(county, 'Pennsylvania')

        county_stats_list.append({
            col_out_county: county,
            col_out_fips: fips,
            col_out_unique: unique_valid_naics_count,
            col_out_missing: missing_naics_count,
            col_out_total: total_records,
            col_out_pct_missing: (missing_naics_count / total_records) * 100 if total_records > 0 else 0
        })

    # Naming the columns keeps the output schema intact when no county qualified
    county_stats_df = pd.DataFrame(
        county_stats_list,
        columns=[col_out_county, col_out_fips, col_out_unique, col_out_missing, col_out_total, col_out_pct_missing]
    )

    # Helper function to add FIPS codes using the correct column name
    # NOTE(review): FIPS is already looked up in the loop above; this second pass is kept
    # because it may add its own output field - confirm whether it is still needed.
    def add_fips_apply(series):
        return af.add_county_fips(series, county_field=col_out_county, state='Pennsylvania')
    county_stats_df = county_stats_df.apply(add_fips_apply, axis=1)

    # Should never happen, but it makes the return value technically correct now
    if isinstance(county_stats_df, pd.Series):
        raise Exception("Got a single series dataframe, something is wrong")
    return county_stats_df # pyright:ignore
# Command line entry point: fetch the reference datasets, tag the raw client list
# with NAICS data and write the per-county NAICS completeness statistics to disk.
if __name__ == "__main__":
    args = parse_args()
    if args.mapping:
        NEOSERRA_COLUMNS.apply_json_mapping(args.mapping)
        OUT_COLUMNS.apply_json_mapping(args.mapping)

    print("Obtaining NAICS 11 data.")
    # get pa naics 11 data
    try:
        df_naics_11 = get_bls_naics11_data(
            api_key=args.usdaapikey,
            year=args.censusyear)
    except Exception:
        print("Failed to get naics 11 data from the USDA, check your API key and internet connection.")
        raise

    print("Obtaining NAICS 92 data.")
    # Get PA naics 92 data
    try:
        df_naics_92 = get_bls_naics92_data(year=args.censusyear)
    except Exception:
        print("Failed to get naics 92 data from the BLS, check your internet connection and the census year.")
        raise

    print("Obtaining PA census NAICS data.")
    try:
        df_naics_census = get_pa_naics_data(year=args.censusyear)
    except Exception:
        print(
            "Failed to obtain census naics data from the census api. Check API parameters and your internet connection and try again.")
        raise

    raw_client_list_df = pd.read_csv(args.inputcsv)

    # Column names are passed explicitly because the functions' default arguments were
    # bound when the modules were loaded, i.e. before any JSON mapping was applied above.
    # The BLS/USDA column names are passed for the same reason.
    naics_df = create_naics_census_percentage_table(
        df_naics_census=df_naics_census,
        df_naics_92=df_naics_92,
        df_naics_11=df_naics_11,
        col_bls_industry=OUT_COLUMNS.bls_industry,
        col_bls_estab=OUT_COLUMNS.bls_estab,
        col_usda_value=OUT_COLUMNS.usda_value,
        col_unified_naics=OUT_COLUMNS.unified_naics,
        col_census_estab=OUT_COLUMNS.census_estab,
        col_census_pct=OUT_COLUMNS.census_pct,
        col_census_naics=OUT_COLUMNS.census_naics,
        col_naics_label=OUT_COLUMNS.naics_label
    )
    client_list_df = generate_client_list_dataset(
        naics_df=naics_df,
        df_client_list=raw_client_list_df,
        col_unified_naics=OUT_COLUMNS.unified_naics,
        col_census_pct=OUT_COLUMNS.census_pct,
        col_naics_2=OUT_COLUMNS.naics_2,
        col_pa_naics_pct=OUT_COLUMNS.pa_naics_pct,
        col_pasbdc_pct=OUT_COLUMNS.pasbdc_pct,
        col_neo_primary_naics=NEOSERRA_COLUMNS.primary_naics,
        col_neo_naics=NEOSERRA_COLUMNS.naics
    )
    out_df = make_county_naics_dataset(
        client_naics_df=client_list_df,
        col_out_county=OUT_COLUMNS.county,
        col_out_fips=OUT_COLUMNS.fips,
        col_out_unique=OUT_COLUMNS.unique_valid_naics,
        col_out_missing=OUT_COLUMNS.missing_naics,
        col_out_total=OUT_COLUMNS.total_clients,
        col_out_pct_missing=OUT_COLUMNS.pct_missing_naics,
        col_neo_county=NEOSERRA_COLUMNS.physical_address_county,
        col_naics_2=OUT_COLUMNS.naics_2,
        col_out_of_state=OUT_COLUMNS.county_out_of_state
    )
    out_df.to_csv(args.out, index=False)

View File

@@ -0,0 +1,106 @@
# FILE: make_nps_dataset.py
# CREATED: 12/23/25
# AUTHOR: Vincent Allen
# CONTACT: vincent@vtallen.com valle276@live.kutztown.edu
# PURPOSE:
# Contains the logic required to turn a cleaned export of the satisfaction survey from Neoserra into data
# that shows the net promoter score per center
# THIS SCRIPT REQUIRES THE OUTPUT OF make_satisfaction_survey_dataset.py to produce correct results
# See https://en.wikipedia.org/wiki/Net_promoter_score
# python modules
import argparse
import sys
import json
# external libraries
import pandas as pd
from constants_module import NEOSERRA_COLUMNS, OUT_COLUMNS
def make_nps_dataset(
    survey_df: pd.DataFrame,
    col_neo_center: str = NEOSERRA_COLUMNS.center,
    col_satisfaction_score: str = NEOSERRA_COLUMNS.satisfaction_score
) -> pd.DataFrame:
    """
    parameters:
        survey_df:pd.DataFrame - The survey data (not modified by this function)
        col_neo_center:str - The column of the dataset containing the center name
        col_satisfaction_score:str - The column containing the satisfaction score
    returns: pd.DataFrame - One row per center with the columns col_neo_center, "Detractors", "Promoters" and "NPS"
    raises:
        ValueError - If a score value does not start with a number
    description:
        Calculates the Net Promoter Score (NPS) for each center based on the survey responses.
        Scores of 6 or lower are detractors and scores of 9 or higher are promoters. The NPS is the
        percentage of promoters minus the percentage of detractors among ALL responses of the center,
        so the remaining (passive) responses count towards the total.
    """
    import re

    # Leading number of the answer, at most two digits ("10", "10.", "9.", "9.0", "7 - ok", ...)
    leading_number = re.compile(r"\s*(\d{1,2})")

    def _parse_score(raw_value) -> int:
        found = leading_number.match(str(raw_value))
        if found is None:
            raise ValueError(f"Could not read a satisfaction score from {raw_value!r}")
        return int(found.group(1))

    # Work on a copy so the caller's dataframe is left untouched
    local_df = survey_df.copy()
    local_df[col_satisfaction_score] = [_parse_score(x) for x in local_df[col_satisfaction_score]]

    center_groups = local_df[[col_neo_center, col_satisfaction_score]].groupby(col_neo_center)
    records = []
    for name, group in center_groups:
        scores = group[col_satisfaction_score]
        detractors_count = int((scores <= 6).sum())
        promoters_count = int((scores >= 9).sum())
        # Every response counts towards the total, not only promoters and detractors
        total = len(group)
        # Handle edge case where total is 0 to avoid division by zero
        if total > 0:
            nps = ((promoters_count - detractors_count) / total) * 100
        else:
            nps = 0
        records.append({
            col_neo_center: name,
            "Detractors": detractors_count,
            "Promoters": promoters_count,
            "NPS": nps,
        })

    # Build the frame once; naming the columns keeps the schema for empty input
    return pd.DataFrame(records, columns=[col_neo_center, "Detractors", "Promoters", "NPS"])
def parse_args():
    """
    parameters: None

    returns: argparse.Namespace - The parsed command line arguments (inputcsv, mapping, out)

    description:
        Parses the command line arguments of the NPS script. The input file can be given
        as -i or --inputcsv, matching the other dataset scripts in this package.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument("-i", "--inputcsv",
                        type=str,
                        required=True,
                        help="The parsed and cleaned satisfaction survey dataset")

    parser.add_argument("-m", "--mapping",
                        type=str,
                        required=False,
                        default="",
                        help="The JSON file that can be used to overwrite the internal column mappings of the script")

    parser.add_argument("-o", "--out",
                        type=str,
                        required=True,
                        help="The file to save the NPS score dataset to")

    return parser.parse_args()
if __name__ == "__main__":
    cli_args = parse_args()

    # Apply the user's JSON column overrides first, so the column attributes read
    # below already reflect them.
    mapping_file = cli_args.mapping
    if mapping_file:
        for column_set in (NEOSERRA_COLUMNS, OUT_COLUMNS):
            column_set.apply_json_mapping(mapping_file)

    # Read the cleaned survey export, compute the per-center NPS table, write it out
    nps_table = make_nps_dataset(
        pd.read_csv(cli_args.inputcsv),
        col_neo_center=NEOSERRA_COLUMNS.center,
        col_satisfaction_score=NEOSERRA_COLUMNS.satisfaction_score,
    )
    nps_table.to_csv(cli_args.out, index=False)

View File

@@ -0,0 +1,105 @@
# FILE: make_satisfaction_survey_dataset.py
# CREATED: 12/17/25
# AUTHOR: Vincent Allen
# CONTACT: vincent@vtallen.com valle276@live.kutztown.edu
# PURPOSE:
# Takes in the Neoserra survey data export and parses the "Answers" column, which contains
# multiple questions and answers in a single text block, into separate columns for analysis.
# python modules
import argparse
import sys
import json
import re
from typing import Dict
# external libraries
import pandas as pd
import numpy as np
# custom libraries
from pasbdc_data_cleaning import clean_center_name # pyright:ignore
from constants_module import NEOSERRA_COLUMNS, OUT_COLUMNS
def parse_args():
    """
    parameters: None

    returns: argparse.Namespace - The parsed command line arguments (inputcsv, out, mapping)

    description:
        Declares the command line options of the survey parsing script in a table and
        registers them with argparse in the order listed.
    """
    option_table = [
        (("-i", "--inputcsv"), {
            "required": True,
            "type": str,
            "help": 'The Neoserra survey data input csv file. This script was tested on the satisfaction survey but it should work with other survey types given the Answers column is in the same format with answers between questions that start with a number and a period.',
        }),
        (("-o", "--out"), {
            "default": "cleaned_survey_data.csv",
            "type": str,
            "help": "The path to save the output CSV file to",
        }),
        (("-m", "--mapping"), {
            "required": False,
            "default": "",
            "type": str,
            "help": "A .json file to override the column mappings used by this program. See documentation for meanings.",
        }),
    ]

    cli = argparse.ArgumentParser()
    for flags, settings in option_table:
        cli.add_argument(*flags, **settings)
    return cli.parse_args()
def _split_questions_and_answers(answers_text: str):
    """Split one raw "Answers" text block into an ordered list of (question, answer) pairs."""
    lines = [x.strip() for x in answers_text.split('\n') if x.strip()]  # Remove empty lines

    # A question is any line that starts with a number followed by a period (1., 2., ...)
    # NOTE(review): an answer line that itself looks like "10." would also be treated as a
    # question here - confirm against the real export format.
    question_indices = [i for i, line in enumerate(lines) if re.match(r'^\d+\.', line)]

    pairs = []
    for i, q_idx in enumerate(question_indices):
        # Strip the whole numeric prefix, however many digits it has and whether or not a
        # space follows the period (a fixed [3:] slice mangles "1.Text" and "100. Text")
        question = re.sub(r'^\d+\.\s*', '', lines[q_idx]).strip()

        # The answer runs until the next question, or to the end of the block
        if i + 1 < len(question_indices):
            answer_end = question_indices[i + 1]
        else:
            answer_end = len(lines)

        answer = ' '.join(lines[q_idx + 1:answer_end])
        pairs.append((question, answer))
    return pairs


def make_survey_dataset(survey_df_path:str, col_neo_answers:str=NEOSERRA_COLUMNS.answers) -> pd.DataFrame:
    """
    parameters:
        survey_df_path:str - Path to the input survey CSV file
        col_neo_answers:str - The column name containing the combined questions and answers

    returns:
        pd.DataFrame - The input data plus, for the Nth question found in a row, a
                       "Question N text" column (the question) and a "Question N" column
                       (the answer lines joined with single spaces)

    description:
        Parses the Neoserra survey export, splitting the multi-line "Answers" column into
        distinct columns for each question text and answer value. Rows whose answers cell
        contains no numbered question lines get no values in the new columns.
    """
    survey_df = pd.read_csv(survey_df_path)

    # replace the neoserra center names with the real ones
    # (return value is unused, so this presumably edits survey_df in place - TODO confirm)
    clean_center_name(survey_df)

    for row_index, raw_answers in survey_df[col_neo_answers].items():
        pairs = _split_questions_and_answers(str(raw_answers))
        for question_number, (question, answer) in enumerate(pairs, start=1):
            survey_df.at[row_index, f"Question {question_number} text"] = question
            survey_df.at[row_index, f"Question {question_number}"] = answer

    return survey_df
if __name__ == "__main__":
    cli_args = parse_args()

    # Apply the optional JSON column overrides before the answers column name is read
    mapping_file = cli_args.mapping
    if mapping_file:
        for column_set in (NEOSERRA_COLUMNS, OUT_COLUMNS):
            column_set.apply_json_mapping(mapping_file)

    parsed_survey = make_survey_dataset(
        cli_args.inputcsv,
        col_neo_answers=NEOSERRA_COLUMNS.answers,
    )
    parsed_survey.to_csv(cli_args.out, index=False)

View File

@@ -0,0 +1,476 @@
# FILE: make_trainings_dataset.py
# CREATED: 12/24/25
# AUTHOR: Vincent Allen
# CONTACT: vincent@vtallen.com valle276@live.kutztown.edu
# PURPOSE:
# Script was created using logic extracted from a Jupyter notebook to clean training data and generate
# per-center statistics.
# This script allows for the generation of two datasets: a cleaned master list of trainings, and a
# derived statistics table calculating attendee and event performance metrics per center.
# Logic is preserved strictly from the original notebook cells.
# External libraries:
from typing import Dict, Tuple, List, Union
import pandas as pd
import plotly.express as px
import numpy as np
# Python modules:
import argparse
import json
import sys
# Custom libraries:
# These libraries need to be installed from their git submodules into the venv used to execute this script
# pip3 install -e <path to library folder>
# If I'm gone, and you do not know how to do this,
# just drag the python files into this folder and import the needed functions from those files and it should work. Or just copy the functions into this file
from pasbdc_data_cleaning import clean_center_name # pyright:ignore
from constants_module import NEOSERRA_COLUMNS, OUT_COLUMNS, TRAINING_COUNT_COLUMNS, Constants
# Constants defined in the original notebook logic
# Center names that apply_lead_office() collapses into the single "Lead Office" label.
# Matching is exact, which is why the variant with a leading space is listed separately.
LEAD_OFFICE_CENTERS = [
    "Pennsylvania SBDC Lead Office",
    " Pennsylvania SBDC Lead Office",
    "Southeast Pennsylvania APEX Accelerator",
    # NOTE(review): this looks like a column header rather than a center name - presumably
    # carried over from the notebook; confirm before removing.
    "Primary Training Topic",
    "State Small Business Credit Initiative (SSBCI)",
]

# Primary Training Topic values written by tag_first_steps(); the statistics code uses
# this list to include or exclude those events.
FIRST_STEPS_COLS = ["First Steps", "Next Steps"]
def parse_args():
    """
    parameters: None

    returns: argparse.Namespace - The parsed command line arguments
             (inputcsv, out, mapping, mode, fundingsources)

    description:
        Declares the command line options of the trainings script in a table and registers
        them with the argparse module in the order listed.
    """
    option_table = [
        (("-i", "--inputcsv"), {
            "required": True,
            "help": "The filename of the input raw trainings CSV file.",
        }),
        (("-o", "--out"), {
            "type": str,
            "default": "cleaned_trainings_data.csv",
            "required": False,
            "help": "The filename to write the output dataset to.",
        }),
        (("-m", "--mapping"), {
            "type": str,
            "required": False,
            "help": "Path to a JSON file to override default column names mappings.",
        }),
        (("-mode", "--mode"), {
            "type": str,
            "default": "clean",
            "choices": ["clean", "stats"],
            "required": False,
            "help": "Select 'clean' to output the cleaned dataset, or 'stats' to output the per-center statistics table.",
        }),
        (("-f", "--fundingsources"), {
            "nargs": '+',
            "default": None,
            "required": False,
            "help": "List of funding sources to filter by for the statistics generation (e.g. -f SBA State). If omitted, all sources are used.",
        }),
    ]

    cli = argparse.ArgumentParser()
    for flags, settings in option_table:
        cli.add_argument(*flags, **settings)
    return cli.parse_args()
def tag_first_steps(
    series,
    col_neo_event_title:str= NEOSERRA_COLUMNS.event_title,
    col_neo_primary_topic:str= NEOSERRA_COLUMNS.primary_training_topic
):
    """
    parameters:
        series: pd.Series - The row to operate on
        col_neo_event_title:str - Name of the Event Title column
        col_neo_primary_topic:str - Name of the Primary Training Topic column

    returns:
        pd.Series - The same row, with the Primary Training Topic overwritten when the
                    title matches

    description:
        Checks the event title (case-insensitively) for "first step" or "next step" and sets
        the Primary Training Topic to 'First Steps' or 'Next Steps' accordingly. "first step"
        wins when both appear. A title that is not a string (e.g. NaN for a blank CSV cell)
        is treated as having no keywords, so the row is returned unchanged.
    """
    raw_title = series[col_neo_event_title]
    title = raw_title.lower() if isinstance(raw_title, str) else ''

    # 'first step' also covers 'first steps'; 'next step' also covers 'next steps' and
    # 'the next step', so one substring test per label is enough.
    if 'first step' in title:
        series[col_neo_primary_topic] = 'First Steps'
    elif 'next step' in title:
        series[col_neo_primary_topic] = 'Next Steps'

    return series
def apply_lead_office(
    series,
    col_neo_center:str= NEOSERRA_COLUMNS.center
):
    """
    parameters:
        series: pd.Series - The row to operate on
        col_neo_center:str - Name of the Center column

    returns:
        pd.Series - The same row, with the center replaced when it is a lead office name

    description:
        Rewrites the center to "Lead Office" when its current value is one of the entries
        of the hardcoded LEAD_OFFICE_CENTERS list; any other row is returned untouched.
    """
    is_lead_office = series[col_neo_center] in LEAD_OFFICE_CENTERS
    if not is_lead_office:
        return series

    series[col_neo_center] = "Lead Office"
    return series
def tag_pre_planning(
    series,
    col_neo_primary_topic:str= NEOSERRA_COLUMNS.primary_training_topic,
    col_neo_training_topics:str= NEOSERRA_COLUMNS.training_topics,
    col_is_preplanning:str= OUT_COLUMNS.is_preplanning
):
    """
    parameters:
        series: pd.Series - The row to operate on
        col_neo_primary_topic:str - Name of the Primary Training Topic column
        col_neo_training_topics:str - Name of the Training Topics column
        col_is_preplanning:str - Name of the new output boolean column

    returns:
        pd.Series - The same row with the boolean col_is_preplanning value set

    description:
        Flags a training event as "Business Start-up/Preplanning" when its primary topic
        equals that label, or when the label appears anywhere in the training topics text.
        Both checks are case-insensitive.
    """
    key = 'Business Start-up/Preplanning'.lower()

    # Lowercase both sides: the key is lowercased, so comparing it to the raw primary
    # topic would never match the normally-cased value in the data.
    primary_topic = str(series[col_neo_primary_topic]).lower()
    training_topics = str(series[col_neo_training_topics]).lower()

    series[col_is_preplanning] = bool(primary_topic == key or key in training_topics)

    return series
def generate_cleaned_trainings_dataset(
    trainings_df,
    funding_sources:Union[List[str], None] = None,
    col_neo_event_title:str= NEOSERRA_COLUMNS.event_title,
    col_neo_primary_topic:str= NEOSERRA_COLUMNS.primary_training_topic,
    col_neo_training_topics:str= NEOSERRA_COLUMNS.training_topics,
    col_neo_center:str= NEOSERRA_COLUMNS.center,
    col_is_preplanning:str= OUT_COLUMNS.is_preplanning,
    col_neo_attendees_total:str= NEOSERRA_COLUMNS.attendees_total,
    col_out_attendees_range:str= OUT_COLUMNS.attendees_range,
    col_neo_funding_source:Union[str, None] = None,
    **kwargs
) -> pd.DataFrame:
    """
    parameters:
        trainings_df:pd.DataFrame - The raw trainings data
        funding_sources:List[str] - Funding sources to keep. When omitted (None) the
                                    default ['Core Services', 'LEXNET', 'PDA', 'NAP'] is used
        col_neo_event_title:str - Column name for Event Title
        col_neo_primary_topic:str - Column name for Primary Training Topic
        col_neo_training_topics:str - Column name for Training Topics
        col_neo_center:str - Column name for Center
        col_is_preplanning:str - Column name for the Preplanning flag
        col_neo_attendees_total:str - Column name for Total Attendees
        col_out_attendees_range:str - Output column for the tagged attendees range of an event (1-5, 6-14, etc)
        col_neo_funding_source:str - Column name for Funding Source. When omitted (None),
                                     NEOSERRA_COLUMNS.funding_source is read at call time so
                                     a JSON mapping applied after import is still honoured
        **kwargs: kwargs - Consumes any extra un-needed arguments

    returns:
        pd.DataFrame - The cleaned trainings dataframe, filtered to funding_sources and
                       sorted by center

    description:
        Applies a sequence of cleaning operations to the training data: standardizing
        center names, tagging first/next steps, consolidating lead office names, flagging
        preplanning events, bucketing attendee counts, filtering by funding source and
        sorting by center.
    """
    # Resolve defaults at call time: avoids sharing one mutable list across calls and
    # picks up column overrides applied after this module was imported.
    if funding_sources is None:
        funding_sources = ['Core Services', 'LEXNET', 'PDA', 'NAP']
    if col_neo_funding_source is None:
        col_neo_funding_source = NEOSERRA_COLUMNS.funding_source

    print("Cleaning center names...")
    # Return value is unused, so this presumably edits trainings_df in place - TODO confirm
    clean_center_name(trainings_df)

    print("Tagging 'First Steps' and 'Next Steps'...")
    trainings_df = trainings_df.apply(
        tag_first_steps,
        axis=1,
        col_neo_event_title=col_neo_event_title,
        col_neo_primary_topic=col_neo_primary_topic
    )

    print("Consolidating Lead Office centers...")
    trainings_df = trainings_df.apply(
        apply_lead_office,
        axis=1,
        col_neo_center=col_neo_center
    )

    print("Tagging Pre-planning events...")
    trainings_df = trainings_df.apply(
        tag_pre_planning,
        axis=1,
        col_neo_primary_topic=col_neo_primary_topic,
        col_neo_training_topics=col_neo_training_topics,
        col_is_preplanning=col_is_preplanning
    )

    print("Tagging attendee ranges...")
    # Right-closed bins: (0, 5], (5, 14], ... The lowest edge is exclusive, so events with
    # 0 (or missing) attendees get no range label.
    bins = [0, 5, 14, 24, 49, 99, np.inf]
    labels = ['1-5', '6-14', '15-24', '25-49', '50-99', '100+']
    trainings_df[col_out_attendees_range] = pd.cut(trainings_df[col_neo_attendees_total], bins=bins, labels=labels, right=True)

    # Filter for the funding sources that we want
    trainings_df = trainings_df[trainings_df[col_neo_funding_source].isin(funding_sources)]

    # Sort the dataframe alphabetically by the center to ensure it shows up properly in visualizations
    trainings_df = trainings_df.sort_values(col_neo_center, ascending=True)

    return trainings_df #pyright:ignore
def generate_center_trainings_count_statistics(
    full_df: pd.DataFrame,
    filtered_df: pd.DataFrame,
    funding_source_group: List[str],
    col_neo_primary_topic:str= NEOSERRA_COLUMNS.primary_training_topic,
    col_neo_center:str= NEOSERRA_COLUMNS.center,
    col_neo_funding_source:str= NEOSERRA_COLUMNS.funding_source,
    col_neo_attendees_total:str= NEOSERRA_COLUMNS.attendees_total,
    col_neo_program_format:str= NEOSERRA_COLUMNS.program_format,
    **kwargs
) -> pd.DataFrame:
    """
    :param full_df: pd.Dataframe - The cleaned network wide trainings data
    :param filtered_df: pd.DataFrame - The cleaned network wide trainings data, with the desired filter applied (ex. 0 attendee events)
    :param funding_source_group: List[str] - The funding sources both inputs are restricted to
    :param col_neo_primary_topic: str - The column of the input dataset where the primary training topic can be found
    :param col_neo_center: str - The column of the input dataset where the center can be found
    :param col_neo_funding_source: str - The column of the input dataset where the funding source can be found
    :param col_neo_attendees_total: str - The column of the input dataset where the total attendees count can be found
    :param col_neo_program_format: str - The column of the input dataset where the program format can be found
    :param kwargs: Ignored; absorbs any unused keyword arguments
    :return: pd.Dataframe - One row per unique center in full_df, one column per
             TRAINING_COUNT_COLUMNS attribute used when the row is built below

    description:
        For every center, compares the "selected" events (filtered_df) with all events
        (full_df), both restricted to funding_source_group. Counts and attendee sums are
        produced for: all selected events; selected events excluding First/Next Steps;
        selected events excluding First/Next Steps and Preplanning; selected events that
        are First/Next Steps or Preplanning; and the on-demand subsets of those.
        Every "percent" value is a fraction (not multiplied by 100) of the NETWORK-WIDE
        event or attendee total, not of the center's own total. When the network total is
        not positive the fraction is reported as 0.
    """
    preplanning_topic = "Business Start-up/Preplanning"

    def share_of(part, whole):
        # A non-positive network total would make the ratio meaningless; report 0 instead
        return part / whole if whole > 0 else 0

    # --- Network-wide denominators (funding-source filtered) ---
    network_df = full_df[full_df[col_neo_funding_source].isin(funding_source_group)]

    network_event_total = network_df.shape[0]
    print(f"global_all_network event count: {network_event_total}")

    network_attendee_total = network_df[col_neo_attendees_total].sum()
    print(f"global_all_network attendee count: {network_attendee_total }")

    # --- One record per center found anywhere in full_df ---
    records = []
    for center_name in full_df[col_neo_center].unique():
        center_all = network_df[network_df[col_neo_center] == center_name]
        center_selected = filtered_df[
            (filtered_df[col_neo_center] == center_name) &
            (filtered_df[col_neo_funding_source].isin(funding_source_group))
        ]

        on_demand_value = Constants.ON_DEMAND_VALUE.value

        # Row masks over the selected events, computed once and reused below
        topics = center_selected[col_neo_primary_topic]
        is_first_steps = topics.isin(FIRST_STEPS_COLS) #pyright:ignore
        no_first = ~is_first_steps
        no_first_no_pre = no_first & (topics != preplanning_topic)
        first_or_pre = is_first_steps | (topics == preplanning_topic)
        selected_on_demand = center_selected[col_neo_program_format] == on_demand_value

        # Attendee sums
        attendees = center_selected[col_neo_attendees_total]
        total_attendees = center_all[col_neo_attendees_total].sum()
        total_selected_attendees = attendees.sum()
        selected_attendees_nofirst = attendees[no_first].sum()
        selected_attendees_nofirst_nopre = attendees[no_first_no_pre].sum()
        selected_attendees_first_pre = attendees[first_or_pre].sum()

        # Event counts
        total_events = center_all.shape[0]
        total_selected_events = center_selected.shape[0]
        selected_events_nofirst = center_selected[no_first].shape[0]
        selected_events_nofirst_nopre = center_selected[no_first_no_pre].shape[0]
        selected_events_first_pre = center_selected[first_or_pre].shape[0]

        # On-demand event counts
        total_ondemand = center_all[
            center_all[col_neo_program_format] == on_demand_value
        ].shape[0]
        selected_total_ondemand = center_selected[selected_on_demand].shape[0]
        selected_ondemand_nofirst = center_selected[selected_on_demand & no_first].shape[0]
        selected_ondemand_nofirst_nopre = center_selected[selected_on_demand & no_first_no_pre].shape[0]

        # Key order here is the column order of the resulting table
        records.append({
            TRAINING_COUNT_COLUMNS.CENTER: center_name,
            TRAINING_COUNT_COLUMNS.TOTAL_EVENTS: total_events,
            TRAINING_COUNT_COLUMNS.SELECTED_EVENTS: total_selected_events,
            TRAINING_COUNT_COLUMNS.PERCENT_SELECTED_EVENTS: share_of(total_selected_events, network_event_total),
            TRAINING_COUNT_COLUMNS.TOTAL_ATTENDEES: total_attendees,
            TRAINING_COUNT_COLUMNS.SELECTED_ATTENDEES: total_selected_attendees,
            TRAINING_COUNT_COLUMNS.PERCENT_SELECTED_ATTENDEES: share_of(total_selected_attendees, network_attendee_total),
            TRAINING_COUNT_COLUMNS.SELECTED_ATTENDEES_NO_FIRST_STEPS: selected_attendees_nofirst,
            TRAINING_COUNT_COLUMNS.PERCENT_SELECTED_ATTENDEES_NO_FIRST_STEPS: share_of(selected_attendees_nofirst, network_attendee_total),
            TRAINING_COUNT_COLUMNS.SELECTED_ATTENDEES_NO_FIRST_STEPS_NO_PREPLANNING: selected_attendees_nofirst_nopre,
            TRAINING_COUNT_COLUMNS.PERCENT_SELECTED_ATTENDEES_NO_FIRST_STEPS_NO_PREPLANNING: share_of(selected_attendees_nofirst_nopre, network_attendee_total),
            TRAINING_COUNT_COLUMNS.SELECTED_ATTENDEES_FIRST_STEPS_AND_PREPLANNING: selected_attendees_first_pre,
            TRAINING_COUNT_COLUMNS.PERCENT_SELECTED_ATTENDEES_FIRST_STEPS_AND_PREPLANNING: share_of(selected_attendees_first_pre, network_attendee_total),
            TRAINING_COUNT_COLUMNS.SELECTED_EVENTS_FIRST_STEPS_AND_PREPLANNING: selected_events_first_pre,
            TRAINING_COUNT_COLUMNS.PERCENT_SELECTED_EVENTS_FIRST_STEPS_AND_PREPLANNING: share_of(selected_events_first_pre, network_event_total),
            TRAINING_COUNT_COLUMNS.SELECTED_EVENTS_NO_FIRST_STEPS: selected_events_nofirst,
            TRAINING_COUNT_COLUMNS.PERCENT_SELECTED_EVENTS_NO_FIRST_STEPS: share_of(selected_events_nofirst, network_event_total),
            TRAINING_COUNT_COLUMNS.SELECTED_EVENTS_NO_PREPLANNING_NO_FIRST_STEPS: selected_events_nofirst_nopre,
            TRAINING_COUNT_COLUMNS.PERCENT_SELECTED_EVENTS_NO_PREPLANNING_NO_FIRST_STEPS: share_of(selected_events_nofirst_nopre, network_event_total),
            TRAINING_COUNT_COLUMNS.TOTAL_ONDEMAND: total_ondemand,
            TRAINING_COUNT_COLUMNS.SELECTED_TOTAL_ONDEMAND: selected_total_ondemand,
            TRAINING_COUNT_COLUMNS.PERCENT_SELECTED_ONDEMAND: share_of(selected_total_ondemand, network_event_total),
            TRAINING_COUNT_COLUMNS.SELECTED_TOTAL_ONDEMAND_NO_FIRST_STEPS: selected_ondemand_nofirst,
            TRAINING_COUNT_COLUMNS.SELECTED_PERCENT_ONDEMAND_NO_FIRST_STEPS: share_of(selected_ondemand_nofirst, network_event_total),
            TRAINING_COUNT_COLUMNS.SELECTED_TOTAL_ONDEMAND_NO_PREPLANNING_NO_FIRST_STEPS: selected_ondemand_nofirst_nopre,
            TRAINING_COUNT_COLUMNS.SELECTED_PERCENT_ONDEMAND_NO_PREPLANNING_NO_FIRST_STEPS: share_of(selected_ondemand_nofirst_nopre, network_event_total),
        })

    return pd.DataFrame(records)
if __name__ == "__main__":
    args = parse_args()

    if args.mapping:
        try:
            # Opening the file is only a fail-fast readability check; the column objects
            # are handed the path and load the JSON themselves.
            with open(args.mapping, 'r'):
                NEOSERRA_COLUMNS.apply_json_mapping(args.mapping)
                OUT_COLUMNS.apply_json_mapping(args.mapping)
        except Exception as e:
            print(f'Failed to load user column configuration JSON file, got={e}')
            sys.exit(1)

    # In stats mode the user's -f selection must also drive the cleaning step. Otherwise
    # the cleaning step keeps only its built-in default funding sources and any other
    # -f value would leave the statistics with nothing to count.
    cleaning_overrides = {}
    if args.mode == "stats" and args.fundingsources:
        cleaning_overrides["funding_sources"] = args.fundingsources

    # Perform the basic cleaning (Required for both modes)
    trainings_df = pd.read_csv(args.inputcsv)
    cleaned_trainings_df = generate_cleaned_trainings_dataset(
        trainings_df,
        col_neo_event_title= NEOSERRA_COLUMNS.event_title,
        col_neo_primary_topic= NEOSERRA_COLUMNS.primary_training_topic,
        col_neo_training_topics= NEOSERRA_COLUMNS.training_topics,
        col_neo_center= NEOSERRA_COLUMNS.center,
        col_is_preplanning = OUT_COLUMNS.is_preplanning,
        col_neo_attendees_total= NEOSERRA_COLUMNS.attendees_total,
        col_out_attendees_range = OUT_COLUMNS.attendees_range,
        **cleaning_overrides
    )

    if args.mode == "clean":
        print(f"Writing cleaned dataset to {args.out}")
        cleaned_trainings_df.to_csv(args.out, index=False)

    elif args.mode == "stats":
        # Determine funding sources to use
        funding_sources = args.fundingsources

        if funding_sources is None:
            # If not provided, use every funding source that survived the cleaning step
            print("No funding sources specified via -f, using all available funding sources in dataset.")
            funding_sources = list(cleaned_trainings_df[NEOSERRA_COLUMNS.funding_source].unique())

        print(f"Generating statistics for funding sources: {funding_sources}")

        # The same cleaned frame is used as both the full and the selected dataset here
        stats_df = generate_center_trainings_count_statistics(
            full_df=cleaned_trainings_df,
            filtered_df=cleaned_trainings_df,
            funding_source_group=funding_sources,
            col_neo_primary_topic=NEOSERRA_COLUMNS.primary_training_topic,
            col_neo_center=NEOSERRA_COLUMNS.center,
            col_neo_funding_source=NEOSERRA_COLUMNS.funding_source,
            col_neo_attendees_total=NEOSERRA_COLUMNS.attendees_total,
            col_neo_program_format=NEOSERRA_COLUMNS.program_format
        )

        print(f"Writing statistics table to {args.out}")
        stats_df.to_csv(args.out, index=False)

    else:
        # argparse's choices already reject other values; kept as a defensive guard
        print(f"Invalid mode selected: {args.mode}")
        sys.exit(1)