first commit
This commit is contained in:
14
section_1_datasets_module/pyproject.toml
Normal file
14
section_1_datasets_module/pyproject.toml
Normal file
@@ -0,0 +1,14 @@
|
||||
# scripts/dataset/pyproject.toml
|
||||
[build-system]
|
||||
requires = ["setuptools>=64.0.0", "wheel"]
|
||||
build-backend = "setuptools.build_meta"
|
||||
|
||||
[project]
|
||||
name = "section_1_datasets_module"
|
||||
version = "0.1.0"
|
||||
description = "Internal dataset generation scripts for the PASBDC network-wide desk reviews."
|
||||
|
||||
[tool.setuptools]
|
||||
packages = ["section_1_datasets_module"]
|
||||
|
||||
|
||||
@@ -0,0 +1,4 @@
|
||||
Metadata-Version: 2.4
|
||||
Name: section_1_datasets_module
|
||||
Version: 0.1.0
|
||||
Summary: Internal dataset generation scripts for the PASBDC network-wide desk reviews.
|
||||
@@ -0,0 +1,11 @@
|
||||
pyproject.toml
|
||||
section_1_datasets_module/__init__.py
|
||||
section_1_datasets_module/make_client_list_dataset.py
|
||||
section_1_datasets_module/make_county_naics_dataset.py
|
||||
section_1_datasets_module/make_nps_dataset.py
|
||||
section_1_datasets_module/make_satisfaction_survey_dataset.py
|
||||
section_1_datasets_module/make_trainings_dataset.py
|
||||
section_1_datasets_module.egg-info/PKG-INFO
|
||||
section_1_datasets_module.egg-info/SOURCES.txt
|
||||
section_1_datasets_module.egg-info/dependency_links.txt
|
||||
section_1_datasets_module.egg-info/top_level.txt
|
||||
@@ -0,0 +1 @@
|
||||
|
||||
@@ -0,0 +1 @@
|
||||
section_1_datasets_module
|
||||
@@ -0,0 +1,20 @@
|
||||
# scripts/dataset/__init__.py
|
||||
from .make_client_list_dataset import generate_client_list_dataset, get_pa_naics_data, get_bls_naics11_data, get_bls_naics92_data, create_naics_census_percentage_table
|
||||
|
||||
from .make_county_naics_dataset import make_county_naics_dataset
|
||||
|
||||
|
||||
from .make_satisfaction_survey_dataset import make_survey_dataset
|
||||
|
||||
from .make_trainings_dataset import generate_cleaned_trainings_dataset, generate_center_trainings_count_statistics
|
||||
|
||||
# Public API of the package. Every name imported above is re-exported here so that
# `from section_1_datasets_module import *` exposes the same functions as explicit imports.
__all__ = [
    'generate_client_list_dataset',
    'make_county_naics_dataset',
    'make_survey_dataset',
    'generate_cleaned_trainings_dataset',
    'generate_center_trainings_count_statistics',
    'get_pa_naics_data',
    'get_bls_naics92_data',
    'get_bls_naics11_data',
    'create_naics_census_percentage_table',
]
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,536 @@
|
||||
# FILE: make_client_list_dataset.py
|
||||
# CREATED: 12/16/25
|
||||
# AUTHOR: Vincent Allen
|
||||
# CONTACT: vincent@vtallen.com valle276@live.kutztown.edu
|
||||
# PURPOSE:
|
||||
|
||||
# Script was created using an export from this neoserra link: https://pasbdc.neoserra.com/clients?__formid=3&remove=&savename=&sort=CLIENT_ID&sortdir=ASC&expr=&field_1=REVIEWID&opt_1=13539873&field_2=&sortdir=ASC
|
||||
# If you are someone in the future troubleshooting this, start there and see what changed between then and your current data.
|
||||
# I tried to make things as modular as I can, you might be able to modify some constants and get it to work. All the math should still be good
|
||||
|
||||
# External libraries:
|
||||
from typing import Dict, Tuple
|
||||
import pandas as pd
|
||||
import plotly.express as px
|
||||
import requests
|
||||
import numpy as np
|
||||
|
||||
import socket
|
||||
import urllib
|
||||
import urllib3.util.connection as urllib3_cn
|
||||
urllib3_cn.allowed_gai_family = lambda: socket.AF_INET
|
||||
|
||||
# Python modules:
|
||||
import argparse
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
import io
|
||||
|
||||
# Custom libraries:
|
||||
# These libraries need to be installed from their git submodules into the venv used to execute this script
|
||||
# pip3 install -e <path to library folder>
|
||||
# If I'm gone and you do not know how to do this,
|
||||
# just drag the python files into this folder and import the needed functions from those files and it should work. Or just copy the functions into this file
|
||||
from pasbdc_data_cleaning import clean_center_name, tag_county_out_of_state # pyright:ignore
|
||||
from constants_module import NEOSERRA_COLUMNS, OUT_COLUMNS
|
||||
|
||||
def get_pa_naics_data(year:str="2022", ucgid:str='0400000US42', timeout:float=60.0) -> pd.DataFrame:
    """
    parameters:
        year:str - The census year to get data from
        ucgid:str - The Uniform Census Geography Identifier to obtain data for (defaults to PA)
        timeout:float - Seconds to wait on the census API before giving up (passed to requests.get)

    returns:
        pd.DataFrame - A dataframe of the returned data table. An empty dataframe is returned when
                       the API answers with any status other than HTTP 200.

    raises:
        requests.exceptions.RequestException - If the request fails (connection error, timeout,
                                               undecodable response); re-raised after a hint is printed.

    description:
        Queries the census API to obtain NAICS data for the state of PA. Parameters must be from a valid census year with a valid geography
        identifier.
    """
    api_url = f'https://api.census.gov/data/{year}/ecnbasic?get=group(EC2200BASIC)&NAICS{year}=pseudo(N0200.00)&ucgid={ucgid}'
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    try:
        # Without a timeout a stalled connection would block the whole script forever
        response = requests.get(api_url, headers=request_headers, timeout=timeout)

        if response.status_code != 200:
            # Keep the historical "empty dataframe" contract, but say why it is empty
            print(f"Census API returned HTTP status {response.status_code}; no NAICS data was loaded.")
            return pd.DataFrame()

        table_data = response.json()

        # The API returns a list of lists: the first entry is the header row, the rest are records
        column_names = table_data[0]
        rows = table_data[1:]
        return pd.DataFrame(rows, columns=column_names)

    except requests.exceptions.RequestException:
        print("Failed to get NAICS census data from the API. Verify that the url is still correct and that your parameters are from a valid census year.")
        raise
|
||||
|
||||
def get_bls_naics92_data(year:str="2022", area_code:str="42000", timeout:float=60.0) -> pd.DataFrame:
    """
    parameters:
        year:str - The census year to obtain bls data for
        area_code:str - The state code to obtain data for. See BLS API docs for valid values
        timeout:float - Seconds to wait on the BLS server before giving up (passed to urlopen)

    returns:
        pd.DataFrame - A dataframe of the bls data on government institutions in PA

    description:
        This function queries the BLS api for their census data on the number of public administration organizations in PA.
        This allows us to show accurate NAICS data as this is not included in the main census data.

        This function uses an http stream instead of plain requests to get the data. If you try any other way the script will get blocked
        as a bot.
    """
    # A bare `import urllib` does not load the `request` submodule, so import it explicitly
    # instead of relying on some other library having loaded it first.
    import urllib.request

    api_url = f"http://data.bls.gov/cew/data/api/{year}/a/area/{area_code}.csv"

    req = urllib.request.Request(
        api_url,
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
    )

    # The context manager closes the stream even if read() raises
    with urllib.request.urlopen(req, timeout=timeout) as http_stream:
        raw_csv = http_stream.read()

    return pd.read_csv(io.StringIO(raw_csv.decode('UTF-8')))
|
||||
|
||||
def get_bls_naics11_data(
    api_key:str="72A4453A-E4CB-3D1C-BF42-03B6FAF9E7E6",
    source_desc:str="CENSUS",
    sector_desc:str="ECONOMICS",
    group_desc:str="FARMS & LAND & ASSETS",
    commodity_desc:str="FARM OPERATIONS",
    unit_desc:str="OPERATIONS",
    agg_level:str="STATE",
    state_alpha:str="PA",
    domain_desc:str="TOTAL",
    short_desc:str="FARM OPERATIONS - NUMBER OF OPERATIONS",
    year:str="2022",
    timeout:float=60.0
) -> pd.DataFrame:
    """
    parameters:
        please only change the api_key if you expect this one to work.

        api_key:str - USDA Quick Stats API key. May need to be changed, you can get a new one; the default is the one attached to the author's email
        source_desc:str - Quick Stats `source_desc` filter
        sector_desc:str - Quick Stats `sector_desc` filter
        group_desc:str - Quick Stats `group_desc` filter
        commodity_desc:str - Quick Stats `commodity_desc` filter
        unit_desc:str - Quick Stats `unit_desc` filter
        agg_level:str - Quick Stats `agg_level_desc` filter
        state_alpha:str - Quick Stats `state_alpha` filter
        domain_desc:str - Quick Stats `domain_desc` filter
        short_desc:str - Quick Stats `short_desc` filter
        year:str - The census year to query
        timeout:float - Seconds to wait on the USDA server before giving up (passed to requests.get)

    returns:
        pd.DataFrame - The number of farm operations data returned as a pandas dataframe

    raises:
        RuntimeError - If the API answers with any status other than HTTP 200
        requests.exceptions.RequestException - If the request itself fails

    description:
        Queries the USDA census data for the number of farm operations in PA. Please do not change the parameters except for the year (to a valid census year)
        and the API key.

        This api was a PAIN to get working and I have only validated this combination of parameters. That's the government for you.
    """
    # NOTE(review): a real API key is committed as the default argument; consider loading it
    # from the environment or the command line only.

    # Every filter is passed through `params` so requests URL-encodes all of them consistently
    # (spaces become '+', '&' becomes '%26'). This also restores the `sector_desc` filter, whose
    # name was corrupted to "§or_desc" in the hand-built URL.
    query = {
        "key": api_key,
        "source_desc": source_desc,
        "sector_desc": sector_desc,
        "group_desc": group_desc,
        "commodity_desc": commodity_desc,
        "unit_desc": unit_desc,
        "agg_level_desc": agg_level,
        "state_alpha": state_alpha,
        "domain_desc": domain_desc,
        "short_desc": short_desc,
        "year": year,
        "format": "csv",
    }
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    result = requests.get(
        "https://quickstats.nass.usda.gov/api/api_GET/",
        params=query,
        headers=request_headers,
        timeout=timeout,
    )

    if result.status_code != 200:
        raise RuntimeError("Could not query quickstats API, check the url!")

    return pd.read_csv(io.StringIO(result.text))
|
||||
|
||||
def parse_args():
    """
    parameters: None

    returns: argparse.Namespace - The parsed command line arguments

    description:
        Declares the command line options of the client list dataset generator in a single
        table, registers them with argparse in order, and parses sys.argv.
    """
    # (flags, keyword arguments) for every supported option, in display order
    option_table = (
        (("-c", "--censusyear"), dict(
            type=str,
            default="2022",
            required=False,
            help='The census year to use to obtain NAICS code data. Must be a valid census year.')),
        (("-o", "--out"), dict(
            type=str,
            default="pasbdc_cleaned_client_data.csv",
            required=False,
            help="The filename to write the output dataset to.")),
        (("-t", "--tableout"), dict(
            type=str,
            default="",
            required=False,
            help="The filename to write the output NAICs census data to.")),
        (("-u", "--usdaapikey"), dict(
            default="72A4453A-E4CB-3D1C-BF42-03B6FAF9E7E6",
            required=False,
            help="The API key for the USDA statistics API")),
        (("-i", "--inputcsv"), dict(
            required=True,
            help="The filename of the input Neoserra client list data obtained from the active clients scorecard with the following columns: Client ID,Client,Last Counseling,Center,Physical Address County,Physical Address ZIP Code,Primary NAICS,NAICs. Columns must be named as given, or the cleaning script will not work. If you need custom columns, set them with the command line arguments")),
        (("-m", "--mapping"), dict(
            type=str,
            required=False,
            help="Path to a JSON file to override default column names mappings.")),
    )

    cli = argparse.ArgumentParser()
    for flags, settings in option_table:
        cli.add_argument(*flags, **settings)

    return cli.parse_args()
|
||||
|
||||
def create_naics_census_percentage_table(
    df_naics_census:pd.DataFrame,
    df_naics_11:pd.DataFrame,
    df_naics_92:pd.DataFrame,
    col_bls_industry:str = OUT_COLUMNS.bls_industry,
    col_bls_estab:str = OUT_COLUMNS.bls_estab,
    col_usda_value:str = OUT_COLUMNS.usda_value,
    col_unified_naics:str = OUT_COLUMNS.unified_naics,
    col_census_estab:str = OUT_COLUMNS.census_estab,
    col_census_pct:str = OUT_COLUMNS.census_pct,
    col_naics_label:str= OUT_COLUMNS.naics_label,
    col_census_naics:str= OUT_COLUMNS.census_naics
) -> pd.DataFrame:
    """
    parameters:
        df_naics_census:pd.DataFrame - USA NAICS Census data for a region (see get_pa_naics_data). Not modified.
        df_naics_11:pd.DataFrame - USDA farm operations data; the first row's col_usda_value is used as the NAICS 11 count
        df_naics_92:pd.DataFrame - BLS data; establishments of rows whose industry code is "92" are summed as the NAICS 92 count
        col_bls_industry:str - Name of the BLS industry code column
        col_bls_estab:str - Name of the BLS establishment count column
        col_usda_value:str - Name of the USDA value column
        col_unified_naics:str - Name of the unified NAICS code column
        col_census_estab:str - Name of the census establishment count column
        col_census_pct:str - Name of the output percentage column
        col_naics_label:str - Name of the census column holding the NAICS description
        col_census_naics:str - The column in the census NAICS data that contains the NAICS code. It seems to always contain the year of the census in it.

    returns: pd.DataFrame - The constructed NAICS industry table dataframe, sorted by percentage (descending)

    description:
        Combines the US census NAICS data with the NAICS 92 (BLS) and NAICS 11 (USDA) counts, which the census
        data does not include, and produces a table with a NAICS code, a description of that code, its
        establishment count and the percentage that code makes up of all counted establishments.
        A sentinel row with code "0" and zero establishments is added to represent a missing NAICS value.
    """
    # Work on a copy so the caller's census dataframe is left untouched
    census = df_naics_census.copy()

    # Ensure that the establishment count column is an int
    census[col_census_estab] = census[col_census_estab].map(int)

    # Compare industry codes as text so the filter also works if the column was parsed as numbers
    industry_codes = df_naics_92[col_bls_industry].astype(str).str.strip()
    naics_92_count = df_naics_92.loc[industry_codes == "92", col_bls_estab].sum()

    # Get the count of all naics 11 organizations (the USDA value uses thousands separators)
    naics_11_count = int(str(df_naics_11.iloc[0][col_usda_value]).replace(",", ""))

    total_establishments = census[col_census_estab].sum() + naics_92_count + naics_11_count

    # The census response can contain the NAICS code column twice, in which case selecting it
    # yields a dataframe; take the first copy. A single column is used as is.
    naics_codes = census[col_census_naics]
    if isinstance(naics_codes, pd.DataFrame):
        naics_codes = naics_codes.iloc[:, 0]
    census[col_unified_naics] = naics_codes

    # Group the records by their NAICS code, then aggregate the sums for the groups
    naics_df = census.groupby(
        [col_unified_naics]
    ).agg({col_census_estab: "sum", col_naics_label: "first"}).reset_index()

    # Rows for the sectors the census data does not cover
    naics_92_data = pd.DataFrame({
        col_unified_naics: ["92"],
        col_naics_label: ["Government Institutions (local, state, and federal)"],
        col_census_estab: [naics_92_count]
    })

    naics_11_data = pd.DataFrame({
        col_unified_naics: ["11"],
        col_naics_label: ["Farm Operations"],
        col_census_estab: [naics_11_count]
    })

    # So we have a sentinel value that means no value. NAICS 0 does not exist
    missing_data = pd.DataFrame({
        col_unified_naics: ["0"],
        col_naics_label: ["Missing NAICS value"],
        col_census_estab: [0]
    })

    naics_df = pd.concat([naics_df, naics_92_data, naics_11_data, missing_data], ignore_index=True)
    naics_df = naics_df.sort_values(by=col_unified_naics, ascending=False)
    naics_df[col_census_pct] = (naics_df[col_census_estab] / total_establishments) * 100
    naics_df = naics_df.sort_values(by=[col_census_pct], ascending=False)
    return naics_df
|
||||
|
||||
def tag_two_digit_naics(
    row,
    col_neo_naics:str = NEOSERRA_COLUMNS.naics,
    col_neo_primary_naics:str = NEOSERRA_COLUMNS.primary_naics,
    col_naics_2:str = OUT_COLUMNS.naics_2,
    bypass_secondary_naics_list:bool = False,
):
    """
    parameters:
        row: pd.Series - The row to operate on
        col_neo_naics:str - Name of the Neoserra NAICS column
        col_neo_primary_naics:str - Name of the Neoserra Primary NAICS column
        col_naics_2:str - Name of the output 2-digit NAICS column
        bypass_secondary_naics_list:bool = False - Only look at the primary NAICS column and never read col_neo_naics

    returns:
        pd.Series - The same row with col_naics_2 filled in

    description:
        For use with the .apply method with axis=1.
        Writes the first two digits of the row's NAICS code into col_naics_2. The primary NAICS
        wins when it is present, the secondary NAICS is the fallback, and NaN is written when
        no code is available.
    """
    def _first_two_digits(value):
        return int(str(value)[:2])

    if bypass_secondary_naics_list:
        primary_code = row[col_neo_primary_naics]
        row[col_naics_2] = np.nan if pd.isna(primary_code) else _first_two_digits(primary_code)
        return row

    secondary_code = row[col_neo_naics]
    primary_code = row[col_neo_primary_naics]

    if not pd.isna(primary_code):
        # Primary NAICS takes precedence whenever it exists
        row[col_naics_2] = _first_two_digits(primary_code)
    elif not pd.isna(secondary_code):
        row[col_naics_2] = _first_two_digits(secondary_code)
    else:
        row[col_naics_2] = np.nan

    return row
|
||||
|
||||
def tag_naics_percentatge(
    row,
    naics_mapping:Dict[int, float],
    col_naics_2:str = OUT_COLUMNS.naics_2,
    col_pa_naics_pct:str = OUT_COLUMNS.pa_naics_pct
):
    """
    parameters:
        row:pd.Series or row like object - The row of data to tag
        naics_mapping:Dict[int, float] - The mapping of naics code to the associated census percentage value
        col_naics_2:str - Name of the 2-digit NAICS column
        col_pa_naics_pct:str - Name of the output PA census percentage column

    returns:
        pd.Series - The same row, tagged when a percentage was found

    description:
        Looks up the row's 2-digit NAICS code in naics_mapping and stores the percentage in
        col_pa_naics_pct. When the lookup fails the row is returned untouched; a diagnostic line
        is printed unless the code is missing or is 11 or 92.
    """
    try:
        row[col_pa_naics_pct] = naics_mapping[int(row[col_naics_2])]
    except Exception:
        naics_code = row[col_naics_2]

        # A missing code has nothing to report
        if pd.isna(naics_code):
            return row

        # Codes 11 and 92 are skipped silently
        if int(naics_code) in (11, 92):
            return row

        print(type(naics_code), naics_code, "did not have an associated percentage")

    return row
|
||||
|
||||
def tag_pasbdc_percentage(
    series,
    naics_value_counts:Dict[int, int],
    total_clients:int,
    col_naics_2:str = OUT_COLUMNS.naics_2,
    col_pasbdc_pct:str = OUT_COLUMNS.pasbdc_pct
):
    """
    parameters:
        series - The row of data to operate on
        naics_value_counts:Dict[int, int] - The mapping of NAICS value to the count of that naics value within the Neoserra client data
        total_clients:int - The total number of clients in the neoserra client data
        col_naics_2:str - Name of the 2-digit NAICS column
        col_pasbdc_pct:str - Name of the output PASBDC percentage column

    returns:
        The same row; col_pasbdc_pct is set only when the row has a 2-digit NAICS code

    description:
        Stores the share (in percent) of clients that have the same 2-digit NAICS code as this row.
    """
    naics_code = series[col_naics_2]

    # Rows without a NAICS code are passed through unchanged
    if pd.isna(naics_code):
        return series

    clients_with_code = int(naics_value_counts[naics_code])
    series[col_pasbdc_pct] = (clients_with_code / total_clients) * 100
    return series
|
||||
|
||||
'''
|
||||
print("Obtaining NAICS 11 data.")
|
||||
# get pa naics 11 data
|
||||
df_naics_11 = pd.DataFrame()
|
||||
try:
|
||||
df_naics_11 = get_bls_naics11_data(
|
||||
api_key=usda_api_key,
|
||||
year=census_year)
|
||||
except Exception as e:
|
||||
print("Failed to get naics 11 data from the USDA, check your API key and internet connection.")
|
||||
raise e
|
||||
|
||||
print("Obtaining NAICS 92 data.")
|
||||
# Get PA naics 92 data
|
||||
df_naics_92 = pd.DataFrame()
|
||||
try:
|
||||
df_naics_92 = get_bls_naics92_data(year=census_year)
|
||||
except Exception as e:
|
||||
print("Failed to get naics 92 data from the BLS, check your internet connection and the census year.")
|
||||
raise e
|
||||
|
||||
print("Obtaining PA census NAICS data.")
|
||||
df_naics_census = pd.DataFrame()
|
||||
try:
|
||||
df_naics_census = get_pa_naics_data(year=census_year)
|
||||
except Exception as e:
|
||||
print("Failed to obtain census naics data from the census api. Check API parameters and your internet connection and try again.")
|
||||
raise e
|
||||
'''
|
||||
|
||||
def generate_client_list_dataset(
    naics_df:pd.DataFrame,
    df_client_list:pd.DataFrame,
    col_unified_naics:str = OUT_COLUMNS.unified_naics,
    col_census_pct:str = OUT_COLUMNS.census_pct,
    col_naics_2:str = OUT_COLUMNS.naics_2,
    col_pa_naics_pct:str = OUT_COLUMNS.pa_naics_pct,
    col_pasbdc_pct:str = OUT_COLUMNS.pasbdc_pct,
    col_neo_primary_naics:str = NEOSERRA_COLUMNS.primary_naics,
    col_neo_naics:str = NEOSERRA_COLUMNS.naics,
    bypass_secondary_naics_list:bool = False
) -> pd.DataFrame:
    """
    parameters:
        naics_df:pd.DataFrame - The NAICS census percentage table (see create_naics_census_percentage_table)
        df_client_list:pd.DataFrame - The Neoserra client list data. Not modified.
        col_unified_naics:str - Column name for unified NAICS code (in naics_df)
        col_census_pct:str - Column name for Census percentage (in naics_df)
        col_naics_2:str - Output column name for 2-digit NAICS
        col_pa_naics_pct:str - Output column name for PA NAICS percentage
        col_pasbdc_pct:str - Output column name for PASBDC NAICS percentage
        col_neo_primary_naics:str - Column name for Neoserra Primary NAICS
        col_neo_naics:str - Column name for Neoserra NAICS
        bypass_secondary_naics_list:bool - Only use the primary NAICS column and never read col_neo_naics

    returns:
        pd.DataFrame - The Neoserra clients list tagged with its 2-digit NAICS code, the census
                       percentage of that code, the PASBDC percentage of that code and the
                       in/out of state flag

    description:
        Tags every client with a 2-digit NAICS code (0 when none is available), maps that code to
        its census percentage from naics_df, computes the share of clients per code and finally
        applies tag_county_out_of_state to each row.
    """
    def _codes_as_text(column: pd.Series) -> pd.Series:
        # Stringify the codes and blank out the textual NaN/None markers; `where` keeps the
        # string dtype no matter how many values are missing, so `.str` stays usable below
        as_text = column.astype(str)
        return as_text.where(~as_text.isin(['nan', 'None']))

    # Work on a copy so the caller's dataframe is not modified in place
    clients = df_client_list.copy()

    primary_naics = _codes_as_text(clients[col_neo_primary_naics])
    if bypass_secondary_naics_list:
        naics_source = primary_naics
    else:
        # Primary NAICS wins, the secondary NAICS only fills the gaps
        naics_source = primary_naics.fillna(_codes_as_text(clients[col_neo_naics]))

    # Unparseable or missing codes become the sentinel 0
    clients[col_naics_2] = pd.to_numeric(naics_source.str[:2], errors='coerce').fillna(0)

    # Now we need to tag each client with the corresponding census percentage of their NAICS code
    naics_mapping = {0: 0.0}
    for naics_code, census_pct in zip(naics_df[col_unified_naics], naics_df[col_census_pct]):
        bounds = str(naics_code).split('-')
        if len(bounds) == 2:
            # Ranged sectors such as "31-33" share one percentage across every code in the range
            for code in range(int(bounds[0]), int(bounds[1]) + 1):
                naics_mapping[code] = float(census_pct)
        else:
            naics_mapping[int(naics_code)] = float(census_pct)

    clients[col_pa_naics_pct] = clients[col_naics_2].map(naics_mapping)

    naics_value_counts = clients[col_naics_2].value_counts() # pyright: ignore
    total_clients = naics_value_counts.sum()

    if total_clients > 0:
        clients[col_pasbdc_pct] = (clients[col_naics_2].map(naics_value_counts) / total_clients) * 100

    # =============================
    # MISC attributes
    # =============================
    # Tag each row with in or out of state based on its county
    clients = clients.apply(tag_county_out_of_state, axis=1)

    return clients #pyright: ignore
|
||||
|
||||
if __name__ == "__main__":
    # Command line entry point: download the three reference datasets, build the NAICS
    # percentage table, tag the client list and write the results to disk.
    args = parse_args()

    if args.mapping:
        NEOSERRA_COLUMNS.apply_json_mapping(args.mapping)
        OUT_COLUMNS.apply_json_mapping(args.mapping)

    print("Obtaining NAICS 11 data.")
    try:
        df_naics_11 = get_bls_naics11_data(
            api_key=args.usdaapikey,
            year=args.censusyear)
    except Exception:
        print("Failed to get naics 11 data from the USDA, check your API key and internet connection.")
        raise

    print("Obtaining NAICS 92 data.")
    try:
        df_naics_92 = get_bls_naics92_data(year=args.censusyear)
    except Exception:
        print("Failed to get naics 92 data from the BLS, check your internet connection and the census year.")
        raise

    print("Obtaining PA census NAICS data.")
    try:
        df_naics_census = get_pa_naics_data(year=args.censusyear)
    except Exception:
        print(
            "Failed to obtain census naics data from the census api. Check API parameters and your internet connection and try again.")
        raise

    df_client_list = pd.read_csv(args.inputcsv)

    # Column names are passed explicitly because the functions' default arguments were bound
    # when the module was imported, i.e. before any --mapping override was applied above
    # (presumably apply_json_mapping updates the attributes in place - confirm in constants_module).
    naics_df = create_naics_census_percentage_table(
        df_naics_census=df_naics_census,
        df_naics_92=df_naics_92,
        df_naics_11=df_naics_11,
        col_bls_industry=OUT_COLUMNS.bls_industry,
        col_bls_estab=OUT_COLUMNS.bls_estab,
        col_usda_value=OUT_COLUMNS.usda_value,
        col_unified_naics=OUT_COLUMNS.unified_naics,
        col_census_estab=OUT_COLUMNS.census_estab,
        col_census_pct=OUT_COLUMNS.census_pct,
        col_census_naics=OUT_COLUMNS.census_naics,
        col_naics_label=OUT_COLUMNS.naics_label
    )

    client_list_df = generate_client_list_dataset(
        naics_df=naics_df,
        df_client_list=df_client_list,
        col_unified_naics=OUT_COLUMNS.unified_naics,
        col_census_pct=OUT_COLUMNS.census_pct,
        col_naics_2=OUT_COLUMNS.naics_2,
        col_pa_naics_pct=OUT_COLUMNS.pa_naics_pct,
        col_pasbdc_pct=OUT_COLUMNS.pasbdc_pct,
        col_neo_primary_naics=NEOSERRA_COLUMNS.primary_naics,
        col_neo_naics=NEOSERRA_COLUMNS.naics,
    )

    client_list_df.to_csv(args.out, index=False)

    # The census table is only written when an output path was given
    if args.tableout:
        naics_df.to_csv(args.tableout, index=False)
|
||||
@@ -0,0 +1,222 @@
|
||||
# FILE: make_county_naics_dataset.py
|
||||
# CREATED: 12/17/25
|
||||
# AUTHOR: Vincent Allen
|
||||
# CONTACT: vincent@vtallen.com valle276@live.kutztown.edu
|
||||
# PURPOSE:
|
||||
|
||||
# Takes in the clients list from the client dataset generator and derives information about missing NAICs codes per county from that
|
||||
# data to track which counties are not properly entering their NAICS data into client profiles
|
||||
|
||||
# python modules
|
||||
import argparse
|
||||
import json
|
||||
import sys
|
||||
from typing import Dict
|
||||
|
||||
# external libraries
|
||||
import addfips
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
|
||||
# custom libraries
|
||||
from .make_client_list_dataset import generate_client_list_dataset, get_pa_naics_data, get_bls_naics92_data, get_bls_naics11_data, create_naics_census_percentage_table
|
||||
from constants_module import NEOSERRA_COLUMNS, OUT_COLUMNS
|
||||
|
||||
def parse_args():
    """
    parameters: None

    returns: argparse.Namespace - The parsed command line arguments

    description:
        Registers the command line options of the county NAICS dataset generator and parses sys.argv.
    """
    cli = argparse.ArgumentParser()
    add_option = cli.add_argument

    # Input / output files
    add_option("-i", "--inputcsv", required=True, type=str,
               help='The raw PASBDC client list dataset csv file.')
    add_option("-o", "--out", default="bycounty_naics_dataset.csv", type=str,
               help="The path to save the output CSV file to")

    # Optional column name overrides
    add_option("-m", "--mapping", required=False, default="", type=str,
               help="A .json file to override the column mappings used by this program. See documentation for meanings.")

    # Reference data selection
    add_option("-c", "--censusyear", type=str, default="2022", required=False,
               help='The census year to use to obtain NAICS code data. Must be a valid census year.')
    add_option("-u", "--usdaapikey", default="72A4453A-E4CB-3D1C-BF42-03B6FAF9E7E6", required=False,
               help="The API key for the USDA statistics API")

    return cli.parse_args()
|
||||
|
||||
def make_county_naics_dataset(
    client_naics_df: pd.DataFrame,
    col_out_county: str = OUT_COLUMNS.county,
    col_out_fips: str = OUT_COLUMNS.fips,
    col_out_unique: str = OUT_COLUMNS.unique_valid_naics,
    col_out_missing: str = OUT_COLUMNS.missing_naics,
    col_out_total: str = OUT_COLUMNS.total_clients,
    col_out_pct_missing: str = OUT_COLUMNS.pct_missing_naics,
    col_neo_county: str = NEOSERRA_COLUMNS.physical_address_county,
    col_naics_2: str = OUT_COLUMNS.naics_2,
    col_out_of_state: str = OUT_COLUMNS.county_out_of_state
) -> pd.DataFrame:
    """
    parameters:
        client_naics_df:pd.DataFrame - The cleaned client dataframe (needs the county, 2-digit NAICS and out of state columns)
        col_out_county:str - Output column name for County
        col_out_fips:str - Output column name for FIPS code
        col_out_unique:str - Output column name for count of unique NAICS codes
        col_out_missing:str - Output column name for count of missing NAICS codes
        col_out_total:str - Output column name for total clients
        col_out_pct_missing:str - Output column name for percentage of missing NAICS
        col_neo_county:str - Input column name for Physical Address County
        col_naics_2:str - Input column name for 2-digit NAICS code (0 means missing)
        col_out_of_state:str - Input column name for out of state flag

    returns:
        pd.DataFrame - A dataframe containing NAICS statistics by county

    raises:
        Exception - If applying the FIPS helper unexpectedly collapses the result into a Series

    description:
        Generates a dataset summarizing NAICS code completeness by county. Only counties that
        appear on rows flagged as in-state are reported; rows without a county are ignored.
    """
    county_naics_groups = client_naics_df.groupby(col_neo_county)

    # Per county counts of each non-null NAICS code (including the 0 sentinel)
    naics_value_counts = county_naics_groups[col_naics_2].value_counts()

    # .size() gets the *total* number of clients in each county
    # It includes rows where NAICS_2 is 0, valid, or NaN.
    total_clients_by_county = county_naics_groups.size()

    in_state_rows = client_naics_df[client_naics_df[col_out_of_state] == False] #pyright:ignore
    total_unique_counties = in_state_rows[col_neo_county].unique()

    county_stats_list = []
    af = addfips.AddFIPS()

    for county in total_unique_counties:
        # groupby() drops rows whose key is NaN, so a missing county has no entry in the
        # grouped totals and looking it up below would raise a KeyError. Skip it.
        if pd.isna(county):
            continue

        # Use .get() to avoid errors if a county has 100% NaN NAICS codes
        # In that case, value_counts() would have no entries for it
        county_counts_series = naics_value_counts.get(county, pd.Series(dtype=float))

        missing_naics_count = 0
        unique_valid_naics_count = len(county_counts_series) #pyright:ignore

        if 0 in county_counts_series.index: # pyright: ignore
            missing_naics_count = county_counts_series.loc[0] # pyright:ignore
            # The 0 sentinel is not a real NAICS code, so it does not count as a unique one
            unique_valid_naics_count -= 1

        total_records = total_clients_by_county.loc[county]

        county_stats_list.append({
            col_out_county: county,
            col_out_fips: af.get_county_fips(county, 'Pennsylvania'),
            col_out_unique: unique_valid_naics_count,
            col_out_missing: missing_naics_count,
            col_out_total: total_records,
            col_out_pct_missing: (missing_naics_count / total_records) * 100 if total_records > 0 else 0
        })

    county_stats_df = pd.DataFrame(county_stats_list)

    # Helper function to add FIPS codes using the correct column name
    def add_fips_apply(series):
        return af.add_county_fips(series, county_field=col_out_county, state='Pennsylvania')

    county_stats_df = county_stats_df.apply(add_fips_apply, axis=1)

    # Should never happen, but it makes the return value technically correct now
    if isinstance(county_stats_df, pd.Series):
        raise Exception("Got a single series dataframe, something is wrong")

    return county_stats_df # pyright:ignore
|
||||
|
||||
if __name__ == "__main__":
    # Command line entry point: fetch the three NAICS reference tables, combine them with the
    # client list export and write the per-county NAICS dataset to args.out.
    args = parse_args()

    # Apply the optional JSON column mapping to both column sets before any column name is read.
    if args.mapping:
        for column_set in (NEOSERRA_COLUMNS, OUT_COLUMNS):
            column_set.apply_json_mapping(args.mapping)

    # Each download prints a hint for the user and then re-raises, so a failure stops the script.
    print("Obtaining NAICS 11 data.")
    try:
        df_naics_11 = get_bls_naics11_data(api_key=args.usdaapikey, year=args.censusyear)
    except Exception as e:
        print("Failed to get naics 11 data from the USDA, check your API key and internet connection.")
        raise e

    print("Obtaining NAICS 92 data.")
    try:
        df_naics_92 = get_bls_naics92_data(year=args.censusyear)
    except Exception as e:
        print("Failed to get naics 92 data from the BLS, check your internet connection and the census year.")
        raise e

    print("Obtaining PA census NAICS data.")
    try:
        df_naics_census = get_pa_naics_data(year=args.censusyear)
    except Exception as e:
        print(
            "Failed to obtain census naics data from the census api. Check API parameters and your internet connection and try again.")
        raise e

    raw_client_list_df = pd.read_csv(args.inputcsv)

    # Step 1: census percentage table built from the three downloaded tables.
    census_table_columns = dict(
        col_unified_naics=OUT_COLUMNS.unified_naics,
        col_census_estab=OUT_COLUMNS.census_estab,
        col_census_pct=OUT_COLUMNS.census_pct,
        col_census_naics=OUT_COLUMNS.census_naics,
        col_naics_label=OUT_COLUMNS.naics_label,
    )
    naics_df = create_naics_census_percentage_table(
        df_naics_census=df_naics_census,
        df_naics_92=df_naics_92,
        df_naics_11=df_naics_11,
        **census_table_columns
    )

    # Step 2: client list dataset derived from the raw export and the census table.
    client_list_columns = dict(
        col_unified_naics=OUT_COLUMNS.unified_naics,
        col_census_pct=OUT_COLUMNS.census_pct,
        col_naics_2=OUT_COLUMNS.naics_2,
        col_pa_naics_pct=OUT_COLUMNS.pa_naics_pct,
        col_pasbdc_pct=OUT_COLUMNS.pasbdc_pct,
        col_neo_primary_naics=NEOSERRA_COLUMNS.primary_naics,
        col_neo_naics=NEOSERRA_COLUMNS.naics,
    )
    client_list_df = generate_client_list_dataset(
        naics_df=naics_df,
        df_client_list=raw_client_list_df,
        **client_list_columns
    )

    # Step 3: per-county statistics, written out without the dataframe index.
    county_columns = dict(
        col_out_county=OUT_COLUMNS.county,
        col_out_fips=OUT_COLUMNS.fips,
        col_out_unique=OUT_COLUMNS.unique_valid_naics,
        col_out_missing=OUT_COLUMNS.missing_naics,
        col_out_total=OUT_COLUMNS.total_clients,
        col_out_pct_missing=OUT_COLUMNS.pct_missing_naics,
        col_neo_county=NEOSERRA_COLUMNS.physical_address_county,
        col_naics_2=OUT_COLUMNS.naics_2,
        col_out_of_state=OUT_COLUMNS.county_out_of_state,
    )
    out_df = make_county_naics_dataset(client_naics_df=client_list_df, **county_columns)
    out_df.to_csv(args.out, index=False)
|
||||
@@ -0,0 +1,106 @@
|
||||
# FILE: make_nps_dataset.py
|
||||
# CREATED: 12/23/25
|
||||
# AUTHOR: Vincent Allen
|
||||
# CONTACT: vincent@vtallen.com valle276@live.kutztown.edu
|
||||
# PURPOSE:
|
||||
|
||||
# Contains the logic required to turn a cleaned export of the satisfaction survey from Neoserra into data
|
||||
# that shows the net promoter score per center
|
||||
|
||||
# THIS SCRIPT REQUIRES THE OUTPUT OF make_satisfaction_survey_dataset.py to produce correct results
|
||||
|
||||
# See https://en.wikipedia.org/wiki/Net_promoter_score
|
||||
|
||||
# python modules
|
||||
import argparse
|
||||
import sys
|
||||
import json
|
||||
|
||||
# external libraries
|
||||
import pandas as pd
|
||||
from constants_module import NEOSERRA_COLUMNS, OUT_COLUMNS
|
||||
|
||||
|
||||
def make_nps_dataset(
    survey_df: pd.DataFrame,
    col_neo_center: str = NEOSERRA_COLUMNS.center,
    col_satisfaction_score: str = NEOSERRA_COLUMNS.satisfaction_score
) -> pd.DataFrame:
    """
    parameters:
        survey_df:pd.DataFrame - The survey data, one row per response
        col_neo_center:str - The column of the dataset containing the center name
        col_satisfaction_score:str - The column containing the satisfaction score answer

    returns: pd.DataFrame - One row per center with the columns
        <col_neo_center>, "Detractors", "Promoters" and "NPS"

    description:
        Calculates the Net Promoter Score (NPS) for each center based on the survey responses.
        A response scoring 6 or lower counts as a detractor, one scoring 9 or higher as a promoter.
        NPS = (promoters - detractors) / (all responses with a readable score) * 100, so passive
        responses (7 and 8) are part of the denominator. Responses whose score cannot be read are
        left out of the calculation; a center without any readable score gets an NPS of 0.
        The input dataframe is not modified.
    """

    # Keep only the leading one or two digits of every answer. This reads "10." as 10,
    # 9.0 (a float read from the CSV) as 9 and "9 - text" as 9; anything without leading digits
    # (including missing values) becomes NaN instead of raising.
    score_text = survey_df[col_satisfaction_score].astype(str)
    leading_digits = score_text.str.extract(r"^\s*(\d{1,2})", expand=False)

    # Work on a two-column copy so the caller's dataframe is left untouched
    local_df = survey_df[[col_neo_center, col_satisfaction_score]].copy()
    local_df[col_satisfaction_score] = pd.to_numeric(leading_digits, errors="coerce")

    rows = []
    for name, group in local_df.groupby(col_neo_center):
        scores = group[col_satisfaction_score]

        # Comparisons against NaN are False, so unreadable scores land in neither bucket
        detractors_count = int((scores <= 6).sum())
        promoters_count = int((scores >= 9).sum())
        respondents = int(scores.notna().sum())

        # Handle edge case where there are no readable scores to avoid division by zero
        if respondents > 0:
            nps = (promoters_count - detractors_count) / respondents * 100
        else:
            nps = 0

        rows.append({
            col_neo_center: name,
            "Detractors": detractors_count,
            "Promoters": promoters_count,
            "NPS": nps,
        })

    # Passing the column list keeps the expected columns even when there are no centers
    return pd.DataFrame(rows, columns=[col_neo_center, "Detractors", "Promoters", "NPS"])
|
||||
|
||||
def parse_args(argv=None):
    """
    parameters:
        argv: list of str or None - The argument strings to parse. When None (the default)
            argparse reads the arguments from sys.argv.

    returns: argparse.Namespace - The parsed arguments: inputcsv, mapping and out

    description:
        Defines and parses the command line arguments of the NPS dataset script.
        -i/--inputcsv and -o/--out are required; -m/--mapping defaults to an empty string.
    """
    parser = argparse.ArgumentParser()

    # "-i" is accepted in addition to "--inputcsv"
    parser.add_argument("-i", "--inputcsv",
                        type=str,
                        required=True,
                        help="The parsed and cleaned satisfaction survey dataset")

    parser.add_argument("-m", "--mapping",
                        type=str,
                        required=False,
                        default="",
                        help="The JSON file that can be used to overwrite the internal column mappings of the script")

    parser.add_argument("-o", "--out",
                        type=str,
                        required=True,
                        help="The file to save the NPS score dataset to")

    return parser.parse_args(argv)
|
||||
|
||||
if __name__ == "__main__":
    # Command line entry point: read the cleaned survey CSV, compute the NPS per center
    # and write the result to args.out.
    args = parse_args()

    # Apply the optional JSON column mapping to both column sets before any column name is read
    if args.mapping:
        for column_set in (NEOSERRA_COLUMNS, OUT_COLUMNS):
            column_set.apply_json_mapping(args.mapping)

    survey_df = pd.read_csv(args.inputcsv)

    # The column names are read here, after the mapping above has been applied
    nps_columns = dict(
        col_neo_center=NEOSERRA_COLUMNS.center,
        col_satisfaction_score=NEOSERRA_COLUMNS.satisfaction_score,
    )
    nps_df = make_nps_dataset(survey_df, **nps_columns)

    nps_df.to_csv(args.out, index=False)
|
||||
@@ -0,0 +1,105 @@
|
||||
# FILE: make_satisfaction_survey_dataset.py
|
||||
# CREATED: 12/17/25
|
||||
# AUTHOR: Vincent Allen
|
||||
# CONTACT: vincent@vtallen.com valle276@live.kutztown.edu
|
||||
# PURPOSE:
|
||||
|
||||
# Takes in the Neoserra survey data export and parses the "Answers" column, which contains
|
||||
# multiple questions and answers in a single text block, into separate columns for analysis.
|
||||
|
||||
# python modules
|
||||
import argparse
|
||||
import sys
|
||||
import json
|
||||
import re
|
||||
from typing import Dict
|
||||
|
||||
# external libraries
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
|
||||
# custom libraries
|
||||
from pasbdc_data_cleaning import clean_center_name # pyright:ignore
|
||||
from constants_module import NEOSERRA_COLUMNS, OUT_COLUMNS
|
||||
|
||||
def parse_args(argv=None):
    """
    parameters:
        argv: list of str or None - The argument strings to parse. When None (the default)
            argparse reads the arguments from sys.argv.

    returns: argparse.Namespace - The parsed arguments: inputcsv, out and mapping

    description:
        Defines and parses the command line arguments of the survey parsing script.
        -i/--inputcsv is required, -o/--out defaults to "cleaned_survey_data.csv" and
        -m/--mapping defaults to an empty string.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument("-i", "--inputcsv",
                        required=True,
                        type=str,
                        help='The Neoserra survey data input csv file. This script was tested on the satisfaction survey but it should work with other survey types given the Answers column is in the same format with answers between questions that start with a number and a period.')

    parser.add_argument("-o", "--out",
                        default="cleaned_survey_data.csv",
                        type=str,
                        help="The path to save the output CSV file to")

    parser.add_argument("-m", "--mapping",
                        required=False,
                        default="",
                        type=str,
                        help="A .json file to override the column mappings used by this program. See documentation for meanings.")

    return parser.parse_args(argv)
|
||||
|
||||
def make_survey_dataset(survey_df_path:str, col_neo_answers:str=NEOSERRA_COLUMNS.answers) -> pd.DataFrame:
    """
    parameters:
        survey_df_path:str - Path to the input survey CSV file
        col_neo_answers:str - The column name containing the combined questions and answers

    returns:
        pd.DataFrame - The survey data with, for the Nth question found in a row, a
        "Question N text" column (the question) and a "Question N" column (the answer)

    description:
        Parses the Neoserra survey export, splitting the multi-line "Answers" column into
        distinct columns for each question text and answer value.

        Within the answers text, every non-empty line that starts with digits followed by a
        period begins a new question. All lines up to the next such line (or the end of the text)
        are joined with single spaces and stored as that question's answer. Note that an answer
        line of the same shape (digits then a period) is therefore read as a new question.
    """
    survey_df = pd.read_csv(survey_df_path)

    # replace the neoserra center names with the real ones
    # (the return value is not used, so this presumably edits survey_df in place - TODO confirm)
    clean_center_name(survey_df)

    # Matches the "1." / "12." numbering (and any spaces after it) at the start of a question line
    question_prefix = re.compile(r'^\d+\.\s*')

    for row_index, row in survey_df.iterrows():
        # Remove empty lines and surrounding whitespace
        lines = [x.strip() for x in str(row[col_neo_answers]).split('\n') if x.strip()]

        # Positions of the lines that start a question
        question_indices = [i for i, line in enumerate(lines) if question_prefix.match(line)]

        # An answer ends where the next question starts, or at the end of the text
        answer_ends = question_indices[1:] + [len(lines)]

        for question_number, (q_idx, answer_end) in enumerate(zip(question_indices, answer_ends), start=1):
            # Strip the numbering by pattern instead of a fixed width, so "1.Text" and
            # "100. Text" keep their full question text
            question = question_prefix.sub('', lines[q_idx], count=1).strip()

            # Join all answer lines between this question and the next
            answer = ' '.join(lines[q_idx + 1:answer_end])

            # Assign to dataframe
            survey_df.at[row_index, f"Question {question_number} text"] = question
            survey_df.at[row_index, f"Question {question_number}"] = answer

    return survey_df
|
||||
|
||||
if __name__ == "__main__":
    # Command line entry point: parse the survey export at args.inputcsv and write the
    # result to args.out.
    args = parse_args()

    # Apply the optional JSON column mapping to both column sets before any column name is read
    if args.mapping:
        for column_set in (NEOSERRA_COLUMNS, OUT_COLUMNS):
            column_set.apply_json_mapping(args.mapping)

    # The answers column name is read here, after the mapping above has been applied
    answers_column = NEOSERRA_COLUMNS.answers
    out_df = make_survey_dataset(args.inputcsv, col_neo_answers=answers_column)
    out_df.to_csv(args.out, index=False)
|
||||
@@ -0,0 +1,476 @@
|
||||
# FILE: make_trainings_dataset.py
|
||||
# CREATED: 12/24/25
|
||||
# AUTHOR: Vincent Allen
|
||||
# CONTACT: vincent@vtallen.com valle276@live.kutztown.edu
|
||||
# PURPOSE:
|
||||
|
||||
# Script was created using logic extracted from a Jupyter notebook to clean training data and generate
|
||||
# per-center statistics.
|
||||
# This script allows for the generation of two datasets: a cleaned master list of trainings, and a
|
||||
# derived statistics table calculating attendee and event performance metrics per center.
|
||||
# Logic is preserved strictly from the original notebook cells.
|
||||
|
||||
# External libraries:
|
||||
from typing import Dict, Tuple, List, Union
|
||||
import pandas as pd
|
||||
|
||||
import plotly.express as px
|
||||
import numpy as np
|
||||
|
||||
# Python modules:
|
||||
import argparse
|
||||
import json
|
||||
import sys
|
||||
|
||||
# Custom libraries:
|
||||
# These libraries need to be installed from their git submodules into the venv used to execute this script
|
||||
# pip3 install -e <path to library folder>
|
||||
# If I'm gone, and you do not know how to do this,
|
||||
# just drag the python files into this folder and import the needed functions from those files and it should work. Or just copy the functions into this file
|
||||
from pasbdc_data_cleaning import clean_center_name # pyright:ignore
|
||||
from constants_module import NEOSERRA_COLUMNS, OUT_COLUMNS, TRAINING_COUNT_COLUMNS, Constants
|
||||
|
||||
# Constants carried over from the original notebook logic.

# Center values that apply_lead_office() rewrites to "Lead Office".
# The second entry differs from the first only by its leading space.
# NOTE(review): "Primary Training Topic" looks like a column header rather than a center
# name - confirm it belongs in this list.
LEAD_OFFICE_CENTERS = [
    "Pennsylvania SBDC Lead Office",
    " Pennsylvania SBDC Lead Office",
    "Southeast Pennsylvania APEX Accelerator",
    "Primary Training Topic",
    "State Small Business Credit Initiative (SSBCI)",
]

# Primary training topic values assigned by tag_first_steps(); the statistics code uses
# this list to include or exclude those events.
FIRST_STEPS_COLS = ["First Steps", "Next Steps"]
|
||||
|
||||
def parse_args(argv=None):
    """
    parameters:
        argv: list of str or None - The argument strings to parse. When None (the default)
            argparse reads the arguments from sys.argv.

    returns: argparse.Namespace - The parsed command line arguments from the argparse module
        (inputcsv, out, mapping, mode, fundingsources)

    description:
        Uses the python argparse module to parse the command line arguments for this application.
        Only -i/--inputcsv is required. --mode accepts "clean" (default) or "stats".
        --mapping and --fundingsources are None when they are not given.
    """

    parser = argparse.ArgumentParser()

    parser.add_argument("-i", "--inputcsv",
                        required=True,
                        help="The filename of the input raw trainings CSV file.")

    parser.add_argument("-o", "--out",
                        type=str,
                        default="cleaned_trainings_data.csv",
                        required=False,
                        help="The filename to write the output dataset to."
                        )

    parser.add_argument("-m", "--mapping",
                        type=str,
                        required=False,
                        help="Path to a JSON file to override default column names mappings.")

    # "-mode" (single dash) is kept as an accepted spelling alongside "--mode"
    parser.add_argument("-mode", "--mode",
                        type=str,
                        default="clean",
                        choices=["clean", "stats"],
                        required=False,
                        help="Select 'clean' to output the cleaned dataset, or 'stats' to output the per-center statistics table.")

    parser.add_argument("-f", "--fundingsources",
                        nargs='+',
                        default=None,
                        required=False,
                        help="List of funding sources to filter by for the statistics generation (e.g. -f SBA State). If omitted, all sources are used.")

    return parser.parse_args(argv)
|
||||
|
||||
def tag_first_steps(
    series,
    col_neo_event_title:str= NEOSERRA_COLUMNS.event_title,
    col_neo_primary_topic:str= NEOSERRA_COLUMNS.primary_training_topic
):
    """
    parameters:
        series: pd.Series - The row to operate on
        col_neo_event_title:str - Name of the Event Title column
        col_neo_primary_topic:str - Name of the Primary Training Topic column

    returns:
        pd.Series - The same series object that was passed in, modified when a keyword matched

    description:
        Checks the event title (case-insensitively) for keywords indicating it is a "First Steps"
        or "Next Steps" course and updates the Primary Training Topic accordingly.
        "First Steps" takes precedence when a title contains both keywords. A title that is not
        a string (for example a missing value) leaves the row unchanged.
    """
    title = series[col_neo_event_title]

    # Missing titles are not strings and have no .lower(); nothing to tag in that case
    if not isinstance(title, str):
        return series

    title = title.lower()

    # "first step" also matches "first steps"; "next step" also matches "next steps"
    # and "the next step", so one check per topic is enough
    if 'first step' in title:
        series[col_neo_primary_topic] = 'First Steps'
    elif 'next step' in title:
        series[col_neo_primary_topic] = 'Next Steps'

    return series
|
||||
|
||||
def apply_lead_office(
    series,
    col_neo_center:str= NEOSERRA_COLUMNS.center
):
    '''
    parameters:
        series: pd.Series - The row to operate on
        col_neo_center:str - Name of the Center column

    returns:
        pd.Series - The same series object that was passed in

    description:
        Replaces the center value with "Lead Office" when it is one of the names listed in
        LEAD_OFFICE_CENTERS. Any other value is left as it is.
    '''
    belongs_to_lead_office = series[col_neo_center] in LEAD_OFFICE_CENTERS

    # Guard clause: nothing to rename for every other center
    if not belongs_to_lead_office:
        return series

    series[col_neo_center] = "Lead Office"
    return series
|
||||
|
||||
def tag_pre_planning(
    series,
    col_neo_primary_topic:str= NEOSERRA_COLUMNS.primary_training_topic,
    col_neo_training_topics:str= NEOSERRA_COLUMNS.training_topics,
    col_is_preplanning:str= OUT_COLUMNS.is_preplanning
):
    """
    parameters:
        series: pd.Series - The row to operate on
        col_neo_primary_topic:str - Name of the Primary Training Topic column
        col_neo_training_topics:str - Name of the Training Topics column
        col_is_preplanning:str - Name of the new output boolean column

    returns:
        pd.Series - The same series object that was passed in, with the flag column set

    description:
        Determines if a training event is related to "Business Start-up/Preplanning" and stores
        the result as True/False in col_is_preplanning. The event is flagged when the primary
        topic equals that text or when the training topics contain it. Both checks ignore case.
    """
    key = 'Business Start-up/Preplanning'.lower()

    # Lower-case both sides: the key is lower case, so comparing it to the raw primary topic
    # could never match the normally cased value. str() also copes with missing values.
    primary_topic = str(series[col_neo_primary_topic]).lower()
    training_topics = str(series[col_neo_training_topics]).lower()

    series[col_is_preplanning] = primary_topic == key or key in training_topics

    return series
|
||||
|
||||
def generate_cleaned_trainings_dataset(
    trainings_df,
    funding_sources:Union[List[str], None] = None,
    col_neo_event_title:str= NEOSERRA_COLUMNS.event_title,
    col_neo_primary_topic:str= NEOSERRA_COLUMNS.primary_training_topic,
    col_neo_training_topics:str= NEOSERRA_COLUMNS.training_topics,
    col_neo_center:str= NEOSERRA_COLUMNS.center,
    col_is_preplanning:str= OUT_COLUMNS.is_preplanning,
    col_neo_attendees_total:str= NEOSERRA_COLUMNS.attendees_total,
    col_out_attendees_range:str= OUT_COLUMNS.attendees_range,
    col_neo_funding_source:Union[str, None] = None,
    **kwargs
) -> pd.DataFrame:
    """
    parameters:
        trainings_df:pd.DataFrame - The raw trainings data (already loaded, not a file path)
        funding_sources:List[str] - Funding sources to keep. When None (the default) the list
            ['Core Services', 'LEXNET', 'PDA', 'NAP'] is used
        col_neo_event_title:str - Column name for Event Title
        col_neo_primary_topic:str - Column name for Primary Training Topic
        col_neo_training_topics:str - Column name for Training Topics
        col_neo_center:str - Column name for Center
        col_is_preplanning:str - Column name for the Preplanning flag
        col_neo_attendees_total:str - Column name for Total Attendees
        col_out_attendees_range:str - Output column for the tagged attendees range of an event (1-5, 6-14, etc)
        col_neo_funding_source:str - Column name for the funding source. When None (the default)
            NEOSERRA_COLUMNS.funding_source is read at call time
        **kwargs: kwargs - Consumes any extra un-needed arguments

    returns:
        pd.DataFrame - The cleaned trainings dataframe, filtered to the funding sources and
        sorted by center

    description:
        Applies a sequence of cleaning operations to the training data including
        standardizing center names, tagging first/next steps, consolidating lead office names,
        flagging preplanning events and tagging the attendee range of each event.
    """
    # Resolve the defaults at call time: this avoids a shared mutable default list and keeps
    # picking up column names that were overridden after this module was imported
    if funding_sources is None:
        funding_sources = ['Core Services', 'LEXNET', 'PDA', 'NAP']
    if col_neo_funding_source is None:
        col_neo_funding_source = NEOSERRA_COLUMNS.funding_source

    print("Cleaning center names...")
    # The return value is not used, so this presumably edits trainings_df in place - TODO confirm
    clean_center_name(trainings_df)

    print("Tagging 'First Steps' and 'Next Steps'...")
    trainings_df = trainings_df.apply(
        tag_first_steps,
        axis=1,
        col_neo_event_title=col_neo_event_title,
        col_neo_primary_topic=col_neo_primary_topic
    )

    print("Consolidating Lead Office centers...")
    trainings_df = trainings_df.apply(
        apply_lead_office,
        axis=1,
        col_neo_center=col_neo_center
    )

    print("Tagging Pre-planning events...")
    trainings_df = trainings_df.apply(
        tag_pre_planning,
        axis=1,
        col_neo_primary_topic=col_neo_primary_topic,
        col_neo_training_topics=col_neo_training_topics,
        col_is_preplanning=col_is_preplanning
    )

    print("Tagging attendee ranges...")
    # Right-closed bins: (0, 5] -> '1-5', (5, 14] -> '6-14', ... An event with 0 attendees
    # falls outside every bin and gets no range label.
    bins = [0, 5, 14, 24, 49, 99, np.inf]
    labels = ['1-5', '6-14', '15-24', '25-49', '50-99', '100+']

    trainings_df[col_out_attendees_range] = pd.cut(trainings_df[col_neo_attendees_total], bins=bins, labels=labels, right=True)

    # Filter for the funding sources that we want
    trainings_df = trainings_df[trainings_df[col_neo_funding_source].isin(funding_sources)]

    # Sort the dataframe alphabetically by the center to ensure it shows up properly in visualizations
    trainings_df = trainings_df.sort_values(col_neo_center, ascending=True)
    return trainings_df #pyright:ignore
|
||||
|
||||
def generate_center_trainings_count_statistics(
    full_df: pd.DataFrame,
    filtered_df: pd.DataFrame,
    funding_source_group: List[str],
    col_neo_primary_topic:str= NEOSERRA_COLUMNS.primary_training_topic,
    col_neo_center:str= NEOSERRA_COLUMNS.center,
    col_neo_funding_source:str= NEOSERRA_COLUMNS.funding_source,
    col_neo_attendees_total:str= NEOSERRA_COLUMNS.attendees_total,
    col_neo_program_format:str= NEOSERRA_COLUMNS.program_format,
    **kwargs
) -> pd.DataFrame:
    """
    :param full_df: pd.DataFrame - The cleaned network wide trainings data
    :param filtered_df: pd.DataFrame - The same kind of data with the desired filter already applied (ex. 0 attendee events)
    :param funding_source_group: List[str] - Only rows whose funding source is in this list are counted, in both inputs
    :param col_neo_primary_topic: str - Column holding the primary training topic
    :param col_neo_center: str - Column holding the center
    :param col_neo_funding_source: str - Column holding the funding source
    :param col_neo_attendees_total: str - Column holding the total attendees count
    :param col_neo_program_format: str - Column holding the program format
    :param kwargs: Any extra keyword arguments are accepted and ignored

    :return: pd.DataFrame - One row per distinct center value found in full_df

    description:
        For every center this compares the "selected" events (rows of filtered_df) with the full data.
        "Total ..." columns are computed from full_df, "Selected ..." columns from filtered_df, each
        restricted to the center and the funding source group.

        Every percent column is a fraction (not multiplied by 100) of the NETWORK WIDE total of
        full_df for the funding source group: attendee columns are divided by the network attendee
        total, event and on-demand columns by the network event total. When that total is not
        greater than 0 the percent columns are 0.

        "No First Steps" leaves out events whose primary topic is in FIRST_STEPS_COLS,
        "No Preplanning" additionally leaves out the "Business Start-up/Preplanning" topic and
        "First Steps and Preplanning" keeps only events with one of those topics.
        The output column names come from TRAINING_COUNT_COLUMNS.
    """
    preplanning_topic = "Business Start-up/Preplanning"

    def share_of(part, whole):
        # Fraction of the network wide total, or 0 when there is nothing to divide by
        return part / whole if whole > 0 else 0

    def attendees_in(frame):
        # Sum of the attendee column over the given rows
        return frame[col_neo_attendees_total].sum()

    # --- STEP 1: NETWORK WIDE DENOMINATORS ---
    global_all_network = full_df[full_df[col_neo_funding_source].isin(funding_source_group)]

    full_network_event_total = global_all_network.shape[0]
    print(f"global_all_network event count: {full_network_event_total}")
    full_network_attendee_total = global_all_network[col_neo_attendees_total].sum()
    print(f"global_all_network attendee count: {full_network_attendee_total }")

    # --- STEP 2: ONE ROW OF STATISTICS PER CENTER ---
    stats_rows = []

    for center_name in full_df[col_neo_center].unique():
        center_all = global_all_network[global_all_network[col_neo_center] == center_name]
        center_selected = filtered_df[
            (filtered_df[col_neo_center] == center_name)
            & (filtered_df[col_neo_funding_source].isin(funding_source_group))
        ]

        # Row masks over the selected events, computed once and reused below
        topic = center_selected[col_neo_primary_topic]
        is_first_steps = topic.isin(FIRST_STEPS_COLS) #pyright:ignore
        not_first = ~is_first_steps
        not_first_not_pre = not_first & (topic != preplanning_topic)
        first_or_pre = is_first_steps | (topic == preplanning_topic)

        on_demand_value = Constants.ON_DEMAND_VALUE.value
        selected_is_ondemand = center_selected[col_neo_program_format] == on_demand_value

        # --- ATTENDEES ---
        total_attendees = attendees_in(center_all)
        selected_attendees = attendees_in(center_selected)
        selected_attendees_nofirst = attendees_in(center_selected[not_first])
        selected_attendees_nofirst_nopre = attendees_in(center_selected[not_first_not_pre])
        selected_attendees_first_pre = attendees_in(center_selected[first_or_pre])

        # --- EVENTS ---
        total_events = len(center_all)
        selected_events = len(center_selected)
        selected_events_nofirst = len(center_selected[not_first])
        selected_events_nofirst_nopre = len(center_selected[not_first_not_pre])
        selected_events_first_pre = len(center_selected[first_or_pre])

        # --- ONDEMAND ---
        total_ondemand = len(center_all[center_all[col_neo_program_format] == on_demand_value])
        selected_ondemand = len(center_selected[selected_is_ondemand])
        selected_ondemand_nofirst = len(center_selected[selected_is_ondemand & not_first])
        selected_ondemand_nofirst_nopre = len(center_selected[selected_is_ondemand & not_first_not_pre])

        # The key order below is the column order of the returned dataframe
        stats_rows.append({
            TRAINING_COUNT_COLUMNS.CENTER: center_name,
            TRAINING_COUNT_COLUMNS.TOTAL_EVENTS: total_events,
            TRAINING_COUNT_COLUMNS.SELECTED_EVENTS: selected_events,
            TRAINING_COUNT_COLUMNS.PERCENT_SELECTED_EVENTS: share_of(selected_events, full_network_event_total),

            TRAINING_COUNT_COLUMNS.TOTAL_ATTENDEES: total_attendees,
            TRAINING_COUNT_COLUMNS.SELECTED_ATTENDEES: selected_attendees,
            TRAINING_COUNT_COLUMNS.PERCENT_SELECTED_ATTENDEES: share_of(selected_attendees, full_network_attendee_total),

            TRAINING_COUNT_COLUMNS.SELECTED_ATTENDEES_NO_FIRST_STEPS: selected_attendees_nofirst,
            TRAINING_COUNT_COLUMNS.PERCENT_SELECTED_ATTENDEES_NO_FIRST_STEPS: share_of(selected_attendees_nofirst, full_network_attendee_total),
            TRAINING_COUNT_COLUMNS.SELECTED_ATTENDEES_NO_FIRST_STEPS_NO_PREPLANNING: selected_attendees_nofirst_nopre,
            TRAINING_COUNT_COLUMNS.PERCENT_SELECTED_ATTENDEES_NO_FIRST_STEPS_NO_PREPLANNING: share_of(selected_attendees_nofirst_nopre, full_network_attendee_total),

            TRAINING_COUNT_COLUMNS.SELECTED_ATTENDEES_FIRST_STEPS_AND_PREPLANNING: selected_attendees_first_pre,
            TRAINING_COUNT_COLUMNS.PERCENT_SELECTED_ATTENDEES_FIRST_STEPS_AND_PREPLANNING: share_of(selected_attendees_first_pre, full_network_attendee_total),
            TRAINING_COUNT_COLUMNS.SELECTED_EVENTS_FIRST_STEPS_AND_PREPLANNING: selected_events_first_pre,
            TRAINING_COUNT_COLUMNS.PERCENT_SELECTED_EVENTS_FIRST_STEPS_AND_PREPLANNING: share_of(selected_events_first_pre, full_network_event_total),

            TRAINING_COUNT_COLUMNS.SELECTED_EVENTS_NO_FIRST_STEPS: selected_events_nofirst,
            TRAINING_COUNT_COLUMNS.PERCENT_SELECTED_EVENTS_NO_FIRST_STEPS: share_of(selected_events_nofirst, full_network_event_total),
            TRAINING_COUNT_COLUMNS.SELECTED_EVENTS_NO_PREPLANNING_NO_FIRST_STEPS: selected_events_nofirst_nopre,
            TRAINING_COUNT_COLUMNS.PERCENT_SELECTED_EVENTS_NO_PREPLANNING_NO_FIRST_STEPS: share_of(selected_events_nofirst_nopre, full_network_event_total),

            TRAINING_COUNT_COLUMNS.TOTAL_ONDEMAND: total_ondemand,
            TRAINING_COUNT_COLUMNS.SELECTED_TOTAL_ONDEMAND: selected_ondemand,
            TRAINING_COUNT_COLUMNS.PERCENT_SELECTED_ONDEMAND: share_of(selected_ondemand, full_network_event_total),

            TRAINING_COUNT_COLUMNS.SELECTED_TOTAL_ONDEMAND_NO_FIRST_STEPS: selected_ondemand_nofirst,
            TRAINING_COUNT_COLUMNS.SELECTED_PERCENT_ONDEMAND_NO_FIRST_STEPS: share_of(selected_ondemand_nofirst, full_network_event_total),

            TRAINING_COUNT_COLUMNS.SELECTED_TOTAL_ONDEMAND_NO_PREPLANNING_NO_FIRST_STEPS: selected_ondemand_nofirst_nopre,
            TRAINING_COUNT_COLUMNS.SELECTED_PERCENT_ONDEMAND_NO_PREPLANNING_NO_FIRST_STEPS: share_of(selected_ondemand_nofirst_nopre, full_network_event_total),
        })

    return pd.DataFrame(stats_rows)
|
||||
|
||||
if __name__ == "__main__":
    # Command-line entry point.
    #
    # Reads the trainings CSV given by ``args.inputcsv``, runs the cleaning
    # step, and then either writes the cleaned dataset (``clean`` mode) or a
    # per-center statistics table (``stats`` mode) to ``args.out``.
    # Any failure handled here is reported on stderr and exits with status 1.
    args = parse_args()

    # Fail fast: reject an unsupported mode before any input is read or
    # cleaned, instead of discovering it after all the work has been done.
    if args.mode not in ("clean", "stats"):
        print(f"Invalid mode selected: {args.mode}", file=sys.stderr)
        sys.exit(1)

    # Optional user-supplied JSON file that overrides the column names.
    if args.mapping:
        try:
            # Opening the file first surfaces a missing or unreadable path as
            # an error handled below. The handle itself is not needed because
            # apply_json_mapping is given the path, not the file object.
            with open(args.mapping, 'r'):
                NEOSERRA_COLUMNS.apply_json_mapping(args.mapping)
                OUT_COLUMNS.apply_json_mapping(args.mapping)
        except Exception as e:
            # Top-level boundary: report whatever went wrong and stop.
            print(
                f'Failed to load user column configuration JSON file, got={e}',
                file=sys.stderr,
            )
            sys.exit(1)

    # Perform the basic cleaning (required for both modes).
    trainings_df = pd.read_csv(args.inputcsv)
    cleaned_trainings_df = generate_cleaned_trainings_dataset(
        trainings_df,
        col_neo_event_title=NEOSERRA_COLUMNS.event_title,
        col_neo_primary_topic=NEOSERRA_COLUMNS.primary_training_topic,
        col_neo_training_topics=NEOSERRA_COLUMNS.training_topics,
        col_neo_center=NEOSERRA_COLUMNS.center,
        col_is_preplanning=OUT_COLUMNS.is_preplanning,
        col_neo_attendees_total=NEOSERRA_COLUMNS.attendees_total,
        col_out_attendees_range=OUT_COLUMNS.attendees_range,
    )

    if args.mode == "clean":
        print(f"Writing cleaned dataset to {args.out}")
        cleaned_trainings_df.to_csv(args.out, index=False)
    else:
        # args.mode == "stats" (the only other value accepted above).
        # Determine funding sources to use.
        funding_sources = args.fundingsources
        if funding_sources is None:
            # If not provided, use every distinct funding source value found
            # in the cleaned data.
            print("No funding sources specified via -f, using all available funding sources in dataset.")
            funding_sources = list(
                cleaned_trainings_df[NEOSERRA_COLUMNS.funding_source].unique()
            )

        print(f"Generating statistics for funding sources: {funding_sources}")

        # The cleaned dataset is passed as both the full and the filtered
        # frame; no additional filtering is applied from the command line.
        stats_df = generate_center_trainings_count_statistics(
            full_df=cleaned_trainings_df,
            filtered_df=cleaned_trainings_df,
            funding_source_group=funding_sources,
            col_neo_primary_topic=NEOSERRA_COLUMNS.primary_training_topic,
            col_neo_center=NEOSERRA_COLUMNS.center,
            col_neo_funding_source=NEOSERRA_COLUMNS.funding_source,
            col_neo_attendees_total=NEOSERRA_COLUMNS.attendees_total,
            col_neo_program_format=NEOSERRA_COLUMNS.program_format,
        )

        print(f"Writing statistics table to {args.out}")
        stats_df.to_csv(args.out, index=False)
|
||||
Reference in New Issue
Block a user