first commit

This commit is contained in:
2026-05-21 08:40:24 -04:00
commit b084545275
711 changed files with 3659856 additions and 0 deletions

View File

@@ -0,0 +1,104 @@
from typing import List, Tuple
import pandas as pd
import streamlit as st
from constants_module import OUT_COLUMNS, NEOSERRA_COLUMNS
from section_1_datasets_module import generate_client_list_dataset, get_pa_naics_data, get_bls_naics11_data, get_bls_naics92_data, create_naics_census_percentage_table, make_county_naics_dataset
from shared_tools_module import csv_url_to_dataframe
from cached_function_wrappers.shared import cached_csv_url_to_dataframe
from pasbdc_data_cleaning import clean_center_name, remove_duplicate_client_records, remove_api_testing_clients
@st.cache_data
def cached_get_pa_naics_source_data(census_year:str) -> pd.DataFrame:
    """Streamlit-cached wrapper around ``get_pa_naics_data``.

    Args:
        census_year: Year forwarded unchanged to ``get_pa_naics_data``.

    Returns:
        Whatever ``get_pa_naics_data`` returns for ``census_year``.
    """
    pa_naics_df = get_pa_naics_data(census_year)
    return pa_naics_df
@st.cache_data
def cached_get_bls_naics11_data(usda_api_key:str, census_year:str) -> pd.DataFrame:
    """Streamlit-cached wrapper around ``get_bls_naics11_data``.

    Args:
        usda_api_key: Passed to the underlying function as ``api_key``.
        census_year: Passed to the underlying function as ``year``.

    Returns:
        Whatever ``get_bls_naics11_data`` returns for these arguments.
    """
    naics11_df = get_bls_naics11_data(
        api_key=usda_api_key,
        year=census_year,
    )
    return naics11_df
@st.cache_data
def cached_get_bls_naics92_data(census_year:str) -> pd.DataFrame:
    """Streamlit-cached wrapper around ``get_bls_naics92_data``.

    Args:
        census_year: Passed to the underlying function as ``year``.

    Returns:
        Whatever ``get_bls_naics92_data`` returns for that year.
    """
    naics92_df = get_bls_naics92_data(year=census_year)
    return naics92_df
@st.cache_data
def filter_df_by_naics_codes(in_df:pd.DataFrame, naics_codes: List[int]) -> pd.DataFrame:
    """Keep only the rows whose ``OUT_COLUMNS.naics_2`` value is in ``naics_codes``.

    Args:
        in_df: Frame containing the ``OUT_COLUMNS.naics_2`` column.
        naics_codes: Codes to keep.

    Returns:
        The boolean-indexed subset of ``in_df``.
    """
    keep_mask = in_df[OUT_COLUMNS.naics_2].isin(naics_codes)
    return in_df[keep_mask]
def cached_create_naics_census_percentage_table(usda_api_key:str, census_year:str):
    """Build the NAICS census percentage table from the three cached sources.

    This function is not itself decorated with ``st.cache_data``; it only
    composes the cached fetchers and hands their results to
    ``create_naics_census_percentage_table``.

    Args:
        usda_api_key: Key forwarded to ``cached_get_bls_naics11_data``.
        census_year: Year forwarded to all three source fetchers.

    Returns:
        The result of ``create_naics_census_percentage_table``.
    """
    # Output column names expected by the table builder.
    column_names = {
        "col_bls_industry": OUT_COLUMNS.bls_industry,
        "col_bls_estab": OUT_COLUMNS.bls_estab,
        "col_usda_value": OUT_COLUMNS.usda_value,
        "col_unified_naics": OUT_COLUMNS.unified_naics,
        "col_census_estab": OUT_COLUMNS.census_estab,
        "col_census_pct": OUT_COLUMNS.census_pct,
        "col_naics_label": OUT_COLUMNS.naics_label,
        "col_census_naics": OUT_COLUMNS.census_naics,
    }

    # Fetch order matches the original: census, then NAICS 11, then NAICS 92.
    census_source = cached_get_pa_naics_source_data(census_year=census_year)
    naics11_source = cached_get_bls_naics11_data(
        usda_api_key=usda_api_key,
        census_year=census_year,
    )
    naics92_source = cached_get_bls_naics92_data(census_year=census_year)

    return create_naics_census_percentage_table(
        df_naics_census=census_source,
        df_naics_11=naics11_source,
        df_naics_92=naics92_source,
        **column_names,
    )
@st.cache_data
def cached_generate_client_naics_dataset(export_module_url, usda_api_key, census_year, centers) -> pd.DataFrame:
    """Build the cleaned client list joined with NAICS census percentages.

    Steps: load the export CSV, run ``clean_center_name`` on it, restrict to
    the requested centers, build the NAICS percentage table, generate the
    client list dataset, then remove duplicate and API-testing client records.

    Args:
        export_module_url: URL of the client export CSV.
        usda_api_key: Key forwarded to the NAICS percentage table builder.
        census_year: Year forwarded to the NAICS percentage table builder.
        centers: Center names to keep (matched on ``NEOSERRA_COLUMNS.center``).

    Returns:
        The de-duplicated client list dataset.
    """
    clients = cached_csv_url_to_dataframe(export_module_url).copy()
    # Return value is ignored, as in the original call; presumably this
    # normalizes center names in place -- TODO confirm.
    clean_center_name(clients)

    in_requested_centers = clients[NEOSERRA_COLUMNS.center].isin(centers)
    clients = clients[in_requested_centers]

    census_pct_table = cached_create_naics_census_percentage_table(usda_api_key, census_year)

    dataset = generate_client_list_dataset(
        naics_df=census_pct_table,
        df_client_list=clients,
        col_unified_naics=OUT_COLUMNS.unified_naics,
        col_census_pct=OUT_COLUMNS.census_pct,
        col_naics_2=OUT_COLUMNS.naics_2,
        col_pa_naics_pct=OUT_COLUMNS.pa_naics_pct,
        col_pasbdc_pct=OUT_COLUMNS.pasbdc_pct,
        col_neo_primary_naics=NEOSERRA_COLUMNS.primary_naics,
        col_neo_naics=NEOSERRA_COLUMNS.naics,
        # The export module output only exposes the primary NAICS column
        # (Neoserra does not include the secondary NAICS list there), so
        # tell the builder to skip the secondary NAICS handling.
        bypass_secondary_naics_list=True,
    )

    return remove_api_testing_clients(remove_duplicate_client_records(dataset))
@st.cache_data
def cached_get_county_dataset(client_list_export_module_url:str, usda_api_key:str, census_year:str, centers:List[str]) -> pd.DataFrame:
    """Build the per-county NAICS dataset from the cleaned client list.

    Args:
        client_list_export_module_url: URL of the client export CSV.
        usda_api_key: Key forwarded to ``cached_generate_client_naics_dataset``.
        census_year: Year forwarded to ``cached_generate_client_naics_dataset``.
        centers: Center names forwarded to ``cached_generate_client_naics_dataset``.

    Returns:
        The result of ``make_county_naics_dataset`` for the client list.
    """
    clients = cached_generate_client_naics_dataset(
        client_list_export_module_url,
        usda_api_key,
        census_year,
        centers,
    )

    # Column names consumed and produced by the county aggregation.
    county_columns = dict(
        col_out_county=OUT_COLUMNS.county,
        col_out_fips=OUT_COLUMNS.fips,
        col_out_unique=OUT_COLUMNS.unique_valid_naics,
        col_out_missing=OUT_COLUMNS.missing_naics,
        col_out_total=OUT_COLUMNS.total_clients,
        col_out_pct_missing=OUT_COLUMNS.pct_missing_naics,
        col_neo_county=NEOSERRA_COLUMNS.physical_address_county,
        col_naics_2=OUT_COLUMNS.naics_2,
        col_out_of_state=OUT_COLUMNS.county_out_of_state,
    )
    return make_county_naics_dataset(clients, **county_columns)

View File

@@ -0,0 +1,29 @@
from typing import List
import streamlit as st
from .shared import cached_csv_url_to_dataframe, remove_duplicate_client_records
from milestone_attribution_dataset_module import sanitize_funding_data
from constants_module import NEOSERRA_COLUMNS, OUT_COLUMNS
@st.cache_data
def cached_sanitize_funding_data(export_url:str, reportable_only:bool, allowed_centers:List[str] | None = None):
    """Load, sanitize, de-duplicate and optionally filter the funding export.

    Args:
        export_url: URL of the funding export CSV.
        reportable_only: When true, keep only rows whose
            ``NEOSERRA_COLUMNS.reportable`` value equals 1.
        allowed_centers: When not ``None``, keep only rows whose
            ``NEOSERRA_COLUMNS.center`` is in this list.

    Returns:
        The filtered funding frame.
    """
    sanitized = sanitize_funding_data(
        cached_csv_url_to_dataframe(export_url),
        col_neo_attribution_source=NEOSERRA_COLUMNS.milestone_attribution_source,
        col_neo_affirmation=NEOSERRA_COLUMNS.milestone_affirmation,
        col_out_documentation_level=OUT_COLUMNS.milestone_documentation_level,
        col_neo_center=NEOSERRA_COLUMNS.center,
    )
    result = remove_duplicate_client_records(sanitized)

    if allowed_centers is not None:
        center_mask = result[NEOSERRA_COLUMNS.center].isin(allowed_centers)
        result = result[center_mask]

    if reportable_only:
        reportable_mask = result[NEOSERRA_COLUMNS.reportable] == 1
        result = result[reportable_mask]

    return result

View File

@@ -0,0 +1,38 @@
from typing import List
import streamlit as st
from constants_module import NEOSERRA_COLUMNS, OUT_COLUMNS
from milestone_attribution_dataset_module import sanitize_nbs_data
from cached_function_wrappers.shared import cached_csv_url_to_dataframe
from pasbdc_data_cleaning import remove_duplicate_client_records
@st.cache_data
def cached_get_nbs_data(export_url:str, reportable_only:bool, allowed_centers:List[str] | None = None):
    """Load, sanitize, de-duplicate and optionally filter the NBS export.

    Args:
        export_url: URL of the export CSV handed to ``sanitize_nbs_data``.
        reportable_only: When true, keep only rows whose
            ``NEOSERRA_COLUMNS.reportable`` value equals 1.
        allowed_centers: When not ``None``, keep only rows whose
            ``NEOSERRA_COLUMNS.center`` is in this list.

    Returns:
        The filtered frame.
    """
    sanitized = sanitize_nbs_data(
        cached_csv_url_to_dataframe(export_url),
        col_neo_center=NEOSERRA_COLUMNS.center,
        col_neo_client_id=NEOSERRA_COLUMNS.client_id,
        col_neo_milestone_date=NEOSERRA_COLUMNS.milestone_date,
        col_neo_attribution_date=NEOSERRA_COLUMNS.attribution_date,
        col_neo_attribution_source=NEOSERRA_COLUMNS.milestone_attribution_source,
        col_neo_affirmation=NEOSERRA_COLUMNS.milestone_affirmation,
        col_neo_milestone_type=NEOSERRA_COLUMNS.milestone_type_name,
        col_out_documentation_level=OUT_COLUMNS.milestone_documentation_level,
        col_neo_reportable=NEOSERRA_COLUMNS.reportable,
        business_start_impact_val=NEOSERRA_COLUMNS.business_start_impact_val,
        business_established_val=NEOSERRA_COLUMNS.business_established_val,
    )
    result = remove_duplicate_client_records(sanitized)

    if allowed_centers is not None:
        center_mask = result[NEOSERRA_COLUMNS.center].isin(allowed_centers)
        result = result[center_mask]

    if reportable_only:
        reportable_mask = result[NEOSERRA_COLUMNS.reportable] == 1
        result = result[reportable_mask]

    return result

View File

@@ -0,0 +1,20 @@
import numpy as np
import pandas as pd
import streamlit as st

from constants_module import NEOSERRA_COLUMNS, OUT_COLUMNS
from pasbdc_data_cleaning import clean_center_name, remove_duplicate_client_records, remove_api_testing_clients
from shared_tools_module import csv_url_to_dataframe
@st.cache_data
def cached_csv_url_to_dataframe(export_module_url) -> pd.DataFrame:
    """Streamlit-cached wrapper around ``csv_url_to_dataframe``.

    Args:
        export_module_url: URL forwarded unchanged to ``csv_url_to_dataframe``.

    Returns:
        Whatever ``csv_url_to_dataframe`` returns for that URL.
    """
    export_df = csv_url_to_dataframe(export_module_url)
    return export_df
@st.cache_data
def get_df_centers(export_module_url:str) -> np.ndarray | pd.api.extensions.ExtensionArray:
    """Return the distinct center names found in the client export.

    The export is loaded, passed through ``clean_center_name``, stripped of
    duplicate and API-testing client records, and then reduced to the unique
    values of the ``NEOSERRA_COLUMNS.center`` column.

    Args:
        export_module_url: URL of the client export CSV.

    Returns:
        The array produced by ``Series.unique()`` on the center column. This
        is an array, not a DataFrame, which is what the previous return
        annotation claimed.
    """
    raw_client_df = csv_url_to_dataframe(export_module_url).copy()
    # Return value is ignored, as in the original call; presumably this
    # normalizes center names in place -- TODO confirm.
    clean_center_name(raw_client_df)
    raw_client_df = remove_duplicate_client_records(raw_client_df)
    raw_client_df = remove_api_testing_clients(raw_client_df)
    return raw_client_df[NEOSERRA_COLUMNS.center].unique()

View File

@@ -0,0 +1,61 @@
from typing import List
import datetime
from .shared import cached_csv_url_to_dataframe
from section_1_datasets_module import generate_cleaned_trainings_dataset, generate_center_trainings_count_statistics
from constants_module import NEOSERRA_COLUMNS, OUT_COLUMNS, Constants
import pandas as pd
import streamlit as st
@st.cache_data
def cached_generate_center_trainings_count_statistics(export_url:str, reportable_only:bool, include_future_events:bool, include_on_demand:bool, allowed_centers:List[str] | None = None):
    """Compute per-center training count statistics from the trainings export.

    The cleaned trainings dataset is built with the same filter arguments,
    then passed to ``generate_center_trainings_count_statistics`` together
    with the subset of rows whose total attendee count is zero.

    Args:
        export_url: URL of the trainings export CSV.
        reportable_only: Forwarded to the cleaned-dataset builder.
        include_future_events: Forwarded to the cleaned-dataset builder.
        include_on_demand: Forwarded to the cleaned-dataset builder.
        allowed_centers: Forwarded to the cleaned-dataset builder.

    Returns:
        The result of ``generate_center_trainings_count_statistics``.
    """
    cleaned_df = cached_generate_cleaned_trainings_dataset(
        export_url,
        reportable_only=reportable_only,
        allowed_centers=allowed_centers,
        include_future_events=include_future_events,
        include_on_demand=include_on_demand,
    )

    # Non-numeric or missing attendee totals are coerced to NaN and then
    # treated as zero.
    attendee_counts = pd.to_numeric(
        cleaned_df[NEOSERRA_COLUMNS.attendees_total],
        errors='coerce',
    ).fillna(0)
    has_no_attendees = attendee_counts == 0

    return generate_center_trainings_count_statistics(
        full_df=cleaned_df,
        filtered_df=cleaned_df[has_no_attendees],
        funding_source_group=['Core Services', 'LEXNET', 'PDA', 'NAP'],
        col_primary_topic=NEOSERRA_COLUMNS.primary_training_topic,
        col_center=NEOSERRA_COLUMNS.center,
        col_funding_source=NEOSERRA_COLUMNS.funding_source,
        col_attendees_total=NEOSERRA_COLUMNS.attendees_total,
        col_is_preplanning=OUT_COLUMNS.is_preplanning,
    )
@st.cache_data
def cached_generate_cleaned_trainings_dataset(export_url:str, reportable_only:bool, include_future_events:bool, include_on_demand:bool, allowed_centers:List[str] | None = None):
    """Load, clean and filter the trainings export.

    Args:
        export_url: URL of the trainings export CSV.
        reportable_only: When true, keep only rows whose
            ``NEOSERRA_COLUMNS.reportable`` value equals 1.
        include_future_events: When false, drop rows whose start date is
            today or later.
        include_on_demand: When false, drop rows whose
            ``NEOSERRA_COLUMNS.program_format`` equals
            ``Constants.ON_DEMAND_VALUE.value``.
        allowed_centers: When not ``None``, keep only rows whose
            ``NEOSERRA_COLUMNS.center`` is in this list.

    Returns:
        The filtered trainings frame, with ``NEOSERRA_COLUMNS.start_date``
        converted to datetime.

    Raises:
        ValueError: Propagated from ``pd.to_datetime`` when a start date does
            not match the ``%m/%d/%Y`` format.
    """
    trainings_df = cached_csv_url_to_dataframe(export_url)
    trainings_df = generate_cleaned_trainings_dataset(
        trainings_df,
        col_neo_event_title=NEOSERRA_COLUMNS.event_title,
        col_neo_primary_topic=NEOSERRA_COLUMNS.primary_training_topic,
        col_neo_training_topics=NEOSERRA_COLUMNS.training_topics,
        col_neo_center=NEOSERRA_COLUMNS.center,
        col_is_preplanning=OUT_COLUMNS.is_preplanning,
        col_neo_attendees_total=NEOSERRA_COLUMNS.attendees_total,
        col_out_attendees_range=OUT_COLUMNS.attendees_range,
    )

    if allowed_centers is not None:
        trainings_df = trainings_df[trainings_df[NEOSERRA_COLUMNS.center].isin(allowed_centers)]
    if reportable_only:
        trainings_df = trainings_df[trainings_df[NEOSERRA_COLUMNS.reportable] == 1]

    # The frame may now be a boolean-indexed slice of another frame. Take an
    # explicit copy before writing a column so the assignment below is not a
    # chained assignment (SettingWithCopyWarning on pandas without
    # Copy-on-Write).
    trainings_df = trainings_df.copy()

    # Parse the start date so it can be compared against today's date.
    trainings_df[NEOSERRA_COLUMNS.start_date] = pd.to_datetime(
        trainings_df[NEOSERRA_COLUMNS.start_date],
        format="%m/%d/%Y",
    )

    if not include_future_events:
        # NOTE(review): this runs inside an st.cache_data function, so "today"
        # is whatever it was when the cache entry was populated -- confirm the
        # cache lifetime is acceptable for this filter.
        today = datetime.date.today()
        trainings_df = trainings_df[trainings_df[NEOSERRA_COLUMNS.start_date].dt.date < today]

    if not include_on_demand:
        trainings_df = trainings_df[trainings_df[NEOSERRA_COLUMNS.program_format] != Constants.ON_DEMAND_VALUE.value]

    return trainings_df