first commit

2026-05-21 08:40:24 -04:00
commit b084545275
711 changed files with 3659856 additions and 0 deletions
--- a/streamlit_dashboard/cached_function_wrappers/__init__.py
+++ b/streamlit_dashboard/cached_function_wrappers/__init__.py
--- a/streamlit_dashboard/cached_function_wrappers/__pycache__/__init__.cpython-312.pyc
+++ b/streamlit_dashboard/cached_function_wrappers/__pycache__/__init__.cpython-312.pyc
--- a/streamlit_dashboard/cached_function_wrappers/__pycache__/client_list_cached_functions.cpython-312.pyc
+++ b/streamlit_dashboard/cached_function_wrappers/__pycache__/client_list_cached_functions.cpython-312.pyc
--- a/streamlit_dashboard/cached_function_wrappers/__pycache__/funding_milestones_cached_functions.cpython-312.pyc
+++ b/streamlit_dashboard/cached_function_wrappers/__pycache__/funding_milestones_cached_functions.cpython-312.pyc
--- a/streamlit_dashboard/cached_function_wrappers/__pycache__/nbs_cached_functions.cpython-312.pyc
+++ b/streamlit_dashboard/cached_function_wrappers/__pycache__/nbs_cached_functions.cpython-312.pyc
--- a/streamlit_dashboard/cached_function_wrappers/__pycache__/shared.cpython-312.pyc
+++ b/streamlit_dashboard/cached_function_wrappers/__pycache__/shared.cpython-312.pyc
--- a/streamlit_dashboard/cached_function_wrappers/__pycache__/trainings_cached_functions.cpython-312.pyc
+++ b/streamlit_dashboard/cached_function_wrappers/__pycache__/trainings_cached_functions.cpython-312.pyc
--- a/streamlit_dashboard/cached_function_wrappers/client_list_cached_functions.py
+++ b/streamlit_dashboard/cached_function_wrappers/client_list_cached_functions.py
@@ -0,0 +1,104 @@
+from typing import List, Tuple
+
+import pandas as pd
+import streamlit as st
+
+from constants_module import OUT_COLUMNS, NEOSERRA_COLUMNS
+from section_1_datasets_module import generate_client_list_dataset, get_pa_naics_data, get_bls_naics11_data, get_bls_naics92_data, create_naics_census_percentage_table, make_county_naics_dataset
+from shared_tools_module import csv_url_to_dataframe
+from cached_function_wrappers.shared import cached_csv_url_to_dataframe
+from pasbdc_data_cleaning import clean_center_name, remove_duplicate_client_records, remove_api_testing_clients
+
+
+@st.cache_data
+def cached_get_pa_naics_source_data(census_year:str) -> pd.DataFrame:
+    return get_pa_naics_data(census_year)
+
+@st.cache_data
+def cached_get_bls_naics11_data(usda_api_key:str, census_year:str) -> pd.DataFrame:
+    return get_bls_naics11_data(api_key=usda_api_key, year=census_year)
+
+@st.cache_data
+def cached_get_bls_naics92_data(census_year:str) -> pd.DataFrame:
+    return get_bls_naics92_data(year=census_year)
+
+@st.cache_data
+def filter_df_by_naics_codes(in_df:pd.DataFrame, naics_codes: List[int]) -> pd.DataFrame:
+    return in_df[
+        in_df[OUT_COLUMNS.naics_2].isin(naics_codes)
+    ]
+
+def cached_create_naics_census_percentage_table(usda_api_key:str, census_year:str):
+    df_naics_census = cached_get_pa_naics_source_data(census_year=census_year)
+    df_naics_11 = cached_get_bls_naics11_data(usda_api_key=usda_api_key, census_year=census_year)
+    df_naics_92 = cached_get_bls_naics92_data(census_year=census_year)
+
+    census_table = create_naics_census_percentage_table(
+        df_naics_census=df_naics_census,
+        df_naics_11=df_naics_11,
+        df_naics_92=df_naics_92,
+        col_bls_industry = OUT_COLUMNS.bls_industry,
+        col_bls_estab = OUT_COLUMNS.bls_estab,
+        col_usda_value = OUT_COLUMNS.usda_value,
+        col_unified_naics = OUT_COLUMNS.unified_naics,
+        col_census_estab = OUT_COLUMNS.census_estab,
+        col_census_pct = OUT_COLUMNS.census_pct,
+        col_naics_label = OUT_COLUMNS.naics_label,
+        col_census_naics = OUT_COLUMNS.census_naics
+    )
+
+    return census_table
+
+@st.cache_data
+def cached_generate_client_naics_dataset(export_module_url, usda_api_key, census_year, centers) -> pd.DataFrame:
+    raw_client_df = cached_csv_url_to_dataframe(export_module_url).copy()
+    clean_center_name(raw_client_df)
+
+    raw_client_df = raw_client_df[raw_client_df[NEOSERRA_COLUMNS.center].isin(centers)]
+
+    naics_df = cached_create_naics_census_percentage_table(usda_api_key, census_year)
+
+    client_list_df = generate_client_list_dataset(
+        naics_df=naics_df,
+        df_client_list=raw_client_df,
+        col_unified_naics=OUT_COLUMNS.unified_naics,
+        col_census_pct=OUT_COLUMNS.census_pct,
+        col_naics_2=OUT_COLUMNS.naics_2,
+        col_pa_naics_pct=OUT_COLUMNS.pa_naics_pct,
+        col_pasbdc_pct=OUT_COLUMNS.pasbdc_pct,
+        col_neo_primary_naics=NEOSERRA_COLUMNS.primary_naics,
+        col_neo_naics=NEOSERRA_COLUMNS.naics,
+        # This tells the functions that there is no NAICs column, only a primary NAICs one
+        # Neoserra in their infinite wisdom does not allow you to see this column
+        # in the export module output
+        bypass_secondary_naics_list=True
+    )
+
+    client_list_df = remove_duplicate_client_records(client_list_df)
+    client_list_df = remove_api_testing_clients(client_list_df)
+
+    return client_list_df
+
+@st.cache_data
+def cached_get_county_dataset(client_list_export_module_url:str, usda_api_key:str, census_year:str, centers:List[str]) -> pd.DataFrame:
+    client_list_df = cached_generate_client_naics_dataset(
+        client_list_export_module_url,
+        usda_api_key,
+        census_year,
+        centers
+    )
+
+    county_df = make_county_naics_dataset(
+        client_list_df,
+        col_out_county=OUT_COLUMNS.county,
+        col_out_fips=OUT_COLUMNS.fips,
+        col_out_unique=OUT_COLUMNS.unique_valid_naics,
+        col_out_missing=OUT_COLUMNS.missing_naics,
+        col_out_total=OUT_COLUMNS.total_clients,
+        col_out_pct_missing=OUT_COLUMNS.pct_missing_naics,
+        col_neo_county=NEOSERRA_COLUMNS.physical_address_county,
+        col_naics_2=OUT_COLUMNS.naics_2,
+        col_out_of_state=OUT_COLUMNS.county_out_of_state
+    )
+
+    return county_df
--- a/streamlit_dashboard/cached_function_wrappers/funding_milestones_cached_functions.py
+++ b/streamlit_dashboard/cached_function_wrappers/funding_milestones_cached_functions.py
@@ -0,0 +1,29 @@
+from typing import List
+
+import streamlit as st
+
+from .shared import cached_csv_url_to_dataframe, remove_duplicate_client_records
+from milestone_attribution_dataset_module import sanitize_funding_data
+from constants_module import NEOSERRA_COLUMNS, OUT_COLUMNS
+
+@st.cache_data
+def cached_sanitize_funding_data(export_url:str, reportable_only:bool, allowed_centers:List[str] | None = None):
+    funding_df = cached_csv_url_to_dataframe(export_url)
+
+    funding_df = sanitize_funding_data(
+        funding_df,
+        col_neo_attribution_source=NEOSERRA_COLUMNS.milestone_attribution_source,
+        col_neo_affirmation=NEOSERRA_COLUMNS.milestone_affirmation,
+        col_out_documentation_level=OUT_COLUMNS.milestone_documentation_level,
+        col_neo_center=NEOSERRA_COLUMNS.center
+    )
+
+    funding_df = remove_duplicate_client_records(funding_df)
+
+    if allowed_centers is not None:
+        funding_df = funding_df[funding_df[NEOSERRA_COLUMNS.center].isin(allowed_centers)]
+
+    if reportable_only:
+        funding_df = funding_df[funding_df[NEOSERRA_COLUMNS.reportable] == 1]
+
+    return funding_df
--- a/streamlit_dashboard/cached_function_wrappers/nbs_cached_functions.py
+++ b/streamlit_dashboard/cached_function_wrappers/nbs_cached_functions.py
@@ -0,0 +1,38 @@
+from typing import List
+
+import streamlit as st
+
+from constants_module import NEOSERRA_COLUMNS, OUT_COLUMNS
+from milestone_attribution_dataset_module import sanitize_nbs_data
+from cached_function_wrappers.shared import cached_csv_url_to_dataframe
+from pasbdc_data_cleaning import remove_duplicate_client_records
+
+
+@st.cache_data
+def cached_get_nbs_data(export_url:str, reportable_only:bool, allowed_centers:List[str] | None = None):
+    nbs_df = cached_csv_url_to_dataframe(export_url)
+
+    nbs_df = sanitize_nbs_data(
+            nbs_df,
+            col_neo_center=NEOSERRA_COLUMNS.center,
+            col_neo_client_id=NEOSERRA_COLUMNS.client_id,
+            col_neo_milestone_date=NEOSERRA_COLUMNS.milestone_date,
+            col_neo_attribution_date=NEOSERRA_COLUMNS.attribution_date,
+            col_neo_attribution_source=NEOSERRA_COLUMNS.milestone_attribution_source,
+            col_neo_affirmation=NEOSERRA_COLUMNS.milestone_affirmation,
+            col_neo_milestone_type=NEOSERRA_COLUMNS.milestone_type_name,
+            col_out_documentation_level=OUT_COLUMNS.milestone_documentation_level,
+            col_neo_reportable=NEOSERRA_COLUMNS.reportable,
+            business_start_impact_val=NEOSERRA_COLUMNS.business_start_impact_val,
+            business_established_val=NEOSERRA_COLUMNS.business_established_val
+        )
+
+    nbs_df = remove_duplicate_client_records(nbs_df)
+
+    if allowed_centers is not None:
+        nbs_df = nbs_df[nbs_df[NEOSERRA_COLUMNS.center].isin(allowed_centers)]
+
+    if reportable_only:
+        nbs_df = nbs_df[nbs_df[NEOSERRA_COLUMNS.reportable] == 1]
+
+    return nbs_df
--- a/streamlit_dashboard/cached_function_wrappers/shared.py
+++ b/streamlit_dashboard/cached_function_wrappers/shared.py
@@ -0,0 +1,20 @@
+import streamlit as st
+import pandas as pd
+
+from shared_tools_module import csv_url_to_dataframe
+from pasbdc_data_cleaning import clean_center_name, remove_duplicate_client_records, remove_api_testing_clients
+from constants_module import NEOSERRA_COLUMNS, OUT_COLUMNS
+
+@st.cache_data
+def cached_csv_url_to_dataframe(export_module_url) -> pd.DataFrame:
+    return csv_url_to_dataframe(export_module_url)
+
+@st.cache_data
+def get_df_centers(export_module_url:str) -> pd.DataFrame:
+    raw_client_df = csv_url_to_dataframe(export_module_url).copy()
+
+    clean_center_name(raw_client_df)
+    raw_client_df = remove_duplicate_client_records(raw_client_df)
+    raw_client_df = remove_api_testing_clients(raw_client_df)
+
+    return raw_client_df[NEOSERRA_COLUMNS.center].unique()
--- a/streamlit_dashboard/cached_function_wrappers/trainings_cached_functions.py
+++ b/streamlit_dashboard/cached_function_wrappers/trainings_cached_functions.py
@@ -0,0 +1,61 @@
+from typing import List
+import datetime
+
+from .shared import cached_csv_url_to_dataframe
+from section_1_datasets_module import generate_cleaned_trainings_dataset, generate_center_trainings_count_statistics
+from constants_module import NEOSERRA_COLUMNS, OUT_COLUMNS, Constants
+
+import pandas as pd
+import streamlit as st
+@st.cache_data
+def cached_generate_center_trainings_count_statistics(export_url:str, reportable_only:bool, include_future_events:bool, include_on_demand:bool, allowed_centers:List[str] | None = None):
+        trainings_df = cached_generate_cleaned_trainings_dataset(export_url,
+                                                                 reportable_only=reportable_only,
+                                                                 allowed_centers=allowed_centers,
+                                                                 include_future_events=include_future_events,
+                                                                 include_on_demand=include_on_demand
+                                                                 )
+        attendees_numeric = pd.to_numeric(trainings_df[NEOSERRA_COLUMNS.attendees_total], errors='coerce').fillna(0)
+        stats_df = generate_center_trainings_count_statistics(
+            full_df=trainings_df,
+            filtered_df=trainings_df[attendees_numeric == 0],
+            funding_source_group=['Core Services', 'LEXNET', 'PDA', 'NAP'],
+            col_primary_topic=NEOSERRA_COLUMNS.primary_training_topic,
+            col_center=NEOSERRA_COLUMNS.center,
+            col_funding_source=NEOSERRA_COLUMNS.funding_source,
+            col_attendees_total=NEOSERRA_COLUMNS.attendees_total,
+            col_is_preplanning=OUT_COLUMNS.is_preplanning
+        )
+
+        return stats_df
+
+@st.cache_data
+def cached_generate_cleaned_trainings_dataset(export_url:str, reportable_only:bool, include_future_events:bool, include_on_demand:bool, allowed_centers:List[str] | None = None):
+    trainings_df = cached_csv_url_to_dataframe(export_url)
+
+    trainings_df = generate_cleaned_trainings_dataset(
+        trainings_df,
+        col_neo_event_title=NEOSERRA_COLUMNS.event_title,
+        col_neo_primary_topic=NEOSERRA_COLUMNS.primary_training_topic,
+        col_neo_training_topics=NEOSERRA_COLUMNS.training_topics,
+        col_neo_center=NEOSERRA_COLUMNS.center,
+        col_is_preplanning=OUT_COLUMNS.is_preplanning,
+        col_neo_attendees_total=NEOSERRA_COLUMNS.attendees_total,
+        col_out_attendees_range=OUT_COLUMNS.attendees_range
+    )
+
+    if allowed_centers is not None:
+        trainings_df = trainings_df[trainings_df[NEOSERRA_COLUMNS.center].isin(allowed_centers)]
+
+    if reportable_only:
+        trainings_df = trainings_df[trainings_df[NEOSERRA_COLUMNS.reportable] == 1]
+
+    # Convert the start date to an actual date object, then filter out all future events if they are not desired by the user
+    trainings_df[NEOSERRA_COLUMNS.start_date] = pd.to_datetime(trainings_df[NEOSERRA_COLUMNS.start_date], format="%m/%d/%Y")
+    if not include_future_events:
+        trainings_df = trainings_df[trainings_df[NEOSERRA_COLUMNS.start_date].dt.date < datetime.date.today()]
+
+    if not include_on_demand:
+        trainings_df = trainings_df[trainings_df[NEOSERRA_COLUMNS.program_format] != Constants.ON_DEMAND_VALUE.value]
+
+    return trainings_df