first commit
This commit is contained in:
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,104 @@
|
||||
from typing import List, Tuple
|
||||
|
||||
import pandas as pd
|
||||
import streamlit as st
|
||||
|
||||
from constants_module import OUT_COLUMNS, NEOSERRA_COLUMNS
|
||||
from section_1_datasets_module import generate_client_list_dataset, get_pa_naics_data, get_bls_naics11_data, get_bls_naics92_data, create_naics_census_percentage_table, make_county_naics_dataset
|
||||
from shared_tools_module import csv_url_to_dataframe
|
||||
from cached_function_wrappers.shared import cached_csv_url_to_dataframe
|
||||
from pasbdc_data_cleaning import clean_center_name, remove_duplicate_client_records, remove_api_testing_clients
|
||||
|
||||
|
||||
@st.cache_data
def cached_get_pa_naics_source_data(census_year: str) -> pd.DataFrame:
    """Return the result of ``get_pa_naics_data`` via Streamlit's data cache.

    Args:
        census_year: Year forwarded unchanged to ``get_pa_naics_data``.

    Returns:
        Whatever ``get_pa_naics_data`` returns for ``census_year``
        (annotated as a DataFrame).
    """
    pa_naics_df = get_pa_naics_data(census_year)
    return pa_naics_df
|
||||
|
||||
@st.cache_data
def cached_get_bls_naics11_data(usda_api_key: str, census_year: str) -> pd.DataFrame:
    """Return the result of ``get_bls_naics11_data`` via Streamlit's data cache.

    Args:
        usda_api_key: Passed to the underlying loader as ``api_key``.
        census_year: Passed to the underlying loader as ``year``.

    Returns:
        Whatever ``get_bls_naics11_data`` returns (annotated as a DataFrame).
    """
    naics11_df = get_bls_naics11_data(year=census_year, api_key=usda_api_key)
    return naics11_df
|
||||
|
||||
@st.cache_data
def cached_get_bls_naics92_data(census_year: str) -> pd.DataFrame:
    """Return the result of ``get_bls_naics92_data`` via Streamlit's data cache.

    Args:
        census_year: Passed to the underlying loader as ``year``.

    Returns:
        Whatever ``get_bls_naics92_data`` returns (annotated as a DataFrame).
    """
    naics92_df = get_bls_naics92_data(year=census_year)
    return naics92_df
|
||||
|
||||
@st.cache_data
def filter_df_by_naics_codes(in_df: pd.DataFrame, naics_codes: List[int]) -> pd.DataFrame:
    """Keep only the rows whose ``OUT_COLUMNS.naics_2`` value is in ``naics_codes``.

    Args:
        in_df: Frame containing the ``OUT_COLUMNS.naics_2`` column.
        naics_codes: Codes to keep.

    Returns:
        The subset of ``in_df`` selected by the membership mask.
    """
    has_wanted_code = in_df[OUT_COLUMNS.naics_2].isin(naics_codes)
    return in_df[has_wanted_code]
|
||||
|
||||
def cached_create_naics_census_percentage_table(usda_api_key: str, census_year: str):
    """Build the NAICS census percentage table from the three cached source frames.

    The three inputs are fetched through the cached loaders; the call to
    ``create_naics_census_percentage_table`` itself is not wrapped in
    ``st.cache_data`` here.

    Args:
        usda_api_key: Forwarded to ``cached_get_bls_naics11_data``.
        census_year: Forwarded to all three cached loaders.

    Returns:
        The value returned by ``create_naics_census_percentage_table``.
    """
    # Source frames, fetched in the same order as the keyword names below.
    source_frames = {
        "df_naics_census": cached_get_pa_naics_source_data(census_year=census_year),
        "df_naics_11": cached_get_bls_naics11_data(usda_api_key=usda_api_key, census_year=census_year),
        "df_naics_92": cached_get_bls_naics92_data(census_year=census_year),
    }

    # Column-name configuration, all taken from OUT_COLUMNS.
    column_names = {
        "col_bls_industry": OUT_COLUMNS.bls_industry,
        "col_bls_estab": OUT_COLUMNS.bls_estab,
        "col_usda_value": OUT_COLUMNS.usda_value,
        "col_unified_naics": OUT_COLUMNS.unified_naics,
        "col_census_estab": OUT_COLUMNS.census_estab,
        "col_census_pct": OUT_COLUMNS.census_pct,
        "col_naics_label": OUT_COLUMNS.naics_label,
        "col_census_naics": OUT_COLUMNS.census_naics,
    }

    return create_naics_census_percentage_table(**source_frames, **column_names)
|
||||
|
||||
@st.cache_data
def cached_generate_client_naics_dataset(export_module_url, usda_api_key, census_year, centers) -> pd.DataFrame:
    """Build the cleaned client/NAICS dataset for the selected centers.

    Steps: load the export CSV, clean center names, keep only rows whose
    center is in ``centers``, combine with the NAICS census percentage table
    via ``generate_client_list_dataset``, then drop duplicate client records
    and API-testing clients.

    Args:
        export_module_url: URL handed to ``cached_csv_url_to_dataframe``.
        usda_api_key: Forwarded to the NAICS census table builder.
        census_year: Forwarded to the NAICS census table builder.
        centers: Center values to keep (matched against ``NEOSERRA_COLUMNS.center``).

    Returns:
        The de-duplicated client list dataset.
    """
    # Work on a private copy; clean_center_name is called for its effect on
    # the frame (its return value is not used).
    clients = cached_csv_url_to_dataframe(export_module_url).copy()
    clean_center_name(clients)

    in_selected_center = clients[NEOSERRA_COLUMNS.center].isin(centers)
    clients = clients[in_selected_center]

    census_pct_table = cached_create_naics_census_percentage_table(usda_api_key, census_year)

    dataset = generate_client_list_dataset(
        naics_df=census_pct_table,
        df_client_list=clients,
        col_unified_naics=OUT_COLUMNS.unified_naics,
        col_census_pct=OUT_COLUMNS.census_pct,
        col_naics_2=OUT_COLUMNS.naics_2,
        col_pa_naics_pct=OUT_COLUMNS.pa_naics_pct,
        col_pasbdc_pct=OUT_COLUMNS.pasbdc_pct,
        col_neo_primary_naics=NEOSERRA_COLUMNS.primary_naics,
        col_neo_naics=NEOSERRA_COLUMNS.naics,
        # The export module output has no NAICS column, only a primary NAICS
        # one (Neoserra does not expose it there), so tell the function to
        # bypass the secondary NAICS list.
        bypass_secondary_naics_list=True,
    )

    deduplicated = remove_duplicate_client_records(dataset)
    return remove_api_testing_clients(deduplicated)
|
||||
|
||||
@st.cache_data
def cached_get_county_dataset(
    client_list_export_module_url: str,
    usda_api_key: str,
    census_year: str,
    centers: List[str],
) -> pd.DataFrame:
    """Build the county-level NAICS dataset from the cleaned client list.

    Args:
        client_list_export_module_url: URL of the client list export.
        usda_api_key: Forwarded to ``cached_generate_client_naics_dataset``.
        census_year: Forwarded to ``cached_generate_client_naics_dataset``.
        centers: Centers to include, forwarded to the client dataset builder.

    Returns:
        The value returned by ``make_county_naics_dataset``.
    """
    clients = cached_generate_client_naics_dataset(
        client_list_export_module_url, usda_api_key, census_year, centers
    )

    # Output/input column names expected by make_county_naics_dataset.
    county_columns = {
        "col_out_county": OUT_COLUMNS.county,
        "col_out_fips": OUT_COLUMNS.fips,
        "col_out_unique": OUT_COLUMNS.unique_valid_naics,
        "col_out_missing": OUT_COLUMNS.missing_naics,
        "col_out_total": OUT_COLUMNS.total_clients,
        "col_out_pct_missing": OUT_COLUMNS.pct_missing_naics,
        "col_neo_county": NEOSERRA_COLUMNS.physical_address_county,
        "col_naics_2": OUT_COLUMNS.naics_2,
        "col_out_of_state": OUT_COLUMNS.county_out_of_state,
    }

    return make_county_naics_dataset(clients, **county_columns)
|
||||
@@ -0,0 +1,29 @@
|
||||
from typing import List
|
||||
|
||||
import streamlit as st
|
||||
|
||||
from .shared import cached_csv_url_to_dataframe, remove_duplicate_client_records
|
||||
from milestone_attribution_dataset_module import sanitize_funding_data
|
||||
from constants_module import NEOSERRA_COLUMNS, OUT_COLUMNS
|
||||
|
||||
@st.cache_data
def cached_sanitize_funding_data(
    export_url: str,
    reportable_only: bool,
    allowed_centers: List[str] | None = None,
):
    """Load, sanitize, de-duplicate and optionally filter the funding export.

    Args:
        export_url: URL handed to ``cached_csv_url_to_dataframe``.
        reportable_only: When true, keep only rows whose
            ``NEOSERRA_COLUMNS.reportable`` value equals 1.
        allowed_centers: When not ``None``, keep only rows whose
            ``NEOSERRA_COLUMNS.center`` value is in this list.

    Returns:
        The filtered funding frame.
    """
    raw_df = cached_csv_url_to_dataframe(export_url)

    sanitized_df = sanitize_funding_data(
        raw_df,
        col_neo_attribution_source=NEOSERRA_COLUMNS.milestone_attribution_source,
        col_neo_affirmation=NEOSERRA_COLUMNS.milestone_affirmation,
        col_out_documentation_level=OUT_COLUMNS.milestone_documentation_level,
        col_neo_center=NEOSERRA_COLUMNS.center,
    )
    result_df = remove_duplicate_client_records(sanitized_df)

    # Optional filters are applied one after the other: centers, then reportable.
    if allowed_centers is not None:
        center_ok = result_df[NEOSERRA_COLUMNS.center].isin(allowed_centers)
        result_df = result_df[center_ok]

    if reportable_only:
        is_reportable = result_df[NEOSERRA_COLUMNS.reportable] == 1
        result_df = result_df[is_reportable]

    return result_df
|
||||
@@ -0,0 +1,38 @@
|
||||
from typing import List
|
||||
|
||||
import streamlit as st
|
||||
|
||||
from constants_module import NEOSERRA_COLUMNS, OUT_COLUMNS
|
||||
from milestone_attribution_dataset_module import sanitize_nbs_data
|
||||
from cached_function_wrappers.shared import cached_csv_url_to_dataframe
|
||||
from pasbdc_data_cleaning import remove_duplicate_client_records
|
||||
|
||||
|
||||
@st.cache_data
def cached_get_nbs_data(
    export_url: str,
    reportable_only: bool,
    allowed_centers: List[str] | None = None,
):
    """Load, sanitize, de-duplicate and optionally filter the NBS export.

    Args:
        export_url: URL handed to ``cached_csv_url_to_dataframe``.
        reportable_only: When true, keep only rows whose
            ``NEOSERRA_COLUMNS.reportable`` value equals 1.
        allowed_centers: When not ``None``, keep only rows whose
            ``NEOSERRA_COLUMNS.center`` value is in this list.

    Returns:
        The filtered frame produced by ``sanitize_nbs_data``.
    """
    raw_df = cached_csv_url_to_dataframe(export_url)

    # Column names and marker values that sanitize_nbs_data needs.
    sanitize_options = {
        "col_neo_center": NEOSERRA_COLUMNS.center,
        "col_neo_client_id": NEOSERRA_COLUMNS.client_id,
        "col_neo_milestone_date": NEOSERRA_COLUMNS.milestone_date,
        "col_neo_attribution_date": NEOSERRA_COLUMNS.attribution_date,
        "col_neo_attribution_source": NEOSERRA_COLUMNS.milestone_attribution_source,
        "col_neo_affirmation": NEOSERRA_COLUMNS.milestone_affirmation,
        "col_neo_milestone_type": NEOSERRA_COLUMNS.milestone_type_name,
        "col_out_documentation_level": OUT_COLUMNS.milestone_documentation_level,
        "col_neo_reportable": NEOSERRA_COLUMNS.reportable,
        "business_start_impact_val": NEOSERRA_COLUMNS.business_start_impact_val,
        "business_established_val": NEOSERRA_COLUMNS.business_established_val,
    }
    sanitized_df = sanitize_nbs_data(raw_df, **sanitize_options)
    result_df = remove_duplicate_client_records(sanitized_df)

    # Optional filters are applied one after the other: centers, then reportable.
    if allowed_centers is not None:
        center_ok = result_df[NEOSERRA_COLUMNS.center].isin(allowed_centers)
        result_df = result_df[center_ok]

    if reportable_only:
        is_reportable = result_df[NEOSERRA_COLUMNS.reportable] == 1
        result_df = result_df[is_reportable]

    return result_df
|
||||
20
streamlit_dashboard/cached_function_wrappers/shared.py
Normal file
20
streamlit_dashboard/cached_function_wrappers/shared.py
Normal file
@@ -0,0 +1,20 @@
|
||||
import streamlit as st
|
||||
import pandas as pd
|
||||
|
||||
from shared_tools_module import csv_url_to_dataframe
|
||||
from pasbdc_data_cleaning import clean_center_name, remove_duplicate_client_records, remove_api_testing_clients
|
||||
from constants_module import NEOSERRA_COLUMNS, OUT_COLUMNS
|
||||
|
||||
@st.cache_data
def cached_csv_url_to_dataframe(export_module_url) -> pd.DataFrame:
    """Return the result of ``csv_url_to_dataframe`` via Streamlit's data cache.

    Args:
        export_module_url: URL forwarded unchanged to ``csv_url_to_dataframe``.

    Returns:
        Whatever ``csv_url_to_dataframe`` returns (annotated as a DataFrame).
    """
    export_df = csv_url_to_dataframe(export_module_url)
    return export_df
|
||||
|
||||
@st.cache_data
def get_df_centers(export_module_url: str):
    """Return the distinct center names found in the client export.

    The export is loaded through ``cached_csv_url_to_dataframe`` so the
    download is shared with the other wrappers that read the same URL.
    The frame is then cleaned the same way as the client dataset: center
    names are cleaned, duplicate client records are dropped and API-testing
    clients are removed.

    Args:
        export_module_url: URL of the client export CSV.

    Returns:
        The array produced by ``Series.unique()`` on the
        ``NEOSERRA_COLUMNS.center`` column. This is an array of values,
        not a DataFrame.
    """
    # Copy before cleaning: clean_center_name is called for its effect on
    # the frame (its return value is not used).
    raw_client_df = cached_csv_url_to_dataframe(export_module_url).copy()

    clean_center_name(raw_client_df)
    raw_client_df = remove_duplicate_client_records(raw_client_df)
    raw_client_df = remove_api_testing_clients(raw_client_df)

    return raw_client_df[NEOSERRA_COLUMNS.center].unique()
|
||||
@@ -0,0 +1,61 @@
|
||||
from typing import List
|
||||
import datetime
|
||||
|
||||
from .shared import cached_csv_url_to_dataframe
|
||||
from section_1_datasets_module import generate_cleaned_trainings_dataset, generate_center_trainings_count_statistics
|
||||
from constants_module import NEOSERRA_COLUMNS, OUT_COLUMNS, Constants
|
||||
|
||||
import pandas as pd
|
||||
import streamlit as st
|
||||
@st.cache_data
def cached_generate_center_trainings_count_statistics(
    export_url: str,
    reportable_only: bool,
    include_future_events: bool,
    include_on_demand: bool,
    allowed_centers: List[str] | None = None,
):
    """Compute per-center training count statistics from the trainings export.

    The cleaned trainings frame is passed as ``full_df``. The rows whose
    total attendee count is 0 (after numeric coercion, with unparseable or
    missing values treated as 0) are passed as ``filtered_df``.

    Args:
        export_url: URL of the trainings export CSV.
        reportable_only: Forwarded to ``cached_generate_cleaned_trainings_dataset``.
        include_future_events: Forwarded to ``cached_generate_cleaned_trainings_dataset``.
        include_on_demand: Forwarded to ``cached_generate_cleaned_trainings_dataset``.
        allowed_centers: Forwarded to ``cached_generate_cleaned_trainings_dataset``.

    Returns:
        The value returned by ``generate_center_trainings_count_statistics``.
    """
    cleaned_df = cached_generate_cleaned_trainings_dataset(
        export_url,
        reportable_only=reportable_only,
        include_future_events=include_future_events,
        include_on_demand=include_on_demand,
        allowed_centers=allowed_centers,
    )

    # Coerce attendee totals to numbers; anything non-numeric counts as 0.
    attendee_counts = pd.to_numeric(
        cleaned_df[NEOSERRA_COLUMNS.attendees_total], errors='coerce'
    ).fillna(0)
    zero_attendee_df = cleaned_df[attendee_counts == 0]

    return generate_center_trainings_count_statistics(
        full_df=cleaned_df,
        filtered_df=zero_attendee_df,
        funding_source_group=['Core Services', 'LEXNET', 'PDA', 'NAP'],
        col_primary_topic=NEOSERRA_COLUMNS.primary_training_topic,
        col_center=NEOSERRA_COLUMNS.center,
        col_funding_source=NEOSERRA_COLUMNS.funding_source,
        col_attendees_total=NEOSERRA_COLUMNS.attendees_total,
        col_is_preplanning=OUT_COLUMNS.is_preplanning,
    )
|
||||
|
||||
@st.cache_data
def cached_generate_cleaned_trainings_dataset(
    export_url: str,
    reportable_only: bool,
    include_future_events: bool,
    include_on_demand: bool,
    allowed_centers: List[str] | None = None,
):
    """Load, clean and filter the trainings export.

    Args:
        export_url: URL handed to ``cached_csv_url_to_dataframe``.
        reportable_only: When true, keep only rows whose
            ``NEOSERRA_COLUMNS.reportable`` value equals 1.
        include_future_events: When false, keep only events whose start date
            is strictly before today's date.
        include_on_demand: When false, drop rows whose
            ``NEOSERRA_COLUMNS.program_format`` equals
            ``Constants.ON_DEMAND_VALUE.value``.
        allowed_centers: When not ``None``, keep only rows whose
            ``NEOSERRA_COLUMNS.center`` value is in this list.

    Returns:
        The filtered trainings frame, with ``NEOSERRA_COLUMNS.start_date``
        converted to datetimes.

    Raises:
        ValueError: Propagated from ``pd.to_datetime`` when a start date does
            not match the ``%m/%d/%Y`` format.
    """
    trainings_df = cached_csv_url_to_dataframe(export_url)

    trainings_df = generate_cleaned_trainings_dataset(
        trainings_df,
        col_neo_event_title=NEOSERRA_COLUMNS.event_title,
        col_neo_primary_topic=NEOSERRA_COLUMNS.primary_training_topic,
        col_neo_training_topics=NEOSERRA_COLUMNS.training_topics,
        col_neo_center=NEOSERRA_COLUMNS.center,
        col_is_preplanning=OUT_COLUMNS.is_preplanning,
        col_neo_attendees_total=NEOSERRA_COLUMNS.attendees_total,
        col_out_attendees_range=OUT_COLUMNS.attendees_range,
    )

    if allowed_centers is not None:
        trainings_df = trainings_df[trainings_df[NEOSERRA_COLUMNS.center].isin(allowed_centers)]

    if reportable_only:
        trainings_df = trainings_df[trainings_df[NEOSERRA_COLUMNS.reportable] == 1]

    # The frame may be a boolean-filtered slice at this point. Take an
    # explicit copy so the column assignment below writes to an independent
    # frame instead of triggering pandas' SettingWithCopyWarning.
    trainings_df = trainings_df.copy()

    # Convert the start date to real datetimes, then filter out future
    # events if the user does not want them.
    trainings_df[NEOSERRA_COLUMNS.start_date] = pd.to_datetime(
        trainings_df[NEOSERRA_COLUMNS.start_date], format="%m/%d/%Y"
    )
    if not include_future_events:
        trainings_df = trainings_df[trainings_df[NEOSERRA_COLUMNS.start_date].dt.date < datetime.date.today()]

    if not include_on_demand:
        trainings_df = trainings_df[trainings_df[NEOSERRA_COLUMNS.program_format] != Constants.ON_DEMAND_VALUE.value]

    return trainings_df
|
||||
Reference in New Issue
Block a user