first commit

This commit is contained in:
2026-05-21 08:40:24 -04:00
commit b084545275
711 changed files with 3659856 additions and 0 deletions

View File

@@ -0,0 +1,6 @@
# Common Cleaning Functions Library
---
This Python library provides common utility functions for working with data exported from Neoserra.
## Functions:
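- clean_center_name - Converts the messy center names from Neoserra into the standard center labels we should be using in graphs
- tag_county_out_of_state - Adds a "County Out of State" flag that is True when "Physical Address County" is not a Pennsylvania county
- remove_duplicate_client_records - Drops the rows whose center is "Duplicate Client Records"
- remove_api_testing_clients - Drops the rows whose center is "API Testing Sandbox"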

View File

@@ -0,0 +1,4 @@
Metadata-Version: 2.4
Name: pasbdc_data_cleaning
Version: 0.1.0
Summary: Internal data cleaning library for the PASBDC written by Vincent Allen

View File

@@ -0,0 +1,8 @@
README.md
pyproject.toml
pasbdc_data_cleaning/__init__.py
pasbdc_data_cleaning/sbdclibrary.py
pasbdc_data_cleaning.egg-info/PKG-INFO
pasbdc_data_cleaning.egg-info/SOURCES.txt
pasbdc_data_cleaning.egg-info/dependency_links.txt
pasbdc_data_cleaning.egg-info/top_level.txt

View File

@@ -0,0 +1 @@
pasbdc_data_cleaning

View File

@@ -0,0 +1,3 @@
# libs/pasbdc_data_cleaning/__init__.py
# Re-export the cleaning helpers from sbdclibrary so callers can import them
# straight from the package root; __all__ lists the same four names.
from .sbdclibrary import clean_center_name, tag_county_out_of_state, remove_duplicate_client_records, remove_api_testing_clients
__all__ = ['clean_center_name', 'tag_county_out_of_state', 'remove_duplicate_client_records', 'remove_api_testing_clients']

View File

@@ -0,0 +1,169 @@
import pandas as pd
from constants_module import NEOSERRA_COLUMNS
# Lookup table used by clean_center_name: each key is a raw center name as it
# appears in a Neoserra export and each value is the standard center label to
# show in its place. Keys are matched exactly (after whitespace stripping), so
# several spellings of the same center are listed on purpose.
# NOTE(review): the "Z-" prefix on "Z-Lead Office" presumably pushes that label
# to the end of alphabetically sorted output - confirm.
CENTER_NAME_MAPPING = {
"PI - Washington County":"Pittsburgh",
"University of Pittsburgh SBDC":"Pittsburgh",
"Indiana County":"Pittsburgh",
"PI - Greene County":"Pittsburgh",
"TE - TEMPLE SBDC":"Temple",
"TE - WCU Procurement Assistance Center":"Temple",
"Kutztown University SBDC": "Kutztown",
"K - Kutztown SBDC":"Kutztown",
"WD - WIDENER SBDC": "Widener",
"WD - Norristown Outreach":"Widener",
"The University of Scranton SBDC": "Scranton",
"SC - Monroe Outreach":"Scranton",
"SC - Wyoming Outreach":"Scranton",
"SC - Susquehanna Outreach":"Scranton",
"SC - Scranton Outreach":"Scranton",
"SC - Pike Outreach":"Scranton",
"SC - Wayne Outreach":"Scranton",
"SC - Lackawanna Outreach":"Scranton",
"PennWest University Clarion SBDC":"Clarion",
"Clarion CARES Act":"Clarion",
"WI - WILKES SBDC":"Wilkes",
"WI - WIlkes":"Wilkes",
"WI - Bloomsburg":"Wilkes",
"LE - LEHIGH UNIVERSITY SBDC":"Lehigh",
"G - GANNON SBDC":"Gannon",
"G - Meadville":"Gannon",
"G - Mercer":"Gannon",
"G - Warren":"Gannon",
"Penn State SBDC":"Penn State",
"SH - SHIPPENSBURG SBDC":"Shippensburg",
"Duquesne University SBDC":"Duquesne",
"Bucknell SBDC":"Bucknell",
# The St. Francis keys below include misspelled variants ("Universty",
# "Univeristy"); they must stay as written to match the raw values.
"SF - ST. FRANCIS UNIVERSITY SBDC": "St. Francis",
"SF - Somerset Outreach":"St. Francis",
"SF - Bedford Outreach":"St. Francis",
"SF - Blair Outreach":"St. Francis",
"SF - St Francis University SBDC":"St. Francis",
"SF - St Francis Universty SBDC":"St. Francis",
"SF - St Francis Univeristy SBDC":"St. Francis",
"SV - ST. VINCENT COLLEGE SBDC":"St. Vincent",
"SV - AIHP Outreach":"St. Vincent",
"SV - WEDC Outreach":"St. Vincent",
"SV - Fayette Outreach":"St. Vincent",
"SV - Jeannette Outreach":"St. Vincent",
"LE - Bucks County/Lehigh SBDC":"Lehigh",
"Southeast Pennsylvania APEX Accelerator":"Z-Lead Office",
"EMAP":"Z-Lead Office",
"Pennsylvania SBDC Lead Office":"Z-Lead Office",
"State Small Business Credit Initiative (SSBCI)":"Z-Lead Office",
"D - Beaver":"Duquesne",
"AA - Stakeholders/Partners":"Z-Lead Office",
"AA - Temple Stakeholders/Partners":"Z-Lead Office",
"Wharton":"Z-Lead Office",
"Wharton SBDC":"Z-Lead Office",
"Lock Haven SBDC":"Penn State"
# Disabled entries for whitespace-padded names. clean_center_name strips
# leading/trailing whitespace before the lookup, so the padded keys are not
# needed. NOTE(review): the last one maps to "Lead Office" rather than the
# "Z-Lead Office" label used above - confirm it is obsolete before deleting.
# "Kutztown University SBDC ": "Kutztown",
#" Kutztown University SBDC": "Kutztown",
#" Pennsylvania SBDC Lead Office":"Lead Office",
}
# Lower-case names of the Pennsylvania counties (67 entries). A county value is
# treated as in-state only when its lower-cased text appears in this list.
VALID_PA_COUNTIES = [
    "juniata",
    "montour",
    "northumberland",
    "perry",
    "snyder",
    "union",
    "allegheny",
    "beaver",
    "butler",
    "lawrence",
    "washington",
    "crawford",
    "erie",
    "mercer",
    "warren",
    "dauphin",
    "lancaster",
    "lebanon",
    "berks",
    "chester",
    "bucks",
    "lehigh",
    "northampton",
    "centre",
    "clinton",
    "lycoming",
    "mifflin",
    "armstrong",
    "cameron",
    "clarion",
    "clearfield",
    "elk",
    "forest",
    "jefferson",
    "mckean",
    "potter",
    "venango",
    "bedford",
    "blair",
    "cambria",
    "fulton",
    "huntingdon",
    "somerset",
    "fayette",
    "westmoreland",
    "adams",
    "cumberland",
    "franklin",
    "york",
    "montgomery",
    "philadelphia",
    "greene",
    "indiana",
    "bradford",
    "lackawanna",
    "monroe",
    "pike",
    "susquehanna",
    "tioga",
    "wayne",
    "wyoming",
    "delaware",
    "carbon",
    "columbia",
    "luzerne",
    "schuylkill",
    "sullivan",
]


def tag_county_out_of_state(df):
    '''
    Flags records whose 'Physical Address County' is not a Pennsylvania county by
    writing a boolean 'County Out of State' value.
    Works both on a single record (for example a row passed in by
    DataFrame.apply(..., axis=1), or a dict) and on a whole DataFrame, in which
    case every row is evaluated.
    The comparison is case-insensitive but otherwise exact, so a missing or
    misspelled county is flagged as out of state.
    Parameters:
        df - a DataFrame, or one row-like record, containing 'Physical Address County'
    Returns:
        The same object that was passed in, with 'County Out of State' set on it
    '''
    county = df['Physical Address County']
    if isinstance(df, pd.DataFrame):
        # Compare row by row. Calling str() on the whole column would stringify
        # the column's repr, which never matches a county name, so every row
        # would be flagged as out of state.
        normalized = county.astype(str).str.lower()
        df['County Out of State'] = ~normalized.isin(VALID_PA_COUNTIES)
    else:
        df['County Out of State'] = str(county).lower() not in VALID_PA_COUNTIES
    return df
def remove_duplicate_client_records(df:pd.DataFrame) -> pd.DataFrame:
    '''
    Returns a new dataframe without the rows whose center (the column named by
    NEOSERRA_COLUMNS.center) is "Duplicate Client Records".
    Parameters:
        df - the dataframe to filter; it is copied first and is not modified
    Returns:
        The filtered copy
    '''
    snapshot = df.copy()
    keep_mask = snapshot[NEOSERRA_COLUMNS.center].ne("Duplicate Client Records")
    return snapshot[keep_mask]
def remove_api_testing_clients(df:pd.DataFrame) -> pd.DataFrame:
    '''
    Returns a new dataframe without the rows whose center (the column named by
    NEOSERRA_COLUMNS.center) is "API Testing Sandbox".
    Parameters:
        df - the dataframe to filter; it is copied first and is not modified
    Returns:
        The filtered copy
    '''
    snapshot = df.copy()
    keep_mask = snapshot[NEOSERRA_COLUMNS.center].ne("API Testing Sandbox")
    return snapshot[keep_mask]
def clean_center_name(df: pd.DataFrame, center_name_column:str="Center"):
    '''
    Cleans up the Center column of a Neoserra data export in place: surrounding
    whitespace is stripped from each name, then names found in the constant
    CENTER_NAME_MAPPING are replaced by their standard center label. Names that
    are not in the mapping are kept (stripped) as they are.
    Missing values are left missing instead of being turned into text.
    Parameters:
        df - the dataframe to clean up (modified in place)
        center_name_column - the column of the dataframe containing the center names
    Returns:
        None
    '''
    raw_names = df[center_name_column]
    # Only stringify and trim cells that hold a value. Casting the whole column
    # with astype(str) can turn a missing center into the literal text
    # "nan"/"None", which then no longer counts as missing downstream.
    trimmed = raw_names.where(raw_names.isna(), raw_names.astype(str).str.strip())
    df[center_name_column] = trimmed.replace(CENTER_NAME_MAPPING)

View File

@@ -0,0 +1,13 @@
# libs/pasbdc_data_cleaning/pyproject.toml
[build-system]
requires = ["setuptools", "wheel"]
build-backend = "setuptools.build_meta"
[project]
name = "pasbdc_data_cleaning"
version = "0.1.0"
description = "Internal data cleaning library for the PASBDC written by Vincent Allen"
# pasbdc_data_cleaning/sbdclibrary.py imports pandas, so declare it as a runtime
# dependency; otherwise a fresh install fails on import.
# NOTE(review): sbdclibrary.py also imports constants_module, which is not
# declared here - confirm how that module is provided.
dependencies = ["pandas"]
# MOVED: Configuration specific to setuptools goes here
[tool.setuptools]
packages = ["pasbdc_data_cleaning"]