first commit
This commit is contained in:
6
libs/pasbdc_data_cleaning/README.md
Normal file
6
libs/pasbdc_data_cleaning/README.md
Normal file
@@ -0,0 +1,6 @@
|
||||
# Common Cleaning Functions Library
|
||||
---
|
||||
This Python library provides common utility functions for working with data exported from Neoserra.
|
||||
|
||||
## Functions:
|
||||
clean_center_name - Converts the messy center names from Neoserra into the standard center labels used in graphs
|
||||
@@ -0,0 +1,4 @@
|
||||
Metadata-Version: 2.4
|
||||
Name: pasbdc_data_cleaning
|
||||
Version: 0.1.0
|
||||
Summary: Internal data cleaning library for the PASBDC written by Vincent Allen
|
||||
@@ -0,0 +1,8 @@
|
||||
README.md
|
||||
pyproject.toml
|
||||
pasbdc_data_cleaning/__init__.py
|
||||
pasbdc_data_cleaning/sbdclibrary.py
|
||||
pasbdc_data_cleaning.egg-info/PKG-INFO
|
||||
pasbdc_data_cleaning.egg-info/SOURCES.txt
|
||||
pasbdc_data_cleaning.egg-info/dependency_links.txt
|
||||
pasbdc_data_cleaning.egg-info/top_level.txt
|
||||
@@ -0,0 +1 @@
|
||||
|
||||
@@ -0,0 +1 @@
|
||||
pasbdc_data_cleaning
|
||||
@@ -0,0 +1,3 @@
|
||||
# libs/pasbdc_data_cleaning/__init__.py
|
||||
from .sbdclibrary import clean_center_name, tag_county_out_of_state, remove_duplicate_client_records, remove_api_testing_clients
|
||||
__all__ = ['clean_center_name', 'tag_county_out_of_state', 'remove_duplicate_client_records', 'remove_api_testing_clients']
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
169
libs/pasbdc_data_cleaning/pasbdc_data_cleaning/sbdclibrary.py
Normal file
169
libs/pasbdc_data_cleaning/pasbdc_data_cleaning/sbdclibrary.py
Normal file
@@ -0,0 +1,169 @@
|
||||
import pandas as pd
|
||||
|
||||
from constants_module import NEOSERRA_COLUMNS
|
||||
|
||||
# Lookup table from the raw center names found in Neoserra exports to the
# short, standard center labels. Keys are matched exactly (case-sensitive),
# which is why known misspellings and capitalisation variants are listed as
# their own entries.
#
# NOTE: variants that differ only by leading/trailing whitespace are not
# listed; clean_center_name() strips whitespace before doing the lookup.
# NOTE(review): the "Z-" prefix on "Z-Lead Office" presumably makes the label
# sort last in graphs -- confirm before renaming it.
CENTER_NAME_MAPPING = {
    "PI - Washington County": "Pittsburgh",
    "University of Pittsburgh SBDC": "Pittsburgh",
    "Indiana County": "Pittsburgh",
    "PI - Greene County": "Pittsburgh",
    "TE - TEMPLE SBDC": "Temple",
    "TE - WCU Procurement Assistance Center": "Temple",
    "Kutztown University SBDC": "Kutztown",
    "K - Kutztown SBDC": "Kutztown",
    "WD - WIDENER SBDC": "Widener",
    "WD - Norristown Outreach": "Widener",
    "The University of Scranton SBDC": "Scranton",
    "SC - Monroe Outreach": "Scranton",
    "SC - Wyoming Outreach": "Scranton",
    "SC - Susquehanna Outreach": "Scranton",
    "SC - Scranton Outreach": "Scranton",
    "SC - Pike Outreach": "Scranton",
    "SC - Wayne Outreach": "Scranton",
    "SC - Lackawanna Outreach": "Scranton",
    "PennWest University Clarion SBDC": "Clarion",
    "Clarion CARES Act": "Clarion",
    "WI - WILKES SBDC": "Wilkes",
    "WI - WIlkes": "Wilkes",
    "WI - Bloomsburg": "Wilkes",
    "LE - LEHIGH UNIVERSITY SBDC": "Lehigh",
    "G - GANNON SBDC": "Gannon",
    "G - Meadville": "Gannon",
    "G - Mercer": "Gannon",
    "G - Warren": "Gannon",
    "Penn State SBDC": "Penn State",
    "SH - SHIPPENSBURG SBDC": "Shippensburg",
    "Duquesne University SBDC": "Duquesne",
    "Bucknell SBDC": "Bucknell",
    "SF - ST. FRANCIS UNIVERSITY SBDC": "St. Francis",
    "SF - Somerset Outreach": "St. Francis",
    "SF - Bedford Outreach": "St. Francis",
    "SF - Blair Outreach": "St. Francis",
    "SF - St Francis University SBDC": "St. Francis",
    "SF - St Francis Universty SBDC": "St. Francis",
    "SF - St Francis Univeristy SBDC": "St. Francis",
    "SV - ST. VINCENT COLLEGE SBDC": "St. Vincent",
    "SV - AIHP Outreach": "St. Vincent",
    "SV - WEDC Outreach": "St. Vincent",
    "SV - Fayette Outreach": "St. Vincent",
    "SV - Jeannette Outreach": "St. Vincent",
    "LE - Bucks County/Lehigh SBDC": "Lehigh",
    "Southeast Pennsylvania APEX Accelerator": "Z-Lead Office",
    "EMAP": "Z-Lead Office",
    "Pennsylvania SBDC Lead Office": "Z-Lead Office",
    "State Small Business Credit Initiative (SSBCI)": "Z-Lead Office",
    "D - Beaver": "Duquesne",
    "AA - Stakeholders/Partners": "Z-Lead Office",
    "AA - Temple Stakeholders/Partners": "Z-Lead Office",
    "Wharton": "Z-Lead Office",
    "Wharton SBDC": "Z-Lead Office",
    "Lock Haven SBDC": "Penn State",
}
|
||||
|
||||
# Lower-case names of the Pennsylvania counties (67 entries). A county value
# is treated as in-state only if its normalised form appears in this list.
# NOTE(review): the ordering looks like it is grouped by service area rather
# than alphabetical -- order is preserved here in case anything relies on it.
VALID_PA_COUNTIES = [
    "juniata", "montour", "northumberland", "perry", "snyder", "union",
    "allegheny", "beaver", "butler", "lawrence", "washington",
    "crawford", "erie", "mercer", "warren",
    "dauphin", "lancaster", "lebanon",
    "berks", "chester",
    "bucks", "lehigh", "northampton",
    "centre", "clinton", "lycoming", "mifflin",
    "armstrong", "cameron", "clarion", "clearfield", "elk", "forest",
    "jefferson", "mckean", "potter", "venango",
    "bedford", "blair", "cambria", "fulton", "huntingdon", "somerset",
    "fayette", "westmoreland",
    "adams", "cumberland", "franklin", "york",
    "montgomery", "philadelphia",
    "greene", "indiana",
    "bradford", "lackawanna", "monroe", "pike", "susquehanna", "tioga",
    "wayne", "wyoming",
    "delaware",
    "carbon", "columbia", "luzerne", "schuylkill", "sullivan",
]


def tag_county_out_of_state(df):
    '''
    Adds a boolean 'County Out of State' field that is True when the
    'Physical Address County' value is not a Pennsylvania county listed in
    VALID_PA_COUNTIES.

    The comparison ignores case and surrounding whitespace. Missing county
    values are tagged as out of state.

    Parameters:
    df - either a whole DataFrame (every row is tagged individually) or a
         single row, e.g. when used as df.apply(tag_county_out_of_state, axis=1)

    Returns:
    The same object that was passed in, modified in place.
    '''
    county = df['Physical Address County']

    if isinstance(df, pd.DataFrame):
        # Whole-frame call: str() of the column would be the Series repr,
        # which never matches a county, so compare row by row instead.
        normalized = county.astype(str).str.strip().str.lower()
        df['County Out of State'] = ~normalized.isin(VALID_PA_COUNTIES)
    else:
        # Single-row call (a Series or any other mapping-like row).
        normalized = str(county).strip().lower()
        df['County Out of State'] = normalized not in VALID_PA_COUNTIES

    return df
|
||||
|
||||
def remove_duplicate_client_records(df: pd.DataFrame, center_name_column=None) -> pd.DataFrame:
    '''
    Returns a new dataframe without the rows whose center is exactly
    "Duplicate Client Records". The input dataframe is not modified.

    Parameters:
    df - the dataframe to filter
    center_name_column - the column containing the center names; when omitted,
                         NEOSERRA_COLUMNS.center is used

    Returns:
    An independent copy of df containing only the remaining rows
    '''
    if center_name_column is None:
        center_name_column = NEOSERRA_COLUMNS.center

    keep = df[center_name_column] != "Duplicate Client Records"
    # Filter first, then copy: only the surviving rows are duplicated, and the
    # result is a standalone frame that callers can assign into safely.
    return df.loc[keep].copy()
|
||||
|
||||
def remove_api_testing_clients(df: pd.DataFrame, center_name_column=None) -> pd.DataFrame:
    '''
    Returns a new dataframe without the rows whose center is exactly
    "API Testing Sandbox". The input dataframe is not modified.

    Parameters:
    df - the dataframe to filter
    center_name_column - the column containing the center names; when omitted,
                         NEOSERRA_COLUMNS.center is used

    Returns:
    An independent copy of df containing only the remaining rows
    '''
    if center_name_column is None:
        center_name_column = NEOSERRA_COLUMNS.center

    keep = df[center_name_column] != "API Testing Sandbox"
    # Filter first, then copy: only the surviving rows are duplicated, and the
    # result is a standalone frame that callers can assign into safely.
    return df.loc[keep].copy()
|
||||
|
||||
def clean_center_name(df: pd.DataFrame, center_name_column: str = "Center", mapping=None):
    '''
    Cleans up the Center column of a Neoserra data export: surrounding
    whitespace is stripped from each name, then names found in the conversion
    mapping are replaced by their standard label. Names that are not in the
    mapping are left as they are (apart from the stripping), and missing
    values stay missing.

    The dataframe is modified in place.

    Parameters:
    df - the dataframe to clean up
    center_name_column - the column of the dataframe containing the center names
    mapping - optional dict of raw name -> standard label; when omitted, the
              library constant CENTER_NAME_MAPPING is used

    Returns:
    None
    '''
    if mapping is None:
        mapping = CENTER_NAME_MAPPING

    raw_names = df[center_name_column]
    stripped = raw_names.astype(str).str.strip()
    # astype(str) turns missing values into the literal text "nan"/"None";
    # blank those positions out again so they remain missing.
    stripped = stripped.mask(raw_names.isna())

    df[center_name_column] = stripped.replace(mapping)
|
||||
|
||||
|
||||
13
libs/pasbdc_data_cleaning/pyproject.toml
Normal file
13
libs/pasbdc_data_cleaning/pyproject.toml
Normal file
@@ -0,0 +1,13 @@
|
||||
# libs/pasbdc_data_cleaning/pyproject.toml
|
||||
[build-system]
|
||||
requires = ["setuptools", "wheel"]
|
||||
build-backend = "setuptools.build_meta"
|
||||
|
||||
[project]
|
||||
name = "pasbdc_data_cleaning"
|
||||
version = "0.1.0"
|
||||
description = "Internal data cleaning library for the PASBDC written by Vincent Allen"
|
||||
|
||||
# Configuration specific to setuptools
|
||||
[tool.setuptools]
|
||||
packages = ["pasbdc_data_cleaning"]
|
||||
Reference in New Issue
Block a user