first commit

This commit is contained in:
2026-05-21 08:40:24 -04:00
commit b084545275
711 changed files with 3659856 additions and 0 deletions

View File

@@ -0,0 +1,6 @@
# Common Cleaning Functions Library
---
This Python library provides common utility functions for working with data exported from Neoserra.
## Functions:
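- `clean_center_name` - Converts the messy center names from Neoserra into the standard center labels we should be using in graphs.
- `tag_county_out_of_state` - Adds a `County Out of State` flag that is True when `Physical Address County` is not a Pennsylvania county.
- `remove_duplicate_client_records` - Returns a copy of the data without the rows whose center is "Duplicate Client Records".
- `remove_api_testing_clients` - Returns a copy of the data without the rows whose center is "API Testing Sandbox".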

View File

@@ -0,0 +1,4 @@
Metadata-Version: 2.4
Name: pasbdc_data_cleaning
Version: 0.1.0
Summary: Internal data cleaning library for the PASBDC written by Vincent Allen

View File

@@ -0,0 +1,8 @@
README.md
pyproject.toml
pasbdc_data_cleaning/__init__.py
pasbdc_data_cleaning/sbdclibrary.py
pasbdc_data_cleaning.egg-info/PKG-INFO
pasbdc_data_cleaning.egg-info/SOURCES.txt
pasbdc_data_cleaning.egg-info/dependency_links.txt
pasbdc_data_cleaning.egg-info/top_level.txt

View File

@@ -0,0 +1 @@
pasbdc_data_cleaning

View File

@@ -0,0 +1,3 @@
# libs/pasbdc_data_cleaning/__init__.py
from .sbdclibrary import clean_center_name, tag_county_out_of_state, remove_duplicate_client_records, remove_api_testing_clients
__all__ = ['clean_center_name', 'tag_county_out_of_state', 'remove_duplicate_client_records', 'remove_api_testing_clients']

View File

@@ -0,0 +1,169 @@
import pandas as pd
from constants_module import NEOSERRA_COLUMNS
# Lookup table from the raw center names found in Neoserra exports to the
# standard center label used in graphs. clean_center_name() applies it with
# an exact-match replace after stripping surrounding whitespace, so keys must
# match the exported text character for character (including its typos).
CENTER_NAME_MAPPING = {
    "PI - Washington County": "Pittsburgh",
    "University of Pittsburgh SBDC": "Pittsburgh",
    "Indiana County": "Pittsburgh",
    "PI - Greene County": "Pittsburgh",
    "TE - TEMPLE SBDC": "Temple",
    "TE - WCU Procurement Assistance Center": "Temple",
    "Kutztown University SBDC": "Kutztown",
    "K - Kutztown SBDC": "Kutztown",
    "WD - WIDENER SBDC": "Widener",
    "WD - Norristown Outreach": "Widener",
    "The University of Scranton SBDC": "Scranton",
    "SC - Monroe Outreach": "Scranton",
    "SC - Wyoming Outreach": "Scranton",
    "SC - Susquehanna Outreach": "Scranton",
    "SC - Scranton Outreach": "Scranton",
    "SC - Pike Outreach": "Scranton",
    "SC - Wayne Outreach": "Scranton",
    "SC - Lackawanna Outreach": "Scranton",
    "PennWest University Clarion SBDC": "Clarion",
    "Clarion CARES Act": "Clarion",
    "WI - WILKES SBDC": "Wilkes",
    "WI - WIlkes": "Wilkes",
    "WI - Bloomsburg": "Wilkes",
    "LE - LEHIGH UNIVERSITY SBDC": "Lehigh",
    "G - GANNON SBDC": "Gannon",
    "G - Meadville": "Gannon",
    "G - Mercer": "Gannon",
    "G - Warren": "Gannon",
    "Penn State SBDC": "Penn State",
    "SH - SHIPPENSBURG SBDC": "Shippensburg",
    "Duquesne University SBDC": "Duquesne",
    "Bucknell SBDC": "Bucknell",
    "SF - ST. FRANCIS UNIVERSITY SBDC": "St. Francis",
    "SF - Somerset Outreach": "St. Francis",
    "SF - Bedford Outreach": "St. Francis",
    "SF - Blair Outreach": "St. Francis",
    "SF - St Francis University SBDC": "St. Francis",
    "SF - St Francis Universty SBDC": "St. Francis",
    "SF - St Francis Univeristy SBDC": "St. Francis",
    "SV - ST. VINCENT COLLEGE SBDC": "St. Vincent",
    "SV - AIHP Outreach": "St. Vincent",
    "SV - WEDC Outreach": "St. Vincent",
    "SV - Fayette Outreach": "St. Vincent",
    "SV - Jeannette Outreach": "St. Vincent",
    "LE - Bucks County/Lehigh SBDC": "Lehigh",
    "Southeast Pennsylvania APEX Accelerator": "Z-Lead Office",
    "EMAP": "Z-Lead Office",
    "Pennsylvania SBDC Lead Office": "Z-Lead Office",
    "State Small Business Credit Initiative (SSBCI)": "Z-Lead Office",
    "D - Beaver": "Duquesne",
    "AA - Stakeholders/Partners": "Z-Lead Office",
    "AA - Temple Stakeholders/Partners": "Z-Lead Office",
    "Wharton": "Z-Lead Office",
    "Wharton SBDC": "Z-Lead Office",
    "Lock Haven SBDC": "Penn State",
    # Disabled whitespace-padded variants, kept for reference:
    # "Kutztown University SBDC ": "Kutztown",
    # " Kutztown University SBDC": "Kutztown",
    # " Pennsylvania SBDC Lead Office": "Lead Office",
}
# Lower-case county names that count as in-state (67 entries).
# tag_county_out_of_state() lower-cases the exported county before checking
# membership here, so every entry must stay lower-case.
VALID_PA_COUNTIES = [
    "juniata",
    "montour",
    "northumberland",
    "perry",
    "snyder",
    "union",
    "allegheny",
    "beaver",
    "butler",
    "lawrence",
    "washington",
    "crawford",
    "erie",
    "mercer",
    "warren",
    "dauphin",
    "lancaster",
    "lebanon",
    "berks",
    "chester",
    "bucks",
    "lehigh",
    "northampton",
    "centre",
    "clinton",
    "lycoming",
    "mifflin",
    "armstrong",
    "cameron",
    "clarion",
    "clearfield",
    "elk",
    "forest",
    "jefferson",
    "mckean",
    "potter",
    "venango",
    "bedford",
    "blair",
    "cambria",
    "fulton",
    "huntingdon",
    "somerset",
    "fayette",
    "westmoreland",
    "adams",
    "cumberland",
    "franklin",
    "york",
    "montgomery",
    "philadelphia",
    "greene",
    "indiana",
    "bradford",
    "lackawanna",
    "monroe",
    "pike",
    "susquehanna",
    "tioga",
    "wayne",
    "wyoming",
    "delaware",
    "carbon",
    "columbia",
    "luzerne",
    "schuylkill",
    "sullivan",
]
def tag_county_out_of_state(df):
    """
    Flag whether 'Physical Address County' falls outside VALID_PA_COUNTIES.

    Works on a single record (a row passed by ``DataFrame.apply(..., axis=1)``
    or any mapping-like object) as well as on a whole DataFrame. The result is
    written to the 'County Out of State' key/column of the object passed in.
    County names are compared case-insensitively and with surrounding
    whitespace ignored; missing counties are flagged as out of state.

    Parameters:
    df - a row/mapping or a DataFrame containing 'Physical Address County'
    Returns:
    The same object that was passed in, with 'County Out of State' set
    '''
    """
    county = df['Physical Address County']
    if isinstance(county, pd.Series):
        # A whole DataFrame was passed: compare element-wise. Calling str()
        # on the column would produce its printed repr, which never matches a
        # county name and would flag every row as out of state.
        normalized = county.astype(str).str.strip().str.lower()
        df['County Out of State'] = ~normalized.isin(VALID_PA_COUNTIES)
    else:
        # A single record was passed: compare the scalar value directly.
        normalized = str(county).strip().lower()
        df['County Out of State'] = normalized not in VALID_PA_COUNTIES
    return df
def remove_duplicate_client_records(df:pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of df without the rows whose center column
    (NEOSERRA_COLUMNS.center) equals "Duplicate Client Records".
    The frame passed in is left untouched.
    """
    working = df.copy()
    keep_mask = working[NEOSERRA_COLUMNS.center] != "Duplicate Client Records"
    return working[keep_mask]
def remove_api_testing_clients(df:pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of df without the rows whose center column
    (NEOSERRA_COLUMNS.center) equals "API Testing Sandbox".
    The frame passed in is left untouched.
    """
    working = df.copy()
    keep_mask = working[NEOSERRA_COLUMNS.center] != "API Testing Sandbox"
    return working[keep_mask]
def clean_center_name(df: pd.DataFrame, center_name_column:str="Center"):
    """
    Cleans up the Center column of a Neoserra data export in place, using the
    conversion mapping defined in the library code, which can be accessed via
    the constant CENTER_NAME_MAPPING.

    Values are converted to text and stripped of surrounding whitespace, then
    replaced by their standard label when they match a mapping key exactly.
    Values without a mapping entry keep their stripped text. Missing values
    stay missing instead of becoming the literal label "nan".

    Parameters:
    df - the dataframe to clean up (modified in place)
    center_name_column - the column of the dataframe containing the center names
    Returns:
    None
    """
    raw = df[center_name_column]
    cleaned = raw.astype(str).str.strip().replace(CENTER_NAME_MAPPING)
    # astype(str) can turn NaN/None into the text "nan"/"None"; put the
    # original missing markers back so they are not treated as a center name.
    df[center_name_column] = cleaned.where(raw.notna(), raw)

View File

@@ -0,0 +1,13 @@
# libs/pasbdc_data_cleaning/pyproject.toml
[build-system]
requires = ["setuptools", "wheel"]
build-backend = "setuptools.build_meta"
[project]
name = "pasbdc_data_cleaning"
version = "0.1.0"
description = "Internal data cleaning library for the PASBDC written by Vincent Allen"
# MOVED: Configuration specific to setuptools goes here
[tool.setuptools]
packages = ["pasbdc_data_cleaning"]

View File

@@ -0,0 +1,64 @@
## Easy Word Docs in Python
---
This library implements a word document builder class that allows you to pass a list of functions and it will generate a word document using each function to generate a page.
This has the benefit of providing modularity in scripts that make word documents as their outputs as a script can be configured to use any combination
of page functions to build a word doc.
### Tips:
---
The current page number can be read from the current_section member variable of the document builder.
The current figure number and table number can be read from the member variables figure_number and table_number. This allows you to have dynamically
labeled graphs and tables in your document pages.
The PageConfig class allows you to customize the page-breaking behavior to ensure the correct number of pages is present in the final document.
## Usage:
---
```
from docx import Document
from docx.shared import Inches, Pt
from docx.enum.text import WD_ALIGN_PARAGRAPH
from typing import Callable, List, Any
from pasbdc_word_library import WordDocumentBuilder, PageConfig
# Example page function
def title_page(builder: WordDocumentBuilder, title: str = "Document Title", author: str = "Author", **kwargs):
"""Create a title page."""
title_para = builder.doc.add_paragraph()
title_run = title_para.add_run(title)
title_run.font.size = Pt(28)
title_run.bold = True
title_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
builder.doc.add_paragraph() # Spacing
author_para = builder.doc.add_paragraph(author)
author_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
# Create document builder
builder = WordDocumentBuilder()
# Define pages, these are functions
pages = [
PageConfig(title_page, add_page_break=False)
]
# Create document with custom parameters. Only pages and the output path are required; the remaining keyword arguments are passed to the page functions,
# and each function picks up the ones it declares (the rest land in its **kwargs). Think of it as a lightweight form of dependency injection.
doc = builder.create_document(
pages,
"output/my_document.docx",
title="My Custom Report",
author="John Doe",
heading="Introduction",
content="This is the introduction section with detailed information.",
table_data=[
["Product", "Price", "Quantity"],
["Widget", "$10", "100"],
["Gadget", "$25", "50"]
],
table_title="Sales Data"
)
```

View File

@@ -0,0 +1,4 @@
Metadata-Version: 2.4
Name: pasbdc_word_library
Version: 0.1.0
Summary: A tool used to make the generation of word documents in python a little easier.

View File

@@ -0,0 +1,9 @@
README.md
pyproject.toml
pasbdc_word_library/__init__.py
pasbdc_word_library/doclibrary.py
pasbdc_word_library/theme_helpers.py
pasbdc_word_library.egg-info/PKG-INFO
pasbdc_word_library.egg-info/SOURCES.txt
pasbdc_word_library.egg-info/dependency_links.txt
pasbdc_word_library.egg-info/top_level.txt

View File

@@ -0,0 +1 @@
pasbdc_word_library

View File

@@ -0,0 +1,4 @@
# libs/word_library/__init__.py
from .doclibrary import WordDocumentBuilder, PageConfig, title_page, content_page, table_page, image_page
from .theme_helpers import theme_paragraph, theme_title
__all__ = ['WordDocumentBuilder', 'PageConfig', 'title_page', 'content_page', 'table_page', 'image_page', 'theme_paragraph', 'theme_title']

View File

@@ -0,0 +1,127 @@
import os
from dataclasses import dataclass
from typing import Any, Callable, List, Optional

from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.shared import Inches, Pt
@dataclass
class PageConfig:
    """Pairs a page function with its page-break setting for WordDocumentBuilder."""

    # Called by WordDocumentBuilder.create_document as
    # page_function(builder, **kwargs) to add the page's content.
    page_function: Callable
    # When True, a page break follows this page (never after the last page).
    add_page_break: bool = True
class WordDocumentBuilder:
    """Assemble a Word document by running a sequence of page functions."""

    def __init__(self):
        # Document that every page function writes into via ``builder.doc``.
        self.doc = Document()
        # Starts at 1 and is advanced once per page by create_document().
        self.current_section = 1
        # Numbering counters; this class never changes them, they exist for
        # page functions to read and update when labelling figures/tables.
        self.figure_number = 0
        self.table_number = 0

    def add_page_break(self):
        """Append a page break to the document."""
        self.doc.add_page_break()

    def create_document(self, page_functions: List[PageConfig], output_path: str, **kwargs):
        """
        Build the document page by page and save it.

        Args:
            page_functions: PageConfig entries, run in order; each entry's
                page_function is called as ``page_function(self, **kwargs)``
            output_path: Path where the document will be saved; its parent
                directory is created when it does not exist
            **kwargs: Additional parameters passed to every page function

        Returns:
            The created Document object
        """
        final_index = len(page_functions) - 1
        for index, config in enumerate(page_functions):
            # Let the page function add its content to self.doc.
            config.page_function(self, **kwargs)

            # Break after the page if requested, but never after the last one.
            if index < final_index and config.add_page_break:
                self.add_page_break()

            # Advance the section counter once per processed page.
            self.current_section += 1

        # A bare file name has no directory part; fall back to the current one.
        target_dir = os.path.dirname(output_path) or '.'
        os.makedirs(target_dir, exist_ok=True)

        self.doc.save(output_path)
        return self.doc
# Example page functions
def title_page(builder: WordDocumentBuilder, title: str = "Document Title", author: str = "Author", **kwargs):
    """Add a centered, bold 28pt title, a blank spacer paragraph and a centered author line."""
    document = builder.doc

    # Title: a single bold 28pt run in its own centered paragraph.
    heading_paragraph = document.add_paragraph()
    heading_run = heading_paragraph.add_run(title)
    heading_run.font.size = Pt(28)
    heading_run.bold = True
    heading_paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER

    # Empty paragraph used as vertical spacing between title and author.
    document.add_paragraph()

    byline = document.add_paragraph(author)
    byline.alignment = WD_ALIGN_PARAGRAPH.CENTER
def content_page(builder: WordDocumentBuilder, heading: str = "Section", content: str = "", **kwargs):
    """Add a level-1 heading followed by one paragraph of body text."""
    document = builder.doc
    document.add_heading(heading, level=1)
    document.add_paragraph(content)
def table_page(builder: WordDocumentBuilder, table_data: Optional[List[List[str]]] = None, table_title: str = "Data Table", **kwargs):
    """
    Create a page with a level-1 heading followed by a table.

    Args:
        builder: Document builder whose ``doc`` receives the heading and table
        table_data: Rows of cell values (each value is written as ``str(value)``).
            When omitted or empty, a small placeholder table is written instead.
            Rows may differ in length; shorter rows leave trailing cells empty.
        table_title: Text of the heading placed above the table
        **kwargs: Ignored; accepts the parameters meant for other page functions
    """
    # An omitted or empty table used to crash on table_data[0]; fall back to
    # the placeholder rows in both cases.
    if table_data is None or len(table_data) == 0:
        table_data = [["Header 1", "Header 2"], ["Data 1", "Data 2"]]

    builder.doc.add_heading(table_title, level=1)

    # Size the table for the widest row so a longer later row cannot index
    # past the last column.
    column_count = max(len(row) for row in table_data)
    table = builder.doc.add_table(rows=len(table_data), cols=column_count)
    table.style = 'Light Grid Accent 1'

    for row_index, row_values in enumerate(table_data):
        # Fetch the row's cells once rather than once per value.
        cells = table.rows[row_index].cells
        for col_index, cell_value in enumerate(row_values):
            cells[col_index].text = str(cell_value)
def image_page(builder: WordDocumentBuilder, image_path: str = "", caption: str = "", **kwargs):
    """Add a caption paragraph followed by the image, 5 inches wide.

    Nothing is added when image_path is empty or the file does not exist.
    """
    if not image_path or not os.path.exists(image_path):
        return

    document = builder.doc
    document.add_paragraph(caption)
    document.add_picture(image_path, width=Inches(5))
# Usage example
if __name__ == "__main__":
    # Keyword arguments handed to every page function; each function uses the
    # ones it declares and absorbs the rest through **kwargs.
    report_options = {
        "title": "My Custom Report",
        "author": "John Doe",
        "heading": "Introduction",
        "content": "This is the introduction section with detailed information.",
        "table_data": [
            ["Product", "Price", "Quantity"],
            ["Widget", "$10", "100"],
            ["Gadget", "$25", "50"],
        ],
        "table_title": "Sales Data",
    }

    # Title and content pages end with a page break; the table page is last.
    pages = [
        PageConfig(title_page, add_page_break=True),
        PageConfig(content_page, add_page_break=True),
        PageConfig(table_page, add_page_break=False),
    ]

    builder = WordDocumentBuilder()
    doc = builder.create_document(pages, "output/my_document.docx", **report_options)
    print("Document created successfully!")

View File

@@ -0,0 +1,22 @@
import docx
from docx.shared import RGBColor, Pt
from docx.shared import Pt
from docx.enum.text import WD_ALIGN_PARAGRAPH
def theme_paragraph(
    paragraph,
    font_size_pt:int=9,
    font_name:str="Futera",
    color:RGBColor=RGBColor(15,27,38)
):
    """
    Apply one font name, size (in points) and color to every run of paragraph.

    Only runs that already exist are changed; a paragraph without runs is
    left as it is.
    """
    # NOTE(review): "Futera" may be a misspelling of "Futura" - confirm the
    # exact font name available where the documents are opened.
    for text_run in paragraph.runs:
        run_font = text_run.font
        run_font.name = font_name
        run_font.size = Pt(font_size_pt)
        run_font.color.rgb = color
def theme_title(title_run, paragraph=None):
    """
    Style a run as a themed title: bold, 12pt, 'Futera', green (113, 191, 68),
    and left-align the paragraph that holds it.

    Parameters:
    title_run - the run to style
    paragraph - optional paragraph containing the run; when omitted, the run's
                parent paragraph is used if it can be determined
    Returns:
    None
    """
    title_run.bold = True
    title_font = title_run.font
    # NOTE(review): "Futera" may be a misspelling of "Futura" - confirm the
    # exact font name available where the documents are opened.
    title_font.name = 'Futera'
    title_font.size = Pt(12)
    title_font.color.rgb = RGBColor(113, 191, 68)

    # Alignment is a paragraph property. Assigning it on the run only created
    # an unused attribute, so the title was never actually aligned.
    if paragraph is None:
        # NOTE(review): relies on python-docx keeping the owning paragraph in
        # the run's private _parent attribute - pass `paragraph` explicitly to
        # avoid depending on it.
        paragraph = getattr(title_run, "_parent", None)
    if paragraph is not None and hasattr(paragraph, "alignment"):
        paragraph.alignment = WD_ALIGN_PARAGRAPH.LEFT

View File

@@ -0,0 +1,12 @@
[build-system]
requires = ["setuptools", "wheel"]
build-backend = "setuptools.build_meta"
[project]
name = "pasbdc_word_library"
version = "0.1.0"
description = "A tool used to make the generation of word documents in python a little easier."
[tool.setuptools]
packages = ["pasbdc_word_library"]