first commit
This commit is contained in:
6
libs/pasbdc_data_cleaning/README.md
Normal file
6
libs/pasbdc_data_cleaning/README.md
Normal file
@@ -0,0 +1,6 @@
|
||||
# Common Cleaning Functions Library
|
||||
---
|
||||
This Python library provides common utility functions for working with data exported from Neoserra.
|
||||
|
||||
## Functions:
|
||||
clean_center_name - Turns the messy center names from Neoserra into the standard center labels we should be using in graphs
|
||||
@@ -0,0 +1,4 @@
|
||||
Metadata-Version: 2.4
|
||||
Name: pasbdc_data_cleaning
|
||||
Version: 0.1.0
|
||||
Summary: Internal data cleaning library for the PASBDC written by Vincent Allen
|
||||
@@ -0,0 +1,8 @@
|
||||
README.md
|
||||
pyproject.toml
|
||||
pasbdc_data_cleaning/__init__.py
|
||||
pasbdc_data_cleaning/sbdclibrary.py
|
||||
pasbdc_data_cleaning.egg-info/PKG-INFO
|
||||
pasbdc_data_cleaning.egg-info/SOURCES.txt
|
||||
pasbdc_data_cleaning.egg-info/dependency_links.txt
|
||||
pasbdc_data_cleaning.egg-info/top_level.txt
|
||||
@@ -0,0 +1 @@
|
||||
|
||||
@@ -0,0 +1 @@
|
||||
pasbdc_data_cleaning
|
||||
@@ -0,0 +1,3 @@
|
||||
# libs/pasbdc_data_cleaning/__init__.py
|
||||
from .sbdclibrary import clean_center_name, tag_county_out_of_state, remove_duplicate_client_records, remove_api_testing_clients
|
||||
__all__ = ['clean_center_name', 'tag_county_out_of_state', 'remove_duplicate_client_records', 'remove_api_testing_clients']
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
169
libs/pasbdc_data_cleaning/pasbdc_data_cleaning/sbdclibrary.py
Normal file
169
libs/pasbdc_data_cleaning/pasbdc_data_cleaning/sbdclibrary.py
Normal file
@@ -0,0 +1,169 @@
|
||||
import pandas as pd
|
||||
|
||||
from constants_module import NEOSERRA_COLUMNS
|
||||
|
||||
# Lookup table used by clean_center_name: each key is a raw center name as it
# may appear in a Neoserra export, each value is the standard center label it
# is replaced with. Keys are matched exactly (case-sensitive).
# NOTE(review): misspelled keys such as "Universty" / "Univeristy" presumably
# mirror values that really occur in the exports — confirm before "fixing" them.
CENTER_NAME_MAPPING = {
    "PI - Washington County":"Pittsburgh",
    "University of Pittsburgh SBDC":"Pittsburgh",
    "Indiana County":"Pittsburgh",
    "PI - Greene County":"Pittsburgh",
    "TE - TEMPLE SBDC":"Temple",
    "TE - WCU Procurement Assistance Center":"Temple",
    "Kutztown University SBDC": "Kutztown",
    "K - Kutztown SBDC":"Kutztown",
    "WD - WIDENER SBDC": "Widener",
    "WD - Norristown Outreach":"Widener",
    "The University of Scranton SBDC": "Scranton",
    "SC - Monroe Outreach":"Scranton",
    "SC - Wyoming Outreach":"Scranton",
    "SC - Susquehanna Outreach":"Scranton",
    "SC - Scranton Outreach":"Scranton",
    "SC - Pike Outreach":"Scranton",
    "SC - Wayne Outreach":"Scranton",
    "SC - Lackawanna Outreach":"Scranton",
    "PennWest University Clarion SBDC":"Clarion",
    "Clarion CARES Act":"Clarion",
    "WI - WILKES SBDC":"Wilkes",
    "WI - WIlkes":"Wilkes",
    "WI - Bloomsburg":"Wilkes",
    "LE - LEHIGH UNIVERSITY SBDC":"Lehigh",
    "G - GANNON SBDC":"Gannon",
    "G - Meadville":"Gannon",
    "G - Mercer":"Gannon",
    "G - Warren":"Gannon",
    "Penn State SBDC":"Penn State",
    "SH - SHIPPENSBURG SBDC":"Shippensburg",
    "Duquesne University SBDC":"Duquesne",
    "Bucknell SBDC":"Bucknell",
    "SF - ST. FRANCIS UNIVERSITY SBDC": "St. Francis",
    "SF - Somerset Outreach":"St. Francis",
    "SF - Bedford Outreach":"St. Francis",
    "SF - Blair Outreach":"St. Francis",
    "SF - St Francis University SBDC":"St. Francis",
    "SF - St Francis Universty SBDC":"St. Francis",
    "SF - St Francis Univeristy SBDC":"St. Francis",
    "SV - ST. VINCENT COLLEGE SBDC":"St. Vincent",
    "SV - AIHP Outreach":"St. Vincent",
    "SV - WEDC Outreach":"St. Vincent",
    "SV - Fayette Outreach":"St. Vincent",
    "SV - Jeannette Outreach":"St. Vincent",
    "LE - Bucks County/Lehigh SBDC":"Lehigh",
    "Southeast Pennsylvania APEX Accelerator":"Z-Lead Office",
    "EMAP":"Z-Lead Office",
    "Pennsylvania SBDC Lead Office":"Z-Lead Office",
    "State Small Business Credit Initiative (SSBCI)":"Z-Lead Office",
    "D - Beaver":"Duquesne",
    "AA - Stakeholders/Partners":"Z-Lead Office",
    "AA - Temple Stakeholders/Partners":"Z-Lead Office",
    "Wharton":"Z-Lead Office",
    "Wharton SBDC":"Z-Lead Office",
    "Lock Haven SBDC":"Penn State"

    # Disabled entries with leading/trailing spaces, kept for reference.
    # clean_center_name strips surrounding whitespace before the lookup, so it
    # does not need these padded variants.
    # "Kutztown University SBDC ": "Kutztown",
    #" Kutztown University SBDC": "Kutztown",
    #" Pennsylvania SBDC Lead Office":"Lead Office",
}
|
||||
|
||||
# Lower-case names of the 67 Pennsylvania counties. Values are compared against
# the lower-cased 'Physical Address County' field by tag_county_out_of_state.
VALID_PA_COUNTIES = [
    "juniata", "montour", "northumberland", "perry", "snyder", "union",
    "allegheny", "beaver", "butler", "lawrence", "washington",
    "crawford", "erie", "mercer", "warren",
    "dauphin", "lancaster", "lebanon",
    "berks", "chester",
    "bucks", "lehigh", "northampton",
    "centre", "clinton", "lycoming", "mifflin",
    "armstrong", "cameron", "clarion", "clearfield", "elk", "forest",
    "jefferson", "mckean", "potter", "venango",
    "bedford", "blair", "cambria", "fulton", "huntingdon", "somerset",
    "fayette", "westmoreland",
    "adams", "cumberland", "franklin", "york",
    "montgomery", "philadelphia",
    "greene", "indiana",
    "bradford", "lackawanna", "monroe", "pike", "susquehanna", "tioga",
    "wayne", "wyoming",
    "delaware",
    "carbon", "columbia", "luzerne", "schuylkill",
    "sullivan",
]

# Set view of the list above so single-row lookups are O(1).
_VALID_PA_COUNTY_SET = frozenset(VALID_PA_COUNTIES)


def tag_county_out_of_state(df):
    """
    Add a boolean 'County Out of State' flag based on 'Physical Address County'.

    The flag is True when the county, compared case-insensitively and with
    surrounding whitespace ignored, is not listed in VALID_PA_COUNTIES.

    Parameters:
        df - either a whole DataFrame (every row is tagged) or a single row
             (a Series or dict-like, e.g. when used with DataFrame.apply(axis=1)).
             The object is modified in place.

    Returns:
        The same object that was passed in, with 'County Out of State' set.
    """
    county = df['Physical Address County']

    if isinstance(df, pd.DataFrame):
        # str(column) would stringify the whole Series and never match a county,
        # flagging every row; compare element-wise instead.
        normalized = county.astype(str).str.strip().str.lower()
        df['County Out of State'] = ~normalized.isin(VALID_PA_COUNTIES)
        return df

    # Single-row path: the county is a scalar value.
    df['County Out of State'] = str(county).strip().lower() not in _VALID_PA_COUNTY_SET
    return df
|
||||
|
||||
def remove_duplicate_client_records(df:pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of df without the rows whose center is "Duplicate Client Records".

    The center column name is taken from NEOSERRA_COLUMNS.center. The input
    dataframe itself is left untouched because the filter runs on a copy.
    """
    working = df.copy()
    is_real_record = working[NEOSERRA_COLUMNS.center] != "Duplicate Client Records"
    return working[is_real_record]
|
||||
|
||||
def remove_api_testing_clients(df:pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of df without the rows whose center is "API Testing Sandbox".

    The center column name is taken from NEOSERRA_COLUMNS.center. The input
    dataframe itself is left untouched because the filter runs on a copy.
    """
    working = df.copy()
    is_real_client = working[NEOSERRA_COLUMNS.center] != "API Testing Sandbox"
    return working[is_real_client]
|
||||
|
||||
def clean_center_name(df: pd.DataFrame, center_name_column:str="Center"):
    """
    Normalise the center column of a Neoserra data export in place.

    Every value in the column is converted to str, stripped of surrounding
    whitespace and then replaced by its standard label when it appears as a
    key in the module constant CENTER_NAME_MAPPING. Values without an entry
    in the mapping keep their stripped text.

    NOTE(review): because of the str conversion, missing values may end up as
    literal text such as "nan" depending on the pandas version — confirm.

    Parameters:
        df - the dataframe to clean up (its column is overwritten)
        center_name_column - the column of the dataframe containing the center names

    Returns:
        None
    """
    raw_names = df[center_name_column]
    tidy_names = raw_names.astype(str).str.strip()
    df[center_name_column] = tidy_names.replace(CENTER_NAME_MAPPING)
|
||||
|
||||
|
||||
13
libs/pasbdc_data_cleaning/pyproject.toml
Normal file
13
libs/pasbdc_data_cleaning/pyproject.toml
Normal file
@@ -0,0 +1,13 @@
|
||||
# libs/pasbdc_data_cleaning/pyproject.toml
|
||||
[build-system]
|
||||
requires = ["setuptools", "wheel"]
|
||||
build-backend = "setuptools.build_meta"
|
||||
|
||||
[project]
|
||||
name = "pasbdc_data_cleaning"
|
||||
version = "0.1.0"
|
||||
description = "Internal data cleaning library for the PASBDC written by Vincent Allen"
|
||||
|
||||
# Setuptools-specific configuration
|
||||
[tool.setuptools]
|
||||
packages = ["pasbdc_data_cleaning"]
|
||||
64
libs/word_library/README.md
Normal file
64
libs/word_library/README.md
Normal file
@@ -0,0 +1,64 @@
|
||||
## Easy Word Docs in Python
|
||||
---
|
||||
This library implements a Word document builder class: you pass it a list of page functions, and it generates a Word document in which each function produces one page.
|
||||
|
||||
This keeps scripts that output Word documents modular, because a script can be configured to use any combination
of page functions to build a Word doc.
|
||||
|
||||
### Tips:
|
||||
---
|
||||
The current page number can be read from the document builder's current_section member variable.
|
||||
|
||||
The current figure number and table number can be accessed with the member variables figure_number and table_number. This allows you to have dynamically
labeled graphs and tables in your document pages. The builder only initialises both counters to 0, so page functions must increment them themselves.
|
||||
|
||||
The PageConfig class allows you to customize the page breaking behavior to ensure the correct number of pages are present in the final document
|
||||
|
||||
## Usage:
|
||||
---
|
||||
```
|
||||
from docx import Document
|
||||
from docx.shared import Inches, Pt
|
||||
from docx.enum.text import WD_ALIGN_PARAGRAPH
|
||||
from typing import Callable, List, Any
from pasbdc_word_library import WordDocumentBuilder, PageConfig
|
||||
|
||||
# Example page function
|
||||
def title_page(builder: WordDocumentBuilder, title: str = "Document Title", author: str = "Author", **kwargs):
|
||||
"""Create a title page."""
|
||||
title_para = builder.doc.add_paragraph()
|
||||
title_run = title_para.add_run(title)
|
||||
title_run.font.size = Pt(28)
|
||||
title_run.bold = True
|
||||
title_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
|
||||
|
||||
builder.doc.add_paragraph() # Spacing
|
||||
|
||||
author_para = builder.doc.add_paragraph(author)
|
||||
author_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
|
||||
|
||||
# Create document builder
|
||||
builder = WordDocumentBuilder()
|
||||
|
||||
# Define pages, these are functions
|
||||
pages = [
|
||||
PageConfig(title_page, add_page_break=False)
|
||||
]
|
||||
|
||||
# Create document with custom parameters. Only pages and the output path are required, the following parameters will be passed into the functions that
|
||||
# request them through **kwargs. Think of it kinda like dependency injection.
|
||||
doc = builder.create_document(
|
||||
pages,
|
||||
"output/my_document.docx",
|
||||
title="My Custom Report",
|
||||
author="John Doe",
|
||||
heading="Introduction",
|
||||
content="This is the introduction section with detailed information.",
|
||||
table_data=[
|
||||
["Product", "Price", "Quantity"],
|
||||
["Widget", "$10", "100"],
|
||||
["Gadget", "$25", "50"]
|
||||
],
|
||||
table_title="Sales Data"
|
||||
)
|
||||
```
|
||||
|
||||
4
libs/word_library/pasbdc_word_library.egg-info/PKG-INFO
Normal file
4
libs/word_library/pasbdc_word_library.egg-info/PKG-INFO
Normal file
@@ -0,0 +1,4 @@
|
||||
Metadata-Version: 2.4
|
||||
Name: pasbdc_word_library
|
||||
Version: 0.1.0
|
||||
Summary: A tool used to make the generation of word documents in python a little easier.
|
||||
@@ -0,0 +1,9 @@
|
||||
README.md
|
||||
pyproject.toml
|
||||
pasbdc_word_library/__init__.py
|
||||
pasbdc_word_library/doclibrary.py
|
||||
pasbdc_word_library/theme_helpers.py
|
||||
pasbdc_word_library.egg-info/PKG-INFO
|
||||
pasbdc_word_library.egg-info/SOURCES.txt
|
||||
pasbdc_word_library.egg-info/dependency_links.txt
|
||||
pasbdc_word_library.egg-info/top_level.txt
|
||||
@@ -0,0 +1 @@
|
||||
|
||||
@@ -0,0 +1 @@
|
||||
pasbdc_word_library
|
||||
4
libs/word_library/pasbdc_word_library/__init__.py
Normal file
4
libs/word_library/pasbdc_word_library/__init__.py
Normal file
@@ -0,0 +1,4 @@
|
||||
# libs/word_library/__init__.py
|
||||
from .doclibrary import WordDocumentBuilder, PageConfig, title_page, content_page, table_page, image_page
|
||||
from .theme_helpers import theme_paragraph, theme_title
|
||||
__all__ = ['WordDocumentBuilder', 'PageConfig', 'title_page', 'content_page', 'table_page', 'image_page', 'theme_paragraph', 'theme_title']
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
127
libs/word_library/pasbdc_word_library/doclibrary.py
Normal file
127
libs/word_library/pasbdc_word_library/doclibrary.py
Normal file
@@ -0,0 +1,127 @@
|
||||
from docx import Document
|
||||
from docx.shared import Inches, Pt
|
||||
from docx.enum.text import WD_ALIGN_PARAGRAPH
|
||||
from typing import Callable, List, Any
|
||||
import os
|
||||
from dataclasses import dataclass
|
||||
|
||||
@dataclass
class PageConfig:
    """Pair a page function with its page-break setting for WordDocumentBuilder.create_document."""
    # Callable that create_document invokes as page_function(builder, **kwargs)
    # to add one page's content to the document.
    page_function: Callable
    # When True, create_document adds a page break after this page, unless it
    # is the last entry in the list.
    add_page_break: bool = True
|
||||
|
||||
class WordDocumentBuilder:
    """Build Word documents using configurable page functions.

    Attributes:
        doc: The python-docx Document that page functions write into.
        current_section: Counter that starts at 1 and is incremented after
            each page function has run.
        figure_number: Counter initialised to 0; the builder never changes it,
            so page functions can use it to number figures.
        table_number: Counter initialised to 0; the builder never changes it,
            so page functions can use it to number tables.
    """

    def __init__(self):
        self.doc = Document()
        self.current_section = 1
        self.figure_number = 0
        self.table_number = 0

    def add_page_break(self):
        """Add a page break to the document."""
        self.doc.add_page_break()

    def create_document(self, page_functions: List["PageConfig"], output_path: str, **kwargs):
        """
        Create a Word document by executing page functions.

        Args:
            page_functions: List of PageConfig entries; each entry's
                page_function is called as page_function(builder, **kwargs)
            output_path: Path where the document will be saved; missing parent
                directories are created
            **kwargs: Additional parameters passed to all page functions

        Returns:
            The created Document object
        """
        last_index = len(page_functions) - 1
        for i, page_conf in enumerate(page_functions):
            # Execute page function with builder and any additional kwargs
            page_conf.page_function(self, **kwargs)

            # Add page break after each page except the last
            if i < last_index and page_conf.add_page_break:
                self.add_page_break()

            # Increment the section state
            self.current_section += 1

        # Ensure directory exists; a bare file name has an empty dirname,
        # which means the current directory.
        os.makedirs(os.path.dirname(output_path) or '.', exist_ok=True)

        # Save document
        self.doc.save(output_path)
        return self.doc
|
||||
|
||||
|
||||
# Example page functions
|
||||
def title_page(builder: WordDocumentBuilder, title: str = "Document Title", author: str = "Author", **kwargs):
    """Add a centred, bold 28 pt title, an empty spacer paragraph and a centred author line.

    Extra keyword arguments are accepted and ignored.
    """
    document = builder.doc

    heading = document.add_paragraph()
    heading_run = heading.add_run(title)
    heading_run.bold = True
    heading_run.font.size = Pt(28)
    heading.alignment = WD_ALIGN_PARAGRAPH.CENTER

    # Empty paragraph used as vertical spacing between title and author.
    document.add_paragraph()

    byline = document.add_paragraph(author)
    byline.alignment = WD_ALIGN_PARAGRAPH.CENTER
|
||||
|
||||
|
||||
def content_page(builder: WordDocumentBuilder, heading: str = "Section", content: str = "", **kwargs):
    """Add a level-1 heading followed by one paragraph of body text.

    Extra keyword arguments are accepted and ignored.
    """
    document = builder.doc
    document.add_heading(heading, level=1)
    document.add_paragraph(content)
|
||||
|
||||
|
||||
def table_page(builder: "WordDocumentBuilder", table_data: List[List[str]] = None, table_title: str = "Data Table", **kwargs):
    """Create a page with a level-1 heading and a table.

    Args:
        builder: Document builder whose ``doc`` receives the heading and table.
        table_data: Rows of cell values; each value is written with ``str()``.
            When omitted or empty, a small 2x2 placeholder table is used.
            Rows may differ in length: the table is as wide as the longest
            row and shorter rows leave their remaining cells empty.
        table_title: Text of the heading placed above the table.
        **kwargs: Accepted and ignored.
    """
    # Covers both None and an empty list; an empty list used to reach
    # table_data[0] below and raise IndexError.
    if not table_data:
        table_data = [["Header 1", "Header 2"], ["Data 1", "Data 2"]]

    builder.doc.add_heading(table_title, level=1)

    # Size the table by the widest row so a row longer than the first one
    # cannot index past the last column.
    column_count = max(len(row) for row in table_data)
    table = builder.doc.add_table(rows=len(table_data), cols=column_count)
    table.style = 'Light Grid Accent 1'

    for i, row in enumerate(table_data):
        cells = table.rows[i].cells
        for j, cell_value in enumerate(row):
            cells[j].text = str(cell_value)
|
||||
|
||||
|
||||
def image_page(builder: WordDocumentBuilder, image_path: str = "", caption: str = "", **kwargs):
    """Add a caption paragraph followed by the image, rendered 5 inches wide.

    Nothing is added when image_path is empty or does not exist on disk.
    Extra keyword arguments are accepted and ignored.
    """
    if not image_path or not os.path.exists(image_path):
        return

    document = builder.doc
    document.add_paragraph(caption)
    document.add_picture(image_path, width=Inches(5))
|
||||
|
||||
# Usage example
|
||||
if __name__ == "__main__":
    # Demo run: builds a three-page document (title, content, table) and
    # saves it to output/my_document.docx.
    builder = WordDocumentBuilder()

    # Page functions paired with their page-break setting, in document order.
    pages = [
        PageConfig(page_fn, add_page_break=wants_break)
        for page_fn, wants_break in (
            (title_page, True),
            (content_page, True),
            (table_page, False),
        )
    ]

    # Every keyword below is forwarded to each page function; a function picks
    # up the ones it declares and ignores the rest through **kwargs.
    doc = builder.create_document(
        pages,
        "output/my_document.docx",
        title="My Custom Report",
        author="John Doe",
        heading="Introduction",
        content="This is the introduction section with detailed information.",
        table_data=[
            ["Product", "Price", "Quantity"],
            ["Widget", "$10", "100"],
            ["Gadget", "$25", "50"]
        ],
        table_title="Sales Data"
    )

    print("Document created successfully!")
|
||||
22
libs/word_library/pasbdc_word_library/theme_helpers.py
Normal file
22
libs/word_library/pasbdc_word_library/theme_helpers.py
Normal file
@@ -0,0 +1,22 @@
|
||||
import docx
|
||||
from docx.shared import RGBColor, Pt
|
||||
from docx.shared import Pt
|
||||
from docx.enum.text import WD_ALIGN_PARAGRAPH
|
||||
|
||||
def theme_paragraph(
    paragraph,
    font_size_pt:int=9,
    font_name:str="Futera",
    color:RGBColor=RGBColor(15,27,38)
):
    """Apply one font name, size (in points) and colour to every run of a paragraph.

    A paragraph without runs is left unchanged.
    NOTE(review): the default font name "Futera" may be a misspelling of
    "Futura" — confirm against the fonts actually installed.
    """
    for font in (run.font for run in paragraph.runs):
        font.name = font_name
        font.size = Pt(font_size_pt)
        font.color.rgb = color
|
||||
|
||||
def theme_title(title_run):
    """Style a run as a title: bold, 'Futera', 12 pt, colour RGB(113, 191, 68)."""
    title_run.bold = True

    font = title_run.font
    font.name = 'Futera'
    font.size = Pt(12)
    font.color.rgb = RGBColor(113, 191, 68)

    # NOTE(review): python-docx documents alignment on paragraphs, not runs, so
    # this assignment presumably only stores an unused attribute on the run and
    # does not change the layout — confirm and move to the paragraph if needed.
    title_run.alignment = WD_ALIGN_PARAGRAPH.LEFT
|
||||
12
libs/word_library/pyproject.toml
Normal file
12
libs/word_library/pyproject.toml
Normal file
@@ -0,0 +1,12 @@
|
||||
[build-system]
|
||||
requires = ["setuptools", "wheel"]
|
||||
build-backend = "setuptools.build_meta"
|
||||
|
||||
[project]
|
||||
name = "pasbdc_word_library"
|
||||
version = "0.1.0"
|
||||
description = "A tool used to make the generation of word documents in python a little easier."
|
||||
|
||||
[tool.setuptools]
|
||||
packages = ["pasbdc_word_library"]
|
||||
|
||||
Reference in New Issue
Block a user