A lightweight, Python-based, multi-process CSV batcher suitable for use with Pandas dataframes, as a standalone tool, or with other tools that deal with large CSV files (or files that require timely processing).
pip install csv-batcher
https://github.com/tangledpath/csv-batcher
https://tangledpath.github.io/csv-batcher/csv_batcher.html
- Possibly implement pooling with Celery (for use in Django apps, etc.), which would enable horizontal scaling.
The arguments sent to the callback function can be controlled by
creating the pooler with callback_with
set to one of the CallbackWith enum
values:
from csv_batcher.csv_pooler import CSVPooler, CallbackWith
# Callback function passed to pooler; accepts a dataframe row
# as a pandas Series (via apply)
def process_dataframe_row(row):
    """Return the first value of a dataframe row.

    Args:
        row: A single dataframe row, received as a pandas Series.

    Returns:
        The value at position 0 of ``row``.
    """
    return row.iloc[0]
# Build a pooler whose callback receives one dataframe row at a time.
pooler = CSVPooler(
    "5mSalesRecords.csv",
    process_dataframe_row,
    callback_with=CallbackWith.DATAFRAME_ROW,
    pool_size=16,
)

# Print each result yielded by process().
for processed_batch in pooler.process():
    print(processed_batch)
from csv_batcher.csv_pooler import CSVPooler, CallbackWith
# Used from process_dataframe's apply:
def process_dataframe_row(row):
    """Return the first value of a dataframe row.

    Args:
        row: A single dataframe row, received as a pandas Series.

    Returns:
        The value at position 0 of ``row``.
    """
    return row.iloc[0]
# Callback function passed to pooler; accepts a dataframe:
def process_dataframe(df):
    """Process a dataframe handed to the callback and report its size.

    Args:
        df: The dataframe to process.

    Returns:
        The number of rows in ``df``.
    """
    # axis=1 makes apply() call the row callback once per row.
    first_values = df.apply(process_dataframe_row, axis=1)
    # Or do something more complicated....
    return len(df)
# Build a pooler whose callback receives a dataframe.
pooler = CSVPooler(
    "5mSalesRecords.csv",
    process_dataframe,
    callback_with=CallbackWith.DATAFRAME,
    pool_size=16,
)

# Print each result yielded by process().
for processed_batch in pooler.process():
    print(processed_batch)
import pandas as pd
from csv_batcher.csv_pooler import CSVPooler, CallbackWith
# Used from process_csv_filename's apply:
def process_dataframe_row(row):
    """Return the first value of a dataframe row.

    Args:
        row: A single dataframe row, received as a pandas Series.

    Returns:
        The value at position 0 of ``row``.
    """
    return row.iloc[0]
def process_csv_filename(csv_chunk_filename):
    """Load a CSV chunk file into a dataframe and process its rows.

    Args:
        csv_chunk_filename: Path of the CSV chunk file to read.

    Returns:
        The number of rows read from the chunk file.
    """
    # print("processing ", csv_chunk_filename)
    df = pd.read_csv(csv_chunk_filename, skipinitialspace=True, index_col=None)
    # axis=1 makes apply() call the row callback once per row.
    first_values = df.apply(process_dataframe_row, axis=1)
    return len(df)
# Build a pooler whose callback receives the filename of a CSV chunk.
pooler = CSVPooler(
    "5mSalesRecords.csv",
    process_csv_filename,
    callback_with=CallbackWith.CSV_FILENAME,
    chunk_lines=10000,
    pool_size=16,
)

# Print each result yielded by process().
for processed_batch in pooler.process():
    print(processed_batch)
ruff check . # Find linting errors
ruff check . --fix # Auto-fix linting errors (where possible)
# Shows in browser
poetry run pdoc csv_batcher
# Generates to ./docs
poetry run pdoc csv_batcher -o ./docs
# OR (recommended)
bin/build.sh
clear; pytest
poetry publish --build -u __token__ -p $PYPI_TOKEN
# OR (recommended)
bin/publish.sh