-
Notifications
You must be signed in to change notification settings - Fork 0
/
03-standardize.py
79 lines (61 loc) · 2.47 KB
/
03-standardize.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
"""
Replaces all truncated (sub)categories with standardized (sub)categories.
"""
import os

import pandas as pd
# Directory main() reads the combined "*-raw.csv" complaint files from.
COMBINED_DIR = "output/02-combined"
# Directory main() writes the standardized complaint CSVs to.
STANDARDIZED_DIR = "output/03-standardized"
def clean_complaints(raw_data_path, lookup_path=None):
    """
    Load a raw complaints CSV and optionally standardize it with a lookup CSV.

    When ``lookup_path`` is given, the lookup is left-joined onto the raw data
    on every column name the two files share, and the result is sorted by
    ``airport``, ``year_month`` and then the join columns. Value counts are
    printed for each lookup column whose name contains "status" or
    "prefix_removed". Without a lookup, the ``category`` and ``subcategory``
    columns are dropped and nothing else is changed.

    Args:
        raw_data_path: Location of the raw complaints CSV (anything
            ``pd.read_csv`` accepts).
        lookup_path: Optional location of the lookup CSV. Any falsy value
            skips the lookup step.

    Returns:
        The cleaned ``pandas.DataFrame``.

    Raises:
        ValueError: If the raw data and the lookup share no columns, if any
            row ends up with a missing or empty value in a lookup column whose
            name contains "status", or if the join changed the number of
            records (the lookup holds duplicate join keys).
    """
    raw = pd.read_csv(raw_data_path)
    print(f"{len(raw)} records in original dataset")

    if not lookup_path:
        # If no lookup, just drop unused categories
        cleaned = raw.drop(columns=["category", "subcategory"])
        print("No cleaning performed\n")
        return cleaned

    lookup = pd.read_csv(lookup_path)
    join_cols = [col for col in raw.columns if col in lookup.columns]
    if not join_cols:
        # Fail with a clear message instead of an obscure error from merge().
        raise ValueError("Raw data and lookup have no columns in common to join on")
    status_cols = [col for col in lookup.columns if "status" in col]

    cleaned = raw.merge(lookup, on=join_cols, how="left").sort_values(
        ["airport", "year_month"] + join_cols
    )

    # Every record must have found a lookup row: an empty status means the
    # raw value is missing from the lookup.
    for col in status_cols:
        unmatched = cleaned.loc[cleaned[col].fillna("") == ""]
        if len(unmatched):
            unmatched_values = (
                unmatched[join_cols].drop_duplicates().to_dict("records")
            )
            msg = f"Cannot find {col} value for {unmatched_values}"
            raise ValueError(msg)

    # A left join only adds rows when the lookup repeats a join key; that
    # would silently duplicate complaint records, so treat it as an error.
    if len(cleaned) != len(raw):
        duplicated_keys = (
            lookup.loc[lookup.duplicated(subset=join_cols, keep=False), join_cols]
            .drop_duplicates()
            .to_dict("records")
        )
        msg = (
            f"Lookup has duplicate entries for {duplicated_keys}: "
            f"{len(raw)} records became {len(cleaned)}"
        )
        raise ValueError(msg)

    print(f"{len(cleaned)} records in cleaned dataset (should match)\n")

    # Report metrics
    for col in lookup.columns:
        if "status" in col or "prefix_removed" in col:
            print(f"{cleaned[col].value_counts()}\n\n")

    return cleaned
def main():
    """
    Standardize the airport, category and subcategory complaint files.

    Each raw CSV is read from ``COMBINED_DIR``, passed through
    ``clean_complaints`` and written to ``STANDARDIZED_DIR``. The output
    directory is created first if it does not exist.
    """
    # to_csv() does not create missing parent directories.
    os.makedirs(STANDARDIZED_DIR, exist_ok=True)

    print("-- Airports --")
    # No lookup for the airport-level file: only the category columns are dropped.
    clean_airport = clean_complaints(
        f"{COMBINED_DIR}/complaints-by-airport-raw.csv", None
    )
    clean_airport.to_csv(f"{STANDARDIZED_DIR}/complaints-by-airport.csv", index=False)

    print("-- Categories --")
    clean_cat = clean_complaints(
        f"{COMBINED_DIR}/complaints-by-category-raw.csv",
        "lookups/lkp_cleaner_categories.csv",
    )
    # The category-level output does not carry the subcategory column.
    clean_cat = clean_cat.drop(columns=["subcategory"])
    clean_cat.to_csv(f"{STANDARDIZED_DIR}/complaints-by-category.csv", index=False)

    print("-- Subcategories --")
    clean_subcat = clean_complaints(
        f"{COMBINED_DIR}/complaints-by-subcategory-raw.csv",
        "lookups/lkp_cleaner_subcategories.csv",
    )
    clean_subcat.to_csv(
        f"{STANDARDIZED_DIR}/complaints-by-subcategory.csv", index=False
    )


if __name__ == "__main__":
    main()