diff --git a/.gitignore b/.gitignore
index 40ffa5b..2b02faf 100644
--- a/.gitignore
+++ b/.gitignore
@@ -162,3 +162,5 @@ cython_debug/
 #.idea/
 
 input.csv
+output.csv
+output.json
diff --git a/README.md b/README.md
index 4832b11..4f7deab 100644
--- a/README.md
+++ b/README.md
@@ -27,6 +27,11 @@ Specifically the global variables at the top of the file `RATELIMIT_REQUESTS` an
 These values define how many requests may be made within a specific time range (for example, 5 requests every 20 seconds).
 As of right now, there's no way to adjust these dynamically per invocation of the program based on command line parameters - it must be done with hard-coded values.
 
+### Output
+The program can output in either of two formats: JSON or CSV.
+Right now, the format is selected by a hard-coded value at the top of the file.
+The location of the output is also set by a hard-coded value at the top of the file.
+
 ## Localhost
 Before you can run the program, you should create a virtual environment for the python executable
 ```bash
diff --git a/db.py b/db.py
index 97cd315..0cbbe4e 100644
--- a/db.py
+++ b/db.py
@@ -1,13 +1,23 @@
 '''Data storage and retrieval interface'''
+import csv
+from enum import Enum
+from io import TextIOWrapper
 import json
+import sys
 
 SUCCESS_KEY = "succeed_domains"
 PRIVACY_KEY = "private_domains"
-FAILED_KEY = "failed_domains"
+FAILED_KEY = "failed_domains"
+
 
 class Db:
     '''Provides an interface to store/retrieve results'''
+    class Format(Enum):
+        '''Output formats'''
+        JSON = 1
+        CSV = 2
+
     DB = {}
 
     def record_country(self, domain, country):
@@ -46,7 +56,40 @@ def get_failed_domain_count(self):
     def __str__(self):
         return str(self.DB)
 
-    # Feature Request: Multiple output locations
-    def output_results(self):
+    def output_results(self, output_loc: TextIOWrapper = None, fmt: Format = Format.JSON):
         '''Outputs the results stored in the DB'''
-        print(json.dumps(self.DB, indent=4))
+        if fmt == Db.Format.JSON:
+            self._output_results_json(output_loc)
+        elif fmt == Db.Format.CSV:
+            self._output_results_csv(output_loc)
+
+    def _output_results_json(self, output_loc: TextIOWrapper = None):
+        '''Outputs the results stored in the DB in JSON format'''
+        results = json.dumps(self.DB, indent=4)
+        if output_loc is None:
+            print(results)
+        else:
+            output_loc.write(results)
+
+    def _output_results_csv(self, output_loc: TextIOWrapper = None):
+        '''Outputs the results stored in the DB in CSV format'''
+        fieldnames = ["domain", "country"]
+        data = []
+        if SUCCESS_KEY in self.DB:
+            for country in self.DB[SUCCESS_KEY]:
+                for domain in self.DB[SUCCESS_KEY][country]:
+                    data.append(
+                        {"country": "N/A" if country is None else country, "domain": domain})
+        if PRIVACY_KEY in self.DB:
+            for term in self.DB[PRIVACY_KEY]:
+                for domain in self.DB[PRIVACY_KEY][term]:
+                    data.append(
+                        {"country": f"Privacy Protected ({term})", "domain": domain})
+        if FAILED_KEY in self.DB:
+            for domain in self.DB[FAILED_KEY]:
+                data.append({"country": "Failed", "domain": domain})
+        if output_loc is None:
+            output_loc = sys.stdout
+        writer = csv.DictWriter(output_loc, fieldnames=fieldnames)
+        writer.writeheader()
+        writer.writerows(data)
diff --git a/main.py b/main.py
index dffa137..4646791 100644
--- a/main.py
+++ b/main.py
@@ -15,11 +15,13 @@
 from db import Db
 
-# Basic rate limiting: 5 requests per 20 seconds
-RATELIMIT_REQUESTS = 5 # Number of requests to rate limit
-RATELIMIT_TIMERANGE = 20 # Amount of time to rate limit
+# Basic rate limiting: 50 requests per 60 seconds
+RATELIMIT_REQUESTS = 50 # Number of requests to rate limit
+RATELIMIT_TIMERANGE = 60 # Amount of time to rate limit
 SCHEMA_FILE = "rules.schema.json"
 RULES_FILE = "rules.json"
 DOMAINS_FILE = "input.csv"
+OUTPUT_FORMAT = Db.Format.CSV
+OUTPUT_FILE = "output.csv"
 ENCODING = "UTF-8"
 
 DB = Db()
@@ -173,7 +175,8 @@ def main(pagenum: int, pagesize: int) -> int:
             log.exception(ex)
             return -100 # stop processing immediately
 
-    DB.output_results()
+    with open(OUTPUT_FILE, "w", encoding=ENCODING) as output:
+        DB.output_results(output, OUTPUT_FORMAT)
 
     return DB.get_failed_domain_count()
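
Usage sketch (not part of the patch): how the reworked output API is intended to be called, assuming db.py as modified above. The domain/country values and the "results.csv" path are illustrative only.

    from db import Db

    db = Db()
    db.record_country("example.com", "US")  # illustrative data only

    # With no file handle the defaults apply: JSON pretty-printed to stdout
    db.output_results()

    # Explicit file handle plus format, mirroring the new wiring in main.py
    with open("results.csv", "w", encoding="UTF-8") as output:
        db.output_results(output, Db.Format.CSV)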