From 9f6bd00c47a67b3927f5dc94678cb2e8a5ea64a0 Mon Sep 17 00:00:00 2001 From: Graham Herceg Date: Mon, 5 Feb 2024 11:19:11 -0500 Subject: [PATCH] Add throttle option to load_domain_data command Only used by CouchDataLoader for now. In practice, couch can be easily overloaded by the high load encountered when saving many different documents. The throttle value of 0.25 seconds has worked with projects that have encountered this issue. If in the future this value is insufficient, it might be worth allowing the calling user to specify a throttle value rather than setting a flag. --- corehq/apps/dump_reload/couch/load.py | 3 ++- corehq/apps/dump_reload/interface.py | 3 ++- .../apps/dump_reload/management/commands/load_domain_data.py | 4 +++- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/corehq/apps/dump_reload/couch/load.py b/corehq/apps/dump_reload/couch/load.py index 53707218ec85..571dd39ee0ed 100644 --- a/corehq/apps/dump_reload/couch/load.py +++ b/corehq/apps/dump_reload/couch/load.py @@ -62,7 +62,8 @@ def _create_db_for_doc_type(self, doc_type): callback = LoaderCallback(self._success_counter, self.stdout) large_doc_types = [Application._doc_type, LinkedApplication._doc_type, RemoteApp._doc_type] chunksize = 1 if doc_type in large_doc_types else self.chunksize - db = IterDB(couch_db, new_edits=False, callback=callback, chunksize=chunksize) + throttle_secs = 0.25 if self.should_throttle else None + db = IterDB(couch_db, new_edits=False, callback=callback, chunksize=chunksize, throttle_secs=throttle_secs) db.__enter__() return db diff --git a/corehq/apps/dump_reload/interface.py b/corehq/apps/dump_reload/interface.py index fdf7c765bd57..c6bb446f475c 100644 --- a/corehq/apps/dump_reload/interface.py +++ b/corehq/apps/dump_reload/interface.py @@ -37,11 +37,12 @@ def dump(self, output_stream): class DataLoader(metaclass=ABCMeta): - def __init__(self, object_filter=None, stdout=None, stderr=None, chunksize=None): + def __init__(self, object_filter=None, 
stdout=None, stderr=None, chunksize=None, should_throttle=False): self.stdout = stdout or sys.stdout self.stderr = stderr or sys.stderr self.object_filter = re.compile(object_filter, re.IGNORECASE) if object_filter else None self.chunksize = chunksize + self.should_throttle = should_throttle @abstractproperty def slug(self): diff --git a/corehq/apps/dump_reload/management/commands/load_domain_data.py b/corehq/apps/dump_reload/management/commands/load_domain_data.py index 3da3b07baf5c..db53c4852935 100644 --- a/corehq/apps/dump_reload/management/commands/load_domain_data.py +++ b/corehq/apps/dump_reload/management/commands/load_domain_data.py @@ -70,12 +70,14 @@ def add_arguments(self, parser): parser.add_argument('--json-output', action="store_true", help="Produce JSON output for use in tests") parser.add_argument('--chunksize', type=int, default=100, help="Set custom chunksize in case it runs into large couch documents") + parser.add_argument('--throttle', action="store_true", help="Throttle saves to database") def handle(self, dump_file_path, **options): self.force = options.get('force') self.dry_run = options.get('dry_run') self.use_extracted = options.get('use_extracted') self.chunksize = options.get('chunksize') + self.should_throttle = options.get('throttle') if not os.path.isfile(dump_file_path): raise CommandError("Dump file not found: {}".format(dump_file_path)) @@ -125,7 +127,7 @@ def extract_dump_archive(self, dump_file_path): def _load_data(self, loader_class, extracted_dump_path, object_filter, dump_meta): try: - loader = loader_class(object_filter, self.stdout, self.stderr, self.chunksize) + loader = loader_class(object_filter, self.stdout, self.stderr, self.chunksize, self.should_throttle) return loader.load_from_path(extracted_dump_path, dump_meta, force=self.force, dry_run=self.dry_run) except DataExistsException as e: raise CommandError('Some data already exists. Use --force to load anyway: {}'.format(str(e)))