From e260440e5968b8c9d36e9a41c87d96de1ff3f596 Mon Sep 17 00:00:00 2001
From: Paul Prescod
Date: Sun, 10 Jul 2022 14:48:00 -0400
Subject: [PATCH 1/3] Serialize just_once, random_referenced objects

---
 .pre-commit-config.yaml              |  2 +-
 snowfakery/data_generator_runtime.py | 58 +++++++++++++++-------------
 snowfakery/object_rows.py            | 14 ++++++-
 tests/deep-random-nesting.yml        | 19 +++++++++
 tests/test_data_generator.py         |  1 +
 tests/test_embedding.py              |  3 +-
 tests/test_references.py             | 10 ++++-
 7 files changed, 76 insertions(+), 31 deletions(-)
 create mode 100644 tests/deep-random-nesting.yml

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 99f57707..34d2d714 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -2,7 +2,7 @@ default_language_version:
   python: python3
 repos:
   - repo: https://github.com/ambv/black
-    rev: 21.4b2
+    rev: 22.6.0
     hooks:
       - id: black
   - repo: https://github.com/pre-commit/pre-commit-hooks

diff --git a/snowfakery/data_generator_runtime.py b/snowfakery/data_generator_runtime.py
index cad83c88..4785f7dd 100644
--- a/snowfakery/data_generator_runtime.py
+++ b/snowfakery/data_generator_runtime.py
@@ -36,7 +36,7 @@

 # save every single object to history. Useful for testing saving of datatypes
-SAVE_EVERYTHING = os.environ.get("SF_SAVE_EVERYTHING")
+SAVE_EVERYTHING = os.environ.get("SF_SAVE_EVERYTHING", False)


 class StoppingCriteria(NamedTuple):
@@ -130,6 +130,7 @@ def __init__(
         # They survive iterations and continuations.
         self.persistent_nicknames = {}
         self.persistent_objects_by_table = {}
+        self.persistent_random_referenceable_objects = []

         self.id_manager = IdManager()
         self.intertable_dependencies = OrderedSet()
@@ -139,16 +140,25 @@ def __init__(
         self.reset_slots()

     def register_object(
-        self, obj: ObjectRow, nickname: Optional[str], persistent_object: bool
+        self,
+        obj: ObjectRow,
+        nickname: Optional[str],
+        persistent_object: bool,
+        random_referenced_object: bool,
     ):
         """Register an object for lookup by object type and (optionally) Nickname"""
         if nickname:
+            # should survive continuations:
+            # somebody will probably `reference:` it
             if persistent_object:
                 self.persistent_nicknames[nickname] = obj
             else:
                 self.transients.nicknamed_objects[nickname] = obj

         if persistent_object:
             self.persistent_objects_by_table[obj._tablename] = obj
+
+        if persistent_object and random_referenced_object:
+            self.persistent_random_referenceable_objects.append((nickname, obj))
+
         self.transients.last_seen_obj_by_table[obj._tablename] = obj

     @property
@@ -214,6 +224,10 @@ def serialize_dict_of_object_rows(dct):
             "today": self.today,
             "nicknames_and_tables": self.nicknames_and_tables,
             "intertable_dependencies": intertable_dependencies,
+            "persistent_random_referenceable_objects": [
+                (nn, obj.__getstate__())
+                for (nn, obj) in self.persistent_random_referenceable_objects
+            ],
         }
         return state

@@ -233,6 +247,10 @@ def deserialize_dict_of_object_rows(dct):
         self.intertable_dependencies = OrderedSet(
             Dependency(*dep) for dep in getattr(state, "intertable_dependencies", [])
         )
+        self.persistent_random_referenceable_objects = [
+            (nickname, hydrate(ObjectRow, v))
+            for (nickname, v) in state["persistent_random_referenceable_objects"]
+        ]
         self.today = state["today"]

         persistent_objects_by_table = state.get("persistent_objects_by_table")
@@ -373,26 +391,8 @@ def resave_objects_from_continuation(
     ):
         """Re-save just_once objects to the local history cache
         after resuming a continuation"""
-        # deal with objs known by their nicknames
-        relevant_objs = [
-            (obj._tablename, nickname, obj)
-            for nickname, obj in globals.persistent_nicknames.items()
-        ]
-        already_saved = set(obj._id for (_, _, obj) in relevant_objs)
-        # and those known by their tablename, if not already in the list
-        relevant_objs.extend(
-            (tablename, None, obj)
-            for tablename, obj in globals.persistent_objects_by_table.items()
-            if obj._id not in already_saved
-        )
-        # filter out those in tables that are not history-backed
-        relevant_objs = (
-            (table, nick, obj)
-            for (table, nick, obj) in relevant_objs
-            if table in tables_to_keep_history_for
-        )
-        for tablename, nickname, obj in relevant_objs:
-            self.row_history.save_row(tablename, nickname, obj._values)
+        for nickname, obj in globals.persistent_random_referenceable_objects:
+            self.row_history.save_row(obj._tablename, nickname, obj._values)

     def execute(self):
         RowHistoryCV.set(self.row_history)
@@ -569,19 +569,25 @@ def remember_row(self, tablename: str, nickname: T.Optional[str], row: dict):
                 self.interpreter.globals.register_intertable_reference(
                     tablename, fieldvalue._tablename, fieldname
                 )
+        if self._should_save(tablename, nickname):
+            self.interpreter.row_history.save_row(tablename, nickname, row)
+
+    def _should_save(self, tablename: str, nickname: T.Optional[str]) -> bool:
         history_tables = self.interpreter.tables_to_keep_history_for
-        should_save: bool = (
+        return (
             (tablename in history_tables)
             or (nickname in history_tables)
             or SAVE_EVERYTHING
         )
-        if should_save:
-            self.interpreter.row_history.save_row(tablename, nickname, row)

     def register_object(self, obj, name: Optional[str], persistent: bool):
         "Keep track of this object in case other objects refer to it."
         self.obj = obj
-        self.interpreter.globals.register_object(obj, name, persistent)
+        should_save = self._should_save(obj._tablename, name)
+        # `persistent` means: is it `just_once` and therefore might be
+        # referred to by `reference` in a future iteration?
+        # `should_save` means: it may be referred to by `random_reference`.
+        self.interpreter.globals.register_object(obj, name, persistent, should_save)

     @contextmanager
     def child_context(self, template):

diff --git a/snowfakery/object_rows.py b/snowfakery/object_rows.py
index 3e836a35..6e4871b2 100644
--- a/snowfakery/object_rows.py
+++ b/snowfakery/object_rows.py
@@ -70,6 +70,9 @@ def __init__(self, tablename: str, id: int):

 class LazyLoadedObjectReference(ObjectReference):
     _data = None
+    yaml_loader = yaml.SafeLoader
+    yaml_dumper = SnowfakeryDumper
+    yaml_tag = "!snowfakery_lazyloadedobjectrow"

     def __init__(
         self,
@@ -85,10 +88,17 @@ def __getattr__(self, attrname):
         if attrname.endswith("__"):  # pragma: no cover
             raise AttributeError(attrname)
         if self._data is None:
-            row_history = RowHistoryCV.get()
-            self._data = row_history.load_row(self.sql_tablename, self.id)
+            self._load_data()
         return self._data[attrname]

+    def _load_data(self):
+        row_history = RowHistoryCV.get()
+        self._data = row_history.load_row(self.sql_tablename, self.id)
+
+    def __reduce_ex__(self, *args, **kwargs):
+        self._load_data()
+        return super().__reduce_ex__(*args, **kwargs)
+

 class SlotState(Enum):
     """The current state of a NicknameSlot.

diff --git a/tests/deep-random-nesting.yml b/tests/deep-random-nesting.yml
new file mode 100644
index 00000000..5b04a81b
--- /dev/null
+++ b/tests/deep-random-nesting.yml
@@ -0,0 +1,19 @@
+### This recipe exercises `random_reference` lookups to `just_once` objects.
+# Look at examples/salesforce/Account.recipe.yml for more recipe examples.
+
+# Run it like this:
+
+#   snowfakery tests/deep-random-nesting.yml
+
+- object: Account
+  count: 3
+  just_once: True
+
+- object: Account
+  just_once: True
+  fields:
+    parent:
+      random_reference: Account

diff --git a/tests/test_data_generator.py b/tests/test_data_generator.py
index eca67fba..1792c1f7 100644
--- a/tests/test_data_generator.py
+++ b/tests/test_data_generator.py
@@ -64,6 +64,7 @@ def test_stopping_criteria_with_startids(self, write_row):
 nicknames_and_tables: {}
 today: 2022-11-03
 persistent_nicknames: {}
+persistent_random_referenceable_objects: []
 """
         generate(
             StringIO(yaml),

diff --git a/tests/test_embedding.py b/tests/test_embedding.py
index e44ef0ea..a10e232c 100644
--- a/tests/test_embedding.py
+++ b/tests/test_embedding.py
@@ -121,7 +121,8 @@ def test_parent_application__streams_instead_of_files(self, generated_rows):
                 Foo: Foo
                 persistent_nicknames: {}
                 persistent_objects_by_table: {}
-                today: 2021-04-07"""
+                today: 2021-04-07
+                persistent_random_referenceable_objects: []"""
         )
         generate_continuation_file = StringIO()
         decls = """[{"sf_object": Opportunity, "api": bulk}]"""

diff --git a/tests/test_references.py b/tests/test_references.py
index 5fc93c5e..6d513913 100644
--- a/tests/test_references.py
+++ b/tests/test_references.py
@@ -610,15 +610,23 @@ class TestRandomReferencesNew:
     def test_random_reference_to_just_once_obj(self, generated_rows):
         yaml = """
         - object: Parent
+          count: 3
           just_once: true
+          fields:
+            name: Poppy
         - object: Child
+          count: 5
           fields:
             parent:
                 random_reference: Parent
+            deep_ref: ${{parent.name}}
         """
         generate(StringIO(yaml), stopping_criteria=StoppingCriteria("Child", 3))
-        assert len(generated_rows.mock_calls) == 4
+        assert len(generated_rows.mock_calls) == 8
+        assert generated_rows.table_values("Child", 1, "deep_ref") == "Poppy"
+        assert generated_rows.table_values("Child", 2, "deep_ref") == "Poppy"
+        assert generated_rows.table_values("Child", 5, "deep_ref") == "Poppy"

     @pytest.mark.parametrize("rand_top", [True, False])
     def test_random_reference_to_just_once_obj_many(self, generated_rows, rand_top):

From 1625c62f4840c21f8040e040ab8e3b461e99218c Mon Sep 17 00:00:00 2001
From: Paul Prescod
Date: Mon, 11 Jul 2022 17:42:30 -0400
Subject: [PATCH 2/3] Add a bit of info about continuations.

---
 docs/arch/ArchIndex.md | 95 +++++++++++++++++++++++-------------------
 1 file changed, 53 insertions(+), 42 deletions(-)

diff --git a/docs/arch/ArchIndex.md b/docs/arch/ArchIndex.md
index 1db684c2..2cd5f317 100644
--- a/docs/arch/ArchIndex.md
+++ b/docs/arch/ArchIndex.md
@@ -8,8 +8,6 @@ The Snowfakery interpreter reads a recipe, translates it into internal data stru
 Obviously, Snowfakery architecture will be easier to understand in the context of the language itself, so understanding the syntax is a good first step.

-
-
 ## Levels of Looping

 Snowfakery recipes are designed to be evaluated over and over again, top to bottom. Each run-through is called
@@ -21,15 +19,15 @@ This is useful for generating chunks of data called _portions_, and then handing

 Here is the overall pattern:

-| CumulusCI | Snowfakery | Data Loader |
-| ------------- |-------------| -------------|
-| Generate Data | Start | Wait |
-| Load Data | Stop | Start |
-| Generate Data | Start | Stop |
-| Load Data | Stop | Start |
-| Generate Data | Start | Stop |
-| Load Data | Finish | Start |
-| Load Data | Finished | Finish |
+| CumulusCI     | Snowfakery | Data Loader |
+| ------------- | ---------- | ----------- |
+| Generate Data | Start      | Wait        |
+| Load Data     | Stop       | Start       |
+| Generate Data | Start      | Stop        |
+| Load Data     | Stop       | Start       |
+| Generate Data | Start      | Stop        |
+| Load Data     | Finish     | Start       |
+| Load Data     | Finished   | Finish      |

 Note that every time you Start and Stop Snowfakery, you generate a whole new Interpreter object, which re-reads the recipe. In some contexts, the new Interpreter object may be in a different process or (theoretically) on a different computer altogether.

@@ -57,9 +55,9 @@ So Snowfakery would run it once snapshot the "continuation state" and then fan t

 When reading Snowfakery code, you must always think about the lifetime of each data structure:

-* Will it survive for a single iteration, like local variables? We call these Transients.
-* Will it survive for a single continuation, like "FakerData" objects? We could call these Interpreter Managed objects.
-* Will it be saved and loaded between continuations, and thus survive across continuations? These are Globals.
+- Will it survive for a single iteration, like local variables? We call these Transients.
+- Will it survive for a single continuation, like "FakerData" objects? We could call these Interpreter Managed objects.
+- Will it be saved and loaded between continuations, and thus survive across continuations? These are Globals.

 ## The Parser

@@ -76,12 +74,12 @@ is executed once per continuation (or just once if the recipe is not continued). The Interpreter mediates access between the recipe (represented by the ParseResult) and resources such as:

-* the Output Stream
-* Global persistent data that survives continuations by being saved to and loaded from YAML
-* Transient persistent data that is discarded and rebuilt (as necessary) after continuation
-* The Row History which is used for allowing randomized access to objects for the `random_reference` feature
-* Plugins and Providers which extend Snowfakery
-* Runtime Object Model objects
+- the Output Stream
+- Global persistent data that survives continuations by being saved to and loaded from YAML
+- Transient persistent data that is discarded and rebuilt (as necessary) after a continuation
+- the Row History, which allows randomized access to objects for the `random_reference` feature
+- Plugins and Providers, which extend Snowfakery
+- Runtime Object Model objects

 On my relatively slow computer it takes 1/25 of a second to initialize an Interpreter from a Recipe once all modules are loaded. It takes about 3/4 of a second to launch an interpreter and load the core required modules.
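The three lifetimes above can be made concrete with a short sketch. This is not Snowfakery's real code — `ToyGlobals` and the `Transients` fields below are hypothetical names — though the real `Globals` in the first patch has the same `reset_slots()`/`transients` shape and the same rule that only persistent data survives serialization:

```python
# A minimal sketch of the three lifetimes, assuming continuation state is
# whatever survives __getstate__/__setstate__. Names are illustrative only.
from dataclasses import dataclass, field


@dataclass
class Transients:
    """Lives for one iteration, like local variables."""

    nicknamed_objects: dict = field(default_factory=dict)
    last_seen_obj_by_table: dict = field(default_factory=dict)


class ToyGlobals:
    """Lives across continuations by round-tripping through YAML."""

    def __init__(self):
        self.persistent_nicknames = {}  # survives continuations
        self.reset_slots()

    def reset_slots(self):
        # called at the top of every iteration: transients are rebuilt
        self.transients = Transients()

    def __getstate__(self):
        # only persistent data reaches the continuation file;
        # transients are deliberately dropped
        return {"persistent_nicknames": dict(self.persistent_nicknames)}

    def __setstate__(self, state):
        self.persistent_nicknames = state["persistent_nicknames"]
        self.reset_slots()
```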
@@ -97,8 +95,7 @@ For example, a VariableDefinition represents this structure:

 ```

-
- An ObjectTemplate represents this one:
+An ObjectTemplate represents this one:

 ```
 - object: XXX
@@ -128,12 +125,12 @@ id_manager:
   Contact: 2
   Opportunity: 5
 intertable_dependencies:
-- field_name: AccountId
-  table_name_from: Contact
-  table_name_to: Account
-- field_name: AccountId
-  table_name_from: Opportunity
-  table_name_to: Account
+  - field_name: AccountId
+    table_name_from: Contact
+    table_name_to: Account
+  - field_name: AccountId
+    table_name_from: Opportunity
+    table_name_to: Account
 nicknames_and_tables:
   Account: Account
   Contact: Contact
@@ -173,16 +170,16 @@ today: 2022-06-06

 This also shows the contents of the Globals object. Things we track:

-* The last used IDs for various Tables, so we don't generate overlapping IDs
-* Inter-table dependencies, so we can generate a CCI mapping file or other output schema that depends on relationships
-* Mapping from nicknames to tablenames, with tables own names being registered as nicknames for convenience
-* Data from specific ("persistent") objects which the user asked to be generated just once and may want to refer to again later
-* The current date to allow the `today` function to be consistent even if a process runs across midnight (perhaps we should revisit this)
+- The last used IDs for various Tables, so we don't generate overlapping IDs
+- Inter-table dependencies, so we can generate a CCI mapping file or other output schema that depends on relationships
+- Mapping from nicknames to tablenames, with tables' own names being registered as nicknames for convenience
+- Data from specific ("persistent") objects which the user asked to be generated just once and may want to refer to again later
+- The current date, to allow the `today` function to be consistent even if a process runs across midnight (perhaps we should revisit this)

 ### Transients

 If data should be discarded on every iteration (analogous to 'local variables' in a programming language), then it should be stored in the Transients object, which is recreated on every iteration. This object is accessible through the Globals but is not saved to YAML.

 ### Row History

 RowHistory is a way of keeping track of the contents of a subset of all of the rows.

 There are a few Recipe patterns enabled by the row history:

 - `random_reference` lookups to nicknames
 - `random_reference` lookups to objects that have data of interest, such as _another_ `random_reference`

 Row History data structures survive for as long as a single process/interpreter/continuation. A new
 continuation gets a new Row History, so it is not possible to use Row History to make links across
 continuation boundaries.
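To make the Row History contract concrete: the first patch calls `save_row(tablename, nickname, row)` while rows are generated, and `load_row(tablename, id)` when a `random_reference` result is dereferenced. Here is a dict-backed toy with the same shape — the real `RowHistory` is a database (the third patch calls it the "RowHistory DB"), so this stand-in is illustrative only:

```python
# A dict-backed stand-in for RowHistory, matching the save_row()/load_row()
# signatures used in the patch; illustrative, not the real implementation.
import typing as T


class ToyRowHistory:
    def __init__(self):
        self._rows: T.Dict[T.Tuple[str, int], dict] = {}

    def save_row(self, tablename: str, nickname: T.Optional[str], row: dict):
        # called from remember_row() during generation, and replayed by
        # resave_objects_from_continuation() after a continuation resumes
        self._rows[(tablename, row["id"])] = dict(row)

    def load_row(self, tablename: str, row_id: int) -> dict:
        # called lazily, e.g. by LazyLoadedObjectReference._load_data()
        return self._rows[(tablename, row_id)]


# usage sketch (field names hypothetical)
history = ToyRowHistory()
history.save_row("Account", None, {"id": 1, "name": "Acme"})
assert history.load_row("Account", 1)["name"] == "Acme"
```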
@@ -215,11 +211,10 @@ Here is the kind of recipe that might blow up memory:
     fields:
       ref:
         random_reference: target
-      name:
-        ${{ref.bloat}}
+      name: ${{ref.bloat}}
 ```

 The second object picks from one of 100M unique strings
 which are each approx 80M in size.
 That's a lot of data and would quickly blow up memory.

 All Fake Data is mediated through the [FakeData](https://github.com/SFDO-Tooling/Snowfakery/search?q=%22class+FakeData%22) class. Snowfakery extends and customizes the set of fake data providers through its [FakeNames](https://github.com/SFDO-Tooling/Snowfakery/search?q=%22class+FakeNames%22) class. For example, Snowfakery's email address provider incorporates the first name and last name of the imaginary person into the email. Snowfakery renames `postcode` to `postalcode` to match Salesforce conventions. Snowfakery adds timezones to date-time fakers.

-## Formulas 
+## Formulas

 Snowfakery `${{formulas}}` are Jinja Templates controlled by a class called the [`JinjaTemplateEvaluatorFactory`](https://github.com/SFDO-Tooling/Snowfakery/search?q=%22class+JinjaTemplateEvaluatorFactory%22). The `Interpreter` object keeps a reference to this class.

+## Continuations
+
+Recall that there are multiple [Levels of Looping](#levels-of-looping). Data which
+survives beyond continuation (process) boundaries lives in continuation files.
+You can see how that works here:
+
+```sh
+$ snowfakery foo.yml --generate-continuation-file /tmp/continue.yml && snowfakery foo.yml --continuation-file /tmp/continue.yml
+
+$ cat /tmp/continue.yml
+```
+
+The contents of `/tmp/continue.yml` are specific to a version of Snowfakery and subject
+to change over time.
+In general, it saves the contents of `just_once` objects and recently created
+objects.

From b671d0f53aa96f4176d2bb4f5c5cd4d6ba622876 Mon Sep 17 00:00:00 2001
From: Paul Prescod
Date: Tue, 12 Jul 2022 07:22:10 -0400
Subject: [PATCH 3/3] Add clarifying comments

---
 snowfakery/data_generator_runtime.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/snowfakery/data_generator_runtime.py b/snowfakery/data_generator_runtime.py
index 4785f7dd..0f58bb3b 100644
--- a/snowfakery/data_generator_runtime.py
+++ b/snowfakery/data_generator_runtime.py
@@ -126,10 +126,16 @@ def __init__(
         today: date = None,
         name_slots: Mapping[str, str] = None,
     ):
-        # these lists start empty and are filled.
-        # They survive iterations and continuations.
+        # all of these properties start empty and are filled.
+        # They all survive iterations and continuations.
+
+        # These two are indexed by name.
         self.persistent_nicknames = {}
         self.persistent_objects_by_table = {}
+
+        # Not indexed, because it is used only to refresh the RowHistory DB
+        # after a continuation.
+        # Otherwise the data is never read or written.
         self.persistent_random_referenceable_objects = []

         self.id_manager = IdManager()
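Taken together, the three patches amount to the following round trip, condensed into pseudo-Python using the names from the diffs above (a paraphrase of the changed code paths, not the actual implementation):

```python
# Condensed paraphrase of the continuation round trip the patches implement.

def save_continuation_state(globals_):
    # from Globals.__getstate__: just_once objects that may be
    # random_reference'd are serialized exactly once
    return [
        (nickname, obj.__getstate__())
        for (nickname, obj) in globals_.persistent_random_referenceable_objects
    ]


def resume_from_continuation(state, globals_, row_history, hydrate, ObjectRow):
    # from deserialization plus resave_objects_from_continuation: the saved
    # rows are replayed into the RowHistory DB so that random_reference can
    # find them in the new process
    globals_.persistent_random_referenceable_objects = [
        (nickname, hydrate(ObjectRow, v)) for (nickname, v) in state
    ]
    for nickname, obj in globals_.persistent_random_referenceable_objects:
        row_history.save_row(obj._tablename, nickname, obj._values)
```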