From 661c8dd5ab3c61973c883f59f357ce898f3079f5 Mon Sep 17 00:00:00 2001 From: Martin-Rey Date: Thu, 14 Apr 2022 11:41:53 +0100 Subject: [PATCH 1/6] Test that links gets succesfully deleted Rebasing against massive master changes --- tests/test_remove_duplicate.py | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/tests/test_remove_duplicate.py b/tests/test_remove_duplicate.py index 55d5cd68..6097be5f 100644 --- a/tests/test_remove_duplicate.py +++ b/tests/test_remove_duplicate.py @@ -3,7 +3,7 @@ from tangos.cached_writer import create_property from tangos.scripts.manager import remove_duplicates from tangos.testing.simulation_generator import SimulationGeneratorForTests - +from tangos.core.halo_data import link def setup_module(): testing.init_blank_db_for_testing() @@ -23,6 +23,29 @@ def setup_module(): session.add(px, session) session.commit() + # Also create links between halos, including duplicates + halo2 = tangos.get_halo(2) + halo3 = tangos.get_halo(3) + halo9 = tangos.get_halo(9) + + # twice halo 1 to halo 2 + d_test = tangos.core.get_or_create_dictionary_item(session, "test") + l_obj = link.HaloLink(halo, halo2, d_test, 1.0) + session.add(l_obj) + d_test = tangos.core.get_or_create_dictionary_item(session, "test") + l_obj = link.HaloLink(halo, halo2, d_test, 1.0) + session.add(l_obj) + + # once halo 1 to halo 3 + d_test = tangos.core.get_or_create_dictionary_item(session, "test") + l_obj = link.HaloLink(halo, halo3, d_test, 1.0) + session.add(l_obj) + + # once halo 2 to halo 9 + d_test = tangos.core.get_or_create_dictionary_item(session, "test") + l_obj = link.HaloLink(halo2, halo9, d_test, 1.0) + session.add(l_obj) + def teardown_module(): core.close_db() @@ -37,6 +60,10 @@ def test(): halo = tangos.get_halo(ihalo) assert halo["Mvir"] == ihalo + # And three links for halo 1 and one for halo 2 + assert tangos.get_halo(1).links.count() == 3 + assert tangos.get_halo(2).links.count() == 1 + # Let's cleanup remove_duplicates(None) @@ -46,3 +73,7 @@ def test(): for ihalo in range(2, 10): halo = tangos.get_halo(ihalo) assert halo["Mvir"] == ihalo + + # And two links for halo 1, pointing to different halos, still one for halo 2 + assert tangos.get_halo(1).links.count() == 2 + assert tangos.get_halo(2).links.count() == 1 \ No newline at end of file From 4c0c73c54c3bc8b1e62c4de912a793bdf48dd1ed Mon Sep 17 00:00:00 2001 From: Martin-Rey Date: Thu, 14 Apr 2022 12:09:14 +0100 Subject: [PATCH 2/6] Adapating SQl query to also flag duplicate links, and refine testing to cover more cases --- tangos/scripts/manager.py | 23 ++++++++++++++++++---- tests/test_remove_duplicate.py | 35 +++++++++++++++++++++++----------- 2 files changed, 43 insertions(+), 15 deletions(-) diff --git a/tangos/scripts/manager.py b/tangos/scripts/manager.py index 3c920d5e..48c9a16b 100755 --- a/tangos/scripts/manager.py +++ b/tangos/scripts/manager.py @@ -168,8 +168,11 @@ def flag_duplicates_deprecated(opts): session = db.core.get_default_session() - print("unmark all:",session.execute("update haloproperties set deprecated=0").rowcount) - print(" mark:",session.execute("update haloproperties set deprecated=1 where id in (SELECT min(id) FROM haloproperties GROUP BY halo_id, name_id HAVING COUNT(*)>1 ORDER BY halo_id, name_id)").rowcount) + print("unmark all properties:", session.execute("update haloproperties set deprecated=0").rowcount) + print("duplicate properties marked:", session.execute("update haloproperties set deprecated=1 where id in (SELECT min(id) FROM haloproperties GROUP BY halo_id, name_id HAVING COUNT(*)>1 ORDER BY halo_id, name_id)").rowcount) + + print("unmark all links:", session.execute("update halolink set deprecated=0").rowcount) + print("duplicate links marked:", session.execute("update halolink set deprecated=1 where id in (SELECT min(id) FROM halolink GROUP BY halo_from_id, halo_to_id, weight HAVING COUNT(*)>1 ORDER BY halo_from_id, halo_to_id, weight)").rowcount) session.commit() @@ -204,9 +207,21 @@ def remove_duplicates(options): ) """)).rowcount + count_links = session.execute(dedent(""" + DELETE FROM halolink + WHERE id NOT IN ( + SELECT * FROM ( + SELECT MAX(id) + FROM halolink + GROUP BY halo_from_id, halo_to_id, weight + ) as t + ) + """)).rowcount + if dialect == 'mysql': session.execute("SET @@SESSION.optimizer_switch = @__optimizations") print("Deleted %d rows" % count) + print("Deleted %d links" % count_links) session.commit() @@ -522,10 +537,10 @@ def get_argument_parser_and_subparsers(): subparse_deprecate = subparse.add_parser("flag-duplicates", - help="Flag old copies of properties (if they are present)") + help="Flag old copies of properties and duplicate links (if they are present)") subparse_deprecate.set_defaults(func=flag_duplicates_deprecated) subparse_deprecate = subparse.add_parser("remove-duplicates", - help="Remove old copies of properties (if they are present)") + help="Remove old copies of properties and duplicate links (if they are present)") subparse_deprecate.set_defaults(func=remove_duplicates) subparse_rollback = subparse.add_parser("rollback", help="Remove database updates") diff --git a/tests/test_remove_duplicate.py b/tests/test_remove_duplicate.py index 6097be5f..c18939d7 100644 --- a/tests/test_remove_duplicate.py +++ b/tests/test_remove_duplicate.py @@ -1,3 +1,8 @@ +import numpy as np +from tangos import testing +from tangos.testing.simulation_generator import SimulationGeneratorForTests +from tangos import core +from tangos.cached_writer import create_property import tangos from tangos import core, testing from tangos.cached_writer import create_property @@ -28,21 +33,20 @@ def setup_module(): halo3 = tangos.get_halo(3) halo9 = tangos.get_halo(9) - # twice halo 1 to halo 2 + # duplicate link between halo 1 to halo 2 with the same weight d_test = tangos.core.get_or_create_dictionary_item(session, "test") l_obj = link.HaloLink(halo, halo2, d_test, 1.0) session.add(l_obj) - d_test = tangos.core.get_or_create_dictionary_item(session, "test") l_obj = link.HaloLink(halo, halo2, d_test, 1.0) session.add(l_obj) - - # once halo 1 to halo 3 - d_test = tangos.core.get_or_create_dictionary_item(session, "test") + # and another time, but with different weight + l_obj = link.HaloLink(halo, halo2, d_test, 0.5) + session.add(l_obj) + # once more for halo 1 but to a different halo l_obj = link.HaloLink(halo, halo3, d_test, 1.0) session.add(l_obj) - # once halo 2 to halo 9 - d_test = tangos.core.get_or_create_dictionary_item(session, "test") + # and now a completely independent link between halo 2 to halo 9 l_obj = link.HaloLink(halo2, halo9, d_test, 1.0) session.add(l_obj) @@ -61,8 +65,11 @@ def test(): assert halo["Mvir"] == ihalo # And three links for halo 1 and one for halo 2 - assert tangos.get_halo(1).links.count() == 3 + assert tangos.get_halo(1).links.count() == 4 assert tangos.get_halo(2).links.count() == 1 + # but only 3 links in halo 1 are unique + triplets = [[l.halo_from.id, l.halo_to.id, l.weight] for l in tangos.get_halo(1).all_links] + assert len(np.unique(triplets, axis=0)) == 3 # Let's cleanup remove_duplicates(None) @@ -74,6 +81,12 @@ def test(): halo = tangos.get_halo(ihalo) assert halo["Mvir"] == ihalo - # And two links for halo 1, pointing to different halos, still one for halo 2 - assert tangos.get_halo(1).links.count() == 2 - assert tangos.get_halo(2).links.count() == 1 \ No newline at end of file + # Now halo 1 should have one less link, which are all unique + assert tangos.get_halo(1).links.count() == 3 + triplets = [[l.halo_from.id, l.halo_to.id, l.weight] for l in tangos.get_halo(1).all_links] + assert len(np.unique(triplets, axis=0)) == 3 + + # halo 2 should not have changed + assert tangos.get_halo(2).links.count() == 1 + assert tangos.get_halo(2).all_links[0].halo_from.id == 2 + assert tangos.get_halo(2).all_links[0].halo_to.id == 9 From 12bf877822156adcf4cc1051770e58ba4c3b2cd3 Mon Sep 17 00:00:00 2001 From: Martin-Rey Date: Mon, 18 Apr 2022 11:31:23 +0100 Subject: [PATCH 3/6] Also group duplicate link by relation id and add additional test --- tangos/scripts/manager.py | 2 +- tests/test_remove_duplicate.py | 31 ++++++++++++++++--------------- 2 files changed, 17 insertions(+), 16 deletions(-) diff --git a/tangos/scripts/manager.py b/tangos/scripts/manager.py index 48c9a16b..aa86e771 100755 --- a/tangos/scripts/manager.py +++ b/tangos/scripts/manager.py @@ -213,7 +213,7 @@ def remove_duplicates(options): SELECT * FROM ( SELECT MAX(id) FROM halolink - GROUP BY halo_from_id, halo_to_id, weight + GROUP BY halo_from_id, halo_to_id, weight, relation_id ) as t ) """)).rowcount diff --git a/tests/test_remove_duplicate.py b/tests/test_remove_duplicate.py index c18939d7..3eaed623 100644 --- a/tests/test_remove_duplicate.py +++ b/tests/test_remove_duplicate.py @@ -1,14 +1,11 @@ -import numpy as np -from tangos import testing -from tangos.testing.simulation_generator import SimulationGeneratorForTests -from tangos import core -from tangos.cached_writer import create_property import tangos from tangos import core, testing from tangos.cached_writer import create_property from tangos.scripts.manager import remove_duplicates from tangos.testing.simulation_generator import SimulationGeneratorForTests from tangos.core.halo_data import link +import numpy as np + def setup_module(): testing.init_blank_db_for_testing() @@ -33,16 +30,20 @@ def setup_module(): halo3 = tangos.get_halo(3) halo9 = tangos.get_halo(9) - # duplicate link between halo 1 to halo 2 with the same weight + # two links between halo 1 to halo 2 with the same weight and name (maximal duplicate) d_test = tangos.core.get_or_create_dictionary_item(session, "test") l_obj = link.HaloLink(halo, halo2, d_test, 1.0) session.add(l_obj) l_obj = link.HaloLink(halo, halo2, d_test, 1.0) session.add(l_obj) - # and another time, but with different weight + # and another time but with same weight but different name (not a duplicate) + diff_name = tangos.core.get_or_create_dictionary_item(session, "other_test") + l_obj = link.HaloLink(halo, halo2, diff_name, 1.0) + session.add(l_obj) + # and another time, with same name but different weight (conceptual duplicate but non-maximal) l_obj = link.HaloLink(halo, halo2, d_test, 0.5) session.add(l_obj) - # once more for halo 1 but to a different halo + # and another time, with same weight and name as previous but linking to a different halo (not a duplicate) l_obj = link.HaloLink(halo, halo3, d_test, 1.0) session.add(l_obj) @@ -65,11 +66,11 @@ def test(): assert halo["Mvir"] == ihalo # And three links for halo 1 and one for halo 2 - assert tangos.get_halo(1).links.count() == 4 + assert tangos.get_halo(1).links.count() == 5 assert tangos.get_halo(2).links.count() == 1 - # but only 3 links in halo 1 are unique - triplets = [[l.halo_from.id, l.halo_to.id, l.weight] for l in tangos.get_halo(1).all_links] - assert len(np.unique(triplets, axis=0)) == 3 + # but only 4 links in halo 1 are maximally unique + quads = [[l.halo_from.id, l.halo_to.id, l.weight, l.relation_id] for l in tangos.get_halo(1).all_links] + assert len(np.unique(quads, axis=0)) == 4 # Let's cleanup remove_duplicates(None) @@ -82,9 +83,9 @@ def test(): assert halo["Mvir"] == ihalo # Now halo 1 should have one less link, which are all unique - assert tangos.get_halo(1).links.count() == 3 - triplets = [[l.halo_from.id, l.halo_to.id, l.weight] for l in tangos.get_halo(1).all_links] - assert len(np.unique(triplets, axis=0)) == 3 + assert tangos.get_halo(1).links.count() == 4 + quads = [[l.halo_from.id, l.halo_to.id, l.weight, l.relation_id] for l in tangos.get_halo(1).all_links] + assert len(np.unique(quads, axis=0)) == 4 # halo 2 should not have changed assert tangos.get_halo(2).links.count() == 1 From 1405411821fcb8cb1afa94e6dc644710c9c6ce31 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 19 Apr 2022 08:35:47 +0000 Subject: [PATCH 4/6] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/test_remove_duplicate.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/test_remove_duplicate.py b/tests/test_remove_duplicate.py index 3eaed623..0dd5f4e1 100644 --- a/tests/test_remove_duplicate.py +++ b/tests/test_remove_duplicate.py @@ -1,10 +1,11 @@ +import numpy as np + import tangos from tangos import core, testing from tangos.cached_writer import create_property +from tangos.core.halo_data import link from tangos.scripts.manager import remove_duplicates from tangos.testing.simulation_generator import SimulationGeneratorForTests -from tangos.core.halo_data import link -import numpy as np def setup_module(): From 6fba173b96b51130c4944037c1f406c354b8ccc5 Mon Sep 17 00:00:00 2001 From: Martin-Rey Date: Tue, 19 Apr 2022 08:56:39 +0100 Subject: [PATCH 5/6] Modified comment in test --- tests/test_remove_duplicate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_remove_duplicate.py b/tests/test_remove_duplicate.py index 0dd5f4e1..7f526b17 100644 --- a/tests/test_remove_duplicate.py +++ b/tests/test_remove_duplicate.py @@ -41,7 +41,7 @@ def setup_module(): diff_name = tangos.core.get_or_create_dictionary_item(session, "other_test") l_obj = link.HaloLink(halo, halo2, diff_name, 1.0) session.add(l_obj) - # and another time, with same name but different weight (conceptual duplicate but non-maximal) + # and another time, with same name but different weight (conceptual duplicate but non-maximal, does not get deleted) l_obj = link.HaloLink(halo, halo2, d_test, 0.5) session.add(l_obj) # and another time, with same weight and name as previous but linking to a different halo (not a duplicate) From 6fef1285e1ff6b6b06a8cb263f48bde9c6a06a95 Mon Sep 17 00:00:00 2001 From: Martin-Rey Date: Tue, 19 Apr 2022 12:30:06 +0100 Subject: [PATCH 6/6] Delete non-maximal duplicate by keeping the most recent addition and implement test for it --- tangos/scripts/manager.py | 4 ++-- tests/test_remove_duplicate.py | 28 ++++++++++++++++++++-------- 2 files changed, 22 insertions(+), 10 deletions(-) diff --git a/tangos/scripts/manager.py b/tangos/scripts/manager.py index aa86e771..d67b6332 100755 --- a/tangos/scripts/manager.py +++ b/tangos/scripts/manager.py @@ -172,7 +172,7 @@ def flag_duplicates_deprecated(opts): print("duplicate properties marked:", session.execute("update haloproperties set deprecated=1 where id in (SELECT min(id) FROM haloproperties GROUP BY halo_id, name_id HAVING COUNT(*)>1 ORDER BY halo_id, name_id)").rowcount) print("unmark all links:", session.execute("update halolink set deprecated=0").rowcount) - print("duplicate links marked:", session.execute("update halolink set deprecated=1 where id in (SELECT min(id) FROM halolink GROUP BY halo_from_id, halo_to_id, weight HAVING COUNT(*)>1 ORDER BY halo_from_id, halo_to_id, weight)").rowcount) + print("duplicate links marked:", session.execute("update halolink set deprecated=1 where id in (SELECT min(id) FROM halolink GROUP BY halo_from_id, halo_to_id, relation_id HAVING COUNT(*)>1 ORDER BY halo_from_id, halo_to_id, weight)").rowcount) session.commit() @@ -213,7 +213,7 @@ def remove_duplicates(options): SELECT * FROM ( SELECT MAX(id) FROM halolink - GROUP BY halo_from_id, halo_to_id, weight, relation_id + GROUP BY halo_from_id, halo_to_id, relation_id ) as t ) """)).rowcount diff --git a/tests/test_remove_duplicate.py b/tests/test_remove_duplicate.py index 7f526b17..d30118d5 100644 --- a/tests/test_remove_duplicate.py +++ b/tests/test_remove_duplicate.py @@ -41,7 +41,8 @@ def setup_module(): diff_name = tangos.core.get_or_create_dictionary_item(session, "other_test") l_obj = link.HaloLink(halo, halo2, diff_name, 1.0) session.add(l_obj) - # and another time, with same name but different weight (conceptual duplicate but non-maximal, does not get deleted) + # and another time, with same name but different weight + # (this is a non-maximal duplicate, oldest addition gets deleted and we keep the most recent link) l_obj = link.HaloLink(halo, halo2, d_test, 0.5) session.add(l_obj) # and another time, with same weight and name as previous but linking to a different halo (not a duplicate) @@ -66,12 +67,15 @@ def test(): halo = tangos.get_halo(ihalo) assert halo["Mvir"] == ihalo - # And three links for halo 1 and one for halo 2 + # We also have five links for halo 1 and one for halo 2 assert tangos.get_halo(1).links.count() == 5 assert tangos.get_halo(2).links.count() == 1 - # but only 4 links in halo 1 are maximally unique + # Only 4 links in halo 1 are maximally unique quads = [[l.halo_from.id, l.halo_to.id, l.weight, l.relation_id] for l in tangos.get_halo(1).all_links] assert len(np.unique(quads, axis=0)) == 4 + # And 3 links are unique by name, halo from and to + triplets = [[l.halo_from.id, l.halo_to.id, l.relation_id] for l in tangos.get_halo(1).all_links] + assert len(np.unique(triplets, axis=0)) == 3 # Let's cleanup remove_duplicates(None) @@ -83,12 +87,20 @@ def test(): halo = tangos.get_halo(ihalo) assert halo["Mvir"] == ihalo - # Now halo 1 should have one less link, which are all unique - assert tangos.get_halo(1).links.count() == 4 - quads = [[l.halo_from.id, l.halo_to.id, l.weight, l.relation_id] for l in tangos.get_halo(1).all_links] - assert len(np.unique(quads, axis=0)) == 4 + # Now halo 1 should have two less links + assert tangos.get_halo(1).links.count() == 3 + # which are all unique according to name, halo from and to + triplets = [[l.halo_from.id, l.halo_to.id, l.relation_id] for l in tangos.get_halo(1).all_links] + assert len(np.unique(triplets, axis=0)) == tangos.get_halo(1).links.count() + + # When deleting non-maximal duplicate (link index 1), + # we have kept the latest addition to the database with weight 0.5 + test_link = tangos.get_halo(1).all_links[1] + assert test_link.halo_from.id == 1 + assert test_link.halo_to.id == 2 + assert test_link.weight == 0.5 - # halo 2 should not have changed + # And links of halo 2 should not have changed assert tangos.get_halo(2).links.count() == 1 assert tangos.get_halo(2).all_links[0].halo_from.id == 2 assert tangos.get_halo(2).all_links[0].halo_to.id == 9