diff --git a/tangos/scripts/manager.py b/tangos/scripts/manager.py index 3c920d5e..d67b6332 100755 --- a/tangos/scripts/manager.py +++ b/tangos/scripts/manager.py @@ -168,8 +168,11 @@ def flag_duplicates_deprecated(opts): session = db.core.get_default_session() - print("unmark all:",session.execute("update haloproperties set deprecated=0").rowcount) - print(" mark:",session.execute("update haloproperties set deprecated=1 where id in (SELECT min(id) FROM haloproperties GROUP BY halo_id, name_id HAVING COUNT(*)>1 ORDER BY halo_id, name_id)").rowcount) + print("unmark all properties:", session.execute("update haloproperties set deprecated=0").rowcount) + print("duplicate properties marked:", session.execute("update haloproperties set deprecated=1 where id in (SELECT min(id) FROM haloproperties GROUP BY halo_id, name_id HAVING COUNT(*)>1 ORDER BY halo_id, name_id)").rowcount) + + print("unmark all links:", session.execute("update halolink set deprecated=0").rowcount) + print("duplicate links marked:", session.execute("update halolink set deprecated=1 where id in (SELECT min(id) FROM halolink GROUP BY halo_from_id, halo_to_id, relation_id HAVING COUNT(*)>1 ORDER BY halo_from_id, halo_to_id, weight)").rowcount) session.commit() @@ -204,9 +207,21 @@ def remove_duplicates(options): ) """)).rowcount + count_links = session.execute(dedent(""" + DELETE FROM halolink + WHERE id NOT IN ( + SELECT * FROM ( + SELECT MAX(id) + FROM halolink + GROUP BY halo_from_id, halo_to_id, relation_id + ) as t + ) + """)).rowcount + if dialect == 'mysql': session.execute("SET @@SESSION.optimizer_switch = @__optimizations") print("Deleted %d rows" % count) + print("Deleted %d links" % count_links) session.commit() @@ -522,10 +537,10 @@ def get_argument_parser_and_subparsers(): subparse_deprecate = subparse.add_parser("flag-duplicates", - help="Flag old copies of properties (if they are present)") + help="Flag old copies of properties and duplicate links (if they are present)") 
subparse_deprecate.set_defaults(func=flag_duplicates_deprecated) subparse_deprecate = subparse.add_parser("remove-duplicates", - help="Remove old copies of properties (if they are present)") + help="Remove old copies of properties and duplicate links (if they are present)") subparse_deprecate.set_defaults(func=remove_duplicates) subparse_rollback = subparse.add_parser("rollback", help="Remove database updates") diff --git a/tests/test_remove_duplicate.py b/tests/test_remove_duplicate.py index 55d5cd68..d30118d5 100644 --- a/tests/test_remove_duplicate.py +++ b/tests/test_remove_duplicate.py @@ -1,6 +1,9 @@ +import numpy as np + import tangos from tangos import core, testing from tangos.cached_writer import create_property +from tangos.core.halo_data import link from tangos.scripts.manager import remove_duplicates from tangos.testing.simulation_generator import SimulationGeneratorForTests @@ -23,6 +26,33 @@ def setup_module(): session.add(px, session) session.commit() + # Also create links between halos, including duplicates + halo2 = tangos.get_halo(2) + halo3 = tangos.get_halo(3) + halo9 = tangos.get_halo(9) + + # two links from halo 1 to halo 2 with the same weight and name (maximal duplicate) + d_test = tangos.core.get_or_create_dictionary_item(session, "test") + l_obj = link.HaloLink(halo, halo2, d_test, 1.0) + session.add(l_obj) + l_obj = link.HaloLink(halo, halo2, d_test, 1.0) + session.add(l_obj) + # and another time with the same weight but a different name (not a duplicate) + diff_name = tangos.core.get_or_create_dictionary_item(session, "other_test") + l_obj = link.HaloLink(halo, halo2, diff_name, 1.0) + session.add(l_obj) + # and another time, with the same name but a different weight + # (this is a non-maximal duplicate, the oldest addition gets deleted and we keep the most recent link) + l_obj = link.HaloLink(halo, halo2, d_test, 0.5) + session.add(l_obj) + # and another time, with same weight and name as previous but linking to a different halo (not a 
duplicate) + l_obj = link.HaloLink(halo, halo3, d_test, 1.0) + session.add(l_obj) + + # and now a completely independent link from halo 2 to halo 9 + l_obj = link.HaloLink(halo2, halo9, d_test, 1.0) + session.add(l_obj) + def teardown_module(): core.close_db() @@ -37,6 +67,16 @@ def test(): halo = tangos.get_halo(ihalo) assert halo["Mvir"] == ihalo + # We also have five links for halo 1 and one for halo 2 + assert tangos.get_halo(1).links.count() == 5 + assert tangos.get_halo(2).links.count() == 1 + # Only 4 links in halo 1 are maximally unique + quads = [[l.halo_from.id, l.halo_to.id, l.weight, l.relation_id] for l in tangos.get_halo(1).all_links] + assert len(np.unique(quads, axis=0)) == 4 + # And 3 links are unique by name, halo from and to + triplets = [[l.halo_from.id, l.halo_to.id, l.relation_id] for l in tangos.get_halo(1).all_links] + assert len(np.unique(triplets, axis=0)) == 3 + # Let's cleanup remove_duplicates(None) @@ -46,3 +86,21 @@ def test(): for ihalo in range(2, 10): halo = tangos.get_halo(ihalo) assert halo["Mvir"] == ihalo + + # Now halo 1 should have two fewer links + assert tangos.get_halo(1).links.count() == 3 + # which are all unique according to name, halo from and to + triplets = [[l.halo_from.id, l.halo_to.id, l.relation_id] for l in tangos.get_halo(1).all_links] + assert len(np.unique(triplets, axis=0)) == tangos.get_halo(1).links.count() + + # When deleting the non-maximal duplicate (link index 1), + # we have kept the latest addition to the database with weight 0.5 + test_link = tangos.get_halo(1).all_links[1] + assert test_link.halo_from.id == 1 + assert test_link.halo_to.id == 2 + assert test_link.weight == 0.5 + + # And links of halo 2 should not have changed + assert tangos.get_halo(2).links.count() == 1 + assert tangos.get_halo(2).all_links[0].halo_from.id == 2 + assert tangos.get_halo(2).all_links[0].halo_to.id == 9