Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Removing link duplicates #187

Merged
merged 6 commits into from
Apr 19, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 19 additions & 4 deletions tangos/scripts/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,8 +168,11 @@ def flag_duplicates_deprecated(opts):

session = db.core.get_default_session()

print("unmark all:",session.execute("update haloproperties set deprecated=0").rowcount)
print(" mark:",session.execute("update haloproperties set deprecated=1 where id in (SELECT min(id) FROM haloproperties GROUP BY halo_id, name_id HAVING COUNT(*)>1 ORDER BY halo_id, name_id)").rowcount)
print("unmark all properties:", session.execute("update haloproperties set deprecated=0").rowcount)
print("duplicate properties marked:", session.execute("update haloproperties set deprecated=1 where id in (SELECT min(id) FROM haloproperties GROUP BY halo_id, name_id HAVING COUNT(*)>1 ORDER BY halo_id, name_id)").rowcount)

print("unmark all links:", session.execute("update halolink set deprecated=0").rowcount)
print("duplicate links marked:", session.execute("update halolink set deprecated=1 where id in (SELECT min(id) FROM halolink GROUP BY halo_from_id, halo_to_id, relation_id HAVING COUNT(*)>1 ORDER BY halo_from_id, halo_to_id, weight)").rowcount)

session.commit()

Expand Down Expand Up @@ -204,9 +207,21 @@ def remove_duplicates(options):
)
""")).rowcount

count_links = session.execute(dedent("""
DELETE FROM halolink
WHERE id NOT IN (
SELECT * FROM (
SELECT MAX(id)
FROM halolink
GROUP BY halo_from_id, halo_to_id, relation_id
) as t
)
""")).rowcount

if dialect == 'mysql':
session.execute("SET @@SESSION.optimizer_switch = @__optimizations")
print("Deleted %d rows" % count)
print("Deleted %d links" % count_links)
session.commit()


Expand Down Expand Up @@ -522,10 +537,10 @@ def get_argument_parser_and_subparsers():


subparse_deprecate = subparse.add_parser("flag-duplicates",
help="Flag old copies of properties (if they are present)")
help="Flag old copies of properties and duplicate links (if they are present)")
subparse_deprecate.set_defaults(func=flag_duplicates_deprecated)
subparse_deprecate = subparse.add_parser("remove-duplicates",
help="Remove old copies of properties (if they are present)")
help="Remove old copies of properties and duplicate links (if they are present)")
subparse_deprecate.set_defaults(func=remove_duplicates)

subparse_rollback = subparse.add_parser("rollback", help="Remove database updates")
Expand Down
58 changes: 58 additions & 0 deletions tests/test_remove_duplicate.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
import numpy as np

import tangos
from tangos import core, testing
from tangos.cached_writer import create_property
from tangos.core.halo_data import link
from tangos.scripts.manager import remove_duplicates
from tangos.testing.simulation_generator import SimulationGeneratorForTests

Expand All @@ -23,6 +26,33 @@ def setup_module():
session.add(px, session)
session.commit()

# Also create links between halos, including duplicates
halo2 = tangos.get_halo(2)
halo3 = tangos.get_halo(3)
halo9 = tangos.get_halo(9)

# two links from halo 1 to halo 2 with the same weight and name (maximal duplicate)
d_test = tangos.core.get_or_create_dictionary_item(session, "test")
l_obj = link.HaloLink(halo, halo2, d_test, 1.0)
session.add(l_obj)
l_obj = link.HaloLink(halo, halo2, d_test, 1.0)
session.add(l_obj)
# and another time, with the same weight but a different name (not a duplicate)
diff_name = tangos.core.get_or_create_dictionary_item(session, "other_test")
l_obj = link.HaloLink(halo, halo2, diff_name, 1.0)
session.add(l_obj)
# and another time, with same name but different weight
# (this is a non-maximal duplicate, oldest addition gets deleted and we keep the most recent link)
l_obj = link.HaloLink(halo, halo2, d_test, 0.5)
session.add(l_obj)
# and another time, with same weight and name as previous but linking to a different halo (not a duplicate)
l_obj = link.HaloLink(halo, halo3, d_test, 1.0)
session.add(l_obj)

# and now a completely independent link from halo 2 to halo 9
l_obj = link.HaloLink(halo2, halo9, d_test, 1.0)
session.add(l_obj)


def teardown_module():
core.close_db()
Expand All @@ -37,6 +67,16 @@ def test():
halo = tangos.get_halo(ihalo)
assert halo["Mvir"] == ihalo

# We also have five links for halo 1 and one for halo 2
assert tangos.get_halo(1).links.count() == 5
assert tangos.get_halo(2).links.count() == 1
# Only 4 links in halo 1 are maximally unique
quads = [[l.halo_from.id, l.halo_to.id, l.weight, l.relation_id] for l in tangos.get_halo(1).all_links]
assert len(np.unique(quads, axis=0)) == 4
# And 3 links are unique by name, halo from and to
triplets = [[l.halo_from.id, l.halo_to.id, l.relation_id] for l in tangos.get_halo(1).all_links]
assert len(np.unique(triplets, axis=0)) == 3

# Let's cleanup
remove_duplicates(None)

Expand All @@ -46,3 +86,21 @@ def test():
for ihalo in range(2, 10):
halo = tangos.get_halo(ihalo)
assert halo["Mvir"] == ihalo

# Now halo 1 should have two fewer links
assert tangos.get_halo(1).links.count() == 3
# which are all unique according to name, halo from and to
triplets = [[l.halo_from.id, l.halo_to.id, l.relation_id] for l in tangos.get_halo(1).all_links]
assert len(np.unique(triplets, axis=0)) == tangos.get_halo(1).links.count()

# When deleting non-maximal duplicate (link index 1),
# we have kept the latest addition to the database with weight 0.5
test_link = tangos.get_halo(1).all_links[1]
assert test_link.halo_from.id == 1
assert test_link.halo_to.id == 2
assert test_link.weight == 0.5

# And links of halo 2 should not have changed
assert tangos.get_halo(2).links.count() == 1
assert tangos.get_halo(2).all_links[0].halo_from.id == 2
assert tangos.get_halo(2).all_links[0].halo_to.id == 9