Skip to content

Commit

Permalink
Merge pull request #187 from Martin-Rey/new-attempt-duplicates
Browse files Browse the repository at this point in the history
Removing link duplicates
  • Loading branch information
apontzen authored Apr 19, 2022
2 parents f887de1 + 6fef128 commit 32a0b8a
Show file tree
Hide file tree
Showing 2 changed files with 77 additions and 4 deletions.
23 changes: 19 additions & 4 deletions tangos/scripts/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,8 +168,11 @@ def flag_duplicates_deprecated(opts):

session = db.core.get_default_session()

print("unmark all:",session.execute("update haloproperties set deprecated=0").rowcount)
print(" mark:",session.execute("update haloproperties set deprecated=1 where id in (SELECT min(id) FROM haloproperties GROUP BY halo_id, name_id HAVING COUNT(*)>1 ORDER BY halo_id, name_id)").rowcount)
print("unmark all properties:", session.execute("update haloproperties set deprecated=0").rowcount)
print("duplicate properties marked:", session.execute("update haloproperties set deprecated=1 where id in (SELECT min(id) FROM haloproperties GROUP BY halo_id, name_id HAVING COUNT(*)>1 ORDER BY halo_id, name_id)").rowcount)

print("unmark all links:", session.execute("update halolink set deprecated=0").rowcount)
print("duplicate links marked:", session.execute("update halolink set deprecated=1 where id in (SELECT min(id) FROM halolink GROUP BY halo_from_id, halo_to_id, relation_id HAVING COUNT(*)>1 ORDER BY halo_from_id, halo_to_id, weight)").rowcount)

session.commit()

Expand Down Expand Up @@ -204,9 +207,21 @@ def remove_duplicates(options):
)
""")).rowcount

count_links = session.execute(dedent("""
DELETE FROM halolink
WHERE id NOT IN (
SELECT * FROM (
SELECT MAX(id)
FROM halolink
GROUP BY halo_from_id, halo_to_id, relation_id
) as t
)
""")).rowcount

if dialect == 'mysql':
session.execute("SET @@SESSION.optimizer_switch = @__optimizations")
print("Deleted %d rows" % count)
print("Deleted %d links" % count_links)
session.commit()


Expand Down Expand Up @@ -522,10 +537,10 @@ def get_argument_parser_and_subparsers():


subparse_deprecate = subparse.add_parser("flag-duplicates",
help="Flag old copies of properties (if they are present)")
help="Flag old copies of properties and duplicate links (if they are present)")
subparse_deprecate.set_defaults(func=flag_duplicates_deprecated)
subparse_deprecate = subparse.add_parser("remove-duplicates",
help="Remove old copies of properties (if they are present)")
help="Remove old copies of properties and duplicate links (if they are present)")
subparse_deprecate.set_defaults(func=remove_duplicates)

subparse_rollback = subparse.add_parser("rollback", help="Remove database updates")
Expand Down
58 changes: 58 additions & 0 deletions tests/test_remove_duplicate.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
import numpy as np

import tangos
from tangos import core, testing
from tangos.cached_writer import create_property
from tangos.core.halo_data import link
from tangos.scripts.manager import remove_duplicates
from tangos.testing.simulation_generator import SimulationGeneratorForTests

Expand All @@ -23,6 +26,33 @@ def setup_module():
session.add(px, session)
session.commit()

# Also create links between halos, including duplicates
halo2 = tangos.get_halo(2)
halo3 = tangos.get_halo(3)
halo9 = tangos.get_halo(9)

# two links between halo 1 and halo 2 with the same weight and name (maximal duplicate)
d_test = tangos.core.get_or_create_dictionary_item(session, "test")
l_obj = link.HaloLink(halo, halo2, d_test, 1.0)
session.add(l_obj)
l_obj = link.HaloLink(halo, halo2, d_test, 1.0)
session.add(l_obj)
# and another time, with the same weight but a different name (not a duplicate)
diff_name = tangos.core.get_or_create_dictionary_item(session, "other_test")
l_obj = link.HaloLink(halo, halo2, diff_name, 1.0)
session.add(l_obj)
# and another time, with same name but different weight
# (this is a non-maximal duplicate, oldest addition gets deleted and we keep the most recent link)
l_obj = link.HaloLink(halo, halo2, d_test, 0.5)
session.add(l_obj)
# and another time, with same weight and name as previous but linking to a different halo (not a duplicate)
l_obj = link.HaloLink(halo, halo3, d_test, 1.0)
session.add(l_obj)

# and now a completely independent link from halo 2 to halo 9
l_obj = link.HaloLink(halo2, halo9, d_test, 1.0)
session.add(l_obj)


def teardown_module():
core.close_db()
Expand All @@ -37,6 +67,16 @@ def test():
halo = tangos.get_halo(ihalo)
assert halo["Mvir"] == ihalo

# We also have five links for halo 1 and one for halo 2
assert tangos.get_halo(1).links.count() == 5
assert tangos.get_halo(2).links.count() == 1
# Only 4 links in halo 1 are maximally unique
quads = [[l.halo_from.id, l.halo_to.id, l.weight, l.relation_id] for l in tangos.get_halo(1).all_links]
assert len(np.unique(quads, axis=0)) == 4
# And 3 links are unique by name, halo from and to
triplets = [[l.halo_from.id, l.halo_to.id, l.relation_id] for l in tangos.get_halo(1).all_links]
assert len(np.unique(triplets, axis=0)) == 3

# Let's cleanup
remove_duplicates(None)

Expand All @@ -46,3 +86,21 @@ def test():
for ihalo in range(2, 10):
halo = tangos.get_halo(ihalo)
assert halo["Mvir"] == ihalo

# Now halo 1 should have two fewer links
assert tangos.get_halo(1).links.count() == 3
# which are all unique according to name, halo from and to
triplets = [[l.halo_from.id, l.halo_to.id, l.relation_id] for l in tangos.get_halo(1).all_links]
assert len(np.unique(triplets, axis=0)) == tangos.get_halo(1).links.count()

# When deleting non-maximal duplicate (link index 1),
# we have kept the latest addition to the database with weight 0.5
test_link = tangos.get_halo(1).all_links[1]
assert test_link.halo_from.id == 1
assert test_link.halo_to.id == 2
assert test_link.weight == 0.5

# And links of halo 2 should not have changed
assert tangos.get_halo(2).links.count() == 1
assert tangos.get_halo(2).all_links[0].halo_from.id == 2
assert tangos.get_halo(2).all_links[0].halo_to.id == 9

0 comments on commit 32a0b8a

Please sign in to comment.