diff --git a/sparsimony/pruners/unstructured.py b/sparsimony/pruners/unstructured.py
index 4ba40b7..342340f 100644
--- a/sparsimony/pruners/unstructured.py
+++ b/sparsimony/pruners/unstructured.py
@@ -5,6 +5,7 @@
 from sparsimony.pruners.base import BasePruner, BaseGrower
 
+_EPS = 0.001
 
 class UnstructuredRandomPruner(BasePruner):
     """Pruning method that randomly prunes tensor."""
@@ -29,7 +30,7 @@ def calculate_mask(
         """
         n_drop = int(mask.sum() * prune_ratio)
         scores = torch.where(
-            mask == 1, torch.abs(torch.rand_like(mask)), torch.zeros_like(mask)
+            mask == 1, torch.abs(torch.rand_like(mask) + _EPS), torch.zeros_like(mask)
         )
         if dist.is_initialized():
             dist.all_reduce(scores, dist.ReduceOp.AVG, async_op=False)
@@ -75,7 +76,7 @@ def calculate_mask(
         n_grow = cls.get_n_grow(sparsity, mask)
         scores = torch.where(
             mask == 0,
-            torch.abs(torch.rand_like(mask) + 0.1),  # small eps for avoiding 0s
+            torch.abs(torch.rand_like(mask) + _EPS),  # small eps for avoiding 0s
             torch.zeros_like(mask),
         )
         if dist.is_initialized():
@@ -97,12 +98,12 @@ def calculate_mask(
     ) -> torch.Tensor:
         if grads is None:
             # Randomly grow
-            grads = torch.rand_like(mask)
+            grads = torch.rand_like(mask) + _EPS
         n_grow = cls.get_n_grow(sparsity, mask)
 
         # Set scores of active params to 0
         scores = torch.where(
-            mask == 0, torch.abs(grads), torch.full_like(grads, -1)
+            mask == 0, torch.abs(grads) + _EPS, torch.full_like(grads, -1)
         )
         if dist.is_initialized():
             dist.all_reduce(scores, dist.ReduceOp.AVG, async_op=False)
diff --git a/tests/sparsimony/test_dst_mixin.py b/tests/sparsimony/test_dst_mixin.py
index 6d56732..9b7dfd6 100644
--- a/tests/sparsimony/test_dst_mixin.py
+++ b/tests/sparsimony/test_dst_mixin.py
@@ -13,12 +13,14 @@
         (5, 5),  # 5x5 mask and initial sparsity of 20%
         # (32, 3, 3),  # 32x3x3 mask and initial sparsity of 90%  # TODO: Conv
         (768, 3072),  # 768x3072 mask and initial sparsity of 99%
+        # (768, 670091),
     ],
     ids=[
         "10x10",
         "5x5",
         # "32x3x3",
         "768x3072",
+        # "768x670091",
     ],
 )
 def model(request):
@@ -38,7 +40,7 @@ def id_fn(sparsity):
 
 
 @pytest.mark.parametrize(
-    "sparsity", [0.0, 0.1, 0.5, 0.75, 0.9, 0.99], ids=id_fn
+    "sparsity", [0.0, 0.1, 0.5, 0.75, 0.83, 0.9, 0.99], ids=id_fn
 )
 def test_zero_inactive_param_momentum_buffers_sgd(model, sparsity):
     # Create a mock Linear layer and optimizer
@@ -76,7 +78,7 @@ def test_zero_inactive_param_momentum_buffers_sgd(model, sparsity):
 
 
 @pytest.mark.parametrize(
-    "sparsity", [0.0, 0.1, 0.5, 0.75, 0.9, 0.99], ids=id_fn
+    "sparsity", [0.0, 0.1, 0.5, 0.75, 0.83, 0.9, 0.99], ids=id_fn
 )
 def test_zero_inactive_param_momentum_buffers_adamw(model, sparsity):
     optimizer = optim.AdamW(model.parameters(), lr=0.1)
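
Note on the _EPS change (commentary, not part of the patch): torch.rand_like samples from [0, 1), so a candidate's random score can land on exactly 0.0 and become indistinguishable from the sentinel zero scores assigned to the other side of the mask; the previous hard-coded 0.1 guarded against this in the grower only. The sketch below is a minimal standalone illustration, assuming the same score construction as the grower above, showing the tie and how the offset breaks it:

import torch

_EPS = 0.001

mask = torch.tensor([0.0, 1.0, 0.0, 1.0])  # 1 = active weight, 0 = grow candidate
rand = torch.zeros_like(mask)  # contrived worst case: every draw is exactly 0.0

# Without the offset, candidate scores tie with the active sentinels at 0,
# so topk may "grow" positions that are already active.
scores_no_eps = torch.where(mask == 0, torch.abs(rand), torch.zeros_like(mask))
# With the offset, every candidate is strictly positive and always outranks
# the sentinel zeros.
scores_eps = torch.where(mask == 0, torch.abs(rand + _EPS), torch.zeros_like(mask))

print(torch.topk(scores_no_eps, k=2).indices)  # arbitrary among the 4-way tie
print(torch.topk(scores_eps, k=2).indices)     # only candidate positions 0 and 2

Any strictly positive shift suffices to break the tie; the patch simply standardizes on one shared small constant in place of the earlier one-off 0.1.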