Skip to content

Commit

Permalink
.distinct: do not support dedup on non-hashable values
Browse files Browse the repository at this point in the history
  • Loading branch information
ebonnal committed Dec 28, 2024
1 parent c4ef670 commit c89941e
Show file tree
Hide file tree
Showing 3 changed files with 11 additions and 30 deletions.
21 changes: 3 additions & 18 deletions streamable/iterators.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,31 +99,16 @@ def __init__(self, iterator: Iterator[T], by: Optional[Callable[[T], Any]]) -> N
validate_iterator(iterator)
self.iterator = iterator
self.by = noop_stopiteration(by) if by else None
self._already_seen_set: Set[Any] = set()
self._already_seen_list: List[Any] = list()
self._already_seen: Set[Any] = set()

def _value(self, elem):
return self.by(elem) if self.by else elem

def _see(self, elem: Any):
value = self._value(elem)
try:
self._already_seen_set.add(value)
except TypeError:
self._already_seen_list.append(value)

def _has_been_seen(self, elem: Any):
value = self._value(elem)
try:
return value in self._already_seen_set
except TypeError:
return value in self._already_seen_list

def __next__(self) -> T:
elem = next(self.iterator)
while self._has_been_seen(elem):
while self._value(elem) in self._already_seen:
elem = next(self.iterator)
self._see(elem)
self._already_seen.add(self._value(elem))
return elem


Expand Down
4 changes: 2 additions & 2 deletions streamable/stream.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,8 +167,8 @@ def distinct(
self, by: Optional[Callable[[T], Any]] = None, consecutive_only: bool = False
) -> "Stream":
"""
Filters the stream to yield only distinct elements, `foo` and `bar` considered duplicates if `foo == bar`.
If `by` is specified, `foo` and `bar` are considered duplicates if `by(foo) == by(bar)`.
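Filters the stream to yield only distinct elements, `foo` and `bar` considered duplicates if `hash(foo) == hash(bar)` and `foo == bar` (elements must be hashable, otherwise a `TypeError` is raised).
If `by` is specified, `foo` and `bar` are considered duplicates if `hash(by(foo)) == hash(by(bar))` and `by(foo) == by(bar)` (the values returned by `by` must be hashable).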
Among duplicates, the first encountered occurrence in upstream order is yielded.
Expand Down
16 changes: 6 additions & 10 deletions tests/test_stream.py
Original file line number Diff line number Diff line change
Expand Up @@ -1253,16 +1253,12 @@ def test_distinct(self) -> None:
[],
msg="`distinct` should yield zero elements on empty stream",
)
self.assertEqual(
list(Stream([[1], [2], [1], [2]]).distinct()),
[[1], [2]],
msg="`distinct` should work with non-hashable elements",
)
self.assertEqual(
list(Stream([[1], "foo", [2], [1], [2], "foo"]).distinct()),
[[1], "foo", [2]],
msg="`distinct` should work with a mix of hashable and non-hashable elements",
)
with self.assertRaisesRegex(
TypeError,
"unhashable type: 'list'",
msg="`distinct` should raise for non-hashable elements",
):
list(Stream([[1]]).distinct())

def test_catch(self) -> None:
self.assertEqual(
Expand Down

0 comments on commit c89941e

Please sign in to comment.