Skip to content

Commit

Permalink
Don't add archived posts to feeds
Browse files Browse the repository at this point in the history
An archived post is one whose creation date is more than 24 hours
before the actual publish date shown in clients. Such posts are
usually imported from another social media site such as 𝕏/Twitter
or Mastodon.

SkyFeed (used by many user-created feeds, including @\aither.bsky.social's
LoveLive! feed) filters them out anyway, and their presence adds a
bit more work when removing false positives based on their content.

To make sure false positives (usually posts made directly on Bluesky)
aren't left out, especially after the server has been disconnected from
the firehose for a long time, we compare the post creation date against
the commit timestamp given by the firehose instead of the current time.

Inspired by MarshalX/bluesky-feed-generator#21

Also move the porn label check to its own function and add comments
explaining what each key-value pair in record create dict represents.
  • Loading branch information
p1timmy committed Dec 30, 2024
1 parent 0cb48e3 commit 04f5ac2
Show file tree
Hide file tree
Showing 2 changed files with 48 additions and 5 deletions.
42 changes: 37 additions & 5 deletions server/data_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
_ADULT_LABELS = ("porn", "nudity", "sexual")
_BSKY_MOD_SERVICE = "did:plc:ar7c4by46qjdydhdevvrndac"
_MAX_COMMIT_LAG = timedelta(seconds=0.25)
_ARCHIVED_THRESHOLD = timedelta(days=1)

logger = logging.getLogger(__name__)

Expand All @@ -24,6 +25,32 @@
_all_feeds[row.uri] = row


def post_has_pr0n_label(post: dict) -> bool:
    """
    Check if a post was published with a porn (explicit adult content) label added by
    the author

    :param post: Created-post dict whose ``"record"`` entry holds the post record
    :return: ``True`` only if the record's label values contain ``_PR0N_LABEL``.
        The result is always a real ``bool``, never ``None`` or a labels object
    """
    record: models.AppBskyFeedPost.Record = post["record"]
    labels = record.labels

    # Return False explicitly: a bare ``and`` chain would hand back whatever falsy
    # operand stopped it (e.g. ``None`` for a post without labels), which breaks
    # the declared ``bool`` return type
    if not labels or labels.values is None:
        return False

    return _PR0N_LABEL in labels.values


def is_archived_post(post: dict) -> bool:
    """
    Check if a post is an archived one, meaning post creation date is over 24 hours
    older than its publish date, as indicated by the official Bluesky app
    (See
    https://github.com/bluesky-social/social-app/blob/6471e809aa28f0319bde4aa1f362679e3723d298/src/view/com/post-thread/PostThreadItem.tsx#L779)

    The creation date comes from the record itself and is compared against
    ``post["time"]`` rather than the current time. A timestamp that carries no UTC
    offset is assumed to be in UTC, so that a record with a naive creation date
    can't raise ``TypeError`` when subtracted from an offset-aware publish date.

    :param post: Created-post dict with a ``"record"`` entry (exposing an ISO 8601
        ``created_at`` string) and a ``"time"`` entry (a ``datetime``)
    :return: ``True`` if the gap between publish and creation date exceeds
        ``_ARCHIVED_THRESHOLD``
    :raises ValueError: If ``created_at`` can't be parsed by
        ``datetime.fromisoformat``
    """
    # Imported locally because only ``datetime``/``timedelta`` are known to be in
    # scope at module level
    from datetime import timezone

    created_at = datetime.fromisoformat(post["record"].created_at)
    published_at: datetime = post["time"]

    # Naive and aware datetimes can't be subtracted from each other; treat any
    # value lacking an offset as UTC
    if created_at.utcoffset() is None:
        created_at = created_at.replace(tzinfo=timezone.utc)
    if published_at.utcoffset() is None:
        published_at = published_at.replace(tzinfo=timezone.utc)

    return published_at - created_at > _ARCHIVED_THRESHOLD


def operations_callback(ops: defaultdict):
# Here we can filter, process, run ML classification, etc.
# After our feed alg we can save posts into our DB
Expand All @@ -37,11 +64,16 @@ def operations_callback(ops: defaultdict):
# Hide post if it has adult content (porn) label
pr0n_post_count = 0
has_pr0n_label = False
if (
record.labels
and record.labels.values is not None
and _PR0N_LABEL in record.labels.values
):
if post_record_has_pr0n_label(record):
has_pr0n_label = True

# Skip archived posts (mostly those imported from 𝕏/Twitter or similar)
if is_archived_post(created_post):
continue

# Hide post if it has adult content (porn) label
has_pr0n_label = False
if post_has_pr0n_label(created_post):
has_pr0n_label = True

feeds: list[Feed] = []
Expand Down
11 changes: 11 additions & 0 deletions server/data_stream.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,10 +60,21 @@ def _get_commit_ops_by_type(
):
operation_by_type[record_nsid]["created"].append(
{
# Object containing all the details of the record
"record": record,
# ATProto URI of the record
# (hint: for posts use https://hopper.at/ to get its
# regular link)
"uri": str(uri),
# Unique identifier of a record, used by `cursor` parameter
# in feed URLs
"cid": str(op.cid),
# DID of account that made the record
"author": commit.repo,
# Date/time of when the record was made.
# In the case of posts, it's the real publish date aka the
# post date/time as shown in clients
"time": datetime.fromisoformat(commit.time),
}
)
break
Expand Down

0 comments on commit 04f5ac2

Please sign in to comment.