From 04f5ac2c9865ba06e4268d7b68e866462ee27d8f Mon Sep 17 00:00:00 2001 From: p1timmy Date: Mon, 30 Dec 2024 09:46:05 -0700 Subject: [PATCH] Don't add archived posts to feeds MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit An archived post is one whose creation date is more than 24 hours before the actual publish date shown in clients. Usually those kinds of posts are imported from another social media site such as 𝕏/Twitter or Mastodon. SkyFeed (used by many user-created feeds including @\aither.bsky.social's LoveLive! feed) filters them out anyway, and their presence adds extra work when removing false positives based on their content. To make sure posts falsely detected as archived (usually those posted directly to Bluesky) aren't left out, especially after being disconnected from the firehose for a long time, we compare the post creation date against the commit timestamp given by the firehose instead of the current time. Inspired by https://github.com/MarshalX/bluesky-feed-generator/pull/21 Also move the porn label check to its own function and add comments explaining what each key-value pair in the record create dict represents. 
--- server/data_filter.py | 42 +++++++++++++++++++++++++++++++++++++----- server/data_stream.py | 11 +++++++++++ 2 files changed, 48 insertions(+), 5 deletions(-) diff --git a/server/data_filter.py b/server/data_filter.py index 7edda27..89666fd 100644 --- a/server/data_filter.py +++ b/server/data_filter.py @@ -15,6 +15,7 @@ _ADULT_LABELS = ("porn", "nudity", "sexual") _BSKY_MOD_SERVICE = "did:plc:ar7c4by46qjdydhdevvrndac" _MAX_COMMIT_LAG = timedelta(seconds=0.25) +_ARCHIVED_THRESHOLD = timedelta(days=1) logger = logging.getLogger(__name__) @@ -24,6 +25,32 @@ _all_feeds[row.uri] = row +def post_has_pr0n_label(post: dict) -> bool: + """ + Check if a post was published with a porn (explicit adult content) label added by + the author + """ + record: models.AppBskyFeedPost.Record = post["record"] + return ( + record.labels + and record.labels.values is not None + and _PR0N_LABEL in record.labels.values + ) + + +def is_archived_post(post: dict) -> bool: + """ + Check if a post is an archived one, meaning post creation date is over 24 hours old + as indicated by the official Bluesky app + + (See + https://github.com/bluesky-social/social-app/blob/6471e809aa28f0319bde4aa1f362679e3723d298/src/view/com/post-thread/PostThreadItem.tsx#L779) + """ + created_at = datetime.fromisoformat(post["record"].created_at) + published_at: datetime = post["time"] + return published_at - created_at > _ARCHIVED_THRESHOLD + + def operations_callback(ops: defaultdict): # Here we can filter, process, run ML classification, etc. 
# After our feed alg we can save posts into our DB @@ -37,11 +64,16 @@ def operations_callback(ops: defaultdict): # Hide post if it has adult content (porn) label pr0n_post_count = 0 has_pr0n_label = False - if ( - record.labels - and record.labels.values is not None - and _PR0N_LABEL in record.labels.values - ): + if post_record_has_pr0n_label(record): + has_pr0n_label = True + + # Skip archived posts (mostly those imported from 𝕏/Twitter or similar) + if is_archived_post(created_post): + continue + + # Hide post if it has adult content (porn) label + has_pr0n_label = False + if post_has_pr0n_label(created_post): has_pr0n_label = True feeds: list[Feed] = [] diff --git a/server/data_stream.py b/server/data_stream.py index 655ec7a..7c457f5 100644 --- a/server/data_stream.py +++ b/server/data_stream.py @@ -60,10 +60,21 @@ def _get_commit_ops_by_type( ): operation_by_type[record_nsid]["created"].append( { + # Object containing all the details of the record "record": record, + # ATProto URI of the record + # (hint: for posts use https://hopper.at/ to get its + # regular link) "uri": str(uri), + # Unique identifier of a record, used by `cursor` parameter + # in feed URLs "cid": str(op.cid), + # DID of account that made the record "author": commit.repo, + # Date/time of when the record was made. + # In the case of posts, it's the real publish date aka the + # post date/time as shown in clients + "time": datetime.fromisoformat(commit.time), } ) break