Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Asset snapshot cleanup task #4901

Merged
merged 13 commits into from
Apr 23, 2024
11 changes: 0 additions & 11 deletions kobo/tasks.py

This file was deleted.

2 changes: 2 additions & 0 deletions kpi/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,3 +126,5 @@
manually from the django shell.
"""
)

LIMIT_HOURS_23 = 82800
7 changes: 5 additions & 2 deletions kpi/deployment_backends/mixin.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
# coding: utf-8
import celery
from django.utils import timezone

from kpi.constants import ASSET_TYPE_SURVEY
from kpi.exceptions import BadAssetTypeException, DeploymentNotFound
from kpi.models.asset_file import AssetFile
from kpi.tasks import sync_media_files

from .backends import DEPLOYMENT_BACKENDS
from .base_backend import BaseDeploymentBackend

Expand All @@ -19,7 +20,9 @@ def sync_media_files_async(self, always=True):
file_type=AssetFile.FORM_MEDIA, synced_with_backend=False
).exists():
self.save(create_version=False, adjust_content=False)
sync_media_files.delay(self.uid)

# Not using .delay() due to circular import in tasks.py
celery.current_app.send_task('kpi.tasks.sync_media_files', (self.uid,))
jamesrkiger marked this conversation as resolved.
Show resolved Hide resolved

@property
def can_be_deployed(self):
Expand Down
28 changes: 28 additions & 0 deletions kpi/maintenance_tasks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
import constance
jamesrkiger marked this conversation as resolved.
Show resolved Hide resolved
from datetime import timedelta

from django.db.models import Exists, OuterRef, Q
from django.utils import timezone

from kpi.models import AssetSnapshot


def remove_old_asset_snapshots():
days = constance.config.ASSET_SNAPSHOT_DAYS_RETENTION

# We don't want to delete an asset's latest versioned snapshot,
# even if it is older than the retention period
newer_snapshot_for_asset = AssetSnapshot.objects.exclude(
asset_version=None
).filter(asset_id=OuterRef('asset_id'), pk__gt=OuterRef('pk'))

delete_queryset = AssetSnapshot.objects.filter(
date_created__lt=timezone.now() - timedelta(days=days),
).filter(Exists(newer_snapshot_for_asset) | Q(asset_version=None))

while True:
count, _ = delete_queryset.filter(
pk__in=delete_queryset[:1000]
).delete()
if not count:
break
3 changes: 0 additions & 3 deletions kpi/models/asset.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,6 @@
from kpi.utils.asset_content_analyzer import AssetContentAnalyzer
from kpi.utils.object_permission import get_cached_code_names
from kpi.utils.sluggify import sluggify_label
from kpi.tasks import remove_asset_snapshots


class AssetDeploymentStatus(models.TextChoices):
Expand Down Expand Up @@ -1235,8 +1234,6 @@ def snapshot(

if regenerate:
snapshot = False
# Let's do some housekeeping
remove_asset_snapshots.delay(self.id)
else:
snapshot = AssetSnapshot.objects.filter(**snap_params).order_by(
'-date_created'
Expand Down
6 changes: 0 additions & 6 deletions kpi/models/asset_snapshot.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,12 +54,6 @@ class AssetSnapshot(
"""
This model serves as a cache of the XML that was exported by the installed
version of pyxform.

TODO: come up with a policy to clear this cache out.
DO NOT: depend on these snapshots existing for more than a day
until a policy is set.
Done with https://github.com/kobotoolbox/kpi/pull/2434.
Remove above lines when PR is merged
"""
xml = models.TextField()
source = models.JSONField(default=dict)
Expand Down
36 changes: 16 additions & 20 deletions kpi/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,24 +3,29 @@
import requests
from django.conf import settings
from django.contrib.auth.models import User
from django.core.management import call_command
from django.core.mail import send_mail
from django.core.management import call_command

from kobo.apps.markdownx_uploader.tasks import remove_unused_markdown_files
from kobo.celery import celery_app
from kpi.constants import LIMIT_HOURS_23
from kpi.maintenance_tasks import remove_old_asset_snapshots
from kpi.models.asset import Asset
from kpi.models.import_export_task import (
ExportTask,
ImportTask,
ProjectViewExportTask,
)


@celery_app.task
def import_in_background(import_task_uid):
from kpi.models.import_export_task import ImportTask # avoid circular imports

import_task = ImportTask.objects.get(uid=import_task_uid)
import_task.run()


@celery_app.task
def export_in_background(export_task_uid):
from kpi.models.import_export_task import ExportTask # avoid circular imports

export_task = ExportTask.objects.get(uid=export_task_uid)
export_task.run()

Expand All @@ -29,10 +34,6 @@ def export_in_background(export_task_uid):
def project_view_export_in_background(
export_task_uid: str, username: str
) -> None:
from kpi.models.import_export_task import (
ProjectViewExportTask,
) # avoid circular imports

user = User.objects.get(username=username)

export_task = ProjectViewExportTask.objects.get(uid=export_task_uid)
Expand Down Expand Up @@ -72,8 +73,6 @@ def sync_kobocat_xforms(

@celery_app.task
def sync_media_files(asset_uid):
from kpi.models.asset import Asset # avoid circular imports

asset = Asset.objects.get(uid=asset_uid)
asset.deployment.sync_media_files()

Expand All @@ -96,14 +95,11 @@ def enketo_flush_cached_preview(server_url, form_id):
response.raise_for_status()


@celery_app.task
def remove_asset_snapshots(asset_id: int):

@celery_app.task(time_limit=LIMIT_HOURS_23, soft_time_limit=LIMIT_HOURS_23)
def perform_maintenance():
"""
Temporary task to delete old snapshots.
TODO remove when kpi#2434 is merged
Run daily maintenance tasks
"""
call_command(
'delete_assets_snapshots',
days=constance.config.ASSET_SNAPSHOT_DAYS_RETENTION,
asset_id=asset_id,
)
remove_unused_markdown_files()
remove_old_asset_snapshots()
73 changes: 48 additions & 25 deletions kpi/tests/test_asset_snapshots.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,9 @@
from constance.test import override_config
from django.contrib.auth.models import User
from django.test import TestCase
from django.utils.timezone import now
from django.utils import timezone

from kpi.maintenance_tasks import remove_old_asset_snapshots
from kpi.tests.api.v2 import test_api_asset_snapshots
from ..models import Asset
from ..models import AssetSnapshot
Expand Down Expand Up @@ -83,31 +84,53 @@ def test_snapshots_allow_choice_duplicates(self):
class AssetSnapshotHousekeeping(AssetSnapshotsTestCase):

jamesrkiger marked this conversation as resolved.
Show resolved Hide resolved
@override_config(ASSET_SNAPSHOT_DAYS_RETENTION=2)
def test_delete_old_asset_snapshots_on_regenerate(self):
two_days_before = now() - timedelta(days=3) # One more day than Constance setting
yesterday = now() - timedelta(days=1)
def test_delete_old_asset_snapshots_task(self):
# Because of `auto_date_now` , we cannot specify the date with `create()`
older_snapshot = AssetSnapshot.objects.create(asset=self.asset)
older_snapshot.date_created = two_days_before
older_snapshot.date_created = timezone.now() - timedelta(days=5)
older_snapshot.save(update_fields=['date_created'])
old_snapshot = AssetSnapshot.objects.create(asset=self.asset)
old_snapshot.date_created = yesterday
old_snapshot.save(update_fields=['date_created'])
# versioned snapshots are always regenerated
versioned_snapshot = self.asset.snapshot(
regenerate=True,
version_uid=self.asset.latest_deployed_version_uid
)
snapshot_uids = list(AssetSnapshot.objects.filter(
asset=self.asset
).values_list('uid', flat=True))
expected_snapshot_uids = [
versioned_snapshot.uid,
old_snapshot.uid,
self.asset_snapshot.uid,
]
# Older snapshot should be gone
newer_snapshot = AssetSnapshot.objects.create(asset=self.asset)
newer_snapshot.date_created = timezone.now() - timedelta(days=4)
newer_snapshot.save(update_fields=['date_created'])

remove_old_asset_snapshots()

# Both are outside retention date, oldest gets removed
assert AssetSnapshot.objects.filter(pk=newer_snapshot.id).exists()
assert not AssetSnapshot.objects.filter(pk=older_snapshot.id).exists()
# Older snapshot should still exist
assert AssetSnapshot.objects.filter(pk=old_snapshot.id).exists()
assert sorted(expected_snapshot_uids) == sorted(snapshot_uids)

newest_unversioned_snapshot = AssetSnapshot.objects.create(
asset=self.asset, source=self.asset.content
)
newest_unversioned_snapshot.date_created = timezone.now() - timedelta(
days=3
)
newest_unversioned_snapshot.save(update_fields=['date_created'])

remove_old_asset_snapshots()

# Newest still gets deleted because it doesn't have version
assert AssetSnapshot.objects.filter(pk=newer_snapshot.id).exists()
assert not AssetSnapshot.objects.filter(
pk=newest_unversioned_snapshot.id
).exists()

asset_2 = Asset.objects.create(
content=self.asset.content,
owner=self.user,
asset_type='survey',
)
asset_2_older_snapshot = AssetSnapshot.objects.create(asset=asset_2)
asset_2_older_snapshot.date_created = timezone.now() - timedelta(days=1)
asset_2_older_snapshot.save(update_fields=['date_created'])
asset_2_newer_snapshot = AssetSnapshot.objects.create(asset=asset_2)

remove_old_asset_snapshots()

# Both remain because they are within retention date
assert AssetSnapshot.objects.filter(
pk=asset_2_older_snapshot.id
).exists()
assert AssetSnapshot.objects.filter(
pk=asset_2_newer_snapshot.id
).exists()
Loading