From a93c0c96559dfacc148af12816ae72277ad3614c Mon Sep 17 00:00:00 2001 From: Samson Akol Date: Thu, 13 Jun 2024 15:03:45 +0300 Subject: [PATCH 01/21] add docstring to embed methods --- .../contentcuration/utils/recommendations.py | 36 +++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/contentcuration/contentcuration/utils/recommendations.py b/contentcuration/contentcuration/utils/recommendations.py index 1c015d2050..df5f074632 100644 --- a/contentcuration/contentcuration/utils/recommendations.py +++ b/contentcuration/contentcuration/utils/recommendations.py @@ -89,7 +89,19 @@ def get_recommendations(self, embedding) -> RecommendationsResponse: return self.backend.make_request(request) def embed_topics(self, topics: Dict[str, Any]) -> EmbeddingsResponse: + """ + Embeds the topics and returns an EmbeddingsResponse. + This method connects to the backend, sends the provided topics as a JSON payload, + and requests the backend to embed the topics. If an exception occurs during + this process, it returns an EmbeddingsResponse with the exception as the error. + + Parameters: + topics (Dict[str, Any]): A dictionary of topics to be embedded. + + Returns: + EmbeddingsResponse: An EmbeddingsResponse object containing the embeddings or an error. + """ if not self.backend.connect(): raise errors.ConnectionError("Connection to the backend failed") @@ -100,7 +112,19 @@ def embed_topics(self, topics: Dict[str, Any]) -> EmbeddingsResponse: return EmbeddingsResponse(error=e) def embed_content(self, nodes: List[ContentNode]) -> EmbeddingsResponse: + """ + Embeds the content nodes and returns an EmbeddingsResponse. + + This method connects to the backend, extracts content metadata from the provided nodes, + and sends a request to the backend to embed the content. If an exception occurs during + this process, it returns an EmbeddingsResponse with the exception as the error. + + Parameters: + nodes (List[ContentNode]): A list of ContentNode objects to be embedded. + Returns: + EmbeddingsResponse: An EmbeddingsResponse object containing the embeddings or an error. + """ if not self.backend.connect(): raise errors.ConnectionError("Connection to the backend failed") @@ -116,6 +140,18 @@ def embed_content(self, nodes: List[ContentNode]) -> EmbeddingsResponse: return EmbeddingsResponse(error=e) def extract_content(self, node: ContentNode) -> Dict[str, Any]: + """ + Extracts content metadata from a given ContentNode object. + + This method extracts the content metadata from the provided ContentNode object. + The extracted metadata is returned as a dictionary. + + Parameters: + node (ContentNode): The ContentNode object from which to extract the content metadata. + + Returns: + Dict[str, Any]: A dictionary containing the extracted content metadata. + """ return {} From df864405c33b385b44a50ab9dcf293826af9d5a0 Mon Sep 17 00:00:00 2001 From: Samson Akol Date: Tue, 18 Jun 2024 19:27:30 +0300 Subject: [PATCH 02/21] updates embed_content to use methods as building blocks --- .../tests/utils/test_automation_manager.py | 28 ----- .../tests/utils/test_recommendations.py | 21 ++-- .../utils/automation_manager.py | 28 +++-- .../contentcuration/utils/recommendations.py | 105 +++++++++++++----- 4 files changed, 106 insertions(+), 76 deletions(-) diff --git a/contentcuration/contentcuration/tests/utils/test_automation_manager.py b/contentcuration/contentcuration/tests/utils/test_automation_manager.py index a01eaaa228..5f1833d6cf 100644 --- a/contentcuration/contentcuration/tests/utils/test_automation_manager.py +++ b/contentcuration/contentcuration/tests/utils/test_automation_manager.py @@ -1,5 +1,4 @@ import unittest -from unittest.mock import MagicMock from contentcuration.utils.automation_manager import AutomationManager @@ -11,30 +10,3 @@ def setUp(self): def test_creation(self): # Check if an instance of AutomationManager is created successfully self.assertIsInstance(self.automation_manager, AutomationManager) - - def test_generate_embedding(self): - text = "Some text that needs to be embedded" - # Mock the generate_embedding method of RecommendationsAdapter - # as the implementation is yet to be done - self.automation_manager.recommendations_backend_adapter.generate_embedding = MagicMock(return_value=[0.1, 0.2, 0.3]) - embedding_vector = self.automation_manager.generate_embedding(text) - self.assertIsNotNone(embedding_vector) - - def test_embedding_exists(self): - embedding_vector = [0.1, 0.2, 0.3] - # Currently no solid implementation exists for this - # So the embadding_exists function returns true anyways - exists = self.automation_manager.embedding_exists(embedding_vector) - self.assertTrue(exists) - - def test_load_recommendations(self): - embedding_vector = [0.1, 0.2, 0.3] - self.automation_manager.recommendations_backend_adapter.get_recommendations = MagicMock(return_value=["item1", "item2"]) - recommendations = self.automation_manager.load_recommendations(embedding_vector) - self.assertIsInstance(recommendations, list) - - def test_cache_embeddings(self): - embeddings_list = [[0.1, 0.2, 0.3]] - # Currently the function returns true anyways - success = self.automation_manager.cache_embeddings(embeddings_list) - self.assertTrue(success) diff --git a/contentcuration/contentcuration/tests/utils/test_recommendations.py b/contentcuration/contentcuration/tests/utils/test_recommendations.py index 3ef11e43e5..0bbb9272dd 100644 --- a/contentcuration/contentcuration/tests/utils/test_recommendations.py +++ b/contentcuration/contentcuration/tests/utils/test_recommendations.py @@ -7,6 +7,7 @@ from contentcuration.utils.recommendations import EmbeddingsResponse from contentcuration.utils.recommendations import Recommendations from contentcuration.utils.recommendations import RecommendationsAdapter +from contentcuration.utils.recommendations import RecommendationsResponse class RecommendationsTestCase(TestCase): @@ -41,33 +42,33 @@ def test_adapter_initialization(self): self.assertIsInstance(self.adapter, RecommendationsAdapter) @patch('contentcuration.utils.recommendations.EmbedTopicsRequest') - def test_embed_topics_backend_connect_success(self, embed_topics_request_mock): + def test_get_recommendations_backend_connect_success(self, get_recommendations_request_mock): self.adapter.backend.connect.return_value = True - self.adapter.backend.make_request.return_value = MagicMock(spec=EmbeddingsResponse) - response = self.adapter.embed_topics(self.topic) + self.adapter.backend.make_request.return_value = MagicMock(spec=RecommendationsResponse) + response = self.adapter.get_recommendations(self.topic) self.adapter.backend.connect.assert_called_once() self.adapter.backend.make_request.assert_called_once() - self.assertIsInstance(response, EmbeddingsResponse) + self.assertIsInstance(response, RecommendationsResponse) - def test_embed_topics_backend_connect_failure(self): + def test_get_recommendations_backend_connect_failure(self): self.adapter.backend.connect.return_value = False with self.assertRaises(errors.ConnectionError): - self.adapter.embed_topics(self.topic) + self.adapter.get_recommendations(self.topic) self.adapter.backend.connect.assert_called_once() self.adapter.backend.make_request.assert_not_called() @patch('contentcuration.utils.recommendations.EmbedTopicsRequest') - def test_embed_topics_make_request_exception(self, embed_topics_request_mock): + def test_get_recommendations_make_request_exception(self, get_recommendations_request_mock): self.adapter.backend.connect.return_value = True self.adapter.backend.make_request.side_effect = Exception("Mocked exception") - response = self.adapter.embed_topics(self.topic) + response = self.adapter.get_recommendations(self.topic) self.adapter.backend.connect.assert_called_once() self.adapter.backend.make_request.assert_called_once() - self.assertIsInstance(response, EmbeddingsResponse) + self.assertIsInstance(response, RecommendationsResponse) self.assertEqual(str(response.error), "Mocked exception") @patch('contentcuration.utils.recommendations.EmbedContentRequest') - def test_embed_content_backend_connect_success(self, embed_content_request_mock): + def test_embed_content_backend_connect_success(self, get_recommendations_request_mock): self.adapter.backend.connect.return_value = True self.adapter.backend.make_request.return_value = MagicMock(spec=EmbeddingsResponse) response = self.adapter.embed_content(self.resources) diff --git a/contentcuration/contentcuration/utils/automation_manager.py b/contentcuration/contentcuration/utils/automation_manager.py index c609064fe7..9ccb2ea22a 100644 --- a/contentcuration/contentcuration/utils/automation_manager.py +++ b/contentcuration/contentcuration/utils/automation_manager.py @@ -1,3 +1,6 @@ +from typing import Any +from typing import Dict + from contentcuration.utils.recommendations import RecommendationsAdapter from contentcuration.utils.recommendations import RecommendationsBackendFactory @@ -8,16 +11,15 @@ def __init__(self): self.recommendations_backend_instance = self.recommendations_backend_factory.create_backend() self.recommendations_backend_adapter = RecommendationsAdapter(self.recommendations_backend_instance) - def generate_embedding(self, text): + def generate_embedding(self, topic: Dict[str, Any]): """ - Generate an embedding vector for the given text. + Generate an embedding vector for the given topic. Args: - text (str): The text for which to generate an embedding. + topic (dict): The topic for which to generate an embedding vector. Returns: Vector: The generated embedding vector. """ - embedding_vector = self.recommendations_backend_adapter.generate_embedding(text=text) - return embedding_vector + return self.recommendations_backend_adapter.generate_embedding(topic) def embedding_exists(self, embedding): """ @@ -29,16 +31,18 @@ def embedding_exists(self, embedding): """ return self.recommendations_backend_adapter.embedding_exists(embedding=embedding) - def load_recommendations(self, embedding): + def load_recommendations(self, topic: Dict[str, Any], override_threshold=False): """ - Load recommendations based on the given embedding vector. - Args: - embedding (Vector): The embedding vector to use for recommendations. + Load recommendations for the given topic. + + Parameters: + :param topic: A dictionary containing the topic for which to get recommendations. + :param override_threshold: A boolean flag to override the recommendation threshold. + Returns: - list: A list of recommended items. + list: A list of recommended resources. """ - # Need to extract the recommendation list from the ResponseObject and change the return statement - self.recommendations_backend_adapter.get_recommendations(embedding=embedding) + self.recommendations_backend_adapter.get_recommendations(topic=topic, override_threshold=override_threshold) return [] def cache_embeddings(self, embeddings): diff --git a/contentcuration/contentcuration/utils/recommendations.py b/contentcuration/contentcuration/utils/recommendations.py index df5f074632..ba1df07375 100644 --- a/contentcuration/contentcuration/utils/recommendations.py +++ b/contentcuration/contentcuration/utils/recommendations.py @@ -9,6 +9,7 @@ from automation.utils.appnexus.base import BackendFactory from automation.utils.appnexus.base import BackendRequest from automation.utils.appnexus.base import BackendResponse +from le_utils.constants import content_kinds from contentcuration.models import ContentNode @@ -38,12 +39,12 @@ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) -class EmbedTopicsRequest(RecommendationsBackendRequest): +class EmbedTopicsRequest(EmbeddingsRequest): path = '/embed-topics' method = 'POST' -class EmbedContentRequest(RecommendationsBackendRequest): +class EmbedContentRequest(EmbeddingsRequest): path = '/embed-content' method = 'POST' @@ -61,8 +62,16 @@ def create_backend(self) -> Backend: class RecommendationsAdapter(Adapter): - def generate_embedding(self, text) -> EmbeddingsResponse: - request = EmbeddingsRequest() + def generate_embedding(self, request: EmbeddingsRequest) -> EmbeddingsResponse: + """ + Generate an embedding vector for the given content. + + Parameters: + :param request: The EmbeddingsRequest object containing the request details. + :param content: The topic for which to generate an embedding vector. + Returns: + EmbeddingsResponse: An object containing the embeddings or an error. + """ return self.backend.make_request(request) def embedding_exists(self, embedding) -> bool: @@ -84,32 +93,43 @@ def cache_embeddings(self, embeddings_list) -> bool: return False return True - def get_recommendations(self, embedding) -> RecommendationsResponse: - request = RecommendationsRequest(embedding) - return self.backend.make_request(request) - - def embed_topics(self, topics: Dict[str, Any]) -> EmbeddingsResponse: + def get_recommendations(self, topic: Dict[str, Any], override_threshold=False) -> RecommendationsResponse: """ - Embeds the topics and returns an EmbeddingsResponse. + Get recommendations for the given topic. - This method connects to the backend, sends the provided topics as a JSON payload, - and requests the backend to embed the topics. If an exception occurs during - this process, it returns an EmbeddingsResponse with the exception as the error. + This method connects to the backend, sends a request to get recommendations for a given + topic, and returns a RecommendationsResponse object containing the recommendations + or an error. Parameters: - topics (Dict[str, Any]): A dictionary of topics to be embedded. + :param topic: The topic for which to get recommendations. + :param override_threshold: A boolean flag to override the recommendation threshold. Returns: - EmbeddingsResponse: An EmbeddingsResponse object containing the embeddings or an error. + RecommendationsResponse: An object containing a list of recommendations or an error. """ if not self.backend.connect(): raise errors.ConnectionError("Connection to the backend failed") - try: - embed_topics_request = EmbedTopicsRequest(json=topics) - return self.backend.make_request(embed_topics_request) + # Generate the embedding for the topic + request = EmbedTopicsRequest(json=topic) + embedding = self.generate_embedding(request) + if embedding.error is not None: + return RecommendationsResponse(error=embedding.error) + # Cache the embedding if it does not exist + if not self.embedding_exists(embedding): + self.cache_embeddings([embedding]) + + # Send a request to get recommendations for the topic + request = RecommendationsRequest( + method='GET', + path='/recommendations', + params={'override_threshold': override_threshold}, + json=topic, + ) + return self.backend.make_request(request) except Exception as e: - return EmbeddingsResponse(error=e) + return RecommendationsResponse(error=e) def embed_content(self, nodes: List[ContentNode]) -> EmbeddingsResponse: """ @@ -129,13 +149,20 @@ def embed_content(self, nodes: List[ContentNode]) -> EmbeddingsResponse: raise errors.ConnectionError("Connection to the backend failed") try: - resources = [self.extract_content(node) for node in nodes] - json = { - 'resources': resources, - 'metadata': {} - } - embed_content_request = EmbedContentRequest(json=json) - return self.backend.make_request(embed_content_request) + failed_requests = [] + for node in nodes: + resource = self.extract_content(node) + json = { + 'resources': [resource], + 'metadata': {} + } + request = EmbedContentRequest(json=json) + embedding = self.generate_embedding(request) + if embedding.error is not None: + failed_requests.append(request) + if embedding.error is None and not self.embedding_exists(embedding): + self.cache_embeddings([embedding]) + return EmbeddingsResponse(failed_requests=failed_requests) except Exception as e: return EmbeddingsResponse(error=e) @@ -152,6 +179,32 @@ def extract_content(self, node: ContentNode) -> Dict[str, Any]: Returns: Dict[str, Any]: A dictionary containing the extracted content metadata. """ + contentkind = node.kind + if contentkind.kind == content_kinds.AUDIO: + # handle audio content + pass + elif contentkind.kind == content_kinds.VIDEO: + # handle video content + pass + elif contentkind.kind == content_kinds.EXERCISE: + # handle exercise content + pass + elif contentkind.kind == content_kinds.DOCUMENT: + # handle document content + pass + elif contentkind.kind == content_kinds.HTML5: + # handle html5 content + pass + elif contentkind.kind == content_kinds.H5P: + # handle h5p content + pass + elif contentkind.kind == content_kinds.ZIM: + # handle zim content + pass + else: + # handle topic content or any other kind + pass + return {} From b8fdb18fb4389b26c78cb5e8d7601334f2a9e606 Mon Sep 17 00:00:00 2001 From: Samson Akol Date: Thu, 11 Jul 2024 14:03:49 +0300 Subject: [PATCH 03/21] Implements extract content urls --- contentcuration/automation/models.py | 13 +- .../automation/utils/appnexus/base.py | 7 +- .../utils/automation_manager.py | 24 +- .../contentcuration/utils/recommendations.py | 219 ++++++++---------- 4 files changed, 120 insertions(+), 143 deletions(-) diff --git a/contentcuration/automation/models.py b/contentcuration/automation/models.py index 0b4331b362..ef6adc38c7 100644 --- a/contentcuration/automation/models.py +++ b/contentcuration/automation/models.py @@ -1,3 +1,12 @@ -# from django.db import models +import uuid -# Create your models here. +from django.db import models +from django.db.models import JSONField + + +class RecommendationsCache(models.Model): + id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False) + request = JSONField(default=dict, null=True) + response = JSONField(default=dict, null=True) + priority = models.IntegerField(default=0) + timestamp = models.DateTimeField(auto_now_add=True) diff --git a/contentcuration/automation/utils/appnexus/base.py b/contentcuration/automation/utils/appnexus/base.py index 0998753bd8..f3ea8a9023 100644 --- a/contentcuration/automation/utils/appnexus/base.py +++ b/contentcuration/automation/utils/appnexus/base.py @@ -169,11 +169,10 @@ def connect(self, **kwargs): def make_request(self, request): """ Make a request to the backend service. """ - response = self._make_request(request) try: - info = response.json() - info.update({"status_code": response.status_code}) - return BackendResponse(**info) + response = self._make_request(request) + response_body = dict(data=response.json()) + return BackendResponse(**response_body) except ValueError as e: logging.exception(e) raise errors.InvalidResponse("Invalid response from backend") diff --git a/contentcuration/contentcuration/utils/automation_manager.py b/contentcuration/contentcuration/utils/automation_manager.py index 9ccb2ea22a..82a4344c02 100644 --- a/contentcuration/contentcuration/utils/automation_manager.py +++ b/contentcuration/contentcuration/utils/automation_manager.py @@ -19,17 +19,10 @@ def generate_embedding(self, topic: Dict[str, Any]): Returns: Vector: The generated embedding vector. """ - return self.recommendations_backend_adapter.generate_embedding(topic) + return self.recommendations_backend_adapter.generate_embeddings(topic) - def embedding_exists(self, embedding): - """ - Check if the given embedding vector exists. - Args: - embedding (Vector): The embedding vector to check. - Returns: - bool: True if the embedding exists, False otherwise. - """ - return self.recommendations_backend_adapter.embedding_exists(embedding=embedding) + def response_exists(self, request): + return self.recommendations_backend_adapter.response_exists(request=request) def load_recommendations(self, topic: Dict[str, Any], override_threshold=False): """ @@ -45,12 +38,5 @@ def load_recommendations(self, topic: Dict[str, Any], override_threshold=False): self.recommendations_backend_adapter.get_recommendations(topic=topic, override_threshold=override_threshold) return [] - def cache_embeddings(self, embeddings): - """ - Cache a list of embedding vectors. - Args: - embeddings (list): A list of embedding vectors to cache. - Returns: - bool: True if caching was successful, False otherwise. - """ - return self.recommendations_backend_adapter.cache_embeddings(embeddings) + def cache_embeddings(self, request, response): + return self.recommendations_backend_adapter.cache_embeddings_request(request, response) diff --git a/contentcuration/contentcuration/utils/recommendations.py b/contentcuration/contentcuration/utils/recommendations.py index ba1df07375..5e6f2e36c0 100644 --- a/contentcuration/contentcuration/utils/recommendations.py +++ b/contentcuration/contentcuration/utils/recommendations.py @@ -1,8 +1,10 @@ +import logging from typing import Any from typing import Dict from typing import List from typing import Union +from automation.models import RecommendationsCache from automation.utils.appnexus import errors from automation.utils.appnexus.base import Adapter from automation.utils.appnexus.base import Backend @@ -10,8 +12,11 @@ from automation.utils.appnexus.base import BackendRequest from automation.utils.appnexus.base import BackendResponse from le_utils.constants import content_kinds +from le_utils.constants import format_presets +from contentcuration.models import Channel from contentcuration.models import ContentNode +from contentcuration.models import File class RecommendationsBackendRequest(BackendRequest): @@ -62,109 +67,67 @@ def create_backend(self) -> Backend: class RecommendationsAdapter(Adapter): - def generate_embedding(self, request: EmbeddingsRequest) -> EmbeddingsResponse: - """ - Generate an embedding vector for the given content. - - Parameters: - :param request: The EmbeddingsRequest object containing the request details. - :param content: The topic for which to generate an embedding vector. - Returns: - EmbeddingsResponse: An object containing the embeddings or an error. - """ - return self.backend.make_request(request) - - def embedding_exists(self, embedding) -> bool: - # Need to implement the logic to check if the embeddigns exist - # Return True if the embedding exists, or False otherwise - return True - - def cache_embeddings(self, embeddings_list) -> bool: - for embedding in embeddings_list: - try: - # Attempt to cache the embedding - # Write the caching logic - # A conrner case to look at here is if one of the embedding fails to get cached - # we need to handel it so that only the once that were not succesfull - # are attempted to cache again - pass - except Exception as e: - print(e) - return False - return True - - def get_recommendations(self, topic: Dict[str, Any], override_threshold=False) -> RecommendationsResponse: - """ - Get recommendations for the given topic. - - This method connects to the backend, sends a request to get recommendations for a given - topic, and returns a RecommendationsResponse object containing the recommendations - or an error. + def generate_embeddings(self, request: EmbeddingsRequest, + cache: bool = True) -> EmbeddingsResponse: - Parameters: - :param topic: The topic for which to get recommendations. - :param override_threshold: A boolean flag to override the recommendation threshold. - - Returns: - RecommendationsResponse: An object containing a list of recommendations or an error. - """ if not self.backend.connect(): raise errors.ConnectionError("Connection to the backend failed") + + if cache: + cached_response = self.response_exists(request) + if cached_response: + return cached_response + try: - # Generate the embedding for the topic - request = EmbedTopicsRequest(json=topic) - embedding = self.generate_embedding(request) - if embedding.error is not None: - return RecommendationsResponse(error=embedding.error) - # Cache the embedding if it does not exist - if not self.embedding_exists(embedding): - self.cache_embeddings([embedding]) - - # Send a request to get recommendations for the topic - request = RecommendationsRequest( - method='GET', - path='/recommendations', - params={'override_threshold': override_threshold}, - json=topic, - ) - return self.backend.make_request(request) + response = self.backend.make_request(request) except Exception as e: - return RecommendationsResponse(error=e) - - def embed_content(self, nodes: List[ContentNode]) -> EmbeddingsResponse: - """ - Embeds the content nodes and returns an EmbeddingsResponse. + response = EmbeddingsResponse(error=e) - This method connects to the backend, extracts content metadata from the provided nodes, - and sends a request to the backend to embed the content. If an exception occurs during - this process, it returns an EmbeddingsResponse with the exception as the error. + if not response.error and cache: + self.cache_embeddings_request(request, response) - Parameters: - nodes (List[ContentNode]): A list of ContentNode objects to be embedded. + return response - Returns: - EmbeddingsResponse: An EmbeddingsResponse object containing the embeddings or an error. - """ - if not self.backend.connect(): - raise errors.ConnectionError("Connection to the backend failed") + def response_exists(self, request) -> EmbeddingsResponse: + cache = RecommendationsCache.objects.filter(request=request).first() + if cache: + cached_response = cache.response + else: + cached_response = None + return EmbeddingsResponse(cached_response) + def cache_embeddings_request(self, request: BackendRequest, response: BackendResponse) -> bool: try: - failed_requests = [] - for node in nodes: - resource = self.extract_content(node) - json = { - 'resources': [resource], - 'metadata': {} - } - request = EmbedContentRequest(json=json) - embedding = self.generate_embedding(request) - if embedding.error is not None: - failed_requests.append(request) - if embedding.error is None and not self.embedding_exists(embedding): - self.cache_embeddings([embedding]) - return EmbeddingsResponse(failed_requests=failed_requests) + RecommendationsCache.objects.create(request=request, response=response, priority=2) + return True except Exception as e: - return EmbeddingsResponse(error=e) + logging.exception(e) + return False + + def get_recommendations(self, topic: Dict[str, Any], + override_threshold=False) -> RecommendationsResponse: + # Generate the embedding for the topic + recommendations = [] + request = EmbedTopicsRequest( + params={'override_threshold': override_threshold}, + json=topic, + ) + response = self.generate_embeddings(request=request) + nodes = response.data + if nodes is not None and len(nodes) > 0: + node_ids = [node['contentnode_id'] for node in nodes] + recommendations = ContentNode.objects.filter(id__in=node_ids) + + return RecommendationsResponse(recommendations) + + def embed_content(self, nodes: List[ContentNode]) -> EmbeddingsResponse: + for i in range(0, len(nodes), 20): + batch = nodes[i:i + 20] + content = [self.extract_content(node) for node in batch] + request = EmbedContentRequest(json=content) + self.generate_embeddings(request=request, cache=False) + + return EmbeddingsResponse() def extract_content(self, node: ContentNode) -> Dict[str, Any]: """ @@ -179,33 +142,53 @@ def extract_content(self, node: ContentNode) -> Dict[str, Any]: Returns: Dict[str, Any]: A dictionary containing the extracted content metadata. """ - contentkind = node.kind - if contentkind.kind == content_kinds.AUDIO: - # handle audio content - pass - elif contentkind.kind == content_kinds.VIDEO: - # handle video content - pass - elif contentkind.kind == content_kinds.EXERCISE: - # handle exercise content - pass - elif contentkind.kind == content_kinds.DOCUMENT: - # handle document content - pass - elif contentkind.kind == content_kinds.HTML5: - # handle html5 content - pass - elif contentkind.kind == content_kinds.H5P: - # handle h5p content - pass - elif contentkind.kind == content_kinds.ZIM: - # handle zim content - pass - else: - # handle topic content or any other kind - pass + contentkind_to_presets = { + content_kinds.AUDIO: [format_presets.AUDIO, format_presets.AUDIO_DEPENDENCY], + content_kinds.VIDEO: [ + format_presets.VIDEO_DEPENDENCY, + format_presets.VIDEO_HIGH_RES, + format_presets.VIDEO_LOW_RES, + format_presets.VIDEO_SUBTITLE, + ], + content_kinds.EXERCISE: [format_presets.DOCUMENT, format_presets.EPUB], + content_kinds.DOCUMENT: [format_presets.DOCUMENT, format_presets.EPUB], + content_kinds.HTML5: [format_presets.HTML5_ZIP], + content_kinds.H5P: [format_presets.H5P_ZIP], + content_kinds.ZIM: [format_presets.ZIM], + } - return {} + contentkind = node.kind + presets = contentkind_to_presets.get(contentkind.kind) + files = self.get_content_files(node, presets) if presets else None + + channel = Channel.object.filter(main_tree=node).first() + channel_id = channel.id if channel else None + + return { + "resources": { + "id": node.node_id, + "title": node.title, + "description": node.description, + "text": "", + "language": node.language.lang_code if node.language else None, + "files": files + }, + "metadata": { + "channel_id": channel_id, + }, + } + + def get_content_files(self, node, presets) -> List[Dict[str, Any]]: + node_files = File.objects.filter(contentnode=node, preset__in=presets) + files = [] + for file in node_files: + file_dict = { + 'url': file.source_url, + 'preset': file.preset_id, + 'language': file.language.lang_code if file.language else None + } + files.append(file_dict) + return files class Recommendations(Backend): From 8c10f77a7f3515289cdd6a6866f8d01c20c5a73a Mon Sep 17 00:00:00 2001 From: Samson Akol Date: Thu, 11 Jul 2024 14:34:43 +0300 Subject: [PATCH 04/21] Runs recommendations cache migrations --- .../automation/migrations/0001_initial.py | 26 +++++++++++++++++++ contentcuration/contentcuration/settings.py | 1 + 2 files changed, 27 insertions(+) create mode 100644 contentcuration/automation/migrations/0001_initial.py diff --git a/contentcuration/automation/migrations/0001_initial.py b/contentcuration/automation/migrations/0001_initial.py new file mode 100644 index 0000000000..85e58b7e16 --- /dev/null +++ b/contentcuration/automation/migrations/0001_initial.py @@ -0,0 +1,26 @@ +# Generated by Django 3.2.24 on 2024-07-11 11:33 +import uuid + +from django.db import migrations +from django.db import models + + +class Migration(migrations.Migration): + + initial = True + + dependencies = [ + ] + + operations = [ + migrations.CreateModel( + name='RecommendationsCache', + fields=[ + ('id', models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False)), + ('request', models.JSONField(default=dict, null=True)), + ('response', models.JSONField(default=dict, null=True)), + ('priority', models.IntegerField(default=0)), + ('timestamp', models.DateTimeField(auto_now_add=True)), + ], + ), + ] diff --git a/contentcuration/contentcuration/settings.py b/contentcuration/contentcuration/settings.py index 8d2d9d8832..d162c5c6de 100644 --- a/contentcuration/contentcuration/settings.py +++ b/contentcuration/contentcuration/settings.py @@ -89,6 +89,7 @@ 'django.contrib.postgres', 'django_celery_results', 'kolibri_public', + 'automation', ) SESSION_ENGINE = "django.contrib.sessions.backends.cached_db" From 63de9d92ef7333c4d001e466daf0e233ffb638fd Mon Sep 17 00:00:00 2001 From: Samson Akol Date: Mon, 15 Jul 2024 19:19:47 +0300 Subject: [PATCH 05/21] clean up and tests --- .../migrations/0002_auto_20240711_1938.py | 22 +++ contentcuration/automation/models.py | 2 +- .../automation/tests/appnexus/test_base.py | 121 ++++++------- .../automation/utils/appnexus/base.py | 3 +- .../tests/utils/test_recommendations.py | 52 +++--- .../utils/automation_manager.py | 40 ++--- .../contentcuration/utils/recommendations.py | 160 +++++++++++++----- 7 files changed, 228 insertions(+), 172 deletions(-) create mode 100644 contentcuration/automation/migrations/0002_auto_20240711_1938.py diff --git a/contentcuration/automation/migrations/0002_auto_20240711_1938.py b/contentcuration/automation/migrations/0002_auto_20240711_1938.py new file mode 100644 index 0000000000..b0f813b599 --- /dev/null +++ b/contentcuration/automation/migrations/0002_auto_20240711_1938.py @@ -0,0 +1,22 @@ +# Generated by Django 3.2.24 on 2024-07-11 19:38 +from django.db import migrations +from django.db import models + + +class Migration(migrations.Migration): + + dependencies = [ + ('automation', '0001_initial'), + ] + + operations = [ + migrations.RemoveField( + model_name='recommendationscache', + name='priority', + ), + migrations.AddField( + model_name='recommendationscache', + name='rank', + field=models.FloatField(default=0.0, null=True), + ), + ] diff --git a/contentcuration/automation/models.py b/contentcuration/automation/models.py index ef6adc38c7..c5ade9096d 100644 --- a/contentcuration/automation/models.py +++ b/contentcuration/automation/models.py @@ -8,5 +8,5 @@ class RecommendationsCache(models.Model): id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False) request = JSONField(default=dict, null=True) response = JSONField(default=dict, null=True) - priority = models.IntegerField(default=0) + rank = models.FloatField(default=0.0, null=True) timestamp = models.DateTimeField(auto_now_add=True) diff --git a/contentcuration/automation/tests/appnexus/test_base.py b/contentcuration/automation/tests/appnexus/test_base.py index b06cc213b5..3fa3fa8e74 100644 --- a/contentcuration/automation/tests/appnexus/test_base.py +++ b/contentcuration/automation/tests/appnexus/test_base.py @@ -1,69 +1,26 @@ import time -import pytest -import requests from unittest.mock import patch -from automation.utils.appnexus.base import Adapter +import mock +import pytest +import requests from automation.utils.appnexus.base import Backend from automation.utils.appnexus.base import BackendRequest +from automation.utils.appnexus.base import BackendResponse from automation.utils.appnexus.base import SessionWithMaxConnectionAge from automation.utils.appnexus.errors import ConnectionError +from automation.utils.appnexus.errors import InvalidResponse -class MockBackend(Backend): - base_url = 'https://kolibri-dev.learningequality.org' - connect_endpoint = '/status' - def connect(self) -> None: - return super().connect() - - def make_request(self, request): - return super().make_request(request) - -class ErrorBackend(Backend): - base_url = 'https://bad-url.com' - connect_endpoint = '/status' - def connect(self) -> None: - return super().connect() - - def make_request(self, request): - return super().make_request(request) - - -class MockAdapter(Adapter): - def mockoperation(self): - pass - - -def test_backend_singleton(): - b1, b2 = MockBackend(), MockBackend() - assert id(b1) == id(b2) - - -def test_adapter_creation(): - a = MockAdapter(backend=MockBackend) - assert isinstance(a, Adapter) - - -def test_adapter_backend_default(): - b = MockBackend() - adapter = Adapter(backend=b) - assert isinstance(adapter.backend, Backend) - - -def test_adapter_backend_custom(): - b = MockBackend() - a = Adapter(backend=b) - assert a.backend is b - def test_session_with_max_connection_age_request(): with patch.object(requests.Session, 'request') as mock_request: session = SessionWithMaxConnectionAge() session.request('GET', 'https://example.com') assert mock_request.call_count == 1 + def test_session_with_max_connection_age_not_closing_connections(): - with patch.object(requests.Session, 'close') as mock_close,\ - patch.object(requests.Session, 'request') as mock_request: + with patch.object(requests.Session, 'close') as mock_close, patch.object(requests.Session, 'request') as mock_request: session = SessionWithMaxConnectionAge(60) session.request('GET', 'https://example.com') time.sleep(0.1) @@ -72,9 +29,9 @@ def test_session_with_max_connection_age_not_closing_connections(): assert mock_close.call_count == 0 assert mock_request.call_count == 2 + def test_session_with_max_connection_age_closing_connections(): - with patch.object(requests.Session, 'close') as mock_close,\ - patch.object(requests.Session, 'request') as mock_request: + with patch.object(requests.Session, 'close') as mock_close, patch.object(requests.Session, 'request') as mock_request: session = SessionWithMaxConnectionAge(1) session.request('GET', 'https://example.com') time.sleep(2) @@ -83,33 +40,55 @@ def test_session_with_max_connection_age_closing_connections(): assert mock_close.call_count == 1 assert mock_request.call_count == 2 -def test_backend_connect(): - backend = MockBackend() - connected = backend.connect() - assert connected is True +@mock.patch("automation.utils.appnexus.base.Backend.connect") +def test_backend_connect(mock_connect): + mock_connect.return_value = True + + backend = Backend() + result = backend.connect() + + mock_connect.assert_called_once() + assert result is True -def test_backend_connect_error(): - backend = ErrorBackend() - connected = backend.connect() - assert connected is False +@mock.patch("automation.utils.appnexus.base.Backend.connect") +def test_backend_connect_error(mock_connect): + mock_connect.side_effect = [ConnectionError("Failed to connect"), False] -def test_backend_request(): - request = BackendRequest('GET', '/api/public/info') + backend = Backend() - backend = MockBackend() + with pytest.raises(ConnectionError) as exc_info: + backend.connect() + assert str(exc_info.value) == "Failed to connect" + + result = backend.connect() + assert result is False + + assert mock_connect.call_count == 2 + + +@mock.patch("automation.utils.appnexus.base.Backend.make_request") +def test_backend_request(mock_make_request): + mock_response = BackendResponse(data=[{"key": "value"}]) + mock_make_request.return_value = mock_response + + backend = Backend() + request = BackendRequest(method="GET", path="/api/test") response = backend.make_request(request) - assert response.status_code == 200 - assert len(response.__dict__) > 0 + assert response == mock_response + mock_make_request.assert_called_once_with(request) + -def test_backend_request_error(): - request = BackendRequest('GET', '/api/public/info') +@mock.patch("automation.utils.appnexus.base.Backend.make_request") +def test_backend_request_error(mock_make_request): + mock_make_request.side_effect = InvalidResponse("Request failed") - backend = ErrorBackend() + backend = Backend() + request = BackendRequest(method="GET", path="/api/test") - with pytest.raises(ConnectionError) as error: + with pytest.raises(InvalidResponse) as exc_info: backend.make_request(request) - - assert "Unable to connect to" in str(error.value) + assert str(exc_info.value) == "Request failed" + mock_make_request.assert_called_once_with(request) diff --git a/contentcuration/automation/utils/appnexus/base.py b/contentcuration/automation/utils/appnexus/base.py index f3ea8a9023..626bca1d58 100644 --- a/contentcuration/automation/utils/appnexus/base.py +++ b/contentcuration/automation/utils/appnexus/base.py @@ -56,8 +56,7 @@ def __init__( class BackendResponse(object): """ Class that should be inherited by specific backend for its responses""" - def __init__(self, error=None, **kwargs): - self.error = error + def __init__(self, **kwargs): for key, value in kwargs.items(): setattr(self, key, value) diff --git a/contentcuration/contentcuration/tests/utils/test_recommendations.py b/contentcuration/contentcuration/tests/utils/test_recommendations.py index 0bbb9272dd..e0fce53b76 100644 --- a/contentcuration/contentcuration/tests/utils/test_recommendations.py +++ b/contentcuration/contentcuration/tests/utils/test_recommendations.py @@ -5,6 +5,7 @@ from contentcuration.models import ContentNode from contentcuration.utils.recommendations import EmbeddingsResponse +from contentcuration.utils.recommendations import EmbedTopicsRequest from contentcuration.utils.recommendations import Recommendations from contentcuration.utils.recommendations import RecommendationsAdapter from contentcuration.utils.recommendations import RecommendationsResponse @@ -12,9 +13,9 @@ class RecommendationsTestCase(TestCase): def test_backend_initialization(self): - recomendations = Recommendations() - self.assertIsNotNone(recomendations) - self.assertIsInstance(recomendations, Recommendations) + recommendations = Recommendations() + self.assertIsNotNone(recommendations) + self.assertIsInstance(recommendations, Recommendations) class RecommendationsAdapterTestCase(TestCase): @@ -43,52 +44,45 @@ def test_adapter_initialization(self): @patch('contentcuration.utils.recommendations.EmbedTopicsRequest') def test_get_recommendations_backend_connect_success(self, get_recommendations_request_mock): + mock_response = MagicMock(spec=RecommendationsResponse) + mock_response.data = [] + mock_response.error = None + self.adapter.backend.connect.return_value = True - self.adapter.backend.make_request.return_value = MagicMock(spec=RecommendationsResponse) + self.adapter.backend.make_request.return_value = mock_response response = self.adapter.get_recommendations(self.topic) self.adapter.backend.connect.assert_called_once() self.adapter.backend.make_request.assert_called_once() self.assertIsInstance(response, RecommendationsResponse) - def test_get_recommendations_backend_connect_failure(self): + @patch('contentcuration.utils.recommendations.EmbedTopicsRequest') + def test_get_recommendations_backend_connect_failure(self, embed_topics_request_mock): + mock_request_instance = MagicMock(spec=EmbedTopicsRequest) + embed_topics_request_mock.return_value = mock_request_instance + self.adapter.backend.connect.return_value = False with self.assertRaises(errors.ConnectionError): self.adapter.get_recommendations(self.topic) self.adapter.backend.connect.assert_called_once() self.adapter.backend.make_request.assert_not_called() - @patch('contentcuration.utils.recommendations.EmbedTopicsRequest') - def test_get_recommendations_make_request_exception(self, get_recommendations_request_mock): - self.adapter.backend.connect.return_value = True - self.adapter.backend.make_request.side_effect = Exception("Mocked exception") - response = self.adapter.get_recommendations(self.topic) - self.adapter.backend.connect.assert_called_once() - self.adapter.backend.make_request.assert_called_once() - self.assertIsInstance(response, RecommendationsResponse) - self.assertEqual(str(response.error), "Mocked exception") - @patch('contentcuration.utils.recommendations.EmbedContentRequest') - def test_embed_content_backend_connect_success(self, get_recommendations_request_mock): + def test_embed_content_backend_connect_success(self, embed_content_request_mock): + mock_response = MagicMock(spec=EmbeddingsResponse) + mock_response.error = None + self.adapter.backend.connect.return_value = True - self.adapter.backend.make_request.return_value = MagicMock(spec=EmbeddingsResponse) + self.adapter.backend.make_request.return_value = mock_response response = self.adapter.embed_content(self.resources) self.adapter.backend.connect.assert_called_once() self.adapter.backend.make_request.assert_called_once() - self.assertIsInstance(response, EmbeddingsResponse) + self.assertIsInstance(response, bool) + self.assertTrue(response) - def test_embed_content_backend_connect_failure(self): + @patch('contentcuration.utils.recommendations.EmbedContentRequest') + def test_embed_content_backend_connect_failure(self, embed_content_request_mock): self.adapter.backend.connect.return_value = False with self.assertRaises(errors.ConnectionError): self.adapter.embed_content(self.resources) self.adapter.backend.connect.assert_called_once() self.adapter.backend.make_request.assert_not_called() - - @patch('contentcuration.utils.recommendations.EmbedContentRequest') - def test_embed_content_make_request_exception(self, embed_content_request_mock): - self.adapter.backend.connect.return_value = True - self.adapter.backend.make_request.side_effect = Exception("Mocked exception") - response = self.adapter.embed_content(self.resources) - self.adapter.backend.connect.assert_called_once() - self.adapter.backend.make_request.assert_called_once() - self.assertIsInstance(response, EmbeddingsResponse) - self.assertEqual(str(response.error), "Mocked exception") diff --git a/contentcuration/contentcuration/utils/automation_manager.py b/contentcuration/contentcuration/utils/automation_manager.py index 82a4344c02..e84fc38389 100644 --- a/contentcuration/contentcuration/utils/automation_manager.py +++ b/contentcuration/contentcuration/utils/automation_manager.py @@ -1,42 +1,36 @@ from typing import Any from typing import Dict +from typing import List +from contentcuration.models import ContentNode from contentcuration.utils.recommendations import RecommendationsAdapter from contentcuration.utils.recommendations import RecommendationsBackendFactory class AutomationManager: def __init__(self): - self.recommendations_backend_factory = RecommendationsBackendFactory() - self.recommendations_backend_instance = self.recommendations_backend_factory.create_backend() - self.recommendations_backend_adapter = RecommendationsAdapter(self.recommendations_backend_instance) + self.factory = RecommendationsBackendFactory() + self.instance = self.factory.create_backend() + self.adapter = RecommendationsAdapter(self.instance) - def generate_embedding(self, topic: Dict[str, Any]): + def generate_embeddings(self, nodes: List[ContentNode]): """ - Generate an embedding vector for the given topic. - Args: - topic (dict): The topic for which to generate an embedding vector. - Returns: - Vector: The generated embedding vector. - """ - return self.recommendations_backend_adapter.generate_embeddings(topic) + Generates embeddings for the given list of nodes. This process is async. + + :param nodes: The list of nodes for which to generate embeddings. - def response_exists(self, request): - return self.recommendations_backend_adapter.response_exists(request=request) + :return: A boolean indicating that the process has started. + """ + return self.adapter.embed_content(nodes) def load_recommendations(self, topic: Dict[str, Any], override_threshold=False): """ - Load recommendations for the given topic. + Loads recommendations for the given topic. - Parameters: - :param topic: A dictionary containing the topic for which to get recommendations. - :param override_threshold: A boolean flag to override the recommendation threshold. + :param topic: A dictionary containing the topic for which to get recommendations. + :param override_threshold: A boolean flag to override the recommendation threshold. - Returns: - list: A list of recommended resources. + :return: A list of recommendations for the given topic. """ - self.recommendations_backend_adapter.get_recommendations(topic=topic, override_threshold=override_threshold) + self.adapter.get_recommendations(topic=topic, override_threshold=override_threshold) return [] - - def cache_embeddings(self, request, response): - return self.recommendations_backend_adapter.cache_embeddings_request(request, response) diff --git a/contentcuration/contentcuration/utils/recommendations.py b/contentcuration/contentcuration/utils/recommendations.py index 5e6f2e36c0..d172dcbbb4 100644 --- a/contentcuration/contentcuration/utils/recommendations.py +++ b/contentcuration/contentcuration/utils/recommendations.py @@ -20,28 +20,29 @@ class RecommendationsBackendRequest(BackendRequest): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) + def __init__(self, **kwargs): + super().__init__(**kwargs) class RecommendationsRequest(RecommendationsBackendRequest): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) + def __init__(self, **kwargs): + super().__init__(**kwargs) class EmbeddingsRequest(RecommendationsBackendRequest): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) + def __init__(self, **kwargs): + super().__init__(**kwargs) class RecommendationsBackendResponse(BackendResponse): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) + def __init__(self, **kwargs): + super().__init__(**kwargs) class RecommendationsResponse(RecommendationsBackendResponse): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) + def __init__(self, results: List[Any], **kwargs): + super().__init__(**kwargs) + self.results = results class EmbedTopicsRequest(EmbeddingsRequest): @@ -55,8 +56,8 @@ class EmbedContentRequest(EmbeddingsRequest): class EmbeddingsResponse(RecommendationsBackendResponse): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) + def __init__(self, **kwargs): + super().__init__(**kwargs) class RecommendationsBackendFactory(BackendFactory): @@ -69,18 +70,25 @@ class RecommendationsAdapter(Adapter): def generate_embeddings(self, request: EmbeddingsRequest, cache: bool = True) -> EmbeddingsResponse: + """ + Generates recommendations for the given request. + :param request: The request for which to generate embeddings. + :param cache: Whether to cache the embeddings request. + :return: The response containing the recommendations. + :rtype: EmbeddingsResponse + """ if not self.backend.connect(): raise errors.ConnectionError("Connection to the backend failed") - if cache: - cached_response = self.response_exists(request) - if cached_response: - return cached_response + cached_response = self.response_exists(request) + if cached_response: + return cached_response try: response = self.backend.make_request(request) except Exception as e: + logging.exception(e) response = EmbeddingsResponse(error=e) if not response.error and cache: @@ -88,17 +96,44 @@ def generate_embeddings(self, request: EmbeddingsRequest, return response - def response_exists(self, request) -> EmbeddingsResponse: - cache = RecommendationsCache.objects.filter(request=request).first() - if cache: - cached_response = cache.response - else: - cached_response = None - return EmbeddingsResponse(cached_response) + def response_exists(self, request) -> Union[EmbeddingsResponse, None]: + """ + Checks if a cached response exists for the given request. + + :param request: The request for which to check if a cached response exists. + :return: The cached response if it exists, otherwise None. + :rtype: Union[EmbeddingsResponse, None] + """ + try: + cache = list(RecommendationsCache.objects.filter(request=request).order_by('rank')) + data = [entry.response for entry in cache] + if len(data) > 0: + return EmbeddingsResponse(data=data) + else: + return None + except Exception as e: + logging.exception(e) + return None def cache_embeddings_request(self, request: BackendRequest, response: BackendResponse) -> bool: + """ + Caches the recommendations request and response. + + :param request: The request to cache. + :param response: The response to cache. + :return: A boolean indicating whether the caching was successful. + :rtype: bool + """ try: - RecommendationsCache.objects.create(request=request, response=response, priority=2) + nodes = self._extract_data(response) + cache = [ + RecommendationsCache( + request=request, + response=node, + rank=node['rank'], + ) for node in nodes + ] + RecommendationsCache.objects.bulk_create(cache) return True except Exception as e: logging.exception(e) @@ -106,41 +141,55 @@ def cache_embeddings_request(self, request: BackendRequest, response: BackendRes def get_recommendations(self, topic: Dict[str, Any], override_threshold=False) -> RecommendationsResponse: - # Generate the embedding for the topic + """ + Get recommendations for the given topic. + + :param topic: A dictionary containing the topic for which to get recommendations. See + https://github.com/learningequality/le-utils/blob/main/spec/schema-embed_topics_request.json + :param override_threshold: A boolean flag to override the recommendation threshold. + :return: The recommendations for the given topic. :rtype: RecommendationsResponse + """ recommendations = [] request = EmbedTopicsRequest( params={'override_threshold': override_threshold}, json=topic, ) response = self.generate_embeddings(request=request) - nodes = response.data - if nodes is not None and len(nodes) > 0: + nodes = self._extract_data(response) + if len(nodes) > 0: node_ids = [node['contentnode_id'] for node in nodes] - recommendations = ContentNode.objects.filter(id__in=node_ids) + recommendations = list(ContentNode.objects.filter(id__in=node_ids)) - return RecommendationsResponse(recommendations) + return RecommendationsResponse(results=recommendations) - def embed_content(self, nodes: List[ContentNode]) -> EmbeddingsResponse: + def _extract_data(self, response: BackendResponse) -> List[Dict[str, Any]]: + return response.data if response.data is not None else [] + + def embed_content(self, nodes: List[ContentNode]) -> bool: + """ + Embeds the content for the given nodes. This is an asynchronous process and could take a + while to complete. This process is handled by our curriculum automation service. + See https://github.com/learningequality/curriculum-automation + + :param nodes: The nodes for which to embed the content. + :return: A boolean indicating that content embedding process has started. + :rtype: bool + """ for i in range(0, len(nodes), 20): batch = nodes[i:i + 20] content = [self.extract_content(node) for node in batch] request = EmbedContentRequest(json=content) self.generate_embeddings(request=request, cache=False) - return EmbeddingsResponse() + return True def extract_content(self, node: ContentNode) -> Dict[str, Any]: """ - Extracts content metadata from a given ContentNode object. + Extracts the content from the given node. - This method extracts the content metadata from the provided ContentNode object. - The extracted metadata is returned as a dictionary. - - Parameters: - node (ContentNode): The ContentNode object from which to extract the content metadata. - - Returns: - Dict[str, Any]: A dictionary containing the extracted content metadata. + :param node: The node from which to extract the content. + :return: A dictionary containing the extracted content. + :rtype: Dict[str, Any] """ contentkind_to_presets = { content_kinds.AUDIO: [format_presets.AUDIO, format_presets.AUDIO_DEPENDENCY], @@ -159,10 +208,14 @@ def extract_content(self, node: ContentNode) -> Dict[str, Any]: contentkind = node.kind presets = contentkind_to_presets.get(contentkind.kind) - files = self.get_content_files(node, presets) if presets else None + files = self._get_content_files(node, presets) if presets else None - channel = Channel.object.filter(main_tree=node).first() - channel_id = channel.id if channel else None + try: + channel = Channel.object.filter(main_tree=node).first() + channel_id = channel.id if channel else None + except Exception as e: + logging.exception(e) + channel_id = None return { "resources": { @@ -178,8 +231,23 @@ def extract_content(self, node: ContentNode) -> Dict[str, Any]: }, } - def get_content_files(self, node, presets) -> List[Dict[str, Any]]: - node_files = File.objects.filter(contentnode=node, preset__in=presets) + def _get_content_files(self, node, presets) -> List[Dict[str, Any]]: + """ + Get the content files for the given node and presets. See + https://github.com/learningequality/le-utils/blob/main/spec/schema-embed_topics_request.json + for the file schema. + + :param node: The node for which to get the content files. + :param presets: The presets for which to get the content files. + :return: A list of dictionaries containing the content files. + :rtype: List[Dict[str, Any]] + """ + try: + node_files = File.objects.filter(contentnode=node, preset__in=presets) + except Exception as e: + logging.exception(e) + node_files = [] + files = [] for file in node_files: file_dict = { @@ -193,7 +261,7 @@ def get_content_files(self, node, presets) -> List[Dict[str, Any]]: class Recommendations(Backend): - def connect(self) -> None: + def connect(self) -> bool: return super().connect() def make_request(self, request) -> Union[EmbeddingsResponse, RecommendationsResponse]: From 61e8b62979a56ab22b44cdaae501db012099a773 Mon Sep 17 00:00:00 2001 From: Samson Akol Date: Tue, 16 Jul 2024 18:28:06 +0300 Subject: [PATCH 06/21] Remove extra migrations --- .../automation/migrations/0001_initial.py | 26 ------------------- .../migrations/0002_auto_20240711_1938.py | 22 ---------------- 2 files changed, 48 deletions(-) delete mode 100644 contentcuration/automation/migrations/0001_initial.py delete mode 100644 contentcuration/automation/migrations/0002_auto_20240711_1938.py diff --git a/contentcuration/automation/migrations/0001_initial.py b/contentcuration/automation/migrations/0001_initial.py deleted file mode 100644 index 85e58b7e16..0000000000 --- a/contentcuration/automation/migrations/0001_initial.py +++ /dev/null @@ -1,26 +0,0 @@ -# Generated by Django 3.2.24 on 2024-07-11 11:33 -import uuid - -from django.db import migrations -from django.db import models - - -class Migration(migrations.Migration): - - initial = True - - dependencies = [ - ] - - operations = [ - migrations.CreateModel( - name='RecommendationsCache', - fields=[ - ('id', models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False)), - ('request', models.JSONField(default=dict, null=True)), - ('response', models.JSONField(default=dict, null=True)), - ('priority', models.IntegerField(default=0)), - ('timestamp', models.DateTimeField(auto_now_add=True)), - ], - ), - ] diff --git a/contentcuration/automation/migrations/0002_auto_20240711_1938.py b/contentcuration/automation/migrations/0002_auto_20240711_1938.py deleted file mode 100644 index b0f813b599..0000000000 --- a/contentcuration/automation/migrations/0002_auto_20240711_1938.py +++ /dev/null @@ -1,22 +0,0 @@ -# Generated by Django 3.2.24 on 2024-07-11 19:38 -from django.db import migrations -from django.db import models - - -class Migration(migrations.Migration): - - dependencies = [ - ('automation', '0001_initial'), - ] - - operations = [ - migrations.RemoveField( - model_name='recommendationscache', - name='priority', - ), - migrations.AddField( - model_name='recommendationscache', - name='rank', - field=models.FloatField(default=0.0, null=True), - ), - ] From b920691832399b9ce57c3573b2ac5c7f4881ccea Mon Sep 17 00:00:00 2001 From: Samson Akol Date: Tue, 16 Jul 2024 18:31:55 +0300 Subject: [PATCH 07/21] Adds new migration --- .../automation/migrations/0001_initial.py | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 contentcuration/automation/migrations/0001_initial.py diff --git a/contentcuration/automation/migrations/0001_initial.py b/contentcuration/automation/migrations/0001_initial.py new file mode 100644 index 0000000000..c23cdeeba5 --- /dev/null +++ b/contentcuration/automation/migrations/0001_initial.py @@ -0,0 +1,26 @@ +# Generated by Django 3.2.24 on 2024-07-16 15:30 +import uuid + +from django.db import migrations +from django.db import models + + +class Migration(migrations.Migration): + + initial = True + + dependencies = [ + ] + + operations = [ + migrations.CreateModel( + name='RecommendationsCache', + fields=[ + ('id', models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False)), + ('request', models.JSONField(default=dict, null=True)), + ('response', models.JSONField(default=dict, null=True)), + ('rank', models.FloatField(default=0.0, null=True)), + ('timestamp', models.DateTimeField(auto_now_add=True)), + ], + ), + ] From 8c23f5fdf522469dea92684d4db6a49703ad8dcc Mon Sep 17 00:00:00 2001 From: Samson Akol Date: Wed, 24 Jul 2024 17:42:40 +0300 Subject: [PATCH 08/21] implements feedback --- .../automation/migrations/0001_initial.py | 16 +- contentcuration/automation/models.py | 13 +- .../tests/utils/test_recommendations.py | 50 ++++- .../utils/automation_manager.py | 10 +- .../contentcuration/utils/recommendations.py | 190 ++++++++++++------ 5 files changed, 205 insertions(+), 74 deletions(-) diff --git a/contentcuration/automation/migrations/0001_initial.py b/contentcuration/automation/migrations/0001_initial.py index c23cdeeba5..87ee422398 100644 --- a/contentcuration/automation/migrations/0001_initial.py +++ b/contentcuration/automation/migrations/0001_initial.py @@ -1,26 +1,32 @@ -# Generated by Django 3.2.24 on 2024-07-16 15:30 +# Generated by Django 3.2.24 on 2024-07-24 13:18 import uuid +import django.db.models.deletion from django.db import migrations from django.db import models class Migration(migrations.Migration): - initial = True dependencies = [ + ('kolibri_public', '0003_alter_file_preset'), ] operations = [ migrations.CreateModel( name='RecommendationsCache', fields=[ - ('id', models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False)), - ('request', models.JSONField(default=dict, null=True)), - ('response', models.JSONField(default=dict, null=True)), + ('id', models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, + serialize=False)), + ('request_hash', models.CharField(max_length=32, null=True)), ('rank', models.FloatField(default=0.0, null=True)), + ('override_threshold', models.BooleanField(default=False)), ('timestamp', models.DateTimeField(auto_now_add=True)), + ('response', models.ForeignKey(blank=True, null=True, + on_delete=django.db.models.deletion.SET_NULL, + related_name='recommendations', + to='kolibri_public.contentnode')), ], ), ] diff --git a/contentcuration/automation/models.py b/contentcuration/automation/models.py index c5ade9096d..39b1819f59 100644 --- a/contentcuration/automation/models.py +++ b/contentcuration/automation/models.py @@ -1,12 +1,19 @@ import uuid from django.db import models -from django.db.models import JSONField +from kolibri_public.models import ContentNode class RecommendationsCache(models.Model): id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False) - request = JSONField(default=dict, null=True) - response = JSONField(default=dict, null=True) + request_hash = models.CharField(max_length=32, null=True) + response = models.ForeignKey( + ContentNode, + null=True, + blank=True, + related_name='recommendations', + on_delete=models.SET_NULL, + ) rank = models.FloatField(default=0.0, null=True) + override_threshold = models.BooleanField(default=False) timestamp = models.DateTimeField(auto_now_add=True) diff --git a/contentcuration/contentcuration/tests/utils/test_recommendations.py b/contentcuration/contentcuration/tests/utils/test_recommendations.py index e0fce53b76..f395005b63 100644 --- a/contentcuration/contentcuration/tests/utils/test_recommendations.py +++ b/contentcuration/contentcuration/tests/utils/test_recommendations.py @@ -34,6 +34,7 @@ def setUp(self): } ] } + self.channel_id = 'test_channel_id' self.resources = [ MagicMock(spec=ContentNode), ] @@ -42,6 +43,51 @@ def test_adapter_initialization(self): self.assertIsNotNone(self.adapter) self.assertIsInstance(self.adapter, RecommendationsAdapter) + def test_generate_embeddings_connect_failure(self): + mock_response = MagicMock(spec=EmbeddingsResponse) + + self.adapter.backend.connect.return_value = False + self.adapter.backend.make_request.return_value = mock_response + with self.assertRaises(errors.ConnectionError): + self.adapter.generate_embeddings(EmbedTopicsRequest( + method='POST', + url='http://test.com', + path='/test/path', + )) + self.adapter.backend.connect.assert_called_once() + self.adapter.backend.make_request.assert_not_called() + + def test_generate_embeddings(self): + mock_response = MagicMock(spec=EmbeddingsResponse) + mock_response.error = None + + self.adapter.backend.connect.return_value = True + self.adapter.backend.make_request.return_value = mock_response + response = self.adapter.generate_embeddings(EmbedTopicsRequest( + method='POST', + url='http://test.com', + path='/test/path', + )) + self.adapter.backend.connect.assert_called_once() + self.adapter.backend.make_request.assert_called_once() + self.assertIsInstance(response, EmbeddingsResponse) + + def test_generate_embeddings_failure(self): + mock_response = MagicMock(spec=EmbeddingsResponse) + mock_response.error = {} + + self.adapter.backend.connect.return_value = True + self.adapter.backend.make_request.return_value = mock_response + response = self.adapter.generate_embeddings(EmbedTopicsRequest( + method='POST', + url='http://test.com', + path='/test/path', + )) + self.adapter.backend.connect.assert_called_once() + self.adapter.backend.make_request.assert_called_once() + self.assertIsInstance(response, EmbeddingsResponse) + self.assertIsNotNone(response.error) + @patch('contentcuration.utils.recommendations.EmbedTopicsRequest') def test_get_recommendations_backend_connect_success(self, get_recommendations_request_mock): mock_response = MagicMock(spec=RecommendationsResponse) @@ -73,7 +119,7 @@ def test_embed_content_backend_connect_success(self, embed_content_request_mock) self.adapter.backend.connect.return_value = True self.adapter.backend.make_request.return_value = mock_response - response = self.adapter.embed_content(self.resources) + response = self.adapter.embed_content(self.channel_id, self.resources) self.adapter.backend.connect.assert_called_once() self.adapter.backend.make_request.assert_called_once() self.assertIsInstance(response, bool) @@ -83,6 +129,6 @@ def test_embed_content_backend_connect_success(self, embed_content_request_mock) def test_embed_content_backend_connect_failure(self, embed_content_request_mock): self.adapter.backend.connect.return_value = False with self.assertRaises(errors.ConnectionError): - self.adapter.embed_content(self.resources) + self.adapter.embed_content(self.channel_id, self.resources) self.adapter.backend.connect.assert_called_once() self.adapter.backend.make_request.assert_not_called() diff --git a/contentcuration/contentcuration/utils/automation_manager.py b/contentcuration/contentcuration/utils/automation_manager.py index e84fc38389..fbc76a0b5e 100644 --- a/contentcuration/contentcuration/utils/automation_manager.py +++ b/contentcuration/contentcuration/utils/automation_manager.py @@ -1,8 +1,11 @@ from typing import Any from typing import Dict from typing import List +from typing import Union -from contentcuration.models import ContentNode +from kolibri_public.models import ContentNode as PublicContentNode + +from contentcuration.models import ContentNode as ContentNode from contentcuration.utils.recommendations import RecommendationsAdapter from contentcuration.utils.recommendations import RecommendationsBackendFactory @@ -13,15 +16,16 @@ def __init__(self): self.instance = self.factory.create_backend() self.adapter = RecommendationsAdapter(self.instance) - def generate_embeddings(self, nodes: List[ContentNode]): + def generate_embeddings(self, channel_id: str, nodes: List[Union[ContentNode, PublicContentNode]]): """ Generates embeddings for the given list of nodes. This process is async. + :param channel_id: The channel id to which the nodes belong. :param nodes: The list of nodes for which to generate embeddings. :return: A boolean indicating that the process has started. """ - return self.adapter.embed_content(nodes) + return self.adapter.embed_content(channel_id, nodes) def load_recommendations(self, topic: Dict[str, Any], override_threshold=False): """ diff --git a/contentcuration/contentcuration/utils/recommendations.py b/contentcuration/contentcuration/utils/recommendations.py index d172dcbbb4..c0a604911d 100644 --- a/contentcuration/contentcuration/utils/recommendations.py +++ b/contentcuration/contentcuration/utils/recommendations.py @@ -1,3 +1,5 @@ +import hashlib +import json import logging from typing import Any from typing import Dict @@ -11,11 +13,11 @@ from automation.utils.appnexus.base import BackendFactory from automation.utils.appnexus.base import BackendRequest from automation.utils.appnexus.base import BackendResponse +from kolibri_public.models import ContentNode as PublicContentNode from le_utils.constants import content_kinds from le_utils.constants import format_presets -from contentcuration.models import Channel -from contentcuration.models import ContentNode +from contentcuration.models import ContentNode as ContentNode from contentcuration.models import File @@ -68,32 +70,23 @@ def create_backend(self) -> Backend: class RecommendationsAdapter(Adapter): - def generate_embeddings(self, request: EmbeddingsRequest, - cache: bool = True) -> EmbeddingsResponse: + def generate_embeddings(self, request: EmbeddingsRequest) -> EmbeddingsResponse: """ - Generates recommendations for the given request. + Generates embeddings for the given request. :param request: The request for which to generate embeddings. - :param cache: Whether to cache the embeddings request. :return: The response containing the recommendations. :rtype: EmbeddingsResponse """ if not self.backend.connect(): raise errors.ConnectionError("Connection to the backend failed") - cached_response = self.response_exists(request) - if cached_response: - return cached_response - try: response = self.backend.make_request(request) except Exception as e: logging.exception(e) response = EmbeddingsResponse(error=e) - if not response.error and cache: - self.cache_embeddings_request(request, response) - return response def response_exists(self, request) -> Union[EmbeddingsResponse, None]: @@ -105,8 +98,13 @@ def response_exists(self, request) -> Union[EmbeddingsResponse, None]: :rtype: Union[EmbeddingsResponse, None] """ try: - cache = list(RecommendationsCache.objects.filter(request=request).order_by('rank')) - data = [entry.response for entry in cache] + request_hash = self._generate_request_hash(request) + cache = list( + RecommendationsCache.objects.filter(request_hash=request_hash).order_by('rank')) + data = [{ + 'contentnode_id': entry.response, + 'rank': entry.rank, + } for entry in cache] if len(data) > 0: return EmbeddingsResponse(data=data) else: @@ -115,6 +113,23 @@ def response_exists(self, request) -> Union[EmbeddingsResponse, None]: logging.exception(e) return None + def _generate_request_hash(self, request) -> str: + """ + Generates a unique hash for a given request. + + This method serializes the request attributes that make it unique, + then generates a hash of this serialization. + + :param request: The request for which to generate a unique hash. + :return: A unique hash representing the request + """ + unique_attributes = json.dumps({ + 'params': request.params, + 'json': request.json, + }, sort_keys=True).encode('utf-8') + + return hashlib.md5(unique_attributes).hexdigest() + def cache_embeddings_request(self, request: BackendRequest, response: BackendResponse) -> bool: """ Caches the recommendations request and response. @@ -125,12 +140,15 @@ def cache_embeddings_request(self, request: BackendRequest, response: BackendRes :rtype: bool """ try: + request_hash = self._generate_request_hash(request) nodes = self._extract_data(response) + override_threshold = request.params.get('override_threshold', False) cache = [ RecommendationsCache( - request=request, - response=node, + request_hash=request_hash, + response=node['contentnode_id'], rank=node['rank'], + override_threshold=override_threshold, ) for node in nodes ] RecommendationsCache.objects.bulk_create(cache) @@ -149,12 +167,21 @@ def get_recommendations(self, topic: Dict[str, Any], :param override_threshold: A boolean flag to override the recommendation threshold. :return: The recommendations for the given topic. :rtype: RecommendationsResponse """ + recommendations = [] request = EmbedTopicsRequest( params={'override_threshold': override_threshold}, json=topic, ) - response = self.generate_embeddings(request=request) + + cached_response = self.response_exists(request) + if cached_response: + response = cached_response + else: + response = self.generate_embeddings(request=request) + if not response.error: + self.cache_embeddings_request(request, response) + nodes = self._extract_data(response) if len(nodes) > 0: node_ids = [node['contentnode_id'] for node in nodes] @@ -163,27 +190,61 @@ def get_recommendations(self, topic: Dict[str, Any], return RecommendationsResponse(results=recommendations) def _extract_data(self, response: BackendResponse) -> List[Dict[str, Any]]: - return response.data if response.data is not None else [] + """ + Extracts the data from the given response. + + The response is of the form: + + { + "data": [ + { + "contentnode_id": "", + "rank": 0.7 + } + ] + } + + :param response: A response from which to extract the data. + :return: The extracted data. + :rtype: List[Dict[str, Any]] + """ + return response.data if not response.data else [] - def embed_content(self, nodes: List[ContentNode]) -> bool: + def embed_content(self, channel_id: str, + nodes: List[Union[ContentNode, PublicContentNode]]) -> bool: """ Embeds the content for the given nodes. This is an asynchronous process and could take a while to complete. This process is handled by our curriculum automation service. - See https://github.com/learningequality/curriculum-automation + See https://github.com/learningequality/curriculum-automation. Also, see + https://github.com/learningequality/le-utils/blob/main/spec/schema-embed_content_request.json + for the schema. + :param channel_id: The channel ID to which the nodes belong. :param nodes: The nodes for which to embed the content. :return: A boolean indicating that content embedding process has started. :rtype: bool """ + if not self.backend.connect(): + raise errors.ConnectionError("Connection to the backend failed") + for i in range(0, len(nodes), 20): - batch = nodes[i:i + 20] - content = [self.extract_content(node) for node in batch] - request = EmbedContentRequest(json=content) - self.generate_embeddings(request=request, cache=False) + try: + batch = nodes[i:i + 20] + content = [self.extract_content(node) for node in batch] + content_body = { + 'resources': content, + 'metadata': { + 'channel_id': channel_id, + } + } + request = EmbedContentRequest(json=content_body) + self.backend.make_request(request) + except Exception as e: + logging.exception(e) return True - def extract_content(self, node: ContentNode) -> Dict[str, Any]: + def extract_content(self, node) -> Dict[str, Any]: """ Extracts the content from the given node. @@ -192,16 +253,29 @@ def extract_content(self, node: ContentNode) -> Dict[str, Any]: :rtype: Dict[str, Any] """ contentkind_to_presets = { - content_kinds.AUDIO: [format_presets.AUDIO, format_presets.AUDIO_DEPENDENCY], + content_kinds.AUDIO: [ + format_presets.AUDIO, + format_presets.AUDIO_DEPENDENCY, + ], content_kinds.VIDEO: [ - format_presets.VIDEO_DEPENDENCY, format_presets.VIDEO_HIGH_RES, format_presets.VIDEO_LOW_RES, format_presets.VIDEO_SUBTITLE, + format_presets.VIDEO_DEPENDENCY, + ], + content_kinds.EXERCISE: [ + format_presets.EXERCISE, + format_presets.QTI_ZIP, + ], + content_kinds.DOCUMENT: [ + format_presets.DOCUMENT, + format_presets.EPUB, + ], + content_kinds.HTML5: [ + format_presets.HTML5_ZIP, + format_presets.AUDIO_DEPENDENCY, + format_presets.VIDEO_DEPENDENCY, ], - content_kinds.EXERCISE: [format_presets.DOCUMENT, format_presets.EPUB], - content_kinds.DOCUMENT: [format_presets.DOCUMENT, format_presets.EPUB], - content_kinds.HTML5: [format_presets.HTML5_ZIP], content_kinds.H5P: [format_presets.H5P_ZIP], content_kinds.ZIM: [format_presets.ZIM], } @@ -210,54 +284,48 @@ def extract_content(self, node: ContentNode) -> Dict[str, Any]: presets = contentkind_to_presets.get(contentkind.kind) files = self._get_content_files(node, presets) if presets else None - try: - channel = Channel.object.filter(main_tree=node).first() - channel_id = channel.id if channel else None - except Exception as e: - logging.exception(e) - channel_id = None - return { - "resources": { - "id": node.node_id, - "title": node.title, - "description": node.description, - "text": "", - "language": node.language.lang_code if node.language else None, - "files": files - }, - "metadata": { - "channel_id": channel_id, - }, + "id": node.node_id, + "title": node.title, + "description": node.description, + "text": "", + "language": node.language.lang_code if node.language else None, + "files": files, } def _get_content_files(self, node, presets) -> List[Dict[str, Any]]: """ - Get the content files for the given node and presets. See - https://github.com/learningequality/le-utils/blob/main/spec/schema-embed_topics_request.json - for the file schema. + Get the content files for the given node and presets. :param node: The node for which to get the content files. :param presets: The presets for which to get the content files. :return: A list of dictionaries containing the content files. :rtype: List[Dict[str, Any]] """ + files = [] try: node_files = File.objects.filter(contentnode=node, preset__in=presets) + for file in node_files: + files.append(self._format_file_data(file)) except Exception as e: logging.exception(e) - node_files = [] - files = [] - for file in node_files: - file_dict = { - 'url': file.source_url, - 'preset': file.preset_id, - 'language': file.language.lang_code if file.language else None - } - files.append(file_dict) return files + def _format_file_data(self, file) -> Dict[str, Any]: + """ + Format the file data into a dictionary. + + :param file: The file for which to format its data. + :return: A dictionary containing the formatted file data. + :rtype: Dict[str, Any] + """ + return { + 'url': file.file_on_disk, + 'preset': file.preset_id, + 'language': file.language.lang_code if file.language else None, + } + class Recommendations(Backend): From 9dbbc5c0fa0b62843e0fb577c478bfe8dda9a3e1 Mon Sep 17 00:00:00 2001 From: Samson Akol Date: Fri, 26 Jul 2024 14:56:34 +0300 Subject: [PATCH 09/21] Use values to query dictionary --- .../automation/migrations/0001_initial.py | 32 ------------------- contentcuration/automation/models.py | 2 +- .../contentcuration/utils/recommendations.py | 13 ++++---- 3 files changed, 7 insertions(+), 40 deletions(-) delete mode 100644 contentcuration/automation/migrations/0001_initial.py diff --git a/contentcuration/automation/migrations/0001_initial.py b/contentcuration/automation/migrations/0001_initial.py deleted file mode 100644 index 87ee422398..0000000000 --- a/contentcuration/automation/migrations/0001_initial.py +++ /dev/null @@ -1,32 +0,0 @@ -# Generated by Django 3.2.24 on 2024-07-24 13:18 -import uuid - -import django.db.models.deletion -from django.db import migrations -from django.db import models - - -class Migration(migrations.Migration): - initial = True - - dependencies = [ - ('kolibri_public', '0003_alter_file_preset'), - ] - - operations = [ - migrations.CreateModel( - name='RecommendationsCache', - fields=[ - ('id', models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, - serialize=False)), - ('request_hash', models.CharField(max_length=32, null=True)), - ('rank', models.FloatField(default=0.0, null=True)), - ('override_threshold', models.BooleanField(default=False)), - ('timestamp', models.DateTimeField(auto_now_add=True)), - ('response', models.ForeignKey(blank=True, null=True, - on_delete=django.db.models.deletion.SET_NULL, - related_name='recommendations', - to='kolibri_public.contentnode')), - ], - ), - ] diff --git a/contentcuration/automation/models.py b/contentcuration/automation/models.py index 39b1819f59..a3d618f462 100644 --- a/contentcuration/automation/models.py +++ b/contentcuration/automation/models.py @@ -7,7 +7,7 @@ class RecommendationsCache(models.Model): id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False) request_hash = models.CharField(max_length=32, null=True) - response = models.ForeignKey( + contentnode_id = models.ForeignKey( ContentNode, null=True, blank=True, diff --git a/contentcuration/contentcuration/utils/recommendations.py b/contentcuration/contentcuration/utils/recommendations.py index c0a604911d..49859744c7 100644 --- a/contentcuration/contentcuration/utils/recommendations.py +++ b/contentcuration/contentcuration/utils/recommendations.py @@ -99,12 +99,11 @@ def response_exists(self, request) -> Union[EmbeddingsResponse, None]: """ try: request_hash = self._generate_request_hash(request) - cache = list( - RecommendationsCache.objects.filter(request_hash=request_hash).order_by('rank')) - data = [{ - 'contentnode_id': entry.response, - 'rank': entry.rank, - } for entry in cache] + data = list( + RecommendationsCache.objects.filter(request_hash=request_hash) + .order_by('rank') + .values('contentnode_id', 'rank') + ) if len(data) > 0: return EmbeddingsResponse(data=data) else: @@ -146,7 +145,7 @@ def cache_embeddings_request(self, request: BackendRequest, response: BackendRes cache = [ RecommendationsCache( request_hash=request_hash, - response=node['contentnode_id'], + contentnode_id=node['contentnode_id'], rank=node['rank'], override_threshold=override_threshold, ) for node in nodes From c7c598a5d7ce344f991ea25e34bc2024efa8ed64 Mon Sep 17 00:00:00 2001 From: Samson Akol Date: Mon, 29 Jul 2024 14:18:50 +0300 Subject: [PATCH 10/21] Improves cache implementation --- .../contentcuration/utils/recommendations.py | 35 +++++++++++++++---- 1 file changed, 28 insertions(+), 7 deletions(-) diff --git a/contentcuration/contentcuration/utils/recommendations.py b/contentcuration/contentcuration/utils/recommendations.py index 49859744c7..04fc6b8a3c 100644 --- a/contentcuration/contentcuration/utils/recommendations.py +++ b/contentcuration/contentcuration/utils/recommendations.py @@ -119,11 +119,18 @@ def _generate_request_hash(self, request) -> str: This method serializes the request attributes that make it unique, then generates a hash of this serialization. + To prevent cache duplication, the hash is generated + independent of the override_threshold parameter. + :param request: The request for which to generate a unique hash. :return: A unique hash representing the request """ + + params_copy = request.params.copy() if request.params else {} + params_copy.pop('override_threshold', None) + unique_attributes = json.dumps({ - 'params': request.params, + 'params': params_copy, 'json': request.json, }, sort_keys=True).encode('utf-8') @@ -131,31 +138,45 @@ def _generate_request_hash(self, request) -> str: def cache_embeddings_request(self, request: BackendRequest, response: BackendResponse) -> bool: """ - Caches the recommendations request and response. + Caches the recommendations request and response. It performs a bulk insert of the + recommendations into the RecommendationsCache table, ignoring any conflicts. :param request: The request to cache. :param response: The response to cache. :return: A boolean indicating whether the caching was successful. :rtype: bool """ + try: - request_hash = self._generate_request_hash(request) nodes = self._extract_data(response) - override_threshold = request.params.get('override_threshold', False) - cache = [ + request_hash = self._generate_request_hash(request) + existing_cache = set(RecommendationsCache.objects.filter(request_hash=request_hash) + .values_list('contentnode_id', flat=True)) + override_threshold = self._extract_override_threshold(request) + new_cache = [ RecommendationsCache( request_hash=request_hash, contentnode_id=node['contentnode_id'], rank=node['rank'], override_threshold=override_threshold, - ) for node in nodes + ) for node in nodes if node['contentnode_id'] not in existing_cache ] - RecommendationsCache.objects.bulk_create(cache) + RecommendationsCache.objects.bulk_create(new_cache, ignore_conflicts=True) return True except Exception as e: logging.exception(e) return False + def _extract_override_threshold(self, request) -> bool: + """ + Extracts the override_threshold parameter from the request safely. + + :param request: The request containing the parameters. + :return: The value of the override_threshold parameter, or False if not present. + :rtype: bool + """ + return request.params.get('override_threshold', False) if request.params else False + def get_recommendations(self, topic: Dict[str, Any], override_threshold=False) -> RecommendationsResponse: """ From a8cca1e5664443e73c94e198a3da266fa5ffa628 Mon Sep 17 00:00:00 2001 From: Samson Akol Date: Tue, 30 Jul 2024 15:27:26 +0300 Subject: [PATCH 11/21] Adjusts docstring description --- contentcuration/contentcuration/utils/recommendations.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/contentcuration/contentcuration/utils/recommendations.py b/contentcuration/contentcuration/utils/recommendations.py index 04fc6b8a3c..33e3bcaaf0 100644 --- a/contentcuration/contentcuration/utils/recommendations.py +++ b/contentcuration/contentcuration/utils/recommendations.py @@ -114,10 +114,8 @@ def response_exists(self, request) -> Union[EmbeddingsResponse, None]: def _generate_request_hash(self, request) -> str: """ - Generates a unique hash for a given request. - - This method serializes the request attributes that make it unique, - then generates a hash of this serialization. + Generates a unique hash for a given request. It serializes the request + attributes that make it unique, then generates a hash of this serialization. To prevent cache duplication, the hash is generated independent of the override_threshold parameter. From 2ebd5ee3433c4258e413f1428c2c205e9496ab03 Mon Sep 17 00:00:00 2001 From: Samson Akol Date: Tue, 30 Jul 2024 15:36:56 +0300 Subject: [PATCH 12/21] Adds unique constraint to cache model --- contentcuration/automation/models.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/contentcuration/automation/models.py b/contentcuration/automation/models.py index a3d618f462..8e1bc2496b 100644 --- a/contentcuration/automation/models.py +++ b/contentcuration/automation/models.py @@ -17,3 +17,6 @@ class RecommendationsCache(models.Model): rank = models.FloatField(default=0.0, null=True) override_threshold = models.BooleanField(default=False) timestamp = models.DateTimeField(auto_now_add=True) + + class Meta: + unique_together = ('request_hash', 'contentnode_id') From b7af67fcc78036c2b5757bdc31eee0ee7e0c8fc9 Mon Sep 17 00:00:00 2001 From: Samson Akol Date: Tue, 30 Jul 2024 16:35:09 +0300 Subject: [PATCH 13/21] Adds indexes for optimized data querying --- contentcuration/automation/models.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/contentcuration/automation/models.py b/contentcuration/automation/models.py index 8e1bc2496b..2d7c5807e6 100644 --- a/contentcuration/automation/models.py +++ b/contentcuration/automation/models.py @@ -4,6 +4,11 @@ from kolibri_public.models import ContentNode +REQUEST_HASH_INDEX_NAME = "request_hash_idx" +CONTENTNODE_ID_INDEX_NAME = "contentnode_id_idx" +REQUEST_CONTENTNODE_INDEX_NAME = "request_hash_contentnode_id_idx" + + class RecommendationsCache(models.Model): id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False) request_hash = models.CharField(max_length=32, null=True) @@ -20,3 +25,11 @@ class RecommendationsCache(models.Model): class Meta: unique_together = ('request_hash', 'contentnode_id') + indexes = [ + models.Index(fields=['request_hash'], name=REQUEST_HASH_INDEX_NAME), + models.Index(fields=['contentnode_id'], name=CONTENTNODE_ID_INDEX_NAME), + models.Index( + fields=['request_hash', 'contentnode_id'], + name=REQUEST_CONTENTNODE_INDEX_NAME, + ), + ] From 50fc1d824bc8945d52bcc7aa16a314df796109a7 Mon Sep 17 00:00:00 2001 From: Samson Akol Date: Tue, 30 Jul 2024 16:42:20 +0300 Subject: [PATCH 14/21] Adds migrations --- .../automation/migrations/0001_initial.py | 49 +++++++++++++++++++ contentcuration/automation/models.py | 2 +- 2 files changed, 50 insertions(+), 1 deletion(-) create mode 100644 contentcuration/automation/migrations/0001_initial.py diff --git a/contentcuration/automation/migrations/0001_initial.py b/contentcuration/automation/migrations/0001_initial.py new file mode 100644 index 0000000000..82ee0c1188 --- /dev/null +++ b/contentcuration/automation/migrations/0001_initial.py @@ -0,0 +1,49 @@ +# Generated by Django 3.2.24 on 2024-07-30 13:36 +import uuid + +import django.db.models.deletion +from django.db import migrations +from django.db import models + + +class Migration(migrations.Migration): + initial = True + + dependencies = [ + ('kolibri_public', '0003_alter_file_preset'), + ] + + operations = [ + migrations.CreateModel( + name='RecommendationsCache', + fields=[ + ('id', models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, + serialize=False)), + ('request_hash', models.CharField(max_length=32, null=True)), + ('rank', models.FloatField(default=0.0, null=True)), + ('override_threshold', models.BooleanField(default=False)), + ('timestamp', models.DateTimeField(auto_now_add=True)), + ('contentnode_id', models.ForeignKey(blank=True, null=True, + on_delete=django.db.models.deletion.SET_NULL, + related_name='recommendations', + to='kolibri_public.contentnode')), + ], + ), + migrations.AddIndex( + model_name='recommendationscache', + index=models.Index(fields=['request_hash'], name='request_hash_idx'), + ), + migrations.AddIndex( + model_name='recommendationscache', + index=models.Index(fields=['contentnode_id'], name='contentnode_id_idx'), + ), + migrations.AddIndex( + model_name='recommendationscache', + index=models.Index(fields=['request_hash', 'contentnode_id'], + name='request_hash_contentnode_idx'), + ), + migrations.AlterUniqueTogether( + name='recommendationscache', + unique_together={('request_hash', 'contentnode_id')}, + ), + ] diff --git a/contentcuration/automation/models.py b/contentcuration/automation/models.py index 2d7c5807e6..dc518eeda9 100644 --- a/contentcuration/automation/models.py +++ b/contentcuration/automation/models.py @@ -6,7 +6,7 @@ REQUEST_HASH_INDEX_NAME = "request_hash_idx" CONTENTNODE_ID_INDEX_NAME = "contentnode_id_idx" -REQUEST_CONTENTNODE_INDEX_NAME = "request_hash_contentnode_id_idx" +REQUEST_CONTENTNODE_INDEX_NAME = "request_hash_contentnode_idx" class RecommendationsCache(models.Model): From 0def99e926facd3aa40a2f8a478b117f1e739369 Mon Sep 17 00:00:00 2001 From: Samson Akol Date: Thu, 1 Aug 2024 16:48:37 +0300 Subject: [PATCH 15/21] Writes queries to get main_tree_id --- .../automation/migrations/0001_initial.py | 49 ------------------- contentcuration/automation/models.py | 5 -- .../contentcuration/utils/recommendations.py | 41 +++++++++++++--- 3 files changed, 35 insertions(+), 60 deletions(-) delete mode 100644 contentcuration/automation/migrations/0001_initial.py diff --git a/contentcuration/automation/migrations/0001_initial.py b/contentcuration/automation/migrations/0001_initial.py deleted file mode 100644 index 82ee0c1188..0000000000 --- a/contentcuration/automation/migrations/0001_initial.py +++ /dev/null @@ -1,49 +0,0 @@ -# Generated by Django 3.2.24 on 2024-07-30 13:36 -import uuid - -import django.db.models.deletion -from django.db import migrations -from django.db import models - - -class Migration(migrations.Migration): - initial = True - - dependencies = [ - ('kolibri_public', '0003_alter_file_preset'), - ] - - operations = [ - migrations.CreateModel( - name='RecommendationsCache', - fields=[ - ('id', models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, - serialize=False)), - ('request_hash', models.CharField(max_length=32, null=True)), - ('rank', models.FloatField(default=0.0, null=True)), - ('override_threshold', models.BooleanField(default=False)), - ('timestamp', models.DateTimeField(auto_now_add=True)), - ('contentnode_id', models.ForeignKey(blank=True, null=True, - on_delete=django.db.models.deletion.SET_NULL, - related_name='recommendations', - to='kolibri_public.contentnode')), - ], - ), - migrations.AddIndex( - model_name='recommendationscache', - index=models.Index(fields=['request_hash'], name='request_hash_idx'), - ), - migrations.AddIndex( - model_name='recommendationscache', - index=models.Index(fields=['contentnode_id'], name='contentnode_id_idx'), - ), - migrations.AddIndex( - model_name='recommendationscache', - index=models.Index(fields=['request_hash', 'contentnode_id'], - name='request_hash_contentnode_idx'), - ), - migrations.AlterUniqueTogether( - name='recommendationscache', - unique_together={('request_hash', 'contentnode_id')}, - ), - ] diff --git a/contentcuration/automation/models.py b/contentcuration/automation/models.py index dc518eeda9..9925efb20f 100644 --- a/contentcuration/automation/models.py +++ b/contentcuration/automation/models.py @@ -6,7 +6,6 @@ REQUEST_HASH_INDEX_NAME = "request_hash_idx" CONTENTNODE_ID_INDEX_NAME = "contentnode_id_idx" -REQUEST_CONTENTNODE_INDEX_NAME = "request_hash_contentnode_idx" class RecommendationsCache(models.Model): @@ -28,8 +27,4 @@ class Meta: indexes = [ models.Index(fields=['request_hash'], name=REQUEST_HASH_INDEX_NAME), models.Index(fields=['contentnode_id'], name=CONTENTNODE_ID_INDEX_NAME), - models.Index( - fields=['request_hash', 'contentnode_id'], - name=REQUEST_CONTENTNODE_INDEX_NAME, - ), ] diff --git a/contentcuration/contentcuration/utils/recommendations.py b/contentcuration/contentcuration/utils/recommendations.py index 33e3bcaaf0..7748a7f622 100644 --- a/contentcuration/contentcuration/utils/recommendations.py +++ b/contentcuration/contentcuration/utils/recommendations.py @@ -13,10 +13,16 @@ from automation.utils.appnexus.base import BackendFactory from automation.utils.appnexus.base import BackendRequest from automation.utils.appnexus.base import BackendResponse +from django.db.models import F +from django.db.models import OuterRef +from django.db.models import Subquery +from django.db.models import Value +from django.db.models.functions import Replace from kolibri_public.models import ContentNode as PublicContentNode from le_utils.constants import content_kinds from le_utils.constants import format_presets +from contentcuration.models import Channel from contentcuration.models import ContentNode as ContentNode from contentcuration.models import File @@ -99,11 +105,11 @@ def response_exists(self, request) -> Union[EmbeddingsResponse, None]: """ try: request_hash = self._generate_request_hash(request) - data = list( - RecommendationsCache.objects.filter(request_hash=request_hash) - .order_by('rank') - .values('contentnode_id', 'rank') - ) + override_threshold = self._extract_override_threshold(request) + data = list(RecommendationsCache.objects + .filter(request_hash=request_hash, override_threshold=override_threshold) + .order_by('override_threshold', 'rank') + .values('contentnode_id', 'rank')) if len(data) > 0: return EmbeddingsResponse(data=data) else: @@ -203,7 +209,21 @@ def get_recommendations(self, topic: Dict[str, Any], nodes = self._extract_data(response) if len(nodes) > 0: node_ids = [node['contentnode_id'] for node in nodes] - recommendations = list(ContentNode.objects.filter(id__in=node_ids)) + + # Get the channel_id from PublicContentNode based on matching node_id from ContentNode + channel_id_subquery = PublicContentNode.objects.filter( + self._normalize_uuid(F('id')) == self._normalize_uuid(OuterRef('node_id')) + ).values('channel_id')[:1] + + # Get main_tree_id from Channel based on channel_id obtained from channel_id_subquery + main_tree_id_subquery = Channel.objects.filter( + self._normalize_uuid(F('id')) == self._normalize_uuid(Subquery(channel_id_subquery)) + ).values('main_tree_id')[:1] + + # Annotate main_tree_id onto ContentNode + recommendations = ContentNode.objects.filter(id__in=node_ids).annotate( + main_tree_id=Subquery(main_tree_id_subquery) + ).values('id', 'node_id', 'main_tree_id', 'parent_id') return RecommendationsResponse(results=recommendations) @@ -228,6 +248,15 @@ def _extract_data(self, response: BackendResponse) -> List[Dict[str, Any]]: """ return response.data if not response.data else [] + def _normalize_uuid(self, field): + """ + Removes hyphens from a UUID field. + + :param field: The field (such as F() object or OuterRef) whose value needs normalization. + :return: The normalized field expression without hyphens. + """ + return Replace(field, Value('-'), Value('')) + def embed_content(self, channel_id: str, nodes: List[Union[ContentNode, PublicContentNode]]) -> bool: """ From 374da3fe5f181aef2321895b6b18080f1ad49c26 Mon Sep 17 00:00:00 2001 From: Samson Akol Date: Thu, 1 Aug 2024 16:53:46 +0300 Subject: [PATCH 16/21] Reruns migrations --- .../automation/migrations/0001_initial.py | 44 +++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 contentcuration/automation/migrations/0001_initial.py diff --git a/contentcuration/automation/migrations/0001_initial.py b/contentcuration/automation/migrations/0001_initial.py new file mode 100644 index 0000000000..96e54e69c9 --- /dev/null +++ b/contentcuration/automation/migrations/0001_initial.py @@ -0,0 +1,44 @@ +# Generated by Django 3.2.24 on 2024-08-01 13:52 +import uuid + +import django.db.models.deletion +from django.db import migrations +from django.db import models + + +class Migration(migrations.Migration): + initial = True + + dependencies = [ + ('kolibri_public', '0003_alter_file_preset'), + ] + + operations = [ + migrations.CreateModel( + name='RecommendationsCache', + fields=[ + ('id', models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, + serialize=False)), + ('request_hash', models.CharField(max_length=32, null=True)), + ('rank', models.FloatField(default=0.0, null=True)), + ('override_threshold', models.BooleanField(default=False)), + ('timestamp', models.DateTimeField(auto_now_add=True)), + ('contentnode_id', models.ForeignKey(blank=True, null=True, + on_delete=django.db.models.deletion.SET_NULL, + related_name='recommendations', + to='kolibri_public.contentnode')), + ], + ), + migrations.AddIndex( + model_name='recommendationscache', + index=models.Index(fields=['request_hash'], name='request_hash_idx'), + ), + migrations.AddIndex( + model_name='recommendationscache', + index=models.Index(fields=['contentnode_id'], name='contentnode_id_idx'), + ), + migrations.AlterUniqueTogether( + name='recommendationscache', + unique_together={('request_hash', 'contentnode_id')}, + ), + ] From d70e97817962551b244314af56b8d9927c486187 Mon Sep 17 00:00:00 2001 From: Samson Akol Date: Wed, 7 Aug 2024 17:17:45 +0300 Subject: [PATCH 17/21] adds more tests --- .../automation/migrations/0001_initial.py | 14 +- contentcuration/automation/models.py | 10 +- .../tests/test_recommendations_cache_model.py | 64 ++++ .../tests/utils/test_recommendations.py | 335 ++++++++++++++---- .../contentcuration/utils/recommendations.py | 72 ++-- 5 files changed, 395 insertions(+), 100 deletions(-) create mode 100644 contentcuration/automation/tests/test_recommendations_cache_model.py diff --git a/contentcuration/automation/migrations/0001_initial.py b/contentcuration/automation/migrations/0001_initial.py index 96e54e69c9..6152c0f4b2 100644 --- a/contentcuration/automation/migrations/0001_initial.py +++ b/contentcuration/automation/migrations/0001_initial.py @@ -1,4 +1,4 @@ -# Generated by Django 3.2.24 on 2024-08-01 13:52 +# Generated by Django 3.2.24 on 2024-08-05 21:23 import uuid import django.db.models.deletion @@ -23,10 +23,10 @@ class Migration(migrations.Migration): ('rank', models.FloatField(default=0.0, null=True)), ('override_threshold', models.BooleanField(default=False)), ('timestamp', models.DateTimeField(auto_now_add=True)), - ('contentnode_id', models.ForeignKey(blank=True, null=True, - on_delete=django.db.models.deletion.SET_NULL, - related_name='recommendations', - to='kolibri_public.contentnode')), + ('contentnode', models.ForeignKey(blank=True, null=True, + on_delete=django.db.models.deletion.CASCADE, + related_name='recommendations', + to='kolibri_public.contentnode')), ], ), migrations.AddIndex( @@ -35,10 +35,10 @@ class Migration(migrations.Migration): ), migrations.AddIndex( model_name='recommendationscache', - index=models.Index(fields=['contentnode_id'], name='contentnode_id_idx'), + index=models.Index(fields=['contentnode'], name='contentnode_idx'), ), migrations.AlterUniqueTogether( name='recommendationscache', - unique_together={('request_hash', 'contentnode_id')}, + unique_together={('request_hash', 'contentnode')}, ), ] diff --git a/contentcuration/automation/models.py b/contentcuration/automation/models.py index 9925efb20f..7000ddf420 100644 --- a/contentcuration/automation/models.py +++ b/contentcuration/automation/models.py @@ -5,26 +5,26 @@ REQUEST_HASH_INDEX_NAME = "request_hash_idx" -CONTENTNODE_ID_INDEX_NAME = "contentnode_id_idx" +CONTENTNODE_INDEX_NAME = "contentnode_idx" class RecommendationsCache(models.Model): id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False) request_hash = models.CharField(max_length=32, null=True) - contentnode_id = models.ForeignKey( + contentnode = models.ForeignKey( ContentNode, null=True, blank=True, related_name='recommendations', - on_delete=models.SET_NULL, + on_delete=models.CASCADE, ) rank = models.FloatField(default=0.0, null=True) override_threshold = models.BooleanField(default=False) timestamp = models.DateTimeField(auto_now_add=True) class Meta: - unique_together = ('request_hash', 'contentnode_id') + unique_together = ('request_hash', 'contentnode') indexes = [ models.Index(fields=['request_hash'], name=REQUEST_HASH_INDEX_NAME), - models.Index(fields=['contentnode_id'], name=CONTENTNODE_ID_INDEX_NAME), + models.Index(fields=['contentnode'], name=CONTENTNODE_INDEX_NAME), ] diff --git a/contentcuration/automation/tests/test_recommendations_cache_model.py b/contentcuration/automation/tests/test_recommendations_cache_model.py new file mode 100644 index 0000000000..60caaca95e --- /dev/null +++ b/contentcuration/automation/tests/test_recommendations_cache_model.py @@ -0,0 +1,64 @@ +import uuid + +from automation.models import RecommendationsCache +from django.db import IntegrityError +from kolibri_public.models import ContentNode + +from contentcuration.tests.base import StudioTestCase + + +class TestRecommendationsCache(StudioTestCase): + + def setUp(self): + self.content_node = ContentNode.objects.create( + id=uuid.uuid4(), + title='Test Content Node', + content_id=uuid.uuid4(), + channel_id=uuid.uuid4(), + ) + self.cache = RecommendationsCache.objects.create( + request_hash='test_hash', + contentnode_id=self.content_node, + rank=1.0, + override_threshold=False + ) + + def test_cache_creation(self): + self.assertIsInstance(self.cache, RecommendationsCache) + self.assertEqual(self.cache.request_hash, 'test_hash') + self.assertEqual(self.cache.contentnode_id, self.content_node) + self.assertEqual(self.cache.rank, 1.0) + self.assertFalse(self.cache.override_threshold) + + def test_cache_retrieval(self): + retrieved_cache = RecommendationsCache.objects.get(request_hash='test_hash') + self.assertEqual(retrieved_cache, self.cache) + + def test_cache_uniqueness(self): + with self.assertRaises(IntegrityError): + RecommendationsCache.objects.create( + request_hash='test_hash', + contentnode_id=self.content_node, + rank=2.0, + override_threshold=True + ) + + def test_bulk_create_ignore_conflicts_true(self): + initial_count = RecommendationsCache.objects.count() + try: + RecommendationsCache.objects.bulk_create( + [self.cache, self.cache], + ignore_conflicts=True + ) + except IntegrityError: + self.fail("bulk_create raised IntegrityError unexpectedly!") + + final_count = RecommendationsCache.objects.count() + self.assertEqual(initial_count, final_count) + + def test_bulk_create_ignore_conflicts_false(self): + with self.assertRaises(IntegrityError): + RecommendationsCache.objects.bulk_create( + [self.cache, self.cache], + ignore_conflicts=False + ) diff --git a/contentcuration/contentcuration/tests/utils/test_recommendations.py b/contentcuration/contentcuration/tests/utils/test_recommendations.py index f395005b63..c6e9796d4a 100644 --- a/contentcuration/contentcuration/tests/utils/test_recommendations.py +++ b/contentcuration/contentcuration/tests/utils/test_recommendations.py @@ -1,8 +1,16 @@ +import copy +import uuid + +from automation.models import RecommendationsCache from automation.utils.appnexus import errors +from automation.utils.appnexus.base import BackendResponse from django.test import TestCase +from kolibri_public.models import ContentNode as PublicContentNode from mock import MagicMock from mock import patch +from contentcuration.models import Channel +from contentcuration.models import ContentKind from contentcuration.models import ContentNode from contentcuration.utils.recommendations import EmbeddingsResponse from contentcuration.utils.recommendations import EmbedTopicsRequest @@ -19,9 +27,10 @@ def test_backend_initialization(self): class RecommendationsAdapterTestCase(TestCase): - def setUp(self): - self.adapter = RecommendationsAdapter(MagicMock()) - self.topic = { + @classmethod + def setUpTestData(cls): + cls.adapter = RecommendationsAdapter(MagicMock()) + cls.topic = { 'id': 'topic_id', 'title': 'topic_title', 'description': 'topic_description', @@ -34,101 +43,297 @@ def setUp(self): } ] } - self.channel_id = 'test_channel_id' - self.resources = [ - MagicMock(spec=ContentNode), + cls.channel_id = 'test_channel_id' + cls.resources = [MagicMock(spec=ContentNode)] + + cls.request = EmbedTopicsRequest( + method='POST', + url='http://test.com', + path='/test/path', + params={'override_threshold': False}, + json=cls.topic + ) + cls.api_response = BackendResponse(data=[ + {'contentnode_id': '1234567890abcdef1234567890abcdef', 'rank': 0.9}, + {'contentnode_id': 'abcdef1234567890abcdef1234567890', 'rank': 0.8} + ]) + cls.recommendations_response = [ + { + 'id': '1234567890abcdef1234567890abcdef', + 'node_id': '1234567890abcdef1234567890abcdef', + 'parent_id': None, + 'main_tree_id': '4387374055304864a731f3e705d64639' + }, + { + 'id': 'abcdef1234567890abcdef1234567890', + 'node_id': 'abcdef1234567890abcdef1234567890', + 'parent_id': None, + 'main_tree_id': '0548a548dda8487b8ac2f81145737cfc' + } ] + cls.content_kind = ContentKind.objects.create(kind='topic') + + cls.channel_1 = Channel.objects.create( + id='ddec09d74e834241a580c480ee37879c', + name='Channel 1', + main_tree=ContentNode.objects.create( + id='4387374055304864a731f3e705d64639', + title='Main tree 1', + content_id=uuid.uuid4(), + node_id='e947222469504e789476cf3ffc5e3801', + kind=cls.content_kind + ) + ) + cls.channel_2 = Channel.objects.create( + id='84fcaec1e0514b62899d7f436384c401', + name='Channel 2', + main_tree=ContentNode.objects.create( + id='0548a548dda8487b8ac2f81145737cfc', + title='Main tree 2', + content_id=uuid.uuid4(), + node_id='f7547941d75d4712a53f566b4bf93250', + kind=cls.content_kind + ) + ) + + cls.public_content_node_1 = PublicContentNode.objects.create( + id=uuid.UUID('1234567890abcdef1234567890abcdef'), + title='Public Content Node 1', + content_id=uuid.uuid4(), + channel_id=uuid.UUID('ddec09d74e834241a580c480ee37879c'), + ) + cls.public_content_node_2 = PublicContentNode.objects.create( + id=uuid.UUID('abcdef1234567890abcdef1234567890'), + title='Public Content Node 2', + content_id=uuid.uuid4(), + channel_id=uuid.UUID('84fcaec1e0514b62899d7f436384c401'), + ) + + cls.content_node_1 = ContentNode.objects.create( + id='1234567890abcdef1234567890abcdef', + title='Content Node 1', + content_id=uuid.uuid4(), + node_id='1234567890abcdef1234567890abcdef', + kind=cls.content_kind + ) + cls.content_node_2 = ContentNode.objects.create( + id='abcdef1234567890abcdef1234567890', + title='Content Node 2', + content_id=uuid.uuid4(), + node_id='abcdef1234567890abcdef1234567890', + kind=cls.content_kind + ) + + cls.cache = RecommendationsCache.objects.create( + request_hash=cls.adapter._generate_request_hash(cls.request), + contentnode=cls.public_content_node_1, + rank=1.0, + override_threshold=False + ) + + def assert_backend_call(self, mock_response_exists, response_exists_value, connect_value, + make_request_value, method, *args): + mock_response_exists.return_value = response_exists_value + self.adapter.backend.connect.return_value = connect_value + self.adapter.backend.make_request.return_value = make_request_value + + if response_exists_value: + result = method(*args) + mock_response_exists.assert_called_once() + self.adapter.backend.connect.assert_not_called() + self.adapter.backend.make_request.assert_not_called() + return result + else: + if connect_value: + result = method(*args) + self.adapter.backend.connect.assert_called_once() + self.adapter.backend.make_request.assert_called_once() + return result + else: + with self.assertRaises(errors.ConnectionError): + method(*args) + self.adapter.backend.connect.assert_called_once() + self.adapter.backend.make_request.assert_not_called() + def test_adapter_initialization(self): self.assertIsNotNone(self.adapter) self.assertIsInstance(self.adapter, RecommendationsAdapter) - def test_generate_embeddings_connect_failure(self): + @patch('contentcuration.utils.recommendations.RecommendationsAdapter.response_exists') + def test_generate_embeddings_connect_failure(self, mock_response_exists): mock_response = MagicMock(spec=EmbeddingsResponse) + self.assert_backend_call(mock_response_exists, None, False, mock_response, + self.adapter.generate_embeddings, self.request) - self.adapter.backend.connect.return_value = False - self.adapter.backend.make_request.return_value = mock_response - with self.assertRaises(errors.ConnectionError): - self.adapter.generate_embeddings(EmbedTopicsRequest( - method='POST', - url='http://test.com', - path='/test/path', - )) - self.adapter.backend.connect.assert_called_once() - self.adapter.backend.make_request.assert_not_called() - - def test_generate_embeddings(self): + @patch('contentcuration.utils.recommendations.RecommendationsAdapter.response_exists') + def test_generate_embeddings(self, mock_response_exists): mock_response = MagicMock(spec=EmbeddingsResponse) mock_response.error = None - - self.adapter.backend.connect.return_value = True - self.adapter.backend.make_request.return_value = mock_response - response = self.adapter.generate_embeddings(EmbedTopicsRequest( - method='POST', - url='http://test.com', - path='/test/path', - )) - self.adapter.backend.connect.assert_called_once() - self.adapter.backend.make_request.assert_called_once() + response = self.assert_backend_call(mock_response_exists, None, True, mock_response, + self.adapter.generate_embeddings, self.request) self.assertIsInstance(response, EmbeddingsResponse) - def test_generate_embeddings_failure(self): + @patch('contentcuration.utils.recommendations.RecommendationsAdapter.response_exists') + def test_generate_embeddings_failure(self, mock_response_exists): mock_response = MagicMock(spec=EmbeddingsResponse) mock_response.error = {} + response = self.assert_backend_call(mock_response_exists, None, True, mock_response, + self.adapter.generate_embeddings, self.request) + self.assertIsInstance(response, EmbeddingsResponse) + self.assertIsNotNone(response.error) + + def test_response_exists(self): + response = self.adapter.response_exists(self.request) + public_content_node_id = str(self.public_content_node_1.id).replace('-', '') - self.adapter.backend.connect.return_value = True - self.adapter.backend.make_request.return_value = mock_response - response = self.adapter.generate_embeddings(EmbedTopicsRequest( + self.assertIsNotNone(response) + self.assertIsInstance(response, EmbeddingsResponse) + self.assertEqual(len(response.data), 1) + self.assertEqual(response.data[0]['contentnode_id'], public_content_node_id) + self.assertEqual(response.data[0]['rank'], 1.0) + + def test_response_does_not_exist(self): + new_request = EmbedTopicsRequest( method='POST', url='http://test.com', path='/test/path', - )) - self.adapter.backend.connect.assert_called_once() - self.adapter.backend.make_request.assert_called_once() - self.assertIsInstance(response, EmbeddingsResponse) - self.assertIsNotNone(response.error) + params={'override_threshold': True}, + json={'topic': 'new_test_topic'} + ) + response = self.adapter.response_exists(new_request) + self.assertIsNone(response) + + def cache_request_test_helper(self, request_json, response_data, expected_count): + new_request = copy.deepcopy(self.request) + new_request.json = request_json + response_copy = copy.deepcopy(self.api_response) + response_copy.data = response_data + + result = self.adapter.cache_embeddings_request(new_request, response_copy) + self.assertTrue(result) + + cached_items = RecommendationsCache.objects.filter( + request_hash=self.adapter._generate_request_hash(new_request) + ) + self.assertEqual(cached_items.count(), expected_count) + + def test_cache_embeddings_request_success(self): + self.cache_request_test_helper({'topic': 'new_test_topic_1'}, self.api_response.data, 2) + + def test_cache_embeddings_request_empty_data(self): + self.cache_request_test_helper({'topic': 'new_test_topic_2'}, [], 0) + def test_cache_embeddings_request_ignore_duplicates(self): + duplicate_data = [ + {'contentnode_id': '1234567890abcdef1234567890abcdef', 'rank': 0.9}, + {'contentnode_id': '1234567890abcdef1234567890abcdef', 'rank': 0.9} + ] + self.cache_request_test_helper({'topic': 'new_test_topic_3'}, duplicate_data, 1) + + @patch('contentcuration.utils.recommendations.RecommendationsAdapter.cache_embeddings_request') + @patch('contentcuration.utils.recommendations.RecommendationsAdapter.generate_embeddings') + @patch('contentcuration.utils.recommendations.RecommendationsAdapter.response_exists') @patch('contentcuration.utils.recommendations.EmbedTopicsRequest') - def test_get_recommendations_backend_connect_success(self, get_recommendations_request_mock): - mock_response = MagicMock(spec=RecommendationsResponse) - mock_response.data = [] + def test_get_recommendations_success(self, mock_embed_topics_request, mock_response_exists, + mock_generate_embeddings, mock_cache_embeddings_request): + mock_response_exists.return_value = None + mock_response = MagicMock(spec=EmbeddingsResponse) + mock_response.data = copy.deepcopy(self.api_response.data) mock_response.error = None + mock_generate_embeddings.return_value = mock_response - self.adapter.backend.connect.return_value = True - self.adapter.backend.make_request.return_value = mock_response response = self.adapter.get_recommendations(self.topic) - self.adapter.backend.connect.assert_called_once() - self.adapter.backend.make_request.assert_called_once() + + mock_response_exists.assert_called_once() + mock_generate_embeddings.assert_called_once() self.assertIsInstance(response, RecommendationsResponse) + self.assertListEqual(list(response.results), self.recommendations_response) + self.assertEqual(len(response.results), 2) + @patch('contentcuration.utils.recommendations.RecommendationsAdapter._extract_data') + @patch('contentcuration.utils.recommendations.RecommendationsAdapter.response_exists') @patch('contentcuration.utils.recommendations.EmbedTopicsRequest') - def test_get_recommendations_backend_connect_failure(self, embed_topics_request_mock): + def test_get_recommendations_failure(self, mock_embed_topics_request, mock_response_exists, + mock_extract_data): mock_request_instance = MagicMock(spec=EmbedTopicsRequest) - embed_topics_request_mock.return_value = mock_request_instance + mock_embed_topics_request.return_value = mock_request_instance - self.adapter.backend.connect.return_value = False - with self.assertRaises(errors.ConnectionError): - self.adapter.get_recommendations(self.topic) - self.adapter.backend.connect.assert_called_once() - self.adapter.backend.make_request.assert_not_called() + self.assert_backend_call(mock_response_exists, None, False, None, + self.adapter.get_recommendations, self.topic) + @patch('contentcuration.utils.recommendations.RecommendationsAdapter._extract_data') + @patch('contentcuration.utils.recommendations.RecommendationsAdapter.response_exists') @patch('contentcuration.utils.recommendations.EmbedContentRequest') - def test_embed_content_backend_connect_success(self, embed_content_request_mock): + def test_embed_content_success(self, mock_embed_topics_request, mock_response_exists, + mock_extract_data): mock_response = MagicMock(spec=EmbeddingsResponse) mock_response.error = None - - self.adapter.backend.connect.return_value = True - self.adapter.backend.make_request.return_value = mock_response - response = self.adapter.embed_content(self.channel_id, self.resources) - self.adapter.backend.connect.assert_called_once() - self.adapter.backend.make_request.assert_called_once() + response = self.assert_backend_call(mock_response_exists, None, True, mock_response, + self.adapter.embed_content, self.channel_id, + self.resources) self.assertIsInstance(response, bool) self.assertTrue(response) + @patch('contentcuration.utils.recommendations.RecommendationsAdapter.response_exists') @patch('contentcuration.utils.recommendations.EmbedContentRequest') - def test_embed_content_backend_connect_failure(self, embed_content_request_mock): - self.adapter.backend.connect.return_value = False - with self.assertRaises(errors.ConnectionError): - self.adapter.embed_content(self.channel_id, self.resources) - self.adapter.backend.connect.assert_called_once() - self.adapter.backend.make_request.assert_not_called() + def test_embed_content_failure(self, mock_embed_topics_request, mock_response_exists): + response = self.assert_backend_call(mock_response_exists, None, False, + None, self.adapter.embed_content, + self.channel_id, + self.resources) + + self.assertIsNone(response) + + def extract_content_test_helper(self, mock_node, file_return_value, expected_result): + with patch('contentcuration.utils.recommendations.File.objects.filter', + return_value=file_return_value): + result = self.adapter.extract_content(mock_node) + self.assertEqual(result, expected_result) + + def test_extract_content(self): + mock_node = MagicMock(spec=ContentNode) + mock_node.node_id = '1234567890abcdef1234567890abcdef' + mock_node.title = 'Sample Title' + mock_node.description = 'Sample Description' + mock_node.language.lang_code = 'en' + mock_node.kind.kind = 'video' + + mock_file_instance = MagicMock() + mock_file_instance.file_on_disk = 'path/to/file.mp4' + mock_file_instance.preset_id = 'video_high_res' + mock_file_instance.language.lang_code = 'en' + + expected_result = { + "id": '1234567890abcdef1234567890abcdef', + "title": 'Sample Title', + "description": 'Sample Description', + "text": "", + "language": 'en', + "files": [ + { + 'url': 'path/to/file.mp4', + 'preset': 'video_high_res', + 'language': 'en', + } + ], + } + self.extract_content_test_helper(mock_node, [mock_file_instance], expected_result) + + def test_extract_content_no_files(self): + mock_node = MagicMock(spec=ContentNode) + mock_node.node_id = '1234567890abcdef1234567890abcdef' + mock_node.title = 'Sample Title' + mock_node.description = 'Sample Description' + mock_node.language.lang_code = 'en' + mock_node.kind.kind = 'video' + + expected_result = { + "id": '1234567890abcdef1234567890abcdef', + "title": 'Sample Title', + "description": 'Sample Description', + "text": "", + "language": 'en', + "files": [], + } + self.extract_content_test_helper(mock_node, [], expected_result) diff --git a/contentcuration/contentcuration/utils/recommendations.py b/contentcuration/contentcuration/utils/recommendations.py index 7748a7f622..2ea5f7b611 100644 --- a/contentcuration/contentcuration/utils/recommendations.py +++ b/contentcuration/contentcuration/utils/recommendations.py @@ -16,8 +16,8 @@ from django.db.models import F from django.db.models import OuterRef from django.db.models import Subquery -from django.db.models import Value -from django.db.models.functions import Replace +from django.db.models import UUIDField +from django.db.models.functions import Cast from kolibri_public.models import ContentNode as PublicContentNode from le_utils.constants import content_kinds from le_utils.constants import format_presets @@ -153,9 +153,8 @@ def cache_embeddings_request(self, request: BackendRequest, response: BackendRes try: nodes = self._extract_data(response) + valid_nodes = self._validate_nodes(nodes) request_hash = self._generate_request_hash(request) - existing_cache = set(RecommendationsCache.objects.filter(request_hash=request_hash) - .values_list('contentnode_id', flat=True)) override_threshold = self._extract_override_threshold(request) new_cache = [ RecommendationsCache( @@ -163,7 +162,7 @@ def cache_embeddings_request(self, request: BackendRequest, response: BackendRes contentnode_id=node['contentnode_id'], rank=node['rank'], override_threshold=override_threshold, - ) for node in nodes if node['contentnode_id'] not in existing_cache + ) for node in valid_nodes ] RecommendationsCache.objects.bulk_create(new_cache, ignore_conflicts=True) return True @@ -172,7 +171,7 @@ def cache_embeddings_request(self, request: BackendRequest, response: BackendRes return False def _extract_override_threshold(self, request) -> bool: - """ + """\ Extracts the override_threshold parameter from the request safely. :param request: The request containing the parameters. @@ -208,21 +207,25 @@ def get_recommendations(self, topic: Dict[str, Any], nodes = self._extract_data(response) if len(nodes) > 0: - node_ids = [node['contentnode_id'] for node in nodes] + node_ids = self._extract_node_ids(nodes) - # Get the channel_id from PublicContentNode based on matching node_id from ContentNode - channel_id_subquery = PublicContentNode.objects.filter( - self._normalize_uuid(F('id')) == self._normalize_uuid(OuterRef('node_id')) - ).values('channel_id')[:1] + # Get Channel.main_tree_id using the PublicContentNode.channel_id + channel_subquery = Channel.objects.annotate( + channel_id=self._cast_to_uuid(F('id')) + ).filter( + channel_id=OuterRef('channel_id') + ).values('main_tree_id')[:1] - # Get main_tree_id from Channel based on channel_id obtained from channel_id_subquery - main_tree_id_subquery = Channel.objects.filter( - self._normalize_uuid(F('id')) == self._normalize_uuid(Subquery(channel_id_subquery)) + # Get the PublicContentNode.channel_id based on ContentNode.node_id + public_contentnode_subquery = PublicContentNode.objects.filter( + id=self._cast_to_uuid(OuterRef('node_id')) + ).annotate( + main_tree_id=Subquery(channel_subquery) ).values('main_tree_id')[:1] - # Annotate main_tree_id onto ContentNode - recommendations = ContentNode.objects.filter(id__in=node_ids).annotate( - main_tree_id=Subquery(main_tree_id_subquery) + # Annotate `main_tree_id` onto ContentNode + recommendations = ContentNode.objects.filter(node_id__in=node_ids).annotate( + main_tree_id=Subquery(public_contentnode_subquery), ).values('id', 'node_id', 'main_tree_id', 'parent_id') return RecommendationsResponse(results=recommendations) @@ -246,16 +249,39 @@ def _extract_data(self, response: BackendResponse) -> List[Dict[str, Any]]: :return: The extracted data. :rtype: List[Dict[str, Any]] """ - return response.data if not response.data else [] + return response.data if response.data else [] + + def _validate_nodes(self, nodes: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """ + Validates the recommended nodes by checking if they exist in the database. + + :param nodes: The nodes to validate. + :return: A list of valid recommended nodes that exist in the database. + :rtype: List[Dict[str, Any]] + """ + node_ids = self._extract_node_ids(nodes) + existing_node_ids = set( + PublicContentNode.objects.filter(id__in=node_ids).values_list('id', flat=True)) + return [node for node in nodes if node['contentnode_id'] in existing_node_ids] + + def _extract_node_ids(self, nodes: List[Dict[str, Any]]) -> List[str]: + """ + Extracts the node IDs from the given nodes. + + :param nodes: The nodes from which to extract the node IDs. + :return: A list of node IDs. + :rtype: List[str] + """ + return [node['contentnode_id'] for node in nodes] - def _normalize_uuid(self, field): + def _cast_to_uuid(self, field): """ - Removes hyphens from a UUID field. + Casts the given field to a UUIDField. - :param field: The field (such as F() object or OuterRef) whose value needs normalization. - :return: The normalized field expression without hyphens. + :param field: The field (such as F() object or OuterRef) to cast. + :return: The field cast to a UUIDField. """ - return Replace(field, Value('-'), Value('')) + return Cast(field, output_field=UUIDField()) def embed_content(self, channel_id: str, nodes: List[Union[ContentNode, PublicContentNode]]) -> bool: From bd5c01896f2ae60bb14ccfc437e6356269cf25c7 Mon Sep 17 00:00:00 2001 From: Samson Akol Date: Wed, 7 Aug 2024 17:24:15 +0300 Subject: [PATCH 18/21] Fixes failing tests --- .../automation/tests/test_recommendations_cache_model.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/contentcuration/automation/tests/test_recommendations_cache_model.py b/contentcuration/automation/tests/test_recommendations_cache_model.py index 60caaca95e..9cb2e96c9a 100644 --- a/contentcuration/automation/tests/test_recommendations_cache_model.py +++ b/contentcuration/automation/tests/test_recommendations_cache_model.py @@ -18,7 +18,7 @@ def setUp(self): ) self.cache = RecommendationsCache.objects.create( request_hash='test_hash', - contentnode_id=self.content_node, + contentnode=self.content_node, rank=1.0, override_threshold=False ) @@ -26,7 +26,7 @@ def setUp(self): def test_cache_creation(self): self.assertIsInstance(self.cache, RecommendationsCache) self.assertEqual(self.cache.request_hash, 'test_hash') - self.assertEqual(self.cache.contentnode_id, self.content_node) + self.assertEqual(self.cache.contentnode, self.content_node) self.assertEqual(self.cache.rank, 1.0) self.assertFalse(self.cache.override_threshold) @@ -38,7 +38,7 @@ def test_cache_uniqueness(self): with self.assertRaises(IntegrityError): RecommendationsCache.objects.create( request_hash='test_hash', - contentnode_id=self.content_node, + contentnode=self.content_node, rank=2.0, override_threshold=True ) From fa5253b481bbe298f23de4bf0fc389c1e3873df4 Mon Sep 17 00:00:00 2001 From: Samson Akol Date: Wed, 7 Aug 2024 17:47:18 +0300 Subject: [PATCH 19/21] adds more tests --- .../contentcuration/tests/utils/test_recommendations.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/contentcuration/contentcuration/tests/utils/test_recommendations.py b/contentcuration/contentcuration/tests/utils/test_recommendations.py index c6e9796d4a..f63ee53668 100644 --- a/contentcuration/contentcuration/tests/utils/test_recommendations.py +++ b/contentcuration/contentcuration/tests/utils/test_recommendations.py @@ -231,6 +231,12 @@ def test_cache_embeddings_request_ignore_duplicates(self): ] self.cache_request_test_helper({'topic': 'new_test_topic_3'}, duplicate_data, 1) + def test_cache_embeddings_request_invalid_data(self): + invalid_data = [ + {'contentnode_id': '1234567890abcdef1234567890abcdee', 'rank': 0.6} + ] + self.cache_request_test_helper({'topic': 'new_test_topic_4'}, invalid_data, 0) + @patch('contentcuration.utils.recommendations.RecommendationsAdapter.cache_embeddings_request') @patch('contentcuration.utils.recommendations.RecommendationsAdapter.generate_embeddings') @patch('contentcuration.utils.recommendations.RecommendationsAdapter.response_exists') From cb2f47cf8fdb100cd3057f1cc29ad51f93685bb2 Mon Sep 17 00:00:00 2001 From: Samson Akol Date: Thu, 8 Aug 2024 18:25:10 +0300 Subject: [PATCH 20/21] cte implementation initial commit --- .../tests/utils/test_recommendations.py | 24 ++++----- .../contentcuration/utils/recommendations.py | 54 ++++++++++++------- 2 files changed, 46 insertions(+), 32 deletions(-) diff --git a/contentcuration/contentcuration/tests/utils/test_recommendations.py b/contentcuration/contentcuration/tests/utils/test_recommendations.py index f63ee53668..8cb64e5929 100644 --- a/contentcuration/contentcuration/tests/utils/test_recommendations.py +++ b/contentcuration/contentcuration/tests/utils/test_recommendations.py @@ -54,8 +54,8 @@ def setUpTestData(cls): json=cls.topic ) cls.api_response = BackendResponse(data=[ - {'contentnode_id': '1234567890abcdef1234567890abcdef', 'rank': 0.9}, - {'contentnode_id': 'abcdef1234567890abcdef1234567890', 'rank': 0.8} + {'contentnode_id': 'f0ab32ce7dee4ee8a2d8e3dc2cf8a4a3', 'rank': 0.9}, + {'contentnode_id': '98bd9283c7d24e02b338d7f52eabf9f6', 'rank': 0.8} ]) cls.recommendations_response = [ { @@ -82,7 +82,7 @@ def setUpTestData(cls): title='Main tree 1', content_id=uuid.uuid4(), node_id='e947222469504e789476cf3ffc5e3801', - kind=cls.content_kind + kind=cls.content_kind, ) ) cls.channel_2 = Channel.objects.create( @@ -93,36 +93,36 @@ def setUpTestData(cls): title='Main tree 2', content_id=uuid.uuid4(), node_id='f7547941d75d4712a53f566b4bf93250', - kind=cls.content_kind + kind=cls.content_kind, ) ) cls.public_content_node_1 = PublicContentNode.objects.create( - id=uuid.UUID('1234567890abcdef1234567890abcdef'), + id=uuid.UUID('f0ab32ce7dee4ee8a2d8e3dc2cf8a4a3'), title='Public Content Node 1', content_id=uuid.uuid4(), channel_id=uuid.UUID('ddec09d74e834241a580c480ee37879c'), ) cls.public_content_node_2 = PublicContentNode.objects.create( - id=uuid.UUID('abcdef1234567890abcdef1234567890'), + id=uuid.UUID('98bd9283c7d24e02b338d7f52eabf9f6'), title='Public Content Node 2', content_id=uuid.uuid4(), channel_id=uuid.UUID('84fcaec1e0514b62899d7f436384c401'), ) cls.content_node_1 = ContentNode.objects.create( - id='1234567890abcdef1234567890abcdef', + id='3c4d5847dd2f45568ce9ecd53e843a76', title='Content Node 1', content_id=uuid.uuid4(), - node_id='1234567890abcdef1234567890abcdef', - kind=cls.content_kind + node_id='f0ab32ce7dee4ee8a2d8e3dc2cf8a4a3', + kind=cls.content_kind, ) cls.content_node_2 = ContentNode.objects.create( - id='abcdef1234567890abcdef1234567890', + id='b41b9289c1e84ea4869a8fbbf85b9a15', title='Content Node 2', content_id=uuid.uuid4(), - node_id='abcdef1234567890abcdef1234567890', - kind=cls.content_kind + node_id='98bd9283c7d24e02b338d7f52eabf9f6', + kind=cls.content_kind, ) cls.cache = RecommendationsCache.objects.create( diff --git a/contentcuration/contentcuration/utils/recommendations.py b/contentcuration/contentcuration/utils/recommendations.py index 2ea5f7b611..be2daae66d 100644 --- a/contentcuration/contentcuration/utils/recommendations.py +++ b/contentcuration/contentcuration/utils/recommendations.py @@ -1,6 +1,7 @@ import hashlib import json import logging +import uuid from typing import Any from typing import Dict from typing import List @@ -13,11 +14,12 @@ from automation.utils.appnexus.base import BackendFactory from automation.utils.appnexus.base import BackendRequest from automation.utils.appnexus.base import BackendResponse +from django.db.models import Exists from django.db.models import F from django.db.models import OuterRef -from django.db.models import Subquery from django.db.models import UUIDField from django.db.models.functions import Cast +from django_cte import With from kolibri_public.models import ContentNode as PublicContentNode from le_utils.constants import content_kinds from le_utils.constants import format_presets @@ -208,25 +210,37 @@ def get_recommendations(self, topic: Dict[str, Any], nodes = self._extract_data(response) if len(nodes) > 0: node_ids = self._extract_node_ids(nodes) - - # Get Channel.main_tree_id using the PublicContentNode.channel_id - channel_subquery = Channel.objects.annotate( - channel_id=self._cast_to_uuid(F('id')) - ).filter( - channel_id=OuterRef('channel_id') - ).values('main_tree_id')[:1] - - # Get the PublicContentNode.channel_id based on ContentNode.node_id - public_contentnode_subquery = PublicContentNode.objects.filter( - id=self._cast_to_uuid(OuterRef('node_id')) - ).annotate( - main_tree_id=Subquery(channel_subquery) - ).values('main_tree_id')[:1] - - # Annotate `main_tree_id` onto ContentNode - recommendations = ContentNode.objects.filter(node_id__in=node_ids).annotate( - main_tree_id=Subquery(public_contentnode_subquery), - ).values('id', 'node_id', 'main_tree_id', 'parent_id') + cast_node_ids = [uuid.UUID(node_id) for node_id in node_ids] + + channel_cte = With( + Channel.objects.annotate( + channel_id=self._cast_to_uuid(F('id')) + ).filter( + Exists( + PublicContentNode.objects.filter( + id__in=cast_node_ids, + channel_id=OuterRef('channel_id') + ) + ) + ).values( + 'main_tree_id', + tree_id=F('main_tree__tree_id'), + ).distinct() + ) + print(list(Channel.objects.all())) + + recommendations = channel_cte.join( + ContentNode.objects.filter(node_id__in=node_ids), + tree_id=channel_cte.col.tree_id + ).with_cte(channel_cte) .annotate( + main_tree_id=channel_cte.col.main_tree_id + ).values( + 'id', + 'node_id', + 'main_tree_id', + 'parent_id', + ) + print(list(recommendations)) return RecommendationsResponse(results=recommendations) From 902875eb55228831e87a3dbe4f734bce64490479 Mon Sep 17 00:00:00 2001 From: Samson Akol Date: Sat, 10 Aug 2024 02:01:11 +0300 Subject: [PATCH 21/21] Implements CTE to get recommendations --- .../tests/utils/test_recommendations.py | 96 +++++++------------ .../contentcuration/utils/recommendations.py | 4 +- 2 files changed, 34 insertions(+), 66 deletions(-) diff --git a/contentcuration/contentcuration/tests/utils/test_recommendations.py b/contentcuration/contentcuration/tests/utils/test_recommendations.py index 8cb64e5929..92c730cded 100644 --- a/contentcuration/contentcuration/tests/utils/test_recommendations.py +++ b/contentcuration/contentcuration/tests/utils/test_recommendations.py @@ -9,9 +9,9 @@ from mock import MagicMock from mock import patch -from contentcuration.models import Channel -from contentcuration.models import ContentKind from contentcuration.models import ContentNode +from contentcuration.tests import testdata +from contentcuration.tests.base import StudioTestCase from contentcuration.utils.recommendations import EmbeddingsResponse from contentcuration.utils.recommendations import EmbedTopicsRequest from contentcuration.utils.recommendations import Recommendations @@ -26,7 +26,8 @@ def test_backend_initialization(self): self.assertIsInstance(recommendations, Recommendations) -class RecommendationsAdapterTestCase(TestCase): +class RecommendationsAdapterTestCase(StudioTestCase): + @classmethod def setUpTestData(cls): cls.adapter = RecommendationsAdapter(MagicMock()) @@ -54,77 +55,25 @@ def setUpTestData(cls): json=cls.topic ) cls.api_response = BackendResponse(data=[ - {'contentnode_id': 'f0ab32ce7dee4ee8a2d8e3dc2cf8a4a3', 'rank': 0.9}, - {'contentnode_id': '98bd9283c7d24e02b338d7f52eabf9f6', 'rank': 0.8} + {'contentnode_id': '1234567890abcdef1234567890abcdef', 'rank': 0.9}, + {'contentnode_id': 'abcdef1234567890abcdef1234567890', 'rank': 0.8}, + {'contentnode_id': '00000000000000000000000000000003', 'rank': 0.8}, + {'contentnode_id': '00000000000000000000000000000005', 'rank': 0.8}, ]) - cls.recommendations_response = [ - { - 'id': '1234567890abcdef1234567890abcdef', - 'node_id': '1234567890abcdef1234567890abcdef', - 'parent_id': None, - 'main_tree_id': '4387374055304864a731f3e705d64639' - }, - { - 'id': 'abcdef1234567890abcdef1234567890', - 'node_id': 'abcdef1234567890abcdef1234567890', - 'parent_id': None, - 'main_tree_id': '0548a548dda8487b8ac2f81145737cfc' - } - ] - - cls.content_kind = ContentKind.objects.create(kind='topic') - - cls.channel_1 = Channel.objects.create( - id='ddec09d74e834241a580c480ee37879c', - name='Channel 1', - main_tree=ContentNode.objects.create( - id='4387374055304864a731f3e705d64639', - title='Main tree 1', - content_id=uuid.uuid4(), - node_id='e947222469504e789476cf3ffc5e3801', - kind=cls.content_kind, - ) - ) - cls.channel_2 = Channel.objects.create( - id='84fcaec1e0514b62899d7f436384c401', - name='Channel 2', - main_tree=ContentNode.objects.create( - id='0548a548dda8487b8ac2f81145737cfc', - title='Main tree 2', - content_id=uuid.uuid4(), - node_id='f7547941d75d4712a53f566b4bf93250', - kind=cls.content_kind, - ) - ) cls.public_content_node_1 = PublicContentNode.objects.create( - id=uuid.UUID('f0ab32ce7dee4ee8a2d8e3dc2cf8a4a3'), + id=uuid.UUID('1234567890abcdef1234567890abcdef'), title='Public Content Node 1', content_id=uuid.uuid4(), channel_id=uuid.UUID('ddec09d74e834241a580c480ee37879c'), ) cls.public_content_node_2 = PublicContentNode.objects.create( - id=uuid.UUID('98bd9283c7d24e02b338d7f52eabf9f6'), + id=uuid.UUID('abcdef1234567890abcdef1234567890'), title='Public Content Node 2', content_id=uuid.uuid4(), channel_id=uuid.UUID('84fcaec1e0514b62899d7f436384c401'), ) - cls.content_node_1 = ContentNode.objects.create( - id='3c4d5847dd2f45568ce9ecd53e843a76', - title='Content Node 1', - content_id=uuid.uuid4(), - node_id='f0ab32ce7dee4ee8a2d8e3dc2cf8a4a3', - kind=cls.content_kind, - ) - cls.content_node_2 = ContentNode.objects.create( - id='b41b9289c1e84ea4869a8fbbf85b9a15', - title='Content Node 2', - content_id=uuid.uuid4(), - node_id='98bd9283c7d24e02b338d7f52eabf9f6', - kind=cls.content_kind, - ) - cls.cache = RecommendationsCache.objects.create( request_hash=cls.adapter._generate_request_hash(cls.request), contentnode=cls.public_content_node_1, @@ -216,6 +165,7 @@ def cache_request_test_helper(self, request_json, response_data, expected_count) cached_items = RecommendationsCache.objects.filter( request_hash=self.adapter._generate_request_hash(new_request) ) + print(list(cached_items.values('request_hash', 'contentnode', 'rank', 'override_threshold'))) self.assertEqual(cached_items.count(), expected_count) def test_cache_embeddings_request_success(self): @@ -243,6 +193,23 @@ def test_cache_embeddings_request_invalid_data(self): @patch('contentcuration.utils.recommendations.EmbedTopicsRequest') def test_get_recommendations_success(self, mock_embed_topics_request, mock_response_exists, mock_generate_embeddings, mock_cache_embeddings_request): + channel = testdata.channel('Public Channel') + channel.public = True + channel.save() + + public_node_1 = PublicContentNode.objects.create( + id=uuid.UUID('00000000000000000000000000000003'), + title='Video 1', + content_id=uuid.uuid4(), + channel_id=uuid.UUID(channel.id), + ) + public_node_2 = PublicContentNode.objects.create( + id=uuid.UUID('00000000000000000000000000000005'), + title='Exercise 1', + content_id=uuid.uuid4(), + channel_id=uuid.UUID(channel.id), + ) + mock_response_exists.return_value = None mock_response = MagicMock(spec=EmbeddingsResponse) mock_response.data = copy.deepcopy(self.api_response.data) @@ -250,12 +217,15 @@ def test_get_recommendations_success(self, mock_embed_topics_request, mock_respo mock_generate_embeddings.return_value = mock_response response = self.adapter.get_recommendations(self.topic) + results = list(response.results) + expected_node_ids = [public_node_1.id.hex, public_node_2.id.hex] + actual_node_ids = [result["node_id"] for result in results] mock_response_exists.assert_called_once() mock_generate_embeddings.assert_called_once() self.assertIsInstance(response, RecommendationsResponse) - self.assertListEqual(list(response.results), self.recommendations_response) - self.assertEqual(len(response.results), 2) + self.assertListEqual(expected_node_ids, actual_node_ids) + self.assertEqual(len(results), 2) @patch('contentcuration.utils.recommendations.RecommendationsAdapter._extract_data') @patch('contentcuration.utils.recommendations.RecommendationsAdapter.response_exists') diff --git a/contentcuration/contentcuration/utils/recommendations.py b/contentcuration/contentcuration/utils/recommendations.py index be2daae66d..20c6cb05f2 100644 --- a/contentcuration/contentcuration/utils/recommendations.py +++ b/contentcuration/contentcuration/utils/recommendations.py @@ -227,12 +227,11 @@ def get_recommendations(self, topic: Dict[str, Any], tree_id=F('main_tree__tree_id'), ).distinct() ) - print(list(Channel.objects.all())) recommendations = channel_cte.join( ContentNode.objects.filter(node_id__in=node_ids), tree_id=channel_cte.col.tree_id - ).with_cte(channel_cte) .annotate( + ).with_cte(channel_cte).annotate( main_tree_id=channel_cte.col.main_tree_id ).values( 'id', @@ -240,7 +239,6 @@ def get_recommendations(self, topic: Dict[str, Any], 'main_tree_id', 'parent_id', ) - print(list(recommendations)) return RecommendationsResponse(results=recommendations)