From ae83f5d9c23cf5c9bdc8b2f06fcb966c22bb7849 Mon Sep 17 00:00:00 2001 From: Olivier Leger Date: Thu, 17 Jun 2021 15:51:15 -0400 Subject: [PATCH 1/3] Allow partial `PATCH` on metadata objects, fix content type detection when PATCHing existing objects --- dependencies/pip/dev.in | 1 - dependencies/pip/dev.txt | 2 +- dependencies/pip/prod.txt | 2 + dependencies/pip/requirements.in | 3 + dependencies/pip/requirements.txt | 2 + .../libs/serializers/metadata_serializer.py | 119 +++++++++++++----- 6 files changed, 99 insertions(+), 30 deletions(-) diff --git a/dependencies/pip/dev.in b/dependencies/pip/dev.in index 8bf348510..4259d3b95 100644 --- a/dependencies/pip/dev.in +++ b/dependencies/pip/dev.in @@ -3,7 +3,6 @@ ipdb ipython shell_command -Werkzeug sqlparse pytest pytest-django diff --git a/dependencies/pip/dev.txt b/dependencies/pip/dev.txt index d9913d66b..6e0f07500 100644 --- a/dependencies/pip/dev.txt +++ b/dependencies/pip/dev.txt @@ -344,7 +344,7 @@ wcwidth==0.2.5 # prompt-toolkit # pytest werkzeug==1.0.1 - # via -r dependencies/pip/dev.in + # via -r dependencies/pip/requirements.in xlrd==1.2.0 # via # -r dependencies/pip/requirements.in diff --git a/dependencies/pip/prod.txt b/dependencies/pip/prod.txt index 2ac0260ab..efd91a010 100644 --- a/dependencies/pip/prod.txt +++ b/dependencies/pip/prod.txt @@ -270,6 +270,8 @@ vine==1.3.0 # via # amqp # celery +werkzeug==2.0.1 + # via -r dependencies/pip/requirements.in xlrd==1.2.0 # via # -r dependencies/pip/requirements.in diff --git a/dependencies/pip/requirements.in b/dependencies/pip/requirements.in index f1e4691ea..72d9dd220 100644 --- a/dependencies/pip/requirements.in +++ b/dependencies/pip/requirements.in @@ -93,3 +93,6 @@ boto3 # Sentry sentry-sdk + +# mimetype detection +Werkzeug diff --git a/dependencies/pip/requirements.txt b/dependencies/pip/requirements.txt index 745f4ba93..f349a0fd4 100644 --- a/dependencies/pip/requirements.txt +++ b/dependencies/pip/requirements.txt @@ -268,6 +268,8 @@ vine==1.3.0 # via # amqp # celery +werkzeug==2.0.1 + # via -r dependencies/pip/requirements.in xlrd==1.2.0 # via # -r dependencies/pip/requirements.in diff --git a/onadata/libs/serializers/metadata_serializer.py b/onadata/libs/serializers/metadata_serializer.py index 07be4e3f5..cd04ecae5 100644 --- a/onadata/libs/serializers/metadata_serializer.py +++ b/onadata/libs/serializers/metadata_serializer.py @@ -1,10 +1,14 @@ # coding: utf-8 import mimetypes +import os +from urllib.parse import urlparse +import requests from django.conf import settings from django.core.validators import URLValidator from django.utils.translation import ugettext as _ from rest_framework import serializers +from werkzeug.http import parse_options_header from onadata.apps.main.models.meta_data import MetaData from onadata.apps.logger.models import XForm @@ -27,7 +31,6 @@ class MetaDataSerializer(serializers.HyperlinkedModelSerializer): required=False) data_type = serializers.ChoiceField(choices=METADATA_TYPES) data_file = serializers.FileField(required=False) - data_file_type = serializers.CharField(max_length=255, required=False) from_kpi = serializers.BooleanField(required=False) class Meta: @@ -43,6 +46,10 @@ class Meta: 'url', 'from_kpi', ) + read_only_fields = ( + 'id', + 'data_file_type', + ) # was previously validate_data_value but the signature change in DRF3. def validate(self, attrs): @@ -50,21 +57,9 @@ def validate(self, attrs): Ensure we have a valid url if we are adding a media uri instead of a media file """ - value = attrs.get('data_value') - media = attrs.get('data_type') - data_file = attrs.get('data_file') - data_file_type = attrs.get('data_file_type') - - if media == 'media' and data_file is None: - URLValidator(message=_("Invalid url %s." % value))(value) + self._validate_data_value(attrs) + self._validate_data_file_type(attrs) - if value is None: - msg = {'data_value': _('This field is required.')} - raise serializers.ValidationError(msg) - - attrs['data_file_type'] = self._validate_data_file_type( - data_file_type=data_file_type, data_file=data_file, data_value=value - ) return super().validate(attrs) def validate_xform(self, xform): @@ -101,23 +96,91 @@ def create(self, validated_data): from_kpi=from_kpi, ) - def _validate_data_file_type(self, data_file_type, data_file, data_value): + def _validate_data_value(self, attrs): + if self.instance and self.instance.pk: + # ToDo , Should we allow PATCHing the object with different file? + attrs['data_file'] = attrs.get( + 'data_file', self.instance.data_file + ) + attrs['data_type'] = attrs.get( + 'data_type', self.instance.data_type + ) + try: + attrs['data_value'] + except KeyError: + attrs['data_value'] = self.instance.data_value + return attrs - data_value = ( - data_file.name if data_file else data_value - ) - allowed_types = settings.SUPPORTED_MEDIA_UPLOAD_TYPES + data_value = attrs.get('data_value') + + if attrs.get('data_type') == 'media' and attrs.get('data_file') is None: + URLValidator(message=_("Invalid url %s." % data_value))(data_value) - if not data_file_type: - data_file_type = ( - data_file.content_type - if data_file and data_file.content_type in allowed_types - else mimetypes.guess_type(data_value)[0] + if not data_value: + raise serializers.ValidationError( + {'data_value': _('This field is required.')} ) - if data_file_type not in allowed_types: + return attrs + + def _validate_data_file_type(self, attrs): + + allowed_types = settings.SUPPORTED_MEDIA_UPLOAD_TYPES + + data_file = attrs.get('data_file') + data_value = attrs.get('data_value') + + if data_value.lower().startswith('http'): + # If `data_value` is a URL but we cannot get the filename from it, + # let's try from the headers + parsed_url = urlparse(data_value) + filename, extension = os.path.splitext( + os.path.basename(os.path.basename(parsed_url.path)) + ) + if not extension: + try: + # `stream=True` makes `requests` to not download the whole + # file until `response.content` is called. + # Useful to get 'Content-Disposition' header. + response = requests.get(data_value, stream=True) + response.raise_for_status() + except requests.exceptions.RequestException: + response.close() + raise serializers.ValidationError({ + {'data_file_type': _('Cannot determine content type')} + }) + else: + response.close() + + try: + filename_from_header = parse_options_header( + response.headers['Content-Disposition'] + ) + except KeyError: + raise serializers.ValidationError({ + {'data_file_type': _('Cannot determine content type')} + }) + + try: + data_value = filename_from_header[1]['filename'] + except (TypeError, IndexError, KeyError): + raise serializers.ValidationError({ + {'data_file_type': _('Cannot determine content type')} + }) + else: + # In case the url contains a querystring, let's rebuild the + # `data_value` within this scope to detect its mimetype + data_value = f'{filename}{extension}' + + # Used to rely on `data_file.content_type`. Unfortunately, when object + # is PATCHed, `data_file` is not a `InMemoryUploadedFile` object anymore + # but a `FieldFile` object which doesn't have a `content_type` attribute + filename = data_file.name if data_file else data_value + attrs['data_file_type'] = mimetypes.guess_type(filename)[0] + + if attrs['data_file_type'] not in allowed_types: raise serializers.ValidationError( - {'data_file_type': _('Invalid content type.')} + {'data_file_type': _('Invalid content type')} ) - return data_file_type + return attrs From 4deb7ec141d6cf07c99e1d17896f7f989c6df5bb Mon Sep 17 00:00:00 2001 From: Olivier Leger Date: Fri, 18 Jun 2021 10:16:41 -0400 Subject: [PATCH 2/3] Apply requested changes for PR#738 --- onadata/libs/serializers/metadata_serializer.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/onadata/libs/serializers/metadata_serializer.py b/onadata/libs/serializers/metadata_serializer.py index cd04ecae5..c6471f916 100644 --- a/onadata/libs/serializers/metadata_serializer.py +++ b/onadata/libs/serializers/metadata_serializer.py @@ -114,7 +114,8 @@ def _validate_data_value(self, attrs): data_value = attrs.get('data_value') if attrs.get('data_type') == 'media' and attrs.get('data_file') is None: - URLValidator(message=_("Invalid url %s." % data_value))(data_value) + message = {'data_value': _('Invalid url {}').format(data_value)} + URLValidator(message=message)(data_value) if not data_value: raise serializers.ValidationError( @@ -127,15 +128,15 @@ def _validate_data_file_type(self, attrs): allowed_types = settings.SUPPORTED_MEDIA_UPLOAD_TYPES - data_file = attrs.get('data_file') - data_value = attrs.get('data_value') + data_file = attrs.get('data_file') # This is could be `None` + data_value = attrs['data_value'] if data_value.lower().startswith('http'): # If `data_value` is a URL but we cannot get the filename from it, # let's try from the headers parsed_url = urlparse(data_value) filename, extension = os.path.splitext( - os.path.basename(os.path.basename(parsed_url.path)) + os.path.basename(parsed_url.path) ) if not extension: try: From 00e02048c9a3ae9f18687f71a47f38af115402fb Mon Sep 17 00:00:00 2001 From: Olivier Leger Date: Fri, 18 Jun 2021 13:04:32 -0400 Subject: [PATCH 3/3] Fixed "test_add_invalid_media_url" unit test --- onadata/apps/api/tests/viewsets/test_metadata_viewset.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/onadata/apps/api/tests/viewsets/test_metadata_viewset.py b/onadata/apps/api/tests/viewsets/test_metadata_viewset.py index 126d063b9..b7eb08e92 100644 --- a/onadata/apps/api/tests/viewsets/test_metadata_viewset.py +++ b/onadata/apps/api/tests/viewsets/test_metadata_viewset.py @@ -108,8 +108,12 @@ def test_add_invalid_media_url(self): } response = self._post_form_metadata(data, False) self.assertEqual(response.status_code, 400) - error = {"non_field_errors": ["Invalid url %s." % data['data_value']]} - self.assertEqual(response.data, error) + error = {'data_value': [f"Invalid url {data['data_value']}"]} + self.assertTrue('data_value' in response.data) + error_details = [ + str(error_detail) for error_detail in response.data['data_value'] + ] + self.assertEqual(error_details, error['data_value']) def test_invalid_post(self): response = self._post_form_metadata({}, False)