Skip to content

Commit

Permalink
Merge pull request #738 from kobotoolbox/improve-metadata-validations
Browse files Browse the repository at this point in the history
Improve metadata content type validation
  • Loading branch information
joshuaberetta authored Jun 18, 2021
2 parents 00bbe8b + 00e0204 commit 740df02
Show file tree
Hide file tree
Showing 7 changed files with 106 additions and 32 deletions.
1 change: 0 additions & 1 deletion dependencies/pip/dev.in
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
ipdb
ipython
shell_command
Werkzeug
sqlparse
pytest
pytest-django
Expand Down
2 changes: 1 addition & 1 deletion dependencies/pip/dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -344,7 +344,7 @@ wcwidth==0.2.5
# prompt-toolkit
# pytest
werkzeug==1.0.1
# via -r dependencies/pip/dev.in
# via -r dependencies/pip/requirements.in
xlrd==1.2.0
# via
# -r dependencies/pip/requirements.in
Expand Down
2 changes: 2 additions & 0 deletions dependencies/pip/prod.txt
Original file line number Diff line number Diff line change
Expand Up @@ -270,6 +270,8 @@ vine==1.3.0
# via
# amqp
# celery
werkzeug==2.0.1
# via -r dependencies/pip/requirements.in
xlrd==1.2.0
# via
# -r dependencies/pip/requirements.in
Expand Down
3 changes: 3 additions & 0 deletions dependencies/pip/requirements.in
Original file line number Diff line number Diff line change
Expand Up @@ -93,3 +93,6 @@ boto3

# Sentry
sentry-sdk

# mimetype detection
Werkzeug
2 changes: 2 additions & 0 deletions dependencies/pip/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -268,6 +268,8 @@ vine==1.3.0
# via
# amqp
# celery
werkzeug==2.0.1
# via -r dependencies/pip/requirements.in
xlrd==1.2.0
# via
# -r dependencies/pip/requirements.in
Expand Down
8 changes: 6 additions & 2 deletions onadata/apps/api/tests/viewsets/test_metadata_viewset.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,8 +108,12 @@ def test_add_invalid_media_url(self):
}
response = self._post_form_metadata(data, False)
self.assertEqual(response.status_code, 400)
error = {"non_field_errors": ["Invalid url %s." % data['data_value']]}
self.assertEqual(response.data, error)
error = {'data_value': [f"Invalid url {data['data_value']}"]}
self.assertTrue('data_value' in response.data)
error_details = [
str(error_detail) for error_detail in response.data['data_value']
]
self.assertEqual(error_details, error['data_value'])

def test_invalid_post(self):
response = self._post_form_metadata({}, False)
Expand Down
120 changes: 92 additions & 28 deletions onadata/libs/serializers/metadata_serializer.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,14 @@
# coding: utf-8
import mimetypes
import os
from urllib.parse import urlparse

import requests
from django.conf import settings
from django.core.validators import URLValidator
from django.utils.translation import ugettext as _
from rest_framework import serializers
from werkzeug.http import parse_options_header

from onadata.apps.main.models.meta_data import MetaData
from onadata.apps.logger.models import XForm
Expand All @@ -27,7 +31,6 @@ class MetaDataSerializer(serializers.HyperlinkedModelSerializer):
required=False)
data_type = serializers.ChoiceField(choices=METADATA_TYPES)
data_file = serializers.FileField(required=False)
data_file_type = serializers.CharField(max_length=255, required=False)
from_kpi = serializers.BooleanField(required=False)

class Meta:
Expand All @@ -43,28 +46,20 @@ class Meta:
'url',
'from_kpi',
)
read_only_fields = (
'id',
'data_file_type',
)

# was previously validate_data_value but the signature change in DRF3.
def validate(self, attrs):
"""
Ensure we have a valid url if we are adding a media uri
instead of a media file
"""
value = attrs.get('data_value')
media = attrs.get('data_type')
data_file = attrs.get('data_file')
data_file_type = attrs.get('data_file_type')

if media == 'media' and data_file is None:
URLValidator(message=_("Invalid url %s." % value))(value)

if value is None:
msg = {'data_value': _('This field is required.')}
raise serializers.ValidationError(msg)
self._validate_data_value(attrs)
self._validate_data_file_type(attrs)

attrs['data_file_type'] = self._validate_data_file_type(
data_file_type=data_file_type, data_file=data_file, data_value=value
)
return super().validate(attrs)

def validate_xform(self, xform):
Expand Down Expand Up @@ -101,23 +96,92 @@ def create(self, validated_data):
from_kpi=from_kpi,
)

def _validate_data_file_type(self, data_file_type, data_file, data_value):
def _validate_data_value(self, attrs):
if self.instance and self.instance.pk:
# ToDo , Should we allow PATCHing the object with different file?
attrs['data_file'] = attrs.get(
'data_file', self.instance.data_file
)
attrs['data_type'] = attrs.get(
'data_type', self.instance.data_type
)
try:
attrs['data_value']
except KeyError:
attrs['data_value'] = self.instance.data_value
return attrs

data_value = (
data_file.name if data_file else data_value
)
allowed_types = settings.SUPPORTED_MEDIA_UPLOAD_TYPES
data_value = attrs.get('data_value')

if attrs.get('data_type') == 'media' and attrs.get('data_file') is None:
message = {'data_value': _('Invalid url {}').format(data_value)}
URLValidator(message=message)(data_value)

if not data_file_type:
data_file_type = (
data_file.content_type
if data_file and data_file.content_type in allowed_types
else mimetypes.guess_type(data_value)[0]
if not data_value:
raise serializers.ValidationError(
{'data_value': _('This field is required.')}
)

if data_file_type not in allowed_types:
return attrs

def _validate_data_file_type(self, attrs):

allowed_types = settings.SUPPORTED_MEDIA_UPLOAD_TYPES

data_file = attrs.get('data_file') # This is could be `None`
data_value = attrs['data_value']

if data_value.lower().startswith('http'):
# If `data_value` is a URL but we cannot get the filename from it,
# let's try from the headers
parsed_url = urlparse(data_value)
filename, extension = os.path.splitext(
os.path.basename(parsed_url.path)
)
if not extension:
try:
# `stream=True` makes `requests` to not download the whole
# file until `response.content` is called.
# Useful to get 'Content-Disposition' header.
response = requests.get(data_value, stream=True)
response.raise_for_status()
except requests.exceptions.RequestException:
response.close()
raise serializers.ValidationError({
{'data_file_type': _('Cannot determine content type')}
})
else:
response.close()

try:
filename_from_header = parse_options_header(
response.headers['Content-Disposition']
)
except KeyError:
raise serializers.ValidationError({
{'data_file_type': _('Cannot determine content type')}
})

try:
data_value = filename_from_header[1]['filename']
except (TypeError, IndexError, KeyError):
raise serializers.ValidationError({
{'data_file_type': _('Cannot determine content type')}
})
else:
# In case the url contains a querystring, let's rebuild the
# `data_value` within this scope to detect its mimetype
data_value = f'{filename}{extension}'

# Used to rely on `data_file.content_type`. Unfortunately, when object
# is PATCHed, `data_file` is not a `InMemoryUploadedFile` object anymore
# but a `FieldFile` object which doesn't have a `content_type` attribute
filename = data_file.name if data_file else data_value
attrs['data_file_type'] = mimetypes.guess_type(filename)[0]

if attrs['data_file_type'] not in allowed_types:
raise serializers.ValidationError(
{'data_file_type': _('Invalid content type.')}
{'data_file_type': _('Invalid content type')}
)

return data_file_type
return attrs

0 comments on commit 740df02

Please sign in to comment.