From de8b8a6ac1dda89926f28b1865f132b352903b42 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phan=20Mestach?= Date: Wed, 4 Sep 2024 15:46:36 +0200 Subject: [PATCH 1/2] IA-3401 enketo emoji support --- iaso/enketo/enketo_xml.py | 8 +++++++- iaso/models/base.py | 7 +++++-- iaso/tests/enketo/test_enketo_lib.py | 7 +++++++ iaso/tests/fixtures/submission_with_emoji.xml | 1 + iaso/tests/models/test_instance.py | 14 ++++++++++++++ iaso/utils/emoji.py | 17 +++++++++++++++++ 6 files changed, 51 insertions(+), 3 deletions(-) create mode 100644 iaso/tests/fixtures/submission_with_emoji.xml create mode 100644 iaso/utils/emoji.py diff --git a/iaso/enketo/enketo_xml.py b/iaso/enketo/enketo_xml.py index f68b8ed1ae..e106c6a713 100644 --- a/iaso/enketo/enketo_xml.py +++ b/iaso/enketo/enketo_xml.py @@ -1,6 +1,10 @@ from typing import Tuple +import re from lxml import etree # type: ignore +from lxml.etree import XMLParser + +from iaso.utils.emoji import fix_emoji ENKETO_FORM_ID_SEPARATOR = "-" @@ -80,10 +84,12 @@ def inject_xml_find_uuid(instance_xml, instance_id, version_id, user_id) -> Tupl """ "Inject the attribute in different place in the xml Return the uuid found in the xml """ + # use custom parser to match the recover flag as in beautifulsoup while flattening + lxml_parser = XMLParser(huge_tree=True, recover=True) xml_str = instance_xml.decode("utf-8") # Get the instanceID (uuid) from the //meta/instanceID # We have an uuid on instance. but it seems not always filled? - root = etree.fromstring(xml_str) + root = etree.fromstring(fix_emoji(xml_str), parser=lxml_parser) instance_id_tag = root.find(".//meta/instanceID") instance_uuid = instance_id_tag.text.replace("uuid:", "") # type: ignore diff --git a/iaso/models/base.py b/iaso/models/base.py index 8566e9e3a0..a3d8d5ec1c 100644 --- a/iaso/models/base.py +++ b/iaso/models/base.py @@ -44,6 +44,7 @@ from ..utils.models.common import get_creator_name from .device import Device, DeviceOwnership from .forms import Form, FormVersion +from ..utils.emoji import fix_emoji logger = getLogger(__name__) @@ -1046,8 +1047,10 @@ def convert_correlation(self): self.save() def xml_file_to_json(self, file: typing.IO) -> typing.Dict[str, typing.Any]: - copy_io_utf8 = StringIO(file.read().decode("utf-8")) - soup = Soup(copy_io_utf8, "xml", from_encoding="utf-8") + raw_content = file.read().decode("utf-8") + fixed_content = fix_emoji(raw_content).decode("utf-8") + copy_io_utf8 = StringIO(fixed_content) + soup = Soup(copy_io_utf8, "lxml-xml", from_encoding="utf-8") form_version_id = extract_form_version_id(soup) if form_version_id: diff --git a/iaso/tests/enketo/test_enketo_lib.py b/iaso/tests/enketo/test_enketo_lib.py index a9eb7b91c0..ea1b8ddc90 100644 --- a/iaso/tests/enketo/test_enketo_lib.py +++ b/iaso/tests/enketo/test_enketo_lib.py @@ -23,6 +23,13 @@ def test_inject_userid_update_tag_text_if_present(self): self.assertEqual(str(xml), str(expectedInjected)) self.assertEqual(uuid, "demo") + def test_inject_user_id_with_emoji_content(self): + original_xml = b'uuid:demo546��Yellow' + uuid, xml = inject_xml_find_uuid(original_xml, 123, 2012010601, 977) + expectedInjected = b'uuid:demo977' + self.assertEqual(str(xml), str(expectedInjected)) + self.assertEqual(uuid, "demo") + def test_to_xforms_xml(self): form = m.Form.objects.create(name="name < with entity", form_id="odk_form_id") m.FormVersion.objects.create(form=form, version_id="2012010601") diff --git a/iaso/tests/fixtures/submission_with_emoji.xml b/iaso/tests/fixtures/submission_with_emoji.xml new file mode 100644 index 0000000000..eb1111a5bc --- /dev/null +++ b/iaso/tests/fixtures/submission_with_emoji.xml @@ -0,0 +1 @@ +2024-07-01T00:00:00.000+02:00yesTest 1_2Test 1_2Test 1_2years2525nopregnant19.0000��YellowTSFP2024-07-15yesyes0000yes000260124Namuruuid:61edabd0-4759-4560-b0d7-bedd1be38675 \ No newline at end of file diff --git a/iaso/tests/models/test_instance.py b/iaso/tests/models/test_instance.py index 5e16d95f37..17640f3278 100644 --- a/iaso/tests/models/test_instance.py +++ b/iaso/tests/models/test_instance.py @@ -194,6 +194,20 @@ def assertStatusIs(self, instance: m.Instance, status: str): instance_with_status = m.Instance.objects.with_status().get(pk=instance.pk) self.assertEqual(instance_with_status.status, status) + def test_xml_to_json_should_contains_emoji(self): + self.maxDiff = None + instance = m.Instance.objects.create( + form=self.form_1, + period="202001", + org_unit=self.jedi_council_coruscant, + file=UploadedFile(open("iaso/tests/fixtures/submission_with_emoji.xml")), + ) + json_instance = instance.get_and_save_json_of_xml() + + self.assertEqual(json_instance["_version"], "2024080903") + # assert flattened and lowered case keys + self.assertEqual(json_instance["prevous_muac_color"], "🟡Yellow") + def test_xml_to_json_should_contains_chars_encoding(self): instance = m.Instance.objects.create( form=self.form_1, diff --git a/iaso/utils/emoji.py b/iaso/utils/emoji.py new file mode 100644 index 0000000000..6af09f59d4 --- /dev/null +++ b/iaso/utils/emoji.py @@ -0,0 +1,17 @@ +import re + + +def fix_emoji(payload): + chrifnotspecial = lambda dec: ( + "&#%d;" % dec if dec in [10, 13, 35, 38, 59, 60, 62] else chr(dec) + ) # don't convert `\n\r#&;<>` + payload = re.sub(r"&#(\d+);", lambda x: chrifnotspecial(int(x.group(1))), payload) + # No idea why 'INFORMATION SEPARATOR's ended up in some messages, + # but I decide that I don't need them, and they make the parser barf out... + for dec in [28, 29, 30, 31]: + payload = payload.replace(chr(dec), "") + + # combine surrogate pairs + payload = payload.encode("utf-16", "surrogatepass").decode("utf-16") + payload = payload.encode("utf-8") + return payload From ea5970a12106ea8123f787ba8a686fb75d0f0e5d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phan=20Mestach?= Date: Wed, 4 Sep 2024 16:03:16 +0200 Subject: [PATCH 2/2] IA-3401 enketo emoji --- iaso/tests/enketo/test_enketo_lib.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/iaso/tests/enketo/test_enketo_lib.py b/iaso/tests/enketo/test_enketo_lib.py index ea1b8ddc90..b7a613fc17 100644 --- a/iaso/tests/enketo/test_enketo_lib.py +++ b/iaso/tests/enketo/test_enketo_lib.py @@ -26,7 +26,7 @@ def test_inject_userid_update_tag_text_if_present(self): def test_inject_user_id_with_emoji_content(self): original_xml = b'uuid:demo546��Yellow' uuid, xml = inject_xml_find_uuid(original_xml, 123, 2012010601, 977) - expectedInjected = b'uuid:demo977' + expectedInjected = b'uuid:demo977\xf0\x9f\x9f\xa1Yellow' self.assertEqual(str(xml), str(expectedInjected)) self.assertEqual(uuid, "demo")