Skip to content

Commit

Permalink
fix: avoid confusing language prefixes and property names (#513)
Browse files Browse the repository at this point in the history
We are confusing "web:en: https://xxx.com" with a synonyms entry…

---------

Co-authored-by: Pierre Slamich <[email protected]>
  • Loading branch information
alexgarel and teolemon authored Jul 17, 2024
1 parent 2bc56e6 commit e0053b6
Show file tree
Hide file tree
Showing 4 changed files with 52 additions and 6 deletions.
3 changes: 1 addition & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -167,13 +167,12 @@ tests: backend_tests ## Run all tests
backend_tests: ## Run python tests
@echo "🍜 Running python tests"
${DOCKER_COMPOSE_TEST} up -d neo4j
${DOCKER_COMPOSE_TEST} run --rm taxonomy_api pytest /parser /parser
${DOCKER_COMPOSE_TEST} run --rm taxonomy_api pytest /parser
${DOCKER_COMPOSE_TEST} run --rm taxonomy_api pytest /code/tests
${DOCKER_COMPOSE_TEST} stop neo4j

checks: quality tests ## Run all checks (quality + tests)


#------------#
# production #
#------------#
Expand Down
19 changes: 15 additions & 4 deletions parser/openfoodfacts_taxonomy_parser/parser/taxonomy_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -235,14 +235,25 @@ def _get_node_data_with_parent_and_end_comments(self, data: NodeData) -> NodeDat

return new_data

_language_code_prefix = re.compile(
r"[a-zA-Z][a-zA-Z][a-zA-Z]?([-_][a-zA-Z][a-zA-Z][a-zA-Z]?)?:"
)

def is_entry_synonyms_line(self, line):
matching_prefix = self._language_code_prefix.match(line)
if matching_prefix:
# verify it's not a property, that is a name followed by a colon and a language
return not (
self._language_code_prefix.match(line[matching_prefix.end():])
)
return False

def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[NodeData]:
"""Transform data from file to dictionary"""
saved_nodes = []
index_stopwords = 0
index_synonyms = 0
language_code_prefix = re.compile(
r"[a-zA-Z][a-zA-Z][a-zA-Z]?([-_][a-zA-Z][a-zA-Z][a-zA-Z]?)?:"
)

# Check if it is correctly written
correctly_written = re.compile(r"\w+\Z")
# stopwords will contain a list of stopwords with their language code as key
Expand Down Expand Up @@ -328,7 +339,7 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[N
elif line[0] == "<":
# parent definition
data.parent_tags.append((self._normalize_entry_id(line[1:]), line_number + 1))
elif language_code_prefix.match(line):
elif self.is_entry_synonyms_line(line):
# synonyms definition
if not data.id:
data.id = self._normalize_entry_id(line.split(",", 1)[0])
Expand Down
23 changes: 23 additions & 0 deletions parser/tests/data/test_property_confused_lang.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# test a bug found with labels taxonomy

stopwords:en:ingredients, agricultural, from, with, of, by, the, verified, certified, approved, 100%, pure, guaranty, guaranteed, product guaranteed, product certified, standard, during, period, phase, label, mark, logo

en:1% for the planet, 1 percent for the planet, one percent for the planet
bg:1% за планетата
ca:1% pel planeta
de:1% für den Planeten
es:1% para el planeta, uno por ciento para el planeta
fi:1% planeetalle, prosentti planeetalle
fr:1% pour la planète
it:1% per il pianeta
nl:1% voor de planeet
pt:1% para o planeta, 1 porcento para o planeta, 1 por cento para o planeta, um por cento para o planeta, um porcento para o planeta
description:en:Commit to donating at least 1% of annual sales to environmental organizations.
image:en:1-for-the-planet.75x90.png
label_categories:en: en:Environnment
web:en:https://www.onepercentfortheplanet.org/


en:100% natural
bg:100% натурално
ca:100% natural
13 changes: 13 additions & 0 deletions parser/tests/integration/test_parser_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -351,3 +351,16 @@ def test_error_log(neo4j, tmp_path, caplog):
assert "duplicate id in file at line 12" in error
assert "The two nodes will be merged, keeping the last" in error
assert "values in case of conflicts." in error


def test_properties_confused_lang(neo4j, tmp_path):
"""Test that short property names don't get confused with language prefixes"""
with neo4j.session() as session:
test_parser = parser.Parser(session)
fpath = str(pathlib.Path(__file__).parent.parent / "data" / "test_property_confused_lang.txt")
test_parser(fpath, None, "branch", "test")
query = "MATCH (n:p_test_branch) WHERE n.id = 'en:1-for-planet' RETURN n"
result = session.run(query)
node = result.value()[0]
# "web:en" was not confused with a language prefix "web:"
assert "prop_web_en" in node.keys()

0 comments on commit e0053b6

Please sign in to comment.