diff --git a/services/search/api.py b/services/search/api.py index 7c9f4d0d0..71bf17c7a 100644 --- a/services/search/api.py +++ b/services/search/api.py @@ -45,6 +45,7 @@ from .constants import ( DEFAULT_MODEL_LIMIT_VALUE, + DEFAULT_RANK_THRESHOLD, DEFAULT_SEARCH_SQL_LIMIT_VALUE, DEFAULT_SRS, DEFAULT_TRIGRAM_THRESHOLD, @@ -208,10 +209,18 @@ def get(self, request): try: trigram_threshold = float(params.get("trigram_threshold")) except ValueError: - raise ParseError("'trigram_threshold' need to be of type float.") + raise ParseError("'trigram_threshold' needs to be of type float.") else: trigram_threshold = DEFAULT_TRIGRAM_THRESHOLD + if "rank_threshold" in params: + try: + rank_threshold = float(params.get("rank_threshold")) + except ValueError: + raise ParseError("'rank_threshold' needs to be of type float.") + else: + rank_threshold = DEFAULT_RANK_THRESHOLD + if "geometry" in params: try: show_geometry = strtobool(params["geometry"]) @@ -283,10 +292,11 @@ def get(self, request): # This is ~100 times faster than using Djangos SearchRank and allows searching using wildard "|*" # and by rankig gives better results, e.g. extra fields weight is counted. sql = f""" + SELECT * from ( SELECT id, type_name, name_{language_short}, ts_rank_cd(search_column_{language_short}, search_query) AS rank FROM search_view, to_tsquery('{config_language}','{search_query_str}') search_query WHERE search_query @@ search_column_{language_short} - ORDER BY rank DESC LIMIT {sql_query_limit}; + ORDER BY rank DESC LIMIT {sql_query_limit}) AS sub_query where sub_query.rank >= {rank_threshold}; """ cursor = connection.cursor() diff --git a/services/search/constants.py b/services/search/constants.py index 5bb0e4e87..9d22fdaed 100644 --- a/services/search/constants.py +++ b/services/search/constants.py @@ -16,5 +16,4 @@ # The limit value for the search query that search the search_view. 
"NULL" = no limit DEFAULT_SEARCH_SQL_LIMIT_VALUE = "NULL" DEFAULT_TRIGRAM_THRESHOLD = 0.15 -# If word length is greater or equal then hyphenate word. -LENGTH_OF_HYPHENATED_WORDS = 8 +DEFAULT_RANK_THRESHOLD = 0 diff --git a/services/search/specification.swagger.yaml b/services/search/specification.swagger.yaml index c59950cba..802e6ae0a 100644 --- a/services/search/specification.swagger.yaml +++ b/services/search/specification.swagger.yaml @@ -49,10 +49,16 @@ components: type: string example: unit,address default: unit + rank_threshold_param: + name: rank_threshold + in: query + description: Include results with search rank greater than or equal to the value. + type: number + default: 0 trigram_threshold_param: name: trigram_threshold in: query - desription: The threshold value, if trigram similarity is greater-than or equal to this value return the result. + description: The threshold value, if trigram similarity is greater than or equal to this value return the result. type: number default: 0.15 sql_query_limit_param: @@ -173,6 +179,7 @@ paths: - $ref: "#/components/parameters/q_param" - $ref: "#/components/parameters/language_param" - $ref: "#/components/parameters/use_trigram_param" + - $ref: "#/components/parameters/rank_threshold_param" - $ref: "#/components/parameters/trigram_threshold_param" - $ref: "#/components/parameters/order_units_by_num_services_param" - $ref: "#/components/parameters/geometry_param" diff --git a/services/search/utils.py b/services/search/utils.py index 472849b18..008b0c211 100644 --- a/services/search/utils.py +++ b/services/search/utils.py @@ -5,7 +5,6 @@ from services.models import ServiceNode, ServiceNodeUnitCount, Unit from services.search.constants import ( DEFAULT_TRIGRAM_THRESHOLD, - LENGTH_OF_HYPHENATED_WORDS, SEARCHABLE_MODEL_TYPE_NAMES, ) @@ -13,17 +12,22 @@ voikko.setNoUglyHyphenation(True) +def is_compound_word(word): + result = voikko.analyze(word) + if len(result) == 0: + return False + return True if 
result[0]["WORDBASES"].count("+") > 1 else False + + def hyphenate(word): """ - Returns a list of syllables of the word if word length - is >= LENGTH_OF_HYPHENATE_WORDS + Returns a list of syllables of the word if it is a compound word. """ - word_length = len(word) - if word_length >= LENGTH_OF_HYPHENATED_WORDS: - # By Setting the value to word_length, voikko returns - # the words that are in the compound word, if the word is - # not a compound word it returns the syllables as normal. - voikko.setMinHyphenatedWordLength(word_length) + word = word.strip() + if is_compound_word(word): + # By setting setMinHyphenatedWordLength to the length of the word, + # voikko returns the words that are in the compound word + voikko.setMinHyphenatedWordLength(len(word)) syllables = voikko.hyphenate(word) return syllables.split("-") else: