Skip to content

Commit

Permalink
Merge pull request #6087 from fecgov/feature/6076-add-proximity-search
Browse files Browse the repository at this point in the history
Feature/6076 add proximity search
  • Loading branch information
fec-jli authored Jan 3, 2025
2 parents cf229d7 + 288b9d2 commit f303b47
Show file tree
Hide file tree
Showing 6 changed files with 149 additions and 17 deletions.
28 changes: 28 additions & 0 deletions tests/integration/test_ao_elasticsearch.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,34 @@ def test_q_filters(self):
self.assertEqual(len(response), 1)
self.assertEqual(response[0]["ao_no"], "2014-19")

def test_q_proximity_filters(self):
    """Exercise `q_proximity` on advisory opinions with a positional filter.

    Covers a single phrase, a list of phrases, and a request whose
    `max_gaps` is too small for the single phrase.
    """
    # Positional constraint and gap allowance shared by both filtered requests.
    positional = {
        "proximity_filter": "before",
        "proximity_filter_term": "document",
        "max_gaps": 3,
    }

    # Single phrase: the gap allowance applies between the words of the phrase.
    single_phrase = "Random document third ao"
    hits = self._results_ao(api.url_for(UniversalSearch, q_proximity=single_phrase, **positional))
    self.assertEqual(len(hits), 1)
    self.assertEqual(hits[0]["ao_no"], "2024-12")

    # Several phrases: the gap allowance applies between the phrases themselves.
    phrase_list = ["fourth ao", "proximity document"]
    hits = self._results_ao(api.url_for(UniversalSearch, q_proximity=phrase_list, **positional))
    self.assertEqual(len(hits), 1)
    self.assertEqual(hits[0]["ao_no"], "2014-19")

    # Same single phrase with a tighter gap and no positional filter.
    self.check_incorrect_values({"q_proximity": single_phrase, "max_gaps": 1}, False)

def test_citation_filters(self):
statutory_title = 52
statutory_section = "30101"
Expand Down
47 changes: 38 additions & 9 deletions tests/integration/test_cases_elasticsearch.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,15 +16,19 @@ def check_filters(self, params, field_name, doc_type):
self.assertNotEqual(response["total_" + doc_type], 0)
assert all(x[field_name] == list(params.values())[0] for x in response[doc_type])

def check_incorrect_values(self, params, doc_types, raiseError):
    """Call UniversalSearch with `params` and assert the expected failure mode.

    Args:
        params: keyword arguments forwarded to `api.url_for(UniversalSearch, ...)`.
        doc_types: one response key (e.g. "total_murs") or an iterable of
            such keys; each must equal 0 in a successful response.
        raiseError: when truthy the request must be rejected with HTTP 422;
            otherwise it must return HTTP 200 with zero for every key.
    """
    response = self.app.get(api.url_for(UniversalSearch, **params))

    if raiseError:
        assert response.status_code == 422
        return

    assert response.status_code == 200
    # A lone string is one key; anything else is treated as a collection of keys.
    keys = [doc_types] if isinstance(doc_types, str) else doc_types
    for key in keys:
        assert response.json[key] == 0

def check_sort_asc(self, doc_dict):
for i in range(len(doc_dict) - 1):
Expand Down Expand Up @@ -143,9 +147,7 @@ def test_penalty_filter(self):
"case_min_penalty_amount": 999999
}

self.check_incorrect_values(params, "total_murs", False)
self.check_incorrect_values(params, "total_admin_fines", False)
self.check_incorrect_values(params, "total_adrs", False)
self.check_incorrect_values(params, ["total_murs", "total_admin_fines", "total_adrs"], False)

def test_case_doc_cat_id_filter(self):
# for archived and current murs, adrs, and afs
Expand Down Expand Up @@ -244,6 +246,35 @@ def test_q_filters(self):
self.assertEqual(response["total_admin_fines"], 0)
self.assertEqual(response["total_all"], 10)

def test_q_proximity_filters(self):
    """Exercise `q_proximity` across MURs, admin fines and ADRs.

    Covers a filtered single phrase, an unfiltered list of phrases, and a
    request whose `max_gaps` is too small for the single phrase.
    """
    case_totals = ("total_murs", "total_admin_fines", "total_adrs")

    # Single phrase with an "after" positional filter: only one MUR matches.
    single_phrase = "first document archived mur"
    filtered_url = api.url_for(
        UniversalSearch,
        q_proximity=single_phrase,
        proximity_filter="after",
        proximity_filter_term="sample text",
        max_gaps=2,
    )
    counts = self._results_case(filtered_url)
    for key, expected in zip(case_totals, (1, 0, 0)):
        self.assertEqual(counts[key], expected)

    # Several phrases, no positional filter: only one ADR matches.
    phrase_list = ["second adr", "sample text"]
    counts = self._results_case(api.url_for(UniversalSearch, q_proximity=phrase_list, max_gaps=6))
    for key, expected in zip(case_totals, (0, 0, 1)):
        self.assertEqual(counts[key], expected)

    # Tighter gap on the single phrase: every case total must be zero.
    self.check_incorrect_values(
        {"q_proximity": single_phrase, "max_gaps": 1},
        list(case_totals),
        False,
    )

def test_sort(self):
sort_value = "case_no"
ignore_type = ["advisory_opinions", "statutes"]
Expand Down Expand Up @@ -392,8 +423,7 @@ def test_case_respondents(self):
for rsp in adr["respondents"]
) for adr in response["adrs"])

self.check_incorrect_values({"case_respondents": "Bad value"}, "total_murs", False)
self.check_incorrect_values({"case_respondents": "Bad value"}, "total_adrs", False)
self.check_incorrect_values({"case_respondents": "Bad value"}, ["total_murs", "total_adrs"], False)

def test_citation_filters(self):
# filter for current murs and adrs
Expand Down Expand Up @@ -447,8 +477,7 @@ def test_citation_filters(self):
["case_regulatory_citation", "1111 CFR §112.4111"]
]
for filter in filters:
self.check_incorrect_values({filter[0]: filter[1]}, "total_murs", False)
self.check_incorrect_values({filter[0]: filter[1]}, "total_adrs", False)
self.check_incorrect_values({filter[0]: filter[1]}, ["total_murs", "total_adrs"], False)

# ---------------------- End MUR and ADR filters ------------------------------------------------
# ---------------------- Start MUR filters ------------------------------------------------
Expand Down
2 changes: 1 addition & 1 deletion tests/test_legal_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -1837,7 +1837,7 @@
"date": "2015-01-16T00:00:00",
"description": "2014-19",
"document_id": 80838,
"text": "Document text for the fourth ao document",
"text": "Proximity document text for the fourth ao document",
"url": "/files/legal/aos/2014-19/AO_2014-19_(ActBlue)_Final_(1.15.15).pdf"
}
],
Expand Down
4 changes: 4 additions & 0 deletions webservices/args.py
Original file line number Diff line number Diff line change
Expand Up @@ -425,6 +425,10 @@ def make_seek_args(field=fields.Int, description=None):
'sort': IStr(required=False, description=docs.SORT),
'case_min_penalty_amount': fields.Str(required=False, description=docs.CASE_MIN_PENALTY_AMOUNT),
'case_max_penalty_amount': fields.Str(required=False, description=docs.CASE_MAX_PENALTY_AMOUNT),
'q_proximity': fields.List(fields.Str, description=docs.Q_PROXIMITY),
'max_gaps': fields.Int(required=False, description=docs.MAX_GAPS),
"proximity_filter": fields.Str(validate=validate.OneOf(["after", "before"]), description=docs.PROXIMITY_FILTER),
'proximity_filter_term': fields.Str(required=False, description=docs.PROXIMITY_FILTER_TERM),
}

citation = {
Expand Down
20 changes: 20 additions & 0 deletions webservices/docs.py
Original file line number Diff line number Diff line change
Expand Up @@ -2354,6 +2354,26 @@ def add_ytd(var):
Show cases with a penalty less than this amount
'''

# API descriptions for the proximity-search arguments of the legal search endpoint.
Q_PROXIMITY = '''
This search identifies documents where the specified phrases appear near each other. The field supports either a \
single phrase or multiple phrases. For a single phrase, the maximum gap is applied between the words in the phrase. \
For multiple phrases, the maximum gap is applied between the phrases themselves.
'''

MAX_GAPS = '''
The maximum number of positions allowed between the terms specified in `q_proximity`. A non-zero value is required \
for the `q_proximity` search to be applied.
'''

PROXIMITY_FILTER = '''
Adds a positional constraint to the proximity search. Use `before` to require the `q_proximity` match to appear \
before `proximity_filter_term`, or `after` to require it to appear after. Only applied together with \
`proximity_filter_term`.
'''

PROXIMITY_FILTER_TERM = '''
Specifies the term to which the `proximity_filter` option applies, i.e. what must appear in relation to the \
`q_proximity` phrase
'''

# ======== legal end =========


Expand Down
65 changes: 58 additions & 7 deletions webservices/resources/legal.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,19 +180,28 @@ def generic_query_builder(q, type_, from_hit, hits_returned, **kwargs):
.index(SEARCH_ALIAS)
.sort("sort1", "sort2")
)
if type_ == "advisory_opinions":
query = query.highlight("summary", "documents.text", "documents.description")
elif type_ == "statutes":
query = query.highlight("name", "no")
else:
query = query.highlight("documents.text", "documents.description")
proximity_search = False

if kwargs.get("q_proximity") and kwargs.get("max_gaps") and type_ != "statutes":
proximity_search = True

if not proximity_search:
if type_ == "advisory_opinions":
query = query.highlight("summary", "documents.text", "documents.description")
elif type_ == "statutes":
query = query.highlight("name", "no")
else:
query = query.highlight("documents.text", "documents.description")

if kwargs.get("q_exclude"):
must_not = []
must_not.append(Q("nested", path="documents", query=Q("match", documents__text=kwargs.get("q_exclude"))))
query = query.query("bool", must_not=must_not)

# logger.debug("generic_query_builder =" + json.dumps(query.to_dict(), indent=3, cls=DateTimeEncoder))
if proximity_search:
query = get_proximity_query(q, query, **kwargs)

# logging.warning("generic_query_builder =" + json.dumps(query.to_dict(), indent=3, cls=DateTimeEncoder))
return query


Expand Down Expand Up @@ -266,6 +275,48 @@ def case_query_builder(q, type_, from_hit, hits_returned, **kwargs):
else:
return apply_adr_specific_query_params(query, **kwargs)


def get_proximity_query(q, query, **kwargs):
    """Add a nested `intervals` proximity clause on `documents.text` to `query`.

    Args:
        q: unused here; kept so the signature stays compatible with callers.
        query: search object exposing `.query(...)`; the clause is added to it.
        **kwargs: reads `q_proximity` (list of phrases), `max_gaps`, and the
            optional pair `proximity_filter` / `proximity_filter_term`.

    Returns:
        `query` with a `bool`/`must` nested intervals clause added, or `query`
        unchanged when no proximity phrase was supplied.

    With one phrase, `max_gaps` limits the gaps between the words of that
    phrase (`match` rule). With several phrases, each phrase must match
    exactly (`max_gaps` of 0) and `max_gaps` limits the gaps between the
    phrases (`all_of` rule). The positional filter is attached only when both
    `proximity_filter` and `proximity_filter_term` are given.
    """
    phrases = kwargs.get("q_proximity")
    if not phrases:
        # Nothing to search for; avoids len(None) and an empty `all_of` rule.
        return query
    if isinstance(phrases, str):
        # A bare string would otherwise be iterated character by character.
        phrases = [phrases]

    max_gaps = kwargs.get("max_gaps")

    if len(phrases) == 1:
        rule_name = "match"
        rule = {"query": phrases[0], "max_gaps": max_gaps}
    else:
        rule_name = "all_of"
        rule = {
            "max_gaps": max_gaps,
            "intervals": [{"match": {"query": phrase, "max_gaps": 0}} for phrase in phrases],
        }

    position = kwargs.get("proximity_filter")
    anchor_term = kwargs.get("proximity_filter_term")
    if position and anchor_term:
        rule["filter"] = {position: {"match": {"query": anchor_term}}}

    intervals_query = Q(
        "nested",
        path="documents",
        query=Q("intervals", documents__text={rule_name: rule}),
    )

    return query.query("bool", must=intervals_query)

# Select one or more case_doc_category_id to filter by corresponding case_document_category
# - 1 - Conciliation and Settlement Agreements
# - 2 - Complaint, Responses, Designation of Counsel and Extensions of Time
Expand Down

0 comments on commit f303b47

Please sign in to comment.