Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Elasticsearch 7.* and 8.* integration. OpenSearch integration. #469

Open
wants to merge 21 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 19 commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
fb50d63
Small mapping update and response parsing fix.
ivanmrsulja Jun 10, 2024
6f314b9
Added query parser to transform Solr queries to ES JSON-like queries.
ivanmrsulja Jun 11, 2024
cb4fc02
Updated mapping to support aggregations.
ivanmrsulja Jun 12, 2024
609dd76
Added small ES query optimizations.
ivanmrsulja Jun 18, 2024
ee28715
Completed implementation of missing ES engine methods. Improved docum…
ivanmrsulja Jun 24, 2024
567fbfe
Switched to one search engine configuration URL. Implemented a server…
ivanmrsulja Jun 26, 2024
fa52b27
Added SSL and Basic auth support. Added OpenSearch support.
ivanmrsulja Jul 5, 2024
8059404
Added fallback configuration property for legacy Solr configurations.
ivanmrsulja Aug 28, 2024
2b55ed6
Refactored code so that common property fallback resolution is in it'…
ivanmrsulja Sep 3, 2024
3691ee4
Merge branch 'main' into feature/elasticsearch-integration
ivanmrsulja Oct 9, 2024
0cbec00
Fixed fetch count bug. Fixed advanced search filter bug where filter …
ivanmrsulja Oct 10, 2024
693d5b9
Fixed facets query bug. Fixed delete by query bug.
ivanmrsulja Oct 11, 2024
14422f5
Fixed UTF-8 parsing bug while indexing.
ivanmrsulja Oct 15, 2024
370946e
Added support for *_drsim fields. Fixed pagination and statistics bug.
ivanmrsulja Oct 28, 2024
113ca73
Fixed sort by relevance bug.
ivanmrsulja Oct 31, 2024
3c47e27
Updated documentation.
ivanmrsulja Nov 14, 2024
c760397
Updated example.runtime.properties
ivanmrsulja Nov 19, 2024
a5dc6ea
Fixed inaccurate count retrieved for large indexes when using search …
ivanmrsulja Dec 3, 2024
8b659c1
Added field mappings which were missing for autocomplete search.
ivanmrsulja Dec 9, 2024
aa4606a
Small bugfix and code refactor.
ivanmrsulja Dec 13, 2024
6919b7b
Fixed sorting issue.
ivanmrsulja Dec 26, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions api/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,11 @@
<artifactId>guava</artifactId>
<version>32.0.0-jre</version>
</dependency>
<dependency>
<groupId>co.elastic.clients</groupId>
<artifactId>elasticsearch-java</artifactId>
<version>8.9.0</version>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>fluent-hc</artifactId>
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
package edu.cornell.mannlib.vitro.webapp.searchengine.base;

import edu.cornell.mannlib.vitro.webapp.config.ConfigurationProperties;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

public class SearchEngineUtil {

private static final Log log = LogFactory.getLog(SearchEngineUtil.class);

public static String getSearchEngineURLProperty() {
ConfigurationProperties config = ConfigurationProperties.getInstance();
if (config.getProperty("vitro.local.searchengine.url", "").isEmpty()) {
wwelling marked this conversation as resolved.
Show resolved Hide resolved
return tryFetchLegacySolrConfiguration(config);
}

return config.getProperty("vitro.local.searchengine.url", "");
}

private static String tryFetchLegacySolrConfiguration(ConfigurationProperties config) {
String legacyConfigValue = config.getProperty("vitro.local.solr.url", "");
if (!legacyConfigValue.isEmpty()) {
log.warn(
"vitro.local.solr.url is deprecated, switch to using" +
" vitro.local.searchengine.url as soon as possible.");
}

return legacyConfigValue;
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
package edu.cornell.mannlib.vitro.webapp.searchengine.elasticsearch;

import co.elastic.clients.elasticsearch._types.query_dsl.ExistsQuery;
import co.elastic.clients.elasticsearch._types.query_dsl.FuzzyQuery;
import co.elastic.clients.elasticsearch._types.query_dsl.MatchPhraseQuery;
import co.elastic.clients.elasticsearch._types.query_dsl.MatchQuery;
import co.elastic.clients.elasticsearch._types.query_dsl.PrefixQuery;
import co.elastic.clients.elasticsearch._types.query_dsl.Query;
import co.elastic.clients.elasticsearch._types.query_dsl.RangeQuery;
import co.elastic.clients.elasticsearch._types.query_dsl.WildcardQuery;

public class CustomQueryBuilder {

private static final String MAX_FUZZY_EDITS = "2";

public static Query buildQuery(SearchType queryType, String field, String value) {
ivanmrsulja marked this conversation as resolved.
Show resolved Hide resolved
validateInput(field, value);

switch (queryType) {
case MATCH:
return MatchQuery.of(m -> m
.field(field)
.query(value)
)._toQuery();
case FUZZY:
return FuzzyQuery.of(m -> m
.field(field)
.value(value.replace("~", ""))
.fuzziness(MAX_FUZZY_EDITS)
)._toQuery();
case PREFIX:
return PrefixQuery.of(m -> m
.field(field)
.value(value)
)._toQuery();
case RANGE:
String[] values = value.split("TO");
return RangeQuery.of(m -> m
.field(field)
.from(values[0])
.to(values[1])
)._toQuery();
case EXISTS:
return ExistsQuery.of(m -> m
.field(field)
)._toQuery();
case WILDCARD:
return WildcardQuery.of(m -> m
.field(field)
.value(value)
)._toQuery();
default:
return MatchPhraseQuery.of(m -> m
.field(field)
.query(value.substring(1, value.length() - 1)) // Remove leading and trailing '"' character
)._toQuery();
}
}

private static void validateInput(String field, String value) {
if (field == null || field.isEmpty()) {
throw new IllegalArgumentException("Field not specified");
}
if (value == null) {
throw new IllegalArgumentException("Value not specified");
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -3,23 +3,31 @@
package edu.cornell.mannlib.vitro.webapp.searchengine.elasticsearch;

import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import edu.cornell.mannlib.vitro.webapp.search.VitroSearchTermNames;
import edu.cornell.mannlib.vitro.webapp.utils.http.HttpClientFactory;
import edu.cornell.mannlib.vitro.webapp.utils.http.ESHttpsBasicClientFactory;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.http.client.fluent.Request;
import org.apache.http.client.fluent.Response;
import org.apache.http.entity.ContentType;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpPut;

import com.fasterxml.jackson.databind.ObjectMapper;

import edu.cornell.mannlib.vitro.webapp.modules.searchEngine.SearchEngineException;
import edu.cornell.mannlib.vitro.webapp.modules.searchEngine.SearchInputDocument;
import edu.cornell.mannlib.vitro.webapp.modules.searchEngine.SearchInputField;
import org.apache.http.entity.StringEntity;
import org.apache.http.util.EntityUtils;

/**
* The nuts and bolts of adding a document to the Elasticsearch index
Expand All @@ -44,7 +52,16 @@ private void addDocument(SearchInputDocument doc)
throws SearchEngineException {
try {
Map<String, List<Object>> map = convertDocToMap(doc);

if (map.containsKey(VitroSearchTermNames.NAME_RAW)) {
map.putIfAbsent(VitroSearchTermNames.AC_NAME_STEMMED, map.get(VitroSearchTermNames.NAME_RAW));
map.putIfAbsent(VitroSearchTermNames.AC_NAME_UNTOKENIZED, map.get(VitroSearchTermNames.NAME_RAW));
}

String json = new ObjectMapper().writeValueAsString(map);
if (json.contains("_drsim")) {
json = reformatDRSIMFields(json);
}
log.debug("Adding document for '" + doc.getField("DocId") + "': "
+ json);

Expand Down Expand Up @@ -75,15 +92,51 @@ private Map<String, List<Object>> convertDocToMap(SearchInputDocument doc) {
return map;
}

private String reformatDRSIMFields(String json) {
String patternString = "\\[(\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}Z) TO (\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}Z)]";
Pattern pattern = Pattern.compile(patternString);
Matcher matcher = pattern.matcher(json);

StringBuffer result = new StringBuffer();

while (matcher.find()) {
String dateStart = matcher.group(1);
String dateEnd = matcher.group(2);

String replacement = String.format("{\"gte\": \"%s\", \"lte\": \"%s\"}", dateStart, dateEnd)
.replace("{", "\\{")
.replace("}", "\\}");

matcher.appendReplacement(result, replacement);
}

matcher.appendTail(result);
return result.toString().replace("[\"{", "{").replace("}\"]", "}");
}

private void putToElastic(String json, String docId)
throws SearchEngineException {
try {
String url = baseUrl + "/_doc/"
+ URLEncoder.encode(docId, "UTF8");
Response response = Request.Put(url)
.bodyString(json, ContentType.APPLICATION_JSON).execute();
log.debug("Response from Elasticsearch: "
+ response.returnContent().asString());
HttpClient httpClient;
if (baseUrl.startsWith("https")) {
httpClient = ESHttpsBasicClientFactory.getHttpClient();
} else {
httpClient = HttpClientFactory.getHttpClient();
}

HttpPut request = new HttpPut(url);
request.addHeader("Content-Type", "application/json");
request.setEntity(new StringEntity(json, "UTF-8"));
HttpResponse response = httpClient.execute(request);
if (response.getStatusLine().getStatusCode() >= 400) {
log.warn("Response from Elasticsearch: "
+ EntityUtils.toString(response.getEntity()));
} else {
log.debug("Response from Elasticsearch: "
+ EntityUtils.toString(response.getEntity()));
}
} catch (Exception e) {
throw new SearchEngineException("Failed to put to Elasticsearch",
e);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,16 @@
import java.util.HashMap;
import java.util.Map;

import org.apache.http.client.fluent.Request;
import org.apache.http.client.fluent.Response;
import edu.cornell.mannlib.vitro.webapp.utils.http.HttpClientFactory;
import edu.cornell.mannlib.vitro.webapp.utils.http.ESHttpsBasicClientFactory;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;

import com.fasterxml.jackson.databind.ObjectMapper;

import edu.cornell.mannlib.vitro.webapp.modules.searchEngine.SearchEngineException;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.util.EntityUtils;

/**
* The nuts and bolts of getting the number of documents in the Elasticsearch
Expand All @@ -25,9 +29,15 @@ public ESCounter(String baseUrl) {

public int count() throws SearchEngineException {
try {
String url = baseUrl + "/_doc/_count";
Response response = Request.Get(url).execute();
String json = response.returnContent().asString();
String url = baseUrl + "/_count";
HttpClient httpClient;
if (baseUrl.startsWith("https")) {
httpClient = ESHttpsBasicClientFactory.getHttpClient();
} else {
httpClient = HttpClientFactory.getHttpClient();
}
HttpResponse response = httpClient.execute(new HttpGet(url));
String json = EntityUtils.toString(response.getEntity());

@SuppressWarnings("unchecked")
Map<String, Object> map = new ObjectMapper().readValue(json,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,16 +10,20 @@
import java.util.List;
import java.util.Map;

import edu.cornell.mannlib.vitro.webapp.utils.http.HttpClientFactory;
import edu.cornell.mannlib.vitro.webapp.utils.http.ESHttpsBasicClientFactory;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.StatusLine;
import org.apache.http.client.HttpClient;
import org.apache.http.client.HttpResponseException;
import org.apache.http.client.ResponseHandler;
import org.apache.http.client.fluent.Request;
import org.apache.http.client.fluent.Response;
import org.apache.http.entity.ContentType;
import org.apache.http.client.methods.HttpDelete;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.entity.StringEntity;
import org.apache.http.util.EntityUtils;

import edu.cornell.mannlib.vitro.webapp.modules.searchEngine.SearchEngineException;
Expand Down Expand Up @@ -52,8 +56,15 @@ private void deleteById(String id) throws SearchEngineException {
try {
String url = baseUrl + "/_doc/"
+ URLEncoder.encode(id, "UTF8");
Response response = Request.Delete(url).execute();
String json = response.returnContent().asString();
HttpClient httpClient;
if (baseUrl.startsWith("https")) {
httpClient = ESHttpsBasicClientFactory.getHttpClient();
} else {
httpClient = HttpClientFactory.getHttpClient();
}

HttpResponse response = httpClient.execute(new HttpDelete(url));
String json = EntityUtils.toString(response.getEntity());
} catch (HttpResponseException e) {
if (e.getStatusCode() == 404) {
// Don't care if it has already been deleted.
Expand All @@ -69,16 +80,28 @@ private void deleteById(String id) throws SearchEngineException {

public void deleteByQuery(String queryString) throws SearchEngineException {
String url = baseUrl + "/_delete_by_query";
queryString = queryString.replace(" ", "");
if (queryString.contains("*TO")) {
queryString = queryString.replace("[", "").replace("]", "").replace("*", "0");
}
SearchQuery query = new BaseSearchQuery().setQuery(queryString);
String queryJson = new QueryConverter(query).asString();

try {
Response response = Request.Post(url)
.bodyString(queryJson, ContentType.APPLICATION_JSON)
.execute();
HttpClient httpClient;
if (baseUrl.startsWith("https")) {
httpClient = ESHttpsBasicClientFactory.getHttpClient();
} else {
httpClient = HttpClientFactory.getHttpClient();
}

HttpPost request = new HttpPost(url);
request.addHeader("Content-Type", "application/json");
request.setEntity(new StringEntity(queryJson));
HttpResponse response = httpClient.execute(request);

BaseResponseHandler handler = new BaseResponseHandler();
response.handleResponse(handler);
handler.handleResponse(response);
if (handler.getStatusCode() >= 400) {
log.warn(String.format(
"Failed to delete Elasticsearch documents by query: %s, %d - %s\n%s",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,16 @@

package edu.cornell.mannlib.vitro.webapp.searchengine.elasticsearch;

import edu.cornell.mannlib.vitro.webapp.utils.http.HttpClientFactory;
import edu.cornell.mannlib.vitro.webapp.utils.http.ESHttpsBasicClientFactory;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.http.client.fluent.Request;
import org.apache.http.client.fluent.Response;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;

import edu.cornell.mannlib.vitro.webapp.modules.searchEngine.SearchEngineException;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.util.EntityUtils;

/**
* Just does a "commit" or "flush" to the index.
Expand All @@ -29,8 +33,14 @@ public void flush(boolean wait) throws SearchEngineException {
try {
String url = baseUrl + "/_flush"
+ (wait ? "?wait_for_ongoing" : "");
Response response = Request.Get(url).execute();
String json = response.returnContent().asString();
HttpClient httpClient;
if (baseUrl.startsWith("https")) {
httpClient = ESHttpsBasicClientFactory.getHttpClient();
} else {
httpClient = HttpClientFactory.getHttpClient();
}
HttpResponse response = httpClient.execute(new HttpGet(url));
String json = EntityUtils.toString(response.getEntity());
log.debug("flush response: " + json);
} catch (Exception e) {
throw new SearchEngineException("Failed to put to Elasticsearch",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
import java.net.URI;
import java.net.URISyntaxException;

import edu.cornell.mannlib.vitro.webapp.utils.http.HttpClientFactory;
import edu.cornell.mannlib.vitro.webapp.utils.http.ESHttpsBasicClientFactory;
import org.apache.commons.io.IOUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
Expand All @@ -17,7 +19,6 @@
import edu.cornell.mannlib.vitro.webapp.modules.searchEngine.SearchEngineException;
import edu.cornell.mannlib.vitro.webapp.modules.searchEngine.SearchQuery;
import edu.cornell.mannlib.vitro.webapp.modules.searchEngine.SearchResponse;
import edu.cornell.mannlib.vitro.webapp.utils.http.HttpClientFactory;

/**
* Convert a SearchQuery to JSON, send it to Elasticsearch, and convert the JSON
Expand Down Expand Up @@ -90,6 +91,9 @@ public ESFunkyGetRequest bodyString(String contents,

public HttpResponse execute() throws SearchEngineException {
try {
if (this.getURI().getScheme().equals("https")) {
return ESHttpsBasicClientFactory.getHttpClient().execute(this);
}
return HttpClientFactory.getHttpClient().execute(this);
} catch (IOException e) {
throw new SearchEngineException(e);
Expand Down
Loading
Loading