From 4e98ce53c9321fdcf3a4867e0febe0e14b6f06ad Mon Sep 17 00:00:00 2001 From: Matt Wheeler Date: Mon, 27 Jan 2014 12:59:12 -0800 Subject: [PATCH] BOBO-290 Facets with some surrogate pairs can't be loaded --- bobo-browse/pom.xml | 5 ++++ .../bobo/facets/data/TermStringList.java | 24 +++++++++++++------ .../bobo/test/BoboFacetIteratorTest.java | 9 +++++++ 3 files changed, 31 insertions(+), 7 deletions(-) diff --git a/bobo-browse/pom.xml b/bobo-browse/pom.xml index 8b5d68ce..ae544307 100644 --- a/bobo-browse/pom.xml +++ b/bobo-browse/pom.xml @@ -137,6 +137,11 @@ 1.2 compile + + com.ibm.icu + icu4j + 52.1 + junit junit diff --git a/bobo-browse/src/main/java/com/browseengine/bobo/facets/data/TermStringList.java b/bobo-browse/src/main/java/com/browseengine/bobo/facets/data/TermStringList.java index 916e1be5..972ae4e0 100644 --- a/bobo-browse/src/main/java/com/browseengine/bobo/facets/data/TermStringList.java +++ b/bobo-browse/src/main/java/com/browseengine/bobo/facets/data/TermStringList.java @@ -1,13 +1,23 @@ package com.browseengine.bobo.facets.data; +import com.ibm.icu.text.UTF16; + import java.util.ArrayList; import java.util.Collections; +import java.util.Comparator; import java.util.List; public class TermStringList extends TermValueList { private String sanity = null; private boolean withDummy = true; + /** + * A string comparator that orders Java strings according to Unicode codepoint + * order, same as Lucene, and not code unit order as done by the String class. 
+ */ + private static final Comparator STRING_COMPARATOR = + new UTF16.StringComparator(true, false, 0); + public TermStringList(int capacity) { super(capacity); } @@ -21,7 +31,7 @@ public TermStringList() { public boolean add(String o) { if (_innerList.size() == 0 && o != null) withDummy = false; // the first value added is not null if (o == null) o = ""; - if (sanity != null && sanity.compareTo(o) >= 0) throw new RuntimeException( + if (sanity != null && STRING_COMPARATOR.compare(sanity, o) >= 0) throw new RuntimeException( "Values need to be added in ascending order. Previous value: " + sanity + " adding value: " + o); if (_innerList.size() > 0 || !withDummy) sanity = o; @@ -65,9 +75,9 @@ public int indexOf(Object o) { return -1; } } - return Collections.binarySearch(((ArrayList) _innerList), (String) o); + return Collections.binarySearch(((ArrayList) _innerList), (String) o, STRING_COMPARATOR); } else { - return Collections.binarySearch(((ArrayList) _innerList), (String) o); + return Collections.binarySearch(((ArrayList) _innerList), (String) o, STRING_COMPARATOR); } } @@ -85,9 +95,9 @@ public boolean containsWithType(String val) { if (val.equals("")) { return _innerList.size() > 1 && "".equals(_innerList.get(1)); } - return Collections.binarySearch(((ArrayList) _innerList), val) >= 0; + return Collections.binarySearch(((ArrayList) _innerList), val, STRING_COMPARATOR) >= 0; } else { - return Collections.binarySearch(((ArrayList) _innerList), val) >= 0; + return Collections.binarySearch(((ArrayList) _innerList), val, STRING_COMPARATOR) >= 0; } } @@ -103,9 +113,9 @@ public int indexOfWithType(String o) { return -1; } } - return Collections.binarySearch(((ArrayList) _innerList), o); + return Collections.binarySearch(((ArrayList) _innerList), o, STRING_COMPARATOR); } else { - return Collections.binarySearch(((ArrayList) _innerList), o); + return Collections.binarySearch(((ArrayList) _innerList), o, STRING_COMPARATOR); } } diff --git 
a/bobo-browse/src/test/java/com/browseengine/bobo/test/BoboFacetIteratorTest.java b/bobo-browse/src/test/java/com/browseengine/bobo/test/BoboFacetIteratorTest.java index ebf36bbd..a915ed9e 100644 --- a/bobo-browse/src/test/java/com/browseengine/bobo/test/BoboFacetIteratorTest.java +++ b/bobo-browse/src/test/java/com/browseengine/bobo/test/BoboFacetIteratorTest.java @@ -48,6 +48,15 @@ public void testTermStringListAddCorrectOrder() { tsl1.add("m"); tsl1.add("s"); tsl1.add("t"); + /* Following strings should be in this order according to the Unicode + * codepoints. Java String compares by UTF-16 code unit, so it orders surrogate pairs + * before BMP codepoints above 0xDFFF, inconsistent with the UTF-8 byte sorting used by Lucene. + * Sorting UTF8 byte by byte is consistent with Unicode codepoint + * ordering. + * + */ + tsl1.add("\ufe6f"); // BMP codepoint 0xFE6F, above the surrogate range (SMALL DOLLAR SIGN is 0xFE69) + tsl1.add("\ud83d\ude00"); // GRINNING FACE codepoint 0x1F600 } catch (Exception e) { fail("There should NOT be an exception and the message contains ascending order"); return;