Skip to content

Commit

Permalink
Use DecoderFallback.ExceptionFallback to match Java's CodingErrorAction.REPORT, #1076 (#1089)
Browse files Browse the repository at this point in the history

* Use DecoderFallback.ExceptionFallback to match Java behavior, #1076

* Add unit test for WithDecoderExceptionFallback

* Fix unit test namespace and doc comment

* Lucene.Net.Support.Buffers: Added ArrayPoolExtensions class to simplify returning arrays that might be null

* Lucene.Net.Index.Term::ToString(): Optimized writing UTF8 string on target frameworks that support System.Text.Unicode.Utf8. Added tests to verify fallback is working.

* Cache decoder fallback encoding lookup, #1076

* Treat Encoder/DecoderFallbackExceptions as IOExceptions to match Java, #1076

* Fix translation of replacement fallback test code, IOException/RuntimeException tests

* Use Encoding.Default instead of GetEncoding(0), #1076

* Cache GB2312 encoding lookup, #1076

* Replace StandardCharsets.UTF_8 with Encoding.UTF8 in two tests, #1076

* Fix test extension method for detecting IllegalArgumentException, #1076

* Cascade call from IsIllegalArgumentException

---------

Co-authored-by: Shad Storhaug <[email protected]>
  • Loading branch information
paulirwin and NightOwl888 authored Jan 12, 2025
1 parent 24fb64b commit 4bf492c
Show file tree
Hide file tree
Showing 32 changed files with 352 additions and 100 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -65,4 +65,5 @@ websites/apidocs/api/**/*.manifest
svn-*/

# vscode files
.vscode/
.vscode/
.idea/**/misc.xml
7 changes: 7 additions & 0 deletions Directory.Build.targets
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,13 @@

</PropertyGroup>

<!-- Features in .NET 8.x and .NET 9.x only -->
<PropertyGroup Condition=" $(TargetFramework.StartsWith('net8.')) Or $(TargetFramework.StartsWith('net9.')) ">

<DefineConstants>$(DefineConstants);FEATURE_UTF8_TOUTF16</DefineConstants>

</PropertyGroup>

<!-- Features in .NET 6.x, .NET 7.x, .NET 8.x, and .NET 9.x only -->
<PropertyGroup Condition=" $(TargetFramework.StartsWith('net6.')) Or $(TargetFramework.StartsWith('net7.')) Or $(TargetFramework.StartsWith('net8.')) Or $(TargetFramework.StartsWith('net9.')) ">

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -746,7 +746,7 @@ private static Encoding GetSystemEncoding(string encoding) // LUCENENET: CA1822:
}
if ("ISO8859-14".Equals(encoding, StringComparison.OrdinalIgnoreCase))
{
return new ISO8859_14Encoding();
return ISO8859_14Encoding.Default;
}
// .NET doesn't recognize the encoding without a dash between ISO and the number
// https://msdn.microsoft.com/en-us/library/system.text.encodinginfo.getencoding(v=vs.110).aspx
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,11 @@ namespace Lucene.Net.Analysis.Hunspell
[ExceptionToClassNameConvention]
internal sealed class ISO8859_14Encoding : Encoding
{
/// <summary>
/// The default singleton instance of the <see cref="ISO8859_14Encoding"/> class.
/// </summary>
public static new ISO8859_14Encoding Default { get; } = new ISO8859_14Encoding();

private static readonly Decoder decoder = new ISO8859_14Decoder();
public override Decoder GetDecoder()
{
Expand Down Expand Up @@ -119,4 +124,4 @@ public override int GetChars(byte[] bytesIn, int byteIndex, int byteCount, char[
return writeCount;
}
}
}
}
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
// Lucene version compatibility level 4.8.1
using Lucene.Net.Analysis.Core;
using Lucene.Net.Analysis.Util;
using Lucene.Net.Support.Text;
using Lucene.Net.Util;
using System;
using System.Collections.Generic;
Expand Down Expand Up @@ -117,7 +118,7 @@ public void Inform(IResourceLoader loader)
/// </summary>
private SynonymMap LoadSynonyms(IResourceLoader loader, string cname, bool dedup, Analyzer analyzer)
{
Encoding decoder = Encoding.UTF8;
Encoding decoder = Encoding.UTF8.WithDecoderExceptionFallback();

SynonymMap.Parser parser;
Type clazz = loader.FindType(cname /*, typeof(SynonymMap.Parser) */);
Expand Down Expand Up @@ -165,4 +166,4 @@ private TokenizerFactory LoadTokenizerFactory(IResourceLoader loader, string cna
}
}
}
}
}
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
// Lucene version compatibility level 4.8.1
using Lucene.Net.Analysis.Core;
using Lucene.Net.Support;
using Lucene.Net.Support.Text;
using Lucene.Net.Util;
using System;
using System.Collections.Generic;
Expand Down Expand Up @@ -385,8 +386,9 @@ protected CharArraySet GetSnowballWordSet(IResourceLoader loader, string wordFil
words = new CharArraySet(m_luceneMatchVersion, files.Count * 10, ignoreCase);
foreach (string file in files)
{
Encoding decoder = Encoding.UTF8.WithDecoderExceptionFallback();
using (Stream stream = loader.OpenResource(file.Trim()))
using (TextReader reader = new StreamReader(stream, Encoding.UTF8))
using (TextReader reader = new StreamReader(stream, decoder))
{
WordlistLoader.GetSnowballWordSet(reader, words);
}
Expand Down
3 changes: 2 additions & 1 deletion src/Lucene.Net.Analysis.Kuromoji/JapaneseTokenizerFactory.cs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
using Lucene.Net.Analysis.Ja.Dict;
using Lucene.Net.Analysis.Util;
using Lucene.Net.Support.Text;
using Lucene.Net.Util;
using System;
using System.Collections.Generic;
Expand Down Expand Up @@ -88,7 +89,7 @@ public virtual void Inform(IResourceLoader loader)
{
encoding = Encoding.UTF8.WebName;
}
Encoding decoder = Encoding.GetEncoding(encoding);
Encoding decoder = Encoding.GetEncoding(encoding).WithDecoderExceptionFallback();
TextReader reader = new StreamReader(stream, decoder);
userDictionary = new UserDictionary(reader);
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
using J2N.Text;
using Lucene.Net.Diagnostics;
using Lucene.Net.Support.Text;
using System.Globalization;
using System.IO;
using System.Text;
Expand Down Expand Up @@ -31,7 +32,8 @@ public static class ConnectionCostsBuilder // LUCENENET specific: CA1052 Static
public static ConnectionCostsWriter Build(string filename)
{
using Stream inputStream = new FileStream(filename, FileMode.Open, FileAccess.Read);
using StreamReader streamReader = new StreamReader(inputStream, Encoding.ASCII, detectEncodingFromByteOrderMarks: true, bufferSize: 1024, leaveOpen: true); // LUCENENET: CA2000: Use using statement
Encoding decoder = Encoding.ASCII.WithDecoderExceptionFallback();
using StreamReader streamReader = new StreamReader(inputStream, decoder, detectEncodingFromByteOrderMarks: true, bufferSize: 1024, leaveOpen: true); // LUCENENET: CA2000: Use using statement

string line = streamReader.ReadLine();
string[] dimensions = whiteSpaceRegex.Split(line).TrimEnd();
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
using J2N.Text;
using Lucene.Net.Support;
using Lucene.Net.Support.Text;
using Lucene.Net.Util;
using Lucene.Net.Util.Fst;
using Lucene.Net.Util.Packed;
Expand Down Expand Up @@ -71,7 +72,7 @@ public virtual TokenInfoDictionaryWriter BuildDictionary(IList<string> csvFiles)
foreach (string file in csvFiles)
{
using Stream inputStream = new FileStream(file, FileMode.Open, FileAccess.Read);
Encoding decoder = Encoding.GetEncoding(encoding);
Encoding decoder = Encoding.GetEncoding(encoding).WithDecoderExceptionFallback();
using TextReader reader = new StreamReader(inputStream, decoder, detectEncodingFromByteOrderMarks: true, bufferSize: 1024, leaveOpen: true); // LUCENENET: CA2000: Use using statement

string line = null;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
using J2N.Text;
using Lucene.Net.Analysis.Ja.Dict;
using Lucene.Net.Support.Text;
using System;
using System.Collections.Generic;
using System.Globalization;
Expand Down Expand Up @@ -55,7 +56,7 @@ public virtual UnknownDictionaryWriter ReadDictionaryFile(string filename, strin
UnknownDictionaryWriter dictionary = new UnknownDictionaryWriter(5 * 1024 * 1024);

JCG.List<string[]> lines = new JCG.List<string[]>();
Encoding decoder = Encoding.GetEncoding(encoding);
Encoding decoder = Encoding.GetEncoding(encoding).WithDecoderExceptionFallback();
using (Stream inputStream = new FileStream(filename, FileMode.Open, FileAccess.Read))
using (TextReader reader = new StreamReader(inputStream, decoder))
{
Expand Down
11 changes: 7 additions & 4 deletions src/Lucene.Net.Analysis.SmartCn/Hhmm/AbstractDictionary.cs
Original file line number Diff line number Diff line change
Expand Up @@ -32,14 +32,17 @@ namespace Lucene.Net.Analysis.Cn.Smart.Hhmm
/// </summary>
internal abstract class AbstractDictionary
{
// LUCENENET specific: cached GB2312 encoding to avoid repeated calls to Encoding.GetEncoding("GB2312")
protected static readonly Encoding gb2312Encoding = Encoding.GetEncoding("GB2312");

/// <summary>
/// First Chinese Character in GB2312 (15 * 94)
/// Characters in GB2312 are arranged in a grid of 94 * 94, 0-14 are unassigned or punctuation.
/// </summary>
public const int GB2312_FIRST_CHAR = 1410;

/// <summary>
/// Last Chinese Character in GB2312 (87 * 94).
/// Last Chinese Character in GB2312 (87 * 94).
/// Characters in GB2312 are arranged in a grid of 94 * 94, 88-94 are unassigned.
/// </summary>
public const int GB2312_CHAR_NUM = 87 * 94;
Expand Down Expand Up @@ -98,7 +101,7 @@ public virtual string GetCCByGB2312Id(int ccid)
try
{
//String cchar = new String(buffer, "GB2312");
string cchar = Encoding.GetEncoding("GB2312").GetString(buffer);
string cchar = gb2312Encoding.GetString(buffer); // LUCENENET specific: use cached encoding instance
return cchar;
}
catch (Exception e) when (e.IsUnsupportedEncodingException()) // Encoding is not supported by the platform
Expand All @@ -117,15 +120,15 @@ public virtual short GetGB2312Id(char ch)
try
{
//byte[] buffer = Character.ToString(ch).getBytes("GB2312");
byte[] buffer = Encoding.GetEncoding("GB2312").GetBytes(ch.ToString());
byte[] buffer = gb2312Encoding.GetBytes(ch.ToString()); // LUCENENET specific: use cached encoding instance
//byte[] buffer = Encoding.GetEncoding("hz-gb-2312").GetBytes(ch.ToString());
if (buffer.Length != 2)
{
// Should be a two-byte character
return -1;
}
int b0 = (buffer[0] & 0x0FF) - 161; // Code starts from A1, therefore subtract 0xA1=161
int b1 = (buffer[1] & 0x0FF) - 161; // There is no Chinese char for the first and last symbol.
int b1 = (buffer[1] & 0x0FF) - 161; // There is no Chinese char for the first and last symbol.
// Therefore, each code page only has 16*6-2=94 characters.
return (short)(b0 * 94 + b1);
}
Expand Down
2 changes: 1 addition & 1 deletion src/Lucene.Net.Analysis.SmartCn/Hhmm/BigramDictionary.cs
Original file line number Diff line number Diff line change
Expand Up @@ -302,7 +302,7 @@ public virtual void LoadFromFile(string dctFilePath)
byte[] lchBuffer = new byte[length];
dctFile.Read(lchBuffer, 0, lchBuffer.Length);
//tmpword = new String(lchBuffer, "GB2312");
tmpword = Encoding.GetEncoding("GB2312").GetString(lchBuffer);
tmpword = gb2312Encoding.GetString(lchBuffer); // LUCENENET specific: use cached encoding instance from base class
//tmpword = Encoding.GetEncoding("hz-gb-2312").GetString(lchBuffer);
if (i != 3755 + GB2312_FIRST_CHAR)
{
Expand Down
2 changes: 1 addition & 1 deletion src/Lucene.Net.Analysis.SmartCn/Hhmm/WordDictionary.cs
Original file line number Diff line number Diff line change
Expand Up @@ -395,7 +395,7 @@ private int LoadMainDataFromFile(string dctFilePath)
{
byte[] lchBuffer = new byte[length];
dctFile.Read(lchBuffer, 0, lchBuffer.Length);
tmpword = Encoding.GetEncoding("GB2312").GetString(lchBuffer);
tmpword = gb2312Encoding.GetString(lchBuffer); // LUCENENET specific: use cached encoding instance from base class
wordItem_charArrayTable[i][j] = tmpword.ToCharArray();
}
else
Expand Down
6 changes: 3 additions & 3 deletions src/Lucene.Net.Benchmark/ByTask/Feeds/ContentItemsSource.cs
Original file line number Diff line number Diff line change
Expand Up @@ -32,15 +32,15 @@ namespace Lucene.Net.Benchmarks.ByTask.Feeds
/// Base class for source of data for benchmarking.
/// </summary>
/// <remarks>
/// Keeps track of various statistics, such as how many data items were generated,
/// Keeps track of various statistics, such as how many data items were generated,
/// size in bytes etc.
/// <para/>
/// Supports the following configuration parameters:
/// <list type="bullet">
/// <item><term>content.source.forever</term><description>specifies whether to generate items forever (<b>default=true</b>).</description></item>
/// <item><term>content.source.verbose</term><description>specifies whether messages should be output by the content source (<b>default=false</b>).</description></item>
/// <item><term>content.source.encoding</term><description>
/// specifies which encoding to use when
/// specifies which encoding to use when
/// reading the files of that content source. Certain implementations may define
/// a default value if this parameter is not specified. (<b>default=null</b>).
/// </description></item>
Expand Down Expand Up @@ -199,7 +199,7 @@ public virtual void SetConfig(Config config)
}
else
{
m_encoding = Encoding.GetEncoding(0); // Default system encoding
m_encoding = Encoding.Default; // Default system encoding
}
}

Expand Down
2 changes: 1 addition & 1 deletion src/Lucene.Net.Benchmark/ByTask/Tasks/CreateIndexTask.cs
Original file line number Diff line number Diff line change
Expand Up @@ -215,7 +215,7 @@ public static IndexWriter ConfigureWriter(Config config, PerfRunData runData, Op
else
{
FileInfo f = new FileInfo(infoStreamVal);
iwc.SetInfoStream(new StreamWriter(new FileStream(f.FullName, FileMode.Create, FileAccess.Write), Encoding.GetEncoding(0)));
iwc.SetInfoStream(new StreamWriter(new FileStream(f.FullName, FileMode.Create, FileAccess.Write), Encoding.Default));
}
}
IndexWriter writer = new IndexWriter(runData.Directory, iwc);
Expand Down
9 changes: 6 additions & 3 deletions src/Lucene.Net.Benchmark/Quality/Trec/QueryDriver.cs
Original file line number Diff line number Diff line change
Expand Up @@ -77,10 +77,13 @@ public static void Main(string[] args)
string fieldSpec = args.Length == 5 ? args[4] : "T"; // default to Title-only if not specified.
IndexSearcher searcher = new IndexSearcher(reader);

int maxResults = 1000;
string docNameField = "docname";
const int maxResults = 1000;
const string docNameField = "docname";

TextWriter logger = Console.Out; //new StreamWriter(Console, Encoding.GetEncoding(0));
using TextWriter logger = new StreamWriter(System.Console.OpenStandardOutput(), Encoding.Default)
{
AutoFlush = true,
};

// use trec utilities to read trec topics into quality queries
TrecTopicsReader qReader = new TrecTopicsReader();
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
using System;
using System.Runtime.CompilerServices;
using System.Text;

namespace Lucene.Net
{
Expand Down Expand Up @@ -53,12 +54,9 @@ internal static class ExceptionExtensions
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static bool IsIllegalArgumentException(this Exception e)
{
// If our exception implements IError and subclasses ArgumentException, we will ignore it.
if (e is null || e.IsError() || e.IsAlwaysIgnored()) return false;

return e is ArgumentException &&
e is not ArgumentNullException && // Corresponds to NullPointerException, so we don't catch it here.
e is not ArgumentOutOfRangeException; // Corresponds to IndexOutOfBoundsException (and subclasses), so we don't catch it here.
return Lucene.ExceptionExtensions.IsIllegalArgumentException(e)
&& e is not ArgumentNullException // Corresponds to NullPointerException, so we don't catch it here.
and not ArgumentOutOfRangeException; // Corresponds to IndexOutOfBoundsException (and subclasses), so we don't catch it here.
}
}
}
6 changes: 4 additions & 2 deletions src/Lucene.Net.TestFramework/Util/LineFileDocs.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
using J2N.Threading.Atomic;
using Lucene.Net.Documents;
using Lucene.Net.Support.IO;
using Lucene.Net.Support.Text;
using Lucene.Net.Support.Threading;
using RandomizedTesting.Generators;
using System;
Expand Down Expand Up @@ -236,7 +237,8 @@ private void Open(Random random)
} while (b >= 0 && b != 13 && b != 10);
}

reader = new StreamReader(@is, Encoding.UTF8, detectEncodingFromByteOrderMarks: false, bufferSize: BUFFER_SIZE);
Encoding decoder = Encoding.UTF8.WithDecoderExceptionFallback();
reader = new StreamReader(@is, decoder, detectEncodingFromByteOrderMarks: false, bufferSize: BUFFER_SIZE);

if (seekTo > 0L)
{
Expand Down Expand Up @@ -399,4 +401,4 @@ internal static string MaybeCreateTempFile(bool removeAfterClass = true)
return result;
}
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
using System.Reflection;
using System.Resources;
using System.Security;
using System.Text;
using Assert = Lucene.Net.TestFramework.Assert;

namespace Lucene.Net.Support.ExceptionHandling
Expand Down Expand Up @@ -184,6 +185,8 @@ private static IEnumerable<Type> LoadKnownErrorExceptionTypes()
typeof(UnauthorizedAccessException),
typeof(ObjectDisposedException),
typeof(Lucene.AlreadyClosedException),
typeof(EncoderFallbackException), // In Java, CharacterCodingException subclasses IOException
typeof(DecoderFallbackException),
}.Union(AllIOExceptionTypes)
// .NET Framework only - Subclasses UnauthorizedAccessException
.Union(new[] { PrivilegeNotHeldExceptionType });
Expand Down Expand Up @@ -221,8 +224,6 @@ private static IEnumerable<Type> LoadKnownErrorExceptionTypes()
// Subclasses
typeof(System.DuplicateWaitObjectException),
typeof(System.Globalization.CultureNotFoundException),
typeof(System.Text.DecoderFallbackException),
typeof(System.Text.EncoderFallbackException),
};

public static readonly IEnumerable<Type> KnownIllegalArgumentExceptionTypes_TestEnvironment = new Type[] {
Expand All @@ -234,8 +235,6 @@ private static IEnumerable<Type> LoadKnownErrorExceptionTypes()
// Subclasses
typeof(System.DuplicateWaitObjectException),
typeof(System.Globalization.CultureNotFoundException),
typeof(System.Text.DecoderFallbackException),
typeof(System.Text.EncoderFallbackException),
};

public static readonly IEnumerable<Type> KnownRuntimeExceptionTypes = LoadKnownRuntimeExceptionTypes();
Expand Down Expand Up @@ -367,8 +366,6 @@ private static IEnumerable<Type> LoadKnownRuntimeExceptionTypes()
typeof(System.Runtime.Serialization.SerializationException),
typeof(System.Security.Cryptography.CryptographicException),
typeof(System.Security.VerificationException),
typeof(System.Text.DecoderFallbackException), // LUCENENET TODO: Need to be sure about this one
typeof(System.Text.EncoderFallbackException), // LUCENENET TODO: Need to be sure about this one
typeof(System.Threading.AbandonedMutexException),
typeof(System.Threading.SemaphoreFullException),
typeof(System.Threading.SynchronizationLockException),
Expand Down
Loading

0 comments on commit 4bf492c

Please sign in to comment.