fix: DocumentInputStream does not handle surrogate pairs correctly

eclipse-tm4e · Aug 19, 2024 · d8ee729 · d8ee729
1 parent 27b13e3
commit d8ee729
Show file tree

Hide file tree

Showing 4 changed files with 375 additions and 56 deletions.
diff --git a/org.eclipse.tm4e.ui/src/main/java/org/eclipse/tm4e/ui/internal/utils/CharsInputStream.java b/org.eclipse.tm4e.ui/src/main/java/org/eclipse/tm4e/ui/internal/utils/CharsInputStream.java
@@ -0,0 +1,220 @@
+/*******************************************************************************
+ * Copyright (c) 2024 Sebastian Thomschke and others.
+ * This program and the accompanying materials are made
+ * available under the terms of the Eclipse Public License 2.0
+ * which is available at https://www.eclipse.org/legal/epl-2.0/
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ * Contributors:
+ * Sebastian Thomschke - initial implementation
+ *******************************************************************************/
+package org.eclipse.tm4e.ui.internal.utils;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetEncoder;
+import java.nio.charset.CoderResult;
+import java.nio.charset.StandardCharsets;
+import java.util.Objects;
+import java.util.function.IntSupplier;
+
+import org.eclipse.jdt.annotation.Nullable;
+
+/**
+ * {@link InputStream} that encodes an indexed sequence of chars into bytes of a given charset on demand.
+ * <p>
+ * Chars are fetched from the source in chunks of up to <code>bufferSize</code> code points, so the source
+ * is never copied as a whole. A high surrogate and the low surrogate following it are always placed into
+ * the same chunk; surrogates that are not part of such a pair are replaced by U+FFFD before encoding.
+ */
+class CharsInputStream extends InputStream {
+
+	/** Index-based access to the chars to encode. */
+	@FunctionalInterface
+	interface CharsSupplier {
+		char charAt(int index) throws Exception;
+	}
+
+	/** Progress of the encoder: still consuming chars, flushing trailing bytes, or finished. */
+	enum EncoderState {
+		ENCODING,
+		FLUSHING,
+		DONE
+	}
+
+	/** Default number of code points (single chars or surrogate pairs) encoded per refill. */
+	private static final int DEFAULT_BUFFER_SIZE = 512;
+	private static final int EOF = -1;
+	/** Substituted for surrogates that are not part of a valid high/low pair. */
+	private static final char REPLACEMENT_CHAR = '\uFFFD';
+
+	private final int bufferSize;
+	private final CharBuffer charBuffer;
+	private final ByteBuffer byteBuffer;
+	private final CharsetEncoder encoder;
+	private EncoderState encoderState = EncoderState.ENCODING;
+
+	/** Index of the next char to fetch from {@link #chars}. */
+	private int charIndex = 0;
+	private final CharsSupplier chars;
+	private final IntSupplier charsLength;
+
+	CharsInputStream(final CharSequence chars) {
+		this(chars, null);
+	}
+
+	CharsInputStream(final CharSequence chars, final @Nullable Charset charset) {
+		this(chars, charset, DEFAULT_BUFFER_SIZE);
+	}
+
+	CharsInputStream(final CharSequence chars, final @Nullable Charset charset, final int bufferSize) {
+		this(chars::charAt, chars::length, charset, bufferSize);
+	}
+
+	CharsInputStream(final CharsSupplier chars, final IntSupplier charsLength) {
+		this(chars, charsLength, null);
+	}
+
+	/**
+	 * @param chars function to access indexed chars.
+	 * @param charsLength function to get the number of indexed chars provided by the <code>chars</code> parameter.
+	 * @param charset charset to encode the chars with, UTF-8 is used if <code>null</code>.
+	 */
+	CharsInputStream(final CharsSupplier chars, final IntSupplier charsLength, final @Nullable Charset charset) {
+		this(chars, charsLength, charset, DEFAULT_BUFFER_SIZE);
+	}
+
+	/**
+	 * @param chars function to access indexed chars.
+	 * @param charsLength function to get the number of indexed chars provided by the <code>chars</code> parameter.
+	 * @param charset charset to encode the chars with, UTF-8 is used if <code>null</code>.
+	 * @param bufferSize number of code points (single chars or surrogate pairs) to encode at once.
+	 * @throws IllegalArgumentException if <code>bufferSize</code> is smaller than 1.
+	 */
+	CharsInputStream(final CharsSupplier chars, final IntSupplier charsLength, final @Nullable Charset charset, final int bufferSize) {
+		if (bufferSize < 1)
+			throw new IllegalArgumentException("[bufferSize] must be 1 or larger");
+		encoder = (charset == null ? StandardCharsets.UTF_8 : charset).newEncoder();
+
+		this.bufferSize = bufferSize;
+		final int maxChars = bufferSize * 2; // every code point may be a surrogate pair, i.e. 2 chars
+		charBuffer = CharBuffer.allocate(maxChars);
+		// Size the byte buffer for the worst case of the actual charset (incl. prefixes like a BOM) instead of
+		// assuming 4 bytes per code point. Otherwise encode() overflows and the chars left in charBuffer are lost.
+		byteBuffer = ByteBuffer.allocate((int) Math.ceil(maxChars * (double) encoder.maxBytesPerChar()));
+		// both buffers start out empty in "read mode"
+		byteBuffer.flip();
+		charBuffer.flip();
+
+		this.chars = chars;
+		this.charsLength = charsLength;
+	}
+
+	/**
+	 * @return the number of already encoded bytes if any are buffered, otherwise the number of chars not yet
+	 *         consumed as an estimate (never negative).
+	 */
+	@Override
+	public int available() {
+		final int remaining = byteBuffer.remaining();
+		if (remaining > 0)
+			return remaining;
+		// the char source may have shrunk below the current read position, so clamp the estimate at zero
+		return Math.max(0, charsLength.getAsInt() - charIndex);
+	}
+
+	/**
+	 * Writes the bytes the encoder still holds back into {@link #byteBuffer}.
+	 *
+	 * @return <code>true</code> if {@link #byteBuffer} contains bytes to read afterwards.
+	 */
+	private boolean flushEncoder() throws IOException {
+		if (encoderState == EncoderState.DONE)
+			return false;
+
+		if (encoderState == EncoderState.ENCODING) {
+			encoderState = EncoderState.FLUSHING;
+		}
+
+		// flush
+		byteBuffer.clear();
+		final CoderResult result = encoder.flush(byteBuffer);
+		byteBuffer.flip();
+
+		if (result.isOverflow()) // byteBuffer too small, stay in FLUSHING state so the next refill continues flushing
+			return true;
+
+		if (result.isError()) {
+			result.throwException();
+		}
+
+		encoderState = EncoderState.DONE;
+		return byteBuffer.hasRemaining();
+	}
+
+	@Override
+	public int read() throws IOException {
+		// a successful refill is not guaranteed to yield bytes, so keep refilling until bytes exist or EOF is reached
+		while (!byteBuffer.hasRemaining()) {
+			if (!refillBuffer())
+				return EOF;
+		}
+		return byteBuffer.get() & 0xFF; // next byte as an unsigned integer (0 to 255)
+	}
+
+	@Override
+	public int read(final byte[] buf, final int off, final int bytesToRead) throws IOException {
+		Objects.checkFromIndexSize(off, bytesToRead, buf.length);
+		if (bytesToRead == 0)
+			return 0;
+
+		int bytesRead = 0;
+		int bytesReadable = byteBuffer.remaining();
+
+		while (bytesRead < bytesToRead) {
+			if (bytesReadable == 0) {
+				if (refillBuffer()) {
+					bytesReadable = byteBuffer.remaining();
+				} else
+					return bytesRead == 0 ? EOF : bytesRead;
+			}
+
+			final int bytesToReadNow = Math.min(bytesToRead - bytesRead, bytesReadable);
+			byteBuffer.get(buf, off + bytesRead, bytesToReadNow);
+			bytesRead += bytesToReadNow;
+			bytesReadable -= bytesToReadNow;
+		}
+
+		return bytesRead;
+	}
+
+	/**
+	 * Encodes the next chunk of chars into {@link #byteBuffer} or, once all chars are consumed, finishes and
+	 * flushes the encoder.
+	 *
+	 * @return <code>false</code> if the stream is exhausted, <code>true</code> if an encoding or flushing step was performed.
+	 * @throws IOException if accessing the chars fails or the chars cannot be encoded with the charset.
+	 */
+	private boolean refillBuffer() throws IOException {
+		if (encoderState == EncoderState.DONE)
+			return false;
+
+		if (encoderState == EncoderState.FLUSHING)
+			return flushEncoder();
+
+		final int charsLen = charsLength.getAsInt();
+
+		// if EOF is reached transition to flushing
+		if (charIndex >= charsLen) {
+			// finalize encoding before switching to flushing
+			byteBuffer.clear();
+			final CoderResult result = encoder.encode(CharBuffer.allocate(0), byteBuffer, true /* signal EOF */);
+			byteBuffer.flip();
+			if (result.isError()) {
+				result.throwException();
+			}
+			return flushEncoder();
+		}
+
+		try {
+			charBuffer.clear();
+			for (int i = 0; i < bufferSize && charIndex < charsLen; i++) {
+				final char nextChar = chars.charAt(charIndex++);
+				if (Character.isHighSurrogate(nextChar)) { // handle surrogate pairs
+					if (charIndex < charsLen) {
+						final char lowSurrogate = chars.charAt(charIndex);
+						if (Character.isLowSurrogate(lowSurrogate)) {
+							charIndex++;
+							charBuffer.put(nextChar);
+							charBuffer.put(lowSurrogate);
+						} else {
+							// missing low surrogate - fallback to replacement character,
+							// the following char is not consumed and gets handled by the next iteration
+							charBuffer.put(REPLACEMENT_CHAR);
+						}
+					} else {
+						// high surrogate is the last char - fallback to replacement character
+						charBuffer.put(REPLACEMENT_CHAR);
+					}
+				} else if (Character.isLowSurrogate(nextChar)) {
+					// low surrogate without preceding high surrogate - the encoder would reject it as malformed input,
+					// so treat it like an unpaired high surrogate
+					charBuffer.put(REPLACEMENT_CHAR);
+				} else {
+					charBuffer.put(nextChar);
+				}
+			}
+			charBuffer.flip();
+
+			// encode chars into bytes, byteBuffer is sized to hold the worst case output of a full charBuffer
+			byteBuffer.clear();
+			final CoderResult result = encoder.encode(charBuffer, byteBuffer, false);
+			byteBuffer.flip();
+			if (result.isError()) {
+				result.throwException();
+			}
+		} catch (final Exception ex) {
+			throw new IOException(ex);
+		}
+
+		return true;
+	}
+}
diff --git a/org.eclipse.tm4e.ui/src/main/java/org/eclipse/tm4e/ui/internal/utils/ContentTypeHelper.java b/org.eclipse.tm4e.ui/src/main/java/org/eclipse/tm4e/ui/internal/utils/ContentTypeHelper.java
@@ -112,7 +112,7 @@ private static ContentTypeInfo getContentTypes(final ITextFileBuffer buffer) {
 			if (bufferContentType != null) {
 				contentTypes.add(bufferContentType);
 			}
-			if (buffer.isDirty()) {
+			if (buffer.isDirty() && buffer.getDocument() != null) {
 				// Buffer is dirty (content of the filesystem is not synch with
 				// the editor content), use IDocument content.
 				try (var input = new DocumentInputStream(buffer.getDocument())) {

diff --git a/...eclipse.tm4e.ui/src/main/java/org/eclipse/tm4e/ui/internal/utils/DocumentInputStream.java b/...eclipse.tm4e.ui/src/main/java/org/eclipse/tm4e/ui/internal/utils/DocumentInputStream.java
@@ -1,71 +1,45 @@
 /*******************************************************************************
- * Copyright (c) 2005, 2008 IBM Corporation and others.
- * All rights reserved. This program and the accompanying materials
- * are made available under the terms of the Eclipse Public License v1.0
- * which accompanies this distribution, and is available at
- * http://www.eclipse.org/legal/epl-v10.html
+ * Copyright (c) 2024 Sebastian Thomschke and others.
+ * This program and the accompanying materials are made
+ * available under the terms of the Eclipse Public License 2.0
+ * which is available at https://www.eclipse.org/legal/epl-2.0/
+ *
+ * SPDX-License-Identifier: EPL-2.0
  *
  * Contributors:
- * IBM Corporation - initial API and implementation
- * QNX Software System
- * Sebastian Thomschke - implement read(byte[], int, int)
+ * Sebastian Thomschke - initial implementation
  *******************************************************************************/
 package org.eclipse.tm4e.ui.internal.utils;
 
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Objects;
+import java.nio.charset.Charset;
 
-import org.eclipse.jface.text.BadLocationException;
+import org.eclipse.core.filebuffers.FileBuffers;
+import org.eclipse.core.filebuffers.ITextFileBuffer;
+import org.eclipse.core.filebuffers.ITextFileBufferManager;
+import org.eclipse.jdt.annotation.Nullable;
 import org.eclipse.jface.text.IDocument;
+import org.eclipse.tm4e.ui.TMUIPlugin;
 
-/**
- * Input stream which reads from a document
- */
-final class DocumentInputStream extends InputStream {
-
-	private final IDocument doc;
-	private int pos = 0;
-
-	DocumentInputStream(final IDocument document) {
-		doc = document;
-	}
-
-	@Override
-	public int read(final byte[] buff, final int buffOffset, final int len) throws IOException {
-		Objects.checkFromIndexSize(buffOffset, len, buff.length);
+/**
+ * Input stream which reads the chars of an {@link IDocument} and provides them as bytes.
+ * <p>
+ * The chars are accessed via {@link IDocument#getChar(int)} / {@link IDocument#getLength()}; the encoding
+ * itself is performed by {@link CharsInputStream} using the charset determined by
+ * {@link #getCharset(IDocument)}.
+ */
+final class DocumentInputStream extends CharsInputStream {
 
-		if (len == 0)
-			return 0;
-
-		final var docLen = doc.getLength();
-		if (pos >= docLen)
-			return -1;
-
-		var bytesRead = -1;
+	/**
+	 * Looks up the encoding of the text file buffer that manages the given document.
+	 *
+	 * @return the charset named by the buffer's encoding, or <code>null</code> if no buffer manager or buffer
+	 *         is available, the buffer reports no encoding, or the lookup fails (the failure is logged).
+	 */
+	private static @Nullable Charset getCharset(final IDocument document) {
+		final ITextFileBufferManager bufferManager = FileBuffers.getTextFileBufferManager();
+		if (bufferManager == null)
+			return null;
+		final ITextFileBuffer buffer = bufferManager.getTextFileBuffer(document);
+		if (buffer == null)
+			return null;
 		try {
-			buff[buffOffset] = (byte) doc.getChar(pos++);
-			bytesRead = 1;
-			while (bytesRead < len) {
-				if (pos >= docLen) {
-					break;
-				}
-				buff[buffOffset + bytesRead++] = (byte) doc.getChar(pos++);
-			}
-		} catch (final BadLocationException ex) {
-			// ignore
+			final String charsetName = buffer.getEncoding();
+			if (charsetName != null)
+				return Charset.forName(charsetName);
+		} catch (final Exception ex) {
+			TMUIPlugin.logError(ex);
 		}
-		return bytesRead;
+		return null;
 	}
 
-	@Override
-	public int read() throws IOException {
-		try {
-			if (pos < doc.getLength())
-				return doc.getChar(pos++) & 0xFF;
-		} catch (final BadLocationException ex) {
-			// ignore
-		}
-		return -1;
+	/**
+	 * @param doc the document whose content is streamed; its chars and length are queried lazily while reading.
+	 */
+	DocumentInputStream(final IDocument doc) {
+		super(doc::getChar, doc::getLength, getCharset(doc));
 	}
 }