Skip to content

Commit

Permalink
fix: DocumentInputStream does not handle surrogate pairs correctly
Browse files Browse the repository at this point in the history
  • Loading branch information
sebthom committed Aug 19, 2024
1 parent 27b13e3 commit d8ee729
Show file tree
Hide file tree
Showing 4 changed files with 375 additions and 56 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,220 @@
/*******************************************************************************
* Copyright (c) 2024 Sebastian Thomschke and others.
* This program and the accompanying materials are made
* available under the terms of the Eclipse Public License 2.0
* which is available at https://www.eclipse.org/legal/epl-2.0/
*
* SPDX-License-Identifier: EPL-2.0
*
* Contributors:
* Sebastian Thomschke - initial implementation
*******************************************************************************/
package org.eclipse.tm4e.ui.internal.utils;

import java.io.IOException;
import java.io.InputStream;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;
import java.nio.charset.StandardCharsets;
import java.util.Objects;
import java.util.function.IntSupplier;

import org.eclipse.jdt.annotation.Nullable;

/**
 * {@link InputStream} that encodes an indexed sequence of chars into bytes on demand.
 * <p>
 * Chars are pulled from a {@link CharsSupplier} in chunks of at most <code>bufferSize</code> code points,
 * encoded with the configured charset (UTF-8 if none is given) and handed out byte by byte.
 * Surrogate pairs are never split across two chunks. Unpaired surrogates (high or low) are substituted
 * with a replacement character instead of failing the read.
 * <p>
 * Characters that are well-formed but cannot be mapped by the charset are still reported by the encoder
 * and surface as an {@link IOException}.
 */
class CharsInputStream extends InputStream {

    /** Indexed access to the chars to encode. */
    @FunctionalInterface
    interface CharsSupplier {
        char charAt(int index) throws Exception;
    }

    /** Life cycle of the encoder: still consuming chars, draining its final output, or finished. */
    enum EncoderState {
        ENCODING,
        FLUSHING,
        DONE
    }

    /** Default number of code points (single chars or surrogate pairs) encoded per refill. */
    private static final int DEFAULT_BUFFER_SIZE = 512;
    private static final int EOF = -1;

    private final int bufferSize;
    private final CharBuffer charBuffer;
    private final ByteBuffer byteBuffer;
    private final CharsetEncoder encoder;
    /** Substitute for unpaired surrogates: U+FFFD if the charset can encode it, otherwise '?'. */
    private final char replacementChar;
    private EncoderState encoderState = EncoderState.ENCODING;

    private int charIndex = 0;
    private final CharsSupplier chars;
    private final IntSupplier charsLength;

    /**
     * Encodes the given chars as UTF-8 using the default buffer size.
     */
    CharsInputStream(final CharSequence chars) {
        this(chars, null);
    }

    /**
     * @param charset charset to encode with, <code>null</code> selects UTF-8.
     */
    CharsInputStream(final CharSequence chars, final @Nullable Charset charset) {
        this(chars, charset, DEFAULT_BUFFER_SIZE);
    }

    /**
     * @param charset charset to encode with, <code>null</code> selects UTF-8.
     * @param bufferSize number of code points (single chars or surrogate pairs) to encode at once.
     */
    CharsInputStream(final CharSequence chars, final @Nullable Charset charset, final int bufferSize) {
        this(chars::charAt, chars::length, charset, bufferSize);
    }

    /**
     * @param chars function to access indexed chars.
     * @param charsLength function to get the number of indexed chars provided by the <code>chars</code> parameter.
     */
    CharsInputStream(final CharsSupplier chars, final IntSupplier charsLength) {
        this(chars, charsLength, null);
    }

    /**
     * @param chars function to access indexed chars.
     * @param charsLength function to get the number of indexed chars provided by the <code>chars</code> parameter.
     * @param charset charset to encode with, <code>null</code> selects UTF-8.
     */
    CharsInputStream(final CharsSupplier chars, final IntSupplier charsLength, final @Nullable Charset charset) {
        this(chars, charsLength, charset, DEFAULT_BUFFER_SIZE);
    }

    /**
     * @param chars function to access indexed chars.
     * @param charsLength function to get the number of indexed chars provided by the <code>chars</code> parameter.
     * @param charset charset to encode with, <code>null</code> selects UTF-8.
     * @param bufferSize number of code points (single chars or surrogate pairs) to encode at once.
     * @throws IllegalArgumentException if <code>bufferSize</code> is smaller than 1.
     */
    CharsInputStream(final CharsSupplier chars, final IntSupplier charsLength, final @Nullable Charset charset, final int bufferSize) {
        if (bufferSize < 1)
            throw new IllegalArgumentException("[bufferSize] must be 1 or larger");
        encoder = (charset == null ? StandardCharsets.UTF_8 : charset).newEncoder();

        // must be queried before the first encode() call: canEncode() is illegal while encoding is in progress
        replacementChar = encoder.canEncode('\uFFFD') ? '\uFFFD' : '?';

        this.bufferSize = bufferSize;
        // each of the bufferSize code points may occupy two chars (high + low surrogate)
        charBuffer = CharBuffer.allocate(bufferSize * 2);
        // size the byte buffer for the encoder's worst case (BOM, escape sequences, wide encodings) so that
        // a full char buffer can always be encoded without overflow; otherwise unencoded chars would be lost
        final int worstCaseBytes = (int) Math.ceil(charBuffer.capacity() * (double) encoder.maxBytesPerChar());
        byteBuffer = ByteBuffer.allocate(Math.max(bufferSize * 4, worstCaseBytes));
        // both buffers start out empty in "read" mode
        byteBuffer.flip();
        charBuffer.flip();

        this.chars = chars;
        this.charsLength = charsLength;
    }

    /**
     * @return the number of already encoded bytes if there are any, otherwise the number of chars not yet
     *         consumed (never negative) as a rough estimate.
     */
    @Override
    public int available() {
        final int remaining = byteBuffer.remaining();
        // clamp to 0 in case the char source shrank below the current read position
        return remaining == 0 ? Math.max(0, charsLength.getAsInt() - charIndex) : remaining;
    }

    /**
     * Drains the encoder's final output into the byte buffer.
     *
     * @return <code>true</code> if the byte buffer may contain new bytes and this method may have to be
     *         called again, <code>false</code> if the encoder is finished and produced no further bytes.
     */
    private boolean flushEncoder() throws IOException {
        if (encoderState == EncoderState.DONE)
            return false;

        encoderState = EncoderState.FLUSHING;

        byteBuffer.clear();
        final CoderResult result = encoder.flush(byteBuffer);
        byteBuffer.flip();

        if (result.isOverflow()) // byteBuffer too small, stay in FLUSHING state and continue on next call
            return true;

        if (result.isError()) {
            result.throwException();
        }

        encoderState = EncoderState.DONE;
        return byteBuffer.hasRemaining();
    }

    @Override
    public int read() throws IOException {
        // loop instead of a single check so that a refill that produced no bytes cannot cause a buffer underflow
        while (!byteBuffer.hasRemaining()) {
            if (!refillBuffer())
                return EOF;
        }
        return byteBuffer.get() & 0xFF; // next byte as an unsigned integer (0 to 255)
    }

    @Override
    public int read(final byte[] buf, final int off, final int bytesToRead) throws IOException {
        Objects.checkFromIndexSize(off, bytesToRead, buf.length);
        if (bytesToRead == 0)
            return 0;

        int bytesRead = 0;
        int bytesReadable = byteBuffer.remaining();

        while (bytesRead < bytesToRead) {
            if (bytesReadable == 0) {
                if (refillBuffer()) {
                    bytesReadable = byteBuffer.remaining();
                } else
                    return bytesRead == 0 ? EOF : bytesRead;
            }

            final int bytesToReadNow = Math.min(bytesToRead - bytesRead, bytesReadable);
            byteBuffer.get(buf, off + bytesRead, bytesToReadNow);
            bytesRead += bytesToReadNow;
            bytesReadable -= bytesToReadNow;
        }

        return bytesRead;
    }

    /**
     * Loads the next chunk of chars into the char buffer, never splitting a surrogate pair and substituting
     * unpaired surrogates with the replacement character.
     *
     * @param charsLen number of chars currently provided by the char source.
     * @throws IOException if the char source fails.
     */
    private void fillCharBuffer(final int charsLen) throws IOException {
        charBuffer.clear();
        try {
            for (int i = 0; i < bufferSize && charIndex < charsLen; i++) {
                final char nextChar = chars.charAt(charIndex++);
                if (Character.isHighSurrogate(nextChar)) {
                    if (charIndex < charsLen) {
                        final char lowSurrogate = chars.charAt(charIndex);
                        if (Character.isLowSurrogate(lowSurrogate)) {
                            charIndex++;
                            charBuffer.put(nextChar);
                            charBuffer.put(lowSurrogate);
                            continue;
                        }
                    }
                    // high surrogate without a following low surrogate
                    charBuffer.put(replacementChar);
                } else if (Character.isLowSurrogate(nextChar)) {
                    // low surrogate without a preceding high surrogate; the encoder would reject it as malformed
                    charBuffer.put(replacementChar);
                } else {
                    charBuffer.put(nextChar);
                }
            }
        } catch (final IOException ex) {
            throw ex;
        } catch (final Exception ex) {
            throw new IOException(ex);
        } finally {
            charBuffer.flip();
        }
    }

    /**
     * Encodes the next chunk of chars into the byte buffer or, once all chars are consumed, finalizes
     * and flushes the encoder.
     *
     * @return <code>false</code> if the end of the stream is reached and no further bytes are available.
     */
    private boolean refillBuffer() throws IOException {
        if (encoderState == EncoderState.DONE)
            return false;

        if (encoderState == EncoderState.FLUSHING)
            return flushEncoder();

        final int charsLen = charsLength.getAsInt();

        // if EOF is reached transition to flushing
        if (charIndex >= charsLen) {
            // finalize encoding before switching to flushing
            byteBuffer.clear();
            final CoderResult result = encoder.encode(CharBuffer.allocate(0), byteBuffer, true /* signal EOF */);
            byteBuffer.flip();
            if (result.isError()) {
                result.throwException();
            }
            encoderState = EncoderState.FLUSHING;
            // hand out bytes produced by the final encoding step first, flushing would overwrite them
            if (byteBuffer.hasRemaining())
                return true;
            return flushEncoder();
        }

        fillCharBuffer(charsLen);

        // encode chars into bytes
        byteBuffer.clear();
        final CoderResult result = encoder.encode(charBuffer, byteBuffer, false);
        byteBuffer.flip();
        if (result.isError()) {
            result.throwException();
        }

        return true;
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ private static ContentTypeInfo getContentTypes(final ITextFileBuffer buffer) {
if (bufferContentType != null) {
contentTypes.add(bufferContentType);
}
if (buffer.isDirty()) {
if (buffer.isDirty() && buffer.getDocument() != null) {
// Buffer is dirty (the content on the filesystem is out of sync with
// the editor content), so use the IDocument content.
try (var input = new DocumentInputStream(buffer.getDocument())) {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,71 +1,45 @@
/*******************************************************************************
* Copyright (c) 2005, 2008 IBM Corporation and others.
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the Eclipse Public License v1.0
* which accompanies this distribution, and is available at
* http://www.eclipse.org/legal/epl-v10.html
* Copyright (c) 2024 Sebastian Thomschke and others.
* This program and the accompanying materials are made
* available under the terms of the Eclipse Public License 2.0
* which is available at https://www.eclipse.org/legal/epl-2.0/
*
* SPDX-License-Identifier: EPL-2.0
*
* Contributors:
* IBM Corporation - initial API and implementation
* QNX Software System
* Sebastian Thomschke - implement read(byte[], int, int)
* Sebastian Thomschke - initial implementation
*******************************************************************************/
package org.eclipse.tm4e.ui.internal.utils;

import java.io.IOException;
import java.io.InputStream;
import java.util.Objects;
import java.nio.charset.Charset;

import org.eclipse.jface.text.BadLocationException;
import org.eclipse.core.filebuffers.FileBuffers;
import org.eclipse.core.filebuffers.ITextFileBuffer;
import org.eclipse.core.filebuffers.ITextFileBufferManager;
import org.eclipse.jdt.annotation.Nullable;
import org.eclipse.jface.text.IDocument;
import org.eclipse.tm4e.ui.TMUIPlugin;

/**
* Input stream which reads from a document
*/
final class DocumentInputStream extends InputStream {

private final IDocument doc;
private int pos = 0;

DocumentInputStream(final IDocument document) {
doc = document;
}

@Override
public int read(final byte[] buff, final int buffOffset, final int len) throws IOException {
Objects.checkFromIndexSize(buffOffset, len, buff.length);
final class DocumentInputStream extends CharsInputStream {

if (len == 0)
return 0;

final var docLen = doc.getLength();
if (pos >= docLen)
return -1;

var bytesRead = -1;
private static @Nullable Charset getCharset(final IDocument document) {
final ITextFileBufferManager bufferManager = FileBuffers.getTextFileBufferManager();
if (bufferManager == null)
return null;
final ITextFileBuffer buffer = bufferManager.getTextFileBuffer(document);
if (buffer == null)
return null;
try {
buff[buffOffset] = (byte) doc.getChar(pos++);
bytesRead = 1;
while (bytesRead < len) {
if (pos >= docLen) {
break;
}
buff[buffOffset + bytesRead++] = (byte) doc.getChar(pos++);
}
} catch (final BadLocationException ex) {
// ignore
final String charsetName = buffer.getEncoding();
if (charsetName != null)
return Charset.forName(charsetName);
} catch (final Exception ex) {
TMUIPlugin.logError(ex);
}
return bytesRead;
return null;
}

@Override
public int read() throws IOException {
try {
if (pos < doc.getLength())
return doc.getChar(pos++) & 0xFF;
} catch (final BadLocationException ex) {
// ignore
}
return -1;
DocumentInputStream(final IDocument doc) {
super(doc::getChar, doc::getLength, getCharset(doc));
}
}
Loading

0 comments on commit d8ee729

Please sign in to comment.