Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add plugin for document text splitting #239

Draft
wants to merge 4 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>com.microsoft.semantic-kernel</groupId>
Expand Down Expand Up @@ -81,6 +82,16 @@
<groupId>com.microsoft.semantic-kernel</groupId>
<artifactId>semantickernel-aiservices-google</artifactId>
</dependency>
<dependency>
<groupId>com.microsoft.semantic-kernel</groupId>
<artifactId>semantickernel-text-splitter-plugin</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>3.0.3</version>
</dependency>

<dependency>
<groupId>com.google.cloud</groupId>
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
// Copyright (c) Microsoft. All rights reserved.
package com.microsoft.semantickernel.samples.syntaxexamples.rag;

import com.microsoft.semantic.kernel.rag.splitting.Chunk;
import com.microsoft.semantic.kernel.rag.splitting.Document;
import com.microsoft.semantic.kernel.rag.splitting.Splitter;
import com.microsoft.semantic.kernel.rag.splitting.TextSplitter;
import com.microsoft.semantic.kernel.rag.splitting.document.TextDocument;
import com.microsoft.semantic.kernel.rag.splitting.overlap.NoOverlapCondition;
import com.microsoft.semantic.kernel.rag.splitting.splitconditions.CountSplitCondition;
import com.microsoft.semantic.kernel.rag.splitting.splitconditions.SplitPoint;
import com.microsoft.semantickernel.implementation.EmbeddedResourceLoader;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.net.http.HttpResponse.BodyHandlers;
import java.util.List;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import org.apache.pdfbox.io.RandomAccessReadBuffer;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import reactor.core.publisher.Flux;
import reactor.core.publisher.Mono;

/**
 * Sample showing two ways of splitting a document into chunks:
 * <ul>
 * <li>{@link #useCustomChunker()} splits a markdown resource on its headers using a custom
 * {@link TextSplitter}.</li>
 * <li>{@code useInbuiltChunker()} downloads a PDF, extracts its text with PDFBox and splits it
 * with the splitter's built-in paragraph and overlap options.</li>
 * </ul>
 */
public class DocumentSplittingExample {

    private static final String BENEFITS_DOC = "https://raw.githubusercontent.com/Azure-Samples/azure-search-openai-demo-java/refs/heads/main/data/Benefit_Options.pdf";

    /**
     * Matches a line break followed by optional whitespace and one or more '#' characters, i.e.
     * the start of a markdown header. Note the doubled backslash in {@code \\s}: a single
     * {@code \s} is a Java string escape for a plain space, not the regex whitespace class.
     * Compiled once because {@link Pattern} instances are immutable and reusable.
     */
    private static final Pattern MARKDOWN_HEADER = Pattern.compile("(\\r?\\n|\\r)\\s*#+",
        Pattern.MULTILINE);

    /**
     * A {@link Document} backed by the raw bytes of a PDF file.
     */
    private static class PDFDocument implements Document {

        private final byte[] pdf;

        private PDFDocument(byte[] pdf) {
            this.pdf = pdf;
        }

        /**
         * Parses the PDF and emits all of its extracted text as a single element.
         *
         * @return a Flux emitting the document text, or an error Flux if the PDF cannot be read
         */
        @Override
        public Flux<String> getContent() {
            // try-with-resources so the parsed document is released even if text
            // extraction fails.
            try (PDDocument document = new PDFParser(
                RandomAccessReadBuffer.createBufferFromStream(new ByteArrayInputStream(pdf)))
                .parse()) {
                return Flux.just(new PDFTextStripper().getText(document));
            } catch (IOException e) {
                return Flux.error(e);
            }
        }
    }

    public static void main(String[] args) throws IOException, InterruptedException {
        useCustomChunker();
        useInbuiltChunker();
    }

    /**
     * Downloads the sample PDF, splits it into chunks of at most 4 paragraphs with 30% overlap
     * and prints every chunk.
     */
    private static void useInbuiltChunker() throws IOException, InterruptedException {
        byte[] pdfBytes = getPdfDoc();
        PDFDocument pdfDoc = new PDFDocument(pdfBytes);

        Splitter splitter = Splitter
            .builder()
            .maxParagraphsPerChunk(4)
            .overlapNPercent(30.0f)
            .trimWhitespace()
            .build();

        List<Chunk> chunks = splitter
            .splitDocument(pdfDoc)
            .collectList()
            .block();

        chunks
            .forEach(chunk -> {
                System.out.println("=========");
                System.out.println(chunk.getContents());
            });
    }

    /**
     * Splits the bundled {@code example.md} resource into one chunk per markdown section and
     * prints the chunks separated by a divider line.
     */
    public static void useCustomChunker() throws IOException, InterruptedException {

        String example = EmbeddedResourceLoader.readFile("example.md",
            DocumentSplittingExample.class);

        // Define how we are splitting tokens, in this case we are splitting on headers of an md file
        // i.e <new line> followed by one or more # characters
        TextSplitter textSplitter = (doc, numTokens) -> {
            Flux<Integer> splitPoints = Flux
                .fromStream(MARKDOWN_HEADER.matcher(doc).results())
                .map(match -> match.start());

            return createWindows(doc, splitPoints);
        };

        // Split into single sections
        CountSplitCondition condition = new CountSplitCondition(1, textSplitter);

        Splitter splitter = Splitter
            .builder()
            .addChunkEndCondition(condition)
            // No overlap
            .setOverlapCondition(NoOverlapCondition.build())
            // Tidy up the text
            .trimWhitespace()
            .build();

        String chunks = splitter
            .splitDocument(new TextDocument(example))
            .collectList()
            .map(it -> it.stream()
                .map(chunk -> chunk.getContents())
                .collect(Collectors.joining("\n============\n")))
            .block();

        System.out.println(chunks);
    }

    /**
     * Turns a sequence of split offsets into consecutive windows covering the whole document.
     * <p>
     * Transforms: [ 2, 10, 20, 100 ] -> [ (0, 2), (2, 10), (10, 20), (20, 100), (100, doc length) ]
     *
     * @param doc         the text being split; its length closes the final window
     * @param splitPoints the offsets at which the text should be split
     * @return the windows as split points, in document order
     */
    private static List<SplitPoint> createWindows(String doc, Flux<Integer> splitPoints) {
        return Flux.concat(
            Flux.just(0),
            splitPoints,
            Flux.just(doc.length()))
            // Sliding window of size 2, step 1, pairs every offset with its successor.
            .window(2, 1)
            .concatMap(window -> {
                return window.collectList()
                    .flatMap(list -> {
                        // The trailing window holds only the final offset; drop it.
                        if (list.size() <= 1) {
                            return Mono.empty();
                        }
                        return Mono.just(
                            new SplitPoint(list.get(0), list.get(1)));
                    });
            })
            .collectList()
            .block();
    }

    /**
     * Downloads the sample benefits PDF.
     *
     * @return the raw bytes of the PDF
     * @throws IOException          if the request fails or the server does not answer with a 2xx
     *                              status
     * @throws InterruptedException if the request is interrupted
     */
    private static byte[] getPdfDoc() throws IOException, InterruptedException {
        HttpRequest request = HttpRequest.newBuilder()
            .GET()
            .uri(URI.create(BENEFITS_DOC))
            .build();

        HttpResponse<byte[]> response = HttpClient.newHttpClient()
            .send(request, BodyHandlers.ofByteArray());

        // Fail fast instead of handing an error page to the PDF parser.
        int status = response.statusCode();
        if (status < 200 || status >= 300) {
            throw new IOException(
                "Failed to download " + BENEFITS_DOC + ": HTTP status " + status);
        }
        return response.body();
    }

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
## Section 1

Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna
aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis
aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint
occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

## Section 2

Another section.

### Subsection 1

1, 2, 3, 4, 5, 6, 7, 8, 9, 10.

# Section 3

This is the last section.

```
some code
```
4 changes: 3 additions & 1 deletion samples/semantickernel-sample-plugins/pom.xml
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
<?xml version="1.0" encoding="UTF-8" ?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>com.microsoft.semantic-kernel</groupId>
Expand All @@ -15,5 +16,6 @@
<modules>
<module>semantickernel-openapi-plugin</module>
<module>semantickernel-presidio-plugin</module>
<module>semantickernel-text-splitter-plugin</module>
</modules>
</project>
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<!-- Build descriptor for the text splitter plugin module. -->
<modelVersion>4.0.0</modelVersion>
<!-- Inherits from the sample-plugins aggregator one directory up. -->
<parent>
<groupId>com.microsoft.semantic-kernel</groupId>
<artifactId>semantickernel-sample-plugins</artifactId>
<version>1.3.1-SNAPSHOT</version>
<relativePath>../pom.xml</relativePath>
</parent>

<artifactId>semantickernel-text-splitter-plugin</artifactId>
<name>semantickernel-text-splitter-plugin</name>
<packaging>jar</packaging>

<!-- Imports the Semantic Kernel BOM at this project's version; the dependencies below
declare no versions of their own. -->
<dependencyManagement>
<dependencies>
<dependency>
<groupId>com.microsoft.semantic-kernel</groupId>
<artifactId>semantickernel-bom</artifactId>
<version>${project.version}</version>
<type>pom</type>
<scope>import</scope>
</dependency>
</dependencies>
</dependencyManagement>

<dependencies>
<dependency>
<groupId>com.microsoft.semantic-kernel</groupId>
<artifactId>semantickernel-api</artifactId>
</dependency>

<!-- Logging: log4j artifacts are runtime-scoped only. -->
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-api</artifactId>
<scope>runtime</scope>
</dependency>
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-core</artifactId>
<scope>runtime</scope>
</dependency>
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-slf4j2-impl</artifactId>
<scope>runtime</scope>
</dependency>
<!-- Jackson is compile-scoped. -->
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-core</artifactId>
<scope>compile</scope>
</dependency>
<!-- Test-only dependency. -->
<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter-api</artifactId>
<scope>test</scope>
</dependency>
</dependencies>
</project>
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
// Copyright (c) Microsoft. All rights reserved.
package com.microsoft.semantic.kernel.rag.splitting;

/**
 * Immutable holder for the text of a single chunk.
 */
public class Chunk {

    /** The text held by this chunk, exactly as supplied to the constructor. */
    private final String contents;

    /**
     * Creates a chunk wrapping the given text.
     *
     * @param chunk the text of the chunk
     */
    public Chunk(String chunk) {
        contents = chunk;
    }

    /**
     * Returns the text of this chunk.
     *
     * @return the text passed to the constructor
     */
    public String getContents() {
        return this.contents;
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
// Copyright (c) Microsoft. All rights reserved.
package com.microsoft.semantic.kernel.rag.splitting;

/**
 * Defines the condition under which a chunk is considered full, i.e. where it ends.
 */
public interface ChunkEndCondition {

    /**
     * Locates the end of the FIRST chunk contained in the given text. Per the contract of this
     * interface, the method is called repeatedly until all chunks have been found.
     * <p>
     * Implementations return -1 when the text does not yet contain enough characters to form a
     * full chunk.
     *
     * @param value the text to be checked
     * @return the index of the character that marks the end of the first chunk in the text, or
     *         -1 if the text does not hold a full chunk
     */
    int getEndOfNextChunk(String value);
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
// Copyright (c) Microsoft. All rights reserved.
package com.microsoft.semantic.kernel.rag.splitting;

/**
 * A post processor that is applied to a chunk after it has been split.
 */
public interface ChunkPostProcessor {
/**
 * Processes the given chunk and returns the resulting chunk.
 *
 * @param chunk the chunk to process
 * @return the processed chunk
 */
Chunk process(Chunk chunk);
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
// Copyright (c) Microsoft. All rights reserved.
package com.microsoft.semantic.kernel.rag.splitting;

import reactor.core.publisher.Flux;

/**
 * A document to be read and split into chunks.
 */
public interface Document {
/**
 * Provides the textual content of this document as a reactive stream of strings.
 *
 * @return a Flux emitting the document's text
 */
Flux<String> getContent();
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
// Copyright (c) Microsoft. All rights reserved.
package com.microsoft.semantic.kernel.rag.splitting;

/**
 * Defines how much overlap is allowed between two consecutive chunks.
 */
public interface OverlapCondition {

    /**
     * Determines where the overlapping region of the given chunk starts.
     *
     * @param chunk the chunk to be checked
     * @return the index of the first character that should be considered the beginning of the
     *         overlap
     */
    int getOverlapIndex(String chunk);
}
Loading