microsoft · johnoliver · Oct 4, 2024 · Oct 7, 2024 · Oct 7, 2024 · Oct 7, 2024
diff --git a/samples/semantickernel-concepts/semantickernel-syntax-examples/pom.xml b/samples/semantickernel-concepts/semantickernel-syntax-examples/pom.xml
@@ -1,5 +1,6 @@
 <?xml version="1.0" encoding="UTF-8"?>
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
     <modelVersion>4.0.0</modelVersion>
     <parent>
         <groupId>com.microsoft.semantic-kernel</groupId>
@@ -81,6 +82,16 @@
             <groupId>com.microsoft.semantic-kernel</groupId>
             <artifactId>semantickernel-aiservices-google</artifactId>
         </dependency>
+        <dependency>
+            <groupId>com.microsoft.semantic-kernel</groupId>
+            <artifactId>semantickernel-text-splitter-plugin</artifactId>
+            <version>${project.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.pdfbox</groupId>
+            <artifactId>pdfbox</artifactId>
+            <version>3.0.3</version>
+        </dependency>
 
         <dependency>
             <groupId>com.google.cloud</groupId>

diff --git a/...ava/com/microsoft/semantickernel/samples/syntaxexamples/rag/DocumentSplittingExample.java b/...ava/com/microsoft/semantickernel/samples/syntaxexamples/rag/DocumentSplittingExample.java
@@ -0,0 +1,159 @@
+// Copyright (c) Microsoft. All rights reserved.
+package com.microsoft.semantickernel.samples.syntaxexamples.rag;
+
+import com.microsoft.semantic.kernel.rag.splitting.Chunk;
+import com.microsoft.semantic.kernel.rag.splitting.Document;
+import com.microsoft.semantic.kernel.rag.splitting.Splitter;
+import com.microsoft.semantic.kernel.rag.splitting.TextSplitter;
+import com.microsoft.semantic.kernel.rag.splitting.document.TextDocument;
+import com.microsoft.semantic.kernel.rag.splitting.overlap.NoOverlapCondition;
+import com.microsoft.semantic.kernel.rag.splitting.splitconditions.CountSplitCondition;
+import com.microsoft.semantic.kernel.rag.splitting.splitconditions.SplitPoint;
+import com.microsoft.semantickernel.implementation.EmbeddedResourceLoader;
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.net.URI;
+import java.net.http.HttpClient;
+import java.net.http.HttpRequest;
+import java.net.http.HttpResponse;
+import java.net.http.HttpResponse.BodyHandlers;
+import java.util.List;
+import java.util.regex.Pattern;
+import java.util.stream.Collectors;
+import org.apache.pdfbox.io.RandomAccessReadBuffer;
+import org.apache.pdfbox.pdfparser.PDFParser;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.text.PDFTextStripper;
+import reactor.core.publisher.Flux;
+import reactor.core.publisher.Mono;
+
+public class DocumentSplittingExample {
+
+    private static String BENEFITS_DOC = "https://raw.githubusercontent.com/Azure-Samples/azure-search-openai-demo-java/refs/heads/main/data/Benefit_Options.pdf";
+
+    private static class PDFDocument implements Document {
+
+        private final byte[] pdf;
+
+        private PDFDocument(byte[] pdf) {
+            this.pdf = pdf;
+        }
+
+        @Override
+        public Flux<String> getContent() {
+            try {
+                PDFParser parser = new PDFParser(
+                    RandomAccessReadBuffer.createBufferFromStream(new ByteArrayInputStream(pdf)));
+                PDDocument document = parser.parse();
+                String text = new PDFTextStripper().getText(document);
+
+                return Flux.just(text);
+            } catch (IOException e) {
+                return Flux.error(e);
+            }
+        }
+    }
+
+    public static void main(String[] args) throws IOException, InterruptedException {
+        useCustomChunker(); // bundled markdown resource, split on header lines
+        useInbuiltChunker(); // downloaded PDF, split using only the builder's own options
+    }
+
+    /**
+     * Downloads the sample PDF and prints each chunk produced by a splitter configured only
+     * through the builder options below (paragraph limit, percentage overlap, whitespace trim).
+     */
+    private static void useInbuiltChunker() throws IOException, InterruptedException {
+        PDFDocument source = new PDFDocument(getPdfDoc());
+
+        Splitter paragraphSplitter = Splitter.builder()
+            .maxParagraphsPerChunk(4)
+            .overlapNPercent(30.0f)
+            .trimWhitespace()
+            .build();
+
+        List<Chunk> pieces = paragraphSplitter.splitDocument(source)
+            .collectList()
+            .block();
+
+        for (Chunk piece : pieces) {
+            System.out.println("=========");
+            System.out.println(piece.getContents());
+        }
+    }
+
+    /**
+     * Splits the bundled {@code example.md} with a custom {@link TextSplitter} that breaks on
+     * markdown headers, then prints the chunks joined by a divider line.
+     */
+    public static void useCustomChunker() throws IOException, InterruptedException {
+        String example = EmbeddedResourceLoader.readFile("example.md",
+            DocumentSplittingExample.class);
+
+        // A header is a line break, optional whitespace, then one or more '#'. The whitespace
+        // class must be spelled "\\s" here: a bare "\s" in a Java string literal is the space
+        // character (Java 15+), so tabs would not match and older compilers reject it.
+        Pattern header = Pattern.compile("(\\r?\\n|\\r)\\s*#+", Pattern.MULTILINE);
+
+        TextSplitter textSplitter = (doc, numTokens) -> {
+            Flux<Integer> splitPoints = Flux.fromStream(header.matcher(doc).results())
+                .map(match -> match.start());
+            return createWindows(doc, splitPoints);
+        };
+
+        // Split into single sections
+        CountSplitCondition condition = new CountSplitCondition(1, textSplitter);
+
+        Splitter splitter = Splitter.builder()
+            .addChunkEndCondition(condition)
+            // No overlap
+            .setOverlapCondition(NoOverlapCondition.build())
+            // Tidy up the text
+            .trimWhitespace()
+            .build();
+
+        String chunks = splitter
+            .splitDocument(new TextDocument(example))
+            .collectList()
+            .map(it -> it.stream()
+                .map(chunk -> chunk.getContents())
+                .collect(Collectors.joining("\n============\n")))
+            .block();
+
+        System.out.println(chunks);
+    }
+
+    /**
+     * Turns a stream of split offsets into consecutive (start, end) ranges covering the document.
+     * Offset 0 and the document length are added as the outer boundaries, e.g.
+     * [ 2, 10, 20, 100 ] -> [ (0, 2), (2, 10), (10, 20), (20, 100), (100, doc.length()) ]
+     *
+     * @param doc the text being split; only its length is used
+     * @param splitPoints offsets at which a new range should begin
+     * @return the ranges in document order (this call blocks until the Flux completes)
+     */
+    private static List<SplitPoint> createWindows(String doc, Flux<Integer> splitPoints) {
+        Flux<Integer> boundaries = Flux.concat(
+            Flux.just(0),
+            splitPoints,
+            Flux.just(doc.length()));
+
+        return boundaries
+            // Overlapping pairs; the trailing buffer holds only the last boundary and is dropped
+            .buffer(2, 1)
+            .filter(pair -> pair.size() > 1)
+            .map(pair -> new SplitPoint(pair.get(0), pair.get(1)))
+            .collectList()
+            .block();
+    }
+
+    /** Downloads the sample PDF, failing with an IOException on any non-200 response. */
+    private static byte[] getPdfDoc() throws IOException, InterruptedException {
+        HttpRequest request = HttpRequest.newBuilder(URI.create(BENEFITS_DOC)).GET().build();
+        HttpResponse<byte[]> doc = HttpClient.newHttpClient()
+            .send(request, BodyHandlers.ofByteArray());
+        if (doc.statusCode() != 200) {
+            throw new IOException("GET " + BENEFITS_DOC + " returned HTTP " + doc.statusCode());
+        }
+        return doc.body();
+    }
+}
diff --git a/...in/resources/com/microsoft/semantickernel/samples/syntaxexamples/rag/example.md b/...in/resources/com/microsoft/semantickernel/samples/syntaxexamples/rag/example.md
@@ -0,0 +1,22 @@
+## Section 1
+
+Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna
+aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis
+aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint
+occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
+
+## Section 2
+
+Another section.
+
+### Subsection 1
+
+1, 2, 3, 4, 5, 6, 7, 8, 9, 10.
+
+# Section 3
+
+This is the last section.
+
+```
+some code
+```
diff --git a/samples/semantickernel-sample-plugins/pom.xml b/samples/semantickernel-sample-plugins/pom.xml
@@ -1,5 +1,6 @@
 <?xml version="1.0" encoding="UTF-8" ?>
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
     <modelVersion>4.0.0</modelVersion>
     <parent>
         <groupId>com.microsoft.semantic-kernel</groupId>
@@ -15,5 +16,6 @@
     <modules>
         <module>semantickernel-openapi-plugin</module>
         <module>semantickernel-presidio-plugin</module>
+        <module>semantickernel-text-splitter-plugin</module>
     </modules>
 </project>
diff --git a/samples/semantickernel-sample-plugins/semantickernel-text-splitter-plugin/pom.xml b/samples/semantickernel-sample-plugins/semantickernel-text-splitter-plugin/pom.xml
@@ -0,0 +1,65 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+    <parent>
+        <groupId>com.microsoft.semantic-kernel</groupId>
+        <artifactId>semantickernel-sample-plugins</artifactId>
+        <version>1.3.1-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+
+    <artifactId>semantickernel-text-splitter-plugin</artifactId>
+    <name>semantickernel-text-splitter-plugin</name>
+    <packaging>jar</packaging>
+
+    <dependencyManagement>
+        <dependencies>
+            <dependency>
+                <groupId>com.microsoft.semantic-kernel</groupId>
+                <artifactId>semantickernel-bom</artifactId>
+                <version>${project.version}</version>
+                <type>pom</type>
+                <scope>import</scope>
+            </dependency>
+        </dependencies>
+    </dependencyManagement>
+
+    <dependencies>
+        <dependency>
+            <groupId>com.microsoft.semantic-kernel</groupId>
+            <artifactId>semantickernel-api</artifactId>
+        </dependency>
+
+        <dependency>
+            <groupId>org.apache.logging.log4j</groupId>
+            <artifactId>log4j-api</artifactId>
+            <scope>runtime</scope>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.logging.log4j</groupId>
+            <artifactId>log4j-core</artifactId>
+            <scope>runtime</scope>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.logging.log4j</groupId>
+            <artifactId>log4j-slf4j2-impl</artifactId>
+            <scope>runtime</scope>
+        </dependency>
+        <dependency>
+            <groupId>com.fasterxml.jackson.core</groupId>
+            <artifactId>jackson-databind</artifactId>
+            <scope>compile</scope>
+        </dependency>
+        <dependency>
+            <groupId>com.fasterxml.jackson.core</groupId>
+            <artifactId>jackson-core</artifactId>
+            <scope>compile</scope>
+        </dependency>
+        <dependency>
+            <groupId>org.junit.jupiter</groupId>
+            <artifactId>junit-jupiter-api</artifactId>
+            <scope>test</scope>
+        </dependency>
+    </dependencies>
+</project>
diff --git a/...text-splitter-plugin/src/main/java/com/microsoft/semantic/kernel/rag/splitting/Chunk.java b/...text-splitter-plugin/src/main/java/com/microsoft/semantic/kernel/rag/splitting/Chunk.java
@@ -0,0 +1,16 @@
+// Copyright (c) Microsoft. All rights reserved.
+package com.microsoft.semantic.kernel.rag.splitting;
+
+/** Holds the text of one chunk produced by splitting a document. */
+public class Chunk {
+    private final String contents;
+
+    public Chunk(String chunk) {
+        this.contents = chunk;
+    }
+
+    /** Returns the text exactly as it was passed to the constructor. */
+    public String getContents() {
+        return contents;
+    }
+}
diff --git a/...r-plugin/src/main/java/com/microsoft/semantic/kernel/rag/splitting/ChunkEndCondition.java b/...r-plugin/src/main/java/com/microsoft/semantic/kernel/rag/splitting/ChunkEndCondition.java
@@ -0,0 +1,22 @@
+// Copyright (c) Microsoft. All rights reserved.
+package com.microsoft.semantic.kernel.rag.splitting;
+
+/**
+ * Defines the condition that should be met for a chunk to be considered full.
+ */
+public interface ChunkEndCondition {
+
+    /**
+     * Accepts a string and returns the number of characters that should be considered as the end
+     * of the FIRST chunk within the string. This method will be called repeatedly until all
+     * chunks are found.
+     * <p>
+     * Return -1 if the value does not contain enough characters to be considered as a full chunk.
+     *
+     * @param value the value to be checked
+     * @return the index of the character that should be considered as the end of the first chunk in
+     * the string, or -1 if the value does not yet hold a full chunk
+     */
+    public int getEndOfNextChunk(String value);
+
+}
diff --git a/...-plugin/src/main/java/com/microsoft/semantic/kernel/rag/splitting/ChunkPostProcessor.java b/...-plugin/src/main/java/com/microsoft/semantic/kernel/rag/splitting/ChunkPostProcessor.java
@@ -0,0 +1,9 @@
+// Copyright (c) Microsoft. All rights reserved.
+package com.microsoft.semantic.kernel.rag.splitting;
+
+/**
+ * Post-processes a chunk after it has been split; {@link #process} returns the processed chunk.
+ */
+public interface ChunkPostProcessor {
+    Chunk process(Chunk chunk);
+}
diff --git a/...t-splitter-plugin/src/main/java/com/microsoft/semantic/kernel/rag/splitting/Document.java b/...t-splitter-plugin/src/main/java/com/microsoft/semantic/kernel/rag/splitting/Document.java
@@ -0,0 +1,11 @@
+// Copyright (c) Microsoft. All rights reserved.
+package com.microsoft.semantic.kernel.rag.splitting;
+
+import reactor.core.publisher.Flux;
+
+/**
+ * A document to be read and split into chunks; {@link #getContent} supplies its text as a Flux.
+ */
+public interface Document {
+    Flux<String> getContent();
+}
diff --git a/...er-plugin/src/main/java/com/microsoft/semantic/kernel/rag/splitting/OverlapCondition.java b/...er-plugin/src/main/java/com/microsoft/semantic/kernel/rag/splitting/OverlapCondition.java
@@ -0,0 +1,19 @@
+// Copyright (c) Microsoft. All rights reserved.
+package com.microsoft.semantic.kernel.rag.splitting;
+
+/**
+ * Defines how much overlap is allowed between two chunks.
+ */
+public interface OverlapCondition {
+
+    /**
+     * Returns the index of the first character that should be considered as the beginning of the
+     * overlap.
+     *
+     * @param chunk the chunk to be checked
+     * @return the index of the first character in {@code chunk} that should be considered as the
+     * beginning of the overlap
+     */
+    public int getOverlapIndex(String chunk);
+
+}