Merge branch 'main' of github.com:run-llama/LlamaIndexTS into astra
mfortman11 committed Jan 30, 2024
2 parents 6c46990 + 5692997 commit 0c843b0
Showing 85 changed files with 2,621 additions and 2,548 deletions.
5 changes: 5 additions & 0 deletions .changeset/bright-tips-develop.md
@@ -0,0 +1,5 @@
---
"create-llama": patch
---

Add an option to provide a URL and chat with the website data
5 changes: 0 additions & 5 deletions .changeset/curly-mangos-brake.md

This file was deleted.

32 changes: 30 additions & 2 deletions .github/workflows/test.yml
@@ -32,7 +32,35 @@ jobs:
       - name: Install dependencies
         run: pnpm install
       - name: Build
-        run: pnpm run build
-        working-directory: ./packages/core
+        run: pnpm run build --filter llamaindex
       - name: Run Type Check
         run: pnpm run type-check
+      - name: Run Circular Dependency Check
+        run: pnpm run circular-check
+        working-directory: ./packages/core
+  typecheck-examples:
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v4
+      - uses: pnpm/action-setup@v2
+      - name: Setup Node.js
+        uses: actions/setup-node@v4
+        with:
+          node-version-file: ".nvmrc"
+          cache: "pnpm"
+      - name: Install dependencies
+        run: pnpm install
+      - name: Build
+        run: pnpm run build --filter llamaindex
+      - name: Copy examples
+        run: rsync -rv --exclude=node_modules ./examples ${{ runner.temp }}
+      - name: Pack
+        run: pnpm pack --pack-destination ${{ runner.temp }}
+        working-directory: packages/core
+      - name: Install llamaindex
+        run: npm add ${{ runner.temp }}/*.tgz
+        working-directory: ${{ runner.temp }}/examples
+      - name: Run Type Check
+        run: npx tsc --project ./tsconfig.json
+        working-directory: ${{ runner.temp }}/examples
3 changes: 3 additions & 0 deletions README.md
@@ -105,6 +105,9 @@ export const runtime = "nodejs"; // default
 // next.config.js
 /** @type {import('next').NextConfig} */
 const nextConfig = {
+  experimental: {
+    serverComponentsExternalPackages: ["pdf2json"],
+  },
   webpack: (config) => {
     config.resolve.alias = {
       ...config.resolve.alias,
14 changes: 7 additions & 7 deletions apps/docs/docs/getting_started/concepts.md
@@ -32,12 +32,12 @@ LlamaIndex.TS helps you prepare the knowledge base with a suite of data connector

![](../_static/concepts/indexing.jpg)

-[**Data Loaders**](./modules/high_level/data_loader.md):
+[**Data Loaders**](../modules/data_loader.md):
A data connector (i.e. `Reader`) ingests data from different data sources and data formats into a simple `Document` representation (text and simple metadata).
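
For instance, a minimal sketch of loading files with the bundled `SimpleDirectoryReader` (the `loadData` options shown here are an assumption and may differ between releases):

```ts
import { SimpleDirectoryReader } from "llamaindex";

async function main() {
  // Read every supported file under ./data into Document objects
  const documents = await new SimpleDirectoryReader().loadData({
    directoryPath: "./data",
  });
  console.log(`Loaded ${documents.length} document(s)`);
}

main().catch(console.error);
```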

-[**Documents / Nodes**](./modules/high_level/documents_and_nodes.md): A `Document` is a generic container around any data source - for instance, a PDF, an API output, or retrieved data from a database. A `Node` is the atomic unit of data in LlamaIndex and represents a "chunk" of a source `Document`. It's a rich representation that includes metadata and relationships (to other nodes) to enable accurate and expressive retrieval operations.
+[**Documents / Nodes**](../modules/documents_and_nodes.md): A `Document` is a generic container around any data source - for instance, a PDF, an API output, or retrieved data from a database. A `Node` is the atomic unit of data in LlamaIndex and represents a "chunk" of a source `Document`. It's a rich representation that includes metadata and relationships (to other nodes) to enable accurate and expressive retrieval operations.

-[**Data Indexes**](./modules/high_level/data_index.md):
+[**Data Indexes**](../modules/data_index.md):
Once you've ingested your data, LlamaIndex helps you index data into a format that's easy to retrieve.

Under the hood, LlamaIndex parses the raw documents into intermediate representations, calculates vector embeddings, and stores your data in-memory or to disk.
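
As a sketch of that flow, an in-memory vector index can be built from loaded documents with `VectorStoreIndex.fromDocuments`:

```ts
import { Document, VectorStoreIndex } from "llamaindex";

async function main() {
  const document = new Document({ text: "The author grew up painting." });
  // Parses the document into nodes, computes embeddings, and stores them in memory
  const index = await VectorStoreIndex.fromDocuments([document]);
}

main().catch(console.error);
```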
@@ -60,19 +60,19 @@ These building blocks can be customized to reflect ranking preferences, as well

#### Building Blocks

-[**Retrievers**](./modules/low_level/retriever.md):
+[**Retrievers**](../modules/retriever.md):
A retriever defines how to efficiently retrieve relevant context from a knowledge base (i.e. index) when given a query.
The specific retrieval logic differs across index types, the most popular being dense retrieval against a vector index.
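
A hedged sketch of using a retriever directly (the exact `retrieve` signature has varied across releases):

```ts
import { Document, VectorStoreIndex } from "llamaindex";

async function main() {
  const index = await VectorStoreIndex.fromDocuments([
    new Document({ text: "The author grew up painting." }),
  ]);

  // Dense retrieval: embed the query and fetch the most similar nodes
  const retriever = index.asRetriever();
  const nodesWithScore = await retriever.retrieve("What did the author do?");
  console.log(nodesWithScore.length);
}

main().catch(console.error);
```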

-[**Response Synthesizers**](./modules/low_level/response_synthesizer.md):
+[**Response Synthesizers**](../modules/response_synthesizer.md):
A response synthesizer generates a response from an LLM, using a user query and a given set of retrieved text chunks.

#### Pipelines

-[**Query Engines**](./modules/high_level/query_engine.md):
+[**Query Engines**](../modules/query_engine.md):
A query engine is an end-to-end pipeline that allows you to ask questions over your data.
It takes in a natural language query, and returns a response, along with reference context retrieved and passed to the LLM.

-[**Chat Engines**](./modules/high_level/chat_engine.md):
+[**Chat Engines**](../modules/chat_engine.md):
A chat engine is an end-to-end pipeline for having a conversation with your data
(multiple back-and-forth exchanges instead of a single question & answer).
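
As an illustrative sketch of both pipelines (call signatures vary by release; `ContextChatEngine` is one possible chat engine):

```ts
import { ContextChatEngine, Document, VectorStoreIndex } from "llamaindex";

async function main() {
  const index = await VectorStoreIndex.fromDocuments([
    new Document({ text: "The author grew up painting." }),
  ]);

  // Query engine: single question & answer over the index
  const queryEngine = index.asQueryEngine();
  const response = await queryEngine.query("What did the author do growing up?");
  console.log(response.toString());

  // Chat engine: multi-turn conversation grounded in retrieved context
  const chatEngine = new ContextChatEngine({ retriever: index.asRetriever() });
  const reply = await chatEngine.chat("Tell me more about that.");
  console.log(reply.toString());
}

main().catch(console.error);
```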
2 changes: 1 addition & 1 deletion apps/docs/docs/modules/data_index.md
@@ -1,5 +1,5 @@
---
-sidebar_position: 2
+sidebar_position: 4
---

# Index
2 changes: 1 addition & 1 deletion apps/docs/docs/modules/data_loader.md
@@ -1,5 +1,5 @@
---
-sidebar_position: 1
+sidebar_position: 3
---

# Reader / Loader
2 changes: 2 additions & 0 deletions apps/docs/docs/modules/documents_and_nodes/_category_.yml
@@ -0,0 +1,2 @@
label: "Document / Nodes"
position: 0
45 changes: 45 additions & 0 deletions apps/docs/docs/modules/documents_and_nodes/metadata_extraction.md
@@ -0,0 +1,45 @@
# Metadata Extraction Usage Pattern

You can use LLMs to automate metadata extraction with our `Metadata Extractor` modules.

Our metadata extractor modules include the following "feature extractors":

- `SummaryExtractor` - automatically extracts a summary over a set of Nodes
- `QuestionsAnsweredExtractor` - extracts a set of questions that each Node can answer
- `TitleExtractor` - extracts a title from the context of each Node, then combines them into a title for each document
- `KeywordExtractor` - extracts keywords over the context of each Node

You can then chain these metadata extractors in an `IngestionPipeline` to extract metadata from a set of documents.

```ts
import {
  IngestionPipeline,
  TitleExtractor,
  QuestionsAnsweredExtractor,
  Document,
} from "llamaindex";

async function main() {
  const pipeline = new IngestionPipeline({
    transformations: [
      new TitleExtractor(),
      new QuestionsAnsweredExtractor({
        questions: 5,
      }),
    ],
  });

  const nodes = await pipeline.run({
    documents: [
      new Document({ text: "I am 10 years old. John is 20 years old." }),
    ],
  });

  // Each node now carries the extracted metadata
  for (const node of nodes) {
    console.log(node.metadata);
  }
}

main().then(() => console.log("done"));
```
2 changes: 1 addition & 1 deletion apps/docs/docs/modules/embedding.md
@@ -1,5 +1,5 @@
---
-sidebar_position: 1
+sidebar_position: 3
---

# Embedding
31 changes: 0 additions & 31 deletions apps/docs/docs/modules/index.md

This file was deleted.

2 changes: 2 additions & 0 deletions apps/docs/docs/modules/ingestion_pipeline/_category_.yml
@@ -0,0 +1,2 @@
label: "Ingestion Pipeline"
position: 2
99 changes: 99 additions & 0 deletions apps/docs/docs/modules/ingestion_pipeline/index.md
@@ -0,0 +1,99 @@
# Ingestion Pipeline

An `IngestionPipeline` applies a sequence of `Transformations` to input data.
The resulting nodes are either returned or, if a vector store is given, inserted into it.

## Usage Pattern

The simplest usage is to instantiate an IngestionPipeline like so:

```ts
import fs from "node:fs/promises";

import {
  Document,
  IngestionPipeline,
  MetadataMode,
  OpenAIEmbedding,
  TitleExtractor,
  SimpleNodeParser,
} from "llamaindex";

async function main() {
  // Load essay from abramov.txt in Node
  const path = "node_modules/llamaindex/examples/abramov.txt";

  const essay = await fs.readFile(path, "utf-8");

  // Create Document object with essay
  const document = new Document({ text: essay, id_: path });
  const pipeline = new IngestionPipeline({
    transformations: [
      new SimpleNodeParser({ chunkSize: 1024, chunkOverlap: 20 }),
      new TitleExtractor(),
      new OpenAIEmbedding(),
    ],
  });

  // Run the pipeline
  const nodes = await pipeline.run({ documents: [document] });

  // Print out the result of the pipeline run
  for (const node of nodes) {
    console.log(node.getContent(MetadataMode.NONE));
  }
}

main().catch(console.error);
```

## Connecting to Vector Databases

When running an ingestion pipeline, you can also choose to automatically insert the resulting nodes into a remote vector store.

Then, you can construct an index from that vector store later on.

```ts
import fs from "node:fs/promises";

import {
  Document,
  IngestionPipeline,
  OpenAIEmbedding,
  TitleExtractor,
  SimpleNodeParser,
  QdrantVectorStore,
  VectorStoreIndex,
} from "llamaindex";

async function main() {
  // Load essay from abramov.txt in Node
  const path = "node_modules/llamaindex/examples/abramov.txt";

  const essay = await fs.readFile(path, "utf-8");

  const vectorStore = new QdrantVectorStore({
    host: "http://localhost:6333",
  });

  // Create Document object with essay
  const document = new Document({ text: essay, id_: path });
  const pipeline = new IngestionPipeline({
    transformations: [
      new SimpleNodeParser({ chunkSize: 1024, chunkOverlap: 20 }),
      new TitleExtractor(),
      new OpenAIEmbedding(),
    ],
    vectorStore,
  });

  // Run the pipeline; the resulting nodes are also inserted into the vector store
  const nodes = await pipeline.run({ documents: [document] });

  // Create an index backed by the vector store
  const index = await VectorStoreIndex.fromVectorStore(vectorStore);
}

main().catch(console.error);
```
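
To close the loop, here is a sketch of querying the index created from the vector store (the `query` call signature is an assumption and has varied between releases):

```ts
import { QdrantVectorStore, VectorStoreIndex } from "llamaindex";

async function main() {
  // Reconnect to the same Qdrant collection the pipeline wrote to
  const vectorStore = new QdrantVectorStore({
    host: "http://localhost:6333",
  });
  const index = await VectorStoreIndex.fromVectorStore(vectorStore);

  const queryEngine = index.asQueryEngine();
  const response = await queryEngine.query("What is the essay about?");
  console.log(response.toString());
}

main().catch(console.error);
```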
77 changes: 77 additions & 0 deletions apps/docs/docs/modules/ingestion_pipeline/transformations.md
@@ -0,0 +1,77 @@
# Transformations

A transformation takes a list of nodes as input and returns a list of nodes. Each component that implements the `TransformerComponent` class has a `transform` method responsible for transforming the nodes.

Currently, the following components are Transformation objects:

- [SimpleNodeParser](../../api/classes/SimpleNodeParser.md)
- [MetadataExtractor](../documents_and_nodes/metadata_extraction.md)
- Embeddings

## Usage Pattern

While transformations are best used with an `IngestionPipeline`, they can also be used directly.

```ts
import {
  SimpleNodeParser,
  TitleExtractor,
  Document,
  MetadataMode,
} from "llamaindex";

async function main() {
  let nodes = new SimpleNodeParser().getNodesFromDocuments([
    new Document({ text: "I am 10 years old. John is 20 years old." }),
  ]);

  const titleExtractor = new TitleExtractor();

  nodes = await titleExtractor.transform(nodes);

  for (const node of nodes) {
    console.log(node.getContent(MetadataMode.NONE));
  }
}

main().catch(console.error);
```

## Custom Transformations

You can implement any transformation yourself by extending the `TransformerComponent` class.

The following custom transformation removes any special characters or punctuation from the text.

```ts
import { TransformerComponent, Node } from "llamaindex";

class RemoveSpecialCharacters extends TransformerComponent {
  async transform(nodes: Node[]): Promise<Node[]> {
    for (const node of nodes) {
      node.text = node.text.replace(/[^\w\s]/gi, "");
    }

    return nodes;
  }
}
```

These can then be used directly or in any IngestionPipeline.

```ts
import { IngestionPipeline, Document, MetadataMode } from "llamaindex";

// RemoveSpecialCharacters is the custom transformation defined above
async function main() {
  const pipeline = new IngestionPipeline({
    transformations: [new RemoveSpecialCharacters()],
  });

  const nodes = await pipeline.run({
    documents: [
      new Document({ text: "I am 10 years old. John is 20 years old." }),
    ],
  });

  for (const node of nodes) {
    console.log(node.getContent(MetadataMode.NONE));
  }
}

main().catch(console.error);
```