[examples] Add a chatbot built with LangChain and Astra DB #646

Merged (8 commits) on Oct 25, 2023
149 changes: 149 additions & 0 deletions examples/applications/langchain-chat/.langstreamignore
@@ -0,0 +1,149 @@
# .langstreamignore file inspired by https://github.com/github/gitignore/blob/main/Python.gitignore

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# These folders hold the libs built for the target
# and we need them in the package
!python/lib/
!java/lib/

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
Pipfile.lock

# poetry
poetry.lock

# pdm
pdm.lock
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
.idea/
66 changes: 66 additions & 0 deletions examples/applications/langchain-chat/README.md
@@ -0,0 +1,66 @@
# Simple Chat-bot using LangChain

This sample application shows how to use LangChain to build a chat-bot.

These are the main components:
- DataStax Astra DB to store the documents that support the chat-bot
- cassio.org as the driver that connects to Astra DB (Cassandra-compatible), which acts as the vector database
- OpenAI as the LLM for chat completion and for computing embeddings.

The application is made of two pipelines:
- The chat-bot pipeline, which is built with the LangChain SDK (see the sketch below)
- The vector-database pipeline, which is built with native LangStream agents
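
As a rough illustration of the chat-bot side, here is a minimal sketch of what the LangChain code might look like, assuming the LangChain `Cassandra` vector store and the cassio driver. The token, database ID, and question are placeholders, and exact signatures vary across LangChain/cassio versions; the Python agent shipped with this example is authoritative.

```
# Hypothetical sketch: answer questions over the Astra DB table with LangChain.
import cassio
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Cassandra

# Open the Astra DB connection (placeholders for the real credentials)
cassio.init(token="AstraCS:...", database_id="...")

# Point the vector store at the table populated by the ingestion pipeline
vector_store = Cassandra(
    embedding=OpenAIEmbeddings(),
    session=None,  # recent versions resolve the session from cassio.init()
    keyspace="documents",
    table_name="documents",
)

# Retrieval-augmented chain: fetch similar chunks, then ask the LLM
qa = RetrievalQA.from_chain_type(
    llm=ChatOpenAI(),
    retriever=vector_store.as_retriever(search_kwargs={"k": 5}),
)
print(qa.run("What is LangStream?"))
```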

This is the table schema used by the application:

```
CREATE TABLE documents.documents (
row_id text PRIMARY KEY,
attributes_blob text,
body_blob text,
metadata_s map<text, text>,
vector vector<float, 1536>

) ..
```
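
Once populated, the `vector` column supports similarity search. A hypothetical lookup might look like the following, assuming a vector index exists on the column; the vector literal is elided, and in practice it is the embedding of the user's question:

```
-- Retrieve the five chunks closest to a query embedding
SELECT row_id, body_blob
FROM documents.documents
ORDER BY vector ANN OF [0.12, -0.07 /* ... 1536 floats ... */]
LIMIT 5;
```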


## Configure your OpenAI API Key

Export your OpenAI access key as an environment variable:

```
export OPEN_AI_ACCESS_KEY=...
```

The default [secrets file](../../secrets/secrets.yaml) reads from the environment. Check out the file to learn more about
the default settings; you can change them by exporting other environment variables.
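
For reference, a hedged sketch of the `open-ai` entry in that file (the key names mirror the `secrets.open-ai.*` references in `configuration.yaml`; the environment-variable names other than `OPEN_AI_ACCESS_KEY` are assumptions, and the file in the repository is authoritative):

```
secrets:
  - id: open-ai
    data:
      access-key: "${OPEN_AI_ACCESS_KEY:-}"
      url: "${OPEN_AI_URL:-}"
      provider: "${OPEN_AI_PROVIDER:-openai}"
      embeddings-model: "${OPEN_AI_EMBEDDINGS_MODEL:-text-embedding-ada-002}"
```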

## Configure access to the Vector Database

Export the following environment variables to configure access to the database:

```
export ASTRA_LANGCHAIN_TOKEN=...
export ASTRA_LANGCHAIN_CLIENT_ID=...
export ASTRA_LANGCHAIN_SECRET=...
export ASTRA_LANGCHAIN_DATABASE=...
export ASTRA_LANGCHAIN_DATABASE_ID=...
export ASTRA_LANGCHAIN_KEYSPACE=...
export ASTRA_LANGCHAIN_TABLE=...
```

You can find the credentials in the Astra DB console when you create a token.

The table is automatically created by the application if it doesn't exist (this is handled by cassio).

The examples/secrets/secrets.yaml file resolves those environment variables for you; a hedged sketch of the relevant entry is shown below.
In production, you should create a dedicated secrets.yaml file for each environment.
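
The key names mirror the `secrets.astra-langchain.*` references in `configuration.yaml`; the `database-id`, `keyspace`, and `table` keys are assumptions based on the exported variables:

```
secrets:
  - id: astra-langchain
    data:
      token: "${ASTRA_LANGCHAIN_TOKEN:-}"
      clientId: "${ASTRA_LANGCHAIN_CLIENT_ID:-}"
      secret: "${ASTRA_LANGCHAIN_SECRET:-}"
      database: "${ASTRA_LANGCHAIN_DATABASE:-}"
      database-id: "${ASTRA_LANGCHAIN_DATABASE_ID:-}"
      keyspace: "${ASTRA_LANGCHAIN_KEYSPACE:-}"
      table: "${ASTRA_LANGCHAIN_TABLE:-}"
```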

## Deploy the LangStream application

```
./bin/langstream docker run test -app examples/applications/langchain-chat -s examples/secrets/secrets.yaml
```

Then you can talk with the chat-bot using the UI.
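
To verify that the ingestion pipeline populated the table, a hypothetical spot-check from cqlsh against the keyspace and table configured above:

```
-- Confirm the sink wrote the mapped columns
SELECT row_id, body_blob FROM documents.documents LIMIT 10;
```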
34 changes: 34 additions & 0 deletions examples/applications/langchain-chat/configuration.yaml
@@ -0,0 +1,34 @@
#
# Copyright DataStax, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

configuration:
resources:
- type: "open-ai-configuration"
name: "OpenAI Azure configuration"
configuration:
url: "${secrets.open-ai.url}"
access-key: "${secrets.open-ai.access-key}"
provider: "${secrets.open-ai.provider}"
- type: "datasource"
name: "AstraDatasource"
configuration:
service: "astra"
clientId: "${secrets.astra-langchain.clientId}"
secret: "${secrets.astra-langchain.secret}"
database: "${secrets.astra-langchain.database}"
token: "${secrets.astra-langchain.token}"
environment: "PROD"
105 changes: 105 additions & 0 deletions examples/applications/langchain-chat/crawler.yaml
@@ -0,0 +1,105 @@
#
# Copyright DataStax, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

name: "Crawl a website"
resources:
size: 2
pipeline:
- name: "Crawl the WebSite"
type: "webcrawler-source"
configuration:
seed-urls: ["https://docs.langstream.ai/"]
allowed-domains: ["https://docs.langstream.ai"]
forbidden-paths: []
min-time-between-requests: 500
reindex-interval-seconds: 3600
max-error-count: 5
max-urls: 1000
max-depth: 50
handle-robots-file: true
user-agent: "" # this is computed automatically, but you can override it
scan-html-documents: true
http-timeout: 10000
handle-cookies: true
max-unflushed-pages: 100
bucketName: "${secrets.s3.bucket-name}"
endpoint: "${secrets.s3.endpoint}"
access-key: "${secrets.s3.access-key}"
secret-key: "${secrets.s3.secret}"
region: "${secrets.s3.region}"
- name: "Extract text"
type: "text-extractor"
- name: "Normalise text"
type: "text-normaliser"
configuration:
make-lowercase: true
trim-spaces: true
- name: "Detect language"
type: "language-detector"
configuration:
allowedLanguages: ["en", "fr"]
property: "language"
- name: "Split into chunks"
type: "text-splitter"
configuration:
splitter_type: "RecursiveCharacterTextSplitter"
chunk_size: 400
separators: ["\n\n", "\n", " ", ""]
keep_separator: false
chunk_overlap: 100
length_function: "cl100k_base"
- name: "Convert to structured data"
type: "document-to-json"
configuration:
text-field: text
copy-properties: true
- name: "prepare-structure"
type: "compute"
configuration:
fields:
- name: "value.filename"
expression: "properties.url"
type: STRING
- name: "value.chunk_id"
expression: "properties.chunk_id"
type: STRING
- name: "value.language"
expression: "properties.language"
type: STRING
- name: "value.chunk_num_tokens"
expression: "properties.chunk_num_tokens"
type: STRING
- name: "value.row_id"
expression: "fn:concat3(value.filename, '_', value.chunk_id)"
type: STRING
- name: "compute-embeddings"
id: "step1"
type: "compute-ai-embeddings"
configuration:
model: "${secrets.open-ai.embeddings-model}" # This needs to match the name of the model deployment, not the base model
embeddings-field: "value.embeddings_vector"
text: "{{ value.text }}"
batch-size: 10
flush-interval: 500
- name: "Write to Astra"
type: "vector-db-sink"
resources:
size: 2
configuration:
datasource: "AstraDatasource"
table-name: "documents"
keyspace: "documents"
mapping: "row_id=value.row_id, body_blob=value.text, vector=value.embeddings_vector"