diff --git a/POC_Documents/V1/From Local to Global.docx b/POC_Documents/V1/From Local to Global.docx
deleted file mode 100644
index 267ad247..00000000
Binary files a/POC_Documents/V1/From Local to Global.docx and /dev/null differ
diff --git a/POC_Documents/V1/propsed RAG genAI Architecture.docx b/POC_Documents/V1/propsed RAG genAI Architecture.docx
deleted file mode 100644
index d149b519..00000000
Binary files a/POC_Documents/V1/propsed RAG genAI Architecture.docx and /dev/null differ
diff --git a/POC_Documents/V1/propsed chatbot architecture.jpg b/POC_Documents/V1/propsed chatbot architecture.jpg
deleted file mode 100644
index 5cb87df2..00000000
Binary files a/POC_Documents/V1/propsed chatbot architecture.jpg and /dev/null differ
diff --git a/README.md b/README.md
index a048e6d6..6f4b324d 100644
--- a/README.md
+++ b/README.md
@@ -1,15 +1,34 @@
-
# Knowledge Graph Builder App
-This application is designed to convert PDF documents into a knowledge graph stored in Neo4j. It utilizes the power of OpenAI's GPT/Diffbot LLM(Large language model) to extract nodes, relationships and properties from the text content of the PDF and then organizes them into a structured knowledge graph using Langchain framework.
-Files can be uploaded from local machine or S3 bucket and then LLM model can be chosen to create the knowledge graph.
-### Getting started
+Creating knowledge graphs from unstructured data
+
+
+# LLM Graph Builder
+
+![Python](https://img.shields.io/badge/Python-yellow)
+![FastAPI](https://img.shields.io/badge/FastAPI-green)
+![React](https://img.shields.io/badge/React-blue)
+
+## Overview
+This application is designed to turn unstructured data (PDFs, DOCs, TXT files, YouTube videos, web pages, etc.) into a knowledge graph stored in Neo4j. It utilizes the power of large language models (OpenAI, Gemini, etc.) to extract nodes, relationships and their properties from the text and create a structured knowledge graph using the LangChain framework.
+
+Upload your files from a local machine, a GCS or S3 bucket, or web sources; choose your LLM model; and generate a knowledge graph.
+
+## Key Features
+- **Knowledge Graph Creation**: Transform unstructured data into structured knowledge graphs using LLMs.
+- **Providing Schema**: Provide your own custom schema, or use an existing schema from the settings, to generate the graph.
+- **View Graph**: View the graph for a particular source, or for multiple sources at a time, in Bloom.
+- **Chat with Data**: Interact with your data in a Neo4j database through conversational queries, and retrieve metadata about the sources of the responses to your queries.
+
+## Getting started
:warning: You will need to have a Neo4j Database V5.15 or later with [APOC installed](https://neo4j.com/docs/apoc/current/installation/) to use this Knowledge Graph Builder.
You can use any [Neo4j Aura database](https://neo4j.com/aura/) (including the free database)
If you are using Neo4j Desktop, you will not be able to use the docker-compose but will have to follow the [separate deployment of backend and frontend section](#running-backend-and-frontend-separately-dev-environment). :warning:
-### Deploy locally
+
+## Deployment
+### Local deployment
#### Running through docker-compose
By default only OpenAI and Diffbot are enabled since Gemini requires extra GCP configurations.
@@ -21,13 +40,13 @@ DIFFBOT_API_KEY="your-diffbot-key"
if you only want OpenAI:
```env
-LLM_MODELS="OpenAI GPT 3.5,OpenAI GPT 4o"
+LLM_MODELS="gpt-3.5,gpt-4o"
OPENAI_API_KEY="your-openai-key"
```
if you only want Diffbot:
```env
-LLM_MODELS="Diffbot"
+LLM_MODELS="diffbot"
DIFFBOT_API_KEY="your-diffbot-key"
```
@@ -36,16 +55,16 @@ You can then run Docker Compose to build and start all components:
docker-compose up --build
```
-##### Additional configs
+#### Additional configs
-By default, the input sources will be: Local files, Youtube, Wikipedia and AWS S3. As this default config is applied:
+By default, the input sources will be: Local files, YouTube, Wikipedia, AWS S3 and Web pages. This default config is applied as:
```env
-REACT_APP_SOURCES="local,youtube,wiki,s3"
+REACT_APP_SOURCES="local,youtube,wiki,s3,web"
```
If however you want the Google GCS integration, add `gcs` and your Google client ID:
```env
-REACT_APP_SOURCES="local,youtube,wiki,s3,gcs"
+REACT_APP_SOURCES="local,youtube,wiki,s3,gcs,web"
GOOGLE_CLIENT_ID="xxxx"
```
@@ -76,7 +95,24 @@ Alternatively, you can run the backend and frontend separately:
pip install -r requirements.txt
uvicorn score:app --reload
```
-### ENV
+### Cloud deployment
+To deploy the app and packages on Google Cloud Platform, run the following commands on Google Cloud Run:
+```bash
+# Frontend deploy
+gcloud run deploy
+# Source location: current directory > Frontend
+# Region: 32 (us-central1)
+# Allow unauthenticated requests: Yes
+```
+```bash
+# Backend deploy
+gcloud run deploy --set-env-vars "OPENAI_API_KEY=" --set-env-vars "DIFFBOT_API_KEY=" --set-env-vars "NEO4J_URI=" --set-env-vars "NEO4J_PASSWORD=" --set-env-vars "NEO4J_USERNAME="
+# Source location: current directory > Backend
+# Region: 32 (us-central1)
+# Allow unauthenticated requests: Yes
+```
+
+## ENV
| Env Variable Name | Mandatory/Optional | Default Value | Description |
|-------------------------|--------------------|---------------|--------------------------------------------------------------------------------------------------|
| OPENAI_API_KEY | Mandatory | | API key for OpenAI |
@@ -86,7 +122,7 @@ Alternatively, you can run the backend and frontend separately:
| KNN_MIN_SCORE | Optional | 0.94 | Minimum score for KNN algorithm |
| GEMINI_ENABLED | Optional | False | Flag to enable Gemini |
| GCP_LOG_METRICS_ENABLED | Optional | False | Flag to enable Google Cloud logs |
-| NUMBER_OF_CHUNKS_TO_COMBINE | Optional | 6 | Number of chunks to combine when processing embeddings |
+| NUMBER_OF_CHUNKS_TO_COMBINE | Optional | 5 | Number of chunks to combine when processing embeddings |
| UPDATE_GRAPH_CHUNKS_PROCESSED | Optional | 20 | Number of chunks processed before updating progress |
| NEO4J_URI | Optional | neo4j://database:7687 | URI for Neo4j database |
| NEO4J_USERNAME | Optional | neo4j | Username for Neo4j database |
@@ -98,86 +134,36 @@ Alternatively, you can run the backend and frontend separately:
| BACKEND_API_URL | Optional | http://localhost:8000 | URL for backend API |
| BLOOM_URL | Optional | https://workspace-preview.neo4j.io/workspace/explore?connectURL={CONNECT_URL}&search=Show+me+a+graph&featureGenAISuggestions=true&featureGenAISuggestionsInternal=true | URL for Bloom visualization |
| REACT_APP_SOURCES | Optional | local,youtube,wiki,s3 | List of input sources that will be available |
-| LLM_MODELS | Optional | Diffbot,OpenAI GPT 3.5,OpenAI GPT 4o | Models available for selection on the frontend, used for entities extraction and Q&A Chatbot |
+| LLM_MODELS | Optional | diffbot,gpt-3.5,gpt-4o | Models available for selection on the frontend, used for entities extraction and Q&A Chatbot |
| ENV | Optional | DEV | Environment variable for the app |
| TIME_PER_CHUNK | Optional | 4 | Time per chunk for processing |
-| CHUNK_SIZE | Optional | 5242880 | Size of each chunk for processing |
+| CHUNK_SIZE | Optional | 5242880 | Size of each file chunk for upload |
| GOOGLE_CLIENT_ID | Optional | | Client ID for Google authentication |
+| GCS_FILE_CACHE | Optional | False | If set to True, will save the files to process into GCS. If set to False, will save the files locally |
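+
+For example, a minimal `.env` for a local setup might look like the following (all values are placeholders):
+```env
+OPENAI_API_KEY="your-openai-key"
+DIFFBOT_API_KEY="your-diffbot-key"
+NEO4J_URI="neo4j://localhost:7687"
+NEO4J_USERNAME="neo4j"
+NEO4J_PASSWORD="your-password"
+LLM_MODELS="diffbot,gpt-3.5,gpt-4o"
+REACT_APP_SOURCES="local,youtube,wiki,s3,web"
+```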
-###
-To deploy the app and packages on Google Cloud Platform, run the following command on google cloud run:
-```bash
-# Frontend deploy
-gcloud run deploy
-source location current directory > Frontend
-region : 32 [us-central 1]
-Allow unauthenticated request : Yes
-```
-```bash
-# Backend deploy
-gcloud run deploy --set-env-vars "OPENAI_API_KEY = " --set-env-vars "DIFFBOT_API_KEY = " --set-env-vars "NEO4J_URI = " --set-env-vars "NEO4J_PASSWORD = " --set-env-vars "NEO4J_USERNAME = "
-source location current directory > Backend
-region : 32 [us-central 1]
-Allow unauthenticated request : Yes
-```
-### Features
-- **PDF Upload**: Users can upload PDF documents using the Drop Zone.
-- **S3 Bucket Integration**: Users can also specify PDF documents stored in an S3 bucket for processing.
-- **Knowledge Graph Generation**: The application employs OpenAI/Diffbot's LLM to extract relevant information from the PDFs and construct a knowledge graph.
-- **Neo4j Integration**: The extracted nodes and relationships are stored in a Neo4j database for easy visualization and querying.
-- **Grid View of source node files with** : Name,Type,Size,Nodes,Relations,Duration,Status,Source,Model
-
-## Functions/Modules
-
-#### extract_graph_from_file(uri, userName, password, file_path, model):
- Extracts nodes , relationships and properties from a PDF file leveraging LLM models.
-
- Args:
- uri: URI of the graph to extract
- userName: Username to use for graph creation ( if None will use username from config file )
- password: Password to use for graph creation ( if None will use password from config file )
- file: File object containing the PDF file path to be used
- model: Type of model to use ('Gemini Pro' or 'Diffbot')
-
- Returns:
- Json response to API with fileName, nodeCount, relationshipCount, processingTime,
- status and model as attributes.
-
-
-
-#### create_source_node_graph(uri, userName, password, file):
-
- Creates a source node in Neo4jGraph and sets properties.
-
- Args:
- uri: URI of Graph Service to connect to
- userName: Username to connect to Graph Service with ( default : None )
- password: Password to connect to Graph Service with ( default : None )
- file: File object with information about file to be added
-
- Returns:
- Success or Failure message of node creation
-
-
-
-
-#### get_source_list_from_graph():
-
- Returns a list of file sources in the database by querying the graph and
- sorting the list by the last updated date.
-
-
-
-#### Chunk nodes and embeddings creation in Neo4j
-
-
-
-
-## Application Walkthrough
-https://github.com/neo4j-labs/llm-graph-builder/assets/121786590/b725a503-6ade-46d2-9e70-61d57443c311
+
+## Usage
+1. Connect to a Neo4j Aura instance by passing the URI and password, or by using a Neo4j credentials file.
+2. Choose your source from the list of unstructured sources to create the graph.
+3. If required, change the LLM from the drop-down; it will be used to generate the graph.
+4. Optionally, define the schema (node and relationship labels) in the entity graph extraction settings.
+5. Either select multiple files and click 'Generate Graph', or let all files in 'New' status be processed for graph creation.
+6. Have a look at the graph for individual files using 'View' in the grid, or select one or more files and click 'Preview Graph'.
+7. Ask the chatbot questions related to the processed/completed sources, and get detailed information about the answers generated by the LLM.
## Links
- The Public [ Google cloud Run URL](https://devfrontend-dcavk67s4a-uc.a.run.app).
- [Workspace URL](https://workspace-preview.neo4j.io/workspace)
+[LLM Knowledge Graph Builder Application](https://llm-graph-builder.neo4jlabs.com/)
+
+[Neo4j Workspace](https://workspace-preview.neo4j.io/workspace/query)
+
+## Reference
+
+[Demo of application](https://www.youtube.com/watch?v=LlNy5VmV290)
+
+## Contact
+For any inquiries or support, feel free to raise a [GitHub issue](https://github.com/neo4j-labs/llm-graph-builder/issues)
+
+
+## Happy Graph Building!
\ No newline at end of file
diff --git a/backend/example.env b/backend/example.env
index 20574cc6..fe9124bc 100644
--- a/backend/example.env
+++ b/backend/example.env
@@ -20,4 +20,6 @@ LANGCHAIN_API_KEY = ""
LANGCHAIN_PROJECT = ""
LANGCHAIN_TRACING_V2 = ""
LANGCHAIN_ENDPOINT = ""
-GCS_FILE_CACHE = "" #save the file into GCS or local, SHould be True or False
\ No newline at end of file
+GCS_FILE_CACHE = "" #save the file into GCS or local, SHould be True or False
+NEO4J_USER_AGENT = ""
+ENABLE_USER_AGENT = ""
\ No newline at end of file
diff --git a/backend/score.py b/backend/score.py
index 70cb1048..abfa5007 100644
--- a/backend/score.py
+++ b/backend/score.py
@@ -18,6 +18,7 @@
from src.graphDB_dataAccess import graphDBdataAccess
from src.graph_query import get_graph_results
from src.chunkid_entities import get_entities_from_chunkids
+from src.post_processing import create_fulltext
from sse_starlette.sse import EventSourceResponse
import json
from typing import List, Mapping
@@ -30,7 +31,9 @@
from google.cloud import logging as gclogger
from src.logger import CustomLogger
from datetime import datetime
+from fastapi.middleware.gzip import GZipMiddleware
import time
+import gc
logger = CustomLogger()
CHUNK_DIR = os.path.join(os.path.dirname(__file__), "chunks")
@@ -55,6 +58,7 @@ def sick():
allow_methods=["*"],
allow_headers=["*"],
)
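+# Gzip-compress responses larger than minimum_size bytes.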
+app.add_middleware(GZipMiddleware, minimum_size=1000)
is_gemini_enabled = os.environ.get("GEMINI_ENABLED", "False").lower() in ("true", "1", "yes")
if is_gemini_enabled:
@@ -97,6 +101,9 @@ async def create_source_knowledge_graph_url(
elif source_type == 'gcs bucket':
lst_file_name,success_count,failed_count = create_source_node_graph_url_gcs(graph, model, gcs_project_id, gcs_bucket_name, gcs_bucket_folder, source_type,Credentials(access_token)
)
+ elif source_type == 'web-url':
+ lst_file_name,success_count,failed_count = await asyncio.to_thread(create_source_node_graph_web_url,graph, model, source_url, source_type
+ )
elif source_type == 'youtube':
lst_file_name,success_count,failed_count = await asyncio.to_thread(create_source_node_graph_url_youtube,graph, model, source_url, source_type
)
@@ -107,7 +114,7 @@ async def create_source_knowledge_graph_url(
return create_api_response('Failed',message='source_type is other than accepted source')
message = f"Source Node created successfully for source type: {source_type} and source: {source}"
- josn_obj = {'api_name':'url_scan','db_url':uri,'url_scanned_file':lst_file_name}
+ josn_obj = {'api_name':'url_scan','db_url':uri,'url_scanned_file':lst_file_name, 'source_url':source_url, 'wiki_query':wiki_query}
logger.log_struct(josn_obj)
return create_api_response("Success",message=message,success_count=success_count,failed_count=failed_count,file_name=lst_file_name)
except Exception as e:
@@ -116,6 +123,8 @@ async def create_source_knowledge_graph_url(
logging.exception(f'Exception Stack trace:')
return create_api_response('Failed',message=message + error_message[:80],error=error_message,file_source=source_type)
finally:
+ gc.collect()
+ if graph is not None:
close_db_connection(graph, 'url/scan')
@app.post("/extract")
@@ -167,6 +176,10 @@ async def extract_knowledge_graph_from_file(
elif source_type == 's3 bucket' and source_url:
result = await asyncio.to_thread(
extract_graph_from_file_s3, graph, model, source_url, aws_access_key_id, aws_secret_access_key, allowedNodes, allowedRelationship)
+
+ elif source_type == 'web-url':
+ result = await asyncio.to_thread(
+ extract_graph_from_web_page, graph, model, source_url, allowedNodes, allowedRelationship)
elif source_type == 'youtube' and source_url:
result = await asyncio.to_thread(
@@ -184,6 +197,9 @@ async def extract_knowledge_graph_from_file(
if result is not None:
result['db_url'] = uri
result['api_name'] = 'extract'
+ result['source_url'] = source_url
+ result['wiki_query'] = wiki_query
+ result['source_type'] = source_type
logger.log_struct(result)
return create_api_response('Success', data=result, file_source= source_type)
except Exception as e:
@@ -194,15 +210,19 @@ async def extract_knowledge_graph_from_file(
if source_type == 'local file':
if gcs_file_cache == 'True':
folder_name = create_gcs_bucket_folder_name_hashed(uri,file_name)
+ copy_failed_file(BUCKET_UPLOAD, BUCKET_FAILED_FILE, folder_name, file_name)
+ time.sleep(5)
delete_file_from_gcs(BUCKET_UPLOAD,folder_name,file_name)
else:
logging.info(f'Deleted File Path: {merged_file_path} and Deleted File Name : {file_name}')
delete_uploaded_local_file(merged_file_path,file_name)
- josn_obj = {'message':message,'error_message':error_message, 'file_name': file_name,'status':'Failed','db_url':uri,'failed_count':1, 'source_type': source_type}
+ josn_obj = {'message':message,'error_message':error_message, 'file_name': file_name,'status':'Failed','db_url':uri,'failed_count':1, 'source_type': source_type, 'source_url':source_url, 'wiki_query':wiki_query}
logger.log_struct(josn_obj)
logging.exception(f'File Failed in extraction: {josn_obj}')
return create_api_response('Failed', message=message + error_message[:100], error=error_message, file_name = file_name)
finally:
+ gc.collect()
+ if graph is not None:
close_db_connection(graph, 'extract')
@app.get("/sources_list")
@@ -225,41 +245,51 @@ async def get_source_list(uri:str, userName:str, password:str, database:str=None
logging.exception(f'Exception:{error_message}')
return create_api_response(job_status, message=message, error=error_message)
-@app.post("/update_similarity_graph")
-async def update_similarity_graph(uri=Form(None), userName=Form(None), password=Form(None), database=Form(None)):
- """
- Calls 'update_graph' which post the query to update the similiar nodes in the graph
- """
+@app.post("/post_processing")
+async def post_processing(uri=Form(None), userName=Form(None), password=Form(None), database=Form(None), tasks=Form(None)):
try:
graph = create_graph_database_connection(uri, userName, password, database)
- await asyncio.to_thread(update_graph, graph)
-
- josn_obj = {'api_name':'update_similarity_graph','db_url':uri}
- logger.log_struct(josn_obj)
- return create_api_response('Success',message='Updated KNN Graph')
+ tasks = set(map(str.strip, json.loads(tasks)))
+
+ if "update_similarity_graph" in tasks:
+ await asyncio.to_thread(update_graph, graph)
+ josn_obj = {'api_name': 'post_processing/update_similarity_graph', 'db_url': uri}
+ logger.log_struct(josn_obj)
+ logging.info(f'Updated KNN Graph')
+ if "create_fulltext_index" in tasks:
+ await asyncio.to_thread(create_fulltext, uri=uri, username=userName, password=password, database=database)
+ josn_obj = {'api_name': 'post_processing/create_fulltext_index', 'db_url': uri}
+ logger.log_struct(josn_obj)
+ logging.info(f'Full Text index created')
+
+ return create_api_response('Success', message='All tasks completed successfully')
+
except Exception as e:
job_status = "Failed"
- message="Unable to update KNN Graph"
error_message = str(e)
- logging.exception(f'Exception in update KNN graph:{error_message}')
+ message = f"Unable to complete tasks"
+ logging.exception(f'Exception in post_processing tasks: {error_message}')
return create_api_response(job_status, message=message, error=error_message)
+
finally:
- close_db_connection(graph, 'update_similarity_graph')
+ gc.collect()
+ if graph is not None:
+ close_db_connection(graph, 'post_processing')
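+
+# Illustrative request for this endpoint (host and credentials are placeholders):
+#   curl -X POST http://localhost:8000/post_processing \
+#     -F uri='neo4j://localhost:7687' -F userName='neo4j' -F password='password' \
+#     -F database='neo4j' -F tasks='["update_similarity_graph","create_fulltext_index"]'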
@app.post("/chat_bot")
-async def chat_bot(uri=Form(None),model=Form(None),userName=Form(None), password=Form(None), database=Form(None),question=Form(None), session_id=Form(None)):
+async def chat_bot(uri=Form(None),model=Form(None),userName=Form(None), password=Form(None), database=Form(None),question=Form(None), session_id=Form(None),mode=Form(None)):
logging.info(f"QA_RAG called at {datetime.now()}")
qa_rag_start_time = time.time()
try:
# database = "neo4j"
graph = create_graph_database_connection(uri, userName, password, database)
- result = await asyncio.to_thread(QA_RAG,graph=graph,model=model,question=question,session_id=session_id)
+ result = await asyncio.to_thread(QA_RAG,graph=graph,model=model,question=question,session_id=session_id,mode=mode)
total_call_time = time.time() - qa_rag_start_time
logging.info(f"Total Response time is {total_call_time:.2f} seconds")
result["info"]["response_time"] = round(total_call_time, 2)
- josn_obj = {'api_name':'chat_bot','db_url':uri}
+ josn_obj = {'api_name':'chat_bot','db_url':uri,'session_id':session_id}
logger.log_struct(josn_obj)
return create_api_response('Success',data=result)
except Exception as e:
@@ -268,11 +298,13 @@ async def chat_bot(uri=Form(None),model=Form(None),userName=Form(None), password
error_message = str(e)
logging.exception(f'Exception in chat bot:{error_message}')
return create_api_response(job_status, message=message, error=error_message)
+ finally:
+ gc.collect()
@app.post("/chunk_entities")
async def chunk_entities(uri=Form(None),userName=Form(None), password=Form(None), chunk_ids=Form(None)):
try:
- logging.info(f"URI: {uri}, Username: {userName},password:{password}, chunk_ids: {chunk_ids}")
+ logging.info(f"URI: {uri}, Username: {userName}, chunk_ids: {chunk_ids}")
result = await asyncio.to_thread(get_entities_from_chunkids,uri=uri, username=userName, password=password, chunk_ids=chunk_ids)
josn_obj = {'api_name':'chunk_entities','db_url':uri}
logger.log_struct(josn_obj)
@@ -283,6 +315,8 @@ async def chunk_entities(uri=Form(None),userName=Form(None), password=Form(None)
error_message = str(e)
logging.exception(f'Exception in chat bot:{error_message}')
return create_api_response(job_status, message=message, error=error_message)
+ finally:
+ gc.collect()
@app.post("/graph_query")
async def graph_query(
@@ -302,7 +336,7 @@ async def graph_query(
query_type=query_type,
document_names=document_names
)
- josn_obj = {'api_name':'graph_query','db_url':uri}
+ josn_obj = {'api_name':'graph_query','db_url':uri,'document_names':document_names}
logger.log_struct(josn_obj)
return create_api_response('Success', data=result)
except Exception as e:
@@ -311,6 +345,9 @@ async def graph_query(
error_message = str(e)
logging.exception(f'Exception in graph query: {error_message}')
return create_api_response(job_status, message=message, error=error_message)
+ finally:
+ gc.collect()
+
@app.post("/clear_chat_bot")
async def clear_chat_bot(uri=Form(None),userName=Form(None), password=Form(None), database=Form(None), session_id=Form(None)):
@@ -325,6 +362,8 @@ async def clear_chat_bot(uri=Form(None),userName=Form(None), password=Form(None)
logging.exception(f'Exception in chat bot:{error_message}')
return create_api_response(job_status, message=message, error=error_message)
finally:
+ gc.collect()
+ if graph is not None:
close_db_connection(graph, 'clear_chat_bot')
@app.post("/connect")
@@ -363,6 +402,8 @@ async def upload_large_file_into_chunks(file:UploadFile = File(...), chunkNumber
logging.exception(f'Exception:{error_message}')
return create_api_response('Failed', message=message + error_message[:100], error=error_message, file_name = originalname)
finally:
+ gc.collect()
+ if graph is not None:
close_db_connection(graph, 'upload')
@app.post("/schema")
@@ -381,6 +422,8 @@ async def get_structured_schema(uri=Form(None), userName=Form(None), password=Fo
logging.exception(f'Exception:{error_message}')
return create_api_response("Failed", message=message, error=error_message)
finally:
+ gc.collect()
+ if graph is not None:
close_db_connection(graph, 'schema')
def decode_password(pwd):
@@ -424,13 +467,13 @@ async def generate():
return EventSourceResponse(generate(),ping=60)
@app.post("/delete_document_and_entities")
-async def delete_document_and_entities(uri=Form(None),
- userName=Form(None),
- password=Form(None),
- database=Form(None),
- filenames=Form(None),
- source_types=Form(None),
- deleteEntities=Form(None)):
+async def delete_document_and_entities(uri=Form(),
+ userName=Form(),
+ password=Form(),
+ database=Form(),
+ filenames=Form(),
+ source_types=Form(),
+ deleteEntities=Form()):
try:
graph = create_graph_database_connection(uri, userName, password, database)
graphDb_data_Access = graphDBdataAccess(graph)
@@ -447,6 +490,8 @@ async def delete_document_and_entities(uri=Form(None),
logging.exception(f'{message}:{error_message}')
return create_api_response(job_status, message=message, error=error_message)
finally:
+ gc.collect()
+ if graph is not None:
close_db_connection(graph, 'delete_document_and_entities')
@app.get('/document_status/{file_name}')
@@ -497,7 +542,9 @@ async def cancelled_job(uri=Form(None), userName=Form(None), password=Form(None)
logging.exception(f'Exception in cancelling the running job:{error_message}')
return create_api_response(job_status, message=message, error=error_message)
finally:
- close_db_connection(graph, 'cancelled_job')
+ gc.collect()
+ if graph is not None:
+ close_db_connection(graph, 'cancelled_job')
@app.post("/populate_graph_schema")
async def populate_graph_schema(input_text=Form(None), model=Form(None), is_schema_description_checked=Form(None)):
@@ -510,6 +557,44 @@ async def populate_graph_schema(input_text=Form(None), model=Form(None), is_sche
error_message = str(e)
logging.exception(f'Exception in getting the schema from text:{error_message}')
return create_api_response(job_status, message=message, error=error_message)
+ finally:
+ gc.collect()
+
+@app.post("/get_unconnected_nodes_list")
+async def get_unconnected_nodes_list(uri=Form(), userName=Form(), password=Form(), database=Form()):
+ try:
+ graph = create_graph_database_connection(uri, userName, password, database)
+ graphDb_data_Access = graphDBdataAccess(graph)
+ result = graphDb_data_Access.list_unconnected_nodes()
+ return create_api_response('Success',data=result)
+ except Exception as e:
+ job_status = "Failed"
+ message="Unable to get the list of unconnected nodes"
+ error_message = str(e)
+ logging.exception(f'Exception in getting list of unconnected nodes:{error_message}')
+ return create_api_response(job_status, message=message, error=error_message)
+ finally:
+ if graph is not None:
+ close_db_connection(graph,"get_unconnected_nodes_list")
+ gc.collect()
+
+@app.post("/delete_unconnected_nodes")
+async def delete_unconnected_nodes(uri=Form(), userName=Form(), password=Form(), database=Form(),unconnected_entities_list=Form()):
+ try:
+ graph = create_graph_database_connection(uri, userName, password, database)
+ graphDb_data_Access = graphDBdataAccess(graph)
+ result = graphDb_data_Access.delete_unconnected_nodes(unconnected_entities_list)
+ return create_api_response('Success',data=result,message="Unconnected entities delete successfully")
+ except Exception as e:
+ job_status = "Failed"
+ message="Unable to delete the unconnected nodes"
+ error_message = str(e)
+ logging.exception(f'Exception in deleting the unconnected nodes:{error_message}')
+ return create_api_response(job_status, message=message, error=error_message)
+ finally:
+ if graph is not None:
+ close_db_connection(graph,"delete_unconnected_nodes")
+ gc.collect()
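+
+# Note: unconnected_entities_list is expected to be a JSON-encoded array of
+# Neo4j element IDs, e.g. '["4:abc123:42"]' (illustrative values); the data-access
+# layer json.loads() it and DETACH DELETEs the matching nodes.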
if __name__ == "__main__":
uvicorn.run(app)
\ No newline at end of file
diff --git a/backend/src/QA_integration_new.py b/backend/src/QA_integration_new.py
index 8a706369..20347d07 100644
--- a/backend/src/QA_integration_new.py
+++ b/backend/src/QA_integration_new.py
@@ -2,9 +2,6 @@
from langchain.graphs import Neo4jGraph
import os
from dotenv import load_dotenv
-from langchain_openai import ChatOpenAI
-from langchain_google_vertexai import ChatVertexAI
-from langchain_google_vertexai import HarmBlockThreshold, HarmCategory
import logging
from langchain_community.chat_message_histories import Neo4jChatMessageHistory
from src.shared.common_fn import load_embedding_model, get_llm
@@ -28,74 +25,13 @@
EMBEDDING_MODEL = os.getenv('EMBEDDING_MODEL')
EMBEDDING_FUNCTION , _ = load_embedding_model(EMBEDDING_MODEL)
-RETRIEVAL_QUERY = """
-WITH node as chunk, score
-MATCH (chunk)-[:PART_OF]->(d:Document)
-CALL { WITH chunk
-MATCH (chunk)-[:HAS_ENTITY]->(e)
-MATCH path=(e)(()-[rels:!HAS_ENTITY&!PART_OF]-()){0,2}(:!Chunk&!Document)
-UNWIND rels as r
-RETURN collect(distinct r) as rels
-}
-WITH d, collect(distinct chunk) as chunks, avg(score) as score, apoc.coll.toSet(apoc.coll.flatten(collect(rels))) as rels
-WITH d, score,
-[c in chunks | c.text] as texts, [c in chunks | c.id] as chunkIds, [c in chunks | c.start_time] as start_time, [c in chunks | c.page_number] as page_numbers, [c in chunks | c.start_time] as start_times,
-[r in rels | coalesce(apoc.coll.removeAll(labels(startNode(r)),['__Entity__'])[0],"") +":"+ startNode(r).id + " "+ type(r) + " " + coalesce(apoc.coll.removeAll(labels(endNode(r)),['__Entity__'])[0],"") +":" + endNode(r).id] as entities
-WITH d, score,
-apoc.text.join(texts,"\n----\n") +
-apoc.text.join(entities,"\n")
-as text, entities, chunkIds, page_numbers ,start_times
-RETURN text, score, {source: COALESCE(CASE WHEN d.url CONTAINS "None" THEN d.fileName ELSE d.url END, d.fileName), chunkIds:chunkIds, page_numbers:page_numbers,start_times:start_times,entities:entities} as metadata
-"""
-
-SYSTEM_TEMPLATE = """
-You are an AI-powered question-answering agent. Your task is to provide accurate and comprehensive responses to user queries based on the given context, chat history, and available resources.
-
-### Response Guidelines:
-1. **Direct Answers**: Provide clear and thorough answers to the user's queries without headers unless requested. Avoid speculative responses.
-2. **Utilize History and Context**: Leverage relevant information from previous interactions, the current user input, and the context provided below.
-3. **No Greetings in Follow-ups**: Start with a greeting in initial interactions. Avoid greetings in subsequent responses unless there's a significant break or the chat restarts.
-4. **Admit Unknowns**: Clearly state if an answer is unknown. Avoid making unsupported statements.
-5. **Avoid Hallucination**: Only provide information based on the context provided. Do not invent information.
-6. **Response Length**: Keep responses concise and relevant. Aim for clarity and completeness within 4-5 sentences unless more detail is requested.
-7. **Tone and Style**: Maintain a professional and informative tone. Be friendly and approachable.
-8. **Error Handling**: If a query is ambiguous or unclear, ask for clarification rather than providing a potentially incorrect answer.
-9. **Fallback Options**: If the required information is not available in the provided context, provide a polite and helpful response. Example: "I don't have that information right now." or "I'm sorry, but I don't have that information. Is there something else I can help with?"
-10. **Context Availability**: If the context is empty, do not provide answers based solely on internal knowledge. Instead, respond appropriately by indicating the lack of information.
-
-
-**IMPORTANT** : DO NOT ANSWER FROM YOUR KNOWLEDGE BASE USE THE BELOW CONTEXT
-
-### Context:
-
-{context}
-
-
-### Example Responses:
-User: Hi
-AI Response: 'Hello there! How can I assist you today?'
-
-User: "What is Langchain?"
-AI Response: "Langchain is a framework that enables the development of applications powered by large language models, such as chatbots. It simplifies the integration of language models into various applications by providing useful tools and components."
-
-User: "Can you explain how to use memory management in Langchain?"
-AI Response: "Langchain's memory management involves utilizing built-in mechanisms to manage conversational context effectively. It ensures that the conversation remains coherent and relevant by maintaining the history of interactions and using it to inform responses."
-
-User: "I need help with PyCaret's classification model."
-AI Response: "PyCaret simplifies the process of building and deploying machine learning models. For classification tasks, you can use PyCaret's setup function to prepare your data. After setup, you can compare multiple models to find the best one, and then fine-tune it for better performance."
-
-User: "What can you tell me about the latest realtime trends in AI?"
-AI Response: "I don't have that information right now. Is there something else I can help with?"
-
-Note: This system does not generate answers based solely on internal knowledge. It answers from the information provided in the user's current and previous inputs, and from the context.
-"""
-
-def get_neo4j_retriever(graph, index_name="vector", search_k=CHAT_SEARCH_KWARG_K, score_threshold=CHAT_SEARCH_KWARG_SCORE_THRESHOLD):
+
+def get_neo4j_retriever(graph, retrieval_query,index_name="vector", search_k=CHAT_SEARCH_KWARG_K, score_threshold=CHAT_SEARCH_KWARG_SCORE_THRESHOLD):
try:
neo_db = Neo4jVector.from_existing_index(
embedding=EMBEDDING_FUNCTION,
index_name=index_name,
- retrieval_query=RETRIEVAL_QUERY,
+ retrieval_query=retrieval_query,
graph=graph
)
logging.info(f"Successfully retrieved Neo4jVector index '{index_name}'")
@@ -107,20 +43,16 @@ def get_neo4j_retriever(graph, index_name="vector", search_k=CHAT_SEARCH_KWARG_K
return None
def create_document_retriever_chain(llm,retriever):
- question_template= "Given the below conversation, generate a search query to look up in order to get information relevant to the conversation. Only respond with the query, nothing else."
-
query_transform_prompt = ChatPromptTemplate.from_messages(
[
- ("system", question_template),
+ ("system", QUESTION_TRANSFORM_TEMPLATE),
MessagesPlaceholder(variable_name="messages")
]
)
output_parser = StrOutputParser()
- splitter = TokenTextSplitter(chunk_size=2000, chunk_overlap=0)
- # extractor = LLMChainExtractor.from_llm(llm)
- # redundant_filter = EmbeddingsRedundantFilter(embeddings=EMBEDDING_FUNCTION)
- embeddings_filter = EmbeddingsFilter(embeddings=EMBEDDING_FUNCTION, similarity_threshold=0.25)
+ splitter = TokenTextSplitter(chunk_size=CHAT_DOC_SPLIT_SIZE, chunk_overlap=0)
+ embeddings_filter = EmbeddingsFilter(embeddings=EMBEDDING_FUNCTION, similarity_threshold=CHAT_EMBEDDING_FILTER_SCORE_THRESHOLD)
pipeline_compressor = DocumentCompressorPipeline(
transformers=[splitter, embeddings_filter]
@@ -157,9 +89,14 @@ def create_neo4j_chat_message_history(graph, session_id):
logging.error(f"Error creating Neo4jChatMessageHistory: {e}")
return None
-def format_documents(documents):
+def format_documents(documents,model):
+ prompt_token_cutoff = 4
+ for models,value in CHAT_TOKEN_CUT_OFF.items():
+ if model in models:
+ prompt_token_cutoff = value
+
sorted_documents = sorted(documents, key=lambda doc: doc.state["query_similarity_score"], reverse=True)
- sorted_documents = sorted_documents[:7]
+ sorted_documents = sorted_documents[:prompt_token_cutoff]
formatted_docs = []
sources = set()
@@ -178,7 +115,7 @@ def format_documents(documents):
return "\n\n".join(formatted_docs), sources
-def get_rag_chain(llm,system_template=SYSTEM_TEMPLATE):
+def get_rag_chain(llm,system_template=CHAT_SYSTEM_TEMPLATE):
question_answering_prompt = ChatPromptTemplate.from_messages(
[
("system", system_template),
@@ -193,54 +130,26 @@ def get_rag_chain(llm,system_template=SYSTEM_TEMPLATE):
return question_answering_chain
-def update_timestamps_with_min_seconds(result_dict):
- def time_to_seconds(time_str):
- h, m, s = map(int, time_str.split(':'))
- return h * 3600 + m * 60 + s
-
- for source in result_dict.get('sources', []):
- time_stamps = source.get('start_time', [])
- if time_stamps:
- seconds_list = [time_to_seconds(ts) for ts in time_stamps]
- min_seconds = min(seconds_list)
- source['start_time'] = min_seconds
-
- return result_dict
-
def get_sources_and_chunks(sources_used, docs):
- docs_metadata = dict()
+ chunkdetails_list = []
+ sources_used_set = set(sources_used)
+
for doc in docs:
source = doc.metadata["source"]
- chunkids = doc.metadata["chunkIds"]
- page_numbers = doc.metadata["page_numbers"]
- start_times = doc.metadata["start_times"]
- docs_metadata[source] = [chunkids,page_numbers,start_times]
- chunkids = list()
- output_sources = list()
- for source in sources_used:
- if source in set(docs_metadata.keys()):
- chunkids.extend(docs_metadata[source][0])
- page_numbers = docs_metadata[source][1]
- start_times = docs_metadata[source][2]
- current_source = {
- "source_name":source,
- "page_numbers":page_numbers if len(page_numbers) > 1 and page_numbers[0] is not None else [],
- "start_time": start_times if len(start_times) > 1 and start_times[0] is not None else [],
- }
- output_sources.append(current_source)
+ chunkdetails = doc.metadata["chunkdetails"]
+ if source in sources_used_set:
+ chunkdetails = [{**chunkdetail, "score": round(chunkdetail["score"], 4)} for chunkdetail in chunkdetails]
+ chunkdetails_list.extend(chunkdetails)
result = {
- 'sources': output_sources,
- 'chunkIds': chunkids
+ 'sources': sources_used,
+ 'chunkdetails': chunkdetails_list
}
-
- result = update_timestamps_with_min_seconds(result)
return result
def summarize_messages(llm,history,stored_messages):
if len(stored_messages) == 0:
return False
- # summarization_template = "Distill the below chat messages into a single summary message. Include as many specific details as you can."
summarization_prompt = ChatPromptTemplate.from_messages(
[
MessagesPlaceholder(variable_name="chat_history"),
@@ -273,75 +182,98 @@ def clear_chat_history(graph,session_id):
"user": "chatbot"
}
-def QA_RAG(graph,model,question,session_id):
+def setup_chat(model, graph, session_id, retrieval_query):
+ start_time = time.time()
+ model_version = MODEL_VERSIONS[model]
+ llm = get_llm(model_version)
+ retriever = get_neo4j_retriever(graph=graph,retrieval_query=retrieval_query)
+ doc_retriever = create_document_retriever_chain(llm, retriever)
+ history = create_neo4j_chat_message_history(graph, session_id)
+ chat_setup_time = time.time() - start_time
+ logging.info(f"Chat setup completed in {chat_setup_time:.2f} seconds")
+
+ return llm, doc_retriever, history, model_version
+
+def retrieve_documents(doc_retriever, messages):
+ start_time = time.time()
+ docs = doc_retriever.invoke({"messages": messages})
+ doc_retrieval_time = time.time() - start_time
+ logging.info(f"Documents retrieved in {doc_retrieval_time:.2f} seconds")
+ return docs
+
+def process_documents(docs, question, messages, llm,model):
+ start_time = time.time()
+ formatted_docs, sources = format_documents(docs,model)
+ rag_chain = get_rag_chain(llm=llm)
+ ai_response = rag_chain.invoke({
+ "messages": messages[:-1],
+ "context": formatted_docs,
+ "input": question
+ })
+ result = get_sources_and_chunks(sources, docs)
+ content = ai_response.content
+
+ if "gemini" in model:
+ total_tokens = ai_response.response_metadata['usage_metadata']['prompt_token_count']
+ else:
+ total_tokens = ai_response.response_metadata['token_usage']['total_tokens']
+
+ predict_time = time.time() - start_time
+ logging.info(f"Final Response predicted in {predict_time:.2f} seconds")
+
+ return content, result, total_tokens
+
+def summarize_and_log(history, messages, llm):
+ start_time = time.time()
+ summarize_messages(llm, history, messages)
+ history_summarized_time = time.time() - start_time
+ logging.info(f"Chat History summarized in {history_summarized_time:.2f} seconds")
+
+def QA_RAG(graph, model, question, session_id, mode):
try:
- start_time = time.time()
- print(model)
- model_version = MODEL_VERSIONS[model]
- llm = get_llm(model_version)
- retriever = get_neo4j_retriever(graph=graph)
- doc_retriever = create_document_retriever_chain(llm,retriever)
- history = create_neo4j_chat_message_history(graph,session_id )
- chat_setup_time = time.time() - start_time
- logging.info(f"Chat setup completed in {chat_setup_time:.2f} seconds")
-
- start_time = time.time()
+ logging.info(f"Chat Mode : {mode}")
+ if mode == "vector":
+ retrieval_query = VECTOR_SEARCH_QUERY
+ elif mode == "graph":
+ #WIP
+ result = {
+ "session_id": session_id,
+ "user": "chatbot"
+ }
+ return result
+ else:
+ retrieval_query = VECTOR_GRAPH_SEARCH_QUERY
+
+ llm, doc_retriever, history, model_version = setup_chat(model, graph, session_id, retrieval_query)
messages = history.messages
user_question = HumanMessage(content=question)
messages.append(user_question)
- docs = doc_retriever.invoke(
- {
- "messages":messages
- }
- )
+
+ docs = retrieve_documents(doc_retriever, messages)
+
if docs:
- # print(docs)
- formatted_docs,sources = format_documents(docs)
-
- doc_retrieval_time = time.time() - start_time
- logging.info(f"Modified question and Documents retrieved in {doc_retrieval_time:.2f} seconds")
-
- start_time = time.time()
- rag_chain = get_rag_chain(llm=llm)
- ai_response = rag_chain.invoke(
- {
- "messages" : messages[:-1],
- "context" : formatted_docs,
- "input" : question
- }
- )
- result = get_sources_and_chunks(sources,docs)
- content = ai_response.content
- if "Gemini" in model:
- total_tokens = ai_response.response_metadata['usage_metadata']['prompt_token_count']
- else:
- total_tokens = ai_response.response_metadata['token_usage']['total_tokens']
- predict_time = time.time() - start_time
- logging.info(f"Final Response predicted in {predict_time:.2f} seconds")
+ content, result, total_tokens = process_documents(docs, question, messages, llm,model)
else:
- ai_response = AIMessage(content="I couldn't find any relevant documents to answer your question.")
- result = {"sources": [], "chunkIds": []}
+ content = "I couldn't find any relevant documents to answer your question."
+ result = {"sources": [], "chunkdetails": []}
total_tokens = 0
- content = ai_response.content
-
- start_time = time.time()
+
+ ai_response = AIMessage(content=content)
messages.append(ai_response)
- summarize_messages(llm,history,messages)
- history_summarized_time = time.time() - start_time
- logging.info(f"Chat History summarized in {history_summarized_time:.2f} seconds")
-
+ summarize_and_log(history, messages, llm)
+
return {
"session_id": session_id,
"message": content,
"info": {
"sources": result["sources"],
"model": model_version,
- "chunkids":result["chunkIds"],
+ "chunkdetails": result["chunkdetails"],
"total_tokens": total_tokens,
"response_time": 0
},
"user": "chatbot"
- }
+ }
except Exception as e:
logging.exception(f"Exception in QA component at {datetime.now()}: {str(e)}")
@@ -354,4 +286,5 @@ def QA_RAG(graph,model,question,session_id):
"chunkids": [],
"error": f"{error_name} :- {str(e)}"
},
- "user": "chatbot"}
+ "user": "chatbot"
+ }
diff --git a/backend/src/chunkid_entities.py b/backend/src/chunkid_entities.py
index aeaf6659..9785403a 100644
--- a/backend/src/chunkid_entities.py
+++ b/backend/src/chunkid_entities.py
@@ -102,14 +102,23 @@ def get_entities_from_chunkids(uri, username, password, chunk_ids):
"""
try:
logging.info(f"Starting graph query process for chunk ids")
- chunk_ids_list = chunk_ids.split(",")
- driver = get_graphDB_driver(uri, username, password)
- records, summary, keys = driver.execute_query(CHUNK_QUERY, chunksIds=chunk_ids_list)
- result = process_records(records)
- logging.info(f"Nodes and relationships are processed")
- result["chunk_data"] = process_chunk_data(records)
- logging.info(f"Query process completed successfully for chunk ids")
- return result
+ if chunk_ids:
+ chunk_ids_list = chunk_ids.split(",")
+ driver = get_graphDB_driver(uri, username, password)
+ records, summary, keys = driver.execute_query(CHUNK_QUERY, chunksIds=chunk_ids_list)
+ result = process_records(records)
+ logging.info(f"Nodes and relationships are processed")
+ result["chunk_data"] = process_chunk_data(records)
+ logging.info(f"Query process completed successfully for chunk ids")
+ return result
+ else:
+ logging.info(f"chunkid_entities module: No chunk ids are passed")
+ result = {
+ "nodes": [],
+ "relationships": [],
+ "chunk_data":[]
+ }
+ return result
except Exception as e:
logging.error(f"chunkid_entities module: An error occurred in get_entities_from_chunkids. Error: {str(e)}")
diff --git a/backend/src/document_sources/gcs_bucket.py b/backend/src/document_sources/gcs_bucket.py
index 7c5dc4a5..5d8ba90c 100644
--- a/backend/src/document_sources/gcs_bucket.py
+++ b/backend/src/document_sources/gcs_bucket.py
@@ -139,3 +139,14 @@ def delete_file_from_gcs(bucket_name,folder_name, file_name):
logging.info('File deleted from GCS successfully')
except Exception as e:
raise Exception(e)
+
+def copy_failed_file(source_bucket_name,dest_bucket_name,folder_name, file_name):
+ try:
+ storage_client = storage.Client()
+ bucket = storage_client.bucket(source_bucket_name)
+ folder_file_name = folder_name +'/'+file_name
+ source_blob = bucket.blob(folder_file_name)
+ dest_bucket = storage_client.bucket(dest_bucket_name)
+ bucket.copy_blob(source_blob,dest_bucket,file_name)
+ logging.info(f'Failed file {file_name} copied to {dest_bucket_name} from {source_bucket_name} in GCS successfully')
+ except Exception as e:
+ raise Exception(e)
diff --git a/backend/src/document_sources/web_pages.py b/backend/src/document_sources/web_pages.py
new file mode 100644
index 00000000..39f2fb85
--- /dev/null
+++ b/backend/src/document_sources/web_pages.py
@@ -0,0 +1,16 @@
+import logging
+from langchain_community.document_loaders import WebBaseLoader
+from src.api_response import create_api_response
+
+def get_documents_from_web_page(source_url:str):
+ try:
+ pages = WebBaseLoader(source_url).load()
+ file_name = pages[0].metadata['title']
+ return file_name, pages
+ except Exception as e:
+ job_status = "Failed"
+ message="Failed To Process Web URL"
+ error_message = str(e)
+ logging.error(f"Failed To Process Web URL: {file_name}")
+ logging.exception(f'Exception Stack trace: {error_message}')
+ return create_api_response(job_status,message=message,error=error_message,file_name=file_name)
\ No newline at end of file
diff --git a/backend/src/graphDB_dataAccess.py b/backend/src/graphDB_dataAccess.py
index 3c0ec191..367de329 100644
--- a/backend/src/graphDB_dataAccess.py
+++ b/backend/src/graphDB_dataAccess.py
@@ -213,4 +213,25 @@ def delete_file_from_graph(self, filenames, source_types, deleteEntities:str, me
result = self.execute_query(query_to_delete_document, param)
logging.info(f"Deleting {len(filename_list)} documents = '{filename_list}' from '{source_types_list}' with their entities from database")
- return result, len(filename_list)
\ No newline at end of file
+ return result, len(filename_list)
+
+ def list_unconnected_nodes(self):
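+ # The pattern `(e:!Chunk&!Document)` uses Cypher label expressions: it matches
+ # nodes whose labels include neither Chunk nor Document, i.e. extracted entities.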
+ query = """
+ MATCH (e:!Chunk&!Document)
+ WHERE NOT exists { (e)--(:!Chunk&!Document) }
+ OPTIONAL MATCH (doc:Document)<-[:PART_OF]-(c:Chunk)-[:HAS_ENTITY]->(e)
+ RETURN e {.*, embedding:null, elementId:elementId(e), labels:labels(e)} as e,
+ collect(distinct doc.fileName) as documents, count(distinct c) as chunkConnections
+ ORDER BY e.id ASC
+ LIMIT 100
+ """
+ return self.execute_query(query)
+
+ def delete_unconnected_nodes(self,unconnected_entities_list):
+ entities_list = list(map(str.strip, json.loads(unconnected_entities_list)))
+ query = """
+ MATCH (e) WHERE elementId(e) IN $elementIds
+ DETACH DELETE e
+ """
+ param = {"elementIds":entities_list}
+ return self.execute_query(query,param)
\ No newline at end of file
diff --git a/backend/src/graph_query.py b/backend/src/graph_query.py
index 4a321db1..dcce6fad 100644
--- a/backend/src/graph_query.py
+++ b/backend/src/graph_query.py
@@ -55,7 +55,11 @@ def get_graphDB_driver(uri, username, password):
"""
try:
logging.info(f"Attempting to connect to the Neo4j database at {uri}")
- driver = GraphDatabase.driver(uri, auth=(username, password), user_agent=os.environ.get('NEO4J_USER_AGENT'))
+ enable_user_agent = os.environ.get("ENABLE_USER_AGENT", "False").lower() in ("true", "1", "yes")
+ if enable_user_agent:
+ driver = GraphDatabase.driver(uri, auth=(username, password), user_agent=os.environ.get('NEO4J_USER_AGENT'))
+ else:
+ driver = GraphDatabase.driver(uri, auth=(username, password))
logging.info("Connection successful")
return driver
except Exception as e:
diff --git a/backend/src/main.py b/backend/src/main.py
index f51a334a..34adacb0 100644
--- a/backend/src/main.py
+++ b/backend/src/main.py
@@ -15,8 +15,9 @@
from src.document_sources.youtube import *
from src.shared.common_fn import *
from src.make_relationships import *
+from src.document_sources.web_pages import *
import re
-from langchain_community.document_loaders import WikipediaLoader
+from langchain_community.document_loaders import WikipediaLoader, WebBaseLoader
import warnings
from pytube import YouTube
import sys
@@ -96,6 +97,32 @@ def create_source_node_graph_url_gcs(graph, model, gcs_project_id, gcs_bucket_na
'gcsBucketName': gcs_bucket_name, 'gcsBucketFolder':obj_source_node.gcsBucketFolder, 'gcsProjectId':obj_source_node.gcsProjectId})
return lst_file_name,success_count,failed_count
+def create_source_node_graph_web_url(graph, model, source_url, source_type):
+ success_count=0
+ failed_count=0
+ lst_file_name = []
+ pages = WebBaseLoader(source_url, verify_ssl=False).load()
+ if pages==None or len(pages)==0:
+ failed_count+=1
+ message = f"Unable to read data for given url : {source_url}"
+ raise Exception(message)
+ obj_source_node = sourceNode()
+ obj_source_node.file_type = 'text'
+ obj_source_node.file_source = source_type
+ obj_source_node.model = model
+ obj_source_node.total_pages = 1
+ obj_source_node.url = urllib.parse.unquote(source_url)
+ obj_source_node.created_at = datetime.now()
+ obj_source_node.file_name = pages[0].metadata['title']
+ obj_source_node.language = pages[0].metadata['language']
+ obj_source_node.file_size = sys.getsizeof(pages[0].page_content)
+
+ graphDb_data_Access = graphDBdataAccess(graph)
+ graphDb_data_Access.create_source_node(obj_source_node)
+ lst_file_name.append({'fileName':obj_source_node.file_name,'fileSize':obj_source_node.file_size,'url':obj_source_node.url,'status':'Success'})
+ success_count+=1
+ return lst_file_name,success_count,failed_count
+
def create_source_node_graph_url_youtube(graph, model, source_url, source_type):
youtube_url, language = check_url_source(source_type=source_type, yt_url=source_url)
@@ -110,7 +137,7 @@ def create_source_node_graph_url_youtube(graph, model, source_url, source_type):
obj_source_node.url = youtube_url
obj_source_node.created_at = datetime.now()
match = re.search(r'(?:v=)([0-9A-Za-z_-]{11})\s*',obj_source_node.url)
- logging.info(f"match value{match}")
+ logging.info(f"match value: {match}")
obj_source_node.file_name = YouTube(obj_source_node.url).title
transcript= get_youtube_combined_transcript(match.group(1))
if transcript==None or len(transcript)==0:
@@ -165,7 +192,7 @@ def extract_graph_from_file_local_file(graph, model, merged_file_path, fileName,
else:
file_name, pages, file_extension = get_documents_from_file_by_path(merged_file_path,fileName)
if pages==None or len(pages)==0:
- raise Exception(f'Pdf content is not available for file : {file_name}')
+ raise Exception(f'File content is not available for file : {file_name}')
return processing_source(graph, model, file_name, pages, allowedNodes, allowedRelationship, True, merged_file_path, uri)
@@ -178,7 +205,16 @@ def extract_graph_from_file_s3(graph, model, source_url, aws_access_key_id, aws_
file_name, pages = get_documents_from_s3(source_url, aws_access_key_id, aws_secret_access_key)
if pages==None or len(pages)==0:
- raise Exception(f'Pdf content is not available for file : {file_name}')
+ raise Exception(f'File content is not available for file : {file_name}')
+
+ return processing_source(graph, model, file_name, pages, allowedNodes, allowedRelationship)
+
+def extract_graph_from_web_page(graph, model, source_url, allowedNodes, allowedRelationship):
+
+ file_name, pages = get_documents_from_web_page(source_url)
+
+ if pages==None or len(pages)==0:
+ raise Exception(f'Content is not available for given URL : {file_name}')
return processing_source(graph, model, file_name, pages, allowedNodes, allowedRelationship)
@@ -203,7 +239,7 @@ def extract_graph_from_file_gcs(graph, model, gcs_project_id, gcs_bucket_name, g
file_name, pages = get_documents_from_gcs(gcs_project_id, gcs_bucket_name, gcs_bucket_folder, gcs_blob_filename, access_token)
if pages==None or len(pages)==0:
- raise Exception(f'Pdf content is not available for file : {file_name}')
+ raise Exception(f'File content is not available for file : {file_name}')
return processing_source(graph, model, file_name, pages, allowedNodes, allowedRelationship)
@@ -239,7 +275,7 @@ def processing_source(graph, model, file_name, pages, allowedNodes, allowedRelat
pages[i]=Document(page_content=str(text), metadata=pages[i].metadata)
create_chunks_obj = CreateChunksofDocument(pages, graph)
chunks = create_chunks_obj.split_file_into_chunks()
-
+ chunkId_chunkDoc_list = create_relation_between_chunks(graph,file_name,chunks)
if result[0]['Status'] != 'Processing':
obj_source_node = sourceNode()
status = "Processing"
@@ -259,12 +295,12 @@ def processing_source(graph, model, file_name, pages, allowedNodes, allowedRelat
job_status = "Completed"
node_count = 0
rel_count = 0
- for i in range(0, len(chunks), update_graph_chunk_processed):
+ for i in range(0, len(chunkId_chunkDoc_list), update_graph_chunk_processed):
select_chunks_upto = i+update_graph_chunk_processed
logging.info(f'Selected Chunks upto: {select_chunks_upto}')
- if len(chunks) <= select_chunks_upto:
- select_chunks_upto = len(chunks)
- selected_chunks = chunks[i:select_chunks_upto]
+ if len(chunkId_chunkDoc_list) <= select_chunks_upto:
+ select_chunks_upto = len(chunkId_chunkDoc_list)
+ selected_chunks = chunkId_chunkDoc_list[i:select_chunks_upto]
result = graphDb_data_Access.get_current_status_document_node(file_name)
is_cancelled_status = result[0]['is_cancelled']
logging.info(f"Value of is_cancelled : {result[0]['is_cancelled']}")
@@ -326,8 +362,7 @@ def processing_source(graph, model, file_name, pages, allowedNodes, allowedRelat
else:
logging.info('File does not process because it\'s already in Processing status')
-def processing_chunks(chunks,graph,file_name,model,allowedNodes,allowedRelationship, node_count, rel_count):
- chunkId_chunkDoc_list = create_relation_between_chunks(graph,file_name,chunks)
+def processing_chunks(chunkId_chunkDoc_list,graph,file_name,model,allowedNodes,allowedRelationship, node_count, rel_count):
#create vector index and update chunk node with embedding
update_embedding_create_vector_index( graph, chunkId_chunkDoc_list, file_name)
logging.info("Get graph document list from models")
diff --git a/backend/src/post_processing.py b/backend/src/post_processing.py
new file mode 100644
index 00000000..60c202d2
--- /dev/null
+++ b/backend/src/post_processing.py
@@ -0,0 +1,58 @@
+from neo4j import GraphDatabase
+import logging
+import time
+
+
+DROP_INDEX_QUERY = "DROP INDEX entities IF EXISTS;"
+LABELS_QUERY = "CALL db.labels()"
+FULL_TEXT_QUERY = "CREATE FULLTEXT INDEX entities FOR (n{labels_str}) ON EACH [n.id, n.description];"
+FILTER_LABELS = ["Chunk","Document"]
+
+def create_fulltext(uri, username, password, database):
+ start_time = time.time()
+ logging.info("Starting the process of creating a full-text index.")
+
+ try:
+ driver = GraphDatabase.driver(uri, auth=(username, password), database=database)
+ driver.verify_connectivity()
+ logging.info("Database connectivity verified.")
+ except Exception as e:
+ logging.error(f"Failed to create a database driver or verify connectivity: {e}")
+ return
+
+ try:
+ with driver.session() as session:
+ try:
+ start_step = time.time()
+ session.run(DROP_INDEX_QUERY)
+ logging.info(f"Dropped existing index (if any) in {time.time() - start_step:.2f} seconds.")
+ except Exception as e:
+ logging.error(f"Failed to drop index: {e}")
+ return
+ try:
+ start_step = time.time()
+ result = session.run(LABELS_QUERY)
+ labels = [record["label"] for record in result]
+
+ for label in FILTER_LABELS:
+ if label in labels:
+ labels.remove(label)
+
+ labels_str = ":" + "|".join([f"`{label}`" for label in labels])
+ logging.info(f"Fetched labels in {time.time() - start_step:.2f} seconds.")
+ except Exception as e:
+ logging.error(f"Failed to fetch labels: {e}")
+ return
+ try:
+ start_step = time.time()
+ session.run(FULL_TEXT_QUERY.format(labels_str=labels_str))
+ logging.info(f"Created full-text index in {time.time() - start_step:.2f} seconds.")
+ except Exception as e:
+ logging.error(f"Failed to create full-text index: {e}")
+ return
+ except Exception as e:
+ logging.error(f"An error occurred during the session: {e}")
+ finally:
+ driver.close()
+ logging.info("Driver closed.")
+ logging.info(f"Process completed in {time.time() - start_time:.2f} seconds.")
\ No newline at end of file
diff --git a/backend/src/shared/common_fn.py b/backend/src/shared/common_fn.py
index ec52bfd8..67a09491 100644
--- a/backend/src/shared/common_fn.py
+++ b/backend/src/shared/common_fn.py
@@ -18,7 +18,7 @@
from langchain_experimental.graph_transformers.diffbot import DiffbotGraphTransformer
# from neo4j.debug import watch
-#watch("neo4j")
+# watch("neo4j")
def check_url_source(source_type, yt_url:str=None, wiki_query:str=None):
@@ -77,8 +77,11 @@ def get_chunk_and_graphDocument(graph_document_list, chunkId_chunkDoc_list):
return lst_chunk_chunkId_document
def create_graph_database_connection(uri, userName, password, database):
- graph = Neo4jGraph(url=uri, database=database, username=userName, password=password, refresh_schema=False, sanitize=True)
- #driver_config={'user_agent':os.environ.get('NEO4J_USER_AGENT')}
+ enable_user_agent = os.environ.get("ENABLE_USER_AGENT", "False").lower() in ("true", "1", "yes")
+ if enable_user_agent:
+ graph = Neo4jGraph(url=uri, database=database, username=userName, password=password, refresh_schema=False, sanitize=True,driver_config={'user_agent':os.environ.get('NEO4J_USER_AGENT')})
+ else:
+ graph = Neo4jGraph(url=uri, database=database, username=userName, password=password, refresh_schema=False, sanitize=True)
return graph
@@ -142,7 +145,7 @@ def get_llm(model_version:str) :
model_name=model_version)
else:
- llm = DiffbotGraphTransformer(diffbot_api_key=os.environ.get('DIFFBOT_API_KEY'))
+ llm = DiffbotGraphTransformer(diffbot_api_key=os.environ.get('DIFFBOT_API_KEY'),extract_types=['entities','facts'])
logging.info(f"Model created - Model Version: {model_version}")
return llm
diff --git a/backend/src/shared/constants.py b/backend/src/shared/constants.py
index ba9a68df..22cd5d8f 100644
--- a/backend/src/shared/constants.py
+++ b/backend/src/shared/constants.py
@@ -9,9 +9,168 @@
}
OPENAI_MODELS = ["gpt-3.5", "gpt-4o"]
GEMINI_MODELS = ["gemini-1.0-pro", "gemini-1.5-pro"]
+GROQ_MODELS = ["groq-llama3"]
+BUCKET_UPLOAD = 'llm-graph-builder-upload'
+BUCKET_FAILED_FILE = 'llm-graph-builder-failed'
+PROJECT_ID = 'llm-experiments-387609'
+
+
+## CHAT SETUP
CHAT_MAX_TOKENS = 1000
CHAT_SEARCH_KWARG_K = 3
CHAT_SEARCH_KWARG_SCORE_THRESHOLD = 0.7
-GROQ_MODELS = ["groq-llama3"]
-BUCKET_UPLOAD = 'llm-graph-builder-upload'
-PROJECT_ID = 'llm-experiments-387609'
+CHAT_DOC_SPLIT_SIZE = 3000
+CHAT_EMBEDDING_FILTER_SCORE_THRESHOLD = 0.10
+CHAT_TOKEN_CUT_OFF = {
+ ("gpt-3.5","gemini-1.0-pro","gemini-1.5-pro","groq-llama3" ) : 4,
+ ("gpt-4","diffbot" , "gpt-4o") : 28
+}
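+# Note: the keys are tuples of model names, so resolving a model's cut-off
+# means scanning the keys; a minimal sketch of such a lookup:
+#   cut_off = next((v for k, v in CHAT_TOKEN_CUT_OFF.items() if model in k), None)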
+
+
+### CHAT TEMPLATES
+CHAT_SYSTEM_TEMPLATE = """
+You are an AI-powered question-answering agent. Your task is to provide accurate and comprehensive responses to user queries based on the given context, chat history, and available resources.
+
+### Response Guidelines:
+1. **Direct Answers**: Provide clear and thorough answers to the user's queries without headers unless requested. Avoid speculative responses.
+2. **Utilize History and Context**: Leverage relevant information from previous interactions, the current user input, and the context provided below.
+3. **No Greetings in Follow-ups**: Start with a greeting in initial interactions. Avoid greetings in subsequent responses unless there's a significant break or the chat restarts.
+4. **Admit Unknowns**: Clearly state if an answer is unknown. Avoid making unsupported statements.
+5. **Avoid Hallucination**: Only provide information based on the context provided. Do not invent information.
+6. **Response Length**: Keep responses concise and relevant. Aim for clarity and completeness within 4-5 sentences unless more detail is requested.
+7. **Tone and Style**: Maintain a professional and informative tone. Be friendly and approachable.
+8. **Error Handling**: If a query is ambiguous or unclear, ask for clarification rather than providing a potentially incorrect answer.
+9. **Fallback Options**: If the required information is not available in the provided context, provide a polite and helpful response. Example: "I don't have that information right now." or "I'm sorry, but I don't have that information. Is there something else I can help with?"
+10. **Context Availability**: If the context is empty, do not provide answers based solely on internal knowledge. Instead, respond appropriately by indicating the lack of information.
+
+
+**IMPORTANT** : DO NOT ANSWER FROM YOUR KNOWLEDGE BASE; USE ONLY THE CONTEXT PROVIDED BELOW
+
+### Context:
+
+{context}
+
+
+### Example Responses:
+User: Hi
+AI Response: 'Hello there! How can I assist you today?'
+
+User: "What is Langchain?"
+AI Response: "Langchain is a framework that enables the development of applications powered by large language models, such as chatbots. It simplifies the integration of language models into various applications by providing useful tools and components."
+
+User: "Can you explain how to use memory management in Langchain?"
+AI Response: "Langchain's memory management involves utilizing built-in mechanisms to manage conversational context effectively. It ensures that the conversation remains coherent and relevant by maintaining the history of interactions and using it to inform responses."
+
+User: "I need help with PyCaret's classification model."
+AI Response: "PyCaret simplifies the process of building and deploying machine learning models. For classification tasks, you can use PyCaret's setup function to prepare your data. After setup, you can compare multiple models to find the best one, and then fine-tune it for better performance."
+
+User: "What can you tell me about the latest realtime trends in AI?"
+AI Response: "I don't have that information right now. Is there something else I can help with?"
+
+Note: This system does not generate answers based solely on internal knowledge. It answers from the information provided in the user's current and previous inputs, and from the context.
+"""
+
+
+QUESTION_TRANSFORM_TEMPLATE = "Given the below conversation, generate a search query to look up in order to get information relevant to the conversation. Only respond with the query, nothing else."
+
+
+## CHAT QUERIES
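+# VECTOR_SEARCH_QUERY is intended as the retrieval query of a vector retriever
+# (e.g. LangChain's Neo4jVector): it receives each matched `node` and `score`
+# from the vector index and rolls chunks up to their Document, joining the
+# chunk texts and averaging the scores.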
+VECTOR_SEARCH_QUERY = """
+WITH node AS chunk, score
+MATCH (chunk)-[:PART_OF]->(d:Document)
+WITH d, collect(distinct {chunk: chunk, score: score}) as chunks, avg(score) as avg_score
+WITH d, avg_score,
+ [c in chunks | c.chunk.text] as texts,
+ [c in chunks | {id: c.chunk.id, score: c.score}] as chunkdetails
+WITH d, avg_score, chunkdetails,
+ apoc.text.join(texts, "\n----\n") as text
+RETURN text, avg_score AS score,
+ {source: COALESCE(CASE WHEN d.url CONTAINS "None" THEN d.fileName ELSE d.url END, d.fileName), chunkdetails: chunkdetails} as metadata
+"""
+
+# VECTOR_GRAPH_SEARCH_QUERY="""
+# WITH node as chunk, score
+# MATCH (chunk)-[:PART_OF]->(d:Document)
+# CALL { WITH chunk
+# MATCH (chunk)-[:HAS_ENTITY]->(e)
+# MATCH path=(e)(()-[rels:!HAS_ENTITY&!PART_OF]-()){0,2}(:!Chunk&!Document)
+# UNWIND rels as r
+# RETURN collect(distinct r) as rels
+# }
+# WITH d, collect(DISTINCT {chunk: chunk, score: score}) AS chunks, avg(score) as avg_score, apoc.coll.toSet(apoc.coll.flatten(collect(rels))) as rels
+# WITH d, avg_score,
+# [c IN chunks | c.chunk.text] AS texts,
+# [c IN chunks | {id: c.chunk.id, score: c.score}] AS chunkdetails,
+# [r in rels | coalesce(apoc.coll.removeAll(labels(startNode(r)),['__Entity__'])[0],"") +":"+ startNode(r).id + " "+ type(r) + " " + coalesce(apoc.coll.removeAll(labels(endNode(r)),['__Entity__'])[0],"") +":" + endNode(r).id] as entities
+# WITH d, avg_score,chunkdetails,
+# apoc.text.join(texts,"\n----\n") +
+# apoc.text.join(entities,"\n")
+# as text
+# RETURN text, avg_score AS score, {source: COALESCE( CASE WHEN d.url CONTAINS "None" THEN d.fileName ELSE d.url END, d.fileName), chunkdetails: chunkdetails} AS metadata
+# """
+
+
+VECTOR_GRAPH_SEARCH_QUERY = """
+WITH node as chunk, score
+// find the document of the chunk
+MATCH (chunk)-[:PART_OF]->(d:Document)
+// fetch entities
+CALL { WITH chunk
+// entities connected to the chunk
+// todo only return entities that are actually in the chunk, remember we connect all extracted entities to all chunks
+MATCH (chunk)-[:HAS_ENTITY]->(e)
+
+// depending on match to query embedding either 1 or 2 step expansion
+WITH CASE WHEN true // vector.similarity.cosine($embedding, e.embedding ) <= 0.95
+THEN
+collect { MATCH path=(e)(()-[rels:!HAS_ENTITY&!PART_OF]-()){0,1}(:!Chunk&!Document) RETURN path }
+ELSE
+collect { MATCH path=(e)(()-[rels:!HAS_ENTITY&!PART_OF]-()){0,2}(:!Chunk&!Document) RETURN path }
+END as paths
+
+RETURN collect{ unwind paths as p unwind relationships(p) as r return distinct r} as rels,
+collect{ unwind paths as p unwind nodes(p) as n return distinct n} as nodes
+}
+// aggregate chunk-details and de-duplicate nodes and relationships
+WITH d, collect(DISTINCT {chunk: chunk, score: score}) AS chunks, avg(score) as avg_score, apoc.coll.toSet(apoc.coll.flatten(collect(rels))) as rels,
+
+// TODO sort by relevancy (embedding comparison?) cut off after X (e.g. 25) nodes?
+apoc.coll.toSet(apoc.coll.flatten(collect(
+ [r in rels |[startNode(r),endNode(r)]]),true)) as nodes
+
+// generate metadata and text components for chunks, nodes and relationships
+WITH d, avg_score,
+ [c IN chunks | c.chunk.text] AS texts,
+ [c IN chunks | {id: c.chunk.id, score: c.score}] AS chunkdetails,
+ apoc.coll.sort([n in nodes |
+
+coalesce(apoc.coll.removeAll(labels(n),['__Entity__'])[0],"") +":"+
+n.id + (case when n.description is not null then " ("+ n.description+")" else "" end)]) as nodeTexts,
+ apoc.coll.sort([r in rels
+ // optional filter if we limit the node-set
+ // WHERE startNode(r) in nodes AND endNode(r) in nodes
+ |
+coalesce(apoc.coll.removeAll(labels(startNode(r)),['__Entity__'])[0],"") +":"+
+startNode(r).id +
+" " + type(r) + " " +
+coalesce(apoc.coll.removeAll(labels(endNode(r)),['__Entity__'])[0],"") +":" +
+endNode(r).id
+]) as relTexts
+
+// combine texts into response-text
+WITH d, avg_score,chunkdetails,
+"Text Content:\n" +
+apoc.text.join(texts,"\n----\n") +
+"\n----\nEntities:\n"+
+apoc.text.join(nodeTexts,"\n") +
+"\n----\nRelationships:\n"+
+apoc.text.join(relTexts,"\n")
+
+as text
+RETURN text, avg_score as score, {length:size(text), source: COALESCE( CASE WHEN d.url CONTAINS "None" THEN d.fileName ELSE d.url END, d.fileName), chunkdetails: chunkdetails} AS metadata
+"""
+
diff --git a/docker-compose.yml b/docker-compose.yml
index 63ade96e..a93be695 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -24,6 +24,7 @@ services:
- GCP_LOG_METRICS_ENABLED=${GCP_LOG_METRICS_ENABLED-False}
- UPDATE_GRAPH_CHUNKS_PROCESSED=${UPDATE_GRAPH_CHUNKS_PROCESSED-20}
- NUMBER_OF_CHUNKS_TO_COMBINE=${NUMBER_OF_CHUNKS_TO_COMBINE-6}
+ - GCS_FILE_CACHE=${GCS_FILE_CACHE-False}
container_name: backend
ports:
- "8000:8000"
@@ -39,7 +40,7 @@ services:
args:
- BACKEND_API_URL=${BACKEND_API_URL-http://localhost:8000}
- REACT_APP_SOURCES=${REACT_APP_SOURCES-local,youtube,wiki,s3}
- - LLM_MODELS=${LLM_MODELS-Diffbot,OpenAI GPT 3.5,OpenAI GPT 4o}
+ - LLM_MODELS=${LLM_MODELS-diffbot,gpt-3.5,gpt-4o}
- GOOGLE_CLIENT_ID=${GOOGLE_CLIENT_ID-""}
- BLOOM_URL=${BLOOM_URL-https://workspace-preview.neo4j.io/workspace/explore?connectURL={CONNECT_URL}&search=Show+me+a+graph&featureGenAISuggestions=true&featureGenAISuggestionsInternal=true}
- TIME_PER_CHUNK=${TIME_PER_CHUNK-4}
diff --git a/docs/backend/backend_docs.adoc b/docs/backend/backend_docs.adoc
new file mode 100644
index 00000000..a2887d6f
--- /dev/null
+++ b/docs/backend/backend_docs.adoc
@@ -0,0 +1,712 @@
+= LLM Knowledge Graph Builder Backend
+
+== API Reference
+
+
+=== Connect to Neo4j Graph Database
+-----
+POST /connect
+-----
+
+The frontend establishes its Neo4j database connection through this API.
+
+**API Parameters :**
+
+* `uri`= Neo4j uri,
+* `userName`= Neo4j db username,
+* `password`= Neo4j db password,
+* `database`= Neo4j database name
+
+
+**Response :**
+[source,json,indent=0]
+----
+{
+ "status":"Success",
+ "message":"Connection Successful"
+}
+----
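+
+A minimal sketch of calling this endpoint from Python, assuming the backend
+runs at the default `http://localhost:8000` and accepts form-encoded
+parameters (the connection values are placeholders):
+
+[source,python]
+----
+import requests
+
+params = {
+    "uri": "neo4j+s://<instance>.databases.neo4j.io",
+    "userName": "neo4j",
+    "password": "<password>",
+    "database": "neo4j",
+}
+response = requests.post("http://localhost:8000/connect", data=params)
+print(response.json())  # {'status': 'Success', 'message': 'Connection Successful'}
+----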
+
+
+=== Upload Files from Local
+----
+POST /upload
+----
+
+The upload endpoint is designed to handle the uploading of large files by breaking them into smaller chunks. This method ensures that large files can be uploaded efficiently without overloading the server.
+
+***API Parameters***
+
+* `file`=The file to be uploaded, received in chunks,
+* `chunkNumber`=The current chunk number being uploaded,
+* `totalChunks`=The total number of chunks the file is divided into (each chunk is 1 MB),
+* `originalname`=The original name of the file,
+* `model`=The model associated with the file,
+* `uri`=Neo4j uri,
+* `userName`= Neo4j db username,
+* `password`= Neo4j db password,
+* `database`= Neo4j database name
+
+**Response :**
+[source,json,indent=0]
+....
+{
+ "status": "Success",
+ "message": "File uploaded and chunks merged successfully."
+}
+....
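+
+A sketch of driving the chunked upload from Python, assuming 1 MB chunks and
+form-encoded fields as documented above (the endpoint URL and helper are
+illustrative):
+
+[source,python]
+----
+import math
+import os
+import requests
+
+CHUNK_SIZE = 1024 * 1024  # 1 MB, matching the documented chunk size
+
+def upload(path, model, neo4j_params, api="http://localhost:8000/upload"):
+    total = math.ceil(os.path.getsize(path) / CHUNK_SIZE)
+    with open(path, "rb") as f:
+        for i in range(1, total + 1):
+            chunk = f.read(CHUNK_SIZE)
+            requests.post(api, files={"file": chunk}, data={
+                "chunkNumber": i,
+                "totalChunks": total,
+                "originalname": os.path.basename(path),
+                "model": model,
+                **neo4j_params,
+            })
+----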
+
+
+=== User defined schema
+----
+POST /schema
+----
+
+Users can set the schema for graph generation (i.e. node and relationship labels) in the settings panel, or retrieve the existing database schema, through this API.
+
+**API Parameters :**
+
+* `uri`=Neo4j uri,
+* `userName`= Neo4j db username,
+* `password`= Neo4j db password,
+* `database`= Neo4j database name
+
+
+**Response :**
+[source,json,indent=0]
+....
+{
+ "status": "Success",
+ "data": [
+ {
+ "labels": [
+ "Access_token",
+ "Activity",
+ "Ai chatbot",
+ "Book",
+ "Metric",
+ "Mode",
+ "Mountain"
+ ],
+ "relationshipTypes": [
+ "ACCELERATE",
+ "ACCEPTS",
+ "CONVERT",
+ "CORRELATE",
+ "ESTABLISHED",
+ "EXAMPLE_OF"
+ ]
+ }
+ ]
+}
+....
+
+=== Graph schema from input text
+----
+POST /populate_graph_schema
+----
+
+The API is used to populate a graph schema based on the provided input text, model, and schema description flag.
+
+**API Parameters :**
+
+* `input_text`=The input text used to populate the graph schema.
+* `model`=The model to be used for populating the graph schema.
+* `is_schema_description_checked`=A flag indicating whether the schema description should be considered.
+
+
+**Response :**
+[source,json,indent=0]
+....
+{
+ "status": "Success",
+ "data": [
+ {
+ "labels": [
+ "Technology",
+ "Company",
+ "Person",
+ "Location",
+ "Organization",
+ "Concept"
+ ],
+ "relationshipTypes": [
+ "LOCATED_AT",
+ "SUBSIDARY_OF",
+ "BORN_IN",
+ "LAST_MESSAGE",
+ "ATTENDED",
+ "PARTNERED_WITH"
+ ]
+ }
+ ]
+}
+....
+
+
+=== Unstructured sources scan other than local
+----
+POST /url/scan
+----
+
+Creates a Document node for non-local sources - S3 bucket, GCS bucket, Wikipedia, YouTube URL and web pages.
+
+**API Parameters :**
+
+* `uri`=Neo4j uri,
+* `userName`= Neo4j db username,
+* `password`= Neo4j db password,
+* `database`= Neo4j database name
+* `model`= LLM model,
+* `source_url`= ,
+* `aws_access_key_id`= AWS access key,
+* `aws_secret_access_key`= AWS secret key,
+* `wiki_query`= Wikipedia query sources,
+* `gcs_project_id`= GCS project id,
+* `gcs_bucket_name`= GCS bucket name,
+* `gcs_bucket_folder`= GCS bucket folder,
+* `source_type`= s3 bucket/ gcs bucket/ youtube/ Wikipedia as source type,
+* `access_token`=Form(None)
+
+
+**Response :**
+[source,json,indent=0]
+....
+{
+ "status": "Success",
+ "success_count": 2,
+ "failed_count": 0,
+ "message": "Source Node created successfully for source type: Wikipedia and source: Albert Einstein, neo4j",
+ "file_name": [
+ {
+ "fileName": "Albert Einstein",
+ "fileSize": 8074,
+ "url": "https://en.wikipedia.org/wiki/Albert_Einstein",
+ "status": "Success"
+ }
+ ]
+}
+....
+
+
+=== Extraction of nodes and relations from content
+----
+POST /extract
+----
+
+This API is responsible for -
+
+** Reading the content of the source, provided as a langchain Document object from the respective langchain loader.
+
+** Dividing the document into multiple chunks and creating the relations below (see the inspection sketch after this list) -
+*** PART_OF - relation from the Document node to all chunk nodes
+*** FIRST_CHUNK - relation from the Document node to the first chunk node
+*** NEXT_CHUNK - relation from a chunk to the next chunk of the document
+*** HAS_ENTITY - relation between a chunk node and entities extracted by the LLM
+
+** Extracting nodes and relations in the form of GraphDocument from the respective LLM.
+
+** Updating embeddings of chunks and creating the vector index.
+
+** Updating the K-Nearest Neighbors graph for similar chunks.
+
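+The resulting chunk structure can be inspected with a small Cypher query; a
+sketch using the Python driver (file name and connection details are
+illustrative):
+
+[source,python]
+----
+from neo4j import GraphDatabase
+
+driver = GraphDatabase.driver("neo4j://localhost:7687", auth=("neo4j", "<password>"))
+with driver.session() as session:
+    result = session.run(
+        "MATCH (d:Document {fileName: $name})<-[:PART_OF]-(c:Chunk) "
+        "OPTIONAL MATCH (c)-[:HAS_ENTITY]->(e) "
+        "RETURN c.position AS position, count(e) AS entities "
+        "ORDER BY position",
+        name="About Amazon.pdf",
+    )
+    for record in result:
+        print(record["position"], record["entities"])
+driver.close()
+----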
+
+**Implementation :**
+
+** For multiple sources of content -
+
+*** Local file - the user can upload a PDF file from their device.
+
+*** s3 bucket - the user passes the bucket URL; all PDF files inside its folders and subfolders are listed.
+
+*** GCS bucket - the user passes the GCS project id, bucket name and folder name, and authenticates with Google to access the PDF files under that folder and its subfolders. If no folder name is passed, all PDF files under the bucket and its subfolders are listed, provided the user has read access to the bucket.
+
+*** Web Sources -
+**** Wikipedia - the first page of the Wikipedia article for the query passed by the user is processed.
+
+**** Youtube - the video transcript is processed; if no transcript is available, a corresponding error is raised.
+
+**** Web urls - text content from any web URL is processed to generate the graph.
+
+** Langchain's LLMGraphTransformer library is used to get nodes and relations in the form of GraphDocument from LLMs. The user and system prompts, the LLM chain and the GraphDocument schema are defined in the library itself.
+
+** SentenceTransformer embeddings are used by default; the embedding model is configurable to use either OpenAIEmbeddings or VertexAIEmbeddings.
+
+** A vector index is created in the database on the embeddings created for chunks.
+
+**API Parameters :**
+
+* `uri`=Neo4j uri,
+* `userName`= Neo4j db username,
+* `password`= Neo4j db password,
+* `database`= Neo4j database name
+* `model`= LLM model,
+* `file_name` = File uploaded from device
+* `source_url`= ,
+* `aws_access_key_id`= AWS access key,
+* `aws_secret_access_key`= AWS secret key,
+* `wiki_query`= Wikipedia query sources,
+* `gcs_project_id`=GCS project id,
+* `gcs_bucket_name`= GCS bucket name,
+* `gcs_bucket_folder`= GCS bucket folder,
+* `gcs_blob_filename` = GCS file name,
+* `source_type`= local file/ s3 bucket/ gcs bucket/ youtube/ Wikipedia as source,
+* `allowedNodes`= Node labels passed from settings panel,
+* `allowedRelationship`=Relationship labels passed from settings panel,
+* `language`=Language in which wikipedia content will be extracted
+
+**Response :**
+[source,json,indent=0]
+....
+{
+ "status": "Success",
+ "data": {
+ "fileName": ,
+ "nodeCount": ,
+ "relationshipCount": ,
+ "processingTime": ,
+ "status": "Completed",
+ "model":
+ }
+}
+....
+
+
+=== Get list of sources
+----
+GET /sources_list
+----
+
+List all sources (Document nodes) present in Neo4j graph database.
+
+**API Parameters :**
+
+* `uri`=Neo4j uri,
+* `userName`= Neo4j db username,
+* `password`= Neo4j db password,
+* `database`= Neo4j database name
+
+**Response :**
+[source,json,indent=0]
+....
+{
+ "status": "Success",
+ "data": [
+ {
+ "fileName": "About Amazon.pdf",
+ "fileSize": 163931,
+ "errorMessage": "",
+ "fileSource": "local file",
+ "nodeCount": 62,
+ "model": "OpenAI GPT 4",
+ "fileType": "pdf",
+ "processingTime": 122.71,
+ "relationshipCount": 187,
+ "status": "Completed",
+ "updatedAt": {
+ "_DateTime__date": {
+ "_Date__ordinal": 738993,
+ "_Date__year": 2024,
+ "_Date__month": 4,
+ "_Date__day": 17
+ },
+ "_DateTime__time": {
+ "_Time__ticks": 28640715768000,
+ "_Time__hour": 7,
+ "_Time__minute": 57,
+ "_Time__second": 20,
+ "_Time__nanosecond": 715768000,
+ "_Time__tzinfo": null
+ }
+ }
+ }
+ ]
+}
+....
+
+
+=== Post processing after graph generation
+----
+POST /post_processing
+----
+
+This API is called at the end of processing of the whole document to create k-nearest-neighbor relations between similar chunks of the document (based on KNN_MIN_SCORE, 0.8 by default) and to drop and recreate a full-text index on the database labels.
+
+**API Parameters :**
+
+* `uri`=Neo4j uri,
+* `userName`= Neo4j db username,
+* `password`= Neo4j db password,
+* `database`= Neo4j database name
+* `tasks`= List of tasks to perform
+
+
+**Response :**
+[source,json,indent=0]
+....
+{
+ "status":"Success",
+ "message":"All tasks completed successfully"
+}
+....
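+
+A sketch of invoking post-processing from Python (the task names shown are
+illustrative, not an authoritative list):
+
+[source,python]
+----
+import requests
+
+requests.post("http://localhost:8000/post_processing", data={
+    "uri": "neo4j://localhost:7687",
+    "userName": "neo4j",
+    "password": "<password>",
+    "database": "neo4j",
+    "tasks": '["update_similarity_graph", "create_fulltext_index"]',  # illustrative task names
+})
+----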
+
+
+=== Chat with Data
+----
+POST /chat_bot
+----
+
+This API drives a chatbot system designed to leverage multiple AI models and a Neo4j graph database, providing answers to user queries. It interacts with AI models from OpenAI and Google's Vertex AI and utilizes embedding models to enhance the retrieval of relevant information.
+
+**Components :**
+
+** Embedding Models - Includes OpenAI Embeddings, VertexAI Embeddings, and SentenceTransformer Embeddings to support vector-based query operations.
+** AI Models - OpenAI GPT 3.5, GPT 4o, Gemini Pro, Gemini 1.5 Pro and Groq llama3 can be configured for the chatbot backend to generate responses and process natural language.
+** Graph Database (Neo4jGraph) - Manages interactions with the Neo4j database, retrieving and storing conversation histories.
+** Response Generation - Utilizes Vector Embeddings from the Neo4j database, chat history, and the knowledge base of the LLM used.
+
+**API Parameters :**
+
+* `uri`= Neo4j uri
+* `userName`= Neo4j database username
+* `password`= Neo4j database password
+* `model`= LLM model
+* `question`= User query for the chatbot
+* `session_id`= Session ID used to maintain the history of chats during the user's connection
+
+**Response :**
+[source,json,indent=0]
+....
+{
+ "status": "Success",
+ "data": {
+ "session_id": "0901",
+ "message": "Fibrosis, also known as fibrotic scarring, is a pathological wound healing process where connective tissue replaces normal parenchymal tissue."
+ "info": {
+ "sources": [
+ {
+ "source_name": "https://en.wikipedia.org/wiki/Fibrosis",
+ "page_numbers": [],
+ "start_time": []
+ }
+ ],
+ "model": "gpt-4o",
+ "chunkids": [
+ "54d8c0dbefb67f1ed3f6939d59267e1ff557a94c",
+ "4cc02ee8419706c8decdf71ab0d3896aad5c7dca",
+ "266ce95311bb1921791b4f1cd29a48d433027139",
+ "11e19513247e1e396475728fa6a197695045b248",
+ "8bafa01b6d851f70822bcb86863e485e1785a64c"
+ ],
+ "total_tokens": 2213,
+ "response_time": 10.17
+ },
+ "user": "chatbot"
+ }
+}
+....
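+
+A sketch of a chat request from Python (the session id is generated
+client-side; the model and question shown are illustrative):
+
+[source,python]
+----
+import uuid
+import requests
+
+resp = requests.post("http://localhost:8000/chat_bot", data={
+    "uri": "neo4j://localhost:7687",
+    "userName": "neo4j",
+    "password": "<password>",
+    "model": "gpt-4o",
+    "question": "What is fibrosis?",
+    "session_id": str(uuid.uuid4()),
+})
+print(resp.json()["data"]["message"])
+----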
+
+=== Get entities from chunks
+----
+POST /chunk_entities
+----
+
+This API is used to get the entities and relations associated with particular chunks, along with the chunk metadata.
+
+**API Parameters :**
+
+* `uri`=Neo4j uri,
+* `userName`= Neo4j db username,
+* `password`= Neo4j db password,
+* `database`= Neo4j database name
+* `chunk_ids` = Chunk ids of document
+
+
+**Response :**
+[source,json,indent=0]
+....
+{
+ "status": "Success",
+ "data": {
+ "nodes": [
+ {
+ "element_id": "4:a69712a5-1102-40da-a96d-70c1143ea8e5:73267",
+ "labels": [
+ "Condition"
+ ],
+ "properties": {
+ "id": "Fibrosis"
+ }
+ }
+ ],
+ "relationships": [
+ {
+ "element_id": "5:a69712a5-1102-40da-a96d-70c1143ea8e5:1153057844048764467",
+ "type": "AFFECTS",
+ "start_node_element_id": "4:a69712a5-1102-40da-a96d-70c1143ea8e5:73267",
+ "end_node_element_id": "4:a69712a5-1102-40da-a96d-70c1143ea8e5:73282"
+ },
+ {
+ "element_id": "5:a69712a5-1102-40da-a96d-70c1143ea8e5:1155309643862449715",
+ "type": "AFFECTS",
+ "start_node_element_id": "4:a69712a5-1102-40da-a96d-70c1143ea8e5:73267",
+ "end_node_element_id": "4:a69712a5-1102-40da-a96d-70c1143ea8e5:73294"
+ }
+ ],
+ "chunk_data": [
+ {
+ "id": "54d8c0dbefb67f1ed3f6939d59267e1ff557a94c",
+ "position": 1,
+ "text": "Fibrosis, also known as fibrotic scarring, is a pathological wound healing ...",
+ "content_offset": 0,
+ "fileName": "fibrosis",
+ "length": 1002,
+ "embedding": null
+ }
+ ]
+ }
+}
+....
+
+=== View graph for a file
+----
+POST /graph_query
+----
+
+This API is used to view the graph for a particular file.
+
+**API Parameters :**
+
+* `uri`=Neo4j uri,
+* `userName`= Neo4j db username,
+* `password`= Neo4j db password,
+* `query_type`= Type of graph query to run,
+* `document_names` = File names for which the user wants to view the graph
+
+
+**Response :**
+[source,json,indent=0]
+....
+{
+ "status": "Success",
+ "data": {
+ "nodes": [
+ {
+ "element_id": "4:98e5e9bb-8095-440d-9462-03985fed2fa2:9972",
+ "labels": [
+ "Person"
+ ],
+ "properties": {
+ "id": "Jeff"
+ }
+ },
+ {
+ "element_id": "4:98e5e9bb-8095-440d-9462-03985fed2fa2:9973",
+ "labels": [
+ "Team"
+ ],
+ "properties": {
+ "id": "Miami"
+ }
+ }
+ ],
+ "relationships": [
+ {
+ "element_id": "5:98e5e9bb-8095-440d-9462-03985fed2fa2:1153200780560312052",
+ "type": "PLAYER",
+ "start_node_element_id": "4:98e5e9bb-8095-440d-9462-03985fed2fa2:9972",
+ "end_node_element_id": "4:98e5e9bb-8095-440d-9462-03985fed2fa2:9973"
+ }
+ ]
+ }
+}
+....
+
+=== Clear chat history
+----
+POST /clear_chat_bot
+----
+
+This API is used to clear the chat history which is saved in Neo4j DB.
+
+**API Parameters :**
+
+* `uri`=Neo4j uri,
+* `userName`= Neo4j db username,
+* `password`= Neo4j db password,
+* `database`= Neo4j database name,
+* `session_id` = User session id for QA chat
+
+
+**Response :**
+[source,json,indent=0]
+....
+{
+ "status": "Success",
+ "data": {
+ "session_id": "99c1a808-377f-448f-9ea6-4b4a8de46b14",
+ "message": "The chat History is cleared",
+ "user": "chatbot"
+ }
+}
+....
+
+=== SSE event to update processing status
+----
+GET /update_extract_status
+----
+
+The API provides a continuous update on the extraction status of a specified file. It uses Server-Sent Events (SSE) to stream updates to the client.
+
+**API Parameters :**
+
+* `file_name`=The name of the file whose extraction status is being tracked,
+* `uri`=Neo4j uri,
+* `userName`= Neo4j db username,
+* `password`= Neo4j db password,
+* `database`= Neo4j database name
+
+
+**Response :**
+[source,json,indent=0]
+....
+{
+ "fileName": "testFile.pdf",
+ "status": "Processing",
+ "processingTime": 0,
+ "nodeCount": 0,
+ "relationshipCount": 0,
+ "model": "OpenAI GPT 3.5",
+ "total_chunks": 3,
+ "total_pages": 1,
+ "fileSize": 92373,
+ "processed_chunk": 0
+}
+....
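+
+A sketch of consuming the stream from Python, assuming standard Server-Sent
+Events framing (`data: ...` lines):
+
+[source,python]
+----
+import requests
+
+params = {
+    "file_name": "testFile.pdf",
+    "uri": "neo4j://localhost:7687",
+    "userName": "neo4j",
+    "password": "<password>",
+    "database": "neo4j",
+}
+with requests.get("http://localhost:8000/update_extract_status",
+                  params=params, stream=True) as resp:
+    for line in resp.iter_lines():
+        if line.startswith(b"data:"):
+            print(line[len(b"data:"):].strip().decode())
+----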
+
+=== Delete selected documents
+----
+POST /delete_document_and_entities
+----
+
+**Overview :**
+
+Deletion of nodes and relations for multiple files is done through this API. The user can choose multiple documents to delete, and also has the option to delete only 'Document' and 'Chunk' nodes while keeping the entities extracted from those documents.
+
+**API Parameters :**
+
+* `uri`=Neo4j uri,
+* `userName`= Neo4j db username,
+* `password`= Neo4j db password,
+* `database`= Neo4j database name,
+* `filenames`= List of files to be deleted,
+* `source_types`= Document sources(Wikipedia, youtube, etc.),
+* `deleteEntities`= Boolean value to check entities deletion is requested or not
+
+**Response :**
+[source,json,indent=0]
+....
+{
+ "status": "Success",
+ "message": "Deleted 1 documents with 68 entities from database"
+}
+....
+
+=== Cancel processing job
+----
+POST /cancelled_job
+----
+
+This API is responsible for cancelling an in-process job.
+
+**API Parameters :**
+
+* `uri`=Neo4j uri,
+* `userName`= Neo4j db username,
+* `password`= Neo4j db password,
+* `database`= Neo4j database name,
+* `filenames`= Name of the file whose processing needs to be stopped,
+* `source_types`= Source of the file
+
+
+**Response :**
+[source,json,indent=0]
+....
+{
+ "message":"Cancelled the processing job successfully"
+}
+....
+
+
+=== Get the list of orphan nodes
+----
+POST /get_unconnected_nodes_list
+----
+
+The API retrieves a list of nodes in the graph database that are not connected to any other nodes.
+
+**API Parameters :**
+
+* `uri`=Neo4j uri,
+* `userName`= Neo4j db username,
+* `password`= Neo4j db password,
+* `database`= Neo4j database name
+
+
+**Response :**
+[source,json,indent=0]
+....
+{ "status": "Success",
+ "data": [
+ "e":
+ {
+ "id": "Leela Chess Zero",
+ "elementId": "4:abf6f691-928d-4b1c-80fc-2914ae517b4c:336",
+ "labels": ["Technology"],
+ "embedding": null
+ },
+ "documents": ["AlphaZero - Wikipedia.pdf"],
+ "chunkConnections": 7
+ ]
+}
+....
+
+
+=== Deletion of orphan nodes
+----
+POST /delete_unconnected_nodes
+----
+
+The API is used to delete unconnected entities from the database.
+
+**API Parameters :**
+
+* `uri`=Neo4j uri,
+* `userName`= Neo4j db username,
+* `password`= Neo4j db password,
+* `database`= Neo4j database name,
+* `unconnected_entities_list`= List of selected unconnected entities to delete.
+
+
+**Response :**
+[source,json,indent=0]
+....
+{
+ "status": "Success",
+ "message: "Unconnected entities delete successfully"
+}
+....
+
+
+== Decisions
+
+* Process only 1st page of Wikipedia
+* Split document content into chunks of size 200 and overlap of 20
+* Configurable elements -
+** Number of chunks to combine
+** Generate Embedding or not
+** Embedding model
+** minimum score for KNN graph
+** Uploaded file storage location (GCS bucket or container)
diff --git a/docs/frontend/frontend_docs.adoc b/docs/frontend/frontend_docs.adoc
new file mode 100644
index 00000000..e69de29b
diff --git a/docs/project architecture.png b/docs/project architecture.png
new file mode 100644
index 00000000..81574930
Binary files /dev/null and b/docs/project architecture.png differ
diff --git a/docs/project_docs.adoc b/docs/project_docs.adoc
new file mode 100644
index 00000000..22e99721
--- /dev/null
+++ b/docs/project_docs.adoc
@@ -0,0 +1,126 @@
+= LLM Knowledge Graph Builder
+
+== Introduction
+
+This document provides comprehensive documentation for the Neo4j llm-graph-builder Project, a Python web application built with the FastAPI framework. It covers various aspects of the project, including its features, architecture, usage, development, deployment, limitations and known issues.
+
+
+== Features
+
+* Upload unstructured data from multiple sources to generate a structured Neo4j knowledge graph.
+
+* Extraction of nodes and relations using multiple LLMs (OpenAI GPT-3.5, OpenAI GPT-4, Gemini 1.0-Pro and Diffbot).
+
+* View the complete graph or only particular elements of it (e.g. only chunks, only entities, document and entities, etc.).
+
+* Generate embedding of chunks created from unstructured content.
+
+* Generate k-nearest neighbors graph for similar chunks.
+
+* Chat with graph data using chat bot.
+
+== Local Setup and Execution
+
+Run Docker Compose to build and start all components:
+....
+docker-compose up --build
+....
+
+Alternatively, run the frontend and backend separately:
+
+** For frontend
+....
+cd frontend
+yarn
+yarn run dev
+....
+
+** For backend
+....
+cd backend
+python -m venv envName
+source envName/bin/activate
+pip install -r requirements.txt
+uvicorn score:app --reload
+....
+
+Set up environment variables:
+....
+OPENAI_API_KEY = ""
+DIFFBOT_API_KEY = ""
+NEO4J_URI = ""
+NEO4J_USERNAME = ""
+NEO4J_PASSWORD = ""
+NEO4J_DATABASE = ""
+AWS_ACCESS_KEY_ID = ""
+AWS_SECRET_ACCESS_KEY = ""
+EMBEDDING_MODEL = ""
+IS_EMBEDDING = "TRUE"
+KNN_MIN_SCORE = ""
+LANGCHAIN_API_KEY = ""
+LANGCHAIN_PROJECT = ""
+LANGCHAIN_TRACING_V2 = ""
+LANGCHAIN_ENDPOINT = ""
+NUMBER_OF_CHUNKS_TO_COMBINE = ""
+....
+
+== Architecture
+image::project architecture.png[Architecture diagram, 600, align='left']
+
+== Development
+
+==== Backend
+link:backend/backend_docs.adoc[backend_docs.adoc]
+
+==== Frontend
+link:frontend/frontend_docs.adoc[frontend_docs.adoc]
+
+== Deployment and Monitoring
+* The application is deployed on Google Cloud Platform.
+
+To deploy the frontend:
+....
+gcloud run deploy
+source location current directory > Frontend
+region : 32 [us-central 1]
+Allow unauthenticated request : Yes
+....
+
+To deploy the backend:
+....
+gcloud run deploy --set-env-vars "OPENAI_API_KEY = " --set-env-vars "DIFFBOT_API_KEY = " --set-env-vars "NEO4J_URI = " --set-env-vars "NEO4J_PASSWORD = " --set-env-vars "NEO4J_USERNAME = "
+source location current directory > Backend
+region : 32 [us-central 1]
+Allow unauthenticated request : Yes
+....
+
+* Langserve is used with FastAPI to deploy Langchain runnables and chains as a REST API.
+
+* Langsmith is used to monitor and evaluate the application.
+
+
+Development url
+
+Production url
+
+
+
+== Appendix
+
+=== Limitations
+
+** Only PDF files uploaded from the device, or from an S3 or GCS bucket, can be processed.
+
+** Only GCS buckets under the 1051503595507@cloudbuild.gserviceaccount.com service account can be accessed.
+
+** Only the first page of Wikipedia content is processed to generate the GraphDocument.
+
+
+=== Known issues
+
+** InactiveRpcError error with Gemini 1.0 Pro - grpc_status:13, grpc_message:"Internal error encountered."
+
+** ResourceExhausted error with Gemini 1.5 Pro - 429 Quota exceeded for aiplatform.googleapis.com/generate_content_requests_per_minute_per_project_per_base_model with base model: gemini-1.5-pro
+
+** Gemini response validation errors occur even after setting the safety_settings parameters to BLOCK_NONE.
+
diff --git a/example.env b/example.env
index 3eef484d..68132545 100644
--- a/example.env
+++ b/example.env
@@ -19,12 +19,13 @@ LANGCHAIN_API_KEY = ""
LANGCHAIN_PROJECT = ""
LANGCHAIN_TRACING_V2 = "true"
LANGCHAIN_ENDPOINT = "https://api.smith.langchain.com"
+GCS_FILE_CACHE = False
# Optional Frontend
BACKEND_API_URL="http://localhost:8000"
BLOOM_URL="https://workspace-preview.neo4j.io/workspace/explore?connectURL={CONNECT_URL}&search=Show+me+a+graph&featureGenAISuggestions=true&featureGenAISuggestionsInternal=true"
-REACT_APP_SOURCES="local,youtube,wiki,s3"
-LLM_MODELS="Diffbot,OpenAI GPT 3.5,OpenAI GPT 4o"
+REACT_APP_SOURCES="local,youtube,wiki,s3,web"
+LLM_MODELS="diffbot,gpt-3.5,gpt-4o"
ENV="DEV"
TIME_PER_CHUNK=4
TIME_PER_PAGE=50
diff --git a/frontend/Dockerfile b/frontend/Dockerfile
index 504bcc04..7e166bda 100644
--- a/frontend/Dockerfile
+++ b/frontend/Dockerfile
@@ -3,14 +3,14 @@ FROM node:20 AS build
ARG BACKEND_API_URL="http://localhost:8000"
ARG REACT_APP_SOURCES=""
-ARG LLM_MODELS="diffbot,gpt-3.5,gpt-4o,gemini-1.0-pro"
+ARG LLM_MODELS=""
ARG GOOGLE_CLIENT_ID=""
ARG BLOOM_URL="https://workspace-preview.neo4j.io/workspace/explore?connectURL={CONNECT_URL}&search=Show+me+a+graph&featureGenAISuggestions=true&featureGenAISuggestionsInternal=true"
ARG TIME_PER_CHUNK=4
ARG TIME_PER_PAGE=50
ARG LARGE_FILE_SIZE=5242880
ARG CHUNK_SIZE=5242880
-ARG ENV="PROD"
+ARG ENV="DEV"
WORKDIR /app
COPY package.json yarn.lock ./
diff --git a/frontend/example.env b/frontend/example.env
index ced97193..bf7a7b03 100644
--- a/frontend/example.env
+++ b/frontend/example.env
@@ -1,9 +1,10 @@
BACKEND_API_URL="http://localhost:8000"
BLOOM_URL="https://workspace-preview.neo4j.io/workspace/explore?connectURL={CONNECT_URL}&search=Show+me+a+graph&featureGenAISuggestions=true&featureGenAISuggestionsInternal=true"
-REACT_APP_SOURCES="local,youtube,wiki,s3"
-LLM_MODELS="Diffbot,OpenAI GPT 3.5,OpenAI GPT 4o"
+REACT_APP_SOURCES="local,youtube,wiki,s3,web"
+LLM_MODELS="diffbot,gpt-3.5,gpt-4o"
ENV="DEV"
TIME_PER_CHUNK=4
TIME_PER_PAGE=50
CHUNK_SIZE=5242880
+LARGE_FILE_SIZE=5242880
GOOGLE_CLIENT_ID=""
\ No newline at end of file
diff --git a/frontend/src/App.css b/frontend/src/App.css
index 5c501b6f..52d8fcd5 100644
--- a/frontend/src/App.css
+++ b/frontend/src/App.css
@@ -25,7 +25,7 @@
}
.contentWithExpansion {
- width: calc(-800px + 100dvw);
+ width: calc(-840px + 100dvw);
height: calc(100dvh - 58px);
padding: 3px;
display: flex;
@@ -121,6 +121,10 @@
height: 55px;
object-fit: contain;
}
+.webImg{
+ width: 80px;
+ height: 80px;
+}
::placeholder {
color: rgb(135, 130, 130) !important;
diff --git a/frontend/src/App.tsx b/frontend/src/App.tsx
index 88f0164d..0caa501e 100644
--- a/frontend/src/App.tsx
+++ b/frontend/src/App.tsx
@@ -2,22 +2,27 @@ import './App.css';
import '@neo4j-ndl/base/lib/neo4j-ds-styles.css';
import ThemeWrapper from './context/ThemeWrapper';
import QuickStarter from './components/QuickStarter';
-
import { GoogleOAuthProvider } from '@react-oauth/google';
import { APP_SOURCES } from './utils/Constants';
+import ErrorBoundary from './components/UI/ErrroBoundary';
+
const App: React.FC = () => {
return (
<>
      {APP_SOURCES != undefined && APP_SOURCES.includes('gcs') ? (
-        <GoogleOAuthProvider clientId={process.env.GOOGLE_CLIENT_ID as string}>
-          <ThemeWrapper>
-            <QuickStarter />
-          </ThemeWrapper>
-        </GoogleOAuthProvider>
+        <ErrorBoundary>
+          <GoogleOAuthProvider clientId={process.env.GOOGLE_CLIENT_ID as string}>
+            <ThemeWrapper>
+              <QuickStarter />
+            </ThemeWrapper>
+          </GoogleOAuthProvider>
+        </ErrorBoundary>
      ) : (
-        <ThemeWrapper>
-          <QuickStarter />
-        </ThemeWrapper>
+        <ErrorBoundary>
+          <ThemeWrapper>
+            <QuickStarter />
+          </ThemeWrapper>
+        </ErrorBoundary>
      )}
>
);
diff --git a/frontend/src/HOC/SettingModalHOC.tsx b/frontend/src/HOC/SettingModalHOC.tsx
new file mode 100644
index 00000000..7afe46d0
--- /dev/null
+++ b/frontend/src/HOC/SettingModalHOC.tsx
@@ -0,0 +1,28 @@
+import React from 'react';
+import { SettingsModalProps } from '../types';
+import SettingsModal from '../components/Popups/Settings/SettingModal';
+
+const SettingModalHOC: React.FC<SettingsModalProps> = ({
+ openTextSchema,
+ open,
+ onClose,
+ isSchema,
+ settingView,
+ setIsSchema,
+ onContinue,
+ onClear,
+}) => {
+  return (
+    <SettingsModal
+      openTextSchema={openTextSchema}
+      open={open}
+      onClose={onClose}
+      isSchema={isSchema}
+      settingView={settingView}
+      setIsSchema={setIsSchema}
+      onContinue={onContinue}
+      onClear={onClear}
+    />
+  );
+};
+export default SettingModalHOC;
diff --git a/frontend/src/assets/images/db-search.svg b/frontend/src/assets/images/db-search.svg
new file mode 100644
index 00000000..33c7a3ca
--- /dev/null
+++ b/frontend/src/assets/images/db-search.svg
@@ -0,0 +1,8 @@
+
diff --git a/frontend/src/assets/images/graph-search.svg b/frontend/src/assets/images/graph-search.svg
new file mode 100644
index 00000000..1be52147
--- /dev/null
+++ b/frontend/src/assets/images/graph-search.svg
@@ -0,0 +1,16 @@
+
diff --git a/frontend/src/assets/images/internet_logo.png b/frontend/src/assets/images/internet_logo.png
new file mode 100644
index 00000000..f79962b7
Binary files /dev/null and b/frontend/src/assets/images/internet_logo.png differ
diff --git a/frontend/src/assets/images/web-search-svgrepo-com.svg b/frontend/src/assets/images/web-search-svgrepo-com.svg
new file mode 100644
index 00000000..199dd7cb
--- /dev/null
+++ b/frontend/src/assets/images/web-search-svgrepo-com.svg
@@ -0,0 +1,11 @@
+
+
+
\ No newline at end of file
diff --git a/frontend/src/assets/images/web-svgrepo-com.svg b/frontend/src/assets/images/web-svgrepo-com.svg
new file mode 100644
index 00000000..86089066
--- /dev/null
+++ b/frontend/src/assets/images/web-svgrepo-com.svg
@@ -0,0 +1,11 @@
+
+
+
\ No newline at end of file
diff --git a/frontend/src/components/ChatBot/ChatModeToggle.tsx b/frontend/src/components/ChatBot/ChatModeToggle.tsx
new file mode 100644
index 00000000..f80b2dfb
--- /dev/null
+++ b/frontend/src/components/ChatBot/ChatModeToggle.tsx
@@ -0,0 +1,50 @@
+import { SegmentedControl, Tip } from '@neo4j-ndl/react';
+import { ChatModeOptions } from '../../utils/Constants';
+import { useFileContext } from '../../context/UsersFiles';
+import { DbmsIcon } from '@neo4j-ndl/react/icons';
+import { capitalize } from '@mui/material';
+
+export default function ChatModeToggle({ inSidenav = false }) {
+ const [vector, _] = ChatModeOptions;
+ const { chatMode, setchatMode } = useFileContext();
+
+ return (
+
+ {ChatModeOptions.map((i, idx) => {
+ return (
+
+
+
+ {i.Icon === 'abc' ? (
+
+
+ +
+
+
+ ) : (
+
+ )}
+
+
+ {capitalize(i.value)}
+
+ );
+ })}
+
+ );
+}
diff --git a/frontend/src/components/Chatbot.tsx b/frontend/src/components/ChatBot/Chatbot.tsx
similarity index 93%
rename from frontend/src/components/Chatbot.tsx
rename to frontend/src/components/ChatBot/Chatbot.tsx
index 44ca6024..096a6c31 100644
--- a/frontend/src/components/Chatbot.tsx
+++ b/frontend/src/components/ChatBot/Chatbot.tsx
@@ -6,33 +6,33 @@ import {
SpeakerWaveIconOutline,
SpeakerXMarkIconOutline,
} from '@neo4j-ndl/react/icons';
-import ChatBotAvatar from '../assets/images/chatbot-ai.png';
-import { ChatbotProps, Source, UserCredentials } from '../types';
-import { useCredentials } from '../context/UserCredentials';
-import { chatBotAPI } from '../services/QnaAPI';
+import ChatBotAvatar from '../../assets/images/chatbot-ai.png';
+import { ChatbotProps, UserCredentials, chunk } from '../../types';
+import { useCredentials } from '../../context/UserCredentials';
+import { chatBotAPI } from '../../services/QnaAPI';
import { v4 as uuidv4 } from 'uuid';
-import { useFileContext } from '../context/UsersFiles';
-import InfoModal from './InfoModal';
+import { useFileContext } from '../../context/UsersFiles';
+import InfoModal from './Info/InfoModal';
import clsx from 'clsx';
import ReactMarkdown from 'react-markdown';
-import IconButtonWithToolTip from './IconButtonToolTip';
-import { buttonCaptions, tooltips } from '../utils/Constants';
-import useSpeechSynthesis from '../hooks/useSpeech';
-import ButtonWithToolTip from './ButtonWithToolTip';
+import IconButtonWithToolTip from '../UI/IconButtonToolTip';
+import { buttonCaptions, tooltips } from '../../utils/Constants';
+import useSpeechSynthesis from '../../hooks/useSpeech';
+import ButtonWithToolTip from '../UI/ButtonWithToolTip';
const Chatbot: React.FC = (props) => {
const { messages: listMessages, setMessages: setListMessages, isLoading, isFullScreen, clear } = props;
const [inputMessage, setInputMessage] = useState('');
const [loading, setLoading] = useState(isLoading);
const { userCredentials } = useCredentials();
- const { model } = useFileContext();
+ const { model, chatMode } = useFileContext();
const messagesEndRef = useRef(null);
const [sessionId, setSessionId] = useState(sessionStorage.getItem('session_id') ?? '');
const [showInfoModal, setShowInfoModal] = useState(false);
- const [sourcesModal, setSourcesModal] = useState