From c2721c7889926c1c415dcef1756cd7aad49c8930 Mon Sep 17 00:00:00 2001
From: Yuhong Sun
Date: Mon, 2 Oct 2023 22:56:24 -0700
Subject: [PATCH] Option to have very verbose LLM logging (#500)

---
 backend/danswer/chat/chat_llm.py            |  2 -
 backend/danswer/chat/chat_prompts.py        | 10 +--
 backend/danswer/configs/app_configs.py      |  5 ++
 .../danswer/connectors/hubspot/connector.py | 70 ++++++++++---------
 backend/danswer/direct_qa/qa_utils.py       |  2 +-
 backend/danswer/llm/llm.py                  | 30 +++++++-
 6 files changed, 78 insertions(+), 41 deletions(-)

diff --git a/backend/danswer/chat/chat_llm.py b/backend/danswer/chat/chat_llm.py
index 243e4be0a00..fe302a4c62e 100644
--- a/backend/danswer/chat/chat_llm.py
+++ b/backend/danswer/chat/chat_llm.py
@@ -73,8 +73,6 @@ def _parse_embedded_json_streamed_response(
             yield DanswerAnswerPiece(answer_piece=hold)
             hold = ""
 
-    logger.debug(model_output)
-
     model_final = extract_embedded_json(model_output)
     if "action" not in model_final or "action_input" not in model_final:
         raise ValueError("Model did not provide all required action values")
diff --git a/backend/danswer/chat/chat_prompts.py b/backend/danswer/chat/chat_prompts.py
index 5fdad7355f4..a8af1ce4cca 100644
--- a/backend/danswer/chat/chat_prompts.py
+++ b/backend/danswer/chat/chat_prompts.py
@@ -15,8 +15,9 @@
 
 DANSWER_SYSTEM_MSG = (
     "Given a conversation (between Human and Assistant) and a final message from Human, "
-    "rewrite the last message to be a standalone question that captures required/relevant context from the previous "
-    "conversation messages."
+    "rewrite the last message to be a standalone question which captures required/relevant context "
+    "from previous messages. This question must be useful for a semantic search engine. "
+    "It is used for a natural language search."
 )
 
 TOOL_TEMPLATE = """
@@ -181,8 +182,9 @@ def build_combined_query(
     combined_query_msgs.append(
         HumanMessage(
             content=(
-                "Help me rewrite this final query into a standalone question that takes into consideration the "
-                f"past messages of the conversation. You must ONLY return the rewritten query and nothing else."
+                "Help me rewrite this final message into a standalone query that takes into consideration the "
+                f"past messages of the conversation if relevant. This query is used with a semantic search engine to "
+                f"retrieve documents. You must ONLY return the rewritten query and nothing else."
                 f"\n\nQuery:\n{query_message.message}"
             )
         )
diff --git a/backend/danswer/configs/app_configs.py b/backend/danswer/configs/app_configs.py
index a1198f00f37..777061348f2 100644
--- a/backend/danswer/configs/app_configs.py
+++ b/backend/danswer/configs/app_configs.py
@@ -205,6 +205,11 @@
 # each worker loads the embedding models into memory.
 NUM_INDEXING_WORKERS = int(os.environ.get("NUM_INDEXING_WORKERS") or 1)
 
+# Logs every model prompt and output, mostly used for development or exploration purposes
+LOG_ALL_MODEL_INTERACTIONS = (
+    os.environ.get("LOG_ALL_MODEL_INTERACTIONS", "").lower() == "true"
+)
+
 
 #####
 # Danswer Slack Bot Configs
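Note that only the literal string "true" (case-insensitive) enables the new flag; other truthy-looking values are treated as off. A minimal standalone sketch of that parsing behavior (the sample values below are illustrative, not part of the patch):

    import os

    # Only the literal "true" (any casing) enables the flag;
    # values like "1" or "yes" are treated as off.
    for value in ["true", "TRUE", "1", "yes", ""]:
        os.environ["LOG_ALL_MODEL_INTERACTIONS"] = value
        enabled = os.environ.get("LOG_ALL_MODEL_INTERACTIONS", "").lower() == "true"
        print(f"{value!r:>8} -> {enabled}")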
diff --git a/backend/danswer/connectors/hubspot/connector.py b/backend/danswer/connectors/hubspot/connector.py
index 0c8b84ae0ec..178d0a0e15f 100644
--- a/backend/danswer/connectors/hubspot/connector.py
+++ b/backend/danswer/connectors/hubspot/connector.py
@@ -1,8 +1,9 @@
-import requests
-import json
-from typing import Any
 from datetime import datetime
-from hubspot import HubSpot
+from typing import Any
+
+import requests
+from hubspot import HubSpot  # type: ignore
+
 from danswer.configs.app_configs import INDEX_BATCH_SIZE
 from danswer.configs.constants import DocumentSource
 from danswer.connectors.interfaces import GenerateDocumentsOutput
@@ -19,19 +20,22 @@
 
 logger = setup_logger()
 
+
 class HubSpotConnector(LoadConnector, PollConnector):
-    def __init__(self, batch_size: int = INDEX_BATCH_SIZE, access_token: str | None = None) -> None:
+    def __init__(
+        self, batch_size: int = INDEX_BATCH_SIZE, access_token: str | None = None
+    ) -> None:
         self.batch_size = batch_size
         self.access_token = access_token
-        self.portal_id = None
+        self.portal_id: str | None = None
         self.ticket_base_url = HUBSPOT_BASE_URL
-    
+
     def get_portal_id(self) -> str:
         headers = {
-            'Authorization': f'Bearer {self.access_token}',
-            'Content-Type': 'application/json'
+            "Authorization": f"Bearer {self.access_token}",
+            "Content-Type": "application/json",
         }
-        
+
         response = requests.get(HUBSPOT_API_URL, headers=headers)
         if response.status_code != 200:
             raise Exception("Error fetching portal ID")
@@ -41,7 +45,7 @@ def get_portal_id(self) -> str:
 
     def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
         self.access_token = credentials["hubspot_access_token"]
-        
+
         if self.access_token:
             self.portal_id = self.get_portal_id()
             self.ticket_base_url = f"{HUBSPOT_BASE_URL}{self.portal_id}/ticket/"
@@ -53,25 +57,25 @@ def _process_tickets(
     ) -> GenerateDocumentsOutput:
         if self.access_token is None:
             raise ConnectorMissingCredentialError("HubSpot")
-        
+
         api_client = HubSpot(access_token=self.access_token)
-        all_tickets = api_client.crm.tickets.get_all(associations=['contacts', 'notes'])
+        all_tickets = api_client.crm.tickets.get_all(associations=["contacts", "notes"])
 
         doc_batch: list[Document] = []
-        
+
         for ticket in all_tickets:
             updated_at = ticket.updated_at.replace(tzinfo=None)
             if start is not None and updated_at < start:
                 continue
             if end is not None and updated_at > end:
                 continue
-            
+
             title = ticket.properties["subject"]
             link = self.ticket_base_url + ticket.id
             content_text = title + "\n" + ticket.properties["content"]
 
-            associated_emails = []
-            associated_notes = []
+            associated_emails: list[str] = []
+            associated_notes: list[str] = []
 
             if ticket.associations:
                 contacts = ticket.associations.get("contacts")
@@ -79,18 +83,22 @@ def _process_tickets(
 
                 if contacts:
                     for contact in contacts.results:
-                        contact = api_client.crm.contacts.basic_api.get_by_id(contact_id=contact.id)
+                        contact = api_client.crm.contacts.basic_api.get_by_id(
+                            contact_id=contact.id
+                        )
                         associated_emails.append(contact.properties["email"])
 
-                if notes:
+                if notes:
                     for note in notes.results:
-                        note = api_client.crm.objects.notes.basic_api.get_by_id(note_id=note.id, properties=["content", "hs_body_preview"])
+                        note = api_client.crm.objects.notes.basic_api.get_by_id(
+                            note_id=note.id, properties=["content", "hs_body_preview"]
+                        )
                         associated_notes.append(note.properties["hs_body_preview"])
-            
-            associated_emails = " ,".join(associated_emails)
-            associated_notes = " ".join(associated_notes)
-            content_text = f"{content_text}\n emails: {associated_emails} \n notes: {associated_notes}"
+
+            associated_emails_str = " ,".join(associated_emails)
+            associated_notes_str = " ".join(associated_notes)
+
+            content_text = f"{content_text}\n emails: {associated_emails_str} \n notes: {associated_notes_str}"
 
             doc_batch.append(
                 Document(
@@ -101,7 +109,7 @@ def _process_tickets(
                     metadata={},
                 )
             )
-            
+
             if len(doc_batch) >= self.batch_size:
                 yield doc_batch
                 doc_batch = []
@@ -120,19 +128,17 @@ def poll_source(
 
         return self._process_tickets(start_datetime, end_datetime)
 
-
 if __name__ == "__main__":
     import os
     import time
+
     test_connector = HubSpotConnector()
-    test_connector.load_credentials({
-        "hubspot_access_token": os.environ["HUBSPOT_ACCESS_TOKEN"]
-    })
+    test_connector.load_credentials(
+        {"hubspot_access_token": os.environ["HUBSPOT_ACCESS_TOKEN"]}
+    )
     all_docs = test_connector.load_from_state()
-    
+
     current = time.time()
     one_day_ago = current - 24 * 60 * 60  # 1 day
     latest_docs = test_connector.poll_source(one_day_ago, current)
     print(latest_docs)
-
-
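In _process_tickets above, HubSpot's timezone-aware updated_at is stripped of tzinfo before being compared against the naive start/end bounds; comparing aware and naive datetimes directly would raise a TypeError. A self-contained sketch of that window filter (timestamps chosen for illustration):

    from datetime import datetime, timezone

    # An aware timestamp like the ones HubSpot returns, plus naive bounds.
    ticket_updated_at = datetime(2023, 10, 2, 12, 0, tzinfo=timezone.utc)
    start = datetime(2023, 10, 1)
    end = datetime(2023, 10, 3)

    # Mirrors the connector: drop tzinfo so the comparison is naive-to-naive.
    updated_at = ticket_updated_at.replace(tzinfo=None)
    print(start <= updated_at <= end)  # True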
diff --git a/backend/danswer/direct_qa/qa_utils.py b/backend/danswer/direct_qa/qa_utils.py
index f1a52b4fa41..f1dab5ae9ca 100644
--- a/backend/danswer/direct_qa/qa_utils.py
+++ b/backend/danswer/direct_qa/qa_utils.py
@@ -251,7 +251,7 @@ def process_model_tokens(
             yield DanswerAnswerPiece(answer_piece=hold_quote + token)
             hold_quote = ""
 
-    logger.debug(f"Raw model output: {model_output}")
+    logger.debug(f"Raw Model QnA Output: {model_output}")
 
     # for a JSON prompt, make sure that we're only passing through the "JSON part"
     # since that is what `extract_quotes_from_completed_token_stream` expects
"".join(output_tokens) + if LOG_ALL_MODEL_INTERACTIONS: + logger.debug(f"Raw Model Output:\n{full_output}")