Merge pull request #9 from alexminnaar/readme
adding terminal print color
alexminnaar authored Aug 1, 2023
2 parents 715fe7d + 6506078 commit 019fa70
Showing 13 changed files with 125 additions and 57 deletions.
24 changes: 20 additions & 4 deletions README.md
@@ -26,11 +26,31 @@ RepoGPT adds additional context to the chunk including

## Demo

In this demo, the [Pandas](https://github.com/pandas-dev/pandas/tree/main) Python library repo has been crawled and
we will ask RepoGPT some questions about it. This demo's config.ini file specifies `sentence-transformers/all-mpnet-base-v2`
Hugging Face embeddings and OpenAI's `gpt-4` model.
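
A minimal sketch of what that demo config.ini might look like, assembled from the example config files changed in this commit (the `[huggingface-embeddings]` section name and the paths are assumptions; see `example_config_files/` for the real templates):

```ini
; hypothetical demo config -- paths are placeholders
[repo]
REPO_PATH = /path/to/pandas

[vectorstore]
VS_PATH = /path/to/vectorstore
NUM_RESULTS = 25

[huggingface-embeddings]
EMBEDDING_TYPE = sentence-transformers/all-mpnet-base-v2

[openai-llm]
MODEL_NAME = gpt-4

[crawler]
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 100
```
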
### Use Case #1: Code Search

With RepoGPT you can search for a piece of code. For example, let's ask RepoGPT to "show the `value_counts` method in
the `ArrowExtensionArray` class".

![demo1](https://github.com/alexminnaar/RepoGPT/blob/main/demos/repogpt_demo1.png "demo1")

### Use Case #2: Code Understanding

RepoGPT can also explain pieces of code. For example, let's ask RepoGPT to "explain the `value_counts` method in
the `ArrowExtensionArray` class".

![demo2](https://github.com/alexminnaar/RepoGPT/blob/main/demos/repogpt_demo2.png "demo2")


### Use Case #3: Code Writing

RepoGPT can also write new code based on the repo. For example, let's ask RepoGPT to "write unit tests for the
`value_counts` method in the `ArrowExtensionArray` class".

![demo3](https://github.com/alexminnaar/RepoGPT/blob/main/demos/repogpt_demo3.png "demo3")


## Supported Languages

@@ -96,8 +116,4 @@ Ask a question:
```
Then ask your question and wait for the response. To exit, type 'exit'.
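
A hypothetical session using the first demo question (the `Relevant files:`, `Computing response...`, and `Response:` lines come from this commit's `qa.py` and `cli.py`; the file path and answer are illustrative):

```
Ask a question: show the value_counts method in the ArrowExtensionArray class
Relevant files:
/path/to/pandas/core/arrays/arrow/array.py - lines ... - ...
Computing response...
Response:
...
```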

## Recommended Config

For the best quality results,


5 changes: 4 additions & 1 deletion cli.py
@@ -4,10 +4,13 @@
from repogpt import config_utils
import argparse
import logging
from colorama import Fore, Back, Style, init

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("repogpt_cli_logger")

init(autoreset=True)


def parse_arguments():
    parser = argparse.ArgumentParser()
@@ -48,7 +51,7 @@ def main():

        try:
            resp = qa.get_resp(query)
            print(f"Response:\n{resp}")
            print(Fore.GREEN + f"Response:\n{resp}")
        except Exception as e:
            logger.error(f"Exception occurred computing LLM Response: {e}")

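This hunk is the change the commit title describes: the LLM response is now printed in green. A minimal standalone sketch of the same colorama pattern:

```python
from colorama import Fore, init

# autoreset=True restores the default terminal style after every print,
# so only strings that explicitly set a color are affected.
init(autoreset=True)

print(Fore.GREEN + "Response:\nlooks good")  # rendered in green
print("subsequent output keeps the default color")
```
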
Binary file added demos/repogpt_demo1.png
Binary file added demos/repogpt_demo2.png
Binary file added demos/repogpt_demo3.png
6 changes: 5 additions & 1 deletion example_config_files/GPT4ALL_config.ini
@@ -12,4 +12,8 @@ EMBEDDING_TYPE = openai
MODEL_NAME = GPT4All
MODEL_PATH = models/ggml-gpt4all-j-v1.3-groovy.bin
MODEL_N_CTX=1000
MODEL_N_BATCH=8
MODEL_N_BATCH=8

[crawler]
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 100
6 changes: 5 additions & 1 deletion example_config_files/huggingface_config.ini
@@ -9,4 +9,8 @@ NUM_RESULTS = 5
EMBEDDING_TYPE = ll-MiniLM-L6-v2

[openai-llm]
MODEL_NAME = gpt-3.5-turbo-16k
MODEL_NAME = gpt-3.5-turbo-16k

[crawler]
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 100
8 changes: 6 additions & 2 deletions example_config_files/openai_config.ini
@@ -3,10 +3,14 @@ REPO_PATH = /my/repo/path

[vectorstore]
VS_PATH = /my/vs/path
NUM_RESULTS = 5
NUM_RESULTS = 25

[openai-embeddings]
EMBEDDING_TYPE = openai

[openai-llm]
MODEL_NAME = gpt-3.5-turbo-16k
MODEL_NAME = gpt-4

[crawler]
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 100
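
The `[crawler]` section is added to all three example configs. A sketch of how these values could be read with the standard library (repogpt's actual `config_utils` may differ):

```python
import configparser

config = configparser.ConfigParser()
config.read("example_config_files/openai_config.ini")

# getint parses the raw strings from the [crawler] section into ints
chunk_size = config.getint("crawler", "CHUNK_SIZE")        # 1000
chunk_overlap = config.getint("crawler", "CHUNK_OVERLAP")  # 100
```
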
31 changes: 18 additions & 13 deletions repogpt/crawler.py
@@ -5,14 +5,12 @@
from langchain.vectorstores import DeepLake
from repogpt.parsers.pygments_parser import PygmentsParser
from repogpt.parsers.python_parser import PythonParser
from repogpt.parsers.base import SummaryPosition, FileSummary
from multiprocessing import Pool
from tqdm import tqdm
from typing import List, Optional, Tuple
from typing import List, Optional
import os
import fnmatch
import logging
import traceback
from functools import partial

logging.basicConfig(level=logging.INFO)
@@ -61,8 +59,14 @@ def is_git_dir(dir_path: str) -> bool:
    return os.path.isdir(git_dir)


def process_file(file_contents: List[Document], dir_path: str, file_name: str, extension: str, chunk_size: int,
                 chunk_overlap: int) -> List[Document]:
def process_file(
        file_contents: List[Document],
        dir_path: str,
        file_name: str,
        extension: str,
        chunk_size: int,
        chunk_overlap: int
) -> List[Document]:
    """For a given file, get the summary, split into chunks and create context document chunks to be indexed"""
    file_doc = file_contents[0]
    # get file summary for raw file
@@ -84,15 +88,19 @@ def process_file(file_contents: List[Document], dir_path: str, file_name: str, e
        doc.metadata['starting_line'] = starting_line
        doc.metadata['ending_line'] = ending_line

        # get methods and classes associated with chunk
        if extension == '.py':
            method_class_summary = PythonParser.get_closest_method_class_in_snippet(file_summary, starting_line,
                                                                                    ending_line)
        else:
            method_class_summary = PygmentsParser.get_closest_method_class_in_snippet(file_summary, starting_line,
                                                                                      ending_line)
        doc.page_content = f"The following code snippet is from a file at location {os.path.join(dir_path, file_name)} " \
                           f"starting at line {starting_line} and ending at line {ending_line}. {method_class_summary} " \
                           f"The code snippet starting at line {starting_line} and ending at line {ending_line} is \n ```\n{doc.page_content}\n``` "
        doc.page_content = f"The following code snippet is from a file at location " \
                           f"{os.path.join(dir_path, file_name)} " \
                           f"starting at line {starting_line} and ending at line {ending_line}. " \
                           f"{method_class_summary} " \
                           f"The code snippet starting at line {starting_line} and ending at line " \
                           f"{ending_line} is \n ```\n{doc.page_content}\n``` "
    return split_docs


@@ -109,19 +117,17 @@ def filter_files(root_dir: str) -> List[FileProperties]:
            if extension in LANG_MAPPING and not contains_hidden_dir(dir_path):
                files_to_crawl.append(FileProperties(dir_path, file, extension))
            else:
                logger.info(f"Skipping {os.path.join(dir_path, file)} - File/directory type not supported.")
                logger.info(f"Skipping {os.path.join(dir_path, file)} - File or directory type not supported.")
    return files_to_crawl


def process_and_split(file: FileProperties, chunk_size: int, chunk_overlap: int) -> Optional[List[Document]]:
    """For a given file, load it into memory and process it"""

    try:
        loader = TextLoader(os.path.join(file.dir_path, file.file_name), encoding='utf-8')
        chunks = process_file(loader.load(), file.dir_path, file.file_name, file.extension, chunk_size, chunk_overlap)
    except Exception as e:
        logger.error(f"Error processing file {os.path.join(file.dir_path, file.file_name)}. Skipping file. {e}")
        traceback.print_exc()
        return None
    return chunks

@@ -144,5 +150,4 @@ def crawl_and_split(root_dir: str, chunk_size: int = 3000, chunk_overlap: int =


def index(docs: List[Document], embedding_type: Embeddings, vs_path: str):
    return None
    # return DeepLake.from_documents(docs, embedding_type, dataset_path=vs_path)
    return DeepLake.from_documents(docs, embedding_type, dataset_path=vs_path)
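
With `index` now actually persisting to DeepLake instead of returning `None`, a crawl-then-index run could look like the following sketch (the embedding class is illustrative; any langchain `Embeddings` implementation should work):

```python
from langchain.embeddings import HuggingFaceEmbeddings
from repogpt.crawler import crawl_and_split, index

# crawl the repo into context-annotated chunks, then embed and persist them
docs = crawl_and_split("/my/repo/path", chunk_size=1000, chunk_overlap=100)
vector_store = index(docs, HuggingFaceEmbeddings(), "/my/vs/path")
```
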
12 changes: 12 additions & 0 deletions repogpt/parsers/base.py
@@ -1,4 +1,5 @@
from abc import ABC, abstractmethod
from typing import List, Tuple


class SummaryPosition:
@@ -21,6 +22,17 @@ def add_method(self, method_name: str, method_start_line: int, method_end_line:


class CodeParser(ABC):
    @staticmethod
    @abstractmethod
    def get_summary_from_position(summary_positions: List[SummaryPosition], start_line: int,
                                  end_line: int) -> Tuple[List[SummaryPosition], List[SummaryPosition]]:
        """Helper function to get object positions within snippet"""

    @staticmethod
    @abstractmethod
    def get_closest_method_class_in_snippet(file_summary: FileSummary, snippet_start_line: int,
                                            snippet_end_line: int) -> str:
        """Get the relevant methods and classes in a snippet and convert to prompt"""

    @staticmethod
    @abstractmethod
40 changes: 24 additions & 16 deletions repogpt/parsers/pygments_parser.py
@@ -7,21 +7,25 @@
class PygmentsParser(CodeParser):

    @staticmethod
    def get_summary_from_position(summary_positions: List[SummaryPosition], start_line: int,
                                  end_line: int) -> Tuple[List[str], List[str]]:
        """For a given list of summary positions and start/end lines find which positions are before and inside the lines"""
    def get_summary_from_position(
            summary_positions: List[SummaryPosition],
            start_line: int,
            end_line: int
    ) -> Tuple[List[SummaryPosition], List[SummaryPosition]]:
        """For a given list of summary positions and start/end lines find which positions are before and inside the
        lines"""
        last_obj = []
        current_obj = []

        # TODO: binary search-ify this
        for s_pos in summary_positions:
            # get last defined obj before the snippet
            if s_pos.start_line < start_line:
                last_obj.append(s_pos.name)
                last_obj.append(s_pos)

            # get any obj defined in this snippet
            if start_line <= s_pos.start_line <= end_line:
                current_obj.append(s_pos.name)
                current_obj.append(s_pos)

            # ignore everything past this snippet
            if s_pos.start_line > end_line:
@@ -30,8 +34,11 @@ def get_summary_from_position(summary_positions: List[SummaryPosition], start_li
        return last_obj, current_obj

    @staticmethod
    def get_closest_method_class_in_snippet(file_summary: FileSummary, snippet_start_line: int,
                                            snippet_end_line: int) -> str:
    def get_closest_method_class_in_snippet(
            file_summary: FileSummary,
            snippet_start_line: int,
            snippet_end_line: int
    ) -> str:
        """For a given file summary and snippet start/end lines extract summary information for the snippet"""

        closest_method_class_summary = ""
@@ -40,26 +47,27 @@ def get_closest_method_class_in_snippet(file_summary: FileSummary, snippet_start
                                                                               snippet_end_line)

        if len(last_class) == 1:
            closest_method_class_summary += f" The last class defined before this snippet was called {last_class[0]}."
            closest_method_class_summary += f" The last class defined before this snippet was called " \
                                            f"{last_class[0].name}."
        elif len(last_class) > 1:
            multi_class_summary = " and ".join([f"{c}" for c in last_class])
            multi_class_summary = " and ".join([f"{c.name}" for c in last_class])
            closest_method_class_summary += f" The classes defined before this snippet are {multi_class_summary}."
        if len(current_class) == 1:
            closest_method_class_summary += f" The class defined in this snippet is called {current_class[0]}."
            closest_method_class_summary += f" The class defined in this snippet is called {current_class[0].name}."
        elif len(current_class) > 1:
            multi_class_summary = " and ".join([f"{c}" for c in current_class])
            multi_class_summary = " and ".join([f"{c.name}" for c in current_class])
            closest_method_class_summary += f" The classes defined in this snippet are {multi_class_summary}."

        last_method, current_method = PygmentsParser.get_summary_from_position(file_summary.methods, snippet_start_line,
                                                                               snippet_end_line)
                                                                               snippet_end_line)

        if last_method:
            closest_method_class_summary += f" The beginning of this snippet contains the end of the {last_method[-1]} " \
                                            "method."
            closest_method_class_summary += f" The beginning of this snippet contains the end of the " \
                                            f"{last_method[-1].name} method."
        if len(current_method) == 1:
            closest_method_class_summary += f" The method defined in this snippet is called {current_method[0]}."
            closest_method_class_summary += f" The method defined in this snippet is called {current_method[0].name}."
        elif len(current_method) > 1:
            multi_method_summary = " and ".join([f"{meth}" for meth in current_method])
            multi_method_summary = " and ".join([f"{meth.name}" for meth in current_method])
            closest_method_class_summary += f" The methods defined in this snippet are {multi_method_summary}."

        return closest_method_class_summary
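
A worked example of the partition `get_summary_from_position` performs, and why this commit switches its return type from names to `SummaryPosition` objects (a sketch; the `SummaryPosition` constructor arguments are assumed from its usage in `base.py`):

```python
from repogpt.parsers.base import SummaryPosition
from repogpt.parsers.pygments_parser import PygmentsParser

# assumed constructor: SummaryPosition(name, start_line, end_line)
positions = [
    SummaryPosition("foo", 5, 9),    # starts before the snippet
    SummaryPosition("bar", 15, 18),  # starts inside the snippet
    SummaryPosition("baz", 40, 44),  # starts after the snippet, ignored
]

last, current = PygmentsParser.get_summary_from_position(positions, 10, 20)
# last holds foo (started before line 10); current holds bar (lines 10-20).
# Returning positions rather than bare names lets callers read .name,
# .start_line and .end_line when building the summary strings.
```
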
33 changes: 21 additions & 12 deletions repogpt/parsers/python_parser.py
@@ -5,8 +5,11 @@

class PythonParser(CodeParser):
    @staticmethod
    def get_summary_from_position(summary_positions: List[SummaryPosition], start_line: int,
                                  end_line: int) -> Tuple[List[SummaryPosition], List[SummaryPosition]]:
    def get_summary_from_position(
            summary_positions: List[SummaryPosition],
            start_line: int,
            end_line: int
    ) -> Tuple[List[SummaryPosition], List[SummaryPosition]]:

        last_obj = []
        current_obj = []
@@ -27,19 +30,24 @@ def get_summary_from_position(summary_positions: List[SummaryPosition], start_li
        return last_obj, current_obj

    @staticmethod
    def get_closest_method_class_in_snippet(file_summary: FileSummary, snippet_start_line: int,
                                            snippet_end_line: int) -> str:
    def get_closest_method_class_in_snippet(
            file_summary: FileSummary,
            snippet_start_line: int,
            snippet_end_line: int
    ) -> str:
        closest_method_class_summary = ""

        last_class, current_class = PythonParser.get_summary_from_position(file_summary.classes, snippet_start_line,
                                                                           snippet_end_line)

        if last_class:
            closest_method_class_summary += f" The last class defined before this snippet was called `{last_class[-1].name}` " \
                                            f"starting at line {last_class[-1].start_line} and ending at line {last_class[-1].end_line}."
            closest_method_class_summary += f" The last class defined before this snippet was called " \
                                            f"`{last_class[-1].name}` starting at line {last_class[-1].start_line} " \
                                            f"and ending at line {last_class[-1].end_line}."
        if len(current_class) == 1:
            closest_method_class_summary += f" The class defined in this snippet is called `{current_class[0].name}`" \
                                            f"starting at line {current_class[0].start_line} and ending at line {current_class[0].end_line}."
                                            f"starting at line {current_class[0].start_line} and ending at line " \
                                            f"{current_class[0].end_line}."
        elif len(current_class) > 1:
            multi_class_summary = " and ".join(
                [f"`{c.name}` starting at line {c.start_line} and ending at line {c.end_line}" for c in current_class])
@@ -49,12 +57,13 @@ def get_closest_method_class_in_snippet(file_summary: FileSummary, snippet_start
                                                                           snippet_end_line)

        if last_method:
            closest_method_class_summary += f" The last method starting before this snippet is called `{last_method[-1].name}` " \
                                            f"which starts on line {last_method[-1].start_line} and ends at " \
                                            f"line {last_method[-1].end_line}."
            closest_method_class_summary += f" The last method starting before this snippet is called " \
                                            f"`{last_method[-1].name}` which starts on line " \
                                            f"{last_method[-1].start_line} and ends at line {last_method[-1].end_line}."
        if len(current_method) == 1:
            closest_method_class_summary += f" The method defined in this snippet is called `{current_method[0].name}` " \
                                            f"starting at line {current_method[0].start_line} and ending at line " \
            closest_method_class_summary += f" The method defined in this snippet is called " \
                                            f"`{current_method[0].name}` starting at line " \
                                            f"{current_method[0].start_line} and ending at line " \
                                            f"{current_method[0].end_line}."
        elif len(current_method) > 1:
            multi_method_summary = " and ".join(
17 changes: 10 additions & 7 deletions repogpt/qa/qa.py
@@ -2,6 +2,9 @@
from langchain.llms import BaseLLM
from langchain.docstore.document import Document
from typing import List
from colorama import Fore, Back, Style, init

init(autoreset=True)


class QA:
@@ -17,21 +20,21 @@ def __init__(self, llm: BaseLLM, deeplake_store: DeepLake, num_results: int):
    def create_prompt(self, query_str: str, similar_chunks: List[Document]) -> str:
        """Build the final prompt string using query and similar chunks"""
        similar_chunk_str = '\n'.join([chunk.page_content for chunk in similar_chunks])
        # TODO: add to prompt to not use test files if query does not explicitly mention test files
        # TODO: Try structured json prompt
        final_prompt = f"Given these code snippets, \n {similar_chunk_str}\n The question is: {query_str}"
        final_prompt = f"You will be asked a question based on the following code snippets, \n {similar_chunk_str}\n " \
                       f"You may need to combine the above snippets according to their line numbers to answer the " \
                       f"following question. The question is: {query_str}"
        return final_prompt

    def get_resp(self, query_str: str) -> str:
        """Given a string, get similar chunks and construct a prompt feed it to LLM and return response"""
        # reverse similar chunks so that most relevant are less likely to be forgotten
        similar_chunks = self.retriever.get_relevant_documents(query_str)[::-1]
        print("Relevant files:")
        print(Fore.RED + "Relevant files:")
        for chunk in similar_chunks:
            print(
                f"{chunk.metadata['source']} - lines {chunk.metadata['starting_line']} - {chunk.metadata['ending_line']}")
            print(f"{chunk.page_content}")
            print("="*100)
            print(Fore.RED +
                  f"{chunk.metadata['source']} - lines {chunk.metadata['starting_line']} - {chunk.metadata['ending_line']}")

        qa_prompt = self.create_prompt(query_str, similar_chunks)
        print("Computing response...")
        return self.llm(qa_prompt)
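
A sketch of how `QA` might be wired up outside the CLI (the paths, model, and store settings are placeholders; the real wiring lives in `cli.py` and `config_utils`):

```python
from langchain.embeddings import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.vectorstores import DeepLake
from repogpt.qa.qa import QA

# open an existing DeepLake index read-only and ask a single question
store = DeepLake(dataset_path="/my/vs/path",
                 embedding_function=OpenAIEmbeddings(),
                 read_only=True)
qa = QA(llm=OpenAI(), deeplake_store=store, num_results=25)
print(qa.get_resp("explain the value_counts method in the ArrowExtensionArray class"))
```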
