Merge pull request #9 from alexminnaar/readme
adding terminal print color
alexminnaar authored Aug 1, 2023
2 parents 715fe7d + 6506078 commit 019fa70
Showing 13 changed files with 125 additions and 57 deletions.
24 changes: 20 additions & 4 deletions README.md
@@ -26,11 +26,31 @@ RepoGPT adds additional context to the chunk including

## Demo

In this demo, the [Pandas](https://github.com/pandas-dev/pandas/tree/main) Python library repo has been crawled and
we will ask RepoGPT some questions about it. This demo's config.ini file specifies `sentence-transformers/all-mpnet-base-v2`
Hugging Face embeddings and OpenAI's `gpt-4` model.
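
A minimal sketch of what that demo config.ini might look like, assembled from the example config files changed in this commit (the `[huggingface-embeddings]` section name and the paths are assumptions; see `example_config_files/` for the real templates):

```ini
; hypothetical demo config -- paths are placeholders
[repo]
REPO_PATH = /path/to/pandas

[vectorstore]
VS_PATH = /path/to/vectorstore
NUM_RESULTS = 25

[huggingface-embeddings]
EMBEDDING_TYPE = sentence-transformers/all-mpnet-base-v2

[openai-llm]
MODEL_NAME = gpt-4

[crawler]
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 100
```
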
### Use Case #1: Code Search

With RepoGPT you can search for a piece of code. For example, let's ask RepoGPT to "show the `value_counts` method in
the `ArrowExtensionArray` class".

![demo1](https://github.com/alexminnaar/RepoGPT/blob/main/demos/repogpt_demo1.png "demo1")

### Use Case #2: Code Understanding

RepoGPT can also explain pieces of code. For example, let's ask RepoGPT to "explain the `value_counts` method in
the `ArrowExtensionArray` class".

![demo2](https://github.com/alexminnaar/RepoGPT/blob/main/demos/repogpt_demo2.png "demo2")


### Use Case #3: Code Writing

RepoGPT can also write new code based on the repo. For example, let's ask RepoGPT to "write unit tests for the
`value_counts` method in the `ArrowExtensionArray` class".

![demo3](https://github.com/alexminnaar/RepoGPT/blob/main/demos/repogpt_demo3.png "demo3")


## Supported Languages

@@ -96,8 +116,4 @@ Ask a question:
```
Then ask your question and wait for the response. To exit, type 'exit'.
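
A hypothetical session using the first demo question (the `Relevant files:`, `Computing response...`, and `Response:` lines come from this commit's `qa.py` and `cli.py`; the file path and answer are illustrative):

```
Ask a question: show the value_counts method in the ArrowExtensionArray class
Relevant files:
/path/to/pandas/core/arrays/arrow/array.py - lines ... - ...
Computing response...
Response:
...
```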

## Recommended Config

For the best quality results,


5 changes: 4 additions & 1 deletion cli.py
@@ -4,10 +4,13 @@
from repogpt import config_utils
import argparse
import logging
from colorama import Fore, Back, Style, init

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("repogpt_cli_logger")

init(autoreset=True)


def parse_arguments():
    parser = argparse.ArgumentParser()
@@ -48,7 +51,7 @@ def main():

        try:
            resp = qa.get_resp(query)
            print(f"Response:\n{resp}")
            print(Fore.GREEN + f"Response:\n{resp}")
        except Exception as e:
            logger.error(f"Exception occurred computing LLM Response: {e}")

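This hunk is the change the commit title describes: the LLM response is now printed in green. A minimal standalone sketch of the same colorama pattern:

```python
from colorama import Fore, init

# autoreset=True restores the default terminal style after every print,
# so only strings that explicitly set a color are affected.
init(autoreset=True)

print(Fore.GREEN + "Response:\nlooks good")  # rendered in green
print("subsequent output keeps the default color")
```
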
Binary file added demos/repogpt_demo1.png
Binary file added demos/repogpt_demo2.png
Binary file added demos/repogpt_demo3.png
6 changes: 5 additions & 1 deletion example_config_files/GPT4ALL_config.ini
@@ -12,4 +12,8 @@ EMBEDDING_TYPE = openai
MODEL_NAME = GPT4All
MODEL_PATH = models/ggml-gpt4all-j-v1.3-groovy.bin
MODEL_N_CTX=1000
MODEL_N_BATCH=8
MODEL_N_BATCH=8

[crawler]
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 100
6 changes: 5 additions & 1 deletion example_config_files/huggingface_config.ini
@@ -9,4 +9,8 @@ NUM_RESULTS = 5
EMBEDDING_TYPE = ll-MiniLM-L6-v2

[openai-llm]
MODEL_NAME = gpt-3.5-turbo-16k
MODEL_NAME = gpt-3.5-turbo-16k

[crawler]
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 100
8 changes: 6 additions & 2 deletions example_config_files/openai_config.ini
@@ -3,10 +3,14 @@ REPO_PATH = /my/repo/path

[vectorstore]
VS_PATH = /my/vs/path
NUM_RESULTS = 5
NUM_RESULTS = 25

[openai-embeddings]
EMBEDDING_TYPE = openai

[openai-llm]
MODEL_NAME = gpt-3.5-turbo-16k
MODEL_NAME = gpt-4

[crawler]
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 100
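
The `[crawler]` section is added to all three example configs. A sketch of how these values could be read with the standard library (repogpt's actual `config_utils` may differ):

```python
import configparser

config = configparser.ConfigParser()
config.read("example_config_files/openai_config.ini")

# getint parses the raw strings from the [crawler] section into ints
chunk_size = config.getint("crawler", "CHUNK_SIZE")        # 1000
chunk_overlap = config.getint("crawler", "CHUNK_OVERLAP")  # 100
```
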
31 changes: 18 additions & 13 deletions repogpt/crawler.py
@@ -5,14 +5,12 @@
from langchain.vectorstores import DeepLake
from repogpt.parsers.pygments_parser import PygmentsParser
from repogpt.parsers.python_parser import PythonParser
from repogpt.parsers.base import SummaryPosition, FileSummary
from multiprocessing import Pool
from tqdm import tqdm
from typing import List, Optional, Tuple
from typing import List, Optional
import os
import fnmatch
import logging
import traceback
from functools import partial

logging.basicConfig(level=logging.INFO)
@@ -61,8 +59,14 @@ def is_git_dir(dir_path: str) -> bool:
    return os.path.isdir(git_dir)


def process_file(file_contents: List[Document], dir_path: str, file_name: str, extension: str, chunk_size: int,
                 chunk_overlap: int) -> List[Document]:
def process_file(
        file_contents: List[Document],
        dir_path: str,
        file_name: str,
        extension: str,
        chunk_size: int,
        chunk_overlap: int
) -> List[Document]:
    """For a given file, get the summary, split into chunks and create context document chunks to be indexed"""
    file_doc = file_contents[0]
    # get file summary for raw file
@@ -84,15 +88,19 @@ def process_file(file_contents: List[Document], dir_path: str, file_name: str, e
        doc.metadata['starting_line'] = starting_line
        doc.metadata['ending_line'] = ending_line

        # get methods and classes associated with chunk
        if extension == '.py':
            method_class_summary = PythonParser.get_closest_method_class_in_snippet(file_summary, starting_line,
                                                                                    ending_line)
        else:
            method_class_summary = PygmentsParser.get_closest_method_class_in_snippet(file_summary, starting_line,
                                                                                      ending_line)
        doc.page_content = f"The following code snippet is from a file at location {os.path.join(dir_path, file_name)} " \
                           f"starting at line {starting_line} and ending at line {ending_line}. {method_class_summary} " \
                           f"The code snippet starting at line {starting_line} and ending at line {ending_line} is \n ```\n{doc.page_content}\n``` "
        doc.page_content = f"The following code snippet is from a file at location " \
                           f"{os.path.join(dir_path, file_name)} " \
                           f"starting at line {starting_line} and ending at line {ending_line}. " \
                           f"{method_class_summary} " \
                           f"The code snippet starting at line {starting_line} and ending at line " \
                           f"{ending_line} is \n ```\n{doc.page_content}\n``` "
    return split_docs


@@ -109,19 +117,17 @@ def filter_files(root_dir: str) -> List[FileProperties]:
            if extension in LANG_MAPPING and not contains_hidden_dir(dir_path):
                files_to_crawl.append(FileProperties(dir_path, file, extension))
            else:
                logger.info(f"Skipping {os.path.join(dir_path, file)} - File/directory type not supported.")
                logger.info(f"Skipping {os.path.join(dir_path, file)} - File or directory type not supported.")
    return files_to_crawl


def process_and_split(file: FileProperties, chunk_size: int, chunk_overlap: int) -> Optional[List[Document]]:
    """For a given file, load it into memory and process it"""

    try:
        loader = TextLoader(os.path.join(file.dir_path, file.file_name), encoding='utf-8')
        chunks = process_file(loader.load(), file.dir_path, file.file_name, file.extension, chunk_size, chunk_overlap)
    except Exception as e:
        logger.error(f"Error processing file {os.path.join(file.dir_path, file.file_name)}. Skipping file. {e}")
        traceback.print_exc()
        return None
    return chunks

@@ -144,5 +150,4 @@ def crawl_and_split(root_dir: str, chunk_size: int = 3000, chunk_overlap: int =


def index(docs: List[Document], embedding_type: Embeddings, vs_path: str):
    return None
    # return DeepLake.from_documents(docs, embedding_type, dataset_path=vs_path)
    return DeepLake.from_documents(docs, embedding_type, dataset_path=vs_path)
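
With `index` now actually persisting to DeepLake instead of returning `None`, a crawl-then-index run could look like the following sketch (the embedding class is illustrative; any langchain `Embeddings` implementation should work):

```python
from langchain.embeddings import HuggingFaceEmbeddings
from repogpt.crawler import crawl_and_split, index

# crawl the repo into context-annotated chunks, then embed and persist them
docs = crawl_and_split("/my/repo/path", chunk_size=1000, chunk_overlap=100)
vector_store = index(docs, HuggingFaceEmbeddings(), "/my/vs/path")
```
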
12 changes: 12 additions & 0 deletions repogpt/parsers/base.py
@@ -1,4 +1,5 @@
from abc import ABC, abstractmethod
from typing import List, Tuple


class SummaryPosition:
@@ -21,6 +22,17 @@ def add_method(self, method_name: str, method_start_line: int, method_end_line:


class CodeParser(ABC):
    @staticmethod
    @abstractmethod
    def get_summary_from_position(summary_positions: List[SummaryPosition], start_line: int,
                                  end_line: int) -> Tuple[List[SummaryPosition], List[SummaryPosition]]:
        """Helper function to get object positions within snippet"""

    @staticmethod
    @abstractmethod
    def get_closest_method_class_in_snippet(file_summary: FileSummary, snippet_start_line: int,
                                            snippet_end_line: int) -> str:
        """Get the relevant methods and classes in a snippet and convert to prompt"""

    @staticmethod
    @abstractmethod
40 changes: 24 additions & 16 deletions repogpt/parsers/pygments_parser.py
@@ -7,21 +7,25 @@
class PygmentsParser(CodeParser):

    @staticmethod
    def get_summary_from_position(summary_positions: List[SummaryPosition], start_line: int,
                                  end_line: int) -> Tuple[List[str], List[str]]:
        """For a given list of summary positions and start/end lines find which positions are before and inside the lines"""
    def get_summary_from_position(
            summary_positions: List[SummaryPosition],
            start_line: int,
            end_line: int
    ) -> Tuple[List[SummaryPosition], List[SummaryPosition]]:
        """For a given list of summary positions and start/end lines find which positions are before and inside the
        lines"""
        last_obj = []
        current_obj = []

        # TODO: binary search-ify this
        for s_pos in summary_positions:
            # get last defined obj before the snippet
            if s_pos.start_line < start_line:
                last_obj.append(s_pos.name)
                last_obj.append(s_pos)

            # get any obj defined in this snippet
            if start_line <= s_pos.start_line <= end_line:
                current_obj.append(s_pos.name)
                current_obj.append(s_pos)

            # ignore everything past this snippet
            if s_pos.start_line > end_line:
@@ -30,8 +34,11 @@ def get_summary_from_position(summary_positions: List[SummaryPosition], start_li
        return last_obj, current_obj

    @staticmethod
    def get_closest_method_class_in_snippet(file_summary: FileSummary, snippet_start_line: int,
                                            snippet_end_line: int) -> str:
    def get_closest_method_class_in_snippet(
            file_summary: FileSummary,
            snippet_start_line: int,
            snippet_end_line: int
    ) -> str:
        """For a given file summary and snippet start/end lines extract summary information for the snippet"""

        closest_method_class_summary = ""
@@ -40,26 +47,27 @@ def get_closest_method_class_in_snippet(file_summary: FileSummary, snippet_start
                                                                               snippet_end_line)

        if len(last_class) == 1:
            closest_method_class_summary += f" The last class defined before this snippet was called {last_class[0]}."
            closest_method_class_summary += f" The last class defined before this snippet was called " \
                                            f"{last_class[0].name}."
        elif len(last_class) > 1:
            multi_class_summary = " and ".join([f"{c}" for c in last_class])
            multi_class_summary = " and ".join([f"{c.name}" for c in last_class])
            closest_method_class_summary += f" The classes defined before this snippet are {multi_class_summary}."
        if len(current_class) == 1:
            closest_method_class_summary += f" The class defined in this snippet is called {current_class[0]}."
            closest_method_class_summary += f" The class defined in this snippet is called {current_class[0].name}."
        elif len(current_class) > 1:
            multi_class_summary = " and ".join([f"{c}" for c in current_class])
            multi_class_summary = " and ".join([f"{c.name}" for c in current_class])
            closest_method_class_summary += f" The classes defined in this snippet are {multi_class_summary}."

        last_method, current_method = PygmentsParser.get_summary_from_position(file_summary.methods, snippet_start_line,
                                                                               snippet_end_line)
                                                                               snippet_end_line)

        if last_method:
            closest_method_class_summary += f" The beginning of this snippet contains the end of the {last_method[-1]} " \
                                            "method."
            closest_method_class_summary += f" The beginning of this snippet contains the end of the " \
                                            f"{last_method[-1].name} method."
        if len(current_method) == 1:
            closest_method_class_summary += f" The method defined in this snippet is called {current_method[0]}."
            closest_method_class_summary += f" The method defined in this snippet is called {current_method[0].name}."
        elif len(current_method) > 1:
            multi_method_summary = " and ".join([f"{meth}" for meth in current_method])
            multi_method_summary = " and ".join([f"{meth.name}" for meth in current_method])
            closest_method_class_summary += f" The methods defined in this snippet are {multi_method_summary}."

        return closest_method_class_summary
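
A worked example of the partition `get_summary_from_position` performs, and why this commit switches its return type from names to `SummaryPosition` objects (a sketch; the `SummaryPosition` constructor arguments are assumed from its usage in `base.py`):

```python
from repogpt.parsers.base import SummaryPosition
from repogpt.parsers.pygments_parser import PygmentsParser

# assumed constructor: SummaryPosition(name, start_line, end_line)
positions = [
    SummaryPosition("foo", 5, 9),    # starts before the snippet
    SummaryPosition("bar", 15, 18),  # starts inside the snippet
    SummaryPosition("baz", 40, 44),  # starts after the snippet, ignored
]

last, current = PygmentsParser.get_summary_from_position(positions, 10, 20)
# last holds foo (started before line 10); current holds bar (lines 10-20).
# Returning positions rather than bare names lets callers read .name,
# .start_line and .end_line when building the summary strings.
```
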
33 changes: 21 additions & 12 deletions repogpt/parsers/python_parser.py
@@ -5,8 +5,11 @@

class PythonParser(CodeParser):
    @staticmethod
    def get_summary_from_position(summary_positions: List[SummaryPosition], start_line: int,
                                  end_line: int) -> Tuple[List[SummaryPosition], List[SummaryPosition]]:
    def get_summary_from_position(
            summary_positions: List[SummaryPosition],
            start_line: int,
            end_line: int
    ) -> Tuple[List[SummaryPosition], List[SummaryPosition]]:

        last_obj = []
        current_obj = []
@@ -27,19 +30,24 @@ def get_summary_from_position(summary_positions: List[SummaryPosition], start_li
        return last_obj, current_obj

    @staticmethod
    def get_closest_method_class_in_snippet(file_summary: FileSummary, snippet_start_line: int,
                                            snippet_end_line: int) -> str:
    def get_closest_method_class_in_snippet(
            file_summary: FileSummary,
            snippet_start_line: int,
            snippet_end_line: int
    ) -> str:
        closest_method_class_summary = ""

        last_class, current_class = PythonParser.get_summary_from_position(file_summary.classes, snippet_start_line,
                                                                           snippet_end_line)

        if last_class:
            closest_method_class_summary += f" The last class defined before this snippet was called `{last_class[-1].name}` " \
                                            f"starting at line {last_class[-1].start_line} and ending at line {last_class[-1].end_line}."
            closest_method_class_summary += f" The last class defined before this snippet was called " \
                                            f"`{last_class[-1].name}` starting at line {last_class[-1].start_line} " \
                                            f"and ending at line {last_class[-1].end_line}."
        if len(current_class) == 1:
            closest_method_class_summary += f" The class defined in this snippet is called `{current_class[0].name}`" \
                                            f"starting at line {current_class[0].start_line} and ending at line {current_class[0].end_line}."
                                            f"starting at line {current_class[0].start_line} and ending at line " \
                                            f"{current_class[0].end_line}."
        elif len(current_class) > 1:
            multi_class_summary = " and ".join(
                [f"`{c.name}` starting at line {c.start_line} and ending at line {c.end_line}" for c in current_class])
@@ -49,12 +57,13 @@ def get_closest_method_class_in_snippet(file_summary: FileSummary, snippet_start
                                                                           snippet_end_line)

        if last_method:
            closest_method_class_summary += f" The last method starting before this snippet is called `{last_method[-1].name}` " \
                                            f"which starts on line {last_method[-1].start_line} and ends at " \
                                            f"line {last_method[-1].end_line}."
            closest_method_class_summary += f" The last method starting before this snippet is called " \
                                            f"`{last_method[-1].name}` which starts on line " \
                                            f"{last_method[-1].start_line} and ends at line {last_method[-1].end_line}."
        if len(current_method) == 1:
            closest_method_class_summary += f" The method defined in this snippet is called `{current_method[0].name}` " \
                                            f"starting at line {current_method[0].start_line} and ending at line " \
            closest_method_class_summary += f" The method defined in this snippet is called " \
                                            f"`{current_method[0].name}` starting at line " \
                                            f"{current_method[0].start_line} and ending at line " \
                                            f"{current_method[0].end_line}."
        elif len(current_method) > 1:
            multi_method_summary = " and ".join(
17 changes: 10 additions & 7 deletions repogpt/qa/qa.py
@@ -2,6 +2,9 @@
from langchain.llms import BaseLLM
from langchain.docstore.document import Document
from typing import List
from colorama import Fore, Back, Style, init

init(autoreset=True)


class QA:
@@ -17,21 +20,21 @@ def __init__(self, llm: BaseLLM, deeplake_store: DeepLake, num_results: int):
    def create_prompt(self, query_str: str, similar_chunks: List[Document]) -> str:
        """Build the final prompt string using query and similar chunks"""
        similar_chunk_str = '\n'.join([chunk.page_content for chunk in similar_chunks])
        # TODO: add to prompt to not use test files if query does not explicitly mention test files
        # TODO: Try structured json prompt
        final_prompt = f"Given these code snippets, \n {similar_chunk_str}\n The question is: {query_str}"
        final_prompt = f"You will be asked a question based on the following code snippets, \n {similar_chunk_str}\n " \
                       f"You may need to combine the above snippets according to their line numbers to answer the " \
                       f"following question. The question is: {query_str}"
        return final_prompt

    def get_resp(self, query_str: str) -> str:
        """Given a string, get similar chunks and construct a prompt feed it to LLM and return response"""
        # reverse similar chunks so that most relevant are less likely to be forgotten
        similar_chunks = self.retriever.get_relevant_documents(query_str)[::-1]
        print("Relevant files:")
        print(Fore.RED + "Relevant files:")
        for chunk in similar_chunks:
            print(
                f"{chunk.metadata['source']} - lines {chunk.metadata['starting_line']} - {chunk.metadata['ending_line']}")
            print(f"{chunk.page_content}")
            print("="*100)
            print(Fore.RED +
                  f"{chunk.metadata['source']} - lines {chunk.metadata['starting_line']} - {chunk.metadata['ending_line']}")

        qa_prompt = self.create_prompt(query_str, similar_chunks)
        print("Computing response...")
        return self.llm(qa_prompt)
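
A sketch of how `QA` might be wired up outside the CLI (the paths, model, and store settings are placeholders; the real wiring lives in `cli.py` and `config_utils`):

```python
from langchain.embeddings import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.vectorstores import DeepLake
from repogpt.qa.qa import QA

# open an existing DeepLake index read-only and ask a single question
store = DeepLake(dataset_path="/my/vs/path",
                 embedding_function=OpenAIEmbeddings(),
                 read_only=True)
qa = QA(llm=OpenAI(), deeplake_store=store, num_results=25)
print(qa.get_resp("explain the value_counts method in the ArrowExtensionArray class"))
```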
