Skip to content

Commit

Permalink
fix: fix ignore logic
Browse files Browse the repository at this point in the history
* fix: Fix processing files from .gitignore

Refs #711

* refactor: Move filtering ignored files to Repository class

* ruff format

* chore: Add a test

* chore: Cleanup

* chore: Smallest possible working sleep time

* refactor: Add separate add_file_delete_commit function

* refactor: Use rg instead of git ls-files

* chore: Fix test name
  • Loading branch information
last-partizan authored Nov 17, 2024
1 parent b2f661d commit 9d302ae
Show file tree
Hide file tree
Showing 5 changed files with 78 additions and 27 deletions.
15 changes: 2 additions & 13 deletions seagoat/engine.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""
This module allows you to use seagoat as a library
This module allows you to use seagoat as a library
"""

import asyncio
from concurrent.futures import ThreadPoolExecutor
from functools import partial
Expand Down Expand Up @@ -82,13 +83,6 @@ def _add_to_collection(self, chunk):
for source in chain(*self._fetchers.values()):
source["cache_chunk"](chunk)

def _is_file_ignored(self, path: str):
    """
    Report whether ``path`` matches any configured ignore pattern.

    Patterns are read from ``self.config["server"]["ignorePatterns"]`` and
    each one is tested with ``pathlib.Path.match``. Returns ``True`` on the
    first match and ``False`` when nothing matches (including when the
    pattern list is empty).
    """
    ignore_patterns = self.config["server"]["ignorePatterns"]
    return any(Path(path).match(glob) for glob in ignore_patterns)

def process_chunk(self, chunk):
if chunk.chunk_id in self.cache.data["chunks_already_analyzed"]:
return
Expand All @@ -105,8 +99,6 @@ def _create_vector_embeddings(self, minimum_chunks_to_analyze=None):
chunks_to_process = []

for file, _ in self.repository.top_files():
if self._is_file_ignored(file.path):
continue
for chunk in file.get_chunks():
if chunk.chunk_id not in self.cache.data["chunks_already_analyzed"]:
chunks_to_process.append(chunk)
Expand Down Expand Up @@ -189,9 +181,6 @@ def _format_results(self, query: str, hard_count_limit: int = 1000):
merged_results = {}

for result_item in self._results:
if self._is_file_ignored(result_item.gitfile.path):
continue

if result_item.gitfile.path not in merged_results:
merged_results[result_item.gitfile.path] = result_item
continue
Expand Down
22 changes: 18 additions & 4 deletions seagoat/repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from pathlib import Path

from seagoat.gitfile import GitFile
from seagoat.utils.config import get_config_values
from seagoat.utils.file_reader import autodecode_bytes
from seagoat.utils.file_types import is_file_type_supported

Expand All @@ -23,6 +24,7 @@ def parse_commit_info(raw_line: str):
class Repository:
def __init__(self, repo_path: str):
self.path = Path(repo_path)
self.config = get_config_values(self.path)
self.file_changes = defaultdict(list)
self.frecency_scores = {}

Expand All @@ -36,6 +38,13 @@ def _get_working_tree_diff(self):
["git", "-C", str(self.path), "diff"], text=True
).strip()

def _is_file_ignored(self, path: str):
    """
    Tell whether ``path`` is excluded by the repository's ignore patterns.

    Each entry of ``self.config["server"]["ignorePatterns"]`` is checked
    against the path using ``pathlib.Path.match``; the result is ``True``
    as soon as one pattern matches, otherwise ``False``.
    """
    configured_patterns = self.config["server"]["ignorePatterns"]
    return any(Path(path).match(glob) for glob in configured_patterns)

def get_file_object_id(self, file_path: str):
"""
Returns the git object id for the current version
Expand Down Expand Up @@ -85,6 +94,10 @@ def analyze_files(self):

self.file_changes.clear()

files = set(
subprocess.check_output(["rg", "--files"], cwd=self.path, text=True).split()
)

current_commit_info = None
with subprocess.Popen(cmd, stdout=subprocess.PIPE, text=True) as proc:
assert proc.stdout is not None
Expand All @@ -95,10 +108,11 @@ def analyze_files(self):
elif line:
filename = line

if not is_file_type_supported(filename):
continue

if not (self.path / filename).exists():
if (
not is_file_type_supported(filename)
or self._is_file_ignored(filename)
or filename not in files
):
continue

self.file_changes[filename].append(current_commit_info)
Expand Down
34 changes: 25 additions & 9 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,23 +153,26 @@ def add_fake_data(self):

def add_file_change_commit(
self,
file_name,
contents,
author,
commit_message,
file_name: str,
contents: str,
author: Actor,
commit_message: str,
encoding="utf-8",
):
) -> str:
# Create parent directory if it doesn't exist:
parent_folder = os.path.dirname(os.path.join(self.working_dir, file_name))
if parent_folder:
os.makedirs(parent_folder, exist_ok=True)

with open(
os.path.join(self.working_dir, file_name), "w", encoding=encoding
) as output_file:
output_file.write(contents)
file_path = os.path.join(self.working_dir, file_name)

with open(file_path, "w", encoding=encoding) as output_file:
output_file.write(contents)
self.index.add([file_name])

return self._commit_changes(commit_message, author)

def _commit_changes(self, commit_message: str, author: Actor) -> str:
return self.index.commit(
commit_message,
author=author,
Expand All @@ -179,6 +182,19 @@ def add_file_change_commit(
skip_hooks=True,
).hexsha

def add_file_delete_commit(
    self,
    file_name: str,
    author: Actor,
    commit_message: str,
) -> str:
    """
    Delete a file from the working directory and commit the deletion.

    The file at ``file_name`` (relative to ``self.working_dir``) is
    unlinked from disk and removed from the index, then the change is
    committed through ``_commit_changes``. The value returned by that
    helper is passed back to the caller unchanged.
    """
    # Remove the file from disk first, then drop it from the index.
    os.unlink(os.path.join(self.working_dir, file_name))
    self.index.remove(file_name)

    deletion_commit = self._commit_changes(commit_message, author)
    return deletion_commit


@pytest.fixture
def generate_repo():
Expand Down
2 changes: 1 addition & 1 deletion tests/test_queue.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ def test_handle_get_stats(
def test_important_files_are_analyzed_first(create_task_queue, mocker, repo):
enqueue = mocker.patch("seagoat.queue.task_queue.TaskQueue.enqueue")
create_task_queue()
sleep(0.5)
sleep(2.0)
repository = Repository(repo.working_dir)
repository.analyze_files()
order_of_files_analyzed = []
Expand Down
32 changes: 32 additions & 0 deletions tests/test_repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from freezegun import freeze_time

from seagoat.engine import Engine
from tests.conftest import MockRepo
from tests.test_server import pytest


Expand Down Expand Up @@ -265,6 +266,37 @@ def test_does_not_crash_because_of_non_existent_files(repo):
}


def test_ignored_files_is_really_ignored(repo: MockRepo):
    """A file listed in .gitignore must not be reported by top_files()."""
    ignored_path = "node_modules/acorn/README.md"
    john = repo.actors["John Doe"]

    # Step 1: the file gets committed by mistake.
    repo.add_file_change_commit(
        file_name=ignored_path,
        contents="Hello",
        author=john,
        commit_message="Incidentally commited node_modules.",
    )
    # Step 2: its directory is added to .gitignore.
    repo.add_file_change_commit(
        file_name=".gitignore",
        contents="node_modules",
        author=john,
        commit_message="Added node_modules to gitignore.",
    )
    # Step 3: the file is deleted from the repository history going forward.
    repo.add_file_delete_commit(
        file_name=ignored_path,
        author=john,
        commit_message="Removed node_modules.",
    )
    # Step 4: the file reappears on disk only; git should now ignore it.
    (Path(repo.working_dir) / ignored_path).write_text("Hello.")

    engine = Engine(repo.working_dir)
    engine.analyze_codebase()

    listed_paths = {entry.path for entry, _ in engine.repository.top_files()}

    assert ignored_path not in listed_paths


@pytest.mark.asyncio
@pytest.mark.parametrize("chunks_to_analyze", [1, 2, 4])
async def test_allows_limiting_how_many_files_are_automatically_analized(
Expand Down

0 comments on commit 9d302ae

Please sign in to comment.