Skip to content

Commit

Permalink
Merge pull request #28 from tawanda-kembo/feat/toggle-include-comments
Browse files Browse the repository at this point in the history
Feat/toggle include comments
  • Loading branch information
tawandakembo authored Aug 24, 2024
2 parents 318bcaf + b7e6ae6 commit e0f0abe
Show file tree
Hide file tree
Showing 8 changed files with 143 additions and 52 deletions.
Binary file added .DS_Store
Binary file not shown.
3 changes: 2 additions & 1 deletion .flake8
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
[flake8]
max-line-length = 120
exclude = .git,__pycache__,docs/source/conf.py,old,build,dist
exclude = .git,__pycache__,docs/source/conf.py,old,build,dist
ignore = E402
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -142,3 +142,4 @@ build/
dist/
.aider*
.aider.*
.benchmarks/
9 changes: 8 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ Have you ever needed to provide a comprehensive overview of your codebase for a
- **.gitignore Support**: Automatically ignores files specified in the `.gitignore` file if one exists.
- **Customizable Output**: Outputs a single Markdown file named `collated-code.md` by default, with options to specify the path to the codebase directory and output file name.
- **Binary File Inclusion**: Includes binary files such as images in the output with a note about their file type.
- **Comment Exclusion Option**: Allows users to exclude comments and docstrings from the collated code.
- **Help Command**: Provides a help command to display usage instructions.

## Installation
Expand All @@ -38,6 +39,12 @@ For more detailed usage instructions, use the help command:
code-collator --help
```

To exclude comments and docstrings from the collated code:

```sh
code-collator --path /path/to/codebase --output my-collated-code.md --comments off
```



## Running Tests
Expand Down Expand Up @@ -86,4 +93,4 @@ Please see CONTRIBUTING.md for details on our code of conduct and the process fo

## License

This project is licensed under the MIT License - see the LICENSE file for details.
This project is licensed under the MIT License - see the LICENSE file for details.
56 changes: 47 additions & 9 deletions code_collator/collate.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,20 @@
import os
import sys
import argparse
from pathlib import Path
import logging
from fnmatch import fnmatch
from pygments import lexers, token
from pygments.util import ClassNotFound


def setup_logging():
    """Configure the root logger to emit INFO-level records to stdout.

    The function is safe to call more than once: a stdout handler installed
    by an earlier call is removed before a fresh one is attached, so repeated
    calls (for example several ``main()`` invocations in one process) do not
    produce duplicated log lines.
    """
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    # Drop only the handlers this function installed earlier; handlers added
    # by anyone else (e.g. a test framework's capture handler) are left alone.
    for existing in list(logger.handlers):
        if getattr(existing, '_code_collator_handler', False):
            logger.removeHandler(existing)

    # Bind to the current sys.stdout so a redirected stdout is honoured.
    handler = logging.StreamHandler(sys.stdout)
    handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
    handler._code_collator_handler = True  # tag used by the cleanup loop above
    logger.addHandler(handler)


def is_binary_file(filepath):
Expand All @@ -33,7 +37,7 @@ def read_gitignore(path):
try:
with open(gitignore_path, 'r') as f:
patterns = f.read().splitlines()
logging.info("Loaded .gitignore patterns from {gitignore_path}")
logging.info(f"Loaded .gitignore patterns from {gitignore_path}")
return patterns
except Exception as e:
logging.error(f"Error reading .gitignore file {gitignore_path}: {e}")
Expand Down Expand Up @@ -65,7 +69,38 @@ def should_ignore(file_path, ignore_patterns):
return False


def collate_codebase(path, output_file):
def process_file_content(content, file_path, include_comments):
    """Return file content, optionally stripped of comments and docstrings.

    Args:
        content: Full text of the file.
        file_path: Path or file name; used only to select a Pygments lexer.
        include_comments: When true, ``content`` is returned unchanged.

    Returns:
        ``content`` itself when comments are kept or when no lexer matches
        ``file_path``. Otherwise the text rebuilt from the remaining tokens,
        with leading and trailing whitespace stripped.
    """
    if include_comments:
        return content

    try:
        lexer = lexers.get_lexer_for_filename(file_path)
    except ClassNotFound:
        logging.warning(f"No lexer found for {file_path}. Returning original content.")
        return content

    def is_removable(token_type):
        """Tell whether a token is a comment or docstring that should be dropped."""
        # Pygments files preprocessor directives (e.g. C "#include", "<?php")
        # under Comment, but they are code and must survive.
        if token_type in token.Comment.Preproc or token_type in token.Comment.PreprocFile:
            return False
        return token_type in token.Comment or token_type in token.String.Doc

    # Every comment token is dropped on its own. No state is carried between
    # tokens, because a lexer may emit a whole block comment as a single
    # token, and no line-based "#" filtering is applied afterwards, because
    # the lexer already classified every real comment.
    kept_values = [
        value
        for token_type, value in lexer.get_tokens(content)
        if not is_removable(token_type)
    ]
    return ''.join(kept_values).strip()


def collate_codebase(path, output_file, include_comments=True):
"""Aggregate the codebase into a single Markdown file."""
ignore_patterns = read_gitignore(path)
try:
Expand All @@ -89,7 +124,8 @@ def collate_codebase(path, output_file):
try:
with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
content = f.read()
output.write(f"```\n{content}\n```\n\n")
processed_content = process_file_content(content, file_path, include_comments)
output.write(f"```\n{processed_content}\n```\n\n")
except Exception as e:
logging.error("Error reading file %s: %s", file_path, e)
output.write("**Note**: Error reading this file.\n\n")
Expand All @@ -100,7 +136,6 @@ def collate_codebase(path, output_file):

def main():
"""Parse arguments and initiate codebase collation."""
setup_logging()
parser = argparse.ArgumentParser(description="Aggregate codebase into a single Markdown file.")
parser.add_argument(
'-p',
Expand All @@ -110,11 +145,14 @@ def main():
help="Specify the path to the codebase directory (default: current directory)")
parser.add_argument('-o', '--output', type=str, default='collated-code.md',
help="Specify output file (default: collated-code.md)")
parser.add_argument('-c', '--comments', type=str, choices=['on', 'off'], default='on',
help="Include comments and docstrings (default: on)")

args = parser.parse_args()

setup_logging()
logging.info("Starting code collation for directory: %s", args.path)
collate_codebase(args.path, args.output)
collate_codebase(args.path, args.output, include_comments=(args.comments == 'on'))
logging.info("Code collation completed.")


Expand Down
12 changes: 0 additions & 12 deletions output.md

This file was deleted.

1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -35,3 +35,4 @@ urllib3==2.2.2
wheel==0.43.0
zipp==3.19.2
autopep8==2.3.1
pygments
113 changes: 84 additions & 29 deletions tests/test_collate.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,66 +3,121 @@
import pytest
import logging
from unittest.mock import mock_open, patch
from code_collator.collate import is_binary_file, read_gitignore, should_ignore, main

# Add the parent directory to sys.path
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

from code_collator import collate
import sys
import os

# Add the parent directory to sys.path
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))


# Add the parent directory to sys.path
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))


def test_is_binary_file():
with patch('builtins.open', mock_open(read_data=b'\x00binary\xff')):
assert is_binary_file('test.bin') is True
assert collate.is_binary_file('test.bin') is True

with patch('builtins.open', mock_open(read_data=b'hello world')):
assert is_binary_file('test.txt') is False
assert collate.is_binary_file('test.txt') is False


def test_read_gitignore():
with patch('builtins.open', mock_open(read_data='*.pyc\n__pycache__\n')):
patterns = read_gitignore('.')
patterns = collate.read_gitignore('.')
assert patterns == ['*.pyc', '__pycache__']


def test_should_ignore():
patterns = ['*.pyc', '__pycache__']
assert should_ignore('test.pyc', patterns)
assert should_ignore('test.py', patterns) is False
assert should_ignore('.git/config', patterns)
assert collate.should_ignore('test.pyc', patterns)
assert collate.should_ignore('test.py', patterns) is False
assert collate.should_ignore('.git/config', patterns)


def test_process_file_content():
content = '''
def hello():
"""This is a docstring."""
# This is a comment
print("Hello, World!")
'''
file_path = "test.py"

# Test with comments included
processed = collate.process_file_content(content, file_path, include_comments=True)
assert '"""This is a docstring."""' in processed
assert '# This is a comment' in processed

# Test with comments excluded
processed = collate.process_file_content(content, file_path, include_comments=False)
assert '"""This is a docstring."""' not in processed
assert '# This is a comment' not in processed
assert 'print("Hello, World!")' in processed


@pytest.fixture
def mock_file_system(tmp_path):
d = tmp_path / "test_dir"
d.mkdir()
(d / "test.py").write_text("print('hello')")
(d / "test.py").write_text("# This is a comment\nprint('hello')")
(d / "test.pyc").write_bytes(b'\x00\x01\x02')
return d


# def test_collate_codebase(mock_file_system, caplog):
# caplog.set_level(logging.INFO)
# output_file = mock_file_system / "output.md"
# collate_codebase(str(mock_file_system), str(output_file))
def test_collate_codebase(mock_file_system, caplog):
caplog.set_level(logging.INFO)
output_file = mock_file_system / "output.md"

# Test with comments included
collate.collate_codebase(str(mock_file_system), str(output_file), include_comments=True)
with open(output_file, 'r') as f:
content = f.read()
assert "# Collated Codebase" in content
assert "test.py" in content
assert "print('hello')" in content
assert "# This is a comment" in content

# Test with comments excluded
collate.collate_codebase(str(mock_file_system), str(output_file), include_comments=False)
with open(output_file, 'r') as f:
content = f.read()
assert "# Collated Codebase" in content
assert "test.py" in content
assert "print('hello')" in content
assert "# This is a comment" not in content


def test_main(mock_file_system, caplog, capsys):
caplog.set_level(logging.INFO)

# with open(output_file, 'r') as f:
# content = f.read()
# Test with comments included
output_with_comments = mock_file_system / 'output_with_comments.md'
with patch('sys.argv', ['collate', '-p', str(mock_file_system), '-o', str(output_with_comments), '-c', 'on']):
collate.main()

# print("Content of output file:")
# print(content)
with open(output_with_comments, 'r') as f:
content = f.read()
assert "# This is a comment" in content

# print("Captured logs:")
# print(caplog.text)
# Test with comments excluded
output_without_comments = mock_file_system / 'output_without_comments.md'
with patch('sys.argv', ['collate', '-p', str(mock_file_system), '-o', str(output_without_comments), '-c', 'off']):
collate.main()

# assert "# Collated Codebase" in content
# assert "test.py" in content
# assert "print('hello')" in content
# assert "test.pyc" in content
# assert "This is a binary file" in content
with open(output_without_comments, 'r') as f:
content = f.read()
assert "# This is a comment" not in content

def test_main(mock_file_system, caplog):
caplog.set_level(logging.INFO)
with patch('sys.argv', ['collate', '-p', str(mock_file_system), '-o', 'output.md']):
main()
# Assert log messages
assert "Starting code collation for directory:" in caplog.text
assert "Code collation completed." in caplog.text

assert "Starting code collation" in caplog.text
assert "Code collation completed" in caplog.text
# Check if specific files were processed
assert f"File {mock_file_system}/test.py is binary: False" in caplog.text
assert f"File {mock_file_system}/test.pyc is binary: True" in caplog.text

0 comments on commit e0f0abe

Please sign in to comment.