diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000..981b7d4 Binary files /dev/null and b/.DS_Store differ diff --git a/.flake8 b/.flake8 index 8921822..3d7489a 100644 --- a/.flake8 +++ b/.flake8 @@ -1,3 +1,4 @@ [flake8] max-line-length = 120 -exclude = .git,__pycache__,docs/source/conf.py,old,build,dist \ No newline at end of file +exclude = .git,__pycache__,docs/source/conf.py,old,build,dist +ignore = E402 \ No newline at end of file diff --git a/.gitignore b/.gitignore index 77ab56b..0152a67 100644 --- a/.gitignore +++ b/.gitignore @@ -142,3 +142,4 @@ build/ dist/ .aider* .aider.* +.benchmarks/ diff --git a/README.md b/README.md index 6bb8dc9..f8d8f5c 100644 --- a/README.md +++ b/README.md @@ -14,6 +14,7 @@ Have you ever needed to provide a comprehensive overview of your codebase for a - **.gitignore Support**: Automatically ignores files specified in the `.gitignore` file if one exists. - **Customizable Output**: Outputs a single Markdown file named `collated-code.md` by default, with options to specify the path to the codebase directory and output file name. - **Binary File Inclusion**: Includes binary files such as images in the output with a note about their file type. +- **Comment Exclusion Option**: Allows users to exclude comments and docstrings from the collated code. - **Help Command**: Provides a help command to display usage instructions. ## Installation @@ -38,6 +39,12 @@ For more detailed usage instructions, use the help command: code-collator --help ``` +To exclude comments and docstrings from the collated code: + +```sh +code-collator --path /path/to/codebase --output my-collated-code.md --comments off +``` + ## Running Tests @@ -86,4 +93,4 @@ Please see CONTRIBUTING.md for details on our code of conduct and the process fo ## License -This project is licensed under the MIT License - see the LICENSE file for details. \ No newline at end of file +This project is licensed under the MIT License - see the LICENSE file for details. diff --git a/code_collator/collate.py b/code_collator/collate.py index d5751d9..849c2d3 100644 --- a/code_collator/collate.py +++ b/code_collator/collate.py @@ -1,16 +1,20 @@ import os +import sys import argparse from pathlib import Path import logging from fnmatch import fnmatch +from pygments import lexers, token +from pygments.util import ClassNotFound def setup_logging(): """Set up logging configuration.""" - logging.basicConfig( - level=logging.INFO, - format='%(asctime)s - %(levelname)s - %(message)s' - ) + logger = logging.getLogger() + logger.setLevel(logging.INFO) + handler = logging.StreamHandler(sys.stdout) + handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')) + logger.addHandler(handler) def is_binary_file(filepath): @@ -33,7 +37,7 @@ def read_gitignore(path): try: with open(gitignore_path, 'r') as f: patterns = f.read().splitlines() - logging.info("Loaded .gitignore patterns from {gitignore_path}") + logging.info(f"Loaded .gitignore patterns from {gitignore_path}") return patterns except Exception as e: logging.error(f"Error reading .gitignore file {gitignore_path}: {e}") @@ -65,7 +69,38 @@ def should_ignore(file_path, ignore_patterns): return False -def collate_codebase(path, output_file): +def process_file_content(content, file_path, include_comments): + """Process file content, optionally removing comments and docstrings.""" + if include_comments: + return content + + try: + lexer = lexers.get_lexer_for_filename(file_path) + except ClassNotFound: + logging.warning(f"No lexer found for {file_path}. Returning original content.") + return content + + tokens = list(lexer.get_tokens(content)) + processed_tokens = [] + in_multiline_comment = False + + for token_type, value in tokens: + if token_type in token.Comment or token_type in token.String.Doc: + if token_type == token.Comment.Multiline: + in_multiline_comment = not in_multiline_comment + continue + if not in_multiline_comment: + processed_tokens.append((token_type, value)) + + processed_content = ''.join(value for _, value in processed_tokens).strip() + + # Remove any remaining single-line comments + processed_content = '\n'.join(line for line in processed_content.split('\n') if not line.strip().startswith('#')) + + return processed_content + + +def collate_codebase(path, output_file, include_comments=True): """Aggregate the codebase into a single Markdown file.""" ignore_patterns = read_gitignore(path) try: @@ -89,7 +124,8 @@ def collate_codebase(path, output_file): try: with open(file_path, 'r', encoding='utf-8', errors='ignore') as f: content = f.read() - output.write(f"```\n{content}\n```\n\n") + processed_content = process_file_content(content, file_path, include_comments) + output.write(f"```\n{processed_content}\n```\n\n") except Exception as e: logging.error("Error reading file %s: %s", file_path, e) output.write("**Note**: Error reading this file.\n\n") @@ -100,7 +136,6 @@ def collate_codebase(path, output_file): def main(): """Parse arguments and initiate codebase collation.""" - setup_logging() parser = argparse.ArgumentParser(description="Aggregate codebase into a single Markdown file.") parser.add_argument( '-p', @@ -110,11 +145,14 @@ def main(): help="Specify the path to the codebase directory (default: current directory)") parser.add_argument('-o', '--output', type=str, default='collated-code.md', help="Specify output file (default: collated-code.md)") + parser.add_argument('-c', '--comments', type=str, choices=['on', 'off'], default='on', + help="Include comments and docstrings (default: on)") args = parser.parse_args() + setup_logging() logging.info("Starting code collation for directory: %s", args.path) - collate_codebase(args.path, args.output) + collate_codebase(args.path, args.output, include_comments=(args.comments == 'on')) logging.info("Code collation completed.") diff --git a/output.md b/output.md deleted file mode 100644 index d154906..0000000 --- a/output.md +++ /dev/null @@ -1,12 +0,0 @@ -# Collated Codebase - -## /private/var/folders/zw/q3rt8wmn6wn_bgj0fy5g_vnr0000gn/T/pytest-of-tawandakembo/pytest-7/test_main0/test_dir/test.py - -``` -print('hello') -``` - -## /private/var/folders/zw/q3rt8wmn6wn_bgj0fy5g_vnr0000gn/T/pytest-of-tawandakembo/pytest-7/test_main0/test_dir/test.pyc - -**Note**: This is a binary file. - diff --git a/requirements.txt b/requirements.txt index fedb107..c3e3d2b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -35,3 +35,4 @@ urllib3==2.2.2 wheel==0.43.0 zipp==3.19.2 autopep8==2.3.1 +pygments diff --git a/tests/test_collate.py b/tests/test_collate.py index 914520b..38ba1e8 100644 --- a/tests/test_collate.py +++ b/tests/test_collate.py @@ -3,7 +3,17 @@ import pytest import logging from unittest.mock import mock_open, patch -from code_collator.collate import is_binary_file, read_gitignore, should_ignore, main + +# Add the parent directory to sys.path +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) + +from code_collator import collate +import sys +import os + +# Add the parent directory to sys.path +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) + # Add the parent directory to sys.path sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) @@ -11,58 +21,103 @@ def test_is_binary_file(): with patch('builtins.open', mock_open(read_data=b'\x00binary\xff')): - assert is_binary_file('test.bin') is True + assert collate.is_binary_file('test.bin') is True with patch('builtins.open', mock_open(read_data=b'hello world')): - assert is_binary_file('test.txt') is False + assert collate.is_binary_file('test.txt') is False def test_read_gitignore(): with patch('builtins.open', mock_open(read_data='*.pyc\n__pycache__\n')): - patterns = read_gitignore('.') + patterns = collate.read_gitignore('.') assert patterns == ['*.pyc', '__pycache__'] def test_should_ignore(): patterns = ['*.pyc', '__pycache__'] - assert should_ignore('test.pyc', patterns) - assert should_ignore('test.py', patterns) is False - assert should_ignore('.git/config', patterns) + assert collate.should_ignore('test.pyc', patterns) + assert collate.should_ignore('test.py', patterns) is False + assert collate.should_ignore('.git/config', patterns) + + +def test_process_file_content(): + content = ''' +def hello(): + """This is a docstring.""" + # This is a comment + print("Hello, World!") +''' + file_path = "test.py" + + # Test with comments included + processed = collate.process_file_content(content, file_path, include_comments=True) + assert '"""This is a docstring."""' in processed + assert '# This is a comment' in processed + + # Test with comments excluded + processed = collate.process_file_content(content, file_path, include_comments=False) + assert '"""This is a docstring."""' not in processed + assert '# This is a comment' not in processed + assert 'print("Hello, World!")' in processed @pytest.fixture def mock_file_system(tmp_path): d = tmp_path / "test_dir" d.mkdir() - (d / "test.py").write_text("print('hello')") + (d / "test.py").write_text("# This is a comment\nprint('hello')") (d / "test.pyc").write_bytes(b'\x00\x01\x02') return d -# def test_collate_codebase(mock_file_system, caplog): -# caplog.set_level(logging.INFO) -# output_file = mock_file_system / "output.md" -# collate_codebase(str(mock_file_system), str(output_file)) +def test_collate_codebase(mock_file_system, caplog): + caplog.set_level(logging.INFO) + output_file = mock_file_system / "output.md" + + # Test with comments included + collate.collate_codebase(str(mock_file_system), str(output_file), include_comments=True) + with open(output_file, 'r') as f: + content = f.read() + assert "# Collated Codebase" in content + assert "test.py" in content + assert "print('hello')" in content + assert "# This is a comment" in content + + # Test with comments excluded + collate.collate_codebase(str(mock_file_system), str(output_file), include_comments=False) + with open(output_file, 'r') as f: + content = f.read() + assert "# Collated Codebase" in content + assert "test.py" in content + assert "print('hello')" in content + assert "# This is a comment" not in content + + +def test_main(mock_file_system, caplog, capsys): + caplog.set_level(logging.INFO) -# with open(output_file, 'r') as f: -# content = f.read() + # Test with comments included + output_with_comments = mock_file_system / 'output_with_comments.md' + with patch('sys.argv', ['collate', '-p', str(mock_file_system), '-o', str(output_with_comments), '-c', 'on']): + collate.main() -# print("Content of output file:") -# print(content) + with open(output_with_comments, 'r') as f: + content = f.read() + assert "# This is a comment" in content -# print("Captured logs:") -# print(caplog.text) + # Test with comments excluded + output_without_comments = mock_file_system / 'output_without_comments.md' + with patch('sys.argv', ['collate', '-p', str(mock_file_system), '-o', str(output_without_comments), '-c', 'off']): + collate.main() -# assert "# Collated Codebase" in content -# assert "test.py" in content -# assert "print('hello')" in content -# assert "test.pyc" in content -# assert "This is a binary file" in content + with open(output_without_comments, 'r') as f: + content = f.read() + assert "# This is a comment" not in content -def test_main(mock_file_system, caplog): - caplog.set_level(logging.INFO) - with patch('sys.argv', ['collate', '-p', str(mock_file_system), '-o', 'output.md']): - main() + # Assert log messages + assert "Starting code collation for directory:" in caplog.text + assert "Code collation completed." in caplog.text - assert "Starting code collation" in caplog.text - assert "Code collation completed" in caplog.text + # Check if specific files were processed + assert f"File {mock_file_system}/test.py is binary: False" in caplog.text + assert f"File {mock_file_system}/test.pyc is binary: True" in caplog.text