From 4aa69db2e8dbcec6c79ed604243bb53e0dae063c Mon Sep 17 00:00:00 2001 From: Mike Casale <46603283+mikewcasale@users.noreply.github.com> Date: Tue, 17 Dec 2024 06:49:56 -0500 Subject: [PATCH] refactor: improve code quality and documentation (v0.1.6) - Add comprehensive docstrings with Examples sections - Add proper type hints for all functions and classes - Add Raises sections in docstrings for error handling - Improve error handling with custom exceptions - Enhance code organization and module structure - Update project structure to use flat directory layout - Improve README with current project structure and standards --- CHANGELOG.md | 19 + README.md | 71 ++-- repominify/__init__.py | 12 +- repominify/cli.py | 118 +++--- repominify/constants.py | 61 +++ repominify/core/__init__.py | 15 - repominify/core/types.py | 70 ---- .../dependency_checker.py => dependencies.py} | 180 ++++----- repominify/exceptions.py | 108 +++++ repominify/{io => }/exporters.py | 101 ++++- repominify/formatters.py | 300 ++++++++++++++ repominify/{core => }/graph.py | 368 +++++++++++------- repominify/io/__init__.py | 12 - repominify/io/formatters.py | 145 ------- repominify/{utils => }/logging.py | 27 +- repominify/{core => }/parser.py | 63 ++- repominify/stats.py | 125 ++++++ repominify/types.py | 214 ++++++++++ repominify/utils/stats.py | 212 ---------- 19 files changed, 1363 insertions(+), 858 deletions(-) create mode 100644 repominify/constants.py delete mode 100644 repominify/core/__init__.py delete mode 100644 repominify/core/types.py rename repominify/{utils/dependency_checker.py => dependencies.py} (67%) create mode 100644 repominify/exceptions.py rename repominify/{io => }/exporters.py (51%) create mode 100644 repominify/formatters.py rename repominify/{core => }/graph.py (50%) delete mode 100644 repominify/io/__init__.py delete mode 100644 repominify/io/formatters.py rename repominify/{utils => }/logging.py (75%) rename repominify/{core => }/parser.py (74%) create mode 100644 
repominify/stats.py create mode 100644 repominify/types.py delete mode 100644 repominify/utils/stats.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 0140683..dcbd8af 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,24 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [0.1.6] - 2024-12-15 + +### Changed +- Improved code quality and documentation: + - Added comprehensive docstrings with Examples sections + - Added proper type hints for all functions and classes + - Added Raises sections in docstrings for error handling + - Improved error handling with custom exceptions + - Enhanced code organization and module structure +- Updated project structure to use flat directory layout +- Improved README with current project structure and standards + +### Fixed +- Function and method signatures in docstrings +- Error handling in dependency management +- Type hints in graph building and analysis +- Code formatting and consistency + ## [0.1.5] - 2024-12-14 ### Added @@ -68,6 +86,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Restructured project to use src/ directory layout - Updated package metadata and documentation +[0.1.6]: https://github.com/mikewcasale/repominify/compare/v0.1.5...v0.1.6 [0.1.5]: https://github.com/mikewcasale/repominify/compare/v0.1.4...v0.1.5 [0.1.4]: https://github.com/mikewcasale/repominify/compare/v0.1.3...v0.1.4 [0.1.3]: https://github.com/mikewcasale/repominify/compare/v0.1.2...v0.1.3 diff --git a/README.md b/README.md index 4d60a16..032362d 100644 --- a/README.md +++ b/README.md @@ -163,46 +163,37 @@ When you run repominify, it generates several files in your output directory: ``` repominify/ -├── src/ # Source code -│ ├── core/ # Core functionality -│ │ ├── graph.py # Graph building and analysis -│ │ ├── 
parser.py # Repomix file parsing -│ │ └── types.py # Shared types and data structures -│ ├── io/ # Input/Output operations -│ │ ├── exporters.py # Graph export functionality -│ │ └── formatters.py # Text representation formatting -│ ├── utils/ # Utility modules -│ │ ├── dependency_checker.py # Dependency management -│ │ ├── logging.py # Logging configuration -│ │ └── stats.py # Statistics and comparison -│ ├── cli.py # Command-line interface -│ └── __init__.py # Package initialization -├── tests/ # Test suite -│ ├── test_end2end.py # End-to-end tests +├── repominify/ # Source code +│ ├── graph.py # Graph building and analysis +│ ├── parser.py # Repomix file parsing +│ ├── types.py # Core types and data structures +│ ├── exporters.py # Graph export functionality +│ ├── formatters.py # Text representation formatting +│ ├── dependencies.py # Dependency management +│ ├── logging.py # Logging configuration +│ ├── stats.py # Statistics and comparison +│ ├── constants.py # Shared constants +│ ├── exceptions.py # Custom exceptions +│ ├── cli.py # Command-line interface +│ └── __init__.py # Package initialization +├── tests/ # Test suite +│ ├── test_end2end.py # End-to-end tests │ └── data/ # Test data files -├── setup.py # Package configuration -├── pyproject.toml # Build system requirements -├── LICENSE # MIT License -└── README.md # This file +├── setup.py # Package configuration +├── LICENSE # MIT License +└── README.md # This file ``` -## Performance +## Code Style -repominify is designed to handle large codebases efficiently: +The project follows these coding standards for consistency and maintainability: -- Memory usage scales linearly with codebase size -- File I/O is buffered for efficiency -- Graph operations are optimized -- Performance statistics available in debug mode - -## Error Handling - -The package provides detailed error messages and proper error boundaries: - -- Dependency errors (Node.js, npm, Repomix) -- File parsing errors -- Graph building errors -- 
Export errors +- Comprehensive docstrings with Examples sections for all public APIs +- Type hints for all functions, methods, and class attributes +- Custom exceptions for proper error handling and reporting +- Clear separation of concerns between modules +- Consistent code formatting and naming conventions +- Detailed logging with configurable debug support ## Development @@ -224,7 +215,7 @@ pytest tests/ Contributions are welcome! Please feel free to submit a Pull Request. By contributing to this project, you agree to abide by its terms. -This project has adopted the [Python Style Guide](STYLEGUIDE.md) for consistency and maintainability. +Please ensure your code follows the project's coding standards, including proper docstrings, type hints, and error handling. ## Authors @@ -246,17 +237,9 @@ This project makes use of or was influenced by several excellent open source pro - [PyYAML](https://pyyaml.org/) - YAML file handling - [GraphRAG Accelerator](https://github.com/Azure-Samples/graphrag-accelerator) - Graph-based code analysis patterns and implementation concepts -## Disclaimer - -This project is not an officially supported product. It is provided as-is, without warranty or support. Users should evaluate its suitability for their use case and understand the implications of deep code analysis on their systems. - ## How to Get Help - For bugs and feature requests, please [open an issue](https://github.com/mikewcasale/repominify/issues) - For usage questions, please [start a discussion](https://github.com/mikewcasale/repominify/discussions) - For security concerns, please email security@casale.xyz directly -## Trademarks - -Any trademarks or registered trademarks mentioned in this project are the property of their respective owners. 
- diff --git a/repominify/__init__.py b/repominify/__init__.py index 7b8ca2d..38806af 100644 --- a/repominify/__init__.py +++ b/repominify/__init__.py @@ -1,13 +1,17 @@ """ repominify - Optimize codebase representations for LLMs + +Author: Mike Casale +Email: mike@casale.xyz +GitHub: https://github.com/mikewcasale """ -__version__ = "0.1.5" +__version__ = "0.1.6" __author__ = "Mike Casale" __email__ = "mike@casale.xyz" -from .core.graph import CodeGraphBuilder -from .utils.dependency_checker import ensure_dependencies -from .utils.logging import configure_logging +from .graph import CodeGraphBuilder +from .dependencies import ensure_dependencies +from .logging import configure_logging __all__ = ["CodeGraphBuilder", "ensure_dependencies", "configure_logging"] diff --git a/repominify/cli.py b/repominify/cli.py index e376c0f..876f6ba 100644 --- a/repominify/cli.py +++ b/repominify/cli.py @@ -1,25 +1,6 @@ """Command-line interface for repo-minify. This module provides the main entry point for the repo-minify command-line tool. 
- -Author: Mike Casale -Email: mike@casale.xyz -GitHub: https://github.com/mikewcasale - -Error Handling: - - Dependency errors are reported with clear instructions - - File access errors include path information - - Graph building errors show detailed context - - All errors are logged with debug info when --debug is enabled - -Performance: - - Memory usage scales with input file size - - Progress feedback for long operations - - Graceful handling of large files - -Version Compatibility: - - Python 3.7+: Full support - - Type hints: Required for static analysis """ from __future__ import annotations @@ -28,24 +9,23 @@ import sys from dataclasses import dataclass from pathlib import Path -from typing import NoReturn, Optional, Final - -from .core.graph import CodeGraphBuilder -from .utils.logging import configure_logging, get_logger -from .utils.dependency_checker import ensure_dependencies -from .core.types import GraphBuildError, FileParseError, ValidationError +from typing import NoReturn, Optional + +from .graph import CodeGraphBuilder +from .logging import configure_logging, get_logger +from .dependencies import ensure_dependencies +from .exceptions import GraphBuildError, FileParseError, ValidationError +from .constants import ( + EXIT_SUCCESS, + EXIT_GENERAL_ERROR, + EXIT_FILE_NOT_FOUND, + EXIT_PERMISSION_ERROR, + EXIT_PARSE_ERROR, + EXIT_GRAPH_ERROR, +) -# Configure logging logger = get_logger(__name__) -# Exit codes -EXIT_SUCCESS: Final[int] = 0 -EXIT_GENERAL_ERROR: Final[int] = 1 -EXIT_FILE_NOT_FOUND: Final[int] = 2 -EXIT_PERMISSION_ERROR: Final[int] = 3 -EXIT_PARSE_ERROR: Final[int] = 4 -EXIT_GRAPH_ERROR: Final[int] = 5 - @dataclass class CliOptions: @@ -56,19 +36,30 @@ class CliOptions: output_dir: Directory for analysis output files debug: Whether to enable debug logging - Example: + Raises: + ValueError: If input_file or output_dir is empty + + Examples:: >>> opts = CliOptions(Path("input.txt"), Path("output"), debug=True) - >>> print(opts.input_file) 
- input.txt + >>> str(opts.input_file) + 'input.txt' """ - # Required fields input_file: Path output_dir: Path - - # Optional fields with defaults debug: bool = False + def __post_init__(self) -> None: + """Validate CLI options. + + Raises: + ValueError: If input_file or output_dir is empty + """ + if not str(self.input_file).strip(): + raise ValueError("Input file path cannot be empty") + if not str(self.output_dir).strip(): + raise ValueError("Output directory path cannot be empty") + def parse_args() -> CliOptions: """Parse command-line arguments. @@ -76,13 +67,15 @@ def parse_args() -> CliOptions: Returns: CliOptions containing validated arguments - Example: - >>> args = parse_args() - >>> print(f"Processing {args.input_file}") - Processing repomix-output.txt + Raises: + SystemExit: If invalid arguments are provided - Note: - Uses argparse's built-in help and error handling + Examples:: + >>> import sys + >>> sys.argv = ["repo-minify", "input.txt", "-o", "output"] + >>> args = parse_args() + >>> str(args.input_file) + 'input.txt' """ parser = argparse.ArgumentParser( description="Analyze and minify code repository structure using Repomix.", @@ -122,15 +115,15 @@ def handle_error(error: Exception, debug: bool) -> NoReturn: error: The exception to handle debug: Whether debug mode is enabled - Note: - Always exits the program with an appropriate status code + Raises: + Exception: Re-raises the original error if in debug mode - Exit Codes: - 1: General error - 2: File not found - 3: Permission denied - 4: Parse error - 5: Graph build error + Examples:: + >>> try: + ... raise FileNotFoundError("test.txt") + ... except Exception as e: + ... 
handle_error(e, debug=False) + Error: File not found: test.txt """ if isinstance(error, FileNotFoundError): print(f"Error: File not found: {error.filename}", file=sys.stderr) @@ -157,16 +150,15 @@ def main() -> int: Returns: Exit code (0 for success, non-zero for error) - Exit Codes: - 0: Success - 1: General error - 2: File not found - 3: Permission denied - 4: Parse error - 5: Graph build error + Raises: + SystemExit: With appropriate exit code on error + Exception: Any unhandled exceptions in debug mode - Example: - >>> sys.exit(main()) # Run the CLI + Examples:: + >>> import sys + >>> sys.argv = ["repo-minify", "--help"] + >>> main() # doctest: +SKIP + 0 """ try: # Parse arguments diff --git a/repominify/constants.py b/repominify/constants.py new file mode 100644 index 0000000..4960c63 --- /dev/null +++ b/repominify/constants.py @@ -0,0 +1,61 @@ +"""Constants used throughout the repo-minify package. + +This module centralizes all constant values used across different modules. + +Attributes: + EXIT_SUCCESS: Exit code for successful execution (0) + EXIT_GENERAL_ERROR: Exit code for general errors (1) + EXIT_FILE_NOT_FOUND: Exit code for file not found errors (2) + EXIT_PERMISSION_ERROR: Exit code for permission errors (3) + EXIT_PARSE_ERROR: Exit code for parsing errors (4) + EXIT_GRAPH_ERROR: Exit code for graph building errors (5) + NODE_TYPES: Mapping of node types to their display colors + CONSTANT_PATTERNS: Regular expressions for identifying constants + SUSPICIOUS_PATTERNS: Set of security-sensitive pattern strings + +Examples:: + >>> from repominify.constants import NODE_TYPES + >>> NODE_TYPES["module"] + '#A5D6A7' +""" + +from __future__ import annotations + +from typing import Dict, List, Set, Final + +# Exit codes +EXIT_SUCCESS: Final[int] = 0 +EXIT_GENERAL_ERROR: Final[int] = 1 +EXIT_FILE_NOT_FOUND: Final[int] = 2 +EXIT_PERMISSION_ERROR: Final[int] = 3 +EXIT_PARSE_ERROR: Final[int] = 4 +EXIT_GRAPH_ERROR: Final[int] = 5 + +# Node type constants with 
color codes +NODE_TYPES: Final[Dict[str, str]] = { + "module": "#A5D6A7", # Light green + "class": "#90CAF9", # Light blue + "function": "#FFCC80", # Light orange + "import": "#CE93D8", # Light purple + "constant": "#FFB74D", # Orange + "env_var": "#81C784", # Green +} + +# Regular expression patterns for identifying constants +CONSTANT_PATTERNS: Final[List[str]] = [ + r"^[A-Z][A-Z0-9_]*$", # All caps with underscores + r"__[a-zA-Z0-9_]+__", # Dunder names + r"Final\[[^]]+\]", # Type hints with Final +] + +# Security-sensitive patterns to detect +SUSPICIOUS_PATTERNS: Final[Set[str]] = { + "password", + "secret", + "token", + "api_key", + "private_key", + "ssh_key", + "credentials", + "auth", +} diff --git a/repominify/core/__init__.py b/repominify/core/__init__.py deleted file mode 100644 index c45e80f..0000000 --- a/repominify/core/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -"""Core functionality for repo-minify. - -This package provides the core graph building and analysis functionality. -""" - -from .graph import CodeGraphBuilder -from .types import FileEntry, GraphBuildError, FileParseError, ValidationError - -__all__ = [ - "CodeGraphBuilder", - "FileEntry", - "GraphBuildError", - "FileParseError", - "ValidationError", -] diff --git a/repominify/core/types.py b/repominify/core/types.py deleted file mode 100644 index 1239930..0000000 --- a/repominify/core/types.py +++ /dev/null @@ -1,70 +0,0 @@ -"""Core types and data structures for repo-minify. - -This module defines the fundamental data types and structures used throughout -the repo-minify package. 
- -Version Compatibility: - - Python 3.7+: Full support for all types - - Python 3.6: Not supported (uses dataclasses) -""" - -from __future__ import annotations - -from dataclasses import dataclass, field -from typing import Dict, List, Set, Any - -# Type aliases for graph operations -NodeID = str -NodeData = Dict[str, str] -EdgeData = Dict[str, str] -GraphData = Dict[str, List[Dict[str, str]]] - - -@dataclass -class FileEntry: - """Container for file information from Repomix output. - - Attributes: - path: File path relative to repository root - content: File content as string - size: Content size in bytes - line_count: Number of lines in content - """ - - path: str - content: str - size: int = field(init=False) - line_count: int = field(init=False) - - def __post_init__(self) -> None: - """Initialize computed fields.""" - self.size = len(self.content.encode("utf-8")) - self.line_count = len(self.content.splitlines()) - - def __str__(self) -> str: - return ( - f"FileEntry(path='{self.path}', size={self.size}B, lines={self.line_count})" - ) - - -# Custom exceptions -class GraphBuildError(Exception): - """Base exception for graph building errors.""" - - pass - - -class FileParseError(GraphBuildError): - """Raised when Repomix file parsing fails.""" - - pass - - -class ValidationError(GraphBuildError): - """Raised when input validation fails.""" - - pass - - -# Performance tracking type -Stats = Dict[str, Any] diff --git a/repominify/utils/dependency_checker.py b/repominify/dependencies.py similarity index 67% rename from repominify/utils/dependency_checker.py rename to repominify/dependencies.py index 66b0275..6226fc2 100644 --- a/repominify/utils/dependency_checker.py +++ b/repominify/dependencies.py @@ -2,21 +2,6 @@ This module handles the checking and installation of required system dependencies for repo-minify to function properly. 
- -Version Compatibility: - - Python 3.7+: Full support - - Python 3.6: Not supported (uses dataclasses) - - Node.js 12+: Required for Repomix - - npm 6+: Required for package installation - -Performance Considerations: - - Network I/O: Required for npm installation - - Disk I/O: Required for package installation - - Command execution: Blocking operations - -Note: - All functions return tuples of (success: bool, message: str) for consistent - error handling and user feedback. """ from __future__ import annotations @@ -26,61 +11,13 @@ import subprocess import sys import time -from dataclasses import dataclass, field from pathlib import Path -from typing import Tuple, Optional, Dict, Union, List - -# Configure logging -logger = logging.getLogger(__name__) - -# Type aliases for clarity -CommandResult = Tuple[bool, str] -ProcessOutput = Union[str, bytes] -VersionInfo = Dict[str, str] - - -class DependencyError(Exception): - """Base exception for dependency-related errors.""" - - pass - - -class CommandExecutionError(DependencyError): - """Raised when a system command fails.""" - - pass - - -class InstallationError(DependencyError): - """Raised when package installation fails.""" - - pass - +from typing import Dict, Union, List, Optional, Tuple -@dataclass -class DependencyVersion: - """Container for dependency version information. 
+from .exceptions import DependencyError, CommandExecutionError, InstallationError +from .types import DependencyVersion, CommandResult, ProcessOutput, VersionInfo - Attributes: - name: Name of the dependency - version: Version string - is_installed: Whether the dependency is installed - install_time: Installation timestamp (if installed) - """ - - name: str - version: str - is_installed: bool = False - install_time: Optional[float] = field(default=None) - install_path: Optional[str] = field(default=None) - - def __str__(self) -> str: - return f"{self.name} {self.version}" - - def __post_init__(self) -> None: - """Validate version string format.""" - if not self.version.strip(): - raise ValueError("Version string cannot be empty") +logger = logging.getLogger(__name__) class DependencyManager: @@ -91,37 +28,41 @@ class DependencyManager: Attributes: debug: Whether debug mode is enabled - stats: Runtime statistics for monitoring - - Performance: - - Network operations are performed only when necessary - - Command execution is optimized for response time - - Installation state is cached + stats: Runtime statistics for monitoring installations + _version_cache: Cache of dependency version information """ + debug: bool + stats: Dict[str, Union[int, float, str]] + _version_cache: Dict[str, DependencyVersion] + def __init__(self, debug: bool = False) -> None: """Initialize dependency manager. 
Args: - debug: Enable debug logging and performance tracking + debug: Enable debug logging and version tracking + + Examples:: + >>> manager = DependencyManager(debug=True) + >>> manager.stats["commands_executed"] + 0 """ self.debug = debug if debug: logging.basicConfig(level=logging.DEBUG) - self.stats: Dict[str, Union[int, float, str]] = { + self.stats = { "commands_executed": 0, "install_attempts": 0, "total_install_time_ms": 0, "last_check_time": 0, } - # Cache for version checks - self._version_cache: Dict[str, DependencyVersion] = {} + self._version_cache = {} def _run_command( self, cmd: List[str], capture_stderr: bool = True, timeout: Optional[int] = 30 - ) -> subprocess.CompletedProcess: + ) -> subprocess.CompletedProcess: # sourcery skip: extract-method """Execute a system command and return the result. Args: @@ -135,6 +76,12 @@ def _run_command( Raises: CommandExecutionError: If command execution fails subprocess.TimeoutExpired: If command exceeds timeout + + Examples:: + >>> manager = DependencyManager() + >>> result = manager._run_command(["echo", "test"]) + >>> result.returncode + 0 """ start_time = time.time() @@ -154,24 +101,32 @@ def _run_command( raise except Exception as e: logger.error(f"Command failed: {e}") - raise CommandExecutionError(f"Failed to execute {cmd[0]}: {str(e)}") + raise CommandExecutionError(f"Failed to execute {cmd[0]}: {str(e)}") from e - def check_node_npm(self) -> CommandResult: + def check_node_npm(self) -> CommandResult: # sourcery skip: extract-method """Check if Node.js and npm are installed. 
Returns: Tuple of (success, message) indicating installation status - Performance: - - Results are cached for 60 seconds - - Commands are executed with 5s timeout + Raises: + FileNotFoundError: If Node.js or npm executables are not found + CommandExecutionError: If version check commands fail + subprocess.TimeoutExpired: If version checks timeout + + Examples:: + >>> manager = DependencyManager() + >>> success, msg = manager.check_node_npm() + >>> success + True """ current_time = time.time() - if (current_time - self.stats["last_check_time"]) < 60: - if "node" in self._version_cache and "npm" in self._version_cache: - node = self._version_cache["node"] - npm = self._version_cache["npm"] - return True, f"{node} and {npm} found." + if (current_time - self.stats["last_check_time"]) < 60 and ( + "node" in self._version_cache and "npm" in self._version_cache + ): + node = self._version_cache["node"] + npm = self._version_cache["npm"] + return True, f"{node} and {npm} found." try: # Check node version @@ -214,15 +169,23 @@ def check_repomix(self) -> CommandResult: Returns: Tuple of (success, message) indicating installation status - Performance: - - Results are cached for 60 seconds - - Commands are executed with 5s timeout + Raises: + FileNotFoundError: If Repomix executable is not found + CommandExecutionError: If version check command fails + subprocess.TimeoutExpired: If version check times out + + Examples:: + >>> manager = DependencyManager() + >>> success, msg = manager.check_repomix() + >>> isinstance(success, bool) + True """ current_time = time.time() - if (current_time - self.stats["last_check_time"]) < 60: - if "repomix" in self._version_cache: - version = self._version_cache["repomix"] - return True, f"{version} found." + if ( + current_time - self.stats["last_check_time"] + ) < 60 and "repomix" in self._version_cache: + version = self._version_cache["repomix"] + return True, f"{version} found." 
try: result = self._run_command(["repomix", "--version"], timeout=5) @@ -251,10 +214,15 @@ def install_repomix(self) -> CommandResult: Returns: Tuple of (success, message) indicating installation result - Performance: - - Network-dependent operation - - May take several seconds to complete - - Progress is logged for long operations + Raises: + CommandExecutionError: If npm install command fails + subprocess.TimeoutExpired: If installation times out + + Examples:: + >>> manager = DependencyManager() + >>> success, msg = manager.install_repomix() + >>> isinstance(success, bool) + True """ start_time = time.time() self.stats["install_attempts"] += 1 @@ -285,14 +253,18 @@ def ensure_dependencies(debug: bool = False) -> bool: attempting to install Repomix if it's not found. Args: - debug: Enable debug logging and performance tracking + debug: Enable debug logging and version tracking Returns: - bool: True if all dependencies are satisfied, False otherwise + True if all dependencies are satisfied, False otherwise + + Raises: + CommandExecutionError: If dependency checks or installation fails - Note: - Error messages are printed to stderr for user feedback - Performance statistics are logged in debug mode + Examples:: + >>> success = ensure_dependencies(debug=True) + >>> isinstance(success, bool) + True """ manager = DependencyManager(debug=debug) diff --git a/repominify/exceptions.py b/repominify/exceptions.py new file mode 100644 index 0000000..f37c568 --- /dev/null +++ b/repominify/exceptions.py @@ -0,0 +1,108 @@ +"""Custom exceptions for repo-minify. + +This module contains all custom exceptions used throughout the codebase. +""" + +from __future__ import annotations + + +class GraphBuildError(Exception): + """Base exception for graph building errors. + + Raises: + GraphBuildError: When graph construction fails + + Examples:: + >>> try: + ... raise GraphBuildError("Failed to build graph") + ... except GraphBuildError as e: + ... 
str(e) + 'Failed to build graph' + """ + + pass + + +class FileParseError(GraphBuildError): + """Raised when Repomix file parsing fails. + + Raises: + FileParseError: When file parsing fails + + Examples:: + >>> try: + ... raise FileParseError("Invalid file format") + ... except FileParseError as e: + ... str(e) + 'Invalid file format' + """ + + pass + + +class ValidationError(GraphBuildError): + """Raised when input validation fails. + + Raises: + ValidationError: When input validation fails + + Examples:: + >>> try: + ... raise ValidationError("Invalid input") + ... except ValidationError as e: + ... str(e) + 'Invalid input' + """ + + pass + + +class DependencyError(Exception): + """Base exception for dependency-related errors. + + Raises: + DependencyError: When dependency management fails + + Examples:: + >>> try: + ... raise DependencyError("Missing dependency") + ... except DependencyError as e: + ... str(e) + 'Missing dependency' + """ + + pass + + +class CommandExecutionError(DependencyError): + """Raised when a system command fails. + + Raises: + CommandExecutionError: When command execution fails + + Examples:: + >>> try: + ... raise CommandExecutionError("Command failed") + ... except CommandExecutionError as e: + ... str(e) + 'Command failed' + """ + + pass + + +class InstallationError(DependencyError): + """Raised when package installation fails. + + Raises: + InstallationError: When package installation fails + + Examples:: + >>> try: + ... raise InstallationError("Installation failed") + ... except InstallationError as e: + ... str(e) + 'Installation failed' + """ + + pass diff --git a/repominify/io/exporters.py b/repominify/exporters.py similarity index 51% rename from repominify/io/exporters.py rename to repominify/exporters.py index 8497066..a82a23d 100644 --- a/repominify/io/exporters.py +++ b/repominify/exporters.py @@ -1,24 +1,21 @@ """Graph export functionality for repo-minify. This module handles exporting graph data to various file formats. 
- -Performance Considerations: - - File I/O is buffered for efficiency - - Large graphs are written in chunks - - Memory usage scales with graph size """ from __future__ import annotations import json from pathlib import Path -from typing import Any, Dict +from typing import Dict, Any import networkx as nx import yaml -from ..utils.logging import get_logger -from ..core.types import GraphData +from .logging import get_logger +from .types import GraphData + +__all__ = ["GraphExporter"] logger = get_logger(__name__) @@ -30,11 +27,22 @@ class GraphExporter: use cases (visualization, analysis, etc.). Attributes: - stats: Runtime statistics for performance monitoring + stats: Dictionary tracking export statistics (files written, bytes, time) """ + __constants__ = [] # No constants defined for this class + + # Type hints for instance attributes + stats: Dict[str, int] + def __init__(self) -> None: - """Initialize exporter with performance tracking.""" + """Initialize exporter with statistics tracking. + + Examples:: + >>> exporter = GraphExporter() + >>> exporter.stats["files_written"] + 0 + """ self.stats = {"files_written": 0, "total_bytes": 0, "export_time_ms": 0} def export_graph( @@ -43,23 +51,65 @@ def export_graph( """Export graph to multiple formats. 
Args: - graph: NetworkX graph to export - output_dir: Directory to save output files - node_types: Mapping of node types to colors + graph (nx.DiGraph): NetworkX graph to export + output_dir (str): Directory to save output files + node_types (Dict[str, str]): Mapping of node types to colors Raises: OSError: If directory creation or file writing fails + ValueError: If graph is empty or node_types is empty + + Examples:: + >>> exporter = GraphExporter() + >>> exporter.export_graph(graph, "output", {"module": "#000"}) + >>> Path("output/code_graph.graphml").exists() + True + + Notes: + Creates three files: + - code_graph.graphml: GraphML format for visualization tools + - code_graph.json: JSON format for web visualization + - graph_statistics.yaml: YAML format for statistics """ + if not graph: + raise ValueError("Cannot export empty graph") + if not node_types: + raise ValueError("Node types mapping cannot be empty") + output_path = Path(output_dir) output_path.mkdir(parents=True, exist_ok=True) - # Save as GraphML + self._export_graphml(graph, output_path) + self._export_json(graph, output_path) + self._export_statistics(graph, output_path, node_types) + + logger.info(f"Graph exported to {output_dir}/") + + def _export_graphml(self, graph: nx.DiGraph, output_path: Path) -> None: + """Export graph in GraphML format. + + Args: + graph (nx.DiGraph): NetworkX graph to export + output_path (Path): Directory to save the file + + Raises: + OSError: If file writing fails + """ logger.debug("Exporting GraphML format...") graphml_path = output_path / "code_graph.graphml" nx.write_graphml(graph, graphml_path) self._update_stats(graphml_path) - # Save as JSON for visualization + def _export_json(self, graph: nx.DiGraph, output_path: Path) -> None: + """Export graph in JSON format for visualization. 
+ + Args: + graph (nx.DiGraph): NetworkX graph to export + output_path (Path): Directory to save the file + + Raises: + OSError: If file writing fails + """ logger.debug("Exporting JSON format...") graph_data: GraphData = { "nodes": [ @@ -86,7 +136,19 @@ def export_graph( json.dump(graph_data, f, indent=2) self._update_stats(json_path) - # Save statistics + def _export_statistics( + self, graph: nx.DiGraph, output_path: Path, node_types: Dict[str, str] + ) -> None: + """Export graph statistics in YAML format. + + Args: + graph (nx.DiGraph): NetworkX graph to export + output_path (Path): Directory to save the file + node_types (Dict[str, str]): Mapping of node types to colors + + Raises: + OSError: If file writing fails + """ logger.debug("Exporting statistics...") stats = { "total_nodes": graph.number_of_nodes(), @@ -104,13 +166,14 @@ def export_graph( yaml.dump(stats, f, default_flow_style=False) self._update_stats(yaml_path) - logger.info(f"Graph exported to {output_dir}/") - def _update_stats(self, file_path: Path) -> None: """Update export statistics. Args: - file_path: Path to exported file + file_path (Path): Path to exported file + + Raises: + OSError: If file size cannot be determined """ self.stats["files_written"] += 1 self.stats["total_bytes"] += file_path.stat().st_size diff --git a/repominify/formatters.py b/repominify/formatters.py new file mode 100644 index 0000000..ce5daad --- /dev/null +++ b/repominify/formatters.py @@ -0,0 +1,300 @@ +"""Text formatting functionality for repo-minify. + +This module handles generating human-readable text representations of code graphs. +""" + +from __future__ import annotations + +from typing import Dict, List, Any + +import networkx as nx + +from .logging import get_logger + +__all__ = ["GraphFormatter"] + +logger = get_logger(__name__) + + +class GraphFormatter: + """Formats graph data into human-readable text. 
+ + This class provides functionality to generate text representations of + code graphs for documentation and analysis. + + Attributes: + stats: Dictionary tracking number of nodes processed and output size + """ + + __constants__ = [] # No constants defined for this class + + # Type hints for instance attributes + stats: Dict[str, Any] + + def __init__(self) -> None: + """Initialize formatter with statistics tracking. + + Examples:: + >>> formatter = GraphFormatter() + >>> formatter.stats["nodes_processed"] + 0 + """ + self.stats = {"nodes_processed": 0, "total_chars": 0, "format_time_ms": 0} + + def generate_text_representation( + self, graph: nx.DiGraph, node_types: Dict[str, str] + ) -> str: + """Generate a comprehensive text representation of the codebase. + + Args: + graph (nx.DiGraph): NetworkX graph to format + node_types (Dict[str, str]): Mapping of node types to colors + + Returns: + str: Formatted string containing graph analysis and statistics + + Raises: + KeyError: If required node attributes are missing + ValueError: If graph is empty or malformed + + Examples:: + >>> formatter = GraphFormatter() + >>> text = formatter.generate_text_representation(graph, node_types) + >>> print(text.split("\\n")[0]) + # Code Graph Overview + + Notes: + The output is organized into sections: overview, modules, and environment + variables if present. + """ + if not graph: + raise ValueError("Cannot generate representation of empty graph") + + text_parts = self._generate_overview(graph, node_types) + text_parts.extend(self._generate_module_structure(graph)) + text_parts.extend(self._generate_env_vars(graph)) + + result = "\n".join(text_parts) + self.stats["total_chars"] = len(result) + return result + + def _generate_overview( + self, graph: nx.DiGraph, node_types: Dict[str, str] + ) -> List[str]: + """Generate the overview section including node counts and type distribution. 
+ + Args: + graph (nx.DiGraph): NetworkX graph to format + node_types (Dict[str, str]): Mapping of node types to colors + + Returns: + List[str]: Lines of text for the overview section + + Raises: + KeyError: If node type information is missing + """ + text_parts = [ + "# Code Graph Overview", + f"Total nodes: {graph.number_of_nodes()}", + f"Total edges: {graph.number_of_edges()}\n", + "## Node Type Distribution", + ] + + for node_type in sorted(node_types): + count = len( + [n for n, d in graph.nodes(data=True) if d.get("type") == node_type] + ) + text_parts.append(f"- {node_type}: {count}") + self.stats["nodes_processed"] += count + + text_parts.extend(("", "## Module Structure")) + return text_parts + + def _generate_module_structure(self, graph: nx.DiGraph) -> List[str]: + """Generate the module structure section including imports, constants, classes, and functions. + + Args: + graph (nx.DiGraph): NetworkX graph to format + + Returns: + List[str]: Lines of text for the module structure section + + Raises: + KeyError: If required node attributes are missing + """ + text_parts = [] + for node, data in sorted(graph.nodes(data=True)): + if data.get("type") != "module": + continue + + text_parts.extend(self._format_module_header(node, data)) + text_parts.extend(self._format_module_imports(graph, node)) + text_parts.extend(self._format_module_constants(graph, node)) + text_parts.extend(self._format_module_classes(graph, node)) + text_parts.extend(self._format_module_functions(graph, node)) + + return text_parts + + def _format_module_header(self, node: str, data: Dict) -> List[str]: + """Format the module header section. 
+ + Args: + node (str): Name of the module + data (Dict): Module metadata + + Returns: + List[str]: Lines of text for the module header + + Raises: + KeyError: If required metadata is missing + """ + parts = [f"\n### Module: {node}"] + if "path" in data: + parts.append(f"Path: {data['path']}") + return parts + + def _format_module_imports(self, graph: nx.DiGraph, module: str) -> List[str]: + """Format the module imports section. + + Args: + graph (nx.DiGraph): NetworkX graph to format + module (str): Name of the module + + Returns: + List[str]: Lines of text for the imports section + + Raises: + KeyError: If required node attributes are missing + """ + imports = self._get_neighbors_by_type(graph, module, "import") + if not imports: + return [] + + return ["\nImports:"] + [f"- {imp}" for imp in sorted(imports)] + + def _format_module_constants(self, graph: nx.DiGraph, module: str) -> List[str]: + """Format the module constants section. + + Args: + graph (nx.DiGraph): NetworkX graph to format + module (str): Name of the module + + Returns: + List[str]: Lines of text for the constants section + + Raises: + KeyError: If required node attributes are missing + """ + constants = self._get_neighbors_by_type(graph, module, "constant") + if not constants: + return [] + + parts = ["\nConstants:"] + for const in sorted(constants): + const_data = graph.nodes[const] + parts.append(f"- {const.split('.')[-1]}: {const_data.get('value', '')}") + return parts + + def _format_module_classes(self, graph: nx.DiGraph, module: str) -> List[str]: + """Format the module classes section. 
+ + Args: + graph (nx.DiGraph): NetworkX graph to format + module (str): Name of the module + + Returns: + List[str]: Lines of text for the classes section + + Raises: + KeyError: If required node attributes are missing + """ + classes = self._get_neighbors_by_type(graph, module, "class") + if not classes: + return [] + + parts = ["\nClasses:"] + for class_name in sorted(classes): + parts.extend(self._format_code_item(graph.nodes[class_name], class_name)) + return parts + + def _format_module_functions(self, graph: nx.DiGraph, module: str) -> List[str]: + """Format the module functions section. + + Args: + graph (nx.DiGraph): NetworkX graph to format + module (str): Name of the module + + Returns: + List[str]: Lines of text for the functions section + + Raises: + KeyError: If required node attributes are missing + """ + functions = self._get_neighbors_by_type(graph, module, "function") + if not functions: + return [] + + parts = ["\nFunctions:"] + for func_name in sorted(functions): + parts.extend(self._format_code_item(graph.nodes[func_name], func_name)) + return parts + + def _get_neighbors_by_type( + self, graph: nx.DiGraph, node: str, node_type: str + ) -> List[str]: + """Get all neighbors of a node with a specific type. + + Args: + graph (nx.DiGraph): NetworkX graph to query + node (str): Source node name + node_type (str): Type of neighbors to find + + Returns: + List[str]: Names of neighboring nodes of the specified type + + Raises: + KeyError: If node type information is missing + """ + return [ + n for n in graph.neighbors(node) if graph.nodes[n].get("type") == node_type + ] + + def _format_code_item(self, item_data: Dict, item_name: str) -> List[str]: + """Format a code item (class or function) with its signature and docstring. 
+ + Args: + item_data (Dict): Item metadata including signature and docstring + item_name (str): Name of the code item + + Returns: + List[str]: Lines of text for the code item + + Raises: + KeyError: If required metadata is missing + """ + parts = [f"\n{item_data.get('signature', item_name.split('.')[-1])}"] + if "docstring" in item_data: + parts.append(f"'''\n{item_data['docstring']}\n'''") + return parts + + def _generate_env_vars(self, graph: nx.DiGraph) -> List[str]: + """Generate the environment variables section. + + Args: + graph (nx.DiGraph): NetworkX graph to format + + Returns: + List[str]: Lines of text for the environment variables section + + Raises: + KeyError: If required node attributes are missing + """ + env_vars = [n for n, d in graph.nodes(data=True) if d.get("type") == "env_var"] + if not env_vars: + return [] + + parts = ["\n## Environment Variables"] + for var in sorted(env_vars): + var_data = graph.nodes[var] + parts.append(f"- {var.split('.')[-1]}: {var_data.get('value', '')}") + return parts diff --git a/repominify/core/graph.py b/repominify/graph.py similarity index 50% rename from repominify/core/graph.py rename to repominify/graph.py index c8ead4c..c8e6360 100644 --- a/repominify/core/graph.py +++ b/repominify/graph.py @@ -1,58 +1,33 @@ """Core graph building functionality for repo-minify. This module provides the main graph building and analysis functionality. 
- -Author: Mike Casale -Email: mike@casale.xyz -GitHub: https://github.com/mikewcasale - -Performance Considerations: - - Memory: O(N) where N is the total number of code entities - - Time: O(M*L) where M is number of files and L is average lines per file - - Graph operations are optimized for large codebases - -Version Compatibility: - - Python 3.7+: Full support - - NetworkX 2.6+: Required for graph operations """ from __future__ import annotations +import contextlib import os import re import time from pathlib import Path -from typing import Dict, List, Optional, Tuple, Final, Set +from typing import Dict, List, Optional, Tuple, Set import networkx as nx -from ..utils.logging import get_logger -from ..utils.stats import FileStats, analyze_file, compare_files +from .logging import get_logger +from .stats import FileStats, analyze_file, compare_files from .parser import RepomixParser -from .types import FileEntry, GraphBuildError, ValidationError -from ..io.exporters import GraphExporter -from ..io.formatters import GraphFormatter +from .types import FileEntry +from .exceptions import GraphBuildError, ValidationError +from .exporters import GraphExporter +from .formatters import GraphFormatter +from .constants import NODE_TYPES, CONSTANT_PATTERNS + +__all__ = ["CodeGraphBuilder"] # Configure logging logger = get_logger(__name__) -# Node type constants -NODE_TYPES: Final[Dict[str, str]] = { - "module": "#A5D6A7", # Light green - "class": "#90CAF9", # Light blue - "function": "#FFCC80", # Light orange - "import": "#CE93D8", # Light purple - "constant": "#FFB74D", # Orange - "env_var": "#81C784", # Green -} - -# Patterns for finding constants -CONSTANT_PATTERNS: Final[List[str]] = [ - r"^[A-Z][A-Z0-9_]*$", # All caps with underscores - r"__[a-zA-Z0-9_]+__", # Dunder names - r"Final\[[^]]+\]", # Type hints with Final -] - class CodeGraphBuilder: """Builds and analyzes code dependency graphs from Repomix output. 
@@ -64,13 +39,12 @@ class CodeGraphBuilder: graph: A directed graph representing code dependencies node_types: Mapping of node types to their display colors stats: Runtime statistics for performance monitoring + debug: Whether debug logging is enabled + parser: Parser for Repomix output + exporter: Graph export functionality + formatter: Text formatting functionality - Performance: - - Memory usage scales linearly with codebase size - - Graph operations are O(N) for N nodes - - File I/O is buffered for efficiency - - Thread Safety: + Notes: This class is not thread-safe. Each instance should be used by a single thread. """ @@ -104,10 +78,15 @@ def __init__(self, debug: bool = False) -> None: """Initialize a new CodeGraphBuilder instance. Args: - debug: Enable debug logging and performance tracking + debug (bool): Enable debug logging and performance tracking Raises: ImportError: If required dependencies are not available + + Examples:: + >>> builder = CodeGraphBuilder(debug=True) + >>> builder.debug + True """ # Validate dependencies if not hasattr(nx, "write_graphml"): @@ -139,18 +118,20 @@ def build_graph(self, file_entries: List[FileEntry]) -> nx.DiGraph: """Build a knowledge graph from the code analysis. 
Args: - file_entries: List of FileEntry objects to analyze + file_entries (List[FileEntry]): List of FileEntry objects to analyze Returns: - NetworkX DiGraph representing the code structure + nx.DiGraph: NetworkX DiGraph representing the code structure Raises: ValidationError: If input validation fails GraphBuildError: If graph construction fails - Performance: - - Time: O(M*L) where M is number of files and L is lines per file - - Memory: O(N) where N is total number of code entities + Examples:: + >>> entries = [FileEntry("test.py", "def hello(): pass")] + >>> graph = builder.build_graph(entries) + >>> len(graph.nodes()) + 1 """ if not file_entries: raise ValidationError("No file entries provided") @@ -175,19 +156,22 @@ def build_graph(self, file_entries: List[FileEntry]) -> nx.DiGraph: self._add_env_var_nodes(env_file) except Exception as e: - raise GraphBuildError(f"Failed to build graph: {str(e)}") - - build_time = (time.time() - start_time) * 1000 - self.stats["build_time_ms"] = build_time - logger.debug(f"Graph built in {build_time:.2f}ms") + raise GraphBuildError(f"Failed to build graph: {str(e)}") from e + self._update_stats(start_time, "build_time_ms", "Graph built in ") return self.graph def _add_module_node(self, entry: FileEntry) -> None: """Add a module node to the graph. Args: - entry: FileEntry containing module information + entry (FileEntry): FileEntry containing module information + + Examples:: + >>> entry = FileEntry("test.py", "") + >>> builder._add_module_node(entry) + >>> "test" in builder.graph.nodes() + True """ module_name = Path(entry.path).stem self.graph.add_node( @@ -201,7 +185,13 @@ def _add_import_nodes(self, entry: FileEntry) -> None: """Add import nodes and relationships to the graph. 
Args: - entry: FileEntry containing import information + entry (FileEntry): FileEntry containing import information + + Examples:: + >>> entry = FileEntry("test.py", "import os") + >>> builder._add_import_nodes(entry) + >>> "os" in builder.graph.nodes() + True """ module_name = Path(entry.path).stem imports = self.parser.analyze_imports(entry.content) @@ -214,101 +204,188 @@ def _add_import_nodes(self, entry: FileEntry) -> None: ) self.graph.add_edge(module_name, imp, relationship="imports") + def _update_stats(self, start_time: float, stat_key: str, message: str) -> None: + """Update timing statistics and log debug message. + + Args: + start_time (float): Start time of the operation + stat_key (str): Key to update in stats dictionary + message (str): Message prefix for debug logging + """ + elapsed = (time.time() - start_time) * 1000 + self.stats[stat_key] = elapsed + logger.debug(f"{message}{elapsed:.2f}ms") + def _add_class_and_function_nodes(self, entry: FileEntry) -> None: """Add class and function nodes to the graph. 
Args: - entry: FileEntry containing class and function information + entry (FileEntry): FileEntry containing class and function information + + Examples:: + >>> entry = FileEntry("test.py", "class Test: pass") + >>> builder._add_class_and_function_nodes(entry) + >>> "test.Test" in builder.graph.nodes() + True """ module_name = Path(entry.path).stem content_lines = entry.content.split("\n") - # Track current class/function and its docstring - current_block = None - current_signature = None - current_docstring = [] - in_docstring = False - for i, line in enumerate(content_lines): - stripped = line.strip() - - # Handle docstring collection - if in_docstring: - if stripped.endswith('"""') or stripped.endswith("'''"): - current_docstring.append(stripped.rstrip("'\"")) - in_docstring = False - - # Add collected docstring to current node - if current_block and current_signature: - node_id = f"{module_name}.{current_block}" - self.graph.nodes[node_id]["docstring"] = "\n".join( - current_docstring - ) - current_docstring = [] - else: - current_docstring.append(stripped) + if not line.strip(): continue - # Look for class definitions - if stripped.startswith("class "): - current_block = stripped[6:].split("(")[0].split(":")[0].strip() - current_signature = stripped + if line.strip().startswith("class "): + self._handle_class_definition(module_name, content_lines, i) + elif line.strip().startswith("def "): + self._handle_function_definition(module_name, content_lines, i) - # Add class node with signature - full_class_name = f"{module_name}.{current_block}" - self.graph.add_node( - full_class_name, - type=self.NODE_TYPE_CLASS, - color=self.node_types[self.NODE_TYPE_CLASS], - signature=current_signature, - ) - self.graph.add_edge( - module_name, full_class_name, relationship="contains" - ) + def _handle_class_definition( + self, module_name: str, content_lines: List[str], line_idx: int + ) -> None: + """Handle a class definition and add it to the graph. 
- # Check for docstring start - if i + 1 < len(content_lines): - next_line = content_lines[i + 1].strip() - if next_line.startswith('"""') or next_line.startswith("'''"): - in_docstring = True - current_docstring = [next_line.lstrip("'\"")] - - # Look for function definitions - elif stripped.startswith("def "): - # Capture multi-line function signature - current_signature = stripped - while not current_signature.endswith(":"): - i += 1 - if i >= len(content_lines): - break - current_signature += " " + content_lines[i].strip() - - current_block = stripped[4:].split("(")[0].strip() - - # Add function node with signature - full_func_name = f"{module_name}.{current_block}" - self.graph.add_node( - full_func_name, - type=self.NODE_TYPE_FUNCTION, - color=self.node_types[self.NODE_TYPE_FUNCTION], - signature=current_signature, - ) - self.graph.add_edge( - module_name, full_func_name, relationship="contains" - ) + Args: + module_name (str): Name of the module containing the class + content_lines (List[str]): Lines of code to process + line_idx (int): Index of the class definition line + + Examples:: + >>> lines = ["class Test:", " pass"] + >>> builder._handle_class_definition("module", lines, 0) + >>> "module.Test" in builder.graph.nodes() + True + """ + line = content_lines[line_idx].strip() + class_name = line[6:].split("(")[0].split(":")[0].strip() + full_class_name = f"{module_name}.{class_name}" - # Check for docstring start - if i + 1 < len(content_lines): - next_line = content_lines[i + 1].strip() - if next_line.startswith('"""') or next_line.startswith("'''"): - in_docstring = True - current_docstring = [next_line.lstrip("'\"")] + self._add_node_with_signature( + full_class_name, self.NODE_TYPE_CLASS, line, module_name + ) + + self._add_docstring_if_present(full_class_name, content_lines, line_idx) + + def _handle_function_definition( + self, module_name: str, content_lines: List[str], line_idx: int + ) -> None: + """Handle a function definition and add it to the 
graph. + + Args: + module_name (str): Name of the module containing the function + content_lines (List[str]): Lines of code to process + line_idx (int): Index of the function definition line + + Examples:: + >>> lines = ["def test():", " pass"] + >>> builder._handle_function_definition("module", lines, 0) + >>> "module.test" in builder.graph.nodes() + True + """ + signature = self._get_full_signature(content_lines, line_idx) + func_name = content_lines[line_idx].strip()[4:].split("(")[0].strip() + full_func_name = f"{module_name}.{func_name}" + + self._add_node_with_signature( + full_func_name, self.NODE_TYPE_FUNCTION, signature, module_name + ) + + self._add_docstring_if_present(full_func_name, content_lines, line_idx) + + def _get_full_signature(self, content_lines: List[str], start_idx: int) -> str: + """Get the complete function signature, handling multi-line definitions. + + Args: + content_lines (List[str]): Lines of code to process + start_idx (int): Starting line index of the signature + + Returns: + str: Complete function signature including all lines + + Examples:: + >>> lines = ["def test(", " x: int", " ):", " pass"] + >>> builder._get_full_signature(lines, 0) + 'def test( x: int ):' + """ + signature = content_lines[start_idx].strip() + current_idx = start_idx + + while not signature.endswith(":"): + current_idx += 1 + if current_idx >= len(content_lines): + break + signature += f" {content_lines[current_idx].strip()}" + + return signature + + def _add_node_with_signature( + self, node_id: str, node_type: str, signature: str, module_name: str + ) -> None: + """Add a node with its signature to the graph. 
+ + Args: + node_id (str): Unique identifier for the node + node_type (str): Type of node (class or function) + signature (str): Full signature of the code item + module_name (str): Name of the containing module + + Examples:: + >>> builder._add_node_with_signature("mod.func", "function", "def func():", "mod") + >>> "mod.func" in builder.graph.nodes() + True + """ + self.graph.add_node( + node_id, + type=node_type, + color=self.node_types[node_type], + signature=signature, + ) + self.graph.add_edge(module_name, node_id, relationship="contains") + + def _add_docstring_if_present( + self, node_id: str, content_lines: List[str], line_idx: int + ) -> None: + """Add docstring to a node if one follows the definition. + + Args: + node_id (str): Identifier of the node to add docstring to + content_lines (List[str]): Lines of code to process + line_idx (int): Index of the definition line + + Examples:: + >>> lines = ["def test():", " # docstring", " pass"] + >>> builder._add_docstring_if_present("mod.test", lines, 0) + >>> "docstring" in builder.graph.nodes["mod.test"].get("docstring", "") + True + """ + if line_idx + 1 >= len(content_lines): + return + + next_line = content_lines[line_idx + 1].strip() + if not (next_line.startswith('"""') or next_line.startswith("'''")): + return + + docstring = [next_line.lstrip("'\"")] + for line in content_lines[line_idx + 2 :]: + stripped = line.strip() + if stripped.endswith('"""') or stripped.endswith("'''"): + docstring.append(stripped.rstrip("'\"")) + break + docstring.append(stripped) + + self.graph.nodes[node_id]["docstring"] = "\n".join(docstring) def _add_constant_nodes(self, entry: FileEntry) -> None: """Add constant nodes to the graph. 
Args: - entry: FileEntry containing constant definitions + entry (FileEntry): FileEntry containing constant definitions + + Examples:: + >>> entry = FileEntry("test.py", "CONSTANT = 42") + >>> builder._add_constant_nodes(entry) + >>> "test.CONSTANT" in builder.graph.nodes() + True """ module_name = Path(entry.path).stem constants: Set[str] = set() @@ -331,14 +408,12 @@ def _add_constant_nodes(self, entry: FileEntry) -> None: # Look for __constants__ list if line.startswith("__constants__"): - try: + with contextlib.suppress(Exception): # Extract constants from list/tuple definition const_list = line.split("=")[1].strip(" [](){}").replace("'", '"') constants.update( c.strip(" \"'") for c in const_list.split(",") if c.strip() ) - except Exception: - pass # Add constant nodes for const in constants: @@ -355,7 +430,13 @@ def _add_env_var_nodes(self, env_file: FileEntry) -> None: """Add environment variable nodes to the graph. Args: - env_file: FileEntry containing environment variables + env_file (FileEntry): FileEntry containing environment variables + + Examples:: + >>> env = FileEntry(".env", "API_KEY=secret") + >>> builder._add_env_var_nodes(env) + >>> "env.API_KEY" in builder.graph.nodes() + True """ for line in env_file.content.split("\n"): line = line.strip() @@ -384,15 +465,16 @@ def save_graph( """Save the graph in multiple formats and generate comparison if input file provided. 
Args: - output_dir: Directory to save output files - input_file: Optional path to original Repomix output file for comparison + output_dir (str): Directory to save output files + input_file (Optional[str]): Optional path to original Repomix output file for comparison Returns: - Tuple of (text representation, comparison report if input file provided) + Tuple[str, Optional[str]]: Tuple of (text representation, comparison report if input file provided) - Performance: - - Time: O(N) where N is number of nodes - - I/O: Multiple file write operations + Examples:: + >>> text, report = builder.save_graph("output") + >>> isinstance(text, str) + True """ start_time = time.time() @@ -415,9 +497,7 @@ def save_graph( minified_stats = analyze_file(str(text_path)) comparison_report = compare_files(original_stats, minified_stats) - export_time = (time.time() - start_time) * 1000 - self.stats["export_time_ms"] = export_time - logger.debug(f"Graph exported in {export_time:.2f}ms") + self._update_stats(start_time, "export_time_ms", "Graph exported in ") # Update total stats self.stats["total_time_ms"] = ( diff --git a/repominify/io/__init__.py b/repominify/io/__init__.py deleted file mode 100644 index ac1118f..0000000 --- a/repominify/io/__init__.py +++ /dev/null @@ -1,12 +0,0 @@ -"""I/O functionality for repo-minify. - -This package provides functionality for exporting and formatting graph data. -""" - -from .exporters import GraphExporter -from .formatters import GraphFormatter - -__all__ = [ - "GraphExporter", - "GraphFormatter", -] diff --git a/repominify/io/formatters.py b/repominify/io/formatters.py deleted file mode 100644 index 994be3e..0000000 --- a/repominify/io/formatters.py +++ /dev/null @@ -1,145 +0,0 @@ -"""Text formatting functionality for repo-minify. - -This module handles generating human-readable text representations of code graphs. 
- -Performance Considerations: - - Memory usage is O(N) where N is number of nodes - - String operations are optimized for large graphs -""" - -from __future__ import annotations - -from typing import Dict, List - -import networkx as nx - -from ..utils.logging import get_logger - -logger = get_logger(__name__) - - -class GraphFormatter: - """Formats graph data into human-readable text. - - This class provides functionality to generate text representations of - code graphs for documentation and analysis. - - Attributes: - stats: Runtime statistics for performance monitoring - """ - - def __init__(self) -> None: - """Initialize formatter with performance tracking.""" - self.stats = {"nodes_processed": 0, "total_chars": 0, "format_time_ms": 0} - - def generate_text_representation( - self, graph: nx.DiGraph, node_types: Dict[str, str] - ) -> str: - """Generate a comprehensive text representation of the codebase. - - Args: - graph: NetworkX graph to format - node_types: Mapping of node types to colors - - Returns: - Formatted string containing graph analysis and statistics - - Performance: - - Memory usage scales with graph size - - String concatenation is optimized - """ - text_parts: List[str] = [] - - # Add graph overview - text_parts.append("# Code Graph Overview") - text_parts.append(f"Total nodes: {graph.number_of_nodes()}") - text_parts.append(f"Total edges: {graph.number_of_edges()}\n") - - # Add node type statistics - text_parts.append("## Node Type Distribution") - for node_type in sorted(node_types): - count = len( - [n for n, d in graph.nodes(data=True) if d.get("type") == node_type] - ) - text_parts.append(f"- {node_type}: {count}") - self.stats["nodes_processed"] += count - text_parts.append("") - - # Add module information - text_parts.append("## Module Structure") - for node, data in sorted(graph.nodes(data=True)): - if data.get("type") == "module": - text_parts.append(f"\n### Module: {node}") - if "path" in data: - text_parts.append(f"Path: 
{data['path']}") - - # List imports - imports = [ - n - for n in graph.neighbors(node) - if graph.nodes[n].get("type") == "import" - ] - if imports: - text_parts.append("\nImports:") - for imp in sorted(imports): - text_parts.append(f"- {imp}") - - # List constants - constants = [ - n - for n in graph.neighbors(node) - if graph.nodes[n].get("type") == "constant" - ] - if constants: - text_parts.append("\nConstants:") - for const in sorted(constants): - const_data = graph.nodes[const] - text_parts.append( - f"- {const.split('.')[-1]}: {const_data.get('value', '')}" - ) - - # List classes with signatures and docstrings - classes = [ - n - for n in graph.neighbors(node) - if graph.nodes[n].get("type") == "class" - ] - if classes: - text_parts.append("\nClasses:") - for class_name in sorted(classes): - class_data = graph.nodes[class_name] - text_parts.append( - f"\n{class_data.get('signature', class_name.split('.')[-1])}" - ) - if "docstring" in class_data: - text_parts.append(f"'''\n{class_data['docstring']}\n'''") - - # List functions with signatures and docstrings - functions = [ - n - for n in graph.neighbors(node) - if graph.nodes[n].get("type") == "function" - ] - if functions: - text_parts.append("\nFunctions:") - for func_name in sorted(functions): - func_data = graph.nodes[func_name] - text_parts.append( - f"\n{func_data.get('signature', func_name.split('.')[-1])}" - ) - if "docstring" in func_data: - text_parts.append(f"'''\n{func_data['docstring']}\n'''") - - # Add environment variables if present - env_vars = [n for n, d in graph.nodes(data=True) if d.get("type") == "env_var"] - if env_vars: - text_parts.append("\n## Environment Variables") - for var in sorted(env_vars): - var_data = graph.nodes[var] - text_parts.append( - f"- {var.split('.')[-1]}: {var_data.get('value', '')}" - ) - - result = "\n".join(text_parts) - self.stats["total_chars"] = len(result) - return result diff --git a/repominify/utils/logging.py b/repominify/logging.py similarity index 75% 
rename from repominify/utils/logging.py rename to repominify/logging.py index 3181fca..4f8900c 100644 --- a/repominify/utils/logging.py +++ b/repominify/logging.py @@ -1,18 +1,14 @@ """Logging configuration for repo-minify. This module provides consistent logging configuration across the package. - -Example: - >>> from repo_minify.utils.logging import configure_logging - >>> configure_logging(debug=True) - >>> logger = get_logger(__name__) - >>> logger.debug("Debug message") """ import logging import sys from typing import Optional +from .types import LogLevel + def configure_logging(debug: bool = False, log_file: Optional[str] = None) -> None: """Configure logging for repo-minify. @@ -21,9 +17,15 @@ def configure_logging(debug: bool = False, log_file: Optional[str] = None) -> No debug: Enable debug level logging log_file: Optional file path for logging output - Note: - - Debug mode includes performance metrics and detailed tracing - - Log file will contain all messages regardless of debug setting + Raises: + OSError: If log file cannot be created or written to + ValueError: If log file path is invalid + + Examples:: + >>> configure_logging(debug=True) + >>> logger = get_logger(__name__) + >>> logger.debug("Debug message") + DEBUG: Debug message """ root_logger = logging.getLogger("repo_minify") root_logger.setLevel(logging.DEBUG if debug else logging.INFO) @@ -56,6 +58,13 @@ def get_logger(name: str) -> logging.Logger: Returns: Configured logger instance + + Examples:: + >>> logger = get_logger(__name__) + >>> isinstance(logger, logging.Logger) + True + >>> logger.name.startswith('repo_minify.') + True """ return logging.getLogger(f"repo_minify.{name}") diff --git a/repominify/core/parser.py b/repominify/parser.py similarity index 74% rename from repominify/core/parser.py rename to repominify/parser.py index 23afc43..620238a 100644 --- a/repominify/core/parser.py +++ b/repominify/parser.py @@ -1,21 +1,18 @@ """Parser for Repomix output files. 
This module handles parsing of Repomix output files into structured data. - -Performance Considerations: - - Memory: O(N) where N is file size - - Time: O(L) where L is number of lines - - Large files are read in chunks to manage memory usage """ from __future__ import annotations +import logging import re from pathlib import Path -from typing import List, Set, Tuple, Optional +from typing import List, Set, Tuple, Optional, Dict, Any -from ..utils.logging import get_logger -from .types import FileEntry, FileParseError +from .logging import get_logger +from .types import FileEntry +from .exceptions import FileParseError logger = get_logger(__name__) @@ -27,11 +24,19 @@ class RepomixParser: structure information. Attributes: - stats: Runtime statistics for performance monitoring + stats: Runtime statistics tracking parsed files and content """ + stats: Dict[str, int] + def __init__(self) -> None: - """Initialize parser with performance tracking.""" + """Initialize parser with statistics tracking. 
+ + Examples:: + >>> parser = RepomixParser() + >>> parser.stats["files_processed"] + 0 + """ self.stats = { "files_processed": 0, "total_lines": 0, @@ -50,18 +55,26 @@ def parse_file(self, file_path: str) -> List[FileEntry]: Raises: FileNotFoundError: If the input file doesn't exist - FileParseError: If the file format is invalid + FileParseError: If the file format is invalid or no entries found UnicodeDecodeError: If file encoding is not UTF-8 + + Examples:: + >>> parser = RepomixParser() + >>> entries = parser.parse_file("output.txt") + >>> len(entries) + 3 """ logger.debug(f"Parsing Repomix file: {file_path}") try: with open(file_path, "r", encoding="utf-8") as f: content = f.read() - except FileNotFoundError: - raise FileNotFoundError(f"Repomix output file not found: {file_path}") + except FileNotFoundError as e: + raise FileNotFoundError( + f"Repomix output file not found: {file_path}" + ) from e except UnicodeDecodeError as e: - raise FileParseError(f"Invalid file encoding: {e}") + raise FileParseError(f"Invalid file encoding: {e}") from e file_entries: List[FileEntry] = [] current_file: Optional[str] = None @@ -99,7 +112,7 @@ def parse_file(self, file_path: str) -> List[FileEntry]: self.stats["total_size"] += entry.size except Exception as e: - raise FileParseError(f"Failed to parse Repomix file: {e}") + raise FileParseError(f"Failed to parse Repomix file: {e}") from e if not file_entries: raise FileParseError("No valid file entries found in Repomix output") @@ -115,6 +128,12 @@ def analyze_imports(self, content: str) -> Set[str]: Returns: Set of imported module names + + Examples:: + >>> parser = RepomixParser() + >>> imports = parser.analyze_imports("import os\\nfrom typing import List") + >>> sorted(list(imports)) + ['os', 'typing.List'] """ imports: Set[str] = set() import_pattern = r"^(?:from\s+(\S+)\s+)?import\s+(.+)$" @@ -144,6 +163,16 @@ def extract_classes_and_functions( Returns: Tuple of (class names, function names) + + Examples:: + >>> parser = 
RepomixParser() + >>> classes, funcs = parser.extract_classes_and_functions( + ... "class MyClass:\\n def my_func():\\n pass" + ... ) + >>> classes + ['MyClass'] + >>> funcs + ['my_func'] """ classes: List[str] = [] functions: List[str] = [] @@ -154,11 +183,11 @@ def extract_classes_and_functions( for line in content.split("\n"): class_match = re.match(class_pattern, line) if class_match: - classes.append(class_match.group(1).strip()) + classes.append(class_match[1].strip()) continue func_match = re.match(function_pattern, line) if func_match: - functions.append(func_match.group(1).strip()) + functions.append(func_match[1].strip()) return classes, functions diff --git a/repominify/stats.py b/repominify/stats.py new file mode 100644 index 0000000..1f0879e --- /dev/null +++ b/repominify/stats.py @@ -0,0 +1,125 @@ +"""Statistics and comparison functionality for repo-minify. + +This module provides utilities for analyzing and comparing file sizes and content. +""" + +from __future__ import annotations + +import os +from pathlib import Path +from typing import Union, Set + +from .types import FileStats +from .constants import SUSPICIOUS_PATTERNS + + +def analyze_file(file_path: Union[str, Path]) -> FileStats: + """Analyze a file and generate statistics. + + Args: + file_path: Path to the file to analyze + + Returns: + FileStats object containing the analysis results + + Raises: + FileNotFoundError: If the file doesn't exist + PermissionError: If the file can't be read + UnicodeDecodeError: If the file has invalid encoding + + Examples:: + >>> stats = analyze_file("repomix-output.txt") + >>> print(f"Found {stats.total_files} files") + Found 18 files + """ + try: + with open(file_path, "r", encoding="utf-8") as f: + content = f.read() + except UnicodeDecodeError as e: + raise UnicodeDecodeError( # ctor needs all 5 fields; a bare message raises TypeError + e.encoding, e.object, e.start, e.end, f"File {file_path} has invalid encoding. Please ensure it's UTF-8: {e.reason}" + ) from e + except PermissionError as e: + raise PermissionError( + f"Cannot read file {file_path}.
Please check permissions: {e}" + ) from e + + total_files: int = 0 + file_path_str: str = str(file_path) + + # Count files based on the file type + if file_path_str.endswith("code_graph.txt"): + # For the output file, count module nodes from the Node Type Distribution section + for line in content.splitlines(): + if line.strip().startswith("- module:"): + try: + total_files = int(line.split(":")[1].strip()) + break + except (ValueError, IndexError): + total_files = 0 + else: + # For input file, count unique "File: " entries + unique_files: Set[str] = set() + for line in content.splitlines(): + if line.strip().startswith("File: "): + file_entry = line.strip()[6:].strip() # Remove "File: " prefix + unique_files.add(file_entry) + total_files = len(unique_files) + + # Get total characters (excluding whitespace) + total_chars: int = len("".join(content.split())) + + # Estimate tokens (words and symbols) + total_tokens: int = len([t for t in content.split() if t.strip()]) + + return FileStats( + total_files=total_files, + total_chars=total_chars, + total_tokens=total_tokens, + file_path=file_path_str, + has_suspicious_files=False, # No longer used in output + ) + + +def compare_files(original_stats: FileStats, minified_stats: FileStats) -> str: + """Generate a comparison report between original and minified files. + + Args: + original_stats: Statistics for the original file + minified_stats: Statistics for the minified file + + Returns: + Formatted string containing the comparison + + Raises: + ZeroDivisionError: If original file has zero characters or tokens + + Examples:: + >>> original = FileStats(1, 1000, 200, "original.txt") + >>> minified = FileStats(1, 500, 100, "minified.txt") + >>> print(compare_files(original, minified)) + ... 
# Shows comparison with 50% reduction + """ + # Calculate reductions + char_reduction: float = ( + (original_stats.total_chars - minified_stats.total_chars) + / original_stats.total_chars + * 100 + ) + token_reduction: float = ( + (original_stats.total_tokens - minified_stats.total_tokens) + / original_stats.total_tokens + * 100 + ) + + # Generate report + return ( + f"{original_stats}\n" + f"\n" + f"{minified_stats}\n" + f"\n" + f"📈 Comparison:\n" + f"────────────────\n" + f" Char Reduction: {char_reduction:.1f}%\n" + f"Token Reduction: {token_reduction:.1f}%\n" + ) diff --git a/repominify/types.py b/repominify/types.py new file mode 100644 index 0000000..b60f429 --- /dev/null +++ b/repominify/types.py @@ -0,0 +1,214 @@ +"""Core types and data structures for repo-minify. + +This module defines the fundamental data types and structures used throughout +the repo-minify package. +""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from pathlib import Path +from typing import Dict, List, Set, Any, Tuple, Optional, Union, Final, Literal + +# Type aliases for graph operations +NodeID = str +NodeData = Dict[str, str] +EdgeData = Dict[str, str] +GraphData = Dict[str, List[Dict[str, str]]] + +# Type aliases for dependency management +CommandResult = Tuple[bool, str] +ProcessOutput = Union[str, bytes] +VersionInfo = Dict[str, str] + +# Type alias for logging levels +LogLevel = Literal["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"] + +# Performance tracking type +Stats = Dict[str, Any] + + +@dataclass +class FileEntry: + """Container for file content and metadata. + + Attributes: + path: Relative path to the file + content: File content as string + + Raises: + ValueError: If path or content is empty + + Examples:: + >>> entry = FileEntry("test.py", "print('hello')") + >>> entry.line_count + 1 + >>> entry.size + 14 + """ + + path: str + content: str + + def __post_init__(self) -> None: + """Validate file entry attributes.
+ + Raises: + ValueError: If path or content is empty + """ + if not self.path.strip(): + raise ValueError("File path cannot be empty") + if not self.content: + raise ValueError("File content cannot be empty") + + @property + def line_count(self) -> int: + """Get the number of lines in the file. + + Returns: + Number of lines in the file content + + Examples:: + >>> entry = FileEntry("test.py", "line1\\nline2") + >>> entry.line_count + 2 + """ + return len(self.content.split("\n")) + + @property + def size(self) -> int: + """Get the file size in bytes. + + Returns: + Size of file content in bytes + + Examples:: + >>> entry = FileEntry("test.py", "hello") + >>> entry.size + 5 + """ + return len(self.content.encode("utf-8")) + + def __str__(self) -> str: + """Get a string representation of the file entry. + + Returns: + Formatted string with file info + + Examples:: + >>> str(FileEntry("test.py", "hello")) + 'test.py (1 lines, 5 bytes)' + """ + return f"{self.path} ({self.line_count} lines, {self.size} bytes)" + + +@dataclass +class FileStats: + """Statistics for a file or collection of files. + + Attributes: + total_files: Number of files analyzed + total_chars: Total character count (excluding whitespace) + total_tokens: Total token count (words and symbols) + file_path: Path to the analyzed file + has_suspicious_files: Whether suspicious patterns were detected + + Raises: + ValueError: If any numeric values are negative + + Examples:: + >>> stats = FileStats(1, 100, 20, "output.txt") + >>> str(stats) + '📊 File Stats:\\n────────────────\\n Total Files: 1\\n Total Chars: 100\\n Total Tokens: 20\\n Output: output.txt\\n' + """ + + total_files: int + total_chars: int + total_tokens: int + file_path: str + has_suspicious_files: bool = False + + def __post_init__(self) -> None: + """Validate statistics values. 
+ + Raises: + ValueError: If any numeric values are negative + """ + if self.total_files < 0: + raise ValueError("Total files cannot be negative") + if self.total_chars < 0: + raise ValueError("Total characters cannot be negative") + if self.total_tokens < 0: + raise ValueError("Total tokens cannot be negative") + + def __str__(self) -> str: + """Format statistics for display. + + Returns: + Formatted string with emoji and alignment + + Examples:: + >>> print(FileStats(1, 100, 20, "test.txt")) + 📊 File Stats: + ──────────────── + Total Files: 1 + Total Chars: 100 + Total Tokens: 20 + Output: test.txt + """ + return ( + f"📊 File Stats:\n" + f"────────────────\n" + f" Total Files: {self.total_files}\n" + f" Total Chars: {self.total_chars:,}\n" + f" Total Tokens: {self.total_tokens:,}\n" + f" Output: {Path(self.file_path).name}\n" + ) + + +@dataclass +class DependencyVersion: + """Container for dependency version information. + + Attributes: + name: Name of the dependency + version: Version string + is_installed: Whether the dependency is installed + install_time: Installation timestamp (if installed) + install_path: Installation path (if installed) + + Raises: + ValueError: If version string is empty or whitespace-only + + Examples:: + >>> dep = DependencyVersion("python", "3.9.0") + >>> str(dep) + 'python 3.9.0' + """ + + name: str + version: str + is_installed: bool = False + install_time: Optional[float] = field(default=None) + install_path: Optional[str] = field(default=None) + + def __post_init__(self) -> None: + """Validate version string format. + + Raises: + ValueError: If version string is empty + """ + if not self.version.strip(): + raise ValueError("Version string cannot be empty") + + def __str__(self) -> str: + """Format dependency version for display.
+ + Returns: + Formatted string with name and version + + Examples:: + >>> str(DependencyVersion("node", "14.0.0")) + 'node 14.0.0' + """ + return f"{self.name} {self.version}" diff --git a/repominify/utils/stats.py b/repominify/utils/stats.py deleted file mode 100644 index 38bfc83..0000000 --- a/repominify/utils/stats.py +++ /dev/null @@ -1,212 +0,0 @@ -"""Statistics and comparison functionality for repo-minify. - -This module provides utilities for analyzing and comparing file sizes and content. - -Author: Mike Casale -Email: mike@casale.xyz -GitHub: https://github.com/mikewcasale - -Performance Considerations: - - Memory: O(N) where N is file size - - I/O: One read operation per file - - CPU: Linear scan for pattern matching - -Error Handling: - - FileNotFoundError: When input files don't exist - - UnicodeDecodeError: When files have invalid encoding - - PermissionError: When files can't be accessed - -Version Compatibility: - - Python 3.7+: Full support - - Type hints: Required for static analysis -""" - -from __future__ import annotations - -import os -from dataclasses import dataclass, field -from pathlib import Path -from typing import List, Dict, Set, Final - -# Security patterns to detect -SUSPICIOUS_PATTERNS: Final[Set[str]] = { - "password", - "secret", - "token", - "api_key", - "private_key", - "ssh_key", - "credentials", - "auth", -} - - -@dataclass -class FileStats: - """Statistics for a file or collection of files. - - Attributes: - total_files: Number of files analyzed - total_chars: Total character count (excluding whitespace) - total_tokens: Total token count (words and symbols) - file_path: Path to the analyzed file - has_suspicious_files: Whether suspicious patterns were detected - - Example: - >>> stats = FileStats(total_files=1, total_chars=1000, - ... 
total_tokens=200, file_path="example.txt") - >>> print(stats) - 📊 File Stats: - ──────────────── - Total Files: 1 - Total Chars: 1,000 - Total Tokens: 200 - Output: example.txt - """ - - # Required fields - total_files: int - total_chars: int - total_tokens: int - file_path: str - - # Optional fields with defaults - has_suspicious_files: bool = False - - def __str__(self) -> str: - """Format statistics for display. - - Returns: - Formatted string with emoji and alignment - - Performance: - - Time: O(1) - Fixed string operations - - Memory: O(1) - Fixed size output - """ - return ( - f"📊 File Stats:\n" - f"────────────────\n" - f" Total Files: {self.total_files}\n" - f" Total Chars: {self.total_chars:,}\n" - f" Total Tokens: {self.total_tokens:,}\n" - f" Output: {Path(self.file_path).name}\n" - ) - - -def analyze_file(file_path: str | Path) -> FileStats: - """Analyze a file and generate statistics. - - Args: - file_path: Path to the file to analyze - - Returns: - FileStats object containing the analysis results - - Raises: - FileNotFoundError: If the file doesn't exist - PermissionError: If the file can't be read - UnicodeDecodeError: If the file has invalid encoding - - Example: - >>> stats = analyze_file("repomix-output.txt") - >>> print(f"Found {stats.total_files} files") - Found 18 files - - Performance: - - Time: O(N) where N is file size - - Memory: O(N) for file content - - I/O: One read operation - """ - try: - with open(file_path, "r", encoding="utf-8") as f: - content = f.read() - except UnicodeDecodeError as e: - raise UnicodeDecodeError( - f"File {file_path} has invalid encoding. Please ensure it's UTF-8: {e}" - ) - except PermissionError as e: - raise PermissionError( - f"Cannot read file {file_path}. 
Please check permissions: {e}" - ) - - # Count files based on the file type - if str(file_path).endswith("code_graph.txt"): - # For the output file, count module nodes from the Node Type Distribution section - for line in content.splitlines(): - if line.strip().startswith("- module:"): - try: - total_files = int(line.split(":")[1].strip()) - break - except (ValueError, IndexError): - total_files = 0 - else: - # For input file, count unique "File: " entries - unique_files = set() - for line in content.splitlines(): - if line.strip().startswith("File: "): - file_path_str = line.strip()[6:].strip() # Remove "File: " prefix - unique_files.add(file_path_str) - total_files = len(unique_files) - - # Get total characters (excluding whitespace) - total_chars = len("".join(content.split())) - - # Estimate tokens (words and symbols) - total_tokens = len([t for t in content.split() if t.strip()]) - - return FileStats( - total_files=total_files, - total_chars=total_chars, - total_tokens=total_tokens, - file_path=str(file_path), - has_suspicious_files=False # No longer used in output - ) - - -def compare_files(original_stats: FileStats, minified_stats: FileStats) -> str: - """Generate a comparison report between original and minified files. - - Args: - original_stats: Statistics for the original file - minified_stats: Statistics for the minified file - - Returns: - Formatted string containing the comparison - - Example: - >>> original = FileStats(1, 1000, 200, "original.txt") - >>> minified = FileStats(1, 500, 100, "minified.txt") - >>> print(compare_files(original, minified)) - ... # Shows comparison with 50% reduction - - Performance: - - Time: O(1) - Simple arithmetic operations - - Memory: O(1) - Fixed size string output - - Note: - The comparison assumes both files contain valid data and - the minified file is derived from the original. 
- """ - # Calculate reductions - char_reduction = ( - (original_stats.total_chars - minified_stats.total_chars) - / original_stats.total_chars - * 100 - ) - token_reduction = ( - (original_stats.total_tokens - minified_stats.total_tokens) - / original_stats.total_tokens - * 100 - ) - - # Generate report - return ( - f"{original_stats}\n" - f"\n" - f"{minified_stats}\n" - f"\n" - f"📈 Comparison:\n" - f"────────────────\n" - f" Char Reduction: {char_reduction:.1f}%\n" - f"Token Reduction: {token_reduction:.1f}%\n" - )