From 99cbe3c27978bd7db6385e37beed27948ea920a7 Mon Sep 17 00:00:00 2001 From: JayGhiya Date: Thu, 25 Jul 2024 12:09:44 +0530 Subject: [PATCH 1/2] fix: graph data structure for construction of codebase --- .../data_models/chapi_unoplat_package.py | 6 +++- .../dspy/dspy_unoplat_codebase_summary.py | 3 +- .../dspy/dspy_unoplat_function_summary.py | 4 ++- .../dspy/dspy_unoplat_node_summary.py | 3 +- .../dspy/dspy_unoplat_package_summary.py | 8 ++--- .../dspy_codebase_summary.py | 4 +-- .../dspy_package_summary.py | 4 +-- .../loader/parse_json.py | 32 ++++++++++++------- .../summary_parser/codebase_summary.py | 4 +-- 9 files changed, 41 insertions(+), 27 deletions(-) diff --git a/unoplat-code-confluence/unoplat_code_confluence/data_models/chapi_unoplat_package.py b/unoplat-code-confluence/unoplat_code_confluence/data_models/chapi_unoplat_package.py index 3bec709..480ad3d 100644 --- a/unoplat-code-confluence/unoplat_code_confluence/data_models/chapi_unoplat_package.py +++ b/unoplat-code-confluence/unoplat_code_confluence/data_models/chapi_unoplat_package.py @@ -5,4 +5,8 @@ class UnoplatPackage(BaseModel): - package_dict: Optional[Dict[str,List[DspyUnoplatNodeSubset]]] = Field(default_factory=dict,alias="package_dict") + node_subsets: Optional[List[DspyUnoplatNodeSubset]] = Field( default_factory=list,description="List of the node subsets for the package") + sub_packages: Optional[Dict[str, 'UnoplatPackage']] = Field( default_factory=dict,description="Dict of the sub-packages for the package") + +UnoplatPackage.model_rebuild() + \ No newline at end of file diff --git a/unoplat-code-confluence/unoplat_code_confluence/data_models/dspy/dspy_unoplat_codebase_summary.py b/unoplat-code-confluence/unoplat_code_confluence/data_models/dspy/dspy_unoplat_codebase_summary.py index 45c6991..1aa1d0a 100644 --- a/unoplat-code-confluence/unoplat_code_confluence/data_models/dspy/dspy_unoplat_codebase_summary.py +++ b/unoplat-code-confluence/unoplat_code_confluence/data_models/dspy/dspy_unoplat_codebase_summary.py @@ -5,8 +5,7 @@ class DspyUnoplatCodebaseSummary(BaseModel): codebase_summary: Optional[str] = Field(default=None, description="A summary of the codebase") - codebase_objective: Optional[str] = Field(default=None, description="The objective of the codebase") - + metadata: Optional[dict] = Field(default=None, description="The metadata of the codebase") codebase_name: Optional[str] = Field( default=None,description="The file id of the codebase summary") codebase_package: Optional[DspyUnoplatPackageSummary] = Field(default=None,description="A summary of the codebase package") \ No newline at end of file diff --git a/unoplat-code-confluence/unoplat_code_confluence/data_models/dspy/dspy_unoplat_function_summary.py b/unoplat-code-confluence/unoplat_code_confluence/data_models/dspy/dspy_unoplat_function_summary.py index c8311cf..3b0b591 100644 --- a/unoplat-code-confluence/unoplat_code_confluence/data_models/dspy/dspy_unoplat_function_summary.py +++ b/unoplat-code-confluence/unoplat_code_confluence/data_models/dspy/dspy_unoplat_function_summary.py @@ -1,7 +1,9 @@ +from typing import Optional from pydantic import BaseModel,Field from unoplat_code_confluence.data_models.dspy.dspy_o_function_summary import DspyFunctionSummary class DspyUnoplatFunctionSummary(BaseModel): function_name: str = Field( alias="FunctionName", description="The name of the function") - function_summary: DspyFunctionSummary = Field( alias="FunctionSummary", description="A summary of the function") \ No newline at end of file + function_summary: DspyFunctionSummary = Field( alias="FunctionSummary", description="A summary of the function") + metadata: Optional[dict] = Field(default=None, description="Additional metadata for the function") \ No newline at end of file diff --git a/unoplat-code-confluence/unoplat_code_confluence/data_models/dspy/dspy_unoplat_node_summary.py b/unoplat-code-confluence/unoplat_code_confluence/data_models/dspy/dspy_unoplat_node_summary.py index 29e13b7..33f3287 100644 --- a/unoplat-code-confluence/unoplat_code_confluence/data_models/dspy/dspy_unoplat_node_summary.py +++ b/unoplat-code-confluence/unoplat_code_confluence/data_models/dspy/dspy_unoplat_node_summary.py @@ -7,4 +7,5 @@ class DspyUnoplatNodeSummary(BaseModel): node_name: Optional[str] = Field(default=None, alias="NodeName",description="The name of the class") node_summary: Optional[str] = Field(default=None, alias="NodeSummary",description="A summary of the class") node_objective: Optional[str] = Field(default=None, alias="NodeObjective",description="The objective of the class") - functions_summary: Optional[List[DspyUnoplatFunctionSummary]] = Field(default=None, alias="FunctionsSummary",description="A list of functions in the class") \ No newline at end of file + functions_summary: Optional[List[DspyUnoplatFunctionSummary]] = Field(default=None, alias="FunctionsSummary",description="A list of functions in the class") + metadata: Optional[dict] = Field(default=None, description="Additional metadata for the node") \ No newline at end of file diff --git a/unoplat-code-confluence/unoplat_code_confluence/data_models/dspy/dspy_unoplat_package_summary.py b/unoplat-code-confluence/unoplat_code_confluence/data_models/dspy/dspy_unoplat_package_summary.py index eb8843c..c957368 100644 --- a/unoplat-code-confluence/unoplat_code_confluence/data_models/dspy/dspy_unoplat_package_summary.py +++ b/unoplat-code-confluence/unoplat_code_confluence/data_models/dspy/dspy_unoplat_package_summary.py @@ -6,12 +6,12 @@ -class DspyUnoplatPackageNodeSummary(BaseModel): +class DspyUnoplatPackageSummary(BaseModel): package_objective: str = Field( description="The objective of the package in a concise manner") package_summary: str = Field( description="The detailed summary of the package") class_summary: List[DspyUnoplatNodeSummary] = Field( default_factory=list,description="List of the class summaries for the package") + metadata: Optional[dict] = Field(default=None, description="Additional metadata for the package") + sub_packages: List['DspyUnoplatPackageSummary'] = Field( default_factory=list,description="List of the sub-packages for the package") - -class DspyUnoplatPackageSummary(BaseModel): - package_summary_dict: Optional[Dict[str, DspyUnoplatPackageNodeSummary]] = Field(default_factory=dict,description="Dict to hold the summary of packages") +DspyUnoplatPackageSummary.model_rebuild() diff --git a/unoplat-code-confluence/unoplat_code_confluence/dspy_codebase_summary.py b/unoplat-code-confluence/unoplat_code_confluence/dspy_codebase_summary.py index 0fc3d3f..081bda3 100644 --- a/unoplat-code-confluence/unoplat_code_confluence/dspy_codebase_summary.py +++ b/unoplat-code-confluence/unoplat_code_confluence/dspy_codebase_summary.py @@ -1,6 +1,6 @@ from typing import Dict import dspy -from unoplat_code_confluence.data_models.dspy.dspy_unoplat_package_summary import DspyUnoplatPackageNodeSummary +from unoplat_code_confluence.data_models.dspy.dspy_unoplat_package_summary import DspyUnoplatPackageSummary @@ -24,7 +24,7 @@ def __init__(self): self.generate_codebase_objective = dspy.ChainOfThoughtWithHint(CodeConfluenceCodebaseObjectiveSignature) - def forward(self, package_objective_dict: Dict[str, DspyUnoplatPackageNodeSummary]): + def forward(self, package_objective_dict: Dict[str, DspyUnoplatPackageSummary]): codebase_summary = "" summary_hint="Enhance the existing codebase summary based on current package objective without loosing important details from existing codebase summary. So be cautious while being concise. " diff --git a/unoplat-code-confluence/unoplat_code_confluence/dspy_package_summary.py b/unoplat-code-confluence/unoplat_code_confluence/dspy_package_summary.py index e96e6c1..fd2f699 100644 --- a/unoplat-code-confluence/unoplat_code_confluence/dspy_package_summary.py +++ b/unoplat-code-confluence/unoplat_code_confluence/dspy_package_summary.py @@ -1,7 +1,7 @@ from typing import Dict, List import dspy from unoplat_code_confluence.data_models.dspy.dspy_unoplat_node_summary import DspyUnoplatNodeSummary -from unoplat_code_confluence.data_models.dspy.dspy_unoplat_package_summary import DspyUnoplatPackageNodeSummary +from unoplat_code_confluence.data_models.dspy.dspy_unoplat_package_summary import DspyUnoplatPackageSummary from loguru import logger #TODO: optimise using gpt4 judge and miprov2s @@ -36,7 +36,7 @@ def forward(self, class_objective_list: List[DspyUnoplatNodeSummary],package_nam package_objective_hint = "First capture all highlights from summary and based on highlights generate the package objective for the package by being concise and dnt miss on any details for:"+package_name+". Do not extrapolate or make up anything. Strictly be factual and grounded." class_objective_signature: CodeConfluencePackageObjectiveSignature = self.generate_package_objective(final_package_summary=package_summary,package_name=package_name,hint=package_objective_hint) - dspy_package_summary = DspyUnoplatPackageNodeSummary(package_objective=class_objective_signature.package_objective,package_summary=package_summary,class_summary=class_objective_list) + dspy_package_summary = DspyUnoplatPackageSummary(package_objective=class_objective_signature.package_objective,package_summary=package_summary,class_summary=class_objective_list) return dspy.Prediction(answer=dspy_package_summary) diff --git a/unoplat-code-confluence/unoplat_code_confluence/loader/parse_json.py b/unoplat-code-confluence/unoplat_code_confluence/loader/parse_json.py index d0f77e4..885247c 100644 --- a/unoplat-code-confluence/unoplat_code_confluence/loader/parse_json.py +++ b/unoplat-code-confluence/unoplat_code_confluence/loader/parse_json.py @@ -7,6 +7,7 @@ from unoplat_code_confluence.data_models.dspy.dspy_unoplat_fs_function_call_subset import DspyUnoplatFunctionCallSubset from unoplat_code_confluence.data_models.dspy.dspy_unoplat_fs_function_subset import DspyUnoplatFunctionSubset from unoplat_code_confluence.data_models.dspy.dspy_unoplat_fs_node_subset import DspyUnoplatNodeSubset +from unoplat_code_confluence.data_models.dspy.dspy_unoplat_package_summary import DspyUnoplatPackageSummary from unoplat_code_confluence.loader.iparse_json import IParseJson from unoplat_code_confluence.data_models.chapi_unoplat_node import Node from loguru import logger @@ -17,10 +18,8 @@ def parse_json_to_nodes(self, json_data: dict, local_workspace_path: str, progra """Concrete implementation of the parse_json_to_nodes method.""" unoplat_codebase = UnoplatCodebase() - - unoplat_package = UnoplatPackage() - unoplat_package_dict: Dict[str,List[DspyUnoplatNodeSubset]] = {} + unoplat_package_dict: Dict[str,UnoplatPackage] = {} for item in json_data: try: @@ -58,16 +57,25 @@ def parse_json_to_nodes(self, json_data: dict, local_workspace_path: str, progra node_subset.functions = function_subset_list - if node.package in unoplat_package_dict: - unoplat_package_dict[node.package].append(node_subset) - else: - logger.debug(f"Identified new package: {node.package}") - list_node_subset: List[DspyUnoplatNodeSubset] = [] - list_node_subset.append(node_subset) - unoplat_package_dict[node.package] = list_node_subset + package_parts = node.package.split('.') + current_package = None + + + for i, part in enumerate(package_parts): + if i == 0: + if part not in unoplat_package_dict: + unoplat_package_dict[part] = UnoplatPackage() + current_package = unoplat_package_dict[part] + else: + if part not in current_package.sub_packages: + current_package.sub_packages[part] = UnoplatPackage() + current_package = current_package.sub_packages[part] + + + current_package.node_subsets.append(node_subset) + except Exception as e: logger.error(f"Error processing node: {e}") - unoplat_package.package_dict = unoplat_package_dict - unoplat_codebase.packages = unoplat_package + unoplat_codebase.packages = unoplat_package_dict return unoplat_codebase \ No newline at end of file diff --git a/unoplat-code-confluence/unoplat_code_confluence/summary_parser/codebase_summary.py b/unoplat-code-confluence/unoplat_code_confluence/summary_parser/codebase_summary.py index 2e48506..a29a5c9 100644 --- a/unoplat-code-confluence/unoplat_code_confluence/summary_parser/codebase_summary.py +++ b/unoplat-code-confluence/unoplat_code_confluence/summary_parser/codebase_summary.py @@ -5,7 +5,7 @@ from unoplat_code_confluence.data_models.dspy.dspy_unoplat_codebase_summary import DspyUnoplatCodebaseSummary from unoplat_code_confluence.data_models.dspy.dspy_unoplat_function_summary import DspyUnoplatFunctionSummary from unoplat_code_confluence.data_models.dspy.dspy_unoplat_node_summary import DspyUnoplatNodeSummary -from unoplat_code_confluence.data_models.dspy.dspy_unoplat_package_summary import DspyUnoplatPackageNodeSummary, DspyUnoplatPackageSummary +from unoplat_code_confluence.data_models.dspy.dspy_unoplat_package_summary import DspyUnoplatPackageSummary from unoplat_code_confluence.dspy_class_summary import CodeConfluenceClassModule from unoplat_code_confluence.dspy_codebase_summary import CodeConfluenceCodebaseModule, CodeConfluenceCodebaseObjectiveSignature from unoplat_code_confluence.dspy_function_summary import CodeConfluenceFunctionModule @@ -79,7 +79,7 @@ def parse_codebase(self) -> DspyUnoplatCodebaseSummary: logger.error(f"Error generating class summary for {node.name}: {e}") logger.exception("Traceback:") try: - dspy_pipeline_package_node_summary: DspyUnoplatPackageNodeSummary = self.dspy_pipeline_package(class_summaries,package_name).answer + dspy_pipeline_package_node_summary: DspyUnoplatPackageSummary = self.dspy_pipeline_package(class_summaries,package_name).answer except Exception as e: logger.error(f"Error generating package summary for {package_name}: {e}") logger.exception("Traceback:") From a0f9f8d7830ec71942a06e95ea465c29a828da21 Mon Sep 17 00:00:00 2001 From: JayGhiya Date: Sun, 28 Jul 2024 09:49:55 +0530 Subject: [PATCH 2/2] fix: added nested progress bars, support for efficient graph traversal and support for files that are not object oriented --- .gitignore | 2 + unoplat-code-confluence/pyproject.toml | 1 + .../unoplat_code_confluence/__main__.py | 2 +- .../data_models/chapi_unoplat_package.py | 1 + .../dspy/dspy_unoplat_codebase_summary.py | 4 +- .../dspy/dspy_unoplat_package_summary.py | 4 +- .../dspy_class_summary.py | 7 +- .../dspy_function_summary.py | 1 + .../dspy_package_summary.py | 40 ++-- .../loader/parse_json.py | 83 ++++----- .../__init__.py | 0 .../isummariser.py | 0 .../markdownparser/markdownsummariser.py | 63 +++++++ .../nodeparser/markdownsummariser.py | 41 ---- .../summary_parser/codebase_summary.py | 176 +++++++++++++++++- 15 files changed, 314 insertions(+), 111 deletions(-) rename unoplat-code-confluence/unoplat_code_confluence/{nodeparser => markdownparser}/__init__.py (100%) rename unoplat-code-confluence/unoplat_code_confluence/{nodeparser => markdownparser}/isummariser.py (100%) create mode 100644 unoplat-code-confluence/unoplat_code_confluence/markdownparser/markdownsummariser.py delete mode 100644 unoplat-code-confluence/unoplat_code_confluence/nodeparser/markdownsummariser.py diff --git a/.gitignore b/.gitignore index a560505..c67f92b 100755 --- a/.gitignore +++ b/.gitignore @@ -52,3 +52,5 @@ unoplat-code-confluence/unoplat_code_confluence/example_config.json unoplat-code-confluence/poetry.lock app.log unoplat-code-confluence/unoplat_code_confluence/configuration/__pycache__ +unoplat-code-confluence/unoplat_code_confluence/markdownparser/__pycache__ +unoplat-code-confluence/~/Documents/unoplat diff --git a/unoplat-code-confluence/pyproject.toml b/unoplat-code-confluence/pyproject.toml index e23656e..c72ee52 100644 --- a/unoplat-code-confluence/pyproject.toml +++ b/unoplat-code-confluence/pyproject.toml @@ -18,6 +18,7 @@ litellm = "^1.37.19" pytest = "^8.2.1" dspy-ai = "^2.4.9" packaging = "^24.1" +progiter = "^2.0.0" [tool.poetry.scripts] unoplat-code-confluence = "unoplat_code_confluence.__main__:start_pipeline" diff --git a/unoplat-code-confluence/unoplat_code_confluence/__main__.py b/unoplat-code-confluence/unoplat_code_confluence/__main__.py index 243a4b9..3abf30e 100644 --- a/unoplat-code-confluence/unoplat_code_confluence/__main__.py +++ b/unoplat-code-confluence/unoplat_code_confluence/__main__.py @@ -15,7 +15,7 @@ from unoplat_code_confluence.loader import iload_json, iparse_json from unoplat_code_confluence.loader.json_loader import JsonLoader from unoplat_code_confluence.loader.parse_json import JsonParser -from unoplat_code_confluence.nodeparser.markdownsummariser import MarkdownSummariser +from unoplat_code_confluence.markdownparser.markdownsummariser import MarkdownSummariser from unoplat_code_confluence.summary_parser.codebase_summary import CodebaseSummaryParser import warnings from packaging import version diff --git a/unoplat-code-confluence/unoplat_code_confluence/data_models/chapi_unoplat_package.py b/unoplat-code-confluence/unoplat_code_confluence/data_models/chapi_unoplat_package.py index 480ad3d..1a949b3 100644 --- a/unoplat-code-confluence/unoplat_code_confluence/data_models/chapi_unoplat_package.py +++ b/unoplat-code-confluence/unoplat_code_confluence/data_models/chapi_unoplat_package.py @@ -5,6 +5,7 @@ class UnoplatPackage(BaseModel): + name: Optional[str] = Field(default=None,description="Name of the package") node_subsets: Optional[List[DspyUnoplatNodeSubset]] = Field( default_factory=list,description="List of the node subsets for the package") sub_packages: Optional[Dict[str, 'UnoplatPackage']] = Field( default_factory=dict,description="Dict of the sub-packages for the package") diff --git a/unoplat-code-confluence/unoplat_code_confluence/data_models/dspy/dspy_unoplat_codebase_summary.py b/unoplat-code-confluence/unoplat_code_confluence/data_models/dspy/dspy_unoplat_codebase_summary.py index 1aa1d0a..9793eb0 100644 --- a/unoplat-code-confluence/unoplat_code_confluence/data_models/dspy/dspy_unoplat_codebase_summary.py +++ b/unoplat-code-confluence/unoplat_code_confluence/data_models/dspy/dspy_unoplat_codebase_summary.py @@ -1,4 +1,4 @@ -from typing import Optional +from typing import Dict, Optional from pydantic import BaseModel,Field from unoplat_code_confluence.data_models.dspy.dspy_unoplat_package_summary import DspyUnoplatPackageSummary @@ -8,4 +8,4 @@ class DspyUnoplatCodebaseSummary(BaseModel): codebase_objective: Optional[str] = Field(default=None, description="The objective of the codebase") metadata: Optional[dict] = Field(default=None, description="The metadata of the codebase") codebase_name: Optional[str] = Field( default=None,description="The file id of the codebase summary") - codebase_package: Optional[DspyUnoplatPackageSummary] = Field(default=None,description="A summary of the codebase package") \ No newline at end of file + codebase_package: Optional[Dict[str,DspyUnoplatPackageSummary]] = Field(default_factory=dict,description="A summary of the codebase package") \ No newline at end of file diff --git a/unoplat-code-confluence/unoplat_code_confluence/data_models/dspy/dspy_unoplat_package_summary.py b/unoplat-code-confluence/unoplat_code_confluence/data_models/dspy/dspy_unoplat_package_summary.py index c957368..b272ea5 100644 --- a/unoplat-code-confluence/unoplat_code_confluence/data_models/dspy/dspy_unoplat_package_summary.py +++ b/unoplat-code-confluence/unoplat_code_confluence/data_models/dspy/dspy_unoplat_package_summary.py @@ -11,7 +11,7 @@ class DspyUnoplatPackageSummary(BaseModel): package_summary: str = Field( description="The detailed summary of the package") class_summary: List[DspyUnoplatNodeSummary] = Field( default_factory=list,description="List of the class summaries for the package") metadata: Optional[dict] = Field(default=None, description="Additional metadata for the package") - sub_packages: List['DspyUnoplatPackageSummary'] = Field( default_factory=list,description="List of the sub-packages for the package") - + sub_package_summaries: Dict[str, 'DspyUnoplatPackageSummary'] = Field(default_factory=dict, description="Dictionary of sub-package summaries, keyed by package name") + DspyUnoplatPackageSummary.model_rebuild() diff --git a/unoplat-code-confluence/unoplat_code_confluence/dspy_class_summary.py b/unoplat-code-confluence/unoplat_code_confluence/dspy_class_summary.py index 9d4c518..ee0bb12 100644 --- a/unoplat-code-confluence/unoplat_code_confluence/dspy_class_summary.py +++ b/unoplat-code-confluence/unoplat_code_confluence/dspy_class_summary.py @@ -18,7 +18,7 @@ class CodeConfluenceClassSummarySignature(dspy.Signature): class CodeConfluenceClassObjectiveSignature(dspy.Signature): """This signature takes in class summary and returns concise class_objective of the class. Do not include your reasoning in class_objective.""" - final_class_summary: str = dspy.InputField(desc="This should contain concise detailed implementation summary of the class or in some cases direct content of the class if it is just a data model object") + final_class_summary: str = dspy.InputField(desc="This should contain concise detailed implementation summary of the class or in some cases direct content of the class if it is just a data model object.") class_objective: str = dspy.OutputField(desc="This should contain concise objective of the class based on implementation summary in under 2 lines without loosing on any details") @@ -36,7 +36,10 @@ def forward(self, class_metadata: DspyUnoplatNodeSubset, function_objective_summ signature_class_summary = self.generate_class_summary(class_existing_summary=class_summary, function_summary=function_objective.function_summary.objective, class_metadata=str(class_metadata.model_dump_json()),hint="Generate the class detailed summary for the class by being concise , factual and grounded.:"+class_metadata.node_name) class_summary = signature_class_summary.final_class_summary - hint="Generate the class objective for the class by being concise and dnt miss on any details.:"+class_metadata.node_name + if class_metadata.node_name is not None: + hint="Generate the class objective for the class by being concise and dnt miss on any details.:"+class_metadata.node_name + else: + hint="Generate the class objective for the class by being concise and dnt miss on any details." if len(function_objective_summary) > 0: class_objective_signature = self.generate_class_objective(final_class_summary = class_summary,hint=hint) diff --git a/unoplat-code-confluence/unoplat_code_confluence/dspy_function_summary.py b/unoplat-code-confluence/unoplat_code_confluence/dspy_function_summary.py index e42378c..d1e0c9a 100644 --- a/unoplat-code-confluence/unoplat_code_confluence/dspy_function_summary.py +++ b/unoplat-code-confluence/unoplat_code_confluence/dspy_function_summary.py @@ -41,6 +41,7 @@ def forward(self, function_metadata: DspyUnoplatFunctionSubset, class_metadata: class_subset = str(class_metadata.model_dump_json()) function_subset = str(function_metadata.model_dump_json()) + function_summary = self.generate_function_summary(chapi_function_metadata=function_subset).unoplat_function_summary for function_call in function_metadata.function_calls: diff --git a/unoplat-code-confluence/unoplat_code_confluence/dspy_package_summary.py b/unoplat-code-confluence/unoplat_code_confluence/dspy_package_summary.py index fd2f699..91c8908 100644 --- a/unoplat-code-confluence/unoplat_code_confluence/dspy_package_summary.py +++ b/unoplat-code-confluence/unoplat_code_confluence/dspy_package_summary.py @@ -5,38 +5,52 @@ from loguru import logger #TODO: optimise using gpt4 judge and miprov2s + +class CodeConfluenceSubPackageSignature(dspy.Signature): + """This signature takes in existing summary of a root package and sub package summary and refines root_package_existing_summary with new insights without loosing on existing insights and returns root_package_final_summary. """ + root_package_existing_summary: str = dspy.InputField(default="package existing summary:",desc="This will contain existing package summary") + sub_package_summary: str = dspy.InputField(desc="This will contain summary of the sub package") + sub_package_name: str = dspy.InputField(desc="This will contain name of the sub package") + root_package_final_summary: str = dspy.OutputField(desc="This will contain improved concise package summary without loosing on existing details") + + class CodeConfluencePackageSignature(dspy.Signature): - """This signature takes in existing summary of a class and function summary of a class one at a time and refines package_existing_summary with new insights and returns final_package_summary. """ - package_existing_summary: str = dspy.InputField(default="package existing summary:",desc="This will contain existing package summary") - class_objective: str = dspy.InputField(desc="This will contain current class objective based on which existing package summary has to be improved") - package_name: str = dspy.InputField(desc="This will contain name of the package") - final_package_summary: str = dspy.OutputField(desc="This will contain improved concise package summary") + """This signature takes in existing summary of root package and based on class summary of that package one at a time refines root_package_existing_summary with new insights without loosing on any existing details and returns root_package_final_summary. """ + root_package_existing_summary: str = dspy.InputField(default="package existing summary:",desc="This will contain existing package summary") + root_class_objective: str = dspy.InputField(desc="This will contain current class objective based on which existing package summary has to be enhanced") + root_package_name: str = dspy.InputField(desc="This will contain name of the package") + root_package_final_summary: str = dspy.OutputField(desc="This will contain improved concise package summary") class CodeConfluencePackageObjectiveSignature(dspy.Signature): """This signature takes in package summary and returns concise package_objective of the package.""" - final_package_summary: str = dspy.InputField(desc="This will contain concise detailed implementation summary of the package") - package_name: str = dspy.InputField(desc="This will contain name of the package") - package_objective: str = dspy.OutputField(desc="This will contain concise objective of the package based on package summary") + root_package_summary: str = dspy.InputField(desc="This will contain concise detailed implementation summary of the package") + root_package_name: str = dspy.InputField(desc="This will contain name of the package") + root_package_objective: str = dspy.OutputField(desc="This will contain concise objective of the package based on package summary") class CodeConfluencePackageModule(dspy.Module): def __init__(self): super().__init__() + self.generate_sub_package_summary = dspy.ChainOfThought(CodeConfluenceSubPackageSignature) self.generate_package_summary = dspy.ChainOfThoughtWithHint(CodeConfluencePackageSignature) self.generate_package_objective = dspy.ChainOfThoughtWithHint(CodeConfluencePackageObjectiveSignature) - def forward(self, class_objective_list: List[DspyUnoplatNodeSummary],package_name: str): + def forward(self, class_objective_list: List[DspyUnoplatNodeSummary],package_name: str,sub_package_summaries: Dict[str,DspyUnoplatPackageSummary]): package_summary_hint="Enhance the package summary +:"+package_name+" based on class objective. Do not extrapolate or make up anything. Strictly be factual and grounded.While enhancing the package summary do not loose any existing important details by being overly concise." package_summary = "" + + for sub_package_name,sub_package_summary in sub_package_summaries.items(): + package_summary = self.generate_sub_package_summary(root_package_existing_summary=package_summary,sub_package_summary=sub_package_summary.package_summary,sub_package_name=sub_package_name).root_package_final_summary + for class_objective in class_objective_list: - signature_package_summary: CodeConfluencePackageSignature = self.generate_package_summary(package_existing_summary=package_summary, package_name=package_name,class_objective=class_objective.node_objective,hint=package_summary_hint) - package_summary = signature_package_summary.final_package_summary + signature_package_summary: CodeConfluencePackageSignature = self.generate_package_summary(root_package_existing_summary=package_summary, root_class_objective=class_objective.node_objective,root_package_name=package_name,hint=package_summary_hint) + package_summary = signature_package_summary.root_package_final_summary package_objective_hint = "First capture all highlights from summary and based on highlights generate the package objective for the package by being concise and dnt miss on any details for:"+package_name+". Do not extrapolate or make up anything. Strictly be factual and grounded." - class_objective_signature: CodeConfluencePackageObjectiveSignature = self.generate_package_objective(final_package_summary=package_summary,package_name=package_name,hint=package_objective_hint) - dspy_package_summary = DspyUnoplatPackageSummary(package_objective=class_objective_signature.package_objective,package_summary=package_summary,class_summary=class_objective_list) + package_objective_signature: CodeConfluencePackageObjectiveSignature = self.generate_package_objective(root_package_summary=package_summary,root_package_name=package_name,hint=package_objective_hint) + dspy_package_summary = DspyUnoplatPackageSummary(package_objective=package_objective_signature.root_package_objective,package_summary=package_summary,class_summary=class_objective_list) return dspy.Prediction(answer=dspy_package_summary) diff --git a/unoplat-code-confluence/unoplat_code_confluence/loader/parse_json.py b/unoplat-code-confluence/unoplat_code_confluence/loader/parse_json.py index 885247c..f6ef8f7 100644 --- a/unoplat-code-confluence/unoplat_code_confluence/loader/parse_json.py +++ b/unoplat-code-confluence/unoplat_code_confluence/loader/parse_json.py @@ -11,7 +11,7 @@ from unoplat_code_confluence.loader.iparse_json import IParseJson from unoplat_code_confluence.data_models.chapi_unoplat_node import Node from loguru import logger -from unoplat_code_confluence.nodeparser.isummariser import ISummariser +from unoplat_code_confluence.markdownparser.isummariser import ISummariser class JsonParser(IParseJson): def parse_json_to_nodes(self, json_data: dict, local_workspace_path: str, programming_language: str) -> UnoplatCodebase: @@ -24,55 +24,54 @@ def parse_json_to_nodes(self, json_data: dict, local_workspace_path: str, progra for item in json_data: try: node = Node(**item) + if node.node_name == "default": + node.node_name = os.path.basename(node.file_path).split('.')[0] - # TODO: node package is not present in python. so use node file path and local workspace path to identify package if programming_language.lower() == 'python': relative_path = os.path.relpath(node.file_path, local_workspace_path) node.package = os.path.dirname(relative_path).replace(os.path.sep, '.') node.package = node.package if node.package else 'root' - if node.type == 'CLASS': - # Creating node subset - node_subset = DspyUnoplatNodeSubset( - NodeName=node.node_name, - Imports=node.imports, - Extend=node.extend, - MultipleExtend=node.multiple_extend, - Fields=node.fields, + + # Creating node subset + node_subset = DspyUnoplatNodeSubset( + NodeName=node.node_name, + Imports=node.imports, + Extend=node.extend, + MultipleExtend=node.multiple_extend, + Fields=node.fields, + Annotations=[DspyUnoplatAnnotationSubset(Name=annotation.name,KeyValues=annotation.key_values) for annotation in node.annotations], + Content=node.content + ) + function_subset_list = [] + + # Creating list function subset + + for func in node.functions: + function_subset = DspyUnoplatFunctionSubset( + Name=func.name, + ReturnType=func.return_type, Annotations=[DspyUnoplatAnnotationSubset(Name=annotation.name,KeyValues=annotation.key_values) for annotation in node.annotations], - Content=node.content - ) - function_subset_list = [] - - # Creating list function subset - - for func in node.functions: - function_subset = DspyUnoplatFunctionSubset( - Name=func.name, - ReturnType=func.return_type, - Annotations=[DspyUnoplatAnnotationSubset(Name=annotation.name,KeyValues=annotation.key_values) for annotation in node.annotations], - LocalVariables=func.local_variables, - Content=func.content, - FunctionCalls=[DspyUnoplatFunctionCallSubset(NodeName=call.node_name, FunctionName=call.function_name, Parameters=call.parameters) for call in func.function_calls]) - function_subset_list.append(function_subset) - - node_subset.functions = function_subset_list - - package_parts = node.package.split('.') - current_package = None + LocalVariables=func.local_variables, + Content=func.content, + FunctionCalls=[DspyUnoplatFunctionCallSubset(NodeName=call.node_name, FunctionName=call.function_name, Parameters=call.parameters) for call in func.function_calls]) + function_subset_list.append(function_subset) + + node_subset.functions = function_subset_list + + package_parts = node.package.split('.') + current_package = unoplat_package_dict + full_package_name = "" + for i, part in enumerate(package_parts): + full_package_name = part if i == 0 else f"{full_package_name}.{part}" + if full_package_name not in current_package: + current_package[full_package_name] = UnoplatPackage(name=full_package_name) + if i == len(package_parts) - 1: + current_package[full_package_name].node_subsets.append(node_subset) + else: + current_package = current_package[full_package_name].sub_packages + - for i, part in enumerate(package_parts): - if i == 0: - if part not in unoplat_package_dict: - unoplat_package_dict[part] = UnoplatPackage() - current_package = unoplat_package_dict[part] - else: - if part not in current_package.sub_packages: - current_package.sub_packages[part] = UnoplatPackage() - current_package = current_package.sub_packages[part] - - - current_package.node_subsets.append(node_subset) except Exception as e: logger.error(f"Error processing node: {e}") diff --git a/unoplat-code-confluence/unoplat_code_confluence/nodeparser/__init__.py b/unoplat-code-confluence/unoplat_code_confluence/markdownparser/__init__.py similarity index 100% rename from unoplat-code-confluence/unoplat_code_confluence/nodeparser/__init__.py rename to unoplat-code-confluence/unoplat_code_confluence/markdownparser/__init__.py diff --git a/unoplat-code-confluence/unoplat_code_confluence/nodeparser/isummariser.py b/unoplat-code-confluence/unoplat_code_confluence/markdownparser/isummariser.py similarity index 100% rename from unoplat-code-confluence/unoplat_code_confluence/nodeparser/isummariser.py rename to unoplat-code-confluence/unoplat_code_confluence/markdownparser/isummariser.py diff --git a/unoplat-code-confluence/unoplat_code_confluence/markdownparser/markdownsummariser.py b/unoplat-code-confluence/unoplat_code_confluence/markdownparser/markdownsummariser.py new file mode 100644 index 0000000..fd0d487 --- /dev/null +++ b/unoplat-code-confluence/unoplat_code_confluence/markdownparser/markdownsummariser.py @@ -0,0 +1,63 @@ +from abc import ABC +from unoplat_code_confluence.data_models.chapi_unoplat_node import Node +from unoplat_code_confluence.data_models.dspy.dspy_unoplat_codebase_summary import DspyUnoplatCodebaseSummary +from unoplat_code_confluence.markdownparser.isummariser import ISummariser +from litellm import completion +from loguru import logger + +class MarkdownSummariser(ISummariser): + def summarise_to_markdown(self, unoplat_codebase_summary: DspyUnoplatCodebaseSummary) -> str: + markdown_output = [] + + # Codebase Summary + markdown_output.append("# Codebase Summary\n") + markdown_output.append(f"**Name:** {unoplat_codebase_summary.codebase_name or 'N/A'}\n") + markdown_output.append(f"**Objective:** {unoplat_codebase_summary.codebase_objective}\n") + markdown_output.append(f"**Summary:** {unoplat_codebase_summary.codebase_summary}\n\n") + + # Package Summaries + markdown_output.append("## Package Summaries\n") + for package_name, package_summary in unoplat_codebase_summary.codebase_package.items(): + markdown_output.append(f"### {package_name}\n") + markdown_output.append(f"**Objective:** {package_summary.package_objective}\n") + markdown_output.append(f"**Summary:** {package_summary.package_summary}\n") + + if package_summary.class_summary: + markdown_output.append("#### Classes\n") + for class_detail in package_summary.class_summary: + markdown_output.append(f"##### {class_detail.node_name}\n") + markdown_output.append(f"**Objective:** {class_detail.node_objective}\n") + if class_detail.node_summary: + markdown_output.append(f"**Summary:** {class_detail.node_summary}\n") + + if class_detail.functions_summary: + markdown_output.append("**Functions:**\n") + for function in class_detail.functions_summary: + markdown_output.append(f"- `{function.function_name}`\n") + markdown_output.append(f" - Objective: {function.function_summary.objective}\n") + markdown_output.append(f" - Implementation: {function.function_summary.implementation_summary.strip().replace('\n', ' ')}\n") + markdown_output.append("\n") + + if package_summary.sub_package_summaries: + markdown_output.append("#### Sub-packages\n") + for sub_package_name,sub_package in package_summary.sub_package_summaries.items(): + markdown_output.append(f"##### {sub_package_name}\n") + markdown_output.append(f"**Objective:** {sub_package.package_objective}\n") + markdown_output.append(f"**Summary:** {sub_package.package_summary}\n") + + if sub_package.class_summary: + markdown_output.append("**Classes:**\n") + for class_detail in sub_package.class_summary: + markdown_output.append(f"- {class_detail.node_name}\n") + markdown_output.append(f" - Objective: {class_detail.node_objective}\n") + if class_detail.functions_summary: + markdown_output.append(" - Functions:\n") + for function in class_detail.functions_summary: + markdown_output.append(f" - `{function.function_name}`\n") + markdown_output.append(f" - Objective: {function.function_summary.objective}\n") + markdown_output.append(f" - Implementation: {function.function_summary.implementation_summary.strip().replace('\n', ' ')}\n") + markdown_output.append("\n") + + markdown_output.append("\n") + + return "\n".join(markdown_output) \ No newline at end of file diff --git a/unoplat-code-confluence/unoplat_code_confluence/nodeparser/markdownsummariser.py b/unoplat-code-confluence/unoplat_code_confluence/nodeparser/markdownsummariser.py deleted file mode 100644 index e85490d..0000000 --- a/unoplat-code-confluence/unoplat_code_confluence/nodeparser/markdownsummariser.py +++ /dev/null @@ -1,41 +0,0 @@ -from abc import ABC -from unoplat_code_confluence.data_models.chapi_unoplat_node import Node -from unoplat_code_confluence.data_models.dspy.dspy_unoplat_codebase_summary import DspyUnoplatCodebaseSummary -from unoplat_code_confluence.nodeparser.isummariser import ISummariser -from litellm import completion -from loguru import logger - -class MarkdownSummariser(ISummariser): - def summarise_to_markdown(self, unoplat_codebase_summary: DspyUnoplatCodebaseSummary) -> str: - markdown_output = [] - - # Codebase Summary - markdown_output.append("# Codebase Summary\n") - markdown_output.append(f"**Objective:**

{unoplat_codebase_summary.codebase_objective}

\n") - markdown_output.append(f"**Summary:**

{unoplat_codebase_summary.codebase_summary}

\n") - markdown_output.append(f"**Name:** {unoplat_codebase_summary.codebase_name or 'N/A'}\n") - - # Package Summaries - markdown_output.append("## Package Summaries\n") - for package_name, DspyUnoplatPackageNodeSummary in unoplat_codebase_summary.codebase_package.package_summary_dict.items(): - markdown_output.append(f"- **Package:** {package_name}\n") - markdown_output.append(f" - **Objective:**

{DspyUnoplatPackageNodeSummary.package_objective}

\n") - markdown_output.append(f" - **Summary:**

{DspyUnoplatPackageNodeSummary.package_summary}

\n") - - markdown_output.append("### Class Summaries\n") - - for class_detail in DspyUnoplatPackageNodeSummary.class_summary: - - markdown_output.append(f"- **{class_detail.node_name}**\n") - markdown_output.append(f" - **Objective:**

{class_detail.node_objective}

\n") - - if class_detail.node_summary: - markdown_output.append(f" - **Summary:**

{class_detail.node_summary}

\n") - - if len(class_detail.functions_summary) > 0: - markdown_output.append("#### Function Summaries\n") - for function in class_detail.functions_summary: - markdown_output.append(f"- **{function.function_name}**\n") - markdown_output.append(f" - **Objective:**

{function.function_summary.objective}

\n") - markdown_output.append(f" - **Implementation:**

{function.function_summary.implementation_summary.strip().replace('\n', ' ')}

\n") - return "\n".join(markdown_output) diff --git a/unoplat-code-confluence/unoplat_code_confluence/summary_parser/codebase_summary.py b/unoplat-code-confluence/unoplat_code_confluence/summary_parser/codebase_summary.py index a29a5c9..9fcdcd9 100644 --- a/unoplat-code-confluence/unoplat_code_confluence/summary_parser/codebase_summary.py +++ b/unoplat-code-confluence/unoplat_code_confluence/summary_parser/codebase_summary.py @@ -1,8 +1,11 @@ -from typing import List +from collections import deque +from typing import Dict, List from unoplat_code_confluence.configuration.external_config import AppConfig from unoplat_code_confluence.data_models.chapi_unoplat_codebase import UnoplatCodebase from unoplat_code_confluence.data_models.chapi_unoplat_package import UnoplatPackage from unoplat_code_confluence.data_models.dspy.dspy_unoplat_codebase_summary import DspyUnoplatCodebaseSummary +from unoplat_code_confluence.data_models.dspy.dspy_unoplat_fs_function_subset import DspyUnoplatFunctionSubset +from unoplat_code_confluence.data_models.dspy.dspy_unoplat_fs_node_subset import DspyUnoplatNodeSubset from unoplat_code_confluence.data_models.dspy.dspy_unoplat_function_summary import DspyUnoplatFunctionSummary from unoplat_code_confluence.data_models.dspy.dspy_unoplat_node_summary import DspyUnoplatNodeSummary from unoplat_code_confluence.data_models.dspy.dspy_unoplat_package_summary import DspyUnoplatPackageSummary @@ -12,16 +15,19 @@ from unoplat_code_confluence.dspy_package_summary import CodeConfluencePackageModule import dspy from loguru import logger +from progiter import ProgIter +from progiter.manager import ProgressManager + class CodebaseSummaryParser: def __init__(self, codebase: UnoplatCodebase, dspy_pipeline_function: CodeConfluenceFunctionModule, dspy_pipeline_class: CodeConfluenceClassModule,dspy_pipeline_package: CodeConfluencePackageModule,dspy_pipeline_codebase: CodeConfluenceCodebaseModule,app_config: AppConfig): self.codebase = codebase - self.dspy_pipeline_function = dspy_pipeline_function - self.dspy_pipeline_class = dspy_pipeline_class - self.dspy_pipeline_package = dspy_pipeline_package - self.dspy_pipeline_codebase = dspy_pipeline_codebase + self.dspy_pipeline_function: CodeConfluenceFunctionModule = dspy_pipeline_function + self.dspy_pipeline_class: CodeConfluenceClassModule = dspy_pipeline_class + self.dspy_pipeline_package: CodeConfluencePackageModule = dspy_pipeline_package + self.dspy_pipeline_codebase: CodeConfluenceCodebaseModule = dspy_pipeline_codebase #TODO: we will be externalise the different llms that can be used at all dspy pipelines and within dspy pipelines once dspy switches to litellm self.init_dspy_lm(app_config.llm_provider_config) @@ -50,17 +56,171 @@ def init_dspy_lm(self,llm_config: dict): dspy.configure(lm=cohere_provider,experimental=True) + def parse_codebase(self) -> DspyUnoplatCodebaseSummary: + + + unoplat_codebase_summary = DspyUnoplatCodebaseSummary() + + root_packages: Dict[str,UnoplatPackage] = self.codebase.packages - def parse_codebase(self) -> DspyUnoplatCodebaseSummary: + root_package_summaries = self.process_packages(root_packages) + + + try: + dspy_codebase_summary = self.dspy_pipeline_codebase(package_objective_dict=root_package_summaries) + except Exception as e: + logger.error(f"Error generating codebase summary: {e}") + logger.exception("Traceback:") + + unoplat_codebase_summary.codebase_summary = dspy_codebase_summary.summary + unoplat_codebase_summary.codebase_objective = dspy_codebase_summary.answer + unoplat_codebase_summary.codebase_package = root_package_summaries + + #todo: pydantic out to a file of unoplat codebase summary + return unoplat_codebase_summary + + def count_total_packages(self, packages: Dict[str, UnoplatPackage]) -> int: + total = 0 + stack = list(packages.values()) + while stack: + package = stack.pop() + total += 1 + stack.extend(package.sub_packages.values()) + return total + + def process_packages(self, packages: Dict[str,UnoplatPackage]) -> Dict[str,DspyUnoplatPackageSummary]: + package_summaries: Dict[str, DspyUnoplatPackageSummary] = {} + stack = deque([(name, package, True) for name, package in packages.items()]) + processed = set() + memo = {} + + total_packages = self.count_total_packages(packages) + + + pman = ProgressManager(backend='rich') + + with pman: + outer_prog = pman.progiter(range(total_packages), desc='Processing packages', verbose=2) + outer_prog.begin() + + while stack: + package_name, package, is_root = stack.pop() + logger.debug("Current package popped from stack: {}",package_name) + + if package_name in processed: + continue + + sub_package_summaries: Dict[str, DspyUnoplatPackageSummary] = {} + all_sub_packages_processed = True + + for sub_name, sub_package in package.sub_packages.items(): + + + if sub_name not in processed: + stack.append((package_name, package, is_root)) + logger.debug("Adding current package {} to stack",package_name) + stack.append((sub_name, sub_package, False)) + logger.debug("Adding sub package {} to stack",sub_name) + all_sub_packages_processed = False + break + + if not all_sub_packages_processed: + continue + + # Process current package + if package_name in memo: + package_summary = memo[package_name] + else: + class_summaries = self.process_classes(package.node_subsets,package_name,pman=pman) + for sub_name in package.sub_packages: + if sub_name in memo: + logger.debug("Sub package {} already processed, adding to sub_package_summaries",sub_name) + sub_package_summaries[sub_name] = memo[sub_name] + + try: + logger.debug("Generating package summary for {}",package_name) + package_summary = self.dspy_pipeline_package( + package_name=package_name, + class_objective_list=class_summaries, + sub_package_summaries=sub_package_summaries + ).answer + + package_summary_object = DspyUnoplatPackageSummary( + package_objective=package_summary.package_objective, + package_summary=package_summary.package_summary, + class_summary=class_summaries, + sub_package_summaries=sub_package_summaries + ) + memo[package_name] = package_summary_object + except Exception as e: + logger.error(f"Error generating package summary for {package_name}: {e}") + logger.exception("Traceback:") + continue + + if is_root: + logger.debug("Adding root package {} to package_summaries",package_name) + package_summaries[package_name] = package_summary_object + + processed.add(package_name) + outer_prog.update(1) + + return package_summaries + + + def process_classes(self, classes: List[DspyUnoplatNodeSubset],package_name: str,pman: ProgressManager) -> List[DspyUnoplatNodeSummary]: + class_summaries: List[DspyUnoplatNodeSummary] = [] + + class_prog = pman.progiter(iterable = classes, desc=f"Processing classes of {package_name}", verbose=2,total=len(classes)) + + for node in class_prog: + function_summaries = self.process_functions(node.functions,node,pman=pman) + + try: + class_summary = self.dspy_pipeline_class(class_metadata=node, function_objective_summary=function_summaries).answer + class_summaries.append(class_summary) + except Exception as e: + logger.error(f"Error generating class summary for {node}: {e}") + logger.exception("Traceback:") + + + return class_summaries + + + def process_functions(self,functions: List[DspyUnoplatFunctionSubset],node: DspyUnoplatNodeSubset,pman: ProgressManager) -> List[DspyUnoplatFunctionSummary]: + function_summaries: List[DspyUnoplatFunctionSummary] = [] + + + + function_prog = pman.progiter(iterable =functions, desc=f"Processing functions of {node.node_name}", verbose=2,total=len(functions)) + + + for function in function_prog: + if function.name is not None: + try: + function_summary = self.dspy_pipeline_function(function_metadata=function,class_metadata=node).answer + dspyUnoplatFunctionSummary: DspyUnoplatFunctionSummary = DspyUnoplatFunctionSummary(FunctionName=function.name,FunctionSummary=function_summary) + function_summaries.append(dspyUnoplatFunctionSummary) + function_prog.update(1) + except Exception as e: + logger.error(f"Error generating function summary for {function.name}: {e}") + logger.exception("Traceback:") + + + + return function_summaries + + + + def old_parse_codebase(self) -> DspyUnoplatCodebaseSummary: unoplat_codebase_summary: DspyUnoplatCodebaseSummary = DspyUnoplatCodebaseSummary() unoplat_packages :UnoplatPackage = self.codebase.packages unoplat_package_summary: DspyUnoplatPackageSummary = DspyUnoplatPackageSummary() - for package_name, list_node_subset in unoplat_packages.package_dict.items(): + for package_name, unoplat_package in unoplat_packages.package_dict.items(): class_summaries: List[DspyUnoplatNodeSummary] = [] - for node in list_node_subset: + for node in unoplat_package.node_subsets: function_summaries :List[DspyUnoplatFunctionSummary] = [] for function in node.functions: