diff --git a/srctag/storage.py b/srctag/storage.py
index 90b61f0..03768e9 100644
--- a/srctag/storage.py
+++ b/srctag/storage.py
@@ -1,4 +1,4 @@
-import os
+import json
 import re
 import typing
 
@@ -7,9 +7,11 @@
 from chromadb.api.models.Collection import Collection
 from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
 from loguru import logger
+from networkx import Graph
 from pydantic import BaseModel
 from pydantic_settings import BaseSettings
 from tqdm import tqdm
+import networkx as nx
 
 from srctag.model import FileContext, RuntimeContext, SrcTagException
@@ -24,6 +26,8 @@ class MetadataConstant(object):
     KEY_SOURCE = "source"
     KEY_COMMIT_SHA = "commit_sha"
     KEY_DATA_TYPE = "data_type"
+    KEY_ISSUE_ID = "issue_id"
+    KEY_TAG = "tag"
 
     DATA_TYPE_COMMIT_MSG = "commit_msg"
     DATA_TYPE_ISSUE = "issue"
@@ -40,10 +44,22 @@ class StorageConfig(BaseSettings):
     # issue regex for matching issue grammar
     # by default, we use GitHub standard
     issue_regex: str = r"(#\d+)"
+    # content mapping, used to avoid repeated I/O
+    # "#11" -> "content for #11"
     issue_mapping: typing.Dict[str, str] = dict()
 
     data_types: typing.Set[str] = {MetadataConstant.DATA_TYPE_COMMIT_MSG, MetadataConstant.DATA_TYPE_ISSUE}
 
+    def load_issue_mapping_from_gh_json_file(self, gh_json_file: str):
+        with open(gh_json_file) as f:
+            content = json.load(f)
+        assert isinstance(content, list), "not a valid issue dump"
+
+        for each in content:
+            sharp_id = f'#{each["number"]}'
+            self.issue_mapping[sharp_id] = each["title"]
+        logger.info(f"loaded {len(content)} issues from {gh_json_file}")
+
 
 class Storage(object):
     def __init__(self, config: StorageConfig = None):
@@ -53,6 +69,7 @@ def __init__(self, config: StorageConfig = None):
 
         self.chromadb: typing.Optional[API] = None
         self.chromadb_collection: typing.Optional[Collection] = None
+        self.relation_graph: Graph = nx.Graph()
 
     def init_chroma(self):
         if self.chromadb and self.chromadb_collection:
@@ -85,7 +102,7 @@ def process_commit_msg(self, file: FileContext, collection: Collection):
                     MetadataConstant.KEY_COMMIT_SHA: str(each.hexsha),
                     MetadataConstant.KEY_DATA_TYPE: MetadataConstant.DATA_TYPE_COMMIT_MSG,
                 },
-                id=f"{file.name}|{each.hexsha}|{MetadataConstant.DATA_TYPE_COMMIT_MSG}"
+                id=f"{MetadataConstant.DATA_TYPE_COMMIT_MSG}|{file.name}|{each.hexsha}"
             )
             targets.append(item)
@@ -107,30 +124,31 @@ def process_issue(self, file: FileContext, collection: Collection):
         targets = []
         for each in file.commits:
             issue_id_list = regex.findall(each.message)
-            issue_contents = []
-            for each_issue in issue_id_list:
-                each_issue_content = self.process_issue_id_to_title(each_issue)
+            for each_issue_id in issue_id_list:
+                each_issue_content = self.process_issue_id_to_title(each_issue_id)
                 if not each_issue_content:
                     continue
-                issue_contents.append(each_issue_content)
-            # END issue loop
-            if not issue_contents:
-                continue
-            item = StorageDoc(
-                document=os.sep.join(issue_contents),
-                metadata={
-                    MetadataConstant.KEY_SOURCE: file.name,
-                    MetadataConstant.KEY_COMMIT_SHA: str(each.hexsha),
-                    MetadataConstant.KEY_DATA_TYPE: MetadataConstant.DATA_TYPE_ISSUE,
-                },
-                id=f"{file.name}|{each.hexsha}|{MetadataConstant.DATA_TYPE_ISSUE}"
-            )
-            targets.append(item)
+                item = StorageDoc(
+                    document=each_issue_content,
+                    metadata={
+                        MetadataConstant.KEY_ISSUE_ID: each_issue_id,
+                        MetadataConstant.KEY_DATA_TYPE: MetadataConstant.DATA_TYPE_ISSUE,
+                    },
+                    id=f"{MetadataConstant.DATA_TYPE_ISSUE}|{each_issue_id}"
+                )
+                targets.append(item)
+
+                # record the issue <-> file relation in the graph
+                self.relation_graph.add_node(each_issue_id, node_type=MetadataConstant.KEY_ISSUE_ID)
+                self.relation_graph.add_node(file.name, node_type=MetadataConstant.KEY_SOURCE)
+                self.relation_graph.add_edge(each_issue_id, file.name)
+
+            # END issue loop
         # END commit loop
 
         for each in targets:
-            collection.add(
+            collection.upsert(
                 documents=[each.document],
                 metadatas=[each.metadata],
                 ids=[each.id],
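Note on load_issue_mapping_from_gh_json_file: the method only reads the
"number" and "title" fields of each entry, so any JSON array in that shape
works, e.g. a dump produced by something like `gh issue list --json
number,title` (the exact command is an assumption, not part of this patch).
A minimal sketch of feeding such a dump into StorageConfig, with made-up
issue numbers and titles:

    import json
    import tempfile

    from srctag.storage import Storage, StorageConfig

    # illustrative dump in the expected shape
    issues = [
        {"number": 11, "title": "fix: handle empty commit messages"},
        {"number": 12, "title": "feat: export tag scores as csv"},
    ]
    with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as f:
        json.dump(issues, f)

    config = StorageConfig()
    config.load_issue_mapping_from_gh_json_file(f.name)
    # config.issue_mapping is now {"#11": "fix: ...", "#12": "feat: ..."}
    storage = Storage(config=config)
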
diff --git a/srctag/tagger.py b/srctag/tagger.py
index 295f717..34b58a8 100644
--- a/srctag/tagger.py
+++ b/srctag/tagger.py
@@ -95,19 +95,19 @@ def __init__(self, config: TaggerConfig = None):
             config = TaggerConfig()
         self.config = config
 
-    def tag(self, storage: Storage) -> TagResult:
-        storage.init_chroma()
+    def tag_with_commit(self, storage: Storage) -> TagResult:
         doc_count = storage.chromadb_collection.count()
         n_results = int(doc_count * self.config.n_percent)
 
-        logger.info(f"start tagging source files ...")
         tag_results = []
+        relation_graph = storage.relation_graph.copy()
         for each_tag in tqdm(self.config.tags):
             query_result: QueryResult = storage.chromadb_collection.query(
                 query_texts=each_tag,
                 n_results=n_results,
                 include=["metadatas", "distances"],
+                where={MetadataConstant.KEY_DATA_TYPE: MetadataConstant.DATA_TYPE_COMMIT_MSG}
             )
 
             metadatas: typing.List[Metadata] = query_result["metadatas"][0]
@@ -137,6 +137,77 @@ def tag(self, storage: Storage) -> TagResult:
             else:
                 # has been touched by other commits, merge
                 each_file_tag_result[each_tag] += each_score
+
+            # update graph
+            relation_graph.add_node(each_tag, node_type=MetadataConstant.KEY_TAG)
+            relation_graph.add_edge(each_tag, each_file_name)
         # END tag_results
 
+        scores_df = pd.DataFrame.from_dict(ret, orient="index")
+        if self.config.optimize:
+            scores_df = self.optimize(scores_df)
+
+        # convert the score matrix into ranks (use the reversed rank as the score), because:
+        # 1. raw scores/distances are meaningless to users
+        # 2. raw scores cannot be compared across both rows and columns
+        scores_df = scores_df.rank(axis=0, method='min')
+
+        if self.config.normalize:
+            scores_df = (scores_df - scores_df.min()) / (scores_df.max() - scores_df.min())
+
+        logger.info("tag finished")
+        # update relation graph in storage
+        storage.relation_graph = relation_graph
+
+        return TagResult(scores_df=scores_df)
+
+    def tag_with_issue(self, storage: Storage) -> TagResult:
+        doc_count = storage.chromadb_collection.count()
+        n_results = int(doc_count * self.config.n_percent)
+
+        tag_results = []
+        relation_graph = storage.relation_graph.copy()
+        for each_tag in tqdm(self.config.tags):
+            query_result: QueryResult = storage.chromadb_collection.query(
+                query_texts=each_tag,
+                n_results=n_results,
+                include=["metadatas", "distances"],
+                where={MetadataConstant.KEY_DATA_TYPE: MetadataConstant.DATA_TYPE_ISSUE}
+            )
+
+            metadatas: typing.List[Metadata] = query_result["metadatas"][0]
+            # https://github.com/langchain-ai/langchain/blob/master/libs/langchain/langchain/vectorstores/chroma.py
+            # https://stats.stackexchange.com/questions/158279/how-i-can-convert-distance-euclidean-to-similarity-score
+            distances: typing.List[float] = query_result["distances"][0]
+            normalized_scores = [
+                1.0 / (1.0 + x) for x in distances
+            ]
+
+            for each_metadata, each_score in zip(metadatas, normalized_scores):
+                each_issue_id = each_metadata[MetadataConstant.KEY_ISSUE_ID]
+                tag_results.append((each_tag, each_issue_id, each_score))
+            # END issue loop
+        # END tag loop
+
+        ret = dict()
+        for each_tag, each_issue_id, each_score in tag_results:
+            files = storage.relation_graph.neighbors(each_issue_id)
+            for each_file in files:
+                if each_file not in ret:
+                    # has not been touched by other tags
+                    # scores arrive in decreasing order, so insertion order is meaningful
+                    ret[each_file] = OrderedDict()
+                each_file_tag_result = ret[each_file]
+
+                if each_tag not in each_file_tag_result:
+                    each_file_tag_result[each_tag] = each_score
+                else:
+                    # has been touched by other issues, merge
+                    each_file_tag_result[each_tag] += each_score
+
+                # update graph
+                relation_graph.add_node(each_tag, node_type=MetadataConstant.KEY_TAG)
+                relation_graph.add_edge(each_tag, each_issue_id)
+        # END tag_results
+
         scores_df = pd.DataFrame.from_dict(ret, orient="index")
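The issue path scores issues rather than files, then fans each issue's score
out to every file the relation graph connects it to; repeated hits on the
same (file, tag) pair accumulate, just as in the commit path. A toy re-run
of that fan-out loop (graph contents, tag names, and scores are illustrative):

    from collections import OrderedDict

    import networkx as nx

    # relation graph as built by Storage.process_issue:
    # issue nodes are linked to every file whose commits mention them
    g = nx.Graph()
    g.add_edge("#11", "srctag/storage.py")
    g.add_edge("#11", "srctag/tagger.py")

    # one (tag, issue, score) triple as produced by the chromadb query
    tag_results = [("storage", "#11", 0.42)]

    ret = {}
    for each_tag, each_issue_id, each_score in tag_results:
        for each_file in g.neighbors(each_issue_id):
            each_file_tag_result = ret.setdefault(each_file, OrderedDict())
            # merge when the same (file, tag) pair is reached via several issues
            each_file_tag_result[each_tag] = each_file_tag_result.get(each_tag, 0.0) + each_score

    # ret == {"srctag/storage.py": {"storage": 0.42},
    #         "srctag/tagger.py": {"storage": 0.42}}
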
@@ -152,8 +223,21 @@
         scores_df = (scores_df - scores_df.min()) / (scores_df.max() - scores_df.min())
 
         logger.info(f"tag finished")
+        # update relation graph in storage
+        storage.relation_graph = relation_graph
 
         return TagResult(scores_df=scores_df)
 
+    def tag(self, storage: Storage) -> TagResult:
+        logger.info("start tagging source files ...")
+        storage.init_chroma()
+
+        if storage.relation_graph.number_of_nodes():
+            logger.info("tag with issue")
+            return self.tag_with_issue(storage)
+        else:
+            logger.info("tag with commit")
+            return self.tag_with_commit(storage)
+
     def optimize(self, df: pd.DataFrame) -> pd.DataFrame:
         scale_factor = 2.0
         df = np.exp(df * scale_factor)
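With both paths in place, tag() dispatches on the relation graph: a populated
graph (issues were embedded while an issue mapping was available) selects
tag_with_issue, an empty one falls back to tag_with_commit. A rough
end-to-end sketch; the file name and tag list are placeholders, and how the
chromadb collection gets populated is elided:

    from srctag.storage import Storage, StorageConfig
    from srctag.tagger import Tagger

    config = StorageConfig()
    config.load_issue_mapping_from_gh_json_file("issues.json")  # optional

    storage = Storage(config=config)
    storage.init_chroma()
    # ... embed commit messages / issues into storage.chromadb_collection
    #     (e.g. via process_commit_msg / process_issue) ...

    tagger = Tagger()
    tagger.config.tags = ["storage", "query", "graph"]  # illustrative
    result = tagger.tag(storage)

    # relation graph populated -> tag_with_issue, otherwise -> tag_with_commit
    scores = result.scores_df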