Skip to content

Commit

Permalink
refactor: reduce the impacts of common files
Browse files Browse the repository at this point in the history
  • Loading branch information
williamfzc committed Dec 4, 2023
1 parent 0e3644f commit 7860726
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 2 deletions.
10 changes: 10 additions & 0 deletions srctag/tagger.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,9 @@ def top_n_tags(self, file_name: str, n: int) -> typing.List[str]:
def top_n_files(self, tag_name: str, n: int) -> typing.List[str]:
    """Return the labels of the ``n`` largest entries for a tag.

    Args:
        tag_name: Tag passed through to ``files_by_tag``.
        n: Maximum number of labels to return.

    Returns:
        Index labels of the ``n`` largest values in
        ``files_by_tag(tag_name)``, largest first (presumably file
        names -- confirm against ``files_by_tag``).
    """
    scores_for_tag = self.files_by_tag(tag_name)
    best_entries = scores_for_tag.nlargest(n)
    return best_entries.index.tolist()

def normalize_score(self, origin: float) -> float:
    """Scale a raw score by the number of files in this result.

    Args:
        origin: Raw score to scale.

    Returns:
        ``origin`` divided by ``len(self.files())``, or ``0.0`` when the
        result holds no files.
    """
    # NOTE(review): raw scores appear to be reversed ranks, which would put
    # the scaled value in [0, 1] -- confirm against the ranking step.
    file_count = len(self.files())
    if file_count == 0:
        # An empty result has nothing to scale against; returning 0.0
        # avoids a ZeroDivisionError.
        return 0.0
    return origin / file_count


class TaggerConfig(BaseSettings):
tags: typing.Set[str] = set()
Expand Down Expand Up @@ -107,6 +110,13 @@ def tag(self, storage: Storage) -> TagResult:
# END tag_results

scores_df = pd.DataFrame.from_dict(ret, orient="index")

# reduce the impacts of common files
row_variances = scores_df.var(axis=1)
max_variance = row_variances.max()
weights = 1.0 - row_variances / max_variance
scores_df = scores_df.multiply(weights, axis=0)

# convert score matrix into rank (use reversed rank as score). because:
# 1. score/distance is meaningless to users
# 2. can not be evaluated both rows and cols
Expand Down
8 changes: 6 additions & 2 deletions tests/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,10 +72,14 @@ def test_query(setup_tagger):

tags_series = tag_result.tags_by_file("examples/write.py")
assert len(tags_series) == len(all_tags)
tags_series = tags_series[:5]
assert len(tags_series) == 5
for k, v in tags_series.items():
logger.info(f"tag: {k}, score: {v}")
normalize_score = tag_result.normalize_score(v)
logger.info(f"tag: {k}, score: {normalize_score}")

files_series = tag_result.files_by_tag("example")
assert len(files_series) > 10
for k, v in files_series.items():
logger.info(f"file: {k}, score: {v}")
normalize_score = tag_result.normalize_score(v)
logger.info(f"file: {k}, score: {normalize_score}")

0 comments on commit 7860726

Please sign in to comment.