From 44450c0226a2798718e83b890a94750ac59bd3ab Mon Sep 17 00:00:00 2001 From: J S <49557684+svilupp@users.noreply.github.com> Date: Mon, 1 Jul 2024 11:02:06 +0100 Subject: [PATCH] Reciprocal Rank Fusion --- CHANGELOG.md | 5 +++ Project.toml | 2 +- src/Experimental/RAGTools/utils.jl | 29 ++++++++++++++++ test/Experimental/RAGTools/utils.jl | 53 ++++++++++++++++++++++++++++- 4 files changed, 87 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5f02cde1e..e3e522f9c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Fixed +## [0.35.0] + +### Added +- Added a utility function `reciprocal_rank_fusion` to RAGTools as a principled way to merge multiple rankings. See `?PromptingTools.Experimental.RAGTools.reciprocal_rank_fusion` for more information. + ## [0.34.0] ### Added diff --git a/Project.toml b/Project.toml index ae08c61db..40a3cc6cc 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "PromptingTools" uuid = "670122d1-24a8-4d70-bfce-740807c42192" authors = ["J S @svilupp and contributors"] -version = "0.34.0" +version = "0.35.0" [deps] AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c" diff --git a/src/Experimental/RAGTools/utils.jl b/src/Experimental/RAGTools/utils.jl index 370746471..1e490aa52 100644 --- a/src/Experimental/RAGTools/utils.jl +++ b/src/Experimental/RAGTools/utils.jl @@ -563,3 +563,32 @@ function unpack_bits(packed_matrix::AbstractMatrix{UInt64}) return output_matrix end + +""" + reciprocal_rank_fusion(args...; k::Int=60) + +Merges multiple rankings and calculates the reciprocal rank score for each chunk (discounted by the inverse of the rank). 
+ +# Example +```julia +positions1 = [1, 3, 5, 7, 9] +positions2 = [2, 4, 6, 8, 10] +positions3 = [2, 4, 6, 11, 12] + +merged_positions, scores = reciprocal_rank_fusion(positions1, positions2, positions3) +``` +""" +function reciprocal_rank_fusion(args...; k::Int = 60) + merged = Vector{Int}() + scores = Dict{Int, Float64}() + + for positions in args + for (idx, pos) in enumerate(positions) + scores[pos] = get(scores, pos, 0.0) + 1.0 / (k + idx) + end + end + + merged = [first(item) for item in sort(collect(scores), by = last, rev = true)] + + return merged, scores +end \ No newline at end of file diff --git a/test/Experimental/RAGTools/utils.jl b/test/Experimental/RAGTools/utils.jl index 99a96a3ee..8924fcebe 100644 --- a/test/Experimental/RAGTools/utils.jl +++ b/test/Experimental/RAGTools/utils.jl @@ -6,7 +6,8 @@ using PromptingTools.Experimental.RAGTools: token_with_boundaries, text_to_trigr using PromptingTools.Experimental.RAGTools: split_into_code_and_sentences using PromptingTools.Experimental.RAGTools: getpropertynested, setpropertynested, merge_kwargs_nested -using PromptingTools.Experimental.RAGTools: pack_bits, unpack_bits, preprocess_tokens +using PromptingTools.Experimental.RAGTools: pack_bits, unpack_bits, preprocess_tokens, + reciprocal_rank_fusion @testset "_check_aiextract_capability" begin @test _check_aiextract_capability("gpt-3.5-turbo") == nothing @@ -548,3 +549,53 @@ end @test_throws ArgumentError RT._stem(nothing, "abc") @test_throws ArgumentError RT._unicode_normalize(nothing) end + +@testset "reciprocal_rank_fusion" begin + # Test with two simple lists + positions, scores = reciprocal_rank_fusion([1, 2, 3], [4, 5, 6]; k = 0) + @test Set(positions) == Set([1, 2, 3, 4, 5, 6]) + @test Set(positions[1:2]) == Set([1, 4]) + @test Set(positions[3:4]) == Set([2, 5]) + @test Set(positions[5:6]) == Set([3, 6]) + @test scores == Dict(1 => 1.0, 2 => 0.5, 3 => 0.3333333333333333, + 4 => 1.0, 5 => 0.5, 6 => 0.3333333333333333) + + # Test with overlapping 
lists + positions, scores = reciprocal_rank_fusion([1, 2, 3], [2, 3, 4]; k = 0) + @test Set(positions) == Set([2, 3, 1, 4]) + @test positions[1] == 2 + @test positions[2] == 1 + @test positions[3] == 3 + @test positions[4] == 4 + + # Higher discount to reward more appearances + positions, scores = reciprocal_rank_fusion([1, 2, 3], [2, 3, 4]; k = 60) + @test Set(positions) == Set([2, 3, 1, 4]) + @test positions[1] == 2 + @test positions[2] == 3 + @test positions[3] == 1 + @test positions[4] == 4 + + # Test with three lists + positions, scores = reciprocal_rank_fusion([1, 2, 3], [2, 3, 4], [3, 4, 5]; k = 0) + @test Set(positions) == Set([3, 2, 4, 1, 5]) + @test positions[1] == 3 + @test positions[2] == 2 + @test positions[3] == 1 + @test positions[4] == 4 + @test positions[5] == 5 + + # Test with empty list + @test reciprocal_rank_fusion([]; k = 0) == ([], Dict{Int, Float64}()) + + # Test with one empty and one non-empty list + @test reciprocal_rank_fusion([], [1, 2, 3]; k = 0) == + ([1, 2, 3], Dict(1 => 1.0, 2 => 0.5, 3 => 0.3333333333333333)) + + # Test with different lengths of lists + positions, scores = reciprocal_rank_fusion([1, 2], [3, 4, 5]; k = 0) + @test Set(positions) == Set([1, 2, 3, 4, 5]) + @test Set(positions[1:2]) == Set([1, 3]) + @test Set(positions[3:4]) == Set([2, 4]) + @test positions[5] == 5 +end \ No newline at end of file