From 44450c0226a2798718e83b890a94750ac59bd3ab Mon Sep 17 00:00:00 2001
From: J S <49557684+svilupp@users.noreply.github.com>
Date: Mon, 1 Jul 2024 11:02:06 +0100
Subject: [PATCH] Reciprocal Rank Fusion

---
 CHANGELOG.md                        |  5 +++
 Project.toml                        |  2 +-
 src/Experimental/RAGTools/utils.jl  | 29 ++++++++++++++++
 test/Experimental/RAGTools/utils.jl | 53 ++++++++++++++++++++++++++++-
 4 files changed, 87 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5f02cde1e..e3e522f9c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -10,6 +10,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Fixed
 
+## [0.35.0]
+
+### Added
+- Added a utility function `reciprocal_rank_fusion` to RAGTools as a principled way to merge multiple rankings. See `?PromptingTools.Experimental.RAGTools.reciprocal_rank_fusion` for more information.
+
 ## [0.34.0]
 
 ### Added
diff --git a/Project.toml b/Project.toml
index ae08c61db..40a3cc6cc 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "PromptingTools"
 uuid = "670122d1-24a8-4d70-bfce-740807c42192"
 authors = ["J S @svilupp and contributors"]
-version = "0.34.0"
+version = "0.35.0"
 
 [deps]
 AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"
diff --git a/src/Experimental/RAGTools/utils.jl b/src/Experimental/RAGTools/utils.jl
index 370746471..1e490aa52 100644
--- a/src/Experimental/RAGTools/utils.jl
+++ b/src/Experimental/RAGTools/utils.jl
@@ -563,3 +563,32 @@ function unpack_bits(packed_matrix::AbstractMatrix{UInt64})
 
     return output_matrix
 end
+
+"""
+    reciprocal_rank_fusion(args...; k::Int=60)
+
+Merges multiple rankings into one. Every ranking in `args` that contains an item adds
+`1 / (k + rank)` to that item's score, where `rank` is the item's 1-based index in it.
+
+Returns `(merged, scores)`: item ids sorted by descending score (ties broken by ascending
+id, so the order is deterministic) and a `Dict{Int, Float64}` mapping id => score.
+Throws an `ArgumentError` if `k` is negative.
+
+# Example
+```julia
+merged, scores = reciprocal_rank_fusion([1, 3, 5], [2, 3, 6])  # merged == [3, 1, 2, 5, 6]
+```
+"""
+function reciprocal_rank_fusion(args...; k::Int = 60)
+    k >= 0 || throw(ArgumentError("k must be non-negative, got $k"))
+    scores = Dict{Int, Float64}()
+    for positions in args
+        for (idx, pos) in enumerate(positions)
+            scores[pos] = get(scores, pos, 0.0) + 1.0 / (k + idx)
+        end
+    end
+
+    # Order by score (descending), then id (ascending): ties must not depend on Dict order
+    merged = sort!(collect(keys(scores)); by = id -> (-scores[id], id))
+    return merged, scores
+end
\ No newline at end of file
diff --git a/test/Experimental/RAGTools/utils.jl b/test/Experimental/RAGTools/utils.jl
index 99a96a3ee..8924fcebe 100644
--- a/test/Experimental/RAGTools/utils.jl
+++ b/test/Experimental/RAGTools/utils.jl
@@ -6,7 +6,8 @@ using PromptingTools.Experimental.RAGTools: token_with_boundaries, text_to_trigr
 using PromptingTools.Experimental.RAGTools: split_into_code_and_sentences
 using PromptingTools.Experimental.RAGTools: getpropertynested, setpropertynested,
                                             merge_kwargs_nested
-using PromptingTools.Experimental.RAGTools: pack_bits, unpack_bits, preprocess_tokens
+using PromptingTools.Experimental.RAGTools: pack_bits, unpack_bits, preprocess_tokens,
+                                            reciprocal_rank_fusion
 
 @testset "_check_aiextract_capability" begin
     @test _check_aiextract_capability("gpt-3.5-turbo") == nothing
@@ -548,3 +549,53 @@ end
     @test_throws ArgumentError RT._stem(nothing, "abc")
     @test_throws ArgumentError RT._unicode_normalize(nothing)
 end
+
+@testset "reciprocal_rank_fusion" begin
+    third = 0.3333333333333333  # score of a rank-3 item when k = 0
+
+    # Disjoint rankings: items sharing a rank tie, so each pair is compared as a set
+    ranked, fused = reciprocal_rank_fusion([1, 2, 3], [4, 5, 6]; k = 0)
+    @test Set(ranked) == Set(1:6)
+    for (span, tied) in zip((1:2, 3:4, 5:6), ([1, 4], [2, 5], [3, 6]))
+        @test Set(ranked[span]) == Set(tied)
+    end
+    @test fused ==
+          Dict(1 => 1.0, 2 => 0.5, 3 => third, 4 => 1.0, 5 => 0.5, 6 => third)
+
+    # Overlapping rankings, k = 0: item 1's top spot outweighs item 3's two lower ones
+    ranked, fused = reciprocal_rank_fusion([1, 2, 3], [2, 3, 4]; k = 0)
+    @test Set(ranked) == Set([2, 3, 1, 4])
+    for (rank, expected) in enumerate([2, 1, 3, 4])
+        @test ranked[rank] == expected
+    end
+
+    # Same rankings with k = 60: ranks are flattened, so appearing twice now wins
+    ranked, fused = reciprocal_rank_fusion([1, 2, 3], [2, 3, 4]; k = 60)
+    @test Set(ranked) == Set([2, 3, 1, 4])
+    for (rank, expected) in enumerate([2, 3, 1, 4])
+        @test ranked[rank] == expected
+    end
+
+    # Three partially overlapping rankings
+    ranked, fused = reciprocal_rank_fusion([1, 2, 3], [2, 3, 4], [3, 4, 5]; k = 0)
+    @test Set(ranked) == Set([3, 2, 4, 1, 5])
+    for (rank, expected) in enumerate([3, 2, 1, 4, 5])
+        @test ranked[rank] == expected
+    end
+
+    # A single empty ranking produces empty outputs
+    nothing_ranked = reciprocal_rank_fusion([]; k = 0)
+    @test nothing_ranked == ([], Dict{Int, Float64}())
+
+    # An empty ranking next to a non-empty one contributes nothing
+    only_second = reciprocal_rank_fusion([], [1, 2, 3]; k = 0)
+    @test only_second == ([1, 2, 3], Dict(1 => 1.0, 2 => 0.5, 3 => third))
+
+    # Rankings of different lengths: the unmatched last item of the longer one ranks last
+    ranked, fused = reciprocal_rank_fusion([1, 2], [3, 4, 5]; k = 0)
+    @test Set(ranked) == Set(1:5)
+    for (span, tied) in zip((1:2, 3:4), ([1, 3], [2, 4]))
+        @test Set(ranked[span]) == Set(tied)
+    end
+    @test ranked[5] == 5
+end
\ No newline at end of file