diff --git a/CHANGELOG.md b/CHANGELOG.md index acfc1c88d..f6fb06130 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Fixed +## [0.38.0] + +### Added +- Added a new tagging filter `RT.AllTagFilter` to `RT.find_tags`, which requires all tags to be present in a chunk. +- Added an option in `RT.get_keywords` to set the minimum length of the keywords. +- Added a new method for `reciprocal_rank_fusion` and utility for standardizing candidate chunk scores (`score_to_unit_scale`). + ## [0.37.1] ### Fixed diff --git a/Project.toml b/Project.toml index 9709a74ec..a40d82d6b 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "PromptingTools" uuid = "670122d1-24a8-4d70-bfce-740807c42192" authors = ["J S @svilupp and contributors"] -version = "0.37.1" +version = "0.38.0" [deps] AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c" diff --git a/ext/SnowballPromptingToolsExt.jl b/ext/SnowballPromptingToolsExt.jl index dfd290baa..4e0107fbb 100644 --- a/ext/SnowballPromptingToolsExt.jl +++ b/ext/SnowballPromptingToolsExt.jl @@ -17,6 +17,7 @@ RT._stem(stemmer::Snowball.Stemmer, text::AbstractString) = Snowball.stem(stemme stemmer = nothing, stopwords::Set{String} = Set(STOPWORDS), return_keywords::Bool = false, + min_length::Integer = 3, kwargs...) Generate a `DocumentTermMatrix` from a vector of `docs` using the provided `stemmer` and `stopwords`. @@ -27,6 +28,7 @@ Generate a `DocumentTermMatrix` from a vector of `docs` using the provided `stem - `stemmer`: A stemmer to use for stemming. Default is `nothing`. - `stopwords`: A set of stopwords to remove. Default is `Set(STOPWORDS)`. - `return_keywords`: A boolean flag for returning the keywords. Default is `false`. Useful for query processing in search time. +- `min_length`: The minimum length of the keywords. Default is `3`. 
""" function RT.get_keywords( processor::RT.KeywordsProcessor, docs::AbstractVector{<:AbstractString}; @@ -34,6 +36,7 @@ function RT.get_keywords( stemmer = nothing, stopwords::Set{String} = Set(RT.STOPWORDS), return_keywords::Bool = false, + min_length::Integer = 3, kwargs...) ## check if extension is available ext = Base.get_extension(PromptingTools, :RAGToolsExperimentalExt) @@ -47,7 +50,7 @@ function RT.get_keywords( ## Preprocess text into tokens stemmer = !isnothing(stemmer) ? stemmer : Snowball.Stemmer("english") # Single-threaded as stemmer is not thread-safe - keywords = RT.preprocess_tokens(docs, stemmer; stopwords, min_length = 3) + keywords = RT.preprocess_tokens(docs, stemmer; stopwords, min_length) ## Early exit if we only want keywords (search time) return_keywords && return keywords diff --git a/src/Experimental/RAGTools/retrieval.jl b/src/Experimental/RAGTools/retrieval.jl index 6c147328a..4b091a288 100644 --- a/src/Experimental/RAGTools/retrieval.jl +++ b/src/Experimental/RAGTools/retrieval.jl @@ -95,6 +95,13 @@ Finds the chunks that have ANY OF the specified tag(s). """ struct AnyTagFilter <: AbstractTagFilter end +""" + AllTagFilter <: AbstractTagFilter + +Finds the chunks that have ALL OF the specified tag(s). +""" +struct AllTagFilter <: AbstractTagFilter end + ### Functions function rephrase(rephraser::AbstractRephraser, question::AbstractString; kwargs...) throw(ArgumentError("Not implemented yet for type $(typeof(rephraser))")) @@ -501,6 +508,39 @@ function find_tags(method::AnyTagFilter, index::AbstractChunkIndex, return CandidateChunks(index.id, pos, ones(Float32, length(pos))) end +""" + find_tags(method::AllTagFilter, index::AbstractChunkIndex, + tag::Union{AbstractString, Regex}; kwargs...) + + find_tags(method::AllTagFilter, index::AbstractChunkIndex, + tags::Vector{T}; kwargs...) where {T <: Union{AbstractString, Regex}} + +Finds the indices of chunks (represented by tags in `index`) that have ALL OF the specified `tag` or `tags`. 
"""
    find_tags(method::AllTagFilter, index::AbstractChunkIndex,
        tag::Union{AbstractString, Regex}; kwargs...)

    find_tags(method::AllTagFilter, index::AbstractChunkIndex,
        tags::Vector{T}; kwargs...) where {T <: Union{AbstractString, Regex}}

Finds the indices of chunks (represented by tags in `index`) that have ALL OF the specified `tag` or `tags`.

String tags are compared for equality against `tags_vocab(index)`. A `Regex` tag selects
every vocabulary entry it occurs in, and ALL of the selected entries must be set for a chunk.

# Returns
- `CandidateChunks` holding the matching row positions of `tags(index)`, each with score `1.0`.
  The result is empty when `index` has no tags, when `tags` is empty, or when any requested
  tag matches nothing in the vocabulary (a tag that does not exist cannot be present in a chunk).
"""
function find_tags(method::AllTagFilter, index::AbstractChunkIndex,
        tags_vec::Vector{T}; kwargs...) where {T <: Union{AbstractString, Regex}}
    ## The explicit `return` matters: without it the guard's value is discarded
    ## and execution continues into code that assumes tags are available
    isnothing(tags(index)) && return CandidateChunks(; index_id = index.id)
    vocab = tags_vocab(index)
    tag_idx = Int[]
    for tag in tags_vec
        hits = if tag isa AbstractString
            findall(==(tag), vocab)
        else # it's a regex
            findall(entry -> occursin(tag, entry), vocab)
        end
        ## A requested tag missing from the vocabulary cannot be satisfied by any chunk,
        ## so the ALL-OF condition fails for the whole index
        isempty(hits) && return CandidateChunks(index.id, Int[], Float32[])
        append!(tag_idx, hits)
    end
    ## get rows with all values true
    match_row_idx = if !isempty(tag_idx)
        reduce(.&, eachcol(@view(tags(index)[:, tag_idx]))) |> findall
    else
        Int[]
    end
    return CandidateChunks(index.id, match_row_idx, ones(Float32, length(match_row_idx)))
end
## Single-tag convenience method: wraps the tag in a vector and defers to the method above
function find_tags(method::AllTagFilter, index::AbstractChunkIndex,
        tag::Union{AbstractString, Regex}; kwargs...)
    return find_tags(method, index, [tag]; kwargs...)
end
"""
    reciprocal_rank_fusion(
        positions1::AbstractVector{<:Integer}, scores1::AbstractVector{<:T},
        positions2::AbstractVector{<:Integer},
        scores2::AbstractVector{<:T}; k::Int = 60) where {T <: Real}

Merges two sets of rankings and their joint scores. Each entry contributes
`score / (k + rank)` to its position, where `rank` is the 1-based place of the entry in its
own list; contributions for a position that appears in both lists are summed.

# Arguments
- `positions1`, `positions2`: positions (e.g., chunk ids) in ranked order.
- `scores1`, `scores2`: scores paired element-wise with the corresponding positions.
  If a positions/scores pair differs in length, the surplus elements are ignored.
- `k`: constant added to the rank before discounting. Default is `60`.

# Returns
- `merged`: positions sorted by their fused score, highest first.
- `scores`: `Dict` mapping each position to its fused score. Values are stored in the
  floating-point type of the promoted score element types, so integer scores are supported.

# Example
```julia
positions1 = [1, 3, 5, 7, 9]
scores1 = [0.9, 0.8, 0.7, 0.6, 0.5]
positions2 = [2, 4, 6, 8, 10]
scores2 = [0.5, 0.6, 0.7, 0.8, 0.9]

merged, scores = reciprocal_rank_fusion(positions1, scores1, positions2, scores2; k = 60)
```
"""
function reciprocal_rank_fusion(
        positions1::AbstractVector{<:Integer}, scores1::AbstractVector{<:T},
        positions2::AbstractVector{<:Integer},
        scores2::AbstractVector{<:T}; k::Int = 60) where {T <: Real}
    ## Division always yields a floating-point value, so store floats even for integer scores
    S = float(promote_type(eltype(scores1), eltype(scores2)))
    scores = Dict{Int, S}()

    for (positions, raw_scores) in ((positions1, scores1), (positions2, scores2))
        for (rank, (pos, sc)) in enumerate(zip(positions, raw_scores))
            scores[pos] = get(scores, pos, zero(S)) + sc / (k + rank)
        end
    end

    merged = [first(item) for item in sort(collect(scores), by = last, rev = true)]

    return merged, scores
end
"""
    score_to_unit_scale(x::AbstractVector{T}) where T<:Real

Shift and scale a vector of scores to the unit scale [0, 1].

The minimum of `x` maps to `0` and the maximum to `1`. If all values are (numerically)
identical, i.e., the spread is below machine epsilon, every score maps to `1`.
Integer inputs are converted to floating point. An empty input returns an empty vector.

# Example
```julia
x = [1.0, 2.0, 3.0, 4.0, 5.0]
scaled_x = score_to_unit_scale(x)
```
"""
function score_to_unit_scale(x::AbstractVector{T}) where {T <: Real}
    ## `eps` is only defined for floating-point types, so work in the float counterpart of `T`
    F = float(T)
    isempty(x) && return F[]
    lo, hi = extrema(x)
    span = F(hi) - F(lo)
    ## Degenerate spread: avoid dividing by (almost) zero
    span < eps(F) && return ones(F, length(x))
    ## No epsilon in the denominator -- the guard above already excludes a zero spread,
    ## and padding it would keep the maximum from reaching exactly 1
    return (F.(x) .- F(lo)) ./ span
end
@testset "score_to_unit_scale" begin
    # Inputs with a non-degenerate spread must land exactly on the unit interval
    for scores in (
        [1.0, 2.0, 3.0, 4.0, 5.0],       # ordinary range of values
        [-5.0, -4.0, -3.0, -2.0, -1.0],  # range of negative values
        [-1.0, 0.0, 1.0]                 # mix of positive and negative values
    )
        @test extrema(score_to_unit_scale(scores)) == (0.0, 1.0)
    end

    # Inputs without any spread collapse to all ones
    @test all(score_to_unit_scale([2.0, 2.0, 2.0, 2.0, 2.0]) .== 1.0)  # all values the same
    @test score_to_unit_scale([3.0]) == [1.0]                          # single value
end