From 68cbd32a6e5c5f00edf65c15bcea1c4b497fcbc4 Mon Sep 17 00:00:00 2001 From: J S <49557684+svilupp@users.noreply.github.com> Date: Sun, 21 Jan 2024 10:14:24 +0000 Subject: [PATCH] Fix separators (#55) --- CHANGELOG.md | 5 +++++ Project.toml | 2 +- src/utils.jl | 7 ++++--- test/utils.jl | 10 ++++++++++ 4 files changed, 20 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index adfb96f71..fae61485a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Fixed +## [0.8.1] + +### Fixed +- Fixed `split_by_length` to not mutate `separators` argument (appeared in RAG use cases where we repeatedly apply splits to different documents) + ## [0.8.0] ### Added diff --git a/Project.toml b/Project.toml index d878ac13f..81f0a16d9 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "PromptingTools" uuid = "670122d1-24a8-4d70-bfce-740807c42192" authors = ["J S @svilupp and contributors"] -version = "0.8.0" +version = "0.8.1" [deps] Base64 = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" diff --git a/src/utils.jl b/src/utils.jl index 967a59c3c..3b8cacc9f 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -151,12 +151,13 @@ chunks = split_by_length(text, [","], max_length=10000) """ function split_by_length(text, separators::Vector{String}; max_length) @assert !isempty(separators) "`separators` can't be empty" - separator = popfirst!(separators) + separators_ = copy(separators) + separator = popfirst!(separators_) chunks = split_by_length(text; separator, max_length) - isempty(separators) && return chunks + isempty(separators_) && return chunks ## Iteratively split by separators - for separator in separators + for separator in separators_ chunks = mapreduce(text_ -> split_by_length(text_; max_length, separator), vcat, chunks) diff --git a/test/utils.jl b/test/utils.jl index 4c1865c8d..04a452d71 100644 --- a/test/utils.jl +++ b/test/utils.jl @@ -67,12 +67,22 @@ end # empty separators text = "Some text without separators." @test_throws AssertionError split_by_length(text, String[], max_length = 10) + # edge cases text = "Short text" separators = ["\n\n", ". ", "\n"] chunks = split_by_length(text, separators, max_length = 50) @test length(chunks) == 1 @test chunks[1] == text + + # do not mutate separators input + text = "Paragraph 1\n\nParagraph 2. Sentence 1. Sentence 2.\nParagraph 3" + separators = ["\n\n", ". ", "\n"] + sep_length = length(separators) + chunks = split_by_length(text, separators, max_length = 20) + chunks = split_by_length(text, separators, max_length = 20) + chunks = split_by_length(text, separators, max_length = 20) + @test length(separators) == sep_length end @testset "extract_handlebar_variables" begin