Skip to content

Commit

Permalink
Fix separators (#55)
Browse files Browse the repository at this point in the history
  • Loading branch information
svilupp authored Jan 21, 2024
1 parent e3ece6f commit 68cbd32
Show file tree
Hide file tree
Showing 4 changed files with 20 additions and 4 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Fixed

## [0.8.1]

### Fixed
- Fixed `split_by_length` so that it no longer mutates the `separators` argument (the bug appeared in RAG use cases where the same separators are repeatedly applied to split different documents)

## [0.8.0]

### Added
Expand Down
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "PromptingTools"
uuid = "670122d1-24a8-4d70-bfce-740807c42192"
authors = ["J S @svilupp and contributors"]
version = "0.8.0"
version = "0.8.1"

[deps]
Base64 = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f"
Expand Down
7 changes: 4 additions & 3 deletions src/utils.jl
Original file line number Diff line number Diff line change
Expand Up @@ -151,12 +151,13 @@ chunks = split_by_length(text, [","], max_length=10000)
"""
function split_by_length(text, separators::Vector{String}; max_length)
@assert !isempty(separators) "`separators` can't be empty"
separator = popfirst!(separators)
separators_ = copy(separators)
separator = popfirst!(separators_)
chunks = split_by_length(text; separator, max_length)

isempty(separators) && return chunks
isempty(separators_) && return chunks
## Iteratively split by separators
for separator in separators
for separator in separators_
chunks = mapreduce(text_ -> split_by_length(text_; max_length, separator),
vcat,
chunks)
Expand Down
10 changes: 10 additions & 0 deletions test/utils.jl
Original file line number Diff line number Diff line change
Expand Up @@ -67,12 +67,22 @@ end
# empty separators
text = "Some text without separators."
@test_throws AssertionError split_by_length(text, String[], max_length = 10)

# edge cases
text = "Short text"
separators = ["\n\n", ". ", "\n"]
chunks = split_by_length(text, separators, max_length = 50)
@test length(chunks) == 1
@test chunks[1] == text

# do not mutate separators input
text = "Paragraph 1\n\nParagraph 2. Sentence 1. Sentence 2.\nParagraph 3"
separators = ["\n\n", ". ", "\n"]
sep_length = length(separators)
chunks = split_by_length(text, separators, max_length = 20)
chunks = split_by_length(text, separators, max_length = 20)
chunks = split_by_length(text, separators, max_length = 20)
@test length(separators) == sep_length
end

@testset "extract_handlebar_variables" begin
Expand Down

0 comments on commit 68cbd32

Please sign in to comment.