From 0f1a334c14effe8c9df90fc117155abd2db5acd9 Mon Sep 17 00:00:00 2001
From: J S <49557684+svilupp@users.noreply.github.com>
Date: Tue, 23 Jul 2024 18:55:07 +0100
Subject: [PATCH] Register Llama3.1 + minor retrieval improvements

---
 CHANGELOG.md                           | 10 ++++
 Project.toml                           |  2 +-
 src/Experimental/RAGTools/retrieval.jl |  8 +++-
 src/Experimental/RAGTools/types.jl     | 26 ++++++----
 src/user_preferences.jl                | 66 ++++++++++++++++++++++----
 5 files changed, 91 insertions(+), 21 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0efb9b821..3ea5f0f06 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -10,6 +10,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Fixed
 
+## [0.42.0]
+
+### Added
+- Registered the new Meta Llama 3.1 models hosted on GroqCloud and Together.ai (e.g., the Groq-hosted `gllama370` now points to the latest available model, and the 405b model has the new alias `gllama3405`). Because those names are quite clunky, I've also added size-based abbreviations small/medium/large (i.e., 8b/70b/405b): `gls`/`glm`/`gll` for Llama 3.1 hosted on GroqCloud and, similarly, `tls`/`tlm`/`tll` for Llama 3.1 on Together.ai.
+- Generic model aliases for the Llama 3 models on Groq and Together.ai have been updated to point to the latest available models (Llama 3.1).
+- Added the Gemma 2 9b model hosted on GroqCloud to the model registry (alias `ggemma9`).
+
+### Updated
+- Minor optimizations to `SubDocumentTermMatrix` to reduce memory allocations and improve performance.
+
 ## [0.41.0]
 
 ### Added
diff --git a/Project.toml b/Project.toml
index 2147675a9..d372d0878 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "PromptingTools"
 uuid = "670122d1-24a8-4d70-bfce-740807c42192"
 authors = ["J S @svilupp and contributors"]
-version = "0.41.0"
+version = "0.42.0"
 
 [deps]
 AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"
diff --git a/src/Experimental/RAGTools/retrieval.jl b/src/Experimental/RAGTools/retrieval.jl
index 301841776..11bedc57d 100644
--- a/src/Experimental/RAGTools/retrieval.jl
+++ b/src/Experimental/RAGTools/retrieval.jl
@@ -230,7 +230,9 @@ function find_closest(
         finder::AbstractSimilarityFinder, index::AbstractChunkIndex,
         query_emb::AbstractVector{<:Real}, query_tokens::AbstractVector{<:AbstractString} = String[];
         top_k::Int = 100, kwargs...)
-    isnothing(chunkdata(index)) && return CandidateChunks(; index_id = indexid(index))
+    if isnothing(chunkdata(parent(index)))
+        return CandidateChunks(; index_id = indexid(index))
+    end
     positions, scores = find_closest(finder, chunkdata(index),
         query_emb, query_tokens;
         top_k, kwargs...)
@@ -244,7 +246,9 @@ function find_closest(
         finder::AbstractSimilarityFinder, index::AbstractChunkIndex,
         query_emb::AbstractMatrix{<:Real}, query_tokens::AbstractVector{<:AbstractVector{<:AbstractString}} = Vector{Vector{String}}();
         top_k::Int = 100, kwargs...)
-    isnothing(chunkdata(index)) && CandidateChunks(; index_id = indexid(index))
+    if isnothing(chunkdata(parent(index)))
+        return CandidateChunks(; index_id = indexid(index))
+    end
     ## reduce top_k since we have more than one query
     top_k_ = top_k ÷ size(query_emb, 2)
     ## simply vcat together (gets sorted from the highest similarity to the lowest)
diff --git a/src/Experimental/RAGTools/types.jl b/src/Experimental/RAGTools/types.jl
index 942524535..9582a28bf 100644
--- a/src/Experimental/RAGTools/types.jl
+++ b/src/Experimental/RAGTools/types.jl
@@ -6,7 +6,8 @@ Base.parent(index::AbstractDocumentIndex) = index
 indexid(index::AbstractDocumentIndex) = index.id
 chunkdata(index::AbstractChunkIndex) = index.chunkdata
 "Access chunkdata for a subset of chunks, `chunk_idx` is a vector of chunk indices in the index"
-function chunkdata(index::AbstractChunkIndex, chunk_idx::AbstractVector{<:Integer})
+Base.@propagate_inbounds function chunkdata(
+        index::AbstractChunkIndex, chunk_idx::AbstractVector{<:Integer})
     ## We need this accessor because different chunk indices can have chunks in different dimensions!!
     chkdata = chunkdata(index)
     if isnothing(chkdata)
@@ -209,7 +210,7 @@ tf(dtm::SubDocumentTermMatrix) = dtm.tf
 vocab(dtm::SubDocumentTermMatrix) = Base.parent(dtm) |> vocab
 vocab_lookup(dtm::SubDocumentTermMatrix) = Base.parent(dtm) |> vocab_lookup
 idf(dtm::SubDocumentTermMatrix) = Base.parent(dtm) |> idf
-function doc_rel_length(dtm::SubDocumentTermMatrix)
+Base.@propagate_inbounds function doc_rel_length(dtm::SubDocumentTermMatrix)
     view(doc_rel_length(Base.parent(dtm)), positions(dtm))
 end
 # hcat for SubDocumentTermMatrix does not make sense -> the vocabulary is the same / shared
@@ -227,6 +228,7 @@ Base.@propagate_inbounds function Base.view(
         throw(BoundsError(tf_mat, max_pos))
     end
     ## computations on top of views of sparse arrays are expensive, materialize the view
+    ## Moreover, nonzeros and rowvals accessors for SparseMatrixCSC are not defined for views
     tf_ = tf_mat[doc_idx, :]
     SubDocumentTermMatrix(dtm, tf_, collect(doc_idx))
 end
@@ -315,7 +317,8 @@ end
 
 HasKeywords(::ChunkKeywordsIndex) = true
 "Access chunkdata for a subset of chunks, `chunk_idx` is a vector of chunk indices in the index"
-function chunkdata(index::ChunkKeywordsIndex, chunk_idx::AbstractVector{<:Integer})
+Base.@propagate_inbounds function chunkdata(
+        index::ChunkKeywordsIndex, chunk_idx::AbstractVector{<:Integer})
     chkdata = index.chunkdata
     if isnothing(chkdata)
         return nothing
@@ -437,13 +440,18 @@ Base.parent(index::SubChunkIndex) = index.parent
 HasEmbeddings(index::SubChunkIndex) = HasEmbeddings(parent(index))
 HasKeywords(index::SubChunkIndex) = HasKeywords(parent(index))
 
-chunks(index::SubChunkIndex) = view(chunks(parent(index)), positions(index))
-sources(index::SubChunkIndex) = view(sources(parent(index)), positions(index))
-function chunkdata(index::SubChunkIndex)
-    chkdata = chunkdata(parent(index), positions(index))
+Base.@propagate_inbounds function chunks(index::SubChunkIndex)
+    view(chunks(parent(index)), positions(index))
+end
+Base.@propagate_inbounds function sources(index::SubChunkIndex)
+    view(sources(parent(index)), positions(index))
+end
+Base.@propagate_inbounds function chunkdata(index::SubChunkIndex)
+    chunkdata(parent(index), positions(index))
 end
 "Access chunkdata for a subset of chunks, `chunk_idx` is a vector of chunk indices in the index"
-function chunkdata(index::SubChunkIndex, chunk_idx::AbstractVector{<:Integer})
+Base.@propagate_inbounds function chunkdata(
+        index::SubChunkIndex, chunk_idx::AbstractVector{<:Integer})
     ## We need this accessor because different chunk indices can have chunks in different dimensions!!
     index_chunk_idx = translate_positions_to_parent(index, chunk_idx)
     pos = intersect(positions(index), index_chunk_idx)
@@ -501,7 +509,7 @@ Translate positions to the parent index. Useful to convert between positions in
 
 Used whenever a `chunkdata()` or `tags()` are used to re-align positions to the "parent" index.
 """
-function translate_positions_to_parent(
+Base.@propagate_inbounds function translate_positions_to_parent(
         index::SubChunkIndex, pos::AbstractVector{<:Integer})
     sub_positions = positions(index)
     return sub_positions[pos]
diff --git a/src/user_preferences.jl b/src/user_preferences.jl
index 5cd85a35f..2080abcf9 100644
--- a/src/user_preferences.jl
+++ b/src/user_preferences.jl
@@ -346,8 +346,12 @@ aliases = merge(
     ## t-mixtral -> Together.ai Mixtral
     "tmixtral" => "mistralai/Mixtral-8x7B-Instruct-v0.1",
     "tmixtral22" => "mistralai/Mixtral-8x22B-Instruct-v0.1",
-    "tllama3" => "meta-llama/Llama-3-8b-chat-hf",
-    "tllama370" => "meta-llama/Llama-3-70b-chat-hf",
+    "tllama3" => "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
+    "tllama370" => "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
+    "tllama3405" => "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo",
+    "tls" => "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo", #s for small
+    "tlm" => "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo", #m for medium
+    "tll" => "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo", #l for large
     ## Mistral AI
     "mistral-tiny" => "mistral-tiny",
     "mistral-small" => "mistral-small-latest",
@@ -365,11 +369,17 @@ aliases = merge(
     "claudes" => "claude-3-5-sonnet-20240620",
     "claudeh" => "claude-3-haiku-20240307",
     ## Groq
-    "gllama3" => "llama3-8b-8192",
-    "gl3" => "llama3-8b-8192",
-    "gllama370" => "llama3-70b-8192",
-    "gl70" => "llama3-70b-8192",
+    "gllama3" => "llama-3.1-8b-instant",
+    "gl3" => "llama-3.1-8b-instant",
+    "gllama370" => "llama-3.1-70b-versatile",
+    "gl70" => "llama-3.1-70b-versatile",
+    "gllama3405" => "llama-3.1-405b-reasoning",
+    "gl405" => "llama-3.1-405b-reasoning",
+    "gls" => "llama-3.1-8b-instant", #s for small
+    "glm" => "llama-3.1-70b-versatile", #m for medium
+    "gll" => "llama-3.1-405b-reasoning", #l for large
     "gmixtral" => "mixtral-8x7b-32768",
+    "ggemma9" => "gemma2-9b-it",
     ## DeepSeek
     "dschat" => "deepseek-chat",
     "dscode" => "deepseek-coder"
@@ -665,13 +675,31 @@ registry = Dict{String, ModelSpec}(
         TogetherOpenAISchema(),
         2e-7,
         2e-7,
-        "Meta Llama3 8b from Mistral, hosted by Together.ai. For more information, see [models](https://docs.together.ai/docs/inference-models)."),
+        "Meta Llama3 8b, hosted by Together.ai. For more information, see [models](https://docs.together.ai/docs/inference-models)."),
     "meta-llama/Llama-3-70b-chat-hf" => ModelSpec(
         "meta-llama/Llama-3-70b-chat-hf",
         TogetherOpenAISchema(),
         9e-7,
         9e-7,
-        "Meta Llama3 70b from Mistral, hosted by Together.ai. For more information, see [models](https://docs.together.ai/docs/inference-models)."),
+        "Meta Llama3 70b, hosted by Together.ai. For more information, see [models](https://docs.together.ai/docs/inference-models)."),
+    "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo" => ModelSpec(
+        "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
+        TogetherOpenAISchema(),
+        1e-7,
+        1.8e-7,
+        "Meta Llama3.1 8b, hosted by Together.ai. For more information, see [models](https://docs.together.ai/docs/inference-models)."),
+    "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo" => ModelSpec(
+        "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
+        TogetherOpenAISchema(),
+        5.4e-7,
+        8.8e-7,
+        "Meta Llama3.1 70b, hosted by Together.ai. For more information, see [models](https://docs.together.ai/docs/inference-models)."),
+    "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo" => ModelSpec(
+        "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo",
+        TogetherOpenAISchema(),
+        5e-6,
+        1.5e-5,
+        "Meta Llama3.1 405b, hosted by Together.ai. For more information, see [models](https://docs.together.ai/docs/inference-models)."),
     ### Anthropic models
     "claude-3-5-sonnet-20240620" => ModelSpec("claude-3-5-sonnet-20240620",
         AnthropicSchema(),
@@ -699,10 +727,25 @@ registry = Dict{String, ModelSpec}(
         2.4e-5,
         "Anthropic's Claude 2.1 model."),
     ## Groq -- using preliminary pricing on https://wow.groq.com/
+    "llama-3.1-405b-reasoning" => ModelSpec("llama-3.1-405b-reasoning",
+        GroqOpenAISchema(),
+        5e-6, # based on prices at together.ai... likely it will be much cheaper
+        1.5e-5, # based on prices at together.ai... likely it will be much cheaper
+        "Meta's Llama3.1 405b, hosted by Groq. Max output 16384 tokens, 131K context - during the preview window limited to max tokens=16K. See details [here](https://console.groq.com/docs/models)"),
+    "llama-3.1-70b-versatile" => ModelSpec("llama-3.1-70b-versatile",
+        GroqOpenAISchema(),
+        5.9e-7,
+        7.9e-7,
+        "Meta's Llama3.1 70b, hosted by Groq. Max output 8192 tokens, 131K context - during the preview window limited to max tokens=8K. See details [here](https://console.groq.com/docs/models)"),
+    "llama-3.1-8b-instant" => ModelSpec("llama-3.1-8b-instant",
+        GroqOpenAISchema(),
+        5e-8,
+        8e-8,
+        "Meta's Llama3.1 8b, hosted by Groq. Max output 8192 tokens, 131K context - during the preview window limited to max tokens=8K. See details [here](https://console.groq.com/docs/models)"),
     "llama3-8b-8192" => ModelSpec("llama3-8b-8192",
         GroqOpenAISchema(),
         5e-8,
-        1e-7,
+        8e-8,
         "Meta's Llama3 8b, hosted by Groq. Max output 8192 tokens, 8K context. See details [here](https://console.groq.com/docs/models)"),
     "llama3-70b-8192" => ModelSpec("llama3-70b-8192",
         GroqOpenAISchema(),
@@ -714,6 +757,11 @@ registry = Dict{String, ModelSpec}(
         2.7e-7,
         2.7e-7,
         "Mistral.ai Mixtral 8x7b, hosted by Groq. Max 32K context. See details [here](https://console.groq.com/docs/models)"),
+    "gemma2-9b-it" => ModelSpec("gemma2-9b-it",
+        GroqOpenAISchema(),
+        2e-7,
+        2e-7,
+        "Google's Gemma 2 9b, hosted by Groq. Max 8K context. See details [here](https://console.groq.com/docs/models)"),
     "deepseek-chat" => ModelSpec("deepseek-chat",
         DeepSeekOpenAISchema(),
         1.4e-7,
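
A minimal usage sketch of the new aliases, assuming PromptingTools v0.42.0 is installed and `GROQ_API_KEY` / `TOGETHER_API_KEY` are set in the environment; the prompts below are placeholders:

```julia
using PromptingTools

# Groq-hosted Llama 3.1 via the size-based aliases: gls/glm/gll (8b/70b/405b)
msg = aigenerate("Summarize RAG in one sentence."; model = "gls")

# Together.ai-hosted Llama 3.1 405b via the "large" alias
msg = aigenerate("Summarize RAG in one sentence."; model = "tll")

# Gemma 2 9b on GroqCloud
msg = aigenerate("Say hi!"; model = "ggemma9")
```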