From 0f1a334c14effe8c9df90fc117155abd2db5acd9 Mon Sep 17 00:00:00 2001
From: J S <49557684+svilupp@users.noreply.github.com>
Date: Tue, 23 Jul 2024 18:55:07 +0100
Subject: [PATCH] Register Llama3.1 + minor retrieval improvements

---
 CHANGELOG.md                           | 10 ++++
 Project.toml                           |  2 +-
 src/Experimental/RAGTools/retrieval.jl |  8 +++-
 src/Experimental/RAGTools/types.jl     | 26 ++++++----
 src/user_preferences.jl                | 66 ++++++++++++++++++++++----
 5 files changed, 91 insertions(+), 21 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0efb9b821..3ea5f0f06 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -10,6 +10,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Fixed
 
+## [0.42.0]
+
+### Added
+- Registered the new Meta Llama 3.1 models hosted on GroqCloud and Together.ai (e.g., the Groq-hosted `gllama370` now points to the latest available model, and the 405b model has the new alias `gllama3405`). Because those names are quite clunky, I've also added size-based abbreviations small/medium/large (i.e., 8b/70b/405b): `gls`/`glm`/`gll` for Llama 3.1 hosted on GroqCloud and, similarly, `tls`/`tlm`/`tll` for Llama 3.1 on Together.ai.
+- Generic model aliases for the Llama 3 models on Groq and Together.ai have been updated to point to the latest available models (Llama 3.1).
+- Added the Gemma 2 9b model hosted on GroqCloud to the model registry (alias `ggemma9`).
+
+### Updated
+- Minor optimizations to `SubDocumentTermMatrix` to reduce memory allocations and improve performance.
+
 ## [0.41.0]
 
 ### Added
diff --git a/Project.toml b/Project.toml
index 2147675a9..d372d0878 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "PromptingTools"
 uuid = "670122d1-24a8-4d70-bfce-740807c42192"
 authors = ["J S @svilupp and contributors"]
-version = "0.41.0"
+version = "0.42.0"
 
 [deps]
 AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"
diff --git a/src/Experimental/RAGTools/retrieval.jl b/src/Experimental/RAGTools/retrieval.jl
index 301841776..11bedc57d 100644
--- a/src/Experimental/RAGTools/retrieval.jl
+++ b/src/Experimental/RAGTools/retrieval.jl
@@ -230,7 +230,9 @@ function find_closest(
         finder::AbstractSimilarityFinder, index::AbstractChunkIndex,
         query_emb::AbstractVector{<:Real}, query_tokens::AbstractVector{<:AbstractString} = String[];
         top_k::Int = 100, kwargs...)
-    isnothing(chunkdata(index)) && return CandidateChunks(; index_id = indexid(index))
+    if isnothing(chunkdata(parent(index)))
+        return CandidateChunks(; index_id = indexid(index))
+    end
     positions, scores = find_closest(finder, chunkdata(index),
         query_emb, query_tokens;
         top_k, kwargs...)
@@ -244,7 +246,9 @@ function find_closest(
         finder::AbstractSimilarityFinder, index::AbstractChunkIndex,
         query_emb::AbstractMatrix{<:Real}, query_tokens::AbstractVector{<:AbstractVector{<:AbstractString}} = Vector{Vector{String}}();
         top_k::Int = 100, kwargs...)
-    isnothing(chunkdata(index)) && CandidateChunks(; index_id = indexid(index))
+    if isnothing(chunkdata(parent(index)))
+        return CandidateChunks(; index_id = indexid(index))
+    end
     ## reduce top_k since we have more than one query
     top_k_ = top_k ÷ size(query_emb, 2)
     ## simply vcat together (gets sorted from the highest similarity to the lowest)
diff --git a/src/Experimental/RAGTools/types.jl b/src/Experimental/RAGTools/types.jl
index 942524535..9582a28bf 100644
--- a/src/Experimental/RAGTools/types.jl
+++ b/src/Experimental/RAGTools/types.jl
@@ -6,7 +6,8 @@ Base.parent(index::AbstractDocumentIndex) = index
 indexid(index::AbstractDocumentIndex) = index.id
 chunkdata(index::AbstractChunkIndex) = index.chunkdata
 "Access chunkdata for a subset of chunks, `chunk_idx` is a vector of chunk indices in the index"
-function chunkdata(index::AbstractChunkIndex, chunk_idx::AbstractVector{<:Integer})
+Base.@propagate_inbounds function chunkdata(
+        index::AbstractChunkIndex, chunk_idx::AbstractVector{<:Integer})
     ## We need this accessor because different chunk indices can have chunks in different dimensions!!
     chkdata = chunkdata(index)
     if isnothing(chkdata)
@@ -209,7 +210,7 @@ tf(dtm::SubDocumentTermMatrix) = dtm.tf
 vocab(dtm::SubDocumentTermMatrix) = Base.parent(dtm) |> vocab
 vocab_lookup(dtm::SubDocumentTermMatrix) = Base.parent(dtm) |> vocab_lookup
 idf(dtm::SubDocumentTermMatrix) = Base.parent(dtm) |> idf
-function doc_rel_length(dtm::SubDocumentTermMatrix)
+Base.@propagate_inbounds function doc_rel_length(dtm::SubDocumentTermMatrix)
     view(doc_rel_length(Base.parent(dtm)), positions(dtm))
 end
 # hcat for SubDocumentTermMatrix does not make sense -> the vocabulary is the same / shared
@@ -227,6 +228,7 @@ Base.@propagate_inbounds function Base.view(
         throw(BoundsError(tf_mat, max_pos))
     end
     ## computations on top of views of sparse arrays are expensive, materialize the view
+    ## Moreover, nonzeros and rowvals accessors for SparseMatrixCSC are not defined for views
     tf_ = tf_mat[doc_idx, :]
     SubDocumentTermMatrix(dtm, tf_, collect(doc_idx))
 end
@@ -315,7 +317,8 @@ end
 
 HasKeywords(::ChunkKeywordsIndex) = true
 "Access chunkdata for a subset of chunks, `chunk_idx` is a vector of chunk indices in the index"
-function chunkdata(index::ChunkKeywordsIndex, chunk_idx::AbstractVector{<:Integer})
+Base.@propagate_inbounds function chunkdata(
+        index::ChunkKeywordsIndex, chunk_idx::AbstractVector{<:Integer})
     chkdata = index.chunkdata
     if isnothing(chkdata)
         return nothing
@@ -437,13 +440,18 @@ Base.parent(index::SubChunkIndex) = index.parent
 HasEmbeddings(index::SubChunkIndex) = HasEmbeddings(parent(index))
 HasKeywords(index::SubChunkIndex) = HasKeywords(parent(index))
 
-chunks(index::SubChunkIndex) = view(chunks(parent(index)), positions(index))
-sources(index::SubChunkIndex) = view(sources(parent(index)), positions(index))
-function chunkdata(index::SubChunkIndex)
-    chkdata = chunkdata(parent(index), positions(index))
+Base.@propagate_inbounds function chunks(index::SubChunkIndex)
+    view(chunks(parent(index)), positions(index))
+end
+Base.@propagate_inbounds function sources(index::SubChunkIndex)
+    view(sources(parent(index)), positions(index))
+end
+Base.@propagate_inbounds function chunkdata(index::SubChunkIndex)
+    chunkdata(parent(index), positions(index))
 end
 "Access chunkdata for a subset of chunks, `chunk_idx` is a vector of chunk indices in the index"
-function chunkdata(index::SubChunkIndex, chunk_idx::AbstractVector{<:Integer})
+Base.@propagate_inbounds function chunkdata(
+        index::SubChunkIndex, chunk_idx::AbstractVector{<:Integer})
     ## We need this accessor because different chunk indices can have chunks in different dimensions!!
     index_chunk_idx = translate_positions_to_parent(index, chunk_idx)
     pos = intersect(positions(index), index_chunk_idx)
@@ -501,7 +509,7 @@ Translate positions to the parent index. Useful to convert between positions in
 
 Used whenever a `chunkdata()` or `tags()` are used to re-align positions to the "parent" index.
 """
-function translate_positions_to_parent(
+Base.@propagate_inbounds function translate_positions_to_parent(
         index::SubChunkIndex, pos::AbstractVector{<:Integer})
     sub_positions = positions(index)
     return sub_positions[pos]
diff --git a/src/user_preferences.jl b/src/user_preferences.jl
index 5cd85a35f..2080abcf9 100644
--- a/src/user_preferences.jl
+++ b/src/user_preferences.jl
@@ -346,8 +346,12 @@ aliases = merge(
     ## t-mixtral -> Together.ai Mixtral
     "tmixtral" => "mistralai/Mixtral-8x7B-Instruct-v0.1",
     "tmixtral22" => "mistralai/Mixtral-8x22B-Instruct-v0.1",
-    "tllama3" => "meta-llama/Llama-3-8b-chat-hf",
-    "tllama370" => "meta-llama/Llama-3-70b-chat-hf",
+    "tllama3" => "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
+    "tllama370" => "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
+    "tllama3405" => "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo",
+    "tls" => "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo", #s for small
+    "tlm" => "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo", #m for medium
+    "tll" => "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo", #l for large
     ## Mistral AI
     "mistral-tiny" => "mistral-tiny",
     "mistral-small" => "mistral-small-latest",
@@ -365,11 +369,17 @@ aliases = merge(
     "claudes" => "claude-3-5-sonnet-20240620",
     "claudeh" => "claude-3-haiku-20240307",
     ## Groq
-    "gllama3" => "llama3-8b-8192",
-    "gl3" => "llama3-8b-8192",
-    "gllama370" => "llama3-70b-8192",
-    "gl70" => "llama3-70b-8192",
+    "gllama3" => "llama-3.1-8b-instant",
+    "gl3" => "llama-3.1-8b-instant",
+    "gllama370" => "llama-3.1-70b-versatile",
+    "gl70" => "llama-3.1-70b-versatile",
+    "gllama3405" => "llama-3.1-405b-reasoning",
+    "gl405" => "llama-3.1-405b-reasoning",
+    "gls" => "llama-3.1-8b-instant", #s for small
+    "glm" => "llama-3.1-70b-versatile", #m for medium
+    "gll" => "llama-3.1-405b-reasoning", #l for large
     "gmixtral" => "mixtral-8x7b-32768",
+    "ggemma9" => "gemma2-9b-it",
     ## DeepSeek
     "dschat" => "deepseek-chat",
     "dscode" => "deepseek-coder"
@@ -665,13 +675,31 @@ registry = Dict{String, ModelSpec}(
         TogetherOpenAISchema(),
         2e-7,
         2e-7,
-        "Meta Llama3 8b from Mistral, hosted by Together.ai. For more information, see [models](https://docs.together.ai/docs/inference-models)."),
+        "Meta Llama3 8b, hosted by Together.ai. For more information, see [models](https://docs.together.ai/docs/inference-models)."),
     "meta-llama/Llama-3-70b-chat-hf" => ModelSpec(
         "meta-llama/Llama-3-70b-chat-hf",
         TogetherOpenAISchema(),
         9e-7,
         9e-7,
-        "Meta Llama3 70b from Mistral, hosted by Together.ai. For more information, see [models](https://docs.together.ai/docs/inference-models)."),
+        "Meta Llama3 70b, hosted by Together.ai. For more information, see [models](https://docs.together.ai/docs/inference-models)."),
+    "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo" => ModelSpec(
+        "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
+        TogetherOpenAISchema(),
+        1e-7,
+        1.8e-7,
+        "Meta Llama3.1 8b, hosted by Together.ai. For more information, see [models](https://docs.together.ai/docs/inference-models)."),
+    "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo" => ModelSpec(
+        "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
+        TogetherOpenAISchema(),
+        5.4e-7,
+        8.8e-7,
+        "Meta Llama3.1 70b, hosted by Together.ai. For more information, see [models](https://docs.together.ai/docs/inference-models)."),
+    "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo" => ModelSpec(
+        "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo",
+        TogetherOpenAISchema(),
+        5e-6,
+        1.5e-5,
+        "Meta Llama3.1 405b, hosted by Together.ai. For more information, see [models](https://docs.together.ai/docs/inference-models)."),
     ### Anthropic models
     "claude-3-5-sonnet-20240620" => ModelSpec("claude-3-5-sonnet-20240620",
         AnthropicSchema(),
@@ -699,10 +727,25 @@ registry = Dict{String, ModelSpec}(
         2.4e-5,
         "Anthropic's Claude 2.1 model."),
     ## Groq -- using preliminary pricing on https://wow.groq.com/
+    "llama-3.1-405b-reasoning" => ModelSpec("llama-3.1-405b-reasoning",
+        GroqOpenAISchema(),
+        5e-6, # based on prices at together.ai... likely it will be much cheaper
+        1.5e-5, # based on prices at together.ai... likely it will be much cheaper
+        "Meta's Llama3.1 405b, hosted by Groq. Max output 16384 tokens, 131K context - during the preview window limited to max tokens=16K. See details [here](https://console.groq.com/docs/models)"),
+    "llama-3.1-70b-versatile" => ModelSpec("llama-3.1-70b-versatile",
+        GroqOpenAISchema(),
+        5.9e-7,
+        7.9e-7,
+        "Meta's Llama3.1 70b, hosted by Groq. Max output 8192 tokens, 131K context - during the preview window limited to max tokens=8K. See details [here](https://console.groq.com/docs/models)"),
+    "llama-3.1-8b-instant" => ModelSpec("llama-3.1-8b-instant",
+        GroqOpenAISchema(),
+        5e-8,
+        8e-8,
+        "Meta's Llama3.1 8b, hosted by Groq. Max output 8192 tokens, 131K context - during the preview window limited to max tokens=8K. See details [here](https://console.groq.com/docs/models)"),
     "llama3-8b-8192" => ModelSpec("llama3-8b-8192",
         GroqOpenAISchema(),
         5e-8,
-        1e-7,
+        8e-8,
         "Meta's Llama3 8b, hosted by Groq. Max output 8192 tokens, 8K context. See details [here](https://console.groq.com/docs/models)"),
     "llama3-70b-8192" => ModelSpec("llama3-70b-8192",
         GroqOpenAISchema(),
@@ -714,6 +757,11 @@ registry = Dict{String, ModelSpec}(
         2.7e-7,
         2.7e-7,
         "Mistral.ai Mixtral 8x7b, hosted by Groq. Max 32K context. See details [here](https://console.groq.com/docs/models)"),
+    "gemma2-9b-it" => ModelSpec("gemma2-9b-it",
+        GroqOpenAISchema(),
+        2e-7,
+        2e-7,
+        "Google's Gemma 2 9b, hosted by Groq. Max 8K context. See details [here](https://console.groq.com/docs/models)"),
     "deepseek-chat" => ModelSpec("deepseek-chat",
         DeepSeekOpenAISchema(),
         1.4e-7,
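
A minimal usage sketch of the new aliases, assuming PromptingTools v0.42.0 is installed and `GROQ_API_KEY` / `TOGETHER_API_KEY` are set in the environment; the prompts below are placeholders:

```julia
using PromptingTools

# Groq-hosted Llama 3.1 via the size-based aliases: gls/glm/gll (8b/70b/405b)
msg = aigenerate("Summarize RAG in one sentence."; model = "gls")

# Together.ai-hosted Llama 3.1 405b via the "large" alias
msg = aigenerate("Summarize RAG in one sentence."; model = "tll")

# Gemma 2 9b on GroqCloud
msg = aigenerate("Say hi!"; model = "ggemma9")
```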