From 573a074d12d232e450ea2f88633ef4d3c4a40206 Mon Sep 17 00:00:00 2001 From: J S <49557684+svilupp@users.noreply.github.com> Date: Sat, 25 Nov 2023 15:45:10 +0000 Subject: [PATCH 1/2] add templates for transcripts and surveys --- src/utils.jl | 71 +++++++++++++++++++ .../AnalystChaptersInTranscript.json | 22 ++++++ .../AnalystThemesInResponses.json | 23 ++++++ test/utils.jl | 33 +++++++++ 4 files changed, 149 insertions(+) create mode 100644 templates/persona-task/AnalystChaptersInTranscript.json create mode 100644 templates/persona-task/AnalystThemesInResponses.json diff --git a/src/utils.jl b/src/utils.jl index 1365fa166..af7b68378 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -1,3 +1,74 @@ +### USEFUL BUT NOT EXPORTED FUNCTIONS +""" + split_by_length(text::String; separator::String=" ", max_length::Int=35000) -> Vector{String} + +Split a given string `text` into chunks of a specified maximum length `max_length`. +This is particularly useful for splitting larger documents or texts into smaller segments, suitable for models or systems with smaller context windows. + +# Arguments +- `text::String`: The text to be split. +- `separator::String=" "`: The separator used to split the text into minichunks. Defaults to a space character. +- `max_length::Int=35000`: The maximum length of each chunk. Defaults to 35,000 characters, which should fit within 16K context window. + +# Returns +`Vector{String}`: A vector of strings, each representing a chunk of the original text that is smaller than or equal to `max_length`. + +# Notes + +- The function ensures that each chunk is as close to `max_length` as possible without exceeding it. Exception: a single minichunk (the text between two separators) that is itself longer than `max_length` is not split further and is emitted whole as an oversized chunk. +- If the `text` is empty, the function returns an empty array. +- The `separator` is re-added to the text chunks after splitting, preserving the original structure of the text as closely as possible. + +# Examples + +Splitting text with the default separator (" "): +```julia +text = "Hello world. How are you?" 
+chunks = split_by_length(text; max_length=13) +length(chunks) # Output: 2 +``` + +Using a custom separator and custom `max_length` +```julia +text = "Hello,World," ^ 2900 # length 34800 chars +chunks = split_by_length(text; separator=",", max_length=10000) # for 4K context window +length(chunks) # Output: 4 +``` +""" +function split_by_length(text::String; separator::String = " ", max_length::Int = 35000) + minichunks = split(text, separator) + sep_length = length(separator) + chunks = String[] + current_chunk = IOBuffer() + current_length = 0 + for i in eachindex(minichunks) + sep_length_ = i < length(minichunks) ? sep_length : 0 + # Check if the current chunk is full + if current_length + length(minichunks[i]) + sep_length_ > max_length + # Save chunk, excluding the current mini chunk + save_chunk = String(take!(current_chunk)) + if length(save_chunk) > 0 + push!(chunks, save_chunk) + end + current_length = 0 + end + write(current_chunk, minichunks[i]) + current_length += length(minichunks[i]) + if i < length(minichunks) + write(current_chunk, separator) + current_length += sep_length + end + end + + # Add the last chunk if it's not empty + final_chunk = String(take!(current_chunk)) + if length(final_chunk) > 0 + push!(chunks, final_chunk) + end + + return chunks +end +### INTERNAL FUNCTIONS - DO NOT USE DIRECTLY # helper to extract handlebar variables (eg, `{{var}}`) from a prompt string function _extract_handlebar_variables(s::AbstractString) Symbol[Symbol(m[1]) for m in eachmatch(r"\{\{([^\}]+)\}\}", s)] diff --git a/templates/persona-task/AnalystChaptersInTranscript.json b/templates/persona-task/AnalystChaptersInTranscript.json new file mode 100644 index 000000000..903b538ca --- /dev/null +++ b/templates/persona-task/AnalystChaptersInTranscript.json @@ -0,0 +1,22 @@ +[ + { + "content": "Template Metadata", + "description": "Template for summarizing transcripts of videos and meetings into chapters with key insights. 
If you don't need the instructions, set `instructions=\"None.\"`. Placeholders: {{transcript}}, {{instructions}}", + "version": "1", + "source": "Customized version of [jxnl's Youtube Chapters prompt](https://github.com/jxnl/youtubechapters-backend/blob/main/summary_app/md_summarize.py)", + "_type": "metadatamessage" + }, + { + "content": "Act as a super-human AI analyst trained to precisely summarize transcripts of videos and meetings with incredible precision and quality. \nSummarize the transcript in a clear and concise manner that makes use of timestamps, when available, to help others study the transcript. Split the notes into Chapters, which should be meaningful and not too short.\n\nTo format your markdown file, follow this structure:\n```\n# Chapter 1: [Descriptive Title] [Timestamp as HH:MM:SS]\n\n- \n\n## Section 1.1: [Descriptive Title] [Timestamp as HH:MM:SS]\n\n\n- \n\nRepeat the above structure as necessary, and use subheadings to organize your notes.\n```\n\nFormatting Tips:\n* Do not make the chapters too short, ensure that each section has a few brief bullet points. \n* Bullet points should be concise and to the point, so people can scan them quickly.\n* Use [] to denote timestamps\n* Use subheadings and bullet points to organize your notes and make them easier to read and understand. When relevant, include timestamps to link to the corresponding part of the video.\n* Use bullet points to describe important steps and insights, being as comprehensive as possible.\n* Use quotes to highlight important points and insights.\n\nSummary Tips:\n* Do not mention anything if its only playing music and if nothing happens don't include it in the notes.\n* Use only content from the transcript. 
Do not add any additional information.\n* Make a new line after each # or ## and before each bullet point\n* Titles should be informative or even a question that the video answers\n* Titles should not be conclusions since you may only be getting a small part of the video\n\nKeep it CONCISE!!\nIf Special Instructions are provided by the user, they take precedence over any previous instructions and you MUST follow them precisely.\n", + "variables": [], + "_type": "systemmessage" + }, + { + "content": "# Transcript\n\n{{transcript}}\n\n\n\n# Special Instructions\n\n{{instructions}}", + "variables": [ + "transcript", + "instructions" + ], + "_type": "usermessage" + } +] \ No newline at end of file diff --git a/templates/persona-task/AnalystThemesInResponses.json b/templates/persona-task/AnalystThemesInResponses.json new file mode 100644 index 000000000..900fd9436 --- /dev/null +++ b/templates/persona-task/AnalystThemesInResponses.json @@ -0,0 +1,23 @@ +[ + { + "content": "Template Metadata", + "description": "Template for summarizing survey verbatim responses into 3-5 themes with an example for each theme. If you don't need the instructions, set `instructions=\"None.\"`. Placeholders: {{question}}, {{responses}}, {{instructions}}", + "version": "1", + "source": "", + "_type": "metadatamessage" + }, + { + "content": "Act as a world-class behavioural researcher, who specializes in survey analysis. Categorize the provided survey responses into several themes. \nThe responses should be analyzed, and each theme identified should be labeled clearly. Examples from the responses should be given to illustrate each theme. The output should be formatted as specified, with a clear indication of the theme and corresponding verbatim examples.\n\n# Sub-tasks\n\n1. Read the provided survey responses carefully, especially in the context of the question. \n2. Identify 3-5 distinct themes present in the responses related to the survey question. 
It should be the most important themes that must be raised to the CEO/leadership. \n3. For each theme, choose at least one verbatim example from the responses that best represents it. This example should be a direct quote from the responses. This example should belong to only one theme and must not be applicable to any other themes.\n4. Format the output as specified.\n\n# Formatting\n\nTo format your markdown file, follow this structure (omit the triple backticks):\n ```\n # Theme 1: [Theme Description]\n - Best illustrated by: \"...\"\n\n # Theme 2: [Theme Description]\n - Best illustrated by: \"...\"\n ...\n ```\n\nKeep it CONCISE!!\nIf Special Instructions are provided by the user, they take precedence over any previous instructions and you MUST follow them precisely.\n", + "variables": [], + "_type": "systemmessage" + }, + { + "content": "# Survey Question\n\n{{question}}\n\n\n# Verbatim Responses\n\n{{responses}}\n\n\n# Special Instructions\n\n{{instructions}}\n", + "variables": [ + "question", + "responses", + "instructions" + ], + "_type": "usermessage" + } +] \ No newline at end of file diff --git a/test/utils.jl b/test/utils.jl index 138343257..e3a05cecd 100644 --- a/test/utils.jl +++ b/test/utils.jl @@ -1,6 +1,39 @@ +using PromptingTools: split_by_length using PromptingTools: _extract_handlebar_variables, _report_stats using PromptingTools: _string_to_vector, _encode_local_image +@testset "split_by_length" begin + text = "Hello world. How are you?" 
+ chunks = split_by_length(text, max_length = 100) + @test length(chunks) == 1 + @test chunks[1] == text + chunks = split_by_length(text, max_length = 25) + @test length(chunks) == 1 + @test chunks[1] == text + @test maximum(length.(chunks)) <= 25 + chunks = split_by_length(text, max_length = 10) + @test length(chunks) == 4 + @test maximum(length.(chunks)) <= 10 + chunks = split_by_length(text, max_length = 11) + @test length(chunks) == 3 + @test maximum(length.(chunks)) <= 11 + @test join(chunks, "") == text + + # Test with empty text + chunks = split_by_length("") + @test isempty(chunks) + + # Test custom separator + text = "Hello,World,"^50 + chunks = split_by_length(text, separator = ",", max_length = length(text)) + @test length(chunks) == 1 + @test chunks[1] == text + chunks = split_by_length(text, separator = ",", max_length = 20) + @test length(chunks) == 34 + @test maximum(length.(chunks)) <= 20 + @test join(chunks, "") == text +end + @testset "extract_handlebar_variables" begin # Extracts handlebar variables enclosed in double curly braces input_string = "Hello {{name}}, how are you?" 
From 00e79aa3fd9c1dffefd4c42710bb4fcc558d0631 Mon Sep 17 00:00:00 2001 From: J S <49557684+svilupp@users.noreply.github.com> Date: Sun, 26 Nov 2023 10:39:39 +0000 Subject: [PATCH 2/2] add decisions template, fix tests --- CHANGELOG.md | 3 +- Project.toml | 2 ++ src/PromptingTools.jl | 7 +++-- src/utils.jl | 31 +++++++++++++++++++ .../AnalystDecisionsInTranscript.json | 22 +++++++++++++ templates/persona-task/DrafterEmailBrief.json | 21 +++++++++++++ test/templates.jl | 4 ++- test/utils.jl | 13 +++++++- 8 files changed, 98 insertions(+), 5 deletions(-) create mode 100644 templates/persona-task/AnalystDecisionsInTranscript.json create mode 100644 templates/persona-task/DrafterEmailBrief.json diff --git a/CHANGELOG.md b/CHANGELOG.md index 4439adea6..9d40e6bef 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,4 +9,5 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Add support for prompt templates with `AITemplate` struct. Search for suitable templates with `aitemplates("query string")` and then simply use them with `aigenerate(AITemplate(:TemplateABC); variableX = "some value") -> AIMessage` or use a dispatch on the template name as a `Symbol`, eg, `aigenerate(:TemplateABC; variableX = "some value") -> AIMessage`. Templates are saved as JSON files in the folder `templates/`. If you add new templates, you can reload them with `load_templates!()` (notice the exclamation mark to override the existing `TEMPLATE_STORE`). - Add `aiextract` function to extract structured information from text quickly and easily. See `?aiextract` for more information. - Add `aiscan` for image scanning (ie, image comprehension tasks). You can transcribe screenshots or reason over images as if they were text. Images can be provided either as a local file (`image_path`) or as an url (`image_url`). See `?aiscan` for more information. -- Add support for [Ollama.ai](https://ollama.ai/)'s local models. 
Only `aigenerate` and `aiembed` functions are supported at the moment. \ No newline at end of file +- Add support for [Ollama.ai](https://ollama.ai/)'s local models. Only `aigenerate` and `aiembed` functions are supported at the moment. +- Add a few non-coding templates, eg, verbatim analysis (see `aitemplates("survey")`) and meeting summarization (see `aitemplates("meeting")`), and supporting utilities (non-exported): `split_by_length` and `replace_words` to make it easy to work with smaller open source models. \ No newline at end of file diff --git a/Project.toml b/Project.toml index 76050193d..bdc95d5ea 100644 --- a/Project.toml +++ b/Project.toml @@ -7,6 +7,7 @@ version = "0.2.0-DEV" Base64 = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3" JSON3 = "0f8b85d8-7281-11e9-16c2-39a750bddbf1" +Logging = "56ddb016-857b-54e1-b83d-db4d58db5568" OpenAI = "e9f21f70-7185-4079-aca2-91159181367c" PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a" @@ -15,6 +16,7 @@ Aqua = "0.7" Base64 = "<0.0.1, 1" HTTP = "1" JSON3 = "1" +Logging = "<0.0.1, 1" OpenAI = "0.8.7" PrecompileTools = "1" Test = "<0.0.1, 1" diff --git a/src/PromptingTools.jl b/src/PromptingTools.jl index 466a4e1cb..82e12c7b9 100644 --- a/src/PromptingTools.jl +++ b/src/PromptingTools.jl @@ -1,6 +1,7 @@ module PromptingTools using Base64: base64encode +using Logging using OpenAI using JSON3 using JSON3: StructTypes @@ -61,7 +62,9 @@ function __init__() load_templates!() end -# Enable precompilation to reduce start time -@compile_workload include("precompilation.jl") +# Enable precompilation to reduce start time, disabled logging +with_logger(NullLogger()) do + @compile_workload include("precompilation.jl") +end end # module PromptingTools diff --git a/src/utils.jl b/src/utils.jl index af7b68378..53bacee6b 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -1,4 +1,35 @@ ### USEFUL BUT NOT EXPORTED FUNCTIONS + +""" + replace_words(text::AbstractString, 
words::Vector{<:AbstractString}; replacement::AbstractString="ABC") + +Replace all occurrences of words in `words` with `replacement` in `text`. Useful to quickly remove specific names or entities from a text. Matching is case-insensitive and only happens at word boundaries. + +# Arguments +- `text::AbstractString`: The text to be processed. +- `words::Vector{<:AbstractString}`: A vector of words to be replaced. Each entry is interpolated into a regular expression without escaping, so regex metacharacters in an entry are interpreted as regex syntax; an empty vector produces a pattern that matches at every word boundary. +- `replacement::AbstractString="ABC"`: The replacement string to be used. Defaults to "ABC". + +# Example +```julia +text = "Disney is a great company" +replace_words(text, ["Disney", "Snow White", "Mickey Mouse"]) +# Output: "ABC is a great company" +``` +""" +replace_words(text::AbstractString, words::Vector{<:AbstractString}; replacement::AbstractString = "ABC") = replace_words(text, + Regex("\\b$(join(words, "\\b|\\b"))\\b", "i"), + replacement) +function replace_words(text::AbstractString, pattern::Regex, replacement::AbstractString) + replace(text, pattern => replacement) +end +# dispatch for a single word: wraps it in a one-element vector and forwards to the vector method +function replace_words(text::AbstractString, + word::AbstractString; + replacement::AbstractString = "ABC") + replace_words(text, [word]; replacement) +end + """ split_by_length(text::String; separator::String=" ", max_length::Int=35000) -> Vector{String} diff --git a/templates/persona-task/AnalystDecisionsInTranscript.json b/templates/persona-task/AnalystDecisionsInTranscript.json new file mode 100644 index 000000000..711a8936f --- /dev/null +++ b/templates/persona-task/AnalystDecisionsInTranscript.json @@ -0,0 +1,22 @@ +[ + { + "content": "Template Metadata", + "description": "Template for summarizing transcripts of videos and meetings into decisions made and agreed next steps. If you don't need the instructions, set `instructions=\"None.\"`. 
Placeholders: {{transcript}}, {{instructions}}", + "version": "1", + "source": "Evolved from [jxnl's Youtube Chapters prompt](https://github.com/jxnl/youtubechapters-backend/blob/main/summary_app/md_summarize.py)", + "_type": "metadatamessage" + }, + { + "content": "Act as a super-human AI analyst trained to meticulously analyze transcripts of videos and meetings. Your role is to identify and summarize key decisions and next steps, enhancing clarity and utility for those studying the transcript. \nUse timestamps to pinpoint when these decisions and steps are discussed. Organize your notes into distinct sections, each dedicated to a significant decision or action plan.\n\nFormat your markdown file using this structure:\n```\n# Key Decision 1: [Descriptive Title] [Timestamp as HH:MM:SS]\n- \n\n## Next Steps for Decision 1\n- \n\nRepeat this structure for each key decision and its corresponding next steps.\n\n# Other Next Steps\n- \n```\n\nFormatting Tips:\n* Ensure each section is substantial, providing a clear and concise summary of each key decision and its next steps.\n* Use bullet points to make the summary easy to scan and understand.\n* All next steps should be actionable and clearly defined. All next steps must be relevant to the decision they are associated with. 
Any general next steps, should be included in section `Other Next Steps`\n* Include timestamps in brackets to refer to the specific parts of the video where these discussions occur.\n* Titles should be informative, reflecting the essence of the decision.\n\nSummary Tips:\n* Exclude sections where only music plays or no significant content is present.\n* Base your summary strictly on the transcript content without adding extra information.\n* Maintain a clear structure: place a new line after each # or ##, and before each bullet point.\n* Titles should pose a question answered by the decision or describe the nature of the next steps.\n\nKeep the summary concise and focused on key decisions and next steps. \nIf the user provides special instructions, prioritize these over the general guidelines.", + "variables": [], + "_type": "systemmessage" + }, + { + "content": "# Transcript\n\n{{transcript}}\n\n\n\n# Special Instructions\n\n{{instructions}}", + "variables": [ + "transcript", + "instructions" + ], + "_type": "usermessage" + } +] \ No newline at end of file diff --git a/templates/persona-task/DrafterEmailBrief.json b/templates/persona-task/DrafterEmailBrief.json new file mode 100644 index 000000000..ce74d89cf --- /dev/null +++ b/templates/persona-task/DrafterEmailBrief.json @@ -0,0 +1,21 @@ +[ + { + "content": "Template Metadata", + "description": "Template for quick email drafts. Provide a brief in 5-7 words as headlines, eg, `Follow up email. Sections: Agreements, Next steps` Placeholders: {{brief}}", + "version": "1", + "source": "", + "_type": "metadatamessage" + }, + { + "content": "Act as a world-class office communications expert, skilled in creating efficient, clear, and friendly internal email communications.\n Craft a concise email subject and email draft from the provided User Brief. 
\n\n Use the following format for the body of the email:\n ```\n Section Name \n - Bullet point 1\n - Bullet point 2\n\n \n ```\n\n # Guidelines\n - Focus on clear and efficient communication, suitable for internal business correspondence\n - Where information is missing, use your best judgement to fill in the gaps\n - It should be informal and friendly, eg, start with \"Hi\"\n - Ensure the tone is professional yet casual, suitable for internal communication\n - Write as plain text, with no markdown syntax\n - Format into Sections. Each section should have 3-5 bullet points\n - Close the email on a positive note, encouraging communication and collaboration\n - It should be brief and concise with 150 words or less\n \n\n Follow the above guidelines, unless the user explicitly asks for something different. In that case, follow the user's instructions precisely.\n", + "variables": [], + "_type": "systemmessage" + }, + { + "content": "# User Brief\n\n{{brief}}\n\n", + "variables": [ + "brief" + ], + "_type": "usermessage" + } +] \ No newline at end of file diff --git a/test/templates.jl b/test/templates.jl index 5a3a1b2c6..335fa0071 100644 --- a/test/templates.jl +++ b/test/templates.jl @@ -32,8 +32,10 @@ end @testset "Templates - search" begin # search all - tmps = aitemplates("") + tmps = aitemplates(""; limit = typemax(Int)) @test tmps == PT.TEMPLATE_METADATA + @info length(tmps) + @info length(PT.TEMPLATE_METADATA) # Exact search for JudgeIsItTrue tmps = aitemplates(:JudgeIsItTrue) @test length(tmps) == 1 diff --git a/test/utils.jl b/test/utils.jl index e3a05cecd..10f268b62 100644 --- a/test/utils.jl +++ b/test/utils.jl @@ -1,7 +1,18 @@ -using PromptingTools: split_by_length +using PromptingTools: split_by_length, replace_words using PromptingTools: _extract_handlebar_variables, _report_stats using PromptingTools: _string_to_vector, _encode_local_image +@testset "replace_words" begin + words = ["Disney", "Snow White", "Mickey Mouse"] + @test replace_words("Disney 
is a great company", + ["Disney", "Snow White", "Mickey Mouse"]) == "ABC is a great company" + @test replace_words("Snow White and Mickey Mouse are great", + ["Disney", "Snow White", "Mickey Mouse"]) == "ABC and ABC are great" + @test replace_words("LSTM is a great model", "LSTM") == "ABC is a great model" + @test replace_words("LSTM is a great model", "LSTM"; replacement = "XYZ") == + "XYZ is a great model" +end + @testset "split_by_length" begin text = "Hello world. How are you?" chunks = split_by_length(text, max_length = 100)