feat: Implement Telegram HTML support

n4ze3m · Aug 24, 2024 · 664b0e7 · 664b0e7
1 parent 8eb1c66
commit 664b0e7
Show file tree

Hide file tree

Showing 4 changed files with 154 additions and 9 deletions.
diff --git a/app/ui/src/components/Bot/Playground/Message.tsx b/app/ui/src/components/Bot/Playground/Message.tsx
@@ -56,7 +56,7 @@ export const PlaygroundMessage = (props: Props) => {
               <Markdown message={props.message} />
             </div>
 
-            {props.isBot && (
+            {props.isBot && props?.sources && props?.sources?.length > 0 && (
               <Collapse
                 className="mt-6"
                 ghost

diff --git a/server/src/integration/telegram.ts b/server/src/integration/telegram.ts
@@ -9,6 +9,7 @@ import { convertTextToAudio } from "./handlers/utils/audio-to-text";
 import { FileFlavor, hydrateFiles } from "@grammyjs/files";
 import * as fs from "fs/promises";
 import { convertOggToWave } from "../utils/ffmpeg";
+import { telegramFormat } from "../utils/telegram-format";
 type DialoqBaseContext = FileFlavor<Context>;
 export default class TelegramBot {
   static get clients() {
@@ -73,9 +74,14 @@ export default class TelegramBot {
           user_id
         );
 
-        return await ctx.reply(message, {
-           parse_mode: "MarkdownV2",
-        });
+        if (process.env.DB_TELEGEAM_PARSE_MODE === "normal") {
+          return await ctx.reply(message);
+        }
+
+        return await ctx.reply(telegramFormat(message),
+          {
+            parse_mode: "HTML",
+          });
       });
 
       bot.on("message:voice", async (ctx) => {
@@ -102,9 +108,15 @@ export default class TelegramBot {
             user_id
           );
 
-          return await ctx.reply(message, {
-            parse_mode: "MarkdownV2",
-          });
+
+          if (process.env.DB_TELEGEAM_PARSE_MODE === "normal") {
+            return await ctx.reply(message);
+          }
+
+          return await ctx.reply(telegramFormat(message),
+            {
+              parse_mode: "HTML",
+            });
         } catch (error) {
           console.log(error);
           return await ctx.reply("Opps! Something went wrong");

diff --git a/server/src/internet/index.ts b/server/src/internet/index.ts
@@ -117,7 +117,7 @@ const searchProviders = {
 
 export const searchInternet = async (embedding: Embeddings, { query }: { query: string }) => {
 
-    if(process.env.DISABLE_INTERNET_SEARCH == "true") {
+    if (process.env.DISABLE_INTERNET_SEARCH == "true") {
         return [];
     }
 
@@ -127,7 +127,9 @@ export const searchInternet = async (embedding: Embeddings, { query }: { query:
     }
     const datat = await searchProvider(query);
 
-    const results = datat.slice(0, TOTAL_RESULTS_LIMIT);
+    const data = datat.filter((doc) => doc?.content.length > 0);
+
+    const results = data.slice(0, TOTAL_RESULTS_LIMIT)
 
     const [docEmbeddings, queryEmbedding] = await Promise.all([
         embedding.embedDocuments(results.map((doc) => doc.content)),

diff --git a/server/src/utils/telegram-format.ts b/server/src/utils/telegram-format.ts
@@ -0,0 +1,131 @@
+// This code is a TypeScript conversion of the original Python code from the repo: https://github.com/Latand/formatter-chatgpt-telegram
+
+/**
+ * Replaces `&`, `<` and `>` with their HTML entities.
+ *
+ * `&` is handled first so that the ampersands introduced by `&lt;` / `&gt;`
+ * are not escaped a second time.
+ *
+ * @param text Text to escape.
+ * @returns The escaped text.
+ */
+function convertHtmlChars(text: string): string {
+    return text
+        .replace(/&/g, "&amp;")
+        .replace(/</g, "&lt;")
+        .replace(/>/g, "&gt;");
+}
+
+/**
+ * Rewrites every `mdTag…mdTag` pair in `outText` as `<htmlTag>…</htmlTag>`.
+ *
+ * A delimiter is only recognised when its outer side is not a word character:
+ * the opening delimiter must not follow `\w` and the closing delimiter must not
+ * precede `\w`. The `s` flag lets the wrapped text span several lines.
+ *
+ * @param outText Text to transform.
+ * @param mdTag Markdown delimiter, matched literally (e.g. `**`).
+ * @param htmlTag Name of the HTML element to emit (e.g. `b`).
+ * @returns The text with each delimited span wrapped in the HTML element.
+ */
+function splitByTag(outText: string, mdTag: string, htmlTag: string): string {
+    const delimiter = escapeRegExp(mdTag);
+    const source = `(?<!\\w)${delimiter}(.*?)${delimiter}(?!\\w)`;
+    const replacement = `<${htmlTag}>$1</${htmlTag}>`;
+    return outText.replace(new RegExp(source, "gs"), replacement);
+}
+
+/**
+ * Prefixes every regular-expression metacharacter in `string` with a backslash,
+ * so the result matches the input literally when used as RegExp source.
+ *
+ * @param string Text to escape.
+ * @returns The escaped text.
+ */
+function escapeRegExp(string: string): string {
+    const metaCharacters = /[.*+?^${}()|[\]\\]/g;
+    return string.replace(metaCharacters, (found) => `\\${found}`);
+}
+
+/**
+ * Appends closing backticks when the text contains an unbalanced number of them.
+ *
+ * Triple-backtick fences are balanced first; single backticks are then counted
+ * on the (possibly extended) text, so the second check sees the appended fence.
+ *
+ * @param text Text that may end inside an unterminated code span or fence.
+ * @returns The text with `` ``` `` and/or `` ` `` appended where a count was odd.
+ */
+function ensureClosingDelimiters(text: string): string {
+    const occurrences = (haystack: string, needle: RegExp): number =>
+        (haystack.match(needle) ?? []).length;
+
+    let balanced = text;
+    if (occurrences(balanced, /```/g) % 2 === 1) {
+        balanced += "```";
+    }
+    if (occurrences(balanced, /`/g) % 2 === 1) {
+        balanced += "`";
+    }
+    return balanced;
+}
+
+/**
+ * Pulls fenced code blocks out of `text`, leaving a placeholder token behind.
+ *
+ * Unbalanced backticks are closed first via `ensureClosingDelimiters`. Each
+ * fenced block is then replaced by `CODEBLOCKPLACEHOLDER<n>` (numbered in order
+ * of appearance) and rendered as `<pre><code>…</code></pre>`, with a
+ * `language-<name>` class when a language tag follows the opening fence.
+ *
+ * @param text Text that may contain fenced code blocks.
+ * @returns A tuple of the text with placeholders and a map from placeholder to
+ *          the HTML for the block it stands for.
+ */
+function extractAndConvertCodeBlocks(text: string): [string, Record<string, string>] {
+    const source = ensureClosingDelimiters(text);
+    const codeBlocks: Record<string, string> = {};
+    const fence = /```(\w*)?(\n)?(.*?)```/gs;
+
+    let stripped = source;
+    let blockCount = 0;
+
+    for (const found of source.matchAll(fence)) {
+        const whole = found[0];
+        const language = found[1] || "";
+        const codeContent = found[3];
+
+        const placeholder = `CODEBLOCKPLACEHOLDER${blockCount}`;
+        blockCount += 1;
+
+        codeBlocks[placeholder] = language
+            ? `<pre><code class="language-${language}">${codeContent}</code></pre>`
+            : `<pre><code>${codeContent}</code></pre>`;
+
+        // Swaps the first remaining occurrence of this fence for its placeholder.
+        stripped = stripped.replace(whole, placeholder);
+    }
+
+    return [stripped, codeBlocks];
+}
+
+/**
+ * Puts the rendered code blocks back in place of their placeholder tokens.
+ *
+ * For each entry, the first occurrence of the placeholder in the text is
+ * replaced by its HTML.
+ *
+ * @param text Text containing placeholder tokens.
+ * @param codeBlocks Map from placeholder token to the HTML that replaces it.
+ * @returns The text with each placeholder replaced by its HTML.
+ */
+function reinsertCodeBlocks(text: string, codeBlocks: Record<string, string>): string {
+    let restored = text;
+    for (const [placeholder, htmlCodeBlock] of Object.entries(codeBlocks)) {
+        // Use a replacer function so the HTML is inserted verbatim. A plain
+        // replacement string would interpret `$$`, `$&`, `$1`, "$`" and `$'`
+        // inside the code (common in shell/JS snippets) as substitution patterns.
+        restored = restored.replace(placeholder, () => htmlCodeBlock);
+    }
+    return restored;
+}
+
+/**
+ * Merges each run of consecutive lines starting with `>` into a single
+ * `<blockquote>…</blockquote>` element.
+ *
+ * The leading `>` is removed and each quoted line is trimmed; the lines of one
+ * run are joined with newlines. All other lines pass through unchanged.
+ *
+ * @param text Text to scan line by line.
+ * @returns The text with quote runs wrapped in `<blockquote>` tags.
+ */
+function combineBlockquotes(text: string): string {
+    const output: string[] = [];
+    let quoted: string[] = [];
+
+    // Emits the pending quote run (if any) as one blockquote element.
+    const flush = (): void => {
+        if (quoted.length === 0) {
+            return;
+        }
+        output.push(`<blockquote>${quoted.join("\n")}</blockquote>`);
+        quoted = [];
+    };
+
+    for (const line of text.split("\n")) {
+        if (line.startsWith(">")) {
+            quoted.push(line.slice(1).trim());
+            continue;
+        }
+        flush();
+        output.push(line);
+    }
+    flush();
+
+    return output.join("\n");
+}
+
+/**
+ * Turns HTML-escaped blockquote tags (`&lt;blockquote&gt;` and
+ * `&lt;/blockquote&gt;`) back into real `<blockquote>` / `</blockquote>` tags.
+ *
+ * @param output Text containing escaped blockquote tags.
+ * @returns The text with those tags unescaped; other entities are untouched.
+ */
+function removeBlockquoteEscaping(output: string): string {
+    const withOpeningTags = output.replace(/&lt;blockquote&gt;/g, "<blockquote>");
+    return withOpeningTags.replace(/&lt;\/blockquote&gt;/g, "</blockquote>");
+}
+
+/**
+ * Converts Markdown-style text into HTML markup for sending with
+ * `parse_mode: "HTML"`.
+ *
+ * Steps, in order:
+ * 1. Runs of `>` lines are merged into `<blockquote>` elements.
+ * 2. `&`, `<` and `>` are escaped (this also escapes the blockquote tags; they
+ *    are restored in the last step).
+ * 3. Fenced code blocks are swapped for placeholders so the inline rules below
+ *    do not touch their content.
+ * 4. Inline code, bold/italic/underline/strikethrough, links, headings and
+ *    bullet markers are converted.
+ * 5. Code blocks are re-inserted and blockquote tags are unescaped.
+ *
+ * @param text Markdown-style input text.
+ * @returns The text converted to HTML markup.
+ */
+export function telegramFormat(text: string): string {
+    text = combineBlockquotes(text);
+    text = convertHtmlChars(text);
+
+    const [extracted, codeBlocks] = extractAndConvertCodeBlocks(text);
+    let output = extracted;
+
+    // No second `<` / `>` escape is needed here: convertHtmlChars has already
+    // replaced every angle bracket, and the placeholders contain none.
+    output = output.replace(/`(.*?)`/g, "<code>$1</code>");
+    output = output.replace(/\*\*\*(.*?)\*\*\*/g, "<b><i>$1</i></b>");
+    output = output.replace(/\_\_\_(.*?)\_\_\_/g, "<u><i>$1</i></u>");
+
+    // Longer delimiters are handled before their single-character variants.
+    output = splitByTag(output, "**", "b");
+    output = splitByTag(output, "__", "u");
+    output = splitByTag(output, "_", "i");
+    output = splitByTag(output, "*", "i");
+    output = splitByTag(output, "~~", "s");
+
+    // Drops `【…】` spans (presumably citation markers emitted by the model).
+    output = output.replace(/【[^】]+】/g, "");
+
+    // `[label](url)` and `![label](url)` become anchors. The brackets are
+    // escaped once: the previous pattern used `\\[`, which in a regex literal
+    // matches a literal backslash and therefore never matched a Markdown link.
+    // Double quotes in the URL are encoded so they cannot end the href value.
+    output = output.replace(
+        /!?\[(.*?)\]\((.*?)\)/g,
+        (_match: string, label: string, url: string) =>
+            `<a href="${url.replace(/"/g, "&quot;")}">${label}</a>`
+    );
+    output = output.replace(/^\s*#+ (.+)/gm, "<b>$1</b>");
+    output = output.replace(/^(\s*)[\-\*] (.+)/gm, "$1• $2");
+
+    output = reinsertCodeBlocks(output, codeBlocks);
+    output = removeBlockquoteEscaping(output);
+
+    return output;
+}