triton-inference-server · sfc-gh-dbove · Jul 19, 2023 · Jul 22, 2023 · Jul 22, 2023 · Aug 19, 2023
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
@@ -0,0 +1 @@
+* @sfc-gh-zhwang @sfc-gh-hykim
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -113,9 +113,8 @@ if (EXISTS ${FT_DIR})
 else()
   FetchContent_Declare(
     repo-ft
-    GIT_REPOSITORY https://github.com/NVIDIA/FasterTransformer.git 
-    GIT_TAG main
-    GIT_SHALLOW ON
+    GIT_REPOSITORY https://github.com/neevaco/FasterTransformer.git 
+    GIT_TAG b6b21406449ab19f00d1d5f97338065037b5f8e3
   )
 endif()
 FetchContent_MakeAvailable(repo-common repo-core repo-backend repo-ft)

diff --git a/LEGAL.md b/LEGAL.md
@@ -0,0 +1 @@
+This application is not part of the Snowflake Service and is governed by the terms in LICENSE, unless expressly agreed to in writing. You use this application at your own risk, and Snowflake has no obligation to support your use of this application.
diff --git a/docker/Dockerfile b/docker/Dockerfile
@@ -42,7 +42,9 @@ RUN apt-get update && \
 RUN pip3 install --no-cache-dir --extra-index-url https://download.pytorch.org/whl/cu118 torch==2.0.1+cu118 && \
     pip3 install --no-cache-dir --extra-index-url https://pypi.ngc.nvidia.com regex fire tritonclient[all] && \
     pip3 install --no-cache-dir accelerate transformers huggingface_hub tokenizers SentencePiece sacrebleu datasets tqdm omegaconf rouge_score && \
-    pip3 install --no-cache-dir cmake==3.24.3
+    pip3 install --no-cache-dir cmake==3.24.3 && \
+    pip3 install --no-cache-dir langid==1.1.6 && \
+    pip3 install --no-cache-dir lingua-language-detector==2.0.2
 
 # backend build
 ADD . /workspace/build/fastertransformer_backend
@@ -66,6 +68,11 @@ RUN CUDAFLAGS="-include stdio.h" cmake \
     rm /workspace/build/fastertransformer_backend/build/bin/*_example -rf && \
     rm /workspace/build/fastertransformer_backend/build/lib/lib*Backend.so -rf
 
+# Remove git once the build is done: it is no longer needed and has known CVEs
+RUN apt-get purge git git-man -y && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
 ENV NCCL_LAUNCH_MODE=GROUP
 ENV WORKSPACE /workspace
 WORKDIR /workspace

diff --git a/src/libfastertransformer.cc b/src/libfastertransformer.cc
@@ -49,10 +49,14 @@
 
 // FT's libraries have dependency with triton's lib
 #include "src/fastertransformer/triton_backend/bert/BertTritonModel.h"
+#include "src/fastertransformer/triton_backend/bart/BartTritonModel.h"
+#include "src/fastertransformer/triton_backend/m2m/M2MTritonModel.h"
+#include "src/fastertransformer/triton_backend/deberta/DebertaTritonModel.h"
 #include "src/fastertransformer/triton_backend/gptj/GptJTritonModel.h"
 #include "src/fastertransformer/triton_backend/gptj/GptJTritonModelInstance.h"
 #include "src/fastertransformer/triton_backend/gptneox/GptNeoXTritonModel.h"
 #include "src/fastertransformer/triton_backend/gptneox/GptNeoXTritonModelInstance.h"
+#include "src/fastertransformer/triton_backend/llama/LlamaTritonModel.h"
 #include "src/fastertransformer/triton_backend/multi_gpu_gpt/ParallelGptTritonModel.h"
 #include "src/fastertransformer/triton_backend/multi_gpu_gpt/ParallelGptTritonModelInstance.h"
 #include "src/fastertransformer/triton_backend/t5/T5TritonModel.h"
@@ -327,6 +331,63 @@ std::shared_ptr<AbstractTransformerModel> ModelState::ModelFactory(
     } else if (data_type == "bf16") {
       ft_model = std::make_shared<BertTritonModel<__nv_bfloat16>>(
             tp, pp, custom_ar, model_dir, int8_mode, is_sparse, remove_padding);
+#endif
+    }
+  } else if (model_type == "llama") {
+    const int int8_mode  = param_get_int(param, "int8_mode");
+
+    if (data_type == "fp16") {
+      ft_model = std::make_shared<LlamaTritonModel<half>>(
+            tp, pp, custom_ar, model_dir, int8_mode);
+    } else if (data_type == "fp32") {
+      ft_model = std::make_shared<LlamaTritonModel<float>>(
+            tp, pp, custom_ar, model_dir, int8_mode);
+#ifdef ENABLE_BF16
+    } else if (data_type == "bf16") {
+      ft_model = std::make_shared<LlamaTritonModel<__nv_bfloat16>>(
+            tp, pp, custom_ar, model_dir, int8_mode);
+#endif
+    }
+  } else if (model_type == "bart") {
+    if (data_type == "fp16") {
+      ft_model = std::make_shared<BartTritonModel<half>>(
+            tp, pp, custom_ar, model_dir, 0);
+    } else if (data_type == "fp32") {
+      ft_model = std::make_shared<BartTritonModel<float>>(
+            tp, pp, custom_ar, model_dir, 0);
+#ifdef ENABLE_BF16
+    } else if (data_type == "bf16") {
+      ft_model = std::make_shared<BartTritonModel<__nv_bfloat16>>(
+            tp, pp, custom_ar, model_dir, 0);
+#endif
+    }
+  } else if (model_type == "m2m") {
+    if (data_type == "fp16") {
+      ft_model = std::make_shared<M2MTritonModel<half>>(
+            tp, pp, custom_ar, model_dir, 0);
+    } else if (data_type == "fp32") {
+      ft_model = std::make_shared<M2MTritonModel<float>>(
+            tp, pp, custom_ar, model_dir, 0);
+#ifdef ENABLE_BF16
+    } else if (data_type == "bf16") {
+      ft_model = std::make_shared<M2MTritonModel<__nv_bfloat16>>(
+            tp, pp, custom_ar, model_dir, 0);
+#endif
+    }
+  } else if (model_type == "deberta") {
+    const int is_sparse      = param_get_bool(param,"is_sparse", false);
+    const int remove_padding = param_get_bool(param,"is_remove_padding", false);
+
+    if (data_type == "fp16") {
+      ft_model = std::make_shared<DebertaTritonModel<half>>(
+            tp, pp, custom_ar, model_dir, is_sparse, remove_padding);
+    } else if (data_type == "fp32") {
+      ft_model = std::make_shared<DebertaTritonModel<float>>(
+            tp, pp, custom_ar, model_dir, is_sparse, remove_padding);
+#ifdef ENABLE_BF16
+    } else if (data_type == "bf16") {
+      ft_model = std::make_shared<DebertaTritonModel<__nv_bfloat16>>(
+            tp, pp, custom_ar, model_dir, is_sparse, remove_padding);
 #endif
     }
   } else {
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		This application is not part of the Snowflake Service and is governed by the terms in LICENSE, unless expressly agreed to in writing. You use this application at your own risk, and Snowflake has no obligation to support your use of this application.