Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add FP8 support to gguf/llama: #10055

Draft
wants to merge 3 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,10 @@ if (NOT DEFINED GGML_LLAMAFILE)
set(GGML_LLAMAFILE_DEFAULT ON)
endif()

if (NOT DEFINED GGML_OPENMP_SIMD)
set(GGML_OPENMP_SIMD_DEFAULT ON)
endif()

if (NOT DEFINED GGML_AMX)
set(GGML_AMX ON)
endif()
Expand Down
27 changes: 16 additions & 11 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,10 @@ GGML_NO_OPENMP := 1
DEPRECATE_WARNING := 1
endif

ifdef LLAMA_NO_OPENMP_SIMD
GGML_NO_OPENMP_SIMD := 1
endif

ifdef LLAMA_NO_METAL
GGML_NO_METAL := 1
DEPRECATE_WARNING := 1
Expand Down Expand Up @@ -542,6 +546,12 @@ ifndef GGML_NO_OPENMP
MK_CXXFLAGS += -fopenmp
endif # GGML_NO_OPENMP

ifndef GGML_NO_OPENMP_SIMD
MK_CPPFLAGS += -DGGML_USE_OPENMP_SIMD
MK_CFLAGS += -fopenmp-simd
MK_CXXFLAGS += -fopenmp-simd
endif # GGML_NO_OPENMP_SIMD

ifdef GGML_OPENBLAS
MK_CPPFLAGS += -DGGML_USE_BLAS $(shell pkg-config --cflags-only-I openblas)
MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas)
Expand Down Expand Up @@ -948,12 +958,14 @@ OBJ_GGML = \
$(DIR_GGML)/src/ggml-alloc.o \
$(DIR_GGML)/src/ggml-backend.o \
$(DIR_GGML)/src/ggml-backend-reg.o \
$(DIR_GGML)/src/ggml-fp8.o \
$(DIR_GGML)/src/ggml-opt.o \
$(DIR_GGML)/src/ggml-quants.o \
$(DIR_GGML)/src/ggml-threading.o \
$(DIR_GGML)/src/ggml-cpu/ggml-cpu.o \
$(DIR_GGML)/src/ggml-cpu/ggml-cpu-cpp.o \
$(DIR_GGML)/src/ggml-cpu/ggml-cpu_cpp.o \
$(DIR_GGML)/src/ggml-cpu/ggml-cpu-aarch64.o \
$(DIR_GGML)/src/ggml-cpu/ggml-cpu-fp8.o \
$(DIR_GGML)/src/ggml-cpu/ggml-cpu-quants.o \
$(OBJ_GGML_EXT)

Expand Down Expand Up @@ -1094,17 +1106,10 @@ DEP_FILES = $(OBJ_GGML:.o=.d) $(OBJ_LLAMA:.o=.d) $(OBJ_COMMON:.o=.d)
# Default target
all: $(BUILD_TARGETS)

# force c++ build for source file that have same name as c file
# Note: need this exception because `ggml-cpu.c` and `ggml-cpu.cpp` both produce the same obj/dep files
# g++ -M -I ./ggml/include/ -I ./ggml/src ggml/src/ggml-cpu/ggml-cpu.cpp | grep ggml
$(DIR_GGML)/src/ggml-cpu/ggml-cpu-cpp.o: \
ggml/src/ggml-cpu/ggml-cpu.cpp \
ggml/include/ggml-backend.h \
ggml/include/ggml.h \
ggml/include/ggml-alloc.h \
ggml/src/ggml-backend-impl.h \
ggml/include/ggml-cpu.h \
ggml/src/ggml-impl.h
$(CXX) $(CXXFLAGS) -c $< -o $@
$(DIR_GGML)/%_cpp.o: $(DIR_GGML)/%.cpp
$(CXX) $(CXXFLAGS) -MMD -c $< -o $@

# Rules for building object files
$(DIR_GGML)/%.o: $(DIR_GGML)/%.c
Expand Down
4 changes: 3 additions & 1 deletion Package.swift
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ var sources = [
"ggml/src/ggml-cpu/ggml-cpu-quants.c",
"ggml/src/ggml-threading.cpp",
"ggml/src/ggml-quants.c",
"ggml/src/ggml-fp8.cpp",
"ggml/src/ggml-cpu/ggml-cpu-fp8.cpp",
]

var resources: [Resource] = []
Expand Down Expand Up @@ -88,5 +90,5 @@ let package = Package(
linkerSettings: linkerSettings
)
],
cxxLanguageStandard: .cxx11
cxxLanguageStandard: .cxx17
)
4 changes: 2 additions & 2 deletions examples/perplexity/perplexity.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1797,9 +1797,9 @@ static void kl_divergence(llama_context * ctx, const common_params & params) {
total_seconds = total_seconds % (60*60);
}
LOG("%.2f minutes\n", total_seconds / 60.0);
LOG("\n");
LOG("chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p\n");
Comment on lines +1800 to +1801
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this change necessary?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not for FP8, but you write header on all step in you case. So it add many line in the output.
But may be best in an other PR.

}
LOG("\n");
LOG("chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p\n");

const int first = n_ctx/2;
const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
Expand Down
2 changes: 2 additions & 0 deletions examples/quantize/quantize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
{ "Q4_0_4_4", LLAMA_FTYPE_MOSTLY_Q4_0_4_4, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
{ "Q4_0_4_8", LLAMA_FTYPE_MOSTLY_Q4_0_4_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
{ "Q4_0_8_8", LLAMA_FTYPE_MOSTLY_Q4_0_8_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
{ "E4M3_Q", LLAMA_FTYPE_MOSTLY_E4M3_Q, "12.21G, 0.0050 kld @ Mistral-Nemo", },
{ "E3M4_Q", LLAMA_FTYPE_MOSTLY_E3M4_Q, "12.21G, 0.0016 kld @ Mistral-Nemo", },
{ "F16", LLAMA_FTYPE_MOSTLY_F16, "14.00G, +0.0020 ppl @ Mistral-7B", },
{ "BF16", LLAMA_FTYPE_MOSTLY_BF16, "14.00G, -0.0050 ppl @ Mistral-7B", },
{ "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", },
Expand Down
5 changes: 5 additions & 0 deletions ggml/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,10 @@ if (NOT GGML_LLAMAFILE_DEFAULT)
set(GGML_LLAMAFILE_DEFAULT OFF)
endif()

if (NOT GGML_OPENMP_SIMD_DEFAULT)
set(GGML_OPENMP_SIMD_DEFAULT OFF)
endif()

if (NOT GGML_CUDA_GRAPHS_DEFAULT)
set(GGML_CUDA_GRAPHS_DEFAULT OFF)
endif()
Expand Down Expand Up @@ -112,6 +116,7 @@ option(GGML_LASX "ggml: enable lasx" ON)
option(GGML_LSX "ggml: enable lsx" ON)
option(GGML_RVV "ggml: enable rvv" ON)
option(GGML_SVE "ggml: enable SVE" OFF)
option(GGML_OPENMP_SIMD "ggml: enable OPENMP_SIMD" ${GGML_OPENMP_SIMD_DEFAULT})

if (WIN32)
set(GGML_WIN_VER "0x602" CACHE STRING "ggml: Windows Version")
Expand Down
8 changes: 8 additions & 0 deletions ggml/include/ggml.h
Original file line number Diff line number Diff line change
Expand Up @@ -392,6 +392,10 @@ extern "C" {
GGML_TYPE_IQ4_NL_4_4 = 36,
// GGML_TYPE_IQ4_NL_4_8 = 37,
// GGML_TYPE_IQ4_NL_8_8 = 38,
GGML_TYPE_E5M2 = 39,
GGML_TYPE_E4M3 = 40,
GGML_TYPE_E4M3_Q = 41,
GGML_TYPE_E3M4_Q = 42,
GGML_TYPE_COUNT,
};

Expand Down Expand Up @@ -436,6 +440,10 @@ extern "C" {
GGML_FTYPE_MOSTLY_Q4_0_4_4 = 25, // except 1d tensors
GGML_FTYPE_MOSTLY_Q4_0_4_8 = 26, // except 1d tensors
GGML_FTYPE_MOSTLY_Q4_0_8_8 = 27, // except 1d tensors
GGML_FTYPE_MOSTLY_E5M2 = 28, // except 1d tensors
GGML_FTYPE_MOSTLY_E4M3 = 29, // except 1d tensors
GGML_FTYPE_MOSTLY_E4M3_Q = 30, // except 1d tensors
GGML_FTYPE_MOSTLY_E3M4_Q = 31, // except 1d tensors
};

// available tensor operations:
Expand Down
5 changes: 4 additions & 1 deletion ggml/src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -222,7 +222,10 @@ add_library(ggml-base
ggml-quants.c
ggml-quants.h
ggml-aarch64.c
ggml-aarch64.h)
ggml-aarch64.h
ggml-fp8.cpp
ggml-fp8.h
)

target_include_directories(ggml-base PRIVATE .)

Expand Down
76 changes: 59 additions & 17 deletions ggml/src/ggml-common.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,20 @@
typedef uint16_t ggml_half;
typedef uint32_t ggml_half2;

#define GGML_COMMON_AGGR
#define GGML_COMMON_AGGR_U
#define GGML_COMMON_AGGR_S

#define GGML_COMMON_DECL
#elif defined(GGML_COMMON_DECL_CPP)
#include <cstdint>

typedef uint16_t ggml_half;
typedef uint32_t ggml_half2;

// std-c++ allow anonymous unions but some compiler warn on it
#define GGML_COMMON_AGGR_U data
// std-c++ do not allow it.
#define GGML_COMMON_AGGR_S data

#define GGML_COMMON_DECL
#elif defined(GGML_COMMON_DECL_METAL)
Expand All @@ -15,7 +28,8 @@ typedef uint32_t ggml_half2;
typedef half ggml_half;
typedef half2 ggml_half2;

#define GGML_COMMON_AGGR
#define GGML_COMMON_AGGR_U
#define GGML_COMMON_AGGR_S

#define GGML_COMMON_DECL
#elif defined(GGML_COMMON_DECL_CUDA)
Expand All @@ -29,7 +43,8 @@ typedef half2 ggml_half2;
typedef half ggml_half;
typedef half2 ggml_half2;

#define GGML_COMMON_AGGR data
#define GGML_COMMON_AGGR_U
#define GGML_COMMON_AGGR_S data

#define GGML_COMMON_DECL
#elif defined(GGML_COMMON_DECL_HIP)
Expand All @@ -39,7 +54,8 @@ typedef half2 ggml_half2;
typedef half ggml_half;
typedef half2 ggml_half2;

#define GGML_COMMON_AGGR data
#define GGML_COMMON_AGGR_U
#define GGML_COMMON_AGGR_S data

#define GGML_COMMON_DECL
#elif defined(GGML_COMMON_DECL_SYCL)
Expand All @@ -49,7 +65,8 @@ typedef half2 ggml_half2;
typedef sycl::half ggml_half;
typedef sycl::half2 ggml_half2;

#define GGML_COMMON_AGGR data
#define GGML_COMMON_AGGR_U
#define GGML_COMMON_AGGR_S data

#define GGML_COMMON_DECL
#endif
Expand Down Expand Up @@ -154,9 +171,9 @@ typedef struct {
struct {
ggml_half d; // delta
ggml_half m; // min
} GGML_COMMON_AGGR;
} GGML_COMMON_AGGR_S;
ggml_half2 dm;
};
} GGML_COMMON_AGGR_U;
uint8_t qs[QK4_1 / 2]; // nibbles / quants
} block_q4_1;
static_assert(sizeof(block_q4_1) == 2 * sizeof(ggml_half) + QK4_1 / 2, "wrong q4_1 block size/padding");
Expand All @@ -175,9 +192,9 @@ typedef struct {
struct {
ggml_half d; // delta
ggml_half m; // min
} GGML_COMMON_AGGR;
} GGML_COMMON_AGGR_S;
ggml_half2 dm;
};
} GGML_COMMON_AGGR_U;
uint8_t qh[4]; // 5-th bit of quants
uint8_t qs[QK5_1 / 2]; // nibbles / quants
} block_q5_1;
Expand All @@ -196,9 +213,9 @@ typedef struct {
struct {
ggml_half d; // delta
ggml_half s; // d * sum(qs[i])
} GGML_COMMON_AGGR;
} GGML_COMMON_AGGR_S;
ggml_half2 ds;
};
} GGML_COMMON_AGGR_U;
int8_t qs[QK8_1]; // quants
} block_q8_1;
static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_half) + QK8_1, "wrong q8_1 block size/padding");
Expand Down Expand Up @@ -261,9 +278,9 @@ typedef struct {
struct {
ggml_half d; // super-block scale for quantized scales
ggml_half dmin; // super-block scale for quantized mins
} GGML_COMMON_AGGR;
} GGML_COMMON_AGGR_S;
ggml_half2 dm;
};
} GGML_COMMON_AGGR_U;
} block_q2_K;
static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_half) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");

Expand All @@ -288,9 +305,9 @@ typedef struct {
struct {
ggml_half d; // super-block scale for quantized scales
ggml_half dmin; // super-block scale for quantized mins
} GGML_COMMON_AGGR;
} GGML_COMMON_AGGR_S;
ggml_half2 dm;
};
} GGML_COMMON_AGGR_U;
uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
uint8_t qs[QK_K/2]; // 4--bit quants
} block_q4_K;
Expand All @@ -305,9 +322,9 @@ typedef struct {
struct {
ggml_half d; // super-block scale for quantized scales
ggml_half dmin; // super-block scale for quantized mins
} GGML_COMMON_AGGR;
} GGML_COMMON_AGGR_S;
ggml_half2 dm;
};
} GGML_COMMON_AGGR_U;
uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
uint8_t qh[QK_K/8]; // quants, high bit
uint8_t qs[QK_K/2]; // quants, low 4 bits
Expand Down Expand Up @@ -424,6 +441,24 @@ typedef struct {
} block_iq4_nlx4;
static_assert(sizeof(block_iq4_nlx4) == 4 * sizeof(ggml_half) + QK4_NL * 2, "wrong iq4_nlx4 block size/padding");

// fp8 support
// - fp8 simple type
typedef struct { uint8_t bits; } ggml_e5m2_t;
typedef struct { uint8_t bits; } ggml_e4m3_t;

// - fp8 with bloc delta => 8.125 bpw
typedef struct {
float d; // delta
uint8_t qs[QK_K];
} block_e4m3_q;
static_assert(sizeof(block_e4m3_q) == sizeof(float) + QK_K, "wrong block_e4m3_q block size/padding");

typedef struct {
float d; // delta
Djip007 marked this conversation as resolved.
Show resolved Hide resolved
uint8_t qs[QK_K];
} block_e3m4_q;
static_assert(sizeof(block_e3m4_q) == sizeof(float) + QK_K, "wrong block_e3m4_q block size/padding");

#endif // GGML_COMMON_DECL
#endif // GGML_COMMON_DECL

Expand All @@ -437,6 +472,13 @@ static_assert(sizeof(block_iq4_nlx4) == 4 * sizeof(ggml_half) + QK4_NL * 2, "wro
#define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = {
#define GGML_TABLE_END() };

#define GGML_COMMON_IMPL
#elif defined(GGML_COMMON_IMPL_CPP)
#include <cstdint>

#define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = {
#define GGML_TABLE_END() };

#define GGML_COMMON_IMPL
#elif defined(GGML_COMMON_IMPL_METAL)
#include <metal_stdlib>
Expand Down
22 changes: 22 additions & 0 deletions ggml/src/ggml-cpu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ list (APPEND GGML_CPU_SOURCES
ggml-cpu-aarch64.h
ggml-cpu-quants.c
ggml-cpu-quants.h
ggml-cpu-fp8.cpp
ggml-cpu-fp8.h
amx/amx.cpp
amx/amx.h
amx/mmq.cpp
Expand Down Expand Up @@ -45,6 +47,18 @@ if (GGML_OPENMP)
endif()
endif()

if (GGML_OPENMP_SIMD)
check_cxx_compiler_flag("-fopenmp-simd" SUPPORTS_OPENMP_SIMD)
if (SUPPORTS_OPENMP_SIMD)
# OpenMP_RUNTIME_MSVC=experimental / if (MSVC)
message(STATUS "Using OPENMP_SIMD.")
add_compile_definitions(GGML_USE_OPENMP_SIMD)
set(OPENMP_SIMD_FLAGS -fopenmp-simd)
else()
message(WARNING "C++ compiler lacks OPENMP_SIMD support.")
endif()
endif()

if (GGML_LLAMAFILE)
message(STATUS "Using llamafile")

Expand Down Expand Up @@ -304,3 +318,11 @@ set_source_files_properties(${GGML_CPU_SOURCES} PROPERTIES COMPILE_DEFINITIONS "
if (EMSCRIPTEN)
set_target_properties(ggml-cpu PROPERTIES COMPILE_FLAGS "-msimd128")
endif()

# FP8
if (OPENMP_SIMD_FLAGS)
# set_source_files_properties(ggml-cpu-fp8.cpp PROPERTIES COMPILE_FLAGS ${OPENMP_SIMD_FLAGS})
set_target_properties(ggml-cpu PROPERTIES COMPILE_FLAGS ${OPENMP_SIMD_FLAGS})
endif()


Loading
Loading