
Commit

fix conflict during rebase
zhouwg committed May 24, 2024
1 parent 791d35f commit 1e14197
Showing 3 changed files with 3 additions and 113 deletions.
21 changes: 1 addition & 20 deletions ggml-qnn.cpp
@@ -3340,13 +3340,6 @@ static void ggml_qnn_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
}


static void ggml_qnn_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
QNN_LOG_DEBUG("call %s\n", __func__);

QNN_LOG_DEBUG("call %s done\n", __func__);
}


static void ggml_qnn_pool2d(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
QNN_LOG_DEBUG("call %s\n", __func__);

@@ -3541,9 +3534,6 @@ bool ggml_qnn_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
case GGML_OP_ROPE:
func = ggml_qnn_rope;
break;
case GGML_OP_ALIBI:
func = ggml_qnn_alibi;
break;
case GGML_OP_IM2COL:
func = ggml_qnn_im2col;
break;
@@ -4276,10 +4266,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_
n_tasks = n_threads;
}
break;
case GGML_OP_ALIBI: {
n_tasks = 1;
}
break;
case GGML_OP_CLAMP: {
n_tasks = 1;
}
@@ -4325,13 +4311,8 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_
n_tasks = n_threads;
}
break;
case GGML_OP_FLASH_ATTN: {
case GGML_OP_FLASH_ATTN_EXT:
n_tasks = n_threads;
}
break;
case GGML_OP_FLASH_FF: {
n_tasks = n_threads;
}
break;
case GGML_OP_FLASH_ATTN_BACK: {
n_tasks = n_threads;
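
For readers skimming the ggml-qnn.cpp hunks above: each handler such as ggml_qnn_rope takes a common (src0, src1, dst) signature, and ggml_qnn_compute_forward picks a handler per GGML op in a switch; the removed ggml_qnn_alibi was one such stub. Below is a minimal sketch of that pattern, assuming a hypothetical ggml_qnn_example_op handler and a trimmed-down dispatch, not the actual function bodies.

// A sketch, not the real ggml-qnn.cpp code: ggml_qnn_example_op and the reduced
// switch are illustrative; QNN_LOG_DEBUG is the fork's logging macro.
#include "ggml.h"

static void ggml_qnn_example_op(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    (void) src0;
    (void) src1;
    (void) dst;
    QNN_LOG_DEBUG("call %s\n", __func__);
    // a real handler would build and execute a QNN graph for this op here
    QNN_LOG_DEBUG("call %s done\n", __func__);
}

static bool qnn_dispatch_example(struct ggml_tensor * tensor) {
    void (*func)(const ggml_tensor *, const ggml_tensor *, ggml_tensor *) = nullptr;
    switch (tensor->op) {
        case GGML_OP_ROPE:   // the real switch maps this to ggml_qnn_rope
        case GGML_OP_IM2COL: // ... and this to ggml_qnn_im2col
            func = ggml_qnn_example_op;
            break;
        default:
            return false;    // op is not offloaded to QNN
    }
    func(tensor->src[0], tensor->src[1], tensor);
    return true;
}
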
2 changes: 1 addition & 1 deletion ggml.c
@@ -17222,7 +17222,7 @@ static void ggml_compute_forward_cross_entropy_loss_back(

/////////////////////////////////

static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor, struct ggml_compute_state * state) {
GGML_ASSERT(params);

if (tensor->op == GGML_OP_NONE || ggml_is_empty(tensor)) {
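
The single-line ggml.c change above threads a per-worker ggml_compute_state pointer into ggml_compute_forward, so every call site has to pass it through. A minimal sketch of the resulting call shape, as it would appear inside ggml.c (the loop, the helper name, and the surrounding variables are assumptions for illustration, not code from this commit):

// Illustrative only: shows how callers supply the extra state argument after the
// signature change; walk_graph_nodes is a made-up helper, not ggml.c code.
static void walk_graph_nodes(struct ggml_cgraph * cgraph,
                             struct ggml_compute_params * params,
                             struct ggml_compute_state * state) {
    for (int i = 0; i < cgraph->n_nodes; ++i) {
        // third argument is new: the calling worker's compute state
        ggml_compute_forward(params, cgraph->nodes[i], state);
    }
}
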
93 changes: 1 addition & 92 deletions llama.cpp
@@ -1671,95 +1671,6 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer
GGML_UNUSED(host_buffer);
}

static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
ggml_backend_buffer_type_t buft = nullptr;

#ifdef GGML_USE_METAL
buft = ggml_backend_metal_buffer_type();
#elif defined(GGML_USE_CUDA)
buft = ggml_backend_cuda_buffer_type(gpu);
#elif defined(GGML_USE_VULKAN)
buft = ggml_backend_vk_buffer_type(gpu);
#elif defined(GGML_USE_SYCL)
buft = ggml_backend_sycl_buffer_type(gpu);
#elif defined(GGML_USE_CLBLAST)
buft = ggml_backend_opencl_buffer_type();
#elif defined(GGML_USE_KOMPUTE)
buft = ggml_backend_kompute_buffer_type(gpu);
if (buft == nullptr) {
LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, gpu);
}
#elif defined(GGML_USE_QNN)
buft = ggml_backend_qnn_buffer_type(gpu);
#endif

if (buft == nullptr) {
buft = llama_default_buffer_type_cpu(true);
}
return buft;

GGML_UNUSED(gpu);
}

static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_gpu, const float * tensor_split) {
ggml_backend_buffer_type_t buft = nullptr;

#ifdef GGML_USE_CUDA
if (ggml_backend_cuda_get_device_count() > 1) {
buft = ggml_backend_cuda_split_buffer_type(tensor_split);
}
#endif

#ifdef GGML_USE_SYCL
if (ggml_backend_sycl_get_device_count() > 1) {
buft = ggml_backend_sycl_split_buffer_type(tensor_split);
}
#endif

if (buft == nullptr) {
buft = llama_default_buffer_type_offload(fallback_gpu);
}
return buft;

GGML_UNUSED(tensor_split);
}

static size_t llama_get_device_count() {
#if defined(GGML_USE_CUDA)
return ggml_backend_cuda_get_device_count();
#elif defined(GGML_USE_SYCL)
return ggml_backend_sycl_get_device_count();
#elif defined(GGML_USE_VULKAN)
return ggml_backend_vk_get_device_count();
#elif defined(GGML_USE_QNN)
return ggml_backend_qnn_get_device_count();
#else
return 1;
#endif
}

static size_t llama_get_device_memory(int device) {
#if defined(GGML_USE_CUDA)
size_t total;
size_t free;
ggml_backend_cuda_get_device_memory(device, &free, &total);
return free;
#elif defined(GGML_USE_SYCL)
size_t total;
size_t free;
ggml_backend_sycl_get_device_memory(device, &free, &total);
return free;
#elif defined(GGML_USE_VULKAN)
size_t total;
size_t free;
ggml_backend_vk_get_device_memory(device, &free, &total);
return free;
#else
return 1;
GGML_UNUSED(device);
#endif
}

//
// globals
//
@@ -15627,7 +15538,6 @@ bool llama_supports_mlock(void) {
bool llama_supports_gpu_offload(void) {
#if defined(GGML_USE_CUDA) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE) || defined(GGML_USE_RPC) || defined(GGML_USE_QNN)
=======
// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
return true;
#else
@@ -15946,8 +15856,7 @@ struct llama_context * llama_new_context_with_model(
#elif defined(GGML_USE_QNN)
if (model->n_gpu_layers > 0) {
//the second param is the data path of prebuilt QNN libs provided by Qualcomm
//can be obtained through JNI from Java layer such as "/data/data/com.ggml.llamacpp/"
//or hardcoded to "/data/local/tmp/"
//can be hardcoded to "/data/local/tmp/"
ggml_backend_t backend = ggml_backend_qnn_init(model->main_gpu, "/data/local/tmp/");
if (nullptr == backend) {
LLAMA_LOG_ERROR("%s: failed to initialize QNN backend\n", __func__);
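
For reference, the surviving QNN path in llama_new_context_with_model reduces to: initialize a QNN backend for the selected device, passing the directory holding Qualcomm's prebuilt QNN libraries, and fail context creation if that returns null. A standalone sketch of that usage, assuming this fork declares ggml_backend_qnn_init in a ggml-qnn.h header and using placeholder device/path values:

// Sketch of the QNN init pattern shown above; device 0 and the hardcoded library
// path are placeholders, and the ggml-qnn.h header name is an assumption.
#include <cstdio>
#include "ggml-backend.h"
#include "ggml-qnn.h"

static ggml_backend_t init_qnn_backend_or_null(int device) {
    // second argument: directory containing the prebuilt Qualcomm QNN libraries
    ggml_backend_t backend = ggml_backend_qnn_init(device, "/data/local/tmp/");
    if (backend == nullptr) {
        fprintf(stderr, "failed to initialize QNN backend for device %d\n", device);
    }
    return backend;
}
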
