From 17d54cbd4f74aa27a048c2fc88b3a5df21a3cdd6 Mon Sep 17 00:00:00 2001 From: Mirsad Makalic Date: Wed, 16 Sep 2020 08:22:20 +0100 Subject: [PATCH 1/9] -Missing new line --- src/CodeGen_OpenCL_Dev.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/CodeGen_OpenCL_Dev.cpp b/src/CodeGen_OpenCL_Dev.cpp index 77fc1ee9a127..c5743785a5c2 100644 --- a/src/CodeGen_OpenCL_Dev.cpp +++ b/src/CodeGen_OpenCL_Dev.cpp @@ -899,7 +899,7 @@ void CodeGen_OpenCL_Dev::init_module() { // There does not appear to be a reliable way to safely ignore unused // variables in OpenCL C. See https://github.com/halide/Halide/issues/4918. - src_stream << "#define halide_unused(x)"; + src_stream << "#define halide_unused(x)\n"; if (target.has_feature(Target::CLDoubles)) { src_stream << "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n" From 34d6ee5618f10b165a10f211695006417da60e11 Mon Sep 17 00:00:00 2001 From: Mirsad Makalic Date: Wed, 16 Sep 2020 08:28:28 +0100 Subject: [PATCH 2/9] Add GPU memory allocation cache to OpenCL backend. --- src/runtime/opencl.cpp | 201 ++++++++++++++++++++++++++++++++++++----- 1 file changed, 180 insertions(+), 21 deletions(-) diff --git a/src/runtime/opencl.cpp b/src/runtime/opencl.cpp index b66b66cebd40..b98ef82aaea6 100644 --- a/src/runtime/opencl.cpp +++ b/src/runtime/opencl.cpp @@ -3,6 +3,7 @@ #include "device_interface.h" #include "printer.h" #include "scoped_spin_lock.h" +#include "scoped_mutex_lock.h" #include "mini_cl.h" @@ -285,6 +286,16 @@ struct device_handle { cl_mem mem; }; +// A free list, used when allocations are being cached. +WEAK struct FreeListItem { + device_handle *ptr; + cl_context ctx; + cl_command_queue stream; + size_t size; + FreeListItem *next; +} *free_list = 0; +WEAK halide_mutex free_list_lock; + // Structure to hold the state of a module attached to the context. 
// Also used as a linked-list to keep track of all the different // modules that are attached to a context in order to release them all @@ -558,6 +569,131 @@ WEAK int create_opencl_context(void *user_context, cl_context *ctx, cl_command_q } // namespace Runtime } // namespace Halide +WEAK int halide_opencl_release_unused_device_allocations(void *user_context) { + FreeListItem *to_free; + { + ScopedMutexLock lock(&free_list_lock); + to_free = free_list; + free_list = NULL; + } + while (to_free) { + debug(user_context) << " clReleaseMemObject " << (void *)to_free->ptr->mem << "\n"; + cl_int err = clReleaseMemObject((cl_mem)to_free->ptr->mem); + free(to_free->ptr); + + if (err != CL_SUCCESS) { + debug(user_context) << " Error during clReleaseMemObject. Error code: " << err << "\n"; + } + FreeListItem *next = to_free->next; + free(to_free); + to_free = next; + } + return 0; +} + +namespace Halide { +namespace Runtime { +namespace Internal { + +WEAK halide_device_allocation_pool opencl_allocation_pool = {NULL, NULL}; + +__attribute__((constructor)) +WEAK void register_opencl_allocation_pool() { + opencl_allocation_pool.release_unused = &halide_opencl_release_unused_device_allocations; + halide_register_device_allocation_pool(&opencl_allocation_pool); +} + +__attribute__((always_inline)) +WEAK uint64_t quantize_allocation_size(uint64_t sz) { + int z = __builtin_clzll(sz); + if (z < 60) { + sz--; + sz = sz >> (60 - z); + sz++; + sz = sz << (60 - z); + } + return sz; +} + +void cache_allocation(void *user_context, ClContext& cl_ctx, halide_buffer_t *buf) { + cl_mem dev_ptr = ((device_handle *)buf->device)->mem; + + debug(user_context) << " caching allocation for later use: " << (void *)(dev_ptr) << "\n"; + FreeListItem *item = (FreeListItem *)malloc(sizeof(FreeListItem)); + item->ctx = cl_ctx.context; + item->size = quantize_allocation_size(buf->size_in_bytes()); + item->ptr = (device_handle*)buf->device; + item->stream = cl_ctx.cmd_queue; + { + ScopedMutexLock 
lock(&free_list_lock); + item->next = free_list; + free_list = item; + } +} + +device_handle *retrieve_allocation_from_cache(void *user_context, ClContext& cl_ctx, const size_t size) { + ScopedMutexLock lock(&free_list_lock); + // Best-fit allocation. There are three tunable constants + // here. A bucket is claimed if the size requested is at least + // 7/8 of the size of the bucket. We keep at most 32 unused + // allocations. We round up each allocation size to its top 4 + // most significant bits (see quantize_allocation_size). + device_handle *result = NULL; + + FreeListItem *best = NULL, *item = free_list; + FreeListItem **best_prev = NULL, **prev_ptr = &free_list; + FreeListItem *to_free = NULL; + + int depth = 0; + while (item) { + if ((size <= item->size) && // Fits + (size >= (item->size / 8) * 7) && // Not too much slop + (cl_ctx.context == item->ctx) && // Same cuda context + (cl_ctx.cmd_queue == item->stream) && // Can only safely re-use on the same stream on which it was freed + ((best == NULL) || (best->size > item->size))) { // Better than previous best fit + best = item; + best_prev = prev_ptr; + prev_ptr = &item->next; + item = item->next; + } else if (depth > 32) { + // Allocations after here have not been used for a + // long time. Just detach the rest of the free list + // and defer the actual cuMemFree calls until after we + // release the free_list_lock. + to_free = item; + *prev_ptr = NULL; + item = NULL; + break; + } else { + prev_ptr = &item->next; + item = item->next; + } + depth++; + } + + if (best) { + result = best->ptr; + *best_prev = best->next; + free(best); + } + + while (to_free) { + FreeListItem *next = to_free->next; + debug(user_context) << " clReleaseMemObject from allocation cache" << (void *)to_free->ptr->mem << "\n"; + cl_int err = clReleaseMemObject((cl_mem)to_free->ptr->mem); + free(to_free->ptr); + if (err != CL_SUCCESS) { + debug(user_context) << " Error during clReleaseMemObject. 
Error code: " << err << "\n"; + } + free(to_free); + to_free = next; + } + + return result; +} + +}}} + extern "C" { WEAK int halide_opencl_device_free(void *user_context, halide_buffer_t *buf) { @@ -585,11 +721,19 @@ WEAK int halide_opencl_device_free(void *user_context, halide_buffer_t *buf) { #endif halide_assert(user_context, validate_device_pointer(user_context, buf)); - debug(user_context) << " clReleaseMemObject " << (void *)dev_ptr << "\n"; - cl_int result = clReleaseMemObject((cl_mem)dev_ptr); - // If clReleaseMemObject fails, it is unlikely to succeed in a later call, so - // we just end our reference to it regardless. - free((device_handle *)buf->device); + + cl_int result = CL_SUCCESS; + + if (halide_can_reuse_device_allocations(user_context)) { + cache_allocation(user_context, ctx, buf); + } else { + debug(user_context) << " clReleaseMemObject " << (void *)dev_ptr << "\n"; + result = clReleaseMemObject((cl_mem)dev_ptr); + // If clReleaseMemObject fails, it is unlikely to succeed in a later call, so + // we just end our reference to it regardless. + free((device_handle *)buf->device); + } + buf->device = 0; buf->device_interface->impl->release_module(); buf->device_interface = NULL; @@ -768,6 +912,9 @@ WEAK int halide_opencl_device_release(void *user_context) { err = clFinish(q); halide_assert(user_context, err == CL_SUCCESS); + // Dump the contents of the free list, ignoring errors. + halide_opencl_release_unused_device_allocations(user_context); + // Unload the modules attached to this context. Note that the list // nodes themselves are not freed, only the program objects are // released. 
Subsequent calls to halide_init_kernels might re-create @@ -814,6 +961,10 @@ WEAK int halide_opencl_device_malloc(void *user_context, halide_buffer_t *buf) { } size_t size = buf->size_in_bytes(); + if (halide_can_reuse_device_allocations(user_context)) { + size = quantize_allocation_size(size); + } + halide_assert(user_context, size != 0); if (buf->device) { halide_assert(user_context, validate_device_pointer(user_context, buf, size)); @@ -830,26 +981,34 @@ WEAK int halide_opencl_device_malloc(void *user_context, halide_buffer_t *buf) { uint64_t t_before = halide_current_time_ns(user_context); #endif - device_handle *dev_handle = (device_handle *)malloc(sizeof(device_handle)); - if (dev_handle == NULL) { - return CL_OUT_OF_HOST_MEMORY; + device_handle *dev_handle = NULL; + if (halide_can_reuse_device_allocations(user_context)) { + dev_handle = retrieve_allocation_from_cache(user_context, ctx, size); } - cl_int err; - debug(user_context) << " clCreateBuffer -> " << (int)size << " "; - cl_mem dev_ptr = clCreateBuffer(ctx.context, CL_MEM_READ_WRITE, size, NULL, &err); - if (err != CL_SUCCESS || dev_ptr == 0) { - debug(user_context) << get_opencl_error_name(err) << "\n"; - error(user_context) << "CL: clCreateBuffer failed: " - << get_opencl_error_name(err); - free(dev_handle); - return err; - } else { - debug(user_context) << (void *)dev_ptr << " device_handle: " << dev_handle << "\n"; + if (!dev_handle) { + dev_handle = (device_handle *) malloc(sizeof(device_handle)); + if (dev_handle == NULL) { + return CL_OUT_OF_HOST_MEMORY; + } + + cl_int err; + debug(user_context) << " clCreateBuffer -> " << (int)size << " "; + cl_mem dev_ptr = clCreateBuffer(ctx.context, CL_MEM_READ_WRITE, size, NULL, &err); + if (err != CL_SUCCESS || dev_ptr == 0) { + debug(user_context) << get_opencl_error_name(err) << "\n"; + error(user_context) << "CL: clCreateBuffer failed: " + << get_opencl_error_name(err); + free(dev_handle); + return err; + } else { + debug(user_context) << (void 
*)dev_ptr << " device_handle: " << dev_handle << "\n"; + } + + dev_handle->mem = dev_ptr; + dev_handle->offset = 0; } - dev_handle->mem = dev_ptr; - dev_handle->offset = 0; buf->device = (uint64_t)dev_handle; buf->device_interface = &opencl_device_interface; buf->device_interface->impl->use_module(); From 64d6117a27e8f66835d04095cad5527c46c83694 Mon Sep 17 00:00:00 2001 From: Mirsad Makalic Date: Fri, 4 Dec 2020 10:52:29 +0000 Subject: [PATCH 3/9] - Merge fixes --- src/runtime/opencl.cpp | 54 +++++++++++++++++++++++------------------- 1 file changed, 30 insertions(+), 24 deletions(-) diff --git a/src/runtime/opencl.cpp b/src/runtime/opencl.cpp index 58f973d45355..0f7d8c6546ef 100644 --- a/src/runtime/opencl.cpp +++ b/src/runtime/opencl.cpp @@ -577,7 +577,7 @@ WEAK int halide_opencl_release_unused_device_allocations(void *user_context) { { ScopedMutexLock lock(&free_list_lock); to_free = free_list; - free_list = NULL; + free_list = nullptr; } while (to_free) { debug(user_context) << " clReleaseMemObject " << (void *)to_free->ptr->mem << "\n"; @@ -598,7 +598,7 @@ namespace Halide { namespace Runtime { namespace Internal { -WEAK halide_device_allocation_pool opencl_allocation_pool = {NULL, NULL}; +WEAK halide_device_allocation_pool opencl_allocation_pool = {nullptr, nullptr}; __attribute__((constructor)) WEAK void register_opencl_allocation_pool() { @@ -641,11 +641,11 @@ device_handle *retrieve_allocation_from_cache(void *user_context, ClContext& cl_ // 7/8 of the size of the bucket. We keep at most 32 unused // allocations. We round up each allocation size to its top 4 // most significant bits (see quantize_allocation_size). 
- device_handle *result = NULL; + device_handle *result = nullptr; - FreeListItem *best = NULL, *item = free_list; - FreeListItem **best_prev = NULL, **prev_ptr = &free_list; - FreeListItem *to_free = NULL; + FreeListItem *best = nullptr, *item = free_list; + FreeListItem **best_prev = nullptr, **prev_ptr = &free_list; + FreeListItem *to_free = nullptr; int depth = 0; while (item) { @@ -653,7 +653,7 @@ device_handle *retrieve_allocation_from_cache(void *user_context, ClContext& cl_ (size >= (item->size / 8) * 7) && // Not too much slop (cl_ctx.context == item->ctx) && // Same cuda context (cl_ctx.cmd_queue == item->stream) && // Can only safely re-use on the same stream on which it was freed - ((best == NULL) || (best->size > item->size))) { // Better than previous best fit + ((best == nullptr) || (best->size > item->size))) { // Better than previous best fit best = item; best_prev = prev_ptr; prev_ptr = &item->next; @@ -664,8 +664,8 @@ device_handle *retrieve_allocation_from_cache(void *user_context, ClContext& cl_ // and defer the actual cuMemFree calls until after we // release the free_list_lock. 
to_free = item; - *prev_ptr = NULL; - item = NULL; + *prev_ptr = nullptr; + item = nullptr; break; } else { prev_ptr = &item->next; @@ -1033,23 +1033,29 @@ WEAK int halide_opencl_device_malloc(void *user_context, halide_buffer_t *buf) { uint64_t t_before = halide_current_time_ns(user_context); #endif - device_handle *dev_handle = (device_handle *)malloc(sizeof(device_handle)); - if (dev_handle == nullptr) { - return CL_OUT_OF_HOST_MEMORY; + device_handle *dev_handle = nullptr; + if (halide_can_reuse_device_allocations(user_context)) { + dev_handle = retrieve_allocation_from_cache(user_context, ctx, size); } - cl_int err; - debug(user_context) << " clCreateBuffer -> " << (int)size << " "; - cl_mem dev_ptr = clCreateBuffer(ctx.context, CL_MEM_READ_WRITE, size, NULL, &err); - if (err != CL_SUCCESS || dev_ptr == 0) { - debug(user_context) << get_opencl_error_name(err) << "\n"; - error(user_context) << "CL: clCreateBuffer failed: " - << get_opencl_error_name(err); - free(dev_handle); - return err; - } else { - debug(user_context) << (void *)dev_ptr << " device_handle: " << dev_handle << "\n"; - } + if (!dev_handle) { + dev_handle = (device_handle *)malloc(sizeof(device_handle)); + if (dev_handle == nullptr) { + return CL_OUT_OF_HOST_MEMORY; + } + + cl_int err; + debug(user_context) << " clCreateBuffer -> " << (int)size << " "; + cl_mem dev_ptr = clCreateBuffer(ctx.context, CL_MEM_READ_WRITE, size, nullptr, &err); + if (err != CL_SUCCESS || dev_ptr == 0) { + debug(user_context) << get_opencl_error_name(err) << "\n"; + error(user_context) << "CL: clCreateBuffer failed: " + << get_opencl_error_name(err); + free(dev_handle); + return err; + } else { + debug(user_context) << (void *)dev_ptr << " device_handle: " << dev_handle << "\n"; + } dev_handle->mem = dev_ptr; dev_handle->offset = 0; From 1658f97270999f72ca8baaeda57b9beb1f9f97ea Mon Sep 17 00:00:00 2001 From: Mirsad Makalic Date: Fri, 12 Feb 2021 11:20:42 +0000 Subject: [PATCH 4/9] - Backport OpenCL buffer caching --- 
src/runtime/opencl.cpp | 340 ++++++++++++++++++++++++++++++++++++++--- 1 file changed, 319 insertions(+), 21 deletions(-) diff --git a/src/runtime/opencl.cpp b/src/runtime/opencl.cpp index 5ee0d7559ab1..15ecac6907ef 100644 --- a/src/runtime/opencl.cpp +++ b/src/runtime/opencl.cpp @@ -4,6 +4,7 @@ #include "gpu_context_common.h" #include "printer.h" #include "scoped_spin_lock.h" +#include "scoped_mutex_lock.h" #include "mini_cl.h" @@ -232,6 +233,28 @@ WEAK int halide_release_cl_context(void *user_context) { return 0; } +WEAK int halide_opencl_release_unused_device_allocations(void *user_context) { + FreeListItem *to_free; + { + ScopedMutexLock lock(&free_list_lock); + to_free = free_list; + free_list = nullptr; + } + while (to_free) { + debug(user_context) << " clReleaseMemObject " << (void *)to_free->ptr->mem << "\n"; + cl_int err = clReleaseMemObject((cl_mem)to_free->ptr->mem); + free(to_free->ptr); + + if (err != CL_SUCCESS) { + debug(user_context) << " Error during clReleaseMemObject. Error code: " << err << "\n"; + } + FreeListItem *next = to_free->next; + free(to_free); + to_free = next; + } + return 0; +} + } // extern "C" namespace Halide { @@ -287,6 +310,16 @@ struct device_handle { cl_mem mem; }; +// A free list, used when allocations are being cached. 
+WEAK struct FreeListItem { + device_handle *ptr; + cl_context ctx; + cl_command_queue stream; + size_t size; + FreeListItem *next; +} *free_list = 0; +WEAK halide_mutex free_list_lock; + WEAK Halide::Internal::GPUCompilationCache compilation_cache; WEAK bool validate_device_pointer(void *user_context, halide_buffer_t *buf, size_t size = 0) { @@ -626,6 +659,103 @@ WEAK cl_program compile_kernel(void *user_context, cl_context ctx, const char *s return program; } +WEAK halide_device_allocation_pool opencl_allocation_pool = {nullptr, nullptr}; + +__attribute__((constructor)) +WEAK void register_opencl_allocation_pool() { + opencl_allocation_pool.release_unused = &halide_opencl_release_unused_device_allocations; + halide_register_device_allocation_pool(&opencl_allocation_pool); +} + +__attribute__((always_inline)) +WEAK uint64_t quantize_allocation_size(uint64_t sz) { + int z = __builtin_clzll(sz); + if (z < 60) { + sz--; + sz = sz >> (60 - z); + sz++; + sz = sz << (60 - z); + } + return sz; +} + +void cache_allocation(void *user_context, ClContext& cl_ctx, halide_buffer_t *buf) { + cl_mem dev_ptr = ((device_handle *)buf->device)->mem; + + debug(user_context) << " caching allocation for later use: " << (void *)(dev_ptr) << "\n"; + FreeListItem *item = (FreeListItem *)malloc(sizeof(FreeListItem)); + item->ctx = cl_ctx.context; + item->size = quantize_allocation_size(buf->size_in_bytes()); + item->ptr = (device_handle*)buf->device; + item->stream = cl_ctx.cmd_queue; + { + ScopedMutexLock lock(&free_list_lock); + item->next = free_list; + free_list = item; + } +} + +device_handle *retrieve_allocation_from_cache(void *user_context, ClContext& cl_ctx, const size_t size) { + ScopedMutexLock lock(&free_list_lock); + // Best-fit allocation. There are three tunable constants + // here. A bucket is claimed if the size requested is at least + // 7/8 of the size of the bucket. We keep at most 32 unused + // allocations. 
We round up each allocation size to its top 4 + // most significant bits (see quantize_allocation_size). + device_handle *result = nullptr; + + FreeListItem *best = nullptr, *item = free_list; + FreeListItem **best_prev = nullptr, **prev_ptr = &free_list; + FreeListItem *to_free = nullptr; + + int depth = 0; + while (item) { + if ((size <= item->size) && // Fits + (size >= (item->size / 8) * 7) && // Not too much slop + (cl_ctx.context == item->ctx) && // Same OpenCL context + (cl_ctx.cmd_queue == item->stream) && // Can only safely re-use on the same command queue on which it was freed + ((best == nullptr) || (best->size > item->size))) { // Better than previous best fit + best = item; + best_prev = prev_ptr; + prev_ptr = &item->next; + item = item->next; + } else if (depth > 32) { + // Allocations after here have not been used for a + // long time. Just detach the rest of the free list + // and defer the actual clReleaseMemObject calls until + // the scan of the free list is complete. + to_free = item; + *prev_ptr = nullptr; + item = nullptr; + break; + } else { + prev_ptr = &item->next; + item = item->next; + } + depth++; + } + + if (best) { + result = best->ptr; + *best_prev = best->next; + free(best); + } + + while (to_free) { + FreeListItem *next = to_free->next; + debug(user_context) << " clReleaseMemObject from allocation cache" << (void *)to_free->ptr->mem << "\n"; + cl_int err = clReleaseMemObject((cl_mem)to_free->ptr->mem); + free(to_free->ptr); + if (err != CL_SUCCESS) { + debug(user_context) << " Error during clReleaseMemObject. 
Error code: " << err << "\n"; + } + free(to_free); + to_free = next; + } + + return result; +} + } // namespace OpenCL } // namespace Internal } // namespace Runtime @@ -659,10 +789,19 @@ WEAK int halide_opencl_device_free(void *user_context, halide_buffer_t *buf) { halide_assert(user_context, validate_device_pointer(user_context, buf)); debug(user_context) << " clReleaseMemObject " << (void *)dev_ptr << "\n"; - cl_int result = clReleaseMemObject((cl_mem)dev_ptr); - // If clReleaseMemObject fails, it is unlikely to succeed in a later call, so - // we just end our reference to it regardless. - free((device_handle *)buf->device); + + cl_int result = CL_SUCCESS; + + if (halide_can_reuse_device_allocations(user_context)) { + cache_allocation(user_context, ctx, buf); + } else { + debug(user_context) << " clReleaseMemObject " << (void *)dev_ptr << "\n"; + result = clReleaseMemObject((cl_mem)dev_ptr); + // If clReleaseMemObject fails, it is unlikely to succeed in a later call, so + // we just end our reference to it regardless. + free((device_handle *)buf->device); + } + buf->device = 0; buf->device_interface->impl->release_module(); buf->device_interface = nullptr; @@ -816,6 +955,9 @@ WEAK int halide_opencl_device_release(void *user_context) { err = clFinish(q); halide_assert(user_context, err == CL_SUCCESS); + // Dump the contents of the free list, ignoring errors. + halide_opencl_release_unused_device_allocations(user_context); + compilation_cache.delete_context(user_context, ctx, clReleaseProgram); // Release the context itself, if we created it. 
@@ -848,6 +990,10 @@ WEAK int halide_opencl_device_malloc(void *user_context, halide_buffer_t *buf) { } size_t size = buf->size_in_bytes(); + if (halide_can_reuse_device_allocations(user_context)) { + size = quantize_allocation_size(size); + } + halide_assert(user_context, size != 0); if (buf->device) { halide_assert(user_context, validate_device_pointer(user_context, buf, size)); @@ -864,26 +1010,52 @@ WEAK int halide_opencl_device_malloc(void *user_context, halide_buffer_t *buf) { uint64_t t_before = halide_current_time_ns(user_context); #endif - device_handle *dev_handle = (device_handle *)malloc(sizeof(device_handle)); - if (dev_handle == nullptr) { - return CL_OUT_OF_HOST_MEMORY; - } + // device_handle *dev_handle = (device_handle *)malloc(sizeof(device_handle)); + // if (dev_handle == nullptr) { + // return CL_OUT_OF_HOST_MEMORY; + // } + + device_handle *dev_handle = nullptr; + if (halide_can_reuse_device_allocations(user_context)) { + dev_handle = retrieve_allocation_from_cache(user_context, ctx, size); + } + + // cl_int err; + // debug(user_context) << " clCreateBuffer -> " << (int)size << " "; + // cl_mem dev_ptr = clCreateBuffer(ctx.context, CL_MEM_READ_WRITE, size, nullptr, &err); + // if (err != CL_SUCCESS || dev_ptr == nullptr) { + // debug(user_context) << get_opencl_error_name(err) << "\n"; + // error(user_context) << "CL: clCreateBuffer failed: " + // << get_opencl_error_name(err); + // free(dev_handle); + // return err; + // } else { + // debug(user_context) << (void *)dev_ptr << " device_handle: " << dev_handle << "\n"; + // } + + if (!dev_handle) { + dev_handle = (device_handle *) malloc(sizeof(device_handle)); + if (dev_handle == nullptr) { + return CL_OUT_OF_HOST_MEMORY; + } - cl_int err; - debug(user_context) << " clCreateBuffer -> " << (int)size << " "; - cl_mem dev_ptr = clCreateBuffer(ctx.context, CL_MEM_READ_WRITE, size, nullptr, &err); - if (err != CL_SUCCESS || dev_ptr == nullptr) { - debug(user_context) << get_opencl_error_name(err) << 
"\n"; - error(user_context) << "CL: clCreateBuffer failed: " - << get_opencl_error_name(err); - free(dev_handle); - return err; - } else { - debug(user_context) << (void *)dev_ptr << " device_handle: " << dev_handle << "\n"; + cl_int err; + debug(user_context) << " clCreateBuffer -> " << (int)size << " "; + cl_mem dev_ptr = clCreateBuffer(ctx.context, CL_MEM_READ_WRITE, size, nullptr, &err); + if (err != CL_SUCCESS || dev_ptr == 0) { + debug(user_context) << get_opencl_error_name(err) << "\n"; + error(user_context) << "CL: clCreateBuffer failed: " + << get_opencl_error_name(err); + free(dev_handle); + return err; + } else { + debug(user_context) << (void *)dev_ptr << " device_handle: " << dev_handle << "\n"; + } + + dev_handle->mem = dev_ptr; + dev_handle->offset = 0; } - dev_handle->mem = dev_ptr; - dev_handle->offset = 0; buf->device = (uint64_t)dev_handle; buf->device_interface = &opencl_device_interface; buf->device_interface->impl->use_module(); @@ -1528,6 +1700,132 @@ WEAK halide_device_interface_t opencl_device_interface = { } // namespace Runtime } // namespace Halide +WEAK int halide_opencl_release_unused_device_allocations(void *user_context) { + FreeListItem *to_free; + { + ScopedMutexLock lock(&free_list_lock); + to_free = free_list; + free_list = nullptr; + } + while (to_free) { + debug(user_context) << " clReleaseMemObject " << (void *)to_free->ptr->mem << "\n"; + cl_int err = clReleaseMemObject((cl_mem)to_free->ptr->mem); + free(to_free->ptr); + + if (err != CL_SUCCESS) { + debug(user_context) << " Error during clReleaseMemObject. 
Error code: " << err << "\n"; + } + FreeListItem *next = to_free->next; + free(to_free); + to_free = next; + } + return 0; +} + +namespace Halide { +namespace Runtime { +namespace Internal { + +WEAK halide_device_allocation_pool opencl_allocation_pool = {nullptr, nullptr}; + +__attribute__((constructor)) +WEAK void register_opencl_allocation_pool() { + opencl_allocation_pool.release_unused = &halide_opencl_release_unused_device_allocations; + halide_register_device_allocation_pool(&opencl_allocation_pool); +} + +__attribute__((always_inline)) +WEAK uint64_t quantize_allocation_size(uint64_t sz) { + int z = __builtin_clzll(sz); + if (z < 60) { + sz--; + sz = sz >> (60 - z); + sz++; + sz = sz << (60 - z); + } + return sz; +} + +void cache_allocation(void *user_context, ClContext& cl_ctx, halide_buffer_t *buf) { + cl_mem dev_ptr = ((device_handle *)buf->device)->mem; + + debug(user_context) << " caching allocation for later use: " << (void *)(dev_ptr) << "\n"; + FreeListItem *item = (FreeListItem *)malloc(sizeof(FreeListItem)); + item->ctx = cl_ctx.context; + item->size = quantize_allocation_size(buf->size_in_bytes()); + item->ptr = (device_handle*)buf->device; + item->stream = cl_ctx.cmd_queue; + { + ScopedMutexLock lock(&free_list_lock); + item->next = free_list; + free_list = item; + } +} + +device_handle *retrieve_allocation_from_cache(void *user_context, ClContext& cl_ctx, const size_t size) { + ScopedMutexLock lock(&free_list_lock); + // Best-fit allocation. There are three tunable constants + // here. A bucket is claimed if the size requested is at least + // 7/8 of the size of the bucket. We keep at most 32 unused + // allocations. We round up each allocation size to its top 4 + // most significant bits (see quantize_allocation_size). 
+ device_handle *result = nullptr; + + FreeListItem *best = nullptr, *item = free_list; + FreeListItem **best_prev = nullptr, **prev_ptr = &free_list; + FreeListItem *to_free = nullptr; + + int depth = 0; + while (item) { + if ((size <= item->size) && // Fits + (size >= (item->size / 8) * 7) && // Not too much slop + (cl_ctx.context == item->ctx) && // Same OpenCL context + (cl_ctx.cmd_queue == item->stream) && // Can only safely re-use on the same command queue on which it was freed + ((best == nullptr) || (best->size > item->size))) { // Better than previous best fit + best = item; + best_prev = prev_ptr; + prev_ptr = &item->next; + item = item->next; + } else if (depth > 32) { + // Allocations after here have not been used for a + // long time. Just detach the rest of the free list + // and defer the actual clReleaseMemObject calls until + // the scan of the free list is complete. + to_free = item; + *prev_ptr = nullptr; + item = nullptr; + break; + } else { + prev_ptr = &item->next; + item = item->next; + } + depth++; + } + + if (best) { + result = best->ptr; + *best_prev = best->next; + free(best); + } + + while (to_free) { + FreeListItem *next = to_free->next; + debug(user_context) << " clReleaseMemObject from allocation cache" << (void *)to_free->ptr->mem << "\n"; + cl_int err = clReleaseMemObject((cl_mem)to_free->ptr->mem); + free(to_free->ptr); + if (err != CL_SUCCESS) { + debug(user_context) << " Error during clReleaseMemObject. 
Error code: " << err << "\n"; + } + free(to_free); + to_free = next; + } + + return result; +} + +}}} + + extern "C" { WEAK int halide_opencl_image_device_malloc(void *user_context, halide_buffer_t *buf) { From 899ed8abd9bd80a0e955e86e7c81722b95268f44 Mon Sep 17 00:00:00 2001 From: Mirsad Makalic Date: Fri, 12 Feb 2021 12:00:31 +0000 Subject: [PATCH 5/9] -Fixes --- src/runtime/opencl.cpp | 66 ++++++++++++++---------------------------- 1 file changed, 22 insertions(+), 44 deletions(-) diff --git a/src/runtime/opencl.cpp b/src/runtime/opencl.cpp index 15ecac6907ef..97df1909be6f 100644 --- a/src/runtime/opencl.cpp +++ b/src/runtime/opencl.cpp @@ -233,28 +233,6 @@ WEAK int halide_release_cl_context(void *user_context) { return 0; } -WEAK int halide_opencl_release_unused_device_allocations(void *user_context) { - FreeListItem *to_free; - { - ScopedMutexLock lock(&free_list_lock); - to_free = free_list; - free_list = nullptr; - } - while (to_free) { - debug(user_context) << " clReleaseMemObject " << (void *)to_free->ptr->mem << "\n"; - cl_int err = clReleaseMemObject((cl_mem)to_free->ptr->mem); - free(to_free->ptr); - - if (err != CL_SUCCESS) { - debug(user_context) << " Error during clReleaseMemObject. Error code: " << err << "\n"; - } - FreeListItem *next = to_free->next; - free(to_free); - to_free = next; - } - return 0; -} - } // extern "C" namespace Halide { @@ -659,6 +637,28 @@ WEAK cl_program compile_kernel(void *user_context, cl_context ctx, const char *s return program; } +WEAK int halide_opencl_release_unused_device_allocations(void *user_context) { + FreeListItem *to_free; + { + ScopedMutexLock lock(&free_list_lock); + to_free = free_list; + free_list = nullptr; + } + while (to_free) { + debug(user_context) << " clReleaseMemObject " << (void *)to_free->ptr->mem << "\n"; + cl_int err = clReleaseMemObject((cl_mem)to_free->ptr->mem); + free(to_free->ptr); + + if (err != CL_SUCCESS) { + debug(user_context) << " Error during clReleaseMemObject. 
Error code: " << err << "\n"; + } + FreeListItem *next = to_free->next; + free(to_free); + to_free = next; + } + return 0; +} + WEAK halide_device_allocation_pool opencl_allocation_pool = {nullptr, nullptr}; __attribute__((constructor)) @@ -1700,28 +1700,6 @@ WEAK halide_device_interface_t opencl_device_interface = { } // namespace Runtime } // namespace Halide -WEAK int halide_opencl_release_unused_device_allocations(void *user_context) { - FreeListItem *to_free; - { - ScopedMutexLock lock(&free_list_lock); - to_free = free_list; - free_list = nullptr; - } - while (to_free) { - debug(user_context) << " clReleaseMemObject " << (void *)to_free->ptr->mem << "\n"; - cl_int err = clReleaseMemObject((cl_mem)to_free->ptr->mem); - free(to_free->ptr); - - if (err != CL_SUCCESS) { - debug(user_context) << " Error during clReleaseMemObject. Error code: " << err << "\n"; - } - FreeListItem *next = to_free->next; - free(to_free); - to_free = next; - } - return 0; -} - namespace Halide { namespace Runtime { namespace Internal { From 03e128b8519c2752a75cc913bfcac7c08a7fdaad Mon Sep 17 00:00:00 2001 From: Mirsad Makalic Date: Wed, 24 Mar 2021 16:58:09 +0000 Subject: [PATCH 6/9] Enable 3d image writes --- src/CodeGen_OpenCL_Dev.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/CodeGen_OpenCL_Dev.cpp b/src/CodeGen_OpenCL_Dev.cpp index a5259d20fe52..4256ad1ffb02 100644 --- a/src/CodeGen_OpenCL_Dev.cpp +++ b/src/CodeGen_OpenCL_Dev.cpp @@ -1086,6 +1086,8 @@ void CodeGen_OpenCL_Dev::init_module() { src_stream << "#pragma OPENCL FP_CONTRACT ON\n"; + src_stream << "#pragma OPENCL EXTENSION cl_khr_3d_image_writes : enable\n"; + // Write out the Halide math functions. 
src_stream << "inline float float_from_bits(unsigned int x) {return as_float(x);}\n" << "inline float nan_f32() { return NAN; }\n" From 7b536b9d1f04e6bc0bcbceaf47d0564a3abdf843 Mon Sep 17 00:00:00 2001 From: Mirsad Makalic Date: Mon, 19 Apr 2021 09:18:04 +0100 Subject: [PATCH 7/9] Workaround for OpenCL failing on Pixel devices --- src/runtime/opencl.cpp | 51 ++++++++++++++++++++++++++++++++---------- 1 file changed, 39 insertions(+), 12 deletions(-) diff --git a/src/runtime/opencl.cpp b/src/runtime/opencl.cpp index 5fac41d4b6ff..59d58956cbc4 100644 --- a/src/runtime/opencl.cpp +++ b/src/runtime/opencl.cpp @@ -27,30 +27,57 @@ namespace OpenCL { WEAK void *lib_opencl = nullptr; extern "C" WEAK void *halide_opencl_get_symbol(void *user_context, const char *name) { + static void *(*wrapper_loadOpenCLPointer)(const char *); + // Only try to load the library if the library isn't already // loaded, or we can't load the symbol from the process already. - void *symbol = halide_get_library_symbol(lib_opencl, name); + void *symbol; + if (wrapper_loadOpenCLPointer) { + symbol = wrapper_loadOpenCLPointer(name); + } else { + symbol = halide_get_library_symbol(lib_opencl, name); + } if (symbol) { return symbol; } - const char *lib_names[] = { -#ifdef WINDOWS - "opencl.dll", -#else - "libOpenCL.so", - "/System/Library/Frameworks/OpenCL.framework/OpenCL", -#endif + const char *lib_wrapper_names[] = { + "/vendor/lib64/libOpenCL-pixel.so" }; - for (size_t i = 0; i < sizeof(lib_names) / sizeof(lib_names[0]); i++) { - lib_opencl = halide_load_library(lib_names[i]); + for (size_t i = 0; i < sizeof(lib_wrapper_names) / sizeof(lib_wrapper_names[0]); i++) { + lib_opencl = halide_load_library(lib_wrapper_names[i]); if (lib_opencl) { - debug(user_context) << " Loaded OpenCL runtime library: " << lib_names[i] << "\n"; + debug(user_context) << " Loaded OpenCL runtime library wrapper: " << lib_wrapper_names[i] << "\n"; + void (*func_enable)() = (void(*)())halide_get_library_symbol(lib_opencl, 
"enableOpenCL"); + func_enable(); + wrapper_loadOpenCLPointer = (void *(*)(const char *))halide_get_library_symbol(lib_opencl, "loadOpenCLPointer"); break; } } - return halide_get_library_symbol(lib_opencl, name); + if (!lib_opencl) { + const char *lib_names[] = { +#ifdef WINDOWS + "opencl.dll", +#else + "libOpenCL.so", + "/System/Library/Frameworks/OpenCL.framework/OpenCL" +#endif + }; + for (size_t i = 0; i < sizeof(lib_names) / sizeof(lib_names[0]); i++) { + lib_opencl = halide_load_library(lib_names[i]); + if (lib_opencl) { + debug(user_context) << " Loaded OpenCL runtime library: " << lib_names[i] << "\n"; + break; + } + } + } + + if (wrapper_loadOpenCLPointer) { + return wrapper_loadOpenCLPointer(name); + } else { + return halide_get_library_symbol(lib_opencl, name); + } } template From 4ed82c73d49c7b40450d6a7e62df9582f7bf2b1b Mon Sep 17 00:00:00 2001 From: Mirsad Makalic Date: Sat, 16 Oct 2021 11:24:24 +0100 Subject: [PATCH 8/9] - Merge fix --- src/runtime/opencl.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/runtime/opencl.cpp b/src/runtime/opencl.cpp index b3ad4bd6fabf..c6f60d5d5e73 100644 --- a/src/runtime/opencl.cpp +++ b/src/runtime/opencl.cpp @@ -65,9 +65,9 @@ extern "C" WEAK void *halide_opencl_get_symbol(void *user_context, const char *n #endif }; for (auto &lib_name : lib_names) { - lib_opencl = halide_load_library(lib_names[i]); + lib_opencl = halide_load_library(lib_name); if (lib_opencl) { - debug(user_context) << " Loaded OpenCL runtime library: " << lib_names[i] << "\n"; + debug(user_context) << " Loaded OpenCL runtime library: " << lib_name << "\n"; break; } } From 0441c0dd7531cfabefbc5cfc80e840a8ef4ce714 Mon Sep 17 00:00:00 2001 From: Mirsad Makalic Date: Tue, 21 Dec 2021 15:42:32 +0000 Subject: [PATCH 9/9] Merge fix --- src/runtime/opencl.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/runtime/opencl.cpp b/src/runtime/opencl.cpp index b05be8e71589..ba3b33b41b90 100644 --- 
a/src/runtime/opencl.cpp +++ b/src/runtime/opencl.cpp @@ -1040,7 +1040,7 @@ WEAK int halide_opencl_device_malloc(void *user_context, halide_buffer_t *buf) { size = quantize_allocation_size(size); } - halide_assert(user_context, size != 0); + halide_abort_if_false(user_context, size != 0); if (buf->device) { halide_abort_if_false(user_context, validate_device_pointer(user_context, buf, size)); return 0;