/*
 * Copyright © 2016 Red Hat.
 * Copyright © 2016 Bas Nieuwenhuizen
 *
 * based on amdgpu winsys.
 * Copyright © 2011 Marek Olšák
 * Copyright © 2015 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <stdio.h>

#include "radv_amdgpu_bo.h"
#include "radv_debug.h"

#include <amdgpu.h>
#include <inttypes.h>
#include <pthread.h>
#include <unistd.h>

#include "drm-uapi/amdgpu_drm.h"
#include "util/os_time.h"
#include "util/u_atomic.h"
#include "util/u_math.h"
#include "util/u_memory.h"

static void radv_amdgpu_winsys_bo_destroy(struct radeon_winsys *_ws, struct radeon_winsys_bo *_bo);

/* Wrapper around amdgpu_bo_va_op_raw that derives the VM page flags from the
 * winsys BO flags. With a NULL bo handle, internal_flags (e.g. PRT) are
 * passed through unchanged. */
static int
radv_amdgpu_bo_va_op(struct radv_amdgpu_winsys *ws, amdgpu_bo_handle bo, uint64_t offset,
                     uint64_t size, uint64_t addr, uint32_t bo_flags, uint64_t internal_flags,
                     uint32_t ops)
{
   uint64_t flags = internal_flags;
   if (bo) {
      flags = AMDGPU_VM_PAGE_READABLE | AMDGPU_VM_PAGE_EXECUTABLE;

      if ((bo_flags & RADEON_FLAG_VA_UNCACHED) && ws->info.chip_class >= GFX9)
         flags |= AMDGPU_VM_MTYPE_UC;

      if (!(bo_flags & RADEON_FLAG_READ_ONLY))
         flags |= AMDGPU_VM_PAGE_WRITEABLE;
   }

   size = align64(size, getpagesize());

   return amdgpu_bo_va_op_raw(ws->dev, bo, offset, size, addr, flags, ops);
}

/* Map one range of a sparse buffer: either point it at a backing BO (taking a
 * reference), or, for a NULL BO, mark it as a PRT range so accesses don't
 * fault. */
static void
radv_amdgpu_winsys_virtual_map(struct radv_amdgpu_winsys *ws, struct radv_amdgpu_winsys_bo *bo,
                               const struct radv_amdgpu_map_range *range)
{
   uint64_t internal_flags = 0;
   assert(range->size);

   if (!range->bo) {
      if (!ws->info.has_sparse_vm_mappings)
         return;

      internal_flags |= AMDGPU_VM_PAGE_PRT;
   } else
      p_atomic_inc(&range->bo->ref_count);

   int r = radv_amdgpu_bo_va_op(ws, range->bo ? range->bo->bo : NULL, range->bo_offset,
                                range->size, range->offset + bo->base.va, 0, internal_flags,
                                AMDGPU_VA_OP_MAP);
   if (r)
      abort();
}

static void
radv_amdgpu_winsys_virtual_unmap(struct radv_amdgpu_winsys *ws, struct radv_amdgpu_winsys_bo *bo,
                                 const struct radv_amdgpu_map_range *range)
{
   uint64_t internal_flags = 0;
   assert(range->size);

   if (!range->bo) {
      if (!ws->info.has_sparse_vm_mappings)
         return;

      /* Even though this is an unmap, if we don't set this flag, AMDGPU is
       * going to complain about the missing buffer. */
      internal_flags |= AMDGPU_VM_PAGE_PRT;
   }

   int r = radv_amdgpu_bo_va_op(ws, range->bo ? range->bo->bo : NULL, range->bo_offset,
                                range->size, range->offset + bo->base.va, 0, internal_flags,
                                AMDGPU_VA_OP_UNMAP);
   if (r)
      abort();

   if (range->bo)
      ws->base.buffer_destroy(&ws->base, (struct radeon_winsys_bo *)range->bo);
}
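/* Sparse (virtual) buffers keep a flat, deduplicated array of the distinct
 * BOs backing their ranges so that submission code can reference them
 * cheaply. The two helpers below maintain that array: gather every backing
 * BO, sort by pointer, then drop duplicates. */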
static int
bo_comparator(const void *ap, const void *bp)
{
   struct radv_amdgpu_winsys_bo *a = *(struct radv_amdgpu_winsys_bo *const *)ap;
   struct radv_amdgpu_winsys_bo *b = *(struct radv_amdgpu_winsys_bo *const *)bp;

   return (a > b) ? 1 : (a < b) ? -1 : 0;
}

static VkResult
radv_amdgpu_winsys_rebuild_bo_list(struct radv_amdgpu_winsys_bo *bo)
{
   if (bo->bo_capacity < bo->range_count) {
      uint32_t new_count = MAX2(bo->bo_capacity * 2, bo->range_count);
      struct radv_amdgpu_winsys_bo **bos =
         realloc(bo->bos, new_count * sizeof(struct radv_amdgpu_winsys_bo *));
      if (!bos)
         return VK_ERROR_OUT_OF_HOST_MEMORY;
      bo->bos = bos;
      bo->bo_capacity = new_count;
   }

   uint32_t temp_bo_count = 0;
   for (uint32_t i = 0; i < bo->range_count; ++i)
      if (bo->ranges[i].bo)
         bo->bos[temp_bo_count++] = bo->ranges[i].bo;

   qsort(bo->bos, temp_bo_count, sizeof(struct radv_amdgpu_winsys_bo *), &bo_comparator);

   if (temp_bo_count == 0) {
      bo->bo_count = 0;
   } else {
      uint32_t final_bo_count = 1;
      for (uint32_t i = 1; i < temp_bo_count; ++i)
         if (bo->bos[i] != bo->bos[i - 1])
            bo->bos[final_bo_count++] = bo->bos[i];

      bo->bo_count = final_bo_count;
   }

   return VK_SUCCESS;
}
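/* Bind a BO (or NULL to unbind) to the range [offset, offset + size) of a
 * sparse buffer, splitting and merging the existing ranges as needed.
 *
 * Worked example with hypothetical values: if the parent has the single range
 * [0, 1024) backed by BO A and we bind BO B at offset 256 with size 256, the
 * range is split into three:
 *
 *    [0, 256) -> A,  [256, 512) -> B,  [512, 1024) -> A
 *
 * Here first == last == 0 and neither boundary matches exactly, so
 * remove_first and remove_last stay false and range_count grows by 2. */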
static VkResult
radv_amdgpu_winsys_bo_virtual_bind(struct radeon_winsys *_ws, struct radeon_winsys_bo *_parent,
                                   uint64_t offset, uint64_t size, struct radeon_winsys_bo *_bo,
                                   uint64_t bo_offset)
{
   struct radv_amdgpu_winsys *ws = radv_amdgpu_winsys(_ws);
   struct radv_amdgpu_winsys_bo *parent = (struct radv_amdgpu_winsys_bo *)_parent;
   struct radv_amdgpu_winsys_bo *bo = (struct radv_amdgpu_winsys_bo *)_bo;
   int range_count_delta, new_idx;
   int first = 0, last;
   struct radv_amdgpu_map_range new_first, new_last;
   VkResult result;

   assert(parent->is_virtual);
   assert(!bo || !bo->is_virtual);

   /* We have at most 2 new ranges (1 by the bind, and another one by splitting a range that
    * contains the newly bound range). */
   if (parent->range_capacity - parent->range_count < 2) {
      uint32_t range_capacity = parent->range_capacity + 2;
      struct radv_amdgpu_map_range *ranges =
         realloc(parent->ranges, range_capacity * sizeof(struct radv_amdgpu_map_range));
      if (!ranges)
         return VK_ERROR_OUT_OF_HOST_MEMORY;
      parent->ranges = ranges;
      parent->range_capacity = range_capacity;
   }

   /*
    * [first, last] is exactly the range of ranges that either overlap the
    * new range, or are adjacent to it. This corresponds to the bind ranges
    * that may change.
    */
   while (first + 1 < parent->range_count &&
          parent->ranges[first].offset + parent->ranges[first].size < offset)
      ++first;

   last = first;
   while (last + 1 < parent->range_count && parent->ranges[last + 1].offset <= offset + size)
      ++last;

   /* Whether the first or last range are going to be totally removed or just
    * resized/left alone. Note that in the case of first == last, we will split
    * this into a part before and after the new range. The remove flag is then
    * whether to not create the corresponding split part. */
   bool remove_first = parent->ranges[first].offset == offset;
   bool remove_last = parent->ranges[last].offset + parent->ranges[last].size == offset + size;
   bool unmapped_first = false;

   assert(parent->ranges[first].offset <= offset);
   assert(parent->ranges[last].offset + parent->ranges[last].size >= offset + size);

   /* Try to merge the new range with the first range. */
   if (parent->ranges[first].bo == bo &&
       (!bo ||
        offset - bo_offset == parent->ranges[first].offset - parent->ranges[first].bo_offset)) {
      size += offset - parent->ranges[first].offset;
      offset = parent->ranges[first].offset;
      bo_offset = parent->ranges[first].bo_offset;
      remove_first = true;
   }

   /* Try to merge the new range with the last range. */
   if (parent->ranges[last].bo == bo &&
       (!bo ||
        offset - bo_offset == parent->ranges[last].offset - parent->ranges[last].bo_offset)) {
      size = parent->ranges[last].offset + parent->ranges[last].size - offset;
      remove_last = true;
   }

   range_count_delta = 1 - (last - first + 1) + !remove_first + !remove_last;
   new_idx = first + !remove_first;

   /* Any range between first and last is going to be entirely covered by the new range so just
    * unmap them. */
   for (int i = first + 1; i < last; ++i)
      radv_amdgpu_winsys_virtual_unmap(ws, parent, parent->ranges + i);

   /* If the first/last range are not left alone we unmap them and optionally map
    * them again after modifications. Note that this implicitly can do the splitting
    * if first == last. */
   new_first = parent->ranges[first];
   new_last = parent->ranges[last];

   if (parent->ranges[first].offset + parent->ranges[first].size > offset || remove_first) {
      radv_amdgpu_winsys_virtual_unmap(ws, parent, parent->ranges + first);
      unmapped_first = true;

      if (!remove_first) {
         new_first.size = offset - new_first.offset;
         radv_amdgpu_winsys_virtual_map(ws, parent, &new_first);
      }
   }

   if (parent->ranges[last].offset < offset + size || remove_last) {
      if (first != last || !unmapped_first)
         radv_amdgpu_winsys_virtual_unmap(ws, parent, parent->ranges + last);

      if (!remove_last) {
         new_last.size -= offset + size - new_last.offset;
         new_last.bo_offset += (offset + size - new_last.offset);
         new_last.offset = offset + size;
         radv_amdgpu_winsys_virtual_map(ws, parent, &new_last);
      }
   }

   /* Moves the range list after last to account for the changed number of ranges. */
   memmove(parent->ranges + last + 1 + range_count_delta, parent->ranges + last + 1,
           sizeof(struct radv_amdgpu_map_range) * (parent->range_count - last - 1));

   if (!remove_first)
      parent->ranges[first] = new_first;

   if (!remove_last)
      parent->ranges[new_idx + 1] = new_last;

   /* Actually set up the new range. */
   parent->ranges[new_idx].offset = offset;
   parent->ranges[new_idx].size = size;
   parent->ranges[new_idx].bo = bo;
   parent->ranges[new_idx].bo_offset = bo_offset;

   radv_amdgpu_winsys_virtual_map(ws, parent, parent->ranges + new_idx);

   parent->range_count += range_count_delta;

   result = radv_amdgpu_winsys_rebuild_bo_list(parent);
   if (result != VK_SUCCESS)
      return result;

   return VK_SUCCESS;
}

struct radv_amdgpu_winsys_bo_log {
   struct list_head list;
   uint64_t va;
   uint64_t size;
   uint64_t timestamp; /* CPU timestamp */
   uint8_t is_virtual : 1;
   uint8_t destroyed : 1;
};

static void
radv_amdgpu_log_bo(struct radv_amdgpu_winsys *ws, struct radv_amdgpu_winsys_bo *bo,
                   bool destroyed)
{
   struct radv_amdgpu_winsys_bo_log *bo_log = NULL;

   if (!ws->debug_log_bos)
      return;

   bo_log = malloc(sizeof(*bo_log));
   if (!bo_log)
      return;

   bo_log->va = bo->base.va;
   bo_log->size = bo->size;
   bo_log->timestamp = os_time_get_nano();
   bo_log->is_virtual = bo->is_virtual;
   bo_log->destroyed = destroyed;

   u_rwlock_wrlock(&ws->log_bo_list_lock);
   list_addtail(&bo_log->list, &ws->log_bo_list);
   u_rwlock_wrunlock(&ws->log_bo_list_lock);
}

static VkResult
radv_amdgpu_global_bo_list_add(struct radv_amdgpu_winsys *ws, struct radv_amdgpu_winsys_bo *bo)
{
   u_rwlock_wrlock(&ws->global_bo_list.lock);
   if (ws->global_bo_list.count == ws->global_bo_list.capacity) {
      unsigned capacity = MAX2(4, ws->global_bo_list.capacity * 2);
      void *data =
         realloc(ws->global_bo_list.bos, capacity * sizeof(struct radv_amdgpu_winsys_bo *));
      if (!data) {
         u_rwlock_wrunlock(&ws->global_bo_list.lock);
         return VK_ERROR_OUT_OF_HOST_MEMORY;
      }

      ws->global_bo_list.bos = (struct radv_amdgpu_winsys_bo **)data;
      ws->global_bo_list.capacity = capacity;
   }

   ws->global_bo_list.bos[ws->global_bo_list.count++] = bo;
   bo->base.use_global_list = true;
   u_rwlock_wrunlock(&ws->global_bo_list.lock);
   return VK_SUCCESS;
}

static void
radv_amdgpu_global_bo_list_del(struct radv_amdgpu_winsys *ws, struct radv_amdgpu_winsys_bo *bo)
{
   u_rwlock_wrlock(&ws->global_bo_list.lock);
   for (unsigned i = ws->global_bo_list.count; i-- > 0;) {
      if (ws->global_bo_list.bos[i] == bo) {
         ws->global_bo_list.bos[i] = ws->global_bo_list.bos[ws->global_bo_list.count - 1];
         --ws->global_bo_list.count;
         bo->base.use_global_list = false;
         break;
      }
   }
   u_rwlock_wrunlock(&ws->global_bo_list.lock);
}

static void
radv_amdgpu_winsys_bo_destroy(struct radeon_winsys *_ws, struct radeon_winsys_bo *_bo)
{
   struct radv_amdgpu_winsys *ws = radv_amdgpu_winsys(_ws);
   struct radv_amdgpu_winsys_bo *bo = radv_amdgpu_winsys_bo(_bo);

   if (p_atomic_dec_return(&bo->ref_count))
      return;

   radv_amdgpu_log_bo(ws, bo, true);

   if (bo->is_virtual) {
      for (uint32_t i = 0; i < bo->range_count; ++i) {
         radv_amdgpu_winsys_virtual_unmap(ws, bo, bo->ranges + i);
      }
      free(bo->bos);
      free(bo->ranges);
   } else {
      if (ws->debug_all_bos)
         radv_amdgpu_global_bo_list_del(ws, bo);
      radv_amdgpu_bo_va_op(ws, bo->bo, 0, bo->size, bo->base.va, 0, 0, AMDGPU_VA_OP_UNMAP);
      amdgpu_bo_free(bo->bo);
   }

   if (bo->base.initial_domain & RADEON_DOMAIN_VRAM) {
      if (bo->base.vram_no_cpu_access) {
         p_atomic_add(&ws->allocated_vram, -align64(bo->size, ws->info.gart_page_size));
      } else {
         p_atomic_add(&ws->allocated_vram_vis, -align64(bo->size, ws->info.gart_page_size));
      }
   }

   if (bo->base.initial_domain & RADEON_DOMAIN_GTT)
      p_atomic_add(&ws->allocated_gtt, -align64(bo->size, ws->info.gart_page_size));

   amdgpu_va_range_free(bo->va_handle);
   FREE(bo);
}
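/* Allocate a new BO: reserve a VA range, then either set the BO up as a
 * sparse buffer (RADEON_FLAG_VIRTUAL) with a single unbacked range, or
 * allocate backing memory through libdrm_amdgpu and map it at the reserved
 * address. */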
static VkResult
radv_amdgpu_winsys_bo_create(struct radeon_winsys *_ws, uint64_t size, unsigned alignment,
                             enum radeon_bo_domain initial_domain, enum radeon_bo_flag flags,
                             unsigned priority, uint64_t replay_address,
                             struct radeon_winsys_bo **out_bo)
{
   struct radv_amdgpu_winsys *ws = radv_amdgpu_winsys(_ws);
   struct radv_amdgpu_winsys_bo *bo;
   struct amdgpu_bo_alloc_request request = {0};
   struct radv_amdgpu_map_range *ranges = NULL;
   amdgpu_bo_handle buf_handle;
   uint64_t va = 0;
   amdgpu_va_handle va_handle;
   int r;
   VkResult result = VK_SUCCESS;

   /* Just be robust for callers that might use NULL-ness for determining if things should be
    * freed. */
   *out_bo = NULL;

   bo = CALLOC_STRUCT(radv_amdgpu_winsys_bo);
   if (!bo) {
      return VK_ERROR_OUT_OF_HOST_MEMORY;
   }

   unsigned virt_alignment = alignment;
   if (size >= ws->info.pte_fragment_size)
      virt_alignment = MAX2(virt_alignment, ws->info.pte_fragment_size);

   assert(!replay_address || (flags & RADEON_FLAG_REPLAYABLE));

   const uint64_t va_flags = AMDGPU_VA_RANGE_HIGH |
                             (flags & RADEON_FLAG_32BIT ? AMDGPU_VA_RANGE_32_BIT : 0) |
                             (flags & RADEON_FLAG_REPLAYABLE ? AMDGPU_VA_RANGE_REPLAYABLE : 0);
   r = amdgpu_va_range_alloc(ws->dev, amdgpu_gpu_va_range_general, size, virt_alignment,
                             replay_address, &va, &va_handle, va_flags);
   if (r) {
      result = replay_address ? VK_ERROR_INVALID_OPAQUE_CAPTURE_ADDRESS
                              : VK_ERROR_OUT_OF_DEVICE_MEMORY;
      goto error_va_alloc;
   }

   bo->base.va = va;
   bo->va_handle = va_handle;
   bo->size = size;
   bo->is_virtual = !!(flags & RADEON_FLAG_VIRTUAL);
   bo->ref_count = 1;

   if (flags & RADEON_FLAG_VIRTUAL) {
      ranges = realloc(NULL, sizeof(struct radv_amdgpu_map_range));
      if (!ranges) {
         result = VK_ERROR_OUT_OF_HOST_MEMORY;
         goto error_ranges_alloc;
      }

      bo->ranges = ranges;
      bo->range_count = 1;
      bo->range_capacity = 1;

      bo->ranges[0].offset = 0;
      bo->ranges[0].size = size;
      bo->ranges[0].bo = NULL;
      bo->ranges[0].bo_offset = 0;

      radv_amdgpu_winsys_virtual_map(ws, bo, bo->ranges);
      radv_amdgpu_log_bo(ws, bo, false);

      *out_bo = (struct radeon_winsys_bo *)bo;
      return VK_SUCCESS;
   }

   request.alloc_size = size;
   request.phys_alignment = alignment;

   if (initial_domain & RADEON_DOMAIN_VRAM) {
      request.preferred_heap |= AMDGPU_GEM_DOMAIN_VRAM;

      /* Since VRAM and GTT have almost the same performance on
       * APUs, we could just set GTT. However, in order to decrease
       * GTT(RAM) usage, which is shared with the OS, allow VRAM
       * placements too. The idea is not to use VRAM usefully, but
       * to use it so that it's not unused and wasted.
       *
       * Furthermore, even on discrete GPUs this is beneficial. If
       * both GTT and VRAM are set then AMDGPU still prefers VRAM
       * for the initial placement, but it makes the buffers
       * spillable. Otherwise AMDGPU tries to place the buffers in
       * VRAM really hard to the extent that we are getting a lot
       * of unnecessary movement. This helps significantly when
       * e.g. Horizon Zero Dawn allocates more memory than we have
       * VRAM. */
      request.preferred_heap |= AMDGPU_GEM_DOMAIN_GTT;
   }

   if (initial_domain & RADEON_DOMAIN_GTT)
      request.preferred_heap |= AMDGPU_GEM_DOMAIN_GTT;
   if (initial_domain & RADEON_DOMAIN_GDS)
      request.preferred_heap |= AMDGPU_GEM_DOMAIN_GDS;
   if (initial_domain & RADEON_DOMAIN_OA)
      request.preferred_heap |= AMDGPU_GEM_DOMAIN_OA;

   if (flags & RADEON_FLAG_CPU_ACCESS)
      request.flags |= AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED;
   if (flags & RADEON_FLAG_NO_CPU_ACCESS) {
      bo->base.vram_no_cpu_access = initial_domain & RADEON_DOMAIN_VRAM;
      request.flags |= AMDGPU_GEM_CREATE_NO_CPU_ACCESS;
   }
   if (flags & RADEON_FLAG_GTT_WC)
      request.flags |= AMDGPU_GEM_CREATE_CPU_GTT_USWC;
   if (!(flags & RADEON_FLAG_IMPLICIT_SYNC))
      request.flags |= AMDGPU_GEM_CREATE_EXPLICIT_SYNC;
   if (flags & RADEON_FLAG_NO_INTERPROCESS_SHARING &&
       ((ws->perftest & RADV_PERFTEST_LOCAL_BOS) || (flags & RADEON_FLAG_PREFER_LOCAL_BO))) {
      bo->base.is_local = true;
      request.flags |= AMDGPU_GEM_CREATE_VM_ALWAYS_VALID;
   }

   /* this won't do anything on pre-4.9 kernels */
   if (initial_domain & RADEON_DOMAIN_VRAM) {
      if (ws->zero_all_vram_allocs || (flags & RADEON_FLAG_ZERO_VRAM))
         request.flags |= AMDGPU_GEM_CREATE_VRAM_CLEARED;
   }

   r = amdgpu_bo_alloc(ws->dev, &request, &buf_handle);
   if (r) {
      fprintf(stderr, "amdgpu: Failed to allocate a buffer:\n");
      fprintf(stderr, "amdgpu:    size      : %" PRIu64 " bytes\n", size);
      fprintf(stderr, "amdgpu:    alignment : %u bytes\n", alignment);
      fprintf(stderr, "amdgpu:    domains   : %u\n", initial_domain);
      result = VK_ERROR_OUT_OF_DEVICE_MEMORY;
      goto error_bo_alloc;
   }

   r = radv_amdgpu_bo_va_op(ws, buf_handle, 0, size, va, flags, 0, AMDGPU_VA_OP_MAP);
   if (r) {
      result = VK_ERROR_UNKNOWN;
      goto error_va_map;
   }

   bo->bo = buf_handle;
   bo->base.initial_domain = initial_domain;
   bo->base.use_global_list = bo->base.is_local;
   bo->is_shared = false;
   bo->priority = priority;

   r = amdgpu_bo_export(buf_handle, amdgpu_bo_handle_type_kms, &bo->bo_handle);
   assert(!r);

   if (initial_domain & RADEON_DOMAIN_VRAM) {
      /* Buffers allocated in VRAM with the NO_CPU_ACCESS flag
       * aren't mappable and they are counted as part of the VRAM
       * counter.
       *
       * Otherwise, buffers with the CPU_ACCESS flag or with neither
       * flag (imported buffers) are counted as part of the VRAM
       * visible counter because they can be mapped.
       */
      if (bo->base.vram_no_cpu_access) {
         p_atomic_add(&ws->allocated_vram, align64(bo->size, ws->info.gart_page_size));
      } else {
         p_atomic_add(&ws->allocated_vram_vis, align64(bo->size, ws->info.gart_page_size));
      }
   }

   if (initial_domain & RADEON_DOMAIN_GTT)
      p_atomic_add(&ws->allocated_gtt, align64(bo->size, ws->info.gart_page_size));

   if (ws->debug_all_bos)
      radv_amdgpu_global_bo_list_add(ws, bo);
   radv_amdgpu_log_bo(ws, bo, false);

   *out_bo = (struct radeon_winsys_bo *)bo;
   return VK_SUCCESS;

error_va_map:
   amdgpu_bo_free(buf_handle);

error_bo_alloc:
   free(ranges);

error_ranges_alloc:
   amdgpu_va_range_free(va_handle);

error_va_alloc:
   FREE(bo);
   return result;
}

static void *
radv_amdgpu_winsys_bo_map(struct radeon_winsys_bo *_bo)
{
   struct radv_amdgpu_winsys_bo *bo = radv_amdgpu_winsys_bo(_bo);
   int ret;
   void *data;
   ret = amdgpu_bo_cpu_map(bo->bo, &data);
   if (ret)
      return NULL;
   return data;
}

static void
radv_amdgpu_winsys_bo_unmap(struct radeon_winsys_bo *_bo)
{
   struct radv_amdgpu_winsys_bo *bo = radv_amdgpu_winsys_bo(_bo);
   amdgpu_bo_cpu_unmap(bo->bo);
}
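/* Pick a VA alignment that lets the kernel use larger PTE fragments and, on
 * GFX9+, align to the highest set bit of the size. Example with hypothetical
 * numbers: a 200 KiB buffer with a 64 KiB PTE fragment size first gets 64 KiB
 * alignment from the fragment rule, then 128 KiB (the highest set bit of
 * 200 KiB) from the GFX9 rule. */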
static uint64_t
radv_amdgpu_get_optimal_vm_alignment(struct radv_amdgpu_winsys *ws, uint64_t size,
                                     unsigned alignment)
{
   uint64_t vm_alignment = alignment;

   /* Increase the VM alignment for faster address translation. */
   if (size >= ws->info.pte_fragment_size)
      vm_alignment = MAX2(vm_alignment, ws->info.pte_fragment_size);

   /* Gfx9: Increase the VM alignment to the most significant bit set
    * in the size for faster address translation. */
   if (ws->info.chip_class >= GFX9) {
      unsigned msb = util_last_bit64(size); /* 0 = no bit is set */
      uint64_t msb_alignment = msb ? 1ull << (msb - 1) : 0;

      vm_alignment = MAX2(vm_alignment, msb_alignment);
   }
   return vm_alignment;
}

static VkResult
radv_amdgpu_winsys_bo_from_ptr(struct radeon_winsys *_ws, void *pointer, uint64_t size,
                               unsigned priority, struct radeon_winsys_bo **out_bo)
{
   struct radv_amdgpu_winsys *ws = radv_amdgpu_winsys(_ws);
   amdgpu_bo_handle buf_handle;
   struct radv_amdgpu_winsys_bo *bo;
   uint64_t va;
   amdgpu_va_handle va_handle;
   uint64_t vm_alignment;
   VkResult result = VK_SUCCESS;

   /* Just be robust for callers that might use NULL-ness for determining if things should be
    * freed. */
   *out_bo = NULL;

   bo = CALLOC_STRUCT(radv_amdgpu_winsys_bo);
   if (!bo)
      return VK_ERROR_OUT_OF_HOST_MEMORY;

   if (amdgpu_create_bo_from_user_mem(ws->dev, pointer, size, &buf_handle)) {
      result = VK_ERROR_OUT_OF_DEVICE_MEMORY;
      goto error;
   }

   /* Using the optimal VM alignment also fixes GPU hangs for buffers that
    * are imported. */
   vm_alignment = radv_amdgpu_get_optimal_vm_alignment(ws, size, ws->info.gart_page_size);

   if (amdgpu_va_range_alloc(ws->dev, amdgpu_gpu_va_range_general, size, vm_alignment, 0, &va,
                             &va_handle, AMDGPU_VA_RANGE_HIGH)) {
      result = VK_ERROR_OUT_OF_DEVICE_MEMORY;
      goto error_va_alloc;
   }

   if (amdgpu_bo_va_op(buf_handle, 0, size, va, 0, AMDGPU_VA_OP_MAP)) {
      result = VK_ERROR_UNKNOWN;
      goto error_va_map;
   }

   /* Initialize it */
   bo->base.va = va;
   bo->va_handle = va_handle;
   bo->size = size;
   bo->ref_count = 1;
   bo->bo = buf_handle;
   bo->base.initial_domain = RADEON_DOMAIN_GTT;
   bo->base.use_global_list = false;
   bo->priority = priority;

   ASSERTED int r = amdgpu_bo_export(buf_handle, amdgpu_bo_handle_type_kms, &bo->bo_handle);
   assert(!r);

   p_atomic_add(&ws->allocated_gtt, align64(bo->size, ws->info.gart_page_size));

   if (ws->debug_all_bos)
      radv_amdgpu_global_bo_list_add(ws, bo);
   radv_amdgpu_log_bo(ws, bo, false);

   *out_bo = (struct radeon_winsys_bo *)bo;
   return VK_SUCCESS;

error_va_map:
   amdgpu_va_range_free(va_handle);

error_va_alloc:
   amdgpu_bo_free(buf_handle);

error:
   FREE(bo);
   return result;
}
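/* Import a BO from a dma-buf file descriptor. The import shares the
 * exporter's memory; we only reserve a VA range (1 MiB aligned), map it and
 * account the size against the domain reported by the kernel. */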
static VkResult
radv_amdgpu_winsys_bo_from_fd(struct radeon_winsys *_ws, int fd, unsigned priority,
                              struct radeon_winsys_bo **out_bo, uint64_t *alloc_size)
{
   struct radv_amdgpu_winsys *ws = radv_amdgpu_winsys(_ws);
   struct radv_amdgpu_winsys_bo *bo;
   uint64_t va;
   amdgpu_va_handle va_handle;
   enum amdgpu_bo_handle_type type = amdgpu_bo_handle_type_dma_buf_fd;
   struct amdgpu_bo_import_result result = {0};
   struct amdgpu_bo_info info = {0};
   enum radeon_bo_domain initial = 0;
   int r;
   VkResult vk_result = VK_SUCCESS;

   /* Just be robust for callers that might use NULL-ness for determining if things should be
    * freed. */
   *out_bo = NULL;

   bo = CALLOC_STRUCT(radv_amdgpu_winsys_bo);
   if (!bo)
      return VK_ERROR_OUT_OF_HOST_MEMORY;

   r = amdgpu_bo_import(ws->dev, type, fd, &result);
   if (r) {
      vk_result = VK_ERROR_INVALID_EXTERNAL_HANDLE;
      goto error;
   }

   r = amdgpu_bo_query_info(result.buf_handle, &info);
   if (r) {
      vk_result = VK_ERROR_UNKNOWN;
      goto error_query;
   }

   if (alloc_size) {
      *alloc_size = info.alloc_size;
   }

   r = amdgpu_va_range_alloc(ws->dev, amdgpu_gpu_va_range_general, result.alloc_size, 1 << 20, 0,
                             &va, &va_handle, AMDGPU_VA_RANGE_HIGH);
   if (r) {
      vk_result = VK_ERROR_OUT_OF_DEVICE_MEMORY;
      goto error_query;
   }

   r = radv_amdgpu_bo_va_op(ws, result.buf_handle, 0, result.alloc_size, va, 0, 0,
                            AMDGPU_VA_OP_MAP);
   if (r) {
      vk_result = VK_ERROR_UNKNOWN;
      goto error_va_map;
   }

   if (info.preferred_heap & AMDGPU_GEM_DOMAIN_VRAM)
      initial |= RADEON_DOMAIN_VRAM;
   if (info.preferred_heap & AMDGPU_GEM_DOMAIN_GTT)
      initial |= RADEON_DOMAIN_GTT;

   bo->bo = result.buf_handle;
   bo->base.va = va;
   bo->va_handle = va_handle;
   bo->base.initial_domain = initial;
   bo->base.use_global_list = false;
   bo->size = result.alloc_size;
   bo->is_shared = true;
   bo->priority = priority;
   bo->ref_count = 1;

   r = amdgpu_bo_export(result.buf_handle, amdgpu_bo_handle_type_kms, &bo->bo_handle);
   assert(!r);

   if (bo->base.initial_domain & RADEON_DOMAIN_VRAM)
      p_atomic_add(&ws->allocated_vram, align64(bo->size, ws->info.gart_page_size));
   if (bo->base.initial_domain & RADEON_DOMAIN_GTT)
      p_atomic_add(&ws->allocated_gtt, align64(bo->size, ws->info.gart_page_size));

   if (ws->debug_all_bos)
      radv_amdgpu_global_bo_list_add(ws, bo);
   radv_amdgpu_log_bo(ws, bo, false);

   *out_bo = (struct radeon_winsys_bo *)bo;
   return VK_SUCCESS;

error_va_map:
   amdgpu_va_range_free(va_handle);

error_query:
   amdgpu_bo_free(result.buf_handle);

error:
   FREE(bo);
   return vk_result;
}

static bool
radv_amdgpu_winsys_get_fd(struct radeon_winsys *_ws, struct radeon_winsys_bo *_bo, int *fd)
{
   struct radv_amdgpu_winsys_bo *bo = radv_amdgpu_winsys_bo(_bo);
   enum amdgpu_bo_handle_type type = amdgpu_bo_handle_type_dma_buf_fd;
   int r;
   unsigned handle;
   r = amdgpu_bo_export(bo->bo, type, &handle);
   if (r)
      return false;

   *fd = (int)handle;
   bo->is_shared = true;
   return true;
}
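/* Translate the kernel's view of an imported dma-buf (preferred heap and
 * allocation flags) back into winsys domains and flags. The temporary import
 * is freed again before returning. */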
static bool
radv_amdgpu_bo_get_flags_from_fd(struct radeon_winsys *_ws, int fd,
                                 enum radeon_bo_domain *domains, enum radeon_bo_flag *flags)
{
   struct radv_amdgpu_winsys *ws = radv_amdgpu_winsys(_ws);
   struct amdgpu_bo_import_result result = {0};
   struct amdgpu_bo_info info = {0};
   int r;

   *domains = 0;
   *flags = 0;

   r = amdgpu_bo_import(ws->dev, amdgpu_bo_handle_type_dma_buf_fd, fd, &result);
   if (r)
      return false;

   r = amdgpu_bo_query_info(result.buf_handle, &info);
   amdgpu_bo_free(result.buf_handle);
   if (r)
      return false;

   if (info.preferred_heap & AMDGPU_GEM_DOMAIN_VRAM)
      *domains |= RADEON_DOMAIN_VRAM;
   if (info.preferred_heap & AMDGPU_GEM_DOMAIN_GTT)
      *domains |= RADEON_DOMAIN_GTT;
   if (info.preferred_heap & AMDGPU_GEM_DOMAIN_GDS)
      *domains |= RADEON_DOMAIN_GDS;
   if (info.preferred_heap & AMDGPU_GEM_DOMAIN_OA)
      *domains |= RADEON_DOMAIN_OA;

   if (info.alloc_flags & AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED)
      *flags |= RADEON_FLAG_CPU_ACCESS;
   if (info.alloc_flags & AMDGPU_GEM_CREATE_NO_CPU_ACCESS)
      *flags |= RADEON_FLAG_NO_CPU_ACCESS;
   if (!(info.alloc_flags & AMDGPU_GEM_CREATE_EXPLICIT_SYNC))
      *flags |= RADEON_FLAG_IMPLICIT_SYNC;
   if (info.alloc_flags & AMDGPU_GEM_CREATE_CPU_GTT_USWC)
      *flags |= RADEON_FLAG_GTT_WC;
   if (info.alloc_flags & AMDGPU_GEM_CREATE_VM_ALWAYS_VALID)
      *flags |= RADEON_FLAG_NO_INTERPROCESS_SHARING | RADEON_FLAG_PREFER_LOCAL_BO;
   if (info.alloc_flags & AMDGPU_GEM_CREATE_VRAM_CLEARED)
      *flags |= RADEON_FLAG_ZERO_VRAM;
   return true;
}

static unsigned
eg_tile_split(unsigned tile_split)
{
   switch (tile_split) {
   case 0:
      tile_split = 64;
      break;
   case 1:
      tile_split = 128;
      break;
   case 2:
      tile_split = 256;
      break;
   case 3:
      tile_split = 512;
      break;
   default:
   case 4:
      tile_split = 1024;
      break;
   case 5:
      tile_split = 2048;
      break;
   case 6:
      tile_split = 4096;
      break;
   }
   return tile_split;
}

static unsigned
radv_eg_tile_split_rev(unsigned eg_tile_split)
{
   switch (eg_tile_split) {
   case 64:
      return 0;
   case 128:
      return 1;
   case 256:
      return 2;
   case 512:
      return 3;
   default:
   case 1024:
      return 4;
   case 2048:
      return 5;
   case 4096:
      return 6;
   }
}

#define AMDGPU_TILING_DCC_MAX_COMPRESSED_BLOCK_SIZE_SHIFT 45
#define AMDGPU_TILING_DCC_MAX_COMPRESSED_BLOCK_SIZE_MASK  0x3

static void
radv_amdgpu_winsys_bo_set_metadata(struct radeon_winsys *_ws, struct radeon_winsys_bo *_bo,
                                   struct radeon_bo_metadata *md)
{
   struct radv_amdgpu_winsys *ws = radv_amdgpu_winsys(_ws);
   struct radv_amdgpu_winsys_bo *bo = radv_amdgpu_winsys_bo(_bo);
   struct amdgpu_bo_metadata metadata = {0};
   uint64_t tiling_flags = 0;

   if (ws->info.chip_class >= GFX9) {
      tiling_flags |= AMDGPU_TILING_SET(SWIZZLE_MODE, md->u.gfx9.swizzle_mode);
      tiling_flags |= AMDGPU_TILING_SET(DCC_OFFSET_256B, md->u.gfx9.dcc_offset_256b);
      tiling_flags |= AMDGPU_TILING_SET(DCC_PITCH_MAX, md->u.gfx9.dcc_pitch_max);
      tiling_flags |=
         AMDGPU_TILING_SET(DCC_INDEPENDENT_64B, md->u.gfx9.dcc_independent_64b_blocks);
      tiling_flags |=
         AMDGPU_TILING_SET(DCC_INDEPENDENT_128B, md->u.gfx9.dcc_independent_128b_blocks);
      tiling_flags |= AMDGPU_TILING_SET(DCC_MAX_COMPRESSED_BLOCK_SIZE,
                                        md->u.gfx9.dcc_max_compressed_block_size);
      tiling_flags |= AMDGPU_TILING_SET(SCANOUT, md->u.gfx9.scanout);
   } else {
      if (md->u.legacy.macrotile == RADEON_LAYOUT_TILED)
         tiling_flags |= AMDGPU_TILING_SET(ARRAY_MODE, 4); /* 2D_TILED_THIN1 */
      else if (md->u.legacy.microtile == RADEON_LAYOUT_TILED)
         tiling_flags |= AMDGPU_TILING_SET(ARRAY_MODE, 2); /* 1D_TILED_THIN1 */
      else
         tiling_flags |= AMDGPU_TILING_SET(ARRAY_MODE, 1); /* LINEAR_ALIGNED */

      tiling_flags |= AMDGPU_TILING_SET(PIPE_CONFIG, md->u.legacy.pipe_config);
      tiling_flags |= AMDGPU_TILING_SET(BANK_WIDTH, util_logbase2(md->u.legacy.bankw));
      tiling_flags |= AMDGPU_TILING_SET(BANK_HEIGHT, util_logbase2(md->u.legacy.bankh));
      if (md->u.legacy.tile_split)
         tiling_flags |=
            AMDGPU_TILING_SET(TILE_SPLIT, radv_eg_tile_split_rev(md->u.legacy.tile_split));
      tiling_flags |= AMDGPU_TILING_SET(MACRO_TILE_ASPECT, util_logbase2(md->u.legacy.mtilea));
      tiling_flags |= AMDGPU_TILING_SET(NUM_BANKS, util_logbase2(md->u.legacy.num_banks) - 1);

      if (md->u.legacy.scanout)
         tiling_flags |= AMDGPU_TILING_SET(MICRO_TILE_MODE, 0); /* DISPLAY_MICRO_TILING */
      else
         tiling_flags |= AMDGPU_TILING_SET(MICRO_TILE_MODE, 1); /* THIN_MICRO_TILING */
   }

   metadata.tiling_info = tiling_flags;
   metadata.size_metadata = md->size_metadata;
   memcpy(metadata.umd_metadata, md->metadata, sizeof(md->metadata));
   amdgpu_bo_set_metadata(bo->bo, &metadata);
}
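/* Inverse of radv_amdgpu_winsys_bo_set_metadata: decode the kernel tiling
 * word back into radeon_bo_metadata. The pre-GFX9 fields were stored as log2
 * values, hence the 1 << / 2 << reconstructions below. */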
static void
radv_amdgpu_winsys_bo_get_metadata(struct radeon_winsys *_ws, struct radeon_winsys_bo *_bo,
                                   struct radeon_bo_metadata *md)
{
   struct radv_amdgpu_winsys *ws = radv_amdgpu_winsys(_ws);
   struct radv_amdgpu_winsys_bo *bo = radv_amdgpu_winsys_bo(_bo);
   struct amdgpu_bo_info info = {0};

   int r = amdgpu_bo_query_info(bo->bo, &info);
   if (r)
      return;

   uint64_t tiling_flags = info.metadata.tiling_info;

   if (ws->info.chip_class >= GFX9) {
      md->u.gfx9.swizzle_mode = AMDGPU_TILING_GET(tiling_flags, SWIZZLE_MODE);
      md->u.gfx9.scanout = AMDGPU_TILING_GET(tiling_flags, SCANOUT);
   } else {
      md->u.legacy.microtile = RADEON_LAYOUT_LINEAR;
      md->u.legacy.macrotile = RADEON_LAYOUT_LINEAR;

      if (AMDGPU_TILING_GET(tiling_flags, ARRAY_MODE) == 4) /* 2D_TILED_THIN1 */
         md->u.legacy.macrotile = RADEON_LAYOUT_TILED;
      else if (AMDGPU_TILING_GET(tiling_flags, ARRAY_MODE) == 2) /* 1D_TILED_THIN1 */
         md->u.legacy.microtile = RADEON_LAYOUT_TILED;

      md->u.legacy.pipe_config = AMDGPU_TILING_GET(tiling_flags, PIPE_CONFIG);
      md->u.legacy.bankw = 1 << AMDGPU_TILING_GET(tiling_flags, BANK_WIDTH);
      md->u.legacy.bankh = 1 << AMDGPU_TILING_GET(tiling_flags, BANK_HEIGHT);
      md->u.legacy.tile_split = eg_tile_split(AMDGPU_TILING_GET(tiling_flags, TILE_SPLIT));
      md->u.legacy.mtilea = 1 << AMDGPU_TILING_GET(tiling_flags, MACRO_TILE_ASPECT);
      md->u.legacy.num_banks = 2 << AMDGPU_TILING_GET(tiling_flags, NUM_BANKS);
      md->u.legacy.scanout = AMDGPU_TILING_GET(tiling_flags, MICRO_TILE_MODE) == 0; /* DISPLAY */
   }

   md->size_metadata = info.metadata.size_metadata;
   memcpy(md->metadata, info.metadata.umd_metadata, sizeof(md->metadata));
}

static VkResult
radv_amdgpu_winsys_bo_make_resident(struct radeon_winsys *_ws, struct radeon_winsys_bo *_bo,
                                    bool resident)
{
   struct radv_amdgpu_winsys *ws = radv_amdgpu_winsys(_ws);
   struct radv_amdgpu_winsys_bo *bo = radv_amdgpu_winsys_bo(_bo);
   VkResult result = VK_SUCCESS;

   /* Do not add the BO to the global list if it's a local BO because the
    * kernel maintains a list for us. */
   if (bo->base.is_local)
      return VK_SUCCESS;

   /* Do not add the BO twice to the global list if the allbos debug
    * option is enabled. */
   if (ws->debug_all_bos)
      return VK_SUCCESS;

   if (resident) {
      result = radv_amdgpu_global_bo_list_add(ws, bo);
   } else {
      radv_amdgpu_global_bo_list_del(ws, bo);
   }

   return result;
}

static int
radv_amdgpu_bo_va_compare(const void *a, const void *b)
{
   const struct radv_amdgpu_winsys_bo *bo_a = *(const struct radv_amdgpu_winsys_bo *const *)a;
   const struct radv_amdgpu_winsys_bo *bo_b = *(const struct radv_amdgpu_winsys_bo *const *)b;
   return bo_a->base.va < bo_b->base.va ? -1 : bo_a->base.va > bo_b->base.va ? 1 : 0;
}

static void
radv_amdgpu_dump_bo_log(struct radeon_winsys *_ws, FILE *file)
{
   struct radv_amdgpu_winsys *ws = radv_amdgpu_winsys(_ws);
   struct radv_amdgpu_winsys_bo_log *bo_log;

   if (!ws->debug_log_bos)
      return;

   u_rwlock_rdlock(&ws->log_bo_list_lock);
   LIST_FOR_EACH_ENTRY (bo_log, &ws->log_bo_list, list) {
      fprintf(file, "timestamp=%llu, VA=%.16llx-%.16llx, destroyed=%d, is_virtual=%d\n",
              (long long)bo_log->timestamp, (long long)bo_log->va,
              (long long)(bo_log->va + bo_log->size), bo_log->destroyed, bo_log->is_virtual);
   }
   u_rwlock_rdunlock(&ws->log_bo_list_lock);
}
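/* Dump the VA ranges of all known BOs, sorted by address. This only works
 * with RADV_DEBUG=allbos, since the winsys does not track every BO
 * otherwise. */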
" sparse" : ""); } free(bos); u_rwlock_rdunlock(&ws->global_bo_list.lock); } else fprintf(file, " To get BO VA ranges, please specify RADV_DEBUG=allbos\n"); } void radv_amdgpu_bo_init_functions(struct radv_amdgpu_winsys *ws) { ws->base.buffer_create = radv_amdgpu_winsys_bo_create; ws->base.buffer_destroy = radv_amdgpu_winsys_bo_destroy; ws->base.buffer_map = radv_amdgpu_winsys_bo_map; ws->base.buffer_unmap = radv_amdgpu_winsys_bo_unmap; ws->base.buffer_from_ptr = radv_amdgpu_winsys_bo_from_ptr; ws->base.buffer_from_fd = radv_amdgpu_winsys_bo_from_fd; ws->base.buffer_get_fd = radv_amdgpu_winsys_get_fd; ws->base.buffer_set_metadata = radv_amdgpu_winsys_bo_set_metadata; ws->base.buffer_get_metadata = radv_amdgpu_winsys_bo_get_metadata; ws->base.buffer_virtual_bind = radv_amdgpu_winsys_bo_virtual_bind; ws->base.buffer_get_flags_from_fd = radv_amdgpu_bo_get_flags_from_fd; ws->base.buffer_make_resident = radv_amdgpu_winsys_bo_make_resident; ws->base.dump_bo_ranges = radv_amdgpu_dump_bo_ranges; ws->base.dump_bo_log = radv_amdgpu_dump_bo_log; }