Fix SIGSEGV when compiled with avx-512 instructions. Due to unaligned allocations, library crashes in nontemporalMemcpy in _mm512_stream_si512 (which requires 64-aligned allocations, but used to copy default-aligned objects). Without this patch hipamd causes random crashes in hipMemcpy* callers (tensile, rocBLAS, miopen, rocThrust, etc.). Bug: https://bugs.gentoo.org/915969 Bug report in upstream: https://github.com/ROCm-Developer-Tools/clr/issues/18 --- a/rocclr/device/rocm/rocvirtual.cpp +++ b/rocclr/device/rocm/rocvirtual.cpp @@ -2790,44 +2790,6 @@ bool VirtualGPU::createVirtualQueue(uint deviceQueueSize) return true; } -// ================================================================================================ -__attribute__((optimize("unroll-all-loops"), always_inline)) -static inline void nontemporalMemcpy(void* __restrict dst, const void* __restrict src, - uint16_t size) { - #if defined(__AVX512F__) - for (auto i = 0u; i != size / sizeof(__m512i); ++i) { - _mm512_stream_si512(reinterpret_cast<__m512i* __restrict&>(dst)++, - *reinterpret_cast(src)++); - } - size = size % sizeof(__m512i); - #endif - - #if defined(__AVX__) - for (auto i = 0u; i != size / sizeof(__m256i); ++i) { - _mm256_stream_si256(reinterpret_cast<__m256i* __restrict&>(dst)++, - *reinterpret_cast(src)++); - } - size = size % sizeof(__m256i); - #endif - - for (auto i = 0u; i != size / sizeof(__m128i); ++i) { - _mm_stream_si128(reinterpret_cast<__m128i* __restrict&>(dst)++, - *(reinterpret_cast(src)++)); - } - size = size % sizeof(__m128i); - - for (auto i = 0u; i != size / sizeof(long long); ++i) { - _mm_stream_si64(reinterpret_cast(dst)++, - *reinterpret_cast(src)++); - } - size = size % sizeof(long long); - - for (auto i = 0u; i != size / sizeof(int); ++i) { - _mm_stream_si32(reinterpret_cast(dst)++, - *reinterpret_cast(src)++); - } -} - // ================================================================================================ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const amd::Kernel& kernel, const_address parameters, void* eventHandle, @@ -3096,7 +3058,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, argBuffer = reinterpret_cast
(allocKernArg(gpuKernel.KernargSegmentByteSize(), gpuKernel.KernargSegmentAlignment())); // Load all kernel arguments - nontemporalMemcpy(argBuffer, parameters, + memcpy(argBuffer, parameters, std::min(gpuKernel.KernargSegmentByteSize(), signature.paramsSize())); }