From e492857a2fa0f3330f25417f947e23cf6eae13f9 Mon Sep 17 00:00:00 2001 From: Robert Toyonaga Date: Mon, 8 Jun 2026 11:32:24 -0400 Subject: [PATCH] Fix NUMA interleaving with VA2. --- src/hotspot/os/windows/os_windows.cpp | 190 ++++++++++++-- src/hotspot/os/windows/os_windows.hpp | 43 ++++ .../hotspot/gtest/runtime/test_os_windows.cpp | 239 ++++++++++++++++++ 3 files changed, 456 insertions(+), 16 deletions(-) diff --git a/src/hotspot/os/windows/os_windows.cpp b/src/hotspot/os/windows/os_windows.cpp index d00babef40f31..003fbe9be726a 100644 --- a/src/hotspot/os/windows/os_windows.cpp +++ b/src/hotspot/os/windows/os_windows.cpp @@ -3507,6 +3507,154 @@ char* os::pd_reserve_memory(size_t bytes, bool exec) { return pd_attempt_reserve_memory_at(nullptr /* addr */, bytes, exec); } +// This allocates a placeholder via VirtualAlloc2(MEM_RESERVE_PLACEHOLDER). +os::win32::PlaceholderRegion os::win32::reserve_placeholder_memory(size_t bytes, char* addr) { + assert(bytes > 0, "Size must be a value greater than 0"); + assert(is_aligned(addr, os::vm_allocation_granularity()), "Requested address should be aligned to allocation granularity."); + assert(is_aligned(bytes, os::vm_page_size()), "Requested size, bytes, should be aligned to page size."); + + if (!is_VirtualAlloc2_supported()) { + return PlaceholderRegion(); + } + + char* res = (char*)os::win32::VirtualAlloc2( + GetCurrentProcess(), + addr, + bytes, + MEM_RESERVE | MEM_RESERVE_PLACEHOLDER, + PAGE_NOACCESS, + nullptr, 0); + + if (res != nullptr) { + log_trace(os)("VirtualAlloc2 placeholder of size (%zu) returned " PTR_FORMAT ".", bytes, p2i(res)); + return PlaceholderRegion(res, bytes); + } else { + log_warning(os)("VirtualAlloc2 placeholder reservation of size (%zu) at " PTR_FORMAT ": error %lu.", bytes, p2i(addr), GetLastError()); + return PlaceholderRegion(); + } +} + +os::win32::PlaceholderRegionPair os::win32::split_memory(const PlaceholderRegion& orig, size_t offset) { + guarantee(is_VirtualAlloc2_supported(), 
"split_memory requires VirtualAlloc2."); + assert(!orig.is_empty(), "Region cannot be empty"); + assert(offset <= orig.size(), "Offset must be less than or equal to region size"); + assert(is_aligned(orig.base(), os::vm_page_size()), "Region base should be page-aligned"); + assert(is_aligned(offset, os::vm_page_size()), "Offset should be page-aligned"); + + char* original_base = orig.base(); + size_t original_size = orig.size(); + + if (offset == 0) { + log_trace(os)("Split memory has offset 0: " RANGEFMT, RANGEFMTARGS(original_base, original_size)); + return { PlaceholderRegion(), orig }; + } else if (offset == original_size) { + log_trace(os)("Split memory consumed the whole region: " RANGEFMT, RANGEFMTARGS(original_base, original_size)); + return { orig, PlaceholderRegion() }; + } + + // VirtualFree with MEM_RELEASE | MEM_PRESERVE_PLACEHOLDER splits the + // placeholder [original_base, original_base+original_size) in two: + // [original_base, original_base+offset) and [original_base+offset, original_base+original_size) + // + // With correct inputs, this should not fail. + // A failure indicates either a programming error (e.g., bad alignment, + // region not actually a placeholder) or a catastrophic system problem. + // Crashing with a diagnostic is more useful than attempting recovery. 
+ BOOL result = virtualFree(original_base, offset, MEM_RELEASE | MEM_PRESERVE_PLACEHOLDER); + guarantee(result != FALSE, + "Failed to split placeholder at " PTR_FORMAT " (offset %zu): error %lu.", + p2i(original_base), offset, GetLastError()); + + log_trace(os)("Split placeholder " RANGE_FORMAT " at offset %zu.", + RANGE_FORMAT_ARGS(original_base, original_size), offset); + + return {PlaceholderRegion(original_base, offset), PlaceholderRegion(original_base + offset, original_size - offset)}; +} + +char* os::win32::convert_to_reserved(PlaceholderRegion region, int numa_node) { + guarantee(is_VirtualAlloc2_supported(), "convert_to_reserved requires VirtualAlloc2"); + assert(!region.is_empty(), "Region cannot be empty"); + assert(is_aligned(region.base(), os::vm_page_size()), "Region base should be page-aligned"); + assert(is_aligned(region.size(), os::vm_page_size()), "Region size should be page-aligned"); + + char* base = region.base(); + size_t size = region.size(); + + assert(base != nullptr, "Region base cannot be null"); + assert(size > 0, "Region size must be positive"); + + MEM_EXTENDED_PARAMETER param = { 0 }; + MEM_EXTENDED_PARAMETER* param_ptr = nullptr; + ULONG param_count = 0; + + if (numa_node >= 0) { + param.Type = MemExtendedParameterNumaNode; + param.ULong = (DWORD)numa_node; + param_ptr = &param; + param_count = 1; + } + + // Similar to split_memory, with correct inputs, this should never fail. 
+ char* reserved = (char*)os::win32::VirtualAlloc2( + GetCurrentProcess(), + base, + size, + MEM_RESERVE | MEM_REPLACE_PLACEHOLDER, + PAGE_READWRITE, + param_ptr, param_count); + guarantee(reserved != nullptr, + "Failed to convert placeholder to reservation at " PTR_FORMAT " (%zu, numa node %d): error %lu.", + p2i(base), size, numa_node, GetLastError()); + + if (numa_node >= 0) { + log_trace(os)("Converted placeholder " RANGE_FORMAT " to reservation on NUMA node %d.", RANGE_FORMAT_ARGS(reserved, size), numa_node); + } else { + log_trace(os)("Converted placeholder " RANGE_FORMAT " to reservation.", RANGE_FORMAT_ARGS(reserved, size)); + } + + return reserved; +} + +// Reserve a region split across NUMA nodes. +// Uses VirtualAlloc2 placeholders in order to avoid races when splitting up the initial reservation into +// chunks assigned to different nodes. Returns the base address of the reserved range, or nullptr on failure. +static char* reserve_with_numa_placeholder(char* addr, size_t bytes) { + assert(is_VirtualAlloc2_supported(), "requires VirtualAlloc2"); + + const size_t chunk_size = NUMAInterleaveGranularity; + + // Reserve the full range as a placeholder. + // If we requested an address, reserve_placeholder_memory will obtain it or fail. 
+ os::win32::PlaceholderRegion whole_range = os::win32::reserve_placeholder_memory(bytes, addr); + if (whole_range.is_empty()) { + log_warning(os)("Failed to reserve placeholder for NUMA interleaving (" PTR_FORMAT ", %zu).", p2i(addr), bytes); + return nullptr; + } + + char* const whole_range_base = whole_range.base(); + log_trace(os)("Created VirtualAlloc2 NUMA placeholder at " RANGE_FORMAT " (%zu bytes).", RANGE_FORMAT_ARGS(whole_range_base, bytes), bytes); + + char* cur = whole_range_base; + size_t remaining_len = whole_range.size(); + + int count = 0; + const int node_count = numa_node_list_holder.get_count(); + + while (remaining_len > 0) { + const size_t bytes_to_rq = MIN2(remaining_len, chunk_size - ((uintptr_t)cur % chunk_size)); + os::win32::PlaceholderRegion remaining(cur, remaining_len); + os::win32::PlaceholderRegionPair split = os::win32::split_memory(remaining, bytes_to_rq); + // Assign 0 for testing on systems without NUMA interleaving + DWORD node = node_count > 0 ? numa_node_list_holder.get_node_list_entry(count % node_count) : 0; + os::win32::convert_to_reserved(split.left, (int)node); + cur = split.right.base(); + remaining_len = split.right.size(); + count++; + } + + return whole_range_base; +} + // Reserve memory at an arbitrary address, only if that area is // available (and not reserved for something else). char* os::pd_attempt_reserve_memory_at(char* addr, size_t bytes, bool exec) { @@ -3516,23 +3664,33 @@ char* os::pd_attempt_reserve_memory_at(char* addr, size_t bytes, bool exec) { char* res; // note that if UseLargePages is on, all the areas that require interleaving // will go thru reserve_memory_special rather than thru here. 
- bool use_individual = (UseNUMAInterleaving && !UseLargePages); - if (!use_individual) { - res = (char*)virtualAlloc(addr, bytes, MEM_RESERVE, PAGE_READWRITE); - } else { - elapsedTimer reserveTimer; - if (Verbose && PrintMiscellaneous) reserveTimer.start(); - // in numa interleaving, we have to allocate pages individually - // (well really chunks of NUMAInterleaveGranularity size) - res = allocate_pages_individually(bytes, addr, MEM_RESERVE, PAGE_READWRITE); - if (res == nullptr) { - warning("NUMA page allocation failed"); - } - if (Verbose && PrintMiscellaneous) { - reserveTimer.stop(); - tty->print_cr("reserve_memory of %zx bytes took " JLONG_FORMAT " ms (" JLONG_FORMAT " ticks)", bytes, - reserveTimer.milliseconds(), reserveTimer.ticks()); + bool use_numa_interleaving = (UseNUMAInterleaving && !UseLargePages); + if (use_numa_interleaving) { + if (is_VirtualAlloc2_supported()) { + // Splittable NUMA interleaving with VirtualAlloc2 placeholders. + res = reserve_with_numa_placeholder(addr, bytes); + if (res == nullptr) { + log_warning(os)("NUMA allocation using placeholders failed"); + } + } else { + // Non-splittable NUMA interleaving: allocate_pages_individually (possible races). + elapsedTimer reserveTimer; + if (Verbose && PrintMiscellaneous) reserveTimer.start(); + // in numa interleaving, we have to allocate pages individually + // (well really chunks of NUMAInterleaveGranularity size) + res = allocate_pages_individually(bytes, addr, MEM_RESERVE, PAGE_READWRITE); + if (res == nullptr) { + log_warning(os)("NUMA page allocation failed"); + } + if (Verbose && PrintMiscellaneous) { + reserveTimer.stop(); + tty->print_cr("reserve_memory of %zx bytes took " JLONG_FORMAT " ms (" JLONG_FORMAT " ticks)", bytes, + reserveTimer.milliseconds(), reserveTimer.ticks()); + } } + } else { + // Standard reservation. 
+ res = (char*)virtualAlloc(addr, bytes, MEM_RESERVE, PAGE_READWRITE); } assert(res == nullptr || addr == nullptr || addr == res, "Unexpected address from reserve."); diff --git a/src/hotspot/os/windows/os_windows.hpp b/src/hotspot/os/windows/os_windows.hpp index 5ebc80c817b70..6b08a7d0bce17 100644 --- a/src/hotspot/os/windows/os_windows.hpp +++ b/src/hotspot/os/windows/os_windows.hpp @@ -122,6 +122,49 @@ class os::win32 { typedef PVOID (WINAPI *MapViewOfFile3Fn)(HANDLE, HANDLE, PVOID, ULONG64, SIZE_T, ULONG, ULONG, MEM_EXTENDED_PARAMETER*, ULONG); static MapViewOfFile3Fn MapViewOfFile3; + // A "reserved" region of address space that can be split or converted to a + // normal reservation. Conceptually distinct from a reserved region: + // callers must NOT call commit_memory, map_memory, or other operations + // directly on the raw address. They must first convert it via + // convert_to_reserved(). + class PlaceholderRegion { + char* const _base; + size_t const _size; + public: + PlaceholderRegion() : _base(nullptr), _size(0) {} + PlaceholderRegion(char* base, size_t size) : _base(base), _size(size) {} + PlaceholderRegion(const PlaceholderRegion& source) : _base(source._base), _size(source._size) {} + char* base() const { return _base; } + size_t size() const { return _size; } + bool is_empty() const { return _base == nullptr; } + }; + + struct PlaceholderRegionPair { + PlaceholderRegion left; + PlaceholderRegion right; + }; + + // Reserves a virtual memory region that can be split after allocation. + // The returned region must be converted via convert_to_reserved() before committing. + // If the returned PlaceholderRegion is empty, the reservation failed. + // This should only be called after os::init_2() has completed, otherwise the Windows API may not be initialized. + // Uses VirtualAlloc2, which requires the base address be null or aligned to allocation granularity. 
+ static PlaceholderRegion reserve_placeholder_memory(size_t bytes, char* addr); + + // Split 'orig' at 'offset'. Returns left and right placeholder pieces as a PlaceholderRegionPair. + // The caller must not use 'orig' afterward. + // Offset must be page-aligned. + // If offset == orig.size(), returns { orig, empty }. + // If offset == 0, returns { empty, orig }. + // This should not fail. If unsuccessful, this function fails fatally. + static PlaceholderRegionPair split_memory(const PlaceholderRegion& orig, size_t offset); + + // Convert a placeholder region into a regular reserved region via VirtualAlloc2(MEM_REPLACE_PLACEHOLDER). + // After conversion the Placeholder region should no longer be used. + // This should not fail. If unsuccessful, this function fails fatally. + // If numa_node >= 0, binds the reservation to that NUMA node. + static char* convert_to_reserved(PlaceholderRegion region, int numa_node = -1); + private: static void initialize_performance_counter(); diff --git a/test/hotspot/gtest/runtime/test_os_windows.cpp b/test/hotspot/gtest/runtime/test_os_windows.cpp index 6822e37b539d0..395221187723a 100644 --- a/test/hotspot/gtest/runtime/test_os_windows.cpp +++ b/test/hotspot/gtest/runtime/test_os_windows.cpp @@ -32,6 +32,8 @@ #include "concurrentTestRunner.inline.hpp" #include "unittest.hpp" +#include <psapi.h> + namespace { class MemoryReleaser { char* const _ptr; @@ -873,4 +875,241 @@ TEST_VM(os_windows, SafeFetch32_with_page_guard_protection) { ::VirtualFree(p, 0, MEM_RELEASE); } +#define SKIP_IF_PLACEHOLDER_NOT_SUPPORTED \ + if (os::win32::VirtualAlloc2 == nullptr) GTEST_SKIP() << "VirtualAlloc2 not available"; + +TEST_VM(os, placeholder_reserve_and_convert) { + SKIP_IF_PLACEHOLDER_NOT_SUPPORTED; + + const size_t size = 4 * os::vm_allocation_granularity(); + + os::win32::PlaceholderRegion region = os::win32::reserve_placeholder_memory(size, nullptr); + ASSERT_FALSE(region.is_empty()); + ASSERT_EQ(region.size(), size); + ASSERT_NE(region.base(), 
(char*)nullptr); + + char* reserved = os::win32::convert_to_reserved(region); + ASSERT_EQ(reserved, region.base()); + + // Commit, but bypass NMT + ASSERT_NE(::VirtualAlloc(reserved, size, MEM_COMMIT, PAGE_READWRITE), nullptr); + // Touch the memory to confirm it's usable. + memset(reserved, 0xAB, size); + EXPECT_EQ((unsigned char)reserved[0], 0xAB); + EXPECT_EQ((unsigned char)reserved[size - 1], 0xAB); + + ASSERT_TRUE(::VirtualFree(reserved, 0, MEM_RELEASE)); +} + +TEST_VM(os, placeholder_split_two_way) { + SKIP_IF_PLACEHOLDER_NOT_SUPPORTED; + + const size_t granularity = os::vm_allocation_granularity(); + const size_t total = 4 * granularity; + const size_t split_offset = 3 * granularity; + + os::win32::PlaceholderRegion region = os::win32::reserve_placeholder_memory(total, nullptr); + ASSERT_FALSE(region.is_empty()); + + char* original_base = region.base(); + os::win32::PlaceholderRegionPair split = os::win32::split_memory(region, split_offset); + + // Leading piece: [base, base+split_offset) + ASSERT_EQ(split.left.base(), original_base); + ASSERT_EQ(split.left.size(), split_offset); + + // Trailing piece: [base+split_offset, base+total) + ASSERT_EQ(split.right.base(), original_base + split_offset); + ASSERT_EQ(split.right.size(), total - split_offset); + + // Convert both and commit. + char* addr1 = os::win32::convert_to_reserved(split.left); + char* addr2 = os::win32::convert_to_reserved(split.right); + ASSERT_EQ(addr1, original_base); + ASSERT_EQ(addr2, original_base + split_offset); + + // Commit, but bypass NMT + ASSERT_NE(::VirtualAlloc(addr1, split_offset, MEM_COMMIT, PAGE_READWRITE), nullptr); + ASSERT_NE(::VirtualAlloc(addr2, total - split_offset, MEM_COMMIT, PAGE_READWRITE), nullptr); + + // Touch the memory to confirm it's usable. + memset(addr1, 0x11, split_offset); + memset(addr2, 0x22, total - split_offset); + EXPECT_EQ((unsigned char)addr1[0], 0x11); + EXPECT_EQ((unsigned char)addr2[0], 0x22); + + // Verify we can release the parts separately. 
+ ASSERT_TRUE(::VirtualFree(addr1, 0, MEM_RELEASE)); + ASSERT_TRUE(::VirtualFree(addr2, 0, MEM_RELEASE)); +} + +TEST_VM(os, placeholder_split_consumes_full_range) { + SKIP_IF_PLACEHOLDER_NOT_SUPPORTED; + + const size_t region_size = os::vm_allocation_granularity(); + os::win32::PlaceholderRegion region = os::win32::reserve_placeholder_memory(region_size, nullptr); + ASSERT_FALSE(region.is_empty()); + + char* original_base = region.base(); + os::win32::PlaceholderRegionPair split = os::win32::split_memory(region, region_size); + + // Leading piece + ASSERT_EQ(split.left.base(), original_base); + ASSERT_EQ(split.left.size(), region_size); + + // Trailing piece + ASSERT_TRUE(split.right.is_empty()); + + // Commit and touch to confirm it's usable. + char* addr = os::win32::convert_to_reserved(split.left); + ASSERT_NE(::VirtualAlloc(addr, region_size, MEM_COMMIT, PAGE_READWRITE), nullptr); + memset(addr, 0x11, region_size); + EXPECT_EQ((unsigned char)addr[0], 0x11); + + ASSERT_TRUE(::VirtualFree(addr, 0, MEM_RELEASE)); +} + +TEST_VM(os, placeholder_split_consumes_nothing) { + SKIP_IF_PLACEHOLDER_NOT_SUPPORTED; + + const size_t region_size = os::vm_allocation_granularity(); + os::win32::PlaceholderRegion region = os::win32::reserve_placeholder_memory(region_size, nullptr); + ASSERT_FALSE(region.is_empty()); + + char* original_base = region.base(); + os::win32::PlaceholderRegionPair split = os::win32::split_memory(region, 0); + + // Leading piece + ASSERT_TRUE(split.left.is_empty()); + + // Trailing piece + ASSERT_EQ(split.right.base(), original_base); + ASSERT_EQ(split.right.size(), region_size); + + // Commit and touch to confirm it's usable. 
+ char* addr = os::win32::convert_to_reserved(split.right); + ASSERT_NE(::VirtualAlloc(addr, region_size, MEM_COMMIT, PAGE_READWRITE), nullptr); + memset(addr, 0x11, region_size); + EXPECT_EQ((unsigned char)addr[0], 0x11); + + ASSERT_TRUE(::VirtualFree(addr, 0, MEM_RELEASE)); +} + +TEST_VM_FATAL_ERROR_MSG(os, placeholder_double_convert, ".*Failed to convert placeholder.*") { + SKIP_IF_PLACEHOLDER_NOT_SUPPORTED; + const size_t size = 4 * os::vm_allocation_granularity(); + + os::win32::PlaceholderRegion region = os::win32::reserve_placeholder_memory(size, nullptr); + ASSERT_FALSE(region.is_empty()); + ASSERT_EQ(region.size(), size); + ASSERT_NE(region.base(), (char*)nullptr); + + // Double convert + char* reserved = os::win32::convert_to_reserved(region); + ASSERT_EQ(reserved, region.base()); + reserved = os::win32::convert_to_reserved(region); +} + +TEST_VM(os, placeholder_commit_before_convert) { + SKIP_IF_PLACEHOLDER_NOT_SUPPORTED; + const size_t size = 4 * os::vm_allocation_granularity(); + + os::win32::PlaceholderRegion region = os::win32::reserve_placeholder_memory(size, nullptr); + ASSERT_FALSE(region.is_empty()); + ASSERT_EQ(region.size(), size); + ASSERT_NE(region.base(), (char*)nullptr); + + // Committing should fail here, but not crash. + ASSERT_FALSE(::VirtualAlloc(region.base(), size, MEM_COMMIT, PAGE_READWRITE)); + ASSERT_TRUE(::VirtualFree(region.base(), 0, MEM_RELEASE)); +} + +TEST_VM(os, placeholder_release_before_convert) { + SKIP_IF_PLACEHOLDER_NOT_SUPPORTED; + + const size_t size = 4 * os::vm_allocation_granularity(); + + os::win32::PlaceholderRegion region = os::win32::reserve_placeholder_memory(size, nullptr); + ASSERT_FALSE(region.is_empty()); + ASSERT_EQ(region.size(), size); + ASSERT_NE(region.base(), (char*)nullptr); + + ASSERT_TRUE(::VirtualFree(region.base(), 0, MEM_RELEASE)); +} + +// Test that reserve_with_numa_placeholder works correctly. 
+// On NUMA systems with a single NUMA node, there is no true interleaving +// (all chunks are put on node 0) but the placeholder split/replace path +// is still properly exercised. +TEST_VM(os_windows, placeholder_numa_reserve_commit) { + SKIP_IF_PLACEHOLDER_NOT_SUPPORTED; + + const size_t num_nodes = os::numa_get_groups_num(); + + // Enable NUMA interleaving for this test so the correct code path is taken. + AutoSaveRestore<bool> FLAG_GUARD(UseNUMAInterleaving); + AutoSaveRestore<bool> FLAG_GUARD(UseLargePages); + FLAG_SET_CMDLINE(UseNUMAInterleaving, true); + FLAG_SET_CMDLINE(UseLargePages, false); + + // Allocate a region large enough to span multiple NUMA interleave chunks. + // NUMAInterleaveGranularity defaults to 2MB + const size_t chunk_size = NUMAInterleaveGranularity; + const size_t num_chunks = 4; + const size_t size = num_chunks * chunk_size; + + char* result = os::attempt_reserve_memory_at(nullptr, size, mtTest); + ASSERT_TRUE(result != nullptr) << "Failed to reserve memory"; + ASSERT_TRUE(is_aligned(result, os::vm_allocation_granularity())); + ASSERT_TRUE(os::commit_memory(result, size, false)); + + // Walk (and touch) the chunks using the same alignment logic as reserve_with_numa_placeholder: + // the first chunk may be shorter (up to the next chunk_size boundary), + // then full chunk_size pieces, with a possible shorter trailing chunk. 
+ PSAPI_WORKING_SET_EX_INFORMATION wsi[num_chunks + 1]; + memset(wsi, 0, sizeof(wsi)); + size_t bytes_remaining = size; + char* addr = result; + size_t actual_chunks = 0; + + while (bytes_remaining > 0) { + size_t this_chunk_size = MIN2(bytes_remaining, chunk_size - ((size_t)addr % chunk_size)); + + memset(addr, 0xDA, this_chunk_size); + + wsi[actual_chunks] = {0}; + wsi[actual_chunks].VirtualAddress = addr; + actual_chunks++; + + bytes_remaining -= this_chunk_size; + addr += this_chunk_size; + } + + BOOL query_ok = QueryWorkingSetEx(GetCurrentProcess(), wsi, sizeof(wsi)); + ASSERT_TRUE(query_ok) << "QueryWorkingSetEx failed: " << GetLastError(); + + // Verify all pages are valid (in the working set). + for (size_t i = 0; i < actual_chunks; i++) { + EXPECT_TRUE(wsi[i].VirtualAttributes.Valid) << "Chunk " << i << " page not valid in working set"; + } + + if (num_nodes > 1) { + // On a multi-NUMA system, verify that not all chunks are assigned to the same node. + ULONG first_node = (ULONG)wsi[0].VirtualAttributes.Node; + bool found_different_node = false; + for (size_t i = 1; i < actual_chunks; i++) { + if (wsi[i].VirtualAttributes.Valid && + (ULONG)wsi[i].VirtualAttributes.Node != first_node) { + found_different_node = true; + break; + } + } + EXPECT_TRUE(found_different_node) + << "All " << actual_chunks << " chunks assigned to NUMA node " << first_node + << "; expected interleaving across " << num_nodes << " nodes"; + } + + os::release_memory(result, size); +} + #endif