
Commit 3e44751

src: add ProfilingArrayBufferAllocator for external memory tracking
Track per-label Buffer/ArrayBuffer backing store allocations. The allocator is installed as a delegate when profiling is active, with near-zero overhead otherwise.

Signed-off-by: Rudolf Meijering <[email protected]>
1 parent 37180c7 commit 3e44751

3 files changed

Lines changed: 328 additions & 9 deletions
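
The third changed file is not shown in this view. Based on the declarations this commit adds to src/node_internals.h, the start/stop wiring presumably looks roughly like the sketch below. The function names and the env->isolate_data()->node_allocator() accessor path are illustrative assumptions, not part of the diff shown here.

// Hypothetical wiring sketch — not part of this commit's visible diff.
// Assumes the v8.startSamplingHeapProfiler binding can reach the
// NodeArrayBufferAllocator via IsolateData::node_allocator().
void StartTrackingExternalMemory(node::Environment* env) {
  node::NodeArrayBufferAllocator* alloc =
      env->isolate_data()->node_allocator();
  if (alloc == nullptr) return;  // embedder supplied a custom allocator
  // Publish the tracker on the hot path, then mark it enabled.
  alloc->CreateProfilingAllocator()->Enable(env->isolate());
}

void StopTrackingExternalMemory(node::Environment* env) {
  node::NodeArrayBufferAllocator* alloc =
      env->isolate_data()->node_allocator();
  if (alloc == nullptr) return;
  node::ProfilingArrayBufferAllocator* tracker =
      alloc->GetProfilingAllocator();
  if (tracker == nullptr) return;  // profiling was never started
  alloc->ClearProfilingAllocator();  // stop delegation first...
  tracker->Disable();                // ...then drop the tracked state
}

Note the stop order: unpublishing the delegate pointer before calling Disable() mirrors the sentinel ordering documented in the header below.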


src/api/environment.cc

Lines changed: 174 additions & 2 deletions
@@ -15,7 +15,9 @@
 #include "node_realm-inl.h"
 #include "node_shadow_realm.h"
 #include "node_snapshot_builder.h"
+#include "node_v8.h"
 #include "node_v8_platform-inl.h"
+#include "v8-profiler.h"
 #include "node_wasm_web_api.h"
 #include "uv.h"
 #ifdef NODE_ENABLE_VTUNE_PROFILING
@@ -117,6 +119,10 @@ void* NodeArrayBufferAllocator::Allocate(size_t size) {
     ret = allocator_->Allocate(size);
   if (ret != nullptr) [[likely]] {
     total_mem_usage_.fetch_add(size, std::memory_order_relaxed);
+    auto* pa = profiling_allocator_.load(std::memory_order_acquire);
+    if (pa != nullptr) [[unlikely]] {
+      pa->TrackAllocate(ret, size);
+    }
   }
   return ret;
 }
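
To make the cost of the inactive path concrete, here is a self-contained model of the check above, with simplified stand-in types rather than the Node.js classes. When no tracker is installed, each allocation pays one acquire load plus a branch the CPU predicts as not taken. The [[likely]]/[[unlikely]] attributes require C++20.

// Stand-alone model of the delegate check; Tracker and ModelAllocator are
// illustrative stand-ins, not the Node.js classes. Compile with -std=c++20.
#include <atomic>
#include <cstddef>
#include <cstdio>
#include <cstdlib>

struct Tracker {
  void TrackAllocate(void* p, std::size_t n) { std::printf("+%zu bytes\n", n); }
  void TrackFree(void* p) { std::printf("freed\n"); }
};

class ModelAllocator {
 public:
  void* Allocate(std::size_t size) {
    void* ret = std::calloc(1, size);
    if (ret != nullptr) [[likely]] {
      // The entire inactive-path cost: one acquire load + predicted branch.
      auto* t = tracker_.load(std::memory_order_acquire);
      if (t != nullptr) [[unlikely]] t->TrackAllocate(ret, size);
    }
    return ret;
  }
  void Free(void* data) {
    auto* t = tracker_.load(std::memory_order_acquire);
    if (t != nullptr) [[unlikely]] t->TrackFree(data);
    std::free(data);
  }
  void set_tracker(Tracker* t) { tracker_.store(t, std::memory_order_release); }

 private:
  std::atomic<Tracker*> tracker_{nullptr};
};

int main() {
  ModelAllocator alloc;
  void* a = alloc.Allocate(64);   // untracked: no tracker installed
  Tracker t;
  alloc.set_tracker(&t);          // "profiling starts"
  void* b = alloc.Allocate(128);  // tracked
  alloc.Free(b);
  alloc.set_tracker(nullptr);     // "profiling stops"
  alloc.Free(a);                  // untracked again
}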
@@ -126,15 +132,38 @@ void* NodeArrayBufferAllocator::AllocateUninitialized(size_t size) {
   void* ret = allocator_->AllocateUninitialized(size);
   if (ret != nullptr) [[likely]] {
     total_mem_usage_.fetch_add(size, std::memory_order_relaxed);
+    auto* pa = profiling_allocator_.load(std::memory_order_acquire);
+    if (pa != nullptr) [[unlikely]] {
+      pa->TrackAllocate(ret, size);
+    }
   }
   return ret;
 }
 
 void NodeArrayBufferAllocator::Free(void* data, size_t size) {
+  auto* pa = profiling_allocator_.load(std::memory_order_acquire);
+  if (pa != nullptr) [[unlikely]] {
+    pa->TrackFree(data);
+  }
   total_mem_usage_.fetch_sub(size, std::memory_order_relaxed);
   allocator_->Free(data, size);
 }
 
+ProfilingArrayBufferAllocator*
+NodeArrayBufferAllocator::CreateProfilingAllocator() {
+  if (!owned_profiling_allocator_) {
+    owned_profiling_allocator_ =
+        std::make_unique<ProfilingArrayBufferAllocator>();
+  }
+  auto* pa = owned_profiling_allocator_.get();
+  profiling_allocator_.store(pa, std::memory_order_release);
+  return pa;
+}
+
+void NodeArrayBufferAllocator::ClearProfilingAllocator() {
+  profiling_allocator_.store(nullptr, std::memory_order_release);
+}
+
 DebuggingArrayBufferAllocator::~DebuggingArrayBufferAllocator() {
   CHECK(allocations_.empty());
 }
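
The release store in CreateProfilingAllocator() pairs with the acquire loads in Allocate/AllocateUninitialized/Free above. A minimal stand-alone demonstration of why that pairing matters: the reader thread is guaranteed to observe every field written before the pointer was published.

// Stand-alone sketch of release/acquire publication (not Node.js code).
// Build with: g++ -std=c++20 -pthread demo.cc
#include <atomic>
#include <cassert>
#include <thread>

struct Payload {
  int value = 0;
};

std::atomic<Payload*> published{nullptr};

int main() {
  std::thread reader([] {
    Payload* p;
    while ((p = published.load(std::memory_order_acquire)) == nullptr) {
      // spin until the pointer is published
    }
    // The acquire load pairs with the release store below, so the write
    // to `value` happens-before this read; the assert cannot fire.
    assert(p->value == 42);
  });
  Payload payload;
  payload.value = 42;                                    // initialize first
  published.store(&payload, std::memory_order_release);  // then publish
  reader.join();
}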
@@ -191,11 +220,154 @@ void DebuggingArrayBufferAllocator::RegisterPointerInternal(void* data,
   allocations_[data] = size;
 }
 
+void ProfilingArrayBufferAllocator::TrackAllocate(void* data, size_t size) {
+  // GC safety: this runs inside the ArrayBuffer allocator where V8 may
+  // prohibit heap allocation and JS execution (e.g. BackingStore::Allocate
+  // inside the ArrayBuffer constructor). Extract only the ALS value
+  // from the CPED at allocation time via HeapProfiler::LookupAlsValue(),
+  // which uses OrderedHashMap::FindEntry (zero-allocation, GC-safe).
+  // HandleScope, GetContinuationPreservedEmbedderData, Global::Reset,
+  // and OrderedHashMap::FindEntry use the handle-scope stack / malloc,
+  // not the V8 heap.
+  v8::Isolate* isolate = isolate_.load(std::memory_order_relaxed);
+  if (std::this_thread::get_id() != main_thread_id_ ||
+      isolate == nullptr) {
+    return;
+  }
+  v8::HandleScope handle_scope(isolate);
+  v8::Local<v8::Value> cped =
+      isolate->GetContinuationPreservedEmbedderData();
+  if (!cped.IsEmpty() && cped->IsMap()) {
+    v8::HeapProfiler* profiler = isolate->GetHeapProfiler();
+    v8::Local<v8::Value> als_value;
+    if (profiler->LookupAlsValue(cped).ToLocal(&als_value)) {
+      Mutex::ScopedLock lock(mutex_);
+      auto& entry = allocations_[data];
+      entry.label_value.Reset(isolate, als_value);
+      entry.size = size;
+    }
+  }
+}
+
+void ProfilingArrayBufferAllocator::TrackFree(void* data) {
+  Mutex::ScopedLock lock(mutex_);
+  allocations_.erase(data);
+}
+
+void ProfilingArrayBufferAllocator::Enable(v8::Isolate* isolate) {
+  // Synchronization strategy: isolate_ serves as the "enabled" sentinel.
+  // TrackAllocate() and GetPerLabelBytes() check isolate_ == nullptr without
+  // holding the mutex as an early-exit guard. We set isolate_ LAST here so
+  // that by the time any reader sees non-null isolate_, main_thread_id_ is
+  // already valid. The corresponding Disable() sets isolate_ FIRST (to
+  // nullptr) before clearing other fields. All accesses to isolate_ outside
+  // the mutex happen on the main thread, so relaxed memory ordering
+  // suffices — std::atomic prevents compiler reordering and torn reads.
+  Mutex::ScopedLock lock(mutex_);
+  main_thread_id_ = std::this_thread::get_id();
+  isolate_.store(isolate, std::memory_order_relaxed);
+}
+
+void ProfilingArrayBufferAllocator::Disable() {
+  // Clear isolate_ FIRST — it is the "enabled" sentinel checked by
+  // TrackAllocate() and GetPerLabelBytes() without holding the mutex.
+  // Setting it to nullptr before clearing other state ensures any
+  // re-entrant call (e.g., if allocations_.clear() triggers a Global
+  // destructor → GC → allocation → TrackAllocate on the same thread)
+  // will see null and exit early.
+  isolate_.store(nullptr, std::memory_order_relaxed);
+  Mutex::ScopedLock lock(mutex_);
+  allocations_.clear();
+}
+
+std::vector<ProfilingArrayBufferAllocator::LabeledBytes>
+ProfilingArrayBufferAllocator::GetPerLabelBytes() const {
+  v8::Isolate* isolate = isolate_.load(std::memory_order_relaxed);
+  if (isolate == nullptr) {
+    return {};
+  }
+
+  v8::HandleScope handle_scope(isolate);
+
+  // Snapshot: convert stored Globals to Locals under the lock, then release
+  // the lock before doing V8 API calls that may trigger GC. If GC fires
+  // weak callbacks that call Free(), they acquire the lock independently
+  // without deadlocking.
+  struct SnapshotEntry {
+    v8::Local<v8::Value> label_value;
+    size_t size;
+  };
+  std::vector<SnapshotEntry> snapshot;
+  {
+    Mutex::ScopedLock lock(mutex_);
+    snapshot.reserve(allocations_.size());
+    for (const auto& [ptr, entry] : allocations_) {
+      if (!entry.label_value.IsEmpty()) {
+        snapshot.push_back({entry.label_value.Get(isolate), entry.size});
+      }
+    }
+  }
+
+  // Resolve labels outside the lock. Array::Get() and String::Utf8Value
+  // may allocate on the V8 heap — safe here because GetPerLabelBytes is
+  // called from GetAllocationProfile (a JS callback) where GC is allowed.
+  // The stored values are ALS values (flat [key, val, ...] arrays) extracted
+  // at allocation time — no Map lookup needed.
+  v8::Local<v8::Context> v8_context = isolate->GetCurrentContext();
+  std::unordered_map<std::string, LabeledBytes> aggregated;
+  for (const auto& snap : snapshot) {
+    if (!snap.label_value->IsArray()) continue;
+    v8::Local<v8::Array> flat = snap.label_value.As<v8::Array>();
+    uint32_t len = flat->Length();
+    LabelPairs labels;
+    for (uint32_t j = 0; j + 1 < len; j += 2) {
+      v8::Local<v8::Value> k, v;
+      if (!flat->Get(v8_context, j).ToLocal(&k)) continue;
+      if (!flat->Get(v8_context, j + 1).ToLocal(&v)) continue;
+      v8::String::Utf8Value key_str(isolate, k);
+      v8::String::Utf8Value val_str(isolate, v);
+      if (*key_str == nullptr || *val_str == nullptr) continue;
+      labels.emplace_back(*key_str, *val_str);
+    }
+    if (labels.empty()) continue;
+    std::string key = SerializeLabels(labels);
+    auto& agg = aggregated[key];
+    if (agg.labels.empty()) agg.labels = std::move(labels);
+    agg.bytes += static_cast<int64_t>(snap.size);
+  }
+
+  std::vector<LabeledBytes> result;
+  result.reserve(aggregated.size());
+  for (auto& [key, entry] : aggregated) {
+    if (entry.bytes > 0) {
+      result.push_back(std::move(entry));
+    }
+  }
+  return result;
+}
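
The snapshot-then-release structure above is a general pattern worth isolating: copy the minimum needed state inside a short critical section, then do the slow or reentrancy-prone work with the mutex dropped. A stand-alone sketch, with a plain std::mutex and ints in place of Mutex and V8 handles:

// Stand-alone sketch of "snapshot under the lock, process outside it".
#include <cstddef>
#include <cstdio>
#include <mutex>
#include <unordered_map>
#include <utility>
#include <vector>

std::mutex mu;
std::unordered_map<int, std::size_t> live;  // stand-in for allocations_

void Report() {
  std::vector<std::pair<int, std::size_t>> snapshot;
  {
    std::lock_guard<std::mutex> lock(mu);  // short critical section
    snapshot.assign(live.begin(), live.end());
  }
  // Lock released: slow per-entry work here can no longer block a
  // concurrent TrackFree-style erase, so there is no deadlock risk if
  // this work indirectly triggers a free (via GC, in the real code).
  for (const auto& [id, bytes] : snapshot)
    std::printf("entry %d: %zu bytes\n", id, bytes);
}

int main() {
  {
    std::lock_guard<std::mutex> lock(mu);
    live = {{1, 64}, {2, 128}};
  }
  Report();
}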
+
+std::string ProfilingArrayBufferAllocator::SerializeLabels(
+    const LabelPairs& labels) {
+  std::string key;
+  for (const auto& [k, v] : labels) {
+    if (!key.empty()) key += '\0';
+    key += k;
+    key += '\0';
+    key += v;
+  }
+  return key;
+}
+
 std::unique_ptr<ArrayBufferAllocator> ArrayBufferAllocator::Create(bool debug) {
   if (debug || per_process::cli_options->debug_arraybuffer_allocations)
     return std::make_unique<DebuggingArrayBufferAllocator>();
-  else
-    return std::make_unique<NodeArrayBufferAllocator>();
+  // Use the plain NodeArrayBufferAllocator by default. When heap profiling
+  // with labels is started (v8.startSamplingHeapProfiler), a
+  // ProfilingArrayBufferAllocator tracker is created on demand and installed
+  // as a delegate — see NodeArrayBufferAllocator::CreateProfilingAllocator().
+  // When profiling is inactive the per-allocation cost is negligible: one
+  // atomic load and a predicted-not-taken null branch, no dispatch change.
+  return std::make_unique<NodeArrayBufferAllocator>();
 }
 
 ArrayBufferAllocator* CreateArrayBufferAllocator() {
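
SerializeLabels() earlier in this hunk builds the aggregation key by joining pairs with NUL separators. The same logic restated stand-alone, with a worked example of the resulting key; the label values are made up for illustration. Note that pair order matters, so the same label set serialized in a different order would produce a different key.

// Same logic as SerializeLabels() above, extracted so the key format is
// easy to inspect: {{"route","/api"},{"method","GET"}} serializes to
// "route\0/api\0method\0GET".
#include <cstdio>
#include <string>
#include <utility>
#include <vector>

using LabelPairs = std::vector<std::pair<std::string, std::string>>;

std::string SerializeLabels(const LabelPairs& labels) {
  std::string key;
  for (const auto& [k, v] : labels) {
    if (!key.empty()) key += '\0';
    key += k;
    key += '\0';
    key += v;
  }
  return key;
}

int main() {
  std::string key = SerializeLabels({{"route", "/api"}, {"method", "GET"}});
  for (char c : key)
    std::printf(c == '\0' ? "\\0" : "%c", c);  // route\0/api\0method\0GET
  std::printf("\n");
}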

src/node_internals.h

Lines changed: 81 additions & 1 deletion
@@ -123,9 +123,11 @@ v8::Maybe<void> InitializePrimordials(v8::Local<v8::Context> context,
 v8::MaybeLocal<v8::Object> InitializePrivateSymbols(
     v8::Local<v8::Context> context, IsolateData* isolate_data);
 
+class ProfilingArrayBufferAllocator;  // Forward declaration.
+
 class NodeArrayBufferAllocator : public ArrayBufferAllocator {
  public:
-  void* Allocate(size_t size) override;  // Defined in src/node.cc
+  void* Allocate(size_t size) override;
   void* AllocateUninitialized(size_t size) override;
   void Free(void* data, size_t size) override;
   virtual void RegisterPointer(void* data, size_t size) {
@@ -136,6 +138,17 @@ class NodeArrayBufferAllocator : public ArrayBufferAllocator {
   }
 
   NodeArrayBufferAllocator* GetImpl() final { return this; }
+  // Returns the profiling tracker if active, nullptr otherwise.
+  ProfilingArrayBufferAllocator* GetProfilingAllocator() {
+    return profiling_allocator_.load(std::memory_order_acquire);
+  }
+  // Lazily create and install a ProfilingArrayBufferAllocator. Returns the
+  // tracker (reuses the existing one if already created).
+  ProfilingArrayBufferAllocator* CreateProfilingAllocator();
+  // Disconnect the profiling allocator so Allocate/Free no longer delegate.
+  // The tracker object stays alive for thread safety and is reused on the
+  // next CreateProfilingAllocator() call.
+  void ClearProfilingAllocator();
   inline uint64_t total_mem_usage() const {
     return total_mem_usage_.load(std::memory_order_relaxed);
   }
@@ -146,6 +159,15 @@ class NodeArrayBufferAllocator : public ArrayBufferAllocator {
   // Delegate to V8's allocator for compatibility with the V8 memory cage.
   std::unique_ptr<v8::ArrayBuffer::Allocator> allocator_{
       v8::ArrayBuffer::Allocator::NewDefaultAllocator()};
+
+  // Owned profiling tracker — created lazily on first profiling start.
+  std::unique_ptr<ProfilingArrayBufferAllocator> owned_profiling_allocator_;
+  // Atomic pointer for fast null-check on the allocation hot path.
+  // Points into owned_profiling_allocator_ when profiling is active, nullptr
+  // otherwise. Uses acquire/release ordering to pair with the release store
+  // in CreateProfilingAllocator/ClearProfilingAllocator — ensures Free()
+  // (called from GC threads) sees the fully constructed object.
+  std::atomic<ProfilingArrayBufferAllocator*> profiling_allocator_{nullptr};
 };
 
 class DebuggingArrayBufferAllocator final : public NodeArrayBufferAllocator {
@@ -164,6 +186,64 @@ class DebuggingArrayBufferAllocator final : public NodeArrayBufferAllocator {
   std::unordered_map<void*, size_t> allocations_;
 };
 
+// Tracks per-label external memory (Buffer/ArrayBuffer backing stores) when
+// heap profiling with labels is active. NOT an allocator itself — installed
+// as a delegate on NodeArrayBufferAllocator via CreateProfilingAllocator().
+// When inactive, the allocation path pays near-zero overhead because
+// NodeArrayBufferAllocator skips delegation when the pointer is null.
+//
+// Synchronization model:
+// isolate_ serves as the "enabled" sentinel. TrackAllocate() and
+// GetPerLabelBytes() check isolate_ == nullptr as the first guard,
+// WITHOUT holding the mutex (they only lock to access allocations_).
+// Enable() sets isolate_ LAST (after main_thread_id_); Disable()
+// sets isolate_ FIRST (to nullptr, before clearing other state and
+// locking). This ordering ensures any re-entrant call that reads
+// isolate_ either sees null (exits early) or sees a fully initialized
+// state. All reads/writes of isolate_ happen on the main thread,
+// so relaxed memory ordering suffices, but isolate_ is std::atomic
+// to prevent compiler reordering and torn reads. The mutex protects
+// allocations_, which is accessed cross-thread (the main thread writes,
+// GC threads call Free).
+class ProfilingArrayBufferAllocator {
+ public:
+  using LabelPairs = std::vector<std::pair<std::string, std::string>>;
+
+  struct LabeledBytes {
+    LabelPairs labels;
+    int64_t bytes = 0;
+  };
+
+  // Called from NodeArrayBufferAllocator::Allocate/AllocateUninitialized
+  // when the profiling delegate is active.
+  void TrackAllocate(void* data, size_t size);
+  // Called from NodeArrayBufferAllocator::Free.
+  void TrackFree(void* data);
+
+  // Called from StartSamplingHeapProfiler/StopSamplingHeapProfiler.
+  void Enable(v8::Isolate* isolate);
+  void Disable();
+
+  // Returns per-label live external bytes (for getAllocationProfile).
+  std::vector<LabeledBytes> GetPerLabelBytes() const;
+
+ private:
+  static std::string SerializeLabels(const LabelPairs& labels);
+
+  std::atomic<v8::Isolate*> isolate_{nullptr};
+
+  std::thread::id main_thread_id_ = std::this_thread::get_id();
+
+  mutable Mutex mutex_;
+  // Maps allocation pointer to the ALS value (flat label array) extracted
+  // at allocation time and the allocation size.
+  struct AllocationEntry {
+    v8::Global<v8::Value> label_value;
+    size_t size;
+  };
+  std::unordered_map<void*, AllocationEntry> allocations_;
+};
+
 namespace Buffer {
 v8::MaybeLocal<v8::Object> Copy(Environment* env, const char* data, size_t len);
 v8::MaybeLocal<v8::Object> New(Environment* env, size_t size);
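
The accounting semantics of the tracker reduce to a simple invariant: per-label live bytes are the sum of the sizes of allocations made under that label and not yet freed. A stand-alone model of that invariant, with a plain std::string label (a made-up "route=..." format) standing in for the v8::Global ALS value:

// Stand-alone model of the tracker's accounting semantics; the label
// strings are illustrative, not the real ALS value representation.
#include <cstddef>
#include <cstdio>
#include <string>
#include <unordered_map>
#include <utility>

struct Entry { std::string label; std::size_t size; };
std::unordered_map<void*, Entry> allocations;

void TrackAllocate(void* p, std::size_t n, std::string label) {
  allocations[p] = {std::move(label), n};
}
void TrackFree(void* p) { allocations.erase(p); }

std::unordered_map<std::string, std::size_t> PerLabelBytes() {
  std::unordered_map<std::string, std::size_t> out;
  for (const auto& [p, e] : allocations) out[e.label] += e.size;
  return out;
}

int main() {
  int a, b, c;  // stand-ins for backing-store pointers
  TrackAllocate(&a, 64, "route=/api");
  TrackAllocate(&b, 128, "route=/api");
  TrackAllocate(&c, 32, "route=/admin");
  TrackFree(&b);  // freeing removes its bytes from the label's total
  for (const auto& [label, bytes] : PerLabelBytes())
    std::printf("%s: %zu live bytes\n", label.c_str(), bytes);
  // Prints: route=/api: 64 live bytes, route=/admin: 32 live bytes
}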
