Skip to content
Merged
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
176 changes: 168 additions & 8 deletions src/encoding_binding.cc
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,10 @@
#include "node_external_reference.h"
#include "simdutf.h"
#include "string_bytes.h"
#include "util.h"
#include "v8.h"

#include <algorithm>
#include <cstdint>

namespace node {
Expand Down Expand Up @@ -71,6 +73,93 @@ InternalFieldInfoBase* BindingData::Serialize(int index) {
return info;
}

// The following code is adapted from Cloudflare workers.
// Particularly from: https://github.com/cloudflare/workerd/pull/5448
//
// Copyright (c) 2017-2025 Cloudflare, Inc.
// Licensed under the Apache 2.0 license found in the LICENSE file or at:
// https://opensource.org/licenses/Apache-2.0
namespace {
constexpr int MAX_SIZE_FOR_STACK_ALLOC = 4096;
Comment thread
jasnell marked this conversation as resolved.

constexpr bool isSurrogatePair(uint16_t lead, uint16_t trail) {
return (lead & 0xfc00) == 0xd800 && (trail & 0xfc00) == 0xdc00;
}

constexpr size_t simpleUtfEncodingLength(uint16_t c) {
if (c < 0x80) return 1;
if (c < 0x400) return 2;
return 3;
}

template <typename Char>
size_t findBestFit(const Char* data, size_t length, size_t bufferSize) {
Comment thread
anonrig marked this conversation as resolved.
Comment thread
anonrig marked this conversation as resolved.
size_t pos = 0;
size_t utf8Accumulated = 0;
constexpr size_t CHUNK = 257;
constexpr bool UTF16 = sizeof(Char) == 2;
constexpr size_t MAX_FACTOR = UTF16 ? 3 : 2;

double expansion = 1.15;

while (pos < length && utf8Accumulated < bufferSize) {
size_t remainingInput = length - pos;
size_t spaceRemaining = bufferSize - utf8Accumulated;
DCHECK_GE(expansion, 1.15);

size_t guaranteedToFit = spaceRemaining / MAX_FACTOR;
if (guaranteedToFit >= remainingInput) {
return length;
}
size_t likelyToFit =
std::min(static_cast<size_t>(spaceRemaining / expansion), CHUNK);
size_t fitEstimate =
std::max(size_t{1}, std::max(guaranteedToFit, likelyToFit));
size_t chunkSize = std::min(remainingInput, fitEstimate);
if (chunkSize == 1) break;
DCHECK_GT(chunkSize, 1);
Comment thread
anonrig marked this conversation as resolved.
Outdated

size_t chunkUtf8Len;
if constexpr (UTF16) {
// TODO(anonrig): Use utf8_length_from_utf16_with_replacement when
Comment thread
anonrig marked this conversation as resolved.
// available For now, validate and use utf8_length_from_utf16
size_t newPos = pos + chunkSize;
if (newPos < length && isSurrogatePair(data[newPos - 1], data[newPos]))
chunkSize--;
chunkUtf8Len = simdutf::utf8_length_from_utf16(data + pos, chunkSize);
Comment thread
anonrig marked this conversation as resolved.
} else {
chunkUtf8Len = simdutf::utf8_length_from_latin1(data + pos, chunkSize);
}

if (utf8Accumulated + chunkUtf8Len > bufferSize) {
DCHECK_GT(chunkSize, guaranteedToFit);
expansion = std::max(expansion * 1.1, (chunkUtf8Len * 1.1) / chunkSize);
} else {
expansion = std::max(1.15, (chunkUtf8Len * 1.1) / chunkSize);
pos += chunkSize;
utf8Accumulated += chunkUtf8Len;
}
}

while (pos < length && utf8Accumulated < bufferSize) {
size_t extra = simpleUtfEncodingLength(data[pos]);
if (utf8Accumulated + extra > bufferSize) break;
pos++;
utf8Accumulated += extra;
}

if (UTF16 && pos != 0 && pos != length &&
isSurrogatePair(data[pos - 1], data[pos])) {
if (utf8Accumulated < bufferSize) {
pos++;
} else {
pos--;
}
}
return pos;
}
} // namespace

void BindingData::Deserialize(Local<Context> context,
Local<Object> holder,
int index,
Expand Down Expand Up @@ -98,18 +187,89 @@ void BindingData::EncodeInto(const FunctionCallbackInfo<Value>& args) {

Local<Uint8Array> dest = args[1].As<Uint8Array>();
Local<ArrayBuffer> buf = dest->Buffer();

// Handle detached buffers - return {read: 0, written: 0}
if (buf->Data() == nullptr) {
binding_data->encode_into_results_buffer_[0] = 0;
binding_data->encode_into_results_buffer_[1] = 0;
return;
}

char* write_result = static_cast<char*>(buf->Data()) + dest->ByteOffset();
size_t dest_length = dest->ByteLength();

size_t nchars;
size_t written = source->WriteUtf8V2(isolate,
write_result,
dest_length,
String::WriteFlags::kReplaceInvalidUtf8,
&nchars);
size_t read = 0;
size_t written = 0;
v8::String::ValueView view(isolate, source);
Comment thread
anonrig marked this conversation as resolved.
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
v8::String::ValueView view(isolate, source);
// Note: Take care not to perform any action in v8 that would
// trigger GC as it could cause the view to become invalid.
v8::String::ValueView view(isolate, source);

Comment thread
anonrig marked this conversation as resolved.
size_t length_that_fits =
std::min(static_cast<size_t>(view.length()), dest_length);

if (view.is_one_byte()) {
auto data = reinterpret_cast<const char*>(view.data8());
simdutf::result result =
simdutf::validate_ascii_with_errors(data, length_that_fits);
written = read = result.count;
memcpy(write_result, data, read);
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A DCHECK for bounds checking would be nice here.

Comment thread
anonrig marked this conversation as resolved.
write_result += read;
data += read;
length_that_fits -= read;
dest_length -= read;
if (length_that_fits != 0 && dest_length != 0) {
size_t rest = findBestFit(data, length_that_fits, dest_length);
if (rest != 0) {
Comment thread
anonrig marked this conversation as resolved.
Outdated
DCHECK_LE(simdutf::utf8_length_from_latin1(data, rest), dest_length);
written += simdutf::convert_latin1_to_utf8(data, rest, write_result);
read += rest;
}
}
} else {
auto data = reinterpret_cast<const char16_t*>(view.data16());

// Limit conversion to what could fit in destination, avoiding splitting
// a valid surrogate pair at the boundary, which could cause a spurious call
// of simdutf::to_well_formed_utf16()
if (length_that_fits > 0 && length_that_fits < view.length() &&
isSurrogatePair(data[length_that_fits - 1], data[length_that_fits])) {
length_that_fits--;
}

// Check if input has unpaired surrogates - if so, convert to well-formed
// first
simdutf::result validation_result =
simdutf::validate_utf16_with_errors(data, length_that_fits);

if (validation_result.error == simdutf::SUCCESS) {
// Valid UTF-16 - use the fast path
read = findBestFit(data, length_that_fits, dest_length);
if (read != 0) {
DCHECK_LE(simdutf::utf8_length_from_utf16(data, read), dest_length);
written = simdutf::convert_utf16_to_utf8(data, read, write_result);
}
} else {
// Invalid UTF-16 with unpaired surrogates - convert to well-formed first
// TODO(anonrig): Use utf8_length_from_utf16_with_replacement when
// available
MaybeStackBuffer<char16_t, MAX_SIZE_FOR_STACK_ALLOC> conversion_buffer(
length_that_fits);
simdutf::to_well_formed_utf16(
data, length_that_fits, conversion_buffer.out());

// Now use findBestFit with the well-formed data
read =
findBestFit(conversion_buffer.out(), length_that_fits, dest_length);
if (read != 0) {
DCHECK_LE(
simdutf::utf8_length_from_utf16(conversion_buffer.out(), read),
dest_length);
written = simdutf::convert_utf16_to_utf8(
conversion_buffer.out(), read, write_result);
}
}
}
DCHECK_LE(written, dest->ByteLength(););
Comment thread
anonrig marked this conversation as resolved.
Outdated
Comment thread
anonrig marked this conversation as resolved.
Outdated

binding_data->encode_into_results_buffer_[0] = nchars;
binding_data->encode_into_results_buffer_[1] = written;
binding_data->encode_into_results_buffer_[0] = static_cast<double>(read);
binding_data->encode_into_results_buffer_[1] = static_cast<double>(written);
}

// Encode a single string to a UTF-8 Uint8Array (not Buffer).
Expand Down
Loading