buffer: add buffer.isUtf8 for utf8 validation

PR-URL: https://github.com/nodejs/node/pull/45947 Reviewed-By: Robert Nagy <ronagy@icloud.com> Reviewed-By: Matteo Collina <matteo.collina@gmail.com> Reviewed-By: Luigi Pinca <luigipinca@gmail.com> Reviewed-By: Rafael Gonzaga <rafael.nunu@hotmail.com> Reviewed-By: Ben Noordhuis <info@bnoordhuis.nl> Reviewed-By: Anna Henningsen <anna@addaleax.net>
2022-12-24 21:32:05 -05:00 · 2022-12-24 21:32:05 -05:00 · d5a08c7e11
commit d5a08c7e11
parent 07fdbbd015
7 changed files with 131 additions and 1 deletions
--- a/doc/api/buffer.md
+++ b/doc/api/buffer.md
@ -5130,6 +5130,17 @@ For code running using Node.js APIs, converting between base64-encoded strings
 and binary data should be performed using `Buffer.from(str, 'base64')` and
 `buf.toString('base64')`.**

+### `buffer.isUtf8(input)`
+
+<!-- YAML
+added: REPLACEME
+-->
+
+* `input` {Buffer | ArrayBuffer | TypedArray} The input to validate.
+* Returns: {boolean} Returns `true` if and only if the input is valid UTF-8.
+
+This function returns `true` if `input` contains only valid UTF-8-encoded data,
+including the case in which `input` is empty.
+
+Throws if the `input` is a detached array buffer.
+
 ### `buffer.INSPECT_MAX_BYTES`

 <!-- YAML
--- a/lib/buffer.js
+++ b/lib/buffer.js
@ -57,6 +57,7 @@ const {
  compareOffset,
  createFromString,
  fill: bindingFill,
+  isUtf8: bindingIsUtf8,
  indexOfBuffer,
  indexOfNumber,
  indexOfString,
@ -84,7 +85,8 @@ const {
 const {
  isAnyArrayBuffer,
  isArrayBufferView,
-  isUint8Array
+  isUint8Array,
+  isTypedArray,
 } = require('internal/util/types');
 const {
  inspect: utilInspect
@ -1314,10 +1316,19 @@ function atob(input) {
  return Buffer.from(input, 'base64').toString('latin1');
 }

+function isUtf8(input) {
+  if (isTypedArray(input) || isAnyArrayBuffer(input)) {
+    return bindingIsUtf8(input);
+  }
+
+  throw new ERR_INVALID_ARG_TYPE('input', ['TypedArray', 'Buffer'], input);
+}
+
 module.exports = {
  Buffer,
  SlowBuffer,
  transcode,
+  isUtf8,

  // Legacy
  kMaxLength,
--- a/src/node_buffer.cc
+++ b/src/node_buffer.cc
@ -1223,6 +1223,20 @@ static void EncodeInto(const FunctionCallbackInfo<Value>& args) {
  results[1] = written;
 }

+// Binding behind `isUtf8`: sets the return value to the result of
+// simdutf::validate_utf8() over the bytes of the single argument.
+// Throws ERR_INVALID_STATE when the contents are flagged as detached.
+static void IsUtf8(const FunctionCallbackInfo<Value>& args) {
+  Environment* env = Environment::GetCurrent(args);
+  // The argument type is validated in lib/buffer.js before this binding is
+  // called; the CHECKs below only assert that invariant.
+  CHECK_EQ(args.Length(), 1);
+  CHECK(args[0]->IsTypedArray() || args[0]->IsArrayBuffer() ||
+        args[0]->IsSharedArrayBuffer());
+  ArrayBufferViewContents<char> abv(args[0]);
+
+  // NOTE(review): the ReadValue() change in this commit records detachment
+  // only in its ArrayBuffer branch; confirm how views over a detached buffer
+  // are reported here.
+  if (abv.WasDetached()) {
+    return node::THROW_ERR_INVALID_STATE(
+        env, "Cannot validate on a detached buffer");
+  }
+
+  args.GetReturnValue().Set(simdutf::validate_utf8(abv.data(), abv.length()));
+}

 void SetBufferPrototype(const FunctionCallbackInfo<Value>& args) {
  Environment* env = Environment::GetCurrent(args);
@ -1358,6 +1372,8 @@ void Initialize(Local<Object> target,
  SetMethod(context, target, "encodeInto", EncodeInto);
  SetMethodNoSideEffect(context, target, "encodeUtf8String", EncodeUtf8String);

+  SetMethodNoSideEffect(context, target, "isUtf8", IsUtf8);
+
  target
      ->Set(context,
            FIXED_ONE_BYTE_STRING(isolate, "kMaxLength"),
@ -1413,6 +1429,8 @@ void RegisterExternalReferences(ExternalReferenceRegistry* registry) {
  registry->Register(EncodeInto);
  registry->Register(EncodeUtf8String);

+  registry->Register(IsUtf8);
+
  registry->Register(StringSlice<ASCII>);
  registry->Register(StringSlice<BASE64>);
  registry->Register(StringSlice<BASE64URL>);
--- a/src/node_errors.h
+++ b/src/node_errors.h
@ -68,6 +68,7 @@ void OOMErrorHandler(const char* location, const v8::OOMDetails& details);
  V(ERR_INVALID_ARG_TYPE, TypeError)                                           \
  V(ERR_INVALID_OBJECT_DEFINE_PROPERTY, TypeError)                             \
  V(ERR_INVALID_MODULE, Error)                                                 \
+  V(ERR_INVALID_STATE, Error)                                                  \
  V(ERR_INVALID_THIS, TypeError)                                               \
  V(ERR_INVALID_TRANSFER_OBJECT, TypeError)                                    \
  V(ERR_MEMORY_ALLOCATION_FAILED, Error)                                       \
--- a/src/util-inl.h
+++ b/src/util-inl.h
@ -555,6 +555,7 @@ void ArrayBufferViewContents<T, S>::ReadValue(v8::Local<v8::Value> buf) {
    auto ab = buf.As<v8::ArrayBuffer>();
    length_ = ab->ByteLength();
    data_ = static_cast<T*>(ab->Data());
+    was_detached_ = ab->WasDetached();
  } else {
    CHECK(buf->IsSharedArrayBuffer());
    auto sab = buf.As<v8::SharedArrayBuffer>();
--- a/src/util.h
+++ b/src/util.h
@ -511,6 +511,7 @@ class ArrayBufferViewContents {
  inline void Read(v8::Local<v8::ArrayBufferView> abv);
  inline void ReadValue(v8::Local<v8::Value> buf);

+  inline bool WasDetached() const { return was_detached_; }
  inline const T* data() const { return data_; }
  inline size_t length() const { return length_; }

@ -525,6 +526,7 @@ class ArrayBufferViewContents {
  T stack_storage_[kStackStorageSize];
  T* data_ = nullptr;
  size_t length_ = 0;
+  bool was_detached_ = false;
 };

 class Utf8Value : public MaybeStackBuffer<char> {
--- a/test/parallel/test-buffer-isutf8.js
+++ b/test/parallel/test-buffer-isutf8.js
@ -0,0 +1,86 @@
+'use strict';
+
+require('../common');
+const assert = require('assert');
+const { isUtf8, Buffer } = require('buffer');
+const { TextEncoder } = require('util');
+
+const encoder = new TextEncoder();
+
+// Well-formed input must be accepted: ASCII text, a non-ASCII character
+// encoded by TextEncoder, and an empty buffer.
+assert.strictEqual(isUtf8(encoder.encode('hello')), true);
+assert.strictEqual(isUtf8(encoder.encode('ğ')), true);
+assert.strictEqual(isUtf8(Buffer.from([])), true);
+
+// Taken from test/fixtures/wpt/encoding/textdecoder-fatal.any.js
+// Each entry is a byte sequence that isUtf8() must report as invalid; the
+// trailing comment on each line names the case.
+[
+  [0xFF], // 'invalid code'
+  [0xC0], // 'ends early'
+  [0xE0], // 'ends early 2'
+  [0xC0, 0x00], // 'invalid trail'
+  [0xC0, 0xC0], // 'invalid trail 2'
+  [0xE0, 0x00], // 'invalid trail 3'
+  [0xE0, 0xC0], // 'invalid trail 4'
+  [0xE0, 0x80, 0x00], // 'invalid trail 5'
+  [0xE0, 0x80, 0xC0], // 'invalid trail 6'
+  [0xFC, 0x80, 0x80, 0x80, 0x80, 0x80], // '> 0x10FFFF'
+  [0xFE, 0x80, 0x80, 0x80, 0x80, 0x80], // 'obsolete lead byte'
+
+  // Overlong encodings
+  [0xC0, 0x80], // 'overlong U+0000 - 2 bytes'
+  [0xE0, 0x80, 0x80], // 'overlong U+0000 - 3 bytes'
+  [0xF0, 0x80, 0x80, 0x80], // 'overlong U+0000 - 4 bytes'
+  [0xF8, 0x80, 0x80, 0x80, 0x80], // 'overlong U+0000 - 5 bytes'
+  [0xFC, 0x80, 0x80, 0x80, 0x80, 0x80], // 'overlong U+0000 - 6 bytes'
+
+  [0xC1, 0xBF], // 'overlong U+007F - 2 bytes'
+  [0xE0, 0x81, 0xBF], // 'overlong U+007F - 3 bytes'
+  [0xF0, 0x80, 0x81, 0xBF], // 'overlong U+007F - 4 bytes'
+  [0xF8, 0x80, 0x80, 0x81, 0xBF], // 'overlong U+007F - 5 bytes'
+  [0xFC, 0x80, 0x80, 0x80, 0x81, 0xBF], // 'overlong U+007F - 6 bytes'
+
+  [0xE0, 0x9F, 0xBF], // 'overlong U+07FF - 3 bytes'
+  [0xF0, 0x80, 0x9F, 0xBF], // 'overlong U+07FF - 4 bytes'
+  [0xF8, 0x80, 0x80, 0x9F, 0xBF], // 'overlong U+07FF - 5 bytes'
+  [0xFC, 0x80, 0x80, 0x80, 0x9F, 0xBF], // 'overlong U+07FF - 6 bytes'
+
+  [0xF0, 0x8F, 0xBF, 0xBF], // 'overlong U+FFFF - 4 bytes'
+  [0xF8, 0x80, 0x8F, 0xBF, 0xBF], // 'overlong U+FFFF - 5 bytes'
+  [0xFC, 0x80, 0x80, 0x8F, 0xBF, 0xBF], // 'overlong U+FFFF - 6 bytes'
+
+  [0xF8, 0x84, 0x8F, 0xBF, 0xBF], // 'overlong U+10FFFF - 5 bytes'
+  [0xFC, 0x80, 0x84, 0x8F, 0xBF, 0xBF], // 'overlong U+10FFFF - 6 bytes'
+
+  // UTF-16 surrogates encoded as code points in UTF-8
+  [0xED, 0xA0, 0x80], // 'lead surrogate'
+  [0xED, 0xB0, 0x80], // 'trail surrogate'
+  [0xED, 0xA0, 0x80, 0xED, 0xB0, 0x80], // 'surrogate pair'
+].forEach((input) => {
+  assert.strictEqual(isUtf8(Buffer.from(input)), false);
+});
+
+// Arguments that are neither a TypedArray nor an ArrayBuffer must be
+// rejected with ERR_INVALID_ARG_TYPE instead of being validated.
+[
+  null,
+  undefined,
+  'hello',
+  true,
+  false,
+].forEach((input) => {
+  assert.throws(
+    () => { isUtf8(input); },
+    {
+      code: 'ERR_INVALID_ARG_TYPE',
+    },
+  );
+});
+
+{
+  // Test with detached array buffers
+  const arrayBuffer = new ArrayBuffer(1024);
+  // Listing the buffer in `transfer` detaches the original ArrayBuffer.
+  structuredClone(arrayBuffer, { transfer: [arrayBuffer] });
+  // Validation of a detached buffer is expected to throw rather than
+  // return a boolean.
+  assert.throws(
+    () => { isUtf8(arrayBuffer); },
+    {
+      code: 'ERR_INVALID_STATE'
+    }
+  );
+}