2019-02-25 17:27:19 -05:00
|
|
|
#include "string_decoder.h" // NOLINT(build/include_inline)
|
|
|
|
#include "string_decoder-inl.h"
|
|
|
|
|
2019-01-13 17:17:26 +08:00
|
|
|
#include "env-inl.h"
|
2018-02-01 02:28:39 +01:00
|
|
|
#include "node_buffer.h"
|
2020-12-28 14:18:48 +01:00
|
|
|
#include "node_errors.h"
|
2020-04-22 02:57:16 +08:00
|
|
|
#include "node_external_reference.h"
|
2019-01-13 17:17:26 +08:00
|
|
|
#include "string_bytes.h"
|
2019-05-09 12:52:38 -07:00
|
|
|
#include "util.h"
|
2018-02-01 02:28:39 +01:00
|
|
|
|
|
|
|
using v8::Array;
|
2019-02-25 04:12:19 +01:00
|
|
|
using v8::ArrayBufferView;
|
2018-02-01 02:28:39 +01:00
|
|
|
using v8::Context;
|
|
|
|
using v8::FunctionCallbackInfo;
|
|
|
|
using v8::Integer;
|
|
|
|
using v8::Isolate;
|
|
|
|
using v8::Local;
|
|
|
|
using v8::MaybeLocal;
|
|
|
|
using v8::Object;
|
|
|
|
using v8::String;
|
|
|
|
using v8::Value;
|
|
|
|
|
|
|
|
namespace node {
|
|
|
|
|
|
|
|
namespace {
|
|
|
|
|
|
|
|
// Builds a JS string from `length` bytes at `data`, interpreted as `encoding`.
//
// On failure an exception is thrown on `isolate` and an empty MaybeLocal is
// returned:
//  - UTF-8 input that is longer than v8::String::kMaxLength, or for which V8
//    does not produce a string, raises ERR_STRING_TOO_LONG;
//  - every other encoding goes through StringBytes::Encode() and rethrows
//    the error value that call reports.
MaybeLocal<String> MakeString(Isolate* isolate,
                              const char* data,
                              size_t length,
                              enum encoding encoding) {
  if (encoding != UTF8) {
    Local<Value> error;
    MaybeLocal<Value> ret =
        StringBytes::Encode(isolate, data, length, encoding, &error);

    if (ret.IsEmpty()) {
      // A failed Encode() is expected to have filled in `error`.
      CHECK(!error.IsEmpty());
      isolate->ThrowException(error);
    }

    DCHECK(ret.IsEmpty() || ret.ToLocalChecked()->IsString());
    return ret.FromMaybe(Local<Value>()).As<String>();
  }

  // UTF-8 is handed straight to V8, but only when the byte count does not
  // exceed the maximum string length; otherwise the result stays empty.
  MaybeLocal<String> decoded;
  const bool within_limit =
      length <= static_cast<size_t>(v8::String::kMaxLength);
  if (within_limit) {
    decoded = String::NewFromUtf8(
        isolate, data, v8::NewStringType::kNormal, length);
  }

  if (!decoded.IsEmpty()) {
    return decoded;
  }

  isolate->ThrowException(node::ERR_STRING_TOO_LONG(isolate));
  return MaybeLocal<String>();
}
|
|
|
|
|
|
|
|
} // anonymous namespace
|
|
|
|
|
|
|
|
|
|
|
|
// Decodes one chunk of input into a JS string, carrying characters that are
// split across chunk boundaries over to the next call.
//
//  - `data` points at the chunk.
//  - `nread_ptr` is in/out. On entry it holds the chunk length. It is
//    increased by the number of previously buffered bytes that were turned
//    into output by this call, and decreased by the number of trailing bytes
//    of this chunk that were held back for the next call.
//
// Returns an empty MaybeLocal (with an exception pending) if string creation
// fails.
MaybeLocal<String> StringDecoder::DecodeData(Isolate* isolate,
                                             const char* data,
                                             size_t* nread_ptr) {
  size_t nread = *nread_ptr;

  // Only these encodings can have a unit of data split across chunks.
  const bool may_split = Encoding() == UTF8 ||
                         Encoding() == UCS2 ||
                         Encoding() == BASE64 ||
                         Encoding() == BASE64URL;
  if (!may_split) {
    // Everything else is decoded chunk by chunk without any buffering.
    CHECK(Encoding() == ASCII || Encoding() == HEX || Encoding() == LATIN1);
    return MakeString(isolate, data, nread, Encoding());
  }

  // String built from a character that started in an earlier chunk and is
  // completed (or abandoned) by this one. Stays empty if there is none.
  Local<String> prepend;

  // Step 1: try to finish a character left over from the previous chunk by
  // appending the new bytes to the incomplete-character buffer.
  if (MissingBytes() > 0) {
    // There are never more bytes missing than the pre-calculated maximum.
    CHECK_LE(MissingBytes() + BufferedBytes(),
             kIncompleteCharactersEnd);

    if (Encoding() == UTF8) {
      // UTF-8 needs special care to match the V8 decoder: if one of the
      // bytes that should continue the buffered character is not a
      // continuation byte, the incomplete character ends right there. The
      // continuation bytes seen so far are still appended to the buffer, and
      // the unexpected byte is treated as the start of a new character.
      const size_t scan_limit =
          std::min(nread, static_cast<size_t>(MissingBytes()));
      size_t idx = 0;
      while (idx < scan_limit && (data[idx] & 0xC0) == 0x80) {
        ++idx;
      }
      if (idx < scan_limit) {
        state_[kMissingBytes] = 0;
        memcpy(IncompleteCharacterBuffer() + BufferedBytes(), data, idx);
        state_[kBufferedBytes] += idx;
        data += idx;
        nread -= idx;
      }
    }

    // Move as many of the still-missing bytes as this chunk can provide.
    const size_t found_bytes =
        std::min(nread, static_cast<size_t>(MissingBytes()));
    memcpy(IncompleteCharacterBuffer() + BufferedBytes(),
           data,
           found_bytes);
    data += found_bytes;
    nread -= found_bytes;

    state_[kMissingBytes] -= found_bytes;
    state_[kBufferedBytes] += found_bytes;

    if (MissingBytes() == 0) [[likely]] {
      // Nothing is missing any more: turn the buffer into a small string
      // that is later put in front of the main body.
      if (!MakeString(isolate,
                      IncompleteCharacterBuffer(),
                      BufferedBytes(),
                      Encoding()).ToLocal(&prepend)) {
        return MaybeLocal<String>();
      }

      *nread_ptr += BufferedBytes();
      // The buffer has been consumed.
      state_[kBufferedBytes] = 0;
    }
  }

  // Finishing the previous character may already have used up the whole
  // chunk; in that case the prepend string (if any) is the entire result.
  if (nread == 0) [[unlikely]] {
    return prepend.IsEmpty() ? String::Empty(isolate) : prepend;
  }

  // Data is left over, so there is no character left to finish here.
  DCHECK_EQ(MissingBytes(), 0);
  DCHECK_EQ(BufferedBytes(), 0);

  // Step 2: check whether this chunk ends in the middle of a character that
  // has to be cut off and completed by the next chunk.
  if (Encoding() == UTF8 && (data[nread - 1] & 0x80) != 0) {
    // The chunk ends on a non-ASCII UTF-8 byte. Walk backwards to find the
    // lead byte of the character it belongs to, counting bytes as we go.
    for (size_t i = nread - 1; ; --i) {
      DCHECK_LT(i, nread);
      state_[kBufferedBytes]++;

      if ((data[i] & 0xC0) == 0x80) {
        // A continuation ("trailing") byte; keep looking unless we give up.
        if (state_[kBufferedBytes] >= 4 || i == 0) {
          // Either there are more trailing bytes than any character that is
          // valid Unicode (and representable in JavaScript's UTF-16 strings)
          // could have, or the chunk does not contain the start of a UTF-8
          // character at all. Both are invalid UTF-8, so nothing is held
          // back and the engine's decoder deals with it.
          state_[kBufferedBytes] = 0;
          break;
        }
        continue;
      }

      // This is a lead byte; its upper bits say how long the character
      // *should* be.
      const char lead = data[i];
      if ((lead & 0xE0) == 0xC0) {
        state_[kMissingBytes] = 2;
      } else if ((lead & 0xF0) == 0xE0) {
        state_[kMissingBytes] = 3;
      } else if ((lead & 0xF8) == 0xF0) {
        state_[kMissingBytes] = 4;
      } else {
        // The lead byte announces a character outside the representable
        // range; hold nothing back.
        state_[kBufferedBytes] = 0;
        break;
      }

      if (BufferedBytes() >= MissingBytes()) {
        // The chunk holds at least as many bytes as the lead byte asks for.
        // With exactly as many, the character is complete and nothing needs
        // slicing off; with more, the input is invalid UTF-8 anyway.
        state_[kMissingBytes] = 0;
        state_[kBufferedBytes] = 0;
      }

      // What is still missing is the expected length minus what we have.
      state_[kMissingBytes] -= state_[kBufferedBytes];
      break;
    }
  } else if (Encoding() == UCS2) {
    if ((nread % 2) == 1) {
      // Half a code unit arrived; its second byte is still needed.
      state_[kBufferedBytes] = 1;
      state_[kMissingBytes] = 1;
    } else if ((data[nread - 1] & 0xFC) == 0xD8) {
      // The chunk ends with the first half of a split UTF-16 character.
      state_[kBufferedBytes] = 2;
      state_[kMissingBytes] = 2;
    }
  } else if (Encoding() == BASE64 || Encoding() == BASE64URL) {
    // Input is consumed in groups of three bytes; hold back the remainder.
    state_[kBufferedBytes] = nread % 3;
    if (state_[kBufferedBytes] > 0)
      state_[kMissingBytes] = 3 - BufferedBytes();
  }

  if (BufferedBytes() > 0) {
    // Move the held-back bytes from the end of the chunk into the
    // incomplete-character buffer and exclude them from this call's output.
    nread -= BufferedBytes();
    *nread_ptr -= BufferedBytes();
    memcpy(IncompleteCharacterBuffer(), data + nread, BufferedBytes());
  }

  // Step 3: decode whatever remains of the chunk.
  Local<String> body;
  if (nread > 0) {
    if (!MakeString(isolate, data, nread, Encoding()).ToLocal(&body))
      return MaybeLocal<String>();
  } else {
    body = String::Empty(isolate);
  }

  if (prepend.IsEmpty()) {
    return body;
  }
  return String::Concat(isolate, prepend, body);
}
|
|
|
|
|
|
|
|
// Emits whatever is still held in the incomplete-character buffer as a
// string and clears the missing/buffered counters. Returns the empty string
// when nothing is buffered.
MaybeLocal<String> StringDecoder::FlushData(Isolate* isolate) {
  switch (Encoding()) {
    case ASCII:
    case HEX:
    case LATIN1:
      // These encodings must never have anything pending.
      CHECK_EQ(MissingBytes(), 0);
      CHECK_EQ(BufferedBytes(), 0);
      break;
    case UCS2:
      if (BufferedBytes() % 2 == 1) {
        // Ignore a single trailing byte, like the JS decoder does.
        state_[kMissingBytes]--;
        state_[kBufferedBytes]--;
      }
      break;
    default:
      break;
  }

  if (BufferedBytes() == 0) {
    return String::Empty(isolate);
  }

  MaybeLocal<String> flushed = MakeString(isolate,
                                          IncompleteCharacterBuffer(),
                                          BufferedBytes(),
                                          Encoding());

  // The counters are reset whether or not string creation succeeded.
  state_[kMissingBytes] = 0;
  state_[kBufferedBytes] = 0;

  return flushed;
}
|
|
|
|
|
|
|
|
namespace {
|
|
|
|
|
|
|
|
void DecodeData(const FunctionCallbackInfo<Value>& args) {
|
|
|
|
StringDecoder* decoder =
|
|
|
|
reinterpret_cast<StringDecoder*>(Buffer::Data(args[0]));
|
2018-05-20 17:44:06 +02:00
|
|
|
CHECK_NOT_NULL(decoder);
|
2019-02-25 04:12:19 +01:00
|
|
|
|
|
|
|
CHECK(args[1]->IsArrayBufferView());
|
|
|
|
ArrayBufferViewContents<char> content(args[1].As<ArrayBufferView>());
|
|
|
|
size_t length = content.length();
|
|
|
|
|
2018-02-01 02:28:39 +01:00
|
|
|
MaybeLocal<String> ret =
|
2019-02-25 04:12:19 +01:00
|
|
|
decoder->DecodeData(args.GetIsolate(), content.data(), &length);
|
2018-02-01 02:28:39 +01:00
|
|
|
if (!ret.IsEmpty())
|
|
|
|
args.GetReturnValue().Set(ret.ToLocalChecked());
|
|
|
|
}
|
|
|
|
|
|
|
|
void FlushData(const FunctionCallbackInfo<Value>& args) {
|
|
|
|
StringDecoder* decoder =
|
|
|
|
reinterpret_cast<StringDecoder*>(Buffer::Data(args[0]));
|
2018-05-20 17:44:06 +02:00
|
|
|
CHECK_NOT_NULL(decoder);
|
2018-02-01 02:28:39 +01:00
|
|
|
MaybeLocal<String> ret = decoder->FlushData(args.GetIsolate());
|
|
|
|
if (!ret.IsEmpty())
|
|
|
|
args.GetReturnValue().Set(ret.ToLocalChecked());
|
|
|
|
}
|
|
|
|
|
|
|
|
void InitializeStringDecoder(Local<Object> target,
|
|
|
|
Local<Value> unused,
|
2018-11-29 17:24:47 -08:00
|
|
|
Local<Context> context,
|
|
|
|
void* priv) {
|
2018-02-01 02:28:39 +01:00
|
|
|
Environment* env = Environment::GetCurrent(context);
|
|
|
|
Isolate* isolate = env->isolate();
|
|
|
|
|
|
|
|
#define SET_DECODER_CONSTANT(name) \
|
|
|
|
target->Set(context, \
|
|
|
|
FIXED_ONE_BYTE_STRING(isolate, #name), \
|
|
|
|
Integer::New(isolate, StringDecoder::name)).FromJust()
|
|
|
|
|
|
|
|
SET_DECODER_CONSTANT(kIncompleteCharactersStart);
|
|
|
|
SET_DECODER_CONSTANT(kIncompleteCharactersEnd);
|
|
|
|
SET_DECODER_CONSTANT(kMissingBytes);
|
|
|
|
SET_DECODER_CONSTANT(kBufferedBytes);
|
|
|
|
SET_DECODER_CONSTANT(kEncodingField);
|
|
|
|
SET_DECODER_CONSTANT(kNumFields);
|
|
|
|
|
|
|
|
Local<Array> encodings = Array::New(isolate);
|
|
|
|
#define ADD_TO_ENCODINGS_ARRAY(cname, jsname) \
|
|
|
|
encodings->Set(context, \
|
|
|
|
static_cast<int32_t>(cname), \
|
|
|
|
FIXED_ONE_BYTE_STRING(isolate, jsname)).FromJust()
|
|
|
|
ADD_TO_ENCODINGS_ARRAY(ASCII, "ascii");
|
|
|
|
ADD_TO_ENCODINGS_ARRAY(UTF8, "utf8");
|
|
|
|
ADD_TO_ENCODINGS_ARRAY(BASE64, "base64");
|
2021-01-15 21:01:28 +01:00
|
|
|
ADD_TO_ENCODINGS_ARRAY(BASE64URL, "base64url");
|
2018-02-01 02:28:39 +01:00
|
|
|
ADD_TO_ENCODINGS_ARRAY(UCS2, "utf16le");
|
|
|
|
ADD_TO_ENCODINGS_ARRAY(HEX, "hex");
|
|
|
|
ADD_TO_ENCODINGS_ARRAY(BUFFER, "buffer");
|
|
|
|
ADD_TO_ENCODINGS_ARRAY(LATIN1, "latin1");
|
|
|
|
|
|
|
|
target->Set(context,
|
|
|
|
FIXED_ONE_BYTE_STRING(isolate, "encodings"),
|
2019-04-09 15:21:36 -07:00
|
|
|
encodings).Check();
|
2018-02-01 02:28:39 +01:00
|
|
|
|
|
|
|
target->Set(context,
|
|
|
|
FIXED_ONE_BYTE_STRING(isolate, "kSize"),
|
2019-04-09 15:21:36 -07:00
|
|
|
Integer::New(isolate, sizeof(StringDecoder))).Check();
|
2018-02-01 02:28:39 +01:00
|
|
|
|
2022-08-02 00:01:02 +08:00
|
|
|
SetMethod(context, target, "decode", DecodeData);
|
|
|
|
SetMethod(context, target, "flush", FlushData);
|
2018-02-01 02:28:39 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
} // anonymous namespace
|
|
|
|
|
2020-04-22 02:57:16 +08:00
|
|
|
// Registers the native callbacks of this binding (DecodeData and FlushData,
// in that order) with the external reference registry.
void RegisterStringDecoderExternalReferences(
    ExternalReferenceRegistry* registry) {
  const v8::FunctionCallback callbacks[] = {DecodeData, FlushData};
  for (const v8::FunctionCallback callback : callbacks) {
    registry->Register(callback);
  }
}
|
|
|
|
|
2018-02-01 02:28:39 +01:00
|
|
|
} // namespace node
|
|
|
|
|
2022-11-21 01:37:56 +08:00
|
|
|
// Hook the `string_decoder` binding up to its initializer and to the
// function that registers its external references.
NODE_BINDING_CONTEXT_AWARE_INTERNAL(string_decoder,
                                    node::InitializeStringDecoder)
NODE_BINDING_EXTERNAL_REFERENCE(string_decoder,
                                node::RegisterStringDecoderExternalReferences)
|