[ruby/prism] Symbol encoding follow-up
Ensure we don't accidentally parse the symbol encoding twice, and ensure we parse it in every circumstance we need to by requiring it as a parameter. https://github.com/ruby/prism/commit/9cea31c785
This commit is contained in:
parent
a120529020
commit
bcafd28a3e
@ -5475,46 +5475,40 @@ pm_super_node_create(pm_parser_t *parser, const pm_token_t *keyword, pm_argument
|
||||
* Read through the contents of a string and check if it consists solely of US ASCII code points.
|
||||
*/
|
||||
static bool
|
||||
ascii_only_p( const pm_string_t *contents) {
|
||||
const size_t length = contents->length;
|
||||
pm_ascii_only_p(const pm_string_t *contents) {
|
||||
const size_t length = pm_string_length(contents);
|
||||
const uint8_t *source = pm_string_source(contents);
|
||||
|
||||
for (size_t i = 0; i < length; i++) {
|
||||
if (contents->source[i] & 0x80) {
|
||||
return false;
|
||||
}
|
||||
for (size_t index = 0; index < length; index++) {
|
||||
if (source[index] & 0x80) return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Ruby "downgrades" the encoding of Symbols to US-ASCII if the associated encoding is ASCII-compatible and
|
||||
* the Symbol consists only of US-ASCII code points. Otherwise, the encoding may be explicitly set with an
|
||||
* escape sequence.
|
||||
* Ruby "downgrades" the encoding of Symbols to US-ASCII if the associated
|
||||
* encoding is ASCII-compatible and the Symbol consists only of US-ASCII code
|
||||
* points. Otherwise, the encoding may be explicitly set with an escape
|
||||
* sequence.
|
||||
*/
|
||||
static inline pm_node_flags_t
|
||||
parse_symbol_encoding(const pm_parser_t *parser, const pm_string_t *contents) {
|
||||
// Don't set any flags on the Symbol if it hasn't been populated yet.
|
||||
if (contents->source == NULL) {
|
||||
return 0;
|
||||
if (parser->explicit_encoding != NULL) {
|
||||
// A Symbol may optionally have its encoding explicitly set. This will
|
||||
// happen if an escape sequence results in a non-ASCII code point.
|
||||
if (parser->explicit_encoding == PM_ENCODING_UTF_8_ENTRY) {
|
||||
return PM_SYMBOL_FLAGS_FORCED_UTF8_ENCODING;
|
||||
} else if (parser->encoding == PM_ENCODING_US_ASCII_ENTRY) {
|
||||
return PM_SYMBOL_FLAGS_FORCED_BINARY_ENCODING;
|
||||
}
|
||||
} else if (pm_ascii_only_p(contents)) {
|
||||
// Ruby stipulates that all source files must use an ASCII-compatible
|
||||
// encoding. Thus, all symbols appearing in source are eligible for
|
||||
// "downgrading" to US-ASCII.
|
||||
return PM_SYMBOL_FLAGS_FORCED_US_ASCII_ENCODING;
|
||||
}
|
||||
|
||||
// Ruby stipulates that all source files must use an ASCII-compatible encoding. Thus, all symbols appearing in
|
||||
// source are eligible for "downgrading" to US-ASCII.
|
||||
if (ascii_only_p(contents)) {
|
||||
return PM_SYMBOL_FLAGS_FORCED_US_ASCII_ENCODING;
|
||||
} else {
|
||||
// A Symbol may optionally have its encoding explicitly set.
|
||||
//
|
||||
// NB: an explicitly set encoding is ignored by Ruby if the Symbol consists of only US ASCII code points.
|
||||
if (parser->explicit_encoding != NULL) {
|
||||
if (parser->explicit_encoding == PM_ENCODING_UTF_8_ENTRY) {
|
||||
return PM_SYMBOL_FLAGS_FORCED_UTF8_ENCODING;
|
||||
} else if (parser->encoding == PM_ENCODING_US_ASCII_ENTRY) {
|
||||
return PM_SYMBOL_FLAGS_FORCED_BINARY_ENCODING;
|
||||
}
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -5523,13 +5517,13 @@ parse_symbol_encoding(const pm_parser_t *parser, const pm_string_t *contents) {
|
||||
* string.
|
||||
*/
|
||||
static pm_symbol_node_t *
|
||||
pm_symbol_node_create_unescaped(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *value, const pm_token_t *closing, const pm_string_t *unescaped) {
|
||||
pm_symbol_node_create_unescaped(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *value, const pm_token_t *closing, const pm_string_t *unescaped, pm_node_flags_t flags) {
|
||||
pm_symbol_node_t *node = PM_ALLOC_NODE(parser, pm_symbol_node_t);
|
||||
|
||||
*node = (pm_symbol_node_t) {
|
||||
{
|
||||
.type = PM_SYMBOL_NODE,
|
||||
.flags = PM_NODE_FLAG_STATIC_LITERAL,
|
||||
.flags = PM_NODE_FLAG_STATIC_LITERAL | flags,
|
||||
.location = {
|
||||
.start = (opening->type == PM_TOKEN_NOT_PROVIDED ? value->start : opening->start),
|
||||
.end = (closing->type == PM_TOKEN_NOT_PROVIDED ? value->end : closing->end)
|
||||
@ -5541,8 +5535,6 @@ pm_symbol_node_create_unescaped(pm_parser_t *parser, const pm_token_t *opening,
|
||||
.unescaped = *unescaped
|
||||
};
|
||||
|
||||
pm_node_flag_set((pm_node_t *)node, parse_symbol_encoding(parser, unescaped));
|
||||
|
||||
return node;
|
||||
}
|
||||
|
||||
@ -5551,7 +5543,7 @@ pm_symbol_node_create_unescaped(pm_parser_t *parser, const pm_token_t *opening,
|
||||
*/
|
||||
static inline pm_symbol_node_t *
|
||||
pm_symbol_node_create(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *value, const pm_token_t *closing) {
|
||||
return pm_symbol_node_create_unescaped(parser, opening, value, closing, &PM_STRING_EMPTY);
|
||||
return pm_symbol_node_create_unescaped(parser, opening, value, closing, &PM_STRING_EMPTY, 0);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -5559,7 +5551,7 @@ pm_symbol_node_create(pm_parser_t *parser, const pm_token_t *opening, const pm_t
|
||||
*/
|
||||
static pm_symbol_node_t *
|
||||
pm_symbol_node_create_current_string(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *value, const pm_token_t *closing) {
|
||||
pm_symbol_node_t *node = pm_symbol_node_create_unescaped(parser, opening, value, closing, &parser->current_string);
|
||||
pm_symbol_node_t *node = pm_symbol_node_create_unescaped(parser, opening, value, closing, &parser->current_string, parse_symbol_encoding(parser, &parser->current_string));
|
||||
parser->current_string = PM_STRING_EMPTY;
|
||||
return node;
|
||||
}
|
||||
@ -5581,7 +5573,8 @@ pm_symbol_node_label_create(pm_parser_t *parser, const pm_token_t *token) {
|
||||
|
||||
assert((label.end - label.start) >= 0);
|
||||
pm_string_shared_init(&node->unescaped, label.start, label.end);
|
||||
pm_node_flag_set((pm_node_t *)node, parse_symbol_encoding(parser, &node->unescaped));
|
||||
pm_node_flag_set((pm_node_t *) node, parse_symbol_encoding(parser, &node->unescaped));
|
||||
|
||||
break;
|
||||
}
|
||||
case PM_TOKEN_MISSING: {
|
||||
@ -12644,12 +12637,15 @@ PM_STATIC_ASSERT(__LINE__, ((int) PM_STRING_FLAGS_FORCED_UTF8_ENCODING) == ((int
|
||||
static inline pm_node_flags_t
|
||||
parse_unescaped_encoding(const pm_parser_t *parser) {
|
||||
if (parser->explicit_encoding != NULL) {
|
||||
// If the there's an explicit encoding and it's using a UTF-8 escape sequence, then mark the string as UTF-8.
|
||||
if (parser->explicit_encoding == PM_ENCODING_UTF_8_ENTRY) {
|
||||
// If the there's an explicit encoding and it's using a UTF-8 escape
|
||||
// sequence, then mark the string as UTF-8.
|
||||
return PM_STRING_FLAGS_FORCED_UTF8_ENCODING;
|
||||
// If there's a non-UTF-8 escape sequence being used, then the string uses the source encoding, unless the source
|
||||
// is marked as US-ASCII. In that case the string is forced as ASCII-8BIT in order to keep the string valid.
|
||||
} else if (parser->encoding == PM_ENCODING_US_ASCII_ENTRY) {
|
||||
// If there's a non-UTF-8 escape sequence being used, then the
|
||||
// string uses the source encoding, unless the source is marked as
|
||||
// US-ASCII. In that case the string is forced as ASCII-8BIT in
|
||||
// order to keep the string valid.
|
||||
return PM_STRING_FLAGS_FORCED_BINARY_ENCODING;
|
||||
}
|
||||
}
|
||||
@ -12801,7 +12797,8 @@ parse_operator_symbol(pm_parser_t *parser, const pm_token_t *opening, pm_lex_sta
|
||||
parser_lex(parser);
|
||||
|
||||
pm_string_shared_init(&symbol->unescaped, parser->previous.start, end);
|
||||
pm_node_flag_set((pm_node_t *)symbol, parse_symbol_encoding(parser, &symbol->unescaped));
|
||||
pm_node_flag_set((pm_node_t *) symbol, PM_SYMBOL_FLAGS_FORCED_US_ASCII_ENCODING);
|
||||
|
||||
return (pm_node_t *) symbol;
|
||||
}
|
||||
|
||||
@ -12840,7 +12837,7 @@ parse_symbol(pm_parser_t *parser, pm_lex_mode_t *lex_mode, pm_lex_state_t next_s
|
||||
pm_symbol_node_t *symbol = pm_symbol_node_create(parser, &opening, &parser->previous, &closing);
|
||||
|
||||
pm_string_shared_init(&symbol->unescaped, parser->previous.start, parser->previous.end);
|
||||
pm_node_flag_set((pm_node_t *)symbol, parse_symbol_encoding(parser, &symbol->unescaped));
|
||||
pm_node_flag_set((pm_node_t *) symbol, parse_symbol_encoding(parser, &symbol->unescaped));
|
||||
|
||||
return (pm_node_t *) symbol;
|
||||
}
|
||||
@ -12927,7 +12924,6 @@ parse_symbol(pm_parser_t *parser, pm_lex_mode_t *lex_mode, pm_lex_state_t next_s
|
||||
} else {
|
||||
content = (pm_token_t) { .type = PM_TOKEN_STRING_CONTENT, .start = parser->previous.end, .end = parser->previous.end };
|
||||
pm_string_shared_init(&unescaped, content.start, content.end);
|
||||
|
||||
}
|
||||
|
||||
if (next_state != PM_LEX_STATE_NONE) {
|
||||
@ -12940,10 +12936,7 @@ parse_symbol(pm_parser_t *parser, pm_lex_mode_t *lex_mode, pm_lex_state_t next_s
|
||||
expect1(parser, PM_TOKEN_STRING_END, PM_ERR_SYMBOL_TERM_DYNAMIC);
|
||||
}
|
||||
|
||||
pm_symbol_node_t *symbol_node = pm_symbol_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped);
|
||||
pm_node_flag_set((pm_node_t *)symbol_node, parse_symbol_encoding(parser, &symbol_node->unescaped));
|
||||
|
||||
return (pm_node_t *) symbol_node;
|
||||
return (pm_node_t *) pm_symbol_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped, parse_symbol_encoding(parser, &unescaped));
|
||||
}
|
||||
|
||||
/**
|
||||
@ -12968,6 +12961,8 @@ parse_undef_argument(pm_parser_t *parser) {
|
||||
pm_symbol_node_t *symbol = pm_symbol_node_create(parser, &opening, &parser->previous, &closing);
|
||||
|
||||
pm_string_shared_init(&symbol->unescaped, parser->previous.start, parser->previous.end);
|
||||
pm_node_flag_set((pm_node_t *) symbol, parse_symbol_encoding(parser, &symbol->unescaped));
|
||||
|
||||
return (pm_node_t *) symbol;
|
||||
}
|
||||
case PM_TOKEN_SYMBOL_BEGIN: {
|
||||
@ -13007,7 +13002,8 @@ parse_alias_argument(pm_parser_t *parser, bool first) {
|
||||
pm_symbol_node_t *symbol = pm_symbol_node_create(parser, &opening, &parser->previous, &closing);
|
||||
|
||||
pm_string_shared_init(&symbol->unescaped, parser->previous.start, parser->previous.end);
|
||||
pm_node_flag_set((pm_node_t *)symbol, parse_symbol_encoding(parser, &symbol->unescaped));
|
||||
pm_node_flag_set((pm_node_t *) symbol, parse_symbol_encoding(parser, &symbol->unescaped));
|
||||
|
||||
return (pm_node_t *) symbol;
|
||||
}
|
||||
case PM_TOKEN_SYMBOL_BEGIN: {
|
||||
@ -14066,7 +14062,7 @@ parse_strings(pm_parser_t *parser, pm_node_t *current) {
|
||||
expect1(parser, PM_TOKEN_STRING_END, PM_ERR_STRING_LITERAL_TERM);
|
||||
node = (pm_node_t *) pm_interpolated_string_node_create(parser, &opening, &parts, &parser->previous);
|
||||
} else if (accept1(parser, PM_TOKEN_LABEL_END) && !state_is_arg_labeled) {
|
||||
node = (pm_node_t *) pm_symbol_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped);
|
||||
node = (pm_node_t *) pm_symbol_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped, parse_symbol_encoding(parser, &unescaped));
|
||||
} else if (match1(parser, PM_TOKEN_EOF)) {
|
||||
pm_parser_err_token(parser, &opening, PM_ERR_STRING_LITERAL_TERM);
|
||||
node = (pm_node_t *) pm_string_node_create_unescaped(parser, &opening, &content, &parser->current, &unescaped);
|
||||
@ -14088,7 +14084,7 @@ parse_strings(pm_parser_t *parser, pm_node_t *current) {
|
||||
pm_node_flag_set(node, parse_unescaped_encoding(parser));
|
||||
expect1(parser, PM_TOKEN_STRING_END, PM_ERR_STRING_LITERAL_TERM);
|
||||
} else if (accept1(parser, PM_TOKEN_LABEL_END)) {
|
||||
node = (pm_node_t *) pm_symbol_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped);
|
||||
node = (pm_node_t *) pm_symbol_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped, parse_symbol_encoding(parser, &unescaped));
|
||||
} else {
|
||||
// If we get here, then we have interpolation so we'll need
|
||||
// to create a string or symbol node with interpolation.
|
||||
|
@ -6,7 +6,7 @@
|
||||
├── @ UndefNode (location: (1,0)-(1,7))
|
||||
│ ├── names: (length: 1)
|
||||
│ │ └── @ SymbolNode (location: (1,6)-(1,7))
|
||||
│ │ ├── flags: ∅
|
||||
│ │ ├── flags: forced_us_ascii_encoding
|
||||
│ │ ├── opening_loc: ∅
|
||||
│ │ ├── value_loc: (1,6)-(1,7) = "a"
|
||||
│ │ ├── closing_loc: ∅
|
||||
@ -15,13 +15,13 @@
|
||||
├── @ UndefNode (location: (3,0)-(3,10))
|
||||
│ ├── names: (length: 2)
|
||||
│ │ ├── @ SymbolNode (location: (3,6)-(3,7))
|
||||
│ │ │ ├── flags: ∅
|
||||
│ │ │ ├── flags: forced_us_ascii_encoding
|
||||
│ │ │ ├── opening_loc: ∅
|
||||
│ │ │ ├── value_loc: (3,6)-(3,7) = "a"
|
||||
│ │ │ ├── closing_loc: ∅
|
||||
│ │ │ └── unescaped: "a"
|
||||
│ │ └── @ SymbolNode (location: (3,9)-(3,10))
|
||||
│ │ ├── flags: ∅
|
||||
│ │ ├── flags: forced_us_ascii_encoding
|
||||
│ │ ├── opening_loc: ∅
|
||||
│ │ ├── value_loc: (3,9)-(3,10) = "b"
|
||||
│ │ ├── closing_loc: ∅
|
||||
@ -30,7 +30,7 @@
|
||||
├── @ UndefNode (location: (5,0)-(5,8))
|
||||
│ ├── names: (length: 1)
|
||||
│ │ └── @ SymbolNode (location: (5,6)-(5,8))
|
||||
│ │ ├── flags: ∅
|
||||
│ │ ├── flags: forced_us_ascii_encoding
|
||||
│ │ ├── opening_loc: ∅
|
||||
│ │ ├── value_loc: (5,6)-(5,8) = "if"
|
||||
│ │ ├── closing_loc: ∅
|
||||
@ -108,7 +108,7 @@
|
||||
└── @ UndefNode (location: (17,0)-(17,14))
|
||||
├── names: (length: 1)
|
||||
│ └── @ SymbolNode (location: (17,6)-(17,14))
|
||||
│ ├── flags: ∅
|
||||
│ ├── flags: forced_us_ascii_encoding
|
||||
│ ├── opening_loc: ∅
|
||||
│ ├── value_loc: (17,6)-(17,14) = "Constant"
|
||||
│ ├── closing_loc: ∅
|
||||
|
@ -6,7 +6,7 @@
|
||||
├── @ UndefNode (location: (1,0)-(1,9))
|
||||
│ ├── names: (length: 1)
|
||||
│ │ └── @ SymbolNode (location: (1,6)-(1,9))
|
||||
│ │ ├── flags: ∅
|
||||
│ │ ├── flags: forced_us_ascii_encoding
|
||||
│ │ ├── opening_loc: ∅
|
||||
│ │ ├── value_loc: (1,6)-(1,9) = "foo"
|
||||
│ │ ├── closing_loc: ∅
|
||||
@ -15,13 +15,13 @@
|
||||
└── @ UndefNode (location: (2,0)-(2,14))
|
||||
├── names: (length: 2)
|
||||
│ ├── @ SymbolNode (location: (2,6)-(2,9))
|
||||
│ │ ├── flags: ∅
|
||||
│ │ ├── flags: forced_us_ascii_encoding
|
||||
│ │ ├── opening_loc: ∅
|
||||
│ │ ├── value_loc: (2,6)-(2,9) = "foo"
|
||||
│ │ ├── closing_loc: ∅
|
||||
│ │ └── unescaped: "foo"
|
||||
│ └── @ SymbolNode (location: (2,11)-(2,14))
|
||||
│ ├── flags: ∅
|
||||
│ ├── flags: forced_us_ascii_encoding
|
||||
│ ├── opening_loc: ∅
|
||||
│ ├── value_loc: (2,11)-(2,14) = "bar"
|
||||
│ ├── closing_loc: ∅
|
||||
|
@ -6,7 +6,7 @@
|
||||
└── @ UndefNode (location: (1,0)-(1,27))
|
||||
├── names: (length: 3)
|
||||
│ ├── @ SymbolNode (location: (1,6)-(1,9))
|
||||
│ │ ├── flags: ∅
|
||||
│ │ ├── flags: forced_us_ascii_encoding
|
||||
│ │ ├── opening_loc: ∅
|
||||
│ │ ├── value_loc: (1,6)-(1,9) = "foo"
|
||||
│ │ ├── closing_loc: ∅
|
||||
|
Loading…
x
Reference in New Issue
Block a user