[ruby/prism] Add macRomania encoding
https://github.com/ruby/prism/commit/bb73801cf4
Parent commit: 6c2defdfaa
This commit: 4b2915f0b9
@ -191,6 +191,7 @@ extern pm_encoding_t pm_encoding_iso_8859_15;
|
||||
/* Encoding descriptors visible in this hunk; pm_encoding_mac_romania is the
 * declaration this commit adds. */
extern pm_encoding_t pm_encoding_iso_8859_16;
extern pm_encoding_t pm_encoding_koi8_r;
extern pm_encoding_t pm_encoding_mac_iceland;
extern pm_encoding_t pm_encoding_mac_romania;
extern pm_encoding_t pm_encoding_shift_jis;
extern pm_encoding_t pm_encoding_utf_8;
extern pm_encoding_t pm_encoding_utf8_mac;
|
||||
|
@ -744,6 +744,30 @@ static uint8_t pm_encoding_mac_iceland_table[256] = {
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Fx
|
||||
};
|
||||
|
||||
/**
 * Each element of the following table contains a bitfield that indicates a
 * piece of information about the corresponding macRomania character.
 *
 * Only ASCII bytes carry flags in this table: the digits 0x30-0x39 hold 2, the
 * uppercase letters 0x41-0x5A hold 7 and the lowercase letters 0x61-0x7A hold
 * 3. Every other byte, including the whole 0x80-0xFF half, is 0.
 *
 * NOTE(review): the values suggest bit 0 = alphabetic, bit 1 = alphanumeric
 * and bit 2 = uppercase; confirm against the bit definitions used by the
 * PRISM_ENCODING_TABLE macro.
 */
static uint8_t pm_encoding_mac_romania_table[256] = {
//  0  1  2  3  4  5  6  7  8  9  A  B  C  D  E  F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2x
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, // 3x
    0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // 4x
    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, // 5x
    0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 6x
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, // 7x
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8x
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9x
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Ax
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Bx
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Cx
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Dx
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Ex
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Fx
};
|
||||
|
||||
/**
|
||||
* Each element of the following table contains a bitfield that indicates a
|
||||
* piece of information about the corresponding windows-1250 character.
|
||||
@ -1055,6 +1079,7 @@ PRISM_ENCODING_TABLE(iso_8859_15)
|
||||
PRISM_ENCODING_TABLE(iso_8859_16)
|
||||
PRISM_ENCODING_TABLE(koi8_r)
|
||||
PRISM_ENCODING_TABLE(mac_iceland)
|
||||
PRISM_ENCODING_TABLE(mac_romania)
|
||||
PRISM_ENCODING_TABLE(windows_1250)
|
||||
PRISM_ENCODING_TABLE(windows_1251)
|
||||
PRISM_ENCODING_TABLE(windows_1252)
|
||||
@ -1387,6 +1412,16 @@ pm_encoding_t pm_encoding_mac_iceland = {
|
||||
.multibyte = false
|
||||
};
|
||||
|
||||
/** macRomania */
|
||||
pm_encoding_t pm_encoding_mac_romania = {
|
||||
.name = "macRomania",
|
||||
.char_width = pm_encoding_single_char_width,
|
||||
.alnum_char = pm_encoding_mac_romania_alnum_char,
|
||||
.alpha_char = pm_encoding_mac_romania_alpha_char,
|
||||
.isupper_char = pm_encoding_mac_romania_isupper_char,
|
||||
.multibyte = false
|
||||
};
|
||||
|
||||
/** Windows-1250 */
|
||||
pm_encoding_t pm_encoding_windows_1250 = {
|
||||
.name = "Windows-1250",
|
||||
|
@ -6136,6 +6136,7 @@ parser_lex_magic_comment_encoding_value(pm_parser_t *parser, const uint8_t *star
|
||||
break;
|
||||
case 'M': case 'm':
|
||||
ENCODING1("macIceland", pm_encoding_mac_iceland);
|
||||
ENCODING1("macRomania", pm_encoding_mac_romania);
|
||||
break;
|
||||
case 'P': case 'p':
|
||||
ENCODING1("PCK", pm_encoding_windows_31j);
|
||||
|
@ -4,65 +4,71 @@ require_relative "test_helper"
|
||||
|
||||
module Prism
|
||||
class EncodingTest < TestCase
|
||||
[
|
||||
Encoding::ASCII,
|
||||
Encoding::ASCII_8BIT,
|
||||
Encoding::Big5,
|
||||
Encoding::CP51932,
|
||||
Encoding::CP850,
|
||||
Encoding::CP852,
|
||||
Encoding::CP855,
|
||||
Encoding::EUC_JP,
|
||||
Encoding::GBK,
|
||||
Encoding::IBM437,
|
||||
Encoding::IBM720,
|
||||
Encoding::IBM737,
|
||||
Encoding::IBM775,
|
||||
Encoding::IBM852,
|
||||
Encoding::IBM855,
|
||||
Encoding::IBM857,
|
||||
Encoding::IBM860,
|
||||
Encoding::IBM861,
|
||||
Encoding::IBM862,
|
||||
Encoding::ISO_8859_1,
|
||||
Encoding::ISO_8859_2,
|
||||
Encoding::ISO_8859_3,
|
||||
Encoding::ISO_8859_4,
|
||||
Encoding::ISO_8859_5,
|
||||
Encoding::ISO_8859_6,
|
||||
Encoding::ISO_8859_7,
|
||||
Encoding::ISO_8859_8,
|
||||
Encoding::ISO_8859_9,
|
||||
Encoding::ISO_8859_10,
|
||||
Encoding::ISO_8859_11,
|
||||
Encoding::ISO_8859_13,
|
||||
Encoding::ISO_8859_14,
|
||||
Encoding::ISO_8859_15,
|
||||
Encoding::ISO_8859_16,
|
||||
Encoding::KOI8_R,
|
||||
Encoding::Shift_JIS,
|
||||
Encoding::UTF_8,
|
||||
Encoding::UTF8_MAC,
|
||||
Encoding::Windows_1250,
|
||||
Encoding::Windows_1251,
|
||||
Encoding::Windows_1252,
|
||||
Encoding::Windows_1253,
|
||||
Encoding::Windows_1254,
|
||||
Encoding::Windows_1255,
|
||||
Encoding::Windows_1256,
|
||||
Encoding::Windows_1257,
|
||||
Encoding::Windows_1258,
|
||||
Encoding::Windows_31J
|
||||
].each do |encoding|
|
||||
encoding.names.each do |name|
|
||||
# Even though UTF-8-MAC is an alias for UTF8-MAC, CRuby treats it as
|
||||
# UTF-8. So we'll skip this test.
|
||||
next if name == "UTF-8-MAC"
|
||||
# Maps each encoding under test to the range of codepoints that is exercised
# for it. The first group is checked over 0x00...0x100, the second group over
# 0x00...0x10000.
encodings = {
  Encoding::ASCII => 0x00...0x100,
  Encoding::ASCII_8BIT => 0x00...0x100,
  Encoding::CP850 => 0x00...0x100,
  Encoding::CP852 => 0x00...0x100,
  Encoding::CP855 => 0x00...0x100,
  Encoding::IBM437 => 0x00...0x100,
  Encoding::IBM720 => 0x00...0x100,
  Encoding::IBM737 => 0x00...0x100,
  Encoding::IBM775 => 0x00...0x100,
  Encoding::IBM852 => 0x00...0x100,
  Encoding::IBM855 => 0x00...0x100,
  Encoding::IBM857 => 0x00...0x100,
  Encoding::IBM860 => 0x00...0x100,
  Encoding::IBM861 => 0x00...0x100,
  Encoding::IBM862 => 0x00...0x100,
  Encoding::ISO_8859_1 => 0x00...0x100,
  Encoding::ISO_8859_2 => 0x00...0x100,
  Encoding::ISO_8859_3 => 0x00...0x100,
  Encoding::ISO_8859_4 => 0x00...0x100,
  Encoding::ISO_8859_5 => 0x00...0x100,
  Encoding::ISO_8859_6 => 0x00...0x100,
  Encoding::ISO_8859_7 => 0x00...0x100,
  Encoding::ISO_8859_8 => 0x00...0x100,
  Encoding::ISO_8859_9 => 0x00...0x100,
  Encoding::ISO_8859_10 => 0x00...0x100,
  Encoding::ISO_8859_11 => 0x00...0x100,
  Encoding::ISO_8859_13 => 0x00...0x100,
  Encoding::ISO_8859_14 => 0x00...0x100,
  Encoding::ISO_8859_15 => 0x00...0x100,
  Encoding::ISO_8859_16 => 0x00...0x100,
  Encoding::KOI8_R => 0x00...0x100,
  Encoding::MACICELAND => 0x00...0x100,
  Encoding::MACROMANIA => 0x00...0x100,
  Encoding::Windows_1250 => 0x00...0x100,
  Encoding::Windows_1251 => 0x00...0x100,
  Encoding::Windows_1252 => 0x00...0x100,
  Encoding::Windows_1253 => 0x00...0x100,
  Encoding::Windows_1254 => 0x00...0x100,
  Encoding::Windows_1255 => 0x00...0x100,
  Encoding::Windows_1256 => 0x00...0x100,
  Encoding::Windows_1257 => 0x00...0x100,
  Encoding::Windows_1258 => 0x00...0x100,
  Encoding::Big5 => 0x00...0x10000,
  Encoding::CP51932 => 0x00...0x10000,
  Encoding::GBK => 0x00...0x10000,
  Encoding::Shift_JIS => 0x00...0x10000,
  Encoding::Windows_31J => 0x00...0x10000
}
|
||||
|
||||
define_method "test_encoding_#{name}" do
|
||||
result = Prism.parse("# encoding: #{name}\n'string'")
|
||||
actual = result.value.statements.body.first.unescaped.encoding
|
||||
assert_equal encoding, actual
|
||||
# By default we don't test every codepoint in these encodings because they
# are 3 and 4 byte representations so it can drastically slow down the test
# suite. Setting the PRISM_TEST_ALL_ENCODINGS environment variable opts in.
if ENV["PRISM_TEST_ALL_ENCODINGS"]
  encodings.merge!(
    Encoding::EUC_JP => 0x00...0x1000000,
    Encoding::UTF_8 => 0x00...0x110000,
    Encoding::UTF8_MAC => 0x00...0x110000
  )
end
|
||||
|
||||
# Define one test method per name (aliases included) of every encoding in the
# table. Each generated test checks the encoding's codepoint range through
# assert_encoding, using that particular name in the magic comment.
encodings.each do |encoding, range|
  encoding.names.each do |name|
    define_method(:"test_encoding_#{name}") do
      assert_encoding(encoding, name, range)
    end
  end
end
|
||||
@ -124,5 +130,95 @@ module Prism
|
||||
assert_equal (+"ア").force_encoding(Encoding::SHIFT_JIS), slice
|
||||
assert_equal Encoding::SHIFT_JIS, slice.encoding
|
||||
end
|
||||
|
||||
private
|
||||
|
||||
# Evaluation context in which an unresolved constant reference evaluates to
# the constant's own name instead of raising NameError.
class ConstantContext < BasicObject
  # Hook Ruby calls when a constant lookup fails in this class.
  #
  # @param const [Symbol] the name of the constant that could not be found
  # @return [Symbol] the same name, unchanged
  def self.const_missing(const)
    const
  end
end
|
||||
|
||||
# Build a fresh ConstantContext to evaluate source in.
#
# @return [ConstantContext] a new context instance
def constant_context
  ConstantContext.new
end
|
||||
|
||||
# Evaluation context in which a call to any undefined method evaluates to the
# method's own name instead of raising NoMethodError.
class IdentifierContext < BasicObject
  # Hook Ruby calls when a method cannot be found on the receiver. Any
  # arguments passed to the call are ignored.
  #
  # @param name [Symbol] the name of the method that was called
  # @return [Symbol] the same name, unchanged
  def method_missing(name, *)
    name
  end
end
|
||||
|
||||
# Build a fresh IdentifierContext to evaluate source in.
#
# @return [IdentifierContext] a new context instance
def identifier_context
  IdentifierContext.new
end
|
||||
|
||||
# Assert that Prism parses +character+ as a constant read when the source
# declares the encoding +name+ in a magic comment.
#
# @param name [String] encoding name written into the magic comment
# @param character [String] the text placed on the line after the comment
def assert_encoding_constant(name, character)
  source = "# encoding: #{name}\n#{character}"

  # Evaluating the source in a constant context yields the name of the
  # constant that was referenced, which is the name Prism must report.
  expected = constant_context.instance_eval(source)

  result = Prism.parse(source)
  assert result.success?

  actual = result.value.statements.body.last
  assert_kind_of ConstantReadNode, actual
  assert_equal expected, actual.name
end
|
||||
|
||||
# Assert that Prism parses +character+ as a method call when the source
# declares the encoding +name+ in a magic comment.
#
# @param name [String] encoding name written into the magic comment
# @param character [String] the text placed on the line after the comment
def assert_encoding_identifier(name, character)
  source = "# encoding: #{name}\n#{character}"

  # Evaluating the source in an identifier context yields the name of the
  # method that was called, which is the name Prism must report.
  expected = identifier_context.instance_eval(source)

  result = Prism.parse(source)
  assert result.success?

  actual = result.value.statements.body.last
  assert_kind_of CallNode, actual
  assert_equal expected, actual.name
end
|
||||
|
||||
# Check that we can properly parse every codepoint in the given encoding.
#
# @param encoding [Encoding] encoding used to turn each codepoint into a
#   character
# @param name [String] encoding name written into the magic comment
# @param range [Enumerable<Integer>] the codepoints to exercise
def assert_encoding(encoding, name, range)
  # I'm not entirely sure, but I believe these codepoints are incorrect in
  # their parsing in CRuby. They all report as matching `[[:lower:]]` but
  # then they are parsed as constants. This is because CRuby determines if
  # an identifier is a constant or not by case folding it down to lowercase
  # and checking if there is a difference. And even though they report
  # themselves as lowercase, their case fold is different. I have reported
  # this bug upstream.
  case encoding
  when Encoding::UTF_8, Encoding::UTF_8_MAC
    range = range.to_a - [
      0x01c5, 0x01c8, 0x01cb, 0x01f2, 0x1f88, 0x1f89, 0x1f8a, 0x1f8b,
      0x1f8c, 0x1f8d, 0x1f8e, 0x1f8f, 0x1f98, 0x1f99, 0x1f9a, 0x1f9b,
      0x1f9c, 0x1f9d, 0x1f9e, 0x1f9f, 0x1fa8, 0x1fa9, 0x1faa, 0x1fab,
      0x1fac, 0x1fad, 0x1fae, 0x1faf, 0x1fbc, 0x1fcc, 0x1ffc,
    ]
  when Encoding::Windows_1253
    range = range.to_a - [0xb5]
  end

  range.each do |codepoint|
    character = codepoint.chr(encoding)

    if character.match?(/[[:alpha:]]/)
      # Alphabetic characters must form a constant when uppercase and an
      # identifier otherwise.
      if character.match?(/[[:upper:]]/)
        assert_encoding_constant(name, character)
      else
        assert_encoding_identifier(name, character)
      end
    elsif character.match?(/[[:alnum:]]/)
      # Alphanumeric but not alphabetic: prefix with an underscore and expect
      # the pair to parse as an identifier.
      assert_encoding_identifier(name, "_#{character}")
    else
      # Everything else is placed inside a regular expression comment group.
      # "/" would end the regular expression literal and "{" would follow the
      # "#" to form "#{", so both are skipped.
      next if ["/", "{"].include?(character)

      source = "# encoding: #{name}\n/(?##{character})/\n"
      assert Prism.parse(source).success?
    end
  rescue RangeError
    # A RangeError raised while handling this codepoint (for example by
    # Integer#chr rejecting it for the encoding) leads here. A program made of
    # the magic comment followed by a bare \x escape must then fail to parse.
    source = "# encoding: #{name}\n\\x#{codepoint.to_s(16)}"
    refute Prism.parse(source).success?
  end
end
|
||||
end
|
||||
end
|
||||
|
Loading…
x
Reference in New Issue
Block a user