diff --git a/prism/enc/pm_encoding.h b/prism/enc/pm_encoding.h index 6a0ba3b6c8..4a6a6a5142 100644 --- a/prism/enc/pm_encoding.h +++ b/prism/enc/pm_encoding.h @@ -191,6 +191,7 @@ extern pm_encoding_t pm_encoding_iso_8859_15; extern pm_encoding_t pm_encoding_iso_8859_16; extern pm_encoding_t pm_encoding_koi8_r; extern pm_encoding_t pm_encoding_mac_iceland; +extern pm_encoding_t pm_encoding_mac_romania; extern pm_encoding_t pm_encoding_shift_jis; extern pm_encoding_t pm_encoding_utf_8; extern pm_encoding_t pm_encoding_utf8_mac; diff --git a/prism/enc/pm_tables.c b/prism/enc/pm_tables.c index 02041c2587..ed4b219dbf 100644 --- a/prism/enc/pm_tables.c +++ b/prism/enc/pm_tables.c @@ -744,6 +744,30 @@ static uint8_t pm_encoding_mac_iceland_table[256] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Fx }; +/** + * Each element of the following table is a bitfield describing the macRomania + * byte at that index. Only ASCII digits (2), A-Z (7) and a-z (3) are flagged. + */ +static uint8_t pm_encoding_mac_romania_table[256] = { +// 0 1 2 3 4 5 6 7 8 9 A B C D E F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2x + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, // 3x + 0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // 4x + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, // 5x + 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 6x + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, // 7x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Ax + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Bx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Cx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Dx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Ex + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Fx +}; + /** * Each element of the 
following table contains a bitfield that indicates a * piece of information about the corresponding windows-1250 character. @@ -1055,6 +1079,7 @@ PRISM_ENCODING_TABLE(iso_8859_15) PRISM_ENCODING_TABLE(iso_8859_16) PRISM_ENCODING_TABLE(koi8_r) PRISM_ENCODING_TABLE(mac_iceland) +PRISM_ENCODING_TABLE(mac_romania) PRISM_ENCODING_TABLE(windows_1250) PRISM_ENCODING_TABLE(windows_1251) PRISM_ENCODING_TABLE(windows_1252) @@ -1387,6 +1412,16 @@ pm_encoding_t pm_encoding_mac_iceland = { .multibyte = false }; +/** macRomania (single-byte; classified via the pm_encoding_mac_romania_* callbacks) */ +pm_encoding_t pm_encoding_mac_romania = { + .name = "macRomania", + .char_width = pm_encoding_single_char_width, + .alnum_char = pm_encoding_mac_romania_alnum_char, + .alpha_char = pm_encoding_mac_romania_alpha_char, + .isupper_char = pm_encoding_mac_romania_isupper_char, + .multibyte = false +}; + /** Windows-1250 */ pm_encoding_t pm_encoding_windows_1250 = { .name = "Windows-1250", diff --git a/prism/prism.c b/prism/prism.c index 95201f5c34..882a268887 100644 --- a/prism/prism.c +++ b/prism/prism.c @@ -6136,6 +6136,7 @@ parser_lex_magic_comment_encoding_value(pm_parser_t *parser, const uint8_t *star break; case 'M': case 'm': ENCODING1("macIceland", pm_encoding_mac_iceland); + ENCODING1("macRomania", pm_encoding_mac_romania); break; case 'P': case 'p': ENCODING1("PCK", pm_encoding_windows_31j); diff --git a/test/prism/encoding_test.rb b/test/prism/encoding_test.rb index c445d023a3..c9a754d047 100644 --- a/test/prism/encoding_test.rb +++ b/test/prism/encoding_test.rb @@ -4,65 +4,71 @@ require_relative "test_helper" module Prism class EncodingTest < TestCase - [ - Encoding::ASCII, - Encoding::ASCII_8BIT, - Encoding::Big5, - Encoding::CP51932, - Encoding::CP850, - Encoding::CP852, - Encoding::CP855, - Encoding::EUC_JP, - Encoding::GBK, - Encoding::IBM437, - Encoding::IBM720, - Encoding::IBM737, - Encoding::IBM775, - Encoding::IBM852, - Encoding::IBM855, - Encoding::IBM857, - Encoding::IBM860, - Encoding::IBM861, - Encoding::IBM862, - 
Encoding::ISO_8859_1, - Encoding::ISO_8859_2, - Encoding::ISO_8859_3, - Encoding::ISO_8859_4, - Encoding::ISO_8859_5, - Encoding::ISO_8859_6, - Encoding::ISO_8859_7, - Encoding::ISO_8859_8, - Encoding::ISO_8859_9, - Encoding::ISO_8859_10, - Encoding::ISO_8859_11, - Encoding::ISO_8859_13, - Encoding::ISO_8859_14, - Encoding::ISO_8859_15, - Encoding::ISO_8859_16, - Encoding::KOI8_R, - Encoding::Shift_JIS, - Encoding::UTF_8, - Encoding::UTF8_MAC, - Encoding::Windows_1250, - Encoding::Windows_1251, - Encoding::Windows_1252, - Encoding::Windows_1253, - Encoding::Windows_1254, - Encoding::Windows_1255, - Encoding::Windows_1256, - Encoding::Windows_1257, - Encoding::Windows_1258, - Encoding::Windows_31J - ].each do |encoding| - encoding.names.each do |name| - # Even though UTF-8-MAC is an alias for UTF8-MAC, CRuby treats it as - # UTF-8. So we'll skip this test. - next if name == "UTF-8-MAC" + encodings = { + Encoding::ASCII => 0x00...0x100, + Encoding::ASCII_8BIT => 0x00...0x100, + Encoding::CP850 => 0x00...0x100, + Encoding::CP852 => 0x00...0x100, + Encoding::CP855 => 0x00...0x100, + Encoding::IBM437 => 0x00...0x100, + Encoding::IBM720 => 0x00...0x100, + Encoding::IBM737 => 0x00...0x100, + Encoding::IBM775 => 0x00...0x100, + Encoding::IBM852 => 0x00...0x100, + Encoding::IBM855 => 0x00...0x100, + Encoding::IBM857 => 0x00...0x100, + Encoding::IBM860 => 0x00...0x100, + Encoding::IBM861 => 0x00...0x100, + Encoding::IBM862 => 0x00...0x100, + Encoding::ISO_8859_1 => 0x00...0x100, + Encoding::ISO_8859_2 => 0x00...0x100, + Encoding::ISO_8859_3 => 0x00...0x100, + Encoding::ISO_8859_4 => 0x00...0x100, + Encoding::ISO_8859_5 => 0x00...0x100, + Encoding::ISO_8859_6 => 0x00...0x100, + Encoding::ISO_8859_7 => 0x00...0x100, + Encoding::ISO_8859_8 => 0x00...0x100, + Encoding::ISO_8859_9 => 0x00...0x100, + Encoding::ISO_8859_10 => 0x00...0x100, + Encoding::ISO_8859_11 => 0x00...0x100, + Encoding::ISO_8859_13 => 0x00...0x100, + Encoding::ISO_8859_14 => 0x00...0x100, + 
Encoding::ISO_8859_15 => 0x00...0x100, + Encoding::ISO_8859_16 => 0x00...0x100, + Encoding::KOI8_R => 0x00...0x100, + Encoding::MACICELAND => 0x00...0x100, + Encoding::MACROMANIA => 0x00...0x100, + Encoding::Windows_1250 => 0x00...0x100, + Encoding::Windows_1251 => 0x00...0x100, + Encoding::Windows_1252 => 0x00...0x100, + Encoding::Windows_1253 => 0x00...0x100, + Encoding::Windows_1254 => 0x00...0x100, + Encoding::Windows_1255 => 0x00...0x100, + Encoding::Windows_1256 => 0x00...0x100, + Encoding::Windows_1257 => 0x00...0x100, + Encoding::Windows_1258 => 0x00...0x100, + Encoding::Big5 => 0x00...0x10000, + Encoding::CP51932 => 0x00...0x10000, + Encoding::GBK => 0x00...0x10000, + Encoding::Shift_JIS => 0x00...0x10000, + Encoding::Windows_31J => 0x00...0x10000 + } - define_method "test_encoding_#{name}" do - result = Prism.parse("# encoding: #{name}\n'string'") - actual = result.value.statements.body.first.unescaped.encoding - assert_equal encoding, actual + # By default we don't test every codepoint in these encodings because they + # are 3 and 4 byte representations so it can drastically slow down the test + # suite. 
+ if ENV["PRISM_TEST_ALL_ENCODINGS"] + encodings.merge!( + Encoding::EUC_JP => 0x00...0x1000000, + Encoding::UTF_8 => 0x00...0x110000, + Encoding::UTF8_MAC => 0x00...0x110000 + ) + end + + encodings.each do |encoding, range| + encoding.names.each do |name| + define_method(:"test_encoding_#{name}") do + assert_encoding(encoding, name, range) end end end @@ -124,5 +130,95 @@ module Prism assert_equal (+"ア").force_encoding(Encoding::SHIFT_JIS), slice assert_equal Encoding::SHIFT_JIS, slice.encoding end + + private + + class ConstantContext < BasicObject + def self.const_missing(const) + const + end + end + + def constant_context + ConstantContext.new + end + + class IdentifierContext < BasicObject + def method_missing(name, *) + name + end + end + + def identifier_context + IdentifierContext.new + end + + def assert_encoding_constant(name, character) + source = "# encoding: #{name}\n#{character}" + expected = constant_context.instance_eval(source) + + result = Prism.parse(source) + assert result.success? + + actual = result.value.statements.body.last + assert_kind_of ConstantReadNode, actual + assert_equal expected, actual.name + end + + def assert_encoding_identifier(name, character) + source = "# encoding: #{name}\n#{character}" + expected = identifier_context.instance_eval(source) + + result = Prism.parse(source) + assert result.success? + + actual = result.value.statements.body.last + assert_kind_of CallNode, actual + assert_equal expected, actual.name + end + + # Check that we can properly parse every codepoint in the given encoding. + def assert_encoding(encoding, name, range) + # I'm not entirely sure, but I believe these codepoints are incorrect in + # their parsing in CRuby. They all report as matching `[[:lower:]]` but + # then they are parsed as constants. This is because CRuby determines if + # an identifier is a constant or not by case folding it down to lowercase + # and checking if there is a difference. 
And even though they report + # themselves as lowercase, their case fold is different. I have reported + # this bug upstream. + case encoding + when Encoding::UTF_8, Encoding::UTF_8_MAC + range = range.to_a - [ + 0x01c5, 0x01c8, 0x01cb, 0x01f2, 0x1f88, 0x1f89, 0x1f8a, 0x1f8b, + 0x1f8c, 0x1f8d, 0x1f8e, 0x1f8f, 0x1f98, 0x1f99, 0x1f9a, 0x1f9b, + 0x1f9c, 0x1f9d, 0x1f9e, 0x1f9f, 0x1fa8, 0x1fa9, 0x1faa, 0x1fab, + 0x1fac, 0x1fad, 0x1fae, 0x1faf, 0x1fbc, 0x1fcc, 0x1ffc, + ] + when Encoding::Windows_1253 + range = range.to_a - [0xb5] + end + + range.each do |codepoint| + character = codepoint.chr(encoding) + + if character.match?(/[[:alpha:]]/) + if character.match?(/[[:upper:]]/) + assert_encoding_constant(name, character) + else + assert_encoding_identifier(name, character) + end + elsif character.match?(/[[:alnum:]]/) + assert_encoding_identifier(name, "_#{character}") + else + next if ["/", "{"].include?(character) + + source = "# encoding: #{name}\n/(?##{character})/\n" + assert Prism.parse(source).success? + end + rescue RangeError + source = "# encoding: #{name}\n\\x#{codepoint.to_s(16)}" + refute Prism.parse(source).success? + end + end end end