[ruby/prism] Do not expose encodings that do not need to be exposed

https://github.com/ruby/prism/commit/c52c7f37ea
This commit is contained in:
Kevin Newton 2023-11-30 12:00:44 -05:00
parent ea409958b3
commit 6b234c1acf
3 changed files with 310 additions and 374 deletions

View File

@ -4212,7 +4212,7 @@ pm_encoding_shift_jis_char_width(const uint8_t *b, ptrdiff_t n) {
}
/** US-ASCII encoding */
pm_encoding_t pm_encoding_ascii = {
static pm_encoding_t pm_encoding_ascii = {
.name = "US-ASCII",
.char_width = pm_encoding_ascii_char_width,
.alnum_char = pm_encoding_ascii_alnum_char,
@ -4222,7 +4222,7 @@ pm_encoding_t pm_encoding_ascii = {
};
/** ASCII-8BIT encoding */
pm_encoding_t pm_encoding_ascii_8bit = {
static pm_encoding_t pm_encoding_ascii_8bit = {
.name = "ASCII-8BIT",
.char_width = pm_encoding_single_char_width,
.alnum_char = pm_encoding_ascii_alnum_char,
@ -4232,7 +4232,7 @@ pm_encoding_t pm_encoding_ascii_8bit = {
};
/** CP850 */
pm_encoding_t pm_encoding_cp850 = {
static pm_encoding_t pm_encoding_cp850 = {
.name = "CP850",
.char_width = pm_encoding_single_char_width,
.alnum_char = pm_encoding_cp850_alnum_char,
@ -4242,7 +4242,7 @@ pm_encoding_t pm_encoding_cp850 = {
};
/** CP852 */
pm_encoding_t pm_encoding_cp852 = {
static pm_encoding_t pm_encoding_cp852 = {
.name = "CP852",
.char_width = pm_encoding_single_char_width,
.alnum_char = pm_encoding_cp852_alnum_char,
@ -4252,7 +4252,7 @@ pm_encoding_t pm_encoding_cp852 = {
};
/** CP855 */
pm_encoding_t pm_encoding_cp855 = {
static pm_encoding_t pm_encoding_cp855 = {
.name = "CP855",
.char_width = pm_encoding_single_char_width,
.alnum_char = pm_encoding_cp855_alnum_char,
@ -4262,7 +4262,7 @@ pm_encoding_t pm_encoding_cp855 = {
};
/** GB1988 */
pm_encoding_t pm_encoding_gb1988 = {
static pm_encoding_t pm_encoding_gb1988 = {
.name = "GB1988",
.char_width = pm_encoding_single_char_width,
.alnum_char = pm_encoding_gb1988_alnum_char,
@ -4272,7 +4272,7 @@ pm_encoding_t pm_encoding_gb1988 = {
};
/** IBM437 */
pm_encoding_t pm_encoding_ibm437 = {
static pm_encoding_t pm_encoding_ibm437 = {
.name = "IBM437",
.char_width = pm_encoding_single_char_width,
.alnum_char = pm_encoding_ibm437_alnum_char,
@ -4282,7 +4282,7 @@ pm_encoding_t pm_encoding_ibm437 = {
};
/** IBM720 */
pm_encoding_t pm_encoding_ibm720 = {
static pm_encoding_t pm_encoding_ibm720 = {
.name = "IBM720",
.char_width = pm_encoding_single_char_width,
.alnum_char = pm_encoding_ibm720_alnum_char,
@ -4292,7 +4292,7 @@ pm_encoding_t pm_encoding_ibm720 = {
};
/** IBM737 */
pm_encoding_t pm_encoding_ibm737 = {
static pm_encoding_t pm_encoding_ibm737 = {
.name = "IBM737",
.char_width = pm_encoding_single_char_width,
.alnum_char = pm_encoding_ibm737_alnum_char,
@ -4302,7 +4302,7 @@ pm_encoding_t pm_encoding_ibm737 = {
};
/** IBM775 */
pm_encoding_t pm_encoding_ibm775 = {
static pm_encoding_t pm_encoding_ibm775 = {
.name = "IBM775",
.char_width = pm_encoding_single_char_width,
.alnum_char = pm_encoding_ibm775_alnum_char,
@ -4312,7 +4312,7 @@ pm_encoding_t pm_encoding_ibm775 = {
};
/** IBM852 */
pm_encoding_t pm_encoding_ibm852 = {
static pm_encoding_t pm_encoding_ibm852 = {
.name = "IBM852",
.char_width = pm_encoding_single_char_width,
.alnum_char = pm_encoding_ibm852_alnum_char,
@ -4322,7 +4322,7 @@ pm_encoding_t pm_encoding_ibm852 = {
};
/** IBM855 */
pm_encoding_t pm_encoding_ibm855 = {
static pm_encoding_t pm_encoding_ibm855 = {
.name = "IBM855",
.char_width = pm_encoding_single_char_width,
.alnum_char = pm_encoding_ibm855_alnum_char,
@ -4332,7 +4332,7 @@ pm_encoding_t pm_encoding_ibm855 = {
};
/** IBM857 */
pm_encoding_t pm_encoding_ibm857 = {
static pm_encoding_t pm_encoding_ibm857 = {
.name = "IBM857",
.char_width = pm_encoding_single_char_width,
.alnum_char = pm_encoding_ibm857_alnum_char,
@ -4342,7 +4342,7 @@ pm_encoding_t pm_encoding_ibm857 = {
};
/** IBM860 */
pm_encoding_t pm_encoding_ibm860 = {
static pm_encoding_t pm_encoding_ibm860 = {
.name = "IBM860",
.char_width = pm_encoding_single_char_width,
.alnum_char = pm_encoding_ibm860_alnum_char,
@ -4352,7 +4352,7 @@ pm_encoding_t pm_encoding_ibm860 = {
};
/** IBM861 */
pm_encoding_t pm_encoding_ibm861 = {
static pm_encoding_t pm_encoding_ibm861 = {
.name = "IBM861",
.char_width = pm_encoding_single_char_width,
.alnum_char = pm_encoding_ibm861_alnum_char,
@ -4362,7 +4362,7 @@ pm_encoding_t pm_encoding_ibm861 = {
};
/** IBM862 */
pm_encoding_t pm_encoding_ibm862 = {
static pm_encoding_t pm_encoding_ibm862 = {
.name = "IBM862",
.char_width = pm_encoding_single_char_width,
.alnum_char = pm_encoding_ibm862_alnum_char,
@ -4372,7 +4372,7 @@ pm_encoding_t pm_encoding_ibm862 = {
};
/** IBM863 */
pm_encoding_t pm_encoding_ibm863 = {
static pm_encoding_t pm_encoding_ibm863 = {
.name = "IBM863",
.char_width = pm_encoding_single_char_width,
.alnum_char = pm_encoding_ibm863_alnum_char,
@ -4382,7 +4382,7 @@ pm_encoding_t pm_encoding_ibm863 = {
};
/** IBM864 */
pm_encoding_t pm_encoding_ibm864 = {
static pm_encoding_t pm_encoding_ibm864 = {
.name = "IBM864",
.char_width = pm_encoding_single_char_width,
.alnum_char = pm_encoding_ibm864_alnum_char,
@ -4392,7 +4392,7 @@ pm_encoding_t pm_encoding_ibm864 = {
};
/** IBM865 */
pm_encoding_t pm_encoding_ibm865 = {
static pm_encoding_t pm_encoding_ibm865 = {
.name = "IBM865",
.char_width = pm_encoding_single_char_width,
.alnum_char = pm_encoding_ibm865_alnum_char,
@ -4402,7 +4402,7 @@ pm_encoding_t pm_encoding_ibm865 = {
};
/** IBM866 */
pm_encoding_t pm_encoding_ibm866 = {
static pm_encoding_t pm_encoding_ibm866 = {
.name = "IBM866",
.char_width = pm_encoding_single_char_width,
.alnum_char = pm_encoding_ibm866_alnum_char,
@ -4412,7 +4412,7 @@ pm_encoding_t pm_encoding_ibm866 = {
};
/** IBM869 */
pm_encoding_t pm_encoding_ibm869 = {
static pm_encoding_t pm_encoding_ibm869 = {
.name = "IBM869",
.char_width = pm_encoding_single_char_width,
.alnum_char = pm_encoding_ibm869_alnum_char,
@ -4422,7 +4422,7 @@ pm_encoding_t pm_encoding_ibm869 = {
};
/** ISO-8859-1 */
pm_encoding_t pm_encoding_iso_8859_1 = {
static pm_encoding_t pm_encoding_iso_8859_1 = {
.name = "ISO-8859-1",
.char_width = pm_encoding_single_char_width,
.alnum_char = pm_encoding_iso_8859_1_alnum_char,
@ -4432,7 +4432,7 @@ pm_encoding_t pm_encoding_iso_8859_1 = {
};
/** ISO-8859-2 */
pm_encoding_t pm_encoding_iso_8859_2 = {
static pm_encoding_t pm_encoding_iso_8859_2 = {
.name = "ISO-8859-2",
.char_width = pm_encoding_single_char_width,
.alnum_char = pm_encoding_iso_8859_2_alnum_char,
@ -4442,7 +4442,7 @@ pm_encoding_t pm_encoding_iso_8859_2 = {
};
/** ISO-8859-3 */
pm_encoding_t pm_encoding_iso_8859_3 = {
static pm_encoding_t pm_encoding_iso_8859_3 = {
.name = "ISO-8859-3",
.char_width = pm_encoding_single_char_width,
.alnum_char = pm_encoding_iso_8859_3_alnum_char,
@ -4452,7 +4452,7 @@ pm_encoding_t pm_encoding_iso_8859_3 = {
};
/** ISO-8859-4 */
pm_encoding_t pm_encoding_iso_8859_4 = {
static pm_encoding_t pm_encoding_iso_8859_4 = {
.name = "ISO-8859-4",
.char_width = pm_encoding_single_char_width,
.alnum_char = pm_encoding_iso_8859_4_alnum_char,
@ -4462,7 +4462,7 @@ pm_encoding_t pm_encoding_iso_8859_4 = {
};
/** ISO-8859-5 */
pm_encoding_t pm_encoding_iso_8859_5 = {
static pm_encoding_t pm_encoding_iso_8859_5 = {
.name = "ISO-8859-5",
.char_width = pm_encoding_single_char_width,
.alnum_char = pm_encoding_iso_8859_5_alnum_char,
@ -4472,7 +4472,7 @@ pm_encoding_t pm_encoding_iso_8859_5 = {
};
/** ISO-8859-6 */
pm_encoding_t pm_encoding_iso_8859_6 = {
static pm_encoding_t pm_encoding_iso_8859_6 = {
.name = "ISO-8859-6",
.char_width = pm_encoding_single_char_width,
.alnum_char = pm_encoding_iso_8859_6_alnum_char,
@ -4482,7 +4482,7 @@ pm_encoding_t pm_encoding_iso_8859_6 = {
};
/** ISO-8859-7 */
pm_encoding_t pm_encoding_iso_8859_7 = {
static pm_encoding_t pm_encoding_iso_8859_7 = {
.name = "ISO-8859-7",
.char_width = pm_encoding_single_char_width,
.alnum_char = pm_encoding_iso_8859_7_alnum_char,
@ -4492,7 +4492,7 @@ pm_encoding_t pm_encoding_iso_8859_7 = {
};
/** ISO-8859-8 */
pm_encoding_t pm_encoding_iso_8859_8 = {
static pm_encoding_t pm_encoding_iso_8859_8 = {
.name = "ISO-8859-8",
.char_width = pm_encoding_single_char_width,
.alnum_char = pm_encoding_iso_8859_8_alnum_char,
@ -4502,7 +4502,7 @@ pm_encoding_t pm_encoding_iso_8859_8 = {
};
/** ISO-8859-9 */
pm_encoding_t pm_encoding_iso_8859_9 = {
static pm_encoding_t pm_encoding_iso_8859_9 = {
.name = "ISO-8859-9",
.char_width = pm_encoding_single_char_width,
.alnum_char = pm_encoding_iso_8859_9_alnum_char,
@ -4512,7 +4512,7 @@ pm_encoding_t pm_encoding_iso_8859_9 = {
};
/** ISO-8859-10 */
pm_encoding_t pm_encoding_iso_8859_10 = {
static pm_encoding_t pm_encoding_iso_8859_10 = {
.name = "ISO-8859-10",
.char_width = pm_encoding_single_char_width,
.alnum_char = pm_encoding_iso_8859_10_alnum_char,
@ -4522,7 +4522,7 @@ pm_encoding_t pm_encoding_iso_8859_10 = {
};
/** ISO-8859-11 */
pm_encoding_t pm_encoding_iso_8859_11 = {
static pm_encoding_t pm_encoding_iso_8859_11 = {
.name = "ISO-8859-11",
.char_width = pm_encoding_single_char_width,
.alnum_char = pm_encoding_iso_8859_11_alnum_char,
@ -4532,7 +4532,7 @@ pm_encoding_t pm_encoding_iso_8859_11 = {
};
/** ISO-8859-13 */
pm_encoding_t pm_encoding_iso_8859_13 = {
static pm_encoding_t pm_encoding_iso_8859_13 = {
.name = "ISO-8859-13",
.char_width = pm_encoding_single_char_width,
.alnum_char = pm_encoding_iso_8859_13_alnum_char,
@ -4542,7 +4542,7 @@ pm_encoding_t pm_encoding_iso_8859_13 = {
};
/** ISO-8859-14 */
pm_encoding_t pm_encoding_iso_8859_14 = {
static pm_encoding_t pm_encoding_iso_8859_14 = {
.name = "ISO-8859-14",
.char_width = pm_encoding_single_char_width,
.alnum_char = pm_encoding_iso_8859_14_alnum_char,
@ -4552,7 +4552,7 @@ pm_encoding_t pm_encoding_iso_8859_14 = {
};
/** ISO-8859-15 */
pm_encoding_t pm_encoding_iso_8859_15 = {
static pm_encoding_t pm_encoding_iso_8859_15 = {
.name = "ISO-8859-15",
.char_width = pm_encoding_single_char_width,
.alnum_char = pm_encoding_iso_8859_15_alnum_char,
@ -4562,7 +4562,7 @@ pm_encoding_t pm_encoding_iso_8859_15 = {
};
/** ISO-8859-16 */
pm_encoding_t pm_encoding_iso_8859_16 = {
static pm_encoding_t pm_encoding_iso_8859_16 = {
.name = "ISO-8859-16",
.char_width = pm_encoding_single_char_width,
.alnum_char = pm_encoding_iso_8859_16_alnum_char,
@ -4572,7 +4572,7 @@ pm_encoding_t pm_encoding_iso_8859_16 = {
};
/** KOI8-R */
pm_encoding_t pm_encoding_koi8_r = {
static pm_encoding_t pm_encoding_koi8_r = {
.name = "KOI8-R",
.char_width = pm_encoding_koi8_char_width,
.alnum_char = pm_encoding_koi8_r_alnum_char,
@ -4582,7 +4582,7 @@ pm_encoding_t pm_encoding_koi8_r = {
};
/** KOI8-U */
pm_encoding_t pm_encoding_koi8_u = {
static pm_encoding_t pm_encoding_koi8_u = {
.name = "KOI8-U",
.char_width = pm_encoding_koi8_char_width,
.alnum_char = pm_encoding_koi8_u_alnum_char,
@ -4592,7 +4592,7 @@ pm_encoding_t pm_encoding_koi8_u = {
};
/** macCentEuro */
pm_encoding_t pm_encoding_mac_cent_euro = {
static pm_encoding_t pm_encoding_mac_cent_euro = {
.name = "macCentEuro",
.char_width = pm_encoding_single_char_width,
.alnum_char = pm_encoding_mac_cent_euro_alnum_char,
@ -4602,7 +4602,7 @@ pm_encoding_t pm_encoding_mac_cent_euro = {
};
/** macCroatian */
pm_encoding_t pm_encoding_mac_croatian = {
static pm_encoding_t pm_encoding_mac_croatian = {
.name = "macCroatian",
.char_width = pm_encoding_single_char_width,
.alnum_char = pm_encoding_mac_croatian_alnum_char,
@ -4612,7 +4612,7 @@ pm_encoding_t pm_encoding_mac_croatian = {
};
/** macCyrillic */
pm_encoding_t pm_encoding_mac_cyrillic = {
static pm_encoding_t pm_encoding_mac_cyrillic = {
.name = "macCyrillic",
.char_width = pm_encoding_single_char_width,
.alnum_char = pm_encoding_mac_cyrillic_alnum_char,
@ -4622,7 +4622,7 @@ pm_encoding_t pm_encoding_mac_cyrillic = {
};
/** macGreek */
pm_encoding_t pm_encoding_mac_greek = {
static pm_encoding_t pm_encoding_mac_greek = {
.name = "macGreek",
.char_width = pm_encoding_single_char_width,
.alnum_char = pm_encoding_mac_greek_alnum_char,
@ -4632,7 +4632,7 @@ pm_encoding_t pm_encoding_mac_greek = {
};
/** macIceland */
pm_encoding_t pm_encoding_mac_iceland = {
static pm_encoding_t pm_encoding_mac_iceland = {
.name = "macIceland",
.char_width = pm_encoding_single_char_width,
.alnum_char = pm_encoding_mac_iceland_alnum_char,
@ -4642,7 +4642,7 @@ pm_encoding_t pm_encoding_mac_iceland = {
};
/** macRoman */
pm_encoding_t pm_encoding_mac_roman = {
static pm_encoding_t pm_encoding_mac_roman = {
.name = "macRoman",
.char_width = pm_encoding_single_char_width,
.alnum_char = pm_encoding_mac_roman_alnum_char,
@ -4652,7 +4652,7 @@ pm_encoding_t pm_encoding_mac_roman = {
};
/** macRomania */
pm_encoding_t pm_encoding_mac_romania = {
static pm_encoding_t pm_encoding_mac_romania = {
.name = "macRomania",
.char_width = pm_encoding_single_char_width,
.alnum_char = pm_encoding_mac_romania_alnum_char,
@ -4662,7 +4662,7 @@ pm_encoding_t pm_encoding_mac_romania = {
};
/** macThai */
pm_encoding_t pm_encoding_mac_thai = {
static pm_encoding_t pm_encoding_mac_thai = {
.name = "macThai",
.char_width = pm_encoding_single_char_width,
.alnum_char = pm_encoding_mac_thai_alnum_char,
@ -4672,7 +4672,7 @@ pm_encoding_t pm_encoding_mac_thai = {
};
/** macTurkish */
pm_encoding_t pm_encoding_mac_turkish = {
static pm_encoding_t pm_encoding_mac_turkish = {
.name = "macTurkish",
.char_width = pm_encoding_single_char_width,
.alnum_char = pm_encoding_mac_turkish_alnum_char,
@ -4682,7 +4682,7 @@ pm_encoding_t pm_encoding_mac_turkish = {
};
/** macUkraine */
pm_encoding_t pm_encoding_mac_ukraine = {
static pm_encoding_t pm_encoding_mac_ukraine = {
.name = "macUkraine",
.char_width = pm_encoding_single_char_width,
.alnum_char = pm_encoding_mac_ukraine_alnum_char,
@ -4692,7 +4692,7 @@ pm_encoding_t pm_encoding_mac_ukraine = {
};
/** TIS-620 */
pm_encoding_t pm_encoding_tis_620 = {
static pm_encoding_t pm_encoding_tis_620 = {
.name = "TIS-620",
.char_width = pm_encoding_single_char_width,
.alnum_char = pm_encoding_tis_620_alnum_char,
@ -4702,7 +4702,7 @@ pm_encoding_t pm_encoding_tis_620 = {
};
/** Windows-1250 */
pm_encoding_t pm_encoding_windows_1250 = {
static pm_encoding_t pm_encoding_windows_1250 = {
.name = "Windows-1250",
.char_width = pm_encoding_single_char_width,
.alnum_char = pm_encoding_windows_1250_alnum_char,
@ -4712,7 +4712,7 @@ pm_encoding_t pm_encoding_windows_1250 = {
};
/** Windows-1251 */
pm_encoding_t pm_encoding_windows_1251 = {
static pm_encoding_t pm_encoding_windows_1251 = {
.name = "Windows-1251",
.char_width = pm_encoding_single_char_width,
.alnum_char = pm_encoding_windows_1251_alnum_char,
@ -4722,7 +4722,7 @@ pm_encoding_t pm_encoding_windows_1251 = {
};
/** Windows-1252 */
pm_encoding_t pm_encoding_windows_1252 = {
static pm_encoding_t pm_encoding_windows_1252 = {
.name = "Windows-1252",
.char_width = pm_encoding_single_char_width,
.alnum_char = pm_encoding_windows_1252_alnum_char,
@ -4732,7 +4732,7 @@ pm_encoding_t pm_encoding_windows_1252 = {
};
/** Windows-1253 */
pm_encoding_t pm_encoding_windows_1253 = {
static pm_encoding_t pm_encoding_windows_1253 = {
.name = "Windows-1253",
.char_width = pm_encoding_single_char_width,
.alnum_char = pm_encoding_windows_1253_alnum_char,
@ -4742,7 +4742,7 @@ pm_encoding_t pm_encoding_windows_1253 = {
};
/** Windows-1254 */
pm_encoding_t pm_encoding_windows_1254 = {
static pm_encoding_t pm_encoding_windows_1254 = {
.name = "Windows-1254",
.char_width = pm_encoding_single_char_width,
.alnum_char = pm_encoding_windows_1254_alnum_char,
@ -4752,7 +4752,7 @@ pm_encoding_t pm_encoding_windows_1254 = {
};
/** Windows-1255 */
pm_encoding_t pm_encoding_windows_1255 = {
static pm_encoding_t pm_encoding_windows_1255 = {
.name = "Windows-1255",
.char_width = pm_encoding_single_char_width,
.alnum_char = pm_encoding_windows_1255_alnum_char,
@ -4762,7 +4762,7 @@ pm_encoding_t pm_encoding_windows_1255 = {
};
/** Windows-1256 */
pm_encoding_t pm_encoding_windows_1256 = {
static pm_encoding_t pm_encoding_windows_1256 = {
.name = "Windows-1256",
.char_width = pm_encoding_single_char_width,
.alnum_char = pm_encoding_windows_1256_alnum_char,
@ -4772,7 +4772,7 @@ pm_encoding_t pm_encoding_windows_1256 = {
};
/** Windows-1257 */
pm_encoding_t pm_encoding_windows_1257 = {
static pm_encoding_t pm_encoding_windows_1257 = {
.name = "Windows-1257",
.char_width = pm_encoding_single_char_width,
.alnum_char = pm_encoding_windows_1257_alnum_char,
@ -4782,7 +4782,7 @@ pm_encoding_t pm_encoding_windows_1257 = {
};
/** Windows-1258 */
pm_encoding_t pm_encoding_windows_1258 = {
static pm_encoding_t pm_encoding_windows_1258 = {
.name = "Windows-1258",
.char_width = pm_encoding_single_char_width,
.alnum_char = pm_encoding_windows_1258_alnum_char,
@ -4792,7 +4792,7 @@ pm_encoding_t pm_encoding_windows_1258 = {
};
/** Windows-874 */
pm_encoding_t pm_encoding_windows_874 = {
static pm_encoding_t pm_encoding_windows_874 = {
.name = "Windows-874",
.char_width = pm_encoding_single_char_width,
.alnum_char = pm_encoding_windows_874_alnum_char,
@ -4802,7 +4802,7 @@ pm_encoding_t pm_encoding_windows_874 = {
};
/** Big5 encoding */
pm_encoding_t pm_encoding_big5 = {
static pm_encoding_t pm_encoding_big5 = {
.name = "Big5",
.char_width = pm_encoding_big5_char_width,
.alnum_char = pm_encoding_ascii_alnum_char_7bit,
@ -4812,7 +4812,7 @@ pm_encoding_t pm_encoding_big5 = {
};
/** CP950 encoding */
pm_encoding_t pm_encoding_cp950 = {
static pm_encoding_t pm_encoding_cp950 = {
.name = "CP950",
.char_width = pm_encoding_big5_char_width,
.alnum_char = pm_encoding_ascii_alnum_char_7bit,
@ -4822,7 +4822,7 @@ pm_encoding_t pm_encoding_cp950 = {
};
/** Big5-HKSCS encoding */
pm_encoding_t pm_encoding_big5_hkscs = {
static pm_encoding_t pm_encoding_big5_hkscs = {
.name = "Big5-HKSCS",
.char_width = pm_encoding_big5_char_width,
.alnum_char = pm_encoding_ascii_alnum_char_7bit,
@ -4832,7 +4832,7 @@ pm_encoding_t pm_encoding_big5_hkscs = {
};
/** CP951 encoding */
pm_encoding_t pm_encoding_cp951 = {
static pm_encoding_t pm_encoding_cp951 = {
.name = "CP951",
.char_width = pm_encoding_big5_char_width,
.alnum_char = pm_encoding_ascii_alnum_char_7bit,
@ -4842,7 +4842,7 @@ pm_encoding_t pm_encoding_cp951 = {
};
/** Big5-UAO encoding */
pm_encoding_t pm_encoding_big5_uao = {
static pm_encoding_t pm_encoding_big5_uao = {
.name = "Big5-UAO",
.char_width = pm_encoding_big5_char_width,
.alnum_char = pm_encoding_ascii_alnum_char_7bit,
@ -4852,7 +4852,7 @@ pm_encoding_t pm_encoding_big5_uao = {
};
/** Emacs-Mule encoding */
pm_encoding_t pm_encoding_emacs_mule = {
static pm_encoding_t pm_encoding_emacs_mule = {
.name = "Emacs-Mule",
.char_width = pm_encoding_emacs_mule_char_width,
.alnum_char = pm_encoding_ascii_alnum_char_7bit,
@ -4862,7 +4862,7 @@ pm_encoding_t pm_encoding_emacs_mule = {
};
/** stateless-ISO-2022-JP encoding */
pm_encoding_t pm_encoding_stateless_iso_2022_jp = {
static pm_encoding_t pm_encoding_stateless_iso_2022_jp = {
.name = "stateless-ISO-2022-JP",
.char_width = pm_encoding_emacs_mule_char_width,
.alnum_char = pm_encoding_ascii_alnum_char_7bit,
@ -4872,7 +4872,7 @@ pm_encoding_t pm_encoding_stateless_iso_2022_jp = {
};
/** stateless-ISO-2022-JP-KDDI encoding */
pm_encoding_t pm_encoding_stateless_iso_2022_jp_kddi = {
static pm_encoding_t pm_encoding_stateless_iso_2022_jp_kddi = {
.name = "stateless-ISO-2022-JP-KDDI",
.char_width = pm_encoding_emacs_mule_char_width,
.alnum_char = pm_encoding_ascii_alnum_char_7bit,
@ -4882,7 +4882,7 @@ pm_encoding_t pm_encoding_stateless_iso_2022_jp_kddi = {
};
/** GB18030 encoding */
pm_encoding_t pm_encoding_gb18030 = {
static pm_encoding_t pm_encoding_gb18030 = {
.name = "GB18030",
.char_width = pm_encoding_gb18030_char_width,
.alnum_char = pm_encoding_ascii_alnum_char_7bit,
@ -4892,7 +4892,7 @@ pm_encoding_t pm_encoding_gb18030 = {
};
/** EUC-JP encoding */
pm_encoding_t pm_encoding_euc_jp = {
static pm_encoding_t pm_encoding_euc_jp = {
.name = "EUC-JP",
.char_width = pm_encoding_euc_jp_char_width,
.alnum_char = pm_encoding_ascii_alnum_char_7bit,
@ -4902,7 +4902,7 @@ pm_encoding_t pm_encoding_euc_jp = {
};
/** eucJP-ms encoding */
pm_encoding_t pm_encoding_euc_jp_ms = {
static pm_encoding_t pm_encoding_euc_jp_ms = {
.name = "eucJP-ms",
.char_width = pm_encoding_euc_jp_char_width,
.alnum_char = pm_encoding_ascii_alnum_char_7bit,
@ -4912,7 +4912,7 @@ pm_encoding_t pm_encoding_euc_jp_ms = {
};
/** EUC-JIS-2004 encoding */
pm_encoding_t pm_encoding_euc_jis_2004 = {
static pm_encoding_t pm_encoding_euc_jis_2004 = {
.name = "EUC-JIS-2004",
.char_width = pm_encoding_euc_jp_char_width,
.alnum_char = pm_encoding_ascii_alnum_char_7bit,
@ -4922,7 +4922,7 @@ pm_encoding_t pm_encoding_euc_jis_2004 = {
};
/** CP51932 encoding */
pm_encoding_t pm_encoding_cp51932 = {
static pm_encoding_t pm_encoding_cp51932 = {
.name = "CP51932",
.char_width = pm_encoding_euc_jp_char_width,
.alnum_char = pm_encoding_ascii_alnum_char_7bit,
@ -4932,7 +4932,7 @@ pm_encoding_t pm_encoding_cp51932 = {
};
/** EUC-KR encoding */
pm_encoding_t pm_encoding_euc_kr = {
static pm_encoding_t pm_encoding_euc_kr = {
.name = "EUC-KR",
.char_width = pm_encoding_euc_kr_char_width,
.alnum_char = pm_encoding_ascii_alnum_char_7bit,
@ -4942,7 +4942,7 @@ pm_encoding_t pm_encoding_euc_kr = {
};
/** GB2312 encoding */
pm_encoding_t pm_encoding_gb2312 = {
static pm_encoding_t pm_encoding_gb2312 = {
.name = "GB2312",
.char_width = pm_encoding_euc_kr_char_width,
.alnum_char = pm_encoding_ascii_alnum_char_7bit,
@ -4952,7 +4952,7 @@ pm_encoding_t pm_encoding_gb2312 = {
};
/** GB12345 encoding */
pm_encoding_t pm_encoding_gb12345 = {
static pm_encoding_t pm_encoding_gb12345 = {
.name = "GB12345",
.char_width = pm_encoding_euc_kr_char_width,
.alnum_char = pm_encoding_ascii_alnum_char_7bit,
@ -4962,7 +4962,7 @@ pm_encoding_t pm_encoding_gb12345 = {
};
/** EUC-TW encoding */
pm_encoding_t pm_encoding_euc_tw = {
static pm_encoding_t pm_encoding_euc_tw = {
.name = "EUC-TW",
.char_width = pm_encoding_euc_tw_char_width,
.alnum_char = pm_encoding_ascii_alnum_char_7bit,
@ -4972,7 +4972,7 @@ pm_encoding_t pm_encoding_euc_tw = {
};
/** GBK encoding */
pm_encoding_t pm_encoding_gbk = {
static pm_encoding_t pm_encoding_gbk = {
.name = "GBK",
.char_width = pm_encoding_gbk_char_width,
.alnum_char = pm_encoding_ascii_alnum_char_7bit,
@ -4982,7 +4982,7 @@ pm_encoding_t pm_encoding_gbk = {
};
/** CP949 encoding */
pm_encoding_t pm_encoding_cp949 = {
static pm_encoding_t pm_encoding_cp949 = {
.name = "CP949",
.char_width = pm_encoding_cp949_char_width,
.alnum_char = pm_encoding_ascii_alnum_char_7bit,
@ -4992,7 +4992,7 @@ pm_encoding_t pm_encoding_cp949 = {
};
/** Shift_JIS encoding */
pm_encoding_t pm_encoding_shift_jis = {
static pm_encoding_t pm_encoding_shift_jis = {
.name = "Shift_JIS",
.char_width = pm_encoding_shift_jis_char_width,
.alnum_char = pm_encoding_ascii_alnum_char_7bit,
@ -5002,7 +5002,7 @@ pm_encoding_t pm_encoding_shift_jis = {
};
/** SJIS-DoCoMo encoding */
pm_encoding_t pm_encoding_sjis_docomo = {
static pm_encoding_t pm_encoding_sjis_docomo = {
.name = "SJIS-DoCoMo",
.char_width = pm_encoding_shift_jis_char_width,
.alnum_char = pm_encoding_ascii_alnum_char_7bit,
@ -5012,7 +5012,7 @@ pm_encoding_t pm_encoding_sjis_docomo = {
};
/** SJIS-KDDI encoding */
pm_encoding_t pm_encoding_sjis_kddi = {
static pm_encoding_t pm_encoding_sjis_kddi = {
.name = "SJIS-KDDI",
.char_width = pm_encoding_shift_jis_char_width,
.alnum_char = pm_encoding_ascii_alnum_char_7bit,
@ -5022,7 +5022,7 @@ pm_encoding_t pm_encoding_sjis_kddi = {
};
/** SJIS-SoftBank encoding */
pm_encoding_t pm_encoding_sjis_softbank = {
static pm_encoding_t pm_encoding_sjis_softbank = {
.name = "SJIS-SoftBank",
.char_width = pm_encoding_shift_jis_char_width,
.alnum_char = pm_encoding_ascii_alnum_char_7bit,
@ -5032,7 +5032,7 @@ pm_encoding_t pm_encoding_sjis_softbank = {
};
/** MacJapanese encoding */
pm_encoding_t pm_encoding_mac_japanese = {
static pm_encoding_t pm_encoding_mac_japanese = {
.name = "MacJapanese",
.char_width = pm_encoding_shift_jis_char_width,
.alnum_char = pm_encoding_ascii_alnum_char_7bit,
@ -5042,7 +5042,7 @@ pm_encoding_t pm_encoding_mac_japanese = {
};
/** Windows-31J */
pm_encoding_t pm_encoding_windows_31j = {
static pm_encoding_t pm_encoding_windows_31j = {
.name = "Windows-31J",
.char_width = pm_encoding_shift_jis_char_width,
.alnum_char = pm_encoding_ascii_alnum_char_7bit,
@ -5062,7 +5062,7 @@ pm_encoding_t pm_encoding_utf_8 = {
};
/** UTF8-MAC */
pm_encoding_t pm_encoding_utf8_mac = {
static pm_encoding_t pm_encoding_utf8_mac = {
.name = "UTF8-MAC",
.char_width = pm_encoding_utf_8_char_width,
.alnum_char = pm_encoding_utf_8_alnum_char,
@ -5072,7 +5072,7 @@ pm_encoding_t pm_encoding_utf8_mac = {
};
/** UTF8-DoCoMo */
pm_encoding_t pm_encoding_utf8_docomo = {
static pm_encoding_t pm_encoding_utf8_docomo = {
.name = "UTF8-DoCoMo",
.char_width = pm_encoding_utf_8_char_width,
.alnum_char = pm_encoding_utf_8_alnum_char,
@ -5082,7 +5082,7 @@ pm_encoding_t pm_encoding_utf8_docomo = {
};
/** UTF8-KDDI */
pm_encoding_t pm_encoding_utf8_kddi = {
static pm_encoding_t pm_encoding_utf8_kddi = {
.name = "UTF8-KDDI",
.char_width = pm_encoding_utf_8_char_width,
.alnum_char = pm_encoding_utf_8_alnum_char,
@ -5092,7 +5092,7 @@ pm_encoding_t pm_encoding_utf8_kddi = {
};
/** UTF8-SoftBank */
pm_encoding_t pm_encoding_utf8_softbank = {
static pm_encoding_t pm_encoding_utf8_softbank = {
.name = "UTF8-SoftBank",
.char_width = pm_encoding_utf_8_char_width,
.alnum_char = pm_encoding_utf_8_alnum_char,
@ -5102,7 +5102,7 @@ pm_encoding_t pm_encoding_utf8_softbank = {
};
/** CESU-8 */
pm_encoding_t pm_encoding_cesu_8 = {
static pm_encoding_t pm_encoding_cesu_8 = {
.name = "CESU-8",
.char_width = pm_encoding_cesu_8_char_width,
.alnum_char = pm_encoding_cesu_8_alnum_char,
@ -5110,3 +5110,204 @@ pm_encoding_t pm_encoding_cesu_8 = {
.isupper_char = pm_encoding_cesu_8_isupper_char,
.multibyte = true
};
/**
* Parse the given name of an encoding and return a pointer to the corresponding
* encoding struct if one can be found, otherwise return NULL.
*/
pm_encoding_t *
pm_encoding_find(const uint8_t *start, const uint8_t *end) {
size_t width = (size_t) (end - start);
// First, we're going to check for UTF-8. This is the most common encoding.
// UTF-8 can contain extra information at the end about the platform it is
// encoded on, such as UTF-8-MAC or UTF-8-UNIX. We'll ignore those suffixes.
if ((start + 5 <= end) && (pm_strncasecmp(start, (const uint8_t *) "UTF-8", 5) == 0)) {
// We need to explicitly handle UTF-8-HFS, as that one needs to switch
// over to being UTF8-MAC.
if (width == 9 && (pm_strncasecmp(start + 5, (const uint8_t *) "-HFS", 4) == 0)) {
return &pm_encoding_utf8_mac;
}
// Otherwise we'll return the default UTF-8 encoding.
return &pm_encoding_utf_8;
}
// Next, we're going to loop through each of the encodings that we handle
// explicitly. If we found one that we understand, we'll use that value.
#define ENCODING1(value, prebuilt) if (width == sizeof(value) - 1 && pm_strncasecmp(start, (const uint8_t *) value, width) == 0) return &prebuilt;
// A convenience macros for comparing two aliases for the same encoding.
#define ENCODING2(value1, value2, prebuilt) ENCODING1(value1, prebuilt) ENCODING1(value2, prebuilt)
if (width >= 3) {
switch (*start) {
case 'A': case 'a':
ENCODING1("ASCII", pm_encoding_ascii);
ENCODING1("ASCII-8BIT", pm_encoding_ascii_8bit);
ENCODING1("ANSI_X3.4-1968", pm_encoding_ascii);
break;
case 'B': case 'b':
ENCODING1("BINARY", pm_encoding_ascii_8bit);
ENCODING1("Big5", pm_encoding_big5);
ENCODING2("Big5-HKSCS", "Big5-HKSCS:2008", pm_encoding_big5_hkscs);
ENCODING1("Big5-UAO", pm_encoding_big5_uao);
break;
case 'C': case 'c':
ENCODING1("CESU-8", pm_encoding_cesu_8);
ENCODING1("CP437", pm_encoding_ibm437);
ENCODING1("CP720", pm_encoding_ibm720);
ENCODING1("CP737", pm_encoding_ibm737);
ENCODING1("CP775", pm_encoding_ibm775);
ENCODING1("CP850", pm_encoding_cp850);
ENCODING1("CP852", pm_encoding_cp852);
ENCODING1("CP855", pm_encoding_cp855);
ENCODING1("CP857", pm_encoding_ibm857);
ENCODING1("CP860", pm_encoding_ibm860);
ENCODING1("CP861", pm_encoding_ibm861);
ENCODING1("CP862", pm_encoding_ibm862);
ENCODING1("CP864", pm_encoding_ibm864);
ENCODING1("CP865", pm_encoding_ibm865);
ENCODING1("CP866", pm_encoding_ibm866);
ENCODING1("CP869", pm_encoding_ibm869);
ENCODING1("CP874", pm_encoding_windows_874);
ENCODING1("CP878", pm_encoding_koi8_r);
ENCODING1("CP863", pm_encoding_ibm863);
ENCODING2("CP932", "csWindows31J", pm_encoding_windows_31j);
ENCODING1("CP936", pm_encoding_gbk);
ENCODING1("CP949", pm_encoding_cp949);
ENCODING1("CP950", pm_encoding_cp950);
ENCODING1("CP951", pm_encoding_cp951);
ENCODING1("CP1250", pm_encoding_windows_1250);
ENCODING1("CP1251", pm_encoding_windows_1251);
ENCODING1("CP1252", pm_encoding_windows_1252);
ENCODING1("CP1253", pm_encoding_windows_1253);
ENCODING1("CP1254", pm_encoding_windows_1254);
ENCODING1("CP1255", pm_encoding_windows_1255);
ENCODING1("CP1256", pm_encoding_windows_1256);
ENCODING1("CP1257", pm_encoding_windows_1257);
ENCODING1("CP1258", pm_encoding_windows_1258);
ENCODING1("CP51932", pm_encoding_cp51932);
ENCODING1("CP65001", pm_encoding_utf_8);
break;
case 'E': case 'e':
ENCODING2("EUC-JP", "eucJP", pm_encoding_euc_jp);
ENCODING2("eucJP-ms", "euc-jp-ms", pm_encoding_euc_jp_ms);
ENCODING2("EUC-JIS-2004", "EUC-JISX0213", pm_encoding_euc_jis_2004);
ENCODING2("EUC-KR", "eucKR", pm_encoding_euc_kr);
ENCODING2("EUC-CN", "eucCN", pm_encoding_gb2312);
ENCODING2("EUC-TW", "eucTW", pm_encoding_euc_tw);
ENCODING1("Emacs-Mule", pm_encoding_emacs_mule);
ENCODING1("external", pm_encoding_utf_8);
break;
case 'F': case 'f':
ENCODING1("filesystem", pm_encoding_utf_8);
break;
case 'G': case 'g':
ENCODING1("GBK", pm_encoding_gbk);
ENCODING1("GB12345", pm_encoding_gb12345);
ENCODING1("GB18030", pm_encoding_gb18030);
ENCODING1("GB1988", pm_encoding_gb1988);
ENCODING1("GB2312", pm_encoding_gb2312);
break;
case 'I': case 'i':
ENCODING1("IBM437", pm_encoding_ibm437);
ENCODING1("IBM720", pm_encoding_ibm720);
ENCODING1("IBM737", pm_encoding_ibm737);
ENCODING1("IBM775", pm_encoding_ibm775);
ENCODING1("IBM850", pm_encoding_cp850);
ENCODING1("IBM852", pm_encoding_ibm852);
ENCODING1("IBM855", pm_encoding_ibm855);
ENCODING1("IBM857", pm_encoding_ibm857);
ENCODING1("IBM860", pm_encoding_ibm860);
ENCODING1("IBM861", pm_encoding_ibm861);
ENCODING1("IBM862", pm_encoding_ibm862);
ENCODING1("IBM863", pm_encoding_ibm863);
ENCODING1("IBM864", pm_encoding_ibm864);
ENCODING1("IBM865", pm_encoding_ibm865);
ENCODING1("IBM866", pm_encoding_ibm866);
ENCODING1("IBM869", pm_encoding_ibm869);
ENCODING2("ISO-8859-1", "ISO8859-1", pm_encoding_iso_8859_1);
ENCODING2("ISO-8859-2", "ISO8859-2", pm_encoding_iso_8859_2);
ENCODING2("ISO-8859-3", "ISO8859-3", pm_encoding_iso_8859_3);
ENCODING2("ISO-8859-4", "ISO8859-4", pm_encoding_iso_8859_4);
ENCODING2("ISO-8859-5", "ISO8859-5", pm_encoding_iso_8859_5);
ENCODING2("ISO-8859-6", "ISO8859-6", pm_encoding_iso_8859_6);
ENCODING2("ISO-8859-7", "ISO8859-7", pm_encoding_iso_8859_7);
ENCODING2("ISO-8859-8", "ISO8859-8", pm_encoding_iso_8859_8);
ENCODING2("ISO-8859-9", "ISO8859-9", pm_encoding_iso_8859_9);
ENCODING2("ISO-8859-10", "ISO8859-10", pm_encoding_iso_8859_10);
ENCODING2("ISO-8859-11", "ISO8859-11", pm_encoding_iso_8859_11);
ENCODING2("ISO-8859-13", "ISO8859-13", pm_encoding_iso_8859_13);
ENCODING2("ISO-8859-14", "ISO8859-14", pm_encoding_iso_8859_14);
ENCODING2("ISO-8859-15", "ISO8859-15", pm_encoding_iso_8859_15);
ENCODING2("ISO-8859-16", "ISO8859-16", pm_encoding_iso_8859_16);
break;
case 'K': case 'k':
ENCODING1("KOI8-R", pm_encoding_koi8_r);
ENCODING1("KOI8-U", pm_encoding_koi8_u);
break;
case 'L': case 'l':
ENCODING1("locale", pm_encoding_utf_8);
break;
case 'M': case 'm':
ENCODING1("macCentEuro", pm_encoding_mac_cent_euro);
ENCODING1("macCroatian", pm_encoding_mac_croatian);
ENCODING1("macCyrillic", pm_encoding_mac_cyrillic);
ENCODING1("macGreek", pm_encoding_mac_greek);
ENCODING1("macIceland", pm_encoding_mac_iceland);
ENCODING1("MacJapanese", pm_encoding_mac_japanese);
ENCODING1("MacJapan", pm_encoding_mac_japanese);
ENCODING1("macRoman", pm_encoding_mac_roman);
ENCODING1("macRomania", pm_encoding_mac_romania);
ENCODING1("macThai", pm_encoding_mac_thai);
ENCODING1("macTurkish", pm_encoding_mac_turkish);
ENCODING1("macUkraine", pm_encoding_mac_ukraine);
break;
case 'P': case 'p':
ENCODING1("PCK", pm_encoding_windows_31j);
break;
case 'S': case 's':
ENCODING1("Shift_JIS", pm_encoding_shift_jis);
ENCODING1("SJIS", pm_encoding_windows_31j);
ENCODING1("SJIS-DoCoMo", pm_encoding_sjis_docomo);
ENCODING1("SJIS-KDDI", pm_encoding_sjis_kddi);
ENCODING1("SJIS-SoftBank", pm_encoding_sjis_softbank);
ENCODING1("stateless-ISO-2022-JP", pm_encoding_stateless_iso_2022_jp);
ENCODING1("stateless-ISO-2022-JP-KDDI", pm_encoding_stateless_iso_2022_jp_kddi);
break;
case 'T': case 't':
ENCODING1("TIS-620", pm_encoding_tis_620);
break;
case 'U': case 'u':
ENCODING1("US-ASCII", pm_encoding_ascii);
ENCODING2("UTF8-MAC", "UTF-8-HFS", pm_encoding_utf8_mac);
ENCODING1("UTF8-DoCoMo", pm_encoding_utf8_docomo);
ENCODING1("UTF8-KDDI", pm_encoding_utf8_kddi);
ENCODING1("UTF8-SoftBank", pm_encoding_utf8_softbank);
break;
case 'W': case 'w':
ENCODING1("Windows-31J", pm_encoding_windows_31j);
ENCODING1("Windows-874", pm_encoding_windows_874);
ENCODING1("Windows-1250", pm_encoding_windows_1250);
ENCODING1("Windows-1251", pm_encoding_windows_1251);
ENCODING1("Windows-1252", pm_encoding_windows_1252);
ENCODING1("Windows-1253", pm_encoding_windows_1253);
ENCODING1("Windows-1254", pm_encoding_windows_1254);
ENCODING1("Windows-1255", pm_encoding_windows_1255);
ENCODING1("Windows-1256", pm_encoding_windows_1256);
ENCODING1("Windows-1257", pm_encoding_windows_1257);
ENCODING1("Windows-1258", pm_encoding_windows_1258);
break;
case '6':
ENCODING1("646", pm_encoding_ascii);
break;
}
}
#undef ENCODING2
#undef ENCODING1
// If we didn't match any encodings, return NULL.
return NULL;
}

View File

@ -7,6 +7,7 @@
#define PRISM_ENCODING_H
#include "prism/defines.h"
#include "prism/util/pm_strncasecmp.h"
#include <assert.h>
#include <stdbool.h>
@ -119,98 +120,21 @@ bool pm_encoding_utf_8_isupper_char(const uint8_t *b, ptrdiff_t n);
*/
extern const uint8_t pm_encoding_unicode_table[256];
// Below are the encodings that are supported by the parser. They are defined in
// their own files in the src/enc directory.
extern pm_encoding_t pm_encoding_ascii;
extern pm_encoding_t pm_encoding_ascii_8bit;
extern pm_encoding_t pm_encoding_big5;
extern pm_encoding_t pm_encoding_big5_hkscs;
extern pm_encoding_t pm_encoding_big5_uao;
extern pm_encoding_t pm_encoding_cesu_8;
extern pm_encoding_t pm_encoding_cp51932;
extern pm_encoding_t pm_encoding_cp850;
extern pm_encoding_t pm_encoding_cp852;
extern pm_encoding_t pm_encoding_cp855;
extern pm_encoding_t pm_encoding_cp949;
extern pm_encoding_t pm_encoding_cp950;
extern pm_encoding_t pm_encoding_cp951;
extern pm_encoding_t pm_encoding_emacs_mule;
extern pm_encoding_t pm_encoding_euc_jp;
extern pm_encoding_t pm_encoding_euc_jp_ms;
extern pm_encoding_t pm_encoding_euc_jis_2004;
extern pm_encoding_t pm_encoding_euc_kr;
extern pm_encoding_t pm_encoding_euc_tw;
extern pm_encoding_t pm_encoding_gb12345;
extern pm_encoding_t pm_encoding_gb18030;
extern pm_encoding_t pm_encoding_gb1988;
extern pm_encoding_t pm_encoding_gb2312;
extern pm_encoding_t pm_encoding_gbk;
extern pm_encoding_t pm_encoding_ibm437;
extern pm_encoding_t pm_encoding_ibm720;
extern pm_encoding_t pm_encoding_ibm737;
extern pm_encoding_t pm_encoding_ibm775;
extern pm_encoding_t pm_encoding_ibm852;
extern pm_encoding_t pm_encoding_ibm855;
extern pm_encoding_t pm_encoding_ibm857;
extern pm_encoding_t pm_encoding_ibm860;
extern pm_encoding_t pm_encoding_ibm861;
extern pm_encoding_t pm_encoding_ibm862;
extern pm_encoding_t pm_encoding_ibm863;
extern pm_encoding_t pm_encoding_ibm864;
extern pm_encoding_t pm_encoding_ibm865;
extern pm_encoding_t pm_encoding_ibm866;
extern pm_encoding_t pm_encoding_ibm869;
extern pm_encoding_t pm_encoding_iso_8859_1;
extern pm_encoding_t pm_encoding_iso_8859_2;
extern pm_encoding_t pm_encoding_iso_8859_3;
extern pm_encoding_t pm_encoding_iso_8859_4;
extern pm_encoding_t pm_encoding_iso_8859_5;
extern pm_encoding_t pm_encoding_iso_8859_6;
extern pm_encoding_t pm_encoding_iso_8859_7;
extern pm_encoding_t pm_encoding_iso_8859_8;
extern pm_encoding_t pm_encoding_iso_8859_9;
extern pm_encoding_t pm_encoding_iso_8859_10;
extern pm_encoding_t pm_encoding_iso_8859_11;
extern pm_encoding_t pm_encoding_iso_8859_13;
extern pm_encoding_t pm_encoding_iso_8859_14;
extern pm_encoding_t pm_encoding_iso_8859_15;
extern pm_encoding_t pm_encoding_iso_8859_16;
extern pm_encoding_t pm_encoding_koi8_r;
extern pm_encoding_t pm_encoding_koi8_u;
extern pm_encoding_t pm_encoding_mac_cent_euro;
extern pm_encoding_t pm_encoding_mac_croatian;
extern pm_encoding_t pm_encoding_mac_cyrillic;
extern pm_encoding_t pm_encoding_mac_greek;
extern pm_encoding_t pm_encoding_mac_iceland;
extern pm_encoding_t pm_encoding_mac_japanese;
extern pm_encoding_t pm_encoding_mac_roman;
extern pm_encoding_t pm_encoding_mac_romania;
extern pm_encoding_t pm_encoding_mac_thai;
extern pm_encoding_t pm_encoding_mac_turkish;
extern pm_encoding_t pm_encoding_mac_ukraine;
extern pm_encoding_t pm_encoding_shift_jis;
extern pm_encoding_t pm_encoding_sjis_docomo;
extern pm_encoding_t pm_encoding_sjis_kddi;
extern pm_encoding_t pm_encoding_sjis_softbank;
extern pm_encoding_t pm_encoding_stateless_iso_2022_jp;
extern pm_encoding_t pm_encoding_stateless_iso_2022_jp_kddi;
extern pm_encoding_t pm_encoding_tis_620;
/**
* This is the default encoding for Ruby source files. We keep a specific
* visible pointer around to it so that prism.c can compare it against the
* default.
*/
extern pm_encoding_t pm_encoding_utf_8;
extern pm_encoding_t pm_encoding_utf8_mac;
extern pm_encoding_t pm_encoding_utf8_docomo;
extern pm_encoding_t pm_encoding_utf8_kddi;
extern pm_encoding_t pm_encoding_utf8_softbank;
extern pm_encoding_t pm_encoding_windows_1250;
extern pm_encoding_t pm_encoding_windows_1251;
extern pm_encoding_t pm_encoding_windows_1252;
extern pm_encoding_t pm_encoding_windows_1253;
extern pm_encoding_t pm_encoding_windows_1254;
extern pm_encoding_t pm_encoding_windows_1255;
extern pm_encoding_t pm_encoding_windows_1256;
extern pm_encoding_t pm_encoding_windows_1257;
extern pm_encoding_t pm_encoding_windows_1258;
extern pm_encoding_t pm_encoding_windows_31j;
extern pm_encoding_t pm_encoding_windows_874;
/**
* Parse the given name of an encoding and return a pointer to the corresponding
* encoding struct if one can be found, otherwise return NULL.
*
* @param start A pointer to the first byte of the name.
* @param end A pointer to one past the last byte of the name.
* @returns A pointer to the encoding it finds, otherwise NULL.
*/
pm_encoding_t * pm_encoding_find(const uint8_t *start, const uint8_t *end);
#endif

View File

@ -6153,207 +6153,18 @@ next_newline(const uint8_t *cursor, ptrdiff_t length) {
*/
static bool
parser_lex_magic_comment_encoding_value(pm_parser_t *parser, const uint8_t *start, const uint8_t *end) {
size_t width = (size_t) (end - start);
pm_encoding_t *encoding = pm_encoding_find(start, end);
// First, we're going to check for UTF-8. This is the most common encoding.
// utf-8 can contain extra information at the end about the platform it is
// encoded on, such as utf-8-mac or utf-8-unix. We'll ignore those suffixes.
if ((start + 5 <= end) && (pm_strncasecmp(start, (const uint8_t *) "utf-8", 5) == 0)) {
// We need to explicitly handle utf-8-hfs, as that one needs to switch
// over to being utf8-mac.
if (width == 9 && (pm_strncasecmp(start + 5, (const uint8_t *) "-hfs", 4) == 0)) {
parser->encoding = pm_encoding_utf8_mac;
if (encoding != NULL) {
if (encoding != &pm_encoding_utf_8) {
parser->encoding = *encoding;
parser->encoding_changed = true;
if (parser->encoding_changed_callback != NULL) parser->encoding_changed_callback(parser);
return true;
}
// We don't need to do anything here because the default encoding is
// already UTF-8. We'll just return.
return true;
}
// Next, we're going to loop through each of the encodings that we handle
// explicitly. If we found one that we understand, we'll use that value.
#define ENCODING1(value, prebuilt) \
if (width == sizeof(value) - 1 && start + width <= end && pm_strncasecmp(start, (const uint8_t *) value, width) == 0) { \
parser->encoding = prebuilt; \
parser->encoding_changed = true; \
if (parser->encoding_changed_callback != NULL) parser->encoding_changed_callback(parser); \
return true; \
}
// A convenience macro for comparing two aliases for the same encoding.
#define ENCODING2(value1, value2, prebuilt) ENCODING1(value1, prebuilt) ENCODING1(value2, prebuilt)
if (width >= 3) {
switch (*start) {
case 'A': case 'a':
ENCODING1("ASCII", pm_encoding_ascii);
ENCODING1("ASCII-8BIT", pm_encoding_ascii_8bit);
ENCODING1("ANSI_X3.4-1968", pm_encoding_ascii);
break;
case 'B': case 'b':
ENCODING1("BINARY", pm_encoding_ascii_8bit);
ENCODING1("Big5", pm_encoding_big5);
ENCODING2("Big5-HKSCS", "Big5-HKSCS:2008", pm_encoding_big5_hkscs);
ENCODING1("Big5-UAO", pm_encoding_big5_uao);
break;
case 'C': case 'c':
ENCODING1("CESU-8", pm_encoding_cesu_8);
ENCODING1("CP437", pm_encoding_ibm437);
ENCODING1("CP720", pm_encoding_ibm720);
ENCODING1("CP737", pm_encoding_ibm737);
ENCODING1("CP775", pm_encoding_ibm775);
ENCODING1("CP850", pm_encoding_cp850);
ENCODING1("CP852", pm_encoding_cp852);
ENCODING1("CP855", pm_encoding_cp855);
ENCODING1("CP857", pm_encoding_ibm857);
ENCODING1("CP860", pm_encoding_ibm860);
ENCODING1("CP861", pm_encoding_ibm861);
ENCODING1("CP862", pm_encoding_ibm862);
ENCODING1("CP864", pm_encoding_ibm864);
ENCODING1("CP865", pm_encoding_ibm865);
ENCODING1("CP866", pm_encoding_ibm866);
ENCODING1("CP869", pm_encoding_ibm869);
ENCODING1("CP874", pm_encoding_windows_874);
ENCODING1("CP878", pm_encoding_koi8_r);
ENCODING1("CP863", pm_encoding_ibm863);
ENCODING2("CP932", "csWindows31J", pm_encoding_windows_31j);
ENCODING1("CP936", pm_encoding_gbk);
ENCODING1("CP949", pm_encoding_cp949);
ENCODING1("CP950", pm_encoding_cp950);
ENCODING1("CP951", pm_encoding_cp951);
ENCODING1("CP1250", pm_encoding_windows_1250);
ENCODING1("CP1251", pm_encoding_windows_1251);
ENCODING1("CP1252", pm_encoding_windows_1252);
ENCODING1("CP1253", pm_encoding_windows_1253);
ENCODING1("CP1254", pm_encoding_windows_1254);
ENCODING1("CP1255", pm_encoding_windows_1255);
ENCODING1("CP1256", pm_encoding_windows_1256);
ENCODING1("CP1257", pm_encoding_windows_1257);
ENCODING1("CP1258", pm_encoding_windows_1258);
ENCODING1("CP51932", pm_encoding_cp51932);
ENCODING1("CP65001", pm_encoding_utf_8);
break;
case 'E': case 'e':
ENCODING2("EUC-JP", "eucJP", pm_encoding_euc_jp);
ENCODING2("eucJP-ms", "euc-jp-ms", pm_encoding_euc_jp_ms);
ENCODING2("EUC-JIS-2004", "EUC-JISX0213", pm_encoding_euc_jis_2004);
ENCODING2("EUC-KR", "eucKR", pm_encoding_euc_kr);
ENCODING2("EUC-CN", "eucCN", pm_encoding_gb2312);
ENCODING2("EUC-TW", "eucTW", pm_encoding_euc_tw);
ENCODING1("Emacs-Mule", pm_encoding_emacs_mule);
ENCODING1("external", pm_encoding_utf_8);
break;
case 'F': case 'f':
ENCODING1("filesystem", pm_encoding_utf_8);
break;
case 'G': case 'g':
ENCODING1("GBK", pm_encoding_gbk);
ENCODING1("GB12345", pm_encoding_gb12345);
ENCODING1("GB18030", pm_encoding_gb18030);
ENCODING1("GB1988", pm_encoding_gb1988);
ENCODING1("GB2312", pm_encoding_gb2312);
break;
case 'I': case 'i':
ENCODING1("IBM437", pm_encoding_ibm437);
ENCODING1("IBM720", pm_encoding_ibm720);
ENCODING1("IBM737", pm_encoding_ibm737);
ENCODING1("IBM775", pm_encoding_ibm775);
ENCODING1("IBM850", pm_encoding_cp850);
ENCODING1("IBM852", pm_encoding_ibm852);
ENCODING1("IBM855", pm_encoding_ibm855);
ENCODING1("IBM857", pm_encoding_ibm857);
ENCODING1("IBM860", pm_encoding_ibm860);
ENCODING1("IBM861", pm_encoding_ibm861);
ENCODING1("IBM862", pm_encoding_ibm862);
ENCODING1("IBM863", pm_encoding_ibm863);
ENCODING1("IBM864", pm_encoding_ibm864);
ENCODING1("IBM865", pm_encoding_ibm865);
ENCODING1("IBM866", pm_encoding_ibm866);
ENCODING1("IBM869", pm_encoding_ibm869);
ENCODING2("ISO-8859-1", "ISO8859-1", pm_encoding_iso_8859_1);
ENCODING2("ISO-8859-2", "ISO8859-2", pm_encoding_iso_8859_2);
ENCODING2("ISO-8859-3", "ISO8859-3", pm_encoding_iso_8859_3);
ENCODING2("ISO-8859-4", "ISO8859-4", pm_encoding_iso_8859_4);
ENCODING2("ISO-8859-5", "ISO8859-5", pm_encoding_iso_8859_5);
ENCODING2("ISO-8859-6", "ISO8859-6", pm_encoding_iso_8859_6);
ENCODING2("ISO-8859-7", "ISO8859-7", pm_encoding_iso_8859_7);
ENCODING2("ISO-8859-8", "ISO8859-8", pm_encoding_iso_8859_8);
ENCODING2("ISO-8859-9", "ISO8859-9", pm_encoding_iso_8859_9);
ENCODING2("ISO-8859-10", "ISO8859-10", pm_encoding_iso_8859_10);
ENCODING2("ISO-8859-11", "ISO8859-11", pm_encoding_iso_8859_11);
ENCODING2("ISO-8859-13", "ISO8859-13", pm_encoding_iso_8859_13);
ENCODING2("ISO-8859-14", "ISO8859-14", pm_encoding_iso_8859_14);
ENCODING2("ISO-8859-15", "ISO8859-15", pm_encoding_iso_8859_15);
ENCODING2("ISO-8859-16", "ISO8859-16", pm_encoding_iso_8859_16);
break;
case 'K': case 'k':
ENCODING1("KOI8-R", pm_encoding_koi8_r);
ENCODING1("KOI8-U", pm_encoding_koi8_u);
break;
case 'L': case 'l':
ENCODING1("locale", pm_encoding_utf_8);
break;
case 'M': case 'm':
ENCODING1("macCentEuro", pm_encoding_mac_cent_euro);
ENCODING1("macCroatian", pm_encoding_mac_croatian);
ENCODING1("macCyrillic", pm_encoding_mac_cyrillic);
ENCODING1("macGreek", pm_encoding_mac_greek);
ENCODING1("macIceland", pm_encoding_mac_iceland);
ENCODING1("MacJapanese", pm_encoding_mac_japanese);
ENCODING1("MacJapan", pm_encoding_mac_japanese);
ENCODING1("macRoman", pm_encoding_mac_roman);
ENCODING1("macRomania", pm_encoding_mac_romania);
ENCODING1("macThai", pm_encoding_mac_thai);
ENCODING1("macTurkish", pm_encoding_mac_turkish);
ENCODING1("macUkraine", pm_encoding_mac_ukraine);
break;
case 'P': case 'p':
ENCODING1("PCK", pm_encoding_windows_31j);
break;
case 'S': case 's':
ENCODING1("Shift_JIS", pm_encoding_shift_jis);
ENCODING1("SJIS", pm_encoding_windows_31j);
ENCODING1("SJIS-DoCoMo", pm_encoding_sjis_docomo);
ENCODING1("SJIS-KDDI", pm_encoding_sjis_kddi);
ENCODING1("SJIS-SoftBank", pm_encoding_sjis_softbank);
ENCODING1("stateless-ISO-2022-JP", pm_encoding_stateless_iso_2022_jp);
ENCODING1("stateless-ISO-2022-JP-KDDI", pm_encoding_stateless_iso_2022_jp_kddi);
break;
case 'T': case 't':
ENCODING1("TIS-620", pm_encoding_tis_620);
break;
case 'U': case 'u':
ENCODING1("US-ASCII", pm_encoding_ascii);
ENCODING2("UTF8-MAC", "UTF-8-HFS", pm_encoding_utf8_mac);
ENCODING1("UTF8-DoCoMo", pm_encoding_utf8_docomo);
ENCODING1("UTF8-KDDI", pm_encoding_utf8_kddi);
ENCODING1("UTF8-SoftBank", pm_encoding_utf8_softbank);
break;
case 'W': case 'w':
ENCODING1("Windows-31J", pm_encoding_windows_31j);
ENCODING1("Windows-874", pm_encoding_windows_874);
ENCODING1("Windows-1250", pm_encoding_windows_1250);
ENCODING1("Windows-1251", pm_encoding_windows_1251);
ENCODING1("Windows-1252", pm_encoding_windows_1252);
ENCODING1("Windows-1253", pm_encoding_windows_1253);
ENCODING1("Windows-1254", pm_encoding_windows_1254);
ENCODING1("Windows-1255", pm_encoding_windows_1255);
ENCODING1("Windows-1256", pm_encoding_windows_1256);
ENCODING1("Windows-1257", pm_encoding_windows_1257);
ENCODING1("Windows-1258", pm_encoding_windows_1258);
break;
case '6':
ENCODING1("646", pm_encoding_ascii);
break;
}
}
#undef ENCODING2
#undef ENCODING1
return false;
}