[ruby/prism] Documentation for the encodings

https://github.com/ruby/prism/commit/52a0d80a15
This commit is contained in:
Kevin Newton 2023-10-31 08:54:52 -04:00
parent 493439c9ce
commit 87c6fb8548
8 changed files with 513 additions and 154 deletions

View File

@ -42,7 +42,8 @@ pm_encoding_big5_isupper_char(const uint8_t *b, ptrdiff_t n) {
}
}
pm_encoding_t pm_encoding_big5 = {
/** Big5 encoding */
const pm_encoding_t pm_encoding_big5 = {
.name = "big5",
.char_width = pm_encoding_big5_char_width,
.alnum_char = pm_encoding_big5_alnum_char,

View File

@ -8,36 +8,50 @@
#include <stddef.h>
#include <stdint.h>
/**
* This struct defines the functions necessary to implement the encoding
* interface so we can determine how many bytes the subsequent character takes.
* Each size-returning callback should return the number of bytes, or 0 if the
* next bytes are invalid for the encoding and type. The isupper_char callback
* reports a boolean instead of a byte count.
*/
typedef struct {
/**
* Return the number of bytes that the next character takes if it is valid
* in the encoding. Does not read more than n bytes. It is assumed that n is
* at least 1.
*/
size_t (*char_width)(const uint8_t *b, ptrdiff_t n);
/**
* Return the number of bytes that the next character takes if it is valid
* in the encoding and is alphabetical. Does not read more than n bytes. It
* is assumed that n is at least 1.
*/
size_t (*alpha_char)(const uint8_t *b, ptrdiff_t n);
/**
* Return the number of bytes that the next character takes if it is valid
* in the encoding and is alphanumeric. Does not read more than n bytes. It
* is assumed that n is at least 1.
*/
size_t (*alnum_char)(const uint8_t *b, ptrdiff_t n);
/**
* Return true if the next character is valid in the encoding and is an
* uppercase character. Does not read more than n bytes. It is assumed that
* n is at least 1.
*/
bool (*isupper_char)(const uint8_t *b, ptrdiff_t n);
/**
* The name of the encoding. This should correspond to a value that can be
* passed to Encoding.find in Ruby.
*/
const char *name;
/**
* True if the encoding is a multibyte encoding, false otherwise.
*/
bool multibyte;
} pm_encoding_t;
@ -47,50 +61,109 @@ typedef struct {
/**
 * Flag bits stored in the per-encoding lookup tables. A table entry is masked
 * with one of these to test whether the character is alphanumeric or
 * uppercase. The replacement lists are parenthesized so that the macros expand
 * to a single operand regardless of the operators surrounding them.
 */
#define PRISM_ENCODING_ALPHANUMERIC_BIT (1 << 1)
#define PRISM_ENCODING_UPPERCASE_BIT (1 << 2)
// These functions are reused by some other encodings, so they are defined here
// so they can be shared.
/**
* Return the size of the next character in the ASCII encoding if it is an
* alphabetical character.
*
* @param b The bytes to read.
* @param n The number of bytes that can be read.
* @returns The number of bytes that the next character takes if it is valid in
* the encoding, or 0 if it is not.
*/
size_t pm_encoding_ascii_alpha_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n);
/**
* Return the size of the next character in the ASCII encoding if it is an
* alphanumeric character.
*
* @param b The bytes to read.
* @param n The number of bytes that can be read.
* @returns The number of bytes that the next character takes if it is valid in
* the encoding, or 0 if it is not.
*/
size_t pm_encoding_ascii_alnum_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n);
/**
* Return true if the next character in the ASCII encoding is an uppercase
* character.
*
* @param b The bytes to read.
* @param n The number of bytes that can be read.
* @returns True if the next character is valid in the encoding and is an
* uppercase character, or false if it is not.
*/
bool pm_encoding_ascii_isupper_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n);
// These functions are shared between the actual encoding and the fast path in
// the parser so they need to be internally visible.
/**
* Return the size of the next character in the UTF-8 encoding if it is an
* alphabetical character.
*
* @param b The bytes to read.
* @param n The number of bytes that can be read.
* @returns The number of bytes that the next character takes if it is valid in
* the encoding, or 0 if it is not.
*/
size_t pm_encoding_utf_8_alpha_char(const uint8_t *b, ptrdiff_t n);
/**
* Return the size of the next character in the UTF-8 encoding if it is an
* alphanumeric character.
*
* @param b The bytes to read.
* @param n The number of bytes that can be read.
* @returns The number of bytes that the next character takes if it is valid in
* the encoding, or 0 if it is not.
*/
size_t pm_encoding_utf_8_alnum_char(const uint8_t *b, ptrdiff_t n);
/**
* Return true if the next character in the UTF-8 encoding is an uppercase
* character.
*
* @param b The bytes to read.
* @param n The number of bytes that can be read.
* @returns True if the next character is valid in the encoding and is an
* uppercase character, or false if it is not.
*/
bool pm_encoding_utf_8_isupper_char(const uint8_t *b, ptrdiff_t n);
// This lookup table is referenced in both the UTF-8 encoding file and the
// parser directly in order to speed up the default encoding processing.
/**
* This lookup table is referenced in both the UTF-8 encoding file and the
* parser directly in order to speed up the default encoding processing. It is
* used to indicate whether a character is alphabetical, alphanumeric, or
* uppercase in unicode mappings.
*/
extern const uint8_t pm_encoding_unicode_table[256];
// These are the encodings that are supported by the parser. They are defined in
// Below are the encodings that are supported by the parser. They are defined in
// their own files in the src/enc directory.
extern pm_encoding_t pm_encoding_ascii;
extern pm_encoding_t pm_encoding_ascii_8bit;
extern pm_encoding_t pm_encoding_big5;
extern pm_encoding_t pm_encoding_euc_jp;
extern pm_encoding_t pm_encoding_gbk;
extern pm_encoding_t pm_encoding_iso_8859_1;
extern pm_encoding_t pm_encoding_iso_8859_2;
extern pm_encoding_t pm_encoding_iso_8859_3;
extern pm_encoding_t pm_encoding_iso_8859_4;
extern pm_encoding_t pm_encoding_iso_8859_5;
extern pm_encoding_t pm_encoding_iso_8859_6;
extern pm_encoding_t pm_encoding_iso_8859_7;
extern pm_encoding_t pm_encoding_iso_8859_8;
extern pm_encoding_t pm_encoding_iso_8859_9;
extern pm_encoding_t pm_encoding_iso_8859_10;
extern pm_encoding_t pm_encoding_iso_8859_11;
extern pm_encoding_t pm_encoding_iso_8859_13;
extern pm_encoding_t pm_encoding_iso_8859_14;
extern pm_encoding_t pm_encoding_iso_8859_15;
extern pm_encoding_t pm_encoding_iso_8859_16;
extern pm_encoding_t pm_encoding_koi8_r;
extern pm_encoding_t pm_encoding_shift_jis;
extern pm_encoding_t pm_encoding_utf_8;
extern pm_encoding_t pm_encoding_utf8_mac;
extern pm_encoding_t pm_encoding_windows_31j;
extern pm_encoding_t pm_encoding_windows_1251;
extern pm_encoding_t pm_encoding_windows_1252;
// Read-only encoding definitions. The storage-class specifier is written first:
// placing it after a type qualifier is an obsolescent form in ISO C.
extern const pm_encoding_t pm_encoding_ascii;
extern const pm_encoding_t pm_encoding_ascii_8bit;
extern const pm_encoding_t pm_encoding_big5;
extern const pm_encoding_t pm_encoding_euc_jp;
extern const pm_encoding_t pm_encoding_gbk;
extern const pm_encoding_t pm_encoding_iso_8859_1;
extern const pm_encoding_t pm_encoding_iso_8859_2;
extern const pm_encoding_t pm_encoding_iso_8859_3;
extern const pm_encoding_t pm_encoding_iso_8859_4;
extern const pm_encoding_t pm_encoding_iso_8859_5;
extern const pm_encoding_t pm_encoding_iso_8859_6;
extern const pm_encoding_t pm_encoding_iso_8859_7;
extern const pm_encoding_t pm_encoding_iso_8859_8;
extern const pm_encoding_t pm_encoding_iso_8859_9;
extern const pm_encoding_t pm_encoding_iso_8859_10;
extern const pm_encoding_t pm_encoding_iso_8859_11;
extern const pm_encoding_t pm_encoding_iso_8859_13;
extern const pm_encoding_t pm_encoding_iso_8859_14;
extern const pm_encoding_t pm_encoding_iso_8859_15;
extern const pm_encoding_t pm_encoding_iso_8859_16;
extern const pm_encoding_t pm_encoding_koi8_r;
extern const pm_encoding_t pm_encoding_shift_jis;
extern const pm_encoding_t pm_encoding_utf_8;
extern const pm_encoding_t pm_encoding_utf8_mac;
extern const pm_encoding_t pm_encoding_windows_31j;
extern const pm_encoding_t pm_encoding_windows_1251;
extern const pm_encoding_t pm_encoding_windows_1252;
#endif

View File

@ -48,7 +48,8 @@ pm_encoding_euc_jp_isupper_char(const uint8_t *b, ptrdiff_t n) {
}
}
pm_encoding_t pm_encoding_euc_jp = {
/** EUC-JP encoding */
const pm_encoding_t pm_encoding_euc_jp = {
.name = "euc-jp",
.char_width = pm_encoding_euc_jp_char_width,
.alnum_char = pm_encoding_euc_jp_alnum_char,

View File

@ -51,7 +51,8 @@ pm_encoding_gbk_isupper_char(const uint8_t *b, ptrdiff_t n) {
}
}
pm_encoding_t pm_encoding_gbk = {
/** GBK encoding */
const pm_encoding_t pm_encoding_gbk = {
.name = "gbk",
.char_width = pm_encoding_gbk_char_width,
.alnum_char = pm_encoding_gbk_alnum_char,

View File

@ -46,7 +46,8 @@ pm_encoding_shift_jis_isupper_char(const uint8_t *b, ptrdiff_t n) {
}
}
pm_encoding_t pm_encoding_shift_jis = {
/** Shift_JIS encoding */
const pm_encoding_t pm_encoding_shift_jis = {
.name = "shift_jis",
.char_width = pm_encoding_shift_jis_char_width,
.alnum_char = pm_encoding_shift_jis_alnum_char,

View File

@ -1,7 +1,9 @@
#include "prism/enc/pm_encoding.h"
// Each element of the following table contains a bitfield that indicates a
// piece of information about the corresponding ASCII character.
/**
* Each element of the following table contains a bitfield that indicates a
* piece of information about the corresponding ASCII character.
*/
static uint8_t pm_encoding_ascii_table[256] = {
// 0 1 2 3 4 5 6 7 8 9 A B C D E F
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
@ -22,8 +24,10 @@ static uint8_t pm_encoding_ascii_table[256] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Fx
};
// Each element of the following table contains a bitfield that indicates a
// piece of information about the corresponding ISO-8859-1 character.
/**
* Each element of the following table contains a bitfield that indicates a
* piece of information about the corresponding ISO-8859-1 character.
*/
static uint8_t pm_encoding_iso_8859_1_table[256] = {
// 0 1 2 3 4 5 6 7 8 9 A B C D E F
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
@ -44,8 +48,10 @@ static uint8_t pm_encoding_iso_8859_1_table[256] = {
3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 3, // Fx
};
// Each element of the following table contains a bitfield that indicates a
// piece of information about the corresponding ISO-8859-2 character.
/**
* Each element of the following table contains a bitfield that indicates a
* piece of information about the corresponding ISO-8859-2 character.
*/
static uint8_t pm_encoding_iso_8859_2_table[256] = {
// 0 1 2 3 4 5 6 7 8 9 A B C D E F
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
@ -66,8 +72,10 @@ static uint8_t pm_encoding_iso_8859_2_table[256] = {
3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 0, // Fx
};
// Each element of the following table contains a bitfield that indicates a
// piece of information about the corresponding ISO-8859-3 character.
/**
* Each element of the following table contains a bitfield that indicates a
* piece of information about the corresponding ISO-8859-3 character.
*/
static uint8_t pm_encoding_iso_8859_3_table[256] = {
// 0 1 2 3 4 5 6 7 8 9 A B C D E F
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
@ -88,8 +96,10 @@ static uint8_t pm_encoding_iso_8859_3_table[256] = {
0, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 0, // Fx
};
// Each element of the following table contains a bitfield that indicates a
// piece of information about the corresponding ISO-8859-4 character.
/**
* Each element of the following table contains a bitfield that indicates a
* piece of information about the corresponding ISO-8859-4 character.
*/
static uint8_t pm_encoding_iso_8859_4_table[256] = {
// 0 1 2 3 4 5 6 7 8 9 A B C D E F
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
@ -110,8 +120,10 @@ static uint8_t pm_encoding_iso_8859_4_table[256] = {
3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 0, // Fx
};
// Each element of the following table contains a bitfield that indicates a
// piece of information about the corresponding ISO-8859-5 character.
/**
* Each element of the following table contains a bitfield that indicates a
* piece of information about the corresponding ISO-8859-5 character.
*/
static uint8_t pm_encoding_iso_8859_5_table[256] = {
// 0 1 2 3 4 5 6 7 8 9 A B C D E F
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
@ -132,8 +144,10 @@ static uint8_t pm_encoding_iso_8859_5_table[256] = {
0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 3, 3, // Fx
};
// Each element of the following table contains a bitfield that indicates a
// piece of information about the corresponding ISO-8859-6 character.
/**
* Each element of the following table contains a bitfield that indicates a
* piece of information about the corresponding ISO-8859-6 character.
*/
static uint8_t pm_encoding_iso_8859_6_table[256] = {
// 0 1 2 3 4 5 6 7 8 9 A B C D E F
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
@ -154,8 +168,10 @@ static uint8_t pm_encoding_iso_8859_6_table[256] = {
3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Fx
};
// Each element of the following table contains a bitfield that indicates a
// piece of information about the corresponding ISO-8859-7 character.
/**
* Each element of the following table contains a bitfield that indicates a
* piece of information about the corresponding ISO-8859-7 character.
*/
static uint8_t pm_encoding_iso_8859_7_table[256] = {
// 0 1 2 3 4 5 6 7 8 9 A B C D E F
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
@ -176,8 +192,10 @@ static uint8_t pm_encoding_iso_8859_7_table[256] = {
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, // Fx
};
// Each element of the following table contains a bitfield that indicates a
// piece of information about the corresponding ISO-8859-8 character.
/**
* Each element of the following table contains a bitfield that indicates a
* piece of information about the corresponding ISO-8859-8 character.
*/
static uint8_t pm_encoding_iso_8859_8_table[256] = {
// 0 1 2 3 4 5 6 7 8 9 A B C D E F
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
@ -198,8 +216,10 @@ static uint8_t pm_encoding_iso_8859_8_table[256] = {
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, // Fx
};
// Each element of the following table contains a bitfield that indicates a
// piece of information about the corresponding ISO-8859-9 character.
/**
* Each element of the following table contains a bitfield that indicates a
* piece of information about the corresponding ISO-8859-9 character.
*/
static uint8_t pm_encoding_iso_8859_9_table[256] = {
// 0 1 2 3 4 5 6 7 8 9 A B C D E F
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
@ -220,8 +240,10 @@ static uint8_t pm_encoding_iso_8859_9_table[256] = {
3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 3, // Fx
};
// Each element of the following table contains a bitfield that indicates a
// piece of information about the corresponding ISO-8859-10 character.
/**
* Each element of the following table contains a bitfield that indicates a
* piece of information about the corresponding ISO-8859-10 character.
*/
static uint8_t pm_encoding_iso_8859_10_table[256] = {
// 0 1 2 3 4 5 6 7 8 9 A B C D E F
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
@ -242,8 +264,10 @@ static uint8_t pm_encoding_iso_8859_10_table[256] = {
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // Fx
};
// Each element of the following table contains a bitfield that indicates a
// piece of information about the corresponding ISO-8859-11 character.
/**
* Each element of the following table contains a bitfield that indicates a
* piece of information about the corresponding ISO-8859-11 character.
*/
static uint8_t pm_encoding_iso_8859_11_table[256] = {
// 0 1 2 3 4 5 6 7 8 9 A B C D E F
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
@ -264,8 +288,10 @@ static uint8_t pm_encoding_iso_8859_11_table[256] = {
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, // Fx
};
// Each element of the following table contains a bitfield that indicates a
// piece of information about the corresponding ISO-8859-13 character.
/**
* Each element of the following table contains a bitfield that indicates a
* piece of information about the corresponding ISO-8859-13 character.
*/
static uint8_t pm_encoding_iso_8859_13_table[256] = {
// 0 1 2 3 4 5 6 7 8 9 A B C D E F
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
@ -286,8 +312,10 @@ static uint8_t pm_encoding_iso_8859_13_table[256] = {
3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 0, // Fx
};
// Each element of the following table contains a bitfield that indicates a
// piece of information about the corresponding ISO-8859-14 character.
/**
* Each element of the following table contains a bitfield that indicates a
* piece of information about the corresponding ISO-8859-14 character.
*/
static uint8_t pm_encoding_iso_8859_14_table[256] = {
// 0 1 2 3 4 5 6 7 8 9 A B C D E F
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
@ -308,8 +336,10 @@ static uint8_t pm_encoding_iso_8859_14_table[256] = {
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // Fx
};
// Each element of the following table contains a bitfield that indicates a
// piece of information about the corresponding ISO-8859-15 character.
/**
* Each element of the following table contains a bitfield that indicates a
* piece of information about the corresponding ISO-8859-15 character.
*/
static uint8_t pm_encoding_iso_8859_15_table[256] = {
// 0 1 2 3 4 5 6 7 8 9 A B C D E F
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
@ -330,8 +360,10 @@ static uint8_t pm_encoding_iso_8859_15_table[256] = {
3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 3, // Fx
};
// Each element of the following table contains a bitfield that indicates a
// piece of information about the corresponding ISO-8859-16 character.
/**
* Each element of the following table contains a bitfield that indicates a
* piece of information about the corresponding ISO-8859-16 character.
*/
static uint8_t pm_encoding_iso_8859_16_table[256] = {
// 0 1 2 3 4 5 6 7 8 9 A B C D E F
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
@ -352,8 +384,10 @@ static uint8_t pm_encoding_iso_8859_16_table[256] = {
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // Fx
};
// Each element of the following table contains a bitfield that indicates a
// piece of information about the corresponding KOI8-R character.
/**
* Each element of the following table contains a bitfield that indicates a
* piece of information about the corresponding KOI8-R character.
*/
static uint8_t pm_encoding_koi8_r_table[256] = {
// 0 1 2 3 4 5 6 7 8 9 A B C D E F
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
@ -374,8 +408,10 @@ static uint8_t pm_encoding_koi8_r_table[256] = {
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // Fx
};
// Each element of the following table contains a bitfield that indicates a
// piece of information about the corresponding windows-1251 character.
/**
* Each element of the following table contains a bitfield that indicates a
* piece of information about the corresponding windows-1251 character.
*/
static uint8_t pm_encoding_windows_1251_table[256] = {
// 0 1 2 3 4 5 6 7 8 9 A B C D E F
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
@ -396,8 +432,10 @@ static uint8_t pm_encoding_windows_1251_table[256] = {
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // Fx
};
// Each element of the following table contains a bitfield that indicates a
// piece of information about the corresponding windows-1252 character.
/**
* Each element of the following table contains a bitfield that indicates a
* piece of information about the corresponding windows-1252 character.
*/
static uint8_t pm_encoding_windows_1252_table[256] = {
// 0 1 2 3 4 5 6 7 8 9 A B C D E F
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
@ -418,37 +456,94 @@ static uint8_t pm_encoding_windows_1252_table[256] = {
3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 3, // Fx
};
/**
 * Width in bytes of the next character in the ASCII encoding. A byte with its
 * top bit clear is a one-byte character; anything else is not valid ASCII.
 *
 * @param b The bytes to read; only the first byte is inspected.
 * @param n The number of bytes that can be read (unused).
 * @returns 1 if the first byte is below 0x80, otherwise 0.
 */
static size_t
pm_encoding_ascii_char_width(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) {
    if (*b >= 0x80) {
        return 0;
    }

    return 1;
}
/**
 * Return the size of the next character in the ASCII encoding if it is an
 * alphabetical character.
 *
 * @param b The bytes to read; only the first byte is inspected.
 * @param n The number of bytes that can be read (unused).
 * @returns 1 if the first byte is flagged alphabetic in the ASCII table,
 *     otherwise 0.
 */
size_t
pm_encoding_ascii_alpha_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) {
    // Normalize the masked flag to a byte count, as the alphanumeric variant
    // does, so the result never depends on which bit the flag occupies.
    return (pm_encoding_ascii_table[*b] & PRISM_ENCODING_ALPHABETIC_BIT) ? 1 : 0;
}
/**
 * Return the size of the next character in the ASCII encoding if it is an
 * alphanumeric character.
 *
 * @param b The bytes to read; only the first byte is inspected.
 * @param n The number of bytes that can be read (unused).
 * @returns 1 if the first byte is flagged alphanumeric in the ASCII table,
 *     otherwise 0.
 */
size_t
pm_encoding_ascii_alnum_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) {
    const uint8_t flags = pm_encoding_ascii_table[*b];

    if (flags & PRISM_ENCODING_ALPHANUMERIC_BIT) {
        return 1;
    }

    return 0;
}
/**
 * Return true if the next character in the ASCII encoding is an uppercase
 * character.
 *
 * @param b The bytes to read; only the first byte is inspected.
 * @param n The number of bytes that can be read (unused).
 * @returns True if the first byte is flagged uppercase in the ASCII table,
 *     otherwise false.
 */
bool
pm_encoding_ascii_isupper_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) {
    const uint8_t flags = pm_encoding_ascii_table[*b];

    return (flags & PRISM_ENCODING_UPPERCASE_BIT) != 0;
}
/**
 * Width in bytes of the next character in the KOI8-R encoding.
 *
 * @param b The bytes to read; only the first byte is inspected.
 * @param n The number of bytes that can be read (unused).
 * @returns 1 if the first byte is in 0x20-0x7E or is at least 0x80,
 *     otherwise 0.
 */
static size_t
pm_encoding_koi8_r_char_width(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) {
    const uint8_t byte = *b;

    // Bytes below 0x20 and the single byte 0x7F are the only rejected values.
    if (byte < 0x20 || byte == 0x7F) {
        return 0;
    }

    return 1;
}
/**
 * Width callback shared by the encodings in which a character is always a
 * single byte, whatever its value.
 *
 * @param b The bytes to read (unused).
 * @param n The number of bytes that can be read (unused).
 * @returns Always 1.
 */
static size_t
pm_encoding_single_char_width(PRISM_ATTRIBUTE_UNUSED const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) {
    const size_t width = 1;

    return width;
}
pm_encoding_t pm_encoding_ascii = {
/**
 * Returns the size of the next character in the KOI8-R encoding. The first
 * byte is accepted when it lies in 0x20-0x7E or is at least 0x80.
 *
 * @param b The bytes to read; only the first byte is inspected.
 * @param n The number of bytes that can be read (unused).
 * @returns 1 if the first byte is accepted, otherwise 0.
 */
static size_t
pm_encoding_koi8_r_char_width(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) {
    const uint8_t byte = *b;

    // Bytes below 0x20 and the single byte 0x7F are the only rejected values.
    if (byte < 0x20 || byte == 0x7F) {
        return 0;
    }

    return 1;
}
/**
 * Generate the alpha_char, alnum_char and isupper_char callbacks for an
 * encoding whose classification table is named pm_encoding_<enc>_table. Each
 * callback indexes that table with the first byte of the input and tests one
 * flag bit. The size-returning callbacks normalize the masked value to exactly
 * 1 or 0 so the byte count never depends on which bit a flag occupies.
 */
#define PRISM_ENCODING_TABLE(enc) \
    static size_t pm_encoding_ ## enc ## _alpha_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) { \
        return (pm_encoding_ ## enc ## _table[*b] & PRISM_ENCODING_ALPHABETIC_BIT) ? 1 : 0; \
    } \
    static size_t pm_encoding_ ## enc ## _alnum_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) { \
        return (pm_encoding_ ## enc ## _table[*b] & PRISM_ENCODING_ALPHANUMERIC_BIT) ? 1 : 0; \
    } \
    static bool pm_encoding_ ## enc ## _isupper_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) { \
        return (pm_encoding_ ## enc ## _table[*b] & PRISM_ENCODING_UPPERCASE_BIT) != 0; \
    }

PRISM_ENCODING_TABLE(iso_8859_1)
PRISM_ENCODING_TABLE(iso_8859_2)
PRISM_ENCODING_TABLE(iso_8859_3)
PRISM_ENCODING_TABLE(iso_8859_4)
PRISM_ENCODING_TABLE(iso_8859_5)
PRISM_ENCODING_TABLE(iso_8859_6)
PRISM_ENCODING_TABLE(iso_8859_7)
PRISM_ENCODING_TABLE(iso_8859_8)
PRISM_ENCODING_TABLE(iso_8859_9)
PRISM_ENCODING_TABLE(iso_8859_10)
PRISM_ENCODING_TABLE(iso_8859_11)
PRISM_ENCODING_TABLE(iso_8859_13)
PRISM_ENCODING_TABLE(iso_8859_14)
PRISM_ENCODING_TABLE(iso_8859_15)
PRISM_ENCODING_TABLE(iso_8859_16)
PRISM_ENCODING_TABLE(koi8_r)
PRISM_ENCODING_TABLE(windows_1251)
PRISM_ENCODING_TABLE(windows_1252)

#undef PRISM_ENCODING_TABLE
/** ASCII encoding */
const pm_encoding_t pm_encoding_ascii = {
.name = "ascii",
.char_width = pm_encoding_ascii_char_width,
.alnum_char = pm_encoding_ascii_alnum_char,
@ -457,7 +552,8 @@ pm_encoding_t pm_encoding_ascii = {
.multibyte = false
};
pm_encoding_t pm_encoding_ascii_8bit = {
/** ASCII-8BIT encoding */
const pm_encoding_t pm_encoding_ascii_8bit = {
.name = "ascii-8bit",
.char_width = pm_encoding_single_char_width,
.alnum_char = pm_encoding_ascii_alnum_char,
@ -466,42 +562,182 @@ pm_encoding_t pm_encoding_ascii_8bit = {
.multibyte = false
};
/**
 * Generate, for one table-driven encoding, the alpha_char, alnum_char and
 * isupper_char callbacks plus the pm_encoding_<id> definition that wires them
 * together.
 *
 * label    - the string stored in the encoding's name field
 * id       - the identifier suffix; the table read is pm_encoding_<id>_table
 * width_fn - the function stored in the encoding's char_width field
 *
 * The size-returning callbacks normalize the masked flag to exactly 1 or 0 so
 * the byte count never depends on which bit a flag occupies.
 */
#define PRISM_ENCODING_TABLE(label, id, width_fn) \
    static size_t pm_encoding_ ## id ## _alpha_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) { \
        return (pm_encoding_ ## id ## _table[*b] & PRISM_ENCODING_ALPHABETIC_BIT) ? 1 : 0; \
    } \
    static size_t pm_encoding_ ## id ## _alnum_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) { \
        return (pm_encoding_ ## id ## _table[*b] & PRISM_ENCODING_ALPHANUMERIC_BIT) ? 1 : 0; \
    } \
    static bool pm_encoding_ ## id ## _isupper_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) { \
        return (pm_encoding_ ## id ## _table[*b] & PRISM_ENCODING_UPPERCASE_BIT) != 0; \
    } \
    pm_encoding_t pm_encoding_ ## id = { \
        .name = label, \
        .char_width = width_fn, \
        .alnum_char = pm_encoding_ ## id ## _alnum_char, \
        .alpha_char = pm_encoding_ ## id ## _alpha_char, \
        .isupper_char = pm_encoding_ ## id ## _isupper_char, \
        .multibyte = false, \
    };
/**
 * ISO-8859-1 encoding. Not multibyte: every byte is one character, classified
 * through the ISO-8859-1 callbacks.
 */
const pm_encoding_t pm_encoding_iso_8859_1 = {
    .char_width = pm_encoding_single_char_width,
    .alpha_char = pm_encoding_iso_8859_1_alpha_char,
    .alnum_char = pm_encoding_iso_8859_1_alnum_char,
    .isupper_char = pm_encoding_iso_8859_1_isupper_char,
    .name = "iso-8859-1",
    .multibyte = false,
};
PRISM_ENCODING_TABLE("iso-8859-1", iso_8859_1, pm_encoding_single_char_width)
PRISM_ENCODING_TABLE("iso-8859-2", iso_8859_2, pm_encoding_single_char_width)
PRISM_ENCODING_TABLE("iso-8859-3", iso_8859_3, pm_encoding_single_char_width)
PRISM_ENCODING_TABLE("iso-8859-4", iso_8859_4, pm_encoding_single_char_width)
PRISM_ENCODING_TABLE("iso-8859-5", iso_8859_5, pm_encoding_single_char_width)
PRISM_ENCODING_TABLE("iso-8859-6", iso_8859_6, pm_encoding_single_char_width)
PRISM_ENCODING_TABLE("iso-8859-7", iso_8859_7, pm_encoding_single_char_width)
PRISM_ENCODING_TABLE("iso-8859-8", iso_8859_8, pm_encoding_single_char_width)
PRISM_ENCODING_TABLE("iso-8859-9", iso_8859_9, pm_encoding_single_char_width)
PRISM_ENCODING_TABLE("iso-8859-10", iso_8859_10, pm_encoding_single_char_width)
PRISM_ENCODING_TABLE("iso-8859-11", iso_8859_11, pm_encoding_single_char_width)
PRISM_ENCODING_TABLE("iso-8859-13", iso_8859_13, pm_encoding_single_char_width)
PRISM_ENCODING_TABLE("iso-8859-14", iso_8859_14, pm_encoding_single_char_width)
PRISM_ENCODING_TABLE("iso-8859-15", iso_8859_15, pm_encoding_single_char_width)
PRISM_ENCODING_TABLE("iso-8859-16", iso_8859_16, pm_encoding_single_char_width)
PRISM_ENCODING_TABLE("koi8-r", koi8_r, pm_encoding_koi8_r_char_width)
PRISM_ENCODING_TABLE("windows-1251", windows_1251, pm_encoding_single_char_width)
PRISM_ENCODING_TABLE("windows-1252", windows_1252, pm_encoding_single_char_width)
/**
 * ISO-8859-2 encoding. Not multibyte: every byte is one character, classified
 * through the ISO-8859-2 callbacks.
 */
const pm_encoding_t pm_encoding_iso_8859_2 = {
    .char_width = pm_encoding_single_char_width,
    .alpha_char = pm_encoding_iso_8859_2_alpha_char,
    .alnum_char = pm_encoding_iso_8859_2_alnum_char,
    .isupper_char = pm_encoding_iso_8859_2_isupper_char,
    .name = "iso-8859-2",
    .multibyte = false,
};
#undef PRISM_ENCODING_TABLE
/**
 * ISO-8859-3 encoding. Not multibyte: every byte is one character, classified
 * through the ISO-8859-3 callbacks.
 */
const pm_encoding_t pm_encoding_iso_8859_3 = {
    .char_width = pm_encoding_single_char_width,
    .alpha_char = pm_encoding_iso_8859_3_alpha_char,
    .alnum_char = pm_encoding_iso_8859_3_alnum_char,
    .isupper_char = pm_encoding_iso_8859_3_isupper_char,
    .name = "iso-8859-3",
    .multibyte = false,
};
/**
 * ISO-8859-4 encoding. Not multibyte: every byte is one character, classified
 * through the ISO-8859-4 callbacks.
 */
const pm_encoding_t pm_encoding_iso_8859_4 = {
    .char_width = pm_encoding_single_char_width,
    .alpha_char = pm_encoding_iso_8859_4_alpha_char,
    .alnum_char = pm_encoding_iso_8859_4_alnum_char,
    .isupper_char = pm_encoding_iso_8859_4_isupper_char,
    .name = "iso-8859-4",
    .multibyte = false,
};
/**
 * ISO-8859-5 encoding. Not multibyte: every byte is one character, classified
 * through the ISO-8859-5 callbacks.
 */
const pm_encoding_t pm_encoding_iso_8859_5 = {
    .char_width = pm_encoding_single_char_width,
    .alpha_char = pm_encoding_iso_8859_5_alpha_char,
    .alnum_char = pm_encoding_iso_8859_5_alnum_char,
    .isupper_char = pm_encoding_iso_8859_5_isupper_char,
    .name = "iso-8859-5",
    .multibyte = false,
};
/**
 * ISO-8859-6 encoding. Not multibyte: every byte is one character, classified
 * through the ISO-8859-6 callbacks.
 */
const pm_encoding_t pm_encoding_iso_8859_6 = {
    .char_width = pm_encoding_single_char_width,
    .alpha_char = pm_encoding_iso_8859_6_alpha_char,
    .alnum_char = pm_encoding_iso_8859_6_alnum_char,
    .isupper_char = pm_encoding_iso_8859_6_isupper_char,
    .name = "iso-8859-6",
    .multibyte = false,
};
/**
 * ISO-8859-7 encoding. Not multibyte: every byte is one character, classified
 * through the ISO-8859-7 callbacks.
 */
const pm_encoding_t pm_encoding_iso_8859_7 = {
    .char_width = pm_encoding_single_char_width,
    .alpha_char = pm_encoding_iso_8859_7_alpha_char,
    .alnum_char = pm_encoding_iso_8859_7_alnum_char,
    .isupper_char = pm_encoding_iso_8859_7_isupper_char,
    .name = "iso-8859-7",
    .multibyte = false,
};
/**
 * ISO-8859-8 encoding. Not multibyte: every byte is one character, classified
 * through the ISO-8859-8 callbacks.
 */
const pm_encoding_t pm_encoding_iso_8859_8 = {
    .char_width = pm_encoding_single_char_width,
    .alpha_char = pm_encoding_iso_8859_8_alpha_char,
    .alnum_char = pm_encoding_iso_8859_8_alnum_char,
    .isupper_char = pm_encoding_iso_8859_8_isupper_char,
    .name = "iso-8859-8",
    .multibyte = false,
};
/**
 * ISO-8859-9 encoding. Not multibyte: every byte is one character, classified
 * through the ISO-8859-9 callbacks.
 */
const pm_encoding_t pm_encoding_iso_8859_9 = {
    .char_width = pm_encoding_single_char_width,
    .alpha_char = pm_encoding_iso_8859_9_alpha_char,
    .alnum_char = pm_encoding_iso_8859_9_alnum_char,
    .isupper_char = pm_encoding_iso_8859_9_isupper_char,
    .name = "iso-8859-9",
    .multibyte = false,
};
/**
 * ISO-8859-10 encoding. Not multibyte: every byte is one character, classified
 * through the ISO-8859-10 callbacks.
 */
const pm_encoding_t pm_encoding_iso_8859_10 = {
    .char_width = pm_encoding_single_char_width,
    .alpha_char = pm_encoding_iso_8859_10_alpha_char,
    .alnum_char = pm_encoding_iso_8859_10_alnum_char,
    .isupper_char = pm_encoding_iso_8859_10_isupper_char,
    .name = "iso-8859-10",
    .multibyte = false,
};
/**
 * ISO-8859-11 encoding. Not multibyte: every byte is one character, classified
 * through the ISO-8859-11 callbacks.
 */
const pm_encoding_t pm_encoding_iso_8859_11 = {
    .char_width = pm_encoding_single_char_width,
    .alpha_char = pm_encoding_iso_8859_11_alpha_char,
    .alnum_char = pm_encoding_iso_8859_11_alnum_char,
    .isupper_char = pm_encoding_iso_8859_11_isupper_char,
    .name = "iso-8859-11",
    .multibyte = false,
};
/**
 * ISO-8859-13 encoding. Not multibyte: every byte is one character, classified
 * through the ISO-8859-13 callbacks.
 */
const pm_encoding_t pm_encoding_iso_8859_13 = {
    .char_width = pm_encoding_single_char_width,
    .alpha_char = pm_encoding_iso_8859_13_alpha_char,
    .alnum_char = pm_encoding_iso_8859_13_alnum_char,
    .isupper_char = pm_encoding_iso_8859_13_isupper_char,
    .name = "iso-8859-13",
    .multibyte = false,
};
/**
 * ISO-8859-14 encoding. Not multibyte: every byte is one character, classified
 * through the ISO-8859-14 callbacks.
 */
const pm_encoding_t pm_encoding_iso_8859_14 = {
    .char_width = pm_encoding_single_char_width,
    .alpha_char = pm_encoding_iso_8859_14_alpha_char,
    .alnum_char = pm_encoding_iso_8859_14_alnum_char,
    .isupper_char = pm_encoding_iso_8859_14_isupper_char,
    .name = "iso-8859-14",
    .multibyte = false,
};
/**
 * ISO-8859-15 encoding. Not multibyte: every byte is one character, classified
 * through the ISO-8859-15 callbacks.
 */
const pm_encoding_t pm_encoding_iso_8859_15 = {
    .char_width = pm_encoding_single_char_width,
    .alpha_char = pm_encoding_iso_8859_15_alpha_char,
    .alnum_char = pm_encoding_iso_8859_15_alnum_char,
    .isupper_char = pm_encoding_iso_8859_15_isupper_char,
    .name = "iso-8859-15",
    .multibyte = false,
};
/**
 * ISO-8859-16 encoding. Not multibyte: every byte is one character, classified
 * through the ISO-8859-16 callbacks.
 */
const pm_encoding_t pm_encoding_iso_8859_16 = {
    .char_width = pm_encoding_single_char_width,
    .alpha_char = pm_encoding_iso_8859_16_alpha_char,
    .alnum_char = pm_encoding_iso_8859_16_alnum_char,
    .isupper_char = pm_encoding_iso_8859_16_isupper_char,
    .name = "iso-8859-16",
    .multibyte = false,
};
/**
 * KOI8-R encoding. Not multibyte; it uses its own width callback rather than
 * the shared single-byte one, and classifies through the KOI8-R callbacks.
 */
const pm_encoding_t pm_encoding_koi8_r = {
    .char_width = pm_encoding_koi8_r_char_width,
    .alpha_char = pm_encoding_koi8_r_alpha_char,
    .alnum_char = pm_encoding_koi8_r_alnum_char,
    .isupper_char = pm_encoding_koi8_r_isupper_char,
    .name = "koi8-r",
    .multibyte = false,
};
/**
 * Windows-1251 encoding. Not multibyte: every byte is one character,
 * classified through the Windows-1251 callbacks.
 */
const pm_encoding_t pm_encoding_windows_1251 = {
    .char_width = pm_encoding_single_char_width,
    .alpha_char = pm_encoding_windows_1251_alpha_char,
    .alnum_char = pm_encoding_windows_1251_alnum_char,
    .isupper_char = pm_encoding_windows_1251_isupper_char,
    .name = "windows-1251",
    .multibyte = false,
};
/**
 * Windows-1252 encoding. Not multibyte: every byte is one character,
 * classified through the Windows-1252 callbacks.
 */
const pm_encoding_t pm_encoding_windows_1252 = {
    .char_width = pm_encoding_single_char_width,
    .alpha_char = pm_encoding_windows_1252_alpha_char,
    .alnum_char = pm_encoding_windows_1252_alnum_char,
    .isupper_char = pm_encoding_windows_1252_isupper_char,
    .name = "windows-1252",
    .multibyte = false,
};

View File

@ -1,15 +1,14 @@
// Note that the UTF-8 decoding code is based on Bjoern Hoehrmann's UTF-8 DFA
// decoder. See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
#include "prism/enc/pm_encoding.h"
typedef uint32_t pm_unicode_codepoint_t;
// Each element of the following table contains a bitfield that indicates a
// piece of information about the corresponding unicode codepoint. Note that
// this table is different from other encodings where we used a lookup table
// because the indices of those tables are the byte representations, not the
// codepoints themselves.
/**
* Each element of the following table contains a bitfield that indicates a
* piece of information about the corresponding unicode codepoint. Note that
* this table is different from other encodings where we used a lookup table
* because the indices of those tables are the byte representations, not the
* codepoints themselves.
*/
const uint8_t pm_encoding_unicode_table[256] = {
// 0 1 2 3 4 5 6 7 8 9 A B C D E F
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
@ -2179,8 +2178,12 @@ static const pm_unicode_codepoint_t unicode_isupper_codepoints[UNICODE_ISUPPER_C
0x1F170, 0x1F189,
};
/**
* Binary search through the given list of codepoints to see if the given
* codepoint is in the list.
*/
static bool
pm_unicode_codepoint_match(pm_unicode_codepoint_t codepoint, const pm_unicode_codepoint_t *codepoints, size_t size) {
pm_unicode_codepoint_match(pm_unicode_codepoint_t codepoint, size_t size, const pm_unicode_codepoint_t codepoints[size]) {
size_t start = 0;
size_t end = size;
@ -2202,6 +2205,29 @@ pm_unicode_codepoint_match(pm_unicode_codepoint_t codepoint, const pm_unicode_co
return false;
}
/**
* A state transition table for decoding UTF-8.
*
* Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
static const uint8_t pm_utf_8_dfa[] = {
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
@ -2219,6 +2245,11 @@ static const uint8_t pm_utf_8_dfa[] = {
1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
};
/**
* Given a pointer to a string and the number of bytes remaining in the string,
* decode the next UTF-8 codepoint and return it. The number of bytes consumed
* is returned in the width out parameter.
*/
static pm_unicode_codepoint_t
pm_utf_8_codepoint(const uint8_t *b, ptrdiff_t n, size_t *width) {
assert(n >= 1);
@ -2253,6 +2284,10 @@ pm_encoding_utf_8_char_width(const uint8_t *b, ptrdiff_t n) {
return width;
}
/**
* Return the size of the next character in the UTF-8 encoding if it is an
* alphabetical character.
*/
size_t
pm_encoding_utf_8_alpha_char(const uint8_t *b, ptrdiff_t n) {
if (*b < 0x80) {
@ -2265,10 +2300,14 @@ pm_encoding_utf_8_alpha_char(const uint8_t *b, ptrdiff_t n) {
if (codepoint <= 0xFF) {
return (pm_encoding_unicode_table[(uint8_t) codepoint] & PRISM_ENCODING_ALPHABETIC_BIT) ? width : 0;
} else {
return pm_unicode_codepoint_match(codepoint, unicode_alpha_codepoints, UNICODE_ALPHA_CODEPOINTS_LENGTH) ? width : 0;
return pm_unicode_codepoint_match(codepoint, UNICODE_ALPHA_CODEPOINTS_LENGTH, unicode_alpha_codepoints) ? width : 0;
}
}
/**
* Return the size of the next character in the UTF-8 encoding if it is an
* alphanumeric character.
*/
size_t
pm_encoding_utf_8_alnum_char(const uint8_t *b, ptrdiff_t n) {
if (*b < 0x80) {
@ -2281,10 +2320,14 @@ pm_encoding_utf_8_alnum_char(const uint8_t *b, ptrdiff_t n) {
if (codepoint <= 0xFF) {
return (pm_encoding_unicode_table[(uint8_t) codepoint] & (PRISM_ENCODING_ALPHANUMERIC_BIT)) ? width : 0;
} else {
return pm_unicode_codepoint_match(codepoint, unicode_alnum_codepoints, UNICODE_ALNUM_CODEPOINTS_LENGTH) ? width : 0;
return pm_unicode_codepoint_match(codepoint, UNICODE_ALNUM_CODEPOINTS_LENGTH, unicode_alnum_codepoints) ? width : 0;
}
}
/**
* Return true if the next character in the UTF-8 encoding is an uppercase
* character.
*/
bool
pm_encoding_utf_8_isupper_char(const uint8_t *b, ptrdiff_t n) {
if (*b < 0x80) {
@ -2297,7 +2340,7 @@ pm_encoding_utf_8_isupper_char(const uint8_t *b, ptrdiff_t n) {
if (codepoint <= 0xFF) {
return (pm_encoding_unicode_table[(uint8_t) codepoint] & PRISM_ENCODING_UPPERCASE_BIT) ? true : false;
} else {
return pm_unicode_codepoint_match(codepoint, unicode_isupper_codepoints, UNICODE_ISUPPER_CODEPOINTS_LENGTH) ? true : false;
return pm_unicode_codepoint_match(codepoint, UNICODE_ISUPPER_CODEPOINTS_LENGTH, unicode_isupper_codepoints) ? true : false;
}
}
@ -2305,7 +2348,8 @@ pm_encoding_utf_8_isupper_char(const uint8_t *b, ptrdiff_t n) {
#undef UNICODE_ALNUM_CODEPOINTS_LENGTH
#undef UNICODE_ISUPPER_CODEPOINTS_LENGTH
pm_encoding_t pm_encoding_utf_8 = {
/** UTF-8 */
const pm_encoding_t pm_encoding_utf_8 = {
.name = "utf-8",
.char_width = pm_encoding_utf_8_char_width,
.alnum_char = pm_encoding_utf_8_alnum_char,
@ -2314,7 +2358,8 @@ pm_encoding_t pm_encoding_utf_8 = {
.multibyte = true
};
pm_encoding_t pm_encoding_utf8_mac = {
/** UTF8-mac */
const pm_encoding_t pm_encoding_utf8_mac = {
.name = "utf8-mac",
.char_width = pm_encoding_utf_8_char_width,
.alnum_char = pm_encoding_utf_8_alnum_char,

View File

@ -46,7 +46,8 @@ pm_encoding_windows_31j_isupper_char(const uint8_t *b, ptrdiff_t n) {
}
}
pm_encoding_t pm_encoding_windows_31j = {
/** Windows-31J */
const pm_encoding_t pm_encoding_windows_31j = {
.name = "windows-31j",
.char_width = pm_encoding_windows_31j_char_width,
.alnum_char = pm_encoding_windows_31j_alnum_char,