[ruby/prism] Documentation for the encodings
https://github.com/ruby/prism/commit/52a0d80a15
This commit is contained in:
parent
493439c9ce
commit
87c6fb8548
@ -42,7 +42,8 @@ pm_encoding_big5_isupper_char(const uint8_t *b, ptrdiff_t n) {
|
||||
}
|
||||
}
|
||||
|
||||
pm_encoding_t pm_encoding_big5 = {
|
||||
/** Big5 encoding */
|
||||
const pm_encoding_t pm_encoding_big5 = {
|
||||
.name = "big5",
|
||||
.char_width = pm_encoding_big5_char_width,
|
||||
.alnum_char = pm_encoding_big5_alnum_char,
|
||||
|
@ -8,36 +8,50 @@
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
|
||||
// This struct defines the functions necessary to implement the encoding
|
||||
// interface so we can determine how many bytes the subsequent character takes.
|
||||
// Each callback should return the number of bytes, or 0 if the next bytes are
|
||||
// invalid for the encoding and type.
|
||||
/**
|
||||
* This struct defines the functions necessary to implement the encoding
|
||||
* interface so we can determine how many bytes the subsequent character takes.
|
||||
* Each callback should return the number of bytes, or 0 if the next bytes are
|
||||
* invalid for the encoding and type.
|
||||
*/
|
||||
typedef struct {
|
||||
// Return the number of bytes that the next character takes if it is valid
|
||||
// in the encoding. Does not read more than n bytes. It is assumed that n is
|
||||
// at least 1.
|
||||
/**
|
||||
* Return the number of bytes that the next character takes if it is valid
|
||||
* in the encoding. Does not read more than n bytes. It is assumed that n is
|
||||
* at least 1.
|
||||
*/
|
||||
size_t (*char_width)(const uint8_t *b, ptrdiff_t n);
|
||||
|
||||
// Return the number of bytes that the next character takes if it is valid
|
||||
// in the encoding and is alphabetical. Does not read more than n bytes. It
|
||||
// is assumed that n is at least 1.
|
||||
/**
|
||||
* Return the number of bytes that the next character takes if it is valid
|
||||
* in the encoding and is alphabetical. Does not read more than n bytes. It
|
||||
* is assumed that n is at least 1.
|
||||
*/
|
||||
size_t (*alpha_char)(const uint8_t *b, ptrdiff_t n);
|
||||
|
||||
// Return the number of bytes that the next character takes if it is valid
|
||||
// in the encoding and is alphanumeric. Does not read more than n bytes. It
|
||||
// is assumed that n is at least 1.
|
||||
/**
|
||||
* Return the number of bytes that the next character takes if it is valid
|
||||
* in the encoding and is alphanumeric. Does not read more than n bytes. It
|
||||
* is assumed that n is at least 1.
|
||||
*/
|
||||
size_t (*alnum_char)(const uint8_t *b, ptrdiff_t n);
|
||||
|
||||
// Return true if the next character is valid in the encoding and is an
|
||||
// uppercase character. Does not read more than n bytes. It is assumed that
|
||||
// n is at least 1.
|
||||
/**
|
||||
* Return true if the next character is valid in the encoding and is an
|
||||
* uppercase character. Does not read more than n bytes. It is assumed that
|
||||
* n is at least 1.
|
||||
*/
|
||||
bool (*isupper_char)(const uint8_t *b, ptrdiff_t n);
|
||||
|
||||
// The name of the encoding. This should correspond to a value that can be
|
||||
// passed to Encoding.find in Ruby.
|
||||
/**
|
||||
* The name of the encoding. This should correspond to a value that can be
|
||||
* passed to Encoding.find in Ruby.
|
||||
*/
|
||||
const char *name;
|
||||
|
||||
// Return true if the encoding is a multibyte encoding.
|
||||
/**
|
||||
* True if the encoding is a multibyte encoding, false otherwise.
|
||||
*/
|
||||
bool multibyte;
|
||||
} pm_encoding_t;
|
||||
|
||||
@ -47,50 +61,109 @@ typedef struct {
|
||||
// Flag bits tested against the per-encoding character tables. The replacement
// lists are parenthesized so each flag keeps its value when it is combined
// with operators that bind tighter than <<, such as + or *.
#define PRISM_ENCODING_ALPHANUMERIC_BIT (1 << 1)
#define PRISM_ENCODING_UPPERCASE_BIT (1 << 2)
|
||||
|
||||
// These functions are reused by some other encodings, so they are defined here
|
||||
// so they can be shared.
|
||||
/**
|
||||
* Return the size of the next character in the ASCII encoding if it is an
|
||||
* alphabetical character.
|
||||
*
|
||||
* @param b The bytes to read.
|
||||
* @param n The number of bytes that can be read.
|
||||
* @returns The number of bytes that the next character takes if it is valid in
|
||||
* the encoding, or 0 if it is not.
|
||||
*/
|
||||
size_t pm_encoding_ascii_alpha_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n);
|
||||
|
||||
/**
|
||||
* Return the size of the next character in the ASCII encoding if it is an
|
||||
* alphanumeric character.
|
||||
*
|
||||
* @param b The bytes to read.
|
||||
* @param n The number of bytes that can be read.
|
||||
* @returns The number of bytes that the next character takes if it is valid in
|
||||
* the encoding, or 0 if it is not.
|
||||
*/
|
||||
size_t pm_encoding_ascii_alnum_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n);
|
||||
|
||||
/**
|
||||
* Return true if the next character in the ASCII encoding is an uppercase
|
||||
* character.
|
||||
*
|
||||
* @param b The bytes to read.
|
||||
* @param n The number of bytes that can be read.
|
||||
* @returns True if the next character is valid in the encoding and is an
|
||||
* uppercase character, or false if it is not.
|
||||
*/
|
||||
bool pm_encoding_ascii_isupper_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n);
|
||||
|
||||
// These functions are shared between the actual encoding and the fast path in
|
||||
// the parser so they need to be internally visible.
|
||||
/**
|
||||
* Return the size of the next character in the UTF-8 encoding if it is an
|
||||
* alphabetical character.
|
||||
*
|
||||
* @param b The bytes to read.
|
||||
* @param n The number of bytes that can be read.
|
||||
* @returns The number of bytes that the next character takes if it is valid in
|
||||
* the encoding, or 0 if it is not.
|
||||
*/
|
||||
size_t pm_encoding_utf_8_alpha_char(const uint8_t *b, ptrdiff_t n);
|
||||
|
||||
/**
|
||||
* Return the size of the next character in the UTF-8 encoding if it is an
|
||||
* alphanumeric character.
|
||||
*
|
||||
* @param b The bytes to read.
|
||||
* @param n The number of bytes that can be read.
|
||||
* @returns The number of bytes that the next character takes if it is valid in
|
||||
* the encoding, or 0 if it is not.
|
||||
*/
|
||||
size_t pm_encoding_utf_8_alnum_char(const uint8_t *b, ptrdiff_t n);
|
||||
|
||||
/**
|
||||
* Return true if the next character in the UTF-8 encoding is an uppercase
|
||||
* character.
|
||||
*
|
||||
* @param b The bytes to read.
|
||||
* @param n The number of bytes that can be read.
|
||||
* @returns True if the next character is valid in the encoding and is an
|
||||
* uppercase character, or false if it is not.
|
||||
*/
|
||||
bool pm_encoding_utf_8_isupper_char(const uint8_t *b, ptrdiff_t n);
|
||||
|
||||
// This lookup table is referenced in both the UTF-8 encoding file and the
|
||||
// parser directly in order to speed up the default encoding processing.
|
||||
/**
|
||||
* This lookup table is referenced in both the UTF-8 encoding file and the
|
||||
* parser directly in order to speed up the default encoding processing. It is
|
||||
* used to indicate whether a character is alphabetical, alphanumeric, or
|
||||
* uppercase in unicode mappings.
|
||||
*/
|
||||
extern const uint8_t pm_encoding_unicode_table[256];
|
||||
|
||||
// These are the encodings that are supported by the parser. They are defined in
|
||||
// Below are the encodings that are supported by the parser. They are defined in
|
||||
// their own files in the src/enc directory.
|
||||
extern pm_encoding_t pm_encoding_ascii;
|
||||
extern pm_encoding_t pm_encoding_ascii_8bit;
|
||||
extern pm_encoding_t pm_encoding_big5;
|
||||
extern pm_encoding_t pm_encoding_euc_jp;
|
||||
extern pm_encoding_t pm_encoding_gbk;
|
||||
extern pm_encoding_t pm_encoding_iso_8859_1;
|
||||
extern pm_encoding_t pm_encoding_iso_8859_2;
|
||||
extern pm_encoding_t pm_encoding_iso_8859_3;
|
||||
extern pm_encoding_t pm_encoding_iso_8859_4;
|
||||
extern pm_encoding_t pm_encoding_iso_8859_5;
|
||||
extern pm_encoding_t pm_encoding_iso_8859_6;
|
||||
extern pm_encoding_t pm_encoding_iso_8859_7;
|
||||
extern pm_encoding_t pm_encoding_iso_8859_8;
|
||||
extern pm_encoding_t pm_encoding_iso_8859_9;
|
||||
extern pm_encoding_t pm_encoding_iso_8859_10;
|
||||
extern pm_encoding_t pm_encoding_iso_8859_11;
|
||||
extern pm_encoding_t pm_encoding_iso_8859_13;
|
||||
extern pm_encoding_t pm_encoding_iso_8859_14;
|
||||
extern pm_encoding_t pm_encoding_iso_8859_15;
|
||||
extern pm_encoding_t pm_encoding_iso_8859_16;
|
||||
extern pm_encoding_t pm_encoding_koi8_r;
|
||||
extern pm_encoding_t pm_encoding_shift_jis;
|
||||
extern pm_encoding_t pm_encoding_utf_8;
|
||||
extern pm_encoding_t pm_encoding_utf8_mac;
|
||||
extern pm_encoding_t pm_encoding_windows_31j;
|
||||
extern pm_encoding_t pm_encoding_windows_1251;
|
||||
extern pm_encoding_t pm_encoding_windows_1252;
|
||||
|
||||
// The storage-class specifier is written first: placing `extern` after a type
// qualifier is an obsolescent form that some compilers warn about.
extern const pm_encoding_t pm_encoding_ascii;
extern const pm_encoding_t pm_encoding_ascii_8bit;
extern const pm_encoding_t pm_encoding_big5;
extern const pm_encoding_t pm_encoding_euc_jp;
extern const pm_encoding_t pm_encoding_gbk;
extern const pm_encoding_t pm_encoding_iso_8859_1;
extern const pm_encoding_t pm_encoding_iso_8859_2;
extern const pm_encoding_t pm_encoding_iso_8859_3;
extern const pm_encoding_t pm_encoding_iso_8859_4;
extern const pm_encoding_t pm_encoding_iso_8859_5;
extern const pm_encoding_t pm_encoding_iso_8859_6;
extern const pm_encoding_t pm_encoding_iso_8859_7;
extern const pm_encoding_t pm_encoding_iso_8859_8;
extern const pm_encoding_t pm_encoding_iso_8859_9;
extern const pm_encoding_t pm_encoding_iso_8859_10;
extern const pm_encoding_t pm_encoding_iso_8859_11;
extern const pm_encoding_t pm_encoding_iso_8859_13;
extern const pm_encoding_t pm_encoding_iso_8859_14;
extern const pm_encoding_t pm_encoding_iso_8859_15;
extern const pm_encoding_t pm_encoding_iso_8859_16;
extern const pm_encoding_t pm_encoding_koi8_r;
extern const pm_encoding_t pm_encoding_shift_jis;
extern const pm_encoding_t pm_encoding_utf_8;
extern const pm_encoding_t pm_encoding_utf8_mac;
extern const pm_encoding_t pm_encoding_windows_31j;
extern const pm_encoding_t pm_encoding_windows_1251;
extern const pm_encoding_t pm_encoding_windows_1252;
|
||||
|
||||
#endif
|
||||
|
@ -48,7 +48,8 @@ pm_encoding_euc_jp_isupper_char(const uint8_t *b, ptrdiff_t n) {
|
||||
}
|
||||
}
|
||||
|
||||
pm_encoding_t pm_encoding_euc_jp = {
|
||||
/** EUC-JP encoding */
|
||||
const pm_encoding_t pm_encoding_euc_jp = {
|
||||
.name = "euc-jp",
|
||||
.char_width = pm_encoding_euc_jp_char_width,
|
||||
.alnum_char = pm_encoding_euc_jp_alnum_char,
|
||||
|
@ -51,7 +51,8 @@ pm_encoding_gbk_isupper_char(const uint8_t *b, ptrdiff_t n) {
|
||||
}
|
||||
}
|
||||
|
||||
pm_encoding_t pm_encoding_gbk = {
|
||||
/** GBK encoding */
|
||||
const pm_encoding_t pm_encoding_gbk = {
|
||||
.name = "gbk",
|
||||
.char_width = pm_encoding_gbk_char_width,
|
||||
.alnum_char = pm_encoding_gbk_alnum_char,
|
||||
|
@ -46,7 +46,8 @@ pm_encoding_shift_jis_isupper_char(const uint8_t *b, ptrdiff_t n) {
|
||||
}
|
||||
}
|
||||
|
||||
pm_encoding_t pm_encoding_shift_jis = {
|
||||
/** Shift_JIS encoding */
|
||||
const pm_encoding_t pm_encoding_shift_jis = {
|
||||
.name = "shift_jis",
|
||||
.char_width = pm_encoding_shift_jis_char_width,
|
||||
.alnum_char = pm_encoding_shift_jis_alnum_char,
|
||||
|
@ -1,7 +1,9 @@
|
||||
#include "prism/enc/pm_encoding.h"
|
||||
|
||||
// Each element of the following table contains a bitfield that indicates a
|
||||
// piece of information about the corresponding ASCII character.
|
||||
/**
|
||||
* Each element of the following table contains a bitfield that indicates a
|
||||
* piece of information about the corresponding ASCII character.
|
||||
*/
|
||||
static uint8_t pm_encoding_ascii_table[256] = {
|
||||
// 0 1 2 3 4 5 6 7 8 9 A B C D E F
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
|
||||
@ -22,8 +24,10 @@ static uint8_t pm_encoding_ascii_table[256] = {
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Fx
|
||||
};
|
||||
|
||||
// Each element of the following table contains a bitfield that indicates a
|
||||
// piece of information about the corresponding ISO-8859-1 character.
|
||||
/**
|
||||
* Each element of the following table contains a bitfield that indicates a
|
||||
* piece of information about the corresponding ISO-8859-1 character.
|
||||
*/
|
||||
static uint8_t pm_encoding_iso_8859_1_table[256] = {
|
||||
// 0 1 2 3 4 5 6 7 8 9 A B C D E F
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
|
||||
@ -44,8 +48,10 @@ static uint8_t pm_encoding_iso_8859_1_table[256] = {
|
||||
3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 3, // Fx
|
||||
};
|
||||
|
||||
// Each element of the following table contains a bitfield that indicates a
|
||||
// piece of information about the corresponding ISO-8859-2 character.
|
||||
/**
|
||||
* Each element of the following table contains a bitfield that indicates a
|
||||
* piece of information about the corresponding ISO-8859-2 character.
|
||||
*/
|
||||
static uint8_t pm_encoding_iso_8859_2_table[256] = {
|
||||
// 0 1 2 3 4 5 6 7 8 9 A B C D E F
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
|
||||
@ -66,8 +72,10 @@ static uint8_t pm_encoding_iso_8859_2_table[256] = {
|
||||
3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 0, // Fx
|
||||
};
|
||||
|
||||
// Each element of the following table contains a bitfield that indicates a
|
||||
// piece of information about the corresponding ISO-8859-3 character.
|
||||
/**
|
||||
* Each element of the following table contains a bitfield that indicates a
|
||||
* piece of information about the corresponding ISO-8859-3 character.
|
||||
*/
|
||||
static uint8_t pm_encoding_iso_8859_3_table[256] = {
|
||||
// 0 1 2 3 4 5 6 7 8 9 A B C D E F
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
|
||||
@ -88,8 +96,10 @@ static uint8_t pm_encoding_iso_8859_3_table[256] = {
|
||||
0, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 0, // Fx
|
||||
};
|
||||
|
||||
// Each element of the following table contains a bitfield that indicates a
|
||||
// piece of information about the corresponding ISO-8859-4 character.
|
||||
/**
|
||||
* Each element of the following table contains a bitfield that indicates a
|
||||
* piece of information about the corresponding ISO-8859-4 character.
|
||||
*/
|
||||
static uint8_t pm_encoding_iso_8859_4_table[256] = {
|
||||
// 0 1 2 3 4 5 6 7 8 9 A B C D E F
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
|
||||
@ -110,8 +120,10 @@ static uint8_t pm_encoding_iso_8859_4_table[256] = {
|
||||
3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 0, // Fx
|
||||
};
|
||||
|
||||
// Each element of the following table contains a bitfield that indicates a
|
||||
// piece of information about the corresponding ISO-8859-5 character.
|
||||
/**
|
||||
* Each element of the following table contains a bitfield that indicates a
|
||||
* piece of information about the corresponding ISO-8859-5 character.
|
||||
*/
|
||||
static uint8_t pm_encoding_iso_8859_5_table[256] = {
|
||||
// 0 1 2 3 4 5 6 7 8 9 A B C D E F
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
|
||||
@ -132,8 +144,10 @@ static uint8_t pm_encoding_iso_8859_5_table[256] = {
|
||||
0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 3, 3, // Fx
|
||||
};
|
||||
|
||||
// Each element of the following table contains a bitfield that indicates a
|
||||
// piece of information about the corresponding ISO-8859-6 character.
|
||||
/**
|
||||
* Each element of the following table contains a bitfield that indicates a
|
||||
* piece of information about the corresponding ISO-8859-6 character.
|
||||
*/
|
||||
static uint8_t pm_encoding_iso_8859_6_table[256] = {
|
||||
// 0 1 2 3 4 5 6 7 8 9 A B C D E F
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
|
||||
@ -154,8 +168,10 @@ static uint8_t pm_encoding_iso_8859_6_table[256] = {
|
||||
3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Fx
|
||||
};
|
||||
|
||||
// Each element of the following table contains a bitfield that indicates a
|
||||
// piece of information about the corresponding ISO-8859-7 character.
|
||||
/**
|
||||
* Each element of the following table contains a bitfield that indicates a
|
||||
* piece of information about the corresponding ISO-8859-7 character.
|
||||
*/
|
||||
static uint8_t pm_encoding_iso_8859_7_table[256] = {
|
||||
// 0 1 2 3 4 5 6 7 8 9 A B C D E F
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
|
||||
@ -176,8 +192,10 @@ static uint8_t pm_encoding_iso_8859_7_table[256] = {
|
||||
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, // Fx
|
||||
};
|
||||
|
||||
// Each element of the following table contains a bitfield that indicates a
|
||||
// piece of information about the corresponding ISO-8859-8 character.
|
||||
/**
|
||||
* Each element of the following table contains a bitfield that indicates a
|
||||
* piece of information about the corresponding ISO-8859-8 character.
|
||||
*/
|
||||
static uint8_t pm_encoding_iso_8859_8_table[256] = {
|
||||
// 0 1 2 3 4 5 6 7 8 9 A B C D E F
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
|
||||
@ -198,8 +216,10 @@ static uint8_t pm_encoding_iso_8859_8_table[256] = {
|
||||
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, // Fx
|
||||
};
|
||||
|
||||
// Each element of the following table contains a bitfield that indicates a
|
||||
// piece of information about the corresponding ISO-8859-9 character.
|
||||
/**
|
||||
* Each element of the following table contains a bitfield that indicates a
|
||||
* piece of information about the corresponding ISO-8859-9 character.
|
||||
*/
|
||||
static uint8_t pm_encoding_iso_8859_9_table[256] = {
|
||||
// 0 1 2 3 4 5 6 7 8 9 A B C D E F
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
|
||||
@ -220,8 +240,10 @@ static uint8_t pm_encoding_iso_8859_9_table[256] = {
|
||||
3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 3, // Fx
|
||||
};
|
||||
|
||||
// Each element of the following table contains a bitfield that indicates a
|
||||
// piece of information about the corresponding ISO-8859-10 character.
|
||||
/**
|
||||
* Each element of the following table contains a bitfield that indicates a
|
||||
* piece of information about the corresponding ISO-8859-10 character.
|
||||
*/
|
||||
static uint8_t pm_encoding_iso_8859_10_table[256] = {
|
||||
// 0 1 2 3 4 5 6 7 8 9 A B C D E F
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
|
||||
@ -242,8 +264,10 @@ static uint8_t pm_encoding_iso_8859_10_table[256] = {
|
||||
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // Fx
|
||||
};
|
||||
|
||||
// Each element of the following table contains a bitfield that indicates a
|
||||
// piece of information about the corresponding ISO-8859-11 character.
|
||||
/**
|
||||
* Each element of the following table contains a bitfield that indicates a
|
||||
* piece of information about the corresponding ISO-8859-11 character.
|
||||
*/
|
||||
static uint8_t pm_encoding_iso_8859_11_table[256] = {
|
||||
// 0 1 2 3 4 5 6 7 8 9 A B C D E F
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
|
||||
@ -264,8 +288,10 @@ static uint8_t pm_encoding_iso_8859_11_table[256] = {
|
||||
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, // Fx
|
||||
};
|
||||
|
||||
// Each element of the following table contains a bitfield that indicates a
|
||||
// piece of information about the corresponding ISO-8859-13 character.
|
||||
/**
|
||||
* Each element of the following table contains a bitfield that indicates a
|
||||
* piece of information about the corresponding ISO-8859-13 character.
|
||||
*/
|
||||
static uint8_t pm_encoding_iso_8859_13_table[256] = {
|
||||
// 0 1 2 3 4 5 6 7 8 9 A B C D E F
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
|
||||
@ -286,8 +312,10 @@ static uint8_t pm_encoding_iso_8859_13_table[256] = {
|
||||
3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 0, // Fx
|
||||
};
|
||||
|
||||
// Each element of the following table contains a bitfield that indicates a
|
||||
// piece of information about the corresponding ISO-8859-14 character.
|
||||
/**
|
||||
* Each element of the following table contains a bitfield that indicates a
|
||||
* piece of information about the corresponding ISO-8859-14 character.
|
||||
*/
|
||||
static uint8_t pm_encoding_iso_8859_14_table[256] = {
|
||||
// 0 1 2 3 4 5 6 7 8 9 A B C D E F
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
|
||||
@ -308,8 +336,10 @@ static uint8_t pm_encoding_iso_8859_14_table[256] = {
|
||||
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // Fx
|
||||
};
|
||||
|
||||
// Each element of the following table contains a bitfield that indicates a
|
||||
// piece of information about the corresponding ISO-8859-15 character.
|
||||
/**
|
||||
* Each element of the following table contains a bitfield that indicates a
|
||||
* piece of information about the corresponding ISO-8859-15 character.
|
||||
*/
|
||||
static uint8_t pm_encoding_iso_8859_15_table[256] = {
|
||||
// 0 1 2 3 4 5 6 7 8 9 A B C D E F
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
|
||||
@ -330,8 +360,10 @@ static uint8_t pm_encoding_iso_8859_15_table[256] = {
|
||||
3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 3, // Fx
|
||||
};
|
||||
|
||||
// Each element of the following table contains a bitfield that indicates a
|
||||
// piece of information about the corresponding ISO-8859-16 character.
|
||||
/**
|
||||
* Each element of the following table contains a bitfield that indicates a
|
||||
* piece of information about the corresponding ISO-8859-16 character.
|
||||
*/
|
||||
static uint8_t pm_encoding_iso_8859_16_table[256] = {
|
||||
// 0 1 2 3 4 5 6 7 8 9 A B C D E F
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
|
||||
@ -352,8 +384,10 @@ static uint8_t pm_encoding_iso_8859_16_table[256] = {
|
||||
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // Fx
|
||||
};
|
||||
|
||||
// Each element of the following table contains a bitfield that indicates a
|
||||
// piece of information about the corresponding KOI8-R character.
|
||||
/**
|
||||
* Each element of the following table contains a bitfield that indicates a
|
||||
* piece of information about the corresponding KOI8-R character.
|
||||
*/
|
||||
static uint8_t pm_encoding_koi8_r_table[256] = {
|
||||
// 0 1 2 3 4 5 6 7 8 9 A B C D E F
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
|
||||
@ -374,8 +408,10 @@ static uint8_t pm_encoding_koi8_r_table[256] = {
|
||||
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // Fx
|
||||
};
|
||||
|
||||
// Each element of the following table contains a bitfield that indicates a
|
||||
// piece of information about the corresponding windows-1251 character.
|
||||
/**
|
||||
* Each element of the following table contains a bitfield that indicates a
|
||||
* piece of information about the corresponding windows-1251 character.
|
||||
*/
|
||||
static uint8_t pm_encoding_windows_1251_table[256] = {
|
||||
// 0 1 2 3 4 5 6 7 8 9 A B C D E F
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
|
||||
@ -396,8 +432,10 @@ static uint8_t pm_encoding_windows_1251_table[256] = {
|
||||
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // Fx
|
||||
};
|
||||
|
||||
// Each element of the following table contains a bitfield that indicates a
|
||||
// piece of information about the corresponding windows-1252 character.
|
||||
/**
|
||||
* Each element of the following table contains a bitfield that indicates a
|
||||
* piece of information about the corresponding windows-1252 character.
|
||||
*/
|
||||
static uint8_t pm_encoding_windows_1252_table[256] = {
|
||||
// 0 1 2 3 4 5 6 7 8 9 A B C D E F
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
|
||||
@ -418,37 +456,94 @@ static uint8_t pm_encoding_windows_1252_table[256] = {
|
||||
3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 3, // Fx
|
||||
};
|
||||
|
||||
/**
|
||||
* Returns the size of the next character in the ASCII encoding. This basically
|
||||
* means that if the top bit is not set, the character is 1 byte long.
|
||||
*/
|
||||
static size_t
|
||||
pm_encoding_ascii_char_width(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) {
|
||||
return *b < 0x80 ? 1 : 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the size of the next character in the ASCII encoding if it is an
|
||||
* alphabetical character.
|
||||
*/
|
||||
size_t
|
||||
pm_encoding_ascii_alpha_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) {
|
||||
return (pm_encoding_ascii_table[*b] & PRISM_ENCODING_ALPHABETIC_BIT);
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the size of the next character in the ASCII encoding if it is an
|
||||
* alphanumeric character.
|
||||
*/
|
||||
size_t
|
||||
pm_encoding_ascii_alnum_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) {
|
||||
return (pm_encoding_ascii_table[*b] & PRISM_ENCODING_ALPHANUMERIC_BIT) ? 1 : 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return true if the next character in the ASCII encoding if it is an uppercase
|
||||
* character.
|
||||
*/
|
||||
bool
|
||||
pm_encoding_ascii_isupper_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) {
|
||||
return (pm_encoding_ascii_table[*b] & PRISM_ENCODING_UPPERCASE_BIT);
|
||||
}
|
||||
|
||||
static size_t
|
||||
pm_encoding_koi8_r_char_width(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) {
|
||||
return ((*b >= 0x20 && *b <= 0x7E) || (*b >= 0x80)) ? 1 : 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* For a lot of encodings the default is that they are a single byte long no
|
||||
* matter what the codepoint, so this function is shared between them.
|
||||
*/
|
||||
static size_t
|
||||
pm_encoding_single_char_width(PRISM_ATTRIBUTE_UNUSED const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
pm_encoding_t pm_encoding_ascii = {
|
||||
/**
|
||||
* Returns the size of the next character in the KOI-8 encoding. This means
|
||||
* checking if it's a valid codepoint in KOI-8 and if it is returning 1.
|
||||
*/
|
||||
static size_t
|
||||
pm_encoding_koi8_r_char_width(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) {
|
||||
return ((*b >= 0x20 && *b <= 0x7E) || (*b >= 0x80)) ? 1 : 0;
|
||||
}
|
||||
|
||||
/**
 * Define the alpha_char, alnum_char and isupper_char callbacks for the
 * encoding `name` by looking the first byte up in pm_encoding_<name>_table.
 * The alpha and alnum callbacks return 1 when the corresponding flag is set
 * and 0 otherwise; the isupper callback returns whether the uppercase flag is
 * set. The alpha callback normalizes the masked flag with `? 1 : 0`, like the
 * alnum callback, rather than returning the raw bit.
 */
#define PRISM_ENCODING_TABLE(name) \
    static size_t pm_encoding_ ##name ## _alpha_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) { \
        return (pm_encoding_ ##name ## _table[*b] & PRISM_ENCODING_ALPHABETIC_BIT) ? 1 : 0; \
    } \
    static size_t pm_encoding_ ##name ## _alnum_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) { \
        return (pm_encoding_ ##name ## _table[*b] & PRISM_ENCODING_ALPHANUMERIC_BIT) ? 1 : 0; \
    } \
    static bool pm_encoding_ ##name ## _isupper_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) { \
        return (pm_encoding_ ##name ## _table[*b] & PRISM_ENCODING_UPPERCASE_BIT); \
    }
|
||||
|
||||
PRISM_ENCODING_TABLE(iso_8859_1)
|
||||
PRISM_ENCODING_TABLE(iso_8859_2)
|
||||
PRISM_ENCODING_TABLE(iso_8859_3)
|
||||
PRISM_ENCODING_TABLE(iso_8859_4)
|
||||
PRISM_ENCODING_TABLE(iso_8859_5)
|
||||
PRISM_ENCODING_TABLE(iso_8859_6)
|
||||
PRISM_ENCODING_TABLE(iso_8859_7)
|
||||
PRISM_ENCODING_TABLE(iso_8859_8)
|
||||
PRISM_ENCODING_TABLE(iso_8859_9)
|
||||
PRISM_ENCODING_TABLE(iso_8859_10)
|
||||
PRISM_ENCODING_TABLE(iso_8859_11)
|
||||
PRISM_ENCODING_TABLE(iso_8859_13)
|
||||
PRISM_ENCODING_TABLE(iso_8859_14)
|
||||
PRISM_ENCODING_TABLE(iso_8859_15)
|
||||
PRISM_ENCODING_TABLE(iso_8859_16)
|
||||
PRISM_ENCODING_TABLE(koi8_r)
|
||||
PRISM_ENCODING_TABLE(windows_1251)
|
||||
PRISM_ENCODING_TABLE(windows_1252)
|
||||
|
||||
#undef PRISM_ENCODING_TABLE
|
||||
|
||||
/** ASCII encoding */
|
||||
const pm_encoding_t pm_encoding_ascii = {
|
||||
.name = "ascii",
|
||||
.char_width = pm_encoding_ascii_char_width,
|
||||
.alnum_char = pm_encoding_ascii_alnum_char,
|
||||
@ -457,7 +552,8 @@ pm_encoding_t pm_encoding_ascii = {
|
||||
.multibyte = false
|
||||
};
|
||||
|
||||
pm_encoding_t pm_encoding_ascii_8bit = {
|
||||
/** ASCII-8BIT encoding */
|
||||
const pm_encoding_t pm_encoding_ascii_8bit = {
|
||||
.name = "ascii-8bit",
|
||||
.char_width = pm_encoding_single_char_width,
|
||||
.alnum_char = pm_encoding_ascii_alnum_char,
|
||||
@ -466,42 +562,182 @@ pm_encoding_t pm_encoding_ascii_8bit = {
|
||||
.multibyte = false
|
||||
};
|
||||
|
||||
#define PRISM_ENCODING_TABLE(s, i, w) \
|
||||
static size_t pm_encoding_ ##i ## _alpha_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) { \
|
||||
return (pm_encoding_ ##i ## _table[*b] & PRISM_ENCODING_ALPHABETIC_BIT); \
|
||||
} \
|
||||
static size_t pm_encoding_ ##i ## _alnum_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) { \
|
||||
return (pm_encoding_ ##i ## _table[*b] & PRISM_ENCODING_ALPHANUMERIC_BIT) ? 1 : 0; \
|
||||
} \
|
||||
static bool pm_encoding_ ##i ## _isupper_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) { \
|
||||
return (pm_encoding_ ##i ## _table[*b] & PRISM_ENCODING_UPPERCASE_BIT); \
|
||||
} \
|
||||
pm_encoding_t pm_encoding_ ##i = { \
|
||||
.name = s, \
|
||||
.char_width = w, \
|
||||
.alnum_char = pm_encoding_ ##i ## _alnum_char, \
|
||||
.alpha_char = pm_encoding_ ##i ## _alpha_char, \
|
||||
.isupper_char = pm_encoding_ ##i ## _isupper_char, \
|
||||
.multibyte = false, \
|
||||
};
|
||||
/** ISO-8859-1 */
|
||||
const pm_encoding_t pm_encoding_iso_8859_1 = {
|
||||
.name = "iso-8859-1",
|
||||
.char_width = pm_encoding_single_char_width,
|
||||
.alnum_char = pm_encoding_iso_8859_1_alnum_char,
|
||||
.alpha_char = pm_encoding_iso_8859_1_alpha_char,
|
||||
.isupper_char = pm_encoding_iso_8859_1_isupper_char,
|
||||
.multibyte = false
|
||||
};
|
||||
|
||||
PRISM_ENCODING_TABLE("iso-8859-1", iso_8859_1, pm_encoding_single_char_width)
|
||||
PRISM_ENCODING_TABLE("iso-8859-2", iso_8859_2, pm_encoding_single_char_width)
|
||||
PRISM_ENCODING_TABLE("iso-8859-3", iso_8859_3, pm_encoding_single_char_width)
|
||||
PRISM_ENCODING_TABLE("iso-8859-4", iso_8859_4, pm_encoding_single_char_width)
|
||||
PRISM_ENCODING_TABLE("iso-8859-5", iso_8859_5, pm_encoding_single_char_width)
|
||||
PRISM_ENCODING_TABLE("iso-8859-6", iso_8859_6, pm_encoding_single_char_width)
|
||||
PRISM_ENCODING_TABLE("iso-8859-7", iso_8859_7, pm_encoding_single_char_width)
|
||||
PRISM_ENCODING_TABLE("iso-8859-8", iso_8859_8, pm_encoding_single_char_width)
|
||||
PRISM_ENCODING_TABLE("iso-8859-9", iso_8859_9, pm_encoding_single_char_width)
|
||||
PRISM_ENCODING_TABLE("iso-8859-10", iso_8859_10, pm_encoding_single_char_width)
|
||||
PRISM_ENCODING_TABLE("iso-8859-11", iso_8859_11, pm_encoding_single_char_width)
|
||||
PRISM_ENCODING_TABLE("iso-8859-13", iso_8859_13, pm_encoding_single_char_width)
|
||||
PRISM_ENCODING_TABLE("iso-8859-14", iso_8859_14, pm_encoding_single_char_width)
|
||||
PRISM_ENCODING_TABLE("iso-8859-15", iso_8859_15, pm_encoding_single_char_width)
|
||||
PRISM_ENCODING_TABLE("iso-8859-16", iso_8859_16, pm_encoding_single_char_width)
|
||||
PRISM_ENCODING_TABLE("koi8-r", koi8_r, pm_encoding_koi8_r_char_width)
|
||||
PRISM_ENCODING_TABLE("windows-1251", windows_1251, pm_encoding_single_char_width)
|
||||
PRISM_ENCODING_TABLE("windows-1252", windows_1252, pm_encoding_single_char_width)
|
||||
/** ISO-8859-2 */
|
||||
const pm_encoding_t pm_encoding_iso_8859_2 = {
|
||||
.name = "iso-8859-2",
|
||||
.char_width = pm_encoding_single_char_width,
|
||||
.alnum_char = pm_encoding_iso_8859_2_alnum_char,
|
||||
.alpha_char = pm_encoding_iso_8859_2_alpha_char,
|
||||
.isupper_char = pm_encoding_iso_8859_2_isupper_char,
|
||||
.multibyte = false
|
||||
};
|
||||
|
||||
#undef PRISM_ENCODING_TABLE
|
||||
/** ISO-8859-3 */
|
||||
const pm_encoding_t pm_encoding_iso_8859_3 = {
|
||||
.name = "iso-8859-3",
|
||||
.char_width = pm_encoding_single_char_width,
|
||||
.alnum_char = pm_encoding_iso_8859_3_alnum_char,
|
||||
.alpha_char = pm_encoding_iso_8859_3_alpha_char,
|
||||
.isupper_char = pm_encoding_iso_8859_3_isupper_char,
|
||||
.multibyte = false
|
||||
};
|
||||
|
||||
/** ISO-8859-4 */
|
||||
const pm_encoding_t pm_encoding_iso_8859_4 = {
|
||||
.name = "iso-8859-4",
|
||||
.char_width = pm_encoding_single_char_width,
|
||||
.alnum_char = pm_encoding_iso_8859_4_alnum_char,
|
||||
.alpha_char = pm_encoding_iso_8859_4_alpha_char,
|
||||
.isupper_char = pm_encoding_iso_8859_4_isupper_char,
|
||||
.multibyte = false
|
||||
};
|
||||
|
||||
/** ISO-8859-5 */
|
||||
const pm_encoding_t pm_encoding_iso_8859_5 = {
|
||||
.name = "iso-8859-5",
|
||||
.char_width = pm_encoding_single_char_width,
|
||||
.alnum_char = pm_encoding_iso_8859_5_alnum_char,
|
||||
.alpha_char = pm_encoding_iso_8859_5_alpha_char,
|
||||
.isupper_char = pm_encoding_iso_8859_5_isupper_char,
|
||||
.multibyte = false
|
||||
};
|
||||
|
||||
/** ISO-8859-6 */
|
||||
const pm_encoding_t pm_encoding_iso_8859_6 = {
|
||||
.name = "iso-8859-6",
|
||||
.char_width = pm_encoding_single_char_width,
|
||||
.alnum_char = pm_encoding_iso_8859_6_alnum_char,
|
||||
.alpha_char = pm_encoding_iso_8859_6_alpha_char,
|
||||
.isupper_char = pm_encoding_iso_8859_6_isupper_char,
|
||||
.multibyte = false
|
||||
};
|
||||
|
||||
/** ISO-8859-7 */
|
||||
const pm_encoding_t pm_encoding_iso_8859_7 = {
|
||||
.name = "iso-8859-7",
|
||||
.char_width = pm_encoding_single_char_width,
|
||||
.alnum_char = pm_encoding_iso_8859_7_alnum_char,
|
||||
.alpha_char = pm_encoding_iso_8859_7_alpha_char,
|
||||
.isupper_char = pm_encoding_iso_8859_7_isupper_char,
|
||||
.multibyte = false
|
||||
};
|
||||
|
||||
/** ISO-8859-8 */
|
||||
const pm_encoding_t pm_encoding_iso_8859_8 = {
|
||||
.name = "iso-8859-8",
|
||||
.char_width = pm_encoding_single_char_width,
|
||||
.alnum_char = pm_encoding_iso_8859_8_alnum_char,
|
||||
.alpha_char = pm_encoding_iso_8859_8_alpha_char,
|
||||
.isupper_char = pm_encoding_iso_8859_8_isupper_char,
|
||||
.multibyte = false
|
||||
};
|
||||
|
||||
/** ISO-8859-9 */
|
||||
const pm_encoding_t pm_encoding_iso_8859_9 = {
|
||||
.name = "iso-8859-9",
|
||||
.char_width = pm_encoding_single_char_width,
|
||||
.alnum_char = pm_encoding_iso_8859_9_alnum_char,
|
||||
.alpha_char = pm_encoding_iso_8859_9_alpha_char,
|
||||
.isupper_char = pm_encoding_iso_8859_9_isupper_char,
|
||||
.multibyte = false
|
||||
};
|
||||
|
||||
/** ISO-8859-10 */
|
||||
const pm_encoding_t pm_encoding_iso_8859_10 = {
|
||||
.name = "iso-8859-10",
|
||||
.char_width = pm_encoding_single_char_width,
|
||||
.alnum_char = pm_encoding_iso_8859_10_alnum_char,
|
||||
.alpha_char = pm_encoding_iso_8859_10_alpha_char,
|
||||
.isupper_char = pm_encoding_iso_8859_10_isupper_char,
|
||||
.multibyte = false
|
||||
};
|
||||
|
||||
/** ISO-8859-11 */
|
||||
const pm_encoding_t pm_encoding_iso_8859_11 = {
|
||||
.name = "iso-8859-11",
|
||||
.char_width = pm_encoding_single_char_width,
|
||||
.alnum_char = pm_encoding_iso_8859_11_alnum_char,
|
||||
.alpha_char = pm_encoding_iso_8859_11_alpha_char,
|
||||
.isupper_char = pm_encoding_iso_8859_11_isupper_char,
|
||||
.multibyte = false
|
||||
};
|
||||
|
||||
/** ISO-8859-13 */
|
||||
const pm_encoding_t pm_encoding_iso_8859_13 = {
|
||||
.name = "iso-8859-13",
|
||||
.char_width = pm_encoding_single_char_width,
|
||||
.alnum_char = pm_encoding_iso_8859_13_alnum_char,
|
||||
.alpha_char = pm_encoding_iso_8859_13_alpha_char,
|
||||
.isupper_char = pm_encoding_iso_8859_13_isupper_char,
|
||||
.multibyte = false
|
||||
};
|
||||
|
||||
/** ISO-8859-14 */
|
||||
const pm_encoding_t pm_encoding_iso_8859_14 = {
|
||||
.name = "iso-8859-14",
|
||||
.char_width = pm_encoding_single_char_width,
|
||||
.alnum_char = pm_encoding_iso_8859_14_alnum_char,
|
||||
.alpha_char = pm_encoding_iso_8859_14_alpha_char,
|
||||
.isupper_char = pm_encoding_iso_8859_14_isupper_char,
|
||||
.multibyte = false
|
||||
};
|
||||
|
||||
/** ISO-8859-15 */
|
||||
const pm_encoding_t pm_encoding_iso_8859_15 = {
|
||||
.name = "iso-8859-15",
|
||||
.char_width = pm_encoding_single_char_width,
|
||||
.alnum_char = pm_encoding_iso_8859_15_alnum_char,
|
||||
.alpha_char = pm_encoding_iso_8859_15_alpha_char,
|
||||
.isupper_char = pm_encoding_iso_8859_15_isupper_char,
|
||||
.multibyte = false
|
||||
};
|
||||
|
||||
/** ISO-8859-16 */
|
||||
const pm_encoding_t pm_encoding_iso_8859_16 = {
|
||||
.name = "iso-8859-16",
|
||||
.char_width = pm_encoding_single_char_width,
|
||||
.alnum_char = pm_encoding_iso_8859_16_alnum_char,
|
||||
.alpha_char = pm_encoding_iso_8859_16_alpha_char,
|
||||
.isupper_char = pm_encoding_iso_8859_16_isupper_char,
|
||||
.multibyte = false
|
||||
};
|
||||
|
||||
/** KOI8-R */
|
||||
const pm_encoding_t pm_encoding_koi8_r = {
|
||||
.name = "koi8-r",
|
||||
.char_width = pm_encoding_koi8_r_char_width,
|
||||
.alnum_char = pm_encoding_koi8_r_alnum_char,
|
||||
.alpha_char = pm_encoding_koi8_r_alpha_char,
|
||||
.isupper_char = pm_encoding_koi8_r_isupper_char,
|
||||
.multibyte = false
|
||||
};
|
||||
|
||||
/** Windows-1251 */
|
||||
const pm_encoding_t pm_encoding_windows_1251 = {
|
||||
.name = "windows-1251",
|
||||
.char_width = pm_encoding_single_char_width,
|
||||
.alnum_char = pm_encoding_windows_1251_alnum_char,
|
||||
.alpha_char = pm_encoding_windows_1251_alpha_char,
|
||||
.isupper_char = pm_encoding_windows_1251_isupper_char,
|
||||
.multibyte = false
|
||||
};
|
||||
|
||||
/** Windows-1252 */
|
||||
const pm_encoding_t pm_encoding_windows_1252 = {
|
||||
.name = "windows-1252",
|
||||
.char_width = pm_encoding_single_char_width,
|
||||
.alnum_char = pm_encoding_windows_1252_alnum_char,
|
||||
.alpha_char = pm_encoding_windows_1252_alpha_char,
|
||||
.isupper_char = pm_encoding_windows_1252_isupper_char,
|
||||
.multibyte = false
|
||||
};
|
||||
|
@ -1,15 +1,14 @@
|
||||
// Note that the UTF-8 decoding code is based on Bjoern Hoehrmann's UTF-8 DFA
|
||||
// decoder. See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
|
||||
|
||||
#include "prism/enc/pm_encoding.h"
|
||||
|
||||
typedef uint32_t pm_unicode_codepoint_t;
|
||||
|
||||
// Each element of the following table contains a bitfield that indicates a
|
||||
// piece of information about the corresponding unicode codepoint. Note that
|
||||
// this table is different from other encodings where we used a lookup table
|
||||
// because the indices of those tables are the byte representations, not the
|
||||
// codepoints themselves.
|
||||
/**
|
||||
* Each element of the following table contains a bitfield that indicates a
|
||||
* piece of information about the corresponding unicode codepoint. Note that
|
||||
* this table is different from other encodings where we used a lookup table
|
||||
* because the indices of those tables are the byte representations, not the
|
||||
* codepoints themselves.
|
||||
*/
|
||||
const uint8_t pm_encoding_unicode_table[256] = {
|
||||
// 0 1 2 3 4 5 6 7 8 9 A B C D E F
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
|
||||
@ -2179,8 +2178,12 @@ static const pm_unicode_codepoint_t unicode_isupper_codepoints[UNICODE_ISUPPER_C
|
||||
0x1F170, 0x1F189,
|
||||
};
|
||||
|
||||
/**
|
||||
* Binary search through the given list of codepoints to see if the given
|
||||
* codepoint is in the list.
|
||||
*/
|
||||
static bool
|
||||
pm_unicode_codepoint_match(pm_unicode_codepoint_t codepoint, const pm_unicode_codepoint_t *codepoints, size_t size) {
|
||||
pm_unicode_codepoint_match(pm_unicode_codepoint_t codepoint, size_t size, const pm_unicode_codepoint_t codepoints[size]) {
|
||||
size_t start = 0;
|
||||
size_t end = size;
|
||||
|
||||
@ -2202,6 +2205,29 @@ pm_unicode_codepoint_match(pm_unicode_codepoint_t codepoint, const pm_unicode_co
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* A state transition table for decoding UTF-8.
|
||||
*
|
||||
* Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
* SOFTWARE.
|
||||
*/
|
||||
static const uint8_t pm_utf_8_dfa[] = {
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
|
||||
@ -2219,6 +2245,11 @@ static const uint8_t pm_utf_8_dfa[] = {
|
||||
1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
|
||||
};
|
||||
|
||||
/**
|
||||
* Given a pointer to a string and the number of bytes remaining in the string,
|
||||
* decode the next UTF-8 codepoint and return it. The number of bytes consumed
|
||||
* is returned in the width out parameter.
|
||||
*/
|
||||
static pm_unicode_codepoint_t
|
||||
pm_utf_8_codepoint(const uint8_t *b, ptrdiff_t n, size_t *width) {
|
||||
assert(n >= 1);
|
||||
@ -2253,6 +2284,10 @@ pm_encoding_utf_8_char_width(const uint8_t *b, ptrdiff_t n) {
|
||||
return width;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the size of the next character in the UTF-8 encoding if it is an
|
||||
* alphabetical character.
|
||||
*/
|
||||
size_t
|
||||
pm_encoding_utf_8_alpha_char(const uint8_t *b, ptrdiff_t n) {
|
||||
if (*b < 0x80) {
|
||||
@ -2265,10 +2300,14 @@ pm_encoding_utf_8_alpha_char(const uint8_t *b, ptrdiff_t n) {
|
||||
if (codepoint <= 0xFF) {
|
||||
return (pm_encoding_unicode_table[(uint8_t) codepoint] & PRISM_ENCODING_ALPHABETIC_BIT) ? width : 0;
|
||||
} else {
|
||||
return pm_unicode_codepoint_match(codepoint, unicode_alpha_codepoints, UNICODE_ALPHA_CODEPOINTS_LENGTH) ? width : 0;
|
||||
return pm_unicode_codepoint_match(codepoint, UNICODE_ALPHA_CODEPOINTS_LENGTH, unicode_alpha_codepoints) ? width : 0;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the size of the next character in the UTF-8 encoding if it is an
|
||||
* alphanumeric character.
|
||||
*/
|
||||
size_t
|
||||
pm_encoding_utf_8_alnum_char(const uint8_t *b, ptrdiff_t n) {
|
||||
if (*b < 0x80) {
|
||||
@ -2281,10 +2320,14 @@ pm_encoding_utf_8_alnum_char(const uint8_t *b, ptrdiff_t n) {
|
||||
if (codepoint <= 0xFF) {
|
||||
return (pm_encoding_unicode_table[(uint8_t) codepoint] & (PRISM_ENCODING_ALPHANUMERIC_BIT)) ? width : 0;
|
||||
} else {
|
||||
return pm_unicode_codepoint_match(codepoint, unicode_alnum_codepoints, UNICODE_ALNUM_CODEPOINTS_LENGTH) ? width : 0;
|
||||
return pm_unicode_codepoint_match(codepoint, UNICODE_ALNUM_CODEPOINTS_LENGTH, unicode_alnum_codepoints) ? width : 0;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Return true if the next character in the UTF-8 encoding is an uppercase
|
||||
* character.
|
||||
*/
|
||||
bool
|
||||
pm_encoding_utf_8_isupper_char(const uint8_t *b, ptrdiff_t n) {
|
||||
if (*b < 0x80) {
|
||||
@ -2297,7 +2340,7 @@ pm_encoding_utf_8_isupper_char(const uint8_t *b, ptrdiff_t n) {
|
||||
if (codepoint <= 0xFF) {
|
||||
return (pm_encoding_unicode_table[(uint8_t) codepoint] & PRISM_ENCODING_UPPERCASE_BIT) ? true : false;
|
||||
} else {
|
||||
return pm_unicode_codepoint_match(codepoint, unicode_isupper_codepoints, UNICODE_ISUPPER_CODEPOINTS_LENGTH) ? true : false;
|
||||
return pm_unicode_codepoint_match(codepoint, UNICODE_ISUPPER_CODEPOINTS_LENGTH, unicode_isupper_codepoints) ? true : false;
|
||||
}
|
||||
}
|
||||
|
||||
@ -2305,7 +2348,8 @@ pm_encoding_utf_8_isupper_char(const uint8_t *b, ptrdiff_t n) {
|
||||
#undef UNICODE_ALNUM_CODEPOINTS_LENGTH
|
||||
#undef UNICODE_ISUPPER_CODEPOINTS_LENGTH
|
||||
|
||||
pm_encoding_t pm_encoding_utf_8 = {
|
||||
/** UTF-8 */
|
||||
const pm_encoding_t pm_encoding_utf_8 = {
|
||||
.name = "utf-8",
|
||||
.char_width = pm_encoding_utf_8_char_width,
|
||||
.alnum_char = pm_encoding_utf_8_alnum_char,
|
||||
@ -2314,7 +2358,8 @@ pm_encoding_t pm_encoding_utf_8 = {
|
||||
.multibyte = true
|
||||
};
|
||||
|
||||
pm_encoding_t pm_encoding_utf8_mac = {
|
||||
/** UTF8-mac */
|
||||
const pm_encoding_t pm_encoding_utf8_mac = {
|
||||
.name = "utf8-mac",
|
||||
.char_width = pm_encoding_utf_8_char_width,
|
||||
.alnum_char = pm_encoding_utf_8_alnum_char,
|
||||
|
@ -46,7 +46,8 @@ pm_encoding_windows_31j_isupper_char(const uint8_t *b, ptrdiff_t n) {
|
||||
}
|
||||
}
|
||||
|
||||
pm_encoding_t pm_encoding_windows_31j = {
|
||||
/** Windows-31J */
|
||||
const pm_encoding_t pm_encoding_windows_31j = {
|
||||
.name = "windows-31j",
|
||||
.char_width = pm_encoding_windows_31j_char_width,
|
||||
.alnum_char = pm_encoding_windows_31j_alnum_char,
|
||||
|
Loading…
x
Reference in New Issue
Block a user