MDEV-27009 Add UCA-14.0.0 collations - adding uca-dump into build targets
- Adding uca-dump into build targets - Adding ctype-uca.h and moving implicit weight related routines there - Reusing implicit weight routines in ctype-uca.c and uca-dump.c - Adding handling of command line arguments to uca-dump - Fixing some compile-time warnings in uca-dump.c
This commit is contained in:
parent
45e0373a78
commit
bb84f61a26
1
debian/not-installed
vendored
1
debian/not-installed
vendored
@ -24,6 +24,7 @@ usr/lib/x86_64-linux-gnu/libidbboot.a # ColumnStore header file
|
||||
usr/lib/x86_64-linux-gnu/libprocessor.a # ColumnStore header file
|
||||
usr/lib/x86_64-linux-gnu/libwe_xml.a # ColumnStore header file
|
||||
usr/bin/test-connect-t
|
||||
usr/bin/uca-dump
|
||||
usr/bin/wsrep_sst_backup
|
||||
usr/lib/mysql/plugin/type_test.so
|
||||
usr/lib/sysusers.d/mariadb.conf # Not used (yet) in Debian systemd
|
||||
|
@ -37,3 +37,5 @@ MAYBE_DISABLE_IPO(strings)
|
||||
ADD_EXECUTABLE(conf_to_src EXCLUDE_FROM_ALL conf_to_src.c)
|
||||
SET_TARGET_PROPERTIES(conf_to_src PROPERTIES EXCLUDE_FROM_DEFAULT_BUILD TRUE)
|
||||
TARGET_LINK_LIBRARIES(conf_to_src mysys strings)
|
||||
|
||||
ADD_EXECUTABLE(uca-dump uca-dump.c)
|
||||
|
@ -34,6 +34,7 @@
|
||||
|
||||
#include "strings_def.h"
|
||||
#include <m_ctype.h>
|
||||
#include "ctype-uca.h"
|
||||
|
||||
typedef struct
|
||||
{
|
||||
@ -31689,62 +31690,13 @@ my_uca_context_weight_find(my_uca_scanner *scanner, my_wc_t currwc,
|
||||
|
||||
/****************************************************************/
|
||||
|
||||
/**
|
||||
Implicit weights for a code CP are constructed as follows:
|
||||
[.AAAA.0020.0002][.BBBB.0000.0000]
|
||||
|
||||
where:
|
||||
AAAA= BASE + (CP >> 15);
|
||||
BBBB= (CP & 0x7FFF) | 0x8000;
|
||||
|
||||
There are two weights in the primary level (AAAA followed by BBBB).
|
||||
There is one weight on other levels:
|
||||
- 0020 on the secondary level
|
||||
- 0002 on the tertiary level
|
||||
*/
|
||||
|
||||
|
||||
/**
|
||||
Return BASE for an implicit weight on the primary level
|
||||
|
||||
According to UCA, BASE is calculated as follows:
|
||||
- FB40 for Unified_Ideograph=True AND
|
||||
((Block=CJK_Unified_Ideograph) OR
|
||||
(Block=CJK_Compatibility_Ideographs))
|
||||
- FB80 for Unified_Ideograph=True AND NOT
|
||||
((Block=CJK_Unified_Ideograph) OR
|
||||
(Block=CJK_Compatibility_Ideographs))
|
||||
- FBC0 for any other code point
|
||||
TODO: it seems we're not handling BASE correctly:
|
||||
- check what are those blocks
|
||||
- there are more Unified Ideograph blocks in the latest Unicode versions
|
||||
*/
|
||||
static inline uint16
|
||||
my_uca_implicit_weight_base(my_wc_t code)
|
||||
{
|
||||
if (code >= 0x3400 && code <= 0x4DB5)
|
||||
return 0xFB80;
|
||||
if (code >= 0x4E00 && code <= 0x9FA5)
|
||||
return 0xFB40;
|
||||
return 0xFBC0;
|
||||
}
|
||||
|
||||
|
||||
static inline void
|
||||
my_uca_implicit_weight_put(uint16 *to, my_wc_t code, uint level)
|
||||
{
|
||||
switch (level) {
|
||||
case 1: to[0]= 0x0020; to[1]= 0; break; /* Secondary level */
|
||||
case 2: to[0]= 0x0002; to[1]= 0; break; /* Tertiary level */
|
||||
case 3: to[0]= 0x0001; to[1]= 0; break; /* Quaternary level */
|
||||
default:
|
||||
DBUG_ASSERT(0);
|
||||
case 0:
|
||||
break;
|
||||
}
|
||||
/* Primary level */
|
||||
to[0]= (uint16)(code >> 15) + my_uca_implicit_weight_base(code);
|
||||
to[1]= (code & 0x7FFF) | 0x8000;
|
||||
MY_UCA_IMPLICIT_WEIGHT weight;
|
||||
weight= my_uca_520_implicit_weight_on_level(code, level);
|
||||
to[0]= weight.weight[0];
|
||||
to[1]= weight.weight[1];
|
||||
to[2]= 0;
|
||||
}
|
||||
|
||||
@ -31766,10 +31718,11 @@ static inline int
|
||||
my_uca_scanner_next_implicit_primary(my_uca_scanner *scanner)
|
||||
{
|
||||
my_wc_t wc= (scanner->page << 8) + scanner->code;
|
||||
scanner->implicit[0]= (wc & 0x7FFF) | 0x8000; /* The second weight */
|
||||
scanner->implicit[1]= 0; /* 0 terminator */
|
||||
MY_UCA_IMPLICIT_WEIGHT weight= my_uca_520_implicit_weight_primary(wc);
|
||||
scanner->implicit[0]= weight.weight[1]; /* The second weight */
|
||||
scanner->implicit[1]= 0; /* 0 terminator */
|
||||
scanner->wbeg= scanner->implicit;
|
||||
return my_uca_implicit_weight_base(wc) + (wc >> 15);
|
||||
return weight.weight[0]; /* The first weight */
|
||||
}
|
||||
|
||||
|
||||
|
155
strings/ctype-uca.h
Normal file
155
strings/ctype-uca.h
Normal file
@ -0,0 +1,155 @@
|
||||
#ifndef CTYPE_UCA_H
|
||||
#define CTYPE_UCA_H
|
||||
/* Copyright (c) 2021, MariaDB
|
||||
|
||||
This library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Library General Public
|
||||
License as published by the Free Software Foundation; version 2
|
||||
of the License.
|
||||
|
||||
This library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Library General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Library General Public
|
||||
License along with this library; if not, write to the Free
|
||||
Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston,
|
||||
MA 02110-1335 USA */
|
||||
|
||||
|
||||
/*
|
||||
Implicit weight handling is done according to
|
||||
the section "Computing Implicit Weights" in
|
||||
https://unicode.org/reports/tr10/#Values_For_Base_Table
|
||||
(as of Unicode 14.0.0)
|
||||
|
||||
Implicit weights for a code CP are constructed as follows:
|
||||
[.AAAA.0020.0002][.BBBB.0000.0000]
|
||||
|
||||
- There are two primary weights, depending on the character type and block.
|
||||
- There is one weight on the secondary and tertiary levels.
|
||||
|
||||
AAAA and BBBB are computed using different formulas for:
|
||||
- Siniform ideographic scripts
|
||||
- Han
|
||||
- Unassigned characters
|
||||
*/
|
||||
|
||||
typedef struct my_uca_implict_weight_t
|
||||
{
|
||||
uint16 weight[2];
|
||||
} MY_UCA_IMPLICIT_WEIGHT;
|
||||
|
||||
|
||||
/*
|
||||
By default, implicit weights for a code CP are constructed as follows:
|
||||
[.AAAA.0020.0002][.BBBB.0000.0000]
|
||||
|
||||
where AAAA and BBBB are :
|
||||
AAAA= BASE + (CP >> 15);
|
||||
BBBB= (CP & 0x7FFF) | 0x8000;
|
||||
|
||||
This formula covers the following implicit weight subtypes:
|
||||
- Core Han Unified Ideographs
|
||||
- All other Han Unified Ideographs
|
||||
- Unassigned characters
|
||||
Every mentioned subtype passes a different BASE.
|
||||
|
||||
This formula does not cover Siniform ideographic scripts.
|
||||
They are handled by separate functions.
|
||||
*/
|
||||
static inline MY_UCA_IMPLICIT_WEIGHT
|
||||
my_uca_implicit_weight_primary_default(uint16 base, my_wc_t code)
|
||||
{
|
||||
MY_UCA_IMPLICIT_WEIGHT res;
|
||||
res.weight[0]= (uint16) ((code >> 15) + base);
|
||||
res.weight[1]= (uint16) ((code & 0x7FFF)|0x8000);
|
||||
return res;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
Calculate Unicode-5.2.0 implicit weight on the primary level.
|
||||
|
||||
According to UCA, BASE is calculated as follows:
|
||||
- FB40 for Unified_Ideograph=True AND
|
||||
((Block=CJK_Unified_Ideograph) OR
|
||||
(Block=CJK_Compatibility_Ideographs))
|
||||
- FB80 for Unified_Ideograph=True AND NOT
|
||||
((Block=CJK_Unified_Ideograph) OR
|
||||
(Block=CJK_Compatibility_Ideographs))
|
||||
- FBC0 for any other code point
|
||||
|
||||
But for Unicode-5.2.0 and Unicode-4.0.0 we used
|
||||
a simplified formula as implemented before.
|
||||
*/
|
||||
static inline MY_UCA_IMPLICIT_WEIGHT
|
||||
my_uca_520_implicit_weight_primary(my_wc_t code)
|
||||
{
|
||||
uint16 base;
|
||||
/*
|
||||
3400;<CJK Ideograph Extension A, First>
|
||||
4DB5;<CJK Ideograph Extension A, Last>
|
||||
4E00;<CJK Ideograph, First>
|
||||
9FA5;<CJK Ideograph, Last>
|
||||
*/
|
||||
if (code >= 0x3400 && code <= 0x4DB5)
|
||||
base= 0xFB80;
|
||||
else if (code >= 0x4E00 && code <= 0x9FA5)
|
||||
base= 0xFB40;
|
||||
else
|
||||
base= 0xFBC0;
|
||||
|
||||
return my_uca_implicit_weight_primary_default(base, code);
|
||||
}
|
||||
|
||||
|
||||
static inline MY_UCA_IMPLICIT_WEIGHT
|
||||
my_uca_implicit_weight_secondary()
|
||||
{
|
||||
MY_UCA_IMPLICIT_WEIGHT res;
|
||||
res.weight[0]= 0x0020;
|
||||
res.weight[1]= 0;
|
||||
return res;
|
||||
}
|
||||
|
||||
|
||||
static inline MY_UCA_IMPLICIT_WEIGHT
|
||||
my_uca_implicit_weight_tertiary()
|
||||
{
|
||||
MY_UCA_IMPLICIT_WEIGHT res;
|
||||
res.weight[0]= 0x0002;
|
||||
res.weight[1]= 0;
|
||||
return res;
|
||||
}
|
||||
|
||||
|
||||
static inline MY_UCA_IMPLICIT_WEIGHT
|
||||
my_uca_implicit_weight_quaternary()
|
||||
{
|
||||
MY_UCA_IMPLICIT_WEIGHT res;
|
||||
res.weight[0]= 0x0001;
|
||||
res.weight[1]= 0;
|
||||
return res;
|
||||
}
|
||||
|
||||
|
||||
static inline MY_UCA_IMPLICIT_WEIGHT
|
||||
my_uca_520_implicit_weight_on_level(my_wc_t code, uint level)
|
||||
{
|
||||
switch (level) {
|
||||
case 0:
|
||||
return my_uca_520_implicit_weight_primary(code);
|
||||
case 1:
|
||||
return my_uca_implicit_weight_secondary();
|
||||
case 2:
|
||||
return my_uca_implicit_weight_tertiary();
|
||||
default:
|
||||
break;
|
||||
}
|
||||
return my_uca_implicit_weight_quaternary();
|
||||
}
|
||||
|
||||
|
||||
#endif /* CTYPE_UCA_H */
|
@ -20,13 +20,14 @@
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
typedef unsigned char uchar;
|
||||
typedef unsigned short uint16;
|
||||
#include "my_global.h"
|
||||
#include "m_ctype.h"
|
||||
#include "ctype-uca.h"
|
||||
|
||||
struct uca_item_st
|
||||
{
|
||||
uchar num;
|
||||
uint16 weight[4][9];
|
||||
uint16 weight[4][MY_UCA_MAX_WEIGHT_SIZE];
|
||||
};
|
||||
|
||||
#if 0
|
||||
@ -43,35 +44,139 @@ struct uca_item_st
|
||||
|
||||
#define MAX_ALLOWED_CODE 0x10FFFF
|
||||
|
||||
/* Name that goes into all array names */
|
||||
static const char *global_name_prefix= "uca520";
|
||||
|
||||
typedef struct opt_st
|
||||
{
|
||||
const char *name_prefix; /* Name that goes into all array names */
|
||||
const char *filename; /* The filename or "-" for stdin */
|
||||
uint levels; /* The number of levels to dump */
|
||||
} OPT;
|
||||
|
||||
|
||||
static OPT defaults=
|
||||
{
|
||||
"uca",
|
||||
"-",
|
||||
3
|
||||
};
|
||||
|
||||
|
||||
/* Name prefix that goes into page weight array names after the global name prefix (options.name_prefix) */
|
||||
static char *pname_prefix[]= {"_p", "_p", "_p"};
|
||||
static const char *pname_prefix[]= {"_p", "_p", "_p"};
|
||||
|
||||
/* Name suffix that goes into page weight array names after page number */
|
||||
static char *pname_suffix[]= {"", "_w2", "_w3"};
|
||||
static const char *pname_suffix[]= {"", "_w2", "_w3"};
|
||||
|
||||
|
||||
/*
  Print a short usage message to stdout.

  @param prog  The program name to show in the synopsis line
*/
void usage(const char *prog)
{
  printf("Usage:\n%s [options] filename\n", prog);
}
|
||||
|
||||
|
||||
static inline int lstrncmp(const char *str, const LEX_CSTRING lstr)
|
||||
{
|
||||
return strncmp(lstr.str, str, lstr.length);
|
||||
}
|
||||
|
||||
|
||||
/*
  Process one "--name=value" command line option.

  Recognized options:
    --name-prefix=STR  sets options->name_prefix to STR (points into opt)
    --levels=N         sets options->levels, N must be a number in 1..3

  The --levels value must consist of decimal digits only: an empty value,
  a sign, leading spaces or trailing characters are rejected. The range is
  checked before the value is narrowed to uint, and options->levels is
  left untouched when the value is bad.

  @param options  The option set to update
  @param opt      The command line argument
  @return         0 on success, 1 on a bad value or an unknown option
                  (a message is printed to stdout)
*/
int process_option(OPT *options, const char *opt)
{
  static const LEX_CSTRING opt_name_prefix= {STRING_WITH_LEN("--name-prefix=")};
  static const LEX_CSTRING opt_levels= {STRING_WITH_LEN("--levels=")};
  if (!lstrncmp(opt, opt_name_prefix))
  {
    options->name_prefix= opt + opt_name_prefix.length;
    return 0;
  }
  if (!lstrncmp(opt, opt_levels))
  {
    const char *value= opt + opt_levels.length;
    char *end= NULL;
    unsigned long levels= 0;
    /* strtoul() itself would skip spaces and accept '+' and '-' */
    int starts_with_digit= value[0] >= '0' && value[0] <= '9';
    if (starts_with_digit)
      levels= strtoul(value, &end, 10);
    if (!starts_with_digit || *end != '\0' || levels < 1 || levels > 3)
    {
      printf("Bad --levels value\n");
      return 1;
    }
    options->levels= (uint) levels;
    return 0;
  }
  printf("Unknown option: %s\n", opt);
  return 1;
}
|
||||
|
||||
|
||||
/*
  Process the command line.

  Arguments starting with "--" are passed to process_option().
  The first argument that does not start with "--" is taken as the
  file name and must be the last argument.

  @param options  The option set to update
  @param ac       The number of arguments, including the program name
  @param av       The arguments, av[0] is the program name
  @return         0 when a file name was found as the last argument,
                  1 otherwise: a bad option, a file name followed by more
                  arguments, or no file name at all (usage is printed in
                  the last two cases)
*/
int process_options(OPT *options, int ac, char **av)
{
  int pos= 1;
  while (pos < ac)
  {
    const char *arg= av[pos];
    if (strncmp(arg, "--", 2))
    {
      /* Not an option: this is the file name, nothing may follow it */
      if (pos + 1 != ac)
        break;
      options->filename= arg;
      return 0;
    }
    if (process_option(options, arg))
      return 1;
    pos++;
  }
  usage(av[0]);
  return 1;
}
|
||||
|
||||
|
||||
/*
  Open the input file for reading.

  @param name  The file name, or "-" for stdin
  @return      stdin for "-", otherwise the result of fopen(name, "r"),
               which is NULL when the file could not be opened
*/
FILE *open_file(const char *name)
{
  return strcmp(name, "-") == 0 ? stdin : fopen(name, "r");
}
|
||||
|
||||
|
||||
/*
  Close a stream returned by open_file().

  @param file  The stream; stdin is left open
*/
void close_file(FILE *file)
{
  if (file == stdin)
    return;
  fclose(file);
}
|
||||
|
||||
|
||||
int main(int ac, char **av)
|
||||
{
|
||||
char str[256];
|
||||
char str[1024];
|
||||
char *weights[64];
|
||||
static struct uca_item_st uca[MAX_ALLOWED_CODE+1];
|
||||
size_t code, w;
|
||||
my_wc_t code;
|
||||
uint w;
|
||||
int pageloaded[MY_UCA_NPAGES];
|
||||
|
||||
FILE *file;
|
||||
OPT options= defaults;
|
||||
|
||||
if (process_options(&options, ac, av))
|
||||
return 1;
|
||||
|
||||
if (!(file= open_file(options.filename)))
|
||||
{
|
||||
printf("Could not open %s for reading\n", options.filename);
|
||||
return 1;
|
||||
}
|
||||
|
||||
bzero(uca, sizeof(uca));
|
||||
bzero(pageloaded, sizeof(pageloaded));
|
||||
|
||||
while (fgets(str,sizeof(str),stdin))
|
||||
while (fgets(str, sizeof(str), file))
|
||||
{
|
||||
char *comment;
|
||||
char *weight;
|
||||
char *s;
|
||||
size_t codenum;
|
||||
|
||||
code= strtol(str,NULL,16);
|
||||
code= (my_wc_t) strtol(str,NULL,16);
|
||||
|
||||
if (str[0]=='#' || (code > MAX_ALLOWED_CODE))
|
||||
continue;
|
||||
@ -116,7 +221,9 @@ int main(int ac, char **av)
|
||||
uca[code].num++;
|
||||
}
|
||||
|
||||
for (w=0; w < uca[code].num; w++)
|
||||
set_if_smaller(uca[code].num, MY_UCA_MAX_WEIGHT_SIZE-1);
|
||||
|
||||
for (w=0; w < uca[code].num ; w++)
|
||||
{
|
||||
size_t partnum;
|
||||
|
||||
@ -125,9 +232,8 @@ int main(int ac, char **av)
|
||||
while (*s)
|
||||
{
|
||||
char *endptr;
|
||||
size_t part;
|
||||
part= strtol(s+1,&endptr,16);
|
||||
uca[code].weight[partnum][w]= part;
|
||||
uint part= (uint) strtoul(s + 1, &endptr, 16);
|
||||
uca[code].weight[partnum][w]= (uint16) part;
|
||||
s= endptr;
|
||||
partnum++;
|
||||
}
|
||||
@ -135,45 +241,24 @@ int main(int ac, char **av)
|
||||
/* Mark that a character from this page was loaded */
|
||||
pageloaded[code >> MY_UCA_PSHIFT]++;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
close_file(file);
|
||||
|
||||
/* Now set implicit weights */
|
||||
for (code=0; code <= MAX_ALLOWED_CODE; code++)
|
||||
{
|
||||
size_t base, aaaa, bbbb;
|
||||
|
||||
uint level;
|
||||
|
||||
if (uca[code].num)
|
||||
continue;
|
||||
|
||||
/*
|
||||
3400;<CJK Ideograph Extension A, First>
|
||||
4DB5;<CJK Ideograph Extension A, Last>
|
||||
4E00;<CJK Ideograph, First>
|
||||
9FA5;<CJK Ideograph, Last>
|
||||
*/
|
||||
|
||||
if (code >= 0x3400 && code <= 0x4DB5)
|
||||
base= 0xFB80;
|
||||
else if (code >= 0x4E00 && code <= 0x9FA5)
|
||||
base= 0xFB40;
|
||||
else
|
||||
base= 0xFBC0;
|
||||
|
||||
aaaa= base + (code >> 15);
|
||||
bbbb= (code & 0x7FFF) | 0x8000;
|
||||
uca[code].weight[0][0]= aaaa;
|
||||
uca[code].weight[0][1]= bbbb;
|
||||
|
||||
uca[code].weight[1][0]= 0x0020;
|
||||
uca[code].weight[1][1]= 0x0000;
|
||||
|
||||
uca[code].weight[2][0]= 0x0002;
|
||||
uca[code].weight[2][1]= 0x0000;
|
||||
|
||||
uca[code].weight[3][0]= 0x0001;
|
||||
uca[code].weight[3][2]= 0x0000;
|
||||
|
||||
|
||||
for (level= 0; level < 4; level++)
|
||||
{
|
||||
MY_UCA_IMPLICIT_WEIGHT weight;
|
||||
weight= my_uca_520_implicit_weight_on_level(code, level);
|
||||
uca[code].weight[level][0]= weight.weight[0];
|
||||
uca[code].weight[level][1]= weight.weight[1];
|
||||
}
|
||||
uca[code].num= 2;
|
||||
}
|
||||
|
||||
@ -184,7 +269,7 @@ int main(int ac, char **av)
|
||||
printf("#define MY_UCA_CMASK %d\n",MY_UCA_CMASK);
|
||||
printf("#define MY_UCA_PSHIFT %d\n",MY_UCA_PSHIFT);
|
||||
|
||||
for (w=0; w<3; w++)
|
||||
for (w=0; w < options.levels; w++)
|
||||
{
|
||||
size_t page;
|
||||
int pagemaxlen[MY_UCA_NPAGES];
|
||||
@ -259,7 +344,7 @@ int main(int ac, char **av)
|
||||
default: mchars= uca[code].num;
|
||||
}
|
||||
|
||||
pagemaxlen[page]= maxnum;
|
||||
pagemaxlen[page]= (int) maxnum;
|
||||
|
||||
|
||||
/*
|
||||
@ -268,12 +353,12 @@ int main(int ac, char **av)
|
||||
|
||||
|
||||
printf("static const uint16 %s%s%03X%s[]= { /* %04X (%d weights per char) */\n",
|
||||
global_name_prefix, pname_prefix[w], (int) page, pname_suffix[w],
|
||||
options.name_prefix, pname_prefix[w], (int) page, pname_suffix[w],
|
||||
(int) page*MY_UCA_NCHARS, (int) maxnum);
|
||||
|
||||
for (offs=0; offs < MY_UCA_NCHARS; offs++)
|
||||
{
|
||||
uint16 weight[8];
|
||||
uint16 weight[MY_UCA_MAX_WEIGHT_SIZE];
|
||||
size_t num, i;
|
||||
|
||||
code= page*MY_UCA_NCHARS+offs;
|
||||
@ -324,7 +409,7 @@ int main(int ac, char **av)
|
||||
}
|
||||
|
||||
printf("const uchar %s_length%s[%d]={\n",
|
||||
global_name_prefix, pname_suffix[w], MY_UCA_NPAGES);
|
||||
options.name_prefix, pname_suffix[w], MY_UCA_NPAGES);
|
||||
for (page=0; page < MY_UCA_NPAGES; page++)
|
||||
{
|
||||
printf("%d%s%s",pagemaxlen[page],page<MY_UCA_NPAGES-1?",":"",(page+1) % 16 ? "":"\n");
|
||||
@ -333,7 +418,7 @@ int main(int ac, char **av)
|
||||
|
||||
|
||||
printf("static const uint16 *%s_weight%s[%d]={\n",
|
||||
global_name_prefix, pname_suffix[w], MY_UCA_NPAGES);
|
||||
options.name_prefix, pname_suffix[w], MY_UCA_NPAGES);
|
||||
for (page=0; page < MY_UCA_NPAGES; page++)
|
||||
{
|
||||
const char *comma= page < MY_UCA_NPAGES-1 ? "," : "";
|
||||
@ -342,7 +427,7 @@ int main(int ac, char **av)
|
||||
printf("NULL %s%s%s", w ? " ": "", comma , nline);
|
||||
else
|
||||
printf("%s%s%03X%s%s%s",
|
||||
global_name_prefix, pname_prefix[w], (int) page, pname_suffix[w],
|
||||
options.name_prefix, pname_prefix[w], (int) page, pname_suffix[w],
|
||||
comma, nline);
|
||||
}
|
||||
printf("};\n");
|
||||
|
Loading…
x
Reference in New Issue
Block a user