update to Unicode 5
SVN=126184
This commit is contained in:
parent
0d079a5362
commit
5b904a3bde
@ -34,7 +34,7 @@ asin(arg double)double
|
|||||||
sign = true;
|
sign = true;
|
||||||
}
|
}
|
||||||
if arg > 1 {
|
if arg > 1 {
|
||||||
panic "return sys.NaN()";
|
return sys.NaN();
|
||||||
}
|
}
|
||||||
|
|
||||||
temp = sqrt(1 - x*x);
|
temp = sqrt(1 - x*x);
|
||||||
@ -54,7 +54,7 @@ func
|
|||||||
acos(arg double)double
|
acos(arg double)double
|
||||||
{
|
{
|
||||||
if(arg > 1 || arg < -1) {
|
if(arg > 1 || arg < -1) {
|
||||||
panic "return sys.NaN()";
|
return sys.NaN();
|
||||||
}
|
}
|
||||||
return pio2 - asin(arg);
|
return pio2 - asin(arg);
|
||||||
}
|
}
|
||||||
|
@ -40,7 +40,7 @@ exp(arg double) double
|
|||||||
return 0.;
|
return 0.;
|
||||||
}
|
}
|
||||||
if arg > maxf {
|
if arg > maxf {
|
||||||
panic "return sys.Inf(1)"
|
return sys.Inf(1)
|
||||||
}
|
}
|
||||||
|
|
||||||
x = arg*log2e;
|
x = arg*log2e;
|
||||||
|
@ -36,7 +36,7 @@ log(arg double) double
|
|||||||
var exp int;
|
var exp int;
|
||||||
|
|
||||||
if arg <= 0 {
|
if arg <= 0 {
|
||||||
panic "return sys.NaN()";
|
return sys.NaN();
|
||||||
}
|
}
|
||||||
|
|
||||||
exp,x = sys.frexp(arg);
|
exp,x = sys.frexp(arg);
|
||||||
@ -63,7 +63,7 @@ log10(arg double) double
|
|||||||
{
|
{
|
||||||
|
|
||||||
if arg <= 0 {
|
if arg <= 0 {
|
||||||
panic "return sys.NaN()";
|
return sys.NaN();
|
||||||
}
|
}
|
||||||
return log(arg) * ln10o1;
|
return log(arg) * ln10o1;
|
||||||
}
|
}
|
||||||
|
@ -5,7 +5,25 @@
|
|||||||
|
|
||||||
package main
|
package main
|
||||||
|
|
||||||
import math "math"
|
//import math "math"
|
||||||
|
//////////////////
|
||||||
|
import math "asin"
|
||||||
|
import math "atan"
|
||||||
|
import math "atan2"
|
||||||
|
import math "exp"
|
||||||
|
import math "fabs"
|
||||||
|
import math "floor"
|
||||||
|
import math "fmod"
|
||||||
|
import math "hypot"
|
||||||
|
import math "log"
|
||||||
|
import math "pow"
|
||||||
|
import math "pow10"
|
||||||
|
import math "sin"
|
||||||
|
import math "sinh"
|
||||||
|
import math "sqrt"
|
||||||
|
import math "tan"
|
||||||
|
import math "tanh"
|
||||||
|
|
||||||
|
|
||||||
const
|
const
|
||||||
(
|
(
|
||||||
|
@ -26,14 +26,14 @@ pow(arg1,arg2 double) double
|
|||||||
if arg1 <= 0 {
|
if arg1 <= 0 {
|
||||||
if(arg1 == 0) {
|
if(arg1 == 0) {
|
||||||
if arg2 <= 0 {
|
if arg2 <= 0 {
|
||||||
panic "return sys.NaN()";
|
return sys.NaN();
|
||||||
}
|
}
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
temp = floor(arg2);
|
temp = floor(arg2);
|
||||||
if temp != arg2 {
|
if temp != arg2 {
|
||||||
panic "return sys.NaN()";
|
panic sys.NaN();
|
||||||
}
|
}
|
||||||
|
|
||||||
l = long(temp);
|
l = long(temp);
|
||||||
|
@ -48,7 +48,7 @@ sinh(arg double) double
|
|||||||
temp = exp(arg)/2;
|
temp = exp(arg)/2;
|
||||||
|
|
||||||
case arg > 0.5:
|
case arg > 0.5:
|
||||||
// temp = (exp(arg) - exp(-arg))/2;
|
temp = (exp(arg) - exp(-arg))/2;
|
||||||
|
|
||||||
default:
|
default:
|
||||||
argsq = arg*arg;
|
argsq = arg*arg;
|
||||||
@ -71,5 +71,5 @@ cosh(arg double) double
|
|||||||
if arg > 21 {
|
if arg > 21 {
|
||||||
return exp(arg)/2;
|
return exp(arg)/2;
|
||||||
}
|
}
|
||||||
// return (exp(arg) + exp(-arg))/2;
|
return (exp(arg) + exp(-arg))/2;
|
||||||
}
|
}
|
||||||
|
@ -19,11 +19,10 @@ sqrt(arg double) double
|
|||||||
var x, temp double;
|
var x, temp double;
|
||||||
var exp, i int;
|
var exp, i int;
|
||||||
|
|
||||||
/* BUG: NO isINF
|
|
||||||
if sys.isInf(arg, 1) {
|
if sys.isInf(arg, 1) {
|
||||||
return arg;
|
return arg;
|
||||||
}
|
}
|
||||||
*/
|
|
||||||
if arg <= 0 {
|
if arg <= 0 {
|
||||||
if arg < 0 {
|
if arg < 0 {
|
||||||
panic "return sys.NaN()"
|
panic "return sys.NaN()"
|
||||||
|
@ -62,7 +62,7 @@ tan(arg double) double
|
|||||||
|
|
||||||
if flag {
|
if flag {
|
||||||
if(temp == 0) {
|
if(temp == 0) {
|
||||||
panic "return sys.NaN()";
|
panic sys.NaN();
|
||||||
}
|
}
|
||||||
temp = 1/temp;
|
temp = 1/temp;
|
||||||
}
|
}
|
||||||
|
733
src/lib9/utf/mkrunetype.c
Normal file
733
src/lib9/utf/mkrunetype.c
Normal file
@ -0,0 +1,733 @@
|
|||||||
|
// Copyright 2009 The Go Authors. All rights reserved.
|
||||||
|
// Use of this source code is governed by a BSD-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
/*
|
||||||
|
* make is(upper|lower|title|space|alpha)rune and
|
||||||
|
* to(upper|lower|title)rune from a UnicodeData.txt file.
|
||||||
|
* these can be found at unicode.org
|
||||||
|
*
|
||||||
|
* with -c, runs a check of the existing runetype functions vs.
|
||||||
|
* those extracted from UnicodeData.
|
||||||
|
*
|
||||||
|
* with -p, generates tables for pairs of chars, as well as for ranges
|
||||||
|
* and singletons.
|
||||||
|
*
|
||||||
|
* UnicodeData defines 4 fields of interest:
|
||||||
|
* 1) a category
|
||||||
|
* 2) an upper case mapping
|
||||||
|
* 3) a lower case mapping
|
||||||
|
* 4) a title case mapping
|
||||||
|
*
|
||||||
|
* toupper, tolower, and totitle are defined directly from the mapping.
|
||||||
|
*
|
||||||
|
* isalpharune(c) is true iff c is a "letter" category
|
||||||
|
* isupperrune(c) is true iff c is the target of toupperrune,
|
||||||
|
* or is in the uppercase letter category
|
||||||
|
* similarly for islowerrune and istitlerune.
|
||||||
|
* isspacerune is true for space category chars, "C" locale white space chars,
|
||||||
|
* and two additions:
|
||||||
|
* 0085 "next line" control char
|
||||||
|
* feff "zero-width non-break space"
|
||||||
|
* isdigitrune is true iff c is a numeric-digit category.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <stdarg.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <libgen.h>
|
||||||
|
#include "utf.h"
|
||||||
|
#include "utfdef.h"
|
||||||
|
|
||||||
|
enum {
|
||||||
|
/*
|
||||||
|
* fields in the unicode data file
|
||||||
|
*/
|
||||||
|
FIELD_CODE,
|
||||||
|
FIELD_NAME,
|
||||||
|
FIELD_CATEGORY,
|
||||||
|
FIELD_COMBINING,
|
||||||
|
FIELD_BIDIR,
|
||||||
|
FIELD_DECOMP,
|
||||||
|
FIELD_DECIMAL_DIG,
|
||||||
|
FIELD_DIG,
|
||||||
|
FIELD_NUMERIC_VAL,
|
||||||
|
FIELD_MIRRORED,
|
||||||
|
FIELD_UNICODE_1_NAME,
|
||||||
|
FIELD_COMMENT,
|
||||||
|
FIELD_UPPER,
|
||||||
|
FIELD_LOWER,
|
||||||
|
FIELD_TITLE,
|
||||||
|
NFIELDS,
|
||||||
|
|
||||||
|
MAX_LINE = 1024,
|
||||||
|
|
||||||
|
TO_OFFSET = 1 << 20,
|
||||||
|
|
||||||
|
NRUNES = 1 << 21,
|
||||||
|
};
|
||||||
|
|
||||||
|
#define TO_DELTA(xmapped,x) (TO_OFFSET + (xmapped) - (x))
|
||||||
|
|
||||||
|
static char myisspace[NRUNES];
|
||||||
|
static char myisalpha[NRUNES];
|
||||||
|
static char myisdigit[NRUNES];
|
||||||
|
static char myisupper[NRUNES];
|
||||||
|
static char myislower[NRUNES];
|
||||||
|
static char myistitle[NRUNES];
|
||||||
|
|
||||||
|
static int mytoupper[NRUNES];
|
||||||
|
static int mytolower[NRUNES];
|
||||||
|
static int mytotitle[NRUNES];
|
||||||
|
|
||||||
|
static void check(void);
|
||||||
|
static void mktables(char *src, int usepairs);
|
||||||
|
static void fatal(const char *fmt, ...);
|
||||||
|
static int mygetfields(char **fields, int nfields, char *str, const char *delim);
|
||||||
|
static int getunicodeline(FILE *in, char **fields, char *buf);
|
||||||
|
static int getcode(char *s);
|
||||||
|
|
||||||
|
static void
|
||||||
|
usage(void)
|
||||||
|
{
|
||||||
|
fprintf(stderr, "usage: mktables [-cp] <UnicodeData.txt>\n");
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
int
|
||||||
|
main(int argc, char *argv[]){
|
||||||
|
FILE *in;
|
||||||
|
char buf[MAX_LINE], buf2[MAX_LINE];
|
||||||
|
char *fields[NFIELDS + 1], *fields2[NFIELDS + 1];
|
||||||
|
char *p;
|
||||||
|
int i, code, last, docheck, usepairs;
|
||||||
|
|
||||||
|
docheck = 0;
|
||||||
|
usepairs = 0;
|
||||||
|
ARGBEGIN{
|
||||||
|
case 'c':
|
||||||
|
docheck = 1;
|
||||||
|
break;
|
||||||
|
case 'p':
|
||||||
|
usepairs = 1;
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
usage();
|
||||||
|
}ARGEND
|
||||||
|
|
||||||
|
if(argc != 1){
|
||||||
|
usage();
|
||||||
|
}
|
||||||
|
|
||||||
|
in = fopen(argv[0], "r");
|
||||||
|
if(in == NULL){
|
||||||
|
fatal("can't open %s", argv[0]);
|
||||||
|
}
|
||||||
|
|
||||||
|
for(i = 0; i < NRUNES; i++){
|
||||||
|
mytoupper[i] = i;
|
||||||
|
mytolower[i] = i;
|
||||||
|
mytotitle[i] = i;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* make sure isspace has all of the "C" locale whitespace chars
|
||||||
|
*/
|
||||||
|
myisspace['\t'] = 1;
|
||||||
|
myisspace['\n'] = 1;
|
||||||
|
myisspace['\r'] = 1;
|
||||||
|
myisspace['\f'] = 1;
|
||||||
|
myisspace['\v'] = 1;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* a couple of other exceptions
|
||||||
|
*/
|
||||||
|
myisspace[0x85] = 1; /* control char, "next line" */
|
||||||
|
myisspace[0xfeff] = 1; /* zero-width non-break space */
|
||||||
|
|
||||||
|
last = -1;
|
||||||
|
while(getunicodeline(in, fields, buf)){
|
||||||
|
code = getcode(fields[FIELD_CODE]);
|
||||||
|
if (code >= NRUNES)
|
||||||
|
fatal("code-point value too big: %x", code);
|
||||||
|
if(code <= last)
|
||||||
|
fatal("bad code sequence: %x then %x", last, code);
|
||||||
|
last = code;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* check for ranges
|
||||||
|
*/
|
||||||
|
p = fields[FIELD_CATEGORY];
|
||||||
|
if(strstr(fields[FIELD_NAME], ", First>") != NULL){
|
||||||
|
if(!getunicodeline(in, fields2, buf2))
|
||||||
|
fatal("range start at eof");
|
||||||
|
if (strstr(fields2[FIELD_NAME], ", Last>") == NULL)
|
||||||
|
fatal("range start not followed by range end");
|
||||||
|
last = getcode(fields2[FIELD_CODE]);
|
||||||
|
if(last <= code)
|
||||||
|
fatal("range out of sequence: %x then %x", code, last);
|
||||||
|
if(strcmp(p, fields2[FIELD_CATEGORY]) != 0)
|
||||||
|
fatal("range with mismatched category");
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* set properties and conversions
|
||||||
|
*/
|
||||||
|
for (; code <= last; code++){
|
||||||
|
if(p[0] == 'L')
|
||||||
|
myisalpha[code] = 1;
|
||||||
|
if(p[0] == 'Z')
|
||||||
|
myisspace[code] = 1;
|
||||||
|
|
||||||
|
if(strcmp(p, "Lu") == 0)
|
||||||
|
myisupper[code] = 1;
|
||||||
|
if(strcmp(p, "Ll") == 0)
|
||||||
|
myislower[code] = 1;
|
||||||
|
|
||||||
|
if(strcmp(p, "Lt") == 0)
|
||||||
|
myistitle[code] = 1;
|
||||||
|
|
||||||
|
if(strcmp(p, "Nd") == 0)
|
||||||
|
myisdigit[code] = 1;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* when finding conversions, also need to mark
|
||||||
|
* upper/lower case, since some chars, like
|
||||||
|
* "III" (0x2162), aren't defined as letters but have a
|
||||||
|
* lower case mapping ("iii" (0x2172)).
|
||||||
|
*/
|
||||||
|
if(fields[FIELD_UPPER][0] != '\0'){
|
||||||
|
mytoupper[code] = getcode(fields[FIELD_UPPER]);
|
||||||
|
}
|
||||||
|
if(fields[FIELD_LOWER][0] != '\0'){
|
||||||
|
mytolower[code] = getcode(fields[FIELD_LOWER]);
|
||||||
|
}
|
||||||
|
if(fields[FIELD_TITLE][0] != '\0'){
|
||||||
|
mytotitle[code] = getcode(fields[FIELD_TITLE]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fclose(in);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* check for codes with no totitle mapping but a toupper mapping.
|
||||||
|
* these appear in UnicodeData-2.0.14.txt, but are almost certainly
|
||||||
|
* erroneous.
|
||||||
|
*/
|
||||||
|
for(i = 0; i < NRUNES; i++){
|
||||||
|
if(mytotitle[i] == i
|
||||||
|
&& mytoupper[i] != i
|
||||||
|
&& !myistitle[i])
|
||||||
|
fprintf(stderr, "warning: code=%.4x not istitle, totitle is same, toupper=%.4x\n", i, mytoupper[i]);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* make sure isupper[c] is true if for some x toupper[x] == c
|
||||||
|
* ditto for islower and istitle
|
||||||
|
*/
|
||||||
|
for(i = 0; i < NRUNES; i++) {
|
||||||
|
if(mytoupper[i] != i)
|
||||||
|
myisupper[mytoupper[i]] = 1;
|
||||||
|
if(mytolower[i] != i)
|
||||||
|
myislower[mytolower[i]] = 1;
|
||||||
|
if(mytotitle[i] != i)
|
||||||
|
myistitle[mytotitle[i]] = 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
if(docheck){
|
||||||
|
check();
|
||||||
|
}else{
|
||||||
|
mktables(argv[0], usepairs);
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* generate a properties array for ranges, clearing those cases covered.
|
||||||
|
* if force, generate one-entry ranges for singletons.
|
||||||
|
*/
|
||||||
|
static int
|
||||||
|
mkisrange(const char* label, char* prop, int force)
|
||||||
|
{
|
||||||
|
int start, stop, some;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* first, the ranges
|
||||||
|
*/
|
||||||
|
some = 0;
|
||||||
|
for(start = 0; start < NRUNES; ) {
|
||||||
|
if(!prop[start]){
|
||||||
|
start++;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
for(stop = start + 1; stop < NRUNES; stop++){
|
||||||
|
if(!prop[stop]){
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
prop[stop] = 0;
|
||||||
|
}
|
||||||
|
if(force || stop != start + 1){
|
||||||
|
if(!some){
|
||||||
|
printf("static Rune __is%sr[] = {\n", label);
|
||||||
|
some = 1;
|
||||||
|
}
|
||||||
|
prop[start] = 0;
|
||||||
|
printf("\t0x%.4x, 0x%.4x,\n", start, stop - 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
start = stop;
|
||||||
|
}
|
||||||
|
if(some)
|
||||||
|
printf("};\n\n");
|
||||||
|
return some;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* generate a mapping array for pairs with a skip between,
|
||||||
|
* clearing those entries covered.
|
||||||
|
*/
|
||||||
|
static int
|
||||||
|
mkispair(const char *label, char *prop)
|
||||||
|
{
|
||||||
|
int start, stop, some;
|
||||||
|
|
||||||
|
some = 0;
|
||||||
|
for(start = 0; start + 2 < NRUNES; ) {
|
||||||
|
if(!prop[start]){
|
||||||
|
start++;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
for(stop = start + 2; stop < NRUNES; stop += 2){
|
||||||
|
if(!prop[stop]){
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
prop[stop] = 0;
|
||||||
|
}
|
||||||
|
if(stop != start + 2){
|
||||||
|
if(!some){
|
||||||
|
printf("static Rune __is%sp[] = {\n", label);
|
||||||
|
some = 1;
|
||||||
|
}
|
||||||
|
prop[start] = 0;
|
||||||
|
printf("\t0x%.4x, 0x%.4x,\n", start, stop - 2);
|
||||||
|
}
|
||||||
|
|
||||||
|
start = stop;
|
||||||
|
}
|
||||||
|
if(some)
|
||||||
|
printf("};\n\n");
|
||||||
|
return some;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* generate a properties array for singletons, clearing those cases covered.
|
||||||
|
*/
|
||||||
|
static int
|
||||||
|
mkissingle(const char *label, char *prop)
|
||||||
|
{
|
||||||
|
int start, some;
|
||||||
|
|
||||||
|
some = 0;
|
||||||
|
for(start = 0; start < NRUNES; start++) {
|
||||||
|
if(!prop[start]){
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if(!some){
|
||||||
|
printf("static Rune __is%ss[] = {\n", label);
|
||||||
|
some = 1;
|
||||||
|
}
|
||||||
|
prop[start] = 0;
|
||||||
|
printf("\t0x%.4x,\n", start);
|
||||||
|
}
|
||||||
|
if(some)
|
||||||
|
printf("};\n\n");
|
||||||
|
return some;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* generate tables and a function for is<label>rune
|
||||||
|
*/
|
||||||
|
static void
|
||||||
|
mkis(const char* label, char* prop, int usepairs)
|
||||||
|
{
|
||||||
|
int isr, isp, iss;
|
||||||
|
|
||||||
|
isr = mkisrange(label, prop, 0);
|
||||||
|
isp = 0;
|
||||||
|
if(usepairs)
|
||||||
|
isp = mkispair(label, prop);
|
||||||
|
iss = mkissingle(label, prop);
|
||||||
|
|
||||||
|
printf(
|
||||||
|
"int\n"
|
||||||
|
"is%srune(Rune c)\n"
|
||||||
|
"{\n"
|
||||||
|
" Rune *p;\n"
|
||||||
|
"\n",
|
||||||
|
label);
|
||||||
|
|
||||||
|
if(isr)
|
||||||
|
printf(
|
||||||
|
" p = rbsearch(c, __is%sr, nelem(__is%sr)/2, 2);\n"
|
||||||
|
" if(p && c >= p[0] && c <= p[1])\n"
|
||||||
|
" return 1;\n",
|
||||||
|
label, label);
|
||||||
|
|
||||||
|
if(isp)
|
||||||
|
printf(
|
||||||
|
" p = rbsearch(c, __is%sp, nelem(__is%sp)/2, 2);\n"
|
||||||
|
" if(p && c >= p[0] && c <= p[1] && !((c - p[0]) & 1))\n"
|
||||||
|
" return 1;\n",
|
||||||
|
label, label);
|
||||||
|
|
||||||
|
if(iss)
|
||||||
|
printf(
|
||||||
|
" p = rbsearch(c, __is%ss, nelem(__is%ss), 1);\n"
|
||||||
|
" if(p && c == p[0])\n"
|
||||||
|
" return 1;\n",
|
||||||
|
label, label);
|
||||||
|
|
||||||
|
|
||||||
|
printf(
|
||||||
|
" return 0;\n"
|
||||||
|
"}\n"
|
||||||
|
"\n"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* generate a mapping array for ranges, clearing those entries covered.
|
||||||
|
* if force, generate one-entry ranges for singletons.
|
||||||
|
*/
|
||||||
|
static int
|
||||||
|
mktorange(const char* label, int* map, int force)
|
||||||
|
{
|
||||||
|
int start, stop, delta, some;
|
||||||
|
|
||||||
|
some = 0;
|
||||||
|
for(start = 0; start < NRUNES; ) {
|
||||||
|
if(map[start] == start){
|
||||||
|
start++;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
delta = TO_DELTA(map[start], start);
|
||||||
|
if(delta != (Rune)delta)
|
||||||
|
fatal("bad map delta %d", delta);
|
||||||
|
for(stop = start + 1; stop < NRUNES; stop++){
|
||||||
|
if(TO_DELTA(map[stop], stop) != delta){
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
map[stop] = stop;
|
||||||
|
}
|
||||||
|
if(stop != start + 1){
|
||||||
|
if(!some){
|
||||||
|
printf("static Rune __to%sr[] = {\n", label);
|
||||||
|
some = 1;
|
||||||
|
}
|
||||||
|
map[start] = start;
|
||||||
|
printf("\t0x%.4x, 0x%.4x, %d,\n", start, stop - 1, delta);
|
||||||
|
}
|
||||||
|
|
||||||
|
start = stop;
|
||||||
|
}
|
||||||
|
if(some)
|
||||||
|
printf("};\n\n");
|
||||||
|
return some;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* generate a mapping array for pairs with a skip between,
|
||||||
|
* clearing those entries covered.
|
||||||
|
*/
|
||||||
|
static int
|
||||||
|
mktopair(const char* label, int* map)
|
||||||
|
{
|
||||||
|
int start, stop, delta, some;
|
||||||
|
|
||||||
|
some = 0;
|
||||||
|
for(start = 0; start + 2 < NRUNES; ) {
|
||||||
|
if(map[start] == start){
|
||||||
|
start++;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
delta = TO_DELTA(map[start], start);
|
||||||
|
if(delta != (Rune)delta)
|
||||||
|
fatal("bad map delta %d", delta);
|
||||||
|
for(stop = start + 2; stop < NRUNES; stop += 2){
|
||||||
|
if(TO_DELTA(map[stop], stop) != delta){
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
map[stop] = stop;
|
||||||
|
}
|
||||||
|
if(stop != start + 2){
|
||||||
|
if(!some){
|
||||||
|
printf("static Rune __to%sp[] = {\n", label);
|
||||||
|
some = 1;
|
||||||
|
}
|
||||||
|
map[start] = start;
|
||||||
|
printf("\t0x%.4x, 0x%.4x, %d,\n", start, stop - 2, delta);
|
||||||
|
}
|
||||||
|
|
||||||
|
start = stop;
|
||||||
|
}
|
||||||
|
if(some)
|
||||||
|
printf("};\n\n");
|
||||||
|
return some;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* generate a mapping array for singletons, clearing those entries covered.
|
||||||
|
*/
|
||||||
|
static int
|
||||||
|
mktosingle(const char* label, int* map)
|
||||||
|
{
|
||||||
|
int start, delta, some;
|
||||||
|
|
||||||
|
some = 0;
|
||||||
|
for(start = 0; start < NRUNES; start++) {
|
||||||
|
if(map[start] == start){
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
delta = TO_DELTA(map[start], start);
|
||||||
|
if(delta != (Rune)delta)
|
||||||
|
fatal("bad map delta %d", delta);
|
||||||
|
if(!some){
|
||||||
|
printf("static Rune __to%ss[] = {\n", label);
|
||||||
|
some = 1;
|
||||||
|
}
|
||||||
|
map[start] = start;
|
||||||
|
printf("\t0x%.4x, %d,\n", start, delta);
|
||||||
|
}
|
||||||
|
if(some)
|
||||||
|
printf("};\n\n");
|
||||||
|
return some;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* generate tables and a function for to<label>rune
|
||||||
|
*/
|
||||||
|
static void
|
||||||
|
mkto(const char* label, int* map, int usepairs)
|
||||||
|
{
|
||||||
|
int tor, top, tos;
|
||||||
|
|
||||||
|
tor = mktorange(label, map, 0);
|
||||||
|
top = 0;
|
||||||
|
if(usepairs)
|
||||||
|
top = mktopair(label, map);
|
||||||
|
tos = mktosingle(label, map);
|
||||||
|
|
||||||
|
printf(
|
||||||
|
"Rune\n"
|
||||||
|
"to%srune(Rune c)\n"
|
||||||
|
"{\n"
|
||||||
|
" Rune *p;\n"
|
||||||
|
"\n",
|
||||||
|
label);
|
||||||
|
|
||||||
|
if(tor)
|
||||||
|
printf(
|
||||||
|
" p = rbsearch(c, __to%sr, nelem(__to%sr)/3, 3);\n"
|
||||||
|
" if(p && c >= p[0] && c <= p[1])\n"
|
||||||
|
" return c + p[2] - %d;\n",
|
||||||
|
label, label, TO_OFFSET);
|
||||||
|
|
||||||
|
if(top)
|
||||||
|
printf(
|
||||||
|
" p = rbsearch(c, __to%sp, nelem(__to%sp)/3, 3);\n"
|
||||||
|
" if(p && c >= p[0] && c <= p[1] && !((c - p[0]) & 1))\n"
|
||||||
|
" return c + p[2] - %d;\n",
|
||||||
|
label, label, TO_OFFSET);
|
||||||
|
|
||||||
|
if(tos)
|
||||||
|
printf(
|
||||||
|
" p = rbsearch(c, __to%ss, nelem(__to%ss)/2, 2);\n"
|
||||||
|
" if(p && c == p[0])\n"
|
||||||
|
" return c + p[1] - %d;\n",
|
||||||
|
label, label, TO_OFFSET);
|
||||||
|
|
||||||
|
|
||||||
|
printf(
|
||||||
|
" return c;\n"
|
||||||
|
"}\n"
|
||||||
|
"\n"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
 * Make only range tables and a function for is<label>rune.
 * Singletons are forced into one-entry ranges, so the generated
 * function needs a single lookup in __is<label>r.
 *
 * If prop has no entries at all, mkisrange prints no table; in that
 * case emit a function that simply returns 0 instead of one that
 * refers to a table that was never generated.
 */
static void
mkisronly(const char* label, char* prop) {
	if(!mkisrange(label, prop, 1)){
		printf(
			"int\n"
			"is%srune(Rune c)\n"
			"{\n"
			" return 0;\n"
			"}\n"
			"\n",
			label);
		return;
	}

	printf(
		"int\n"
		"is%srune(Rune c)\n"
		"{\n"
		" Rune *p;\n"
		"\n"
		" p = rbsearch(c, __is%sr, nelem(__is%sr)/2, 2);\n"
		" if(p && c >= p[0] && c <= p[1])\n"
		" return 1;\n"
		" return 0;\n"
		"}\n"
		"\n",
		label, label, label);
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* generate the body of runetype.
|
||||||
|
* assumes there is a function Rune* rbsearch(Rune c, Rune *t, int n, int ne);
|
||||||
|
*/
|
||||||
|
static void
|
||||||
|
mktables(char *src, int usepairs)
|
||||||
|
{
|
||||||
|
printf("/* generated automatically by mkrunetype.c from %s */\n\n",
|
||||||
|
basename(src));
|
||||||
|
|
||||||
|
/*
|
||||||
|
* we special case the space and digit tables, since they are assumed
|
||||||
|
* to be small with several ranges.
|
||||||
|
*/
|
||||||
|
mkisronly("space", myisspace);
|
||||||
|
mkisronly("digit", myisdigit);
|
||||||
|
|
||||||
|
mkis("alpha", myisalpha, 0);
|
||||||
|
mkis("upper", myisupper, usepairs);
|
||||||
|
mkis("lower", myislower, usepairs);
|
||||||
|
mkis("title", myistitle, usepairs);
|
||||||
|
|
||||||
|
mkto("upper", mytoupper, usepairs);
|
||||||
|
mkto("lower", mytolower, usepairs);
|
||||||
|
mkto("title", mytotitle, usepairs);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* find differences between the newly generated tables and current runetypes.
|
||||||
|
*/
|
||||||
|
static void
|
||||||
|
check(void)
|
||||||
|
{
|
||||||
|
int i;
|
||||||
|
|
||||||
|
for(i = 0; i < NRUNES; i++){
|
||||||
|
if(isdigitrune(i) != myisdigit[i])
|
||||||
|
fprintf(stderr, "isdigit diff at %x: runetype=%x, unicode=%x\n",
|
||||||
|
i, isdigitrune(i), myisdigit[i]);
|
||||||
|
|
||||||
|
if(isspacerune(i) != myisspace[i])
|
||||||
|
fprintf(stderr, "isspace diff at %x: runetype=%x, unicode=%x\n",
|
||||||
|
i, isspacerune(i), myisspace[i]);
|
||||||
|
|
||||||
|
if(isupperrune(i) != myisupper[i])
|
||||||
|
fprintf(stderr, "isupper diff at %x: runetype=%x, unicode=%x\n",
|
||||||
|
i, isupperrune(i), myisupper[i]);
|
||||||
|
|
||||||
|
if(islowerrune(i) != myislower[i])
|
||||||
|
fprintf(stderr, "islower diff at %x: runetype=%x, unicode=%x\n",
|
||||||
|
i, islowerrune(i), myislower[i]);
|
||||||
|
|
||||||
|
if(isalpharune(i) != myisalpha[i])
|
||||||
|
fprintf(stderr, "isalpha diff at %x: runetype=%x, unicode=%x\n",
|
||||||
|
i, isalpharune(i), myisalpha[i]);
|
||||||
|
|
||||||
|
if(toupperrune(i) != mytoupper[i])
|
||||||
|
fprintf(stderr, "toupper diff at %x: runetype=%x, unicode=%x\n",
|
||||||
|
i, toupperrune(i), mytoupper[i]);
|
||||||
|
|
||||||
|
if(tolowerrune(i) != mytolower[i])
|
||||||
|
fprintf(stderr, "tolower diff at %x: runetype=%x, unicode=%x\n",
|
||||||
|
i, tolowerrune(i), mytolower[i]);
|
||||||
|
|
||||||
|
if(istitlerune(i) != myistitle[i])
|
||||||
|
fprintf(stderr, "istitle diff at %x: runetype=%x, unicode=%x\n",
|
||||||
|
i, istitlerune(i), myistitle[i]);
|
||||||
|
|
||||||
|
if(totitlerune(i) != mytotitle[i])
|
||||||
|
fprintf(stderr, "totitle diff at %x: runetype=%x, unicode=%x\n",
|
||||||
|
i, totitlerune(i), mytotitle[i]);
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
 * Split str in place into at most nfields fields.
 *
 * Every character of str that appears in delim is overwritten with
 * '\0' and starts a new field; pointers to the fields are stored in
 * fields[0..nf-1].  Splitting stops once nfields pointers have been
 * stored, so the last field keeps any remaining delimiters.
 * Adjacent delimiters produce empty fields.
 *
 * Returns the number of fields stored, or 0 (touching nothing) when
 * nfields is not positive.
 */
static int
mygetfields(char **fields, int nfields, char *str, const char *delim)
{
	int nf;

	/* no room for even one pointer: do not write fields[0] */
	if(nfields <= 0)
		return 0;

	fields[0] = str;
	nf = 1;
	if(nf >= nfields)
		return nf;

	for(; *str; str++){
		if(strchr(delim, *str) != NULL){
			*str = '\0';
			fields[nf++] = str + 1;
			if(nf >= nfields)
				break;
		}
	}
	return nf;
}
|
||||||
|
|
||||||
|
static int
|
||||||
|
getunicodeline(FILE *in, char **fields, char *buf)
|
||||||
|
{
|
||||||
|
char *p;
|
||||||
|
|
||||||
|
if(fgets(buf, MAX_LINE, in) == NULL)
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
p = strchr(buf, '\n');
|
||||||
|
if (p == NULL)
|
||||||
|
fatal("line too long");
|
||||||
|
*p = '\0';
|
||||||
|
|
||||||
|
if (mygetfields(fields, NFIELDS + 1, buf, ";") != NFIELDS)
|
||||||
|
fatal("bad number of fields");
|
||||||
|
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
 * Parse s as a hexadecimal number and return its value.
 *
 * Both upper- and lower-case digits are accepted.  An empty string
 * yields 0.  Calls fatal on any other character, or when s has more
 * digits than can be accumulated without overflowing an int.
 */
static int
getcode(char *s)
{
	/*
	 * 7 hex digits are 28 bits, which always fit in a 32-bit int;
	 * assumes int is at least 32 bits wide.
	 */
	enum { MAXDIGITS = 7 };
	int i, code;

	code = 0;
	for(i = 0; s[i] != '\0'; i++){
		if(i >= MAXDIGITS)
			fatal("code value too long: %s", s);
		code <<= 4;
		if(s[i] >= '0' && s[i] <= '9')
			code += s[i] - '0';
		else if(s[i] >= 'A' && s[i] <= 'F')
			code += s[i] - 'A' + 10;
		else if(s[i] >= 'a' && s[i] <= 'f')
			code += s[i] - 'a' + 10;
		else
			fatal("bad code char '%c'", s[i]);
	}
	return code;
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
fatal(const char *fmt, ...)
|
||||||
|
{
|
||||||
|
va_list arg;
|
||||||
|
|
||||||
|
fprintf(stderr, "%s: fatal error: ", argv0);
|
||||||
|
va_start(arg, fmt);
|
||||||
|
vfprintf(stderr, fmt, arg);
|
||||||
|
va_end(arg);
|
||||||
|
fprintf(stderr, "\n");
|
||||||
|
|
||||||
|
exit(1);
|
||||||
|
}
|
@ -1,20 +1,21 @@
|
|||||||
/*
|
/*
|
||||||
* The authors of this software are Rob Pike and Ken Thompson.
|
* The authors of this software are Rob Pike and Ken Thompson.
|
||||||
* Copyright (c) 2002 by Lucent Technologies.
|
* Copyright (c) 2002 by Lucent Technologies.
|
||||||
|
* Portions Copyright (c) 2009 The Go Authors. All rights reserved.
|
||||||
* Permission to use, copy, modify, and distribute this software for any
|
* Permission to use, copy, modify, and distribute this software for any
|
||||||
* purpose without fee is hereby granted, provided that this entire notice
|
* purpose without fee is hereby granted, provided that this entire notice
|
||||||
* is included in all copies of any software which is or includes a copy
|
* is included in all copies of any software which is or includes a copy
|
||||||
* or modification of this software and in all copies of the supporting
|
* or modification of this software and in all copies of the supporting
|
||||||
* documentation for such software.
|
* documentation for such software.
|
||||||
* THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
|
* THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
|
||||||
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE
|
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
|
||||||
* ANY REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
|
* REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
|
||||||
* OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
|
* OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
|
||||||
*/
|
*/
|
||||||
#include <stdarg.h>
|
#include <stdarg.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include "plan9.h"
|
|
||||||
#include "utf.h"
|
#include "utf.h"
|
||||||
|
#include "utfdef.h"
|
||||||
|
|
||||||
enum
|
enum
|
||||||
{
|
{
|
||||||
@ -23,27 +24,150 @@ enum
|
|||||||
Bit2 = 5,
|
Bit2 = 5,
|
||||||
Bit3 = 4,
|
Bit3 = 4,
|
||||||
Bit4 = 3,
|
Bit4 = 3,
|
||||||
|
Bit5 = 2,
|
||||||
|
|
||||||
T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */
|
T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */
|
||||||
Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */
|
Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */
|
||||||
T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */
|
T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */
|
||||||
T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */
|
T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */
|
||||||
T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */
|
T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */
|
||||||
|
T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */
|
||||||
|
|
||||||
Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */
|
Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */
|
||||||
Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */
|
Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */
|
||||||
Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */
|
Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */
|
||||||
|
Rune4 = (1<<(Bit4+3*Bitx))-1,
|
||||||
|
/* 0001 1111 1111 1111 1111 1111 */
|
||||||
|
|
||||||
Maskx = (1<<Bitx)-1, /* 0011 1111 */
|
Maskx = (1<<Bitx)-1, /* 0011 1111 */
|
||||||
Testx = Maskx ^ 0xFF, /* 1100 0000 */
|
Testx = Maskx ^ 0xFF, /* 1100 0000 */
|
||||||
|
|
||||||
Bad = Runeerror
|
Bad = Runeerror,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Modified by Wei-Hwa Huang, Google Inc., on 2004-09-24
|
||||||
|
* This is a slower but "safe" version of the old chartorune
|
||||||
|
* that works on strings that are not necessarily null-terminated.
|
||||||
|
*
|
||||||
|
* If you know for sure that your string is null-terminated,
|
||||||
|
* chartorune will be a bit faster.
|
||||||
|
*
|
||||||
|
* It is guaranteed not to attempt to access "length"
|
||||||
|
* past the incoming pointer. This is to avoid
|
||||||
|
* possible access violations. If the string appears to be
|
||||||
|
* well-formed but incomplete (i.e., to get the whole Rune
|
||||||
|
* we'd need to read past str+length) then we'll set the Rune
|
||||||
|
* to Bad and return 0.
|
||||||
|
*
|
||||||
|
* Note that if we have decoding problems for other
|
||||||
|
* reasons, we return 1 instead of 0.
|
||||||
|
*/
|
||||||
int
|
int
|
||||||
chartorune(Rune *rune, char *str)
|
charntorune(Rune *rune, const char *str, int length)
|
||||||
{
|
{
|
||||||
int c, c1, c2;
|
int c, c1, c2, c3;
|
||||||
|
long l;
|
||||||
|
|
||||||
|
/* When we're not allowed to read anything */
|
||||||
|
if(length <= 0) {
|
||||||
|
goto badlen;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* one character sequence (7-bit value)
|
||||||
|
* 00000-0007F => T1
|
||||||
|
*/
|
||||||
|
c = *(uchar*)str;
|
||||||
|
if(c < Tx) {
|
||||||
|
*rune = c;
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
// If we can't read more than one character we must stop
|
||||||
|
if(length <= 1) {
|
||||||
|
goto badlen;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* two character sequence (11-bit value)
|
||||||
|
* 0080-07FF => T2 Tx
|
||||||
|
*/
|
||||||
|
c1 = *(uchar*)(str+1) ^ Tx;
|
||||||
|
if(c1 & Testx)
|
||||||
|
goto bad;
|
||||||
|
if(c < T3) {
|
||||||
|
if(c < T2)
|
||||||
|
goto bad;
|
||||||
|
l = ((c << Bitx) | c1) & Rune2;
|
||||||
|
if(l <= Rune1)
|
||||||
|
goto bad;
|
||||||
|
*rune = l;
|
||||||
|
return 2;
|
||||||
|
}
|
||||||
|
|
||||||
|
// If we can't read more than two characters we must stop
|
||||||
|
if(length <= 2) {
|
||||||
|
goto badlen;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* three character sequence (16-bit value)
|
||||||
|
* 0800-FFFF => T3 Tx Tx
|
||||||
|
*/
|
||||||
|
c2 = *(uchar*)(str+2) ^ Tx;
|
||||||
|
if(c2 & Testx)
|
||||||
|
goto bad;
|
||||||
|
if(c < T4) {
|
||||||
|
l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
|
||||||
|
if(l <= Rune2)
|
||||||
|
goto bad;
|
||||||
|
*rune = l;
|
||||||
|
return 3;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (length <= 3)
|
||||||
|
goto badlen;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* four character sequence (21-bit value)
|
||||||
|
* 10000-1FFFFF => T4 Tx Tx Tx
|
||||||
|
*/
|
||||||
|
c3 = *(uchar*)(str+3) ^ Tx;
|
||||||
|
if (c3 & Testx)
|
||||||
|
goto bad;
|
||||||
|
if (c < T5) {
|
||||||
|
l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
|
||||||
|
if (l <= Rune3)
|
||||||
|
goto bad;
|
||||||
|
*rune = l;
|
||||||
|
return 4;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Support for 5-byte or longer UTF-8 would go here, but
|
||||||
|
// since we don't have that, we'll just fall through to bad.
|
||||||
|
|
||||||
|
/*
|
||||||
|
* bad decoding
|
||||||
|
*/
|
||||||
|
bad:
|
||||||
|
*rune = Bad;
|
||||||
|
return 1;
|
||||||
|
badlen:
|
||||||
|
*rune = Bad;
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* This is the older "unsafe" version, which works fine on
|
||||||
|
* null-terminated strings.
|
||||||
|
*/
|
||||||
|
int
|
||||||
|
chartorune(Rune *rune, const char *str)
|
||||||
|
{
|
||||||
|
int c, c1, c2, c3;
|
||||||
long l;
|
long l;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -88,6 +212,26 @@ chartorune(Rune *rune, char *str)
|
|||||||
return 3;
|
return 3;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* four character sequence (21-bit value)
|
||||||
|
* 10000-1FFFFF => T4 Tx Tx Tx
|
||||||
|
*/
|
||||||
|
c3 = *(uchar*)(str+3) ^ Tx;
|
||||||
|
if (c3 & Testx)
|
||||||
|
goto bad;
|
||||||
|
if (c < T5) {
|
||||||
|
l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
|
||||||
|
if (l <= Rune3)
|
||||||
|
goto bad;
|
||||||
|
*rune = l;
|
||||||
|
return 4;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Support for 5-byte or longer UTF-8 would go here, but
|
||||||
|
* since we don't have that, we'll just fall through to bad.
|
||||||
|
*/
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* bad decoding
|
* bad decoding
|
||||||
*/
|
*/
|
||||||
@ -97,9 +241,16 @@ bad:
|
|||||||
}
|
}
|
||||||
|
|
||||||
int
|
int
|
||||||
runetochar(char *str, Rune *rune)
|
isvalidcharntorune(const char* str, int length, Rune* rune, int* consumed) {
|
||||||
|
*consumed = charntorune(rune, str, length);
|
||||||
|
return *rune != Runeerror || *consumed == 3;
|
||||||
|
}
|
||||||
|
|
||||||
|
int
|
||||||
|
runetochar(char *str, const Rune *rune)
|
||||||
{
|
{
|
||||||
long c;
|
/* Runes are signed, so convert to unsigned for range check. */
|
||||||
|
unsigned long c;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* one character sequence
|
* one character sequence
|
||||||
@ -121,57 +272,80 @@ runetochar(char *str, Rune *rune)
|
|||||||
return 2;
|
return 2;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If the Rune is out of range, convert it to the error rune.
|
||||||
|
* Do this test here because the error rune encodes to three bytes.
|
||||||
|
* Doing it earlier would duplicate work, since an out of range
|
||||||
|
* Rune wouldn't have fit in one or two bytes.
|
||||||
|
*/
|
||||||
|
if (c > Runemax)
|
||||||
|
c = Runeerror;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* three character sequence
|
* three character sequence
|
||||||
* 0800-FFFF => T3 Tx Tx
|
* 0800-FFFF => T3 Tx Tx
|
||||||
*/
|
*/
|
||||||
str[0] = T3 | (c >> 2*Bitx);
|
if (c <= Rune3) {
|
||||||
str[1] = Tx | ((c >> 1*Bitx) & Maskx);
|
str[0] = T3 | (c >> 2*Bitx);
|
||||||
str[2] = Tx | (c & Maskx);
|
str[1] = Tx | ((c >> 1*Bitx) & Maskx);
|
||||||
return 3;
|
str[2] = Tx | (c & Maskx);
|
||||||
|
return 3;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* four character sequence (21-bit value)
|
||||||
|
* 10000-1FFFFF => T4 Tx Tx Tx
|
||||||
|
*/
|
||||||
|
str[0] = T4 | (c >> 3*Bitx);
|
||||||
|
str[1] = Tx | ((c >> 2*Bitx) & Maskx);
|
||||||
|
str[2] = Tx | ((c >> 1*Bitx) & Maskx);
|
||||||
|
str[3] = Tx | (c & Maskx);
|
||||||
|
return 4;
|
||||||
}
|
}
|
||||||
|
|
||||||
int
|
int
|
||||||
runelen(long c)
|
runelen(Rune rune)
|
||||||
{
|
{
|
||||||
Rune rune;
|
|
||||||
char str[10];
|
char str[10];
|
||||||
|
|
||||||
rune = c;
|
|
||||||
return runetochar(str, &rune);
|
return runetochar(str, &rune);
|
||||||
}
|
}
|
||||||
|
|
||||||
int
|
int
|
||||||
runenlen(Rune *r, int nrune)
|
runenlen(const Rune *r, int nrune)
|
||||||
{
|
{
|
||||||
int nb, c;
|
int nb, c;
|
||||||
|
|
||||||
nb = 0;
|
nb = 0;
|
||||||
while(nrune--) {
|
while(nrune--) {
|
||||||
c = *r++;
|
c = *r++;
|
||||||
if(c <= Rune1)
|
if (c <= Rune1)
|
||||||
nb++;
|
nb++;
|
||||||
else
|
else if (c <= Rune2)
|
||||||
if(c <= Rune2)
|
|
||||||
nb += 2;
|
nb += 2;
|
||||||
else
|
else if (c <= Rune3)
|
||||||
nb += 3;
|
nb += 3;
|
||||||
|
else /* assert(c <= Rune4) */
|
||||||
|
nb += 4;
|
||||||
}
|
}
|
||||||
return nb;
|
return nb;
|
||||||
}
|
}
|
||||||
|
|
||||||
int
|
int
|
||||||
fullrune(char *str, int n)
|
fullrune(const char *str, int n)
|
||||||
{
|
{
|
||||||
int c;
|
if (n > 0) {
|
||||||
|
int c = *(uchar*)str;
|
||||||
if(n > 0) {
|
if (c < Tx)
|
||||||
c = *(uchar*)str;
|
|
||||||
if(c < Tx)
|
|
||||||
return 1;
|
return 1;
|
||||||
if(n > 1)
|
if (n > 1) {
|
||||||
if(c < T3 || n > 2)
|
if (c < T3)
|
||||||
return 1;
|
return 1;
|
||||||
|
if (n > 2) {
|
||||||
|
if (c < T4 || n > 3)
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
File diff suppressed because it is too large
Load Diff
248
src/lib9/utf/utf.h
Normal file
248
src/lib9/utf/utf.h
Normal file
@ -0,0 +1,248 @@
|
|||||||
|
/*
|
||||||
|
* The authors of this software are Rob Pike and Ken Thompson.
|
||||||
|
* Copyright (c) 1998-2002 by Lucent Technologies.
|
||||||
|
* Portions Copyright (c) 2009 The Go Authors. All rights reserved.
|
||||||
|
* Permission to use, copy, modify, and distribute this software for any
|
||||||
|
* purpose without fee is hereby granted, provided that this entire notice
|
||||||
|
* is included in all copies of any software which is or includes a copy
|
||||||
|
* or modification of this software and in all copies of the supporting
|
||||||
|
* documentation for such software.
|
||||||
|
* THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
|
||||||
|
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
|
||||||
|
* REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
|
||||||
|
* OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef _UTFH_
|
||||||
|
#define _UTFH_ 1
|
||||||
|
|
||||||
|
#include <stdint.h>
|
||||||
|
|
||||||
|
typedef signed int Rune; /* Code-point values in Unicode 4.0 are 21 bits wide.*/
|
||||||
|
|
||||||
|
/* Limits and special values shared by the rune/UTF routines. */
enum
|
||||||
|
{
|
||||||
|
UTFmax = 4, /* maximum bytes per rune */
|
||||||
|
Runesync = 0x80, /* values below this cannot be part of a multi-byte UTF sequence */
|
||||||
|
Runeself = 0x80, /* values below this are the same as a rune and as a UTF sequence */
|
||||||
|
Runeerror = 0xFFFD, /* rune reported on a decoding error in UTF */
|
||||||
|
Runemax = 0x10FFFF, /* maximum rune value */
|
||||||
|
};
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
extern "C" {
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/*
|
||||||
|
* rune routines
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
* These routines were written by Rob Pike and Ken Thompson
|
||||||
|
* and first appeared in Plan 9.
|
||||||
|
* SEE ALSO
|
||||||
|
* utf (7)
|
||||||
|
* tcs (1)
|
||||||
|
*/
|
||||||
|
|
||||||
|
// runetochar copies (encodes) one rune, pointed to by r, to at most
|
||||||
|
// UTFmax bytes starting at s and returns the number of bytes generated.
|
||||||
|
|
||||||
|
int runetochar(char* s, const Rune* r);
|
||||||
|
|
||||||
|
|
||||||
|
// chartorune copies (decodes) at most UTFmax bytes starting at s to
|
||||||
|
// one rune, pointed to by r, and returns the number of bytes consumed.
|
||||||
|
// If the input is not exactly in UTF format, chartorune will set *r
|
||||||
|
// to Runeerror and return 1.
|
||||||
|
//
|
||||||
|
// Note: There is no special case for a "null-terminated" string. A
|
||||||
|
// string whose first byte has the value 0 is the UTF8 encoding of the
|
||||||
|
// Unicode value 0 (i.e., ASCII NUL). A byte value of 0 is illegal
|
||||||
|
// anywhere else in a UTF sequence.
|
||||||
|
|
||||||
|
int chartorune(Rune* r, const char* s);
|
||||||
|
|
||||||
|
|
||||||
|
// charntorune is like chartorune, except that it will access at most
|
||||||
|
// n bytes of s. If the UTF sequence is incomplete within n bytes,
|
||||||
|
// charntorune will set *r to Runeerror and return 0. If it is complete
|
||||||
|
// but not in UTF format, it will set *r to Runeerror and return 1.
|
||||||
|
//
|
||||||
|
// Added 2004-09-24 by Wei-Hwa Huang
|
||||||
|
|
||||||
|
int charntorune(Rune* r, const char* s, int n);
|
||||||
|
|
||||||
|
// isvalidcharntorune(str, n, r, consumed)
|
||||||
|
// is a convenience function that calls "*consumed = charntorune(r, str, n)"
|
||||||
|
// and returns an int (logically boolean) indicating whether the first
|
||||||
|
// n bytes of str begin with a valid and complete UTF sequence.
|
||||||
|
|
||||||
|
int isvalidcharntorune(const char* str, int n, Rune* r, int* consumed);
|
||||||
|
|
||||||
|
// runelen returns the number of bytes required to convert r into UTF.
|
||||||
|
|
||||||
|
int runelen(Rune r);
|
||||||
|
|
||||||
|
|
||||||
|
// runenlen returns the number of bytes required to convert the n
|
||||||
|
// runes pointed to by r into UTF.
|
||||||
|
|
||||||
|
int runenlen(const Rune* r, int n);
|
||||||
|
|
||||||
|
|
||||||
|
// fullrune returns 1 if the string s of length n is long enough to be
|
||||||
|
// decoded by chartorune, and 0 otherwise. This does not guarantee
|
||||||
|
// that the string contains a legal UTF encoding. This routine is used
|
||||||
|
// by programs that obtain input one byte at a time and need to know
|
||||||
|
// when a full rune has arrived.
|
||||||
|
|
||||||
|
int fullrune(const char* s, int n);
|
||||||
|
|
||||||
|
// The following routines are analogous to the corresponding string
|
||||||
|
// routines with "utf" substituted for "str", and "rune" substituted
|
||||||
|
// for "chr".
|
||||||
|
|
||||||
|
// utflen returns the number of runes that are represented by the UTF
|
||||||
|
// string s. (cf. strlen)
|
||||||
|
|
||||||
|
int utflen(const char* s);
|
||||||
|
|
||||||
|
|
||||||
|
// utfnlen returns the number of complete runes that are represented
|
||||||
|
// by the first n bytes of the UTF string s. If the last few bytes of
|
||||||
|
// the string contain an incompletely coded rune, utfnlen will not
|
||||||
|
// count them; in this way, it differs from utflen, which includes
|
||||||
|
// every byte of the string. (cf. strnlen)
|
||||||
|
|
||||||
|
int utfnlen(const char* s, long n);
|
||||||
|
|
||||||
|
|
||||||
|
// utfrune returns a pointer to the first occurrence of rune r in the
|
||||||
|
// UTF string s, or 0 if r does not occur in the string. The NUL
|
||||||
|
// byte terminating a string is considered to be part of the string s.
|
||||||
|
// (cf. strchr)
|
||||||
|
|
||||||
|
const char* utfrune(const char* s, Rune r);
|
||||||
|
|
||||||
|
|
||||||
|
// utfrrune returns a pointer to the last occurrence of rune r in the
|
||||||
|
// UTF string s, or 0 if r does not occur in the string. The NUL
|
||||||
|
// byte terminating a string is considered to be part of the string s.
|
||||||
|
// (cf. strrchr)
|
||||||
|
|
||||||
|
const char* utfrrune(const char* s, Rune r);
|
||||||
|
|
||||||
|
|
||||||
|
// utfutf returns a pointer to the first occurrence of the UTF string
|
||||||
|
// s2 as a UTF substring of s1, or 0 if there is none. If s2 is the
|
||||||
|
// null string, utfutf returns s1. (cf. strstr)
|
||||||
|
|
||||||
|
const char* utfutf(const char* s1, const char* s2);
|
||||||
|
|
||||||
|
|
||||||
|
// utfecpy copies UTF sequences until a null sequence has been copied,
|
||||||
|
// but writes no sequences beyond es1. If any sequences are copied,
|
||||||
|
// s1 is terminated by a null sequence, and a pointer to that sequence
|
||||||
|
// is returned. Otherwise, the original s1 is returned. (cf. strecpy)
|
||||||
|
|
||||||
|
char* utfecpy(char *s1, char *es1, const char *s2);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
// These functions are rune-string analogues of the corresponding
|
||||||
|
// functions in strcat (3).
|
||||||
|
//
|
||||||
|
// These routines first appeared in Plan 9.
|
||||||
|
// SEE ALSO
|
||||||
|
// memmove (3)
|
||||||
|
// rune (3)
|
||||||
|
// strcat (2)
|
||||||
|
//
|
||||||
|
// BUGS: The outcome of overlapping moves varies among implementations.
|
||||||
|
|
||||||
|
Rune* runestrcat(Rune* s1, const Rune* s2);
|
||||||
|
Rune* runestrncat(Rune* s1, const Rune* s2, long n);
|
||||||
|
|
||||||
|
const Rune* runestrchr(const Rune* s, Rune c);
|
||||||
|
|
||||||
|
int runestrcmp(const Rune* s1, const Rune* s2);
|
||||||
|
int runestrncmp(const Rune* s1, const Rune* s2, long n);
|
||||||
|
|
||||||
|
Rune* runestrcpy(Rune* s1, const Rune* s2);
|
||||||
|
Rune* runestrncpy(Rune* s1, const Rune* s2, long n);
|
||||||
|
Rune* runestrecpy(Rune* s1, Rune* es1, const Rune* s2);
|
||||||
|
|
||||||
|
Rune* runestrdup(const Rune* s);
|
||||||
|
|
||||||
|
const Rune* runestrrchr(const Rune* s, Rune c);
|
||||||
|
long runestrlen(const Rune* s);
|
||||||
|
const Rune* runestrstr(const Rune* s1, const Rune* s2);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
// The following routines test types and modify cases for Unicode
|
||||||
|
// characters. Unicode defines some characters as letters and
|
||||||
|
// specifies three cases: upper, lower, and title. Mappings among the
|
||||||
|
// cases are also defined, although they are not exhaustive: some
|
||||||
|
// upper case letters have no lower case mapping, and so on. Unicode
|
||||||
|
// also defines several character properties, a subset of which are
|
||||||
|
// checked by these routines. These routines are based on Unicode
|
||||||
|
// version 3.0.0.
|
||||||
|
//
|
||||||
|
// NOTE: The routines are implemented in C, so the boolean functions
|
||||||
|
// (e.g., isupperrune) return 0 for false and 1 for true.
|
||||||
|
//
|
||||||
|
//
|
||||||
|
// toupperrune, tolowerrune, and totitlerune are the Unicode case
|
||||||
|
// mappings. These routines return the character unchanged if it has
|
||||||
|
// no defined mapping.
|
||||||
|
|
||||||
|
Rune toupperrune(Rune r);
|
||||||
|
Rune tolowerrune(Rune r);
|
||||||
|
Rune totitlerune(Rune r);
|
||||||
|
|
||||||
|
|
||||||
|
// isupperrune tests for upper case characters, including Unicode
|
||||||
|
// upper case letters and targets of the toupper mapping. islowerrune
|
||||||
|
// and istitlerune are defined analogously.
|
||||||
|
|
||||||
|
int isupperrune(Rune r);
|
||||||
|
int islowerrune(Rune r);
|
||||||
|
int istitlerune(Rune r);
|
||||||
|
|
||||||
|
|
||||||
|
// isalpharune tests for Unicode letters; this includes ideographs in
|
||||||
|
// addition to alphabetic characters.
|
||||||
|
|
||||||
|
int isalpharune(Rune r);
|
||||||
|
|
||||||
|
|
||||||
|
// isdigitrune tests for digits. Non-digit numbers, such as Roman
|
||||||
|
// numerals, are not included.
|
||||||
|
|
||||||
|
int isdigitrune(Rune r);
|
||||||
|
|
||||||
|
|
||||||
|
// isideographicrune tests for ideographic characters and numbers, as
|
||||||
|
// defined by the Unicode standard.
|
||||||
|
|
||||||
|
int isideographicrune(Rune r);
|
||||||
|
|
||||||
|
|
||||||
|
// isspacerune tests for whitespace characters, including "C" locale
|
||||||
|
// whitespace, Unicode defined whitespace, and the "zero-width
|
||||||
|
// non-break space" character.
|
||||||
|
|
||||||
|
int isspacerune(Rune r);
|
||||||
|
|
||||||
|
|
||||||
|
// (The comments in this file were copied from the manpage files rune.3,
|
||||||
|
// isalpharune.3, and runestrcat.3. Some formatting changes were also made
|
||||||
|
// to conform to Google style. /JRM 11/11/05)
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#endif
|
@ -12,36 +12,17 @@
|
|||||||
* OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
|
* OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
/*
|
#define uchar _utfuchar
|
||||||
* compiler directive on Plan 9
|
#define ushort _utfushort
|
||||||
*/
|
#define uint _utfuint
|
||||||
#ifndef USED
|
#define ulong _utfulong
|
||||||
#define USED(x) if(x);else
|
#define vlong _utfvlong
|
||||||
#endif
|
#define uvlong _utfuvlong
|
||||||
|
|
||||||
/*
|
|
||||||
* easiest way to make sure these are defined
|
|
||||||
*/
|
|
||||||
#define uchar _fmtuchar
|
|
||||||
#define ushort _fmtushort
|
|
||||||
#define uint _fmtuint
|
|
||||||
#define ulong _fmtulong
|
|
||||||
#define vlong _fmtvlong
|
|
||||||
#define uvlong _fmtuvlong
|
|
||||||
typedef unsigned char uchar;
|
typedef unsigned char uchar;
|
||||||
typedef unsigned short ushort;
|
typedef unsigned short ushort;
|
||||||
typedef unsigned int uint;
|
typedef unsigned int uint;
|
||||||
typedef unsigned long ulong;
|
typedef unsigned long ulong;
|
||||||
typedef unsigned long long uvlong;
|
|
||||||
typedef long long vlong;
|
|
||||||
|
|
||||||
/*
|
|
||||||
* nil cannot be ((void*)0) on ANSI C,
|
|
||||||
* because it is used for function pointers
|
|
||||||
*/
|
|
||||||
#undef nil
|
|
||||||
#define nil 0
|
|
||||||
|
|
||||||
#undef nelem
|
|
||||||
#define nelem ((void*)0)
|
|
||||||
|
|
||||||
|
#define nelem(x) (sizeof(x)/sizeof((x)[0]))
|
||||||
|
#define nil ((void*)0)
|
||||||
|
@ -7,18 +7,17 @@
|
|||||||
* or modification of this software and in all copies of the supporting
|
* or modification of this software and in all copies of the supporting
|
||||||
* documentation for such software.
|
* documentation for such software.
|
||||||
* THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
|
* THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
|
||||||
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE
|
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
|
||||||
* ANY REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
|
* REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
|
||||||
* OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
|
* OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
|
||||||
*/
|
*/
|
||||||
#define _BSD_SOURCE 1 /* memccpy */
|
|
||||||
#include <stdarg.h>
|
#include <stdarg.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include "plan9.h"
|
|
||||||
#include "utf.h"
|
#include "utf.h"
|
||||||
|
#include "utfdef.h"
|
||||||
|
|
||||||
char*
|
char*
|
||||||
utfecpy(char *to, char *e, char *from)
|
utfecpy(char *to, char *e, const char *from)
|
||||||
{
|
{
|
||||||
char *end;
|
char *end;
|
||||||
|
|
||||||
|
@ -7,17 +7,17 @@
|
|||||||
* or modification of this software and in all copies of the supporting
|
* or modification of this software and in all copies of the supporting
|
||||||
* documentation for such software.
|
* documentation for such software.
|
||||||
* THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
|
* THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
|
||||||
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE
|
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
|
||||||
* ANY REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
|
* REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
|
||||||
* OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
|
* OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
|
||||||
*/
|
*/
|
||||||
#include <stdarg.h>
|
#include <stdarg.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include "plan9.h"
|
|
||||||
#include "utf.h"
|
#include "utf.h"
|
||||||
|
#include "utfdef.h"
|
||||||
|
|
||||||
int
|
int
|
||||||
utflen(char *s)
|
utflen(const char *s)
|
||||||
{
|
{
|
||||||
int c;
|
int c;
|
||||||
long n;
|
long n;
|
||||||
@ -34,4 +34,5 @@ utflen(char *s)
|
|||||||
s += chartorune(&rune, s);
|
s += chartorune(&rune, s);
|
||||||
n++;
|
n++;
|
||||||
}
|
}
|
||||||
|
return 0;
|
||||||
}
|
}
|
||||||
|
@ -7,22 +7,22 @@
|
|||||||
* or modification of this software and in all copies of the supporting
|
* or modification of this software and in all copies of the supporting
|
||||||
* documentation for such software.
|
* documentation for such software.
|
||||||
* THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
|
* THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
|
||||||
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE
|
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
|
||||||
* ANY REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
|
* REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
|
||||||
* OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
|
* OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
|
||||||
*/
|
*/
|
||||||
#include <stdarg.h>
|
#include <stdarg.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include "plan9.h"
|
|
||||||
#include "utf.h"
|
#include "utf.h"
|
||||||
|
#include "utfdef.h"
|
||||||
|
|
||||||
int
|
int
|
||||||
utfnlen(char *s, long m)
|
utfnlen(const char *s, long m)
|
||||||
{
|
{
|
||||||
int c;
|
int c;
|
||||||
long n;
|
long n;
|
||||||
Rune rune;
|
Rune rune;
|
||||||
char *es;
|
const char *es;
|
||||||
|
|
||||||
es = s + m;
|
es = s + m;
|
||||||
for(n = 0; s < es; n++) {
|
for(n = 0; s < es; n++) {
|
||||||
|
@ -7,21 +7,22 @@
|
|||||||
* or modification of this software and in all copies of the supporting
|
* or modification of this software and in all copies of the supporting
|
||||||
* documentation for such software.
|
* documentation for such software.
|
||||||
* THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
|
* THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
|
||||||
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE
|
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
|
||||||
* ANY REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
|
* REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
|
||||||
* OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
|
* OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
|
||||||
*/
|
*/
|
||||||
#include <stdarg.h>
|
#include <stdarg.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include "plan9.h"
|
|
||||||
#include "utf.h"
|
#include "utf.h"
|
||||||
|
#include "utfdef.h"
|
||||||
|
|
||||||
|
const
|
||||||
char*
|
char*
|
||||||
utfrrune(char *s, long c)
|
utfrrune(const char *s, Rune c)
|
||||||
{
|
{
|
||||||
long c1;
|
long c1;
|
||||||
Rune r;
|
Rune r;
|
||||||
char *s1;
|
const char *s1;
|
||||||
|
|
||||||
if(c < Runesync) /* not part of utf sequence */
|
if(c < Runesync) /* not part of utf sequence */
|
||||||
return strrchr(s, c);
|
return strrchr(s, c);
|
||||||
@ -42,4 +43,5 @@ utfrrune(char *s, long c)
|
|||||||
s1 = s;
|
s1 = s;
|
||||||
s += c1;
|
s += c1;
|
||||||
}
|
}
|
||||||
|
return 0;
|
||||||
}
|
}
|
||||||
|
@ -7,17 +7,18 @@
|
|||||||
* or modification of this software and in all copies of the supporting
|
* or modification of this software and in all copies of the supporting
|
||||||
* documentation for such software.
|
* documentation for such software.
|
||||||
* THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
|
* THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
|
||||||
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE
|
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
|
||||||
* ANY REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
|
* REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
|
||||||
* OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
|
* OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
|
||||||
*/
|
*/
|
||||||
#include <stdarg.h>
|
#include <stdarg.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include "plan9.h"
|
|
||||||
#include "utf.h"
|
#include "utf.h"
|
||||||
|
#include "utfdef.h"
|
||||||
|
|
||||||
|
const
|
||||||
char*
|
char*
|
||||||
utfrune(char *s, long c)
|
utfrune(const char *s, Rune c)
|
||||||
{
|
{
|
||||||
long c1;
|
long c1;
|
||||||
Rune r;
|
Rune r;
|
||||||
@ -41,4 +42,5 @@ utfrune(char *s, long c)
|
|||||||
return s;
|
return s;
|
||||||
s += n;
|
s += n;
|
||||||
}
|
}
|
||||||
|
return 0;
|
||||||
}
|
}
|
||||||
|
@ -7,24 +7,25 @@
|
|||||||
* or modification of this software and in all copies of the supporting
|
* or modification of this software and in all copies of the supporting
|
||||||
* documentation for such software.
|
* documentation for such software.
|
||||||
* THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
|
* THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
|
||||||
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE
|
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
|
||||||
* ANY REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
|
* REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
|
||||||
* OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
|
* OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
|
||||||
*/
|
*/
|
||||||
#include <stdarg.h>
|
#include <stdarg.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include "plan9.h"
|
|
||||||
#include "utf.h"
|
#include "utf.h"
|
||||||
|
#include "utfdef.h"
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Return pointer to first occurrence of s2 in s1,
|
* Return pointer to first occurrence of s2 in s1,
|
||||||
* 0 if none
|
* 0 if none
|
||||||
*/
|
*/
|
||||||
|
const
|
||||||
char*
|
char*
|
||||||
utfutf(char *s1, char *s2)
|
utfutf(const char *s1, const char *s2)
|
||||||
{
|
{
|
||||||
char *p;
|
const char *p;
|
||||||
long f, n1, n2;
|
long f, n1, n2;
|
||||||
Rune r;
|
Rune r;
|
||||||
|
|
||||||
@ -34,7 +35,7 @@ utfutf(char *s1, char *s2)
|
|||||||
return strstr(s1, s2);
|
return strstr(s1, s2);
|
||||||
|
|
||||||
n2 = strlen(s2);
|
n2 = strlen(s2);
|
||||||
for(p=s1; p=utfrune(p, f); p+=n1)
|
for(p=s1; (p=utfrune(p, f)) != 0; p+=n1)
|
||||||
if(strncmp(p, s2, n2) == 0)
|
if(strncmp(p, s2, n2) == 0)
|
||||||
return p;
|
return p;
|
||||||
return 0;
|
return 0;
|
||||||
|
@ -20,6 +20,7 @@ LIBOFILES=\
|
|||||||
runtime.$O\
|
runtime.$O\
|
||||||
map.$O\
|
map.$O\
|
||||||
print.$O\
|
print.$O\
|
||||||
|
rune.$O\
|
||||||
string.$O\
|
string.$O\
|
||||||
sys_file.$O\
|
sys_file.$O\
|
||||||
|
|
||||||
|
224
src/runtime/rune.c
Normal file
224
src/runtime/rune.c
Normal file
@ -0,0 +1,224 @@
|
|||||||
|
/*
|
||||||
|
* The authors of this software are Rob Pike and Ken Thompson.
|
||||||
|
* Copyright (c) 2002 by Lucent Technologies.
|
||||||
|
* Permission to use, copy, modify, and distribute this software for any
|
||||||
|
* purpose without fee is hereby granted, provided that this entire notice
|
||||||
|
* is included in all copies of any software which is or includes a copy
|
||||||
|
* or modification of this software and in all copies of the supporting
|
||||||
|
* documentation for such software.
|
||||||
|
* THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
|
||||||
|
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
|
||||||
|
* REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
|
||||||
|
* OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
* This code is copied, with slight editing due to type differences,
|
||||||
|
* from a subset of ../lib9/utf/rune.c
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "runtime.h"
|
||||||
|
|
||||||
|
/*
 * Bit layout of UTF-8: BitN is the number of payload bits in the lead
 * byte of an N-byte sequence, Bitx the payload bits in a continuation
 * byte.  TN/Tx are the matching tag-bit patterns and RuneN the largest
 * value an N-byte sequence can hold.
 */
enum
|
||||||
|
{
|
||||||
|
Bit1 = 7, /* payload bits in a one-byte sequence */
|
||||||
|
Bitx = 6, /* payload bits in each continuation byte */
|
||||||
|
Bit2 = 5, /* payload bits in the lead byte of a two-byte sequence */
|
||||||
|
Bit3 = 4, /* payload bits in the lead byte of a three-byte sequence */
|
||||||
|
Bit4 = 3, /* payload bits in the lead byte of a four-byte sequence */
|
||||||
|
Bit5 = 2, /* only used to form T5, the bound on four-byte lead bytes */
|
||||||
|
|
||||||
|
T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */
|
||||||
|
Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */
|
||||||
|
T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */
|
||||||
|
T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */
|
||||||
|
T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */
|
||||||
|
T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */
|
||||||
|
|
||||||
|
Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */
|
||||||
|
Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */
|
||||||
|
Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */
|
||||||
|
Rune4 = (1<<(Bit4+3*Bitx))-1,
|
||||||
|
/* 0001 1111 1111 1111 1111 1111 */
|
||||||
|
|
||||||
|
Maskx = (1<<Bitx)-1, /* 0011 1111 */
|
||||||
|
Testx = Maskx ^ 0xFF, /* 1100 0000 */
|
||||||
|
|
||||||
|
Runeerror = 0xFFFD, /* rune stored when decoding fails */
|
||||||
|
Runeself = 0x80, /* not referenced in the code visible in this chunk */
|
||||||
|
|
||||||
|
Bad = Runeerror, /* value the decoder stores in *rune on failure */
|
||||||
|
|
||||||
|
Runemax = 0x10FFFF, /* maximum rune value */
|
||||||
|
};
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Modified by Wei-Hwa Huang, Google Inc., on 2004-09-24
|
||||||
|
* This is a slower but "safe" version of the old chartorune
|
||||||
|
* that works on strings that are not necessarily null-terminated.
|
||||||
|
*
|
||||||
|
* If you know for sure that your string is null-terminated,
|
||||||
|
* chartorune will be a bit faster.
|
||||||
|
*
|
||||||
|
* It is guaranteed not to attempt to access "length"
|
||||||
|
* past the incoming pointer. This is to avoid
|
||||||
|
* possible access violations. If the string appears to be
|
||||||
|
* well-formed but incomplete (i.e., to get the whole Rune
|
||||||
|
* we'd need to read past str+length) then we'll set the Rune
|
||||||
|
* to Bad and return 0.
|
||||||
|
*
|
||||||
|
* Note that if we have decoding problems for other
|
||||||
|
* reasons, we return 1 instead of 0.
|
||||||
|
*/
|
||||||
|
int32
|
||||||
|
charntorune(int32 *rune, byte *str, int32 length)
|
||||||
|
{
|
||||||
|
int32 c, c1, c2, c3;
|
||||||
|
int32 l;
|
||||||
|
|
||||||
|
/* When we're not allowed to read anything */
|
||||||
|
if(length <= 0) {
|
||||||
|
goto badlen;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* one character sequence (7-bit value)
|
||||||
|
* 00000-0007F => T1
|
||||||
|
*/
|
||||||
|
c = *(byte*)str; /* cast not necessary, but kept for safety */
|
||||||
|
if(c < Tx) {
|
||||||
|
*rune = c;
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
// If we can't read more than one character we must stop
|
||||||
|
if(length <= 1) {
|
||||||
|
goto badlen;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* two character sequence (11-bit value)
|
||||||
|
* 0080-07FF => T2 Tx
|
||||||
|
*/
|
||||||
|
c1 = *(byte*)(str+1) ^ Tx;
|
||||||
|
if(c1 & Testx)
|
||||||
|
goto bad;
|
||||||
|
if(c < T3) {
|
||||||
|
if(c < T2)
|
||||||
|
goto bad;
|
||||||
|
l = ((c << Bitx) | c1) & Rune2;
|
||||||
|
if(l <= Rune1)
|
||||||
|
goto bad;
|
||||||
|
*rune = l;
|
||||||
|
return 2;
|
||||||
|
}
|
||||||
|
|
||||||
|
// If we can't read more than two characters we must stop
|
||||||
|
if(length <= 2) {
|
||||||
|
goto badlen;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* three character sequence (16-bit value)
|
||||||
|
* 0800-FFFF => T3 Tx Tx
|
||||||
|
*/
|
||||||
|
c2 = *(byte*)(str+2) ^ Tx;
|
||||||
|
if(c2 & Testx)
|
||||||
|
goto bad;
|
||||||
|
if(c < T4) {
|
||||||
|
l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
|
||||||
|
if(l <= Rune2)
|
||||||
|
goto bad;
|
||||||
|
*rune = l;
|
||||||
|
return 3;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (length <= 3)
|
||||||
|
goto badlen;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* four character sequence (21-bit value)
|
||||||
|
* 10000-1FFFFF => T4 Tx Tx Tx
|
||||||
|
*/
|
||||||
|
c3 = *(byte*)(str+3) ^ Tx;
|
||||||
|
if (c3 & Testx)
|
||||||
|
goto bad;
|
||||||
|
if (c < T5) {
|
||||||
|
l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
|
||||||
|
if (l <= Rune3)
|
||||||
|
goto bad;
|
||||||
|
*rune = l;
|
||||||
|
return 4;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Support for 5-byte or longer UTF-8 would go here, but
|
||||||
|
// since we don't have that, we'll just fall through to bad.
|
||||||
|
|
||||||
|
/*
|
||||||
|
* bad decoding
|
||||||
|
*/
|
||||||
|
bad:
|
||||||
|
*rune = Bad;
|
||||||
|
return 1;
|
||||||
|
badlen:
|
||||||
|
*rune = Bad;
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
int32
|
||||||
|
runetochar(byte *str, int32 rune) /* note: in original, arg2 was pointer */
|
||||||
|
{
|
||||||
|
/* Runes are signed, so convert to unsigned for range check. */
|
||||||
|
uint32 c;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* one character sequence
|
||||||
|
* 00000-0007F => 00-7F
|
||||||
|
*/
|
||||||
|
c = rune;
|
||||||
|
if(c <= Rune1) {
|
||||||
|
str[0] = c;
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* two character sequence
|
||||||
|
* 0080-07FF => T2 Tx
|
||||||
|
*/
|
||||||
|
if(c <= Rune2) {
|
||||||
|
str[0] = T2 | (c >> 1*Bitx);
|
||||||
|
str[1] = Tx | (c & Maskx);
|
||||||
|
return 2;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If the Rune is out of range, convert it to the error rune.
|
||||||
|
* Do this test here because the error rune encodes to three bytes.
|
||||||
|
* Doing it earlier would duplicate work, since an out of range
|
||||||
|
* Rune wouldn't have fit in one or two bytes.
|
||||||
|
*/
|
||||||
|
if (c > Runemax)
|
||||||
|
c = Runeerror;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* three character sequence
|
||||||
|
* 0800-FFFF => T3 Tx Tx
|
||||||
|
*/
|
||||||
|
if (c <= Rune3) {
|
||||||
|
str[0] = T3 | (c >> 2*Bitx);
|
||||||
|
str[1] = Tx | ((c >> 1*Bitx) & Maskx);
|
||||||
|
str[2] = Tx | (c & Maskx);
|
||||||
|
return 3;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* four character sequence (21-bit value)
|
||||||
|
* 10000-1FFFFF => T4 Tx Tx Tx
|
||||||
|
*/
|
||||||
|
str[0] = T4 | (c >> 3*Bitx);
|
||||||
|
str[1] = Tx | ((c >> 2*Bitx) & Maskx);
|
||||||
|
str[2] = Tx | ((c >> 1*Bitx) & Maskx);
|
||||||
|
str[3] = Tx | (c & Maskx);
|
||||||
|
return 4;
|
||||||
|
}
|
@ -85,6 +85,8 @@ enum
|
|||||||
int32 strcmp(byte*, byte*);
|
int32 strcmp(byte*, byte*);
|
||||||
int32 findnull(int8*);
|
int32 findnull(int8*);
|
||||||
void dump(byte*, int32);
|
void dump(byte*, int32);
|
||||||
|
int32 runetochar(byte*, int32);
|
||||||
|
int32 chartorune(uint32*, byte*);
|
||||||
|
|
||||||
extern string emptystring;
|
extern string emptystring;
|
||||||
extern int32 debug;
|
extern int32 debug;
|
||||||
|
@ -151,55 +151,6 @@ sys·indexstring(string s, int32 i, byte b)
|
|||||||
FLUSH(&b);
|
FLUSH(&b);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
|
||||||
* this is the plan9 runetochar
|
|
||||||
* extended for 36 bits in 7 bytes
|
|
||||||
* note that it truncates to 32 bits
|
|
||||||
* through the argument passing.
|
|
||||||
*/
|
|
||||||
static int32
|
|
||||||
runetochar(byte *str, uint32 c)
|
|
||||||
{
|
|
||||||
int32 i, n;
|
|
||||||
uint32 mask, mark;
|
|
||||||
|
|
||||||
/*
|
|
||||||
* one character in 7 bits
|
|
||||||
*/
|
|
||||||
if(c <= 0x07FUL) {
|
|
||||||
str[0] = c;
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* every new character picks up 5 bits
|
|
||||||
* one less in the first byte and
|
|
||||||
* six more in an extension byte
|
|
||||||
*/
|
|
||||||
mask = 0x7ffUL;
|
|
||||||
mark = 0xC0UL;
|
|
||||||
for(n=1;; n++) {
|
|
||||||
if(c <= mask)
|
|
||||||
break;
|
|
||||||
mask = (mask<<5) | 0x1fUL;
|
|
||||||
mark = (mark>>1) | 0x80UL;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* lay down the bytes backwards
|
|
||||||
* n is the number of extension bytes
|
|
||||||
* mask is the max codepoint
|
|
||||||
* mark is the zeroth byte indicator
|
|
||||||
*/
|
|
||||||
for(i=n; i>0; i--) {
|
|
||||||
str[i] = 0x80UL | (c&0x3fUL);
|
|
||||||
c >>= 6;
|
|
||||||
}
|
|
||||||
|
|
||||||
str[0] = mark|c;
|
|
||||||
return n+1;
|
|
||||||
}
|
|
||||||
|
|
||||||
void
|
void
|
||||||
sys·intstring(int64 v, string s)
|
sys·intstring(int64 v, string s)
|
||||||
{
|
{
|
||||||
|
@ -75,5 +75,14 @@ func main() {
|
|||||||
`\000\123\x00\312\xFE\u0123\ubabe\U0000babe`,
|
`\000\123\x00\312\xFE\u0123\ubabe\U0000babe`,
|
||||||
"backslashes 2 (backquote)");
|
"backslashes 2 (backquote)");
|
||||||
assert("\\x\\u\\U\\", `\x\u\U\`, "backslash 3 (backquote)");
|
assert("\\x\\u\\U\\", `\x\u\U\`, "backslash 3 (backquote)");
|
||||||
|
|
||||||
|
// test large runes. perhaps not the most logical place for this test.
|
||||||
|
var r int32;
|
||||||
|
r = 0x10ffff; // largest rune value
|
||||||
|
s = string(r);
|
||||||
|
assert(s, "\xf4\x8f\xbf\xbf", "largest rune");
|
||||||
|
r = 0x10ffff + 1;
|
||||||
|
s = string(r);
|
||||||
|
assert(s, "\xef\xbf\xbd", "too-large rune");
|
||||||
sys.exit(ecode);
|
sys.exit(ecode);
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user