fontforge/Unicode/ustring.c
2013-07-11 15:46:35 +02:00

989 lines
23 KiB
C

/* Copyright (C) 2000-2012 by George Williams */
/*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
* EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
* ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <stddef.h>
#include "ustring.h"
#include "utype.h"
/* Compare a unichar_t string with an 8-bit string, one element at a time. */
/* Bytes of str2 are read as unsigned char. Returns the difference of the */
/* first pair that differs, or 0 when both strings end at the same place. */
long uc_strcmp(const unichar_t *str1,const char *str2) {
    const unsigned char *narrow = (const unsigned char *) str2;
    long wide_ch, narrow_ch;

    do {
        wide_ch = *str1++;
        narrow_ch = *narrow++;
    } while ( wide_ch==narrow_ch && wide_ch!='\0' );
    return( wide_ch-narrow_ch );
}
/* Like uc_strcmp, but looks at no more than n elements. Returns 0 when the */
/* first n elements agree (or n is not positive). */
long uc_strncmp(const unichar_t *str1,const char *str2,int n) {
    const unsigned char *narrow = (const unsigned char *) str2;
    long wide_ch, narrow_ch;
    int left;

    for ( left=n; left>0; --left ) {
        wide_ch = *str1++;
        narrow_ch = *narrow++;
        if ( wide_ch!=narrow_ch || wide_ch=='\0' )
            return( wide_ch-narrow_ch );
    }
    return( 0 );
}
/* Case-insensitive comparison of a unichar_t string with an 8-bit string. */
/* Both elements are passed through tolower() before being compared. */
/* Returns the difference of the first lowered pair that differs, else 0. */
long uc_strmatch(const unichar_t *str1, const char *str2) {
    const unsigned char *narrow = (const unsigned char *) str2;
    long wide_ch, narrow_ch;

    do {
        wide_ch = *str1++;
        narrow_ch = *narrow++;
        wide_ch = tolower(wide_ch);
        narrow_ch = tolower(narrow_ch);
    } while ( wide_ch==narrow_ch && wide_ch!='\0' );
    return( wide_ch-narrow_ch );
}
/* Case-insensitive comparison of at most len elements of a unichar_t */
/* string and an 8-bit string. Returns the difference of the first lowered */
/* pair that differs, or 0 if none differs within len elements. */
long uc_strnmatch(const unichar_t *str1, const char *str2, int len) {
    const unsigned char *narrow = (const unsigned char *) str2;
    long wide_ch = 0, narrow_ch = 0;
    int left;

    for ( left=len; left>0; --left ) {
        wide_ch = *str1++;
        narrow_ch = *narrow++;
        wide_ch = tolower(wide_ch);
        narrow_ch = tolower(narrow_ch);
        if ( wide_ch!=narrow_ch || wide_ch=='\0' )
            break;
    }
    /* Either we stopped on a difference/terminator, or the last pair was equal */
    return( wide_ch-narrow_ch );
}
/* Case-insensitive comparison of at most len elements of two unichar_t */
/* strings. Returns the difference of the first lowered pair that differs, */
/* or 0 if none differs within len elements. */
long u_strnmatch(const unichar_t *str1, const unichar_t *str2, int len) {
    long lhs = 0, rhs = 0;
    int left;

    for ( left=len; left>0; --left ) {
        lhs = *str1++;
        rhs = *str2++;
        lhs = tolower(lhs);
        rhs = tolower(rhs);
        if ( lhs!=rhs || lhs=='\0' )
            break;
    }
    /* Either we stopped on a difference/terminator, or the last pair was equal */
    return( lhs-rhs );
}
/* Compare two unichar_t strings. Returns the difference of the first pair */
/* of elements that differs, or 0 when the strings are identical. */
long u_strcmp(const unichar_t *str1,const unichar_t *str2) {
    long lhs, rhs;

    do {
        lhs = *str1++;
        rhs = *str2++;
    } while ( lhs==rhs && lhs!='\0' );
    return( lhs-rhs );
}
/* Compare at most n elements of two unichar_t strings. Returns 0 when the */
/* first n elements agree (or n is not positive). */
long u_strncmp(const unichar_t *str1,const unichar_t *str2,int n) {
    long lhs, rhs;
    int left;

    for ( left=n; left>0; --left ) {
        lhs = *str1++;
        rhs = *str2++;
        if ( lhs!=rhs || lhs=='\0' )
            return( lhs-rhs );
    }
    return( 0 );
}
/* Case-insensitive comparison of two unichar_t strings; every element is */
/* passed through tolower() before being compared. Returns the difference */
/* of the first lowered pair that differs, or 0 if there is none. */
long u_strmatch(const unichar_t *str1, const unichar_t *str2) {
    long lhs, rhs;

    do {
        lhs = *str1++;
        rhs = *str2++;
        lhs = tolower(lhs);
        rhs = tolower(rhs);
    } while ( lhs==rhs && lhs!='\0' );
    return( lhs-rhs );
}
/* Copy a unichar_t string into a char buffer, narrowing each element by */
/* plain assignment. The destination is always NUL terminated and must be */
/* large enough; no bound is checked. */
void cu_strcpy(char *to, const unichar_t *from) {
    for ( ; *from!='\0'; ++from )
        *to++ = *from;
    *to = 0;
}
/* Copy an 8-bit string into a unichar_t buffer, widening each byte (read */
/* as unsigned char). The destination is always terminated; no bound is */
/* checked. */
void uc_strcpy(unichar_t *to, const char *from) {
    const unsigned char *src = (const unsigned char *) from;

    for ( ; *src!='\0'; ++src )
        *to++ = *src;
    *to = 0;
}
/* Copy a unichar_t string, terminator included. No bound is checked. */
void u_strcpy(unichar_t *to, const unichar_t *from) {
    for ( ; *from!='\0'; ++from )
        *to++ = *from;
    *to = 0;
}
/* Copy at most len elements of a unichar_t string and then append a */
/* terminator. Note that, unlike strncpy, up to len+1 elements of the */
/* destination are written. */
void u_strncpy(register unichar_t *to, const unichar_t *from, int len) {
    int left = len;

    while ( *from!='\0' && left>0 ) {
        *to++ = *from++;
        --left;
    }
    *to = 0;
}
/* Copy at most len elements of a unichar_t string into a char buffer */
/* (narrowing by assignment) and then append a NUL. Up to len+1 bytes of */
/* the destination are written. */
void cu_strncpy(register char *to, const unichar_t *from, int len) {
    int left = len;

    while ( *from!='\0' && left>0 ) {
        *to++ = *from++;
        --left;
    }
    *to = 0;
}
/* Copy at most len bytes of an 8-bit string into a unichar_t buffer */
/* (bytes are read as unsigned char) and then append a terminator. Up to */
/* len+1 elements of the destination are written. */
void uc_strncpy(register unichar_t *to, const char *from, int len) {
    const unsigned char *src = (const unsigned char *) from;
    int left = len;

    while ( *src!='\0' && left>0 ) {
        *to++ = *src++;
        --left;
    }
    *to = 0;
}
/* Append an 8-bit string to the end of a unichar_t string. */
void uc_strcat(unichar_t *to, const char *from) {
    unichar_t *tail = to+u_strlen(to);

    uc_strcpy(tail,from);
}
/* Append at most len bytes of an 8-bit string to a unichar_t string. */
void uc_strncat(unichar_t *to, const char *from,int len) {
    unichar_t *tail = to+u_strlen(to);

    uc_strncpy(tail,from,len);
}
/* Append a unichar_t string (narrowed to char) to an 8-bit string. */
void cu_strcat(char *to, const unichar_t *from) {
    char *tail = to+strlen(to);

    cu_strcpy(tail,from);
}
/* Append at most len elements of a unichar_t string (narrowed to char) to */
/* an 8-bit string. */
void cu_strncat(char *to, const unichar_t *from, int len) {
    char *tail = to+strlen(to);

    cu_strncpy(tail,from,len);
}
/* Append one unichar_t string to another. */
void u_strcat(unichar_t *to, const unichar_t *from) {
    unichar_t *tail = to+u_strlen(to);

    u_strcpy(tail,from);
}
/* Append at most len elements of one unichar_t string to another. */
void u_strncat(unichar_t *to, const unichar_t *from, int len) {
    unichar_t *tail = to+u_strlen(to);

    u_strncpy(tail,from,len);
}
/* Number of elements in a unichar_t string, not counting the terminator. */
int u_strlen(register const unichar_t *str) {
    register const unichar_t *scan = str;

    while ( *scan!='\0' )
        ++scan;
    return( (int) (scan-str) );
}
/* Find the first occurrence of ch in str. Returns NULL if it is absent. */
/* The terminator itself is never matched. */
unichar_t *u_strchr(const unichar_t *str ,unichar_t ch) {
    for ( ; *str!='\0'; ++str ) {
        if ( *str==ch )
            return( (unichar_t *) str );
    }
    return( NULL );
}
/* Find the last occurrence of ch in str. Returns NULL if it is absent. */
/* The terminator itself is never matched. */
unichar_t *u_strrchr(const unichar_t *str ,unichar_t ch) {
    unichar_t *found = NULL;

    for ( ; *str!='\0'; ++str ) {
        if ( *str==ch )
            found = (unichar_t *) str;
    }
    return( found );
}
/* Find the first place in the unichar_t string "longer" where the 8-bit */
/* string "substr" occurs. Returns NULL when there is none. Only non-empty */
/* positions of "longer" are tried, so an empty "longer" always gives NULL. */
unichar_t *uc_strstr(const unichar_t *longer, const char *substr) {
    const unichar_t *start, *hay;
    const unsigned char *needle;

    for ( start=longer; *start!='\0'; ++start ) {
        hay = start;
        needle = (const unsigned char *) substr;
        while ( *needle!='\0' && *hay==*needle ) {
            ++hay;
            ++needle;
        }
        if ( *needle=='\0' )	/* ran off the end of substr => full match */
            return( (unichar_t *) start );
    }
    return( NULL );
}
/* Find the first place in "longer" where "substr" occurs. Returns NULL */
/* when there is none. Only non-empty positions of "longer" are tried, so */
/* an empty "longer" always gives NULL. */
unichar_t *u_strstr(const unichar_t *longer, const unichar_t *substr) {
    const unichar_t *start, *hay, *needle;

    for ( start=longer; *start!='\0'; ++start ) {
        hay = start;
        needle = substr;
        while ( *needle!='\0' && *hay==*needle ) {
            ++hay;
            ++needle;
        }
        if ( *needle=='\0' )	/* ran off the end of substr => full match */
            return( (unichar_t *) start );
    }
    return( NULL );
}
/* Case-insensitive search for the 8-bit string "substr" inside the */
/* unichar_t string "longer". Elements are lowered with tolower() before */
/* being compared. Returns the start of the first match, or NULL. */
unichar_t *uc_strstrmatch(const unichar_t *longer, const char *substr) {
    const unichar_t *start, *hay;
    const unsigned char *needle;
    long hay_ch, needle_ch;

    for ( start=longer; *start!='\0'; ++start ) {
        hay = start;
        needle = (const unsigned char *) substr;
        do {
            hay_ch = *hay++;
            needle_ch = *needle++;
            hay_ch = tolower(hay_ch);
            needle_ch = tolower(needle_ch);
            if ( needle_ch=='\0' )	/* all of substr was matched */
                return( (unichar_t *) start );
        } while ( hay_ch==needle_ch );
    }
    return( NULL );
}
/* Case-insensitive search for "substr" inside "longer". Elements are */
/* lowered with tolower() before being compared. Returns the start of the */
/* first match, or NULL. */
unichar_t *u_strstrmatch(const unichar_t *longer, const unichar_t *substr) {
    const unichar_t *start, *hay, *needle;
    long hay_ch, needle_ch;

    for ( start=longer; *start!='\0'; ++start ) {
        hay = start;
        needle = substr;
        do {
            hay_ch = *hay++;
            needle_ch = *needle++;
            hay_ch = tolower(hay_ch);
            needle_ch = tolower(needle_ch);
            if ( needle_ch=='\0' )	/* all of substr was matched */
                return( (unichar_t *) start );
        } while ( hay_ch==needle_ch );
    }
    return( NULL );
}
/* Return a freshly galloc'ed copy of the first n elements of pt, with a */
/* terminator appended. The caller owns the result. When MEMORY_MASK is */
/* defined, n is clamped so the allocation stays below it. */
unichar_t *u_copyn(const unichar_t *pt, long n) {
    unichar_t *dup;

#ifdef MEMORY_MASK
    if ( n*sizeof(unichar_t)>=MEMORY_MASK )
        n = MEMORY_MASK/sizeof(unichar_t)-1;
#endif
    dup = galloc((n+1)*sizeof(unichar_t));
    memcpy(dup,pt,n*sizeof(unichar_t));
    dup[n] = '\0';
    return( dup );
}
/* Duplicate a unichar_t string. A NULL argument yields a NULL result. */
unichar_t *u_copy(const unichar_t *pt) {
    if ( pt==NULL )
        return( (unichar_t *) 0 );
    return( u_copyn(pt,u_strlen(pt)) );
}
/* Return a newly allocated string holding s1 followed by s2. If either */
/* argument is NULL the result is a copy of the other one (which may */
/* itself be NULL). */
unichar_t *u_concat(const unichar_t *s1, const unichar_t *s2) {
    long len1, len2;
    unichar_t *joined;

    if ( s1==NULL )
        return( u_copy( s2 ));
    if ( s2==NULL )
        return( u_copy( s1 ));

    len1 = u_strlen(s1);
    len2 = u_strlen(s2);
    joined = galloc((len1+len2+1)*sizeof(unichar_t));
    u_strcpy(joined,s1);
    u_strcpy(joined+len1,s2);	/* overwrites the terminator left by the first copy */
    return( joined );
}
/* Return a newly allocated unichar_t string made from the first len bytes */
/* of pt (each read as unsigned char), with a terminator appended. A NULL */
/* argument yields NULL. */
unichar_t *uc_copyn(const char *pt,int len) {
    const unsigned char *src = (const unsigned char *) pt;
    unichar_t *res;
    int i;

    if ( pt==NULL )
        return( (unichar_t *) 0 );

#ifdef MEMORY_MASK
    if ( (len+1)*sizeof(unichar_t)>=MEMORY_MASK )
        len = MEMORY_MASK/sizeof(unichar_t)-1;
#endif
    res = galloc((len+1)*sizeof(unichar_t));
    for ( i=0; i<len; ++i )
        res[i] = src[i];
    res[i] = '\0';
    return( res );
}
/* Return a newly allocated unichar_t copy of an 8-bit string (each byte */
/* read as unsigned char). A NULL argument yields NULL. */
unichar_t *uc_copy(const char *pt) {
    const unsigned char *src = (const unsigned char *) pt;
    unichar_t *res;
    int n, i;

    if ( pt==NULL )
        return( (unichar_t *) 0 );

    n = strlen(pt);
#ifdef MEMORY_MASK
    if ( (n+1)*sizeof(unichar_t)>=MEMORY_MASK )
        n = MEMORY_MASK/sizeof(unichar_t)-1;
#endif
    res = galloc((n+1)*sizeof(unichar_t));
    for ( i=0; i<n; ++i )
        res[i] = src[i];
    res[i] = '\0';
    return( res );
}
/* Return a newly allocated char string made from the first len elements */
/* of pt (narrowed by assignment), NUL terminated. A NULL argument yields */
/* NULL. */
char *cu_copyn(const unichar_t *pt,int len) {
    char *res;
    int i;

    if ( pt==NULL )
        return( NULL );

#ifdef MEMORY_MASK
    if ( (len+1)>=MEMORY_MASK )
        len = MEMORY_MASK-1;
#endif
    res = galloc(len+1);
    for ( i=0; i<len; ++i )
        res[i] = pt[i];
    res[i] = '\0';
    return( res );
}
/* Return a newly allocated char copy of a unichar_t string (elements are */
/* narrowed by assignment). A NULL argument yields NULL. */
char *cu_copy(const unichar_t *pt) {
    char *res;
    int n, i;

    if ( pt==NULL )
        return( (char *) 0 );

    n = u_strlen(pt);
#ifdef MEMORY_MASK
    if ( (n+1)>=MEMORY_MASK )
        n = MEMORY_MASK/sizeof(unichar_t)-1;
#endif
    res = galloc(n+1);
    for ( i=0; i<n; ++i )
        res[i] = pt[i];
    res[i] = '\0';
    return( res );
}
/* strtod() for unichar_t strings. The leading run of 7-bit elements (at */
/* most 59 of them) is copied into a char buffer and handed to strtod(). */
/* If ptr is not NULL it receives the position in str that corresponds to */
/* the place where strtod() stopped. */
double u_strtod(const unichar_t *str, unichar_t **ptr) {
    char buf[60], *ret;
    size_t used = 0;
    double val;
    extern double strtod();	/* Please don't delete this, not all of us have good ansi headers */

    while ( str[used]<128 && str[used]!='\0' && used<sizeof(buf)-1 ) {
        buf[used] = str[used];
        ++used;
    }
    buf[used] = '\0';

    val = strtod(buf,&ret);
    /* buf mirrors str element for element, so offsets carry over directly */
    if ( ptr!=NULL )
        *ptr = (unichar_t *) (str + (ret-buf));
    return( val );
}
/* strtol() for unichar_t strings. The leading run of 7-bit elements (at */
/* most 59 of them) is copied into a char buffer and handed to strtol(). */
/* If ptr is not NULL it receives the position in str that corresponds to */
/* the place where strtol() stopped. */
long u_strtol(const unichar_t *str, unichar_t **ptr, int base) {
    char buf[60], *ret;
    size_t used = 0;
    long val;
    extern long strtol();	/* Please don't delete this, not all of us have good ansi headers */

    while ( str[used]<128 && str[used]!='\0' && used<sizeof(buf)-1 ) {
        buf[used] = str[used];
        ++used;
    }
    buf[used] = '\0';

    val = strtol(buf,&ret,base);
    /* buf mirrors str element for element, so offsets carry over directly */
    if ( ptr!=NULL )
        *ptr = (unichar_t *) (str + (ret-buf));
    return( val );
}
/* strtoul() for unichar_t strings. The leading run of 7-bit elements (at */
/* most 59 of them) is copied into a char buffer and handed to strtoul(). */
/* If ptr is not NULL it receives the position in str that corresponds to */
/* the place where strtoul() stopped. */
unsigned long u_strtoul(const unichar_t *str, unichar_t **ptr, int base) {
    char buf[60], *ret;
    size_t used = 0;
    unsigned long val;

    while ( str[used]<128 && str[used]!='\0' && used<sizeof(buf)-1 ) {
        buf[used] = str[used];
        ++used;
    }
    buf[used] = '\0';

    val = strtoul(buf,&ret,base);
    /* buf mirrors str element for element, so offsets carry over directly */
    if ( ptr!=NULL )
        *ptr = (unichar_t *) (str + (ret-buf));
    return( val );
}
/* Case-insensitive test of whether the unichar_t string "str" begins with */
/* the 8-bit string "key". On success returns the position in str just */
/* after the matched prefix; on mismatch returns 0. If either argument is */
/* NULL, str itself is returned unchanged. */
unichar_t *cu_strstartmatch(const char *key,const unichar_t *str) {
    if ( key && str ) {
        /* Read key as unsigned bytes, like the other uc_/cu_ routines in */
        /* this file, so a byte >= 0x80 is never handed to tolower() as a */
        /* negative value. */
        const unsigned char *kpt = (const unsigned char *) key;

        while ( *kpt ) {
            if ( tolower(*kpt) != tolower(*str) )
                return 0;
            ++kpt;
            ++str;
        }
    }
    return (unichar_t *)str;
}
/* Case-insensitive test of whether "full" begins with "initial". */
/* Returns the position in "full" just after the matched prefix (the same */
/* convention cu_strstartmatch uses), or NULL when "full" does not start */
/* with "initial". */
unichar_t *u_strstartmatch(const unichar_t *initial, const unichar_t *full) {
    int ch1, ch2;

    for (;;) {
        ch1 = *initial++;
        /* Check for the end of the prefix before stepping over the next */
        /* element of full, so the result points at the remainder and never */
        /* beyond full's terminator. */
        if ( ch1=='\0' )
            return( (unichar_t *) full );
        ch2 = *full++;
        ch1 = tolower(ch1);
        ch2 = tolower(ch2);
        if ( ch1!=ch2 )
            return( NULL );
    }
}
/* Convert a unichar_t string to char in a static buffer and return that */
/* buffer. At most 399 characters are kept. The result is overwritten by */
/* the next call. */
char *u_to_c(const unichar_t *ubuf) {
    static char buf[400];

    /* cu_strncpy stores up to len characters AND a terminating NUL, so one */
    /* byte has to be held back for the NUL. */
    cu_strncpy(buf,ubuf,sizeof(buf)-1);
    return( buf );
}
/* Convert an 8-bit string to unichar_t in a static buffer and return that */
/* buffer. At most 399 characters are kept. The result is overwritten by */
/* the next call. */
unichar_t *c_to_u(const char *buf) {
    static unichar_t ubuf[400];

    /* uc_strncpy takes a count of elements (not bytes) and stores a */
    /* terminator after them, so pass the element capacity minus one. */
    uc_strncpy(ubuf,buf,sizeof(ubuf)/sizeof(ubuf[0])-1);
    return( ubuf );
}
/* Decode a UTF-8 string into a unichar_t buffer. */
/*  ubuf    destination, with room for len elements including terminator */
/*  utf8buf NUL terminated UTF-8 input */
/*  len     capacity of ubuf in elements; nothing is written if len<=0 */
/* Decoding stops at the end of the input, when the buffer is full, or at */
/* a multi-byte sequence that is cut short by the end of the input. The */
/* continuation bytes themselves are not validated. Returns ubuf. */
unichar_t *utf82u_strncpy(unichar_t *ubuf,const char *utf8buf,int len) {
    unichar_t *upt=ubuf, *uend;
    const uint8 *pt = (const uint8 *) utf8buf, *end = pt+strlen(utf8buf);
    int w, w2;

    if ( len<=0 )		/* not even room for the terminator */
        return( ubuf );
    uend = ubuf+len-1;		/* last slot is reserved for the terminator */

    while ( pt<end && *pt!='\0' && upt<uend ) {
        if ( *pt<=127 )
            *upt = *pt++;
        else if ( *pt<=0xdf ) {
            if ( end-pt<2 )	/* truncated sequence: don't read past the NUL */
                break;
            *upt = ((*pt&0x1f)<<6) | (pt[1]&0x3f);
            pt += 2;
        } else if ( *pt<=0xef ) {
            if ( end-pt<3 )
                break;
            *upt = ((*pt&0xf)<<12) | ((pt[1]&0x3f)<<6) | (pt[2]&0x3f);
            pt += 3;
        } else {
            if ( end-pt<4 )
                break;
#ifdef UNICHAR_16
            /* Needs a surrogate pair. If only one slot is left, stop */
            /* rather than leaving an unwritten element in the output. */
            if ( upt+1>=uend )
                break;
            w = ( ((*pt&0x7)<<2) | ((pt[1]&0x30)>>4) )-1;
            *upt++ = 0xd800 | (w<<6) | ((pt[1]&0xf)<<2) | ((pt[2]&0x30)>>4);
            *upt = 0xdc00 | ((pt[2]&0xf)<<6) | (pt[3]&0x3f);
#else
            w = ( ((*pt&0x7)<<2) | ((pt[1]&0x30)>>4) )-1;
            w = (w<<6) | ((pt[1]&0xf)<<2) | ((pt[2]&0x30)>>4);
            w2 = ((pt[2]&0xf)<<6) | (pt[3]&0x3f);
            *upt = w*0x400 + w2 + 0x10000;
#endif
            pt += 4;
        }
        ++upt;
    }
    *upt = '\0';
    return( ubuf );
}
/* Decode a whole UTF-8 string into ubuf. The capacity handed to */
/* utf82u_strncpy is the byte length of the input plus one. */
unichar_t *utf82u_strcpy(unichar_t *ubuf,const char *utf8buf) {
    int room = strlen(utf8buf)+1;

    return( utf82u_strncpy(ubuf,utf8buf,room) );
}
# ifdef UNICHAR_16
/* Decode a UTF-8 string into a buffer of 32 bit code points. */
/*  ubuf    destination, with room for len elements including terminator */
/*  utf8buf NUL terminated UTF-8 input */
/*  len     capacity of ubuf in elements; nothing is written if len<=0 */
/* Decoding stops at the end of the input, when the buffer is full, or at */
/* a multi-byte sequence that is cut short by the terminating NUL. Other */
/* than that the continuation bytes are not validated. Returns ubuf. */
uint32 *utf82u32_strncpy(uint32 *ubuf,const char *utf8buf,int len) {
    uint32 *upt=ubuf, *uend;
    const uint8 *pt = (const uint8 *) utf8buf;
    int w, w2;

    if ( len<=0 )		/* not even room for the terminator */
        return( ubuf );
    uend = ubuf+len-1;		/* last slot is reserved for the terminator */

    while ( *pt!='\0' && upt<uend ) {
        if ( *pt<=127 )
            *upt = *pt++;
        else if ( *pt<=0xdf ) {
            /* A NUL inside a sequence means the input was truncated; */
            /* stepping over it would walk off the end of the string. */
            if ( pt[1]=='\0' )
                break;
            *upt = ((*pt&0x1f)<<6) | (pt[1]&0x3f);
            pt += 2;
        } else if ( *pt<=0xef ) {
            if ( pt[1]=='\0' || pt[2]=='\0' )
                break;
            *upt = ((*pt&0xf)<<12) | ((pt[1]&0x3f)<<6) | (pt[2]&0x3f);
            pt += 3;
        } else {
            if ( pt[1]=='\0' || pt[2]=='\0' || pt[3]=='\0' )
                break;
            w = ( ((*pt&0x7)<<2) | ((pt[1]&0x30)>>4) )-1;
            w = (w<<6) | ((pt[1]&0xf)<<2) | ((pt[2]&0x30)>>4);
            w2 = ((pt[2]&0xf)<<6) | (pt[3]&0x3f);
            *upt = w*0x400 + w2 + 0x10000;
            pt += 4;
        }
        ++upt;
    }
    *upt = '\0';
    return( ubuf );
}
char *u322utf8_strncpy(char *utf8buf, const uint32 *ubuf,int len) {
uint8 *pt=(uint8 *) utf8buf, *end=(uint8 *) utf8buf+len-1;
const uint32 *upt = ubuf;
while ( *upt!='\0' && pt<end ) {
if ( *upt<=127 )
*pt++ = *upt;
else if ( *upt<=0x7ff ) {
if ( pt+1>=end )
break;
*pt++ = 0xc0 | (*upt>>6);
*pt++ = 0x80 | (*upt&0x3f);
} else if ( *upt<=0xffff ) {
if ( pt+2>=end )
break;
*pt++ = 0xe0 | (*upt>>12);
*pt++ = 0x80 | ((*upt>>6)&0x3f);
*pt++ = 0x80 | (*upt&0x3f);
} else {
uint32 val = *upt-0x10000;
int u = ((val&0xf0000)>>16)+1, z=(val&0x0f000)>>12, y=(val&0x00fc0)>>6, x=val&0x0003f;
if ( pt+3>=end )
break;
*pt++ = 0xf0 | (u>>2);
*pt++ = 0x80 | ((u&3)<<4) | z;
*pt++ = 0x80 | y;
*pt++ = 0x80 | x;
}
++upt;
}
*pt = '\0';
return( utf8buf );
}
/* Return a newly allocated UTF-8 encoding of a string of 32 bit code */
/* points. The needed size is worked out first, then the string is encoded */
/* with u322utf8_strncpy. */
char *u322utf8_copy(const uint32 *ubuf) {
    const uint32 *scan;
    int len = 0;
    char *buf;

    for ( scan=ubuf; *scan!=0; ++scan ) {
        if ( *scan<0x80 )
            len += 1;
        else if ( *scan<0x800 )
            len += 2;
        else if ( *scan<0x10000 )
            len += 3;
        else
            len += 4;
    }
    buf = galloc(len+1);
    return( u322utf8_strncpy(buf,ubuf,len+1));
}
#endif
/* Decode a UTF-8 string into a newly allocated unichar_t buffer that has */
/* room for len elements plus a terminator. */
unichar_t *utf82u_copyn(const char *utf8buf,int len) {
    int room = len+1;
    unichar_t *ubuf;

    ubuf = galloc(room*sizeof(unichar_t));
    return( utf82u_strncpy(ubuf,utf8buf,room) );
}
/* Decode a UTF-8 string into a newly allocated unichar_t string. The */
/* buffer is sized from the byte length of the input. NULL gives NULL. */
unichar_t *utf82u_copy(const char *utf8buf) {
    unichar_t *ubuf;
    int room;

    if ( utf8buf==NULL )
        return( NULL );

    room = strlen(utf8buf);
    room += 1;			/* terminator */
    ubuf = galloc(room*sizeof(unichar_t));
    return( utf82u_strncpy(ubuf,utf8buf,room) );
}
/* Decode a UTF-8 string and append it to the unichar_t string "to". */
void utf82u_strcat(unichar_t *to,const char *from) {
    unichar_t *tail = to+u_strlen(to);

    utf82u_strcpy(tail,from);
}
#ifdef UNICHAR_16
/* Decode a UTF-8 string into a newly allocated string of 32 bit code */
/* points. The buffer is sized from the byte length of the input. NULL */
/* gives NULL. */
uint32 *utf82u32_copy(const char *utf8buf) {
    uint32 *ubuf;
    int room;

    if ( utf8buf==NULL )
        return( NULL );

    room = strlen(utf8buf);
    room += 1;			/* terminator */
    ubuf = galloc(room*sizeof(uint32));
    return( utf82u32_strncpy(ubuf,utf8buf,room) );
}
#endif
/* Encode a unichar_t string as UTF-8 into utf8buf, which must be large */
/* enough (no bound is checked; u2utf8_copy allows 4 bytes per element). */
/* With UNICHAR_16 a high/low surrogate pair becomes one 4 byte sequence. */
/* Returns utf8buf. */
char *u2utf8_strcpy(char *utf8buf,const unichar_t *ubuf) {
    char *pt = utf8buf;

    while ( *ubuf ) {
        if ( *ubuf<0x80 )
            *pt++ = *ubuf;
        else if ( *ubuf<0x800 ) {
            *pt++ = 0xc0 | (*ubuf>>6);
            *pt++ = 0x80 | (*ubuf&0x3f);
#ifdef UNICHAR_16
        } else if ( *ubuf>=0xd800 && *ubuf<0xdc00 && ubuf[1]>=0xdc00 && ubuf[1]<0xe000 ) {
            int u = ((*ubuf>>6)&0xf)+1, y = ((*ubuf&3)<<4) | ((ubuf[1]>>6)&0xf);
            *pt++ = 0xf0 | (u>>2);
            *pt++ = 0x80 | ((u&3)<<4) | ((*ubuf>>2)&0xf);
            *pt++ = 0x80 | y;
            *pt++ = 0x80 | (ubuf[1]&0x3f);
            /* The low surrogate has been used up as well; skip it so it */
            /* is not encoded a second time on the next iteration. */
            ++ubuf;
        } else {
            *pt++ = 0xe0 | (*ubuf>>12);
            *pt++ = 0x80 | ((*ubuf>>6)&0x3f);
            *pt++ = 0x80 | (*ubuf&0x3f);
#else
        } else if ( *ubuf < 0x10000 ) {
            *pt++ = 0xe0 | (*ubuf>>12);
            *pt++ = 0x80 | ((*ubuf>>6)&0x3f);
            *pt++ = 0x80 | (*ubuf&0x3f);
        } else {
            uint32 val = *ubuf-0x10000;
            int u = ((val&0xf0000)>>16)+1, z=(val&0x0f000)>>12, y=(val&0x00fc0)>>6, x=val&0x0003f;
            *pt++ = 0xf0 | (u>>2);
            *pt++ = 0x80 | ((u&3)<<4) | z;
            *pt++ = 0x80 | y;
            *pt++ = 0x80 | x;
#endif
        }
        ++ubuf;
    }
    *pt = '\0';
    return( utf8buf );
}
/* Find the first character of a UTF-8 string whose decoded value equals */
/* "search". Returns a pointer to the first byte of that character, or */
/* NULL. Decoding is done by utf8_ildb and the scan stops at the first */
/* character that decodes to 0. */
char *utf8_strchr(const char *str, int search) {
    const char *scan = str, *char_start;
    int ch;

    for (;;) {
        char_start = scan;
        ch = utf8_ildb(&scan);
        if ( ch==0 )
            return( NULL );
        if ( ch==search )
            return( (char *) char_start );
    }
}
/* Convert a Latin-1 string to UTF-8. Bytes below 0x80 are copied as they */
/* are, all others become a two byte sequence, so utf8buf needs up to */
/* 2*strlen(lbuf)+1 bytes. Returns utf8buf. */
char *latin1_2_utf8_strcpy(char *utf8buf,const char *lbuf) {
    const unsigned char *src;
    char *dst = utf8buf;

    for ( src=(const unsigned char *) lbuf; *src!='\0'; ++src ) {
        unsigned char byte = *src;
        if ( byte>=0x80 ) {
            *dst++ = 0xc0 | (byte>>6);
            *dst++ = 0x80 | (byte&0x3f);
        } else
            *dst++ = byte;
    }
    *dst = '\0';
    return( utf8buf );
}
/* Return a newly allocated UTF-8 version of a Latin-1 string. Two output */
/* bytes are reserved for each input byte. NULL gives NULL. */
char *latin1_2_utf8_copy(const char *lbuf) {
    char *utf8buf;
    int len;

    if ( lbuf==NULL )
        return( NULL );

    len = strlen(lbuf);
    utf8buf = galloc(2*len+1);
    latin1_2_utf8_strcpy(utf8buf,lbuf);
    return( utf8buf );
}
/* Return a newly allocated Latin-1 version of a UTF-8 string. Characters */
/* outside U+0001..U+00FF, and sequences utf8_ildb reports as invalid (-1), */
/* are replaced by '?'. NULL gives NULL. */
char *utf8_2_latin1_copy(const char *utf8buf) {
    int len;
    int ch;
    char *lbuf, *pt; const char *upt;

    if ( utf8buf==NULL )
        return( NULL );

    len = strlen(utf8buf);
    /* Every decoded character uses at least one input byte, so the */
    /* output can never be longer than the input. */
    pt = lbuf = galloc(len+1);
    for ( upt=utf8buf; (ch=utf8_ildb(&upt))!='\0'; ) {
        /* U+00FF is a valid Latin-1 character; -1 marks a bad sequence */
        if ( ch<0 || ch>0xff )
            *pt++ = '?';
        else
            *pt++ = ch;
    }
    *pt = '\0';
    return( lbuf );
}
/* Return a newly allocated UTF-8 encoding of a unichar_t string. Four */
/* bytes are reserved for every element. NULL gives NULL. */
char *u2utf8_copy(const unichar_t *ubuf) {
    char *utf8buf;
    int len;

    if ( ubuf==NULL )
        return( NULL );

    len = u_strlen(ubuf);
    utf8buf = galloc((len+1)*4);
    u2utf8_strcpy(utf8buf,ubuf);
    return( utf8buf );
}
/* Return a newly allocated UTF-8 encoding of at most the first len */
/* elements of a unichar_t string. Each element is encoded on its own with */
/* utf8_idpb. NULL gives NULL. */
char *u2utf8_copyn(const unichar_t *ubuf,int len) {
    char *utf8buf, *pt;
    int left;

    if ( ubuf==NULL )
        return( NULL );

    utf8buf = pt = galloc((len+1)*4);
    left = len;
    while ( left>0 && *ubuf!='\0' ) {
        pt = utf8_idpb(pt, *ubuf++);
        --left;
    }
    *pt = '\0';
    return( utf8buf );
}
/* Increment and load character: decode the UTF-8 character at *_text and */
/* advance *_text past it. */
/* Returns the code point, or -1 if the lead byte is a stray continuation */
/* byte or is not followed by the continuation bytes it needs. In the */
/* error case only the lead byte is consumed. The terminating NUL decodes */
/* to 0 and is consumed too, so callers must stop on a 0 result. */
/* Overlong encodings and lead bytes above 0xf7 are not rejected. */
int32 utf8_ildb(const char **_text) {
    int32 val= -1;
    int ch;
    const uint8 *text = (const uint8 *) *_text;

    if ( (ch = *text++)<0x80 ) {
        val = ch;
    } else if ( ch<=0xbf ) {
        /* error: continuation byte where a lead byte was expected */
    } else if ( ch<=0xdf ) {
        if ( *text>=0x80 && *text<0xc0 )
            val = ((ch&0x1f)<<6) | (*text++&0x3f);
    } else if ( ch<=0xef ) {
        if ( *text>=0x80 && *text<0xc0 && text[1]>=0x80 && text[1]<0xc0 ) {
            val = ((ch&0xf)<<12) | ((text[0]&0x3f)<<6) | (text[1]&0x3f);
            text += 2;
        }
    } else {
        /* Check the continuation bytes one at a time, left to right, */
        /* before using any of them: a sequence cut short by the NUL must */
        /* not make us read bytes beyond the end of the string. */
        if ( text[0]>=0x80 && text[0]<0xc0 &&
                text[1]>=0x80 && text[1]<0xc0 &&
                text[2]>=0x80 && text[2]<0xc0 ) {
            int w = ( ((ch&0x7)<<2) | ((text[0]&0x30)>>4) )-1, w2;
            w = (w<<6) | ((text[0]&0xf)<<2) | ((text[1]&0x30)>>4);
            w2 = ((text[1]&0xf)<<6) | (text[2]&0x3f);
            val = w*0x400 + w2 + 0x10000;
            text += 3;
        }
    }
    *_text = (const char *) text;
    return( val );
}
/* Increment and deposit character: write the UTF-8 encoding of ch at */
/* utf8_text and return the position just after it. Values of 17*65536 */
/* and above are not encodable; nothing is written for them and utf8_text */
/* is returned unchanged. No terminator is written. */
char *utf8_idpb(char *utf8_text,uint32 ch) {
    char *out = utf8_text;

    if ( ch>=17*65536 )
        return( utf8_text );

    if ( ch<0x80 ) {
        *out++ = ch;
    } else if ( ch<0x800 ) {
        *out++ = 0xc0 | (ch>>6);
        *out++ = 0x80 | (ch&0x3f);
    } else if ( ch<0x10000 ) {
        *out++ = 0xe0 | (ch>>12);
        *out++ = 0x80 | ((ch>>6)&0x3f);
        *out++ = 0x80 | (ch&0x3f);
    } else {
        uint32 val = ch-0x10000;
        int u = ((val&0xf0000)>>16)+1;	/* plane number */
        int z = (val&0x0f000)>>12;
        int y = (val&0x00fc0)>>6;
        int x = val&0x0003f;
        *out++ = 0xf0 | (u>>2);
        *out++ = 0x80 | ((u&3)<<4) | z;
        *out++ = 0x80 | y;
        *out++ = 0x80 | x;
    }
    return( out );
}
/* Increment character: return the position of the character that follows */
/* the one at utf8_text. The length is taken from the lead byte alone (the */
/* continuation bytes are not examined). At the terminating NUL the */
/* pointer is returned unchanged. */
char *utf8_ib(char *utf8_text) {
    /* Read the lead byte as unsigned: where plain char is signed, every */
    /* byte >= 0x80 would otherwise be negative and look like ASCII. */
    unsigned char lead = *(unsigned char *) utf8_text;

    if ( lead=='\0' )
        return( utf8_text );
    else if ( lead<=127 )
        return( utf8_text+1 );
    else if ( lead<0xe0 )
        return( utf8_text+2 );
    else if ( lead<0xf0 )
        return( utf8_text+3 );
    else
        return( utf8_text+4 );
}
/* Is this a valid utf8 string? Every character is decoded with utf8_ildb; */
/* the answer is false as soon as one of them decodes to -1. */
int utf8_valid(const char *str) {
    int ch;

    for (;;) {
        ch = utf8_ildb(&str);
        if ( ch=='\0' )
            return( true );
        if ( ch==-1 )
            return( false );
    }
}
/* There are certain cases where we have a fixed amount of space to display */
/* something, and if it doesn't fit in that, then we truncate it. But... */
/* that can leave us with a half completed utf8 byte sequence. So truncate */
/* again, right before the start of the first bad sequence. The string is */
/* modified in place; a string with no bad sequence is left alone. */
void utf8_truncatevalid(char *str) {
    char *scan = str;
    char *char_start;
    int ch;

    for (;;) {
        char_start = scan;
        ch = utf8_ildb((const char **) &scan);
        if ( ch=='\0' )
            return;
        if ( ch==-1 ) {
            *char_start = '\0';
            return;
        }
    }
}
/* Decrement utf8 pointer: return the start of the character that ends */
/* just before utf8_text. One byte is always stepped over; after that up */
/* to three more steps are taken for as long as the byte under the pointer */
/* is a continuation byte (0x80..0xbf). The start of the buffer is not */
/* known here, so the caller must not pass a pointer to the first byte. */
char *utf8_db(char *utf8_text) {
    unsigned char *pt = (unsigned char *) utf8_text;
    int extra;

    --pt;
    /* A lead byte (>=0xc0) or an ASCII byte (<0x80) ends the walk. Landing */
    /* on a lead byte straight away should never happen (the pointer was */
    /* looking at an intermediate byte), but we are then at a char start. */
    for ( extra=0; extra<3 && *pt>=0x80 && *pt<0xc0; ++extra )
        --pt;
    return( (char *) pt );
}
/* How many characters in the string, NOT bytes. Counting stops at the */
/* first character for which utf8_ildb does not return a positive value, */
/* i.e. at the terminator or at an invalid sequence. */
int utf8_strlen(const char *utf8_str) {
    int count;

    for ( count=0; utf8_ildb(&utf8_str)>0; ++count )
        /* just counting */;
    return( count );
}
/* How many 16 bit units are needed to represent the string in UTF-16 */
/* (terminator not counted). Characters at U+10000 and above need a */
/* surrogate pair and count twice. Counting stops at the terminator or at */
/* the first sequence utf8_ildb reports as invalid. */
int utf82u_strlen(const char *utf8_str) {
    int ch;
    int len = 0;

    while ( (ch = utf8_ildb(&utf8_str))>0 ) {
        if ( ch>=0x10000 )	/* U+10000 itself is outside the BMP */
            len += 2;
        else
            ++len;
    }
    return( len );
}
#include <chardata.h>
/* Return a newly allocated ASCII-only version of a UTF-8 string. */
/*  - printable ASCII, newline and tab are kept */
/*  - a carriage return not followed by a newline becomes a newline; a */
/*    carriage return before a newline is dropped */
/*  - the copyright sign becomes "(c)" */
/*  - characters listed in unicode_alternates are replaced by the ASCII */
/*    part of their alternate, with a few combining accents turned into */
/*    ` ' ^ ~ : */
/*  - everything else is dropped */
/* The caller owns the result. */
char *StripToASCII(const char *utf8_str) {
    char *newcr, *pt, *end;
    int len, ch;
    const unichar_t *alt;

    len = strlen(utf8_str);
    pt = newcr = galloc(len+1);
    end = pt+len;
    while ( (ch= utf8_ildb(&utf8_str))!='\0' ) {
        /* Expansions can make the output longer than the input, so grow */
        /* the buffer in steps of ten whenever it is full. */
        if ( pt>=end ) {
            int off = pt-newcr;
            newcr = grealloc(newcr,(off+10)+1);
            pt = newcr+off;
            end = pt+10;
        }
        if ( (ch>=' ' && ch<'\177' ) || ch=='\n' || ch=='\t' )
            *pt++ = ch;
        else if ( ch=='\r' && *utf8_str!='\n' )
            *pt++ = '\n';
        else if ( ch==0xa9 /* Copyright sign */ ) {
            char *str = "(c)";
            if ( pt+strlen(str)>=end ) {
                int off = pt-newcr;
                newcr = grealloc(newcr,(off+10+strlen(str))+1);
                pt = newcr+off;
                end = pt+10;
            }
            while ( *str )
                *pt++ = *str++;
        } else if ( ch>=0 && ch<0x10000 &&
                /* The range test keeps -1 (invalid UTF-8) and non-BMP */
                /* values from indexing outside the table, which is */
                /* looked up by high byte then low byte. */
                /* NOTE(review): assumes unicode_alternates has 256 */
                /* pages -- confirm against chardata.h */
                unicode_alternates[ch>>8]!=NULL &&
                (alt = unicode_alternates[ch>>8][ch&0xff])!=NULL ) {
            while ( *alt!='\0' ) {
                if ( pt>=end ) {
                    int off = pt-newcr;
                    newcr = grealloc(newcr,(off+10)+1);
                    pt = newcr+off;
                    end = pt+10;
                }
                if ( *alt>=' ' && *alt<'\177' )
                    *pt++ = *alt;
                else if ( *alt==0x300 )
                    *pt++ = '`';
                else if ( *alt==0x301 )
                    *pt++ = '\'';
                else if ( *alt==0x302 )
                    *pt++ = '^';
                else if ( *alt==0x303 )
                    *pt++ = '~';
                else if ( *alt==0x308 )
                    *pt++ = ':';
                ++alt;
            }
        }
    }
    *pt = '\0';
    return( newcr );
}
/* True if the string holds nothing but printable ASCII (space up to but */
/* not including DEL), tabs, newlines and carriage returns. An empty */
/* string counts as all ASCII. */
int AllAscii(const char *txt) {
    const char *scan;

    for ( scan=txt; *scan!='\0'; ++scan ) {
        if ( *scan=='\t' || *scan=='\n' || *scan=='\r' )
            continue;		/* All right */
        if ( *scan<' ' || *scan>='\177' )
            return( false );
    }
    return( true );
}
/* True if the unichar_t string holds nothing but printable ASCII (space */
/* up to but not including DEL), tabs, newlines and carriage returns. An */
/* empty string counts as all ASCII. */
int uAllAscii(const unichar_t *txt) {
    const unichar_t *scan;

    for ( scan=txt; *scan!='\0'; ++scan ) {
        if ( *scan=='\t' || *scan=='\n' || *scan=='\r' )
            continue;		/* All right */
        if ( *scan<' ' || *scan>='\177' )
            return( false );
    }
    return( true );
}