/* Copyright (C) 2000-2012 by George Williams */ /* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/
/* NOTE(review): the first four include directives below carry no header
 * name in this copy of the file; the names look like they were lost, and
 * all directives share one physical line. Restore them from a clean copy
 * before building. */
#include #include #include #include #include "../inc/charset.h" #include "../inc/basics.h"
/* File names of the single-byte encoding tables. dumpalphas() opens each
 * entry in turn with fopen() and stops at the NULL terminator. */
char *alphabets[] = { "8859-1.TXT", "8859-2.TXT", "8859-3.TXT", "8859-4.TXT", "8859-5.TXT", "8859-6.TXT", "8859-7.TXT", "8859-8.TXT", "8859-9.TXT", "8859-10.TXT", "8859-11.TXT", "8859-13.TXT", "8859-14.TXT", "8859-15.TXT", "koi8r.TXT", "JIS0201.txt", "WIN.TXT", "MacRoman.TXT", "MacSYMBOL.TXT", "zapfding.TXT", /*"MacCYRILLIC.TXT",*/ NULL };
/* One name per entry of alphabets[], in the same order, NULL terminated.
 * NOTE(review): nothing in the code visible here reads this array;
 * presumably it names the generated tables -- confirm against a clean copy. */
char *alnames[] = { "i8859_1", "i8859_2", "i8859_3", "i8859_4", "i8859_5", "i8859_6", "i8859_7", "i8859_8", "i8859_9", "i8859_10", "i8859_11", "i8859_13", "i8859_14", "i8859_15", "koi8_r", "jis201", "win", "mac", "MacSymbol", "ZapfDingbats", /*"MacCyrillic",*/ NULL };
/* One em_* constant per entry of alphabets[], in the same order, ending
 * with -1. dumpalphas() tests almaps[j]!=-1 before it ORs a bit into used[].
 * The em_* names are not defined in this file (presumably charset.h). */
int almaps[] = { em_iso8859_1, em_iso8859_2, em_iso8859_3, em_iso8859_4, em_iso8859_5, em_iso8859_6, em_iso8859_7, em_iso8859_8, em_iso8859_9, em_iso8859_10, em_iso8859_11, em_iso8859_13, em_iso8859_14, em_iso8859_15, em_koi8_r, em_jis201, em_win, em_mac, em_symbol, em_zapfding, -1 };
/* File names for the CJK mapping tables, NULL terminated.
 * NOTE(review): this array is not referenced by the code visible here. */
char *cjk[] = { "JIS0208.TXT", "JIS0212.TXT", "BIG5.TXT", "GB2312.TXT", "HANGUL.TXT", "Big5HKSCS.txt", NULL }; /* I'm only paying attention to Wansung encoding (in HANGUL.TXT) which is 94x94 */ /* I used to look at OLD5601, but that maps to Unicode 1.0, and Hangul's moved */
/* File names of the Adobe cid2code tables, NULL terminated. dumpjis() opens
 * adobecjk[0]; the uses of the other entries are not visible in this copy. */
char *adobecjk[] = { "aj16cid2code.txt", "aj20cid2code.txt", "ac15cid2code.txt", "ag15cid2code.txt", "ak12cid2code.txt", NULL }; /* I'm told that most of the mappings provided on the Unicode site go to */ /* Unicode 1.* and that CJK have been moved radically since. So instead */ /* of the unicode site's files, try using Adobe's which claim they are */ /* up to date. These may be found in: */ /* ftp://ftp.ora.com/pub/examples/nutshell/ujip/adobe/samples/{aj14,aj20,ak12,ac13,ag14}/cid2code.txt */ /* they may be bundled up in a tar file, I forget exactly... 
*/
/* Names for the CJK encodings, NULL terminated.
 * NOTE(review): not referenced by the code visible here. */
char *cjknames[] = { "jis208", "jis212", "big5", "gb2312", "ksc5601", "big5hkscs", NULL };
/* em_* constants for the CJK encodings. Unlike almaps[] this list has no
 * -1 terminator. NOTE(review): no use of it survives in the visible code. */
int cjkmaps[] = { em_jis208, em_jis212, em_big5, em_gb2312, em_ksc5601, em_big5hkscs };
/* used[u>>8][u&0xff] holds a bit mask for Unicode value u. Each 256-entry
 * plane is allocated with calloc() the first time it is needed, and
 * dumptrans() writes the planes out. */
unsigned long *used[256];
/* printf-style messages written to stderr when something fails. */
const char CantReadFile[] = "Can't find or read file %s\n"; const char CantSaveFile[] = "Can't open or write to output file %s\n"; const char NoMoreMemory[] = "Can't access more memory.\n";
/* dumpalphas: walks alphabets[]. Each file is read line by line with
 * fgets(); lines whose first character is a # are skipped and every other
 * line is parsed by sscanf() as two hex numbers, the value in the encoding
 * (_orig) and the Unicode value (_unicode). The pair is stored in unicode[]
 * (encoding to Unicode) and in table[][] (Unicode to encoding, indexed by
 * high byte and then low byte). A file that cannot be opened is reported on
 * stderr and skipped; a failed calloc() ends the program with exit(3).
 * NOTE(review): the sscanf() result is not checked, and _orig indexes
 * unicode[256] without a range check.
 * NOTE(review): the string passed to the first fprintf() has lost the
 * header name after the include keyword.
 * NOTE(review): the function text breaks off further down; see the note
 * placed just before the damaged statement. */
static void dumpalphas(FILE *output, FILE *header) { FILE *file; int i,j,k, first, last; long _orig, _unicode, mask; unichar_t unicode[256]; unsigned char *table[256], *plane; char buffer[200+1]; fprintf(output, "#include \n\n" ); fprintf(output, "const unsigned char c_allzeros[256] = { 0 };\n\n" ); buffer[200]='\0'; for ( k=0; k<256; ++k ) table[k] = NULL; for ( j=0; alphabets[j]!=NULL; ++j ) { file = fopen( alphabets[j], "r" ); if ( file==NULL ) { fprintf( stderr, CantReadFile, alphabets[j]); } else {
/* Default mapping before the file is read: values below 160 map to
 * themselves, values 160..255 start out as 0. */
for ( i=0; i<160; ++i ) unicode[i] = i; for ( ; i<256; ++i ) unicode[i] = 0; while ( fgets(buffer,sizeof(buffer)-1,file)!=NULL ) { if ( buffer[0]=='#' ) continue; sscanf(buffer, "0x%lx 0x%lx", (unsigned long *) &_orig, (unsigned long *) &_unicode); unicode[_orig] = _unicode; if ( table[_unicode>>8]==NULL ) { if ((plane = table[_unicode>>8] = calloc(256,1))==NULL) { fprintf( stderr, NoMoreMemory ); exit(3); }
/* A plane created while reading the first alphabet (j==0) is pre-filled
 * with an identity mapping: all 256 entries when it is plane 0, only the
 * low 128 entries otherwise. */
if ( j==0 && (_unicode>>8)==0 ) for ( k=0; k<256; ++k ) plane[k] = k; else if ( j==0 ) for ( k=0; k<128; ++k ) plane[k] = k; } table[_unicode>>8][_unicode&0xff] = _orig; if ( used[_unicode>>8]==NULL ) { if ((used[_unicode>>8] = calloc(256,sizeof(long)))==NULL) { fprintf( stderr, NoMoreMemory ); exit(3); } }
/* NOTE(review): the next statement is cut off after (1< and the rest of
 * dumpalphas() is missing. The text that follows the cut tests a variable
 * named val and returns a small integer score; getnth() calls ucs2_score(),
 * so this is presumably the body of that function, whose header (and the
 * definition of VERTMARK) is not present in this copy. */
if ( almaps[j]!=-1 ) used[_unicode>>8][_unicode&0xff] |= (1<=0x2e80 && val<=0x2fff ) return( 1 ); /* New CJK Radicals are least important */ else if ( val>=VERTMARK ) return( 0 ); /* Then vertical guys */ /* only we can't handle vertical here */ else if ( val>=0xf000 && val<=0xffff ) return( 3 ); /* else if (( val>=0x3400 && val<0x3dff ) || (val>=0x4000 && 
val<=0x4dff))*/ else if ( val>=0x3400 && val<=0x4dff ) return( 4 ); else return( 5 ); }
/* getnth: returns the number found in column col of the text in buffer.
 * For col==1 the leading decimal digits are parsed; the result is -1 when
 * buffer does not start with a digit. For any other column the final value
 * is returned unless it is >= VERTMARK, in which case -1 is returned.
 * NOTE(review): the loop that starts with for ( i=1; i is garbled in this
 * copy. The surviving tail appears to keep the entry of vals[] with the
 * highest ucs2_score() -- confirm against a clean copy.
 * NOTE(review): isdigit() is called with a plain char argument. */
static int getnth(char *buffer, int col) { int i, val=0, best; char *end; int vals[10]; if ( col==1 ) { /* first column is decimal, others are hex */ if ( !isdigit(*buffer)) return( -1 ); while ( isdigit(*buffer)) val = 10*val + *buffer++-'0'; return( val ); } for ( i=1; ibest ) { val = vals[i]; best = ucs2_score(vals[i]); } } } if ( val >= VERTMARK ) return( -1 ); return( val ); }
/* dumpjis: opens adobecjk[0] and reads it line by line, skipping lines that
 * start with a # character. getnth() supplies column 2 as the JIS 208 code
 * (_orig) and column 22 as the Unicode value (_unicode). A line with no
 * column 2 value is skipped silently; a line whose Unicode value is -1 or
 * above 0xffff is reported on stderr and skipped. Accepted pairs go into
 * table[][] (Unicode to JIS) and into unicode208[], whose index is
 * (high byte)*94 + (low byte) of the code after 0x2121 has been subtracted.
 * A failed calloc() ends the program with exit(3).
 * NOTE(review): the function text is cut in two places further down; see
 * the notes placed before the damaged statements. */
static void dumpjis(FILE *output,FILE *header) { FILE *file; int i,j,k, first, last; long _orig, _unicode; unichar_t unicode208[94*94], unicode212[94*94]; unichar_t *table[256], *plane; char buffer[400+1]; memset(table,0,sizeof(table)); buffer[400]='\0'; j=0; file = fopen( adobecjk[j], "r" ); if ( file==NULL ) { fprintf( stderr, CantReadFile, adobecjk[j]); } else { memset(unicode208,0,sizeof(unicode208)); while ( fgets(buffer,sizeof(buffer)-1,file)!=NULL ) { if ( buffer[0]=='#' ) continue; _orig = getnth(buffer,2); if ( _orig==-1 ) continue; _unicode = getnth(buffer,22); if ( _unicode==-1 ) { fprintf( stderr, "Eh? JIS 208-1997 %lx is unencoded\n", _orig ); continue; } if ( _unicode>0xffff ) { fprintf( stderr, "Eh? JIS 208-1997 %lx is outside of BMP\n", _orig ); continue; } if ( table[_unicode>>8]==NULL ) if ((table[_unicode>>8] = calloc(256,sizeof(unichar_t)))==NULL) { fprintf( stderr, NoMoreMemory ); exit(3); } table[_unicode>>8][_unicode&0xff] = _orig; _orig -= 0x2121; _orig = (_orig>>8)*94 + (_orig&0xff); if ( _orig>=94*94 ) fprintf( stderr, "Attempt to index with %ld\n", _orig ); else { unicode208[_orig] = _unicode; if ( used[_unicode>>8]==NULL ) { if ((used[_unicode>>8] = calloc(256,sizeof(long)))==NULL) { fprintf( stderr, NoMoreMemory ); exit(3); } }
/* NOTE(review): the next statement is cut off after (1< . Everything
 * between that point and the JIS 212 range test is missing. */
used[_unicode>>8][_unicode&0xff] |= (1<0xffff ) { fprintf( stderr, "Eh? 
JIS 212-1990 %lx is out of BMP U+%lx\n", _orig, _unicode ); continue; }
/* JIS 212 part (it fills unicode212[], which dumpjis() declares). A code is
 * stored in table[][] with bit 0x8000 set, and only when the slot is still
 * 0; otherwise a clash is reported and the earlier entry is kept. */
if ( table[_unicode>>8]==NULL ) if ((table[_unicode>>8] = calloc(256,sizeof(unichar_t)))==NULL) { fprintf( stderr, NoMoreMemory ); exit(3); } if ( table[_unicode>>8][_unicode&0xff]==0 ) table[_unicode>>8][_unicode&0xff] = _orig|0x8000; else fprintf( stderr, "JIS clash at JIS212 %lx, unicode %lx\n", _orig, _unicode ); /* there are said to be a few of these, I'll just always map to 208 */ _orig -= 0x2121; _orig = (_orig>>8)*94 + (_orig&0xff); if ( _orig>=94*94 ) fprintf( stderr, "Attempt to index JIS212 with %ld\n", _orig ); else { unicode212[_orig] = _unicode; if ( used[_unicode>>8]==NULL ) { if ((used[_unicode>>8] = calloc(256,sizeof(long)))==NULL) { fprintf( stderr, NoMoreMemory ); exit(3); } }
/* NOTE(review): cut after (1< again; the end of dumpjis() and the start of
 * the next function are missing. The fragment that follows handles BIG5
 * (see its message): it stores unicode[_orig-0xa100] and table[][].
 * No range check on _orig-0xa100 is visible in the surviving text. */
used[_unicode>>8][_unicode&0xff] |= (1<0xffff ) { fprintf( stderr, "Eh? BIG5 %lx is out of BMP U+%lx\n", _orig, _unicode ); continue; } unicode[_orig-0xa100] = _unicode; if ( table[_unicode>>8]==NULL ) if ((table[_unicode>>8] = calloc(256,sizeof(unichar_t)))==NULL) { fprintf( stderr, NoMoreMemory ); exit(3); } table[_unicode>>8][_unicode&0xff] = _orig; if ( used[_unicode>>8]==NULL ) { if ((used[_unicode>>8] = calloc(256,sizeof(long)))==NULL) { fprintf( stderr, NoMoreMemory ); exit(3); } }
/* NOTE(review): cut after (1< . The fragment that follows stores a code in
 * table[][] and marks used[], but which encoding it belongs to cannot be
 * told from the visible text. */
used[_unicode>>8][_unicode&0xff] |= (1<>8]==NULL ) if ((table[_unicode>>8] = calloc(256,sizeof(unichar_t)))==NULL) { fprintf( stderr, NoMoreMemory ); exit(3); } table[_unicode>>8][_unicode&0xff] = _orig; if ( used[_unicode>>8]==NULL ) { if ((used[_unicode>>8] = calloc(256,sizeof(long)))==NULL) { fprintf( stderr, NoMoreMemory ); exit(3); } }
/* NOTE(review): cut after (1< . The fragment that follows handles Wansung
 * codes (_orig, both bytes within 0x21..0x7e) and Johab codes (_johab,
 * within 0x8431..0xf9fe); the declarations of _johab, jtable and junicode
 * are not present in this copy. */
used[_unicode>>8][_unicode&0xff] |= (1<=0x2121 && (_orig&0xff)>=0x21 && _orig<=0x7e7e && (_orig&0xff)<=0x7e ) fprintf( stderr, "Eh? Wansung %lx is unencoded\n", _orig ); else if ( _johab>=0x8431 && _johab<=0xf9fe ) fprintf( stderr, "Eh? 
Johab %lx is unencoded\n", _johab ); continue; } if ( _unicode>0xffff ) { if ( _orig>=0x2121 && (_orig&0xff)>=0x21 && _orig<=0x7e7e && (_orig&0xff)<=0x7e ) fprintf( stderr, "Eh? Wansung %lx is out of BMP U+%lx\n", _orig, _unicode ); else if ( _johab>=0x8431 && _johab<=0xf9fe ) fprintf( stderr, "Eh? Johab %lx is out of BMP U+%lx\n", _johab, _unicode ); continue; }
/* Wansung: record Unicode to code in table[][] and code to Unicode in
 * unicode[], indexed by (high byte)*94 + (low byte) after subtracting
 * 0x2121; an index of 94*94 or more is reported and the line skipped. */
if ( _orig>=0x2121 && (_orig&0xff)>=0x21 && _orig<=0x7e7e && (_orig&0xff)<=0x7e ) { if ( table[_unicode>>8]==NULL ) if ((table[_unicode>>8] = calloc(256,sizeof(unichar_t)))==NULL) { fprintf( stderr, NoMoreMemory ); exit(3); } table[_unicode>>8][_unicode&0xff] = _orig; _orig -= 0x2121; _orig = (_orig>>8)*94 + (_orig&0xff); if ( _orig>=94*94 ) { fprintf( stderr, "Not 94x94\n" ); continue; } unicode[_orig] = _unicode; if ( used[_unicode>>8]==NULL ) { if ((used[_unicode>>8] = calloc(256,sizeof(long)))==NULL) { fprintf( stderr, NoMoreMemory ); exit(3); } }
/* NOTE(review): cut after (1< . The fragment that follows stores Johab
 * codes in jtable[][] and, offset by 0x8400, in junicode[]. */
used[_unicode>>8][_unicode&0xff] |= (1<=0x8431 && _johab<=0xf9fe ) { if ( jtable[_unicode>>8]==NULL ) if ((jtable[_unicode>>8] = calloc(256,sizeof(unichar_t)))==NULL) { fprintf( stderr, NoMoreMemory ); exit(3); } jtable[_unicode>>8][_unicode&0xff] = _johab; _johab -= 0x8400; junicode[_johab] = _unicode; if ( used[_unicode>>8]==NULL ) { if ((used[_unicode>>8] = calloc(256,sizeof(long)))==NULL) { fprintf( stderr, NoMoreMemory ); exit(3); } }
/* NOTE(review): cut after (1< ; the end of the Wansung/Johab code and the
 * start of the GB2312 code are missing. */
used[_unicode>>8][_unicode&0xff] |= (1<0xffff ) { fprintf( stderr, "Eh? 
GB2312-80 %lx is out of BMP U+%lx\n", _orig, _unicode ); continue; }
/* GB2312: same table[][] / unicode[] bookkeeping as above.
 * NOTE(review): unlike the JIS and Wansung code, the computed index is not
 * compared with 94*94 before unicode[_orig] is written. */
if ( table[_unicode>>8]==NULL ) if ((table[_unicode>>8] = calloc(256,sizeof(unichar_t)))==NULL) { fprintf( stderr, NoMoreMemory ); exit(3); } table[_unicode>>8][_unicode&0xff] = _orig; _orig -= 0x2121; _orig = (_orig>>8)*94 + (_orig&0xff); unicode[_orig] = _unicode; if ( used[_unicode>>8]==NULL ) { if ((used[_unicode>>8] = calloc(256,sizeof(long)))==NULL) { fprintf( stderr, NoMoreMemory ); exit(3); } }
/* NOTE(review): cut after (1< . The calls that follow (dumpjis, dumpbig5,
 * dumpbig5hkscs, dumpWansung, dumpgb2312) are presumably the tail of
 * dumpcjks(), which main() calls; its header is missing from this copy, as
 * are the headers of dumpbig5, dumpbig5hkscs, dumpWansung and dumpgb2312. */
used[_unicode>>8][_unicode&0xff] |= (1<\n\n" ); fprintf(output, "const unsigned short u_allzeros[256] = { 0 };\n\n" ); dumpjis(output,header); dumpbig5(output,header); dumpbig5hkscs(output,header); dumpWansung(output,header); dumpgb2312(output,header); }
/* dumptrans: writes the contents of used[] to output as C source text.
 * It emits l_allzeros, then one array named unicode_backtrans_<k> (k printed
 * in hex) for every plane that was allocated, eight values per row, and
 * finally the table unicode_backtrans[] with one entry per k in 0..255:
 * the per-plane array where used[k] is set, l_allzeros otherwise.
 * A comment and the extern declaration of unicode_backtrans[] are written
 * to header. */
static void dumptrans(FILE *output, FILE *header) { unsigned long *plane; int k, i; fprintf(output, "static const unsigned long l_allzeros[256] = { 0 };\n" ); for ( k=0; k<256; ++k ) { if ( used[k]!=NULL ) { plane = used[k]; fprintf( output, "static const unsigned long unicode_backtrans_%x[] = {\n", k );
/* Every row but the last ends with a comma; i is left at 248 by the loop,
 * so the second fprintf() prints the final eight values and closes the
 * array. */
for ( i=0; i<256-8; i+=8 ) fprintf( output, " 0x%06lx, 0x%06lx, 0x%06lx, 0x%06lx, 0x%06lx, 0x%06lx, 0x%06lx, 0x%06lx,\n", plane[i], plane[i+1], plane[i+2], plane[i+3], plane[i+4], plane[i+5], plane[i+6], plane[i+7]); fprintf( output, " 0x%06lx, 0x%06lx, 0x%06lx, 0x%06lx, 0x%06lx, 0x%06lx, 0x%06lx, 0x%06lx\n};\n\n", plane[i], plane[i+1], plane[i+2], plane[i+3], plane[i+4], plane[i+5], plane[i+6], plane[i+7]); } } fprintf( header, "\n/* a mask for each character saying what charset(s) it may be found in */\n" ); fprintf( header, "extern const unsigned long * const unicode_backtrans[];\n" ); fprintf( output, "const unsigned long *const unicode_backtrans[] = {\n" ); for ( k=0; k<256; ++k ) if ( used[k]!=NULL ) fprintf( output, " unicode_backtrans_%x%s", k, k!=255?",\n":"\n" ); else fprintf( output, " l_allzeros,\n" ); fprintf( output, "};\n" ); }
/* main: creates alphabet.c, chardata.h, cjk.c and backtrns.c using relative
 * paths. It writes an include line and two struct declarations to
 * chardata.h, then runs dumpalphas() into alphabet.c, dumpcjks() into cjk.c
 * and dumptrans() into backtrns.c, passing the header stream to each.
 * Returns 1 after printing CantSaveFile if any output file cannot be
 * opened, otherwise 0. argc and argv are not used. */
int main(int argc, char **argv) { FILE *output, *header; if (( output = fopen( "alphabet.c", "w" ))==NULL ) 
{ fprintf( stderr, CantSaveFile, "alphabet.c" ); return 1; } if (( header = fopen( "chardata.h", "w" ))==NULL ) { fprintf( stderr, CantSaveFile, "chardata.h" ); fclose(output); return 1; }
/* Start chardata.h with the include of basics.h and the declarations of
 * struct charmap and struct charmap2. */
fprintf( header, "#include \"basics.h\"\n\n" ); fprintf( header, "struct charmap {\n int first, last;\n unsigned char **table;\n unichar_t *totable;\n};\n" ); fprintf( header, "struct charmap2 {\n int first, last;\n unsigned short **table;\n unichar_t *totable;\n};\n\n" ); dumpalphas(output,header); /*dumprandom(output,header);*/ fclose(output); if (( output = fopen( "cjk.c", "w" ))==NULL ) { fprintf( stderr, CantSaveFile, "cjk.c" ); fclose(header); return 1; } dumpcjks(output,header); fclose(output); if (( output = fopen( "backtrns.c", "w" ))==NULL ) { fprintf( stderr, CantSaveFile, "backtrns.c" ); fclose(header); return 1; } dumptrans(output,header); /* This really should be in make ctype, but putting it there causes all */ /* sorts of build problems in things when they happen out of order */ fprintf( header,"\nextern const unichar_t *const * const unicode_alternates[];\n" ); fclose(output); fclose(header); return 0; }