diff options
Diffstat (limited to 'common/unicode.c')
-rw-r--r-- | common/unicode.c | 441 |
1 files changed, 441 insertions, 0 deletions
diff --git a/common/unicode.c b/common/unicode.c new file mode 100644 index 000000000..90b862c5b --- /dev/null +++ b/common/unicode.c @@ -0,0 +1,441 @@ +/*************************************************************************** + * Copyright (c) 2004,2005 by Marcoen Hirschberg + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, + * MA 02111-1307 USA + * + * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY + * KIND, either express or implied. + * + **************************************************************************** + * Source file dumped from rockbox-3.1 distribution. + * $Id: unicode.c 19448 2008-12-15 23:42:19Z zagor $ + * Copyright (c) 2004,2005 by Marcoen Hirschberg + **************************************************************************** + * See file CREDITS for list of people who contributed to the U-boot + * project. + * + * 09-jan-2008 etienne.carriere@stnwireless.com - port from rockbox to U-boot + **************************************************************************** + * Some conversion functions for handling UTF-8 + * + * I got all the info from: + * http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8 + * and + * http://en.wikipedia.org/wiki/Unicode + **************************************************************************** + */ +#include <common.h> /* u-boot basics */ + +/*#include <stdio.h> n.a for u-boot */ +/* #include "file.h" */ +/* #include "debug.h" */ +#include <rbunicode.h> +/*#include "config.h" n.a for u-boot */ + +#ifndef O_BINARY +#define O_BINARY 0 +#endif + +/* U-boot port: no code page file accessible: remove code page ressources */ +#define UNICODE_NO_CP_TABLE +#define UNICODE_NO_CP_TABLE + + + +#ifndef UNICODE_NO_CP_TABLE + +#define CODEPAGE_DIR ROCKBOX_DIR"/codepages" +static int default_codepage = 0; +static int loaded_cp_table = 0; + +#ifdef HAVE_LCD_BITMAP + +#define MAX_CP_TABLE_SIZE 32768 +#define NUM_TABLES 5 + +static const char *filename[NUM_TABLES] = +{ + CODEPAGE_DIR"/iso.cp", + CODEPAGE_DIR"/932.cp", /* SJIS */ + CODEPAGE_DIR"/936.cp", /* GB2312 */ + CODEPAGE_DIR"/949.cp", /* KSX1001 */ + CODEPAGE_DIR"/950.cp" /* BIG5 */ +}; + +static const char cp_2_table[NUM_CODEPAGES] = +{ + 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 4, 5, 0 +}; + +static const char *name_codepages[NUM_CODEPAGES+1] = +{ + "ISO-8859-1", + "ISO-8859-7", + "ISO-8859-8", + "CP1251", + "ISO-8859-11", + "CP1256", + "ISO-8859-9", + "ISO-8859-2", + "CP1250", + "SJIS", + "GB-2312", + "KSX-1001", + "BIG5", + "UTF-8", + "unknown" +}; + +#else /* !HAVE_LCD_BITMAP, reduced support */ + +#define MAX_CP_TABLE_SIZE 640 +#define NUM_TABLES 1 + +static const char *filename[NUM_TABLES] = { + CODEPAGE_DIR"/isomini.cp" +}; + +static const char cp_2_table[NUM_CODEPAGES] = +{ + 0, 1, 1, 1, 1, 1, 0 +}; + +static const char *name_codepages[NUM_CODEPAGES+1] = +{ + "ISO-8859-1", + "ISO-8859-7", + "CP1251", + "ISO-8859-9", + "ISO-8859-2", + "CP1250", + "UTF-8", + "unknown" +}; + +#endif + +static unsigned short codepage_table[MAX_CP_TABLE_SIZE]; + +#endif /* #ifndef UNICODE_NO_CP_TABLE */ + +static const unsigned char utf8comp[6] = +{ + 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC +}; + +#ifndef UNICODE_NO_CP_TABLE +/* Load codepage file into memory */ +static int load_cp_table(int cp) +{ + int i=0; + int table = cp_2_table[cp]; + int file, tablesize; + unsigned char tmp[2]; + + if (table == 0 || table == loaded_cp_table) + return 1; + + file = open(filename[table-1], O_RDONLY|O_BINARY); + + if (file < 0) { + DEBUGF("Can't open codepage file: %s.cp\n", filename[table-1]); + return 0; + } + + tablesize = filesize(file) / 2; + + if (tablesize > MAX_CP_TABLE_SIZE) { + DEBUGF("Invalid codepage file: %s.cp\n", filename[table-1]); + close(file); + return 0; + } + + while (i < tablesize) { + if (!read(file, tmp, 2)) { + DEBUGF("Can't read from codepage file: %s.cp\n", + filename[table-1]); + loaded_cp_table = 0; + return 0; + } + codepage_table[i++] = (tmp[1] << 8) | tmp[0]; + } + + loaded_cp_table = table; + close(file); + return 1; +} +#endif /* #ifndef UNICODE_NO_CP_TABLE */ + +/* Encode a UCS value as UTF-8 and return a pointer after this UTF-8 char. */ +unsigned char* utf8encode(unsigned long ucs, unsigned char *utf8) +{ + int tail = 0; + + if (ucs > 0x7F) + while (ucs >> (5*tail + 6)) + tail++; + + *utf8++ = (ucs >> (6*tail)) | utf8comp[tail]; + while (tail--) + *utf8++ = ((ucs >> (6*tail)) & (MASK ^ 0xFF)) | COMP; + + return utf8; +} + +#ifndef UNICODE_NO_CP_TABLE +/* Recode an iso encoded string to UTF-8 */ +unsigned char* iso_decode(const unsigned char *iso, unsigned char *utf8, + int cp, int count) +{ + unsigned short ucs, tmp; + + if (cp == -1) /* use default codepage */ + cp = default_codepage; + + if (!load_cp_table(cp)) cp = 0; + + while (count--) { + if (*iso < 128 || cp == UTF_8) /* Already UTF-8 */ + *utf8++ = *iso++; + + else { + + /* cp tells us which codepage to convert from */ + switch (cp) { + case ISO_8859_7: /* Greek */ + case WIN_1251: /* Cyrillic */ + case ISO_8859_9: /* Turkish */ + case ISO_8859_2: /* Latin Extended */ + case WIN_1250: /* Central European */ +#ifdef HAVE_LCD_BITMAP + case ISO_8859_8: /* Hebrew */ + case ISO_8859_11: /* Thai */ + case WIN_1256: /* Arabic */ +#endif + tmp = ((cp-1)*128) + (*iso++ - 128); + ucs = codepage_table[tmp]; + break; + +#ifdef HAVE_LCD_BITMAP + case SJIS: /* Japanese */ + if (*iso > 0xA0 && *iso < 0xE0) { + tmp = *iso++ | (0xA100 - 0x8000); + ucs = codepage_table[tmp]; + break; + } + + case GB_2312: /* Simplified Chinese */ + case KSX_1001: /* Korean */ + case BIG_5: /* Traditional Chinese */ + if (count < 1 || !iso[1]) { + ucs = *iso++; + break; + } + + /* we assume all cjk strings are written + in big endian order */ + tmp = *iso++ << 8; + tmp |= *iso++; + tmp -= 0x8000; + ucs = codepage_table[tmp]; + count--; + break; +#endif /* HAVE_LCD_BITMAP */ + + default: + ucs = *iso++; + break; + } + + if (ucs == 0) /* unknown char, use replacement char */ + ucs = 0xfffd; + utf8 = utf8encode(ucs, utf8); + } + } + return utf8; +} +#else /* #ifndef UNICODE_NO_CP_TABLE */ +/* Recode an iso encoded string to UTF-8 : Support only default code page ISO_8859_1 */ +unsigned char* iso_decode(const unsigned char *iso, unsigned char *utf8, + int cp, int count) +{ + unsigned short ucs; + + if (cp == -1) cp = ISO_8859_1; /* use default codepage */ + if (cp != ISO_8859_1) { + printf("ERROR: unsupported codepage ID %d (see include/rbunicode.h)\n", cp); + cp = ISO_8859_1; + } + + while (count--) { + ucs = *iso++; + if (ucs == 0) ucs = 0xfffd; /* unknown char, use replacement char */ + utf8 = utf8encode(ucs, utf8); + } + return utf8; +} +#endif + +/* Recode a UTF-16 string with little-endian byte ordering to UTF-8 */ +unsigned char* utf16LEdecode(const unsigned char *utf16, unsigned char *utf8, + int count) +{ + unsigned long ucs; + + while (count > 0) { + /* Check for a surrogate pair */ + if (utf16[1] >= 0xD8 && utf16[1] < 0xE0) { + ucs = 0x10000 + ((utf16[0] << 10) | ((utf16[1] - 0xD8) << 18) + | utf16[2] | ((utf16[3] - 0xDC) << 8)); + utf16 += 4; + count -= 2; + } else { + ucs = (utf16[0] | (utf16[1] << 8)); + utf16 += 2; + count -= 1; + } + utf8 = utf8encode(ucs, utf8); + } + return utf8; +} + +/* Recode a UTF-16 string with big-endian byte ordering to UTF-8 */ +unsigned char* utf16BEdecode(const unsigned char *utf16, unsigned char *utf8, + int count) +{ + unsigned long ucs; + + while (count > 0) { + if (*utf16 >= 0xD8 && *utf16 < 0xE0) { /* Check for a surrogate pair */ + ucs = 0x10000 + (((utf16[0] - 0xD8) << 18) | (utf16[1] << 10) + | ((utf16[2] - 0xDC) << 8) | utf16[3]); + utf16 += 4; + count -= 2; + } else { + ucs = (utf16[0] << 8) | utf16[1]; + utf16 += 2; + count -= 1; + } + utf8 = utf8encode(ucs, utf8); + } + return utf8; +} + +#if 0 /* currently unused */ +/* Recode any UTF-16 string to UTF-8 */ +unsigned char* utf16decode(const unsigned char *utf16, unsigned char *utf8, + unsigned int count) +{ + unsigned long ucs; + + ucs = *(utf16++) << 8; + ucs |= *(utf16++); + + if (ucs == 0xFEFF) /* Check for BOM */ + return utf16BEdecode(utf16, utf8, count-1); + else if (ucs == 0xFFFE) + return utf16LEdecode(utf16, utf8, count-1); + else { /* ADDME: Should default be LE or BE? */ + utf16 -= 2; + return utf16BEdecode(utf16, utf8, count); + } +} +#endif + +/* Return the number of UTF-8 chars in a string */ +unsigned long utf8length(const unsigned char *utf8) +{ + unsigned long l = 0; + + while (*utf8 != 0) + if ((*utf8++ & MASK) != COMP) + l++; + + return l; +} + +/* Decode 1 UTF-8 char and return a pointer to the next char. */ +const unsigned char* utf8decode(const unsigned char *utf8, unsigned short *ucs) +{ + unsigned char c = *utf8++; + unsigned long code; + int tail = 0; + + if ((c <= 0x7f) || (c >= 0xc2)) { + /* Start of new character. */ + if (c < 0x80) { /* U-00000000 - U-0000007F, 1 byte */ + code = c; + } else if (c < 0xe0) { /* U-00000080 - U-000007FF, 2 bytes */ + tail = 1; + code = c & 0x1f; + } else if (c < 0xf0) { /* U-00000800 - U-0000FFFF, 3 bytes */ + tail = 2; + code = c & 0x0f; + } else if (c < 0xf5) { /* U-00010000 - U-001FFFFF, 4 bytes */ + tail = 3; + code = c & 0x07; + } else { + /* Invalid size. */ + code = 0xfffd; + } + + while (tail-- && ((c = *utf8++) != 0)) { + if ((c & 0xc0) == 0x80) { + /* Valid continuation character. */ + code = (code << 6) | (c & 0x3f); + + } else { + /* Invalid continuation char */ + code = 0xfffd; + utf8--; + break; + } + } + } else { + /* Invalid UTF-8 char */ + code = 0xfffd; + } + /* currently we don't support chars above U-FFFF */ + *ucs = (code < 0x10000) ? code : 0xfffd; + return utf8; +} + +#ifndef UNICODE_NO_CP_TABLE +void set_codepage(int cp) +{ + default_codepage = cp; + return; +} +#endif /* #ifndef UNICODE_NO_CP_TABLE */ + +/* seek to a given char in a utf8 string and + return its start position in the string */ +int utf8seek(const unsigned char* utf8, int offset) +{ + int pos = 0; + + while (offset--) { + pos++; + while ((utf8[pos] & MASK) == COMP) + pos++; + } + return pos; +} + +#ifndef UNICODE_NO_CP_TABLE +const char* get_codepage_name(int cp) +{ + if (cp < 0 || cp>= NUM_CODEPAGES) + return name_codepages[NUM_CODEPAGES]; + return name_codepages[cp]; +} +#endif /* #ifndef UNICODE_NO_CP_TABLE */ + |