/*-------- * Module : multibyte.c * * Description: New Multibyte related additional function. * * Create 2001-03-03 Eiji Tokuya * New Create 2001-09-16 Eiji Tokuya *-------- */ #include "multibyte.h" #include "misc.h" #include "connection.h" #include "pgapifunc.h" #include #include #include #include #ifndef WIN32 #include #endif #ifndef TRUE #define TRUE 1 #endif typedef struct pg_CS { char *name; int code; } pg_CS; static pg_CS CS_Table[] = { { "SQL_ASCII", SQL_ASCII }, { "EUC_JP", EUC_JP }, { "EUC_CN", EUC_CN }, { "EUC_KR", EUC_KR }, { "EUC_TW", EUC_TW }, { "JOHAB", JOHAB }, /* since 7.3 */ { "UTF8", UTF8 }, /* since 7.2 */ { "MULE_INTERNAL",MULE_INTERNAL }, { "LATIN1", LATIN1 }, { "LATIN2", LATIN2 }, { "LATIN3", LATIN3 }, { "LATIN4", LATIN4 }, { "LATIN5", LATIN5 }, { "LATIN6", LATIN6 }, { "LATIN7", LATIN7 }, { "LATIN8", LATIN8 }, { "LATIN9", LATIN9 }, { "LATIN10", LATIN10 }, { "WIN1256", WIN1256 }, /* Arabic since 7.3 */ { "WIN1258", WIN1258 }, /* Vietnamese since 8.1 */ { "WIN866", WIN866 }, /* since 8.1 */ { "WIN874", WIN874 }, /* Thai since 7.3 */ { "KOI8", KOI8R }, { "WIN1251", WIN1251 }, /* Cyrillic */ { "WIN1252", WIN1252 }, /* Western Europe since 8.1 */ { "ISO_8859_5", ISO_8859_5 }, { "ISO_8859_6", ISO_8859_6 }, { "ISO_8859_7", ISO_8859_7 }, { "ISO_8859_8", ISO_8859_8 }, { "WIN1250", WIN1250 }, /* Central Europe */ { "WIN1253", WIN1253 }, /* Greek since 8.2 */ { "WIN1254", WIN1254 }, /* Turkish since 8.2 */ { "WIN1255", WIN1255 }, /* Hebrew since 8.2 */ { "WIN1257", WIN1257 }, /* Baltic(North Europe) since 8.2 */ { "EUC_JIS_2004", EUC_JIS_2004}, /* EUC for SHIFT-JIS-2004 Japanese, since 8.3 */ { "SJIS", SJIS }, { "BIG5", BIG5 }, { "GBK", GBK }, /* since 7.3 */ { "UHC", UHC }, /* since 7.3 */ { "GB18030", GB18030 }, /* since 7.3 */ { "SHIFT_JIS_2004", SHIFT_JIS_2004 }, /* SHIFT-JIS-2004 Japanese, standard JIS X 0213, since 8.3 */ { "OTHER", OTHER } }; static pg_CS CS_Alias[] = { { "UNICODE", UTF8 }, { "TCVN", WIN1258 }, { "ALT", WIN866 }, { "WIN", WIN1251 }, { "KOI8R", KOI8R }, { "OTHER", OTHER } }; int pg_CS_code(const char *characterset_string) { int i, c = -1; for(i = 0; CS_Table[i].code != OTHER; i++) { if (0 == stricmp(characterset_string, CS_Table[i].name)) { c = CS_Table[i].code; break; } } if (c < 0) { for(i = 0; CS_Alias[i].code != OTHER; i++) { if (0 == stricmp(characterset_string, CS_Alias[i].name)) { c = CS_Alias[i].code; break; } } } if (c < 0) c = OTHER; return (c); } char * check_client_encoding(const pgNAME conn_settings) { const char *cptr, *sptr = NULL; char *rptr; BOOL allowed_cmd = TRUE, in_quote = FALSE; int step = 0; size_t len = 0; if (NAME_IS_NULL(conn_settings)) return NULL; for (cptr = SAFE_NAME(conn_settings); *cptr; cptr++) { if (in_quote) { if (LITERAL_QUOTE == *cptr) { in_quote = FALSE; continue; } } if (';' == *cptr) { allowed_cmd = TRUE; step = 0; continue; } if (!allowed_cmd) continue; if (isspace((unsigned char) *cptr)) continue; switch (step) { case 0: if (0 != strnicmp(cptr, "set", 3)) { allowed_cmd = FALSE; continue; } step++; cptr += 3; break; case 1: if (0 != strnicmp(cptr, "client_encoding", 15)) { allowed_cmd = FALSE; continue; } step++; cptr += 15; if ('=' == *cptr) cptr--; break; case 2: if (0 == strnicmp(cptr, "to", 2)) cptr += 2; else if (0 == strnicmp(cptr, "=", 1)) ; else { allowed_cmd = FALSE; continue; } step++; break; case 3: if (LITERAL_QUOTE == *cptr) { cptr++; for (sptr = cptr; *cptr && *cptr != LITERAL_QUOTE; cptr++) ; } else { for (sptr = cptr; ';' != *cptr && IS_NOT_SPACE(*cptr); cptr++) ; } len = cptr - sptr; if (';' == *cptr) cptr--; step++; break; } } if (!sptr) return NULL; rptr = malloc(len + 1); if (!rptr) return NULL; memcpy(rptr, sptr, len); rptr[len] = '\0'; MYLOG(0, "extracted a client_encoding '%s' from conn_settings\n", rptr); return rptr; } int pg_mb_maxlen(int characterset_code) { switch (characterset_code) { case UTF8: return 4; case EUC_TW: return 4; case EUC_JIS_2004: case EUC_JP: case GB18030: return 3; case SHIFT_JIS_2004: case SJIS: case BIG5: case GBK: case UHC: case EUC_CN: case EUC_KR: case JOHAB: return 2; default: return 1; } } static int pg_CS_stat(int stat,unsigned int character,int characterset_code) { if (character == 0) stat = 0; switch (characterset_code) { case UTF8: { if (stat < 2 && character >= 0x80) { if (character >= 0xfc) stat = 6; else if (character >= 0xf8) stat = 5; else if (character >= 0xf0) stat = 4; else if (character >= 0xe0) stat = 3; else if (character >= 0xc0) stat = 2; } else if (stat >= 2 && character > 0x7f) stat--; else stat=0; } break; /* SHIFT_JIS_2004 Support. */ case SHIFT_JIS_2004: { if (stat < 2 && character >= 0x81 && character <= 0x9f) stat = 2; else if (stat < 2 && character >= 0xe0 && character <= 0xef) stat = 2; else if (stat < 2 && character >= 0xf0 && character <= 0xfc) stat = 2; else if (stat == 2) stat = 1; else stat = 0; } break; /* Shift-JIS Support. */ case SJIS: { if (stat < 2 && character > 0x80 && !(character > 0x9f && character < 0xe0)) stat = 2; else if (stat == 2) stat = 1; else stat = 0; } break; /* Chinese Big5 Support. */ case BIG5: { if (stat < 2 && character > 0xA0) stat = 2; else if (stat == 2) stat = 1; else stat = 0; } break; /* Chinese GBK Support. */ case GBK: { if (stat < 2 && character > 0x7F) stat = 2; else if (stat == 2) stat = 1; else stat = 0; } break; /* Korian UHC Support. */ case UHC: { if (stat < 2 && character > 0x7F) stat = 2; else if (stat == 2) stat = 1; else stat = 0; } break; case EUC_JIS_2004: /* 0x8f is JIS X 0212 + JIS X 0213(2) 3 byte */ /* 0x8e is JIS X 0201 2 byte */ /* 0xa0-0xff is JIS X 0213(1) 2 byte */ case EUC_JP: /* 0x8f is JIS X 0212 3 byte */ /* 0x8e is JIS X 0201 2 byte */ /* 0xa0-0xff is JIS X 0208 2 byte */ { if (stat < 3 && character == 0x8f) /* JIS X 0212 */ stat = 3; else if (stat != 2 && (character == 0x8e || character > 0xa0)) /* Half Katakana HighByte & Kanji HighByte */ stat = 2; else if (stat == 2) stat = 1; else stat = 0; } break; /* EUC_CN, EUC_KR, JOHAB Support */ case EUC_CN: case EUC_KR: case JOHAB: { if (stat < 2 && character > 0xa0) stat = 2; else if (stat == 2) stat = 1; else stat = 0; } break; case EUC_TW: { if (stat < 4 && character == 0x8e) stat = 4; else if (stat == 4 && character > 0xa0) stat = 3; else if ((stat == 3 || stat < 2) && character > 0xa0) stat = 2; else if (stat == 2) stat = 1; else stat = 0; } break; /*Chinese GB18030 support.Added by Bill Huang */ case GB18030: { if (stat < 2 && character > 0x80) stat = 2; else if (stat == 2) { if (character >= 0x30 && character <= 0x39) stat = 3; else stat = 1; } else if (stat == 3) { if (character >= 0x30 && character <= 0x39) stat = 1; else stat = 3; } else stat = 0; } break; default: { stat = 0; } break; } return stat; } /* * This function is used to know the encoding corresponding to * the current locale. */ const char * derive_locale_encoding(const char *dbencoding) { const char *wenc = NULL; #ifdef WIN32 int acp; #else const char *loc, *ptr; #endif /* WIN32 */ if (wenc = getenv("PGCLIENTENCODING"), NULL != wenc) /* environmnt variable */ return wenc; #ifdef WIN32 acp = GetACP(); if (acp >= 1251 && acp <= 1258) { if (stricmp(dbencoding, "SQL_ASCII") == 0) return wenc; } switch (acp) { case 932: wenc = "SJIS"; break; case 936: wenc = "GBK"; break; case 949: wenc = "UHC"; break; case 950: wenc = "BIG5"; break; case 1250: wenc = "WIN1250"; break; case 1251: wenc = "WIN1251"; break; case 1256: wenc = "WIN1256"; break; case 1252: if (strnicmp(dbencoding, "LATIN", 5) == 0) break; wenc = "WIN1252"; break; case 1258: wenc = "WIN1258"; break; case 1253: wenc = "WIN1253"; break; case 1254: wenc = "WIN1254"; break; case 1255: wenc = "WIN1255"; break; case 1257: wenc = "WIN1257"; break; } #else /* * Derive the encoding from the codeset part of the current locale. */ loc = setlocale(LC_CTYPE, ""); if (loc && (ptr = strchr(loc, '.'))) { int enc_no; ptr++; if ((enc_no= pg_char_to_encoding(ptr)) >= 0) wenc = pg_encoding_to_char(enc_no); MYLOG(0, "locale=%s enc=%s\n", loc, wenc ? wenc : "(null)"); } #endif /* WIN32 */ return wenc; } void encoded_str_constr(encoded_str *encstr, int ccsc, const char *str) { encstr->ccsc = ccsc; encstr->encstr = (const UCHAR *) str; encstr->pos = -1; encstr->ccst = 0; } int encoded_nextchar(encoded_str *encstr) { int chr; if (encstr->pos >= 0 && !encstr->encstr[encstr->pos]) return 0; chr = encstr->encstr[++encstr->pos]; encstr->ccst = pg_CS_stat(encstr->ccst, (unsigned int) chr, encstr->ccsc); return chr; } ssize_t encoded_position_shift(encoded_str *encstr, size_t shift) { encstr->pos += shift; return encstr->pos; } int encoded_byte_check(encoded_str *encstr, size_t abspos) { int chr; chr = encstr->encstr[encstr->pos = abspos]; encstr->ccst = pg_CS_stat(encstr->ccst, (unsigned int) chr, encstr->ccsc); return chr; }