From 3736c5f3635863e54ab2cc47860628d26855c749 Mon Sep 17 00:00:00 2001 From: "Suren A. Chilingaryan" Date: Thu, 11 Aug 2005 01:06:56 +0000 Subject: Transliteration and Documentation Update - Fix: Autodetection of dissabled charsets. - Fix: Cleanely terminate external process if parrent thread terminated. - Transliteration for Russian, Ukrainian and using IConv. - Documentation Update. --- src/librcc.h | 10 +++++++--- src/lngconfig.c | 59 ++++++++++++++++++++++++++++++++++++++++++--------------- src/rccconfig.c | 11 ++++++----- src/rccconfig.h | 3 +++ src/recode.c | 47 ++++++++++++++++++++++++++++++++++++++++++++- 5 files changed, 106 insertions(+), 24 deletions(-) (limited to 'src') diff --git a/src/librcc.h b/src/librcc.h index 9b064d1..98ca1a6 100644 --- a/src/librcc.h +++ b/src/librcc.h @@ -427,6 +427,7 @@ typedef int rcc_option_value; typedef enum rcc_option_translate_t { RCC_OPTION_TRANSLATE_OFF = 0, /**< Switch translation off. */ + RCC_OPTION_TRANSLATE_TRANSLITERATE, /**< Transliterate data. */ RCC_OPTION_TRANSLATE_TO_ENGLISH, /**< Translate data to english language (Current language don't matter). */ RCC_OPTION_TRANSLATE_SKIP_RELATED, /**< Skip translation of the text's between related languages. */ RCC_OPTION_TRANSLATE_SKIP_PARRENT, /**< Skip translation of the text's from parrent languages (from english). */ @@ -821,7 +822,7 @@ rcc_charset_id rccConfigGetClassCharsetByName(rcc_language_config config, rcc_cl * Checks if charset is disabled for the specified class. * @param config is language configuration * @param class_id is class id. - * @param charset is charset name. + * @param charset_id is charset id. * @return 1 if charset is disabled, 0 if charset is enabled, -1 in the case of error. */ int rccConfigIsDisabledCharset(rcc_language_config config, rcc_class_id class_id, rcc_charset_id charset_id); @@ -885,10 +886,13 @@ const char *rccConfigGetSelectedCharsetName(rcc_language_config config, rcc_clas /** * Return current encoding_id. 
The default value will be resolved to paticular encoding id. * The following procedure is used to detect default encoding: + * - If a Unicode encoding is selected for the same class in the English language configuration, return this encoding. * - If the parrent class is defined in #defcharset, - return current encoding of parrent class. - * - If the locale variable is defined in #defcharset and config language coincide with locale language, use locale encoding. + * - If the locale variable is defined in #defcharset and either the config language coincides with the locale language or a Unicode encoding is defined, use locale encoding. * - If the default value for config language is defined in #defvalue return that default value. - * - Return language with id 0. Normally this should be dummy language which indicates that RCC library is not used. + * - If the default value for all languages is defined in #defvalue return that default value. + * - If either the config language coincides with the locale language or a Unicode locale is used, return the locale encoding. + * - Return the first non-disabled encoding in the list. 
* * @param config is language configuration * @param class_id is encoding class diff --git a/src/lngconfig.c b/src/lngconfig.c index 20aff63..631abd1 100644 --- a/src/lngconfig.c +++ b/src/lngconfig.c @@ -567,9 +567,11 @@ const char *rccConfigGetSelectedCharsetName(rcc_language_config config, rcc_clas } rcc_charset_id rccConfigGetCurrentCharset(rcc_language_config config, rcc_class_id class_id) { + rcc_language_config enconfig; unsigned int i, max; rcc_charset_id charset_id; rcc_charset_id all_charset_id = (rcc_language_id)-1; + const char *charset; rcc_class_default_charset *defcharset; const char *lang; @@ -582,10 +584,19 @@ rcc_charset_id rccConfigGetCurrentCharset(rcc_language_config config, rcc_class_ const char *defvalue; if ((!config)||(!config->ctx)||(class_id<0)||(class_id>=config->ctx->n_classes)) return -1; - + charset_id = config->charset[class_id]; if (charset_id) return charset_id; + enconfig = rccGetConfigByName(config->ctx, rcc_english_language_sn); + if ((enconfig)&&(enconfig!=config)) { + charset_id = enconfig->charset[class_id]; + if (charset_id) { + charset = rccConfigGetClassCharsetName(enconfig, class_id, charset_id); + if ((charset)&&(rccIsUnicode(charset))) return charset_id; + } + } + if (!config->language) return (rcc_charset_id)-1; else language = config->language; @@ -598,23 +609,27 @@ rcc_charset_id rccConfigGetCurrentCharset(rcc_language_config config, rcc_class_ if (!strcmp(classes[i]->name, defvalue)) return rccConfigGetCurrentCharset(config, i); } - } else defvalue = config->ctx->locale_variable; + } if (config->default_charset[class_id]) return config->default_charset[class_id]; if (cl->defvalue) { charset_id = rccConfigGetLocaleClassCharset(config, class_id, defvalue); if ((charset_id != 0)&&(charset_id != (rcc_charset_id)-1)) { - config->default_charset[class_id] = charset_id; - return charset_id; + if (!rccConfigIsDisabledCharset(config, class_id, charset_id)) { + config->default_charset[class_id] = charset_id; + return 
charset_id; + } } } if (cl->defvalue) { charset_id = rccConfigGetClassCharsetByName(config, class_id, defvalue); if ((charset_id != 0)&&(charset_id != (rcc_charset_id)-1)) { - config->default_charset[class_id] = charset_id; - return charset_id; + if (!rccConfigIsDisabledCharset(config, class_id, charset_id)) { + config->default_charset[class_id] = charset_id; + return charset_id; + } } } @@ -626,9 +641,17 @@ rcc_charset_id rccConfigGetCurrentCharset(rcc_language_config config, rcc_class_ if (!strcasecmp(lang, defcharset[i].lang)) { charset_id = rccConfigGetClassCharsetByName(config, class_id, defcharset[i].charset); if ((charset_id != 0)&&(charset_id != (rcc_charset_id)-1)) { - config->default_charset[class_id] = charset_id; - return charset_id; - } else break; + if (!rccConfigIsDisabledCharset(config, class_id, charset_id)) { + config->default_charset[class_id] = charset_id; + return charset_id; + } else { + all_charset_id = (rcc_charset_id)-1; + break; + } + } else { + all_charset_id = (rcc_charset_id)-1; + break; + } } else if (!strcasecmp(rcc_default_all, defcharset[i].lang)) { charset_id = rccConfigGetClassCharsetByName(config, class_id, defcharset[i].charset); if ((charset_id != 0)&&(charset_id != (rcc_charset_id)-1)) { @@ -638,20 +661,26 @@ rcc_charset_id rccConfigGetCurrentCharset(rcc_language_config config, rcc_class_ } if (all_charset_id != (rcc_language_id)-1) { - config->default_charset[class_id] = all_charset_id; - return all_charset_id; + if (!rccConfigIsDisabledCharset(config, class_id, all_charset_id)) { + config->default_charset[class_id] = all_charset_id; + return all_charset_id; + } } } - charset_id = rccConfigGetLocaleClassCharset(config, class_id, defvalue); + charset_id = rccConfigGetLocaleClassCharset(config, class_id, config->ctx->locale_variable); if ((charset_id != 0)&&(charset_id != (rcc_charset_id)-1)) { - config->default_charset[class_id] = charset_id; - return charset_id; + if (!rccConfigIsDisabledCharset(config, class_id, charset_id)) 
{ + config->default_charset[class_id] = charset_id; + return charset_id; + } } max = rccConfigGetClassCharsetNumber(config, class_id); for (i = 1; i< max; i++) - if (!rccConfigIsDisabledCharset(config, class_id, (rcc_charset_id)i)) return (rcc_charset_id)i; + if (!rccConfigIsDisabledCharset(config, class_id, (rcc_charset_id)i)) { + return (rcc_charset_id)i; + } return (rcc_charset_id)-1; } diff --git a/src/rccconfig.c b/src/rccconfig.c index 0752ee3..ae47a63 100644 --- a/src/rccconfig.c +++ b/src/rccconfig.c @@ -31,6 +31,8 @@ const char rcc_default_all[] = "all"; const char rcc_default_language_sn[] = "default"; const char rcc_disabled_language_sn[] = "Off"; const char rcc_english_language_sn[] = "en"; +const char rcc_russian_language_sn[] = "ru"; +const char rcc_ukrainian_language_sn[] = "uk"; const char rcc_disabled_engine_sn[] = "Off"; const char rcc_default_charset[] = "Default"; @@ -61,18 +63,18 @@ rcc_language rcc_default_languages_embeded[RCC_MAX_LANGUAGES + 1] = { &rcc_default_engine, NULL }}, -{"en", {rcc_default_charset, rcc_utf8_charset, NULL}, { +{rcc_english_language_sn, {rcc_default_charset, rcc_utf8_charset, "ISO8859-1", NULL}, { &rcc_default_engine, NULL }}, -{"ru", {rcc_default_charset,"KOI8-R","CP1251",rcc_utf8_charset,"IBM866","MACCYRILLIC","ISO8859-5", NULL}, { +{rcc_russian_language_sn, {rcc_default_charset,"KOI8-R","CP1251",rcc_utf8_charset,"IBM866","MACCYRILLIC","ISO8859-5", NULL}, { &rcc_default_engine, #ifdef RCC_RCD_SUPPORT &rcc_russian_engine, #endif /* RCC_RCD_SUPPORT */ NULL }}, -{"uk", {rcc_default_charset,"KOI8-U","CP1251",rcc_utf8_charset,"IBM855","MACCYRILLIC","ISO8859-5","CP1125", NULL}, { +{rcc_ukrainian_language_sn, {rcc_default_charset,"KOI8-U","CP1251",rcc_utf8_charset,"IBM855","MACCYRILLIC","ISO8859-5","CP1125", NULL}, { &rcc_default_engine, #ifdef RCC_RCD_SUPPORT &rcc_ukrainian_engine, @@ -129,11 +131,10 @@ rcc_language rcc_default_languages_embeded[RCC_MAX_LANGUAGES + 1] = { }}, {NULL} }; - rcc_option_value_name 
rcc_sn_boolean[] = { "OFF", "ON", NULL }; rcc_option_value_name rcc_sn_learning[] = { "OFF", "ON", "RELEARN", "LEARN", NULL }; rcc_option_value_name rcc_sn_clo[] = { "ALL", "CONFIGURED_AND_AUTO", "CONFIGURED_ONLY", NULL }; -rcc_option_value_name rcc_sn_translate[] = { "OFF", "TO_ENGLISH", "SKIP_RELATED", "SKIP_PARRENT", "FULL", NULL }; +rcc_option_value_name rcc_sn_translate[] = { "OFF", "TRANSLITERATE", "TO_ENGLISH", "SKIP_RELATED", "SKIP_PARRENT", "FULL", NULL }; rcc_option_description rcc_option_descriptions[RCC_MAX_OPTIONS+1]; rcc_option_description rcc_option_descriptions_embeded[RCC_MAX_OPTIONS+1] = { diff --git a/src/rccconfig.h b/src/rccconfig.h index f7f70dd..8b5ac0d 100644 --- a/src/rccconfig.h +++ b/src/rccconfig.h @@ -10,6 +10,9 @@ extern const char rcc_default_all[]; extern const char rcc_default_language_sn[]; extern const char rcc_english_language_sn[]; +extern const char rcc_russian_language_sn[]; +extern const char rcc_ukrainian_language_sn[]; + extern const char rcc_disabled_language_sn[]; extern const char rcc_disabled_engine_sn[]; diff --git a/src/recode.c b/src/recode.c index a528481..9e19078 100644 --- a/src/recode.c +++ b/src/recode.c @@ -322,7 +322,9 @@ static char *rccRecodeTranslate(rcc_language_config *config, rcc_class_id class_ rcc_translate trans, entrans; + unsigned int i; char *translated; + unsigned char change_case; ctx = (*config)->ctx; @@ -336,7 +338,7 @@ static char *rccRecodeTranslate(rcc_language_config *config, rcc_class_id class_ english_language_id = rccGetLanguageByName(ctx, rcc_english_language_sn); - if (translate == RCC_OPTION_TRANSLATE_TO_ENGLISH) { + if ((translate == RCC_OPTION_TRANSLATE_TO_ENGLISH)||(translate == RCC_OPTION_TRANSLATE_TRANSLITERATE)) { current_language_id = english_language_id ; } else { if (ctype == RCC_CLASS_TRANSLATE_LOCALE) { @@ -356,6 +358,49 @@ static char *rccRecodeTranslate(rcc_language_config *config, rcc_class_id class_ if (rccConfigConfigure(curconfig)) return NULL; + if (translate == 
RCC_OPTION_TRANSLATE_TRANSLITERATE) { + if (!strcasecmp((*config)->language->sn, rcc_russian_language_sn)) { + translated = rccSizedRecodeCharsets(ctx, "UTF-8", "KOI8-R", utfstring, 0, NULL); + if (!translated) return NULL; + for (i=0;translated[i];i++) { + if (translated[i]&0x80) change_case = 1; + else change_case = 0; + + translated[i]=translated[i]&0x7F; + if (change_case) { + if ((translated[i]<'Z')&&(translated[i]>'A')) + translated[i]=translated[i]-'A'+'a'; + else if ((translated[i]<'z')&&(translated[i]>'a')) + translated[i]=translated[i]-'a'+'A'; + } + } + *config = curconfig; + return translated; + } + if (!strcasecmp((*config)->language->sn, rcc_ukrainian_language_sn)) { + translated = rccSizedRecodeCharsets(ctx, "UTF-8", "KOI8-U", utfstring, 0, NULL); + if (!translated) return NULL; + for (i=0;translated[i];i++) { + if (translated[i]&0x80) change_case = 1; + else change_case = 0; + + translated[i]=translated[i]&0x7F; + if (change_case) { + if ((translated[i]<'Z')&&(translated[i]>'A')) + translated[i]=translated[i]-'A'+'a'; + else if ((translated[i]<'z')&&(translated[i]>'a')) + translated[i]=translated[i]-'a'+'A'; + } + } + *config = curconfig; + return translated; + } + + translated = rccSizedRecodeCharsets(ctx, "UTF-8", "US-ASCII//TRANSLIT", utfstring, 0, NULL); + if (translated) *config = curconfig; + return translated; + } + if (translate == RCC_OPTION_TRANSLATE_SKIP_RELATED) { if (rccAreRelatedLanguages(curconfig, *config)) return NULL; } -- cgit v1.2.3