From 9d4628df369b92016b7fc3bfc7fed6d06ff2ca9a Mon Sep 17 00:00:00 2001 From: "Suren A. Chilingaryan" Date: Sun, 7 Aug 2005 21:41:32 +0000 Subject: - Russian autoengine is renamed to LibRCD - Fix Learning with Language Autodetection switched on - Attempt to perform rccFS with Language Autodetection switched off, if failed with default behaviour. - Systematization of translation engine: + Rearangement of the translation modes: OFF, TO_ENGLISH, SKIP_RELATED, SKIP_PARRENT, FULL. + New class types: TRANSLATE_LOCALE, TRANSLATE_CURRENT, TRANSLATE_FROM. - Detect "Unicode" locales for foreign languages - "out" class is assumed to be TRANSLATE_LOCALE - Respect RCC_CLASS_KNOWN - Check for Latin UTF-8 prior to running any charset detection engine. --- src/engine.c | 48 ++++++++ src/engine.h | 1 + src/internal.h | 11 +- src/librcc.c | 47 ++++++-- src/librcc.h | 35 ++---- src/lngconfig.c | 37 +++--- src/lngconfig.h | 2 - src/lngrecode.c | 47 +++++++- src/rccconfig.c | 7 +- src/rccconfig.h | 1 + src/rcciconv.c | 2 + src/rcclocale.h | 3 + src/recode.c | 358 +++++++++++++++++++++++++++++++++----------------------- 13 files changed, 390 insertions(+), 209 deletions(-) (limited to 'src') diff --git a/src/engine.c b/src/engine.c index 8058faf..f9c2284 100644 --- a/src/engine.c +++ b/src/engine.c @@ -151,3 +151,51 @@ rcc_context rccEngineGetRccContext(rcc_engine_context ctx) { return ctx->config->ctx; } + +#define bit(i) (1<0) { + // Western is 0x100-0x17e + if ((buf[i]&0xC0)==0x80) bytes--; + else return 0; + } else { + if (buf[i]<128) continue; + + for (j=6;j>=0;j--) + if ((buf[i]&bit(j))==0) break; + + if ((j==0)||(j==6)) return 0; + + bytes=6-j; + if (bytes==1) { + // Western Languages (C2-C3) + if ((buf[i]!=0xC2)&&(buf[i]!=0xC3)) return 0; + } else return 0; + } + } + return 1; +} + + +rcc_autocharset_id rccEngineDetectCharset(rcc_engine_context ctx, const char *buf, size_t len) { + rcc_autocharset_id utf; + + if (CheckWestern(buf, len)) { + utf=rccConfigGetAutoCharsetByName(ctx->config, "UTF-8"); + if (utf != (rcc_autocharset_id)-1) return utf; + utf=rccConfigGetAutoCharsetByName(ctx->config, "UTF8"); + if (utf != (rcc_autocharset_id)-1) return utf; + utf=rccConfigGetAutoCharsetByName(ctx->config, "UTF_8"); + return utf; + } + + if ((ctx)&&(ctx->func)) return ctx->func(ctx, buf, len); + return (rcc_autocharset_id)-1; +} + diff --git a/src/engine.h b/src/engine.h index 445e962..96e6db6 100644 --- a/src/engine.h +++ b/src/engine.h @@ -38,5 +38,6 @@ void rccEngineFreeContext(rcc_engine_context engine_ctx); int rccEngineConfigure(rcc_engine_context ctx); rcc_charset_id rccAutoengineRussian(rcc_engine_context ctx, const char *buf, int len); +rcc_autocharset_id rccEngineDetectCharset(rcc_engine_context ctx, const char *buf, size_t len); #endif /* _RCC_ENGINE_H */ diff --git a/src/internal.h b/src/internal.h index d5797fc..089311f 100644 --- a/src/internal.h +++ b/src/internal.h @@ -28,11 +28,20 @@ #include "rcciconv.h" #include "rccstring.h" #include "rccmutex.h" +#include "rcclocale.h" typedef rcc_language_id rcc_language_parrent_list[RCC_MAX_LANGUAGE_PARRENTS]; +struct rcc_language_internal_t { + rcc_language language; + rcc_language_id parrents[RCC_MAX_LANGUAGE_PARRENTS + 1]; + unsigned char latin; +}; +typedef struct rcc_language_internal_t rcc_language_internal; +typedef rcc_language_internal *rcc_language_internal_ptr; + struct rcc_context_t { char locale_variable[RCC_MAX_VARIABLE_CHARS+1]; @@ -43,8 +52,8 @@ struct rcc_context_t { unsigned int max_languages; unsigned int n_languages; + rcc_language_internal *ilang; rcc_language_ptr *languages; - rcc_language_parrent_list *language_parrents; rcc_language_config configs; unsigned int max_classes; diff --git a/src/librcc.c b/src/librcc.c index 208fcb3..c27c47d 100644 --- a/src/librcc.c +++ b/src/librcc.c @@ -140,7 +140,7 @@ rcc_context rccCreateContext(const char *locale_variable, unsigned int max_langu rcc_context ctx; rcc_language_ptr *languages; - rcc_language_parrent_list *language_parrents; + rcc_language_internal *ilang; rcc_class_ptr *classes; rcc_language_config configs; rcc_iconv *from; @@ -167,18 +167,18 @@ rcc_context rccCreateContext(const char *locale_variable, unsigned int max_langu languages = (rcc_language_ptr*)malloc((max_languages+1)*sizeof(rcc_language_ptr)); classes = (rcc_class_ptr*)malloc((max_classes+1)*sizeof(rcc_class_ptr)); from = (rcc_iconv*)malloc((max_classes)*sizeof(rcc_iconv)); - language_parrents = (rcc_language_parrent_list*)malloc((max_languages+1)*sizeof(rcc_language_parrent_list)); + ilang = (rcc_language_internal*)malloc((max_languages+1)*sizeof(rcc_language_internal)); mutex = rccMutexCreate(); configs = (rcc_language_config)malloc((max_languages)*sizeof(struct rcc_language_config_t)); - if ((!ctx)||(!languages)||(!classes)||(!mutex)||(!language_parrents)) { + if ((!ctx)||(!languages)||(!classes)||(!mutex)||(!from)||(!ilang)||(!mutex)) { if (mutex) rccMutexFree(mutex); if (from) free(from); if (configs) free(configs); if (classes) free(classes); if (languages) free(languages); - if (language_parrents) free(language_parrents); + if (ilang) free(ilang); if (ctx) free(ctx); return NULL; } @@ -193,8 +193,7 @@ rcc_context rccCreateContext(const char *locale_variable, unsigned int max_langu for (i=0;rcc_default_aliases[i].alias;i++) rccRegisterLanguageAlias(ctx, rcc_default_aliases + i); - ctx->language_parrents = language_parrents; - for (i=0;iilang = ilang; ctx->languages = languages; ctx->max_languages = max_languages; @@ -306,7 +305,7 @@ void rccFreeContext(rcc_context ctx) { free(ctx->configs); } if (ctx->classes) free(ctx->classes); - if (ctx->language_parrents) free(ctx->language_parrents); + if (ctx->ilang) free(ctx->ilang); if (ctx->languages) free(ctx->languages); if (ctx->mutex) rccMutexFree(ctx->mutex); free(ctx); @@ -365,6 +364,7 @@ int rccUnlockConfiguration(rcc_context ctx, unsigned int lock_code) { } rcc_language_id rccRegisterLanguage(rcc_context ctx, rcc_language *language) { + unsigned int i; if (!ctx) { if (rcc_default_ctx) ctx = rcc_default_ctx; else return (rcc_language_id)-1; @@ -373,7 +373,21 @@ rcc_language_id rccRegisterLanguage(rcc_context ctx, rcc_language *language) { if (ctx->configuration_lock) return (rcc_language_id)-1; if (ctx->n_languages == ctx->max_languages) return (rcc_language_id)-1; - ctx->languages[ctx->n_languages++] = language; + + memcpy(ctx->ilang + ctx->n_languages, language, sizeof(rcc_language)); + ctx->ilang[ctx->n_languages].parrents[0] = (rcc_language_id)-1; + ctx->ilang[ctx->n_languages].latin = 0; + + for (i=0;language->charsets[i];i++) + if ((strstr(language->charsets[i],"8859"))&&(language->charsets[i][strlen(language->charsets[i])-1]=='1')) { + ctx->ilang[ctx->n_languages].latin = 1; + break; + } + + if ((i==1)&&(!language->charsets[1])&&(rccIsUTF8(language->charsets[0]))) + ctx->ilang[ctx->n_languages].latin = 1; + + ctx->languages[ctx->n_languages++] = (rcc_language_ptr)(ctx->ilang + ctx->n_languages); ctx->languages[ctx->n_languages] = NULL; if (!ctx->current_language) @@ -388,6 +402,10 @@ rcc_charset_id rccLanguageRegisterCharset(rcc_language *language, rcc_charset ch if ((!language)||(!charset)) return (rcc_charset_id)-1; for (i=0;language->charsets[i];i++); if (i>=RCC_MAX_CHARSETS) return (rcc_charset_id)-1; + + if ((strstr(charset,"8859"))&&(charset[strlen(charset)-1]=='1')) + ((rcc_language_internal*)language)->latin = 1; + language->charsets[i++] = charset; language->charsets[i] = NULL; return i-1; @@ -443,7 +461,7 @@ rcc_relation_id rccRegisterLanguageRelation(rcc_context ctx, rcc_language_relati if (language_id == (rcc_language_id)-1) return (rcc_relation_id)-1; - list = ctx->language_parrents[language_id]; + list = ((rcc_language_internal*)ctx->languages[language_id])->parrents; language_id = rccGetLanguageByName(ctx, parrent); if (language_id == (rcc_language_id)-1) return (rcc_relation_id)0; @@ -478,6 +496,8 @@ rcc_class_id rccRegisterClass(rcc_context ctx, rcc_class *cl) { rcc_class_type rccGetClassType(rcc_context ctx, rcc_class_id class_id) { + rcc_class_type clt; + if (!ctx) { if (rcc_default_ctx) ctx = rcc_default_ctx; else return RCC_CLASS_INVALID; @@ -485,7 +505,14 @@ rcc_class_type rccGetClassType(rcc_context ctx, rcc_class_id class_id) { if ((class_id<0)||(class_id>=ctx->n_classes)) return RCC_CLASS_INVALID; - return ctx->classes[class_id]->class_type; + /*DS: temporary solution */ + + clt = ctx->classes[class_id]->class_type; + + if ((!strcasecmp(ctx->classes[class_id]->name, "out"))&&(clt == RCC_CLASS_STANDARD)) + clt = RCC_CLASS_TRANSLATE_LOCALE; + + return clt; } diff --git a/src/librcc.h b/src/librcc.h index 63a6f80..0529682 100644 --- a/src/librcc.h +++ b/src/librcc.h @@ -13,13 +13,6 @@ #define RCC_MAX_ALIASES 64 #define RCC_MAX_CLASSES 16 -#define RCC_MAX_ERRORS 3 - -#define RCC_MAX_CHARSET_CHARS 16 -#define RCC_MAX_LANGUAGE_CHARS 16 -#define RCC_MAX_VARIABLE_CHARS 16 - - /* ID's */ /** * Language ID. @@ -301,7 +294,10 @@ typedef enum rcc_class_type_t { RCC_CLASS_INVALID = 0, /**< Invalid value */ RCC_CLASS_STANDARD, /**< Standard class */ RCC_CLASS_KNOWN, /**< Class encoding is known and no autodetection should be performed */ - RCC_CLASS_FS /**< Class strings are representing file names */ + RCC_CLASS_FS, /**< Class strings are representing file names */ + RCC_CLASS_TRANSLATE_LOCALE, /**< It is permited to translate class strings to current Locale Language in rccTo */ + RCC_CLASS_TRANSLATE_CURRENT,/**< It is permited to translate class strings to Current Language in rccTo */ + RCC_CLASS_TRANSLATE_FROM, /**< It is permited to translate class strings to Current Language in rccFrom */ } rcc_class_type; /** @@ -390,22 +386,13 @@ typedef int rcc_option_value; */ #define RCC_OPTION_LEARNING_FLAG_LEARN 2 -/** - * Switch translation off. - */ -#define RCC_OPTION_TRANSLATE_OFF 0 -/** - * Translate data to english language (Current language don't matter). - */ -#define RCC_OPTION_TRANSLATE_TO_ENGLISH 1 -/** - * Skip translation of the english text. - */ -#define RCC_OPTION_TRANSLATE_SKIP_ENGLISH 2 -/** - * Translate whole data to the current language. - */ -#define RCC_OPTION_TRANSLATE_FULL 3 +typedef enum rcc_option_translate_t { + RCC_OPTION_TRANSLATE_OFF = 0, /**< Switch translation off. */ + RCC_OPTION_TRANSLATE_TO_ENGLISH, /**< Translate data to english language (Current language don't matter). */ + RCC_OPTION_TRANSLATE_SKIP_RELATED, /**< Skip translation of the text's between related languages. */ + RCC_OPTION_TRANSLATE_SKIP_PARRENT, /**< Skip translation of the text's from parrent languages (from english). */ + RCC_OPTION_TRANSLATE_FULL /**< Translate whole data to the current language */ +} rcc_option_translate; /** * List of options available diff --git a/src/lngconfig.c b/src/lngconfig.c index f9d1d6d..7e5a428 100644 --- a/src/lngconfig.c +++ b/src/lngconfig.c @@ -353,7 +353,7 @@ rcc_speller rccConfigGetSpeller(rcc_language_config config) { if (config->speller) language_id = rccConfigGetLanguage(config); else language_id = (rcc_language_id)-1; - if (language_id != (rcc_language_id)-1) parrents = config->ctx->language_parrents[language_id]; + if (language_id != (rcc_language_id)-1) parrents = ((rcc_language_internal*)config->language)->parrents; else parrents = NULL; if (parrents) { @@ -508,10 +508,12 @@ rcc_charset_id rccConfigGetCurrentCharset(rcc_language_config config, rcc_class_ if (config->default_charset[class_id]) return config->default_charset[class_id]; - charset_id = rccConfigGetLocaleCharset(config, defvalue); - if ((charset_id != 0)&&(charset_id != (rcc_charset_id)-1)) { - config->default_charset[class_id] = charset_id; - return charset_id; + if (cl->defvalue) { + charset_id = rccConfigGetLocaleCharset(config, defvalue); + if ((charset_id != 0)&&(charset_id != (rcc_charset_id)-1)) { + config->default_charset[class_id] = charset_id; + return charset_id; + } } if (cl->defvalue) { @@ -537,7 +539,7 @@ rcc_charset_id rccConfigGetCurrentCharset(rcc_language_config config, rcc_class_ } } - charset_id = rccConfigGetLocaleUnicodeCharset(config, defvalue); + charset_id = rccConfigGetLocaleCharset(config, defvalue); if ((charset_id != 0)&&(charset_id != (rcc_charset_id)-1)) { config->default_charset[class_id] = charset_id; return charset_id; @@ -634,6 +636,7 @@ int rccConfigSetCharsetByName(rcc_language_config config, rcc_class_id class_id, rcc_charset_id rccConfigGetLocaleCharset(rcc_language_config config, const char *locale_variable) { const char *lv; rcc_language_id language_id; + char lang[RCC_MAX_CHARSET_CHARS+1]; char stmp[RCC_MAX_CHARSET_CHARS+1]; if ((!config)||(!config->language)) return (rcc_charset_id)-1; @@ -642,29 +645,17 @@ rcc_charset_id rccConfigGetLocaleCharset(rcc_language_config config, const char language_id = rccGetLanguageByName(config->ctx, config->language->sn); if (language_id != (rcc_language_id)-1) { - if (!rccLocaleGetLanguage(stmp, lv, RCC_MAX_CHARSET_CHARS)) { - if (!strcmp(config->language->sn, stmp)) { - if (!rccLocaleGetCharset(stmp, lv, RCC_MAX_CHARSET_CHARS)) - return rccConfigGetCharsetByName(config, stmp); - } + if (!rccLocaleGetCharset(stmp, lv, RCC_MAX_CHARSET_CHARS)) { + if (rccIsUnicode(stmp)) + return rccConfigGetCharsetByName(config, stmp); + if ((!rccLocaleGetLanguage(lang, lv, RCC_MAX_CHARSET_CHARS))&&(!strcmp(config->language->sn, lang))) + return rccConfigGetCharsetByName(config, stmp); } } return (rcc_charset_id)-1; } -rcc_charset_id rccConfigGetLocaleUnicodeCharset(rcc_language_config config, const char *locale_variable) { - char stmp[RCC_MAX_CHARSET_CHARS+1]; - - if ((!config)||(!config->language)) return (rcc_charset_id)-1; - - if (!rccLocaleGetCharset(stmp, locale_variable?locale_variable:config->ctx->locale_variable, RCC_MAX_CHARSET_CHARS)) { - if (rccIsUTF8(stmp)) return rccConfigGetCharsetByName(config, stmp); - } - - return (rcc_charset_id)-1; -} - int rccConfigConfigure(rcc_language_config config) { int err; rcc_context ctx; diff --git a/src/lngconfig.h b/src/lngconfig.h index edfc782..b9e9a6b 100644 --- a/src/lngconfig.h +++ b/src/lngconfig.h @@ -47,8 +47,6 @@ void rccConfigClear(rcc_language_config config); int rccConfigConfigure(rcc_language_config config); -rcc_charset_id rccConfigGetLocaleUnicodeCharset(rcc_language_config config, const char *locale_variable); - const char *rccConfigGetAutoCharsetName(rcc_language_config config, rcc_autocharset_id charset_id); rcc_autocharset_id rccConfigGetAutoCharsetByName(rcc_language_config config, const char *name); diff --git a/src/lngrecode.c b/src/lngrecode.c index aef8e24..4b4f298 100644 --- a/src/lngrecode.c +++ b/src/lngrecode.c @@ -7,8 +7,38 @@ #include "internal.h" #include "fs.h" +static rcc_autocharset_id rccConfigDetectCharsetInternal(rcc_language_config config, rcc_class_id class_id, const char *buf, size_t len) { + int err; + rcc_context ctx; + rcc_class_type class_type; + rcc_autocharset_id autocharset_id; + + if ((!buf)||(!config)) return (rcc_autocharset_id)-1; + + ctx = config->ctx; + + err = rccConfigConfigure(config); + if (err) return (rcc_autocharset_id)-1; + + class_type = rccGetClassType(ctx, class_id); + if ((class_type != RCC_CLASS_FS)||((class_type == RCC_CLASS_FS)&&(rccGetOption(ctx, RCC_OPTION_AUTODETECT_FS_TITLES)))) { + rccMutexLock(config->mutex); + autocharset_id = rccEngineDetectCharset(&config->engine_ctx, buf, len); + rccMutexUnLock(config->mutex); + return autocharset_id; + } + + return (rcc_autocharset_id)-1; +} + + +rcc_autocharset_id rccConfigDetectCharset(rcc_language_config config, rcc_class_id class_id, const char *buf, size_t len) { + return rccConfigDetectCharsetInternal(config, class_id, buf, len); +} + rcc_string rccConfigSizedFrom(rcc_language_config config, rcc_class_id class_id, const char *buf, size_t len) { rcc_context ctx; + rcc_class_type class_type; rcc_string result; rcc_option_value usedb4; rcc_autocharset_id charset_id; @@ -30,7 +60,10 @@ rcc_string rccConfigSizedFrom(rcc_language_config config, rcc_class_id class_id, } } - charset_id = rccConfigDetectCharset(config, class_id, buf, len); + class_type = rccGetClassType(ctx, class_id); + + if (class_type == RCC_CLASS_KNOWN) charset_id = (rcc_autocharset_id)-1; + else charset_id = rccConfigDetectCharset(config, class_id, buf, len); if (charset_id != (rcc_autocharset_id)-1) charset = rccConfigGetAutoCharsetName(config, charset_id); else @@ -71,6 +104,7 @@ char *rccConfigSizedTo(rcc_language_config config, rcc_class_id class_id, rcc_co char *rccConfigSizedRecode(rcc_language_config config, rcc_class_id from, rcc_class_id to, const char *buf, size_t len, size_t *rlen) { rcc_context ctx; + rcc_class_type class_type; rcc_string result; rcc_option_value usedb4; rcc_autocharset_id charset_id; @@ -97,7 +131,10 @@ char *rccConfigSizedRecode(rcc_language_config config, rcc_class_id from, rcc_cl } } - charset_id = rccConfigDetectCharset(config, from, buf, len); + class_type = rccGetClassType(ctx, from); + + if (class_type == RCC_CLASS_KNOWN) charset_id = (rcc_autocharset_id)-1; + else charset_id = rccConfigDetectCharset(config, from, buf, len); if (charset_id != (rcc_autocharset_id)-1) fromcharset = rccConfigGetAutoCharsetName(config, charset_id); else @@ -115,6 +152,7 @@ char *rccConfigSizedRecode(rcc_language_config config, rcc_class_id from, rcc_cl char *rccConfigSizedRecodeToCharset(rcc_language_config config, rcc_class_id class_id, const char *charset, rcc_const_string buf, size_t len, size_t *rlen) { rcc_context ctx; + rcc_class_type class_type; rcc_string result; rcc_option_value usedb4; rcc_autocharset_id charset_id; @@ -141,7 +179,10 @@ char *rccConfigSizedRecodeToCharset(rcc_language_config config, rcc_class_id cla } } - charset_id = rccConfigDetectCharset(config, class_id, buf, len); + class_type = rccGetClassType(ctx, class_id); + + if (class_type == RCC_CLASS_KNOWN) charset_id = (rcc_autocharset_id)-1; + else charset_id = rccConfigDetectCharset(config, class_id, buf, len); if (charset_id != (rcc_autocharset_id)-1) ocharset = rccConfigGetAutoCharsetName(config, charset_id); else diff --git a/src/rccconfig.c b/src/rccconfig.c index a54b778..5fecb6b 100644 --- a/src/rccconfig.c +++ b/src/rccconfig.c @@ -127,7 +127,7 @@ rcc_language rcc_default_languages_embeded[RCC_MAX_LANGUAGES + 1] = { rcc_option_value_name rcc_sn_boolean[] = { "OFF", "ON", NULL }; rcc_option_value_name rcc_sn_learning[] = { "OFF", "ON", "RELEARN", "LEARN", NULL }; rcc_option_value_name rcc_sn_clo[] = { "ALL", "CONFIGURED_AND_AUTO", "CONFIGURED_ONLY", NULL }; -rcc_option_value_name rcc_sn_translate[] = { "OFF", "TO_ENGLISH", "SKIP_ENGLISH", "FULL", NULL }; +rcc_option_value_name rcc_sn_translate[] = { "OFF", "TO_ENGLISH", "SKIP_RELATED", "SKIP_PARRENT", "FULL", NULL }; rcc_option_description rcc_option_descriptions[RCC_MAX_OPTIONS+1]; rcc_option_description rcc_option_descriptions_embeded[RCC_MAX_OPTIONS+1] = { @@ -197,6 +197,11 @@ int rccIsUTF8(const char *name) { return 1; } +int rccIsUnicode(const char *name) { + if ((!name)||(strncasecmp(name, "UTF",3)&&strncasecmp(name, "UCS",3))) return 0; + return 1; +} + unsigned int rccDefaultDropLanguageRelations(const char *lang) { unsigned long i, j; for (i=0,j=0;rcc_default_relations[i].lang;i++) { diff --git a/src/rccconfig.h b/src/rccconfig.h index fe7b912..7361910 100644 --- a/src/rccconfig.h +++ b/src/rccconfig.h @@ -38,5 +38,6 @@ rcc_language_id rccDefaultGetLanguageByName(const char *name); unsigned int rccDefaultDropLanguageRelations(const char *lang); int rccIsUTF8(const char *name); +int rccIsUnicode(const char *name); #endif /* _RCC_CONFIG_H */ diff --git a/src/rcciconv.c b/src/rcciconv.c index 93278a7..b518cd7 100644 --- a/src/rcciconv.c +++ b/src/rcciconv.c @@ -7,6 +7,8 @@ #include "internal.h" #include "rcciconv.h" +#define RCC_MAX_ERRORS 3 + static void rccIConvCopySymbol(char **in_buf, int *in_left, char **out_buf, int *out_left) { if ((out_left>0)&&(in_left>0)) { /* (**out_buf)=(**in_buf); diff --git a/src/rcclocale.h b/src/rcclocale.h index dc2c4e7..b6832ed 100644 --- a/src/rcclocale.h +++ b/src/rcclocale.h @@ -1,5 +1,8 @@ #ifndef _RCC_LOCALE_H #define _RCC_LOCALE_H +#define RCC_MAX_CHARSET_CHARS 16 +#define RCC_MAX_LANGUAGE_CHARS 16 +#define RCC_MAX_VARIABLE_CHARS 16 #endif /* _RCC_LOCALE_H */ diff --git a/src/recode.c b/src/recode.c index 27dff92..ee9ac53 100644 --- a/src/recode.c +++ b/src/recode.c @@ -21,10 +21,17 @@ #define RCC_ACCEPTABLE_PROBABILITY 0 #define RCC_ACCEPTABLE_LENGTH 3 -static rcc_language_id rccDetectLanguageInternal(rcc_context ctx, rcc_class_id class_id, const char *buf, size_t len, rcc_string *retstring) { +typedef enum rcc_detect_language_confidence_t { + RCC_DETECT_LANGUAGE_CONFIDENCE_UNSURE = 0, + RCC_DETECT_LANGUAGE_CONFIDENCE_ALMOST, + RCC_DETECT_LANGUAGE_CONFIDENCE_SURE, + RCC_DETECT_LANGUAGE_CONFIDENCE_CACHED +} rcc_detect_language_confidence; + +static rcc_language_id rccDetectLanguageInternal(rcc_context ctx, rcc_class_id class_id, const char *buf, size_t len, rcc_string *retstring, rcc_detect_language_confidence *confidence) { rcc_speller speller = NULL; - unsigned long i, nlanguages; - rcc_language_config config, config0 = NULL; + long i, nlanguages; + rcc_language_config config, config0 = NULL, config1 = NULL; rcc_string recoded; unsigned char *utf8; size_t j, mode; @@ -48,6 +55,9 @@ static rcc_language_id rccDetectLanguageInternal(rcc_context ctx, rcc_class_id c unsigned long k; rcc_language_id *parrents; size_t chars = 0; + char llang[RCC_MAX_LANGUAGE_CHARS]; + rcc_language_id locale_lang; + unsigned char defstep = 0; unsigned long accepted_nonenglish_langs = 0; @@ -61,6 +71,7 @@ static rcc_language_id rccDetectLanguageInternal(rcc_context ctx, rcc_class_id c english_lang = rccStringGetLanguage(recoded); if (retstring) *retstring = recoded; else free(recoded); + if (confidence) *confidence = RCC_DETECT_LANGUAGE_CONFIDENCE_CACHED; return english_lang; } } @@ -72,17 +83,33 @@ static rcc_language_id rccDetectLanguageInternal(rcc_context ctx, rcc_class_id c english_lang = rccGetLanguageByName(ctx, rcc_english_language_sn); - for (i=0;i1)?i++:i) { if (i) { - if (config==config0) continue; - } else config0=config; + config = rccGetUsableConfig(ctx, (rcc_language_id)i); + if ((!config)||(config==config0)||(config==config1)) continue; + } else { + switch (defstep) { + case 0: + config = rccGetCurrentConfig(ctx); + config0 = config; + break; + case 1: + if (!rccLocaleGetLanguage(llang ,ctx->locale_variable, RCC_MAX_LANGUAGE_CHARS)) { + locale_lang = rccGetLanguageByName(ctx, llang); + config = rccGetConfig(ctx, locale_lang); + } else config = NULL; + config1 = config; + break; + default: + config = NULL; + } + defstep++; + if ((!config)||(config0==config1)) continue; + } + if (bestfixlang != (rcc_language_id)-1) { - parrents = ctx->language_parrents[i]; + parrents = ((rcc_language_internal*)config->language)->parrents; for (k = 0;parrents[k] != (rcc_language_id)-1;k++) if (parrents[k] == bestfixlang) break; @@ -192,6 +219,8 @@ static rcc_language_id rccDetectLanguageInternal(rcc_context ctx, rcc_class_id c if (english_string) free(english_string); if (retstring) *retstring = best_string; else if (best_string) free(best_string); + + if (confidence) *confidence = RCC_DETECT_LANGUAGE_CONFIDENCE_SURE; return bestlang; } @@ -199,6 +228,8 @@ static rcc_language_id rccDetectLanguageInternal(rcc_context ctx, rcc_class_id c if (best_string) free(best_string); if (retstring) *retstring = english_string; else if (english_string) free(english_string); + + if (confidence) *confidence = RCC_DETECT_LANGUAGE_CONFIDENCE_SURE; return english_lang; } @@ -206,6 +237,8 @@ static rcc_language_id rccDetectLanguageInternal(rcc_context ctx, rcc_class_id c if (english_string) free(english_string); if (retstring) *retstring = best_string; else if (best_string) free(best_string); + + if (confidence) *confidence = RCC_DETECT_LANGUAGE_CONFIDENCE_ALMOST; return bestlang; } @@ -213,6 +246,8 @@ static rcc_language_id rccDetectLanguageInternal(rcc_context ctx, rcc_class_id c if (best_string) free(best_string); if (retstring) *retstring = english_string; else if (english_string) free(english_string); + + if (confidence) *confidence = RCC_DETECT_LANGUAGE_CONFIDENCE_ALMOST; return english_lang; } @@ -220,89 +255,152 @@ static rcc_language_id rccDetectLanguageInternal(rcc_context ctx, rcc_class_id c if (english_string) free(english_string); if (retstring) *retstring = best_string; else if (best_string) free(best_string); + + if (confidence) *confidence = RCC_DETECT_LANGUAGE_CONFIDENCE_UNSURE; return bestlang; } else if (best_string) free(best_string); if ((english_res > RCC_ACCEPTABLE_PROBABILITY)&&(english_longest > RCC_ACCEPTABLE_LENGTH)) { if (retstring) *retstring = english_string; else if (english_string) free(english_string); + + if (confidence) *confidence = RCC_DETECT_LANGUAGE_CONFIDENCE_UNSURE; return english_lang; } else if (english_string) free(english_string); return (rcc_language_id)-1; } - rcc_language_id rccDetectLanguage(rcc_context ctx, rcc_class_id class_id, const char *buf, size_t len) { if (!ctx) { if (rcc_default_ctx) ctx = rcc_default_ctx; else return -1; } - return rccDetectLanguageInternal(ctx, class_id, buf, len, NULL); + return rccDetectLanguageInternal(ctx, class_id, buf, len, NULL, NULL); } -static rcc_autocharset_id rccConfigDetectCharsetInternal(rcc_language_config config, rcc_class_id class_id, const char *buf, size_t len) { - int err; - rcc_context ctx; - rcc_class_type class_type; - rcc_engine_ptr engine; - rcc_autocharset_id autocharset_id; - - if ((!buf)||(!config)) return (rcc_autocharset_id)-1; +static int rccIsParrentLanguage(rcc_language_config config, rcc_language_id parrent) { + unsigned int i; + rcc_language_id language; + rcc_language_id *list; + + language = rccConfigGetLanguage(config); + if (parrent == language) return 1; - ctx = config->ctx; + list = ((rcc_language_internal*)config->language)->parrents; + for (i=0;list[i] != (rcc_language_id)-1;i++) + if (list[i] == parrent) return 1; - err = rccConfigConfigure(config); - if (err) return (rcc_autocharset_id)-1; + return 0; +} + + +static int rccAreRelatedLanguages(rcc_language_config c1, rcc_language_config c2) { + rcc_language_id l1, l2; + + l1 = rccConfigGetLanguage(c1); + l2 = rccConfigGetLanguage(c2); - class_type = rccGetClassType(ctx, class_id); - if ((class_type != RCC_CLASS_FS)||((class_type == RCC_CLASS_FS)&&(rccGetOption(ctx, RCC_OPTION_AUTODETECT_FS_TITLES)))) { - rccMutexLock(config->mutex); - engine = rccConfigGetCurrentEnginePointer(config); - if ((engine)&&(engine->func)) autocharset_id = engine->func(&config->engine_ctx, buf, len); - else autocharset_id = (rcc_autocharset_id)-1; - rccMutexUnLock(config->mutex); - return autocharset_id; - } + if (rccIsParrentLanguage(c1, l2)) return 1; + if (rccIsParrentLanguage(c2, l1)) return 1; - return (rcc_autocharset_id)-1; + return 0; } -rcc_autocharset_id rccConfigDetectCharset(rcc_language_config config, rcc_class_id class_id, const char *buf, size_t len) { - return rccConfigDetectCharsetInternal(config, class_id, buf, len); -} +static char *rccRecodeTranslate(rcc_language_config *config, rcc_class_id class_id, const char *utfstring) { + rcc_context ctx; + rcc_language_config curconfig; + + rcc_option_value translate; + rcc_class_type ctype; + rcc_language_id language_id, english_language_id, current_language_id; -static int rccAreLanguagesRelated(rcc_context ctx, rcc_language_id l1, rcc_language_id l2, rcc_language_id skip) { - unsigned int i; - rcc_language_id *list; + char llang[RCC_MAX_LANGUAGE_CHARS]; - if ((l1 == skip)||(l2 == skip)) return 0; + rcc_translate trans, entrans; - if (l1 == l2) return 1; + char *translated; + + ctx = (*config)->ctx; + + translate = rccGetOption(ctx, RCC_OPTION_TRANSLATE); + if (translate == RCC_OPTION_TRANSLATE_OFF) return NULL; + + ctype = rccGetClassType(ctx, class_id); + if ((ctype != RCC_CLASS_TRANSLATE_LOCALE)&&(ctype != RCC_CLASS_TRANSLATE_CURRENT)&&(ctype != RCC_CLASS_TRANSLATE_FROM)) return NULL; + + language_id = rccConfigGetLanguage(*config); + + english_language_id = rccGetLanguageByName(ctx, rcc_english_language_sn); - list = ctx->language_parrents[l1]; - for (i=0;list[i] != (rcc_language_id)-1;i++) - if (list[i] == l2) return 1; + if (translate == RCC_OPTION_TRANSLATE_TO_ENGLISH) { + current_language_id = english_language_id ; + } else { + if (ctype == RCC_CLASS_TRANSLATE_LOCALE) { + if (!rccLocaleGetLanguage(llang ,ctx->locale_variable, RCC_MAX_LANGUAGE_CHARS)) + current_language_id = rccGetLanguageByName(ctx, llang); + else + current_language_id = (rcc_language_id)-1; + } else + current_language_id = rccGetCurrentLanguage(ctx); + } + + if (current_language_id == (rcc_language_id)-1) return NULL; + if (language_id == current_language_id) return NULL; - list = ctx->language_parrents[l2]; - for (i=0;list[i] != (rcc_language_id)-1;i++) - if (list[i] == l1) return 1; + curconfig = rccGetConfig(ctx, current_language_id); + if (!curconfig) return NULL; - return 0; + if (rccConfigConfigure(curconfig)) return NULL; + + if (translate == RCC_OPTION_TRANSLATE_SKIP_RELATED) { + if (rccAreRelatedLanguages(curconfig, *config)) return NULL; + } + + if (translate == RCC_OPTION_TRANSLATE_SKIP_PARRENT) { + if (rccIsParrentLanguage(curconfig, language_id)) return NULL; + } + + trans = rccConfigGetTranslator(*config, current_language_id); + if (trans) { + translated = rccTranslate(trans, utfstring); + if (translated) { + if ((!((rcc_language_internal*)curconfig->language)->latin)&&(rccIsASCII(translated))) { + free(translated); + translated = NULL; + } + } + } else translated = NULL; + + if ((!translated)&&(current_language_id != english_language_id)&&(!rccAreRelatedLanguages(*config, curconfig))) { + curconfig = rccGetConfig(ctx, english_language_id); + if (!curconfig) return NULL; + if (rccConfigConfigure(curconfig)) return NULL; + + entrans = rccConfigGetEnglishTranslator(*config); + if (entrans) translated = rccTranslate(entrans, utfstring); + } + + if (translated) *config = curconfig; + return translated; } rcc_string rccSizedFrom(rcc_context ctx, rcc_class_id class_id, const char *buf, size_t len) { int err; size_t ret; + rcc_language_config config; rcc_language_id language_id, detected_language_id; rcc_autocharset_id charset_id; rcc_iconv icnv = NULL; rcc_string result; + rcc_class_type class_type; rcc_option_value usedb4; const char *charset; + char *translate = NULL; + rcc_detect_language_confidence confidence; if (!ctx) { if (rcc_default_ctx) ctx = rcc_default_ctx; @@ -318,29 +416,38 @@ rcc_string rccSizedFrom(rcc_context ctx, rcc_class_id class_id, const char *buf, if (language_id == (rcc_language_id)-1) return NULL; if (!strcasecmp(ctx->languages[language_id]->sn, rcc_disabled_language_sn)) return NULL; - + class_type = rccGetClassType(ctx, class_id); usedb4 = rccGetOption(ctx, RCC_OPTION_LEARNING_MODE); -/* - if (usedb4&RCC_OPTION_LEARNING_FLAG_USE) { - result = rccDb4GetKey(ctx->db4ctx, buf, len); - if (result) { - if (rccStringFixID(result, ctx)) free(result); - else return result; - } - } - - if (rccGetOption(ctx, RCC_OPTION_AUTODETECT_LANGUAGE)) { - detected_language_id = rccDetectLanguageInternal(ctx, class_id, buf, len); - if (detected_language_id != (rcc_language_id)-1) - language_id = detected_language_id; - } -*/ - detected_language_id = rccDetectLanguageInternal(ctx, class_id, buf, len, &result); + detected_language_id = rccDetectLanguageInternal(ctx, class_id, buf, len, &result, &confidence); if (detected_language_id != (rcc_language_id)-1) { #ifdef RCC_DEBUG_LANGDETECT - printf("Language %i(%s): %s\n", rccStringGetLanguage(result), rccStringGetLanguage(result)?rccGetLanguageName(ctx, rccStringGetLanguage(result)):"", result); + printf("Language %i(%s): %s\n", rccStringGetLanguage(result), rccStringGetLanguage(result)?rccGetLanguageName(ctx, rccStringGetLanguage(result)):"", result); #endif /* RCC_DEBUG_LANGDETECT */ + + if ((result)&&(rccGetOption(ctx, RCC_OPTION_TRANSLATE))&&(class_type == RCC_CLASS_TRANSLATE_FROM)) { + rccMutexLock(ctx->mutex); + config = rccGetCurrentConfig(ctx); + translate = rccRecodeTranslate(&config, class_id, rccStringGetString(result)); + rccMutexUnLock(ctx->mutex); + + if (translate) { + language_id = rccConfigGetLanguage(config); + free(result); + result = rccCreateString(language_id, translate, 0); + } + } + + + if ((result)&& + (usedb4&RCC_OPTION_LEARNING_FLAG_LEARN)&& + (confidence!=RCC_DETECT_LANGUAGE_CONFIDENCE_CACHED)&& + ((language_id==detected_language_id)||(confidence!=RCC_DETECT_LANGUAGE_CONFIDENCE_UNSURE))&& + (!rccStringSetLang(result, ctx->languages[language_id]->sn))) { + + rccDb4SetKey(ctx->db4ctx, buf, len, result); + } + return result; } @@ -349,7 +456,8 @@ rcc_string rccSizedFrom(rcc_context ctx, rcc_class_id class_id, const char *buf, if (err) return NULL; rccMutexLock(ctx->mutex); - charset_id = rccDetectCharset(ctx, class_id, buf, len); + if (class_type == RCC_CLASS_KNOWN) charset_id = (rcc_autocharset_id)-1; + else charset_id = rccDetectCharset(ctx, class_id, buf, len); if (charset_id != (rcc_autocharset_id)-1) { icnv = ctx->iconv_auto[charset_id]; if (rccGetOption(ctx, RCC_OPTION_AUTOENGINE_SET_CURRENT)) { @@ -362,10 +470,24 @@ rcc_string rccSizedFrom(rcc_context ctx, rcc_class_id class_id, const char *buf, if (icnv) { ret = rccIConvInternal(ctx, icnv, buf, len); if (ret == (size_t)-1) return NULL; - result = rccCreateString(language_id, ctx->tmpbuffer, ret); + + if ((rccGetOption(ctx, RCC_OPTION_TRANSLATE))&&(rccGetClassType(ctx, class_id) == RCC_CLASS_TRANSLATE_FROM)) { + config = rccGetCurrentConfig(ctx); + translate = rccRecodeTranslate(&config , class_id, ctx->tmpbuffer); + if (translate) language_id = rccConfigGetLanguage(config); + } + + result = rccCreateString(language_id, translate?translate:ctx->tmpbuffer, translate?0:ret); } else { - result = rccCreateString(language_id, buf, len); + if ((rccGetOption(ctx, RCC_OPTION_TRANSLATE))&&(rccGetClassType(ctx, class_id) == RCC_CLASS_TRANSLATE_FROM)) { + config = rccGetCurrentConfig(ctx); + translate = rccRecodeTranslate(&config , class_id, buf); + if (translate) language_id = rccConfigGetLanguage(config); + } + + result = rccCreateString(language_id, translate?translate:buf, translate?0:len); } + rccMutexUnLock(ctx->mutex); if ((result)&&(usedb4&RCC_OPTION_LEARNING_FLAG_LEARN)) { @@ -385,13 +507,7 @@ char *rccSizedTo(rcc_context ctx, rcc_class_id class_id, rcc_const_string buf, s char *translated = NULL; rcc_language_config config; rcc_language_id language_id; - rcc_language_id current_language_id; - rcc_language_id english_language_id; rcc_class_type class_type; - rcc_option_value translate; - rcc_translate trans, entrans; - const char *langname; - unsigned char english_source; rcc_iconv icnv; if (!ctx) { @@ -414,74 +530,10 @@ char *rccSizedTo(rcc_context ctx, rcc_class_id class_id, rcc_const_string buf, s if (err) return NULL; class_type = rccGetClassType(ctx, class_id); - translate = rccGetOption(ctx, RCC_OPTION_TRANSLATE); - langname = rccGetLanguageName(ctx, language_id); - if (strcasecmp(langname, rcc_english_language_sn)) english_source = 0; - else english_source = 1; - - if ((class_type != RCC_CLASS_FS)&&((translate==RCC_OPTION_TRANSLATE_FULL)||((translate)&&(!english_source)))) { - english_language_id = rccGetLanguageByName(ctx, rcc_english_language_sn); - + if (((class_type == RCC_CLASS_TRANSLATE_LOCALE)||(class_type == RCC_CLASS_TRANSLATE_CURRENT))&&(rccGetOption(ctx, RCC_OPTION_TRANSLATE))) { rccMutexLock(ctx->mutex); - - current_language_id = rccGetCurrentLanguage(ctx); - if (current_language_id != language_id) { - if (translate != RCC_OPTION_TRANSLATE_TO_ENGLISH) { - trans = rccConfigGetTranslator(config, current_language_id); - if (trans) { - translated = rccTranslate(trans, utfstring); - if (translated) { - if ((current_language_id != english_language_id)&&(rccIsASCII(translated))) { - /* Ffrench to german (no umlauts) => not related - english to german (no umlauts) => skiping english relations - DS: Problem if we have relation between french and german */ - if (rccAreLanguagesRelated(ctx, language_id, current_language_id, english_language_id)) { - free(translated); - translated = NULL; - translate = 0; - } - } - } - if (translated) { - language_id = current_language_id; - - config = rccGetConfig(ctx, language_id); - if (!config) { - rccMutexUnLock(ctx->mutex); - free(translated); - return NULL; - } - - err = rccConfigConfigure(config); - if (err) { - rccMutexUnLock(ctx->mutex); - free(translated); - return NULL; - } - } - } - } - - if ((translate == RCC_OPTION_TRANSLATE_TO_ENGLISH)||((translate)&&(!translated)&&(!english_language_id == current_language_id)&&(!rccAreLanguagesRelated(ctx, language_id, current_language_id, (rcc_language_id)-1)))) { - entrans = rccConfigGetEnglishTranslator(config); - if (entrans) { - translated = rccTranslate(config->entrans, utfstring); -/* - config = rccGetConfig(ctx, language_id); - if (!config) { - rccMutexUnLock(ctx->mutex); - return translated; - } - - err = rccConfigConfigure(config); - if (err) { - rccMutexUnLock(ctx->mutex); - return translated; - }*/ - } - } - } + translated = rccRecodeTranslate(&config, class_id, utfstring); rccMutexUnLock(ctx->mutex); } @@ -492,7 +544,7 @@ char *rccSizedTo(rcc_context ctx, rcc_class_id class_id, rcc_const_string buf, s return result; } } - + rccMutexLock(ctx->mutex); rccMutexLock(config->mutex); icnv = config->iconv_to[class_id]; @@ -536,10 +588,14 @@ char *rccSizedRecode(rcc_context ctx, rcc_class_id from, rcc_class_id to, const if ((class_type == RCC_CLASS_FS)&&(rccGetOption(ctx, RCC_OPTION_AUTODETECT_FS_NAMES))) goto recoding; if (rccGetOption(ctx, RCC_OPTION_LEARNING_MODE)) goto recoding; if (rccGetOption(ctx, RCC_OPTION_AUTODETECT_LANGUAGE)) goto recoding; - if (rccGetOption(ctx, RCC_OPTION_TRANSLATE)) goto recoding; + if ((rccGetOption(ctx, RCC_OPTION_TRANSLATE))&&((class_type == RCC_CLASS_TRANSLATE_LOCALE)||(class_type == RCC_CLASS_TRANSLATE_CURRENT))) goto recoding; + + class_type = rccGetClassType(ctx, from); + if ((rccGetOption(ctx, RCC_OPTION_TRANSLATE))&&(class_type == RCC_CLASS_TRANSLATE_FROM)) goto recoding; rccMutexLock(ctx->mutex); - from_charset_id = rccDetectCharset(ctx, from, buf, len); + if (class_type == RCC_CLASS_KNOWN) from_charset_id = (rcc_autocharset_id)-1; + else from_charset_id = rccDetectCharset(ctx, from, buf, len); if (from_charset_id != (rcc_charset_id)-1) { from_charset = rccGetAutoCharsetName(ctx, from_charset_id); to_charset = rccGetCurrentCharsetName(ctx, to); @@ -606,6 +662,18 @@ char *rccFS(rcc_context ctx, rcc_class_id from, rcc_class_id to, const char *fsp rccMutexUnLock(config->mutex); rccMutexUnLock(ctx->mutex); } else result = NULL; + + if (!result) { + config = rccGetCurrentConfig(ctx); + if (config) { + rccMutexLock(ctx->mutex); + rccMutexLock(config->mutex); + result = rccFS3(config, to, prefix, rccStringGetString(string)); + rccMutexUnLock(config->mutex); + rccMutexUnLock(ctx->mutex); + } + } + free(string); } else result = NULL; -- cgit v1.2.3