summaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r-- src/internal.h 26
-rw-r--r-- src/librcc.c 70
-rw-r--r-- src/librcc.h 96
-rw-r--r-- src/lngconfig.c 41
-rw-r--r-- src/opt.c 4
-rw-r--r-- src/rccconfig.c 37
-rw-r--r-- src/rccconfig.h 5
-rw-r--r-- src/rccdb4.h 4
-rw-r--r-- src/rccspell.c 53
-rw-r--r-- src/rccspell.h 11
-rw-r--r-- src/rccstring.c 23
-rw-r--r-- src/rccstring.h 1
-rw-r--r-- src/rccxml.c 59
-rw-r--r-- src/recode.c 161
14 files changed, 501 insertions, 90 deletions
diff --git a/src/internal.h b/src/internal.h
index fcaa4c6..d5797fc 100644
--- a/src/internal.h
+++ b/src/internal.h
@@ -5,15 +5,8 @@
# define LIBRCC_DATA_DIR "/usr/lib/rcc"
#endif /* LIBRCC_DATA_DIR */
-#include "librcc.h"
-#include "recode.h"
-#include "engine.h"
-#include "lngconfig.h"
-#include "rccstring.h"
-#include "rccdb4.h"
-#include "rcciconv.h"
-#include "rccstring.h"
-#include "rccmutex.h"
+#define RCC_MAX_LANGUAGE_PARRENTS 4
+#define RCC_MAX_RELATIONS RCC_MAX_LANGUAGES
#ifdef HAVE_STRNLEN
#define STRNLEN(str,n) (n?strnlen(str,n):strlen(str))
@@ -26,6 +19,20 @@
#define RCC_MAX_PREFIX_CHARS 32
#define RCC_MIN_DB4_CHARS 3
+#include "librcc.h"
+#include "recode.h"
+#include "engine.h"
+#include "lngconfig.h"
+#include "rccstring.h"
+#include "rccdb4.h"
+#include "rcciconv.h"
+#include "rccstring.h"
+#include "rccmutex.h"
+
+
+
+typedef rcc_language_id rcc_language_parrent_list[RCC_MAX_LANGUAGE_PARRENTS]; /* parent ids of one language; readers stop at the first (rcc_language_id)-1 */
+
struct rcc_context_t {
char locale_variable[RCC_MAX_VARIABLE_CHARS+1];
@@ -37,6 +44,7 @@ struct rcc_context_t {
unsigned int max_languages;
unsigned int n_languages;
rcc_language_ptr *languages;
+ rcc_language_parrent_list *language_parrents;
rcc_language_config configs;
unsigned int max_classes;
diff --git a/src/librcc.c b/src/librcc.c
index 757b71b..208fcb3 100644
--- a/src/librcc.c
+++ b/src/librcc.c
@@ -58,6 +58,7 @@ rcc_compiled_configuration rccGetCompiledConfiguration() {
int rccInit() {
int err;
char *tmp;
+ unsigned long i, rpos;
#ifdef HAVE_PWD_H
struct passwd *pw;
@@ -78,12 +79,26 @@ int rccInit() {
if (!rcc_home_dir) rcc_home_dir = strdup("/");
memcpy(rcc_default_languages, rcc_default_languages_embeded, (RCC_MAX_LANGUAGES + 1)*sizeof(rcc_language));
+ memcpy(rcc_default_aliases, rcc_default_aliases_embeded, (RCC_MAX_ALIASES + 1)*sizeof(rcc_language_alias));
+ memcpy(rcc_default_relations, rcc_default_relations_embeded, (RCC_MAX_RELATIONS + 1)*sizeof(rcc_language_relation));
memcpy(rcc_option_descriptions, rcc_option_descriptions_embeded, (RCC_MAX_OPTIONS + 1)*sizeof(rcc_option_description));
#ifdef HAVE_LIBTRANSLATE
rccExternalInit();
#endif /* HAVE_LIBTRANSLATE */
+ for (rpos=0;rcc_default_relations[rpos].lang;rpos++);
+ for (i=0;rcc_default_languages[i].sn;i++) {
+ if (!strcasecmp(rcc_default_languages[i].sn, rcc_default_language_sn)) continue;
+ if (!strcasecmp(rcc_default_languages[i].sn, rcc_disabled_language_sn)) continue;
+ if (!strcasecmp(rcc_default_languages[i].sn, rcc_english_language_sn)) continue;
+
+ rcc_default_relations[rpos].lang = rcc_default_languages[i].sn;
+ rcc_default_relations[rpos++].parrent = rcc_english_language_sn;
+ }
+ rcc_default_relations[rpos].lang = NULL;
+ rcc_default_relations[rpos].parrent = NULL;
+
err = rccPluginInit();
if (!err) err = rccTranslateInit();
if (!err) err = rccXmlInit(1);
@@ -125,6 +140,7 @@ rcc_context rccCreateContext(const char *locale_variable, unsigned int max_langu
rcc_context ctx;
rcc_language_ptr *languages;
+ rcc_language_parrent_list *language_parrents;
rcc_class_ptr *classes;
rcc_language_config configs;
rcc_iconv *from;
@@ -151,16 +167,18 @@ rcc_context rccCreateContext(const char *locale_variable, unsigned int max_langu
languages = (rcc_language_ptr*)malloc((max_languages+1)*sizeof(rcc_language_ptr));
classes = (rcc_class_ptr*)malloc((max_classes+1)*sizeof(rcc_class_ptr));
from = (rcc_iconv*)malloc((max_classes)*sizeof(rcc_iconv));
+ language_parrents = (rcc_language_parrent_list*)malloc((max_languages+1)*sizeof(rcc_language_parrent_list));
mutex = rccMutexCreate();
configs = (rcc_language_config)malloc((max_languages)*sizeof(struct rcc_language_config_t));
- if ((!ctx)||(!languages)||(!classes)||(!mutex)) {
+ if ((!ctx)||(!languages)||(!classes)||(!mutex)||(!language_parrents)) {
if (mutex) rccMutexFree(mutex);
if (from) free(from);
if (configs) free(configs);
if (classes) free(classes);
if (languages) free(languages);
+ if (language_parrents) free(language_parrents);
if (ctx) free(ctx);
return NULL;
}
@@ -174,7 +192,10 @@ rcc_context rccCreateContext(const char *locale_variable, unsigned int max_langu
ctx->aliases[0] = NULL;
for (i=0;rcc_default_aliases[i].alias;i++)
rccRegisterLanguageAlias(ctx, rcc_default_aliases + i);
-
+
+ ctx->language_parrents = language_parrents;
+ for (i=0;i<max_languages;i++) language_parrents[i][0] = (rcc_language_id)-1;
+
ctx->languages = languages;
ctx->max_languages = max_languages;
ctx->n_languages = 0;
@@ -216,12 +237,15 @@ rcc_context rccCreateContext(const char *locale_variable, unsigned int max_langu
} else {
for (i=0;rcc_default_languages[i].sn;i++)
rccRegisterLanguage(ctx, rcc_default_languages+i);
-
+
if (max_languages < i) {
rccFree(ctx);
return NULL;
}
+ for (i=0;rcc_default_relations[i].lang;i++)
+ rccRegisterLanguageRelation(ctx, rcc_default_relations+i);
+
ctx->current_config = rccGetCurrentConfig(ctx);
}
@@ -282,6 +306,7 @@ void rccFreeContext(rcc_context ctx) {
free(ctx->configs);
}
if (ctx->classes) free(ctx->classes);
+ if (ctx->language_parrents) free(ctx->language_parrents);
if (ctx->languages) free(ctx->languages);
if (ctx->mutex) rccMutexFree(ctx->mutex);
free(ctx);
@@ -397,6 +422,45 @@ rcc_alias_id rccRegisterLanguageAlias(rcc_context ctx, rcc_language_alias *alias
return i-1;
}
+rcc_relation_id rccRegisterLanguageRelation(rcc_context ctx, rcc_language_relation *relation) {
+    /* Records relation->parrent as a parent of relation->lang in ctx->language_parrents.
+     * Returns 0 when the relation is stored, is already known, or names a parent language
+     * that is not registered in ctx; returns (rcc_relation_id)-1 on invalid arguments, an
+     * unknown child language, or a full parent list. NULL ctx selects rcc_default_ctx. */
+    unsigned int i;
+    rcc_language_id language_id;
+    const char *lang;
+    const char *parrent;
+    rcc_language_id *list;
+
+    if (!ctx) {
+        if (rcc_default_ctx) ctx = rcc_default_ctx;
+        else return (rcc_relation_id)-1;
+    }
+    if (!relation) return (rcc_relation_id)-1;
+
+    lang = relation->lang;
+    parrent = relation->parrent;
+    if ((!lang)||(!parrent)||(!strcasecmp(lang,parrent))) return (rcc_relation_id)-1;
+
+    language_id = rccGetLanguageByName(ctx, lang);
+    if (language_id == (rcc_language_id)-1) return (rcc_relation_id)-1;
+    list = ctx->language_parrents[language_id];
+
+    language_id = rccGetLanguageByName(ctx, parrent);
+    if (language_id == (rcc_language_id)-1) return (rcc_relation_id)0;
+
+    /* A list has exactly RCC_MAX_LANGUAGE_PARRENTS slots and one of them must keep the
+     * (rcc_language_id)-1 terminator, so neither the scan nor the append may pass the last slot. */
+    for (i=0;(i<RCC_MAX_LANGUAGE_PARRENTS)&&(list[i]!=(rcc_language_id)-1);i++)
+        if (list[i] == language_id) return (rcc_relation_id)0;
+    if (i+1 >= RCC_MAX_LANGUAGE_PARRENTS) return (rcc_relation_id)-1;
+    list[i++] = language_id;
+    list[i] = (rcc_language_id)-1;
+    return (rcc_relation_id)0;
+}
+
+
rcc_class_id rccRegisterClass(rcc_context ctx, rcc_class *cl) {
if (!ctx) {
if (rcc_default_ctx) ctx = rcc_default_ctx;
diff --git a/src/librcc.h b/src/librcc.h
index cbd9b3d..63a6f80 100644
--- a/src/librcc.h
+++ b/src/librcc.h
@@ -34,6 +34,10 @@ typedef unsigned char rcc_language_id;
*/
typedef unsigned char rcc_alias_id;
/**
+ * Relation ID
+ */
+typedef unsigned char rcc_relation_id;
+/**
* Charset ID.
* - 0 is default charset
* - -1 is error
@@ -225,17 +229,33 @@ typedef rcc_language_ptr rcc_language_list[RCC_MAX_LANGUAGES+1];
/**
* Language Aliases.
- * For example (ru_UA = uk, cs_SK = sk )
+ * For example: ru_UA = uk, cs_SK = sk
*/
struct rcc_language_alias_t {
const char *alias; /**< Long locale name */
- const char *lang; /* Coresponded language ISO-639-1 name */
+ const char *lang; /**< Coresponded language ISO-639-1 name */
};
typedef struct rcc_language_alias_t rcc_language_alias;
typedef rcc_language_alias *rcc_language_alias_ptr;
typedef rcc_language_alias_ptr rcc_language_alias_list[RCC_MAX_ALIASES+1];
/**
+ * Language relations.
+ * Meaning: a sentence in the considered language may contain words from any of its parent
+ * languages. This knowledge helps the Autodetection Engine to guess the right language.
+ *
+ * For example: Russian is a parent language of Ukrainian. This means it is possible
+ * to encounter Russian words in a Ukrainian sentence.
+ *
+ * All languages are by default related to the English language.
+ */
+struct rcc_language_relation_t {
+ const char *lang; /**< Corresponding language ISO-639-1 name */
+ const char *parrent; /**< Name of the parent language */
+};
+typedef struct rcc_language_relation_t rcc_language_relation;
+
+/**
* Register new language in supplied working context
* @param ctx is working context ( or default one if NULL supplied )
* @param language is pointer on language description (shouldn't be freed before library deinitialization).
@@ -263,6 +283,13 @@ rcc_engine_id rccLanguageRegisterEngine(rcc_language *language, rcc_engine *engi
* @return registered alias id or -1 in case of a error.
*/
rcc_alias_id rccRegisterLanguageAlias(rcc_context ctx, rcc_language_alias *alias);
+/**
+ * Register new language relation in supplied working context
+ * @param ctx is working context ( or default one if NULL supplied )
+ * @param relation is pointer on relation description (shouldn't be freed before library deinitialization).
+ * @return 0 if the relation is accepted (also when it is already known or the parent language is not registered), -1 in case of an error.
+ */
+rcc_relation_id rccRegisterLanguageRelation(rcc_context ctx, rcc_language_relation *relation);
/*******************************************************************************
************************ Altering Configuaration *******************************
@@ -391,7 +418,9 @@ typedef enum rcc_option_t {
RCC_OPTION_AUTOENGINE_SET_CURRENT, /**< If enabled autodetection engine will set current charset */
RCC_OPTION_AUTODETECT_LANGUAGE, /**< Enables language detection */
RCC_OPTION_TRANSLATE, /**< Translate #rcc_string if it's language differs from current one */
- RCC_MAX_OPTIONS
+ RCC_OPTION_TIMEOUT, /**< Recoding timeout. Currently it is only used to limit translation time */
+ RCC_MAX_OPTIONS,
+ RCC_OPTION_ALL
} rcc_option;
/**
@@ -948,12 +977,13 @@ int rccStringNCaseCmp(const char *str1, const char *str2, size_t n);
/*******************************************************************************
******************************** Recoding **************************************
*******************************************************************************/
+/* rcciconv.c */
+
/**
* recoding context
*/
typedef struct rcc_iconv_t *rcc_iconv;
-/* rcciconv.c */
/**
* Open recoding context.
*
@@ -980,12 +1010,13 @@ void rccIConvClose(rcc_iconv icnv);
*/
char *rccIConv(rcc_iconv icnv, const char *buf, size_t len, size_t *rlen);
+/* rcctranslate.c */
+
/**
* translating context
*/
typedef struct rcc_translate_t *rcc_translate;
-/* rcctranslate.c */
/**
* Open translating context.
*
@@ -1021,6 +1052,61 @@ int rccTranslateSetTimeout(rcc_translate translate, unsigned long us);
*/
char *rccTranslate(rcc_translate translate, const char *buf);
+
+/* rccspell.c */
+
+/**
+ * spelling context
+ */
+typedef struct rcc_speller_t *rcc_speller;
+
+/**
+ * Result of spelling a single word
+ */
+typedef enum rcc_speller_result_t {
+ RCC_SPELLER_INCORRECT = 0, /**< Word is not found in the dictionaries */
+ RCC_SPELLER_ALMOST_PARRENT, /**< Similar word is found in a parent's dictionary */
+ RCC_SPELLER_ALMOST_CORRECT, /**< Similar word is found in the dictionary */
+ RCC_SPELLER_PARRENT, /**< Word is found in a parent's dictionary */
+ RCC_SPELLER_CORRECT /**< Word is found in the dictionary */
+} rcc_speller_result;
+
+int rccSpellerResultIsOwn(rcc_speller_result res); /* non-zero for RCC_SPELLER_CORRECT and RCC_SPELLER_ALMOST_CORRECT */
+int rccSpellerResultIsPrecise(rcc_speller_result res); /* non-zero for RCC_SPELLER_CORRECT and RCC_SPELLER_PARRENT */
+int rccSpellerResultIsCorrect(rcc_speller_result res); /* non-zero for any of the four results other than RCC_SPELLER_INCORRECT */
+
+/**
+ * Open spelling context.
+ *
+ * @param lang is language
+ * @return
+ * - NULL if language is not supported and in the case of error.
+ * - Pointer on initialized context if successful
+ */
+rcc_speller rccSpellerCreate(const char *lang);
+/**
+ * Close spelling context.
+ *
+ * @param speller is spelling context
+ */
+void rccSpellerFree(rcc_speller speller);
+/**
+ * Add parrent to the spelling context.
+ *
+ * @param speller is spelling context
+ * @param parrent is parrent spelling context
+ * @return non-zero value in the case of error
+ */
+int rccSpellerAddParrent(rcc_speller speller, rcc_speller parrent);
+/**
+ * Spell a word.
+ *
+ * @param speller is spelling context
+ * @param word is UTF-8 encoded word for spelling
+ * @return spelling result ( #RCC_SPELLER_INCORRECT if the word is not accepted or the speller is in error state )
+ */
+rcc_speller_result rccSpeller(rcc_speller speller, const char *word);
+
/* recode.c */
/**
diff --git a/src/lngconfig.c b/src/lngconfig.c
index 2a108b9..f9d1d6d 100644
--- a/src/lngconfig.c
+++ b/src/lngconfig.c
@@ -340,16 +340,40 @@ rcc_language_config rccGetCurrentConfig(rcc_context ctx) {
}
rcc_speller rccConfigGetSpeller(rcc_language_config config) {
+ unsigned int i;
+ rcc_speller speller; /* speller of the parent language currently being attached */
+ rcc_language_config pconfig;
+ rcc_language_id *parrents; /* parent list of this config's language, or NULL when it cannot be looked up */
+ rcc_language_id language_id;
if (!config) return NULL;
rccMutexLock(config->mutex);
- if (!config->speller) config->speller = rccSpellerCreate(config->language->sn);
+ if (!config->speller) {
+ config->speller = rccSpellerCreate(config->language->sn);
+ /* First use only: once the speller exists, attach the spellers of the parent languages listed in ctx->language_parrents. */
+ if (config->speller) language_id = rccConfigGetLanguage(config);
+ else language_id = (rcc_language_id)-1;
+ if (language_id != (rcc_language_id)-1) parrents = config->ctx->language_parrents[language_id];
+ else parrents = NULL;
+
+ if (parrents) {
+ for (i = 0; parrents[i]!=(rcc_language_id)-1; i++) {
+ pconfig = rccGetConfig(config->ctx, parrents[i]);
+ if (pconfig) {
+ speller = rccConfigGetSpeller(pconfig); /* NOTE(review): recursive call made while config->mutex is held - confirm relations cannot form a cycle */
+ rccSpellerAddParrent(config->speller, speller); /* result ignored; rccSpellerAddParrent() itself rejects a NULL parent */
+ }
+ }
+ }
+ }
rccMutexUnLock(config->mutex);
return config->speller;
}
rcc_translate rccConfigGetTranslator(rcc_language_config config, rcc_language_id to) {
+ rcc_option_value timeout;
+
if (!config) return NULL;
rccMutexLock(config->mutex);
@@ -360,7 +384,11 @@ rcc_translate rccConfigGetTranslator(rcc_language_config config, rcc_language_id
if (!config->trans) {
config->trans = rccTranslateOpen(config->language->sn, rccGetLanguageName(config->ctx, to));
- config->translang = to;
+ if (config->trans) {
+ config->translang = to;
+ timeout = rccGetOption(config->ctx, RCC_OPTION_TIMEOUT);
+ if (timeout) rccTranslateSetTimeout(config->trans, timeout);
+ }
}
rccMutexUnLock(config->mutex);
@@ -368,11 +396,18 @@ rcc_translate rccConfigGetTranslator(rcc_language_config config, rcc_language_id
}
rcc_translate rccConfigGetEnglishTranslator(rcc_language_config config) {
+ rcc_option_value timeout;
+
if (!config) return NULL;
rccMutexLock(config->mutex);
- if (!config->entrans)
+ if (!config->entrans) {
config->entrans = rccTranslateOpen(config->language->sn, rcc_english_language_sn);
+ if (config->entrans) {
+ timeout = rccGetOption(config->ctx, RCC_OPTION_TIMEOUT);
+ if (timeout) rccTranslateSetTimeout(config->entrans, timeout);
+ }
+ }
rccMutexUnLock(config->mutex);
return config->entrans;
diff --git a/src/opt.c b/src/opt.c
index e6f8486..9e9f00d 100644
--- a/src/opt.c
+++ b/src/opt.c
@@ -112,7 +112,7 @@ rcc_option rccOptionDescriptionGetOption(rcc_option_description *desc) {
const char *rccOptionDescriptionGetValueName(rcc_option_description *desc, rcc_option_value value) {
unsigned int i;
- if (desc) {
+ if ((desc)&&(desc->vsn)) {
for (i=0;desc->vsn[i];i++) {
if (i == value) return desc->vsn[i];
}
@@ -123,7 +123,7 @@ const char *rccOptionDescriptionGetValueName(rcc_option_description *desc, rcc_o
rcc_option_value rccOptionDescriptionGetValueByName(rcc_option_description *desc, const char *name) {
unsigned int i;
- if ((desc)&&(name)) {
+ if ((desc)&&(desc->vsn)&&(name)) {
for (i=0;desc->vsn[i];i++) {
if (!strcasecmp(desc->vsn[i], name)) return (rcc_option_value)i;
}
diff --git a/src/rccconfig.c b/src/rccconfig.c
index f820606..a54b778 100644
--- a/src/rccconfig.c
+++ b/src/rccconfig.c
@@ -6,10 +6,20 @@
#include "engine.h"
#include "opt.h"
-rcc_language_alias rcc_default_aliases[] = {
+#define RCC_DEFAULT_RECODING_TIMEOUT 500000
+
+rcc_language_alias rcc_default_aliases[RCC_MAX_ALIASES + 1];
+rcc_language_alias rcc_default_aliases_embeded[RCC_MAX_ALIASES + 1] = {
{ "cs_SK", "sk" },
{ "ru_UA", "uk" },
- { NULL, NULL}
+ { NULL, NULL }
+};
+
+rcc_language_relation rcc_default_relations[RCC_MAX_RELATIONS + 1];
+rcc_language_relation rcc_default_relations_embeded[RCC_MAX_RELATIONS + 1] = {
+ { "uk", "ru" },
+ { "be", "ru" },
+ { NULL, NULL }
};
const char rcc_default_language_sn[] = "default";
@@ -140,6 +150,11 @@ rcc_option_description rcc_option_descriptions_embeded[RCC_MAX_OPTIONS+1] = {
{RCC_OPTION_TRANSLATE, 0, { RCC_OPTION_RANGE_TYPE_MENU, 0, 3, 1}, RCC_OPTION_TYPE_INVISIBLE, "TRANSLATE", rcc_sn_translate },
#endif /* HAVE_LIBTRANSLATE */
{RCC_OPTION_AUTOENGINE_SET_CURRENT, 0, { RCC_OPTION_RANGE_TYPE_BOOLEAN, 0, 0, 0}, RCC_OPTION_TYPE_STANDARD, "AUTOENGINE_SET_CURRENT", rcc_sn_boolean },
+#ifdef HAVE_LIBTRANSLATE
+ {RCC_OPTION_TIMEOUT, RCC_DEFAULT_RECODING_TIMEOUT, { RCC_OPTION_RANGE_TYPE_RANGE, 0, 5000000, 50000}, RCC_OPTION_TYPE_STANDARD, "TIMEOUT", NULL },
+#else
+ {RCC_OPTION_TIMEOUT, RCC_DEFAULT_RECODING_TIMEOUT, { RCC_OPTION_RANGE_TYPE_RANGE, 0, 5000000, 50000}, RCC_OPTION_TYPE_INVISIBLE, "TIMEOUT", NULL },
+#endif /* HAVE_LIBTRANSLATE */
{RCC_MAX_OPTIONS}
};
@@ -149,7 +164,8 @@ rcc_option_description *rccGetOptionDescription(rcc_option option) {
if ((option<0)||(option>=RCC_MAX_OPTIONS)) return NULL;
for (i=0;rcc_option_descriptions[i].option!=RCC_MAX_OPTIONS;i++)
- if (rcc_option_descriptions[i].option == option) return rcc_option_descriptions+i;
+ if (rcc_option_descriptions[i].option == option)
+ return rcc_option_descriptions+i;
return NULL;
}
@@ -180,3 +196,18 @@ int rccIsUTF8(const char *name) {
if ((!name)||(strcasecmp(name, "UTF-8")&&strcasecmp(name, "UTF8"))) return 0;
return 1;
}
+
+unsigned int rccDefaultDropLanguageRelations(const char *lang) {
+    /* Drops every default relation whose child language is `lang` (case-insensitive) by
+     * compacting the NULL-terminated table in place; returns the number of relations kept. */
+    unsigned long i, kept = 0;
+    for (i = 0; rcc_default_relations[i].lang; i++) {
+        /* A NULL `lang` matches nothing (and must not reach strcasecmp), so the table is left unchanged. */
+        if (lang && !strcasecmp(lang, rcc_default_relations[i].lang)) continue;
+        rcc_default_relations[kept].lang = rcc_default_relations[i].lang;
+        rcc_default_relations[kept++].parrent = rcc_default_relations[i].parrent;
+    }
+    rcc_default_relations[kept].lang = NULL;
+    rcc_default_relations[kept].parrent = NULL;
+    return kept;
+}
diff --git a/src/rccconfig.h b/src/rccconfig.h
index 8e794ba..fe7b912 100644
--- a/src/rccconfig.h
+++ b/src/rccconfig.h
@@ -4,6 +4,7 @@
#include "opt.h"
#undef RCC_DEBUG
+#undef RCC_DEBUG_LANGDETECT
#define RCC_LOCALE_VARIABLE "LC_CTYPE"
extern const char rcc_default_language_sn[];
@@ -12,6 +13,9 @@ extern const char rcc_disabled_language_sn[];
extern const char rcc_disabled_engine_sn[];
extern rcc_language_alias rcc_default_aliases[];
+extern rcc_language_alias rcc_default_aliases_embeded[];
+extern rcc_language_relation rcc_default_relations[];
+extern rcc_language_relation rcc_default_relations_embeded[];
extern const char rcc_default_charset[];
extern const char rcc_utf8_charset[];
@@ -31,6 +35,7 @@ rcc_option_description *rccGetOptionDescription(rcc_option option);
rcc_option_description *rccGetOptionDescriptionByName(const char *name);
rcc_language_id rccDefaultGetLanguageByName(const char *name);
+unsigned int rccDefaultDropLanguageRelations(const char *lang);
int rccIsUTF8(const char *name);
diff --git a/src/rccdb4.h b/src/rccdb4.h
index a11e53d..cb978f0 100644
--- a/src/rccdb4.h
+++ b/src/rccdb4.h
@@ -1,10 +1,10 @@
#ifndef _RCC_DB4_H
#define _RCC_DB4_H
-#include "../config.h"
-
#include "librcc.h"
+#include "../config.h"
+
#ifdef HAVE_DB_H
# include <db.h>
#endif /* HAVE_DB_H */
diff --git a/src/rccspell.c b/src/rccspell.c
index c54e267..da5e4d1 100644
--- a/src/rccspell.c
+++ b/src/rccspell.c
@@ -1,6 +1,7 @@
#include <stdio.h>
#include <stdlib.h>
+#include "internal.h"
#include "rccspell.h"
rcc_speller rccSpellerCreate(const char *lang) {
@@ -28,6 +29,7 @@ rcc_speller rccSpellerCreate(const char *lang) {
}
rccspeller->speller = speller;
+ rccspeller->parrents[0] = NULL;
return rccspeller;
#else
return NULL;
@@ -47,17 +49,58 @@ int rccSpellerGetError(rcc_speller rccspeller) {
return 0;
}
-int rccSpellerSized(rcc_speller speller, const char *word, size_t len) {
+int rccSpellerAddParrent(rcc_speller speller, rcc_speller parrent) {
+    /* Appends `parrent` to the NULL-terminated parent list of `speller`.
+     * Returns 0 on success, -1 when an argument is NULL or the list is already full. */
+    unsigned int used = 0;
+    if ((speller == NULL)||(parrent == NULL)) return -1;
+    while (speller->parrents[used] != NULL) used++;
+    if (used >= RCC_MAX_LANGUAGE_PARRENTS) return -1;
+    speller->parrents[used] = parrent;
+    speller->parrents[used + 1] = NULL;
+    return 0;
+}
+
+rcc_speller_result rccSpellerSized(rcc_speller speller, const char *word, size_t len, int recursion) {
#ifdef HAVE_ASPELL
+ rcc_speller_result result, saved_result = (rcc_speller_result)0; /* 0 is RCC_SPELLER_INCORRECT: no approximate parent match saved yet */
+ unsigned int i;
int res;
+ /* Classifies `word`; a len of 0 is forwarded to aspell as -1 (presumably "NUL-terminated" - confirm against the aspell API). A speller in error state accepts nothing. */
+ if (rccSpellerGetError(speller)) return (rcc_speller_result)RCC_SPELLER_INCORRECT;
+ /* With `recursion` set the parent spellers are asked first (without further recursion): an exact parent hit returns RCC_SPELLER_PARRENT before the own dictionary is consulted. */
+ if (recursion) {
+ for (i=0; speller->parrents[i]; i++) {
+ result = rccSpellerSized(speller->parrents[i], word, len, 0);
+ if ((result == RCC_SPELLER_CORRECT)||(result == RCC_SPELLER_PARRENT)) return RCC_SPELLER_PARRENT;
+ if ((result == RCC_SPELLER_ALMOST_CORRECT)||(result == RCC_SPELLER_ALMOST_PARRENT)) saved_result = RCC_SPELLER_ALMOST_PARRENT;
+ }
+ }
- if (rccSpellerGetError(speller)) return 0;
+ if (saved_result) return saved_result;
+ /* No parent match of any kind: ask this speller's own aspell dictionary. */
res = aspell_speller_check(speller->speller, word, len?len:-1);
- return res<0?0:res;
+ return res<=0?RCC_SPELLER_INCORRECT:RCC_SPELLER_CORRECT;
#endif /* HAVE_ASPELL */
return 0;
}
-int rccSpeller(rcc_speller speller, const char *word) {
- return rccSpellerSized(speller, word, 0);
+rcc_speller_result rccSpeller(rcc_speller speller, const char *word) {
+ return rccSpellerSized(speller, word, 0, 1); /* len 0: no explicit length given; recursion 1: parent spellers are consulted too */
+}
+
+int rccSpellerResultIsOwn(rcc_speller_result res) {
+    /* 1 when the result is RCC_SPELLER_CORRECT or RCC_SPELLER_ALMOST_CORRECT, otherwise 0. */
+    return ((res == RCC_SPELLER_CORRECT)||(res == RCC_SPELLER_ALMOST_CORRECT)) ? 1 : 0;
+}
+
+int rccSpellerResultIsPrecise(rcc_speller_result res) {
+    /* 1 when the result is RCC_SPELLER_CORRECT or RCC_SPELLER_PARRENT, otherwise 0. */
+    return ((res == RCC_SPELLER_CORRECT)||(res == RCC_SPELLER_PARRENT)) ? 1 : 0;
+}
+
+int rccSpellerResultIsCorrect(rcc_speller_result res) {
+    /* 1 for the two own-dictionary results and the two parent-dictionary results, otherwise 0. */
+    int own = (res == RCC_SPELLER_CORRECT)||(res == RCC_SPELLER_ALMOST_CORRECT);
+    return (own||(res == RCC_SPELLER_PARRENT)||(res == RCC_SPELLER_ALMOST_PARRENT)) ? 1 : 0;
}
diff --git a/src/rccspell.h b/src/rccspell.h
index 49e39f4..49d5c99 100644
--- a/src/rccspell.h
+++ b/src/rccspell.h
@@ -7,23 +7,22 @@
#include <aspell.h>
#endif /* HAVE_ASPELL */
+#include "internal.h"
+
struct rcc_speller_t {
#ifdef HAVE_ASPELL
struct AspellSpeller *speller;
#else
void *speller;
#endif /* HAVE_ASPELL */
+ rcc_speller parrents[RCC_MAX_LANGUAGE_PARRENTS+1]; /* NULL-terminated list of parent spellers, filled by rccSpellerAddParrent() */
};
-typedef struct rcc_speller_t *rcc_speller;
typedef struct rcc_speller_t rcc_speller_s;
-rcc_speller rccSpellerCreate(const char *lang);
-void rccSpellerFree(rcc_speller speller);
-
int rccSpellerGetError(rcc_speller rccspeller);
-int rccSpellerSized(rcc_speller speller, const char *word, size_t len);
-int rccSpeller(rcc_speller speller, const char *word);
+
+rcc_speller_result rccSpellerSized(rcc_speller speller, const char *word, size_t len, int recursion);
#endif /* _RCC_SPELL_H */
diff --git a/src/rccstring.c b/src/rccstring.c
index aa92407..0f46c90 100644
--- a/src/rccstring.c
+++ b/src/rccstring.c
@@ -175,3 +175,26 @@ int rccIsASCII(const char *str) {
if ((unsigned char)str[i]>0x7F) return 0;
return 1;
}
+
+size_t rccStringSizedGetChars(const char *str, size_t size) {
+    /* Counts characters in the bytes returned by rccGetString(str), reading them as UTF-8:
+     * each lead byte counts once and the bytes it announces are passed over uninspected.
+     * size != 0: exactly `size` bytes are scanned; size == 0: scanning stops at the NUL. */
+    const unsigned char *bytes = rccGetString(str);
+    size_t pos = 0, pending = 0, count = 0;
+    while (size ? (pos != size) : (bytes[pos] != 0)) {
+        unsigned char c = bytes[pos++];
+        if (pending) {
+            pending--;
+            continue;
+        }
+        if (c < 0x80) pending = 0;
+        else if (c < 0xC0) pending = 4; /* 0x80..0xBF takes the same fallback as 0xF8..0xFF */
+        else if (c < 0xE0) pending = 1;
+        else if (c < 0xF0) pending = 2;
+        else if (c < 0xF8) pending = 3;
+        else pending = 4;
+        count++;
+    }
+    return count;
+}
diff --git a/src/rccstring.h b/src/rccstring.h
index e9e9734..96f8b2d 100644
--- a/src/rccstring.h
+++ b/src/rccstring.h
@@ -26,5 +26,6 @@ int strnlen(const char *str, size_t size);
int rccStrnlen(const char *str, size_t size);
#endif /* HAVE_STRNLEN */
int rccIsASCII(const char *str);
+size_t rccStringSizedGetChars(const char *str, size_t size);
#endif /* _RCC_STRING_H */
diff --git a/src/rccxml.c b/src/rccxml.c
index 143f930..b40d4fc 100644
--- a/src/rccxml.c
+++ b/src/rccxml.c
@@ -50,7 +50,7 @@ int rccXmlInit(int LoadConfiguration) {
FILE *f;
char config[MAX_HOME_CHARS + 32];
- xmlXPathContextPtr xpathctx;
+ xmlXPathContextPtr xpathctx = NULL;
xmlXPathObjectPtr obj = NULL;
xmlNodeSetPtr node_set;
unsigned long i, nnodes;
@@ -58,6 +58,8 @@ int rccXmlInit(int LoadConfiguration) {
xmlAttrPtr attr;
const char *lang, *engine_name;
unsigned int pos, lpos, epos, cpos;
+ const char *alias, *parrent;
+ unsigned int j, apos, rpos;
rcc_engine *engine;
@@ -82,6 +84,8 @@ int rccXmlInit(int LoadConfiguration) {
} else config[0] = 0;
+ for (apos=0;rcc_default_aliases[apos].alias;apos++);
+
// Load Extra Languages
if (config[0]) {
xmlctx = xmlReadFile(config, NULL, 0);
@@ -108,7 +112,17 @@ int rccXmlInit(int LoadConfiguration) {
pos = rccDefaultGetLanguageByName(lang);
if (!pos) continue;
- if (pos == (rcc_language_id)-1) pos = lpos;
+ if (pos == (rcc_language_id)-1) {
+ for (rpos=0;rcc_default_relations[rpos].lang;rpos++);
+ if (rpos < RCC_MAX_RELATIONS) {
+ rcc_default_relations[rpos].parrent = rcc_english_language_sn;
+ rcc_default_relations[rpos++].lang = lang;
+ rcc_default_relations[rpos].parrent = NULL;
+ rcc_default_relations[rpos].lang = NULL;
+ }
+
+ pos = lpos;
+ }
else if (pos == RCC_MAX_LANGUAGES) continue;
for (epos = 1, cpos = 1,node=pnode->children;node;node=node->next) {
@@ -121,10 +135,10 @@ int rccXmlInit(int LoadConfiguration) {
}
}
}
- if (!xmlStrcmp(node->name, "Engines")) {
+ else if (!xmlStrcmp(node->name, "Engines")) {
for (enode=node->children;enode;enode=enode->next) {
if (enode->type != XML_ELEMENT_NODE) continue;
- if ((!xmlStrcmp(enode->name, "Engine"))&&(rccXmlGetText(enode))&&(epos<RCC_MAX_ENGINES)) {
+ if ((!xmlStrcmp(enode->name, "Engine"))&&(epos<RCC_MAX_ENGINES)) {
engine_name = rccXmlGetText(enode);
if (!engine_name) continue;
engine = rccPluginEngineGetInfo(engine_name, lang);
@@ -134,6 +148,39 @@ int rccXmlInit(int LoadConfiguration) {
}
}
}
+ else if (!xmlStrcmp(node->name, "Aliases")) {
+ for (enode=node->children;enode;enode=enode->next) {
+ if (enode->type != XML_ELEMENT_NODE) continue;
+ if ((!xmlStrcmp(enode->name, "Alias"))&&(apos<RCC_MAX_ALIASES)) {
+ alias = rccXmlGetText(enode);
+ if (!alias) continue;
+ for (j=0;j<apos;j++)
+ if (!strcasecmp(alias, rcc_default_aliases[j].alias)) break;
+ if (j<apos) {
+ rcc_default_aliases[j].lang = lang;
+ } else {
+ rcc_default_aliases[apos].alias = alias;
+ rcc_default_aliases[apos++].lang = lang;
+ rcc_default_aliases[apos].alias = NULL;
+ rcc_default_aliases[apos].lang = NULL;
+ }
+ }
+ }
+ }
+ else if (!xmlStrcmp(node->name, "Relations")) {
+ rpos = rccDefaultDropLanguageRelations(lang);
+ for (enode=node->children;enode;enode=enode->next) {
+ if (enode->type != XML_ELEMENT_NODE) continue;
+ if ((!xmlStrcmp(enode->name, "Parrent"))&&(rpos<RCC_MAX_RELATIONS)) {
+ parrent = rccXmlGetText(enode);
+ if (!parrent) continue;
+ rcc_default_relations[rpos].parrent = parrent;
+ rcc_default_relations[rpos++].lang = lang;
+ rcc_default_relations[rpos].parrent = NULL;
+ rcc_default_relations[rpos].lang = NULL;
+ }
+ }
+ }
}
if ((cpos > 1)||(epos > 1)) {
@@ -161,6 +208,7 @@ clear:
}
}
}
+
return 0;
}
@@ -507,8 +555,7 @@ int rccLoad(rcc_context ctx, const char *name) {
ovalue = rccOptionDescriptionGetValueByName(odesc, tmp);
if (ovalue == (rcc_option_value)-1) ovalue = (rcc_option_value)atoi(tmp);
err = rccSetOption(ctx, (rcc_option)i, ovalue);
- }
- else err = -1;
+ } else err = -1;
} else err = -1;
if (err) rccOptionSetDefault(ctx, (rcc_option)i);
}
diff --git a/src/recode.c b/src/recode.c
index 48ce2d6..27dff92 100644
--- a/src/recode.c
+++ b/src/recode.c
@@ -22,25 +22,32 @@
#define RCC_ACCEPTABLE_LENGTH 3
static rcc_language_id rccDetectLanguageInternal(rcc_context ctx, rcc_class_id class_id, const char *buf, size_t len, rcc_string *retstring) {
- rcc_speller speller = NULL, english_speller = NULL;
+ rcc_speller speller = NULL;
unsigned long i, nlanguages;
rcc_language_config config, config0 = NULL;
rcc_string recoded;
unsigned char *utf8;
size_t j, mode;
- unsigned long spres, words, english, result;
- size_t longest;
+ rcc_speller_result spres;
+ unsigned long words, result, own;
+ size_t longest, ownlongest;
unsigned char english_mode, english_word = 1;
char *english_string = NULL;
rcc_language_id english_lang = (rcc_language_id)-1;
size_t english_longest = 0;
unsigned char is_english_string = 1;
- double res, english_res = 0;
+ double res, ownres, english_res = 0;
rcc_option_value usedb4;
rcc_language_id bestlang = (rcc_language_id)-1;
- unsigned long bestlongest = RCC_ACCEPTABLE_LENGTH;
+ size_t bestlongest = RCC_ACCEPTABLE_LENGTH;
+ size_t bestownlongest = RCC_ACCEPTABLE_LENGTH;
+ unsigned long bestown = 0;
double bestres = RCC_ACCEPTABLE_PROBABILITY;
char *best_string = NULL;
+ rcc_language_id bestfixlang = (rcc_language_id)-1;
+ unsigned long k;
+ rcc_language_id *parrents;
+ size_t chars = 0;
unsigned long accepted_nonenglish_langs = 0;
@@ -64,22 +71,24 @@ static rcc_language_id rccDetectLanguageInternal(rcc_context ctx, rcc_class_id c
nlanguages = ctx->n_languages;
english_lang = rccGetLanguageByName(ctx, rcc_english_language_sn);
- if (english_lang != (rcc_language_id)-1) {
- config = rccGetUsableConfig(ctx, english_lang);
- if (config) {
- english_speller = rccConfigGetSpeller(config);
- if (rccSpellerGetError(english_speller)) english_speller = NULL;
- }
- }
for (i=0;i<nlanguages;i++) {
- config = rccGetUsableConfig(ctx, (rcc_language_id)i);
+ if (i) config = rccGetUsableConfig(ctx, (rcc_language_id)i);
+ else config = rccGetCurrentConfig(ctx);
if (!config) continue;
-
+
if (i) {
if (config==config0) continue;
} else config0=config;
+ if (bestfixlang != (rcc_language_id)-1) {
+ parrents = ctx->language_parrents[i];
+ for (k = 0;parrents[k] != (rcc_language_id)-1;k++)
+ if (parrents[k] == bestfixlang) break;
+
+ if (parrents[k] != bestfixlang) continue;
+ }
+
speller = rccConfigGetSpeller(config);
if (rccSpellerGetError(speller)) continue;
@@ -91,17 +100,24 @@ static rcc_language_id rccDetectLanguageInternal(rcc_context ctx, rcc_class_id c
utf8 = (char*)rccStringGetString(recoded);
- for (result=0,english=0,words=0,longest=0,mode=0,j=0;utf8[j];j++) {
+ for (result=0,own=0,words=0,ownlongest=0,longest=0,mode=0,j=0;utf8[j];j++) {
if (isSpace(utf8[j])) {
if (mode) {
- if ((!english_mode)&&(english_word)&&(rccSpellerSized(english_speller, utf8 + mode -1, j - mode + 1)))
- english++;
- else {
- if ((english_mode)&&(!english_word)) is_english_string = 0;
- spres = rccSpellerSized(speller, utf8 + mode - 1, j - mode + 1)?1:0;
- if ((spres)&&((j - mode + 1)>longest)) longest = j - mode + 1;
- result+=spres;
+ if ((english_mode)&&(!english_word)) is_english_string = 0;
+
+ spres = rccSpellerSized(speller, utf8 + mode - 1, j - mode + 1, 1);
+ if (rccSpellerResultIsCorrect(spres)) {
+ result++;
+ chars = rccStringSizedGetChars(utf8 + mode - 1, j - mode + 1);
+ if (chars > longest) longest = chars;
}
+ if (rccSpellerResultIsOwn(spres)) {
+ own++;
+ if (chars > ownlongest) ownlongest = chars;
+ }
+#if RCC_DEBUG_LANGDETECT > 1
+ printf("%s: %u (%.*s)\n", config->language->sn, spres, j - mode + 1, utf8 + mode -1);
+#endif /* RCC_DEBUG_LANGDETECT */
words++;
mode = 0;
} else continue;
@@ -116,14 +132,22 @@ static rcc_language_id rccDetectLanguageInternal(rcc_context ctx, rcc_class_id c
}
if (mode) {
- if ((!english_mode)&&(english_word)&&(rccSpeller(english_speller, utf8 + mode -1)))
- english++;
- else {
- if ((english_mode)&&(!english_word)) is_english_string = 0;
- spres = rccSpeller(speller, utf8 + mode - 1)?1:0;
- if ((spres)&&((j-mode+1)>longest)) longest = j - mode + 1;
- result += spres;
+ if ((english_mode)&&(!english_word)) is_english_string = 0;
+
+ spres = rccSpeller(speller, utf8 + mode - 1);
+ if (rccSpellerResultIsCorrect(spres)) {
+ result++;
+ chars = rccStringSizedGetChars(utf8 + mode - 1, 0);
+ if (chars > longest) longest = chars;
}
+ if (rccSpellerResultIsOwn(spres)) {
+ own++;
+ if (chars > ownlongest) ownlongest = chars;
+ }
+#if RCC_DEBUG_LANGDETECT > 1
+ printf("%s: %u (%.*s)\n", config->language->sn, spres, j - mode + 1, utf8 + mode -1);
+#endif /* RCC_DEBUG_LANGDETECT */
+
words++;
}
@@ -134,25 +158,27 @@ static rcc_language_id rccDetectLanguageInternal(rcc_context ctx, rcc_class_id c
english_lang = (rcc_language_id)i;
english_longest = longest;
english_string = recoded;
- } else if (words>english) {
- res = 1.*result/(words - english);
- if ((res > RCC_REQUIRED_PROBABILITY)&&(longest > RCC_REQUIRED_LENGTH)) {
- if (best_string) free(best_string);
- if (english_string) free(english_string);
-
- if (retstring) *retstring = recoded;
- else free(recoded);
- return (rcc_language_id)i;
- } else if ((res > bestres + RCC_PROBABILITY_STEP)||
+ } else if (words>0) {
+ res = 1.*result/words;
+ ownres = 1.*own/words;
+
+ if ((res > bestres + RCC_PROBABILITY_STEP)||
((res > bestres - RCC_PROBABILITY_STEP)&&(longest > bestlongest))||
- ((res > bestres)&&(longest == bestlongest))) {
-
+ ((res > bestres + 1E-10)&&(longest == bestlongest))||
+ (((res-bestres)<1E-10)&&((bestres-res)<1E-10)&&(longest == bestlongest)&&(own > 0))) {
+
if (best_string) free(best_string);
bestres = res;
- bestlang = (rcc_language_id)i;
+ bestlang = rccGetRealLanguage(ctx, (rcc_language_id)i);
bestlongest = longest;
best_string = recoded;
+ bestown = own;
+ bestownlongest = ownlongest;
+
+ if ((ownres > RCC_REQUIRED_PROBABILITY)&&(ownlongest > RCC_REQUIRED_LENGTH)) {
+ bestfixlang = bestlang;
+ }
} else if (!accepted_nonenglish_langs) {
bestlang = (rcc_language_id)i;
best_string = recoded;
@@ -162,6 +188,13 @@ static rcc_language_id rccDetectLanguageInternal(rcc_context ctx, rcc_class_id c
} else free(recoded);
}
+ if ((bestres > RCC_REQUIRED_PROBABILITY)&&(bestlongest > RCC_REQUIRED_LENGTH)&&(bestown>0)) {
+ if (english_string) free(english_string);
+ if (retstring) *retstring = best_string;
+ else if (best_string) free(best_string);
+ return bestlang;
+ }
+
if ((is_english_string)&&(english_res > RCC_REQUIRED_PROBABILITY)&&(english_longest > RCC_REQUIRED_LENGTH)) {
if (best_string) free(best_string);
if (retstring) *retstring = english_string;
@@ -242,6 +275,25 @@ rcc_autocharset_id rccConfigDetectCharset(rcc_language_config config, rcc_class_
return rccConfigDetectCharsetInternal(config, class_id, buf, len);
}
+/*
+ * Report whether two languages are related: they are the same language, or
+ * one of them is listed among the other's parents in ctx->language_parrents.
+ *
+ * ctx  - context holding the per-language parent lists
+ * l1   - first language id
+ * l2   - second language id
+ * skip - language id that is never reported as related to anything
+ *
+ * Returns 1 if the languages are related, 0 otherwise.
+ */
+static int rccAreLanguagesRelated(rcc_context ctx, rcc_language_id l1, rcc_language_id l2, rcc_language_id skip) {
+    unsigned int i;
+    rcc_language_id *list;
+
+    if ((l1 == skip)||(l2 == skip)) return 0;
+
+    if (l1 == l2) return 1;
+
+    /* (rcc_language_id)-1 marks "no language" and must never be used as an index */
+    if ((l1 == (rcc_language_id)-1)||(l2 == (rcc_language_id)-1)) return 0;
+    if ((!ctx)||(!ctx->language_parrents)) return 0;
+
+    /* A parent list has exactly RCC_MAX_LANGUAGE_PARRENTS slots, so stop there
+       even when no (rcc_language_id)-1 terminator is present */
+    list = ctx->language_parrents[l1];
+    for (i=0;(i<RCC_MAX_LANGUAGE_PARRENTS)&&(list[i] != (rcc_language_id)-1);i++)
+        if (list[i] == l2) return 1;
+
+    list = ctx->language_parrents[l2];
+    for (i=0;(i<RCC_MAX_LANGUAGE_PARRENTS)&&(list[i] != (rcc_language_id)-1);i++)
+        if (list[i] == l1) return 1;
+
+    return 0;
+}
+
rcc_string rccSizedFrom(rcc_context ctx, rcc_class_id class_id, const char *buf, size_t len) {
int err;
size_t ret;
@@ -286,7 +338,9 @@ rcc_string rccSizedFrom(rcc_context ctx, rcc_class_id class_id, const char *buf,
detected_language_id = rccDetectLanguageInternal(ctx, class_id, buf, len, &result);
if (detected_language_id != (rcc_language_id)-1) {
- /*printf("Language %i: %s\n", rccStringGetLanguage(result), result);*/
+#ifdef RCC_DEBUG_LANGDETECT
+ printf("Language %i(%s): %s\n", rccStringGetLanguage(result), rccStringGetLanguage(result)?rccGetLanguageName(ctx, rccStringGetLanguage(result)):"", result);
+#endif /* RCC_DEBUG_LANGDETECT */
return result;
}
@@ -332,6 +386,7 @@ char *rccSizedTo(rcc_context ctx, rcc_class_id class_id, rcc_const_string buf, s
rcc_language_config config;
rcc_language_id language_id;
rcc_language_id current_language_id;
+ rcc_language_id english_language_id;
rcc_class_type class_type;
rcc_option_value translate;
rcc_translate trans, entrans;
@@ -366,6 +421,8 @@ char *rccSizedTo(rcc_context ctx, rcc_class_id class_id, rcc_const_string buf, s
else english_source = 1;
if ((class_type != RCC_CLASS_FS)&&((translate==RCC_OPTION_TRANSLATE_FULL)||((translate)&&(!english_source)))) {
+ english_language_id = rccGetLanguageByName(ctx, rcc_english_language_sn);
+
rccMutexLock(ctx->mutex);
current_language_id = rccGetCurrentLanguage(ctx);
@@ -375,6 +432,18 @@ char *rccSizedTo(rcc_context ctx, rcc_class_id class_id, rcc_const_string buf, s
if (trans) {
translated = rccTranslate(trans, utfstring);
if (translated) {
+ if ((current_language_id != english_language_id)&&(rccIsASCII(translated))) {
+ /* French to German (no umlauts) => not related
+ English to German (no umlauts) => skipping English relations
+ DS: Problem if we have a relation between French and German */
+ if (rccAreLanguagesRelated(ctx, language_id, current_language_id, english_language_id)) {
+ free(translated);
+ translated = NULL;
+ translate = 0;
+ }
+ }
+ }
+ if (translated) {
language_id = current_language_id;
config = rccGetConfig(ctx, language_id);
@@ -394,11 +463,11 @@ char *rccSizedTo(rcc_context ctx, rcc_class_id class_id, rcc_const_string buf, s
}
}
- if ((translate == RCC_OPTION_TRANSLATE_TO_ENGLISH)||((config->trans)&&(!translated))) {
+ if ((translate == RCC_OPTION_TRANSLATE_TO_ENGLISH)||((translate)&&(!translated)&&(!english_language_id == current_language_id)&&(!rccAreLanguagesRelated(ctx, language_id, current_language_id, (rcc_language_id)-1)))) {
entrans = rccConfigGetEnglishTranslator(config);
if (entrans) {
translated = rccTranslate(config->entrans, utfstring);
-
+/*
config = rccGetConfig(ctx, language_id);
if (!config) {
rccMutexUnLock(ctx->mutex);
@@ -409,7 +478,7 @@ char *rccSizedTo(rcc_context ctx, rcc_class_id class_id, rcc_const_string buf, s
if (err) {
rccMutexUnLock(ctx->mutex);
return translated;
- }
+ }*/
}
}
}