1
/* This header file is in the public domain. */
6
/* According to autoconf stdlib may not be enough for size_t */
11
#endif /* __cplusplus */
13
/* Enumerated types */
17
* @ENCA_SURFACE_EOL_CR: End-of-lines are represented with CR's.
18
* @ENCA_SURFACE_EOL_LF: End-of-lines are represented with LF's.
19
* @ENCA_SURFACE_EOL_CRLF: End-of-lines are represented with CRLF's.
20
* @ENCA_SURFACE_EOL_MIX: Several end-of-line types, mixed.
21
* @ENCA_SURFACE_EOL_BIN: End-of-line concept not applicable (binary data).
22
* @ENCA_SURFACE_MASK_EOL: Mask for end-of-line surfaces.
23
* @ENCA_SURFACE_PERM_21: Odd and even bytes swapped.
24
* @ENCA_SURFACE_PERM_4321: Reversed byte sequence in 4byte words.
25
* @ENCA_SURFACE_PERM_MIX: Chunks of both endiannesses, concatenated.
26
* @ENCA_SURFACE_MASK_PERM: Mask for permutation surfaces.
27
* @ENCA_SURFACE_QP: Quoted printables.
28
* @ENCA_SURFACE_REMOVE: Recode `remove' surface.
29
* @ENCA_SURFACE_UNKNOWN: Unknown surface.
30
* @ENCA_SURFACE_MASK_ALL: Mask for all bits, without #ENCA_SURFACE_UNKNOWN.
34
/*
 * EncaSurface: bit flags describing the surface of an encoding.
 *
 * Single-bit members may be OR-ed together; each MASK_* member is the union
 * of the single-bit members of its group.  ENCA_SURFACE_MASK_ALL covers every
 * defined bit except ENCA_SURFACE_UNKNOWN.
 */
typedef enum { /*< flags >*/
  /* End-of-line surfaces. */
  ENCA_SURFACE_EOL_CR    = 1 << 0,
  ENCA_SURFACE_EOL_LF    = 1 << 1,
  ENCA_SURFACE_EOL_CRLF  = 1 << 2,
  ENCA_SURFACE_EOL_MIX   = 1 << 3,
  ENCA_SURFACE_EOL_BIN   = 1 << 4,
  ENCA_SURFACE_MASK_EOL  = (ENCA_SURFACE_EOL_CR
                            | ENCA_SURFACE_EOL_LF
                            | ENCA_SURFACE_EOL_CRLF
                            | ENCA_SURFACE_EOL_MIX
                            | ENCA_SURFACE_EOL_BIN),

  /* Byte-permutation surfaces. */
  ENCA_SURFACE_PERM_21   = 1 << 5,
  ENCA_SURFACE_PERM_4321 = 1 << 6,
  ENCA_SURFACE_PERM_MIX  = 1 << 7,
  ENCA_SURFACE_MASK_PERM = (ENCA_SURFACE_PERM_21
                            | ENCA_SURFACE_PERM_4321
                            | ENCA_SURFACE_PERM_MIX),

  /* Remaining surfaces. */
  ENCA_SURFACE_QP        = 1 << 8,
  ENCA_SURFACE_REMOVE    = 1 << 13,
  ENCA_SURFACE_UNKNOWN   = 1 << 14,

  /* Everything except ENCA_SURFACE_UNKNOWN. */
  ENCA_SURFACE_MASK_ALL  = (ENCA_SURFACE_MASK_EOL
                            | ENCA_SURFACE_MASK_PERM
                            | ENCA_SURFACE_QP
                            | ENCA_SURFACE_REMOVE)
} EncaSurface;
62
* @ENCA_NAME_STYLE_ENCA: Default, implicit charset name in Enca.
63
* @ENCA_NAME_STYLE_RFC1345: RFC 1345 charset name.
64
* @ENCA_NAME_STYLE_CSTOCS: Cstocs charset name.
65
* @ENCA_NAME_STYLE_ICONV: Iconv charset name.
66
* @ENCA_NAME_STYLE_HUMAN: Human comprehensible description.
68
* Charset naming styles and conventions.
72
ENCA_NAME_STYLE_RFC1345,
73
ENCA_NAME_STYLE_CSTOCS,
74
ENCA_NAME_STYLE_ICONV,
80
* @ENCA_CHARSET_7BIT: Characters are represented with 7bit characters.
81
* @ENCA_CHARSET_8BIT: Characters are represented with bytes.
82
* @ENCA_CHARSET_16BIT: Characters are represented with 2byte words.
83
* @ENCA_CHARSET_32BIT: Characters are represented with 4byte words.
84
* @ENCA_CHARSET_FIXED: One character consists of one fundamental piece.
85
* @ENCA_CHARSET_VARIABLE: One character consists of variable number of
87
* @ENCA_CHARSET_BINARY: Charset is binary from ASCII viewpoint.
88
* @ENCA_CHARSET_REGULAR: Language dependent (8bit) charset.
89
* @ENCA_CHARSET_MULTIBYTE: Multibyte charset.
93
* Flags %ENCA_CHARSET_7BIT, %ENCA_CHARSET_8BIT, %ENCA_CHARSET_16BIT,
94
* %ENCA_CHARSET_32BIT tell how many bits a `fundamental piece' consists of.
95
* This is different from bits per character; e.g. UTF-8 consists of 8bit
96
* pieces (bytes), but character can be composed from 1 to 6 of them.
98
typedef enum { /*< flags >*/
99
ENCA_CHARSET_7BIT = 1 << 0,
100
ENCA_CHARSET_8BIT = 1 << 1,
101
ENCA_CHARSET_16BIT = 1 << 2,
102
ENCA_CHARSET_32BIT = 1 << 3,
103
ENCA_CHARSET_FIXED = 1 << 4,
104
ENCA_CHARSET_VARIABLE = 1 << 5,
105
ENCA_CHARSET_BINARY = 1 << 6,
106
ENCA_CHARSET_REGULAR = 1 << 7,
107
ENCA_CHARSET_MULTIBYTE = 1 << 8
113
* @ENCA_EINVALUE: Invalid value (usually of an option).
114
* @ENCA_EEMPTY: Sample is empty.
115
* @ENCA_EFILTERED: After filtering, (almost) nothing remained.
116
* @ENCA_ENOCS8: Multibyte tests failed and language contains no 8bit charsets.
117
* @ENCA_ESIGNIF: Too few significant characters.
118
* @ENCA_EWINNER: No clear winner.
119
* @ENCA_EGARBAGE: Sample is garbage.
137
* Unknown character set id.
139
* Use enca_charset_is_known() to check for unknown charset instead of direct
142
#define ENCA_CS_UNKNOWN (-1)
147
* Not-a-character in unicode tables.
149
#define ENCA_NOT_A_CHAR 0xffff
151
/* Published (opaque) typedefs */
152
/* Opaque analyser handle: a pointer to struct _EncaAnalyserState. */
typedef struct _EncaAnalyserState *EncaAnalyser;
154
/* Public (transparent) typedefs */
155
/* Short name for struct _EncaEncoding (a charset id plus surface flags). */
typedef struct _EncaEncoding EncaEncoding;
159
* @charset: Numeric charset identifier.
160
* @surface: Surface flags.
162
* Encoding, i.e. charset and surface.
164
* This is what enca_analyse() and enca_analyse_const() return.
166
* The @charset field is an opaque numerical charset identifier, which has no
167
* meaning outside Enca library.
168
* You will probably want to use it only as enca_charset_name() argument.
169
* It is only guaranteed not to change meaning
170
* during program execution time; change of its interpretation (e.g. due to
171
* addition of new charsets) is not considered API change.
173
* The @surface field is a combination of #EncaSurface flags. You may want
174
* to ignore it completely; you should use enca_set_interpreted_surfaces()
175
* to disable weird surfaces then.
177
struct _EncaEncoding {
  int charset;          /* Numeric charset identifier. */
  EncaSurface surface;  /* Combination of EncaSurface flags. */
};
179
void (*enca_set_multibyte) (EncaAnalyser analyser, int multibyte);
180
void (*enca_set_interpreted_surfaces) (EncaAnalyser analyser, int interpreted_surfaces);
181
void (*enca_set_ambiguity) (EncaAnalyser analyser, int ambiguity);
182
void (*enca_set_filtering) (EncaAnalyser analyser, int filtering);
183
void (*enca_set_garbage_test) (EncaAnalyser analyser, int garabage_test);
184
void (*enca_set_termination_strictness) (EncaAnalyser analyser, int termination_strictness);
185
int (*enca_set_significant) (EncaAnalyser analyser, size_t significant);
186
int (*enca_set_threshold) (EncaAnalyser analyser, double threshold);
187
const char* (*enca_charset_name) (int charset, EncaNameStyle whatname);
188
int* (*enca_get_language_charsets) (const char *langname, size_t *n);
189
EncaAnalyser (*enca_analyser_alloc) (const char *langname);
190
void (*enca_analyser_free) (EncaAnalyser analyser);
191
EncaEncoding (*enca_analyse_const) (EncaAnalyser analyser,const unsigned char *buffer, size_t size);
194
* enca_charset_is_known:
197
* Expands to nonzero when the charset is known (i.e. it's not
200
/* Nonzero when @cs is a known charset, i.e. differs from ENCA_CS_UNKNOWN. */
#define enca_charset_is_known(cs) \
  ((cs) != ENCA_CS_UNKNOWN)
204
* enca_charset_is_7bit:
207
* Expands to nonzero when characters are represented with 7bit characters.
209
/* Nonzero when the properties of @cs include ENCA_CHARSET_7BIT. */
#define enca_charset_is_7bit(cs) \
  (enca_charset_properties(cs) & ENCA_CHARSET_7BIT)
213
* enca_charset_is_8bit:
216
* Expands to nonzero when characters are represented with bytes.
218
/* Nonzero when the properties of @cs include ENCA_CHARSET_8BIT. */
#define enca_charset_is_8bit(cs) \
  (enca_charset_properties(cs) & ENCA_CHARSET_8BIT)
222
* enca_charset_is_16bit:
225
* Expands to nonzero when characters are represented with 2byte words.
227
/* Nonzero when the properties of @cs include ENCA_CHARSET_16BIT. */
#define enca_charset_is_16bit(cs) \
  (enca_charset_properties(cs) & ENCA_CHARSET_16BIT)
231
* enca_charset_is_32bit:
234
* Expands to nonzero when characters are represented with 4byte words.
236
/* Nonzero when the properties of @cs include ENCA_CHARSET_32BIT. */
#define enca_charset_is_32bit(cs) \
  (enca_charset_properties(cs) & ENCA_CHARSET_32BIT)
240
* enca_charset_is_fixed:
243
* Expands to nonzero when one character consists of one fundamental piece.
245
/* Nonzero when the properties of @cs include ENCA_CHARSET_FIXED. */
#define enca_charset_is_fixed(cs) \
  (enca_charset_properties(cs) & ENCA_CHARSET_FIXED)
249
* enca_charset_is_variable:
252
* Expands to nonzero when one character consists of variable number of
253
* fundamental pieces.
255
/* Nonzero when the properties of @cs include ENCA_CHARSET_VARIABLE. */
#define enca_charset_is_variable(cs) \
  (enca_charset_properties(cs) & ENCA_CHARSET_VARIABLE)
259
* enca_charset_is_binary:
262
* Expands to nonzero when charset is binary from ASCII viewpoint.
264
/* Nonzero when the properties of @cs include ENCA_CHARSET_BINARY. */
#define enca_charset_is_binary(cs) \
  (enca_charset_properties(cs) & ENCA_CHARSET_BINARY)
268
* enca_charset_is_regular:
271
* Expands to nonzero when charset is language dependent (8bit) charset.
273
/* Nonzero when the properties of @cs include ENCA_CHARSET_REGULAR. */
#define enca_charset_is_regular(cs) \
  (enca_charset_properties(cs) & ENCA_CHARSET_REGULAR)
277
* enca_charset_is_multibyte:
280
* Expands to nonzero when charset is multibyte.
282
/* Nonzero when the properties of @cs include ENCA_CHARSET_MULTIBYTE. */
#define enca_charset_is_multibyte(cs) \
  (enca_charset_properties(cs) & ENCA_CHARSET_MULTIBYTE)
287
#endif /* __cplusplus */