|
93 | 93 | #include "filters/mbfilter_singlebyte.h" |
94 | 94 | #include "filters/mbfilter_utf8.h" |
95 | 95 |
|
96 | | -#include "rare_cp_bitvec.h" |
97 | | - |
98 | | -/* |
99 | | - * encoding detector |
100 | | - */ |
101 | | -static int mbfl_estimate_encoding_likelihood(int input_cp, void *void_data) |
102 | | -{ |
103 | | - mbfl_encoding_detector_data *data = void_data; |
104 | | - unsigned int c = input_cp; |
105 | | - |
106 | | - /* Receive wchars decoded from input string using candidate encoding. |
107 | | - * If the string was invalid in the candidate encoding, we assume |
108 | | - * it's the wrong one. Otherwise, give the candidate many 'demerits' |
109 | | - * for each 'rare' codepoint found, a smaller number for each ASCII |
110 | | - * punctuation character, and 1 for all other codepoints. |
111 | | - * |
112 | | - * The 'common' codepoints should cover the vast majority of |
113 | | - * codepoints we are likely to see in practice, while only covering |
114 | | - * a small minority of the entire Unicode encoding space. Why? |
115 | | - * Well, if the test string happens to be valid in an incorrect |
116 | | - * candidate encoding, the bogus codepoints which it decodes to will |
117 | | - * be more or less random. By treating the majority of codepoints as |
118 | | - * 'rare', we ensure that in almost all such cases, the bogus |
119 | | - * codepoints will include plenty of 'rares', thus giving the |
120 | | - * incorrect candidate encoding lots of demerits. See |
121 | | - * common_codepoints.txt for the actual list used. |
122 | | - * |
123 | | - * So, why give extra demerits for ASCII punctuation characters? It's |
124 | | - * because there are some text encodings, like UTF-7, HZ, and ISO-2022, |
125 | | - * which deliberately only use bytes in the ASCII range. When |
126 | | - * misinterpreted as ASCII/UTF-8, strings in these encodings will |
127 | | - * have an unusually high number of ASCII punctuation characters. |
128 | | - * So giving extra demerits for such characters will improve |
129 | | - * detection accuracy for UTF-7 and similar encodings. |
130 | | - * |
131 | | - * Finally, why 1 demerit for all other characters? That penalizes |
132 | | - * long strings, meaning we will tend to choose a candidate encoding |
133 | | - * in which the test string decodes to a smaller number of |
134 | | - * codepoints. That prevents single-byte encodings in which almost |
135 | | - * every possible input byte decodes to a 'common' codepoint from |
136 | | - * being favored too much. */ |
137 | | - if (c == MBFL_BAD_INPUT) { |
138 | | - data->num_illegalchars++; |
139 | | - } else if (c > 0xFFFF) { |
140 | | - data->score += 40; |
141 | | - } else if (c >= 0x21 && c <= 0x2F) { |
142 | | - data->score += 6; |
143 | | - } else if ((rare_codepoint_bitvec[c >> 5] >> (c & 0x1F)) & 1) { |
144 | | - data->score += 30; |
145 | | - } else { |
146 | | - data->score += 1; |
147 | | - } |
148 | | - return 0; |
149 | | -} |
150 | | - |
151 | | -mbfl_encoding_detector *mbfl_encoding_detector_new(const mbfl_encoding **elist, int elistsz, int strict) |
152 | | -{ |
153 | | - if (!elistsz) { |
154 | | - return NULL; |
155 | | - } |
156 | | - |
157 | | - mbfl_encoding_detector *identd = emalloc(sizeof(mbfl_encoding_detector)); |
158 | | - identd->filter_list = ecalloc(elistsz, sizeof(mbfl_convert_filter*)); |
159 | | - identd->filter_data = ecalloc(elistsz, sizeof(mbfl_encoding_detector_data)); |
160 | | - |
161 | | - int filter_list_size = 0; |
162 | | - for (int i = 0; i < elistsz; i++) { |
163 | | - mbfl_convert_filter *filter = mbfl_convert_filter_new(elist[i], &mbfl_encoding_wchar, |
164 | | - mbfl_estimate_encoding_likelihood, NULL, &identd->filter_data[filter_list_size]); |
165 | | - if (filter) { |
166 | | - identd->filter_list[filter_list_size++] = filter; |
167 | | - } |
168 | | - } |
169 | | - identd->filter_list_size = filter_list_size; |
170 | | - identd->strict = strict; |
171 | | - return identd; |
172 | | -} |
173 | | - |
174 | | -void mbfl_encoding_detector_delete(mbfl_encoding_detector *identd) |
175 | | -{ |
176 | | - for (int i = 0; i < identd->filter_list_size; i++) { |
177 | | - mbfl_convert_filter_delete(identd->filter_list[i]); |
178 | | - } |
179 | | - efree(identd->filter_list); |
180 | | - efree(identd->filter_data); |
181 | | - efree(identd); |
182 | | -} |
183 | | - |
184 | | -int mbfl_encoding_detector_feed(mbfl_encoding_detector *identd, mbfl_string *string) |
185 | | -{ |
186 | | - int num = identd->filter_list_size; |
187 | | - size_t n = string->len; |
188 | | - unsigned char *p = string->val; |
189 | | - int bad = 0; |
190 | | - |
191 | | - if (identd->strict) { |
192 | | - for (int i = 0; i < num; i++) { |
193 | | - mbfl_convert_filter *filter = identd->filter_list[i]; |
194 | | - mbfl_encoding_detector_data *data = &identd->filter_data[i]; |
195 | | - if (filter->from->check != NULL && !(filter->from->check)(p, n)) { |
196 | | - data->num_illegalchars++; |
197 | | - } |
198 | | - } |
199 | | - } |
200 | | - |
201 | | - while (n--) { |
202 | | - for (int i = 0; i < num; i++) { |
203 | | - mbfl_convert_filter *filter = identd->filter_list[i]; |
204 | | - mbfl_encoding_detector_data *data = &identd->filter_data[i]; |
205 | | - if (!data->num_illegalchars) { |
206 | | - (*filter->filter_function)(*p, filter); |
207 | | - if (data->num_illegalchars) { |
208 | | - bad++; |
209 | | - } |
210 | | - } |
211 | | - } |
212 | | - if ((num - 1) <= bad && !identd->strict) { |
213 | | - return 1; |
214 | | - } |
215 | | - p++; |
216 | | - } |
217 | | - |
218 | | - for (int i = 0; i < num; i++) { |
219 | | - mbfl_convert_filter *filter = identd->filter_list[i]; |
220 | | - (filter->filter_flush)(filter); |
221 | | - } |
222 | | - |
223 | | - return 0; |
224 | | -} |
225 | | - |
226 | | -const mbfl_encoding *mbfl_encoding_detector_judge(mbfl_encoding_detector *identd) |
227 | | -{ |
228 | | - size_t best_score = SIZE_MAX; /* Low score is 'better' */ |
229 | | - const mbfl_encoding *enc = NULL; |
230 | | - |
231 | | - for (int i = 0; i < identd->filter_list_size; i++) { |
232 | | - mbfl_convert_filter *filter = identd->filter_list[i]; |
233 | | - mbfl_encoding_detector_data *data = &identd->filter_data[i]; |
234 | | - if (!data->num_illegalchars && data->score < best_score) { |
235 | | - enc = filter->from; |
236 | | - best_score = data->score; |
237 | | - } |
238 | | - } |
239 | | - |
240 | | - return enc; |
241 | | -} |
242 | | - |
243 | 96 | /* |
244 | 97 | * strcut |
245 | 98 | */ |
|
0 commit comments