@@ -40,9 +40,6 @@ static void mb_wchar_to_cp50220(uint32_t *in, size_t len, mb_convert_buf *buf, b
4040static void mb_wchar_to_cp50221 (uint32_t * in , size_t len , mb_convert_buf * buf , bool end );
4141static void mb_wchar_to_cp50222 (uint32_t * in , size_t len , mb_convert_buf * buf , bool end );
4242
43- /* See mbstring.c */
44- uint32_t mb_convert_kana_codepoint (uint32_t c , uint32_t next , bool * consumed , uint32_t * second , int mode );
45-
4643/* Previously, a dubious 'encoding' called 'cp50220raw' was supported
4744 * This was just CP50220, but the implementation was less strict regarding
4845 * invalid characters; it would silently pass some through
@@ -333,6 +330,198 @@ static int mbfl_filt_conv_cp5022x_wchar_flush(mbfl_convert_filter *filter)
333330 return 0 ;
334331}
335332
333+ /* Apply various transforms to input codepoint, such as converting halfwidth katakana
334+ * to fullwidth katakana. `mode` is a bitfield which controls which transforms are
335+ * actually performed. The bit values are defined in translit_kana_jisx0201_jisx0208.h.
336+ * `mode` must not call for transforms which are inverses (i.e. which would cancel
337+ * each other out).
338+ *
339+ * In some cases, successive input codepoints may be merged into one output codepoint.
340+ * (That is the purpose of the `next` parameter.) If the `next` codepoint is consumed
341+ * and should be skipped over, `*consumed` will be set to true. Otherwise, `*consumed`
342+ * will not be modified. If there is no following codepoint, `next` should be zero.
343+ *
344+ * Again, in some cases, one input codepoint may convert to two output codepoints.
345+ * If so, the second output codepoint will be stored in `*second`.
346+ *
347+ * Return the resulting codepoint. If none of the requested transforms apply, return
348+ * the input codepoint unchanged.
349+ */
350+ uint32_t mb_convert_kana_codepoint (uint32_t c , uint32_t next , bool * consumed , uint32_t * second , unsigned int mode )
351+ {
352+ if ((mode & MBFL_HAN2ZEN_ALL ) && c >= 0x21 && c <= 0x7D && c != '"' && c != '\'' && c != '\\' ) {
353+ return c + 0xFEE0 ;
354+ }
355+ if ((mode & MBFL_HAN2ZEN_ALPHA ) && ((c >= 'A' && c <= 'Z' ) || (c >= 'a' && c <= 'z' ))) {
356+ return c + 0xFEE0 ;
357+ }
358+ if ((mode & MBFL_HAN2ZEN_NUMERIC ) && c >= '0' && c <= '9' ) {
359+ return c + 0xFEE0 ;
360+ }
361+ if ((mode & MBFL_HAN2ZEN_SPACE ) && c == ' ' ) {
362+ return 0x3000 ;
363+ }
364+
365+ if (mode & (MBFL_HAN2ZEN_KATAKANA | MBFL_HAN2ZEN_HIRAGANA )) {
366+ /* Convert Hankaku kana to Zenkaku kana
367+ * Either all Hankaku kana (including katakana and hiragana) will be converted
368+ * to Zenkaku katakana, or to Zenkaku hiragana */
369+ if ((mode & MBFL_HAN2ZEN_KATAKANA ) && (mode & MBFL_HAN2ZEN_GLUE )) {
370+ if (c >= 0xFF61 && c <= 0xFF9F ) {
371+ int n = c - 0xFF60 ;
372+
373+ if (next >= 0xFF61 && next <= 0xFF9F ) {
374+ if (next == 0xFF9E && ((n >= 22 && n <= 36 ) || (n >= 42 && n <= 46 ))) {
375+ * consumed = true;
376+ return 0x3001 + hankana2zenkana_table [n ];
377+ }
378+ if (next == 0xFF9E && n == 19 ) {
379+ * consumed = true;
380+ return 0x30F4 ;
381+ }
382+ if (next == 0xFF9F && n >= 42 && n <= 46 ) {
383+ * consumed = true;
384+ return 0x3002 + hankana2zenkana_table [n ];
385+ }
386+ }
387+
388+ return 0x3000 + hankana2zenkana_table [n ];
389+ }
390+ }
391+ if ((mode & MBFL_HAN2ZEN_HIRAGANA ) && (mode & MBFL_HAN2ZEN_GLUE )) {
392+ if (c >= 0xFF61 && c <= 0xFF9F ) {
393+ int n = c - 0xFF60 ;
394+
395+ if (next >= 0xFF61 && next <= 0xFF9F ) {
396+ if (next == 0xFF9E && ((n >= 22 && n <= 36 ) || (n >= 42 && n <= 46 ))) {
397+ * consumed = true;
398+ return 0x3001 + hankana2zenhira_table [n ];
399+ }
400+ if (next == 0xFF9F && n >= 42 && n <= 46 ) {
401+ * consumed = true;
402+ return 0x3002 + hankana2zenhira_table [n ];
403+ }
404+ }
405+
406+ return 0x3000 + hankana2zenhira_table [n ];
407+ }
408+ }
409+ if ((mode & MBFL_HAN2ZEN_KATAKANA ) && c >= 0xFF61 && c <= 0xFF9F ) {
410+ return 0x3000 + hankana2zenkana_table [c - 0xFF60 ];
411+ }
412+ if ((mode & MBFL_HAN2ZEN_HIRAGANA ) && c >= 0xFF61 && c <= 0xFF9F ) {
413+ return 0x3000 + hankana2zenhira_table [c - 0xFF60 ];
414+ }
415+ }
416+
417+ if (mode & MBFL_HAN2ZEN_SPECIAL ) { /* special ascii to symbol */
418+ if (c == '\\' || c == 0xA5 ) { /* YEN SIGN */
419+ return 0xFFE5 ; /* FULLWIDTH YEN SIGN */
420+ }
421+ if (c == 0x7E || c == 0x203E ) {
422+ return 0xFFE3 ; /* FULLWIDTH MACRON */
423+ }
424+ if (c == '\'' ) {
425+ return 0x2019 ; /* RIGHT SINGLE QUOTATION MARK */
426+ }
427+ if (c == '"' ) {
428+ return 0x201D ; /* RIGHT DOUBLE QUOTATION MARK */
429+ }
430+ }
431+
432+ if (mode & (MBFL_ZEN2HAN_ALL | MBFL_ZEN2HAN_ALPHA | MBFL_ZEN2HAN_NUMERIC | MBFL_ZEN2HAN_SPACE )) {
433+ /* Zenkaku to Hankaku */
434+ if ((mode & MBFL_ZEN2HAN_ALL ) && c >= 0xFF01 && c <= 0xFF5D && c != 0xFF02 && c != 0xFF07 && c != 0xFF3C ) {
435+ /* all except " ' \ ~ */
436+ return c - 0xFEE0 ;
437+ }
438+ if ((mode & MBFL_ZEN2HAN_ALPHA ) && ((c >= 0xFF21 && c <= 0xFF3A ) || (c >= 0xFF41 && c <= 0xFF5A ))) {
439+ return c - 0xFEE0 ;
440+ }
441+ if ((mode & MBFL_ZEN2HAN_NUMERIC ) && (c >= 0xFF10 && c <= 0xFF19 )) {
442+ return c - 0xFEE0 ;
443+ }
444+ if ((mode & MBFL_ZEN2HAN_SPACE ) && (c == 0x3000 )) {
445+ return ' ' ;
446+ }
447+ if ((mode & MBFL_ZEN2HAN_ALL ) && (c == 0x2212 )) { /* MINUS SIGN */
448+ return '-' ;
449+ }
450+ }
451+
452+ if (mode & (MBFL_ZEN2HAN_KATAKANA | MBFL_ZEN2HAN_HIRAGANA )) {
453+ /* Zenkaku kana to hankaku kana */
454+ if ((mode & MBFL_ZEN2HAN_KATAKANA ) && c >= 0x30A1 && c <= 0x30F4 ) {
455+ /* Zenkaku katakana to hankaku kana */
456+ int n = c - 0x30A1 ;
457+ if (zenkana2hankana_table [n ][1 ]) {
458+ * second = 0xFF00 + zenkana2hankana_table [n ][1 ];
459+ }
460+ return 0xFF00 + zenkana2hankana_table [n ][0 ];
461+ }
462+ if ((mode & MBFL_ZEN2HAN_HIRAGANA ) && c >= 0x3041 && c <= 0x3093 ) {
463+ /* Zenkaku hiragana to hankaku kana */
464+ int n = c - 0x3041 ;
465+ if (zenkana2hankana_table [n ][1 ]) {
466+ * second = 0xFF00 + zenkana2hankana_table [n ][1 ];
467+ }
468+ return 0xFF00 + zenkana2hankana_table [n ][0 ];
469+ }
470+ if (c == 0x3001 ) {
471+ return 0xFF64 ; /* HALFWIDTH IDEOGRAPHIC COMMA */
472+ }
473+ if (c == 0x3002 ) {
474+ return 0xFF61 ; /* HALFWIDTH IDEOGRAPHIC FULL STOP */
475+ }
476+ if (c == 0x300C ) {
477+ return 0xFF62 ; /* HALFWIDTH LEFT CORNER BRACKET */
478+ }
479+ if (c == 0x300D ) {
480+ return 0xFF63 ; /* HALFWIDTH RIGHT CORNER BRACKET */
481+ }
482+ if (c == 0x309B ) {
483+ return 0xFF9E ; /* HALFWIDTH KATAKANA VOICED SOUND MARK */
484+ }
485+ if (c == 0x309C ) {
486+ return 0xff9f ; /* HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK */
487+ }
488+ if (c == 0x30FC ) {
489+ return 0xFF70 ; /* HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK */
490+ }
491+ if (c == 0x30FB ) {
492+ return 0xFF65 ; /* HALFWIDTH KATAKANA MIDDLE DOT */
493+ }
494+ }
495+
496+ if (mode & (MBFL_ZENKAKU_HIRA2KATA | MBFL_ZENKAKU_KATA2HIRA )) {
497+ if ((mode & MBFL_ZENKAKU_HIRA2KATA ) && ((c >= 0x3041 && c <= 0x3093 ) || c == 0x309D || c == 0x309E )) {
498+ /* Zenkaku hiragana to Zenkaku katakana */
499+ return c + 0x60 ;
500+ }
501+ if ((mode & MBFL_ZENKAKU_KATA2HIRA ) && ((c >= 0x30A1 && c <= 0x30F3 ) || c == 0x30FD || c == 0x30FE )) {
502+ /* Zenkaku katakana to Zenkaku hiragana */
503+ return c - 0x60 ;
504+ }
505+ }
506+
507+ if (mode & MBFL_ZEN2HAN_SPECIAL ) { /* special symbol to ascii */
508+ if (c == 0xFFE5 || c == 0xFF3C ) { /* FULLWIDTH YEN SIGN/FULLWIDTH REVERSE SOLIDUS */
509+ return '\\' ;
510+ }
511+ if (c == 0xFFE3 || c == 0x203E ) { /* FULLWIDTH MACRON/OVERLINE */
512+ return '~' ;
513+ }
514+ if (c == 0x2018 || c == 0x2019 ) { /* LEFT/RIGHT SINGLE QUOTATION MARK*/
515+ return '\'' ;
516+ }
517+ if (c == 0x201C || c == 0x201D ) { /* LEFT/RIGHT DOUBLE QUOTATION MARK */
518+ return '"' ;
519+ }
520+ }
521+
522+ return c ;
523+ }
524+
336525static int mbfl_filt_conv_wchar_cp50220 (int c , mbfl_convert_filter * filter )
337526{
338527 int mode = MBFL_HAN2ZEN_KATAKANA | MBFL_HAN2ZEN_GLUE ;
0 commit comments