2 * guess.c - guessing character encoding
4 * Copyright (c) 2000-2009 Shiro Kawai <shiro@acm.org>
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
17 * 3. Neither the name of the authors nor the names of its contributors
18 * may be used to endorse or promote products derived from this
19 * software without specific prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
27 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
28 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
29 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
30 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
31 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 * $Id: guess.c,v 1.6 2008-05-10 13:35:37 shirok Exp $
/*
 * One arc (transition) of an encoding-guessing DFA.  DFA_NEXT follows an
 * arc by assigning `next` to the automaton's state and multiplying the
 * automaton's running score by `score`.
 */
40 typedef struct guess_arc_rec
{
41 unsigned int next
; /* state the automaton moves to when this arc is taken */
42 double score
; /* factor multiplied into the automaton's running score */
/*
 * Running state of one encoding automaton.
 *
 * `states` points at rows of 256 signed chars, i.e. one entry per possible
 * input byte value for each state.  DFA_NEXT reads states[state][ch] and
 * uses the result as an index into the automaton's arc array.
 */
45 typedef struct guess_dfa_rec
{
46 signed char (*states
)[256];
/*
 * DFA helper macros.
 *
 *   DFA_INIT(st, ar)   initialiser for a guess_dfa; used as
 *                      `guess_dfa x = DFA_INIT(table_st, table_ar);`
 *                      (presumably state table and arc table -- the
 *                      expansion is not visible here).
 *   DFA_NEXT(dfa, ch)  when dfa.state >= 0, looks up the arc index for
 *                      byte `ch` in the current state, moves to that
 *                      arc's `next` state and multiplies dfa.score by the
 *                      arc's `score`.
 *   DFA_ALIVE(dfa)     non-zero while dfa.state >= 0.
 *
 * NOTE(review): the `dfa` argument is expanded unparenthesised and more
 * than once (dfa.state, dfa.states, ...), so pass a plain variable with
 * no side effects.
 */
52 #define DFA_INIT(st, ar) \
55 #define DFA_NEXT(dfa, ch) \
58 if (dfa.state >= 0) { \
59 arc__ = dfa.states[dfa.state][ch]; \
63 dfa.state = dfa.arcs[arc__].next; \
64 dfa.score *= dfa.arcs[arc__].score; \
69 #define DFA_ALIVE(dfa) (dfa.state >= 0)
71 /* include DFA table generated by guess.scm */
72 #include "guess_tab.c"
/*
 * guess_jp - guess which Japanese encoding the bytes of stream `in` use.
 *
 * Three automata (EUC-JP, Shift_JIS, UTF-8) are initialised from the
 * guess_eucj_*, guess_sjis_* and guess_utf8_* tables, and the stream is
 * read one byte at a time with fgetc() until EOF.
 *
 * Returns one of the string literals "iso-2022-jp", "euc-jp", "cp932" or
 * "utf-8".  According to the tie-break comment below, `def` is the
 * encoding picked when candidates tie; the statement that returns it is
 * not visible in this listing.
 */
74 static const char *guess_jp(FILE *in
, const char *def
)
77 guess_dfa eucj
= DFA_INIT(guess_eucj_st
, guess_eucj_ar
);
78 guess_dfa sjis
= DFA_INIT(guess_sjis_st
, guess_sjis_ar
);
79 guess_dfa utf8
= DFA_INIT(guess_utf8_st
, guess_utf8_ar
);
/* Best-scoring automaton so far; stays NULL until a live one is chosen. */
80 guess_dfa
*top
= NULL
;
/* Stays 1 while every byte >= 0x80 seen so far lies in 0xa1..0xdf. */
81 int sjis_halfwidth_alive
= 1;
83 while ((c
= fgetc(in
)) != EOF
) {
85 /* special treatment of the JIS (ISO-2022-JP) escape sequence */
/* NOTE(review): presumably this tests the byte that follows an ESC; the
   enclosing check is not visible here -- confirm. */
90 if (c
== '$' || c
== '(') return "iso-2022-jp";
/* A high byte outside 0xa1..0xdf rules out the "half-width only" case. */
93 if (c
>= 0x80 && (c
< 0xa1 || c
> 0xdf))
94 sjis_halfwidth_alive
= 0;
/* If exactly one automaton is still alive, its encoding is the answer. */
96 if (DFA_ALIVE(eucj
)) {
97 if (!DFA_ALIVE(sjis
) && !DFA_ALIVE(utf8
)) return "euc-jp";
100 if (DFA_ALIVE(sjis
)) {
101 if (!DFA_ALIVE(eucj
) && !DFA_ALIVE(utf8
)) return "cp932";
104 if (DFA_ALIVE(utf8
)) {
105 if (!DFA_ALIVE(sjis
) && !DFA_ALIVE(eucj
)) return "utf-8";
109 if (!DFA_ALIVE(eucj
) && !DFA_ALIVE(sjis
) && !DFA_ALIVE(utf8
)) {
110 /* we ran out of possibilities */
/* EUC-JP and Shift_JIS both alive, UTF-8 dead, and the half-width flag
   never cleared. */
115 if (DFA_ALIVE(eucj
) && DFA_ALIVE(sjis
) && !DFA_ALIVE(utf8
) &&
116 sjis_halfwidth_alive
) {
117 /* the only non-ASCII bytes seen are cp932 half-width characters */
121 /* Now we have ambiguous input.  Pick the highest score.  If more than
   one candidate ties, pick the default encoding. */
/* The candidates are considered in the order EUC-JP, UTF-8, Shift_JIS.
   UTF-8 displaces the current leader on an equal or higher score (<=);
   Shift_JIS only on a strictly higher one (<).
   NOTE(review): `top` is still NULL here when EUC-JP is dead; the guard
   that must precede top->score is not visible in this listing -- confirm. */
123 if (DFA_ALIVE(eucj
)) top
= &eucj
;
124 if (DFA_ALIVE(utf8
)) {
126 if (top
->score
<= utf8
.score
) top
= &utf8
;
131 if (DFA_ALIVE(sjis
)) {
133 if (top
->score
< sjis
.score
) top
= &sjis
;
/* Map the winning automaton back to its encoding name. */
139 if (top
== &eucj
) return "euc-jp";
140 if (top
== &utf8
) return "utf-8";
141 if (top
== &sjis
) return "cp932";
/*
 * guess_bom - look for a Unicode byte-order mark in stream `in`.
 *
 * Returns the string literal "ucs-bom" when the bytes held in c/c2(/c3)
 * are FE FF, FF FE or EF BB BF.  How those bytes are read from `in` and
 * what is returned when no mark matches are not visible in this listing.
 */
145 static const char *guess_bom(FILE *in
)
/* FE or FF as the first byte: candidate two-byte (UTF-16) mark. */
150 if (c
== 0xFE || c
== 0xFF) {
/* FE FF: big-endian order. */
152 if (c
== 0xFE && c2
== 0xFF) return "ucs-bom";
/* FF FE: little-endian order. */
153 if (c
== 0xFF && c2
== 0xFE) return "ucs-bom";
/* EF BB BF: the UTF-8 encoded form of the mark. */
155 } else if (c
== 0xEF) {
158 if (c2
== 0xBB && c3
== 0xBF) return "ucs-bom";
166 int guess_encode(char_u
** fenc
, int* fenc_alloced
, char_u
* fname
)
174 smsg((char_u
*)"guess_encode:");
175 smsg((char_u
*)" init: fenc=%s alloced=%d fname=%s\n",
176 *fenc
, *fenc_alloced
, fname
);
182 in
= mch_fopen((const char *)fname
, "r");
188 enc
= guess_jp(in
, "utf-8");
196 smsg((char_u
*)" result: newenc=%s\n", enc
);
201 *fenc
= vim_strsave((char_u
*)enc
);
202 *fenc_alloced
= TRUE
;