Merge branch 'MacVim'
[MacVim/KaoriYa.git] / src / guess.c
blob2371f2d39d564f7a037875aed4a3da1c72c90f34
1 /*
2 * guess.c - guessing character encoding
4 * Copyright (c) 2000-2009 Shiro Kawai <shiro@acm.org>
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
17 * 3. Neither the name of the authors nor the names of its contributors
18 * may be used to endorse or promote products derived from this
19 * software without specific prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
27 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
28 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
29 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
30 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
31 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 * $Id: guess.c,v 1.6 2008-05-10 13:35:37 shirok Exp $
36 #include <stdio.h>
37 #include <string.h>
38 #include "vim.h"
/* One transition of a guessing DFA: the state it leads to and the factor
   the running score is multiplied by when the transition is taken. */
typedef struct guess_arc_rec {
    unsigned int next;          /* next state */
    double score;               /* score */
} guess_arc;

/* A DFA together with its running position and accumulated score. */
typedef struct guess_dfa_rec {
    signed char (*states)[256]; /* per state: arc index for each byte value,
                                   negative when the byte is not accepted */
    guess_arc *arcs;            /* arc table indexed by the values in states */
    int state;                  /* current state, or -1 once the DFA is dead */
    double score;               /* product of the scores of all arcs taken */
} guess_dfa;

/* Static initializer: start in state 0 with a neutral score of 1.0. */
#define DFA_INIT(st, ar) \
    { (st), (ar), 0, 1.0 }

/*
 * Feed one input byte to `dfa` (an lvalue of type guess_dfa; it is evaluated
 * several times, so it must be free of side effects).  `ch` is reduced to
 * unsigned char so the lookup always stays inside the 256-entry row, even
 * when the caller hands in a plain (possibly signed) char.  A dead DFA is
 * left untouched.
 */
#define DFA_NEXT(dfa, ch)                                                   \
    do {                                                                    \
        if ((dfa).state >= 0) {                                             \
            int dfa_arc_ =                                                  \
                (dfa).states[(dfa).state][(unsigned char)(ch)];             \
            if (dfa_arc_ < 0) {                                             \
                (dfa).state = -1;                                           \
            } else {                                                        \
                (dfa).state = (int)(dfa).arcs[dfa_arc_].next;               \
                (dfa).score *= (dfa).arcs[dfa_arc_].score;                  \
            }                                                               \
        }                                                                   \
    } while (0)

/* True while `dfa` has not rejected the input. */
#define DFA_ALIVE(dfa)  ((dfa).state >= 0)
71 /* include DFA table generated by guess.scm */
72 #include "guess_tab.c"
74 static const char *guess_jp(FILE *in, const char *def)
76 int c;
77 guess_dfa eucj = DFA_INIT(guess_eucj_st, guess_eucj_ar);
78 guess_dfa sjis = DFA_INIT(guess_sjis_st, guess_sjis_ar);
79 guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar);
80 guess_dfa *top = NULL;
81 int sjis_halfwidth_alive = 1;
83 while ((c = fgetc(in)) != EOF) {
85 /* special treatment of jis escape sequence */
86 if (c == 0x1b) {
87 c = fgetc(in);
88 if (c == EOF)
89 break;
90 if (c == '$' || c == '(') return "iso-2022-jp";
93 if (c >= 0x80 && (c < 0xa1 || c > 0xdf))
94 sjis_halfwidth_alive = 0;
96 if (DFA_ALIVE(eucj)) {
97 if (!DFA_ALIVE(sjis) && !DFA_ALIVE(utf8)) return "euc-jp";
98 DFA_NEXT(eucj, c);
100 if (DFA_ALIVE(sjis)) {
101 if (!DFA_ALIVE(eucj) && !DFA_ALIVE(utf8)) return "cp932";
102 DFA_NEXT(sjis, c);
104 if (DFA_ALIVE(utf8)) {
105 if (!DFA_ALIVE(sjis) && !DFA_ALIVE(eucj)) return "utf-8";
106 DFA_NEXT(utf8, c);
109 if (!DFA_ALIVE(eucj) && !DFA_ALIVE(sjis) && !DFA_ALIVE(utf8)) {
110 /* we ran out the possibilities */
111 return NULL;
115 if (DFA_ALIVE(eucj) && DFA_ALIVE(sjis) && !DFA_ALIVE(utf8) &&
116 sjis_halfwidth_alive) {
117 /* non-ASCII chars are only cp932 half width chars */
118 return "cp932";
121 /* Now, we have ambigous code. Pick the highest score. If more than
122 one candidate tie, pick the default encoding. */
123 if (DFA_ALIVE(eucj)) top = &eucj;
124 if (DFA_ALIVE(utf8)) {
125 if (top) {
126 if (top->score <= utf8.score) top = &utf8;
127 } else {
128 top = &utf8;
131 if (DFA_ALIVE(sjis)) {
132 if (top) {
133 if (top->score < sjis.score) top = &sjis;
134 } else {
135 top = &sjis;
139 if (top == &eucj) return "euc-jp";
140 if (top == &utf8) return "utf-8";
141 if (top == &sjis) return "cp932";
142 return NULL;
/*
 * guess_bom - check for a Unicode byte order mark at the current position.
 *
 * Recognizes FE FF, FF FE and EF BB BF.  Returns "ucs-bom" when one is
 * found (the bytes examined stay consumed), otherwise NULL with the stream
 * put back where it was on entry.
 *
 * Up to three bytes may have been read when no BOM matches, but ISO C only
 * guarantees a single character of ungetc() pushback.  The position is
 * therefore restored with ftell()/fseek(); ungetc() is used only as a
 * fallback when the stream cannot be repositioned.
 */
static const char *guess_bom(FILE *in)
{
    int seen[3];
    int nseen = 0;
    long start = ftell(in);

    seen[nseen++] = fgetc(in);
    if (seen[0] == 0xFE || seen[0] == 0xFF) {
        seen[nseen++] = fgetc(in);
        if (seen[0] == 0xFE && seen[1] == 0xFF) return "ucs-bom";
        if (seen[0] == 0xFF && seen[1] == 0xFE) return "ucs-bom";
    } else if (seen[0] == 0xEF) {
        seen[nseen++] = fgetc(in);
        seen[nseen++] = fgetc(in);
        if (seen[1] == 0xBB && seen[2] == 0xBF) return "ucs-bom";
    }

    /* No BOM: rewind to where we started. */
    if (start >= 0 && fseek(in, start, SEEK_SET) == 0)
        return NULL;

    /* Unseekable stream: push the bytes back in reverse order, skipping
       EOF markers (best effort beyond the first byte). */
    while (nseen > 0) {
        --nseen;
        if (seen[nseen] != EOF)
            ungetc(seen[nseen], in);
    }
    return NULL;
}
166 int guess_encode(char_u** fenc, int* fenc_alloced, char_u* fname)
168 FILE *in;
169 const char *enc;
171 if (p_verbose >= 1)
173 verbose_enter();
174 smsg((char_u*)"guess_encode:");
175 smsg((char_u*)" init: fenc=%s alloced=%d fname=%s\n",
176 *fenc, *fenc_alloced, fname);
177 verbose_leave();
180 if (!fname)
181 return 0;
182 in = mch_fopen((const char *)fname, "r");
183 if (!in)
184 return 0;
186 enc = guess_bom(in);
187 if (!enc)
188 enc = guess_jp(in, "utf-8");
189 fclose(in);
191 if (enc)
193 if (p_verbose >= 1)
195 verbose_enter();
196 smsg((char_u*)" result: newenc=%s\n", enc);
197 verbose_leave();
199 if (*fenc_alloced)
200 vim_free(*fenc);
201 *fenc = vim_strsave((char_u*)enc);
202 *fenc_alloced = TRUE;
204 return 1;