2 * Copyright (C) 1984-2002 Mark Nudelman
4 * You may distribute under the terms of either the GNU General Public
5 * License or the Less License, as specified in the README file.
7 * For more information about less, or for information on how to
8 * contact the author, see the README file.
13 * Functions to define the character set
14 * and do things specific to the character set.
23 public int utf_mode
= 0;
26 * Predefined character sets,
27 * selected by the LESSCHARSET environment variable.
34 { "ascii", NULL
, "8bcccbcc18b95.b" },
35 { "dos", NULL
, "8bcccbcc12bc5b223.b" },
36 { "ebcdic", NULL
, "5bc6bcc7bcc41b.9b7.9b5.b..8b6.10b6.b9.7b9.8b8.17b3.3b9.7b9.8b8.6b10.b.b.b." },
37 { "IBM-1047", NULL
, "4cbcbc3b9cbccbccbb4c6bcc5b3cbbc4bc4bccbc191.b" },
38 { "iso8859", NULL
, "8bcccbcc18b95.33b." },
39 { "koi8-r", NULL
, "8bcccbcc18b95.b128." },
40 { "next", NULL
, "8bcccbcc18b95.bb125.bb" },
41 { "utf-8", &utf_mode
, "8bcccbcc18b." },
49 { "latin1", "iso8859" },
50 { "latin9", "iso8859" },
54 #define IS_BINARY_CHAR 01
55 #define IS_CONTROL_CHAR 02
57 static char chardef
[256];
58 static char *binfmt
= NULL
;
59 public int binattr
= AT_STANDOUT
;
63 * Define a charset, given a description string.
64 * The string consists of 256 letters,
65 * one for each character in the charset.
66 * If the string is shorter than 256 letters, missing letters
67 * are taken to be identical to the last one.
68 * A decimal number followed by a letter is taken to be a
69 * repetition of the letter.
71 * Each letter is one of:
98 v
= IS_BINARY_CHAR
|IS_CONTROL_CHAR
;
101 case '0': case '1': case '2': case '3': case '4':
102 case '5': case '6': case '7': case '8': case '9':
103 n
= (10 * n
) + (s
[-1] - '0');
107 error("invalid chardef", NULL_PARG
);
114 if (cp
>= chardef
+ sizeof(chardef
))
116 error("chardef longer than 256", NULL_PARG
);
125 while (cp
< chardef
+ sizeof(chardef
))
130 * Define a charset, given a charset name.
131 * The valid charset names are listed in the "charsets" array.
137 register struct charset
*p
;
138 register struct cs_alias
*a
;
140 if (name
== NULL
|| *name
== '\0')
143 /* First see if the name is an alias. */
144 for (a
= cs_aliases
; a
->name
!= NULL
; a
++)
146 if (strcmp(name
, a
->name
) == 0)
153 for (p
= charsets
; p
->name
!= NULL
; p
++)
155 if (strcmp(name
, p
->name
) == 0)
158 if (p
->p_flag
!= NULL
)
164 error("invalid charset name", NULL_PARG
);
172 * Define a charset, given a locale name.
179 setlocale(LC_ALL
, "");
180 for (c
= 0; c
< (int) sizeof(chardef
); c
++)
185 chardef
[c
] = IS_CONTROL_CHAR
;
187 chardef
[c
] = IS_BINARY_CHAR
|IS_CONTROL_CHAR
;
193 * Define the printing format for control chars.
199 if (s
== NULL
|| *s
== '\0')
202 * Select the attributes if it starts with "*".
208 case 'd': binattr
= AT_BOLD
; break;
209 case 'k': binattr
= AT_BLINK
; break;
210 case 's': binattr
= AT_STANDOUT
; break;
211 case 'u': binattr
= AT_UNDERLINE
; break;
212 default: binattr
= AT_NORMAL
; break;
220 * Initialize charset data structures.
227 s
= lgetenv("LESSBINFMT");
231 * See if environment variable LESSCHARSET is defined.
233 s
= lgetenv("LESSCHARSET");
237 * LESSCHARSET is not defined: try LESSCHARDEF.
239 s
= lgetenv("LESSCHARDEF");
240 if (s
!= NULL
&& *s
!= '\0')
248 * Check whether LC_ALL, LC_CTYPE or LANG indicates that UTF-8 is in use.
250 if ((s
= lgetenv("LC_ALL")) != NULL
||
251 (s
= lgetenv("LC_CTYPE")) != NULL
||
252 (s
= lgetenv("LANG")) != NULL
)
254 if (strstr(s
, "UTF-8") != NULL
|| strstr(s
, "utf-8") != NULL
)
255 if (icharset("utf-8"))
270 (void) icharset("dos");
273 * Default to "latin1".
275 (void) icharset("latin1");
281 * Is a given character a "binary" character?
288 return (chardef
[c
] & IS_BINARY_CHAR
);
292 * Is a given character a "control" character?
299 return (chardef
[c
] & IS_CONTROL_CHAR
);
303 * Return the printable form of a character.
304 * For example, in the "ascii" charset '\3' is printed as "^C".
313 if (!control_char(c
))
314 sprintf(buf
, "%c", c
);
318 else if (!binary_char(c
) && c
< 64)
321 * This array roughly inverts CONTROL() #defined in less.h,
322 * and should be kept in sync with CONTROL() and IBM-1047.
328 "..V....D....TU.Z"[c
]);
330 else if (c
< 128 && !control_char(c
^ 0100))
331 sprintf(buf
, "^%c", c
^ 0100);
334 sprintf(buf
, binfmt
, c
);