2 Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; version 2 of the License.
8 This program is distributed in the hope that it will be useful,
9 but WITHOUT ANY WARRANTY; without even the implied warranty of
10 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 GNU General Public License for more details.
13 You should have received a copy of the GNU General Public License
14 along with this program; if not, write to the Free Software
15 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 #include "mysys_priv.h"
19 #include "mysys_err.h"
/*
  The code below implements this functionality:

    - Initializing charset related structures
    - Loading dynamic charsets
    - Searching for a proper CHARSET_INFO
      using charset name, collation name or collation ID
    - Setting server default character set
*/
36 my_bool
my_charset_same(CHARSET_INFO
*cs1
, CHARSET_INFO
*cs2
)
38 return ((cs1
== cs2
) || !strcmp(cs1
->csname
,cs2
->csname
));
/*
  Look up a collation by name in all_charsets.

  Walks the all_charsets array, stopping one slot before its end (see the
  "-1" in the loop bound), and compares the name of every non-NULL entry
  with `name` through my_strcasecmp() under my_charset_latin1.

  NOTE(review): this listing skips source lines (the embedded numbers go
  43, 46, 47, 50, 51), so the return type, the declaration of `cs`, the
  loop increment and the return statements are not visible here.
  Callers in this file use the result as a collation number and treat 0
  as "not found" - confirm against the full source.
*/
43 get_collation_number_internal(const char *name
)
46 for (cs
= all_charsets
;
47 cs
< all_charsets
+array_elements(all_charsets
)-1 ;
50 if ( cs
[0] && cs
[0]->name
&&
51 !my_strcasecmp(&my_charset_latin1
, cs
[0]->name
, name
))
/*
  Build the per-charset lexer lookup tables cs->state_map and
  cs->ident_map.

  Both tables are 256-byte arrays obtained from my_once_alloc().
  state_map receives one MY_LEX_* state per byte value: a first pass
  classifies bytes with the charset predicates (my_isdigit, my_isspace
  and, when USE_MB and USE_MB_IDENT are defined, my_mbcharlen), then
  fixed punctuation bytes are overridden with dedicated states.
  ident_map[i] becomes 1 when state_map[i] is MY_LEX_IDENT or
  MY_LEX_NUMBER_IDENT and 0 otherwise.

  ident_map is filled before the x/X, b/B and n/N entries of state_map
  are switched to the IDENT_OR_HEX, IDENT_OR_BIN and IDENT_OR_NCHAR
  states, so those letters keep the identifier flag given by the first
  pass.

  NOTE(review): several source lines are absent from this listing: the
  local declarations, the first `if` of the classification chain, the
  final `else`, the `#endif`, the allocation-failure branches and the
  final return.  Presumably a nonzero result signals failure, because
  callers test it with `if (init_state_maps(...))` - confirm.
*/
58 static my_bool
init_state_maps(CHARSET_INFO
*cs
)
64 if (!(cs
->state_map
= (uchar
*) my_once_alloc(256, MYF(MY_WME
))))
67 if (!(cs
->ident_map
= (uchar
*) my_once_alloc(256, MYF(MY_WME
))))
70 state_map
= cs
->state_map
;
71 ident_map
= cs
->ident_map
;
73 /* Fill state_map with states to get a faster parser */
74 for (i
=0; i
< 256 ; i
++)
77 state_map
[i
]=(uchar
) MY_LEX_IDENT
;
78 else if (my_isdigit(cs
,i
))
79 state_map
[i
]=(uchar
) MY_LEX_NUMBER_IDENT
;
80 #if defined(USE_MB) && defined(USE_MB_IDENT)
81 else if (my_mbcharlen(cs
, i
)>1)
82 state_map
[i
]=(uchar
) MY_LEX_IDENT
;
84 else if (my_isspace(cs
,i
))
85 state_map
[i
]=(uchar
) MY_LEX_SKIP
;
87 state_map
[i
]=(uchar
) MY_LEX_CHAR
;
89 state_map
[(uchar
)'_']=state_map
[(uchar
)'$']=(uchar
) MY_LEX_IDENT
;
90 state_map
[(uchar
)'\'']=(uchar
) MY_LEX_STRING
;
91 state_map
[(uchar
)'.']=(uchar
) MY_LEX_REAL_OR_POINT
;
92 state_map
[(uchar
)'>']=state_map
[(uchar
)'=']=state_map
[(uchar
)'!']= (uchar
) MY_LEX_CMP_OP
;
93 state_map
[(uchar
)'<']= (uchar
) MY_LEX_LONG_CMP_OP
;
94 state_map
[(uchar
)'&']=state_map
[(uchar
)'|']=(uchar
) MY_LEX_BOOL
;
95 state_map
[(uchar
)'#']=(uchar
) MY_LEX_COMMENT
;
96 state_map
[(uchar
)';']=(uchar
) MY_LEX_SEMICOLON
;
97 state_map
[(uchar
)':']=(uchar
) MY_LEX_SET_VAR
;
98 state_map
[0]=(uchar
) MY_LEX_EOL
;
99 state_map
[(uchar
)'\\']= (uchar
) MY_LEX_ESCAPE
;
100 state_map
[(uchar
)'/']= (uchar
) MY_LEX_LONG_COMMENT
;
101 state_map
[(uchar
)'*']= (uchar
) MY_LEX_END_LONG_COMMENT
;
102 state_map
[(uchar
)'@']= (uchar
) MY_LEX_USER_END
;
103 state_map
[(uchar
) '`']= (uchar
) MY_LEX_USER_VARIABLE_DELIMITER
;
104 state_map
[(uchar
)'"']= (uchar
) MY_LEX_STRING_OR_DELIMITER
;
107 Create a second map to make it faster to find identifiers
109 for (i
=0; i
< 256 ; i
++)
111 ident_map
[i
]= (uchar
) (state_map
[i
] == MY_LEX_IDENT
||
112 state_map
[i
] == MY_LEX_NUMBER_IDENT
);
115 /* Special handling of hex and binary strings */
116 state_map
[(uchar
)'x']= state_map
[(uchar
)'X']= (uchar
) MY_LEX_IDENT_OR_HEX
;
117 state_map
[(uchar
)'b']= state_map
[(uchar
)'B']= (uchar
) MY_LEX_IDENT_OR_BIN
;
118 state_map
[(uchar
)'n']= state_map
[(uchar
)'N']= (uchar
) MY_LEX_IDENT_OR_NCHAR
;
123 static void simple_cs_init_functions(CHARSET_INFO
*cs
)
125 if (cs
->state
& MY_CS_BINSORT
)
126 cs
->coll
= &my_collation_8bit_bin_handler
;
128 cs
->coll
= &my_collation_8bit_simple_ci_handler
;
130 cs
->cset
= &my_charset_8bit_handler
;
/*
  Copy the charset description held in `from` into `to`.

  The number is taken from `from` unless it is zero, in which case the
  value already stored in `to` is kept.  Strings (csname, name, comment,
  tailoring) are duplicated with my_once_strdup() and tables (ctype,
  to_lower, to_upper, sort_order, tab_to_uni) with my_once_memdup();
  init_state_maps(to) is called right after ctype has been copied.

  NOTE(review): the listing omits lines here: the guards in front of
  most copies, the tails of the my_once_memdup() calls, the failure
  branches and the return values.  Only the sort_order and tab_to_uni
  guards are visible; the other copies presumably test the matching
  member of `from` in the same way - confirm.  The int result
  presumably reports failure as nonzero, since add_collation() tests
  `if (cs_copy_data(...))`.
*/
135 static int cs_copy_data(CHARSET_INFO
*to
, CHARSET_INFO
*from
)
137 to
->number
= from
->number
? from
->number
: to
->number
;
140 if (!(to
->csname
= my_once_strdup(from
->csname
,MYF(MY_WME
))))
144 if (!(to
->name
= my_once_strdup(from
->name
,MYF(MY_WME
))))
148 if (!(to
->comment
= my_once_strdup(from
->comment
,MYF(MY_WME
))))
153 if (!(to
->ctype
= (uchar
*) my_once_memdup((char*) from
->ctype
,
154 MY_CS_CTYPE_TABLE_SIZE
,
157 if (init_state_maps(to
))
161 if (!(to
->to_lower
= (uchar
*) my_once_memdup((char*) from
->to_lower
,
162 MY_CS_TO_LOWER_TABLE_SIZE
,
167 if (!(to
->to_upper
= (uchar
*) my_once_memdup((char*) from
->to_upper
,
168 MY_CS_TO_UPPER_TABLE_SIZE
,
171 if (from
->sort_order
)
173 if (!(to
->sort_order
= (uchar
*) my_once_memdup((char*) from
->sort_order
,
174 MY_CS_SORT_ORDER_TABLE_SIZE
,
179 if (from
->tab_to_uni
)
181 uint sz
= MY_CS_TO_UNI_TABLE_SIZE
*sizeof(uint16
);
182 if (!(to
->tab_to_uni
= (uint16
*) my_once_memdup((char*)from
->tab_to_uni
,
187 if (!(to
->tailoring
= my_once_strdup(from
->tailoring
,MYF(MY_WME
))))
/*
  Tell whether a charset description carries every member required by
  the checks below.

  Visible requirements: csname, tab_to_uni, ctype, to_upper, number and
  name must be non-zero, and either sort_order is set or cs->state has
  MY_CS_BINSORT.

  NOTE(review): the expression is incomplete in this listing (the
  embedded numbers skip 201 and the parentheses do not balance), so at
  least one more condition is missing between to_upper and number.
*/
198 static my_bool
simple_cs_is_full(CHARSET_INFO
*cs
)
200 return ((cs
->csname
&& cs
->tab_to_uni
&& cs
->ctype
&& cs
->to_upper
&&
202 (cs
->number
&& cs
->name
&&
203 (cs
->sort_order
|| (cs
->state
& MY_CS_BINSORT
) )));
/*
  Make `to` share the handlers and sorting parameters of `from`.

  Copies the handler pointers (cset, coll), strxfrm_multiply, the
  min/max sort characters and mbminlen/mbmaxlen from `from`, then ORs
  MY_CS_AVAILABLE, MY_CS_LOADED, MY_CS_STRNXFRM and MY_CS_UNICODE into
  to->state.

  NOTE(review): the line carrying the return type and linkage is not
  part of this listing.
*/
208 copy_uca_collation(CHARSET_INFO
*to
, CHARSET_INFO
*from
)
210 to
->cset
= from
->cset
;
211 to
->coll
= from
->coll
;
212 to
->strxfrm_multiply
= from
->strxfrm_multiply
;
213 to
->min_sort_char
= from
->min_sort_char
;
214 to
->max_sort_char
= from
->max_sort_char
;
215 to
->mbminlen
= from
->mbminlen
;
216 to
->mbmaxlen
= from
->mbmaxlen
;
217 to
->state
|= MY_CS_AVAILABLE
| MY_CS_LOADED
|
218 MY_CS_STRNXFRM
| MY_CS_UNICODE
;
/*
  Register the collation described by `cs` in all_charsets.

  This function is the callback passed to my_parse_charset_xml() by
  my_read_charset_file().  The collation is only processed when it has
  a name and a number (looked up by name through
  get_collation_number_internal() when cs->number is zero) that is
  smaller than the number of slots in all_charsets.

  Visible steps:
    - allocate and zero the all_charsets slot when it is still empty;
    - derive MY_CS_PRIMARY and MY_CS_BINSORT from primary_number and
      binary_number, then OR cs->state into the slot;
    - for a slot without MY_CS_COMPILED: copy the data with
      cs_copy_data(), set both case multipliers to 1, then either
      borrow the UCA handlers (csname "ucs2" or "utf8", when those are
      compiled in) or install the simple 8-bit handlers and set
      MY_CS_LOADED, MY_CS_AVAILABLE, MY_CS_CSSORT and MY_CS_PUREASCII
      as applicable;
    - otherwise only number, comment, csname and name are stored in the
      slot;
    - finally primary_number, binary_number and sort_order of `cs` are
      reset.

  NOTE(review): braces, `else` lines, `#endif`s, failure returns and
  comment delimiters are missing from this listing, so the nesting
  described above is inferred - confirm against the full file.
  NOTE(review): the slot is allocated with MYF(0) while the other
  allocations in this file pass MYF(MY_WME).
*/
222 static int add_collation(CHARSET_INFO
*cs
)
224 if (cs
->name
&& (cs
->number
||
225 (cs
->number
=get_collation_number_internal(cs
->name
))) &&
226 cs
->number
< array_elements(all_charsets
))
228 if (!all_charsets
[cs
->number
])
230 if (!(all_charsets
[cs
->number
]=
231 (CHARSET_INFO
*) my_once_alloc(sizeof(CHARSET_INFO
),MYF(0))))
233 bzero((void*)all_charsets
[cs
->number
],sizeof(CHARSET_INFO
));
236 if (cs
->primary_number
== cs
->number
)
237 cs
->state
|= MY_CS_PRIMARY
;
239 if (cs
->binary_number
== cs
->number
)
240 cs
->state
|= MY_CS_BINSORT
;
242 all_charsets
[cs
->number
]->state
|= cs
->state
;
244 if (!(all_charsets
[cs
->number
]->state
& MY_CS_COMPILED
))
246 CHARSET_INFO
*newcs
= all_charsets
[cs
->number
];
247 if (cs_copy_data(all_charsets
[cs
->number
],cs
))
250 newcs
->caseup_multiply
= newcs
->casedn_multiply
= 1;
252 if (!strcmp(cs
->csname
,"ucs2") )
254 #if defined(HAVE_CHARSET_ucs2) && defined(HAVE_UCA_COLLATIONS)
255 copy_uca_collation(newcs
, &my_charset_ucs2_unicode_ci
);
258 else if (!strcmp(cs
->csname
, "utf8"))
260 #if defined (HAVE_CHARSET_utf8) && defined(HAVE_UCA_COLLATIONS)
261 copy_uca_collation(newcs
, &my_charset_utf8_unicode_ci
);
266 uchar
*sort_order
= all_charsets
[cs
->number
]->sort_order
;
267 simple_cs_init_functions(all_charsets
[cs
->number
]);
270 if (simple_cs_is_full(all_charsets
[cs
->number
]))
272 all_charsets
[cs
->number
]->state
|= MY_CS_LOADED
;
274 all_charsets
[cs
->number
]->state
|= MY_CS_AVAILABLE
;
277 Check if case sensitive sort order: A < a < B.
278 We need MY_CS_FLAG for regex library, and for
279 case sensitivity flag for 5.0 client protocol,
280 to support isCaseSensitive() method in JDBC driver
282 if (sort_order
&& sort_order
['A'] < sort_order
['a'] &&
283 sort_order
['a'] < sort_order
['B'])
284 all_charsets
[cs
->number
]->state
|= MY_CS_CSSORT
;
286 if (my_charset_is_8bit_pure_ascii(all_charsets
[cs
->number
]))
287 all_charsets
[cs
->number
]->state
|= MY_CS_PUREASCII
;
293 We need the below to make get_charset_name()
294 and get_charset_number() working even if a
295 character set has not been really incompiled.
296 The above functions are used for example
297 in error message compiler extra/comp_err.c.
298 If a character set was compiled, this information
299 will get lost and overwritten in add_compiled_collation().
301 CHARSET_INFO
*dst
= all_charsets
[cs
->number
];
302 dst
->number
= cs
->number
;
304 if (!(dst
->comment
= my_once_strdup(cs
->comment
,MYF(MY_WME
))))
307 if (!(dst
->csname
= my_once_strdup(cs
->csname
,MYF(MY_WME
))))
310 if (!(dst
->name
= my_once_strdup(cs
->name
,MYF(MY_WME
))))
314 cs
->primary_number
= 0;
315 cs
->binary_number
= 0;
318 cs
->sort_order
= NULL
;
/*
  Upper bound, in bytes, on the size of a charset definition file.
  The expansion is parenthesized so that the macro stays a single value
  inside any larger expression (division, modulo, casts).
*/
#define MY_MAX_ALLOWED_BUF (1024*1024)

/* Name of the index file looked up inside the charsets directory. */
#define MY_CHARSET_INDEX "Index.xml"

/*
  Directory holding the charset files.  While it is NULL,
  get_charsets_dir() builds the location from SHAREDIR,
  DEFAULT_CHARSET_HOME and CHARSET_DIR instead.
*/
const char *charsets_dir= NULL;
/*
  Read one charset definition file and hand it to the XML parser.

  The file is examined with my_stat(), refused when its size exceeds
  MY_MAX_ALLOWED_BUF, read into a my_malloc() buffer with a single
  my_read() call and parsed by my_parse_charset_xml() with
  add_collation as the callback.  A parse failure is reported through
  printf().  The buffer is released with my_free() on both visible exit
  paths.

  NOTE(review): st_size is narrowed to uint before the size check.
  NOTE(review): the parser is given `len`, not the byte count returned
  by my_read(); any comparison of tmp_len with len would be on lines
  absent from this listing.
  NOTE(review): the listing omits the local declarations, the return
  statements and possibly preprocessor guards around the printf(); `p`
  is not declared in the visible lines.
*/
331 static my_bool
my_read_charset_file(const char *filename
, myf myflags
)
338 if (!my_stat(filename
, &stat_info
, MYF(myflags
)) ||
339 ((len
= (uint
)stat_info
.st_size
) > MY_MAX_ALLOWED_BUF
) ||
340 !(buf
= (uchar
*) my_malloc(len
,myflags
)))
343 if ((fd
=my_open(filename
,O_RDONLY
,myflags
)) < 0)
345 tmp_len
=my_read(fd
, buf
, len
, myflags
);
346 my_close(fd
,myflags
);
350 if (my_parse_charset_xml((char*) buf
,len
,add_collation
))
353 printf("ERROR at line %d pos %d '%s'\n",
354 my_xml_error_lineno(&p
)+1,
355 my_xml_error_pos(&p
),
356 my_xml_error_string(&p
));
360 my_free(buf
, myflags
);
364 my_free(buf
, myflags
);
369 char *get_charsets_dir(char *buf
)
371 const char *sharedir
= SHAREDIR
;
373 DBUG_ENTER("get_charsets_dir");
375 if (charsets_dir
!= NULL
)
376 strmake(buf
, charsets_dir
, FN_REFLEN
-1);
379 if (test_if_hard_path(sharedir
) ||
380 is_prefix(sharedir
, DEFAULT_CHARSET_HOME
))
381 strxmov(buf
, sharedir
, "/", CHARSET_DIR
, NullS
);
383 strxmov(buf
, DEFAULT_CHARSET_HOME
, "/", sharedir
, "/", CHARSET_DIR
,
386 res
= convert_dirname(buf
,buf
,NullS
);
387 DBUG_PRINT("info",("charsets dir: '%s'", buf
));
391 CHARSET_INFO
*all_charsets
[256]={NULL
};
392 CHARSET_INFO
*default_charset_info
= &my_charset_latin1
;
394 void add_compiled_collation(CHARSET_INFO
*cs
)
396 all_charsets
[cs
->number
]= cs
;
397 cs
->state
|= MY_CS_AVAILABLE
;
400 static void *cs_alloc(size_t size
)
402 return my_once_alloc(size
, MYF(MY_WME
));
406 static my_pthread_once_t charsets_initialized
= MY_PTHREAD_ONCE_INIT
;
407 static my_pthread_once_t charsets_template
= MY_PTHREAD_ONCE_INIT
;
/*
  One-time initialisation of all_charsets; the lookup functions of this
  file run it through my_pthread_once().

  Zeroes all_charsets, registers the compiled-in charsets with
  init_compiled_charsets(), runs init_state_maps() on entries of the
  table, then loads the charset index file (MY_CHARSET_INDEX appended
  to the directory produced by get_charsets_dir()) with
  my_read_charset_file().

  NOTE(review): parts of the loop (the increment, the tests in front of
  init_state_maps() and the reaction to its failure) and the
  declaration of `cs` are not visible in this listing.
*/
409 static void init_available_charsets(void)
411 char fname
[FN_REFLEN
+ sizeof(MY_CHARSET_INDEX
)];
414 bzero(&all_charsets
,sizeof(all_charsets
));
415 init_compiled_charsets(MYF(0));
417 /* Copy compiled charsets */
418 for (cs
=all_charsets
;
419 cs
< all_charsets
+array_elements(all_charsets
)-1 ;
425 if (init_state_maps(*cs
))
430 strmov(get_charsets_dir(fname
), MY_CHARSET_INDEX
);
431 my_read_charset_file(fname
, MYF(0));
435 void free_charsets(void)
437 charsets_initialized
= charsets_template
;
440 uint
get_collation_number(const char *name
)
442 my_pthread_once(&charsets_initialized
, init_available_charsets
);
443 return get_collation_number_internal(name
);
447 uint
get_charset_number(const char *charset_name
, uint cs_flags
)
450 my_pthread_once(&charsets_initialized
, init_available_charsets
);
452 for (cs
= all_charsets
;
453 cs
< all_charsets
+array_elements(all_charsets
)-1 ;
456 if ( cs
[0] && cs
[0]->csname
&& (cs
[0]->state
& cs_flags
) &&
457 !my_strcasecmp(&my_charset_latin1
, cs
[0]->csname
, charset_name
))
458 return cs
[0]->number
;
464 const char *get_charset_name(uint charset_number
)
467 my_pthread_once(&charsets_initialized
, init_available_charsets
);
469 cs
=all_charsets
[charset_number
];
470 if (cs
&& (cs
->number
== charset_number
) && cs
->name
)
471 return (char*) cs
->name
;
473 return (char*) "?"; /* this mimics find_type() */
/*
  Fetch all_charsets[cs_number], loading and initialising the entry on
  demand.

  An entry that already has MY_CS_READY is recognised before the lock
  is taken.  Otherwise THR_LOCK_charset is held while
    - the file "<csname>.xml" from the charsets directory is read when
      the entry is neither MY_CS_COMPILED nor MY_CS_LOADED, and
    - for an entry that is MY_CS_AVAILABLE but not yet MY_CS_READY, the
      optional cset->init and coll->init hooks run with cs_alloc as
      allocator before MY_CS_READY is set.

  cs_number indexes all_charsets directly; no range check is visible in
  this function.

  NOTE(review): the return statements, the handling of failed init
  hooks or unavailable entries, and the local declarations (including
  `buf`) are on lines absent from this listing.
*/
477 static CHARSET_INFO
*get_internal_charset(uint cs_number
, myf flags
)
482 if ((cs
= all_charsets
[cs_number
]))
484 if (cs
->state
& MY_CS_READY
) /* if CS is already initialized */
488 To make things thread safe we are not allowing other threads to interfere
489 while we may changing the cs_info_table
491 pthread_mutex_lock(&THR_LOCK_charset
);
493 if (!(cs
->state
& (MY_CS_COMPILED
|MY_CS_LOADED
))) /* if CS is not in memory */
495 strxmov(get_charsets_dir(buf
), cs
->csname
, ".xml", NullS
);
496 my_read_charset_file(buf
,flags
);
499 if (cs
->state
& MY_CS_AVAILABLE
)
501 if (!(cs
->state
& MY_CS_READY
))
503 if ((cs
->cset
->init
&& cs
->cset
->init(cs
, cs_alloc
)) ||
504 (cs
->coll
->init
&& cs
->coll
->init(cs
, cs_alloc
)))
507 cs
->state
|= MY_CS_READY
;
513 pthread_mutex_unlock(&THR_LOCK_charset
);
/*
  Look up a charset/collation by number.

  default_charset_info is returned directly when cs_number equals its
  number.  Otherwise the tables are initialised once, the number is
  tested against 0 and array_elements(all_charsets)-1, and the entry is
  fetched through get_internal_charset().  When no entry was obtained
  and MY_WME is set in flags, EE_UNKNOWN_CHARSET is raised with the
  number rendered in decimal and the path of the index file.

  NOTE(review): int10_to_str() writes at cs_string+1; the statement
  that fills cs_string[0], the body of the range test, the declaration
  of `cs` and the final return are on lines absent from this listing.
*/
519 CHARSET_INFO
*get_charset(uint cs_number
, myf flags
)
522 if (cs_number
== default_charset_info
->number
)
523 return default_charset_info
;
525 my_pthread_once(&charsets_initialized
, init_available_charsets
);
527 if (!cs_number
|| cs_number
>= array_elements(all_charsets
)-1)
530 cs
=get_internal_charset(cs_number
, flags
);
532 if (!cs
&& (flags
& MY_WME
))
534 char index_file
[FN_REFLEN
+ sizeof(MY_CHARSET_INDEX
)], cs_string
[23];
535 strmov(get_charsets_dir(index_file
),MY_CHARSET_INDEX
);
537 int10_to_str(cs_number
, cs_string
+1, 10);
538 my_error(EE_UNKNOWN_CHARSET
, MYF(ME_BELL
), cs_string
, index_file
);
543 CHARSET_INFO
*get_charset_by_name(const char *cs_name
, myf flags
)
547 my_pthread_once(&charsets_initialized
, init_available_charsets
);
549 cs_number
=get_collation_number(cs_name
);
550 cs
= cs_number
? get_internal_charset(cs_number
,flags
) : NULL
;
552 if (!cs
&& (flags
& MY_WME
))
554 char index_file
[FN_REFLEN
+ sizeof(MY_CHARSET_INDEX
)];
555 strmov(get_charsets_dir(index_file
),MY_CHARSET_INDEX
);
556 my_error(EE_UNKNOWN_COLLATION
, MYF(ME_BELL
), cs_name
, index_file
);
563 CHARSET_INFO
*get_charset_by_csname(const char *cs_name
,
569 DBUG_ENTER("get_charset_by_csname");
570 DBUG_PRINT("enter",("name: '%s'", cs_name
));
572 my_pthread_once(&charsets_initialized
, init_available_charsets
);
574 cs_number
= get_charset_number(cs_name
, cs_flags
);
575 cs
= cs_number
? get_internal_charset(cs_number
, flags
) : NULL
;
577 if (!cs
&& (flags
& MY_WME
))
579 char index_file
[FN_REFLEN
+ sizeof(MY_CHARSET_INDEX
)];
580 strmov(get_charsets_dir(index_file
),MY_CHARSET_INDEX
);
581 my_error(EE_UNKNOWN_CHARSET
, MYF(ME_BELL
), cs_name
, index_file
);
589 Resolve character set by the character set name (utf8, latin1, ...).
591 The function tries to resolve character set by the specified name. If
592 there is character set with the given name, it is assigned to the "cs"
593 parameter and FALSE is returned. If there is no such character set,
594 "default_cs" is assigned to the "cs" and TRUE is returned.
596 @param[in] cs_name Character set name.
597 @param[in] default_cs Default character set.
598 @param[out] cs Variable to store character set.
600 @return FALSE if character set was resolved successfully; TRUE if there
601 is no character set with given name.
604 my_bool
resolve_charset(const char *cs_name
,
605 CHARSET_INFO
*default_cs
,
608 *cs
= get_charset_by_csname(cs_name
, MY_CS_PRIMARY
, MYF(0));
621 Resolve collation by the collation name (utf8_general_ci, ...).
623 The function tries to resolve collation by the specified name. If there
624 is collation with the given name, it is assigned to the "cl" parameter
625 and FALSE is returned. If there is no such collation, "default_cl" is
626 assigned to the "cl" and TRUE is returned.
628 @param[out] cl Variable to store collation.
629 @param[in] cl_name Collation name.
630 @param[in] default_cl Default collation.
632 @return FALSE if collation was resolved successfully; TRUE if there is no
633 collation with given name.
636 my_bool
resolve_collation(const char *cl_name
,
637 CHARSET_INFO
*default_cl
,
640 *cl
= get_charset_by_name(cl_name
, MYF(0));
/*
  Escape string with backslashes (\)

    escape_string_for_mysql()
    charset_info        Charset of the strings
    to                  Buffer for escaped string
    to_length           Length of destination buffer, or 0
    from                The string to escape
    length              The length of the string to escape

  This escapes the contents of a string by adding backslashes before special
  characters, and turning others into specific escape sequences, such as
  turning newlines into \n and null bytes into \0.

  To maintain compatibility with the old C API, to_length may be 0 to mean
  that the size of the buffer is not known; the code then assumes room for
  2*length bytes.

  Return value:
    (size_t) -1         The escaped string did not fit in the to buffer
    #                   The length of the escaped string
*/
/*
  Escape `length` bytes of `from` into `to`, taking the multi-byte
  characters of charset_info into account.

  to_end is to_start + to_length-1, or to_start + 2*length when
  to_length is 0.  A complete multi-byte character (my_ismbchar) is
  handled as a unit and checked against to_end.  A byte that merely
  looks like the start of a multi-byte character (my_mbcharlen > 1) is
  treated specially so that escaping cannot turn an invalid sequence
  into a valid one.  The switch labels visible here cover NUL, newline,
  the double quote and Ctrl-Z (octal 032).

  Returns (size_t) -1 when the overflow flag is set, otherwise the
  number of bytes written (to - to_start).

  NOTE(review): most of the loop body (the copying, the remaining case
  labels, the places where overflow is set and any terminating NUL) is
  missing from this listing.
*/
677 size_t escape_string_for_mysql(CHARSET_INFO
*charset_info
,
678 char *to
, size_t to_length
,
679 const char *from
, size_t length
)
681 const char *to_start
= to
;
682 const char *end
, *to_end
=to_start
+ (to_length
? to_length
-1 : 2*length
);
683 my_bool overflow
= FALSE
;
685 my_bool use_mb_flag
= use_mb(charset_info
);
687 for (end
= from
+ length
; from
< end
; from
++)
692 if (use_mb_flag
&& (tmp_length
= my_ismbchar(charset_info
, from
, end
)))
694 if (to
+ tmp_length
> to_end
)
705 If the next character appears to begin a multi-byte character, we
706 escape that first byte of that apparent multi-byte character. (The
707 character just looks like a multi-byte character -- if it were actually
708 a multi-byte character, it would have been passed through in the test
711 Without this check, we can create a problem by converting an invalid
712 multi-byte character into a valid one. For example, 0xbf27 is not
713 a valid GBK character, but 0xbf5c is. (0x27 = ', 0x5c = \)
715 if (use_mb_flag
&& (tmp_length
= my_mbcharlen(charset_info
, *from
)) > 1)
720 case 0: /* Must be escaped for 'mysql' */
723 case '\n': /* Must be escaped for logs */
735 case '"': /* Better safe than sorry */
738 case '\032': /* This gives problems on Win32 */
763 return overflow
? (size_t) -1 : (size_t) (to
- to_start
);
/*
  File-system character set detection, guarded by BACKSLASH_MBTAIL.

  fs_character_set() asks GetLocaleInfo() for the default ANSI code
  page of the system locale, has it written at buf+2, and stores in
  fs_cset_cache either my_charset_cp932_japanese_ci (when buf compares
  equal to "cp932") or my_charset_bin.  The cached pointer is returned.

  NOTE(review): the declaration and initial contents of `buf`, the test
  that would make the detection run only once, and the closing #endif
  are not visible in this listing.  Since the code page is written at
  offset 2, buf presumably starts with "cp" - confirm.
  NOTE(review): the prose inside the function names LOCK_THD_charset,
  while the mutex used elsewhere in this file is THR_LOCK_charset.
*/
767 #ifdef BACKSLASH_MBTAIL
768 static CHARSET_INFO
*fs_cset_cache
= NULL
;
770 CHARSET_INFO
*fs_character_set()
775 GetLocaleInfo(LOCALE_SYSTEM_DEFAULT
, LOCALE_IDEFAULTANSICODEPAGE
,
776 buf
+2, sizeof(buf
)-3);
778 We cannot call get_charset_by_name here
779 because fs_character_set() is executed before
780 LOCK_THD_charset mutex initialization, which
781 is used inside get_charset_by_name.
782 As we're now interested in cp932 only,
783 let's just detect it using strcmp().
785 fs_cset_cache
= !strcmp(buf
, "cp932") ?
786 &my_charset_cp932_japanese_ci
: &my_charset_bin
;
788 return fs_cset_cache
;
/*
  Escape apostrophes by doubling them up

    escape_quotes_for_mysql()
    charset_info        Charset of the strings
    to                  Buffer for escaped string
    to_length           Length of destination buffer, or 0
    from                The string to escape
    length              The length of the string to escape

  This escapes the contents of a string by doubling up any apostrophes that
  it contains. This is used when the NO_BACKSLASH_ESCAPES SQL_MODE is in
  effect on the server.

  To be consistent with escape_string_for_mysql(), to_length may be 0 to
  mean that the size of the buffer is not known; the code then assumes room
  for 2*length bytes.

  Return value:
    ~0                  The escaped string did not fit in the to buffer
    >=0                 The length of the escaped string
*/
/*
  Escape `length` bytes of `from` into `to` by doubling apostrophes,
  taking the multi-byte characters of charset_info into account.

  to_end is to_start + to_length-1, or to_start + 2*length when
  to_length is 0.  A complete multi-byte character (my_ismbchar) is
  handled as a unit and checked against to_end.

  Returns the number of bytes written (to - to_start), or an all-ones
  value when the overflow flag is set.

  NOTE(review): both results are cast to ulong although the function
  returns size_t.  Where ulong is narrower than size_t the overflow
  value is not equal to (size_t) -1, the value used by
  escape_string_for_mysql() - confirm what callers compare against.
  NOTE(review): most of the loop body (the copying, the doubling of the
  apostrophe, the places where overflow is set and any terminating NUL)
  is missing from this listing.
*/
817 size_t escape_quotes_for_mysql(CHARSET_INFO
*charset_info
,
818 char *to
, size_t to_length
,
819 const char *from
, size_t length
)
821 const char *to_start
= to
;
822 const char *end
, *to_end
=to_start
+ (to_length
? to_length
-1 : 2*length
);
823 my_bool overflow
= FALSE
;
825 my_bool use_mb_flag
= use_mb(charset_info
);
827 for (end
= from
+ length
; from
< end
; from
++)
831 if (use_mb_flag
&& (tmp_length
= my_ismbchar(charset_info
, from
, end
)))
833 if (to
+ tmp_length
> to_end
)
844 We don't have the same issue here with a non-multi-byte character being
845 turned into a multi-byte character by the addition of an escaping
846 character, because we are only escaping the ' character with itself.
870 return overflow
? (ulong
)~0 : (ulong
) (to
- to_start
);