From f18c9365caaad75c0f4c9e26b89327a75cfcb3e6 Mon Sep 17 00:00:00 2001 From: Jeremy Allison Date: Thu, 21 Sep 2006 17:00:07 +0000 Subject: [PATCH] r18787: Fix the strlen_m and strlen_m_term code by merging in (and using elsewhere) next_codepoint from Samba4. Jerry please test. Jeremy. (This used to be commit ece00b70a4621633f1ac9e576c4bbe332031de09) --- source3/include/charset.h | 3 +- source3/include/smb.h | 4 ++ source3/lib/charcnv.c | 99 +++++++++++++++++++++++++++++++++++----------- source3/lib/util_str.c | 52 ++++++++++++++++++++++++ source3/script/mkproto.awk | 2 +- source3/smbd/reply.c | 27 +++++++++++-- source3/smbd/service.c | 11 +++++- 7 files changed, 169 insertions(+), 29 deletions(-) diff --git a/source3/include/charset.h b/source3/include/charset.h index a4dfef3a506..8a51a1876e1 100644 --- a/source3/include/charset.h +++ b/source3/include/charset.h @@ -22,6 +22,7 @@ /* this defines the charset types used in samba */ typedef enum {CH_UCS2=0, CH_UTF16=0, CH_UNIX=1, CH_DISPLAY=2, CH_DOS=3, CH_UTF8=4} charset_t; +#if 0 /* FIXME!!! 
Hack job for now to get the lsa ndr code compiling */ #ifndef strlen_m #define strlen_m strlen @@ -29,7 +30,7 @@ typedef enum {CH_UCS2=0, CH_UTF16=0, CH_UNIX=1, CH_DISPLAY=2, CH_DOS=3, CH_UTF8= #ifndef strlen_m_term #define strlen_m_term strlen #endif - +#endif #define NUM_CHARSETS 5 diff --git a/source3/include/smb.h b/source3/include/smb.h index bba1621e8f4..700dbcdf855 100644 --- a/source3/include/smb.h +++ b/source3/include/smb.h @@ -170,6 +170,10 @@ typedef smb_ucs2_t wfstring[FSTRING_LEN]; #define COPY_UCS2_CHAR(dest,src) (((unsigned char *)(dest))[0] = ((unsigned char *)(src))[0],\ ((unsigned char *)(dest))[1] = ((unsigned char *)(src))[1], (dest)) +/* Large data type for manipulating uint32 unicode codepoints */ +typedef uint32 codepoint_t; +#define INVALID_CODEPOINT ((codepoint_t)-1) + /* pipe string names */ #define PIPE_LANMAN "\\PIPE\\LANMAN" #define PIPE_SRVSVC "\\PIPE\\srvsvc" diff --git a/source3/lib/charcnv.c b/source3/lib/charcnv.c index fffdf010a05..c5ce3ca8c77 100644 --- a/source3/lib/charcnv.c +++ b/source3/lib/charcnv.c @@ -1374,33 +1374,86 @@ size_t align_string(const void *base_ptr, const char *p, int flags) return 0; } -/**************************************************************** - Calculate the size (in bytes) of the next multibyte character in - our internal character set. Note that p must be pointing to a - valid mb char, not within one. -****************************************************************/ +/* + Return the unicode codepoint for the next multi-byte CH_UNIX character + in the string. The unicode codepoint (codepoint_t) is an unsigned 32 bit value. -size_t next_mb_char_size(const char *s) + Also return the number of bytes consumed (which tells the caller + how many bytes to skip to get to the next CH_UNIX character). + + Return INVALID_CODEPOINT if the next character cannot be converted. 
+*/ + +codepoint_t next_codepoint(const char *str, size_t *size) { - size_t i; + /* It cannot occupy more than 4 bytes in UTF16 format */ + uint8_t buf[4]; + smb_iconv_t descriptor; + size_t ilen_orig; + size_t ilen; + size_t olen; + char *outbuf; + + if ((str[0] & 0x80) == 0) { + *size = 1; + return (codepoint_t)str[0]; + } - if (!(*s & 0x80)) - return 1; /* ascii. */ + /* We assume that no multi-byte character can take + more than 5 bytes. This is OK as we only + support codepoints up to 1M */ - conv_silent = True; - for ( i = 1; i <=4; i++ ) { - smb_ucs2_t uc; - if (convert_string(CH_UNIX, CH_UCS2, s, i, &uc, 2, False) == 2) { -#if 0 /* JRATEST */ - DEBUG(10,("next_mb_char_size: size %u at string %s\n", - (unsigned int)i, s)); -#endif - conv_silent = False; - return i; + ilen_orig = strnlen(str, 5); + ilen = ilen_orig; + + lazy_initialize_conv(); + + /* CH_UCS2 == UTF16-LE. */ + descriptor = conv_handles[CH_UNIX][CH_UCS2]; + if (descriptor == (smb_iconv_t)-1 || descriptor == (smb_iconv_t)0) { + *size = 1; + return INVALID_CODEPOINT; + } + + /* This looks a little strange, but it is needed to cope + with codepoints above 64k which are encoded as per RFC2781. */ + olen = 2; + outbuf = (char *)buf; + smb_iconv(descriptor, &str, &ilen, &outbuf, &olen); + if (olen == 2) { + /* We failed to convert to a 2 byte character. + See if we can convert to a 4 UTF16-LE byte char encoding. + */ + olen = 4; + outbuf = (char *)buf; + smb_iconv(descriptor, &str, &ilen, &outbuf, &olen); + if (olen == 4) { + /* We didn't convert any bytes */ + *size = 1; + return INVALID_CODEPOINT; } + olen = 4 - olen; + } else { + olen = 2 - olen; } - /* We're hosed - we don't know how big this is... */ - DEBUG(10,("next_mb_char_size: unknown size at string %s\n", s)); - conv_silent = False; - return 1; + + *size = ilen_orig - ilen; + + if (olen == 2) { + /* 2 byte, UTF16-LE encoded value. 
*/ + return (codepoint_t)SVAL(buf, 0); + } + if (olen == 4) { + /* Decode a 4 byte UTF16-LE character manually. + See RFC2781 for the encoding mechanism. + */ + codepoint_t w1 = SVAL(buf,0) & ~0xD800; + codepoint_t w2 = SVAL(buf,2) & ~0xDC00; + + return (codepoint_t)0x10000 + + (w1 << 10) + w2; + } + + /* no other length is valid */ + return INVALID_CODEPOINT; } diff --git a/source3/lib/util_str.c b/source3/lib/util_str.c index 4619d473882..414a87a562c 100644 --- a/source3/lib/util_str.c +++ b/source3/lib/util_str.c @@ -1593,6 +1593,58 @@ void strupper_m(char *s) } /** + Count the number of UCS2 characters in a string. Normally this will + be the same as the number of bytes in a string for single byte strings, + but will be different for multibyte. +**/ + +size_t strlen_m(const char *s) +{ + size_t count = 0; + + if (!s) { + return 0; + } + + while (*s && !(((uint8_t)*s) & 0x80)) { + s++; + count++; + } + + if (!*s) { + return count; + } + + while (*s) { + size_t c_size; + codepoint_t c = next_codepoint(s, &c_size); + if (c < 0x10000) { + /* Unicode char fits into 16 bits. */ + count += 1; + } else { + /* Double-width unicode char - 32 bits. */ + count += 2; + } + s += c_size; + } + + return count; +} + +/** + Count the number of UCS2 characters in a string including the null + terminator. +**/ + +size_t strlen_m_term(const char *s) +{ + if (!s) { + return 0; + } + return strlen_m(s) + 1; +} + +/** Return a RFC2254 binary string representation of a buffer. Used in LDAP filters. Caller must free. 
diff --git a/source3/script/mkproto.awk b/source3/script/mkproto.awk index 30b5628b336..97578b046f7 100644 --- a/source3/script/mkproto.awk +++ b/source3/script/mkproto.awk @@ -146,7 +146,7 @@ END { gotstart = 1; } - if( $0 ~ /^NODE_STATUS_STRUCT|SMB_STRUCT_DIR|ELOG_TDB/ ) { + if( $0 ~ /^NODE_STATUS_STRUCT|SMB_STRUCT_DIR|ELOG_TDB|codepoint_t/ ) { gotstart = 1; } diff --git a/source3/smbd/reply.c b/source3/smbd/reply.c index e38edadee48..a0596643f81 100644 --- a/source3/smbd/reply.c +++ b/source3/smbd/reply.c @@ -132,13 +132,22 @@ NTSTATUS check_path_syntax(pstring destname, const pstring srcname) break; } } else { - switch(next_mb_char_size(s)) { + size_t siz; + /* Get the size of the next MB character. */ + next_codepoint(s,&siz); + switch(siz) { + case 5: + *d++ = *s++; + /*fall through*/ case 4: *d++ = *s++; + /*fall through*/ case 3: *d++ = *s++; + /*fall through*/ case 2: *d++ = *s++; + /*fall through*/ case 1: *d++ = *s++; break; @@ -266,7 +275,13 @@ NTSTATUS check_path_syntax_wcard(pstring destname, const pstring srcname, BOOL * } *d++ = *s++; } else { - switch(next_mb_char_size(s)) { + size_t siz; + /* Get the size of the next MB character. */ + next_codepoint(s,&siz); + switch(siz) { + case 5: + *d++ = *s++; + /*fall through*/ case 4: *d++ = *s++; /*fall through*/ @@ -374,7 +389,13 @@ NTSTATUS check_path_syntax_posix(pstring destname, const pstring srcname) if (!(*s & 0x80)) { *d++ = *s++; } else { - switch(next_mb_char_size(s)) { + size_t siz; + /* Get the size of the next MB character. 
*/ + next_codepoint(s,&siz); + switch(siz) { + case 5: + *d++ = *s++; + /*fall through*/ case 4: *d++ = *s++; /*fall through*/ diff --git a/source3/smbd/service.c b/source3/smbd/service.c index 734feef4f7e..9c341f19fdd 100644 --- a/source3/smbd/service.c +++ b/source3/smbd/service.c @@ -95,13 +95,22 @@ void set_conn_connectpath(connection_struct *conn, const pstring connectpath) if (!(*s & 0x80)) { *d++ = *s++; } else { - switch(next_mb_char_size(s)) { + size_t siz; + /* Get the size of the next MB character. */ + next_codepoint(s,&siz); + switch(siz) { + case 5: + *d++ = *s++; + /*fall through*/ case 4: *d++ = *s++; + /*fall through*/ case 3: *d++ = *s++; + /*fall through*/ case 2: *d++ = *s++; + /*fall through*/ case 1: *d++ = *s++; break; -- 2.11.4.GIT