2 * Copyright (C) 2009 Gabor Kovesdan <gabor@FreeBSD.org>
3 * Copyright (C) 2012 Oleg Moskalenko <mom040267@gmail.com>
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * $FreeBSD: head/usr.bin/sort/bwstring.c 281181 2015-04-07 01:17:29Z pfg $
46 static wchar_t **wmonths
;
47 static unsigned char **cmonths
;
49 /* initialise months */
52 initialise_months(void)
54 const nl_item item
[12] = { ABMON_1
, ABMON_2
, ABMON_3
, ABMON_4
,
55 ABMON_5
, ABMON_6
, ABMON_7
, ABMON_8
, ABMON_9
, ABMON_10
,
60 if (MB_CUR_MAX
== 1) {
61 if (cmonths
== NULL
) {
64 cmonths
= sort_malloc(sizeof(unsigned char*) * 12);
65 for (int i
= 0; i
< 12; i
++) {
67 tmp
= (unsigned char *) nl_langinfo(item
[i
]);
69 printf("month[%d]=%s\n", i
, tmp
);
74 for (unsigned int j
= 0; j
< len
; j
++)
81 if (wmonths
== NULL
) {
84 wmonths
= sort_malloc(sizeof(wchar_t *) * 12);
85 for (int i
= 0; i
< 12; i
++) {
87 tmp
= (unsigned char *) nl_langinfo(item
[i
]);
89 printf("month[%d]=%s\n", i
, tmp
);
93 m
= sort_malloc(SIZEOF_WCHAR_STRING(len
+ 1));
94 if (mbstowcs(m
, (char*)tmp
, len
) ==
100 for (unsigned int j
= 0; j
< len
; j
++)
101 m
[j
] = towupper(m
[j
]);
109 * Compare two wide-character strings
112 wide_str_coll(const wchar_t *s1
, const wchar_t *s2
)
117 ret
= wcscoll(s1
, s2
);
118 if (errno
== EILSEQ
) {
120 ret
= wcscmp(s1
, s2
);
122 for (size_t i
= 0; ; ++i
) {
126 return ((c2
== L
'\0') ? 0 : -1);
131 return ((int)(c1
- c2
));
138 /* counterparts of wcs functions */
141 bwsprintf(FILE *f
, struct bwstring
*bws
, const char *prefix
, const char *suffix
)
145 fprintf(f
, "%s%s%s", prefix
, bws
->data
.cstr
, suffix
);
147 fprintf(f
, "%s%S%s", prefix
, bws
->data
.wstr
, suffix
);
150 const void* bwsrawdata(const struct bwstring
*bws
)
153 return (&(bws
->data
));
156 size_t bwsrawlen(const struct bwstring
*bws
)
159 return ((MB_CUR_MAX
== 1) ? bws
->len
: SIZEOF_WCHAR_STRING(bws
->len
));
163 bws_memsize(const struct bwstring
*bws
)
166 return ((MB_CUR_MAX
== 1) ? (bws
->len
+ 2 + sizeof(struct bwstring
)) :
167 (SIZEOF_WCHAR_STRING(bws
->len
+ 1) + sizeof(struct bwstring
)));
171 bws_setlen(struct bwstring
*bws
, size_t newlen
)
174 if (bws
&& newlen
!= bws
->len
&& newlen
<= bws
->len
) {
177 bws
->data
.cstr
[newlen
] = '\0';
179 bws
->data
.wstr
[newlen
] = L
'\0';
184 * Allocate a new binary string of specified size
189 struct bwstring
*ret
;
192 ret
= sort_malloc(sizeof(struct bwstring
) + 1 + sz
);
194 ret
= sort_malloc(sizeof(struct bwstring
) +
195 SIZEOF_WCHAR_STRING(sz
+ 1));
199 ret
->data
.cstr
[ret
->len
] = '\0';
201 ret
->data
.wstr
[ret
->len
] = L
'\0';
207 * Create a copy of binary string.
208 * New string size equals the length of the old string.
211 bwsdup(const struct bwstring
*s
)
217 struct bwstring
*ret
= bwsalloc(s
->len
);
220 memcpy(ret
->data
.cstr
, s
->data
.cstr
, (s
->len
));
222 memcpy(ret
->data
.wstr
, s
->data
.wstr
,
223 SIZEOF_WCHAR_STRING(s
->len
));
230 * Create a new binary string from a wide character buffer.
233 bwssbdup(const wchar_t *str
, size_t len
)
237 return ((len
== 0) ? bwsalloc(0) : NULL
);
239 struct bwstring
*ret
;
244 for (size_t i
= 0; i
< len
; ++i
)
245 ret
->data
.cstr
[i
] = (unsigned char) str
[i
];
247 memcpy(ret
->data
.wstr
, str
, SIZEOF_WCHAR_STRING(len
));
254 * Create a new binary string from a raw binary buffer.
257 bwscsbdup(const unsigned char *str
, size_t len
)
259 struct bwstring
*ret
;
265 memcpy(ret
->data
.cstr
, str
, len
);
269 size_t charlen
, chars
, cptr
;
273 s
= (const char *) str
;
275 memset(&mbs
, 0, sizeof(mbs
));
278 size_t n
= MB_CUR_MAX
;
282 charlen
= mbrlen(s
+ cptr
, n
, &mbs
);
289 ret
->data
.wstr
[chars
++] =
290 (unsigned char) s
[cptr
];
294 n
= mbrtowc(ret
->data
.wstr
+ (chars
++),
295 s
+ cptr
, charlen
, &mbs
);
296 if ((n
== (size_t)-1) || (n
== (size_t)-2))
298 err(2, "mbrtowc error");
304 ret
->data
.wstr
[ret
->len
] = L
'\0';
311 * De-allocate object memory
314 bwsfree(const struct bwstring
*s
)
322 * Copy content of src binary string to dst.
323 * If the capacity of the dst string is not sufficient,
324 * then the data is truncated.
327 bwscpy(struct bwstring
*dst
, const struct bwstring
*src
)
329 size_t nums
= src
->len
;
335 if (MB_CUR_MAX
== 1) {
336 memcpy(dst
->data
.cstr
, src
->data
.cstr
, nums
);
337 dst
->data
.cstr
[dst
->len
] = '\0';
339 memcpy(dst
->data
.wstr
, src
->data
.wstr
,
340 SIZEOF_WCHAR_STRING(nums
+ 1));
341 dst
->data
.wstr
[dst
->len
] = L
'\0';
348 * Copy content of src binary string to dst,
349 * with specified number of symbols to be copied.
350 * If the capacity of the dst string is not sufficient,
351 * then the data is truncated.
354 bwsncpy(struct bwstring
*dst
, const struct bwstring
*src
, size_t size
)
356 size_t nums
= src
->len
;
364 if (MB_CUR_MAX
== 1) {
365 memcpy(dst
->data
.cstr
, src
->data
.cstr
, nums
);
366 dst
->data
.cstr
[dst
->len
] = '\0';
368 memcpy(dst
->data
.wstr
, src
->data
.wstr
,
369 SIZEOF_WCHAR_STRING(nums
+ 1));
370 dst
->data
.wstr
[dst
->len
] = L
'\0';
377 * Copy content of src binary string to dst,
378 * with specified number of symbols to be copied.
379 * An offset value can be specified, from the start of src string.
380 * If the capacity of the dst string is not sufficient,
381 * then the data is truncated.
384 bwsnocpy(struct bwstring
*dst
, const struct bwstring
*src
, size_t offset
,
388 if (offset
>= src
->len
) {
389 dst
->data
.wstr
[0] = 0;
392 size_t nums
= src
->len
- offset
;
399 if (MB_CUR_MAX
== 1) {
400 memcpy(dst
->data
.cstr
, src
->data
.cstr
+ offset
,
402 dst
->data
.cstr
[dst
->len
] = '\0';
404 memcpy(dst
->data
.wstr
, src
->data
.wstr
+ offset
,
405 SIZEOF_WCHAR_STRING(nums
));
406 dst
->data
.wstr
[dst
->len
] = L
'\0';
413 * Write binary string to the file.
414 * The output is ended either with '\0' (zero_ended == true)
415 * or '\n' (zero_ended == false).
418 bwsfwrite(struct bwstring
*bws
, FILE *f
, bool zero_ended
)
421 if (MB_CUR_MAX
== 1) {
422 size_t len
= bws
->len
;
425 bws
->data
.cstr
[len
] = '\n';
427 if (fwrite(bws
->data
.cstr
, len
+ 1, 1, f
) < 1)
430 bws
->data
.cstr
[len
] = '\0';
431 } else if (fwrite(bws
->data
.cstr
, len
+ 1, 1, f
) < 1)
440 eols
= zero_ended
? btowc('\0') : btowc('\n');
442 while (printed
< BWSLEN(bws
)) {
443 const wchar_t *s
= bws
->data
.wstr
+ printed
;
448 nums
= fwprintf(f
, L
"%lc", *s
);
456 nums
= fwprintf(f
, L
"%ls", s
);
463 fwprintf(f
, L
"%lc", eols
);
464 return (printed
+ 1);
469 * Allocate and read a binary string from file.
470 * The strings are nl-ended or zero-ended, depending on the sort setting.
473 bwsfgetln(FILE *f
, size_t *len
, bool zero_ended
, struct reader_buffer
*rb
)
477 eols
= zero_ended
? btowc('\0') : btowc('\n');
479 if (!zero_ended
&& (MB_CUR_MAX
> 1)) {
482 ret
= fgetwln(f
, len
);
490 if (ret
[*len
- 1] == (wchar_t)eols
)
493 return (bwssbdup(ret
, *len
));
495 } else if (!zero_ended
&& (MB_CUR_MAX
== 1)) {
498 ret
= fgetln(f
, len
);
506 if (ret
[*len
- 1] == '\n')
509 return (bwscsbdup((unsigned char*)ret
, *len
));
517 if (2 >= rb
->fgetwln_z_buffer_size
) {
518 rb
->fgetwln_z_buffer_size
+= 256;
519 rb
->fgetwln_z_buffer
= sort_realloc(rb
->fgetwln_z_buffer
,
520 sizeof(wchar_t) * rb
->fgetwln_z_buffer_size
);
522 rb
->fgetwln_z_buffer
[*len
] = 0;
538 if (*len
+ 1 >= rb
->fgetwln_z_buffer_size
) {
539 rb
->fgetwln_z_buffer_size
+= 256;
540 rb
->fgetwln_z_buffer
= sort_realloc(rb
->fgetwln_z_buffer
,
541 SIZEOF_WCHAR_STRING(rb
->fgetwln_z_buffer_size
));
544 rb
->fgetwln_z_buffer
[*len
] = c
;
545 rb
->fgetwln_z_buffer
[++(*len
)] = 0;
561 if (*len
+ 1 >= rb
->fgetwln_z_buffer_size
) {
562 rb
->fgetwln_z_buffer_size
+= 256;
563 rb
->fgetwln_z_buffer
= sort_realloc(rb
->fgetwln_z_buffer
,
564 SIZEOF_WCHAR_STRING(rb
->fgetwln_z_buffer_size
));
567 rb
->fgetwln_z_buffer
[*len
] = c
;
568 rb
->fgetwln_z_buffer
[++(*len
)] = 0;
572 /* we do not count the last 0 */
573 return (bwssbdup(rb
->fgetwln_z_buffer
, *len
));
578 bwsncmp(const struct bwstring
*bws1
, const struct bwstring
*bws2
,
579 size_t offset
, size_t len
)
581 size_t cmp_len
, len1
, len2
;
588 if (len1
<= offset
) {
589 return ((len2
<= offset
) ? 0 : -1);
605 if (MB_CUR_MAX
== 1) {
606 const unsigned char *s1
, *s2
;
608 s1
= bws1
->data
.cstr
+ offset
;
609 s2
= bws2
->data
.cstr
+ offset
;
611 res
= memcmp(s1
, s2
, cmp_len
);
614 const wchar_t *s1
, *s2
;
616 s1
= bws1
->data
.wstr
+ offset
;
617 s2
= bws2
->data
.wstr
+ offset
;
619 res
= memcmp(s1
, s2
, SIZEOF_WCHAR_STRING(cmp_len
));
625 if (len1
< cmp_len
&& len1
< len2
)
627 else if (len2
< cmp_len
&& len2
< len1
)
635 bwscmp(const struct bwstring
*bws1
, const struct bwstring
*bws2
, size_t offset
)
637 size_t len1
, len2
, cmp_len
;
651 res
= bwsncmp(bws1
, bws2
, offset
, cmp_len
);
656 else if (len2
< len1
)
664 bws_iterator_cmp(bwstring_iterator iter1
, bwstring_iterator iter2
, size_t len
)
669 for (i
= 0; i
< len
; ++i
) {
670 c1
= bws_get_iter_value(iter1
);
671 c2
= bws_get_iter_value(iter2
);
674 iter1
= bws_iterator_inc(iter1
, 1);
675 iter2
= bws_iterator_inc(iter2
, 1);
682 bwscoll(const struct bwstring
*bws1
, const struct bwstring
*bws2
, size_t offset
)
690 return ((len2
<= offset
) ? 0 : -1);
698 if (MB_CUR_MAX
== 1) {
699 const unsigned char *s1
, *s2
;
701 s1
= bws1
->data
.cstr
+ offset
;
702 s2
= bws2
->data
.cstr
+ offset
;
708 res
= memcmp(s1
, s2
, len2
);
711 } else if (len1
< len2
) {
712 res
= memcmp(s1
, s2
, len1
);
716 res
= memcmp(s1
, s2
, len1
);
731 /* goto next non-zero part: */
732 while ((i
< maxlen
) &&
742 err(2, "bwscoll error 01");
745 } else if (s2
[i
] == 0)
748 res
= strcoll((const char*)(s1
+ i
), (const char*)(s2
+ i
));
752 while ((i
< maxlen
) &&
765 } else if (s2
[i
] == 0)
769 err(2, "bwscoll error 02");
774 else if (len1
> len2
)
780 const wchar_t *s1
, *s2
;
784 s1
= bws1
->data
.wstr
+ offset
;
785 s2
= bws2
->data
.wstr
+ offset
;
795 /* goto next non-zero part: */
796 while ((i
< maxlen
) &&
806 err(2, "bwscoll error 1");
809 } else if (s2
[i
] == 0)
812 res
= wide_str_coll(s1
+ i
, s2
+ i
);
816 while ((i
< maxlen
) && s1
[i
] && s2
[i
])
828 } else if (s2
[i
] == 0)
832 err(2, "bwscoll error 2");
837 else if (len1
> len2
)
847 * Correction of the system API
850 bwstod(struct bwstring
*s0
, bool *empty
)
854 if (MB_CUR_MAX
== 1) {
855 unsigned char *end
, *s
;
862 while (isblank(*s
) && s
< end
)
870 ret
= strtod((char*)s
, &ep
);
871 if ((unsigned char*) ep
== s
) {
876 wchar_t *end
, *ep
, *s
;
882 while (iswblank(*s
) && s
< end
)
890 ret
= wcstod(s
, &ep
);
902 * A helper function for monthcoll. If a line matches
903 * a month name, it returns (number of the month - 1),
904 * while if there is no match, it just returns -1.
908 bws_month_score(const struct bwstring
*s0
)
911 if (MB_CUR_MAX
== 1) {
912 const unsigned char *end
, *s
;
917 while (isblank(*s
) && s
< end
)
920 for (int i
= 11; i
>= 0; --i
) {
922 (s
== (unsigned char*)strstr((const char*)s
, (char*)(cmonths
[i
]))))
927 const wchar_t *end
, *s
;
932 while (iswblank(*s
) && s
< end
)
935 for (int i
= 11; i
>= 0; --i
) {
936 if (wmonths
[i
] && (s
== wcsstr(s
, wmonths
[i
])))
945 * Rips out leading blanks (-b).
948 ignore_leading_blanks(struct bwstring
*str
)
951 if (MB_CUR_MAX
== 1) {
952 unsigned char *dst
, *end
, *src
;
954 src
= str
->data
.cstr
;
956 end
= src
+ str
->len
;
958 while (src
< end
&& isblank(*src
))
964 newlen
= BWSLEN(str
) - (src
- dst
);
971 bws_setlen(str
, newlen
);
974 wchar_t *dst
, *end
, *src
;
976 src
= str
->data
.wstr
;
978 end
= src
+ str
->len
;
980 while (src
< end
&& iswblank(*src
))
985 size_t newlen
= BWSLEN(str
) - (src
- dst
);
992 bws_setlen(str
, newlen
);
1000 * Rips out nonprinting characters (-i).
1003 ignore_nonprinting(struct bwstring
*str
)
1005 size_t newlen
= str
->len
;
1007 if (MB_CUR_MAX
== 1) {
1008 unsigned char *dst
, *end
, *src
;
1011 src
= str
->data
.cstr
;
1013 end
= src
+ str
->len
;
1027 wchar_t *dst
, *end
, *src
;
1030 src
= str
->data
.wstr
;
1032 end
= src
+ str
->len
;
1046 bws_setlen(str
, newlen
);
1052 * Rips out any characters that are not alphanumeric characters
1056 dictionary_order(struct bwstring
*str
)
1058 size_t newlen
= str
->len
;
1060 if (MB_CUR_MAX
== 1) {
1061 unsigned char *dst
, *end
, *src
;
1064 src
= str
->data
.cstr
;
1066 end
= src
+ str
->len
;
1070 if (isalnum(c
) || isblank(c
)) {
1080 wchar_t *dst
, *end
, *src
;
1083 src
= str
->data
.wstr
;
1085 end
= src
+ str
->len
;
1089 if (iswalnum(c
) || iswblank(c
)) {
1099 bws_setlen(str
, newlen
);
1105 * Converts string to lower case(-f).
1108 ignore_case(struct bwstring
*str
)
1111 if (MB_CUR_MAX
== 1) {
1112 unsigned char *end
, *s
;
1136 bws_disorder_warnx(struct bwstring
*s
, const char *fn
, size_t pos
)
1139 if (MB_CUR_MAX
== 1)
1140 warnx("%s:%zu: disorder: %s", fn
, pos
+ 1, s
->data
.cstr
);
1142 warnx("%s:%zu: disorder: %ls", fn
, pos
+ 1, s
->data
.wstr
);