Add "natural" option for $sort
[xapian.git] / xapian-applications / omega / sort.cc
blob19a91471ddbcba0ac5729dceee995cdb4a830bcb
1 /* @file sort.cc
2 * @brief OmegaScript $sort function
3 */
4 /* Copyright (C) 2001,2004,2012,2016,2018 Olly Betts
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as
8 * published by the Free Software Foundation; either version 2 of the
9 * License, or (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
19 * USA
22 #include <config.h>
24 #include "sort.h"
26 // For option["decimal"].
27 #include "omega.h"
28 #include "stringutils.h"
30 #include <algorithm>
31 #include <string>
32 #include <vector>
34 using namespace std;
36 // Return -1, 0 or 1 depending on sign of floating point number starting at
37 // index i of string s. Non-numbers give 0.
38 static int
39 string_sign(const string& s, const string& decimal, size_t i = 0)
41 int r = 1;
42 if (s[i] == '-') {
43 r = -1;
44 ++i;
46 while (s[i] == '0') {
47 ++i;
49 if (s.compare(i, decimal.size(), decimal) == 0) {
50 i += decimal.size();
51 while (s[i] == '0') {
52 ++i;
55 return C_isdigit(s[i]) ? r : 0;
58 // Compare strings of two non-zero floating point numbers, without loss of
59 // precision, and ignoring their signs.
61 // The format handled is (as a Perl regex):
63 // ^[ \t]*(-?[0-9]*($decimal[0-9]*)?)
65 // where $decimal is option["decimal"].
67 // Values which are equal by the numeric test are compared as strings.
68 static int
69 ncmp(const string& a, const string& b, const string& decimal)
71 // Ignore leading blanks, an optional minus sign, and leading zeros.
72 size_t ai = a.find_first_not_of(" \t");
73 if (a[ai] == '-') ++ai;
74 while (a[ai] == '0') ++ai;
76 size_t bi = b.find_first_not_of(" \t");
77 if (b[bi] == '-') ++bi;
78 while (b[bi] == '0') ++bi;
80 while (a[ai] == b[bi] && C_isdigit(a[ai])) {
81 // Matching digits.
82 ++ai;
83 ++bi;
86 if (C_isdigit(a[ai])) {
87 if (!C_isdigit(b[bi])) {
88 // a > b
89 return 1;
92 int r = a[ai] - b[bi];
93 bool a_digit, b_digit;
94 do {
95 ++ai;
96 ++bi;
97 a_digit = C_isdigit(a[ai]);
98 b_digit = C_isdigit(b[bi]);
99 } while (a_digit && b_digit);
101 if (!a_digit && !b_digit) {
102 // Same number of digits, but not the same digits.
103 return r;
105 return a_digit ? 1 : -1;
108 if (C_isdigit(b[bi])) {
109 // a < b
110 return -1;
113 // Non-fractional parts are the same - compare any fractional parts.
114 bool a_frac = (a.compare(ai, decimal.size(), decimal) == 0);
115 if (a_frac) ai += decimal.size();
116 bool b_frac = (b.compare(bi, decimal.size(), decimal) == 0);
117 if (b_frac) bi += decimal.size();
118 if (!a_frac && !b_frac) return 0;
119 if (!b_frac) {
120 // Check if a's fractional part is zero.
121 while (a[ai] == '0') ++ai;
122 return C_isdigit(a[ai]) ? 1 : 0;
124 if (!a_frac) {
125 // Check if b's fractional part is zero.
126 while (b[bi] == '0') ++bi;
127 return C_isdigit(b[bi]) ? -1 : 0;
130 // Both have fractional parts, so compare.
131 while (a[ai] == b[bi] && C_isdigit(a[ai])) {
132 ++ai;
133 ++bi;
135 if (C_isdigit(a[ai])) return 1;
136 if (C_isdigit(b[bi])) return -1;
137 return 0;
140 static int
141 natcmp(const string& a, const string& b)
143 size_t shorter = min(a.size(), b.size());
144 size_t i = 0;
145 while (i != shorter) {
146 int cha = static_cast<unsigned char>(a[i]);
147 int chb = static_cast<unsigned char>(b[i]);
148 if (!C_isdigit(cha)) {
149 if (cha == chb) {
150 // Matching non-digits.
151 ++i;
152 continue;
155 if (C_isdigit(chb)) return 1;
156 return cha - chb;
159 // Sort embedded digit spans by numeric value and before non-digits.
160 if (!C_isdigit(chb)) return -1;
162 // Skip any leading zeros on each.
163 size_t sa = i;
164 while (a[sa] == '0') ++sa;
165 size_t sb = i;
166 while (b[sb] == '0') ++sb;
168 size_t ea = sa;
169 size_t eb = sb;
170 int res = 0;
171 while (true) {
172 if (!C_isdigit(a[ea])) {
173 if (C_isdigit(b[eb])) {
174 // Number in b is longer and so larger.
175 return -1;
177 // Digit sequences are the same length.
178 break;
180 if (a[ea] != b[eb]) {
181 if (!C_isdigit(b[eb])) {
182 // Number in a is longer and so larger.
183 return 1;
185 // Record first difference between digits.
186 if (res == 0) res = a[ea] - b[eb];
188 ++ea;
189 ++eb;
192 if (res == 0) {
193 // More leading zeros sorts first.
194 res = int(sb) - int(sa);
196 if (res) return res;
198 // Digit sequences were identical, so keep going.
199 i = ea;
201 return int(a.size()) - int(b.size());
204 void
205 omegascript_sort(const vector<string>& args,
206 string& value)
208 const string &list = args[0];
209 if (list.empty()) return;
211 // 0 => string sort, '#' => natural, 'n' => numeric.
212 char mode = 0;
213 bool uniq = false;
214 bool rev = false;
215 if (args.size() > 1) {
216 for (auto opt_ch : args[1]) {
217 switch (opt_ch) {
218 case '#':
219 case 'n':
220 if (mode != 0) {
221 string m = "Invalid $sort option combination: ";
222 m += args[1];
223 throw m;
225 mode = opt_ch;
226 break;
227 case 'r':
228 rev = true;
229 break;
230 case 'u':
231 uniq = true;
232 break;
233 default:
234 throw string("Unknown $sort option: ") + opt_ch;
238 vector<string> items;
239 string::size_type split = 0, split2;
240 do {
241 split2 = list.find('\t', split);
242 items.emplace_back(list, split, split2 - split);
243 split = split2 + 1;
244 } while (split2 != string::npos);
246 if (mode != 'n') {
247 if (mode == 0) {
248 // String sort.
249 if (!rev) {
250 sort(items.begin(), items.end());
251 } else {
252 sort(items.begin(), items.end(),
253 [](const string& a, const string& b) {
254 return a > b;
257 } else {
258 // "Natural" sort - embedded natural numbers are handled specially.
259 if (!rev) {
260 sort(items.begin(), items.end(),
261 [](const string& a, const string& b) {
262 return natcmp(a, b) < 0;
264 } else {
265 sort(items.begin(), items.end(),
266 [](const string& a, const string& b) {
267 return natcmp(a, b) > 0;
272 value.reserve(list.size());
273 bool tab = false;
274 const string* prev = nullptr;
275 for (auto&& item : items) {
276 // Skip duplicates if "u" flag specified.
277 if (prev && *prev == item) {
278 continue;
280 if (uniq) {
281 prev = &item;
284 if (tab) {
285 value += '\t';
286 } else {
287 tab = true;
289 value += item;
291 return;
294 // Numeric sort.
296 // We first partition items such that all the negative values come first,
297 // then all the zero values, then all the positive values (for a forward
298 // search that is - for a reverse search the order of the partitions is
299 // reversed).
301 // Then we sort within each partition.
302 const string& decimal = option["decimal"];
303 size_t part_z = 0;
304 size_t part_f = items.size();
306 int first = rev ? 1 : -1;
307 if (!uniq) {
308 // Scan linearly through items, swapping to create the desired
309 // partitioning.
310 for (size_t i = 0; i < part_f; ) {
311 int sign = string_sign(items[i], decimal);
312 if (sign == first) {
313 // Swap value to end of the first partition.
314 if (part_z != i)
315 swap(items[part_z], items[i]);
316 ++part_z;
317 ++i;
318 } else if (sign == 0) {
319 // Extend middle partition to include zero value.
320 ++i;
321 } else {
322 // Swap value to start of final partition.
323 swap(items[i], items[--part_f]);
326 } else {
327 // Need to preserve order within each partition.
328 auto z = stable_partition(items.begin(), items.end(),
329 [&decimal, first](const string& a) {
330 return string_sign(a, decimal) == first;
332 part_z = z - items.begin();
333 auto f = stable_partition(z, items.end(),
334 [&decimal](const string& a) {
335 return string_sign(a, decimal) == 0;
337 part_f = f - items.begin();
340 value.reserve(list.size());
341 bool tab = false;
343 const string* prev = nullptr;
344 if (part_z > 0) {
345 // Sort the first partition.
346 if (!uniq) {
347 if (!rev) {
348 sort(items.begin(), items.begin() + part_z,
349 [&decimal](const string& a, const string& b) {
350 int r = ncmp(a, b, decimal);
351 return r ? r > 0 : a < b;
353 } else {
354 sort(items.begin(), items.begin() + part_z,
355 [&decimal](const string& a, const string& b) {
356 int r = ncmp(a, b, decimal);
357 return r ? r > 0 : a > b;
360 } else {
361 stable_sort(items.begin(), items.begin() + part_z,
362 [&decimal](const string& a, const string& b) {
363 return ncmp(a, b, decimal) > 0;
366 for (size_t i = 0; i != part_z; ++i) {
367 const string& item = items[i];
368 // Skip duplicates.
369 if (prev && ncmp(*prev, item, decimal) == 0) {
370 continue;
372 prev = &item;
374 if (tab) {
375 value += '\t';
376 } else {
377 tab = true;
379 value += item;
384 if (part_z != part_f) {
385 // Handle all the zero values.
386 if (!uniq) {
387 if (!rev) {
388 sort(items.begin() + part_z, items.begin() + part_f,
389 [](const string& a, const string& b) {
390 return a < b;
392 } else {
393 sort(items.begin() + part_z, items.begin() + part_f,
394 [](const string& a, const string& b) {
395 return a > b;
398 } else {
399 if (tab) {
400 value += '\t';
401 } else {
402 tab = true;
404 // No need to actually sort this partition when uniq is true -
405 // we just take the first element.
406 value += items[part_z];
410 if (part_f != items.size()) {
411 // Sort the final partition.
412 if (!uniq) {
413 if (!rev) {
414 sort(items.begin() + part_f, items.end(),
415 [&decimal](const string& a, const string& b) {
416 int r = ncmp(a, b, decimal);
417 return r ? r < 0 : a < b;
419 } else {
420 sort(items.begin() + part_f, items.end(),
421 [&decimal](const string& a, const string& b) {
422 int r = ncmp(a, b, decimal);
423 return r ? r < 0 : a > b;
426 } else {
427 stable_sort(items.begin() + part_f, items.end(),
428 [&decimal](const string& a, const string& b) {
429 return ncmp(a, b, decimal) < 0;
432 for (size_t i = part_f; i != items.size(); ++i) {
433 const string& item = items[i];
434 // Skip duplicates.
435 if (prev && ncmp(*prev, item, decimal) == 0) {
436 continue;
438 prev = &item;
440 if (tab) {
441 value += '\t';
442 } else {
443 tab = true;
445 value += item;
450 // If uniq is true we already assembled the output in value as we processed
451 // each partition.
452 if (!uniq) {
453 for (auto&& item : items) {
454 if (tab) {
455 value += '\t';
456 } else {
457 tab = true;
459 value += item;