elf: Add a way to check if tunable is set (BZ 27069)
[glibc.git] / stdlib / qsort.c
blobbe01fb5598de2257ceaebf967826c15fab2ef398
1 /* Copyright (C) 1991-2023 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
4 The GNU C Library is free software; you can redistribute it and/or
5 modify it under the terms of the GNU Lesser General Public
6 License as published by the Free Software Foundation; either
7 version 2.1 of the License, or (at your option) any later version.
9 The GNU C Library is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 Lesser General Public License for more details.
14 You should have received a copy of the GNU Lesser General Public
15 License along with the GNU C Library; if not, see
16 <https://www.gnu.org/licenses/>. */
18 /* If you consider tuning this algorithm, you should consult first:
19 Engineering a sort function; Jon Bentley and M. Douglas McIlroy;
20 Software - Practice and Experience; Vol. 23 (11), 1249-1265, 1993. */
22 #include <limits.h>
23 #include <memswap.h>
24 #include <stdlib.h>
25 #include <string.h>
26 #include <stdbool.h>
28 /* Swap SIZE bytes between addresses A and B. These helpers are provided
29 along the generic one as an optimization. */
31 enum swap_type_t
33 SWAP_WORDS_64,
34 SWAP_WORDS_32,
35 SWAP_BYTES
38 /* If this function returns true, elements can be safely copied using word
39 loads and stores. Otherwise, it might not be safe. BASE (as an integer)
40 must be a multiple of the word alignment. SIZE must be a multiple of
41 WORDSIZE. Since WORDSIZE must be a multiple of the word alignment, and
42 WORDSIZE is a power of two on all supported platforms, this function for
43 speed merely checks that BASE and SIZE are both multiples of the word
44 size. */
45 static inline bool
46 is_aligned (const void *base, size_t size, size_t wordsize)
48 return (((uintptr_t) base | size) & (wordsize - 1)) == 0;
51 static inline void
52 swap_words_64 (void * restrict a, void * restrict b, size_t n)
54 typedef uint64_t __attribute__ ((__may_alias__)) u64_alias_t;
57 n -= 8;
58 u64_alias_t t = *(u64_alias_t *)(a + n);
59 *(u64_alias_t *)(a + n) = *(u64_alias_t *)(b + n);
60 *(u64_alias_t *)(b + n) = t;
61 } while (n);
64 static inline void
65 swap_words_32 (void * restrict a, void * restrict b, size_t n)
67 typedef uint32_t __attribute__ ((__may_alias__)) u32_alias_t;
70 n -= 4;
71 u32_alias_t t = *(u32_alias_t *)(a + n);
72 *(u32_alias_t *)(a + n) = *(u32_alias_t *)(b + n);
73 *(u32_alias_t *)(b + n) = t;
74 } while (n);
77 /* Replace the indirect call with a serie of if statements. It should help
78 the branch predictor. */
79 static void
80 do_swap (void * restrict a, void * restrict b, size_t size,
81 enum swap_type_t swap_type)
83 if (swap_type == SWAP_WORDS_64)
84 swap_words_64 (a, b, size);
85 else if (swap_type == SWAP_WORDS_32)
86 swap_words_32 (a, b, size);
87 else
88 __memswap (a, b, size);
91 /* Discontinue quicksort algorithm when partition gets below this size.
92 This particular magic number was chosen to work best on a Sun 4/260. */
93 #define MAX_THRESH 4
95 /* Stack node declarations used to store unfulfilled partition obligations. */
96 typedef struct
98 char *lo;
99 char *hi;
100 size_t depth;
101 } stack_node;
103 /* The stack needs log (total_elements) entries (we could even subtract
104 log(MAX_THRESH)). Since total_elements has type size_t, we get as
105 upper bound for log (total_elements):
106 bits per byte (CHAR_BIT) * sizeof(size_t). */
107 enum { STACK_SIZE = CHAR_BIT * sizeof (size_t) };
109 static inline stack_node *
110 push (stack_node *top, char *lo, char *hi, size_t depth)
112 top->lo = lo;
113 top->hi = hi;
114 top->depth = depth;
115 return ++top;
118 static inline stack_node *
119 pop (stack_node *top, char **lo, char **hi, size_t *depth)
121 --top;
122 *lo = top->lo;
123 *hi = top->hi;
124 *depth = top->depth;
125 return top;
128 /* Establish the heap condition at index K, that is, the key at K will
129 not be less than either of its children, at 2 * K + 1 and 2 * K + 2
130 (if they exist). N is the last valid index. */
131 static inline void
132 siftdown (void *base, size_t size, size_t k, size_t n,
133 enum swap_type_t swap_type, __compar_d_fn_t cmp, void *arg)
135 /* There can only be a heap condition violation if there are
136 children. */
137 while (2 * k + 1 <= n)
139 /* Left child. */
140 size_t j = 2 * k + 1;
141 /* If the right child is larger, use it. */
142 if (j < n && cmp (base + (j * size), base + ((j + 1) * size), arg) < 0)
143 j++;
145 /* If k is already >= to its children, we are done. */
146 if (j == k || cmp (base + (k * size), base + (j * size), arg) >= 0)
147 break;
149 /* Heal the violation. */
150 do_swap (base + (size * j), base + (k * size), size, swap_type);
152 /* Swapping with j may have introduced a violation at j. Fix
153 it in the next loop iteration. */
154 k = j;
158 /* Establish the heap condition for the indices 0 to N (inclusive). */
159 static inline void
160 heapify (void *base, size_t size, size_t n, enum swap_type_t swap_type,
161 __compar_d_fn_t cmp, void *arg)
163 /* If n is odd, k = n / 2 has a left child at n, so this is the
164 largest index that can have a heap condition violation regarding
165 its children. */
166 size_t k = n / 2;
167 while (1)
169 siftdown (base, size, k, n, swap_type, cmp, arg);
170 if (k-- == 0)
171 break;
175 /* A non-recursive heapsort, used on introsort implementation as a
176 fallback routine with worst-case performance of O(nlog n) and
177 worst-case space complexity of O(1). It sorts the array starting
178 at BASE and ending at END (inclusive), with each element of SIZE
179 bytes. The SWAP_TYPE is the callback function used to swap
180 elements, and CMP is the function used to compare elements. */
181 static void
182 heapsort_r (void *base, void *end, size_t size, enum swap_type_t swap_type,
183 __compar_d_fn_t cmp, void *arg)
185 size_t n = ((uintptr_t) end - (uintptr_t) base) / size;
186 if (n <= 1)
187 /* Handled by insertion sort. */
188 return;
190 /* Build the binary heap, largest value at the base[0]. */
191 heapify (base, size, n, swap_type, cmp, arg);
193 while (true)
195 /* Indices 0 .. n contain the binary heap. Extract the largest
196 element put it into the final position in the array. */
197 do_swap (base, base + (n * size), size, swap_type);
199 /* The heap is now one element shorter. */
200 n--;
201 if (n == 0)
202 break;
204 /* By swapping in elements 0 and the previous value of n (now at
205 n + 1), we likely introduced a heap condition violation. Fix
206 it for the reduced heap. */
207 siftdown (base, size, 0, n, swap_type, cmp, arg);
211 static inline void
212 insertion_sort_qsort_partitions (void *const pbase, size_t total_elems,
213 size_t size, enum swap_type_t swap_type,
214 __compar_d_fn_t cmp, void *arg)
216 char *base_ptr = (char *) pbase;
217 char *const end_ptr = &base_ptr[size * (total_elems - 1)];
218 char *tmp_ptr = base_ptr;
219 #define min(x, y) ((x) < (y) ? (x) : (y))
220 const size_t max_thresh = MAX_THRESH * size;
221 char *thresh = min(end_ptr, base_ptr + max_thresh);
222 char *run_ptr;
224 /* Find smallest element in first threshold and place it at the
225 array's beginning. This is the smallest array element,
226 and the operation speeds up insertion sort's inner loop. */
228 for (run_ptr = tmp_ptr + size; run_ptr <= thresh; run_ptr += size)
229 if (cmp (run_ptr, tmp_ptr, arg) < 0)
230 tmp_ptr = run_ptr;
232 if (tmp_ptr != base_ptr)
233 do_swap (tmp_ptr, base_ptr, size, swap_type);
235 /* Insertion sort, running from left-hand-side up to right-hand-side. */
237 run_ptr = base_ptr + size;
238 while ((run_ptr += size) <= end_ptr)
240 tmp_ptr = run_ptr - size;
241 while (run_ptr != tmp_ptr && cmp (run_ptr, tmp_ptr, arg) < 0)
242 tmp_ptr -= size;
244 tmp_ptr += size;
245 if (tmp_ptr != run_ptr)
247 char *trav;
249 trav = run_ptr + size;
250 while (--trav >= run_ptr)
252 char c = *trav;
253 char *hi, *lo;
255 for (hi = lo = trav; (lo -= size) >= tmp_ptr; hi = lo)
256 *hi = *lo;
257 *hi = c;
263 /* Order size using quicksort. This implementation incorporates
264 four optimizations discussed in Sedgewick:
266 1. Non-recursive, using an explicit stack of pointer that store the
267 next array partition to sort. To save time, this maximum amount
268 of space required to store an array of SIZE_MAX is allocated on the
269 stack. Assuming a 32-bit (64 bit) integer for size_t, this needs
270 only 32 * sizeof(stack_node) == 256 bytes (for 64 bit: 1024 bytes).
271 Pretty cheap, actually.
273 2. Chose the pivot element using a median-of-three decision tree.
274 This reduces the probability of selecting a bad pivot value and
275 eliminates certain extraneous comparisons.
277 3. Only quicksorts TOTAL_ELEMS / MAX_THRESH partitions, leaving
278 insertion sort to order the MAX_THRESH items within each partition.
279 This is a big win, since insertion sort is faster for small, mostly
280 sorted array segments.
282 4. The larger of the two sub-partitions is always pushed onto the
283 stack first, with the algorithm then concentrating on the
284 smaller partition. This *guarantees* no more than log (total_elems)
285 stack size is needed (actually O(1) in this case)! */
287 void
288 __qsort_r (void *const pbase, size_t total_elems, size_t size,
289 __compar_d_fn_t cmp, void *arg)
291 char *base_ptr = (char *) pbase;
293 const size_t max_thresh = MAX_THRESH * size;
295 if (total_elems <= 1)
296 /* Avoid lossage with unsigned arithmetic below. */
297 return;
299 enum swap_type_t swap_type;
300 if (is_aligned (pbase, size, 8))
301 swap_type = SWAP_WORDS_64;
302 else if (is_aligned (pbase, size, 4))
303 swap_type = SWAP_WORDS_32;
304 else
305 swap_type = SWAP_BYTES;
307 /* Maximum depth before quicksort switches to heapsort. */
308 size_t depth = 2 * (sizeof (size_t) * CHAR_BIT - 1
309 - __builtin_clzl (total_elems));
311 if (total_elems > MAX_THRESH)
313 char *lo = base_ptr;
314 char *hi = &lo[size * (total_elems - 1)];
315 stack_node stack[STACK_SIZE];
316 stack_node *top = push (stack, NULL, NULL, depth);
318 while (stack < top)
320 if (depth == 0)
322 heapsort_r (lo, hi, size, swap_type, cmp, arg);
323 top = pop (top, &lo, &hi, &depth);
324 continue;
327 char *left_ptr;
328 char *right_ptr;
330 /* Select median value from among LO, MID, and HI. Rearrange
331 LO and HI so the three values are sorted. This lowers the
332 probability of picking a pathological pivot value and
333 skips a comparison for both the LEFT_PTR and RIGHT_PTR in
334 the while loops. */
336 char *mid = lo + size * ((hi - lo) / size >> 1);
338 if ((*cmp) ((void *) mid, (void *) lo, arg) < 0)
339 do_swap (mid, lo, size, swap_type);
340 if ((*cmp) ((void *) hi, (void *) mid, arg) < 0)
341 do_swap (mid, hi, size, swap_type);
342 else
343 goto jump_over;
344 if ((*cmp) ((void *) mid, (void *) lo, arg) < 0)
345 do_swap (mid, lo, size, swap_type);
346 jump_over:;
348 left_ptr = lo + size;
349 right_ptr = hi - size;
351 /* Here's the famous ``collapse the walls'' section of quicksort.
352 Gotta like those tight inner loops! They are the main reason
353 that this algorithm runs much faster than others. */
356 while (left_ptr != mid
357 && (*cmp) ((void *) left_ptr, (void *) mid, arg) < 0)
358 left_ptr += size;
360 while (right_ptr != mid
361 && (*cmp) ((void *) mid, (void *) right_ptr, arg) < 0)
362 right_ptr -= size;
364 if (left_ptr < right_ptr)
366 do_swap (left_ptr, right_ptr, size, swap_type);
367 if (mid == left_ptr)
368 mid = right_ptr;
369 else if (mid == right_ptr)
370 mid = left_ptr;
371 left_ptr += size;
372 right_ptr -= size;
374 else if (left_ptr == right_ptr)
376 left_ptr += size;
377 right_ptr -= size;
378 break;
381 while (left_ptr <= right_ptr);
383 /* Set up pointers for next iteration. First determine whether
384 left and right partitions are below the threshold size. If so,
385 ignore one or both. Otherwise, push the larger partition's
386 bounds on the stack and continue sorting the smaller one. */
388 if ((size_t) (right_ptr - lo) <= max_thresh)
390 if ((size_t) (hi - left_ptr) <= max_thresh)
391 /* Ignore both small partitions. */
393 top = pop (top, &lo, &hi, &depth);
394 --depth;
396 else
398 /* Ignore small left partition. */
399 lo = left_ptr;
400 --depth;
403 else if ((size_t) (hi - left_ptr) <= max_thresh)
404 /* Ignore small right partition. */
406 hi = right_ptr;
407 --depth;
409 else if ((right_ptr - lo) > (hi - left_ptr))
411 /* Push larger left partition indices. */
412 top = push (top, lo, right_ptr, depth - 1);
413 lo = left_ptr;
415 else
417 /* Push larger right partition indices. */
418 top = push (top, left_ptr, hi, depth - 1);
419 hi = right_ptr;
424 /* Once the BASE_PTR array is partially sorted by quicksort the rest
425 is completely sorted using insertion sort, since this is efficient
426 for partitions below MAX_THRESH size. BASE_PTR points to the beginning
427 of the array to sort, and END_PTR points at the very last element in
428 the array (*not* one beyond it!). */
429 insertion_sort_qsort_partitions (pbase, total_elems, size, swap_type, cmp,
430 arg);
432 libc_hidden_def (__qsort_r)
433 weak_alias (__qsort_r, qsort_r)
435 void
436 qsort (void *b, size_t n, size_t s, __compar_fn_t cmp)
438 return __qsort_r (b, n, s, (__compar_d_fn_t) cmp, NULL);
440 libc_hidden_def (qsort)