From 0749dcdb9d984fad9de8c2b4fbd36b5f95f54104 Mon Sep 17 00:00:00 2001 From: Armin Burgmeier Date: Wed, 1 Feb 2012 22:11:46 +0100 Subject: [PATCH] Replace std::sort by timsort for Face ordering The usage of timsort instead of std::sort at this point is twofold. First, it's faster in our case where the array is already sorted in many cases (remember this is called at least once a frame). And it's not just a bit faster either but a lot. I have measured a factor of 7 on my system. Second, in our Windows autobuilds there is a crash within std::sort which is very hard to debug because it's hardly reproducible with anything other than the autobuilds (I tried hard). If the crash goes away with timsort then great, if not then maybe it's easier to debug since the code is in our tree. --- CMakeLists.txt | 2 +- src/lib/StdMesh.cpp | 47 ++- thirdparty/timsort/README.rst | 131 +++++++ thirdparty/timsort/sort.h | 627 ++++++++++++++++++++++++++++++++++ 4 files changed, 800 insertions(+), 7 deletions(-) create mode 100644 thirdparty/timsort/README.rst create mode 100644 thirdparty/timsort/sort.h diff --git a/CMakeLists.txt b/CMakeLists.txt index ad44deb5b..1e0c9004e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -548,7 +548,7 @@ set(OC_CLONK_SOURCES src/zlib/gzio.h src/zlib/gzio.c src/zlib/zutil.h - + thirdparty/timsort/sort.h ) mark_as_advanced(OC_CLONK_SOURCES) mark_as_advanced(OC_SYSTEM_SOURCES) diff --git a/src/lib/StdMesh.cpp b/src/lib/StdMesh.cpp index ec0637955..de0779d2e 100644 --- a/src/lib/StdMesh.cpp +++ b/src/lib/StdMesh.cpp @@ -22,6 +22,13 @@ #include #include +static int StdMeshFaceCmp(const StdMeshFace& face1, const StdMeshFace& face2); + +#define SORT_NAME StdMesh +#define SORT_TYPE StdMeshFace +#define SORT_CMP StdMeshFaceCmp +#include "timsort/sort.h" + std::vector* StdMeshInstance::SerializableValueProvider::IDs = NULL; namespace @@ -57,13 +64,18 @@ namespace } bool operator()(const StdMeshFace& face1, const StdMeshFace& face2) const + { + return 
compare(face1, face2) < 0; + } + + int compare(const StdMeshFace& face1, const StdMeshFace& face2) const { // TODO: Need to apply attach matrix in case of attached meshes switch (m_inst.GetFaceOrdering()) { case StdSubMeshInstance::FO_Fixed: assert(false); - return false; + return 0; case StdSubMeshInstance::FO_FarthestToNearest: case StdSubMeshInstance::FO_NearestToFarthest: { @@ -78,13 +90,13 @@ namespace float z2 = std::max(std::max(z21, z22), z23); if (m_inst.GetFaceOrdering() == StdSubMeshInstance::FO_FarthestToNearest) - return z1 < z2; + return (z1 < z2 ? -1 : (z1 > z2 ? +1 : 0)); else - return z2 < z1; + return (z2 < z1 ? -1 : (z2 > z1 ? +1 : 0)); } default: assert(false); - return false; + return 0; } } }; @@ -131,7 +143,7 @@ namespace ALLOW_TEMP_TO_REF(ValueProviderAdapt) }; - + ValueProviderAdapt mkValueProviderAdapt(StdMeshInstance::ValueProvider** ValueProvider) { return ValueProviderAdapt(ValueProvider); } // Serialize a bone index by name with StdCompiler @@ -212,6 +224,13 @@ namespace return true; } + + StdMeshInstanceFaceOrderingCmpPred* g_pred = NULL; +} + +static int StdMeshFaceCmp(const StdMeshFace& face1, const StdMeshFace& face2) +{ + return g_pred->compare(face1, face2); } StdMeshTransformation StdMeshTrack::GetTransformAt(float time) const @@ -1130,6 +1149,7 @@ bool StdMeshInstance::UpdateBoneTransforms() return was_dirty; } +//#include void StdMeshInstance::ReorderFaces(StdMeshMatrix* global_trans) { for (unsigned int i = 0; i < SubMeshInstances.size(); ++i) @@ -1139,7 +1159,22 @@ void StdMeshInstance::ReorderFaces(StdMeshMatrix* global_trans) { StdMeshInstanceFaceOrderingCmpPred pred(inst, global_trans ? *global_trans : StdMeshMatrix::Identity()); - std::sort(inst.Faces.begin(), inst.Faces.end(), pred); + // The usage of timsort instead of std::sort at this point is twofold. + // First, it's faster in our case where the array is already sorted in + // many cases (remember this is called at least once a frame). 
+ // And it's not just a bit faster either but a lot. I have measured + // a factor of 7 on my system. + // Second, in our Windows autobuilds there is a crash within std::sort + // which is very hard to debug because it's hardly reproducible with + // anything other than the autobuilds (I tried hard). If the crash goes + // away with timsort then great, if not then maybe it's easier to debug + // since the code is in our tree. + + //std::sort(inst.Faces.begin(), inst.Faces.end(), pred); + + g_pred = &pred; + StdMesh_tim_sort(&inst.Faces[0], inst.Faces.size()); + g_pred = NULL; } } diff --git a/thirdparty/timsort/README.rst b/thirdparty/timsort/README.rst new file mode 100644 index 000000000..70ced608e --- /dev/null +++ b/thirdparty/timsort/README.rst @@ -0,0 +1,131 @@ +------ +sort.h +------ + +Overview +-------- + +sort.h is an implementation of a ton of sorting algorithms in C with a +user-defined type that is defined at include time. + +This means you don't have to pay the function call overhead of using +a standard library routine. + +You get the choice of many extra sorting routines as well, including: + +* Shell sort +* Binary insertion sort +* Heap sort +* Quick sort +* Merge sort +* Bubble sort (ugh) +* Tim sort + +If you don't know which one to use, you should probably use Tim sort. + + +Usage +----- + +To use this library, you need to do three things: + +* #define SORT_TYPE to be the type of the elements of the array you + want to sort. +* #define SORT_NAME to be a unique name that will be prepended to all + the routines, e.g., #define SORT_NAME mine would give you routines + named mine_heap_sort, and so forth. +* #include "sort.h". Make sure that sort.h is in your include path, + obviously. + +Then, enjoy using the sorting routines. + +See demo.c for example usage. 
+ +If you are going to use your own custom type, you must redefine +SORT_CMP(x, y) with your comparison function, so that it returns +a value less than zero if x < y, equal to zero if x == y, and +greater than 0 if x > y. + +The default just uses the builtin <, ==, and > operators: + +#define SORT_CMP(x, y) ((x) < (y) ? -1 : ((x) == (y) ? 0 : 1)) + +It is often fine to just subtract the arguments as well (though +this can cause some stability problems with floating-point types): + +#define SORT_CMP(x, y) ((x) - (y)) + +Speed of routines +----------------- + +The speed of each routine is highly dependent on your computer and the +structure of your data. + +If your data has a lot of partially sorted sequences, then Tim sort +will beat the pants off of anything else. + +In general, Tim sort is probably the best sorting algorithm in this library, +even for random data. + +Tim sort is not as good if memory movement is many orders of magnitude more +expensive than comparisons (that is, far more than for plain int and double). +If so, then quick sort is probably your routine. On the other hand, Tim +sort does extremely well if the comparison operator is very expensive, +since it strives hard to minimize comparisons. 
+ +Here is the output of demo.c, which will give you the timings for a run of +10,000 things on my old Mac Pro (2006-era 2.66 GHz Xeons, 64-bit) on OS X 10.6: + +:: + + Running tests + quick sort time: 740.20 us per iteration + bubble sort time: 183914.60 us per iteration + merge sort time: 954.20 us per iteration + binary insertion sort time: 20472.70 us per iteration + heap sort time: 994.50 us per iteration + shell sort time: 1170.30 us per iteration + tim sort time: 708.50 us per iteration + + +Author +------ +Christopher Swenson (chris@caswenson.com) + + +References +---------- + +* Wikipedia +* timsort.txt (under doc/) + + +License +------- + +All code in this repository, unless otherwise specified, is hereby +licensed under the MIT Public License: + +Copyright (c) 2010 Christopher Swenson + + Permission is hereby granted, free of charge, to any person + obtaining a copy of this software and associated documentation + files (the "Software"), to deal in the Software without + restriction, including without limitation the rights to use, + copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the + Software is furnished to do so, subject to the following + conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + OTHER DEALINGS IN THE SOFTWARE. 
+ diff --git a/thirdparty/timsort/sort.h b/thirdparty/timsort/sort.h new file mode 100644 index 000000000..98c122e0f --- /dev/null +++ b/thirdparty/timsort/sort.h @@ -0,0 +1,627 @@ +#include +#include +#include +#include + +#ifndef SORT_NAME +#error "Must declare SORT_NAME" +#endif + +#ifndef SORT_TYPE +#error "Must declare SORT_TYPE" +#endif + +#ifndef SORT_CMP +#define SORT_CMP(x, y) ((x) < (y) ? -1 : ((x) == (y) ? 0 : 1)) +#endif + + +#ifndef CLZ +#ifdef __GNUC__ +#define CLZ __builtin_clzll +#else + +// adapted from Hacker's Delight +int clzll(uint64_t x) { + int n; + + if (x == 0) return(64); + n = 0; + if (x <= 0x00000000FFFFFFFFL) {n = n + 32; x = x << 32;} + if (x <= 0x0000FFFFFFFFFFFFL) {n = n + 16; x = x << 16;} + if (x <= 0x00FFFFFFFFFFFFFFL) {n = n + 8; x = x << 8;} + if (x <= 0x0FFFFFFFFFFFFFFFL) {n = n + 4; x = x << 4;} + if (x <= 0x3FFFFFFFFFFFFFFFL) {n = n + 2; x = x << 2;} + if (x <= 0x7FFFFFFFFFFFFFFFL) {n = n + 1;} + return n; +} + +#define CLZ clzll +#endif +#endif + + +#define SORT_SWAP(x,y) ({SORT_TYPE __SORT_SWAP_t = (x); (x) = (y); (y) = __SORT_SWAP_t;}) + +#define SORT_CONCAT(x, y) x ## _ ## y +#define SORT_MAKE_STR1(x, y) SORT_CONCAT(x,y) +#define SORT_MAKE_STR(x) SORT_MAKE_STR1(SORT_NAME,x) + + +#define SHELL_SORT SORT_MAKE_STR(shell_sort) +#define BINARY_INSERTION_SORT SORT_MAKE_STR(binary_insertion_sort) +#define HEAP_SORT SORT_MAKE_STR(heap_sort) +#define QUICK_SORT SORT_MAKE_STR(quick_sort) +#define MERGE_SORT SORT_MAKE_STR(merge_sort) +#define BUBBLE_SORT SORT_MAKE_STR(bubble_sort) +#define TIM_SORT SORT_MAKE_STR(tim_sort) + +#define TIM_SORT_RUN_T SORT_MAKE_STR(tim_sort_run_t) +#define TEMP_STORAGE_T SORT_MAKE_STR(temp_storage_t) + +#ifndef MAX +#define MAX(x,y) (((x) > (y) ? (x) : (y))) +#endif +#ifndef MIN +#define MIN(x,y) (((x) < (y) ? 
(x) : (y))) +#endif + +typedef struct { + int64_t start; + int64_t length; +} TIM_SORT_RUN_T; + + +void SHELL_SORT(SORT_TYPE *dst, const size_t size); +void BINARY_INSERTION_SORT(SORT_TYPE *dst, const size_t size); +void HEAP_SORT(SORT_TYPE *dst, const size_t size); +void QUICK_SORT(SORT_TYPE *dst, const size_t size); +void MERGE_SORT(SORT_TYPE *dst, const size_t size); +void BUBBLE_SORT(SORT_TYPE *dst, const size_t size); +void TIM_SORT(SORT_TYPE *dst, const size_t size); + + +/* From http://oeis.org/classic/A102549 */ +static const uint64_t shell_gaps[48] = {1, 4, 10, 23, 57, 132, 301, 701, 1750, 4376, 10941, 27353, 68383, 170958, 427396, 1068491, 2671228, 6678071, 16695178, 41737946, 104344866, 260862166, 652155416, 1630388541, 4075971353LL, 10189928383LL, 25474820958LL, 63687052396LL, 159217630991LL, 398044077478LL, 995110193696LL, 2487775484241LL, 6219438710603LL, 15548596776508LL, 38871491941271LL, 97178729853178LL, 242946824632946LL, 607367061582366LL, 1518417653955916LL, 3796044134889791LL, 9490110337224478LL, 23725275843061196LL, 59313189607652991LL, 148282974019132478LL, 370707435047831196LL, 926768587619577991LL, 2316921469048944978LL, 5792303672622362446LL}; + +/* Shell sort implementation based on Wikipedia article + http://en.wikipedia.org/wiki/Shell_sort +*/ +void SHELL_SORT(SORT_TYPE *dst, const size_t size) +{ + // TODO: binary search to find first gap? 
+ int inci = 47; + int64_t inc = shell_gaps[inci]; + while (inc > (size >> 1)) + { + inc = shell_gaps[--inci]; + } + int64_t i; + while (1) + { + for (i = inc; i < size; i++) + { + SORT_TYPE temp = dst[i]; + int64_t j = i; + while ((j >= inc) && (SORT_CMP(dst[j - inc], temp) > 0)) + { + dst[j] = dst[j - inc]; + j -= inc; + } + dst[j] = temp; + } + if (inc == 1) break; + inc = shell_gaps[--inci]; + } +} + +/* Function used to do a binary search for binary insertion sort */ +static inline int64_t binary_insertion_find(SORT_TYPE *dst, const SORT_TYPE x, const size_t size) +{ + int64_t l, c, r; + l = 0; + r = size - 1; + c = r >> 1; + SORT_TYPE lx, cx, rx; + lx = dst[l]; + + /* check for beginning conditions */ + if (SORT_CMP(x, lx) < 0) + return 0; + else if (SORT_CMP(x, lx) == 0) + { + int64_t i = 1; + while (SORT_CMP(x, dst[i]) == 0) i++; + return i; + } + + rx = dst[r]; + // guaranteed not to be >= rx + cx = dst[c]; + while (1) + { + const int val = SORT_CMP(x, cx); + if (val < 0) + { + if (c - l <= 1) return c; + r = c; + rx = cx; + } + else if (val > 0) + { + if (r - c <= 1) return c + 1; + l = c; + lx = cx; + } + else + { + do + { + cx = dst[++c]; + } while (SORT_CMP(x, cx) == 0); + return c; + } + c = l + ((r - l) >> 1); + cx = dst[c]; + } +} + +/* Binary insertion sort, but knowing that the first "start" entries are sorted. Used in timsort. 
*/ +static inline void binary_insertion_sort_start(SORT_TYPE *dst, const size_t start, const size_t size) +{ + int64_t i; + for (i = start; i < size; i++) + { + int64_t j; + /* If this entry is already correct, just move along */ + if (SORT_CMP(dst[i - 1], dst[i]) <= 0) continue; + + /* Else we need to find the right place, shift everything over, and squeeze in */ + SORT_TYPE x = dst[i]; + int64_t location = binary_insertion_find(dst, x, i); + for (j = i - 1; j >= location; j--) + { + dst[j + 1] = dst[j]; + } + dst[location] = x; + } +} + +/* Binary insertion sort */ +void BINARY_INSERTION_SORT(SORT_TYPE *dst, const size_t size) +{ + binary_insertion_sort_start(dst, 1, size); +} + + +void BUBBLE_SORT(SORT_TYPE *dst, const size_t size) +{ + int64_t i; + int64_t j; + for (i = 0; i < size; i++) + { + for (j = i + 1; j < size; j++) + { + if (SORT_CMP(dst[j], dst[i]) < 0) + SORT_SWAP(dst[i], dst[j]); + } + } +} + +void MERGE_SORT(SORT_TYPE *dst, const size_t size) +{ + if (size < 16) + { + BINARY_INSERTION_SORT(dst, size); + return; + } + + const int64_t middle = size / 2; + + MERGE_SORT(dst, middle); + MERGE_SORT(&dst[middle], size - middle); + + SORT_TYPE newdst[size]; + int64_t out = 0; + int64_t i = 0; + int64_t j = middle; + while (out != size) + { + if (i < middle) + { + if (j < size) + { + if (SORT_CMP(dst[i], dst[j]) <= 0) + newdst[out] = dst[i++]; + else + newdst[out] = dst[j++]; + } + else + newdst[out] = dst[i++]; + } + else + newdst[out] = dst[j++]; + out++; + } + memcpy(dst, newdst, size * sizeof(SORT_TYPE)); +} + + +/* quick sort: based on wikipedia */ + +static inline int64_t quick_sort_partition(SORT_TYPE *dst, const int64_t left, const int64_t right, const int64_t pivot) +{ + SORT_TYPE value = dst[pivot]; + SORT_SWAP(dst[pivot], dst[right]); + int64_t index = left; + int64_t i; + for (i = left; i < right; i++) + { + if (SORT_CMP(dst[i], value) <= 0) + { + SORT_SWAP(dst[i], dst[index]); + index++; + } + } + SORT_SWAP(dst[right], dst[index]); + return 
index; +} + +static void quick_sort_recursive(SORT_TYPE *dst, const int64_t left, const int64_t right) +{ + if (right <= left) return; + if ((right - left + 1) < 16) + { + BINARY_INSERTION_SORT(&dst[left], right - left + 1); + return; + } + const int64_t pivot = left + ((right - left) >> 1); + const int64_t new_pivot = quick_sort_partition(dst, left, right, pivot); + quick_sort_recursive(dst, left, new_pivot - 1); + quick_sort_recursive(dst, new_pivot + 1, right); +} + +void QUICK_SORT(SORT_TYPE *dst, const size_t size) +{ + quick_sort_recursive(dst, 0, size - 1); +} + + +/* timsort implementation, based on timsort.txt */ + +static inline void reverse_elements(SORT_TYPE *dst, int64_t start, int64_t end) +{ + while (1) + { + if (start >= end) return; + SORT_SWAP(dst[start], dst[end]); + start++; + end--; + } +} + +static inline int64_t count_run(SORT_TYPE *dst, const int64_t start, const size_t size) +{ + if (size - start == 1) return 1; + if (start >= size - 2) + { + if (SORT_CMP(dst[size - 2], dst[size - 1]) > 0) + SORT_SWAP(dst[size - 2], dst[size - 1]); + return 2; + } + + int64_t curr = start + 2; + + if (SORT_CMP(dst[start], dst[start + 1]) <= 0) + { + // increasing run + while (1) + { + if (curr == size - 1) break; + if (SORT_CMP(dst[curr - 1], dst[curr]) > 0) break; + curr++; + } + return curr - start; + } + else + { + // decreasing run + while (1) + { + if (curr == size - 1) break; + if (SORT_CMP(dst[curr - 1], dst[curr]) <= 0) break; + curr++; + } + // reverse in-place + reverse_elements(dst, start, curr - 1); + return curr - start; + } +} + +static inline int compute_minrun(const uint64_t size) +{ + const int top_bit = 64 - CLZ(size); + const int shift = MAX(top_bit, 6) - 6; + const int minrun = size >> shift; + const uint64_t mask = (1ULL << shift) - 1; + if (mask & size) return minrun + 1; + return minrun; +} + +#define PUSH_NEXT() do {\ +len = count_run(dst, curr, size);\ +run = minrun;\ +if (run < minrun) run = minrun;\ +if (run > size - curr) run = 
size - curr;\ +if (run > len)\ +{\ + binary_insertion_sort_start(&dst[curr], len, run);\ + len = run;\ +}\ +run_stack[stack_curr++] = (TIM_SORT_RUN_T) {curr, len};\ +curr += len;\ +if (curr == size)\ +{\ + /* finish up */ \ + while (stack_curr > 1) \ + { \ + tim_sort_merge(dst, run_stack, stack_curr, store); \ + run_stack[stack_curr - 2].length += run_stack[stack_curr - 1].length; \ + stack_curr--; \ + } \ + if (store->storage != NULL)\ + {\ + free(store->storage);\ + store->storage = NULL;\ + }\ + return;\ +}\ +}\ +while (0) + +static inline int check_invariant(TIM_SORT_RUN_T *stack, const int stack_curr) +{ + if (stack_curr < 2) return 1; + if (stack_curr == 2) + { + const int64_t A = stack[stack_curr - 2].length; + const int64_t B = stack[stack_curr - 1].length; + if (A <= B) return 0; + return 1; + } + const int64_t A = stack[stack_curr - 3].length; + const int64_t B = stack[stack_curr - 2].length; + const int64_t C = stack[stack_curr - 1].length; + if ((A <= B + C) || (B <= C)) return 0; + return 1; +} + +typedef struct { + size_t alloc; + SORT_TYPE *storage; +} TEMP_STORAGE_T; + + +static inline void tim_sort_resize(TEMP_STORAGE_T *store, const size_t new_size) +{ + if (store->alloc < new_size) + { + SORT_TYPE *tempstore = (SORT_TYPE*)realloc(store->storage, new_size * sizeof(SORT_TYPE)); + if (tempstore == NULL) + { + fprintf(stderr, "Error allocating temporary storage for tim sort: need %lu bytes", sizeof(SORT_TYPE) * new_size); + exit(1); + } + store->storage = tempstore; + store->alloc = new_size; + } +} + +static inline void tim_sort_merge(SORT_TYPE *dst, const TIM_SORT_RUN_T *stack, const int stack_curr, TEMP_STORAGE_T *store) +{ + const int64_t A = stack[stack_curr - 2].length; + const int64_t B = stack[stack_curr - 1].length; + const int64_t curr = stack[stack_curr - 2].start; + + tim_sort_resize(store, MIN(A, B)); + SORT_TYPE *storage = store->storage; + + int64_t i, j, k; + + // left merge + if (A < B) + { + memcpy(storage, &dst[curr], A * 
sizeof(SORT_TYPE)); + i = 0; + j = curr + A; + + for (k = curr; k < curr + A + B; k++) + { + if ((i < A) && (j < curr + A + B)) + { + if (SORT_CMP(storage[i], dst[j]) <= 0) + dst[k] = storage[i++]; + else + dst[k] = dst[j++]; + } + else if (i < A) + { + dst[k] = storage[i++]; + } + else + dst[k] = dst[j++]; + } + } + // right merge + else + { + memcpy(storage, &dst[curr + A], B * sizeof(SORT_TYPE)); + i = B - 1; + j = curr + A - 1; + + for (k = curr + A + B - 1; k >= curr; k--) + { + if ((i >= 0) && (j >= curr)) + { + if (SORT_CMP(dst[j], storage[i]) > 0) + dst[k] = dst[j--]; + else + dst[k] = storage[i--]; + } + else if (i >= 0) + dst[k] = storage[i--]; + else + dst[k] = dst[j--]; + } + } +} + +static inline int tim_sort_collapse(SORT_TYPE *dst, TIM_SORT_RUN_T *stack, int stack_curr, TEMP_STORAGE_T *store, const size_t size) +{ + while (1) + { + // if the stack only has one thing on it, we are done with the collapse + if (stack_curr <= 1) break; + // if this is the last merge, just do it + if ((stack_curr == 2) && (stack[0].length + stack[1].length == size)) + { + tim_sort_merge(dst, stack, stack_curr, store); + stack[0].length += stack[1].length; + stack_curr--; + break; + } + // check if the invariant is off for a stack of 2 elements + else if ((stack_curr == 2) && (stack[0].length <= stack[1].length)) + { + tim_sort_merge(dst, stack, stack_curr, store); + stack[0].length += stack[1].length; + stack_curr--; + break; + } + else if (stack_curr == 2) + break; + + const int64_t A = stack[stack_curr - 3].length; + const int64_t B = stack[stack_curr - 2].length; + const int64_t C = stack[stack_curr - 1].length; + + // check first invariant + if (A <= B + C) + { + if (A < C) + { + tim_sort_merge(dst, stack, stack_curr - 1, store); + stack[stack_curr - 3].length += stack[stack_curr - 2].length; + stack[stack_curr - 2] = stack[stack_curr - 1]; + stack_curr--; + } + else + { + tim_sort_merge(dst, stack, stack_curr, store); + stack[stack_curr - 2].length += 
stack[stack_curr - 1].length; + stack_curr--; + } + } + // check second invariant + else if (B <= C) + { + tim_sort_merge(dst, stack, stack_curr, store); + stack[stack_curr - 2].length += stack[stack_curr - 1].length; + stack_curr--; + } + else + break; + } + return stack_curr; +} + +void TIM_SORT(SORT_TYPE *dst, const size_t size) +{ + if (size < 64) + { + BINARY_INSERTION_SORT(dst, size); + return; + } + + // compute the minimum run length + const int minrun = compute_minrun(size); + + // temporary storage for merges + TEMP_STORAGE_T _store, *store = &_store; + store->alloc = 0; + store->storage = NULL; + + TIM_SORT_RUN_T run_stack[128]; + int stack_curr = 0; + int64_t len, run; + int64_t curr = 0; + + PUSH_NEXT(); + PUSH_NEXT(); + PUSH_NEXT(); + + while (1) + { + if (!check_invariant(run_stack, stack_curr)) + { + stack_curr = tim_sort_collapse(dst, run_stack, stack_curr, store, size); + continue; + } + PUSH_NEXT(); + } +} + + + +/* heap sort: based on wikipedia */ + +static inline void heap_sift_down(SORT_TYPE *dst, const int64_t start, const int64_t end) +{ + int64_t root = start; + + while ((root << 1) <= end) + { + int64_t child = root << 1; + if ((child < end) && (SORT_CMP(dst[child], dst[child + 1]) < 0)) + child++; + if (SORT_CMP(dst[root], dst[child]) < 0) + { + SORT_SWAP(dst[root], dst[child]); + root = child; + } + else + return; + } +} + +static inline void heapify(SORT_TYPE *dst, const size_t size) +{ + int64_t start = size >> 1; + while (start >= 0) + { + heap_sift_down(dst, start, size - 1); + start--; + } +} + +void HEAP_SORT(SORT_TYPE *dst, const size_t size) +{ + heapify(dst, size); + int64_t end = size - 1; + + while (end > 0) + { + SORT_SWAP(dst[end], dst[0]); + heap_sift_down(dst, 0, end - 1); + end--; + } +} + +#undef SORT_CONCAT +#undef SORT_MAKE_STR1 +#undef SORT_MAKE_STR +#undef SORT_NAME + +#undef TEMP_STORAGE_T +#undef TIM_SORT_RUN_T +#undef PUSH_NEXT