diff --git a/src/lib/StdMesh.cpp b/src/lib/StdMesh.cpp
index 84313cbf6..95e70603a 100644
--- a/src/lib/StdMesh.cpp
+++ b/src/lib/StdMesh.cpp
@@ -22,10 +22,24 @@
 #include <StdMesh.h>
 #include <algorithm>
 
-static int StdMeshFaceCmp(const StdMeshFace& face1, const StdMeshFace& face2);
+namespace
+{
+	struct StdMeshFaceOrderHelper
+	{
+		float z;
+		unsigned int i;
+	};
+}
+
+static int StdMeshFaceCmp(const StdMeshFaceOrderHelper& h1, const StdMeshFaceOrderHelper& h2)
+{
+	if(h1.z < h2.z) return -1;
+	else if(h1.z > h2.z) return +1;
+	return 0;
+}
 
 #define SORT_NAME StdMesh
-#define SORT_TYPE StdMeshFace
+#define SORT_TYPE StdMeshFaceOrderHelper
 #define SORT_CMP StdMeshFaceCmp
 #include "timsort/sort.h"
 
@@ -42,71 +56,80 @@ namespace
 		}
 	};
 
-	// Helper to sort faces for FaceOrdering
-	struct StdMeshInstanceFaceOrderingCmpPred
+	float StdMeshFaceOrderGetVertexZ(const StdMeshVertex& vtx, const StdMeshMatrix& trans)
 	{
-		const StdMeshVertex* m_vertices;
-		StdSubMeshInstance::FaceOrdering m_face_ordering;
-		const StdMeshMatrix& m_global_trans;
+		// TODO: Need to apply attach matrix in case of attached meshes
 
-		StdMeshInstanceFaceOrderingCmpPred(const StdMeshInstance& mesh_inst, const StdSubMeshInstance& sub_inst,
-		                                   StdSubMeshInstance::FaceOrdering face_ordering, const StdMeshMatrix& global_trans):
-				m_face_ordering(face_ordering), m_global_trans(global_trans)
+		// We need to evaluate the Z coordinate of the transformed vertex
+		// (for all three vertices of the two faces), something like
+		// float z11 = (trans*m_vertices[face1.Vertices[0]]).z;
+		// However we don't do the full matrix multiplication as we are
+		// only interested in the Z coordinate of the result, also we are
+		// not interested in the resulting normals.
+		return trans(2,0)*vtx.x + trans(2,1)*vtx.y + trans(2,2)*vtx.z + trans(2,3);
+	}
+
+	float StdMeshFaceOrderGetFaceZ(const StdMeshVertex* vertices, const StdMeshFace& face, const StdMeshMatrix& trans)
+	{
+		const float z1 = StdMeshFaceOrderGetVertexZ(vertices[face.Vertices[0]], trans);
+		const float z2 = StdMeshFaceOrderGetVertexZ(vertices[face.Vertices[1]], trans);
+		const float z3 = StdMeshFaceOrderGetVertexZ(vertices[face.Vertices[2]], trans);
+		return std::max(std::max(z1, z2), z3);
+	}
+
+	void SortFacesArray(const StdMeshVertex* vertices, std::vector<StdMeshFace>& faces, StdSubMeshInstance::FaceOrdering face_ordering, const StdMeshMatrix& trans)
+	{
+		if(faces.empty()) return;
+
+		std::vector<StdMeshFaceOrderHelper> helpers(faces.size());
+		for(unsigned int i = 0; i < faces.size(); ++i)
 		{
-			if(sub_inst.GetNumVertices() > 0)
-				m_vertices = &sub_inst.GetVertices()[0];
-			else
-				m_vertices = &mesh_inst.GetSharedVertices()[0];
+			helpers[i].i = i;
+			helpers[i].z = StdMeshFaceOrderGetFaceZ(vertices, faces[i], trans);
 		}
 
-		inline float get_z(const StdMeshVertex& vtx) const
+		// The reason to use timsort here instead of std::sort is for performance
+		// reasons. This is performance critical code, with this function being
+		// called at least once per frame for each semi-transparent object. I have
+		// measured a factor 7 difference between the two sorting algorithms on my
+		// system.
+
+		// We also pre-compute the Z values that we use for sorting, and sort the
+		// array of Z values, then use the resorted indices to sort the original
+		// faces array. The reason for this is twofold:
+		// 1. We don't need to compute the Z value every time the comparison function
+		//    is called. Even though the computation is not very expensive, we have
+		//    to do many comparisons, and small things add up. I have measured a
+		//    5-10% performance benefit.
+		// 2. More importantly, due to floating point rounding errors we cannot guarantee
+		//    that Z values computed in the sorting function always yield the exact same
+		//    number, and the same sorting result for the same faces. This can lead to
+		//    a crash, because the f(a1, a2) = -f(a2, a1) property for the sorting function
+		//    would no longer be met, resulting in undefined behaviour in the sort call.
+		//    See http://bugs.openclonk.org/view.php?id=984.
+		StdMesh_tim_sort(&helpers[0], helpers.size());
+
+		std::vector<StdMeshFace> new_faces(faces.size());
+		switch(face_ordering)
 		{
-			// We need to evaluate the Z coordinate of the transformed vertex
-			// (for all three vertices of the two faces), something like
-			// float z11 = (m_global_trans*m_vertices[face1.Vertices[0]]).z;
-			// However we don't do the full matrix multiplication as we are
-			// only interested in the Z coordinate of the result, also we are
-			// not interested in the resulting normals.
-			return m_global_trans(2,0)*vtx.x + m_global_trans(2,1)*vtx.y + m_global_trans(2,2)*vtx.z + m_global_trans(2,3);
+		case StdSubMeshInstance::FO_Fixed:
+			assert(false);
+			break;
+		case StdSubMeshInstance::FO_FarthestToNearest:
+			for(unsigned int i = 0; i < faces.size(); ++i)
+				new_faces[i] = faces[helpers[i].i];
+			break;
+		case StdSubMeshInstance::FO_NearestToFarthest:
+			for(unsigned int i = 0; i < faces.size(); ++i)
+				new_faces[i] = faces[helpers[faces.size() - i - 1].i];
+			break;
+		default:
+			assert(false);
+			break;
 		}
 
-		bool operator()(const StdMeshFace& face1, const StdMeshFace& face2) const
-		{
-			return compare(face1, face2) < 0;
-		}
-
-		int compare(const StdMeshFace& face1, const StdMeshFace& face2) const
-		{
-			// TODO: Need to apply attach matrix in case of attached meshes
-			switch (m_face_ordering)
-			{
-			case StdSubMeshInstance::FO_Fixed:
-				assert(false);
-				return 0;
-			case StdSubMeshInstance::FO_FarthestToNearest:
-			case StdSubMeshInstance::FO_NearestToFarthest:
-			{
-				float z11 = get_z(m_vertices[face1.Vertices[0]]);
-				float z12 = get_z(m_vertices[face1.Vertices[1]]);
-				float z13 = get_z(m_vertices[face1.Vertices[2]]);
-				float z21 = get_z(m_vertices[face2.Vertices[0]]);
-				float z22 = get_z(m_vertices[face2.Vertices[1]]);
-				float z23 = get_z(m_vertices[face2.Vertices[2]]);
-
-				float z1 = std::max(std::max(z11, z12), z13);
-				float z2 = std::max(std::max(z21, z22), z23);
-
-				if (m_face_ordering == StdSubMeshInstance::FO_FarthestToNearest)
-					return (z1 < z2 ? -1 : (z1 > z2 ? +1 : 0));
-				else
-					return (z2 < z1 ? -1 : (z2 > z1 ? +1 : 0));
-			}
-			default:
-				assert(false);
-				return 0;
-			}
-		}
-	};
+		faces.swap(new_faces);
+	}
 
 	// Serialize a ValueProvider with StdCompiler
 	struct ValueProviderAdapt
@@ -231,13 +254,6 @@ namespace
 
 		return true;
 	}
-
-	StdMeshInstanceFaceOrderingCmpPred* g_pred = NULL;
-}
-
-static int StdMeshFaceCmp(const StdMeshFace& face1, const StdMeshFace& face2)
-{
-	return g_pred->compare(face1, face2);
 }
 
 StdMeshTransformation StdMeshTrack::GetTransformAt(float time) const
@@ -447,10 +463,12 @@ void StdSubMeshInstance::LoadFacesForCompletion(StdMeshInstance& instance, const
 		// however we can simply give an appropriate transformation matrix to the face ordering.
 		// At this point, all vertices are in the OGRE coordinate frame, and Z in OGRE equals
 		// Y in Clonk, so we are fine without additional transformation.
-		StdMeshInstanceFaceOrderingCmpPred pred(instance, *this, FO_FarthestToNearest, StdMeshMatrix::Identity());
-		g_pred = &pred;
-		StdMesh_tim_sort(&Faces[0], Faces.size());
-		g_pred = NULL;
+		const StdMeshVertex* vertices;
+		if(GetNumVertices() > 0)
+			vertices = &GetVertices()[0];
+		else
+			vertices = &instance.GetSharedVertices()[0];
+		SortFacesArray(vertices, Faces, FO_FarthestToNearest, StdMeshMatrix::Identity());
 
 		// Third: Only use the first few ones
 		assert(submesh.GetNumFaces() >= 1);
@@ -1181,24 +1199,12 @@ void StdMeshInstance::ReorderFaces(StdMeshMatrix* global_trans)
 		StdSubMeshInstance& inst = *SubMeshInstances[i];
 		if(inst.CurrentFaceOrdering != StdSubMeshInstance::FO_Fixed)
 		{
-			StdMeshInstanceFaceOrderingCmpPred pred(*this, inst, inst.CurrentFaceOrdering, global_trans ? *global_trans : StdMeshMatrix::Identity());
-
-			// The usage of timsort instead of std::sort at this point is twofold.
-			// First, it's faster in our case where the array is already sorted in
-			// many cases (remember this is called at least once a frame).
-			// And it's not just a bit faster either but a lot. I have measured
-			// a factor of 7 on my system.
-			// Second, in our Windows autobuilds there is a crash within std::sort
-			// which is very hard to debug because it's hardly reproducible with
-			// anything other than the autobuilds (I tried hard). If the crash goes
-			// away with timsort then great, if not then maybe it's easier to debug
-			// since the code is in our tree.
-
-			//std::sort(inst.Faces.begin(), inst.Faces.end(), pred);
-
-			g_pred = &pred;
-			StdMesh_tim_sort(&inst.Faces[0], inst.Faces.size());
-			g_pred = NULL;
+			const StdMeshVertex* vertices;
+			if(inst.GetNumVertices() > 0)
+				vertices = &inst.GetVertices()[0];
+			else
+				vertices = &GetSharedVertices()[0];
+			SortFacesArray(vertices, inst.Faces, inst.CurrentFaceOrdering, global_trans ? *global_trans : StdMeshMatrix::Identity());
 		}
 	}