Split UTF-8 validation out of StdStrBuf

2011-04-05 20:58:40 +02:00 · 2011-04-05 20:58:40 +02:00 · c4276e84c4
parent 5e2e25cc74
commit c4276e84c4
3 changed files with 91 additions and 48 deletions
--- a/src/lib/Standard.cpp
+++ b/src/lib/Standard.cpp
@ -762,3 +762,90 @@ long InterlockedDecrement(long * var)
 	return --(*var);
 }
 #endif
+
+// UTF-8 conformance checking
+namespace
+{
+	// Lookup table indexed by the value of a byte that starts a sequence.
+	static const int utf8_continuation_byte_table[256] =
+	{
+		// How many continuation bytes must follow a byte with this value?
+		// Negative values mean that this byte can never start a valid
+		// UTF-8 sequence. IsValidUtf8 only evaluates the sign of such
+		// entries, not their magnitude.
+		// Note that while the encoding scheme allows more than three
+		// trailing bytes in principle, it is not actually allowed for UTF-8.
+		// Values 0xC0 and 0xC1 can never occur in UTF-8 because they
+		// would mark the beginning of an overlong encoding of characters
+		// below 0x80.
+		// Values 0xF5 to 0xFD are invalid because they can only be used
+		// to encode characters beyond the Unicode range.
+		 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  // 0b00000000..0b00001111, 0x00..0x0F
+		 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  // 0b00010000..0b00011111, 0x10..0x1F
+		 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  // 0b00100000..0b00101111, 0x20..0x2F
+		 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  // 0b00110000..0b00111111, 0x30..0x3F
+		 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  // 0b01000000..0b01001111, 0x40..0x4F
+		 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  // 0b01010000..0b01011111, 0x50..0x5F
+		 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  // 0b01100000..0b01101111, 0x60..0x6F
+		 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  // 0b01110000..0b01111111, 0x70..0x7F
+		-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  // 0b10000000..0b10001111, 0x80..0x8F
+		-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  // 0b10010000..0b10011111, 0x90..0x9F
+		-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  // 0b10100000..0b10101111, 0xA0..0xAF
+		-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  // 0b10110000..0b10111111, 0xB0..0xBF
+		-1, -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  // 0b11000000..0b11001111, 0xC0..0xCF
+		 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  // 0b11010000..0b11011111, 0xD0..0xDF
+		 2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  // 0b11100000..0b11101111, 0xE0..0xEF
+		 3,  3,  3,  3,  3, -3, -3, -3, -4, -4, -4, -4, -5, -5, -1, -1   // 0b11110000..0b11111111, 0xF0..0xFF
+	};
+	static const uint32_t utf8_min_char_value[4] =
+	{
+		// Which is the lowest character value that may be encoded
+		// using this many continuation bytes?
+		0, 0x80, 0x800, 0x10000
+	};
+}
+
+// Checks a byte string for conformance with UTF-8.
+//
+// text:   start of the data to check.
+// length: number of bytes to check. If negative, text is treated as a
+//         NUL-terminated string and checked up to (not including) the
+//         terminator.
+//
+// Returns true if the checked range consists only of well-formed
+// sequences; returns false at the first invalid start byte, missing or
+// truncated continuation byte, overlong encoding, value above 0x10FFFF
+// or encoded UTF-16 surrogate half.
+bool IsValidUtf8(const char *text, int length)
+{
+	// char and unsigned char are distinct types regardless of whether
+	// plain char is signed, so reinterpret_cast is always the right cast
+	const uint8_t *input = reinterpret_cast<const uint8_t*>(text);
+
+	for (const uint8_t *cursor = input; length < 0 ? *cursor != 0 : cursor - input < length; ++cursor)
+	{
+		const int continuation_bytes = utf8_continuation_byte_table[*cursor];
+		if (continuation_bytes < 0)
+			return false;
+		if (continuation_bytes == 0)
+		{
+			// Standard 7-bit ASCII value (i.e., 1 byte codepoint)
+			continue;
+		}
+
+		// With an explicit length, the whole sequence has to lie inside
+		// the buffer; never look at bytes beyond it. (For NUL-terminated
+		// input the terminator fails the continuation byte test below
+		// before anything past it is read.)
+		if (length >= 0 && length - (cursor - input) <= continuation_bytes)
+			return false;
+
+		// Compute character value, so we can detect overlong sequences
+		assert((*cursor & 0xC0) == 0xC0);
+		// Strip the length marker off the start byte: a start byte with n
+		// continuation bytes has n+1 leading one bits followed by a zero
+		// bit, leaving 8-(n+2) payload bits.
+		uint32_t value = *cursor & (0xFFu >> (continuation_bytes + 2));
+		for (int byte = 1; byte <= continuation_bytes; ++byte)
+		{
+			// check that this is actually a continuation byte
+			if ((cursor[byte] & 0xC0) != 0x80)
+				return false;
+			// merge continuation byte into value
+			value = (value << 6) | (cursor[byte] & 0x3F);
+		}
+		// make sure this is not overlong
+		if (value < utf8_min_char_value[continuation_bytes])
+			return false;
+		// and also not beyond 0x10FFFF
+		if (value > 0x10FFFF)
+			return false;
+		// and also not a wrongly encoded UTF-16 surrogate half
+		if (value >= 0xD800 && value <= 0xDFFF)
+			return false;
+		// skip the continuation bytes; the loop increment skips the start byte
+		cursor += continuation_bytes;
+	}
+	// Looks fine
+	return true;
+}
--- a/src/lib/Standard.h
+++ b/src/lib/Standard.h
@ -171,4 +171,7 @@ inline int ssprintf(T &str, const char *fmt, ...)
 	return m;
 }

+// Checks a string for conformance with UTF-8.
+// If length is negative (the default), string is read up to its
+// terminating NUL byte; otherwise length bytes starting at string are
+// checked. Returns false as soon as an invalid sequence is found and
+// true if the end of the checked range is reached without one.
+bool IsValidUtf8(const char *string, int length = -1);
+
 #endif // INC_STANDARD
--- a/src/lib/StdBuf.cpp
+++ b/src/lib/StdBuf.cpp
@ -376,55 +376,8 @@ void StdStrBuf::ToLowerCase()

 void StdStrBuf::EnsureUnicode()
 {
-	bool valid = true;
-	int need_continuation_bytes = 0;
-	// Check wether valid UTF-8
-	for (size_t i = 0; i < getSize(); ++i)
-	{
-		unsigned char c = *getPtr(i);
-		// remaining of a code point started before
-		if (need_continuation_bytes)
-		{
-			--need_continuation_bytes;
-			// (10000000-10111111)
-			if (0x80 <= c && c <= 0xBF)
-				continue;
-			else
-			{
-				valid = false;
-				break;
-			}
-		}
-		// ASCII
-		if (c < 0x80)
-			continue;
-		// Two byte sequence (11000010-11011111)
-		// Note: 1100000x is an invalid overlong sequence
-		if (0xC2 <= c && c <= 0xDF)
-		{
-			need_continuation_bytes = 1;
-			continue;
-		}
-		// Three byte sequence (11100000-11101111)
-		if (0xE0 <= c && c <= 0xEF)
-		{
-			need_continuation_bytes = 2;
-			continue;
-			// FIXME: could check for UTF-16 surrogates from a broken utf-16->utf-8 converter here
-		}
-		// Four byte sequence (11110000-11110100)
-		if (0xF0 <= c && c <= 0xF4)
-		{
-			need_continuation_bytes = 3;
-			continue;
-		}
-		valid = false;
-		break;
-	}
-	if (need_continuation_bytes)
-		valid = false;
 	// assume that it's windows-1252 and convert to utf-8
-	if (!valid)
+	if (!IsValidUtf8(getData(), getLength()))
 	{
 		size_t j = 0;
 		StdStrBuf buf;