forked from Mirrors/openclonk
232 lines
8.4 KiB
C++
232 lines
8.4 KiB
C++
/*
|
|
* OpenClonk, http://www.openclonk.org
|
|
*
|
|
* Copyright (c) 2011 Nicolas Hake
|
|
*
|
|
* Permission to use, copy, modify, and/or distribute this software for any
|
|
* purpose with or without fee is hereby granted, provided that the above
|
|
* copyright notice and this permission notice appear in all copies.
|
|
* See isc_license.txt for full license and disclaimer.
|
|
*
|
|
* "Clonk" is a registered trademark of Matthes Bender.
|
|
* See clonk_trademark_license.txt for full license.
|
|
*/
|
|
|
|
/* Verify correct behavior of UTF-8 handling code. */
|
|
|
|
#include "lib/Standard.h"
|
|
#include "lib/Standard.cpp"
|
|
#include <gtest/gtest.h>
|
|
|
|
TEST(UnicodeHandlingTest, AcceptsEmptyString)
|
|
{
|
|
// Check acceptance of empty strings.
|
|
// Part 1: Automatic length detection
|
|
EXPECT_TRUE(::IsValidUtf8(""));
|
|
// Part 2: Automatic length detection with trailing garbage
|
|
EXPECT_TRUE(::IsValidUtf8("\0\xFF\xFF\xFF\xFF"));
|
|
// Part 3: Manual length override with trailing garbage
|
|
EXPECT_TRUE(::IsValidUtf8("\xFF\xFF\xFF\xFF", 0));
|
|
}
|
|
|
|
TEST(UnicodeHandlingTest, AcceptsValidSingleByteUtf8)
|
|
{
|
|
// Check acceptance of valid UTF-8 single-byte sequences.
|
|
// This test is exhaustive over U+0000..U+007F.
|
|
// Part 1: Automatic length detection
|
|
// Test gc=Lu: General category: Letter, uppercase
|
|
EXPECT_TRUE(::IsValidUtf8("ABCDEFGHIJKLMNOPQRSTUVWXYZ"));
|
|
// Test gc=Ll: General category: Letter, lowercase
|
|
EXPECT_TRUE(::IsValidUtf8("abcdefghijklmnopqrstuvwxyz"));
|
|
// Test gc=Nd: General category: Number, decimal digit
|
|
EXPECT_TRUE(::IsValidUtf8("0123456789"));
|
|
// Test gc=Zs: General category: Separator, space
|
|
EXPECT_TRUE(::IsValidUtf8(" "));
|
|
// Test gc=Po: General category: Punctuation, other
|
|
EXPECT_TRUE(::IsValidUtf8(
|
|
"!"
|
|
"\x22" // U+0022 QUOTATION MARK
|
|
"#%&'*,./:;?@"
|
|
"\x5C" // U+005C REVERSE SOLIDUS (aka BACKSLASH)
|
|
));
|
|
// Test gc=Sc: General category: Symbol, currency
|
|
EXPECT_TRUE(::IsValidUtf8("$"));
|
|
// Test gc=Ps: General category: Punctuation, open
|
|
EXPECT_TRUE(::IsValidUtf8("([{"));
|
|
// Test gc=Pe: General category: Punctuation, close
|
|
EXPECT_TRUE(::IsValidUtf8(")]}"));
|
|
// Test gc=Sm: General category: Symbol, math
|
|
EXPECT_TRUE(::IsValidUtf8("+<=>|~"));
|
|
// Test gc=Pd: General category: Punctuation, dash
|
|
EXPECT_TRUE(::IsValidUtf8("-"));
|
|
// Test gc=Sk: General category: Symbol, modifier
|
|
EXPECT_TRUE(::IsValidUtf8("^"));
|
|
// Test gc=Cc: General category: Other, control
|
|
// NB: This omits U+0000 NULL due to it being the C string terminator
|
|
EXPECT_TRUE(::IsValidUtf8(
|
|
"\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0A\x0B\x0C\x0D\x0E\x0F"
|
|
"\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F"
|
|
"\x7F"));
|
|
|
|
// Part 2: Interspersed U+0000 NULL characters
|
|
EXPECT_TRUE(::IsValidUtf8("A\0BC\0DEF\0GHIJ\0KLMNO", 20));
|
|
|
|
// Part 3: Valid UTF-8 with trailing garbage, manual length override
|
|
EXPECT_TRUE(::IsValidUtf8("AAAA\x80\xF0\xFF", 4));
|
|
}
|
|
|
|
TEST(UnicodeHandlingTest, RejectsInvalidSingleByteUtf8)
|
|
{
|
|
// Check rejection of invalid UTF-8 single-byte sequences
|
|
// Part 1: Range 0x80..0xBF (orphaned continuation bytes)
|
|
for (int i = 0x80; i <= 0xBF; ++i)
|
|
{
|
|
char buffer[] = { static_cast<char>(i), 0 };
|
|
EXPECT_FALSE(::IsValidUtf8(buffer));
|
|
}
|
|
// Part 2: Range 0xC0..0xF4 (orphaned start bytes)
|
|
for (int i = 0xC0; i <= 0xFF; ++i)
|
|
{
|
|
char buffer[] = { static_cast<char>(i), 0 };
|
|
EXPECT_FALSE(::IsValidUtf8(buffer));
|
|
}
|
|
// Part 3: Range 0xF5..0xFF (invalid bytes)
|
|
for (int i = 0xF5; i <= 0xFF; ++i)
|
|
{
|
|
char buffer[] = { static_cast<char>(i), 0 };
|
|
EXPECT_FALSE(::IsValidUtf8(buffer));
|
|
}
|
|
}
|
|
|
|
TEST(UnicodeHandlingTest, AcceptsValidMultiByteUtf8)
|
|
{
|
|
// Check acceptance of valid UTF-8 multi-byte sequences.
|
|
// Part 1: Generate all valid two-byte sequences
|
|
for (int i = 0x80; i < 0x800; ++i)
|
|
{
|
|
char buffer[] =
|
|
{
|
|
static_cast<char>(0xC0 | (i >> 6)),
|
|
static_cast<char>(0x80 | (i & 0x3F)),
|
|
0
|
|
};
|
|
EXPECT_TRUE(::IsValidUtf8(buffer)) << "Valid UTF-8 character not recognized:" << std::hex
|
|
<< " 0x" << (uint32_t)(uint8_t)buffer[0] << " 0x" << (uint32_t)(uint8_t)buffer[1]
|
|
<< " (0x" << i << ")";
|
|
}
|
|
// Part 2: Generate all valid three-byte sequences
|
|
for (int i = 0x800; i < 0x10000; ++i)
|
|
{
|
|
if (i == 0xD800) i = 0xE000; // Skip invalid surrogate halves
|
|
char buffer[] =
|
|
{
|
|
static_cast<char>(0xE0 | (i >> 12)),
|
|
static_cast<char>(0x80 | ((i >> 6) & 0x3F)),
|
|
static_cast<char>(0x80 | (i & 0x3F)),
|
|
0
|
|
};
|
|
EXPECT_TRUE(::IsValidUtf8(buffer)) << "Valid UTF-8 character not recognized:" << std::hex
|
|
<< " 0x" << (uint32_t)(uint8_t)buffer[0] << " 0x" << (uint32_t)(uint8_t)buffer[1]
|
|
<< " 0x" << (uint32_t)(uint8_t)buffer[2]
|
|
<< " (0x" << i << ")";
|
|
}
|
|
// Part 3: Generate all valid four-byte sequences
|
|
for (int i = 0x10000; i < 0x10FFFF; ++i)
|
|
{
|
|
char buffer[] =
|
|
{
|
|
static_cast<char>(0xF0 | (i >> 18)),
|
|
static_cast<char>(0x80 | ((i >> 12) & 0x3F)),
|
|
static_cast<char>(0x80 | ((i >> 6) & 0x3F)),
|
|
static_cast<char>(0x80 | (i & 0x3F)),
|
|
0
|
|
};
|
|
EXPECT_TRUE(::IsValidUtf8(buffer)) << "Valid UTF-8 character not recognized:" << std::hex
|
|
<< " 0x" << (uint32_t)(uint8_t)buffer[0] << " 0x" << (uint32_t)(uint8_t)buffer[1]
|
|
<< " 0x" << (uint32_t)(uint8_t)buffer[2] << " 0x" << (uint32_t)(uint8_t)buffer[3]
|
|
<< " (0x" << i << ")";
|
|
}
|
|
}
|
|
|
|
TEST(UnicodeHandlingTest, RejectsInvalidMultiByteUtf8)
|
|
{
|
|
// Check rejection of invalid UTF-8 multi-byte sequences.
|
|
// Part 1: Overlong sequences
|
|
// 1.1: U+0000 NULL encoding
|
|
EXPECT_FALSE(::IsValidUtf8("\xC0\x80")); // Two-byte representation of U+0000 NULL
|
|
EXPECT_FALSE(::IsValidUtf8("\xE0\x80\x80")); // Three-byte representation of U+0000 NULL
|
|
EXPECT_FALSE(::IsValidUtf8("\xF0\x80\x80\x80")); // Four-byte representation of U+0000 NULL
|
|
EXPECT_FALSE(::IsValidUtf8("\xF8\x80\x80\x80\x80")); // Five-byte representation of U+0000 NULL
|
|
EXPECT_FALSE(::IsValidUtf8("\xFC\x80\x80\x80\x80\x80")); // Six-byte representation of U+0000 NULL
|
|
// 1.2: U+0080 <control> encoding
|
|
EXPECT_FALSE(::IsValidUtf8("\xE0\x82\x80")); // Three-byte representation of U+0080 <control>
|
|
EXPECT_FALSE(::IsValidUtf8("\xF0\x80\x82\x80")); // Four-byte representation of U+0080 <control>
|
|
// 1.3: U+0800 SAMARITAN LETTER ALAF encoding
|
|
EXPECT_FALSE(::IsValidUtf8("\xF0\x80\xA0\x80")); // four-byte representation of U+0800 SAMARITAN LETTER ALAF
|
|
// Part 2: Incorrectly encoded surrogate halves
|
|
for (int i = 0xD800; i <= 0xDFFF; ++i)
|
|
{
|
|
char buffer[] =
|
|
{
|
|
static_cast<char>(0xE0 | (i >> 12)),
|
|
static_cast<char>(0x80 | ((i >> 6) & 0x3F)),
|
|
static_cast<char>(0x80 | (i & 0x3F)),
|
|
0
|
|
};
|
|
EXPECT_FALSE(::IsValidUtf8(buffer)) << "Invalid surrogate half not recognized: " << std::hex
|
|
<< " 0x" << (uint32_t)(uint8_t)buffer[0] << " 0x" << (uint32_t)(uint8_t)buffer[1]
|
|
<< " 0x" << (uint32_t)(uint8_t)buffer[2]
|
|
<< " (0x" << i << ")";
|
|
}
|
|
// Part 3: Sequences encoding codepoints beyond the unicode range
|
|
EXPECT_FALSE(::IsValidUtf8("\xF4\x90\x80\x80")); // Representation of invalid codepoint U+110000
|
|
// Part 4: Incomplete multibyte sequences
|
|
EXPECT_FALSE(::IsValidUtf8("\xC3\xA6", 1)); // U+00E6 LATIN SMALL LETTER AE
|
|
EXPECT_FALSE(::IsValidUtf8("\xE2\x84\x95", 2)); // U+2115 DOUBLE-STRUCK CAPITAL N
|
|
EXPECT_FALSE(::IsValidUtf8("\xE2\x84"));
|
|
EXPECT_FALSE(::IsValidUtf8("\xF0\x9F\x94\x87", 3)); // U+1F507 SPEAKER WITH CANCELLATION STROKE
|
|
}
|
|
|
|
#include "lib/StdBuf.h"
|
|
#include "lib/StdBuf.cpp"
|
|
size_t FileSize(int) { return 0; }
|
|
|
|
#ifdef _WIN32
|
|
TEST(UnicodeHandlingTest, WideStringConversion)
|
|
{
|
|
wchar_t *wide_strings[] = {
|
|
L"\xD835\xDD07\xD835\xDD22\xD835\xDD2F",
|
|
L"\xD835\xDD0E\xD835\xDD29\xD835\xDD1E\xD835\xDD32\xD835\xDD30",
|
|
NULL
|
|
};
|
|
for (wchar_t **wide_string = wide_strings; *wide_string; ++wide_string)
|
|
{
|
|
StdStrBuf wide_string_buf(*wide_string);
|
|
EXPECT_STREQ(*wide_string, wide_string_buf.GetWideChar()) << "Conversion wchar_t*=>StdStrBuf=>wchar_t* isn't lossless";
|
|
}
|
|
}
|
|
#endif
|
|
|
|
#ifdef _WIN32
|
|
#include "platform/StdRegistry.h"
|
|
#include "platform/StdRegistry.cpp"
|
|
char StdCompiler::SeparatorToChar(enum StdCompiler::Sep) { return ' '; }
|
|
TEST(UnicodeHandlingTest, RegistryAccess)
|
|
{
|
|
wchar_t *wide_strings[] = {
|
|
L"\xD835\xDD07\xD835\xDD22\xD835\xDD2F",
|
|
L"\xD835\xDD0E\xD835\xDD29\xD835\xDD1E\xD835\xDD32\xD835\xDD30",
|
|
NULL
|
|
};
|
|
|
|
const char *key = "SOFTWARE\\OpenClonk Project\\OpenClonk\\Testing";
|
|
for (wchar_t **wide_string = wide_strings; *wide_string; ++wide_string)
|
|
{
|
|
ASSERT_TRUE(SetRegistryString(key, "WideCharTest", StdStrBuf(*wide_string).getData()));
|
|
StdCopyStrBuf buffer;
|
|
ASSERT_TRUE(!(buffer = GetRegistryString(key, "WideCharTest")).isNull());
|
|
EXPECT_STREQ(*wide_string, StdStrBuf(buffer).GetWideChar()) << "Registry read-back returned wrong value";
|
|
}
|
|
}
|
|
#endif
|