Script: Use Unicode instead of Byte strings

Internally, strings are UTF-8 as before, but GetChar returns a Unicode code point instead of a byte from the UTF-8 encoded string, and Format("%c") takes a Unicode code point as well.
2011-03-10 00:26:31 +01:00 · 2011-03-10 00:26:31 +01:00 · 21e28a4689
parent e537296829
commit 21e28a4689
7 changed files with 126 additions and 54 deletions
--- a/src/lib/Standard.cpp
+++ b/src/lib/Standard.cpp
@ -849,3 +849,70 @@ bool IsValidUtf8(const char *text, int length)
 	// Looks fine
 	return true;
 }
+
+// UTF-8 iteration
+uint32_t GetNextUTF8Character(const char **pszString)
+{
+	// Decode the multi-byte UTF-8 sequence at *pszString and advance *pszString past it.
+	// Returns the code point, or '?' if the sequence is malformed. Overlong encodings,
+	// surrogates and values above 0x10FFFF are decoded as they come, not rejected.
+	// assume the current character is UTF8 already (i.e., highest bit set);
+	// GetNextCharacter handles plain ASCII bytes itself
+	const char *szString = *pszString;
+	unsigned char c = *szString++;
+	assert(c>127);
+	uint32_t dwResult = '?';
+	int iTrail = 0; // number of continuation bytes announced by the lead byte
+	if (c>191 && c<224) { iTrail = 1; dwResult = c&31; } // two char code
+	else if (c >= 224 && c <= 239) { iTrail = 2; dwResult = c&15; } // three char code
+	else if (c >= 240 && c <= 247) { iTrail = 3; dwResult = c&7; } // four char code
+	// Any other lead byte is invalid: iTrail stays 0, so only that single byte
+	// is skipped and '?' is returned.
+	for (; iTrail > 0; --iTrail)
+	{
+		unsigned char cNext = *szString;
+		if ((cNext & 192) != 128)
+		{
+			// Malformed sequence. Skip the offending byte as well, unless it is the
+			// terminating NUL: stepping over that would let callers read past the string.
+			if (cNext) ++szString;
+			*pszString = szString;
+			return '?';
+		}
+		++szString;
+		dwResult = (dwResult << 6) | (cNext & 63);
+	}
+	*pszString = szString;
+	return dwResult;
+}
+
+int GetCharacterCount(const char * s)
+{
+	// Count the characters (not bytes) of the NUL-terminated UTF-8 string s.
+	// Malformed input neither trips an assertion nor runs past the terminator:
+	// a stray continuation byte or invalid lead byte counts as one character,
+	// and a broken multi-byte sequence counts as one character that includes
+	// the offending byte (GetNextUTF8Character skips that byte the same way).
+	int l = 0;
+	while (*s)
+	{
+		// every byte reached here starts exactly one character
+		unsigned char c = *s++;
+		++l;
+		// number of continuation bytes announced by the lead byte
+		int iTrail = 0;
+		if (c > 191 && c < 224) iTrail = 1;
+		else if (c >= 224 && c <= 239) iTrail = 2;
+		else if (c >= 240 && c <= 247) iTrail = 3;
+		for (; iTrail > 0; --iTrail)
+		{
+			unsigned char cNext = *s;
+			// truncated by the end of the string: leave the terminator to the outer loop
+			if (!cNext) break;
+			++s;
+			// not a continuation byte: it still belongs to this malformed character
+			if ((cNext & 192) != 128) break;
+		}
+	}
+	return l;
+}
--- a/src/lib/Standard.h
+++ b/src/lib/Standard.h
@ -76,6 +76,17 @@ inline size_t SLenUntil(const char *szStr, char cUntil)
 	return end ? end-szStr : std::strlen(szStr);
 }

+// get a character at the current string pos and advance pos by that character
+uint32_t GetNextUTF8Character(const char **pszString); // GetNextCharacter helper
+inline uint32_t GetNextCharacter(const char **pszString)
+{
+	// bytes with the high bit set start a multi-byte sequence, which the helper decodes
+	if (static_cast<unsigned char>(**pszString) >= 128) return GetNextUTF8Character(pszString);
+	// plain ASCII (including the terminating NUL): return the byte and step over it
+	return static_cast<unsigned char>(*(*pszString)++);
+}
+// Get string length in characters (not bytes)
+int GetCharacterCount(const char * s);
+
 inline bool SEqual(const char *szStr1, const char *szStr2) { return szStr1&&szStr2?!std::strcmp(szStr1,szStr2):false; }
 bool SEqual2(const char *szStr1, const char *szStr2);
 bool SEqualUntil(const char *szStr1, const char *szStr2, char cWild);
--- a/src/lib/StdBuf.cpp
+++ b/src/lib/StdBuf.cpp
@ -374,6 +374,34 @@ void StdStrBuf::ToLowerCase()
 			*szPos = tolower(*szPos);
 }

+void StdStrBuf::AppendCharacter(uint32_t unicodechar)
+{
+	// Append the UTF-8 encoding of the code point unicodechar to this string.
+	// Values of 0x110000 and above are not Unicode code points and are ignored.
+	if (unicodechar < 0x80)
+	{
+		// ASCII is encoded as a single unchanged byte
+		AppendChar(unicodechar);
+		return;
+	}
+	if (unicodechar >= 0x110000)
+		return;
+	// Length of the encoded sequence: up to 11 payload bits fit into two bytes,
+	// up to 16 bits into three, everything else up to 0x10FFFF needs four.
+	const size_t iBytes = (unicodechar < 0x800) ? 2 : (unicodechar < 0x10000) ? 3 : 4;
+	// lead byte markers (110xxxxx, 1110xxxx, 11110xxx), indexed by sequence length
+	static const unsigned char LeadMarker[] = { 0, 0, 0xC0, 0xE0, 0xF0 };
+	// NOTE(review): Grow() presumably extends the string by iBytes, so the new
+	// bytes are the last iBytes before getLength() - confirm against StdBuf.h
+	Grow(iBytes);
+	// the lead byte carries the marker plus the topmost payload bits
+	*getMPtr(getLength() - iBytes) = (LeadMarker[iBytes] | (unicodechar >> (6 * (iBytes - 1))));
+	// every following byte is 10xxxxxx holding the next six payload bits,
+	// written front to back; iLeft is the distance from the end of the string
+	for (size_t iLeft = iBytes - 1; iLeft > 0; --iLeft)
+		*getMPtr(getLength() - iLeft) = (0x80 | ((unicodechar >> (6 * (iLeft - 1))) & 0x3F));
+}
+
 void StdStrBuf::EnsureUnicode()
 {
 	// assume that it's windows-1252 and convert to utf-8
--- a/src/lib/StdBuf.h
+++ b/src/lib/StdBuf.h
@ -614,6 +614,7 @@ public:
 	{
 		AppendChars(cChar, 1);
 	}
+	void AppendCharacter(uint32_t unicodechar);
 	void AppendBackslash();
 	void InsertChar(char cChar, size_t insert_before)
 	{
--- a/src/platform/StdFont.cpp
+++ b/src/platform/StdFont.cpp
@ -298,41 +298,6 @@ bool CStdFont::AddRenderedChar(uint32_t dwChar, CFacet *pfctTarget)
 	return true;
 }

-uint32_t CStdFont::GetNextUTF8Character(const char **pszString)
-{
-	// assume the current character is UTF8 already (i.e., highest bit set)
-	const char *szString = *pszString;
-	unsigned char c = *szString++;
-	uint32_t dwResult = '?';
-	assert(c>127);
-	if (c>191 && c<224)
-	{
-		unsigned char c2 = *szString++;
-		if ((c2 & 192) != 128) { *pszString = szString; return '?'; }
-		dwResult = (int(c&31)<<6) | (c2&63); // two char code
-	}
-	else if (c >= 224 && c <= 239)
-	{
-		unsigned char c2 = *szString++;
-		if ((c2 & 192) != 128) { *pszString = szString; return '?'; }
-		unsigned char c3 = *szString++;
-		if ((c3 & 192) != 128) { *pszString = szString; return '?'; }
-		dwResult = (int(c&15)<<12) | (int(c2&63)<<6) | int(c3&63); // three char code
-	}
-	else if (c >= 240 && c <= 247)
-	{
-		unsigned char c2 = *szString++;
-		if ((c2 & 192) != 128) { *pszString = szString; return '?'; }
-		unsigned char c3 = *szString++;
-		if ((c3 & 192) != 128) { *pszString = szString; return '?'; }
-		unsigned char c4 = *szString++;
-		if ((c4 & 192) != 128) { *pszString = szString; return '?'; }
-		dwResult = (int(c&7)<<18) | (int(c2&63)<<12) | (int(c3&63)<<6) | int(c4&63); // four char code
-	}
-	*pszString = szString;
-	return dwResult;
-}
-
 CFacet &CStdFont::GetUnicodeCharacterFacet(uint32_t c)
 {
 	// find/add facet in map
--- a/src/platform/StdFont.h
+++ b/src/platform/StdFont.h
@ -100,14 +100,6 @@ protected:
 	bool CheckRenderedCharSpace(uint32_t iCharWdt, uint32_t iCharHgt);
 	bool AddRenderedChar(uint32_t dwChar, CFacet *pfctTarget);

-	// get a character at the current string pos and advance pos by that character
-	inline uint32_t GetNextCharacter(const char **pszString)
-	{
-		unsigned char c=**pszString;
-		if (c<128) { ++*pszString; return c; }
-		else return GetNextUTF8Character(pszString);
-	}
-	uint32_t GetNextUTF8Character(const char **pszString);
 	CFacet &GetCharacterFacet(uint32_t c)
 	{
 		if (c<128) return fctAsciiTexCoords[c-' ']; else return GetUnicodeCharacterFacet(c);
--- a/src/script/C4Script.cpp
+++ b/src/script/C4Script.cpp
@ -62,13 +62,21 @@ StdStrBuf FnStringFormat(C4AulContext *cthr, const char *szFormatPar, C4Value *
 			switch (*cpType)
 			{
 				// number
-			case 'd': case 'x': case 'X': case 'c':
+			case 'd': case 'x': case 'X':
 			{
 				if (!Par[cPar]) throw new C4AulExecError(cthr->Obj, "format placeholder without parameter");
 				StringBuf.AppendFormat(szField, Par[cPar++]->getInt());
 				cpFormat+=SLen(szField);
 				break;
 			}
+			// character
+			case 'c':
+			{
+				if (!Par[cPar]) throw new C4AulExecError(cthr->Obj, "format placeholder without parameter");
+				StringBuf.AppendCharacter(Par[cPar++]->getInt());
+				cpFormat+=SLen(szField);
+				break;
+			}
 			// C4ID
 			case 'i':
 			{
@ -365,7 +373,7 @@ static C4Value FnGetLength(C4AulContext *cthr, C4Value *pPars)
 		return C4VInt(pArray->GetSize());
 	C4String * pStr = pPars->getStr();
 	if (pStr)
-		return C4VInt(pStr->GetData().getLength());
+		return C4VInt(GetCharacterCount(pStr->GetData().getData()));
 	throw new C4AulExecError(cthr->Obj, "func \"GetLength\" par 0 cannot be converted to string or array");
 }

@ -378,12 +386,9 @@ static C4Value FnGetIndexOf(C4AulContext *cthr, C4Value *pPars)
 	const C4ValueArray * pArray = pPars[1].getArray();
 	if (!pArray)
 		throw new C4AulExecError(cthr->Obj, "func \"GetIndexOf\" par 1 cannot be converted to array");
-	// find the element by comparing data only - this may result in bogus results if an object ptr array is searched for an int
-	// however, that's rather unlikely and strange scripting style
 	int32_t iSize = pArray->GetSize();
-	long cmp = pPars[0].GetData().Int;
 	for (int32_t i = 0; i<iSize; ++i)
-		if (cmp == pArray->GetItem(i).GetData().Int)
+		if (pPars[0] == pArray->GetItem(i))
 			// element found
 			return C4VInt(i);
 	// element not found
@ -405,11 +410,14 @@ static long FnGetChar(C4AulContext* cthr, C4String *pString, long iIndex)
 {
 	const char *szText = FnStringPar(pString);
 	if (!szText) return 0;
-	// loop and check for end of string
-	for (int i=0; i<iIndex; i++, szText++)
-		if (!*szText) return 0;
-	// return indiced character value
-	return (unsigned char) *szText;
+	// C4Strings are UTF-8 encoded, so decode character by character; returns the code point at iIndex, or 0 if the string ends first
+	uint32_t c = GetNextCharacter(&szText);
+	for (int i = 0; i < iIndex; ++i)
+	{
+		if (!c) return 0; // terminator already decoded: decoding again would read past the string
+		c = GetNextCharacter(&szText);
+	}
+	return c;
 }

 static C4Value FnEval(C4AulContext *cthr, C4Value *strScript_C4V)