forked from Mirrors/openclonk
Script: Use Unicode instead of Byte strings
Internally, strings are UTF-8 as before, but GetChar returns a Unicode code point instead of a byte from the UTF-8 encoded string, and Format("%c") takes a Unicode code point as well.
parent
e537296829
commit
21e28a4689
|
@ -849,3 +849,70 @@ bool IsValidUtf8(const char *text, int length)
|
|||
// Looks fine
|
||||
return true;
|
||||
}
|
||||
|
||||
// UTF-8 iteration
|
||||
// Decodes the multi-byte UTF-8 sequence that starts at *pszString and moves
// *pszString behind the bytes that were consumed.
//
// pszString: in/out cursor into a NUL-terminated string. The byte it points at
//            must have its highest bit set; plain ASCII is expected to be
//            handled by the caller.
// Returns the decoded code point, or '?' if the sequence is malformed.
//
// On a malformed sequence only the lead byte and the valid continuation bytes
// seen so far are consumed. The offending byte is left in place so that it is
// decoded on the next call. This matters for truncated input: the offending
// byte may be the terminating NUL, and stepping over it would leave the cursor
// outside the string.
uint32_t GetNextUTF8Character(const char **pszString)
{
	const char *szString = *pszString;
	unsigned char c = *szString++;
	assert(c>127);
	// The lead byte determines the number of continuation bytes and
	// contributes the topmost payload bits.
	int iContinuations;
	uint32_t dwResult;
	if (c>191 && c<224)
	{
		iContinuations = 1; dwResult = c & 31; // two char code
	}
	else if (c >= 224 && c <= 239)
	{
		iContinuations = 2; dwResult = c & 15; // three char code
	}
	else if (c >= 240 && c <= 247)
	{
		iContinuations = 3; dwResult = c & 7; // four char code
	}
	else
	{
		// stray continuation byte (128..191) or invalid lead byte (248..255)
		*pszString = szString;
		return '?';
	}
	for (int i = 0; i < iContinuations; ++i)
	{
		unsigned char cNext = *szString;
		if ((cNext & 192) != 128)
		{
			// not a continuation byte: stop in front of it
			*pszString = szString;
			return '?';
		}
		++szString;
		dwResult = (dwResult << 6) | (cNext & 63);
	}
	*pszString = szString;
	return dwResult;
}
|
||||
|
||||
// Returns the length of the NUL-terminated UTF-8 string s in characters
// (not bytes). A null pointer counts as an empty string.
//
// Malformed input is tolerated: every lead byte, stray continuation byte or
// invalid byte counts as one character, and a multi-byte sequence ends at the
// first byte that is not a continuation byte. The scan therefore always makes
// progress and never steps over the terminating NUL.
int GetCharacterCount(const char * s)
{
	if (!s) return 0;
	int l = 0;
	while (*s)
	{
		unsigned char c = *s++;
		// number of continuation bytes announced by the lead byte
		int iContinuations = 0;
		if (c > 191 && c < 224)
			iContinuations = 1;
		else if (c >= 224 && c <= 239)
			iContinuations = 2;
		else if (c >= 240 && c <= 247)
			iContinuations = 3;
		// Skip only bytes that really are continuation bytes (10xxxxxx); a
		// truncated sequence must not carry the cursor beyond the terminator.
		while (iContinuations-- > 0 && (static_cast<unsigned char>(*s) & 192) == 128)
			++s;
		++l;
	}
	return l;
}
|
||||
|
|
|
@ -76,6 +76,17 @@ inline size_t SLenUntil(const char *szStr, char cUntil)
|
|||
return end ? end-szStr : std::strlen(szStr);
|
||||
}
|
||||
|
||||
// get a character at the current string pos and advance pos by that character
|
||||
uint32_t GetNextUTF8Character(const char **pszString); // GetNextCharacter helper
|
||||
inline uint32_t GetNextCharacter(const char **pszString)
|
||||
{
|
||||
unsigned char c=**pszString;
|
||||
if (c<128) { ++*pszString; return c; }
|
||||
else return GetNextUTF8Character(pszString);
|
||||
}
|
||||
// Get string length in characters (not bytes)
|
||||
int GetCharacterCount(const char * s);
|
||||
|
||||
// Null-safe string equality: true only if both pointers are non-null and the
// strings compare equal byte for byte.
inline bool SEqual(const char *szStr1, const char *szStr2)
{
	if (!szStr1 || !szStr2)
		return false;
	return std::strcmp(szStr1, szStr2) == 0;
}
|
||||
bool SEqual2(const char *szStr1, const char *szStr2);
|
||||
bool SEqualUntil(const char *szStr1, const char *szStr2, char cWild);
|
||||
|
|
|
@ -374,6 +374,34 @@ void StdStrBuf::ToLowerCase()
|
|||
*szPos = tolower(*szPos);
|
||||
}
|
||||
|
||||
void StdStrBuf::AppendCharacter(uint32_t unicodechar)
|
||||
{
|
||||
if (unicodechar < 0x80)
|
||||
AppendChar(unicodechar);
|
||||
else if (unicodechar < 0x800)
|
||||
{
|
||||
Grow(2);
|
||||
*getMPtr(getLength() - 2) = (0xC0 | (unicodechar >> 6));
|
||||
*getMPtr(getLength() - 1) = (0x80 | (unicodechar & 0x3F));
|
||||
}
|
||||
else if (unicodechar < 0x10000)
|
||||
{
|
||||
Grow(3);
|
||||
*getMPtr(getLength() - 3) = (0xE0 | (unicodechar >> 12));
|
||||
*getMPtr(getLength() - 2) = (0x80 | ((unicodechar >> 6) & 0x3F));
|
||||
*getMPtr(getLength() - 1) = (0x80 | (unicodechar & 0x3F));
|
||||
}
|
||||
else if (unicodechar < 0x110000)
|
||||
{
|
||||
Grow(4);
|
||||
*getMPtr(getLength() - 4) = (0xF0 | (unicodechar >> 18));
|
||||
*getMPtr(getLength() - 3) = (0x80 | ((unicodechar >> 12) & 0x3F));
|
||||
*getMPtr(getLength() - 2) = (0x80 | ((unicodechar >> 6) & 0x3F));
|
||||
*getMPtr(getLength() - 1) = (0x80 | (unicodechar & 0x3F));
|
||||
}
|
||||
else /* not an unicode code point, ignore */;
|
||||
}
|
||||
|
||||
void StdStrBuf::EnsureUnicode()
|
||||
{
|
||||
// assume that it's windows-1252 and convert to utf-8
|
||||
|
|
|
@ -614,6 +614,7 @@ public:
|
|||
{
|
||||
AppendChars(cChar, 1);
|
||||
}
|
||||
void AppendCharacter(uint32_t unicodechar);
|
||||
void AppendBackslash();
|
||||
void InsertChar(char cChar, size_t insert_before)
|
||||
{
|
||||
|
|
|
@ -298,41 +298,6 @@ bool CStdFont::AddRenderedChar(uint32_t dwChar, CFacet *pfctTarget)
|
|||
return true;
|
||||
}
|
||||
|
||||
// Decodes the multi-byte UTF-8 sequence that starts at *pszString and moves
// *pszString behind the bytes that were consumed. The byte at *pszString must
// have its highest bit set. Returns the decoded code point, or '?' if the
// sequence is malformed.
//
// On a malformed sequence the offending byte is not consumed: it may be the
// terminating NUL of a truncated string, and stepping over it would leave the
// cursor outside the string.
uint32_t CStdFont::GetNextUTF8Character(const char **pszString)
{
	const char *szString = *pszString;
	unsigned char c = *szString++;
	assert(c>127);
	// The lead byte determines the number of continuation bytes and
	// contributes the topmost payload bits.
	int iContinuations;
	uint32_t dwResult;
	if (c>191 && c<224)
	{
		iContinuations = 1; dwResult = c & 31; // two char code
	}
	else if (c >= 224 && c <= 239)
	{
		iContinuations = 2; dwResult = c & 15; // three char code
	}
	else if (c >= 240 && c <= 247)
	{
		iContinuations = 3; dwResult = c & 7; // four char code
	}
	else
	{
		// stray continuation byte (128..191) or invalid lead byte (248..255)
		*pszString = szString;
		return '?';
	}
	for (int i = 0; i < iContinuations; ++i)
	{
		unsigned char cNext = *szString;
		if ((cNext & 192) != 128)
		{
			// not a continuation byte: stop in front of it
			*pszString = szString;
			return '?';
		}
		++szString;
		dwResult = (dwResult << 6) | (cNext & 63);
	}
	*pszString = szString;
	return dwResult;
}
|
||||
|
||||
CFacet &CStdFont::GetUnicodeCharacterFacet(uint32_t c)
|
||||
{
|
||||
// find/add facet in map
|
||||
|
|
|
@ -100,14 +100,6 @@ protected:
|
|||
bool CheckRenderedCharSpace(uint32_t iCharWdt, uint32_t iCharHgt);
|
||||
bool AddRenderedChar(uint32_t dwChar, CFacet *pfctTarget);
|
||||
|
||||
// get a character at the current string pos and advance pos by that character
|
||||
inline uint32_t GetNextCharacter(const char **pszString)
|
||||
{
|
||||
unsigned char c=**pszString;
|
||||
if (c<128) { ++*pszString; return c; }
|
||||
else return GetNextUTF8Character(pszString);
|
||||
}
|
||||
uint32_t GetNextUTF8Character(const char **pszString);
|
||||
CFacet &GetCharacterFacet(uint32_t c)
|
||||
{
|
||||
if (c<128) return fctAsciiTexCoords[c-' ']; else return GetUnicodeCharacterFacet(c);
|
||||
|
|
|
@ -62,13 +62,21 @@ StdStrBuf FnStringFormat(C4AulContext *cthr, const char *szFormatPar, C4Value *
|
|||
switch (*cpType)
|
||||
{
|
||||
// number
|
||||
case 'd': case 'x': case 'X': case 'c':
|
||||
case 'd': case 'x': case 'X':
|
||||
{
|
||||
if (!Par[cPar]) throw new C4AulExecError(cthr->Obj, "format placeholder without parameter");
|
||||
StringBuf.AppendFormat(szField, Par[cPar++]->getInt());
|
||||
cpFormat+=SLen(szField);
|
||||
break;
|
||||
}
|
||||
// character
|
||||
case 'c':
|
||||
{
|
||||
if (!Par[cPar]) throw new C4AulExecError(cthr->Obj, "format placeholder without parameter");
|
||||
StringBuf.AppendCharacter(Par[cPar++]->getInt());
|
||||
cpFormat+=SLen(szField);
|
||||
break;
|
||||
}
|
||||
// C4ID
|
||||
case 'i':
|
||||
{
|
||||
|
@ -365,7 +373,7 @@ static C4Value FnGetLength(C4AulContext *cthr, C4Value *pPars)
|
|||
return C4VInt(pArray->GetSize());
|
||||
C4String * pStr = pPars->getStr();
|
||||
if (pStr)
|
||||
return C4VInt(pStr->GetData().getLength());
|
||||
return C4VInt(GetCharacterCount(pStr->GetData().getData()));
|
||||
throw new C4AulExecError(cthr->Obj, "func \"GetLength\" par 0 cannot be converted to string or array");
|
||||
}
|
||||
|
||||
|
@ -378,12 +386,9 @@ static C4Value FnGetIndexOf(C4AulContext *cthr, C4Value *pPars)
|
|||
const C4ValueArray * pArray = pPars[1].getArray();
|
||||
if (!pArray)
|
||||
throw new C4AulExecError(cthr->Obj, "func \"GetIndexOf\" par 1 cannot be converted to array");
|
||||
// find the element by comparing data only - this may result in bogus results if an object ptr array is searched for an int
|
||||
// however, that's rather unlikely and strange scripting style
|
||||
int32_t iSize = pArray->GetSize();
|
||||
long cmp = pPars[0].GetData().Int;
|
||||
for (int32_t i = 0; i<iSize; ++i)
|
||||
if (cmp == pArray->GetItem(i).GetData().Int)
|
||||
if (pPars[0] == pArray->GetItem(i))
|
||||
// element found
|
||||
return C4VInt(i);
|
||||
// element not found
|
||||
|
@ -405,11 +410,14 @@ static long FnGetChar(C4AulContext* cthr, C4String *pString, long iIndex)
|
|||
{
	// Yields the Unicode code point of the character (not byte) at position
	// iIndex of pString, or 0 if there is no string or the string ends
	// before that position. A negative iIndex behaves like 0.
	const char *szText = FnStringPar(pString);
	if (!szText) return 0;
	// C4Strings are UTF-8 encoded, so decode to get the indicated character
	uint32_t c = GetNextCharacter(&szText);
	// Skip iIndex characters. Stop as soon as the terminator has been decoded
	// (c == 0), including when it is the very first character, so that the
	// cursor is never moved beyond the end of the string.
	for (long i = 0; c && i < iIndex; ++i)
		c = GetNextCharacter(&szText);
	return c;
}
|
||||
|
||||
static C4Value FnEval(C4AulContext *cthr, C4Value *strScript_C4V)
|
||||
|
|
Loading…
Reference in New Issue