Script: Use Unicode instead of Byte strings

Internally, strings are UTF-8 as before, but GetChar returns a
Unicode code point instead of a byte from the UTF-8 encoded string,
and Format("%c") takes a Unicode code point as well.
Günther Brammer 2011-03-10 00:26:31 +01:00
parent e537296829
commit 21e28a4689
7 changed files with 126 additions and 54 deletions

View File

@ -849,3 +849,70 @@ bool IsValidUtf8(const char *text, int length)
// Looks fine
return true;
}
// UTF-8 iteration
// Decode one multi-byte UTF-8 sequence starting at *pszString and advance
// *pszString behind the bytes that were consumed.
//
// pszString: in/out cursor into a NUL-terminated string. The byte it points
//            to must have its highest bit set (asserted).
// Returns the decoded code point, or '?' if the sequence is malformed.
//
// A byte that fails the continuation-byte check is NOT consumed. It may be
// the NUL terminator of a truncated string or the first byte of the next
// character; consuming it would move the cursor past the end of the string
// or swallow a valid character.
uint32_t GetNextUTF8Character(const char **pszString)
{
	// assume the current character is UTF8 already (i.e., highest bit set)
	const char *szString = *pszString;
	const unsigned char c = *szString++;
	assert(c>127);
	// The lead byte announces how many continuation bytes follow and carries
	// the most significant payload bits.
	int iContinuation;
	uint32_t dwResult;
	if (c>191 && c<224)        { iContinuation = 1; dwResult = c&31; } // two char code
	else if (c>=224 && c<=239) { iContinuation = 2; dwResult = c&15; } // three char code
	else if (c>=240 && c<=247) { iContinuation = 3; dwResult = c&7;  } // four char code
	else
	{
		// stray continuation byte (128..191) or invalid lead byte (248..255):
		// skip just this one byte
		*pszString = szString;
		return '?';
	}
	for (int i = 0; i < iContinuation; ++i)
	{
		const unsigned char cNext = *szString;
		if ((cNext & 192) != 128)
		{
			// malformed or truncated sequence: leave the offending byte for the next call
			*pszString = szString;
			return '?';
		}
		// every continuation byte contributes six payload bits
		dwResult = (dwResult << 6) | (cNext & 63);
		++szString;
	}
	*pszString = szString;
	return dwResult;
}
// Get the length of a NUL-terminated UTF-8 string in characters (not bytes).
//
// Each lead byte together with the continuation bytes that actually follow
// it (at most as many as the lead byte announces) counts as one character.
// Bytes that cannot start a sequence (stray continuation bytes 128..191 and
// invalid lead bytes 248..255) count as one character each, so malformed
// input neither aborts nor stalls the loop. Continuation bytes are checked
// before they are skipped, so a truncated sequence at the end of the string
// never moves the cursor past the terminator.
//
// Returns 0 for a null pointer.
int GetCharacterCount(const char * s)
{
	if (!s) return 0;
	int l = 0;
	while (*s)
	{
		const unsigned char c = *s++;
		++l;
		// number of continuation bytes announced by the lead byte
		int iContinuation = 0;
		if (c > 191 && c < 224) iContinuation = 1;
		else if (c >= 224 && c <= 239) iContinuation = 2;
		else if (c >= 240 && c <= 247) iContinuation = 3;
		// skip only bytes that really are continuation bytes (10xxxxxx);
		// this also stops at the NUL terminator
		while (iContinuation-- > 0 && (static_cast<unsigned char>(*s) & 192) == 128)
			++s;
	}
	return l;
}

View File

@ -76,6 +76,17 @@ inline size_t SLenUntil(const char *szStr, char cUntil)
return end ? end-szStr : std::strlen(szStr);
}
// get a character at the current string pos and advance pos by that character
uint32_t GetNextUTF8Character(const char **pszString); // GetNextCharacter helper
// Reads the character at *pszString, moves *pszString behind it and returns
// its value. Bytes below 128 are returned directly; everything else is
// handed to GetNextUTF8Character for decoding.
inline uint32_t GetNextCharacter(const char **pszString)
{
	const unsigned char cLead = static_cast<unsigned char>(**pszString);
	if (cLead >= 128)
		return GetNextUTF8Character(pszString);
	// plain ASCII: a single byte
	++*pszString;
	return cLead;
}
// Get string length in characters (not bytes)
int GetCharacterCount(const char * s);
inline bool SEqual(const char *szStr1, const char *szStr2) { return szStr1&&szStr2?!std::strcmp(szStr1,szStr2):false; }
bool SEqual2(const char *szStr1, const char *szStr2);
bool SEqualUntil(const char *szStr1, const char *szStr2, char cWild);

View File

@ -374,6 +374,34 @@ void StdStrBuf::ToLowerCase()
*szPos = tolower(*szPos);
}
// Append the UTF-8 encoding of a Unicode code point to the buffer.
//
// unicodechar: code point to append. Values that are not Unicode scalar
//              values, i.e. the surrogate range U+D800..U+DFFF and anything
//              from 0x110000 upwards, are ignored: encoding them would
//              produce ill-formed UTF-8.
//
// The multi-byte branches call Grow(n) and then fill the last n bytes of the
// buffer, addressed relative to getLength().
void StdStrBuf::AppendCharacter(uint32_t unicodechar)
{
	if (unicodechar < 0x80)
		// ASCII: one byte
		AppendChar(unicodechar);
	else if (unicodechar < 0x800)
	{
		// 110xxxxx 10xxxxxx
		Grow(2);
		*getMPtr(getLength() - 2) = (0xC0 | (unicodechar >> 6));
		*getMPtr(getLength() - 1) = (0x80 | (unicodechar & 0x3F));
	}
	else if (unicodechar < 0x10000)
	{
		// surrogates are reserved for UTF-16 and must not appear in UTF-8
		if (unicodechar >= 0xD800 && unicodechar <= 0xDFFF) return;
		// 1110xxxx 10xxxxxx 10xxxxxx
		Grow(3);
		*getMPtr(getLength() - 3) = (0xE0 | (unicodechar >> 12));
		*getMPtr(getLength() - 2) = (0x80 | ((unicodechar >> 6) & 0x3F));
		*getMPtr(getLength() - 1) = (0x80 | (unicodechar & 0x3F));
	}
	else if (unicodechar < 0x110000)
	{
		// 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
		Grow(4);
		*getMPtr(getLength() - 4) = (0xF0 | (unicodechar >> 18));
		*getMPtr(getLength() - 3) = (0x80 | ((unicodechar >> 12) & 0x3F));
		*getMPtr(getLength() - 2) = (0x80 | ((unicodechar >> 6) & 0x3F));
		*getMPtr(getLength() - 1) = (0x80 | (unicodechar & 0x3F));
	}
	else /* not a Unicode code point, ignore */;
}
void StdStrBuf::EnsureUnicode()
{
// assume that it's windows-1252 and convert to utf-8

View File

@ -614,6 +614,7 @@ public:
{
AppendChars(cChar, 1);
}
void AppendCharacter(uint32_t unicodechar);
void AppendBackslash();
void InsertChar(char cChar, size_t insert_before)
{

View File

@ -298,41 +298,6 @@ bool CStdFont::AddRenderedChar(uint32_t dwChar, CFacet *pfctTarget)
return true;
}
// Decode one multi-byte UTF-8 sequence starting at *pszString and advance
// *pszString behind the bytes that were consumed.
//
// pszString: in/out cursor into a NUL-terminated string. The byte it points
//            to must have its highest bit set (asserted).
// Returns the decoded code point, or '?' if the sequence is malformed.
//
// A byte that fails the continuation-byte check is NOT consumed. It may be
// the NUL terminator of a truncated string or the first byte of the next
// character; consuming it would move the cursor past the end of the string
// or swallow a valid character.
uint32_t CStdFont::GetNextUTF8Character(const char **pszString)
{
	// assume the current character is UTF8 already (i.e., highest bit set)
	const char *szString = *pszString;
	const unsigned char c = *szString++;
	assert(c>127);
	// The lead byte announces how many continuation bytes follow and carries
	// the most significant payload bits.
	int iContinuation;
	uint32_t dwResult;
	if (c>191 && c<224)        { iContinuation = 1; dwResult = c&31; } // two char code
	else if (c>=224 && c<=239) { iContinuation = 2; dwResult = c&15; } // three char code
	else if (c>=240 && c<=247) { iContinuation = 3; dwResult = c&7;  } // four char code
	else
	{
		// stray continuation byte (128..191) or invalid lead byte (248..255):
		// skip just this one byte
		*pszString = szString;
		return '?';
	}
	for (int i = 0; i < iContinuation; ++i)
	{
		const unsigned char cNext = *szString;
		if ((cNext & 192) != 128)
		{
			// malformed or truncated sequence: leave the offending byte for the next call
			*pszString = szString;
			return '?';
		}
		// every continuation byte contributes six payload bits
		dwResult = (dwResult << 6) | (cNext & 63);
		++szString;
	}
	*pszString = szString;
	return dwResult;
}
CFacet &CStdFont::GetUnicodeCharacterFacet(uint32_t c)
{
// find/add facet in map

View File

@ -100,14 +100,6 @@ protected:
bool CheckRenderedCharSpace(uint32_t iCharWdt, uint32_t iCharHgt);
bool AddRenderedChar(uint32_t dwChar, CFacet *pfctTarget);
// get a character at the current string pos and advance pos by that character
// Reads the character at *pszString, moves *pszString behind it and returns
// its value. Bytes below 128 are returned directly; everything else is
// handed to GetNextUTF8Character for decoding.
inline uint32_t GetNextCharacter(const char **pszString)
{
	const unsigned char cLead = static_cast<unsigned char>(**pszString);
	if (cLead >= 128)
		return GetNextUTF8Character(pszString);
	// plain ASCII: a single byte
	++*pszString;
	return cLead;
}
uint32_t GetNextUTF8Character(const char **pszString);
CFacet &GetCharacterFacet(uint32_t c)
{
if (c<128) return fctAsciiTexCoords[c-' ']; else return GetUnicodeCharacterFacet(c);

View File

@ -62,13 +62,21 @@ StdStrBuf FnStringFormat(C4AulContext *cthr, const char *szFormatPar, C4Value *
switch (*cpType)
{
// number
case 'd': case 'x': case 'X': case 'c':
case 'd': case 'x': case 'X':
{
if (!Par[cPar]) throw new C4AulExecError(cthr->Obj, "format placeholder without parameter");
StringBuf.AppendFormat(szField, Par[cPar++]->getInt());
cpFormat+=SLen(szField);
break;
}
// character
case 'c':
{
if (!Par[cPar]) throw new C4AulExecError(cthr->Obj, "format placeholder without parameter");
StringBuf.AppendCharacter(Par[cPar++]->getInt());
cpFormat+=SLen(szField);
break;
}
// C4ID
case 'i':
{
@ -365,7 +373,7 @@ static C4Value FnGetLength(C4AulContext *cthr, C4Value *pPars)
return C4VInt(pArray->GetSize());
C4String * pStr = pPars->getStr();
if (pStr)
return C4VInt(pStr->GetData().getLength());
return C4VInt(GetCharacterCount(pStr->GetData().getData()));
throw new C4AulExecError(cthr->Obj, "func \"GetLength\" par 0 cannot be converted to string or array");
}
@ -378,12 +386,9 @@ static C4Value FnGetIndexOf(C4AulContext *cthr, C4Value *pPars)
const C4ValueArray * pArray = pPars[1].getArray();
if (!pArray)
throw new C4AulExecError(cthr->Obj, "func \"GetIndexOf\" par 1 cannot be converted to array");
// find the element by comparing data only - this may result in bogus results if an object ptr array is searched for an int
// however, that's rather unlikely and strange scripting style
int32_t iSize = pArray->GetSize();
long cmp = pPars[0].GetData().Int;
for (int32_t i = 0; i<iSize; ++i)
if (cmp == pArray->GetItem(i).GetData().Int)
if (pPars[0] == pArray->GetItem(i))
// element found
return C4VInt(i);
// element not found
@ -405,11 +410,14 @@ static long FnGetChar(C4AulContext* cthr, C4String *pString, long iIndex)
{
const char *szText = FnStringPar(pString);
if (!szText) return 0;
// loop and check for end of string
for (int i=0; i<iIndex; i++, szText++)
if (!*szText) return 0;
// return indiced character value
return (unsigned char) *szText;
// C4Strings are UTF-8 encoded, so decode to get the indicated character
uint32_t c = GetNextCharacter(&szText);
for (int i = 0; i < iIndex; ++i)
{
c = GetNextCharacter(&szText);
if (!c) return 0;
}
return c;
}
static C4Value FnEval(C4AulContext *cthr, C4Value *strScript_C4V)