hhctrl.ocx: Add HTML to Unicode decoding capability to the table of contents.

2012-06-20 14:31:10 -06:00 · 2012-06-20 14:31:10 -06:00 · 9033b14438
parent b527679d17
commit 9033b14438
3 changed files with 206 additions and 12 deletions
--- a/dlls/hhctrl.ocx/content.c
+++ b/dlls/hhctrl.ocx/content.c
@ -50,16 +50,6 @@ static void free_content_item(ContentItem *item)
    }
 }

-static void store_param(LPWSTR *param, const char *value, int len)
-{
-    int wlen;
-
-    wlen = MultiByteToWideChar(CP_ACP, 0, value, len, NULL, 0);
-    *param = heap_alloc((wlen+1)*sizeof(WCHAR));
-    MultiByteToWideChar(CP_ACP, 0, value, len, *param, wlen);
-    (*param)[wlen] = 0;
-}
-
 static void parse_obj_node_param(ContentItem *item, ContentItem *hhc_root, const char *text)
 {
    const char *ptr;
@ -99,11 +89,11 @@ static void parse_obj_node_param(ContentItem *item, ContentItem *hhc_root, const
        const char *local = strstr(ptr, "::")+2;
        int local_len = len-(local-ptr);

-        store_param(&item->local, local, local_len);
+        item->local = decode_html(local, local_len);
        param = &merge;
    }

-    store_param(param, ptr, len);
+    *param = decode_html(ptr, len);

    if(param == &merge) {
        SetChmPath(&item->merge, hhc_root->merge.chm_file, merge);
--- a/dlls/hhctrl.ocx/help.c
+++ b/dlls/hhctrl.ocx/help.c
@ -50,6 +50,119 @@ static void ExpandContract(HHInfo *pHHInfo);

 static const WCHAR szEmpty[] = {0};

+/*
+ * One entry of the HTML entity table: the entity name as it is written
+ * between '&' and ';', and the single ANSI code page byte that replaces it.
+ */
+struct html_encoded_symbol {
+    const char    *html_code;   /* entity name, without the leading '&' and trailing ';' */
+    unsigned char  ansi_symbol; /* replacement byte; unsigned so that 0x80..0xFF are representable */
+};
+
+/*
+ * Table mapping the conversion between HTML encoded symbols and their ANSI code page equivalent.
+ * Note: Add additional entries in proper alphabetical order (a binary search is used on this table).
+ * The order is the byte-wise order used by strncmp(), so upper case names sort before lower case ones.
+ * The table is private to this file and is never modified, hence static const.
+ */
+static const struct html_encoded_symbol html_encoded_symbols[] =
+{
+    {"AElig",  0xC6},
+    {"Aacute", 0xC1},
+    {"Acirc",  0xC2},
+    {"Agrave", 0xC0},
+    {"Aring",  0xC5},
+    {"Atilde", 0xC3},
+    {"Auml",   0xC4},
+    {"Ccedil", 0xC7},
+    {"ETH",    0xD0},
+    {"Eacute", 0xC9},
+    {"Ecirc",  0xCA},
+    {"Egrave", 0xC8},
+    {"Euml",   0xCB},
+    {"Iacute", 0xCD},
+    {"Icirc",  0xCE},
+    {"Igrave", 0xCC},
+    {"Iuml",   0xCF},
+    {"Ntilde", 0xD1},
+    {"Oacute", 0xD3},
+    {"Ocirc",  0xD4},
+    {"Ograve", 0xD2},
+    {"Oslash", 0xD8},
+    {"Otilde", 0xD5},
+    {"Ouml",   0xD6},
+    {"THORN",  0xDE},
+    {"Uacute", 0xDA},
+    {"Ucirc",  0xDB},
+    {"Ugrave", 0xD9},
+    {"Uuml",   0xDC},
+    {"Yacute", 0xDD},
+    {"aacute", 0xE1},
+    {"acirc",  0xE2},
+    {"acute",  0xB4},
+    {"aelig",  0xE6},
+    {"agrave", 0xE0},
+    {"amp",    '&'},
+    {"aring",  0xE5},
+    {"atilde", 0xE3},
+    {"auml",   0xE4},
+    {"brvbar", 0xA6},
+    {"ccedil", 0xE7},
+    {"cedil",  0xB8},
+    {"cent",   0xA2},
+    {"copy",   0xA9},
+    {"curren", 0xA4},
+    {"deg",    0xB0},
+    {"divide", 0xF7},
+    {"eacute", 0xE9},
+    {"ecirc",  0xEA},
+    {"egrave", 0xE8},
+    {"eth",    0xF0},
+    {"euml",   0xEB},
+    {"frac12", 0xBD},
+    {"frac14", 0xBC},
+    {"frac34", 0xBE},
+    {"gt",     '>'},
+    {"iacute", 0xED},
+    {"icirc",  0xEE},
+    {"iexcl",  0xA1},
+    {"igrave", 0xEC},
+    {"iquest", 0xBF},
+    {"iuml",   0xEF},
+    {"laquo",  0xAB},
+    {"lt",     '<'},
+    {"macr",   0xAF},
+    {"micro",  0xB5},
+    {"middot", 0xB7},
+    {"nbsp",   ' '},
+    {"not",    0xAC},
+    {"ntilde", 0xF1},
+    {"oacute", 0xF3},
+    {"ocirc",  0xF4},
+    {"ograve", 0xF2},
+    {"ordf",   0xAA},
+    {"ordm",   0xBA},
+    {"oslash", 0xF8},
+    {"otilde", 0xF5},
+    {"ouml",   0xF6},
+    {"para",   0xB6},
+    {"plusmn", 0xB1},
+    {"pound",  0xA3},
+    {"quot",   '"'},
+    {"raquo",  0xBB},
+    {"reg",    0xAE},
+    {"sect",   0xA7},
+    {"shy",    0xAD},
+    {"sup1",   0xB9},
+    {"sup2",   0xB2},
+    {"sup3",   0xB3},
+    {"szlig",  0xDF},
+    {"thorn",  0xFE},
+    {"times",  0xD7},
+    {"uacute", 0xFA},
+    {"ucirc",  0xFB},
+    {"ugrave", 0xF9},
+    {"uml",    0xA8},
+    {"uuml",   0xFC},
+    {"yacute", 0xFD},
+    {"yen",    0xA5},
+    {"yuml",   0xFF}
+};
+
 /* Loads a string from the resource file */
 static LPWSTR HH_LoadString(DWORD dwID)
 {
@ -1654,3 +1767,92 @@ HHInfo *CreateHelpViewer(LPCWSTR filename)

    return info;
 }
+
+/*
+ * Look up an HTML entity name in html_encoded_symbols with a binary search.
+ *
+ * entity is the text following the '&'; only its first entity_len characters
+ * are compared, so it does not need to be NUL-terminated at that point.
+ * Returns the matching ANSI symbol, or 0 when the name is not in the table.
+ */
+static char find_html_symbol(const char *entity, int entity_len)
+{
+    int first = 0;
+    int last = sizeof(html_encoded_symbols)/sizeof(html_encoded_symbols[0])-1;
+
+    while(first <= last)
+    {
+        int mid = (first+last)/2;
+        const char *candidate = html_encoded_symbols[mid].html_code;
+        int cmp = strncmp(candidate, entity, entity_len);
+
+        if(cmp < 0)
+        {
+            /* The table entry sorts before the entity: look in the upper half */
+            first = mid+1;
+        }
+        else if(cmp > 0 || candidate[entity_len])
+        {
+            /* The table entry sorts after the entity, or the entity is only a
+             * prefix of it (which also sorts after): look in the lower half */
+            last = mid-1;
+        }
+        else
+        {
+            /* Same characters and same length: exact match */
+            return html_encoded_symbols[mid].ansi_symbol;
+        }
+    }
+    return 0;
+}
+
+/*
+ * Decode a string containing HTML encoded characters into a unicode string.
+ *
+ * html_fragment      ANSI text to decode. It does not have to be NUL-terminated:
+ *                    only the first html_fragment_len bytes are examined.
+ * html_fragment_len  Number of bytes of html_fragment to decode.
+ *
+ * Named entities ("&amp;") are looked up in the entity table and decimal
+ * references ("&#233;") are taken as an ANSI character number in the range
+ * 1..255. Anything that cannot be translated is left in the output verbatim.
+ *
+ * Returns a NUL-terminated string allocated with heap_alloc() that the caller
+ * must release with heap_free(), or NULL if an allocation fails.
+ */
+WCHAR *decode_html(const char *html_fragment, int html_fragment_len)
+{
+    const char *h = html_fragment;
+    const char *end = html_fragment+html_fragment_len;
+    const char *amp, *sem;
+    char symbol, *tmp;
+    int len, tmp_len = 0;
+    WCHAR *unicode_text;
+
+    /* The decoded text is never longer than the input: every translated entity
+     * shrinks to one byte and everything else is copied as-is */
+    tmp = heap_alloc(html_fragment_len+1);
+    if(!tmp) return NULL;
+    while(h < end)
+    {
+        symbol = 0;
+        /* Bounded search: the fragment is a slice of a larger buffer, so an
+         * unbounded strchr() could match past the end of the slice */
+        amp = memchr(h, '&', end-h);
+        if(!amp) break;
+        len = amp-h;
+        /* Copy the characters prior to the HTML encoded character */
+        memcpy(&tmp[tmp_len], h, len);
+        tmp_len += len;
+        amp++; /* skip ampersand */
+        sem = memchr(amp, ';', end-amp);
+        /* Require a semicolon after the ampersand */
+        if(!sem)
+        {
+            h = amp;
+            tmp[tmp_len++] = '&';
+            continue;
+        }
+        /* Find the symbol either by using the ANSI character number (prefixed by the pound symbol)
+         * or by searching the HTML entity table */
+        len = sem-amp;
+        if(amp[0] == '#')
+        {
+            const char *digit = amp+1; /* the number starts after the pound symbol */
+            int value = 0;
+
+            /* Stop accumulating once the value no longer fits in one byte, this
+             * also keeps the arithmetic from overflowing on long digit runs */
+            while(digit < sem && *digit >= '0' && *digit <= '9' && value <= 0xff)
+                value = value*10 + (*digit++ - '0');
+            /* Accept only a non-empty, all-digit number that fits in one byte;
+             * a value of zero leaves symbol at 0 and is reported below */
+            if(digit == sem && digit != amp+1 && value <= 0xff)
+                symbol = (char)value;
+        }
+        else
+            symbol = find_html_symbol(amp, len);
+        if(!symbol)
+        {
+            FIXME("Failed to translate HTML encoded character '&%.*s;'.\n", len, amp);
+            h = amp;
+            tmp[tmp_len++] = '&';
+            continue;
+        }
+        /* Insert the new symbol */
+        h = sem+1;
+        tmp[tmp_len++] = symbol;
+    }
+    /* Convert any remaining characters */
+    len = end-h;
+    memcpy(&tmp[tmp_len], h, len);
+    tmp_len += len;
+    tmp[tmp_len++] = 0; /* NULL-terminate the string */
+
+    /* tmp_len includes the terminator, so the converted string is terminated too */
+    len = MultiByteToWideChar(CP_ACP, 0, tmp, tmp_len, NULL, 0);
+    unicode_text = heap_alloc(len*sizeof(WCHAR));
+    if(unicode_text)
+        MultiByteToWideChar(CP_ACP, 0, tmp, tmp_len, unicode_text, len);
+    heap_free(tmp);
+    return unicode_text;
+}
--- a/dlls/hhctrl.ocx/hhctrl.h
+++ b/dlls/hhctrl.ocx/hhctrl.h
@ -193,6 +193,8 @@ void ReleaseSearch(HHInfo *info) DECLSPEC_HIDDEN;

 LPCWSTR skip_schema(LPCWSTR url) DECLSPEC_HIDDEN;

+/* Decodes the HTML entities in an ANSI fragment of the given length and returns
+ * the result as a newly allocated, NUL-terminated Unicode string. */
+WCHAR *decode_html(const char *html_fragment, int html_fragment_len) DECLSPEC_HIDDEN;
+
 /* memory allocation functions */

 static inline void * __WINE_ALLOC_SIZE(1) heap_alloc(size_t len)