Speedup enhancement for the 8->16 and 8->32 copy loop hotspots,

using a faster generic routine and some inline assembly for i386
(cleanups by Ove).
oldstable
Marcus Meissner 1999-06-05 08:45:32 +00:00 committed by Alexandre Julliard
parent eb2e77fd6f
commit 4ebd9d85aa
1 changed files with 54 additions and 12 deletions

View File

@ -3404,15 +3404,39 @@ static HRESULT WINAPI DGA_IDirectDrawImpl_SetDisplayMode(
static void pixel_convert_16_to_8(void *src, void *dst, DWORD width, DWORD height, LONG pitch, IDirectDrawPaletteImpl* palette) {
unsigned char *c_src = (unsigned char *) src;
unsigned short *c_dst = (unsigned short *) dst;
int x, y;
int y;
if (palette != NULL) {
unsigned short *pal = (unsigned short *) palette->screen_palents;
const unsigned short * pal = (unsigned short *) palette->screen_palents;
for (y = 0; y < height; y++) {
for (x = 0; x < width; x++) {
c_dst[x + y * width] = pal[c_src[x + y * pitch]];
}
for (y = height; y--; ) {
#ifdef __i386__
/* gcc generates slightly inefficient code for the the copy / lookup,
* it generates one excess memory access (to pal) per pixel. Since
* we know that pal is not modified by the memory write we can
* put it into a register and reduce the number of memory accesses
* from 4 to 3 pp. There are two xor eax,eax to avoid pipeline stalls.
* (This is not guaranteed to be the fastest method.)
*/
__asm__ __volatile__(
"xor %%eax,%%eax\n"
"1:\n"
" lodsb\n"
" movw (%%edx,%%eax,2),%%ax\n"
" stosw\n"
" xor %%eax,%%eax\n"
" loop 1b\n"
: "=S" (c_src), "=D" (c_dst)
: "S" (c_src), "D" (c_dst) , "c" (width), "d" (pal)
: "eax", "cc", "memory"
);
c_src+=(pitch-width);
#else
unsigned char * srclineend = c_src+width;
while (c_src < srclineend)
*c_dst++ = pal[*c_src++];
c_src+=(pitch-width);
#endif
}
} else {
WARN(ddraw, "No palette set...\n");
@ -3444,15 +3468,33 @@ static void palette_convert_15_to_8(LPPALETTEENTRY palent, void *screen_palette,
static void pixel_convert_32_to_8(void *src, void *dst, DWORD width, DWORD height, LONG pitch, IDirectDrawPaletteImpl* palette) {
unsigned char *c_src = (unsigned char *) src;
unsigned int *c_dst = (unsigned int *) dst;
int x, y;
int y;
if (palette != NULL) {
unsigned int *pal = (unsigned int *) palette->screen_palents;
const unsigned int *pal = (unsigned int *) palette->screen_palents;
for (y = 0; y < height; y++) {
for (x = 0; x < width; x++) {
c_dst[x + y * width] = pal[c_src[x + y * pitch]];
}
for (y = height; y--; ) {
#ifdef __i386__
/* See comment in pixel_convert_16_to_8 */
__asm__ __volatile__(
"xor %%eax,%%eax\n"
"1:\n"
" lodsb\n"
" movl (%%edx,%%eax,4),%%eax\n"
" stosl\n"
" xor %%eax,%%eax\n"
" loop 1b\n"
: "=S" (c_src), "=D" (c_dst)
: "S" (c_src), "D" (c_dst) , "c" (width), "d" (pal)
: "eax", "cc", "memory"
);
c_src+=(pitch-width);
#else
unsigned char * srclineend = c_src+width;
while (c_src < srclineend )
*c_dst++ = pal[*c_src++];
c_src+=(pitch-width);
#endif
}
} else {
WARN(ddraw, "No palette set...\n");