Improved x86-64 XMM register argument passing.

Also made XMM0-7 available for use as temporary registers, since they
are not used by the ABI. I'd like to do the same with RSI and RDI but
that's trickier since they can be used by gv() as temporary registers
and there isn't a way to disable that.
master
James Lyon 2013-04-19 22:05:49 +01:00
parent 946afd2343
commit cbce6d2bac
2 changed files with 119 additions and 76 deletions

View File

@ -764,7 +764,7 @@ ST_FUNC int gv(int rc)
#endif #endif
r = vtop->r & VT_VALMASK; r = vtop->r & VT_VALMASK;
rc2 = RC_INT; rc2 = (rc & RC_FLOAT) ? RC_FLOAT : RC_INT;
if (rc == RC_IRET) if (rc == RC_IRET)
rc2 = RC_LRET; rc2 = RC_LRET;
#ifdef TCC_TARGET_X86_64 #ifdef TCC_TARGET_X86_64

View File

@ -23,7 +23,7 @@
#ifdef TARGET_DEFS_ONLY #ifdef TARGET_DEFS_ONLY
/* number of available registers */ /* number of available registers */
#define NB_REGS 18 #define NB_REGS 24
#define NB_ASM_REGS 8 #define NB_ASM_REGS 8
/* a register can belong to several classes. The classes must be /* a register can belong to several classes. The classes must be
@ -34,13 +34,19 @@
#define RC_RAX 0x0004 #define RC_RAX 0x0004
#define RC_RCX 0x0008 #define RC_RCX 0x0008
#define RC_RDX 0x0010 #define RC_RDX 0x0010
#define RC_ST0 0x0080 /* only for long double */
#define RC_R8 0x0100 #define RC_R8 0x0100
#define RC_R9 0x0200 #define RC_R9 0x0200
#define RC_R10 0x0400 #define RC_R10 0x0400
#define RC_R11 0x0800 #define RC_R11 0x0800
#define RC_XMM0 0x0020 #define RC_XMM0 0x1000
#define RC_XMM1 0x0040 #define RC_XMM1 0x2000
#define RC_ST0 0x0080 /* only for long double */ #define RC_XMM2 0x4000
#define RC_XMM3 0x8000
#define RC_XMM4 0x10000
#define RC_XMM5 0x20000
#define RC_XMM6 0x40000
#define RC_XMM7 0x80000
#define RC_IRET RC_RAX /* function return: integer register */ #define RC_IRET RC_RAX /* function return: integer register */
#define RC_LRET RC_RDX /* function return: second integer register */ #define RC_LRET RC_RDX /* function return: second integer register */
#define RC_FRET RC_XMM0 /* function return: float register */ #define RC_FRET RC_XMM0 /* function return: float register */
@ -61,6 +67,12 @@ enum {
TREG_XMM0 = 16, TREG_XMM0 = 16,
TREG_XMM1 = 17, TREG_XMM1 = 17,
TREG_XMM2 = 18,
TREG_XMM3 = 19,
TREG_XMM4 = 20,
TREG_XMM5 = 21,
TREG_XMM6 = 22,
TREG_XMM7 = 23,
TREG_ST0 = 4, // SP slot won't be used TREG_ST0 = 4, // SP slot won't be used
@ -117,16 +129,22 @@ ST_DATA const int reg_classes[NB_REGS] = {
0, 0,
0, 0,
0, 0,
/*RC_INT |*/ RC_R8, RC_R8,
/*RC_INT |*/ RC_R9, RC_R9,
/*RC_INT |*/ RC_R10, RC_R10,
/*RC_INT |*/ RC_R11, RC_R11,
0, 0,
0, 0,
0, 0,
0, 0,
/* xmm0 */ RC_FLOAT | RC_XMM0, /* xmm0 */ RC_FLOAT | RC_XMM0,
/* xmm1 */ RC_FLOAT | RC_XMM1, /* xmm1 */ RC_FLOAT | RC_XMM1,
/* xmm2 */ RC_FLOAT | RC_XMM2,
/* xmm3 */ RC_FLOAT | RC_XMM3,
/* xmm4 */ RC_FLOAT | RC_XMM4,
/* xmm5 */ RC_FLOAT | RC_XMM5,
/* xmm6 */ RC_FLOAT | RC_XMM6,
/* xmm7 */ RC_FLOAT | RC_XMM7,
}; };
static unsigned long func_sub_sp_offset; static unsigned long func_sub_sp_offset;
@ -141,8 +159,6 @@ void g(int c)
section_realloc(cur_text_section, ind1); section_realloc(cur_text_section, ind1);
cur_text_section->data[ind] = c; cur_text_section->data[ind] = c;
ind = ind1; ind = ind1;
assert((ind < 4) || (cur_text_section->data[ind-4] != ('\362'&0xFF)) || (cur_text_section->data[ind-3] != '\017')
|| (cur_text_section->data[ind-2] != 'X') || (cur_text_section->data[ind-1] != '\001'));
} }
void o(unsigned int c) void o(unsigned int c)
@ -1055,32 +1071,48 @@ void gfunc_call(int nb_args)
oad(0xec81, args_size); /* sub $xxx, %rsp */ oad(0xec81, args_size); /* sub $xxx, %rsp */
} }
for(i = 0; i < nb_args; i++) { for(i = 0; i < nb_args;) {
/* Swap argument to top, it will possibly be changed here, /* Swap argument to top, it will possibly be changed here,
and might use more temps. All arguments must remain on the and might use more temps. At the end of the loop we keep
stack, so that get_reg can correctly evict some of them onto it on the stack and swap it back to its original position
stack. We could also use a vrott(nb_args) at the end if it is a register. */
of this loop, but this seems faster. */
SValue tmp = vtop[0]; SValue tmp = vtop[0];
vtop[0] = vtop[-i]; vtop[0] = vtop[-i];
vtop[-i] = tmp; vtop[-i] = tmp;
mode = classify_x86_64_arg(&vtop->type, NULL, &size, &reg_count); mode = classify_x86_64_arg(&vtop->type, NULL, &size, &reg_count);
switch (mode) {
case x86_64_mode_memory: int arg_stored = 1;
/* allocate the necessary size on stack */ switch (vtop->type.t & VT_BTYPE) {
o(0x48); case VT_STRUCT:
oad(0xec81, size); /* sub $xxx, %rsp */ if (mode == x86_64_mode_sse) {
/* generate structure store */ if (sse_reg > 8)
r = get_reg(RC_INT); sse_reg -= reg_count;
orex(1, r, 0, 0x89); /* mov %rsp, r */ else
o(0xe0 + REG_VALUE(r)); arg_stored = 0;
vset(&vtop->type, r | VT_LVAL, 0); } else if (mode == x86_64_mode_integer) {
vswap(); if (gen_reg > REGN)
vstore(); gen_reg -= reg_count;
args_size += size; else
arg_stored = 0;
}
if (arg_stored) {
/* allocate the necessary size on stack */
o(0x48);
oad(0xec81, size); /* sub $xxx, %rsp */
/* generate structure store */
r = get_reg(RC_INT);
orex(1, r, 0, 0x89); /* mov %rsp, r */
o(0xe0 + REG_VALUE(r));
vset(&vtop->type, r | VT_LVAL, 0);
vswap();
vstore();
args_size += size;
}
break; break;
case x86_64_mode_x87: case VT_LDOUBLE:
gv(RC_ST0); gv(RC_ST0);
size = LDOUBLE_SIZE; size = LDOUBLE_SIZE;
oad(0xec8148, size); /* sub $xxx, %rsp */ oad(0xec8148, size); /* sub $xxx, %rsp */
@ -1090,8 +1122,11 @@ void gfunc_call(int nb_args)
args_size += size; args_size += size;
break; break;
case x86_64_mode_sse: case VT_FLOAT:
case VT_DOUBLE:
assert(mode == x86_64_mode_sse);
if (sse_reg > 8) { if (sse_reg > 8) {
--sse_reg;
r = gv(RC_FLOAT); r = gv(RC_FLOAT);
o(0x50); /* push $rax */ o(0x50); /* push $rax */
/* movq %xmm0, (%rsp) */ /* movq %xmm0, (%rsp) */
@ -1099,26 +1134,39 @@ void gfunc_call(int nb_args)
o(0x04 + REG_VALUE(r)*8); o(0x04 + REG_VALUE(r)*8);
o(0x24); o(0x24);
args_size += size; args_size += size;
} else {
arg_stored = 0;
} }
sse_reg -= reg_count;
break; break;
case x86_64_mode_integer: default:
assert(mode == x86_64_mode_integer);
/* simple type */ /* simple type */
/* XXX: implicit cast ? */ /* XXX: implicit cast ? */
if (gen_reg > REGN) { if (gen_reg > REGN) {
--gen_reg;
r = gv(RC_INT); r = gv(RC_INT);
orex(0,r,0,0x50 + REG_VALUE(r)); /* push r */ orex(0,r,0,0x50 + REG_VALUE(r)); /* push r */
args_size += size; args_size += size;
} else {
arg_stored = 0;
} }
gen_reg -= reg_count;
break; break;
} }
/* And swap the argument back to its original position. */ /* And swap the argument back to its original position. */
tmp = vtop[0]; tmp = vtop[0];
vtop[0] = vtop[-i]; vtop[0] = vtop[-i];
vtop[-i] = tmp; vtop[-i] = tmp;
if (arg_stored) {
vrotb(i+1);
assert(vtop->type.t == tmp.type.t);
vpop();
--nb_args;
} else {
++i;
}
} }
/* XXX This should be superfluous. */ /* XXX This should be superfluous. */
@ -1128,55 +1176,50 @@ void gfunc_call(int nb_args)
Note that we cannot set RDX and RCX in this loop because gv() Note that we cannot set RDX and RCX in this loop because gv()
may break these temporary registers. Let's use R10 and R11 may break these temporary registers. Let's use R10 and R11
instead of them */ instead of them */
gen_reg = nb_reg_args; assert(gen_reg <= REGN);
sse_reg = nb_sse_args; assert(sse_reg <= 8);
for(i = 0; i < nb_args; i++) { for(i = 0; i < nb_args; i++) {
mode = classify_x86_64_arg(&vtop->type, &type, &size, &reg_count); mode = classify_x86_64_arg(&vtop->type, &type, &size, &reg_count);
/* Alter stack entry type so that gv() knows how to treat it */ /* Alter stack entry type so that gv() knows how to treat it */
vtop->type = type; vtop->type = type;
switch (mode) { if (mode == x86_64_mode_sse) {
default: if (reg_count == 2) {
break; sse_reg -= 2;
gv(RC_FRET); /* Use pair load into xmm0 & xmm1 */
case x86_64_mode_sse: if (sse_reg) { /* avoid redundant movaps %xmm0, %xmm0 */
sse_reg -= reg_count; /* movaps %xmm0, %xmmN */
if (sse_reg + reg_count <= 8) { o(0x280f);
gv(RC_FRET); /* only one float register */ o(0xc0 + (sse_reg << 3));
if (sse_reg) { /* avoid redundant movaps %xmm0, %xmm0 */ /* movaps %xmm1, %xmmN */
/* movaps %xmm0, %xmmN */ o(0x280f);
o(0x280f); o(0xc1 + ((sse_reg+1) << 3));
o(0xc0 + (sse_reg << 3)); }
if (reg_count == 2) { } else {
/* movaps %xmm1, %xmmN */ assert(reg_count == 1);
o(0x280f); --sse_reg;
o(0xc1 + ((sse_reg+1) << 3)); /* Load directly to register */
} gv(RC_XMM0 << sse_reg);
} }
} } else if (mode == x86_64_mode_integer) {
break;
case x86_64_mode_integer:
/* simple type */ /* simple type */
/* XXX: implicit cast ? */ /* XXX: implicit cast ? */
gen_reg -= reg_count; gen_reg -= reg_count;
if (gen_reg + reg_count <= REGN) { r = gv(RC_INT);
r = gv((reg_count == 1) ? RC_INT : RC_IRET); int d = arg_prepare_reg(gen_reg);
int d = arg_prepare_reg(gen_reg); orex(1,d,r,0x89); /* mov */
orex(1,d,r,0x89); /* mov */ o(0xc0 + REG_VALUE(r) * 8 + REG_VALUE(d));
o(0xc0 + REG_VALUE(r) * 8 + REG_VALUE(d)); if (reg_count == 2) {
if (reg_count == 2) { /* Second word of two-word value should always be in rdx
/* Second word of two-word value should always be in rdx this case is handled via RC_IRET */
this case is handled via RC_IRET */ d = arg_prepare_reg(gen_reg+1);
assert(vtop->r2 == TREG_RDX); orex(1,d,vtop->r2,0x89); /* mov */
d = arg_prepare_reg(gen_reg+1); o(0xc0 + REG_VALUE(vtop->r2) * 8 + REG_VALUE(d));
orex(1,d,vtop->r2,0x89); /* mov */
o(0xc0 + REG_VALUE(vtop->r2) * 8 + REG_VALUE(d));
}
} }
break;
} }
vtop--; vtop--;
} }
assert(gen_reg == 0);
assert(sse_reg == 0);
/* We shouldn't have many operands on the stack anymore, but the /* We shouldn't have many operands on the stack anymore, but the
call address itself is still there, and it might be in %eax call address itself is still there, and it might be in %eax