From 3e4c296ebab341bbe95974e39b589ea821f02789 Mon Sep 17 00:00:00 2001
From: Michael Matz
Date: Thu, 23 Feb 2017 00:16:25 +0100
Subject: [PATCH] x86-64-asm: Fix mov im64,rax encoding

The avoidance of mov im32->reg64 wasn't working when reg64 was rax.
While fixing this, also fix instructions which had the REX prefix
hardcoded in the opcode and therefore didn't support extended
registers, which would have added a second REX prefix.
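For reference, these are the encodings the fixed assembler is expected
to emit (byte sequences follow from the AMD64 encoding rules; they are
illustrations, not output captured from TCC):

  mov $0x12345678,%rax          -> 48 c7 c0 78 56 34 12           (0xc7 /0, im32)
  mov $0x123456789abcdef0,%rax  -> 48 b8 f0 de bc 9a 78 56 34 12  (movabs, im64)
  bswapq %r10                   -> 49 0f ca                       (computed REX.W+B)

Previously the first form matched the movabs pattern when the
destination was rax and came out as the 10-byte im64 encoding, and the
third form could not be encoded correctly because the hardcoded 0x48
opcode byte left no room to merge in REX.B. Two short C sketches after
the diff illustrate both mechanisms.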
---
 i386-asm.c      | 11 +++++++----
 tests/asmtest.S | 17 +++++++++++++++++
 x86_64-asm.h    | 18 +++++++++---------
 3 files changed, 33 insertions(+), 13 deletions(-)

diff --git a/i386-asm.c b/i386-asm.c
index 5011af5..0c0985e 100644
--- a/i386-asm.c
+++ b/i386-asm.c
@@ -42,6 +42,7 @@
 #define OPCT_IS(v,i) (((v) & OPCT_MASK) == (i))

 #define OPC_0F 0x100 /* Is secondary map (0x0f prefix) */
+#define OPC_48 0x200 /* Always has REX prefix */
 #ifdef TCC_TARGET_X86_64
 # define OPC_WLQ 0x1000 /* accepts w, l, q or no suffix */
 # define OPC_BWLQ (OPC_B | OPC_WLQ) /* accepts b, w, l, q or no suffix */
@@ -785,7 +786,7 @@ ST_FUNC void asm_opcode(TCCState *s1, int opcode)
                should only be done if we really have an >32bit imm64, and that
                is hardcoded.  Ignore it here.  */
             if (pa->opcode == 0xb0 && ops[0].type != OP_IM64
-                && ops[1].type == OP_REG64
+                && (ops[1].type & OP_REG) == OP_REG64
                 && !(pa->instr_type & OPC_0F))
                 continue;
 #endif
@@ -901,14 +902,16 @@ ST_FUNC void asm_opcode(TCCState *s1, int opcode)
         g(0x66);
 #ifdef TCC_TARGET_X86_64
     rex64 = 0;
-    if (s == 3 || (alltypes & OP_REG64)) {
+    if (pa->instr_type & OPC_48)
+        rex64 = 1;
+    else if (s == 3 || (alltypes & OP_REG64)) {
         /* generate REX prefix */
         int default64 = 0;
         for(i = 0; i < nb_ops; i++) {
-            if (op_type[i] == OP_REG64) {
+            if (op_type[i] == OP_REG64 && pa->opcode != 0xb8) {
                 /* If only 64bit regs are accepted in one operand
                    this is a default64 instruction without need for
-                   REX prefixes.  */
+                   REX prefixes, except for movabs(0xb8).  */
                 default64 = 1;
                 break;
             }
diff --git a/tests/asmtest.S b/tests/asmtest.S
index 280aeaf..5578705 100644
--- a/tests/asmtest.S
+++ b/tests/asmtest.S
@@ -114,12 +114,21 @@ notl %r15d

 	movzb	0x1000, %eax
 	movzb	0x1000, %ax
+	mov	$0x12345678,%eax
+
 #ifdef __x86_64__
 	movzb	0x1000, %rax
 	movzbq	0x1000, %rbx
 	movsbq	0x1000, %rdx
 	movzwq	0x1000, %rdi
 	movswq	0x1000, %rdx
+	movslq	%eax, %rcx
+	mov	$0x12345678,%rax
+	mov	$0x12345678,%rdx
+	mov	$0x12345678,%r10
+	mov	$0x123456789abcdef0,%rax
+	mov	$0x123456789abcdef0,%rcx
+	mov	$0x123456789abcdef0,%r11
 #endif

 #ifdef __i386__
@@ -546,6 +555,7 @@ invlpg 0x1000
 cmpxchg8b 0x1002
 #ifdef __x86_64__
 cmpxchg16b (%rax)
+cmpxchg16b (%r10,%r11)
 #endif

 fcmovb %st(5), %st
@@ -569,6 +579,7 @@ fucomip %st(5), %st
 cmovne %ax, %si
 #ifdef __x86_64__
 	bswapq	%rsi
+	bswapq	%r10
 	cmovz	%rdi,%rbx
 #endif

@@ -675,7 +686,9 @@ int $0x10
 	prefetchw (%rdi)
 	clflush 0x1000(%rax,%rcx)
 	fxsaveq (%rdx)
+	fxsaveq (%r11)
 	fxrstorq (%rcx)
+	fxrstorq (%r10)
 #endif

@@ -751,6 +764,9 @@ int $0x10
 	sidtq	0x1000

 	swapgs
+
+	str %rdx
+	str %r9
 #endif

 	lmsw 0x1000
@@ -879,6 +895,7 @@ overrideme:
 #ifdef __x86_64__
 	movq %rcx, %mm1
 	movq %rdx, %xmm2
+	movq %r13, %xmm3
 	/* movq mem64->xmm is encoded as f30f7e by GAS, but as 660f6e
 	   by tcc (which really is a movd and would need a REX.W prefix
 	   to be movq).  */
diff --git a/x86_64-asm.h b/x86_64-asm.h
index 675e7df..cb9eb16 100644
--- a/x86_64-asm.h
+++ b/x86_64-asm.h
@@ -106,8 +106,8 @@ ALT(DEF_ASM_OP2(movb, 0x8a, 0, OPC_MODRM | OPC_BWLX, OPT_EA | OPT_REG, OPT_REG))
    the full movabs form (64bit immediate).  For IM32->REG64 we prefer
    the 0xc7 opcode.  So disallow all 64bit forms and code the rest by hand. */
 ALT(DEF_ASM_OP2(movb, 0xb0, 0, OPC_REG | OPC_BWLX, OPT_IM, OPT_REG))
-ALT(DEF_ASM_OP2(mov,  0x48b8, 0, OPC_REG, OPT_IM64, OPT_REG64))
-ALT(DEF_ASM_OP2(movq, 0x48b8, 0, OPC_REG, OPT_IM64, OPT_REG64))
+ALT(DEF_ASM_OP2(mov,  0xb8, 0, OPC_REG, OPT_IM64, OPT_REG64))
+ALT(DEF_ASM_OP2(movq, 0xb8, 0, OPC_REG, OPT_IM64, OPT_REG64))
 ALT(DEF_ASM_OP2(movb, 0xc6, 0, OPC_MODRM | OPC_BWLX, OPT_IM, OPT_REG | OPT_EA))

 ALT(DEF_ASM_OP2(movw, 0x8c, 0, OPC_MODRM | OPC_WLX, OPT_SEG, OPT_EA | OPT_REG))
@@ -123,7 +123,7 @@ ALT(DEF_ASM_OP2(movsbl, 0x0fbe, 0, OPC_MODRM, OPT_REG8 | OPT_EA, OPT_REG32))
 ALT(DEF_ASM_OP2(movsbq, 0x0fbe, 0, OPC_MODRM, OPT_REG8 | OPT_EA, OPT_REGW))
 ALT(DEF_ASM_OP2(movswl, 0x0fbf, 0, OPC_MODRM, OPT_REG16 | OPT_EA, OPT_REG32))
 ALT(DEF_ASM_OP2(movswq, 0x0fbf, 0, OPC_MODRM, OPT_REG16 | OPT_EA, OPT_REG))
-ALT(DEF_ASM_OP2(movslq, 0x4863, 0, OPC_MODRM, OPT_REG32 | OPT_EA, OPT_REG))
+ALT(DEF_ASM_OP2(movslq, 0x63, 0, OPC_MODRM, OPT_REG32 | OPT_EA, OPT_REG))
 ALT(DEF_ASM_OP2(movzbw, 0x0fb6, 0, OPC_MODRM | OPC_WLX, OPT_REG8 | OPT_EA, OPT_REGW))
 ALT(DEF_ASM_OP2(movzwl, 0x0fb7, 0, OPC_MODRM, OPT_REG16 | OPT_EA, OPT_REG32))
 ALT(DEF_ASM_OP2(movzwq, 0x0fb7, 0, OPC_MODRM, OPT_REG16 | OPT_EA, OPT_REG))
@@ -354,8 +354,8 @@ ALT(DEF_ASM_OP1(fstsw, 0xdd, 7, OPC_MODRM | OPC_FWAIT, OPT_EA ))
    If the operand would use extended registers we would have to modify
    it instead of generating a second one.  Currently that's no problem
    with TCC, we don't use extended registers.  */
-    DEF_ASM_OP1(fxsaveq, 0x480fae, 0, OPC_MODRM, OPT_EA )
-    DEF_ASM_OP1(fxrstorq, 0x480fae, 1, OPC_MODRM, OPT_EA )
+    DEF_ASM_OP1(fxsaveq, 0x0fae, 0, OPC_MODRM | OPC_48, OPT_EA )
+    DEF_ASM_OP1(fxrstorq, 0x0fae, 1, OPC_MODRM | OPC_48, OPT_EA )

 /* segments */
 DEF_ASM_OP2(arpl, 0x63, 0, OPC_MODRM, OPT_REG16, OPT_REG16 | OPT_EA)
@@ -376,7 +376,7 @@ ALT(DEF_ASM_OP2(lslw, 0x0f03, 0, OPC_MODRM | OPC_WLX, OPT_EA | OPT_REG, OPT_REG)
 DEF_ASM_OP1(smsw, 0x0f01, 4, OPC_MODRM, OPT_REG | OPT_EA)
 DEF_ASM_OP1(str, 0x0f00, 1, OPC_MODRM, OPT_REG32 | OPT_EA)
 ALT(DEF_ASM_OP1(str, 0x660f00, 1, OPC_MODRM, OPT_REG16))
-ALT(DEF_ASM_OP1(str, 0x480f00, 1, OPC_MODRM, OPT_REG64))
+ALT(DEF_ASM_OP1(str, 0x0f00, 1, OPC_MODRM | OPC_48, OPT_REG64))
 DEF_ASM_OP1(verr, 0x0f00, 4, OPC_MODRM, OPT_REG | OPT_EA)
 DEF_ASM_OP1(verw, 0x0f00, 5, OPC_MODRM, OPT_REG | OPT_EA)
 DEF_ASM_OP0L(swapgs, 0x0f01, 7, OPC_MODRM)
@@ -385,7 +385,7 @@ ALT(DEF_ASM_OP1(str, 0x480f00, 1, OPC_MODRM, OPT_REG64))
 /* bswap can't be applied to 16bit regs */
 DEF_ASM_OP1(bswap, 0x0fc8, 0, OPC_REG, OPT_REG32 )
 DEF_ASM_OP1(bswapl, 0x0fc8, 0, OPC_REG, OPT_REG32 )
-DEF_ASM_OP1(bswapq, 0x480fc8, 0, OPC_REG, OPT_REG64 )
+DEF_ASM_OP1(bswapq, 0x0fc8, 0, OPC_REG | OPC_48, OPT_REG64 )

 ALT(DEF_ASM_OP2(xaddb, 0x0fc0, 0, OPC_MODRM | OPC_BWLX, OPT_REG, OPT_REG | OPT_EA ))
 ALT(DEF_ASM_OP2(cmpxchgb, 0x0fb0, 0, OPC_MODRM | OPC_BWLX, OPT_REG, OPT_REG | OPT_EA ))
@@ -395,7 +395,7 @@ ALT(DEF_ASM_OP2(cmpxchgb, 0x0fb0, 0, OPC_MODRM | OPC_BWLX, OPT_REG, OPT_REG | OP
 DEF_ASM_OP1(cmpxchg8b, 0x0fc7, 1, OPC_MODRM, OPT_EA )

 /* AMD 64 */
-DEF_ASM_OP1(cmpxchg16b, 0x480fc7, 1, OPC_MODRM, OPT_EA )
+DEF_ASM_OP1(cmpxchg16b, 0x0fc7, 1, OPC_MODRM | OPC_48, OPT_EA )

 /* pentium pro */
 ALT(DEF_ASM_OP2(cmovo, 0x0f40, 0, OPC_MODRM | OPC_TEST | OPC_WLX, OPT_REGW | OPT_EA, OPT_REGW))
@@ -420,7 +420,7 @@ ALT(DEF_ASM_OP2(cmovo, 0x0f40, 0, OPC_MODRM | OPC_TEST | OPC_WLX, OPT_REGW | OPT
 /* movd shouldn't accept REG64, but AMD64 spec uses it for
    32 and 64 bit moves, so let's be compatible.  */
 ALT(DEF_ASM_OP2(movd, 0x0f6e, 0, OPC_MODRM, OPT_EA | OPT_REG64, OPT_MMXSSE ))
-ALT(DEF_ASM_OP2(movq, 0x480f6e, 0, OPC_MODRM, OPT_REG64, OPT_MMXSSE ))
+ALT(DEF_ASM_OP2(movq, 0x0f6e, 0, OPC_MODRM | OPC_48, OPT_REG64, OPT_MMXSSE ))
 ALT(DEF_ASM_OP2(movq, 0x0f6f, 0, OPC_MODRM, OPT_EA | OPT_MMX, OPT_MMX ))
 ALT(DEF_ASM_OP2(movd, 0x0f7e, 0, OPC_MODRM, OPT_MMXSSE, OPT_EA | OPT_REG32 ))
 ALT(DEF_ASM_OP2(movd, 0x0f7e, 0, OPC_MODRM, OPT_MMXSSE, OPT_EA | OPT_REG64 ))
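A note on the OPC_48 mechanics, since several table entries above
change in the same way: REX is a single byte whose W, R, X and B bits
all have to be merged before it is emitted, so an instruction that
always needs REX.W cannot keep a literal 0x48 among its opcode bytes.
The following standalone sketch (plain C with invented names, not TCC
code) shows the merge for a bswapq-style instruction that encodes its
register in the opcode:

#include <stdio.h>

/* Stand-in for the flag the patch introduces: force REX.W. */
#define OPC_48 0x200

/* Compute the single REX byte for an instruction like bswap
   (0f c8+r).  reg is the numeric register, 0..15. */
static unsigned char rex_for(int instr_type, int reg)
{
    unsigned char rex = 0x40;
    if (instr_type & OPC_48)
        rex |= 0x08;             /* REX.W: 64-bit operand size */
    if (reg & 8)
        rex |= 0x01;             /* REX.B: extends the +r field */
    return rex;
}

int main(void)
{
    int regs[] = { 6 /* %rsi */, 10 /* %r10 */ };
    for (int i = 0; i < 2; i++) {
        int r = regs[i];
        printf("bswapq r%-2d -> %02x 0f %02x\n",
               r, rex_for(OPC_48, r), 0xc8 + (r & 7));
    }
    return 0;                    /* prints 48 0f ce and 49 0f ca */
}

With the old tables the 0x48 went out as an opcode byte, so %r10 would
have required a second REX in front of it, which is not a valid
encoding; computing the prefix as above folds W and B into one byte.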
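Separately, the i386-asm.c hunk replaces an equality test with a masked
compare. A toy model of why the old test missed %rax (the bit values
and names here are invented for illustration; TCC's actual operand-type
bits differ):

#include <stdio.h>

#define OP_REG64 0x0010  /* hypothetical 64-bit size-class bit */
#define OP_EAX   0x0100  /* hypothetical extra accumulator bit */
#define OP_REG   0x00f0  /* hypothetical mask of all size classes */

int main(void)
{
    unsigned rdx = OP_REG64;           /* plain 64-bit register */
    unsigned rax = OP_REG64 | OP_EAX;  /* accumulator carries an extra bit */

    /* Old test: fails for %rax, so mov $im32,%rax fell through to the
       movabs pattern and got the 10-byte im64 encoding. */
    printf("old: rdx %d, rax %d\n", rdx == OP_REG64, rax == OP_REG64);

    /* New test: mask down to the size class before comparing. */
    printf("new: rdx %d, rax %d\n",
           (rdx & OP_REG) == OP_REG64, (rax & OP_REG) == OP_REG64);
    return 0;
}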