Add arm64 (AArch64) as a target architecture.

2015-02-13 18:58:31 +00:00 · 2015-02-13 18:58:31 +00:00 · b14ef0e24b
parent 738606dbd5
commit b14ef0e24b
16 changed files with 3265 additions and 40 deletions
--- a/20
+++ b/20
@ -70,6 +70,7 @@ NATIVE_DEFINES_$(CONFIG_arm) += -DTCC_TARGET_ARM
 NATIVE_DEFINES_$(CONFIG_arm_eabihf) += -DTCC_ARM_EABI -DTCC_ARM_HARDFLOAT
 NATIVE_DEFINES_$(CONFIG_arm_eabi) += -DTCC_ARM_EABI
 NATIVE_DEFINES_$(CONFIG_arm_vfp) += -DTCC_ARM_VFP
+NATIVE_DEFINES_$(CONFIG_arm64) += -DTCC_TARGET_ARM64
 NATIVE_DEFINES += $(NATIVE_DEFINES_yes)

 ifeq ($(TOP),.)
@ -86,6 +87,7 @@ ARM_VFP_CROSS = arm-linux-gnu-tcc$(EXESUF)
 ARM_EABI_CROSS = arm-linux-gnueabi-tcc$(EXESUF)
 ARM_EABIHF_CROSS = arm-linux-gnueabihf-tcc$(EXESUF)
 ARM_CROSS = $(ARM_FPA_CROSS) $(ARM_FPA_LD_CROSS) $(ARM_VFP_CROSS) $(ARM_EABI_CROSS)
+ARM64_CROSS = arm64-tcc$(EXESUF)
 C67_CROSS = c67-tcc$(EXESUF)

 # Legacy symlinks for cross compilers
@ -107,33 +109,39 @@ WIN64_FILES = $(CORE_FILES) x86_64-gen.c i386-asm.c x86_64-asm.h tccpe.c
 WINCE_FILES = $(CORE_FILES) arm-gen.c tccpe.c
 X86_64_FILES = $(CORE_FILES) x86_64-gen.c i386-asm.c x86_64-asm.h
 ARM_FILES = $(CORE_FILES) arm-gen.c
+ARM64_FILES = $(CORE_FILES) arm64-gen.c
 C67_FILES = $(CORE_FILES) c67-gen.c tcccoff.c

 ifdef CONFIG_WIN64
 PROGS+=tiny_impdef$(EXESUF) tiny_libmaker$(EXESUF)
 NATIVE_FILES=$(WIN64_FILES)
-PROGS_CROSS=$(WIN32_CROSS) $(I386_CROSS) $(X64_CROSS) $(ARM_CROSS) $(C67_CROSS) $(WINCE_CROSS)
+PROGS_CROSS=$(WIN32_CROSS) $(I386_CROSS) $(X64_CROSS) $(ARM_CROSS) $(ARM64_CROSS) $(C67_CROSS) $(WINCE_CROSS)
 LIBTCC1_CROSS=lib/i386-win32/libtcc1.a
 LIBTCC1=libtcc1.a
 else ifdef CONFIG_WIN32
 PROGS+=tiny_impdef$(EXESUF) tiny_libmaker$(EXESUF)
 NATIVE_FILES=$(WIN32_FILES)
-PROGS_CROSS=$(WIN64_CROSS) $(I386_CROSS) $(X64_CROSS) $(ARM_CROSS) $(C67_CROSS) $(WINCE_CROSS)
+PROGS_CROSS=$(WIN64_CROSS) $(I386_CROSS) $(X64_CROSS) $(ARM_CROSS) $(ARM64_CROSS) $(C67_CROSS) $(WINCE_CROSS)
 LIBTCC1_CROSS=lib/x86_64-win32/libtcc1.a
 LIBTCC1=libtcc1.a
 else ifeq ($(ARCH),i386)
 NATIVE_FILES=$(I386_FILES)
-PROGS_CROSS=$(X64_CROSS) $(WIN32_CROSS) $(WIN64_CROSS) $(ARM_CROSS) $(C67_CROSS) $(WINCE_CROSS)
+PROGS_CROSS=$(X64_CROSS) $(WIN32_CROSS) $(WIN64_CROSS) $(ARM_CROSS) $(ARM64_CROSS) $(C67_CROSS) $(WINCE_CROSS)
 LIBTCC1_CROSS=lib/i386-win32/libtcc1.a lib/x86_64-win32/libtcc1.a
 LIBTCC1=libtcc1.a
 else ifeq ($(ARCH),x86-64)
 NATIVE_FILES=$(X86_64_FILES)
-PROGS_CROSS=$(I386_CROSS) $(WIN32_CROSS) $(WIN64_CROSS) $(ARM_CROSS) $(C67_CROSS) $(WINCE_CROSS)
+PROGS_CROSS=$(I386_CROSS) $(WIN32_CROSS) $(WIN64_CROSS) $(ARM_CROSS) $(ARM64_CROSS) $(C67_CROSS) $(WINCE_CROSS)
 LIBTCC1_CROSS=lib/i386-win32/libtcc1.a lib/x86_64-win32/libtcc1.a lib/i386/libtcc1.a
 LIBTCC1=libtcc1.a
 else ifeq ($(ARCH),arm)
 NATIVE_FILES=$(ARM_FILES)
-PROGS_CROSS=$(I386_CROSS) $(X64_CROSS) $(WIN32_CROSS) $(WIN64_CROSS) $(C67_CROSS) $(WINCE_CROSS)
+PROGS_CROSS=$(I386_CROSS) $(X64_CROSS) $(WIN32_CROSS) $(WIN64_CROSS) $(ARM64_CROSS) $(C67_CROSS) $(WINCE_CROSS)
+LIBTCC1=libtcc1.a
+LIBTCC1_CROSS=lib/i386-win32/libtcc1.a lib/x86_64-win32/libtcc1.a lib/i386/libtcc1.a
+else ifeq ($(ARCH),arm64)
+NATIVE_FILES=$(ARM64_FILES)
+PROGS_CROSS=$(I386_CROSS) $(X64_CROSS) $(WIN32_CROSS) $(WIN64_CROSS) $(ARM_CROSS) $(C67_CROSS) $(WINCE_CROSS)
 LIBTCC1=libtcc1.a
 LIBTCC1_CROSS=lib/i386-win32/libtcc1.a lib/x86_64-win32/libtcc1.a lib/i386/libtcc1.a
 endif
@ -181,6 +189,7 @@ $(ARM_FPA_CROSS): DEFINES = -DTCC_TARGET_ARM
 $(ARM_FPA_LD_CROSS)$(EXESUF): DEFINES = -DTCC_TARGET_ARM -DLDOUBLE_SIZE=12
 $(ARM_VFP_CROSS): DEFINES = -DTCC_TARGET_ARM -DTCC_ARM_VFP -DCONFIG_MULTIARCHDIR="\"arm-linux-gnu\""
 $(ARM_EABI_CROSS): DEFINES = -DTCC_TARGET_ARM -DTCC_ARM_EABI -DTCC_ARM_VFP -DCONFIG_MULTIARCHDIR="\"arm-linux-gnueabi\""
+$(ARM64_CROSS): DEFINES = -DTCC_TARGET_ARM64

 $(I386_CROSS): $(I386_FILES)
 $(X64_CROSS): $(X86_64_FILES)
@ -189,6 +198,7 @@ $(WIN64_CROSS): $(WIN64_FILES)
 $(WINCE_CROSS): $(WINCE_FILES)
 $(C67_CROSS): $(C67_FILES)
 $(ARM_FPA_CROSS) $(ARM_FPA_LD_CROSS) $(ARM_VFP_CROSS) $(ARM_EABI_CROSS): $(ARM_FILES)
+$(ARM64_CROSS): $(ARM64_FILES)

 # libtcc generation and test
 ifndef ONE_SOURCE
--- a/arm64-gen.c
+++ b/arm64-gen.c
--- a/6
+++ b/6
@ -99,6 +99,9 @@ classify_cpu ()
      esac
      cpu="armv4l"
    ;;
+    aarch64)
+      cpu="aarch64"
+    ;;
    alpha)
      cpu="alpha"
    ;;
@ -435,6 +438,9 @@ elif test "$cpu" = "armv4l" ; then
  echo "ARCH=arm" >> config.mak
  echo "#define HOST_ARM 1" >> $TMPH
  echo "#define TCC_ARM_VERSION $cpuver" >> $TMPH
+elif test "$cpu" = "aarch64" ; then
+  echo "ARCH=arm64" >> config.mak
+  echo "#define HOST_ARM64 1" >> $TMPH
 elif test "$cpu" = "powerpc" ; then
  echo "ARCH=ppc" >> config.mak
  echo "#define HOST_PPC 1" >> $TMPH
--- a/conftest.c
+++ b/conftest.c
@ -7,6 +7,8 @@
 # define TRIPLET_ARCH "x86_64"
 #elif defined(__arm__)
 # define TRIPLET_ARCH "arm"
+#elif defined(__aarch64__)
+# define TRIPLET_ARCH "aarch64"
 #else
 # define TRIPLET_ARCH "unknown"
 #endif
--- a/elf.h
+++ b/elf.h
@ -2336,6 +2336,117 @@ typedef Elf32_Addr Elf32_Conflict;
 #define R_AARCH64_NONE            0	/* No relocation.  */
 #define R_AARCH64_ABS64         257	/* Direct 64 bit. */
 #define R_AARCH64_ABS32         258	/* Direct 32 bit.  */
+#define R_AARCH64_ABS16         259	/* Direct 16-bit.  */
+#define R_AARCH64_PREL64        260	/* PC-relative 64-bit.  */
+#define R_AARCH64_PREL32        261	/* PC-relative 32-bit.  */
+#define R_AARCH64_PREL16        262	/* PC-relative 16-bit.  */
+#define R_AARCH64_MOVW_UABS_G0  263	/* Dir. MOVZ imm. from bits 15:0.  */
+#define R_AARCH64_MOVW_UABS_G0_NC 264	/* Likewise for MOVK; no check.  */
+#define R_AARCH64_MOVW_UABS_G1  265	/* Dir. MOVZ imm. from bits 31:16.  */
+#define R_AARCH64_MOVW_UABS_G1_NC 266	/* Likewise for MOVK; no check.  */
+#define R_AARCH64_MOVW_UABS_G2  267	/* Dir. MOVZ imm. from bits 47:32.  */
+#define R_AARCH64_MOVW_UABS_G2_NC 268	/* Likewise for MOVK; no check.  */
+#define R_AARCH64_MOVW_UABS_G3  269	/* Dir. MOV{K,Z} imm. from 63:48.  */
+#define R_AARCH64_MOVW_SABS_G0  270	/* Dir. MOV{N,Z} imm. from 15:0.  */
+#define R_AARCH64_MOVW_SABS_G1  271	/* Dir. MOV{N,Z} imm. from 31:16.  */
+#define R_AARCH64_MOVW_SABS_G2  272	/* Dir. MOV{N,Z} imm. from 47:32.  */
+#define R_AARCH64_LD_PREL_LO19  273	/* PC-rel. LD imm. from bits 20:2.  */
+#define R_AARCH64_ADR_PREL_LO21 274	/* PC-rel. ADR imm. from bits 20:0.  */
+#define R_AARCH64_ADR_PREL_PG_HI21 275	/* Page-rel. ADRP imm. from 32:12.  */
+#define R_AARCH64_ADR_PREL_PG_HI21_NC 276	/* Likewise; no overflow check.  */
+#define R_AARCH64_ADD_ABS_LO12_NC 277	/* Dir. ADD imm. from bits 11:0.  */
+#define R_AARCH64_LDST8_ABS_LO12_NC 278	/* Likewise for LD/ST; no check. */
+#define R_AARCH64_TSTBR14       279	/* PC-rel. TBZ/TBNZ imm. from 15:2.  */
+#define R_AARCH64_CONDBR19      280	/* PC-rel. cond. br. imm. from 20:2. */
+#define R_AARCH64_JUMP26        282	/* PC-rel. B imm. from bits 27:2.  */
+#define R_AARCH64_CALL26        283	/* Likewise for CALL.  */
+#define R_AARCH64_LDST16_ABS_LO12_NC 284	/* Dir. ADD imm. from bits 11:1.  */
+#define R_AARCH64_LDST32_ABS_LO12_NC 285	/* Likewise for bits 11:2.  */
+#define R_AARCH64_LDST64_ABS_LO12_NC 286	/* Likewise for bits 11:3.  */
+#define R_AARCH64_MOVW_PREL_G0  287	/* PC-rel. MOV{N,Z} imm. from 15:0.  */
+#define R_AARCH64_MOVW_PREL_G0_NC 288	/* Likewise for MOVK; no check.  */
+#define R_AARCH64_MOVW_PREL_G1  289	/* PC-rel. MOV{N,Z} imm. from 31:16. */
+#define R_AARCH64_MOVW_PREL_G1_NC 290	/* Likewise for MOVK; no check.  */
+#define R_AARCH64_MOVW_PREL_G2  291	/* PC-rel. MOV{N,Z} imm. from 47:32. */
+#define R_AARCH64_MOVW_PREL_G2_NC 292	/* Likewise for MOVK; no check.  */
+#define R_AARCH64_MOVW_PREL_G3  293	/* PC-rel. MOV{N,Z} imm. from 63:48. */
+#define R_AARCH64_LDST128_ABS_LO12_NC 299	/* Dir. ADD imm. from bits 11:4.  */
+#define R_AARCH64_MOVW_GOTOFF_G0 300	/* GOT-rel. off. MOV{N,Z} imm. 15:0. */
+#define R_AARCH64_MOVW_GOTOFF_G0_NC 301	/* Likewise for MOVK; no check.  */
+#define R_AARCH64_MOVW_GOTOFF_G1 302	/* GOT-rel. o. MOV{N,Z} imm. 31:16.  */
+#define R_AARCH64_MOVW_GOTOFF_G1_NC 303	/* Likewise for MOVK; no check.  */
+#define R_AARCH64_MOVW_GOTOFF_G2 304	/* GOT-rel. o. MOV{N,Z} imm. 47:32.  */
+#define R_AARCH64_MOVW_GOTOFF_G2_NC 305	/* Likewise for MOVK; no check.  */
+#define R_AARCH64_MOVW_GOTOFF_G3 306	/* GOT-rel. o. MOV{N,Z} imm. 63:48.  */
+#define R_AARCH64_GOTREL64      307	/* GOT-relative 64-bit.  */
+#define R_AARCH64_GOTREL32      308	/* GOT-relative 32-bit.  */
+#define R_AARCH64_GOT_LD_PREL19 309	/* PC-rel. GOT off. load imm. 20:2.  */
+#define R_AARCH64_LD64_GOTOFF_LO15 310	/* GOT-rel. off. LD/ST imm. 14:3.  */
+#define R_AARCH64_ADR_GOT_PAGE  311	/* P-page-rel. GOT off. ADRP 32:12.  */
+#define R_AARCH64_LD64_GOT_LO12_NC 312	/* Dir. GOT off. LD/ST imm. 11:3.  */
+#define R_AARCH64_LD64_GOTPAGE_LO15 313	/* GOT-page-rel. GOT off. LD/ST 14:3 */
+#define R_AARCH64_TLSGD_ADR_PREL21 512	/* PC-relative ADR imm. 20:0.  */
+#define R_AARCH64_TLSGD_ADR_PAGE21 513	/* page-rel. ADRP imm. 32:12.  */
+#define R_AARCH64_TLSGD_ADD_LO12_NC 514	/* direct ADD imm. from 11:0.  */
+#define R_AARCH64_TLSGD_MOVW_G1 515	/* GOT-rel. MOV{N,Z} 31:16.  */
+#define R_AARCH64_TLSGD_MOVW_G0_NC 516	/* GOT-rel. MOVK imm. 15:0.  */
+#define R_AARCH64_TLSLD_ADR_PREL21 517	/* Like 512; local dynamic model.  */
+#define R_AARCH64_TLSLD_ADR_PAGE21 518	/* Like 513; local dynamic model.  */
+#define R_AARCH64_TLSLD_ADD_LO12_NC 519	/* Like 514; local dynamic model.  */
+#define R_AARCH64_TLSLD_MOVW_G1 520	/* Like 515; local dynamic model.  */
+#define R_AARCH64_TLSLD_MOVW_G0_NC 521	/* Like 516; local dynamic model.  */
+#define R_AARCH64_TLSLD_LD_PREL19 522	/* TLS PC-rel. load imm. 20:2.  */
+#define R_AARCH64_TLSLD_MOVW_DTPREL_G2 523	/* TLS DTP-rel. MOV{N,Z} 47:32.  */
+#define R_AARCH64_TLSLD_MOVW_DTPREL_G1 524	/* TLS DTP-rel. MOV{N,Z} 31:16.  */
+#define R_AARCH64_TLSLD_MOVW_DTPREL_G1_NC 525	/* Likewise; MOVK; no check.  */
+#define R_AARCH64_TLSLD_MOVW_DTPREL_G0 526	/* TLS DTP-rel. MOV{N,Z} 15:0.  */
+#define R_AARCH64_TLSLD_MOVW_DTPREL_G0_NC 527	/* Likewise; MOVK; no check.  */
+#define R_AARCH64_TLSLD_ADD_DTPREL_HI12 528	/* DTP-rel. ADD imm. from 23:12. */
+#define R_AARCH64_TLSLD_ADD_DTPREL_LO12 529	/* DTP-rel. ADD imm. from 11:0.  */
+#define R_AARCH64_TLSLD_ADD_DTPREL_LO12_NC 530	/* Likewise; no ovfl. check.  */
+#define R_AARCH64_TLSLD_LDST8_DTPREL_LO12 531	/* DTP-rel. LD/ST imm. 11:0.  */
+#define R_AARCH64_TLSLD_LDST8_DTPREL_LO12_NC 532	/* Likewise; no check.  */
+#define R_AARCH64_TLSLD_LDST16_DTPREL_LO12 533	/* DTP-rel. LD/ST imm. 11:1.  */
+#define R_AARCH64_TLSLD_LDST16_DTPREL_LO12_NC 534	/* Likewise; no check.  */
+#define R_AARCH64_TLSLD_LDST32_DTPREL_LO12 535	/* DTP-rel. LD/ST imm. 11:2.  */
+#define R_AARCH64_TLSLD_LDST32_DTPREL_LO12_NC 536	/* Likewise; no check.  */
+#define R_AARCH64_TLSLD_LDST64_DTPREL_LO12 537	/* DTP-rel. LD/ST imm. 11:3.  */
+#define R_AARCH64_TLSLD_LDST64_DTPREL_LO12_NC 538	/* Likewise; no check.  */
+#define R_AARCH64_TLSIE_MOVW_GOTTPREL_G1 539	/* GOT-rel. MOV{N,Z} 31:16.  */
+#define R_AARCH64_TLSIE_MOVW_GOTTPREL_G0_NC 540	/* GOT-rel. MOVK 15:0.  */
+#define R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21 541	/* Page-rel. ADRP 32:12.  */
+#define R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC 542	/* Direct LD off. 11:3.  */
+#define R_AARCH64_TLSIE_LD_GOTTPREL_PREL19 543	/* PC-rel. load imm. 20:2.  */
+#define R_AARCH64_TLSLE_MOVW_TPREL_G2 544	/* TLS TP-rel. MOV{N,Z} 47:32.  */
+#define R_AARCH64_TLSLE_MOVW_TPREL_G1 545	/* TLS TP-rel. MOV{N,Z} 31:16.  */
+#define R_AARCH64_TLSLE_MOVW_TPREL_G1_NC 546	/* Likewise; MOVK; no check.  */
+#define R_AARCH64_TLSLE_MOVW_TPREL_G0 547	/* TLS TP-rel. MOV{N,Z} 15:0.  */
+#define R_AARCH64_TLSLE_MOVW_TPREL_G0_NC 548	/* Likewise; MOVK; no check.  */
+#define R_AARCH64_TLSLE_ADD_TPREL_HI12 549	/* TP-rel. ADD imm. 23:12.  */
+#define R_AARCH64_TLSLE_ADD_TPREL_LO12 550	/* TP-rel. ADD imm. 11:0.  */
+#define R_AARCH64_TLSLE_ADD_TPREL_LO12_NC 551	/* Likewise; no ovfl. check.  */
+#define R_AARCH64_TLSLE_LDST8_TPREL_LO12 552	/* TP-rel. LD/ST off. 11:0.  */
+#define R_AARCH64_TLSLE_LDST8_TPREL_LO12_NC 553	/* Likewise; no ovfl. check. */
+#define R_AARCH64_TLSLE_LDST16_TPREL_LO12 554	/* TP-rel. LD/ST off. 11:1.  */
+#define R_AARCH64_TLSLE_LDST16_TPREL_LO12_NC 555	/* Likewise; no check.  */
+#define R_AARCH64_TLSLE_LDST32_TPREL_LO12 556	/* TP-rel. LD/ST off. 11:2.  */
+#define R_AARCH64_TLSLE_LDST32_TPREL_LO12_NC 557	/* Likewise; no check.  */
+#define R_AARCH64_TLSLE_LDST64_TPREL_LO12 558	/* TP-rel. LD/ST off. 11:3.  */
+#define R_AARCH64_TLSLE_LDST64_TPREL_LO12_NC 559	/* Likewise; no check.  */
+#define R_AARCH64_TLSDESC_LD_PREL19 560	/* PC-rel. load immediate 20:2.  */
+#define R_AARCH64_TLSDESC_ADR_PREL21 561	/* PC-rel. ADR immediate 20:0.  */
+#define R_AARCH64_TLSDESC_ADR_PAGE21 562	/* Page-rel. ADRP imm. 32:12.  */
+#define R_AARCH64_TLSDESC_LD64_LO12 563	/* Direct LD off. from 11:3.  */
+#define R_AARCH64_TLSDESC_ADD_LO12 564	/* Direct ADD imm. from 11:0.  */
+#define R_AARCH64_TLSDESC_OFF_G1 565	/* GOT-rel. MOV{N,Z} imm. 31:16.  */
+#define R_AARCH64_TLSDESC_OFF_G0_NC 566	/* GOT-rel. MOVK imm. 15:0; no ck.  */
+#define R_AARCH64_TLSDESC_LDR   567	/* Relax LDR.  */
+#define R_AARCH64_TLSDESC_ADD   568	/* Relax ADD.  */
+#define R_AARCH64_TLSDESC_CALL  569	/* Relax BLR.  */
+#define R_AARCH64_TLSLE_LDST128_TPREL_LO12 570	/* TP-rel. LD/ST off. 11:4.  */
+#define R_AARCH64_TLSLE_LDST128_TPREL_LO12_NC 571	/* Likewise; no check.  */
+#define R_AARCH64_TLSLD_LDST128_DTPREL_LO12 572	/* DTP-rel. LD/ST imm. 11:4. */
+#define R_AARCH64_TLSLD_LDST128_DTPREL_LO12_NC 573	/* Likewise; no check.  */
 #define R_AARCH64_COPY         1024	/* Copy symbol at runtime.  */
 #define R_AARCH64_GLOB_DAT     1025	/* Create GOT entry.  */
 #define R_AARCH64_JUMP_SLOT    1026	/* Create PLT entry.  */
@ -2344,6 +2455,7 @@ typedef Elf32_Addr Elf32_Conflict;
 #define R_AARCH64_TLS_DTPREL64 1029	/* Module-relative offset, 64 bit.  */
 #define R_AARCH64_TLS_TPREL64  1030	/* TP-relative offset, 64 bit.  */
 #define R_AARCH64_TLSDESC      1031	/* TLS Descriptor.  */
+#define R_AARCH64_IRELATIVE    1032	/* STT_GNU_IFUNC relocation.  */

 /* ARM relocs.  */

--- a/include/stdarg.h
+++ b/include/stdarg.h
@ -46,6 +46,19 @@ typedef char *va_list;
 #define va_copy(dest, src) (dest) = (src)
 #define va_end(ap)

+#elif defined(__aarch64__)
+/* AArch64: va_list is a five-field record rather than a plain pointer.
+   NOTE(review): the field names match the va_list described in the ARM
+   64-bit procedure call standard -- confirm against the code generator. */
+typedef struct {
+    void *__stack;
+    void *__gr_top;
+    void *__vr_top;
+    int   __gr_offs;
+    int   __vr_offs;
+} va_list;
+/* va_start and va_arg forward to __va_start/__va_arg, which are not
+   defined in this header (presumably compiler built-ins -- TODO confirm). */
+#define va_start(ap, last) __va_start(ap, last)
+#define va_arg(ap, type) __va_arg(ap, type)
+/* va_end expands to nothing. */
+#define va_end(ap)
+/* Copying is a plain structure assignment. */
+#define va_copy(dest, src) ((dest) = (src))
+
 #else /* __i386__ */
 typedef char *va_list;
 /* only correct for i386 */
--- a/lib/Makefile
+++ b/lib/Makefile
@ -28,6 +28,11 @@ ifndef TARGET # native library
 ifeq ($(ARCH),arm)
  TARGET = arm
  XCC = $(CC)
+ else
+ ifeq ($(ARCH),arm64)
+  TARGET = arm64
+ else
+ endif
 endif
 endif
 endif
@ -49,6 +54,7 @@ X86_64_O = libtcc1.o alloca86_64.o
 ARM_O = libtcc1.o armeabi.o alloca-arm.o
 WIN32_O = $(I386_O) crt1.o wincrt1.o dllcrt1.o dllmain.o chkstk.o
 WIN64_O = $(X86_64_O) crt1.o wincrt1.o dllcrt1.o dllmain.o chkstk.o
+ARM64_O = lib-arm64.o

 # build TCC runtime library to contain PIC code, so it can be linked
 # into shared libraries
@ -86,6 +92,11 @@ ifeq "$(TARGET)" "arm"
 OBJ = $(addprefix $(DIR)/,$(ARM_O))
 TGT = -DTCC_TARGET_ARM
 XCC ?= $(TCC) -B$(TOP)
+else
+ifeq "$(TARGET)" "arm64"
+ OBJ = $(addprefix $(DIR)/,$(ARM64_O))
+ TGT = -DTCC_TARGET_ARM64
+ XCC ?= $(TCC) -B$(TOP)
 else
 $(error libtcc1.a not supported on target '$(TARGET)')
 endif
@ -93,6 +104,7 @@ endif
 endif
 endif
 endif
+endif

 XFLAGS = $(CPPFLAGS) $(CFLAGS) $(PICFLAGS) $(TGT)

--- a/lib/lib-arm64.c
+++ b/lib/lib-arm64.c
@ -0,0 +1,652 @@
+/*
+ *  TCC runtime library for arm64.
+ *
+ *  Copyright (c) 2015 Edmund Grimley Evans
+ *
+ * Copying and distribution of this file, with or without modification,
+ * are permitted in any medium without royalty provided the copyright
+ * notice and this notice are preserved.  This file is offered as-is,
+ * without any warranty.
+ */
+
+#include <stdint.h>
+#include <string.h>
+
+// Placeholder: the body is empty apart from a build-time #warning, so a
+// call currently does nothing with the [beg, end) arguments.
+void __clear_cache(char *beg, char *end)
+{
+#warning __clear_cache not yet implemented
+}
+
+// Raw 128-bit image of a long double, copied in and out with memcpy.
+// x1 holds the sign (bit 63), the 15-bit exponent (bits 62:48) and the top
+// 48 fraction bits; x0 holds the low 64 fraction bits.
+typedef struct {
+    uint64_t x0, x1;
+} u128_t;
+
+// Return a zero whose sign bit is sgn (0 or 1).
+static long double f3_zero(int sgn)
+{
+    u128_t bits;
+    long double result;
+
+    bits.x0 = 0;
+    bits.x1 = (uint64_t)sgn << 63;
+    memcpy(&result, &bits, 16);
+    return result;
+}
+
+// Return an infinity (all-ones exponent, zero fraction) with sign bit sgn.
+static long double f3_infinity(int sgn)
+{
+    u128_t bits;
+    long double result;
+
+    bits.x0 = 0;
+    bits.x1 = (uint64_t)sgn << 63 | 0x7fff000000000000;
+    memcpy(&result, &bits, 16);
+    return result;
+}
+
+// Return the NaN produced for invalid operations (e.g. inf - inf).
+static long double f3_NaN(void)
+{
+#if 0
+    // ARM's default NaN usually has just the top fraction bit set:
+    const u128_t pattern = {  0, 0x7fff800000000000 };
+#else
+    // GCC's library sets all fraction bits:
+    const u128_t pattern = { -1, 0x7fffffffffffffff };
+#endif
+    long double result;
+
+    memcpy(&result, &pattern, 16);
+    return result;
+}
+
+// Store into *f a NaN built from the unpacked mantissa mnt: the exponent
+// field and the top fraction bit (bit 47 of x1) are forced to one and the
+// sign is taken from sgn. Always returns 1.
+static int fp3_convert_NaN(long double *f, int sgn, u128_t mnt)
+{
+    u128_t quiet;
+
+    quiet.x0 = mnt.x0;
+    quiet.x1 = mnt.x1 | 0x7fff800000000000 | (uint64_t)sgn << 63;
+    memcpy(f, &quiet, 16);
+    return 1;
+}
+
+// If either unpacked operand is a NaN, write the result NaN to *f and
+// return 1; otherwise return 0 and leave *f untouched.
+// Priority: signalling a, signalling b, quiet a, quiet b.
+static int fp3_detect_NaNs(long double *f,
+                           int a_sgn, int a_exp, u128_t a,
+                           int b_sgn, int b_exp, u128_t b)
+{
+    // A NaN has the maximum exponent and a non-zero fraction (the shift
+    // by 16 discards the implicit bit that unpacking placed at bit 48):
+    int a_is_nan = a_exp == 32767 && (a.x0 | a.x1 << 16);
+    int b_is_nan = b_exp == 32767 && (b.x0 | b.x1 << 16);
+
+    // Signalling NaNs (top fraction bit clear) take precedence:
+    if (a_is_nan && !(a.x1 >> 47 & 1))
+        return fp3_convert_NaN(f, a_sgn, a);
+    if (b_is_nan && !(b.x1 >> 47 & 1))
+        return fp3_convert_NaN(f, b_sgn, b);
+
+    // Then quiet NaNs:
+    if (a_is_nan)
+        return fp3_convert_NaN(f, a_sgn, a);
+    if (b_is_nan)
+        return fp3_convert_NaN(f, b_sgn, b);
+
+    return 0;
+}
+
+// Split f into sign (*sgn), biased exponent (*exp) and mantissa (*mnt).
+// For a non-zero exponent field the implicit leading one is inserted at
+// bit 48 of mnt->x1; for a zero exponent field (zero or subnormal) no bit
+// is inserted and *exp is reported as 1.
+static void f3_unpack(int *sgn, int32_t *exp, u128_t *mnt, long double f)
+{
+    u128_t raw;
+    int32_t biased;
+
+    memcpy(&raw, &f, 16);
+    *sgn = raw.x1 >> 63;
+    biased = raw.x1 >> 48 & 32767;
+    raw.x1 &= 0x0000ffffffffffff;
+    if (biased != 0)
+        raw.x1 |= (uint64_t)1 << 48;
+    *exp = biased ? biased : 1;
+    *mnt = raw;
+}
+
+// Shift a non-zero 128-bit mantissa left until bit 63 of x1 is set and
+// subtract the number of positions shifted from *exp. A zero mantissa is
+// returned unchanged and *exp is not modified.
+static u128_t f3_normalise(int32_t *exp, u128_t mnt)
+{
+    int sh;
+    if (!(mnt.x0 | mnt.x1))
+        return mnt;
+    // High word empty: move the low word up by a whole 64 bits first.
+    if (!mnt.x1) {
+        mnt.x1 = mnt.x0;
+        mnt.x0 = 0;
+        *exp -= 64;
+    }
+    // Binary search over shift amounts 32, 16, 8, 4, 2, 1: shift whenever
+    // the top sh bits of the high word are all zero.
+    for (sh = 32; sh; sh >>= 1) {
+        if (!(mnt.x1 >> (64 - sh))) {
+            mnt.x1 = mnt.x1 << sh | mnt.x0 >> (64 - sh);
+            mnt.x0 = mnt.x0 << sh;
+            *exp -= sh;
+        }
+    }
+    return mnt;
+}
+
+// Shift x right by sh bits, ORing into bit 0 of the result a "sticky" flag
+// that is set if any one bit was shifted out. A shift count of zero or
+// less returns x unchanged.
+static u128_t f3_sticky_shift(int32_t sh, u128_t x)
+{
+  // Everything is shifted out: only the sticky flag remains.
+  if (sh >= 128) {
+      x.x0 = !!(x.x0 | x.x1);
+      x.x1 = 0;
+      return x;
+  }
+  // Whole-word step; the remaining 0..63 bits are handled below.
+  if (sh >= 64) {
+      x.x0 = x.x1 | !!x.x0;
+      x.x1 = 0;
+      sh -= 64;
+  }
+  // Guarded by sh > 0 so that the (64 - sh) shifts never shift by 64.
+  if (sh > 0) {
+      x.x0 = x.x0 >> sh | x.x1 << (64 - sh) | !!(x.x0 << (64 - sh));
+      x.x1 = x.x1 >> sh;
+  }
+  return x;
+}
+
+// Round a left-aligned mantissa to nearest (ties to even) and pack it
+// together with sgn and the biased exponent exp into a long double.
+// The callers in this file pass x with its leading one at bit 63 of x1.
+// exp <= 0 denotes a result in the subnormal range; exp >= 32767 after
+// rounding yields an infinity of the given sign.
+static long double f3_round(int sgn, int32_t exp, u128_t x)
+{
+    long double f;
+    int error;
+
+    // Move the mantissa down so that two extra low-order bits remain
+    // below the final position (15 bits in total for a normal result).
+    // A non-positive exponent shifts further and packs as exponent 0.
+    if (exp > 0) {
+        x = f3_sticky_shift(13, x);
+    }
+    else {
+        x = f3_sticky_shift(14 - exp, x);
+        exp = 0;
+    }
+
+    // error: bit 1 is the half-ulp bit, bit 0 the sticky bit.
+    error = x.x0 & 3;
+    x.x0 = x.x0 >> 2 | x.x1 << 62;
+    x.x1 = x.x1 >> 2;
+
+    // Round up when above half, or exactly half with an odd mantissa.
+    if (error == 3 || ((error == 2) & (x.x0 & 1))) {
+        if (!++x.x0) {
+            // Carry out of the low word.
+            ++x.x1;
+            // Subnormal rounded up to the smallest normal number:
+            if (x.x1 == (uint64_t)1 << 48)
+                exp = 1;
+            // Mantissa overflowed: renormalise and bump the exponent.
+            else if (x.x1 == (uint64_t)1 << 49) {
+                ++exp;
+                x.x0 = x.x0 >> 1 | x.x1 << 63;
+                x.x1 = x.x1 >> 1;
+            }
+        }
+    }
+
+    if (exp >= 32767)
+        return f3_infinity(sgn);
+
+    // Drop the implicit bit (bit 48) and insert exponent and sign.
+    x.x1 = x.x1 << 16 >> 16 | (uint64_t)exp << 48 | (uint64_t)sgn << 63;
+    memcpy(&f, &x, 16);
+    return f;
+}
+
+// Shared implementation of addition and subtraction: returns fa + fb when
+// neg is 0 and fa - fb when neg is 1 (neg is XORed into the sign of fb).
+static long double f3_add(long double fa, long double fb, int neg)
+{
+    u128_t a, b, x;
+    int32_t a_exp, b_exp, x_exp;
+    int a_sgn, b_sgn, x_sgn;
+    long double fx;
+
+    f3_unpack(&a_sgn, &a_exp, &a, fa);
+    f3_unpack(&b_sgn, &b_exp, &b, fb);
+
+    // NaN operands are handled before the sign of b is flipped.
+    if (fp3_detect_NaNs(&fx, a_sgn, a_exp, a, b_sgn, b_exp, b))
+        return fx;
+
+    b_sgn ^= neg;
+
+    // Handle infinities and zeroes:
+    if (a_exp == 32767 && b_exp == 32767 && a_sgn != b_sgn)
+        return f3_NaN();
+    if (a_exp == 32767)
+        return f3_infinity(a_sgn);
+    if (b_exp == 32767)
+        return f3_infinity(b_sgn);
+    // Both operands zero: the result is -0 only if both are -0.
+    if (!(a.x0 | a.x1 | b.x0 | b.x1))
+        return f3_zero(a_sgn & b_sgn);
+
+    // Make room for three extra low-order bits in each mantissa.
+    a.x1 = a.x1 << 3 | a.x0 >> 61;
+    a.x0 = a.x0 << 3;
+    b.x1 = b.x1 << 3 | b.x0 >> 61;
+    b.x0 = b.x0 << 3;
+
+    // Align the operand with the smaller exponent to the larger one,
+    // keeping a sticky bit for whatever is shifted out.
+    if (a_exp <= b_exp) {
+        a = f3_sticky_shift(b_exp - a_exp, a);
+        a_exp = b_exp;
+    }
+    else {
+        b = f3_sticky_shift(a_exp - b_exp, b);
+        b_exp = a_exp;
+    }
+
+    x_sgn = a_sgn;
+    x_exp = a_exp;
+    if (a_sgn == b_sgn) {
+        // 128-bit add with carry from the low word.
+        x.x0 = a.x0 + b.x0;
+        x.x1 = a.x1 + b.x1 + (x.x0 < a.x0);
+    }
+    else {
+        // 128-bit subtract with borrow; negate if the result went negative.
+        x.x0 = a.x0 - b.x0;
+        x.x1 = a.x1 - b.x1 - (x.x0 > a.x0);
+        if (x.x1 >> 63) {
+            x_sgn ^= 1;
+            x.x0 = -x.x0;
+            x.x1 = -x.x1 - !!x.x0;
+        }
+    }
+
+    // Exact cancellation gives +0.
+    if (!(x.x0 | x.x1))
+        return f3_zero(0);
+
+    x = f3_normalise(&x_exp, x);
+
+    // The unpacked leading one sat at bit 48 of x1 and was moved to bit 51
+    // above; normalising it up to bit 63 subtracted 12 from the exponent,
+    // which is added back here.
+    return f3_round(x_sgn, x_exp + 12, x);
+}
+
+// Return a + b.
+long double __addtf3(long double a, long double b)
+{
+    const int negate_b = 0;
+
+    return f3_add(a, b, negate_b);
+}
+
+// Return a - b, computed as a + (-b).
+long double __subtf3(long double a, long double b)
+{
+    const int negate_b = 1;
+
+    return f3_add(a, b, negate_b);
+}
+
+// Return fa * fb. NaN operands propagate, infinity times zero gives NaN,
+// and the sign of every other result is the XOR of the operand signs.
+long double __multf3(long double fa, long double fb)
+{
+    u128_t a, b, x;
+    int32_t a_exp, b_exp, x_exp;
+    int a_sgn, b_sgn, x_sgn;
+    long double fx;
+
+    f3_unpack(&a_sgn, &a_exp, &a, fa);
+    f3_unpack(&b_sgn, &b_exp, &b, fb);
+
+    if (fp3_detect_NaNs(&fx, a_sgn, a_exp, a, b_sgn, b_exp, b))
+        return fx;
+
+    // Handle infinities and zeroes:
+    if ((a_exp == 32767 && !(b.x0 | b.x1)) ||
+        (b_exp == 32767 && !(a.x0 | a.x1)))
+        return f3_NaN();
+    if (a_exp == 32767 || b_exp == 32767)
+        return f3_infinity(a_sgn ^ b_sgn);
+    if (!(a.x0 | a.x1) || !(b.x0 | b.x1))
+        return f3_zero(a_sgn ^ b_sgn);
+
+    // Left-align both mantissas (leading one at bit 63 of x1).
+    a = f3_normalise(&a_exp, a);
+    b = f3_normalise(&b_exp, b);
+
+    x_sgn = a_sgn ^ b_sgn;
+    // NOTE(review): 16352 presumably folds the exponent bias together with
+    // the normalisation offsets applied above -- confirm the derivation.
+    x_exp = a_exp + b_exp - 16352;
+
+    {
+        // Convert to base (1 << 30), discarding bottom 6 bits, which are zero,
+        // so there are (32, 30, 30, 30) bits in (a3, a2, a1, a0):
+        uint64_t a0 = a.x0 << 28 >> 34;
+        uint64_t b0 = b.x0 << 28 >> 34;
+        uint64_t a1 = a.x0 >> 36 | a.x1 << 62 >> 34;
+        uint64_t b1 = b.x0 >> 36 | b.x1 << 62 >> 34;
+        uint64_t a2 = a.x1 << 32 >> 34;
+        uint64_t b2 = b.x1 << 32 >> 34;
+        uint64_t a3 = a.x1 >> 32;
+        uint64_t b3 = b.x1 >> 32;
+        // Use 16 small multiplications and additions that do not overflow:
+        uint64_t x0 = a0 * b0;
+        uint64_t x1 = (x0 >> 30) + a0 * b1 + a1 * b0;
+        uint64_t x2 = (x1 >> 30) + a0 * b2 + a1 * b1 + a2 * b0;
+        uint64_t x3 = (x2 >> 30) + a0 * b3 + a1 * b2 + a2 * b1 + a3 * b0;
+        uint64_t x4 = (x3 >> 30) + a1 * b3 + a2 * b2 + a3 * b1;
+        uint64_t x5 = (x4 >> 30) + a2 * b3 + a3 * b2;
+        uint64_t x6 = (x5 >> 30) + a3 * b3;
+        // We now have (64, 30, 30, ...) bits in (x6, x5, x4, ...).
+        // Take the top 128 bits, setting bottom bit if any lower bits were set:
+        uint64_t y0 = (x5 << 34 | x4 << 34 >> 30 | x3 << 34 >> 60 |
+                       !!(x3 << 38 | (x2 | x1 | x0) << 34));
+        uint64_t y1 = x6;
+        // Top bit may be zero. Renormalise:
+        if (!(y1 >> 63)) {
+            y1 = y1 << 1 | y0 >> 63;
+            y0 = y0 << 1;
+            --x_exp;
+        }
+        x.x0 = y0;
+        x.x1 = y1;
+    }
+
+    return f3_round(x_sgn, x_exp, x);
+}
+
+// Return fa / fb. NaN operands propagate; inf/inf and 0/0 give NaN;
+// x/0 and inf/x give infinity; 0/x and x/inf give zero. The sign of
+// non-NaN results is the XOR of the operand signs.
+long double __divtf3(long double fa, long double fb)
+{
+    u128_t a, b, x;
+    int32_t a_exp, b_exp, x_exp;
+    int a_sgn, b_sgn, x_sgn, i;
+    long double fx;
+
+    f3_unpack(&a_sgn, &a_exp, &a, fa);
+    f3_unpack(&b_sgn, &b_exp, &b, fb);
+
+    if (fp3_detect_NaNs(&fx, a_sgn, a_exp, a, b_sgn, b_exp, b))
+        return fx;
+
+    // Handle infinities and zeroes:
+    if ((a_exp == 32767 && b_exp == 32767) ||
+        (!(a.x0 | a.x1) && !(b.x0 | b.x1)))
+        return f3_NaN();
+    if (a_exp == 32767 || !(b.x0 | b.x1))
+        return f3_infinity(a_sgn ^ b_sgn);
+    if (!(a.x0 | a.x1) || b_exp == 32767)
+        return f3_zero(a_sgn ^ b_sgn);
+
+    // Left-align both mantissas (leading one at bit 63 of x1).
+    a = f3_normalise(&a_exp, a);
+    b = f3_normalise(&b_exp, b);
+
+    x_sgn = a_sgn ^ b_sgn;
+    // NOTE(review): 16395 presumably combines the exponent bias with the
+    // position of the 116-bit quotient before normalisation -- confirm.
+    x_exp = a_exp - b_exp + 16395;
+
+    // Free the top bit of both operands so that the remainder can be
+    // doubled inside the loop without overflowing.
+    a.x0 = a.x0 >> 1 | a.x1 << 63;
+    a.x1 = a.x1 >> 1;
+    b.x0 = b.x0 >> 1 | b.x1 << 63;
+    b.x1 = b.x1 >> 1;
+    x.x0 = 0;
+    x.x1 = 0;
+    // Long division, one quotient bit per iteration: subtract b from the
+    // running remainder when it fits, then double the remainder.
+    for (i = 0; i < 116; i++) {
+        x.x1 = x.x1 << 1 | x.x0 >> 63;
+        x.x0 = x.x0 << 1;
+        if (a.x1 > b.x1 || (a.x1 == b.x1 && a.x0 >= b.x0)) {
+            a.x1 = a.x1 - b.x1 - (a.x0 < b.x0);
+            a.x0 = a.x0 - b.x0;
+            x.x0 |= 1;
+        }
+        a.x1 = a.x1 << 1 | a.x0 >> 63;
+        a.x0 = a.x0 << 1;
+    }
+    // A non-zero remainder becomes the sticky bit for rounding.
+    x.x0 |= !!(a.x0 | a.x1);
+
+    x = f3_normalise(&x_exp, x);
+
+    return f3_round(x_sgn, x_exp, x);
+}
+
+// Widen a float to long double. Zeroes, subnormals, normal numbers,
+// infinities and NaNs are each encoded separately; subnormal inputs are
+// normalised first because every float subnormal is a normal number in
+// the wider format.
+long double __extendsftf2(float f)
+{
+    long double fx;
+    u128_t x;
+    uint32_t a;
+    uint64_t aa;
+    memcpy(&a, &f, 4);
+    aa = a;
+    x.x0 = 0;
+    if (!(a << 1))
+        // +/-0: only the sign bit is carried over.
+        x.x1 = aa << 32;
+    else if (a << 1 >> 24 == 255)
+        // Infinity or NaN: maximum exponent, fraction moved to the top of
+        // the wider fraction; a NaN also gets its top fraction bit set.
+        x.x1 = (0x7fff000000000000 | aa >> 31 << 63 | aa << 41 >> 16 |
+                (uint64_t)!!(a << 9) << 47);
+    else if (a << 1 >> 24 == 0) {
+        // Subnormal: the fraction is non-zero here, so the loop stops
+        // with adj (1..23) equal to the shift that moves the leading one
+        // into the implicit-bit position (bit 23).
+        int adj = 1;
+        while (!(a >> (23 - adj) & 1))
+            adj++;
+        // A normal float with exponent field 1 maps to 16257; each
+        // normalisation step lowers the exponent by one. The final shifts
+        // drop the sign and the now-implicit leading one.
+        x.x1 = (aa >> 31 << 63 | (uint64_t)(16257 - adj) << 48 |
+                aa << adj << 41 >> 16);
+    }
+    else
+        // Normal number: rebias the exponent (16383 - 127 = 16256).
+        x.x1 = (aa >> 31 << 63 | ((aa >> 23 & 255) + 16256) << 48 |
+                aa << 41 >> 16);
+    memcpy(&fx, &x, 16);
+    return fx;
+}
+
+// Widen a double to long double. Zeroes, subnormals, normal numbers,
+// infinities and NaNs are each encoded separately; subnormal inputs are
+// normalised first because every double subnormal is a normal number in
+// the wider format.
+long double __extenddftf2(double f)
+{
+    long double fx;
+    u128_t x;
+    uint64_t a;
+    memcpy(&a, &f, 8);
+    // The low 4 fraction bits do not fit in x1 and go to the top of x0.
+    x.x0 = a << 60;
+    if (!(a << 1))
+        // +/-0: the sign is already in bit 63.
+        x.x1 = a;
+    else if (a << 1 >> 53 == 2047)
+        // Infinity or NaN: maximum exponent; a NaN also gets its top
+        // fraction bit set.
+        x.x1 = (0x7fff000000000000 | a >> 63 << 63 | a << 12 >> 16 |
+                (uint64_t)!!(a << 12) << 47);
+    else if (a << 1 >> 53 == 0) {
+        // Subnormal: the fraction is non-zero here, so the loop stops
+        // with adj (1..52) equal to the shift that moves the leading one
+        // into the implicit-bit position (bit 52).
+        uint64_t sgn = a >> 63 << 63;
+        int adj = 1;
+        while (!(a >> (52 - adj) & 1))
+            adj++;
+        a <<= adj;
+        x.x0 = a << 60;
+        // A normal double with exponent field 1 maps to 15361; each
+        // normalisation step lowers the exponent by one. The shift by 12
+        // drops the now-implicit leading one.
+        x.x1 = sgn | (uint64_t)(15361 - adj) << 48 | a << 12 >> 16;
+    }
+    else
+        // Normal number: rebias the exponent (16383 - 1023 = 15360).
+        x.x1 = a >> 63 << 63 | ((a >> 52 & 2047) + 15360) << 48 | a << 12 >> 16;
+    memcpy(&fx, &x, 16);
+    return fx;
+}
+
+// Narrow a long double to float, rounding to nearest (ties to even).
+// NaNs stay NaNs (made quiet), values too large become infinity and
+// values too small become a signed zero.
+float __trunctfsf2(long double f)
+{
+    u128_t mnt;
+    int32_t exp;
+    int sgn;
+    uint32_t x;
+    float fx;
+
+    f3_unpack(&sgn, &exp, &mnt, f);
+
+    // NaN: keep the sign and the top 23 fraction bits, force the quiet bit.
+    if (exp == 32767 && (mnt.x0 | mnt.x1 << 16))
+        x = 0x7fc00000 | (uint32_t)sgn << 31 | (mnt.x1 >> 25 & 0x007fffff);
+    // Infinity, or a finite value beyond the float exponent range:
+    else if (exp > 16510)
+        x = 0x7f800000 | (uint32_t)sgn << 31;
+    // Too small for even the smallest float subnormal:
+    else if (exp < 16233)
+        x = (uint32_t)sgn << 31;
+    else {
+        // Rebias by one more than 16383 - 127: the leading one kept in x
+        // lands on bit 23 below and so adds one to the exponent field.
+        exp -= 16257;
+        // 24 mantissa bits plus two extra bits; bit 0 is sticky.
+        x = mnt.x1 >> 23 | !!(mnt.x0 | mnt.x1 << 41);
+        // Subnormal result: shift right further, preserving stickiness.
+        if (exp < 0) {
+            x = x >> -exp | !!(x << (32 + exp));
+            exp = 0;
+        }
+        // Round up when above half, or exactly half with an odd result.
+        if ((x & 3) == 3 || (x & 7) == 6)
+            x += 4;
+        x = ((x >> 2) + (exp << 23)) | (uint32_t)sgn << 31;
+    }
+    memcpy(&fx, &x, 4);
+    return fx;
+}
+
+// Narrow a long double to double, rounding to nearest (ties to even).
+// NaNs stay NaNs (made quiet), values too large become infinity and
+// values too small become a signed zero.
+double __trunctfdf2(long double f)
+{
+    u128_t mnt;
+    int32_t exp;
+    int sgn;
+    uint64_t x;
+    double fx;
+
+    f3_unpack(&sgn, &exp, &mnt, f);
+
+    // NaN: keep the sign and the top 52 fraction bits, force the quiet bit.
+    if (exp == 32767 && (mnt.x0 | mnt.x1 << 16))
+        x = (0x7ff8000000000000 | (uint64_t)sgn << 63 |
+             mnt.x1 << 16 >> 12 | mnt.x0 >> 60);
+    // Infinity, or a finite value beyond the double exponent range:
+    else if (exp > 17406)
+        x = 0x7ff0000000000000 | (uint64_t)sgn << 63;
+    // Too small for even the smallest double subnormal:
+    else if (exp < 15308)
+        x = (uint64_t)sgn << 63;
+    else {
+        // Rebias by one more than 16383 - 1023: the leading one kept in x
+        // lands on bit 52 below and so adds one to the exponent field.
+        exp -= 15361;
+        // 53 mantissa bits plus two extra bits; bit 0 is sticky.
+        x = mnt.x1 << 6 | mnt.x0 >> 58 | !!(mnt.x0 << 6);
+        // Subnormal result: shift right further, preserving stickiness.
+        if (exp < 0) {
+            x = x >> -exp | !!(x << (64 + exp));
+            exp = 0;
+        }
+        // Round up when above half, or exactly half with an odd result.
+        if ((x & 3) == 3 || (x & 7) == 6)
+            x += 4;
+        x = ((x >> 2) + ((uint64_t)exp << 52)) | (uint64_t)sgn << 63;
+    }
+    memcpy(&fx, &x, 8);
+    return fx;
+}
+
+// Convert to int32_t, truncating toward zero. Out-of-range magnitudes
+// (including infinities and NaNs) saturate according to the sign bit.
+int32_t __fixtfsi(long double fa)
+{
+    u128_t mant;
+    int32_t exp;
+    int negative;
+    int32_t magnitude;
+
+    f3_unpack(&negative, &exp, &mant, fa);
+    // Small enough that every mantissa bit would be shifted out:
+    if (exp < 16369)
+        return 0;
+    if (exp > 16413) {
+        if (negative)
+            return -0x80000000;
+        return 0x7fffffff;
+    }
+    magnitude = mant.x1 >> (16431 - exp);
+    if (negative)
+        magnitude = -magnitude;
+    return magnitude;
+}
+
+// Convert to int64_t, truncating toward zero. Out-of-range magnitudes
+// (including infinities and NaNs) saturate according to the sign bit.
+int64_t __fixtfdi(long double fa)
+{
+    u128_t mant;
+    int32_t exp;
+    int negative;
+    int64_t magnitude;
+
+    f3_unpack(&negative, &exp, &mant, fa);
+    // Magnitude below one:
+    if (exp < 16383)
+        return 0;
+    if (exp > 16445) {
+        if (negative)
+            return -0x8000000000000000;
+        return 0x7fffffffffffffff;
+    }
+    // Left-align the top 64 mantissa bits, then shift down by exponent.
+    magnitude = (mant.x1 << 15 | mant.x0 >> 49) >> (16446 - exp);
+    if (negative)
+        magnitude = -magnitude;
+    return magnitude;
+}
+
+// Convert to uint32_t, truncating toward zero. Inputs with the sign bit
+// set give 0; magnitudes that do not fit give 0xffffffff.
+uint32_t __fixunstfsi(long double fa)
+{
+    u128_t mant;
+    int32_t exp;
+    int negative;
+
+    f3_unpack(&negative, &exp, &mant, fa);
+    if (!negative && exp >= 16369) {
+        if (exp > 16414)
+            return -1;
+        return mant.x1 >> (16431 - exp);
+    }
+    return 0;
+}
+
+// Convert to uint64_t, truncating toward zero. Inputs with the sign bit
+// set give 0; magnitudes that do not fit give all ones.
+uint64_t __fixunstfdi(long double fa)
+{
+    u128_t mant;
+    int32_t exp;
+    int negative;
+
+    f3_unpack(&negative, &exp, &mant, fa);
+    if (!negative && exp >= 16383) {
+        if (exp > 16446)
+            return -1;
+        // Left-align the top 64 mantissa bits, then shift down by exponent.
+        return (mant.x1 << 15 | mant.x0 >> 49) >> (16446 - exp);
+    }
+    return 0;
+}
+
+// Convert a signed 32-bit integer to long double (always exact).
+long double __floatsitf(int32_t a)
+{
+    u128_t bits = { 0, 0 };
+    long double result;
+
+    if (a != 0) {
+        int negative = 0;
+        int exp = 16414;
+        uint32_t mag = a;
+        int step = 16;
+
+        if (a < 0) {
+            negative = 1;
+            mag = -mag;
+        }
+        // Binary search for the leading one, moving it up to bit 31:
+        while (step) {
+            if (!(mag >> (32 - step))) {
+                mag <<= step;
+                exp -= step;
+            }
+            step >>= 1;
+        }
+        // mag << 1 drops the leading one, which is implicit in the result.
+        bits.x1 = ((uint64_t)negative << 63 | (uint64_t)exp << 48 |
+                   (uint64_t)(mag << 1) << 16);
+    }
+    memcpy(&result, &bits, 16);
+    return result;
+}
+
+// Convert a signed 64-bit integer to long double (always exact).
+long double __floatditf(int64_t a)
+{
+    u128_t bits = { 0, 0 };
+    long double result;
+
+    if (a != 0) {
+        int negative = 0;
+        int exp = 16446;
+        uint64_t mag = a;
+        int step = 32;
+
+        if (a < 0) {
+            negative = 1;
+            mag = -mag;
+        }
+        // Binary search for the leading one, moving it up to bit 63:
+        while (step) {
+            if (!(mag >> (64 - step))) {
+                mag <<= step;
+                exp -= step;
+            }
+            step >>= 1;
+        }
+        // The low 15 bits of the magnitude spill into the low word; the
+        // leading one is dropped because it is implicit in the result.
+        bits.x0 = mag << 49;
+        bits.x1 = (uint64_t)negative << 63 | (uint64_t)exp << 48 | mag << 1 >> 16;
+    }
+    memcpy(&result, &bits, 16);
+    return result;
+}
+
+// Convert an unsigned 32-bit integer to long double (always exact).
+long double __floatunsitf(uint32_t a)
+{
+    u128_t bits = { 0, 0 };
+    long double result;
+
+    if (a != 0) {
+        int exp = 16414;
+        uint32_t mag = a;
+        int step = 16;
+
+        // Binary search for the leading one, moving it up to bit 31:
+        while (step) {
+            if (!(mag >> (32 - step))) {
+                mag <<= step;
+                exp -= step;
+            }
+            step >>= 1;
+        }
+        // mag << 1 drops the leading one, which is implicit in the result.
+        bits.x1 = (uint64_t)exp << 48 | (uint64_t)(mag << 1) << 16;
+    }
+    memcpy(&result, &bits, 16);
+    return result;
+}
+
+// Convert an unsigned 64-bit integer to long double (always exact).
+long double __floatunditf(uint64_t a)
+{
+    u128_t bits = { 0, 0 };
+    long double result;
+
+    if (a != 0) {
+        int exp = 16446;
+        uint64_t mag = a;
+        int step = 32;
+
+        // Binary search for the leading one, moving it up to bit 63:
+        while (step) {
+            if (!(mag >> (64 - step))) {
+                mag <<= step;
+                exp -= step;
+            }
+            step >>= 1;
+        }
+        // The low 15 bits of the magnitude spill into the low word; the
+        // leading one is dropped because it is implicit in the result.
+        bits.x0 = mag << 49;
+        bits.x1 = (uint64_t)exp << 48 | mag << 1 >> 16;
+    }
+    memcpy(&result, &bits, 16);
+    return result;
+}
+
+// Three-way comparison on the raw bit patterns. Returns:
+//    0  if fa == fb (this includes +0 compared with -0),
+//    2  if either operand is a NaN (unordered),
+//   -1  if fa < fb,
+//    1  if fa > fb.
+// The conditional chain below is evaluated top to bottom: zeroes, NaNs,
+// differing signs, then the high and low words, whose ordering is
+// inverted when both operands are negative.
+static int f3_cmp(long double fa, long double fb)
+{
+    u128_t a, b;
+    memcpy(&a, &fa, 16);
+    memcpy(&b, &fb, 16);
+    return (!(a.x0 | a.x1 << 1 | b.x0 | b.x1 << 1) ? 0 :
+            ((a.x1 << 1 >> 49 == 0x7fff && (a.x0 | a.x1 << 16)) ||
+             (b.x1 << 1 >> 49 == 0x7fff && (b.x0 | b.x1 << 16))) ? 2 :
+            a.x1 >> 63 != b.x1 >> 63 ? (int)(b.x1 >> 63) - (int)(a.x1 >> 63) :
+            a.x1 < b.x1 ? (int)(a.x1 >> 63 << 1) - 1 :
+            a.x1 > b.x1 ? 1 - (int)(a.x1 >> 63 << 1) :
+            a.x0 < b.x0 ? (int)(a.x1 >> 63 << 1) - 1 :
+            b.x0 < a.x0 ? 1 - (int)(a.x1 >> 63 << 1) : 0);
+}
+
+// Return 0 if a == b, and 1 otherwise (including unordered operands).
+int __eqtf2(long double a, long double b)
+{
+    return f3_cmp(a, b) != 0;
+}
+
+// Return non-zero (1) if a != b or the operands are unordered, else 0.
+int __netf2(long double a, long double b)
+{
+    return f3_cmp(a, b) != 0;
+}
+
+// Return a negative value exactly when a < b; unordered operands give 2.
+int __lttf2(long double a, long double b)
+{
+    int order = f3_cmp(a, b);
+
+    return order;
+}
+
+// Return a value <= 0 exactly when a <= b; unordered operands give 2.
+int __letf2(long double a, long double b)
+{
+    int order = f3_cmp(a, b);
+
+    return order;
+}
+
+// Return a positive value exactly when a > b; unordered operands give -2.
+// The operands are swapped and the result negated so that the unordered
+// code ends up negative.
+int __gttf2(long double a, long double b)
+{
+    int swapped = f3_cmp(b, a);
+
+    return -swapped;
+}
+
+// Return a value >= 0 exactly when a >= b; unordered operands give -2.
+// The operands are swapped and the result negated so that the unordered
+// code ends up negative.
+int __getf2(long double a, long double b)
+{
+    int swapped = f3_cmp(b, a);
+
+    return -swapped;
+}
--- a/lib/testfp.c
+++ b/lib/testfp.c
@ -0,0 +1,510 @@
+/*
+ *  Test 128-bit floating-point arithmetic on arm64:
+ *  build with two different compilers and compare the output.
+ *
+ *  Copyright (c) 2015 Edmund Grimley Evans
+ *
+ * Copying and distribution of this file, with or without modification,
+ * are permitted in any medium without royalty provided the copyright
+ * notice and this notice are preserved.  This file is offered as-is,
+ * without any warranty.
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define check(x) ((x) ? (void)0 : check_fail(#x, __FILE__, __LINE__))
+
+/*
+ * Report a failed check() and terminate the program with exit status 1.
+ *   assertion  - source text of the expression that evaluated to false
+ *   file, line - location of the check() invocation
+ * The report is written to stdout, like the rest of the program's output.
+ */
+void check_fail(const char *assertion, const char *file, unsigned int line)
+{
+    /* %u matches the unsigned line argument; the newline terminates the
+       final output line properly. */
+    printf("%s:%u: Check (%s) failed.\n", file, line, assertion);
+    exit(1);
+}
+
+/* A 128-bit quantity held as two 64-bit words; x0 is stored first. */
+typedef struct {
+    unsigned long long x0, x1;
+} u128_t;
+
+/* Reinterpret the 4 bytes of x as a float. */
+float copy_fi(uint32_t x)
+{
+    float value;
+
+    memcpy(&value, &x, sizeof x);
+    return value;
+}
+
+/* Reinterpret the 8 bytes of x as a double. */
+double copy_di(uint64_t x)
+{
+    double value;
+
+    memcpy(&value, &x, sizeof x);
+    return value;
+}
+
+/* Reinterpret the 16 bytes of x as a long double. */
+long double copy_ldi(u128_t x)
+{
+    long double value;
+
+    memcpy(&value, &x, sizeof x);
+    return value;
+}
+
+/* Return the 4 bytes of f as an integer. */
+uint32_t copy_if(float f)
+{
+    uint32_t raw;
+
+    memcpy(&raw, &f, sizeof raw);
+    return raw;
+}
+
+/* Return the 8 bytes of f as an integer. */
+uint64_t copy_id(double f)
+{
+    uint64_t raw;
+
+    memcpy(&raw, &f, sizeof raw);
+    return raw;
+}
+
+/* Return the 16 bytes of f as a pair of 64-bit words. */
+u128_t copy_ild(long double f)
+{
+    u128_t raw;
+
+    memcpy(&raw, &f, sizeof raw);
+    return raw;
+}
+
+/*
+ * Build a long double from its fields:
+ *   sgn  - sign; only bit 0 is used (placed in bit 63 of the high word)
+ *   exp  - biased exponent; only the low 15 bits are used (bits 62..48)
+ *   high - upper part of the fraction; only the low 48 bits are used
+ *   low  - lower 64 bits of the fraction
+ */
+long double make(int sgn, int exp, uint64_t high, uint64_t low)
+{
+    u128_t bits;
+
+    bits.x0 = low;
+    bits.x1 = ((uint64_t)sgn << 63 & 0x8000000000000000) |
+              ((uint64_t)exp << 48 & 0x7fff000000000000) |
+              (high & 0x0000ffffffffffff);
+    return copy_ldi(bits);
+}
+
+/*
+ * Evaluate all six relational operators on a and b, check that the results
+ * are mutually consistent, and print both bit patterns followed by the
+ * lt, eq and gt results.
+ */
+void cmp(long double a, long double b)
+{
+    int eq = (a == b), ne = (a != b);
+    int lt = (a < b), le = (a <= b);
+    int gt = (a > b), ge = (a >= b);
+    u128_t abits = copy_ild(a);
+    u128_t bbits = copy_ild(b);
+
+    /* Each primitive result must be exactly 0 or 1. */
+    check(eq == 0 || eq == 1);
+    check(lt == 0 || lt == 1);
+    check(gt == 0 || gt == 1);
+    /* The derived operators must agree with the primitive ones. */
+    check(ne == !eq && le == (lt | eq) && ge == (gt | eq));
+    /* At most one of ==, <, > may hold. */
+    check(eq + lt + gt < 2);
+
+    printf("cmp %016llx%016llx %016llx%016llx %d %d %d\n",
+           abits.x1, abits.x0, bbits.x1, bbits.x0, lt, eq, gt);
+}
+
+/*
+ * Run cmp() over a fixed sequence of operand pairs.  The order of the calls
+ * determines the order of the printed lines.
+ */
+void cmps(void)
+{
+    int i, j;
+
+    // Zeroes of either sign against each other:
+    for (i = 0; i < 2; i++)
+        for (j = 0; j < 2; j++)
+            cmp(make(i, 0, 0, 0), make(j, 0, 0, 0));
+
+    // Maximum exponent with a single fraction bit set, against zero.
+    // For f1, bits j >= 48 are masked off by make(), leaving a zero fraction.
+    for (i = 0; i < 2; i++) {
+        for (j = 0; j < 64; j++) {
+            long double f1 = make(i, 32767, (uint64_t)1 << j, 0);
+            long double f2 = make(i, 32767, 0, (uint64_t)1 << j);
+            cmp(f1, 0);
+            cmp(f2, 0);
+            cmp(0, f1);
+            cmp(0, f2);
+        }
+    }
+
+    // All pairs of sign (i & 1) and exponent 0..2 (i >> 1), zero fraction:
+    for (i = 0; i < 6; i++)
+        for (j = 0; j < 6; j++)
+            cmp(make(i & 1, i >> 1, 0, 0),
+                make(j & 1, j >> 1, 0, 0));
+
+    // Same sign i and exponent j, differing only in the lowest bit of the
+    // high (a) and low (b) fraction words, compared in both orders:
+    for (i = 0; i < 2; i++) {
+        for (j = 0; j < 2; j++) {
+            int a, b;
+            for (a = 0; a < 2; a++) {
+                for (b = 0; b < 2; b++) {
+                    cmp(make(i, j, a, b), make(i, j, 0, 0));
+                    cmp(make(i, j, 0, 0), make(i, j, a, b));
+                }
+            }
+        }
+    }
+}
+
+/*
+ * Print one result line: the operation name followed by the bit patterns
+ * (high word first) of the two operands a, b and the result c.
+ */
+void xop(const char *name, long double a, long double b, long double c)
+{
+    u128_t abits = copy_ild(a), bbits = copy_ild(b), cbits = copy_ild(c);
+
+    printf("%s %016llx%016llx %016llx%016llx %016llx%016llx\n", name,
+           abits.x1, abits.x0, bbits.x1, bbits.x0, cbits.x1, cbits.x0);
+}
+
+/* Print a, b and their sum via xop(). */
+void fadd(long double a, long double b)
+{
+    long double sum = a + b;
+
+    xop("add", a, b, sum);
+}
+
+/* Print a, b and their difference via xop(). */
+void fsub(long double a, long double b)
+{
+    long double difference = a - b;
+
+    xop("sub", a, b, difference);
+}
+
+/* Print a, b and their product via xop(). */
+void fmul(long double a, long double b)
+{
+    long double product = a * b;
+
+    xop("mul", a, b, product);
+}
+
+/* Print a, b and their quotient via xop(). */
+void fdiv(long double a, long double b)
+{
+    long double quotient = a / b;
+
+    xop("div", a, b, quotient);
+}
+
+/*
+ * Apply add, sub, mul and div to every ordered pair drawn from two small
+ * tables of special values and print each result.
+ */
+void nanz(void)
+{
+    // Check NaNs:
+    {
+        // Two finite values, then five with exponent 32767 and a
+        // non-zero fraction.
+        long double x[7];
+        int i, j, n = 0;
+        x[n++] = make(0, 32000, 0x95132b76effc, 0xd79035214b4f8d53);
+        x[n++] = make(1, 32001, 0xbe71d7a51587, 0x30601c6815d6c3ac);
+        x[n++] = make(0, 32767, 0, 1);
+        x[n++] = make(0, 32767, (uint64_t)1 << 46, 0);
+        x[n++] = make(1, 32767, (uint64_t)1 << 47, 0);
+        x[n++] = make(1, 32767, 0x7596c7099ad5, 0xe25fed2c58f73fc9);
+        x[n++] = make(0, 32767, 0x835d143360f9, 0x5e315efb35630666);
+        // Guard against the table and its declared size drifting apart.
+        check(n == sizeof(x) / sizeof(*x));
+        for (i = 0; i < n; i++) {
+            for (j = 0; j < n; j++) {
+                fadd(x[i], x[j]);
+                fsub(x[i], x[j]);
+                fmul(x[i], x[j]);
+                fdiv(x[i], x[j]);
+            }
+        }
+    }
+
+    // Check infinities and zeroes:
+    {
+        // Two finite values, then exponent 32767 with zero fraction and
+        // all-zero patterns, each with both signs.
+        long double x[6];
+        int i, j, n = 0;
+        x[n++] = make(1, 32000, 0x62acda85f700, 0x47b6c9f35edc4044);
+        x[n++] = make(0, 32001, 0x94b7abf55af7, 0x9f425fe354428e19);
+        x[n++] = make(0, 32767, 0, 0);
+        x[n++] = make(1, 32767, 0, 0);
+        x[n++] = make(0, 0, 0, 0);
+        x[n++] = make(1, 0, 0, 0);
+        // Guard against the table and its declared size drifting apart.
+        check(n == sizeof(x) / sizeof(*x));
+        for (i = 0; i < n; i++) {
+            for (j = 0; j < n; j++) {
+                fadd(x[i], x[j]);
+                fsub(x[i], x[j]);
+                fmul(x[i], x[j]);
+                fdiv(x[i], x[j]);
+            }
+        }
+    }
+}
+
+/*
+ * Print the results of a fixed series of additions and subtractions that
+ * target operand alignment, normalisation and rounding carry.
+ */
+void adds(void)
+{
+    // Check shifting and add/sub:
+    {
+        // The second operand's exponent sweeps 130 below to 130 above the
+        // first's; the signs s1 and s2 are derived from i so that both equal
+        // and opposite sign combinations occur.
+        int i;
+        for (i = -130; i <= 130; i++) {
+            int s1 = (uint32_t)i % 3 < 1;
+            int s2 = (uint32_t)i % 5 < 2;
+            fadd(make(s1, 16384    , 0x502c065e4f71a65d, 0xd2f9bdb031f4f031),
+                 make(s2, 16384 + i, 0xae267395a9bc1033, 0xb56b5800da1ba448));
+        }
+    }
+
+    // Check normalisation:
+    {
+        // Subtract values that share the exponent and agree in their upper
+        // fraction bits, first varying the high word, then the low word.
+        uint64_t a0 = 0xc6bab0a6afbef5ed;
+        uint64_t a1 = 0x4f84136c4a2e9b52;
+        int ee[] = { 0, 1, 10000 };
+        int e, i;
+        for (e = 0; e < sizeof(ee) / sizeof(*ee); e++) {
+            int exp = ee[e];
+            fsub(make(0, exp, a1, a0), make(0, 0, 0, 0));
+            for (i = 63; i >= 0; i--)
+                fsub(make(0, exp, a1 | (uint64_t)1 << i >> 1, a0),
+                     make(0, exp, a1 >> i << i, 0));
+            for (i = 63; i >=0; i--)
+                fsub(make(0, exp, a1, a0 | (uint64_t)1 << i >> 1),
+                     make(0, exp, a1, a0 >> i << i));
+        }
+    }
+
+    // Carry/overflow from rounding:
+    {
+        // -1 converts to all-ones; make() keeps 48 bits of the high word.
+        fadd(make(0, 114, -1, -1), make(0, 1, 0, 0));
+        fadd(make(0, 32766, -1, -1), make(0, 32653, 0, 0));
+        fsub(make(1, 32766, -1, -1), make(0, 32653, 0, 0));
+    }
+}
+
+/* Print the results of a fixed series of multiplications. */
+void muls(void)
+{
+    int i, j;
+
+    // Extremes: largest finite value and smallest non-zero value.
+    {
+        long double max = make(0, 32766, -1, -1);
+        long double min = make(0, 0, 0, 1);
+        fmul(max, max);
+        fmul(max, min);
+        fmul(min, min);
+    }
+
+    // A fixed value times a signed power of two with a steadily decreasing
+    // exponent (16268 + i - 16383 runs from 2 down to -114):
+    for (i = 117; i > 0; i--)
+        fmul(make(0, 16268, 0x643dcea76edc, 0xe0877a598403627a),
+             make(i & 1, i, 0, 0));
+
+    fmul(make(0, 16383, -1, -3), make(0, 16383, 0, 1));
+    // Round to next exponent:
+    fmul(make(0, 16383, -1, -2), make(0, 16383, 0, 1));
+    // Round from subnormal to normal:
+    fmul(make(0, 1, -1, -1), make(0, 16382, 0, 0));
+
+    // One fraction bit in the first factor (bit i of the high word) times
+    // one fraction bit j (0..111) in the second:
+    for (i = 0; i < 2; i++)
+        for (j = 0; j < 112; j++)
+            fmul(make(0, 16383, (uint64_t)1 << i, 0),
+                 make(0, 16383,
+                      j < 64 ? 0 : (uint64_t)1 << (j - 64),
+                      j < 64 ? (uint64_t)1 << j : 0));
+}
+
+/* Print the results of a fixed series of divisions. */
+void divs(void)
+{
+    int i;
+
+    // Extremes: largest finite value and smallest non-zero value.
+    {
+        long double max = make(0, 32766, -1, -1);
+        long double min = make(0, 0, 0, 1);
+        fdiv(max, max);
+        fdiv(max, min);
+        fdiv(min, max);
+        fdiv(min, min);
+    }
+
+    // All-ones fraction divided by an all-ones fraction whose low i bits
+    // are cleared, first within the low word, then within the high word:
+    for (i = 0; i < 64; i++)
+        fdiv(make(0, 16383, -1, -1), make(0, 16383, -1, -(uint64_t)1 << i));
+    for (i = 0; i < 48; i++)
+        fdiv(make(0, 16383, -1, -1), make(0, 16383, -(uint64_t)1 << i, 0));
+}
+
+/*
+ * Integer -> long double conversion printers.  Each converts its argument,
+ * then prints the argument in hex followed by the bit pattern of the result
+ * (high word first).  The hex arguments are cast to the unsigned types that
+ * %lx and %llx require.
+ */
+
+/* Signed 32-bit source. */
+void cvtlsw(int32_t a)
+{
+    long double f = a;
+    u128_t x = copy_ild(f);
+    printf("cvtlsw %08lx %016llx%016llx\n",
+           (unsigned long)(uint32_t)a, x.x1, x.x0);
+}
+
+/* Signed 64-bit source. */
+void cvtlsx(int64_t a)
+{
+    long double f = a;
+    u128_t x = copy_ild(f);
+    printf("cvtlsx %016llx %016llx%016llx\n",
+           (unsigned long long)(uint64_t)a, x.x1, x.x0);
+}
+
+/* Unsigned 32-bit source. */
+void cvtluw(uint32_t a)
+{
+    long double f = a;
+    u128_t x = copy_ild(f);
+    printf("cvtluw %08lx %016llx%016llx\n", (unsigned long)a, x.x1, x.x0);
+}
+
+/* Unsigned 64-bit source. */
+void cvtlux(uint64_t a)
+{
+    long double f = a;
+    u128_t x = copy_ild(f);
+    printf("cvtlux %016llx %016llx%016llx\n",
+           (unsigned long long)a, x.x1, x.x0);
+}
+
+/*
+ * Convert a to each of int32_t, int64_t, uint32_t and uint64_t and print the
+ * source bit pattern with each result in hex.  Callers deliberately pass
+ * values outside the target ranges; the printed result is then whatever the
+ * implementation under test produces.
+ */
+void cvtil(long double a)
+{
+    u128_t x = copy_ild(a);
+    int32_t b1 = a;
+    int64_t b2 = a;
+    uint32_t b3 = a;
+    uint64_t b4 = a;
+    /* %lx and %llx take unsigned arguments, hence the unsigned casts. */
+    printf("cvtswl %016llx%016llx %08lx\n",
+           x.x1, x.x0, (unsigned long)(uint32_t)b1);
+    printf("cvtsxl %016llx%016llx %016llx\n",
+           x.x1, x.x0, (unsigned long long)(uint64_t)b2);
+    printf("cvtuwl %016llx%016llx %08lx\n",
+           x.x1, x.x0, (unsigned long)b3);
+    printf("cvtuxl %016llx%016llx %016llx\n",
+           x.x1, x.x0, (unsigned long long)b4);
+}
+
+/*
+ * Floating-point width conversion printers.  Each prints the bit pattern of
+ * the source followed by the bit pattern of the converted value.  The 32-
+ * and 64-bit patterns are cast to the unsigned types that %lx and %llx
+ * require.
+ */
+
+/* float -> long double */
+void cvtlf(float a)
+{
+    uint32_t ax = copy_if(a);
+    long double b = a;
+    u128_t bx = copy_ild(b);
+    printf("cvtlf %08lx %016llx%016llx\n", (unsigned long)ax, bx.x1, bx.x0);
+}
+
+/* double -> long double */
+void cvtld(double a)
+{
+    uint64_t ax = copy_id(a);
+    long double b = a;
+    u128_t bx = copy_ild(b);
+    printf("cvtld %016llx %016llx%016llx\n",
+           (unsigned long long)ax, bx.x1, bx.x0);
+}
+
+/* long double -> float */
+void cvtfl(long double a)
+{
+    u128_t ax = copy_ild(a);
+    float b = a;
+    uint32_t bx = copy_if(b);
+    printf("cvtfl %016llx%016llx %08lx\n", ax.x1, ax.x0, (unsigned long)bx);
+}
+
+/* long double -> double */
+void cvtdl(long double a)
+{
+    u128_t ax = copy_ild(a);
+    double b = a;
+    uint64_t bx = copy_id(b);
+    printf("cvtdl %016llx%016llx %016llx\n",
+           ax.x1, ax.x0, (unsigned long long)bx);
+}
+
+/*
+ * Print the results of a fixed series of conversions between long double
+ * and the integer, float and double types.  The order of the calls
+ * determines the order of the printed lines.
+ */
+void cvts(void)
+{
+    int i, j;
+
+    // int32_t -> long double: zero, the top 1..31 bits of a fixed pattern,
+    // their negations, and the most negative value.
+    {
+        uint32_t x = 0xad040c5b;
+        cvtlsw(0);
+        for (i = 0; i < 31; i++)
+            cvtlsw(x >> (31 - i));
+        for (i = 0; i < 31; i++)
+            cvtlsw(-(x >> (31 - i)));
+        cvtlsw(0x80000000);
+    }
+    // int64_t -> long double, same scheme with 64-bit values.
+    {
+        uint64_t x = 0xb630a248cad9afd2;
+        cvtlsx(0);
+        for (i = 0; i < 63; i++)
+            cvtlsx(x >> (63 - i));
+        for (i = 0; i < 63; i++)
+            cvtlsx(-(x >> (63 - i)));
+        cvtlsx(0x8000000000000000);
+    }
+    // uint32_t -> long double: zero and the top 1..32 bits of the pattern.
+    {
+        uint32_t x = 0xad040c5b;
+        cvtluw(0);
+        for (i = 0; i < 32; i++)
+            cvtluw(x >> (31 - i));
+    }
+    // uint64_t -> long double: zero and the top 1..64 bits of the pattern.
+    {
+        uint64_t x = 0xb630a248cad9afd2;
+        cvtlux(0);
+        for (i = 0; i < 64; i++)
+            cvtlux(x >> (63 - i));
+    }
+
+    // long double -> integers, for both signs: maximum-exponent patterns,
+    // values around small integer boundaries, and one fraction swept over
+    // 68 consecutive exponents.
+    for (i = 0; i < 2; i++) {
+        cvtil(make(i, 32767, 0, 1));
+        cvtil(make(i, 32767, (uint64_t)1 << 47, 0));
+        cvtil(make(i, 32767, 123, 456));
+        cvtil(make(i, 32767, 0, 0));
+        cvtil(make(i, 16382, -1, -1));
+        cvtil(make(i, 16383, -1, -1));
+        cvtil(make(i, 16384, 0x7fffffffffff, -1));
+        cvtil(make(i, 16384, 0x800000000000, 0));
+        for (j = 0; j < 68; j++)
+            cvtil(make(i, 16381 + j, 0xd4822c0a10ec, 0x1fe2f8b2669f5c9d));
+    }
+
+    // float -> long double: selected bit patterns of both signs.
+    cvtlf(copy_fi(0x00000000));
+    cvtlf(copy_fi(0x456789ab));
+    cvtlf(copy_fi(0x7f800000));
+    cvtlf(copy_fi(0x7f923456));
+    cvtlf(copy_fi(0x7fdbcdef));
+    cvtlf(copy_fi(0x80000000));
+    cvtlf(copy_fi(0xabcdef12));
+    cvtlf(copy_fi(0xff800000));
+    cvtlf(copy_fi(0xff923456));
+    cvtlf(copy_fi(0xffdbcdef));
+
+    // double -> long double: selected bit patterns of both signs.
+    cvtld(copy_di(0x0000000000000000));
+    cvtld(copy_di(0x456789abcdef0123));
+    cvtld(copy_di(0x7ff0000000000000));
+    cvtld(copy_di(0x7ff123456789abcd));
+    cvtld(copy_di(0x7ffabcdef1234567));
+    cvtld(copy_di(0x8000000000000000));
+    cvtld(copy_di(0xcdef123456789abc));
+    cvtld(copy_di(0xfff0000000000000));
+    cvtld(copy_di(0xfff123456789abcd));
+    cvtld(copy_di(0xfffabcdef1234567));
+
+    // long double -> float, for both signs (the stray line-continuation
+    // backslash that followed the opening brace has been removed).
+    for (i = 0; i < 2; i++) {
+        cvtfl(make(i, 0, 0, 0));
+        cvtfl(make(i, 16232, -1, -1));
+        cvtfl(make(i, 16233, 0, 0));
+        cvtfl(make(i, 16233, 0, 1));
+        cvtfl(make(i, 16383, 0xab0ffd000000, 0));
+        cvtfl(make(i, 16383, 0xab0ffd000001, 0));
+        cvtfl(make(i, 16383, 0xab0ffeffffff, 0));
+        cvtfl(make(i, 16383, 0xab0fff000000, 0));
+        cvtfl(make(i, 16383, 0xab0fff000001, 0));
+        cvtfl(make(i, 16510, 0xfffffeffffff, -1));
+        cvtfl(make(i, 16510, 0xffffff000000, 0));
+        cvtfl(make(i, 16511, 0, 0));
+        cvtfl(make(i, 32767, 0, 0));
+        cvtfl(make(i, 32767, 0, 1));
+        cvtfl(make(i, 32767, 0x4cbe01ac5f40, 0x75cee3c6afbb00b5));
+        cvtfl(make(i, 32767, 0x800000000000, 1));
+        cvtfl(make(i, 32767, 0xa11caaaf6a52, 0x696033e871eab099));
+    }
+
+    // long double -> double, for both signs.
+    for (i = 0; i < 2; i++) {
+        cvtdl(make(i, 0, 0, 0));
+        cvtdl(make(i, 15307, -1, -1));
+        cvtdl(make(i, 15308, 0, 0));
+        cvtdl(make(i, 15308, 0, 1));
+        cvtdl(make(i, 16383, 0xabc123abc0ff, 0xe800000000000000));
+        cvtdl(make(i, 16383, 0xabc123abc0ff, 0xe800000000000001));
+        cvtdl(make(i, 16383, 0xabc123abc0ff, 0xf7ffffffffffffff));
+        cvtdl(make(i, 16383, 0xabc123abc0ff, 0xf800000000000000));
+        cvtdl(make(i, 16383, 0xabc123abc0ff, 0xf800000000000001));
+        cvtdl(make(i, 17406, 0xffffffffffff, 0xf7ffffffffffffff));
+        cvtdl(make(i, 17406, 0xffffffffffff, 0xf800000000000000));
+        cvtdl(make(i, 17407, 0, 0));
+        cvtdl(make(i, 32767, 0, 0));
+        cvtdl(make(i, 32767, 0, 1));
+        cvtdl(make(i, 32767, 0x4cbe01ac5f40, 0x75cee3c6afbb00b5));
+        cvtdl(make(i, 32767, 0x800000000000, 1));
+        cvtdl(make(i, 32767, 0xa11caaaf6a52, 0x696033e871eab099));
+    }
+}
+
+/* Run every test group once, in a fixed order. */
+void tests(void)
+{
+    static void (*const groups[])(void) = {
+        cmps, nanz, adds, muls, divs, cvts
+    };
+    unsigned int k;
+
+    for (k = 0; k < sizeof(groups) / sizeof(*groups); k++)
+        groups[k]();
+}
+
+/*
+ * Entry point: runs the tests when compiled for AArch64; on any other
+ * target it only prints a notice.  Always returns 0.
+ */
+int main(void)
+{
+#ifndef __aarch64__
+    printf("This test program is intended for a little-endian architecture\n"
+           "with an IEEE-standard 128-bit long double.\n");
+#else
+    tests();
+#endif
+    return 0;
+}
--- a/libtcc.c
+++ b/libtcc.c
@ -45,6 +45,9 @@ ST_DATA struct TCCState *tcc_state;
 #ifdef TCC_TARGET_ARM
 #include "arm-gen.c"
 #endif
+#ifdef TCC_TARGET_ARM64
+#include "arm64-gen.c"
+#endif
 #ifdef TCC_TARGET_C67
 #include "c67-gen.c"
 #endif
@ -959,6 +962,8 @@ LIBTCCAPI TCCState *tcc_new(void)
 #else
    s->float_abi = ARM_SOFTFP_FLOAT;
 #endif
+#elif defined(TCC_TARGET_ARM64)
+    tcc_define_symbol(s, "__aarch64__", NULL);
 #endif

 #ifdef TCC_TARGET_PE
@ -1560,7 +1565,7 @@ static int tcc_set_linker(TCCState *s, const char *option)
        } else if (link_option(option, "oformat=", &p)) {
 #if defined(TCC_TARGET_PE)
            if (strstart("pe-", &p)) {
-#elif defined(TCC_TARGET_X86_64)
+#elif defined(TCC_TARGET_ARM64) || defined(TCC_TARGET_X86_64)
            if (strstart("elf64-", &p)) {
 #else
            if (strstart("elf32-", &p)) {
--- a/tcc.c
+++ b/tcc.c
@ -203,6 +203,8 @@ static void display_info(TCCState *s, int what)
 # endif
+#elif defined TCC_TARGET_ARM64 /* placed before ARM so "ARM" keeps its " Hard Float" suffix */
+        "AArch64"
 #elif defined TCC_TARGET_ARM
       "ARM"
 # ifdef TCC_ARM_HARDFLOAT
       " Hard Float"
 # endif
--- a/tcc.h
+++ b/tcc.h
@ -113,7 +113,7 @@
 #endif

 #include "elf.h"
-#ifdef TCC_TARGET_X86_64
+#if defined(TCC_TARGET_ARM64) || defined(TCC_TARGET_X86_64)
 # define ELFCLASSW ELFCLASS64
 # define ElfW(type) Elf##64##_##type
 # define ELFW(type) ELF##64##_##type
@ -151,23 +151,26 @@
 /* target selection */
 /* #define TCC_TARGET_I386   *//* i386 code generator */
 /* #define TCC_TARGET_ARM    *//* ARMv4 code generator */
+/* #define TCC_TARGET_ARM64  *//* ARMv8 code generator */
 /* #define TCC_TARGET_C67    *//* TMS320C67xx code generator */
 /* #define TCC_TARGET_X86_64 *//* x86-64 code generator */

 /* default target is I386 */
 #if !defined(TCC_TARGET_I386) && !defined(TCC_TARGET_ARM) && \
-    !defined(TCC_TARGET_C67) && !defined(TCC_TARGET_X86_64)
+    !defined(TCC_TARGET_ARM64) && !defined(TCC_TARGET_C67) && \
+    !defined(TCC_TARGET_X86_64)
 #define TCC_TARGET_I386
 #endif

 #if !defined(TCC_UCLIBC) && !defined(TCC_TARGET_ARM) && \
-    !defined(TCC_TARGET_C67) && !defined(TCC_TARGET_X86_64) && \
-    !defined(CONFIG_USE_LIBGCC)
+    !defined(TCC_TARGET_ARM64) && !defined(TCC_TARGET_C67) && \
+    !defined(TCC_TARGET_X86_64) && !defined(CONFIG_USE_LIBGCC)
 #define CONFIG_TCC_BCHECK /* enable bound checking code */
 #endif

 /* define it to include assembler support */
-#if !defined(TCC_TARGET_ARM) && !defined(TCC_TARGET_C67)
+#if !defined(TCC_TARGET_ARM) && !defined(TCC_TARGET_ARM64) && \
+    !defined(TCC_TARGET_C67)
 #define CONFIG_TCC_ASM
 #endif

@ -184,6 +187,8 @@
 #  define TCC_IS_NATIVE
 # elif defined __arm__ && defined TCC_TARGET_ARM
 #  define TCC_IS_NATIVE
+# elif defined __aarch64__ && defined TCC_TARGET_ARM64
+#  define TCC_IS_NATIVE
 # endif
 #endif

@ -256,6 +261,8 @@
 #  define CONFIG_TCC_ELFINTERP "/usr/libexec/ld-elf.so.2"
 # elif defined __GNU__
 #  define CONFIG_TCC_ELFINTERP "/lib/ld.so"
+# elif defined TCC_TARGET_ARM64
+#  define CONFIG_TCC_ELFINTERP "/lib/ld-linux-aarch64.so.1"
 # elif defined(TCC_TARGET_X86_64)
 #  define CONFIG_TCC_ELFINTERP "/lib64/ld-linux-x86-64.so.2"
 # elif defined(TCC_UCLIBC)
@ -290,6 +297,9 @@
 #ifdef TCC_TARGET_ARM
 # include "arm-gen.c"
 #endif
+#ifdef TCC_TARGET_ARM64
+# include "arm64-gen.c"
+#endif
 #ifdef TCC_TARGET_C67
 # include "coff.h"
 # include "c67-gen.c"
@ -1214,6 +1224,7 @@ ST_FUNC void vpushv(SValue *v);
 ST_FUNC void save_reg(int r);
 ST_FUNC int get_reg(int rc);
 ST_FUNC void save_regs(int n);
+ST_FUNC void gaddrof(void);
 ST_FUNC int gv(int rc);
 ST_FUNC void gv2(int rc1, int rc2);
 ST_FUNC void vpop(void);
@ -1357,6 +1368,15 @@ ST_FUNC uint32_t encbranch(int pos, int addr, int fail);
 ST_FUNC void gen_cvt_itof1(int t);
 #endif

+/* ------------ arm64-gen.c ------------ */
+#ifdef TCC_TARGET_ARM64
+ST_FUNC void gen_cvt_sxtw(void);
+ST_FUNC void gen_opl(int op);
+ST_FUNC void greturn(void);
+ST_FUNC void gen_va_start(void);
+ST_FUNC void gen_va_arg(CType *t);
+#endif
+
 /* ------------ c67-gen.c ------------ */
 #ifdef TCC_TARGET_C67
 #endif
--- a/tccelf.c
+++ b/tccelf.c
@ -291,7 +291,7 @@ ST_FUNC void put_elf_reloca(Section *symtab, Section *s, unsigned long offset,
    rel = section_ptr_add(sr, sizeof(ElfW_Rel));
    rel->r_offset = offset;
    rel->r_info = ELFW(R_INFO)(symbol, type);
-#ifdef TCC_TARGET_X86_64
+#if defined(TCC_TARGET_ARM64) || defined(TCC_TARGET_X86_64)
    rel->r_addend = addend;
 #else
    if (addend)
@ -506,7 +506,7 @@ ST_FUNC void relocate_section(TCCState *s1, Section *s)
        sym_index = ELFW(R_SYM)(rel->r_info);
        sym = &((ElfW(Sym) *)symtab_section->data)[sym_index];
        val = sym->st_value;
-#ifdef TCC_TARGET_X86_64
+#if defined(TCC_TARGET_ARM64) || defined(TCC_TARGET_X86_64)
        val += rel->r_addend;
 #endif
        type = ELFW(R_TYPE)(rel->r_info);
@ -760,6 +760,69 @@ ST_FUNC void relocate_section(TCCState *s1, Section *s)
            fprintf(stderr,"FIXME: handle reloc type %x at %x [%p] to %x\n",
                type, (unsigned)addr, ptr, (unsigned)val);
            break;
+#elif defined(TCC_TARGET_ARM64)
+        case R_AARCH64_ABS64:
+            *(uint64_t *)ptr = val;
+            break;
+        case R_AARCH64_ABS32:
+            *(uint32_t *)ptr = val;
+            break;
+        case R_AARCH64_MOVW_UABS_G0_NC:
+            *(uint32_t *)ptr = (*(uint32_t *)ptr & 0xffe0001f) |
+                (val & 0xffff) << 5;
+            break;
+        case R_AARCH64_MOVW_UABS_G1_NC:
+            *(uint32_t *)ptr = (*(uint32_t *)ptr & 0xffe0001f) |
+                (val >> 16 & 0xffff) << 5;
+            break;
+        case R_AARCH64_MOVW_UABS_G2_NC:
+            *(uint32_t *)ptr = (*(uint32_t *)ptr & 0xffe0001f) |
+                (val >> 32 & 0xffff) << 5;
+            break;
+        case R_AARCH64_MOVW_UABS_G3:
+            *(uint32_t *)ptr = (*(uint32_t *)ptr & 0xffe0001f) |
+                (val >> 48 & 0xffff) << 5;
+            break;
+        case R_AARCH64_ADR_PREL_PG_HI21: {
+            uint64_t off = (val >> 12) - (addr >> 12);
+            if ((off + ((uint64_t)1 << 20)) >> 21)
+                tcc_error("R_AARCH64_ADR_PREL_PG_HI21 relocation failed");
+            *(uint32_t *)ptr = (*(uint32_t *)ptr & 0x9f00001f) |
+                (off & 0x1ffffc) << 3 | (off & 3) << 29;
+            break;
+        }
+        case R_AARCH64_ADD_ABS_LO12_NC:
+            *(uint32_t *)ptr = (*(uint32_t *)ptr & 0xffc003ff) |
+                (val & 0xfff) << 10;
+            break;
+        case R_AARCH64_JUMP26: /* branch: 0x14000000 | imm26, imm26 = byte offset / 4 */
+        case R_AARCH64_CALL26: /* branch-and-link: same encoding with bit 31 set */
+            if (((val - addr) + ((uint64_t)1 << 27)) & ~(uint64_t)0xffffffc) /* beyond +/-128 MiB or not 4-byte aligned */
+                tcc_error("R_AARCH64_(JUMP|CALL)26 relocation failed");
+            *(uint32_t *)ptr = 0x14000000 | /* cast below: shifting an int 1 by 31 would overflow */
+                (uint32_t)(type == R_AARCH64_CALL26) << 31 | ((val - addr) >> 2 & 0x3ffffff);
+            break;
+        case R_AARCH64_ADR_GOT_PAGE: {
+            uint64_t off =
+                (((s1->got->sh_addr +
+                   s1->sym_attrs[sym_index].got_offset) >> 12) - (addr >> 12));
+            if ((off + ((uint64_t)1 << 20)) >> 21)
+                tcc_error("R_AARCH64_ADR_GOT_PAGE relocation failed");
+            *(uint32_t *)ptr = (*(uint32_t *)ptr & 0x9f00001f) |
+                (off & 0x1ffffc) << 3 | (off & 3) << 29;
+            break;
+        }
+        case R_AARCH64_LD64_GOT_LO12_NC:
+            *(uint32_t *)ptr = (*(uint32_t *)ptr & 0xfff803ff) |
+                ((s1->got->sh_addr + s1->sym_attrs[sym_index].got_offset)
+                 & 0xff8) << 7;
+            break;
+        case R_AARCH64_COPY:
+          break;
+        default:
+            fprintf(stderr, "FIXME: handle reloc type %x at %x [%p] to %x\n",
+                    type, (unsigned)addr, ptr, (unsigned)val);
+            break;
 #elif defined(TCC_TARGET_C67)
        case R_C60_32:
            *(int *)ptr += val;
@ -955,7 +1018,7 @@ static void put32(unsigned char *p, uint32_t val)
 }

 #if defined(TCC_TARGET_I386) || defined(TCC_TARGET_ARM) || \
-    defined(TCC_TARGET_X86_64)
+    defined(TCC_TARGET_ARM64) || defined(TCC_TARGET_X86_64)
 static uint32_t get32(unsigned char *p)
 {
    return p[0] | (p[1] << 8) | (p[2] << 16) | (p[3] << 24);
@ -1014,6 +1077,8 @@ static unsigned long put_got_entry(TCCState *s1,
        (reloc_type == R_386_JMP_SLOT);
 #elif defined(TCC_TARGET_ARM)
        (reloc_type == R_ARM_JUMP_SLOT);
+#elif defined(TCC_TARGET_ARM64)
+        (reloc_type == R_AARCH64_JUMP_SLOT);
 #else
        0;
 #endif
@ -1135,6 +1200,24 @@ static unsigned long put_got_entry(TCCState *s1,
 	    if (sym->st_shndx == SHN_UNDEF)
                offset = plt->data_offset - 16;
        }
+#elif defined(TCC_TARGET_ARM64)
+        if (need_plt_entry) {
+            Section *plt;
+            uint8_t *p;
+
+            if (s1->output_type == TCC_OUTPUT_DLL)
+                tcc_error("DLLs unimplemented!");
+
+            plt = s1->plt;
+            if (plt->data_offset == 0)
+                section_ptr_add(plt, 32);
+            p = section_ptr_add(plt, 16);
+            put32(p, s1->got->data_offset);
+            put32(p + 4, (uint64_t)s1->got->data_offset >> 32);
+
+            if (sym->st_shndx == SHN_UNDEF)
+                offset = plt->data_offset - 16;
+        }
 #elif defined(TCC_TARGET_C67)
    if (s1->dynsym) {
        tcc_error("C67 got not implemented");
@ -1277,6 +1360,18 @@ ST_FUNC void build_got_entries(TCCState *s1)
                    put32(p+2, 0x46c0); /* nop   */
                    put32(p+4, 0xeafffffe); /* b $sym */
                }
+#elif defined(TCC_TARGET_ARM64)
+                //xx Other cases may be required here:
+            case R_AARCH64_ADR_GOT_PAGE:
+            case R_AARCH64_LD64_GOT_LO12_NC:
+                if (!s1->got)
+                    build_got(s1);
+                sym_index = ELFW(R_SYM)(rel->r_info);
+                sym = &((ElfW(Sym) *)symtab_section->data)[sym_index];
+                reloc_type = R_AARCH64_GLOB_DAT;
+                put_got_entry(s1, reloc_type, sym->st_size, sym->st_info,
+                              sym_index);
+                break;
 #elif defined(TCC_TARGET_C67)
            case R_C60_GOT32:
            case R_C60_GOTOFF:
@ -1796,6 +1891,40 @@ ST_FUNC void relocate_plt(TCCState *s1)
            put32(p + 12, x + get32(p + 12) + s1->plt->data - p);
            p += 16;
        }
+#elif defined(TCC_TARGET_ARM64) /* fill in the 32-byte first slot, then each 16-byte entry */
+        uint64_t plt = s1->plt->sh_addr;
+        uint64_t got = s1->got->sh_addr;
+        uint64_t off = (got >> 12) - (plt >> 12); // page delta for adrp (may wrap negative)
+        if ((off + ((uint64_t)1 << 20)) >> 21) // must fit the signed 21-bit immediate
+            tcc_error("Failed relocating PLT");
+        put32(p, 0xa9bf7bf0); // stp x16,x30,[sp,#-16]!
+        put32(p + 4, (0x90000010 | // adrp x16,...
+                      (off & 0x1ffffc) << 3 | (off & 3) << 29));
+        put32(p + 8, (0xf9400211 | // ldr x17,[x16,#...]
+                      (got & 0xff8) << 7));
+        put32(p + 12, (0x91000210 | // add x16,x16,#...
+                       (got & 0xfff) << 10));
+        put32(p + 16, 0xd61f0220); // br x17
+        put32(p + 20, 0xd503201f); // nop
+        put32(p + 24, 0xd503201f); // nop
+        put32(p + 28, 0xd503201f); // nop
+        p += 32;
+        while (p < p_end) {
+            uint64_t pc = plt + (p - s1->plt->data);
+            uint64_t addr = got + // entry holds its 64-bit GOT offset, low word first
+                (get32(p) | (uint64_t)get32(p + 4) << 32);
+            uint64_t off = (addr >> 12) - (pc >> 12); // 64-bit: a 32-bit delta fails the check whenever it is negative
+            if ((off + ((uint64_t)1 << 20)) >> 21)
+                tcc_error("Failed relocating PLT");
+            put32(p, (0x90000010 | // adrp x16,...
+                      (off & 0x1ffffc) << 3 | (off & 3) << 29));
+            put32(p + 4, (0xf9400211 | // ldr x17,[x16,#...]
+                          (addr & 0xff8) << 7));
+            put32(p + 8, (0x91000210 | // add x16,x16,#...
+                          (addr & 0xfff) << 10));
+            put32(p + 12, 0xd61f0220); // br x17
+            p += 16;
+        }
 #elif defined(TCC_TARGET_C67)
        /* XXX: TODO */
 #else
@ -2093,7 +2222,7 @@ static void fill_dynamic(TCCState *s1, struct dyn_inf *dyninf)
    put_dt(dynamic, DT_SYMTAB, s1->dynsym->sh_addr);
    put_dt(dynamic, DT_STRSZ, dyninf->dynstr->data_offset);
    put_dt(dynamic, DT_SYMENT, sizeof(ElfW(Sym)));
-#ifdef TCC_TARGET_X86_64
+#if defined(TCC_TARGET_ARM64) || defined(TCC_TARGET_X86_64)
    put_dt(dynamic, DT_RELA, dyninf->rel_addr);
    put_dt(dynamic, DT_RELASZ, dyninf->rel_size);
    put_dt(dynamic, DT_RELAENT, sizeof(ElfW_Rel));
--- a/tccgen.c
+++ b/tccgen.c
@ -545,7 +545,7 @@ ST_FUNC void save_reg(int r)
                type = &p->type;
                if ((p->r & VT_LVAL) ||
                    (!is_float(type->t) && (type->t & VT_BTYPE) != VT_LLONG))
-#ifdef TCC_TARGET_X86_64
+#if defined(TCC_TARGET_ARM64) || defined(TCC_TARGET_X86_64)
                    type = &char_pointer_type;
 #else
                    type = &int_type;
@ -562,7 +562,7 @@ ST_FUNC void save_reg(int r)
                    o(0xd8dd); /* fstp %st(0) */
                }
 #endif
-#ifndef TCC_TARGET_X86_64
+#if !defined(TCC_TARGET_ARM64) && !defined(TCC_TARGET_X86_64)
                /* special long long case */
                if ((type->t & VT_BTYPE) == VT_LLONG) {
                    sv.c.ul += 4;
@ -681,7 +681,7 @@ static void move_reg(int r, int s, int t)
 }

 /* get address of vtop (vtop MUST BE an lvalue) */
-static void gaddrof(void)
+ST_FUNC void gaddrof(void)
 {
    if (vtop->r & VT_REF)
        gv(RC_INT);
@ -803,11 +803,13 @@ ST_FUNC int gv(int rc)

        r = vtop->r & VT_VALMASK;
        rc2 = (rc & RC_FLOAT) ? RC_FLOAT : RC_INT;
+#ifndef TCC_TARGET_ARM64
        if (rc == RC_IRET)
            rc2 = RC_LRET;
 #ifdef TCC_TARGET_X86_64
        else if (rc == RC_FRET)
            rc2 = RC_QRET;
+#endif
 #endif

        /* need to reload if:
@ -817,7 +819,7 @@ ST_FUNC int gv(int rc)
        if (r >= VT_CONST
         || (vtop->r & VT_LVAL)
         || !(reg_classes[r] & rc)
-#ifdef TCC_TARGET_X86_64
+#if defined(TCC_TARGET_ARM64) || defined(TCC_TARGET_X86_64)
         || ((vtop->type.t & VT_BTYPE) == VT_QLONG && !(reg_classes[vtop->r2] & rc2))
         || ((vtop->type.t & VT_BTYPE) == VT_QFLOAT && !(reg_classes[vtop->r2] & rc2))
 #else
@ -826,7 +828,7 @@ ST_FUNC int gv(int rc)
            )
        {
            r = get_reg(rc);
-#ifdef TCC_TARGET_X86_64
+#if defined(TCC_TARGET_ARM64) || defined(TCC_TARGET_X86_64)
            if (((vtop->type.t & VT_BTYPE) == VT_QLONG) || ((vtop->type.t & VT_BTYPE) == VT_QFLOAT)) {
                int addr_type = VT_LLONG, load_size = 8, load_type = ((vtop->type.t & VT_BTYPE) == VT_QLONG) ? VT_LLONG : VT_DOUBLE;
 #else
@ -838,7 +840,7 @@ ST_FUNC int gv(int rc)
                original_type = vtop->type.t;
                /* two register type load : expand to two words
                   temporarily */
-#ifndef TCC_TARGET_X86_64
+#if !defined(TCC_TARGET_ARM64) && !defined(TCC_TARGET_X86_64)
                if ((vtop->r & (VT_VALMASK | VT_LVAL)) == VT_CONST) {
                    /* load constant */
                    ll = vtop->c.ull;
@ -890,7 +892,7 @@ ST_FUNC int gv(int rc)
                t1 = t;
                /* compute memory access type */
                if (vtop->r & VT_REF)
-#ifdef TCC_TARGET_X86_64
+#if defined(TCC_TARGET_ARM64) || defined(TCC_TARGET_X86_64)
                    t = VT_PTR;
 #else
                    t = VT_INT;
@ -952,6 +954,7 @@ ST_FUNC void gv2(int rc1, int rc2)
    }
 }

+#ifndef TCC_TARGET_ARM64
 /* wrapper around RC_FRET to return a register by type */
 static int rc_fret(int t)
 {
@ -962,6 +965,7 @@ static int rc_fret(int t)
 #endif
    return RC_FRET;
 }
+#endif

 /* wrapper around REG_FRET to return a register by type */
 static int reg_fret(int t)
@ -1147,7 +1151,7 @@ ST_FUNC int gvtst(int inv, int t)
    return gtst(inv, t);
 }

-#ifndef TCC_TARGET_X86_64
+#if !defined(TCC_TARGET_ARM64) && !defined(TCC_TARGET_X86_64)
 /* generate CPU independent (unsigned) long long operations */
 static void gen_opl(int op)
 {
@ -1358,7 +1362,7 @@ static void gen_opl(int op)
 #elif defined(TCC_TARGET_ARM)
                b = ind;
                o(0x1A000000 | encbranch(ind, 0, 1));
-#elif defined(TCC_TARGET_C67)
+#elif defined(TCC_TARGET_C67) || defined(TCC_TARGET_ARM64)
                tcc_error("not implemented");
 #else
 #error not supported
@ -1512,7 +1516,8 @@ static void gen_opic(int op)
        general_case:
            if (!nocode_wanted) {
                /* call low level op generator */
-                if (t1 == VT_LLONG || t2 == VT_LLONG) 
+                if (t1 == VT_LLONG || t2 == VT_LLONG ||
+                    (PTR_SIZE == 8 && (t1 == VT_PTR || t2 == VT_PTR)))
                    gen_opl(op);
                else
                    gen_opi(op);
@ -1679,7 +1684,7 @@ ST_FUNC void gen_op(int op)
        if (op >= TOK_ULT && op <= TOK_LOR) {
            check_comparison_pointer_types(vtop - 1, vtop, op);
            /* pointers are handled are unsigned */
-#ifdef TCC_TARGET_X86_64
+#if defined(TCC_TARGET_ARM64) || defined(TCC_TARGET_X86_64)
            t = VT_LLONG | VT_UNSIGNED;
 #else
            t = VT_INT | VT_UNSIGNED;
@ -1700,7 +1705,7 @@ ST_FUNC void gen_op(int op)
            vrott(3);
            gen_opic(op);
            /* set to integer type */
-#ifdef TCC_TARGET_X86_64
+#if defined(TCC_TARGET_ARM64) || defined(TCC_TARGET_X86_64)
            vtop->type.t = VT_LLONG;
 #else
            vtop->type.t = VT_INT; 
@ -1724,7 +1729,7 @@ ST_FUNC void gen_op(int op)
                u = pointed_size(&vtop[-1].type);
                if (u < 0)
                    tcc_error("unknown array element size");
-#ifdef TCC_TARGET_X86_64
+#if defined(TCC_TARGET_ARM64) || defined(TCC_TARGET_X86_64)
                vpushll(u);
 #else
                /* XXX: cast to int ? (long long case) */
@ -1833,6 +1838,9 @@ ST_FUNC void gen_op(int op)
 /* generic itof for unsigned long long case */
 static void gen_cvt_itof1(int t)
 {
+#ifdef TCC_TARGET_ARM64
+    gen_cvt_itof(t);
+#else
    if ((vtop->type.t & (VT_BTYPE | VT_UNSIGNED)) == 
        (VT_LLONG | VT_UNSIGNED)) {

@ -1851,12 +1859,16 @@ static void gen_cvt_itof1(int t)
    } else {
        gen_cvt_itof(t);
    }
+#endif
 }
 #endif

 /* generic ftoi for unsigned long long case */
 static void gen_cvt_ftoi1(int t)
 {
+#ifdef TCC_TARGET_ARM64
+    gen_cvt_ftoi(t);
+#else
    int st;

    if (t == (VT_LLONG | VT_UNSIGNED)) {
@ -1878,6 +1890,7 @@ static void gen_cvt_ftoi1(int t)
    } else {
        gen_cvt_ftoi(t);
    }
+#endif
 }

 /* force char or short cast */
@ -1968,7 +1981,7 @@ static void gen_cast(CType *type)
                    vtop->c.ll = vtop->c.ull;
                else if (sbt & VT_UNSIGNED)
                    vtop->c.ll = vtop->c.ui;
-#ifdef TCC_TARGET_X86_64
+#if defined(TCC_TARGET_ARM64) || defined(TCC_TARGET_X86_64)
                else if (sbt == VT_PTR)
                    ;
 #endif
@ -1979,7 +1992,7 @@ static void gen_cast(CType *type)
                    vtop->c.ull = vtop->c.ll;
                else if (dbt == VT_BOOL)
                    vtop->c.i = (vtop->c.ll != 0);
-#ifdef TCC_TARGET_X86_64
+#if defined(TCC_TARGET_ARM64) || defined(TCC_TARGET_X86_64)
                else if (dbt == VT_PTR)
                    ;
 #endif
@ -2024,7 +2037,7 @@ static void gen_cast(CType *type)
                        gen_cast(type);
                    }
                }
-#ifndef TCC_TARGET_X86_64
+#if !defined(TCC_TARGET_ARM64) && !defined(TCC_TARGET_X86_64)
            } else if ((dbt & VT_BTYPE) == VT_LLONG) {
                if ((sbt & VT_BTYPE) != VT_LLONG) {
                    /* scalar to long long */
@ -2056,11 +2069,18 @@ static void gen_cast(CType *type)
                    (sbt & VT_BTYPE) != VT_PTR &&
                    (sbt & VT_BTYPE) != VT_FUNC) {
                    /* need to convert from 32bit to 64bit */
-                    int r = gv(RC_INT);
+                    gv(RC_INT);
                    if (sbt != (VT_INT | VT_UNSIGNED)) {
+#if defined(TCC_TARGET_ARM64)
+                        gen_cvt_sxtw();
+#elif defined(TCC_TARGET_X86_64)
+                        int r = gv(RC_INT);
                        /* x86_64 specific: movslq */
                        o(0x6348);
                        o(0xc0 + (REG_VALUE(r) << 3) + REG_VALUE(r));
+#else
+#error
+#endif
                    }
                }
 #endif
@ -2589,7 +2609,7 @@ ST_FUNC void vstore(void)
            if ((vtop[-1].r & VT_VALMASK) == VT_LLOCAL) {
                SValue sv;
                t = get_reg(RC_INT);
-#ifdef TCC_TARGET_X86_64
+#if defined(TCC_TARGET_ARM64) || defined(TCC_TARGET_X86_64)
                sv.type.t = VT_PTR;
 #else
                sv.type.t = VT_INT;
@ -2600,7 +2620,7 @@ ST_FUNC void vstore(void)
                vtop[-1].r = t | VT_LVAL;
            }
            /* two word case handling : store second register at word + 4 (or +8 for x86-64)  */
-#ifdef TCC_TARGET_X86_64
+#if defined(TCC_TARGET_ARM64) || defined(TCC_TARGET_X86_64)
            if (((ft & VT_BTYPE) == VT_QLONG) || ((ft & VT_BTYPE) == VT_QFLOAT)) {
                int addr_type = VT_LLONG, load_size = 8, load_type = ((vtop->type.t & VT_BTYPE) == VT_QLONG) ? VT_LLONG : VT_DOUBLE;
 #else
@ -3098,6 +3118,13 @@ static int parse_btype(CType *type, AttributeDef *ad)
                goto basic_type1;
            }
            break;
+#ifdef TCC_TARGET_ARM64
+        case TOK_UINT128:
+            /* GCC's __uint128_t appears in some Linux header files. Make it a
+               synonym for long double to get the size and alignment right. */
+            u = VT_LDOUBLE;
+            goto basic_type;
+#endif
        case TOK_BOOL:
            u = VT_BOOL;
            goto basic_type;
@ -3233,7 +3260,8 @@ the_end:

    /* long is never used as type */
    if ((t & VT_BTYPE) == VT_LONG)
-#if !defined TCC_TARGET_X86_64 || defined TCC_TARGET_PE
+#if (!defined TCC_TARGET_X86_64 && !defined TCC_TARGET_ARM64) || \
+    defined TCC_TARGET_PE
        t = (t & ~VT_BTYPE) | VT_INT;
 #else
        t = (t & ~VT_BTYPE) | VT_LLONG;
@ -3881,6 +3909,36 @@ ST_FUNC void unary(void)
        break;
 #endif
 #endif
+
+#ifdef TCC_TARGET_ARM64
+    case TOK___va_start: { /* parses __va_start(expr, expr) */
+        next();
+        skip('(');
+        expr_eq();
+        skip(',');
+        expr_eq();
+        skip(')');
+        /* TODO: the types of the two operands are not validated */
+        gen_va_start();
+        vpushi(0); /* push constant 0 as the expression's result ... */
+        vtop->type.t = VT_VOID; /* ... and retype it as void */
+        break;
+    }
+    case TOK___va_arg: { /* parses __va_arg(expr, type-name) */
+        CType type;
+        next();
+        skip('(');
+        expr_eq();
+        skip(',');
+        parse_type(&type);
+        skip(')');
+        /* TODO: the operand type is not validated */
+        gen_va_arg(&type);
+        vtop->type = type; /* result takes the type named in the call */
+        break;
+    }
+#endif
+
    case TOK_INC:
    case TOK_DEC:
        t = tok;
@ -4071,6 +4129,15 @@ ST_FUNC void unary(void)
                if (!ret_nregs) {
                    /* get some space for the returned structure */
                    size = type_size(&s->type, &align);
+#ifdef TCC_TARGET_ARM64
+                /* On arm64, a small struct is returned in registers.
+                   It is much easier to write it to memory if we know
+                   that we are allowed to write some extra bytes, so
+                   round the allocated space up to a power of 2: */
+                if (size < 16)
+                    while (size & (size - 1))
+                        size = (size | (size - 1)) + 1;
+#endif
                    loc = (loc - size) & -align;
                    ret.type = s->type;
                    ret.r = VT_LOCAL | VT_LVAL;
@ -4094,12 +4161,14 @@ ST_FUNC void unary(void)
                      ret.r2 = REG_QRET;
 #endif
                } else {
+#ifndef TCC_TARGET_ARM64
 #ifdef TCC_TARGET_X86_64
                    if ((ret.type.t & VT_BTYPE) == VT_QLONG)
 #else
                    if ((ret.type.t & VT_BTYPE) == VT_LLONG)
 #endif
                        ret.r2 = REG_LRET;
+#endif
                    ret.r = REG_IRET;
                }
                ret.c.i = 0;
@ -4717,6 +4786,10 @@ static void block(int *bsym, int *csym, int *case_sym, int *def_sym,
        if (tok != ';') {
            gexpr();
            gen_assign_cast(&func_vt);
+#ifdef TCC_TARGET_ARM64
+            // Perhaps it would be better to use this for all backends:
+            greturn();
+#else
            if ((func_vt.t & VT_BTYPE) == VT_STRUCT) {
                CType type, ret_type;
                int ret_align, ret_nregs;
@ -4770,6 +4843,7 @@ static void block(int *bsym, int *csym, int *case_sym, int *def_sym,
            } else {
                gv(RC_IRET);
            }
+#endif
            vtop--; /* NOT vpop() because on x86 it would flush the fp stack */
        }
        skip(';');
@ -5160,9 +5234,9 @@ static void init_putv(CType *type, Section *sec, unsigned long c,
        /* XXX: generate error if incorrect relocation */
        gen_assign_cast(&dtype);
        bt = type->t & VT_BTYPE;
-        /* we'll write at most 12 bytes */
-        if (c + 12 > sec->data_allocated) {
-            section_realloc(sec, c + 12);
+        /* we'll write at most 16 bytes */
+        if (c + 16 > sec->data_allocated) {
+            section_realloc(sec, c + 16);
        }
        ptr = sec->data + c;
        /* XXX: make code faster ? */
@ -5184,6 +5258,9 @@ static void init_putv(CType *type, Section *sec, unsigned long c,
             (bt == VT_INT && bit_size != 32)))
            tcc_error("initializer element is not computable at load time");
        switch(bt) {
+            /* XXX: when cross-compiling we assume that each type has the
+               same representation on host and target, which is likely to
+               be wrong in the case of long double */
        case VT_BOOL:
            vtop->c.i = (vtop->c.i != 0);
        case VT_BYTE:
@ -5203,7 +5280,7 @@ static void init_putv(CType *type, Section *sec, unsigned long c,
            break;
        case VT_PTR: {
            addr_t val = (vtop->c.ptr_offset & bit_mask) << bit_pos;
-#ifdef TCC_TARGET_X86_64
+#if defined(TCC_TARGET_ARM64) || defined(TCC_TARGET_X86_64)
            if (vtop->r & VT_SYM)
                greloca(sec, vtop->sym, c, R_DATA_PTR, val);
            else
@ -5217,7 +5294,7 @@ static void init_putv(CType *type, Section *sec, unsigned long c,
        }
        default: {
            int val = (vtop->c.i & bit_mask) << bit_pos;
-#ifdef TCC_TARGET_X86_64
+#if defined(TCC_TARGET_ARM64) || defined(TCC_TARGET_X86_64)
            if (vtop->r & VT_SYM)
                greloca(sec, vtop->sym, c, R_DATA_PTR, val);
            else
--- a/tccrun.c
+++ b/tccrun.c
@ -604,6 +604,27 @@ static int rt_get_caller_pc(addr_t *paddr, ucontext_t *uc, int level)
    }
 }

+/* ------------------------------------------------------------- */
+#elif defined(__aarch64__)
+
+/* Look up the program counter of a stack frame from a saved context
+ * (AArch64 variant).
+ *
+ * paddr: receives the program counter of the requested frame.
+ * uc:    saved context; the pc is read from uc_mcontext.pc and the
+ *        frame pointer from uc_mcontext.regs[29] (x29).
+ * level: 0 for the pc stored in the context itself, n > 0 for the
+ *        n-th caller reached by following the frame-pointer chain.
+ *
+ * Returns 0 on success, -1 if level is negative or if the chain ends
+ * (null frame pointer) before the requested level is reached.
+ */
+static int rt_get_caller_pc(addr_t *paddr, ucontext_t *uc, int level)
+{
+    addr_t *fp;
+    int i;
+
+    if (level < 0)
+        return -1;
+    if (level == 0) {
+        *paddr = uc->uc_mcontext.pc;
+        return 0;
+    }
+
+    /* Each frame record is read as two words: fp[0] is the caller's
+       frame pointer and fp[1] the return address into the caller. */
+    fp = (addr_t *)uc->uc_mcontext.regs[29];
+    for (i = 1; i < level; i++) {
+        /* Stop instead of dereferencing a null frame pointer when the
+           requested level is deeper than the chain. */
+        if (!fp)
+            return -1;
+        fp = (addr_t *)fp[0];
+    }
+    if (!fp)
+        return -1;
+    *paddr = fp[1];
+    return 0;
+}
+
 /* ------------------------------------------------------------- */
 #else

--- a/tcctok.h
+++ b/tcctok.h
@ -59,6 +59,10 @@
     DEF(TOK_ASM2, "__asm")
     DEF(TOK_ASM3, "__asm__")

+#ifdef TCC_TARGET_ARM64
+     DEF(TOK_UINT128, "__uint128_t") /* GCC type name; parse_btype() maps it to long double */
+#endif
+
 /*********************************************************************/
 /* the following are not keywords. They are included to ease parsing */
 /* preprocessor only */
@ -136,6 +140,11 @@
     DEF(TOK_REGPARM1, "regparm")
     DEF(TOK_REGPARM2, "__regparm__")

+#ifdef TCC_TARGET_ARM64
+     DEF(TOK___va_start, "__va_start") /* parsed in unary(); calls gen_va_start() */
+     DEF(TOK___va_arg, "__va_arg")     /* parsed in unary(); calls gen_va_arg() */
+#endif
+
 /* pragma */
     DEF(TOK_pack, "pack")
 #if !defined(TCC_TARGET_I386) && !defined(TCC_TARGET_X86_64)
@ -229,6 +238,30 @@
 #if defined TCC_TARGET_PE
     DEF(TOK___chkstk, "__chkstk")
 #endif
+#ifdef TCC_TARGET_ARM64 /* "tf" helper names (libgcc-style 128-bit float routines); NOTE(review): presumably used for long double -- confirm in arm64-gen.c */
+     DEF(TOK___addtf3, "__addtf3") /* arithmetic */
+     DEF(TOK___subtf3, "__subtf3")
+     DEF(TOK___multf3, "__multf3")
+     DEF(TOK___divtf3, "__divtf3")
+     DEF(TOK___extendsftf2, "__extendsftf2") /* widening/narrowing between float, double and tf */
+     DEF(TOK___extenddftf2, "__extenddftf2")
+     DEF(TOK___trunctfsf2, "__trunctfsf2")
+     DEF(TOK___trunctfdf2, "__trunctfdf2")
+     DEF(TOK___fixtfsi, "__fixtfsi") /* tf to integer */
+     DEF(TOK___fixtfdi, "__fixtfdi")
+     DEF(TOK___fixunstfsi, "__fixunstfsi")
+     DEF(TOK___fixunstfdi, "__fixunstfdi")
+     DEF(TOK___floatsitf, "__floatsitf") /* integer to tf */
+     DEF(TOK___floatditf, "__floatditf")
+     DEF(TOK___floatunsitf, "__floatunsitf")
+     DEF(TOK___floatunditf, "__floatunditf")
+     DEF(TOK___eqtf2, "__eqtf2") /* comparisons */
+     DEF(TOK___netf2, "__netf2")
+     DEF(TOK___lttf2, "__lttf2")
+     DEF(TOK___letf2, "__letf2")
+     DEF(TOK___gttf2, "__gttf2")
+     DEF(TOK___getf2, "__getf2")
+#endif

 /* bound checking symbols */
 #ifdef CONFIG_TCC_BCHECK