Support basic libc function

TCC tools don't run exec command.
Implementations for the cstring header.
2020-05-23 01:35:11 +02:00 · 2020-05-21 04:27:31 +02:00 · 2020-05-21 04:19:56 +02:00 · 2020-05-16 19:15:10 +02:00 · 2017-12-12 17:57:20 +01:00 · 2017-12-12 17:33:37 +01:00
372 changed files with 38991 additions and 16801 deletions
--- a/.cvsignore
+++ b/.cvsignore
@ -1,37 +0,0 @@
-tcc_g
-tcc
-tc2.c
-doc
-tc3s.c
-p3.c
-tc1.c
-error.c
-i386-gen1.c
-test.out2
-test.out3
-web.sh
-memdebug.c
-bench
-Makefile.uClibc
-boundtest
-prog.ref
-test.ref
-test.out
-tcc-doc.html
-ideas
-tcctest.ref
-linux.tcc
-ldtest
-libtcc_test
-instr.S
-p.c
-p2.c
-tcctest[1234]
-test[1234].out
-.gdb_history
-tcc.1
-tcc.pod
-config.h
-config.mak
-config.texi
-tests
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,57 @@
+*~
+\#*
+.#*
+*.o
+*.a
+*.exe
+*.dll
+*.obj
+*.pdb
+*.lib
+*.exp
+*.log
+*.bz2
+*.zip
+.gdb_history
+a.out
+tcc_g
+tcc
+*-tcc
+libtcc*.def
+
+config*.h
+config*.mak
+config.texi
+conftest*
+tags
+TAGS
+tcc.1
+tcc.pod
+tcc-doc.html
+tcc-doc.info
+
+win32/doc
+win32/libtcc
+win32/lib/32
+win32/lib/64
+win32/include/float.h
+win32/include/stdarg.h
+win32/include/stdbool.h
+win32/include/stddef.h
+win32/include/varargs.h
+win32/include/tcclib.h
+
+tests/tcctest[1234]
+tests/tcctest.gcc
+tests/*.out*
+tests/*.ref
+tests/*.txt
+tests/*.gcc
+tests/*-cc*
+tests/*-tcc*
+tests/libtcc_test
+tests/asm-c-connect
+tests/asm-c-connect-sep
+tests/vla_test
+tests/hello
+tests/tests2/fred.txt
--- a/73
+++ b/73
@ -1,7 +1,78 @@
+Version 0.9.27:
+
+User interface:
+- -x[c|a|n] filetype option (Sergey Korshunoff)
+- -P[1], -dD, -dM preprocessor options (Sergey Korshunoff)
+- -Wl,-(no-)whole-archive linker option (Reuben Thomas)
+- -mms-bitfields option (David Mertens)
+- -include <file> option (Michael Matz)
+- -mno-sse on x86-64 disables use of SSE instructions
+- @listfile support (Vlad Vissoultchev)
+- tcc -ar/-impdef - formerly tiny_xxx tools integrated (grischka)
+- CPATH, C_INCLUDE_PATH and LIBRARY_PATH environment variables support
+  (Andrew Aladjev, Urs Janssen)
+
+Platforms:
+- new AARCH64 (arm64) target (Edmund Grimley Evans)
+- vastly improved support for ARM hard float calling convention
+   (Thomas Preud'homme, Daniel Glöckner)
+- provide a runtime library for ARM (Thomas Preud'homme)
+- many x86_64 ABI fixes incl. XMM register passing and tests (James Lyon)
+- ABI tests with native compiler using libtcc (James Lyon)
+- UNICODE startup code supports wmain and wWinMain (YX Hao)
+- shared libraries for x86_64 (Michael Matz)
+- Bootstrap native Windows 32/64 compiler using Cygwin+gcc (Christian Jullien)
+
+Features:
+- VLA (variable length array) improved (James Lyon, Pip Cet)
+- import functions by ordinal in .def files on windows (YX Hao)
+- x86/x86_64 assembler much improved (Michael Matz)
+- simple dead code suppression (Edmund Grimley Evans, Michael Matz, grischka)
+- implement round/fmin/fmax etc. math on windows (Avi Halachmi)
+- #pragma once support (Sergey Korshunoff, Vlad Vissoultchev, ...)
+- switch/case code improved (Zdenek Pavlas)
+- ~15% faster by TinyAlloc fast memory allocator (Vlad Vissoultchev)
+- standard conforming (and GCC compatible) struct initialization
+   (Michael Matz)
+- bit-field layout made compatible with GCC (Michael Matz)
+- UTF8 in string literals supported (Zdenek Pavlas)
+- _Generic(...) supported (Matthias Gatto)
+
+Licensing:
+- TinyCC partly relicensed to MIT license (See RELICENSING file).
+
+version 0.9.26:
+
+User interface:
+- -MD/-MF (automatically generate dependencies for make)
+- -pthread option (same as -D_REENTRANT -lpthread) (Henry Kroll III)
+- -m32/-m64 to re-exec cross compiler (Henry Kroll III)
+- -Wl, Mimic all GNU -option forms supported by ld (Kirill Smelkov)
+- new LIBTCCAPI tcc_set_options() (grischka)
+
+Platforms:
+- Many improvements for x86-64 target (Shinichiro Hamaji, Michael Matz, grischka)
+- x86-64 assembler (Frederic Feret)
+- Many improvements for ARM target (Daniel Glöckner, Thomas Preud'homme)
+- Support WinCE PE ARM (Timo VJ Lahde)
+- Support ARM hardfloat calling convention (Thomas Preud'homme)
+- Support SELinux (Security-Enhanced Linux) (Henry Kroll III)
+- Support Debian GNU/kFreeBSD kernels (Pierre Chifflier)
+- Support GNU/Hurd kernels (Thomas Preud'homme)
+- Support OSX (tcc -run only) (Milutin Jovanovic)
+- Support multiarch configuration (Thomas Preud'homme)
+- Support out-of-tree build (Akim Demaille)
+
+Features:
+- C99 variable length arrays (Thomas Preud'homme & Joe Soroka)
+- Asm labels for variables and functions (Thomas Preud'homme)
+- STT_GNU_IFUNC (Indirect functions as externals) (Thomas Preud'homme)
+- More tests (tests2) (Milutin Jovanovic)
+
 version 0.9.25:

 - first support for x86-64 target (Shinichiro Hamaji)
- support ｵClibc
+- support µClibc
 - split tcc.c into tcc.h libtcc.c tccpp.c tccgen.c tcc.c
 - improved preprocess output with linenumbers and spaces preserved
 - tcc_relocate now copies code into user buffer
--- a/71
+++ b/71
@ -0,0 +1,71 @@
+
+In general, use the same coding style as the surrounding code.
+
+However, do not make any unnecessary changes as that complicates
+the VCS (git) history and makes it harder to merge patches. So
+do not modify code just to make it conform to a coding style.
+
+    Indentation
+
+Turn on a "fill tabs with spaces" option in your editor.
+
+Remove tabs and trailing spaces from any lines that are modified.
+
+Note that some files are indented with 2 spaces (when they
+have large indentation) while most are indented with 4 spaces.
+
+    Language
+
+TCC is mostly implemented in C90. Do not use any non-C90 features
+that are not already in use.
+
+Non-C90 features currently in use, as revealed by
+./configure --extra-cflags="-std=c90 -Wpedantic":
+
+- long long (including "LL" constants)
+- inline
+- very long string constants
+- assignment between function pointer and 'void *'
+- "//" comments
+- empty macro arguments (DEF_ASMTEST in i386-tok.h)
+- unnamed struct and union fields (in struct Sym), a C11 feature
+
+    Testing
+
+A simple "make test" is sufficient for some simple changes. However,
+before committing a change consider performing some of the following
+additional tests:
+
+- Build and run "make test" on several architectures.
+
+- Build with ./configure --enable-cross.
+
+- If the generation of relocations has been changed, try compiling
+  with TCC and linking with GCC/Clang. If the linker has been
+  modified, try compiling with GCC/Clang and linking with TCC.
+
+- Test with ASan/UBSan to detect memory corruption and undefined behaviour:
+
+make clean
+./configure
+make
+make test
+cp libtcc.a libtcc.a.hide
+
+make clean
+./configure --extra-cflags="-fsanitize=address,undefined -g"
+make
+cp libtcc.a.hide libtcc.a
+make test
+
+- Test with Valgrind to detect some uses of uninitialised values:
+
+make clean
+./configure
+make
+# On Intel, because Valgrind does floating-point arithmetic differently:
+( cd tests && gcc -I.. tcctest.c && valgrind -q ./a.out > test.ref )
+make test TCC="valgrind -q --leak-check=full `pwd`/tcc -B`pwd` -I`pwd`"
+
+  (Because of how VLAs are implemented, invalid reads are expected
+  with 79_vla_continue.)
--- a/594
+++ b/594
@ -1,281 +1,403 @@
+# --------------------------------------------------------------------------
 #
 # Tiny C Compiler Makefile
 #

-TOP ?= .
+ifndef TOP
+ TOP = .
+ INCLUDED = no
+endif
+
 include $(TOP)/config.mak

-CFLAGS+=-g -Wall
-CFLAGS_P=$(CFLAGS) -pg -static -DCONFIG_TCC_STATIC
-LIBS_P=
-
-ifneq ($(GCC_MAJOR),2)
-CFLAGS+=-fno-strict-aliasing
-ifneq ($(GCC_MAJOR),3)
-CFLAGS+=-Wno-pointer-sign -Wno-sign-compare -D_FORTIFY_SOURCE=0
-endif
+ifeq (-$(CC)-$(GCC_MAJOR)-$(findstring $(GCC_MINOR),56789)-,-gcc-4--)
+ CFLAGS += -D_FORTIFY_SOURCE=0
 endif

-ifeq ($(ARCH),i386)
-CFLAGS+=-mpreferred-stack-boundary=2
-ifeq ($(GCC_MAJOR),2)
-CFLAGS+=-m386 -malign-functions=0
-else
-CFLAGS+=-march=i386 -falign-functions=0
-endif
-endif
-
-ifndef CONFIG_WIN32
-LIBS=-lm
-ifndef CONFIG_NOLDL
-LIBS+=-ldl
-endif
-endif
-
-ifeq ($(ARCH),i386)
-NATIVE_TARGET=-DTCC_TARGET_I386
-LIBTCC1=libtcc1.a
-BCHECK_O=bcheck.o
-ALLOCA_O=alloca86.o alloca86-bt.o
-else
-ifeq ($(ARCH),x86-64)
-NATIVE_TARGET=-DTCC_TARGET_X86_64
-LIBTCC1=libtcc1.a
-BCHECK_O=
-ALLOCA_O=alloca86_64.o
-endif
-endif
-
-ifeq ($(ARCH),arm)
-NATIVE_TARGET=-DTCC_TARGET_ARM
-NATIVE_TARGET+=$(if $(wildcard /lib/ld-linux.so.3),-DTCC_ARM_EABI)
-NATIVE_TARGET+=$(if $(shell grep -l "^Features.* \(vfp\|iwmmxt\) " /proc/cpuinfo),-DTCC_ARM_VFP)
-endif
+LIBTCC = libtcc.a
+LIBTCC1 = libtcc1.a
+LINK_LIBTCC =
+LIBS =
+CFLAGS += -I$(TOP)
+CFLAGS += $(CPPFLAGS)
+VPATH = $(TOPSRC)

 ifdef CONFIG_WIN32
-NATIVE_TARGET+=-DTCC_TARGET_PE
-BCHECK_O=
-endif
-
-ifneq ($(wildcard /lib/ld-uClibc.so.0),)
-NATIVE_TARGET+=-DTCC_UCLIBC
-BCHECK_O=
-endif
-
-ifdef CONFIG_USE_LIBGCC
-LIBTCC1=
-endif
-
-ifeq ($(TOP),.)
-
-PROGS=tcc$(EXESUF)
-I386_CROSS = i386-tcc$(EXESUF)
-WIN32_CROSS = i386-win32-tcc$(EXESUF)
-WIN64_CROSS = x86_64-win32-tcc$(EXESUF)
-WINCE_CROSS = arm-win32-tcc$(EXESUF)
-X64_CROSS = x86_64-tcc$(EXESUF)
-ARM_CROSS = arm-tcc-fpa$(EXESUF) arm-tcc-fpa-ld$(EXESUF) \
-    arm-tcc-vfp$(EXESUF) arm-tcc-vfp-eabi$(EXESUF)
-C67_CROSS = c67-tcc$(EXESUF)
-
-CORE_FILES = tcc.c libtcc.c tccpp.c tccgen.c tccelf.c tccasm.c \
-    tcc.h config.h libtcc.h tcctok.h
-I386_FILES = $(CORE_FILES) i386-gen.c i386-asm.c i386-asm.h i386-tok.h
-WIN32_FILES = $(CORE_FILES) i386-gen.c i386-asm.c i386-asm.h i386-tok.h tccpe.c
-WIN64_FILES = $(CORE_FILES) x86_64-gen.c x86_64-asm.c x86_64-asm.h x86_64-tok.h tccpe.c
-WINCE_FILES = $(CORE_FILES) arm-gen.c tccpe.c
-X86_64_FILES = $(CORE_FILES) x86_64-gen.c x86_64-asm.c x86_64-asm.h x86_64-tok.h
-ARM_FILES = $(CORE_FILES) arm-gen.c
-C67_FILES = $(CORE_FILES) c67-gen.c tcccoff.c
-
-ifdef CONFIG_WIN32
-PROGS+=tiny_impdef$(EXESUF) tiny_libmaker$(EXESUF)
-NATIVE_FILES=$(WIN32_FILES)
-PROGS_CROSS=$(WIN64_CROSS) $(I386_CROSS) $(X64_CROSS) $(ARM_CROSS) $(C67_CROSS)
+ ifneq ($(CONFIG_static),yes)
+  LIBTCC = libtcc$(DLLSUF)
+  LIBTCCDEF = libtcc.def
+ endif
+ CFGWIN = -win
+ NATIVE_TARGET = $(ARCH)-win$(if $(findstring arm,$(ARCH)),ce,32)
 else
-ifeq ($(ARCH),i386)
-NATIVE_FILES=$(I386_FILES)
-PROGS_CROSS=$(X64_CROSS) $(WIN32_CROSS) $(WIN64_CROSS) $(ARM_CROSS) $(C67_CROSS)
-else
-ifeq ($(ARCH),x86-64)
-NATIVE_FILES=$(X86_64_FILES)
-PROGS_CROSS=$(I386_CROSS) $(WIN32_CROSS) $(WIN64_CROSS) $(ARM_CROSS) $(C67_CROSS)
-else
-ifeq ($(ARCH),arm)
-NATIVE_FILES=$(ARM_FILES)
-PROGS_CROSS=$(I386_CROSS) $(X64_CROSS) $(WIN32_CROSS) $(WIN64_CROSS) $(C67_CROSS)
+ LIBS=-lm
+ ifneq ($(CONFIG_ldl),no)
+  LIBS+=-ldl
+ endif
+ # make libtcc as static or dynamic library?
+ ifeq ($(CONFIG_static),no)
+  LIBTCC=libtcc$(DLLSUF)
+  export LD_LIBRARY_PATH := $(CURDIR)/$(TOP)
+  ifneq ($(CONFIG_rpath),no)
+   LINK_LIBTCC += -Wl,-rpath,"$(libdir)"
+  endif
+ endif
+ CFGWIN =-unx
+ NATIVE_TARGET = $(ARCH)
+ ifdef CONFIG_OSX
+  NATIVE_TARGET = $(ARCH)-osx
+  LDFLAGS += -flat_namespace -undefined warning
+  export MACOSX_DEPLOYMENT_TARGET := 10.2
+ endif
 endif
+
+# run local version of tcc with local libraries and includes
+TCCFLAGS-unx = -B$(TOP) -I$(TOPSRC)/include -I$(TOPSRC) -I$(TOP)
+TCCFLAGS-win = -B$(TOPSRC)/win32 -I$(TOPSRC)/include -I$(TOPSRC) -I$(TOP) -L$(TOP)
+TCCFLAGS = $(TCCFLAGS$(CFGWIN))
+TCC = $(TOP)/tcc$(EXESUF) $(TCCFLAGS)
+ifdef CONFIG_OSX
+ TCCFLAGS += -D_ANSI_SOURCE
 endif
+
+CFLAGS_P = $(CFLAGS) -pg -static -DCONFIG_TCC_STATIC -DTCC_PROFILE
+LIBS_P = $(LIBS)
+LDFLAGS_P = $(LDFLAGS)
+
+CONFIG_$(ARCH) = yes
+NATIVE_DEFINES_$(CONFIG_i386) += -DTCC_TARGET_I386
+NATIVE_DEFINES_$(CONFIG_x86_64) += -DTCC_TARGET_X86_64
+NATIVE_DEFINES_$(CONFIG_WIN32) += -DTCC_TARGET_PE
+NATIVE_DEFINES_$(CONFIG_OSX) += -DTCC_TARGET_MACHO
+NATIVE_DEFINES_$(CONFIG_uClibc) += -DTCC_UCLIBC
+NATIVE_DEFINES_$(CONFIG_musl) += -DTCC_MUSL
+NATIVE_DEFINES_$(CONFIG_libgcc) += -DCONFIG_USE_LIBGCC
+NATIVE_DEFINES_$(CONFIG_selinux) += -DHAVE_SELINUX
+NATIVE_DEFINES_$(CONFIG_arm) += -DTCC_TARGET_ARM
+NATIVE_DEFINES_$(CONFIG_arm_eabihf) += -DTCC_ARM_EABI -DTCC_ARM_HARDFLOAT
+NATIVE_DEFINES_$(CONFIG_arm_eabi) += -DTCC_ARM_EABI
+NATIVE_DEFINES_$(CONFIG_arm_vfp) += -DTCC_ARM_VFP
+NATIVE_DEFINES_$(CONFIG_arm64) += -DTCC_TARGET_ARM64
+NATIVE_DEFINES += $(NATIVE_DEFINES_yes)
+
+ifeq ($(INCLUDED),no)
+# --------------------------------------------------------------------------
+# running top Makefile
+
+PROGS = tcc$(EXESUF)
+TCCLIBS = $(LIBTCC1) $(LIBTCC) $(LIBTCCDEF)
+TCCDOCS = tcc.1 tcc-doc.html tcc-doc.info
+
+all: $(PROGS) $(TCCLIBS) $(TCCDOCS)
+
+# cross compiler targets to build
+TCC_X = i386 x86_64 i386-win32 x86_64-win32 x86_64-osx arm arm64 arm-wince c67
+# TCC_X += arm-fpa arm-fpa-ld arm-vfp arm-eabi
+
+# cross libtcc1.a targets to build
+LIBTCC1_X = i386 x86_64 i386-win32 x86_64-win32 x86_64-osx arm arm64 arm-wince
+
+PROGS_CROSS = $(foreach X,$(TCC_X),$X-tcc$(EXESUF))
+LIBTCC1_CROSS = $(foreach X,$(LIBTCC1_X),$X-libtcc1.a)
+
+# build cross compilers & libs
+cross: $(LIBTCC1_CROSS) $(PROGS_CROSS)
+
+# build specific cross compiler & lib
+cross-%: %-tcc$(EXESUF) %-libtcc1.a ;
+
+install: ; @$(MAKE) --no-print-directory install$(CFGWIN)
+install-strip: ; @$(MAKE) --no-print-directory install$(CFGWIN) CONFIG_strip=yes
+uninstall: ; @$(MAKE) --no-print-directory uninstall$(CFGWIN)
+
+ifdef CONFIG_cross
+all : cross
+endif
+
+# --------------------------------------------
+
+T = $(or $(CROSS_TARGET),$(NATIVE_TARGET),unknown)
+X = $(if $(CROSS_TARGET),$(CROSS_TARGET)-)
+
+DEF-i386        = -DTCC_TARGET_I386
+DEF-x86_64      = -DTCC_TARGET_X86_64
+DEF-i386-win32  = -DTCC_TARGET_PE -DTCC_TARGET_I386
+DEF-x86_64-win32= -DTCC_TARGET_PE -DTCC_TARGET_X86_64
+DEF-x86_64-osx  = -DTCC_TARGET_MACHO -DTCC_TARGET_X86_64
+DEF-arm-wince   = -DTCC_TARGET_PE -DTCC_TARGET_ARM -DTCC_ARM_EABI -DTCC_ARM_VFP -DTCC_ARM_HARDFLOAT
+DEF-arm64       = -DTCC_TARGET_ARM64
+DEF-c67         = -DTCC_TARGET_C67 -w # disable warnings
+DEF-arm-fpa     = -DTCC_TARGET_ARM
+DEF-arm-fpa-ld  = -DTCC_TARGET_ARM -DLDOUBLE_SIZE=12
+DEF-arm-vfp     = -DTCC_TARGET_ARM -DTCC_ARM_VFP
+DEF-arm-eabi    = -DTCC_TARGET_ARM -DTCC_ARM_VFP -DTCC_ARM_EABI
+DEF-arm-eabihf  = -DTCC_TARGET_ARM -DTCC_ARM_VFP -DTCC_ARM_EABI -DTCC_ARM_HARDFLOAT
+DEF-arm         = $(DEF-arm-eabihf)
+DEF-$(NATIVE_TARGET) = $(NATIVE_DEFINES)
+
+DEFINES += $(DEF-$T) $(DEF-all)
+DEFINES += $(if $(ROOT-$T),-DCONFIG_SYSROOT="\"$(ROOT-$T)\"")
+DEFINES += $(if $(CRT-$T),-DCONFIG_TCC_CRTPREFIX="\"$(CRT-$T)\"")
+DEFINES += $(if $(LIB-$T),-DCONFIG_TCC_LIBPATHS="\"$(LIB-$T)\"")
+DEFINES += $(if $(INC-$T),-DCONFIG_TCC_SYSINCLUDEPATHS="\"$(INC-$T)\"")
+DEFINES += $(DEF-$(or $(findstring win,$T),unx))
+
+ifneq ($(X),)
+ifeq ($(CONFIG_WIN32),yes)
+DEF-win += -DTCC_LIBTCC1="\"$(X)libtcc1.a\""
+DEF-unx += -DTCC_LIBTCC1="\"lib/$(X)libtcc1.a\""
+else
+DEF-all += -DTCC_LIBTCC1="\"$(X)libtcc1.a\""
+DEF-win += -DCONFIG_TCCDIR="\"$(tccdir)/win32\""
 endif
 endif

-ifdef CONFIG_CROSS
-PROGS+=$(PROGS_CROSS)
+# include custom configuration (see make help)
+-include config-extra.mak
+
+CORE_FILES = tcc.c tcctools.c libtcc.c tccpp.c tccgen.c tccelf.c tccasm.c tccrun.c
+CORE_FILES += tcc.h config.h libtcc.h tcctok.h
+i386_FILES = $(CORE_FILES) i386-gen.c i386-link.c i386-asm.c i386-asm.h i386-tok.h
+i386-win32_FILES = $(i386_FILES) tccpe.c
+x86_64_FILES = $(CORE_FILES) x86_64-gen.c x86_64-link.c i386-asm.c x86_64-asm.h
+x86_64-win32_FILES = $(x86_64_FILES) tccpe.c
+x86_64-osx_FILES = $(x86_64_FILES)
+arm_FILES = $(CORE_FILES) arm-gen.c arm-link.c arm-asm.c
+arm-wince_FILES = $(arm_FILES) tccpe.c
+arm64_FILES = $(CORE_FILES) arm64-gen.c arm64-link.c
+c67_FILES = $(CORE_FILES) c67-gen.c c67-link.c tcccoff.c
+
+# libtcc sources
+LIBTCC_SRC = $(filter-out tcc.c tcctools.c,$(filter %.c,$($T_FILES)))
+
+ifeq ($(ONE_SOURCE),yes)
+LIBTCC_OBJ = $(X)libtcc.o
+LIBTCC_INC = $($T_FILES)
+TCC_FILES = $(X)tcc.o
+tcc.o : DEFINES += -DONE_SOURCE=0
+else
+LIBTCC_OBJ = $(patsubst %.c,$(X)%.o,$(LIBTCC_SRC))
+LIBTCC_INC = $(filter %.h %-gen.c %-link.c,$($T_FILES))
+TCC_FILES = $(X)tcc.o $(LIBTCC_OBJ)
+$(TCC_FILES) : DEFINES += -DONE_SOURCE=0
 endif

-all: $(PROGS) $(LIBTCC1) $(BCHECK_O) libtcc.a tcc-doc.html tcc.1 libtcc_test$(EXESUF)
+# target specific object rule
+$(X)%.o : %.c $(LIBTCC_INC)
+	$(CC) -o $@ -c $< $(DEFINES) $(CFLAGS)
+
+# additional dependencies
+$(X)tcc.o : tcctools.c

 # Host Tiny C Compiler
-tcc$(EXESUF): $(NATIVE_FILES)
-	$(CC) -o $@ $< $(NATIVE_TARGET) $(CFLAGS) $(LIBS)
+tcc$(EXESUF): tcc.o $(LIBTCC)
+	$(CC) -o $@ $^ $(LIBS) $(LDFLAGS) $(LINK_LIBTCC)

 # Cross Tiny C Compilers
-i386-tcc$(EXESUF): $(I386_FILES)
-	$(CC) -o $@ $< -DTCC_TARGET_I386 $(CFLAGS) $(LIBS)
+%-tcc$(EXESUF): FORCE
+	@$(MAKE) --no-print-directory $@ CROSS_TARGET=$* ONE_SOURCE=$(or $(ONE_SOURCE),yes)

-i386-win32-tcc$(EXESUF): $(WIN32_FILES)
-	$(CC) -o $@ $< -DTCC_TARGET_PE $(CFLAGS) $(LIBS)
-
-x86_64-win32-tcc$(EXESUF): $(WIN32_FILES)
-	$(CC) -o $@ $< -DTCC_TARGET_PE -DTCC_TARGET_X86_64 $(CFLAGS) $(LIBS)
-
-x86_64-tcc$(EXESUF): $(X86_64_FILES)
-	$(CC) -o $@ $< -DTCC_TARGET_X86_64 $(CFLAGS) $(LIBS)
-
-c67-tcc$(EXESUF): $(C67_FILES)
-	$(CC) -o $@ $< -DTCC_TARGET_C67 $(CFLAGS) $(LIBS)
-
-arm-win32-tcc$(EXESUF): $(WIN32_FILES)
-	$(CC) -o $@ $< -DTCC_TARGET_PE -DTCC_TARGET_ARM $(CFLAGS) $(LIBS)
-
-arm-tcc-fpa$(EXESUF): $(ARM_FILES)
-	$(CC) -o $@ $< -DTCC_TARGET_ARM $(CFLAGS) $(LIBS)
-
-arm-tcc-fpa-ld$(EXESUF): $(ARM_FILES)
-	$(CC) -o $@ $< -DTCC_TARGET_ARM -DLDOUBLE_SIZE=12 $(CFLAGS) $(LIBS)
-
-arm-tcc-vfp$(EXESUF): $(ARM_FILES)
-	$(CC) -o $@ $< -DTCC_TARGET_ARM -DTCC_ARM_VFP $(CFLAGS) $(LIBS)
-
-arm-tcc-vfp-eabi$(EXESUF): $(ARM_FILES)
-	$(CC) -o $@ $< -DTCC_TARGET_ARM -DTCC_ARM_EABI $(CFLAGS) $(LIBS)
-
-# libtcc generation and test
-libtcc.o: $(NATIVE_FILES)
-	$(CC) -o $@ -c libtcc.c $(NATIVE_TARGET) $(CFLAGS)
-
-libtcc.a: libtcc.o
-	$(AR) rcs $@ $^
-
-libtcc_test$(EXESUF): tests/libtcc_test.c libtcc.a
-	$(CC) -o $@ $^ -I. $(CFLAGS) $(LIBS)
-
-libtest: libtcc_test$(EXESUF) $(LIBTCC1)
-	./libtcc_test$(EXESUF) lib_path=.
+$(CROSS_TARGET)-tcc$(EXESUF): $(TCC_FILES)
+	$(CC) -o $@ $^ $(LIBS) $(LDFLAGS)

 # profiling version
-tcc_p$(EXESUF): $(NATIVE_FILES)
-	$(CC) -o $@ $< $(NATIVE_TARGET) $(CFLAGS_P) $(LIBS_P)
+tcc_p$(EXESUF): $($T_FILES)
+	$(CC) -o $@ $< $(DEFINES) $(CFLAGS_P) $(LIBS_P) $(LDFLAGS_P)

-# windows utilities
-tiny_impdef$(EXESUF): win32/tools/tiny_impdef.c
-	$(CC) -o $@ $< $(CFLAGS)
-tiny_libmaker$(EXESUF): win32/tools/tiny_libmaker.c
-	$(CC) -o $@ $< $(CFLAGS)
-
-# TinyCC runtime libraries
-LIBTCC1_OBJS=libtcc1.o $(ALLOCA_O)
-LIBTCC1_CC=$(CC)
-VPATH+=lib
-
-ifdef CONFIG_WIN32
-# for windows, we must use TCC because we generate ELF objects
-LIBTCC1_OBJS+=crt1.o wincrt1.o dllcrt1.o dllmain.o chkstk.o
-LIBTCC1_CC=./tcc.exe -Bwin32 -Iinclude $(NATIVE_TARGET)
-VPATH+=win32/lib
-endif
-
-%.o: %.c
-	$(LIBTCC1_CC) -o $@ -c $< -O2 -Wall
-%.o: %.S
-	$(LIBTCC1_CC) -o $@ -c $<
-
-libtcc1.a: $(LIBTCC1_OBJS)
+# static libtcc library
+libtcc.a: $(LIBTCC_OBJ)
 	$(AR) rcs $@ $^

-bcheck.o: bcheck.c
-	$(CC) -o $@ -c $< -O2 -Wall
+# dynamic libtcc library
+libtcc.so: $(LIBTCC_OBJ)
+	$(CC) -shared -Wl,-soname,$@ -o $@ $^ $(LDFLAGS)

-# install
-TCC_INCLUDES = stdarg.h stddef.h stdbool.h float.h varargs.h tcclib.h
-INSTALL=install
+libtcc.so: CFLAGS+=-fPIC
+libtcc.so: LDFLAGS+=-fPIC

-ifndef CONFIG_WIN32
-install: $(PROGS) $(LIBTCC1) $(BCHECK_O) libtcc.a tcc.1 tcc-doc.html
-	mkdir -p "$(bindir)"
-	$(INSTALL) -s -m755 $(PROGS) "$(bindir)"
-	mkdir -p "$(mandir)/man1"
-	$(INSTALL) tcc.1 "$(mandir)/man1"
-	mkdir -p "$(tccdir)"
-	mkdir -p "$(tccdir)/include"
-ifneq ($(LIBTCC1),)
-	$(INSTALL) -m644 $(LIBTCC1) "$(tccdir)"
-endif
-ifneq ($(BCHECK_O),)
-	$(INSTALL) -m644 $(BCHECK_O) "$(tccdir)"
-endif
-	$(INSTALL) -m644 $(addprefix include/,$(TCC_INCLUDES)) "$(tccdir)/include"
-	mkdir -p "$(docdir)"
-	$(INSTALL) -m644 tcc-doc.html "$(docdir)"
-	mkdir -p "$(libdir)"
-	$(INSTALL) -m644 libtcc.a "$(libdir)"
-	mkdir -p "$(includedir)"
-	$(INSTALL) -m644 libtcc.h "$(includedir)"
+# windows dynamic libtcc library
+libtcc.dll : $(LIBTCC_OBJ)
+	$(CC) -shared -o $@ $^ $(LDFLAGS)
+libtcc.dll : DEFINES += -DLIBTCC_AS_DLL

-uninstall:
-	rm -fv $(foreach P,$(PROGS),"$(bindir)/$P")
-	rm -fv $(foreach P,$(LIBTCC1) $(BCHECK_O),"$(tccdir)/$P")
-	rm -fv $(foreach P,$(TCC_INCLUDES),"$(tccdir)/include/$P")
-	rm -fv "$(docdir)/tcc-doc.html" "$(mandir)/man1/tcc.1"
-	rm -fv "$(libdir)/libtcc.a" "$(includedir)/libtcc.h"
+# import file for windows libtcc.dll
+libtcc.def : libtcc.dll tcc$(EXESUF)
+	$(XTCC) -impdef $< -o $@
+XTCC ?= ./tcc$(EXESUF)

-else
-install: $(PROGS) $(LIBTCC1) libtcc.a tcc-doc.html
-	mkdir -p "$(tccdir)"
-	mkdir -p "$(tccdir)/lib"
-	mkdir -p "$(tccdir)/include"
-	mkdir -p "$(tccdir)/examples"
-	mkdir -p "$(tccdir)/doc"
-	mkdir -p "$(tccdir)/libtcc"
-	$(INSTALL) -s -m755 $(PROGS) "$(tccdir)"
-	$(INSTALL) -m644 $(LIBTCC1) win32/lib/*.def "$(tccdir)/lib"
-	cp -r win32/include/. "$(tccdir)/include"
-	cp -r win32/examples/. "$(tccdir)/examples"
-	$(INSTALL) -m644 $(addprefix include/,$(TCC_INCLUDES)) "$(tccdir)/include"
-	$(INSTALL) -m644 tcc-doc.html win32/tcc-win32.txt "$(tccdir)/doc"
-	$(INSTALL) -m644 libtcc.a libtcc.h "$(tccdir)/libtcc"
-endif
+# TinyCC runtime libraries
+libtcc1.a : tcc$(EXESUF) FORCE
+	@$(MAKE) -C lib DEFINES='$(DEF-$T)'

+# Cross libtcc1.a
+%-libtcc1.a : %-tcc$(EXESUF) FORCE
+	@$(MAKE) -C lib DEFINES='$(DEF-$*)' CROSS_TARGET=$*
+
+.PRECIOUS: %-libtcc1.a
+FORCE:
+
+# --------------------------------------------------------------------------
 # documentation and man page
 tcc-doc.html: tcc-doc.texi
-	-texi2html -monolithic -number $<
+	makeinfo --no-split --html --number-sections -o $@ $< || true

 tcc.1: tcc-doc.texi
-	-./texi2pod.pl $< tcc.pod
-	-pod2man --section=1 --center=" " --release=" " tcc.pod > $@
+	$(TOPSRC)/texi2pod.pl $< tcc.pod \
+	&& pod2man --section=1 --center="Tiny C Compiler" --release="$(VERSION)" tcc.pod >tmp.1 \
+	&& mv tmp.1 $@ || rm -f tmp.1

-# tar release (use 'make -k tar' on a checkouted tree)
-TCC-VERSION=tcc-$(shell cat VERSION)
-tar:
-	rm -rf /tmp/$(TCC-VERSION)
-	cp -r . /tmp/$(TCC-VERSION)
-	( cd /tmp ; tar zcvf ~/$(TCC-VERSION).tar.gz $(TCC-VERSION) --exclude CVS )
-	rm -rf /tmp/$(TCC-VERSION)
+tcc-doc.info: tcc-doc.texi
+	makeinfo $< || true

-# in tests subdir
-test clean:
-	$(MAKE) -C tests $@
+# --------------------------------------------------------------------------
+# install
+
+INSTALL = install -m644
+INSTALLBIN = install -m755 $(STRIP_$(CONFIG_strip))
+STRIP_yes = -s
+
+LIBTCC1_W = $(filter %-win32-libtcc1.a %-wince-libtcc1.a,$(LIBTCC1_CROSS))
+LIBTCC1_U = $(filter-out $(LIBTCC1_W),$(LIBTCC1_CROSS))
+IB = $(if $1,mkdir -p $2 && $(INSTALLBIN) $1 $2)
+IBw = $(call IB,$(wildcard $1),$2)
+IF = $(if $1,mkdir -p $2 && $(INSTALL) $1 $2)
+IFw = $(call IF,$(wildcard $1),$2)
+IR = mkdir -p $2 && cp -r $1/. $2
+
+# install progs & libs
+install-unx:
+	$(call IBw,$(PROGS) $(PROGS_CROSS),"$(bindir)")
+	$(call IFw,$(LIBTCC1) $(LIBTCC1_U),"$(tccdir)")
+	$(call IF,$(TOPSRC)/include/*.h $(TOPSRC)/tcclib.h,"$(tccdir)/include")
+	$(call $(if $(findstring .so,$(LIBTCC)),IBw,IFw),$(LIBTCC),"$(libdir)")
+	$(call IF,$(TOPSRC)/libtcc.h,"$(includedir)")
+	$(call IFw,tcc.1,"$(mandir)/man1")
+	$(call IFw,tcc-doc.info,"$(infodir)")
+	$(call IFw,tcc-doc.html,"$(docdir)")
+ifneq "$(wildcard $(LIBTCC1_W))" ""
+	$(call IFw,$(TOPSRC)/win32/lib/*.def $(LIBTCC1_W),"$(tccdir)/win32/lib")
+	$(call IR,$(TOPSRC)/win32/include,"$(tccdir)/win32/include")
+	$(call IF,$(TOPSRC)/include/*.h $(TOPSRC)/tcclib.h,"$(tccdir)/win32/include")
+endif
+
+# uninstall
+uninstall-unx:
+	@rm -fv $(foreach P,$(PROGS) $(PROGS_CROSS),"$(bindir)/$P")
+	@rm -fv "$(libdir)/libtcc.a" "$(libdir)/libtcc.so" "$(includedir)/libtcc.h"
+	@rm -fv "$(mandir)/man1/tcc.1" "$(infodir)/tcc-doc.info"
+	@rm -fv "$(docdir)/tcc-doc.html"
+	rm -r "$(tccdir)"
+
+# install progs & libs on windows
+install-win:
+	$(call IBw,$(PROGS) $(PROGS_CROSS) $(subst libtcc.a,,$(LIBTCC)),"$(bindir)")
+	$(call IF,$(TOPSRC)/win32/lib/*.def,"$(tccdir)/lib")
+	$(call IFw,libtcc1.a $(LIBTCC1_W),"$(tccdir)/lib")
+	$(call IF,$(TOPSRC)/include/*.h $(TOPSRC)/tcclib.h,"$(tccdir)/include")
+	$(call IR,$(TOPSRC)/win32/include,"$(tccdir)/include")
+	$(call IR,$(TOPSRC)/win32/examples,"$(tccdir)/examples")
+	$(call IF,$(TOPSRC)/tests/libtcc_test.c,"$(tccdir)/examples")
+	$(call IFw,$(TOPSRC)/libtcc.h $(subst .dll,.def,$(LIBTCC)),"$(libdir)")
+	$(call IFw,$(TOPSRC)/win32/tcc-win32.txt tcc-doc.html,"$(docdir)")
+ifneq "$(wildcard $(LIBTCC1_U))" ""
+	$(call IFw,$(LIBTCC1_U),"$(tccdir)/lib")
+	$(call IF,$(TOPSRC)/include/*.h $(TOPSRC)/tcclib.h,"$(tccdir)/lib/include")
+endif
+
+# the msys-git shell works to configure && make except it does not have install
+ifeq "$(and $(CONFIG_WIN32),$(shell which install >/dev/null 2>&1 || echo no))" "no"
+install-win : INSTALL = cp
+install-win : INSTALLBIN = cp
+endif
+
+# uninstall on windows
+uninstall-win:
+	@rm -fv $(foreach P,$(PROGS) $(PROGS_CROSS) libtcc.dll,"$(bindir)/$P")
+	@rm -fv $(foreach F,tcc-doc.html tcc-win32.txt,"$(docdir)/$F")
+	@rm -fv $(foreach F,libtcc.h libtcc.def libtcc.a,"$(libdir)/$F")
+	rm -r "$(tccdir)"
+
+# --------------------------------------------------------------------------
+# other stuff
+
+TAGFILES = *.[ch] include/*.h lib/*.[chS]
+tags : ; ctags $(TAGFILES)
+# cannot have both tags and TAGS on windows
+ETAGS : ; etags $(TAGFILES)
+
+# create release tarball from *current* git branch (including tcc-doc.html
+# and converting two files to CRLF)
+TCC-VERSION = tcc-$(VERSION)
+tar:    tcc-doc.html
+	mkdir $(TCC-VERSION)
+	( cd $(TCC-VERSION) && git --git-dir ../.git checkout -f )
+	cp tcc-doc.html $(TCC-VERSION)
+	for f in tcc-win32.txt build-tcc.bat ; do \
+	    cat win32/$$f | sed 's,\(.*\),\1\r,g' > $(TCC-VERSION)/win32/$$f ; \
+	done
+	tar cjf $(TCC-VERSION).tar.bz2 $(TCC-VERSION)
+	rm -rf $(TCC-VERSION)
+	git reset

 config.mak:
-	@echo Running configure ...
-	@./configure
+	$(if $(wildcard $@),,@echo "Please run ./configure." && exit 1)

-# clean
-clean: local_clean
-local_clean:
-	rm -vf $(PROGS) tcc_p$(EXESUF) tcc.pod *~ *.o *.a *.out libtcc_test$(EXESUF)
+# run all tests
+test:
+	$(MAKE) -C tests
+# run test(s) from tests2 subdir (see make help)
+tests2.%:
+	$(MAKE) -C tests/tests2 $@
+
+clean:
+	rm -f tcc$(EXESUF) tcc_p$(EXESUF) *-tcc$(EXESUF) tcc.pod
+	rm -f  *~ *.o *.a *.so* *.out *.log lib*.def *.exe *.dll a.out tags TAGS
+	@$(MAKE) -C lib $@
+	@$(MAKE) -C tests $@

 distclean: clean
-	rm -vf config.h config.mak config.texi tcc.1
+	rm -f config.h config.mak config.texi tcc.1 tcc-doc.info tcc-doc.html

-endif # ifeq ($(TOP),.)
+.PHONY: all clean test tar tags ETAGS distclean install uninstall FORCE
+
+help:
+	@echo "make"
+	@echo "   build native compiler (from separate objects)"
+	@echo ""
+	@echo "make cross"
+	@echo "   build cross compilers (from one source)"
+	@echo ""
+	@echo "make ONE_SOURCE=yes / no"
+	@echo "   force building from one source / separate objects"
+	@echo ""
+	@echo "make cross-TARGET"
+	@echo "   build one specific cross compiler for 'TARGET', as in"
+	@echo "   $(TCC_X)"
+	@echo ""
+	@echo "Custom configuration:"
+	@echo "   The makefile includes a file 'config-extra.mak' if it is present."
+	@echo "   This file may contain some custom configuration.  For example:"
+	@echo ""
+	@echo "      NATIVE_DEFINES += -D..."
+	@echo ""
+	@echo "   Or for example to configure the search paths for a cross-compiler"
+	@echo "   that expects the linux files in <tccdir>/i386-linux:"
+	@echo ""
+	@echo "      ROOT-i386 = {B}/i386-linux"
+	@echo "      CRT-i386  = {B}/i386-linux/usr/lib"
+	@echo "      LIB-i386  = {B}/i386-linux/lib:{B}/i386-linux/usr/lib"
+	@echo "      INC-i386  = {B}/lib/include:{B}/i386-linux/usr/include"
+	@echo "      DEF-i386  += -D__linux__"
+	@echo ""
+	@echo "make test"
+	@echo "   run all tests"
+	@echo ""
+	@echo "make tests2.all / make tests2.37 / make tests2.37+"
+	@echo "   run all/single test(s) from tests2, optionally update .expect"
+	@echo ""
+	@echo "Other supported make targets:"
+	@echo "   install install-strip tags ETAGS tar clean distclean help"
+	@echo ""
+
+# --------------------------------------------------------------------------
+endif # ($(INCLUDED),no)
--- a/13
+++ b/13
@ -28,15 +28,19 @@ Features:
 Documentation:
 -------------

-1) Installation on a i386 Linux host (for Windows read tcc-win32.txt)
+1) Installation on an i386/x86_64/arm Linux/OSX/FreeBSD host

   ./configure
   make
   make test
   make install

-By default, tcc is installed in /usr/local/bin.
-./configure --help  shows configuration options.
+   Notes: For OSX and FreeBSD, gmake should be used instead of make.
+   For Windows read tcc-win32.txt.
+
+makeinfo must be installed to compile the doc.  By default, tcc is
+installed in /usr/local/bin.  ./configure --help  shows configuration
+options.


 2) Introduction
@ -65,7 +69,8 @@ operations given a list of numbers (benchmark).
 ex3.c: compute fibonacci numbers (benchmark).

 ex4.c: more complicated: X11 program. Very complicated test in fact
-because standard headers are being used !
+because standard headers are being used ! As for ex1.c, can also be launched
+directly as a script: './ex4.c'.

 ex5.c: 'hello world' with standard glibc headers.

--- a/60
+++ b/60
@ -0,0 +1,60 @@
+
+ Relicensing TinyCC
+ ------------------
+
+ The authors listed below hereby confirm their agreement to relicense TinyCC
+ including their past contributions under the following terms:
+
+
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+
+
+ Author (name)              I agree (YES/NO)    Files/Features (optional)
+ ------------------------------------------------------------------------------
+ Adam Sampson               YES                 makefiles
+ Daniel Glöckner            NO                  arm-gen.c
+ Daniel Glöckner            YES                 not arm-gen.c
+ Edmund Grimley Evans       YES                 arm64
+ Fabrice Bellard            YES                 original author
+ Frédéric Féret             YES                 x86 64/16 bit asm
+ grischka                   YES                 tccpe.c
+ Henry Kroll                YES
+ Joe Soroka                 YES
+ Kirill Smelkov             YES
+ mingodad                   YES
+ Pip Cet                    YES
+ Shinichiro Hamaji          YES                 x86_64-gen.c
+ Vincent Lefèvre            YES
+ Thomas Preud'homme         YES                 arm-gen.c
+ Timo VJ Lähde (Timppa)     ?                   tiny_libmaker.c
+ TK                         ?                   tcccoff.c c67-gen.c
+ Urs Janssen                YES
+ waddlesplash               YES
+ Christian Jullien          YES                 Windows Cygwin build and tests
+
+
+ ------------------------------------------------------------------------------
+
+ Please add yourself to the list above (or replace the question mark)
+ and (after fetching the latest version) commit to the "mob" branch with
+ commit message:
+
+     Relicensing TinyCC
+
+ Thanks.
--- a/45
+++ b/45
@ -2,17 +2,9 @@ TODO list:

 Bugs:

- fix macro substitution with nested definitions (ShangHongzhang)
+- i386 fastcall is mostly wrong
 - FPU st(0) is left unclean (kwisatz haderach). Incompatible with
  optimized gcc/msc code
-
- constructors
- cast bug (Peter Wang)
- define incomplete type if defined several times (Peter Wang).
- configure --cc=tcc (still one bug in libtcc1.c)
- test binutils/gcc compile
- tci patch + argument.
- see -lxxx bug (Michael Charity).
 - see transparent union problem in /usr/include/sys/socket.h
 - precise behaviour of typeof with arrays ? (__put_user macro)
  but should suffice for most cases)
@ -21,19 +13,30 @@ Bugs:
 - transform functions to function pointers in function parameters
  (net/ipv4/ip_output.c)
 - fix function pointer type display
- check lcc test suite -> fix bitfield binary operations
 - check section alignment in C
 - fix invalid cast in comparison 'if (v == (int8_t)v)'
 - finish varargs.h support (gcc 3.2 testsuite issue)
 - fix static functions declared inside block
 - fix multiple unions init
- sizeof, alignof, typeof can still generate code in some cases.
- Fix the remaining libtcc memory leaks.
 - make libtcc fully reentrant (except for the compilation stage itself).
+- struct/union/enum definitions in nested scopes (see also Debian bug #770657)
+- __STDC_IEC_559__: float f(void) { static float x = 0.0 / 0.0; return x; }
+- memory may be leaked after errors (longjmp).
+
+Portability:
+
+- it is assumed that int is 32-bit and sizeof(int) == 4
+- int is used when host or target size_t would make more sense
+- TCC handles target floating-point (fp) values using the host's fp
+  arithmetic, which is simple and fast but may lead to exceptions
+  and inaccuracy and wrong representations when cross-compiling
+
+Linking:
+
+- static linking (-static) does not work

 Bound checking:

- '-b' bug.
 - fix bound exit on RedHat 7.3
 - setjmp is not supported properly in bound checking.
 - fix bound check code with '&' on local variables (currently done
@ -45,13 +48,10 @@ Missing features:

 - disable-asm and disable-bcheck options
 - __builtin_expect()
- improve '-E' option.
- add '-MD' option
 - atexit (Nigel Horne)
- packed attribute
- C99: add variable size arrays (gcc 3.2 testsuite issue)
 - C99: add complex types (gcc 3.2 testsuite issue)
 - postfix compound literals (see 20010124-1.c)
+- interactive mode / integrated debugger

 Optimizations:

@ -67,17 +67,17 @@ Not critical:
  normative example - only relevant when using gotos! -> must add
  boolean variable to tell if compound literal was already
  initialized).
- add PowerPC or ARM code generator and improve codegen for RISC (need
+- add PowerPC generator and improve codegen for RISC (need
  to suppress VT_LOCAL and use a base register instead).
- interactive mode / integrated debugger
 - fix preprocessor symbol redefinition
- better constant opt (&&, ||, ?:)
 - add portable byte code generator and interpreter for other
  unsupported architectures.
 - C++: variable declaration in for, minimal 'class' support.
 - win32: __intxx. use resolve for bchecked malloc et al.
  check exception code (exception filter func).
 - handle void (__attribute__() *ptr)()
+- VLAs are implemented in a way that is not compatible with signals:
+  http://lists.gnu.org/archive/html/tinycc-devel/2015-11/msg00018.html

 Fixed (probably):

@ -93,3 +93,8 @@ Fixed (probably):
 - #include_next support for /usr/include/limits ?
 - function pointers/lvalues in ? : (linux kernel net/core/dev.c)
 - win32: add __stdcall, check GetModuleHandle for dlls.
+- macro substitution with nested definitions (ShangHongzhang)
+- with "-run" and libtcc, a PLT is now built.
+- '-E' option was improved
+- packed attribute is now supported
+- ARM and ARM64 code generators have been added.
--- a/2
+++ b/2
@ -1 +1 @@
-0.9.25
+0.9.27
--- a/arm-asm.c
+++ b/arm-asm.c
@ -0,0 +1,94 @@
+/*************************************************************/
+/*
+ *  ARM dummy assembler for TCC
+ *
+ */
+
+#ifdef TARGET_DEFS_ONLY
+
+#define CONFIG_TCC_ASM
+#define NB_ASM_REGS 16
+
+ST_FUNC void g(int c);
+ST_FUNC void gen_le16(int c);
+ST_FUNC void gen_le32(int c);
+
+/*************************************************************/
+#else
+/*************************************************************/
+
+#include "tcc.h"
+
+/* Report through tcc_error() that ARM assembly is not implemented.
+   Shared by the assembler stubs in this file. */
+static void asm_error(void)
+{
+    tcc_error("ARM asm not implemented.");
+}
+
+/* Store c at offset `ind` of the current text section and advance `ind`
+   by one, growing the section first when it is too small.  Nothing is
+   emitted while nocode_wanted is set.
+   XXX: make it faster ? */
+ST_FUNC void g(int c)
+{
+    int next;
+
+    if (!nocode_wanted) {
+        next = ind + 1;
+        if (next > cur_text_section->data_allocated)
+            section_realloc(cur_text_section, next);
+        cur_text_section->data[ind] = c;
+        ind = next;
+    }
+}
+
+/* Emit a 16-bit value through g(): first i, then i >> 8
+   (low byte first, assuming g() keeps only the low byte -- TODO confirm). */
+ST_FUNC void gen_le16 (int i)
+{
+    int shift;
+
+    for (shift = 0; shift <= 8; shift += 8)
+        g(i >> shift);
+}
+
+/* Emit a 32-bit value as two gen_le16() halves: i itself first,
+   then i >> 16. */
+ST_FUNC void gen_le32 (int i)
+{
+    int upper = i >> 16;
+
+    gen_le16(i);
+    gen_le16(upper);
+}
+
+/* Emit the value field (pe->v) of an assembler expression via gen_le32().
+   Only the value is used; any other member of *pe is ignored here. */
+ST_FUNC void gen_expr32(ExprValue *pe)
+{
+    gen_le32(pe->v);
+}
+
+/* Stub: no ARM instruction can be assembled; both arguments are ignored
+   and the "not implemented" error is reported. */
+ST_FUNC void asm_opcode(TCCState *s1, int opcode)
+{
+    asm_error();
+}
+
+/* Stub: operand substitution is not available on ARM; all arguments are
+   ignored and the "not implemented" error is reported. */
+ST_FUNC void subst_asm_operand(CString *add_str, SValue *sv, int modifier)
+{
+    asm_error();
+}
+
+/* generate prolog and epilog code for asm statement */
+/* On ARM this is an empty stub: nothing is generated and no error is
+   raised; every argument is ignored. */
+ST_FUNC void asm_gen_code(ASMOperand *operands, int nb_operands,
+                         int nb_outputs, int is_output,
+                         uint8_t *clobber_regs,
+                         int out_reg)
+{
+}
+
+/* Empty stub: no constraint is computed, *pout_reg is left untouched and
+   no error is raised. */
+ST_FUNC void asm_compute_constraints(ASMOperand *operands,
+                                    int nb_operands, int nb_outputs,
+                                    const uint8_t *clobber_regs,
+                                    int *pout_reg)
+{
+}
+
+/* Stub: clobber lists are not supported on ARM; clobber_regs is not
+   modified and the "not implemented" error is reported. */
+ST_FUNC void asm_clobber(uint8_t *clobber_regs, const char *str)
+{
+    asm_error();
+}
+
+/* Stub: register variables are not supported on ARM.  Reports the
+   "not implemented" error; -1 is returned if asm_error() comes back. */
+ST_FUNC int asm_parse_regvar (int t)
+{
+    asm_error();
+    return -1;
+}
+
+/*************************************************************/
+#endif /* ndef TARGET_DEFS_ONLY */
--- a/arm-gen.c
+++ b/arm-gen.c
--- a/arm-link.c
+++ b/arm-link.c
@ -0,0 +1,398 @@
+#ifdef TARGET_DEFS_ONLY
+
+#define EM_TCC_TARGET EM_ARM
+
+/* relocation type for 32 bit data relocation */
+#define R_DATA_32   R_ARM_ABS32
+#define R_DATA_PTR  R_ARM_ABS32
+#define R_JMP_SLOT  R_ARM_JUMP_SLOT
+#define R_GLOB_DAT  R_ARM_GLOB_DAT
+#define R_COPY      R_ARM_COPY
+#define R_RELATIVE  R_ARM_RELATIVE
+
+#define R_NUM       R_ARM_NUM
+
+#define ELF_START_ADDR 0x00008000
+#define ELF_PAGE_SIZE  0x1000
+
+#define PCRELATIVE_DLLPLT 1
+#define RELOCATE_DLLPLT 0
+
+/* Floating-point ABI selector for the ARM target; judging by the names,
+   soft-float versus hard-float calling convention (TODO confirm where it
+   is consumed). */
+enum float_abi {
+    ARM_SOFTFP_FLOAT,
+    ARM_HARD_FLOAT,
+};
+
+#else /* !TARGET_DEFS_ONLY */
+
+#include "tcc.h"
+
+/* Classify an ARM relocation type: 0 means data relocation, 1 means code
+   relocation.  An unrecognised type is reported with tcc_error(); -1 is
+   returned should that call come back. */
+int code_reloc (int reloc_type)
+{
+    switch (reloc_type) {
+        /* data relocations */
+        case R_ARM_NONE:
+        case R_ARM_ABS32:
+        case R_ARM_REL32:
+        case R_ARM_GOTPC:
+        case R_ARM_GOTOFF:
+        case R_ARM_GOT32:
+        case R_ARM_COPY:
+        case R_ARM_GLOB_DAT:
+        case R_ARM_MOVT_ABS:
+        case R_ARM_MOVW_ABS_NC:
+        case R_ARM_THM_MOVT_ABS:
+        case R_ARM_THM_MOVW_ABS_NC:
+            return 0;
+
+        /* code relocations */
+        case R_ARM_PC24:
+        case R_ARM_CALL:
+        case R_ARM_JUMP24:
+        case R_ARM_PLT32:
+        case R_ARM_THM_PC22:
+        case R_ARM_THM_JUMP24:
+        case R_ARM_PREL31:
+        case R_ARM_V4BX:
+        case R_ARM_JUMP_SLOT:
+            return 1;
+
+        default:
+            break;
+    }
+
+    tcc_error ("Unknown relocation type: %d", reloc_type);
+    return -1;
+}
+
+/* Tell whether, and when, a relocation of the given type needs a GOT
+   and/or PLT entry.  The returned enumerators are described in tcc.h.
+   An unrecognised type is reported with tcc_error(); -1 is returned
+   should that call come back. */
+int gotplt_entry_type (int reloc_type)
+{
+    switch (reloc_type) {
+        case R_ARM_GOT32:
+            return ALWAYS_GOTPLT_ENTRY;
+
+        case R_ARM_GOTPC:
+        case R_ARM_GOTOFF:
+            return BUILD_GOT_ONLY;
+
+        case R_ARM_NONE:
+        case R_ARM_COPY:
+        case R_ARM_GLOB_DAT:
+        case R_ARM_JUMP_SLOT:
+            return NO_GOTPLT_ENTRY;
+
+        case R_ARM_PC24:
+        case R_ARM_CALL:
+        case R_ARM_JUMP24:
+        case R_ARM_PLT32:
+        case R_ARM_THM_PC22:
+        case R_ARM_THM_JUMP24:
+        case R_ARM_MOVT_ABS:
+        case R_ARM_MOVW_ABS_NC:
+        case R_ARM_THM_MOVT_ABS:
+        case R_ARM_THM_MOVW_ABS_NC:
+        case R_ARM_PREL31:
+        case R_ARM_ABS32:
+        case R_ARM_REL32:
+        case R_ARM_V4BX:
+            return AUTO_GOTPLT_ENTRY;
+
+        default:
+            break;
+    }
+
+    tcc_error ("Unknown relocation type: %d", reloc_type);
+    return -1;
+}
+
+/* Append a PLT entry that jumps through the GOT slot at got_offset and
+   return the offset of that entry inside the PLT section.
+
+   - The very first call also emits the 20-byte PLT0 header.
+   - When attr->plt_thumb_stub is set, a 4-byte Thumb stub is placed in
+     front of the entry; the returned offset then points at the stub.
+   - The offset words (header + 16, entry + 12) are finalised later by
+     relocate_plt().
+
+   Building a DLL is not supported and is reported with tcc_error(). */
+ST_FUNC unsigned create_plt_entry(TCCState *s1, unsigned got_offset, struct sym_attr *attr)
+{
+    Section *plt = s1->plt;
+    uint8_t *p;
+    unsigned plt_offset;
+
+    /* when building a DLL, GOT entry accesses must be done relative to
+       start of GOT; this is not implemented for ARM */
+    if (s1->output_type == TCC_OUTPUT_DLL)
+        tcc_error("DLLs unimplemented!");
+
+    /* empty PLT: create PLT0 entry that push address of call site and
+       jump to ld.so resolution routine (GOT + 8) */
+    if (plt->data_offset == 0) {
+        p = section_ptr_add(plt, 20);
+        write32le(p,    0xe52de004); /* push {lr}         */
+        write32le(p+4,  0xe59fe004); /* ldr lr, [pc, #4] */
+        write32le(p+8,  0xe08fe00e); /* add lr, pc, lr    */
+        write32le(p+12, 0xe5bef008); /* ldr pc, [lr, #8]! */
+        /* p+16 is set in relocate_plt */
+    }
+    plt_offset = plt->data_offset;
+
+    if (attr->plt_thumb_stub) {
+        p = section_ptr_add(plt, 4);
+        /* Two Thumb halfwords, "bx pc" (0x4778) followed by "nop"
+           (0x46c0), stored as ONE little-endian word.  Writing them with
+           two 32-bit stores (at p and p+2) would run 2 bytes past the
+           4 bytes reserved above.  relocate_plt() recognises the stub by
+           this exact word. */
+        write32le(p, 0x46c04778);
+    }
+    p = section_ptr_add(plt, 16);
+    /* Jump to GOT entry where ld.so initially put address of PLT0 */
+    write32le(p,   0xe59fc004); /* ldr ip, [pc, #4] */
+    write32le(p+4, 0xe08fc00c); /* add ip, pc, ip */
+    write32le(p+8, 0xe59cf000); /* ldr pc, [ip] */
+    /* p + 12 contains offset to GOT entry once patched by relocate_plt */
+    write32le(p+12, got_offset);
+    return plt_offset;
+}
+
+/* Patch the PLT once the final addresses of PLT and GOT are known (see
+   fill_program_header): the word at offset 16 of the PLT0 header and the
+   last word of every 16-byte entry receive their GOT-relative offsets.
+   Does nothing when there is no PLT section or it is empty. */
+ST_FUNC void relocate_plt(TCCState *s1)
+{
+    uint8_t *entry, *end;
+    int x;
+
+    if (!s1->plt)
+      return;
+
+    entry = s1->plt->data;
+    end = entry + s1->plt->data_offset;
+    if (entry >= end)
+        return;
+
+    x = s1->got->sh_addr - s1->plt->sh_addr - 12;
+    write32le(s1->plt->data + 16, x - 16);
+
+    /* skip the 20-byte PLT0 header, then walk the entries */
+    for (entry += 20; entry < end; entry += 16) {
+        if (read32le(entry) == 0x46c04778) /* PLT Thumb stub present */
+            entry += 4;
+        add32le(entry + 12, x + s1->plt->data - entry);
+    }
+}
+
+/* No per-section preparation is needed before relocating on ARM. */
+void relocate_init(Section *sr) {}
+
+/* Apply one ARM relocation.
+
+   s1    compilation state (GOT/PLT sections, per-symbol attributes)
+   rel   relocation record; its r_info may be rewritten when a Thumb->ARM
+         stub has to be created (R_ARM_THM_JUMP24)
+   type  relocation type (R_ARM_*)
+   ptr   location in the section data that gets patched
+   addr  address used as the "place" in PC-relative computations
+   val   value of the referenced symbol
+
+   Out-of-range or otherwise impossible relocations are reported with
+   tcc_error(); unknown types only print a FIXME note on stderr. */
+void relocate(TCCState *s1, ElfW_Rel *rel, int type, unsigned char *ptr, addr_t addr, addr_t val)
+{
+    ElfW(Sym) *sym;
+    int sym_index;
+
+    sym_index = ELFW(R_SYM)(rel->r_info);
+    sym = &((ElfW(Sym) *)symtab_section->data)[sym_index];
+
+    switch(type) {
+        case R_ARM_PC24:
+        case R_ARM_CALL:
+        case R_ARM_JUMP24:
+        case R_ARM_PLT32:
+            {
+                int x, is_thumb, is_call, h, blx_avail, is_bl, th_ko;
+                x = (*(int *) ptr) & 0xffffff;
+#ifdef DEBUG_RELOC
+		printf ("reloc %d: x=0x%x val=0x%x ", type, x, val);
+#endif
+                (*(int *)ptr) &= 0xff000000;
+                if (x & 0x800000)
+                    x -= 0x1000000;
+                x <<= 2;
+                blx_avail = (TCC_CPU_VERSION >= 5);
+                is_thumb = val & 1;
+                is_bl = (*(unsigned *) ptr) >> 24 == 0xeb;
+                is_call = (type == R_ARM_CALL || (type == R_ARM_PC24 && is_bl));
+                x += val - addr;
+#ifdef DEBUG_RELOC
+		printf (" newx=0x%x name=%s\n", x,
+			(char *) symtab_section->link->data + sym->st_name);
+#endif
+                h = x & 2;
+                th_ko = (x & 3) && (!blx_avail || !is_call);
+                if (th_ko || x >= 0x2000000 || x < -0x2000000)
+                    tcc_error("can't relocate value at %x,%d",addr, type);
+                x >>= 2;
+                x &= 0xffffff;
+                /* Only reached if blx is avail and it is a call */
+                if (is_thumb) {
+                    x |= h << 24;
+                    (*(int *)ptr) = 0xfa << 24; /* bl -> blx */
+                }
+                (*(int *) ptr) |= x;
+            }
+            return;
+        /* Since these relocations only concern Thumb-2 and blx instruction was
+           introduced before Thumb-2, we can assume blx is available and not
+           guard its use */
+        case R_ARM_THM_PC22:
+        case R_ARM_THM_JUMP24:
+            {
+                int x, hi, lo, s, j1, j2, i1, i2, imm10, imm11;
+                int to_thumb, is_call, to_plt, blx_bit = 1 << 12;
+                Section *plt;
+
+                /* weak reference */
+                if (sym->st_shndx == SHN_UNDEF &&
+                    ELFW(ST_BIND)(sym->st_info) == STB_WEAK)
+                    return;
+
+                /* Get initial offset */
+                hi = (*(uint16_t *)ptr);
+                lo = (*(uint16_t *)(ptr+2));
+                s = (hi >> 10) & 1;
+                j1 = (lo >> 13) & 1;
+                j2 = (lo >> 11) & 1;
+                i1 = (j1 ^ s) ^ 1;
+                i2 = (j2 ^ s) ^ 1;
+                imm10 = hi & 0x3ff;
+                imm11 = lo & 0x7ff;
+                x = (s << 24) | (i1 << 23) | (i2 << 22) |
+                    (imm10 << 12) | (imm11 << 1);
+                if (x & 0x01000000)
+                    x -= 0x02000000;
+
+                /* Relocation infos */
+                to_thumb = val & 1;
+                plt = s1->plt;
+                to_plt = (val >= plt->sh_addr) &&
+                         (val < plt->sh_addr + plt->data_offset);
+                is_call = (type == R_ARM_THM_PC22);
+
+                if (!to_thumb && !to_plt && !is_call) {
+                    int index;
+                    uint8_t *p;
+                    char *name, buf[1024];
+                    Section *text_section;
+
+                    name = (char *) symtab_section->link->data + sym->st_name;
+                    text_section = s1->sections[sym->st_shndx];
+                    /* Modify reloc to target a thumb stub to switch to ARM */
+                    snprintf(buf, sizeof(buf), "%s_from_thumb", name);
+                    index = put_elf_sym(symtab_section,
+                                        text_section->data_offset + 1,
+                                        sym->st_size, sym->st_info, 0,
+                                        sym->st_shndx, buf);
+                    to_thumb = 1;
+                    val = text_section->data_offset + 1;
+                    rel->r_info = ELFW(R_INFO)(index, type);
+                    /* Create a thumb stub function to switch to ARM mode */
+                    put_elf_reloc(symtab_section, text_section,
+                                  text_section->data_offset + 4, R_ARM_JUMP24,
+                                  sym_index);
+                    p = section_ptr_add(text_section, 8);
+                    write32le(p,   0x4778); /* bx pc */
+                    write32le(p+2, 0x46c0); /* nop   */
+                    write32le(p+4, 0xeafffffe); /* b $sym */
+                }
+
+                /* Compute final offset */
+                x += val - addr;
+                if (!to_thumb && is_call) {
+                    blx_bit = 0; /* bl -> blx */
+                    x = (x + 3) & -4; /* Compute offset from aligned PC */
+                }
+
+                /* Check that relocation is possible
+                   * offset must not be out of range
+                   * if target is to be entered in arm mode:
+                     - bit 1 must not set
+                     - instruction must be a call (bl) or a jump to PLT */
+                if (!to_thumb || x >= 0x1000000 || x < -0x1000000)
+                    if (to_thumb || (val & 2) || (!is_call && !to_plt))
+                        tcc_error("can't relocate value at %x,%d",addr, type);
+
+                /* Compute and store final offset */
+                s = (x >> 24) & 1;
+                i1 = (x >> 23) & 1;
+                i2 = (x >> 22) & 1;
+                j1 = s ^ (i1 ^ 1);
+                j2 = s ^ (i2 ^ 1);
+                imm10 = (x >> 12) & 0x3ff;
+                imm11 = (x >> 1) & 0x7ff;
+                (*(uint16_t *)ptr) = (uint16_t) ((hi & 0xf800) |
+                                     (s << 10) | imm10);
+                (*(uint16_t *)(ptr+2)) = (uint16_t) ((lo & 0xc000) |
+                                (j1 << 13) | blx_bit | (j2 << 11) |
+                                imm11);
+            }
+            return;
+        case R_ARM_MOVT_ABS:
+        case R_ARM_MOVW_ABS_NC:
+            {
+                int x, imm4, imm12;
+                if (type == R_ARM_MOVT_ABS)
+                    val >>= 16;
+                imm12 = val & 0xfff;
+                imm4 = (val >> 12) & 0xf;
+                x = (imm4 << 16) | imm12;
+                /* NOTE(review): this test can never be true inside the
+                   ARM (non-Thumb) cases, so the '+=' branch is always
+                   taken -- confirm whether R_ARM_MOVT_ABS was meant. */
+                if (type == R_ARM_THM_MOVT_ABS)
+                    *(int *)ptr |= x;
+                else
+                    *(int *)ptr += x;
+            }
+            return;
+        case R_ARM_THM_MOVT_ABS:
+        case R_ARM_THM_MOVW_ABS_NC:
+            {
+                int x, i, imm4, imm3, imm8;
+                if (type == R_ARM_THM_MOVT_ABS)
+                    val >>= 16;
+                imm8 = val & 0xff;
+                imm3 = (val >> 8) & 0x7;
+                i = (val >> 11) & 1;
+                imm4 = (val >> 12) & 0xf;
+                x = (imm3 << 28) | (imm8 << 16) | (i << 10) | imm4;
+                if (type == R_ARM_THM_MOVT_ABS)
+                    *(int *)ptr |= x;
+                else
+                    *(int *)ptr += x;
+            }
+            return;
+        case R_ARM_PREL31:
+            {
+                int x;
+                x = (*(int *)ptr) & 0x7fffffff;
+                (*(int *)ptr) &= 0x80000000;
+                x = (x * 2) / 2;
+                x += val - addr;
+                if((x^(x>>1))&0x40000000)
+                    tcc_error("can't relocate value at %x,%d",addr, type);
+                (*(int *)ptr) |= x & 0x7fffffff;
+            }
+            /* Must not fall through: the R_ARM_ABS32 case below would add
+               val on top of the PC-relative offset stored just above. */
+            return;
+        case R_ARM_ABS32:
+            *(int *)ptr += val;
+            return;
+        case R_ARM_REL32:
+            *(int *)ptr += val - addr;
+            return;
+        case R_ARM_GOTPC:
+            *(int *)ptr += s1->got->sh_addr - addr;
+            return;
+        case R_ARM_GOTOFF:
+            *(int *)ptr += val - s1->got->sh_addr;
+            return;
+        case R_ARM_GOT32:
+            /* we load the got offset */
+            *(int *)ptr += s1->sym_attrs[sym_index].got_offset;
+            return;
+        case R_ARM_COPY:
+            return;
+        case R_ARM_V4BX:
+            /* trade Thumb support for ARMv4 support */
+            if ((0x0ffffff0 & *(int*)ptr) == 0x012FFF10)
+                *(int*)ptr ^= 0xE12FFF10 ^ 0xE1A0F000; /* BX Rm -> MOV PC, Rm */
+            return;
+        case R_ARM_GLOB_DAT:
+        case R_ARM_JUMP_SLOT:
+            *(addr_t *)ptr = val;
+            return;
+        case R_ARM_NONE:
+            /* Nothing to do.  Normally used to indicate a dependency
+               on a certain symbol (like for exception handling under EABI).  */
+            return;
+        case R_ARM_RELATIVE:
+#ifdef TCC_TARGET_PE
+            add32le(ptr, val - s1->pe_imagebase);
+#endif
+            /* do nothing */
+            return;
+        default:
+            fprintf(stderr,"FIXME: handle reloc type %x at %x [%p] to %x\n",
+                type, (unsigned)addr, ptr, (unsigned)val);
+            return;
+    }
+}
+
+#endif /* !TARGET_DEFS_ONLY */
--- a/arm64-gen.c
+++ b/arm64-gen.c
--- a/arm64-link.c
+++ b/arm64-link.c
@ -0,0 +1,256 @@
+#ifdef TARGET_DEFS_ONLY
+
+#define EM_TCC_TARGET EM_AARCH64
+
+#define R_DATA_32  R_AARCH64_ABS32
+#define R_DATA_PTR R_AARCH64_ABS64
+#define R_JMP_SLOT R_AARCH64_JUMP_SLOT
+#define R_GLOB_DAT R_AARCH64_GLOB_DAT
+#define R_COPY     R_AARCH64_COPY
+#define R_RELATIVE R_AARCH64_RELATIVE
+
+#define R_NUM      R_AARCH64_NUM
+
+#define ELF_START_ADDR 0x00400000
+#define ELF_PAGE_SIZE 0x1000
+
+#define PCRELATIVE_DLLPLT 1
+#define RELOCATE_DLLPLT 1
+
+#else /* !TARGET_DEFS_ONLY */
+
+#include "tcc.h"
+
+/* Classify an AArch64 relocation type: 0 means data relocation, 1 means
+   code relocation.  An unrecognised type is reported with tcc_error();
+   -1 is returned should that call come back. */
+int code_reloc (int reloc_type)
+{
+    switch (reloc_type) {
+        /* code relocations */
+        case R_AARCH64_JUMP26:
+        case R_AARCH64_CALL26:
+        case R_AARCH64_JUMP_SLOT:
+            return 1;
+
+        /* data relocations */
+        case R_AARCH64_ABS32:
+        case R_AARCH64_ABS64:
+        case R_AARCH64_PREL32:
+        case R_AARCH64_MOVW_UABS_G0_NC:
+        case R_AARCH64_MOVW_UABS_G1_NC:
+        case R_AARCH64_MOVW_UABS_G2_NC:
+        case R_AARCH64_MOVW_UABS_G3:
+        case R_AARCH64_ADR_PREL_PG_HI21:
+        case R_AARCH64_ADD_ABS_LO12_NC:
+        case R_AARCH64_ADR_GOT_PAGE:
+        case R_AARCH64_LD64_GOT_LO12_NC:
+        case R_AARCH64_GLOB_DAT:
+        case R_AARCH64_COPY:
+            return 0;
+
+        default:
+            break;
+    }
+
+    tcc_error ("Unknown relocation type: %d", reloc_type);
+    return -1;
+}
+
+/* Tell whether, and when, a relocation of the given type needs a GOT
+   and/or PLT entry.  The returned enumerators are described in tcc.h.
+   An unrecognised type is reported with tcc_error(); -1 is returned
+   should that call come back. */
+int gotplt_entry_type (int reloc_type)
+{
+    switch (reloc_type) {
+        case R_AARCH64_ADR_GOT_PAGE:
+        case R_AARCH64_LD64_GOT_LO12_NC:
+            return ALWAYS_GOTPLT_ENTRY;
+
+        case R_AARCH64_ABS32:
+        case R_AARCH64_ABS64:
+        case R_AARCH64_JUMP26:
+        case R_AARCH64_CALL26:
+            return AUTO_GOTPLT_ENTRY;
+
+        case R_AARCH64_PREL32:
+        case R_AARCH64_MOVW_UABS_G0_NC:
+        case R_AARCH64_MOVW_UABS_G1_NC:
+        case R_AARCH64_MOVW_UABS_G2_NC:
+        case R_AARCH64_MOVW_UABS_G3:
+        case R_AARCH64_ADR_PREL_PG_HI21:
+        case R_AARCH64_ADD_ABS_LO12_NC:
+        case R_AARCH64_GLOB_DAT:
+        case R_AARCH64_JUMP_SLOT:
+        case R_AARCH64_COPY:
+            return NO_GOTPLT_ENTRY;
+
+        default:
+            break;
+    }
+
+    tcc_error ("Unknown relocation type: %d", reloc_type);
+    return -1;
+}
+
+/* Reserve a 16-byte PLT entry for the GOT slot at got_offset and return
+   the entry's offset inside the PLT section.  The first call also
+   reserves the 32-byte PLT header.  Only got_offset is stored here (as a
+   64-bit little-endian value); relocate_plt() later replaces it with the
+   real instructions.  Building a DLL is reported with tcc_error(). */
+ST_FUNC unsigned create_plt_entry(TCCState *s1, unsigned got_offset, struct sym_attr *attr)
+{
+    Section *plt = s1->plt;
+    unsigned entry_offset;
+    uint8_t *entry;
+
+    if (s1->output_type == TCC_OUTPUT_DLL)
+        tcc_error("DLLs unimplemented!");
+
+    /* first entry: make room for the header */
+    if (!plt->data_offset)
+        section_ptr_add(plt, 32);
+
+    entry_offset = plt->data_offset;
+    entry = section_ptr_add(plt, 16);
+    write32le(entry, got_offset);
+    write32le(entry + 4, (uint64_t) got_offset >> 32);
+    return entry_offset;
+}
+
+/* relocate the PLT: compute addresses and offsets in the PLT now that final
+   address for PLT and GOT are known (see fill_program_header) */
+/* Layout handled here: a 32-byte header (8 instruction words) followed by
+   16-byte entries.  Each entry initially holds the 64-bit GOT offset that
+   create_plt_entry() stored; it is read back with read64le() and replaced
+   by four instruction words.  A page distance that does not fit the
+   21-bit adrp immediate is reported with tcc_error(). */
+ST_FUNC void relocate_plt(TCCState *s1)
+{
+    uint8_t *p, *p_end;
+
+    if (!s1->plt)
+      return;
+
+    p = s1->plt->data;
+    p_end = p + s1->plt->data_offset;
+
+    if (p < p_end) {
+        uint64_t plt = s1->plt->sh_addr;
+        uint64_t got = s1->got->sh_addr;
+        /* distance in 4 KiB pages between the PLT and the GOT */
+        uint64_t off = (got >> 12) - (plt >> 12);
+        if ((off + ((uint32_t)1 << 20)) >> 21)
+            tcc_error("Failed relocating PLT (off=0x%lx, got=0x%lx, plt=0x%lx)", off, got, plt);
+        /* PLT header */
+        write32le(p, 0xa9bf7bf0); // stp x16,x30,[sp,#-16]!
+        write32le(p + 4, (0x90000010 | // adrp x16,...
+			  (off & 0x1ffffc) << 3 | (off & 3) << 29));
+        write32le(p + 8, (0xf9400211 | // ldr x17,[x16,#...]
+			  (got & 0xff8) << 7));
+        write32le(p + 12, (0x91000210 | // add x16,x16,#...
+			   (got & 0xfff) << 10));
+        write32le(p + 16, 0xd61f0220); // br x17
+        write32le(p + 20, 0xd503201f); // nop
+        write32le(p + 24, 0xd503201f); // nop
+        write32le(p + 28, 0xd503201f); // nop
+        p += 32;
+        /* one 16-byte entry per iteration */
+        while (p < p_end) {
+            uint64_t pc = plt + (p - s1->plt->data);
+            /* address of the GOT slot: GOT base + offset stored in the entry */
+            uint64_t addr = got + read64le(p);
+            uint64_t off = (addr >> 12) - (pc >> 12);
+            if ((off + ((uint32_t)1 << 20)) >> 21)
+                tcc_error("Failed relocating PLT (off=0x%lx, addr=0x%lx, pc=0x%lx)", off, addr, pc);
+            write32le(p, (0x90000010 | // adrp x16,...
+			  (off & 0x1ffffc) << 3 | (off & 3) << 29));
+            write32le(p + 4, (0xf9400211 | // ldr x17,[x16,#...]
+			      (addr & 0xff8) << 7));
+            write32le(p + 8, (0x91000210 | // add x16,x16,#...
+			      (addr & 0xfff) << 10));
+            write32le(p + 12, 0xd61f0220); // br x17
+            p += 16;
+        }
+    }
+}
+
+/* No per-section preparation is needed before relocating on AArch64. */
+void relocate_init(Section *sr) {}
+
+/* Apply one AArch64 relocation.
+
+   s1    compilation state (GOT section, per-symbol GOT offsets)
+   rel   relocation record (symbol index; r_addend for GLOB_DAT/JUMP_SLOT)
+   type  relocation type (R_AARCH64_*)
+   ptr   location in the section data that gets patched
+   addr  address used as the "place" in PC-relative computations
+   val   value to relocate against
+
+   Instruction relocations keep the opcode bits of the existing word and
+   only replace the immediate field.  Out-of-range page or branch
+   distances are reported with tcc_error(); unknown types only print a
+   FIXME note on stderr. */
+void relocate(TCCState *s1, ElfW_Rel *rel, int type, unsigned char *ptr, addr_t addr, addr_t val)
+{
+    int sym_index = ELFW(R_SYM)(rel->r_info);
+#ifdef DEBUG_RELOC
+    ElfW(Sym) *sym = &((ElfW(Sym) *)symtab_section->data)[sym_index];
+#endif
+
+    switch(type) {
+        case R_AARCH64_ABS64:
+            write64le(ptr, val);
+            return;
+        case R_AARCH64_ABS32:
+            write32le(ptr, val);
+            return;
+	case R_AARCH64_PREL32:
+	    write32le(ptr, val - addr);
+	    return;
+        /* MOVW group: each variant inserts one 16-bit slice of val into
+           bits 5..20 of the instruction */
+        case R_AARCH64_MOVW_UABS_G0_NC:
+            write32le(ptr, ((read32le(ptr) & 0xffe0001f) |
+                            (val & 0xffff) << 5));
+            return;
+        case R_AARCH64_MOVW_UABS_G1_NC:
+            write32le(ptr, ((read32le(ptr) & 0xffe0001f) |
+                            (val >> 16 & 0xffff) << 5));
+            return;
+        case R_AARCH64_MOVW_UABS_G2_NC:
+            write32le(ptr, ((read32le(ptr) & 0xffe0001f) |
+                            (val >> 32 & 0xffff) << 5));
+            return;
+        case R_AARCH64_MOVW_UABS_G3:
+            write32le(ptr, ((read32le(ptr) & 0xffe0001f) |
+                            (val >> 48 & 0xffff) << 5));
+            return;
+        case R_AARCH64_ADR_PREL_PG_HI21: {
+            /* distance in 4 KiB pages; must fit the signed 21-bit field */
+            uint64_t off = (val >> 12) - (addr >> 12);
+            if ((off + ((uint64_t)1 << 20)) >> 21)
+                tcc_error("R_AARCH64_ADR_PREL_PG_HI21 relocation failed");
+            write32le(ptr, ((read32le(ptr) & 0x9f00001f) |
+                            (off & 0x1ffffc) << 3 | (off & 3) << 29));
+            return;
+        }
+        case R_AARCH64_ADD_ABS_LO12_NC:
+            /* low 12 bits of val go into bits 10..21 */
+            write32le(ptr, ((read32le(ptr) & 0xffc003ff) |
+                            (val & 0xfff) << 10));
+            return;
+        case R_AARCH64_JUMP26:
+        case R_AARCH64_CALL26:
+#ifdef DEBUG_RELOC
+	    printf ("reloc %d @ 0x%lx: val=0x%lx name=%s\n", type, addr, val,
+		    (char *) symtab_section->link->data + sym->st_name);
+#endif
+            /* rejects displacements that are not 4-byte aligned or do not
+               fit in 28 bits (signed) */
+            if (((val - addr) + ((uint64_t)1 << 27)) & ~(uint64_t)0xffffffc)
+                tcc_error("R_AARCH64_(JUMP|CALL)26 relocation failed"
+                          " (val=%lx, addr=%lx)", val, addr);
+            /* the whole word is rewritten; bit 31 is set for CALL26 */
+            write32le(ptr, (0x14000000 |
+                            (uint32_t)(type == R_AARCH64_CALL26) << 31 |
+                            ((val - addr) >> 2 & 0x3ffffff)));
+            return;
+        case R_AARCH64_ADR_GOT_PAGE: {
+            /* page distance from addr to the symbol's GOT slot */
+            uint64_t off =
+                (((s1->got->sh_addr +
+                   s1->sym_attrs[sym_index].got_offset) >> 12) - (addr >> 12));
+            if ((off + ((uint64_t)1 << 20)) >> 21)
+                tcc_error("R_AARCH64_ADR_GOT_PAGE relocation failed");
+            write32le(ptr, ((read32le(ptr) & 0x9f00001f) |
+                            (off & 0x1ffffc) << 3 | (off & 3) << 29));
+            return;
+        }
+        case R_AARCH64_LD64_GOT_LO12_NC:
+            /* in-page offset of the symbol's GOT slot (bits 3..11 of its
+               address) goes into the load's immediate field */
+            write32le(ptr,
+                      ((read32le(ptr) & 0xfff803ff) |
+                       ((s1->got->sh_addr +
+                         s1->sym_attrs[sym_index].got_offset) & 0xff8) << 7));
+            return;
+        case R_AARCH64_COPY:
+            return;
+        case R_AARCH64_GLOB_DAT:
+        case R_AARCH64_JUMP_SLOT:
+            /* They don't need addend */
+#ifdef DEBUG_RELOC
+	    printf ("reloc %d @ 0x%lx: val=0x%lx name=%s\n", type, addr,
+		    val - rel->r_addend,
+		    (char *) symtab_section->link->data + sym->st_name);
+#endif
+            write64le(ptr, val - rel->r_addend);
+            return;
+        case R_AARCH64_RELATIVE:
+#ifdef TCC_TARGET_PE
+            add32le(ptr, val - s1->pe_imagebase);
+#endif
+            /* do nothing */
+            return;
+        default:
+            fprintf(stderr, "FIXME: handle reloc type %x at %x [%p] to %x\n",
+                    type, (unsigned)addr, ptr, (unsigned)val);
+            return;
+    }
+}
+
+#endif /* !TARGET_DEFS_ONLY */
--- a/bootstrap/.gitignore
+++ b/bootstrap/.gitignore
@ -0,0 +1,2 @@
+/libc.a
+/tcc
--- a/bootstrap/base_config.h
+++ b/bootstrap/base_config.h
@ -0,0 +1,4 @@
+#ifndef CONFIG_TCCDIR
+# define CONFIG_TCCDIR "/usr/lib/tcc"
+#endif
+#define TCC_VERSION "0.9.27"
--- a/bootstrap/build_x86_64.sh
+++ b/bootstrap/build_x86_64.sh
@ -0,0 +1,14 @@
+#! /usr/bin/env sh
+
+cd "$(dirname "$0")" &&
+
+# Compile dietlibc
+gcc -r -nostdlib -o libc.a -Ilibc/include -I../include \
+    libc/src/cstring.c libc/src/cstdio.c libc/src/syscalls.c &&
+
+# Build compiler
+ln -fs base_config.h config.h &&
+gcc -o tcc -static -nostdlib \
+    -DNDEBUG -DTCC_NOT_NATIVE -DCONFIG_LDDIR="\"lib64\"" -DTCC_TARGET_X86_64 -DONE_SOURCE=0 -I. -I../include -Ilibc/include \
+    ../tcc.c ../libtcc.c ../tccpp.c ../tccgen.c ../tccelf.c ../tccasm.c ../tccrun.c ../x86_64-gen.c ../x86_64-link.c ../i386-asm.c\
+    ./libc.a
--- a/bootstrap/libc/include/stdio.h
+++ b/bootstrap/libc/include/stdio.h
@ -0,0 +1,17 @@
+#include <stddef.h>
+
+#ifndef __STDIO_H__
+#define __STDIO_H__
+
+// Type definitions
+typedef void FILE;
+
+// Values
+extern FILE* stdout;
+extern FILE* stderr;
+extern FILE* stdin;
+
+// Basic file handlers
+extern FILE* fopen(const char* filename, const char* mode);
+
+#endif
--- a/bootstrap/libc/include/string.h
+++ b/bootstrap/libc/include/string.h
@ -0,0 +1,22 @@
+#ifndef __STRING_H__
+#define __STRING_H__
+
+#include <stddef.h>
+
+// Memory
+const void* memchr(const void* ptr, int value, size_t size);
+int memcmp(const void* ptr1, const void* ptr2, size_t num);
+void* memcpy(void* destination, const void* source, size_t num);
+void* memmove(void* destination, const void* source, size_t num);
+void* memset(void* ptr, int value, size_t num);
+
+// Strings
+char* strchr(const char* str, int character);
+int strcmp(const char* str1, const char* str2);
+char* strcpy(char* destination, const char* source);
+size_t strlen(const char* str);
+int strncmp(const char* str1, const char* str2, size_t num);
+char* strrchr(const char* str, int character);
+char* strstr(const char* str1, const char* str2);
+
+#endif 
--- a/bootstrap/libc/src/base_x86_64.s
+++ b/bootstrap/libc/src/base_x86_64.s
@ -0,0 +1,34 @@
+.global _start
+.global __syscall
+
+.text
+_start:
+    // Process entry point. On entry the stack holds, from (%rsp) upwards:
+    // argc, argv[0..argc-1], NULL, envp[...], NULL.
+    // Load them into the registers main(argc, argv, envp) expects;
+    // without this main would receive whatever the registers contain.
+    xor %ebp, %ebp
+    mov (%rsp), %rdi
+    lea 8(%rsp), %rsi
+    lea 16(%rsp,%rdi,8), %rdx
+
+    // Run main
+    call main
+
+    // Exit process: exit(status) is syscall 60, status = main's result
+    mov %rax, %rdi
+    mov $60, %rax
+    syscall
+
+__syscall:
+    // Raw system call trampoline, callable from C as
+    //   __syscall(number, a1, a2, a3, a4, a5, a6)
+    // C side (System V AMD64): rdi = number, rsi/rdx/rcx/r8/r9 = a1..a5,
+    //                          a6 = first stack argument.
+    // Kernel side: rax = number, rdi/rsi/rdx/r10/r8/r9 = a1..a6.
+    // The kernel's result is returned unchanged in rax.
+
+    // Save the config (except rax)
+    push %rdi
+    push %rsi
+    push %rdx
+    push %r10
+
+    // Make the syscall. Full 64-bit moves are used for every argument so
+    // that pointers and sizes are not truncated.
+    mov %rdi, %rax
+    mov %rsi, %rdi
+    mov %rdx, %rsi
+    mov %rcx, %rdx
+    mov %r8, %r10
+    mov %r9, %r8
+    // a6: skip the 4 saved registers (32 bytes) and the return address (8)
+    mov 40(%rsp), %r9
+    syscall
+
+    // Restore and return
+    pop %r10
+    pop %rdx
+    pop %rsi
+    pop %rdi
+    retq
--- a/bootstrap/libc/src/cstdio.c
+++ b/bootstrap/libc/src/cstdio.c
@ -0,0 +1,18 @@
+#include "syscalls.h"
+
+// File definition
+typedef void FILE;
+
+// Values TODO: Make it correct
+//FILE* stdout = 0;
+//FILE* stderr = 1;
+//FILE* stdin = 2;
+
+// Open `filename` according to `mode` and return the file descriptor
+// disguised as a FILE* (there is no FILE structure in this libc yet).
+//
+// Supported modes: "r", "w", "a", each optionally followed by '+'
+// (read and write) and/or 'b' (ignored). "w" and "a" create the file
+// with permissions rw-r--r-- when it does not exist.
+//
+// Returns NULL when `mode` is missing or invalid, or when open() fails.
+// NOTE: a successful open() that yields descriptor 0 is indistinguishable
+// from NULL with this representation.
+FILE* fopen(const char* filename, const char* mode) {
+    // open(2) flag values as used by Linux on x86-64 (the only platform
+    // this bootstrap libc is built for, see build_x86_64.sh)
+    enum {
+        FOPEN_RDONLY = 0x000,
+        FOPEN_WRONLY = 0x001,
+        FOPEN_RDWR   = 0x002,
+        FOPEN_ACCESS = 0x003,
+        FOPEN_CREAT  = 0x040,
+        FOPEN_TRUNC  = 0x200,
+        FOPEN_APPEND = 0x400,
+        FOPEN_PERMS  = 0x1a4
+    };
+    int flags;
+    int inode;
+
+    if(mode == NULL) {
+        return NULL;
+    }
+    switch(mode[0]) {
+        case 'r':
+            flags = FOPEN_RDONLY;
+            break;
+        case 'w':
+            flags = FOPEN_WRONLY | FOPEN_CREAT | FOPEN_TRUNC;
+            break;
+        case 'a':
+            flags = FOPEN_WRONLY | FOPEN_CREAT | FOPEN_APPEND;
+            break;
+        default:
+            return NULL;
+    }
+    // A '+' anywhere after the first character upgrades to read/write
+    for(const char* p = mode + 1; (*p) != 0; p++) {
+        if((*p) == '+') {
+            flags = (flags & ~FOPEN_ACCESS) | FOPEN_RDWR;
+        }
+    }
+
+    inode = open(filename, flags, FOPEN_PERMS);
+    if(inode < 0) {
+        return NULL;
+    }
+    return (FILE*) (size_t) inode;
+}
--- a/bootstrap/libc/src/cstring.c
+++ b/bootstrap/libc/src/cstring.c
@ -0,0 +1,195 @
+#include <stddef.h>
+
+// Tools
+
+// Return the smaller of two sizes.
+size_t min(size_t a, size_t b) {
+    if(a < b) {
+        return a;
+    }
+    else {
+        return b;
+    }
+}
+
+// Memory
+
+// Find the first byte equal to `value` (converted to unsigned char) in the
+// first `size` bytes at `ptr`. Returns a pointer to it, or NULL if absent.
+const void* memchr(const void* ptr, int value, size_t size) {
+    const unsigned char* bytes = (const unsigned char*) ptr;
+    unsigned char wanted = (unsigned char) value;
+    for(size_t i = 0; i < size; i++) {
+        if(bytes[i] == wanted) {
+            return &(bytes[i]);
+        }
+    }
+    return NULL;
+}
+
+// Compare the first `num` bytes of two buffers as unsigned char values.
+// Returns -1, 0 or 1 when ptr1 is lower than, equal to or greater than ptr2.
+int memcmp(const void* ptr1, const void* ptr2, size_t num) {
+    const unsigned char* left = (const unsigned char*) ptr1;
+    const unsigned char* right = (const unsigned char*) ptr2;
+    for(size_t i = 0; i < num; i++) {
+        if(left[i] < right[i]) {
+            return -1;
+        }
+        else if(left[i] > right[i]) {
+            return 1;
+        }
+    }
+    return 0;
+}
+
+// Copy `num` bytes from `source` to `destination` and return `destination`.
+// The copy direction depends on the relative position of the two buffers,
+// so overlapping ranges are copied correctly as well.
+void* memcpy(void* destination, const void* source, size_t num) {
+    char* destination_real = (char*) destination;
+    const char* source_real = (const char*) source;
+    if(source_real > destination_real) {
+        // Source lies above the destination: copy front to back
+        for(size_t i = 0; i < num; i++) {
+            destination_real[i] = source_real[i];
+        }
+    }
+    else {
+        // Source lies at or below the destination: copy back to front
+        for(size_t i = num; i > 0; i--) {
+            destination_real[i - 1] = source_real[i - 1];
+        }
+    }
+    return destination;
+}
+
+// Same as memcpy, which already copes with overlapping buffers.
+void* memmove(void* destination, const void* source, size_t num) {
+    return memcpy(destination, source, num);
+}
+
+// Set `num` bytes at `ptr` to `value` (converted to char) and return `ptr`.
+void* memset(void* ptr, int value, size_t num) {
+    char* ptr_real = (char*) ptr;
+    char value_real = value;
+    for(size_t i = 0; i < num; i++) {
+        ptr_real[i] = value_real;
+    }
+    return ptr;
+}
+
+// String
+
+// Find the first occurrence of `character` (converted to char) in `str`.
+// The terminator counts as part of the string, so searching for 0 returns a
+// pointer to it. Returns NULL when the character does not occur.
+char* strchr(const char* str, int character) {
+    char wanted = (char) character;
+    for(;; str++) {
+        if((*str) == wanted) {
+            return (char*) str;
+        }
+        if((*str) == 0) {
+            return NULL;
+        }
+    }
+}
+
+// Compare two strings byte by byte as unsigned char values.
+// Returns -1, 0 or 1 when str1 sorts before, equal to or after str2.
+int strcmp(const char* str1, const char* str2) {
+    const unsigned char* left = (const unsigned char*) str1;
+    const unsigned char* right = (const unsigned char*) str2;
+    // Advance while both strings agree and neither has ended
+    while((*left) != 0 && (*left) == (*right)) {
+        left++;
+        right++;
+    }
+    if((*left) < (*right)) {
+        return -1;
+    }
+    else if((*left) > (*right)) {
+        return 1;
+    }
+    return 0;
+}
+
+// Copy `source`, including its terminator, to `destination`.
+// Returns `destination`.
+char* strcpy(char* destination, const char* source) {
+    char* result = destination;
+    while((*source) != 0) {
+        (*destination) = (*source);
+        destination++;
+        source++;
+    }
+    (*destination) = 0;
+    return result;
+}
+
+// Return the number of characters before the terminator.
+size_t strlen(const char* str) {
+    size_t size = 0;
+    while(str[size] != 0) {
+        size++;
+    }
+    return size;
+}
+
+// Compare at most `num` characters of two strings as unsigned char values.
+// Stops at the first difference or at a terminator, so nothing past the end
+// of either string and nothing past `num` characters is read.
+// Returns -1, 0 or 1 like strcmp.
+int strncmp(const char* str1, const char* str2, size_t num) {
+    for(size_t i = 0; i < num; i++) {
+        unsigned char left = (unsigned char) str1[i];
+        unsigned char right = (unsigned char) str2[i];
+        if(left < right) {
+            return -1;
+        }
+        else if(left > right) {
+            return 1;
+        }
+        else if(left == 0) {
+            return 0;
+        }
+    }
+    return 0;
+}
+
+// Find the last occurrence of `character` (converted to char) in `str`.
+// Searching for 0 returns a pointer to the terminator.
+// Returns NULL when the character does not occur.
+char* strrchr(const char* str, int character) {
+    char wanted = (char) character;
+    char* result = NULL;
+    while((*str) != 0) {
+        if((*str) == wanted) {
+            result = (char*) str;
+        }
+        str++;
+    }
+    if(wanted == 0) {
+        result = (char*) str;
+    }
+    return result;
+}
+
+// Find the first occurrence of the string `str2` inside `str1`.
+// An empty `str2` matches at the start of `str1`.
+// Returns NULL when `str2` does not occur.
+char* strstr(const char* str1, const char* str2) {
+    size_t len_str1 = strlen(str1);
+    size_t len_str2 = strlen(str2);
+    // Without this check the unsigned subtraction below would wrap around
+    if(len_str2 > len_str1) {
+        return NULL;
+    }
+    for(size_t i = 0; i <= len_str1 - len_str2; i++) {
+        if(memcmp(&(str1[i]), str2, len_str2) == 0) {
+            return (char*) &(str1[i]);
+        }
+    }
+    return NULL;
+}
--- a/bootstrap/libc/src/syscalls.c
+++ b/bootstrap/libc/src/syscalls.c
@ -0,0 +1,23 @
+#include "syscalls.h"
+
+// System call numbers; only the x86_64 numbering is available
+#ifndef __x86_64__
+    #error Unsupported architecture
+#endif
+#define SYSCALL_READ 0
+#define SYSCALL_WRITE 1
+#define SYSCALL_OPEN 2
+#define SYSCALL_CLOSE 3
+
+// Issue the open system call and hand back its result as an int
+int open(const char* path, int flags, int mode) {
+    size_t result = __syscall(SYSCALL_OPEN, (size_t) path, (size_t) flags,
+                              (size_t) mode, 0);
+    return result;
+}
+
+// Issue the close system call and hand back its result as an int
+int close(int fd) {
+    size_t result = __syscall(SYSCALL_CLOSE, (size_t) fd, 0, 0, 0);
+    return result;
+}
--- a/bootstrap/libc/src/syscalls.h
+++ b/bootstrap/libc/src/syscalls.h
@ -0,0 +1,15 @
+#ifndef __SYSCALLS_H__
+#define __SYSCALLS_H__
+
+#include <stddef.h>
+
+// Raw system call entry implemented in base_x86_64.s: performs system call
+// number `code` with up to four arguments and returns the kernel's result.
+extern volatile size_t __syscall(size_t code, size_t arg1, size_t arg2,
+                                 size_t arg3, size_t arg4);
+
+// Wrappers implemented in syscalls.c
+int open(const char* path, int flags, int mode);
+int close(int fd);
+
+#endif
--- a/bootstrap/libc/test/.gitignore
+++ b/bootstrap/libc/test/.gitignore
@ -0,0 +1,2 @@
+*.test
+libc.a
--- a/bootstrap/libc/test/base/exit_0.c
+++ b/bootstrap/libc/test/base/exit_0.c
@ -0,0 +1,3 @@
+int main() { // fixture for base/test.sh, which expects exit status 0
+    return 0;
+}
--- a/bootstrap/libc/test/base/exit_1.c
+++ b/bootstrap/libc/test/base/exit_1.c
@ -0,0 +1,3 @@
+int main() { // fixture for base/test.sh, which expects exit status 1
+    return 1;
+}
--- a/bootstrap/libc/test/base/exit_2.c
+++ b/bootstrap/libc/test/base/exit_2.c
@ -0,0 +1,3 @@
+int main() { // fixture for base/test.sh, which expects exit status 2
+    return 2;
+}
--- a/bootstrap/libc/test/base/test.sh
+++ b/bootstrap/libc/test/base/test.sh
@ -0,0 +1,28 @
+#! /usr/bin/env bash
+# Args: <compiler>
+#
+# Builds every exit_N.c against ../libc.a and checks that the program exits
+# with status N.
+
+# Compiler command including its flags; expanded unquoted below on purpose so
+# that the flags become separate words.
+compiler="$1"
+
+# run_exit_test <name> <expected exit status>
+# Compiles <name>.c into <name>.test and runs it. Prints FAILED and returns 1
+# when the exit status differs; returns 1 silently when the build fails.
+run_exit_test() {
+    echo "TEST: $1" &&
+    $compiler -I../../include -o "$1.test" "$1.c" ../libc.a || return 1
+    "./$1.test"
+    if [ "$?" != "$2" ]; then
+        echo FAILED
+        return 1
+    fi
+}
+
+# Test exits
+cd "$(dirname "$0")" &&
+run_exit_test exit_0 0 &&
+run_exit_test exit_1 1 &&
+run_exit_test exit_2 2
--- a/bootstrap/libc/test/stdio/fopen_fail.c
+++ b/bootstrap/libc/test/stdio/fopen_fail.c
@ -0,0 +1,13 @@
+#include <stddef.h>
+#include <stdio.h>
+
+
+int main() { // exit status 0 (pass) only if fopen reports the missing file as NULL
+    FILE* file = fopen("not_exists", "r");
+    if(file == NULL) {
+        return 0;
+    }
+    else {
+        return 1;
+    }
+} 
--- a/bootstrap/libc/test/stdio/fopen_success.c
+++ b/bootstrap/libc/test/stdio/fopen_success.c
@ -0,0 +1,13 @@
+#include <stddef.h>
+#include <stdio.h>
+
+
+int main() { // exit status 0 (pass) only if fopen can open this test's own source file
+    FILE* file = fopen("fopen_success.c", "r");
+    if(file == NULL) {
+        return 1;
+    }
+    else {
+        return 0;
+    }
+} 
--- a/bootstrap/libc/test/stdio/test.sh
+++ b/bootstrap/libc/test/stdio/test.sh
@ -0,0 +1,21 @
+#! /usr/bin/env bash
+# Args: [compiler]
+#
+# Builds and runs the fopen tests against ../libc.a. The compiler command
+# (with its flags) is taken from $1; without it the previous hard-coded
+# command is used.
+
+compiler="${1:-gcc -nostdlib -O0}"
+
+cd "$(dirname "$0")" &&
+(
+    echo TEST: fopen_fail
+    # $compiler is unquoted on purpose: it carries the flags as extra words
+    $compiler -I ../../include -o fopen_fail.test fopen_fail.c ../libc.a &&
+    ./fopen_fail.test || ( echo ERROR; exit 1)
+) &&
+(
+    echo TEST: fopen_success
+    $compiler -I ../../include -o fopen_success.test fopen_success.c ../libc.a &&
+    ./fopen_success.test || ( echo ERROR; exit 1)
+)
--- a/bootstrap/libc/test/test_x86_64.sh
+++ b/bootstrap/libc/test/test_x86_64.sh
@ -0,0 +1,13 @@
+#! /usr/bin/env bash
+# Args: <compiler>  (command and flags in one argument, e.g. "gcc -nostdlib -g")
+#
+# Builds libc.a from ../src, then runs the base and stdio test scripts.
+# Build libc: $1 is expanded unquoted so that its flags become separate words
+cd "$(dirname "$0")" &&
+echo BUILD libc.a  &&
+$1 -r -o libc.a -I../include \
+    ../src/cstring.c ../src/cstdio.c ../src/syscalls.c ../src/base_x86_64.s &&
+
+# Test: each script receives the same compiler command as its first argument
+./base/test.sh "$1" &&
+./stdio/test.sh "$1"
--- a/bootstrap/test_gcc_x86_64.sh
+++ b/bootstrap/test_gcc_x86_64.sh
@ -0,0 +1,3 @@
+#! /usr/bin/env sh
+# Runs the libc test suite with gcc; the whole compiler command is passed as one argument
+exec "$(dirname "$0")"/libc/test/test_x86_64.sh "gcc -nostdlib -g"
--- a/c67-gen.c
+++ b/c67-gen.c
@ -18,7 +18,9 @@
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

-//#define ASSEMBLY_LISTING_C67
+#ifdef TARGET_DEFS_ONLY
+
+/* #define ASSEMBLY_LISTING_C67 */

 /* number of available registers */
 #define NB_REGS            24
@ -85,12 +87,38 @@ enum {
    TREG_C67_B13,
 };

-const int reg_classes[NB_REGS] = {
-						/* eax */ RC_INT | RC_FLOAT | RC_EAX,
-						// only allow even regs for floats (allow for doubles)
+/* return registers for function */
+#define REG_IRET TREG_C67_A4	/* single word int return register */
+#define REG_LRET TREG_C67_A5	/* second word return register (for long long) */
+#define REG_FRET TREG_C67_A4	/* float return register */
+
+/* defined if function parameters must be evaluated in reverse order */
+/* #define INVERT_FUNC_PARAMS */
+
+/* defined if structures are passed as pointers. Otherwise structures
+   are directly pushed on stack. */
+/* #define FUNC_STRUCT_PARAM_AS_PTR */
+
+/* pointer size, in bytes */
+#define PTR_SIZE 4
+
+/* long double size and alignment, in bytes */
+#define LDOUBLE_SIZE  12
+#define LDOUBLE_ALIGN 4
+/* maximum alignment (for aligned attribute support) */
+#define MAX_ALIGN     8
+
+/******************************************************/
+#else /* ! TARGET_DEFS_ONLY */
+/******************************************************/
+#include "tcc.h"
+
+ST_DATA const int reg_classes[NB_REGS] = {
+    /* eax */ RC_INT | RC_FLOAT | RC_EAX, 
+    // only allow even regs for floats (allow for doubles)
    /* ecx */ RC_INT | RC_ECX,
-								/* edx */ RC_INT | RC_INT_BSIDE | RC_FLOAT | RC_EDX,
-								// only allow even regs for floats (allow for doubles)
+    /* edx */ RC_INT | RC_INT_BSIDE | RC_FLOAT | RC_EDX,
+    // only allow even regs for floats (allow for doubles)
    /* st0 */ RC_INT | RC_INT_BSIDE | RC_ST0,
    /* A4  */ RC_C67_A4,
    /* A5  */ RC_C67_A5,
@ -114,68 +142,36 @@ const int reg_classes[NB_REGS] = {
    /* B13  */ RC_C67_B11
 };

-/* return registers for function */
-#define REG_IRET TREG_C67_A4	/* single word int return register */
-#define REG_LRET TREG_C67_A5	/* second word return register (for long long) */
-#define REG_FRET TREG_C67_A4	/* float return register */
-
-
-#define ALWAYS_ASSERT(x) \
-do {\
-   if (!(x))\
-       error("internal compiler error file at %s:%d", __FILE__, __LINE__);\
-} while (0)
-
 // although tcc thinks it is passing parameters on the stack,
 // the C67 really passes up to the first 10 params in special
 // regs or regs pairs (for 64 bit params).  So keep track of
 // the stack offsets so we can translate to the appropriate 
 // reg (pair)

-
 #define NoCallArgsPassedOnStack 10
 int NoOfCurFuncArgs;
 int TranslateStackToReg[NoCallArgsPassedOnStack];
 int ParamLocOnStack[NoCallArgsPassedOnStack];
 int TotalBytesPushedOnStack;

-/* defined if function parameters must be evaluated in reverse order */
+#ifndef FALSE
+# define FALSE 0
+# define TRUE 1
+#endif

-//#define INVERT_FUNC_PARAMS
+#undef BOOL
+#define BOOL int

-/* defined if structures are passed as pointers. Otherwise structures
-   are directly pushed on stack. */
-//#define FUNC_STRUCT_PARAM_AS_PTR
-
-/* pointer size, in bytes */
-#define PTR_SIZE 4
-
-/* long double size and alignment, in bytes */
-#define LDOUBLE_SIZE  12
-#define LDOUBLE_ALIGN 4
-/* maximum alignment (for aligned attribute support) */
-#define MAX_ALIGN     8
+#define ALWAYS_ASSERT(x) \
+do {\
+   if (!(x))\
+       tcc_error("internal compiler error file at %s:%d", __FILE__, __LINE__);\
+} while (0)

 /******************************************************/
-/* ELF defines */
-
-#define EM_TCC_TARGET EM_C60
-
-/* relocation type for 32 bit data relocation */
-#define R_DATA_32   R_C60_32
-#define R_DATA_PTR  R_C60_32
-#define R_JMP_SLOT  R_C60_JMP_SLOT
-#define R_COPY      R_C60_COPY
-
-#define ELF_START_ADDR 0x00000400
-#define ELF_PAGE_SIZE  0x1000
-
-/******************************************************/
-
 static unsigned long func_sub_sp_offset;
 static int func_ret_sub;

-
 static BOOL C67_invert_test;
 static int C67_compare_reg;

@ -183,11 +179,11 @@ static int C67_compare_reg;
 FILE *f = NULL;
 #endif

-
 void C67_g(int c)
 {
    int ind1;
-
+    if (nocode_wanted)
+        return;
 #ifdef ASSEMBLY_LISTING_C67
    fprintf(f, " %08X", c);
 #endif
@ -236,15 +232,15 @@ void gsym(int t)
 }

 // these are regs that tcc doesn't really know about, 
-// but asign them unique values so the mapping routines
-// can distinquish them
+// but assign them unique values so the mapping routines
+// can distinguish them

 #define C67_A0 105
 #define C67_SP 106
 #define C67_B3 107
 #define C67_FP 108
 #define C67_B2 109
-#define C67_CREG_ZERO -1	// Special code for no condition reg test
+#define C67_CREG_ZERO -1	/* Special code for no condition reg test */


 int ConvertRegToRegClass(int r)
@ -1553,23 +1549,23 @@ void C67_SHR(int r, int v)
 void load(int r, SValue * sv)
 {
    int v, t, ft, fc, fr, size = 0, element;
-    BOOL Unsigned = false;
+    BOOL Unsigned = FALSE;
    SValue v1;

    fr = sv->r;
    ft = sv->type.t;
-    fc = sv->c.ul;
+    fc = sv->c.i;

    v = fr & VT_VALMASK;
    if (fr & VT_LVAL) {
 	if (v == VT_LLOCAL) {
 	    v1.type.t = VT_INT;
 	    v1.r = VT_LOCAL | VT_LVAL;
-	    v1.c.ul = fc;
+	    v1.c.i = fc;
 	    load(r, &v1);
 	    fr = r;
 	} else if ((ft & VT_BTYPE) == VT_LDOUBLE) {
-	    error("long double not supported");
+	    tcc_error("long double not supported");
 	} else if ((ft & VT_TYPE) == VT_BYTE) {
 	    size = 1;
 	} else if ((ft & VT_TYPE) == (VT_BYTE | VT_UNSIGNED)) {
@ -1717,13 +1713,13 @@ void store(int r, SValue * v)
    int fr, bt, ft, fc, size, t, element;

    ft = v->type.t;
-    fc = v->c.ul;
+    fc = v->c.i;
    fr = v->r & VT_VALMASK;
    bt = ft & VT_BTYPE;
    /* XXX: incorrect if float reg to reg */

    if (bt == VT_LDOUBLE) {
-	error("long double not supported");
+	tcc_error("long double not supported");
    } else {
 	if (bt == VT_SHORT)
 	    size = 2;
@ -1870,6 +1866,13 @@ static void gcall_or_jmp(int is_jmp)
    }
 }

+/* Return the number of registers needed to return the struct, or 0 if
+   returning via struct pointer. This implementation always returns 0. */
+ST_FUNC int gfunc_sret(CType *vt, int variadic, CType *ret, int *ret_align, int *regsize) {
+    *ret_align = 1; // no re-alignment requested; NOTE(review): comment used to say "x86-64", presumably copied from that backend - confirm for C67
+    return 0;
+}
+
 /* generate function call with address in (vtop->t, vtop->c) and free function
   context. Stack entry is popped */
 void gfunc_call(int nb_args)
@ -1878,24 +1881,22 @@ void gfunc_call(int nb_args)
    int args_sizes[NoCallArgsPassedOnStack];

    if (nb_args > NoCallArgsPassedOnStack) {
-	error("more than 10 function params not currently supported");
+	tcc_error("more than 10 function params not currently supported");
 	// handle more than 10, put some on the stack
    }

    for (i = 0; i < nb_args; i++) {
 	if ((vtop->type.t & VT_BTYPE) == VT_STRUCT) {
 	    ALWAYS_ASSERT(FALSE);
-	} else if ((vtop->type.t & VT_BTYPE) == VT_STRUCT) {
-	    ALWAYS_ASSERT(FALSE);
 	} else {
 	    /* simple type (currently always same size) */
 	    /* XXX: implicit cast ? */


 	    if ((vtop->type.t & VT_BTYPE) == VT_LLONG) {
-		error("long long not supported");
+		tcc_error("long long not supported");
 	    } else if ((vtop->type.t & VT_BTYPE) == VT_LDOUBLE) {
-		error("long double not supported");
+		tcc_error("long double not supported");
 	    } else if ((vtop->type.t & VT_BTYPE) == VT_DOUBLE) {
 		size = 8;
 	    } else {
@ -1950,11 +1951,12 @@ void gfunc_prolog(CType * func_type)
    CType *type;

    sym = func_type->ref;
-    func_call = sym->r;
+    func_call = sym->f.func_call;
    addr = 8;
    /* if the function returns a structure, then add an
       implicit pointer parameter */
    func_vt = sym->type;
+    func_var = (sym->f.func_type == FUNC_ELLIPSIS);
    if ((func_vt.t & VT_BTYPE) == VT_STRUCT) {
 	func_vc = addr;
 	addr += 4;
@ -2037,6 +2039,8 @@ void gfunc_epilog(void)
 int gjmp(int t)
 {
    int ind1 = ind;
+    if (nocode_wanted)
+        return t;

    C67_MVKL(C67_A0, t);	//r=reg to load,  constant
    C67_MVKH(C67_A0, t);	//r=reg to load,  constant
@ -2069,7 +2073,9 @@ int gtst(int inv, int t)
    int v, *p;

    v = vtop->r & VT_VALMASK;
-    if (v == VT_CMP) {
+    if (nocode_wanted) {
+        ;
+    } else if (v == VT_CMP) {
 	/* fast case : can jump directly since flags are set */
 	// C67 uses B2 sort of as flags register
 	ind1 = ind;
@ -2091,13 +2097,12 @@ int gtst(int inv, int t)
 	/* && or || optimization */
 	if ((v & 1) == inv) {
 	    /* insert vtop->c jump list in t */
-	    p = &vtop->c.i;

 	    // I guess the idea is to traverse to the
 	    // null at the end of the list and store t
 	    // there

-	    n = *p;
+	    n = vtop->c.i;
 	    while (n != 0) {
 		p = (int *) (cur_text_section->data + n);

@ -2113,37 +2118,6 @@ int gtst(int inv, int t)
 	    t = gjmp(t);
 	    gsym(vtop->c.i);
 	}
-    } else {
-	if (is_float(vtop->type.t)) {
-	    vpushi(0);
-	    gen_op(TOK_NE);
-	}
-	if ((vtop->r & (VT_VALMASK | VT_LVAL | VT_SYM)) == VT_CONST) {
-	    /* constant jmp optimization */
-	    if ((vtop->c.i != 0) != inv)
-		t = gjmp(t);
-	} else {
-	    // I think we need to get the value on the stack
-	    // into a register, test it, and generate a branch
-	    // return the address of the branch, so it can be
-	    // later patched
-
-	    v = gv(RC_INT);	// get value into a reg 
-	    ind1 = ind;
-	    C67_MVKL(C67_A0, t);	//r=reg to load, constant
-	    C67_MVKH(C67_A0, t);	//r=reg to load, constant
-
-	    if (v != TREG_EAX &&	// check if not already in a conditional test reg
-		v != TREG_EDX && v != TREG_ST0 && v != C67_B2) {
-		C67_MV(v, C67_B2);
-		v = C67_B2;
-	    }
-
-	    C67_IREG_B_REG(inv, v, C67_A0);	// [!R] B.S2x  A0
-	    C67_NOP(5);
-	    t = ind1;		//return where we need to patch
-	    ind1 = ind;
-	}
    }
    vtop--;
    return t;
@ -2179,34 +2153,34 @@ void gen_opi(int op)

 	if (op == TOK_LT) {
 	    C67_CMPLT(r, fr, C67_B2);
-	    C67_invert_test = false;
+	    C67_invert_test = FALSE;
 	} else if (op == TOK_GE) {
 	    C67_CMPLT(r, fr, C67_B2);
-	    C67_invert_test = true;
+	    C67_invert_test = TRUE;
 	} else if (op == TOK_GT) {
 	    C67_CMPGT(r, fr, C67_B2);
-	    C67_invert_test = false;
+	    C67_invert_test = FALSE;
 	} else if (op == TOK_LE) {
 	    C67_CMPGT(r, fr, C67_B2);
-	    C67_invert_test = true;
+	    C67_invert_test = TRUE;
 	} else if (op == TOK_EQ) {
 	    C67_CMPEQ(r, fr, C67_B2);
-	    C67_invert_test = false;
+	    C67_invert_test = FALSE;
 	} else if (op == TOK_NE) {
 	    C67_CMPEQ(r, fr, C67_B2);
-	    C67_invert_test = true;
+	    C67_invert_test = TRUE;
 	} else if (op == TOK_ULT) {
 	    C67_CMPLTU(r, fr, C67_B2);
-	    C67_invert_test = false;
+	    C67_invert_test = FALSE;
 	} else if (op == TOK_UGE) {
 	    C67_CMPLTU(r, fr, C67_B2);
-	    C67_invert_test = true;
+	    C67_invert_test = TRUE;
 	} else if (op == TOK_UGT) {
 	    C67_CMPGTU(r, fr, C67_B2);
-	    C67_invert_test = false;
+	    C67_invert_test = FALSE;
 	} else if (op == TOK_ULE) {
 	    C67_CMPGTU(r, fr, C67_B2);
-	    C67_invert_test = true;
+	    C67_invert_test = TRUE;
 	} else if (op == '+')
 	    C67_ADD(fr, r);	// ADD  r,fr,r
 	else if (op == '-')
@ -2251,7 +2225,7 @@ void gen_opi(int op)
 	r = vtop[-1].r;
 	fr = vtop[0].r;
 	vtop--;
-	C67_MPYI(fr, r);	// 32 bit bultiply  fr,r,fr
+	C67_MPYI(fr, r);	// 32 bit multiply  fr,r,fr
 	C67_NOP(8);		// NOP 8 for worst case
 	break;
    case TOK_SHL:
@ -2308,7 +2282,7 @@ void gen_opi(int op)
 }

 /* generate a floating point operation 'v = t1 op t2' instruction. The
-   two operands are guaranted to have the same floating point type */
+   two operands are guaranteed to have the same floating point type */
 /* XXX: need to use ST1 too */
 void gen_opf(int op)
 {
@ -2320,13 +2294,13 @@ void gen_opf(int op)
 	gv2(RC_FLOAT, RC_FLOAT);	// make sure src2 is on b side

    ft = vtop->type.t;
-    fc = vtop->c.ul;
+    fc = vtop->c.i;
    r = vtop->r;
    fr = vtop[-1].r;


    if ((ft & VT_BTYPE) == VT_LDOUBLE)
-	error("long doubles not supported");
+	tcc_error("long doubles not supported");

    if (op >= TOK_ULT && op <= TOK_GT) {

@ -2341,42 +2315,42 @@ void gen_opf(int op)
 	    else
 		C67_CMPLTSP(r, fr, C67_B2);

-	    C67_invert_test = false;
+	    C67_invert_test = FALSE;
 	} else if (op == TOK_GE) {
 	    if ((ft & VT_BTYPE) == VT_DOUBLE)
 		C67_CMPLTDP(r, fr, C67_B2);
 	    else
 		C67_CMPLTSP(r, fr, C67_B2);

-	    C67_invert_test = true;
+	    C67_invert_test = TRUE;
 	} else if (op == TOK_GT) {
 	    if ((ft & VT_BTYPE) == VT_DOUBLE)
 		C67_CMPGTDP(r, fr, C67_B2);
 	    else
 		C67_CMPGTSP(r, fr, C67_B2);

-	    C67_invert_test = false;
+	    C67_invert_test = FALSE;
 	} else if (op == TOK_LE) {
 	    if ((ft & VT_BTYPE) == VT_DOUBLE)
 		C67_CMPGTDP(r, fr, C67_B2);
 	    else
 		C67_CMPGTSP(r, fr, C67_B2);

-	    C67_invert_test = true;
+	    C67_invert_test = TRUE;
 	} else if (op == TOK_EQ) {
 	    if ((ft & VT_BTYPE) == VT_DOUBLE)
 		C67_CMPEQDP(r, fr, C67_B2);
 	    else
 		C67_CMPEQSP(r, fr, C67_B2);

-	    C67_invert_test = false;
+	    C67_invert_test = FALSE;
 	} else if (op == TOK_NE) {
 	    if ((ft & VT_BTYPE) == VT_DOUBLE)
 		C67_CMPEQDP(r, fr, C67_B2);
 	    else
 		C67_CMPEQSP(r, fr, C67_B2);

-	    C67_invert_test = true;
+	    C67_invert_test = TRUE;
 	} else {
 	    ALWAYS_ASSERT(FALSE);
 	}
@ -2478,7 +2452,7 @@ void gen_cvt_ftoi(int t)
    r = vtop->r;

    if (t != VT_INT)
-	error("long long not supported");
+	tcc_error("long long not supported");
    else {
 	if ((vtop->type.t & VT_BTYPE) == VT_DOUBLE) {
 	    C67_DPTRUNC(r, r);
@ -2545,5 +2519,22 @@ void ggoto(void)
    vtop--;
 }

-/* end of X86 code generator */
+/* Save the stack pointer for a VLA: not available on this target, the call only raises an error */
+ST_FUNC void gen_vla_sp_save(int addr) {
+    tcc_error("variable length arrays unsupported for this target");
+}
+
+/* Restore the SP after a VLA: not available on this target, the call only raises an error */
+ST_FUNC void gen_vla_sp_restore(int addr) {
+    tcc_error("variable length arrays unsupported for this target");
+}
+
+/* Allocate stack space for a VLA: not available on this target, the call only raises an error */
+ST_FUNC void gen_vla_alloc(CType *type, int align) {
+    tcc_error("variable length arrays unsupported for this target");
+}
+
+/* end of C67 code generator */
+/*************************************************************/
+#endif
 /*************************************************************/
--- a/c67-link.c
+++ b/c67-link.c
@ -0,0 +1,131 @@
+#ifdef TARGET_DEFS_ONLY
+
+#define EM_TCC_TARGET EM_C60
+
+/* relocation type for 32 bit data relocation */
+#define R_DATA_32   R_C60_32
+#define R_DATA_PTR  R_C60_32
+#define R_JMP_SLOT  R_C60_JMP_SLOT
+#define R_GLOB_DAT  R_C60_GLOB_DAT
+#define R_COPY      R_C60_COPY
+#define R_RELATIVE  R_C60_RELATIVE
+
+#define R_NUM       R_C60_NUM
+
+#define ELF_START_ADDR 0x00000400
+#define ELF_PAGE_SIZE  0x1000
+
+#define PCRELATIVE_DLLPLT 0
+#define RELOCATE_DLLPLT 0
+
+#else /* !TARGET_DEFS_ONLY */
+
+#include "tcc.h"
+
+/* Classify a C60 relocation type: 1 if it relocates code, 0 if it relocates
+   data.  Any other type is reported through tcc_error; -1 follows it. */
+int code_reloc (int reloc_type)
+{
+    int is_code;
+
+    switch (reloc_type) {
+    case R_C60_PLT32:
+        is_code = 1;
+        break;
+    case R_C60_32: case R_C60LO16: case R_C60HI16: case R_C60_COPY:
+    case R_C60_GOT32: case R_C60_GOTOFF: case R_C60_GOTPC:
+        is_code = 0;
+        break;
+    default:
+        tcc_error ("Unknown relocation type: %d", reloc_type);
+        is_code = -1;
+        break;
+    }
+    return is_code;
+}
+
+/* Tell the linker whether a C60 relocation type needs a GOT and/or PLT
+   entry.  The possible results are described in tcc.h; an unknown type is
+   reported through tcc_error and -1 follows it. */
+int gotplt_entry_type (int reloc_type)
+{
+    int kind;
+
+    switch (reloc_type) {
+    case R_C60_PLT32: case R_C60_GOT32:
+        kind = ALWAYS_GOTPLT_ENTRY;
+        break;
+    case R_C60_GOTOFF: case R_C60_GOTPC:
+        kind = BUILD_GOT_ONLY;
+        break;
+    case R_C60_32: case R_C60LO16: case R_C60HI16: case R_C60_COPY:
+        kind = NO_GOTPLT_ENTRY;
+        break;
+    default:
+        tcc_error ("Unknown relocation type: %d", reloc_type);
+        kind = -1;
+        break;
+    }
+    return kind;
+}
+
+ST_FUNC unsigned create_plt_entry(TCCState *s1, unsigned got_offset, struct sym_attr *attr)
+{   /* stub: none of the arguments is used, every call is reported as an error */
+    tcc_error("C67 got not implemented");
+    return 0; /* value handed back should tcc_error return */
+}
+
+/* relocate the PLT: compute addresses and offsets in the PLT now that final
+   address for PLT and GOT are known (see fill_program_header).
+   Patching PLT entries is not implemented for C67 (create_plt_entry always
+   reports an error), so a PLT section that contains data is reported as an
+   error instead of being walked; an absent or empty PLT is left alone. */
+ST_FUNC void relocate_plt(TCCState *s1)
+{
+    uint8_t *p, *p_end;
+
+    if (!s1->plt)
+      return;
+
+    p = s1->plt->data;
+    p_end = p + s1->plt->data_offset;
+
+    /* XXX: TODO: patch the entries between p and p_end */
+    if (p < p_end)
+        tcc_error("C67 plt relocation not implemented");
+}
+
+void relocate_init(Section *sr) {} /* intentionally empty: sr is not used */
+
+void relocate(TCCState *s1, ElfW_Rel *rel, int type, unsigned char *ptr, addr_t addr, addr_t val)
+{
+    switch(type) { /* type selects how val is merged into the bytes at ptr */
+        case R_C60_32: /* add val to the int already stored at ptr */
+            *(int *)ptr += val;
+            break;
+        case R_C60LO16:
+            {
+                uint32_t orig;
+
+                /* rebuild the value already stored: each int carries a 16-bit
+                   field at bits 7..22; ptr holds the low half, ptr+4 the high half */
+                orig  =   ((*(int *)(ptr  )) >> 7) & 0xffff;
+                orig |=  (((*(int *)(ptr+4)) >> 7) & 0xffff) << 16;
+
+                /* patch both at once - assumes always in pairs Low - High */
+                *(int *) ptr    = (*(int *) ptr    & (~(0xffff << 7)) ) |
+                                   (((val+orig)      & 0xffff) << 7);
+                *(int *)(ptr+4) = (*(int *)(ptr+4) & (~(0xffff << 7)) ) |
+                                  ((((val+orig)>>16) & 0xffff) << 7);
+            }
+            break;
+        case R_C60HI16: /* nothing to do here: the LO16 case above also writes the word at ptr+4 */
+            break;
+        default: /* any other type is only reported on stderr; nothing is patched */
+            fprintf(stderr,"FIXME: handle reloc type %x at %x [%p] to %x\n",
+                    type, (unsigned) addr, ptr, (unsigned) val);
+            break;
+    }
+}
+
+#endif /* !TARGET_DEFS_ONLY */
--- a/coff.h
+++ b/coff.h
@ -22,7 +22,7 @@ struct filehdr {
 /*------------------------------------------------------------------------*/
 #define  F_RELFLG   0x01       /* relocation info stripped from file       */
 #define  F_EXEC     0x02       /* file is executable (no unresolved refs)  */
-#define  F_LNNO     0x04       /* line nunbers stripped from file          */
+#define  F_LNNO     0x04       /* line numbers stripped from file          */
 #define  F_LSYMS    0x08       /* local symbols stripped from file         */
 #define  F_GSP10    0x10       /* 34010 version                            */
 #define  F_GSP20    0x20       /* 34020 version                            */
@ -37,8 +37,8 @@ struct filehdr {
 #define F_BYTE_ORDER (F_LITTLE | F_BIG)
 #define FILHDR  struct filehdr

-//#define FILHSZ  sizeof(FILHDR) 
-#define FILHSZ  22                // above rounds to align on 4 bytes which causes problems 
+/* #define FILHSZ  sizeof(FILHDR)  */
+#define FILHSZ  22                /* above rounds to align on 4 bytes which causes problems */

 #define COFF_C67_MAGIC 0x00c2

@ -150,7 +150,7 @@ struct scnhdr {
 /*------------------------------------------------------------------------*/
 /* Define constants for names of "special" sections                       */
 /*------------------------------------------------------------------------*/
-//#define _TEXT    ".text"
+/* #define _TEXT    ".text" */
 #define _DATA    ".data"
 #define _BSS     ".bss"
 #define _CINIT   ".cinit"
--- a/686
+++ b/686
@ -1,25 +1,22 @@
 #!/bin/sh
 #
 # tcc configure script (c) 2003 Fabrice Bellard
-#
-# set temporary file name
-if test ! -z "$TMPDIR" ; then
-    TMPDIR1="${TMPDIR}"
-elif test ! -z "$TEMPDIR" ; then
-    TMPDIR1="${TEMPDIR}"
-else
-    TMPDIR1="/tmp"
-fi

-TMPC="${TMPDIR1}/tcc-conf-${RANDOM}-$$-${RANDOM}.c"
-TMPO="${TMPDIR1}/tcc-conf-${RANDOM}-$$-${RANDOM}.o"
-TMPE="${TMPDIR1}/tcc-conf-${RANDOM}-$$-${RANDOM}"
-TMPS="${TMPDIR1}/tcc-conf-${RANDOM}-$$-${RANDOM}.S"
-TMPH="${TMPDIR1}/tcc-conf-${RANDOM}-$$-${RANDOM}.h"
+# set temporary file name
+# if test ! -z "$TMPDIR" ; then
+#     TMPDIR1="${TMPDIR}"
+# elif test ! -z "$TEMPDIR" ; then
+#     TMPDIR1="${TEMPDIR}"
+# else
+#     TMPDIR1="/tmp"
+# fi
+#
+# bashism: TMPN="${TMPDIR1}/tcc-conf-${RANDOM}-$$-${RANDOM}.c"
+
+TMPN="./conftest-$$"
+TMPH=$TMPN.h

 # default parameters
-build_cross="no"
-use_libgcc="no"
 prefix=""
 execprefix=""
 bindir=""
@ -27,63 +24,48 @@ libdir=""
 tccdir=""
 includedir=""
 mandir=""
+infodir=""
 sysroot=""
 cross_prefix=""
 cc="gcc"
-host_cc="gcc"
 ar="ar"
 strip="strip"
-cpu=`uname -m`
-case "$cpu" in
-  i386|i486|i586|i686|i86pc|BePC)
-    cpu="x86"
-  ;;
-  x86_64)
-    cpu="x86-64"
-  ;;
-  armv4l)
-    cpu="armv4l"
-  ;;
-  alpha)
-    cpu="alpha"
-  ;;
-  "Power Macintosh"|ppc|ppc64)
-    cpu="powerpc"
-  ;;
-  mips)
-    cpu="mips"
-  ;;
-  s390)
-    cpu="s390"
-  ;;
-  *)
-    cpu="unknown"
-  ;;
-esac
-gprof="no"
 bigendian="no"
 mingw32="no"
 LIBSUF=".a"
 EXESUF=""
+DLLSUF=".so"
+tcc_sysincludepaths=""
+tcc_libpaths=""
+tcc_crtprefix=""
+tcc_elfinterp=""
+triplet=
+tcc_lddir=
+confvars=
+suggest="yes"
+cpu=
+cpuver=
+gcc_major=0
+gcc_minor=0

 # OS specific
-targetos=`uname -s`
+targetos=`uname`
 case $targetos in
-MINGW32*)
-mingw32="yes"
-;;
-DragonFly)
-noldl="yes"
-;;
-OpenBSD)
-noldl="yes"
-;;
-*) ;;
+  Darwin)
+    confvars="$confvars OSX"
+    DLLSUF=".dylib"
+    ;;
+  MINGW*|MSYS*|CYGWIN*)
+    mingw32=yes
+    ;;
+  DragonFly|OpenBSD|FreeBSD|NetBSD)
+    confvars="$confvars ldl=no"
+    ;;
+  *)
+    ;;
 esac

 # find source path
-# XXX: we assume an absolute path is given when launching configure, 
-# except in './configure' case.
 source_path=${0%configure}
 source_path=${source_path%/}
 source_path_used="yes"
@ -93,19 +75,28 @@ if test -z "$source_path" -o "$source_path" = "." ; then
 fi

 for opt do
+  eval opt=\"$opt\"
  case "$opt" in
  --prefix=*) prefix=`echo $opt | cut -d '=' -f 2`
  ;;
  --exec-prefix=*) execprefix=`echo $opt | cut -d '=' -f 2`
  ;;
+  --tccdir=*) tccdir=`echo $opt | cut -d '=' -f 2`
+  ;;
  --bindir=*) bindir=`echo $opt | cut -d '=' -f 2`
  ;;
  --libdir=*) libdir=`echo $opt | cut -d '=' -f 2`
  ;;
  --includedir=*) includedir=`echo $opt | cut -d '=' -f 2`
  ;;
+  --sharedir=*) sharedir=`echo $opt | cut -d '=' -f 2`
+  ;;
  --mandir=*) mandir=`echo $opt | cut -d '=' -f 2`
  ;;
+  --infodir=*) infodir=`echo $opt | cut -d '=' -f 2`
+  ;;
+  --docdir=*) docdir=`echo $opt | cut -d '=' -f 2`
+  ;;
  --sysroot=*) sysroot=`echo $opt | cut -d '=' -f 2`
  ;;
  --source-path=*) source_path=`echo $opt | cut -d '=' -f 2`
@ -114,266 +105,386 @@ for opt do
  ;;
  --cc=*) cc=`echo $opt | cut -d '=' -f 2`
  ;;
+  --ar=*) ar=`echo $opt | cut -d '=' -f 2`
+  ;;
  --extra-cflags=*) CFLAGS="${opt#--extra-cflags=}"
  ;;
  --extra-ldflags=*) LDFLAGS="${opt#--extra-ldflags=}"
  ;;
-  --extra-libs=*) extralibs=${opt#--extra-libs=}
+  --extra-libs=*) extralibs="${opt#--extra-libs=}"
+  ;;
+  --sysincludepaths=*) tcc_sysincludepaths=`echo $opt | cut -d '=' -f 2`
+  ;;
+  --libpaths=*) tcc_libpaths=`echo $opt | cut -d '=' -f 2`
+  ;;
+  --crtprefix=*) tcc_crtprefix=`echo $opt | cut -d '=' -f 2`
+  ;;
+  --elfinterp=*) tcc_elfinterp=`echo $opt | cut -d '=' -f 2`
+  ;;
+  --triplet=*) triplet=`echo $opt | cut -d '=' -f 2`
  ;;
  --cpu=*) cpu=`echo $opt | cut -d '=' -f 2`
  ;;
-  --enable-gprof) gprof="yes"
+  --enable-cross) confvars="$confvars cross"
  ;;
-  --enable-mingw32) mingw32="yes" ; cross_prefix="i386-mingw32-"
-  ;; 
-  --enable-cross) build_cross="yes"
+  --disable-static) confvars="$confvars static=no"
  ;;
-  --with-libgcc) use_libgcc="yes"
+  --enable-static) confvars="$confvars static"
+  ;;
+  --disable-rpath) confvars="$confvars rpath=no"
+  ;;
+  --strip-binaries) confvars="$confvars strip"
+  ;;
+  --with-libgcc) confvars="$confvars libgcc"
+  ;;
+  --with-selinux) confvars="$confvars selinux"
+  ;;
+  --config-mingw32*) mingw32=$(echo "$opt=yes" | cut -d '=' -f 2)
+  ;;
+  --config-*) confvars="$confvars ${opt#--config-}"; suggest="no"
  ;;
  --help|-h) show_help="yes"
  ;;
+  *) echo "configure: WARNING: unrecognized option $opt"
+  ;;
  esac
 done

+# No --cpu given: fall back to $ARCH from the environment, then to the
+# machine name reported by uname.
+if test -z "$cpu" ; then
+    cpu="${ARCH:-$(uname -m)}"
+fi
+
+# Map the various spellings of a CPU name onto the canonical names used by
+# the rest of this script; any name not listed here aborts configure.
+case "$cpu" in
+  x86|i386|i486|i586|i686|i86pc|BePC|i686-AT386)
+    cpu="i386"
+  ;;
+  x86_64|amd64|x86-64)
+    cpu="x86_64"
+  ;;
+  arm*)
+    # Derive the ARM architecture version from the known machine names;
+    # any other arm* name leaves cpuver empty.
+    case "$cpu" in
+      arm|armv4l)
+	cpuver=4
+      ;;
+      armv5tel|armv5tejl)
+	cpuver=5
+      ;;
+      armv6j|armv6l)
+	cpuver=6
+      ;;
+      armv7a|armv7l)
+	cpuver=7
+      ;;
+    esac
+    cpu="arm"
+  ;;
+  aarch64)
+    cpu="aarch64"
+  ;;
+  alpha)
+    cpu="alpha"
+  ;;
+  "Power Macintosh"|ppc|ppc64)
+    cpu="ppc"
+  ;;
+  mips)
+    cpu="mips"
+  ;;
+  s390)
+    cpu="s390"
+  ;;
+  *)
+    # Unrecognized CPU name: report it and stop.
+    echo "Unsupported CPU"
+    exit 1
+  ;;
+esac
+
 # Checking for CFLAGS
 if test -z "$CFLAGS"; then
-    CFLAGS="-O2"
+    CFLAGS="-Wall -g -O2"
+fi
+
+# Apply the platform defaults for everything not set on the command line.
+if test "$mingw32" != "yes" ; then
+    # Unix-like layout below $prefix.
+    prefix="${prefix:-/usr/local}"
+    sharedir="${sharedir:-${prefix}/share}"
+    execprefix="${execprefix:-${prefix}}"
+    libdir="${libdir:-${execprefix}/lib}"
+    bindir="${bindir:-${execprefix}/bin}"
+    docdir="${docdir:-${sharedir}/doc}"
+    mandir="${mandir:-${sharedir}/man}"
+    infodir="${infodir:-${sharedir}/info}"
+    tccdir="${tccdir:-${libdir}/tcc}"
+    includedir="${includedir:-${prefix}/include}"
+else
+    # Windows layout: the directories default to locations below $tccdir.
+    if test "$source_path_used" = "no"; then source_path="."; fi
+    if test "$cc" = gcc && test -z "$LDFLAGS"; then
+      LDFLAGS="-static"
+    fi
+    prefix="${prefix:-C:/Program Files/tcc}"
+    tccdir="${tccdir:-${prefix}}"
+    bindir="${bindir:-${tccdir}}"
+    docdir="${docdir:-${tccdir}/doc}"
+    libdir="${libdir:-${tccdir}/libtcc}"
+    confvars="$confvars WIN32"
+    LIBSUF=".lib"
+    EXESUF=".exe"
+    DLLSUF=".dll"
+fi # mingw32
+
+if test x"$show_help" = "xyes" ; then
+cat << EOF
+Usage: configure [options]
+Options: [defaults in brackets after descriptions]
+
+Standard options:
+  --help                   print this message
+  --prefix=PREFIX          install in PREFIX [$prefix]
+  --exec-prefix=EPREFIX    install architecture-dependent files in EPREFIX
+			   [same as prefix]
+  --bindir=DIR             user executables in DIR [EPREFIX/bin]
+  --libdir=DIR             object code libraries in DIR [EPREFIX/lib]
+  --tccdir=DIR             installation directory [EPREFIX/lib/tcc]
+  --includedir=DIR         C header files in DIR [PREFIX/include]
+  --sharedir=DIR           documentation root DIR [PREFIX/share]
+  --docdir=DIR             documentation in DIR [SHAREDIR/doc/tcc]
+  --mandir=DIR             man documentation in DIR [SHAREDIR/man]
+  --infodir=DIR            info documentation in DIR [SHAREDIR/info]
+
+Advanced options (experts only):
+  --source-path=PATH       path of source code [$source_path]
+  --cross-prefix=PREFIX    use PREFIX for compile tools [$cross_prefix]
+  --sysroot=PREFIX         prepend PREFIX to library/include paths []
+  --cc=CC                  use C compiler CC [$cc]
+  --ar=AR                  create archives using AR [$ar]
+  --extra-cflags=          specify compiler flags [$CFLAGS]
+  --extra-ldflags=         specify linker options []
+  --cpu=CPU                CPU [$cpu]
+  --strip-binaries         strip symbol tables from resulting binaries
+  --disable-static         make libtcc.so instead of libtcc.a
+  --enable-static          make libtcc.a instead of libtcc.dll (win32)
+  --disable-rpath          disable use of -rpath with the above
+  --with-libgcc            use libgcc_s.so.1 instead of libtcc1.a
+  --enable-cross           build cross compilers
+  --with-selinux           use mmap for executable memory (with tcc -run)
+  --sysincludepaths=...    specify system include paths, colon separated
+  --libpaths=...           specify system library paths, colon separated
+  --crtprefix=...          specify locations of crt?.o, colon separated
+  --elfinterp=...          specify elf interpreter
+  --triplet=...            specify system library/include directory triplet
+  --config-uClibc,-musl,-mingw32... enable system specific configurations
+EOF
+#echo "NOTE: The object files are built at the place where configure is launched"
+exit 1
 fi

 cc="${cross_prefix}${cc}"
 ar="${cross_prefix}${ar}"
 strip="${cross_prefix}${strip}"

-if test "$mingw32" = "yes" ; then
-    LIBSUF=".lib"
-    EXESUF=".exe"
-fi
-
 if test -z "$cross_prefix" ; then
+  CONFTEST=./conftest$EXESUF
+  # Build the probe program and query it.  When the compiler cannot build
+  # it, keep the values assigned earlier in this script (gcc_major=0,
+  # gcc_minor=0, bigendian="no") instead of running a binary that does not
+  # exist and overwriting bigendian with an empty string.
+  if ! $cc -o $CONFTEST $source_path/conftest.c 2>/dev/null ; then
+    echo "configure: error: '$cc' failed to compile conftest.c."
+  else
+    gcc_major="$($CONFTEST version)"
+    gcc_minor="$($CONFTEST minor)"
+    bigendian="$($CONFTEST bigendian)"
+  fi
+  if test "$mingw32" = "no" ; then

-# ---
-# big/little endian test
-cat > $TMPC << EOF
-#include <inttypes.h>
-int main(int argc, char ** argv){
-    volatile uint32_t i=0x01234567;
-    return (*((uint8_t*)(&i))) == 0x67;
-}
-EOF
+      if test -z "$triplet"; then
+        tt="$($CONFTEST triplet)"
+        if test -n "$tt" -a -f "/usr/lib/$tt/crti.o" ; then
+          triplet="$tt"
+        fi
+      fi

-if $cc -o $TMPE $TMPC 2>/dev/null ; then
-    $TMPE && bigendian="yes"
+      if test -z "$triplet"; then
+        if test $cpu = "x86_64" -o $cpu = "aarch64" ; then
+          if test -f "/usr/lib64/crti.o" ; then
+            tcc_lddir="lib64"
+          fi
+        fi
+      fi
+
+      if test "$cpu" = "arm" ; then
+	if test "${triplet%eabihf}" != "$triplet" ; then
+	   confvars="$confvars arm_eabihf"
+	elif test "${triplet%eabi}" != "$triplet" ; then
+	   confvars="$confvars arm_eabi"
+	fi
+	if grep -s -q "^Features.* \(vfp\|iwmmxt\) " /proc/cpuinfo ; then
+	   confvars="$confvars arm_vfp"
+	fi
+      fi
+
+      if test "$suggest" = "yes"; then
+        if test -f "/lib/ld-uClibc.so.0" ; then
+          echo "Perhaps you want ./configure --config-uClibc"
+        fi
+        if test -f "/lib/ld-musl-$cpu.so.1"; then
+          echo "Perhaps you want ./configure --config-musl"
+        fi
+      fi
+  fi
 else
-    echo big/little test failed
+  # if cross compiling, cannot launch a program, so make a static guess
+  case $cpu in
+    ppc|mips|s390)  bigendian=yes;;
+  esac
 fi

-else
-
-# if cross compiling, cannot launch a program, so make a static guess
-if test "$cpu" = "powerpc" -o "$cpu" = "mips" -o "$cpu" = "s390" ; then
-    bigendian="yes"
+if test "$bigendian" = "yes" ; then
+  confvars="$confvars BIGENDIAN"
 fi

+# Final configuration tuning: probe which extra warning/codegen options the
+# compiler takes.  Skipped entirely when the compiler name contains "tcc".
+if ! echo "$cc" | grep -q "tcc"; then
+  # OPT1: options to add to CFLAGS as they are.
+  OPT1="-Wdeclaration-after-statement -fno-strict-aliasing"
+  # OPT2: warnings to turn off.  They are probed in their positive -W form
+  # because gcc does not always reject unknown -Wno- options.
+  OPT2="-Wpointer-sign -Wsign-compare -Wunused-result"
+  if echo "$cc" | grep -q "clang"; then
+    OPT1="$OPT1 -fheinous-gnu-extensions"
+    OPT2="$OPT2 -Wstring-plus-int"
+  fi
+  # Compile empty input with all candidate options and capture everything
+  # the compiler prints (stdout and stderr) in cc_msg.txt.
+  $cc $OPT1 $OPT2 -o a.out -c -xc - < /dev/null > cc_msg.txt 2>&1
+  # An option whose name appears in the captured output is left out
+  # (presumably the compiler complained about it).
+  for o in $OPT1; do # enable these options
+    if ! grep -q -- $o cc_msg.txt; then CFLAGS="$CFLAGS $o"; fi
+  done
+  for o in $OPT2; do # disable these options
+    # ${o#-W*} strips the leading "-W", so -Wfoo is added as -Wno-foo.
+    if ! grep -q -- $o cc_msg.txt; then CFLAGS="$CFLAGS -Wno-${o#-W*}"; fi
+  done
+  # cat cc_msg.txt
+  # echo $CFLAGS
+  rm -f cc_msg.txt a.out
 fi

-# check gcc version
-cat > $TMPC <<EOF
-int main(void) {
-#if __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 2)
-return 0;
-#else
-#error gcc < 3.2
-#endif
-}
-EOF
-
-gcc_major="2"
-if $cc -o $TMPO $TMPC 2> /dev/null ; then
-    gcc_major="3"
-fi
-cat > $TMPC <<EOF
-int main(void) {
-#if __GNUC__ >= 4
-return 0;
-#else
-#error gcc < 4
-#endif
-}
-EOF
-
-if $cc -o $TMPO $TMPC 2> /dev/null ; then
-    gcc_major="4"
-fi
-
-if test x"$show_help" = "xyes" ; then
-cat << EOF
-
-Usage: configure [options]
-Options: [defaults in brackets after descriptions]
-
-EOF
-echo "Standard options:"
-echo "  --help                   print this message"
-echo "  --prefix=PREFIX          install in PREFIX [$prefix]"
-echo "  --exec-prefix=EPREFIX    install architecture-dependent files in EPREFIX"
-echo "                           [same as prefix]"
-echo "  --bindir=DIR             user executables in DIR [EPREFIX/bin]"
-echo "  --libdir=DIR             object code libraries in DIR [EPREFIX/lib]"
-echo "  --includedir=DIR         C header files in DIR [PREFIX/include]"
-echo "  --mandir=DIR             man documentation in DIR [PREFIX/man]"
-echo "  --enable-cross           build cross compilers"
-echo ""
-echo "Advanced options (experts only):"
-echo "  --source-path=PATH       path of source code [$source_path]"
-echo "  --cross-prefix=PREFIX    use PREFIX for compile tools [$cross_prefix]"
-echo "  --sysroot=PREFIX         prepend PREFIX to library/include paths []"
-echo "  --cc=CC                  use C compiler CC [$cc]"
-echo "  --with-libgcc            use /lib/libgcc_s.so.1 instead of libtcc1.a"
-echo ""
-#echo "NOTE: The object files are build at the place where configure is launched"
-exit 1
-fi
-
-if test "$mingw32" = "yes" ; then
-    if test -z "$prefix" ; then
-        prefix="C:/Program Files/tcc"
-    fi
-    execprefix="$prefix"
-    bindir="$prefix"
-    tccdir="$prefix"
-    docdir="$prefix/doc"
-else
-    if test -z "$prefix" ; then
-        prefix="/usr/local"
-    fi
-    if test x"$execprefix" = x""; then
-        execprefix="${prefix}"
-    fi
-    if test x"$bindir" = x""; then
-        bindir="${execprefix}/bin"
-    fi
-    if test x"$docdir" = x""; then
-        docdir="$prefix/share/doc/tcc"
-    fi
-fi # mingw32
-
-if test x"$libdir" = x""; then
-libdir="${execprefix}/lib"
-fi
-if test x"$tccdir" = x""; then
-tccdir="${execprefix}/lib/tcc"
-fi
-if test x"$mandir" = x""; then
-mandir="${prefix}/man"
-fi
-if test x"$includedir" = x""; then
-includedir="${prefix}/include"
-fi
-
-echo "Binary  directory   $bindir"
-echo "TinyCC directory    $tccdir"
-echo "Library directory   $libdir"
-echo "Include directory   $includedir"
-echo "Manual directory    $mandir"
-echo "Doc directory       $docdir"
-echo "Target root prefix  $sysroot"
-echo "Source path      $source_path"
-echo "C compiler       $cc"
-echo "CPU              $cpu"
-echo "Big Endian       $bigendian"
-echo "gprof enabled    $gprof"
-echo "cross compilers  $build_cross"
-echo "use libgcc       $use_libgcc"
+# Print label $1 immediately followed by value $2; print nothing when the
+# value is empty.
+fcho() {
+  test -z "$2" || echo "$1$2"
+}

+fcho "Binary directory    " "$bindir"
+fcho "TinyCC directory    " "$tccdir"
+fcho "Library directory   " "$libdir"
+fcho "Include directory   " "$includedir"
+fcho "Manual directory    " "$mandir"
+fcho "Info directory      " "$infodir"
+fcho "Doc directory       " "$docdir"
+fcho "Target root prefix  " "$sysroot"
+echo "Source path         $source_path"
+echo "C compiler          $cc ($gcc_major.$gcc_minor)"
+echo "Target OS           $targetos"
+echo "CPU                 $cpu"
+fcho "Triplet             " "$triplet"
+fcho "Config              " "${confvars# }"
 echo "Creating config.mak and config.h"

-echo "# Automatically generated by configure - do not modify" > config.mak
+cat >config.mak <<EOF
+# Automatically generated by configure - do not modify
+prefix=$prefix
+bindir=\$(DESTDIR)$bindir
+tccdir=\$(DESTDIR)$tccdir
+libdir=\$(DESTDIR)$libdir
+includedir=\$(DESTDIR)$includedir
+mandir=\$(DESTDIR)$mandir
+infodir=\$(DESTDIR)$infodir
+docdir=\$(DESTDIR)$docdir
+CC=$cc
+GCC_MAJOR=$gcc_major
+GCC_MINOR=$gcc_minor
+AR=$ar
+STRIP=$strip -s -R .comment -R .note
+CFLAGS=$CFLAGS
+LDFLAGS=$LDFLAGS
+LIBSUF=$LIBSUF
+EXESUF=$EXESUF
+DLLSUF=$DLLSUF
+EOF
+
+# Append an #ifndef-guarded string definition of macro $1 with value $2 to
+# $TMPH.  Nothing is written when $2 is empty.
+print_inc() {
+  test -n "$2" || return 0
+  {
+    echo "#ifndef $1"
+    echo "# define $1 \"$2\""
+    echo "#endif"
+  } >> $TMPH
+}
+
+# Append a NATIVE_DEFINES entry to config.mak that defines macro $1 as the
+# quoted string $2.  Nothing is written when $2 is empty.
+print_mak() {
+  test -z "$2" && return 0
+  echo "NATIVE_DEFINES+=-D$1=\"\\\"$2\\\"\"" >> config.mak
+}
+
+# Append a NATIVE_DEFINES entry to config.mak that defines macro $1 as the
+# unquoted value $2.  Nothing is written when $2 is empty.
+print_mak_int() {
+  test -z "$2" && return 0
+  echo "NATIVE_DEFINES+=-D$1=$2" >> config.mak
+}
+
 echo "/* Automatically generated by configure - do not modify */" > $TMPH

-echo "prefix=\$(DESTDIR)$prefix" >> config.mak
-echo "bindir=\$(DESTDIR)$bindir" >> config.mak
-echo "tccdir=\$(DESTDIR)$tccdir" >> config.mak
-echo "libdir=\$(DESTDIR)$libdir" >> config.mak
-echo "includedir=\$(DESTDIR)$includedir" >> config.mak
-echo "mandir=\$(DESTDIR)$mandir" >> config.mak
-echo "docdir=\$(DESTDIR)$docdir" >> config.mak
+print_inc CONFIG_SYSROOT "$sysroot"
+print_inc CONFIG_TCCDIR "$tccdir"
+print_mak CONFIG_TCC_SYSINCLUDEPATHS "$tcc_sysincludepaths"
+print_mak CONFIG_TCC_LIBPATHS "$tcc_libpaths"
+print_mak CONFIG_TCC_CRTPREFIX "$tcc_crtprefix"
+print_mak CONFIG_TCC_ELFINTERP "$tcc_elfinterp"
+print_mak CONFIG_LDDIR "$tcc_lddir"
+print_mak CONFIG_TRIPLET "$triplet"
+print_mak_int TCC_CPU_VERSION "$cpuver"

-echo "#define CONFIG_SYSROOT \"$sysroot\"" >> $TMPH
-echo "#define CONFIG_TCCDIR \"$tccdir\"" >> $TMPH
-echo "CC=$cc" >> config.mak
-echo "GCC_MAJOR=$gcc_major" >> config.mak
-echo "#define GCC_MAJOR $gcc_major" >> $TMPH
-echo "HOST_CC=$host_cc" >> config.mak
-echo "AR=$ar" >> config.mak
-echo "STRIP=$strip -s -R .comment -R .note" >> config.mak
-echo "CFLAGS=$CFLAGS" >> config.mak
-echo "LDFLAGS=$LDFLAGS" >> config.mak
-echo "LIBSUF=$LIBSUF" >> config.mak
-echo "EXESUF=$EXESUF" >> config.mak
-if test "$cpu" = "x86" ; then
-  echo "ARCH=i386" >> config.mak
-  echo "#define HOST_I386 1" >> $TMPH
-elif test "$cpu" = "x86-64" ; then
-  echo "ARCH=x86-64" >> config.mak
-  echo "#define HOST_X86_64 1" >> $TMPH
-elif test "$cpu" = "armv4l" ; then
-  echo "ARCH=arm" >> config.mak
-  echo "#define HOST_ARM 1" >> $TMPH
-elif test "$cpu" = "powerpc" ; then
-  echo "ARCH=ppc" >> config.mak
-  echo "#define HOST_PPC 1" >> $TMPH
-elif test "$cpu" = "mips" ; then
-  echo "ARCH=mips" >> config.mak
-  echo "#define HOST_MIPS 1" >> $TMPH
-elif test "$cpu" = "s390" ; then
-  echo "ARCH=s390" >> config.mak
-  echo "#define HOST_S390 1" >> $TMPH
-elif test "$cpu" = "alpha" ; then
-  echo "ARCH=alpha" >> config.mak
-  echo "#define HOST_ALPHA 1" >> $TMPH
+if test "$cpu" = "aarch64" ; then
+  echo "ARCH=arm64" >> config.mak
 else
-  echo "Unsupported CPU"
-  exit 1
-fi
-if test "$noldl" = "yes" ; then
-  echo "CONFIG_NOLDL=yes" >> config.mak
-fi
-if test "$mingw32" = "yes" ; then
-  echo "CONFIG_WIN32=yes" >> config.mak
-  echo "#define CONFIG_WIN32 1" >> $TMPH
-fi
-if test "$bigendian" = "yes" ; then
-  echo "WORDS_BIGENDIAN=yes" >> config.mak
-  echo "#define WORDS_BIGENDIAN 1" >> $TMPH
-fi
-if test "$gprof" = "yes" ; then
-  echo "TARGET_GPROF=yes" >> config.mak
-  echo "#define HAVE_GPROF 1" >> $TMPH
-fi
-if test "$build_cross" = "yes" ; then
-  echo "CONFIG_CROSS=yes" >> config.mak
-fi
-if test "$use_libgcc" = "yes" ; then
-  echo "#define CONFIG_USE_LIBGCC" >> $TMPH
-  echo "CONFIG_USE_LIBGCC=yes" >> config.mak
+  echo "ARCH=$cpu" >> config.mak
 fi
+echo "TARGETOS=$targetos" >> config.mak
+
+# Emit one CONFIG_ line per collected configuration word: "name=value" words
+# are written as given, bare words become "name=yes".
+for v in $confvars ; do
+  case "$v" in
+    *=*) echo "CONFIG_$v" ;;
+    *)   echo "CONFIG_$v=yes" ;;
+  esac >> config.mak
+done
+
 version=`head $source_path/VERSION`
-echo "VERSION=$version" >>config.mak
+echo "VERSION = $version" >> config.mak
 echo "#define TCC_VERSION \"$version\"" >> $TMPH
 echo "@set VERSION $version" > config.texi

-# build tree in object directory if source path is different from current one
 if test "$source_path_used" = "yes" ; then
-    DIRS="tests"
-    FILES="Makefile tests/Makefile"
-    for dir in $DIRS ; do
-        mkdir -p $dir
-    done
-    for f in $FILES ; do
-        ln -sf $source_path/$f $f
-    done
+    case $source_path in
+       /*) echo "TOPSRC=$source_path";;
+	*) echo "TOPSRC=\$(TOP)/$source_path";;
+     esac >>config.mak
+else
+     echo 'TOPSRC=$(TOP)' >>config.mak
 fi
-echo "SRC_PATH=$source_path" >> config.mak

 diff $TMPH config.h >/dev/null 2>&1
 if test $? -ne 0 ; then
@ -382,4 +493,35 @@ else
    echo "config.h is unchanged"
 fi

-rm -f $TMPO $TMPC $TMPE $TMPS $TMPH
+rm -f $TMPN* $CONFTEST
+
+# ---------------------------------------------------------------------------
+# build tree in object directory if source path is different from current one
+
+# fn_makelink SRCDIR FILE
+# Create FILE (relative to the current directory) as a symbolic link to
+# SRCDIR/FILE, creating FILE's parent directory first when it is missing.
+# When SRCDIR is not an absolute path, one "../" is prepended to the link
+# target for every directory component of FILE, so that the link resolves
+# from inside that subdirectory.  If ln fails, the file is copied instead.
+fn_makelink()
+{
+    tgt=$1/$2
+    case $2 in
+    */*) dn=${2%/*}
+	 test -d $dn || mkdir -p $dn
+	 case $1 in
+	 /*) ;;
+	  *) while test $dn ; do
+		tgt=../$tgt; dn=${dn#${dn%%/*}}; dn=${dn#/}
+	     done
+	     ;;
+	 esac
+	 ;;
+    esac
+
+    ln -sfn $tgt $2 || ( echo "ln failed. Using cp instead."; cp -f $1/$2 $2 )
+}
+
+if test "$source_path_used" = "yes" ; then
+  FILES="Makefile lib/Makefile tests/Makefile tests/tests2/Makefile tests/pp/Makefile"
+  for f in $FILES ; do
+    fn_makelink $source_path $f
+  done
+fi
+
+# ---------------------------------------------------------------------------
--- a/conftest.c
+++ b/conftest.c
@ -0,0 +1,87 @@
+#include <stdio.h>
+
+/* Architecture part of the triplet, chosen from the compiler's predefined
+   macros; "unknown" when none of them matches. */
+#if defined(__i386__) || defined _M_IX86
+# define TRIPLET_ARCH "i386"
+#elif defined(__x86_64__) || defined _M_AMD64
+# define TRIPLET_ARCH "x86_64"
+#elif defined(__arm__)
+# define TRIPLET_ARCH "arm"
+#elif defined(__aarch64__)
+# define TRIPLET_ARCH "aarch64"
+#else
+# define TRIPLET_ARCH "unknown"
+#endif
+
+/* OS part of the triplet.  When only __GNU__ is defined, TRIPLET_OS is left
+   undefined; the __GNU__ form of TRIPLET below does not use it. */
+#if defined (__linux__)
+# define TRIPLET_OS "linux"
+#elif defined (__FreeBSD__) || defined (__FreeBSD_kernel__)
+# define TRIPLET_OS "kfreebsd"
+#elif defined _WIN32
+# define TRIPLET_OS "win32"
+#elif !defined (__GNU__)
+# define TRIPLET_OS "unknown"
+#endif
+
+/* Calling convention and ABI part of the triplet: the EABI variants when
+   __ARM_EABI__ is defined (hard-float if __ARM_PCS_VFP is too), otherwise
+   plain "gnu". */
+#if defined (__ARM_EABI__)
+# if defined (__ARM_PCS_VFP)
+#  define TRIPLET_ABI "gnueabihf"
+# else
+#  define TRIPLET_ABI "gnueabi"
+# endif
+#else
+# define TRIPLET_ABI "gnu"
+#endif
+
+/* Assemble the full triplet string by literal concatenation: ARCH-OS on
+   _WIN32, ARCH-ABI on __GNU__, ARCH-OS-ABI everywhere else. */
+#if defined _WIN32
+# define TRIPLET TRIPLET_ARCH "-" TRIPLET_OS
+#elif defined __GNU__
+# define TRIPLET TRIPLET_ARCH "-" TRIPLET_ABI
+#else
+# define TRIPLET TRIPLET_ARCH "-" TRIPLET_OS "-" TRIPLET_ABI
+#endif
+
+#if defined(_WIN32)
+int _CRT_glob = 0;
+#endif
+
+/*
+ * Print one piece of build information, selected by the first character of
+ * the single command-line argument:
+ *   'b'  "no" when the lowest-addressed byte of 0x01234567 is 0x67
+ *        (little-endian), "yes" otherwise
+ *   'v'  __GNUC__ when it is defined, "0" otherwise
+ *   'm'  __GNUC_MINOR__ when __GNUC__ is defined, else __TINYC__ when that
+ *        is defined, "0" otherwise
+ *   't'  the TRIPLET string
+ * Any other invocation (wrong argument count or unknown letter) prints
+ * nothing.  Always returns 0.
+ */
+int main(int argc, char *argv[])
+{
+    switch(argc == 2 ? argv[1][0] : 0) {
+        case 'b':
+        {
+            volatile unsigned foo = 0x01234567;
+            /* Keep the volatile qualifier in the cast: reading a volatile
+               object through a non-volatile lvalue is undefined behavior. */
+            puts(*(volatile unsigned char*)&foo == 0x67 ? "no" : "yes");
+            break;
+        }
+#ifdef __GNUC__
+        case 'm':
+            printf("%d\n", __GNUC_MINOR__);
+            break;
+        case 'v':
+            printf("%d\n", __GNUC__);
+            break;
+#elif defined __TINYC__
+        case 'v':
+            puts("0");
+            break;
+        case 'm':
+            printf("%d\n", __TINYC__);
+            break;
+#else
+        case 'm':
+        case 'v':
+            puts("0");
+            break;
+#endif
+        case 't':
+            puts(TRIPLET);
+            break;
+
+        default:
+            break;
+    }
+    return 0;
+}
--- a/elf.h
+++ b/elf.h
--- a/examples/ex1.c
+++ b/examples/ex1.c
@ -1,4 +1,4 @@
-#! /usr/local/bin/tcc -run
+#!/usr/local/bin/tcc -run
 #include <tcclib.h>

 int main()
--- a/examples/ex3.c
+++ b/examples/ex3.c
@ -1,5 +1,4 @@
-#include <stdlib.h>
-#include <stdio.h>
+#include <tcclib.h>

 int fib(n)
 {
--- a/examples/ex4.c
+++ b/examples/ex4.c
@ -1,4 +1,4 @@
-#!./tcc -run -L/usr/X11R6/lib -lX11
+#!/usr/local/bin/tcc -run -L/usr/X11R6/lib -lX11
 #include <stdlib.h>
 #include <stdio.h>
 #include <X11/Xlib.h>
--- a/i386-asm.c
+++ b/i386-asm.c
--- a/i386-asm.h
+++ b/i386-asm.h
@ -1,12 +1,12 @@
-     DEF_ASM_OP0(pusha, 0x60) /* must be first OP0 */
-     DEF_ASM_OP0(popa, 0x61)
-     DEF_ASM_OP0(clc, 0xf8)
+     DEF_ASM_OP0(clc, 0xf8) /* must be first OP0 */
     DEF_ASM_OP0(cld, 0xfc)
     DEF_ASM_OP0(cli, 0xfa)
     DEF_ASM_OP0(clts, 0x0f06)
     DEF_ASM_OP0(cmc, 0xf5)
     DEF_ASM_OP0(lahf, 0x9f)
     DEF_ASM_OP0(sahf, 0x9e)
+     DEF_ASM_OP0(pusha, 0x60)
+     DEF_ASM_OP0(popa, 0x61)
     DEF_ASM_OP0(pushfl, 0x9c)
     DEF_ASM_OP0(popfl, 0x9d)
     DEF_ASM_OP0(pushf, 0x9c)
@ -33,53 +33,53 @@
     DEF_ASM_OP0(iret, 0xcf)
     DEF_ASM_OP0(rsm, 0x0faa)
     DEF_ASM_OP0(hlt, 0xf4)
-     DEF_ASM_OP0(wait, 0x9b)
     DEF_ASM_OP0(nop, 0x90)
+     DEF_ASM_OP0(pause, 0xf390)
     DEF_ASM_OP0(xlat, 0xd7)

     /* strings */
-ALT(DEF_ASM_OP0L(cmpsb, 0xa6, 0, OPC_BWL))
-ALT(DEF_ASM_OP0L(scmpb, 0xa6, 0, OPC_BWL))
+ALT(DEF_ASM_OP0L(cmpsb, 0xa6, 0, OPC_BWLX))
+ALT(DEF_ASM_OP0L(scmpb, 0xa6, 0, OPC_BWLX))

 ALT(DEF_ASM_OP0L(insb, 0x6c, 0, OPC_BWL))
 ALT(DEF_ASM_OP0L(outsb, 0x6e, 0, OPC_BWL))

-ALT(DEF_ASM_OP0L(lodsb, 0xac, 0, OPC_BWL))
-ALT(DEF_ASM_OP0L(slodb, 0xac, 0, OPC_BWL))
+ALT(DEF_ASM_OP0L(lodsb, 0xac, 0, OPC_BWLX))
+ALT(DEF_ASM_OP0L(slodb, 0xac, 0, OPC_BWLX))

-ALT(DEF_ASM_OP0L(movsb, 0xa4, 0, OPC_BWL))
-ALT(DEF_ASM_OP0L(smovb, 0xa4, 0, OPC_BWL))
+ALT(DEF_ASM_OP0L(movsb, 0xa4, 0, OPC_BWLX))
+ALT(DEF_ASM_OP0L(smovb, 0xa4, 0, OPC_BWLX))

-ALT(DEF_ASM_OP0L(scasb, 0xae, 0, OPC_BWL))
-ALT(DEF_ASM_OP0L(sscab, 0xae, 0, OPC_BWL))
+ALT(DEF_ASM_OP0L(scasb, 0xae, 0, OPC_BWLX))
+ALT(DEF_ASM_OP0L(sscab, 0xae, 0, OPC_BWLX))

-ALT(DEF_ASM_OP0L(stosb, 0xaa, 0, OPC_BWL))
-ALT(DEF_ASM_OP0L(sstob, 0xaa, 0, OPC_BWL))
+ALT(DEF_ASM_OP0L(stosb, 0xaa, 0, OPC_BWLX))
+ALT(DEF_ASM_OP0L(sstob, 0xaa, 0, OPC_BWLX))

     /* bits */
     
-ALT(DEF_ASM_OP2(bsfw, 0x0fbc, 0, OPC_MODRM | OPC_WL, OPT_REGW | OPT_EA, OPT_REGW))
-ALT(DEF_ASM_OP2(bsrw, 0x0fbd, 0, OPC_MODRM | OPC_WL, OPT_REGW | OPT_EA, OPT_REGW))
+ALT(DEF_ASM_OP2(bsfw, 0x0fbc, 0, OPC_MODRM | OPC_WLX, OPT_REGW | OPT_EA, OPT_REGW))
+ALT(DEF_ASM_OP2(bsrw, 0x0fbd, 0, OPC_MODRM | OPC_WLX, OPT_REGW | OPT_EA, OPT_REGW))

-ALT(DEF_ASM_OP2(btw, 0x0fa3, 0, OPC_MODRM | OPC_WL, OPT_REGW, OPT_REGW | OPT_EA))
-ALT(DEF_ASM_OP2(btw, 0x0fba, 4, OPC_MODRM | OPC_WL, OPT_IM8, OPT_REGW | OPT_EA))
+ALT(DEF_ASM_OP2(btw, 0x0fa3, 0, OPC_MODRM | OPC_WLX, OPT_REGW, OPT_REGW | OPT_EA))
+ALT(DEF_ASM_OP2(btw, 0x0fba, 4, OPC_MODRM | OPC_WLX, OPT_IM8, OPT_REGW | OPT_EA))

-ALT(DEF_ASM_OP2(btsw, 0x0fab, 0, OPC_MODRM | OPC_WL, OPT_REGW, OPT_REGW | OPT_EA))
-ALT(DEF_ASM_OP2(btsw, 0x0fba, 5, OPC_MODRM | OPC_WL, OPT_IM8, OPT_REGW | OPT_EA))
+ALT(DEF_ASM_OP2(btsw, 0x0fab, 0, OPC_MODRM | OPC_WLX, OPT_REGW, OPT_REGW | OPT_EA))
+ALT(DEF_ASM_OP2(btsw, 0x0fba, 5, OPC_MODRM | OPC_WLX, OPT_IM8, OPT_REGW | OPT_EA))

-ALT(DEF_ASM_OP2(btrw, 0x0fb3, 0, OPC_MODRM | OPC_WL, OPT_REGW, OPT_REGW | OPT_EA))
-ALT(DEF_ASM_OP2(btrw, 0x0fba, 6, OPC_MODRM | OPC_WL, OPT_IM8, OPT_REGW | OPT_EA))
+ALT(DEF_ASM_OP2(btrw, 0x0fb3, 0, OPC_MODRM | OPC_WLX, OPT_REGW, OPT_REGW | OPT_EA))
+ALT(DEF_ASM_OP2(btrw, 0x0fba, 6, OPC_MODRM | OPC_WLX, OPT_IM8, OPT_REGW | OPT_EA))

-ALT(DEF_ASM_OP2(btcw, 0x0fbb, 0, OPC_MODRM | OPC_WL, OPT_REGW, OPT_REGW | OPT_EA))
-ALT(DEF_ASM_OP2(btcw, 0x0fba, 7, OPC_MODRM | OPC_WL, OPT_IM8, OPT_REGW | OPT_EA))
+ALT(DEF_ASM_OP2(btcw, 0x0fbb, 0, OPC_MODRM | OPC_WLX, OPT_REGW, OPT_REGW | OPT_EA))
+ALT(DEF_ASM_OP2(btcw, 0x0fba, 7, OPC_MODRM | OPC_WLX, OPT_IM8, OPT_REGW | OPT_EA))

     /* prefixes */
+     DEF_ASM_OP0(wait, 0x9b)
+     DEF_ASM_OP0(fwait, 0x9b)
+     DEF_ASM_OP0(aword, 0x67)
     DEF_ASM_OP0(addr16, 0x67)
-     DEF_ASM_OP0(a32, 0x67)
-
+     ALT(DEF_ASM_OP0(word, 0x66))
     DEF_ASM_OP0(data16, 0x66)
-     DEF_ASM_OP0(o32, 0x66)
-
     DEF_ASM_OP0(lock, 0xf0)
     DEF_ASM_OP0(rep, 0xf3)
     DEF_ASM_OP0(repe, 0xf3)
@ -97,43 +97,43 @@ ALT(DEF_ASM_OP2(btcw, 0x0fba, 7, OPC_MODRM | OPC_WL, OPT_IM8, OPT_REGW | OPT_EA)
     DEF_ASM_OP0(ud2, 0x0f0b)

     /* NOTE: we took the same order as gas opcode definition order */
-ALT(DEF_ASM_OP2(movb, 0xa0, 0, OPC_BWL, OPT_ADDR, OPT_EAX))
-ALT(DEF_ASM_OP2(movb, 0xa2, 0, OPC_BWL, OPT_EAX, OPT_ADDR))
-ALT(DEF_ASM_OP2(movb, 0x88, 0, OPC_MODRM | OPC_BWL, OPT_REG, OPT_EA | OPT_REG))
-ALT(DEF_ASM_OP2(movb, 0x8a, 0, OPC_MODRM | OPC_BWL, OPT_EA | OPT_REG, OPT_REG))
-ALT(DEF_ASM_OP2(movb, 0xb0, 0, OPC_REG | OPC_BWL, OPT_IM, OPT_REG))
-ALT(DEF_ASM_OP2(movb, 0xc6, 0, OPC_MODRM | OPC_BWL, OPT_IM, OPT_REG | OPT_EA))
+ALT(DEF_ASM_OP2(movb, 0xa0, 0, OPC_BWLX, OPT_ADDR, OPT_EAX))
+ALT(DEF_ASM_OP2(movb, 0xa2, 0, OPC_BWLX, OPT_EAX, OPT_ADDR))
+ALT(DEF_ASM_OP2(movb, 0x88, 0, OPC_MODRM | OPC_BWLX, OPT_REG, OPT_EA | OPT_REG))
+ALT(DEF_ASM_OP2(movb, 0x8a, 0, OPC_MODRM | OPC_BWLX, OPT_EA | OPT_REG, OPT_REG))
+ALT(DEF_ASM_OP2(movb, 0xb0, 0, OPC_REG | OPC_BWLX, OPT_IM, OPT_REG))
+ALT(DEF_ASM_OP2(movb, 0xc6, 0, OPC_MODRM | OPC_BWLX, OPT_IM, OPT_REG | OPT_EA))

-ALT(DEF_ASM_OP2(movw, 0x8c, 0, OPC_MODRM | OPC_WL, OPT_SEG, OPT_EA | OPT_REG))
-ALT(DEF_ASM_OP2(movw, 0x8e, 0, OPC_MODRM | OPC_WL, OPT_EA | OPT_REG, OPT_SEG))
+ALT(DEF_ASM_OP2(movw, 0x8c, 0, OPC_MODRM | OPC_WLX, OPT_SEG, OPT_EA | OPT_REG))
+ALT(DEF_ASM_OP2(movw, 0x8e, 0, OPC_MODRM | OPC_WLX, OPT_EA | OPT_REG, OPT_SEG))

-ALT(DEF_ASM_OP2(movw, 0x0f20, 0, OPC_MODRM | OPC_WL, OPT_CR, OPT_REG32))
-ALT(DEF_ASM_OP2(movw, 0x0f21, 0, OPC_MODRM | OPC_WL, OPT_DB, OPT_REG32))
-ALT(DEF_ASM_OP2(movw, 0x0f24, 0, OPC_MODRM | OPC_WL, OPT_TR, OPT_REG32))
-ALT(DEF_ASM_OP2(movw, 0x0f22, 0, OPC_MODRM | OPC_WL, OPT_REG32, OPT_CR))
-ALT(DEF_ASM_OP2(movw, 0x0f23, 0, OPC_MODRM | OPC_WL, OPT_REG32, OPT_DB))
-ALT(DEF_ASM_OP2(movw, 0x0f26, 0, OPC_MODRM | OPC_WL, OPT_REG32, OPT_TR))
+ALT(DEF_ASM_OP2(movw, 0x0f20, 0, OPC_MODRM | OPC_WLX, OPT_CR, OPT_REG32))
+ALT(DEF_ASM_OP2(movw, 0x0f21, 0, OPC_MODRM | OPC_WLX, OPT_DB, OPT_REG32))
+ALT(DEF_ASM_OP2(movw, 0x0f24, 0, OPC_MODRM | OPC_WLX, OPT_TR, OPT_REG32))
+ALT(DEF_ASM_OP2(movw, 0x0f22, 0, OPC_MODRM | OPC_WLX, OPT_REG32, OPT_CR))
+ALT(DEF_ASM_OP2(movw, 0x0f23, 0, OPC_MODRM | OPC_WLX, OPT_REG32, OPT_DB))
+ALT(DEF_ASM_OP2(movw, 0x0f26, 0, OPC_MODRM | OPC_WLX, OPT_REG32, OPT_TR))

 ALT(DEF_ASM_OP2(movsbl, 0x0fbe, 0, OPC_MODRM, OPT_REG8 | OPT_EA, OPT_REG32))
-ALT(DEF_ASM_OP2(movsbw, 0x0fbe, 0, OPC_MODRM | OPC_D16, OPT_REG8 | OPT_EA, OPT_REG16))
+ALT(DEF_ASM_OP2(movsbw, 0x660fbe, 0, OPC_MODRM, OPT_REG8 | OPT_EA, OPT_REG16))
 ALT(DEF_ASM_OP2(movswl, 0x0fbf, 0, OPC_MODRM, OPT_REG16 | OPT_EA, OPT_REG32))
-ALT(DEF_ASM_OP2(movzbw, 0x0fb6, 0, OPC_MODRM | OPC_WL, OPT_REG8 | OPT_EA, OPT_REGW))
+ALT(DEF_ASM_OP2(movzbw, 0x0fb6, 0, OPC_MODRM | OPC_WLX, OPT_REG8 | OPT_EA, OPT_REGW))
 ALT(DEF_ASM_OP2(movzwl, 0x0fb7, 0, OPC_MODRM, OPT_REG16 | OPT_EA, OPT_REG32))

-ALT(DEF_ASM_OP1(pushw, 0x50, 0, OPC_REG | OPC_WL, OPT_REGW))
-ALT(DEF_ASM_OP1(pushw, 0xff, 6, OPC_MODRM | OPC_WL, OPT_REGW | OPT_EA))
-ALT(DEF_ASM_OP1(pushw, 0x68, 0, OPC_WL, OPT_IM32))
-ALT(DEF_ASM_OP1(pushw, 0x06, 0, OPC_WL, OPT_SEG))
-    DEF_ASM_OP1(pushb, 0x6a, 0, OPC_B, OPT_IM8S)
+ALT(DEF_ASM_OP1(pushw, 0x50, 0, OPC_REG | OPC_WLX, OPT_REGW))
+ALT(DEF_ASM_OP1(pushw, 0xff, 6, OPC_MODRM | OPC_WLX, OPT_REGW | OPT_EA))
+ALT(DEF_ASM_OP1(pushw, 0x6a, 0, OPC_WLX, OPT_IM8S))
+ALT(DEF_ASM_OP1(pushw, 0x68, 0, OPC_WLX, OPT_IM32))
+ALT(DEF_ASM_OP1(pushw, 0x06, 0, OPC_WLX, OPT_SEG))

-ALT(DEF_ASM_OP1(popw, 0x58, 0, OPC_REG | OPC_WL, OPT_REGW))
-ALT(DEF_ASM_OP1(popw, 0x8f, 0, OPC_MODRM | OPC_WL, OPT_REGW | OPT_EA))
-ALT(DEF_ASM_OP1(popw, 0x07, 0, OPC_WL, OPT_SEG))
+ALT(DEF_ASM_OP1(popw, 0x58, 0, OPC_REG | OPC_WLX, OPT_REGW))
+ALT(DEF_ASM_OP1(popw, 0x8f, 0, OPC_MODRM | OPC_WLX, OPT_REGW | OPT_EA))
+ALT(DEF_ASM_OP1(popw, 0x07, 0, OPC_WLX, OPT_SEG))

-ALT(DEF_ASM_OP2(xchgw, 0x90, 0, OPC_REG | OPC_WL, OPT_REG, OPT_EAX))
-ALT(DEF_ASM_OP2(xchgw, 0x90, 0, OPC_REG | OPC_WL, OPT_EAX, OPT_REG))
-ALT(DEF_ASM_OP2(xchgb, 0x86, 0, OPC_MODRM | OPC_BWL, OPT_REG, OPT_EA | OPT_REG))
-ALT(DEF_ASM_OP2(xchgb, 0x86, 0, OPC_MODRM | OPC_BWL, OPT_EA | OPT_REG, OPT_REG))
+ALT(DEF_ASM_OP2(xchgw, 0x90, 0, OPC_REG | OPC_WLX, OPT_REGW, OPT_EAX))
+ALT(DEF_ASM_OP2(xchgw, 0x90, 0, OPC_REG | OPC_WLX, OPT_EAX, OPT_REGW))
+ALT(DEF_ASM_OP2(xchgb, 0x86, 0, OPC_MODRM | OPC_BWLX, OPT_REG, OPT_EA | OPT_REG))
+ALT(DEF_ASM_OP2(xchgb, 0x86, 0, OPC_MODRM | OPC_BWLX, OPT_EA | OPT_REG, OPT_REG))

 ALT(DEF_ASM_OP2(inb, 0xe4, 0, OPC_BWL, OPT_IM8, OPT_EAX))
 ALT(DEF_ASM_OP1(inb, 0xe4, 0, OPC_BWL, OPT_IM8))
@ -145,7 +145,7 @@ ALT(DEF_ASM_OP1(outb, 0xe6, 0, OPC_BWL, OPT_IM8))
 ALT(DEF_ASM_OP2(outb, 0xee, 0, OPC_BWL, OPT_EAX, OPT_DX))
 ALT(DEF_ASM_OP1(outb, 0xee, 0, OPC_BWL, OPT_DX))

-ALT(DEF_ASM_OP2(leaw, 0x8d, 0, OPC_MODRM | OPC_WL, OPT_EA, OPT_REG))
+ALT(DEF_ASM_OP2(leaw, 0x8d, 0, OPC_MODRM | OPC_WLX, OPT_EA, OPT_REG))

 ALT(DEF_ASM_OP2(les, 0xc4, 0, OPC_MODRM, OPT_EA, OPT_REG32))
 ALT(DEF_ASM_OP2(lds, 0xc5, 0, OPC_MODRM, OPT_EA, OPT_REG32))
@ -154,78 +154,80 @@ ALT(DEF_ASM_OP2(lfs, 0x0fb4, 0, OPC_MODRM, OPT_EA, OPT_REG32))
 ALT(DEF_ASM_OP2(lgs, 0x0fb5, 0, OPC_MODRM, OPT_EA, OPT_REG32))

     /* arith */
-ALT(DEF_ASM_OP2(addb, 0x00, 0, OPC_ARITH | OPC_MODRM | OPC_BWL, OPT_REG, OPT_EA | OPT_REG)) /* XXX: use D bit ? */
-ALT(DEF_ASM_OP2(addb, 0x02, 0, OPC_ARITH | OPC_MODRM | OPC_BWL, OPT_EA | OPT_REG, OPT_REG))
-ALT(DEF_ASM_OP2(addb, 0x04, 0, OPC_ARITH | OPC_BWL, OPT_IM, OPT_EAX))
-ALT(DEF_ASM_OP2(addb, 0x80, 0, OPC_ARITH | OPC_MODRM | OPC_BWL, OPT_IM, OPT_EA | OPT_REG))
-ALT(DEF_ASM_OP2(addw, 0x83, 0, OPC_ARITH | OPC_MODRM | OPC_WL, OPT_IM8S, OPT_EA | OPT_REG))
+ALT(DEF_ASM_OP2(addb, 0x00, 0, OPC_ARITH | OPC_MODRM | OPC_BWLX, OPT_REG, OPT_EA | OPT_REG)) /* XXX: use D bit ? */
+ALT(DEF_ASM_OP2(addb, 0x02, 0, OPC_ARITH | OPC_MODRM | OPC_BWLX, OPT_EA | OPT_REG, OPT_REG))
+ALT(DEF_ASM_OP2(addb, 0x04, 0, OPC_ARITH | OPC_BWLX, OPT_IM, OPT_EAX))
+ALT(DEF_ASM_OP2(addw, 0x83, 0, OPC_ARITH | OPC_MODRM | OPC_WLX, OPT_IM8S, OPT_EA | OPT_REGW))
+ALT(DEF_ASM_OP2(addb, 0x80, 0, OPC_ARITH | OPC_MODRM | OPC_BWLX, OPT_IM, OPT_EA | OPT_REG))

-ALT(DEF_ASM_OP2(testb, 0x84, 0, OPC_MODRM | OPC_BWL, OPT_EA | OPT_REG, OPT_REG))
-ALT(DEF_ASM_OP2(testb, 0x84, 0, OPC_MODRM | OPC_BWL, OPT_REG, OPT_EA | OPT_REG))
-ALT(DEF_ASM_OP2(testb, 0xa8, 0, OPC_BWL, OPT_IM, OPT_EAX))
-ALT(DEF_ASM_OP2(testb, 0xf6, 0, OPC_MODRM | OPC_BWL, OPT_IM, OPT_EA | OPT_REG))
+ALT(DEF_ASM_OP2(testb, 0x84, 0, OPC_MODRM | OPC_BWLX, OPT_REG, OPT_EA | OPT_REG))
+ALT(DEF_ASM_OP2(testb, 0x84, 0, OPC_MODRM | OPC_BWLX, OPT_EA | OPT_REG, OPT_REG))
+ALT(DEF_ASM_OP2(testb, 0xa8, 0, OPC_BWLX, OPT_IM, OPT_EAX))
+ALT(DEF_ASM_OP2(testb, 0xf6, 0, OPC_MODRM | OPC_BWLX, OPT_IM, OPT_EA | OPT_REG))

-ALT(DEF_ASM_OP1(incw, 0x40, 0, OPC_REG | OPC_WL, OPT_REGW))
-ALT(DEF_ASM_OP1(incb, 0xfe, 0, OPC_MODRM | OPC_BWL, OPT_REG | OPT_EA))
-ALT(DEF_ASM_OP1(decw, 0x48, 0, OPC_REG | OPC_WL, OPT_REGW))
-ALT(DEF_ASM_OP1(decb, 0xfe, 1, OPC_MODRM | OPC_BWL, OPT_REG | OPT_EA))
+ALT(DEF_ASM_OP1(incw, 0x40, 0, OPC_REG | OPC_WLX, OPT_REGW))
+ALT(DEF_ASM_OP1(incb, 0xfe, 0, OPC_MODRM | OPC_BWLX, OPT_REG | OPT_EA))
+ALT(DEF_ASM_OP1(decw, 0x48, 0, OPC_REG | OPC_WLX, OPT_REGW))
+ALT(DEF_ASM_OP1(decb, 0xfe, 1, OPC_MODRM | OPC_BWLX, OPT_REG | OPT_EA))

-ALT(DEF_ASM_OP1(notb, 0xf6, 2, OPC_MODRM | OPC_BWL, OPT_REG | OPT_EA))
-ALT(DEF_ASM_OP1(negb, 0xf6, 3, OPC_MODRM | OPC_BWL, OPT_REG | OPT_EA))
+ALT(DEF_ASM_OP1(notb, 0xf6, 2, OPC_MODRM | OPC_BWLX, OPT_REG | OPT_EA))
+ALT(DEF_ASM_OP1(negb, 0xf6, 3, OPC_MODRM | OPC_BWLX, OPT_REG | OPT_EA))

-ALT(DEF_ASM_OP1(mulb, 0xf6, 4, OPC_MODRM | OPC_BWL, OPT_REG | OPT_EA))
-ALT(DEF_ASM_OP1(imulb, 0xf6, 5, OPC_MODRM | OPC_BWL, OPT_REG | OPT_EA))
+ALT(DEF_ASM_OP1(mulb, 0xf6, 4, OPC_MODRM | OPC_BWLX, OPT_REG | OPT_EA))
+ALT(DEF_ASM_OP1(imulb, 0xf6, 5, OPC_MODRM | OPC_BWLX, OPT_REG | OPT_EA))

-ALT(DEF_ASM_OP2(imulw, 0x0faf, 0, OPC_MODRM | OPC_WL, OPT_REG | OPT_EA, OPT_REG))
-ALT(DEF_ASM_OP3(imulw, 0x6b, 0, OPC_MODRM | OPC_WL, OPT_IM8S, OPT_REGW | OPT_EA, OPT_REGW))
-ALT(DEF_ASM_OP2(imulw, 0x6b, 0, OPC_MODRM | OPC_WL, OPT_IM8S, OPT_REGW))
-ALT(DEF_ASM_OP3(imulw, 0x69, 0, OPC_MODRM | OPC_WL, OPT_IMW, OPT_REGW | OPT_EA, OPT_REGW))
-ALT(DEF_ASM_OP2(imulw, 0x69, 0, OPC_MODRM | OPC_WL, OPT_IMW, OPT_REGW))
+ALT(DEF_ASM_OP2(imulw, 0x0faf, 0, OPC_MODRM | OPC_WLX, OPT_REG | OPT_EA, OPT_REG))
+ALT(DEF_ASM_OP3(imulw, 0x6b, 0, OPC_MODRM | OPC_WLX, OPT_IM8S, OPT_REGW | OPT_EA, OPT_REGW))
+ALT(DEF_ASM_OP2(imulw, 0x6b, 0, OPC_MODRM | OPC_WLX, OPT_IM8S, OPT_REGW))
+ALT(DEF_ASM_OP3(imulw, 0x69, 0, OPC_MODRM | OPC_WLX, OPT_IMW, OPT_REGW | OPT_EA, OPT_REGW))
+ALT(DEF_ASM_OP2(imulw, 0x69, 0, OPC_MODRM | OPC_WLX, OPT_IMW, OPT_REGW))

-ALT(DEF_ASM_OP1(divb, 0xf6, 6, OPC_MODRM | OPC_BWL, OPT_REG | OPT_EA))
-ALT(DEF_ASM_OP2(divb, 0xf6, 6, OPC_MODRM | OPC_BWL, OPT_REG | OPT_EA, OPT_EAX))
-ALT(DEF_ASM_OP1(idivb, 0xf6, 7, OPC_MODRM | OPC_BWL, OPT_REG | OPT_EA))
-ALT(DEF_ASM_OP2(idivb, 0xf6, 7, OPC_MODRM | OPC_BWL, OPT_REG | OPT_EA, OPT_EAX))
+ALT(DEF_ASM_OP1(divb, 0xf6, 6, OPC_MODRM | OPC_BWLX, OPT_REG | OPT_EA))
+ALT(DEF_ASM_OP2(divb, 0xf6, 6, OPC_MODRM | OPC_BWLX, OPT_REG | OPT_EA, OPT_EAX))
+ALT(DEF_ASM_OP1(idivb, 0xf6, 7, OPC_MODRM | OPC_BWLX, OPT_REG | OPT_EA))
+ALT(DEF_ASM_OP2(idivb, 0xf6, 7, OPC_MODRM | OPC_BWLX, OPT_REG | OPT_EA, OPT_EAX))

     /* shifts */
-ALT(DEF_ASM_OP2(rolb, 0xc0, 0, OPC_MODRM | OPC_BWL | OPC_SHIFT, OPT_IM8, OPT_EA | OPT_REG))
-ALT(DEF_ASM_OP2(rolb, 0xd2, 0, OPC_MODRM | OPC_BWL | OPC_SHIFT, OPT_CL, OPT_EA | OPT_REG))
-ALT(DEF_ASM_OP1(rolb, 0xd0, 0, OPC_MODRM | OPC_BWL | OPC_SHIFT, OPT_EA | OPT_REG))
+ALT(DEF_ASM_OP2(rolb, 0xc0, 0, OPC_MODRM | OPC_BWLX | OPC_SHIFT, OPT_IM8, OPT_EA | OPT_REG))
+ALT(DEF_ASM_OP2(rolb, 0xd2, 0, OPC_MODRM | OPC_BWLX | OPC_SHIFT, OPT_CL, OPT_EA | OPT_REG))
+ALT(DEF_ASM_OP1(rolb, 0xd0, 0, OPC_MODRM | OPC_BWLX | OPC_SHIFT, OPT_EA | OPT_REG))

-ALT(DEF_ASM_OP3(shldw, 0x0fa4, 0, OPC_MODRM | OPC_WL, OPT_IM8, OPT_REGW, OPT_EA | OPT_REGW))
-ALT(DEF_ASM_OP3(shldw, 0x0fa5, 0, OPC_MODRM | OPC_WL, OPT_CL, OPT_REGW, OPT_EA | OPT_REGW))
-ALT(DEF_ASM_OP2(shldw, 0x0fa5, 0, OPC_MODRM | OPC_WL, OPT_REGW, OPT_EA | OPT_REGW))
-ALT(DEF_ASM_OP3(shrdw, 0x0fac, 0, OPC_MODRM | OPC_WL, OPT_IM8, OPT_REGW, OPT_EA | OPT_REGW))
-ALT(DEF_ASM_OP3(shrdw, 0x0fad, 0, OPC_MODRM | OPC_WL, OPT_CL, OPT_REGW, OPT_EA | OPT_REGW))
-ALT(DEF_ASM_OP2(shrdw, 0x0fad, 0, OPC_MODRM | OPC_WL, OPT_REGW, OPT_EA | OPT_REGW))
+ALT(DEF_ASM_OP3(shldw, 0x0fa4, 0, OPC_MODRM | OPC_WLX, OPT_IM8, OPT_REGW, OPT_EA | OPT_REGW))
+ALT(DEF_ASM_OP3(shldw, 0x0fa5, 0, OPC_MODRM | OPC_WLX, OPT_CL, OPT_REGW, OPT_EA | OPT_REGW))
+ALT(DEF_ASM_OP2(shldw, 0x0fa5, 0, OPC_MODRM | OPC_WLX, OPT_REGW, OPT_EA | OPT_REGW))
+ALT(DEF_ASM_OP3(shrdw, 0x0fac, 0, OPC_MODRM | OPC_WLX, OPT_IM8, OPT_REGW, OPT_EA | OPT_REGW))
+ALT(DEF_ASM_OP3(shrdw, 0x0fad, 0, OPC_MODRM | OPC_WLX, OPT_CL, OPT_REGW, OPT_EA | OPT_REGW))
+ALT(DEF_ASM_OP2(shrdw, 0x0fad, 0, OPC_MODRM | OPC_WLX, OPT_REGW, OPT_EA | OPT_REGW))

 ALT(DEF_ASM_OP1(call, 0xff, 2, OPC_MODRM, OPT_INDIR))
-ALT(DEF_ASM_OP1(call, 0xe8, 0, OPC_JMP, OPT_ADDR))
+ALT(DEF_ASM_OP1(call, 0xe8, 0, 0, OPT_DISP))
 ALT(DEF_ASM_OP1(jmp, 0xff, 4, OPC_MODRM, OPT_INDIR))
-ALT(DEF_ASM_OP1(jmp, 0xeb, 0, OPC_SHORTJMP | OPC_JMP, OPT_ADDR))
-ALT(DEF_ASM_OP1(jmp, 0xff, 0, OPC_JMP | OPC_WL, OPT_REGW))
+ALT(DEF_ASM_OP1(jmp, 0xeb, 0, 0, OPT_DISP8))

 ALT(DEF_ASM_OP2(lcall, 0x9a, 0, 0, OPT_IM16, OPT_IM32))
-ALT(DEF_ASM_OP1(lcall, 0xff, 3, 0, OPT_EA))
+ALT(DEF_ASM_OP1(lcall, 0xff, 3, OPC_MODRM, OPT_EA))
 ALT(DEF_ASM_OP2(ljmp, 0xea, 0, 0, OPT_IM16, OPT_IM32))
-ALT(DEF_ASM_OP1(ljmp, 0xff, 5, 0, OPT_EA))
+ALT(DEF_ASM_OP1(ljmp, 0xff, 5, OPC_MODRM, OPT_EA))

 ALT(DEF_ASM_OP1(int, 0xcd, 0, 0, OPT_IM8))
 ALT(DEF_ASM_OP1(seto, 0x0f90, 0, OPC_MODRM | OPC_TEST, OPT_REG8 | OPT_EA))
+ALT(DEF_ASM_OP1(setob, 0x0f90, 0, OPC_MODRM | OPC_TEST, OPT_REG8 | OPT_EA))
    DEF_ASM_OP2(enter, 0xc8, 0, 0, OPT_IM16, OPT_IM8)
    DEF_ASM_OP0(leave, 0xc9)
    DEF_ASM_OP0(ret, 0xc3)
+    DEF_ASM_OP0(retl,0xc3)
+ALT(DEF_ASM_OP1(retl,0xc2, 0, 0, OPT_IM16))
 ALT(DEF_ASM_OP1(ret, 0xc2, 0, 0, OPT_IM16))
    DEF_ASM_OP0(lret, 0xcb)
 ALT(DEF_ASM_OP1(lret, 0xca, 0, 0, OPT_IM16))

-ALT(DEF_ASM_OP1(jo, 0x70, 0, OPC_SHORTJMP | OPC_JMP | OPC_TEST, OPT_ADDR))
-    DEF_ASM_OP1(loopne, 0xe0, 0, OPC_SHORTJMP, OPT_ADDR)
-    DEF_ASM_OP1(loopnz, 0xe0, 0, OPC_SHORTJMP, OPT_ADDR)
-    DEF_ASM_OP1(loope, 0xe1, 0, OPC_SHORTJMP, OPT_ADDR)
-    DEF_ASM_OP1(loopz, 0xe1, 0, OPC_SHORTJMP, OPT_ADDR)
-    DEF_ASM_OP1(loop, 0xe2, 0, OPC_SHORTJMP, OPT_ADDR)
-    DEF_ASM_OP1(jecxz, 0xe3, 0, OPC_SHORTJMP, OPT_ADDR)
+ALT(DEF_ASM_OP1(jo, 0x70, 0, OPC_TEST, OPT_DISP8))
+    DEF_ASM_OP1(loopne, 0xe0, 0, 0, OPT_DISP8)
+    DEF_ASM_OP1(loopnz, 0xe0, 0, 0, OPT_DISP8)
+    DEF_ASM_OP1(loope, 0xe1, 0, 0, OPT_DISP8)
+    DEF_ASM_OP1(loopz, 0xe1, 0, 0, OPT_DISP8)
+    DEF_ASM_OP1(loop, 0xe2, 0, 0, OPT_DISP8)
+    DEF_ASM_OP1(jecxz, 0xe3, 0, 0, OPT_DISP8)
     
     /* float */
     /* specific fcomp handling */
@ -233,6 +235,8 @@ ALT(DEF_ASM_OP0L(fcomp, 0xd8d9, 0, 0))

 ALT(DEF_ASM_OP1(fadd, 0xd8c0, 0, OPC_FARITH | OPC_REG, OPT_ST))
 ALT(DEF_ASM_OP2(fadd, 0xd8c0, 0, OPC_FARITH | OPC_REG, OPT_ST, OPT_ST0))
+ALT(DEF_ASM_OP2(fadd, 0xdcc0, 0, OPC_FARITH | OPC_REG, OPT_ST0, OPT_ST))
+ALT(DEF_ASM_OP2(fmul, 0xdcc8, 0, OPC_FARITH | OPC_REG, OPT_ST0, OPT_ST))
 ALT(DEF_ASM_OP0L(fadd, 0xdec1, 0, OPC_FARITH))
 ALT(DEF_ASM_OP1(faddp, 0xdec0, 0, OPC_FARITH | OPC_REG, OPT_ST))
 ALT(DEF_ASM_OP2(faddp, 0xdec0, 0, OPC_FARITH | OPC_REG, OPT_ST, OPT_ST0))
@ -275,7 +279,6 @@ ALT(DEF_ASM_OP1(fiadds, 0xde, 0, OPC_FARITH | OPC_MODRM, OPT_EA))
     DEF_ASM_OP0(fninit, 0xdbe3)
     DEF_ASM_OP0(fnclex, 0xdbe2)
     DEF_ASM_OP0(fnop, 0xd9d0)
-     DEF_ASM_OP0(fwait, 0x9b)

    /* fp load */
    DEF_ASM_OP1(fld, 0xd9c0, 0, OPC_REG, OPT_ST)
@ -338,12 +341,12 @@ ALT(DEF_ASM_OP1(fstsw, 0xdd, 7, OPC_MODRM | OPC_FWAIT, OPT_EA ))

    /* segments */
    DEF_ASM_OP2(arpl, 0x63, 0, OPC_MODRM, OPT_REG16, OPT_REG16 | OPT_EA)
-    DEF_ASM_OP2(lar, 0x0f02, 0, OPC_MODRM, OPT_REG32 | OPT_EA, OPT_REG32)
+ALT(DEF_ASM_OP2(larw, 0x0f02, 0, OPC_MODRM | OPC_WLX, OPT_REG | OPT_EA, OPT_REG))
    DEF_ASM_OP1(lgdt, 0x0f01, 2, OPC_MODRM, OPT_EA)
    DEF_ASM_OP1(lidt, 0x0f01, 3, OPC_MODRM, OPT_EA)
    DEF_ASM_OP1(lldt, 0x0f00, 2, OPC_MODRM, OPT_EA | OPT_REG)
    DEF_ASM_OP1(lmsw, 0x0f01, 6, OPC_MODRM, OPT_EA | OPT_REG)
-ALT(DEF_ASM_OP2(lslw, 0x0f03, 0, OPC_MODRM | OPC_WL, OPT_EA | OPT_REG, OPT_REG))
+ALT(DEF_ASM_OP2(lslw, 0x0f03, 0, OPC_MODRM | OPC_WLX, OPT_EA | OPT_REG, OPT_REG))
    DEF_ASM_OP1(ltr, 0x0f00, 3, OPC_MODRM, OPT_EA | OPT_REG)
    DEF_ASM_OP1(sgdt, 0x0f01, 0, OPC_MODRM, OPT_EA)
    DEF_ASM_OP1(sidt, 0x0f01, 1, OPC_MODRM, OPT_EA)
@ -353,31 +356,20 @@ ALT(DEF_ASM_OP2(lslw, 0x0f03, 0, OPC_MODRM | OPC_WL, OPT_EA | OPT_REG, OPT_REG))
    DEF_ASM_OP1(verr, 0x0f00, 4, OPC_MODRM, OPT_REG | OPT_EA)
    DEF_ASM_OP1(verw, 0x0f00, 5, OPC_MODRM, OPT_REG | OPT_EA)

-    /* 386 */
-    DEF_ASM_OP0(loadall386, 0x0f07)
-
    /* 486 */
    DEF_ASM_OP1(bswap, 0x0fc8, 0, OPC_REG, OPT_REG32 )
-ALT(DEF_ASM_OP2(xaddb, 0x0fc0, 0, OPC_MODRM | OPC_BWL, OPT_REG, OPT_REG | OPT_EA ))
-ALT(DEF_ASM_OP2(cmpxchgb, 0x0fb0, 0, OPC_MODRM | OPC_BWL, OPT_REG, OPT_REG | OPT_EA ))
+ALT(DEF_ASM_OP2(xaddb, 0x0fc0, 0, OPC_MODRM | OPC_BWLX, OPT_REG, OPT_REG | OPT_EA ))
+ALT(DEF_ASM_OP2(cmpxchgb, 0x0fb0, 0, OPC_MODRM | OPC_BWLX, OPT_REG, OPT_REG | OPT_EA ))
    DEF_ASM_OP1(invlpg, 0x0f01, 7, OPC_MODRM, OPT_EA )

    DEF_ASM_OP2(boundl, 0x62, 0, OPC_MODRM, OPT_REG32, OPT_EA)
-    DEF_ASM_OP2(boundw, 0x62, 0, OPC_MODRM | OPC_D16, OPT_REG16, OPT_EA)
+    DEF_ASM_OP2(boundw, 0x6662, 0, OPC_MODRM, OPT_REG16, OPT_EA)

    /* pentium */
    DEF_ASM_OP1(cmpxchg8b, 0x0fc7, 1, OPC_MODRM, OPT_EA )
    
    /* pentium pro */
-ALT(DEF_ASM_OP2(cmovo, 0x0f40, 0, OPC_MODRM | OPC_TEST, OPT_REG32 | OPT_EA, OPT_REG32))
-ALT(DEF_ASM_OP2(cmovno, 0x0f41, 0, OPC_MODRM | OPC_TEST, OPT_REG32 | OPT_EA, OPT_REG32))
-ALT(DEF_ASM_OP2(cmovc, 0x0f42, 0, OPC_MODRM | OPC_TEST, OPT_REG32 | OPT_EA, OPT_REG32))
-ALT(DEF_ASM_OP2(cmovnc, 0x0f43, 0, OPC_MODRM | OPC_TEST, OPT_REG32 | OPT_EA, OPT_REG32))
-ALT(DEF_ASM_OP2(cmovz, 0x0f44, 0, OPC_MODRM | OPC_TEST, OPT_REG32 | OPT_EA, OPT_REG32))
-ALT(DEF_ASM_OP2(cmovnz, 0x0f45, 0, OPC_MODRM | OPC_TEST, OPT_REG32 | OPT_EA, OPT_REG32))
-ALT(DEF_ASM_OP2(cmovna, 0x0f46, 0, OPC_MODRM | OPC_TEST, OPT_REG32 | OPT_EA, OPT_REG32))
-ALT(DEF_ASM_OP2(cmova, 0x0f47, 0, OPC_MODRM | OPC_TEST, OPT_REG32 | OPT_EA, OPT_REG32))
-
+ALT(DEF_ASM_OP2(cmovo, 0x0f40, 0, OPC_MODRM | OPC_TEST | OPC_WLX, OPT_REGW | OPT_EA, OPT_REGW))
    DEF_ASM_OP2(fcmovb, 0xdac0, 0, OPC_REG, OPT_ST, OPT_ST0 )
    DEF_ASM_OP2(fcmove, 0xdac8, 0, OPC_REG, OPT_ST, OPT_ST0 )
    DEF_ASM_OP2(fcmovbe, 0xdad0, 0, OPC_REG, OPT_ST, OPT_ST0 )
@ -394,62 +386,91 @@ ALT(DEF_ASM_OP2(cmova, 0x0f47, 0, OPC_MODRM | OPC_TEST, OPT_REG32 | OPT_EA, OPT_

    /* mmx */
    DEF_ASM_OP0(emms, 0x0f77) /* must be last OP0 */
-    DEF_ASM_OP2(movd, 0x0f6e, 0, OPC_MODRM, OPT_EA | OPT_REG32, OPT_MMX )
-ALT(DEF_ASM_OP2(movd, 0x0f7e, 0, OPC_MODRM, OPT_MMX, OPT_EA | OPT_REG32 ))
+    DEF_ASM_OP2(movd, 0x0f6e, 0, OPC_MODRM, OPT_EA | OPT_REG32, OPT_MMXSSE )
    DEF_ASM_OP2(movq, 0x0f6f, 0, OPC_MODRM, OPT_EA | OPT_MMX, OPT_MMX )
+ALT(DEF_ASM_OP2(movd, 0x0f7e, 0, OPC_MODRM, OPT_MMXSSE, OPT_EA | OPT_REG32 ))
 ALT(DEF_ASM_OP2(movq, 0x0f7f, 0, OPC_MODRM, OPT_MMX, OPT_EA | OPT_MMX ))
-    DEF_ASM_OP2(packssdw, 0x0f6b, 0, OPC_MODRM, OPT_EA | OPT_MMX, OPT_MMX )
-    DEF_ASM_OP2(packsswb, 0x0f63, 0, OPC_MODRM, OPT_EA | OPT_MMX, OPT_MMX )
-    DEF_ASM_OP2(packuswb, 0x0f67, 0, OPC_MODRM, OPT_EA | OPT_MMX, OPT_MMX )
-    DEF_ASM_OP2(paddb, 0x0ffc, 0, OPC_MODRM, OPT_EA | OPT_MMX, OPT_MMX )
-    DEF_ASM_OP2(paddw, 0x0ffd, 0, OPC_MODRM, OPT_EA | OPT_MMX, OPT_MMX )
-    DEF_ASM_OP2(paddd, 0x0ffe, 0, OPC_MODRM, OPT_EA | OPT_MMX, OPT_MMX )
-    DEF_ASM_OP2(paddsb, 0x0fec, 0, OPC_MODRM, OPT_EA | OPT_MMX, OPT_MMX )
-    DEF_ASM_OP2(paddsw, 0x0fed, 0, OPC_MODRM, OPT_EA | OPT_MMX, OPT_MMX )
-    DEF_ASM_OP2(paddusb, 0x0fdc, 0, OPC_MODRM, OPT_EA | OPT_MMX, OPT_MMX )
-    DEF_ASM_OP2(paddusw, 0x0fdd, 0, OPC_MODRM, OPT_EA | OPT_MMX, OPT_MMX )
-    DEF_ASM_OP2(pand, 0x0fdb, 0, OPC_MODRM, OPT_EA | OPT_MMX, OPT_MMX )
-    DEF_ASM_OP2(pandn, 0x0fdf, 0, OPC_MODRM, OPT_EA | OPT_MMX, OPT_MMX )
-    DEF_ASM_OP2(pcmpeqb, 0x0f74, 0, OPC_MODRM, OPT_EA | OPT_MMX, OPT_MMX )
-    DEF_ASM_OP2(pcmpeqw, 0x0f75, 0, OPC_MODRM, OPT_EA | OPT_MMX, OPT_MMX )
-    DEF_ASM_OP2(pcmpeqd, 0x0f76, 0, OPC_MODRM, OPT_EA | OPT_MMX, OPT_MMX )
-    DEF_ASM_OP2(pcmpgtb, 0x0f64, 0, OPC_MODRM, OPT_EA | OPT_MMX, OPT_MMX )
-    DEF_ASM_OP2(pcmpgtw, 0x0f65, 0, OPC_MODRM, OPT_EA | OPT_MMX, OPT_MMX )
-    DEF_ASM_OP2(pcmpgtd, 0x0f66, 0, OPC_MODRM, OPT_EA | OPT_MMX, OPT_MMX )
-    DEF_ASM_OP2(pmaddwd, 0x0ff5, 0, OPC_MODRM, OPT_EA | OPT_MMX, OPT_MMX )
-    DEF_ASM_OP2(pmulhw, 0x0fe5, 0, OPC_MODRM, OPT_EA | OPT_MMX, OPT_MMX )
-    DEF_ASM_OP2(pmullw, 0x0fd5, 0, OPC_MODRM, OPT_EA | OPT_MMX, OPT_MMX )
-    DEF_ASM_OP2(por, 0x0feb, 0, OPC_MODRM, OPT_EA | OPT_MMX, OPT_MMX )
-    DEF_ASM_OP2(psllw, 0x0ff1, 0, OPC_MODRM, OPT_EA | OPT_MMX, OPT_MMX )
-ALT(DEF_ASM_OP2(psllw, 0x0f71, 6, OPC_MODRM, OPT_IM8, OPT_MMX ))
-    DEF_ASM_OP2(pslld, 0x0ff2, 0, OPC_MODRM, OPT_EA | OPT_MMX, OPT_MMX )
-ALT(DEF_ASM_OP2(pslld, 0x0f72, 6, OPC_MODRM, OPT_IM8, OPT_MMX ))
-    DEF_ASM_OP2(psllq, 0x0ff3, 0, OPC_MODRM, OPT_EA | OPT_MMX, OPT_MMX )
-ALT(DEF_ASM_OP2(psllq, 0x0f73, 6, OPC_MODRM, OPT_IM8, OPT_MMX ))
-    DEF_ASM_OP2(psraw, 0x0fe1, 0, OPC_MODRM, OPT_EA | OPT_MMX, OPT_MMX )
-ALT(DEF_ASM_OP2(psraw, 0x0f71, 4, OPC_MODRM, OPT_IM8, OPT_MMX ))
-    DEF_ASM_OP2(psrad, 0x0fe2, 0, OPC_MODRM, OPT_EA | OPT_MMX, OPT_MMX )
-ALT(DEF_ASM_OP2(psrad, 0x0f72, 4, OPC_MODRM, OPT_IM8, OPT_MMX ))
-    DEF_ASM_OP2(psrlw, 0x0fd1, 0, OPC_MODRM, OPT_EA | OPT_MMX, OPT_MMX )
-ALT(DEF_ASM_OP2(psrlw, 0x0f71, 2, OPC_MODRM, OPT_IM8, OPT_MMX ))
-    DEF_ASM_OP2(psrld, 0x0fd2, 0, OPC_MODRM, OPT_EA | OPT_MMX, OPT_MMX )
-ALT(DEF_ASM_OP2(psrld, 0x0f72, 2, OPC_MODRM, OPT_IM8, OPT_MMX ))
-    DEF_ASM_OP2(psrlq, 0x0fd3, 0, OPC_MODRM, OPT_EA | OPT_MMX, OPT_MMX )
-ALT(DEF_ASM_OP2(psrlq, 0x0f73, 2, OPC_MODRM, OPT_IM8, OPT_MMX ))
-    DEF_ASM_OP2(psubb, 0x0ff8, 0, OPC_MODRM, OPT_EA | OPT_MMX, OPT_MMX )
-    DEF_ASM_OP2(psubw, 0x0ff9, 0, OPC_MODRM, OPT_EA | OPT_MMX, OPT_MMX )
-    DEF_ASM_OP2(psubd, 0x0ffa, 0, OPC_MODRM, OPT_EA | OPT_MMX, OPT_MMX )
-    DEF_ASM_OP2(psubsb, 0x0fe8, 0, OPC_MODRM, OPT_EA | OPT_MMX, OPT_MMX )
-    DEF_ASM_OP2(psubsw, 0x0fe9, 0, OPC_MODRM, OPT_EA | OPT_MMX, OPT_MMX )
-    DEF_ASM_OP2(psubusb, 0x0fd8, 0, OPC_MODRM, OPT_EA | OPT_MMX, OPT_MMX )
-    DEF_ASM_OP2(psubusw, 0x0fd9, 0, OPC_MODRM, OPT_EA | OPT_MMX, OPT_MMX )
-    DEF_ASM_OP2(punpckhbw, 0x0f68, 0, OPC_MODRM, OPT_EA | OPT_MMX, OPT_MMX )
-    DEF_ASM_OP2(punpckhwd, 0x0f69, 0, OPC_MODRM, OPT_EA | OPT_MMX, OPT_MMX )
-    DEF_ASM_OP2(punpckhdq, 0x0f6a, 0, OPC_MODRM, OPT_EA | OPT_MMX, OPT_MMX )
-    DEF_ASM_OP2(punpcklbw, 0x0f60, 0, OPC_MODRM, OPT_EA | OPT_MMX, OPT_MMX )
-    DEF_ASM_OP2(punpcklwd, 0x0f61, 0, OPC_MODRM, OPT_EA | OPT_MMX, OPT_MMX )
-    DEF_ASM_OP2(punpckldq, 0x0f62, 0, OPC_MODRM, OPT_EA | OPT_MMX, OPT_MMX )
-    DEF_ASM_OP2(pxor, 0x0fef, 0, OPC_MODRM, OPT_EA | OPT_MMX, OPT_MMX )
+ALT(DEF_ASM_OP2(movq, 0x660fd6, 0, OPC_MODRM, OPT_SSE, OPT_EA | OPT_SSE ))
+ALT(DEF_ASM_OP2(movq, 0xf30f7e, 0, OPC_MODRM, OPT_EA | OPT_SSE, OPT_SSE ))
+
+    DEF_ASM_OP2(packssdw, 0x0f6b, 0, OPC_MODRM, OPT_EA | OPT_MMXSSE, OPT_MMXSSE )
+    DEF_ASM_OP2(packsswb, 0x0f63, 0, OPC_MODRM, OPT_EA | OPT_MMXSSE, OPT_MMXSSE )
+    DEF_ASM_OP2(packuswb, 0x0f67, 0, OPC_MODRM, OPT_EA | OPT_MMXSSE, OPT_MMXSSE )
+    DEF_ASM_OP2(paddb, 0x0ffc, 0, OPC_MODRM, OPT_EA | OPT_MMXSSE, OPT_MMXSSE )
+    DEF_ASM_OP2(paddw, 0x0ffd, 0, OPC_MODRM, OPT_EA | OPT_MMXSSE, OPT_MMXSSE )
+    DEF_ASM_OP2(paddd, 0x0ffe, 0, OPC_MODRM, OPT_EA | OPT_MMXSSE, OPT_MMXSSE )
+    DEF_ASM_OP2(paddsb, 0x0fec, 0, OPC_MODRM, OPT_EA | OPT_MMXSSE, OPT_MMXSSE )
+    DEF_ASM_OP2(paddsw, 0x0fed, 0, OPC_MODRM, OPT_EA | OPT_MMXSSE, OPT_MMXSSE )
+    DEF_ASM_OP2(paddusb, 0x0fdc, 0, OPC_MODRM, OPT_EA | OPT_MMXSSE, OPT_MMXSSE )
+    DEF_ASM_OP2(paddusw, 0x0fdd, 0, OPC_MODRM, OPT_EA | OPT_MMXSSE, OPT_MMXSSE )
+    DEF_ASM_OP2(pand, 0x0fdb, 0, OPC_MODRM, OPT_EA | OPT_MMXSSE, OPT_MMXSSE )
+    DEF_ASM_OP2(pandn, 0x0fdf, 0, OPC_MODRM, OPT_EA | OPT_MMXSSE, OPT_MMXSSE )
+    DEF_ASM_OP2(pcmpeqb, 0x0f74, 0, OPC_MODRM, OPT_EA | OPT_MMXSSE, OPT_MMXSSE )
+    DEF_ASM_OP2(pcmpeqw, 0x0f75, 0, OPC_MODRM, OPT_EA | OPT_MMXSSE, OPT_MMXSSE )
+    DEF_ASM_OP2(pcmpeqd, 0x0f76, 0, OPC_MODRM, OPT_EA | OPT_MMXSSE, OPT_MMXSSE )
+    DEF_ASM_OP2(pcmpgtb, 0x0f64, 0, OPC_MODRM, OPT_EA | OPT_MMXSSE, OPT_MMXSSE )
+    DEF_ASM_OP2(pcmpgtw, 0x0f65, 0, OPC_MODRM, OPT_EA | OPT_MMXSSE, OPT_MMXSSE )
+    DEF_ASM_OP2(pcmpgtd, 0x0f66, 0, OPC_MODRM, OPT_EA | OPT_MMXSSE, OPT_MMXSSE )
+    DEF_ASM_OP2(pmaddwd, 0x0ff5, 0, OPC_MODRM, OPT_EA | OPT_MMXSSE, OPT_MMXSSE )
+    DEF_ASM_OP2(pmulhw, 0x0fe5, 0, OPC_MODRM, OPT_EA | OPT_MMXSSE, OPT_MMXSSE )
+    DEF_ASM_OP2(pmullw, 0x0fd5, 0, OPC_MODRM, OPT_EA | OPT_MMXSSE, OPT_MMXSSE )
+    DEF_ASM_OP2(por, 0x0feb, 0, OPC_MODRM, OPT_EA | OPT_MMXSSE, OPT_MMXSSE )
+    DEF_ASM_OP2(psllw, 0x0ff1, 0, OPC_MODRM, OPT_EA | OPT_MMXSSE, OPT_MMXSSE )
+ALT(DEF_ASM_OP2(psllw, 0x0f71, 6, OPC_MODRM, OPT_IM8, OPT_MMXSSE ))
+    DEF_ASM_OP2(pslld, 0x0ff2, 0, OPC_MODRM, OPT_EA | OPT_MMXSSE, OPT_MMXSSE )
+ALT(DEF_ASM_OP2(pslld, 0x0f72, 6, OPC_MODRM, OPT_IM8, OPT_MMXSSE ))
+    DEF_ASM_OP2(psllq, 0x0ff3, 0, OPC_MODRM, OPT_EA | OPT_MMXSSE, OPT_MMXSSE )
+ALT(DEF_ASM_OP2(psllq, 0x0f73, 6, OPC_MODRM, OPT_IM8, OPT_MMXSSE ))
+    DEF_ASM_OP2(psraw, 0x0fe1, 0, OPC_MODRM, OPT_EA | OPT_MMXSSE, OPT_MMXSSE )
+ALT(DEF_ASM_OP2(psraw, 0x0f71, 4, OPC_MODRM, OPT_IM8, OPT_MMXSSE ))
+    DEF_ASM_OP2(psrad, 0x0fe2, 0, OPC_MODRM, OPT_EA | OPT_MMXSSE, OPT_MMXSSE )
+ALT(DEF_ASM_OP2(psrad, 0x0f72, 4, OPC_MODRM, OPT_IM8, OPT_MMXSSE ))
+    DEF_ASM_OP2(psrlw, 0x0fd1, 0, OPC_MODRM, OPT_EA | OPT_MMXSSE, OPT_MMXSSE )
+ALT(DEF_ASM_OP2(psrlw, 0x0f71, 2, OPC_MODRM, OPT_IM8, OPT_MMXSSE ))
+    DEF_ASM_OP2(psrld, 0x0fd2, 0, OPC_MODRM, OPT_EA | OPT_MMXSSE, OPT_MMXSSE )
+ALT(DEF_ASM_OP2(psrld, 0x0f72, 2, OPC_MODRM, OPT_IM8, OPT_MMXSSE ))
+    DEF_ASM_OP2(psrlq, 0x0fd3, 0, OPC_MODRM, OPT_EA | OPT_MMXSSE, OPT_MMXSSE )
+ALT(DEF_ASM_OP2(psrlq, 0x0f73, 2, OPC_MODRM, OPT_IM8, OPT_MMXSSE ))
+    DEF_ASM_OP2(psubb, 0x0ff8, 0, OPC_MODRM, OPT_EA | OPT_MMXSSE, OPT_MMXSSE )
+    DEF_ASM_OP2(psubw, 0x0ff9, 0, OPC_MODRM, OPT_EA | OPT_MMXSSE, OPT_MMXSSE )
+    DEF_ASM_OP2(psubd, 0x0ffa, 0, OPC_MODRM, OPT_EA | OPT_MMXSSE, OPT_MMXSSE )
+    DEF_ASM_OP2(psubsb, 0x0fe8, 0, OPC_MODRM, OPT_EA | OPT_MMXSSE, OPT_MMXSSE )
+    DEF_ASM_OP2(psubsw, 0x0fe9, 0, OPC_MODRM, OPT_EA | OPT_MMXSSE, OPT_MMXSSE )
+    DEF_ASM_OP2(psubusb, 0x0fd8, 0, OPC_MODRM, OPT_EA | OPT_MMXSSE, OPT_MMXSSE )
+    DEF_ASM_OP2(psubusw, 0x0fd9, 0, OPC_MODRM, OPT_EA | OPT_MMXSSE, OPT_MMXSSE )
+    DEF_ASM_OP2(punpckhbw, 0x0f68, 0, OPC_MODRM, OPT_EA | OPT_MMXSSE, OPT_MMXSSE )
+    DEF_ASM_OP2(punpckhwd, 0x0f69, 0, OPC_MODRM, OPT_EA | OPT_MMXSSE, OPT_MMXSSE )
+    DEF_ASM_OP2(punpckhdq, 0x0f6a, 0, OPC_MODRM, OPT_EA | OPT_MMXSSE, OPT_MMXSSE )
+    DEF_ASM_OP2(punpcklbw, 0x0f60, 0, OPC_MODRM, OPT_EA | OPT_MMXSSE, OPT_MMXSSE )
+    DEF_ASM_OP2(punpcklwd, 0x0f61, 0, OPC_MODRM, OPT_EA | OPT_MMXSSE, OPT_MMXSSE )
+    DEF_ASM_OP2(punpckldq, 0x0f62, 0, OPC_MODRM, OPT_EA | OPT_MMXSSE, OPT_MMXSSE )
+    DEF_ASM_OP2(pxor, 0x0fef, 0, OPC_MODRM, OPT_EA | OPT_MMXSSE, OPT_MMXSSE )
+
+    /* sse */
+    DEF_ASM_OP2(movups, 0x0f10, 0, OPC_MODRM, OPT_EA | OPT_REG32, OPT_SSE )
+ALT(DEF_ASM_OP2(movups, 0x0f11, 0, OPC_MODRM, OPT_SSE, OPT_EA | OPT_REG32 ))
+    DEF_ASM_OP2(movaps, 0x0f28, 0, OPC_MODRM, OPT_EA | OPT_REG32, OPT_SSE )
+ALT(DEF_ASM_OP2(movaps, 0x0f29, 0, OPC_MODRM, OPT_SSE, OPT_EA | OPT_REG32 ))
+    DEF_ASM_OP2(movhps, 0x0f16, 0, OPC_MODRM, OPT_EA | OPT_REG32, OPT_SSE )
+ALT(DEF_ASM_OP2(movhps, 0x0f17, 0, OPC_MODRM, OPT_SSE, OPT_EA | OPT_REG32 ))
+    DEF_ASM_OP2(addps, 0x0f58, 0, OPC_MODRM, OPT_EA | OPT_SSE, OPT_SSE )
+    DEF_ASM_OP2(cvtpi2ps, 0x0f2a, 0, OPC_MODRM, OPT_EA | OPT_MMX, OPT_SSE )
+    DEF_ASM_OP2(cvtps2pi, 0x0f2d, 0, OPC_MODRM, OPT_EA | OPT_SSE, OPT_MMX )
+    DEF_ASM_OP2(cvttps2pi, 0x0f2c, 0, OPC_MODRM, OPT_EA | OPT_SSE, OPT_MMX )
+    DEF_ASM_OP2(divps, 0x0f5e, 0, OPC_MODRM, OPT_EA | OPT_SSE, OPT_SSE )
+    DEF_ASM_OP2(maxps, 0x0f5f, 0, OPC_MODRM, OPT_EA | OPT_SSE, OPT_SSE )
+    DEF_ASM_OP2(minps, 0x0f5d, 0, OPC_MODRM, OPT_EA | OPT_SSE, OPT_SSE )
+    DEF_ASM_OP2(mulps, 0x0f59, 0, OPC_MODRM, OPT_EA | OPT_SSE, OPT_SSE )
+    DEF_ASM_OP2(pavgb, 0x0fe0, 0, OPC_MODRM, OPT_EA | OPT_SSE, OPT_SSE )
+    DEF_ASM_OP2(pavgw, 0x0fe3, 0, OPC_MODRM, OPT_EA | OPT_SSE, OPT_SSE )
+    DEF_ASM_OP2(pmaxsw, 0x0fee, 0, OPC_MODRM, OPT_EA | OPT_MMXSSE, OPT_MMXSSE )
+    DEF_ASM_OP2(pmaxub, 0x0fde, 0, OPC_MODRM, OPT_EA | OPT_MMXSSE, OPT_MMXSSE )
+    DEF_ASM_OP2(pminsw, 0x0fea, 0, OPC_MODRM, OPT_EA | OPT_MMXSSE, OPT_MMXSSE )
+    DEF_ASM_OP2(pminub, 0x0fda, 0, OPC_MODRM, OPT_EA | OPT_MMXSSE, OPT_MMXSSE )
+    DEF_ASM_OP2(rcpss, 0xf30f53, 0, OPC_MODRM, OPT_EA | OPT_SSE, OPT_SSE ) /* scalar form needs the F3 prefix; bare 0F 53 is the packed rcpps */
+    DEF_ASM_OP2(rsqrtps, 0x0f52, 0, OPC_MODRM, OPT_EA | OPT_SSE, OPT_SSE )
+    DEF_ASM_OP2(sqrtps, 0x0f51, 0, OPC_MODRM, OPT_EA | OPT_SSE, OPT_SSE )
+    DEF_ASM_OP2(subps, 0x0f5c, 0, OPC_MODRM, OPT_EA | OPT_SSE, OPT_SSE )

 #undef ALT
 #undef DEF_ASM_OP0
--- a/i386-gen.c
+++ b/i386-gen.c
@ -18,8 +18,12 @@
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

+#ifdef TARGET_DEFS_ONLY
+
 /* number of available registers */
-#define NB_REGS             4
+#define NB_REGS         5
+#define NB_ASM_REGS     8
+#define CONFIG_TCC_ASM

 /* a register can belong to several classes. The classes must be
   sorted from more general to more precise (see gv2() code which does
@ -30,6 +34,8 @@
 #define RC_ST0     0x0008 
 #define RC_ECX     0x0010
 #define RC_EDX     0x0020
+#define RC_EBX     0x0040
+
 #define RC_IRET    RC_EAX /* function return: integer register */
 #define RC_LRET    RC_EDX /* function return: second integer register */
 #define RC_FRET    RC_ST0 /* function return: float register */
@ -39,14 +45,9 @@ enum {
    TREG_EAX = 0,
    TREG_ECX,
    TREG_EDX,
+    TREG_EBX,
    TREG_ST0,
-};
-
-const int reg_classes[NB_REGS] = {
-    /* eax */ RC_INT | RC_EAX,
-    /* ecx */ RC_INT | RC_ECX,
-    /* edx */ RC_INT | RC_EDX,
-    /* st0 */ RC_FLOAT | RC_ST0,
+    TREG_ESP = 4
 };

 /* return registers for function */
@ -59,7 +60,7 @@ const int reg_classes[NB_REGS] = {

 /* defined if structures are passed as pointers. Otherwise structures
   are directly pushed on stack. */
-//#define FUNC_STRUCT_PARAM_AS_PTR
+/* #define FUNC_STRUCT_PARAM_AS_PTR */

 /* pointer size, in bytes */
 #define PTR_SIZE 4
@ -71,29 +72,34 @@ const int reg_classes[NB_REGS] = {
 #define MAX_ALIGN     8

 /******************************************************/
-/* ELF defines */
-
-#define EM_TCC_TARGET EM_386
-
-/* relocation type for 32 bit data relocation */
-#define R_DATA_32   R_386_32
-#define R_DATA_PTR  R_386_32
-#define R_JMP_SLOT  R_386_JMP_SLOT
-#define R_COPY      R_386_COPY
-
-#define ELF_START_ADDR 0x08048000
-#define ELF_PAGE_SIZE  0x1000
-
+#else /* ! TARGET_DEFS_ONLY */
 /******************************************************/
+#include "tcc.h"
+
+/* define to 1/0 to [not] have EBX as 4th register */
+#define USE_EBX 0
+
+ST_DATA const int reg_classes[NB_REGS] = { /* register class mask per register, listed in TREG_xxx enum order */
+    /* eax */ RC_INT | RC_EAX,
+    /* ecx */ RC_INT | RC_ECX,
+    /* edx */ RC_INT | RC_EDX,
+    /* ebx */ (RC_INT | RC_EBX) * USE_EBX, /* evaluates to 0 (member of no class) while USE_EBX is 0 */
+    /* st0 */ RC_FLOAT | RC_ST0,
+};

 static unsigned long func_sub_sp_offset;
-static unsigned long func_bound_offset;
 static int func_ret_sub;
+#ifdef CONFIG_TCC_BCHECK
+static addr_t func_bound_offset;
+static unsigned long func_bound_ind;
+#endif

 /* XXX: make it faster ? */
-void g(int c)
+ST_FUNC void g(int c)
 {
    int ind1;
+    if (nocode_wanted)
+        return;
    ind1 = ind + 1;
    if (ind1 > cur_text_section->data_allocated)
        section_realloc(cur_text_section, ind1);
@ -101,7 +107,7 @@ void g(int c)
    ind = ind1;
 }

-void o(unsigned int c)
+ST_FUNC void o(unsigned int c)
 {
    while (c) {
        g(c);
@ -109,7 +115,13 @@ void o(unsigned int c)
    }
 }

-void gen_le32(int c)
+ST_FUNC void gen_le16(int v) /* output v as two g() calls, low-order byte first */
+{
+    g(v);
+    g(v >> 8); /* second byte: bits 8..15 of v */
+}
+
+ST_FUNC void gen_le32(int c)
 {
    g(c);
    g(c >> 8);
@ -118,50 +130,52 @@ void gen_le32(int c)
 }

 /* output a symbol and patch all calls to it */
-void gsym_addr(int t, int a)
+ST_FUNC void gsym_addr(int t, int a) /* t: offset of the first 32-bit slot of a chain (0 ends it); a: offset to resolve to */
 {
-    int n, *ptr;
     while (t) {
-        ptr = (int *)(cur_text_section->data + t);
-        n = *ptr; /* next value */
-        *ptr = a - t - 4;
+        unsigned char *ptr = cur_text_section->data + t; /* slot to patch inside the current text section */
+        uint32_t n = read32le(ptr); /* next value */
+        write32le(ptr, a - t - 4); /* store a relative to the end of this 4-byte slot (t + 4) */
         t = n;
     }
 }

-void gsym(int t)
+ST_FUNC void gsym(int t) /* resolve chain t to the current output offset (ind) */
 {
     gsym_addr(t, ind);
 }

-/* psym is used to put an instruction with a data field which is a
-   reference to a symbol. It is in fact the same as oad ! */
-#define psym oad
-
 /* instruction + 4 bytes data. Return the address of the data */
 static int oad(int c, int s)
 {
-    int ind1;
-
+    int t;
+    if (nocode_wanted)
+        return s; /* nothing is emitted: the operand is handed back instead of an offset */
     o(c);
-    ind1 = ind + 4;
-    if (ind1 > cur_text_section->data_allocated)
-        section_realloc(cur_text_section, ind1);
-    *(int *)(cur_text_section->data + ind) = s;
-    s = ind;
-    ind = ind1;
-    return s;
+    t = ind; /* offset of the 32-bit field that follows the opcode bytes */
+    gen_le32(s); /* emitted byte by byte rather than through an int store */
+    return t;
 }

+/* generate jmp to a label */
+#define gjmp2(instr,lbl) oad(instr,lbl)
+
 /* output constant with relocation if 'r & VT_SYM' is true */
-static void gen_addr32(int r, Sym *sym, int c)
+ST_FUNC void gen_addr32(int r, Sym *sym, int c)
 {
    if (r & VT_SYM)
        greloc(cur_text_section, sym, ind, R_386_32);
    gen_le32(c);
 }

-/* generate a modrm reference. 'op_reg' contains the addtionnal 3
+ST_FUNC void gen_addrpc32(int r, Sym *sym, int c) /* like gen_addr32, but with an R_386_PC32 relocation and c - 4 as the value */
+{
+    if (r & VT_SYM)
+        greloc(cur_text_section, sym, ind, R_386_PC32); /* relocation is placed at the field about to be written */
+    gen_le32(c - 4); /* presumably -4 because PC32 is measured from the end of this 4-byte field - TODO confirm */
+}
+
+/* generate a modrm reference. 'op_reg' contains the additional 3
   opcode bits */
 static void gen_modrm(int op_reg, int r, Sym *sym, int c)
 {
@ -184,55 +198,33 @@ static void gen_modrm(int op_reg, int r, Sym *sym, int c)
    }
 }

-#ifdef TCC_TARGET_PE
-static void mk_pointer(CType *type);
-static void indir(void);
-
-int handle_dllimport(int r, SValue *sv, void (*fn)(int r, SValue *sv))
-{
-    if ((sv->r & (VT_VALMASK|VT_SYM|VT_CONST)) != (VT_SYM|VT_CONST))
-        return 0;
-    if (0 == (sv->sym->type.t & VT_IMPORT))
-        return 0;
-
-    printf("import %d %04x %s\n", r, ind, get_tok_str(sv->sym->v, NULL));
-
-    sv->sym->type.t &= ~VT_IMPORT;
-    ++vtop;
-
-    *vtop = *sv;
-    mk_pointer(&vtop->type);
-    indir();
-    fn(r, vtop);
-
-    --vtop;
-    sv->sym->type.t |= VT_IMPORT;
-    return 1;
-}
-#endif
-
 /* load 'r' from value 'sv' */
-void load(int r, SValue *sv)
+ST_FUNC void load(int r, SValue *sv)
 {
    int v, t, ft, fc, fr;
    SValue v1;

 #ifdef TCC_TARGET_PE
-    if (handle_dllimport(r, sv, load))
-        return;
+    SValue v2;
+    sv = pe_getimport(sv, &v2);
 #endif
+
    fr = sv->r;
-    ft = sv->type.t;
-    fc = sv->c.ul;
+    ft = sv->type.t & ~VT_DEFSIGN;
+    fc = sv->c.i;
+
+    ft &= ~(VT_VOLATILE | VT_CONSTANT);

    v = fr & VT_VALMASK;
    if (fr & VT_LVAL) {
        if (v == VT_LLOCAL) {
            v1.type.t = VT_INT;
            v1.r = VT_LOCAL | VT_LVAL;
-            v1.c.ul = fc;
-            load(r, &v1);
+            v1.c.i = fc;
            fr = r;
+            if (!(reg_classes[fr] & RC_INT))
+                fr = get_reg(RC_INT);
+            load(fr, &v1);
        }
        if ((ft & VT_BTYPE) == VT_FLOAT) {
            o(0xd9); /* flds */
@ -243,7 +235,7 @@ void load(int r, SValue *sv)
        } else if ((ft & VT_BTYPE) == VT_LDOUBLE) {
            o(0xdb); /* fldt */
            r = 5;
-        } else if ((ft & VT_TYPE) == VT_BYTE) {
+        } else if ((ft & VT_TYPE) == VT_BYTE || (ft & VT_TYPE) == VT_BOOL) {
            o(0xbe0f);   /* movsbl */
        } else if ((ft & VT_TYPE) == (VT_BYTE | VT_UNSIGNED)) {
            o(0xb60f);   /* movzbl */
@ -260,8 +252,13 @@ void load(int r, SValue *sv)
            o(0xb8 + r); /* mov $xx, r */
            gen_addr32(fr, sv->sym, fc);
        } else if (v == VT_LOCAL) {
-            o(0x8d); /* lea xxx(%ebp), r */
-            gen_modrm(r, VT_LOCAL, sv->sym, fc);
+            if (fc) {
+                o(0x8d); /* lea xxx(%ebp), r */
+                gen_modrm(r, VT_LOCAL, sv->sym, fc);
+            } else {
+                o(0x89);
+                o(0xe8 + r); /* mov %ebp, r */
+            }
        } else if (v == VT_CMP) {
            oad(0xb8 + r, 0); /* mov $0, r */
            o(0x0f); /* setxx %br */
@ -281,17 +278,19 @@ void load(int r, SValue *sv)
 }

 /* store register 'r' in lvalue 'v' */
-void store(int r, SValue *v)
+ST_FUNC void store(int r, SValue *v)
 {
    int fr, bt, ft, fc;

 #ifdef TCC_TARGET_PE
-    if (handle_dllimport(r, v, store))
-        return;
+    SValue v2;
+    v = pe_getimport(v, &v2);
 #endif
+
    ft = v->type.t;
-    fc = v->c.ul;
+    fc = v->c.i;
    fr = v->r & VT_VALMASK;
+    ft &= ~(VT_VOLATILE | VT_CONSTANT);
    bt = ft & VT_BTYPE;
    /* XXX: incorrect if float reg to reg */
    if (bt == VT_FLOAT) {
@ -331,37 +330,94 @@ static void gadd_sp(int val)
    }
 }

+#if defined CONFIG_TCC_BCHECK || defined TCC_TARGET_PE
+static void gen_static_call(int v)
+{
+    Sym *sym;
+
+    sym = external_global_sym(v, &func_old_type, 0);
+    oad(0xe8, -4);
+    greloc(cur_text_section, sym, ind-4, R_386_PC32);
+}
+#endif
+
 /* 'is_jmp' is '1' if it is a jump */
 static void gcall_or_jmp(int is_jmp)
 {
    int r;
-    if ((vtop->r & (VT_VALMASK | VT_LVAL)) == VT_CONST) {
-        /* constant case */
-        if (vtop->r & VT_SYM) {
-            /* relocation case */
-            greloc(cur_text_section, vtop->sym, 
-                   ind + 1, R_386_PC32);
-        } else {
-            /* put an empty PC32 relocation */
-            put_elf_reloc(symtab_section, cur_text_section, 
-                          ind + 1, R_386_PC32, 0);
-        }
-        oad(0xe8 + is_jmp, vtop->c.ul - 4); /* call/jmp im */
+    if ((vtop->r & (VT_VALMASK | VT_LVAL)) == VT_CONST && (vtop->r & VT_SYM)) {
+        /* constant and relocation case */
+        greloc(cur_text_section, vtop->sym, ind + 1, R_386_PC32);
+        oad(0xe8 + is_jmp, vtop->c.i - 4); /* call/jmp im */
    } else {
        /* otherwise, indirect call */
        r = gv(RC_INT);
        o(0xff); /* call/jmp *r */
        o(0xd0 + r + (is_jmp << 4));
    }
+    if (!is_jmp) {
+        int rt;
+        /* extend the return value to the whole register if necessary
+           visual studio and gcc do not always set the whole eax register
+           when assigning the return value of a function  */
+        rt = vtop->type.ref->type.t;
+        switch (rt & VT_BTYPE) {
+            case VT_BYTE:
+                if (rt & VT_UNSIGNED) {
+                    o(0xc0b60f); /* movzx %al, %eax */
+                }
+                else {
+                    o(0xc0be0f); /* movsx %al, %eax */
+                }
+                break;
+            case VT_SHORT:
+                if (rt & VT_UNSIGNED) {
+                    o(0xc0b70f); /* movzx %ax, %eax */
+                }
+                else {
+                    o(0xc0bf0f); /* movsx %ax, %eax */
+                }
+                break;
+            default:
+                break;
+        }
+    }
 }

 static uint8_t fastcall_regs[3] = { TREG_EAX, TREG_EDX, TREG_ECX };
 static uint8_t fastcallw_regs[2] = { TREG_ECX, TREG_EDX };

+/* Return the number of registers needed to return the struct, or 0 if
+   returning via struct pointer. */
+ST_FUNC int gfunc_sret(CType *vt, int variadic, CType *ret, int *ret_align, int *regsize)
+{
+#ifdef TCC_TARGET_PE
+    int size, align;
+    *ret_align = 1; // Never have to re-align return values for x86
+    *regsize = 4;
+    size = type_size(vt, &align);
+    if (size > 8 || (size & (size - 1)))
+        return 0;
+    if (size == 8)
+        ret->t = VT_LLONG;
+    else if (size == 4)
+        ret->t = VT_INT;
+    else if (size == 2)
+        ret->t = VT_SHORT;
+    else
+        ret->t = VT_BYTE;
+    ret->ref = NULL;
+    return 1;
+#else
+    *ret_align = 1; // Never have to re-align return values for x86
+    return 0;
+#endif
+}
+
 /* Generate function call. The function address is pushed first, then
   all the parameters in call order. This functions pops all the
   parameters and the function address. */
-void gfunc_call(int nb_args)
+ST_FUNC void gfunc_call(int nb_args)
 {
    int size, align, r, args_size, i, func_call;
    Sym *func_sym;
@ -415,7 +471,7 @@ void gfunc_call(int nb_args)
    }
    save_regs(0); /* save used temporary registers */
    func_sym = vtop->type.ref;
-    func_call = FUNC_CALL(func_sym->r);
+    func_call = func_sym->f.func_call;
    /* fast call case */
    if ((func_call >= FUNC_FASTCALL1 && func_call <= FUNC_FASTCALL3) ||
        func_call == FUNC_FASTCALLW) {
@ -436,25 +492,25 @@ void gfunc_call(int nb_args)
            args_size -= 4;
        }
    }
-    gcall_or_jmp(0);
-
-#ifdef TCC_TARGET_PE
-    if ((func_sym->type.t & VT_BTYPE) == VT_STRUCT)
+#ifndef TCC_TARGET_PE
+    else if ((vtop->type.ref->type.t & VT_BTYPE) == VT_STRUCT)
        args_size -= 4;
 #endif
-    if (args_size && func_call != FUNC_STDCALL)
+    gcall_or_jmp(0);
+
+    if (args_size && func_call != FUNC_STDCALL && func_call != FUNC_FASTCALLW)
        gadd_sp(args_size);
    vtop--;
 }

 #ifdef TCC_TARGET_PE
-#define FUNC_PROLOG_SIZE 10
+#define FUNC_PROLOG_SIZE (10 + USE_EBX)
 #else
-#define FUNC_PROLOG_SIZE 9
+#define FUNC_PROLOG_SIZE (9 + USE_EBX)
 #endif

 /* generate function prolog of type 't' */
-void gfunc_prolog(CType *func_type)
+ST_FUNC void gfunc_prolog(CType *func_type)
 {
    int addr, align, size, func_call, fastcall_nb_regs;
    int param_index, param_addr;
@ -463,7 +519,7 @@ void gfunc_prolog(CType *func_type)
    CType *type;

    sym = func_type->ref;
-    func_call = FUNC_CALL(sym->r);
+    func_call = sym->f.func_call;
    addr = 8;
    loc = 0;
    func_vc = 0;
@ -485,7 +541,14 @@ void gfunc_prolog(CType *func_type)
    /* if the function returns a structure, then add an
       implicit pointer parameter */
    func_vt = sym->type;
+    func_var = (sym->f.func_type == FUNC_ELLIPSIS);
+#ifdef TCC_TARGET_PE
+    size = type_size(&func_vt,&align);
+    if (((func_vt.t & VT_BTYPE) == VT_STRUCT)
+        && (size > 8 || (size & (size - 1)))) {
+#else
    if ((func_vt.t & VT_BTYPE) == VT_STRUCT) {
+#endif
        /* XXX: fastcall case ? */
        func_vc = addr;
        addr += 4;
@ -517,61 +580,69 @@ void gfunc_prolog(CType *func_type)
        param_index++;
    }
    func_ret_sub = 0;
-    /* pascal type call ? */
-    if (func_call == FUNC_STDCALL)
+    /* pascal type call or fastcall ? */
+    if (func_call == FUNC_STDCALL || func_call == FUNC_FASTCALLW)
        func_ret_sub = addr - 8;
-#ifdef TCC_TARGET_PE
+#ifndef TCC_TARGET_PE
    else if (func_vc)
        func_ret_sub = 4;
 #endif

+#ifdef CONFIG_TCC_BCHECK
    /* leave some room for bound checking code */
    if (tcc_state->do_bounds_check) {
+        func_bound_offset = lbounds_section->data_offset;
+        func_bound_ind = ind;
        oad(0xb8, 0); /* lbound section pointer */
        oad(0xb8, 0); /* call to function */
-        func_bound_offset = lbounds_section->data_offset;
    }
+#endif
 }

 /* generate function epilog */
-void gfunc_epilog(void)
+ST_FUNC void gfunc_epilog(void)
 {
-    int v, saved_ind;
+    addr_t v, saved_ind;

 #ifdef CONFIG_TCC_BCHECK
    if (tcc_state->do_bounds_check
     && func_bound_offset != lbounds_section->data_offset) {
-        int saved_ind;
-        int *bounds_ptr;
-        Sym *sym, *sym_data;
+        addr_t saved_ind;
+        addr_t *bounds_ptr;
+        Sym *sym_data;
+
        /* add end of table info */
-        bounds_ptr = section_ptr_add(lbounds_section, sizeof(int));
+        bounds_ptr = section_ptr_add(lbounds_section, sizeof(addr_t));
        *bounds_ptr = 0;
+
        /* generate bound local allocation */
        saved_ind = ind;
-        ind = func_sub_sp_offset;
+        ind = func_bound_ind;
        sym_data = get_sym_ref(&char_pointer_type, lbounds_section, 
                               func_bound_offset, lbounds_section->data_offset);
        greloc(cur_text_section, sym_data,
               ind + 1, R_386_32);
        oad(0xb8, 0); /* mov %eax, xxx */
-        sym = external_global_sym(TOK___bound_local_new, &func_old_type, 0);
-        greloc(cur_text_section, sym, 
-               ind + 1, R_386_PC32);
-        oad(0xe8, -4);
+        gen_static_call(TOK___bound_local_new);
        ind = saved_ind;
+
        /* generate bound check local freeing */
        o(0x5250); /* save returned value, if any */
-        greloc(cur_text_section, sym_data,
-               ind + 1, R_386_32);
+        greloc(cur_text_section, sym_data, ind + 1, R_386_32);
        oad(0xb8, 0); /* mov %eax, xxx */
-        sym = external_global_sym(TOK___bound_local_delete, &func_old_type, 0);
-        greloc(cur_text_section, sym, 
-               ind + 1, R_386_PC32);
-        oad(0xe8, -4);
+        gen_static_call(TOK___bound_local_delete);
        o(0x585a); /* restore returned value, if any */
    }
 #endif
+
+    /* align local size to word & save local variables */
+    v = (-loc + 3) & -4;
+
+#if USE_EBX
+    o(0x8b);
+    gen_modrm(TREG_EBX, VT_LOCAL, NULL, -(v+4));
+#endif
+
    o(0xc9); /* leave */
    if (func_ret_sub == 0) {
        o(0xc3); /* ret */
@ -580,38 +651,34 @@ void gfunc_epilog(void)
        g(func_ret_sub);
        g(func_ret_sub >> 8);
    }
-    /* align local size to word & save local variables */
-    
-    v = (-loc + 3) & -4; 
    saved_ind = ind;
    ind = func_sub_sp_offset - FUNC_PROLOG_SIZE;
 #ifdef TCC_TARGET_PE
    if (v >= 4096) {
-        Sym *sym = external_global_sym(TOK___chkstk, &func_old_type, 0);
        oad(0xb8, v); /* mov stacksize, %eax */
-        oad(0xe8, -4); /* call __chkstk, (does the stackframe too) */
-        greloc(cur_text_section, sym, ind-4, R_386_PC32);
+        gen_static_call(TOK___chkstk); /* call __chkstk, (does the stackframe too) */
    } else
 #endif
    {
        o(0xe58955);  /* push %ebp, mov %esp, %ebp */
        o(0xec81);  /* sub esp, stacksize */
        gen_le32(v);
-#if FUNC_PROLOG_SIZE == 10
+#ifdef TCC_TARGET_PE
        o(0x90);  /* adjust to FUNC_PROLOG_SIZE */
 #endif
    }
+    o(0x53 * USE_EBX); /* push ebx */
    ind = saved_ind;
 }

 /* generate a jump to a label */
-int gjmp(int t)
+ST_FUNC int gjmp(int t)
 {
-    return psym(0xe9, t);
+    return gjmp2(0xe9, t);
 }

 /* generate a jump to a fixed address */
-void gjmp_addr(int a)
+ST_FUNC void gjmp_addr(int a)
 {
    int r;
    r = a - ind - 2;
@ -623,53 +690,64 @@ void gjmp_addr(int a)
    }
 }

-/* generate a test. set 'inv' to invert test. Stack entry is popped */
-int gtst(int inv, int t)
+ST_FUNC void gtst_addr(int inv, int a)
 {
-    int v, *p;
-
-    v = vtop->r & VT_VALMASK;
+    int v = vtop->r & VT_VALMASK;
    if (v == VT_CMP) {
+	inv ^= (vtop--)->c.i;
+	a -= ind + 2;
+	if (a == (char)a) {
+	    g(inv - 32);
+	    g(a);
+	} else {
+	    g(0x0f);
+	    oad(inv - 16, a - 4);
+	}
+    } else if ((v & ~1) == VT_JMP) {
+	if ((v & 1) != inv) {
+	    gjmp_addr(a);
+	    gsym(vtop->c.i);
+	} else {
+	    gsym(vtop->c.i);
+	    o(0x05eb);
+	    gjmp_addr(a);
+	}
+	vtop--;
+    }
+}
+
+/* generate a test. set 'inv' to invert test. Stack entry is popped */
+ST_FUNC int gtst(int inv, int t)
+{
+    int v = vtop->r & VT_VALMASK;
+    if (nocode_wanted) {
+        ;
+    } else if (v == VT_CMP) {
        /* fast case : can jump directly since flags are set */
        g(0x0f);
-        t = psym((vtop->c.i - 16) ^ inv, t);
+        t = gjmp2((vtop->c.i - 16) ^ inv, t);
    } else if (v == VT_JMP || v == VT_JMPI) {
        /* && or || optimization */
        if ((v & 1) == inv) {
            /* insert vtop->c jump list in t */
-            p = &vtop->c.i;
-            while (*p != 0)
-                p = (int *)(cur_text_section->data + *p);
-            *p = t;
-            t = vtop->c.i;
+            uint32_t n1, n = vtop->c.i;
+            if (n) {
+                while ((n1 = read32le(cur_text_section->data + n)))
+                    n = n1;
+                write32le(cur_text_section->data + n, t);
+                t = vtop->c.i;
+            }
        } else {
            t = gjmp(t);
            gsym(vtop->c.i);
        }
-    } else {
-        if (is_float(vtop->type.t) || 
-            (vtop->type.t & VT_BTYPE) == VT_LLONG) {
-            vpushi(0);
-            gen_op(TOK_NE);
-        }
-        if ((vtop->r & (VT_VALMASK | VT_LVAL | VT_SYM)) == VT_CONST) {
-            /* constant jmp optimization */
-            if ((vtop->c.i != 0) != inv) 
-                t = gjmp(t);
-        } else {
-            v = gv(RC_INT);
-            o(0x85);
-            o(0xc0 + v * 9);
-            g(0x0f);
-            t = psym(0x85 ^ inv, t);
-        }
    }
    vtop--;
    return t;
 }

 /* generate an integer binary operation */
-void gen_opi(int op)
+ST_FUNC void gen_opi(int op)
 {
    int r, fr, opc, c;

@ -685,10 +763,16 @@ void gen_opi(int op)
            vswap();
            c = vtop->c.i;
            if (c == (char)c) {
-                /* XXX: generate inc and dec for smaller code ? */
-                o(0x83);
-                o(0xc0 | (opc << 3) | r);
-                g(c);
+                /* generate inc and dec for smaller code */
+                if (c==1 && opc==0 && op != TOK_ADDC1) {
+                    o (0x40 | r); // inc
+                } else if (c==1 && opc==5 && op != TOK_SUBC1) {
+                    o (0x48 | r); // dec
+                } else {
+                    o(0x83);
+                    o(0xc0 | (opc << 3) | r);
+                    g(c);
+                }
            } else {
                o(0x81);
                oad(0xc0 | (opc << 3) | r, c);
@ -774,6 +858,8 @@ void gen_opi(int op)
        fr = vtop[0].r;
        vtop--;
        save_reg(TREG_EDX);
+        /* save EAX too if used otherwise */
+        save_reg_upstack(TREG_EAX, 1);
        if (op == TOK_UMULL) {
            o(0xf7); /* mul fr */
            o(0xe0 + fr);
@ -801,9 +887,9 @@ void gen_opi(int op)
 }

 /* generate a floating point operation 'v = t1 op t2' instruction. The
-   two operands are guaranted to have the same floating point type */
+   two operands are guaranteed to have the same floating point type */
 /* XXX: need to use ST1 too */
-void gen_opf(int op)
+ST_FUNC void gen_opf(int op)
 {
    int a, ft, fc, swapped, r;

@ -840,7 +926,10 @@ void gen_opf(int op)
            swapped = 0;
        if (swapped)
            o(0xc9d9); /* fxch %st(1) */
-        o(0xe9da); /* fucompp */
+        if (op == TOK_EQ || op == TOK_NE)
+            o(0xe9da); /* fucompp */
+        else
+            o(0xd9de); /* fcompp */
        o(0xe0df); /* fnstsw %ax */
        if (op == TOK_EQ) {
            o(0x45e480); /* and $0x45, %ah */
@ -886,7 +975,7 @@ void gen_opf(int op)
            break;
        }
        ft = vtop->type.t;
-        fc = vtop->c.ul;
+        fc = vtop->c.i;
        if ((ft & VT_BTYPE) == VT_LDOUBLE) {
            o(0xde); /* fxxxp %st, %st(1) */
            o(0xc1 + (a << 3));
@ -898,7 +987,7 @@ void gen_opf(int op)
                r = get_reg(RC_INT);
                v1.type.t = VT_INT;
                v1.r = VT_LOCAL | VT_LVAL;
-                v1.c.ul = fc;
+                v1.c.i = fc;
                load(r, &v1);
                fc = 0;
            }
@ -915,7 +1004,7 @@ void gen_opf(int op)

 /* convert integers to fp 't' type. Must handle 'int', 'unsigned int'
   and 'long long' cases. */
-void gen_cvt_itof(int t)
+ST_FUNC void gen_cvt_itof(int t)
 {
    save_reg(TREG_ST0);
    gv(RC_INT);
@ -944,65 +1033,31 @@ void gen_cvt_itof(int t)
 }

 /* convert fp to int 't' type */
-/* XXX: handle long long case */
-void gen_cvt_ftoi(int t)
+ST_FUNC void gen_cvt_ftoi(int t)
 {
-    int r, r2, size;
-    Sym *sym;
-    CType ushort_type;
-
-    ushort_type.t = VT_SHORT | VT_UNSIGNED;
-
-    gv(RC_FLOAT);
-    if (t != VT_INT)
-        size = 8;
-    else 
-        size = 4;
-    
-    o(0x2dd9); /* ldcw xxx */
-    sym = external_global_sym(TOK___tcc_int_fpu_control, 
-                              &ushort_type, VT_LVAL);
-    greloc(cur_text_section, sym, 
-           ind, R_386_32);
-    gen_le32(0);
-    
-    oad(0xec81, size); /* sub $xxx, %esp */
-    if (size == 4)
-        o(0x1cdb); /* fistpl */
+    int bt = vtop->type.t & VT_BTYPE;
+    if (bt == VT_FLOAT)
+        vpush_global_sym(&func_old_type, TOK___fixsfdi);
+    else if (bt == VT_LDOUBLE)
+        vpush_global_sym(&func_old_type, TOK___fixxfdi);
    else
-        o(0x3cdf); /* fistpll */
-    o(0x24);
-    o(0x2dd9); /* ldcw xxx */
-    sym = external_global_sym(TOK___tcc_fpu_control, 
-                              &ushort_type, VT_LVAL);
-    greloc(cur_text_section, sym, 
-           ind, R_386_32);
-    gen_le32(0);
-
-    r = get_reg(RC_INT);
-    o(0x58 + r); /* pop r */
-    if (size == 8) {
-        if (t == VT_LLONG) {
-            vtop->r = r; /* mark reg as used */
-            r2 = get_reg(RC_INT);
-            o(0x58 + r2); /* pop r2 */
-            vtop->r2 = r2;
-        } else {
-            o(0x04c483); /* add $4, %esp */
-        }
-    }
-    vtop->r = r;
+        vpush_global_sym(&func_old_type, TOK___fixdfdi);
+    vswap();
+    gfunc_call(1);
+    vpushi(0);
+    vtop->r = REG_IRET;
+    vtop->r2 = REG_LRET;
 }

 /* convert from one floating point type to another */
-void gen_cvt_ftof(int t)
+ST_FUNC void gen_cvt_ftof(int t)
 {
    /* all we have to do on i386 is to put the float in a register */
    gv(RC_FLOAT);
 }

 /* computed goto support */
-void ggoto(void)
+ST_FUNC void ggoto(void)
 {
    gcall_or_jmp(1);
    vtop--;
@ -1012,33 +1067,28 @@ void ggoto(void)
 #ifdef CONFIG_TCC_BCHECK

 /* generate a bounded pointer addition */
-void gen_bounded_ptr_add(void)
+ST_FUNC void gen_bounded_ptr_add(void)
 {
-    Sym *sym;
-
    /* prepare fast i386 function call (args in eax and edx) */
    gv2(RC_EAX, RC_EDX);
    /* save all temporary registers */
    vtop -= 2;
    save_regs(0);
    /* do a fast function call */
-    sym = external_global_sym(TOK___bound_ptr_add, &func_old_type, 0);
-    greloc(cur_text_section, sym, 
-           ind + 1, R_386_PC32);
-    oad(0xe8, -4);
+    gen_static_call(TOK___bound_ptr_add);
    /* returned pointer is in eax */
    vtop++;
    vtop->r = TREG_EAX | VT_BOUNDED;
    /* address of bounding function call point */
-    vtop->c.ul = (cur_text_section->reloc->data_offset - sizeof(Elf32_Rel)); 
+    vtop->c.i = (cur_text_section->reloc->data_offset - sizeof(Elf32_Rel));
 }

 /* patch pointer addition in vtop so that pointer dereferencing is
   also tested */
-void gen_bounded_ptr_deref(void)
+ST_FUNC void gen_bounded_ptr_deref(void)
 {
-    int func;
-    int size, align;
+    addr_t func;
+    int  size, align;
    Elf32_Rel *rel;
    Sym *sym;

@ -1060,14 +1110,14 @@ void gen_bounded_ptr_deref(void)
    case 12: func = TOK___bound_ptr_indir12; break;
    case 16: func = TOK___bound_ptr_indir16; break;
    default:
-        error("unhandled size when derefencing bounded pointer");
+        tcc_error("unhandled size when dereferencing bounded pointer");
        func = 0;
        break;
    }

    /* patch relocation */
    /* XXX: find a better solution ? */
-    rel = (Elf32_Rel *)(cur_text_section->reloc->data + vtop->c.ul);
+    rel = (Elf32_Rel *)(cur_text_section->reloc->data + vtop->c.i);
    sym = external_global_sym(func, &func_old_type, 0);
    if (!sym->c)
        put_extern_sym(sym, NULL, 0, 0);
@ -1075,6 +1125,40 @@ void gen_bounded_ptr_deref(void)
 }
 #endif

+/* Save the stack pointer onto the stack */
+ST_FUNC void gen_vla_sp_save(int addr) {
+    /* mov %esp,addr(%ebp)*/
+    o(0x89);
+    gen_modrm(TREG_ESP, VT_LOCAL, NULL, addr);
+}
+
+/* Restore the SP from a location on the stack */
+ST_FUNC void gen_vla_sp_restore(int addr) {
+    o(0x8b);
+    gen_modrm(TREG_ESP, VT_LOCAL, NULL, addr);
+}
+
+/* Subtract from the stack pointer, and push the resulting value onto the stack */
+ST_FUNC void gen_vla_alloc(CType *type, int align) {
+#ifdef TCC_TARGET_PE
+    /* alloca does more than just adjust %esp on Windows */
+    vpush_global_sym(&func_old_type, TOK_alloca);
+    vswap(); /* Move alloca ref past allocation size */
+    gfunc_call(1);
+#else
+    int r;
+    r = gv(RC_INT); /* allocation size */
+    /* sub r,%esp */
+    o(0x2b);
+    o(0xe0 | r);
+    /* We align to 16 bytes rather than to 'align' */
+    /* and ~15, %esp */
+    o(0xf0e483);
+    vpop();
+#endif
+}
+
 /* end of X86 code generator */
 /*************************************************************/
-
+#endif
+/*************************************************************/
--- a/i386-link.c
+++ b/i386-link.c
@ -0,0 +1,247 @@
+#ifdef TARGET_DEFS_ONLY
+
+#define EM_TCC_TARGET EM_386
+
+/* relocation type for 32 bit data relocation */
+#define R_DATA_32   R_386_32
+#define R_DATA_PTR  R_386_32
+#define R_JMP_SLOT  R_386_JMP_SLOT
+#define R_GLOB_DAT  R_386_GLOB_DAT
+#define R_COPY      R_386_COPY
+#define R_RELATIVE  R_386_RELATIVE
+
+#define R_NUM       R_386_NUM
+
+#define ELF_START_ADDR 0x08048000
+#define ELF_PAGE_SIZE  0x1000
+
+#define PCRELATIVE_DLLPLT 0
+#define RELOCATE_DLLPLT 0
+
+#else /* !TARGET_DEFS_ONLY */
+
+#include "tcc.h"
+
+/* Returns 1 for a code relocation, 0 for a data relocation. For unknown
+   relocations, returns -1. */
+int code_reloc (int reloc_type)
+{
+    switch (reloc_type) {
+	case R_386_RELATIVE:
+	case R_386_16:
+        case R_386_32:
+	case R_386_GOTPC:
+	case R_386_GOTOFF:
+	case R_386_GOT32:
+	case R_386_GOT32X:
+	case R_386_GLOB_DAT:
+	case R_386_COPY:
+            return 0;
+
+	case R_386_PC16:
+	case R_386_PC32:
+	case R_386_PLT32:
+	case R_386_JMP_SLOT:
+            return 1;
+    }
+
+    tcc_error ("Unknown relocation type: %d", reloc_type);
+    return -1;
+}
+
+/* Returns an enumerator to describe whether and when the relocation needs a
+   GOT and/or PLT entry to be created. See tcc.h for a description of the
+   different values. */
+int gotplt_entry_type (int reloc_type)
+{
+    switch (reloc_type) {
+	case R_386_RELATIVE:
+	case R_386_16:
+	case R_386_GLOB_DAT:
+	case R_386_JMP_SLOT:
+	case R_386_COPY:
+            return NO_GOTPLT_ENTRY;
+
+        case R_386_32:
+	    /* This relocation shouldn't normally need GOT or PLT
+	       slots if it weren't for simplicity in the code generator.
+	       See our caller for comments.  */
+            return AUTO_GOTPLT_ENTRY;
+
+	case R_386_PC16:
+	case R_386_PC32:
+            return AUTO_GOTPLT_ENTRY;
+
+	case R_386_GOTPC:
+	case R_386_GOTOFF:
+            return BUILD_GOT_ONLY;
+
+	case R_386_GOT32:
+	case R_386_GOT32X:
+	case R_386_PLT32:
+            return ALWAYS_GOTPLT_ENTRY;
+    }
+
+    tcc_error ("Unknown relocation type: %d", reloc_type);
+    return -1;
+}
+
+ST_FUNC unsigned create_plt_entry(TCCState *s1, unsigned got_offset, struct sym_attr *attr)
+{
+    Section *plt = s1->plt;
+    uint8_t *p;
+    int modrm;
+    unsigned plt_offset, relofs;
+
+    /* on i386 if we build a DLL, we add a %ebx offset */
+    if (s1->output_type == TCC_OUTPUT_DLL)
+        modrm = 0xa3;
+    else
+        modrm = 0x25;
+
+    /* empty PLT: create PLT0 entry that pushes the library identifier
+       (GOT + PTR_SIZE) and jumps to ld.so resolution routine
+       (GOT + 2 * PTR_SIZE) */
+    if (plt->data_offset == 0) {
+        p = section_ptr_add(plt, 16);
+        p[0] = 0xff; /* pushl got + PTR_SIZE */
+        p[1] = modrm + 0x10;
+        write32le(p + 2, PTR_SIZE);
+        p[6] = 0xff; /* jmp *(got + PTR_SIZE * 2) */
+        p[7] = modrm;
+        write32le(p + 8, PTR_SIZE * 2);
+    }
+    plt_offset = plt->data_offset;
+
+    /* The PLT slot refers to the relocation entry it needs via offset.
+       The reloc entry is created below, so its offset is the current
+       data_offset */
+    relofs = s1->got->reloc ? s1->got->reloc->data_offset : 0;
+
+    /* Jump to GOT entry where ld.so initially put the address of ip + 4 */
+    p = section_ptr_add(plt, 16);
+    p[0] = 0xff; /* jmp *(got + x) */
+    p[1] = modrm;
+    write32le(p + 2, got_offset);
+    p[6] = 0x68; /* push $xxx */
+    write32le(p + 7, relofs);
+    p[11] = 0xe9; /* jmp plt_start */
+    write32le(p + 12, -(plt->data_offset));
+    return plt_offset;
+}
+
+/* relocate the PLT: compute addresses and offsets in the PLT now that the
+   final addresses of the PLT and GOT are known (see fill_program_header) */
+ST_FUNC void relocate_plt(TCCState *s1)
+{
+    uint8_t *p, *p_end;
+
+    if (!s1->plt)
+      return;
+
+    p = s1->plt->data;
+    p_end = p + s1->plt->data_offset;
+
+    if (p < p_end) {
+        add32le(p + 2, s1->got->sh_addr);
+        add32le(p + 8, s1->got->sh_addr);
+        p += 16;
+        while (p < p_end) {
+            add32le(p + 2, s1->got->sh_addr);
+            p += 16;
+        }
+    }
+}
+
+static ElfW_Rel *qrel; /* ptr to next reloc entry reused */
+
+void relocate_init(Section *sr)
+{
+    qrel = (ElfW_Rel *) sr->data;
+}
+
+void relocate(TCCState *s1, ElfW_Rel *rel, int type, unsigned char *ptr, addr_t addr, addr_t val)
+{
+    int sym_index, esym_index;
+
+    sym_index = ELFW(R_SYM)(rel->r_info);
+
+    switch (type) {
+        case R_386_32:
+            if (s1->output_type == TCC_OUTPUT_DLL) {
+                esym_index = s1->sym_attrs[sym_index].dyn_index;
+                qrel->r_offset = rel->r_offset;
+                if (esym_index) {
+                    qrel->r_info = ELFW(R_INFO)(esym_index, R_386_32);
+                    qrel++;
+                    return;
+                } else {
+                    qrel->r_info = ELFW(R_INFO)(0, R_386_RELATIVE);
+                    qrel++;
+                }
+            }
+            add32le(ptr, val);
+            return;
+        case R_386_PC32:
+            if (s1->output_type == TCC_OUTPUT_DLL) {
+                /* DLL relocation */
+                esym_index = s1->sym_attrs[sym_index].dyn_index;
+                if (esym_index) {
+                    qrel->r_offset = rel->r_offset;
+                    qrel->r_info = ELFW(R_INFO)(esym_index, R_386_PC32);
+                    qrel++;
+                    return;
+                }
+            }
+            add32le(ptr, val - addr);
+            return;
+        case R_386_PLT32:
+            add32le(ptr, val - addr);
+            return;
+        case R_386_GLOB_DAT:
+        case R_386_JMP_SLOT:
+            write32le(ptr, val);
+            return;
+        case R_386_GOTPC:
+            add32le(ptr, s1->got->sh_addr - addr);
+            return;
+        case R_386_GOTOFF:
+            add32le(ptr, val - s1->got->sh_addr);
+            return;
+        case R_386_GOT32:
+        case R_386_GOT32X:
+            /* we load the got offset */
+            add32le(ptr, s1->sym_attrs[sym_index].got_offset);
+            return;
+        case R_386_16:
+            if (s1->output_format != TCC_OUTPUT_FORMAT_BINARY) {
+            output_file:
+                tcc_error("can only produce 16-bit binary files");
+            }
+            write16le(ptr, read16le(ptr) + val);
+            return;
+        case R_386_PC16:
+            if (s1->output_format != TCC_OUTPUT_FORMAT_BINARY)
+                goto output_file;
+            write16le(ptr, read16le(ptr) + val - addr);
+            return;
+        case R_386_RELATIVE:
+#ifdef TCC_TARGET_PE
+            add32le(ptr, val - s1->pe_imagebase);
+#endif
+            /* do nothing */
+            return;
+        case R_386_COPY:
+            /* This relocation must copy initialized data from the library
+            to the program .bss segment. Currently handled as on ARM: a
+            no-op, to avoid the noise of the default case. Is this correct?
+            */
+            return;
+        default:
+            fprintf(stderr,"FIXME: handle reloc type %d at %x [%p] to %x\n",
+                type, (unsigned)addr, ptr, (unsigned)val);
+            return;
+    }
+}
+
+#endif /* !TARGET_DEFS_ONLY */
--- a/i386-tok.h
+++ b/i386-tok.h
@ -1,5 +1,7 @@
-
+/* ------------------------------------------------------------------ */
 /* WARNING: relative order of tokens is important. */
+
+/* register */
 DEF_ASM(al)
 DEF_ASM(cl)
 DEF_ASM(dl)
@ -24,6 +26,16 @@
 DEF_ASM(ebp)
 DEF_ASM(esi)
 DEF_ASM(edi)
+#ifdef TCC_TARGET_X86_64
+ DEF_ASM(rax)
+ DEF_ASM(rcx)
+ DEF_ASM(rdx)
+ DEF_ASM(rbx)
+ DEF_ASM(rsp)
+ DEF_ASM(rbp)
+ DEF_ASM(rsi)
+ DEF_ASM(rdi)
+#endif
 DEF_ASM(mm0)
 DEF_ASM(mm1)
 DEF_ASM(mm2)
@ -79,65 +91,83 @@
 DEF_ASM(fs)
 DEF_ASM(gs)
 DEF_ASM(st)
+ DEF_ASM(rip)

- DEF_BWL(mov)
-
+#ifdef TCC_TARGET_X86_64
+ /* The four low parts of sp/bp/si/di that exist only on
+    x86-64 (encoding aliased to ah,ch,dh,bh when not using REX). */
+ DEF_ASM(spl)
+ DEF_ASM(bpl)
+ DEF_ASM(sil)
+ DEF_ASM(dil)
+#endif
 /* generic two operands */
- DEF_BWL(add)
- DEF_BWL(or)
- DEF_BWL(adc)
- DEF_BWL(sbb)
- DEF_BWL(and)
- DEF_BWL(sub)
- DEF_BWL(xor)
- DEF_BWL(cmp)
+ DEF_BWLX(mov)
+
+ DEF_BWLX(add)
+ DEF_BWLX(or)
+ DEF_BWLX(adc)
+ DEF_BWLX(sbb)
+ DEF_BWLX(and)
+ DEF_BWLX(sub)
+ DEF_BWLX(xor)
+ DEF_BWLX(cmp)

 /* unary ops */
- DEF_BWL(inc)
- DEF_BWL(dec)
- DEF_BWL(not)
- DEF_BWL(neg)
- DEF_BWL(mul)
- DEF_BWL(imul)
- DEF_BWL(div)
- DEF_BWL(idiv)
+ DEF_BWLX(inc)
+ DEF_BWLX(dec)
+ DEF_BWLX(not)
+ DEF_BWLX(neg)
+ DEF_BWLX(mul)
+ DEF_BWLX(imul)
+ DEF_BWLX(div)
+ DEF_BWLX(idiv)

- DEF_BWL(xchg)
- DEF_BWL(test)
+ DEF_BWLX(xchg)
+ DEF_BWLX(test)

 /* shifts */
- DEF_BWL(rol)
- DEF_BWL(ror)
- DEF_BWL(rcl)
- DEF_BWL(rcr)
- DEF_BWL(shl)
- DEF_BWL(shr)
- DEF_BWL(sar)
+ DEF_BWLX(rol)
+ DEF_BWLX(ror)
+ DEF_BWLX(rcl)
+ DEF_BWLX(rcr)
+ DEF_BWLX(shl)
+ DEF_BWLX(shr)
+ DEF_BWLX(sar)

- DEF_ASM(shldw)
- DEF_ASM(shldl)
- DEF_ASM(shld)
- DEF_ASM(shrdw)
- DEF_ASM(shrdl)
- DEF_ASM(shrd)
+ DEF_WLX(shld)
+ DEF_WLX(shrd)

 DEF_ASM(pushw)
 DEF_ASM(pushl)
+#ifdef TCC_TARGET_X86_64
+ DEF_ASM(pushq)
+#endif
 DEF_ASM(push)
+
 DEF_ASM(popw)
 DEF_ASM(popl)
+#ifdef TCC_TARGET_X86_64
+ DEF_ASM(popq)
+#endif
 DEF_ASM(pop)
+
 DEF_BWL(in)
 DEF_BWL(out)

- DEF_WL(movzb)
-
+ DEF_WLX(movzb)
 DEF_ASM(movzwl)
 DEF_ASM(movsbw)
 DEF_ASM(movsbl)
 DEF_ASM(movswl)
+#ifdef TCC_TARGET_X86_64
+ DEF_ASM(movsbq)
+ DEF_ASM(movswq)
+ DEF_ASM(movzwq)
+ DEF_ASM(movslq)
+#endif

- DEF_WL(lea)
+ DEF_WLX(lea)

 DEF_ASM(les)
 DEF_ASM(lds)
@ -150,26 +180,28 @@
 DEF_ASM(lcall)
 DEF_ASM(ljmp)

- DEF_ASMTEST(j)
+ DEF_ASMTEST(j,)

- DEF_ASMTEST(set)
- DEF_ASMTEST(cmov)
+ DEF_ASMTEST(set,)
+ DEF_ASMTEST(set,b)
+ DEF_ASMTEST(cmov,)

- DEF_WL(bsf)
- DEF_WL(bsr)
- DEF_WL(bt)
- DEF_WL(bts)
- DEF_WL(btr)
- DEF_WL(btc)
+ DEF_WLX(bsf)
+ DEF_WLX(bsr)
+ DEF_WLX(bt)
+ DEF_WLX(bts)
+ DEF_WLX(btr)
+ DEF_WLX(btc)

- DEF_WL(lsl)
+ DEF_WLX(lar)
+ DEF_WLX(lsl)

 /* generic FP ops */
 DEF_FP(add)
 DEF_FP(mul)

 DEF_ASM(fcom)
- DEF_ASM(fcom_1) /* non existant op, just to have a regular table */
+ DEF_ASM(fcom_1) /* non existent op, just to have a regular table */
 DEF_FP1(com)

 DEF_FP(comp)
@ -178,32 +210,35 @@
 DEF_FP(div)
 DEF_FP(divr)

- DEF_BWL(xadd)
- DEF_BWL(cmpxchg)
+ DEF_BWLX(xadd)
+ DEF_BWLX(cmpxchg)

 /* string ops */
- DEF_BWL(cmps)
- DEF_BWL(scmp)
+ DEF_BWLX(cmps)
+ DEF_BWLX(scmp)
 DEF_BWL(ins)
 DEF_BWL(outs)
- DEF_BWL(lods)
- DEF_BWL(slod)
- DEF_BWL(movs)
- DEF_BWL(smov)
- DEF_BWL(scas)
- DEF_BWL(ssca)
- DEF_BWL(stos)
- DEF_BWL(ssto)
+ DEF_BWLX(lods)
+ DEF_BWLX(slod)
+ DEF_BWLX(movs)
+ DEF_BWLX(smov)
+ DEF_BWLX(scas)
+ DEF_BWLX(ssca)
+ DEF_BWLX(stos)
+ DEF_BWLX(ssto)

 /* generic asm ops */
-
 #define ALT(x)
 #define DEF_ASM_OP0(name, opcode) DEF_ASM(name)
 #define DEF_ASM_OP0L(name, opcode, group, instr_type)
 #define DEF_ASM_OP1(name, opcode, group, instr_type, op0)
 #define DEF_ASM_OP2(name, opcode, group, instr_type, op0, op1)
 #define DEF_ASM_OP3(name, opcode, group, instr_type, op0, op1, op2)
-#include "i386-asm.h"
+#ifdef TCC_TARGET_X86_64
+# include "x86_64-asm.h"
+#else
+# include "i386-asm.h"
+#endif

 #define ALT(x)
 #define DEF_ASM_OP0(name, opcode)
@ -211,4 +246,8 @@
 #define DEF_ASM_OP1(name, opcode, group, instr_type, op0) DEF_ASM(name)
 #define DEF_ASM_OP2(name, opcode, group, instr_type, op0, op1) DEF_ASM(name)
 #define DEF_ASM_OP3(name, opcode, group, instr_type, op0, op1, op2) DEF_ASM(name)
-#include "i386-asm.h"
+#ifdef TCC_TARGET_X86_64
+# include "x86_64-asm.h"
+#else
+# include "i386-asm.h"
+#endif
--- a/il-gen.c
+++ b/il-gen.c
@ -18,6 +18,8 @@
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

+#error this code has bit-rotted since 2003
+
 /* number of available registers */
 #define NB_REGS             3

@ -53,11 +55,11 @@ const int reg_classes[NB_REGS] = {
 #define REG_FRET REG_ST0 /* float return register */

 /* defined if function parameters must be evaluated in reverse order */
-//#define INVERT_FUNC_PARAMS
+/* #define INVERT_FUNC_PARAMS */

 /* defined if structures are passed as pointers. Otherwise structures
   are directly pushed on stack. */
-//#define FUNC_STRUCT_PARAM_AS_PTR
+/* #define FUNC_STRUCT_PARAM_AS_PTR */

 /* pointer size, in bytes */
 #define PTR_SIZE 4
@ -193,7 +195,7 @@ static void il_type_to_str(char *buf, int buf_size,
        pstrcat(buf, buf_size, tstr);
        break;
    case VT_STRUCT:
-        error("structures not handled yet");
+        tcc_error("structures not handled yet");
        break;
    case VT_FUNC:
        s = sym_find((unsigned)t >> VT_STRUCT_SHIFT);
@ -387,7 +389,7 @@ void gfunc_start(GFuncContext *c, int func_call)
 void gfunc_param(GFuncContext *c)
 {
    if ((vtop->t & VT_BTYPE) == VT_STRUCT) {
-        error("structures passed as value not handled yet");
+        tcc_error("structures passed as value not handled yet");
    } else {
        /* simply push on stack */
        gv(RC_ST0);
@ -441,6 +443,7 @@ void gfunc_prolog(int t)
    /* if the function returns a structure, then add an
       implicit pointer parameter */
    func_vt = sym->t;
+    func_var = (sym->c == FUNC_ELLIPSIS);
    if ((func_vt & VT_BTYPE) == VT_STRUCT) {
        func_vc = addr;
        addr++;
@ -528,19 +531,6 @@ int gtst(int inv, int t)
            t = gjmp(t);
            gsym(vtop->c.i);
        }
-    } else {
-        if (is_float(vtop->t)) {
-            vpushi(0);
-            gen_op(TOK_NE);
-        }
-        if ((vtop->r & (VT_VALMASK | VT_LVAL | VT_FORWARD)) == VT_CONST) {
-            /* constant jmp optimization */
-            if ((vtop->c.i != 0) != inv) 
-                t = gjmp(t);
-        } else {
-            v = gv(RC_INT);
-            t = out_opj(IL_OP_BRTRUE - inv, t);
-        }
    }
    vtop--;
    return t;
@ -612,7 +602,7 @@ void gen_opi(int op)
 }

 /* generate a floating point operation 'v = t1 op t2' instruction. The
-   two operands are guaranted to have the same floating point type */
+   two operands are guaranteed to have the same floating point type */
 void gen_opf(int op)
 {
    /* same as integer */
--- a/include/float.h
+++ b/include/float.h
@ -27,7 +27,7 @@
 #define DBL_MAX_10_EXP 308

 /* horrible intel long double */
-#ifdef __i386__
+#if defined __i386__ || defined __x86_64__

 #define LDBL_MANT_DIG 64
 #define LDBL_DIG 18
--- a/include/stdarg.h
+++ b/include/stdarg.h
@ -3,10 +3,10 @@

 #ifdef __x86_64__
 #ifndef _WIN64
-#include <stdlib.h>

+// This should be kept in sync with the declaration in our lib/libtcc1.c
 /* GCC compatible definition of va_list. */
-struct __va_list_struct {
+typedef struct {
    unsigned int gp_offset;
    unsigned int fp_offset;
    union {
@ -14,49 +14,55 @@ struct __va_list_struct {
        char *overflow_arg_area;
    };
    char *reg_save_area;
-};
+} __va_list_struct;

-typedef struct __va_list_struct *va_list;
+typedef __va_list_struct va_list[1];

-/* we use __builtin_(malloc|free) to avoid #define malloc tcc_malloc */
-/* XXX: this lacks the support of aggregated types. */
-#define va_start(ap, last)                                              \
-    (ap = (va_list)__builtin_malloc(sizeof(struct __va_list_struct)),   \
-     *ap = *(struct __va_list_struct*)(                                 \
-         (char*)__builtin_frame_address(0) - 16),                       \
-     ap->overflow_arg_area = ((char *)__builtin_frame_address(0) +      \
-                              ap->overflow_offset),                     \
-     ap->reg_save_area = (char *)__builtin_frame_address(0) - 176 - 16  \
-        )
-#define va_arg(ap, type)                                        \
-    (*(type*)(__builtin_types_compatible_p(type, long double)   \
-              ? (ap->overflow_arg_area += 16,                   \
-                 ap->overflow_arg_area - 16)                    \
-              : __builtin_types_compatible_p(type, double)      \
-              ? (ap->fp_offset < 128 + 48                       \
-                 ? (ap->fp_offset += 16,                        \
-                    ap->reg_save_area + ap->fp_offset - 16)     \
-                 : (ap->overflow_arg_area += 8,                 \
-                    ap->overflow_arg_area - 8))                 \
-              : (ap->gp_offset < 48                             \
-                 ? (ap->gp_offset += 8,                         \
-                    ap->reg_save_area + ap->gp_offset - 8)      \
-                 : (ap->overflow_arg_area += 8,                 \
-                    ap->overflow_arg_area - 8))                 \
-        ))
-#define va_copy(dest, src)                                      \
-    ((dest) = (va_list)malloc(sizeof(struct __va_list_struct)), \
-     *(dest) = *(src))
-#define va_end(ap) __builtin_free(ap)
+void __va_start(__va_list_struct *ap, void *fp);
+void *__va_arg(__va_list_struct *ap, int arg_type, int size, int align);
+
+#define va_start(ap, last) __va_start(ap, __builtin_frame_address(0))
+#define va_arg(ap, type)                                                \
+    (*(type *)(__va_arg(ap, __builtin_va_arg_types(type), sizeof(type), __alignof__(type))))
+#define va_copy(dest, src) (*(dest) = *(src))
+#define va_end(ap)
+
+/* avoid conflicting definition for va_list on Macs. */
+#define _VA_LIST_T

 #else /* _WIN64 */
 typedef char *va_list;
-#define va_start(ap,last) ap = ((char *)&(last)) + ((sizeof(last)+7)&~7)
-#define va_arg(ap,type) (ap += (sizeof(type)+7)&~7, *(type *)(ap - ((sizeof(type)+7)&~7)))
-#define va_copy(dest, src) (dest) = (src)
+#define va_start(ap,last) __builtin_va_start(ap,last)
+#define va_arg(ap, t) ((sizeof(t) > 8 || (sizeof(t) & (sizeof(t) - 1))) \
+	? **(t **)((ap += 8) - 8) : *(t  *)((ap += 8) - 8))
+#define va_copy(dest, src) ((dest) = (src))
 #define va_end(ap)
 #endif

+#elif __arm__
+typedef char *va_list;
+#define _tcc_alignof(type) ((int)&((struct {char c;type x;} *)0)->x)
+#define _tcc_align(addr,type) (((unsigned)addr + _tcc_alignof(type) - 1) \
+                               & ~(_tcc_alignof(type) - 1))
+#define va_start(ap,last) ap = ((char *)&(last)) + ((sizeof(last)+3)&~3)
+#define va_arg(ap,type) (ap = (void *) ((_tcc_align(ap,type)+sizeof(type)+3) \
+                        &~3), *(type *)(ap - ((sizeof(type)+3)&~3)))
+#define va_copy(dest, src) (dest) = (src)
+#define va_end(ap)
+
+#elif defined(__aarch64__)
+typedef struct {
+    void *__stack;
+    void *__gr_top;
+    void *__vr_top;
+    int   __gr_offs;
+    int   __vr_offs;
+} va_list;
+#define va_start(ap, last) __va_start(ap, last)
+#define va_arg(ap, type) __va_arg(ap, type)
+#define va_end(ap)
+#define va_copy(dest, src) ((dest) = (src))
+
 #else /* __i386__ */
 typedef char *va_list;
 /* only correct for i386 */
--- a/include/stdbool.h
+++ b/include/stdbool.h
@ -6,5 +6,6 @@
 #define bool	_Bool
 #define true	1
 #define false	0
+#define __bool_true_false_are_defined 1

 #endif /* _STDBOOL_H */
--- a/include/stddef.h
+++ b/include/stddef.h
@ -1,20 +1,54 @@
 #ifndef _STDDEF_H
 #define _STDDEF_H

-#define NULL ((void *)0)
 typedef __SIZE_TYPE__ size_t;
+typedef __PTRDIFF_TYPE__ ssize_t;
 typedef __WCHAR_TYPE__ wchar_t;
 typedef __PTRDIFF_TYPE__ ptrdiff_t;
-#define offsetof(type, field) ((size_t) &((type *)0)->field)
+typedef __PTRDIFF_TYPE__ intptr_t;
+typedef __SIZE_TYPE__ uintptr_t;

 #ifndef __int8_t_defined
 #define __int8_t_defined
-typedef char int8_t;
-typedef short int int16_t;
-typedef int int32_t;
-typedef long long int int64_t;
+typedef signed char int8_t;
+typedef signed short int int16_t;
+typedef signed int int32_t;
+#ifdef __LP64__
+typedef signed long int int64_t;
+#else
+typedef signed long long int int64_t;
 #endif
+typedef unsigned char uint8_t;
+typedef unsigned short int uint16_t;
+typedef unsigned int uint32_t;
+#ifdef __LP64__
+typedef unsigned long int uint64_t;
+#else
+typedef unsigned long long int uint64_t;
+#endif
+#endif
+
+#ifndef NULL
+#define NULL ((void*)0)
+#endif
+
+#define offsetof(type, field) ((size_t)&((type *)0)->field)

 void *alloca(size_t size);

 #endif
+
+/* Older glibc require a wint_t from <stddef.h> (when requested
+   by __need_wint_t, as otherwise stddef.h isn't allowed to
+   define this type).   Note that this must be outside the normal
+   _STDDEF_H guard, so that it works even when we've included the file
+   already (without requiring wint_t).  Some other libs define _WINT_T
+   if they've already provided that type, so we can use that as guard.
+   TCC defines __WINT_TYPE__ for us.  */
+#if defined (__need_wint_t)
+#ifndef _WINT_T
+#define _WINT_T
+typedef __WINT_TYPE__ wint_t;
+#endif
+#undef __need_wint_t
+#endif
--- a/lib/Makefile
+++ b/lib/Makefile
@ -0,0 +1,73 @@
+#
+# Tiny C Compiler Makefile for libtcc1.a
+#
+
+TOP = ..
+include $(TOP)/Makefile
+VPATH = $(TOPSRC)/lib $(TOPSRC)/win32/lib
+T = $(or $(CROSS_TARGET),$(NATIVE_TARGET),unknown)
+X = $(if $(CROSS_TARGET),$(CROSS_TARGET)-)
+BIN = $(TOP)/$(X)libtcc1.a
+
+XTCC ?= $(TOP)/$(X)tcc$(EXESUF)
+XCC = $(XTCC)
+XAR = $(XTCC) -ar
+XFLAGS-unx = -B$(TOPSRC)
+XFLAGS-win = -B$(TOPSRC)/win32 -I$(TOPSRC)/include
+XFLAGS = $(XFLAGS$(XCFG))
+XCFG = $(or $(findstring -win,$T),-unx)
+
+# in order to use gcc, type: make <target>-libtcc1-usegcc=yes
+arm-libtcc1-usegcc ?= no
+
+ifeq "$($(T)-libtcc1-usegcc)" "yes"
+ XCC = $(CC)
+ XAR = $(AR)
+ XFLAGS = $(CFLAGS) -fPIC
+endif
+
+# only for native compiler
+$(X)BCHECK_O = bcheck.o
+
+ifeq ($(CONFIG_musl)$(CONFIG_uClibc),yes)
+ BCHECK_O =
+endif
+
+ifdef CONFIG_OSX
+ XFLAGS += -D_ANSI_SOURCE
+endif
+
+I386_O = libtcc1.o alloca86.o alloca86-bt.o
+X86_64_O = libtcc1.o alloca86_64.o alloca86_64-bt.o
+ARM_O = libtcc1.o armeabi.o alloca-arm.o armflush.o
+ARM64_O = lib-arm64.o
+WIN_O = crt1.o crt1w.o wincrt1.o wincrt1w.o dllcrt1.o dllmain.o
+
+OBJ-i386 = $(I386_O) $(BCHECK_O)
+OBJ-x86_64 = $(X86_64_O) va_list.o $(BCHECK_O)
+OBJ-x86_64-osx = $(X86_64_O) va_list.o
+OBJ-i386-win32 = $(I386_O) chkstk.o bcheck.o $(WIN_O)
+OBJ-x86_64-win32 = $(X86_64_O) chkstk.o bcheck.o $(WIN_O)
+OBJ-arm64 = $(ARM64_O)
+OBJ-arm = $(ARM_O)
+OBJ-arm-fpa = $(ARM_O)
+OBJ-arm-fpa-ld = $(ARM_O)
+OBJ-arm-vfp = $(ARM_O)
+OBJ-arm-eabi = $(ARM_O)
+OBJ-arm-eabihf = $(ARM_O)
+OBJ-arm-wince = $(ARM_O) $(WIN_O)
+
+$(BIN) : $(patsubst %.o,$(X)%.o,$(OBJ-$T))
+	$(XAR) rcs $@ $^
+
+$(X)%.o : %.c
+	$(XCC) -c $< -o $@ $(XFLAGS)
+
+$(X)%.o : %.S
+	$(XCC) -c $< -o $@ $(XFLAGS)
+
+$(X)crt1w.o : crt1.c
+$(X)wincrt1w.o : wincrt1.c
+
+clean :
+	rm -f *.a *.o $(BIN)
--- a/lib/alloca-arm.S
+++ b/lib/alloca-arm.S
@ -0,0 +1,17 @@
+	.text
+	.align	2
+	.global	alloca
+	.type	alloca, %function
+alloca:
+#ifdef __TINYC__
+        .int 0xe060d00d
+        .int 0xe3cdd007
+        .int 0xe1a0000d
+        .int 0xe1a0f00e
+#else
+	rsb	sp, r0, sp
+	bic	sp, sp, #7
+	mov	r0, sp
+	mov	pc, lr
+#endif
+	.size	alloca, .-alloca
--- a/lib/alloca86-bt.S
+++ b/lib/alloca86-bt.S
@ -1,8 +1,6 @@
 /* ---------------------------------------------- */
 /* alloca86-bt.S */

-#include "../config.h"
-
 .globl __bound_alloca

 __bound_alloca:
@ -13,13 +11,13 @@ __bound_alloca:
    and     $-4,%eax
    jz      p6

-#ifdef TCC_TARGET_PE
+#ifdef _WIN32
 p4:
    cmp     $4096,%eax
-    jle     p5
+    jbe     p5
+    test    %eax,-4096(%esp)
    sub     $4096,%esp
    sub     $4096,%eax
-    test    %eax,(%esp)
    jmp p4

 p5:
@ -42,4 +40,8 @@ p6:
    push    %edx
    ret

+/* mark stack as nonexecutable */
+#if defined __ELF__ && defined __linux__
+    .section    .note.GNU-stack,"",@progbits
+#endif
 /* ---------------------------------------------- */
--- a/lib/alloca86.S
+++ b/lib/alloca86.S
@ -1,8 +1,6 @@
 /* ---------------------------------------------- */
 /* alloca86.S */

-#include "../config.h"
-
 .globl alloca

 alloca:
@ -12,13 +10,13 @@ alloca:
    and     $-4,%eax
    jz      p3

-#ifdef TCC_TARGET_PE
+#ifdef _WIN32
 p1:
    cmp     $4096,%eax
-    jle     p2
+    jbe     p2
+    test    %eax,-4096(%esp)
    sub     $4096,%esp
    sub     $4096,%eax
-    test    %eax,(%esp)
    jmp p1
 p2:
 #endif
--- a/lib/alloca86_64-bt.S
+++ b/lib/alloca86_64-bt.S
@ -0,0 +1,56 @@
+/* ---------------------------------------------- */
+/* alloca86_64-bt.S */
+
+.globl __bound_alloca
+__bound_alloca:
+
+#ifdef _WIN32
+    # bound checking is not implemented
+    pop     %rdx
+    mov     %rcx,%rax
+    add     $15,%rax
+    and     $-16,%rax
+    jz      p3
+
+p1:
+    cmp     $4096,%rax
+    jbe     p2
+    test    %rax,-4096(%rsp)
+    sub     $4096,%rsp
+    sub     $4096,%rax
+    jmp p1
+p2:
+
+    sub     %rax,%rsp
+    mov     %rsp,%rax
+    add     $32,%rax
+
+p3:
+    push    %rdx
+    ret
+#else
+    pop     %rdx
+    mov     %rdi,%rax
+    mov     %rax,%rsi	# size, a second parm to the __bound_new_region
+
+    add     $15,%rax
+    and     $-16,%rax
+    jz      p3
+
+
+    sub     %rax,%rsp
+    mov     %rsp,%rdi	# pointer, a first parm to the __bound_new_region
+    mov     %rsp,%rax
+
+    push    %rdx
+    push    %rax
+    call   __bound_new_region
+    pop     %rax
+    pop     %rdx
+
+p3:
+    push    %rdx
+    ret
+#endif
+
+/* ---------------------------------------------- */
--- a/lib/alloca86_64.S
+++ b/lib/alloca86_64.S
@ -1,13 +1,11 @@
 /* ---------------------------------------------- */
 /* alloca86_64.S */

-#include "../config.h"
-
 .globl alloca

 alloca:
    pop     %rdx
-#ifdef TCC_TARGET_PE
+#ifdef _WIN32
    mov     %rcx,%rax
 #else
    mov     %rdi,%rax
@ -16,23 +14,19 @@ alloca:
    and     $-16,%rax
    jz      p3

-#ifdef TCC_TARGET_PE
+#ifdef _WIN32
 p1:
    cmp     $4096,%rax
-    jle     p2
+    jbe     p2
+    test    %rax,-4096(%rsp)
    sub     $4096,%rsp
    sub     $4096,%rax
-    test    %rax,(%rsp)
    jmp p1
 p2:
 #endif

    sub     %rax,%rsp
    mov     %rsp,%rax
-#ifdef TCC_TARGET_PE
-    add     $32,%rax
-#endif
-
 p3:
    push    %rdx
    ret
--- a/lib/armeabi.c
+++ b/lib/armeabi.c
@ -0,0 +1,501 @@
+/* TCC ARM runtime EABI
+   Copyright (C) 2013 Thomas Preud'homme
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.*/
+
+#ifdef __TINYC__
+#define INT_MIN (-2147483647 - 1)
+#define INT_MAX 2147483647
+#define UINT_MAX 0xffffffff
+#define LONG_MIN (-2147483647L - 1)
+#define LONG_MAX 2147483647L
+#define ULONG_MAX 0xffffffffUL
+#define LLONG_MAX 9223372036854775807LL
+#define LLONG_MIN (-9223372036854775807LL - 1)
+#define ULLONG_MAX 0xffffffffffffffffULL
+#else
+#include <limits.h>
+#endif
+
+/* We rely on the little endianness and EABI calling convention for this to
+   work */
+
+typedef struct double_unsigned_struct {
+    unsigned low;
+    unsigned high;
+} double_unsigned_struct;
+
+typedef struct unsigned_int_struct {
+    unsigned low;
+    int high;
+} unsigned_int_struct;
+
+#define REGS_RETURN(name, type) \
+    void name ## _return(type ret) {}
+
+
+/* Float helper functions */
+
+#define FLOAT_EXP_BITS 8
+#define FLOAT_FRAC_BITS 23
+
+#define DOUBLE_EXP_BITS 11
+#define DOUBLE_FRAC_BITS 52
+
+#define ONE_EXP(type) ((1 << (type ## _EXP_BITS - 1)) - 1)
+
+REGS_RETURN(unsigned_int_struct, unsigned_int_struct)
+REGS_RETURN(double_unsigned_struct, double_unsigned_struct)
+
+/* float -> integer: (sign) 1.fraction x 2^(exponent - exp_for_one) */
+
+
+/* float to [unsigned] long long conversion.
+   Expands to `void __aeabi_<name>(unsigned val)`, where val is the raw bit
+   pattern of the float (1 sign bit, FLOAT_EXP_BITS exponent bits,
+   FLOAT_FRAC_BITS fraction bits).  The truncated 64-bit result is built in
+   a double_unsigned_struct and handed to double_unsigned_struct_return()
+   as the last statement (see REGS_RETURN).  Inputs whose truncated value
+   cannot be represented return early without producing a result.  Inputs
+   with |val| < 1 (negative unbiased exponent) truncate to zero.
+   with_sign selects the signed (non-zero) or unsigned (zero) variant. */
+#define DEFINE__AEABI_F2XLZ(name, with_sign)                                 \
+void __aeabi_ ## name(unsigned val)                                          \
+{                                                                            \
+    int exp, high_shift, sign;                                               \
+    double_unsigned_struct ret;                                              \
+                                                                             \
+    /* compute sign */                                                       \
+    sign = val >> 31;                                                        \
+                                                                             \
+    /* compute real exponent */                                              \
+    exp = val >> FLOAT_FRAC_BITS;                                            \
+    exp &= (1 << FLOAT_EXP_BITS) - 1;                                        \
+    exp -= ONE_EXP(FLOAT);                                                   \
+                                                                             \
+    /* undefined behavior if truncated value cannot be represented */        \
+    if (with_sign) {                                                         \
+        if (exp > 62) /* |val| too big, float cannot represent LLONG_MAX */  \
+            return;                                                          \
+    } else {                                                                 \
+        if ((sign && exp >= 0) || exp > 63) /* if val < 0 || val too big */  \
+            return;                                                          \
+    }                                                                        \
+                                                                             \
+    val &= (1 << FLOAT_FRAC_BITS) - 1;                                       \
+    if (exp < 0) {                                                           \
+        /* |val| < 1 truncates to 0; avoids shifting by a negative count */  \
+        ret.high = 0;                                                        \
+        ret.low = 0;                                                         \
+    } else if (exp >= 32) {                                                  \
+        ret.high = 1u << (exp - 32);                                         \
+        if (exp - 32 >= FLOAT_FRAC_BITS) {                                   \
+            ret.high |= val << (exp - 32 - FLOAT_FRAC_BITS);                 \
+            ret.low = 0;                                                     \
+        } else {                                                             \
+            high_shift = FLOAT_FRAC_BITS - (exp - 32);                       \
+            ret.high |= val >> high_shift;                                   \
+            ret.low = val << (32 - high_shift);                              \
+        }                                                                    \
+    } else {                                                                 \
+        ret.high = 0;                                                        \
+        ret.low = 1u << exp;                                                 \
+        if (exp > FLOAT_FRAC_BITS)                                           \
+            ret.low |= val << (exp - FLOAT_FRAC_BITS);                       \
+        else                                                                 \
+            ret.low |= val >> (FLOAT_FRAC_BITS - exp);                       \
+    }                                                                        \
+                                                                             \
+    /* encode negative integer using 2's complement */                       \
+    if (with_sign && sign) {                                                 \
+        ret.low = ~ret.low;                                                  \
+        ret.high = ~ret.high;                                                \
+        if (ret.low == UINT_MAX) {                                           \
+            ret.low = 0;                                                     \
+            ret.high++;                                                      \
+        } else                                                               \
+            ret.low++;                                                       \
+    }                                                                        \
+                                                                             \
+    double_unsigned_struct_return(ret);                                      \
+}
+
+/* float to unsigned long long conversion */
+DEFINE__AEABI_F2XLZ(f2ulz, 0)
+
+/* float to long long conversion */
+DEFINE__AEABI_F2XLZ(f2lz, 1)
+
+/* double to [unsigned] long long conversion.
+   Expands to `void __aeabi_<name>(double_unsigned_struct val)`, where val
+   carries the raw bit pattern of the double as two 32-bit words (1 sign
+   bit, DOUBLE_EXP_BITS exponent bits, DOUBLE_FRAC_BITS fraction bits).
+   The truncated 64-bit result is handed to double_unsigned_struct_return()
+   as the last statement (see REGS_RETURN).  Inputs whose truncated value
+   cannot be represented return early without producing a result.  Inputs
+   with |val| < 1 (negative unbiased exponent) truncate to zero.
+   with_sign selects the signed (non-zero) or unsigned (zero) variant. */
+#define DEFINE__AEABI_D2XLZ(name, with_sign)                                 \
+void __aeabi_ ## name(double_unsigned_struct val)                            \
+{                                                                            \
+    int exp, high_shift, sign;                                               \
+    double_unsigned_struct ret;                                              \
+                                                                             \
+    /* compute sign */                                                       \
+    sign = val.high >> 31;                                                   \
+                                                                             \
+    /* compute real exponent */                                              \
+    exp = (val.high >> (DOUBLE_FRAC_BITS - 32));                             \
+    exp &= (1 << DOUBLE_EXP_BITS) - 1;                                       \
+    exp -= ONE_EXP(DOUBLE);                                                  \
+                                                                             \
+    /* undefined behavior if truncated value cannot be represented */        \
+    if (with_sign) {                                                         \
+        if (exp > 62) /* |val| too big, double cannot represent LLONG_MAX */ \
+            return;                                                          \
+    } else {                                                                 \
+        if ((sign && exp >= 0) || exp > 63) /* if val < 0 || val too big */  \
+            return;                                                          \
+    }                                                                        \
+                                                                             \
+    val.high &= (1 << (DOUBLE_FRAC_BITS - 32)) - 1;                          \
+    if (exp < 0) {                                                           \
+        /* |val| < 1 truncates to 0; avoids shifting by a negative count */  \
+        ret.high = 0;                                                        \
+        ret.low = 0;                                                         \
+    } else if (exp >= 32) {                                                  \
+        ret.high = 1u << (exp - 32);                                         \
+        if (exp >= DOUBLE_FRAC_BITS) {                                       \
+            high_shift = exp - DOUBLE_FRAC_BITS;                             \
+            ret.high |= val.high << high_shift;                              \
+            if (high_shift) /* a shift by 32 would be undefined */           \
+                ret.high |= val.low >> (32 - high_shift);                    \
+            ret.low = val.low << high_shift;                                 \
+        } else {                                                             \
+            high_shift = DOUBLE_FRAC_BITS - exp;                             \
+            ret.high |= val.high >> high_shift;                              \
+            ret.low = val.high << (32 - high_shift);                         \
+            ret.low |= val.low >> high_shift;                                \
+        }                                                                    \
+    } else {                                                                 \
+        ret.high = 0;                                                        \
+        ret.low = 1u << exp;                                                 \
+        if (exp > DOUBLE_FRAC_BITS - 32) {                                   \
+            high_shift = exp - (DOUBLE_FRAC_BITS - 32);                      \
+            ret.low |= val.high << high_shift;                               \
+            ret.low |= val.low >> (32 - high_shift);                         \
+        } else                                                               \
+            ret.low |= val.high >> (DOUBLE_FRAC_BITS - 32 - exp);            \
+    }                                                                        \
+                                                                             \
+    /* encode negative integer using 2's complement */                       \
+    if (with_sign && sign) {                                                 \
+        ret.low = ~ret.low;                                                  \
+        ret.high = ~ret.high;                                                \
+        if (ret.low == UINT_MAX) {                                           \
+            ret.low = 0;                                                     \
+            ret.high++;                                                      \
+        } else                                                               \
+            ret.low++;                                                       \
+    }                                                                        \
+                                                                             \
+    double_unsigned_struct_return(ret);                                      \
+}
+
+/* double to unsigned long long conversion */
+DEFINE__AEABI_D2XLZ(d2ulz, 0)
+
+/* double to long long conversion */
+DEFINE__AEABI_D2XLZ(d2lz, 1)
+
+/* long long to float conversion.
+   Expands to `unsigned __aeabi_<name>(unsigned long long v)`, which returns
+   the bit pattern of a float built from v: the leading set bit selects the
+   exponent, the next FLOAT_FRAC_BITS bits form the fraction, and the first
+   discarded bit (flb) rounds the fraction up.  Returns 0 when v is 0.
+   A non-zero with_sign makes bit 63 of v a sign bit (signed variant). */
+#define DEFINE__AEABI_XL2F(name, with_sign)                             \
+unsigned __aeabi_ ## name(unsigned long long v)                         \
+{                                                                       \
+    int s /* shift */, flb /* first lost bit */, sign = 0;              \
+    unsigned p = 0 /* power */, ret;                                    \
+    double_unsigned_struct val;                                         \
+                                                                        \
+    /* floats are sign-magnitude: convert |v|, add the sign bit last */ \
+    if (with_sign && (v & (1ULL << 63))) {                              \
+        sign = 1;                                                       \
+        v = ~v + 1;                                                     \
+    }                                                                   \
+    val.low = v;                                                        \
+    val.high = v >> 32;                                                 \
+    /* fill fraction bits */                                            \
+    for (s = 31, p = 1u << 31; p && !(val.high & p); s--, p >>= 1);     \
+    if (p) {                                                            \
+        ret = val.high & (p - 1);                                       \
+        if (s < FLOAT_FRAC_BITS) {                                      \
+            ret <<= FLOAT_FRAC_BITS - s;                                \
+            ret |= val.low >> (32 - (FLOAT_FRAC_BITS - s));             \
+            flb = (val.low >> (32 - (FLOAT_FRAC_BITS - s) - 1)) & 1;    \
+        } else if (s == FLOAT_FRAC_BITS) {                              \
+            /* first lost bit is the top bit of val.low */              \
+            flb = val.low >> 31;                                        \
+        } else {                                                        \
+            flb = (ret >> (s - FLOAT_FRAC_BITS - 1)) & 1;               \
+            ret >>= s - FLOAT_FRAC_BITS;                                \
+        }                                                               \
+        s += 32;                                                        \
+    } else {                                                            \
+        for (s = 31, p = 1u << 31; p && !(val.low & p); s--, p >>= 1);  \
+        if (p) {                                                        \
+            ret = val.low & (p - 1);                                    \
+            if (s <= FLOAT_FRAC_BITS) {                                 \
+                ret <<= FLOAT_FRAC_BITS - s;                            \
+                flb = 0;                                                \
+            } else {                                                    \
+                flb = (ret >> (s - FLOAT_FRAC_BITS - 1)) & 1;           \
+                ret >>= s - FLOAT_FRAC_BITS;                            \
+            }                                                           \
+        } else                                                          \
+            return 0;                                                   \
+    }                                                                   \
+    if (flb)                                                            \
+        ret++;                                                          \
+                                                                        \
+    /* add exponent bits (a rounding carry bumps the exponent) */       \
+    ret += (s + ONE_EXP(FLOAT)) << FLOAT_FRAC_BITS;                     \
+                                                                        \
+    /* fill sign bit */                                                 \
+    ret |= (unsigned)sign << 31;                                        \
+                                                                        \
+    return ret;                                                         \
+}
+
+/* unsigned long long to float conversion */
+DEFINE__AEABI_XL2F(ul2f, 0)
+
+/* long long to float conversion */
+DEFINE__AEABI_XL2F(l2f, 1)
+
+/* long long to double conversion */
+#define __AEABI_XL2D(name, with_sign)                                   \
+void __aeabi_ ## name(unsigned long long v)                             \
+{                                                                       \
+    int s /* shift */, high_shift, sign = 0;                            \
+    unsigned tmp, p = 0;                                                \
+    double_unsigned_struct val, ret;                                    \
+                                                                        \
+    /* fraction in negative float is encoded in 1's complement */       \
+    if (with_sign && (v & (1ULL << 63))) {                              \
+        sign = 1;                                                       \
+        v = ~v + 1;                                                     \
+    }                                                                   \
+    val.low = v;                                                        \
+    val.high = v >> 32;                                                 \
+                                                                        \
+    /* fill fraction bits */                                            \
+    for (s = 31, p = 1 << 31; p && !(val.high & p); s--, p >>= 1);      \
+    if (p) {                                                            \
+        tmp = val.high & (p - 1);                                       \
+        if (s < DOUBLE_FRAC_BITS - 32) {                                \
+            high_shift = DOUBLE_FRAC_BITS - 32 - s;                     \
+            ret.high = tmp << high_shift;                               \
+            ret.high |= val.low >> (32 - high_shift);                   \
+            ret.low = val.low << high_shift;                            \
+        } else {                                                        \
+            high_shift = s - (DOUBLE_FRAC_BITS - 32);                   \
+            ret.high = tmp >> high_shift;                               \
+            ret.low = tmp << (32 - high_shift);                         \
+            ret.low |= val.low >> high_shift;                           \
+            if ((val.low >> (high_shift - 1)) & 1) {                    \
+                if (ret.low == UINT_MAX) {                              \
+                    ret.high++;                                         \
+                    ret.low = 0;                                        \
+		} else                                                  \
+                    ret.low++;                                          \
+            }                                                           \
+        }                                                               \
+        s += 32;                                                        \
+    } else {                                                            \
+        for (s = 31, p = 1 << 31; p && !(val.low & p); s--, p >>= 1);   \
+        if (p) {                                                        \
+            tmp = val.low & (p - 1);                                    \
+            if (s <= DOUBLE_FRAC_BITS - 32) {                           \
+                high_shift = DOUBLE_FRAC_BITS - 32 - s;                 \
+                ret.high = tmp << high_shift;                           \
+                ret.low = 0;                                            \
+	    } else {                                                    \
+                high_shift = s - (DOUBLE_FRAC_BITS - 32);               \
+                ret.high = tmp >> high_shift;                           \
+                ret.low = tmp << (32 - high_shift);                     \
+            }                                                           \
+        } else {                                                        \
+            ret.high = ret.low = 0;                                     \
+            double_unsigned_struct_return(ret);                         \
+        }                                                               \
+    }                                                                   \
+                                                                        \
+    /* fill exponent bits */                                            \
+    ret.high |= (s + ONE_EXP(DOUBLE)) << (DOUBLE_FRAC_BITS - 32);       \
+                                                                        \
+    /* fill sign bit */                                                 \
+    ret.high |= sign << 31;                                             \
+                                                                        \
+    double_unsigned_struct_return(ret);                                 \
+}
+
+/* unsigned long long to double conversion */
+__AEABI_XL2D(ul2d, 0)
+
+/* long long to double conversion */
+__AEABI_XL2D(l2d, 1)
+
+
+/* Long long helper functions */
+
+/* TODO: add error in case of den == 0 (see §4.3.1 and §4.3.2) */
+
+/* Declare `type` as a struct (and a typedef of the same name) carrying the
+   result of a signed division helper: the quotient is a `basetype`, the
+   remainder an `unsigned basetype`. */
+#define define_aeabi_xdivmod_signed_type(basetype, type) \
+typedef struct type {                                    \
+    basetype quot;                                       \
+    unsigned basetype rem;                               \
+} type
+
+/* Declare `type` as a struct (and a typedef of the same name) carrying the
+   result of an unsigned division helper: quotient and remainder are both of
+   type `basetype`. */
+#define define_aeabi_xdivmod_unsigned_type(basetype, type) \
+typedef struct type {                                      \
+    basetype quot;                                         \
+    basetype rem;                                          \
+} type
+
+/* Define the static inline helper aeabi_<name>(num, den), which divides num
+   by den using only shifts, multiplications and subtractions and returns
+   the quotient and remainder packed in a `rettype` (fields quot and rem).
+
+     name      - suffix of the generated function name
+     type      - integer type of the operands (instantiated with unsigned
+                 types in this file)
+     rettype   - struct type with `quot` and `rem` members
+     typemacro - prefix of the <limits.h> maximum of `type`
+                 (typemacro_MAX), used to stop doubling q before q * den
+                 could exceed half of the type's range
+
+   Each pass of the outer loop subtracts the largest q * den (q a power of
+   two) that still fits into num and adds q to the quotient.  A zero `den`
+   is not handled: the loops do not terminate in that case. */
+#define AEABI_UXDIVMOD(name,type, rettype, typemacro)                     \
+static inline rettype aeabi_ ## name (type num, type den)                 \
+{                                                                         \
+    rettype ret;                                                          \
+    type quot = 0;                                                        \
+                                                                          \
+    /* Increase quotient while it is less than numerator */               \
+    while (num >= den) {                                                  \
+        type q = 1;                                                       \
+                                                                          \
+        /* Find closest power of two */                                   \
+        while ((q << 1) * den <= num && q * den <= typemacro ## _MAX / 2) \
+            q <<= 1;                                                      \
+                                                                          \
+        /* Compute difference between current quotient and numerator */   \
+        num -= q * den;                                                   \
+        quot += q;                                                        \
+    }                                                                     \
+    ret.quot = quot;                                                      \
+    ret.rem = num;                                                        \
+    return ret;                                                           \
+}
+
+/* Define the signed division helper __aeabi_<name>(numerator, denominator).
+
+     name      - suffix of the generated function name
+     type      - signed integer type of the operands
+     uiname    - suffix of the unsigned helper aeabi_<uiname>() doing the work
+     rettype   - signed result struct; its <rettype>_return() helper receives
+                 the final result
+     urettype  - result struct of the unsigned helper
+     typemacro - prefix of the <limits.h> minimum of `type` (typemacro_MIN),
+                 used as a sign-bit mask
+
+   The magnitudes of both operands are divided as unsigned values; the
+   quotient is negated when the operand signs differ and the remainder takes
+   the sign of the numerator.  The magnitudes are computed in the unsigned
+   type so that negating the most negative value does not overflow. */
+#define __AEABI_XDIVMOD(name, type, uiname, rettype, urettype, typemacro)     \
+void __aeabi_ ## name(type numerator, type denominator)                       \
+{                                                                             \
+    unsigned type num, den;                                                   \
+    urettype uxdiv_ret;                                                       \
+    rettype ret;                                                              \
+                                                                              \
+    /* take absolute values with unsigned (wrapping) arithmetic */            \
+    if (numerator >= 0)                                                       \
+      num = numerator;                                                        \
+    else                                                                      \
+      num = 0 - (unsigned type) numerator;                                    \
+    if (denominator >= 0)                                                     \
+      den = denominator;                                                      \
+    else                                                                      \
+      den = 0 - (unsigned type) denominator;                                  \
+    uxdiv_ret = aeabi_ ## uiname(num, den);                                   \
+    /* signs differ */                                                        \
+    if ((numerator & typemacro ## _MIN) != (denominator & typemacro ## _MIN)) \
+        ret.quot = 0 - uxdiv_ret.quot;                                        \
+    else                                                                      \
+        ret.quot = uxdiv_ret.quot;                                            \
+    /* the remainder follows the sign of the numerator */                     \
+    if (numerator < 0)                                                        \
+        ret.rem = 0 - uxdiv_ret.rem;                                          \
+    else                                                                      \
+        ret.rem = uxdiv_ret.rem;                                              \
+                                                                              \
+    rettype ## _return(ret);                                                  \
+}
+
+/* Result structures of the 64-bit and 32-bit division helpers */
+define_aeabi_xdivmod_signed_type(long long, lldiv_t);
+define_aeabi_xdivmod_unsigned_type(unsigned long long, ulldiv_t);
+define_aeabi_xdivmod_signed_type(int, idiv_t);
+define_aeabi_xdivmod_unsigned_type(unsigned, uidiv_t);
+
+/* NOTE(review): REGS_RETURN is defined elsewhere; it presumably provides the
+   <type>_return() helpers called by the division functions -- confirm. */
+REGS_RETURN(lldiv_t, lldiv_t)
+REGS_RETURN(ulldiv_t, ulldiv_t)
+REGS_RETURN(idiv_t, idiv_t)
+REGS_RETURN(uidiv_t, uidiv_t)
+
+/* aeabi_uldivmod(): unsigned 64-bit quotient and remainder */
+AEABI_UXDIVMOD(uldivmod, unsigned long long, ulldiv_t, ULLONG)
+
+/* __aeabi_ldivmod(): signed 64-bit division built on aeabi_uldivmod() */
+__AEABI_XDIVMOD(ldivmod, long long, uldivmod, lldiv_t, ulldiv_t, LLONG)
+
+/* Unsigned 64-bit division with remainder: computes the quotient/remainder
+   pair of num / den with aeabi_uldivmod() and passes it to
+   ulldiv_t_return().  The function itself is declared void;
+   NOTE(review): the result presumably reaches the caller through the
+   registers set up for the ulldiv_t_return() call -- confirm against the
+   REGS_RETURN definition. */
+void __aeabi_uldivmod(unsigned long long num, unsigned long long den)
+{
+    ulldiv_t_return(aeabi_uldivmod(num, den));
+}
+
+/* Logical shift left of a value held as a low/high word pair.  The shifted
+   pair (or the input pair when no shift remains to be done) is handed to
+   double_unsigned_struct_return(). */
+void __aeabi_llsl(double_unsigned_struct val, int shift)
+{
+    double_unsigned_struct shifted;
+
+    /* a count of 32 or more first moves the low word into the high word */
+    if (shift >= 32) {
+        val.high = val.low;
+        val.low = 0;
+        shift -= 32;
+    }
+    /* nothing left to shift: hand back val as it stands */
+    if (shift <= 0) {
+        double_unsigned_struct_return(val);
+        return;
+    }
+    /* bits leaving the low word are carried into the high word */
+    shifted.high = (val.high << shift) | (val.low >> (32 - shift));
+    shifted.low = val.low << shift;
+    double_unsigned_struct_return(shifted);
+}
+
+/* Shared function body of the right-shift helpers.
+
+     val   - struct with `low` and `high` members holding the value
+     shift - shift count
+     fill  - expression stored into val.high when shift >= 32; it is
+             expanded after val.low has been set, so it may still read the
+             original val.high
+     type  - prefix of the struct type (<type>_struct) and of the helper
+             receiving the result (<type>_struct_return)
+
+   The shifted pair, or val itself when no shift remains to be done, is
+   handed to <type>_struct_return(). */
+#define aeabi_lsr(val, shift, fill, type)                          \
+    type ## _struct ret;                                           \
+                                                                   \
+    if (shift >= 32) {                                             \
+        val.low = val.high;                                        \
+        val.high = fill;                                           \
+        shift -= 32;                                               \
+    }                                                              \
+    if (shift > 0) {                                               \
+        ret.high = val.high >> shift;                              \
+        ret.low = (val.high << (32 - shift)) | (val.low >> shift); \
+        type ## _struct_return(ret);                               \
+	return;                                                    \
+    }                                                              \
+    type ## _struct_return(val);
+
+/* Logical shift right of a low/high word pair: the high word is filled
+   with 0 once the count reaches 32 */
+void __aeabi_llsr(double_unsigned_struct val, int shift)
+{
+    aeabi_lsr(val, shift, 0, double_unsigned);
+}
+
+/* Arithmetic shift right of a low/high word pair: once the count reaches 32
+   the high word is replaced by val.high >> 31.
+   NOTE(review): this is a sign fill only if the `high` member of
+   unsigned_int_struct is a signed int -- confirm against its declaration. */
+void __aeabi_lasr(unsigned_int_struct val, int shift)
+{
+    aeabi_lsr(val, shift, val.high >> 31, unsigned_int);
+}
+
+
+/* Integer division functions */
+
+/* aeabi_uidivmod(): unsigned 32-bit quotient and remainder */
+AEABI_UXDIVMOD(uidivmod, unsigned, uidiv_t, UINT)
+
+/* Signed integer division: returns numerator / denominator.
+
+   The magnitudes of both operands are divided with aeabi_uidivmod() and the
+   quotient is negated when the operand signs differ.  The magnitudes are
+   computed with unsigned arithmetic so that negating INT_MIN does not
+   overflow.  A zero denominator is not handled. */
+int __aeabi_idiv(int numerator, int denominator)
+{
+    unsigned num, den;
+    uidiv_t ret;
+
+    if (numerator >= 0)
+        num = numerator;
+    else
+        num = 0 - (unsigned) numerator;
+    if (denominator >= 0)
+        den = denominator;
+    else
+        den = 0 - (unsigned) denominator;
+    ret = aeabi_uidivmod(num, den);
+    if ((numerator & INT_MIN) != (denominator & INT_MIN)) /* signs differ */
+        ret.quot *= -1;
+    return ret.quot;
+}
+
+/* Unsigned integer division: returns the quotient of num / den as computed
+   by aeabi_uidivmod(); the remainder is discarded. */
+unsigned __aeabi_uidiv(unsigned num, unsigned den)
+{
+    uidiv_t result = aeabi_uidivmod(num, den);
+
+    return result.quot;
+}
+
+/* __aeabi_idivmod(): signed 32-bit division built on aeabi_uidivmod() */
+__AEABI_XDIVMOD(idivmod, int, uidivmod, idiv_t, uidiv_t, INT)
+
+/* Unsigned 32-bit division with remainder: computes the quotient/remainder
+   pair of num / den with aeabi_uidivmod() and passes it to
+   uidiv_t_return().  The function itself is declared void;
+   NOTE(review): the result presumably reaches the caller through the
+   registers set up for the uidiv_t_return() call -- confirm against the
+   REGS_RETURN definition. */
+void __aeabi_uidivmod(unsigned num, unsigned den)
+{
+    uidiv_t_return(aeabi_uidivmod(num, den));
+}
--- a/lib/armflush.c
+++ b/lib/armflush.c
@ -0,0 +1,58 @@
+/* armflush.c - flush the instruction cache
+
+   __clear_cache is used in tccrun.c.  It is a built-in
+   intrinsic with gcc.  However, tcc needs this function
+   in order to compile itself. */
+
+#ifdef __TINYC__
+
+/* syscall wrapper: the first argument is the syscall number, the following
+   ones are the arguments of the system call */
+unsigned syscall(unsigned syscall_nr, ...);
+
+/* arm-tcc supports only fake asm currently, so the routine is emitted as
+   raw instruction words; the mnemonic of each word is given next to it.
+   The syscall number is moved from r0 to r7 and the next three arguments
+   are shifted down into r0-r2 before the svc instruction, i.e. only three
+   system call arguments are forwarded. */
+__asm__(
+    ".global syscall\n"
+    "syscall:\n"
+    ".int 0xe92d4080\n"  // push    {r7, lr}
+    ".int 0xe1a07000\n"  // mov     r7, r0
+    ".int 0xe1a00001\n"  // mov     r0, r1
+    ".int 0xe1a01002\n"  // mov     r1, r2
+    ".int 0xe1a02003\n"  // mov     r2, r3
+    ".int 0xef000000\n"  // svc     0x00000000
+    ".int 0xe8bd8080\n"  // pop     {r7, pc}
+    );
+
+/* from unistd.h: */
+#if defined(__thumb__) || defined(__ARM_EABI__)
+# define __NR_SYSCALL_BASE      0x0
+#else
+# define __NR_SYSCALL_BASE      0x900000
+#endif
+#define __ARM_NR_BASE           (__NR_SYSCALL_BASE+0x0f0000)
+#define __ARM_NR_cacheflush     (__ARM_NR_BASE+2)
+
+#else
+
+#define _GNU_SOURCE
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <stdio.h>
+
+#endif
+
+/* Flushing for tccrun: issue the __ARM_NR_cacheflush system call with
+   `beginning` and `end` as its first two arguments and 0 as the third.
+   The return value of the system call is ignored. */
+void __clear_cache(void *beginning, void *end)
+{
+/* __ARM_NR_cacheflush is kernel private and should not be used in user space.
+ * However, there is no ARM asm parser in tcc so we use it for now */
+#if 1
+    syscall(__ARM_NR_cacheflush, beginning, end, 0);
+#else
+    /* disabled inline-assembly variant of the same system call */
+    __asm__ ("push {r7}\n\t"
+             "mov r7, #0xf0002\n\t"
+             "mov r2, #0\n\t"
+             "swi 0\n\t"
+             "pop {r7}\n\t"
+             "ret");
+#endif
+}
--- a/lib/bcheck.c
+++ b/lib/bcheck.c
@ -21,60 +21,83 @@
 #include <stdio.h>
 #include <stdarg.h>
 #include <string.h>
-#if !defined(__FreeBSD__) && !defined(__DragonFly__) && !defined(__OpenBSD__)
+
+#if !defined(__FreeBSD__) \
+ && !defined(__FreeBSD_kernel__) \
+ && !defined(__DragonFly__) \
+ && !defined(__OpenBSD__) \
+ && !defined(__NetBSD__)
 #include <malloc.h>
 #endif

-//#define BOUND_DEBUG
+#if !defined(_WIN32)
+#include <unistd.h>
+#endif
+
+/* #define BOUND_DEBUG */
+
+#ifdef BOUND_DEBUG
+ #define dprintf(a...) fprintf(a)
+#else
+ #define dprintf(a...)
+#endif

 /* define so that bound array is static (faster, but use memory if
   bound checking not used) */
-//#define BOUND_STATIC
+/* #define BOUND_STATIC */

 /* use malloc hooks. Currently the code cannot be reliable if no hooks */
 #define CONFIG_TCC_MALLOC_HOOKS
-
 #define HAVE_MEMALIGN

-#if defined(__FreeBSD__) || defined(__DragonFly__) || defined(__dietlibc__) \
-    || defined(__UCLIBC__) || defined(__OpenBSD__)
-#warning Bound checking not fully supported in this environment.
+#if defined(__FreeBSD__) \
+ || defined(__FreeBSD_kernel__) \
+ || defined(__DragonFly__) \
+ || defined(__OpenBSD__) \
+ || defined(__NetBSD__) \
+ || defined(__dietlibc__) \
+ || defined(_WIN32)
+//#warning Bound checking does not support malloc (etc.) in this environment.
 #undef CONFIG_TCC_MALLOC_HOOKS
 #undef HAVE_MEMALIGN
 #endif

 #define BOUND_T1_BITS 13
 #define BOUND_T2_BITS 11
-#define BOUND_T3_BITS (32 - BOUND_T1_BITS - BOUND_T2_BITS)
+#define BOUND_T3_BITS (sizeof(size_t)*8 - BOUND_T1_BITS - BOUND_T2_BITS)
+#define BOUND_E_BITS  (sizeof(size_t))

-#define BOUND_T1_SIZE (1 << BOUND_T1_BITS)
-#define BOUND_T2_SIZE (1 << BOUND_T2_BITS)
-#define BOUND_T3_SIZE (1 << BOUND_T3_BITS)
-#define BOUND_E_BITS  4
+#define BOUND_T1_SIZE ((size_t)1 << BOUND_T1_BITS)
+#define BOUND_T2_SIZE ((size_t)1 << BOUND_T2_BITS)
+#define BOUND_T3_SIZE ((size_t)1 << BOUND_T3_BITS)

 #define BOUND_T23_BITS (BOUND_T2_BITS + BOUND_T3_BITS)
-#define BOUND_T23_SIZE (1 << BOUND_T23_BITS)
+#define BOUND_T23_SIZE ((size_t)1 << BOUND_T23_BITS)


 /* this pointer is generated when bound check is incorrect */
 #define INVALID_POINTER ((void *)(-2))
 /* size of an empty region */
-#define EMPTY_SIZE        0xffffffff
+#define EMPTY_SIZE  ((size_t)(-1))
 /* size of an invalid region */
 #define INVALID_SIZE      0

 typedef struct BoundEntry {
-    unsigned long start;
-    unsigned long size;
+    size_t start;
+    size_t size;
    struct BoundEntry *next;
-    unsigned long is_invalid; /* true if pointers outside region are invalid */
+    size_t is_invalid; /* true if pointers outside region are invalid */
 } BoundEntry;

 /* external interface */
 void __bound_init(void);
-void __bound_new_region(void *p, unsigned long size);
+void __bound_new_region(void *p, size_t size);
 int __bound_delete_region(void *p);

+#ifdef __attribute__
+  /* an __attribute__ macro is defined in the system headers */
+  #undef __attribute__ 
+#endif
 #define FASTCALL __attribute__((regparm(3)))

 void *__bound_malloc(size_t size, const void *caller);
@ -93,16 +116,13 @@ static void *saved_realloc_hook;
 static void *saved_memalign_hook;
 #endif

-/* linker definitions */
-extern char _end;
-
 /* TCC definitions */
 extern char __bounds_start; /* start of static bounds table */
 /* error message, just for TCC */
 const char *__bound_error_msg;

 /* runtime error output */
-extern void rt_error(unsigned long pc, const char *fmt, ...);
+extern void rt_error(size_t pc, const char *fmt, ...);

 #ifdef BOUND_STATIC
 static BoundEntry *__bound_t1[BOUND_T1_SIZE]; /* page table */
@ -114,12 +134,12 @@ static BoundEntry *__bound_invalid_t2; /* invalid page, for invalid pointers */

 static BoundEntry *__bound_find_region(BoundEntry *e1, void *p)
 {
-    unsigned long addr, tmp;
+    size_t addr, tmp;
    BoundEntry *e;

    e = e1;
    while (e != NULL) {
-        addr = (unsigned long)p;
+        addr = (size_t)p;
        addr -= e->start;
        if (addr <= e->size) {
            /* put region at the head */
@ -144,7 +164,8 @@ static BoundEntry *__bound_find_region(BoundEntry *e1, void *p)
 static void bound_error(const char *fmt, ...)
 {
    __bound_error_msg = fmt;
-    *(int *)0 = 0; /* force a runtime error */
+    fprintf(stderr,"%s %s: %s\n", __FILE__, __FUNCTION__, fmt);
+    *(void **)0 = 0; /* force a runtime error */
 }

 static void bound_alloc_error(void)
@ -152,18 +173,17 @@ static void bound_alloc_error(void)
    bound_error("not enough memory for bound checking code");
 }

-/* currently, tcc cannot compile that because we use GNUC extensions */
-#if !defined(__TINYC__)
-
 /* return '(p + offset)' for pointer arithmetic (a pointer can reach
   the end of a region in this case */
-void * FASTCALL __bound_ptr_add(void *p, int offset)
+void * FASTCALL __bound_ptr_add(void *p, size_t offset)
 {
-    unsigned long addr = (unsigned long)p;
+    size_t addr = (size_t)p;
    BoundEntry *e;
-#if defined(BOUND_DEBUG)
-    printf("add: 0x%x %d\n", (int)p, offset);
-#endif
+
+    dprintf(stderr, "%s %s: %p %x\n",
+        __FILE__, __FUNCTION__, p, (unsigned)offset);
+
+    __bound_init();

    e = __bound_t1[addr >> (BOUND_T2_BITS + BOUND_T3_BITS)];
    e = (BoundEntry *)((char *)e + 
@ -172,22 +192,29 @@ void * FASTCALL __bound_ptr_add(void *p, int offset)
    addr -= e->start;
    if (addr > e->size) {
        e = __bound_find_region(e, p);
-        addr = (unsigned long)p - e->start;
+        addr = (size_t)p - e->start;
    }
    addr += offset;
-    if (addr > e->size)
+    if (addr >= e->size) {
+	fprintf(stderr,"%s %s: %p is outside of the region\n",
+            __FILE__, __FUNCTION__, p + offset);
        return INVALID_POINTER; /* return an invalid pointer */
+    }
    return p + offset;
 }

 /* return '(p + offset)' for pointer indirection (the resulting must
   be strictly inside the region */
 #define BOUND_PTR_INDIR(dsize)                                          \
-void * FASTCALL __bound_ptr_indir ## dsize (void *p, int offset)        \
+void * FASTCALL __bound_ptr_indir ## dsize (void *p, size_t offset)     \
 {                                                                       \
-    unsigned long addr = (unsigned long)p;                              \
+    size_t addr = (size_t)p;                                            \
    BoundEntry *e;                                                      \
                                                                        \
+    dprintf(stderr, "%s %s: %p %x start\n",                             \
+        __FILE__, __FUNCTION__, p, (unsigned)offset);	                \
+									\
+    __bound_init();							\
    e = __bound_t1[addr >> (BOUND_T2_BITS + BOUND_T3_BITS)];            \
    e = (BoundEntry *)((char *)e +                                      \
                       ((addr >> (BOUND_T3_BITS - BOUND_E_BITS)) &      \
@ -195,30 +222,47 @@ void * FASTCALL __bound_ptr_indir ## dsize (void *p, int offset)        \
    addr -= e->start;                                                   \
    if (addr > e->size) {                                               \
        e = __bound_find_region(e, p);                                  \
-        addr = (unsigned long)p - e->start;                             \
+        addr = (size_t)p - e->start;                                    \
    }                                                                   \
    addr += offset + dsize;                                             \
-    if (addr > e->size)                                                 \
+    if (addr > e->size) {                                               \
+	fprintf(stderr,"%s %s: %p is outside of the region\n",          \
+            __FILE__, __FUNCTION__, p + offset);                        \
        return INVALID_POINTER; /* return an invalid pointer */         \
+    }									\
+    dprintf(stderr, "%s %s: return p+offset = %p\n",                    \
+        __FILE__, __FUNCTION__, p + offset);                            \
    return p + offset;                                                  \
 }

-#ifdef __i386__
+BOUND_PTR_INDIR(1)
+BOUND_PTR_INDIR(2)
+BOUND_PTR_INDIR(4)
+BOUND_PTR_INDIR(8)
+BOUND_PTR_INDIR(12)
+BOUND_PTR_INDIR(16)
+
+#if defined(__GNUC__) && (__GNUC__ >= 6)
+/*
+ * At least gcc 6.2 complains when __builtin_frame_address is used with
+ * nonzero argument.
+ */
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wframe-address"
+#endif
+
 /* return the frame pointer of the caller */
 #define GET_CALLER_FP(fp)\
 {\
-    unsigned long *fp1;\
-    __asm__ __volatile__ ("movl %%ebp,%0" :"=g" (fp1));\
-    fp = fp1[0];\
+    fp = (size_t)__builtin_frame_address(1);\
 }
-#else
-#error put code to extract the calling frame pointer
-#endif

 /* called when entering a function to add all the local regions */
 void FASTCALL __bound_local_new(void *p1) 
 {
-    unsigned long addr, size, fp, *p = p1;
+    size_t addr, size, fp, *p = p1;
+
+    dprintf(stderr, "%s, %s start p1=%p\n", __FILE__, __FUNCTION__, p);
    GET_CALLER_FP(fp);
    for(;;) {
        addr = p[0];
@ -229,12 +273,13 @@ void FASTCALL __bound_local_new(void *p1)
        p += 2;
        __bound_new_region((void *)addr, size);
    }
+    dprintf(stderr, "%s, %s end\n", __FILE__, __FUNCTION__);
 }

 /* called when leaving a function to delete all the local regions */
 void FASTCALL __bound_local_delete(void *p1) 
 {
-    unsigned long addr, fp, *p = p1;
+    size_t addr, fp, *p = p1;
    GET_CALLER_FP(fp);
    for(;;) {
        addr = p[0];
@ -246,38 +291,14 @@ void FASTCALL __bound_local_delete(void *p1)
    }
 }

-#else
-
-void __bound_local_new(void *p) 
-{
-}
-void __bound_local_delete(void *p) 
-{
-}
-
-void *__bound_ptr_add(void *p, int offset)
-{
-    return p + offset;
-}
-
-#define BOUND_PTR_INDIR(dsize)                               \
-void *__bound_ptr_indir ## dsize (void *p, int offset)       \
-{                                                            \
-    return p + offset;                                       \
-}
+#if defined(__GNUC__) && (__GNUC__ >= 6)
+#pragma GCC diagnostic pop
 #endif

-BOUND_PTR_INDIR(1)
-BOUND_PTR_INDIR(2)
-BOUND_PTR_INDIR(4)
-BOUND_PTR_INDIR(8)
-BOUND_PTR_INDIR(12)
-BOUND_PTR_INDIR(16)
-
 static BoundEntry *__bound_new_page(void)
 {
    BoundEntry *page;
-    int i;
+    size_t i;

    page = libc_malloc(sizeof(BoundEntry) * BOUND_T2_SIZE);
    if (!page)
@ -305,11 +326,11 @@ static void bound_free_entry(BoundEntry *e)
    libc_free(e);
 }

-static inline BoundEntry *get_page(int index)
+static BoundEntry *get_page(size_t index)
 {
    BoundEntry *page;
    page = __bound_t1[index];
-    if (page == __bound_empty_t2 || page == __bound_invalid_t2) {
+    if (!page || page == __bound_empty_t2 || page == __bound_invalid_t2) {
        /* create a new page if necessary */
        page = __bound_new_page();
        __bound_t1[index] = page;
@ -318,11 +339,11 @@ static inline BoundEntry *get_page(int index)
 }

 /* mark a region as being invalid (can only be used during init) */
-static void mark_invalid(unsigned long addr, unsigned long size)
+static void mark_invalid(size_t addr, size_t size)
 {
-    unsigned long start, end;
+    size_t start, end;
    BoundEntry *page;
-    int t1_start, t1_end, i, j, t2_start, t2_end;
+    size_t t1_start, t1_end, i, j, t2_start, t2_end;

    start = addr;
    end = addr + size;
@ -334,7 +355,7 @@ static void mark_invalid(unsigned long addr, unsigned long size)
        t2_end = 1 << (BOUND_T1_BITS + BOUND_T2_BITS);

 #if 0
-    printf("mark_invalid: start = %x %x\n", t2_start, t2_end);
+    dprintf(stderr, "mark_invalid: start = %x %x\n", t2_start, t2_end);
 #endif
    
    /* first we handle full pages */
@ -373,10 +394,18 @@ static void mark_invalid(unsigned long addr, unsigned long size)

 void __bound_init(void)
 {
-    int i;
+    size_t i;
    BoundEntry *page;
-    unsigned long start, size;
-    int *p;
+    size_t start, size;
+    size_t *p;
+
+    static int inited;
+    if (inited)
+	return;
+
+    inited = 1;
+
+    dprintf(stderr, "%s, %s() start\n", __FILE__, __FUNCTION__);

    /* save malloc hooks and install bound check hooks */
    install_malloc_hooks();
@ -402,34 +431,71 @@ void __bound_init(void)
    __bound_invalid_t2 = page;

    /* invalid pointer zone */
-    start = (unsigned long)INVALID_POINTER & ~(BOUND_T23_SIZE - 1);
+    start = (size_t)INVALID_POINTER & ~(BOUND_T23_SIZE - 1);
    size = BOUND_T23_SIZE;
    mark_invalid(start, size);

-#if !defined(__TINYC__) && defined(CONFIG_TCC_MALLOC_HOOKS)
+#if defined(CONFIG_TCC_MALLOC_HOOKS)
    /* malloc zone is also marked invalid. can only use that with
-       hooks because all libs should use the same malloc. The solution
-       would be to build a new malloc for tcc. */
-    start = (unsigned long)&_end;
+     * hooks because all libs should use the same malloc. The solution
+     * would be to build a new malloc for tcc.
+     *
+     * usually heap (= malloc zone) comes right after bss, i.e. after _end, but
+     * not always - either if we are running from under `tcc -b -run`, or if
+     * address space randomization is turned on(a), heap start will be separated
+     * from bss end.
+     *
+     * So sbrk(0) will be a good approximation for start_brk:
+     *
+     *   - if we are a separately compiled program, __bound_init() runs early,
+     *     and sbrk(0) should be equal or very near to start_brk(b) (in case other
+     *     constructors malloc something), or
+     *
+     *   - if we are running from under `tcc -b -run`, sbrk(0) will return
+     *     start of heap portion which is under this program control, and not
+     *     mark as invalid earlier allocated memory.
+     *
+     *
+     * (a) /proc/sys/kernel/randomize_va_space = 2, on Linux;
+     *     usually turned on by default.
+     *
+     * (b) on Linux >= v3.3, the alternative is to read
+     *     start_brk from /proc/self/stat
+     */
+    start = (size_t)sbrk(0);
    size = 128 * 0x100000;
    mark_invalid(start, size);
 #endif

    /* add all static bound check values */
-    p = (int *)&__bounds_start;
+    p = (size_t *)&__bounds_start;
    while (p[0] != 0) {
        __bound_new_region((void *)p[0], p[1]);
        p += 2;
    }
+
+    dprintf(stderr, "%s, %s() end\n\n", __FILE__, __FUNCTION__);
+}
+
+/* Register the NULL-terminated pointer array starting at p, terminator
+   included, as one region for the bound checker.
+   NOTE(review): going by the name this is presumably called with main()'s
+   argument vectors -- confirm against the caller. */
+void __bound_main_arg(void **p)
+{
+    void *start = p;
+    size_t len;
+
+    /* step over every entry up to and including the terminating NULL */
+    for (;;) {
+        if (!*p++)
+            break;
+    }
+    len = (char *)p - (char *)start;
+
+    dprintf(stderr, "%s, %s calling __bound_new_region(%p %x)\n",
+            __FILE__, __FUNCTION__, start, (unsigned)len);
+
+    __bound_new_region(start, len);
+}

 void __bound_exit(void)
 {
+    dprintf(stderr, "%s, %s()\n", __FILE__, __FUNCTION__);
    restore_malloc_hooks();
 }

 static inline void add_region(BoundEntry *e, 
-                              unsigned long start, unsigned long size)
+                              size_t start, size_t size)
 {
    BoundEntry *e1;
    if (e->start == 0) {
@ -449,13 +515,18 @@ static inline void add_region(BoundEntry *e,
 }

 /* create a new region. It should not already exist in the region list */
-void __bound_new_region(void *p, unsigned long size)
+void __bound_new_region(void *p, size_t size)
 {
-    unsigned long start, end;
+    size_t start, end;
    BoundEntry *page, *e, *e2;
-    int t1_start, t1_end, i, t2_start, t2_end;
+    size_t t1_start, t1_end, i, t2_start, t2_end;

-    start = (unsigned long)p;
+    dprintf(stderr, "%s, %s(%p, %x) start\n",
+        __FILE__, __FUNCTION__, p, (unsigned)size);
+
+    __bound_init();
+
+    start = (size_t)p;
    end = start + size;
    t1_start = start >> (BOUND_T2_BITS + BOUND_T3_BITS);
    t1_end = end >> (BOUND_T2_BITS + BOUND_T3_BITS);
@ -466,10 +537,7 @@ void __bound_new_region(void *p, unsigned long size)
        ((BOUND_T2_SIZE - 1) << BOUND_E_BITS);
    t2_end = (end >> (BOUND_T3_BITS - BOUND_E_BITS)) & 
        ((BOUND_T2_SIZE - 1) << BOUND_E_BITS);
-#ifdef BOUND_DEBUG
-    printf("new %lx %lx %x %x %x %x\n", 
-           start, end, t1_start, t1_end, t2_start, t2_end);
-#endif
+

    e = (BoundEntry *)((char *)page + t2_start);
    add_region(e, start, size);
@ -511,16 +579,17 @@ void __bound_new_region(void *p, unsigned long size)
        }
        add_region(e, start, size);
    }
+
+    dprintf(stderr, "%s, %s end\n", __FILE__, __FUNCTION__);
 }

 /* delete a region */
-static inline void delete_region(BoundEntry *e, 
-                                 void *p, unsigned long empty_size)
+static inline void delete_region(BoundEntry *e, void *p, size_t empty_size)
 {
-    unsigned long addr;
+    size_t addr;
    BoundEntry *e1;

-    addr = (unsigned long)p;
+    addr = (size_t)p;
    addr -= e->start;
    if (addr <= e->size) {
        /* region found is first one */
@ -544,7 +613,7 @@ static inline void delete_region(BoundEntry *e,
            /* region not found: do nothing */
            if (e == NULL)
                break;
-            addr = (unsigned long)p - e->start;
+            addr = (size_t)p - e->start;
            if (addr <= e->size) {
                /* found: remove entry */
                e1->next = e->next;
@ -559,11 +628,15 @@ static inline void delete_region(BoundEntry *e,
 /* return non zero if error */
 int __bound_delete_region(void *p)
 {
-    unsigned long start, end, addr, size, empty_size;
+    size_t start, end, addr, size, empty_size;
    BoundEntry *page, *e, *e2;
-    int t1_start, t1_end, t2_start, t2_end, i;
+    size_t t1_start, t1_end, t2_start, t2_end, i;

-    start = (unsigned long)p;
+    dprintf(stderr, "%s %s() start\n", __FILE__, __FUNCTION__);
+
+    __bound_init();
+
+    start = (size_t)p;
    t1_start = start >> (BOUND_T2_BITS + BOUND_T3_BITS);
    t2_start = (start >> (BOUND_T3_BITS - BOUND_E_BITS)) & 
        ((BOUND_T2_SIZE - 1) << BOUND_E_BITS);
@ -575,7 +648,7 @@ int __bound_delete_region(void *p)
    if (addr > e->size)
        e = __bound_find_region(e, p);
    /* test if invalid region */
-    if (e->size == EMPTY_SIZE || (unsigned long)p != e->start) 
+    if (e->size == EMPTY_SIZE || (size_t)p != e->start) 
        return -1;
    /* compute the size we put in invalid regions */
    if (e->is_invalid)
@ -621,7 +694,7 @@ int __bound_delete_region(void *p)
            }
        }
        /* last page */
-        page = get_page(t2_end);
+        page = get_page(t1_end);
        e2 = (BoundEntry *)((char *)page + t2_end);
        for(e=page;e<e2;e++) {
            e->start = 0;
@ -629,14 +702,17 @@ int __bound_delete_region(void *p)
        }
        delete_region(e, p, empty_size);
    }
+
+    dprintf(stderr, "%s %s() end\n", __FILE__, __FUNCTION__);
+
    return 0;
 }

 /* return the size of the region starting at p, or EMPTY_SIZE if non
-   existant region. */
-static unsigned long get_region_size(void *p)
+   existent region. */
+static size_t get_region_size(void *p)
 {
-    unsigned long addr = (unsigned long)p;
+    size_t addr = (size_t)p;
    BoundEntry *e;

    e = __bound_t1[addr >> (BOUND_T2_BITS + BOUND_T3_BITS)];
@ -646,13 +722,16 @@ static unsigned long get_region_size(void *p)
    addr -= e->start;
    if (addr > e->size)
        e = __bound_find_region(e, p);
-    if (e->start != (unsigned long)p)
+    if (e->start != (size_t)p)
        return EMPTY_SIZE;
    return e->size;
 }

 /* patched memory functions */

+/* force compiler to perform stores coded up to this point */
+#define barrier()   __asm__ __volatile__ ("": : : "memory")
+
 static void install_malloc_hooks(void)
 {
 #ifdef CONFIG_TCC_MALLOC_HOOKS
@ -664,6 +743,8 @@ static void install_malloc_hooks(void)
    __free_hook = __bound_free;
    __realloc_hook = __bound_realloc;
    __memalign_hook = __bound_memalign;
+
+    barrier();
 #endif
 }

@ -674,6 +755,8 @@ static void restore_malloc_hooks(void)
    __free_hook = saved_free_hook;
    __realloc_hook = saved_realloc_hook;
    __memalign_hook = saved_memalign_hook;
+
+    barrier();
 #endif
 }

@ -707,6 +790,10 @@ void *__bound_malloc(size_t size, const void *caller)
    
    if (!ptr)
        return NULL;
+
+    dprintf(stderr, "%s, %s calling __bound_new_region(%p, %x)\n",
+           __FILE__, __FUNCTION__, ptr, (unsigned)size);
+
    __bound_new_region(ptr, size);
    return ptr;
 }
@ -736,6 +823,10 @@ void *__bound_memalign(size_t size, size_t align, const void *caller)
    
    if (!ptr)
        return NULL;
+
+    dprintf(stderr, "%s, %s calling __bound_new_region(%p, %x)\n",
+           __FILE__, __FUNCTION__, ptr, (unsigned)size);
+
    __bound_new_region(ptr, size);
    return ptr;
 }
@ -753,7 +844,7 @@ void __bound_free(void *ptr, const void *caller)
 void *__bound_realloc(void *ptr, size_t size, const void *caller)
 {
    void *ptr1;
-    int old_size;
+    size_t old_size;

    if (size == 0) {
        __bound_free(ptr, caller);
@ -788,23 +879,23 @@ void *__bound_calloc(size_t nmemb, size_t size)
 static void bound_dump(void)
 {
    BoundEntry *page, *e;
-    int i, j;
+    size_t i, j;

-    printf("region dump:\n");
+    fprintf(stderr, "region dump:\n");
    for(i=0;i<BOUND_T1_SIZE;i++) {
        page = __bound_t1[i];
        for(j=0;j<BOUND_T2_SIZE;j++) {
            e = page + j;
            /* do not print invalid or empty entries */
            if (e->size != EMPTY_SIZE && e->start != 0) {
-                printf("%08x:", 
+                fprintf(stderr, "%08x:", 
                       (i << (BOUND_T2_BITS + BOUND_T3_BITS)) + 
                       (j << BOUND_T3_BITS));
                do {
-                    printf(" %08lx:%08lx", e->start, e->start + e->size);
+                    fprintf(stderr, " %08lx:%08lx", e->start, e->start + e->size);
                    e = e->next;
                } while (e != NULL);
-                printf("\n");
+                fprintf(stderr, "\n");
            }
        }
    }
@ -818,19 +909,28 @@ static void __bound_check(const void *p, size_t size)
 {
    if (size == 0)
        return;
-    p = __bound_ptr_add((void *)p, size);
+    p = __bound_ptr_add((void *)p, size - 1);
    if (p == INVALID_POINTER)
        bound_error("invalid pointer");
 }

 void *__bound_memcpy(void *dst, const void *src, size_t size)
 {
+    void* p;
+
+    dprintf(stderr, "%s %s: start, dst=%p src=%p size=%x\n",
+            __FILE__, __FUNCTION__, dst, src, (unsigned)size);
+
    __bound_check(dst, size);
    __bound_check(src, size);
    /* check also region overlap */
    if (src >= dst && src < dst + size)
        bound_error("overlapping regions in memcpy()");
-    return memcpy(dst, src, size);
+
+    p = memcpy(dst, src, size);
+
+    dprintf(stderr, "%s %s: end, p=%p\n", __FILE__, __FUNCTION__, p);
+    return p;
 }

 void *__bound_memmove(void *dst, const void *src, size_t size)
@ -850,7 +950,7 @@ void *__bound_memset(void *dst, int c, size_t size)
 int __bound_strlen(const char *s)
 {
    const char *p;
-    int len;
+    size_t len;

    len = 0;
    for(;;) {
@ -866,8 +966,14 @@ int __bound_strlen(const char *s)

 char *__bound_strcpy(char *dst, const char *src)
 {
-    int len;
-    len = __bound_strlen(src);
-    return __bound_memcpy(dst, src, len + 1);
-}
+    size_t len;
+    void *p;

+    dprintf(stderr, "%s %s: strcpy start, dst=%p src=%p\n",
+            __FILE__, __FUNCTION__, dst, src);
+    len = __bound_strlen(src);
+    p = __bound_memcpy(dst, src, len + 1);
+    dprintf(stderr, "%s %s: strcpy end, p = %p\n",
+            __FILE__, __FUNCTION__, p);
+    return p;
+}
--- a/lib/lib-arm64.c
+++ b/lib/lib-arm64.c
@ -0,0 +1,664 @@
+/*
+ *  TCC runtime library for arm64.
+ *
+ *  Copyright (c) 2015 Edmund Grimley Evans
+ *
+ * Copying and distribution of this file, with or without modification,
+ * are permitted in any medium without royalty provided the copyright
+ * notice and this notice are preserved.  This file is offered as-is,
+ * without any warranty.
+ */
+
+#ifdef __TINYC__
+typedef signed char int8_t;
+typedef unsigned char uint8_t;
+typedef short int16_t;
+typedef unsigned short uint16_t;
+typedef int int32_t;
+typedef unsigned uint32_t;
+typedef long long int64_t;
+typedef unsigned long long uint64_t;
+void *memcpy(void*,void*,__SIZE_TYPE__);
+#else
+#include <stdint.h>
+#include <string.h>
+#endif
+
+/* Cache-clearing entry point: forwards beg and end unchanged to
+   __arm64_clear_cache, which is not declared in this file (presumably a
+   compiler-provided builtin -- confirm against the arm64 code generator). */
+void __clear_cache(void *beg, void *end)
+{
+    __arm64_clear_cache(beg, end);
+}
+
+/* 128-bit container used to read and build long double bit patterns via
+   16-byte memcpy.  x0 is the first 64-bit word and is treated as the low
+   half; x1 is treated as the high half: the sign bit and the 15-bit
+   exponent are taken from the top of x1 (see f3_unpack). */
+typedef struct {
+    uint64_t x0, x1;
+} u128_t;
+
+/* Build a signed zero: every bit is clear except bit 63 of x1, which is
+   set from sgn (expected to be 0 or 1). */
+static long double f3_zero(int sgn)
+{
+    long double f;
+    u128_t x = { 0, (uint64_t)sgn << 63 };
+    memcpy(&f, &x, 16);
+    return f;
+}
+
+/* Build a signed infinity: exponent field all ones (0x7fff in bits 62..48
+   of x1), all fraction bits zero, sign bit set from sgn (0 or 1). */
+static long double f3_infinity(int sgn)
+{
+    long double f;
+    u128_t x = { 0, (uint64_t)sgn << 63 | 0x7fff000000000000 };
+    memcpy(&f, &x, 16);
+    return f;
+}
+
+/* Return the default NaN produced by invalid operations.  With the #else
+   branch active this is the pattern with the sign bit clear and the
+   exponent and every fraction bit set. */
+static long double f3_NaN(void)
+{
+    long double f;
+#if 0
+    // ARM's default NaN usually has just the top fraction bit set:
+    u128_t x = {  0, 0x7fff800000000000 };
+#else
+    // GCC's library sets all fraction bits:
+    u128_t x = { -1, 0x7fffffffffffffff };
+#endif
+    memcpy(&f, &x, 16);
+    return f;
+}
+
+/* Store into *f a NaN derived from the operand mantissa mnt: the payload
+   bits of mnt are kept, the exponent field is forced to all ones, bit 47
+   of x1 (the top fraction bit, i.e. the quiet bit) is set, and the sign is
+   taken from sgn.  Always returns 1 so callers can return its result as
+   the "a NaN was produced" flag. */
+static int fp3_convert_NaN(long double *f, int sgn, u128_t mnt)
+{
+    u128_t x = { mnt.x0,
+                 mnt.x1 | 0x7fff800000000000 | (uint64_t)sgn << 63 };
+    memcpy(f, &x, 16);
+    return 1;
+}
+
+/* NaN propagation for two-operand arithmetic.  a and b are unpacked
+   operands (sign, exponent, mantissa as produced by f3_unpack).  If either
+   is a NaN, the quieted NaN is written to *f and 1 is returned; otherwise
+   *f is left untouched and 0 is returned.
+   Priority order: signalling a, signalling b, quiet a, quiet b.
+   "x.x0 | x.x1 << 16" is non-zero exactly when some fraction bit is set
+   (the shift discards the top 16 bits of x1, including the implicit bit);
+   bit 47 of x1 distinguishes quiet (1) from signalling (0). */
+static int fp3_detect_NaNs(long double *f,
+                           int a_sgn, int a_exp, u128_t a,
+                           int b_sgn, int b_exp, u128_t b)
+{
+    // Detect signalling NaNs:
+    if (a_exp == 32767 && (a.x0 | a.x1 << 16) && !(a.x1 >> 47 & 1))
+        return fp3_convert_NaN(f, a_sgn, a);
+    if (b_exp == 32767 && (b.x0 | b.x1 << 16) && !(b.x1 >> 47 & 1))
+        return fp3_convert_NaN(f, b_sgn, b);
+
+    // Detect quiet NaNs:
+    if (a_exp == 32767 && (a.x0 | a.x1 << 16))
+        return fp3_convert_NaN(f, a_sgn, a);
+    if (b_exp == 32767 && (b.x0 | b.x1 << 16))
+        return fp3_convert_NaN(f, b_sgn, b);
+
+    return 0;
+}
+
+/* Split f into its fields:
+     *sgn  - bit 63 of the high word (0 or 1)
+     *exp  - the 15-bit exponent field, except that a stored exponent of 0
+             is reported as 1 (so zeros/denormals share the scale of the
+             smallest normal exponent)
+     *mnt  - the 112 fraction bits, with the implicit leading 1 inserted at
+             bit 48 of x1 when the stored exponent is non-zero. */
+static void f3_unpack(int *sgn, int32_t *exp, u128_t *mnt, long double f)
+{
+    u128_t x;
+    memcpy(&x, &f, 16);
+    *sgn = x.x1 >> 63;
+    *exp = x.x1 >> 48 & 32767;
+    /* clear sign and exponent, keeping only the fraction bits */
+    x.x1 = x.x1 << 16 >> 16;
+    if (*exp)
+        x.x1 |= (uint64_t)1 << 48;
+    else
+        *exp = 1;
+    *mnt = x;
+}
+
+/* Shift mnt left until bit 63 of x1 is set, subtracting the total shift
+   from *exp.  A zero mantissa is returned unchanged (and *exp untouched).
+   The shift is found by a whole-word move followed by a binary search over
+   shift amounts 32, 16, 8, 4, 2, 1. */
+static u128_t f3_normalise(int32_t *exp, u128_t mnt)
+{
+    int sh;
+    if (!(mnt.x0 | mnt.x1))
+        return mnt;
+    if (!mnt.x1) {
+        mnt.x1 = mnt.x0;
+        mnt.x0 = 0;
+        *exp -= 64;
+    }
+    for (sh = 32; sh; sh >>= 1) {
+        /* top sh bits all zero: shift them out */
+        if (!(mnt.x1 >> (64 - sh))) {
+            mnt.x1 = mnt.x1 << sh | mnt.x0 >> (64 - sh);
+            mnt.x0 = mnt.x0 << sh;
+            *exp -= sh;
+        }
+    }
+    return mnt;
+}
+
+/* Shift the 128-bit value x right by sh bits, OR-ing a "sticky" 1 into
+   bit 0 of the result whenever any non-zero bit was shifted out.  Shifts
+   of 128 or more collapse x to 0 or 1; sh <= 0 returns x unchanged.  The
+   64-bit step is done first so the remaining shift count is always in the
+   range 1..63 when the final branch runs. */
+static u128_t f3_sticky_shift(int32_t sh, u128_t x)
+{
+  if (sh >= 128) {
+      x.x0 = !!(x.x0 | x.x1);
+      x.x1 = 0;
+      return x;
+  }
+  if (sh >= 64) {
+      x.x0 = x.x1 | !!x.x0;
+      x.x1 = 0;
+      sh -= 64;
+  }
+  if (sh > 0) {
+      x.x0 = x.x0 >> sh | x.x1 << (64 - sh) | !!(x.x0 << (64 - sh));
+      x.x1 = x.x1 >> sh;
+  }
+  return x;
+}
+
+/* Round and pack a result.  sgn is the sign (0/1), exp the tentative
+   exponent and x the mantissa (callers pass it normalised with f3_normalise
+   or with bit 63 of x1 set).
+   - exp > 0: the mantissa is sticky-shifted right by 13; otherwise the
+     result is denormal: it is shifted by 14 - exp and exp becomes 0.
+   - The two lowest bits after that shift are the rounding information
+     (half bit and sticky bit); they are removed and the value is rounded
+     to nearest, ties to even.
+   - A rounding carry that reaches bit 48 of x1 turns a denormal into the
+     smallest normal (exp = 1); a carry that reaches bit 49 bumps exp and
+     shifts the mantissa back by one.
+   - exp >= 32767 overflows to a signed infinity.
+   The packed value keeps the low 48 bits of x1 as fraction. */
+static long double f3_round(int sgn, int32_t exp, u128_t x)
+{
+    long double f;
+    int error;
+
+    if (exp > 0) {
+        x = f3_sticky_shift(13, x);
+    }
+    else {
+        x = f3_sticky_shift(14 - exp, x);
+        exp = 0;
+    }
+
+    error = x.x0 & 3;
+    x.x0 = x.x0 >> 2 | x.x1 << 62;
+    x.x1 = x.x1 >> 2;
+
+    /* error == 3: above half; error == 2: exactly half, round up if odd */
+    if (error == 3 || ((error == 2) & (x.x0 & 1))) {
+        if (!++x.x0) {
+            ++x.x1;
+            if (x.x1 == (uint64_t)1 << 48)
+                exp = 1;
+            else if (x.x1 == (uint64_t)1 << 49) {
+                ++exp;
+                x.x0 = x.x0 >> 1 | x.x1 << 63;
+                x.x1 = x.x1 >> 1;
+            }
+        }
+    }
+
+    if (exp >= 32767)
+        return f3_infinity(sgn);
+
+    x.x1 = x.x1 << 16 >> 16 | (uint64_t)exp << 48 | (uint64_t)sgn << 63;
+    memcpy(&f, &x, 16);
+    return f;
+}
+
+/* Shared implementation of addition and subtraction: computes fa + fb
+   when neg is 0 and fa - fb when neg is 1 (neg is XOR-ed into b's sign
+   after NaN detection, so NaN propagation sees the original operand).
+   Special cases handled before the arithmetic:
+     - any NaN operand            -> propagated NaN
+     - inf + (-inf)               -> default NaN
+     - one infinite operand       -> that infinity
+     - both operands zero         -> zero, negative only if both are
+   An exact cancellation of non-zero operands returns +0. */
+static long double f3_add(long double fa, long double fb, int neg)
+{
+    u128_t a, b, x;
+    int32_t a_exp, b_exp, x_exp;
+    int a_sgn, b_sgn, x_sgn;
+    long double fx;
+
+    f3_unpack(&a_sgn, &a_exp, &a, fa);
+    f3_unpack(&b_sgn, &b_exp, &b, fb);
+
+    if (fp3_detect_NaNs(&fx, a_sgn, a_exp, a, b_sgn, b_exp, b))
+        return fx;
+
+    b_sgn ^= neg;
+
+    // Handle infinities and zeroes:
+    if (a_exp == 32767 && b_exp == 32767 && a_sgn != b_sgn)
+        return f3_NaN();
+    if (a_exp == 32767)
+        return f3_infinity(a_sgn);
+    if (b_exp == 32767)
+        return f3_infinity(b_sgn);
+    if (!(a.x0 | a.x1 | b.x0 | b.x1))
+        return f3_zero(a_sgn & b_sgn);
+
+    /* make room for three extra low-order bits before aligning */
+    a.x1 = a.x1 << 3 | a.x0 >> 61;
+    a.x0 = a.x0 << 3;
+    b.x1 = b.x1 << 3 | b.x0 >> 61;
+    b.x0 = b.x0 << 3;
+
+    /* align the operand with the smaller exponent to the larger one */
+    if (a_exp <= b_exp) {
+        a = f3_sticky_shift(b_exp - a_exp, a);
+        a_exp = b_exp;
+    }
+    else {
+        b = f3_sticky_shift(a_exp - b_exp, b);
+        b_exp = a_exp;
+    }
+
+    x_sgn = a_sgn;
+    x_exp = a_exp;
+    if (a_sgn == b_sgn) {
+        /* 128-bit add with carry from the low word */
+        x.x0 = a.x0 + b.x0;
+        x.x1 = a.x1 + b.x1 + (x.x0 < a.x0);
+    }
+    else {
+        /* 128-bit subtract with borrow; negate if the result went negative */
+        x.x0 = a.x0 - b.x0;
+        x.x1 = a.x1 - b.x1 - (x.x0 > a.x0);
+        if (x.x1 >> 63) {
+            x_sgn ^= 1;
+            x.x0 = -x.x0;
+            x.x1 = -x.x1 - !!x.x0;
+        }
+    }
+
+    if (!(x.x0 | x.x1))
+        return f3_zero(0);
+
+    x = f3_normalise(&x_exp, x);
+
+    return f3_round(x_sgn, x_exp + 12, x);
+}
+
+/* long double addition: a + b (f3_add with neg = 0). */
+long double __addtf3(long double a, long double b)
+{
+    return f3_add(a, b, 0);
+}
+
+/* long double subtraction: a - b (f3_add with neg = 1, which flips the
+   sign of b). */
+long double __subtf3(long double a, long double b)
+{
+    return f3_add(a, b, 1);
+}
+
+/* long double multiplication: fa * fb.
+   Special cases handled before the arithmetic:
+     - any NaN operand        -> propagated NaN
+     - infinity * zero        -> default NaN
+     - infinity * anything    -> infinity with the XOR of the signs
+     - zero * anything        -> zero with the XOR of the signs
+   Otherwise both mantissas are normalised, multiplied as base-2^30 digits
+   using only 64-bit arithmetic, and the top 128 bits of the product (with
+   a sticky bit for everything below) are handed to f3_round. */
+long double __multf3(long double fa, long double fb)
+{
+    u128_t a, b, x;
+    int32_t a_exp, b_exp, x_exp;
+    int a_sgn, b_sgn, x_sgn;
+    long double fx;
+
+    f3_unpack(&a_sgn, &a_exp, &a, fa);
+    f3_unpack(&b_sgn, &b_exp, &b, fb);
+
+    if (fp3_detect_NaNs(&fx, a_sgn, a_exp, a, b_sgn, b_exp, b))
+        return fx;
+
+    // Handle infinities and zeroes:
+    if ((a_exp == 32767 && !(b.x0 | b.x1)) ||
+        (b_exp == 32767 && !(a.x0 | a.x1)))
+        return f3_NaN();
+    if (a_exp == 32767 || b_exp == 32767)
+        return f3_infinity(a_sgn ^ b_sgn);
+    if (!(a.x0 | a.x1) || !(b.x0 | b.x1))
+        return f3_zero(a_sgn ^ b_sgn);
+
+    a = f3_normalise(&a_exp, a);
+    b = f3_normalise(&b_exp, b);
+
+    x_sgn = a_sgn ^ b_sgn;
+    x_exp = a_exp + b_exp - 16352;
+
+    {
+        // Convert to base (1 << 30), discarding bottom 6 bits, which are zero,
+        // so there are (32, 30, 30, 30) bits in (a3, a2, a1, a0):
+        uint64_t a0 = a.x0 << 28 >> 34;
+        uint64_t b0 = b.x0 << 28 >> 34;
+        uint64_t a1 = a.x0 >> 36 | a.x1 << 62 >> 34;
+        uint64_t b1 = b.x0 >> 36 | b.x1 << 62 >> 34;
+        uint64_t a2 = a.x1 << 32 >> 34;
+        uint64_t b2 = b.x1 << 32 >> 34;
+        uint64_t a3 = a.x1 >> 32;
+        uint64_t b3 = b.x1 >> 32;
+        // Use 16 small multiplications and additions that do not overflow:
+        uint64_t x0 = a0 * b0;
+        uint64_t x1 = (x0 >> 30) + a0 * b1 + a1 * b0;
+        uint64_t x2 = (x1 >> 30) + a0 * b2 + a1 * b1 + a2 * b0;
+        uint64_t x3 = (x2 >> 30) + a0 * b3 + a1 * b2 + a2 * b1 + a3 * b0;
+        uint64_t x4 = (x3 >> 30) + a1 * b3 + a2 * b2 + a3 * b1;
+        uint64_t x5 = (x4 >> 30) + a2 * b3 + a3 * b2;
+        uint64_t x6 = (x5 >> 30) + a3 * b3;
+        // We now have (64, 30, 30, ...) bits in (x6, x5, x4, ...).
+        // Take the top 128 bits, setting bottom bit if any lower bits were set:
+        uint64_t y0 = (x5 << 34 | x4 << 34 >> 30 | x3 << 34 >> 60 |
+                       !!(x3 << 38 | (x2 | x1 | x0) << 34));
+        uint64_t y1 = x6;
+        // Top bit may be zero. Renormalise:
+        if (!(y1 >> 63)) {
+            y1 = y1 << 1 | y0 >> 63;
+            y0 = y0 << 1;
+            --x_exp;
+        }
+        x.x0 = y0;
+        x.x1 = y1;
+    }
+
+    return f3_round(x_sgn, x_exp, x);
+}
+
+/* long double division: fa / fb.
+   Special cases handled before the arithmetic:
+     - any NaN operand              -> propagated NaN
+     - inf / inf or 0 / 0           -> default NaN
+     - inf / x or x / 0             -> infinity with the XOR of the signs
+     - 0 / x or x / inf             -> zero with the XOR of the signs
+   Otherwise both mantissas are normalised and the quotient is produced one
+   bit per iteration by compare-and-subtract over 116 iterations; a
+   non-zero remainder is folded into bit 0 as a sticky bit before the
+   result is normalised and rounded. */
+long double __divtf3(long double fa, long double fb)
+{
+    u128_t a, b, x;
+    int32_t a_exp, b_exp, x_exp;
+    int a_sgn, b_sgn, x_sgn, i;
+    long double fx;
+
+    f3_unpack(&a_sgn, &a_exp, &a, fa);
+    f3_unpack(&b_sgn, &b_exp, &b, fb);
+
+    if (fp3_detect_NaNs(&fx, a_sgn, a_exp, a, b_sgn, b_exp, b))
+        return fx;
+
+    // Handle infinities and zeroes:
+    if ((a_exp == 32767 && b_exp == 32767) ||
+        (!(a.x0 | a.x1) && !(b.x0 | b.x1)))
+        return f3_NaN();
+    if (a_exp == 32767 || !(b.x0 | b.x1))
+        return f3_infinity(a_sgn ^ b_sgn);
+    if (!(a.x0 | a.x1) || b_exp == 32767)
+        return f3_zero(a_sgn ^ b_sgn);
+
+    a = f3_normalise(&a_exp, a);
+    b = f3_normalise(&b_exp, b);
+
+    x_sgn = a_sgn ^ b_sgn;
+    x_exp = a_exp - b_exp + 16395;
+
+    /* drop one bit of each operand so the left shift of the remainder
+       inside the loop cannot lose its top bit */
+    a.x0 = a.x0 >> 1 | a.x1 << 63;
+    a.x1 = a.x1 >> 1;
+    b.x0 = b.x0 >> 1 | b.x1 << 63;
+    b.x1 = b.x1 >> 1;
+    x.x0 = 0;
+    x.x1 = 0;
+    for (i = 0; i < 116; i++) {
+        x.x1 = x.x1 << 1 | x.x0 >> 63;
+        x.x0 = x.x0 << 1;
+        /* remainder >= divisor: subtract and emit a 1 quotient bit */
+        if (a.x1 > b.x1 || (a.x1 == b.x1 && a.x0 >= b.x0)) {
+            a.x1 = a.x1 - b.x1 - (a.x0 < b.x0);
+            a.x0 = a.x0 - b.x0;
+            x.x0 |= 1;
+        }
+        a.x1 = a.x1 << 1 | a.x0 >> 63;
+        a.x0 = a.x0 << 1;
+    }
+    x.x0 |= !!(a.x0 | a.x1);
+
+    x = f3_normalise(&x_exp, x);
+
+    return f3_round(x_sgn, x_exp, x);
+}
+
+/* Convert a float to long double (exact; no rounding is needed).
+   The float is examined through its 32-bit image:
+     - +/-0            -> zero with the same sign
+     - infinity / NaN  -> exponent all ones; NaN payload is moved to the top
+                          of the fraction and the quiet bit (bit 47 of x1)
+                          is set
+     - subnormal       -> the fraction has no implicit leading 1, so it is
+                          normalised here and the exponent lowered to match
+     - normal          -> exponent rebiased by 16256 (= 16383 - 127) and the
+                          23 fraction bits moved to the top of the fraction */
+long double __extendsftf2(float f)
+{
+    long double fx;
+    u128_t x;
+    uint32_t a;
+    uint64_t aa;
+    memcpy(&a, &f, 4);
+    aa = a;
+    x.x0 = 0;
+    if (!(a << 1))
+        x.x1 = aa << 32;
+    else if (a << 1 >> 24 == 255)
+        x.x1 = (0x7fff000000000000 | aa >> 31 << 63 | aa << 41 >> 16 |
+                (uint64_t)!!(a << 9) << 47);
+    else if (!(a << 1 >> 24)) {
+        /* Subnormal: value is fraction * 2^-149.  Put the fraction at the
+           top of a 32-bit word; 16256 is the biased exponent the result
+           would have if fraction bit 22 were the leading 1.  Shift the
+           leading 1 up to bit 31, lowering the exponent once per step.
+           The fraction is non-zero here, so the loop terminates. */
+        uint32_t mnt = a << 9;
+        int exp = 16256;
+        while (!(mnt >> 31)) {
+            mnt <<= 1;
+            --exp;
+        }
+        /* mnt << 1 drops the now-implicit leading 1 */
+        x.x1 = (aa >> 31 << 63 | (uint64_t)exp << 48 |
+                (uint64_t)(uint32_t)(mnt << 1) << 16);
+    }
+    else
+        x.x1 = (aa >> 31 << 63 | ((aa >> 23 & 255) + 16256) << 48 |
+                aa << 41 >> 16);
+    memcpy(&fx, &x, 16);
+    return fx;
+}
+
+/* Convert a double to long double (exact; no rounding is needed).
+   The double is examined through its 64-bit image:
+     - +/-0            -> zero with the same sign
+     - infinity / NaN  -> exponent all ones; NaN payload is moved to the top
+                          of the fraction and the quiet bit (bit 47 of x1)
+                          is set
+     - subnormal       -> the fraction has no implicit leading 1, so it is
+                          normalised here and the exponent lowered to match
+     - normal          -> exponent rebiased by 15360 (= 16383 - 1023); the
+                          52 fraction bits are split between x1 (top 48)
+                          and the top of x0 (low 4) */
+long double __extenddftf2(double f)
+{
+    long double fx;
+    u128_t x;
+    uint64_t a;
+    memcpy(&a, &f, 8);
+    x.x0 = a << 60;
+    if (!(a << 1))
+        x.x1 = a;
+    else if (a << 1 >> 53 == 2047)
+        x.x1 = (0x7fff000000000000 | a >> 63 << 63 | a << 12 >> 16 |
+                (uint64_t)!!(a << 12) << 47);
+    else if (!(a << 1 >> 53)) {
+        /* Subnormal: value is fraction * 2^-1074.  Put the fraction at the
+           top of a 64-bit word; 15360 is the biased exponent the result
+           would have if fraction bit 51 were the leading 1.  Shift the
+           leading 1 up to bit 63, lowering the exponent once per step.
+           The fraction is non-zero here, so the loop terminates. */
+        uint64_t mnt = a << 12;
+        int exp = 15360;
+        while (!(mnt >> 63)) {
+            mnt <<= 1;
+            --exp;
+        }
+        mnt <<= 1;              /* drop the now-implicit leading 1 */
+        x.x0 = mnt << 48;
+        x.x1 = a >> 63 << 63 | (uint64_t)exp << 48 | mnt >> 16;
+    }
+    else
+        x.x1 = a >> 63 << 63 | ((a >> 52 & 2047) + 15360) << 48 | a << 12 >> 16;
+    memcpy(&fx, &x, 16);
+    return fx;
+}
+
+/* Convert a long double to float, rounding to nearest (ties to even).
+     - NaN              -> quiet float NaN (0x7fc00000) with the sign and
+                           the top payload bits carried over
+     - exp > 16510      -> signed infinity (this also covers infinity input)
+     - exp < 16233      -> signed zero
+     - otherwise        -> the mantissa is reduced to 26 bits (implicit bit,
+                           23 fraction bits, two rounding bits with sticky),
+                           shifted further for denormal results, rounded,
+                           and combined with the exponent by addition so a
+                           rounding carry increments the exponent. */
+float __trunctfsf2(long double f)
+{
+    u128_t mnt;
+    int32_t exp;
+    int sgn;
+    uint32_t x;
+    float fx;
+
+    f3_unpack(&sgn, &exp, &mnt, f);
+
+    if (exp == 32767 && (mnt.x0 | mnt.x1 << 16))
+        x = 0x7fc00000 | (uint32_t)sgn << 31 | (mnt.x1 >> 25 & 0x007fffff);
+    else if (exp > 16510)
+        x = 0x7f800000 | (uint32_t)sgn << 31;
+    else if (exp < 16233)
+        x = (uint32_t)sgn << 31;
+    else {
+        exp -= 16257;
+        x = mnt.x1 >> 23 | !!(mnt.x0 | mnt.x1 << 41);
+        if (exp < 0) {
+            x = x >> -exp | !!(x << (32 + exp));
+            exp = 0;
+        }
+        /* low two bits are the rounding bits: 11 -> up; 10 -> up if odd */
+        if ((x & 3) == 3 || (x & 7) == 6)
+            x += 4;
+        x = ((x >> 2) + (exp << 23)) | (uint32_t)sgn << 31;
+    }
+    memcpy(&fx, &x, 4);
+    return fx;
+}
+
+/* Convert a long double to double, rounding to nearest (ties to even).
+     - NaN              -> quiet double NaN (0x7ff8...) with the sign and
+                           the top payload bits carried over
+     - exp > 17406      -> signed infinity (this also covers infinity input)
+     - exp < 15308      -> signed zero
+     - otherwise        -> the mantissa is reduced to 55 bits (implicit bit,
+                           52 fraction bits, two rounding bits with sticky),
+                           shifted further for denormal results, rounded,
+                           and combined with the exponent by addition so a
+                           rounding carry increments the exponent. */
+double __trunctfdf2(long double f)
+{
+    u128_t mnt;
+    int32_t exp;
+    int sgn;
+    uint64_t x;
+    double fx;
+
+    f3_unpack(&sgn, &exp, &mnt, f);
+
+    if (exp == 32767 && (mnt.x0 | mnt.x1 << 16))
+        x = (0x7ff8000000000000 | (uint64_t)sgn << 63 |
+             mnt.x1 << 16 >> 12 | mnt.x0 >> 60);
+    else if (exp > 17406)
+        x = 0x7ff0000000000000 | (uint64_t)sgn << 63;
+    else if (exp < 15308)
+        x = (uint64_t)sgn << 63;
+    else {
+        exp -= 15361;
+        x = mnt.x1 << 6 | mnt.x0 >> 58 | !!(mnt.x0 << 6);
+        if (exp < 0) {
+            x = x >> -exp | !!(x << (64 + exp));
+            exp = 0;
+        }
+        /* low two bits are the rounding bits: 11 -> up; 10 -> up if odd */
+        if ((x & 3) == 3 || (x & 7) == 6)
+            x += 4;
+        x = ((x >> 2) + ((uint64_t)exp << 52)) | (uint64_t)sgn << 63;
+    }
+    memcpy(&fx, &x, 8);
+    return fx;
+}
+
+/* Convert a long double to int32_t, truncating toward zero.
+   Returns 0 when a_exp < 16369 and saturates to the most negative /
+   most positive 32-bit value (by sign) when a_exp > 16413; NaNs and
+   infinities take that saturating path because their exponent is 32767.
+   Otherwise the high mantissa word (implicit bit at bit 48) is shifted
+   right by 16431 - a_exp and negated if the sign bit was set. */
+int32_t __fixtfsi(long double fa)
+{
+    u128_t a;
+    int32_t a_exp;
+    int a_sgn;
+    int32_t x;
+    f3_unpack(&a_sgn, &a_exp, &a, fa);
+    if (a_exp < 16369)
+        return 0;
+    if (a_exp > 16413)
+        return a_sgn ? -0x80000000 : 0x7fffffff;
+    x = a.x1 >> (16431 - a_exp);
+    return a_sgn ? -x : x;
+}
+
+/* Convert a long double to int64_t, truncating toward zero.
+   Returns 0 when a_exp < 16383 (magnitude below 1) and saturates to the
+   most negative / most positive 64-bit value (by sign) when
+   a_exp > 16445; NaNs and infinities take that saturating path.
+   Otherwise the top 64 mantissa bits are assembled with the implicit bit
+   at bit 63, shifted right by 16446 - a_exp, and negated if the sign bit
+   was set. */
+int64_t __fixtfdi(long double fa)
+{
+    u128_t a;
+    int32_t a_exp;
+    int a_sgn;
+    int64_t x;
+    f3_unpack(&a_sgn, &a_exp, &a, fa);
+    if (a_exp < 16383)
+        return 0;
+    if (a_exp > 16445)
+        return a_sgn ? -0x8000000000000000 : 0x7fffffffffffffff;
+    x = (a.x1 << 15 | a.x0 >> 49) >> (16446 - a_exp);
+    return a_sgn ? -x : x;
+}
+
+/* Convert a long double to uint32_t, truncating toward zero.
+   Any input with the sign bit set, or with a_exp < 16369, yields 0;
+   a_exp > 16414 yields the all-ones value (-1 converted to uint32_t).
+   Otherwise the high mantissa word (implicit bit at bit 48) is shifted
+   right by 16431 - a_exp and truncated to 32 bits. */
+uint32_t __fixunstfsi(long double fa)
+{
+    u128_t a;
+    int32_t a_exp;
+    int a_sgn;
+    f3_unpack(&a_sgn, &a_exp, &a, fa);
+    if (a_sgn || a_exp < 16369)
+        return 0;
+    if (a_exp > 16414)
+        return -1;
+    return a.x1 >> (16431 - a_exp);
+}
+
+/* Convert a long double to uint64_t, truncating toward zero.
+   Any input with the sign bit set, or with a_exp < 16383, yields 0;
+   a_exp > 16446 yields the all-ones value (-1 converted to uint64_t).
+   Otherwise the top 64 mantissa bits are assembled with the implicit bit
+   at bit 63 and shifted right by 16446 - a_exp. */
+uint64_t __fixunstfdi(long double fa)
+{
+    u128_t a;
+    int32_t a_exp;
+    int a_sgn;
+    f3_unpack(&a_sgn, &a_exp, &a, fa);
+    if (a_sgn || a_exp < 16383)
+        return 0;
+    if (a_exp > 16446)
+        return -1;
+    return (a.x1 << 15 | a.x0 >> 49) >> (16446 - a_exp);
+}
+
+/* Convert an int32_t to long double (exact).
+   Zero produces the all-zero pattern.  Otherwise the magnitude is taken
+   in unsigned arithmetic (so the most negative input is handled), shifted
+   left until bit 31 is set while exp counts down from 16414 (= 16383 + 31),
+   and packed with the leading 1 dropped. */
+long double __floatsitf(int32_t a)
+{
+    int sgn = 0;
+    int exp = 16414;
+    uint32_t mnt = a;
+    u128_t x = { 0, 0 };
+    long double f;
+    int i;
+    if (a) {
+        if (a < 0) {
+            sgn = 1;
+            mnt = -mnt;
+        }
+        /* binary search for the normalising shift: 16, 8, 4, 2, 1 */
+        for (i = 16; i; i >>= 1)
+            if (!(mnt >> (32 - i))) {
+                mnt <<= i;
+                exp -= i;
+            }
+        x.x1 = ((uint64_t)sgn << 63 | (uint64_t)exp << 48 |
+                (uint64_t)(mnt << 1) << 16);
+    }
+    memcpy(&f, &x, 16);
+    return f;
+}
+
+/* Convert an int64_t to long double (exact).
+   Zero produces the all-zero pattern.  Otherwise the magnitude is taken
+   in unsigned arithmetic (so the most negative input is handled), shifted
+   left until bit 63 is set while exp counts down from 16446 (= 16383 + 63),
+   and packed with the leading 1 dropped: the upper 48 remaining bits go to
+   x1 and the lowest 15 bits to the top of x0. */
+long double __floatditf(int64_t a)
+{
+    int sgn = 0;
+    int exp = 16446;
+    uint64_t mnt = a;
+    u128_t x = { 0, 0 };
+    long double f;
+    int i;
+    if (a) {
+        if (a < 0) {
+            sgn = 1;
+            mnt = -mnt;
+        }
+        /* binary search for the normalising shift: 32, 16, 8, 4, 2, 1 */
+        for (i = 32; i; i >>= 1)
+            if (!(mnt >> (64 - i))) {
+                mnt <<= i;
+                exp -= i;
+            }
+        x.x0 = mnt << 49;
+        x.x1 = (uint64_t)sgn << 63 | (uint64_t)exp << 48 | mnt << 1 >> 16;
+    }
+    memcpy(&f, &x, 16);
+    return f;
+}
+
+/* Convert a uint32_t to long double (exact).
+   Zero produces the all-zero pattern.  Otherwise the value is shifted
+   left until bit 31 is set while exp counts down from 16414 (= 16383 + 31),
+   and packed with the leading 1 dropped. */
+long double __floatunsitf(uint32_t a)
+{
+    int exp = 16414;
+    uint32_t mnt = a;
+    u128_t x = { 0, 0 };
+    long double f;
+    int i;
+    if (a) {
+        /* binary search for the normalising shift: 16, 8, 4, 2, 1 */
+        for (i = 16; i; i >>= 1)
+            if (!(mnt >> (32 - i))) {
+                mnt <<= i;
+                exp -= i;
+            }
+        x.x1 = (uint64_t)exp << 48 | (uint64_t)(mnt << 1) << 16;
+    }
+    memcpy(&f, &x, 16);
+    return f;
+}
+
+/* Convert a uint64_t to long double (exact).
+   Zero produces the all-zero pattern.  Otherwise the value is shifted
+   left until bit 63 is set while exp counts down from 16446 (= 16383 + 63),
+   and packed with the leading 1 dropped: the upper 48 remaining bits go to
+   x1 and the lowest 15 bits to the top of x0. */
+long double __floatunditf(uint64_t a)
+{
+    int exp = 16446;
+    uint64_t mnt = a;
+    u128_t x = { 0, 0 };
+    long double f;
+    int i;
+    if (a) {
+        /* binary search for the normalising shift: 32, 16, 8, 4, 2, 1 */
+        for (i = 32; i; i >>= 1)
+            if (!(mnt >> (64 - i))) {
+                mnt <<= i;
+                exp -= i;
+            }
+        x.x0 = mnt << 49;
+        x.x1 = (uint64_t)exp << 48 | mnt << 1 >> 16;
+    }
+    memcpy(&f, &x, 16);
+    return f;
+}
+
+/* Three-way comparison on the raw bit patterns of fa and fb.  Returns
+      0  if both are zero (of either sign) or the patterns are identical,
+      2  if either operand is a NaN (unordered),
+     -1  if fa < fb,
+      1  if fa > fb.
+   The conditional chain is evaluated in this order: both-zero test, NaN
+   test, differing signs, then magnitude comparison of the high word and
+   finally the low word.  For equal signs the magnitude result is flipped
+   when the operands are negative: (sign << 1) - 1 is -1 for positive and
+   +1 for negative values. */
+static int f3_cmp(long double fa, long double fb)
+{
+    u128_t a, b;
+    memcpy(&a, &fa, 16);
+    memcpy(&b, &fb, 16);
+    return (!(a.x0 | a.x1 << 1 | b.x0 | b.x1 << 1) ? 0 :
+            ((a.x1 << 1 >> 49 == 0x7fff && (a.x0 | a.x1 << 16)) ||
+             (b.x1 << 1 >> 49 == 0x7fff && (b.x0 | b.x1 << 16))) ? 2 :
+            a.x1 >> 63 != b.x1 >> 63 ? (int)(b.x1 >> 63) - (int)(a.x1 >> 63) :
+            a.x1 < b.x1 ? (int)(a.x1 >> 63 << 1) - 1 :
+            a.x1 > b.x1 ? 1 - (int)(a.x1 >> 63 << 1) :
+            a.x0 < b.x0 ? (int)(a.x1 >> 63 << 1) - 1 :
+            b.x0 < a.x0 ? 1 - (int)(a.x1 >> 63 << 1) : 0);
+}
+
+/* Equality helper: returns 0 when a == b and 1 otherwise (including when
+   either operand is a NaN, since f3_cmp then returns 2). */
+int __eqtf2(long double a, long double b)
+{
+    return !!f3_cmp(a, b);
+}
+
+/* Inequality helper: returns non-zero (1) when a != b or either operand
+   is a NaN, and 0 when a == b.  Same computation as __eqtf2. */
+int __netf2(long double a, long double b)
+{
+    return !!f3_cmp(a, b);
+}
+
+/* Less-than helper: the result is negative exactly when a < b.  Equal
+   operands give 0, a > b gives 1 and unordered (NaN) operands give 2, so
+   a caller testing "result < 0" sees false for NaNs. */
+int __lttf2(long double a, long double b)
+{
+    return f3_cmp(a, b);
+}
+
+/* Less-or-equal helper: the result is <= 0 exactly when a <= b.
+   Unordered (NaN) operands give 2, so a caller testing "result <= 0" sees
+   false for NaNs. */
+int __letf2(long double a, long double b)
+{
+    return f3_cmp(a, b);
+}
+
+/* Greater-than helper: the result is positive exactly when a > b.  It is
+   computed as the negated comparison of (b, a), so unordered (NaN)
+   operands give -2 and a caller testing "result > 0" sees false. */
+int __gttf2(long double a, long double b)
+{
+    return -f3_cmp(b, a);
+}
+
+/* Greater-or-equal helper: the result is >= 0 exactly when a >= b.  It is
+   computed as the negated comparison of (b, a), so unordered (NaN)
+   operands give -2 and a caller testing "result >= 0" sees false. */
+int __getf2(long double a, long double b)
+{
+    return -f3_cmp(b, a);
+}
--- a/lib/libtcc1.c
+++ b/lib/libtcc1.c
@ -103,14 +103,14 @@ union double_long {

 union float_long {
    float f;
-    long l;
+    unsigned int l;
 };

 /* XXX: we don't support several builtin supports for now */
-#ifndef __x86_64__
+#if !defined __x86_64__ && !defined __arm__

 /* XXX: use gcc/tcc intrinsic ? */
-#if defined(__i386__)
+#if defined __i386__
 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("subl %5,%1\n\tsbbl %3,%0"					\
 	   : "=r" ((USItype) (sh)),					\
@ -162,7 +162,7 @@ static UDWtype __udivmoddi4 (UDWtype n, UDWtype d, UDWtype *rp)
  n0 = nn.s.low;
  n1 = nn.s.high;

-#if !UDIV_NEEDS_NORMALIZATION
+#if !defined(UDIV_NEEDS_NORMALIZATION)
  if (d1 == 0)
    {
      if (d0 > n1)
@ -478,13 +478,6 @@ long long __ashldi3(long long a, int b)
 #endif
 }

-#if defined(__i386__)
-/* FPU control word for rounding to nearest mode */
-unsigned short __tcc_fpu_control = 0x137f;
-/* FPU control word for round to zero mode for int conversion */
-unsigned short __tcc_int_fpu_control = 0x137f | 0x0c00;
-#endif
-
 #endif /* !__x86_64__ */

 /* XXX: fix tcc's code generator to do this instead */
@ -557,6 +550,13 @@ unsigned long long __fixunssfdi (float a1)
        return 0;
 }

+/* Convert a float to signed 64-bit: the magnitude is converted with
+   __fixunssfdi and the sign is re-applied afterwards.  Inputs for which
+   "a1 >= 0" is false (negative values, and NaN) take the negated path. */
+long long __fixsfdi (float a1)
+{
+    int nonneg = a1 >= 0;
+    long long magnitude;
+
+    if (nonneg)
+        magnitude = __fixunssfdi(a1);
+    else
+        magnitude = __fixunssfdi(-a1);
+
+    if (nonneg)
+        return magnitude;
+    return -magnitude;
+}
+
 unsigned long long __fixunsdfdi (double a1)
 {
    register union double_long dl1;
@ -582,6 +582,14 @@ unsigned long long __fixunsdfdi (double a1)
        return 0;
 }

+/* Convert a double to signed 64-bit: the magnitude is converted with
+   __fixunsdfdi and the sign is re-applied afterwards.  Inputs for which
+   "a1 >= 0" is false (negative values, and NaN) take the negated path. */
+long long __fixdfdi (double a1)
+{
+    int nonneg = a1 >= 0;
+    long long magnitude;
+
+    if (nonneg)
+        magnitude = __fixunsdfdi(a1);
+    else
+        magnitude = __fixunsdfdi(-a1);
+
+    if (nonneg)
+        return magnitude;
+    return -magnitude;
+}
+
+#ifndef __arm__
 unsigned long long __fixunsxfdi (long double a1)
 {
    register union ldouble_long dl1;
@ -605,3 +613,10 @@ unsigned long long __fixunsxfdi (long double a1)
        return 0;
 }

+/* Convert a long double to signed 64-bit: the magnitude is converted with
+   __fixunsxfdi and the sign is re-applied afterwards.  Inputs for which
+   "a1 >= 0" is false (negative values, and NaN) take the negated path. */
+long long __fixxfdi (long double a1)
+{
+    int nonneg = a1 >= 0;
+    long long magnitude;
+
+    if (nonneg)
+        magnitude = __fixunsxfdi(a1);
+    else
+        magnitude = __fixunsxfdi(-a1);
+
+    if (nonneg)
+        return magnitude;
+    return -magnitude;
+}
+#endif /* !ARM */
--- a/lib/va_list.c
+++ b/lib/va_list.c
@ -0,0 +1,65 @@
+/* va_list.c - tinycc support for va_list on X86_64 */
+
+#if defined __x86_64__
+
+/* Avoid include files, they may not be available when cross compiling */
+extern void *memset(void *s, int c, __SIZE_TYPE__ n);
+extern void abort(void);
+
+/* This should be in sync with our include/stdarg.h */
+enum __va_arg_type {
+    __va_gen_reg, __va_float_reg, __va_stack
+};
+
+/* GCC compatible definition of va_list. */
+typedef struct {
+    unsigned int gp_offset;
+    unsigned int fp_offset;
+    union {
+        unsigned int overflow_offset;
+        char *overflow_arg_area;
+    };
+    char *reg_save_area;
+} __va_list_struct;
+
+/* Initialise the va_list *ap from the frame described by fp (presumably
+   the variadic function's frame pointer -- confirm against the x86_64
+   code generator).
+   The va_list image stored 16 bytes below fp is copied into *ap; the
+   overflow (stack) argument pointer is then formed by adding the copied
+   overflow_offset to fp, and the register save area is set to
+   176 + 16 bytes below fp.
+   NOTE(review): the memset is immediately overwritten by the struct copy
+   on the next line. */
+void __va_start(__va_list_struct *ap, void *fp)
+{
+    memset(ap, 0, sizeof(__va_list_struct));
+    *ap = *(__va_list_struct *)((char *)fp - 16);
+    ap->overflow_arg_area = (char *)fp + ap->overflow_offset;
+    ap->reg_save_area = (char *)fp - 176 - 16;
+}
+
+/* Return a pointer to the next variadic argument and advance *ap.
+   arg_type selects where the argument is looked for; size and align are
+   first rounded up to a multiple of 8.
+     __va_gen_reg   - taken from the register save area if the whole
+                      argument fits below offset 48 (gp_offset advances by
+                      size); otherwise from the overflow area.
+     __va_float_reg - taken from the register save area while fp_offset is
+                      below 128 + 48 (fp_offset advances by 16); otherwise
+                      from the overflow area with size forced to 8.
+     __va_stack     - always taken from the overflow area.
+   In the overflow area the pointer is advanced by size and then rounded up
+   to align; the returned address is the new pointer minus size.
+   Any other arg_type calls abort(). */
+void *__va_arg(__va_list_struct *ap,
+               enum __va_arg_type arg_type,
+               int size, int align)
+{
+    size = (size + 7) & ~7;
+    align = (align + 7) & ~7;
+    switch (arg_type) {
+    case __va_gen_reg:
+        if (ap->gp_offset + size <= 48) {
+            ap->gp_offset += size;
+            return ap->reg_save_area + ap->gp_offset - size;
+        }
+        goto use_overflow_area;
+
+    case __va_float_reg:
+        if (ap->fp_offset < 128 + 48) {
+            ap->fp_offset += 16;
+            return ap->reg_save_area + ap->fp_offset - 16;
+        }
+        size = 8;
+        goto use_overflow_area;
+
+    case __va_stack:
+    use_overflow_area:
+        ap->overflow_arg_area += size;
+        ap->overflow_arg_area = (char*)((long long)(ap->overflow_arg_area + align - 1) & -align);
+        return ap->overflow_arg_area - size;
+
+    default: /* should never happen */
+        abort();
+    }
+}
+#endif
--- a/libtcc.c
+++ b/libtcc.c
--- a/libtcc.h
+++ b/libtcc.h
@ -1,10 +1,8 @@
 #ifndef LIBTCC_H
 #define LIBTCC_H

-#ifdef LIBTCC_AS_DLL
-#define LIBTCCAPI __declspec(dllexport)
-#else
-#define LIBTCCAPI
+#ifndef LIBTCCAPI
+# define LIBTCCAPI
 #endif

 #ifdef __cplusplus
@ -21,15 +19,15 @@ LIBTCCAPI TCCState *tcc_new(void);
 /* free a TCC compilation context */
 LIBTCCAPI void tcc_delete(TCCState *s);

-/* add debug information in the generated code */
-LIBTCCAPI void tcc_enable_debug(TCCState *s);
+/* set CONFIG_TCCDIR at runtime */
+LIBTCCAPI void tcc_set_lib_path(TCCState *s, const char *path);

 /* set error/warning display callback */
 LIBTCCAPI void tcc_set_error_func(TCCState *s, void *error_opaque,
-                        void (*error_func)(void *opaque, const char *msg));
+    void (*error_func)(void *opaque, const char *msg));

-/* set/reset a warning */
-LIBTCCAPI int tcc_set_warning(TCCState *s, const char *warning_name, int value);
+/* set options as from command line (multiple supported) */
+LIBTCCAPI void tcc_set_options(TCCState *s, const char *str);

 /*****************************/
 /* preprocessor */
@ -49,29 +47,22 @@ LIBTCCAPI void tcc_undefine_symbol(TCCState *s, const char *sym);
 /*****************************/
 /* compiling */

-/* add a file (either a C file, dll, an object, a library or an ld
-   script). Return -1 if error. */
+/* add a file (C file, dll, object, library, ld script). Return -1 if error. */
 LIBTCCAPI int tcc_add_file(TCCState *s, const char *filename);

-/* compile a string containing a C source. Return non zero if
-   error. */
+/* compile a string containing a C source. Return -1 if error. */
 LIBTCCAPI int tcc_compile_string(TCCState *s, const char *buf);

 /*****************************/
 /* linking commands */

 /* set output type. MUST BE CALLED before any compilation */
-#define TCC_OUTPUT_MEMORY   0 /* output will be ran in memory (no
-                                 output file) (default) */
-#define TCC_OUTPUT_EXE      1 /* executable file */
-#define TCC_OUTPUT_DLL      2 /* dynamic library */
-#define TCC_OUTPUT_OBJ      3 /* object file */
-#define TCC_OUTPUT_PREPROCESS 4 /* preprocessed file (used internally) */
 LIBTCCAPI int tcc_set_output_type(TCCState *s, int output_type);
-
-#define TCC_OUTPUT_FORMAT_ELF    0 /* default output format: ELF */
-#define TCC_OUTPUT_FORMAT_BINARY 1 /* binary image output */
-#define TCC_OUTPUT_FORMAT_COFF   2 /* COFF */
+#define TCC_OUTPUT_MEMORY   1 /* output will be run in memory (default) */
+#define TCC_OUTPUT_EXE      2 /* executable file */
+#define TCC_OUTPUT_DLL      3 /* dynamic library */
+#define TCC_OUTPUT_OBJ      4 /* object file */
+#define TCC_OUTPUT_PREPROCESS 5 /* only preprocess (used internally) */

 /* equivalent to -Lpath option */
 LIBTCCAPI int tcc_add_library_path(TCCState *s, const char *pathname);
@ -80,7 +71,7 @@ LIBTCCAPI int tcc_add_library_path(TCCState *s, const char *pathname);
 LIBTCCAPI int tcc_add_library(TCCState *s, const char *libraryname);

 /* add a symbol to the compiled program */
-LIBTCCAPI int tcc_add_symbol(TCCState *s, const char *name, void *val);
+LIBTCCAPI int tcc_add_symbol(TCCState *s, const char *name, const void *val);

 /* output an executable, library or object file. DO NOT call
   tcc_relocate() before. */
@ -90,17 +81,18 @@ LIBTCCAPI int tcc_output_file(TCCState *s, const char *filename);
   tcc_relocate() before. */
 LIBTCCAPI int tcc_run(TCCState *s, int argc, char **argv);

-/* copy code into memory passed in by the caller and do all relocations
-   (needed before using tcc_get_symbol()).
-   returns -1 on error and required size if ptr is NULL */
+/* do all relocations (needed before using tcc_get_symbol()) */
 LIBTCCAPI int tcc_relocate(TCCState *s1, void *ptr);
+/* possible values for 'ptr':
+   - TCC_RELOCATE_AUTO : Allocate and manage memory internally
+   - NULL              : return required memory size for the step below
+   - memory address    : copy code to memory passed by the caller
+   returns -1 if error. */
+#define TCC_RELOCATE_AUTO (void*)1

 /* return symbol value or NULL if not found */
 LIBTCCAPI void *tcc_get_symbol(TCCState *s, const char *name);

-/* set CONFIG_TCCDIR at runtime */
-LIBTCCAPI void tcc_set_lib_path(TCCState *s, const char *path);
-
 #ifdef __cplusplus
 }
 #endif
--- a/tcc-doc.texi
+++ b/tcc-doc.texi
@ -2,6 +2,10 @@
@c %**start of header
@setfilename tcc-doc.info
@settitle Tiny C Compiler Reference Documentation
+@dircategory Software development
+@direntry
+* TCC: (tcc-doc).               The Tiny C Compiler.
+@end direntry
@c %**end of header

@include config.texi
@ -64,7 +68,7 @@ ports for the ARM (@code{arm-tcc}) and the TMS320C67xx targets
 (@code{c67-tcc}). More information about the ARM port is available at
@url{http://lists.gnu.org/archive/html/tinycc-devel/2003-10/msg00044.html}.

-For usage on Windows, see also tcc-win32.txt.
+For usage on Windows, see also @url{tcc-win32.txt}.

@node Invoke
@chapter Command line invocation
@ -153,47 +157,34 @@ General Options:

@c man begin OPTIONS
@table @option
-@item -v
-Display current TCC version, increase verbosity.
-
-@item -print-search-dirs
-Print the name of the configured installation directory and a list
-of program and library directories tcc will search.
-
@item -c
-Generate an object file (@option{-o} option must also be given).
+Generate an object file.

@item -o outfile
 Put object file, executable, or dll into output file @file{outfile}.

-@item -Bdir
-Set the path where the tcc internal libraries can be found (default is
-@file{PREFIX/lib/tcc}).
-
-@item -bench
-Output compilation statistics.
-
@item -run source [args...]
 Compile file @var{source} and run it with the command line arguments
@var{args}. In order to be able to give more than one argument to a
 script, several TCC options can be given @emph{after} the
-@option{-run} option, separated by spaces. Example:
-
+@option{-run} option, separated by spaces:
@example
 tcc "-run -L/usr/X11R6/lib -lX11" ex4.c
@end example
-
 In a script, it gives the following header:
-
@example
 #!/usr/local/bin/tcc -run -L/usr/X11R6/lib -lX11
-#include <stdlib.h>
-int main(int argc, char **argv)
-@{
-    ...
-@}
@end example

+@item -v
+Display TCC version.
+
+@item -vv
+Show included files.  As sole argument, print search dirs.  -vvv shows tries too.
+
+@item -bench
+Display compilation statistics.
+
@end table

 Preprocessor options:
@ -215,11 +206,15 @@ also be defined: @option{-DF(a)=a+1}

@item -Usym
 Undefine preprocessor symbol @samp{sym}.
+
+@item -E
+Preprocess only, to stdout or file (with -o).
+
@end table

 Compilation flags:

-Note: each of the following warning options has a negative form beginning with
+Note: each of the following options has a negative form beginning with
@option{-fno-}.

@table @option
@ -235,6 +230,14 @@ Do not generate common symbols for uninitialized data.
@item -fleading-underscore
 Add a leading underscore at the beginning of each C symbol.

+@item -fms-extensions
+Allow MS C compiler extensions to the language. Currently this
+assumes a nested named structure declaration without an identifier
+behaves like an unnamed one.
+
+@item -fdollars-in-identifiers
+Allow dollar signs in identifiers.
+
@end table

 Warning options:
@ -278,28 +281,37 @@ default library paths are @file{/usr/local/lib}, @file{/usr/lib} and @file{/lib}
@item -lxxx
 Link your program with dynamic library libxxx.so or static library
 libxxx.a. The library is searched in the paths specified by the
-@option{-L} option.
+@option{-L} option and @env{LIBRARY_PATH} variable.
+
+@item -Bdir
+Set the path where the tcc internal libraries (and include files) can be
+found (default is @file{PREFIX/lib/tcc}).

@item -shared
-Generate a shared library instead of an executable (@option{-o} option
-must also be given).
+Generate a shared library instead of an executable.
+
+@item -soname name
+Set the name of the shared library to be used at runtime.

@item -static
 Generate a statically linked executable (default is a shared linked
-executable) (@option{-o} option must also be given).
+executable).

@item -rdynamic
 Export global symbols to the dynamic linker. It is useful when a library
 opened with @code{dlopen()} needs to access executable symbols.

@item -r
-Generate an object file combining all input files (@option{-o} option must
-also be given).
+Generate an object file combining all input files.

-@item -Wl,-Ttext,address
-Set the start of the .text section to @var{address}.
+@item -Wl,-rpath=path
+Put custom search path for dynamic libraries into executable.

-@item -Wl,--oformat,fmt
+@item -Wl,--enable-new-dtags
+When putting a custom search path for dynamic libraries into the executable,
+create the new ELF dynamic tag DT_RUNPATH instead of the old legacy DT_RPATH.
+
+@item -Wl,--oformat=fmt
 Use @var{fmt} as output format. The supported output formats are:
@table @code
@item elf32-i386
@ -310,6 +322,18 @@ Binary image (only for executable output)
 COFF output format (only for executable output for TMS320C67xx target)
@end table

+@item -Wl,-subsystem=console/gui/wince/...
+Set type for PE (Windows) executables.
+
+@item -Wl,-[Ttext=# | section-alignment=# | file-alignment=# | image-base=# | stack=#]
+Modify executable layout.
+
+@item -Wl,-Bsymbolic
+Set DT_SYMBOLIC tag.
+
+@item -Wl,-(no-)whole-archive
+Turn on/off linking of all objects in archives.
+
@end table

 Debugger options:
@ -326,22 +350,79 @@ Generate additional support code to check
 memory allocations and array/pointer bounds. @option{-g} is implied. Note
 that the generated code is slower and bigger in this case.

+Note: @option{-b} is only available on i386 when using libtcc for the moment.
+
@item -bt N
 Display N callers in stack traces. This is useful with @option{-g} or
@option{-b}.

@end table

+Misc options:
+
+@table @option
+@item -MD
+Generate makefile fragment with dependencies.
+
+@item -MF depfile
+Use @file{depfile} as output for -MD.
+
+@item -print-search-dirs
+Print the configured installation directory and a list of library
+and include directories tcc will search.
+
+@item -dumpversion
+Print version.
+
+@end table
+
+Target specific options:
+
+@table @option
+@item -mms-bitfields
+Use an algorithm for bitfield alignment consistent with MSVC. Default is
+gcc's algorithm.
+
+@item -mfloat-abi (ARM only)
+Select the float ABI. Possible values: @code{softfp} and @code{hard}.
+
+@item -mno-sse
+Do not use SSE registers on x86_64.
+
+@item -m32, -m64
+Pass command line to the i386/x86_64 cross compiler.
+
+@end table
+
 Note: GCC options @option{-Ox}, @option{-fx} and @option{-mx} are
 ignored.
@c man end

+@c man begin ENVIRONMENT
+Environment variables that affect how tcc operates.
+
+@table @option
+
+@item CPATH
+@item C_INCLUDE_PATH
+A colon-separated list of directories searched for include files;
+directories given with @option{-I} are searched first.
+
+@item LIBRARY_PATH
+A colon-separated list of directories searched for libraries for the
+@option{-l} option; directories given with @option{-L} are searched first.
+
+@end table
+
+@c man end
+
@ignore

@setfilename tcc
@settitle Tiny C Compiler

@c man begin SEEALSO
+cpp(1),
 gcc(1)
@c man end

@ -363,13 +444,14 @@ and floating point numbers (@code{long double}, @code{double}, and
@section ISOC99 extensions

 TCC implements many features of the new C standard: ISO C99. Currently
-missing items are: complex and imaginary numbers and variable length
-arrays.
+missing items are: complex and imaginary numbers.

 Currently implemented ISOC99 features:

@itemize

+@item variable length arrays.
+
@item 64 bit @code{long long} types are fully supported.

@item The boolean type @code{_Bool} is supported.
@ -571,8 +653,7 @@ are supported.

@itemize

-@item @code{__TINYC__} is a predefined macro to @code{1} to
-indicate that you use TCC.
+@item @code{__TINYC__} is a predefined macro to indicate that you use TCC.

@item @code{#!} at the start of a line is ignored to allow scripting.

@ -588,7 +669,7 @@ indicate that you use TCC.

 Since version 0.9.16, TinyCC integrates its own assembler. TinyCC
 assembler supports a gas-like syntax (GNU assembler). You can
-desactivate assembler support if you want a smaller TinyCC executable
+deactivate assembler support if you want a smaller TinyCC executable
 (the C compiler does not rely on the assembler).

 TinyCC Assembler is used to handle files with @file{.S} (C
@ -677,7 +758,7 @@ They can be defined several times in the same source. Use 'b'
@cindex asciz directive
@cindex ascii directive

-All directives are preceeded by a '.'. The following directives are
+All directives are preceded by a '.'. The following directives are
 supported:

@itemize
@ -896,7 +977,7 @@ reverse order, a first pass is done to reverse the argument order.

@section Types

-The types are stored in a single 'int' variable. It was choosen in the
+The types are stored in a single 'int' variable. It was chosen in the
 first stages of development when tcc was much simpler. Now, it may not
 be the best solution.

@ -919,9 +1000,13 @@ be the best solution.
 #define VT_BTYPE      0x000f /* mask for basic type */
 #define VT_UNSIGNED   0x0010  /* unsigned type */
 #define VT_ARRAY      0x0020  /* array type (also has VT_PTR) */
+#define VT_VLA        0x20000 /* VLA type (also has VT_PTR and VT_ARRAY) */
 #define VT_BITFIELD   0x0040  /* bitfield modifier */
+#define VT_CONSTANT   0x0800  /* const modifier */
+#define VT_VOLATILE   0x1000  /* volatile modifier */
+#define VT_DEFSIGN    0x2000  /* signed type */

-#define VT_STRUCT_SHIFT 16   /* structure/enum name shift (16 bits left) */
+#define VT_STRUCT_SHIFT 18   /* structure/enum name shift (14 bits left) */
@end example

 When a reference to another type is needed (for pointers, functions and
@ -932,7 +1017,8 @@ The @code{VT_UNSIGNED} flag can be set for chars, shorts, ints and long
 longs.

 Arrays are considered as pointers @code{VT_PTR} with the flag
-@code{VT_ARRAY} set.
+@code{VT_ARRAY} set. Variable length arrays are considered as special
+arrays and have flag @code{VT_VLA} set instead of @code{VT_ARRAY}.

 The @code{VT_BITFIELD} flag can be set for chars, shorts, ints and long
 longs. If it is set, then the bitfield position is stored from bits
@ -948,6 +1034,10 @@ integer:
 #define VT_EXTERN  0x00000080  /* extern definition */
 #define VT_STATIC  0x00000100  /* static variable */
 #define VT_TYPEDEF 0x00000200  /* typedef definition */
+#define VT_INLINE  0x00000400  /* inline definition */
+#define VT_IMPORT  0x00004000  /* win32: extern data imported from dll */
+#define VT_EXPORT  0x00008000  /* win32: data exported from dll */
+#define VT_WEAK    0x00010000  /* weak symbol */
@end example

@section Symbols
@ -956,10 +1046,13 @@ All symbols are stored in hashed symbol stacks. Each symbol stack
 contains @code{Sym} structures.

@code{Sym.v} contains the symbol name (remember
-an idenfier is also a token, so a string is never necessary to store
+an identifier is also a token, so a string is never necessary to store
 it). @code{Sym.t} gives the type of the symbol. @code{Sym.r} is usually
 the register in which the corresponding variable is stored. @code{Sym.c} is
-usually a constant associated to the symbol.
+usually a constant associated to the symbol like its address for normal
+symbols, and the number of entries for symbols representing arrays.
+Variable length array types use @code{Sym.c} as a location on the stack
+which holds the runtime sizeof for the type.

 Four main symbol stacks are defined:

@ -996,7 +1089,7 @@ global stack.

@section Sections

-The generated code and datas are written in sections. The structure
+The generated code and data are written in sections. The structure
@code{Section} contains all the necessary information for a given
 section. @code{new_section()} creates a new section. ELF file semantics
 is assumed for each section.
@ -1021,7 +1114,7 @@ are used when bound checking is activated

@item stab_section
@itemx stabstr_section
-are used when debugging is actived to store debug information
+are used when debugging is active to store debug information

@item symtab_section
@itemx strtab_section
@ -1121,8 +1214,10 @@ if the lvalue has an integer type, then these flags give its real
 type. The type alone is not enough in case of cast optimisations.

@item VT_LLOCAL
-is a saved lvalue on the stack. @code{VT_LLOCAL} should be eliminated
-ASAP because its semantics are rather complicated.
+is a saved lvalue on the stack. @code{VT_LVAL} must also be set with
+@code{VT_LLOCAL}. @code{VT_LLOCAL} can arise when a @code{VT_LVAL} in
+a register has to be saved to the stack, or it can come from an
+architecture-specific calling convention.

@item VT_MUSTCAST
 indicates that a cast to the value type must be performed if the value
@ -1181,13 +1276,13 @@ should generate a function prolog/epilog.

@item gen_opi(op)
 must generate the binary integer operation @var{op} on the two top
-entries of the stack which are guaranted to contain integer types.
+entries of the stack which are guaranteed to contain integer types.

 The result value should be put on the stack.

@item gen_opf(op)
 same as @code{gen_opi()} for floating point operations. The two top
-entries of the stack are guaranted to contain floating point values of
+entries of the stack are guaranteed to contain floating point values of
 same types.

@item gen_cvt_itof()
--- a/tcc.c
+++ b/tcc.c
@ -18,580 +18,354 @@
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

-#ifdef TCC_USE_LIBTCC
 #include "tcc.h"
-#else
-#include "libtcc.c"
+#if ONE_SOURCE
+# include "libtcc.c"
 #endif
+#include "tcctools.c"

-void help(void)
-{
-    printf("tcc version " TCC_VERSION " - Tiny C Compiler - Copyright (C) 2001-2006 Fabrice Bellard\n"
-           "usage: tcc [-v] [-c] [-o outfile] [-Bdir] [-bench] [-Idir] [-Dsym[=val]] [-Usym]\n"
-           "           [-Wwarn] [-g] [-b] [-bt N] [-Ldir] [-llib] [-shared] [-soname name]\n"
-           "           [-static] [infile1 infile2...] [-run infile args...]\n"
-           "\n"
-           "General options:\n"
-           "  -v          display current version, increase verbosity\n"
-           "  -c          compile only - generate an object file\n"
-           "  -o outfile  set output filename\n"
-           "  -Bdir       set tcc internal library path\n"
-           "  -bench      output compilation statistics\n"
-           "  -run        run compiled source\n"
-           "  -fflag      set or reset (with 'no-' prefix) 'flag' (see man page)\n"
-           "  -Wwarning   set or reset (with 'no-' prefix) 'warning' (see man page)\n"
-           "  -w          disable all warnings\n"
-           "Preprocessor options:\n"
-           "  -E          preprocess only\n"
-           "  -Idir       add include path 'dir'\n"
-           "  -Dsym[=val] define 'sym' with value 'val'\n"
-           "  -Usym       undefine 'sym'\n"
-           "Linker options:\n"
-           "  -Ldir       add library path 'dir'\n"
-           "  -llib       link with dynamic or static library 'lib'\n"
-           "  -shared     generate a shared library\n"
-           "  -soname     set name for shared library to be used at runtime\n"
-           "  -static     static linking\n"
-           "  -rdynamic   export all global symbols to dynamic linker\n"
-           "  -r          generate (relocatable) object file\n"
-           "Debugger options:\n"
-           "  -g          generate runtime debug info\n"
+static const char help[] =
+    "Tiny C Compiler "TCC_VERSION" - Copyright (C) 2001-2006 Fabrice Bellard\n"
+    "Usage: tcc [options...] [-o outfile] [-c] infile(s)...\n"
+    "       tcc [options...] -run infile [arguments...]\n"
+    "General options:\n"
+    "  -c          compile only - generate an object file\n"
+    "  -o outfile  set output filename\n"
+    "  -run        run compiled source\n"
+    "  -fflag      set or reset (with 'no-' prefix) 'flag' (see tcc -hh)\n"
+    "  -Wwarning   set or reset (with 'no-' prefix) 'warning' (see tcc -hh)\n"
+    "  -w          disable all warnings\n"
+    "  -v -vv      show version, show search paths or loaded files\n"
+    "  -h -hh      show this, show more help\n"
+    "  -bench      show compilation statistics\n"
+    "  -           use stdin pipe as infile\n"
+    "  @listfile   read arguments from listfile\n"
+    "Preprocessor options:\n"
+    "  -Idir       add include path 'dir'\n"
+    "  -Dsym[=val] define 'sym' with value 'val'\n"
+    "  -Usym       undefine 'sym'\n"
+    "  -E          preprocess only\n"
+    "Linker options:\n"
+    "  -Ldir       add library path 'dir'\n"
+    "  -llib       link with dynamic or static library 'lib'\n"
+    "  -r          generate (relocatable) object file\n"
+    "  -shared     generate a shared library/dll\n"
+    "  -rdynamic   export all global symbols to dynamic linker\n"
+    "  -soname     set name for shared library to be used at runtime\n"
+    "  -Wl,-opt[=val]  set linker option (see tcc -hh)\n"
+    "Debugger options:\n"
+    "  -g          generate runtime debug info\n"
 #ifdef CONFIG_TCC_BCHECK
-           "  -b          compile with built-in memory and bounds checker (implies -g)\n"
+    "  -b          compile with built-in memory and bounds checker (implies -g)\n"
 #endif
 #ifdef CONFIG_TCC_BACKTRACE
-           "  -bt N       show N callers in stack traces\n"
+    "  -bt N       show N callers in stack traces\n"
 #endif
-           );
+    "Misc. options:\n"
+    "  -x[c|a|n]   specify type of the next infile\n"
+    "  -nostdinc   do not use standard system include paths\n"
+    "  -nostdlib   do not link with standard crt and libraries\n"
+    "  -Bdir       set tcc's private include/library dir\n"
+    "  -MD         generate dependency file for make\n"
+    "  -MF file    specify dependency file name\n"
+    "  -m32/64     defer to i386/x86_64 cross compiler\n"
+    "Tools:\n"
+    "  create library  : tcc -ar [rcsv] lib.a files\n"
+#ifdef TCC_TARGET_PE
+    "  create def file : tcc -impdef lib.dll [-v] [-o lib.def]\n"
+#endif
+    ;
+
+static const char help2[] =
+    "Tiny C Compiler "TCC_VERSION" - More Options\n"
+    "Special options:\n"
+    "  -P -P1                        with -E: no/alternative #line output\n"
+    "  -dD -dM                       with -E: output #define directives\n"
+    "  -pthread                      same as -D_REENTRANT and -lpthread\n"
+    "  -On                           same as -D__OPTIMIZE__ for n > 0\n"
+    "  -Wp,-opt                      same as -opt\n"
+    "  -include file                 include 'file' above each input file\n"
+    "  -isystem dir                  add 'dir' to system include path\n"
+    "  -static                       link to static libraries (not recommended)\n"
+    "  -dumpversion                  print version\n"
+    "  -print-search-dirs            print search paths\n"
+    "  -dt                           with -run/-E: auto-define 'test_...' macros\n"
+    "Ignored options:\n"
+    "  --param  -pedantic  -pipe  -s  -std  -traditional\n"
+    "-W... warnings:\n"
+    "  all                           turn on some (*) warnings\n"
+    "  error                         stop after first warning\n"
+    "  unsupported                   warn about ignored options, pragmas, etc.\n"
+    "  write-strings                 strings are const\n"
+    "  implicit-function-declaration warn for missing prototype (*)\n"
+    "-f[no-]... flags:\n"
+    "  unsigned-char                 default char is unsigned\n"
+    "  signed-char                   default char is signed\n"
+    "  common                        use common section instead of bss\n"
+    "  leading-underscore            decorate extern symbols\n"
+    "  ms-extensions                 allow anonymous struct in struct\n"
+    "  dollars-in-identifiers        allow '$' in C symbols\n"
+    "-m... target specific options:\n"
+    "  ms-bitfields                  use MSVC bitfield layout\n"
+#ifdef TCC_TARGET_ARM
+    "  float-abi                     hard/softfp on arm\n"
+#endif
+#ifdef TCC_TARGET_X86_64
+    "  no-sse                        disable floats on x86_64\n"
+#endif
+    "-Wl,... linker options:\n"
+    "  -nostdlib                     do not link with standard crt/libs\n"
+    "  -[no-]whole-archive           load lib(s) fully/only as needed\n"
+    "  -export-all-symbols           same as -rdynamic\n"
+    "  -image-base= -Ttext=          set base address of executable\n"
+    "  -section-alignment=           set section alignment in executable\n"
+#ifdef TCC_TARGET_PE
+    "  -file-alignment=              set PE file alignment\n"
+    "  -stack=                       set PE stack reserve\n"
+    "  -large-address-aware          set related PE option\n"
+    "  -subsystem=[console/windows]  set PE subsystem\n"
+    "  -oformat=[pe-* binary]        set executable output format\n"
+    "Predefined macros:\n"
+    "  tcc -E -dM - < nul\n"
+#else
+    "  -rpath=                       set dynamic library search path\n"
+    "  -enable-new-dtags             set DT_RUNPATH instead of DT_RPATH\n"
+    "  -soname=                      set DT_SONAME elf tag\n"
+    "  -Bsymbolic                    set DT_SYMBOLIC elf tag\n"
+    "  -oformat=[elf32/64-* binary]  set executable output format\n"
+    "  -init= -fini= -as-needed -O   (ignored)\n"
+    "Predefined macros:\n"
+    "  tcc -E -dM - < /dev/null\n"
+#endif
+    "See also the manual for more details.\n"
+    ;
+
+static const char version[] =
+    "tcc version "TCC_VERSION" ("
+#ifdef TCC_TARGET_I386
+        "i386"
+#elif defined TCC_TARGET_X86_64
+        "x86_64"
+#elif defined TCC_TARGET_C67
+        "C67"
+#elif defined TCC_TARGET_ARM
+        "ARM"
+#elif defined TCC_TARGET_ARM64
+        "AArch64"
+#endif
+#ifdef TCC_ARM_HARDFLOAT
+        " Hard Float"
+#endif
+#ifdef TCC_TARGET_PE
+        " Windows"
+#elif defined(TCC_TARGET_MACHO)
+        " Darwin"
+#elif defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
+        " FreeBSD"
+#else
+        " Linux"
+#endif
+    ")\n"
+    ;
+
+static void print_dirs(const char *msg, char **paths, int nb_paths)
+{
+    int i;
+    printf("%s:\n%s", msg, nb_paths ? "" : "  -\n");
+    for(i = 0; i < nb_paths; i++)
+        printf("  %s\n", paths[i]);
 }

-static char **files;
-static int nb_files, nb_libraries;
-static int multiple_files;
-static int print_search_dirs;
-static int output_type;
-static int reloc_output;
-static const char *outfile;
-static int do_bench = 0;
-
-#define TCC_OPTION_HAS_ARG 0x0001
-#define TCC_OPTION_NOSEP   0x0002 /* cannot have space before option and arg */
-
-typedef struct TCCOption {
-    const char *name;
-    uint16_t index;
-    uint16_t flags;
-} TCCOption;
-
-enum {
-    TCC_OPTION_HELP,
-    TCC_OPTION_I,
-    TCC_OPTION_D,
-    TCC_OPTION_U,
-    TCC_OPTION_L,
-    TCC_OPTION_B,
-    TCC_OPTION_l,
-    TCC_OPTION_bench,
-    TCC_OPTION_bt,
-    TCC_OPTION_b,
-    TCC_OPTION_g,
-    TCC_OPTION_c,
-    TCC_OPTION_static,
-    TCC_OPTION_shared,
-    TCC_OPTION_soname,
-    TCC_OPTION_o,
-    TCC_OPTION_r,
-    TCC_OPTION_Wl,
-    TCC_OPTION_W,
-    TCC_OPTION_O,
-    TCC_OPTION_m,
-    TCC_OPTION_f,
-    TCC_OPTION_nostdinc,
-    TCC_OPTION_nostdlib,
-    TCC_OPTION_print_search_dirs,
-    TCC_OPTION_rdynamic,
-    TCC_OPTION_run,
-    TCC_OPTION_v,
-    TCC_OPTION_w,
-    TCC_OPTION_pipe,
-    TCC_OPTION_E,
-    TCC_OPTION_x,
-};
-
-static const TCCOption tcc_options[] = {
-    { "h", TCC_OPTION_HELP, 0 },
-    { "?", TCC_OPTION_HELP, 0 },
-    { "I", TCC_OPTION_I, TCC_OPTION_HAS_ARG },
-    { "D", TCC_OPTION_D, TCC_OPTION_HAS_ARG },
-    { "U", TCC_OPTION_U, TCC_OPTION_HAS_ARG },
-    { "L", TCC_OPTION_L, TCC_OPTION_HAS_ARG },
-    { "B", TCC_OPTION_B, TCC_OPTION_HAS_ARG },
-    { "l", TCC_OPTION_l, TCC_OPTION_HAS_ARG | TCC_OPTION_NOSEP },
-    { "bench", TCC_OPTION_bench, 0 },
-    { "bt", TCC_OPTION_bt, TCC_OPTION_HAS_ARG },
-#ifdef CONFIG_TCC_BCHECK
-    { "b", TCC_OPTION_b, 0 },
+static void print_search_dirs(TCCState *s)
+{
+    printf("install: %s\n", s->tcc_lib_path);
+    /* print_dirs("programs", NULL, 0); */
+    print_dirs("include", s->sysinclude_paths, s->nb_sysinclude_paths);
+    print_dirs("libraries", s->library_paths, s->nb_library_paths);
+    printf("libtcc1:\n  %s/"TCC_LIBTCC1"\n", s->tcc_lib_path);
+#ifndef TCC_TARGET_PE
+    print_dirs("crt", s->crt_paths, s->nb_crt_paths);
+    printf("elfinterp:\n  %s\n",  DEFAULT_ELFINTERP(s));
 #endif
-    { "g", TCC_OPTION_g, TCC_OPTION_HAS_ARG | TCC_OPTION_NOSEP },
-    { "c", TCC_OPTION_c, 0 },
-    { "static", TCC_OPTION_static, 0 },
-    { "shared", TCC_OPTION_shared, 0 },
-    { "soname", TCC_OPTION_soname, TCC_OPTION_HAS_ARG },
-    { "o", TCC_OPTION_o, TCC_OPTION_HAS_ARG },
-    { "run", TCC_OPTION_run, TCC_OPTION_HAS_ARG | TCC_OPTION_NOSEP },
-    { "rdynamic", TCC_OPTION_rdynamic, 0 },
-    { "r", TCC_OPTION_r, 0 },
-    { "Wl,", TCC_OPTION_Wl, TCC_OPTION_HAS_ARG | TCC_OPTION_NOSEP },
-    { "W", TCC_OPTION_W, TCC_OPTION_HAS_ARG | TCC_OPTION_NOSEP },
-    { "O", TCC_OPTION_O, TCC_OPTION_HAS_ARG | TCC_OPTION_NOSEP },
-    { "m", TCC_OPTION_m, TCC_OPTION_HAS_ARG },
-    { "f", TCC_OPTION_f, TCC_OPTION_HAS_ARG | TCC_OPTION_NOSEP },
-    { "nostdinc", TCC_OPTION_nostdinc, 0 },
-    { "nostdlib", TCC_OPTION_nostdlib, 0 },
-    { "print-search-dirs", TCC_OPTION_print_search_dirs, 0 }, 
-    { "v", TCC_OPTION_v, TCC_OPTION_HAS_ARG | TCC_OPTION_NOSEP },
-    { "w", TCC_OPTION_w, 0 },
-    { "pipe", TCC_OPTION_pipe, 0},
-    { "E", TCC_OPTION_E, 0},
-    { "x", TCC_OPTION_x, TCC_OPTION_HAS_ARG },
-    { NULL },
-};
+}

-static int64_t getclock_us(void)
+static void set_environment(TCCState *s)
+{
+    char * path;
+
+    path = getenv("C_INCLUDE_PATH");
+    if(path != NULL) {
+        tcc_add_sysinclude_path(s, path);
+    }
+    path = getenv("CPATH");
+    if(path != NULL) {
+        tcc_add_include_path(s, path);
+    }
+    path = getenv("LIBRARY_PATH");
+    if(path != NULL) {
+        tcc_add_library_path(s, path);
+    }
+}
+
+static char *default_outputfile(TCCState *s, const char *first_file)
+{
+    char buf[1024];
+    char *ext;
+    const char *name = "a";
+
+    if (first_file && strcmp(first_file, "-"))
+        name = tcc_basename(first_file);
+    snprintf(buf, sizeof(buf), "%s", name);
+    ext = tcc_fileextension(buf);
+#ifdef TCC_TARGET_PE
+    if (s->output_type == TCC_OUTPUT_DLL)
+        strcpy(ext, ".dll");
+    else
+    if (s->output_type == TCC_OUTPUT_EXE)
+        strcpy(ext, ".exe");
+    else
+#endif
+    if (s->output_type == TCC_OUTPUT_OBJ && !s->option_r && *ext)
+        strcpy(ext, ".o");
+    else
+        strcpy(buf, "a.out");
+    return tcc_strdup(buf);
+}
+
+static unsigned getclock_ms(void)
 {
 #ifdef _WIN32
-    struct _timeb tb;
-    _ftime(&tb);
-    return (tb.time * 1000LL + tb.millitm) * 1000LL;
+    return GetTickCount();
 #else
    struct timeval tv;
    gettimeofday(&tv, NULL);
-    return tv.tv_sec * 1000000LL + tv.tv_usec;
+    return tv.tv_sec*1000 + (tv.tv_usec+500)/1000;
 #endif
 }

-static int strstart(const char *str, const char *val, const char **ptr)
+int main(int argc0, char **argv0)
 {
-    const char *p, *q;
-    p = str;
-    q = val;
-    while (*q != '\0') {
-        if (*p != *q)
-            return 0;
-        p++;
-        q++;
-    }
-    if (ptr)
-        *ptr = p;
-    return 1;
-}
-
-/* convert 'str' into an array of space separated strings */
-static int expand_args(char ***pargv, const char *str)
-{
-    const char *s1;
-    char **argv, *arg;
-    int argc, len;
-
-    argc = 0;
-    argv = NULL;
-    for(;;) {
-        while (is_space(*str))
-            str++;
-        if (*str == '\0')
-            break;
-        s1 = str;
-        while (*str != '\0' && !is_space(*str))
-            str++;
-        len = str - s1;
-        arg = tcc_malloc(len + 1);
-        memcpy(arg, s1, len);
-        arg[len] = '\0';
-        dynarray_add((void ***)&argv, &argc, arg);
-    }
-    *pargv = argv;
-    return argc;
-}
-
-int parse_args(TCCState *s, int argc, char **argv)
-{
-    int optind;
-    const TCCOption *popt;
-    const char *optarg, *p1, *r1;
-    char *r;
-
-    optind = 0;
-    while (optind < argc) {
-
-        r = argv[optind++];
-        if (r[0] != '-' || r[1] == '\0') {
-            /* add a new file */
-            dynarray_add((void ***)&files, &nb_files, r);
-            if (!multiple_files) {
-                optind--;
-                /* argv[0] will be this file */
-                break;
-            }
-        } else {
-            /* find option in table (match only the first chars */
-            popt = tcc_options;
-            for(;;) {
-                p1 = popt->name;
-                if (p1 == NULL)
-                    error("invalid option -- '%s'", r);
-                r1 = r + 1;
-                for(;;) {
-                    if (*p1 == '\0')
-                        goto option_found;
-                    if (*r1 != *p1)
-                        break;
-                    p1++;
-                    r1++;
-                }
-                popt++;
-            }
-        option_found:
-            if (popt->flags & TCC_OPTION_HAS_ARG) {
-                if (*r1 != '\0' || (popt->flags & TCC_OPTION_NOSEP)) {
-                    optarg = r1;
-                } else {
-                    if (optind >= argc)
-                        error("argument to '%s' is missing", r);
-                    optarg = argv[optind++];
-                }
-            } else {
-                if (*r1 != '\0')
-                    return 0;
-                optarg = NULL;
-            }
-                
-            switch(popt->index) {
-            case TCC_OPTION_HELP:
-                return 0;
-
-            case TCC_OPTION_I:
-                if (tcc_add_include_path(s, optarg) < 0)
-                    error("too many include paths");
-                break;
-            case TCC_OPTION_D:
-                {
-                    char *sym, *value;
-                    sym = (char *)optarg;
-                    value = strchr(sym, '=');
-                    if (value) {
-                        *value = '\0';
-                        value++;
-                    }
-                    tcc_define_symbol(s, sym, value);
-                }
-                break;
-            case TCC_OPTION_U:
-                tcc_undefine_symbol(s, optarg);
-                break;
-            case TCC_OPTION_L:
-                tcc_add_library_path(s, optarg);
-                break;
-            case TCC_OPTION_B:
-                /* set tcc utilities path (mainly for tcc development) */
-                tcc_set_lib_path(s, optarg);
-                break;
-            case TCC_OPTION_l:
-                dynarray_add((void ***)&files, &nb_files, r);
-                nb_libraries++;
-                break;
-            case TCC_OPTION_bench:
-                do_bench = 1;
-                break;
-#ifdef CONFIG_TCC_BACKTRACE
-            case TCC_OPTION_bt:
-                num_callers = atoi(optarg);
-                break;
-#endif
-#ifdef CONFIG_TCC_BCHECK
-            case TCC_OPTION_b:
-                s->do_bounds_check = 1;
-                s->do_debug = 1;
-                break;
-#endif
-            case TCC_OPTION_g:
-                s->do_debug = 1;
-                break;
-            case TCC_OPTION_c:
-                multiple_files = 1;
-                output_type = TCC_OUTPUT_OBJ;
-                break;
-            case TCC_OPTION_static:
-                s->static_link = 1;
-                break;
-            case TCC_OPTION_shared:
-                output_type = TCC_OUTPUT_DLL;
-                break;
-            case TCC_OPTION_soname:
-                s->soname = optarg; 
-                break;
-            case TCC_OPTION_o:
-                multiple_files = 1;
-                outfile = optarg;
-                break;
-            case TCC_OPTION_r:
-                /* generate a .o merging several output files */
-                reloc_output = 1;
-                output_type = TCC_OUTPUT_OBJ;
-                break;
-            case TCC_OPTION_nostdinc:
-                s->nostdinc = 1;
-                break;
-            case TCC_OPTION_nostdlib:
-                s->nostdlib = 1;
-                break;
-            case TCC_OPTION_print_search_dirs:
-                print_search_dirs = 1;
-                break;
-            case TCC_OPTION_run:
-                {
-                    int argc1;
-                    char **argv1;
-                    argc1 = expand_args(&argv1, optarg);
-                    if (argc1 > 0) {
-                        parse_args(s, argc1, argv1);
-                    }
-                    multiple_files = 0;
-                    output_type = TCC_OUTPUT_MEMORY;
-                }
-                break;
-            case TCC_OPTION_v:
-                do {
-                    if (0 == s->verbose++)
-                        printf("tcc version %s\n", TCC_VERSION);
-                } while (*optarg++ == 'v');
-                break;
-            case TCC_OPTION_f:
-                if (tcc_set_flag(s, optarg, 1) < 0 && s->warn_unsupported)
-                    goto unsupported_option;
-                break;
-            case TCC_OPTION_W:
-                if (tcc_set_warning(s, optarg, 1) < 0 && 
-                    s->warn_unsupported)
-                    goto unsupported_option;
-                break;
-            case TCC_OPTION_w:
-                s->warn_none = 1;
-                break;
-            case TCC_OPTION_rdynamic:
-                s->rdynamic = 1;
-                break;
-            case TCC_OPTION_Wl:
-                {
-                    const char *p;
-                    if (strstart(optarg, "-Ttext,", &p)) {
-                        s->text_addr = strtoul(p, NULL, 16);
-                        s->has_text_addr = 1;
-                    } else if (strstart(optarg, "--section-alignment,", &p)) {
-                        s->section_align = strtoul(p, NULL, 16);
-                    } else if (strstart(optarg, "--image-base,", &p)) {
-                        s->text_addr = strtoul(p, NULL, 16);
-                        s->has_text_addr = 1;
-#ifdef TCC_TARGET_PE
-                    } else if (strstart(optarg, "--file-alignment,", &p)) {
-                        s->pe_file_align = strtoul(p, NULL, 16);
-                    } else if (strstart(optarg, "--subsystem,", &p)) {
-#if defined(TCC_TARGET_I386) || defined(TCC_TARGET_X86_64)
-                        if (!strcmp(p, "native"))
-                            s->pe_subsystem = 1;
-                        else if (!strcmp(p, "console"))
-                            s->pe_subsystem = 3;
-                        else if (!strcmp(p, "gui"))
-                            s->pe_subsystem = 2;
-                        else if (!strcmp(p, "posix"))
-                            s->pe_subsystem = 7;
-                        else if (!strcmp(p, "efiapp"))
-                            s->pe_subsystem = 10;
-                        else if (!strcmp(p, "efiboot"))
-                            s->pe_subsystem = 11;
-                        else if (!strcmp(p, "efiruntime"))
-                            s->pe_subsystem = 12;
-                        else if (!strcmp(p, "efirom"))
-                            s->pe_subsystem = 13;
-#elif defined(TCC_TARGET_ARM)
-                        if (!strcmp(p, "wince"))
-                            s->pe_subsystem = 9;
-#endif
-                        else {
-                            error("invalid subsystem '%s'", p);
-                        }
-#endif
-                    } else if (strstart(optarg, "--oformat,", &p)) {
-#if defined(TCC_TARGET_PE)
-                        if (strstart(p, "pe-", NULL)) {
-#else
-#if defined(TCC_TARGET_X86_64)
-                        if (strstart(p, "elf64-", NULL)) {
-#else
-                        if (strstart(p, "elf32-", NULL)) {
-#endif
-#endif
-                            s->output_format = TCC_OUTPUT_FORMAT_ELF;
-                        } else if (!strcmp(p, "binary")) {
-                            s->output_format = TCC_OUTPUT_FORMAT_BINARY;
-                        } else
-#ifdef TCC_TARGET_COFF
-                        if (!strcmp(p, "coff")) {
-                            s->output_format = TCC_OUTPUT_FORMAT_COFF;
-                        } else
-#endif
-                        {
-                            error("target %s not found", p);
-                        }
-                    } else {
-                        error("unsupported linker option '%s'", optarg);
-                    }
-                }
-                break;
-            case TCC_OPTION_E:
-                output_type = TCC_OUTPUT_PREPROCESS;
-                break;
-            case TCC_OPTION_x:
-                break;
-            default:
-                if (s->warn_unsupported) {
-                unsupported_option:
-                    warning("unsupported option '%s'", r);
-                }
-                break;
-            }
-        }
-    }
-    return optind + 1;
-}
-
-int main(int argc, char **argv)
-{
-    int i;
    TCCState *s;
-    int nb_objfiles, ret, optind;
-    char objfilename[1024];
-    int64_t start_time = 0;
+    int ret, opt, n = 0, t = 0;
+    unsigned start_time = 0;
+    const char *first_file;
+    int argc; char **argv;
+    FILE *ppfp = stdout;

+redo:
+    argc = argc0, argv = argv0;
    s = tcc_new();
+    opt = tcc_parse_args(s, &argc, &argv, 1);

-    output_type = TCC_OUTPUT_EXE;
-    outfile = NULL;
-    multiple_files = 1;
-    files = NULL;
-    nb_files = 0;
-    nb_libraries = 0;
-    reloc_output = 0;
-    print_search_dirs = 0;
-    ret = 0;
-
-    optind = parse_args(s, argc - 1, argv + 1);
-    if (print_search_dirs) {
-        /* enough for Linux kernel */
-        printf("install: %s/\n", s->tcc_lib_path);
-        return 0;
-    }
-    if (optind == 0 || nb_files == 0) {
-        if (optind && s->verbose)
-            return 0;
-        help();
-        return 1;
-    }
-
-    nb_objfiles = nb_files - nb_libraries;
-
-    /* if outfile provided without other options, we output an
-       executable */
-    if (outfile && output_type == TCC_OUTPUT_MEMORY)
-        output_type = TCC_OUTPUT_EXE;
-
-    /* check -c consistency : only single file handled. XXX: checks file type */
-    if (output_type == TCC_OUTPUT_OBJ && !reloc_output) {
-        /* accepts only a single input file */
-        if (nb_objfiles != 1)
-            error("cannot specify multiple files with -c");
-        if (nb_libraries != 0)
-            error("cannot specify libraries with -c");
-    }
-    
-
-    if (output_type == TCC_OUTPUT_PREPROCESS) {
-        if (!outfile) {
-            s->outfile = stdout;
-        } else {
-            s->outfile = fopen(outfile, "w");
-            if (!s->outfile)
-                error("could not open '%s", outfile);
-        }
-    } else if (output_type != TCC_OUTPUT_MEMORY) {
-        if (!outfile) {
-            /* compute default outfile name */
-            char *ext;
-            const char *name = 
-                strcmp(files[0], "-") == 0 ? "a" : tcc_basename(files[0]);
-            pstrcpy(objfilename, sizeof(objfilename), name);
-            ext = tcc_fileextension(objfilename);
+    if ((n | t) == 0) {
+        if (opt == OPT_HELP)
+            return printf(help), 1;
+        if (opt == OPT_HELP2)
+            return printf(help2), 1;
+        if (opt == OPT_M32 || opt == OPT_M64)
+            tcc_tool_cross(s, argv, opt); /* never returns */
+        if (s->verbose)
+            printf(version);
+        if (opt == OPT_AR)
+            return tcc_tool_ar(s, argc, argv);
 #ifdef TCC_TARGET_PE
-            if (output_type == TCC_OUTPUT_DLL)
-                strcpy(ext, ".dll");
-            else
-            if (output_type == TCC_OUTPUT_EXE)
-                strcpy(ext, ".exe");
-            else
+        if (opt == OPT_IMPDEF)
+            return tcc_tool_impdef(s, argc, argv);
 #endif
-            if (output_type == TCC_OUTPUT_OBJ && !reloc_output && *ext)
-                strcpy(ext, ".o");
-            else
-                pstrcpy(objfilename, sizeof(objfilename), "a.out");
-            outfile = objfilename;
+        if (opt == OPT_V)
+            return 0;
+        if (opt == OPT_PRINT_DIRS) {
+            /* initialize search dirs */
+            set_environment(s);
+            tcc_set_output_type(s, TCC_OUTPUT_MEMORY);
+            print_search_dirs(s);
+            return 0;
        }
-    }

-    if (do_bench) {
-        start_time = getclock_us();
-    }
-
-    tcc_set_output_type(s, output_type);
-
-    /* compile or add each files or library */
-    for(i = 0; i < nb_files && ret == 0; i++) {
-        const char *filename;
-
-        filename = files[i];
-        if (filename[0] == '-' && filename[1]) {
-            if (tcc_add_library(s, filename + 2) < 0) {
-                error_noabort("cannot find %s", filename);
-                ret = 1;
-            }
-        } else {
-            if (1 == s->verbose)
-                printf("-> %s\n", filename);
-            if (tcc_add_file(s, filename) < 0)
-                ret = 1;
-        }
-    }
-
-    /* free all files */
-    tcc_free(files);
-
-    if (0 == ret) {
-        if (do_bench)
-            tcc_print_stats(s, getclock_us() - start_time);
+        n = s->nb_files;
+        if (n == 0)
+            tcc_error("no input files\n");

        if (s->output_type == TCC_OUTPUT_PREPROCESS) {
-            if (outfile)
-                fclose(s->outfile);
-        } else if (s->output_type == TCC_OUTPUT_MEMORY)
-            ret = tcc_run(s, argc - optind, argv + optind);
-        else
-            ret = tcc_output_file(s, outfile) ? 1 : 0;
+            if (s->outfile) {
+                ppfp = fopen(s->outfile, "w");
+                if (!ppfp)
+                    tcc_error("could not write '%s'", s->outfile);
+            }
+        } else if (s->output_type == TCC_OUTPUT_OBJ && !s->option_r) {
+            if (s->nb_libraries)
+                tcc_error("cannot specify libraries with -c");
+            if (n > 1 && s->outfile)
+                tcc_error("cannot specify output file with -c many files");
+        } else {
+            if (s->option_pthread)
+                tcc_set_options(s, "-lpthread");
+        }
+
+        if (s->do_bench)
+            start_time = getclock_ms();
    }

-    tcc_delete(s);
+    set_environment(s);
+    if (s->output_type == 0)
+        s->output_type = TCC_OUTPUT_EXE;
+    tcc_set_output_type(s, s->output_type);
+    s->ppfp = ppfp;

-#ifdef MEM_DEBUG
-    if (do_bench) {
-        printf("memory: %d bytes, max = %d bytes\n", mem_cur_size, mem_max_size);
+    if ((s->output_type == TCC_OUTPUT_MEMORY
+      || s->output_type == TCC_OUTPUT_PREPROCESS) && (s->dflag & 16))
+        s->dflag |= t ? 32 : 0, s->run_test = ++t, n = s->nb_files;
+
+    /* compile or add each files or library */
+    for (first_file = NULL, ret = 0;;) {
+        struct filespec *f = s->files[s->nb_files - n];
+        s->filetype = f->type;
+        s->alacarte_link = f->alacarte;
+        if (f->type == AFF_TYPE_LIB) {
+            if (tcc_add_library_err(s, f->name) < 0)
+                ret = 1;
+        } else {
+            if (1 == s->verbose)
+                printf("-> %s\n", f->name);
+            if (!first_file)
+                first_file = f->name;
+            if (tcc_add_file(s, f->name) < 0)
+                ret = 1;
+        }
+        s->filetype = 0;
+        s->alacarte_link = 1;
+        if (--n == 0 || ret
+            || (s->output_type == TCC_OUTPUT_OBJ && !s->option_r))
+            break;
    }
+
+    if (s->run_test) {
+        t = 0;
+    } else if (s->output_type == TCC_OUTPUT_PREPROCESS) {
+        ;
+    } else if (0 == ret) {
+        if (s->output_type == TCC_OUTPUT_MEMORY) {
+#ifdef TCC_IS_NATIVE
+            ret = tcc_run(s, argc, argv);
 #endif
+        } else {
+            if (!s->outfile)
+                s->outfile = default_outputfile(s, first_file);
+            if (tcc_output_file(s, s->outfile))
+                ret = 1;
+            else if (s->gen_deps)
+                gen_makedeps(s, s->outfile, s->deps_outfile);
+        }
+    }
+
+    if (s->do_bench && (n | t | ret) == 0)
+        tcc_print_stats(s, getclock_ms() - start_time);
+    tcc_delete(s);
+    if (ret == 0 && n)
+        goto redo; /* compile more files with -c */
+    if (t)
+        goto redo; /* run more tests with -dt -run */
+    if (ppfp && ppfp != stdout)
+        fclose(ppfp);
    return ret;
 }
--- a/tcc.h
+++ b/tcc.h
--- a/tccasm.c
+++ b/tccasm.c
--- a/tcccoff.c
+++ b/tcccoff.c
@ -18,7 +18,8 @@
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
-#include "coff.h"
+
+#include "tcc.h"

 #define MAXNSCNS 255		/* MAXIMUM NUMBER OF SECTIONS         */
 #define MAX_STR_TABLE 1000000
@ -37,7 +38,7 @@ int EndAddress[MAX_FUNCS];
 int LastLineNo[MAX_FUNCS];
 int FuncEntries[MAX_FUNCS];

-BOOL OutputTheSection(Section * sect);
+int OutputTheSection(Section * sect);
 short int GetCoffFlags(const char *s);
 void SortSymbolTable(void);
 Section *FindSection(TCCState * s1, const char *sname);
@ -73,7 +74,7 @@ typedef struct {
    unsigned short dummy4;
 } AUXEF;

-int tcc_output_coff(TCCState *s1, FILE *f)
+ST_FUNC int tcc_output_coff(TCCState *s1, FILE *f)
 {
    Section *tcc_sect;
    SCNHDR *coff_sec;
@ -532,7 +533,7 @@ int tcc_output_coff(TCCState *s1, FILE *f)
 	    } else {
 		if (pCoff_str_table - Coff_str_table + strlen(name) >
 		    MAX_STR_TABLE - 1)
-		    error("String table too large");
+		    tcc_error("String table too large");

 		csym._n._n_n._n_zeroes = 0;
 		csym._n._n_n._n_offset =
@ -562,11 +563,7 @@ int tcc_output_coff(TCCState *s1, FILE *f)
 		}

 		if (k >= nFuncs) {
-		    char s[256];
-
-		    sprintf(s, "debug info can't find function: %s", name);
-
-		    error(s);
+		    tcc_error("debug info can't find function: %s", name);
 		}
 		// put a Function Name

@ -733,13 +730,7 @@ void SortSymbolTable(void)
 		    }

 		    if (k >= nFuncs) {
-			char s[256];
-
-			sprintf(s,
-				"debug (sort) info can't find function: %s",
-				name2);
-
-			error(s);
+                        tcc_error("debug (sort) info can't find function: %s", name2);
 		    }

 		    if (strcmp(AssociatedFile[k], name) == 0) {
@ -766,7 +757,7 @@ void SortSymbolTable(void)
    }

    if (n != nb_syms)
-	error("Internal Compiler error, debug info");
+	tcc_error("Internal Compiler error, debug info");

    // copy it all back

@ -823,14 +814,14 @@ int FindCoffSymbolIndex(const char *func_name)
    return n;			// total number of symbols
 }

-BOOL OutputTheSection(Section * sect)
+int OutputTheSection(Section * sect)
 {
    const char *s = sect->name;

    if (!strcmp(s, ".text"))
-	return true;
+	return 1;
    else if (!strcmp(s, ".data"))
-	return true;
+	return 1;
    else
 	return 0;
 }
@ -863,11 +854,11 @@ Section *FindSection(TCCState * s1, const char *sname)
 	    return s;
    }

-    error("could not find section %s", sname);
+    tcc_error("could not find section %s", sname);
    return 0;
 }

-int tcc_load_coff(TCCState * s1, int fd)
+ST_FUNC int tcc_load_coff(TCCState * s1, int fd)
 {
 // tktk TokenSym *ts;

@ -881,39 +872,39 @@ int tcc_load_coff(TCCState * s1, int fd)

    f = fdopen(fd, "rb");
    if (!f) {
-	error("Unable to open .out file for input");
+	tcc_error("Unable to open .out file for input");
    }

    if (fread(&file_hdr, FILHSZ, 1, f) != 1)
-	error("error reading .out file for input");
+	tcc_error("error reading .out file for input");

    if (fread(&o_filehdr, sizeof(o_filehdr), 1, f) != 1)
-	error("error reading .out file for input");
+	tcc_error("error reading .out file for input");

    // first read the string table

    if (fseek(f, file_hdr.f_symptr + file_hdr.f_nsyms * SYMESZ, SEEK_SET))
-	error("error reading .out file for input");
+	tcc_error("error reading .out file for input");

    if (fread(&str_size, sizeof(int), 1, f) != 1)
-	error("error reading .out file for input");
+	tcc_error("error reading .out file for input");


    Coff_str_table = (char *) tcc_malloc(str_size);

    if (fread(Coff_str_table, str_size - 4, 1, f) != 1)
-	error("error reading .out file for input");
+	tcc_error("error reading .out file for input");

    // read/process all the symbols

    // seek back to symbols

    if (fseek(f, file_hdr.f_symptr, SEEK_SET))
-	error("error reading .out file for input");
+	tcc_error("error reading .out file for input");

    for (i = 0; i < file_hdr.f_nsyms; i++) {
 	if (fread(&csym, SYMESZ, 1, f) != 1)
-	    error("error reading .out file for input");
+	    tcc_error("error reading .out file for input");

 	if (csym._n._n_n._n_zeroes == 0) {
 	    name = Coff_str_table + csym._n._n_n._n_offset - 4;
@ -942,13 +933,13 @@ int tcc_load_coff(TCCState * s1, int fd)
 	    if (name[0] == '_' && strcmp(name, "_main") != 0)
 		name++;

-	    tcc_add_symbol(s1, name, (void*)csym.n_value);
+	    tcc_add_symbol(s1, name, (void*)(uintptr_t)csym.n_value);
 	}
 	// skip any aux records

 	if (csym.n_numaux == 1) {
 	    if (fread(&csym, SYMESZ, 1, f) != 1)
-		error("error reading .out file for input");
+		tcc_error("error reading .out file for input");
 	    i++;
 	}
    }
--- a/tccelf.c
+++ b/tccelf.c
--- a/tccgen.c
+++ b/tccgen.c
--- a/include/tcclib.h
+++ b/include/tcclib.h
@ -39,6 +39,7 @@ int getchar(void);
 char *gets(char *s);
 int ungetc(int c, FILE *stream);
 int fflush(FILE *stream);
+int putchar (int c);

 int printf(const char *format, ...);
 int fprintf(FILE *stream, const char *format, ...);
@ -64,6 +65,7 @@ void *memcpy(void *dest, const void *src, size_t n);
 void *memmove(void *dest, const void *src, size_t n);
 void *memset(void *s, int c, size_t n);
 char *strdup(const char *s);
+size_t strlen(const char *s);

 /* dlfcn.h */
 #define RTLD_LAZY       0x001
--- a/tccpe.c
+++ b/tccpe.c
--- a/tccpp.c
+++ b/tccpp.c
--- a/tccrun.c
+++ b/tccrun.c
@ -0,0 +1,844 @@
+/*
+ *  TCC - Tiny C Compiler - Support for -run switch
+ *
+ *  Copyright (c) 2001-2004 Fabrice Bellard
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include "tcc.h"
+
+/* only native compiler supports -run */
+#ifdef TCC_IS_NATIVE
+
+#ifndef _WIN32
+# include <sys/mman.h>
+#endif
+
+#ifdef CONFIG_TCC_BACKTRACE
+# ifndef _WIN32
+#  include <signal.h>
+#  ifndef __OpenBSD__
+#   include <sys/ucontext.h>
+#  endif
+# else
+#  define ucontext_t CONTEXT
+# endif
+ST_DATA int rt_num_callers = 6;
+ST_DATA const char **rt_bound_error_msg;
+ST_DATA void *rt_prog_main;
+static int rt_get_caller_pc(addr_t *paddr, ucontext_t *uc, int level);
+static void rt_error(ucontext_t *uc, const char *fmt, ...);
+static void set_exception_handler(void);
+#endif
+
+static void set_pages_executable(void *ptr, unsigned long length);
+static int tcc_relocate_ex(TCCState *s1, void *ptr, addr_t ptr_diff);
+
+#ifdef _WIN64
+static void *win64_add_function_table(TCCState *s1);
+static void win64_del_function_table(void *);
+#endif
+
+/* ------------------------------------------------------------- */
+/* Do all relocations (needed before using tcc_get_symbol()).
+
+   ptr: either TCC_RELOCATE_AUTO, in which case the memory is obtained here
+        and recorded in s1->runtime_mem so that tcc_run_free() can release
+        it, or a caller supplied value that is handed to tcc_relocate_ex()
+        unchanged.
+
+   Returns -1 on error.  With TCC_RELOCATE_AUTO returns 0 on success,
+   otherwise the result of tcc_relocate_ex() for 'ptr'. */
+
+LIBTCCAPI int tcc_relocate(TCCState *s1, void *ptr)
+{
+    int size;
+    addr_t ptr_diff = 0;
+
+    if (TCC_RELOCATE_AUTO != ptr)
+        return tcc_relocate_ex(s1, ptr, 0);
+
+    /* a first call with a NULL buffer only reports the required size */
+    size = tcc_relocate_ex(s1, NULL, 0);
+    if (size < 0)
+        return -1;
+
+#ifdef HAVE_SELINUX
+{
+    /* Using mmap instead of malloc: the same temporary file is mapped
+       twice, once writable (ptr) and once executable (prx).
+       tcc_error() is relied on not to return, as the MAP_FAILED check
+       below already does. */
+    void *prx;
+    char tmpfname[] = "/tmp/.tccrunXXXXXX";
+    int fd = mkstemp(tmpfname);
+    if (fd < 0)
+        tcc_error("tccrun: could not create temporary file");
+    unlink(tmpfname);
+    if (ftruncate(fd, size) < 0) {
+        close(fd);
+        tcc_error("tccrun: could not resize temporary file");
+    }
+
+    ptr = mmap (NULL, size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
+    prx = mmap (NULL, size, PROT_READ|PROT_EXEC, MAP_SHARED, fd, 0);
+    /* the mappings stay valid without the descriptor: do not leak it */
+    close(fd);
+    if (ptr == MAP_FAILED || prx == MAP_FAILED)
+        tcc_error("tccrun: could not map memory");
+    /* recorded as size, executable mapping, then (below) writable mapping */
+    dynarray_add(&s1->runtime_mem, &s1->nb_runtime_mem, (void*)(addr_t)size);
+    dynarray_add(&s1->runtime_mem, &s1->nb_runtime_mem, prx);
+    ptr_diff = (char*)prx - (char*)ptr;
+}
+#else
+    ptr = tcc_malloc(size);
+#endif
+    tcc_relocate_ex(s1, ptr, ptr_diff); /* no more errors expected */
+    dynarray_add(&s1->runtime_mem, &s1->nb_runtime_mem, ptr);
+    return 0;
+}
+
+/* Release every runtime memory area recorded in s1->runtime_mem,
+   then the array itself. */
+ST_FUNC void tcc_run_free(TCCState *s1)
+{
+    int idx = 0;
+
+    while (idx < s1->nb_runtime_mem) {
+#ifdef HAVE_SELINUX
+        /* entries come in groups of three: the mapping size followed by
+           the two mappings of that size */
+        unsigned map_size = (unsigned)(addr_t)s1->runtime_mem[idx];
+        munmap(s1->runtime_mem[idx + 1], map_size);
+        munmap(s1->runtime_mem[idx + 2], map_size);
+        idx += 3;
+#else
+        void *area = s1->runtime_mem[idx];
+#ifdef _WIN64
+        /* the first pointer-sized slot of the area holds the function table */
+        win64_del_function_table(*(void**)area);
+#endif
+        tcc_free(area);
+        idx += 1;
+#endif
+    }
+    tcc_free(s1->runtime_mem);
+}
+
+/* Launch the compiled program with the given arguments: relocate it in
+   memory, look up "main" and call it with argc/argv.
+   Returns the value returned by that function, -1 if relocation fails, or
+   0 when (dflag & 16) is set and no "main" symbol exists. */
+LIBTCCAPI int tcc_run(TCCState *s1, int argc, char **argv)
+{
+    int (*entry)(int, char **);
+
+    s1->runtime_main = "main";
+    if (s1->dflag & 16) {
+        /* in this mode a missing entry point is silently accepted */
+        if (!find_elf_sym(s1->symtab, s1->runtime_main))
+            return 0;
+    }
+    if (tcc_relocate(s1, TCC_RELOCATE_AUTO) < 0)
+        return -1;
+    entry = tcc_get_symbol_err(s1, s1->runtime_main);
+
+#ifdef CONFIG_TCC_BACKTRACE
+    if (s1->do_debug) {
+        set_exception_handler();
+        /* remembered so that backtraces can stop at the program's main */
+        rt_prog_main = entry;
+    }
+#endif
+
+    errno = 0; /* clean errno value */
+
+#ifdef CONFIG_TCC_BCHECK
+    if (s1->do_bounds_check) {
+        void (*init_bounds)(void);
+        void (*exit_bounds)(void);
+        void (*add_region)(void *p, addr_t size);
+        int  (*del_region)(void *p);
+        int arg, status;
+
+        /* set error function */
+        rt_bound_error_msg = tcc_get_symbol_err(s1, "__bound_error_msg");
+        /* XXX: use .init section so that it also works in binary ? */
+        init_bounds = tcc_get_symbol_err(s1, "__bound_init");
+        exit_bounds = tcc_get_symbol_err(s1, "__bound_exit");
+        add_region = tcc_get_symbol_err(s1, "__bound_new_region");
+        del_region = tcc_get_symbol_err(s1, "__bound_delete_region");
+
+        init_bounds();
+        /* mark the argument vector and every argument string as valid */
+        add_region(argv, argc*sizeof(argv[0]));
+        for (arg = 0; arg < argc; ++arg)
+            add_region(argv[arg], strlen(argv[arg]) + 1);
+
+        status = entry(argc, argv);
+
+        /* unmark them again, strings first */
+        for (arg = 0; arg < argc; ++arg)
+            del_region(argv[arg]);
+        del_region(argv);
+        exit_bounds();
+        return status;
+    }
+#endif
+    return entry(argc, argv);
+}
+
+#if defined TCC_TARGET_I386 || defined TCC_TARGET_X86_64
+ #define RUN_SECTION_ALIGNMENT 63
+#else
+ #define RUN_SECTION_ALIGNMENT 15
+#endif
+
+/* Relocate code. Return -1 on error, the required size if ptr is NULL,
+   otherwise copy the section contents into the buffer passed by the caller
+   and return 0.
+   ptr_diff is added to the address assigned to executable sections and
+   subtracted again when their contents are copied, so the code is written
+   into the buffer at 'ptr' but relocated as if it lived 'ptr_diff' bytes
+   further on. */
+static int tcc_relocate_ex(TCCState *s1, void *ptr, addr_t ptr_diff)
+{
+    Section *s;
+    unsigned offset, length, fill, i, k;
+    addr_t mem;
+
+    /* size query (ptr == NULL): prepare the sections before measuring them */
+    if (NULL == ptr) {
+        s1->nb_errors = 0;
+#ifdef TCC_TARGET_PE
+        pe_output_file(s1, NULL);
+#else
+        tcc_add_runtime(s1);
+	resolve_common_syms(s1);
+        build_got_entries(s1);
+#endif
+        if (s1->nb_errors)
+            return -1;
+    }
+
+    offset = 0, mem = (addr_t)ptr;
+    /* padding that rounds 'mem' up to a multiple of RUN_SECTION_ALIGNMENT + 1 */
+    fill = -mem & RUN_SECTION_ALIGNMENT;
+#ifdef _WIN64
+    /* reserve a pointer-sized slot at the start of the buffer; it receives
+       the function table pointer at the end of this function */
+    offset += sizeof (void*);
+#endif
+    /* pass k == 0 lays out the executable sections, pass k == 1 the others */
+    for (k = 0; k < 2; ++k) {
+        for(i = 1; i < s1->nb_sections; i++) {
+            s = s1->sections[i];
+            if (0 == (s->sh_flags & SHF_ALLOC))
+                continue;
+            if (k != !(s->sh_flags & SHF_EXECINSTR))
+                continue;
+            offset += fill;
+            if (!mem)
+                s->sh_addr = 0;
+            else if (s->sh_flags & SHF_EXECINSTR)
+                s->sh_addr = mem + offset + ptr_diff;
+            else
+                s->sh_addr = mem + offset;
+#if 0
+            if (mem)
+                printf("%-16s +%02lx %p %04x\n",
+                    s->name, fill, (void*)s->sh_addr, (unsigned)s->data_offset);
+#endif
+            offset += s->data_offset;
+            /* the next section starts on a 16 byte boundary */
+            fill = -(mem + offset) & 15;
+        }
+#if RUN_SECTION_ALIGNMENT > 15
+        /* x86 processors would reload cached instructions each time data is
+           written nearby, so we need to make sure that code and data do not
+           share the same 64 byte unit */
+        fill = -(mem + offset) & RUN_SECTION_ALIGNMENT;
+#endif
+    }
+
+    /* relocate symbols */
+    relocate_syms(s1, s1->symtab, 1);
+    if (s1->nb_errors)
+        return -1;
+
+    /* size query: report the laid out size plus RUN_SECTION_ALIGNMENT extra
+       bytes (presumably slack for aligning the start of the buffer) */
+    if (0 == mem)
+        return offset + RUN_SECTION_ALIGNMENT;
+
+#ifdef TCC_TARGET_PE
+    s1->pe_imagebase = mem;
+#endif
+
+    /* relocate each section */
+    for(i = 1; i < s1->nb_sections; i++) {
+        s = s1->sections[i];
+        if (s->reloc)
+            relocate_section(s1, s);
+    }
+    relocate_plt(s1);
+
+    /* copy (or zero-fill) every allocated section to its place in the buffer */
+    for(i = 1; i < s1->nb_sections; i++) {
+        s = s1->sections[i];
+        if (0 == (s->sh_flags & SHF_ALLOC))
+            continue;
+        length = s->data_offset;
+        ptr = (void*)s->sh_addr;
+        /* executable sections are written through the address without
+           ptr_diff */
+        if (s->sh_flags & SHF_EXECINSTR)
+            ptr = (char*)ptr - ptr_diff;
+        if (NULL == s->data || s->sh_type == SHT_NOBITS)
+            memset(ptr, 0, length);
+        else
+            memcpy(ptr, s->data, length);
+        /* mark executable sections as executable in memory */
+        if (s->sh_flags & SHF_EXECINSTR)
+            set_pages_executable((char*)ptr + ptr_diff, length);
+    }
+
+#ifdef _WIN64
+    /* store the function table pointer in the slot reserved above */
+    *(void**)mem = win64_add_function_table(s1);
+#endif
+
+    return 0;
+}
+
+/* ------------------------------------------------------------- */
+/* Allow code in memory to be run: make the range starting at 'ptr' and
+   'length' bytes long executable.
+   - _WIN32: VirtualProtect() with PAGE_EXECUTE_READWRITE.
+   - otherwise, unless HAVE_SELINUX is defined: mprotect() the enclosing
+     pages to read/write/exec, reporting a failure through tcc_error().
+   - ARM and ARM64 targets (non-Windows) additionally call __clear_cache()
+     on the range. */
+
+static void set_pages_executable(void *ptr, unsigned long length)
+{
+#ifdef _WIN32
+    unsigned long old_protect;
+    VirtualProtect(ptr, length, PAGE_EXECUTE_READWRITE, &old_protect);
+#else
+    void __clear_cache(void *beginning, void *end);
+# ifndef HAVE_SELINUX
+    addr_t start, end;
+#  ifndef PAGESIZE
+#   define PAGESIZE 4096 /* fallback used when PAGESIZE is not defined */
+#  endif
+    /* widen [ptr, ptr + length) to whole pages */
+    start = (addr_t)ptr & ~(PAGESIZE - 1);
+    end = (addr_t)ptr + length;
+    end = (end + PAGESIZE - 1) & ~(PAGESIZE - 1);
+    if (mprotect((void *)start, end - start, PROT_READ | PROT_WRITE | PROT_EXEC))
+        tcc_error("mprotect failed: did you mean to configure --with-selinux?");
+# endif
+# if defined TCC_TARGET_ARM || defined TCC_TARGET_ARM64
+    __clear_cache(ptr, (char *)ptr + length);
+# endif
+#endif
+}
+
+#ifdef _WIN64
+/* Register the entries of s1->uw_pdata (if any) with RtlAddFunctionTable(),
+   using s1->pe_imagebase as base address, and clear s1->uw_pdata.
+   Returns the address of the registered table, or NULL if there was none. */
+static void *win64_add_function_table(TCCState *s1)
+{
+    void *table;
+
+    if (!s1->uw_pdata)
+        return NULL;
+
+    table = (void*)s1->uw_pdata->sh_addr;
+    RtlAddFunctionTable((RUNTIME_FUNCTION*)table,
+                        s1->uw_pdata->data_offset / sizeof (RUNTIME_FUNCTION),
+                        s1->pe_imagebase);
+    s1->uw_pdata = NULL;
+    return table;
+}
+
+/* Unregister a table returned by win64_add_function_table(); a NULL
+   pointer is ignored. */
+static void win64_del_function_table(void *p)
+{
+    if (!p)
+        return;
+    RtlDeleteFunctionTable((RUNTIME_FUNCTION*)p);
+}
+#endif
+
+/* ------------------------------------------------------------- */
+#ifdef CONFIG_TCC_BACKTRACE
+
+/* Set the maximum number of stack frames that rt_error() prints
+   (stored in rt_num_callers). */
+ST_FUNC void tcc_set_num_callers(int n)
+{
+    rt_num_callers = n;
+}
+
+/* Print the position in the source file of PC value 'wanted_pc' by reading
+   the stabs debug information, falling back to the ELF symbol table when no
+   stab entry matches.  'msg' is printed in front of the address; all output
+   goes to stderr.
+   Returns the function address tracked for the matching entry, or 0 if
+   nothing was found. */
+static addr_t rt_printline(addr_t wanted_pc, const char *msg)
+{
+    char func_name[128], last_func_name[128];
+    addr_t func_addr, last_pc, pc;
+    const char *incl_files[INCLUDE_STACK_SIZE];
+    int incl_index, len, last_line_num, i;
+    const char *str, *p;
+
+    Stab_Sym *stab_sym = NULL, *stab_sym_end, *sym;
+    int stab_len = 0;
+    char *stab_str = NULL;
+
+    if (stab_section) {
+        stab_len = stab_section->data_offset;
+        stab_sym = (Stab_Sym *)stab_section->data;
+        stab_str = (char *) stabstr_section->data;
+    }
+
+    func_name[0] = '\0';
+    func_addr = 0;
+    incl_index = 0;
+    last_func_name[0] = '\0';
+    last_pc = (addr_t)-1;
+    last_line_num = 1;
+
+    if (!stab_sym)
+        goto no_stabs;
+
+    /* first pass: walk the stab entries, starting at the second one */
+    stab_sym_end = (Stab_Sym*)((char*)stab_sym + stab_len);
+    for (sym = stab_sym + 1; sym < stab_sym_end; ++sym) {
+        switch(sym->n_type) {
+            /* function start or end */
+        case N_FUN:
+            if (sym->n_strx == 0) {
+                /* we test if between last line and end of function */
+                pc = sym->n_value + func_addr;
+                if (wanted_pc >= last_pc && wanted_pc < pc)
+                    goto found;
+                func_name[0] = '\0';
+                func_addr = 0;
+            } else {
+                /* keep the name up to the first ':', truncated to fit */
+                str = stab_str + sym->n_strx;
+                p = strchr(str, ':');
+                if (!p) {
+                    pstrcpy(func_name, sizeof(func_name), str);
+                } else {
+                    len = p - str;
+                    if (len > sizeof(func_name) - 1)
+                        len = sizeof(func_name) - 1;
+                    memcpy(func_name, str, len);
+                    func_name[len] = '\0';
+                }
+                func_addr = sym->n_value;
+            }
+            break;
+            /* line number info */
+        case N_SLINE:
+            /* wanted_pc lies between the previous line entry and this one */
+            pc = sym->n_value + func_addr;
+            if (wanted_pc >= last_pc && wanted_pc < pc)
+                goto found;
+            last_pc = pc;
+            last_line_num = sym->n_desc;
+            /* XXX: slow! */
+            strcpy(last_func_name, func_name);
+            break;
+            /* include files */
+        case N_BINCL:
+            str = stab_str + sym->n_strx;
+        add_incl:
+            /* push the file name; silently dropped when the stack is full */
+            if (incl_index < INCLUDE_STACK_SIZE) {
+                incl_files[incl_index++] = str;
+            }
+            break;
+        case N_EINCL:
+            /* pop, but never below the first entry */
+            if (incl_index > 1)
+                incl_index--;
+            break;
+        case N_SO:
+            if (sym->n_strx == 0) {
+                incl_index = 0; /* end of translation unit */
+            } else {
+                str = stab_str + sym->n_strx;
+                /* do not add path */
+                len = strlen(str);
+                if (len > 0 && str[len - 1] != '/')
+                    goto add_incl;
+            }
+            break;
+        }
+    }
+
+no_stabs:
+    /* second pass: we try symtab symbols (no line number info) */
+    incl_index = 0;
+    if (symtab_section)
+    {
+        ElfW(Sym) *sym, *sym_end;
+        int type;
+
+        sym_end = (ElfW(Sym) *)(symtab_section->data + symtab_section->data_offset);
+        for(sym = (ElfW(Sym) *)symtab_section->data + 1;
+            sym < sym_end;
+            sym++) {
+            type = ELFW(ST_TYPE)(sym->st_info);
+            if (type == STT_FUNC || type == STT_GNU_IFUNC) {
+                /* match when wanted_pc falls inside [st_value, st_value + st_size) */
+                if (wanted_pc >= sym->st_value &&
+                    wanted_pc < sym->st_value + sym->st_size) {
+                    pstrcpy(last_func_name, sizeof(last_func_name),
+                            (char *) symtab_section->link->data + sym->st_name);
+                    func_addr = sym->st_value;
+                    goto found;
+                }
+            }
+        }
+    }
+    /* did not find any info: */
+    fprintf(stderr, "%s %p ???\n", msg, (void*)wanted_pc);
+    fflush(stderr);
+    return 0;
+ found:
+    /* the innermost file is on top of the include stack; it is only
+       available when the match came from the stabs pass */
+    i = incl_index;
+    if (i > 0)
+        fprintf(stderr, "%s:%d: ", incl_files[--i], last_line_num);
+    fprintf(stderr, "%s %p", msg, (void*)wanted_pc);
+    if (last_func_name[0] != '\0')
+        fprintf(stderr, " %s()", last_func_name);
+    /* remaining stack entries are the files that included it */
+    if (--i >= 0) {
+        fprintf(stderr, " (included from ");
+        for (;;) {
+            fprintf(stderr, "%s", incl_files[i]);
+            if (--i < 0)
+                break;
+            fprintf(stderr, ", ");
+        }
+        fprintf(stderr, ")");
+    }
+    fprintf(stderr, "\n");
+    fflush(stderr);
+    return func_addr;
+}
+
+/* Emit a run time error: print "Runtime error: " followed by the formatted
+   message, then a backtrace of at most rt_num_callers frames starting from
+   the context 'uc'.  The backtrace ends early when a frame cannot be
+   obtained or once the frame of the program's main function was printed. */
+static void rt_error(ucontext_t *uc, const char *fmt, ...)
+{
+    va_list args;
+    addr_t pc, func;
+    int level;
+
+    fprintf(stderr, "Runtime error: ");
+    va_start(args, fmt);
+    vfprintf(stderr, fmt, args);
+    va_end(args);
+    fprintf(stderr, "\n");
+
+    level = 0;
+    while (level < rt_num_callers) {
+        if (rt_get_caller_pc(&pc, uc, level) < 0)
+            break;
+        /* the faulting frame is introduced with "at", its callers with "by" */
+        func = rt_printline(pc, level == 0 ? "at" : "by");
+        if (func != 0 && func == (addr_t)rt_prog_main)
+            break;
+        ++level;
+    }
+}
+
+/* ------------------------------------------------------------- */
+#ifndef _WIN32
+
+/* Signal handler for fatal errors: report the error and a backtrace through
+   rt_error(), then terminate the process with exit status 255.
+   'siginf' supplies si_code to tell the SIGFPE causes apart, 'puc' is the
+   context passed on to rt_error(). */
+static void sig_error(int signum, siginfo_t *siginf, void *puc)
+{
+    ucontext_t *uc = puc;
+
+    switch(signum) {
+    case SIGFPE:
+        switch(siginf->si_code) {
+        case FPE_INTDIV:
+        case FPE_FLTDIV:
+            rt_error(uc, "division by zero");
+            break;
+        default:
+            rt_error(uc, "floating point exception");
+            break;
+        }
+        break;
+    case SIGBUS:
+    case SIGSEGV:
+        /* the message comes from the compiled program's runtime: never use
+           it as the format string itself */
+        if (rt_bound_error_msg && *rt_bound_error_msg)
+            rt_error(uc, "%s", *rt_bound_error_msg);
+        else
+            rt_error(uc, "dereferencing invalid pointer");
+        break;
+    case SIGILL:
+        rt_error(uc, "illegal instruction");
+        break;
+    case SIGABRT:
+        rt_error(uc, "abort() called");
+        break;
+    default:
+        rt_error(uc, "caught signal %d", signum);
+        break;
+    }
+    exit(255);
+}
+
+#ifndef SA_SIGINFO
+# define SA_SIGINFO 0x00000004u
+#endif
+
+/* Generate a stack backtrace when a CPU exception occurs: install
+   sig_error() as handler for the fatal signals listed below so that debug
+   info is printed on fatal runtime errors. */
+static void set_exception_handler(void)
+{
+    static const int fatal_signals[] = {
+        SIGFPE, SIGILL, SIGSEGV, SIGBUS, SIGABRT
+    };
+    struct sigaction action;
+    unsigned k;
+
+    sigemptyset(&action.sa_mask);
+    action.sa_sigaction = sig_error;
+    action.sa_flags = SA_SIGINFO | SA_RESETHAND;
+
+    for (k = 0; k < sizeof fatal_signals / sizeof fatal_signals[0]; ++k)
+        sigaction(fatal_signals[k], &action, NULL);
+}
+
+/* ------------------------------------------------------------- */
+#ifdef __i386__
+
+/* fix for glibc 2.1 */
+#ifndef REG_EIP
+#define REG_EIP EIP
+#define REG_EBP EBP
+#endif
+
+/* Return the PC at frame level 'level' in *paddr.  Level 0 is the PC stored
+   in the signal context 'uc'; higher levels are found by following the
+   chain of saved frame pointers starting at the context's EBP and reading
+   the return address stored next to the frame pointer.
+   Returns 0 on success, -1 (not found) if a frame pointer on the way is
+   outside the accepted address range. */
+static int rt_get_caller_pc(addr_t *paddr, ucontext_t *uc, int level)
+{
+    addr_t fp;
+    int i;
+
+    if (level == 0) {
+#if defined(__APPLE__)
+        *paddr = uc->uc_mcontext->__ss.__eip;
+#elif defined(__FreeBSD__) || defined(__FreeBSD_kernel__) || defined(__DragonFly__)
+        *paddr = uc->uc_mcontext.mc_eip;
+#elif defined(__dietlibc__)
+        *paddr = uc->uc_mcontext.eip;
+#elif defined(__NetBSD__)
+        *paddr = uc->uc_mcontext.__gregs[_REG_EIP];
+#elif defined(__OpenBSD__)
+        *paddr = uc->sc_eip;
+#else
+        *paddr = uc->uc_mcontext.gregs[REG_EIP];
+#endif
+        return 0;
+    } else {
+#if defined(__APPLE__)
+        fp = uc->uc_mcontext->__ss.__ebp;
+#elif defined(__FreeBSD__) || defined(__FreeBSD_kernel__) || defined(__DragonFly__)
+        fp = uc->uc_mcontext.mc_ebp;
+#elif defined(__dietlibc__)
+        fp = uc->uc_mcontext.ebp;
+#elif defined(__NetBSD__)
+        fp = uc->uc_mcontext.__gregs[_REG_EBP];
+#elif defined(__OpenBSD__)
+        /* the frame pointer must go to 'fp' like on every other platform;
+           storing it in *paddr left 'fp' uninitialized for the walk below */
+        fp = uc->sc_ebp;
+#else
+        fp = uc->uc_mcontext.gregs[REG_EBP];
+#endif
+        /* step up level - 1 frames; slot [0] of a frame holds the caller's
+           frame pointer */
+        for(i=1;i<level;i++) {
+            /* XXX: check address validity with program info */
+            if (fp <= 0x1000 || fp >= 0xc0000000)
+                return -1;
+            fp = ((addr_t *)fp)[0];
+        }
+        /* slot [1] of a frame holds the return address */
+        *paddr = ((addr_t *)fp)[1];
+        return 0;
+    }
+}
+
+/* ------------------------------------------------------------- */
+#elif defined(__x86_64__)
+
+/* return the PC at frame level 'level'. Return negative if not found.
+
+   Level 0 is the interrupted instruction itself: its address is read
+   from the saved instruction pointer in the signal context 'uc'.
+   For higher levels the saved frame pointer is followed; each frame is
+   read as { caller's frame pointer, return address }. */
+static int rt_get_caller_pc(addr_t *paddr, ucontext_t *uc, int level)
+{
+    addr_t fp;
+    int i;
+
+    if (level == 0) {
+#if defined(__APPLE__)
+        *paddr = uc->uc_mcontext->__ss.__rip;
+#elif defined(__FreeBSD__) || defined(__FreeBSD_kernel__) || defined(__DragonFly__)
+        *paddr = uc->uc_mcontext.mc_rip;
+#elif defined(__NetBSD__)
+        *paddr = uc->uc_mcontext.__gregs[_REG_RIP];
+#else
+        *paddr = uc->uc_mcontext.gregs[REG_RIP];
+#endif
+        return 0;
+    } else {
+#if defined(__APPLE__)
+        fp = uc->uc_mcontext->__ss.__rbp;
+#elif defined(__FreeBSD__) || defined(__FreeBSD_kernel__) || defined(__DragonFly__)
+        fp = uc->uc_mcontext.mc_rbp;
+#elif defined(__NetBSD__)
+        fp = uc->uc_mcontext.__gregs[_REG_RBP];
+#else
+        fp = uc->uc_mcontext.gregs[REG_RBP];
+#endif
+        for(i=1;i<level;i++) {
+            /* XXX: check address validity with program info */
+            if (fp <= 0x1000)
+                return -1;
+            fp = ((addr_t *)fp)[0];
+        }
+        /* the frame holding the return address is dereferenced too, so
+           it gets the same sanity check as the frames walked above */
+        if (fp <= 0x1000)
+            return -1;
+        *paddr = ((addr_t *)fp)[1];
+        return 0;
+    }
+}
+
+/* ------------------------------------------------------------- */
+#elif defined(__arm__)
+
+/* return the PC at frame level 'level'. Return negative if not found.
+
+   Only Linux is handled: on any other system both branches return -1
+   before touching 'uc'.  Level 0 yields the saved program counter from
+   the signal context; higher levels walk the frame chain using the
+   frame layout noted below. */
+static int rt_get_caller_pc(addr_t *paddr, ucontext_t *uc, int level)
+{
+    addr_t fp, sp;
+    int i;
+
+    if (level == 0) {
+        /* XXX: only supports linux */
+#if defined(__linux__)
+        *paddr = uc->uc_mcontext.arm_pc;
+#else
+        return -1;
+#endif
+        return 0;
+    } else {
+#if defined(__linux__)
+        fp = uc->uc_mcontext.arm_fp;
+        sp = uc->uc_mcontext.arm_sp;
+        /* clamp sp so that the fp test below also rejects near-null fp */
+        if (sp < 0x1000)
+            sp = 0x1000;
+#else
+        return -1;
+#endif
+        /* XXX: specific to tinycc stack frames */
+        /* fp must lie at least 12 bytes above sp and be word aligned */
+        if (fp < sp + 12 || fp & 3)
+            return -1;
+        for(i = 1; i < level; i++) {
+            /* word at fp[-2] is taken as the caller's sp: it must be
+               aligned and at most 16 bytes above the current fp */
+            sp = ((addr_t *)fp)[-2];
+            if (sp < fp || sp - fp > 16 || sp & 3)
+                return -1;
+            /* word at fp[-3] is taken as the caller's fp: same
+               plausibility test as for the initial frame */
+            fp = ((addr_t *)fp)[-3];
+            if (fp <= sp || fp - sp < 12 || fp & 3)
+                return -1;
+        }
+        /* XXX: check address validity with program info */
+        /* word at fp[-1] is taken as the return address of the frame */
+        *paddr = ((addr_t *)fp)[-1];
+        return 0;
+    }
+}
+
+/* ------------------------------------------------------------- */
+#elif defined(__aarch64__)
+
+/* return the PC at frame level 'level'. Return negative if not found.
+
+   Level 0 yields the saved pc from the signal context.  For higher
+   levels the frame records are followed starting at x29; each record
+   is read as { previous frame record, return address }.  A null
+   frame pointer ends the walk with -1 instead of being dereferenced. */
+static int rt_get_caller_pc(addr_t *paddr, ucontext_t *uc, int level)
+{
+    if (level < 0)
+        return -1;
+    else if (level == 0) {
+        *paddr = uc->uc_mcontext.pc;
+        return 0;
+    }
+    else {
+        addr_t *fp = (addr_t *)uc->uc_mcontext.regs[29];
+        int i;
+        for (i = 1; i < level; i++) {
+            if (!fp)
+                return -1;
+            fp = (addr_t *)fp[0];
+        }
+        /* the chain may end exactly at the requested level */
+        if (!fp)
+            return -1;
+        *paddr = fp[1];
+        return 0;
+    }
+}
+
+/* ------------------------------------------------------------- */
+#else
+
+#warning add arch specific rt_get_caller_pc()
+/* Fallback for architectures without a frame walker: no level can be
+   resolved, so always return -1 and leave '*paddr' untouched. */
+static int rt_get_caller_pc(addr_t *paddr, ucontext_t *uc, int level)
+{
+    return -1;
+}
+
+#endif /* !__i386__ */
+
+/* ------------------------------------------------------------- */
+#else /* WIN32 */
+
+/* Unhandled-exception filter: pick a message for the exception code,
+   report it through rt_error() together with the CPU context, and
+   return EXCEPTION_EXECUTE_HANDLER. */
+static long __stdcall cpu_exception_handler(EXCEPTION_POINTERS *ex_info)
+{
+    CONTEXT *uc = ex_info->ContextRecord;
+    const char *msg;
+
+    switch (ex_info->ExceptionRecord->ExceptionCode) {
+    case EXCEPTION_ACCESS_VIOLATION:
+        /* a non-null bound-checking message takes precedence */
+        if (rt_bound_error_msg && *rt_bound_error_msg)
+            msg = *rt_bound_error_msg;
+        else
+            msg = "access violation";
+        break;
+    case EXCEPTION_STACK_OVERFLOW:
+        msg = "stack overflow";
+        break;
+    case EXCEPTION_INT_DIVIDE_BY_ZERO:
+        msg = "division by zero";
+        break;
+    default:
+        msg = "exception caught";
+        break;
+    }
+    /* as before, the message itself is handed over as the format */
+    rt_error(uc, msg);
+    return EXCEPTION_EXECUTE_HANDLER;
+}
+
+/* Generate a stack backtrace when a CPU exception occurs:
+   registers cpu_exception_handler() as the process-wide filter for
+   exceptions that nothing else handles. */
+static void set_exception_handler(void)
+{
+    SetUnhandledExceptionFilter(cpu_exception_handler);
+}
+
+/* return the PC at frame level 'level'. Return non zero if not found.
+
+   Level 0 (or below) yields the instruction pointer saved in 'uc'.
+   For higher levels the saved frame pointer is followed; each frame is
+   read as { caller's frame pointer, return address }. */
+static int rt_get_caller_pc(addr_t *paddr, CONTEXT *uc, int level)
+{
+    addr_t fp, pc;
+    int i;
+#ifdef _WIN64
+    pc = uc->Rip;
+    fp = uc->Rbp;
+#else
+    pc = uc->Eip;
+    fp = uc->Ebp;
+#endif
+    if (level > 0) {
+        for(i=1;i<level;i++) {
+            /* XXX: check address validity with program info */
+            if (fp <= 0x1000)
+                return -1;
+#ifndef _WIN64
+            /* this upper limit describes a 32-bit address space only;
+               the 64-bit walkers do not apply it either */
+            if (fp >= 0xc0000000)
+                return -1;
+#endif
+            fp = ((addr_t*)fp)[0];
+        }
+        /* the frame holding the return address is dereferenced too:
+           reject an obviously invalid (near-null) pointer here as well */
+        if (fp <= 0x1000)
+            return -1;
+        pc = ((addr_t*)fp)[1];
+    }
+    *paddr = pc;
+    return 0;
+}
+
+#endif /* _WIN32 */
+#endif /* CONFIG_TCC_BACKTRACE */
+/* ------------------------------------------------------------- */
+#ifdef CONFIG_TCC_STATIC
+
+/* dummy function for profiling */
+/* CONFIG_TCC_STATIC stand-in for dlopen(): both arguments are ignored
+   and NULL is always returned. */
+ST_FUNC void *dlopen(const char *filename, int flag)
+{
+    return NULL;
+}
+
+/* CONFIG_TCC_STATIC stand-in for dlclose(): does nothing. */
+ST_FUNC void dlclose(void *p)
+{
+}
+
+/* CONFIG_TCC_STATIC stand-in for dlerror(): always returns the fixed
+   string "error". */
+ST_FUNC const char *dlerror(void)
+{
+    return "error";
+}
+
+/* One entry of the built-in symbol table: a symbol name and the
+   address it resolves to. */
+typedef struct TCCSyms {
+    char *str;  /* symbol name; NULL in the terminating entry */
+    void *ptr;  /* address of the symbol */
+} TCCSyms;
+
+
+/* add the symbol you want here if no dynamic linking is done */
+static TCCSyms tcc_syms[] = {
+#if !defined(CONFIG_TCCBOOT)
+/* TCCSYM(a) expands to the table entry { "a", &a, }, */
+#define TCCSYM(a) { #a, &a, },
+    TCCSYM(printf)
+    TCCSYM(fprintf)
+    TCCSYM(fopen)
+    TCCSYM(fclose)
+#undef TCCSYM
+#endif
+    /* end-of-table marker */
+    { NULL, NULL },
+};
+
+/* CONFIG_TCC_STATIC stand-in for dlsym(): search tcc_syms[] for an
+   entry named 'symbol' and return its address, or NULL when the name
+   is not listed.  'handle' is ignored. */
+ST_FUNC void *dlsym(void *handle, const char *symbol)
+{
+    TCCSyms *entry;
+
+    for (entry = tcc_syms; entry->str != NULL; ++entry) {
+        if (strcmp(entry->str, symbol) == 0)
+            return entry->ptr;
+    }
+    return NULL;
+}
+
+#endif /* CONFIG_TCC_STATIC */
+#endif /* TCC_IS_NATIVE */
+/* ------------------------------------------------------------- */
--- a/tcctok.h
+++ b/tcctok.h
@ -36,7 +36,9 @@
     DEF(TOK_RESTRICT2, "__restrict")
     DEF(TOK_RESTRICT3, "__restrict__")
     DEF(TOK_EXTENSION, "__extension__") /* gcc keyword */
-     
+
+     DEF(TOK_GENERIC, "_Generic")
+
     DEF(TOK_FLOAT, "float")
     DEF(TOK_DOUBLE, "double")
     DEF(TOK_BOOL, "_Bool")
@ -59,6 +61,10 @@
     DEF(TOK_ASM2, "__asm")
     DEF(TOK_ASM3, "__asm__")

+#ifdef TCC_TARGET_ARM64
+     DEF(TOK_UINT128, "__uint128_t")
+#endif
+
 /*********************************************************************/
 /* the following are not keywords. They are included to ease parsing */
 /* preprocessor only */
@ -81,10 +87,16 @@
     DEF(TOK___TIME__, "__TIME__")
     DEF(TOK___FUNCTION__, "__FUNCTION__")
     DEF(TOK___VA_ARGS__, "__VA_ARGS__")
-     
+     DEF(TOK___COUNTER__, "__COUNTER__")
+
 /* special identifiers */
     DEF(TOK___FUNC__, "__func__")
-     
+
+/* special floating point values */
+     DEF(TOK___NAN__, "__nan__")
+     DEF(TOK___SNAN__, "__snan__")
+     DEF(TOK___INF__, "__inf__")
+
 /* attribute identifiers */
 /* XXX: handle all tokens generically since speed is not critical */
     DEF(TOK_SECTION1, "section")
@ -93,6 +105,10 @@
     DEF(TOK_ALIGNED2, "__aligned__")
     DEF(TOK_PACKED1, "packed")
     DEF(TOK_PACKED2, "__packed__")
+     DEF(TOK_WEAK1, "weak")
+     DEF(TOK_WEAK2, "__weak__")
+     DEF(TOK_ALIAS1, "alias")
+     DEF(TOK_ALIAS2, "__alias__")
     DEF(TOK_UNUSED1, "unused")
     DEF(TOK_UNUSED2, "__unused__")
     DEF(TOK_CDECL1, "cdecl")
@ -104,21 +120,38 @@
     DEF(TOK_FASTCALL1, "fastcall")
     DEF(TOK_FASTCALL2, "__fastcall")
     DEF(TOK_FASTCALL3, "__fastcall__")
+     DEF(TOK_REGPARM1, "regparm")
+     DEF(TOK_REGPARM2, "__regparm__")
+
+     DEF(TOK_MODE, "__mode__")
+     DEF(TOK_MODE_QI, "__QI__")
+     DEF(TOK_MODE_DI, "__DI__")
+     DEF(TOK_MODE_HI, "__HI__")
+     DEF(TOK_MODE_SI, "__SI__")
+     DEF(TOK_MODE_word, "__word__")
+
     DEF(TOK_DLLEXPORT, "dllexport")
     DEF(TOK_DLLIMPORT, "dllimport")
     DEF(TOK_NORETURN1, "noreturn")
     DEF(TOK_NORETURN2, "__noreturn__")
+     DEF(TOK_VISIBILITY1, "visibility")
+     DEF(TOK_VISIBILITY2, "__visibility__")
+
     DEF(TOK_builtin_types_compatible_p, "__builtin_types_compatible_p")
+     DEF(TOK_builtin_choose_expr, "__builtin_choose_expr")
     DEF(TOK_builtin_constant_p, "__builtin_constant_p")
     DEF(TOK_builtin_frame_address, "__builtin_frame_address")
-#ifdef TCC_TARGET_X86_64
-     DEF(TOK_builtin_malloc, "__builtin_malloc")
-     DEF(TOK_builtin_free, "__builtin_free")
-     DEF(TOK_malloc, "malloc")
-     DEF(TOK_free, "free")
+     DEF(TOK_builtin_return_address, "__builtin_return_address")
+     DEF(TOK_builtin_expect, "__builtin_expect")
+     /*DEF(TOK_builtin_va_list, "__builtin_va_list")*/
+#if defined TCC_TARGET_PE && defined TCC_TARGET_X86_64
+     DEF(TOK_builtin_va_start, "__builtin_va_start")
+#elif defined TCC_TARGET_X86_64
+     DEF(TOK_builtin_va_arg_types, "__builtin_va_arg_types")
+#elif defined TCC_TARGET_ARM64
+     DEF(TOK___va_start, "__va_start")
+     DEF(TOK___va_arg, "__va_arg")
 #endif
-     DEF(TOK_REGPARM1, "regparm")
-     DEF(TOK_REGPARM2, "__regparm__")

 /* pragma */
     DEF(TOK_pack, "pack")
@ -127,25 +160,44 @@
     DEF(TOK_ASM_push, "push")
     DEF(TOK_ASM_pop, "pop")
 #endif
+     DEF(TOK_comment, "comment")
+     DEF(TOK_lib, "lib")
+     DEF(TOK_push_macro, "push_macro")
+     DEF(TOK_pop_macro, "pop_macro")
+     DEF(TOK_once, "once")
+     DEF(TOK_option, "option")

 /* builtin functions or variables */
-#ifdef TCC_ARM_EABI
-     DEF(TOK_memcpy, "__aeabi_memcpy")
-     DEF(TOK_memcpy4, "__aeabi_memcpy4")
-     DEF(TOK_memcpy8, "__aeabi_memcpy8")
-     DEF(TOK_memset, "__aeabi_memset")
-     DEF(TOK___aeabi_ldivmod, "__aeabi_ldivmod")
-     DEF(TOK___aeabi_uldivmod, "__aeabi_uldivmod")
-#else
+#ifndef TCC_ARM_EABI
     DEF(TOK_memcpy, "memcpy")
+     DEF(TOK_memmove, "memmove")
     DEF(TOK_memset, "memset")
     DEF(TOK___divdi3, "__divdi3")
     DEF(TOK___moddi3, "__moddi3")
     DEF(TOK___udivdi3, "__udivdi3")
     DEF(TOK___umoddi3, "__umoddi3")
+     DEF(TOK___ashrdi3, "__ashrdi3")
+     DEF(TOK___lshrdi3, "__lshrdi3")
+     DEF(TOK___ashldi3, "__ashldi3")
+     DEF(TOK___floatundisf, "__floatundisf")
+     DEF(TOK___floatundidf, "__floatundidf")
+# ifndef TCC_ARM_VFP
+     DEF(TOK___floatundixf, "__floatundixf")
+     DEF(TOK___fixunsxfdi, "__fixunsxfdi")
+# endif
+     DEF(TOK___fixunssfdi, "__fixunssfdi")
+     DEF(TOK___fixunsdfdi, "__fixunsdfdi")
 #endif
-#if defined(TCC_TARGET_ARM)
-#ifdef TCC_ARM_EABI
+
+#if defined TCC_TARGET_ARM
+# ifdef TCC_ARM_EABI
+     DEF(TOK_memcpy, "__aeabi_memcpy")
+     DEF(TOK_memcpy4, "__aeabi_memcpy4")
+     DEF(TOK_memcpy8, "__aeabi_memcpy8")
+     DEF(TOK_memmove, "__aeabi_memmove")
+     DEF(TOK_memset, "__aeabi_memset")
+     DEF(TOK___aeabi_ldivmod, "__aeabi_ldivmod")
+     DEF(TOK___aeabi_uldivmod, "__aeabi_uldivmod")
     DEF(TOK___aeabi_idivmod, "__aeabi_idivmod")
     DEF(TOK___aeabi_uidivmod, "__aeabi_uidivmod")
     DEF(TOK___divsi3, "__aeabi_idiv")
@ -154,36 +206,6 @@
     DEF(TOK___floatdidf, "__aeabi_l2d")
     DEF(TOK___fixsfdi, "__aeabi_f2lz")
     DEF(TOK___fixdfdi, "__aeabi_d2lz")
-#else
-     DEF(TOK___modsi3, "__modsi3")
-     DEF(TOK___umodsi3, "__umodsi3")
-     DEF(TOK___divsi3, "__divsi3")
-     DEF(TOK___udivsi3, "__udivsi3")
-     DEF(TOK___floatdisf, "__floatdisf")
-     DEF(TOK___floatdidf, "__floatdidf")
-#ifndef TCC_ARM_VFP
-     DEF(TOK___floatdixf, "__floatdixf")
-     DEF(TOK___fixunssfsi, "__fixunssfsi")
-     DEF(TOK___fixunsdfsi, "__fixunsdfsi")
-     DEF(TOK___fixunsxfsi, "__fixunsxfsi")
-     DEF(TOK___fixxfdi, "__fixxfdi")
-#endif
-     DEF(TOK___fixsfdi, "__fixsfdi")
-     DEF(TOK___fixdfdi, "__fixdfdi")
-#endif
-#elif defined(TCC_TARGET_C67)
-     DEF(TOK__divi, "_divi")
-     DEF(TOK__divu, "_divu")
-     DEF(TOK__divf, "_divf")
-     DEF(TOK__divd, "_divd")
-     DEF(TOK__remi, "_remi")
-     DEF(TOK__remu, "_remu")
-#endif
-#ifdef TCC_TARGET_I386
-     DEF(TOK___tcc_int_fpu_control, "__tcc_int_fpu_control")
-     DEF(TOK___tcc_fpu_control, "__tcc_fpu_control")
-#endif
-#ifdef TCC_ARM_EABI
     DEF(TOK___ashrdi3, "__aeabi_lasr")
     DEF(TOK___lshrdi3, "__aeabi_llsr")
     DEF(TOK___ashldi3, "__aeabi_llsl")
@ -191,22 +213,72 @@
     DEF(TOK___floatundidf, "__aeabi_ul2d")
     DEF(TOK___fixunssfdi, "__aeabi_f2ulz")
     DEF(TOK___fixunsdfdi, "__aeabi_d2ulz")
-#else
-     DEF(TOK___ashrdi3, "__ashrdi3")
-     DEF(TOK___lshrdi3, "__lshrdi3")
-     DEF(TOK___ashldi3, "__ashldi3")
-     DEF(TOK___floatundisf, "__floatundisf")
-     DEF(TOK___floatundidf, "__floatundidf")
-#ifndef TCC_ARM_VFP
-     DEF(TOK___floatundixf, "__floatundixf")
-     DEF(TOK___fixunsxfdi, "__fixunsxfdi")
+# else
+     DEF(TOK___modsi3, "__modsi3")
+     DEF(TOK___umodsi3, "__umodsi3")
+     DEF(TOK___divsi3, "__divsi3")
+     DEF(TOK___udivsi3, "__udivsi3")
+     DEF(TOK___floatdisf, "__floatdisf")
+     DEF(TOK___floatdidf, "__floatdidf")
+#  ifndef TCC_ARM_VFP
+     DEF(TOK___floatdixf, "__floatdixf")
+     DEF(TOK___fixunssfsi, "__fixunssfsi")
+     DEF(TOK___fixunsdfsi, "__fixunsdfsi")
+     DEF(TOK___fixunsxfsi, "__fixunsxfsi")
+     DEF(TOK___fixxfdi, "__fixxfdi")
+#  endif
+     DEF(TOK___fixsfdi, "__fixsfdi")
+     DEF(TOK___fixdfdi, "__fixdfdi")
+# endif
 #endif
-     DEF(TOK___fixunssfdi, "__fixunssfdi")
-     DEF(TOK___fixunsdfdi, "__fixunsdfdi")
+
+#if defined TCC_TARGET_C67
+     DEF(TOK__divi, "_divi")
+     DEF(TOK__divu, "_divu")
+     DEF(TOK__divf, "_divf")
+     DEF(TOK__divd, "_divd")
+     DEF(TOK__remi, "_remi")
+     DEF(TOK__remu, "_remu")
 #endif
-#ifdef TCC_TARGET_PE
+
+#if defined TCC_TARGET_I386
+     DEF(TOK___fixsfdi, "__fixsfdi")
+     DEF(TOK___fixdfdi, "__fixdfdi")
+     DEF(TOK___fixxfdi, "__fixxfdi")
+#endif
+
+#if defined TCC_TARGET_I386 || defined TCC_TARGET_X86_64
+     DEF(TOK_alloca, "alloca")
+#endif
+
+#if defined TCC_TARGET_PE
     DEF(TOK___chkstk, "__chkstk")
 #endif
+#ifdef TCC_TARGET_ARM64
+     DEF(TOK___arm64_clear_cache, "__arm64_clear_cache")
+     DEF(TOK___addtf3, "__addtf3")
+     DEF(TOK___subtf3, "__subtf3")
+     DEF(TOK___multf3, "__multf3")
+     DEF(TOK___divtf3, "__divtf3")
+     DEF(TOK___extendsftf2, "__extendsftf2")
+     DEF(TOK___extenddftf2, "__extenddftf2")
+     DEF(TOK___trunctfsf2, "__trunctfsf2")
+     DEF(TOK___trunctfdf2, "__trunctfdf2")
+     DEF(TOK___fixtfsi, "__fixtfsi")
+     DEF(TOK___fixtfdi, "__fixtfdi")
+     DEF(TOK___fixunstfsi, "__fixunstfsi")
+     DEF(TOK___fixunstfdi, "__fixunstfdi")
+     DEF(TOK___floatsitf, "__floatsitf")
+     DEF(TOK___floatditf, "__floatditf")
+     DEF(TOK___floatunsitf, "__floatunsitf")
+     DEF(TOK___floatunditf, "__floatunditf")
+     DEF(TOK___eqtf2, "__eqtf2")
+     DEF(TOK___netf2, "__netf2")
+     DEF(TOK___lttf2, "__lttf2")
+     DEF(TOK___letf2, "__letf2")
+     DEF(TOK___gttf2, "__gttf2")
+     DEF(TOK___getf2, "__getf2")
+#endif

 /* bound checking symbols */
 #ifdef CONFIG_TCC_BCHECK
@ -217,50 +289,62 @@
     DEF(TOK___bound_ptr_indir8, "__bound_ptr_indir8")
     DEF(TOK___bound_ptr_indir12, "__bound_ptr_indir12")
     DEF(TOK___bound_ptr_indir16, "__bound_ptr_indir16")
+     DEF(TOK___bound_main_arg, "__bound_main_arg")
     DEF(TOK___bound_local_new, "__bound_local_new")
     DEF(TOK___bound_local_delete, "__bound_local_delete")
-#if 0
+# ifdef TCC_TARGET_PE
     DEF(TOK_malloc, "malloc")
     DEF(TOK_free, "free")
     DEF(TOK_realloc, "realloc")
     DEF(TOK_memalign, "memalign")
     DEF(TOK_calloc, "calloc")
-#endif
-     DEF(TOK_memmove, "memmove")
+# endif
     DEF(TOK_strlen, "strlen")
     DEF(TOK_strcpy, "strcpy")
-     DEF(TOK_alloca, "alloca")
 #endif

 /* Tiny Assembler */
-
- DEF_ASM(byte)
- DEF_ASM(word)
- DEF_ASM(align)
- DEF_ASM(skip)
- DEF_ASM(space)
- DEF_ASM(string)
- DEF_ASM(asciz)
- DEF_ASM(ascii)
- DEF_ASM(globl)
- DEF_ASM(global)
- DEF_ASM(text)
- DEF_ASM(data)
- DEF_ASM(bss)
- DEF_ASM(previous)
- DEF_ASM(fill)
- DEF_ASM(org)
- DEF_ASM(quad)
+ DEF_ASMDIR(byte)              /* must be first directive */
+ DEF_ASMDIR(word)
+ DEF_ASMDIR(align)
+ DEF_ASMDIR(balign)
+ DEF_ASMDIR(p2align)
+ DEF_ASMDIR(set)
+ DEF_ASMDIR(skip)
+ DEF_ASMDIR(space)
+ DEF_ASMDIR(string)
+ DEF_ASMDIR(asciz)
+ DEF_ASMDIR(ascii)
+ DEF_ASMDIR(file)
+ DEF_ASMDIR(globl)
+ DEF_ASMDIR(global)
+ DEF_ASMDIR(weak)
+ DEF_ASMDIR(hidden)
+ DEF_ASMDIR(ident)
+ DEF_ASMDIR(size)
+ DEF_ASMDIR(type)
+ DEF_ASMDIR(text)
+ DEF_ASMDIR(data)
+ DEF_ASMDIR(bss)
+ DEF_ASMDIR(previous)
+ DEF_ASMDIR(pushsection)
+ DEF_ASMDIR(popsection)
+ DEF_ASMDIR(fill)
+ DEF_ASMDIR(rept)
+ DEF_ASMDIR(endr)
+ DEF_ASMDIR(org)
+ DEF_ASMDIR(quad)
 #if defined(TCC_TARGET_I386)
- DEF_ASM(code16)
- DEF_ASM(code32)
+ DEF_ASMDIR(code16)
+ DEF_ASMDIR(code32)
 #elif defined(TCC_TARGET_X86_64)
- DEF_ASM(code64)
+ DEF_ASMDIR(code64)
 #endif
+ DEF_ASMDIR(short)
+ DEF_ASMDIR(long)
+ DEF_ASMDIR(int)
+ DEF_ASMDIR(section)            /* must be last directive */

-#ifdef TCC_TARGET_I386
+#if defined TCC_TARGET_I386 || defined TCC_TARGET_X86_64
 #include "i386-tok.h"
-#elif TCC_TARGET_X86_64
-#include "x86_64-tok.h"
 #endif
-
--- a/tcctools.c
+++ b/tcctools.c
@ -0,0 +1,529 @@
+/* -------------------------------------------------------------- */
+/*
+ *  TCC - Tiny C Compiler
+ *
+ *  tcctools.c - extra tools and -m32/64 support
+ *
+ */
+
+/* -------------------------------------------------------------- */
+/*
+ * This program is for making libtcc1.a without ar
+ * tiny_libmaker - tiny elf lib maker
+ * usage: tiny_libmaker [lib] files...
+ * Copyright (c) 2007 Timppa
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include "tcc.h"
+
+//#define ARMAG  "!<arch>\n"
+#define ARFMAG "`\n"
+
+/* Header that precedes every member of an 'ar' archive.  All fields
+   are fixed-width text and are not NUL-terminated. */
+typedef struct {
+    char ar_name[16];   /* member name; tcc_tool_ar stores "name/" padded with spaces */
+    char ar_date[12];   /* left blank (spaces) by tcc_tool_ar */
+    char ar_uid[6];     /* written as "0" by tcc_tool_ar */
+    char ar_gid[6];     /* written as "0" by tcc_tool_ar */
+    char ar_mode[8];    /* "0" for the symbol index, "100666" for objects */
+    char ar_size[10];   /* member size in decimal, left-justified ("%-10d") */
+    char ar_fmag[2];    /* always ARFMAG */
+} ArHdr;
+
+/* Reverse the byte order of the low 32 bits of 'ul'; any bits above
+   bit 31 are dropped. */
+static unsigned long le2belong(unsigned long ul) {
+    unsigned long byte0 = ul & 0xFF;           /* least significant */
+    unsigned long byte1 = (ul >> 8) & 0xFF;
+    unsigned long byte2 = (ul >> 16) & 0xFF;
+    unsigned long byte3 = (ul >> 24) & 0xFF;   /* most significant */
+
+    return (byte0 << 24) | (byte1 << 16) | (byte2 << 8) | byte3;
+}
+
+/* Returns 1 if s contains any of the chars of list, else 0.
+   An empty 's' or an empty 'list' yields 0. */
+static int contains_any(const char *s, const char *list) {
+  /* strpbrk() finds the first char of s that also occurs in list */
+  return strpbrk(s, list) != NULL;
+}
+
+/* Print the 'tcc -ar' usage text to stderr and hand 'ret' back, so
+   that callers can write 'return ar_usage(code);'. */
+static int ar_usage(int ret) {
+    fputs("usage: tcc -ar [rcsv] lib file...\n"
+          "create library ([abdioptxN] not supported).\n",
+          stderr);
+    return ret;
+}
+
+/* 'tcc -ar [rcsv] lib file...': create the archive 'lib' from the ELF
+   object files that follow it on the command line.
+
+   The object members are first collected in the temporary file
+   "<lib>.tmp" while the names of their global symbols are gathered;
+   then the archive magic, the symbol index and finally the collected
+   members are written to 'lib'.  The temporary file is removed before
+   returning.
+
+   argv[1] may be an option word without a leading '-' (as with ar);
+   any of the letters in 'ops_conflict' makes the call fail.
+
+   Returns 0 on success, 1 (after printing the usage text) for invalid
+   arguments, 2 for any other failure.
+
+   NOTE: section and symbol offsets found in the objects are trusted;
+   only the file size and the ELF class are validated. */
+ST_FUNC int tcc_tool_ar(TCCState *s1, int argc, char **argv)
+{
+    static ArHdr arhdr = {
+        "/               ",
+        "            ",
+        "0     ",
+        "0     ",
+        "0       ",
+        "          ",
+        ARFMAG
+        };
+
+    static ArHdr arhdro = {
+        "                ",
+        "            ",
+        "0     ",
+        "0     ",
+        "0       ",
+        "          ",
+        ARFMAG
+        };
+
+    FILE *fi, *fh = NULL, *fo = NULL;
+    ElfW(Ehdr) *ehdr;
+    ElfW(Shdr) *shdr;
+    ElfW(Sym) *sym;
+    int i, fsize, i_lib, i_obj;
+    char *buf, *shstr, *symtab = NULL, *strtab = NULL;
+    int symtabsize = 0;
+    char *anames = NULL;
+    int *afpos = NULL;
+    int istrlen, strpos = 0, fpos = 0, funccnt = 0, funcmax, hofs;
+    char tfile[260], stmp[20];
+    char *file, *name;
+    int ret = 2;
+    const char *ops_conflict = "habdioptxN";  // unsupported but destructive if ignored.
+    int verbose = 0;
+
+    i_lib = 0; i_obj = 0;  // will hold the index of the lib and first obj
+    for (i = 1; i < argc; i++) {
+        const char *a = argv[i];
+        if (*a == '-' && strstr(a, "."))
+            ret = 1; // -x.y is always invalid (same as gnu ar)
+        if ((*a == '-') || (i == 1 && !strstr(a, "."))) {  // options argument
+            if (contains_any(a, ops_conflict))
+                ret = 1;
+            if (strstr(a, "v"))
+                verbose = 1;
+        } else {  // lib or obj files: don't abort - keep validating all args.
+            if (!i_lib)  // first file is the lib
+                i_lib = i;
+            else if (!i_obj)  // second file is the first obj
+                i_obj = i;
+        }
+    }
+
+    if (!i_obj)  // i_obj implies also i_lib. we require both.
+        ret = 1;
+
+    if (ret == 1)
+        return ar_usage(ret);
+
+    // reject a name whose "<lib>.tmp" form would overflow tfile, and do
+    // so before the library file gets truncated by fopen() below
+    if (strlen(argv[i_lib]) + sizeof ".tmp" > sizeof tfile)
+    {
+        fprintf(stderr, "tcc: ar: file name too long: %s\n", argv[i_lib]);
+        goto the_end;
+    }
+    sprintf(tfile, "%s.tmp", argv[i_lib]);
+
+    if ((fh = fopen(argv[i_lib], "wb")) == NULL)
+    {
+        fprintf(stderr, "tcc: ar: can't open file %s \n", argv[i_lib]);
+        goto the_end;
+    }
+
+    if ((fo = fopen(tfile, "wb+")) == NULL)
+    {
+        fprintf(stderr, "tcc: ar: can't create temporary file %s\n", tfile);
+        goto the_end;
+    }
+
+    funcmax = 250;
+    afpos = tcc_realloc(NULL, funcmax * sizeof *afpos); // 250 func
+    memcpy(&arhdro.ar_mode, "100666", 6);
+
+    // i_obj = first input object file
+    while (i_obj < argc)
+    {
+        if (*argv[i_obj] == '-') {  // by now, all options start with '-'
+            i_obj++;
+            continue;
+        }
+        if ((fi = fopen(argv[i_obj], "rb")) == NULL) {
+            fprintf(stderr, "tcc: ar: can't open file %s \n", argv[i_obj]);
+            goto the_end;
+        }
+        if (verbose)
+            printf("a - %s\n", argv[i_obj]);
+
+        fseek(fi, 0, SEEK_END);
+        fsize = ftell(fi);
+        fseek(fi, 0, SEEK_SET);
+        // read the whole object; it must at least hold an ELF header
+        // (this also catches a failed ftell(), which returns -1)
+        buf = NULL;
+        if (fsize >= (int)sizeof(ElfW(Ehdr))) {
+            buf = tcc_malloc(fsize + 1);
+            if (fread(buf, fsize, 1, fi) != 1) {
+                tcc_free(buf);
+                buf = NULL;
+            }
+        }
+        fclose(fi);
+        if (!buf) {
+            fprintf(stderr, "tcc: ar: can't read object file %s\n", argv[i_obj]);
+            goto the_end;
+        }
+
+        // elf header
+        ehdr = (ElfW(Ehdr) *)buf;
+        if (ehdr->e_ident[4] != ELFCLASSW)
+        {
+            fprintf(stderr, "tcc: ar: Unsupported Elf Class: %s\n", argv[i_obj]);
+            tcc_free(buf);
+            goto the_end;
+        }
+
+        // the tables found below point into 'buf': forget those of the
+        // previous object, whose buffer has already been freed
+        symtab = strtab = NULL;
+        symtabsize = 0;
+
+        shdr = (ElfW(Shdr) *) (buf + ehdr->e_shoff + ehdr->e_shstrndx * ehdr->e_shentsize);
+        shstr = (char *)(buf + shdr->sh_offset);
+        for (i = 0; i < ehdr->e_shnum; i++)
+        {
+            shdr = (ElfW(Shdr) *) (buf + ehdr->e_shoff + i * ehdr->e_shentsize);
+            if (!shdr->sh_offset)
+                continue;
+            if (shdr->sh_type == SHT_SYMTAB)
+            {
+                symtab = (char *)(buf + shdr->sh_offset);
+                symtabsize = shdr->sh_size;
+            }
+            if (shdr->sh_type == SHT_STRTAB)
+            {
+                if (!strcmp(shstr + shdr->sh_name, ".strtab"))
+                {
+                    strtab = (char *)(buf + shdr->sh_offset);
+                }
+            }
+        }
+
+        // symbol names live in .strtab: without it nothing can be indexed
+        if (symtab && symtabsize && strtab)
+        {
+            int nsym = symtabsize / sizeof(ElfW(Sym));
+            //printf("symtab: info size shndx name\n");
+            for (i = 1; i < nsym; i++)
+            {
+                sym = (ElfW(Sym) *) (symtab + i * sizeof(ElfW(Sym)));
+                // only defined symbols whose st_info is 0x10, 0x11 or 0x12
+                if (sym->st_shndx &&
+                    (sym->st_info == 0x10
+                    || sym->st_info == 0x11
+                    || sym->st_info == 0x12
+                    )) {
+                    //printf("symtab: %2Xh %4Xh %2Xh %s\n", sym->st_info, sym->st_size, sym->st_shndx, strtab + sym->st_name);
+                    istrlen = strlen(strtab + sym->st_name)+1;
+                    anames = tcc_realloc(anames, strpos+istrlen);
+                    strcpy(anames + strpos, strtab + sym->st_name);
+                    strpos += istrlen;
+                    if (++funccnt >= funcmax) {
+                        funcmax += 250;
+                        afpos = tcc_realloc(afpos, funcmax * sizeof *afpos); // 250 func more
+                    }
+                    // slot 0 is reserved for the symbol count
+                    afpos[funccnt] = fpos;
+                }
+            }
+        }
+
+        // member name: basename of the object, cut to fit, followed by '/'
+        file = argv[i_obj];
+        for (name = strchr(file, 0);
+             name > file && name[-1] != '/' && name[-1] != '\\';
+             --name);
+        istrlen = strlen(name);
+        if (istrlen >= sizeof(arhdro.ar_name))
+            istrlen = sizeof(arhdro.ar_name) - 1;
+        memset(arhdro.ar_name, ' ', sizeof(arhdro.ar_name));
+        memcpy(arhdro.ar_name, name, istrlen);
+        arhdro.ar_name[istrlen] = '/';
+        sprintf(stmp, "%-10d", fsize);
+        memcpy(&arhdro.ar_size, stmp, 10);
+        fwrite(&arhdro, sizeof(arhdro), 1, fo);
+        fwrite(buf, fsize, 1, fo);
+        tcc_free(buf);
+        i_obj++;
+        fpos += (fsize + sizeof(arhdro));
+    }
+    // offset of the first object member within the final archive
+    hofs = 8 + sizeof(arhdr) + strpos + (funccnt+1) * sizeof(int);
+    fpos = 0;
+    if ((hofs & 1)) // align
+        hofs++, fpos = 1;
+    // write header
+    fwrite("!<arch>\n", 8, 1, fh);
+    sprintf(stmp, "%-10d", (int)(strpos + (funccnt+1) * sizeof(int)));
+    memcpy(&arhdr.ar_size, stmp, 10);
+    fwrite(&arhdr, sizeof(arhdr), 1, fh);
+    // the index stores its numbers byte-swapped (see le2belong)
+    afpos[0] = le2belong(funccnt);
+    for (i=1; i<=funccnt; i++)
+        afpos[i] = le2belong(afpos[i] + hofs);
+    fwrite(afpos, (funccnt+1) * sizeof(int), 1, fh);
+    fwrite(anames, strpos, 1, fh);
+    if (fpos)
+        fwrite("", 1, 1, fh);
+    // write objects
+    fseek(fo, 0, SEEK_END);
+    fsize = ftell(fo);
+    fseek(fo, 0, SEEK_SET);
+    buf = tcc_malloc(fsize + 1);
+    fread(buf, fsize, 1, fo);
+    fwrite(buf, fsize, 1, fh);
+    tcc_free(buf);
+    ret = 0;
+the_end:
+    if (anames)
+        tcc_free(anames);
+    if (afpos)
+        tcc_free(afpos);
+    if (fh)
+        fclose(fh);
+    if (fo)
+        fclose(fo), remove(tfile);
+    return ret;
+}
+
+/* -------------------------------------------------------------- */
+/*
+ * tiny_impdef creates an export definition file (.def) from a dll
+ * on MS-Windows. Usage: tiny_impdef library.dll [-o outputfile]
+ *
+ *  Copyright (c) 2005,2007 grischka
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#ifdef TCC_TARGET_PE
+
+/* Copy the file name 'src' into the buffer dst[size].  A name that
+   does not fit is reported on stderr and -1 is returned; 0 on success. */
+static int impdef_set_name(char *dst, int size, const char *src)
+{
+    if (strlen(src) >= (size_t)size) {
+        fprintf(stderr, "tcc: impdef: file name too long: %s\n", src);
+        return -1;
+    }
+    strcpy(dst, src);
+    return 0;
+}
+
+/* 'tcc -impdef library.dll [-v] [-o outputfile]': write the names
+   exported by a dll into an export definition (.def) file.
+
+   Without -o the output name is the base name of the input with its
+   extension replaced by ".def".  -v prints the files involved and the
+   number of symbols.
+
+   Returns 0 on success and 1 on any error (bad usage, over-long file
+   name, unreadable dll, output file not creatable). */
+ST_FUNC int tcc_tool_impdef(TCCState *s1, int argc, char **argv)
+{
+    int ret, v, i;
+    char infile[260];
+    char outfile[260];
+
+    const char *file;
+    char *p, *q;
+    FILE *op;
+
+#ifdef _WIN32
+    char path[260];
+#endif
+
+    infile[0] = outfile[0] = 0;
+    op = NULL;
+    ret = 1;
+    p = NULL;
+    v = 0;
+
+    for (i = 1; i < argc; ++i) {
+        const char *a = argv[i];
+        if ('-' == a[0]) {
+            if (0 == strcmp(a, "-v")) {
+                v = 1;
+            } else if (0 == strcmp(a, "-o")) {
+                if (++i == argc)
+                    goto usage;
+                if (impdef_set_name(outfile, sizeof outfile, argv[i]))
+                    goto the_end;
+            } else
+                goto usage;
+        } else if (0 == infile[0]) {
+            if (impdef_set_name(infile, sizeof infile, a))
+                goto the_end;
+        } else
+            goto usage;
+    }
+
+    if (0 == infile[0]) {
+usage:
+        fprintf(stderr,
+            "usage: tcc -impdef library.dll [-v] [-o outputfile]\n"
+            "create export definition file (.def) from dll\n"
+            );
+        goto the_end;
+    }
+
+    if (0 == outfile[0]) {
+        /* base name up to (not including) its last '.', plus ".def" */
+        const char *base = tcc_basename(infile);
+        const char *ext = strrchr(base, '.');
+        size_t len = ext ? (size_t)(ext - base) : strlen(base);
+        if (len + sizeof ".def" > sizeof outfile) {
+            fprintf(stderr, "tcc: impdef: file name too long: %s\n", base);
+            goto the_end;
+        }
+        memcpy(outfile, base, len);
+        strcpy(outfile + len, ".def");
+    }
+
+    file = infile;
+#ifdef _WIN32
+    /* use the full path when the dll is found on the search path */
+    if (SearchPath(NULL, file, ".dll", sizeof path, path, NULL))
+        file = path;
+#endif
+    ret = tcc_get_dllexports(file, &p);
+    if (ret || !p) {
+        fprintf(stderr, "tcc: impdef: %s '%s'\n",
+            ret == -1 ? "can't find file" :
+            ret ==  1 ? "can't read symbols" :
+            ret ==  0 ? "no symbols found in" :
+            "unknown file type", file);
+        ret = 1;
+        goto the_end;
+    }
+
+    if (v)
+        printf("-> %s\n", file);
+
+    op = fopen(outfile, "wb");
+    if (NULL == op) {
+        fprintf(stderr, "tcc: impdef: could not create output file: %s\n", outfile);
+        goto the_end;
+    }
+
+    fprintf(op, "LIBRARY %s\n\nEXPORTS\n", tcc_basename(file));
+    /* 'p' is a sequence of NUL-terminated names ended by an empty one */
+    for (q = p, i = 0; *q; ++i) {
+        fprintf(op, "%s\n", q);
+        q += strlen(q) + 1;
+    }
+
+    if (v)
+        printf("<- %s (%d symbol%s)\n", outfile, i, &"s"[i<2]);
+
+    ret = 0;
+
+the_end:
+    /* cannot free memory received from tcc_get_dllexports
+       if it came from a dll */
+    /* if (p)
+        tcc_free(p); */
+    if (op)
+        fclose(op);
+    return ret;
+}
+
+#endif /* TCC_TARGET_PE */
+
+/* -------------------------------------------------------------- */
+/*
+ *  TCC - Tiny C Compiler
+ *
+ *  Copyright (c) 2001-2004 Fabrice Bellard
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+/* re-execute the i386/x86_64 cross-compilers with tcc -m32/-m64: */
+
+#if !defined TCC_TARGET_I386 && !defined TCC_TARGET_X86_64
+
+/* Variant compiled when the target is neither i386 nor x86_64:
+   -m32/-m64 cannot be honoured, so the request is reported through
+   tcc_error().  'option' is the number given after -m. */
+ST_FUNC void tcc_tool_cross(TCCState *s, char **argv, int option)
+{
+    tcc_error("-m%d not implemented.", option);
+}
+
+#else
+#ifdef _WIN32
+#include <process.h>
+
+/* Return a tcc_malloc'ed copy of 'str' in which every occurrence of
+   'p' is replaced by 'r'.
+
+   The same scanning loop runs twice: on the first pass no buffer
+   exists yet (d == NULL) and only the length of the result is
+   accumulated in 'sl'; the buffer is then allocated and the second
+   pass performs the copy.
+
+   NOTE(review): an empty 'p' looks like it would never advance the
+   scan (s += pl with pl == 0) - confirm that callers never pass "". */
+static char *str_replace(const char *str, const char *p, const char *r)
+{
+    const char *s, *s0;
+    char *d, *d0;
+    int sl, pl, rl;
+
+    sl = strlen(str);
+    pl = strlen(p);
+    rl = strlen(r);
+    /* first iteration: d0 == NULL (measure); second: d0 = buffer (copy) */
+    for (d0 = NULL;; d0 = tcc_malloc(sl + 1)) {
+        /* s0 = start of the text not yet consumed, s = next match of p */
+        for (d = d0, s = str; s0 = s, s = strstr(s, p), s; s += pl) {
+            if (d) {
+                /* copy the text before the match, then the replacement
+                   ('sl' is reused as scratch here) */
+                memcpy(d, s0, sl = s - s0), d += sl;
+                memcpy(d, r, rl), d += rl;
+            } else
+                /* measuring pass: each match changes the length by rl - pl */
+                sl += rl - pl;
+        }
+        if (d) {
+            /* append the tail after the last match, including the NUL */
+            strcpy(d, s0);
+            return d0;
+        }
+    }
+}
+
+/* execvp() replacement for Windows: escape the double quotes in each
+   argument, spawn 'prog', wait for it and exit with the status that
+   was collected.  Only returns (with -1) when spawning failed. */
+static int execvp_win32(const char *prog, char **argv)
+{
+    char **arg;
+    int status;
+
+    /* replace all " by \" */
+    for (arg = argv; *arg != NULL; arg++) {
+        if (strchr(*arg, '"') != NULL)
+            *arg = str_replace(*arg, "\"", "\\\"");
+    }
+
+    status = _spawnvp(P_NOWAIT, prog, (const char *const*)argv);
+    if (status == -1)
+        return status;
+
+    /* 'status' holds the process handle going in, the exit status
+       coming out */
+    _cwait(&status, status, WAIT_CHILD);
+    exit(status);
+}
+#define execvp execvp_win32
+#endif /* _WIN32 */
+
+/* Variant compiled for i386/x86_64 targets.  No cross-compiler is
+   executed here: the request is only reported through tcc_error().
+   'target' is the number given after -m; 's' and 'argv' are unused. */
+ST_FUNC void tcc_tool_cross(TCCState *s, char **argv, int target)
+{
+    tcc_error("-m%d not implemented.", target);
+}
+
+#endif /* TCC_TARGET_I386 && TCC_TARGET_X86_64 */
+/* -------------------------------------------------------------- */
+/* enable commandline wildcard expansion (tcc -o x.exe *.c) */
+
+#ifdef _WIN32
+int _CRT_glob = 1;
+#ifndef _CRT_glob
+int _dowildcard = 1;
+#endif
+#endif
+
+/* -------------------------------------------------------------- */
+/* generate xxx.d file */
+
+/* Write a makefile rule "target: deps..." listing every entry of
+   s->target_deps.  With filename == NULL the rule is written to the
+   target's path with its extension replaced by ".d". */
+ST_FUNC void gen_makedeps(TCCState *s, const char *target, const char *filename)
+{
+    char autoname[1024];
+    FILE *out;
+    int n;
+
+    if (filename == NULL) {
+        /* compute filename automatically: dir/file.o -> dir/file.d */
+        int stem_len = (int)(tcc_fileextension(target) - target);
+        snprintf(autoname, sizeof autoname, "%.*s.d", stem_len, target);
+        filename = autoname;
+    }
+
+    if (s->verbose)
+        printf("<- %s\n", filename);
+
+    /* XXX return err codes instead of error() ? */
+    out = fopen(filename, "w");
+    if (out == NULL)
+        tcc_error("could not open '%s'", filename);
+
+    /* one dependency per line, each line continued with a backslash */
+    fprintf(out, "%s: \\\n", target);
+    for (n = 0; n < s->nb_target_deps; n++)
+        fprintf(out, " %s \\\n", s->target_deps[n]);
+    fputc('\n', out);
+    fclose(out);
+}
+
+/* -------------------------------------------------------------- */
--- a/tests/42test.h
+++ b/tests/42test.h
@ -0,0 +1,13 @@
+/* This file is to test computed #include directives.  It's named so
+   that it starts with a pre-processing number which isn't a valid
+   number (42test.h).  Including this must work.  */
+#ifndef INC42_FIRST
+int have_included_42test_h;
+#define INC42_FIRST
+#elif !defined INC42_SECOND
+#define INC42_SECOND
+int have_included_42test_h_second;
+#else
+#define INC42_THIRD
+int have_included_42test_h_third;
+#endif
--- a/tests/Makefile
+++ b/tests/Makefile
@ -2,83 +2,151 @@
 # Tiny C Compiler Makefile - tests
 #

-# what tests to run
-TESTS = libtest test3
-
-# these should work too
-# TESTS += test1 test2 speedtest btest
-
-# these don't work as they should
-# TESTS += test4 asmtest
-
 TOP = ..
 include $(TOP)/Makefile
+VPATH = $(TOPSRC)/tests $(TOPSRC) $(TOP)
+CFLAGS := $(filter-out -W% -g% -O%,$(CFLAGS)) -I$(TOPSRC) $(LDFLAGS)

-# run local version of tcc with local libraries and includes
-TCC = ../tcc -B..
-RUN_TCC = $(NATIVE_TARGET) -run ../tcc.c -B..
-DISAS=objdump -d
+# what tests to run
+TESTS = \
+ hello-exe \
+ hello-run \
+ libtest \
+ test3 \
+ memtest \
+ dlltest \
+ abitest \
+ asm-c-connect-test \
+ vla_test-run \
+ cross-test \
+ tests2-dir \
+ pp-dir

-all test : $(TESTS)
+BTESTS = test1b test3b btest

-# make sure that tcc exists
-$(TESTS) : ../tcc
+# test4 -- problem with -static
+# asmtest / asmtest2 -- minor differences with gcc
+# btest -- works on i386 (including win32)

-../tcc ../libtcc.a :
-	$(MAKE) -C ..
+# bounds-checking is supported only on i386
+ifneq ($(ARCH),i386)
+ TESTS := $(filter-out $(BTESTS),$(TESTS))
+endif
+ifdef CONFIG_WIN32
+ TESTS := $(filter-out $(BTESTS),$(TESTS))
+endif
+ifdef CONFIG_OSX # -run only
+ TESTS := hello-run libtest tests2-dir pp-dir
+endif
+ifeq (,$(filter arm64 i386 x86_64,$(ARCH)))
+ TESTS := $(filter-out vla_test-run,$(TESTS))
+endif
+ifeq ($(CONFIG_arm_eabi),yes)
+ TESTS := $(filter-out test3,$(TESTS))
+endif
+ifeq (,$(filter i386 x86_64,$(ARCH)))
+ TESTS := $(filter-out dlltest asm-c-connect-test,$(TESTS))
+endif
+ifndef CONFIG_cross
+ TESTS := $(filter-out cross-%,$(TESTS))
+endif
+
+ifeq ($(OS),Windows_NT) # for libtcc_test to find libtcc.dll
+ PATH := $(CURDIR)/$(TOP)$(if $(findstring :\,$(PATH)),;,:)$(PATH)
+endif
+
+RUN_TCC = $(NATIVE_DEFINES) -run $(TOPSRC)/tcc.c $(TCCFLAGS)
+DISAS = objdump -d
+DUMPTCC = (set -x; $(TOP)/tcc -vv; ldd $(TOP)/tcc; exit 1)
+
+all test : clean-s $(TESTS)
+
+hello-exe: ../examples/ex1.c
+	@echo ------------ $@ ------------
+	$(TCC) $< -o hello$(EXESUF) && ./hello$(EXESUF) || $(DUMPTCC)
+
+hello-run: ../examples/ex1.c
+	@echo ------------ $@ ------------
+	$(TCC) -run $< || $(DUMPTCC)

-# libtcc test
 libtest: libtcc_test$(EXESUF)
 	@echo ------------ $@ ------------
-	./libtcc_test lib_path=..
+	./libtcc_test$(EXESUF) $(TCCFLAGS)

-libtcc_test$(EXESUF): libtcc_test.c ../libtcc.a
-	$(CC) -o $@ $^ -I.. $(CFLAGS) $(LIBS)
+libtcc_test$(EXESUF): libtcc_test.c $(LIBTCC)
+	$(CC) -o $@ $^ $(CFLAGS) $(LIBS)

-# test.ref - generate using gcc
+%-dir:
+	@echo ------------ $@ ------------
+	$(MAKE) -k -C $*
+
+# test.ref - generate using cc
 test.ref: tcctest.c
-	cp -u ../include/tcclib.h .
-	$(CC) -o tcctest.gcc $< -I. -w $(CFLAGS)
+	$(CC) -o tcctest.gcc $< $(NATIVE_DEFINES) $(CFLAGS) -w -O0 -std=gnu99 -fno-omit-frame-pointer
 	./tcctest.gcc > $@

 # auto test
-test1: test.ref
+test1 test1b: tcctest.c test.ref
 	@echo ------------ $@ ------------
-	$(TCC) -run tcctest.c > test.out1
-	@if diff -u test.ref test.out1 ; then echo "Auto Test OK"; fi
+	$(TCC) -run $< > test.out1
+	@diff -u test.ref test.out1 && echo "Auto Test OK"

 # iterated test2 (compile tcc then compile tcctest.c !)
-test2: test.ref
+test2 test2b: tcctest.c test.ref
 	@echo ------------ $@ ------------
-	$(TCC) $(RUN_TCC) $(RUN_TCC) -run tcctest.c > test.out2
-	@if diff -u test.ref test.out2 ; then echo "Auto Test2 OK"; fi
+	$(TCC) $(RUN_TCC) $(RUN_TCC) -run $< > test.out2
+	@diff -u test.ref test.out2 && echo "Auto Test2 OK"

 # iterated test3 (compile tcc then compile tcc then compile tcctest.c !)
-test3: test.ref
+test3 test3b: tcctest.c test.ref
 	@echo ------------ $@ ------------
-	$(TCC) $(RUN_TCC) $(RUN_TCC) $(RUN_TCC) -run tcctest.c > test.out3
-	@if diff -u test.ref test.out3 ; then echo "Auto Test3 OK"; fi
+	$(TCC) $(RUN_TCC) $(RUN_TCC) $(RUN_TCC) -run $< > test.out3
+	@diff -u test.ref test.out3 && echo "Auto Test3 OK"
+
+test%b : TCCFLAGS += -b

 # binary output test
-test4: test.ref
+test4: tcctest.c test.ref
 	@echo ------------ $@ ------------
-# dynamic output
-	$(TCC) -o tcctest1 tcctest.c
-	./tcctest1 > test1.out
-	@if diff -u test.ref test1.out ; then echo "Dynamic Auto Test OK"; fi
 # object + link output
-	$(TCC) -c -o tcctest3.o tcctest.c
+	$(TCC) -c -o tcctest3.o $<
 	$(TCC) -o tcctest3 tcctest3.o
 	./tcctest3 > test3.out
 	@if diff -u test.ref test3.out ; then echo "Object Auto Test OK"; fi
-# static output
-	$(TCC) -static -o tcctest2 tcctest.c
-	./tcctest2 > test2.out
-	@if diff -u test.ref test2.out ; then echo "Static Auto Test OK"; fi
+# dynamic output
+	$(TCC) -o tcctest1 $<
+	./tcctest1 > test1.out
+	@if diff -u test.ref test1.out ; then echo "Dynamic Auto Test OK"; fi
 # dynamic output + bound check
-	$(TCC) -b -o tcctest4 tcctest.c
+	$(TCC) -b -o tcctest4 $<
 	./tcctest4 > test4.out
 	@if diff -u test.ref test4.out ; then echo "BCheck Auto Test OK"; fi
+# static output
+	$(TCC) -static -o tcctest2 $<
+	./tcctest2 > test2.out
+	@if diff -u test.ref test2.out ; then echo "Static Auto Test OK"; fi
+
+# use tcc to create libtcc.so/.dll and the tcc(.exe) frontend and run them
+dlltest:
+	@echo ------------ $@ ------------
+	$(TCC) $(NATIVE_DEFINES) -DLIBTCC_AS_DLL $(TOPSRC)/libtcc.c $(LIBS) -shared -o libtcc2$(DLLSUF)
+	$(TCC) $(NATIVE_DEFINES) -DONE_SOURCE=0 $(TOPSRC)/tcc.c libtcc2$(DLLSUF) $(LIBS) -Wl,-rpath=. -o tcc2$(EXESUF)
+	./tcc2$(EXESUF) $(TCCFLAGS) $(RUN_TCC) -run $(TOPSRC)/examples/ex1.c
+ifndef CONFIG_WIN32
+	@echo ------------ $@ with PIC ------------
+	$(CC) $(CFLAGS) -fPIC $(NATIVE_DEFINES) -DLIBTCC_AS_DLL -c $(TOPSRC)/libtcc.c
+	$(TCC) libtcc.o $(LIBS) -shared -o libtcc2$(DLLSUF)
+	$(TCC) $(NATIVE_DEFINES) -DONE_SOURCE=0 $(TOPSRC)/tcc.c libtcc2$(DLLSUF) $(LIBS) -Wl,-rpath=. -o tcc2$(EXESUF)
+	./tcc2$(EXESUF) $(TCCFLAGS) $(RUN_TCC) -run $(TOPSRC)/examples/ex1.c
+endif
+	@rm tcc2$(EXESUF) libtcc2$(DLLSUF)
+
+memtest:
+	@echo ------------ $@ ------------
+	$(CC) $(CFLAGS) $(NATIVE_DEFINES) -DMEM_DEBUG=2 $(TOPSRC)/tcc.c $(LIBS) -o memtest-tcc$(EXESUF)
+	./memtest-tcc$(EXESUF) $(TCCFLAGS) $(NATIVE_DEFINES) $(TOPSRC)/tcc.c $(LIBS)
+	./memtest-tcc$(EXESUF) $(TCCFLAGS) $(NATIVE_DEFINES) -run $(TOPSRC)/tcc.c $(TCCFLAGS) $(TOPSRC)/tests/tcctest.c
+

 # memory and bound check auto test
 BOUNDS_OK  = 1 4 8 10 14
@ -88,15 +156,15 @@ btest: boundtest.c
 	@echo ------------ $@ ------------
 	@for i in $(BOUNDS_OK); do \
 	   echo ; echo --- boundtest $$i ---; \
-	   if $(TCC) -b -run boundtest.c $$i ; then \
-	       echo succeded as expected; \
+	   if $(TCC) -b -run $< $$i ; then \
+	       echo succeeded as expected; \
 	   else\
 	       echo Failed positive test $$i ; exit 1 ; \
 	   fi ;\
 	done ;\
 	for i in $(BOUNDS_FAIL); do \
 	   echo ; echo --- boundtest $$i ---; \
-	   if $(TCC) -b -run boundtest.c $$i ; then \
+	   if $(TCC) -b -run $< $$i ; then \
 	       echo Failed negative test $$i ; exit 1 ;\
 	   else\
 	       echo failed as expected; \
@ -108,11 +176,18 @@ btest: boundtest.c
 speedtest: ex2 ex3
 	@echo ------------ $@ ------------
 	time ./ex2 1238 2 3 4 10 13 4
-	time $(TCC) -run ../examples/ex2.c 1238 2 3 4 10 13 4
+	time $(TCC) -run $(TOPSRC)/examples/ex2.c 1238 2 3 4 10 13 4
 	time ./ex3 35
-	time $(TCC) -run ../examples/ex3.c 35
+	time $(TCC) -run $(TOPSRC)/examples/ex3.c 35

-ex%: ../examples/ex%.c
+weaktest: tcctest.c test.ref
+	$(TCC) -c $< -o weaktest.tcc.o
+	$(CC) -c $< -o weaktest.gcc.o $(NATIVE_DEFINES) $(CFLAGS) -w -O0 -std=gnu99 -fno-omit-frame-pointer
+	objdump -t weaktest.tcc.o | grep ' w ' | sed -e 's/.* \([a-zA-Z0-9_]*\)$$/\1/' | LC_ALL=C sort > weaktest.tcc.o.txt
+	objdump -t weaktest.gcc.o | grep ' w ' | sed -e 's/.* \([a-zA-Z0-9_]*\)$$/\1/' | LC_ALL=C sort > weaktest.gcc.o.txt
+	diff weaktest.gcc.o.txt weaktest.tcc.o.txt && echo "Weak Auto Test OK"
+
+ex%: $(TOPSRC)/examples/ex%.c
 	$(CC) -o $@ $< $(CFLAGS)

 # tiny assembler testing
@ -120,12 +195,71 @@ asmtest.ref: asmtest.S
 	$(CC) -Wa,-W -o asmtest.ref.o -c asmtest.S
 	objdump -D asmtest.ref.o > asmtest.ref

-asmtest: asmtest.ref
+asmtest asmtest2: asmtest.ref
 	@echo ------------ $@ ------------
-	$(TCC) -c asmtest.S
+	$(TCC) $(MAYBE_RUN_TCC) -c asmtest.S
 	objdump -D asmtest.o > asmtest.out
 	@if diff -u --ignore-matching-lines="file format" asmtest.ref asmtest.out ; then echo "ASM Auto Test OK"; fi

+# test assembler with tcc compiled by itself
+asmtest2: MAYBE_RUN_TCC = $(RUN_TCC)
+
+# Check that code generated by libtcc is binary compatible with
+# that generated by CC
+abitest-cc$(EXESUF): abitest.c $(LIBTCC)
+	$(CC) -o $@ $^ $(CFLAGS) $(LIBS) -w
+
+abitest-tcc$(EXESUF): abitest.c libtcc.c
+	$(TCC) -o $@ $^ $(NATIVE_DEFINES) $(LIBS)
+
+ABITESTS := abitest-cc$(EXESUF)
+ifneq ($(CONFIG_arm_eabi),yes) # not ARM soft-float
+ ABITESTS += abitest-tcc$(EXESUF)
+endif
+
+abitest: $(ABITESTS)
+	@echo ------------ $@ ------------
+	./abitest-cc$(EXESUF) $(TCCFLAGS)
+ifneq ($(CONFIG_arm_eabi),yes) # not ARM soft-float
+	./abitest-tcc$(EXESUF) $(TCCFLAGS)
+endif
+
+vla_test$(EXESUF): vla_test.c
+	$(TCC) -o $@ $^
+
+vla_test-run: vla_test$(EXESUF)
+	@echo ------------ $@ ------------
+	./vla_test$(EXESUF)
+
+asm-c-connect$(EXESUF): asm-c-connect-1.c asm-c-connect-2.c
+	$(TCC) -o $@ $^
+
+asm-c-connect-%.o: asm-c-connect-%.c
+	$(TCC) -c -o $@ $<
+
+asm-c-connect-sep$(EXESUF): asm-c-connect-1.o asm-c-connect-2.o
+	$(TCC) -o $@ $^
+
+asm-c-connect-test: asm-c-connect$(EXESUF) asm-c-connect-sep$(EXESUF)
+	@echo ------------ $@ ------------
+	./asm-c-connect$(EXESUF) > asm-c-connect.out1 && cat asm-c-connect.out1
+	./asm-c-connect-sep$(EXESUF) > asm-c-connect.out2 && cat asm-c-connect.out2
+	@diff -u asm-c-connect.out1 asm-c-connect.out2 && echo "ok"
+
+cross-test :
+	@echo ------------ $@ ------------
+	$(TOP)/i386-tcc$(EXESUF) $(TCCFLAGS-unx) -c $(TOPSRC)/examples/ex3.c && echo "ok"
+	$(TOP)/i386-win32-tcc$(EXESUF) $(TCCFLAGS-win) $(TOPSRC)/examples/ex3.c && echo "ok"
+	$(TOP)/x86_64-tcc$(EXESUF) $(TCCFLAGS-unx) -c $(TOPSRC)/examples/ex3.c && echo "ok"
+	$(TOP)/x86_64-win32-tcc$(EXESUF) $(TCCFLAGS-win) $(TOPSRC)/examples/ex3.c && echo "ok"
+	$(TOP)/arm-tcc$(EXESUF) $(TCCFLAGS-unx) -c $(TOPSRC)/examples/ex3.c && echo "ok"
+	$(TOP)/arm-wince-tcc$(EXESUF) $(TCCFLAGS-win) -c $(TOPSRC)/examples/ex3.c && echo "ok"
+	$(TOP)/arm64-tcc$(EXESUF) $(TCCFLAGS-unx) -c $(TOPSRC)/examples/ex3.c && echo "ok"
+	$(TOP)/c67-tcc$(EXESUF) $(TCCFLAGS-unx) -c $(TOPSRC)/examples/ex3.c && echo "ok"
+	$(TOP)/i386-win32-tcc$(EXESUF) $(TCCFLAGS-win) $(TOPSRC)/win32/examples/hello_win.c && echo "ok"
+	$(TOP)/x86_64-win32-tcc$(EXESUF) $(TCCFLAGS-win) $(TOPSRC)/win32/examples/hello_win.c && echo "ok"
+	$(TOP)/arm-wince-tcc$(EXESUF) $(TCCFLAGS-win) -c $(TOPSRC)/win32/examples/hello_win.c && echo "ok"
+
 # targets for development
 %.bin: %.c tcc
 	$(TCC) -g -o $@ $<
@ -143,5 +277,13 @@ cache: tcc_g

 # clean
 clean:
-	rm -vf *~ *.o *.a *.bin *.i *.ref *.out *.out? *.gcc \
-	   tcctest[1234] ex? libtcc_test$(EXESUF) tcc_g tcclib.h
+	rm -f *~ *.o *.a *.bin *.i *.ref *.out *.out? *.out?b *.cc *.gcc
+	rm -f *-cc *-gcc *-tcc *.exe hello libtcc_test vla_test tcctest[1234]
+	rm -f asm-c-connect$(EXESUF)
+	rm -f ex? tcc_g weaktest.*.txt *.def
+	@$(MAKE) -C tests2 $@
+	@$(MAKE) -C pp $@
+
+# silent clean, used before running tests
+clean-s:
+	@$(MAKE) -s --no-print-directory clean
--- a/tests/abitest.c
+++ b/tests/abitest.c
@ -0,0 +1,691 @@
+#include <libtcc.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdarg.h>
+
+// MinGW has 80-bit rather than 64-bit long double which isn't compatible with TCC or MSVC
+#if defined(_WIN32) && defined(__GNUC__)
+#define LONG_DOUBLE double
+#define LONG_DOUBLE_LITERAL(x) x
+#else
+#define LONG_DOUBLE long double
+#define LONG_DOUBLE_LITERAL(x) x ## L
+#endif
+
+static int g_argc;
+static char **g_argv;
+
+/* Forward the -B<path>, -I<path> and -L<path> command line arguments to
+   the given TCC state (library path, include path, library search path).
+   argv[0] and every other argument are ignored. */
+static void set_options(TCCState *s, int argc, char **argv)
+{
+    int n;
+    for (n = 1; n < argc; ++n) {
+        char *opt = argv[n];
+        if (opt[0] != '-')
+            continue;
+        switch (opt[1]) {
+        case 'B':
+            tcc_set_lib_path(s, opt + 2);
+            break;
+        case 'I':
+            tcc_add_include_path(s, opt + 2);
+            break;
+        case 'L':
+            tcc_add_library_path(s, opt + 2);
+            break;
+        default:
+            break;
+        }
+    }
+}
+
+typedef int (*callback_type) (void*);
+
+/*
+ * Compile source code and call a callback with a pointer to the symbol "f".
+ *
+ * src      - C source text that must define a symbol named "f"
+ * callback - invoked with the address of "f" after relocation
+ *
+ * Returns the callback's result, or -1 if the TCC state could not be
+ * created, or if setting the output type, compiling, relocating or
+ * looking up "f" failed.  The TCC state is deleted on every path once
+ * it has been created.
+ */
+static int run_callback(const char *src, callback_type callback) {
+  TCCState *s;
+  int result = -1;
+  void *ptr;
+
+  s = tcc_new();
+  if (!s)
+    return -1;
+
+  set_options(s, g_argc, g_argv);
+
+  if (tcc_set_output_type(s, TCC_OUTPUT_MEMORY) == -1)
+    goto done;
+  if (tcc_compile_string(s, src) == -1)
+    goto done;
+  if (tcc_relocate(s, TCC_RELOCATE_AUTO) == -1)
+    goto done;
+
+  ptr = tcc_get_symbol(s, "f");
+  if (!ptr)
+    goto done;
+  result = callback(ptr);
+
+done:
+  /* single exit point so a failed step no longer leaks the state */
+  tcc_delete(s);
+  return result;
+}
+
+#define STR2(x) #x
+#define STR(x) STR2(x)
+
+/* Define a pair of functions for one primitive type:
+   ret_<name>_test() hands the source "<type> f(<type> x) {return x+x;}"
+   to run_callback(), and ret_<name>_test_callback() calls the resulting
+   function with 'val' and returns 0 if the result equals val+val,
+   -1 otherwise. */
+#define RET_PRIMITIVE_TEST(name, type, val) \
+  static int ret_ ## name ## _test_callback(void *ptr) { \
+    type (*callback) (type) = (type(*)(type))ptr; \
+    type x = val; \
+    type y = callback(x); \
+    return (y == x+x) ? 0 : -1; \
+  } \
+  \
+  static int ret_ ## name ## _test(void) { \
+    const char *src = STR(type) " f(" STR(type) " x) {return x+x;}"; \
+    return run_callback(src, ret_ ## name ## _test_callback); \
+  }
+
+RET_PRIMITIVE_TEST(int, int, 70000)
+RET_PRIMITIVE_TEST(longlong, long long, 4333369356528LL)
+RET_PRIMITIVE_TEST(float, float, 63.0)
+RET_PRIMITIVE_TEST(double, double, 14789798.0)
+RET_PRIMITIVE_TEST(longdouble, LONG_DOUBLE, LONG_DOUBLE_LITERAL(378943892.0))
+
+/*
+ * ret_2float_test:
+ * 
+ * On x86-64, a struct with 2 floats should be packed into a single
+ * SSE register (VT_DOUBLE is used for this purpose).
+ */
+typedef struct ret_2float_test_type_s {float x, y;} ret_2float_test_type;
+typedef ret_2float_test_type (*ret_2float_test_function_type) (ret_2float_test_type);
+
+static int ret_2float_test_callback(void *ptr) {
+  ret_2float_test_function_type f = (ret_2float_test_function_type)ptr;
+  ret_2float_test_type a = {10, 35};
+  ret_2float_test_type r;
+  r = f(a);
+  return ((r.x == a.x*5) && (r.y == a.y*3)) ? 0 : -1;
+}
+
+static int ret_2float_test(void) {
+  const char *src =
+  "typedef struct ret_2float_test_type_s {float x, y;} ret_2float_test_type;"
+  "ret_2float_test_type f(ret_2float_test_type a) {\n"
+  "  ret_2float_test_type r = {a.x*5, a.y*3};\n"
+  "  return r;\n"
+  "}\n";
+
+  return run_callback(src, ret_2float_test_callback);
+}
+
+/*
+ * ret_2double_test:
+ * 
+ * On x86-64, a struct with 2 doubles should be passed in two SSE
+ * registers.
+ */
+typedef struct ret_2double_test_type_s {double x, y;} ret_2double_test_type;
+typedef ret_2double_test_type (*ret_2double_test_function_type) (ret_2double_test_type);
+
+static int ret_2double_test_callback(void *ptr) {
+  ret_2double_test_function_type f = (ret_2double_test_function_type)ptr;
+  ret_2double_test_type a = {10, 35};
+  ret_2double_test_type r;
+  r = f(a);
+  return ((r.x == a.x*5) && (r.y == a.y*3)) ? 0 : -1;
+}
+
+static int ret_2double_test(void) {
+  const char *src =
+  "typedef struct ret_2double_test_type_s {double x, y;} ret_2double_test_type;"
+  "ret_2double_test_type f(ret_2double_test_type a) {\n"
+  "  ret_2double_test_type r = {a.x*5, a.y*3};\n"
+  "  return r;\n"
+  "}\n";
+
+  return run_callback(src, ret_2double_test_callback);
+}
+
+/*
+ * ret_8plus2double_test:
+ *
+ * This catches a corner case in the x86_64 ABI code: the first 7
+ * arguments fit into registers, the 8th doesn't, but the 9th argument
+ * fits into the 8th XMM register.
+ *
+ * Note that the purpose of the 10th argument is to avoid a situation
+ * in which gcc would accidentally put the double at the right
+ * address, thus causing a success message even though TCC actually
+ * generated incorrect code.
+ */
+typedef ret_2double_test_type (*ret_8plus2double_test_function_type) (double, double, double, double, double, double, double, ret_2double_test_type, double, double);
+
+static int ret_8plus2double_test_callback(void *ptr) {
+  ret_8plus2double_test_function_type f = (ret_8plus2double_test_function_type)ptr;
+  ret_2double_test_type a = {10, 35};
+  ret_2double_test_type r;
+  r = f(0, 0, 0, 0, 0, 0, 0, a, 37, 38);
+  return ((r.x == 37) && (r.y == 37)) ? 0 : -1;
+}
+
+static int ret_8plus2double_test(void) {
+  const char *src =
+  "typedef struct ret_2double_test_type_s {double x, y;} ret_2double_test_type;"
+  "ret_2double_test_type f(double x1, double x2, double x3, double x4, double x5, double x6, double x7, ret_2double_test_type a, double x8, double x9) {\n"
+  "  ret_2double_test_type r = { x8, x8 };\n"
+  "  return r;\n"
+  "}\n";
+
+  return run_callback(src, ret_8plus2double_test_callback);
+}
+
+/*
+ * ret_mixed_test:
+ *
+ * On x86-64, a struct with a double and a 64-bit integer should be
+ * passed in one SSE register and one integer register.
+ */
+typedef struct ret_mixed_test_type_s {double x; long long y;} ret_mixed_test_type;
+typedef ret_mixed_test_type (*ret_mixed_test_function_type) (ret_mixed_test_type);
+
+static int ret_mixed_test_callback(void *ptr) {
+  ret_mixed_test_function_type f = (ret_mixed_test_function_type)ptr;
+  ret_mixed_test_type a = {10, 35};
+  ret_mixed_test_type r;
+  r = f(a);
+  return ((r.x == a.x*5) && (r.y == a.y*3)) ? 0 : -1;
+}
+
+static int ret_mixed_test(void) {
+  const char *src =
+  "typedef struct ret_mixed_test_type_s {double x; long long y;} ret_mixed_test_type;"
+  "ret_mixed_test_type f(ret_mixed_test_type a) {\n"
+  "  ret_mixed_test_type r = {a.x*5, a.y*3};\n"
+  "  return r;\n"
+  "}\n";
+
+  return run_callback(src, ret_mixed_test_callback);
+}
+
+/*
+ * ret_mixed2_test:
+ *
+ * On x86-64, a struct with two floats and two 32-bit integers should
+ * be passed in one SSE register and one integer register.
+ */
+typedef struct ret_mixed2_test_type_s {float x,x2; int y,y2;} ret_mixed2_test_type;
+typedef ret_mixed2_test_type (*ret_mixed2_test_function_type) (ret_mixed2_test_type);
+
+static int ret_mixed2_test_callback(void *ptr) {
+  ret_mixed2_test_function_type f = (ret_mixed2_test_function_type)ptr;
+  ret_mixed2_test_type a = {10, 5, 35, 7 };
+  ret_mixed2_test_type r;
+  r = f(a);
+  return ((r.x == a.x*5) && (r.y == a.y*3)) ? 0 : -1;
+}
+
+static int ret_mixed2_test(void) {
+  const char *src =
+  "typedef struct ret_mixed2_test_type_s {float x, x2; int y,y2;} ret_mixed2_test_type;"
+  "ret_mixed2_test_type f(ret_mixed2_test_type a) {\n"
+  "  ret_mixed2_test_type r = {a.x*5, 0, a.y*3, 0};\n"
+  "  return r;\n"
+  "}\n";
+
+  return run_callback(src, ret_mixed2_test_callback);
+}
+
+/*
+ * ret_mixed3_test:
+ *
+ * On x86-64, this struct should be passed in two integer registers.
+ */
+typedef struct ret_mixed3_test_type_s {float x; int y; float x2; int y2;} ret_mixed3_test_type;
+typedef ret_mixed3_test_type (*ret_mixed3_test_function_type) (ret_mixed3_test_type);
+
+static int ret_mixed3_test_callback(void *ptr) {
+  ret_mixed3_test_function_type f = (ret_mixed3_test_function_type)ptr;
+  ret_mixed3_test_type a = {10, 5, 35, 7 };
+  ret_mixed3_test_type r;
+  r = f(a);
+  return ((r.x == a.x*5) && (r.y2 == a.y*3)) ? 0 : -1;
+}
+
+static int ret_mixed3_test(void) {
+  const char *src =
+  "typedef struct ret_mixed3_test_type_s {float x; int y; float x2; int y2;} ret_mixed3_test_type;"
+  "ret_mixed3_test_type f(ret_mixed3_test_type a) {\n"
+  "  ret_mixed3_test_type r = {a.x*5, 0, 0, a.y*3};\n"
+  "  return r;\n"
+  "}\n";
+
+  return run_callback(src, ret_mixed3_test_callback);
+}
+
+/*
+ * reg_pack_test: return a small struct which should be packed into
+ * registers (Win32) during return.
+ */
+typedef struct reg_pack_test_type_s {int x, y;} reg_pack_test_type;
+typedef reg_pack_test_type (*reg_pack_test_function_type) (reg_pack_test_type);
+
+static int reg_pack_test_callback(void *ptr) {
+  reg_pack_test_function_type f = (reg_pack_test_function_type)ptr;
+  reg_pack_test_type a = {10, 35};
+  reg_pack_test_type r;
+  r = f(a);
+  return ((r.x == a.x*5) && (r.y == a.y*3)) ? 0 : -1;
+}
+
+static int reg_pack_test(void) {
+  const char *src =
+  "typedef struct reg_pack_test_type_s {int x, y;} reg_pack_test_type;"
+  "reg_pack_test_type f(reg_pack_test_type a) {\n"
+  "  reg_pack_test_type r = {a.x*5, a.y*3};\n"
+  "  return r;\n"
+  "}\n";
+  
+  return run_callback(src, reg_pack_test_callback);
+}
+
+/*
+ * reg_pack_longlong_test: return a small struct which should be packed into
+ * registers (x86-64) during return.
+ */
+typedef struct reg_pack_longlong_test_type_s {long long x, y;} reg_pack_longlong_test_type;
+typedef reg_pack_longlong_test_type (*reg_pack_longlong_test_function_type) (reg_pack_longlong_test_type);
+
+static int reg_pack_longlong_test_callback(void *ptr) {
+  reg_pack_longlong_test_function_type f = (reg_pack_longlong_test_function_type)ptr;
+  reg_pack_longlong_test_type a = {10, 35};
+  reg_pack_longlong_test_type r;
+  r = f(a);
+  return ((r.x == a.x*5) && (r.y == a.y*3)) ? 0 : -1;
+}
+
+static int reg_pack_longlong_test(void) {
+  const char *src =
+  "typedef struct reg_pack_longlong_test_type_s {long long x, y;} reg_pack_longlong_test_type;"
+  "reg_pack_longlong_test_type f(reg_pack_longlong_test_type a) {\n"
+  "  reg_pack_longlong_test_type r = {a.x*5, a.y*3};\n"
+  "  return r;\n"
+  "}\n";
+  
+  return run_callback(src, reg_pack_longlong_test_callback);
+}
+
+/*
+ * ret_6plus2longlong_test:
+ *
+ * This catches a corner case in the x86_64 ABI code: the first 5
+ * arguments fit into registers, the 6th doesn't, but the 7th argument
+ * fits into the 6th argument integer register, %r9.
+ *
+ * Note that the purpose of the 8th argument is to avoid a situation
+ * in which gcc would accidentally put the longlong at the right
+ * address, thus causing a success message even though TCC actually
+ * generated incorrect code.
+ */
+typedef reg_pack_longlong_test_type (*ret_6plus2longlong_test_function_type) (long long, long long, long long, long long, long long, reg_pack_longlong_test_type, long long, long long);
+
+static int ret_6plus2longlong_test_callback(void *ptr) {
+  ret_6plus2longlong_test_function_type f = (ret_6plus2longlong_test_function_type)ptr;
+  reg_pack_longlong_test_type a = {10, 35};
+  reg_pack_longlong_test_type r;
+  r = f(0, 0, 0, 0, 0, a, 37, 38);
+  return ((r.x == 37) && (r.y == 37)) ? 0 : -1;
+}
+
+static int ret_6plus2longlong_test(void) {
+  const char *src =
+  "typedef struct reg_pack_longlong_test_type_s {long long x, y;} reg_pack_longlong_test_type;"
+  "reg_pack_longlong_test_type f(long long x1, long long x2, long long x3, long long x4, long long x5, reg_pack_longlong_test_type a, long long x8, long long x9) {\n"
+  "  reg_pack_longlong_test_type r = { x8, x8 };\n"
+  "  return r;\n"
+  "}\n";
+
+  return run_callback(src, ret_6plus2longlong_test_callback);
+}
+
+/*
+ * sret_test: Create a struct large enough to be returned via sret
+ * (hidden pointer as first function argument)
+ */
+typedef struct sret_test_type_s {long long a, b, c;} sret_test_type;
+typedef sret_test_type (*sret_test_function_type) (sret_test_type);
+
+static int sret_test_callback(void *ptr) {
+  sret_test_function_type f = (sret_test_function_type)(ptr);
+  sret_test_type x = {5436LL, 658277698LL, 43878957LL};
+  sret_test_type r = f(x);
+  return ((r.a==x.a*35)&&(r.b==x.b*19)&&(r.c==x.c*21)) ? 0 : -1;
+}
+
+static int sret_test(void) {
+  const char *src =
+  "typedef struct sret_test_type_s {long long a, b, c;} sret_test_type;\n"
+  "sret_test_type f(sret_test_type x) {\n"
+  "  sret_test_type r = {x.a*35, x.b*19, x.c*21};\n"
+  "  return r;\n"
+  "}\n";
+  
+  return run_callback(src, sret_test_callback);
+}
+
+/*
+ * one_member_union_test:
+ * 
+ * In the x86-64 ABI a union should always be passed on the stack. However
+ * it appears that a single member union is treated by GCC as its member.
+ */
+typedef union one_member_union_test_type_u {int x;} one_member_union_test_type;
+typedef one_member_union_test_type (*one_member_union_test_function_type) (one_member_union_test_type);
+
+static int one_member_union_test_callback(void *ptr) {
+  one_member_union_test_function_type f = (one_member_union_test_function_type)ptr;
+  one_member_union_test_type a, b;
+  a.x = 34;
+  b = f(a);
+  return (b.x == a.x*2) ? 0 : -1;
+}
+
+static int one_member_union_test(void) {
+  const char *src =
+  "typedef union one_member_union_test_type_u {int x;} one_member_union_test_type;\n"
+  "one_member_union_test_type f(one_member_union_test_type a) {\n"
+  "  one_member_union_test_type b;\n"
+  "  b.x = a.x * 2;\n"
+  "  return b;\n"
+  "}\n";
+  return run_callback(src, one_member_union_test_callback);
+}
+
+/*
+ * two_member_union_test:
+ * 
+ * In the x86-64 ABI a union should always be passed on the stack.
+ */
+typedef union two_member_union_test_type_u {int x; long y;} two_member_union_test_type;
+typedef two_member_union_test_type (*two_member_union_test_function_type) (two_member_union_test_type);
+
+static int two_member_union_test_callback(void *ptr) {
+  two_member_union_test_function_type f = (two_member_union_test_function_type)ptr;
+  two_member_union_test_type a, b;
+  a.x = 34;
+  b = f(a);
+  return (b.x == a.x*2) ? 0 : -1;
+}
+
+static int two_member_union_test(void) {
+  const char *src =
+  "typedef union two_member_union_test_type_u {int x; long y;} two_member_union_test_type;\n"
+  "two_member_union_test_type f(two_member_union_test_type a) {\n"
+  "  two_member_union_test_type b;\n"
+  "  b.x = a.x * 2;\n"
+  "  return b;\n"
+  "}\n";
+  return run_callback(src, two_member_union_test_callback);
+}
+
+/*
+ * Win64 calling convention test.
+ */
+
+typedef struct many_struct_test_type_s {long long a, b, c;} many_struct_test_type;
+typedef many_struct_test_type (*many_struct_test_function_type) (many_struct_test_type,many_struct_test_type,many_struct_test_type,many_struct_test_type,many_struct_test_type,many_struct_test_type);
+ 
+static int many_struct_test_callback(void *ptr) {
+  many_struct_test_function_type f = (many_struct_test_function_type)ptr;
+  many_struct_test_type v = {1, 2, 3};
+  many_struct_test_type r = f(v,v,v,v,v,v);
+  return ((r.a == 6) && (r.b == 12) && (r.c == 18))?0:-1;
+}
+
+static int many_struct_test(void) {
+  const char *src =
+  "typedef struct many_struct_test_type_s {long long a, b, c;} many_struct_test_type;\n"
+  "many_struct_test_type f(many_struct_test_type x1, many_struct_test_type x2, many_struct_test_type x3, many_struct_test_type x4, many_struct_test_type x5, many_struct_test_type x6) {\n"
+  "  many_struct_test_type y;\n"
+  "  y.a = x1.a + x2.a + x3.a + x4.a + x5.a + x6.a;\n"
+  "  y.b = x1.b + x2.b + x3.b + x4.b + x5.b + x6.b;\n"
+  "  y.c = x1.c + x2.c + x3.c + x4.c + x5.c + x6.c;\n"
+  "  return y;\n"
+  "}\n";
+  return run_callback(src, many_struct_test_callback);
+}
+
+/*
+ * Win64 calling convention test.
+ */
+
+typedef struct many_struct_test_2_type_s {int a, b;} many_struct_test_2_type;
+typedef many_struct_test_2_type (*many_struct_test_2_function_type) (many_struct_test_2_type,many_struct_test_2_type,many_struct_test_2_type,many_struct_test_2_type,many_struct_test_2_type,many_struct_test_2_type);
+ 
+static int many_struct_test_2_callback(void *ptr) {
+  many_struct_test_2_function_type f = (many_struct_test_2_function_type)ptr;
+  many_struct_test_2_type v = {1,2};
+  many_struct_test_2_type r = f(v,v,v,v,v,v);
+  return ((r.a == 6) && (r.b == 12))?0:-1;
+}
+
+static int many_struct_test_2(void) {
+  const char *src =
+  "typedef struct many_struct_test_2_type_s {int a, b;} many_struct_test_2_type;\n"
+  "many_struct_test_2_type f(many_struct_test_2_type x1, many_struct_test_2_type x2, many_struct_test_2_type x3, many_struct_test_2_type x4, many_struct_test_2_type x5, many_struct_test_2_type x6) {\n"
+  "  many_struct_test_2_type y;\n"
+  "  y.a = x1.a + x2.a + x3.a + x4.a + x5.a + x6.a;\n"
+  "  y.b = x1.b + x2.b + x3.b + x4.b + x5.b + x6.b;\n"
+  "  return y;\n"
+  "}\n";
+  return run_callback(src, many_struct_test_2_callback);
+}
+
+/*
+ * Win64 calling convention test.
+ */
+
+typedef struct many_struct_test_3_type_s {int a, b;} many_struct_test_3_type;
+typedef many_struct_test_3_type (*many_struct_test_3_function_type) (many_struct_test_3_type,many_struct_test_3_type,many_struct_test_3_type,many_struct_test_3_type,many_struct_test_3_type,many_struct_test_3_type, ...);
+typedef struct many_struct_test_3_struct_type { many_struct_test_3_function_type f; many_struct_test_3_function_type *f2; } many_struct_test_3_struct_type;
+
+static void many_struct_test_3_dummy(double d, ...)
+{
+  volatile double x = d;
+}
+
+/* Call the compiled variadic function "f" (passed as ptr) with six {1,2}
+   structs plus one extra double argument and check that the member-wise
+   sums come back as a == 6 and b == 12.  Returns 0 on success, -1
+   otherwise.
+   NOTE(review): the function pointer is deliberately routed through a
+   struct, a pointer to that struct and a variadic dummy call before use;
+   this presumably exercises code generation for indirect calls - keep
+   the statements and their order as they are. */
+static int many_struct_test_3_callback(void *ptr) {
+  /* s.f holds the function pointer; s.f2 is set to point at s.f below */
+  many_struct_test_3_struct_type s = { ptr, };
+  many_struct_test_3_struct_type *s2 = &s;
+  s2->f2 = &s2->f;
+  /* variadic call with nine doubles and the address of s2 */
+  many_struct_test_3_dummy(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, &s2);
+  many_struct_test_3_function_type f = *(s2->f2);
+  many_struct_test_3_type v = {1,2};
+  /* the callee expression re-points s2->f2 at the local copy 'f' and
+     dereferences it in the same expression */
+  many_struct_test_3_type r = (*((s2->f2=&f)+0))(v,v,v,v,v,v,1.0);
+  return ((r.a == 6) && (r.b == 12))?0:-1;
+}
+
+static int many_struct_test_3(void) {
+  const char *src =
+  "typedef struct many_struct_test_3_type_s {int a, b;} many_struct_test_3_type;\n"
+  "many_struct_test_3_type f(many_struct_test_3_type x1, many_struct_test_3_type x2, many_struct_test_3_type x3, many_struct_test_3_type x4, many_struct_test_3_type x5, many_struct_test_3_type x6, ...) {\n"
+  "  many_struct_test_3_type y;\n"
+  "  y.a = x1.a + x2.a + x3.a + x4.a + x5.a + x6.a;\n"
+  "  y.b = x1.b + x2.b + x3.b + x4.b + x5.b + x6.b;\n"
+  "  return y;\n"
+  "}\n";
+  return run_callback(src, many_struct_test_3_callback);
+}
+
+/*
+ * stdarg_test: Test variable argument list ABI
+ */
+
+typedef struct {long long a, b, c;} stdarg_test_struct_type;
+typedef void (*stdarg_test_function_type) (int,int,int,...);
+
+static int stdarg_test_callback(void *ptr) {
+  stdarg_test_function_type f = (stdarg_test_function_type)ptr;
+  int x;
+  double y;
+  stdarg_test_struct_type z = {1, 2, 3}, w;
+  f(10, 10, 5,
+    1, 2, 3, 4, 5, 6, 7, 8, 9, 10, &x,
+    1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, &y,
+    z, z, z, z, z, &w);
+  return ((x == 55) && (y == 55) && (w.a == 5) && (w.b == 10) && (w.c == 15)) ? 0 : -1;
+}
+
+static int stdarg_test(void) {
+  const char *src =
+  "#include <stdarg.h>\n"
+  "typedef struct {long long a, b, c;} stdarg_test_struct_type;\n"
+  "void f(int n_int, int n_float, int n_struct, ...) {\n"
+  "  int i, ti = 0;\n"
+  "  double td = 0.0;\n"
+  "  stdarg_test_struct_type ts = {0,0,0}, tmp;\n"
+  "  va_list ap;\n"
+  "  va_start(ap, n_struct);\n"
+  "  for (i = 0, ti = 0; i < n_int; ++i)\n"
+  "    ti += va_arg(ap, int);\n"
+  "  *va_arg(ap, int*) = ti;\n"
+  "  for (i = 0, td = 0; i < n_float; ++i)\n"
+  "    td += va_arg(ap, double);\n"
+  "  *va_arg(ap, double*) = td;\n"
+  "  for (i = 0; i < n_struct; ++i) {\n"
+  "    tmp = va_arg(ap, stdarg_test_struct_type);\n"
+  "    ts.a += tmp.a; ts.b += tmp.b; ts.c += tmp.c;"
+  "  }\n"
+  "  *va_arg(ap, stdarg_test_struct_type*) = ts;\n"
+  "  va_end(ap);"
+  "}\n";
+  return run_callback(src, stdarg_test_callback);
+}
+
+typedef struct {long long a, b;} stdarg_many_test_struct_type;
+typedef void (*stdarg_many_test_function_type) (int, int, int, int, int,
+						stdarg_many_test_struct_type,
+						int, int, ...);
+
+static int stdarg_many_test_callback(void *ptr)
+{
+  stdarg_many_test_function_type f = (stdarg_many_test_function_type)ptr;
+  int x;
+  stdarg_many_test_struct_type l = {10, 11};
+  f(1, 2, 3, 4, 5, l, 6, 7, &x, 44);
+  return x == 44 ? 0 : -1;
+}
+
+static int stdarg_many_test(void)
+{
+  const char *src =
+  "#include <stdarg.h>\n"
+  "typedef struct {long long a, b;} stdarg_many_test_struct_type;\n"
+  "void f (int a, int b, int c, int d, int e, stdarg_many_test_struct_type l, int f, int g, ...){\n"
+  "  va_list ap;\n"
+  "  int *p;\n"
+  "  va_start (ap, g);\n"
+  "  p = va_arg(ap, int*);\n"
+  "  *p = va_arg(ap, int);\n"
+  "  va_end (ap);\n"
+  "}\n";
+  return run_callback(src, stdarg_many_test_callback);
+}
+
+/*
+ * Test Win32 stdarg handling, since the calling convention will pass a pointer
+ * to the struct and the stdarg pointer must point to that pointer initially.
+ */
+
+typedef struct {long long a, b, c;} stdarg_struct_test_struct_type;
+typedef int (*stdarg_struct_test_function_type) (stdarg_struct_test_struct_type a, ...);
+
+static int stdarg_struct_test_callback(void *ptr) {
+  stdarg_struct_test_function_type f = (stdarg_struct_test_function_type)ptr;
+  stdarg_struct_test_struct_type v = {10, 35, 99};
+  int x = f(v, 234);
+  return (x == 378) ? 0 : -1;
+}
+
+static int stdarg_struct_test(void) {
+  const char *src =
+  "#include <stdarg.h>\n"
+  "typedef struct {long long a, b, c;} stdarg_struct_test_struct_type;\n"
+  "int f(stdarg_struct_test_struct_type a, ...) {\n"
+  "  va_list ap;\n"
+  "  va_start(ap, a);\n"
+  "  int z = va_arg(ap, int);\n"
+  "  va_end(ap);\n"
+  "  return z + a.a + a.b + a.c;\n"
+  "}\n";
+  return run_callback(src, stdarg_struct_test_callback);
+}
+
+/* Test that x86-64 arranges the stack correctly for arguments with alignment >8 bytes */
+
+typedef LONG_DOUBLE (*arg_align_test_callback_type) (LONG_DOUBLE,int,LONG_DOUBLE,int,LONG_DOUBLE);
+
+static int arg_align_test_callback(void *ptr) {
+  arg_align_test_callback_type f = (arg_align_test_callback_type)ptr;
+  long double x = f(12, 0, 25, 0, 37);
+  return (x == 74) ? 0 : -1;
+}
+
+static int arg_align_test(void) {
+  const char *src = 
+  "long double f(long double a, int b, long double c, int d, long double e) {\n"
+  "  return a + c + e;\n"
+  "}\n";
+  return run_callback(src, arg_align_test_callback);
+}
+
+/* Run test function t when no test name was selected ('testname' is NULL)
+   or when 'testname' equals the function's name.  Prints "<name>... "
+   followed by "success" or "failure"; a failure (non-zero return from
+   t()) sets 'retval' to EXIT_FAILURE.  Both 'testname' and 'retval'
+   must be in scope where the macro is used. */
+#define RUN_TEST(t) \
+  if (!testname || (strcmp(#t, testname) == 0)) { \
+    fputs(#t "... ", stdout); \
+    fflush(stdout); \
+    if (t() == 0) { \
+      fputs("success\n", stdout); \
+    } else { \
+      fputs("failure\n", stdout); \
+      retval = EXIT_FAILURE; \
+    } \
+  }
+
+/* Test driver: runs every ABI test below, or only the one named by a
+   "run_test=<name>" argument.  Returns EXIT_SUCCESS if all executed
+   tests passed and EXIT_FAILURE otherwise. */
+int main(int argc, char **argv) {
+  int i;
+  const char *testname = NULL;
+  int retval = EXIT_SUCCESS;
+
+  /* look for an optional "run_test=<name>" argument; strncmp() stops at
+     the terminating NUL, so arguments shorter than the prefix are not
+     read past their end (memcmp() would compare 9 bytes regardless) */
+  for (i = 1; i < argc; ++i) {
+    if (!strncmp(argv[i], "run_test=", 9))
+      testname = argv[i] + 9;
+  }
+
+  /* keep the command line around: run_callback() passes it on to
+     set_options() for every compilation */
+  g_argv = argv, g_argc = argc;
+
+  RUN_TEST(ret_int_test);
+  RUN_TEST(ret_longlong_test);
+  RUN_TEST(ret_float_test);
+  RUN_TEST(ret_double_test);
+  RUN_TEST(ret_longdouble_test);
+  RUN_TEST(ret_2float_test);
+  RUN_TEST(ret_2double_test);
+  RUN_TEST(ret_8plus2double_test);
+  RUN_TEST(ret_6plus2longlong_test);
+#if !defined __x86_64__ || defined _WIN32
+  /* currently broken on x86_64 linux */
+  RUN_TEST(ret_mixed_test);
+  RUN_TEST(ret_mixed2_test);
+#endif
+  RUN_TEST(ret_mixed3_test);
+  RUN_TEST(reg_pack_test);
+  RUN_TEST(reg_pack_longlong_test);
+  RUN_TEST(sret_test);
+  RUN_TEST(one_member_union_test);
+  RUN_TEST(two_member_union_test);
+  RUN_TEST(many_struct_test);
+  RUN_TEST(many_struct_test_2);
+  RUN_TEST(many_struct_test_3);
+  RUN_TEST(stdarg_test);
+  RUN_TEST(stdarg_many_test);
+  RUN_TEST(stdarg_struct_test);
+  RUN_TEST(arg_align_test);
+  return retval;
+}
--- a/tests/asm-c-connect-1.c
+++ b/tests/asm-c-connect-1.c
@ -0,0 +1,57 @@
+#include <stdio.h>
+
+#if defined _WIN32 && !defined __TINYC__
+# define _ "_"
+#else
+# define _
+#endif
+
+/* File-local C function that is only reached through the file-scope asm
+   stub "x1" below, which calls it by symbol name.  Prints the " x1" tag. */
+static int x1_c(void)
+{
+    printf(" x1");
+    return 1;
+}
+
+asm(".text;"_"x1: call "_"x1_c; ret");
+
+void callx4(void);
+void callx5_again(void);
+
+/* Externally visible x6.  asm-c-connect-2.c defines a static function with
+   the same name; the " x6-1" tag identifies this definition in the output. */
+void x6()
+{
+    printf(" x6-1");
+}
+
+/*
+ * Calls the x1..x6 helpers in order, mixing plain C calls with
+ * asm("call ...") so that symbols must resolve between C and asm code:
+ *   x1            asm label defined at file scope above
+ *   x2            static C function defined further down in this file
+ *   x3            C function defined in asm-c-connect-2.c
+ *   callx4        asm-defined in asm-c-connect-2.c, called here from C
+ *   x5            C function defined further down in this file
+ *   callx5_again  C function defined in asm-c-connect-2.c
+ *   x6            the global x6 defined above
+ * The tags printed by the callees appear between the "*" markers.
+ */
+int main(int argc, char *argv[])
+{
+    printf("*");
+    asm("call "_"x1");
+    asm("call "_"x2");
+    asm("call "_"x3");
+    callx4();
+    asm("call "_"x5");
+    callx5_again();
+    x6();
+    printf(" *\n");
+    return 0;
+}
+
+/* Static function defined after its only use: main reaches it solely via
+   asm("call x2"), never through a C-level reference.  Prints " x2". */
+static
+int x2(void)
+{
+    printf(" x2");
+    return 2;
+}
+
+extern int x3(void);
+
+/* Target of the asm-defined callx4 in asm-c-connect-2.c ("call x4").
+   Prints " x4". */
+void x4(void)
+{
+    printf(" x4");
+}
+
+/* Called from main via asm("call x5") before this definition is seen, and
+   from callx5_again in asm-c-connect-2.c through a normal C call.
+   Prints " x5". */
+void x5(void);
+void x5(void)
+{
+    printf(" x5");
+}
--- a/tests/asm-c-connect-2.c
+++ b/tests/asm-c-connect-2.c
@ -0,0 +1,36 @@
+#include <stdio.h>
+
+#if defined _WIN32 && !defined __TINYC__
+# define _ "_"
+#else
+# define _
+#endif
+
+/* Global function called from main in asm-c-connect-1.c via
+   asm("call x3").  Prints " x3". */
+int x3(void)
+{
+    printf(" x3");
+    return 3;
+}
+
+/* That callx4 is defined globally (as if ".globl callx4")
+   is a TCC extension.  GCC doesn't behave like this.  */
+void callx4(void);
+__asm__(_"callx4: call "_"x4; ret;"
+#ifndef __TINYC__
+    " .global "_"callx4"
+#endif
+);
+
+extern void x5(void);
+
+/* Called from main in asm-c-connect-1.c.  Calls x5 (defined in that file)
+   through C, then issues asm("call x6"); an x6 exists both as a static
+   function further down in this file and as a global in
+   asm-c-connect-1.c. */
+void callx5_again(void);
+void callx5_again(void)
+{
+    x5();
+    asm("call "_"x6");
+}
+
+/* File-local x6, sharing its name with the global x6 in
+   asm-c-connect-1.c; the " x6-2" tag identifies this definition in the
+   output. */
+static void x6()
+{
+    printf(" x6-2");
+}
--- a/tests/asmtest.S
+++ b/tests/asmtest.S
@ -1,3 +1,4 @@
+# gas comment with ``gnu'' style quotes

 /* some directive tests */

@ -9,14 +10,23 @@
   .int 1, 2, 3
   .align 8
   .byte 1
-   .align 16, 0x90
+/* .align 16, 0x90 gas is too clever for us with 0x90 fill */
+   .balign 4, 0x92
+   .align 16, 0x91 /* 0x91 tests the non-clever behaviour */
   .skip 3
   .skip 15, 0x90
   .string "hello\0world"
+/* Macro expansion should work like with C, the #n shouldn't be parsed
+   as asm line comment */
+#define __stringify(n) #n
+#define stringify(n) __stringify(n)
+   .skip 8,0x90
+   .asciz stringify(BLA)
+   .skip 8,0x90

+# 28 "asmtest.S"        # a line directive (and a line comment)
+        movl %eax, %ebx # some more asm comment
 /* some label tests */
-
-        movl %eax, %ebx
 L1:
        movl %eax, %ebx
        mov 0x10000, %eax
@ -36,7 +46,7 @@ mov %al, 0x10000
                
 mov $1, %edx
 mov $1, %dx
-mov $1, %dl
+mov $1, %cl
 movb $2, 0x100(%ebx,%edx,2)
 movw $2, 0x100(%ebx,%edx,2)
 movl $2, 0x100(%ebx,%edx,2)
@ -44,17 +54,55 @@ movl %eax, 0x100(%ebx,%edx,2)
 movl 0x100(%ebx,%edx,2), %edx
 movw %ax, 0x100(%ebx,%edx,2)

+movw $0x1122,%si
+movl $0x112233,%edx
+movl $0x80000000, %esi
+movl $-0x7fffffff, %edi
+#ifdef __x86_64__
+mov $0x11223344,%rbx
+movq $0x11223344,%rbx
+mov $0x1122334455,%rbx
+movq $0x1122334455,%rbx
+movl $0x11334455,(%rbx)
+#endif
+
 mov %eax, 0x12(,%edx,2)
        
+#ifdef __i386__
 mov %cr3, %edx
 mov %ecx, %cr3
 movl %cr3, %eax
 movl %tr3, %eax
 movl %db3, %ebx
 movl %dr6, %eax
+#else
+mov %cr3, %rdx
+mov %rcx, %cr3
+movq %cr3, %rax
+movq %db3, %rbx
+movq %dr6, %rax
+mov %cr8, %rsi
+mov %rdi, %cr8
+#endif
 movl %fs, %ecx
 movl %ebx, %fs

+#ifdef __x86_64__
+movq %r8, %r9
+movq %r10, %r11
+movq %r12, %r13
+movq %r14, %r15
+movq %rax, %r9
+movq %r15, %rsi
+inc %r9b
+dec %r10w
+not %r11d
+negq %r12
+decb %r13b
+incw %r14w
+notl %r15d
+#endif
+
     movsbl 0x1000, %eax
     movsbw 0x1000, %ax
     movswl 0x1000, %eax
@ -66,19 +114,47 @@ movl %ebx, %fs
     movzb 0x1000, %eax
     movzb 0x1000, %ax
                
+     mov $0x12345678,%eax
+
+#ifdef __x86_64__
+     movzb 0x1000, %rax
+     movzbq 0x1000, %rbx
+     movsbq 0x1000, %rdx
+     movzwq 0x1000, %rdi
+     movswq 0x1000, %rdx
+     movslq %eax, %rcx
+     mov $0x12345678,%rax
+     mov $0x12345678,%rdx
+     mov $0x12345678,%r10
+     mov $0x123456789abcdef0,%rax
+     mov $0x123456789abcdef0,%rcx
+     mov $0x123456789abcdef0,%r11
+#endif
        
+#ifdef __i386__
  pushl %eax
-  pushw %ax
  push %eax
  push %cs
+#else
+  pushq %rax
+  push %rax
+#endif
+  pushw %ax
  push %gs
  push $1
  push $100
+  push 0x42(%eax)
+  pop 0x43(%esi)
                                                
+#ifdef __i386__
  popl %eax
-  popw %ax
  pop %eax
  pop %ds
+#else
+  popq %rax
+  pop %rax
+#endif
+  popw %ax
  pop %fs
          
  xchg %eax, %ecx
@ -109,22 +185,61 @@ movl %ebx, %fs
  leal 0x1000(%ebx), %ecx
  lea 0x1000(%ebx), %ecx

+#ifdef __i386__
  les 0x2000, %eax
  lds 0x2000, %ebx
+  lss 0x2000, %edx
+#endif
  lfs 0x2000, %ecx
  lgs 0x2000, %edx
-  lss 0x2000, %edx

 addl $0x123, %eax
 add $0x123, %ebx
+add $-16, %ecx
+add $-0x123, %esi
+add $1, %bx
+add $1, %ebx
+add $-1, %bx
+add $-1, %ebx
+add $127, %bx
+addl $127, %ebx
+addl $-128, %ebx
+addl $-128, %ebx
+addl $-129, %ebx
+addl $128, %ebx
+addl $255, %ebx
+addl $256, %ebx
+andb $0xf, %ah
+andb $-15, %cl
+xorb $127, %dh
+cmpb $42, (%eax)
 addl $0x123, 0x100
 addl $0x123, 0x100(%ebx)
 addl $0x123, 0x100(%ebx,%edx,2)
 addl $0x123, 0x100(%esp)
+addl $0x123, (3*8)(%esp)
 addl $0x123, (%ebp)
 addl $0x123, (%esp)
 cmpl $0x123, (%esp)

+#ifdef __x86_64__
+xor %bl,%ah
+xor %bl,%r8b
+xor %r9b,%bl
+xor %sil,%cl
+add %eax,(%r8d)
+add %ebx,(%r9)
+add %edx,(%r10d,%r11d)
+add %ecx,(%r12,%r13)
+add %esi,(%r14,%r15,4)
+add %edi,0x1000(%rbx,%r12,8)
+add %r11,0x1000(%ebp,%r9d,8)
+movb $12, %ah
+movb $13, %bpl
+movb $14, %dil
+movb $15, %r12b
+#endif
+
 add %eax, (%ebx)
 add (%ebx), %eax
                
@ -176,6 +291,8 @@ add (%ebx), %dl
    div %bl
    div %ecx, %eax

+and $15,%bx
+and $-20,%edx

 shl %edx
 shl $10, %edx
@ -192,21 +309,53 @@ shrd %eax, %edx
 L4:
 call 0x1000
 call L4
+#ifdef __i386__
 call *%eax
+#else
+call *%rax
+#endif
 call *0x1000
 call func1

+.global L5,L6
+
+L5:
+L6:
+
+#ifdef __i386__
 lcall $0x100, $0x1000
+#else
+lcall *0x100
+lcall *(%rax)
+#endif

 jmp 0x1000
+jmp *(%edi)
+#ifdef __i386__
 jmp *%eax
+#else
+jmp *%rax
+#endif
 jmp *0x1000

+#ifdef __i386__
 ljmp $0x100, $0x1000
+#else
+ljmp *0x100
+ljmp *(%rdi)
+ljmpl *(%esi)
+ljmpw *(%esi)
+#endif

 ret
-
 ret $10
+#ifdef __i386__
+retl
+retl $10
+#else
+retq
+retq $10
+#endif

 lret

@ -234,14 +383,20 @@ L3:

        
 seto %al
+ setc %al
+ setcb %al
 setnp 0x1000
 setl 0xaaaa
 setg %dl

 fadd
 fadd %st(1), %st
+ fadd %st(0), %st(1)
 fadd %st(3)

+ fmul %st(0),%st(0)
+ fmul %st(0),%st(1)
+
 faddp %st(5)
 faddp
 faddp %st(1), %st
@ -387,6 +542,7 @@ L3:
    fwait

 bswap %edx
+bswapl %ecx
 xadd %ecx, %edx
 xaddb %dl, 0x1000
 xaddw %ax, 0x1000
@ -397,6 +553,10 @@ cmpxchgw %ax, 0x1000
 cmpxchgl %eax, 0x1000
 invlpg 0x1000
 cmpxchg8b 0x1002
+#ifdef __x86_64__
+cmpxchg16b (%rax)
+cmpxchg16b (%r10,%r11)
+#endif

 fcmovb %st(5), %st
 fcmove %st(5), %st
@ -416,32 +576,51 @@ fucomip %st(5), %st
 cmovo 0x1000, %eax
 cmovs 0x1000, %eax
 cmovns %edx, %edi
+ cmovne %ax, %si
+ cmovbw %ax, %di
+ cmovnbel %edx, %ecx
+#ifdef __x86_64__
+ bswapq %rsi
+ bswapq %r10
+ cmovz %rdi,%rbx
+ cmovpeq %rsi, %rdx
+#endif

 int $3
 int $0x10

+#ifdef __i386__
    pusha
    popa
-    clc
-    cld
+#endif
+    clc # another comment
+    cld # a comment with embedded ' tick
    cli
    clts
    cmc
    lahf
    sahf
+#ifdef __i386__
    pushfl
    popfl
+#else
+    pushfq
+    popfq
+#endif
    pushf
    popf
    stc
    std
    sti
+#ifdef __i386__
    aaa
    aas
    daa
    das
    aad
    aam
+    into
+#endif
    cbw
    cwd
    cwde
@ -452,7 +631,6 @@ int $0x10
    cltd
    leave
    int3
-    into
    iret
    rsm
    hlt
@ -470,6 +648,24 @@ int $0x10
    repz
    repne
    repnz
+    nop
+
+    lock ;negl (%eax)
+    wait ;pushf
+    rep  ;stosb
+    repe ;lodsb
+    repz ;cmpsb
+    repne;movsb
+    repnz;outsb
+
+    /* handle one-line prefix + ops */
+    lock  negl (%eax)
+    wait  pushf
+    rep   stosb
+    repe  lodsb
+    repz  cmpsb
+    repne movsb
+    repnz outsb
    
    invd
    wbinvd
@ -479,7 +675,34 @@ int $0x10
    rdmsr
    rdpmc
    ud2
+#ifdef __x86_64__
+    syscall
+    sysret
+    sysretq
+    lfence
+    mfence
+    sfence
+    prefetchnta 0x18(%rdx)
+    prefetcht0 (%rcx)
+    prefetcht1 (%rsi)
+    prefetcht2 (%rdi)
+    prefetchw (%rdi)
+    clflush 0x1000(%rax,%rcx)
+    fxsaveq (%rdx)
+    fxsaveq (%r11)
+    fxrstorq (%rcx)
+    fxrstorq (%r10)

+#endif
+
+    lar %ax,%dx
+    lar %eax,%dx
+    lar %ax,%edx
+    lar %eax,%edx
+#ifdef __x86_64__
+    lar %ax,%rdx
+    lar %eax,%rdx
+#endif
    emms
    movd %edx, %mm3
    movd 0x1000, %mm2
@ -524,35 +747,232 @@ int $0x10

        
        
+#ifdef __i386__
    boundl %edx, 0x10000
    boundw %bx, 0x1000
-    
+
    arpl %bx, 0x1000
+#endif
    lar 0x1000, %eax
    lgdt 0x1000
    lidt 0x1000
    lldt 0x1000
-    lmsw 0x1000
-    lsl 0x1000, %ecx
-    ltr 0x1000
-    
    sgdt 0x1000
    sidt 0x1000
    sldt 0x1000
+#ifdef __x86_64__
+    lgdtq 0x1000
+    lidtq 0x1000
+    sgdtq 0x1000
+    sidtq 0x1000
+
+    swapgs
+
+    str %rdx
+    str %r9
+#endif
+
+    lmsw 0x1000
+    lsl 0x1000, %ecx
+    ltr 0x1000
+    ltr %si
    smsw 0x1000
    str 0x1000
+    str %ecx
+    str %dx
    
    verr 0x1000
    verw 0x1000
  
+#ifdef __i386__
    push %ds
    pushw %ds
    pushl %ds
    pop %ds
    popw %ds
    popl %ds
+#endif
    fxsave 1(%ebx)
    fxrstor 1(%ecx)
+#ifdef __i386__
    pushl $1
+#else
+    pushq $1
+#endif
    pushw $1
    push $1
+
+#ifdef __ASSEMBLER__ // should be defined, for S files
+    inc %eax
+#endif
+
+#ifndef _WIN32
+ft1: ft2: ft3: ft4: ft5: ft6: ft7: ft8: ft9:
+    xor %eax, %eax
+    ret
+
+.type ft1,STT_FUNC
+.type ft2,@STT_FUNC
+.type ft3,%STT_FUNC
+.type ft4,"STT_FUNC"
+.type ft5,function
+.type ft6,@function
+.type ft7,%function
+.type ft8,"function"
+#endif
+
+    pause
+.rept 6
+    nop
+.endr
+.fill 4,1,0x90
+
+.section .text.one,"ax"
+nop
+.previous
+.pushsection .text.one,"ax"
+nop
+.pushsection .text.two,"ax"
+nop
+.popsection
+.popsection
+
+1: ud2
+.pushsection __bug_table,"a"
+.align 8
+2: .long 1b - 2b
+   .long 0x600000 - 2b
+   .long 1b + 42
+   .long 43 + 1b
+   .long 2b + 144
+   .long 145 + 2b
+   .word 164, 0
+   .org 2b+32
+#ifdef __x86_64__
+   .quad 1b
+#else
+   .long 1b
+#endif
+.popsection
+3: mov %eax,%ecx
+4:
+.pushsection .text.three, "ax"
+nop
+.skip (-((4b-3b) > 0) * 2) , 0x90
+.popsection
+
+.globl overrideme
+.weak overrideme
+  nop
+.globl notimplemented
+notimplemented:
+  ret
+.set overrideme, notimplemented
+overrideme = notimplemented
+overrideme:
+  ret
+
+    movd %esi, %mm1
+    movd %edi, %xmm2
+    movd (%ebx), %mm3
+    movd (%ebx), %xmm3
+    movd %mm1, %esi
+    movd %xmm2, %edi
+    movd %mm3, (%edx)
+    movd %xmm3, (%edx)
+#ifdef __x86_64__
+    movd %rsi, %mm1
+    movd %rdi, %xmm2
+    movd (%rbx), %mm3
+    movd (%rbx), %xmm3
+    movd %mm1, %r12
+    movd %xmm2, %rdi
+    movd %mm3, (%r8)
+    movd %xmm3, (%r13)
+#endif
+
+    movq (%ebp), %mm1
+    movq %mm2, (%edi)
+    movq (%edi), %xmm3
+    movq %mm4, %mm5
+#ifdef __x86_64__
+    movq %rcx, %mm1
+    movq %rdx, %xmm2
+    movq %r13, %xmm3
+    /* movq mem64->xmm is encoded as f30f7e by GAS, but as
+       660f6e by tcc (which really is a movd and would need 
+       a REX.W prefix to be movq).  */
+    movq (%rsi), %xmm3
+    movq %mm1, %rdx
+    movq %xmm3, %rcx
+    movq %xmm4, (%rsi)
+#endif
+
+#define TEST_MMX_SSE(insn) \
+    insn %mm1, %mm2; \
+    insn %xmm2, %xmm3; \
+    insn (%ebx), %xmm3;
+#define TEST_MMX_SSE_I8(insn) \
+    TEST_MMX_SSE(insn) \
+    insn $0x42, %mm4; \
+    insn $0x42, %xmm4;
+
+    TEST_MMX_SSE(packssdw)
+    TEST_MMX_SSE(packsswb)
+    TEST_MMX_SSE(packuswb)
+    TEST_MMX_SSE(paddb)
+    TEST_MMX_SSE(paddw)
+    TEST_MMX_SSE(paddd)
+    TEST_MMX_SSE(paddsb)
+    TEST_MMX_SSE(paddsw)
+    TEST_MMX_SSE(paddusb)
+    TEST_MMX_SSE(paddusw)
+    TEST_MMX_SSE(pand)
+    TEST_MMX_SSE(pandn)
+    TEST_MMX_SSE(pcmpeqb)
+    TEST_MMX_SSE(pcmpeqw)
+    TEST_MMX_SSE(pcmpeqd)
+    TEST_MMX_SSE(pcmpgtb)
+    TEST_MMX_SSE(pcmpgtw)
+    TEST_MMX_SSE(pcmpgtd)
+    TEST_MMX_SSE(pmaddwd)
+    TEST_MMX_SSE(pmulhw)
+    TEST_MMX_SSE(pmullw)
+    TEST_MMX_SSE(por)
+    TEST_MMX_SSE(psllw)
+TEST_MMX_SSE_I8(psllw)
+    TEST_MMX_SSE(pslld)
+TEST_MMX_SSE_I8(pslld)
+    TEST_MMX_SSE(psllq)
+TEST_MMX_SSE_I8(psllq)
+    TEST_MMX_SSE(psraw)
+TEST_MMX_SSE_I8(psraw)
+    TEST_MMX_SSE(psrad)
+TEST_MMX_SSE_I8(psrad)
+    TEST_MMX_SSE(psrlw)
+TEST_MMX_SSE_I8(psrlw)
+    TEST_MMX_SSE(psrld)
+TEST_MMX_SSE_I8(psrld)
+    TEST_MMX_SSE(psrlq)
+TEST_MMX_SSE_I8(psrlq)
+    TEST_MMX_SSE(psubb)
+    TEST_MMX_SSE(psubw)
+    TEST_MMX_SSE(psubd)
+    TEST_MMX_SSE(psubsb)
+    TEST_MMX_SSE(psubsw)
+    TEST_MMX_SSE(psubusb)
+    TEST_MMX_SSE(psubusw)
+    TEST_MMX_SSE(punpckhbw)
+    TEST_MMX_SSE(punpckhwd)
+    TEST_MMX_SSE(punpckhdq)
+    TEST_MMX_SSE(punpcklbw)
+    TEST_MMX_SSE(punpcklwd)
+    TEST_MMX_SSE(punpckldq)
+    TEST_MMX_SSE(pxor)
+
+    cvtpi2ps %mm1, %xmm2
+    cvtpi2ps (%ebx), %xmm2
+    TEST_MMX_SSE(pmaxsw)
+    TEST_MMX_SSE(pmaxub)
+    TEST_MMX_SSE(pminsw)
+    TEST_MMX_SSE(pminub)
--- a/tests/boundtest.c
+++ b/tests/boundtest.c
@ -1,5 +1,6 @@
 #include <stdlib.h>
 #include <stdio.h>
+#include <string.h>

 #define NB_ITS 1000000
 //#define NB_ITS 1
@ -49,12 +50,15 @@ int test4(void)
    int i, sum = 0;
    int *tab4;

+    fprintf(stderr, "%s start\n", __FUNCTION__);
+
    tab4 = malloc(20 * sizeof(int));
    for(i=0;i<20;i++) {
        sum += tab4[i];
    }
    free(tab4);

+    fprintf(stderr, "%s end\n", __FUNCTION__);
    return sum;
 }

@ -64,12 +68,15 @@ int test5(void)
    int i, sum = 0;
    int *tab4;

+    fprintf(stderr, "%s start\n", __FUNCTION__);
+
    tab4 = malloc(20 * sizeof(int));
    for(i=0;i<21;i++) {
        sum += tab4[i];
    }
    free(tab4);

+    fprintf(stderr, "%s end\n", __FUNCTION__);
    return sum;
 }

@ -186,8 +193,43 @@ int test15(void)
    return strlen(p);
 }

+/* ok: every alloca'd buffer is large enough for the string copied into it
+   (15 bytes into 16, and strlen(demo)+1 bytes for demo).  Returns 0. */
+int test16(void)
+{
+    char *demo = "This is only a test.";
+    char *p;
+
+    fprintf(stderr, "%s start\n", __FUNCTION__);
+
+    p = alloca(16);
+    strcpy(p,"12345678901234");
+    printf("alloca: p is %s\n", p);
+
+    /* Test alloca embedded in a larger expression */
+    printf("alloca: %s\n", strcpy(alloca(strlen(demo)+1),demo) );
+
+    fprintf(stderr, "%s end\n", __FUNCTION__);
+
+    /* the function is declared int, so do not fall off the end */
+    return 0;
+}
+
+/* error: same as test16, except that the second alloca buffer is one byte
+   too small, so strcpy writes the terminating NUL past its end.
+   Returns 0 if execution gets that far. */
+int test17(void)
+{
+    char *demo = "This is only a test.";
+    char *p;
+
+    fprintf(stderr, "%s start\n", __FUNCTION__);
+
+    p = alloca(16);
+    strcpy(p,"12345678901234");
+    printf("alloca: p is %s\n", p);
+
+    /* Test alloca embedded in a larger expression; strlen(demo) leaves no
+       room for the terminator -- this is the deliberate out-of-bounds write */
+    printf("alloca: %s\n", strcpy(alloca(strlen(demo)),demo) );
+
+    fprintf(stderr, "%s end\n", __FUNCTION__);
+
+    /* the function is declared int, so do not fall off the end */
+    return 0;
+}
+
 int (*table_test[])(void) = {
-    test1,
    test1,
    test2,
    test3,
@ -203,23 +245,33 @@ int (*table_test[])(void) = {
    test13,
    test14,
    test15,
+    test16,
+    test17,
 };

 int main(int argc, char **argv)
 {
    int index;
    int (*ftest)(void);
+    int index_max = sizeof(table_test)/sizeof(table_test[0]);

    if (argc < 2) {
-        printf("usage: boundtest n\n"
-               "test TCC bound checking system\n"
-               );
+        printf(
+    	    "test TCC bound checking system\n"
+	    "usage: boundtest N\n"
+            "  1 <= N <= %d\n", index_max);
        exit(1);
    }

    index = 0;
    if (argc >= 2)
-        index = atoi(argv[1]);
+        index = atoi(argv[1]) - 1;
+
+    if ((index < 0) || (index >= index_max)) {
+        printf("N is outside of the valid range (%d)\n", index);
+        exit(2);
+    }
+
    /* well, we also use bounds on this ! */
    ftest = table_test[index];
    ftest();
--- a/tests/gcctestsuite.sh
+++ b/tests/gcctestsuite.sh
@ -8,11 +8,11 @@ nb_failed="0"
 for src in $TESTSUITE_PATH/compile/*.c ; do
  echo $TCC -o /tmp/test.o -c $src 
  $TCC -o /tmp/test.o -c $src >> tcc.log 2>&1
-  if [ "$?" == "0" ] ; then
+  if [ "$?" = "0" ] ; then
     result="PASS"
  else
     result="FAIL"
-     nb_failed=$[ $nb_failed + 1 ]
+     nb_failed=$(( $nb_failed + 1 ))
  fi
  echo "$result: $src"  >> tcc.sum
 done
@ -20,11 +20,11 @@ done
 for src in $TESTSUITE_PATH/execute/*.c ; do
  echo $TCC $src 
  $TCC $src >> tcc.log 2>&1
-  if [ "$?" == "0" ] ; then
+  if [ "$?" = "0" ] ; then
     result="PASS"
  else
     result="FAIL"
-     nb_failed=$[ $nb_failed + 1 ]
+     nb_failed=$(( $nb_failed + 1 ))
  fi
  echo "$result: $src"  >> tcc.sum
 done
--- a/tests/libtcc_test.c
+++ b/tests/libtcc_test.c
@ -15,7 +15,16 @@ int add(int a, int b)
    return a + b;
 }

+/* this string is referenced by the generated code */
+const char hello[] = "Hello World!";
+
 char my_program[] =
+"#include <tcclib.h>\n" /* include the "Simple libc header for TCC" */
+"extern int add(int a, int b);\n"
+"#ifdef _WIN32\n" /* dynamically linked data needs 'dllimport' */
+" __attribute__((dllimport))\n"
+"#endif\n"
+"extern const char hello[];\n"
 "int fib(int n)\n"
 "{\n"
 "    if (n <= 2)\n"
@ -26,7 +35,7 @@ char my_program[] =
 "\n"
 "int foo(int n)\n"
 "{\n"
-"    printf(\"Hello World!\\n\");\n"
+"    printf(\"%s\\n\", hello);\n"
 "    printf(\"fib(%d) = %d\\n\", n, fib(n));\n"
 "    printf(\"add(%d, %d) = %d\\n\", n, 2 * n, add(n, 2 * n));\n"
 "    return 0;\n"
@ -35,9 +44,8 @@ char my_program[] =
 int main(int argc, char **argv)
 {
    TCCState *s;
+    int i;
    int (*func)(int);
-    void *mem;
-    int size;

    s = tcc_new();
    if (!s) {
@ -46,8 +54,17 @@ int main(int argc, char **argv)
    }

    /* if tcclib.h and libtcc1.a are not installed, where can we find them */
-    if (argc == 2 && !memcmp(argv[1], "lib_path=",9))
-        tcc_set_lib_path(s, argv[1]+9);
+    for (i = 1; i < argc; ++i) {
+        char *a = argv[i];
+        if (a[0] == '-') {
+            if (a[1] == 'B')
+                tcc_set_lib_path(s, a+2);
+            else if (a[1] == 'I')
+                tcc_add_include_path(s, a+2);
+            else if (a[1] == 'L')
+                tcc_add_library_path(s, a+2);
+        }
+    }

    /* MUST BE CALLED before any compilation */
    tcc_set_output_type(s, TCC_OUTPUT_MEMORY);
@ -55,30 +72,25 @@ int main(int argc, char **argv)
    if (tcc_compile_string(s, my_program) == -1)
        return 1;

-    /* as a test, we add a symbol that the compiled program can use.
+    /* as a test, we add symbols that the compiled program can use.
       You may also open a dll with tcc_add_dll() and use symbols from that */
    tcc_add_symbol(s, "add", add);
+    tcc_add_symbol(s, "hello", hello);

-    /* get needed size of the code */
-    size = tcc_relocate(s, NULL);
-    if (size == -1)
+    /* relocate the code */
+    if (tcc_relocate(s, TCC_RELOCATE_AUTO) < 0)
        return 1;

-    /* allocate memory and copy the code into it */
-    mem = malloc(size);
-    tcc_relocate(s, mem);
-
    /* get entry symbol */
    func = tcc_get_symbol(s, "foo");
    if (!func)
        return 1;

-    /* delete the state */
-    tcc_delete(s);
-
    /* run the code */
    func(32);

-    free(mem);
+    /* delete the state */
+    tcc_delete(s);
+
    return 0;
 }
--- a/tests/pp/01.c
+++ b/tests/pp/01.c
@ -0,0 +1,6 @@
+#define hash_hash # ## #
+#define mkstr(a) # a
+#define in_between(a) mkstr(a)
+#define join(c, d) in_between(c hash_hash d)
+char p[] = join(x, y);
+// char p[] = "x ## y";
--- a/tests/pp/01.expect
+++ b/tests/pp/01.expect
@ -0,0 +1 @@
+char p[] = "x ## y";
--- a/tests/pp/02.c
+++ b/tests/pp/02.c
@ -0,0 +1,28 @@
+#define x 3
+#define f(a) f(x * (a))
+#undef x
+#define x 2
+#define g f
+#define z z[0]
+#define h g(~
+#define m(a) a(w)
+#define w 0,1
+#define t(a) a
+#define p() int
+#define q(x) x
+#define r(x,y) x ## y
+#define str(x) # x
+f(y+1) + f(f(z)) % t(t(g)(0) + t)(1);
+g(x+(3,4)-w) | h 5) & m
+(f)^m(m);
+char c[2][6] = { str(hello), str() };
+/*
+ * f(2 * (y+1)) + f(2 * (f(2 * (z[0])))) % f(2 * (0)) + t(1);
+ * f(2 * (2+(3,4)-0,1)) | f(2 * (~ 5)) & f(2 * (0,1))^m(0,1);
+ * char c[2][6] = { "hello", "" };
+ */
+#define L21 f(y+1) + f(f(z)) % t(t(g)(0) + t)(1);
+#define L22 g(x+(3,4)-w) | h 5) & m\
+(f)^m(m);
+L21
+L22
--- a/tests/pp/02.expect
+++ b/tests/pp/02.expect
@ -0,0 +1,5 @@
+f(2 * (y+1)) + f(2 * (f(2 * (z[0])))) % f(2 * (0)) + t(1);
+f(2 * (2 +(3,4)-0,1)) | f(2 * (~ 5)) & f(2 * (0,1))^m(0,1);
+char c[2][6] = { "hello", "" };
+f(2 * (y+1)) + f(2 * (f(2 * (z[0])))) % f(2 * (0)) + t(1);
+f(2 * (2 +(3,4)-0,1)) | f(2 * (~ 5)) & f(2 * (0,1))^m(0,1);
--- a/tests/pp/03.c
+++ b/tests/pp/03.c
@ -0,0 +1,15 @@
+#define str(s) # s
+#define xstr(s) str(s)
+#define debug(s, t) printf("x" # s "= %d, x" # t "= %s", \
+	x ## s, x ## t)
+#define INCFILE(n) vers ## n
+#define glue(a, b) a ## b
+#define xglue(a, b) glue(a, b)
+#define HIGHLOW "hello"
+#define LOW LOW ", world"
+debug(1, 2);
+fputs(str(strncmp("abc\0d", "abc", '\4') // this goes away
+	== 0) str(: @\n), s);
+\#include xstr(INCFILE(2).h)
+glue(HIGH, LOW);
+xglue(HIGH, LOW)
--- a/tests/pp/03.expect
+++ b/tests/pp/03.expect
@ -0,0 +1,5 @@
+printf("x" "1" "= %d, x" "2" "= %s", x1, x2);
+fputs("strncmp(\"abc\\0d\", \"abc\", '\\4') == 0" ": @\n", s);
+\#include "vers2.h"
+"hello";
+"hello" ", world"
--- a/tests/pp/04.c
+++ b/tests/pp/04.c
@ -0,0 +1,4 @@
+#define foobar 1
+#define C(x,y) x##y
+#define D(x) (C(x,bar))
+D(foo)
--- a/tests/pp/04.expect
+++ b/tests/pp/04.expect
@ -0,0 +1 @@
+(1)
--- a/tests/pp/05.c
+++ b/tests/pp/05.c
@ -0,0 +1,7 @@
+#define t(x,y,z) x ## y ## z
+#define xxx(s) int s[] = { t(1,2,3), t(,4,5), t(6,,7), t(8,9,), \
+        t(10,,), t(,11,), t(,,12), t(,,) };
+
+int j[] = { t(1,2,3), t(,4,5), t(6,,7), t(8,9,),
+	t(10,,), t(,11,), t(,,12), t(,,) };
+xxx(j)
--- a/tests/pp/05.expect
+++ b/tests/pp/05.expect
@ -0,0 +1,3 @@
+int j[] = { 123, 45, 67, 89,
+ 10, 11, 12, };
+int j[] = { 123, 45, 67, 89, 10, 11, 12, };
--- a/tests/pp/06.c
+++ b/tests/pp/06.c
@ -0,0 +1,5 @@
+#define X(a,b, \
+	c,d) \
+	foo
+
+X(1,2,3,4)
--- a/Show More
+++ b/Show More
 @ -1 +1 @@
 .9.25
 .9.27