]> Cypherpunks repositories - gostls13.git/commitdiff
8g:
authorRuss Cox <rsc@golang.org>
Wed, 3 Jun 2009 06:25:17 +0000 (23:25 -0700)
committerRuss Cox <rsc@golang.org>
Wed, 3 Jun 2009 06:25:17 +0000 (23:25 -0700)
  * floating point -> integer conversions.
    x86 defines that overflow/underflow
    results in 1<<15, 1<<31, 1<<63 for
    int16, int32, int64.  when building the
    unsigned conversions out of the native signed
    ones, 8g turns overflow/underflow into zero.
    the spec does not say what should happen.

  * many tiny bug fixes.  can run a large number
    of files from go/test now, and can fmt.Printf.

  * struggling with byte register allocation
    and float32 computation.

R=ken
OCL=29642
CL=29811

src/cmd/8g/cgen.c
src/cmd/8g/gg.h
src/cmd/8g/gsubr.c

index 911e004dc2abecd052ef2fc52838a018d7cb3c91..75c15cd23a4d0105e144d4cdd0f7757a4f916b71 100644 (file)
@@ -7,58 +7,6 @@
 
 #include "gg.h"
 
-static int cancgen64(Node *n, Node *res);
-
-int
-is64(Type *t)
-{
-       if(t == T)
-               return 0;
-       switch(simtype[t->etype]) {
-       case TINT64:
-       case TUINT64:
-       case TPTR64:
-               return 1;
-       }
-       return 0;
-}
-
-int
-noconv(Type *t1, Type *t2)
-{
-       int e1, e2;
-
-       e1 = simtype[t1->etype];
-       e2 = simtype[t2->etype];
-
-       switch(e1) {
-       case TINT8:
-       case TUINT8:
-               return e2 == TINT8 || e2 == TUINT8;
-
-       case TINT16:
-       case TUINT16:
-               return e2 == TINT16 || e2 == TUINT16;
-
-       case TINT32:
-       case TUINT32:
-       case TPTR32:
-               return e2 == TINT32 || e2 == TUINT32 || e2 == TPTR32;
-
-       case TINT64:
-       case TUINT64:
-       case TPTR64:
-               return e2 == TINT64 || e2 == TUINT64 || e2 == TPTR64;
-
-       case TFLOAT32:
-               return e2 == TFLOAT32;
-
-       case TFLOAT64:
-               return e2 == TFLOAT64;
-       }
-       return 0;
-}
-
 /*
  * generate:
  *     res = n;
@@ -84,11 +32,16 @@ cgen(Node *n, Node *res)
        if(res == N || res->type == T)
                fatal("cgen: res nil");
 
+       // static initializations
+       if(initflag && gen_as_init(n, res))
+               return;
+
        // function calls on both sides?  introduce temporary
        if(n->ullman >= UINF && res->ullman >= UINF) {
-               tempname(&n1, n->type);
+               tempalloc(&n1, n->type);
                cgen(n, &n1);
                cgen(&n1, res);
+               tempfree(&n1);
                return;
        }
 
@@ -125,10 +78,6 @@ cgen(Node *n, Node *res)
        // otherwise, the result is addressable but n is not.
        // let's do some computation.
 
-       // 64-bit ops are hard on 32-bit machine.
-       if(is64(n->type) && cancgen64(n, res))
-               return;
-
        // use ullman to pick operand to eval first.
        nl = n->left;
        nr = n->right;
@@ -144,6 +93,25 @@ cgen(Node *n, Node *res)
                return;
        }
 
+       // 64-bit ops are hard on 32-bit machine.
+       if(is64(n->type) || is64(res->type) || n->left != N && is64(n->left->type)) {
+               switch(n->op) {
+               // math goes to cgen64.
+               case OMINUS:
+               case OCOM:
+               case OADD:
+               case OSUB:
+               case OMUL:
+               case OLSH:
+               case ORSH:
+               case OAND:
+               case OOR:
+               case OXOR:
+                       cgen64(n, res);
+                       return;
+               }
+       }
+
        if(isfloat[n->type->etype] && isfloat[nl->type->etype])
                goto flt;
 
@@ -178,6 +146,7 @@ cgen(Node *n, Node *res)
                return;
 
        case OMINUS:
+       case OCOM:
                a = optoas(n->op, nl->type);
                goto uop;
 
@@ -218,8 +187,8 @@ cgen(Node *n, Node *res)
                break;
 
        case OLEN:
-               if(istype(nl->type, TSTRING) || istype(nl->type, TMAP)) {
-                       // both string and map have len in the first 32-bit word.
+               if(istype(nl->type, TMAP)) {
+                       // map has len in the first 32-bit word.
                        // a zero pointer means zero length
                        tempalloc(&n1, types[tptr]);
                        cgen(nl, &n1);
@@ -243,7 +212,9 @@ cgen(Node *n, Node *res)
                        regfree(&n1);
                        break;
                }
-               if(isslice(nl->type)) {
+               if(istype(nl->type, TSTRING) || isslice(nl->type)) {
+                       // both slice and string have len one pointer into the struct.
+                       // a zero pointer means zero length
                        igen(nl, &n1, res);
                        n1.op = OINDREG;
                        n1.type = types[TUINT32];
@@ -289,10 +260,6 @@ cgen(Node *n, Node *res)
 
        case OMOD:
        case ODIV:
-               if(isfloat[n->type->etype]) {
-                       a = optoas(n->op, nl->type);
-                       goto abop;
-               }
                cgen_div(n->op, nl, nr, res);
                break;
 
@@ -340,8 +307,19 @@ uop:       // unary
        return;
 
 flt:   // floating-point.  387 (not SSE2) to interoperate with 6c
-       nodreg(&f0, n->type, D_F0);
+       nodreg(&f0, nl->type, D_F0);
        nodreg(&f1, n->type, D_F0+1);
+       if(nr != N)
+               goto flt2;
+
+       // unary
+       cgen(nl, &f0);
+       if(n->op != OCONV)
+               gins(foptoas(n->op, n->type, 0), &f0, &f0);
+       gmove(&f0, res);
+       return;
+
+flt2:  // binary
        if(nl->ullman >= nr->ullman) {
                cgen(nl, &f0);
                if(nr->addable)
@@ -402,7 +380,7 @@ agen(Node *n, Node *res)
                fatal("agen %O", n->op);
 
        case OCONV:
-               if(!eqtype(n->type, nl->type))
+               if(!cvttype(n->type, nl->type))
                        fatal("agen: non-trivial OCONV");
                agen(nl, res);
                break;
@@ -427,8 +405,11 @@ agen(Node *n, Node *res)
                if(nr->addable) {
                        agenr(nl, &n3, res);
                        if(!isconst(nr, CTINT)) {
+                               tempalloc(&tmp, nr->type);
+                               cgen(nr, &tmp);
                                regalloc(&n1, nr->type, N);
-                               cgen(nr, &n1);
+                               gmove(&tmp, &n1);
+                               tempfree(&tmp);
                        }
                } else if(nl->addable) {
                        if(!isconst(nr, CTINT)) {
@@ -640,7 +621,7 @@ bgen(Node *n, int true, Prog *to)
 {
        int et, a;
        Node *nl, *nr, *r;
-       Node n1, n2, tmp;
+       Node n1, n2, tmp, t1, t2, ax;
        Prog *p1, *p2;
 
        if(debug['g']) {
@@ -778,6 +759,37 @@ bgen(Node *n, int true, Prog *to)
                        break;
                }
 
+               if(isfloat[nr->type->etype]) {
+                       nodreg(&tmp, nr->type, D_F0);
+                       nodreg(&n2, nr->type, D_F0 + 1);
+                       nodreg(&ax, types[TUINT16], D_AX);
+                       et = simsimtype(nr->type);
+                       if(et == TFLOAT64) {
+                               // easy - do in FPU
+                               cgen(nr, &tmp);
+                               cgen(nl, &tmp);
+                               gins(AFUCOMPP, &tmp, &n2);
+                       } else {
+                               // NOTE(rsc): This is wrong.
+                               // It's right for comparison but presumably all the
+                               // other ops have the same problem.  We need to
+                               // figure out what the right solution is, besides
+                               // tell people to use float64.
+                               tempalloc(&t1, types[TFLOAT32]);
+                               tempalloc(&t2, types[TFLOAT32]);
+                               cgen(nr, &t1);
+                               cgen(nl, &t2);
+                               gmove(&t1, &tmp);
+                               gins(AFCOMFP, &t1, &tmp);
+                               tempfree(&t2);
+                               tempfree(&t1);
+                       }
+                       gins(AFSTSW, N, &ax);
+                       gins(ASAHF, N, N);
+                       patch(gbranch(optoas(brrev(a), nr->type), T), to);
+                       break;
+               }
+
                if(is64(nr->type)) {
                        if(!nl->addable) {
                                tempalloc(&n1, nl->type);
@@ -800,45 +812,43 @@ bgen(Node *n, int true, Prog *to)
                a = optoas(a, nr->type);
 
                if(nr->ullman >= UINF) {
-                       regalloc(&n1, nr->type, N);
-                       cgen(nr, &n1);
-
-                       tempname(&tmp, nr->type);
-                       gmove(&n1, &tmp);
-                       regfree(&n1);
+                       tempalloc(&tmp, nr->type);
+                       cgen(nr, &tmp);
 
-                       regalloc(&n1, nl->type, N);
+                       tempalloc(&n1, nl->type);
                        cgen(nl, &n1);
 
-                       regalloc(&n2, nr->type, &n2);
+                       regalloc(&n2, nr->type, N);
                        cgen(&tmp, &n2);
 
                        gins(optoas(OCMP, nr->type), &n1, &n2);
                        patch(gbranch(a, nr->type), to);
-
-                       regfree(&n1);
+                       tempfree(&n1);
+                       tempfree(&tmp);
                        regfree(&n2);
                        break;
                }
 
-               regalloc(&n1, nl->type, N);
+               tempalloc(&n1, nl->type);
                cgen(nl, &n1);
 
                if(smallintconst(nr)) {
                        gins(optoas(OCMP, nr->type), &n1, nr);
                        patch(gbranch(a, nr->type), to);
-                       regfree(&n1);
+                       tempfree(&n1);
                        break;
                }
 
+               tempalloc(&tmp, nr->type);
+               cgen(nr, &tmp);
                regalloc(&n2, nr->type, N);
-               cgen(nr, &n2);
+               gmove(&tmp, &n2);
+               tempfree(&tmp);
 
                gins(optoas(OCMP, nr->type), &n1, &n2);
                patch(gbranch(a, nr->type), to);
-
-               regfree(&n1);
                regfree(&n2);
+               tempfree(&n1);
                break;
        }
 }
@@ -883,7 +893,7 @@ stkof(Node *n)
 void
 sgen(Node *n, Node *res, int w)
 {
-       Node nodl, nodr;
+       Node dst, src, tdst, tsrc;
        int32 c, q, odst, osrc;
 
        if(debug['g']) {
@@ -904,22 +914,29 @@ sgen(Node *n, Node *res, int w)
        osrc = stkof(n);
        odst = stkof(res);
 
-       // TODO(rsc): Should these be tempalloc instead?
-       nodreg(&nodl, types[tptr], D_DI);
-       nodreg(&nodr, types[tptr], D_SI);
-
-       if(n->ullman >= res->ullman) {
-               agen(n, &nodr);
-               agen(res, &nodl);
-       } else {
-               agen(res, &nodl);
-               agen(n, &nodr);
-       }
+       nodreg(&dst, types[tptr], D_DI);
+       nodreg(&src, types[tptr], D_SI);
+
+       tempalloc(&tsrc, types[tptr]);
+       tempalloc(&tdst, types[tptr]);
+       if(!n->addable)
+               agen(n, &tsrc);
+       if(!res->addable)
+               agen(res, &tdst);
+       if(n->addable)
+               agen(n, &src);
+       else
+               gmove(&tsrc, &src);
+       if(res->addable)
+               agen(res, &dst);
+       else
+               gmove(&tdst, &dst);
+       tempfree(&tdst);
+       tempfree(&tsrc);
 
        c = w % 4;      // bytes
        q = w / 4;      // doublewords
 
-       gins(ACLD, N, N);
        // if we are copying forward on the stack and
        // the src and dst overlap, then reverse direction
        if(osrc < odst && odst < osrc+w) {
@@ -949,6 +966,7 @@ sgen(Node *n, Node *res, int w)
                // we leave with the flag clear
                gins(ACLD, N, N);
        } else {
+               gins(ACLD, N, N);       // paranoia.  TODO(rsc): remove?
                // normal direction
                if(q >= 4) {
                        gconreg(AMOVL, q, D_CX);
@@ -966,34 +984,13 @@ sgen(Node *n, Node *res, int w)
        }
 }
 
-void
-nswap(Node *a, Node *b)
-{
-       Node t;
-
-       t = *a;
-       *a = *b;
-       *b = t;
-}
-
-Node*
-ncon(uint32 i)
-{
-       static Node n;
-
-       if(n.type == T)
-               nodconst(&n, types[TUINT32], 0);
-       mpmovecfix(n.val.u.xval, i);
-       return &n;
-}
-
 /*
  * attempt to generate 64-bit
  *     res = n
  * return 1 on success, 0 if op not handled.
  */
-static int
-cancgen64(Node *n, Node *res)
+void
+cgen64(Node *n, Node *res)
 {
        Node t1, t2, ax, dx, cx, ex, fx, *l, *r;
        Node lo1, lo2, lo3, hi1, hi2, hi3;
@@ -1001,8 +998,6 @@ cancgen64(Node *n, Node *res)
        uint64 v;
        uint32 lv, hv;
 
-       if(n->op == OCALL)
-               return 0;
        if(res->op != OINDREG && res->op != ONAME) {
                dump("n", n);
                dump("res", res);
@@ -1010,12 +1005,7 @@ cancgen64(Node *n, Node *res)
        }
        switch(n->op) {
        default:
-               return 0;
-
-       case ONAME:
-       case ODOT:
-               gmove(n, res);
-               return 1;
+               fatal("cgen64 %O", n->op);
 
        case OMINUS:
                cgen(n->left, res);
@@ -1024,7 +1014,7 @@ cancgen64(Node *n, Node *res)
                gins(AADCL, ncon(0), &hi1);
                gins(ANEGL, N, &hi1);
                splitclean();
-               return 1;
+               return;
 
        case OCOM:
                cgen(n->left, res);
@@ -1032,7 +1022,7 @@ cancgen64(Node *n, Node *res)
                gins(ANOTL, N, &lo1);
                gins(ANOTL, N, &hi1);
                splitclean();
-               return 1;
+               return;
 
        case OADD:
        case OSUB:
@@ -1408,7 +1398,6 @@ out:
                tempfree(&t2);
        if(l == &t1)
                tempfree(&t1);
-       return 1;
 }
 
 /*
index ee9140b047443eca89ab1420d3f6af55fa7a2415..03f7aac6f691f63c79873acb10cdf05a671e3fce 100644 (file)
@@ -2,7 +2,6 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-
 #include <u.h>
 #include <libc.h>
 
@@ -68,7 +67,7 @@ EXTERN        Node*   throwreturn;
 EXTERN int     maxstksize;
 
 /*
- * gen.c
+ * ggen.c
  */
 void   compile(Node*);
 void   proglist(void);
@@ -90,7 +89,7 @@ void  checklabels();
 void   ginscall(Node*, int);
 
 /*
- * cgen
+ * cgen.c
  */
 void   agen(Node*, Node*);
 void   agenr(Node *n, Node *a, Node *res);
@@ -103,10 +102,14 @@ Prog*     gins(int, Node*, Node*);
 int    samaddr(Node*, Node*);
 void   naddr(Node*, Addr*);
 void   cgen_aret(Node*, Node*);
-int    is64(Type*);
-void   cmp64(Node*, Node*, int, Prog*);
 Node*  ncon(uint32);
 
+/*
+ * cgen64.c
+ */
+void   cmp64(Node*, Node*, int, Prog*);
+void   cgen64(Node*, Node*);
+
 /*
  * gsubr.c
  */
@@ -133,9 +136,10 @@ void       tempfree(Node*);
 Node*  nodarg(Type*, int);
 void   nodreg(Node*, Type*, int);
 void   nodindreg(Node*, Type*, int);
-void   nodconst(Node*, Type*, vlong);
+void   nodconst(Node*, Type*, int64);
 void   gconreg(int, vlong, int);
 void   datagostring(Strlit*, Addr*);
+void   datastring(char*, int, Addr*);
 void   buildtxt(void);
 Plist* newplist(void);
 int    isfat(Type*);
@@ -145,6 +149,7 @@ int dotaddable(Node*, Node*);
 void   afunclit(Addr*);
 void   split64(Node*, Node*, Node*);
 void   splitclean(void);
+void   nswap(Node*, Node*);
 
 /*
  * list.c
index 1d9e9967f6a2e264904cb6bee87621f328258691..4f30c606b9896da7cb64898b403581ce0efd2d91 100755 (executable)
@@ -407,6 +407,22 @@ optoas(int op, Type *t)
                a = ADECL;
                break;
 
+       case CASE(OCOM, TINT8):
+       case CASE(OCOM, TUINT8):
+               a = ANOTB;
+               break;
+
+       case CASE(OCOM, TINT16):
+       case CASE(OCOM, TUINT16):
+               a = ANOTW;
+               break;
+
+       case CASE(OCOM, TINT32):
+       case CASE(OCOM, TUINT32):
+       case CASE(OCOM, TPTR32):
+               a = ANOTL;
+               break;
+
        case CASE(OMINUS, TINT8):
        case CASE(OMINUS, TUINT8):
                a = ANEGB;
@@ -560,6 +576,10 @@ optoas(int op, Type *t)
                a = ADIVL;
                break;
 
+       case CASE(OEXTEND, TINT8):
+               a = ACBW;
+               break;
+
        case CASE(OEXTEND, TINT16):
                a = ACWD;
                break;
@@ -577,7 +597,13 @@ foptoas(int op, Type *t, int flg)
 {
        int et;
 
-       et = t->etype;
+       et = simtype[t->etype];
+
+       // If we need Fpop, it means we're working on
+       // two different floating-point registers, not memory.
+       // There the instruction only has a float64 form.
+       if(flg & Fpop)
+               et = TFLOAT64;
 
        // clear Frev if unneeded
        switch(op) {
@@ -655,6 +681,9 @@ static      int     resvd[] =
        D_CX,   // for shift
        D_DX,   // for divide
        D_SP,   // for stack
+
+       D_BL,   // because D_BX can be allocated
+       D_BH,
 };
 
 void
@@ -664,7 +693,7 @@ ginit(void)
 
        for(i=0; i<nelem(reg); i++)
                reg[i] = 1;
-       for(i=D_AX; i<=D_DI; i++)
+       for(i=D_AL; i<=D_DI; i++)
                reg[i] = 0;
 
        // TODO: Use MMX ?
@@ -685,7 +714,7 @@ gclean(void)
        for(i=0; i<nelem(resvd); i++)
                reg[resvd[i]]--;
 
-       for(i=D_AX; i<=D_DI; i++)
+       for(i=D_AL; i<=D_DI; i++)
                if(reg[i])
                        yyerror("reg %R left allocated at %lux\n", i, regpc[i]);
        for(i=D_F0; i<=D_F7; i++)
@@ -701,7 +730,7 @@ gclean(void)
 void
 regalloc(Node *n, Type *t, Node *o)
 {
-       int i, et;
+       int i, et, min, max;
 
        if(t == T)
                fatal("regalloc: t nil");
@@ -710,6 +739,13 @@ regalloc(Node *n, Type *t, Node *o)
        switch(et) {
        case TINT8:
        case TUINT8:
+               // This is going to come back to bite us;
+               // we're not tracking tiny registers vs big ones.
+               // The hope is that because we use temporaries
+               // everywhere instead of registers, this will be okay.
+               min = D_AL;
+               max = D_BH;
+               goto try;
        case TINT16:
        case TUINT16:
        case TINT32:
@@ -719,17 +755,20 @@ regalloc(Node *n, Type *t, Node *o)
        case TPTR32:
        case TPTR64:
        case TBOOL:
+               min = D_AX;
+               max = D_DI;
+       try:
                if(o != N && o->op == OREGISTER) {
                        i = o->val.u.reg;
-                       if(i >= D_AX && i <= D_DI)
+                       if(i >= D_AX && i <= max)
                                goto out;
                }
-               for(i=D_AX; i<=D_DI; i++)
+               for(i=min; i<=max; i++)
                        if(reg[i] == 0)
                                goto out;
 
                fprint(2, "registers allocated at\n");
-               for(i=D_AX; i<=D_DI; i++)
+               for(i=min; i<=max; i++)
                        fprint(2, "\t%R\t%#lux\n", i, regpc[i]);
                yyerror("out of fixed registers");
                goto err;
@@ -805,6 +844,7 @@ tempalloc(Node *n, Type *t)
        stksize += w;
        stksize = rnd(stksize, w);
        n->xoffset = -stksize;
+//print("tempalloc %d -> %d from %p\n", n->ostk, n->xoffset, __builtin_return_address(0));
        if(stksize > maxstksize)
                maxstksize = stksize;
 }
@@ -812,6 +852,7 @@ tempalloc(Node *n, Type *t)
 void
 tempfree(Node *n)
 {
+//print("tempfree %d\n", n->xoffset);
        if(n->xoffset != -stksize)
                fatal("tempfree %lld %d", -n->xoffset, stksize);
        stksize = n->ostk;
@@ -912,6 +953,33 @@ gconreg(int as, vlong c, int reg)
        gins(as, &n1, &n2);
 }
 
+/*
+ * swap node contents
+ */
+void
+nswap(Node *a, Node *b)
+{
+       Node t;
+
+       t = *a;
+       *a = *b;
+       *b = t;
+}
+
+/*
+ * return constant i node.
+ * overwritten by next call, but useful in calls to gins.
+ */
+Node*
+ncon(uint32 i)
+{
+       static Node n;
+
+       if(n.type == T)
+               nodconst(&n, types[TUINT32], 0);
+       mpmovecfix(n.val.u.xval, i);
+       return &n;
+}
 
 /*
  * Is this node a memory operand?
@@ -954,9 +1022,17 @@ split64(Node *n, Node *lo, Node *hi)
                        sclean[nsclean-1] = n1;
                }
                n = &n1;
-               // fall through
+               goto common;
        case ONAME:
+               if(n->class == PPARAMREF) {
+                       cgen(n->heapaddr, &n1);
+                       sclean[nsclean-1] = n1;
+                       // fall through.
+                       n = &n1;
+               }
+               goto common;
        case OINDREG:
+       common:
                *lo = *n;
                *hi = *n;
                lo->type = types[TUINT32];
@@ -990,12 +1066,44 @@ splitclean(void)
                regfree(&sclean[nsclean]);
 }
 
+/*
+ * set up nodes representing fp constants
+ */
+Node zerof;
+Node two64f;
+Node two63f;
+
+void
+bignodes(void)
+{
+       static int did;
+
+       if(did)
+               return;
+       did = 1;
+
+       two64f = *ncon(0);
+       two64f.type = types[TFLOAT64];
+       two64f.val.ctype = CTFLT;
+       two64f.val.u.fval = mal(sizeof *two64f.val.u.fval);
+       mpmovecflt(two64f.val.u.fval, 18446744073709551616.);
+
+       two63f = two64f;
+       two63f.val.u.fval = mal(sizeof *two63f.val.u.fval);
+       mpmovecflt(two63f.val.u.fval, 9223372036854775808.);
+
+       zerof = two64f;
+       zerof.val.u.fval = mal(sizeof *zerof.val.u.fval);
+       mpmovecflt(zerof.val.u.fval, 0);
+}
+
 void
 gmove(Node *f, Node *t)
 {
        int a, ft, tt;
        Type *cvt;
-       Node r1, r2, flo, fhi, tlo, thi, con;
+       Node r1, r2, t1, t2, flo, fhi, tlo, thi, con, f0, f1, ax, dx, cx;
+       Prog *p1, *p2, *p3;
 
        if(debug['M'])
                print("gmove %N -> %N\n", f, t);
@@ -1004,16 +1112,19 @@ gmove(Node *f, Node *t)
        tt = simsimtype(t->type);
        cvt = t->type;
 
-       // cannot have two memory operands;
+       // cannot have two integer memory operands;
        // except 64-bit, which always copies via registers anyway.
-       if(ismem(f) && ismem(t) && !is64(f->type) && !is64(t->type))
+       if(isint[ft] && isint[tt] && !is64(f->type) && !is64(t->type) && ismem(f) && ismem(t))
                goto hard;
 
        // convert constant to desired type
        if(f->op == OLITERAL) {
-               convconst(&con, t->type, &f->val);
+               if(tt == TFLOAT32)
+                       convconst(&con, types[TFLOAT64], &f->val);
+               else
+                       convconst(&con, t->type, &f->val);
                f = &con;
-               ft = tt;        // so big switch will choose a simple mov
+               ft = simsimtype(con.type);
 
                // some constants can't move directly to memory.
                if(ismem(t)) {
@@ -1032,7 +1143,7 @@ gmove(Node *f, Node *t)
 
        switch(CASE(ft, tt)) {
        default:
-               fatal("gmove %N -> %N", f, t);
+               goto fatal;
 
        /*
         * integer copy and truncate
@@ -1057,10 +1168,9 @@ gmove(Node *f, Node *t)
        case CASE(TINT64, TUINT8):
        case CASE(TUINT64, TUINT8):
                split64(f, &flo, &fhi);
-               regalloc(&r1, t->type, t);
+               nodreg(&r1, t->type, D_AX);
                gins(AMOVB, &flo, &r1);
                gins(AMOVB, &r1, t);
-               regfree(&r1);
                splitclean();
                return;
 
@@ -1080,10 +1190,9 @@ gmove(Node *f, Node *t)
        case CASE(TINT64, TUINT16):
        case CASE(TUINT64, TUINT16):
                split64(f, &flo, &fhi);
-               regalloc(&r1, t->type, t);
+               nodreg(&r1, t->type, D_AX);
                gins(AMOVW, &flo, &r1);
                gins(AMOVW, &r1, t);
-               regfree(&r1);
                splitclean();
                return;
 
@@ -1099,10 +1208,9 @@ gmove(Node *f, Node *t)
        case CASE(TINT64, TUINT32):
        case CASE(TUINT64, TUINT32):
                split64(f, &flo, &fhi);
-               regalloc(&r1, t->type, t);
+               nodreg(&r1, t->type, D_AX);
                gins(AMOVL, &flo, &r1);
                gins(AMOVL, &r1, t);
-               regfree(&r1);
                splitclean();
                return;
 
@@ -1116,14 +1224,12 @@ gmove(Node *f, Node *t)
                        gins(AMOVL, &flo, &tlo);
                        gins(AMOVL, &fhi, &thi);
                } else {
-                       regalloc(&r1, types[TUINT32], N);
-                       regalloc(&r2, types[TUINT32], N);
+                       nodreg(&r1, t->type, D_AX);
+                       nodreg(&r2, t->type, D_DX);
                        gins(AMOVL, &flo, &r1);
                        gins(AMOVL, &fhi, &r2);
                        gins(AMOVL, &r1, &tlo);
                        gins(AMOVL, &r2, &thi);
-                       regfree(&r2);
-                       regfree(&r1);
                }
                splitclean();
                splitclean();
@@ -1198,23 +1304,36 @@ gmove(Node *f, Node *t)
 
        /*
        * float to integer
-       *
+       */
        case CASE(TFLOAT32, TINT16):
        case CASE(TFLOAT32, TINT32):
        case CASE(TFLOAT32, TINT64):
        case CASE(TFLOAT64, TINT16):
        case CASE(TFLOAT64, TINT32):
        case CASE(TFLOAT64, TINT64):
+               if(t->op == OREGISTER)
+                       goto hardmem;
+               nodreg(&r1, types[ft], D_F0);
                if(ft == TFLOAT32)
-                       gins(AFMOVF, f, &f0);
+                       gins(AFMOVF, f, &r1);
                else
-                       gins(AFMOVD, f, &f0);
+                       gins(AFMOVD, f, &r1);
+
+               // set round to zero mode during conversion
+               tempalloc(&t1, types[TUINT16]);
+               tempalloc(&t2, types[TUINT16]);
+               gins(AFSTCW, N, &t1);
+               gins(AMOVW, ncon(0xf7f), &t2);
+               gins(AFLDCW, &t2, N);
                if(tt == TINT16)
-                       gins(AFMOVWP, &f0, t);
+                       gins(AFMOVWP, &r1, t);
                else if(tt == TINT32)
-                       gins(AFMOVLP, &f0, t);
+                       gins(AFMOVLP, &r1, t);
                else
-                       gins(AFMOVVP, &f0, t);
+                       gins(AFMOVVP, &r1, t);
+               gins(AFLDCW, &t1, N);
+               tempfree(&t2);
+               tempfree(&t1);
                return;
 
        case CASE(TFLOAT32, TINT8):
@@ -1224,139 +1343,249 @@ gmove(Node *f, Node *t)
        case CASE(TFLOAT64, TUINT16):
        case CASE(TFLOAT64, TUINT8):
                // convert via int32.
-               cvt = types[TINT32];
-               goto hard;
+               tempalloc(&t1, types[TINT32]);
+               gmove(f, &t1);
+               switch(tt) {
+               default:
+                       fatal("gmove %T", t);
+               case TINT8:
+                       gins(ACMPL, &t1, ncon(-0x80));
+                       p1 = gbranch(optoas(OLT, types[TINT32]), T);
+                       gins(ACMPL, &t1, ncon(0x7f));
+                       p2 = gbranch(optoas(OGT, types[TINT32]), T);
+                       p3 = gbranch(AJMP, T);
+                       patch(p1, pc);
+                       patch(p2, pc);
+                       gmove(ncon(-0x80), &t1);
+                       patch(p3, pc);
+                       gmove(&t1, t);
+                       break;
+               case TUINT8:
+                       gins(ATESTL, ncon(0xffffff00), &t1);
+                       p1 = gbranch(AJEQ, T);
+                       gins(AMOVB, ncon(0), &t1);
+                       patch(p1, pc);
+                       gmove(&t1, t);
+                       break;
+               case TUINT16:
+                       gins(ATESTL, ncon(0xffff0000), &t1);
+                       p1 = gbranch(AJEQ, T);
+                       gins(AMOVW, ncon(0), &t1);
+                       patch(p1, pc);
+                       gmove(&t1, t);
+                       break;
+               }
+               tempfree(&t1);
+               return;
 
        case CASE(TFLOAT32, TUINT32):
        case CASE(TFLOAT64, TUINT32):
-               // could potentially convert via int64.
-               cvt = types[TINT64];
-               goto hard;
+               // convert via int64.
+               tempalloc(&t1, types[TINT64]);
+               gmove(f, &t1);
+               split64(&t1, &tlo, &thi);
+               gins(ACMPL, &thi, ncon(0));
+               p1 = gbranch(AJEQ, T);
+               gins(AMOVL, ncon(0), &tlo);
+               patch(p1, pc);
+               gmove(&tlo, t);
+               splitclean();
+               tempfree(&t1);
+               return;
 
        case CASE(TFLOAT32, TUINT64):
        case CASE(TFLOAT64, TUINT64):
+               bignodes();
+               nodreg(&f0, types[ft], D_F0);
+               nodreg(&f1, types[ft], D_F0 + 1);
+               nodreg(&ax, types[TUINT16], D_AX);
+
                if(ft == TFLOAT32)
                        gins(AFMOVF, f, &f0);
                else
                        gins(AFMOVD, f, &f0);
-               // algorithm is:
+
+               // if 0 > v { answer = 0 }
+               gmove(&zerof, &f0);
+               gins(AFUCOMP, &f0, &f1);
+               gins(AFSTSW, N, &ax);
+               gins(ASAHF, N, N);
+               p1 = gbranch(optoas(OGT, types[tt]), T);
+               // if 1<<64 <= v { answer = 0 too }
+               gmove(&two64f, &f0);
+               gins(AFUCOMP, &f0, &f1);
+               gins(AFSTSW, N, &ax);
+               gins(ASAHF, N, N);
+               p2 = gbranch(optoas(OGT, types[tt]), T);
+               patch(p1, pc);
+               gins(AFMOVVP, &f0, t);  // don't care about t, but will pop the stack
+               split64(t, &tlo, &thi);
+               gins(AMOVL, ncon(0), &tlo);
+               gins(AMOVL, ncon(0), &thi);
+               splitclean();
+               p1 = gbranch(AJMP, T);
+               patch(p2, pc);
+
+               // in range; algorithm is:
                //      if small enough, use native float64 -> int64 conversion.
                //      otherwise, subtract 2^63, convert, and add it back.
-               bignodes();
-               regalloc(&r1, types[ft], N);
-               regalloc(&r2, types[ft], N);
-               gins(optoas(OCMP, f->type), &bigf, &r1);
-               p1 = gbranch(optoas(OLE, f->type), T);
-               gins(a, &r1, &r2);
-               p2 = gbranch(AJMP, T);
-               patch(p1, pc);
-               gins(optoas(OAS, f->type), &bigf, &r3);
-               gins(optoas(OSUB, f->type), &r3, &r1);
-               gins(a, &r1, &r2);
-               gins(AMOVQ, &bigi, &r4);
-               gins(AXORQ, &r4, &r2);
+
+               // set round to zero mode during conversion
+               tempalloc(&t1, types[TUINT16]);
+               tempalloc(&t2, types[TUINT16]);
+               gins(AFSTCW, N, &t1);
+               gins(AMOVW, ncon(0xf7f), &t2);
+               gins(AFLDCW, &t2, N);
+               tempfree(&t2);
+
+               // actual work
+               gmove(&two63f, &f0);
+               gins(AFUCOMP, &f0, &f1);
+               gins(AFSTSW, N, &ax);
+               gins(ASAHF, N, N);
+               p2 = gbranch(optoas(OLE, types[tt]), T);
+               gins(AFMOVVP, &f0, t);
+               p3 = gbranch(AJMP, T);
                patch(p2, pc);
-               gmove(&r2, t);
-               regfree(&r4);
-               regfree(&r3);
-               regfree(&r2);
-               regfree(&r1);
-               fatal("lazy");
+               gmove(&two63f, &f0);
+               gins(AFSUBDP, &f0, &f1);
+               gins(AFMOVVP, &f0, t);
+               split64(t, &tlo, &thi);
+               gins(AXORL, ncon(0x80000000), &thi);    // + 2^63
+               patch(p3, pc);
+               patch(p1, pc);
+               splitclean();
+
+               // restore rounding mode
+               gins(AFLDCW, &t1, N);
+               tempfree(&t1);
                return;
-       */
+
        /*
         * integer to float
-        *
+        */
+       case CASE(TINT16, TFLOAT32):
+       case CASE(TINT16, TFLOAT64):
        case CASE(TINT32, TFLOAT32):
-               a = ACVTSL2SS;
-               goto rdst;
-
-
        case CASE(TINT32, TFLOAT64):
-               a = ACVTSL2SD;
-               goto rdst;
-
        case CASE(TINT64, TFLOAT32):
-               a = ACVTSQ2SS;
-               goto rdst;
-
        case CASE(TINT64, TFLOAT64):
-               a = ACVTSQ2SD;
-               goto rdst;
+               if(t->op != OREGISTER)
+                       goto hard;
+               if(f->op == OREGISTER) {
+                       cvt = f->type;
+                       goto hardmem;
+               }
+               switch(ft) {
+               case TINT16:
+                       a = AFMOVW;
+                       break;
+               case TINT32:
+                       a = AFMOVL;
+                       break;
+               default:
+                       a = AFMOVV;
+                       break;
+               }
+               break;
 
-       case CASE(TINT16, TFLOAT32):
-       case CASE(TINT16, TFLOAT64):
        case CASE(TINT8, TFLOAT32):
        case CASE(TINT8, TFLOAT64):
        case CASE(TUINT16, TFLOAT32):
        case CASE(TUINT16, TFLOAT64):
        case CASE(TUINT8, TFLOAT32):
        case CASE(TUINT8, TFLOAT64):
-               // convert via int32
+               // convert via int32 memory
                cvt = types[TINT32];
-               goto hard;
+               goto hardmem;
 
        case CASE(TUINT32, TFLOAT32):
        case CASE(TUINT32, TFLOAT64):
-               // convert via int64.
+               // convert via int64 memory
                cvt = types[TINT64];
-               goto hard;
+               goto hardmem;
 
        case CASE(TUINT64, TFLOAT32):
        case CASE(TUINT64, TFLOAT64):
                // algorithm is:
                //      if small enough, use native int64 -> uint64 conversion.
                //      otherwise, halve (rounding to odd?), convert, and double.
-               a = ACVTSQ2SS;
-               if(tt == TFLOAT64)
-                       a = ACVTSQ2SD;
-               nodconst(&zero, types[TUINT64], 0);
-               nodconst(&one, types[TUINT64], 1);
-               regalloc(&r1, f->type, f);
-               regalloc(&r2, t->type, t);
-               regalloc(&r3, f->type, N);
-               regalloc(&r4, f->type, N);
-               gmove(f, &r1);
-               gins(ACMPQ, &r1, &zero);
+               nodreg(&ax, types[TUINT32], D_AX);
+               nodreg(&dx, types[TUINT32], D_DX);
+               nodreg(&cx, types[TUINT32], D_CX);
+               tempalloc(&t1, f->type);
+               split64(&t1, &tlo, &thi);
+               gmove(f, &t1);
+               gins(ACMPL, &thi, ncon(0));
                p1 = gbranch(AJLT, T);
-               gins(a, &r1, &r2);
+               // native
+               t1.type = types[TINT64];
+               gmove(&t1, t);
                p2 = gbranch(AJMP, T);
+               // simulated
                patch(p1, pc);
-               gmove(&r1, &r3);
-               gins(ASHRQ, &one, &r3);
-               gmove(&r1, &r4);
-               gins(AANDL, &one, &r4);
-               gins(AORQ, &r4, &r3);
-               gins(a, &r3, &r2);
-               gins(optoas(OADD, t->type), &r2, &r2);
+               gmove(&tlo, &ax);
+               gmove(&thi, &dx);
+               p1 = gins(ASHRL, ncon(1), &ax);
+               p1->from.index = D_DX;  // double-width shift DX -> AX
+               p1->from.scale = 0;
+               gins(ASETCC, N, &cx);
+               gins(AORB, &cx, &ax);
+               gins(ASHRL, ncon(1), &dx);
+               gmove(&dx, &thi);
+               gmove(&ax, &tlo);
+               nodreg(&r1, types[tt], D_F0);
+               nodreg(&r2, types[tt], D_F0 + 1);
+               gmove(&t1, &r1);        // t1.type is TINT64 now, set above
+               gins(AFMOVD, &r1, &r1);
+               gins(AFADDDP, &r1, &r2);
+               gmove(&r1, t);
                patch(p2, pc);
-               gmove(&r2, t);
-               regfree(&r4);
-               regfree(&r3);
-               regfree(&r2);
-               regfree(&r1);
+               splitclean();
+               tempfree(&t1);
                return;
-       */
+
        /*
         * float to float
         */
        case CASE(TFLOAT32, TFLOAT32):
-               a = AFMOVF;
-               break;
-
        case CASE(TFLOAT64, TFLOAT64):
-               a = AFMOVD;
+               // The way the code generator uses floating-point
+               // registers, a move from F0 to F0 is intended as a no-op.
+               // On the x86, it's not: it pushes a second copy of F0
+               // on the floating point stack.  So toss it away here.
+               // Also, F0 is the *only* register we ever evaluate
+               // into, so we should only see register/register as F0/F0.
+               if(f->op == OREGISTER && t->op == OREGISTER) {
+                       if(f->val.u.reg != D_F0 || t->val.u.reg != D_F0)
+                               goto fatal;
+                       return;
+               }
+               if(ismem(f) && ismem(t))
+                       goto hard;
+               a = AFMOVF;
+               if(ft == TFLOAT64)
+                       a = AFMOVD;
+               if(ismem(t)) {
+                       a = AFMOVFP;
+                       if(ft == TFLOAT64)
+                               a = AFMOVDP;
+               }
                break;
 
-       /*
        case CASE(TFLOAT32, TFLOAT64):
-               a = ACVTSS2SD;
-               goto rdst;
+               if(f->op == OREGISTER)
+                       gins(AFMOVD, f, t);
+               else
+                       gins(AFMOVF, f, t);
+               return;
 
        case CASE(TFLOAT64, TFLOAT32):
-               a = ACVTSD2SS;
-               goto rdst;
-       */
+               if(f->op == OREGISTER)
+                       gins(AFMOVF, f, t);
+               else
+                       gins(AFMOVD, f, t);
+               return;
        }
 
        gins(a, f, t);
@@ -1377,6 +1606,18 @@ hard:
        gmove(&r1, t);
        regfree(&r1);
        return;
+
+hardmem:
+       // requires memory intermediate
+       tempalloc(&r1, cvt);
+       gmove(f, &r1);
+       gmove(&r1, t);
+       tempfree(&r1);
+       return;
+
+fatal:
+       // should not happen
+       fatal("gmove %N -> %N", f, t);
 }
 
 int