]> Cypherpunks repositories - gostls13.git/commitdiff
cmd/gc: add temporary-merging optimization pass
authorRuss Cox <rsc@golang.org>
Tue, 13 Aug 2013 04:09:31 +0000 (00:09 -0400)
committerRuss Cox <rsc@golang.org>
Tue, 13 Aug 2013 04:09:31 +0000 (00:09 -0400)
The compilers assume they can generate temporary variables
as needed to preserve the right semantics or simplify code
generation and the back end will still generate good code.
This turns out not to be true. The back ends will only
track the first 128 variables per function and give up
on the remainder. That needs to be fixed too, in a later CL.

This CL merges temporary variables with equal types and
non-overlapping lifetimes using the greedy algorithm in
Poletto and Sarkar, "Linear Scan Register Allocation",
ACM TOPLAS 1999.

The result can be striking in the right functions.

Top 20 frame size changes in a 6g godoc binary by bytes saved:

5464 1984 (-3480, -63.7%) go/build.(*Context).Import
4456 1824 (-2632, -59.1%) go/printer.(*printer).expr1
2560   80 (-2480, -96.9%) time.nextStdChunk
3496 1608 (-1888, -54.0%) go/printer.(*printer).stmt
1896  272 (-1624, -85.7%) net/http.init
2688 1400 (-1288, -47.9%) fmt.(*pp).printReflectValue
2800 1512 (-1288, -46.0%) main.main
3296 2016 (-1280, -38.8%) crypto/tls.(*Conn).clientHandshake
1664  488 (-1176, -70.7%) time.loadZoneZip
1760  608 (-1152, -65.5%) time.parse
4104 3072 (-1032, -25.1%) runtime/pprof.writeHeap
1680  712 ( -968, -57.6%) go/ast.Walk
2488 1560 ( -928, -37.3%) crypto/x509.parseCertificate
1128  392 ( -736, -65.2%) math/big.nat.divLarge
1528  864 ( -664, -43.5%) go/printer.(*printer).fieldList
1360  712 ( -648, -47.6%) regexp/syntax.(*parser).factor
2104 1528 ( -576, -27.4%) encoding/asn1.parseField
1064  504 ( -560, -52.6%) encoding/xml.(*Decoder).text
 584   48 ( -536, -91.8%) html.init
1400  864 ( -536, -38.3%) go/doc.playExample

In the same godoc build, cuts the number of functions with
too many vars from 83 to 32.

R=ken2
CC=golang-dev
https://golang.org/cl/12829043

12 files changed:
src/cmd/5g/opt.h
src/cmd/5g/peep.c
src/cmd/5g/reg.c
src/cmd/6g/opt.h
src/cmd/6g/reg.c
src/cmd/6l/list.c
src/cmd/8g/opt.h
src/cmd/8g/peep.c
src/cmd/8g/reg.c
src/cmd/gc/go.h
src/cmd/gc/popt.c
src/cmd/gc/popt.h

index cbd8cca3fcbf4e149360a83e88359484f1594d51..15b9d14582aaf6cbf228dd67f13ec16dcc7c0ef4 100644 (file)
@@ -83,7 +83,6 @@ struct        Rgn
 
 EXTERN int32   exregoffset;            // not set
 EXTERN int32   exfregoffset;           // not set
-EXTERN Reg*    firstr;
 EXTERN Reg     zreg;
 EXTERN Reg*    freer;
 EXTERN Reg**   rpo2r;
index b005b4ac10390fb8ab50fed93f370d6d0072e18a..a6c9a6ada9d2dc62beda5b05145e06c2d041df4f 100644 (file)
@@ -236,6 +236,8 @@ loop1:
        }
 
 //     predicate(g);
+
+       flowend(g);
 }
 
 static int
index dc5aa8e0ee9e26a02608073ee7e650895f143ad7..f35713f67a5a553b31bccbbb3b29a852813956ff 100644 (file)
@@ -38,6 +38,7 @@
 #define        REGBITS ((uint32)0xffffffff)
 
        void    addsplits(void);
+static Reg*    firstr;
 static int     first   = 1;
 
 int
@@ -169,6 +170,7 @@ regopt(Prog *firstp)
        }
        
        fixjmp(firstp);
+       mergetemp(firstp);
 
        /*
         * control flow is more complicated in generated go code
@@ -262,9 +264,6 @@ regopt(Prog *firstp)
         * pass 2
         * find looping structure
         */
-       for(r = firstr; r != R; r = (Reg*)r->f.link)
-               r->f.active = 0;
-       change = 0;
        flowrpo(g);
 
        if(debug['R'] && debug['v'])
index 9054234c387906d580917ac7f64671fda6a3850d..3dcc3d7476a6c2d4288aec99af9ae4ee75526f83 100644 (file)
@@ -83,7 +83,6 @@ struct        Rgn
 
 EXTERN int32   exregoffset;            // not set
 EXTERN int32   exfregoffset;           // not set
-EXTERN Reg*    firstr;
 EXTERN Reg     zreg;
 EXTERN Rgn     region[NRGN];
 EXTERN Rgn*    rgp;
index d540b4affe324216c8ff2655e506155f279f0fc1..63fd0deca0df20dbcd7b69defb4d1393cc31d813 100644 (file)
@@ -36,6 +36,7 @@
 #define        NREGVAR 32      /* 16 general + 16 floating */
 #define        REGBITS ((uint32)0xffffffff)
 
+static Reg*    firstr;
 static int     first   = 1;
 
 int
@@ -155,6 +156,7 @@ regopt(Prog *firstp)
        }
 
        fixjmp(firstp);
+       mergetemp(firstp);
        
        /*
         * control flow is more complicated in generated go code
@@ -248,9 +250,6 @@ regopt(Prog *firstp)
         * pass 2
         * find looping structure
         */
-       for(r = firstr; r != R; r = (Reg*)r->f.link)
-               r->f.active = 0;
-       change = 0;
        flowrpo(g);
 
        if(debug['R'] && debug['v'])
index aaf45c4dd766845fe2fee7d4bd524b3123c60861..5040e432716408e0be63f985b46c3091a760edf8 100644 (file)
@@ -57,7 +57,7 @@ Pconv(Fmt *fp)
        switch(p->as) {
        case ATEXT:
                if(p->from.scale) {
-                       fmtprint(fp, "(%d)      %A      %D,%d,%D",
+                       fmtprint(fp, "(%d)      %A      %D,%d,%lD",
                                p->line, p->as, &p->from, p->from.scale, &p->to);
                        break;
                }
index 0a2740432d8e2545cd5786cc45c7fb51bac39074..0d99bdb97269de4b940ad4ea1f7d68f70f2f3afe 100644 (file)
@@ -96,7 +96,6 @@ struct        Rgn
 
 EXTERN int32   exregoffset;            // not set
 EXTERN int32   exfregoffset;           // not set
-EXTERN Reg*    firstr;
 EXTERN Reg     zreg;
 EXTERN Reg*    freer;
 EXTERN Reg**   rpo2r;
index ac7c71cbdbb5811429e89d237bc2c74148289aa0..5a0b1d3ab1abd61b7f993f9aaa64b6326688f4d0 100644 (file)
@@ -222,6 +222,8 @@ loop1:
                if(regtyp(&p->to))
                        p->as = AMOVAPD;
        }
+       
+       flowend(g);
 }
 
 void
index 307fb815747d6eaa0f7f8440977ee593ef602f0d..a85c6608a6cbb2aa79eb0d14e499ed7e5200df77 100644 (file)
 #define        NREGVAR 16      /* 8 integer + 8 floating */
 #define        REGBITS ((uint32)0xffff)
 
+static Reg*    firstr;
 static int     first   = 1;
 
-static void    fixtemp(Prog*);
-
 int
 rcmp(const void *a1, const void *a2)
 {
@@ -126,8 +125,8 @@ regopt(Prog *firstp)
                first = 0;
        }
        
-       fixtemp(firstp);
        fixjmp(firstp);
+       mergetemp(firstp);
 
        /*
         * control flow is more complicated in generated go code
@@ -223,9 +222,6 @@ regopt(Prog *firstp)
         * pass 2
         * find looping structure
         */
-       for(r = firstr; r != R; r = (Reg*)r->f.link)
-               r->f.active = 0;
-       change = 0;
        flowrpo(g);
 
        if(debug['R'] && debug['v'])
@@ -1157,131 +1153,3 @@ dumpit(char *str, Flow *r0, int isreg)
 //             }
        }
 }
-
-static uint32
-fnv1(Sym *sym)
-{
-       uint32 h;
-       char *s;
-
-       h = 2166136261U;
-       for(s=sym->name;*s;s++) {
-               h = (16777619 * h) ^ (uint32)(uint8)(*s);
-       }
-       return h;
-}
-
-static uint16
-hash32to16(uint32 h)
-{
-       return (h & 0xffff) ^ (h >> 16);
-}
-
-/*
- * fixtemp eliminates sequences like:
- *   MOV reg1, mem
- *   OP mem, reg2
- * when mem is a stack variable which is not mentioned
- * anywhere else. The instructions are replaced by
- *   OP reg1, reg2
- * this reduces the number of variables that the register optimizer
- * sees, which lets it do a better job and makes it less likely to turn
- * itself off.
- */
-static void
-fixtemp(Prog *firstp)
-{
-       static uint8 counts[1<<16]; // A hash table to count variable occurrences.
-       int i;
-       Prog *p, *p2;
-       uint32 h;
-
-       if(debug['R'] && debug['v'])
-               print("\nfixtemp\n");
-
-       // Count variable references. We actually use a hashtable so this
-       // is only approximate.
-       for(i=0; i<nelem(counts); i++)
-               counts[i] = 0;
-       for(p=firstp; p!=P; p=p->link) {
-               if(p->from.type == D_AUTO) {
-                       h = hash32to16(fnv1(p->from.sym));
-                       //print("seen %S hash %d\n", p->from.sym, hash32to16(h));
-                       if(counts[h] < 10)
-                               counts[h]++;
-               }
-               if(p->to.type == D_AUTO) {
-                       h = hash32to16(fnv1(p->to.sym));
-                       //print("seen %S hash %d\n", p->to.sym, hash32to16(h));
-                       if(counts[h] < 10)
-                               counts[h]++;
-               }
-       }
-
-       // Eliminate single-write, single-read stack variables.
-       for(p=firstp; p!=P; p=p->link) {
-               if(debug['R'] && debug['v'])
-                       print("%P\n", p);
-               if(p->link == P || p->to.type != D_AUTO)
-                       continue;
-               if(isfloat[p->to.etype] && FtoB(p->from.type)) {
-                       switch(p->as) {
-                       case AMOVSS:
-                       case AMOVSD:
-                               break;
-                       default:
-                               continue;
-                       }
-               } else if(!isfloat[p->to.etype] && RtoB(p->from.type)) {
-                       switch(p->as) {
-                       case AMOVB:
-                               if(p->to.width == 1)
-                                       break;
-                       case AMOVW:
-                               if(p->to.width == 2)
-                                       break;
-                       case AMOVL:
-                               if(p->to.width == 4)
-                                       break;
-                       default:
-                               continue;
-                       }
-               } else
-                       continue;
-               // p is a MOV reg, mem.
-               p2 = p->link;
-               h = hash32to16(fnv1(p->to.sym));
-               if(counts[h] != 2) {
-                       continue;
-               }
-               switch(p2->as) {
-               case ALEAL:
-               case AFMOVD:
-               case AFMOVF:
-               case AFMOVL:
-               case AFMOVW:
-               case AFMOVV:
-                       // funny
-                       continue;
-               }
-               // p2 is OP mem, reg2
-               // and OP is not a funny instruction.
-               if(p2->from.sym == p->to.sym
-                       && p2->from.offset == p->to.offset
-                       && p2->from.type == p->to.type) {
-                       if(debug['R'] && debug['v']) {
-                               print(" ===elide== %D\n", &p->to);
-                               print("%P", p2);
-                       }
-                       // p2 is OP mem, reg2.
-                       // change to OP reg, reg2 and
-                       // eliminate the mov.
-                       p2->from = p->from;
-                       *p = *p2;
-                       p->link = p2->link;
-                       if(debug['R'] && debug['v']) {
-                               print(" ===change== %P\n", p);
-                       }
-               }
-       }
-}
index 6679fa855d668fa30091b4f8d82721e22caab33c..f41923b635bec1f3b60bc05e379696171eea8be6 100644 (file)
@@ -192,8 +192,7 @@ struct      Type
        // for TFORW, where to copy the eventual value to
        NodeList        *copyto;
        
-       // for usefield
-       Node    *lastfn;
+       Node    *lastfn;        // for usefield
 };
 #define        T       ((Type*)0)
 
@@ -331,6 +330,7 @@ struct      Node
        int32   iota;
        uint32  walkgen;
        int32   esclevel;
+       void*   opt;    // for optimization passes
 };
 #define        N       ((Node*)0)
 
index b686cb670c44abc42f0438444731fa912a143876..c3277b48fcf55640722cbd40469e22e8eb8cd58b 100644 (file)
@@ -182,6 +182,9 @@ fixjmp(Prog *firstp)
        }
 }
 
+#undef alive
+#undef dead
+
 // Control flow analysis. The Flow structures hold predecessor and successor
 // information as well as basic loop analysis.
 //
@@ -392,6 +395,9 @@ flowrpo(Graph *g)
        if(g->rpo == nil || idom == nil)
                fatal("out of memory");
 
+       for(r1 = g->start; r1 != nil; r1 = r1->link)
+               r1->active = 0;
+
        rpo2r = g->rpo;
        d = postorder(g->start, rpo2r, 0);
        nr = g->num;
@@ -428,6 +434,9 @@ flowrpo(Graph *g)
                        loopmark(rpo2r, i, r1);
        }
        free(idom);
+
+       for(r1 = g->start; r1 != nil; r1 = r1->link)
+               r1->active = 0;
 }
 
 Flow*
@@ -462,3 +471,296 @@ uniqs(Flow *r)
        return r1;
 }
 
+// The compilers assume they can generate temporary variables
+// as needed to preserve the right semantics or simplify code
+// generation and the back end will still generate good code.
+// This results in a large number of ephemeral temporary variables.
+// Merge temps with non-overlapping lifetimes and equal types using the
+// greedy algorithm in Poletto and Sarkar, "Linear Scan Register Allocation",
+// ACM TOPLAS 1999.
+
+typedef struct TempVar TempVar;
+typedef struct TempFlow TempFlow;
+
+struct TempVar
+{
+       Node *node;
+       TempFlow *def; // definition of temp var
+       TempFlow *use; // use list, chained through TempFlow.uselink
+       TempVar *freelink; // next free temp in Type.opt list
+       TempVar *merge; // merge var with this one
+       uint32 start; // smallest Prog.loc in live range
+       uint32 end; // largest Prog.loc in live range
+       uchar addr; // address taken - no accurate end
+       uchar removed; // removed from program
+};
+
+struct TempFlow
+{
+       Flow    f;
+       TempFlow *uselink;
+};
+
+static int
+startcmp(const void *va, const void *vb)
+{
+       TempVar *a, *b;
+       
+       a = *(TempVar**)va;
+       b = *(TempVar**)vb;
+
+       if(a->start < b->start)
+               return -1;
+       if(a->start > b->start)
+               return +1;
+       return 0;
+}
+
+// Is n available for merging?
+static int
+canmerge(Node *n)
+{
+       return n->class == PAUTO && !n->addrtaken && strncmp(n->sym->name, "autotmp", 7) == 0;
+}
+
+static void mergewalk(TempVar*, TempFlow*, uint32);
+
+void
+mergetemp(Prog *firstp)
+{
+       int i, j, nvar, ninuse, nfree, nkill;
+       TempVar *var, *v, *v1, **bystart, **inuse;
+       TempFlow *r;
+       NodeList *l, **lp;
+       Node *n;
+       Prog *p, *p1;
+       Type *t;
+       ProgInfo info, info1;
+       int32 gen;
+       Graph *g;
+
+       enum { Debug = 0 };
+
+       g = flowstart(firstp, sizeof(TempFlow));
+       if(g == nil)
+               return;
+
+       // Build list of all mergeable variables.
+       nvar = 0;
+       for(l = curfn->dcl; l != nil; l = l->next)
+               if(canmerge(l->n))
+                       nvar++;
+       
+       var = calloc(nvar*sizeof var[0], 1);
+       nvar = 0;
+       for(l = curfn->dcl; l != nil; l = l->next) {
+               n = l->n;
+               if(canmerge(n)) {
+                       v = &var[nvar++];
+                       n->opt = v;
+                       v->node = n;
+               }
+       }
+       
+       // Build list of uses.
+       // We assume that the earliest reference to a temporary is its definition.
+       // This is not true of variables in general but our temporaries are all
+       // single-use (that's why we have so many!).
+       for(r = (TempFlow*)g->start; r != nil; r = (TempFlow*)r->f.link) {
+               p = r->f.prog;
+               proginfo(&info, p);
+
+               if(p->from.node != N && p->from.node->opt && p->to.node != N && p->to.node->opt)
+                       fatal("double node %P", p);
+               if((n = p->from.node) != N && (v = n->opt) != nil ||
+                  (n = p->to.node) != N && (v = n->opt) != nil) {
+                       if(v->def == nil)
+                               v->def = r;
+                       r->uselink = v->use;
+                       v->use = r;
+                       if(n == p->from.node && (info.flags & LeftAddr))
+                               v->addr = 1;
+               }
+       }
+       
+       if(Debug > 1)
+               dumpit("before", g->start, 0);
+       
+       nkill = 0;
+
+       // Special case.
+       for(v = var; v < var+nvar; v++) {
+               if(v->addr)
+                       continue;
+               // Used in only one instruction, which had better be a write.
+               if((r = v->use) != nil && r->uselink == nil) {
+                       p = r->f.prog;
+                       proginfo(&info, p);
+                       if(p->to.node == v->node && (info.flags & RightWrite) && !(info.flags & RightRead)) {
+                               p->as = ANOP;
+                               p->to = zprog.to;
+                               v->removed = 1;
+                               if(Debug)
+                                       print("drop write-only %S\n", v->node->sym);
+                       } else
+                               fatal("temp used and not set: %P", p);
+                       nkill++;
+                       continue;
+               }
+               
+               // Written in one instruction, read in the next, otherwise unused,
+               // no jumps to the next instruction. Happens mainly in 386 compiler.
+               if((r = v->use) != nil && r->f.link == &r->uselink->f && r->uselink->uselink == nil && uniqp(r->f.link) == &r->f) {
+                       p = r->f.prog;
+                       proginfo(&info, p);
+                       p1 = r->f.link->prog;
+                       proginfo(&info1, p1);
+                       enum {
+                               SizeAny = SizeB | SizeW | SizeL | SizeQ | SizeF | SizeD,
+                       };
+                       if(p->from.node == v->node && p1->to.node == v->node && (info.flags & Move) &&
+                          !((info.flags|info1.flags) & (LeftAddr|RightAddr)) &&
+                          (info.flags & SizeAny) == (info1.flags & SizeAny)) {
+                               p1->from = p->from;
+                               excise(&r->f);
+                               v->removed = 1;
+                               if(Debug)
+                                       print("drop immediate-use %S\n", v->node->sym);
+                       }
+                       nkill++;
+                       continue;
+               }                          
+       }
+
+       // Traverse live range of each variable to set start, end.
+       // Each flood uses a new value of gen so that we don't have
+       // to clear all the r->f.active words after each variable.
+       gen = 0;
+       for(v = var; v < var+nvar; v++) {
+               gen++;
+               for(r = v->use; r != nil; r = r->uselink)
+                       mergewalk(v, r, gen);
+       }
+
+       // Sort variables by start.
+       bystart = malloc(nvar*sizeof bystart[0]);
+       for(i=0; i<nvar; i++)
+               bystart[i] = &var[i];
+       qsort(bystart, nvar, sizeof bystart[0], startcmp);
+
+       // List of in-use variables, sorted by end, so that the ones that
+       // will last the longest are the earliest ones in the array.
+       // The tail inuse[nfree:] holds no-longer-used variables.
+       // In theory we should use a sorted tree so that insertions are
+       // guaranteed O(log n) and then the loop is guaranteed O(n log n).
+       // In practice, it doesn't really matter.
+       inuse = malloc(nvar*sizeof inuse[0]);
+       ninuse = 0;
+       nfree = nvar;
+       for(i=0; i<nvar; i++) {
+               v = bystart[i];
+               if(v->addr || v->removed)
+                       continue;
+
+               // Expire no longer in use.
+               while(ninuse > 0 && inuse[ninuse-1]->end < v->start) {
+                       v1 = inuse[--ninuse];
+                       inuse[--nfree] = v1;
+               }
+
+               // Find old temp to reuse if possible.
+               t = v->node->type;
+               for(j=nfree; j<nvar; j++) {
+                       v1 = inuse[j];
+                       if(eqtype(t, v1->node->type)) {
+                               inuse[j] = inuse[nfree++];
+                               if(v1->merge)
+                                       v->merge = v1->merge;
+                               else
+                                       v->merge = v1;
+                               nkill++;
+                               break;
+                       }
+               }
+
+               // Sort v into inuse.
+               j = ninuse++;
+               while(j > 0 && inuse[j-1]->end < v->end) {
+                       inuse[j] = inuse[j-1];
+                       j--;
+               }
+               inuse[j] = v;
+       }
+
+       if(Debug) {
+               print("%S [%d - %d]\n", curfn->nname->sym, nvar, nkill);
+               for(v=var; v<var+nvar; v++) {
+                       print("var %#N %T %d-%d", v->node, v->node->type, v->start, v->end);
+                       if(v->addr)
+                               print(" addr=1");
+                       if(v->removed)
+                               print(" dead=1");
+                       if(v->merge)
+                               print(" merge %#N", v->merge->node);
+                       if(v->start == v->end)
+                               print(" %P", v->def->f.prog);
+                       print("\n");
+               }
+       
+               if(Debug > 1)
+                       dumpit("after", g->start, 0);
+       }
+
+       // Update node references to use merged temporaries.
+       for(r = (TempFlow*)g->start; r != nil; r = (TempFlow*)r->f.link) {
+               p = r->f.prog;
+               if((n = p->from.node) != N && (v = n->opt) != nil && v->merge != nil)
+                       p->from.node = v->merge->node;
+               if((n = p->to.node) != N && (v = n->opt) != nil && v->merge != nil)
+                       p->to.node = v->merge->node;
+       }
+
+       // Delete merged nodes from declaration list.
+       for(lp = &curfn->dcl; (l = *lp); ) {
+               curfn->dcl->end = l;
+               n = l->n;
+               v = n->opt;
+               if(v && (v->merge || v->removed)) {
+                       *lp = l->next;
+                       continue;
+               }
+               lp = &l->next;
+       }
+
+       // Clear aux structures.
+       for(v=var; v<var+nvar; v++)
+               v->node->opt = nil;
+       free(var);
+       free(bystart);
+       free(inuse);
+       flowend(g);
+}
+
+static void
+mergewalk(TempVar *v, TempFlow *r0, uint32 gen)
+{
+       Prog *p;
+       TempFlow *r1, *r, *r2;
+       
+       for(r1 = r0; r1 != nil; r1 = (TempFlow*)r1->f.p1) {
+               if(r1->f.active == gen)
+                       break;
+               r1->f.active = gen;
+               p = r1->f.prog;
+               if(v->end < p->loc)
+                       v->end = p->loc;
+               if(r1 == v->def) {
+                       v->start = p->loc;
+                       break;
+               }
+       }
+       
+       for(r = r0; r != r1; r = (TempFlow*)r->f.p1)
+               for(r2 = (TempFlow*)r->f.p2; r2 != nil; r2 = (TempFlow*)r2->f.p2link)
+                       mergewalk(v, r2, gen);
+}
index 26a17b70be40e1916847735c3abd41fd5f69542e..65c27309691237f09633761228edf893aff95b5f 100644 (file)
@@ -35,6 +35,7 @@ void  fixjmp(Prog*);
 Graph* flowstart(Prog*, int);
 void   flowrpo(Graph*);
 void   flowend(Graph*);
+void   mergetemp(Prog*);
 int    noreturn(Prog*);
 Flow*  uniqp(Flow*);
 Flow*  uniqs(Flow*);