From 383963b50622d9e0a28a3deb75aba8533e5ad949 Mon Sep 17 00:00:00 2001 From: Keith Randall Date: Tue, 1 Apr 2014 19:44:07 -0700 Subject: [PATCH] runtime: zero at start of frame more efficiently. Use Duff's device for zeroing. Combine adjacent regions. Update #7680 Update #7624 LGTM=rsc R=rsc CC=golang-codereviews https://golang.org/cl/83200045 --- src/cmd/6g/ggen.c | 71 +++++++++++++++++++++++++++++++++-------------- src/cmd/8g/ggen.c | 69 +++++++++++++++++++++++++++++++-------------- 2 files changed, 98 insertions(+), 42 deletions(-) diff --git a/src/cmd/6g/ggen.c b/src/cmd/6g/ggen.c index 6b159e2e37..51319f23db 100644 --- a/src/cmd/6g/ggen.c +++ b/src/cmd/6g/ggen.c @@ -10,13 +10,14 @@ #include "opt.h" static Prog *appendpp(Prog*, int, int, vlong, int, vlong); +static Prog *zerorange(Prog *p, vlong frame, vlong lo, vlong hi, uint32 *ax); void defframe(Prog *ptxt) { - uint32 frame; + uint32 frame, ax; Prog *p; - vlong i; + vlong hi, lo; NodeList *l; Node *n; @@ -28,14 +29,13 @@ defframe(Prog *ptxt) frame = rnd(stksize+maxarg, widthreg); ptxt->to.offset |= frame; - // insert code to contain ambiguously live variables - // so that garbage collector only sees initialized values + // insert code to zero ambiguously live variables + // so that the garbage collector only sees initialized values // when it looks for pointers. - // - // TODO: determine best way to zero the given values. - // among other problems, AX is initialized to 0 multiple times, - // but that's really the tip of the iceberg. p = ptxt; + lo = hi = 0; + ax = 0; + // iterate through declarations - they are sorted in decreasing xoffset order. for(l=curfn->dcl; l != nil; l = l->next) { n = l->n; if(!n->needzero) @@ -44,21 +44,50 @@ defframe(Prog *ptxt) fatal("needzero class %d", n->class); if(n->type->width % widthreg != 0 || n->xoffset % widthreg != 0 || n->type->width == 0) fatal("var %lN has size %d offset %d", n, (int)n->type->width, (int)n->xoffset); - if(n->type->width <= 2*widthreg) { - for(i = 0; i < n->type->width; i += widthreg) - p = appendpp(p, AMOVQ, D_CONST, 0, D_SP+D_INDIR, frame+n->xoffset+i); - } else if(n->type->width <= 16*widthreg) { - p = appendpp(p, AMOVQ, D_CONST, 0, D_AX, 0); - for(i = 0; i < n->type->width; i += widthreg) - p = appendpp(p, AMOVQ, D_AX, 0, D_SP+D_INDIR, frame+n->xoffset+i); - } else { - p = appendpp(p, AMOVQ, D_CONST, 0, D_AX, 0); - p = appendpp(p, AMOVQ, D_CONST, n->type->width/widthreg, D_CX, 0); - p = appendpp(p, leaptr, D_SP+D_INDIR, frame+n->xoffset, D_DI, 0); - p = appendpp(p, AREP, D_NONE, 0, D_NONE, 0); - p = appendpp(p, ASTOSQ, D_NONE, 0, D_NONE, 0); + + if(n->xoffset + n->type->width >= lo - 2*widthptr) { + // merge with range we already have + lo = n->xoffset; + continue; + } + // zero old range + p = zerorange(p, frame, lo, hi, &ax); + + // set new range + hi = n->xoffset + n->type->width; + lo = n->xoffset; + } + // zero final range + zerorange(p, frame, lo, hi, &ax); +} + +static Prog* +zerorange(Prog *p, vlong frame, vlong lo, vlong hi, uint32 *ax) +{ + vlong cnt, i; + + cnt = hi - lo; + if(cnt == 0) + return p; + if(*ax == 0) { + p = appendpp(p, AMOVQ, D_CONST, 0, D_AX, 0); + *ax = 1; + } + if(cnt <= 4*widthreg) { + for(i = 0; i < cnt; i += widthreg) { + p = appendpp(p, AMOVQ, D_AX, 0, D_SP+D_INDIR, frame+lo+i); } + } else if(cnt <= 128*widthreg) { + p = appendpp(p, leaptr, D_SP+D_INDIR, frame+lo, D_DI, 0); + p = appendpp(p, ADUFFZERO, D_NONE, 0, D_ADDR, 2*(128-cnt/widthreg)); + p->to.sym = linksym(pkglookup("duffzero", runtimepkg)); + } else { + p = appendpp(p, AMOVQ, D_CONST, cnt/widthreg, D_CX, 0); + p = appendpp(p, leaptr, D_SP+D_INDIR, frame+lo, D_DI, 0); + p = appendpp(p, AREP, D_NONE, 0, D_NONE, 0); + p = appendpp(p, ASTOSQ, D_NONE, 0, D_NONE, 0); } + return p; } static Prog* diff --git a/src/cmd/8g/ggen.c b/src/cmd/8g/ggen.c index 8388e64bd5..0a1523871e 100644 --- a/src/cmd/8g/ggen.c +++ b/src/cmd/8g/ggen.c @@ -10,13 +10,14 @@ #include "opt.h" static Prog *appendpp(Prog*, int, int, vlong, int, vlong); +static Prog *zerorange(Prog *p, vlong frame, vlong lo, vlong hi, uint32 *ax); void defframe(Prog *ptxt) { - uint32 frame; + uint32 frame, ax; Prog *p; - vlong i; + vlong lo, hi; NodeList *l; Node *n; @@ -27,14 +28,12 @@ defframe(Prog *ptxt) frame = rnd(stksize+maxarg, widthptr); ptxt->to.offset = frame; - // insert code to contain ambiguously live variables - // so that garbage collector only sees initialized values + // insert code to zero ambiguously live variables + // so that the garbage collector only sees initialized values // when it looks for pointers. - // - // TODO: determine best way to zero the given values. - // among other problems, AX is initialized to 0 multiple times, - // but that's really the tip of the iceberg. p = ptxt; + lo = hi = 0; + ax = 0; for(l=curfn->dcl; l != nil; l = l->next) { n = l->n; if(!n->needzero) @@ -43,21 +42,49 @@ defframe(Prog *ptxt) fatal("needzero class %d", n->class); if(n->type->width % widthptr != 0 || n->xoffset % widthptr != 0 || n->type->width == 0) fatal("var %lN has size %d offset %d", n, (int)n->type->width, (int)n->xoffset); - if(n->type->width <= 2*widthptr) { - for(i = 0; i < n->type->width; i += widthptr) - p = appendpp(p, AMOVL, D_CONST, 0, D_SP+D_INDIR, frame+n->xoffset+i); - } else if(n->type->width <= 16*widthptr) { - p = appendpp(p, AMOVL, D_CONST, 0, D_AX, 0); - for(i = 0; i < n->type->width; i += widthptr) - p = appendpp(p, AMOVL, D_AX, 0, D_SP+D_INDIR, frame+n->xoffset+i); - } else { - p = appendpp(p, AMOVL, D_CONST, 0, D_AX, 0); - p = appendpp(p, AMOVL, D_CONST, n->type->width/widthptr, D_CX, 0); - p = appendpp(p, ALEAL, D_SP+D_INDIR, frame+n->xoffset, D_DI, 0); - p = appendpp(p, AREP, D_NONE, 0, D_NONE, 0); - p = appendpp(p, ASTOSL, D_NONE, 0, D_NONE, 0); + if(n->xoffset + n->type->width == lo - 2*widthptr) { + // merge with range we already have + lo = n->xoffset; + continue; } + // zero old range + p = zerorange(p, frame, lo, hi, &ax); + + // set new range + hi = n->xoffset + n->type->width; + lo = n->xoffset; + } + // zero final range + zerorange(p, frame, lo, hi, &ax); +} + +static Prog* +zerorange(Prog *p, vlong frame, vlong lo, vlong hi, uint32 *ax) +{ + vlong cnt, i; + + cnt = hi - lo; + if(cnt == 0) + return p; + if(*ax == 0) { + p = appendpp(p, AMOVL, D_CONST, 0, D_AX, 0); + *ax = 1; + } + if(cnt <= 4*widthreg) { + for(i = 0; i < cnt; i += widthreg) { + p = appendpp(p, AMOVL, D_AX, 0, D_SP+D_INDIR, frame+lo+i); + } + } else if(cnt <= 128*widthreg) { + p = appendpp(p, ALEAL, D_SP+D_INDIR, frame+lo, D_DI, 0); + p = appendpp(p, ADUFFZERO, D_NONE, 0, D_ADDR, 1*(128-cnt/widthreg)); + p->to.sym = linksym(pkglookup("duffzero", runtimepkg)); + } else { + p = appendpp(p, AMOVL, D_CONST, cnt/widthreg, D_CX, 0); + p = appendpp(p, ALEAL, D_SP+D_INDIR, frame+lo, D_DI, 0); + p = appendpp(p, AREP, D_NONE, 0, D_NONE, 0); + p = appendpp(p, ASTOSL, D_NONE, 0, D_NONE, 0); } + return p; } static Prog* -- 2.50.0