From: Rob Pike Date: Wed, 5 Aug 2009 20:03:46 +0000 (-0700) Subject: regex-dna X-Git-Tag: weekly.2009-11-06~964 X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=d9a0bc9b58eab201b462f5e308e3f472fccb29aa;p=gostls13.git regex-dna R=rsc DELTA=243 (242 added, 0 deleted, 1 changed) OCL=32786 CL=32791 --- diff --git a/test/bench/regex-dna.c b/test/bench/regex-dna.c new file mode 100644 index 0000000000..134f8215c7 --- /dev/null +++ b/test/bench/regex-dna.c @@ -0,0 +1,154 @@ +/* +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of "The Computer Language Benchmarks Game" nor the + name of "The Computer Language Shootout Benchmarks" nor the names of + its contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +*/ + +/* +** The Computer Language Shootout +** http://shootout.alioth.debian.org/ +** contributed by Mike Pall +** +** regex-dna benchmark using PCRE +** +** compile with: +** gcc -O3 -fomit-frame-pointer -o regexdna regexdna.c -lpcre +*/ + +#define __USE_STRING_INLINES +#include +#include +#include +#include + +typedef struct fbuf { + char *buf; + size_t size, len; +} fbuf_t; + +static void fb_init(fbuf_t *b) +{ + b->buf = NULL; + b->len = b->size = 0; +} + +static char *fb_need(fbuf_t *b, size_t need) +{ + need += b->len; + if (need > b->size) { + if (b->size == 0) b->size = need; + else while (need > b->size) b->size += b->size; + if (!(b->buf = realloc(b->buf, b->size))) exit(1); + } + return b->buf+b->len; +} + +#define FB_MINREAD (3<<16) + +/* Read all of a stdio stream into dst buffer. */ +static size_t fb_readall(fbuf_t *dst, FILE *fp) +{ + char *dp; + int n; + for (dp = fb_need(dst, FB_MINREAD); + (n = fread(dp, 1, dst->size-dst->len, fp)) > 0; + dp = fb_need(dst, FB_MINREAD)) dst->len += n; + if (ferror(fp)) exit(1); + return dst->len; +} + +/* Substitute pattern p with replacement r, copying from src to dst buffer. */ +static size_t fb_subst(fbuf_t *dst, fbuf_t *src, const char *p, const char *r) +{ + pcre *re; + pcre_extra *re_ex; + const char *re_e; + char *dp; + int re_eo, m[3], pos, rlen, clen; + if (!(re = pcre_compile(p, PCRE_CASELESS, &re_e, &re_eo, NULL))) exit(1); + re_ex = pcre_study(re, 0, &re_e); + for (dst->len = 0, rlen = strlen(r), pos = 0; + pcre_exec(re, re_ex, src->buf, src->len, pos, 0, m, 3) >= 0; + pos = m[1]) { + clen = m[0]-pos; + dp = fb_need(dst, clen+rlen); + dst->len += clen+rlen; + memcpy(dp, src->buf+pos, clen); + memcpy(dp+clen, r, rlen); + } + clen = src->len-pos; + dp = fb_need(dst, clen); + dst->len += clen; + memcpy(dp, src->buf+pos, clen); + return dst->len; +} + +/* Count all matches with pattern p in src buffer. */ +static int fb_countmatches(fbuf_t *src, const char *p) +{ + pcre *re; + pcre_extra *re_ex; + const char *re_e; + int re_eo, m[3], pos, count; + if (!(re = pcre_compile(p, PCRE_CASELESS, &re_e, &re_eo, NULL))) exit(1); + re_ex = pcre_study(re, 0, &re_e); + for (count = 0, pos = 0; + pcre_exec(re, re_ex, src->buf, src->len, pos, 0, m, 3) >= 0; + pos = m[1]) count++; + return count; +} + +static const char *variants[] = { + "agggtaaa|tttaccct", "[cgt]gggtaaa|tttaccc[acg]", + "a[act]ggtaaa|tttacc[agt]t", "ag[act]gtaaa|tttac[agt]ct", + "agg[act]taaa|ttta[agt]cct", "aggg[acg]aaa|ttt[cgt]ccct", + "agggt[cgt]aa|tt[acg]accct", "agggta[cgt]a|t[acg]taccct", + "agggtaa[cgt]|[acg]ttaccct", NULL +}; + +static const char *subst[] = { + "B", "(c|g|t)", "D", "(a|g|t)", "H", "(a|c|t)", "K", "(g|t)", + "M", "(a|c)", "N", "(a|c|g|t)", "R", "(a|g)", "S", "(c|g)", + "V", "(a|c|g)", "W", "(a|t)", "Y", "(c|t)", NULL +}; + +int main(int argc, char **argv) +{ + fbuf_t seq[2]; + const char **pp; + size_t ilen, clen, slen; + int flip; + fb_init(&seq[0]); + fb_init(&seq[1]); + ilen = fb_readall(&seq[0], stdin); + clen = fb_subst(&seq[1], &seq[0], ">.*|\n", ""); + for (pp = variants; *pp; pp++) + printf("%s %d\n", *pp, fb_countmatches(&seq[1], *pp)); + for (slen = 0, flip = 1, pp = subst; *pp; pp += 2, flip = 1-flip) + slen = fb_subst(&seq[1-flip], &seq[flip], *pp, pp[1]); + printf("\n%zu\n%zu\n%zu\n", ilen, clen, slen); + return 0; +} diff --git a/test/bench/regex-dna.go b/test/bench/regex-dna.go new file mode 100644 index 0000000000..c0ade94e7a --- /dev/null +++ b/test/bench/regex-dna.go @@ -0,0 +1,117 @@ +/* +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of "The Computer Language Benchmarks Game" nor the + name of "The Computer Language Shootout Benchmarks" nor the names of + its contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +*/ + +/* The Computer Language Benchmarks Game + * http://shootout.alioth.debian.org/ + * + * contributed by The Go Authors. + */ + +package main + +import ( + "fmt"; + "io"; + "os"; + "regexp"; +) + +func compile(s string) *regexp.Regexp { + r, err := regexp.Compile(s); + if err != nil { + fmt.Fprintf(os.Stderr, "can't compile pattern %q: %s\n", s, err); + os.Exit(2); + } + return r; +} + +var variants = []string { + "agggtaaa|tttaccct", + "[cgt]gggtaaa|tttaccc[acg]", + "a[act]ggtaaa|tttacc[agt]t", + "ag[act]gtaaa|tttac[agt]ct", + "agg[act]taaa|ttta[agt]cct", + "aggg[acg]aaa|ttt[cgt]ccct", + "agggt[cgt]aa|tt[acg]accct", + "agggta[cgt]a|t[acg]taccct", + "agggtaa[cgt]|[acg]ttaccct", +} + +type Subst struct { + pat, repl string +} + +var substs = [] Subst { + Subst {"B", "(c|g|t)"}, + Subst {"D", "(a|g|t)"}, + Subst {"H", "(a|c|t)"}, + Subst {"K", "(g|t)"}, + Subst {"M", "(a|c)"}, + Subst {"N", "(a|c|g|t)"}, + Subst {"R", "(a|g)"}, + Subst {"S", "(c|g)"}, + Subst {"V", "(a|c|g)"}, + Subst {"W", "(a|t)"}, + Subst {"Y", "(c|t)"}, +} + +func countMatches(pat, str string) int { + re := compile(pat); + n := 0; + pos := 0; + for { + e := re.Execute(str); + if len(e) == 0 { + break; + } + n++; + str = str[e[1]:len(str)]; + } + return n; +} + +func main() { + bytes, err := io.ReadFile("/dev/stdin"); + if err != nil { + fmt.Fprintf(os.Stderr, "can't read input: %s\n", err); + os.Exit(2); + } + str := string(bytes); + ilen := len(str); + // Delete the comment lines and newlines + str = compile("(>[^\n]+)?\n").ReplaceAll(str, ""); + clen := len(str); + for i, s := range variants { + fmt.Printf("%s %d\n", s, countMatches(s, str)); + } + for i, sub := range substs { + str = compile(sub.pat).ReplaceAll(str, sub.repl); + } + fmt.Printf("\n%d\n%d\n%d\n", ilen, clen, len(str)); +} diff --git a/test/bench/regex-dna.txt b/test/bench/regex-dna.txt new file mode 100644 index 0000000000..d36baa5be8 --- /dev/null +++ b/test/bench/regex-dna.txt @@ -0,0 +1,13 @@ +agggtaaa|tttaccct 0 +[cgt]gggtaaa|tttaccc[acg] 3 +a[act]ggtaaa|tttacc[agt]t 9 +ag[act]gtaaa|tttac[agt]ct 8 +agg[act]taaa|ttta[agt]cct 10 +aggg[acg]aaa|ttt[cgt]ccct 3 +agggt[cgt]aa|tt[acg]accct 4 +agggta[cgt]a|t[acg]taccct 3 +agggtaa[cgt]|[acg]ttaccct 5 + +101745 +100000 +133640 diff --git a/test/bench/timing.log b/test/bench/timing.log index d5ea8db967..20e22b0215 100644 --- a/test/bench/timing.log +++ b/test/bench/timing.log @@ -53,3 +53,8 @@ fannkuch 12 gccgo -O2 fannkuch.go 64.89u 0.00s 64.92r gc fannkuch 124.59u 0.00s 124.67r gc_B fannkuch 91.14u 0.00s 91.16r + +regex-dna 100000 + gcc -O2 regex-dna.c -lpcre 0.92u 0.00s 0.99r + gc regexp-dna 26.94u 0.18s 28.75r + gc_B regexp-dna 26.51u 0.09s 26.75r diff --git a/test/bench/timing.sh b/test/bench/timing.sh index 6ebc32165b..f481b0ac4e 100755 --- a/test/bench/timing.sh +++ b/test/bench/timing.sh @@ -65,9 +65,20 @@ fannkuch() { run 'gc_B fannkuch' $O.out -n 12 } +regexdna() { + gcc -O2 fasta.c + a.out 100000 > x + echo 'regex-dna 100000' + run 'gcc -O2 regex-dna.c -lpcre' a.out