--- /dev/null
+/*
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ * Neither the name of "The Computer Language Benchmarks Game" nor the
+ name of "The Computer Language Shootout Benchmarks" nor the names of
+ its contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/*
+** The Computer Language Shootout
+** http://shootout.alioth.debian.org/
+** contributed by Mike Pall
+**
+** regex-dna benchmark using PCRE
+**
+** compile with:
+** gcc -O3 -fomit-frame-pointer -o regexdna regexdna.c -lpcre
+*/
+
+#define __USE_STRING_INLINES
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <pcre.h>
+
+typedef struct fbuf {
+ char *buf;
+ size_t size, len;
+} fbuf_t;
+
+static void fb_init(fbuf_t *b)
+{
+ b->buf = NULL;
+ b->len = b->size = 0;
+}
+
+static char *fb_need(fbuf_t *b, size_t need)
+{
+ need += b->len;
+ if (need > b->size) {
+ if (b->size == 0) b->size = need;
+ else while (need > b->size) b->size += b->size;
+ if (!(b->buf = realloc(b->buf, b->size))) exit(1);
+ }
+ return b->buf+b->len;
+}
+
+#define FB_MINREAD (3<<16)
+
+/* Read all of a stdio stream into dst buffer. */
+static size_t fb_readall(fbuf_t *dst, FILE *fp)
+{
+ char *dp;
+ int n;
+ for (dp = fb_need(dst, FB_MINREAD);
+ (n = fread(dp, 1, dst->size-dst->len, fp)) > 0;
+ dp = fb_need(dst, FB_MINREAD)) dst->len += n;
+ if (ferror(fp)) exit(1);
+ return dst->len;
+}
+
+/* Substitute pattern p with replacement r, copying from src to dst buffer. */
+static size_t fb_subst(fbuf_t *dst, fbuf_t *src, const char *p, const char *r)
+{
+ pcre *re;
+ pcre_extra *re_ex;
+ const char *re_e;
+ char *dp;
+ int re_eo, m[3], pos, rlen, clen;
+ if (!(re = pcre_compile(p, PCRE_CASELESS, &re_e, &re_eo, NULL))) exit(1);
+ re_ex = pcre_study(re, 0, &re_e);
+ for (dst->len = 0, rlen = strlen(r), pos = 0;
+ pcre_exec(re, re_ex, src->buf, src->len, pos, 0, m, 3) >= 0;
+ pos = m[1]) {
+ clen = m[0]-pos;
+ dp = fb_need(dst, clen+rlen);
+ dst->len += clen+rlen;
+ memcpy(dp, src->buf+pos, clen);
+ memcpy(dp+clen, r, rlen);
+ }
+ clen = src->len-pos;
+ dp = fb_need(dst, clen);
+ dst->len += clen;
+ memcpy(dp, src->buf+pos, clen);
+ return dst->len;
+}
+
+/* Count all matches with pattern p in src buffer. */
+static int fb_countmatches(fbuf_t *src, const char *p)
+{
+ pcre *re;
+ pcre_extra *re_ex;
+ const char *re_e;
+ int re_eo, m[3], pos, count;
+ if (!(re = pcre_compile(p, PCRE_CASELESS, &re_e, &re_eo, NULL))) exit(1);
+ re_ex = pcre_study(re, 0, &re_e);
+ for (count = 0, pos = 0;
+ pcre_exec(re, re_ex, src->buf, src->len, pos, 0, m, 3) >= 0;
+ pos = m[1]) count++;
+ return count;
+}
+
+static const char *variants[] = {
+ "agggtaaa|tttaccct", "[cgt]gggtaaa|tttaccc[acg]",
+ "a[act]ggtaaa|tttacc[agt]t", "ag[act]gtaaa|tttac[agt]ct",
+ "agg[act]taaa|ttta[agt]cct", "aggg[acg]aaa|ttt[cgt]ccct",
+ "agggt[cgt]aa|tt[acg]accct", "agggta[cgt]a|t[acg]taccct",
+ "agggtaa[cgt]|[acg]ttaccct", NULL
+};
+
+static const char *subst[] = {
+ "B", "(c|g|t)", "D", "(a|g|t)", "H", "(a|c|t)", "K", "(g|t)",
+ "M", "(a|c)", "N", "(a|c|g|t)", "R", "(a|g)", "S", "(c|g)",
+ "V", "(a|c|g)", "W", "(a|t)", "Y", "(c|t)", NULL
+};
+
+int main(int argc, char **argv)
+{
+ fbuf_t seq[2];
+ const char **pp;
+ size_t ilen, clen, slen;
+ int flip;
+ fb_init(&seq[0]);
+ fb_init(&seq[1]);
+ ilen = fb_readall(&seq[0], stdin);
+ clen = fb_subst(&seq[1], &seq[0], ">.*|\n", "");
+ for (pp = variants; *pp; pp++)
+ printf("%s %d\n", *pp, fb_countmatches(&seq[1], *pp));
+ for (slen = 0, flip = 1, pp = subst; *pp; pp += 2, flip = 1-flip)
+ slen = fb_subst(&seq[1-flip], &seq[flip], *pp, pp[1]);
+ printf("\n%zu\n%zu\n%zu\n", ilen, clen, slen);
+ return 0;
+}
--- /dev/null
+/*
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ * Neither the name of "The Computer Language Benchmarks Game" nor the
+ name of "The Computer Language Shootout Benchmarks" nor the names of
+ its contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/* The Computer Language Benchmarks Game
+ * http://shootout.alioth.debian.org/
+ *
+ * contributed by The Go Authors.
+ */
+
+package main
+
+import (
+ "fmt";
+ "io";
+ "os";
+ "regexp";
+)
+
+func compile(s string) *regexp.Regexp {
+ r, err := regexp.Compile(s);
+ if err != nil {
+ fmt.Fprintf(os.Stderr, "can't compile pattern %q: %s\n", s, err);
+ os.Exit(2);
+ }
+ return r;
+}
+
+var variants = []string {
+ "agggtaaa|tttaccct",
+ "[cgt]gggtaaa|tttaccc[acg]",
+ "a[act]ggtaaa|tttacc[agt]t",
+ "ag[act]gtaaa|tttac[agt]ct",
+ "agg[act]taaa|ttta[agt]cct",
+ "aggg[acg]aaa|ttt[cgt]ccct",
+ "agggt[cgt]aa|tt[acg]accct",
+ "agggta[cgt]a|t[acg]taccct",
+ "agggtaa[cgt]|[acg]ttaccct",
+}
+
+type Subst struct {
+ pat, repl string
+}
+
+var substs = [] Subst {
+ Subst {"B", "(c|g|t)"},
+ Subst {"D", "(a|g|t)"},
+ Subst {"H", "(a|c|t)"},
+ Subst {"K", "(g|t)"},
+ Subst {"M", "(a|c)"},
+ Subst {"N", "(a|c|g|t)"},
+ Subst {"R", "(a|g)"},
+ Subst {"S", "(c|g)"},
+ Subst {"V", "(a|c|g)"},
+ Subst {"W", "(a|t)"},
+ Subst {"Y", "(c|t)"},
+}
+
+func countMatches(pat, str string) int {
+ re := compile(pat);
+ n := 0;
+ pos := 0;
+ for {
+ e := re.Execute(str);
+ if len(e) == 0 {
+ break;
+ }
+ n++;
+ str = str[e[1]:len(str)];
+ }
+ return n;
+}
+
+func main() {
+ bytes, err := io.ReadFile("/dev/stdin");
+ if err != nil {
+ fmt.Fprintf(os.Stderr, "can't read input: %s\n", err);
+ os.Exit(2);
+ }
+ str := string(bytes);
+ ilen := len(str);
+ // Delete the comment lines and newlines
+ str = compile("(>[^\n]+)?\n").ReplaceAll(str, "");
+ clen := len(str);
+ for i, s := range variants {
+ fmt.Printf("%s %d\n", s, countMatches(s, str));
+ }
+ for i, sub := range substs {
+ str = compile(sub.pat).ReplaceAll(str, sub.repl);
+ }
+ fmt.Printf("\n%d\n%d\n%d\n", ilen, clen, len(str));
+}