- indexing component for godoc

author Robert Griesemer <gri@golang.org>

Tue, 27 Oct 2009 01:32:51 +0000 (18:32 -0700)

committer Robert Griesemer <gri@golang.org>

Tue, 27 Oct 2009 01:32:51 +0000 (18:32 -0700)
author Robert Griesemer <gri@golang.org>
Tue, 27 Oct 2009 01:32:51 +0000 (18:32 -0700)
committer Robert Griesemer <gri@golang.org>
Tue, 27 Oct 2009 01:32:51 +0000 (18:32 -0700)
diff --git a/src/cmd/godoc/index.go b/src/cmd/godoc/index.go

new file mode 100644 (file)

index 0000000..1cd6c5c
--- /dev/null
+++ b/src/cmd/godoc/index.go
@@ -0,0 +1,664 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// This file contains the infrastructure to create an
+// (identifier) index for a set of Go files.
+//
+// Basic indexing algorithm:
+// - traverse all .go files of the file tree specified by root
+// - for each word (identifier) encountered, collect all occurences (spots)
+//   into a list; this produces a list of spots for each word
+// - reduce the lists: from a list of spots to a list of FileRuns,
+//   and from a list of FileRuns into a list of PakRuns
+// - make a HitList from the PakRuns
+//
+// Details:
+// - keep two lists per word: one containing package-level declarations
+//   that have snippets, and one containing all other spots
+// - keep the snippets in a separate table indexed by snippet index
+//   and store the snippet index in place of the line number in a SpotInfo
+//   (the line number for spots with snippets is stored in the snippet)
+// - at the end, create lists of alternative spellings for a given
+//   word
+
+package main
+
+import (
+       "container/vector";
+       "go/ast";
+       "go/parser";
+       "go/token";
+       "os";
+       pathutil "path";
+       "sort";
+       "strings";
+)
+
+
+// ----------------------------------------------------------------------------
+// Data structures used during indexing
+
+// A RunList is a vector of entries that can be sorted according to some
+// criteria. A RunList may be compressed by grouping "runs" of entries
+// which are equal (according to the sort critera) into a new RunList of
+// runs. For instance, a RunList containing pairs (x, y) may be compressed
+// into a RunList containing pair runs (x, {y}) where each run consists of
+// a list of y's with the same x.
+type RunList struct {
+       vector.Vector;
+       less    func(x, y interface{}) bool;
+}
+
+func (h *RunList) Less(i, j int) bool {
+       return h.less(h.At(i), h.At(j));
+}
+
+
+func (h *RunList) sort(less func(x, y interface{}) bool) {
+       h.less = less;
+       sort.Sort(h);
+}
+
+
+// Compress entries which are the same according to a sort criteria
+// (specified by less) into "runs".
+func (h *RunList) reduce(less func(x, y interface{}) bool, newRun func(h *RunList, i, j int) interface{}) *RunList {
+       // create runs of entries with equal values
+       h.sort(less);
+
+       // for each run, make a new run object and collect them in a new RunList
+       var hh RunList;
+       i := 0;
+       for j := 0; j < h.Len(); j++ {
+               if less(h.At(i), h.At(j)) {
+                       hh.Push(newRun(h, i, j));
+                       i = j;  // start a new run
+               }
+       }
+       // add final run, if any
+       if i < h.Len() {
+               hh.Push(newRun(h, i, h.Len()));
+       }
+
+       return &hh;
+}
+
+
+// A SpotInfo value describes a particular identifier spot in a given file;
+// It encodes three values: the SpotKind (declaration or use), a line or
+// snippet index "lori", and whether it's a line or index.
+//
+// The following encoding is used:
+//
+//   bits    32   4    1       0
+//   value    [lori|kind|isIndex]
+//
+type SpotInfo uint32
+
+// SpotKind describes whether an identifier is declared (and what kind of
+// declaration) or used.
+type SpotKind uint32
+
+const (
+       ImportDecl      SpotKind        = iota;
+       ConstDecl;
+       TypeDecl;
+       VarDecl;
+       FuncDecl;
+       MethodDecl;
+       Use;
+       nKinds;
+)
+
+
+// makeSpotInfo makes a SpotInfo.
+func makeSpotInfo(kind SpotKind, lori int, isIndex bool) SpotInfo {
+       // encode lori: bits [4..32)
+       x := SpotInfo(lori)<<4;
+       if int(x>>4) != lori {
+               // lori value doesn't fit - since snippet indices are
+               // most certainly always smaller then 1<<28, this can
+               // only happen for line numbers; give it no line number (= 0)
+               x = 0;
+       }
+       // encode kind: bits [1..4)
+       x |= SpotInfo(kind)<<1;
+       // encode isIndex: bit 0
+       if isIndex {
+               x |= 1;
+       }
+       return x;
+}
+
+
+func (x SpotInfo) less(y SpotInfo) bool {
+       return x.Lori() < y.Lori();
+}
+
+
+func (x SpotInfo) Kind() SpotKind {
+       return SpotKind(x>>1&7);
+}
+
+
+func (x SpotInfo) Lori() int {
+       return int(x>>4);
+}
+
+
+func (x SpotInfo) IsIndex() bool {
+       return x&1 != 0;
+}
+
+
+// A Pak describes a Go package.
+type Pak struct {
+       Path    string; // directory name containing the package
+       Name    string; // package name as declared by package clause
+}
+
+
+func (p *Pak) less(q *Pak) bool {
+       return p.Path < q.Path || p.Name < q.Name;
+}
+
+
+// A File describes a Go file.
+type File struct {
+       Path    string; // complete file name
+       Pak     Pak;    // the package to which the file belongs
+}
+
+
+func (f *File) less(g *File) bool {
+       return f.Path < g.Path;
+}
+
+
+// A Spot describes a single occurence of a word.
+type Spot struct {
+       File    *File;
+       Info    SpotInfo;
+}
+
+
+// Spots are sorted by filename.
+func lessSpot(x, y interface{}) bool {
+       return x.(Spot).File.less(y.(Spot).File);
+}
+
+
+// A FileRun describes a run of Spots of a word in a single file.
+type FileRun struct {
+       File    *File;
+       Infos   []SpotInfo;
+}
+
+
+// newFileRun allocates a new *FileRun from the Spot run [i, j) in h.
+func newFileRun(h *RunList, i, j int) interface{} {
+       file := h.At(i).(Spot).File;
+       lines := make([]SpotInfo, j-i);
+       prev := 0;
+       k := 0;
+       for ; i < j; i++ {
+               info := h.At(i).(Spot).Info;
+               // ignore line duplicates
+               // (if lori is a snippet index it is unique - no need to check IsIndex())
+               lori := info.Lori();
+               if lori != prev {
+                       lines[k] = info;
+                       prev = lori;
+                       k++;
+               }
+       }
+       return &FileRun{file, lines[0:k]};
+}
+
+
+// FileRuns are sorted by package.
+func lessFileRun(x, y interface{}) bool {
+       return x.(*FileRun).File.Pak.less(&y.(*FileRun).File.Pak);
+}
+
+
+// A PakRun describes a run of *FileRuns of a package.
+type PakRun struct {
+       Pak     Pak;
+       Files   []*FileRun;
+}
+
+// Sorting support for files within a PakRun.
+func (p *PakRun) Len() int {
+       return len(p.Files);
+}
+func (p *PakRun) Less(i, j int) bool {
+       return p.Files[i].File.less(p.Files[j].File);
+}
+func (p *PakRun) Swap(i, j int) {
+       p.Files[i], p.Files[j] = p.Files[j], p.Files[i];
+}
+
+
+// newPakRun allocates a new *PakRun from the *FileRun run [i, j) in h.
+func newPakRun(h *RunList, i, j int) interface{} {
+       pak := h.At(i).(*FileRun).File.Pak;
+       files := make([]*FileRun, j-i);
+       k := 0;
+       for ; i < j; i++ {
+               files[k] = h.At(i).(*FileRun);
+               k++;
+       }
+       run := &PakRun{pak, files};
+       sort.Sort(run); // files were sorted by package; sort them by file now
+       return run;
+}
+
+
+// PakRuns are sorted by package.
+func lessPakRun(x, y interface{}) bool {
+       return x.(*PakRun).Pak.less(&y.(*PakRun).Pak);
+}
+
+
+// A HitList describes a list of PakRuns.
+type HitList []*PakRun
+
+
+func reduce(h0 *RunList) HitList {
+       // reduce a list of Spots into a list of FileRuns
+       h1 := h0.reduce(lessSpot, newFileRun);
+       // reduce a list of FileRuns into a list of PakRuns
+       h2 := h1.reduce(lessFileRun, newPakRun);
+       // sort the list of PakRuns by package
+       h2.sort(lessPakRun);
+       // create a HitList
+       h := make(HitList, h2.Len());
+       for i := 0; i < h2.Len(); i++ {
+               h[i] = h2.At(i).(*PakRun);
+       }
+       return h;
+}
+
+
+func (h HitList) filter(pakname string) HitList {
+       // determine number of matching packages (most of the time just one)
+       n := 0;
+       for _, p := range h {
+               if p.Pak.Name == pakname {
+                       n++;
+               }
+       }
+       // create filtered HitList
+       hh := make(HitList, n);
+       i := 0;
+       for _, p := range h {
+               if p.Pak.Name == pakname {
+                       hh[i] = p;
+                       i++;
+               }
+       }
+       return hh;
+}
+
+
+type wordPair struct {
+       canon   string; // canonical word spelling (all lowercase)
+       alt     string; // alternative spelling
+}
+
+
+// An AltWords describes a list of alternative spellings for a
+// canonical (all lowercase) spelling of a word.
+type AltWords struct {
+       Canon   string;         // canonical word spelling (all lowercase)
+       Alts    []string;       // alternative spelling for the same word
+}
+
+
+func lessWordPair(x, y interface{}) bool {
+       return x.(*wordPair).canon < y.(*wordPair).canon;
+}
+
+
+// newAltWords allocates a new *AltWords from the *wordPair run [i, j) in h.
+func newAltWords(h *RunList, i, j int) interface{} {
+       canon := h.At(i).(*wordPair).canon;
+       alts := make([]string, j-i);
+       k := 0;
+       for ; i < j; i++ {
+               alts[k] = h.At(i).(*wordPair).alt;
+               k++;
+       }
+       return &AltWords{canon, alts};
+}
+
+
+func (a *AltWords) filter(s string) *AltWords {
+       if len(a.Alts) == 1 && a.Alts[0] == s {
+               // there are no different alternatives
+               return nil;
+       }
+
+       // make a new AltWords with the current spelling removed
+       alts := make([]string, len(a.Alts));
+       i := 0;
+       for _, w := range a.Alts {
+               if w != s {
+                       alts[i] = w;
+                       i++;
+               }
+       }
+       return &AltWords{a.Canon, alts[0:i]};
+}
+
+
+// ----------------------------------------------------------------------------
+// Indexer
+
+type IndexResult struct {
+       Decls   RunList;        // package-level declarations (with snippets)
+       Others  RunList;        // all other occurences
+}
+
+
+// An Indexer maintains the data structures and provides the machinery
+// for indexing .go files under a file tree. It implements the path.Visitor
+// interface for walking file trees, and the ast.Visitor interface for
+// walking Go ASTs.
+type Indexer struct {
+       words           map[string]*IndexResult;        // RunLists of Spots
+       snippets        vector.Vector;                  // vector of *Snippets, indexed by snippet indices
+       file            *File;                          // current file
+       decl            ast.Decl;                       // current decl
+       nspots          int;                            // number of spots encountered
+}
+
+
+func (x *Indexer) addSnippet(s *Snippet) int {
+       index := x.snippets.Len();
+       x.snippets.Push(s);
+       return index;
+}
+
+
+func (x *Indexer) visitComment(c *ast.CommentGroup) {
+       if c != nil {
+               ast.Walk(x, c);
+       }
+}
+
+
+func (x *Indexer) visitIdent(kind SpotKind, id *ast.Ident) {
+       if id != nil {
+               lists, found := x.words[id.Value];
+               if !found {
+                       lists = new(IndexResult);
+                       x.words[id.Value] = lists;
+               }
+
+               if kind == Use || x.decl == nil {
+                       // not a declaration or no snippet required
+                       info := makeSpotInfo(kind, id.Pos().Line, false);
+                       lists.Others.Push(Spot{x.file, info});
+               } else {
+                       // a declaration with snippet
+                       index := x.addSnippet(NewSnippet(x.decl, id));
+                       info := makeSpotInfo(kind, index, true);
+                       lists.Decls.Push(Spot{x.file, info});
+               }
+
+               x.nspots++;
+       }
+}
+
+
+func (x *Indexer) visitSpec(spec ast.Spec, isVarDecl bool) {
+       switch n := spec.(type) {
+       case *ast.ImportSpec:
+               x.visitComment(n.Doc);
+               x.visitIdent(ImportDecl, n.Name);
+               for _, s := range n.Path {
+                       ast.Walk(x, s);
+               }
+               x.visitComment(n.Comment);
+
+       case *ast.ValueSpec:
+               x.visitComment(n.Doc);
+               kind := ConstDecl;
+               if isVarDecl {
+                       kind = VarDecl;
+               }
+               for _, n := range n.Names {
+                       x.visitIdent(kind, n);
+               }
+               ast.Walk(x, n.Type);
+               for _, v := range n.Values {
+                       ast.Walk(x, v);
+               }
+               x.visitComment(n.Comment);
+
+       case *ast.TypeSpec:
+               x.visitComment(n.Doc);
+               x.visitIdent(TypeDecl, n.Name);
+               ast.Walk(x, n.Type);
+               x.visitComment(n.Comment);
+       }
+}
+
+
+func (x *Indexer) Visit(node interface{}) bool {
+       // TODO(gri): methods in interface types are categorized as VarDecl
+       switch n := node.(type) {
+       case *ast.Ident:
+               x.visitIdent(Use, n);
+
+       case *ast.Field:
+               x.decl = nil;   // no snippets for fields
+               x.visitComment(n.Doc);
+               for _, m := range n.Names {
+                       x.visitIdent(VarDecl, m);
+               }
+               ast.Walk(x, n.Type);
+               for _, s := range n.Tag {
+                       ast.Walk(x, s);
+               }
+               x.visitComment(n.Comment);
+
+       case *ast.DeclStmt:
+               if decl, ok := n.Decl.(*ast.GenDecl); ok {
+                       // local declarations can only be *ast.GenDecls
+                       x.decl = nil;   // no snippets for local declarations
+                       x.visitComment(decl.Doc);
+                       for _, s := range decl.Specs {
+                               x.visitSpec(s, decl.Tok == token.VAR);
+                       }
+               } else {
+                       // handle error case gracefully
+                       ast.Walk(x, n.Decl);
+               }
+
+       case *ast.GenDecl:
+               x.decl = n;
+               x.visitComment(n.Doc);
+               for _, s := range n.Specs {
+                       x.visitSpec(s, n.Tok == token.VAR);
+               }
+
+       case *ast.FuncDecl:
+               x.visitComment(n.Doc);
+               kind := FuncDecl;
+               if n.Recv != nil {
+                       kind = MethodDecl;
+                       ast.Walk(x, n.Recv);
+               }
+               x.decl = n;
+               x.visitIdent(kind, n.Name);
+               ast.Walk(x, n.Type);
+               if n.Body != nil {
+                       ast.Walk(x, n.Type);
+               }
+
+       default:
+               return true;
+       }
+
+       return false;
+}
+
+
+func (x *Indexer) VisitDir(path string, d *os.Dir) bool {
+       return true;
+}
+
+
+func (x *Indexer) VisitFile(path string, d *os.Dir) {
+       if !isGoFile(d) {
+               return;
+       }
+
+       file, err := parser.ParseFile(path, nil, parser.ParseComments);
+       if err != nil {
+               return; // ignore files with (parse) errors
+       }
+
+       dir, _ := pathutil.Split(path);
+       pak := Pak{dir, file.Name.Value};
+       x.file = &File{path, pak};
+       ast.Walk(x, file);
+}
+
+
+// ----------------------------------------------------------------------------
+// Index
+
+type LookupResult struct {
+       Decls   HitList;        // package-level declarations (with snippets)
+       Others  HitList;        // all other occurences
+}
+
+
+type Index struct {
+       words           map[string]*LookupResult;       // maps words to hit lists
+       alts            map[string]*AltWords;           // maps canonical(words) to lists of alternative spellings
+       snippets        []*Snippet;                     // all snippets, indexed by snippet index
+       nspots          int;                            // number of spots indexed (a measure of the index size)
+}
+
+
+func canonical(w string) string {
+       return strings.ToLower(w);
+}
+
+
+// NewIndex creates a new index for the file tree rooted at root.
+func NewIndex(root string) *Index {
+       var x Indexer;
+
+       // initialize Indexer
+       x.words = make(map[string]*IndexResult);
+
+       // collect all Spots
+       pathutil.Walk(root, &x, nil);
+
+       // for each word, reduce the RunLists into a LookupResult;
+       // also collect the word with its canonical spelling in a
+       // word list for later computation of alternative spellings
+       words := make(map[string]*LookupResult);
+       var wlist RunList;
+       for w, h := range x.words {
+               decls := reduce(&h.Decls);
+               others := reduce(&h.Others);
+               words[w] = &LookupResult{
+                       Decls: decls,
+                       Others: others,
+               };
+               wlist.Push(&wordPair{canonical(w), w});
+       }
+
+       // reduce the word list {canonical(w), w} into
+       // a list of AltWords runs {canonical(w), {w}}
+       alist := wlist.reduce(lessWordPair, newAltWords);
+
+       // convert alist into a map of alternative spellings
+       alts := make(map[string]*AltWords);
+       for i := 0; i < alist.Len(); i++ {
+               a := alist.At(i).(*AltWords);
+               alts[a.Canon] = a;
+       }
+
+       // convert snippet vector into a list
+       snippets := make([]*Snippet, x.snippets.Len());
+       for i := 0; i < x.snippets.Len(); i++ {
+               snippets[i] = x.snippets.At(i).(*Snippet);
+       }
+
+       return &Index{words, alts, snippets, x.nspots};
+}
+
+
+// Size returns the number of different words and
+// spots indexed as a measure for the index size.
+func (x *Index) Size() (nwords int, nspots int) {
+       return len(x.words), x.nspots;
+}
+
+
+func (x *Index) LookupWord(w string) (match *LookupResult, alt *AltWords) {
+       match, _ = x.words[w];
+       alt, _ = x.alts[canonical(w)];
+       // remove current spelling from alternatives
+       // (if there is no match, the alternatives do
+       // not contain the current spelling)
+       if match != nil && alt != nil {
+               alt = alt.filter(w);
+       }
+       return;
+}
+
+
+// For a given string s, which is either a single identifier or a qualified
+// identifier, Lookup returns a LookupResult, and a list of alternative
+// spellings, if any.
+func (x *Index) Lookup(s string) (match *LookupResult, alt *AltWords) {
+       ss := strings.Split(s, ".", 0);
+
+       switch len(ss) {
+       case 1:
+               match, alt = x.LookupWord(ss[0]);
+
+       case 2:
+               pakname := ss[0];
+               match, alt = x.LookupWord(ss[1]);
+               if match != nil {
+                       // found a match - filter by package name
+                       decls := match.Decls.filter(pakname);
+                       others := match.Others.filter(pakname);
+                       match = &LookupResult{decls, others};
+               }
+               if alt != nil {
+                       // alternative spellings found - add package name
+                       // TODO(gri): At the moment this is not very smart
+                       // and likely will produce suggestions that have
+                       // no match. Should filter incorrect alternatives.
+                       canon := pakname + "." + alt.Canon;     // for completeness (currently not used)
+                       alts := make([]string, len(alt.Alts));
+                       for i, a := range alt.Alts {
+                               alts[i] = pakname+"."+a;
+                       }
+                       alt = &AltWords{canon, alts};
+               }
+       }
+
+       return;
+}
+
+
+func (x *Index) Snippet(i int) *Snippet {
+       // handle illegal snippet indices gracefully
+       if 0 <= i && i < len(x.snippets) {
+               return x.snippets[i];
+       }
+       return nil;
+}
author	Robert Griesemer <gri@golang.org>
	Tue, 27 Oct 2009 01:32:51 +0000 (18:32 -0700)
committer	Robert Griesemer <gri@golang.org>
	Tue, 27 Oct 2009 01:32:51 +0000 (18:32 -0700)