From: Robert Griesemer <gri@golang.org>
Date: Thu, 3 Jul 2008 00:02:55 +0000 (-0700)
Subject: - implemented first cut at Go scanner in Go
X-Git-Tag: weekly.2009-11-06~3575
X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=85728a2da7bc610c25087486e3ec2c23ccdf97a0;p=gostls13.git

- implemented first cut at Go scanner in Go

SVN=125785
---

diff --git a/usr/gri/src/scanner.go b/usr/gri/src/scanner.go
new file mode 100644
index 0000000000..ee8a3c929c
--- /dev/null
+++ b/usr/gri/src/scanner.go
@@ -0,0 +1,589 @@
+// Copyright 2009 The Go Authors.  All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package Scanner
+
+// Token values returned by Scan; of these, only EOF is named in the export statement below.
+export EOF;
+const (
+	ILLEGAL = iota;  // what Scan reports for a character that starts no token
+	EOF = iota;  // end of source (look-ahead is -1)
+	IDENT = iota;
+	STRING = iota;  // "..." and `...` literals
+	NUMBER = iota;  // digit sequences; Scan also reports '...' character literals as NUMBER
+
+	COMMA = iota;
+	COLON = iota;
+	SEMICOLON = iota;
+	PERIOD = iota;
+
+	LPAREN = iota;
+	RPAREN = iota;
+	LBRACK = iota;
+	RBRACK = iota;
+	LBRACE = iota;
+	RBRACE = iota;
+	
+	ASSIGN = iota;
+	DEFINE = iota;
+	
+	INC = iota;
+	DEC = iota;
+	NOT = iota;
+	
+	OR = iota;
+	BOR = iota;
+	AND = iota;
+	BAND = iota;
+	
+	ADD = iota;
+	SUB = iota;
+	MUL = iota;
+	QUO = iota;
+	REM = iota;
+	
+	EQL = iota;
+	NEQ = iota;
+	LSS = iota;
+	LEQ = iota;
+	GTR = iota;
+	GEQ = iota;
+	
+	// keywords; Init enters each of them into the Keywords table
+	BREAK = iota;
+	CASE = iota;
+	CONST = iota;
+	CONTINUE = iota;
+	DEFAULT = iota;
+	ELSE = iota;
+	EXPORT = iota;
+	FALLTHROUGH = iota;
+	FALSE = iota;
+	FOR = iota;
+	FUNC = iota;
+	GO = iota;
+	GOTO = iota;
+	IF = iota;
+	IMPORT = iota;
+	INTERFACE = iota;
+	MAP = iota;
+	NEW = iota;
+	NIL = iota;
+	PACKAGE = iota;
+	RANGE = iota;
+	RETURN = iota;
+	SELECT = iota;
+	STRUCT = iota;
+	SWITCH = iota;
+	TRUE = iota;
+	TYPE = iota;
+	VAR = iota;
+)
+
+// Keywords maps each keyword's spelling to its token; Init allocates and fills it.
+var (
+	Keywords *map [string] int;
+)
+
+
+// TokenName returns the name of the token constant tok, or "???" if tok is none of them.
+export TokenName
+func TokenName(tok int) string {
+	switch (tok) {
+	case ILLEGAL: return "ILLEGAL";
+	case EOF: return "EOF";
+	case IDENT: return "IDENT";
+	case STRING: return "STRING";
+	case NUMBER: return "NUMBER";
+
+	case COMMA: return "COMMA";
+	case COLON: return "COLON";
+	case SEMICOLON: return "SEMICOLON";
+	case PERIOD: return "PERIOD";
+
+	case LPAREN: return "LPAREN";
+	case RPAREN: return "RPAREN";
+	case LBRACK: return "LBRACK";
+	case RBRACK: return "RBRACK";
+	case LBRACE: return "LBRACE";
+	case RBRACE: return "RBRACE";
+
+	case ASSIGN: return "ASSIGN";
+	case DEFINE: return "DEFINE";
+	
+	case INC: return "INC";
+	case DEC: return "DEC";
+	case NOT: return "NOT";
+	
+	case OR: return "OR";
+	case BOR: return "BOR";
+	case AND: return "AND";
+	case BAND: return "BAND";
+	
+	case ADD: return "ADD";
+	case SUB: return "SUB";
+	case MUL: return "MUL";
+	case QUO: return "QUO";
+	case REM: return "REM";
+	
+	case EQL: return "EQL";
+	case NEQ: return "NEQ";
+	case LSS: return "LSS";
+	case LEQ: return "LEQ";
+	case GTR: return "GTR";
+	case GEQ: return "GEQ";
+
+	case BREAK: return "BREAK";
+	case CASE: return "CASE";
+	case CONST: return "CONST";
+	case CONTINUE: return "CONTINUE";
+	case DEFAULT: return "DEFAULT";
+	case ELSE: return "ELSE";
+	case EXPORT: return "EXPORT";
+	case FALLTHROUGH: return "FALLTHROUGH";
+	case FALSE: return "FALSE";
+	case FOR: return "FOR";
+	case FUNC: return "FUNC";
+	case GO: return "GO";
+	case GOTO: return "GOTO";
+	case IF: return "IF";
+	case IMPORT: return "IMPORT";
+	case INTERFACE: return "INTERFACE";
+	case MAP: return "MAP";
+	case NEW: return "NEW";
+	case NIL: return "NIL";
+	case PACKAGE: return "PACKAGE";
+	case RANGE: return "RANGE";
+	case RETURN: return "RETURN";
+	case SELECT: return "SELECT";
+	case STRUCT: return "STRUCT";
+	case SWITCH: return "SWITCH";
+	case TRUE: return "TRUE";
+	case TYPE: return "TYPE";
+	case VAR: return "VAR";
+	}
+	
+	return "???";
+}
+
+// is_whitespace reports whether ch is a space, carriage return, newline or tab.
+func is_whitespace (ch int) bool {
+	return ch == ' ' || ch == '\r' || ch == '\n' || ch == '\t';
+}
+
+// is_letter reports whether ch is an ASCII letter, '_', or any value >= 128 (accepted without decoding).
+func is_letter (ch int) bool {
+	return 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z' || ch == '_' || ch >= 128 ;
+}
+
+// is_oct_digit reports whether ch is one of the digits '0' through '7'.
+func is_oct_digit (ch int) bool {
+	return '0' <= ch && ch <= '7';
+}
+
+// is_dec_digit reports whether ch is one of the digits '0' through '9'.
+func is_dec_digit (ch int) bool {
+	return '0' <= ch && ch <= '9';
+}
+
+// is_hex_digit reports whether ch is a decimal digit or a letter 'a'-'f' / 'A'-'F'.
+func is_hex_digit (ch int) bool {
+	return '0' <= ch && ch <= '9' || 'a' <= ch && ch <= 'f' || 'A' <= ch && ch <= 'F';
+}
+
+// Scanner holds the scanning state for one source string; Open initializes it.
+export Scanner
+type Scanner struct {
+	src string;  // the source text being scanned
+	pos int;  // offset in src of the next unread byte; nothing is left once pos >= len(src)
+	ch int;  // one char look-ahead; Next sets it to -1 at end of source
+}
+
+// Next loads the next byte of the source into S.ch, or -1 once the source is exhausted.
+func (S *Scanner) Next () {
+	src := S.src;  // TODO only needed because of 6g bug
+	if S.pos < len(src) {
+		S.ch = int(S.src[S.pos]);
+		S.pos++;
+		if (S.ch >= 128) {
+			panic "UTF-8 not handled"
+		}
+	} else {
+		S.ch = -1; S.pos = len(src) + 1;  // EOF sits at offset len(src), so S.pos - 1 stays the look-ahead offset
+	}
+}
+
+// Init allocates the Keywords map and enters every keyword with its token.
+func Init () {
+	Keywords = new(map [string] int);
+
+	Keywords["break"] = BREAK;
+	Keywords["case"] = CASE;
+	Keywords["const"] = CONST;
+	Keywords["continue"] = CONTINUE;
+	Keywords["default"] = DEFAULT;
+	Keywords["else"] = ELSE;
+	Keywords["export"] = EXPORT;
+	Keywords["fallthrough"] = FALLTHROUGH;
+	Keywords["false"] = FALSE;
+	Keywords["for"] = FOR;
+	Keywords["func"] = FUNC;
+	Keywords["go"] = GO;
+	Keywords["goto"] = GOTO;
+	Keywords["if"] = IF;
+	Keywords["import"] = IMPORT;
+	Keywords["interface"] = INTERFACE;
+	Keywords["map"] = MAP;
+	Keywords["new"] = NEW;
+	Keywords["nil"] = NIL;
+	Keywords["package"] = PACKAGE;
+	Keywords["range"] = RANGE;
+	Keywords["return"] = RETURN;
+	Keywords["select"] = SELECT;
+	Keywords["struct"] = STRUCT;
+	Keywords["switch"] = SWITCH;
+	Keywords["true"] = TRUE;
+	Keywords["type"] = TYPE;
+	Keywords["var"] = VAR;
+}
+
+// Open prepares S to scan src: it calls Init if Keywords is still nil, then loads the first character.
+func (S *Scanner) Open (src string) {
+	if Keywords == nil {
+		Init();
+	}
+
+	S.src = src;
+	S.pos = 0;
+	S.Next();  // prime the one-character look-ahead
+}
+
+// SkipWhitespace advances past consecutive characters for which is_whitespace is true.
+func (S *Scanner) SkipWhitespace () {
+	for is_whitespace(S.ch) {
+		S.Next();
+	}
+}
+
+// SkipComment skips a comment; Scan calls it after the leading '/', with S.ch on the following '/' or '*'.
+func (S *Scanner) SkipComment () {
+	if S.ch == '/' {
+		// line comment: advance until '\n' (left unconsumed) or end of source
+		for S.Next(); S.ch != '\n' && S.ch >= 0; S.Next() {}
+		
+	} else {
+		// block comment: advance until the closing star-slash pair has been consumed
+		for S.Next(); S.ch >= 0; {
+			c := S.ch;  // previous character, paired with the new look-ahead below
+			S.Next();
+			if c == '*' && S.ch == '/' {
+				S.Next();
+				return;
+			}
+		}
+		panic "comment not terminated";
+	}
+}
+
+// ScanIdentifier consumes letters and digits and returns the keyword token for that text, or IDENT.
+func (S *Scanner) ScanIdentifier () int {
+	beg := S.pos - 1;
+	for is_letter(S.ch) || is_dec_digit(S.ch) {
+		S.Next();
+	}
+	end := S.pos - 1;
+	
+	var tok int;
+	var present bool;
+	tok, present = Keywords[S.src[beg : end]];  // look the scanned text up in the keyword table
+	if !present {
+		tok = IDENT;
+	}
+	
+	return tok;
+}
+
+// ScanNumber consumes a run of decimal digits; no other number syntax is handled here yet.
+func (S *Scanner) ScanNumber () {
+	// TODO complete this routine
+	
+	for is_dec_digit(S.ch) {
+		S.Next();
+	}
+}
+
+// ScanOctDigits consumes exactly n octal digits starting at S.ch and panics on any other character.
+func (S *Scanner) ScanOctDigits(n int) {
+	for ; n > 0; n-- {
+		if !is_oct_digit(S.ch) {
+			panic "illegal char escape";
+		}
+		S.Next();
+	}
+}
+
+// ScanHexDigits consumes exactly n hexadecimal digits starting at S.ch and panics on any other character.
+func (S *Scanner) ScanHexDigits(n int) {
+	for ; n > 0; n-- {
+		if !is_hex_digit(S.ch) {
+			panic "illegal char escape";
+		}
+		S.Next();
+	}
+}
+
+// ScanEscape consumes an escape sequence; ScanChar calls it after the backslash, with S.ch on the next character.
+func (S *Scanner) ScanEscape () {
+	// TODO: fix this routine
+	
+	switch (S.ch) {
+	case 'a': fallthrough;
+	case 'b': fallthrough;
+	case 'f': fallthrough;
+	case 'n': fallthrough;
+	case 'r': fallthrough;
+	case 't': fallthrough;
+	case 'v': fallthrough;
+	case '\\': fallthrough;
+	case '\'': fallthrough;
+	case '"':
+		S.Next();  // single-character escape
+		
+	case '0', '1', '2', '3', '4', '5', '6', '7':
+		S.ScanOctDigits(3);  // the current digit counts as the first of the three
+		
+	case 'x':
+		S.Next();  // skip the 'x'
+		S.ScanHexDigits(2);
+		
+	case 'u':
+		S.Next();  // skip the 'u'
+		S.ScanHexDigits(4);
+
+	case 'U':
+		S.Next();  // skip the 'U'
+		S.ScanHexDigits(8);
+
+	default:
+		panic "illegal char escape";
+	}
+}
+
+// ScanChar consumes a character literal; Scan calls it with S.ch on the opening quote.
+func (S *Scanner) ScanChar () {
+	S.Next();  // consume the opening '\''
+
+	if (S.ch == '\\') {
+		S.Next();  // consume the backslash
+		S.ScanEscape();
+	} else {
+		S.Next();  // consume the single unescaped character
+	}
+
+	if S.ch == '\'' {
+		S.Next();  // consume the closing quote
+	} else {
+		panic "char not terminated";
+	}
+}
+
+// ScanString consumes a "..." literal through its closing quote; escapes are not interpreted, so \" also ends it.
+func (S *Scanner) ScanString () {
+	for S.Next(); S.ch != '"'; S.Next() {
+		if S.ch == '\n' || S.ch < 0 {  // newline or end of source before the closing quote
+			panic "string not terminated";
+		}
+	}
+	S.Next();  // consume the closing quote
+}
+
+// ScanRawString consumes a `...` literal through its closing back quote; a newline inside it panics.
+func (S *Scanner) ScanRawString () {
+	for S.Next(); S.ch != '`'; S.Next() {
+		if S.ch == '\n' || S.ch < 0 {  // newline or end of source before the closing back quote
+			panic "string not terminated";
+		}
+	}
+	S.Next();  // consume the closing back quote
+}
+
+
+// Scan skips whitespace and comments, then consumes one token and returns it as tok.
+// beg and end are the values of S.pos - 1 before and after the token is consumed;
+// callers slice the source with them to get the token text. A character that starts
+// no token is consumed and reported as ILLEGAL; at end of source Scan returns EOF.
+func (S *Scanner) Scan () (tok, beg, end int) {
+	S.SkipWhitespace();
+	
+	var tok int = ILLEGAL;
+	var beg int = S.pos - 1;
+	var end int = beg;
+	
+	if is_letter(S.ch) {
+		tok = S.ScanIdentifier();
+
+	} else if is_dec_digit(S.ch) {
+		S.ScanNumber();
+		tok = NUMBER;
+
+	} else {
+		switch S.ch {
+			case -1:
+				tok = EOF;
+				
+			case '/':
+				S.Next();
+				if S.ch == '/' || S.ch == '*' {
+					// a comment is not a token: skip it and scan again
+					S.SkipComment();
+					tok, beg, end = S.Scan();
+					return tok, beg, end;
+				} else {
+					tok = QUO;
+				}
+				
+			case '"':
+				S.ScanString();
+				tok = STRING;
+				
+			case '\'':
+				S.ScanChar();
+				tok = NUMBER;  // character literals are reported as NUMBER
+				
+			case '`':
+				S.ScanRawString();
+				tok = STRING;
+				
+			case ':':
+				S.Next();
+				if (S.ch == '=') {
+					S.Next();
+					tok = DEFINE;
+				} else {
+					tok = COLON;
+				}
+				
+			case '.':
+				S.Next();
+				tok = PERIOD;
+				
+			case ',':
+				S.Next();
+				tok = COMMA;
+				
+			case '+':
+				S.Next();
+				if (S.ch == '+') {
+					S.Next();
+					tok = INC;
+				} else {
+					tok = ADD;
+				}
+				
+			case '-':
+				S.Next();
+				if (S.ch == '-') {
+					S.Next();
+					tok = DEC;
+				} else {
+					tok = SUB;
+				}
+				
+			case '*':
+				S.Next();
+				tok = MUL;
+
+			case '%':
+				S.Next();
+				tok = REM;
+
+			case '<':
+				S.Next();
+				if (S.ch == '=') {
+					S.Next();
+					tok = LEQ;
+				} else {
+					tok = LSS;
+				}
+				
+			case '>':
+				S.Next();
+				if (S.ch == '=') {
+					S.Next();
+					tok = GEQ;
+				} else {
+					tok = GTR;
+				}
+				
+			case '=':
+				S.Next();
+				if (S.ch == '=') {
+					S.Next();
+					tok = EQL;
+				} else {
+					tok = ASSIGN;
+				}
+				
+			case '!':
+				S.Next();
+				if (S.ch == '=') {
+					S.Next();
+					tok = NEQ;
+				} else {
+					tok = NOT;
+				}
+				
+			case ';':
+				S.Next();
+				tok = SEMICOLON;
+				
+			case '(':
+				S.Next();
+				tok = LPAREN;
+				
+			case ')':
+				S.Next();
+				tok = RPAREN;
+				
+			case '[':
+				S.Next();
+				tok = LBRACK;
+				
+			case ']':
+				S.Next();
+				tok = RBRACK;
+				
+			case '{':
+				S.Next();
+				tok = LBRACE;
+				
+			case '}':
+				S.Next();
+				tok = RBRACE;
+				
+			case '&':
+				S.Next();
+				if S.ch == '&' {
+					S.Next();
+					tok = AND;
+				} else {
+					tok = BAND;
+				}
+				
+			case '|':
+				S.Next();
+				if S.ch == '|' {
+					S.Next();
+					tok = OR;
+				} else {
+					tok = BOR;
+				}
+				
+			default:
+				S.Next();  // make progress
+		}
+	}
+	
+	end = S.pos - 1;
+	return tok, beg, end;
+}
diff --git a/usr/gri/src/test_scanner.go b/usr/gri/src/test_scanner.go
new file mode 100644
index 0000000000..6f6a922488
--- /dev/null
+++ b/usr/gri/src/test_scanner.go
@@ -0,0 +1,36 @@
+// Copyright 2009 The Go Authors.  All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package main
+
+import Scanner "scanner"
+
+// Scan tokenizes src and prints one line per token (its name and source text), stopping after EOF.
+func Scan(src string) {
+	S := new(Scanner.Scanner);
+	S.Open(src);
+	for {
+		var tok, beg, end int;
+		tok, beg, end = S.Scan();
+		print Scanner.TokenName(tok), "\t ", src[beg : end], "\n";
+		if tok == Scanner.EOF {  // the EOF token is printed too, then the loop ends
+			return;
+		}
+	}
+}
+
+// main scans every file named on the command line and reports the ones that cannot be read.
+func main() {
+	for i := 1; i < sys.argc(); i++ {  // starts at 1, so argument 0 is not scanned
+		var src string;
+		var ok bool;
+		src, ok = sys.readfile(sys.argv(i));
+		if ok {
+			print "scanning " + sys.argv(i) + "\n";
+			Scan(src);
+		} else {
+			print "error: cannot read " + sys.argv(i) + "\n";
+		}
+	}
+}