diff options
| -rw-r--r-- | Makefile | 18 | ||||
| -rw-r--r-- | lexer.go | 66 | ||||
| -rw-r--r-- | lexer_test.go | 265 | ||||
| -rw-r--r-- | stack_test.go | 18 |
4 files changed, 344 insertions, 23 deletions
diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..4667b56 --- /dev/null +++ b/Makefile @@ -0,0 +1,18 @@ + +pkgs := $(shell go list ./...) + +.PHONY: lint test clean + +ifdef GOPATH +GO111MODULE=on +endif + +test: lint ## Run tests with coverage + go test -short -cover -coverprofile coverage.out $(pkgs) + go tool cover -html=coverage.out -o coverage.html + +lint: + golint $(pkgs) + +clean: ## Clean all test files + rm -rf coverage* @@ -1,22 +1,31 @@ package lexer import ( + "bytes" "fmt" "strings" "unicode" "unicode/utf8" ) +// StateFunc captures the movement from one state to the next. type StateFunc func(*Lexer) StateFunc +// TokenType identifies the tokens emitted. type TokenType int const ( - EOFRune rune = -1 + // EOFRune is a convenience for EOF + EOFRune rune = -1 + // ErrorToken is returned on error ErrorToken TokenType = -1 - EOFToken TokenType = 0 + // EOFToken is returned on EOF + EOFToken TokenType = 0 ) +var lineSep = []byte{'\n'} + +// Token is returned by the lexer. type Token struct { Type TokenType Value string @@ -24,13 +33,16 @@ type Token struct { Line int } +// String implements Stringer func (t Token) String() string { return fmt.Sprintf("[%d] %s", t.Type, t.Value) } +// Lexer represents the lexer machine. type Lexer struct { source string start int + line int position int lastWidth int startState StateFunc @@ -44,6 +56,7 @@ func New(src string, start StateFunc) *Lexer { source: src, startState: start, start: 0, + line: 1, position: 0, history: newStack(), } @@ -60,6 +73,7 @@ func (l *Lexer) Start() { go l.run() } +// StartSync starts the lexer synchronously. func (l *Lexer) StartSync() { // Take half the string length as a buffer size. 
buffSize := len(l.source) / 2 @@ -75,7 +89,6 @@ func (l *Lexer) run() { for state != nil { state = state(l) } - //fmt.Println("nil state") close(l.tokens) } @@ -91,18 +104,42 @@ func (l *Lexer) Emit(t TokenType) { Type: t, Value: l.Current(), Position: l.position, + Line: l.line, } - //fmt.Printf("emitting: %v\n", tok) l.tokens <- tok + l.checkLines() l.start = l.position l.history.clear() } +func (l *Lexer) checkLines() { + val := l.Current() + l.line += bytes.Count([]byte(val), lineSep) +} + +// Next pulls the next rune from the Lexer and returns it, moving the position +// forward in the source. +func (l *Lexer) Next() rune { + var r rune + var s int + str := l.source[l.position:] + if len(str) == 0 { + r, s = EOFRune, 0 + } else { + r, s = utf8.DecodeRuneInString(str) + } + l.position += s + l.history.push(r) + + return r +} + // Ignore clears the history stack and then sets the current beginning position // to the current position in the source which effectively ignores the section // of the source being analyzed. func (l *Lexer) Ignore() { l.history.clear() + l.checkLines() l.start = l.position } @@ -129,23 +166,6 @@ func (l *Lexer) Backup() { } } -// Next pulls the next rune from the Lexer and returns it, moving the position -// forward in the source. -func (l *Lexer) Next() rune { - var r rune - var s int - str := l.source[l.position:] - if len(str) == 0 { - r, s = EOFRune, 0 - } else { - r, s = utf8.DecodeRuneInString(str) - } - l.position += s - l.history.push(r) - - return r -} - // Accept receives a string containing all acceptable strings and will contine // over each consecutive character in the source until a token not in the given // string is encountered. This should be used to quickly pull token parts. @@ -166,6 +186,7 @@ func (l *Lexer) AcceptRun(valid string) (n int) { return n } +// SkipWhitespace continues over all unicode whitespace. 
func (l *Lexer) SkipWhitespace() { for { r := l.Next() @@ -185,13 +206,12 @@ func (l *Lexer) SkipWhitespace() { // NextToken returns the next token from the lexer and done func (l *Lexer) NextToken() (*Token, bool) { if tok, ok := <-l.tokens; ok { - //fmt.Printf("next token: %v, ok: %t\n", tok, ok) return &tok, false } return nil, true } -func (l *Lexer) ErrorState(format string, args ...interface{}) StateFunc { +func (l *Lexer) Error(format string, args ...interface{}) StateFunc { l.tokens <- Token{ Type: ErrorToken, Value: fmt.Sprintf(format, args...), diff --git a/lexer_test.go b/lexer_test.go new file mode 100644 index 0000000..25da17f --- /dev/null +++ b/lexer_test.go @@ -0,0 +1,265 @@ +package lexer + +import ( + "fmt" + "testing" +) + +const ( + NumberToken TokenType = iota + OpToken + IdentToken +) + +func NumberState(l *Lexer) StateFunc { + l.AcceptRun("0123456789") + l.Emit(NumberToken) + if l.Peek() == '.' { + l.Next() + l.Emit(OpToken) + return IdentState + } + + return nil +} + +func IdentState(l *Lexer) StateFunc { + r := l.Next() + for (r >= 'a' && r <= 'z') || r == '_' { + r = l.Next() + } + l.Backup() + l.Emit(IdentToken) + + return WhitespaceState +} + +func NewlineState(l *Lexer) StateFunc { + l.AcceptRun("0123456789") + l.Emit(NumberToken) + l.SkipWhitespace() + l.Ignore() + l.AcceptRun("0123456789") + l.Emit(NumberToken) + l.SkipWhitespace() + + return nil +} + +func WhitespaceState(l *Lexer) StateFunc { + r := l.Next() + if r == EOFRune { + return nil + } + + if r != ' ' && r != '\t' && r != '\n' && r != '\r' { + l.Error(fmt.Sprintf("unexpected token %q", r)) + return nil + } + + l.Accept(" \t\n\r") + l.Ignore() + + return NumberState +} + +func TestMovingThroughString(t *testing.T) { + l := New("123", nil) + run := []struct { + s string + r rune + }{ + {"1", '1'}, + {"12", '2'}, + {"123", '3'}, + {"123", EOFRune}, + } + + for _, test := range run { + r := l.Next() + if r != test.r { + t.Errorf("Expected %q but got %q", test.r, r) + return + } 
+ + if l.Current() != test.s { + t.Errorf("Expected %q but got %q", test.s, l.Current()) + return + } + } +} + +func TestNumbers(t *testing.T) { + l := New("123", NumberState) + l.Start() + tok, done := l.NextToken() + if done { + t.Error("Expected a token, but lexer was finished") + return + } + + if tok.Type != NumberToken { + t.Errorf("Expected a %v but got %v", NumberToken, tok.Type) + return + } + + if tok.Value != "123" { + t.Errorf("Expected %q but got %q", "123", tok.Value) + return + } + + tok, done = l.NextToken() + if !done { + t.Error("Expected the lexer to be done, but it wasn't.") + return + } + + if tok != nil { + t.Errorf("Expected a nil token, but got %v", *tok) + return + } +} + +func TestNewlines(t *testing.T) { + src := `123 +456 +789` + l := New(src, NewlineState) + l.Start() + tok, done := l.NextToken() + if done { + t.Error("Expected the lexer to not be done, but it was.") + return + } + + if tok.Type != NumberToken { + t.Errorf("Expected a number token but got %v", *tok) + return + } + + if tok.Value != "123" { + t.Errorf("Expected 123 but got %q", tok.Value) + return + } + + if tok.Line != 1 { + t.Fatalf("Expected line 1 but got %d", tok.Line) + } + + tok, done = l.NextToken() + if done { + t.Error("Expected the lexer to not be done, but it was.") + return + } + + if tok.Type != NumberToken { + t.Errorf("Expected a number token but got %v", *tok) + return + } + + if tok.Value != "456" { + t.Errorf("Expected 456 but got %q", tok.Value) + return + } + + if tok.Line != 2 { + t.Fatalf("Expected line 2 but got %d", tok.Line) + } +} + +func TestBackup(t *testing.T) { + l := New("1", nil) + r := l.Next() + if r != '1' { + t.Errorf("Expected %q but got %q", '1', r) + return + } + + if l.Current() != "1" { + t.Errorf("Expected %q but got %q", "1", l.Current()) + return + } + + l.Backup() + if l.Current() != "" { + t.Errorf("Expected empty string, but got %q", l.Current()) + return + } +} + +func TestWhitespace(t *testing.T) { + l := New(" 1", 
NumberState) + l.Start() + l.SkipWhitespace() + + tok, done := l.NextToken() + if done { + t.Fatal("Expected token to be !done, but it was.") + } + + if tok.Type != NumberToken { + t.Fatalf("Expected number token, but got %v", *tok) + } +} + +func TestMultipleTokens(t *testing.T) { + cases := []struct { + tokType TokenType + val string + }{ + {NumberToken, "123"}, + {OpToken, "."}, + {IdentToken, "hello"}, + {NumberToken, "675"}, + {OpToken, "."}, + {IdentToken, "world"}, + } + + l := New("123.hello 675.world", NumberState) + l.Start() + + for _, c := range cases { + tok, done := l.NextToken() + if done { + t.Error("Expected there to be more tokens, but there weren't") + return + } + + if c.tokType != tok.Type { + t.Errorf("Expected token type %v but got %v", c.tokType, tok.Type) + return + } + + if c.val != tok.Value { + t.Errorf("Expected %q but got %q", c.val, tok.Value) + return + } + } + + tok, done := l.NextToken() + if !done { + t.Error("Expected the lexer to be done, but it wasn't.") + return + } + + if tok != nil { + t.Errorf("Did not expect a token, but got %v", *tok) + return + } +} + +func TestError(t *testing.T) { + l := New("notaspace", WhitespaceState) + l.Start() + + tok, done := l.NextToken() + if done { + t.Error("Expected token to be !done, but it was.") + return + } + + if tok.Type != ErrorToken { + t.Errorf("Expected error token, but got %v", *tok) + return + } +} diff --git a/stack_test.go b/stack_test.go new file mode 100644 index 0000000..377ba6d --- /dev/null +++ b/stack_test.go @@ -0,0 +1,18 @@ +package lexer + +import ( + "testing" +) + +func TestStack(t *testing.T) { + s := newStack() + s.push('r') + r := s.pop() + if r != 'r' { + t.Fatalf("Expected r but got %b", r) + } + r = s.pop() + if r != EOFRune { + t.Fatalf("Expected EOFRune but got %b", r) + } +} |
