aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Makefile18
-rw-r--r--lexer.go66
-rw-r--r--lexer_test.go265
-rw-r--r--stack_test.go18
4 files changed, 344 insertions, 23 deletions
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..4667b56
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,18 @@
+
+pkgs := $(shell go list ./...)
+
+.PHONY: lint test clean
+
+ifdef GOPATH
+GO111MODULE=on
+endif
+
+test: lint ## Run tests with coverage
+ go test -short -cover -coverprofile coverage.out $(pkgs)
+ go tool cover -html=coverage.out -o coverage.html
+
+lint:
+ golint $(pkgs)
+
+clean: ## Clean all test files
+ rm -rf coverage*
diff --git a/lexer.go b/lexer.go
index 71ca1f4..1cba323 100644
--- a/lexer.go
+++ b/lexer.go
@@ -1,22 +1,31 @@
package lexer
import (
+ "bytes"
"fmt"
"strings"
"unicode"
"unicode/utf8"
)
+// StateFunc captures the movement from one state to the next.
type StateFunc func(*Lexer) StateFunc
+// TokenType identifies the tokens emitted.
type TokenType int
const (
- EOFRune rune = -1
+ // EOFRune is a convenience for EOF
+ EOFRune rune = -1
+ // ErrorToken is returned on error
ErrorToken TokenType = -1
- EOFToken TokenType = 0
+	// EOFToken is returned on EOF
+ EOFToken TokenType = 0
)
+var lineSep = []byte{'\n'}
+
+// Token is returned by the lexer.
type Token struct {
Type TokenType
Value string
@@ -24,13 +33,16 @@ type Token struct {
Line int
}
+// String implements Stringer
func (t Token) String() string {
return fmt.Sprintf("[%d] %s", t.Type, t.Value)
}
+// Lexer represents the lexer machine.
type Lexer struct {
source string
start int
+ line int
position int
lastWidth int
startState StateFunc
@@ -44,6 +56,7 @@ func New(src string, start StateFunc) *Lexer {
source: src,
startState: start,
start: 0,
+ line: 1,
position: 0,
history: newStack(),
}
@@ -60,6 +73,7 @@ func (l *Lexer) Start() {
go l.run()
}
+// StartSync starts the lexer synchronously.
func (l *Lexer) StartSync() {
// Take half the string length as a buffer size.
buffSize := len(l.source) / 2
@@ -75,7 +89,6 @@ func (l *Lexer) run() {
for state != nil {
state = state(l)
}
- //fmt.Println("nil state")
close(l.tokens)
}
@@ -91,18 +104,42 @@ func (l *Lexer) Emit(t TokenType) {
Type: t,
Value: l.Current(),
Position: l.position,
+ Line: l.line,
}
- //fmt.Printf("emitting: %v\n", tok)
l.tokens <- tok
+ l.checkLines()
l.start = l.position
l.history.clear()
}
+func (l *Lexer) checkLines() {
+ val := l.Current()
+ l.line += bytes.Count([]byte(val), lineSep)
+}
+
+// Next pulls the next rune from the Lexer and returns it, moving the position
+// forward in the source.
+func (l *Lexer) Next() rune {
+ var r rune
+ var s int
+ str := l.source[l.position:]
+ if len(str) == 0 {
+ r, s = EOFRune, 0
+ } else {
+ r, s = utf8.DecodeRuneInString(str)
+ }
+ l.position += s
+ l.history.push(r)
+
+ return r
+}
+
// Ignore clears the history stack and then sets the current beginning position
// to the current position in the source which effectively ignores the section
// of the source being analyzed.
func (l *Lexer) Ignore() {
l.history.clear()
+ l.checkLines()
l.start = l.position
}
@@ -129,23 +166,6 @@ func (l *Lexer) Backup() {
}
}
-// Next pulls the next rune from the Lexer and returns it, moving the position
-// forward in the source.
-func (l *Lexer) Next() rune {
- var r rune
- var s int
- str := l.source[l.position:]
- if len(str) == 0 {
- r, s = EOFRune, 0
- } else {
- r, s = utf8.DecodeRuneInString(str)
- }
- l.position += s
- l.history.push(r)
-
- return r
-}
-
// Accept receives a string containing all acceptable strings and will continue
// over each consecutive character in the source until a token not in the given
// string is encountered. This should be used to quickly pull token parts.
@@ -166,6 +186,7 @@ func (l *Lexer) AcceptRun(valid string) (n int) {
return n
}
+// SkipWhitespace continues over all unicode whitespace.
func (l *Lexer) SkipWhitespace() {
for {
r := l.Next()
@@ -185,13 +206,12 @@ func (l *Lexer) SkipWhitespace() {
// NextToken returns the next token from the lexer and a bool reporting whether the lexer is done
func (l *Lexer) NextToken() (*Token, bool) {
if tok, ok := <-l.tokens; ok {
- //fmt.Printf("next token: %v, ok: %t\n", tok, ok)
return &tok, false
}
return nil, true
}
-func (l *Lexer) ErrorState(format string, args ...interface{}) StateFunc {
+func (l *Lexer) Error(format string, args ...interface{}) StateFunc {
l.tokens <- Token{
Type: ErrorToken,
Value: fmt.Sprintf(format, args...),
diff --git a/lexer_test.go b/lexer_test.go
new file mode 100644
index 0000000..25da17f
--- /dev/null
+++ b/lexer_test.go
@@ -0,0 +1,265 @@
+package lexer
+
+import (
+ "fmt"
+ "testing"
+)
+
+const (
+ NumberToken TokenType = iota
+ OpToken
+ IdentToken
+)
+
+func NumberState(l *Lexer) StateFunc {
+ l.AcceptRun("0123456789")
+ l.Emit(NumberToken)
+ if l.Peek() == '.' {
+ l.Next()
+ l.Emit(OpToken)
+ return IdentState
+ }
+
+ return nil
+}
+
+func IdentState(l *Lexer) StateFunc {
+ r := l.Next()
+ for (r >= 'a' && r <= 'z') || r == '_' {
+ r = l.Next()
+ }
+ l.Backup()
+ l.Emit(IdentToken)
+
+ return WhitespaceState
+}
+
+func NewlineState(l *Lexer) StateFunc {
+ l.AcceptRun("0123456789")
+ l.Emit(NumberToken)
+ l.SkipWhitespace()
+ l.Ignore()
+ l.AcceptRun("0123456789")
+ l.Emit(NumberToken)
+ l.SkipWhitespace()
+
+ return nil
+}
+
+func WhitespaceState(l *Lexer) StateFunc {
+ r := l.Next()
+ if r == EOFRune {
+ return nil
+ }
+
+ if r != ' ' && r != '\t' && r != '\n' && r != '\r' {
+ l.Error(fmt.Sprintf("unexpected token %q", r))
+ return nil
+ }
+
+ l.Accept(" \t\n\r")
+ l.Ignore()
+
+ return NumberState
+}
+
+func TestMovingThroughString(t *testing.T) {
+ l := New("123", nil)
+ run := []struct {
+ s string
+ r rune
+ }{
+ {"1", '1'},
+ {"12", '2'},
+ {"123", '3'},
+ {"123", EOFRune},
+ }
+
+ for _, test := range run {
+ r := l.Next()
+ if r != test.r {
+ t.Errorf("Expected %q but got %q", test.r, r)
+ return
+ }
+
+ if l.Current() != test.s {
+ t.Errorf("Expected %q but got %q", test.s, l.Current())
+ return
+ }
+ }
+}
+
+func TestNumbers(t *testing.T) {
+ l := New("123", NumberState)
+ l.Start()
+ tok, done := l.NextToken()
+ if done {
+ t.Error("Expected a token, but lexer was finished")
+ return
+ }
+
+ if tok.Type != NumberToken {
+ t.Errorf("Expected a %v but got %v", NumberToken, tok.Type)
+ return
+ }
+
+ if tok.Value != "123" {
+ t.Errorf("Expected %q but got %q", "123", tok.Value)
+ return
+ }
+
+ tok, done = l.NextToken()
+ if !done {
+ t.Error("Expected the lexer to be done, but it wasn't.")
+ return
+ }
+
+ if tok != nil {
+ t.Errorf("Expected a nil token, but got %v", *tok)
+ return
+ }
+}
+
+func TestNewlines(t *testing.T) {
+ src := `123
+456
+789`
+ l := New(src, NewlineState)
+ l.Start()
+ tok, done := l.NextToken()
+ if done {
+ t.Error("Expected the lexer to not be done, but it was.")
+ return
+ }
+
+ if tok.Type != NumberToken {
+ t.Errorf("Expected a number token but got %v", *tok)
+ return
+ }
+
+ if tok.Value != "123" {
+ t.Errorf("Expected 123 but got %q", tok.Value)
+ return
+ }
+
+ if tok.Line != 1 {
+ t.Fatalf("Expected line 1 but got %d", tok.Line)
+ }
+
+ tok, done = l.NextToken()
+ if done {
+ t.Error("Expected the lexer to not be done, but it was.")
+ return
+ }
+
+ if tok.Type != NumberToken {
+ t.Errorf("Expected a number token but got %v", *tok)
+ return
+ }
+
+ if tok.Value != "456" {
+ t.Errorf("Expected 456 but got %q", tok.Value)
+ return
+ }
+
+ if tok.Line != 2 {
+ t.Fatalf("Expected line 2 but got %d", tok.Line)
+ }
+}
+
+func TestBackup(t *testing.T) {
+ l := New("1", nil)
+ r := l.Next()
+ if r != '1' {
+ t.Errorf("Expected %q but got %q", '1', r)
+ return
+ }
+
+ if l.Current() != "1" {
+ t.Errorf("Expected %q but got %q", "1", l.Current())
+ return
+ }
+
+ l.Backup()
+ if l.Current() != "" {
+ t.Errorf("Expected empty string, but got %q", l.Current())
+ return
+ }
+}
+
+func TestWhitespace(t *testing.T) {
+ l := New(" 1", NumberState)
+ l.Start()
+ l.SkipWhitespace()
+
+ tok, done := l.NextToken()
+ if done {
+ t.Fatal("Expected token to be !done, but it was.")
+ }
+
+ if tok.Type != NumberToken {
+ t.Fatalf("Expected number token, but got %v", *tok)
+ }
+}
+
+func TestMultipleTokens(t *testing.T) {
+ cases := []struct {
+ tokType TokenType
+ val string
+ }{
+ {NumberToken, "123"},
+ {OpToken, "."},
+ {IdentToken, "hello"},
+ {NumberToken, "675"},
+ {OpToken, "."},
+ {IdentToken, "world"},
+ }
+
+ l := New("123.hello 675.world", NumberState)
+ l.Start()
+
+ for _, c := range cases {
+ tok, done := l.NextToken()
+ if done {
+ t.Error("Expected there to be more tokens, but there weren't")
+ return
+ }
+
+ if c.tokType != tok.Type {
+ t.Errorf("Expected token type %v but got %v", c.tokType, tok.Type)
+ return
+ }
+
+ if c.val != tok.Value {
+ t.Errorf("Expected %q but got %q", c.val, tok.Value)
+ return
+ }
+ }
+
+ tok, done := l.NextToken()
+ if !done {
+ t.Error("Expected the lexer to be done, but it wasn't.")
+ return
+ }
+
+ if tok != nil {
+ t.Errorf("Did not expect a token, but got %v", *tok)
+ return
+ }
+}
+
+func TestError(t *testing.T) {
+ l := New("notaspace", WhitespaceState)
+ l.Start()
+
+ tok, done := l.NextToken()
+ if done {
+ t.Error("Expected token to be !done, but it was.")
+ return
+ }
+
+ if tok.Type != ErrorToken {
+ t.Errorf("Expected error token, but got %v", *tok)
+ return
+ }
+}
diff --git a/stack_test.go b/stack_test.go
new file mode 100644
index 0000000..377ba6d
--- /dev/null
+++ b/stack_test.go
@@ -0,0 +1,18 @@
+package lexer
+
+import (
+ "testing"
+)
+
+func TestStack(t *testing.T) {
+ s := newStack()
+ s.push('r')
+ r := s.pop()
+ if r != 'r' {
+ t.Fatalf("Expected r but got %b", r)
+ }
+ r = s.pop()
+ if r != EOFRune {
+ t.Fatalf("Expected EOFRune but got %b", r)
+ }
+}