aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorFelix Hanley <felix@userspace.com.au>2019-08-30 14:30:28 +0000
committerFelix Hanley <felix@userspace.com.au>2019-08-30 14:30:28 +0000
commit0032516676b486b395086b40641a34f2cafeca99 (patch)
treef2d6cabe8248a12a0cad1fda61d36af93b32c359
downloadbechars-0032516676b486b395086b40641a34f2cafeca99.tar.gz
bechars-0032516676b486b395086b40641a34f2cafeca99.tar.bz2
Generation of all but unicode tested
-rw-r--r--Makefile18
-rw-r--r--generator.go197
-rw-r--r--generator_test.go112
-rw-r--r--lexer.go106
4 files changed, 433 insertions, 0 deletions
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..58fa853
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,18 @@
+
+.PHONY: test
+test: lint ## Run tests with coverage
+ go test -race -short -cover -coverprofile coverage.txt ./...
+ go tool cover -html=coverage.txt -o coverage.html
+
+.PHONY: lint
+lint: ## Run the code linter
+ revive ./...
+
+.PHONY: clean
+clean: ## Clean all test files
+ rm -rf coverage*
+
+.PHONY: help
+help:
+ grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) |sort \
+ |awk 'BEGIN{FS=":.*?## "};{printf "\033[36m%-30s\033[0m %s\n",$$1,$$2}'
diff --git a/generator.go b/generator.go
new file mode 100644
index 0000000..4a56ce5
--- /dev/null
+++ b/generator.go
@@ -0,0 +1,197 @@
+package brechars
+
+import (
+ "fmt"
+ "strings"
+
+ "src.userspace.com.au/felix/lexer"
+)
+
+const (
+ lower = "abcdefghijklmnopqrstuvwxyz"
+ upper = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+ numeric = "0123456789"
+ space = " \t\n\r\f\v"
+ punct = "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~]"
+)
+
+// Generator will generate strings of characters
+// that match the provided POSIX bracket expression.
+type Generator struct {
+ l *lexer.Lexer
+ tok *lexer.Token
+ maxRune *rune
+ minRune *rune
+}
+
+// Option functions allows configuration of the generator.
+type Option func(*Generator) error
+
+// New creates a new generator.
+func New(opts ...Option) (*Generator, error) {
+ out := new(Generator)
+ for _, o := range opts {
+ if err := o(out); err != nil {
+ return nil, err
+ }
+ }
+ return out, nil
+}
+
+// MaxRune sets the maximum rune for any generated sequences.
+func MaxRune(r rune) Option {
+ return func(g *Generator) error {
+ g.maxRune = &r
+ return nil
+ }
+}
+
+// MinRune sets the minimum rune for any generated sequences.
+func MinRune(r rune) Option {
+ return func(g *Generator) error {
+ g.minRune = &r
+ return nil
+ }
+}
+
+func ensureRangeLimits(g *Generator) {
+ if g.maxRune == nil {
+ maxRune := '\u007F'
+ g.maxRune = &maxRune
+ }
+ if g.minRune == nil {
+ minRune := '\u0000'
+ g.minRune = &minRune
+ }
+}
+
+func (g *Generator) next() bool {
+ var ok bool
+ g.tok, ok = g.l.NextToken()
+ return g.tok != nil && !ok
+}
+
+// Generate will return the string from the POSIX bracket expression.
+func (g *Generator) Generate(be string) (string, error) {
+ ensureRangeLimits(g)
+ g.l = lexer.New(be, startState)
+ g.l.Start()
+
+ g.next()
+ if g.tok.Type != tBREStart {
+ return "", fmt.Errorf("missing opening '['")
+ }
+ return g.buildSequence()
+}
+
+func (g *Generator) buildSequence() (string, error) {
+ var out strings.Builder
+ for g.next() {
+ //fmt.Println(g.tok.Value)
+ switch g.tok.Type {
+ case tCharacter:
+ out.WriteString(g.tok.Value)
+ case tClass:
+ s, err := g.getClass(g.tok.Value)
+ if err != nil {
+ return "", err
+ }
+ out.WriteString(g.filter(s, ""))
+ case tRangeStart:
+ start := g.tok.Value
+ if g.next(); g.tok.Type != tRangeDash {
+ // Impossible situ?
+ return "", fmt.Errorf("invalid range")
+ }
+ if g.next(); g.tok.Type != tRangeEnd {
+ return "", fmt.Errorf("invalid range")
+ }
+ end := g.tok.Value
+ s := g.getRange([]rune(start)[0], []rune(end)[0])
+ out.WriteString(g.filter(s, ""))
+ case tBREStart, tBREEnd:
+ // No op
+ case tNot:
+ nots, err := g.buildSequence()
+ if err != nil {
+ return "", err
+ }
+ s := g.getRange(*g.minRune, *g.maxRune)
+ out.WriteString(g.filter(s, nots))
+ case lexer.ErrorToken:
+ return "", fmt.Errorf("%s", g.tok.Value)
+ default:
+ panic("invalid token")
+ }
+ }
+ return out.String(), nil
+}
+
+func (g Generator) getClass(c string) (string, error) {
+ var out string
+ switch c {
+ case ":alnum:":
+ out = lower + upper + numeric
+ case ":cntrl:":
+ out = g.getRange('\u0000', '\u001F') + "\u007F"
+ case ":lower:":
+ out = lower
+ case ":space:":
+ out = space
+ case ":alpha:":
+ out = lower + upper
+ case ":digit:":
+ out = numeric
+ case ":print:":
+ c, err := g.getClass(":cntrl:")
+ if err != nil {
+ return "", err
+ }
+ out = g.filter(g.getRange(*g.minRune, *g.maxRune), c)
+ case ":upper:":
+ out = upper
+ case ":blank:":
+ out = " \t"
+ case ":word:":
+ out = lower + upper + numeric + "_"
+ case ":graph:":
+ case ":punct:":
+ out = punct
+ case ":xdigit:":
+ out = "abcdefABCDEF" + numeric
+ default:
+ return "", fmt.Errorf("invalid class '%s'", c)
+ }
+ return out, nil
+}
+
+func (g *Generator) getRange(start, end rune) string {
+ // Swap?
+ if start > end {
+ tmp := start
+ start = end
+ end = tmp
+ }
+ var out strings.Builder
+ for i := start; i <= end; i++ {
+ out.WriteRune(i)
+ }
+ return out.String()
+}
+
+func (g *Generator) filter(in, exclude string) string {
+ var out strings.Builder
+ for _, r := range in {
+ if r < *g.minRune {
+ continue
+ }
+ if r > *g.maxRune {
+ continue
+ }
+ if strings.ContainsRune(exclude, r) {
+ continue
+ }
+ out.WriteRune(r)
+ }
+ return out.String()
+}
diff --git a/generator_test.go b/generator_test.go
new file mode 100644
index 0000000..4cb6a9e
--- /dev/null
+++ b/generator_test.go
@@ -0,0 +1,112 @@
+package brechars
+
+import (
+ "testing"
+)
+
+func TestGenerator(t *testing.T) {
+ tests := []struct {
+ in string
+ expected string
+ }{
+ {"[abc]", "abc"},
+ // First characters
+ {"[]", ""},
+ {"[-]", "-"},
+ {"[]abc]", "]abc"},
+ // Characterw
+ //{"[\\]", "\\"},
+ // Not
+ {"[^:cntrl::punct:]", " 0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"},
+ {"[^-:cntrl::digit:]", " !\"#$%&'()*+,./:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~"},
+ {"[^]:cntrl::digit:]", " !\"#$%&'()*+,-./:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\^_`abcdefghijklmnopqrstuvwxyz{|}~"},
+ // Classes
+ {"[:alnum:]", "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"},
+ {"[:alpha:]", "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"},
+ {"[:digit:]", "0123456789"},
+ {"[:space:]", " \t\n\r\f\v"},
+ {"[:blank:]", " \t"},
+ {"[:cntrl:]", "\x00\x01\x02\x03\x04\x05\x06\a\b\t\n\v\f\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f\u007f"},
+ {"[:lower:]", "abcdefghijklmnopqrstuvwxyz"},
+ {"[:upper:]", "ABCDEFGHIJKLMNOPQRSTUVWXYZ"},
+ {"[:digit::upper:]", "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"},
+ {"[:print:]", " !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~"},
+ {"[:punct:]", "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~]"},
+ {"[:xdigit:]", "abcdefABCDEF0123456789"},
+ {"[:digit::punct::upper:]", "0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~]ABCDEFGHIJKLMNOPQRSTUVWXYZ"},
+ // Ranges
+ {"[a-d]", "abcd"},
+ {"[\x20-\x7E]", " !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~"},
+ // Swapped
+ {"[\x7E-\x20]", " !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~"},
+ }
+
+ for _, tt := range tests {
+ p, err := New()
+ if err != nil {
+ t.Fatalf("New failed with %q", err)
+ }
+ actual, err := p.Generate(tt.in)
+ if err != nil {
+ t.Errorf("%s => failed with %q", tt.in, err)
+ }
+ if actual != tt.expected {
+ t.Errorf("%s => expected %q, got %q", tt.in, tt.expected, actual)
+ }
+ }
+}
+
+func TestOptions(t *testing.T) {
+ tests := []struct {
+ in string
+ opts []Option
+ expected string
+ }{
+ {"[a-z]", []Option{MinRune('a'), MaxRune('f')}, "abcdef"},
+ {"[^:cntrl::punct:]", []Option{MinRune('a'), MaxRune('z')}, "abcdefghijklmnopqrstuvwxyz"},
+ {"[:upper:]", []Option{MinRune('a'), MaxRune('z')}, ""},
+ }
+
+ for _, tt := range tests {
+ p, err := New(tt.opts...)
+ if err != nil {
+ t.Fatalf("New failed with %q", err)
+ }
+ actual, err := p.Generate(tt.in)
+ if err != nil {
+ t.Errorf("%s => failed with %q", tt.in, err)
+ }
+ if actual != tt.expected {
+ t.Errorf("%s => expected %q, got %q", tt.in, tt.expected, actual)
+ }
+ }
+}
+
+func TestErrors(t *testing.T) {
+ tests := []struct {
+ in string
+ expected string
+ }{
+ {"abc", "missing opening '['"},
+ {"[a-]", "invalid range"},
+ {"[aa-]", "invalid range"},
+ {"[a-b-]", "parse error, unexpected '-'"},
+ {"[:digit]", "parse error, expecting ':'"},
+ {"[ab", "parse error, unexpected EOF"},
+ {"[:blah:]", "invalid class ':blah:'"},
+ }
+
+ for _, tt := range tests {
+ p, err := New()
+ if err != nil {
+ t.Fatalf("New failed with %q", err)
+ }
+ _, err = p.Generate(tt.in)
+ if err == nil {
+ t.Errorf("%s => should have failed", tt.in)
+ }
+ if err.Error() != tt.expected {
+ t.Errorf("%s => expected %q, got %q", tt.in, tt.expected, err.Error())
+ }
+ }
+}
diff --git a/lexer.go b/lexer.go
new file mode 100644
index 0000000..a8383a9
--- /dev/null
+++ b/lexer.go
@@ -0,0 +1,106 @@
+package brechars
+
+import (
+ "src.userspace.com.au/felix/lexer"
+)
+
+const (
+ _ lexer.TokenType = iota
+ tBREStart
+ tBREEnd
+ tRangeStart
+ tRangeDash
+ tRangeEnd
+ tCharacter
+ tClass
+ tNot
+)
+
+func startState(l *lexer.Lexer) lexer.StateFunc {
+ l.SkipWhitespace()
+ r := l.Next()
+ if r != '[' {
+ return l.Error("expecting [")
+ }
+ l.Emit(tBREStart)
+ return breFirstState
+}
+
+// Handle the first characters of the BRE.
+func breFirstState(l *lexer.Lexer) lexer.StateFunc {
+ switch l.Next() {
+ case '^':
+ l.Emit(tNot)
+ // - or ] After ^ is literal
+ if l.Accept("-]") {
+ l.Emit(tCharacter)
+ }
+ return breState
+ case ']':
+ // Check for empty BRE
+ if l.Peek() == lexer.EOFRune {
+ l.Emit(tBREEnd)
+ return nil
+ }
+ l.Emit(tCharacter)
+ return breState
+ case '-':
+ l.Emit(tCharacter)
+ return breState
+ default:
+ l.Backup()
+ return breState
+ }
+}
+
+func breState(l *lexer.Lexer) lexer.StateFunc {
+ switch r := l.Next(); {
+ case r == ']':
+ l.Emit(tBREEnd)
+ return nil
+ case r == ':':
+ return classState
+ case r == '-':
+ return l.Error("parse error, unexpected '-'")
+ case r == '\\':
+ if l.Accept("ux") {
+ return unicodeState
+ }
+ l.Emit(tCharacter)
+ return breState
+ case r == lexer.EOFRune:
+ return l.Error("parse error, unexpected EOF")
+ default:
+ if l.Peek() == '-' {
+ l.Emit(tRangeStart)
+ l.Accept("-")
+ l.Emit(tRangeDash)
+ if l.Accept("-][^") {
+ return l.Error("parse error, invalid range end")
+ }
+ l.Next()
+ l.Emit(tRangeEnd)
+ } else {
+ l.Emit(tCharacter)
+ }
+ return breState
+ }
+}
+
+func classState(l *lexer.Lexer) lexer.StateFunc {
+ // TODO
+ l.AcceptRun("abcdefghijklmnopqrstuvwxyz")
+ if !l.Accept(":") {
+ return l.Error("parse error, expecting ':'")
+ }
+ l.Emit(tClass)
+ return breState
+}
+
+func unicodeState(l *lexer.Lexer) lexer.StateFunc {
+ // TODO valid code point
+ if n := l.AcceptRun("0123456789abcdef"); n > 0 {
+ l.Emit(tCharacter)
+ }
+ return breState
+}