From 0032516676b486b395086b40641a34f2cafeca99 Mon Sep 17 00:00:00 2001 From: Felix Hanley Date: Sat, 31 Aug 2019 00:30:28 +1000 Subject: Generation of all but unicode tested --- Makefile | 18 +++++ generator.go | 197 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ generator_test.go | 112 +++++++++++++++++++++++++++++++ lexer.go | 106 +++++++++++++++++++++++++++++ 4 files changed, 433 insertions(+) create mode 100644 Makefile create mode 100644 generator.go create mode 100644 generator_test.go create mode 100644 lexer.go diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..58fa853 --- /dev/null +++ b/Makefile @@ -0,0 +1,18 @@ + +.PHONY: test +test: lint ## Run tests with coverage + go test -race -short -cover -coverprofile coverage.txt ./... + go tool cover -html=coverage.txt -o coverage.html + +.PHONY: lint +lint: ## Run the code linter + revive ./... + +.PHONY: clean +clean: ## Clean all test files + rm -rf coverage* + +.PHONY: help +help: + grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) |sort \ + |awk 'BEGIN{FS=":.*?## "};{printf "\033[36m%-30s\033[0m %s\n",$$1,$$2}' diff --git a/generator.go b/generator.go new file mode 100644 index 0000000..4a56ce5 --- /dev/null +++ b/generator.go @@ -0,0 +1,197 @@ +package brechars + +import ( + "fmt" + "strings" + + "src.userspace.com.au/felix/lexer" +) + +const ( + lower = "abcdefghijklmnopqrstuvwxyz" + upper = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + numeric = "0123456789" + space = " \t\n\r\f\v" + punct = "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~]" +) + +// Generator will generate strings of characters +// that match the provided POSIX bracket expression. +type Generator struct { + l *lexer.Lexer + tok *lexer.Token + maxRune *rune + minRune *rune +} + +// Option functions allows configuration of the generator. +type Option func(*Generator) error + +// New creates a new generator. +func New(opts ...Option) (*Generator, error) { + out := new(Generator) + for _, o := range opts { + if err := o(out); err != nil { + return nil, err + } + } + return out, nil +} + +// MaxRune sets the maximum rune for any generated sequences. +func MaxRune(r rune) Option { + return func(g *Generator) error { + g.maxRune = &r + return nil + } +} + +// MinRune sets the minimum rune for any generated sequences. +func MinRune(r rune) Option { + return func(g *Generator) error { + g.minRune = &r + return nil + } +} + +func ensureRangeLimits(g *Generator) { + if g.maxRune == nil { + maxRune := '\u007F' + g.maxRune = &maxRune + } + if g.minRune == nil { + minRune := '\u0000' + g.minRune = &minRune + } +} + +func (g *Generator) next() bool { + var ok bool + g.tok, ok = g.l.NextToken() + return g.tok != nil && !ok +} + +// Generate will return the string from the POSIX bracket expression. +func (g *Generator) Generate(be string) (string, error) { + ensureRangeLimits(g) + g.l = lexer.New(be, startState) + g.l.Start() + + g.next() + if g.tok.Type != tBREStart { + return "", fmt.Errorf("missing opening '['") + } + return g.buildSequence() +} + +func (g *Generator) buildSequence() (string, error) { + var out strings.Builder + for g.next() { + //fmt.Println(g.tok.Value) + switch g.tok.Type { + case tCharacter: + out.WriteString(g.tok.Value) + case tClass: + s, err := g.getClass(g.tok.Value) + if err != nil { + return "", err + } + out.WriteString(g.filter(s, "")) + case tRangeStart: + start := g.tok.Value + if g.next(); g.tok.Type != tRangeDash { + // Impossible situ? + return "", fmt.Errorf("invalid range") + } + if g.next(); g.tok.Type != tRangeEnd { + return "", fmt.Errorf("invalid range") + } + end := g.tok.Value + s := g.getRange([]rune(start)[0], []rune(end)[0]) + out.WriteString(g.filter(s, "")) + case tBREStart, tBREEnd: + // No op + case tNot: + nots, err := g.buildSequence() + if err != nil { + return "", err + } + s := g.getRange(*g.minRune, *g.maxRune) + out.WriteString(g.filter(s, nots)) + case lexer.ErrorToken: + return "", fmt.Errorf("%s", g.tok.Value) + default: + panic("invalid token") + } + } + return out.String(), nil +} + +func (g Generator) getClass(c string) (string, error) { + var out string + switch c { + case ":alnum:": + out = lower + upper + numeric + case ":cntrl:": + out = g.getRange('\u0000', '\u001F') + "\u007F" + case ":lower:": + out = lower + case ":space:": + out = space + case ":alpha:": + out = lower + upper + case ":digit:": + out = numeric + case ":print:": + c, err := g.getClass(":cntrl:") + if err != nil { + return "", err + } + out = g.filter(g.getRange(*g.minRune, *g.maxRune), c) + case ":upper:": + out = upper + case ":blank:": + out = " \t" + case ":word:": + out = lower + upper + numeric + "_" + case ":graph:": + case ":punct:": + out = punct + case ":xdigit:": + out = "abcdefABCDEF" + numeric + default: + return "", fmt.Errorf("invalid class '%s'", c) + } + return out, nil +} + +func (g *Generator) getRange(start, end rune) string { + // Swap? + if start > end { + tmp := start + start = end + end = tmp + } + var out strings.Builder + for i := start; i <= end; i++ { + out.WriteRune(i) + } + return out.String() +} + +func (g *Generator) filter(in, exclude string) string { + var out strings.Builder + for _, r := range in { + if r < *g.minRune { + continue + } + if r > *g.maxRune { + continue + } + if strings.ContainsRune(exclude, r) { + continue + } + out.WriteRune(r) + } + return out.String() +} diff --git a/generator_test.go b/generator_test.go new file mode 100644 index 0000000..4cb6a9e --- /dev/null +++ b/generator_test.go @@ -0,0 +1,112 @@ +package brechars + +import ( + "testing" +) + +func TestGenerator(t *testing.T) { + tests := []struct { + in string + expected string + }{ + {"[abc]", "abc"}, + // First characters + {"[]", ""}, + {"[-]", "-"}, + {"[]abc]", "]abc"}, + // Characterw + //{"[\\]", "\\"}, + // Not + {"[^:cntrl::punct:]", " 0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"}, + {"[^-:cntrl::digit:]", " !\"#$%&'()*+,./:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~"}, + {"[^]:cntrl::digit:]", " !\"#$%&'()*+,-./:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\^_`abcdefghijklmnopqrstuvwxyz{|}~"}, + // Classes + {"[:alnum:]", "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"}, + {"[:alpha:]", "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"}, + {"[:digit:]", "0123456789"}, + {"[:space:]", " \t\n\r\f\v"}, + {"[:blank:]", " \t"}, + {"[:cntrl:]", "\x00\x01\x02\x03\x04\x05\x06\a\b\t\n\v\f\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f\u007f"}, + {"[:lower:]", "abcdefghijklmnopqrstuvwxyz"}, + {"[:upper:]", "ABCDEFGHIJKLMNOPQRSTUVWXYZ"}, + {"[:digit::upper:]", "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"}, + {"[:print:]", " !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~"}, + {"[:punct:]", "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~]"}, + {"[:xdigit:]", "abcdefABCDEF0123456789"}, + {"[:digit::punct::upper:]", "0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~]ABCDEFGHIJKLMNOPQRSTUVWXYZ"}, + // Ranges + {"[a-d]", "abcd"}, + {"[\x20-\x7E]", " !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~"}, + // Swapped + {"[\x7E-\x20]", " !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~"}, + } + + for _, tt := range tests { + p, err := New() + if err != nil { + t.Fatalf("New failed with %q", err) + } + actual, err := p.Generate(tt.in) + if err != nil { + t.Errorf("%s => failed with %q", tt.in, err) + } + if actual != tt.expected { + t.Errorf("%s => expected %q, got %q", tt.in, tt.expected, actual) + } + } +} + +func TestOptions(t *testing.T) { + tests := []struct { + in string + opts []Option + expected string + }{ + {"[a-z]", []Option{MinRune('a'), MaxRune('f')}, "abcdef"}, + {"[^:cntrl::punct:]", []Option{MinRune('a'), MaxRune('z')}, "abcdefghijklmnopqrstuvwxyz"}, + {"[:upper:]", []Option{MinRune('a'), MaxRune('z')}, ""}, + } + + for _, tt := range tests { + p, err := New(tt.opts...) + if err != nil { + t.Fatalf("New failed with %q", err) + } + actual, err := p.Generate(tt.in) + if err != nil { + t.Errorf("%s => failed with %q", tt.in, err) + } + if actual != tt.expected { + t.Errorf("%s => expected %q, got %q", tt.in, tt.expected, actual) + } + } +} + +func TestErrors(t *testing.T) { + tests := []struct { + in string + expected string + }{ + {"abc", "missing opening '['"}, + {"[a-]", "invalid range"}, + {"[aa-]", "invalid range"}, + {"[a-b-]", "parse error, unexpected '-'"}, + {"[:digit]", "parse error, expecting ':'"}, + {"[ab", "parse error, unexpected EOF"}, + {"[:blah:]", "invalid class ':blah:'"}, + } + + for _, tt := range tests { + p, err := New() + if err != nil { + t.Fatalf("New failed with %q", err) + } + _, err = p.Generate(tt.in) + if err == nil { + t.Errorf("%s => should have failed", tt.in) + } + if err.Error() != tt.expected { + t.Errorf("%s => expected %q, got %q", tt.in, tt.expected, err.Error()) + } + } +} diff --git a/lexer.go b/lexer.go new file mode 100644 index 0000000..a8383a9 --- /dev/null +++ b/lexer.go @@ -0,0 +1,106 @@ +package brechars + +import ( + "src.userspace.com.au/felix/lexer" +) + +const ( + _ lexer.TokenType = iota + tBREStart + tBREEnd + tRangeStart + tRangeDash + tRangeEnd + tCharacter + tClass + tNot +) + +func startState(l *lexer.Lexer) lexer.StateFunc { + l.SkipWhitespace() + r := l.Next() + if r != '[' { + return l.Error("expecting [") + } + l.Emit(tBREStart) + return breFirstState +} + +// Handle the first characters of the BRE. +func breFirstState(l *lexer.Lexer) lexer.StateFunc { + switch l.Next() { + case '^': + l.Emit(tNot) + // - or ] After ^ is literal + if l.Accept("-]") { + l.Emit(tCharacter) + } + return breState + case ']': + // Check for empty BRE + if l.Peek() == lexer.EOFRune { + l.Emit(tBREEnd) + return nil + } + l.Emit(tCharacter) + return breState + case '-': + l.Emit(tCharacter) + return breState + default: + l.Backup() + return breState + } +} + +func breState(l *lexer.Lexer) lexer.StateFunc { + switch r := l.Next(); { + case r == ']': + l.Emit(tBREEnd) + return nil + case r == ':': + return classState + case r == '-': + return l.Error("parse error, unexpected '-'") + case r == '\\': + if l.Accept("ux") { + return unicodeState + } + l.Emit(tCharacter) + return breState + case r == lexer.EOFRune: + return l.Error("parse error, unexpected EOF") + default: + if l.Peek() == '-' { + l.Emit(tRangeStart) + l.Accept("-") + l.Emit(tRangeDash) + if l.Accept("-][^") { + return l.Error("parse error, invalid range end") + } + l.Next() + l.Emit(tRangeEnd) + } else { + l.Emit(tCharacter) + } + return breState + } +} + +func classState(l *lexer.Lexer) lexer.StateFunc { + // TODO + l.AcceptRun("abcdefghijklmnopqrstuvwxyz") + if !l.Accept(":") { + return l.Error("parse error, expecting ':'") + } + l.Emit(tClass) + return breState +} + +func unicodeState(l *lexer.Lexer) lexer.StateFunc { + // TODO valid code point + if n := l.AcceptRun("0123456789abcdef"); n > 0 { + l.Emit(tCharacter) + } + return breState +} -- cgit v1.2.3