commit 93740d2d153b3da3a1be5707db1400106e3f6491
Author: Alexander Kiryukhin
Date:   Sat Mar 6 22:30:32 2021 +0300

    Initial

diff --git a/example/math_expression/main.go b/example/math_expression/main.go
new file mode 100644
index 0000000..35f7c3b
--- /dev/null
+++ b/example/math_expression/main.go
@@ -0,0 +1,54 @@
+package main
+
+import (
+	"fmt"
+
+	"github.com/neonxp/unilex"
+)
+
+var output []unilex.Lexem = []unilex.Lexem{}
+var opPriority = map[string]int{
+	"^": 3,
+	"!": 3,
+	"*": 2,
+	"/": 2,
+	"+": 1,
+	"-": 1,
+}
+
+func main() {
+
+	l := unilex.New("10 * (20.0 + 30.0)")
+
+	go l.Run(lexExpression) // Start the lexer
+
+	// Read infix expression lexems from the lexer and convert them to RPN (reverse Polish notation).
+	rpn := infixToRPNotation(l)
+	fmt.Println("RPN:", rpn)
+
+	// Calculate the RPN expression.
+	result := calculateRPN(rpn)
+	fmt.Println("Result:", result)
+}
+
+func lexExpression(l *unilex.Lexer) unilex.StateFunc {
+	l.AcceptWhile(" \t")
+	l.Ignore() // Ignore whitespace
+
+	switch {
+	case l.Accept("("):
+		l.Emit("LP")
+	case l.Accept(")"):
+		l.Emit("RP")
+	case unilex.ScanNumber(l):
+		l.Emit("NUMBER")
+	case l.Accept("+-*/^!"):
+		l.Emit("OPERATOR")
+	case l.Peek() == unilex.EOF:
+		return nil
+	default:
+		return l.Errorf("Unexpected symbol")
+	}
+
+	return lexExpression
+}
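For orientation, the following is a minimal, hypothetical sketch (not part of the commit) of driving the same state function without the RPN conversion: the lexer is started in a goroutine and its Output channel is drained until Run closes it. It assumes the lexExpression state function from main.go above and the exported Lexem fields defined later in this diff; the helper name debugLexems is made up.

    // debugLexems prints every lexem produced for the given input.
    // Hypothetical helper for illustration only; lives in package main next to lexExpression.
    func debugLexems(input string) {
        l := unilex.New(input)
        go l.Run(lexExpression) // lexing runs concurrently; Run closes Output when finished
        for lexem := range l.Output {
            fmt.Printf("%s %q [%d:%d]\n", lexem.Type, lexem.Value, lexem.Start, lexem.End)
        }
    }
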
diff --git a/example/math_expression/rpn.go b/example/math_expression/rpn.go
new file mode 100644
index 0000000..b65983d
--- /dev/null
+++ b/example/math_expression/rpn.go
@@ -0,0 +1,84 @@
+package main
+
+// Helper functions that convert infix notation to RPN and calculate the expression result.
+
+import (
+	"fmt"
+	"log"
+	"strconv"
+
+	"github.com/neonxp/unilex"
+)
+
+func infixToRPNotation(l *unilex.Lexer) []unilex.Lexem {
+	output := []unilex.Lexem{}
+	stack := lexemStack{}
+parseLoop:
+	for ll := range l.Output { // Read lexems from the lexer's Output channel; conversion starts as soon as the first lexems are scanned.
+		fmt.Printf("Lexem: %+v\n", ll)
+
+		switch {
+		case ll.Type == "NUMBER", ll.Type == "OPERATOR" && ll.Value == "!":
+			output = append(output, ll)
+		case ll.Type == "LP":
+			stack.Push(ll)
+		case ll.Type == "RP":
+			for {
+				cl := stack.Pop()
+				if cl.Type == "LP" {
+					break
+				}
+				if cl.Type == unilex.LEOF {
+					log.Fatalf("No pair for parenthesis at %d", ll.Start)
+				}
+				output = append(output, cl)
+			}
+		case ll.Type == "OPERATOR":
+			for {
+				if stack.Head().Type == "OPERATOR" && (opPriority[stack.Head().Value] > opPriority[ll.Value]) {
+					output = append(output, stack.Pop())
+					continue
+				}
+				break
+			}
+			stack.Push(ll)
+		case ll.Type == unilex.LEOF:
+			break parseLoop
+		}
+	}
+
+	for stack.Head().Type != unilex.LEOF {
+		output = append(output, stack.Pop())
+	}
+
+	return output
+}
+
+func calculateRPN(rpnLexems []unilex.Lexem) string {
+	stack := lexemStack{}
+	for _, op := range rpnLexems {
+		if op.Type == "NUMBER" {
+			stack.Push(op)
+		} else {
+			switch op.Value {
+			case "+":
+				a1, _ := strconv.ParseFloat(stack.Pop().Value, 64)
+				a2, _ := strconv.ParseFloat(stack.Pop().Value, 64)
+				stack.Push(unilex.Lexem{Type: "NUMBER", Value: strconv.FormatFloat(a2+a1, 'f', -1, 64)})
+			case "-":
+				a1, _ := strconv.ParseFloat(stack.Pop().Value, 64)
+				a2, _ := strconv.ParseFloat(stack.Pop().Value, 64)
+				stack.Push(unilex.Lexem{Type: "NUMBER", Value: strconv.FormatFloat(a2-a1, 'f', -1, 64)})
+			case "*":
+				a1, _ := strconv.ParseFloat(stack.Pop().Value, 64)
+				a2, _ := strconv.ParseFloat(stack.Pop().Value, 64)
+				stack.Push(unilex.Lexem{Type: "NUMBER", Value: strconv.FormatFloat(a2*a1, 'f', -1, 64)})
+			case "/":
+				a1, _ := strconv.ParseFloat(stack.Pop().Value, 64)
+				a2, _ := strconv.ParseFloat(stack.Pop().Value, 64)
+				stack.Push(unilex.Lexem{Type: "NUMBER", Value: strconv.FormatFloat(a2/a1, 'f', -1, 64)})
+			}
+		}
+	}
+	return stack.Head().Value
+}
diff --git a/example/math_expression/stack.go b/example/math_expression/stack.go
new file mode 100644
index 0000000..2ca8acc
--- /dev/null
+++ b/example/math_expression/stack.go
@@ -0,0 +1,26 @@
+package main
+
+// Simple lexem stack implementation.
+
+import "github.com/neonxp/unilex"
+
+type lexemStack []unilex.Lexem
+
+func (ls *lexemStack) Head() (l unilex.Lexem) {
+	if len(*ls) == 0 {
+		return unilex.Lexem{Type: unilex.LEOF}
+	}
+	return (*ls)[len(*ls)-1]
+}
+
+func (ls *lexemStack) Push(l unilex.Lexem) {
+	*ls = append(*ls, l)
+}
+
+func (ls *lexemStack) Pop() (l unilex.Lexem) {
+	if len(*ls) == 0 {
+		return unilex.Lexem{Type: unilex.LEOF}
+	}
+	*ls, l = (*ls)[:len(*ls)-1], (*ls)[len(*ls)-1]
+	return l
+}
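Note that the operator table in main.go assigns priorities to "^" and "!", and infixToRPNotation passes them through to the RPN output, but the switch in calculateRPN above evaluates only "+", "-", "*" and "/". A minimal, hypothetical sketch (not part of the commit) of how exponentiation could be handled: a helper that an extra case "^": branch would call, assuming the lexemStack from stack.go above; the name applyPow is made up and "math" would have to be added to the imports. A unary operator such as "!" would need a similar helper that pops only one operand.

    // applyPow pops two operands and pushes a2^a1, mirroring the binary cases in calculateRPN.
    // Hypothetical helper for illustration only.
    func applyPow(stack *lexemStack) {
        a1, _ := strconv.ParseFloat(stack.Pop().Value, 64) // right operand
        a2, _ := strconv.ParseFloat(stack.Pop().Value, 64) // left operand
        stack.Push(unilex.Lexem{
            Type:  "NUMBER",
            Value: strconv.FormatFloat(math.Pow(a2, a1), 'f', -1, 64),
        })
    }
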
diff --git a/go.mod b/go.mod
new file mode 100644
index 0000000..59574b8
--- /dev/null
+++ b/go.mod
@@ -0,0 +1,3 @@
+module github.com/neonxp/unilex
+
+go 1.16
diff --git a/lexem.go b/lexem.go
new file mode 100644
index 0000000..bd24ea9
--- /dev/null
+++ b/lexem.go
@@ -0,0 +1,20 @@
+package unilex
+
+// Lexem represents a part of the parsed string.
+type Lexem struct {
+	Type  LexType // Type of Lexem.
+	Value string  // Value of Lexem.
+	Start int     // Start position in the input string.
+	End   int     // End position in the input string.
+}
+
+// LexType represents the type of a lexem.
+type LexType string
+
+// Standard lexem types.
+const (
+	// LError represents a lexing error.
+	LError LexType = "ERROR"
+	// LEOF represents end of input.
+	LEOF LexType = "EOF"
+)
diff --git a/lexer.go b/lexer.go
new file mode 100644
index 0000000..10317bd
--- /dev/null
+++ b/lexer.go
@@ -0,0 +1,141 @@
+package unilex
+
+import (
+	"fmt"
+	"strings"
+	"unicode/utf8"
+)
+
+// EOF is returned by Next when the end of input is reached.
+const EOF rune = -1
+
+// Lexer holds the current scanner state.
+type Lexer struct {
+	Input  string     // Input string.
+	Start  int        // Start position of current lexem.
+	Pos    int        // Current position in the input string.
+	Output chan Lexem // Channel of scanned lexems.
+	width  int        // Width of last rune.
+}
+
+// New returns a new scanner for the input string.
+func New(input string) *Lexer {
+	return &Lexer{
+		Input:  input,
+		Start:  0,
+		Pos:    0,
+		Output: make(chan Lexem, 2),
+		width:  0,
+	}
+}
+
+// Run runs lexing from the init state and closes Output when a state function returns nil.
+func (l *Lexer) Run(init StateFunc) {
+	for state := init; state != nil; {
+		state = state(l)
+	}
+	close(l.Output)
+}
+
+// Emit sends the current lexem to the output channel and starts a new one.
+func (l *Lexer) Emit(typ LexType) {
+	l.Output <- Lexem{
+		Type:  typ,
+		Value: l.Input[l.Start:l.Pos],
+		Start: l.Start,
+		End:   l.Pos,
+	}
+	l.Start = l.Pos
+}
+
+// Errorf produces an error lexem and stops scanning.
+func (l *Lexer) Errorf(format string, args ...interface{}) StateFunc {
+	l.Output <- Lexem{
+		Type:  LError,
+		Value: fmt.Sprintf(format, args...),
+		Start: l.Start,
+		End:   l.Pos,
+	}
+	return nil
+}
+
+// Next returns the next rune from the input and advances the position.
+func (l *Lexer) Next() (r rune) {
+	if int(l.Pos) >= len(l.Input) {
+		l.width = 0
+		return EOF
+	}
+	r, l.width = utf8.DecodeRuneInString(l.Input[l.Pos:])
+	l.Pos += l.width
+	return r
+}
+
+// Back moves the position back to the previous rune.
+func (l *Lexer) Back() {
+	l.Pos -= l.width
+}
+
+// Ignore drops the previously buffered text.
+func (l *Lexer) Ignore() {
+	l.Start = l.Pos
+	l.width = 0
+}
+
+// Peek returns the rune at the current position without advancing.
+func (l *Lexer) Peek() (r rune) {
+	r = l.Next()
+	l.Back()
+	return r
+}
+
+// Accept consumes the next rune if it is in the valid string. Returns true if a rune was accepted.
+func (l *Lexer) Accept(valid string) bool {
+	if strings.ContainsRune(valid, l.Next()) {
+		return true
+	}
+	l.Back()
+	return false
+}
+
+// AcceptString returns true if the given string was found at the current position.
+func (l *Lexer) AcceptString(s string, caseInsentive bool) bool {
+	input := l.Input[l.Pos:]
+	if caseInsentive {
+		input = strings.ToLower(input)
+		s = strings.ToLower(s)
+	}
+	if strings.HasPrefix(input, s) {
+		l.width = 0
+		l.Pos += len(s)
+		return true
+	}
+	return false
+}
+
+// AcceptAnyOf substrings. Returns true if any of the substrings was found.
+func (l *Lexer) AcceptAnyOf(s []string, caseInsentive bool) bool {
+	for _, substring := range s {
+		if l.AcceptString(substring, caseInsentive) {
+			return true
+		}
+	}
+	return false
+}
+
+// AcceptWhile consumes runes from the input while they are in the `valid` string.
+func (l *Lexer) AcceptWhile(valid string) {
+	for l.Accept(valid) {
+	}
+}
+
+// AcceptWhileNot consumes runes from the input until a rune from the `invalid` string or EOF is found.
+func (l *Lexer) AcceptWhileNot(invalid string) {
+	for r := l.Next(); r != EOF && !strings.ContainsRune(invalid, r); r = l.Next() {
+	}
+	l.Back()
+}
+
+// AtStart returns true if the current lexem is empty (nothing consumed since the last Emit or Ignore).
+func (l *Lexer) AtStart() bool {
+	return l.Pos == l.Start
+}
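As a quick illustration of the Lexer API above (hypothetical, not part of the commit): a state function that expects one of a few keywords, case-insensitively, and either emits it or fails with Errorf. The keyword list and the name lexKeyword are made up; as in the example earlier, it would be passed to Run and its lexems read from Output.

    // lexKeyword emits a KEYWORD lexem if one of the listed words is found at the current position.
    // Hypothetical state function for illustration only.
    func lexKeyword(l *unilex.Lexer) unilex.StateFunc {
        if l.AcceptAnyOf([]string{"select", "insert", "delete"}, true) {
            l.Emit("KEYWORD")
            return nil // a real lexer would return the next state function here
        }
        return l.Errorf("expected a keyword at position %d", l.Pos)
    }
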
diff --git a/scanners.go b/scanners.go
new file mode 100644
index 0000000..b5811af
--- /dev/null
+++ b/scanners.go
@@ -0,0 +1,25 @@
+package unilex
+
+// ScanNumber is the simplest scanner: it accepts a decimal integer or float.
+func ScanNumber(l *Lexer) bool {
+	l.AcceptWhile("0123456789")
+	if l.AtStart() {
+		// no digits found
+		return false
+	}
+	l.Accept(".")
+	l.AcceptWhile("0123456789")
+	return !l.AtStart()
+}
+
+// ScanAlphaNum returns true if the next input token is an alphanumeric sequence that does not start with a digit
+// and contains no spaces or special characters.
+func ScanAlphaNum(l *Lexer) bool {
+	digits := "0123456789"
+	alpha := "qwertyuiopasdfghjklzxcvbnmQWERTYUIOPASDFGHJKLZXCVBNM"
+	if !l.Accept(alpha) {
+		return false
+	}
+	l.AcceptWhile(alpha + digits)
+	return true
+}
diff --git a/scanners_test.go b/scanners_test.go
new file mode 100644
index 0000000..cab697e
--- /dev/null
+++ b/scanners_test.go
@@ -0,0 +1,53 @@
+package unilex
+
+import "testing"
+
+func TestScanNumber(t *testing.T) {
+	testCases := []struct {
+		Input    string
+		Expected bool
+		Pos      int
+	}{
+		{"asd", false, 0},
+		{"asd123", false, 0},
+		{"123", true, 3},
+		{"123asd", true, 3},
+		{"123.321", true, 7},
+	}
+	for _, tc := range testCases {
+		l := New(tc.Input)
+		actual := ScanNumber(l)
+		if actual != tc.Expected {
+			t.Errorf("Input: %s expected scan result: %v actual: %v", tc.Input, tc.Expected, actual)
+		}
+		if l.Pos != tc.Pos {
+			t.Errorf("Input: %s expected scan position: %d actual: %d", tc.Input, tc.Pos, l.Pos)
+		}
+	}
+}
+
+func TestScanAlphaNum(t *testing.T) {
+	testCases := []struct {
+		Input    string
+		Expected bool
+		Pos      int
+	}{
+		{"asd", true, 3},
+		{"asd123", true, 6},
+		{"123", false, 0},
+		{"123asd", false, 0},
+		{"123.321", false, 0},
+		{"asd!dsa", true, 3},
+		{"asd dsa", true, 3},
+	}
+	for _, tc := range testCases {
+		l := New(tc.Input)
+		actual := ScanAlphaNum(l)
+		if actual != tc.Expected {
+			t.Errorf("Input: %s expected scan result: %v actual: %v", tc.Input, tc.Expected, actual)
+		}
+		if l.Pos != tc.Pos {
+			t.Errorf("Input: %s expected scan position: %d actual: %d", tc.Input, tc.Pos, l.Pos)
+		}
+	}
+}
diff --git a/statefunc.go b/statefunc.go
new file mode 100644
index 0000000..5980ecc
--- /dev/null
+++ b/statefunc.go
@@ -0,0 +1,4 @@
+package unilex
+
+// StateFunc represents a function that scans lexems and returns the next state function, or nil if lexing is completed.
+type StateFunc func(*Lexer) StateFunc
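ScanAlphaNum above matches ASCII letters only. A minimal, hypothetical sketch (not part of the commit) of a Unicode-aware variant built on Next and Back from lexer.go; it would live inside package unilex, the name scanUnicodeIdent is made up, and "unicode" would need to be imported.

    // scanUnicodeIdent accepts a letter followed by any run of letters and digits (Unicode-aware).
    // Hypothetical variant of ScanAlphaNum for illustration only.
    func scanUnicodeIdent(l *Lexer) bool {
        r := l.Next()
        if r == EOF || !unicode.IsLetter(r) {
            l.Back() // un-read the rejected rune (no-op at EOF)
            return false
        }
        for {
            r = l.Next()
            if r == EOF || (!unicode.IsLetter(r) && !unicode.IsDigit(r)) {
                break
            }
        }
        l.Back() // un-read the rune that ended the run
        return true
    }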