From ff198abd8fc9e2019c2f3ef9b7e74206ecdb99b7 Mon Sep 17 00:00:00 2001 From: Alexander Kiryukhin Date: Wed, 10 Mar 2021 00:47:58 +0300 Subject: [PATCH] Push/pop state, full json example --- README.md | 4 +- example/json/main.go | 146 +++++++++++++++++++++++++++---------------- lexem.go | 8 +-- lexer.go | 13 +++- statefunc.go | 14 +++++ 5 files changed, 125 insertions(+), 60 deletions(-) diff --git a/README.md b/README.md index 172ce9b..352cbe0 100644 --- a/README.md +++ b/README.md @@ -3,4 +3,6 @@ Universal lexer for Golang Based on Rob Pike's awesome video [Lexical Scanning in Go](https://www.youtube.com/watch?v=HxaD_trXwRE) -TODO: Write brief documentation. For now - watch video and [/examples](https://github.com/neonxp/unilex/tree/master/example) directory. +Examples: [/examples](https://github.com/neonxp/unilex/tree/master/example) directory. + + diff --git a/example/json/main.go b/example/json/main.go index 842ab82..745e5aa 100644 --- a/example/json/main.go +++ b/example/json/main.go @@ -15,7 +15,19 @@ func main() { "key2": { "key3" : "value 3" }, - "key4": 123.321 + "key4": 123.321, + "key5": [ + 1, + 2, + [ + 3, + 4, + 5, + { + "key6": "value6" + } + ] + ] }` l := unilex.New(testJson) go l.Run(initJson) @@ -25,11 +37,14 @@ func main() { } const ( - lObjectStart unilex.LexType = "lObjectStart" - lObjectEnd unilex.LexType = "lObjectEnd" - lObjectKey unilex.LexType = "lObjectKey" - lObjectValueString unilex.LexType = "lObjectValueString" - lObjectValueNumber unilex.LexType = "lObjectValueNumber" + lObjectStart unilex.LexType = iota + lObjectEnd + lObjectKey + lObjectValue + lArrayStart + lArrayEnd + lString + lNumber ) func initJson(l *unilex.Lexer) unilex.StateFunc { @@ -37,60 +52,83 @@ func initJson(l *unilex.Lexer) unilex.StateFunc { switch { case l.Accept("{"): l.Emit(lObjectStart) - return stateInObject(true) + return stateInObject case l.Peek() == unilex.EOF: return nil } - return l.Errorf("Unknown token: %s", l.Peek()) + return l.Errorf("Unknown token: 
%s", string(l.Peek())) } -func stateInObject(initial bool) unilex.StateFunc { - return func(l *unilex.Lexer) unilex.StateFunc { - // we in object, so we expect field keys and values - ignoreWhiteSpace(l) - if l.Accept("}") { - l.Emit(lObjectEnd) - if initial { - return initJson - } - ignoreWhiteSpace(l) - l.Accept(",") - ignoreWhiteSpace(l) - return stateInObject(initial) - } - if l.Peek() == unilex.EOF { - return nil - } - if !unilex.ScanQuotedString(l, '"') { - return l.Errorf("Unknown token: %s", l.Peek()) - } - l.Emit(lObjectKey) - ignoreWhiteSpace(l) - if !l.Accept(":") { - return l.Errorf("Expected ':'") - } - ignoreWhiteSpace(l) - switch { - case unilex.ScanQuotedString(l, '"'): - l.Emit(lObjectValueString) - ignoreWhiteSpace(l) - l.Accept(",") - l.Ignore() - ignoreWhiteSpace(l) - return stateInObject(initial) - case unilex.ScanNumber(l): - l.Emit(lObjectValueNumber) - ignoreWhiteSpace(l) - l.Accept(",") - l.Ignore() - ignoreWhiteSpace(l) - return stateInObject(initial) - case l.Accept("{"): - l.Emit(lObjectStart) - return stateInObject(false) - } - return l.Errorf("Unknown token") +func stateInObject(l *unilex.Lexer) unilex.StateFunc { + // We are inside an object, so we expect field keys and values. + ignoreWhiteSpace(l) + if l.Accept("}") { + l.Emit(lObjectEnd) + // On a closing brace, return to the previous state (including the initial one). + return l.PopState() } + ignoreWhiteSpace(l) + l.Accept(",") + ignoreWhiteSpace(l) + if !unilex.ScanQuotedString(l, '"') { + return l.Errorf("Unknown token: %s", string(l.Peek())) + } + l.Emit(lObjectKey) + ignoreWhiteSpace(l) + if !l.Accept(":") { + return l.Errorf("Expected ':'") + } + ignoreWhiteSpace(l) + l.Emit(lObjectValue) + switch { + case unilex.ScanQuotedString(l, '"'): + l.Emit(lString) + ignoreWhiteSpace(l) + l.Accept(",") + l.Ignore() + ignoreWhiteSpace(l) + return stateInObject + case unilex.ScanNumber(l): + l.Emit(lNumber) + ignoreWhiteSpace(l) + l.Accept(",") + l.Ignore() + ignoreWhiteSpace(l) + return stateInObject + case 
l.Accept("{"): + l.Emit(lObjectStart) + l.PushState(stateInObject) + return stateInObject + case l.Accept("["): + l.Emit(lArrayStart) + l.PushState(stateInObject) + return stateInArray + } + return l.Errorf("Unknown token: %s", string(l.Peek())) +} + +func stateInArray(l *unilex.Lexer) unilex.StateFunc { + ignoreWhiteSpace(l) + l.Accept(",") + ignoreWhiteSpace(l) + switch { + case unilex.ScanQuotedString(l, '"'): + l.Emit(lString) + case unilex.ScanNumber(l): + l.Emit(lNumber) + case l.Accept("{"): + l.Emit(lObjectStart) + l.PushState(stateInArray) + return stateInObject + case l.Accept("["): + l.Emit(lArrayStart) + l.PushState(stateInArray) + return stateInArray + case l.Accept("]"): + l.Emit(lArrayEnd) + return l.PopState() + } + return stateInArray } func ignoreWhiteSpace(l *unilex.Lexer) { diff --git a/lexem.go b/lexem.go index bd24ea9..c594ee9 100644 --- a/lexem.go +++ b/lexem.go @@ -9,12 +9,12 @@ type Lexem struct { } // LexType represents type of current lexem. -type LexType string +type LexType int // Some std lexem types const ( - // LError represents lexing error. - LError LexType = "ERROR" // LEOF represents end of input. - LEOF LexType = "EOF" + LexEOF LexType = -1 + // LexError represents lexing error. + LexError LexType = -2 ) diff --git a/lexer.go b/lexer.go index 10317bd..b556045 100644 --- a/lexer.go +++ b/lexer.go @@ -16,6 +16,7 @@ type Lexer struct { Pos int // Pos at input string. Output chan Lexem // Lexems channel. width int // Width of last rune. + states stateStack // Stack of saved states used by PushState and PopState. } // New returns new scanner for input string. @@ -37,6 +38,16 @@ func (l *Lexer) Run(init StateFunc) { close(l.Output) } +// PopState removes and returns the most recently pushed state function. +func (l *Lexer) PopState() StateFunc { + return l.states.Pop() +} + +// PushState saves a state to return to before descending into a nested state. +func (l *Lexer) PushState(s StateFunc) { + l.states.Push(s) +} + // Emit current lexem to output. 
func (l *Lexer) Emit(typ LexType) { l.Output <- Lexem{ @@ -51,7 +62,7 @@ func (l *Lexer) Emit(typ LexType) { // Errorf produces error lexem and stops scanning. func (l *Lexer) Errorf(format string, args ...interface{}) StateFunc { l.Output <- Lexem{ - Type: LError, + Type: LexError, Value: fmt.Sprintf(format, args...), Start: l.Start, End: l.Pos, diff --git a/statefunc.go b/statefunc.go index 5980ecc..734fe57 100644 --- a/statefunc.go +++ b/statefunc.go @@ -2,3 +2,17 @@ package unilex // StateFunc represents function that scans lexems and returns new state function or nil if lexing completed. type StateFunc func(*Lexer) StateFunc + +type stateStack []StateFunc + +func (ss *stateStack) Push(s StateFunc) { + *ss = append(*ss, s) +} + +func (ss *stateStack) Pop() (s StateFunc) { + if len(*ss) == 0 { + return nil + } + *ss, s = (*ss)[:len(*ss)-1], (*ss)[len(*ss)-1] + return s +}