gob payloads and add list processing

This commit is contained in:
dre 2021-07-04 20:31:45 +08:00
parent 72c5c9e85d
commit 6e059224fd
4 changed files with 158 additions and 77 deletions

View file

@ -2,16 +2,12 @@
Convert Markdown to Gemini [gemtext](https://gemini.circumlunar.space/docs/gemtext.gmi) markup with Convert Markdown to Gemini [gemtext](https://gemini.circumlunar.space/docs/gemtext.gmi) markup with
Go. Working with streams and pipes for UNIX like behavior utilizing Go channels. Processing streams Go. Working with streams and pipes for UNIX like behavior utilizing Go channels. Processing streams
line by line is deliberately slightly more challenging than it needs to be to play around with go line by line is slightly more complex than it needs to be as I'm playing with channels and state
state machines. machines here.
<!-- testing markdown, this should be deleted, below merged --> Internally md2gmi does a 1st pass that constructs the blocks of single lines for gemtext. This is
See the [gemini then streamed to the 2nd pass line by line. The 2nd pass will convert links, fix headings and stream
protocol](https://gemini.circumlunar.space/) and the [protocol line by line to the output sink. The sink is either a file or stdout.
spec](https://gemini.circumlunar.space/docs/specification.gmi).
Internally md2gmi does a 1st pass that constructs the core layout for gemtext. This is then streamed
to the 2nd pass line by line. The 2nd pass will convert links and stream line by line to the output.
### Usage ### Usage
@ -29,19 +25,31 @@ Usage of ./md2gmi:
cat file.md | md2gmi cat file.md | md2gmi
md2gmi -in file.md -out file.gmi md2gmi -in file.md -out file.gmi
The top part of this readme parses to The top part of this readme parses from
```md ```md
## md2gmi Convert Markdown to Gemini [gemtext](https://gemini.circumlunar.space/docs/gemtext.gmi) markup with
Go. Working with streams and pipes for UNIX like behavior utilizing Go channels. Processing streams
line by line is slightly more complex than it needs to be as I'm playing with channels and state
machines here.
Convert Markdown to Gemini gemtext[1] markup with Go. Working with streams and pipes for UNIX like behavior utilizing Go channels. Processing streams line by line is deliberately slightly more challenging than it needs to be to play around with go state machines. > this is
a quote
<!-- testing markdown, this should be deleted, below merged -->
See the [gemini
protocol](https://gemini.circumlunar.space/) and the [protocol
spec](https://gemini.circumlunar.space/docs/specification.gmi).
```
```md
Convert Markdown to Gemini gemtext[1] markup with Go. Working with streams and pipes for UNIX like behavior utilizing Go channels. Processing streams line by line is slightly more complex than it needs to be as I'm playing with channels and state machines here.
=> https://gemini.circumlunar.space/docs/gemtext.gmi 1: gemtext => https://gemini.circumlunar.space/docs/gemtext.gmi 1: gemtext
> this is a quote
See the gemini protocol[1] and the protocol spec[2]. See the gemini protocol[1] and the protocol spec[2].
=> https://gemini.circumlunar.space/ 1: gemini protocol => https://gemini.circumlunar.space/ 1: gemini protocol
=> https://gemini.circumlunar.space/docs/specification.gmi 2: protocol spec => https://gemini.circumlunar.space/docs/specification.gmi 2: protocol spec
Internally md2gmi does a 1st pass that constructs the core layout for gemtext. This is then streamed to the 2nd pass line by line. The 2nd pass will convert links and stream line by line to the output.
``` ```

48
main.go
View file

@ -2,12 +2,44 @@ package main
import ( import (
"bufio" "bufio"
"bytes"
"encoding/gob"
"flag" "flag"
"fmt" "fmt"
"io" "io"
"os" "os"
) )
// WorkItem carries one line of the stream together with its ordinal
// position, so downstream stages can keep output ordered.
type WorkItem struct {
	index   int    // 0-based position of the line in the stream
	payload []byte // gob-encoded copy of the line (see New / Payload)
}
// New builds a WorkItem for the given stream position. The payload is
// stored gob-encoded, which makes the item an independent copy of the
// caller's slice — later reuse of that slice (e.g. by bufio.Scanner)
// cannot alias into the item.
func New(index int, payload []byte) WorkItem {
	var encoded bytes.Buffer
	if err := gob.NewEncoder(&encoded).Encode(payload); err != nil {
		// Encoding a []byte into a fresh buffer cannot reasonably fail;
		// treat failure as a programmer bug.
		panic(err)
	}
	return WorkItem{index: index, payload: encoded.Bytes()}
}
// Index returns the ordinal position this item holds in the stream.
func (w *WorkItem) Index() int {
	return w.index
}
// Payload decodes and returns the stored line. Because the bytes were
// gob-encoded at construction time, every call yields a fresh slice
// that is independent of the item's internal storage.
func (w *WorkItem) Payload() []byte {
	var line []byte
	decoder := gob.NewDecoder(bytes.NewReader(w.payload))
	if err := decoder.Decode(&line); err != nil {
		// The payload was produced by New; a decode failure indicates
		// memory corruption or a programmer bug.
		panic(err)
	}
	return line
}
func reader(in string) (io.Reader, error) { func reader(in string) (io.Reader, error) {
if in != "" { if in != "" {
file, err := os.Open(in) file, err := os.Open(in)
@ -46,12 +78,14 @@ func InputStream(r io.Reader) *ir {
return &ir{r: r} return &ir{r: r}
} }
func (m *ir) Output() chan []byte { func (m *ir) Output() chan WorkItem {
data := make(chan []byte) data := make(chan WorkItem)
s := bufio.NewScanner(m.r) s := bufio.NewScanner(m.r)
go func() { go func() {
i := 0
for s.Scan() { for s.Scan() {
data <- s.Bytes() data <- New(i, s.Bytes())
i += 1
} }
close(data) close(data)
}() }()
@ -66,9 +100,9 @@ func OutputStream(w io.Writer) *ow {
return &ow{w: w} return &ow{w: w}
} }
func (m *ow) Input(data chan []byte) { func (m *ow) Input(data chan WorkItem) {
for b := range data { for b := range data {
write(m.w, b) write(m.w, b.Payload())
} }
} }
@ -97,9 +131,9 @@ func main() {
//sink.Input(preproc.Process(source.Output())) //sink.Input(preproc.Process(source.Output()))
sink.Input( sink.Input(
RemoveComments(
FormatHeadings(
FormatLinks( FormatLinks(
FormatHeadings(
RemoveComments(
preproc.Process(source.Output()), preproc.Process(source.Output()),
), ),
), ),

View file

@ -1,15 +1,23 @@
package main package main
import (
"bytes"
"regexp"
)
// state function // state function
type stateFn func(*fsm, []byte) stateFn type stateFn func(*fsm, []byte) stateFn
// state machine // state machine
type fsm struct { type fsm struct {
state stateFn state stateFn
out chan []byte
i int
out chan WorkItem
// combining multiple input lines // combining multiple input lines
buffer []byte blockBuffer []byte
sendBuffer []byte
// if we have a termination rule to abide, e.g. implied code fences // if we have a termination rule to abide, e.g. implied code fences
pending []byte pending []byte
} }
@ -18,42 +26,69 @@ func NewPreproc() *fsm {
return &fsm{} return &fsm{}
} }
func (m *fsm) Process(in chan []byte) chan []byte { func (m *fsm) Process(in chan WorkItem) chan WorkItem {
m.out = make(chan []byte) m.out = make(chan WorkItem)
go func() { go func() {
for m.state = normal; m.state != nil; { for m.state = normal; m.state != nil; {
b, ok := <-in b, ok := <-in
if !ok { if !ok {
m.flush() m.blockFlush()
m.sync()
close(m.out) close(m.out)
m.state = nil m.state = nil
continue continue
} }
m.state = m.state(m, b)
// fmt.Printf("i preproc '%v'\n", string(b.Payload()))
m.state = m.state(m, b.Payload())
m.sync()
} }
}() }()
return m.out return m.out
} }
func (m *fsm) flush() { func (m *fsm) sync() {
if len(m.sendBuffer) > 0 {
//m.sendBuffer = bytes.TrimSpace(m.sendBuffer)
m.sendBuffer = append(m.sendBuffer, '\n')
//fmt.Printf("o preproc '%v'\n", string(m.sendBuffer))
m.out <- New(m.i, m.sendBuffer)
m.sendBuffer = m.sendBuffer[:0]
m.i += 1
}
}
func (m *fsm) blockFlush() {
// blockBuffer to sendbuffer
//fmt.Println("block ", string(m.blockBuffer))
m.sendBuffer = append(m.sendBuffer, m.blockBuffer...)
m.blockBuffer = m.blockBuffer[:0]
if len(m.pending) > 0 { if len(m.pending) > 0 {
m.out <- append(m.pending, '\n') m.sendBuffer = append(m.sendBuffer, m.pending...)
m.sendBuffer = append(m.sendBuffer, '\n')
m.pending = m.pending[:0] m.pending = m.pending[:0]
} }
} }
// isBlank reports whether the line carries no bytes at all (a blank
// separator line in the Markdown input).
func isBlank(data []byte) bool {
	return len(data) < 1
}
// isHeader reports whether the line is a Markdown heading, i.e. its
// first byte is '#'. An empty line is never a heading.
func isHeader(data []byte) bool {
	if len(data) == 0 {
		return false
	}
	return data[0] == '#'
}
func triggerBreak(data []byte) bool { func triggerBreak(data []byte) bool {
return len(data) == 0 || data[len(data)-1] == '.' return len(data) == 0 || data[len(data)-1] == '.'
} }
// isTerminated reports whether the line is non-empty and does NOT end
// with a period — i.e. the sentence appears to continue on the next
// input line. NOTE(review): the name reads inverted relative to the
// logic (true means "not sentence-terminated"); confirm against the
// callers before renaming.
func isTerminated(data []byte) bool {
	if len(data) == 0 {
		return false
	}
	return data[len(data)-1] != '.'
}
// listRe matches a Markdown bullet marker ("-" or "*") with optional
// leading indentation, anchored to the start of the line. Compiled
// once at package scope: handleList runs for every input line, and
// recompiling the pattern per call is pure overhead.
var listRe = regexp.MustCompile(`^([ ]*[-*])`)

// handleList reports whether data is a Markdown list item and, if so,
// returns the line with its (possibly indented) bullet collapsed to a
// single top-level "-". Non-list lines are returned unchanged.
//
// The pattern is anchored to the line start: the previous unanchored
// form matched a '-' or '*' anywhere in the line, so ordinary prose
// such as "state - machine" (or bold "**text**") was wrongly rewritten
// as a list item.
func handleList(data []byte) ([]byte, bool) {
	sub := listRe.FindSubmatch(data)
	if len(sub) > 1 {
		return bytes.Replace(data, sub[1], []byte("-"), 1), true
	}
	return data, false
}
func isFence(data []byte) bool { func isFence(data []byte) bool {
return len(data) >= 3 && string(data[0:3]) == "```" return len(data) >= 3 && string(data[0:3]) == "```"
} }
@ -63,67 +98,62 @@ func needsFence(data []byte) bool {
} }
func normal(m *fsm, data []byte) stateFn { func normal(m *fsm, data []byte) stateFn {
m.flush() if data, isList := handleList(data); isList {
// blank line m.blockBuffer = append(data, '\n')
if isBlank(data) { m.blockFlush()
m.out <- []byte("\n")
return normal
}
// header
if isHeader(data) {
m.out <- append(data, '\n')
return normal return normal
} }
if isFence(data) { if isFence(data) {
m.out <- append(data, '\n') m.blockBuffer = append(data, '\n')
return fence return fence
} }
if needsFence(data) { if needsFence(data) {
m.out <- []byte("```\n") m.blockBuffer = append(m.blockBuffer, []byte("```\n")...)
m.out <- append(data[4:], '\n') m.blockBuffer = append(m.blockBuffer, append(data[4:], '\n')...)
m.pending = []byte("```\n") m.pending = []byte("```\n")
return toFence return toFence
} }
if data[len(data)-1] != '.' { if isTerminated(data) {
m.buffer = append(m.buffer, data...) m.blockBuffer = append(m.blockBuffer, data...)
m.buffer = append(m.buffer, []byte(" ")...) m.blockBuffer = append(m.blockBuffer, ' ')
return paragraph return paragraph
} }
// TODO // TODO
// collapse lists // collapse lists
m.out <- append(data, '\n') m.blockBuffer = append(m.blockBuffer, append(data, '\n')...)
m.blockFlush()
return normal return normal
} }
func fence(m *fsm, data []byte) stateFn { func fence(m *fsm, data []byte) stateFn {
m.out <- append(data, '\n') m.blockBuffer = append(m.blockBuffer, append(data, '\n')...)
// second fence returns to normal
if isFence(data) { if isFence(data) {
m.blockFlush()
return normal return normal
} }
return fence return fence
} }
func toFence(m *fsm, data []byte) stateFn { func toFence(m *fsm, data []byte) stateFn {
if len(data) >= 3 {
m.out <- append(data[4:], '\n')
} else {
//m.out <- []byte("\n")
}
if needsFence(data) { if needsFence(data) {
m.blockBuffer = append(m.blockBuffer, append(data[4:], '\n')...)
return toFence return toFence
} }
m.blockFlush()
m.blockBuffer = append(m.blockBuffer, append(data, '\n')...)
return normal return normal
} }
func paragraph(m *fsm, data []byte) stateFn { func paragraph(m *fsm, data []byte) stateFn {
if triggerBreak(data) { if triggerBreak(data) {
m.buffer = append(m.buffer, data...) m.blockBuffer = append(m.blockBuffer, data...)
m.out <- append(m.buffer, '\n') m.blockBuffer = bytes.TrimSpace(m.blockBuffer)
m.buffer = m.buffer[:0] m.blockBuffer = append(m.blockBuffer, '\n')
m.blockFlush()
return normal return normal
} }
m.buffer = append(m.buffer, data...) m.blockBuffer = append(m.blockBuffer, data...)
m.buffer = append(m.buffer, []byte(" ")...) m.blockBuffer = append(m.blockBuffer, []byte(" ")...)
return paragraph return paragraph
} }

33
proc.go
View file

@ -6,11 +6,11 @@ import (
"regexp" "regexp"
) )
func FormatLinks(in chan []byte) chan []byte { func FormatLinks(in chan WorkItem) chan WorkItem {
out := make(chan []byte) out := make(chan WorkItem)
go func() { go func() {
for b := range in { for b := range in {
out <- formatLinks(b) out <- New(b.Index(), formatLinks(b.Payload()))
} }
close(out) close(out)
}() }()
@ -37,33 +37,42 @@ func formatLinks(data []byte) []byte {
return data return data
} }
func RemoveComments(in chan []byte) chan []byte { func RemoveComments(in chan WorkItem) chan WorkItem {
out := make(chan []byte) out := make(chan WorkItem)
go func() { go func() {
re := regexp.MustCompile(`<!--.*-->`) re := regexp.MustCompile(`<!--.*-->`)
for b := range in { for b := range in {
out <- re.ReplaceAll(b, []byte{}) data := b.Payload()
for _, match := range re.FindAllSubmatch(data, -1) {
data = bytes.Replace(data, match[0], []byte(""), 1)
}
out <- New(b.Index(), append(bytes.TrimSpace(data), '\n'))
//out <- New(b.Index(), data)
} }
close(out) close(out)
}() }()
return out return out
} }
func FormatHeadings(in chan []byte) chan []byte { func FormatHeadings(in chan WorkItem) chan WorkItem {
out := make(chan []byte) out := make(chan WorkItem)
go func() { go func() {
re := regexp.MustCompile(`^[#]{4,}`) re := regexp.MustCompile(`^[#]{4,}`)
re2 := regexp.MustCompile(`^(#+)[^# ]`) re2 := regexp.MustCompile(`^(#+)[^# ]`)
for b := range in { for b := range in {
// fix up more than 4 levels // fix up more than 4 levels
b = re.ReplaceAll(b, []byte("###")) data := re.ReplaceAll(b.Payload(), []byte("###"))
// ensure we have a space // ensure we have a space
sub := re2.FindSubmatch(b) sub := re2.FindSubmatch(data)
if len(sub) > 0 { if len(sub) > 0 {
b = bytes.Replace(b, sub[1], append(sub[1], []byte(" ")...), 1) data = bytes.Replace(data, sub[1], append(sub[1], []byte(" ")...), 1)
}
// generally if we deal with a heading, add an extra blank line
if bytes.HasPrefix(data, []byte("#")) {
data = append(data, '\n')
} }
// writeback // writeback
out <- b out <- New(b.Index(), data)
} }
close(out) close(out)