micro/internal/util/unicode.go

97 lines
2.3 KiB
Go
Raw Normal View History

2020-05-20 23:43:12 +03:00
package util
import (
"unicode"
"unicode/utf8"
)
// Unicode is annoying. A "code point" (rune in Go-speak) may need up to
// 4 bytes to represent it. In general, a code point will represent a
// complete character, but this is not always the case. A character with
// accents may be made up of multiple code points (the code point for the
// original character, and additional code points for each accent/marking).
// The functions below are meant to help deal with these additional "combining"
// code points. In underlying operations (search, replace, etc...), micro will
// treat a character with combining code points as just the original code point.
// For rendering, micro will display the combining characters. It's not perfect
// but it's pretty good.
var minMark = rune(unicode.Mark.R16[0].Lo)
func isMark(r rune) bool {
// Fast path
if r < minMark {
return false
}
return unicode.In(r, unicode.Mark)
}
2020-05-20 23:43:12 +03:00
// DecodeCharacter returns the next character from an array of bytes
// A character is a rune along with any accompanying combining runes
func DecodeCharacter(b []byte) (rune, []rune, int) {
r, size := utf8.DecodeRune(b)
b = b[size:]
c, s := utf8.DecodeRune(b)
var combc []rune
for isMark(c) {
2020-05-20 23:43:12 +03:00
combc = append(combc, c)
size += s
b = b[s:]
c, s = utf8.DecodeRune(b)
}
2020-05-20 23:47:08 +03:00
2020-05-20 23:43:12 +03:00
return r, combc, size
}
// DecodeCharacterInString returns the next character from a string
// A character is a rune along with any accompanying combining runes
func DecodeCharacterInString(str string) (rune, []rune, int) {
r, size := utf8.DecodeRuneInString(str)
str = str[size:]
c, s := utf8.DecodeRuneInString(str)
var combc []rune
for isMark(c) {
combc = append(combc, c)
size += s
str = str[s:]
c, s = utf8.DecodeRuneInString(str)
}
return r, combc, size
}
2020-05-20 23:43:12 +03:00
// CharacterCount returns the number of characters in a byte array
// Similar to utf8.RuneCount but for unicode characters
func CharacterCount(b []byte) int {
s := 0
for len(b) > 0 {
r, size := utf8.DecodeRune(b)
if !isMark(r) {
2020-05-20 23:43:12 +03:00
s++
}
b = b[size:]
}
return s
}
// CharacterCount returns the number of characters in a string
// Similar to utf8.RuneCountInString but for unicode characters
func CharacterCountInString(str string) int {
s := 0
for _, r := range str {
if !isMark(r) {
2020-05-20 23:43:12 +03:00
s++
}
}
return s
}