This commit is contained in:
Александр Кирюхин 2019-05-28 13:24:40 +03:00
parent 0911847693
commit 5c7748b15e
3 changed files with 76 additions and 7 deletions

View file

@ -9,12 +9,29 @@
Этот пакет - реализация [стеммера Портера](https://ru.wikipedia.org/wiki/Стемминг#Стеммер_Портера) для русского языка на Go.
Интерфейс совместим со стеммером https://github.com/caneroj1/stemmer
## Использование
`основа := StemmerRu.StemWord("слово")`
`основа := StemmerRu.Stem("слово")`
Преобразует слово на входе в его основу на выходе
Также из библиотеки https://github.com/caneroj1/stemmer взяты следующие методы:
```
// stem a list of words
stems := StemmerRu.StemMultiple(strings)
// stem a list of words in place, modifying the original slice
StemmerRu.StemMultipleMutate(strings)
// stem a list of words concurrently. this also stems in place, modifying
// the original slice.
// NOTE: the order of the strings is not guaranteed to be the same.
StemmerRu.StemConcurrent(strings)
```
## Пример
```

View file

@ -1,6 +1,7 @@
package StemmerRu
import (
"runtime"
"strings"
"sync"
)
@ -17,7 +18,7 @@ var (
vowels = `аеиоуыэюя`
)
func StemWord(word string) string {
func Stem(word string) string {
word = strings.Replace(word, `ё`, `е`, -1)
@ -27,10 +28,9 @@ func StemWord(word string) string {
return word
}
R1pos := getRNPart(word, 0)
R2pos := getRNPart(word, R1pos)
if (R2pos < RVpos) {
if R2pos < RVpos {
R2pos = 0
} else {
R2pos -= RVpos
@ -140,3 +140,55 @@ func getRNPart(word string, startPos int) int {
return startPos
}
// Code from https://github.com/caneroj1/stemmer
// StemMultiple stems every word of the given slice and returns the stems in a
// newly allocated slice, in the same order as the input. The input slice is
// not modified.
func StemMultiple(words []string) (output []string) {
	output = make([]string, 0, len(words))
	for _, w := range words {
		output = append(output, Stem(w))
	}
	return output
}
// StemMultipleMutate takes a pointer to a slice of strings and replaces each
// element with its stem. The caller's slice is overwritten in place; nothing
// is returned.
func StemMultipleMutate(words *[]string) {
	list := *words
	for i := range list {
		list[i] = Stem(list[i])
	}
}
// StemConcurrent accepts a pointer to a slice of strings and stems them in place,
// spreading the work over up to runtime.NumCPU() goroutines.
//
// Each goroutine owns a disjoint, contiguous part of the slice and writes every
// stem back to the position of the word it came from. Because of that, no
// goroutine ever reads an element that another one has already overwritten, and
// the stems end up in the same order as the input words (a stronger guarantee
// than the "order is not guaranteed" contract this function had before).
//
// The function returns only after all goroutines have finished.
func StemConcurrent(words *[]string) {
	strs := *words
	length := len(strs)
	if length == 0 {
		return
	}

	// Never start more workers than there are words to stem.
	workers := runtime.NumCPU()
	if workers > length {
		workers = length
	}

	// Ceiling division: the last chunk may be shorter, but no word is left over.
	chunk := (length + workers - 1) / workers

	var wg sync.WaitGroup
	for start := 0; start < length; start += chunk {
		end := start + chunk
		if end > length {
			end = length
		}

		wg.Add(1)
		go func(part []string) {
			defer wg.Done()
			// part aliases the caller's backing array, so this writes in place.
			for idx, word := range part {
				part[idx] = Stem(word)
			}
		}(strs[start:end])
	}
	wg.Wait()
}

View file

@ -1,9 +1,9 @@
package StemmerRu
import (
"testing"
"io/ioutil"
"encoding/json"
"io/ioutil"
"testing"
)
var testFile = `tests.json`
@ -19,7 +19,7 @@ func TestStemWord(t *testing.T) {
t.Error("Can't parse json", err)
}
for source, expected := range *tests {
result := StemWord(source);
result := Stem(source)
if expected != result {
t.Errorf(`Expected "%s" (source: %s) but got "%s"`, result, source, result)
}