This commit is contained in:
Александр Кирюхин 2019-05-28 13:24:40 +03:00
parent 0911847693
commit 5c7748b15e
3 changed files with 76 additions and 7 deletions

View file

@ -9,12 +9,29 @@
Этот пакет - реализация [стеммера Портера](https://ru.wikipedia.org/wiki/Стемминг#Стеммер_Портера) для русского языка на Go. Этот пакет - реализация [стеммера Портера](https://ru.wikipedia.org/wiki/Стемминг#Стеммер_Портера) для русского языка на Go.
Интерфейс совместим со стеммером https://github.com/caneroj1/stemmer
## Использование ## Использование
`основа := StemmerRu.StemWord("слово")` `основа := StemmerRu.Stem("слово")`
Преобразует слово на входе в его основу на выходе Преобразует слово на входе в его основу на выходе
Также из библиотеки https://github.com/caneroj1/stemmer взяты следующие методы:
```
// stem a list of words
stems := StemmerRu.StemMultiple(strings)
// stem a list of words in place, modifying the original slice
StemmerRu.StemMultipleMutate(&strings)
// stem a list of words concurrently. this also stems in place, modifying
// the original slice.
// NOTE: the order of the strings is not guaranteed to be the same.
StemmerRu.StemConcurrent(&strings)
```
## Пример ## Пример
``` ```

View file

@ -1,6 +1,7 @@
package StemmerRu package StemmerRu
import ( import (
"runtime"
"strings" "strings"
) )
@ -17,7 +18,7 @@ var (
vowels = `аеиоуыэюя` vowels = `аеиоуыэюя`
) )
func StemWord(word string) string { func Stem(word string) string {
word = strings.Replace(word, `ё`, `е`, -1) word = strings.Replace(word, `ё`, `е`, -1)
@ -27,10 +28,9 @@ func StemWord(word string) string {
return word return word
} }
R1pos := getRNPart(word, 0) R1pos := getRNPart(word, 0)
R2pos := getRNPart(word, R1pos) R2pos := getRNPart(word, R1pos)
if (R2pos < RVpos) { if R2pos < RVpos {
R2pos = 0 R2pos = 0
} else { } else {
R2pos -= RVpos R2pos -= RVpos
@ -140,3 +140,55 @@ func getRNPart(word string, startPos int) int {
return startPos return startPos
} }
// Code from https://github.com/caneroj1/stemmer
// StemMultiple returns a freshly allocated slice holding the stem of every
// entry of words, in the same order. The input slice is not modified.
func StemMultiple(words []string) (output []string) {
	stems := make([]string, 0, len(words))
	for i := range words {
		stems = append(stems, Stem(words[i]))
	}
	return stems
}
// StemMultipleMutate replaces every entry of the slice that words points to
// with its stem. The caller's slice is modified in place; nothing is returned.
func StemMultipleMutate(words *[]string) {
	list := *words
	for i := range list {
		list[i] = Stem(list[i])
	}
}
// StemConcurrent stems, in place, every entry of the slice that words points
// to, spreading the work over up to runtime.NumCPU() goroutines.
//
// Each goroutine owns a disjoint, contiguous part of the slice and writes the
// stems straight back into it, so no goroutine ever reads an element that
// another one is writing. As a consequence the stems end up at the same index
// as the words they were derived from; callers that did not rely on any
// particular order are unaffected.
//
// The function returns only after every goroutine has finished. An empty
// slice is a no-op.
func StemConcurrent(words *[]string) {
	list := *words
	length := len(list)
	if length == 0 {
		return
	}

	// Never start more workers than there are words to stem.
	workers := runtime.NumCPU()
	if workers > length {
		workers = length
	}
	// Ceiling division: the chunks together cover the whole slice, so no
	// separate "leftover" pass is needed.
	chunk := (length + workers - 1) / workers

	done := make(chan struct{})
	started := 0
	for start := 0; start < length; start += chunk {
		end := start + chunk
		if end > length {
			end = length
		}
		started++
		go func(part []string) {
			for i := range part {
				part[i] = Stem(part[i])
			}
			// The send also publishes this worker's writes to the caller.
			done <- struct{}{}
		}(list[start:end])
	}

	// Wait for every worker before handing the slice back to the caller.
	for ; started > 0; started-- {
		<-done
	}
}

View file

@ -1,9 +1,9 @@
package StemmerRu package StemmerRu
import ( import (
"testing"
"io/ioutil"
"encoding/json" "encoding/json"
"io/ioutil"
"testing"
) )
var testFile = `tests.json` var testFile = `tests.json`
@ -19,7 +19,7 @@ func TestStemWord(t *testing.T) {
t.Error("Can't parse json", err) t.Error("Can't parse json", err)
} }
for source, expected := range *tests { for source, expected := range *tests {
result := StemWord(source); result := Stem(source)
if expected != result { if expected != result {
t.Errorf(`Expected "%s" (source: %s) but got "%s"`, result, source, result) t.Errorf(`Expected "%s" (source: %s) but got "%s"`, result, source, result)
} }