Compatibility with https://github.com/caneroj1/stemmer library
This commit is contained in:
parent
0911847693
commit
5c7748b15e
3 changed files with 76 additions and 7 deletions
19
README.md
19
README.md
|
@ -9,12 +9,29 @@
|
||||||
|
|
||||||
Этот пакет - реализация [стеммера Портера](https://ru.wikipedia.org/wiki/Стемминг#Стеммер_Портера) для русского языка на Go.
|
Этот пакет - реализация [стеммера Портера](https://ru.wikipedia.org/wiki/Стемминг#Стеммер_Портера) для русского языка на Go.
|
||||||
|
|
||||||
|
Интерфейс совместим со стеммером https://github.com/caneroj1/stemmer
|
||||||
|
|
||||||
## Использование
|
## Использование
|
||||||
|
|
||||||
`основа := StemmerRu.StemWord("слово")`
|
`основа := StemmerRu.Stem("слово")`
|
||||||
|
|
||||||
Преобразует слово на входе в его основу на выходе
|
Преобразует слово на входе в его основу на выходе
|
||||||
|
|
||||||
|
Также из библиотеки https://github.com/caneroj1/stemmer взяты следующие методы:
|
||||||
|
|
||||||
|
```
|
||||||
|
// stem a list of words
|
||||||
|
stems := StemmerRu.StemMultiple(strings)
|
||||||
|
|
||||||
|
// stem a list of words in place, modifying the original slice
|
||||||
|
StemmerRu.StemMultipleMutate(&strings)
|
||||||
|
|
||||||
|
// stem a list of words concurrently. this also stems in place, modifying
|
||||||
|
// the original slice.
|
||||||
|
// NOTE: the order of the strings is not guaranteed to be the same.
|
||||||
|
StemmerRu.StemConcurrent(&strings)
|
||||||
|
```
|
||||||
|
|
||||||
## Пример
|
## Пример
|
||||||
|
|
||||||
```
|
```
|
||||||
|
|
58
stemmer.go
58
stemmer.go
|
@ -1,6 +1,7 @@
|
||||||
package StemmerRu
|
package StemmerRu
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"runtime"
|
||||||
"strings"
|
"strings"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -17,7 +18,7 @@ var (
|
||||||
vowels = `аеиоуыэюя`
|
vowels = `аеиоуыэюя`
|
||||||
)
|
)
|
||||||
|
|
||||||
func StemWord(word string) string {
|
func Stem(word string) string {
|
||||||
|
|
||||||
word = strings.Replace(word, `ё`, `е`, -1)
|
word = strings.Replace(word, `ё`, `е`, -1)
|
||||||
|
|
||||||
|
@ -27,10 +28,9 @@ func StemWord(word string) string {
|
||||||
return word
|
return word
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
R1pos := getRNPart(word, 0)
|
R1pos := getRNPart(word, 0)
|
||||||
R2pos := getRNPart(word, R1pos)
|
R2pos := getRNPart(word, R1pos)
|
||||||
if (R2pos < RVpos) {
|
if R2pos < RVpos {
|
||||||
R2pos = 0
|
R2pos = 0
|
||||||
} else {
|
} else {
|
||||||
R2pos -= RVpos
|
R2pos -= RVpos
|
||||||
|
@ -140,3 +140,55 @@ func getRNPart(word string, startPos int) int {
|
||||||
|
|
||||||
return startPos
|
return startPos
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Code from https://github.com/caneroj1/stemmer
|
||||||
|
|
||||||
|
// StemMultiple accepts a slice of strings and stems each of them.
|
||||||
|
func StemMultiple(words []string) (output []string) {
|
||||||
|
output = make([]string, len(words))
|
||||||
|
for idx, word := range words {
|
||||||
|
output[idx] = Stem(word)
|
||||||
|
}
|
||||||
|
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// StemMultipleMutate accepts a pointer to a slice of strings and stems them in place.
|
||||||
|
// It modifies the original slice.
|
||||||
|
func StemMultipleMutate(words *[]string) {
|
||||||
|
for idx, word := range *words {
|
||||||
|
(*words)[idx] = Stem(word)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// StemConcurrent accepts a pointer to a slice of strings and stems them in place.
|
||||||
|
// It tries to offload the work into multiple threads. It makes no guarantees about
|
||||||
|
// the order of the stems in the modified slice.
|
||||||
|
func StemConcurrent(words *[]string) {
|
||||||
|
CPUs := runtime.NumCPU()
|
||||||
|
length := len(*words)
|
||||||
|
output := make(chan string)
|
||||||
|
partition := length / CPUs
|
||||||
|
|
||||||
|
var CPU int
|
||||||
|
for CPU = 0; CPU < CPUs; CPU++ {
|
||||||
|
go func(strs []string) {
|
||||||
|
for _, word := range strs {
|
||||||
|
output <- Stem(word)
|
||||||
|
}
|
||||||
|
}((*words)[CPU*partition : (CPU+1)*partition])
|
||||||
|
}
|
||||||
|
|
||||||
|
// if there are leftover words, stem them now
|
||||||
|
if length-(CPU)*partition > 0 {
|
||||||
|
go func(strs []string) {
|
||||||
|
for _, word := range strs {
|
||||||
|
output <- Stem(word)
|
||||||
|
}
|
||||||
|
}((*words)[(CPU)*partition : length])
|
||||||
|
}
|
||||||
|
|
||||||
|
for idx := 0; idx < length; idx++ {
|
||||||
|
(*words)[idx] = <-output
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
|
@ -1,9 +1,9 @@
|
||||||
package StemmerRu
|
package StemmerRu
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"testing"
|
|
||||||
"io/ioutil"
|
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
|
"io/ioutil"
|
||||||
|
"testing"
|
||||||
)
|
)
|
||||||
|
|
||||||
var testFile = `tests.json`
|
var testFile = `tests.json`
|
||||||
|
@ -19,7 +19,7 @@ func TestStemWord(t *testing.T) {
|
||||||
t.Error("Can't parse json", err)
|
t.Error("Can't parse json", err)
|
||||||
}
|
}
|
||||||
for source, expected := range *tests {
|
for source, expected := range *tests {
|
||||||
result := StemWord(source);
|
result := Stem(source)
|
||||||
if expected != result {
|
if expected != result {
|
||||||
t.Errorf(`Expected "%s" (source: %s) but got "%s"`, result, source, result)
|
t.Errorf(`Expected "%s" (source: %s) but got "%s"`, result, source, result)
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue