From 5c7748b15e2fcc616c89492b85f367750e327d79 Mon Sep 17 00:00:00 2001 From: Alexander Kiryukhin Date: Tue, 28 May 2019 13:24:40 +0300 Subject: [PATCH] Compatibility with https://github.com/caneroj1/stemmer library --- README.md | 19 +++++++++++++++- stemmer.go | 58 ++++++++++++++++++++++++++++++++++++++++++++++--- stemmer_test.go | 6 ++--- 3 files changed, 76 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index edd8693..6ff53d4 100644 --- a/README.md +++ b/README.md @@ -9,12 +9,29 @@ Этот пакет - реализация [стеммера Портера](https://ru.wikipedia.org/wiki/Стемминг#Стеммер_Портера) для русского языка на Go. +Интерфейс совместим со стеммером https://github.com/caneroj1/stemmer + ## Использование -`основа := StemmerRu.StemWord("слово")` +`основа := StemmerRu.Stem("слово")` Преобразует слово на входе в его основу на выходе +Также из библиотеки https://github.com/caneroj1/stemmer взяты следующие функции: + +``` + // stem a list of words + stems := StemmerRu.StemMultiple(strings) + + // stem a list of words in place, modifying the original slice + StemmerRu.StemMultipleMutate(&strings) + + // stem a list of words concurrently. this also stems in place, modifying + // the original slice. + // NOTE: the order of the strings is not guaranteed to be the same. 
+ StemmerRu.StemConcurrent(strings) +``` + ## Пример ``` diff --git a/stemmer.go b/stemmer.go index f15281b..80dc592 100644 --- a/stemmer.go +++ b/stemmer.go @@ -1,6 +1,7 @@ package StemmerRu import ( + "runtime" "strings" ) @@ -17,7 +18,7 @@ var ( vowels = `аеиоуыэюя` ) -func StemWord(word string) string { +func Stem(word string) string { word = strings.Replace(word, `ё`, `е`, -1) @@ -27,10 +28,9 @@ func StemWord(word string) string { return word } - R1pos := getRNPart(word, 0) R2pos := getRNPart(word, R1pos) - if (R2pos < RVpos) { + if R2pos < RVpos { R2pos = 0 } else { R2pos -= RVpos @@ -140,3 +140,55 @@ func getRNPart(word string, startPos int) int { return startPos } + +// Code from https://github.com/caneroj1/stemmer + +// StemMultiple accepts a slice of strings and returns a new slice of the same length holding the stem of each word; the input slice is left unmodified. +func StemMultiple(words []string) (output []string) { + output = make([]string, len(words)) + for idx, word := range words { + output[idx] = Stem(word) + } + + return +} + +// StemMultipleMutate accepts a pointer to a slice of strings and stems them in place. +// It overwrites each element of the original slice with its stem; words is dereferenced unconditionally and must not be nil. +func StemMultipleMutate(words *[]string) { + for idx, word := range *words { + (*words)[idx] = Stem(word) + } +} + +// StemConcurrent accepts a pointer to a slice of strings and stems them in place. +// It tries to offload the work into multiple goroutines. It makes no guarantees about +// the order of the stems in the modified slice. 
+func StemConcurrent(words *[]string) { + CPUs := runtime.NumCPU() + length := len(*words) + output := make(chan string) + partition := length / CPUs + + var CPU int + for CPU = 0; CPU < CPUs; CPU++ { + go func(strs []string) { + for _, word := range strs { + output <- Stem(word) + } + }((*words)[CPU*partition : (CPU+1)*partition]) + } + + // if there are leftover words, stem them now + if length-(CPU)*partition > 0 { + go func(strs []string) { + for _, word := range strs { + output <- Stem(word) + } + }((*words)[(CPU)*partition : length]) + } + + for idx := 0; idx < length; idx++ { + (*words)[idx] = <-output + } +} diff --git a/stemmer_test.go b/stemmer_test.go index 8b27774..334a329 100644 --- a/stemmer_test.go +++ b/stemmer_test.go @@ -1,9 +1,9 @@ package StemmerRu import ( - "testing" - "io/ioutil" "encoding/json" + "io/ioutil" + "testing" ) var testFile = `tests.json` @@ -19,7 +19,7 @@ func TestStemWord(t *testing.T) { t.Error("Can't parse json", err) } for source, expected := range *tests { - result := StemWord(source); + result := Stem(source) if expected != result { t.Errorf(`Expected "%s" (source: %s) but got "%s"`, result, source, result) }