Initial
This commit is contained in:
commit
6796db17de
6 changed files with 49899 additions and 0 deletions
1
.gitignore
vendored
Normal file
1
.gitignore
vendored
Normal file
|
@ -0,0 +1 @@
|
|||
.idea
|
21
LICENSE
Normal file
21
LICENSE
Normal file
|
@ -0,0 +1,21 @@
|
|||
MIT License
|
||||
|
||||
Copyright (c) 2018 Alexander Kiryukhin
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
32
README.md
Normal file
32
README.md
Normal file
|
@ -0,0 +1,32 @@
|
|||
# Стеммер Портера для русского языка
|
||||
|
||||
Стемминг - процесс получения основы слова из любой его формы. Иными словами, отсекает лишние суффиксы и окончания.
|
||||
|
||||
Самое очевидное применение - в полнотекстовом поиске, где нужно, чтобы слово находилось, даже если у него другое окончание.
|
||||
|
||||
Этот пакет - реализация [стеммера Портера](https://ru.wikipedia.org/wiki/Стемминг#Стеммер_Портера) для русского языка на Go.
|
||||
|
||||
## Использование
|
||||
|
||||
`основа := StemmerRu.StemWord("слово")`
|
||||
|
||||
Преобразует слово на входе в его основу на выходе
|
||||
|
||||
## Пример
|
||||
|
||||
```
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"github.com/neonxp/StemmerRu"
|
||||
)
|
||||
|
||||
func main() {
|
||||
fmt.Println(StemmerRu.StemWord("безмолвны") // выведет: безмолвн
|
||||
fmt.Println(StemmerRu.StemWord("безмолвные") // выведет: безмолвн
|
||||
fmt.Println(StemmerRu.StemWord("безмолвный") // выведет: безмолвн
|
||||
fmt.Println(StemmerRu.StemWord("безмолвным") // выведет: безмолвн
|
||||
fmt.Println(StemmerRu.StemWord("безмолвных") // выведет: безмолвн
|
||||
}
|
||||
```
|
142
StemmerRu/stemmer.go
Normal file
142
StemmerRu/stemmer.go
Normal file
|
@ -0,0 +1,142 @@
|
|||
package StemmerRu
|
||||
|
||||
import (
|
||||
"strings"
|
||||
)
|
||||
|
||||
var (
|
||||
perfectiveGerund = [][]string{{`в`, `вши`, `вшись`}, {`ив`, `ивши`, `ившись`, `ыв`, `ывши`, `ывшись`}}
|
||||
adjective = []string{`ее`, `ие`, `ые`, `ое`, `ими`, `ыми`, `ей`, `ий`, `ый`, `ой`, `ем`, `им`, `ым`, `ом`, `его`, `ого`, `ему`, `ому`, `их`, `ых`, `ую`, `юю`, `ая`, `яя`, `ою`, `ею`}
|
||||
participle = [][]string{{`ем`, `нн`, `вш`, `ющ`, `щ`}, {`ивш`, `ывш`, `ующ`}}
|
||||
reflexive = []string{`ся`, `сь`}
|
||||
verb = [][]string{{`ла`, `на`, `ете`, `йте`, `ли`, `й`, `л`, `ем`, `н`, `ло`, `но`, `ет`, `ют`, `ны`, `ть`, `ешь`, `нно`}, {`ила`, `ыла`, `ена`, `ейте`, `уйте`, `ите`, `или`, `ыли`, `ей`, `уй`, `ил`, `ыл`, `им`, `ым`, `ен`, `ило`, `ыло`, `ено`, `ят`, `ует`, `уют`, `ит`, `ыт`, `ены`, `ить`, `ыть`, `ишь`, `ую`, `ю`}}
|
||||
noun = []string{`а`, `ев`, `ов`, `ие`, `ье`, `е`, `иями`, `ями`, `ами`, `еи`, `ии`, `и`, `ией`, `ей`, `ой`, `ий`, `й`, `иям`, `ям`, `ием`, `ем`, `ам`, `ом`, `о`, `у`, `ах`, `иях`, `ях`, `ы`, `ь`, `ию`, `ью`, `ю`, `ия`, `ья`, `я`}
|
||||
superlative = []string{`ейш`, `ейше`}
|
||||
derivational = []string{`ост`, `ость`}
|
||||
|
||||
vowels = `аеиоуыэюя`
|
||||
)
|
||||
|
||||
func StemWord(word string) string {
|
||||
|
||||
word = strings.Replace(word, `ё`, `е`, -1)
|
||||
|
||||
RVpos := getRVPart(word)
|
||||
|
||||
if RVpos == -1 {
|
||||
return word
|
||||
}
|
||||
|
||||
|
||||
R1pos := getRNPart(word, 0)
|
||||
R2pos := getRNPart(word, R1pos)
|
||||
if (R2pos < RVpos) {
|
||||
R2pos = 0
|
||||
} else {
|
||||
R2pos -= RVpos
|
||||
}
|
||||
|
||||
suffix := string([]rune(word)[RVpos:])
|
||||
prefix := string([]rune(word)[:RVpos])
|
||||
|
||||
// Step 1
|
||||
suffix, isTrimmed := trimSuffix(suffix, perfectiveGerund[1], perfectiveGerund[0])
|
||||
if !isTrimmed {
|
||||
suffix, isTrimmed = trimSuffix(suffix, reflexive, nil)
|
||||
suffix, isTrimmed = trimAdjectival(suffix)
|
||||
if !isTrimmed {
|
||||
suffix, isTrimmed = trimSuffix(suffix, verb[1], verb[0])
|
||||
if !isTrimmed {
|
||||
suffix, _ = trimSuffix(suffix, noun, nil)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Step 2
|
||||
suffix = strings.TrimSuffix(suffix, `и`)
|
||||
|
||||
// Step 3
|
||||
if R2pos < len([]rune(suffix)) {
|
||||
R2suffix := string([]rune(suffix)[R2pos:])
|
||||
R2prefix := string([]rune(suffix)[:R2pos])
|
||||
R2suffix, _ = trimSuffix(R2suffix, derivational, nil)
|
||||
suffix = R2prefix + R2suffix
|
||||
}
|
||||
|
||||
// Step 4
|
||||
suffix, isTrimmed = trimNN(suffix)
|
||||
if !isTrimmed {
|
||||
suffix, isTrimmed = trimSuffix(suffix, superlative, nil)
|
||||
if isTrimmed {
|
||||
suffix, _ = trimNN(suffix)
|
||||
} else {
|
||||
suffix = strings.TrimSuffix(suffix, `ь`)
|
||||
}
|
||||
}
|
||||
|
||||
return prefix + suffix
|
||||
}
|
||||
|
||||
func trimNN(word string) (string, bool) {
|
||||
if strings.HasSuffix(word, `нн`) {
|
||||
return strings.TrimSuffix(word, `нн`) + `н`, true
|
||||
}
|
||||
|
||||
return word, false
|
||||
}
|
||||
|
||||
func trimAdjectival(word string) (string, bool) {
|
||||
isTrimmedParticiple := false
|
||||
word, isTrimmedAdjective := trimSuffix(word, adjective, nil)
|
||||
if isTrimmedAdjective {
|
||||
word, isTrimmedParticiple = trimSuffix(word, participle[1], participle[0])
|
||||
}
|
||||
|
||||
return word, isTrimmedAdjective || isTrimmedParticiple
|
||||
}
|
||||
|
||||
func trimSuffix(word string, suffixes []string, suffixes2 []string) (string, bool) {
|
||||
for _, suffix := range suffixes {
|
||||
if strings.HasSuffix(word, suffix) {
|
||||
return strings.TrimSuffix(word, suffix), true
|
||||
}
|
||||
}
|
||||
if suffixes2 != nil {
|
||||
for _, suffix := range suffixes2 {
|
||||
if strings.HasSuffix(word, `а`+suffix) ||
|
||||
strings.HasSuffix(word, `я`+suffix) {
|
||||
return strings.TrimSuffix(word, suffix), true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return word, false
|
||||
}
|
||||
|
||||
func isVowel(char rune) bool {
|
||||
return strings.Contains(vowels, string(char))
|
||||
}
|
||||
|
||||
func getRVPart(word string) int {
|
||||
chars := []rune(word)
|
||||
for idx, char := range chars {
|
||||
if isVowel(char) {
|
||||
return idx + 1
|
||||
}
|
||||
}
|
||||
|
||||
return -1
|
||||
}
|
||||
|
||||
func getRNPart(word string, startPos int) int {
|
||||
chars := []rune(word)[startPos:]
|
||||
for idx, char := range chars {
|
||||
if idx+2 < len(chars) {
|
||||
if isVowel(char) && !isVowel(chars[idx+1]) {
|
||||
return startPos + idx + 2
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return startPos
|
||||
}
|
28
StemmerRu/stemmer_test.go
Normal file
28
StemmerRu/stemmer_test.go
Normal file
|
@ -0,0 +1,28 @@
|
|||
package StemmerRu
|
||||
|
||||
import (
|
||||
"testing"
|
||||
"io/ioutil"
|
||||
"encoding/json"
|
||||
"path"
|
||||
)
|
||||
|
||||
var testFile = path.Join(`..`, `tests.json`)
|
||||
|
||||
func TestStemWord(t *testing.T) {
|
||||
file, err := ioutil.ReadFile(testFile)
|
||||
if err != nil {
|
||||
t.Error("Can't open file", testFile)
|
||||
}
|
||||
tests := &map[string]string{}
|
||||
err = json.Unmarshal(file, tests)
|
||||
if err != nil {
|
||||
t.Error("Can't parse json", err)
|
||||
}
|
||||
for source, expected := range *tests {
|
||||
result := StemWord(source);
|
||||
if expected != result {
|
||||
t.Errorf(`Expected "%s" (source: %s) but got "%s"`, result, source, result)
|
||||
}
|
||||
}
|
||||
}
|
49675
tests.json
Normal file
49675
tests.json
Normal file
File diff suppressed because it is too large
Load diff
Loading…
Reference in a new issue