Skip to content

Commit

Permalink
🎇 增加simhash算法以及修复cosine算法的一些bug (#4)
Browse files Browse the repository at this point in the history
* 🎇 增加simhash算法以及修复cosine算法的一些bug

* 🎇 代码覆盖率,simhash在文本长度比较大时覆盖率会大很多。
  • Loading branch information
SummerSec authored May 29, 2022
1 parent bc95bc9 commit 3977214
Show file tree
Hide file tree
Showing 9 changed files with 228 additions and 14 deletions.
16 changes: 13 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,10 @@ go get -u github.com/antlabs/strsim
* 莱文斯坦-编辑距离(Levenshtein)
* Hamming
* Dice's coefficient
* Jaro
* JaroWinkler
* Cosine similarity algorithm
* Jaro
* JaroWinkler
* Cosine
* Simhash

## 内容
- [比较两个字符串相似度](#比较两个字符串相似度)
Expand All @@ -39,6 +40,9 @@ go get -u github.com/antlabs/strsim
- [选择Dice's coefficient](#选择Dice's-coefficient)
- [选择jaro](#选择jaro)
- [选择Hamming](#选择Hamming)
- [选择JaroWinkler](#选择JaroWinkler)
- [选择Cosine](#选择Cosine)
- [选择Simhash](#选择Simhash)
## 比较两个字符串相似度
```go
strsim.Compare("中国人", "")
Expand Down Expand Up @@ -86,3 +90,9 @@ strsim.Compare("abc", "ab", strsim.Hamming())
strsim.Compare("abc", "ab", strsim.Cosine())
```

### 选择Simhash

```go
strsim.Compare("abc", "ab", strsim.Simhash())
```

13 changes: 8 additions & 5 deletions cosine_conf.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,15 @@ import "github.com/antlabs/strsim/similarity"
// CosineConf is a configuration struct for Cosine similarity.

func Cosine() OptionFunc {

return OptionFunc(func(o *option) {
h := &similarity.Cosine{}
o.base64 = true
o.cmp = h.CompareUtf8
if o.ascii {
o.cmp = h.CompareAscii
if o.cmp == nil {
l := similarity.Cosine{}
o.base64 = true
o.cmp = l.CompareUtf8
if o.ascii {
o.cmp = l.CompareAscii
}
}
})

Expand Down
2 changes: 1 addition & 1 deletion jaro_winkler_conf.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ func JaroWinkler(matchWindow ...int) OptionFunc {
if len(matchWindow) > 0 {
mw = matchWindow[0]
}
d := &similarity.Jaro{MatchWindow: mw}
d := &similarity.JaroWinkler{MatchWindow: mw}
o.cmp = d.CompareUtf8
})
}
2 changes: 2 additions & 0 deletions prev_modify_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ func Test_ModifyString(t *testing.T) {

o.ignore |= ignoreCase
o.ignore |= ignoreSpace
o.base64 = true

for _, v := range []testCase{
{
test: "hello world",
Expand Down
17 changes: 17 additions & 0 deletions simhash_conf.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
package strsim

import "github.com/antlabs/strsim/similarity"

// Simhash returns an option that selects the SimHash comparison.
//
// The option does nothing when a comparison function is already set.
// Otherwise it turns on the base64 flag and installs the CompareUtf8 method
// of similarity.Simhash, or CompareAscii when the ascii flag is set.
func Simhash() OptionFunc {
	return OptionFunc(func(o *option) {
		// An earlier option already chose the comparison; leave it alone.
		if o.cmp != nil {
			return
		}

		var sh similarity.Simhash
		o.base64 = true
		if o.ascii {
			o.cmp = sh.CompareAscii
			return
		}
		o.cmp = sh.CompareUtf8
	})
}
9 changes: 6 additions & 3 deletions similarity/Cosine.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,13 @@ func (c Cosine) CompareAscii(s1, s2 string) float64 {
func (c Cosine) CompareUtf8(utf8Str1, utf8Str2 string) float64 {
l1 := utf8.RuneCountInString(utf8Str1)
l2 := utf8.RuneCountInString(utf8Str2)
dirts1 := make(map[string]int, l1)
dirts2 := make(map[string]int, l2)
//l1 := len(utf8Str1)
//l2 := len(utf8Str2)
l3 := utf8.RuneCountInString(base64Table)
dirts1 := make(map[string]int, l3)
dirts2 := make(map[string]int, l3)
// 将base64Table转化成[]string
base64 := StrToStrs(base64Table, utf8.RuneCountInString(base64Table))
base64 := StrToStrs(base64Table, l3)
// 遍历base64对dirts1和dirts2进行初始化
for _, v := range base64 {
dirts1[v] = 0
Expand Down
114 changes: 114 additions & 0 deletions similarity/simhash.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
package similarity

import (
"hash/crc32"
"strconv"
"unicode/utf8"
)

// Simhash is a stateless comparer; its CompareAscii and CompareUtf8 methods
// score two strings by comparing fingerprints built from hashed tokens.
type Simhash struct {
}

// CompareAscii returns the similarity of s1 and s2. It has no ASCII-specific
// path: it forwards both arguments unchanged to CompareUtf8.
func (s Simhash) CompareAscii(s1, s2 string) float64 {
return s.CompareUtf8(s1, s2)

}
// CompareUtf8 scores two strings by comparing SimHash-style fingerprints.
//
// Each input is cut into tokens by StrToStrs4, using the string's rune count
// as the length argument. Token frequencies are counted, every distinct token
// is hashed and weighted by hashcodeAndAdd, the weighted vectors are summed by
// merge and reduced to 0/1 values by Dimensionality. The two fingerprints are
// rendered as strings with IntsToStr and the value returned is whatever
// Hamming.CompareUtf8 reports for that pair.
func (s Simhash) CompareUtf8(utf8Str1, utf8Str2 string) float64 {
	// Tokenise both inputs.
	tokens1 := StrToStrs4(utf8Str1, utf8.RuneCountInString(utf8Str1))
	tokens2 := StrToStrs4(utf8Str2, utf8.RuneCountInString(utf8Str2))

	// Count how often each token occurs; a missing key reads as zero, so a
	// plain increment covers both the first and the repeated occurrence.
	freq1 := make(map[string]int)
	for _, tok := range tokens1 {
		freq1[tok]++
	}
	freq2 := make(map[string]int)
	for _, tok := range tokens2 {
		freq2[tok]++
	}

	// Build one fingerprint string per input.
	fp1 := IntsToStr(Dimensionality(merge(hashcodeAndAdd(freq1))))
	fp2 := IntsToStr(Dimensionality(merge(hashcodeAndAdd(freq2))))

	// Compare the two fingerprints with the Hamming comparer.
	hm := Hamming{}
	return hm.CompareUtf8(fp1, fp2)
}

// Dimensionality reduces a vector to 0/1 values in place: every element
// greater than zero becomes 1 and every other element becomes 0. The same
// slice that was passed in is returned.
func Dimensionality(ins []int) []int {
	for idx, v := range ins {
		bit := 0
		if v > 0 {
			bit = 1
		}
		ins[idx] = bit
	}
	return ins
}

// merge sums a list of vectors element by element and returns the totals.
//
// The result is as long as the longest row; shorter rows contribute nothing
// to the positions they lack. An empty (or nil) input yields an empty,
// non-nil slice instead of panicking, which is what happens when the caller
// had no tokens to hash (for example an empty string).
func merge(ins [][]int) []int {
	width := 0
	for _, row := range ins {
		if len(row) > width {
			width = len(row)
		}
	}

	res := make([]int, width)
	for _, row := range ins {
		for j, v := range row {
			res[j] += v
		}
	}
	return res
}

// hashcodeAndAdd turns a token-frequency table into one weighted vector per
// distinct token.
//
// Each key is hashed with CRC-32 (IEEE); the checksum is formatted in base 2
// and expanded into an int slice by Int32StrToInts. The token's count then
// selects a weight from 1 to 5 by comparing it against multiples of
// ((len(counts)-1)*4)/5, and Add applies that weight to the vector. Rows are
// stored in map-iteration order, which Go leaves unspecified.
func hashcodeAndAdd(counts map[string]int) [][]int {
	rows := make([][]int, len(counts))

	// All arithmetic here is on ints, so the division truncates; the
	// thresholds are step, step*2, step*3 and step*4.
	step := (len(counts) - 1) * 4 / 5

	idx := 0
	for token, count := range counts {
		checksum := crc32.ChecksumIEEE([]byte(token))
		bits := Int32StrToInts(strconv.FormatUint(uint64(checksum), 2))

		// Pick the weight level for this token's frequency.
		weight := 5
		switch {
		case count <= step:
			weight = 1
		case count <= step*2:
			weight = 2
		case count <= step*3:
			weight = 3
		case count <= step*4:
			weight = 4
		}

		rows[idx] = Add(bits, weight)
		idx++
	}

	return rows
}
60 changes: 58 additions & 2 deletions similarity/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package similarity
import (
"encoding/base64"
"reflect"
"strconv"
"unsafe"
)

Expand Down Expand Up @@ -41,8 +42,63 @@ func Base64Encode(s string) string {
// StrToStrs 字符串转化字符数组
func StrToStrs(s string, lenth int) []string {
base := make([]string, lenth)
for i := 0; i < len(s); i++ {
base = append(base, string(s[i]))
for i := 0; i < lenth; i++ {
base[i] = string(s[i])
}
return base
}

// StrToStrs4 splits the first lenth bytes of s into consecutive 4-byte
// chunks.
//
// When lenth is not a multiple of 4 the final chunk holds the remaining 1-3
// bytes (previously this case indexed past the end of both the result and the
// string and panicked). lenth is clamped to len(s); a lenth of zero or less
// yields an empty, non-nil slice. Chunking is by byte offset, so callers
// should pass single-byte (e.g. base64) text if chunks must not split a
// multi-byte character.
func StrToStrs4(s string, lenth int) []string {
	if lenth > len(s) {
		lenth = len(s)
	}
	if lenth <= 0 {
		return []string{}
	}

	// Round up so a trailing partial chunk has a slot.
	chunks := make([]string, 0, (lenth+3)/4)
	for start := 0; start < lenth; start += 4 {
		end := start + 4
		if end > lenth {
			end = lenth
		}
		chunks = append(chunks, s[start:end])
	}
	return chunks
}

// Add applies a signed weight to a bit vector.
//
// For each of the first 32 positions, an element equal to 1 becomes +weight
// and any other element becomes -weight. If bits holds fewer than 32
// elements, +weight is appended until it holds 32. Elements beyond index 31
// are left untouched.
//
// The elements of bits are overwritten in place; use the returned slice,
// because appending may move the data to a new backing array.
func Add(bits []int, weight int) []int {
	// Only the elements present on entry are rewritten; the rest is padding.
	n := len(bits)
	for i := 0; i < 32; i++ {
		switch {
		case i >= n:
			bits = append(bits, weight)
		case bits[i] == 1:
			bits[i] = weight
		default:
			bits[i] = -weight
		}
	}
	return bits
}

// Int32StrToInts expands a string of '0'/'1' characters into a 32-element
// int slice.
//
// Position i of the result is 1 when byte i of ins is '1' and 0 otherwise
// (including for any byte that is not '0' or '1'). Digits are placed from
// index 0 onwards, so inputs shorter than 32 leave the trailing positions at
// 0. Only the first 32 bytes are read; longer input is truncated rather than
// indexing past the end of the result.
func Int32StrToInts(ins string) []int {
	const width = 32
	uints := make([]int, width)

	n := len(ins)
	if n > width {
		n = width
	}
	for i := 0; i < n; i++ {
		if ins[i] == '1' {
			uints[i] = 1
		}
	}
	return uints
}

// IntsToStr concatenates the base-10 text of every element of ins, with no
// separator, and returns the result. A nil or empty slice yields "".
func IntsToStr(ins []int) string {
	// Append into one buffer instead of growing a string with +=, which
	// copies the accumulated text on every iteration.
	buf := make([]byte, 0, len(ins))
	for _, v := range ins {
		buf = strconv.AppendInt(buf, int64(v), 10)
	}
	return string(buf)
}
9 changes: 9 additions & 0 deletions strsim_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@ func Test_Compare_Special(t *testing.T) {
Jaro(),
DiceCoefficient(1),
Hamming(),
Simhash(),
Cosine(),
JaroWinkler(),
} {
sim := Compare(v.arg1, v.arg2, o)
assert.Equal(t, v.sim, sim)
Expand All @@ -39,6 +42,9 @@ func Test_FindBestMatchOne(t *testing.T) {
DiceCoefficient(1),
Jaro(),
Default(),
Simhash(),
Cosine(),
JaroWinkler(),
} {
m := FindBestMatchOne(d.key, d.best, o)
assert.Equal(t, m.S, d.need)
Expand All @@ -54,6 +60,9 @@ func Test_FindBestMatch(t *testing.T) {
DiceCoefficient(1),
Jaro(),
Default(),
Simhash(),
Cosine(),
JaroWinkler(),
} {
m := FindBestMatch(d.key, d.best, o)
assert.Equal(t, m.Match.S, d.need)
Expand Down

0 comments on commit 3977214

Please sign in to comment.