🎇 增加simhash算法以及修复cosine算法的一些bug (#4)

* 🎇 增加simhash算法以及修复cosine算法的一些bug
* 🎇 代码覆盖率：simhash在文本长度比较大时覆盖率会大很多。
antlabs · May 29, 2022 · 3977214 · 3977214
1 parent bc95bc9
commit 3977214
Show file tree

Hide file tree

Showing 9 changed files with 228 additions and 14 deletions.
diff --git a/README.md b/README.md
@@ -26,9 +26,10 @@ go get -u github.com/antlabs/strsim
     * 莱文斯坦-编辑距离(Levenshtein)
     * Hamming
     * Dice's coefficient
-    * Jaro
-    *  JaroWinkler 
-    * Cosine similarity algorithm
+    * Jaro 
+    * JaroWinkler 
+    * Cosine 
+    * Simhash
 
 ## 内容
 - [比较两个字符串相识度](#比较两个字符串相识度)
@@ -39,6 +40,9 @@ go get -u github.com/antlabs/strsim
     - [选择Dice's coefficient](#选择Dice's-coefficient)
     - [选择jaro](#选择jaro)
     - [选择Hamming](#选择Hamming)
+    - [选择JaroWinkler](#选择JaroWinkler)
+    - [选择Cosine](#选择Cosine)
+    - [选择Simhash](#选择Simhash)
 ## 比较两个字符串相识度
 ```go
 strsim.Compare("中国人", "中")
@@ -86,3 +90,9 @@ strsim.Compare("abc", "ab", strsim.Hamming())
 strsim.Compare("abc", "ab", strsim.Cosine())
 ```
 
+### 选择Simhash
+
+```go
+strsim.Compare("abc", "ab", strsim.Simhash())
+```
+
diff --git a/cosine_conf.go b/cosine_conf.go
@@ -5,12 +5,15 @@ import "github.com/antlabs/strsim/similarity"
 // CosineConf is a configuration struct for Cosine similarity.
 
 func Cosine() OptionFunc {
+
 	return OptionFunc(func(o *option) {
-		h := &similarity.Cosine{}
-		o.base64 = true
-		o.cmp = h.CompareUtf8
-		if o.ascii {
-			o.cmp = h.CompareAscii
+		if o.cmp == nil {
+			l := similarity.Cosine{}
+			o.base64 = true
+			o.cmp = l.CompareUtf8
+			if o.ascii {
+				o.cmp = l.CompareAscii
+			}
 		}
 	})
 

diff --git a/jaro_winkler_conf.go b/jaro_winkler_conf.go
@@ -9,7 +9,7 @@ func JaroWinkler(matchWindow ...int) OptionFunc {
 		if len(matchWindow) > 0 {
 			mw = matchWindow[0]
 		}
-		d := &similarity.Jaro{MatchWindow: mw}
+		d := &similarity.JaroWinkler{MatchWindow: mw}
 		o.cmp = d.CompareUtf8
 	})
 }
diff --git a/prev_modify_test.go b/prev_modify_test.go
@@ -20,6 +20,8 @@ func Test_ModifyString(t *testing.T) {
 
 	o.ignore |= ignoreCase
 	o.ignore |= ignoreSpace
+	o.base64 = true
+
 	for _, v := range []testCase{
 		{
 			test: "hello world",

diff --git a/simhash_conf.go b/simhash_conf.go
@@ -0,0 +1,17 @@
+package strsim
+
+import "github.com/antlabs/strsim/similarity"
+
+// Simhash returns an option that selects the Simhash comparator from the
+// similarity package.
+//
+// The option only takes effect when no comparator has been chosen yet
+// (o.cmp is nil). In that case it sets o.base64 and installs CompareUtf8, or
+// CompareAscii when o.ascii is already set.
+func Simhash() OptionFunc {
+	return OptionFunc(func(o *option) {
+		// Leave an already-selected comparator untouched.
+		if o.cmp != nil {
+			return
+		}
+
+		sh := similarity.Simhash{}
+		o.base64 = true
+		if o.ascii {
+			o.cmp = sh.CompareAscii
+			return
+		}
+		o.cmp = sh.CompareUtf8
+	})
+}
diff --git a/similarity/Cosine.go b/similarity/Cosine.go
@@ -16,10 +16,13 @@ func (c Cosine) CompareAscii(s1, s2 string) float64 {
 func (c Cosine) CompareUtf8(utf8Str1, utf8Str2 string) float64 {
 	l1 := utf8.RuneCountInString(utf8Str1)
 	l2 := utf8.RuneCountInString(utf8Str2)
-	dirts1 := make(map[string]int, l1)
-	dirts2 := make(map[string]int, l2)
+	//l1 := len(utf8Str1)
+	//l2 := len(utf8Str2)
+	l3 := utf8.RuneCountInString(base64Table)
+	dirts1 := make(map[string]int, l3)
+	dirts2 := make(map[string]int, l3)
 	// 将base64Table转化成[]string
-	base64 := StrToStrs(base64Table, utf8.RuneCountInString(base64Table))
+	base64 := StrToStrs(base64Table, l3)
 	// 遍历base64对dirts1和dirts2进行初始化
 	for _, v := range base64 {
 		dirts1[v] = 0

diff --git a/similarity/simhash.go b/similarity/simhash.go
@@ -0,0 +1,114 @@
+package similarity
+
+import (
+	"hash/crc32"
+	"strconv"
+	"unicode/utf8"
+)
+
+// Simhash compares two strings by fingerprints built from their 4-byte
+// tokens. It carries no configuration.
+type Simhash struct{}
+
+// CompareAscii scores the similarity of s1 and s2. There is no separate ASCII
+// path: the call is forwarded to CompareUtf8 unchanged.
+func (s Simhash) CompareAscii(s1, s2 string) float64 {
+	score := s.CompareUtf8(s1, s2)
+	return score
+}
+// CompareUtf8 scores the similarity of utf8Str1 and utf8Str2.
+//
+// Each input is cut into 4-byte tokens, the tokens are counted, every distinct
+// token is hashed and weighted, and the weighted hashes are merged and reduced
+// to a string of '0'/'1' digits. The result is whatever Hamming.CompareUtf8
+// returns for the two digit strings.
+//
+// An input that yields no tokens has no hashes to merge, so such a pair is
+// scored directly: 1 when the two inputs are equal, 0 otherwise.
+func (s Simhash) CompareUtf8(utf8Str1, utf8Str2 string) float64 {
+	counts1 := simhashTokenCounts(utf8Str1)
+	counts2 := simhashTokenCounts(utf8Str2)
+	if len(counts1) == 0 || len(counts2) == 0 {
+		if utf8Str1 == utf8Str2 {
+			return 1
+		}
+		return 0
+	}
+
+	h1 := IntsToStr(Dimensionality(merge(hashcodeAndAdd(counts1))))
+	h2 := IntsToStr(Dimensionality(merge(hashcodeAndAdd(counts2))))
+
+	// The score is the Hamming comparison of the two fingerprints.
+	hamming := Hamming{}
+	return hamming.CompareUtf8(h1, h2)
+}
+
+// simhashTokenCounts splits str with StrToStrs4 and reports how many times each
+// token occurs. The rune count of str is handed to StrToStrs4 as the length.
+func simhashTokenCounts(str string) map[string]int {
+	counts := make(map[string]int)
+	for _, token := range StrToStrs4(str, utf8.RuneCountInString(str)) {
+		// A missing key reads as zero, so a plain increment covers the first hit.
+		counts[token]++
+	}
+	return counts
+}
+
+// Dimensionality reduces a vector of sums to bits in place: every strictly
+// positive entry becomes 1 and every other entry becomes 0. The same slice is
+// returned.
+func Dimensionality(ins []int) []int {
+	for idx, sum := range ins {
+		bit := 0
+		if sum > 0 {
+			bit = 1
+		}
+		ins[idx] = bit
+	}
+	return ins
+}
+
+// merge sums the rows of ins column by column and returns the totals.
+//
+// The result has the length of the first row; every row is expected to be no
+// longer than that. An empty ins has no rows to sum and yields nil instead of
+// panicking on the lookup of the first row.
+func merge(ins [][]int) []int {
+	if len(ins) == 0 {
+		return nil
+	}
+	res := make([]int, len(ins[0]))
+	for _, row := range ins {
+		for j, v := range row {
+			res[j] += v
+		}
+	}
+	return res
+}
+
+// hashcodeAndAdd turns every distinct token in counts into a weighted hash
+// vector and returns one vector per token.
+//
+// A token is hashed with CRC-32 (IEEE), the checksum is written out in binary,
+// converted with Int32StrToInts and weighted with Add. The weight runs from 1
+// to 5 and grows with the token's count; the four thresholds are multiples of
+// (len(counts)-1)*4/5.
+//
+// NOTE(review): the threshold step is computed with integer division, so it is
+// truncated (it is 0 for one or two distinct tokens, so every token with a
+// positive count then gets weight 5) — confirm this is intended.
+//
+// Map iteration order is unspecified, so the order of the returned vectors
+// varies between calls.
+func hashcodeAndAdd(counts map[string]int) [][]int {
+	step := (len(counts) - 1) * 4 / 5
+	vectors := make([][]int, 0, len(counts))
+	for token, count := range counts {
+		sum := crc32.ChecksumIEEE([]byte(token))
+		bits := Int32StrToInts(strconv.FormatUint(uint64(sum), 2))
+
+		// Pick the first band whose upper bound covers the count.
+		weight := 5
+		switch {
+		case count <= step:
+			weight = 1
+		case count <= step*2:
+			weight = 2
+		case count <= step*3:
+			weight = 3
+		case count <= step*4:
+			weight = 4
+		}
+		vectors = append(vectors, Add(bits, weight))
+	}
+	return vectors
+}
diff --git a/similarity/utils.go b/similarity/utils.go
@@ -3,6 +3,7 @@ package similarity
 import (
 	"encoding/base64"
 	"reflect"
+	"strconv"
 	"unsafe"
 )
 
@@ -41,8 +42,63 @@ func Base64Encode(s string) string {
 // StrToStrs 字符串转化字符数组
 func StrToStrs(s string, lenth int) []string {
 	base := make([]string, lenth)
-	for i := 0; i < len(s); i++ {
-		base = append(base, string(s[i]))
+	for i := 0; i < lenth; i++ {
+		base[i] = string(s[i])
 	}
 	return base
 }
+
+// StrToStrs4 cuts the first lenth bytes of s into consecutive 4-byte chunks.
+//
+// lenth is used as a byte count and is clamped to len(s), so a value larger
+// than the string no longer slices out of range. When the usable length is not
+// a multiple of 4 the final chunk is simply shorter. A non-positive length
+// yields an empty, non-nil slice.
+//
+// NOTE(review): callers in this package pass a rune count; for multi-byte
+// text that is smaller than the byte length, so the tail of s is not covered —
+// confirm the inputs are single-byte (e.g. base64) text.
+func StrToStrs4(s string, lenth int) []string {
+	const size = 4
+	if lenth > len(s) {
+		lenth = len(s)
+	}
+	if lenth <= 0 {
+		return []string{}
+	}
+	base := make([]string, 0, (lenth+size-1)/size)
+	for i := 0; i < lenth; i += size {
+		end := i + size
+		if end > lenth {
+			end = lenth
+		}
+		base = append(base, s[i:end])
+	}
+	return base
+}
+
+// Add applies weight to the first 32 positions of bits and returns the slice.
+//
+// Within those positions a 1 becomes +weight and any other value becomes
+// -weight; the update happens in place. Positions that bits does not cover, up
+// to index 31, are appended with +weight, so the result has at least 32
+// elements. Elements past index 31 are left as they are.
+func Add(bits []int, weight int) []int {
+	const width = 32
+	for i := 0; i < width; i++ {
+		switch {
+		case i >= len(bits):
+			// Input shorter than the hash width: pad with the weight itself.
+			bits = append(bits, weight)
+		case bits[i] == 1:
+			bits[i] = weight
+		default:
+			bits[i] = -weight
+		}
+	}
+	return bits
+}
+
+// Int32StrToInts converts a string of binary digits into a 32-element slice of
+// ints: position i holds 1 when ins[i] is '1' and 0 otherwise.
+//
+// Digits are copied from the left, so an input shorter than 32 characters
+// leaves the trailing positions at 0. Characters past the 32nd are ignored
+// rather than indexing out of range.
+func Int32StrToInts(ins string) []int {
+	const width = 32
+	uints := make([]int, width)
+	for i := 0; i < len(ins) && i < width; i++ {
+		if ins[i] == '1' {
+			uints[i] = 1
+		}
+	}
+	return uints
+}
+
+// IntsToStr concatenates the decimal representation of every element of ins,
+// with no separator, and returns the result.
+func IntsToStr(ins []int) string {
+	// Append into one buffer instead of re-allocating the string per element.
+	buf := make([]byte, 0, len(ins))
+	for _, v := range ins {
+		buf = strconv.AppendInt(buf, int64(v), 10)
+	}
+	return string(buf)
+}
diff --git a/strsim_test.go b/strsim_test.go
@@ -18,6 +18,9 @@ func Test_Compare_Special(t *testing.T) {
 			Jaro(),
 			DiceCoefficient(1),
 			Hamming(),
+			Simhash(),
+			Cosine(),
+			JaroWinkler(),
 		} {
 			sim := Compare(v.arg1, v.arg2, o)
 			assert.Equal(t, v.sim, sim)
@@ -39,6 +42,9 @@ func Test_FindBestMatchOne(t *testing.T) {
 			DiceCoefficient(1),
 			Jaro(),
 			Default(),
+			Simhash(),
+			Cosine(),
+			JaroWinkler(),
 		} {
 			m := FindBestMatchOne(d.key, d.best, o)
 			assert.Equal(t, m.S, d.need)
@@ -54,6 +60,9 @@ func Test_FindBestMatch(t *testing.T) {
 			DiceCoefficient(1),
 			Jaro(),
 			Default(),
+			Simhash(),
+			Cosine(),
+			JaroWinkler(),
 		} {
 			m := FindBestMatch(d.key, d.best, o)
 			assert.Equal(t, m.Match.S, d.need)