Skip to content

Commit

Permalink
Generalized similarity calculation for minhasher.
Browse files Browse the repository at this point in the history
  • Loading branch information
ibenian committed Feb 25, 2018
1 parent a14f453 commit 026ea21
Show file tree
Hide file tree
Showing 2 changed files with 85 additions and 0 deletions.
26 changes: 26 additions & 0 deletions algebird-core/src/main/scala/com/twitter/algebird/MinHasher.scala
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,16 @@ abstract class MinHasher[H](val numHashes: Int, val numBands: Int)(implicit n: N
buildArray(left.bytes, right.bytes){ (l, r) => if (l == r) n.one else n.zero }
.map{ _.toDouble }.sum / numHashes

/**
 * Generalized Jaccard similarity estimate for any number of sets
 * (size of the common intersection / size of the overall union).
 *
 * The estimate is the fraction of hash positions at which every signature
 * holds the same minimum hash value:
 * Jsim(S1..Sn) ~= count(hmin1 == hmin2 == ... == hminn) / numHashes
 *
 * @param sigs the signatures to compare
 * @return the estimated similarity
 */
def similarityMulti(sigs: MinHashSignature*): Double = {
  val agreement = buildArrayMulti(sigs.map(_.bytes)) { hashes =>
    // A position counts as a match only when every signature agrees with the first one.
    val allEqual = hashes.headOption.forall(first => hashes.forall(_ == first))
    if (allEqual) n.one else n.zero
  }
  agreement.map(_.toDouble).sum / numHashes
}

/** Bucket keys to use for quickly finding other similar items via locality sensitive hashing */
def buckets(sig: MinHashSignature): List[Long] =
sig.bytes.grouped(numRows * hashSize)
Expand Down Expand Up @@ -130,6 +140,12 @@ abstract class MinHasher[H](val numHashes: Int, val numBands: Int)(implicit n: N

/**
 * Decode two signatures into hash values, combine them somehow, and produce a new array.
 *
 * @param left  raw bytes of the first signature
 * @param right raw bytes of the second signature
 * @param fn    combines one hash value from `left` with one from `right` into a single hash value
 * @return the combined values as a byte array
 */
protected def buildArray(left: Array[Byte], right: Array[Byte])(fn: (H, H) => H): Array[Byte]

/**
 * Decode multiple signatures into hash values and combine them using the given fn.
 * Called by similarityMulti, which passes the bytes of every signature being compared.
 *
 * @param buffers raw bytes of each signature
 * @param fn      combines the hash values decoded from the signatures (one value per
 *                signature, in the order of `buffers`) into a single hash value
 * @return the combined values as a byte array
 */
protected def buildArrayMulti(buffers: Seq[Array[Byte]])(fn: Seq[H] => H): Array[Byte]
}

class MinHasher32(numHashes: Int, numBands: Int) extends MinHasher[Int](numHashes, numBands) {
Expand All @@ -155,6 +171,11 @@ class MinHasher32(numHashes: Int, numBands: Int) extends MinHasher[Int](numHashe
buildArray{ fn(leftBuffer.get, rightBuffer.get) }
}

/**
 * Int-based implementation: views every signature as a buffer of Ints and
 * feeds the next Int of each signature to `fn`.
 */
protected def buildArrayMulti(buffers: Seq[Array[Byte]])(fn: Seq[Int] => Int): Array[Byte] = {
  val readers = for (bytes <- buffers) yield ByteBuffer.wrap(bytes).asIntBuffer
  // Kept as a def so each evaluation advances every reader by one Int.
  // NOTE(review): relies on the single-argument buildArray re-evaluating its argument - confirm.
  def nextCombined: Int = fn(readers.map(reader => reader.get).toVector)
  buildArray(nextCombined)
}

/** Seems to work, but experimental and not generic yet */
def approxCount(sig: Array[Byte]) = {
val buffer = ByteBuffer.wrap(sig).asIntBuffer
Expand Down Expand Up @@ -185,4 +206,9 @@ class MinHasher16(numHashes: Int, numBands: Int) extends MinHasher[Char](numHash
val rightBuffer = ByteBuffer.wrap(right).asCharBuffer
buildArray{ fn(leftBuffer.get, rightBuffer.get) }
}

/**
 * Char-based implementation: views every signature as a buffer of Chars and
 * feeds the next Char of each signature to `fn`.
 */
protected def buildArrayMulti(buffers: Seq[Array[Byte]])(fn: Seq[Char] => Char): Array[Byte] = {
  val readers = for (bytes <- buffers) yield ByteBuffer.wrap(bytes).asCharBuffer
  // Kept as a def so each evaluation advances every reader by one Char.
  // NOTE(review): relies on the single-argument buildArray re-evaluating its argument - confirm.
  def nextCombined: Char = fn(readers.map(reader => reader.get).toVector)
  buildArray(nextCombined)
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,16 @@ class MinHasherSpec extends WordSpec with Matchers {
assert(error < epsilon)
}

/**
 * Checks that the multi-set similarity estimated by `mh` is within `epsilon`
 * of the exact similarity of freshly generated random sets.
 *
 * @param mh         the hasher under test
 * @param sets       number of sets to generate
 * @param samples    sample budget passed to randomMultiSets
 * @param similarity target similarity of the generated sets
 * @param epsilon    maximum tolerated absolute error
 */
def testMulti[H](mh: MinHasher[H], sets: Int, samples: Int, similarity: Double, epsilon: Double) = {
  val randomSets = randomMultiSets(sets, samples, similarity)

  val exact = exactSimilarityMulti(randomSets)
  val sim = approxSimilarityMulti(mh, randomSets)
  val error: Double = math.abs(exact - sim)
  // Report the numbers before asserting so they are still available when the check fails.
  info(s"sets: $sets, exact: $exact, sim: $sim, error: $error, epsilon: $epsilon")
  assert(error < epsilon)
}

def randomSets(similarity: Double) = {
val s = 10000
val uniqueFraction = if (similarity == 1.0) 0.0 else (1 - similarity) / (1 + similarity)
Expand All @@ -42,16 +52,39 @@ class MinHasherSpec extends WordSpec with Matchers {
(unique1 ++ shared, unique2 ++ shared)
}

/**
 * Generates `sets` random sets of doubles that all contain one common pool of
 * `similarity * samples` elements; the remaining sample budget is split evenly
 * between the sets as elements generated separately for each set.
 */
def randomMultiSets(sets: Int, samples: Int, similarity: Double) = {
  val sharedCount = (similarity * samples).toInt
  val uniquePerSet = ((samples - similarity * samples) / sets).toInt

  val shared = Seq.fill(sharedCount)(math.random).toSet
  (1 to sets).map { _ =>
    Seq.fill(uniquePerSet)(math.random).toSet ++ shared
  }
}

/** Exact Jaccard similarity of two sets: size of the intersection over size of the union. */
def exactSimilarity[T](x: Set[T], y: Set[T]) = {
  val intersectionSize = x.intersect(y).size
  val unionSize = x.union(y).size
  intersectionSize.toDouble / unionSize
}

/**
 * Exact generalized Jaccard similarity: size of the intersection of all sets
 * over the size of their union. Requires at least one set.
 */
def exactSimilarityMulti[T](sets: Seq[Set[T]]) = {
  val common = sets.reduce((a, b) => a.intersect(b))
  val all = sets.reduce((a, b) => a.union(b))
  common.size.toDouble / all.size.toDouble
}

/**
 * Estimates the similarity of `x` and `y` with `mh`: each set is folded into a
 * single signature (one `init` per element's string form, merged with `plus`)
 * and the two signatures are compared.
 */
def approxSimilarity[T, H](mh: MinHasher[H], x: Set[T], y: Set[T]) = {
  def signatureOf(items: Set[T]) =
    items.map(item => mh.init(item.toString)).reduce(mh.plus(_, _))

  mh.similarity(signatureOf(x), signatureOf(y))
}

/**
 * Estimates the generalized similarity of all `sets` with `mh`: each set is
 * folded into a single signature (one `init` per element's string form, merged
 * with `plus`) and the signatures are handed to `similarityMulti`.
 */
def approxSimilarityMulti[T, H](mh: MinHasher[H], sets: Seq[Set[T]]) = {
  val sigs = sets.map { members =>
    members.map(member => mh.init(member.toString)).reduce(mh.plus(_, _))
  }
  mh.similarityMulti(sigs: _*)
}

"MinHasher32" should {
"measure 0.5 similarity in 1024 bytes with < 0.1 error" in {
test(new MinHasher32(0.5, 1024), 0.5, 0.1)
Expand All @@ -63,4 +96,30 @@ class MinHasherSpec extends WordSpec with Matchers {
test(new MinHasher32(1.0, 1024), 1.0, 0.01)
}
}

"MinHasher32 multiset similarity with sets = 2" should {
  // Repeats the pairwise MinHasher32 cases through the multiset implementation (sets = 2).
  // Each pair is (target similarity, allowed error).
  val cases = Seq((0.5, 0.1), (0.8, 0.1), (1.0, 0.01))
  cases.foreach { case (target, tolerance) =>
    s"measure $target multiset similarity in 1024 bytes with < $tolerance error" in {
      testMulti(new MinHasher32(target, 1024), sets = 2, samples = 10000, similarity = target, epsilon = tolerance)
    }
  }
}

"MinHasher32 multiset similarity with sets = 10" should {
  // Multiset similarity across ten sets.
  // Each pair is (target similarity, allowed error).
  val cases = Seq((0.5, 0.1), (0.8, 0.1), (1.0, 0.01))
  cases.foreach { case (target, tolerance) =>
    s"measure $target multiset similarity in 1024 bytes with < $tolerance error" in {
      testMulti(new MinHasher32(target, 1024), sets = 10, samples = 10000, similarity = target, epsilon = tolerance)
    }
  }
}
}

0 comments on commit 026ea21

Please sign in to comment.