Generalized similarity calculation for minhasher.

twitter · Feb 25, 2018 · 026ea21 · 026ea21
1 parent a14f453
commit 026ea21
Show file tree

Hide file tree

Showing 2 changed files with 85 additions and 0 deletions.
diff --git a/algebird-core/src/main/scala/com/twitter/algebird/MinHasher.scala b/algebird-core/src/main/scala/com/twitter/algebird/MinHasher.scala
@@ -88,6 +88,16 @@ abstract class MinHasher[H](val numHashes: Int, val numBands: Int)(implicit n: N
     buildArray(left.bytes, right.bytes){ (l, r) => if (l == r) n.one else n.zero }
       .map{ _.toDouble }.sum / numHashes
 
+  /**
+   * Estimates the generalized Jaccard similarity of several sets (size of intersection / size of union).
+   * Jsim(S1..Sn) ~= #{hash positions where hmin1 == hmin2 == ... == hminn} / numHashes
+   */
+  def similarityMulti(sigs: MinHashSignature*): Double = {
+    val allEqual: Seq[H] => H = hs => if (hs.forall(_ == hs.head)) n.one else n.zero
+    val matches = buildArrayMulti(sigs.map(_.bytes))(allEqual)
+    matches.map(_.toDouble).sum / numHashes
+  }
+
   /** Bucket keys to use for quickly finding other similar items via locality sensitive hashing */
   def buckets(sig: MinHashSignature): List[Long] =
     sig.bytes.grouped(numRows * hashSize)
@@ -130,6 +140,12 @@ abstract class MinHasher[H](val numHashes: Int, val numBands: Int)(implicit n: N
 
   /** Decode two signatures into hash values, combine them somehow, and produce a new array */
   protected def buildArray(left: Array[Byte], right: Array[Byte])(fn: (H, H) => H): Array[Byte]
+
+  /**
+   * Decode any number of signatures into hash values, combine them using the given fn, and produce
+   * a new array. N-ary counterpart of buildArray; similarityMulti is implemented on top of it.
+   */
+  protected def buildArrayMulti(buffers: Seq[Array[Byte]])(fn: Seq[H] => H): Array[Byte]
 }
 
 class MinHasher32(numHashes: Int, numBands: Int) extends MinHasher[Int](numHashes, numBands) {
@@ -155,6 +171,11 @@ class MinHasher32(numHashes: Int, numBands: Int) extends MinHasher[Int](numHashe
     buildArray{ fn(leftBuffer.get, rightBuffer.get) }
   }
 
+  protected def buildArrayMulti(buffers: Seq[Array[Byte]])(fn: Seq[Int] => Int): Array[Byte] = {
+    val cursors = buffers.map(b => ByteBuffer.wrap(b).asIntBuffer).toVector // strict: a lazy Seq would re-wrap per read
+    buildArray(fn(cursors.map(_.get))) // relative get: each evaluation takes the next hash of every signature
+  }
+
   /** Seems to work, but experimental and not generic yet */
   def approxCount(sig: Array[Byte]) = {
     val buffer = ByteBuffer.wrap(sig).asIntBuffer
@@ -185,4 +206,9 @@ class MinHasher16(numHashes: Int, numBands: Int) extends MinHasher[Char](numHash
     val rightBuffer = ByteBuffer.wrap(right).asCharBuffer
     buildArray{ fn(leftBuffer.get, rightBuffer.get) }
   }
+
+  protected def buildArrayMulti(buffers: Seq[Array[Byte]])(fn: Seq[Char] => Char): Array[Byte] = {
+    val charBuffers = buffers.map(b => ByteBuffer.wrap(b).asCharBuffer).toVector // strict: a lazy Seq would re-wrap per read
+    buildArray(fn(charBuffers.map(_.get))) // relative get: each evaluation takes the next hash of every signature
+  }
 }
diff --git a/algebird-test/src/test/scala/com/twitter/algebird/MinHasherTest.scala b/algebird-test/src/test/scala/com/twitter/algebird/MinHasherTest.scala
@@ -31,6 +31,16 @@ class MinHasherSpec extends WordSpec with Matchers {
     assert(error < epsilon)
   }
 
+  /** Asserts that mh's estimate for randomly generated sets lies within epsilon of their exact similarity. */
+  def testMulti[H](mh: MinHasher[H], sets: Int, samples: Int, similarity: Double, epsilon: Double) = {
+    val randomSets = randomMultiSets(sets, samples, similarity)
+    val exact = exactSimilarityMulti(randomSets)
+    val sim = approxSimilarityMulti(mh, randomSets)
+    val error: Double = math.abs(exact - sim)
+    info(s"sets: $sets, exact: $exact, sim: $sim, error: $error, epsilon: $epsilon") // log first: kept on failure too
+    assert(error < epsilon)
+  }
+
   def randomSets(similarity: Double) = {
     val s = 10000
     val uniqueFraction = if (similarity == 1.0) 0.0 else (1 - similarity) / (1 + similarity)
@@ -42,16 +52,39 @@ class MinHasherSpec extends WordSpec with Matchers {
     (unique1 ++ shared, unique2 ++ shared)
   }
 
+  /** Builds `sets` random sets that all contain one shared core plus their own unique elements. */
+  def randomMultiSets(sets: Int, samples: Int, similarity: Double) = {
+    def draw(count: Int): Set[Double] = Iterator.fill(count)(math.random).toSet
+    val sharedCount = similarity * samples
+    val uniquePerSet = (samples - sharedCount) / sets
+
+    // The shared core is drawn once; the remaining samples are split evenly between the sets.
+    val core = draw(sharedCount.toInt)
+    (1 to sets).map(_ => draw(uniquePerSet.toInt) ++ core)
+  }
+
   def exactSimilarity[T](x: Set[T], y: Set[T]) = {
     (x & y).size.toDouble / (x ++ y).size
   }
 
+  def exactSimilarityMulti[T](sets: Seq[Set[T]]) = { // exact value: |intersection of all| / |union of all|
+    val (common, all) = (sets.reduce(_ intersect _), sets.reduce(_ union _))
+    common.size.toDouble / all.size
+  }
+
   def approxSimilarity[T, H](mh: MinHasher[H], x: Set[T], y: Set[T]) = {
     val sig1 = x.map{ l => mh.init(l.toString) }.reduce{ (a, b) => mh.plus(a, b) }
     val sig2 = y.map{ l => mh.init(l.toString) }.reduce{ (a, b) => mh.plus(a, b) }
     mh.similarity(sig1, sig2)
   }
 
+  /** Reduces each set to a single signature via mh.init / mh.plus, then compares all signatures at once. */
+  def approxSimilarityMulti[T, H](mh: MinHasher[H], sets: Seq[Set[T]]) = {
+    def signatureOf(elems: Set[T]) =
+      elems.map(e => mh.init(e.toString)).reduce(mh.plus(_, _))
+
+    mh.similarityMulti(sets.map(signatureOf): _*)
+  }
+
   "MinHasher32" should {
     "measure 0.5 similarity in 1024 bytes with < 0.1 error" in {
       test(new MinHasher32(0.5, 1024), 0.5, 0.1)
@@ -63,4 +96,30 @@ class MinHasherSpec extends WordSpec with Matchers {
       test(new MinHasher32(1.0, 1024), 1.0, 0.01)
     }
   }
+
+  "MinHasher32 multiset similarity with sets = 2" should {
+    // Mirrors the pairwise MinHasher32 tests above, but goes through similarityMulti with 2 sets.
+    "measure 0.5 multiset similarity in 1024 bytes with < 0.1 error" in {
+      testMulti(new MinHasher32(0.5, 1024), sets = 2, samples = 10000, similarity = 0.5, epsilon = 0.1)
+    }
+    "measure 0.8 multiset similarity in 1024 bytes with < 0.1 error" in {
+      testMulti(new MinHasher32(0.8, 1024), sets = 2, samples = 10000, similarity = 0.8, epsilon = 0.1)
+    }
+    "measure 1.0 multiset similarity in 1024 bytes with < 0.01 error" in {
+      testMulti(new MinHasher32(1.0, 1024), sets = 2, samples = 10000, similarity = 1.0, epsilon = 0.01)
+    }
+  }
+
+  "MinHasher32 multiset similarity with sets = 10" should {
+    // Ten sets compared at once: exercises similarityMulti beyond the two-signature case.
+    "measure 0.5 multiset similarity in 1024 bytes with < 0.1 error" in {
+      testMulti(new MinHasher32(0.5, 1024), sets = 10, samples = 10000, similarity = 0.5, epsilon = 0.1)
+    }
+    "measure 0.8 multiset similarity in 1024 bytes with < 0.1 error" in {
+      testMulti(new MinHasher32(0.8, 1024), sets = 10, samples = 10000, similarity = 0.8, epsilon = 0.1)
+    }
+    "measure 1.0 multiset similarity in 1024 bytes with < 0.01 error" in {
+      testMulti(new MinHasher32(1.0, 1024), sets = 10, samples = 10000, similarity = 1.0, epsilon = 0.01)
+    }
+  }
 }