From 026ea21908eef8e2ff7aa688cc7cbe276fc12cc3 Mon Sep 17 00:00:00 2001 From: ibenian Date: Sun, 25 Feb 2018 14:16:48 -0500 Subject: [PATCH] Generalized similarity calculation for minhasher. --- .../com/twitter/algebird/MinHasher.scala | 26 ++++++++ .../com/twitter/algebird/MinHasherTest.scala | 59 +++++++++++++++++++ 2 files changed, 85 insertions(+) diff --git a/algebird-core/src/main/scala/com/twitter/algebird/MinHasher.scala b/algebird-core/src/main/scala/com/twitter/algebird/MinHasher.scala index e3a9d4164..9f0ada5f9 100644 --- a/algebird-core/src/main/scala/com/twitter/algebird/MinHasher.scala +++ b/algebird-core/src/main/scala/com/twitter/algebird/MinHasher.scala @@ -88,6 +88,16 @@ abstract class MinHasher[H](val numHashes: Int, val numBands: Int)(implicit n: N buildArray(left.bytes, right.bytes){ (l, r) => if (l == r) n.one else n.zero } .map{ _.toDouble }.sum / numHashes + /** + * Generalized Jaccard similarity estimation for multiple sets (size of intersection / size of union). + * Jsim(S1..Sn) = P(hmin1 == hmin2 == ... == hminn), estimated as (number of hash positions where all signatures agree) / numHashes + */ + def similarityMulti(sigs: MinHashSignature*): Double = { + buildArrayMulti(sigs.map(_.bytes))(vals => if (vals.forall(_ == vals.head)) n.one else n.zero) + .map(_.toDouble) + .sum / numHashes + } + /** Bucket keys to use for quickly finding other similar items via locality sensitive hashing */ def buckets(sig: MinHashSignature): List[Long] = sig.bytes.grouped(numRows * hashSize) @@ -130,6 +140,12 @@ abstract class MinHasher[H](val numHashes: Int, val numBands: Int)(implicit n: N /** Decode two signatures into hash values, combine them somehow, and produce a new array */ protected def buildArray(left: Array[Byte], right: Array[Byte])(fn: (H, H) => H): Array[Byte] + + /** + * Decode multiple signatures into hash values and combine them using given fn. + * This version is used by the new similarityMulti. 
+ */ + protected def buildArrayMulti(buffers: Seq[Array[Byte]])(fn: Seq[H] => H): Array[Byte] } class MinHasher32(numHashes: Int, numBands: Int) extends MinHasher[Int](numHashes, numBands) { @@ -155,6 +171,11 @@ class MinHasher32(numHashes: Int, numBands: Int) extends MinHasher[Int](numHashe buildArray{ fn(leftBuffer.get, rightBuffer.get) } } + protected def buildArrayMulti(buffers: Seq[Array[Byte]])(fn: Seq[Int] => Int): Array[Byte] = { + val intBuffers = buffers.map(b => ByteBuffer.wrap(b).asIntBuffer) + buildArray(fn(intBuffers.map(_.get).toVector)) + } + /** Seems to work, but experimental and not generic yet */ def approxCount(sig: Array[Byte]) = { val buffer = ByteBuffer.wrap(sig).asIntBuffer @@ -185,4 +206,9 @@ class MinHasher16(numHashes: Int, numBands: Int) extends MinHasher[Char](numHash val rightBuffer = ByteBuffer.wrap(right).asCharBuffer buildArray{ fn(leftBuffer.get, rightBuffer.get) } } + + protected def buildArrayMulti(buffers: Seq[Array[Byte]])(fn: Seq[Char] => Char): Array[Byte] = { + val intBuffers = buffers.map(b => ByteBuffer.wrap(b).asCharBuffer) + buildArray(fn(intBuffers.map(_.get).toVector)) + } } diff --git a/algebird-test/src/test/scala/com/twitter/algebird/MinHasherTest.scala b/algebird-test/src/test/scala/com/twitter/algebird/MinHasherTest.scala index 417d6a8af..8a2d3fd04 100644 --- a/algebird-test/src/test/scala/com/twitter/algebird/MinHasherTest.scala +++ b/algebird-test/src/test/scala/com/twitter/algebird/MinHasherTest.scala @@ -31,6 +31,16 @@ class MinHasherSpec extends WordSpec with Matchers { assert(error < epsilon) } + def testMulti[H](mh: MinHasher[H], sets: Int, samples: Int, similarity: Double, epsilon: Double) = { + val randomSets = randomMultiSets(sets, samples, similarity) + + val exact = exactSimilarityMulti(randomSets) + val sim = approxSimilarityMulti(mh, randomSets) + val error: Double = math.abs(exact - sim) + assert(error < epsilon) + info(s"sets: $sets, exact: $exact, sim: $sim, error: $error, epsion: $epsilon") + } 
+ def randomSets(similarity: Double) = { val s = 10000 val uniqueFraction = if (similarity == 1.0) 0.0 else (1 - similarity) / (1 + similarity) @@ -42,16 +52,39 @@ class MinHasherSpec extends WordSpec with Matchers { (unique1 ++ shared, unique2 ++ shared) } + def randomMultiSets(sets: Int, samples: Int, similarity: Double) = { + val sharedSamples = similarity * samples + val uniqueSamples = samples - sharedSamples + + val shared = 1.to(sharedSamples.toInt).map{ i => math.random }.toSet + for { + i <- 1 to sets + unique = 1.to((uniqueSamples / sets).toInt).map{ i => math.random }.toSet + } yield unique ++ shared + } + def exactSimilarity[T](x: Set[T], y: Set[T]) = { (x & y).size.toDouble / (x ++ y).size } + def exactSimilarityMulti[T](sets: Seq[Set[T]]) = { + sets.reduce(_ & _).size.toDouble / sets.reduce(_ | _).size.toDouble + } + def approxSimilarity[T, H](mh: MinHasher[H], x: Set[T], y: Set[T]) = { val sig1 = x.map{ l => mh.init(l.toString) }.reduce{ (a, b) => mh.plus(a, b) } val sig2 = y.map{ l => mh.init(l.toString) }.reduce{ (a, b) => mh.plus(a, b) } mh.similarity(sig1, sig2) } + def approxSimilarityMulti[T, H](mh: MinHasher[H], sets: Seq[Set[T]]) = { + val sigs = for { + s <- sets + sig = s.map{ l => mh.init(l.toString) }.reduce{ (a, b) => mh.plus(a, b) } + } yield sig + mh.similarityMulti(sigs: _*) + } + "MinHasher32" should { "measure 0.5 similarity in 1024 bytes with < 0.1 error" in { test(new MinHasher32(0.5, 1024), 0.5, 0.1) @@ -63,4 +96,30 @@ class MinHasherSpec extends WordSpec with Matchers { test(new MinHasher32(1.0, 1024), 1.0, 0.01) } } + + "MinHasher32 multiset similarity with sets = 2" should { + // Repeating the above tests for multiset implementation (sets = 2) + "measure 0.5 multiset similarity in 1024 bytes with < 0.1 error" in { + testMulti(new MinHasher32(0.5, 1024), sets = 2, samples = 10000, similarity = 0.5, epsilon = 0.1) + } + "measure 0.8 multiset similarity in 1024 bytes with < 0.1 error" in { + testMulti(new MinHasher32(0.8, 1024), 
sets = 2, samples = 10000, similarity = 0.8, epsilon = 0.1) + } + "measure 1.0 multiset similarity in 1024 bytes with < 0.01 error" in { + testMulti(new MinHasher32(1.0, 1024), sets = 2, samples = 10000, similarity = 1.0, epsilon = 0.01) + } + } + + "MinHasher32 multiset similarity with sets = 10" should { + // New tests for multiset similarity + "measure 0.5 multiset similarity in 1024 bytes with < 0.1 error" in { + testMulti(new MinHasher32(0.5, 1024), sets = 10, samples = 10000, similarity = 0.5, epsilon = 0.1) + } + "measure 0.8 multiset similarity in 1024 bytes with < 0.1 error" in { + testMulti(new MinHasher32(0.8, 1024), sets = 10, samples = 10000, similarity = 0.8, epsilon = 0.1) + } + "measure 1.0 multiset similarity in 1024 bytes with < 0.01 error" in { + testMulti(new MinHasher32(1.0, 1024), sets = 10, samples = 10000, similarity = 1.0, epsilon = 0.01) + } + } }