Skip to content

Commit

Permalink
v0.5.3 (#85)
Browse files Browse the repository at this point in the history
* additional enhancements; update changelog; version bump
  • Loading branch information
barrust authored Dec 29, 2021
1 parent 9ccbaa2 commit 27ae06d
Show file tree
Hide file tree
Showing 12 changed files with 150 additions and 72 deletions.
10 changes: 9 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,11 +1,19 @@
# PyProbables Changelog

### Version 0.5.3
* Additional type hinting
* Improved format parsing and serialization; [see PR#81](https://github.com/barrust/pyprobables/pull/81). Thanks [@KOLANICH](https://github.com/KOLANICH)
* Bloom Filters
* Added `export_to_hex` functionality for Bloom Filters on Disk
* Export as C header (**\*.h**) for Bloom Filters on Disk and Counting Bloom Filters
* Added support for more input types for exporting and loading of saved files


### Version 0.5.2
* Add ability to hash bytes along with strings
* Make all tests files individually executable from the CLI. Thanks [@KOLANICH](https://github.com/KOLANICH)
* Added type hints


### Version 0.5.1
* Bloom Filter:
* Export as a C header (**\*.h**)
Expand Down
1 change: 1 addition & 0 deletions docs/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
sphinx>=3.0
15 changes: 9 additions & 6 deletions docs/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,9 @@
"sphinx.ext.todo",
]

# Show typehints in the description instead of the signature
autodoc_typehints = "description"

# Add any paths that contain templates here, relative to this directory.
templates_path = ["_templates"]

Expand All @@ -55,8 +58,8 @@
master_doc = "index"

# General information about the project.
project = u"probables"
copyright = u"2017, Tyler Barrus"
project = "probables"
copyright = "2017, Tyler Barrus"
author = probables.__author__

# The version info for the project you're documenting, acts as replacement for
Expand Down Expand Up @@ -137,8 +140,8 @@
(
master_doc,
"pyprobables.tex",
u"pyprobables Documentation",
u"Tyler Barrus",
"pyprobables Documentation",
"Tyler Barrus",
"manual",
),
]
Expand All @@ -148,7 +151,7 @@

# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [(master_doc, "pyprobables", u"pyprobables Documentation", [author], 1)]
man_pages = [(master_doc, "pyprobables", "pyprobables Documentation", [author], 1)]


# -- Options for Texinfo output -------------------------------------------
Expand All @@ -160,7 +163,7 @@
(
master_doc,
"pyprobables",
u"pyprobables Documentation",
"pyprobables Documentation",
author,
"pyprobables",
"One line description of project.",
Expand Down
2 changes: 1 addition & 1 deletion probables/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
__maintainer__ = "Tyler Barrus"
__email__ = "[email protected]"
__license__ = "MIT"
__version__ = "0.5.2"
__version__ = "0.5.3"
__credits__ = [] # type: ignore
__url__ = "https://github.com/barrust/pyprobables"
__bugtrack_url__ = "https://github.com/barrust/pyprobables/issues"
Expand Down
17 changes: 10 additions & 7 deletions probables/blooms/basebloom.py
Original file line number Diff line number Diff line change
Expand Up @@ -226,7 +226,10 @@ def _set_optimized_params(
HEADER_STRUCT_BE = Struct(">" + HEADER_STRUCT_FORMAT)

def __load(
self, blm_type: str, file: typing.Union[Path, str, IOBase], hash_function: typing.Optional[HashFuncT] = None
self,
blm_type: str,
file: typing.Union[Path, str, IOBase, mmap],
hash_function: typing.Optional[HashFuncT] = None,
) -> None:
"""load the Bloom Filter from file"""
# read in the needed information, and then call _set_optimized_params
Expand Down Expand Up @@ -290,7 +293,7 @@ def export_hex(self) -> str:
self.false_positive_rate,
)
if self.__blm_type in ["regular", "reg-ondisk"]:
bytes_string = hexlify(bytearray(self.bloom)) + hexlify(mybytes)
bytes_string = hexlify(bytearray(self.bloom[: self.bloom_length])) + hexlify(mybytes)
else:
bytes_string = b""
for val in self.bloom:
Expand Down Expand Up @@ -331,13 +334,13 @@ def export_c_header(self, filename: str) -> None:
Args:
filename (str): The filename to which the Bloom Filter will \
be written. """
trailer = self.__class__.HEADER_STRUCT_BE.pack(
self.estimated_elements,
self.elements_added,
self.false_positive_rate,
data = (
" " + line
for line in wrap(", ".join(("0x{:02x}".format(e) for e in bytearray.fromhex(self.export_hex()))), 80)
)
data = (" " + line for line in wrap(", ".join(("0x{:02x}".format(e) for e in chain(self.bloom, trailer))), 80))
bloom_type = "standard BloomFilter" if self.__blm_type in ("regular", "reg-ondisk") else "CountingBloomFilter"
with open(filename, "w") as file:
print("/* BloomFilter Export of a {} */".format(bloom_type), file=file)
print("#include <inttypes.h>", file=file)
print("const uint64_t estimated_elements = ", self.estimated_elements, ";", sep="", file=file)
print("const uint64_t elements_added = ", self.elements_added, ";", sep="", file=file)
Expand Down
25 changes: 9 additions & 16 deletions probables/blooms/bloom.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import mmap
import os
import typing
from pathlib import Path
from shutil import copyfile
from struct import calcsize, pack, unpack

Expand Down Expand Up @@ -251,7 +252,7 @@ class BloomFilterOnDisk(BaseBloom):

def __init__(
self,
filepath: str,
filepath: typing.Union[str, Path],
est_elements: typing.Optional[int] = None,
false_positive_rate: typing.Optional[float] = None,
hex_string: typing.Optional[str] = None,
Expand All @@ -271,7 +272,7 @@ def __init__(
pass

self.__file_pointer = None
self.__filename = None
self.__filename = Path(filepath)
self.__export_offset = calcsize("Qf")
self._on_disk = True

Expand Down Expand Up @@ -315,7 +316,7 @@ def close(self) -> None:
self.__file_pointer.close()
self.__file_pointer = None

def __load(self, filepath: str, hash_function: typing.Optional[HashFuncT] = None):
def __load(self, filepath: typing.Union[str, Path], hash_function: typing.Optional[HashFuncT] = None):
"""load the Bloom Filter on disk"""
# read the file, set the optimal params
# mmap everything
Expand All @@ -333,9 +334,9 @@ def __load(self, filepath: str, hash_function: typing.Optional[HashFuncT] = None
self.__file_pointer = open(filepath, "r+b") # type: ignore
self._bloom = mmap.mmap(self.__file_pointer.fileno(), 0) # type: ignore
self._on_disk = True
self.__filename = filepath # type: ignore
self.__filename = Path(filepath)

def export(self, filename: str) -> None: # type: ignore
def export(self, filename: typing.Union[str, Path]) -> None: # type: ignore
""" Export to disk if a different location
Args:
Expand All @@ -344,9 +345,10 @@ def export(self, filename: str) -> None: # type: ignore
Note:
Only exported if the filename is not the original filename """
self.__update()
if filename != self.__filename:
filename = Path(filename)
if filename.name != self.__filename.name:
# setup the new bloom filter
copyfile(self.__filename, filename)
copyfile(self.__filename.name, filename.name)
# otherwise, nothing to do!

def add_alt(self, hashes: HashResultsT) -> None:
Expand Down Expand Up @@ -425,15 +427,6 @@ def jaccard_index(self, second: SimpleBloomT) -> typing.Optional[float]:
return None
return _tmp_jaccard_index(self, second)

def export_hex(self) -> str:
""" Export to a hex string
Raises:
NotSupportedError: This functionality is currently not \
supported """
msg = "`export_hex` is currently not supported by the on disk Bloom Filter"
raise NotSupportedError(msg)

def _load_hex(self, hex_string: str, hash_function: typing.Optional[HashFuncT] = None):
"""load from hex ..."""
msg = "Loading from hex_string is currently not supported by the on disk Bloom Filter"
Expand Down
2 changes: 1 addition & 1 deletion probables/countminsketch/countminsketch.py
Original file line number Diff line number Diff line change
Expand Up @@ -381,7 +381,7 @@ def join(self, second: "CountMinSketch") -> None:
elif self.elements_added < INT64_T_MIN:
self.__elements_added = INT64_T_MIN

def __load(self, file: typing.Union[Path, str, IOBase]):
def __load(self, file: typing.Union[Path, str, IOBase, mmap]):
"""load the count-min sketch from file"""
if not isinstance(file, (IOBase, mmap)):
file = Path(file)
Expand Down
19 changes: 12 additions & 7 deletions probables/cuckoo/countingcuckoo.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@

from ..exceptions import CuckooFilterFullError
from ..hashes import KeyT, SimpleHashT
from ..utilities import MMap
from .cuckoo import CuckooFilter


Expand Down Expand Up @@ -215,25 +216,29 @@ def _check_if_present(self, idx_1: int, idx_2: int, fingerprint: int) -> typing.
return idx_2
return None

def _load(self, filename: typing.Union[Path, str]) -> None:
def _load(self, file: typing.Union[Path, str, IOBase, mmap]) -> None:
"""load a cuckoo filter from file"""
with open(filename, "rb") as filepointer:
if not isinstance(file, (IOBase, mmap)):
file = Path(file)
with MMap(file) as filepointer:
self._load(filepointer)
else:
offset = calcsize("II")
int_size = calcsize("II")
filepointer.seek(offset * -1, os.SEEK_END)
list_size = filepointer.tell()
mybytes = unpack("II", filepointer.read(offset))
file.seek(offset * -1, os.SEEK_END)
list_size = file.tell()
mybytes = unpack("II", file.read(offset))
self._bucket_size = mybytes[0]
self.__max_cuckoo_swaps = mybytes[1]
self._cuckoo_capacity = list_size // int_size // self.bucket_size
self._inserted_elements = 0
# now pull everything in!
filepointer.seek(0, os.SEEK_SET)
file.seek(0, os.SEEK_SET)
self._buckets = list()
for i in range(self.capacity):
self.buckets.append(list())
for _ in range(self.bucket_size):
finger, count = unpack("II", filepointer.read(int_size))
finger, count = unpack("II", file.read(int_size))
if finger > 0:
ccb = CountingCuckooBin(finger, count)
self.buckets[i].append(ccb)
Expand Down
13 changes: 8 additions & 5 deletions probables/cuckoo/cuckoo.py
Original file line number Diff line number Diff line change
Expand Up @@ -319,14 +319,17 @@ def _insert_fingerprint(self, fingerprint, idx_1, idx_2):
# if we got here we have an error... we might need to know what is left
return fingerprint

def _load(self, filename: typing.Union[Path, str]) -> None:
def _load(self, file: typing.Union[Path, str, IOBase, mmap]) -> None:
"""load a cuckoo filter from file"""
filename = Path(filename)
with MMap(filename) as d:
self._parse_footer(d)
if not isinstance(file, (IOBase, mmap)):
file = Path(file)
with MMap(file) as filepointer:
self._load(filepointer)
else:
self._parse_footer(file) # type: ignore
self._inserted_elements = 0
# now pull everything in!
self._parse_buckets(d)
self._parse_buckets(file) # type: ignore

SINGLE_INT_C = "I"
SINGLE_INT_SIZE = calcsize(SINGLE_INT_C)
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "pyprobables"
version = "0.5.2"
version = "0.5.3"
description = "Probabilistic data structures in Python"
authors = ["Tyler Barrus <[email protected]>"]
license = "MIT"
Expand Down
74 changes: 47 additions & 27 deletions tests/bloom_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -306,17 +306,18 @@ def test_bf_export_c_header(self):
data = fobj.readlines()
data = [x.strip() for x in data]

self.assertEqual("#include <inttypes.h>", data[0])
self.assertEqual("const uint64_t estimated_elements = {};".format(blm.estimated_elements), data[1])
self.assertEqual("const uint64_t elements_added = {};".format(blm.elements_added), data[2])
self.assertEqual("const float false_positive_rate = {};".format(blm.false_positive_rate), data[3])
self.assertEqual("const uint64_t number_bits = {};".format(blm.number_bits), data[4])
self.assertEqual("const unsigned int number_hashes = {};".format(blm.number_hashes), data[5])
self.assertEqual("const unsigned char bloom[] = {", data[6])
self.assertEqual("/* BloomFilter Export of a standard BloomFilter */", data[0])
self.assertEqual("#include <inttypes.h>", data[1])
self.assertEqual("const uint64_t estimated_elements = {};".format(blm.estimated_elements), data[2])
self.assertEqual("const uint64_t elements_added = {};".format(blm.elements_added), data[3])
self.assertEqual("const float false_positive_rate = {};".format(blm.false_positive_rate), data[4])
self.assertEqual("const uint64_t number_bits = {};".format(blm.number_bits), data[5])
self.assertEqual("const unsigned int number_hashes = {};".format(blm.number_hashes), data[6])
self.assertEqual("const unsigned char bloom[] = {", data[7])
self.assertEqual("};", data[-1])

# rebuild the hex version!
new_hex = "".join([x.strip().replace("0x", "") for x in " ".join(data[7:-1]).split(",")])
new_hex = "".join([x.strip().replace("0x", "") for x in " ".join(data[8:-1]).split(",")])
self.assertEqual(hex_val, new_hex)

def test_bf_load_invalid_hex(self):
Expand Down Expand Up @@ -722,26 +723,14 @@ def test_bfod_bytes(self):

def test_bfod_export_hex(self):
"""test that page error is thrown correctly"""

def runner():
"""runner"""
blm = BloomFilterOnDisk(fobj.name, 10, 0.05)
blm.export_hex()

with NamedTemporaryFile(dir=os.getcwd(), suffix=".blm", delete=DELETE_TEMP_FILES) as fobj:
self.assertRaises(NotSupportedError, runner)

def test_bfod_export_hex_msg(self):
"""test that page error is thrown correctly"""
hex_val = "6da491461a6bba4d000000000000000a000000000000000a3d4ccccd"
with NamedTemporaryFile(dir=os.getcwd(), suffix=".blm", delete=DELETE_TEMP_FILES) as fobj:
blm = BloomFilterOnDisk(fobj.name, 10, 0.05)
try:
blm.export_hex()
except NotSupportedError as ex:
msg = "`export_hex` is currently not supported by the on disk Bloom Filter"
self.assertEqual(str(ex), msg)
else:
self.assertEqual(True, False)
blm = BloomFilterOnDisk(fobj.name, est_elements=10, false_positive_rate=0.05)
for i in range(0, 10):
tmp = "this is a test {0}".format(i)
blm.add(tmp)
hex_out = blm.export_hex()
self.assertEqual(hex_out, hex_val)

def test_bfod_load_hex(self):
"""test that page error is thrown correctly"""
Expand All @@ -764,6 +753,37 @@ def test_bfod_load_hex_msg(self):
else:
self.assertEqual(True, False)

def test_bfod_export_c_header(self):
"""test exporting a c header"""
hex_val = "6da491461a6bba4d000000000000000a000000000000000a3d4ccccd"
with NamedTemporaryFile(dir=os.getcwd(), suffix=".blm", delete=DELETE_TEMP_FILES) as fobj:
blm = BloomFilterOnDisk(fobj.name, est_elements=10, false_positive_rate=0.05)
for i in range(0, 10):
tmp = "this is a test {0}".format(i)
blm.add(tmp)
with NamedTemporaryFile(dir=os.getcwd(), suffix=".blm", delete=DELETE_TEMP_FILES) as fobj:
blm.export_c_header(fobj.name)

# now load the file, parse it and do some tests!
with open(fobj.name, "r") as fobj:
data = fobj.readlines()

data = [x.strip() for x in data]

self.assertEqual("/* BloomFilter Export of a standard BloomFilter */", data[0])
self.assertEqual("#include <inttypes.h>", data[1])
self.assertEqual("const uint64_t estimated_elements = {};".format(blm.estimated_elements), data[2])
self.assertEqual("const uint64_t elements_added = {};".format(blm.elements_added), data[3])
self.assertEqual("const float false_positive_rate = {};".format(blm.false_positive_rate), data[4])
self.assertEqual("const uint64_t number_bits = {};".format(blm.number_bits), data[5])
self.assertEqual("const unsigned int number_hashes = {};".format(blm.number_hashes), data[6])
self.assertEqual("const unsigned char bloom[] = {", data[7])
self.assertEqual("};", data[-1])

# rebuild the hex version!
new_hex = "".join([x.strip().replace("0x", "") for x in " ".join(data[8:-1]).split(",")])
self.assertEqual(hex_val, new_hex)

def test_bfod_clear(self):
"""test clearing out the bloom filter on disk"""
with NamedTemporaryFile(dir=os.getcwd(), suffix=".blm", delete=DELETE_TEMP_FILES) as fobj:
Expand Down
Loading

0 comments on commit 27ae06d

Please sign in to comment.