Skip to content

Commit

Permalink
v1.0.0
Browse files Browse the repository at this point in the history
  • Loading branch information
martinellimarco committed Dec 15, 2020
1 parent 2b451af commit 45bc211
Show file tree
Hide file tree
Showing 7 changed files with 615 additions and 0 deletions.
47 changes: 47 additions & 0 deletions .github/workflows/deploy.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# Deploy workflow: when a GitHub release is created, build t2sz in Release
# mode, package it as a .deb with CPack and attach the package to that release.
name: Deploy

on:
  release:
    types: [created]

env:
  BUILD_TYPE: Release

jobs:
  build:
    runs-on: ubuntu-20.04

    steps:
      - uses: actions/checkout@v2

      - name: Install Dependencies
        shell: bash
        run: |
          sudo apt update
          sudo apt install libzstd-dev -y

      # Out-of-source build directory, next to the checkout.
      - name: Create Build Environment
        run: cmake -E make_directory ${{runner.workspace}}/build

      - name: Configure CMake
        shell: bash
        working-directory: ${{runner.workspace}}/build
        run: cmake $GITHUB_WORKSPACE -DCMAKE_BUILD_TYPE=$BUILD_TYPE

      - name: Build
        working-directory: ${{runner.workspace}}/build
        shell: bash
        run: cmake --build . --config $BUILD_TYPE

      - name: Create DEB Package
        id: create-deb-package
        working-directory: ${{runner.workspace}}/build
        shell: bash
        run: cpack

      # NOTE(review): this step relies on the `hub` CLI being available on the
      # runner image - confirm for the image in use.
      - name: Release DEB
        shell: bash
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          BUILD_DIR: ${{runner.workspace}}/build
        run: |
          # Remove only the "refs/tags/" prefix so that tag names containing
          # "/" are not truncated to their last path component.
          tag_name="${GITHUB_REF#refs/tags/}"

          # Collect the packages with a glob instead of parsing `ls`, so paths
          # with spaces survive and "no package" is detected explicitly.
          shopt -s nullglob
          debs=("$BUILD_DIR"/*.deb)
          if [ "${#debs[@]}" -eq 0 ]; then
            echo "No .deb package found in $BUILD_DIR" >&2
            exit 1
          fi

          # hub expects one -a flag per attached file.
          attach_args=()
          for deb in "${debs[@]}"; do
            attach_args+=(-a "$deb")
          done

          hub release edit "${attach_args[@]}" -m "" "$tag_name"
38 changes: 38 additions & 0 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# Test Build workflow: on every push, configure, build and package t2sz to
# make sure the project still compiles and CPack can produce the .deb.
name: Test Build

on: push

env:
  BUILD_TYPE: Release

jobs:
  build:
    runs-on: ubuntu-20.04

    steps:
      - uses: actions/checkout@v2

      # libzstd-dev is the only build dependency installed here.
      - name: Install Dependencies
        shell: bash
        run: |
          sudo apt update
          sudo apt install libzstd-dev -y

      # Out-of-source build directory, next to the checkout.
      - name: Create Build Environment
        run: cmake -E make_directory ${{runner.workspace}}/build

      - name: Configure CMake
        working-directory: ${{runner.workspace}}/build
        shell: bash
        run: cmake $GITHUB_WORKSPACE -DCMAKE_BUILD_TYPE=$BUILD_TYPE

      - name: Build
        working-directory: ${{runner.workspace}}/build
        shell: bash
        run: cmake --build . --config $BUILD_TYPE

      # Runs CPack in the build directory using the project's CPack settings.
      - name: Create DEB Package
        id: create-deb-package
        working-directory: ${{runner.workspace}}/build
        shell: bash
        run: cpack
38 changes: 38 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# Build script for t2sz: one C executable linked against zstd and libm,
# plus CPack settings to produce a Debian package.
cmake_minimum_required(VERSION 3.16)
project(t2sz VERSION 1.0.0 LANGUAGES C)

# Provides CMAKE_INSTALL_BINDIR ("bin" by default), resolved against
# CMAKE_INSTALL_PREFIX at install time.
include(GNUInstallDirs)

add_executable(t2sz src/t2sz.c)

# Compile as C99. Compiler extensions keep their CMake default.
set_target_properties(t2sz PROPERTIES
  C_STANDARD 99
  C_STANDARD_REQUIRED ON
)

# Expose the project version to the source as the string macro VERSION.
# Target-scoped so it does not leak to any other target in this directory.
target_compile_definitions(t2sz PRIVATE "VERSION=\"${PROJECT_VERSION}\"")

target_link_libraries(t2sz PRIVATE zstd m)

# Strip the binary after linking, but only for the Release configuration.
# The configuration is tested with a generator expression so this also works
# with multi-config generators, where CMAKE_BUILD_TYPE is empty. For any other
# configuration the step runs the no-op `cmake -E true`, so the command line
# is never empty. Skipped entirely when no strip tool was found.
if(CMAKE_STRIP)
  add_custom_command(
    TARGET t2sz
    POST_BUILD
    COMMAND
      "$<IF:$<CONFIG:Release>,${CMAKE_STRIP};$<TARGET_FILE:t2sz>,${CMAKE_COMMAND};-E;true>"
    COMMAND_EXPAND_LISTS
    VERBATIM
  )
endif()

# Relative destination: honours CMAKE_INSTALL_PREFIX for `make install`.
# The CPack DEB generator packages under the /usr prefix by default, so the
# .deb is expected to still ship the binary as /usr/bin/t2sz.
install(TARGETS t2sz RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}")

# --- Packaging -------------------------------------------------------------

# Only the DEB generator is enabled. Settings for other generators are kept
# below, disabled, for reference.
#set(CPACK_SET_DESTDIR ON)
#set(CPACK_GENERATOR "DEB;TGZ;RPM")
set(CPACK_GENERATOR "DEB")

set(CPACK_DEBIAN_FILE_NAME DEB-DEFAULT)
set(CPACK_DEBIAN_PACKAGE_MAINTAINER "Marco Martinelli <[email protected]>")
# Let CPack derive the package's shared-library dependencies automatically.
set(CPACK_DEBIAN_PACKAGE_SHLIBDEPS ON)
set(CPACK_DEBIAN_PACKAGE_DESCRIPTION "tar 2 seekable zstd.\nIt will compress a tar archive with Zstandard keeping each file in a different frame, for fast seeking.\nThe compressed archive can be uncompressed with any Zstandard tool, including zstd.")
set(CPACK_DEBIAN_PACKAGE_HOMEPAGE "https://github.com/martinellimarco/t2sz")

#set(CPACK_RPM_PACKAGE_AUTOREQ ON)
#set(CPACK_RPM_PACKAGE_DESCRIPTION "tar 2 seekable zstd.\nIt will compress a tar archive with Zstandard keeping each file in a different frame, for fast seeking.\nThe compressed archive can be uncompressed with any Zstandard tool, including zstd.")
#set(CPACK_RPM_PACKAGE_URL "https://github.com/martinellimarco/t2sz")
#set(CPACK_RPM_PACKAGE_LICENSE "GPLv3")

set(CPACK_PACKAGE_NAME "${PROJECT_NAME}")
set(CPACK_PACKAGE_CONTACT "Marco Martinelli <[email protected]>")
set(CPACK_PACKAGE_VERSION_MAJOR "${PROJECT_VERSION_MAJOR}")
set(CPACK_PACKAGE_VERSION_MINOR "${PROJECT_VERSION_MINOR}")
set(CPACK_PACKAGE_VERSION_PATCH "${PROJECT_VERSION_PATCH}")

# Must come after all CPACK_* variables are set.
include(CPack)
116 changes: 116 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,2 +1,118 @@
# t2sz
Compress .tar archives to seekable .tar.zst

It will compress a tar archive with [Zstandard](https://github.com/facebook/zstd) keeping each file in a different frame, unless `-s` is used.

This allows fast seeking and extraction of a single file without decompressing the whole archive.

When `-s SIZE` is used and a file is added, if the size of the file is less than `SIZE` then another one will be added in the same block, and so on until the sum of the sizes of all files packed together is at least `SIZE`.

A file will never be truncated. `SIZE` is only a minimum quantity.

A single block of one or more files is compressed into a single Zstandard frame. If the files in the same block are correlated, the compression ratio will be higher.

The compressed archive can be uncompressed with any Zstandard tool, including `zstd`.

To take advantage of seeking see the following projects:
- C/C++ library: [libzstd-seek](https://github.com/martinellimarco/libzstd-seek)
- Python library: [indexed_zstd](https://github.com/martinellimarco/indexed_zstd)
- FUSE mount: [ratarmount](https://github.com/mxmlnkn/ratarmount)

# Build

You'll need `libzstd-dev`

```bash
sudo apt install libzstd-dev
```

```bash
git clone https://github.com/martinellimarco/t2sz
mkdir t2sz/build
cd t2sz/build
cmake .. -DCMAKE_BUILD_TYPE="Release"
make
```

Install with

```bash
sudo make install
```

Or if you want a debian package you can run

```bash
cpack
```

then install it with

```bash
sudo dpkg -i t2sz*.deb
```

# Usage

```commandline
Usage: t2sz [OPTIONS...] [TAR ARCHIVE]
Examples:
t2sz archive.tar Compress archive.tar to archive.tar.zst
t2sz archive.tar -o output.tar.zst Compress archive.tar to output.tar.zst
t2sz archive.tar -o /dev/stdout Compress archive.tar to standard output
Options:
-l [1..22] Set compression level, from 1 (lower) to 22 (highest). Default is 22.
-o FILENAME Output file name.
-s SIZE Minimum size of an input block, in bytes.
A block is composed by one or more whole files. A file is never truncated.
If not specified one block will contain exactly one file, no matter the file size.
Each block is compressed to a zstd frame but if the archive has a lot of small files
having a file per block doesn't compress very well. With this you can set a trade off.
The greater is SIZE the smaller will be the archive at the expense of the seek speed.
SIZE may be followed by the following multiplicative suffixes:
k/K/KiB = 1024
M/MiB = 1024*1024
kB/KB = 1000
MB = 1000*1000
-v Verbose. List the elements in the tar archive and their size.
-f Overwrite output without prompting.
-h Print this help.
-V Print the version.
```

# About -s and -l

One may wonder what the best choices are for the minimum block size `-s` and the compression level `-l`.

The real answer is that it depends on the kind of data you are working with. In short, do your own math and feel free to report your results.

If you are working with big files (hundreds of MiB) then you will not have many benefits in terms of seeking time if you use `-s`, but you should increase your compression level to get smaller archives.

On the other hand, if you have thousands of small files (a few MiB or less) that usually compress well and are correlated, you may take advantage of both `-s` and `-l`.

What follows is a test I made with a dataset of ~100,000 binary files, each less than 4 MiB in size. The exact numbers are not important.

The first table shows the compression ratio of each combination of `-s` (min block size) and `-l` (level).

Intuitively at `-s 1K -l 1` the resulting archive is 57.38% of the size of the uncompressed .tar archive.

At the same time `-s 256M -l 22` gives the best results in terms of compression ratio, with a generated archive that is only 33.47% of the original.

Of course, seeking in a block of 256M is not very fast. A safer choice in this particular case is something around `-s 32M`.

The second table shows the time it took to compress each archive, divided by the minimum time.

The fastest choice is `-s 512K -l 1`, while `-s 256M -l 22` is the slowest, taking 69.48 times longer.


![compression ratio](doc/compression-ratio.png)

![speed ratio](doc/speed-ratio.png)


# License

See LICENSE
Binary file added doc/compression-ratio.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added doc/speed-ratio.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading

0 comments on commit 45bc211

Please sign in to comment.