Skip to content

Commit

Permalink
Compression ratio improvements.
Browse files Browse the repository at this point in the history
  • Loading branch information
IlyaGrebnov committed Nov 21, 2022
1 parent 4f61c07 commit 7e5666d
Show file tree
Hide file tree
Showing 7 changed files with 7,618 additions and 4,513 deletions.
3 changes: 3 additions & 0 deletions CHANGES
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
* 2022-11-20 : Version 0.4.0
* Compression ratio improvements.

* 2022-11-10 : Version 0.3.0
* Compression ratio improvements.

Expand Down
150 changes: 89 additions & 61 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ Copyright (c) 2021-2022 Ilya Grebnov <[email protected]>
The bsc-m03 is released under the [GNU General Public License](LICENSE "GNU General Public License")

## Changes
* 2022-11-20 : Version 0.4.0
* Compression ratio improvements.
* 2022-11-10 : Version 0.3.0
* Compression ratio improvements.
* 2022-01-08 : Version 0.2.1
Expand All @@ -31,89 +33,115 @@ The bsc-m03 is released under the [GNU General Public License](LICENSE "GNU Gene
### Calgary Corpus ###
| File name | Input size (bytes) | Output size (bytes) | Bits per symbol |
|:---------------:|:-----------:|:------------:|:-------:|
| bib | 111261 | 24656 | 1.773 |
| book1 | 768771 | 204395 | 2.127 |
| book2 | 610856 | 139566 | 1.828 |
| geo | 102400 | 52580 | 4.108 |
| news | 377109 | 106395 | 2.257 |
| obj1 | 21504 | 9795 | 3.644 |
| obj2 | 246814 | 68414 | 2.218 |
| paper1 | 53161 | 15048 | 2.265 |
| paper2 | 82199 | 22687 | 2.208 |
| pic | 513216 | 44620 | 0.696 |
| progc | 39611 | 11320 | 2.286 |
| progl | 71646 | 13610 | 1.520 |
| progp | 49379 | 9316 | 1.509 |
| trans | 93695 | 15446 | 1.319 |
| bib | 111261 | 24550 | 1.765 |
| book1 | 768771 | 203954 | 2.122 |
| book2 | 610856 | 139181 | 1.823 |
| geo | 102400 | 52482 | 4.100 |
| news | 377109 | 105915 | 2.247 |
| obj1 | 21504 | 9779 | 3.638 |
| obj2 | 246814 | 68229 | 2.212 |
| paper1 | 53161 | 15010 | 2.259 |
| paper2 | 82199 | 22641 | 2.204 |
| pic | 513216 | 44587 | 0.695 |
| progc | 39611 | 11303 | 2.283 |
| progl | 71646 | 13565 | 1.515 |
| progp | 49379 | 9291 | 1.505 |
| trans | 93695 | 15399 | 1.315 |

### Canterbury Corpus ###
| File name | Input size (bytes) | Output size (bytes) | Bits per symbol |
|:---------------:|:-----------:|:------------:|:-------:|
| alice29.txt | 152089 | 38667 | 2.034 |
| asyoulik.txt | 125179 | 36019 | 2.302 |
| cp.html | 24603 | 6915 | 2.249 |
| fields.c | 11150 | 2695 | 1.934 |
| alice29.txt | 152089 | 38622 | 2.032 |
| asyoulik.txt | 125179 | 35959 | 2.298 |
| cp.html | 24603 | 6883 | 2.238 |
| fields.c | 11150 | 2704 | 1.940 |
| grammar.lsp | 3721 | 1130 | 2.429 |
| kennedy.xls | 1029744 | 56568 | 0.439 |
| lcet10.txt | 426754 | 95240 | 1.785 |
| plrabn12.txt | 481861 | 130068 | 2.159 |
| ptt5 | 513216 | 44620 | 0.696 |
| sum | 38240 | 11479 | 2.401 |
| xargs.1 | 4227 | 1585 | 3.000 |
| kennedy.xls | 1029744 | 56697 | 0.440 |
| lcet10.txt | 426754 | 95012 | 1.781 |
| plrabn12.txt | 481861 | 129960 | 2.158 |
| ptt5 | 513216 | 44587 | 0.695 |
| sum | 38240 | 11453 | 2.396 |
| xargs.1 | 4227 | 1587 | 3.004 |

### Large Canterbury Corpus ###
| File name | Input size (bytes) | Output size (bytes) | Bits per symbol |
|:---------------:|:-----------:|:------------:|:-------:|
| bible.txt | 4047392 | 701049 | 1.386 |
| E.coli | 4638690 | 1126463 | 1.943 |
| world192.txt | 2473400 | 378508 | 1.224 |
| bible.txt | 4047392 | 699609 | 1.383 |
| E.coli | 4638690 | 1125573 | 1.941 |
| world192.txt | 2473400 | 377228 | 1.220 |

### Silesia Corpus ###
| File name | Input size (bytes) | Output size (bytes) | Bits per symbol |
|:---------------:|:-----------:|:------------:|:-------:|
| dickens | 10192446 | 2203859 | 1.730 |
| mozilla | 51220480 | 15630325 | 2.441 |
| mr | 9970564 | 2158802 | 1.732 |
| nci | 33553445 | 1130423 | 0.270 |
| ooffice | 6152192 | 2511633 | 3.266 |
| osdb | 10085684 | 2221807 | 1.762 |
| reymont | 6627202 | 962152 | 1.161 |
| samba | 21606400 | 3816749 | 1.413 |
| sao | 7251944 | 4651078 | 5.131 |
| webster | 41458703 | 6267572 | 1.209 |
| xml | 5345280 | 362358 | 0.542 |
| x-ray | 8474240 | 3681801 | 3.476 |
| dickens | 10192446 | 2201665 | 1.728 |
| mozilla | 51220480 | 15588180 | 2.435 |
| mr | 9970564 | 2157681 | 1.731 |
| nci | 33553445 | 1127979 | 0.269 |
| ooffice | 6152192 | 2506008 | 3.259 |
| osdb | 10085684 | 2218043 | 1.759 |
| reymont | 6627202 | 961063 | 1.160 |
| samba | 21606400 | 3802811 | 1.408 |
| sao | 7251944 | 4649665 | 5.129 |
| webster | 41458703 | 6259839 | 1.208 |
| xml | 5345280 | 359928 | 0.539 |
| x-ray | 8474240 | 3682048 | 3.476 |

### Manzini Corpus ###
| File name | Input size (bytes) | Output size (bytes) | Bits per symbol |
|:---------------:|:-----------:|:------------:|:-------:|
| chr22.dna | 34553758 | 7206590 | 1.668 |
| etext99 | 105277340 | 21508150 | 1.634 |
| gcc-3.0.tar | 86630400 | 10131247 | 0.936 |
| howto | 39422105 | 7556359 | 1.533 |
| jdk13c | 69728899 | 2638786 | 0.303 |
| linux-2.4.5.tar | 116254720 | 16489301 | 1.135 |
| rctail96 | 114711151 | 9788959 | 0.683 |
| rfc | 116421901 | 14967795 | 1.029 |
| sprot34.dat | 109617186 | 17259191 | 1.260 |
| w3c2 | 104201579 | 5666677 | 0.435 |
| chr22.dna | 34553758 | 7199372 | 1.667 |
| etext99 | 105277340 | 21444578 | 1.630 |
| gcc-3.0.tar | 86630400 | 10083390 | 0.931 |
| howto | 39422105 | 7521856 | 1.526 |
| jdk13c | 69728899 | 2625797 | 0.301 |
| linux-2.4.5.tar | 116254720 | 16410352 | 1.129 |
| rctail96 | 114711151 | 9732849 | 0.679 |
| rfc | 116421901 | 14912324 | 1.025 |
| sprot34.dat | 109617186 | 17175727 | 1.254 |
| w3c2 | 104201579 | 5629500 | 0.432 |

### Maximum Compression Corpus ###
| File name | Input size (bytes) | Output size (bytes) | Bits per symbol |
|:---------------:|:-----------:|:------------:|:-------:|
| A10.jpg | 842468 | 823496 | 7.820 |
| AcroRd32.exe | 3870784 | 1560548 | 3.225 |
| english.dic | 465211 | 145707 | 2.506 |
| FlashMX.pdf | 4526946 | 3717253 | 6.569 |
| FP.LOG | 20617071 | 505982 | 0.196 |
| MSO97.DLL | 3782416 | 1882533 | 3.982 |
| ohs.doc | 4168192 | 805796 | 1.547 |
| rafale.bmp | 4149414 | 743544 | 1.434 |
| vcfiu.hlp | 4121418 | 608769 | 1.182 |
| world95.txt | 2988578 | 445466 | 1.192 |
| A10.jpg | 842468 | 823451 | 7.819 |
| AcroRd32.exe | 3870784 | 1557055 | 3.218 |
| english.dic | 465211 | 145156 | 2.496 |
| FlashMX.pdf | 4526946 | 3712598 | 6.561 |
| FP.LOG | 20617071 | 504344 | 0.196 |
| MSO97.DLL | 3782416 | 1878874 | 3.974 |
| ohs.doc | 4168192 | 803197 | 1.542 |
| rafale.bmp | 4149414 | 743978 | 1.434 |
| vcfiu.hlp | 4121418 | 606466 | 1.177 |
| world95.txt | 2988578 | 443815 | 1.188 |

### Large Text Compression Benchmark Corpus ###
| File name | Input size (bytes) | Output size (bytes) | Bits per symbol |
|:---------------:|:-----------:|:------------:|:-------:|
| enwik8 | 100000000 | 20339773 | 1.627 |
| enwik9 | 1000000000 | 160616907 | 1.285 |
| enwik8 | 100000000 | 20293393 | 1.623 |
| enwik9 | 1000000000 | 160258936 | 1.282 |

### Pizza & Chilli Corpus ###
| File name | Input size (bytes) | Output size (bytes) | Bits per symbol |
|:---------------:|:-----------:|:------------:|:-------:|
| dblp.xml | 296135874 | 21993088 | 0.594 |
| dna | 403927746 | 86344131 | 1.710 |
| english.1024MB | 1073741824 | 194009933 | 1.445 |
| pitches | 55832855 | 16998266 | 2.436 |
| proteins | 1184051855 | 304161163 | 2.055 |
| sources | 210866607 | 29848691 | 1.132 |

### Pizza & Chilli Repetitive Corpus ###
| File name | Input size (bytes) | Output size (bytes) | Bits per symbol |
|:---------------:|:-----------:|:------------:|:-------:|
| cere | 461286644 | 8566781 | 0.149 |
| coreutils | 205281778 | 4314630 | 0.168 |
| einstein.de.txt | 92758441 | 132953 | 0.011 |
| einstein.en.txt | 467626544 | 336959 | 0.006 |
| Escherichia_Coli | 112689515 | 7878020 | 0.559 |
| influenza | 154808555 | 1743038 | 0.090 |
| kernel | 257961616 | 2973476 | 0.092 |
| para | 429265758 | 10678863 | 0.199 |
| world_leaders | 46968181 | 519391 | 0.088 |
| fib41 | 267914296 | 83 | 0.000 |
| rs.13 | 216747218 | 87 | 0.000 |
| tm29 | 268435456 | 160 | 0.000 |
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.3.0
0.4.0
24 changes: 19 additions & 5 deletions bsc-m03.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ This file is a part of bsc-m03 project.

int32_t root_frequencies[MAX_ALPHABET_SIZE + 1];

#if !defined(BSC_DECOMPRESSION_ONLY)

template <class symbol_t> static int32_t compress_memory_block(uint8_t * buffer, int32_t block_size)
{
int32_t indexes[32] = { -1 };
Expand All @@ -71,7 +73,7 @@ template <class symbol_t> static int32_t compress_memory_block(uint8_t * buffer,
{
if (symbol_t * L = (symbol_t *)malloc(((size_t)block_symbols + 1) * sizeof(symbol_t)))
{
if (m03_parser<symbol_t> * parser = (m03_parser<symbol_t> *)malloc(sizeof(m03_parser<symbol_t>)))
if (m03_parser<symbol_t, m03_mode::encoding> * parser = (m03_parser<symbol_t, m03_mode::encoding> *)malloc(sizeof(m03_parser<symbol_t, m03_mode::encoding>)))
{
{
int32_t primary_index = indexes[0];
Expand All @@ -91,7 +93,7 @@ template <class symbol_t> static int32_t compress_memory_block(uint8_t * buffer,
coder.EncodeValue(1, indexes[t], block_symbols);
}

if (parser->initialize(L, block_symbols + 1, indexes[0], root_frequencies, 1 << (8 * symbol_size), &coder, m03_mode::encoding))
if (parser->initialize(L, block_symbols + 1, indexes[0], root_frequencies, 1 << (8 * symbol_size), &coder))
{
parser->run();
parser->destroy();
Expand Down Expand Up @@ -130,6 +132,8 @@ template <class symbol_t> static int32_t compress_memory_block(uint8_t * buffer,
return comressed_size;
}

#endif

template <class symbol_t> static int32_t decompress_burrows_wheeler_transform(RangeCoder * coder, int32_t primary_index, int32_t block_size, uint8_t * buffer)
{
int32_t result = -1;
Expand All @@ -138,9 +142,9 @@ template <class symbol_t> static int32_t decompress_burrows_wheeler_transform(Ra

if (symbol_t * L = (symbol_t *)malloc(((size_t)block_symbols + 1) * sizeof(symbol_t)))
{
if (m03_parser<symbol_t> * parser = (m03_parser<symbol_t> *)malloc(sizeof(m03_parser<symbol_t>)))
if (m03_parser<symbol_t, m03_mode::decoding> * parser = (m03_parser<symbol_t, m03_mode::decoding> *)malloc(sizeof(m03_parser<symbol_t, m03_mode::decoding>)))
{
if (parser->initialize(L, block_symbols + 1, primary_index, root_frequencies, 1 << (8 * symbol_size), coder, m03_mode::decoding))
if (parser->initialize(L, block_symbols + 1, primary_index, root_frequencies, 1 << (8 * symbol_size), coder))
{
parser->run();
parser->destroy();
Expand Down Expand Up @@ -222,6 +226,8 @@ static int32_t decompress_memory_block(uint8_t * buffer, int32_t comressed_size,
return decomressed_size;
}

#if !defined(BSC_DECOMPRESSION_ONLY)

static int compress_file(const char * input_file_name, const char * output_file_name, int32_t max_block_size, int32_t symbol_size)
{
clock_t start_time = clock();
Expand Down Expand Up @@ -305,6 +311,8 @@ static int compress_file(const char * input_file_name, const char * output_file_
return 0;
}

#endif

static int decompress_file(const char * input_file_name, const char * output_file_name)
{
clock_t start_time = clock();
Expand Down Expand Up @@ -400,16 +408,20 @@ static int decompress_file(const char * input_file_name, const char * output_fil

/* Writes the command-line usage summary to stdout and returns 0. */
/* When BSC_DECOMPRESSION_ONLY is defined, only the 'd' usage line is printed; */
/* otherwise the <e|d> usage line and the -b / -w option lines are printed.    */
static int print_usage()
{
#if defined(BSC_DECOMPRESSION_ONLY)
    fputs("Usage: bsc-m03 d input-file output-file\n", stdout);
#else
    fputs("Usage: bsc-m03 <e|d> input-file output-file <options>\n", stdout);
    fputs(" -b<size> Block size in bytes, default 128MB (memory usage is ~13x).\n", stdout);
    fputs(" -w<8|16> Symbol width in bits.\n", stdout);
#endif

    return 0;
}

int main(int argc, const char * argv[])
{
fprintf(stdout, "bsc-m03 is experimental block sorting compressor. Version 0.3.0 (10 November 2022).\n");
fprintf(stdout, "bsc-m03 is experimental block sorting compressor. Version 0.4.0 (20 November 2022).\n");
fprintf(stdout, "Copyright (c) 2021-2022 Ilya Grebnov <[email protected]>. ABSOLUTELY NO WARRANTY.\n");
fprintf(stdout, "This program is based on (at least) the work of Michael Maniscalco (see AUTHORS).\n\n");

Expand Down Expand Up @@ -455,13 +467,15 @@ int main(int argc, const char * argv[])

switch (argv[1][0])
{
#if !defined(BSC_DECOMPRESSION_ONLY)
case 'c':
case 'C':
case 'e':
case 'E':
{
return compress_file(argv[2], argv[3], max_block_size, symbol_width / 8);
}
#endif

case 'd':
case 'D':
Expand Down
Loading

0 comments on commit 7e5666d

Please sign in to comment.