Skip to content

Commit

Permalink
Compression ratio improvements.
Browse files Browse the repository at this point in the history
  • Loading branch information
IlyaGrebnov committed Jan 4, 2023
1 parent 7e5666d commit 60afcd3
Show file tree
Hide file tree
Showing 7 changed files with 8,437 additions and 2,285 deletions.
3 changes: 3 additions & 0 deletions CHANGES
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
* 2022-11-27 : Version 0.5.0
* Compression ratio improvements.

* 2022-11-20 : Version 0.4.0
* Compression ratio improvements.

Expand Down
160 changes: 81 additions & 79 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ Copyright (c) 2021-2022 Ilya Grebnov <ilya.grebnov@gmail.com>
The bsc-m03 is released under the [GNU General Public License](LICENSE "GNU General Public License")

## Changes
* 2022-11-27 : Version 0.5.0
* Compression ratio improvements.
* 2022-11-20 : Version 0.4.0
* Compression ratio improvements.
* 2022-11-10 : Version 0.3.0
Expand All @@ -33,115 +35,115 @@ The bsc-m03 is released under the [GNU General Public License](LICENSE "GNU Gene
### Calgary Corpus ###
| File name | Input size (bytes) | Output size (bytes) | Bits per symbol |
|:---------------:|:-----------:|:------------:|:-------:|
| bib | 111261 | 24550 | 1.765 |
| book1 | 768771 | 203954 | 2.122 |
| book2 | 610856 | 139181 | 1.823 |
| geo | 102400 | 52482 | 4.100 |
| news | 377109 | 105915 | 2.247 |
| obj1 | 21504 | 9779 | 3.638 |
| obj2 | 246814 | 68229 | 2.212 |
| paper1 | 53161 | 15010 | 2.259 |
| paper2 | 82199 | 22641 | 2.204 |
| pic | 513216 | 44587 | 0.695 |
| progc | 39611 | 11303 | 2.283 |
| progl | 71646 | 13565 | 1.515 |
| progp | 49379 | 9291 | 1.505 |
| trans | 93695 | 15399 | 1.315 |
| bib | 111261 | 24479 | 1.760 |
| book1 | 768771 | 203745 | 2.120 |
| book2 | 610856 | 138870 | 1.819 |
| geo | 102400 | 52465 | 4.099 |
| news | 377109 | 105621 | 2.241 |
| obj1 | 21504 | 9775 | 3.637 |
| obj2 | 246814 | 68003 | 2.204 |
| paper1 | 53161 | 14957 | 2.251 |
| paper2 | 82199 | 22594 | 2.199 |
| pic | 513216 | 44424 | 0.692 |
| progc | 39611 | 11257 | 2.274 |
| progl | 71646 | 13512 | 1.509 |
| progp | 49379 | 9248 | 1.498 |
| trans | 93695 | 15310 | 1.307 |

### Canterbury Corpus ###
| File name | Input size (bytes) | Output size (bytes) | Bits per symbol |
|:---------------:|:-----------:|:------------:|:-------:|
| alice29.txt | 152089 | 38622 | 2.032 |
| asyoulik.txt | 125179 | 35959 | 2.298 |
| cp.html | 24603 | 6883 | 2.238 |
| fields.c | 11150 | 2704 | 1.940 |
| grammar.lsp | 3721 | 1130 | 2.429 |
| kennedy.xls | 1029744 | 56697 | 0.440 |
| lcet10.txt | 426754 | 95012 | 1.781 |
| plrabn12.txt | 481861 | 129960 | 2.158 |
| ptt5 | 513216 | 44587 | 0.695 |
| sum | 38240 | 11453 | 2.396 |
| xargs.1 | 4227 | 1587 | 3.004 |
| alice29.txt | 152089 | 38562 | 2.028 |
| asyoulik.txt | 125179 | 35889 | 2.294 |
| cp.html | 24603 | 6872 | 2.235 |
| fields.c | 11150 | 2685 | 1.926 |
| grammar.lsp | 3721 | 1120 | 2.408 |
| kennedy.xls | 1029744 | 57440 | 0.446 |
| lcet10.txt | 426754 | 94823 | 1.778 |
| plrabn12.txt | 481861 | 129770 | 2.154 |
| ptt5 | 513216 | 44424 | 0.692 |
| sum | 38240 | 11426 | 2.390 |
| xargs.1 | 4227 | 1585 | 3.000 |

### Large Canterbury Corpus ###
| File name | Input size (bytes) | Output size (bytes) | Bits per symbol |
|:---------------:|:-----------:|:------------:|:-------:|
| bible.txt | 4047392 | 699609 | 1.383 |
| E.coli | 4638690 | 1125573 | 1.941 |
| world192.txt | 2473400 | 377228 | 1.220 |
| bible.txt | 4047392 | 698395 | 1.380 |
| E.coli | 4638690 | 1126125 | 1.942 |
| world192.txt | 2473400 | 376173 | 1.217 |

### Silesia Corpus ###
| File name | Input size (bytes) | Output size (bytes) | Bits per symbol |
|:---------------:|:-----------:|:------------:|:-------:|
| dickens | 10192446 | 2201665 | 1.728 |
| mozilla | 51220480 | 15588180 | 2.435 |
| mr | 9970564 | 2157681 | 1.731 |
| nci | 33553445 | 1127979 | 0.269 |
| ooffice | 6152192 | 2506008 | 3.259 |
| osdb | 10085684 | 2218043 | 1.759 |
| reymont | 6627202 | 961063 | 1.160 |
| samba | 21606400 | 3802811 | 1.408 |
| sao | 7251944 | 4649665 | 5.129 |
| webster | 41458703 | 6259839 | 1.208 |
| xml | 5345280 | 359928 | 0.539 |
| x-ray | 8474240 | 3682048 | 3.476 |
| dickens | 10192446 | 2199344 | 1.726 |
| mozilla | 51220480 | 15589159 | 2.435 |
| mr | 9970564 | 2156826 | 1.731 |
| nci | 33553445 | 1126386 | 0.269 |
| ooffice | 6152192 | 2503991 | 3.256 |
| osdb | 10085684 | 2223002 | 1.763 |
| reymont | 6627202 | 958772 | 1.157 |
| samba | 21606400 | 3794300 | 1.405 |
| sao | 7251944 | 4649723 | 5.129 |
| webster | 41458703 | 6253627 | 1.207 |
| xml | 5345280 | 357958 | 0.536 |
| x-ray | 8474240 | 3681388 | 3.475 |

### Manzini Corpus ###
| File name | Input size (bytes) | Output size (bytes) | Bits per symbol |
|:---------------:|:-----------:|:------------:|:-------:|
| chr22.dna | 34553758 | 7199372 | 1.667 |
| etext99 | 105277340 | 21444578 | 1.630 |
| gcc-3.0.tar | 86630400 | 10083390 | 0.931 |
| howto | 39422105 | 7521856 | 1.526 |
| jdk13c | 69728899 | 2625797 | 0.301 |
| linux-2.4.5.tar | 116254720 | 16410352 | 1.129 |
| rctail96 | 114711151 | 9732849 | 0.679 |
| rfc | 116421901 | 14912324 | 1.025 |
| sprot34.dat | 109617186 | 17175727 | 1.254 |
| w3c2 | 104201579 | 5629500 | 0.432 |
| chr22.dna | 34553758 | 7206269 | 1.668 |
| etext99 | 105277340 | 21422251 | 1.628 |
| gcc-3.0.tar | 86630400 | 10046880 | 0.928 |
| howto | 39422105 | 7504315 | 1.523 |
| jdk13c | 69728899 | 2612434 | 0.300 |
| linux-2.4.5.tar | 116254720 | 16351863 | 1.125 |
| rctail96 | 114711151 | 9707347 | 0.677 |
| rfc | 116421901 | 14871775 | 1.022 |
| sprot34.dat | 109617186 | 17157222 | 1.252 |
| w3c2 | 104201579 | 5598687 | 0.430 |

### Maximum Compression Corpus ###
| File name | Input size (bytes) | Output size (bytes) | Bits per symbol |
|:---------------:|:-----------:|:------------:|:-------:|
| A10.jpg | 842468 | 823451 | 7.819 |
| AcroRd32.exe | 3870784 | 1557055 | 3.218 |
| english.dic | 465211 | 145156 | 2.496 |
| FlashMX.pdf | 4526946 | 3712598 | 6.561 |
| FP.LOG | 20617071 | 504344 | 0.196 |
| MSO97.DLL | 3782416 | 1878874 | 3.974 |
| ohs.doc | 4168192 | 803197 | 1.542 |
| rafale.bmp | 4149414 | 743978 | 1.434 |
| vcfiu.hlp | 4121418 | 606466 | 1.177 |
| world95.txt | 2988578 | 443815 | 1.188 |
| A10.jpg | 842468 | 823533 | 7.820 |
| AcroRd32.exe | 3870784 | 1555832 | 3.216 |
| english.dic | 465211 | 145096 | 2.495 |
| FlashMX.pdf | 4526946 | 3712716 | 6.561 |
| FP.LOG | 20617071 | 502648 | 0.195 |
| MSO97.DLL | 3782416 | 1878076 | 3.972 |
| ohs.doc | 4168192 | 803171 | 1.542 |
| rafale.bmp | 4149414 | 745470 | 1.437 |
| vcfiu.hlp | 4121418 | 604165 | 1.173 |
| world95.txt | 2988578 | 442271 | 1.184 |

### Large Text Compression Benchmark Corpus ###
| File name | Input size (bytes) | Output size (bytes) | Bits per symbol |
|:---------------:|:-----------:|:------------:|:-------:|
| enwik8 | 100000000 | 20293393 | 1.623 |
| enwik9 | 1000000000 | 160258936 | 1.282 |
| enwik8 | 100000000 | 20263925 | 1.621 |
| enwik9 | 1000000000 | 160018905 | 1.280 |

### Pizza & Chilli Corpus ###
| File name | Input size (bytes) | Output size (bytes) | Bits per symbol |
|:---------------:|:-----------:|:------------:|:-------:|
| dblp.xml | 296135874 | 21993088 | 0.594 |
| dna | 403927746 | 86344131 | 1.710 |
| english.1024MB | 1073741824 | 194009933 | 1.445 |
| pitches | 55832855 | 16998266 | 2.436 |
| proteins | 1184051855 | 304161163 | 2.055 |
| sources | 210866607 | 29848691 | 1.132 |
| dblp.xml | 296135874 | 21926695 | 0.592 |
| dna | 403927746 | 86414423 | 1.711 |
| english.1024MB | 1073741824 | 193810792 | 1.444 |
| pitches | 55832855 | 16984071 | 2.434 |
| proteins | 1184051855 | 304486803 | 2.057 |
| sources | 210866607 | 29749020 | 1.129 |

### Pizza & Chilli Repetitive Corpus ###
| File name | Input size (bytes) | Output size (bytes) | Bits per symbol |
|:---------------:|:-----------:|:------------:|:-------:|
| cere | 461286644 | 8566781 | 0.149 |
| coreutils | 205281778 | 4314630 | 0.168 |
| einstein.de.txt | 92758441 | 132953 | 0.011 |
| einstein.en.txt | 467626544 | 336959 | 0.006 |
| Escherichia_Coli | 112689515 | 7878020 | 0.559 |
| influenza | 154808555 | 1743038 | 0.090 |
| kernel | 257961616 | 2973476 | 0.092 |
| para | 429265758 | 10678863 | 0.199 |
| world_leaders | 46968181 | 519391 | 0.088 |
| cere | 461286644 | 8576879 | 0.149 |
| coreutils | 205281778 | 4293243 | 0.167 |
| einstein.de.txt | 92758441 | 132286 | 0.011 |
| einstein.en.txt | 467626544 | 336029 | 0.006 |
| Escherichia_Coli | 112689515 | 7928044 | 0.563 |
| influenza | 154808555 | 1760692 | 0.091 |
| kernel | 257961616 | 2955825 | 0.092 |
| para | 429265758 | 10730998 | 0.200 |
| world_leaders | 46968181 | 518220 | 0.088 |
| fib41 | 267914296 | 83 | 0.000 |
| rs.13 | 216747218 | 87 | 0.000 |
| tm29 | 268435456 | 160 | 0.000 |
| rs.13 | 216747218 | 86 | 0.000 |
| tm29 | 268435456 | 158 | 0.000 |
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.4.0
0.5.0
2 changes: 1 addition & 1 deletion bsc-m03.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -421,7 +421,7 @@ static int print_usage()

int main(int argc, const char * argv[])
{
fprintf(stdout, "bsc-m03 is experimental block sorting compressor. Version 0.4.0 (20 November 2022).\n");
fprintf(stdout, "bsc-m03 is experimental block sorting compressor. Version 0.5.0 (27 November 2022).\n");
fprintf(stdout, "Copyright (c) 2021-2022 Ilya Grebnov <[email protected]>. ABSOLUTELY NO WARRANTY.\n");
fprintf(stdout, "This program is based on (at least) the work of Michael Maniscalco (see AUTHORS).\n\n");

Expand Down
22 changes: 11 additions & 11 deletions m03_model.h
Original file line number Diff line number Diff line change
Expand Up @@ -232,18 +232,18 @@ template <m03_mode mode> class m03_model
{
int32_t state = 0;
state += 1 * (context);
state += 16 * (std::min((int32_t)symbols_remaining - 2, 7));
state += 128 * (std::min((int32_t)bit_scan_reverse(inferred_right + 1), 3));
state += 512 * (left_remaining + right_remaining + inferred_right == symbols_remaining);
state += 1024 * (left_remaining == total);
state += 2048 * (((int64_t)left_remaining * 11) / ((int64_t)right_remaining));
state += 32 * (std::min((int32_t)symbols_remaining - 2, 7));
state += 256 * (std::min((int32_t)bit_scan_reverse(inferred_right + 1), 3));
state += 1024 * (left_remaining + right_remaining + inferred_right == symbols_remaining);
state += 2048 * (left_remaining == total);
state += 4096 * (((int64_t)left_remaining * 11) / ((int64_t)right_remaining));

if (total == 1)
{
ptrdiff_t bucket = m03_T1_model_state_table[state];
ptrdiff_t bucket = m03_T1_model_m0_state_table[state];

uint16_t * RESTRICT predictor = &this->T1_model[bucket][0];
if (predictor[0] + predictor[1] > m03_T1_model_scale_table[bucket])
if (predictor[0] + predictor[1] > m03_T1_model_m0_scale_table[bucket])
{
predictor[0] = (predictor[0] + 1) >> 1;
predictor[1] = (predictor[1] + 1) >> 1;
Expand Down Expand Up @@ -414,10 +414,10 @@ template <m03_mode mode> class m03_model
int32_t state = 0;
state += 1 * (std::min((int32_t)bit_scan_reverse(total - 3), 7));
state += 8 * (context);
state += 128 * (std::min((int32_t)bit_scan_reverse(symbols_remaining - 1), 3));
state += 512 * (left_remaining == total);
state += 1024 * (inferred_right > 0);
state += 2048 * (((int64_t)left_remaining * 11) / ((int64_t)right_remaining));
state += 256 * (std::min((int32_t)bit_scan_reverse(symbols_remaining - 1), 3));
state += 1024 * (left_remaining == total);
state += 2048 * (inferred_right > 0);
state += 4096 * (((int64_t)left_remaining * 11) / ((int64_t)right_remaining));

int32_t pivot = (count == 0) | (count == total);

Expand Down
9 changes: 6 additions & 3 deletions m03_parser.h
Original file line number Diff line number Diff line change
Expand Up @@ -542,7 +542,7 @@ class m03_parser: m03_model<mode>
left_remaining -= ((uint32_t)(this->primary_index - parent_context_offset) < (uint32_t)left_interval_size );
right_remaining -= ((uint32_t)(this->primary_index - right_context_offset ) < (uint32_t)right_interval_size);

for (int32_t parent_symbol_index = 0; parent_symbol_index < parent_unique_symbols; ++parent_symbol_index)
for (int32_t pivot_history = 0, parent_symbol_index = 0; parent_symbol_index < parent_unique_symbols; ++parent_symbol_index)
{
symbol_t symbol = parent_context[parent_symbol_index].symbol;
int32_t offset = parent_context[parent_symbol_index].offset;
Expand All @@ -564,7 +564,10 @@ class m03_parser: m03_model<mode>
simple |= (total1 > 1) && (this->contexts[offset1].count == total1) && (this->contexts[offset1 + 1].count == 0);
}

context += 8 * simple;
context += 8 * simple + 16 * pivot_history;

left_leaf &= (left_remaining == left_interval_size );
right_leaf &= (right_remaining == right_interval_size);

if (total <= left_remaining + right_remaining - total)
{
Expand All @@ -590,7 +593,7 @@ class m03_parser: m03_model<mode>
count = std::min(left_remaining, total);
}

this->symbol_pivots[symbol][level] = (count == 0) | (count == total);
pivot_history |= (this->symbol_pivots[symbol][level] = (count == 0) | (count == total));

left_remaining = left_remaining - count;
right_remaining = right_remaining + count - total;
Expand Down
Loading

0 comments on commit 60afcd3

Please sign in to comment.