Skip to content

Commit

Permalink
Slightly improved compression using symbols history.
Browse files Browse the repository at this point in the history
  • Loading branch information
IlyaGrebnov committed Dec 7, 2021
1 parent 52d4631 commit f9631b6
Show file tree
Hide file tree
Showing 6 changed files with 141 additions and 118 deletions.
3 changes: 3 additions & 0 deletions CHANGES
Original file line number Diff line number Diff line change
@@ -1,2 +1,5 @@
* 2021-12-07 : Version 0.1.1
* Slightly improved compression using symbols history.

* 2021-12-03 : Version 0.1.0
* Initial public release of the bsc-m03.
126 changes: 64 additions & 62 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ Copyright (c) 2021 Ilya Grebnov <[email protected]>
The bsc-m03 is released under the [GNU General Public License](LICENSE "GNU General Public License")

## Changes
* 2021-12-07 : Version 0.1.1
* Slightly improved compression using symbols history.
* 2021-12-03 : Version 0.1.0
* Initial public release of the bsc-m03.

Expand All @@ -18,89 +20,89 @@ The libsais is released under the [GNU General Public License](LICENSE "GNU Gene
### Calgary Corpus ###
| File name | Input size (bytes) | Output size (bytes) | Bits per symbol |
|:---------------:|:-----------:|:------------:|:-------:|
| bib | 111261 | 25143 | 1.808 |
| book1 | 768771 | 208157 | 2.166 |
| book2 | 610856 | 141591 | 1.854 |
| geo | 102400 | 52797 | 4.125 |
| news | 377109 | 108387 | 2.299 |
| obj1 | 21504 | 9901 | 3.683 |
| obj2 | 246814 | 69689 | 2.259 |
| paper1 | 53161 | 15384 | 2.315 |
| paper2 | 82199 | 23161 | 2.254 |
| pic | 513216 | 44920 | 0.700 |
| progc | 39611 | 11525 | 2.328 |
| progl | 71646 | 13921 | 1.554 |
| progp | 49379 | 9530 | 1.544 |
| trans | 93695 | 15759 | 1.346 |
| bib | 111261 | 25098 | 1.805 |
| book1 | 768771 | 207930 | 2.164 |
| book2 | 610856 | 141245 | 1.850 |
| geo | 102400 | 52825 | 4.127 |
| news | 377109 | 107965 | 2.290 |
| obj1 | 21504 | 9904 | 3.685 |
| obj2 | 246814 | 69337 | 2.247 |
| paper1 | 53161 | 15330 | 2.307 |
| paper2 | 82199 | 23099 | 2.248 |
| pic | 513216 | 44961 | 0.701 |
| progc | 39611 | 11526 | 2.328 |
| progl | 71646 | 13892 | 1.551 |
| progp | 49379 | 9514 | 1.541 |
| trans | 93695 | 15739 | 1.344 |

### Canterbury Corpus ###
| File name | Input size (bytes) | Output size (bytes) | Bits per symbol |
|:---------------:|:-----------:|:------------:|:-------:|
| alice29.txt | 152089 | 39310 | 2.068 |
| asyoulik.txt | 125179 | 36585 | 2.338 |
| cp.html | 24603 | 7042 | 2.290 |
| fields.c | 11150 | 2748 | 1.972 |
| grammar.lsp | 3721 | 1142 | 2.455 |
| kennedy.xls | 1029744 | 58440 | 0.454 |
| lcet10.txt | 426754 | 96730 | 1.813 |
| plrabn12.txt | 481861 | 131617 | 2.185 |
| ptt5 | 513216 | 44920 | 0.700 |
| sum | 38240 | 11599 | 2.427 |
| xargs.1 | 4227 | 1618 | 3.062 |
| alice29.txt | 152089 | 39249 | 2.065 |
| asyoulik.txt | 125179 | 36508 | 2.333 |
| cp.html | 24603 | 7046 | 2.291 |
| fields.c | 11150 | 2752 | 1.975 |
| grammar.lsp | 3721 | 1148 | 2.468 |
| kennedy.xls | 1029744 | 58978 | 0.458 |
| lcet10.txt | 426754 | 96523 | 1.809 |
| plrabn12.txt | 481861 | 131473 | 2.183 |
| ptt5 | 513216 | 44961 | 0.701 |
| sum | 38240 | 11636 | 2.434 |
| xargs.1 | 4227 | 1620 | 3.066 |

### Large Canterbury Corpus ###
| File name | Input size (bytes) | Output size (bytes) | Bits per symbol |
|:---------------:|:-----------:|:------------:|:-------:|
| bible.txt | 4047392 | 708602 | 1.401 |
| E.coli | 4638690 | 1137915 | 1.962 |
| world192.txt | 2473400 | 384776 | 1.245 |
| bible.txt | 4047392 | 707710 | 1.399 |
| E.coli | 4638690 | 1138016 | 1.963 |
| world192.txt | 2473400 | 383758 | 1.241 |

### Silesia Corpus ###
| File name | Input size (bytes) | Output size (bytes) | Bits per symbol |
|:---------------:|:-----------:|:------------:|:-------:|
| dickens | 10192446 | 2220939 | 1.743 |
| mozilla | 51220480 | 15831237 | 2.473 |
| mr | 9970564 | 2169223 | 1.741 |
| nci | 33553445 | 1148550 | 0.274 |
| ooffice | 6152192 | 2542258 | 3.306 |
| osdb | 10085684 | 2251471 | 1.786 |
| reymont | 6627202 | 972461 | 1.174 |
| samba | 21606400 | 3881872 | 1.437 |
| sao | 7251944 | 4672656 | 5.155 |
| webster | 41458703 | 6318267 | 1.219 |
| xml | 5345280 | 369196 | 0.553 |
| x-ray | 8474240 | 3697722 | 3.491 |
| dickens | 10192446 | 2218186 | 1.741 |
| mozilla | 51220480 | 15784688 | 2.465 |
| mr | 9970564 | 2168769 | 1.740 |
| nci | 33553445 | 1147399 | 0.274 |
| ooffice | 6152192 | 2533840 | 3.295 |
| osdb | 10085684 | 2250910 | 1.785 |
| reymont | 6627202 | 970070 | 1.171 |
| samba | 21606400 | 3868421 | 1.432 |
| sao | 7251944 | 4671956 | 5.154 |
| webster | 41458703 | 6309084 | 1.217 |
| xml | 5345280 | 367771 | 0.550 |
| x-ray | 8474240 | 3698091 | 3.491 |

### Manzini Corpus ###
| File name | Input size (bytes) | Output size (bytes) | Bits per symbol |
|:---------------:|:-----------:|:------------:|:-------:|
| chr22.dna | 34553758 | 7262753 | 1.681 |
| etext99 | 105277340 | 21730495 | 1.651 |
| gcc-3.0.tar | 86630400 | 10306097 | 0.952 |
| howto | 39422105 | 7662880 | 1.555 |
| jdk13c | 69728899 | 2692938 | 0.309 |
| linux-2.4.5.tar | 116254720 | 16773180 | 1.154 |
| rctail96 | 114711151 | 9949692 | 0.694 |
| rfc | 116421901 | 15192366 | 1.044 |
| sprot34.dat | 109617186 | 17534134 | 1.280 |
| w3c2 | 104201579 | 5800775 | 0.445 |
| chr22.dna | 34553758 | 7262254 | 1.681 |
| etext99 | 105277340 | 21704149 | 1.649 |
| gcc-3.0.tar | 86630400 | 10263588 | 0.948 |
| howto | 39422105 | 7635242 | 1.549 |
| jdk13c | 69728899 | 2680664 | 0.308 |
| linux-2.4.5.tar | 116254720 | 16701149 | 1.149 |
| rctail96 | 114711151 | 9918165 | 0.692 |
| rfc | 116421901 | 15141656 | 1.040 |
| sprot34.dat | 109617186 | 17473161 | 1.275 |
| w3c2 | 104201579 | 5766640 | 0.443 |

### Maximum Compression Corpus ###
| File name | Input size (bytes) | Output size (bytes) | Bits per symbol |
|:---------------:|:-----------:|:------------:|:-------:|
| A10.jpg | 842468 | 825162 | 7.836 |
| AcroRd32.exe | 3870784 | 1582677 | 3.271 |
| english.dic | 465211 | 148582 | 2.555 |
| FlashMX.pdf | 4526946 | 3735179 | 6.601 |
| FP.LOG | 20617071 | 514554 | 0.200 |
| MSO97.DLL | 3782416 | 1904460 | 4.028 |
| ohs.doc | 4168192 | 817718 | 1.569 |
| rafale.bmp | 4149414 | 750437 | 1.447 |
| vcfiu.hlp | 4121418 | 620358 | 1.204 |
| world95.txt | 2988578 | 452271 | 1.211 |
| A10.jpg | 842468 | 825193 | 7.836 |
| AcroRd32.exe | 3870784 | 1576102 | 3.257 |
| english.dic | 465211 | 148631 | 2.556 |
| FlashMX.pdf | 4526946 | 3732972 | 6.597 |
| FP.LOG | 20617071 | 513631 | 0.199 |
| MSO97.DLL | 3782416 | 1897323 | 4.013 |
| ohs.doc | 4168192 | 814842 | 1.564 |
| rafale.bmp | 4149414 | 750463 | 1.447 |
| vcfiu.hlp | 4121418 | 617351 | 1.198 |
| world95.txt | 2988578 | 451058 | 1.207 |

### Large Text Compression Benchmark Corpus ###
| File name | Input size (bytes) | Output size (bytes) | Bits per symbol |
|:---------------:|:-----------:|:------------:|:-------:|
| enwik8 | 100000000 | 20529360 | 1.642 |
| enwik9 | 1000000000 | 162084133 | 1.297 |
| enwik8 | 100000000 | 20487507 | 1.639 |
| enwik9 | 1000000000 | 161805758 | 1.294 |
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.1.0
0.1.1
4 changes: 2 additions & 2 deletions bsc-m03.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -409,7 +409,7 @@ static int print_usage()

int main(int argc, const char * argv[])
{
fprintf(stdout, "bsc-m03 is experimental block sorting compressor. Version 0.1.0 (3 December 2021).\n");
fprintf(stdout, "bsc-m03 is experimental block sorting compressor. Version 0.1.1 (7 December 2021).\n");
fprintf(stdout, "Copyright (c) 2021 Ilya Grebnov <[email protected]>. ABSOLUTELY NO WARRANTY.\n");
fprintf(stdout, "This program is based on (at least) the work of Michael Maniscalco and Atsushi Komiya.\n\n");

Expand Down Expand Up @@ -480,4 +480,4 @@ int main(int argc, const char * argv[])
return 0;
}

#pragma warning( pop )
#pragma warning( pop )
46 changes: 29 additions & 17 deletions m03_model.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ This file is a part of bsc-m03 project.
#pragma warning( disable : 6385 )
#pragma warning( disable : 6386 )

// Size of the second dimension of m03_model::Symbol_history, i.e. how many history
// slots are kept per symbol. m03_model::predict() clamps its `level` argument to
// SYMBOL_HISTORY_MAX_DEPTH - 1 before using it as an index into that array.
#define SYMBOL_HISTORY_MAX_DEPTH (16)

// Operating mode handed to m03_model and stored in its `mode` member.
// NOTE(review): presumably selects whether the model drives the range coder's
// encode or decode path -- confirm against the branches in m03_model.
enum class m03_mode : int { encoding = 0, decoding = 1, };

class m03_model
Expand All @@ -46,9 +48,9 @@ class m03_model
this->coder = coder;
this->mode = mode;

for (int32_t s = 0; s < 1536; ++s) { T1_model[s][0] = T1_model[s][1] = 1; }
for (int32_t s = 0; s < 1536; ++s) { T2_model[s][0] = T2_model[s][1] = T2_model[s][2] = T2_model[s][3] = 1; }
for (int32_t s = 0; s < 768 ; ++s) { Ternary_model[s][0] = Ternary_model[s][1] = Ternary_model[s][2] = Ternary_model[s][3] = 1; }
for (int32_t s = 0; s < 3072; ++s) { T1_model[s][0] = T1_model[s][1] = 1; }
for (int32_t s = 0; s < 3072; ++s) { T2_model[s][0] = T2_model[s][1] = T2_model[s][2] = T2_model[s][3] = 1; }
for (int32_t s = 0; s < 1536; ++s) { Ternary_model[s][0] = Ternary_model[s][1] = Ternary_model[s][2] = Ternary_model[s][3] = 1; }
for (int32_t s = 0; s < 96 ; ++s) { for (int32_t c = 0; c < 16; ++c) { Tree_model[s][c] = 1; } }
}

Expand Down Expand Up @@ -215,31 +217,38 @@ class m03_model
}
}

int32_t predict(int32_t count, int32_t total, int32_t left_remaining, int32_t right_remaining, int32_t symbols_remaining)
int32_t predict(int32_t count, int32_t total, int32_t left_remaining, int32_t right_remaining, int32_t symbols_remaining, int32_t symbol, int32_t level)
{
level = std::min(level, SYMBOL_HISTORY_MAX_DEPTH - 1); this->Symbol_history[symbol][level] = 0;

int32_t inferred_right = std::max(total - left_remaining, 0);
right_remaining -= inferred_right; total -= inferred_right;

assert(total <= right_remaining);

if (total > 0)
{
int32_t history =
level > 1 ? this->Symbol_history[symbol][level - 1] | this->Symbol_history[symbol][level - 2] :
level > 0 ? this->Symbol_history[symbol][level - 1] : 0;

if (total <= 2)
{
int32_t state = 0;
state += 1 * (std::min((int32_t)symbols_remaining - 2, 5));
state += 8 * (std::min((int32_t)bit_scan_reverse(inferred_right + 1), 3));
state += 32 * (left_remaining + right_remaining == symbols_remaining);
state += 64 * (left_remaining == total);
state += 128 * (((int64_t)left_remaining * 11) / ((int64_t)right_remaining));
state += 128 * (history);
state += 256 * (((int64_t)left_remaining * 11) / ((int64_t)right_remaining));

if (total == 1)
{
static const int threshold[12] = { 147, 251, 374, 540, 761, 763, 1589, 2275, 2193, 3457, 3811, 1017 };

uint16_t * RESTRICT predictor = &this->T1_model[state][0];
uint16_t * RESTRICT predictor = &this->T1_model[state & 0xffffff7f][0];

if (predictor[0] + predictor[1] > threshold[state >> 7])
if (predictor[0] + predictor[1] > threshold[state >> 8])
{
predictor[0] = (predictor[0] + (predictor[0] < 2)) >> 1;
predictor[1] = (predictor[1] + (predictor[1] < 2)) >> 1;
Expand All @@ -265,7 +274,7 @@ class m03_model

uint16_t * RESTRICT predictor = &this->T2_model[state][0];

if (predictor[0] + predictor[1] + predictor[2] > threshold[state >> 7])
if (predictor[0] + predictor[1] + predictor[2] > threshold[state >> 8])
{
predictor[0] = (predictor[0] + (predictor[0] < 2)) >> 1;
predictor[1] = (predictor[1] + (predictor[1] < 2)) >> 1;
Expand All @@ -287,7 +296,7 @@ class m03_model
this->coder->Decode(cum_freq, predictor[count], predictor[0] + predictor[1] + predictor[2]);
}

predictor[count]++;
predictor[count]++; this->Symbol_history[symbol][level] = count != 1;
}
}
else
Expand All @@ -309,12 +318,13 @@ class m03_model
state += 1 * (std::min((int32_t)bit_scan_reverse(symbols_remaining - 1), 3));
state += 4 * (inferred_right > 0);
state += 8 * (left_remaining == total);
state += 16 * (std::min((int32_t)bit_scan_reverse(total - 2), 7));
state += 128 * (((int64_t)left_remaining * 9 + right_remaining) / ((int64_t)right_remaining * 2));
state += 16 * (history);
state += 32 * (std::min((int32_t)bit_scan_reverse(total - 2), 7));
state += 256 * (((int64_t)left_remaining * 9 + right_remaining) / ((int64_t)right_remaining * 2));

uint16_t * RESTRICT predictor = &this->Ternary_model[state][0];

if (predictor[0] + predictor[1] + predictor[2] > threshold[state >> 4])
if (predictor[0] + predictor[1] + predictor[2] > threshold[state >> 5])
{
predictor[0] = (predictor[0] + (predictor[0] < 2)) >> 1;
predictor[1] = (predictor[1] + (predictor[1] < 2)) >> 1;
Expand All @@ -336,7 +346,7 @@ class m03_model
this->coder->Decode(cum_freq, predictor[pivot], predictor[0] + predictor[1] + predictor[2]);
}

predictor[pivot]++; if (pivot != 1) { count = pivot == 0 ? 0 : total; }
predictor[pivot]++; this->Symbol_history[symbol][level] = pivot != 1; if (pivot != 1) { count = pivot == 0 ? 0 : total; }
}

if (pivot == 1)
Expand Down Expand Up @@ -399,10 +409,12 @@ class m03_model
private:
RangeCoder * coder;

uint16_t T1_model[1536][2];
uint16_t T2_model[1536][4];
uint16_t Ternary_model[768][4];
uint16_t T1_model[3072][2];
uint16_t T2_model[3072][4];
uint16_t Ternary_model[1536][4];
uint16_t Tree_model[96][16];

uint8_t Symbol_history[MAX_ALPHABET_SIZE][SYMBOL_HISTORY_MAX_DEPTH];
};

#pragma warning( pop )
#pragma warning( pop )
Loading

0 comments on commit f9631b6

Please sign in to comment.