-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
New API to find matches within specified sliding window.
- Loading branch information
1 parent
b15c658
commit bb75b1c
Showing
9 changed files
with
196 additions
and
47 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,7 +2,7 @@ | |
|
||
The esa-matchfinder is a C99 library for efficient Lempel-Ziv factorization using enhanced suffix array (ESA). | ||
|
||
Copyright (c) 2022 Ilya Grebnov <[email protected]> | ||
Copyright (c) 2022-2023 Ilya Grebnov <[email protected]> | ||
|
||
> * The esa-matchfinder is a block-based algorithm with a maximum supported block size of 512 megabytes, finding matches in the range of 2..64 bytes using 12x bytes of extra memory. The ESA_MATCHFINDER_MATCH_BITS definition can be changed to support a larger match-finding range, at the cost of a reduced maximum supported block size. | ||
> * The esa-matchfinder does not employ any heuristics or search depth limitations and always finds distance-optimal matches, even on highly repetitive sources. The only exception is matches at the beginning of the block; due to implementation details, the esa-matchfinder cannot find any matches with offset 0. | ||
|
@@ -29,6 +29,8 @@ The esa-matchfinder finds all distance optimal matches (between min_match_length | |
The esa-matchfinder is released under the [Apache License Version 2.0](LICENSE "Apache license") and is considered suitable for production use. However, no warranty or fitness for a particular purpose is expressed or implied. | ||
|
||
## Changes | ||
* November 30, 2023 (1.1.0) | ||
* New API to find matches within specified sliding window. | ||
* June 19, 2022 (1.0.1) | ||
* Improved cache coherence for ARMv8 architecture. | ||
* June 12, 2022 (1.0.0) | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1 @@ | ||
1.0.1 | ||
1.1.0 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,7 +3,7 @@ | |
This file is a part of esa-matchfinder, a library for efficient | ||
Lempel-Ziv factorization using enhanced suffix array (ESA). | ||
Copyright (c) 2022 Ilya Grebnov <[email protected]> | ||
Copyright (c) 2022-2023 Ilya Grebnov <[email protected]> | ||
Licensed under the Apache License, Version 2.0 (the "License"); | ||
you may not use this file except in compliance with the License. | ||
|
@@ -714,7 +714,61 @@ ESA_MATCHFINDER_MATCH * esa_matchfinder_find_all_matches(void * mf, ESA_MATCHFIN | |
|
||
const uint64_t min_match_length = (uint64_t)matchfinder_ctx->min_match_length_minus_1; | ||
const uint64_t new_offset = (uint64_t)position << ESA_MF_OFFSET_SHIFT; | ||
uint64_t best_match = ESA_MATCHFINDER_MAX_MATCH_LENGTH; | ||
uint64_t best_match = (uint64_t)(uint32_t)-1; | ||
uint64_t reference = plcp_leaf_link[position]; | ||
|
||
while (reference != 0) | ||
{ | ||
const uint64_t interval = sa_parent_link[reference]; | ||
const uint64_t match = min_match_length + (interval >> ESA_MF_LCP_SHIFT) + ((interval & ESA_MF_OFFSET_MASK) << (32 - ESA_MF_OFFSET_SHIFT)); | ||
|
||
#if defined(__LITTLE_ENDIAN__) && !defined(__BIG_ENDIAN__) | ||
if (offsetof(ESA_MATCHFINDER_MATCH, length) == 0 && offsetof(ESA_MATCHFINDER_MATCH, offset) == 4) | ||
{ | ||
*(uint64_t *)(void *)next_match = match; | ||
} | ||
else | ||
#endif | ||
{ | ||
next_match->length = (int32_t)(match ); | ||
next_match->offset = (int32_t)(match >> 32); | ||
} | ||
|
||
next_match += match > best_match; | ||
best_match = match; | ||
|
||
sa_parent_link[reference] = (interval & (~ESA_MF_OFFSET_MASK)) + new_offset; | ||
reference = interval & ESA_MF_PARENT_MASK; | ||
} | ||
|
||
return next_match; | ||
} | ||
|
||
ESA_MATCHFINDER_MATCH * esa_matchfinder_find_all_matches_in_window(void * mf, ESA_MATCHFINDER_MATCH * matches, uint64_t window_size) | ||
{ | ||
ESA_MF_CONTEXT * ESA_MF_RESTRICT const matchfinder_ctx = (ESA_MF_CONTEXT *)mf; | ||
|
||
const ptrdiff_t prefetch_distance = 4; | ||
const uint64_t position = matchfinder_ctx->position++; | ||
|
||
uint64_t * ESA_MF_RESTRICT const sa_parent_link = matchfinder_ctx->sa_parent_link; | ||
uint32_t * ESA_MF_RESTRICT const plcp_leaf_link = matchfinder_ctx->plcp_leaf_link; | ||
uint64_t * ESA_MF_RESTRICT const prefetch = &matchfinder_ctx->prefetch[position & (prefetch_distance - 1)][0]; | ||
ESA_MATCHFINDER_MATCH * ESA_MF_RESTRICT next_match = matches; | ||
|
||
esa_matchfinder_prefetchw(&sa_parent_link[ (sa_parent_link[prefetch[0]] & ESA_MF_PARENT_MASK)]); | ||
esa_matchfinder_prefetchw(&sa_parent_link[prefetch[0] = (sa_parent_link[prefetch[1]] & ESA_MF_PARENT_MASK)]); | ||
esa_matchfinder_prefetchw(&sa_parent_link[prefetch[1] = (sa_parent_link[prefetch[2]] & ESA_MF_PARENT_MASK)]); | ||
esa_matchfinder_prefetchw(&sa_parent_link[prefetch[2] = (sa_parent_link[prefetch[3]] & ESA_MF_PARENT_MASK)]); | ||
esa_matchfinder_prefetchw(&sa_parent_link[prefetch[3] = (sa_parent_link[prefetch[4]] & ESA_MF_PARENT_MASK)]); | ||
esa_matchfinder_prefetchw(&sa_parent_link[prefetch[4] = (sa_parent_link[prefetch[5]] & ESA_MF_PARENT_MASK)]); | ||
esa_matchfinder_prefetchw(&sa_parent_link[prefetch[5] = (sa_parent_link[prefetch[6]] & ESA_MF_PARENT_MASK)]); | ||
esa_matchfinder_prefetchw(&sa_parent_link[prefetch[6] = (plcp_leaf_link[position + 8 * prefetch_distance])]); | ||
esa_matchfinder_prefetchr(&plcp_leaf_link[position + 9 * prefetch_distance]); | ||
|
||
const uint64_t min_match_length = (uint64_t)matchfinder_ctx->min_match_length_minus_1; | ||
const uint64_t new_offset = (uint64_t)position << ESA_MF_OFFSET_SHIFT; | ||
uint64_t best_match = (position > window_size ? (position - window_size) << 32 : 0) + (uint64_t)(uint32_t)-1; | ||
uint64_t reference = plcp_leaf_link[position]; | ||
|
||
while (reference != 0) | ||
|
@@ -801,6 +855,65 @@ ESA_MATCHFINDER_MATCH esa_matchfinder_find_best_match(void * mf) | |
} | ||
} | ||
|
||
// Finds the best match inside a sliding window at the current position, then advances the position by one. | ||
// mf is the opaque match-finder handle (cast to ESA_MF_CONTEXT below); window_size bounds how far back an accepted match may start. | ||
// Returns the accepted match unpacked into length/offset, or all zeroes when no candidate passed the window test. | ||
ESA_MATCHFINDER_MATCH esa_matchfinder_find_best_match_in_window(void * mf, uint64_t window_size) | ||
{ | ||
ESA_MF_CONTEXT * ESA_MF_RESTRICT const matchfinder_ctx = (ESA_MF_CONTEXT *)mf; | ||
|
||
const ptrdiff_t prefetch_distance = 4; | ||
// Post-increment: this call works on the old position and leaves the context one byte further. | ||
const uint64_t position = matchfinder_ctx->position++; | ||
|
||
uint64_t * ESA_MF_RESTRICT const sa_parent_link = matchfinder_ctx->sa_parent_link; | ||
uint32_t * ESA_MF_RESTRICT const plcp_leaf_link = matchfinder_ctx->plcp_leaf_link; | ||
// One prefetch row per residue of position modulo prefetch_distance. | ||
uint64_t * ESA_MF_RESTRICT const prefetch = &matchfinder_ctx->prefetch[position & (prefetch_distance - 1)][0]; | ||
|
||
// Advance the software-prefetch pipeline for this row: each slot takes the parent node | ||
// (ESA_MF_PARENT_MASK bits) of the node held in the next slot, and the last slot is seeded | ||
// from the leaf link 8 * prefetch_distance positions ahead of the current one. | ||
// NOTE(review): plcp_leaf_link is read/prefetched up to 9 * prefetch_distance entries past | ||
// position; presumably the arrays are padded for that -- confirm against the allocation code. | ||
esa_matchfinder_prefetchw(&sa_parent_link[ (sa_parent_link[prefetch[0]] & ESA_MF_PARENT_MASK)]); | ||
esa_matchfinder_prefetchw(&sa_parent_link[prefetch[0] = (sa_parent_link[prefetch[1]] & ESA_MF_PARENT_MASK)]); | ||
esa_matchfinder_prefetchw(&sa_parent_link[prefetch[1] = (sa_parent_link[prefetch[2]] & ESA_MF_PARENT_MASK)]); | ||
esa_matchfinder_prefetchw(&sa_parent_link[prefetch[2] = (sa_parent_link[prefetch[3]] & ESA_MF_PARENT_MASK)]); | ||
esa_matchfinder_prefetchw(&sa_parent_link[prefetch[3] = (sa_parent_link[prefetch[4]] & ESA_MF_PARENT_MASK)]); | ||
esa_matchfinder_prefetchw(&sa_parent_link[prefetch[4] = (sa_parent_link[prefetch[5]] & ESA_MF_PARENT_MASK)]); | ||
esa_matchfinder_prefetchw(&sa_parent_link[prefetch[5] = (sa_parent_link[prefetch[6]] & ESA_MF_PARENT_MASK)]); | ||
esa_matchfinder_prefetchw(&sa_parent_link[prefetch[6] = (plcp_leaf_link[position + 8 * prefetch_distance])]); | ||
esa_matchfinder_prefetchr(&plcp_leaf_link[position + 9 * prefetch_distance]); | ||
|
||
const uint64_t min_match_length = (uint64_t)matchfinder_ctx->min_match_length_minus_1; | ||
const uint64_t new_offset = (uint64_t)position << ESA_MF_OFFSET_SHIFT; | ||
// Window cutoff packed like a match value: the upper 32 bits hold position - window_size | ||
// (0 when position does not exceed window_size), the lower 32 bits are all ones. A candidate | ||
// compares greater only when its upper half (the offset, see the unpacking at the end) is | ||
// strictly larger -- assuming the length part never carries into the upper half. | ||
const uint64_t match_cutoff = (position > window_size ? (position - window_size) << 32 : 0) + (uint64_t)(uint32_t)-1; | ||
|
||
uint64_t best_match = 0; | ||
uint64_t reference = plcp_leaf_link[position]; | ||
|
||
// Walk from the leaf link of this position along the parent links until reference 0 is reached. | ||
while (reference != 0) | ||
{ | ||
const uint64_t interval = sa_parent_link[reference]; | ||
uint64_t match = min_match_length + (interval >> ESA_MF_LCP_SHIFT) + ((interval & ESA_MF_OFFSET_MASK) << (32 - ESA_MF_OFFSET_SHIFT)); | ||
|
||
// A candidate at or below the cutoff is replaced by the current best, so the latch on the | ||
// next line leaves best_match unchanged. The first accepted candidate is kept; later ones are ignored. | ||
match = match > match_cutoff ? match : best_match; | ||
best_match = best_match == 0 ? match : best_match; | ||
|
||
// Stamp the visited node with the current position as its offset, whether or not the | ||
// candidate was accepted, then continue with its parent. | ||
sa_parent_link[reference] = (interval & (~ESA_MF_OFFSET_MASK)) + new_offset; | ||
reference = interval & ESA_MF_PARENT_MASK; | ||
} | ||
|
||
{ | ||
ESA_MATCHFINDER_MATCH match; | ||
|
||
// Unpack the 64-bit value: low 32 bits are the length, high 32 bits are the offset. | ||
// The little-endian path stores both with one 64-bit write when the struct layout matches. | ||
#if defined(__LITTLE_ENDIAN__) && !defined(__BIG_ENDIAN__) | ||
if (offsetof(ESA_MATCHFINDER_MATCH, length) == 0 && offsetof(ESA_MATCHFINDER_MATCH, offset) == 4) | ||
{ | ||
*(uint64_t *)(void *)&match = best_match; | ||
} | ||
else | ||
#endif | ||
{ | ||
match.length = (int32_t)(best_match ); | ||
match.offset = (int32_t)(best_match >> 32); | ||
} | ||
|
||
return match; | ||
} | ||
} | ||
|
||
void esa_matchfinder_advance(void * mf, int32_t count) | ||
{ | ||
ESA_MF_CONTEXT * ESA_MF_RESTRICT const matchfinder_ctx = (ESA_MF_CONTEXT *)mf; | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,7 +3,7 @@ | |
This file is a part of esa-matchfinder, a library for efficient | ||
Lempel-Ziv factorization using enhanced suffix array (ESA). | ||
Copyright (c) 2022 Ilya Grebnov <[email protected]> | ||
Copyright (c) 2022-2023 Ilya Grebnov <[email protected]> | ||
Licensed under the Apache License, Version 2.0 (the "License"); | ||
you may not use this file except in compliance with the License. | ||
|
@@ -32,6 +32,11 @@ Please see the file LICENSE for full copyright and license details. | |
#define ESA_MATCHFINDER_NO_ERROR (0) | ||
#define ESA_MATCHFINDER_BAD_PARAMETER (-1) | ||
|
||
#define ESA_MATCHFINDER_VERSION_MAJOR 1 | ||
#define ESA_MATCHFINDER_VERSION_MINOR 1 | ||
#define ESA_MATCHFINDER_VERSION_PATCH 0 | ||
#define ESA_MATCHFINDER_VERSION_STRING "1.1.0" | ||
|
||
#ifdef __cplusplus | ||
extern "C" { | ||
#endif | ||
|
@@ -96,21 +101,39 @@ extern "C" { | |
int32_t esa_matchfinder_rewind(void * mf, int32_t position); | ||
|
||
/** | ||
* Finds all distance optimal matches at the current match-finder position and advances position by one byte. The recorded | ||
* matches will be sorted by strictly decreasing length and strictly increasing offset from the beginning of the block. | ||
* Finds all distance-optimal matches at the current position of the match-finder, and then advances the position by one byte. | ||
* The recorded matches will be sorted by strictly decreasing length and strictly increasing offset from the beginning of the block. | ||
* @param mf The enhanced suffix array (ESA) based match-finder. | ||
* @param matches The output array to record the matches (array must be of ESA_MATCHFINDER_MAX_MATCH_LENGTH size). | ||
* @return The pointer to the end of recorded matches array (if no matches were found, this will be the same as matches). | ||
*/ | ||
ESA_MATCHFINDER_MATCH * esa_matchfinder_find_all_matches(void * mf, ESA_MATCHFINDER_MATCH * matches); | ||
|
||
/** | ||
* Finds the best match at the current match-finder position and advances position by one byte. | ||
* Finds all distance-optimal matches within a specified sliding window at the current position of the match-finder, and then advances the position by one byte. | ||
* The recorded matches will be sorted by strictly decreasing length and strictly increasing offset from the beginning of the block. | ||
* @param mf The enhanced suffix array (ESA) based match-finder. | ||
* @param matches The output array to record the matches (array must be of ESA_MATCHFINDER_MAX_MATCH_LENGTH size). | ||
* @param window_size The maximum allowed distance between the current position and found matches. | ||
* @return The pointer to the end of recorded matches array (if no matches were found, this will be the same as matches). | ||
*/ | ||
ESA_MATCHFINDER_MATCH * esa_matchfinder_find_all_matches_in_window(void * mf, ESA_MATCHFINDER_MATCH * matches, uint64_t window_size); | ||
|
||
/** | ||
* Finds the best match at the current position of the match-finder, and then advances the position by one byte. | ||
* @param mf The enhanced suffix array (ESA) based match-finder. | ||
* @return The best match found (match of zero length and zero offset is returned if no matches were found). | ||
*/ | ||
ESA_MATCHFINDER_MATCH esa_matchfinder_find_best_match(void * mf); | ||
|
||
/** | ||
* Finds the best match within a specified sliding window at the current position of the match-finder, and then advances the position by one byte. | ||
* @param mf The enhanced suffix array (ESA) based match-finder. | ||
* @param window_size The maximum allowed distance between the current position and found match. | ||
* @return The best match found (match of zero length and zero offset is returned if no matches were found). | ||
*/ | ||
ESA_MATCHFINDER_MATCH esa_matchfinder_find_best_match_in_window(void * mf, uint64_t window_size); | ||
|
||
/** | ||
* Advances the match-finder position forward by the specified number of bytes without recording matches. | ||
* @param mf The enhanced suffix array (ESA) based match-finder. | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1 @@ | ||
2.7.1 | ||
2.7.2 |
Oops, something went wrong.