diff --git a/CHANGES b/CHANGES new file mode 100644 index 0000000..d7dae51 --- /dev/null +++ b/CHANGES @@ -0,0 +1,2 @@ +* 2021-12-03 : Version 0.1.0 + * Initial public release of the bsc-m03. diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..dd740e1 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,5 @@ +cmake_minimum_required (VERSION 3.9) + +project ("bsc-m03") + +add_executable (bsc-m03 bsc-m03.cpp hutucker/hu-tucker.c libsais/libsais.c libsais/libsais16.c) \ No newline at end of file diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..f288702 --- /dev/null +++ b/LICENSE @@ -0,0 +1,674 @@ + GNU GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The GNU General Public License is a free, copyleft license for +software and other kinds of works. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. By contrast, +the GNU General Public License is intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. We, the Free Software Foundation, use the +GNU General Public License for most of our software; it applies also to +any other work released this way by its authors. You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + + To protect your rights, we need to prevent others from denying you +these rights or asking you to surrender the rights. Therefore, you have +certain responsibilities if you distribute copies of the software, or if +you modify it: responsibilities to respect the freedom of others. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must pass on to the recipients the same +freedoms that you received. You must make sure that they, too, receive +or can get the source code. And you must show them these terms so they +know their rights. + + Developers that use the GNU GPL protect your rights with two steps: +(1) assert copyright on the software, and (2) offer you this License +giving you legal permission to copy, distribute and/or modify it. + + For the developers' and authors' protection, the GPL clearly explains +that there is no warranty for this free software. For both users' and +authors' sake, the GPL requires that modified versions be marked as +changed, so that their problems will not be attributed erroneously to +authors of previous versions. + + Some devices are designed to deny users access to install or run +modified versions of the software inside them, although the manufacturer +can do so. This is fundamentally incompatible with the aim of +protecting users' freedom to change the software. The systematic +pattern of such abuse occurs in the area of products for individuals to +use, which is precisely where it is most unacceptable. Therefore, we +have designed this version of the GPL to prohibit the practice for those +products. 
If such problems arise substantially in other domains, we +stand ready to extend this provision to those domains in future versions +of the GPL, as needed to protect the freedom of users. + + Finally, every program is threatened constantly by software patents. +States should not allow patents to restrict development and use of +software on general-purpose computers, but in those that do, we wish to +avoid the special danger that patents applied to a free program could +make it effectively proprietary. To prevent this, the GPL assures that +patents cannot be used to render the program non-free. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. + + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. + + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. 
A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. + + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. + + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. 
+ + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. + + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. + + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. + + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. 
+ + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. + + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. + + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. 
+ + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. + + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. 
+ + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. + + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. + + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. + + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. 
If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. + + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. + + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). 
To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. + + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Use with the GNU Affero General Public License. 
+ + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU Affero General Public License into a single +combined work, and to convey the resulting work. The terms of this +License will continue to apply to the part which is the covered work, +but the special requirements of the GNU Affero General Public License, +section 13, concerning interaction through a network will apply to the +combination as such. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU General Public License, you may choose any version ever published +by the Free Software Foundation. + + If the Program specifies that a proxy can decide which future +versions of the GNU General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. + + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. 
+ + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. + +Also add information on how to contact you by electronic and paper mail. + + If the program does terminal interaction, make it output a short +notice like this when it starts in an interactive mode: + + <program> Copyright (C) <year> <name of author> + This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, your program's commands +might be different; for a GUI interface, you would use an "about box". + + You should also get your employer (if you work as a programmer) or school, +if any, to sign a "copyright disclaimer" for the program, if necessary. +For more information on this, and how to apply and follow the GNU GPL, see +<https://www.gnu.org/licenses/>. + + The GNU General Public License does not permit incorporating your program +into proprietary programs. If your program is a subroutine library, you +may consider it more useful to permit linking proprietary applications with +the library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. But first, please read +<https://www.gnu.org/philosophy/why-not-lgpl.html>. diff --git a/README.md b/README.md new file mode 100644 index 0000000..d328b37 --- /dev/null +++ b/README.md @@ -0,0 +1,106 @@ +# bsc-m03 + +The bsc-m03 is an experimental block sorting compressor based on the M03 context-aware compression algorithm invented by Michael Maniscalco: +* Michael Maniscalco, *M03: A solution for context based blocksort (BWT) compression*, 2004 +* Jurgen Abel, *Post BWT stages of the Burrows-Wheeler compression algorithm*, 2010 + +Copyright (c) 2021 Ilya Grebnov + +## License +The bsc-m03 is released under the [GNU General Public License](LICENSE "GNU General Public License") + +## Changes +* 2021-12-03 : Version 0.1.0 + * Initial public release of the bsc-m03.
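+
+## Example usage
+
+The command-line interface is defined by print_usage() and main() in bsc-m03.cpp later in this patch; the sketch below only illustrates it. File names are placeholders, and option values are appended directly to the switch, so -b134217728 selects the default 128MB block size and -w8 the default 8-bit symbol width:
+
+```
+bsc-m03 e input.dat output.m03 -b134217728 -w8
+bsc-m03 d output.m03 restored.dat
+```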
+ +# Benchmarks + +### Calgary Corpus ### +| File name | Input size (bytes) | Output size (bytes) | Bits per symbol | +|:---------------:|:-----------:|:------------:|:-------:| +| bib | 111261 | 25143 | 1.808 | +| book1 | 768771 | 208157 | 2.166 | +| book2 | 610856 | 141591 | 1.854 | +| geo | 102400 | 52797 | 4.125 | +| news | 377109 | 108387 | 2.299 | +| obj1 | 21504 | 9901 | 3.683 | +| obj2 | 246814 | 69689 | 2.259 | +| paper1 | 53161 | 15384 | 2.315 | +| paper2 | 82199 | 23161 | 2.254 | +| pic | 513216 | 44920 | 0.700 | +| progc | 39611 | 11525 | 2.328 | +| progl | 71646 | 13921 | 1.554 | +| progp | 49379 | 9530 | 1.544 | +| trans | 93695 | 15759 | 1.346 | + +### Canterbury Corpus ### +| File name | Input size (bytes) | Output size (bytes) | Bits per symbol | +|:---------------:|:-----------:|:------------:|:-------:| +| alice29.txt | 152089 | 39310 | 2.068 | +| asyoulik.txt | 125179 | 36585 | 2.338 | +| cp.html | 24603 | 7042 | 2.290 | +| fields.c | 11150 | 2748 | 1.972 | +| grammar.lsp | 3721 | 1142 | 2.455 | +| kennedy.xls | 1029744 | 58440 | 0.454 | +| lcet10.txt | 426754 | 96730 | 1.813 | +| plrabn12.txt | 481861 | 131617 | 2.185 | +| ptt5 | 513216 | 44920 | 0.700 | +| sum | 38240 | 11599 | 2.427 | +| xargs.1 | 4227 | 1618 | 3.062 | + +### Large Canterbury Corpus ### +| File name | Input size (bytes) | Output size (bytes) | Bits per symbol | +|:---------------:|:-----------:|:------------:|:-------:| +| bible.txt | 4047392 | 708602 | 1.401 | +| E.coli | 4638690 | 1137915 | 1.962 | +| world192.txt | 2473400 | 384776 | 1.245 | + +### Silesia Corpus ### +| File name | Input size (bytes) | Output size (bytes) | Bits per symbol | +|:---------------:|:-----------:|:------------:|:-------:| +| dickens | 10192446 | 2220939 | 1.743 | +| mozilla | 51220480 | 15831237 | 2.473 | +| mr | 9970564 | 2169223 | 1.741 | +| nci | 33553445 | 1148550 | 0.274 | +| ooffice | 6152192 | 2542258 | 3.306 | +| osdb | 10085684 | 2251471 | 1.786 | +| reymont | 6627202 | 972461 | 1.174 | +| samba | 21606400 | 3881872 | 1.437 | +| sao | 7251944 | 4672656 | 5.155 | +| webster | 41458703 | 6318267 | 1.219 | +| xml | 5345280 | 369196 | 0.553 | +| x-ray | 8474240 | 3697722 | 3.491 | + +### Manzini Corpus ### +| File name | Input size (bytes) | Output size (bytes) | Bits per symbol | +|:---------------:|:-----------:|:------------:|:-------:| +| chr22.dna | 34553758 | 7262753 | 1.681 | +| etext99 | 105277340 | 21730495 | 1.651 | +| gcc-3.0.tar | 86630400 | 10306097 | 0.952 | +| howto | 39422105 | 7662880 | 1.555 | +| jdk13c | 69728899 | 2692938 | 0.309 | +| linux-2.4.5.tar | 116254720 | 16773180 | 1.154 | +| rctail96 | 114711151 | 9949692 | 0.694 | +| rfc | 116421901 | 15192366 | 1.044 | +| sprot34.dat | 109617186 | 17534134 | 1.280 | +| w3c2 | 104201579 | 5800775 | 0.445 | + +### Maximum Compression Corpus ### +| File name | Input size (bytes) | Output size (bytes) | Bits per symbol | +|:---------------:|:-----------:|:------------:|:-------:| +| A10.jpg | 842468 | 825162 | 7.836 | +| AcroRd32.exe | 3870784 | 1582677 | 3.271 | +| english.dic | 465211 | 148582 | 2.555 | +| FlashMX.pdf | 4526946 | 3735179 | 6.601 | +| FP.LOG | 20617071 | 514554 | 0.200 | +| MSO97.DLL | 3782416 | 1904460 | 4.028 | +| ohs.doc | 4168192 | 817718 | 1.569 | +| rafale.bmp | 4149414 | 750437 | 1.447 | +| vcfiu.hlp | 4121418 | 620358 | 1.204 | +| world95.txt | 2988578 | 452271 | 1.211 | + +### Large Text Compression Benchmark Corpus ### +| File name | Input size (bytes) | Output size (bytes) | Bits per symbol | 
+|:---------------:|:-----------:|:------------:|:-------:| +| enwik8 | 100000000 | 20529360 | 1.642 | +| enwik9 | 1000000000 | 162084133 | 1.297 | diff --git a/VERSION b/VERSION new file mode 100644 index 0000000..6c6aa7c --- /dev/null +++ b/VERSION @@ -0,0 +1 @@ +0.1.0 \ No newline at end of file diff --git a/bsc-m03.cpp b/bsc-m03.cpp new file mode 100644 index 0000000..0279baa --- /dev/null +++ b/bsc-m03.cpp @@ -0,0 +1,483 @@ +/*-- + +This file is a part of bsc-m03 project. + + Copyright (c) 2021 Ilya Grebnov + + bsc-m03 is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + bsc-m03 is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with bsc-m03. If not, see . + +--*/ + +#define _CRT_SECURE_NO_WARNINGS + +#include +#include +#include +#include +#include + +#include + +#include "libsais/libsais.h" +#include "libsais/libsais16.h" + +#include "common/platform.h" +#include "common/rangecoder.h" + +#define MAX_ALPHABET_SIZE (256 * 256) + +#include "m03_parser.h" + +#pragma warning( push ) +#pragma warning( disable : 6385 ) +#pragma warning( disable : 6386 ) + +int32_t root_frequencies[MAX_ALPHABET_SIZE + 1]; + +static int32_t compress_memory_block(uint8_t * buffer, int32_t block_size, int32_t symbol_size) +{ + if (block_size % symbol_size != 0) + { + fprintf(stderr, "\nError: Block size of %d bytes is not a multiple of symbol width!\n", block_size); + return -2; + } + + int32_t indexes[32] = { -1 }; + int32_t comressed_size = -1; + int32_t block_symbols = block_size / symbol_size; + int32_t r = next_power_of_2(std::max(block_symbols / 16, 1048576)); + + if (int32_t * libsais_temp = (int32_t *)malloc(block_symbols * sizeof(int32_t))) + { + int32_t result = symbol_size == 1 + ? libsais_bwt_aux(buffer, buffer, libsais_temp, block_symbols, 0, root_frequencies, r, indexes) + : libsais16_bwt_aux((uint16_t *)buffer, (uint16_t *)buffer, libsais_temp, block_symbols, 0, root_frequencies, r, indexes); + + free(libsais_temp); + + if (result == 0) + { + if (uint16_t * L = (uint16_t *)malloc(((size_t)block_symbols + 1) * sizeof(uint16_t))) + { + if (m03_parser * parser = (m03_parser *)malloc(sizeof(m03_parser))) + { + { + int32_t primary_index = indexes[0]; + + if (symbol_size == 1) + { + for (int32_t p = 0; p < primary_index; ++p) { L[p + 0] = ((uint16_t)buffer[p]); } + for (int32_t p = primary_index; p < block_symbols; ++p) { L[p + 1] = ((uint16_t)buffer[p]); } + } + else + { + for (int32_t p = 0; p < primary_index; ++p) { L[p + 0] = ((uint16_t *)buffer)[p]; } + for (int32_t p = primary_index; p < block_symbols; ++p) { L[p + 1] = ((uint16_t *)buffer)[p]; } + } + + L[primary_index] = 0; + } + + RangeCoder coder; + coder.InitEncoder(buffer, block_size); + coder.EncodeValue(1, symbol_size, 2); + + for (int32_t t = 0; t <= (block_symbols - 1) / r; ++t) + { + coder.EncodeValue(1, indexes[t], block_symbols); + } + + if (parser->initialize(L, block_symbols + 1, indexes[0], root_frequencies, symbol_size == 1 ? 
256 : 256 * 256, &coder, m03_mode::encoding)) + { + parser->run(); + parser->destroy(); + + comressed_size = coder.FinishEncoder(); + } + else + { + fprintf(stderr, "\nError: Not enough memory!\n"); + } + + free(parser); + } + else + { + fprintf(stderr, "\nError: Not enough memory!\n"); + } + + free(L); + } + else + { + fprintf(stderr, "\nError: Not enough memory!\n"); + } + } + else + { + fprintf(stderr, "\nError: libsais_bwt failed, please contact the author!\n"); + } + } + else + { + fprintf(stderr, "\nError: Not enough memory!\n"); + } + + return comressed_size; +} + +static int32_t decompress_memory_block(uint8_t * buffer, int32_t comressed_size, int32_t block_size) +{ + RangeCoder coder; + coder.InitDecoder(buffer); + int32_t symbol_size = coder.DecodeValue(1, 2); + + int32_t indexes[32] = { -1 }; + int32_t primary_index = -1; + int32_t decomressed_size = -1; + int32_t block_symbols = block_size / symbol_size; + int32_t r = next_power_of_2(std::max(block_symbols / 16, 1048576)); + + for (int32_t t = 0; t <= (block_symbols - 1) / r; ++t) + { + indexes[t] = coder.DecodeValue(1, block_symbols); + } + + if (uint16_t * L = (uint16_t *)malloc(((size_t)block_symbols + 1) * sizeof(uint16_t))) + { + if (m03_parser * parser = (m03_parser *)malloc(sizeof(m03_parser))) + { + if (parser->initialize(L, block_symbols + 1, indexes[0], root_frequencies, symbol_size == 1 ? 256 : 256 * 256, &coder, m03_mode::decoding)) + { + parser->run(); + parser->destroy(); + + { + primary_index = indexes[0]; + + if (symbol_size == 1) + { + for (int32_t p = 0; p < primary_index; ++p) { buffer[p] = (uint8_t)L[p + 0]; } + for (int32_t p = primary_index; p < block_symbols; ++p) { buffer[p] = (uint8_t)L[p + 1]; } + } + else + { + for (int32_t p = 0; p < primary_index; ++p) { ((uint16_t *)buffer)[p] = L[p + 0]; } + for (int32_t p = primary_index; p < block_symbols; ++p) { ((uint16_t *)buffer)[p] = L[p + 1]; } + } + } + } + else + { + fprintf(stderr, "\nError: Not enough memory!\n"); + } + + free(parser); + } + else + { + fprintf(stderr, "\nError: Not enough memory!\n"); + } + + free(L); + } + else + { + fprintf(stderr, "\nError: Not enough memory!\n"); + } + + if (primary_index > 0) + { + if (int32_t * libsais_temp = (int32_t *)malloc(((size_t)block_symbols + 1) * sizeof(int32_t))) + { + int32_t result = symbol_size == 1 + ? 
libsais_unbwt_aux(buffer, buffer, libsais_temp, block_symbols, root_frequencies, r, indexes) + : libsais16_unbwt_aux((uint16_t *)buffer, (uint16_t *)buffer, libsais_temp, block_symbols, root_frequencies, r, indexes); + + if (result == 0) + { + decomressed_size = block_size; + } + else + { + fprintf(stderr, "\nError: libsais_unbwt failed, please contact the author!\n"); + } + + free(libsais_temp); + } + else + { + fprintf(stderr, "\nError: Not enough memory!\n"); + } + } + + return decomressed_size; +} + +static int compress_file(const char * input_file_name, const char * output_file_name, int32_t max_block_size, int32_t symbol_size) +{ + clock_t start_time = clock(); + if (FILE * input_file = fopen(input_file_name, "rb")) + { + if (FILE * output_file = fopen(output_file_name, "wb")) + { + fseek(input_file, 0, SEEK_END); int64_t remaining_size = _ftelli64(input_file); rewind(input_file); + + if (uint8_t * buffer = (uint8_t *)malloc(std::min(remaining_size, (int64_t)max_block_size) * sizeof(uint8_t))) + { + int64_t input_bytes = 0, output_bytes = 0; + + while (remaining_size > 0) + { + fprintf(stdout, "\rCompressing %.55s(%02d%%)", input_file_name, (int)((input_bytes * 100) / (input_bytes + remaining_size))); + + int32_t block_size = (int32_t)std::min(remaining_size, (int64_t)max_block_size); + + if (fread(buffer, sizeof(uint8_t), block_size, input_file) != block_size) + { + fprintf(stderr, "\nError: Unable to read input file!\n"); + break; + } + + int32_t comressed_size = compress_memory_block(buffer, block_size, symbol_size); + if (comressed_size <= 0) { break; } + + if (fwrite(&block_size, sizeof(uint8_t), sizeof(block_size), output_file) != sizeof(block_size)) + { + fprintf(stderr, "\nError: Unable to write output file!\n"); + break; + } + + if (fwrite(&comressed_size, sizeof(uint8_t), sizeof(comressed_size), output_file) != sizeof(comressed_size)) + { + fprintf(stderr, "\nError: Unable to write output file!\n"); + break; + } + + if (fwrite(buffer, sizeof(uint8_t), comressed_size, output_file) != comressed_size) + { + fprintf(stderr, "\nError: Unable to write output file\n"); + break; + } + + remaining_size -= block_size; + input_bytes += block_size; + output_bytes += sizeof(block_size) + sizeof(comressed_size) + comressed_size; + } + + if (remaining_size == 0) + { + fprintf(stdout, "\r%.55s compressed from %lld into %lld in %.3f seconds (%.3f bps).\n", input_file_name, input_bytes, output_bytes, ((double)clock() - start_time) / CLOCKS_PER_SEC, (8.0 * symbol_size * output_bytes) / input_bytes); + } + + free(buffer); + } + else + { + fprintf(stderr, "Error: Not enough memory!\n"); + } + + fclose(output_file); + } + else + { + fprintf(stderr, "Error: Unable to open output file!\n"); + } + + fclose(input_file); + } + else + { + fprintf(stderr, "Error: Unable to open input file!\n"); + } + + return 0; +} + +static int decompress_file(const char * input_file_name, const char * output_file_name) +{ + clock_t start_time = clock(); + if (FILE * input_file = fopen(input_file_name, "rb")) + { + if (FILE * output_file = fopen(output_file_name, "wb")) + { + int32_t max_block_size; + if (fread(&max_block_size, sizeof(uint8_t), sizeof(max_block_size), input_file) == sizeof(max_block_size)) + { + fseek(input_file, 0, SEEK_END); int64_t remaining_size = _ftelli64(input_file); rewind(input_file); + + if (uint8_t * buffer = (uint8_t *)malloc(max_block_size * sizeof(uint8_t))) + { + int64_t input_bytes = 0, output_bytes = 0; + + while (remaining_size > 0) + { + fprintf(stdout, "\rDecompressing 
%.55s(%02d%%)", input_file_name, (int)((input_bytes * 100) / (input_bytes + remaining_size))); + + int32_t block_size, comressed_size; + if (fread(&block_size, sizeof(uint8_t), sizeof(block_size), input_file) != sizeof(block_size)) + { + fprintf(stderr, "\nError: Unable to read input file!\n"); + break; + } + + if (fread(&comressed_size, sizeof(uint8_t), sizeof(comressed_size), input_file) != sizeof(comressed_size)) + { + fprintf(stderr, "\nError: Unable to read input file!\n"); + break; + } + + if (block_size > max_block_size || comressed_size > max_block_size) + { + fprintf(stderr, "\nError: The compressed data is corrupted!\n"); + break; + } + + if (fread(buffer, sizeof(uint8_t), comressed_size, input_file) != comressed_size) + { + fprintf(stderr, "\nError: Unable to read input file!\n"); + break; + } + + int32_t decomressed_size = decompress_memory_block(buffer, comressed_size, block_size); + if (decomressed_size != block_size) { break; } + + if (fwrite(buffer, sizeof(uint8_t), decomressed_size, output_file) != decomressed_size) + { + fprintf(stderr, "\nError: Unable to write output file\n"); + break; + } + + remaining_size -= sizeof(block_size) + sizeof(comressed_size) + comressed_size; + input_bytes += sizeof(block_size) + sizeof(comressed_size) + comressed_size; + output_bytes += decomressed_size; + } + + if (remaining_size == 0) + { + fprintf(stdout, "\r%.55s decompressed from %lld into %lld in %.3f seconds.\n", input_file_name, input_bytes, output_bytes, ((double)clock() - start_time) / CLOCKS_PER_SEC); + } + + free(buffer); + } + else + { + fprintf(stderr, "Error: Not enough memory!\n"); + } + } + else + { + fprintf(stderr, "Error: Unable to read input file!\n"); + } + + fclose(output_file); + } + else + { + fprintf(stderr, "Error: Unable to open output file!\n"); + } + + fclose(input_file); + } + else + { + fprintf(stderr, "Error: Unable to open input file!\n"); + } + + return 0; +} + +static int print_usage() +{ + fprintf(stdout, "Usage: bsc-m03 input-file output-file \n"); + fprintf(stdout, " -b Block size in bytes, default 128MB (memory usage is ~15x).\n"); + fprintf(stdout, " -w<8|16> Symbol width in bits.\n"); + + return 0; +} + +int main(int argc, const char * argv[]) +{ + fprintf(stdout, "bsc-m03 is experimental block sorting compressor. Version 0.1.0 (3 December 2021).\n"); + fprintf(stdout, "Copyright (c) 2021 Ilya Grebnov . 
ABSOLUTELY NO WARRANTY.\n"); + fprintf(stdout, "This program is based on (at least) the work of Michael Maniscalco and Atsushi Komiya.\n\n"); + + int32_t max_block_size = 128 * 1024 * 1024; + int32_t symbol_width = 8; + + if (argc < 4 || strlen(argv[1]) != 1) + { + return print_usage(); + } + + for (int32_t i = 4; i < argc; ++i) + { + if (argv[i][0] != '-') + { + return print_usage(); + } + + switch (argv[i][1]) + { + case 'b': + { + max_block_size = atoi(argv[i] + 2); + if (max_block_size <= 0) { return print_usage(); } + + break; + } + + case 'w': + { + symbol_width = atoi(argv[i] + 2); + if (symbol_width != 8 && symbol_width != 16) { return print_usage(); } + + break; + } + + default: + { + return print_usage(); + } + } + } + + switch (argv[1][0]) + { + case 'c': + case 'C': + case 'e': + case 'E': + { + return compress_file(argv[2], argv[3], max_block_size, symbol_width / 8); + } + + case 'd': + case 'D': + { + if (argc != 4) { return print_usage(); } + + return decompress_file(argv[2], argv[3]); + } + + default: + { + return print_usage(); + } + } + + return 0; +} + +#pragma warning( pop ) \ No newline at end of file diff --git a/common/LICENSE b/common/LICENSE new file mode 100644 index 0000000..d645695 --- /dev/null +++ b/common/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/common/platform.h b/common/platform.h new file mode 100644 index 0000000..1252057 --- /dev/null +++ b/common/platform.h @@ -0,0 +1,125 @@ +/*-----------------------------------------------------------*/ +/* Block Sorting, Lossless Data Compression Library. */ +/* Interface to platform specific functions and constants */ +/*-----------------------------------------------------------*/ + +/*-- + +This file is a part of bsc and/or libbsc, a program and a library for +lossless, block-sorting data compression. + + Copyright (c) 2009-2021 Ilya Grebnov + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ See the License for the specific language governing permissions and + limitations under the License. + +Please see the file LICENSE for full copyright information. + +See also the bsc and libbsc web site: + http://libbsc.com/ for more information. + +--*/ + +/*-- + +NOTICE: This file has been modified for use in the bsc-m03 project. + +--*/ + +#ifndef _LIBBSC_PLATFORM_H +#define _LIBBSC_PLATFORM_H + +#if defined(_MSC_VER) + #include +#else + #include +#endif + +#if defined(__GNUC__) + #define INLINE __inline__ +#elif defined(_MSC_VER) + #define INLINE __forceinline +#elif defined(__IBMC__) + #define INLINE _Inline +#elif defined(__cplusplus) + #define INLINE inline +#else + #define INLINE /* */ +#endif + +#if defined(_MSC_VER) + #define NOINLINE __declspec(noinline) +#elif defined(__GNUC__) + #define NOINLINE __attribute__ ((noinline)) +#else + #define NOINLINE /* */ +#endif + +#if defined(_MSC_VER) + #define ALIGNED(x) __declspec(align(x)) +#elif defined(__GNUC__) + #define ALIGNED(x) __attribute__ ((aligned(x))) +#endif + +#if defined(__GNUC__) || defined(__clang__) || defined(__CUDACC__) + #define RESTRICT __restrict__ +#elif defined(_MSC_VER) || defined(__INTEL_COMPILER) + #define RESTRICT __restrict +#else + #define RESTRICT /* */ +#endif + +#if defined(__GNUC__) || defined(__clang__) + #define byteswap_uint64(x) (__builtin_bswap64(x)) + #define bit_scan_reverse(x) (__builtin_clz(x) ^ 31) + #define bit_scan_forward(x) (__builtin_ctz(x)) + #define bit_scan_forward64(x) (__builtin_ctzll(x)) +#elif defined(_MSC_VER) + #define byteswap_uint64(x) (_byteswap_uint64(x)) + + #pragma intrinsic(_BitScanReverse) + #pragma intrinsic(_BitScanForward) + + static inline __forceinline unsigned long bit_scan_reverse(unsigned long x) + { + unsigned long index; + _BitScanReverse(&index, x); + return index; + } + + static inline __forceinline unsigned long bit_scan_forward(unsigned long x) + { + unsigned long index; + _BitScanForward(&index, x); + return index; + } +#endif + + static INLINE unsigned int next_power_of_2(unsigned int v) + { + v--; + + v |= v >> 1; + v |= v >> 2; + v |= v >> 4; + v |= v >> 8; + v |= v >> 16; + v++; + + return v; + } + +#endif + +/*-----------------------------------------------------------*/ +/* End platform.h */ +/*-----------------------------------------------------------*/ diff --git a/common/rangecoder.h b/common/rangecoder.h new file mode 100644 index 0000000..4a5d6a6 --- /dev/null +++ b/common/rangecoder.h @@ -0,0 +1,238 @@ +/*-----------------------------------------------------------*/ +/* Block Sorting, Lossless Data Compression Library. */ +/* Range coder */ +/*-----------------------------------------------------------*/ + +/*-- + +This file is a part of bsc and/or libbsc, a program and a library for +lossless, block-sorting data compression. + + Copyright (c) 2009-2021 Ilya Grebnov + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + +Please see the file LICENSE for full copyright information. + +See also the bsc and libbsc web site: + http://libbsc.com/ for more information. 
+ +--*/ + +/*-- + +NOTICE: This file has been modified for use in the bsc-m03 project. + +--*/ + +#ifndef _LIBBSC_CODER_RANGECODER_H +#define _LIBBSC_CODER_RANGECODER_H + +#include "platform.h" + +class RangeCoder +{ + +private: + + union ari + { + struct u + { + unsigned int low32; + unsigned int carry; + } u; + unsigned long long low; + } ari; + + unsigned int ari_code; + unsigned int ari_ffnum; + unsigned int ari_cache; + unsigned int ari_range; + + const unsigned char * RESTRICT ari_input; + unsigned char * RESTRICT ari_output; + unsigned char * RESTRICT ari_outputEOB; + unsigned char * RESTRICT ari_outputStart; + + INLINE void OutputByte(unsigned char s) + { + *ari_output++ = s; + }; + + INLINE unsigned char InputByte() + { + return *ari_input++; + }; + + NOINLINE unsigned int ShiftLow() + { + if (ari.u.low32 < 0xff000000U || ari.u.carry) + { + OutputByte(ari_cache + ari.u.carry); + if (ari_ffnum) + { + unsigned char s = ari.u.carry - 1; + do { OutputByte(s); } while (--ari_ffnum); + } + ari_cache = ari.u.low32 >> 24; ari.u.carry = 0; + } + else + { + ari_ffnum++; + } + + ari.u.low32 <<= 8; return ari_range << 8; + } + +public: + + INLINE void InitEncoder(unsigned char * output, int outputSize) + { + ari_outputStart = output; + ari_output = output; + ari_outputEOB = output + outputSize - 16; + ari.low = 0; + ari_ffnum = 0; + ari_cache = 0; + ari_range = 0xffffffff; + }; + + INLINE int FinishEncoder() + { + ShiftLow(); ShiftLow(); ShiftLow(); ShiftLow(); ShiftLow(); + return (int)(ari_output - ari_outputStart); + } + + INLINE void Encode(unsigned int cum_freq, unsigned int sym_freq, unsigned int total_freq) + { + unsigned int range = ari_range / total_freq; + ari.low += (unsigned long long)cum_freq * range; ari_range = sym_freq * range; + + while (ari_range < 0x1000000) { ari_range = ShiftLow(); } + } + + template INLINE unsigned int EncodeBit(unsigned int bit, int probability) + { + unsigned int range = (((unsigned long long)ari_range) * probability) >> P; + ari.low = ari.low + ((~bit + 1u) & range); + ari_range = range + ((~bit + 1u) & (ari_range - range - range)); + + while (ari_range < 0x1000000) { ari_range = ShiftLow(); } + + return bit; + } + + INLINE unsigned int EncodeValue(unsigned int min, unsigned int value, unsigned int max) + { + assert(min <= value && value <= max); + + while (max - min >= 0x10000) + { + unsigned int median = min + ((max - min) >> 1); + if (value > median) + { + EncodeBit<1>(1, 1); + min = median + 1; + } + else + { + EncodeBit<1>(0, 1); + max = median; + } + } + + if (min != max) + { + Encode(value - min, 1, max - min + 1); + } + + return value; + } + + INLINE void InitDecoder(const unsigned char * input) + { + ari_input = input; + ari_code = 0; + ari_range = 0xffffffff; + ari_code = (ari_code << 8) | InputByte(); + ari_code = (ari_code << 8) | InputByte(); + ari_code = (ari_code << 8) | InputByte(); + ari_code = (ari_code << 8) | InputByte(); + ari_code = (ari_code << 8) | InputByte(); + }; + + INLINE unsigned int GetCumFreq(unsigned int total_freq) + { + while (ari_range < 0x1000000) + { + ari_range <<= 8; ari_code = (ari_code << 8) | InputByte(); + } + + return ari_code / (ari_range / total_freq); + } + + INLINE void Decode(unsigned int cum_freq, unsigned int sym_freq, unsigned int total_freq) + { + unsigned int range = ari_range / total_freq; + ari_code -= cum_freq * range; ari_range = sym_freq * range; + } + + template INLINE int DecodeBit(int probability) + { + while (ari_range < 0x1000000) + { + ari_range <<= 8; ari_code = (ari_code << 8) | 
InputByte(); + } + + unsigned int range = (((unsigned long long)ari_range) * probability) >> P; + int bit = ari_code >= range; + + ari_range = bit ? ari_range - range : range; + ari_code = bit ? ari_code - range : ari_code; + + return bit; + } + + INLINE unsigned int DecodeValue(unsigned int min, unsigned int max) + { + assert(min <= max); + + while (max - min >= 0x10000) + { + unsigned int median = min + ((max - min) >> 1); + if (DecodeBit<1>(1)) + { + min = median + 1; + } + else + { + max = median; + } + } + + if (min != max) + { + unsigned int cum_freq = GetCumFreq(max - min + 1); + Decode(cum_freq, 1, max - min + 1); min += cum_freq; + } + + return min; + } +}; + +#endif + +/*-----------------------------------------------------------*/ +/* End rangecoder.h */ +/*-----------------------------------------------------------*/ diff --git a/hutucker/LICENSE b/hutucker/LICENSE new file mode 100644 index 0000000..f288702 --- /dev/null +++ b/hutucker/LICENSE @@ -0,0 +1,674 @@ + GNU GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The GNU General Public License is a free, copyleft license for +software and other kinds of works. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. By contrast, +the GNU General Public License is intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. We, the Free Software Foundation, use the +GNU General Public License for most of our software; it applies also to +any other work released this way by its authors. You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + + To protect your rights, we need to prevent others from denying you +these rights or asking you to surrender the rights. Therefore, you have +certain responsibilities if you distribute copies of the software, or if +you modify it: responsibilities to respect the freedom of others. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must pass on to the recipients the same +freedoms that you received. You must make sure that they, too, receive +or can get the source code. And you must show them these terms so they +know their rights. + + Developers that use the GNU GPL protect your rights with two steps: +(1) assert copyright on the software, and (2) offer you this License +giving you legal permission to copy, distribute and/or modify it. + + For the developers' and authors' protection, the GPL clearly explains +that there is no warranty for this free software. For both users' and +authors' sake, the GPL requires that modified versions be marked as +changed, so that their problems will not be attributed erroneously to +authors of previous versions. + + Some devices are designed to deny users access to install or run +modified versions of the software inside them, although the manufacturer +can do so. 
This is fundamentally incompatible with the aim of +protecting users' freedom to change the software. The systematic +pattern of such abuse occurs in the area of products for individuals to +use, which is precisely where it is most unacceptable. Therefore, we +have designed this version of the GPL to prohibit the practice for those +products. If such problems arise substantially in other domains, we +stand ready to extend this provision to those domains in future versions +of the GPL, as needed to protect the freedom of users. + + Finally, every program is threatened constantly by software patents. +States should not allow patents to restrict development and use of +software on general-purpose computers, but in those that do, we wish to +avoid the special danger that patents applied to a free program could +make it effectively proprietary. To prevent this, the GPL assures that +patents cannot be used to render the program non-free. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. + + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. 
+ + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. + + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. 
+ + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. + + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. + + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. + + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. + + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. 
+ + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. + + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. + + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. 
+ + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. + + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. 
+ + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. + + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. + + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. + + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. 
If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. + + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. + + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). 
To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. + + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Use with the GNU Affero General Public License. 
+ + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU Affero General Public License into a single +combined work, and to convey the resulting work. The terms of this +License will continue to apply to the part which is the covered work, +but the special requirements of the GNU Affero General Public License, +section 13, concerning interaction through a network will apply to the +combination as such. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU General Public License, you may choose any version ever published +by the Free Software Foundation. + + If the Program specifies that a proxy can decide which future +versions of the GNU General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. + + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. 
+ + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + + Copyright (C) + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . + +Also add information on how to contact you by electronic and paper mail. + + If the program does terminal interaction, make it output a short +notice like this when it starts in an interactive mode: + + Copyright (C) + This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, your program's commands +might be different; for a GUI interface, you would use an "about box". + + You should also get your employer (if you work as a programmer) or school, +if any, to sign a "copyright disclaimer" for the program, if necessary. +For more information on this, and how to apply and follow the GNU GPL, see +. + + The GNU General Public License does not permit incorporating your program +into proprietary programs. If your program is a subroutine library, you +may consider it more useful to permit linking proprietary applications with +the library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. But first, please read +. diff --git a/hutucker/README b/hutucker/README new file mode 100644 index 0000000..e104713 --- /dev/null +++ b/hutucker/README @@ -0,0 +1,88 @@ +This is an O(n log n) implementation of Hu-Tucker coding.[1] + +This is the algorithm: +1. Label node 0, ..., n-1 'terminal' +2. Repeat (n - 1) times: + (a) Find the pair (i, j) such that + (i) i < j, + (ii) neither node i nor j is labeled 'none', + (iii) none of node i+1, ..., j-1 is labeled 'terminal', + (iv) weight[i] + weight[j] are minimal, + (v) i is minimal if the selection is not unique after (iv), and + (vi) j is minimal if the selection is not unique after (v) + (b) Merge node i with node j, and saves it as new node i + (c) weight[i] += weight[j] + (d) Label node i 'internal' + (e) Label node j 'none' +3. A tree has been built with root being node 0. + Traverse this tree for length of code. + This tree is not alphabetical. + Nevertheless, the length of code produced by the tree is correct. + +See example.c for computing the actual code from the length. + +We need a non-trivial data structure to implement 2(a) efficiently. 
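Before the data structure is described, it may help to see step 2(a) spelled out directly. The C sketch below is illustrative only and is not part of the hutucker sources: the function name find_pair_naive, the label encoding (0 = 'none', 1 = 'terminal', 2 = 'internal') and the in/out parameters are assumptions made here. It performs the selection by brute force, which costs O(n^2) per merge; the segment tree described next replaces this with an O(1) lookup at the root plus O(log n) updates per merge.

    #include <stddef.h>

    /* Illustrative sketch of step 2(a): pick the valid pair (i, j) with the
     * smallest weight sum, breaking ties by smaller i, then smaller j.
     * label[k]: 0 = 'none', 1 = 'terminal', 2 = 'internal' (assumed encoding).
     * Returns 1 and stores the pair in (*pi, *pj), or 0 if no pair exists. */
    static int find_pair_naive(size_t n, const unsigned long weight[],
                               const int label[], size_t *pi, size_t *pj)
    {
        int found = 0;
        unsigned long best = 0;

        for (size_t i = 0; i < n; i++) {
            if (label[i] == 0)              /* rule (ii): node i must not be 'none' */
                continue;
            for (size_t j = i + 1; j < n; j++) {
                if (label[j] != 0) {        /* rule (ii) for node j */
                    unsigned long sum = weight[i] + weight[j];
                    /* rules (iv)-(vi): strict '<' plus the ascending (i, j)
                     * scan order keeps minimal sum, then minimal i, then j. */
                    if (!found || sum < best) {
                        found = 1;
                        best = sum;
                        *pi = i;
                        *pj = j;
                    }
                }
                if (label[j] == 1)          /* rule (iii): a 'terminal' at j blocks
                                               every k > j from pairing with i. */
                    break;
            }
        }
        return found;
    }

Calling this inside the loop of step 2 produces the same merges as the efficient version, just with a much worse running time.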
+This is the data structure:
+1. It is a perfect binary tree.
+   The nodes in this tree are called "segnodes" to distinguish them from
+   nodes in the coding tree.
+   This tree shall have at least n leaf segnodes.
+2. Each segnode is implicitly associated with a range [a, b).
+   The range of the leaf segnode i is [i, i+1).
+   The range of each internal segnode is the union of the ranges of its children.
+   (Alternatively, the range of each internal segnode is the union of
+   the ranges of all leaf segnodes in its subtree.)
+3. Each segnode also has 6 explicit fields (n, m, l, r, i, j).
+   n: The number of nodes in [a, b) labeled 'terminal' or 'internal'
+   m: The number of nodes in [a, b) labeled 'terminal'
+   l: The index such that:
+      (i) l in [a, b),
+      (ii) node l is not labeled 'none',
+      (iii) none of node a, ..., l-1 is labeled 'terminal',
+      (iv) weight[l] is minimal, and
+      (v) l is minimal if the selection is not unique after (iv)
+   r: The index such that:
+      (i) r in [a, b),
+      (ii) node r is not labeled 'none',
+      (iii) none of node r+1, ..., b-1 is labeled 'terminal',
+      (iv) weight[r] is minimal, and
+      (v) r is minimal if the selection is not unique after (iv)
+   i, j: The pair of indices such that:
+      (i) a <= i < j < b,
+      (ii) neither node i nor j is labeled 'none',
+      (iii) none of node i+1, ..., j-1 is labeled 'terminal',
+      (iv) weight[i] + weight[j] are minimal,
+      (v) i is minimal if the selection is not unique after (iv), and
+      (vi) j is minimal if the selection is not unique after (v)
+4. The explicit fields can be trivially computed for leaf segnodes:
+   (a) Leaf segnode i labeled 'terminal':
+       (n, m, l, r, i, j) = (1, 1, i, i, None, None)
+   (b) Leaf segnode i labeled 'internal':
+       (n, m, l, r, i, j) = (1, 0, i, i, None, None)
+   (c) Leaf segnode i labeled 'none':
+       (n, m, l, r, i, j) = (0, 0, None, None, None, None)
+5. The explicit fields can be efficiently computed for internal segnodes,
+   if we have access to the correct fields of its child segnodes.
+   Let its left child be L, and its right child be R.
+   n: L.n + R.n
+   m: L.m + R.m
+   l: L.l if L.m > 0, otherwise the better of L.l and R.l
+   r: R.r if R.m > 0, otherwise the better of L.r and R.r
+   i, j: the best of (L.i, L.j), (L.r, R.l) and (R.i, R.j)
+
+Analysis:
+1. This data structure can be built in O(n).
+2. The (i, j) in step 2(a) is the (i, j) of the root of the data structure,
+   which can be looked up in O(1).
+3. When the weight[i] and label of node i change,
+   leaf segnode i and its ancestors need to be updated.
+   That's O(log n) updates and O(1) per update.
+   Same for node j.
+4. Step 2 is repeated O(n) times.
+   Other parts are trivial.
+   Therefore, the overall time is O(n log n).
+
+
+[1]: Hu, T. C.; Tucker, A. C. (1971) "Optimal Computer Search Trees
+     and Variable-Length Alphabetical Codes", SIAM Journal on
+     Applied Mathematics, 21 (4): 514.
diff --git a/hutucker/example.c b/hutucker/example.c
new file mode 100644
index 0000000..34dd53f
--- /dev/null
+++ b/hutucker/example.c
@@ -0,0 +1,84 @@
+/*
+ * Linearithmic Hu-Tucker Coding.
+ * Copyright (C) 2018 Pochang Chen
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <https://www.gnu.org/licenses/>.
+ */
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "hu-tucker.h"
+
+int main() {
+    size_t n;
+    if (scanf("%zu", &n) != 1)
+        return 1;
+    if (n < 1) {
+        errno = EINVAL;
+        perror(NULL);
+        return 1;
+    }
+
+    unsigned long *weight = calloc(n, sizeof(unsigned long));
+    if (!weight) {
+        perror("calloc");
+        return 1;
+    }
+
+    for (size_t i = 0; i < n; i++)
+        scanf("%lu", weight + i);
+
+    unsigned long sumweight = 0;
+    for (size_t i = 0; i < n; i++) {
+        sumweight += weight[i];
+        if (sumweight < weight[i]) {
+            errno = EOVERFLOW;
+            perror(NULL);
+            return 1;
+        }
+    }
+
+    unsigned long *tmp = malloc(hutucker_tmp_size(n));
+    if (!tmp) {
+        perror("malloc");
+        return 1;
+    }
+    hutucker_get_lengths(n, weight, tmp);
+    free(tmp);
+
+    unsigned long maxlength = 0;
+    for (size_t i = 0; i < n; i++)
+        if (weight[i] > maxlength)
+            maxlength = weight[i];
+
+    unsigned char *str = malloc(maxlength + 1);
+    if (!str) {
+        perror("malloc");
+        return 1;
+    }
+    for (size_t i = 0, l = 0; i < n; i++) {
+        if (l < weight[i])
+            memset(str + l, '0', weight[i] - l);
+        l = weight[i];
+        str[l] = '\0';
+        puts(str);
+        for (size_t j = l - 1; j != (size_t) -1; j--)
+            if ((str[j] ^= '0' ^ '1') == '1')
+                break;
+    }
+
+    free(str);
+    free(weight);
+}
diff --git a/hutucker/hu-tucker.c b/hutucker/hu-tucker.c
new file mode 100644
index 0000000..82fe712
--- /dev/null
+++ b/hutucker/hu-tucker.c
@@ -0,0 +1,128 @@
+/*
+ * Linearithmic Hu-Tucker Coding.
+ * Copyright (C) 2018 Pochang Chen
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+/*--
+
+NOTICE: This file has been modified for use in the bsc-m03 project.
+
+--*/
+
+#include "hu-tucker.h"
+
+typedef struct {
+    // number of (terminal or internal) nodes under this segnode
+    size_t n;
+    // number of terminal nodes under this segnode, if n >= 1
+    size_t m;
+    // index of minimum weight in the leftmost block, if n >= 1
+    size_t l;
+    // index of minimum weight in the rightmost block, if n >= 1
+    size_t r;
+    // indices of minimum weight pair in the same block, if n >= 2
+    size_t i, j;
+} segnode;
+
+static void segupdate(segnode *pa, segnode *lc, segnode *rc, unsigned long *w) {
+    if (!lc->n) {
+        *pa = *rc;
+        return;
+    }
+    if (!rc->n) {
+        *pa = *lc;
+        return;
+    }
+    pa->n = lc->n + rc->n;
+    pa->m = lc->m + rc->m;
+    pa->l = ( lc->m || w[lc->l] <= w[rc->l]) ? lc->l : rc->l;
+    pa->r = (!rc->m && w[lc->r] <= w[rc->r]) ? lc->r : rc->r;
lc->r : rc->r; + pa->i = lc->r; + pa->j = rc->l; + if (lc->n >= 2 && w[lc->i] + w[lc->j] <= w[pa->i] + w[pa->j]) { + pa->i = lc->i; + pa->j = lc->j; + } + if (rc->n >= 2 && w[rc->i] + w[rc->j] < w[pa->i] + w[pa->j]) { + pa->i = rc->i; + pa->j = rc->j; + } +} +static void segterminal(segnode *x, size_t id) { + x->n = x->m = 1; + x->l = x->r = id; +} +static void seginternal(segnode *x, size_t id) { + x->n = 1; + x->m = 0; + x->l = x->r = id; +} +static void segnone(segnode *x) { + x->n = 0; +} +static size_t raise_power_of_two(size_t n) { + size_t ans = 1; + while (ans < n) + ans *= 2; + return ans; +} + +size_t hutucker_tmp_size(size_t n) { + // TODO check overflow for very large n + size_t m = raise_power_of_two(n); + return sizeof(segnode) * (2 * m - 1) + + sizeof(size_t) * (n + (2 * n - 1) + (2 * n - 1)); +} + +void hutucker_get_lengths(size_t n, unsigned long *weight, void *tmp) { + size_t m = raise_power_of_two(n); + segnode *seg = (segnode *) tmp; + size_t *cur = (size_t *) (seg + 2 * m - 1); + size_t *pa = (size_t *) (cur + n); + size_t *level = (size_t *) (pa + 2 * n - 1); + + for (size_t i = 0; i < n; i++) { + segterminal(seg + m - 1 + i, i); + cur[i] = i; + } + for (size_t i = n; i < m; i++) + segnone(seg + m - 1 + i); + + for (size_t i = m - 2; i != (size_t) -1; i--) + segupdate(seg + i, seg + 2 * i + 1, seg + 2 * i + 2, weight); + + for (size_t k = 0; k < n - 1; k++) { + size_t i = seg->i, j = seg->j; + + weight[i] += weight[j]; + pa[cur[i]] = pa[cur[j]] = n + k; + cur[i] = n + k; + + seginternal(seg + m - 1 + i, i); + for (size_t l = m + i; l /= 2; ) + segupdate(seg + l - 1, seg + 2 * l - 1, seg + 2 * l, weight); + + segnone(seg + m - 1 + j); + for (size_t l = m + j; l /= 2; ) + segupdate(seg + l - 1, seg + 2 * l - 1, seg + 2 * l, weight); + } + + level[2 * n - 2] = 0; + for (size_t i = 2 * n - 3; i != (size_t) -1; i--) + level[i] = level[pa[i]] + 1; + for (size_t i = 0; i < n; i++) + weight[i] = (unsigned long)level[i]; +} diff --git a/hutucker/hu-tucker.h b/hutucker/hu-tucker.h new file mode 100644 index 0000000..b637f4f --- /dev/null +++ b/hutucker/hu-tucker.h @@ -0,0 +1,56 @@ +/* + * Linearithmic Hu-Tucker Coding. + * Copyright (C) 2018 Pochang Chen + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +/*-- + +NOTICE: This file has been modified for use in the bsc-m03 project. + +--*/ + +#pragma once + +#ifdef __cplusplus +extern "C" { +#endif + +#include + +/** + * This algorithm needs some temporary memories to work. + * This function computes how much temporary memories are needed. + */ +size_t hutucker_tmp_size(size_t n); + +/** + * Given the weight of n symbols, determine the length of hu-tucker code + * of each symbols. + * + * Precondition: + * n: number of symbols + * weight[i] (0 <= i < n): the weight of symbol i + * weight[0] + ... 
+ weight[n - 1] must not exceed ULONG_MAX + * tmp: buffer with size >= hutucker_tmp_size(n) + * + * Postcondition: + * weight[i] (0 <= i < n): the length of hu-tucker code of symbol i + */ +void hutucker_get_lengths(size_t n, unsigned long *weight, void *tmp); + +#ifdef __cplusplus +} +#endif diff --git a/libsais/CHANGES b/libsais/CHANGES new file mode 100644 index 0000000..6d0b176 --- /dev/null +++ b/libsais/CHANGES @@ -0,0 +1,23 @@ +Changes in 2.6.0 (October 21, 2021) +- libsais16 for 16-bit inputs. + +Changes in 2.5.0 (October 15, 2021) +- Support for optional symbol frequency tables. + +Changes in 2.4.0 (July 14, 2021) +- Reverse Burrows-Wheeler transform. + +Changes in 2.3.0 (June 23, 2021) +- Burrows-Wheeler transform with auxiliary indexes. + +Changes in 2.2.0 (April 27, 2021) +- libsais64 for inputs larger than 2GB. + +Changes in 2.1.0 (April 19, 2021) +- Additional OpenMP acceleration. + +Changes in 2.0.0 (April 4, 2021) +- OpenMP acceleration. + +Changes in 1.0.0 (February 23, 2021) +- Initial Release. diff --git a/libsais/LICENSE b/libsais/LICENSE new file mode 100644 index 0000000..d645695 --- /dev/null +++ b/libsais/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/libsais/VERSION b/libsais/VERSION new file mode 100644 index 0000000..914ec96 --- /dev/null +++ b/libsais/VERSION @@ -0,0 +1 @@ +2.6.0 \ No newline at end of file diff --git a/libsais/libsais.c b/libsais/libsais.c new file mode 100644 index 0000000..885bd82 --- /dev/null +++ b/libsais/libsais.c @@ -0,0 +1,7599 @@ +/*-- + +This file is a part of libsais, a library for linear time +suffix array and burrows wheeler transform construction. + + Copyright (c) 2021 Ilya Grebnov + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ See the License for the specific language governing permissions and + limitations under the License. + +Please see the file LICENSE for full copyright information. + +--*/ + +#include "libsais_internal.h" + +#include "libsais.h" + +#include +#include +#include +#include +#include + +#if defined(_OPENMP) + #include +#else + #define UNUSED(_x) (void)(_x) +#endif + +typedef int32_t sa_sint_t; +typedef uint32_t sa_uint_t; +typedef ptrdiff_t fast_sint_t; +typedef size_t fast_uint_t; + +#define SAINT_BIT (32) +#define SAINT_MAX INT32_MAX +#define SAINT_MIN INT32_MIN + +#define ALPHABET_SIZE (1 << CHAR_BIT) +#define UNBWT_FASTBITS (17) + +#define SUFFIX_GROUP_BIT (SAINT_BIT - 1) +#define SUFFIX_GROUP_MARKER (((sa_sint_t)1) << (SUFFIX_GROUP_BIT - 1)) + +#define BUCKETS_INDEX2(_c, _s) (((_c) << 1) + (_s)) +#define BUCKETS_INDEX4(_c, _s) (((_c) << 2) + (_s)) + +#define LIBSAIS_PER_THREAD_CACHE_SIZE (24576) + +typedef struct LIBSAIS_THREAD_CACHE +{ + sa_sint_t symbol; + sa_sint_t index; +} LIBSAIS_THREAD_CACHE; + +typedef union LIBSAIS_THREAD_STATE +{ + struct + { + fast_sint_t position; + fast_sint_t count; + + fast_sint_t m; + fast_sint_t last_lms_suffix; + + sa_sint_t * buckets; + LIBSAIS_THREAD_CACHE * cache; + } state; + + uint8_t padding[64]; +} LIBSAIS_THREAD_STATE; + +typedef struct LIBSAIS_CONTEXT +{ + sa_sint_t * buckets; + LIBSAIS_THREAD_STATE * thread_state; + fast_sint_t threads; +} LIBSAIS_CONTEXT; + +typedef struct LIBSAIS_UNBWT_CONTEXT +{ + sa_uint_t * bucket2; + uint16_t * fastbits; + sa_uint_t * buckets; + fast_sint_t threads; +} LIBSAIS_UNBWT_CONTEXT; + +#if defined(__GNUC__) || defined(__clang__) + #define RESTRICT __restrict__ +#elif defined(_MSC_VER) || defined(__INTEL_COMPILER) + #define RESTRICT __restrict +#else + #error Your compiler, configuration or platform is not supported. +#endif + +#if defined(__has_builtin) + #if __has_builtin(__builtin_prefetch) + #define HAS_BUILTIN_PREFECTCH + #endif +#elif defined(__GNUC__) && __GNUC__ > 3 + #define HAS_BUILTIN_PREFECTCH +#endif + +#if defined(HAS_BUILTIN_PREFECTCH) + #define libsais_prefetch(address) __builtin_prefetch((const void *)(address), 0, 0) + #define libsais_prefetchw(address) __builtin_prefetch((const void *)(address), 1, 0) +#elif defined (_M_IX86) || defined (_M_AMD64) + #include + #define libsais_prefetch(address) _mm_prefetch((const void *)(address), _MM_HINT_NTA) + #define libsais_prefetchw(address) _m_prefetchw((const void *)(address)) +#elif defined (_M_ARM) + #include + #define libsais_prefetch(address) __prefetch((const void *)(address)) + #define libsais_prefetchw(address) __prefetchw((const void *)(address)) +#elif defined (_M_ARM64) + #include + #define libsais_prefetch(address) __prefetch2((const void *)(address), 1) + #define libsais_prefetchw(address) __prefetch2((const void *)(address), 17) +#else + #error Your compiler, configuration or platform is not supported. 
+#endif + +#if !defined(__LITTLE_ENDIAN__) && !defined(__BIG_ENDIAN__) + #if defined(_LITTLE_ENDIAN) \ + || (defined(BYTE_ORDER) && defined(LITTLE_ENDIAN) && BYTE_ORDER == LITTLE_ENDIAN) \ + || (defined(_BYTE_ORDER) && defined(_LITTLE_ENDIAN) && _BYTE_ORDER == _LITTLE_ENDIAN) \ + || (defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && __BYTE_ORDER == __LITTLE_ENDIAN) \ + || (defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) + #define __LITTLE_ENDIAN__ + #elif defined(_BIG_ENDIAN) \ + || (defined(BYTE_ORDER) && defined(BIG_ENDIAN) && BYTE_ORDER == BIG_ENDIAN) \ + || (defined(_BYTE_ORDER) && defined(_BIG_ENDIAN) && _BYTE_ORDER == _BIG_ENDIAN) \ + || (defined(__BYTE_ORDER) && defined(__BIG_ENDIAN) && __BYTE_ORDER == __BIG_ENDIAN) \ + || (defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + #define __BIG_ENDIAN__ + #elif defined(_WIN32) + #define __LITTLE_ENDIAN__ + #endif +#endif + +#if defined(__LITTLE_ENDIAN__) && !defined(__BIG_ENDIAN__) + #if defined(__GNUC__) || defined(__clang__) + #define libsais_bswap16(x) (__builtin_bswap16(x)) + #elif defined(_MSC_VER) && !defined(__INTEL_COMPILER) + #define libsais_bswap16(x) (_byteswap_ushort(x)) + #else + #define libsais_bswap16(x) ((uint16_t)(x >> 8) | (uint16_t)(x << 8)) + #endif +#elif !defined(__LITTLE_ENDIAN__) && defined(__BIG_ENDIAN__) + #define libsais_bswap16(x) (x) +#else + #error Your compiler, configuration or platform is not supported. +#endif + +static void * libsais_align_up(const void * address, size_t alignment) +{ + return (void *)((((ptrdiff_t)address) + ((ptrdiff_t)alignment) - 1) & (-((ptrdiff_t)alignment))); +} + +static void * libsais_alloc_aligned(size_t size, size_t alignment) +{ + void * address = malloc(size + sizeof(short) + alignment - 1); + if (address != NULL) + { + void * aligned_address = libsais_align_up((void *)((ptrdiff_t)address + (ptrdiff_t)(sizeof(short))), alignment); + ((short *)aligned_address)[-1] = (short)((ptrdiff_t)aligned_address - (ptrdiff_t)address); + + return aligned_address; + } + + return NULL; +} + +static void libsais_free_aligned(void * aligned_address) +{ + if (aligned_address != NULL) + { + free((void *)((ptrdiff_t)aligned_address - ((short *)aligned_address)[-1])); + } +} + +static LIBSAIS_THREAD_STATE * libsais_alloc_thread_state(sa_sint_t threads) +{ + LIBSAIS_THREAD_STATE * RESTRICT thread_state = (LIBSAIS_THREAD_STATE *)libsais_alloc_aligned((size_t)threads * sizeof(LIBSAIS_THREAD_STATE), 4096); + sa_sint_t * RESTRICT thread_buckets = (sa_sint_t *)libsais_alloc_aligned((size_t)threads * 4 * ALPHABET_SIZE * sizeof(sa_sint_t), 4096); + LIBSAIS_THREAD_CACHE * RESTRICT thread_cache = (LIBSAIS_THREAD_CACHE *)libsais_alloc_aligned((size_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE * sizeof(LIBSAIS_THREAD_CACHE), 4096); + + if (thread_state != NULL && thread_buckets != NULL && thread_cache != NULL) + { + fast_sint_t t; + for (t = 0; t < threads; ++t) + { + thread_state[t].state.buckets = thread_buckets; thread_buckets += 4 * ALPHABET_SIZE; + thread_state[t].state.cache = thread_cache; thread_cache += LIBSAIS_PER_THREAD_CACHE_SIZE; + } + + return thread_state; + } + + libsais_free_aligned(thread_cache); + libsais_free_aligned(thread_buckets); + libsais_free_aligned(thread_state); + return NULL; +} + +static void libsais_free_thread_state(LIBSAIS_THREAD_STATE * thread_state) +{ + if (thread_state != NULL) + { + libsais_free_aligned(thread_state[0].state.cache); + 
libsais_free_aligned(thread_state[0].state.buckets); + libsais_free_aligned(thread_state); + } +} + +static LIBSAIS_CONTEXT * libsais_create_ctx_main(sa_sint_t threads) +{ + LIBSAIS_CONTEXT * RESTRICT ctx = (LIBSAIS_CONTEXT *)libsais_alloc_aligned(sizeof(LIBSAIS_CONTEXT), 64); + sa_sint_t * RESTRICT buckets = (sa_sint_t *)libsais_alloc_aligned(8 * ALPHABET_SIZE * sizeof(sa_sint_t), 4096); + LIBSAIS_THREAD_STATE * RESTRICT thread_state = threads > 1 ? libsais_alloc_thread_state(threads) : NULL; + + if (ctx != NULL && buckets != NULL && (thread_state != NULL || threads == 1)) + { + ctx->buckets = buckets; + ctx->threads = threads; + ctx->thread_state = thread_state; + + return ctx; + } + + libsais_free_thread_state(thread_state); + libsais_free_aligned(buckets); + libsais_free_aligned(ctx); + return NULL; +} + +static void libsais_free_ctx_main(LIBSAIS_CONTEXT * ctx) +{ + if (ctx != NULL) + { + libsais_free_thread_state(ctx->thread_state); + libsais_free_aligned(ctx->buckets); + libsais_free_aligned(ctx); + } +} + +#if defined(_OPENMP) + +static sa_sint_t libsais_count_negative_marked_suffixes(sa_sint_t * RESTRICT SA, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + sa_sint_t count = 0; + + fast_sint_t i; for (i = omp_block_start; i < omp_block_start + omp_block_size; ++i) { count += (SA[i] < 0); } + + return count; +} + +static sa_sint_t libsais_count_zero_marked_suffixes(sa_sint_t * RESTRICT SA, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + sa_sint_t count = 0; + + fast_sint_t i; for (i = omp_block_start; i < omp_block_start + omp_block_size; ++i) { count += (SA[i] == 0); } + + return count; +} + +static void libsais_place_cached_suffixes(sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4) + { + libsais_prefetch(&cache[i + 2 * prefetch_distance]); + + libsais_prefetchw(&SA[cache[i + prefetch_distance + 0].symbol]); + libsais_prefetchw(&SA[cache[i + prefetch_distance + 1].symbol]); + libsais_prefetchw(&SA[cache[i + prefetch_distance + 2].symbol]); + libsais_prefetchw(&SA[cache[i + prefetch_distance + 3].symbol]); + + SA[cache[i + 0].symbol] = cache[i + 0].index; + SA[cache[i + 1].symbol] = cache[i + 1].index; + SA[cache[i + 2].symbol] = cache[i + 2].index; + SA[cache[i + 3].symbol] = cache[i + 3].index; + } + + for (j += prefetch_distance + 3; i < j; i += 1) + { + SA[cache[i].symbol] = cache[i].index; + } +} + +static void libsais_compact_and_place_cached_suffixes(sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j, l; + for (i = omp_block_start, j = omp_block_start + omp_block_size - 3, l = omp_block_start; i < j; i += 4) + { + libsais_prefetchw(&cache[i + prefetch_distance]); + + cache[l] = cache[i + 0]; l += cache[l].symbol >= 0; + cache[l] = cache[i + 1]; l += cache[l].symbol >= 0; + cache[l] = cache[i + 2]; l += cache[l].symbol >= 0; + cache[l] = cache[i + 3]; l += cache[l].symbol >= 0; + } + + for (j += 3; i < j; i += 1) + { + cache[l] = cache[i]; l += cache[l].symbol >= 0; + } + + libsais_place_cached_suffixes(SA, cache, omp_block_start, l - omp_block_start); +} + +static void libsais_accumulate_counts_s32_2(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size, fast_sint_t 
bucket_stride) +{ + sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride; + fast_sint_t s; for (s = 0; s < bucket_size; s += 1) { bucket00[s] = bucket00[s] + bucket01[s]; } +} + +static void libsais_accumulate_counts_s32_3(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size, fast_sint_t bucket_stride) +{ + sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride; + sa_sint_t * RESTRICT bucket02 = bucket01 - bucket_stride; + fast_sint_t s; for (s = 0; s < bucket_size; s += 1) { bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s]; } +} + +static void libsais_accumulate_counts_s32_4(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size, fast_sint_t bucket_stride) +{ + sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride; + sa_sint_t * RESTRICT bucket02 = bucket01 - bucket_stride; + sa_sint_t * RESTRICT bucket03 = bucket02 - bucket_stride; + fast_sint_t s; for (s = 0; s < bucket_size; s += 1) { bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s] + bucket03[s]; } +} + +static void libsais_accumulate_counts_s32_5(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size, fast_sint_t bucket_stride) +{ + sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride; + sa_sint_t * RESTRICT bucket02 = bucket01 - bucket_stride; + sa_sint_t * RESTRICT bucket03 = bucket02 - bucket_stride; + sa_sint_t * RESTRICT bucket04 = bucket03 - bucket_stride; + fast_sint_t s; for (s = 0; s < bucket_size; s += 1) { bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s] + bucket03[s] + bucket04[s]; } +} + +static void libsais_accumulate_counts_s32_6(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size, fast_sint_t bucket_stride) +{ + sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride; + sa_sint_t * RESTRICT bucket02 = bucket01 - bucket_stride; + sa_sint_t * RESTRICT bucket03 = bucket02 - bucket_stride; + sa_sint_t * RESTRICT bucket04 = bucket03 - bucket_stride; + sa_sint_t * RESTRICT bucket05 = bucket04 - bucket_stride; + fast_sint_t s; for (s = 0; s < bucket_size; s += 1) { bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s] + bucket03[s] + bucket04[s] + bucket05[s]; } +} + +static void libsais_accumulate_counts_s32_7(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size, fast_sint_t bucket_stride) +{ + sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride; + sa_sint_t * RESTRICT bucket02 = bucket01 - bucket_stride; + sa_sint_t * RESTRICT bucket03 = bucket02 - bucket_stride; + sa_sint_t * RESTRICT bucket04 = bucket03 - bucket_stride; + sa_sint_t * RESTRICT bucket05 = bucket04 - bucket_stride; + sa_sint_t * RESTRICT bucket06 = bucket05 - bucket_stride; + fast_sint_t s; for (s = 0; s < bucket_size; s += 1) { bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s] + bucket03[s] + bucket04[s] + bucket05[s] + bucket06[s]; } +} + +static void libsais_accumulate_counts_s32_8(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size, fast_sint_t bucket_stride) +{ + sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride; + sa_sint_t * RESTRICT bucket02 = bucket01 - bucket_stride; + sa_sint_t * RESTRICT bucket03 = bucket02 - bucket_stride; + sa_sint_t * RESTRICT bucket04 = bucket03 - bucket_stride; + sa_sint_t * RESTRICT bucket05 = bucket04 - bucket_stride; + sa_sint_t * RESTRICT bucket06 = bucket05 - bucket_stride; + sa_sint_t * RESTRICT bucket07 = bucket06 - bucket_stride; + fast_sint_t s; for (s = 0; s < bucket_size; s += 1) { bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s] + bucket03[s] + bucket04[s] + bucket05[s] + bucket06[s] + bucket07[s]; } +} + +static void 
libsais_accumulate_counts_s32_9(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size, fast_sint_t bucket_stride) +{ + sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride; + sa_sint_t * RESTRICT bucket02 = bucket01 - bucket_stride; + sa_sint_t * RESTRICT bucket03 = bucket02 - bucket_stride; + sa_sint_t * RESTRICT bucket04 = bucket03 - bucket_stride; + sa_sint_t * RESTRICT bucket05 = bucket04 - bucket_stride; + sa_sint_t * RESTRICT bucket06 = bucket05 - bucket_stride; + sa_sint_t * RESTRICT bucket07 = bucket06 - bucket_stride; + sa_sint_t * RESTRICT bucket08 = bucket07 - bucket_stride; + fast_sint_t s; for (s = 0; s < bucket_size; s += 1) { bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s] + bucket03[s] + bucket04[s] + bucket05[s] + bucket06[s] + bucket07[s] + bucket08[s]; } +} + +static void libsais_accumulate_counts_s32(sa_sint_t * RESTRICT buckets, fast_sint_t bucket_size, fast_sint_t bucket_stride, fast_sint_t num_buckets) +{ + while (num_buckets >= 9) + { + libsais_accumulate_counts_s32_9(buckets - (num_buckets - 9) * bucket_stride, bucket_size, bucket_stride); num_buckets -= 8; + } + + switch (num_buckets) + { + case 1: break; + case 2: libsais_accumulate_counts_s32_2(buckets, bucket_size, bucket_stride); break; + case 3: libsais_accumulate_counts_s32_3(buckets, bucket_size, bucket_stride); break; + case 4: libsais_accumulate_counts_s32_4(buckets, bucket_size, bucket_stride); break; + case 5: libsais_accumulate_counts_s32_5(buckets, bucket_size, bucket_stride); break; + case 6: libsais_accumulate_counts_s32_6(buckets, bucket_size, bucket_stride); break; + case 7: libsais_accumulate_counts_s32_7(buckets, bucket_size, bucket_stride); break; + case 8: libsais_accumulate_counts_s32_8(buckets, bucket_size, bucket_stride); break; + } +} + +#endif + +static void libsais_gather_lms_suffixes_8u(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, fast_sint_t m, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + if (omp_block_size > 0) + { + const fast_sint_t prefetch_distance = 128; + + fast_sint_t i, j = omp_block_start + omp_block_size, c0 = T[omp_block_start + omp_block_size - 1], c1 = -1; + + while (j < n && (c1 = T[j]) == c0) { ++j; } + + fast_uint_t s = c0 >= c1; + + for (i = omp_block_start + omp_block_size - 2, j = omp_block_start + 3; i >= j; i -= 4) + { + libsais_prefetch(&T[i - prefetch_distance]); + + c1 = T[i - 0]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((s & 3) == 1); + c0 = T[i - 1]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 0); m -= ((s & 3) == 1); + c1 = T[i - 2]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 1); m -= ((s & 3) == 1); + c0 = T[i - 3]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 2); m -= ((s & 3) == 1); + } + + for (j -= 3; i >= j; i -= 1) + { + c1 = c0; c0 = T[i]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((s & 3) == 1); + } + + SA[m] = (sa_sint_t)(i + 1); + } +} + +static void libsais_gather_lms_suffixes_8u_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536 && omp_get_dynamic() == 0) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = 
omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(thread_state); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start; + + if (omp_num_threads == 1) + { + libsais_gather_lms_suffixes_8u(T, SA, n, (fast_sint_t)n - 1, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + fast_sint_t t, m = 0; for (t = omp_num_threads - 1; t > omp_thread_num; --t) { m += thread_state[t].state.m; } + + libsais_gather_lms_suffixes_8u(T, SA, n, (fast_sint_t)n - 1 - m, omp_block_start, omp_block_size); + + #pragma omp barrier + + if (thread_state[omp_thread_num].state.m > 0) + { + SA[(fast_sint_t)n - 1 - m] = (sa_sint_t)thread_state[omp_thread_num].state.last_lms_suffix; + } + } +#endif + } +} + +static sa_sint_t libsais_gather_lms_suffixes_32s(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n) +{ + const fast_sint_t prefetch_distance = 32; + + sa_sint_t i = n - 2; + sa_sint_t m = n - 1; + fast_uint_t s = 1; + fast_sint_t c0 = T[n - 1]; + fast_sint_t c1 = 0; + + for (; i >= 3; i -= 4) + { + libsais_prefetch(&T[i - prefetch_distance]); + + c1 = T[i - 0]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = i + 1; m -= ((s & 3) == 1); + c0 = T[i - 1]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = i - 0; m -= ((s & 3) == 1); + c1 = T[i - 2]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = i - 1; m -= ((s & 3) == 1); + c0 = T[i - 3]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = i - 2; m -= ((s & 3) == 1); + } + + for (; i >= 0; i -= 1) + { + c1 = c0; c0 = T[i]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = i + 1; m -= ((s & 3) == 1); + } + + return n - 1 - m; +} + +static sa_sint_t libsais_gather_compacted_lms_suffixes_32s(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n) +{ + const fast_sint_t prefetch_distance = 32; + + sa_sint_t i = n - 2; + sa_sint_t m = n - 1; + fast_uint_t s = 1; + fast_sint_t c0 = T[n - 1]; + fast_sint_t c1 = 0; + + for (; i >= 3; i -= 4) + { + libsais_prefetch(&T[i - prefetch_distance]); + + c1 = T[i - 0]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = i + 1; m -= ((fast_sint_t)(s & 3) == (c0 >= 0)); + c0 = T[i - 1]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = i - 0; m -= ((fast_sint_t)(s & 3) == (c1 >= 0)); + c1 = T[i - 2]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = i - 1; m -= ((fast_sint_t)(s & 3) == (c0 >= 0)); + c0 = T[i - 3]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = i - 2; m -= ((fast_sint_t)(s & 3) == (c1 >= 0)); + } + + for (; i >= 0; i -= 1) + { + c1 = c0; c0 = T[i]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = i + 1; m -= ((fast_sint_t)(s & 3) == (c1 >= 0)); + } + + return n - 1 - m; +} + +#if defined(_OPENMP) + +static void libsais_count_lms_suffixes_32s_4k(const sa_sint_t * RESTRICT T, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets) +{ + const fast_sint_t prefetch_distance = 32; + + memset(buckets, 0, 4 * (size_t)k * sizeof(sa_sint_t)); + + sa_sint_t i = n - 2; + fast_uint_t s = 1; + fast_sint_t c0 = T[n - 1]; + fast_sint_t c1 = 0; + + for (; i >= prefetch_distance + 3; i -= 4) + { + 
libsais_prefetch(&T[i - 2 * prefetch_distance]); + + libsais_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 0], 0)]); + libsais_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 1], 0)]); + libsais_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 2], 0)]); + libsais_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 3], 0)]); + + c1 = T[i - 0]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); + buckets[BUCKETS_INDEX4((fast_uint_t)c0, s & 3)]++; + + c0 = T[i - 1]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); + buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]++; + + c1 = T[i - 2]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); + buckets[BUCKETS_INDEX4((fast_uint_t)c0, s & 3)]++; + + c0 = T[i - 3]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); + buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]++; + } + + for (; i >= 0; i -= 1) + { + c1 = c0; c0 = T[i]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); + buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]++; + } + + buckets[BUCKETS_INDEX4((fast_uint_t)c0, (s << 1) & 3)]++; +} + +#endif + +static void libsais_count_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets) +{ + const fast_sint_t prefetch_distance = 32; + + memset(buckets, 0, 2 * (size_t)k * sizeof(sa_sint_t)); + + sa_sint_t i = n - 2; + fast_uint_t s = 1; + fast_sint_t c0 = T[n - 1]; + fast_sint_t c1 = 0; + + for (; i >= prefetch_distance + 3; i -= 4) + { + libsais_prefetch(&T[i - 2 * prefetch_distance]); + + libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 0], 0)]); + libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 1], 0)]); + libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 2], 0)]); + libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 3], 0)]); + + c1 = T[i - 0]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); + buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++; + + c0 = T[i - 1]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); + buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++; + + c1 = T[i - 2]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); + buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++; + + c0 = T[i - 3]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); + buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++; + } + + for (; i >= 0; i -= 1) + { + c1 = c0; c0 = T[i]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); + buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++; + } + + buckets[BUCKETS_INDEX2((fast_uint_t)c0, 0)]++; +} + +#if defined(_OPENMP) + +static void libsais_count_compacted_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets) +{ + const fast_sint_t prefetch_distance = 32; + + memset(buckets, 0, 2 * (size_t)k * sizeof(sa_sint_t)); + + sa_sint_t i = n - 2; + fast_uint_t s = 1; + fast_sint_t c0 = T[n - 1]; + fast_sint_t c1 = 0; + + for (; i >= prefetch_distance + 3; i -= 4) + { + libsais_prefetch(&T[i - 2 * prefetch_distance]); + + libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 0] & SAINT_MAX, 0)]); + libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 1] & SAINT_MAX, 0)]); + libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 2] & SAINT_MAX, 0)]); + libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - 
prefetch_distance - 3] & SAINT_MAX, 0)]); + + c1 = T[i - 0]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); + c0 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++; + + c0 = T[i - 1]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); + c1 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++; + + c1 = T[i - 2]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); + c0 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++; + + c0 = T[i - 3]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); + c1 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++; + } + + for (; i >= 0; i -= 1) + { + c1 = c0; c0 = T[i]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); + c1 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++; + } + + c0 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c0, 0)]++; +} + +#endif + +static sa_sint_t libsais_count_and_gather_lms_suffixes_8u(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + memset(buckets, 0, 4 * ALPHABET_SIZE * sizeof(sa_sint_t)); + + fast_sint_t m = omp_block_start + omp_block_size - 1; + + if (omp_block_size > 0) + { + const fast_sint_t prefetch_distance = 128; + + fast_sint_t i, j = m + 1, c0 = T[m], c1 = -1; + + while (j < n && (c1 = T[j]) == c0) { ++j; } + + fast_uint_t s = c0 >= c1; + + for (i = m - 1, j = omp_block_start + 3; i >= j; i -= 4) + { + libsais_prefetch(&T[i - prefetch_distance]); + + c1 = T[i - 0]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((s & 3) == 1); + buckets[BUCKETS_INDEX4((fast_uint_t)c0, s & 3)]++; + + c0 = T[i - 1]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 0); m -= ((s & 3) == 1); + buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]++; + + c1 = T[i - 2]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 1); m -= ((s & 3) == 1); + buckets[BUCKETS_INDEX4((fast_uint_t)c0, s & 3)]++; + + c0 = T[i - 3]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 2); m -= ((s & 3) == 1); + buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]++; + } + + for (j -= 3; i >= j; i -= 1) + { + c1 = c0; c0 = T[i]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((s & 3) == 1); + buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]++; + } + + c1 = (i >= 0) ? 
T[i] : -1; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((s & 3) == 1); + buckets[BUCKETS_INDEX4((fast_uint_t)c0, s & 3)]++; + } + + return (sa_sint_t)(omp_block_start + omp_block_size - 1 - m); +} + +static sa_sint_t libsais_count_and_gather_lms_suffixes_8u_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + sa_sint_t m = 0; + +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536 && omp_get_dynamic() == 0) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(thread_state); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start; + + if (omp_num_threads == 1) + { + m = libsais_count_and_gather_lms_suffixes_8u(T, SA, n, buckets, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + thread_state[omp_thread_num].state.position = omp_block_start + omp_block_size; + thread_state[omp_thread_num].state.m = libsais_count_and_gather_lms_suffixes_8u(T, SA, n, thread_state[omp_thread_num].state.buckets, omp_block_start, omp_block_size); + + if (thread_state[omp_thread_num].state.m > 0) + { + thread_state[omp_thread_num].state.last_lms_suffix = SA[thread_state[omp_thread_num].state.position - 1]; + } + } + + #pragma omp barrier + + #pragma omp master + { + memset(buckets, 0, 4 * ALPHABET_SIZE * sizeof(sa_sint_t)); + + fast_sint_t t; + for (t = omp_num_threads - 1; t >= 0; --t) + { + m += (sa_sint_t)thread_state[t].state.m; + + if (t != omp_num_threads - 1 && thread_state[t].state.m > 0) + { + memcpy(&SA[n - m], &SA[thread_state[t].state.position - thread_state[t].state.m], (size_t)thread_state[t].state.m * sizeof(sa_sint_t)); + } + + { + sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets; + fast_sint_t s; for (s = 0; s < 4 * ALPHABET_SIZE; s += 1) { sa_sint_t A = buckets[s], B = temp_bucket[s]; buckets[s] = A + B; temp_bucket[s] = A; } + } + } + } + } +#endif + } + + return m; +} + +static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_4k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + memset(buckets, 0, 4 * (size_t)k * sizeof(sa_sint_t)); + + fast_sint_t m = omp_block_start + omp_block_size - 1; + + if (omp_block_size > 0) + { + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j = m + 1, c0 = T[m], c1 = -1; + + while (j < n && (c1 = T[j]) == c0) { ++j; } + + fast_uint_t s = c0 >= c1; + + for (i = m - 1, j = omp_block_start + prefetch_distance + 3; i >= j; i -= 4) + { + libsais_prefetch(&T[i - 2 * prefetch_distance]); + + libsais_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 0], 0)]); + libsais_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 1], 0)]); + libsais_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 2], 0)]); + libsais_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 3], 0)]); + + c1 = T[i - 0]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 
1); m -= ((s & 3) == 1); + buckets[BUCKETS_INDEX4((fast_uint_t)c0, s & 3)]++; + + c0 = T[i - 1]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 0); m -= ((s & 3) == 1); + buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]++; + + c1 = T[i - 2]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 1); m -= ((s & 3) == 1); + buckets[BUCKETS_INDEX4((fast_uint_t)c0, s & 3)]++; + + c0 = T[i - 3]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 2); m -= ((s & 3) == 1); + buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]++; + } + + for (j -= prefetch_distance + 3; i >= j; i -= 1) + { + c1 = c0; c0 = T[i]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((s & 3) == 1); + buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]++; + } + + c1 = (i >= 0) ? T[i] : -1; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((s & 3) == 1); + buckets[BUCKETS_INDEX4((fast_uint_t)c0, s & 3)]++; + } + + return (sa_sint_t)(omp_block_start + omp_block_size - 1 - m); +} + +static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + memset(buckets, 0, 2 * (size_t)k * sizeof(sa_sint_t)); + + fast_sint_t m = omp_block_start + omp_block_size - 1; + + if (omp_block_size > 0) + { + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j = m + 1, c0 = T[m], c1 = -1; + + while (j < n && (c1 = T[j]) == c0) { ++j; } + + fast_uint_t s = c0 >= c1; + + for (i = m - 1, j = omp_block_start + prefetch_distance + 3; i >= j; i -= 4) + { + libsais_prefetch(&T[i - 2 * prefetch_distance]); + + libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 0], 0)]); + libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 1], 0)]); + libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 2], 0)]); + libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 3], 0)]); + + c1 = T[i - 0]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((s & 3) == 1); + buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++; + + c0 = T[i - 1]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 0); m -= ((s & 3) == 1); + buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++; + + c1 = T[i - 2]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 1); m -= ((s & 3) == 1); + buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++; + + c0 = T[i - 3]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 2); m -= ((s & 3) == 1); + buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++; + } + + for (j -= prefetch_distance + 3; i >= j; i -= 1) + { + c1 = c0; c0 = T[i]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((s & 3) == 1); + buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++; + } + + c1 = (i >= 0) ? 
T[i] : -1; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((s & 3) == 1); + buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++; + } + + return (sa_sint_t)(omp_block_start + omp_block_size - 1 - m); +} + +static sa_sint_t libsais_count_and_gather_compacted_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + memset(buckets, 0, 2 * (size_t)k * sizeof(sa_sint_t)); + + fast_sint_t m = omp_block_start + omp_block_size - 1; + + if (omp_block_size > 0) + { + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j = m + 1, c0 = T[m], c1 = -1; + + while (j < n && (c1 = T[j]) == c0) { ++j; } + + fast_uint_t s = c0 >= c1; + + for (i = m - 1, j = omp_block_start + prefetch_distance + 3; i >= j; i -= 4) + { + libsais_prefetch(&T[i - 2 * prefetch_distance]); + + libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 0] & SAINT_MAX, 0)]); + libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 1] & SAINT_MAX, 0)]); + libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 2] & SAINT_MAX, 0)]); + libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 3] & SAINT_MAX, 0)]); + + c1 = T[i - 0]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((fast_sint_t)(s & 3) == (c0 >= 0)); + c0 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++; + + c0 = T[i - 1]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 0); m -= ((fast_sint_t)(s & 3) == (c1 >= 0)); + c1 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++; + + c1 = T[i - 2]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 1); m -= ((fast_sint_t)(s & 3) == (c0 >= 0)); + c0 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++; + + c0 = T[i - 3]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 2); m -= ((fast_sint_t)(s & 3) == (c1 >= 0)); + c1 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++; + } + + for (j -= prefetch_distance + 3; i >= j; i -= 1) + { + c1 = c0; c0 = T[i]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((fast_sint_t)(s & 3) == (c1 >= 0)); + c1 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++; + } + + c1 = (i >= 0) ? 
T[i] : -1; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((fast_sint_t)(s & 3) == (c0 >= 0)); + c0 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++; + } + + return (sa_sint_t)(omp_block_start + omp_block_size - 1 - m); +} + +#if defined(_OPENMP) + +static fast_sint_t libsais_get_bucket_stride(fast_sint_t free_space, fast_sint_t bucket_size, fast_sint_t num_buckets) +{ + fast_sint_t bucket_size_1024 = (bucket_size + 1023) & (-1024); if (free_space / (num_buckets - 1) >= bucket_size_1024) { return bucket_size_1024; } + fast_sint_t bucket_size_16 = (bucket_size + 15) & (-16); if (free_space / (num_buckets - 1) >= bucket_size_16) { return bucket_size_16; } + + return bucket_size; +} + +static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_4k_fs_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + sa_sint_t m = 0; + +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(thread_state); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start; + + if (omp_num_threads == 1) + { + m = libsais_count_and_gather_lms_suffixes_32s_4k(T, SA, n, k, buckets, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + fast_sint_t bucket_size = 4 * (fast_sint_t)k; + fast_sint_t bucket_stride = libsais_get_bucket_stride(buckets - &SA[n], bucket_size, omp_num_threads); + + { + thread_state[omp_thread_num].state.position = omp_block_start + omp_block_size; + thread_state[omp_thread_num].state.count = libsais_count_and_gather_lms_suffixes_32s_4k(T, SA, n, k, buckets - (omp_thread_num * bucket_stride), omp_block_start, omp_block_size); + } + + #pragma omp barrier + + if (omp_thread_num == omp_num_threads - 1) + { + fast_sint_t t; + for (t = omp_num_threads - 1; t >= 0; --t) + { + m += (sa_sint_t)thread_state[t].state.count; + + if (t != omp_num_threads - 1 && thread_state[t].state.count > 0) + { + memcpy(&SA[n - m], &SA[thread_state[t].state.position - thread_state[t].state.count], (size_t)thread_state[t].state.count * sizeof(sa_sint_t)); + } + } + } + else + { + omp_num_threads = omp_num_threads - 1; + omp_block_stride = (bucket_size / omp_num_threads) & (-16); + omp_block_start = omp_thread_num * omp_block_stride; + omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : bucket_size - omp_block_start; + + libsais_accumulate_counts_s32(buckets + omp_block_start, omp_block_size, bucket_stride, omp_num_threads + 1); + } + } +#endif + } + + return m; +} + +static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_2k_fs_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + sa_sint_t m = 0; + +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(thread_state); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start; + + if (omp_num_threads == 1) + { + m = libsais_count_and_gather_lms_suffixes_32s_2k(T, SA, n, k, buckets, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + fast_sint_t bucket_size = 2 * (fast_sint_t)k; + fast_sint_t bucket_stride = libsais_get_bucket_stride(buckets - &SA[n], bucket_size, omp_num_threads); + + { + thread_state[omp_thread_num].state.position = omp_block_start + omp_block_size; + thread_state[omp_thread_num].state.count = libsais_count_and_gather_lms_suffixes_32s_2k(T, SA, n, k, buckets - (omp_thread_num * bucket_stride), omp_block_start, omp_block_size); + } + + #pragma omp barrier + + if (omp_thread_num == omp_num_threads - 1) + { + fast_sint_t t; + for (t = omp_num_threads - 1; t >= 0; --t) + { + m += (sa_sint_t)thread_state[t].state.count; + + if (t != omp_num_threads - 1 && thread_state[t].state.count > 0) + { + memcpy(&SA[n - m], &SA[thread_state[t].state.position - thread_state[t].state.count], (size_t)thread_state[t].state.count * sizeof(sa_sint_t)); + } + } + } + else + { + omp_num_threads = omp_num_threads - 1; + omp_block_stride = (bucket_size / omp_num_threads) & (-16); + omp_block_start = omp_thread_num * omp_block_stride; + omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : bucket_size - omp_block_start; + + libsais_accumulate_counts_s32(buckets + omp_block_start, omp_block_size, bucket_stride, omp_num_threads + 1); + } + } +#endif + } + + return m; +} + +static void libsais_count_and_gather_compacted_lms_suffixes_32s_2k_fs_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(thread_state); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : n - omp_block_start; + + if (omp_num_threads == 1) + { + libsais_count_and_gather_compacted_lms_suffixes_32s_2k(T, SA, n, k, buckets, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + fast_sint_t bucket_size = 2 * (fast_sint_t)k; + fast_sint_t bucket_stride = libsais_get_bucket_stride(buckets - &SA[n + n], bucket_size, omp_num_threads); + + { + thread_state[omp_thread_num].state.position = omp_block_start + omp_block_size; + thread_state[omp_thread_num].state.count = libsais_count_and_gather_compacted_lms_suffixes_32s_2k(T, SA + n, n, k, buckets - (omp_thread_num * bucket_stride), omp_block_start, omp_block_size); + } + + #pragma omp barrier + + { + fast_sint_t t, m = 0; for (t = omp_num_threads - 1; t >= omp_thread_num; --t) { m += (sa_sint_t)thread_state[t].state.count; } + + if (thread_state[omp_thread_num].state.count > 0) + { + memcpy(&SA[n - m], &SA[n + thread_state[omp_thread_num].state.position - thread_state[omp_thread_num].state.count], (size_t)thread_state[omp_thread_num].state.count * sizeof(sa_sint_t)); + } + } + + { + omp_block_stride = (bucket_size / omp_num_threads) & (-16); + omp_block_start = omp_thread_num * omp_block_stride; + omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : bucket_size - omp_block_start; + + libsais_accumulate_counts_s32(buckets + omp_block_start, omp_block_size, bucket_stride, omp_num_threads); + } + } +#endif + } +} + +#endif + +static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_4k_nofs_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads) +{ + sa_sint_t m = 0; + +#if defined(_OPENMP) + #pragma omp parallel num_threads(2) if(threads > 1 && n >= 65536) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); + + fast_sint_t omp_num_threads = 1; +#endif + if (omp_num_threads == 1) + { + m = libsais_count_and_gather_lms_suffixes_32s_4k(T, SA, n, k, buckets, 0, n); + } +#if defined(_OPENMP) + else if (omp_thread_num == 0) + { + libsais_count_lms_suffixes_32s_4k(T, n, k, buckets); + } + else + { + m = libsais_gather_lms_suffixes_32s(T, SA, n); + } +#endif + } + + return m; +} + +static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_2k_nofs_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads) +{ + sa_sint_t m = 0; + +#if defined(_OPENMP) + #pragma omp parallel num_threads(2) if(threads > 1 && n >= 65536) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); + + fast_sint_t omp_num_threads = 1; +#endif + if (omp_num_threads == 1) + { + m = libsais_count_and_gather_lms_suffixes_32s_2k(T, SA, n, k, buckets, 0, n); + } +#if defined(_OPENMP) + else if (omp_thread_num == 0) + { + libsais_count_lms_suffixes_32s_2k(T, n, k, buckets); + } + else + { + m = libsais_gather_lms_suffixes_32s(T, SA, n); + } +#endif + } + + return m; +} + +static sa_sint_t libsais_count_and_gather_compacted_lms_suffixes_32s_2k_nofs_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads) +{ + sa_sint_t m = 0; + +#if defined(_OPENMP) + #pragma omp parallel num_threads(2) if(threads > 1 && n >= 65536) +#endif + { +#if defined(_OPENMP) + fast_sint_t 
omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); + + fast_sint_t omp_num_threads = 1; +#endif + if (omp_num_threads == 1) + { + m = libsais_count_and_gather_compacted_lms_suffixes_32s_2k(T, SA, n, k, buckets, 0, n); + } +#if defined(_OPENMP) + else if (omp_thread_num == 0) + { + libsais_count_compacted_lms_suffixes_32s_2k(T, n, k, buckets); + } + else + { + m = libsais_gather_compacted_lms_suffixes_32s(T, SA, n); + } +#endif + } + + return m; +} + +static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_4k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + sa_sint_t m; + +#if defined(_OPENMP) + sa_sint_t max_threads = (sa_sint_t)((buckets - &SA[n]) / ((4 * (fast_sint_t)k + 15) & (-16))); if (max_threads > threads) { max_threads = threads; } + if (max_threads > 1 && n >= 65536 && n / k >= 2) + { + if (max_threads > n / 16 / k) { max_threads = n / 16 / k; } + m = libsais_count_and_gather_lms_suffixes_32s_4k_fs_omp(T, SA, n, k, buckets, max_threads > 2 ? max_threads : 2, thread_state); + } + else +#else + UNUSED(thread_state); +#endif + { + m = libsais_count_and_gather_lms_suffixes_32s_4k_nofs_omp(T, SA, n, k, buckets, threads); + } + + return m; +} + +static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_2k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + sa_sint_t m; + +#if defined(_OPENMP) + sa_sint_t max_threads = (sa_sint_t)((buckets - &SA[n]) / ((2 * (fast_sint_t)k + 15) & (-16))); if (max_threads > threads) { max_threads = threads; } + if (max_threads > 1 && n >= 65536 && n / k >= 2) + { + if (max_threads > n / 8 / k) { max_threads = n / 8 / k; } + m = libsais_count_and_gather_lms_suffixes_32s_2k_fs_omp(T, SA, n, k, buckets, max_threads > 2 ? max_threads : 2, thread_state); + } + else +#else + UNUSED(thread_state); +#endif + { + m = libsais_count_and_gather_lms_suffixes_32s_2k_nofs_omp(T, SA, n, k, buckets, threads); + } + + return m; +} + +static void libsais_count_and_gather_compacted_lms_suffixes_32s_2k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ +#if defined(_OPENMP) + sa_sint_t max_threads = (sa_sint_t)((buckets - &SA[n + n]) / ((2 * (fast_sint_t)k + 15) & (-16))); if (max_threads > threads) { max_threads = threads; } + if (max_threads > 1 && n >= 65536 && n / k >= 2) + { + if (max_threads > n / 8 / k) { max_threads = n / 8 / k; } + libsais_count_and_gather_compacted_lms_suffixes_32s_2k_fs_omp(T, SA, n, k, buckets, max_threads > 2 ? 
max_threads : 2, thread_state); + } + else +#else + UNUSED(thread_state); +#endif + { + libsais_count_and_gather_compacted_lms_suffixes_32s_2k_nofs_omp(T, SA, n, k, buckets, threads); + } +} + +static void libsais_count_suffixes_32s(const sa_sint_t * RESTRICT T, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets) +{ + const fast_sint_t prefetch_distance = 32; + + memset(buckets, 0, (size_t)k * sizeof(sa_sint_t)); + + fast_sint_t i, j; + for (i = 0, j = (fast_sint_t)n - 7; i < j; i += 8) + { + libsais_prefetch(&T[i + prefetch_distance]); + + buckets[T[i + 0]]++; + buckets[T[i + 1]]++; + buckets[T[i + 2]]++; + buckets[T[i + 3]]++; + buckets[T[i + 4]]++; + buckets[T[i + 5]]++; + buckets[T[i + 6]]++; + buckets[T[i + 7]]++; + } + + for (j += 7; i < j; i += 1) + { + buckets[T[i]]++; + } +} + +static void libsais_initialize_buckets_start_and_end_8u(sa_sint_t * RESTRICT buckets, sa_sint_t * RESTRICT freq) +{ + sa_sint_t * RESTRICT bucket_start = &buckets[6 * ALPHABET_SIZE]; + sa_sint_t * RESTRICT bucket_end = &buckets[7 * ALPHABET_SIZE]; + + if (freq != NULL) + { + fast_sint_t i, j; sa_sint_t sum = 0; + for (i = BUCKETS_INDEX4(0, 0), j = 0; i <= BUCKETS_INDEX4(ALPHABET_SIZE - 1, 0); i += BUCKETS_INDEX4(1, 0), j += 1) + { + bucket_start[j] = sum; + sum += (freq[j] = buckets[i + BUCKETS_INDEX4(0, 0)] + buckets[i + BUCKETS_INDEX4(0, 1)] + buckets[i + BUCKETS_INDEX4(0, 2)] + buckets[i + BUCKETS_INDEX4(0, 3)]); + bucket_end[j] = sum; + } + } + else + { + fast_sint_t i, j; sa_sint_t sum = 0; + for (i = BUCKETS_INDEX4(0, 0), j = 0; i <= BUCKETS_INDEX4(ALPHABET_SIZE - 1, 0); i += BUCKETS_INDEX4(1, 0), j += 1) + { + bucket_start[j] = sum; + sum += buckets[i + BUCKETS_INDEX4(0, 0)] + buckets[i + BUCKETS_INDEX4(0, 1)] + buckets[i + BUCKETS_INDEX4(0, 2)] + buckets[i + BUCKETS_INDEX4(0, 3)]; + bucket_end[j] = sum; + } + } +} + +static void libsais_initialize_buckets_start_and_end_32s_6k(sa_sint_t k, sa_sint_t * RESTRICT buckets) +{ + sa_sint_t * RESTRICT bucket_start = &buckets[4 * k]; + sa_sint_t * RESTRICT bucket_end = &buckets[5 * k]; + + fast_sint_t i, j; sa_sint_t sum = 0; + for (i = BUCKETS_INDEX4(0, 0), j = 0; i <= BUCKETS_INDEX4((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX4(1, 0), j += 1) + { + bucket_start[j] = sum; + sum += buckets[i + BUCKETS_INDEX4(0, 0)] + buckets[i + BUCKETS_INDEX4(0, 1)] + buckets[i + BUCKETS_INDEX4(0, 2)] + buckets[i + BUCKETS_INDEX4(0, 3)]; + bucket_end[j] = sum; + } +} + +static void libsais_initialize_buckets_start_and_end_32s_4k(sa_sint_t k, sa_sint_t * RESTRICT buckets) +{ + sa_sint_t * RESTRICT bucket_start = &buckets[2 * k]; + sa_sint_t * RESTRICT bucket_end = &buckets[3 * k]; + + fast_sint_t i, j; sa_sint_t sum = 0; + for (i = BUCKETS_INDEX2(0, 0), j = 0; i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX2(1, 0), j += 1) + { + bucket_start[j] = sum; + sum += buckets[i + BUCKETS_INDEX2(0, 0)] + buckets[i + BUCKETS_INDEX2(0, 1)]; + bucket_end[j] = sum; + } +} + +static void libsais_initialize_buckets_end_32s_2k(sa_sint_t k, sa_sint_t * RESTRICT buckets) +{ + fast_sint_t i; sa_sint_t sum0 = 0; + for (i = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX2(1, 0)) + { + sum0 += buckets[i + BUCKETS_INDEX2(0, 0)] + buckets[i + BUCKETS_INDEX2(0, 1)]; buckets[i + BUCKETS_INDEX2(0, 0)] = sum0; + } +} + +static void libsais_initialize_buckets_start_and_end_32s_2k(sa_sint_t k, sa_sint_t * RESTRICT buckets) +{ + fast_sint_t i, j; + for (i = BUCKETS_INDEX2(0, 0), j = 0; i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX2(1, 
0), j += 1) + { + buckets[j] = buckets[i]; + } + + buckets[k] = 0; memcpy(&buckets[k + 1], buckets, ((size_t)k - 1) * sizeof(sa_sint_t)); +} + +static void libsais_initialize_buckets_start_32s_1k(sa_sint_t k, sa_sint_t * RESTRICT buckets) +{ + fast_sint_t i; sa_sint_t sum = 0; + for (i = 0; i <= (fast_sint_t)k - 1; i += 1) { sa_sint_t tmp = buckets[i]; buckets[i] = sum; sum += tmp; } +} + +static void libsais_initialize_buckets_end_32s_1k(sa_sint_t k, sa_sint_t * RESTRICT buckets) +{ + fast_sint_t i; sa_sint_t sum = 0; + for (i = 0; i <= (fast_sint_t)k - 1; i += 1) { sum += buckets[i]; buckets[i] = sum; } +} + +static sa_sint_t libsais_initialize_buckets_for_lms_suffixes_radix_sort_8u(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix) +{ + { + fast_uint_t s = 0; + fast_sint_t c0 = T[first_lms_suffix]; + fast_sint_t c1 = 0; + + for (; --first_lms_suffix >= 0; ) + { + c1 = c0; c0 = T[first_lms_suffix]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); + buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]--; + } + + buckets[BUCKETS_INDEX4((fast_uint_t)c0, (s << 1) & 3)]--; + } + + { + sa_sint_t * RESTRICT temp_bucket = &buckets[4 * ALPHABET_SIZE]; + + fast_sint_t i, j; sa_sint_t sum = 0; + for (i = BUCKETS_INDEX4(0, 0), j = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX4(ALPHABET_SIZE - 1, 0); i += BUCKETS_INDEX4(1, 0), j += BUCKETS_INDEX2(1, 0)) + { + temp_bucket[j + BUCKETS_INDEX2(0, 1)] = sum; sum += buckets[i + BUCKETS_INDEX4(0, 1)] + buckets[i + BUCKETS_INDEX4(0, 3)]; temp_bucket[j] = sum; + } + + return sum; + } +} + +static void libsais_initialize_buckets_for_lms_suffixes_radix_sort_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix) +{ + buckets[BUCKETS_INDEX2(T[first_lms_suffix], 0)]++; + buckets[BUCKETS_INDEX2(T[first_lms_suffix], 1)]--; + + fast_sint_t i; sa_sint_t sum0 = 0, sum1 = 0; + for (i = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX2(1, 0)) + { + sum0 += buckets[i + BUCKETS_INDEX2(0, 0)] + buckets[i + BUCKETS_INDEX2(0, 1)]; + sum1 += buckets[i + BUCKETS_INDEX2(0, 1)]; + + buckets[i + BUCKETS_INDEX2(0, 0)] = sum0; + buckets[i + BUCKETS_INDEX2(0, 1)] = sum1; + } +} + +static sa_sint_t libsais_initialize_buckets_for_lms_suffixes_radix_sort_32s_6k(const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix) +{ + { + fast_uint_t s = 0; + fast_sint_t c0 = T[first_lms_suffix]; + fast_sint_t c1 = 0; + + for (; --first_lms_suffix >= 0; ) + { + c1 = c0; c0 = T[first_lms_suffix]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); + buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]--; + } + + buckets[BUCKETS_INDEX4((fast_uint_t)c0, (s << 1) & 3)]--; + } + + { + sa_sint_t * RESTRICT temp_bucket = &buckets[4 * k]; + + fast_sint_t i, j; sa_sint_t sum = 0; + for (i = BUCKETS_INDEX4(0, 0), j = 0; i <= BUCKETS_INDEX4((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX4(1, 0), j += 1) + { + sum += buckets[i + BUCKETS_INDEX4(0, 1)] + buckets[i + BUCKETS_INDEX4(0, 3)]; temp_bucket[j] = sum; + } + + return sum; + } +} + +static void libsais_initialize_buckets_for_radix_and_partial_sorting_32s_4k(const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix) +{ + sa_sint_t * RESTRICT bucket_start = &buckets[2 * k]; + sa_sint_t * RESTRICT bucket_end = &buckets[3 * k]; + + buckets[BUCKETS_INDEX2(T[first_lms_suffix], 0)]++; + buckets[BUCKETS_INDEX2(T[first_lms_suffix], 1)]--; + + 
fast_sint_t i, j; sa_sint_t sum0 = 0, sum1 = 0; + for (i = BUCKETS_INDEX2(0, 0), j = 0; i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX2(1, 0), j += 1) + { + bucket_start[j] = sum1; + + sum0 += buckets[i + BUCKETS_INDEX2(0, 1)]; + sum1 += buckets[i + BUCKETS_INDEX2(0, 0)] + buckets[i + BUCKETS_INDEX2(0, 1)]; + buckets[i + BUCKETS_INDEX2(0, 1)] = sum0; + + bucket_end[j] = sum1; + } +} + +static void libsais_radix_sort_lms_suffixes_8u(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 3; i >= j; i -= 4) + { + libsais_prefetch(&SA[i - 2 * prefetch_distance]); + + libsais_prefetch(&T[SA[i - prefetch_distance - 0]]); + libsais_prefetch(&T[SA[i - prefetch_distance - 1]]); + libsais_prefetch(&T[SA[i - prefetch_distance - 2]]); + libsais_prefetch(&T[SA[i - prefetch_distance - 3]]); + + sa_sint_t p0 = SA[i - 0]; SA[--induction_bucket[BUCKETS_INDEX2(T[p0], 0)]] = p0; + sa_sint_t p1 = SA[i - 1]; SA[--induction_bucket[BUCKETS_INDEX2(T[p1], 0)]] = p1; + sa_sint_t p2 = SA[i - 2]; SA[--induction_bucket[BUCKETS_INDEX2(T[p2], 0)]] = p2; + sa_sint_t p3 = SA[i - 3]; SA[--induction_bucket[BUCKETS_INDEX2(T[p3], 0)]] = p3; + } + + for (j -= prefetch_distance + 3; i >= j; i -= 1) + { + sa_sint_t p = SA[i]; SA[--induction_bucket[BUCKETS_INDEX2(T[p], 0)]] = p; + } +} + +static void libsais_radix_sort_lms_suffixes_8u_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536 && m >= 65536 && omp_get_dynamic() == 0) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(thread_state); + + fast_sint_t omp_num_threads = 1; +#endif + if (omp_num_threads == 1) + { + libsais_radix_sort_lms_suffixes_8u(T, SA, &buckets[4 * ALPHABET_SIZE], (fast_sint_t)n - (fast_sint_t)m + 1, (fast_sint_t)m - 1); + } +#if defined(_OPENMP) + else + { + { + sa_sint_t * RESTRICT src_bucket = &buckets[4 * ALPHABET_SIZE]; + sa_sint_t * RESTRICT dst_bucket = thread_state[omp_thread_num].state.buckets; + + fast_sint_t i, j; + for (i = BUCKETS_INDEX2(0, 0), j = BUCKETS_INDEX4(0, 1); i <= BUCKETS_INDEX2(ALPHABET_SIZE - 1, 0); i += BUCKETS_INDEX2(1, 0), j += BUCKETS_INDEX4(1, 0)) + { + dst_bucket[i] = src_bucket[i] - dst_bucket[j]; + } + } + + { + fast_sint_t t, omp_block_start = 0, omp_block_size = thread_state[omp_thread_num].state.m; + for (t = omp_num_threads - 1; t >= omp_thread_num; --t) omp_block_start += thread_state[t].state.m; + + if (omp_block_start == (fast_sint_t)m && omp_block_size > 0) + { + omp_block_start -= 1; omp_block_size -= 1; + } + + libsais_radix_sort_lms_suffixes_8u(T, SA, thread_state[omp_thread_num].state.buckets, (fast_sint_t)n - omp_block_start, omp_block_size); + } + } +#endif + } +} + +static void libsais_radix_sort_lms_suffixes_32s_6k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 
2 * prefetch_distance + 3; i >= j; i -= 4) + { + libsais_prefetch(&SA[i - 3 * prefetch_distance]); + + libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 0]]); + libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 1]]); + libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 2]]); + libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 3]]); + + libsais_prefetchw(&induction_bucket[T[SA[i - prefetch_distance - 0]]]); + libsais_prefetchw(&induction_bucket[T[SA[i - prefetch_distance - 1]]]); + libsais_prefetchw(&induction_bucket[T[SA[i - prefetch_distance - 2]]]); + libsais_prefetchw(&induction_bucket[T[SA[i - prefetch_distance - 3]]]); + + sa_sint_t p0 = SA[i - 0]; SA[--induction_bucket[T[p0]]] = p0; + sa_sint_t p1 = SA[i - 1]; SA[--induction_bucket[T[p1]]] = p1; + sa_sint_t p2 = SA[i - 2]; SA[--induction_bucket[T[p2]]] = p2; + sa_sint_t p3 = SA[i - 3]; SA[--induction_bucket[T[p3]]] = p3; + } + + for (j -= 2 * prefetch_distance + 3; i >= j; i -= 1) + { + sa_sint_t p = SA[i]; SA[--induction_bucket[T[p]]] = p; + } +} + +static void libsais_radix_sort_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 3; i >= j; i -= 4) + { + libsais_prefetch(&SA[i - 3 * prefetch_distance]); + + libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 0]]); + libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 1]]); + libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 2]]); + libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 3]]); + + libsais_prefetchw(&induction_bucket[BUCKETS_INDEX2(T[SA[i - prefetch_distance - 0]], 0)]); + libsais_prefetchw(&induction_bucket[BUCKETS_INDEX2(T[SA[i - prefetch_distance - 1]], 0)]); + libsais_prefetchw(&induction_bucket[BUCKETS_INDEX2(T[SA[i - prefetch_distance - 2]], 0)]); + libsais_prefetchw(&induction_bucket[BUCKETS_INDEX2(T[SA[i - prefetch_distance - 3]], 0)]); + + sa_sint_t p0 = SA[i - 0]; SA[--induction_bucket[BUCKETS_INDEX2(T[p0], 0)]] = p0; + sa_sint_t p1 = SA[i - 1]; SA[--induction_bucket[BUCKETS_INDEX2(T[p1], 0)]] = p1; + sa_sint_t p2 = SA[i - 2]; SA[--induction_bucket[BUCKETS_INDEX2(T[p2], 0)]] = p2; + sa_sint_t p3 = SA[i - 3]; SA[--induction_bucket[BUCKETS_INDEX2(T[p3], 0)]] = p3; + } + + for (j -= 2 * prefetch_distance + 3; i >= j; i -= 1) + { + sa_sint_t p = SA[i]; SA[--induction_bucket[BUCKETS_INDEX2(T[p], 0)]] = p; + } +} + +#if defined(_OPENMP) + +static void libsais_radix_sort_lms_suffixes_32s_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4) + { + libsais_prefetch(&SA[i + 2 * prefetch_distance]); + + libsais_prefetch(&T[SA[i + prefetch_distance + 0]]); + libsais_prefetch(&T[SA[i + prefetch_distance + 1]]); + libsais_prefetch(&T[SA[i + prefetch_distance + 2]]); + libsais_prefetch(&T[SA[i + prefetch_distance + 3]]); + + libsais_prefetchw(&cache[i + prefetch_distance]); + + cache[i + 0].symbol = T[cache[i + 0].index = SA[i + 0]]; + cache[i + 1].symbol = T[cache[i + 1].index = SA[i + 1]]; + cache[i + 2].symbol = T[cache[i + 2].index = SA[i + 2]]; + cache[i + 3].symbol = T[cache[i + 3].index = SA[i + 3]]; + } + + for 
(j += prefetch_distance + 3; i < j; i += 1) + { + cache[i].symbol = T[cache[i].index = SA[i]]; + } +} + +static void libsais_radix_sort_lms_suffixes_32s_6k_block_sort(sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 3; i >= j; i -= 4) + { + libsais_prefetchw(&cache[i - 2 * prefetch_distance]); + + libsais_prefetchw(&induction_bucket[cache[i - prefetch_distance - 0].symbol]); + libsais_prefetchw(&induction_bucket[cache[i - prefetch_distance - 1].symbol]); + libsais_prefetchw(&induction_bucket[cache[i - prefetch_distance - 2].symbol]); + libsais_prefetchw(&induction_bucket[cache[i - prefetch_distance - 3].symbol]); + + cache[i - 0].symbol = --induction_bucket[cache[i - 0].symbol]; + cache[i - 1].symbol = --induction_bucket[cache[i - 1].symbol]; + cache[i - 2].symbol = --induction_bucket[cache[i - 2].symbol]; + cache[i - 3].symbol = --induction_bucket[cache[i - 3].symbol]; + } + + for (j -= prefetch_distance + 3; i >= j; i -= 1) + { + cache[i].symbol = --induction_bucket[cache[i].symbol]; + } +} + +static void libsais_radix_sort_lms_suffixes_32s_2k_block_sort(sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 3; i >= j; i -= 4) + { + libsais_prefetchw(&cache[i - 2 * prefetch_distance]); + + libsais_prefetchw(&induction_bucket[BUCKETS_INDEX2(cache[i - prefetch_distance - 0].symbol, 0)]); + libsais_prefetchw(&induction_bucket[BUCKETS_INDEX2(cache[i - prefetch_distance - 1].symbol, 0)]); + libsais_prefetchw(&induction_bucket[BUCKETS_INDEX2(cache[i - prefetch_distance - 2].symbol, 0)]); + libsais_prefetchw(&induction_bucket[BUCKETS_INDEX2(cache[i - prefetch_distance - 3].symbol, 0)]); + + cache[i - 0].symbol = --induction_bucket[BUCKETS_INDEX2(cache[i - 0].symbol, 0)]; + cache[i - 1].symbol = --induction_bucket[BUCKETS_INDEX2(cache[i - 1].symbol, 0)]; + cache[i - 2].symbol = --induction_bucket[BUCKETS_INDEX2(cache[i - 2].symbol, 0)]; + cache[i - 3].symbol = --induction_bucket[BUCKETS_INDEX2(cache[i - 3].symbol, 0)]; + } + + for (j -= prefetch_distance + 3; i >= j; i -= 1) + { + cache[i].symbol = --induction_bucket[BUCKETS_INDEX2(cache[i].symbol, 0)]; + } +} + +static void libsais_radix_sort_lms_suffixes_32s_6k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(cache); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : block_size - omp_block_start; + + omp_block_start += block_start; + + if (omp_num_threads == 1) + { + libsais_radix_sort_lms_suffixes_32s_6k(T, SA, induction_bucket, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + libsais_radix_sort_lms_suffixes_32s_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size); + } + + #pragma omp barrier + + #pragma omp master + { + libsais_radix_sort_lms_suffixes_32s_6k_block_sort(induction_bucket, cache - block_start, block_start, block_size); + } + + #pragma omp barrier + + { + libsais_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size); + } + } +#endif + } +} + +static void libsais_radix_sort_lms_suffixes_32s_2k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(cache); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start; + + omp_block_start += block_start; + + if (omp_num_threads == 1) + { + libsais_radix_sort_lms_suffixes_32s_2k(T, SA, induction_bucket, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + libsais_radix_sort_lms_suffixes_32s_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size); + } + + #pragma omp barrier + + #pragma omp master + { + libsais_radix_sort_lms_suffixes_32s_2k_block_sort(induction_bucket, cache - block_start, block_start, block_size); + } + + #pragma omp barrier + + { + libsais_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size); + } + } +#endif + } +} + +#endif + +static void libsais_radix_sort_lms_suffixes_32s_6k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + if (threads == 1 || m < 65536) + { + libsais_radix_sort_lms_suffixes_32s_6k(T, SA, induction_bucket, (fast_sint_t)n - (fast_sint_t)m + 1, (fast_sint_t)m - 1); + } +#if defined(_OPENMP) + else + { + fast_sint_t block_start, block_end; + for (block_start = 0; block_start < (fast_sint_t)m - 1; block_start = block_end) + { + block_end = block_start + (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end >= m) { block_end = (fast_sint_t)m - 1; } + + libsais_radix_sort_lms_suffixes_32s_6k_block_omp(T, SA, induction_bucket, thread_state[0].state.cache, (fast_sint_t)n - block_end, block_end - block_start, threads); + } + } +#else + UNUSED(thread_state); +#endif +} + +static void libsais_radix_sort_lms_suffixes_32s_2k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + if (threads == 1 || m < 65536) + { + libsais_radix_sort_lms_suffixes_32s_2k(T, SA, induction_bucket, (fast_sint_t)n - (fast_sint_t)m + 1, 
(fast_sint_t)m - 1); + } +#if defined(_OPENMP) + else + { + fast_sint_t block_start, block_end; + for (block_start = 0; block_start < (fast_sint_t)m - 1; block_start = block_end) + { + block_end = block_start + (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end >= m) { block_end = (fast_sint_t)m - 1; } + + libsais_radix_sort_lms_suffixes_32s_2k_block_omp(T, SA, induction_bucket, thread_state[0].state.cache, (fast_sint_t)n - block_end, block_end - block_start, threads); + } + } +#else + UNUSED(thread_state); +#endif +} + +static sa_sint_t libsais_radix_sort_lms_suffixes_32s_1k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets) +{ + const fast_sint_t prefetch_distance = 32; + + sa_sint_t i = n - 2; + sa_sint_t m = 0; + fast_uint_t s = 1; + fast_sint_t c0 = T[n - 1]; + fast_sint_t c1 = 0; + fast_sint_t c2 = 0; + + for (; i >= prefetch_distance + 3; i -= 4) + { + libsais_prefetch(&T[i - 2 * prefetch_distance]); + + libsais_prefetchw(&buckets[T[i - prefetch_distance - 0]]); + libsais_prefetchw(&buckets[T[i - prefetch_distance - 1]]); + libsais_prefetchw(&buckets[T[i - prefetch_distance - 2]]); + libsais_prefetchw(&buckets[T[i - prefetch_distance - 3]]); + + c1 = T[i - 0]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); + if ((s & 3) == 1) { SA[--buckets[c2 = c0]] = i + 1; m++; } + + c0 = T[i - 1]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); + if ((s & 3) == 1) { SA[--buckets[c2 = c1]] = i - 0; m++; } + + c1 = T[i - 2]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); + if ((s & 3) == 1) { SA[--buckets[c2 = c0]] = i - 1; m++; } + + c0 = T[i - 3]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); + if ((s & 3) == 1) { SA[--buckets[c2 = c1]] = i - 2; m++; } + } + + for (; i >= 0; i -= 1) + { + c1 = c0; c0 = T[i]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); + if ((s & 3) == 1) { SA[--buckets[c2 = c1]] = i + 1; m++; } + } + + if (m > 1) + { + SA[buckets[c2]] = 0; + } + + return m; +} + +static void libsais_radix_sort_set_markers_32s_6k(sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4) + { + libsais_prefetch(&induction_bucket[i + 2 * prefetch_distance]); + + libsais_prefetchw(&SA[induction_bucket[i + prefetch_distance + 0]]); + libsais_prefetchw(&SA[induction_bucket[i + prefetch_distance + 1]]); + libsais_prefetchw(&SA[induction_bucket[i + prefetch_distance + 2]]); + libsais_prefetchw(&SA[induction_bucket[i + prefetch_distance + 3]]); + + SA[induction_bucket[i + 0]] |= SAINT_MIN; + SA[induction_bucket[i + 1]] |= SAINT_MIN; + SA[induction_bucket[i + 2]] |= SAINT_MIN; + SA[induction_bucket[i + 3]] |= SAINT_MIN; + } + + for (j += prefetch_distance + 3; i < j; i += 1) + { + SA[induction_bucket[i]] |= SAINT_MIN; + } +} + +static void libsais_radix_sort_set_markers_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4) + { + libsais_prefetch(&induction_bucket[BUCKETS_INDEX2(i + 2 * prefetch_distance, 0)]); + + libsais_prefetchw(&SA[induction_bucket[BUCKETS_INDEX2(i + 
prefetch_distance + 0, 0)]]); + libsais_prefetchw(&SA[induction_bucket[BUCKETS_INDEX2(i + prefetch_distance + 1, 0)]]); + libsais_prefetchw(&SA[induction_bucket[BUCKETS_INDEX2(i + prefetch_distance + 2, 0)]]); + libsais_prefetchw(&SA[induction_bucket[BUCKETS_INDEX2(i + prefetch_distance + 3, 0)]]); + + SA[induction_bucket[BUCKETS_INDEX2(i + 0, 0)]] |= SUFFIX_GROUP_MARKER; + SA[induction_bucket[BUCKETS_INDEX2(i + 1, 0)]] |= SUFFIX_GROUP_MARKER; + SA[induction_bucket[BUCKETS_INDEX2(i + 2, 0)]] |= SUFFIX_GROUP_MARKER; + SA[induction_bucket[BUCKETS_INDEX2(i + 3, 0)]] |= SUFFIX_GROUP_MARKER; + } + + for (j += prefetch_distance + 3; i < j; i += 1) + { + SA[induction_bucket[BUCKETS_INDEX2(i, 0)]] |= SUFFIX_GROUP_MARKER; + } +} + +static void libsais_radix_sort_set_markers_32s_6k_omp(sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && k >= 65536) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); + fast_sint_t omp_block_stride = (((fast_sint_t)k - 1) / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : (fast_sint_t)k - 1 - omp_block_start; +#else + UNUSED(threads); + + fast_sint_t omp_block_start = 0; + fast_sint_t omp_block_size = (fast_sint_t)k - 1; +#endif + + libsais_radix_sort_set_markers_32s_6k(SA, induction_bucket, omp_block_start, omp_block_size); + } +} + +static void libsais_radix_sort_set_markers_32s_4k_omp(sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && k >= 65536) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); + fast_sint_t omp_block_stride = (((fast_sint_t)k - 1) / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : (fast_sint_t)k - 1 - omp_block_start; +#else + UNUSED(threads); + + fast_sint_t omp_block_start = 0; + fast_sint_t omp_block_size = (fast_sint_t)k - 1; +#endif + + libsais_radix_sort_set_markers_32s_4k(SA, induction_bucket, omp_block_start, omp_block_size); + } +} + +static void libsais_initialize_buckets_for_partial_sorting_8u(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count) +{ + sa_sint_t * RESTRICT temp_bucket = &buckets[4 * ALPHABET_SIZE]; + + buckets[BUCKETS_INDEX4((fast_uint_t)T[first_lms_suffix], 1)]++; + + fast_sint_t i, j; sa_sint_t sum0 = left_suffixes_count + 1, sum1 = 0; + for (i = BUCKETS_INDEX4(0, 0), j = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX4(ALPHABET_SIZE - 1, 0); i += BUCKETS_INDEX4(1, 0), j += BUCKETS_INDEX2(1, 0)) + { + temp_bucket[j + BUCKETS_INDEX2(0, 0)] = sum0; + + sum0 += buckets[i + BUCKETS_INDEX4(0, 0)] + buckets[i + BUCKETS_INDEX4(0, 2)]; + sum1 += buckets[i + BUCKETS_INDEX4(0, 1)]; + + buckets[j + BUCKETS_INDEX2(0, 0)] = sum0; + buckets[j + BUCKETS_INDEX2(0, 1)] = sum1; + } +} + +static void libsais_initialize_buckets_for_partial_sorting_32s_6k(const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count) +{ + sa_sint_t * RESTRICT temp_bucket = &buckets[4 * k]; + + fast_sint_t i, j; sa_sint_t sum0 = left_suffixes_count + 1, sum1 = 0, sum2 = 0; + for (first_lms_suffix = T[first_lms_suffix], i = BUCKETS_INDEX4(0, 0), j = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX4((fast_sint_t)first_lms_suffix - 1, 0); i += BUCKETS_INDEX4(1, 0), j += BUCKETS_INDEX2(1, 0)) + { + sa_sint_t SS = buckets[i + BUCKETS_INDEX4(0, 0)]; + sa_sint_t LS = buckets[i + BUCKETS_INDEX4(0, 1)]; + sa_sint_t SL = buckets[i + BUCKETS_INDEX4(0, 2)]; + sa_sint_t LL = buckets[i + BUCKETS_INDEX4(0, 3)]; + + buckets[i + BUCKETS_INDEX4(0, 0)] = sum0; + buckets[i + BUCKETS_INDEX4(0, 1)] = sum2; + buckets[i + BUCKETS_INDEX4(0, 2)] = 0; + buckets[i + BUCKETS_INDEX4(0, 3)] = 0; + + sum0 += SS + SL; sum1 += LS; sum2 += LS + LL; + + temp_bucket[j + BUCKETS_INDEX2(0, 0)] = sum0; + temp_bucket[j + BUCKETS_INDEX2(0, 1)] = sum1; + } + + for (sum1 += 1; i <= BUCKETS_INDEX4((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX4(1, 0), j += BUCKETS_INDEX2(1, 0)) + { + sa_sint_t SS = buckets[i + BUCKETS_INDEX4(0, 0)]; + sa_sint_t LS = buckets[i + BUCKETS_INDEX4(0, 1)]; + sa_sint_t SL = buckets[i + BUCKETS_INDEX4(0, 2)]; + sa_sint_t LL = buckets[i + BUCKETS_INDEX4(0, 3)]; + + buckets[i + BUCKETS_INDEX4(0, 0)] = sum0; + buckets[i + BUCKETS_INDEX4(0, 1)] = sum2; + buckets[i + BUCKETS_INDEX4(0, 2)] = 0; + buckets[i + BUCKETS_INDEX4(0, 3)] = 0; + + sum0 += SS + SL; sum1 += LS; sum2 += LS + LL; + + temp_bucket[j + BUCKETS_INDEX2(0, 0)] = sum0; + temp_bucket[j + BUCKETS_INDEX2(0, 1)] = sum1; + } +} + +static sa_sint_t libsais_partial_sorting_scan_left_to_right_8u(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + sa_sint_t * RESTRICT induction_bucket = &buckets[4 * ALPHABET_SIZE]; + sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE]; + + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) + { + libsais_prefetch(&SA[i + 2 * prefetch_distance]); + + libsais_prefetch(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 1); + libsais_prefetch(&T[SA[i + 
prefetch_distance + 0] & SAINT_MAX] - 2); + libsais_prefetch(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 1); + libsais_prefetch(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 2); + + sa_sint_t p0 = SA[i + 0]; d += (p0 < 0); p0 &= SAINT_MAX; sa_sint_t v0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] >= T[p0 - 1]); + SA[induction_bucket[v0]++] = (p0 - 1) | ((sa_sint_t)(distinct_names[v0] != d) << (SAINT_BIT - 1)); distinct_names[v0] = d; + + sa_sint_t p1 = SA[i + 1]; d += (p1 < 0); p1 &= SAINT_MAX; sa_sint_t v1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] >= T[p1 - 1]); + SA[induction_bucket[v1]++] = (p1 - 1) | ((sa_sint_t)(distinct_names[v1] != d) << (SAINT_BIT - 1)); distinct_names[v1] = d; + } + + for (j += prefetch_distance + 1; i < j; i += 1) + { + sa_sint_t p = SA[i]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] >= T[p - 1]); + SA[induction_bucket[v]++] = (p - 1) | ((sa_sint_t)(distinct_names[v] != d) << (SAINT_BIT - 1)); distinct_names[v] = d; + } + + return d; +} + +#if defined(_OPENMP) + +static void libsais_partial_sorting_scan_left_to_right_8u_block_prepare(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size, LIBSAIS_THREAD_STATE * RESTRICT state) +{ + const fast_sint_t prefetch_distance = 32; + + sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE]; + sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE]; + + memset(buckets, 0, 4 * ALPHABET_SIZE * sizeof(sa_sint_t)); + + fast_sint_t i, j, count = 0; sa_sint_t d = 1; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) + { + libsais_prefetch(&SA[i + 2 * prefetch_distance]); + + libsais_prefetch(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 1); + libsais_prefetch(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 2); + libsais_prefetch(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 1); + libsais_prefetch(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 2); + + sa_sint_t p0 = cache[count].index = SA[i + 0]; d += (p0 < 0); p0 &= SAINT_MAX; sa_sint_t v0 = cache[count++].symbol = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] >= T[p0 - 1]); induction_bucket[v0]++; distinct_names[v0] = d; + sa_sint_t p1 = cache[count].index = SA[i + 1]; d += (p1 < 0); p1 &= SAINT_MAX; sa_sint_t v1 = cache[count++].symbol = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] >= T[p1 - 1]); induction_bucket[v1]++; distinct_names[v1] = d; + } + + for (j += prefetch_distance + 1; i < j; i += 1) + { + sa_sint_t p = cache[count].index = SA[i]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = cache[count++].symbol = BUCKETS_INDEX2(T[p - 1], T[p - 2] >= T[p - 1]); induction_bucket[v]++; distinct_names[v] = d; + } + + state[0].state.position = (fast_sint_t)d - 1; + state[0].state.count = count; +} + +static void libsais_partial_sorting_scan_left_to_right_8u_block_place(sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count, sa_sint_t d) +{ + const fast_sint_t prefetch_distance = 32; + + sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE]; + sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE]; + + fast_sint_t i, j; + for (i = 0, j = count - 1; i < j; i += 2) + { + libsais_prefetch(&cache[i + prefetch_distance]); + + sa_sint_t p0 = cache[i + 0].index; d += (p0 < 0); sa_sint_t v0 = cache[i + 0].symbol; + SA[induction_bucket[v0]++] = (p0 - 1) | ((sa_sint_t)(distinct_names[v0] != d) << 
(SAINT_BIT - 1)); distinct_names[v0] = d; + + sa_sint_t p1 = cache[i + 1].index; d += (p1 < 0); sa_sint_t v1 = cache[i + 1].symbol; + SA[induction_bucket[v1]++] = (p1 - 1) | ((sa_sint_t)(distinct_names[v1] != d) << (SAINT_BIT - 1)); distinct_names[v1] = d; + } + + for (j += 1; i < j; i += 1) + { + sa_sint_t p = cache[i].index; d += (p < 0); sa_sint_t v = cache[i].symbol; + SA[induction_bucket[v]++] = (p - 1) | ((sa_sint_t)(distinct_names[v] != d) << (SAINT_BIT - 1)); distinct_names[v] = d; + } +} + +static sa_sint_t libsais_partial_sorting_scan_left_to_right_8u_block_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * ALPHABET_SIZE && omp_get_dynamic() == 0) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(thread_state); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start; + + omp_block_start += block_start; + + if (omp_num_threads == 1) + { + d = libsais_partial_sorting_scan_left_to_right_8u(T, SA, buckets, d, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + libsais_partial_sorting_scan_left_to_right_8u_block_prepare(T, SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size, &thread_state[omp_thread_num]); + } + + #pragma omp barrier + + #pragma omp master + { + sa_sint_t * RESTRICT induction_bucket = &buckets[4 * ALPHABET_SIZE]; + sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE]; + + fast_sint_t t; + for (t = 0; t < omp_num_threads; ++t) + { + sa_sint_t * RESTRICT temp_induction_bucket = &thread_state[t].state.buckets[0 * ALPHABET_SIZE]; + sa_sint_t * RESTRICT temp_distinct_names = &thread_state[t].state.buckets[2 * ALPHABET_SIZE]; + + fast_sint_t c; + for (c = 0; c < 2 * ALPHABET_SIZE; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_induction_bucket[c]; induction_bucket[c] = A + B; temp_induction_bucket[c] = A; } + + for (d -= 1, c = 0; c < 2 * ALPHABET_SIZE; c += 1) { sa_sint_t A = distinct_names[c], B = temp_distinct_names[c], D = B + d; distinct_names[c] = B > 0 ? 
D : A; temp_distinct_names[c] = A; } + d += 1 + (sa_sint_t)thread_state[t].state.position; thread_state[t].state.position = (fast_sint_t)d - thread_state[t].state.position; + } + } + + #pragma omp barrier + + { + libsais_partial_sorting_scan_left_to_right_8u_block_place(SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count, (sa_sint_t)thread_state[omp_thread_num].state.position); + } + } +#endif + } + + return d; +} + +#endif + +static sa_sint_t libsais_partial_sorting_scan_left_to_right_8u_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t left_suffixes_count, sa_sint_t d, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + sa_sint_t * RESTRICT induction_bucket = &buckets[4 * ALPHABET_SIZE]; + sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE]; + + SA[induction_bucket[BUCKETS_INDEX2(T[n - 1], T[n - 2] >= T[n - 1])]++] = (n - 1) | SAINT_MIN; + distinct_names[BUCKETS_INDEX2(T[n - 1], T[n - 2] >= T[n - 1])] = ++d; + + if (threads == 1 || left_suffixes_count < 65536) + { + d = libsais_partial_sorting_scan_left_to_right_8u(T, SA, buckets, d, 0, left_suffixes_count); + } +#if defined(_OPENMP) + else + { + fast_sint_t block_start; + for (block_start = 0; block_start < left_suffixes_count; ) + { + if (SA[block_start] == 0) + { + block_start++; + } + else + { + fast_sint_t block_max_end = block_start + ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads); if (block_max_end > left_suffixes_count) { block_max_end = left_suffixes_count;} + fast_sint_t block_end = block_start + 1; while (block_end < block_max_end && SA[block_end] != 0) { block_end++; } + fast_sint_t block_size = block_end - block_start; + + if (block_size < 32) + { + for (; block_start < block_end; block_start += 1) + { + sa_sint_t p = SA[block_start]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] >= T[p - 1]); + SA[induction_bucket[v]++] = (p - 1) | ((sa_sint_t)(distinct_names[v] != d) << (SAINT_BIT - 1)); distinct_names[v] = d; + } + } + else + { + d = libsais_partial_sorting_scan_left_to_right_8u_block_omp(T, SA, buckets, d, block_start, block_size, threads, thread_state); + block_start = block_end; + } + } + } + } +#else + UNUSED(thread_state); +#endif + + return d; +} + +static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_6k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1; i < j; i += 2) + { + libsais_prefetch(&SA[i + 3 * prefetch_distance]); + + libsais_prefetch(&T[SA[i + 2 * prefetch_distance + 0] & SAINT_MAX] - 1); + libsais_prefetch(&T[SA[i + 2 * prefetch_distance + 0] & SAINT_MAX] - 2); + libsais_prefetch(&T[SA[i + 2 * prefetch_distance + 1] & SAINT_MAX] - 1); + libsais_prefetch(&T[SA[i + 2 * prefetch_distance + 1] & SAINT_MAX] - 2); + + sa_sint_t p0 = SA[i + prefetch_distance + 0] & SAINT_MAX; sa_sint_t v0 = BUCKETS_INDEX4(T[p0 - (p0 > 0)], 0); libsais_prefetchw(&buckets[v0]); + sa_sint_t p1 = SA[i + prefetch_distance + 1] & SAINT_MAX; sa_sint_t v1 = BUCKETS_INDEX4(T[p1 - (p1 > 0)], 0); libsais_prefetchw(&buckets[v1]); + + sa_sint_t p2 = SA[i + 0]; d += (p2 < 0); p2 &= SAINT_MAX; sa_sint_t v2 = BUCKETS_INDEX4(T[p2 - 1], 
T[p2 - 2] >= T[p2 - 1]); + SA[buckets[v2]++] = (p2 - 1) | ((sa_sint_t)(buckets[2 + v2] != d) << (SAINT_BIT - 1)); buckets[2 + v2] = d; + + sa_sint_t p3 = SA[i + 1]; d += (p3 < 0); p3 &= SAINT_MAX; sa_sint_t v3 = BUCKETS_INDEX4(T[p3 - 1], T[p3 - 2] >= T[p3 - 1]); + SA[buckets[v3]++] = (p3 - 1) | ((sa_sint_t)(buckets[2 + v3] != d) << (SAINT_BIT - 1)); buckets[2 + v3] = d; + } + + for (j += 2 * prefetch_distance + 1; i < j; i += 1) + { + sa_sint_t p = SA[i]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = BUCKETS_INDEX4(T[p - 1], T[p - 2] >= T[p - 1]); + SA[buckets[v]++] = (p - 1) | ((sa_sint_t)(buckets[2 + v] != d) << (SAINT_BIT - 1)); buckets[2 + v] = d; + } + + return d; +} + +static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_4k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + sa_sint_t * RESTRICT induction_bucket = &buckets[2 * k]; + sa_sint_t * RESTRICT distinct_names = &buckets[0 * k]; + + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1; i < j; i += 2) + { + libsais_prefetchw(&SA[i + 3 * prefetch_distance]); + + sa_sint_t s0 = SA[i + 2 * prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0 & ~SUFFIX_GROUP_MARKER] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i + 2 * prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1 & ~SUFFIX_GROUP_MARKER] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? Ts1 : NULL); + sa_sint_t s2 = SA[i + 1 * prefetch_distance + 0]; if (s2 > 0) { const fast_sint_t Ts2 = T[(s2 & ~SUFFIX_GROUP_MARKER) - 1]; libsais_prefetchw(&induction_bucket[Ts2]); libsais_prefetchw(&distinct_names[BUCKETS_INDEX2(Ts2, 0)]); } + sa_sint_t s3 = SA[i + 1 * prefetch_distance + 1]; if (s3 > 0) { const fast_sint_t Ts3 = T[(s3 & ~SUFFIX_GROUP_MARKER) - 1]; libsais_prefetchw(&induction_bucket[Ts3]); libsais_prefetchw(&distinct_names[BUCKETS_INDEX2(Ts3, 0)]); } + + sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 & SAINT_MAX; + if (p0 > 0) + { + SA[i + 0] = 0; d += (p0 >> (SUFFIX_GROUP_BIT - 1)); p0 &= ~SUFFIX_GROUP_MARKER; sa_sint_t v0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] < T[p0 - 1]); + SA[induction_bucket[T[p0 - 1]]++] = (p0 - 1) | ((sa_sint_t)(T[p0 - 2] < T[p0 - 1]) << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v0] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v0] = d; + } + + sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 & SAINT_MAX; + if (p1 > 0) + { + SA[i + 1] = 0; d += (p1 >> (SUFFIX_GROUP_BIT - 1)); p1 &= ~SUFFIX_GROUP_MARKER; sa_sint_t v1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] < T[p1 - 1]); + SA[induction_bucket[T[p1 - 1]]++] = (p1 - 1) | ((sa_sint_t)(T[p1 - 2] < T[p1 - 1]) << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v1] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v1] = d; + } + } + + for (j += 2 * prefetch_distance + 1; i < j; i += 1) + { + sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; + if (p > 0) + { + SA[i] = 0; d += (p >> (SUFFIX_GROUP_BIT - 1)); p &= ~SUFFIX_GROUP_MARKER; sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] < T[p - 1]); + SA[induction_bucket[T[p - 1]]++] = (p - 1) | ((sa_sint_t)(T[p - 2] < T[p - 1]) << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v] = d; + } + } + + return d; +} + +static void libsais_partial_sorting_scan_left_to_right_32s_1k(const sa_sint_t * RESTRICT T, 
sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1; i < j; i += 2) + { + libsais_prefetchw(&SA[i + 3 * prefetch_distance]); + + sa_sint_t s0 = SA[i + 2 * prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i + 2 * prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); + sa_sint_t s2 = SA[i + 1 * prefetch_distance + 0]; if (s2 > 0) { libsais_prefetchw(&induction_bucket[T[s2 - 1]]); libsais_prefetch(&T[s2] - 2); } + sa_sint_t s3 = SA[i + 1 * prefetch_distance + 1]; if (s3 > 0) { libsais_prefetchw(&induction_bucket[T[s3 - 1]]); libsais_prefetch(&T[s3] - 2); } + + sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 & SAINT_MAX; if (p0 > 0) { SA[i + 0] = 0; SA[induction_bucket[T[p0 - 1]]++] = (p0 - 1) | ((sa_sint_t)(T[p0 - 2] < T[p0 - 1]) << (SAINT_BIT - 1)); } + sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 & SAINT_MAX; if (p1 > 0) { SA[i + 1] = 0; SA[induction_bucket[T[p1 - 1]]++] = (p1 - 1) | ((sa_sint_t)(T[p1 - 2] < T[p1 - 1]) << (SAINT_BIT - 1)); } + } + + for (j += 2 * prefetch_distance + 1; i < j; i += 1) + { + sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { SA[i] = 0; SA[induction_bucket[T[p - 1]]++] = (p - 1) | ((sa_sint_t)(T[p - 2] < T[p - 1]) << (SAINT_BIT - 1)); } + } +} + +#if defined(_OPENMP) + +static void libsais_partial_sorting_scan_left_to_right_32s_6k_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) + { + libsais_prefetch(&SA[i + 2 * prefetch_distance]); + + libsais_prefetch(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 1); + libsais_prefetch(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 2); + libsais_prefetch(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 1); + libsais_prefetch(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 2); + + libsais_prefetchw(&cache[i + prefetch_distance]); + + sa_sint_t p0 = cache[i + 0].index = SA[i + 0]; sa_sint_t symbol0 = 0; p0 &= SAINT_MAX; if (p0 != 0) { symbol0 = BUCKETS_INDEX4(T[p0 - 1], T[p0 - 2] >= T[p0 - 1]); } cache[i + 0].symbol = symbol0; + sa_sint_t p1 = cache[i + 1].index = SA[i + 1]; sa_sint_t symbol1 = 0; p1 &= SAINT_MAX; if (p1 != 0) { symbol1 = BUCKETS_INDEX4(T[p1 - 1], T[p1 - 2] >= T[p1 - 1]); } cache[i + 1].symbol = symbol1; + } + + for (j += prefetch_distance + 1; i < j; i += 1) + { + sa_sint_t p = cache[i].index = SA[i]; sa_sint_t symbol = 0; p &= SAINT_MAX; if (p != 0) { symbol = BUCKETS_INDEX4(T[p - 1], T[p - 2] >= T[p - 1]); } cache[i].symbol = symbol; + } +} + +static void libsais_partial_sorting_scan_left_to_right_32s_4k_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) + { + libsais_prefetchw(&SA[i + 2 * prefetch_distance]); + + sa_sint_t s0 = SA[i + prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0 & 
~SUFFIX_GROUP_MARKER] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i + prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1 & ~SUFFIX_GROUP_MARKER] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? Ts1 : NULL); + + libsais_prefetchw(&cache[i + prefetch_distance]); + + sa_sint_t symbol0 = SAINT_MIN, p0 = SA[i + 0]; if (p0 > 0) { cache[i + 0].index = p0; p0 &= ~SUFFIX_GROUP_MARKER; symbol0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] < T[p0 - 1]); p0 = 0; } cache[i + 0].symbol = symbol0; SA[i + 0] = p0 & SAINT_MAX; + sa_sint_t symbol1 = SAINT_MIN, p1 = SA[i + 1]; if (p1 > 0) { cache[i + 1].index = p1; p1 &= ~SUFFIX_GROUP_MARKER; symbol1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] < T[p1 - 1]); p1 = 0; } cache[i + 1].symbol = symbol1; SA[i + 1] = p1 & SAINT_MAX; + } + + for (j += prefetch_distance + 1; i < j; i += 1) + { + sa_sint_t symbol = SAINT_MIN, p = SA[i]; if (p > 0) { cache[i].index = p; p &= ~SUFFIX_GROUP_MARKER; symbol = BUCKETS_INDEX2(T[p - 1], T[p - 2] < T[p - 1]); p = 0; } cache[i].symbol = symbol; SA[i] = p & SAINT_MAX; + } +} + +static void libsais_partial_sorting_scan_left_to_right_32s_1k_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) + { + libsais_prefetchw(&SA[i + 2 * prefetch_distance]); + + sa_sint_t s0 = SA[i + prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i + prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? 
Ts1 : NULL); + + libsais_prefetchw(&cache[i + prefetch_distance]); + + sa_sint_t symbol0 = SAINT_MIN, p0 = SA[i + 0]; if (p0 > 0) { cache[i + 0].index = (p0 - 1) | ((sa_sint_t)(T[p0 - 2] < T[p0 - 1]) << (SAINT_BIT - 1)); symbol0 = T[p0 - 1]; p0 = 0; } cache[i + 0].symbol = symbol0; SA[i + 0] = p0 & SAINT_MAX; + sa_sint_t symbol1 = SAINT_MIN, p1 = SA[i + 1]; if (p1 > 0) { cache[i + 1].index = (p1 - 1) | ((sa_sint_t)(T[p1 - 2] < T[p1 - 1]) << (SAINT_BIT - 1)); symbol1 = T[p1 - 1]; p1 = 0; } cache[i + 1].symbol = symbol1; SA[i + 1] = p1 & SAINT_MAX; + } + + for (j += prefetch_distance + 1; i < j; i += 1) + { + sa_sint_t symbol = SAINT_MIN, p = SA[i]; if (p > 0) { cache[i].index = (p - 1) | ((sa_sint_t)(T[p - 2] < T[p - 1]) << (SAINT_BIT - 1)); symbol = T[p - 1]; p = 0; } cache[i].symbol = symbol; SA[i] = p & SAINT_MAX; + } +} + +static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_6k_block_sort(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j, omp_block_end = omp_block_start + omp_block_size; + for (i = omp_block_start, j = omp_block_end - prefetch_distance - 1; i < j; i += 2) + { + libsais_prefetchw(&cache[i + 2 * prefetch_distance]); + + libsais_prefetchw(&buckets[cache[i + prefetch_distance + 0].symbol]); + libsais_prefetchw(&buckets[cache[i + prefetch_distance + 1].symbol]); + + sa_sint_t v0 = cache[i + 0].symbol, p0 = cache[i + 0].index; d += (p0 < 0); cache[i + 0].symbol = buckets[v0]++; cache[i + 0].index = (p0 - 1) | ((sa_sint_t)(buckets[2 + v0] != d) << (SAINT_BIT - 1)); buckets[2 + v0] = d; + if (cache[i + 0].symbol < omp_block_end) { sa_sint_t s = cache[i + 0].symbol, q = (cache[s].index = cache[i + 0].index) & SAINT_MAX; cache[s].symbol = BUCKETS_INDEX4(T[q - 1], T[q - 2] >= T[q - 1]); } + + sa_sint_t v1 = cache[i + 1].symbol, p1 = cache[i + 1].index; d += (p1 < 0); cache[i + 1].symbol = buckets[v1]++; cache[i + 1].index = (p1 - 1) | ((sa_sint_t)(buckets[2 + v1] != d) << (SAINT_BIT - 1)); buckets[2 + v1] = d; + if (cache[i + 1].symbol < omp_block_end) { sa_sint_t s = cache[i + 1].symbol, q = (cache[s].index = cache[i + 1].index) & SAINT_MAX; cache[s].symbol = BUCKETS_INDEX4(T[q - 1], T[q - 2] >= T[q - 1]); } + } + + for (j += prefetch_distance + 1; i < j; i += 1) + { + sa_sint_t v = cache[i].symbol, p = cache[i].index; d += (p < 0); cache[i].symbol = buckets[v]++; cache[i].index = (p - 1) | ((sa_sint_t)(buckets[2 + v] != d) << (SAINT_BIT - 1)); buckets[2 + v] = d; + if (cache[i].symbol < omp_block_end) { sa_sint_t s = cache[i].symbol, q = (cache[s].index = cache[i].index) & SAINT_MAX; cache[s].symbol = BUCKETS_INDEX4(T[q - 1], T[q - 2] >= T[q - 1]); } + } + + return d; +} + +static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_4k_block_sort(const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + sa_sint_t * RESTRICT induction_bucket = &buckets[2 * k]; + sa_sint_t * RESTRICT distinct_names = &buckets[0 * k]; + + fast_sint_t i, j, omp_block_end = omp_block_start + omp_block_size; + for (i = omp_block_start, j = omp_block_end - prefetch_distance - 1; i < j; i += 2) + { + libsais_prefetchw(&cache[i + 2 * prefetch_distance]); + + sa_sint_t s0 = cache[i + prefetch_distance + 0].symbol; const sa_sint_t * Is0 
= &induction_bucket[s0 >> 1]; libsais_prefetchw(s0 >= 0 ? Is0 : NULL); const sa_sint_t * Ds0 = &distinct_names[s0]; libsais_prefetchw(s0 >= 0 ? Ds0 : NULL); + sa_sint_t s1 = cache[i + prefetch_distance + 1].symbol; const sa_sint_t * Is1 = &induction_bucket[s1 >> 1]; libsais_prefetchw(s1 >= 0 ? Is1 : NULL); const sa_sint_t * Ds1 = &distinct_names[s1]; libsais_prefetchw(s1 >= 0 ? Ds1 : NULL); + + sa_sint_t v0 = cache[i + 0].symbol; + if (v0 >= 0) + { + sa_sint_t p0 = cache[i + 0].index; d += (p0 >> (SUFFIX_GROUP_BIT - 1)); cache[i + 0].symbol = induction_bucket[v0 >> 1]++; cache[i + 0].index = (p0 - 1) | (v0 << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v0] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v0] = d; + if (cache[i + 0].symbol < omp_block_end) { sa_sint_t ni = cache[i + 0].symbol, np = cache[i + 0].index; if (np > 0) { cache[ni].index = np; np &= ~SUFFIX_GROUP_MARKER; cache[ni].symbol = BUCKETS_INDEX2(T[np - 1], T[np - 2] < T[np - 1]); np = 0; } cache[i + 0].index = np & SAINT_MAX; } + } + + sa_sint_t v1 = cache[i + 1].symbol; + if (v1 >= 0) + { + sa_sint_t p1 = cache[i + 1].index; d += (p1 >> (SUFFIX_GROUP_BIT - 1)); cache[i + 1].symbol = induction_bucket[v1 >> 1]++; cache[i + 1].index = (p1 - 1) | (v1 << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v1] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v1] = d; + if (cache[i + 1].symbol < omp_block_end) { sa_sint_t ni = cache[i + 1].symbol, np = cache[i + 1].index; if (np > 0) { cache[ni].index = np; np &= ~SUFFIX_GROUP_MARKER; cache[ni].symbol = BUCKETS_INDEX2(T[np - 1], T[np - 2] < T[np - 1]); np = 0; } cache[i + 1].index = np & SAINT_MAX; } + } + } + + for (j += prefetch_distance + 1; i < j; i += 1) + { + sa_sint_t v = cache[i].symbol; + if (v >= 0) + { + sa_sint_t p = cache[i].index; d += (p >> (SUFFIX_GROUP_BIT - 1)); cache[i].symbol = induction_bucket[v >> 1]++; cache[i].index = (p - 1) | (v << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v] = d; + if (cache[i].symbol < omp_block_end) { sa_sint_t ni = cache[i].symbol, np = cache[i].index; if (np > 0) { cache[ni].index = np; np &= ~SUFFIX_GROUP_MARKER; cache[ni].symbol = BUCKETS_INDEX2(T[np - 1], T[np - 2] < T[np - 1]); np = 0; } cache[i].index = np & SAINT_MAX; } + } + } + + return d; +} + +static void libsais_partial_sorting_scan_left_to_right_32s_1k_block_sort(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j, omp_block_end = omp_block_start + omp_block_size; + for (i = omp_block_start, j = omp_block_end - prefetch_distance - 1; i < j; i += 2) + { + libsais_prefetchw(&cache[i + 2 * prefetch_distance]); + + sa_sint_t s0 = cache[i + prefetch_distance + 0].symbol; const sa_sint_t * Is0 = &induction_bucket[s0]; libsais_prefetchw(s0 >= 0 ? Is0 : NULL); + sa_sint_t s1 = cache[i + prefetch_distance + 1].symbol; const sa_sint_t * Is1 = &induction_bucket[s1]; libsais_prefetchw(s1 >= 0 ? 
Is1 : NULL); + + sa_sint_t v0 = cache[i + 0].symbol; + if (v0 >= 0) + { + cache[i + 0].symbol = induction_bucket[v0]++; + if (cache[i + 0].symbol < omp_block_end) { sa_sint_t ni = cache[i + 0].symbol, np = cache[i + 0].index; if (np > 0) { cache[ni].index = (np - 1) | ((sa_sint_t)(T[np - 2] < T[np - 1]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np - 1]; np = 0; } cache[i + 0].index = np & SAINT_MAX; } + } + + sa_sint_t v1 = cache[i + 1].symbol; + if (v1 >= 0) + { + cache[i + 1].symbol = induction_bucket[v1]++; + if (cache[i + 1].symbol < omp_block_end) { sa_sint_t ni = cache[i + 1].symbol, np = cache[i + 1].index; if (np > 0) { cache[ni].index = (np - 1) | ((sa_sint_t)(T[np - 2] < T[np - 1]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np - 1]; np = 0; } cache[i + 1].index = np & SAINT_MAX; } + } + } + + for (j += prefetch_distance + 1; i < j; i += 1) + { + sa_sint_t v = cache[i].symbol; + if (v >= 0) + { + cache[i].symbol = induction_bucket[v]++; + if (cache[i].symbol < omp_block_end) { sa_sint_t ni = cache[i].symbol, np = cache[i].index; if (np > 0) { cache[ni].index = (np - 1) | ((sa_sint_t)(T[np - 2] < T[np - 1]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np - 1]; np = 0; } cache[i].index = np & SAINT_MAX; } + } + } +} + +static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_6k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(cache); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : block_size - omp_block_start; + + omp_block_start += block_start; + + if (omp_num_threads == 1) + { + d = libsais_partial_sorting_scan_left_to_right_32s_6k(T, SA, buckets, d, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + libsais_partial_sorting_scan_left_to_right_32s_6k_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size); + } + + #pragma omp barrier + + #pragma omp master + { + d = libsais_partial_sorting_scan_left_to_right_32s_6k_block_sort(T, buckets, d, cache - block_start, block_start, block_size); + } + + #pragma omp barrier + + { + libsais_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size); + } + } +#endif + } + + return d; +} + +static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_4k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(cache); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start; + + omp_block_start += block_start; + + if (omp_num_threads == 1) + { + d = libsais_partial_sorting_scan_left_to_right_32s_4k(T, SA, k, buckets, d, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + libsais_partial_sorting_scan_left_to_right_32s_4k_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size); + } + + #pragma omp barrier + + #pragma omp master + { + d = libsais_partial_sorting_scan_left_to_right_32s_4k_block_sort(T, k, buckets, d, cache - block_start, block_start, block_size); + } + + #pragma omp barrier + + { + libsais_compact_and_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size); + } + } +#endif + } + + return d; +} + +static void libsais_partial_sorting_scan_left_to_right_32s_1k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(cache); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : block_size - omp_block_start; + + omp_block_start += block_start; + + if (omp_num_threads == 1) + { + libsais_partial_sorting_scan_left_to_right_32s_1k(T, SA, buckets, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + libsais_partial_sorting_scan_left_to_right_32s_1k_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size); + } + + #pragma omp barrier + + #pragma omp master + { + libsais_partial_sorting_scan_left_to_right_32s_1k_block_sort(T, buckets, cache - block_start, block_start, block_size); + } + + #pragma omp barrier + + { + libsais_compact_and_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size); + } + } +#endif + } +} + +#endif + +static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_6k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t left_suffixes_count, sa_sint_t d, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + SA[buckets[BUCKETS_INDEX4(T[n - 1], T[n - 2] >= T[n - 1])]++] = (n - 1) | SAINT_MIN; + buckets[2 + BUCKETS_INDEX4(T[n - 1], T[n - 2] >= T[n - 1])] = ++d; + + if (threads == 1 || left_suffixes_count < 65536) + { + d = libsais_partial_sorting_scan_left_to_right_32s_6k(T, SA, buckets, d, 0, left_suffixes_count); + } +#if defined(_OPENMP) + else + { + fast_sint_t block_start, block_end; + for (block_start = 0; block_start < left_suffixes_count; block_start = block_end) + { + block_end = block_start + (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end > left_suffixes_count) { block_end = left_suffixes_count; } + + d = libsais_partial_sorting_scan_left_to_right_32s_6k_block_omp(T, SA, buckets, d, thread_state[0].state.cache, block_start, block_end - block_start, threads); + } + } +#else + UNUSED(thread_state); +#endif + + return d; +} + +static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_4k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + sa_sint_t * RESTRICT induction_bucket = &buckets[2 * k]; + sa_sint_t * RESTRICT distinct_names = &buckets[0 * k]; + + SA[induction_bucket[T[n - 1]]++] = (n - 1) | ((sa_sint_t)(T[n - 2] < T[n - 1]) << (SAINT_BIT - 1)) | SUFFIX_GROUP_MARKER; + distinct_names[BUCKETS_INDEX2(T[n - 1], T[n - 2] < T[n - 1])] = ++d; + + if (threads == 1 || n < 65536) + { + d = libsais_partial_sorting_scan_left_to_right_32s_4k(T, SA, k, buckets, d, 0, n); + } +#if defined(_OPENMP) + else + { + fast_sint_t block_start, block_end; + for (block_start = 0; block_start < n; block_start = block_end) + { + block_end = block_start + (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end > n) { block_end = n; } + + d = libsais_partial_sorting_scan_left_to_right_32s_4k_block_omp(T, SA, k, buckets, d, thread_state[0].state.cache, block_start, block_end - block_start, threads); + } + } +#else + UNUSED(thread_state); +#endif + + return d; +} + +static void libsais_partial_sorting_scan_left_to_right_32s_1k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + SA[buckets[T[n - 1]]++] = (n - 1) | ((sa_sint_t)(T[n - 2] < T[n - 1]) << (SAINT_BIT - 1)); + + if (threads == 1 || n < 65536) + { + libsais_partial_sorting_scan_left_to_right_32s_1k(T, SA, buckets, 0, n); + } +#if defined(_OPENMP) + else 
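+ /* Multi-threaded path: walk the SA range in blocks of threads * LIBSAIS_PER_THREAD_CACHE_SIZE entries, delegating each block to libsais_partial_sorting_scan_left_to_right_32s_1k_block_omp. */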
+ { + fast_sint_t block_start, block_end; + for (block_start = 0; block_start < n; block_start = block_end) + { + block_end = block_start + (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end > n) { block_end = n; } + + libsais_partial_sorting_scan_left_to_right_32s_1k_block_omp(T, SA, buckets, thread_state[0].state.cache, block_start, block_end - block_start, threads); + } + } +#else + UNUSED(thread_state); +#endif +} + +static void libsais_partial_sorting_shift_markers_8u_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, const sa_sint_t * RESTRICT buckets, sa_sint_t threads) +{ + const fast_sint_t prefetch_distance = 32; + + const sa_sint_t * RESTRICT temp_bucket = &buckets[4 * ALPHABET_SIZE]; + + fast_sint_t c; + +#if defined(_OPENMP) + #pragma omp parallel for schedule(static, 1) num_threads(threads) if(threads > 1 && n >= 65536) +#else + UNUSED(threads); UNUSED(n); +#endif + for (c = BUCKETS_INDEX2(ALPHABET_SIZE - 1, 0); c >= BUCKETS_INDEX2(1, 0); c -= BUCKETS_INDEX2(1, 0)) + { + fast_sint_t i, j; sa_sint_t s = SAINT_MIN; + for (i = (fast_sint_t)temp_bucket[c] - 1, j = (fast_sint_t)buckets[c - BUCKETS_INDEX2(1, 0)] + 3; i >= j; i -= 4) + { + libsais_prefetchw(&SA[i - prefetch_distance]); + + sa_sint_t p0 = SA[i - 0], q0 = (p0 & SAINT_MIN) ^ s; s = s ^ q0; SA[i - 0] = p0 ^ q0; + sa_sint_t p1 = SA[i - 1], q1 = (p1 & SAINT_MIN) ^ s; s = s ^ q1; SA[i - 1] = p1 ^ q1; + sa_sint_t p2 = SA[i - 2], q2 = (p2 & SAINT_MIN) ^ s; s = s ^ q2; SA[i - 2] = p2 ^ q2; + sa_sint_t p3 = SA[i - 3], q3 = (p3 & SAINT_MIN) ^ s; s = s ^ q3; SA[i - 3] = p3 ^ q3; + } + + for (j -= 3; i >= j; i -= 1) + { + sa_sint_t p = SA[i], q = (p & SAINT_MIN) ^ s; s = s ^ q; SA[i] = p ^ q; + } + } +} + +static void libsais_partial_sorting_shift_markers_32s_6k_omp(sa_sint_t * RESTRICT SA, sa_sint_t k, const sa_sint_t * RESTRICT buckets, sa_sint_t threads) +{ + const fast_sint_t prefetch_distance = 32; + + const sa_sint_t * RESTRICT temp_bucket = &buckets[4 * k]; + + fast_sint_t c; + +#if defined(_OPENMP) + #pragma omp parallel for schedule(static, 1) num_threads(threads) if(threads > 1 && k >= 65536) +#else + UNUSED(threads); +#endif + for (c = (fast_sint_t)k - 1; c >= 1; c -= 1) + { + fast_sint_t i, j; sa_sint_t s = SAINT_MIN; + for (i = (fast_sint_t)buckets[BUCKETS_INDEX4(c, 0)] - 1, j = (fast_sint_t)temp_bucket[BUCKETS_INDEX2(c - 1, 0)] + 3; i >= j; i -= 4) + { + libsais_prefetchw(&SA[i - prefetch_distance]); + + sa_sint_t p0 = SA[i - 0], q0 = (p0 & SAINT_MIN) ^ s; s = s ^ q0; SA[i - 0] = p0 ^ q0; + sa_sint_t p1 = SA[i - 1], q1 = (p1 & SAINT_MIN) ^ s; s = s ^ q1; SA[i - 1] = p1 ^ q1; + sa_sint_t p2 = SA[i - 2], q2 = (p2 & SAINT_MIN) ^ s; s = s ^ q2; SA[i - 2] = p2 ^ q2; + sa_sint_t p3 = SA[i - 3], q3 = (p3 & SAINT_MIN) ^ s; s = s ^ q3; SA[i - 3] = p3 ^ q3; + } + + for (j -= 3; i >= j; i -= 1) + { + sa_sint_t p = SA[i], q = (p & SAINT_MIN) ^ s; s = s ^ q; SA[i] = p ^ q; + } + } +} + +static void libsais_partial_sorting_shift_markers_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t n) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i; sa_sint_t s = SUFFIX_GROUP_MARKER; + for (i = (fast_sint_t)n - 1; i >= 3; i -= 4) + { + libsais_prefetchw(&SA[i - prefetch_distance]); + + sa_sint_t p0 = SA[i - 0], q0 = ((p0 & SUFFIX_GROUP_MARKER) ^ s) & ((sa_sint_t)(p0 > 0) << ((SUFFIX_GROUP_BIT - 1))); s = s ^ q0; SA[i - 0] = p0 ^ q0; + sa_sint_t p1 = SA[i - 1], q1 = ((p1 & SUFFIX_GROUP_MARKER) ^ s) & ((sa_sint_t)(p1 > 0) << ((SUFFIX_GROUP_BIT - 1))); s = s ^ q1; SA[i - 1] = p1 ^ q1; + sa_sint_t p2 = SA[i - 2], q2 = ((p2 & 
SUFFIX_GROUP_MARKER) ^ s) & ((sa_sint_t)(p2 > 0) << ((SUFFIX_GROUP_BIT - 1))); s = s ^ q2; SA[i - 2] = p2 ^ q2; + sa_sint_t p3 = SA[i - 3], q3 = ((p3 & SUFFIX_GROUP_MARKER) ^ s) & ((sa_sint_t)(p3 > 0) << ((SUFFIX_GROUP_BIT - 1))); s = s ^ q3; SA[i - 3] = p3 ^ q3; + } + + for (; i >= 0; i -= 1) + { + sa_sint_t p = SA[i], q = ((p & SUFFIX_GROUP_MARKER) ^ s) & ((sa_sint_t)(p > 0) << ((SUFFIX_GROUP_BIT - 1))); s = s ^ q; SA[i] = p ^ q; + } +} + +static void libsais_partial_sorting_shift_buckets_32s_6k(sa_sint_t k, sa_sint_t * RESTRICT buckets) +{ + sa_sint_t * RESTRICT temp_bucket = &buckets[4 * k]; + + fast_sint_t i; + for (i = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX2(1, 0)) + { + buckets[2 * i + BUCKETS_INDEX4(0, 0)] = temp_bucket[i + BUCKETS_INDEX2(0, 0)]; + buckets[2 * i + BUCKETS_INDEX4(0, 1)] = temp_bucket[i + BUCKETS_INDEX2(0, 1)]; + } +} + +static sa_sint_t libsais_partial_sorting_scan_right_to_left_8u(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE]; + sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE]; + + fast_sint_t i, j; + for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2) + { + libsais_prefetch(&SA[i - 2 * prefetch_distance]); + + libsais_prefetch(&T[SA[i - prefetch_distance - 0] & SAINT_MAX] - 1); + libsais_prefetch(&T[SA[i - prefetch_distance - 0] & SAINT_MAX] - 2); + libsais_prefetch(&T[SA[i - prefetch_distance - 1] & SAINT_MAX] - 1); + libsais_prefetch(&T[SA[i - prefetch_distance - 1] & SAINT_MAX] - 2); + + sa_sint_t p0 = SA[i - 0]; d += (p0 < 0); p0 &= SAINT_MAX; sa_sint_t v0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] > T[p0 - 1]); + SA[--induction_bucket[v0]] = (p0 - 1) | ((sa_sint_t)(distinct_names[v0] != d) << (SAINT_BIT - 1)); distinct_names[v0] = d; + + sa_sint_t p1 = SA[i - 1]; d += (p1 < 0); p1 &= SAINT_MAX; sa_sint_t v1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] > T[p1 - 1]); + SA[--induction_bucket[v1]] = (p1 - 1) | ((sa_sint_t)(distinct_names[v1] != d) << (SAINT_BIT - 1)); distinct_names[v1] = d; + } + + for (j -= prefetch_distance + 1; i >= j; i -= 1) + { + sa_sint_t p = SA[i]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]); + SA[--induction_bucket[v]] = (p - 1) | ((sa_sint_t)(distinct_names[v] != d) << (SAINT_BIT - 1)); distinct_names[v] = d; + } + + return d; +} + +#if defined(_OPENMP) + +static void libsais_partial_sorting_scan_right_to_left_8u_block_prepare(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size, LIBSAIS_THREAD_STATE * RESTRICT state) +{ + const fast_sint_t prefetch_distance = 32; + + sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE]; + sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE]; + + memset(buckets, 0, 4 * ALPHABET_SIZE * sizeof(sa_sint_t)); + + fast_sint_t i, j, count = 0; sa_sint_t d = 1; + for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2) + { + libsais_prefetch(&SA[i - 2 * prefetch_distance]); + + libsais_prefetch(&T[SA[i - prefetch_distance - 0] & SAINT_MAX] - 1); + libsais_prefetch(&T[SA[i - prefetch_distance - 0] & SAINT_MAX] - 2); + 
libsais_prefetch(&T[SA[i - prefetch_distance - 1] & SAINT_MAX] - 1); + libsais_prefetch(&T[SA[i - prefetch_distance - 1] & SAINT_MAX] - 2); + + sa_sint_t p0 = cache[count].index = SA[i - 0]; d += (p0 < 0); p0 &= SAINT_MAX; sa_sint_t v0 = cache[count++].symbol = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] > T[p0 - 1]); induction_bucket[v0]++; distinct_names[v0] = d; + sa_sint_t p1 = cache[count].index = SA[i - 1]; d += (p1 < 0); p1 &= SAINT_MAX; sa_sint_t v1 = cache[count++].symbol = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] > T[p1 - 1]); induction_bucket[v1]++; distinct_names[v1] = d; + } + + for (j -= prefetch_distance + 1; i >= j; i -= 1) + { + sa_sint_t p = cache[count].index = SA[i]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = cache[count++].symbol = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]); induction_bucket[v]++; distinct_names[v] = d; + } + + state[0].state.position = (fast_sint_t)d - 1; + state[0].state.count = count; +} + +static void libsais_partial_sorting_scan_right_to_left_8u_block_place(sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count, sa_sint_t d) +{ + const fast_sint_t prefetch_distance = 32; + + sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE]; + sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE]; + + fast_sint_t i, j; + for (i = 0, j = count - 1; i < j; i += 2) + { + libsais_prefetch(&cache[i + prefetch_distance]); + + sa_sint_t p0 = cache[i + 0].index; d += (p0 < 0); sa_sint_t v0 = cache[i + 0].symbol; + SA[--induction_bucket[v0]] = (p0 - 1) | ((sa_sint_t)(distinct_names[v0] != d) << (SAINT_BIT - 1)); distinct_names[v0] = d; + + sa_sint_t p1 = cache[i + 1].index; d += (p1 < 0); sa_sint_t v1 = cache[i + 1].symbol; + SA[--induction_bucket[v1]] = (p1 - 1) | ((sa_sint_t)(distinct_names[v1] != d) << (SAINT_BIT - 1)); distinct_names[v1] = d; + } + + for (j += 1; i < j; i += 1) + { + sa_sint_t p = cache[i].index; d += (p < 0); sa_sint_t v = cache[i].symbol; + SA[--induction_bucket[v]] = (p - 1) | ((sa_sint_t)(distinct_names[v] != d) << (SAINT_BIT - 1)); distinct_names[v] = d; + } +} + +static sa_sint_t libsais_partial_sorting_scan_right_to_left_8u_block_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * ALPHABET_SIZE && omp_get_dynamic() == 0) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(thread_state); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : block_size - omp_block_start; + + omp_block_start += block_start; + + if (omp_num_threads == 1) + { + d = libsais_partial_sorting_scan_right_to_left_8u(T, SA, buckets, d, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + libsais_partial_sorting_scan_right_to_left_8u_block_prepare(T, SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size, &thread_state[omp_thread_num]); + } + + #pragma omp barrier + + #pragma omp master + { + sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE]; + sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE]; + + fast_sint_t t; + for (t = omp_num_threads - 1; t >= 0; --t) + { + sa_sint_t * RESTRICT temp_induction_bucket = &thread_state[t].state.buckets[0 * ALPHABET_SIZE]; + sa_sint_t * RESTRICT temp_distinct_names = &thread_state[t].state.buckets[2 * ALPHABET_SIZE]; + + fast_sint_t c; + for (c = 0; c < 2 * ALPHABET_SIZE; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_induction_bucket[c]; induction_bucket[c] = A - B; temp_induction_bucket[c] = A; } + + for (d -= 1, c = 0; c < 2 * ALPHABET_SIZE; c += 1) { sa_sint_t A = distinct_names[c], B = temp_distinct_names[c], D = B + d; distinct_names[c] = B > 0 ? D : A; temp_distinct_names[c] = A; } + d += 1 + (sa_sint_t)thread_state[t].state.position; thread_state[t].state.position = (fast_sint_t)d - thread_state[t].state.position; + } + } + + #pragma omp barrier + + { + libsais_partial_sorting_scan_right_to_left_8u_block_place(SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count, (sa_sint_t)thread_state[omp_thread_num].state.position); + } + } +#endif + } + + return d; +} + +#endif + +static void libsais_partial_sorting_scan_right_to_left_8u_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count, sa_sint_t d, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + fast_sint_t scan_start = (fast_sint_t)left_suffixes_count + 1; + fast_sint_t scan_end = (fast_sint_t)n - (fast_sint_t)first_lms_suffix; + + if (threads == 1 || (scan_end - scan_start) < 65536) + { + libsais_partial_sorting_scan_right_to_left_8u(T, SA, buckets, d, scan_start, scan_end - scan_start); + } +#if defined(_OPENMP) + else + { + sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE]; + sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE]; + + fast_sint_t block_start; + for (block_start = scan_end - 1; block_start >= scan_start; ) + { + if (SA[block_start] == 0) + { + block_start--; + } + else + { + fast_sint_t block_max_end = block_start - ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads); if (block_max_end < scan_start) { block_max_end = scan_start - 1; } + fast_sint_t block_end = block_start - 1; while (block_end > block_max_end && SA[block_end] != 0) { block_end--; } + fast_sint_t block_size = block_start - block_end; + + if (block_size < 32) + { + for (; block_start > block_end; block_start -= 1) + { + sa_sint_t p = SA[block_start]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]); + SA[--induction_bucket[v]] = (p - 1) | ((sa_sint_t)(distinct_names[v] != d) << (SAINT_BIT - 1)); distinct_names[v] = d; + } + } + else + { + d = libsais_partial_sorting_scan_right_to_left_8u_block_omp(T, SA, buckets, d, block_end + 1, 
block_size, threads, thread_state); + block_start = block_end; + } + } + } + } +#else + UNUSED(thread_state); +#endif +} + +static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_6k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 1; i >= j; i -= 2) + { + libsais_prefetch(&SA[i - 3 * prefetch_distance]); + + libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 0] & SAINT_MAX] - 1); + libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 0] & SAINT_MAX] - 2); + libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 1] & SAINT_MAX] - 1); + libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 1] & SAINT_MAX] - 2); + + sa_sint_t p0 = SA[i - prefetch_distance - 0] & SAINT_MAX; sa_sint_t v0 = BUCKETS_INDEX4(T[p0 - (p0 > 0)], 0); libsais_prefetchw(&buckets[v0]); + sa_sint_t p1 = SA[i - prefetch_distance - 1] & SAINT_MAX; sa_sint_t v1 = BUCKETS_INDEX4(T[p1 - (p1 > 0)], 0); libsais_prefetchw(&buckets[v1]); + + sa_sint_t p2 = SA[i - 0]; d += (p2 < 0); p2 &= SAINT_MAX; sa_sint_t v2 = BUCKETS_INDEX4(T[p2 - 1], T[p2 - 2] > T[p2 - 1]); + SA[--buckets[v2]] = (p2 - 1) | ((sa_sint_t)(buckets[2 + v2] != d) << (SAINT_BIT - 1)); buckets[2 + v2] = d; + + sa_sint_t p3 = SA[i - 1]; d += (p3 < 0); p3 &= SAINT_MAX; sa_sint_t v3 = BUCKETS_INDEX4(T[p3 - 1], T[p3 - 2] > T[p3 - 1]); + SA[--buckets[v3]] = (p3 - 1) | ((sa_sint_t)(buckets[2 + v3] != d) << (SAINT_BIT - 1)); buckets[2 + v3] = d; + } + + for (j -= 2 * prefetch_distance + 1; i >= j; i -= 1) + { + sa_sint_t p = SA[i]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = BUCKETS_INDEX4(T[p - 1], T[p - 2] > T[p - 1]); + SA[--buckets[v]] = (p - 1) | ((sa_sint_t)(buckets[2 + v] != d) << (SAINT_BIT - 1)); buckets[2 + v] = d; + } + + return d; +} + +static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_4k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + sa_sint_t * RESTRICT induction_bucket = &buckets[3 * k]; + sa_sint_t * RESTRICT distinct_names = &buckets[0 * k]; + + fast_sint_t i, j; + for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 1; i >= j; i -= 2) + { + libsais_prefetchw(&SA[i - 3 * prefetch_distance]); + + sa_sint_t s0 = SA[i - 2 * prefetch_distance - 0]; const sa_sint_t * Ts0 = &T[s0 & ~SUFFIX_GROUP_MARKER] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i - 2 * prefetch_distance - 1]; const sa_sint_t * Ts1 = &T[s1 & ~SUFFIX_GROUP_MARKER] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? 
Ts1 : NULL); + sa_sint_t s2 = SA[i - 1 * prefetch_distance - 0]; if (s2 > 0) { const fast_sint_t Ts2 = T[(s2 & ~SUFFIX_GROUP_MARKER) - 1]; libsais_prefetchw(&induction_bucket[Ts2]); libsais_prefetchw(&distinct_names[BUCKETS_INDEX2(Ts2, 0)]); } + sa_sint_t s3 = SA[i - 1 * prefetch_distance - 1]; if (s3 > 0) { const fast_sint_t Ts3 = T[(s3 & ~SUFFIX_GROUP_MARKER) - 1]; libsais_prefetchw(&induction_bucket[Ts3]); libsais_prefetchw(&distinct_names[BUCKETS_INDEX2(Ts3, 0)]); } + + sa_sint_t p0 = SA[i - 0]; + if (p0 > 0) + { + SA[i - 0] = 0; d += (p0 >> (SUFFIX_GROUP_BIT - 1)); p0 &= ~SUFFIX_GROUP_MARKER; sa_sint_t v0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] > T[p0 - 1]); + SA[--induction_bucket[T[p0 - 1]]] = (p0 - 1) | ((sa_sint_t)(T[p0 - 2] > T[p0 - 1]) << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v0] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v0] = d; + } + + sa_sint_t p1 = SA[i - 1]; + if (p1 > 0) + { + SA[i - 1] = 0; d += (p1 >> (SUFFIX_GROUP_BIT - 1)); p1 &= ~SUFFIX_GROUP_MARKER; sa_sint_t v1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] > T[p1 - 1]); + SA[--induction_bucket[T[p1 - 1]]] = (p1 - 1) | ((sa_sint_t)(T[p1 - 2] > T[p1 - 1]) << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v1] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v1] = d; + } + } + + for (j -= 2 * prefetch_distance + 1; i >= j; i -= 1) + { + sa_sint_t p = SA[i]; + if (p > 0) + { + SA[i] = 0; d += (p >> (SUFFIX_GROUP_BIT - 1)); p &= ~SUFFIX_GROUP_MARKER; sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]); + SA[--induction_bucket[T[p - 1]]] = (p - 1) | ((sa_sint_t)(T[p - 2] > T[p - 1]) << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v] = d; + } + } + + return d; +} + +static void libsais_partial_sorting_scan_right_to_left_32s_1k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 1; i >= j; i -= 2) + { + libsais_prefetchw(&SA[i - 3 * prefetch_distance]); + + sa_sint_t s0 = SA[i - 2 * prefetch_distance - 0]; const sa_sint_t * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i - 2 * prefetch_distance - 1]; const sa_sint_t * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? 
Ts1 : NULL); + sa_sint_t s2 = SA[i - 1 * prefetch_distance - 0]; if (s2 > 0) { libsais_prefetchw(&induction_bucket[T[s2 - 1]]); libsais_prefetch(&T[s2] - 2); } + sa_sint_t s3 = SA[i - 1 * prefetch_distance - 1]; if (s3 > 0) { libsais_prefetchw(&induction_bucket[T[s3 - 1]]); libsais_prefetch(&T[s3] - 2); } + + sa_sint_t p0 = SA[i - 0]; if (p0 > 0) { SA[i - 0] = 0; SA[--induction_bucket[T[p0 - 1]]] = (p0 - 1) | ((sa_sint_t)(T[p0 - 2] > T[p0 - 1]) << (SAINT_BIT - 1)); } + sa_sint_t p1 = SA[i - 1]; if (p1 > 0) { SA[i - 1] = 0; SA[--induction_bucket[T[p1 - 1]]] = (p1 - 1) | ((sa_sint_t)(T[p1 - 2] > T[p1 - 1]) << (SAINT_BIT - 1)); } + } + + for (j -= 2 * prefetch_distance + 1; i >= j; i -= 1) + { + sa_sint_t p = SA[i]; if (p > 0) { SA[i] = 0; SA[--induction_bucket[T[p - 1]]] = (p - 1) | ((sa_sint_t)(T[p - 2] > T[p - 1]) << (SAINT_BIT - 1)); } + } +} + +#if defined(_OPENMP) + +static void libsais_partial_sorting_scan_right_to_left_32s_6k_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) + { + libsais_prefetch(&SA[i + 2 * prefetch_distance]); + + libsais_prefetch(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 1); + libsais_prefetch(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 2); + libsais_prefetch(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 1); + libsais_prefetch(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 2); + + libsais_prefetchw(&cache[i + prefetch_distance]); + + sa_sint_t p0 = cache[i + 0].index = SA[i + 0]; sa_sint_t symbol0 = 0; p0 &= SAINT_MAX; if (p0 != 0) { symbol0 = BUCKETS_INDEX4(T[p0 - 1], T[p0 - 2] > T[p0 - 1]); } cache[i + 0].symbol = symbol0; + sa_sint_t p1 = cache[i + 1].index = SA[i + 1]; sa_sint_t symbol1 = 0; p1 &= SAINT_MAX; if (p1 != 0) { symbol1 = BUCKETS_INDEX4(T[p1 - 1], T[p1 - 2] > T[p1 - 1]); } cache[i + 1].symbol = symbol1; + } + + for (j += prefetch_distance + 1; i < j; i += 1) + { + sa_sint_t p = cache[i].index = SA[i]; sa_sint_t symbol = 0; p &= SAINT_MAX; if (p != 0) { symbol = BUCKETS_INDEX4(T[p - 1], T[p - 2] > T[p - 1]); } cache[i].symbol = symbol; + } +} + +static void libsais_partial_sorting_scan_right_to_left_32s_4k_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) + { + libsais_prefetchw(&SA[i + 2 * prefetch_distance]); + + sa_sint_t s0 = SA[i + prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0 & ~SUFFIX_GROUP_MARKER] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i + prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1 & ~SUFFIX_GROUP_MARKER] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? 
Ts1 : NULL); + + libsais_prefetchw(&cache[i + prefetch_distance]); + + sa_sint_t symbol0 = SAINT_MIN, p0 = SA[i + 0]; if (p0 > 0) { SA[i + 0] = 0; cache[i + 0].index = p0; p0 &= ~SUFFIX_GROUP_MARKER; symbol0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] > T[p0 - 1]); } cache[i + 0].symbol = symbol0; + sa_sint_t symbol1 = SAINT_MIN, p1 = SA[i + 1]; if (p1 > 0) { SA[i + 1] = 0; cache[i + 1].index = p1; p1 &= ~SUFFIX_GROUP_MARKER; symbol1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] > T[p1 - 1]); } cache[i + 1].symbol = symbol1; + } + + for (j += prefetch_distance + 1; i < j; i += 1) + { + sa_sint_t symbol = SAINT_MIN, p = SA[i]; if (p > 0) { SA[i] = 0; cache[i].index = p; p &= ~SUFFIX_GROUP_MARKER; symbol = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]); } cache[i].symbol = symbol; + } +} + +static void libsais_partial_sorting_scan_right_to_left_32s_1k_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) + { + libsais_prefetchw(&SA[i + 2 * prefetch_distance]); + + sa_sint_t s0 = SA[i + prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i + prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? Ts1 : NULL); + + libsais_prefetchw(&cache[i + prefetch_distance]); + + sa_sint_t symbol0 = SAINT_MIN, p0 = SA[i + 0]; if (p0 > 0) { SA[i + 0] = 0; cache[i + 0].index = (p0 - 1) | ((sa_sint_t)(T[p0 - 2] > T[p0 - 1]) << (SAINT_BIT - 1)); symbol0 = T[p0 - 1]; } cache[i + 0].symbol = symbol0; + sa_sint_t symbol1 = SAINT_MIN, p1 = SA[i + 1]; if (p1 > 0) { SA[i + 1] = 0; cache[i + 1].index = (p1 - 1) | ((sa_sint_t)(T[p1 - 2] > T[p1 - 1]) << (SAINT_BIT - 1)); symbol1 = T[p1 - 1]; } cache[i + 1].symbol = symbol1; + } + + for (j += prefetch_distance + 1; i < j; i += 1) + { + sa_sint_t symbol = SAINT_MIN, p = SA[i]; if (p > 0) { SA[i] = 0; cache[i].index = (p - 1) | ((sa_sint_t)(T[p - 2] > T[p - 1]) << (SAINT_BIT - 1)); symbol = T[p - 1]; } cache[i].symbol = symbol; + } +} + +static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_6k_block_sort(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2) + { + libsais_prefetchw(&cache[i - 2 * prefetch_distance]); + + libsais_prefetchw(&buckets[cache[i - prefetch_distance - 0].symbol]); + libsais_prefetchw(&buckets[cache[i - prefetch_distance - 1].symbol]); + + sa_sint_t v0 = cache[i - 0].symbol, p0 = cache[i - 0].index; d += (p0 < 0); cache[i - 0].symbol = --buckets[v0]; cache[i - 0].index = (p0 - 1) | ((sa_sint_t)(buckets[2 + v0] != d) << (SAINT_BIT - 1)); buckets[2 + v0] = d; + if (cache[i - 0].symbol >= omp_block_start) { sa_sint_t s = cache[i - 0].symbol, q = (cache[s].index = cache[i - 0].index) & SAINT_MAX; cache[s].symbol = BUCKETS_INDEX4(T[q - 1], T[q - 2] > T[q - 1]); } + + sa_sint_t v1 = cache[i - 1].symbol, p1 = cache[i - 1].index; d += (p1 < 0); cache[i - 1].symbol = --buckets[v1]; cache[i - 1].index = (p1 - 
1) | ((sa_sint_t)(buckets[2 + v1] != d) << (SAINT_BIT - 1)); buckets[2 + v1] = d; + if (cache[i - 1].symbol >= omp_block_start) { sa_sint_t s = cache[i - 1].symbol, q = (cache[s].index = cache[i - 1].index) & SAINT_MAX; cache[s].symbol = BUCKETS_INDEX4(T[q - 1], T[q - 2] > T[q - 1]); } + } + + for (j -= prefetch_distance + 1; i >= j; i -= 1) + { + sa_sint_t v = cache[i].symbol, p = cache[i].index; d += (p < 0); cache[i].symbol = --buckets[v]; cache[i].index = (p - 1) | ((sa_sint_t)(buckets[2 + v] != d) << (SAINT_BIT - 1)); buckets[2 + v] = d; + if (cache[i].symbol >= omp_block_start) { sa_sint_t s = cache[i].symbol, q = (cache[s].index = cache[i].index) & SAINT_MAX; cache[s].symbol = BUCKETS_INDEX4(T[q - 1], T[q - 2] > T[q - 1]); } + } + + return d; +} + +static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_4k_block_sort(const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + sa_sint_t * RESTRICT induction_bucket = &buckets[3 * k]; + sa_sint_t * RESTRICT distinct_names = &buckets[0 * k]; + + fast_sint_t i, j; + for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2) + { + libsais_prefetchw(&cache[i - 2 * prefetch_distance]); + + sa_sint_t s0 = cache[i - prefetch_distance - 0].symbol; const sa_sint_t * Is0 = &induction_bucket[s0 >> 1]; libsais_prefetchw(s0 >= 0 ? Is0 : NULL); const sa_sint_t * Ds0 = &distinct_names[s0]; libsais_prefetchw(s0 >= 0 ? Ds0 : NULL); + sa_sint_t s1 = cache[i - prefetch_distance - 1].symbol; const sa_sint_t * Is1 = &induction_bucket[s1 >> 1]; libsais_prefetchw(s1 >= 0 ? Is1 : NULL); const sa_sint_t * Ds1 = &distinct_names[s1]; libsais_prefetchw(s1 >= 0 ? 
Ds1 : NULL); + + sa_sint_t v0 = cache[i - 0].symbol; + if (v0 >= 0) + { + sa_sint_t p0 = cache[i - 0].index; d += (p0 >> (SUFFIX_GROUP_BIT - 1)); cache[i - 0].symbol = --induction_bucket[v0 >> 1]; cache[i - 0].index = (p0 - 1) | (v0 << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v0] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v0] = d; + if (cache[i - 0].symbol >= omp_block_start) { sa_sint_t ni = cache[i - 0].symbol, np = cache[i - 0].index; if (np > 0) { cache[i - 0].index = 0; cache[ni].index = np; np &= ~SUFFIX_GROUP_MARKER; cache[ni].symbol = BUCKETS_INDEX2(T[np - 1], T[np - 2] > T[np - 1]); } } + } + + sa_sint_t v1 = cache[i - 1].symbol; + if (v1 >= 0) + { + sa_sint_t p1 = cache[i - 1].index; d += (p1 >> (SUFFIX_GROUP_BIT - 1)); cache[i - 1].symbol = --induction_bucket[v1 >> 1]; cache[i - 1].index = (p1 - 1) | (v1 << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v1] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v1] = d; + if (cache[i - 1].symbol >= omp_block_start) { sa_sint_t ni = cache[i - 1].symbol, np = cache[i - 1].index; if (np > 0) { cache[i - 1].index = 0; cache[ni].index = np; np &= ~SUFFIX_GROUP_MARKER; cache[ni].symbol = BUCKETS_INDEX2(T[np - 1], T[np - 2] > T[np - 1]); } } + } + } + + for (j -= prefetch_distance + 1; i >= j; i -= 1) + { + sa_sint_t v = cache[i].symbol; + if (v >= 0) + { + sa_sint_t p = cache[i].index; d += (p >> (SUFFIX_GROUP_BIT - 1)); cache[i].symbol = --induction_bucket[v >> 1]; cache[i].index = (p - 1) | (v << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v] = d; + if (cache[i].symbol >= omp_block_start) { sa_sint_t ni = cache[i].symbol, np = cache[i].index; if (np > 0) { cache[i].index = 0; cache[ni].index = np; np &= ~SUFFIX_GROUP_MARKER; cache[ni].symbol = BUCKETS_INDEX2(T[np - 1], T[np - 2] > T[np - 1]); } } + } + } + + return d; +} + +static void libsais_partial_sorting_scan_right_to_left_32s_1k_block_sort(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2) + { + libsais_prefetchw(&cache[i - 2 * prefetch_distance]); + + sa_sint_t s0 = cache[i - prefetch_distance - 0].symbol; const sa_sint_t * Is0 = &induction_bucket[s0]; libsais_prefetchw(s0 >= 0 ? Is0 : NULL); + sa_sint_t s1 = cache[i - prefetch_distance - 1].symbol; const sa_sint_t * Is1 = &induction_bucket[s1]; libsais_prefetchw(s1 >= 0 ? 
Is1 : NULL); + + sa_sint_t v0 = cache[i - 0].symbol; + if (v0 >= 0) + { + cache[i - 0].symbol = --induction_bucket[v0]; + if (cache[i - 0].symbol >= omp_block_start) { sa_sint_t ni = cache[i - 0].symbol, np = cache[i - 0].index; if (np > 0) { cache[i - 0].index = 0; cache[ni].index = (np - 1) | ((sa_sint_t)(T[np - 2] > T[np - 1]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np - 1]; } } + } + + sa_sint_t v1 = cache[i - 1].symbol; + if (v1 >= 0) + { + cache[i - 1].symbol = --induction_bucket[v1]; + if (cache[i - 1].symbol >= omp_block_start) { sa_sint_t ni = cache[i - 1].symbol, np = cache[i - 1].index; if (np > 0) { cache[i - 1].index = 0; cache[ni].index = (np - 1) | ((sa_sint_t)(T[np - 2] > T[np - 1]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np - 1]; }} + } + } + + for (j -= prefetch_distance + 1; i >= j; i -= 1) + { + sa_sint_t v = cache[i].symbol; + if (v >= 0) + { + cache[i].symbol = --induction_bucket[v]; + if (cache[i].symbol >= omp_block_start) { sa_sint_t ni = cache[i].symbol, np = cache[i].index; if (np > 0) { cache[i].index = 0; cache[ni].index = (np - 1) | ((sa_sint_t)(T[np - 2] > T[np - 1]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np - 1]; } } + } + } +} + +static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_6k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(cache); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : block_size - omp_block_start; + + omp_block_start += block_start; + + if (omp_num_threads == 1) + { + d = libsais_partial_sorting_scan_right_to_left_32s_6k(T, SA, buckets, d, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + libsais_partial_sorting_scan_right_to_left_32s_6k_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size); + } + + #pragma omp barrier + + #pragma omp master + { + d = libsais_partial_sorting_scan_right_to_left_32s_6k_block_sort(T, buckets, d, cache - block_start, block_start, block_size); + } + + #pragma omp barrier + + { + libsais_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size); + } + } +#endif + } + + return d; +} + +static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_4k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(cache); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start; + + omp_block_start += block_start; + + if (omp_num_threads == 1) + { + d = libsais_partial_sorting_scan_right_to_left_32s_4k(T, SA, k, buckets, d, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + libsais_partial_sorting_scan_right_to_left_32s_4k_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size); + } + + #pragma omp barrier + + #pragma omp master + { + d = libsais_partial_sorting_scan_right_to_left_32s_4k_block_sort(T, k, buckets, d, cache - block_start, block_start, block_size); + } + + #pragma omp barrier + + { + libsais_compact_and_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size); + } + } +#endif + } + + return d; +} + +static void libsais_partial_sorting_scan_right_to_left_32s_1k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(cache); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : block_size - omp_block_start; + + omp_block_start += block_start; + + if (omp_num_threads == 1) + { + libsais_partial_sorting_scan_right_to_left_32s_1k(T, SA, buckets, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + libsais_partial_sorting_scan_right_to_left_32s_1k_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size); + } + + #pragma omp barrier + + #pragma omp master + { + libsais_partial_sorting_scan_right_to_left_32s_1k_block_sort(T, buckets, cache - block_start, block_start, block_size); + } + + #pragma omp barrier + + { + libsais_compact_and_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size); + } + } +#endif + } +} + +#endif + +static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_6k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count, sa_sint_t d, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + fast_sint_t scan_start = (fast_sint_t)left_suffixes_count + 1; + fast_sint_t scan_end = (fast_sint_t)n - (fast_sint_t)first_lms_suffix; + + if (threads == 1 || (scan_end - scan_start) < 65536) + { + d = libsais_partial_sorting_scan_right_to_left_32s_6k(T, SA, buckets, d, scan_start, scan_end - scan_start); + } +#if defined(_OPENMP) + else + { + fast_sint_t block_start, block_end; + for (block_start = scan_end - 1; block_start >= scan_start; block_start = block_end) + { + block_end = block_start - (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end < scan_start) { block_end = scan_start - 1; } + + d = libsais_partial_sorting_scan_right_to_left_32s_6k_block_omp(T, SA, buckets, d, thread_state[0].state.cache, block_end + 1, block_start - block_end, threads); + } + } +#else + UNUSED(thread_state); +#endif + + return d; +} + +static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_4k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + if (threads == 1 || n < 65536) + { + d = libsais_partial_sorting_scan_right_to_left_32s_4k(T, SA, k, buckets, d, 0, n); + } +#if defined(_OPENMP) + else + { + fast_sint_t block_start, block_end; + for (block_start = (fast_sint_t)n - 1; block_start >= 0; block_start = block_end) + { + block_end = block_start - (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end < 0) { block_end = -1; } + + d = libsais_partial_sorting_scan_right_to_left_32s_4k_block_omp(T, SA, k, buckets, d, thread_state[0].state.cache, block_end + 1, block_start - block_end, threads); + } + } +#else + UNUSED(thread_state); +#endif + + return d; +} + +static void libsais_partial_sorting_scan_right_to_left_32s_1k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + if (threads == 1 || n < 65536) + { + libsais_partial_sorting_scan_right_to_left_32s_1k(T, SA, buckets, 0, n); + } +#if defined(_OPENMP) + else + { + fast_sint_t block_start, block_end; + for (block_start = (fast_sint_t)n - 1; block_start >= 0; block_start = block_end) + { + block_end = block_start - (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end < 0) { block_end = -1; } + + libsais_partial_sorting_scan_right_to_left_32s_1k_block_omp(T, SA, buckets, thread_state[0].state.cache, block_end + 
1, block_start - block_end, threads); + } + } +#else + UNUSED(thread_state); +#endif +} + +static fast_sint_t libsais_partial_sorting_gather_lms_suffixes_32s_4k(sa_sint_t * RESTRICT SA, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j, l; + for (i = omp_block_start, j = omp_block_start + omp_block_size - 3, l = omp_block_start; i < j; i += 4) + { + libsais_prefetch(&SA[i + prefetch_distance]); + + sa_sint_t s0 = SA[i + 0]; SA[l] = (s0 - SUFFIX_GROUP_MARKER) & (~SUFFIX_GROUP_MARKER); l += (s0 < 0); + sa_sint_t s1 = SA[i + 1]; SA[l] = (s1 - SUFFIX_GROUP_MARKER) & (~SUFFIX_GROUP_MARKER); l += (s1 < 0); + sa_sint_t s2 = SA[i + 2]; SA[l] = (s2 - SUFFIX_GROUP_MARKER) & (~SUFFIX_GROUP_MARKER); l += (s2 < 0); + sa_sint_t s3 = SA[i + 3]; SA[l] = (s3 - SUFFIX_GROUP_MARKER) & (~SUFFIX_GROUP_MARKER); l += (s3 < 0); + } + + for (j += 3; i < j; i += 1) + { + sa_sint_t s = SA[i]; SA[l] = (s - SUFFIX_GROUP_MARKER) & (~SUFFIX_GROUP_MARKER); l += (s < 0); + } + + return l; +} + +static fast_sint_t libsais_partial_sorting_gather_lms_suffixes_32s_1k(sa_sint_t * RESTRICT SA, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j, l; + for (i = omp_block_start, j = omp_block_start + omp_block_size - 3, l = omp_block_start; i < j; i += 4) + { + libsais_prefetch(&SA[i + prefetch_distance]); + + sa_sint_t s0 = SA[i + 0]; SA[l] = s0 & SAINT_MAX; l += (s0 < 0); + sa_sint_t s1 = SA[i + 1]; SA[l] = s1 & SAINT_MAX; l += (s1 < 0); + sa_sint_t s2 = SA[i + 2]; SA[l] = s2 & SAINT_MAX; l += (s2 < 0); + sa_sint_t s3 = SA[i + 3]; SA[l] = s3 & SAINT_MAX; l += (s3 < 0); + } + + for (j += 3; i < j; i += 1) + { + sa_sint_t s = SA[i]; SA[l] = s & SAINT_MAX; l += (s < 0); + } + + return l; +} + +static void libsais_partial_sorting_gather_lms_suffixes_32s_4k_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(thread_state); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : n - omp_block_start; + + if (omp_num_threads == 1) + { + libsais_partial_sorting_gather_lms_suffixes_32s_4k(SA, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + thread_state[omp_thread_num].state.position = omp_block_start; + thread_state[omp_thread_num].state.count = libsais_partial_sorting_gather_lms_suffixes_32s_4k(SA, omp_block_start, omp_block_size) - omp_block_start; + } + + #pragma omp barrier + + #pragma omp master + { + fast_sint_t t, position = 0; + for (t = 0; t < omp_num_threads; ++t) + { + if (t > 0 && thread_state[t].state.count > 0) + { + memmove(&SA[position], &SA[thread_state[t].state.position], (size_t)thread_state[t].state.count * sizeof(sa_sint_t)); + } + + position += thread_state[t].state.count; + } + } + } +#endif + } +} + +static void libsais_partial_sorting_gather_lms_suffixes_32s_1k_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(thread_state); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start; + + if (omp_num_threads == 1) + { + libsais_partial_sorting_gather_lms_suffixes_32s_1k(SA, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + thread_state[omp_thread_num].state.position = omp_block_start; + thread_state[omp_thread_num].state.count = libsais_partial_sorting_gather_lms_suffixes_32s_1k(SA, omp_block_start, omp_block_size) - omp_block_start; + } + + #pragma omp barrier + + #pragma omp master + { + fast_sint_t t, position = 0; + for (t = 0; t < omp_num_threads; ++t) + { + if (t > 0 && thread_state[t].state.count > 0) + { + memmove(&SA[position], &SA[thread_state[t].state.position], (size_t)thread_state[t].state.count * sizeof(sa_sint_t)); + } + + position += thread_state[t].state.count; + } + } + } +#endif + } +} + +static void libsais_induce_partial_order_8u_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + memset(&buckets[2 * ALPHABET_SIZE], 0, 2 * ALPHABET_SIZE * sizeof(sa_sint_t)); + + sa_sint_t d = libsais_partial_sorting_scan_left_to_right_8u_omp(T, SA, n, buckets, left_suffixes_count, 0, threads, thread_state); + libsais_partial_sorting_shift_markers_8u_omp(SA, n, buckets, threads); + libsais_partial_sorting_scan_right_to_left_8u_omp(T, SA, n, buckets, first_lms_suffix, left_suffixes_count, d, threads, thread_state); +} + +static void libsais_induce_partial_order_32s_6k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + sa_sint_t d = libsais_partial_sorting_scan_left_to_right_32s_6k_omp(T, SA, n, buckets, left_suffixes_count, 0, threads, thread_state); + libsais_partial_sorting_shift_markers_32s_6k_omp(SA, k, buckets, threads); 
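+ /* Copy the temporary per-symbol bucket positions stored at &buckets[4 * k] into the main 4-entries-per-symbol bucket layout used by the right-to-left pass below. */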
+ libsais_partial_sorting_shift_buckets_32s_6k(k, buckets); + libsais_partial_sorting_scan_right_to_left_32s_6k_omp(T, SA, n, buckets, first_lms_suffix, left_suffixes_count, d, threads, thread_state); +} + +static void libsais_induce_partial_order_32s_4k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + memset(buckets, 0, 2 * (size_t)k * sizeof(sa_sint_t)); + + sa_sint_t d = libsais_partial_sorting_scan_left_to_right_32s_4k_omp(T, SA, n, k, buckets, 0, threads, thread_state); + libsais_partial_sorting_shift_markers_32s_4k(SA, n); + libsais_partial_sorting_scan_right_to_left_32s_4k_omp(T, SA, n, k, buckets, d, threads, thread_state); + libsais_partial_sorting_gather_lms_suffixes_32s_4k_omp(SA, n, threads, thread_state); +} + +static void libsais_induce_partial_order_32s_2k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + libsais_partial_sorting_scan_left_to_right_32s_1k_omp(T, SA, n, &buckets[1 * k], threads, thread_state); + libsais_partial_sorting_scan_right_to_left_32s_1k_omp(T, SA, n, &buckets[0 * k], threads, thread_state); + libsais_partial_sorting_gather_lms_suffixes_32s_1k_omp(SA, n, threads, thread_state); +} + +static void libsais_induce_partial_order_32s_1k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + libsais_count_suffixes_32s(T, n, k, buckets); + libsais_initialize_buckets_start_32s_1k(k, buckets); + libsais_partial_sorting_scan_left_to_right_32s_1k_omp(T, SA, n, buckets, threads, thread_state); + + libsais_count_suffixes_32s(T, n, k, buckets); + libsais_initialize_buckets_end_32s_1k(k, buckets); + libsais_partial_sorting_scan_right_to_left_32s_1k_omp(T, SA, n, buckets, threads, thread_state); + + libsais_partial_sorting_gather_lms_suffixes_32s_1k_omp(SA, n, threads, thread_state); +} + +static sa_sint_t libsais_renumber_lms_suffixes_8u(sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t name, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + sa_sint_t * RESTRICT SAm = &SA[m]; + + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4) + { + libsais_prefetch(&SA[i + 2 * prefetch_distance]); + + libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 0] & SAINT_MAX) >> 1]); + libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 1] & SAINT_MAX) >> 1]); + libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 2] & SAINT_MAX) >> 1]); + libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 3] & SAINT_MAX) >> 1]); + + sa_sint_t p0 = SA[i + 0]; SAm[(p0 & SAINT_MAX) >> 1] = name | SAINT_MIN; name += p0 < 0; + sa_sint_t p1 = SA[i + 1]; SAm[(p1 & SAINT_MAX) >> 1] = name | SAINT_MIN; name += p1 < 0; + sa_sint_t p2 = SA[i + 2]; SAm[(p2 & SAINT_MAX) >> 1] = name | SAINT_MIN; name += p2 < 0; + sa_sint_t p3 = SA[i + 3]; SAm[(p3 & SAINT_MAX) >> 1] = name | SAINT_MIN; name += p3 < 0; + } + + for (j += prefetch_distance + 3; i < j; i += 1) + { + sa_sint_t p = SA[i]; SAm[(p & SAINT_MAX) >> 1] = name | SAINT_MIN; name += p < 0; + } + + return name; +} + +static fast_sint_t libsais_gather_marked_suffixes_8u(sa_sint_t * RESTRICT SA, sa_sint_t m, fast_sint_t l, fast_sint_t omp_block_start, 
fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + l -= 1; + + fast_sint_t i, j; + for (i = (fast_sint_t)m + omp_block_start + omp_block_size - 1, j = (fast_sint_t)m + omp_block_start + 3; i >= j; i -= 4) + { + libsais_prefetch(&SA[i - prefetch_distance]); + + sa_sint_t s0 = SA[i - 0]; SA[l] = s0 & SAINT_MAX; l -= s0 < 0; + sa_sint_t s1 = SA[i - 1]; SA[l] = s1 & SAINT_MAX; l -= s1 < 0; + sa_sint_t s2 = SA[i - 2]; SA[l] = s2 & SAINT_MAX; l -= s2 < 0; + sa_sint_t s3 = SA[i - 3]; SA[l] = s3 & SAINT_MAX; l -= s3 < 0; + } + + for (j -= 3; i >= j; i -= 1) + { + sa_sint_t s = SA[i]; SA[l] = s & SAINT_MAX; l -= s < 0; + } + + l += 1; + + return l; +} + +static sa_sint_t libsais_renumber_lms_suffixes_8u_omp(sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + sa_sint_t name = 0; + +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && m >= 65536) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(thread_state); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (m / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : m - omp_block_start; + + if (omp_num_threads == 1) + { + name = libsais_renumber_lms_suffixes_8u(SA, m, 0, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + thread_state[omp_thread_num].state.count = libsais_count_negative_marked_suffixes(SA, omp_block_start, omp_block_size); + } + + #pragma omp barrier + + { + fast_sint_t t, count = 0; for (t = 0; t < omp_thread_num; ++t) { count += thread_state[t].state.count; } + + if (omp_thread_num == omp_num_threads - 1) + { + name = (sa_sint_t)(count + thread_state[omp_thread_num].state.count); + } + + libsais_renumber_lms_suffixes_8u(SA, m, (sa_sint_t)count, omp_block_start, omp_block_size); + } + } +#endif + } + + return name; +} + +static void libsais_gather_marked_lms_suffixes_8u_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 131072) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(thread_state); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (((fast_sint_t)n >> 1) / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : ((fast_sint_t)n >> 1) - omp_block_start; + + if (omp_num_threads == 1) + { + libsais_gather_marked_suffixes_8u(SA, m, (fast_sint_t)n + (fast_sint_t)fs, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + if (omp_thread_num < omp_num_threads - 1) + { + thread_state[omp_thread_num].state.position = libsais_gather_marked_suffixes_8u(SA, m, (fast_sint_t)m + omp_block_start + omp_block_size, omp_block_start, omp_block_size); + thread_state[omp_thread_num].state.count = (fast_sint_t)m + omp_block_start + omp_block_size - thread_state[omp_thread_num].state.position; + } + else + { + thread_state[omp_thread_num].state.position = libsais_gather_marked_suffixes_8u(SA, m, (fast_sint_t)n + (fast_sint_t)fs, omp_block_start, omp_block_size); + thread_state[omp_thread_num].state.count = (fast_sint_t)n + (fast_sint_t)fs - thread_state[omp_thread_num].state.position; + } + } + + #pragma omp barrier + + #pragma omp master + { + fast_sint_t t, position = (fast_sint_t)n + (fast_sint_t)fs; + + for (t = omp_num_threads - 1; t >= 0; --t) + { + position -= thread_state[t].state.count; + if (t != omp_num_threads - 1 && thread_state[t].state.count > 0) + { + memmove(&SA[position], &SA[thread_state[t].state.position], (size_t)thread_state[t].state.count * sizeof(sa_sint_t)); + } + } + } + } +#endif + } +} + +static sa_sint_t libsais_renumber_and_gather_lms_suffixes_8u_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + memset(&SA[m], 0, ((size_t)n >> 1) * sizeof(sa_sint_t)); + + sa_sint_t name = libsais_renumber_lms_suffixes_8u_omp(SA, m, threads, thread_state); + if (name < m) + { + libsais_gather_marked_lms_suffixes_8u_omp(SA, n, m, fs, threads, thread_state); + } + else + { + fast_sint_t i; for (i = 0; i < m; i += 1) { SA[i] &= SAINT_MAX; } + } + + return name; +} + +static sa_sint_t libsais_renumber_distinct_lms_suffixes_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t name, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + sa_sint_t * RESTRICT SAm = &SA[m]; + + fast_sint_t i, j; sa_sint_t p0, p1, p2, p3 = 0; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4) + { + libsais_prefetchw(&SA[i + 2 * prefetch_distance]); + + libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 0] & SAINT_MAX) >> 1]); + libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 1] & SAINT_MAX) >> 1]); + libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 2] & SAINT_MAX) >> 1]); + libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 3] & SAINT_MAX) >> 1]); + + p0 = SA[i + 0]; SAm[(SA[i + 0] = p0 & SAINT_MAX) >> 1] = name | (p0 & p3 & SAINT_MIN); name += p0 < 0; + p1 = SA[i + 1]; SAm[(SA[i + 1] = p1 & SAINT_MAX) >> 1] = name | (p1 & p0 & SAINT_MIN); name += p1 < 0; + p2 = SA[i + 2]; SAm[(SA[i + 2] = p2 & SAINT_MAX) >> 1] = name | (p2 & p1 & SAINT_MIN); name += p2 < 0; + p3 = SA[i + 3]; SAm[(SA[i + 3] = p3 & SAINT_MAX) >> 1] = name | (p3 & p2 & SAINT_MIN); name += p3 < 0; + } + + for (j += prefetch_distance + 3; i < j; i += 1) + { + p2 = p3; p3 = SA[i]; SAm[(SA[i] = p3 & SAINT_MAX) >> 1] = name | (p3 & p2 & SAINT_MIN); name += p3 < 0; + } + + return name; +} + +static void libsais_mark_distinct_lms_suffixes_32s(sa_sint_t * RESTRICT SA, sa_sint_t m, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; sa_sint_t p0, p1, p2, p3 = 0; + for 
(i = (fast_sint_t)m + omp_block_start, j = (fast_sint_t)m + omp_block_start + omp_block_size - 3; i < j; i += 4) + { + libsais_prefetchw(&SA[i + prefetch_distance]); + + p0 = SA[i + 0]; SA[i + 0] = p0 & (p3 | SAINT_MAX); p0 = (p0 == 0) ? p3 : p0; + p1 = SA[i + 1]; SA[i + 1] = p1 & (p0 | SAINT_MAX); p1 = (p1 == 0) ? p0 : p1; + p2 = SA[i + 2]; SA[i + 2] = p2 & (p1 | SAINT_MAX); p2 = (p2 == 0) ? p1 : p2; + p3 = SA[i + 3]; SA[i + 3] = p3 & (p2 | SAINT_MAX); p3 = (p3 == 0) ? p2 : p3; + } + + for (j += 3; i < j; i += 1) + { + p2 = p3; p3 = SA[i]; SA[i] = p3 & (p2 | SAINT_MAX); p3 = (p3 == 0) ? p2 : p3; + } +} + +static void libsais_clamp_lms_suffixes_length_32s(sa_sint_t * RESTRICT SA, sa_sint_t m, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + sa_sint_t * RESTRICT SAm = &SA[m]; + + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size - 3; i < j; i += 4) + { + libsais_prefetchw(&SAm[i + prefetch_distance]); + + SAm[i + 0] = (SAm[i + 0] < 0 ? SAm[i + 0] : 0) & SAINT_MAX; + SAm[i + 1] = (SAm[i + 1] < 0 ? SAm[i + 1] : 0) & SAINT_MAX; + SAm[i + 2] = (SAm[i + 2] < 0 ? SAm[i + 2] : 0) & SAINT_MAX; + SAm[i + 3] = (SAm[i + 3] < 0 ? SAm[i + 3] : 0) & SAINT_MAX; + } + + for (j += 3; i < j; i += 1) + { + SAm[i] = (SAm[i] < 0 ? SAm[i] : 0) & SAINT_MAX; + } +} + +static sa_sint_t libsais_renumber_distinct_lms_suffixes_32s_4k_omp(sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + sa_sint_t name = 0; + +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && m >= 65536) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(thread_state); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (m / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : m - omp_block_start; + + if (omp_num_threads == 1) + { + name = libsais_renumber_distinct_lms_suffixes_32s_4k(SA, m, 1, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + thread_state[omp_thread_num].state.count = libsais_count_negative_marked_suffixes(SA, omp_block_start, omp_block_size); + } + + #pragma omp barrier + + { + fast_sint_t t, count = 1; for (t = 0; t < omp_thread_num; ++t) { count += thread_state[t].state.count; } + + if (omp_thread_num == omp_num_threads - 1) + { + name = (sa_sint_t)(count + thread_state[omp_thread_num].state.count); + } + + libsais_renumber_distinct_lms_suffixes_32s_4k(SA, m, (sa_sint_t)count, omp_block_start, omp_block_size); + } + } +#endif + } + + return name - 1; +} + +static void libsais_mark_distinct_lms_suffixes_32s_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t threads) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 131072) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); + fast_sint_t omp_block_stride = (((fast_sint_t)n >> 1) / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : ((fast_sint_t)n >> 1) - omp_block_start; +#else + UNUSED(threads); + + fast_sint_t omp_block_start = 0; + fast_sint_t omp_block_size = (fast_sint_t)n >> 1; +#endif + libsais_mark_distinct_lms_suffixes_32s(SA, m, omp_block_start, omp_block_size); + } +} + +static void libsais_clamp_lms_suffixes_length_32s_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t threads) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 131072) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); + fast_sint_t omp_block_stride = (((fast_sint_t)n >> 1) / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : ((fast_sint_t)n >> 1) - omp_block_start; +#else + UNUSED(threads); + + fast_sint_t omp_block_start = 0; + fast_sint_t omp_block_size = (fast_sint_t)n >> 1; +#endif + libsais_clamp_lms_suffixes_length_32s(SA, m, omp_block_start, omp_block_size); + } +} + +static sa_sint_t libsais_renumber_and_mark_distinct_lms_suffixes_32s_4k_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + memset(&SA[m], 0, ((size_t)n >> 1) * sizeof(sa_sint_t)); + + sa_sint_t name = libsais_renumber_distinct_lms_suffixes_32s_4k_omp(SA, m, threads, thread_state); + if (name < m) + { + libsais_mark_distinct_lms_suffixes_32s_omp(SA, n, m, threads); + } + + return name; +} + +static sa_sint_t libsais_renumber_and_mark_distinct_lms_suffixes_32s_1k_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t threads) +{ + const fast_sint_t prefetch_distance = 32; + + sa_sint_t * RESTRICT SAm = &SA[m]; + + { + libsais_gather_lms_suffixes_32s(T, SA, n); + + memset(&SA[m], 0, ((size_t)n - (size_t)m - (size_t)m) * sizeof(sa_sint_t)); + + fast_sint_t i, j; + for (i = (fast_sint_t)n - (fast_sint_t)m, j = (fast_sint_t)n - 1 - prefetch_distance - 3; i < j; i += 4) + { + libsais_prefetch(&SA[i + 2 * prefetch_distance]); + + libsais_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 0]) >> 1]); + libsais_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 1]) >> 1]); + libsais_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 2]) >> 1]); + libsais_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 3]) >> 1]); + + SAm[((sa_uint_t)SA[i + 0]) >> 1] = SA[i + 1] - SA[i + 0] + 1 + SAINT_MIN; + SAm[((sa_uint_t)SA[i + 1]) >> 1] = SA[i + 2] - SA[i + 1] + 1 + SAINT_MIN; + SAm[((sa_uint_t)SA[i + 2]) >> 1] = SA[i + 3] - SA[i + 2] + 1 + SAINT_MIN; + SAm[((sa_uint_t)SA[i + 3]) >> 1] = SA[i + 4] - SA[i + 3] + 1 + SAINT_MIN; + } + + for (j += prefetch_distance + 3; i < j; i += 1) + { + SAm[((sa_uint_t)SA[i]) >> 1] = SA[i + 1] - SA[i] + 1 + SAINT_MIN; + } + + SAm[((sa_uint_t)SA[n - 1]) >> 1] = 1 + SAINT_MIN; + } + + { + libsais_clamp_lms_suffixes_length_32s_omp(SA, n, m, threads); + } + + sa_sint_t name = 1; + + { + fast_sint_t i, j, p = SA[0], plen = SAm[p >> 1]; sa_sint_t pdiff = SAINT_MIN; + for (i = 1, j = m - prefetch_distance - 1; i < j; i += 2) + { + libsais_prefetch(&SA[i + 2 * prefetch_distance]); + + libsais_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 0]) >> 1]); libsais_prefetch(&T[((sa_uint_t)SA[i + prefetch_distance + 0])]); + libsais_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 1]) >> 1]); libsais_prefetch(&T[((sa_uint_t)SA[i + prefetch_distance 
+ 1])]); + + fast_sint_t q = SA[i + 0], qlen = SAm[q >> 1]; sa_sint_t qdiff = SAINT_MIN; + if (plen == qlen) { fast_sint_t l = 0; do { if (T[p + l] != T[q + l]) { break; } } while (++l < qlen); qdiff = (sa_sint_t)(l - qlen) & SAINT_MIN; } + SAm[p >> 1] = name | (pdiff & qdiff); name += (qdiff < 0); + + p = SA[i + 1]; plen = SAm[p >> 1]; pdiff = SAINT_MIN; + if (qlen == plen) { fast_sint_t l = 0; do { if (T[q + l] != T[p + l]) { break; } } while (++l < plen); pdiff = (sa_sint_t)(l - plen) & SAINT_MIN; } + SAm[q >> 1] = name | (qdiff & pdiff); name += (pdiff < 0); + } + + for (j += prefetch_distance + 1; i < j; i += 1) + { + fast_sint_t q = SA[i], qlen = SAm[q >> 1]; sa_sint_t qdiff = SAINT_MIN; + if (plen == qlen) { fast_sint_t l = 0; do { if (T[p + l] != T[q + l]) { break; } } while (++l < plen); qdiff = (sa_sint_t)(l - plen) & SAINT_MIN; } + SAm[p >> 1] = name | (pdiff & qdiff); name += (qdiff < 0); + + p = q; plen = qlen; pdiff = qdiff; + } + + SAm[p >> 1] = name | pdiff; name++; + } + + if (name <= m) + { + libsais_mark_distinct_lms_suffixes_32s_omp(SA, n, m, threads); + } + + return name - 1; +} + +static void libsais_reconstruct_lms_suffixes(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + const sa_sint_t * RESTRICT SAnm = &SA[n - m]; + + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4) + { + libsais_prefetchw(&SA[i + 2 * prefetch_distance]); + + libsais_prefetch(&SAnm[SA[i + prefetch_distance + 0]]); + libsais_prefetch(&SAnm[SA[i + prefetch_distance + 1]]); + libsais_prefetch(&SAnm[SA[i + prefetch_distance + 2]]); + libsais_prefetch(&SAnm[SA[i + prefetch_distance + 3]]); + + SA[i + 0] = SAnm[SA[i + 0]]; + SA[i + 1] = SAnm[SA[i + 1]]; + SA[i + 2] = SAnm[SA[i + 2]]; + SA[i + 3] = SAnm[SA[i + 3]]; + } + + for (j += prefetch_distance + 3; i < j; i += 1) + { + SA[i] = SAnm[SA[i]]; + } +} + +static void libsais_reconstruct_lms_suffixes_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t threads) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && m >= 65536) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); + fast_sint_t omp_block_stride = (m / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : m - omp_block_start; +#else + UNUSED(threads); + + fast_sint_t omp_block_start = 0; + fast_sint_t omp_block_size = m; +#endif + + libsais_reconstruct_lms_suffixes(SA, n, m, omp_block_start, omp_block_size); + } +} + +static void libsais_place_lms_suffixes_interval_8u(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, const sa_sint_t * RESTRICT buckets) +{ + const sa_sint_t * RESTRICT bucket_end = &buckets[7 * ALPHABET_SIZE]; + + fast_sint_t c, j = n; + for (c = ALPHABET_SIZE - 2; c >= 0; --c) + { + fast_sint_t l = (fast_sint_t)buckets[BUCKETS_INDEX2(c, 1) + BUCKETS_INDEX2(1, 0)] - (fast_sint_t)buckets[BUCKETS_INDEX2(c, 1)]; + if (l > 0) + { + fast_sint_t i = bucket_end[c]; + if (j - i > 0) + { + memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t)); + } + + memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t)); + } + } + + memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t)); +} + +static void libsais_place_lms_suffixes_interval_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m, const sa_sint_t * RESTRICT buckets) +{ + const sa_sint_t * RESTRICT bucket_end = &buckets[3 * k]; + + fast_sint_t c, j = n; + for (c = (fast_sint_t)k - 2; c >= 0; --c) + { + fast_sint_t l = (fast_sint_t)buckets[BUCKETS_INDEX2(c, 1) + BUCKETS_INDEX2(1, 0)] - (fast_sint_t)buckets[BUCKETS_INDEX2(c, 1)]; + if (l > 0) + { + fast_sint_t i = bucket_end[c]; + if (j - i > 0) + { + memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t)); + } + + memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t)); + } + } + + memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t)); +} + +static void libsais_place_lms_suffixes_interval_32s_2k(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m, const sa_sint_t * RESTRICT buckets) +{ + fast_sint_t j = n; + + if (k > 1) + { + fast_sint_t c; + for (c = BUCKETS_INDEX2((fast_sint_t)k - 2, 0); c >= BUCKETS_INDEX2(0, 0); c -= BUCKETS_INDEX2(1, 0)) + { + fast_sint_t l = (fast_sint_t)buckets[c + BUCKETS_INDEX2(1, 1)] - (fast_sint_t)buckets[c + BUCKETS_INDEX2(0, 1)]; + if (l > 0) + { + fast_sint_t i = buckets[c]; + if (j - i > 0) + { + memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t)); + } + + memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t)); + } + } + } + + memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t)); +} + +static void libsais_place_lms_suffixes_interval_32s_1k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t m, sa_sint_t * RESTRICT buckets) +{ + const fast_sint_t prefetch_distance = 32; + + sa_sint_t c = k - 1; fast_sint_t i, l = buckets[c]; + for (i = (fast_sint_t)m - 1; i >= prefetch_distance + 3; i -= 4) + { + libsais_prefetch(&SA[i - 2 * prefetch_distance]); + + libsais_prefetch(&T[SA[i - prefetch_distance - 0]]); + libsais_prefetch(&T[SA[i - prefetch_distance - 1]]); + libsais_prefetch(&T[SA[i - prefetch_distance - 2]]); + libsais_prefetch(&T[SA[i - prefetch_distance - 3]]); + + sa_sint_t p0 = SA[i - 0]; if (T[p0] != c) { c = T[p0]; memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(sa_sint_t)); l = buckets[c]; } SA[--l] = p0; + sa_sint_t p1 = SA[i - 1]; if (T[p1] != c) { c = T[p1]; memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(sa_sint_t)); l = buckets[c]; } SA[--l] = p1; + sa_sint_t p2 = SA[i - 2]; if (T[p2] != c) { c = T[p2]; memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(sa_sint_t)); l = buckets[c]; } SA[--l] = p2; + sa_sint_t p3 = SA[i - 3]; if (T[p3] != c) { c = T[p3]; 
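+    /* (added comment, not in upstream libsais) T[p3] starts a different bucket:
+       zero the slots between the new bucket boundary buckets[c] and the previous
+       write cursor l, then restart the cursor at buckets[c] before placing the
+       suffix */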
memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(sa_sint_t)); l = buckets[c]; } SA[--l] = p3; + } + + for (; i >= 0; i -= 1) + { + sa_sint_t p = SA[i]; if (T[p] != c) { c = T[p]; memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(sa_sint_t)); l = buckets[c]; } SA[--l] = p; + } + + memset(&SA[0], 0, (size_t)l * sizeof(sa_sint_t)); +} + +static void libsais_place_lms_suffixes_histogram_32s_6k(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m, const sa_sint_t * RESTRICT buckets) +{ + const sa_sint_t * RESTRICT bucket_end = &buckets[5 * k]; + + fast_sint_t c, j = n; + for (c = (fast_sint_t)k - 2; c >= 0; --c) + { + fast_sint_t l = (fast_sint_t)buckets[BUCKETS_INDEX4(c, 1)]; + if (l > 0) + { + fast_sint_t i = bucket_end[c]; + if (j - i > 0) + { + memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t)); + } + + memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t)); + } + } + + memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t)); +} + +static void libsais_place_lms_suffixes_histogram_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m, const sa_sint_t * RESTRICT buckets) +{ + const sa_sint_t * RESTRICT bucket_end = &buckets[3 * k]; + + fast_sint_t c, j = n; + for (c = (fast_sint_t)k - 2; c >= 0; --c) + { + fast_sint_t l = (fast_sint_t)buckets[BUCKETS_INDEX2(c, 1)]; + if (l > 0) + { + fast_sint_t i = bucket_end[c]; + if (j - i > 0) + { + memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t)); + } + + memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t)); + } + } + + memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t)); +} + +static void libsais_place_lms_suffixes_histogram_32s_2k(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m, const sa_sint_t * RESTRICT buckets) +{ + fast_sint_t j = n; + + if (k > 1) + { + fast_sint_t c; + for (c = BUCKETS_INDEX2((fast_sint_t)k - 2, 0); c >= BUCKETS_INDEX2(0, 0); c -= BUCKETS_INDEX2(1, 0)) + { + fast_sint_t l = (fast_sint_t)buckets[c + BUCKETS_INDEX2(0, 1)]; + if (l > 0) + { + fast_sint_t i = buckets[c]; + if (j - i > 0) + { + memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t)); + } + + memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t)); + } + } + } + + memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t)); +} + +static void libsais_final_bwt_scan_left_to_right_8u(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) + { + libsais_prefetchw(&SA[i + 2 * prefetch_distance]); + + sa_sint_t s0 = SA[i + prefetch_distance + 0]; const uint8_t * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i + prefetch_distance + 1]; const uint8_t * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? 
Ts1 : NULL); + + sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; SA[i + 0] = T[p0] | SAINT_MIN; SA[induction_bucket[T[p0]]++] = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1)); } + sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; SA[i + 1] = T[p1] | SAINT_MIN; SA[induction_bucket[T[p1]]++] = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1)); } + } + + for (j += prefetch_distance + 1; i < j; i += 1) + { + sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; SA[i] = T[p] | SAINT_MIN; SA[induction_bucket[T[p]]++] = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); } + } +} + +static void libsais_final_bwt_aux_scan_left_to_right_8u(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) + { + libsais_prefetchw(&SA[i + 2 * prefetch_distance]); + + sa_sint_t s0 = SA[i + prefetch_distance + 0]; const uint8_t * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i + prefetch_distance + 1]; const uint8_t * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? Ts1 : NULL); + + sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; SA[i + 0] = T[p0] | SAINT_MIN; SA[induction_bucket[T[p0]]++] = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1)); if ((p0 & rm) == 0) { I[p0 / (rm + 1)] = induction_bucket[T[p0]]; }} + sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; SA[i + 1] = T[p1] | SAINT_MIN; SA[induction_bucket[T[p1]]++] = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1)); if ((p1 & rm) == 0) { I[p1 / (rm + 1)] = induction_bucket[T[p1]]; }} + } + + for (j += prefetch_distance + 1; i < j; i += 1) + { + sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; SA[i] = T[p] | SAINT_MIN; SA[induction_bucket[T[p]]++] = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); if ((p & rm) == 0) { I[p / (rm + 1)] = induction_bucket[T[p]]; } } + } +} + +static void libsais_final_sorting_scan_left_to_right_8u(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) + { + libsais_prefetchw(&SA[i + 2 * prefetch_distance]); + + sa_sint_t s0 = SA[i + prefetch_distance + 0]; const uint8_t * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i + prefetch_distance + 1]; const uint8_t * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? 
Ts1 : NULL); + + sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 ^ SAINT_MIN; if (p0 > 0) { p0--; SA[induction_bucket[T[p0]]++] = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1)); } + sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 ^ SAINT_MIN; if (p1 > 0) { p1--; SA[induction_bucket[T[p1]]++] = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1)); } + } + + for (j += prefetch_distance + 1; i < j; i += 1) + { + sa_sint_t p = SA[i]; SA[i] = p ^ SAINT_MIN; if (p > 0) { p--; SA[induction_bucket[T[p]]++] = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); } + } +} + +static void libsais_final_sorting_scan_left_to_right_32s(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1; i < j; i += 2) + { + libsais_prefetchw(&SA[i + 3 * prefetch_distance]); + + sa_sint_t s0 = SA[i + 2 * prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i + 2 * prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); + sa_sint_t s2 = SA[i + 1 * prefetch_distance + 0]; if (s2 > 0) { libsais_prefetchw(&induction_bucket[T[s2 - 1]]); libsais_prefetch(&T[s2] - 2); } + sa_sint_t s3 = SA[i + 1 * prefetch_distance + 1]; if (s3 > 0) { libsais_prefetchw(&induction_bucket[T[s3 - 1]]); libsais_prefetch(&T[s3] - 2); } + + sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 ^ SAINT_MIN; if (p0 > 0) { p0--; SA[induction_bucket[T[p0]]++] = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1)); } + sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 ^ SAINT_MIN; if (p1 > 0) { p1--; SA[induction_bucket[T[p1]]++] = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1)); } + } + + for (j += 2 * prefetch_distance + 1; i < j; i += 1) + { + sa_sint_t p = SA[i]; SA[i] = p ^ SAINT_MIN; if (p > 0) { p--; SA[induction_bucket[T[p]]++] = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); } + } +} + +#if defined(_OPENMP) + +static fast_sint_t libsais_final_bwt_scan_left_to_right_8u_block_prepare(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + memset(buckets, 0, ALPHABET_SIZE * sizeof(sa_sint_t)); + + fast_sint_t i, j, count = 0; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) + { + libsais_prefetchw(&SA[i + 2 * prefetch_distance]); + + sa_sint_t s0 = SA[i + prefetch_distance + 0]; const uint8_t * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i + prefetch_distance + 1]; const uint8_t * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? 
Ts1 : NULL); + + sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; SA[i + 0] = T[p0] | SAINT_MIN; buckets[cache[count].symbol = T[p0]]++; cache[count++].index = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1)); } + sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; SA[i + 1] = T[p1] | SAINT_MIN; buckets[cache[count].symbol = T[p1]]++; cache[count++].index = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1)); } + } + + for (j += prefetch_distance + 1; i < j; i += 1) + { + sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; SA[i] = T[p] | SAINT_MIN; buckets[cache[count].symbol = T[p]]++; cache[count++].index = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); } + } + + return count; +} + +static fast_sint_t libsais_final_sorting_scan_left_to_right_8u_block_prepare(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + memset(buckets, 0, ALPHABET_SIZE * sizeof(sa_sint_t)); + + fast_sint_t i, j, count = 0; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) + { + libsais_prefetchw(&SA[i + 2 * prefetch_distance]); + + sa_sint_t s0 = SA[i + prefetch_distance + 0]; const uint8_t * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i + prefetch_distance + 1]; const uint8_t * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? Ts1 : NULL); + + sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 ^ SAINT_MIN; if (p0 > 0) { p0--; buckets[cache[count].symbol = T[p0]]++; cache[count++].index = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1)); } + sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 ^ SAINT_MIN; if (p1 > 0) { p1--; buckets[cache[count].symbol = T[p1]]++; cache[count++].index = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1)); } + } + + for (j += prefetch_distance + 1; i < j; i += 1) + { + sa_sint_t p = SA[i]; SA[i] = p ^ SAINT_MIN; if (p > 0) { p--; buckets[cache[count].symbol = T[p]]++; cache[count++].index = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); } + } + + return count; +} + +static void libsais_final_order_scan_left_to_right_8u_block_place(sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = 0, j = count - 3; i < j; i += 4) + { + libsais_prefetch(&cache[i + prefetch_distance]); + + SA[buckets[cache[i + 0].symbol]++] = cache[i + 0].index; + SA[buckets[cache[i + 1].symbol]++] = cache[i + 1].index; + SA[buckets[cache[i + 2].symbol]++] = cache[i + 2].index; + SA[buckets[cache[i + 3].symbol]++] = cache[i + 3].index; + } + + for (j += 3; i < j; i += 1) + { + SA[buckets[cache[i].symbol]++] = cache[i].index; + } +} + +static void libsais_final_bwt_aux_scan_left_to_right_8u_block_place(sa_sint_t * RESTRICT SA, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = 0, j = count - 3; i < j; i += 4) + { + libsais_prefetch(&cache[i + prefetch_distance]); + + SA[buckets[cache[i + 0].symbol]++] = cache[i + 0].index; if ((cache[i + 0].index & rm) == 0) { 
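+            /* (added comment, not in upstream libsais) this suffix position is a
+               multiple of the auxiliary-index sampling rate (rm is assumed to be
+               rate - 1, a power-of-two mask), so the current bucket offset is
+               recorded into I below */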
I[(cache[i + 0].index & SAINT_MAX) / (rm + 1)] = buckets[cache[i + 0].symbol]; } + SA[buckets[cache[i + 1].symbol]++] = cache[i + 1].index; if ((cache[i + 1].index & rm) == 0) { I[(cache[i + 1].index & SAINT_MAX) / (rm + 1)] = buckets[cache[i + 1].symbol]; } + SA[buckets[cache[i + 2].symbol]++] = cache[i + 2].index; if ((cache[i + 2].index & rm) == 0) { I[(cache[i + 2].index & SAINT_MAX) / (rm + 1)] = buckets[cache[i + 2].symbol]; } + SA[buckets[cache[i + 3].symbol]++] = cache[i + 3].index; if ((cache[i + 3].index & rm) == 0) { I[(cache[i + 3].index & SAINT_MAX) / (rm + 1)] = buckets[cache[i + 3].symbol]; } + } + + for (j += 3; i < j; i += 1) + { + SA[buckets[cache[i].symbol]++] = cache[i].index; if ((cache[i].index & rm) == 0) { I[(cache[i].index & SAINT_MAX) / (rm + 1)] = buckets[cache[i].symbol]; } + } +} + +static void libsais_final_sorting_scan_left_to_right_32s_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) + { + libsais_prefetchw(&SA[i + 2 * prefetch_distance]); + + sa_sint_t s0 = SA[i + prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i + prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? Ts1 : NULL); + + libsais_prefetchw(&cache[i + prefetch_distance]); + + sa_sint_t symbol0 = SAINT_MIN, p0 = SA[i + 0]; SA[i + 0] = p0 ^ SAINT_MIN; if (p0 > 0) { p0--; cache[i + 0].index = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1)); symbol0 = T[p0]; } cache[i + 0].symbol = symbol0; + sa_sint_t symbol1 = SAINT_MIN, p1 = SA[i + 1]; SA[i + 1] = p1 ^ SAINT_MIN; if (p1 > 0) { p1--; cache[i + 1].index = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1)); symbol1 = T[p1]; } cache[i + 1].symbol = symbol1; + } + + for (j += prefetch_distance + 1; i < j; i += 1) + { + sa_sint_t symbol = SAINT_MIN, p = SA[i]; SA[i] = p ^ SAINT_MIN; if (p > 0) { p--; cache[i].index = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); symbol = T[p]; } cache[i].symbol = symbol; + } +} + +static void libsais_final_sorting_scan_left_to_right_32s_block_sort(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j, omp_block_end = omp_block_start + omp_block_size; + for (i = omp_block_start, j = omp_block_end - prefetch_distance - 1; i < j; i += 2) + { + libsais_prefetchw(&cache[i + 2 * prefetch_distance]); + + sa_sint_t s0 = cache[i + prefetch_distance + 0].symbol; const sa_sint_t * Is0 = &induction_bucket[s0]; libsais_prefetchw(s0 >= 0 ? Is0 : NULL); + sa_sint_t s1 = cache[i + prefetch_distance + 1].symbol; const sa_sint_t * Is1 = &induction_bucket[s1]; libsais_prefetchw(s1 >= 0 ? 
Is1 : NULL); + + sa_sint_t v0 = cache[i + 0].symbol; + if (v0 >= 0) + { + cache[i + 0].symbol = induction_bucket[v0]++; + if (cache[i + 0].symbol < omp_block_end) { sa_sint_t ni = cache[i + 0].symbol, np = cache[i + 0].index; cache[i + 0].index = np ^ SAINT_MIN; if (np > 0) { np--; cache[ni].index = np | ((sa_sint_t)(T[np - (np > 0)] < T[np]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np]; } } + } + + sa_sint_t v1 = cache[i + 1].symbol; + if (v1 >= 0) + { + cache[i + 1].symbol = induction_bucket[v1]++; + if (cache[i + 1].symbol < omp_block_end) { sa_sint_t ni = cache[i + 1].symbol, np = cache[i + 1].index; cache[i + 1].index = np ^ SAINT_MIN; if (np > 0) { np--; cache[ni].index = np | ((sa_sint_t)(T[np - (np > 0)] < T[np]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np]; } } + } + } + + for (j += prefetch_distance + 1; i < j; i += 1) + { + sa_sint_t v = cache[i].symbol; + if (v >= 0) + { + cache[i].symbol = induction_bucket[v]++; + if (cache[i].symbol < omp_block_end) { sa_sint_t ni = cache[i].symbol, np = cache[i].index; cache[i].index = np ^ SAINT_MIN; if (np > 0) { np--; cache[ni].index = np | ((sa_sint_t)(T[np - (np > 0)] < T[np]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np]; } } + } + } +} + +static void libsais_final_bwt_scan_left_to_right_8u_block_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * ALPHABET_SIZE && omp_get_dynamic() == 0) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(thread_state); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : block_size - omp_block_start; + + omp_block_start += block_start; + + if (omp_num_threads == 1) + { + libsais_final_bwt_scan_left_to_right_8u(T, SA, induction_bucket, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + thread_state[omp_thread_num].state.count = libsais_final_bwt_scan_left_to_right_8u_block_prepare(T, SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size); + } + + #pragma omp barrier + + #pragma omp master + { + fast_sint_t t; + for (t = 0; t < omp_num_threads; ++t) + { + sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets; + fast_sint_t c; for (c = 0; c < ALPHABET_SIZE; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_bucket[c]; induction_bucket[c] = A + B; temp_bucket[c] = A; } + } + } + + #pragma omp barrier + + { + libsais_final_order_scan_left_to_right_8u_block_place(SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count); + } + } +#endif + } +} + +static void libsais_final_bwt_aux_scan_left_to_right_8u_block_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * ALPHABET_SIZE && omp_get_dynamic() == 0) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(thread_state); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : block_size - omp_block_start; + + omp_block_start += block_start; + + if (omp_num_threads == 1) + { + libsais_final_bwt_aux_scan_left_to_right_8u(T, SA, rm, I, induction_bucket, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + thread_state[omp_thread_num].state.count = libsais_final_bwt_scan_left_to_right_8u_block_prepare(T, SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size); + } + + #pragma omp barrier + + #pragma omp master + { + fast_sint_t t; + for (t = 0; t < omp_num_threads; ++t) + { + sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets; + fast_sint_t c; for (c = 0; c < ALPHABET_SIZE; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_bucket[c]; induction_bucket[c] = A + B; temp_bucket[c] = A; } + } + } + + #pragma omp barrier + + { + libsais_final_bwt_aux_scan_left_to_right_8u_block_place(SA, rm, I, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count); + } + } +#endif + } +} + +static void libsais_final_sorting_scan_left_to_right_8u_block_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * ALPHABET_SIZE && omp_get_dynamic() == 0) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(thread_state); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : block_size - omp_block_start; + + omp_block_start += block_start; + + if (omp_num_threads == 1) + { + libsais_final_sorting_scan_left_to_right_8u(T, SA, induction_bucket, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + thread_state[omp_thread_num].state.count = libsais_final_sorting_scan_left_to_right_8u_block_prepare(T, SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size); + } + + #pragma omp barrier + + #pragma omp master + { + fast_sint_t t; + for (t = 0; t < omp_num_threads; ++t) + { + sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets; + fast_sint_t c; for (c = 0; c < ALPHABET_SIZE; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_bucket[c]; induction_bucket[c] = A + B; temp_bucket[c] = A; } + } + } + + #pragma omp barrier + + { + libsais_final_order_scan_left_to_right_8u_block_place(SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count); + } + } +#endif + } +} + +static void libsais_final_sorting_scan_left_to_right_32s_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(cache); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : block_size - omp_block_start; + + omp_block_start += block_start; + + if (omp_num_threads == 1) + { + libsais_final_sorting_scan_left_to_right_32s(T, SA, buckets, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + libsais_final_sorting_scan_left_to_right_32s_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size); + } + + #pragma omp barrier + + #pragma omp master + { + libsais_final_sorting_scan_left_to_right_32s_block_sort(T, buckets, cache - block_start, block_start, block_size); + } + + #pragma omp barrier + + { + libsais_compact_and_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size); + } + } +#endif + } +} + +#endif + +static void libsais_final_bwt_scan_left_to_right_8u_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, fast_sint_t n, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + SA[induction_bucket[T[(sa_sint_t)n - 1]]++] = ((sa_sint_t)n - 1) | ((sa_sint_t)(T[(sa_sint_t)n - 2] < T[(sa_sint_t)n - 1]) << (SAINT_BIT - 1)); + + if (threads == 1 || n < 65536) + { + libsais_final_bwt_scan_left_to_right_8u(T, SA, induction_bucket, 0, n); + } +#if defined(_OPENMP) + else + { + fast_sint_t block_start; + for (block_start = 0; block_start < n; ) + { + if (SA[block_start] == 0) + { + block_start++; + } + else + { + fast_sint_t block_max_end = block_start + ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads); if (block_max_end > n) { block_max_end = n;} + fast_sint_t block_end = block_start + 1; while (block_end < block_max_end && SA[block_end] != 0) { block_end++; } + fast_sint_t block_size = block_end - block_start; + + if (block_size < 32) + { + for (; block_start < block_end; block_start += 1) + { + sa_sint_t p = SA[block_start]; SA[block_start] = p & SAINT_MAX; if (p > 0) { p--; SA[block_start] = T[p] | SAINT_MIN; SA[induction_bucket[T[p]]++] = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); } + } + } + else + { + libsais_final_bwt_scan_left_to_right_8u_block_omp(T, SA, induction_bucket, block_start, block_size, threads, thread_state); + block_start = block_end; + } + } + } + } +#else + UNUSED(thread_state); +#endif +} + +static void libsais_final_bwt_aux_scan_left_to_right_8u_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, fast_sint_t n, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + SA[induction_bucket[T[(sa_sint_t)n - 1]]++] = ((sa_sint_t)n - 1) | ((sa_sint_t)(T[(sa_sint_t)n - 2] < T[(sa_sint_t)n - 1]) << (SAINT_BIT - 1)); + + if ((((sa_sint_t)n - 1) & rm) == 0) { I[((sa_sint_t)n - 1) / (rm + 1)] = induction_bucket[T[(sa_sint_t)n - 1]]; } + + if (threads == 1 || n < 65536) + { + libsais_final_bwt_aux_scan_left_to_right_8u(T, SA, rm, I, induction_bucket, 0, n); + } +#if defined(_OPENMP) + else + { + fast_sint_t block_start; + for (block_start = 0; block_start < n; ) + { + if (SA[block_start] == 0) + { + block_start++; + } + else + { + fast_sint_t block_max_end = block_start + ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads); if (block_max_end > n) { block_max_end = n;} + fast_sint_t block_end = block_start + 1; while (block_end < block_max_end && SA[block_end] != 0) { block_end++; } + fast_sint_t block_size = block_end - block_start; + + if (block_size < 32) + { + for (; block_start < block_end; block_start += 1) + { + sa_sint_t p = 
SA[block_start]; SA[block_start] = p & SAINT_MAX; if (p > 0) { p--; SA[block_start] = T[p] | SAINT_MIN; SA[induction_bucket[T[p]]++] = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); if ((p & rm) == 0) { I[p / (rm + 1)] = induction_bucket[T[p]]; } } + } + } + else + { + libsais_final_bwt_aux_scan_left_to_right_8u_block_omp(T, SA, rm, I, induction_bucket, block_start, block_size, threads, thread_state); + block_start = block_end; + } + } + } + } +#else + UNUSED(thread_state); +#endif +} + +static void libsais_final_sorting_scan_left_to_right_8u_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, fast_sint_t n, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + SA[induction_bucket[T[(sa_sint_t)n - 1]]++] = ((sa_sint_t)n - 1) | ((sa_sint_t)(T[(sa_sint_t)n - 2] < T[(sa_sint_t)n - 1]) << (SAINT_BIT - 1)); + + if (threads == 1 || n < 65536) + { + libsais_final_sorting_scan_left_to_right_8u(T, SA, induction_bucket, 0, n); + } +#if defined(_OPENMP) + else + { + fast_sint_t block_start; + for (block_start = 0; block_start < n; ) + { + if (SA[block_start] == 0) + { + block_start++; + } + else + { + fast_sint_t block_max_end = block_start + ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads); if (block_max_end > n) { block_max_end = n;} + fast_sint_t block_end = block_start + 1; while (block_end < block_max_end && SA[block_end] != 0) { block_end++; } + fast_sint_t block_size = block_end - block_start; + + if (block_size < 32) + { + for (; block_start < block_end; block_start += 1) + { + sa_sint_t p = SA[block_start]; SA[block_start] = p ^ SAINT_MIN; if (p > 0) { p--; SA[induction_bucket[T[p]]++] = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); } + } + } + else + { + libsais_final_sorting_scan_left_to_right_8u_block_omp(T, SA, induction_bucket, block_start, block_size, threads, thread_state); + block_start = block_end; + } + } + } + } +#else + UNUSED(thread_state); +#endif +} + +static void libsais_final_sorting_scan_left_to_right_32s_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + SA[induction_bucket[T[n - 1]]++] = (n - 1) | ((sa_sint_t)(T[n - 2] < T[n - 1]) << (SAINT_BIT - 1)); + + if (threads == 1 || n < 65536) + { + libsais_final_sorting_scan_left_to_right_32s(T, SA, induction_bucket, 0, n); + } +#if defined(_OPENMP) + else + { + fast_sint_t block_start, block_end; + for (block_start = 0; block_start < n; block_start = block_end) + { + block_end = block_start + (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end > n) { block_end = n; } + + libsais_final_sorting_scan_left_to_right_32s_block_omp(T, SA, induction_bucket, thread_state[0].state.cache, block_start, block_end - block_start, threads); + } + } +#else + UNUSED(thread_state); +#endif +} + +static sa_sint_t libsais_final_bwt_scan_right_to_left_8u(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; sa_sint_t index = -1; + for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2) + { + libsais_prefetchw(&SA[i - 2 * prefetch_distance]); + + sa_sint_t s0 = SA[i - prefetch_distance - 0]; const uint8_t * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? 
Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i - prefetch_distance - 1]; const uint8_t * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? Ts1 : NULL); + + sa_sint_t p0 = SA[i - 0]; index = (p0 == 0) ? (sa_sint_t)(i - 0) : index; + SA[i - 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; uint8_t c0 = T[p0 - (p0 > 0)], c1 = T[p0]; SA[i - 0] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? p0 : t; } + + sa_sint_t p1 = SA[i - 1]; index = (p1 == 0) ? (sa_sint_t)(i - 1) : index; + SA[i - 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; uint8_t c0 = T[p1 - (p1 > 0)], c1 = T[p1]; SA[i - 1] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? p1 : t; } + } + + for (j -= prefetch_distance + 1; i >= j; i -= 1) + { + sa_sint_t p = SA[i]; index = (p == 0) ? (sa_sint_t)i : index; + SA[i] = p & SAINT_MAX; if (p > 0) { p--; uint8_t c0 = T[p - (p > 0)], c1 = T[p]; SA[i] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? p : t; } + } + + return index; +} + +static void libsais_final_bwt_aux_scan_right_to_left_8u(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2) + { + libsais_prefetchw(&SA[i - 2 * prefetch_distance]); + + sa_sint_t s0 = SA[i - prefetch_distance - 0]; const uint8_t * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i - prefetch_distance - 1]; const uint8_t * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? Ts1 : NULL); + + sa_sint_t p0 = SA[i - 0]; + SA[i - 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; uint8_t c0 = T[p0 - (p0 > 0)], c1 = T[p0]; SA[i - 0] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? p0 : t; if ((p0 & rm) == 0) { I[p0 / (rm + 1)] = induction_bucket[T[p0]] + 1; } } + + sa_sint_t p1 = SA[i - 1]; + SA[i - 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; uint8_t c0 = T[p1 - (p1 > 0)], c1 = T[p1]; SA[i - 1] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? p1 : t; if ((p1 & rm) == 0) { I[p1 / (rm + 1)] = induction_bucket[T[p1]] + 1; } } + } + + for (j -= prefetch_distance + 1; i >= j; i -= 1) + { + sa_sint_t p = SA[i]; + SA[i] = p & SAINT_MAX; if (p > 0) { p--; uint8_t c0 = T[p - (p > 0)], c1 = T[p]; SA[i] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? p : t; if ((p & rm) == 0) { I[p / (rm + 1)] = induction_bucket[T[p]] + 1; } } + } +} + +static void libsais_final_sorting_scan_right_to_left_8u(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2) + { + libsais_prefetchw(&SA[i - 2 * prefetch_distance]); + + sa_sint_t s0 = SA[i - prefetch_distance - 0]; const uint8_t * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i - prefetch_distance - 1]; const uint8_t * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? 
Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? Ts1 : NULL); + + sa_sint_t p0 = SA[i - 0]; SA[i - 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; SA[--induction_bucket[T[p0]]] = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] > T[p0]) << (SAINT_BIT - 1)); } + sa_sint_t p1 = SA[i - 1]; SA[i - 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; SA[--induction_bucket[T[p1]]] = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] > T[p1]) << (SAINT_BIT - 1)); } + } + + for (j -= prefetch_distance + 1; i >= j; i -= 1) + { + sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; SA[--induction_bucket[T[p]]] = p | ((sa_sint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1)); } + } +} + +static void libsais_final_sorting_scan_right_to_left_32s(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 1; i >= j; i -= 2) + { + libsais_prefetchw(&SA[i - 3 * prefetch_distance]); + + sa_sint_t s0 = SA[i - 2 * prefetch_distance - 0]; const sa_sint_t * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i - 2 * prefetch_distance - 1]; const sa_sint_t * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); + sa_sint_t s2 = SA[i - 1 * prefetch_distance - 0]; if (s2 > 0) { libsais_prefetchw(&induction_bucket[T[s2 - 1]]); libsais_prefetch(&T[s2] - 2); } + sa_sint_t s3 = SA[i - 1 * prefetch_distance - 1]; if (s3 > 0) { libsais_prefetchw(&induction_bucket[T[s3 - 1]]); libsais_prefetch(&T[s3] - 2); } + + sa_sint_t p0 = SA[i - 0]; SA[i - 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; SA[--induction_bucket[T[p0]]] = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] > T[p0]) << (SAINT_BIT - 1)); } + sa_sint_t p1 = SA[i - 1]; SA[i - 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; SA[--induction_bucket[T[p1]]] = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] > T[p1]) << (SAINT_BIT - 1)); } + } + + for (j -= 2 * prefetch_distance + 1; i >= j; i -= 1) + { + sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; SA[--induction_bucket[T[p]]] = p | ((sa_sint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1)); } + } +} + +#if defined(_OPENMP) + +static fast_sint_t libsais_final_bwt_scan_right_to_left_8u_block_prepare(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + memset(buckets, 0, ALPHABET_SIZE * sizeof(sa_sint_t)); + + fast_sint_t i, j, count = 0; + for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2) + { + libsais_prefetchw(&SA[i - 2 * prefetch_distance]); + + sa_sint_t s0 = SA[i - prefetch_distance - 0]; const uint8_t * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i - prefetch_distance - 1]; const uint8_t * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? Ts1 : NULL); + + sa_sint_t p0 = SA[i - 0]; SA[i - 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; uint8_t c0 = T[p0 - (p0 > 0)], c1 = T[p0]; SA[i - 0] = c1; sa_sint_t t = c0 | SAINT_MIN; buckets[cache[count].symbol = c1]++; cache[count++].index = (c0 <= c1) ? 
p0 : t; } + sa_sint_t p1 = SA[i - 1]; SA[i - 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; uint8_t c0 = T[p1 - (p1 > 0)], c1 = T[p1]; SA[i - 1] = c1; sa_sint_t t = c0 | SAINT_MIN; buckets[cache[count].symbol = c1]++; cache[count++].index = (c0 <= c1) ? p1 : t; } + } + + for (j -= prefetch_distance + 1; i >= j; i -= 1) + { + sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; uint8_t c0 = T[p - (p > 0)], c1 = T[p]; SA[i] = c1; sa_sint_t t = c0 | SAINT_MIN; buckets[cache[count].symbol = c1]++; cache[count++].index = (c0 <= c1) ? p : t; } + } + + return count; +} + +static fast_sint_t libsais_final_bwt_aux_scan_right_to_left_8u_block_prepare(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + memset(buckets, 0, ALPHABET_SIZE * sizeof(sa_sint_t)); + + fast_sint_t i, j, count = 0; + for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2) + { + libsais_prefetchw(&SA[i - 2 * prefetch_distance]); + + sa_sint_t s0 = SA[i - prefetch_distance - 0]; const uint8_t * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i - prefetch_distance - 1]; const uint8_t * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? Ts1 : NULL); + + sa_sint_t p0 = SA[i - 0]; SA[i - 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; uint8_t c0 = T[p0 - (p0 > 0)], c1 = T[p0]; SA[i - 0] = c1; sa_sint_t t = c0 | SAINT_MIN; buckets[cache[count].symbol = c1]++; cache[count].index = (c0 <= c1) ? p0 : t; cache[count + 1].index = p0; count += 2; } + sa_sint_t p1 = SA[i - 1]; SA[i - 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; uint8_t c0 = T[p1 - (p1 > 0)], c1 = T[p1]; SA[i - 1] = c1; sa_sint_t t = c0 | SAINT_MIN; buckets[cache[count].symbol = c1]++; cache[count].index = (c0 <= c1) ? p1 : t; cache[count + 1].index = p1; count += 2; } + } + + for (j -= prefetch_distance + 1; i >= j; i -= 1) + { + sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; uint8_t c0 = T[p - (p > 0)], c1 = T[p]; SA[i] = c1; sa_sint_t t = c0 | SAINT_MIN; buckets[cache[count].symbol = c1]++; cache[count].index = (c0 <= c1) ? p : t; cache[count + 1].index = p; count += 2; } + } + + return count; +} + +static fast_sint_t libsais_final_sorting_scan_right_to_left_8u_block_prepare(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + memset(buckets, 0, ALPHABET_SIZE * sizeof(sa_sint_t)); + + fast_sint_t i, j, count = 0; + for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2) + { + libsais_prefetchw(&SA[i - 2 * prefetch_distance]); + + sa_sint_t s0 = SA[i - prefetch_distance - 0]; const uint8_t * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i - prefetch_distance - 1]; const uint8_t * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? 
Ts1 : NULL); + + sa_sint_t p0 = SA[i - 0]; SA[i - 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; buckets[cache[count].symbol = T[p0]]++; cache[count++].index = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] > T[p0]) << (SAINT_BIT - 1)); } + sa_sint_t p1 = SA[i - 1]; SA[i - 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; buckets[cache[count].symbol = T[p1]]++; cache[count++].index = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] > T[p1]) << (SAINT_BIT - 1)); } + } + + for (j -= prefetch_distance + 1; i >= j; i -= 1) + { + sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; buckets[cache[count].symbol = T[p]]++; cache[count++].index = p | ((sa_sint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1)); } + } + + return count; +} + +static void libsais_final_order_scan_right_to_left_8u_block_place(sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = 0, j = count - 3; i < j; i += 4) + { + libsais_prefetch(&cache[i + prefetch_distance]); + + SA[--buckets[cache[i + 0].symbol]] = cache[i + 0].index; + SA[--buckets[cache[i + 1].symbol]] = cache[i + 1].index; + SA[--buckets[cache[i + 2].symbol]] = cache[i + 2].index; + SA[--buckets[cache[i + 3].symbol]] = cache[i + 3].index; + } + + for (j += 3; i < j; i += 1) + { + SA[--buckets[cache[i].symbol]] = cache[i].index; + } +} + +static void libsais_final_bwt_aux_scan_right_to_left_8u_block_place(sa_sint_t * RESTRICT SA, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = 0, j = count - 6; i < j; i += 8) + { + libsais_prefetch(&cache[i + prefetch_distance]); + + SA[--buckets[cache[i + 0].symbol]] = cache[i + 0].index; if ((cache[i + 1].index & rm) == 0) { I[cache[i + 1].index / (rm + 1)] = buckets[cache[i + 0].symbol] + 1; } + SA[--buckets[cache[i + 2].symbol]] = cache[i + 2].index; if ((cache[i + 3].index & rm) == 0) { I[cache[i + 3].index / (rm + 1)] = buckets[cache[i + 2].symbol] + 1; } + SA[--buckets[cache[i + 4].symbol]] = cache[i + 4].index; if ((cache[i + 5].index & rm) == 0) { I[cache[i + 5].index / (rm + 1)] = buckets[cache[i + 4].symbol] + 1; } + SA[--buckets[cache[i + 6].symbol]] = cache[i + 6].index; if ((cache[i + 7].index & rm) == 0) { I[cache[i + 7].index / (rm + 1)] = buckets[cache[i + 6].symbol] + 1; } + } + + for (j += 6; i < j; i += 2) + { + SA[--buckets[cache[i].symbol]] = cache[i].index; if ((cache[i + 1].index & rm) == 0) { I[(cache[i + 1].index & SAINT_MAX) / (rm + 1)] = buckets[cache[i].symbol] + 1; } + } +} + +static void libsais_final_sorting_scan_right_to_left_32s_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) + { + libsais_prefetchw(&SA[i + 2 * prefetch_distance]); + + sa_sint_t s0 = SA[i + prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i + prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? 
Ts1 : NULL); + + libsais_prefetchw(&cache[i + prefetch_distance]); + + sa_sint_t symbol0 = SAINT_MIN, p0 = SA[i + 0]; SA[i + 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; cache[i + 0].index = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] > T[p0]) << (SAINT_BIT - 1)); symbol0 = T[p0]; } cache[i + 0].symbol = symbol0; + sa_sint_t symbol1 = SAINT_MIN, p1 = SA[i + 1]; SA[i + 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; cache[i + 1].index = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] > T[p1]) << (SAINT_BIT - 1)); symbol1 = T[p1]; } cache[i + 1].symbol = symbol1; + } + + for (j += prefetch_distance + 1; i < j; i += 1) + { + sa_sint_t symbol = SAINT_MIN, p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; cache[i].index = p | ((sa_sint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1)); symbol = T[p]; } cache[i].symbol = symbol; + } +} + +static void libsais_final_sorting_scan_right_to_left_32s_block_sort(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2) + { + libsais_prefetchw(&cache[i - 2 * prefetch_distance]); + + sa_sint_t s0 = cache[i - prefetch_distance - 0].symbol; const sa_sint_t * Is0 = &induction_bucket[s0]; libsais_prefetchw(s0 >= 0 ? Is0 : NULL); + sa_sint_t s1 = cache[i - prefetch_distance - 1].symbol; const sa_sint_t * Is1 = &induction_bucket[s1]; libsais_prefetchw(s1 >= 0 ? Is1 : NULL); + + sa_sint_t v0 = cache[i - 0].symbol; + if (v0 >= 0) + { + cache[i - 0].symbol = --induction_bucket[v0]; + if (cache[i - 0].symbol >= omp_block_start) { sa_sint_t ni = cache[i - 0].symbol, np = cache[i - 0].index; cache[i - 0].index = np & SAINT_MAX; if (np > 0) { np--; cache[ni].index = np | ((sa_sint_t)(T[np - (np > 0)] > T[np]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np]; } } + } + + sa_sint_t v1 = cache[i - 1].symbol; + if (v1 >= 0) + { + cache[i - 1].symbol = --induction_bucket[v1]; + if (cache[i - 1].symbol >= omp_block_start) { sa_sint_t ni = cache[i - 1].symbol, np = cache[i - 1].index; cache[i - 1].index = np & SAINT_MAX; if (np > 0) { np--; cache[ni].index = np | ((sa_sint_t)(T[np - (np > 0)] > T[np]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np]; } } + } + } + + for (j -= prefetch_distance + 1; i >= j; i -= 1) + { + sa_sint_t v = cache[i].symbol; + if (v >= 0) + { + cache[i].symbol = --induction_bucket[v]; + if (cache[i].symbol >= omp_block_start) { sa_sint_t ni = cache[i].symbol, np = cache[i].index; cache[i].index = np & SAINT_MAX; if (np > 0) { np--; cache[ni].index = np | ((sa_sint_t)(T[np - (np > 0)] > T[np]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np]; } } + } + } +} + +static void libsais_final_bwt_scan_right_to_left_8u_block_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * ALPHABET_SIZE && omp_get_dynamic() == 0) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(thread_state); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); + 
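/* Split the block into 16-aligned per-thread slices; the last thread picks up the remainder. */ +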
fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start; + + omp_block_start += block_start; + + if (omp_num_threads == 1) + { + libsais_final_bwt_scan_right_to_left_8u(T, SA, induction_bucket, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + thread_state[omp_thread_num].state.count = libsais_final_bwt_scan_right_to_left_8u_block_prepare(T, SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size); + } + + #pragma omp barrier + + #pragma omp master + { + fast_sint_t t; + for (t = omp_num_threads - 1; t >= 0; --t) + { + sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets; + fast_sint_t c; for (c = 0; c < ALPHABET_SIZE; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_bucket[c]; induction_bucket[c] = A - B; temp_bucket[c] = A; } + } + } + + #pragma omp barrier + + { + libsais_final_order_scan_right_to_left_8u_block_place(SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count); + } + } +#endif + } +} + +static void libsais_final_bwt_aux_scan_right_to_left_8u_block_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * ALPHABET_SIZE && omp_get_dynamic() == 0) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(thread_state); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : block_size - omp_block_start; + + omp_block_start += block_start; + + if (omp_num_threads == 1) + { + libsais_final_bwt_aux_scan_right_to_left_8u(T, SA, rm, I, induction_bucket, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + thread_state[omp_thread_num].state.count = libsais_final_bwt_aux_scan_right_to_left_8u_block_prepare(T, SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size); + } + + #pragma omp barrier + + #pragma omp master + { + fast_sint_t t; + for (t = omp_num_threads - 1; t >= 0; --t) + { + sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets; + fast_sint_t c; for (c = 0; c < ALPHABET_SIZE; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_bucket[c]; induction_bucket[c] = A - B; temp_bucket[c] = A; } + } + } + + #pragma omp barrier + + { + libsais_final_bwt_aux_scan_right_to_left_8u_block_place(SA, rm, I, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count); + } + } +#endif + } +} + +static void libsais_final_sorting_scan_right_to_left_8u_block_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * ALPHABET_SIZE && omp_get_dynamic() == 0) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(thread_state); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : block_size - omp_block_start; + + omp_block_start += block_start; + + if (omp_num_threads == 1) + { + libsais_final_sorting_scan_right_to_left_8u(T, SA, induction_bucket, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + thread_state[omp_thread_num].state.count = libsais_final_sorting_scan_right_to_left_8u_block_prepare(T, SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size); + } + + #pragma omp barrier + + #pragma omp master + { + fast_sint_t t; + for (t = omp_num_threads - 1; t >= 0; --t) + { + sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets; + fast_sint_t c; for (c = 0; c < ALPHABET_SIZE; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_bucket[c]; induction_bucket[c] = A - B; temp_bucket[c] = A; } + } + } + + #pragma omp barrier + + { + libsais_final_order_scan_right_to_left_8u_block_place(SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count); + } + } +#endif + } +} + +static void libsais_final_sorting_scan_right_to_left_32s_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(cache); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : block_size - omp_block_start; + + omp_block_start += block_start; + + if (omp_num_threads == 1) + { + libsais_final_sorting_scan_right_to_left_32s(T, SA, buckets, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + libsais_final_sorting_scan_right_to_left_32s_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size); + } + + #pragma omp barrier + + #pragma omp master + { + libsais_final_sorting_scan_right_to_left_32s_block_sort(T, buckets, cache - block_start, block_start, block_size); + } + + #pragma omp barrier + + { + libsais_compact_and_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size); + } + } +#endif + } +} + +#endif + +static sa_sint_t libsais_final_bwt_scan_right_to_left_8u_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + sa_sint_t index = -1; + + if (threads == 1 || n < 65536) + { + index = libsais_final_bwt_scan_right_to_left_8u(T, SA, induction_bucket, 0, n); + } +#if defined(_OPENMP) + else + { + fast_sint_t block_start; + for (block_start = (fast_sint_t)n - 1; block_start >= 0; ) + { + if (SA[block_start] == 0) + { + index = (sa_sint_t)block_start--; + } + else + { + fast_sint_t block_max_end = block_start - ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads); if (block_max_end < 0) { block_max_end = -1; } + fast_sint_t block_end = block_start - 1; while (block_end > block_max_end && SA[block_end] != 0) { block_end--; } + fast_sint_t block_size = block_start - block_end; + + if (block_size < 32) + { + for (; block_start > block_end; block_start -= 1) + { + sa_sint_t p = SA[block_start]; SA[block_start] = p & SAINT_MAX; if (p > 0) { p--; uint8_t c0 = T[p - (p > 0)], c1 = T[p]; SA[block_start] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? p : t; } + } + } + else + { + libsais_final_bwt_scan_right_to_left_8u_block_omp(T, SA, induction_bucket, block_end + 1, block_size, threads, thread_state); + block_start = block_end; + } + } + } + } +#else + UNUSED(thread_state); +#endif + + return index; +} + +static void libsais_final_bwt_aux_scan_right_to_left_8u_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + if (threads == 1 || n < 65536) + { + libsais_final_bwt_aux_scan_right_to_left_8u(T, SA, rm, I, induction_bucket, 0, n); + } +#if defined(_OPENMP) + else + { + fast_sint_t block_start; + for (block_start = (fast_sint_t)n - 1; block_start >= 0; ) + { + if (SA[block_start] == 0) + { + block_start--; + } + else + { + fast_sint_t block_max_end = block_start - ((fast_sint_t)threads) * ((LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads) / 2); if (block_max_end < 0) { block_max_end = -1; } + fast_sint_t block_end = block_start - 1; while (block_end > block_max_end && SA[block_end] != 0) { block_end--; } + fast_sint_t block_size = block_start - block_end; + + if (block_size < 32) + { + for (; block_start > block_end; block_start -= 1) + { + sa_sint_t p = SA[block_start]; SA[block_start] = p & SAINT_MAX; if (p > 0) { p--; uint8_t c0 = T[p - (p > 0)], c1 = T[p]; SA[block_start] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? 
p : t; if ((p & rm) == 0) { I[p / (rm + 1)] = induction_bucket[T[p]] + 1; } } + } + } + else + { + libsais_final_bwt_aux_scan_right_to_left_8u_block_omp(T, SA, rm, I, induction_bucket, block_end + 1, block_size, threads, thread_state); + block_start = block_end; + } + } + } + } +#else + UNUSED(thread_state); +#endif +} + +static void libsais_final_sorting_scan_right_to_left_8u_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + if (threads == 1 || n < 65536) + { + libsais_final_sorting_scan_right_to_left_8u(T, SA, induction_bucket, 0, n); + } +#if defined(_OPENMP) + else + { + fast_sint_t block_start; + for (block_start = (fast_sint_t)n - 1; block_start >= 0; ) + { + if (SA[block_start] == 0) + { + block_start--; + } + else + { + fast_sint_t block_max_end = block_start - ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads); if (block_max_end < -1) { block_max_end = -1; } + fast_sint_t block_end = block_start - 1; while (block_end > block_max_end && SA[block_end] != 0) { block_end--; } + fast_sint_t block_size = block_start - block_end; + + if (block_size < 32) + { + for (; block_start > block_end; block_start -= 1) + { + sa_sint_t p = SA[block_start]; SA[block_start] = p & SAINT_MAX; if (p > 0) { p--; SA[--induction_bucket[T[p]]] = p | ((sa_sint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1)); } + } + } + else + { + libsais_final_sorting_scan_right_to_left_8u_block_omp(T, SA, induction_bucket, block_end + 1, block_size, threads, thread_state); + block_start = block_end; + } + } + } + } +#else + UNUSED(thread_state); +#endif +} + +static void libsais_final_sorting_scan_right_to_left_32s_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + if (threads == 1 || n < 65536) + { + libsais_final_sorting_scan_right_to_left_32s(T, SA, induction_bucket, 0, n); + } +#if defined(_OPENMP) + else + { + fast_sint_t block_start, block_end; + for (block_start = (fast_sint_t)n - 1; block_start >= 0; block_start = block_end) + { + block_end = block_start - (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end < 0) { block_end = -1; } + + libsais_final_sorting_scan_right_to_left_32s_block_omp(T, SA, induction_bucket, thread_state[0].state.cache, block_end + 1, block_start - block_end, threads); + } + } +#else + UNUSED(thread_state); +#endif +} + +static void libsais_clear_lms_suffixes_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT bucket_start, sa_sint_t * RESTRICT bucket_end, sa_sint_t threads) +{ + fast_sint_t c; + +#if defined(_OPENMP) + #pragma omp parallel for schedule(static, 1) num_threads(threads) if(threads > 1 && n >= 65536) +#else + UNUSED(threads); UNUSED(n); +#endif + for (c = 0; c < k; ++c) + { + if (bucket_end[c] > bucket_start[c]) + { + memset(&SA[bucket_start[c]], 0, ((size_t)bucket_end[c] - (size_t)bucket_start[c]) * sizeof(sa_sint_t)); + } + } +} + +static sa_sint_t libsais_induce_final_order_8u_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t bwt, sa_sint_t r, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + if (!bwt) + { + libsais_final_sorting_scan_left_to_right_8u_omp(T, SA, n, &buckets[6 * ALPHABET_SIZE], threads, thread_state); + if (threads > 1 && n >= 65536) 
{ libsais_clear_lms_suffixes_omp(SA, n, ALPHABET_SIZE, &buckets[6 * ALPHABET_SIZE], &buckets[7 * ALPHABET_SIZE], threads); } + libsais_final_sorting_scan_right_to_left_8u_omp(T, SA, n, &buckets[7 * ALPHABET_SIZE], threads, thread_state); + return 0; + } + else if (I != NULL) + { + libsais_final_bwt_aux_scan_left_to_right_8u_omp(T, SA, n, r - 1, I, &buckets[6 * ALPHABET_SIZE], threads, thread_state); + if (threads > 1 && n >= 65536) { libsais_clear_lms_suffixes_omp(SA, n, ALPHABET_SIZE, &buckets[6 * ALPHABET_SIZE], &buckets[7 * ALPHABET_SIZE], threads); } + libsais_final_bwt_aux_scan_right_to_left_8u_omp(T, SA, n, r - 1, I, &buckets[7 * ALPHABET_SIZE], threads, thread_state); + return 0; + } + else + { + libsais_final_bwt_scan_left_to_right_8u_omp(T, SA, n, &buckets[6 * ALPHABET_SIZE], threads, thread_state); + if (threads > 1 && n >= 65536) { libsais_clear_lms_suffixes_omp(SA, n, ALPHABET_SIZE, &buckets[6 * ALPHABET_SIZE], &buckets[7 * ALPHABET_SIZE], threads); } + return libsais_final_bwt_scan_right_to_left_8u_omp(T, SA, n, &buckets[7 * ALPHABET_SIZE], threads, thread_state); + } +} + +static void libsais_induce_final_order_32s_6k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + libsais_final_sorting_scan_left_to_right_32s_omp(T, SA, n, &buckets[4 * k], threads, thread_state); + libsais_final_sorting_scan_right_to_left_32s_omp(T, SA, n, &buckets[5 * k], threads, thread_state); +} + +static void libsais_induce_final_order_32s_4k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + libsais_final_sorting_scan_left_to_right_32s_omp(T, SA, n, &buckets[2 * k], threads, thread_state); + libsais_final_sorting_scan_right_to_left_32s_omp(T, SA, n, &buckets[3 * k], threads, thread_state); +} + +static void libsais_induce_final_order_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + libsais_final_sorting_scan_left_to_right_32s_omp(T, SA, n, &buckets[1 * k], threads, thread_state); + libsais_final_sorting_scan_right_to_left_32s_omp(T, SA, n, &buckets[0 * k], threads, thread_state); +} + +static void libsais_induce_final_order_32s_1k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + libsais_count_suffixes_32s(T, n, k, buckets); + libsais_initialize_buckets_start_32s_1k(k, buckets); + libsais_final_sorting_scan_left_to_right_32s_omp(T, SA, n, buckets, threads, thread_state); + + libsais_count_suffixes_32s(T, n, k, buckets); + libsais_initialize_buckets_end_32s_1k(k, buckets); + libsais_final_sorting_scan_right_to_left_32s_omp(T, SA, n, buckets, threads, thread_state); +} + +static sa_sint_t libsais_renumber_unique_and_nonunique_lms_suffixes_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t f, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + sa_sint_t * RESTRICT SAm = &SA[m]; + + sa_sint_t i, j; + for (i = (sa_sint_t)omp_block_start, j = (sa_sint_t)omp_block_start + (sa_sint_t)omp_block_size - 2 * (sa_sint_t)prefetch_distance - 3; i < j; i += 4) + { + libsais_prefetch(&SA[i + 3 * prefetch_distance]); + 
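+ /* Prefetch for write the SAm cells that will be updated two prefetch distances ahead. */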
+ libsais_prefetchw(&SAm[((sa_uint_t)SA[i + 2 * prefetch_distance + 0]) >> 1]); + libsais_prefetchw(&SAm[((sa_uint_t)SA[i + 2 * prefetch_distance + 1]) >> 1]); + libsais_prefetchw(&SAm[((sa_uint_t)SA[i + 2 * prefetch_distance + 2]) >> 1]); + libsais_prefetchw(&SAm[((sa_uint_t)SA[i + 2 * prefetch_distance + 3]) >> 1]); + + sa_uint_t q0 = (sa_uint_t)SA[i + prefetch_distance + 0]; const sa_sint_t * Tq0 = &T[q0]; libsais_prefetchw(SAm[q0 >> 1] < 0 ? Tq0 : NULL); + sa_uint_t q1 = (sa_uint_t)SA[i + prefetch_distance + 1]; const sa_sint_t * Tq1 = &T[q1]; libsais_prefetchw(SAm[q1 >> 1] < 0 ? Tq1 : NULL); + sa_uint_t q2 = (sa_uint_t)SA[i + prefetch_distance + 2]; const sa_sint_t * Tq2 = &T[q2]; libsais_prefetchw(SAm[q2 >> 1] < 0 ? Tq2 : NULL); + sa_uint_t q3 = (sa_uint_t)SA[i + prefetch_distance + 3]; const sa_sint_t * Tq3 = &T[q3]; libsais_prefetchw(SAm[q3 >> 1] < 0 ? Tq3 : NULL); + + sa_uint_t p0 = (sa_uint_t)SA[i + 0]; sa_sint_t s0 = SAm[p0 >> 1]; if (s0 < 0) { T[p0] |= SAINT_MIN; f++; s0 = i + 0 + SAINT_MIN + f; } SAm[p0 >> 1] = s0 - f; + sa_uint_t p1 = (sa_uint_t)SA[i + 1]; sa_sint_t s1 = SAm[p1 >> 1]; if (s1 < 0) { T[p1] |= SAINT_MIN; f++; s1 = i + 1 + SAINT_MIN + f; } SAm[p1 >> 1] = s1 - f; + sa_uint_t p2 = (sa_uint_t)SA[i + 2]; sa_sint_t s2 = SAm[p2 >> 1]; if (s2 < 0) { T[p2] |= SAINT_MIN; f++; s2 = i + 2 + SAINT_MIN + f; } SAm[p2 >> 1] = s2 - f; + sa_uint_t p3 = (sa_uint_t)SA[i + 3]; sa_sint_t s3 = SAm[p3 >> 1]; if (s3 < 0) { T[p3] |= SAINT_MIN; f++; s3 = i + 3 + SAINT_MIN + f; } SAm[p3 >> 1] = s3 - f; + } + + for (j += 2 * (sa_sint_t)prefetch_distance + 3; i < j; i += 1) + { + sa_uint_t p = (sa_uint_t)SA[i]; sa_sint_t s = SAm[p >> 1]; if (s < 0) { T[p] |= SAINT_MIN; f++; s = i + SAINT_MIN + f; } SAm[p >> 1] = s - f; + } + + return f; +} + +static void libsais_compact_unique_and_nonunique_lms_suffixes_32s(sa_sint_t * RESTRICT SA, sa_sint_t m, fast_sint_t * pl, fast_sint_t * pr, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + sa_sint_t * RESTRICT SAl = &SA[0]; + sa_sint_t * RESTRICT SAr = &SA[0]; + + fast_sint_t i, j, l = *pl - 1, r = *pr - 1; + for (i = (fast_sint_t)m + omp_block_start + omp_block_size - 1, j = (fast_sint_t)m + omp_block_start + 3; i >= j; i -= 4) + { + libsais_prefetch(&SA[i - prefetch_distance]); + + sa_sint_t p0 = SA[i - 0]; SAl[l] = p0 & SAINT_MAX; l -= p0 < 0; SAr[r] = p0 - 1; r -= p0 > 0; + sa_sint_t p1 = SA[i - 1]; SAl[l] = p1 & SAINT_MAX; l -= p1 < 0; SAr[r] = p1 - 1; r -= p1 > 0; + sa_sint_t p2 = SA[i - 2]; SAl[l] = p2 & SAINT_MAX; l -= p2 < 0; SAr[r] = p2 - 1; r -= p2 > 0; + sa_sint_t p3 = SA[i - 3]; SAl[l] = p3 & SAINT_MAX; l -= p3 < 0; SAr[r] = p3 - 1; r -= p3 > 0; + } + + for (j -= 3; i >= j; i -= 1) + { + sa_sint_t p = SA[i]; SAl[l] = p & SAINT_MAX; l -= p < 0; SAr[r] = p - 1; r -= p > 0; + } + + *pl = l + 1; *pr = r + 1; +} + + +#if defined(_OPENMP) + +static sa_sint_t libsais_count_unique_suffixes(sa_sint_t * RESTRICT SA, sa_sint_t m, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + sa_sint_t * RESTRICT SAm = &SA[m]; + + fast_sint_t i, j; sa_sint_t f0 = 0, f1 = 0, f2 = 0, f3 = 0; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4) + { + libsais_prefetch(&SA[i + 2 * prefetch_distance]); + + libsais_prefetch(&SAm[((sa_uint_t)SA[i + prefetch_distance + 0]) >> 1]); + libsais_prefetch(&SAm[((sa_uint_t)SA[i + prefetch_distance + 1]) >> 1]); + libsais_prefetch(&SAm[((sa_uint_t)SA[i + 
prefetch_distance + 2]) >> 1]); + libsais_prefetch(&SAm[((sa_uint_t)SA[i + prefetch_distance + 3]) >> 1]); + + f0 += SAm[((sa_uint_t)SA[i + 0]) >> 1] < 0; + f1 += SAm[((sa_uint_t)SA[i + 1]) >> 1] < 0; + f2 += SAm[((sa_uint_t)SA[i + 2]) >> 1] < 0; + f3 += SAm[((sa_uint_t)SA[i + 3]) >> 1] < 0; + } + + for (j += prefetch_distance + 3; i < j; i += 1) + { + f0 += SAm[((sa_uint_t)SA[i]) >> 1] < 0; + } + + return f0 + f1 + f2 + f3; +} + +#endif + +static sa_sint_t libsais_renumber_unique_and_nonunique_lms_suffixes_32s_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + sa_sint_t f = 0; + +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && m >= 65536) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(thread_state); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (m / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : m - omp_block_start; + + if (omp_num_threads == 1) + { + f = libsais_renumber_unique_and_nonunique_lms_suffixes_32s(T, SA, m, 0, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + thread_state[omp_thread_num].state.count = libsais_count_unique_suffixes(SA, m, omp_block_start, omp_block_size); + } + + #pragma omp barrier + + { + fast_sint_t t, count = 0; for (t = 0; t < omp_thread_num; ++t) { count += thread_state[t].state.count; } + + if (omp_thread_num == omp_num_threads - 1) + { + f = (sa_sint_t)(count + thread_state[omp_thread_num].state.count); + } + + libsais_renumber_unique_and_nonunique_lms_suffixes_32s(T, SA, m, (sa_sint_t)count, omp_block_start, omp_block_size); + } + } +#endif + } + + return f; +} + +static void libsais_compact_unique_and_nonunique_lms_suffixes_32s_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs, sa_sint_t f, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 131072 && m < fs) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(thread_state); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (((fast_sint_t)n >> 1) / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : ((fast_sint_t)n >> 1) - omp_block_start; + + if (omp_num_threads == 1) + { + fast_sint_t l = m, r = (fast_sint_t)n + (fast_sint_t)fs; + libsais_compact_unique_and_nonunique_lms_suffixes_32s(SA, m, &l, &r, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + thread_state[omp_thread_num].state.position = (fast_sint_t)m + ((fast_sint_t)n >> 1) + omp_block_start + omp_block_size; + thread_state[omp_thread_num].state.count = (fast_sint_t)m + omp_block_start + omp_block_size; + + libsais_compact_unique_and_nonunique_lms_suffixes_32s(SA, m, &thread_state[omp_thread_num].state.position, &thread_state[omp_thread_num].state.count, omp_block_start, omp_block_size); + } + + #pragma omp barrier + + #pragma omp master + { + fast_sint_t t, position; + + for (position = m, t = omp_num_threads - 1; t >= 0; --t) + { + fast_sint_t omp_block_end = t < omp_num_threads - 1 ? omp_block_stride * (t + 1) : ((fast_sint_t)n >> 1); + fast_sint_t count = ((fast_sint_t)m + ((fast_sint_t)n >> 1) + omp_block_end - thread_state[t].state.position); + + if (count > 0) + { + position -= count; memcpy(&SA[position], &SA[thread_state[t].state.position], (size_t)count * sizeof(sa_sint_t)); + } + } + + for (position = (fast_sint_t)n + (fast_sint_t)fs, t = omp_num_threads - 1; t >= 0; --t) + { + fast_sint_t omp_block_end = t < omp_num_threads - 1 ? omp_block_stride * (t + 1) : ((fast_sint_t)n >> 1); + fast_sint_t count = ((fast_sint_t)m + omp_block_end - thread_state[t].state.count); + + if (count > 0) + { + position -= count; memcpy(&SA[position], &SA[thread_state[t].state.count], (size_t)count * sizeof(sa_sint_t)); + } + } + } + } +#endif + } + + memcpy(&SA[(fast_sint_t)n + (fast_sint_t)fs - (fast_sint_t)m], &SA[(fast_sint_t)m - (fast_sint_t)f], (size_t)f * sizeof(sa_sint_t)); +} + +static sa_sint_t libsais_compact_lms_suffixes_32s_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + sa_sint_t f = libsais_renumber_unique_and_nonunique_lms_suffixes_32s_omp(T, SA, m, threads, thread_state); + libsais_compact_unique_and_nonunique_lms_suffixes_32s_omp(SA, n, m, fs, f, threads, thread_state); + + return f; +} + +static void libsais_merge_unique_lms_suffixes_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, fast_sint_t l, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + const sa_sint_t * RESTRICT SAnm = &SA[(fast_sint_t)n - (fast_sint_t)m - 1 + l]; + + sa_sint_t i, j; fast_sint_t tmp = *SAnm++; + for (i = (sa_sint_t)omp_block_start, j = (sa_sint_t)omp_block_start + (sa_sint_t)omp_block_size - 6; i < j; i += 4) + { + libsais_prefetch(&T[i + prefetch_distance]); + + sa_sint_t c0 = T[i + 0]; if (c0 < 0) { T[i + 0] = c0 & SAINT_MAX; SA[tmp] = i + 0; i++; tmp = *SAnm++; } + sa_sint_t c1 = T[i + 1]; if (c1 < 0) { T[i + 1] = c1 & SAINT_MAX; SA[tmp] = i + 1; i++; tmp = *SAnm++; } + sa_sint_t c2 = T[i + 2]; if (c2 < 0) { T[i + 2] = c2 & SAINT_MAX; SA[tmp] = i + 2; i++; tmp = *SAnm++; } + sa_sint_t c3 = T[i + 3]; if (c3 < 0) { T[i + 3] = c3 & SAINT_MAX; SA[tmp] = i + 3; i++; tmp = *SAnm++; } + } + + for (j += 6; i < j; i += 1) + { + sa_sint_t c = T[i]; if (c < 0) { T[i] = c & SAINT_MAX; SA[tmp] = i; i++; tmp = *SAnm++; } + } +} + +static void libsais_merge_nonunique_lms_suffixes_32s(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, fast_sint_t l, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ 
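+ /* Replace each zero slot in SA (a non-unique LMS suffix cleared earlier) with the next entry of the sorted LMS block stored near the end of SA, starting at offset l. */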
+ const fast_sint_t prefetch_distance = 32; + + const sa_sint_t * RESTRICT SAnm = &SA[(fast_sint_t)n - (fast_sint_t)m - 1 + l]; + + fast_sint_t i, j; sa_sint_t tmp = *SAnm++; + for (i = omp_block_start, j = omp_block_start + omp_block_size - 3; i < j; i += 4) + { + libsais_prefetch(&SA[i + prefetch_distance]); + + if (SA[i + 0] == 0) { SA[i + 0] = tmp; tmp = *SAnm++; } + if (SA[i + 1] == 0) { SA[i + 1] = tmp; tmp = *SAnm++; } + if (SA[i + 2] == 0) { SA[i + 2] = tmp; tmp = *SAnm++; } + if (SA[i + 3] == 0) { SA[i + 3] = tmp; tmp = *SAnm++; } + } + + for (j += 3; i < j; i += 1) + { + if (SA[i] == 0) { SA[i] = tmp; tmp = *SAnm++; } + } +} + +static void libsais_merge_unique_lms_suffixes_32s_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(thread_state); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start; + + if (omp_num_threads == 1) + { + libsais_merge_unique_lms_suffixes_32s(T, SA, n, m, 0, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + thread_state[omp_thread_num].state.count = libsais_count_negative_marked_suffixes(T, omp_block_start, omp_block_size); + } + + #pragma omp barrier + + { + fast_sint_t t, count = 0; for (t = 0; t < omp_thread_num; ++t) { count += thread_state[t].state.count; } + + libsais_merge_unique_lms_suffixes_32s(T, SA, n, m, count, omp_block_start, omp_block_size); + } + } +#endif + } +} + +static void libsais_merge_nonunique_lms_suffixes_32s_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t f, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && m >= 65536) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(thread_state); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (m / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : m - omp_block_start; + + if (omp_num_threads == 1) + { + libsais_merge_nonunique_lms_suffixes_32s(SA, n, m, f, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + thread_state[omp_thread_num].state.count = libsais_count_zero_marked_suffixes(SA, omp_block_start, omp_block_size); + } + + #pragma omp barrier + + { + fast_sint_t t, count = f; for (t = 0; t < omp_thread_num; ++t) { count += thread_state[t].state.count; } + + libsais_merge_nonunique_lms_suffixes_32s(SA, n, m, count, omp_block_start, omp_block_size); + } + } +#endif + } +} + +static void libsais_merge_compacted_lms_suffixes_32s_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t f, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + libsais_merge_unique_lms_suffixes_32s_omp(T, SA, n, m, threads, thread_state); + libsais_merge_nonunique_lms_suffixes_32s_omp(SA, n, m, f, threads, thread_state); +} + +static void libsais_reconstruct_compacted_lms_suffixes_32s_2k_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m, sa_sint_t fs, sa_sint_t f, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + if (f > 0) + { + memmove(&SA[n - m - 1], &SA[n + fs - m], (size_t)f * sizeof(sa_sint_t)); + + libsais_count_and_gather_compacted_lms_suffixes_32s_2k_omp(T, SA, n, k, buckets, threads, thread_state); + libsais_reconstruct_lms_suffixes_omp(SA, n, m - f, threads); + + memcpy(&SA[n - m - 1 + f], &SA[0], ((size_t)m - (size_t)f) * sizeof(sa_sint_t)); + memset(&SA[0], 0, (size_t)m * sizeof(sa_sint_t)); + + libsais_merge_compacted_lms_suffixes_32s_omp(T, SA, n, m, f, threads, thread_state); + } + else + { + libsais_count_and_gather_lms_suffixes_32s_2k(T, SA, n, k, buckets, 0, n); + libsais_reconstruct_lms_suffixes_omp(SA, n, m, threads); + } +} + +static void libsais_reconstruct_compacted_lms_suffixes_32s_1k_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs, sa_sint_t f, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + if (f > 0) + { + memmove(&SA[n - m - 1], &SA[n + fs - m], (size_t)f * sizeof(sa_sint_t)); + + libsais_gather_compacted_lms_suffixes_32s(T, SA, n); + libsais_reconstruct_lms_suffixes_omp(SA, n, m - f, threads); + + memcpy(&SA[n - m - 1 + f], &SA[0], ((size_t)m - (size_t)f) * sizeof(sa_sint_t)); + memset(&SA[0], 0, (size_t)m * sizeof(sa_sint_t)); + + libsais_merge_compacted_lms_suffixes_32s_omp(T, SA, n, m, f, threads, thread_state); + } + else + { + libsais_gather_lms_suffixes_32s(T, SA, n); + libsais_reconstruct_lms_suffixes_omp(SA, n, m, threads); + } +} + +static sa_sint_t libsais_main_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t fs, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + if (k > 0 && fs / k >= 6) + { + sa_sint_t alignment = (fs - 1024) / k >= 6 ? 1024 : 16; + sa_sint_t * RESTRICT buckets = (fs - alignment) / k >= 6 ? 
(sa_sint_t *)libsais_align_up(&SA[n + fs - 6 * k - alignment], (size_t)alignment * sizeof(sa_sint_t)) : &SA[n + fs - 6 * k]; + + sa_sint_t m = libsais_count_and_gather_lms_suffixes_32s_4k_omp(T, SA, n, k, buckets, threads, thread_state); + if (m > 1) + { + memset(SA, 0, ((size_t)n - (size_t)m) * sizeof(sa_sint_t)); + + sa_sint_t first_lms_suffix = SA[n - m]; + sa_sint_t left_suffixes_count = libsais_initialize_buckets_for_lms_suffixes_radix_sort_32s_6k(T, k, buckets, first_lms_suffix); + + libsais_radix_sort_lms_suffixes_32s_6k_omp(T, SA, n, m, &buckets[4 * k], threads, thread_state); + libsais_radix_sort_set_markers_32s_6k_omp(SA, k, &buckets[4 * k], threads); + + if (threads > 1 && n >= 65536) { memset(&SA[(fast_sint_t)n - (fast_sint_t)m], 0, (size_t)m * sizeof(sa_sint_t)); } + + libsais_initialize_buckets_for_partial_sorting_32s_6k(T, k, buckets, first_lms_suffix, left_suffixes_count); + libsais_induce_partial_order_32s_6k_omp(T, SA, n, k, buckets, first_lms_suffix, left_suffixes_count, threads, thread_state); + + sa_sint_t names = libsais_renumber_and_mark_distinct_lms_suffixes_32s_4k_omp(SA, n, m, threads, thread_state); + if (names < m) + { + sa_sint_t f = libsais_compact_lms_suffixes_32s_omp(T, SA, n, m, fs, threads, thread_state); + + if (libsais_main_32s(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f, threads, thread_state) != 0) + { + return -2; + } + + libsais_reconstruct_compacted_lms_suffixes_32s_2k_omp(T, SA, n, k, m, fs, f, buckets, threads, thread_state); + } + else + { + libsais_count_lms_suffixes_32s_2k(T, n, k, buckets); + } + + libsais_initialize_buckets_start_and_end_32s_4k(k, buckets); + libsais_place_lms_suffixes_histogram_32s_4k(SA, n, k, m, buckets); + libsais_induce_final_order_32s_4k(T, SA, n, k, buckets, threads, thread_state); + } + else + { + SA[0] = SA[n - 1]; + + libsais_initialize_buckets_start_and_end_32s_6k(k, buckets); + libsais_place_lms_suffixes_histogram_32s_6k(SA, n, k, m, buckets); + libsais_induce_final_order_32s_6k(T, SA, n, k, buckets, threads, thread_state); + } + + return 0; + } + else if (k > 0 && fs / k >= 4) + { + sa_sint_t alignment = (fs - 1024) / k >= 4 ? 1024 : 16; + sa_sint_t * RESTRICT buckets = (fs - alignment) / k >= 4 ? 
(sa_sint_t *)libsais_align_up(&SA[n + fs - 4 * k - alignment], (size_t)alignment * sizeof(sa_sint_t)) : &SA[n + fs - 4 * k]; + + sa_sint_t m = libsais_count_and_gather_lms_suffixes_32s_2k_omp(T, SA, n, k, buckets, threads, thread_state); + if (m > 1) + { + libsais_initialize_buckets_for_radix_and_partial_sorting_32s_4k(T, k, buckets, SA[n - m]); + + libsais_radix_sort_lms_suffixes_32s_2k_omp(T, SA, n, m, &buckets[1], threads, thread_state); + libsais_radix_sort_set_markers_32s_4k_omp(SA, k, &buckets[1], threads); + + libsais_place_lms_suffixes_interval_32s_4k(SA, n, k, m - 1, buckets); + libsais_induce_partial_order_32s_4k_omp(T, SA, n, k, buckets, threads, thread_state); + + sa_sint_t names = libsais_renumber_and_mark_distinct_lms_suffixes_32s_4k_omp(SA, n, m, threads, thread_state); + if (names < m) + { + sa_sint_t f = libsais_compact_lms_suffixes_32s_omp(T, SA, n, m, fs, threads, thread_state); + + if (libsais_main_32s(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f, threads, thread_state) != 0) + { + return -2; + } + + libsais_reconstruct_compacted_lms_suffixes_32s_2k_omp(T, SA, n, k, m, fs, f, buckets, threads, thread_state); + } + else + { + libsais_count_lms_suffixes_32s_2k(T, n, k, buckets); + } + } + else + { + SA[0] = SA[n - 1]; + } + + libsais_initialize_buckets_start_and_end_32s_4k(k, buckets); + libsais_place_lms_suffixes_histogram_32s_4k(SA, n, k, m, buckets); + libsais_induce_final_order_32s_4k(T, SA, n, k, buckets, threads, thread_state); + + return 0; + } + else if (k > 0 && fs / k >= 2) + { + sa_sint_t alignment = (fs - 1024) / k >= 2 ? 1024 : 16; + sa_sint_t * RESTRICT buckets = (fs - alignment) / k >= 2 ? (sa_sint_t *)libsais_align_up(&SA[n + fs - 2 * k - alignment], (size_t)alignment * sizeof(sa_sint_t)) : &SA[n + fs - 2 * k]; + + sa_sint_t m = libsais_count_and_gather_lms_suffixes_32s_2k_omp(T, SA, n, k, buckets, threads, thread_state); + if (m > 1) + { + libsais_initialize_buckets_for_lms_suffixes_radix_sort_32s_2k(T, k, buckets, SA[n - m]); + + libsais_radix_sort_lms_suffixes_32s_2k_omp(T, SA, n, m, &buckets[1], threads, thread_state); + libsais_place_lms_suffixes_interval_32s_2k(SA, n, k, m - 1, buckets); + + libsais_initialize_buckets_start_and_end_32s_2k(k, buckets); + libsais_induce_partial_order_32s_2k_omp(T, SA, n, k, buckets, threads, thread_state); + + sa_sint_t names = libsais_renumber_and_mark_distinct_lms_suffixes_32s_1k_omp(T, SA, n, m, threads); + if (names < m) + { + sa_sint_t f = libsais_compact_lms_suffixes_32s_omp(T, SA, n, m, fs, threads, thread_state); + + if (libsais_main_32s(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f, threads, thread_state) != 0) + { + return -2; + } + + libsais_reconstruct_compacted_lms_suffixes_32s_2k_omp(T, SA, n, k, m, fs, f, buckets, threads, thread_state); + } + else + { + libsais_count_lms_suffixes_32s_2k(T, n, k, buckets); + } + } + else + { + SA[0] = SA[n - 1]; + } + + libsais_initialize_buckets_end_32s_2k(k, buckets); + libsais_place_lms_suffixes_histogram_32s_2k(SA, n, k, m, buckets); + + libsais_initialize_buckets_start_and_end_32s_2k(k, buckets); + libsais_induce_final_order_32s_2k(T, SA, n, k, buckets, threads, thread_state); + + return 0; + } + else + { + sa_sint_t * buffer = fs < k ? (sa_sint_t *)libsais_alloc_aligned((size_t)k * sizeof(sa_sint_t), 4096) : (sa_sint_t *)NULL; + + sa_sint_t alignment = fs - 1024 >= k ? 1024 : 16; + sa_sint_t * RESTRICT buckets = fs - alignment >= k ? 
(sa_sint_t *)libsais_align_up(&SA[n + fs - k - alignment], (size_t)alignment * sizeof(sa_sint_t)) : fs >= k ? &SA[n + fs - k] : buffer; + + if (buckets == NULL) { return -2; } + + memset(SA, 0, (size_t)n * sizeof(sa_sint_t)); + + libsais_count_suffixes_32s(T, n, k, buckets); + libsais_initialize_buckets_end_32s_1k(k, buckets); + + sa_sint_t m = libsais_radix_sort_lms_suffixes_32s_1k(T, SA, n, buckets); + if (m > 1) + { + libsais_induce_partial_order_32s_1k_omp(T, SA, n, k, buckets, threads, thread_state); + + sa_sint_t names = libsais_renumber_and_mark_distinct_lms_suffixes_32s_1k_omp(T, SA, n, m, threads); + if (names < m) + { + if (buffer != NULL) { libsais_free_aligned(buffer); buckets = NULL; } + + sa_sint_t f = libsais_compact_lms_suffixes_32s_omp(T, SA, n, m, fs, threads, thread_state); + + if (libsais_main_32s(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f, threads, thread_state) != 0) + { + return -2; + } + + libsais_reconstruct_compacted_lms_suffixes_32s_1k_omp(T, SA, n, m, fs, f, threads, thread_state); + + if (buckets == NULL) { buckets = buffer = (sa_sint_t *)libsais_alloc_aligned((size_t)k * sizeof(sa_sint_t), 4096); } + if (buckets == NULL) { return -2; } + } + + libsais_count_suffixes_32s(T, n, k, buckets); + libsais_initialize_buckets_end_32s_1k(k, buckets); + libsais_place_lms_suffixes_interval_32s_1k(T, SA, k, m, buckets); + } + + libsais_induce_final_order_32s_1k(T, SA, n, k, buckets, threads, thread_state); + libsais_free_aligned(buffer); + + return 0; + } +} + +int32_t libsais_main_32s_internal(int32_t * T, int32_t * SA, int32_t n, int32_t k, int32_t fs, int32_t threads) +{ + LIBSAIS_THREAD_STATE * RESTRICT thread_state = threads > 1 ? libsais_alloc_thread_state(threads) : NULL; + + sa_sint_t index = thread_state != NULL || threads == 1 + ? 
libsais_main_32s(T, SA, n, k, fs, threads, thread_state) + : -2; + + libsais_free_thread_state(thread_state); + + return index; +} + +static sa_sint_t libsais_main_8u(const uint8_t * T, sa_sint_t * SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t bwt, sa_sint_t r, sa_sint_t * RESTRICT I, sa_sint_t fs, sa_sint_t * freq, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + sa_sint_t m = libsais_count_and_gather_lms_suffixes_8u_omp(T, SA, n, buckets, threads, thread_state); + + libsais_initialize_buckets_start_and_end_8u(buckets, freq); + + if (m > 0) + { + sa_sint_t first_lms_suffix = SA[n - m]; + sa_sint_t left_suffixes_count = libsais_initialize_buckets_for_lms_suffixes_radix_sort_8u(T, buckets, first_lms_suffix); + + if (threads > 1 && n >= 65536) { memset(SA, 0, ((size_t)n - (size_t)m) * sizeof(sa_sint_t)); } + libsais_radix_sort_lms_suffixes_8u_omp(T, SA, n, m, buckets, threads, thread_state); + if (threads > 1 && n >= 65536) { memset(&SA[(fast_sint_t)n - (fast_sint_t)m], 0, (size_t)m * sizeof(sa_sint_t)); } + + libsais_initialize_buckets_for_partial_sorting_8u(T, buckets, first_lms_suffix, left_suffixes_count); + libsais_induce_partial_order_8u_omp(T, SA, n, buckets, first_lms_suffix, left_suffixes_count, threads, thread_state); + + sa_sint_t names = libsais_renumber_and_gather_lms_suffixes_8u_omp(SA, n, m, fs, threads, thread_state); + if (names < m) + { + if (libsais_main_32s(SA + n + fs - m, SA, m, names, fs + n - 2 * m, threads, thread_state) != 0) + { + return -2; + } + + libsais_gather_lms_suffixes_8u_omp(T, SA, n, threads, thread_state); + libsais_reconstruct_lms_suffixes_omp(SA, n, m, threads); + } + + libsais_place_lms_suffixes_interval_8u(SA, n, m, buckets); + } + else + { + memset(SA, 0, (size_t)n * sizeof(sa_sint_t)); + } + + return libsais_induce_final_order_8u_omp(T, SA, n, bwt, r, I, buckets, threads, thread_state); +} + +static sa_sint_t libsais_main(const uint8_t * T, sa_sint_t * SA, sa_sint_t n, sa_sint_t bwt, sa_sint_t r, sa_sint_t * I, sa_sint_t fs, sa_sint_t * freq, sa_sint_t threads) +{ + LIBSAIS_THREAD_STATE * RESTRICT thread_state = threads > 1 ? libsais_alloc_thread_state(threads) : NULL; + sa_sint_t * RESTRICT buckets = (sa_sint_t *)libsais_alloc_aligned(8 * ALPHABET_SIZE * sizeof(sa_sint_t), 4096); + + sa_sint_t index = buckets != NULL && (thread_state != NULL || threads == 1) + ? libsais_main_8u(T, SA, n, buckets, bwt, r, I, fs, freq, threads, thread_state) + : -2; + + libsais_free_aligned(buckets); + libsais_free_thread_state(thread_state); + + return index; +} + +static sa_sint_t libsais_main_ctx(const LIBSAIS_CONTEXT * ctx, const uint8_t * T, sa_sint_t * SA, sa_sint_t n, sa_sint_t bwt, sa_sint_t r, sa_sint_t * I, sa_sint_t fs, sa_sint_t * freq) +{ + return ctx != NULL && (ctx->buckets != NULL && (ctx->thread_state != NULL || ctx->threads == 1)) + ? 
libsais_main_8u(T, SA, n, ctx->buckets, bwt, r, I, fs, freq, (sa_sint_t)ctx->threads, ctx->thread_state) + : -2; +} + +static void libsais_bwt_copy_8u(uint8_t * RESTRICT U, sa_sint_t * RESTRICT A, sa_sint_t n) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = 0, j = (fast_sint_t)n - 7; i < j; i += 8) + { + libsais_prefetch(&A[i + prefetch_distance]); + + U[i + 0] = (uint8_t)A[i + 0]; + U[i + 1] = (uint8_t)A[i + 1]; + U[i + 2] = (uint8_t)A[i + 2]; + U[i + 3] = (uint8_t)A[i + 3]; + U[i + 4] = (uint8_t)A[i + 4]; + U[i + 5] = (uint8_t)A[i + 5]; + U[i + 6] = (uint8_t)A[i + 6]; + U[i + 7] = (uint8_t)A[i + 7]; + } + + for (j += 7; i < j; i += 1) + { + U[i] = (uint8_t)A[i]; + } +} + +#if defined(_OPENMP) + +static void libsais_bwt_copy_8u_omp(uint8_t * RESTRICT U, sa_sint_t * RESTRICT A, sa_sint_t n, sa_sint_t threads) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); + fast_sint_t omp_block_stride = ((fast_sint_t)n / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : (fast_sint_t)n - omp_block_start; +#else + UNUSED(threads); + + fast_sint_t omp_block_start = 0; + fast_sint_t omp_block_size = (fast_sint_t)n; +#endif + + libsais_bwt_copy_8u(U + omp_block_start, A + omp_block_start, (sa_sint_t)omp_block_size); + } +} + +#endif + +void * libsais_create_ctx(void) +{ + return (void *)libsais_create_ctx_main(1); +} + +void libsais_free_ctx(void * ctx) +{ + libsais_free_ctx_main((LIBSAIS_CONTEXT *)ctx); +} + +int32_t libsais(const uint8_t * T, int32_t * SA, int32_t n, int32_t fs, int32_t * freq) +{ + if ((T == NULL) || (SA == NULL) || (n < 0) || (fs < 0)) + { + return -1; + } + else if (n < 2) + { + if (n == 1) { SA[0] = 0; } + return 0; + } + + return libsais_main(T, SA, n, 0, 0, NULL, fs, freq, 1); +} + +int32_t libsais_ctx(const void * ctx, const uint8_t * T, int32_t * SA, int32_t n, int32_t fs, int32_t * freq) +{ + if ((ctx == NULL) || (T == NULL) || (SA == NULL) || (n < 0) || (fs < 0)) + { + return -1; + } + else if (n < 2) + { + if (n == 1) { SA[0] = 0; } + return 0; + } + + return libsais_main_ctx((const LIBSAIS_CONTEXT *)ctx, T, SA, n, 0, 0, NULL, fs, freq); +} + +int32_t libsais_bwt(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq) +{ + if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0)) + { + return -1; + } + else if (n <= 1) + { + if (n == 1) { U[0] = T[0]; } + return n; + } + + sa_sint_t index = libsais_main(T, A, n, 1, 0, NULL, fs, freq, 1); + if (index >= 0) + { + index++; + + U[0] = T[n - 1]; + libsais_bwt_copy_8u(U + 1, A, index - 1); + libsais_bwt_copy_8u(U + index, A + index, n - index); + } + + return index; +} + +int32_t libsais_bwt_aux(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t r, int32_t * I) +{ + if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0) || (r < 2) || ((r & (r - 1)) != 0) || (I == NULL)) + { + return -1; + } + else if (n <= 1) + { + if (n == 1) { U[0] = T[0]; } + + I[0] = n; + return 0; + } + + if (libsais_main(T, A, n, 1, r, I, fs, freq, 1) != 0) + { + return -2; + } + + U[0] = T[n - 1]; + libsais_bwt_copy_8u(U + 1, A, I[0] - 1); + libsais_bwt_copy_8u(U + I[0], A + I[0], n - I[0]); + + return 0; +} + +int32_t 
libsais_bwt_ctx(const void * ctx, const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq) +{ + if ((ctx == NULL) || (T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0)) + { + return -1; + } + else if (n <= 1) + { + if (n == 1) { U[0] = T[0]; } + return n; + } + + sa_sint_t index = libsais_main_ctx((const LIBSAIS_CONTEXT *)ctx, T, A, n, 1, 0, NULL, fs, freq); + if (index >= 0) + { + index++; + + U[0] = T[n - 1]; + +#if defined(_OPENMP) + libsais_bwt_copy_8u_omp(U + 1, A, index - 1, (sa_sint_t)((const LIBSAIS_CONTEXT *)ctx)->threads); + libsais_bwt_copy_8u_omp(U + index, A + index, n - index, (sa_sint_t)((const LIBSAIS_CONTEXT *)ctx)->threads); +#else + libsais_bwt_copy_8u(U + 1, A, index - 1); + libsais_bwt_copy_8u(U + index, A + index, n - index); +#endif + } + + return index; +} + +int32_t libsais_bwt_aux_ctx(const void * ctx, const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t r, int32_t * I) +{ + if ((ctx == NULL) || (T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0) || (r < 2) || ((r & (r - 1)) != 0) || (I == NULL)) + { + return -1; + } + else if (n <= 1) + { + if (n == 1) { U[0] = T[0]; } + + I[0] = n; + return 0; + } + + if (libsais_main_ctx((const LIBSAIS_CONTEXT *)ctx, T, A, n, 1, r, I, fs, freq) != 0) + { + return -2; + } + + U[0] = T[n - 1]; + +#if defined(_OPENMP) + libsais_bwt_copy_8u_omp(U + 1, A, I[0] - 1, (sa_sint_t)((const LIBSAIS_CONTEXT *)ctx)->threads); + libsais_bwt_copy_8u_omp(U + I[0], A + I[0], n - I[0], (sa_sint_t)((const LIBSAIS_CONTEXT *)ctx)->threads); +#else + libsais_bwt_copy_8u(U + 1, A, I[0] - 1); + libsais_bwt_copy_8u(U + I[0], A + I[0], n - I[0]); +#endif + + return 0; +} + +#if defined(_OPENMP) + +void * libsais_create_ctx_omp(int32_t threads) +{ + if (threads < 0) { return NULL; } + + threads = threads > 0 ? threads : omp_get_max_threads(); + return (void *)libsais_create_ctx_main(threads); +} + +int32_t libsais_omp(const uint8_t * T, int32_t * SA, int32_t n, int32_t fs, int32_t * freq, int32_t threads) +{ + if ((T == NULL) || (SA == NULL) || (n < 0) || (fs < 0) || (threads < 0)) + { + return -1; + } + else if (n < 2) + { + if (n == 1) { SA[0] = 0; } + return 0; + } + + threads = threads > 0 ? threads : omp_get_max_threads(); + + return libsais_main(T, SA, n, 0, 0, NULL, fs, freq, threads); +} + +int32_t libsais_bwt_omp(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t threads) +{ + if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0) || (threads < 0)) + { + return -1; + } + else if (n <= 1) + { + if (n == 1) { U[0] = T[0]; } + return n; + } + + threads = threads > 0 ? threads : omp_get_max_threads(); + + sa_sint_t index = libsais_main(T, A, n, 1, 0, NULL, fs, freq, threads); + if (index >= 0) + { + index++; + + U[0] = T[n - 1]; + libsais_bwt_copy_8u_omp(U + 1, A, index - 1, threads); + libsais_bwt_copy_8u_omp(U + index, A + index, n - index, threads); + } + + return index; +} + +int32_t libsais_bwt_aux_omp(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t r, int32_t * I, int32_t threads) +{ + if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0) || (r < 2) || ((r & (r - 1)) != 0) || (I == NULL) || (threads < 0)) + { + return -1; + } + else if (n <= 1) + { + if (n == 1) { U[0] = T[0];} + + I[0] = n; + return 0; + } + + threads = threads > 0 ? 
threads : omp_get_max_threads(); + + if (libsais_main(T, A, n, 1, r, I, fs, freq, threads) != 0) + { + return -2; + } + + U[0] = T[n - 1]; + libsais_bwt_copy_8u_omp(U + 1, A, I[0] - 1, threads); + libsais_bwt_copy_8u_omp(U + I[0], A + I[0], n - I[0], threads); + + return 0; +} + +#endif + +static LIBSAIS_UNBWT_CONTEXT * libsais_unbwt_create_ctx_main(sa_sint_t threads) +{ + LIBSAIS_UNBWT_CONTEXT * RESTRICT ctx = (LIBSAIS_UNBWT_CONTEXT *)libsais_alloc_aligned(sizeof(LIBSAIS_UNBWT_CONTEXT), 64); + sa_uint_t * RESTRICT bucket2 = (sa_uint_t *)libsais_alloc_aligned(ALPHABET_SIZE * ALPHABET_SIZE * sizeof(sa_uint_t), 4096); + uint16_t * RESTRICT fastbits = (uint16_t *)libsais_alloc_aligned((1 + (1 << UNBWT_FASTBITS)) * sizeof(uint16_t), 4096); + sa_uint_t * RESTRICT buckets = threads > 1 ? (sa_uint_t *)libsais_alloc_aligned((size_t)threads * (ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE)) * sizeof(sa_uint_t), 4096) : NULL; + + if (ctx != NULL && bucket2 != NULL && fastbits != NULL && (buckets != NULL || threads == 1)) + { + ctx->bucket2 = bucket2; + ctx->fastbits = fastbits; + ctx->buckets = buckets; + ctx->threads = threads; + + return ctx; + } + + libsais_free_aligned(buckets); + libsais_free_aligned(fastbits); + libsais_free_aligned(bucket2); + libsais_free_aligned(ctx); + + return NULL; +} + +static void libsais_unbwt_free_ctx_main(LIBSAIS_UNBWT_CONTEXT * ctx) +{ + if (ctx != NULL) + { + libsais_free_aligned(ctx->buckets); + libsais_free_aligned(ctx->fastbits); + libsais_free_aligned(ctx->bucket2); + libsais_free_aligned(ctx); + } +} + +static void libsais_unbwt_compute_histogram(const uint8_t * RESTRICT T, fast_sint_t n, sa_uint_t * RESTRICT count) +{ + const fast_sint_t prefetch_distance = 256; + + const uint8_t * RESTRICT T_p = T; + + if (n >= 1024) + { + sa_uint_t copy[4 * (ALPHABET_SIZE + 16)]; + + memset(copy, 0, 4 * (ALPHABET_SIZE + 16) * sizeof(sa_uint_t)); + + sa_uint_t * RESTRICT copy0 = copy + 0 * (ALPHABET_SIZE + 16); + sa_uint_t * RESTRICT copy1 = copy + 1 * (ALPHABET_SIZE + 16); + sa_uint_t * RESTRICT copy2 = copy + 2 * (ALPHABET_SIZE + 16); + sa_uint_t * RESTRICT copy3 = copy + 3 * (ALPHABET_SIZE + 16); + + for (; T_p < (uint8_t * )((ptrdiff_t)(T + 63) & (-64)); T_p += 1) { copy0[T_p[0]]++; } + + fast_uint_t x = ((const uint32_t *)(const void *)T_p)[0], y = ((const uint32_t *)(const void *)T_p)[1]; + + for (; T_p < (uint8_t * )((ptrdiff_t)(T + n - 8) & (-64)); T_p += 64) + { + libsais_prefetch(&T_p[prefetch_distance]); + + fast_uint_t z = ((const uint32_t *)(const void *)T_p)[2], w = ((const uint32_t *)(const void *)T_p)[3]; + copy0[(uint8_t)x]++; x >>= 8; copy1[(uint8_t)x]++; x >>= 8; copy2[(uint8_t)x]++; x >>= 8; copy3[x]++; + copy0[(uint8_t)y]++; y >>= 8; copy1[(uint8_t)y]++; y >>= 8; copy2[(uint8_t)y]++; y >>= 8; copy3[y]++; + + x = ((const uint32_t *)(const void *)T_p)[4]; y = ((const uint32_t *)(const void *)T_p)[5]; + copy0[(uint8_t)z]++; z >>= 8; copy1[(uint8_t)z]++; z >>= 8; copy2[(uint8_t)z]++; z >>= 8; copy3[z]++; + copy0[(uint8_t)w]++; w >>= 8; copy1[(uint8_t)w]++; w >>= 8; copy2[(uint8_t)w]++; w >>= 8; copy3[w]++; + + z = ((const uint32_t *)(const void *)T_p)[6]; w = ((const uint32_t *)(const void *)T_p)[7]; + copy0[(uint8_t)x]++; x >>= 8; copy1[(uint8_t)x]++; x >>= 8; copy2[(uint8_t)x]++; x >>= 8; copy3[x]++; + copy0[(uint8_t)y]++; y >>= 8; copy1[(uint8_t)y]++; y >>= 8; copy2[(uint8_t)y]++; y >>= 8; copy3[y]++; + + x = ((const uint32_t *)(const void *)T_p)[8]; y = ((const uint32_t *)(const void *)T_p)[9]; + copy0[(uint8_t)z]++; z >>= 8; copy1[(uint8_t)z]++; z >>= 
8; copy2[(uint8_t)z]++; z >>= 8; copy3[z]++; + copy0[(uint8_t)w]++; w >>= 8; copy1[(uint8_t)w]++; w >>= 8; copy2[(uint8_t)w]++; w >>= 8; copy3[w]++; + + z = ((const uint32_t *)(const void *)T_p)[10]; w = ((const uint32_t *)(const void *)T_p)[11]; + copy0[(uint8_t)x]++; x >>= 8; copy1[(uint8_t)x]++; x >>= 8; copy2[(uint8_t)x]++; x >>= 8; copy3[x]++; + copy0[(uint8_t)y]++; y >>= 8; copy1[(uint8_t)y]++; y >>= 8; copy2[(uint8_t)y]++; y >>= 8; copy3[y]++; + + x = ((const uint32_t *)(const void *)T_p)[12]; y = ((const uint32_t *)(const void *)T_p)[13]; + copy0[(uint8_t)z]++; z >>= 8; copy1[(uint8_t)z]++; z >>= 8; copy2[(uint8_t)z]++; z >>= 8; copy3[z]++; + copy0[(uint8_t)w]++; w >>= 8; copy1[(uint8_t)w]++; w >>= 8; copy2[(uint8_t)w]++; w >>= 8; copy3[w]++; + + z = ((const uint32_t *)(const void *)T_p)[14]; w = ((const uint32_t *)(const void *)T_p)[15]; + copy0[(uint8_t)x]++; x >>= 8; copy1[(uint8_t)x]++; x >>= 8; copy2[(uint8_t)x]++; x >>= 8; copy3[x]++; + copy0[(uint8_t)y]++; y >>= 8; copy1[(uint8_t)y]++; y >>= 8; copy2[(uint8_t)y]++; y >>= 8; copy3[y]++; + + x = ((const uint32_t *)(const void *)T_p)[16]; y = ((const uint32_t *)(const void *)T_p)[17]; + copy0[(uint8_t)z]++; z >>= 8; copy1[(uint8_t)z]++; z >>= 8; copy2[(uint8_t)z]++; z >>= 8; copy3[z]++; + copy0[(uint8_t)w]++; w >>= 8; copy1[(uint8_t)w]++; w >>= 8; copy2[(uint8_t)w]++; w >>= 8; copy3[w]++; + } + + copy0[(uint8_t)x]++; x >>= 8; copy1[(uint8_t)x]++; x >>= 8; copy2[(uint8_t)x]++; x >>= 8; copy3[x]++; + copy0[(uint8_t)y]++; y >>= 8; copy1[(uint8_t)y]++; y >>= 8; copy2[(uint8_t)y]++; y >>= 8; copy3[y]++; + + T_p += 8; + + fast_uint_t i; for (i = 0; i < ALPHABET_SIZE; i++) { count[i] += copy0[i] + copy1[i] + copy2[i] + copy3[i]; } + } + + for (; T_p < T + n; T_p += 1) { count[T_p[0]]++; } +} + +static void libsais_unbwt_transpose_bucket2(sa_uint_t * RESTRICT bucket2) +{ + fast_uint_t x, y, c, d; + for (x = 0; x != ALPHABET_SIZE; x += 16) + { + for (c = x; c != x + 16; ++c) + { + for (d = c + 1; d != x + 16; ++d) + { + sa_uint_t tmp = bucket2[(d << 8) + c]; bucket2[(d << 8) + c] = bucket2[(c << 8) + d]; bucket2[(c << 8) + d] = tmp; + } + } + + for (y = x + 16; y != ALPHABET_SIZE; y += 16) + { + for (c = x; c != x + 16; ++c) + { + sa_uint_t * bucket2_yc = &bucket2[(y << 8) + c]; + sa_uint_t * bucket2_cy = &bucket2[(c << 8) + y]; + + sa_uint_t tmp00 = bucket2_yc[ 0 * 256]; bucket2_yc[ 0 * 256] = bucket2_cy[ 0]; bucket2_cy[ 0] = tmp00; + sa_uint_t tmp01 = bucket2_yc[ 1 * 256]; bucket2_yc[ 1 * 256] = bucket2_cy[ 1]; bucket2_cy[ 1] = tmp01; + sa_uint_t tmp02 = bucket2_yc[ 2 * 256]; bucket2_yc[ 2 * 256] = bucket2_cy[ 2]; bucket2_cy[ 2] = tmp02; + sa_uint_t tmp03 = bucket2_yc[ 3 * 256]; bucket2_yc[ 3 * 256] = bucket2_cy[ 3]; bucket2_cy[ 3] = tmp03; + sa_uint_t tmp04 = bucket2_yc[ 4 * 256]; bucket2_yc[ 4 * 256] = bucket2_cy[ 4]; bucket2_cy[ 4] = tmp04; + sa_uint_t tmp05 = bucket2_yc[ 5 * 256]; bucket2_yc[ 5 * 256] = bucket2_cy[ 5]; bucket2_cy[ 5] = tmp05; + sa_uint_t tmp06 = bucket2_yc[ 6 * 256]; bucket2_yc[ 6 * 256] = bucket2_cy[ 6]; bucket2_cy[ 6] = tmp06; + sa_uint_t tmp07 = bucket2_yc[ 7 * 256]; bucket2_yc[ 7 * 256] = bucket2_cy[ 7]; bucket2_cy[ 7] = tmp07; + sa_uint_t tmp08 = bucket2_yc[ 8 * 256]; bucket2_yc[ 8 * 256] = bucket2_cy[ 8]; bucket2_cy[ 8] = tmp08; + sa_uint_t tmp09 = bucket2_yc[ 9 * 256]; bucket2_yc[ 9 * 256] = bucket2_cy[ 9]; bucket2_cy[ 9] = tmp09; + sa_uint_t tmp10 = bucket2_yc[10 * 256]; bucket2_yc[10 * 256] = bucket2_cy[10]; bucket2_cy[10] = tmp10; + sa_uint_t tmp11 = bucket2_yc[11 * 256]; bucket2_yc[11 * 256] = 
bucket2_cy[11]; bucket2_cy[11] = tmp11; + sa_uint_t tmp12 = bucket2_yc[12 * 256]; bucket2_yc[12 * 256] = bucket2_cy[12]; bucket2_cy[12] = tmp12; + sa_uint_t tmp13 = bucket2_yc[13 * 256]; bucket2_yc[13 * 256] = bucket2_cy[13]; bucket2_cy[13] = tmp13; + sa_uint_t tmp14 = bucket2_yc[14 * 256]; bucket2_yc[14 * 256] = bucket2_cy[14]; bucket2_cy[14] = tmp14; + sa_uint_t tmp15 = bucket2_yc[15 * 256]; bucket2_yc[15 * 256] = bucket2_cy[15]; bucket2_cy[15] = tmp15; + } + } + } +} + +static void libsais_unbwt_compute_bigram_histogram_single(const uint8_t * RESTRICT T, sa_uint_t * RESTRICT bucket1, sa_uint_t * RESTRICT bucket2, fast_uint_t index) +{ + fast_uint_t sum, c; + for (sum = 1, c = 0; c < ALPHABET_SIZE; ++c) + { + fast_uint_t prev = sum; sum += bucket1[c]; bucket1[c] = (sa_uint_t)prev; + if (prev != sum) + { + sa_uint_t * RESTRICT bucket2_p = &bucket2[c << 8]; + + { + fast_uint_t hi = index; if (sum < hi) { hi = sum; } + libsais_unbwt_compute_histogram(&T[prev], (fast_sint_t)(hi - prev), bucket2_p); + } + + { + fast_uint_t lo = index + 1; if (prev > lo) { lo = prev; } + libsais_unbwt_compute_histogram(&T[lo - 1], (fast_sint_t)(sum - lo), bucket2_p); + } + } + } + + libsais_unbwt_transpose_bucket2(bucket2); +} + +static void libsais_unbwt_calculate_fastbits(sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t lastc, fast_uint_t shift) +{ + fast_uint_t v, w, sum, c, d; + for (v = 0, w = 0, sum = 1, c = 0; c < ALPHABET_SIZE; ++c) + { + if (c == lastc) { sum += 1; } + + for (d = 0; d < ALPHABET_SIZE; ++d, ++w) + { + fast_uint_t prev = sum; sum += bucket2[w]; bucket2[w] = (sa_uint_t)prev; + if (prev != sum) + { + for (; v <= ((sum - 1) >> shift); ++v) { fastbits[v] = (uint16_t)w; } + } + } + } +} + +static void libsais_unbwt_calculate_biPSI(const uint8_t * RESTRICT T, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket1, sa_uint_t * RESTRICT bucket2, fast_uint_t index, fast_sint_t omp_block_start, fast_sint_t omp_block_end) +{ + { + fast_sint_t i = omp_block_start, j = (fast_sint_t)index; if (omp_block_end < j) { j = omp_block_end; } + for (; i < j; ++i) + { + fast_uint_t c = T[i]; + fast_uint_t p = bucket1[c]++; + fast_sint_t t = (fast_sint_t)(index - p); + + if (t != 0) + { + fast_uint_t w = (((fast_uint_t)T[p + (fast_uint_t)(t >> ((sizeof(fast_sint_t) * 8) - 1))]) << 8) + c; + P[bucket2[w]++] = (sa_uint_t)i; + } + } + } + + { + fast_sint_t i = (fast_sint_t)index, j = omp_block_end; if (omp_block_start > i) { i = omp_block_start; } + for (i += 1; i <= j; ++i) + { + fast_uint_t c = T[i - 1]; + fast_uint_t p = bucket1[c]++; + fast_sint_t t = (fast_sint_t)(index - p); + + if (t != 0) + { + fast_uint_t w = (((fast_uint_t)T[p + (fast_uint_t)(t >> ((sizeof(fast_sint_t) * 8) - 1))]) << 8) + c; + P[bucket2[w]++] = (sa_uint_t)i; + } + } + } +} + +static void libsais_unbwt_init_single(const uint8_t * RESTRICT T, sa_uint_t * RESTRICT P, sa_sint_t n, const sa_sint_t * freq, const sa_uint_t * RESTRICT I, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits) +{ + sa_uint_t bucket1[ALPHABET_SIZE]; + + fast_uint_t index = I[0]; + fast_uint_t lastc = T[0]; + fast_uint_t shift = 0; while ((n >> shift) > (1 << UNBWT_FASTBITS)) { shift++; } + + if (freq != NULL) + { + memcpy(bucket1, freq, ALPHABET_SIZE * sizeof(sa_uint_t)); + } + else + { + memset(bucket1, 0, ALPHABET_SIZE * sizeof(sa_uint_t)); + libsais_unbwt_compute_histogram(T, n, bucket1); + } + + memset(bucket2, 0, ALPHABET_SIZE * ALPHABET_SIZE * sizeof(sa_uint_t)); + libsais_unbwt_compute_bigram_histogram_single(T, bucket1, bucket2, 
index); + + libsais_unbwt_calculate_fastbits(bucket2, fastbits, lastc, shift); + libsais_unbwt_calculate_biPSI(T, P, bucket1, bucket2, index, 0, n); +} + +#if defined(_OPENMP) + +static void libsais_unbwt_compute_bigram_histogram_parallel(const uint8_t * RESTRICT T, fast_uint_t index, sa_uint_t * RESTRICT bucket1, sa_uint_t * RESTRICT bucket2, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + fast_sint_t i; + for (i = omp_block_start; i < omp_block_start + omp_block_size; ++i) + { + fast_uint_t c = T[i]; + fast_uint_t p = bucket1[c]++; + fast_sint_t t = (fast_sint_t)(index - p); + + if (t != 0) + { + fast_uint_t w = (((fast_uint_t)T[p + (fast_uint_t)(t >> ((sizeof(fast_sint_t) * 8) - 1))]) << 8) + c; + bucket2[w]++; + } + } +} + +static void libsais_unbwt_init_parallel(const uint8_t * RESTRICT T, sa_uint_t * RESTRICT P, sa_sint_t n, const sa_sint_t * freq, const sa_uint_t * RESTRICT I, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, sa_uint_t * RESTRICT buckets, sa_sint_t threads) +{ + sa_uint_t bucket1[ALPHABET_SIZE]; + + fast_uint_t index = I[0]; + fast_uint_t lastc = T[0]; + fast_uint_t shift = 0; while ((n >> shift) > (1 << UNBWT_FASTBITS)) { shift++; } + + memset(bucket1, 0, ALPHABET_SIZE * sizeof(sa_uint_t)); + memset(bucket2, 0, ALPHABET_SIZE * ALPHABET_SIZE * sizeof(sa_uint_t)); + + #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536) + { + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); + + if (omp_num_threads == 1) + { + libsais_unbwt_init_single(T, P, n, freq, I, bucket2, fastbits); + } + else + { + sa_uint_t * RESTRICT bucket1_local = buckets + omp_thread_num * (ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE)); + sa_uint_t * RESTRICT bucket2_local = bucket1_local + ALPHABET_SIZE; + + fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start; + + { + memset(bucket1_local, 0, ALPHABET_SIZE * sizeof(sa_uint_t)); + libsais_unbwt_compute_histogram(T + omp_block_start, omp_block_size, bucket1_local); + } + + #pragma omp barrier + + #pragma omp master + { + { + sa_uint_t * RESTRICT bucket1_temp = buckets; + + fast_sint_t t; + for (t = 0; t < omp_num_threads; ++t, bucket1_temp += ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE)) + { + fast_sint_t c; for (c = 0; c < ALPHABET_SIZE; c += 1) { sa_uint_t A = bucket1[c], B = bucket1_temp[c]; bucket1[c] = A + B; bucket1_temp[c] = A; } + } + } + + { + fast_uint_t sum, c; + for (sum = 1, c = 0; c < ALPHABET_SIZE; ++c) { fast_uint_t prev = sum; sum += bucket1[c]; bucket1[c] = (sa_uint_t)prev; } + } + } + + #pragma omp barrier + + { + fast_sint_t c; for (c = 0; c < ALPHABET_SIZE; c += 1) { sa_uint_t A = bucket1[c], B = bucket1_local[c]; bucket1_local[c] = A + B; } + + memset(bucket2_local, 0, ALPHABET_SIZE * ALPHABET_SIZE * sizeof(sa_uint_t)); + libsais_unbwt_compute_bigram_histogram_parallel(T, index, bucket1_local, bucket2_local, omp_block_start, omp_block_size); + } + + #pragma omp barrier + + { + fast_sint_t omp_bucket2_stride = ((ALPHABET_SIZE * ALPHABET_SIZE) / omp_num_threads) & (-16); + fast_sint_t omp_bucket2_start = omp_thread_num * omp_bucket2_stride; + fast_sint_t omp_bucket2_size = omp_thread_num < omp_num_threads - 1 ? 
omp_bucket2_stride : (ALPHABET_SIZE * ALPHABET_SIZE) - omp_bucket2_start; + + sa_uint_t * RESTRICT bucket2_temp = buckets + ALPHABET_SIZE; + + fast_sint_t t; + for (t = 0; t < omp_num_threads; ++t, bucket2_temp += ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE)) + { + fast_sint_t c; for (c = omp_bucket2_start; c < omp_bucket2_start + omp_bucket2_size; c += 1) { sa_uint_t A = bucket2[c], B = bucket2_temp[c]; bucket2[c] = A + B; bucket2_temp[c] = A; } + } + } + + #pragma omp barrier + + #pragma omp master + { + + libsais_unbwt_calculate_fastbits(bucket2, fastbits, lastc, shift); + + { + fast_sint_t t; + for (t = omp_num_threads - 1; t >= 1; --t) + { + sa_uint_t * RESTRICT dst_bucket1 = buckets + t * (ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE)); + sa_uint_t * RESTRICT src_bucket1 = dst_bucket1 - (ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE)); + + memcpy(dst_bucket1, src_bucket1, ALPHABET_SIZE * sizeof(sa_uint_t)); + } + + memcpy(buckets, bucket1, ALPHABET_SIZE * sizeof(sa_uint_t)); + } + } + + #pragma omp barrier + + { + fast_sint_t c; for (c = 0; c < ALPHABET_SIZE * ALPHABET_SIZE; c += 1) { sa_uint_t A = bucket2[c], B = bucket2_local[c]; bucket2_local[c] = A + B; } + + libsais_unbwt_calculate_biPSI(T, P, bucket1_local, bucket2_local, index, omp_block_start, omp_block_start + omp_block_size); + } + + #pragma omp barrier + + #pragma omp master + { + memcpy(bucket2, buckets + ALPHABET_SIZE + (omp_num_threads - 1) * (ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE)), ALPHABET_SIZE * ALPHABET_SIZE * sizeof(sa_uint_t)); + } + } + } +} + +#endif + +static void libsais_unbwt_decode_1(uint8_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t * i0, fast_uint_t k) +{ + uint16_t * RESTRICT U0 = (uint16_t *)(void *)U; + + fast_uint_t i, p0 = *i0; + + for (i = 0; i != k; ++i) + { + uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while (bucket2[c0] <= p0); } p0 = P[p0]; U0[i] = libsais_bswap16(c0); + } + + *i0 = p0; +} + +static void libsais_unbwt_decode_2(uint8_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0, fast_uint_t * i1, fast_uint_t k) +{ + uint16_t * RESTRICT U0 = (uint16_t *)(void *)U; + uint16_t * RESTRICT U1 = (uint16_t *)(void *)(((uint8_t *)U0) + r); + + fast_uint_t i, p0 = *i0, p1 = *i1; + + for (i = 0; i != k; ++i) + { + uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while (bucket2[c0] <= p0); } p0 = P[p0]; U0[i] = libsais_bswap16(c0); + uint16_t c1 = fastbits[p1 >> shift]; if (bucket2[c1] <= p1) { do { c1++; } while (bucket2[c1] <= p1); } p1 = P[p1]; U1[i] = libsais_bswap16(c1); + } + + *i0 = p0; *i1 = p1; +} + +static void libsais_unbwt_decode_3(uint8_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0, fast_uint_t * i1, fast_uint_t * i2, fast_uint_t k) +{ + uint16_t * RESTRICT U0 = (uint16_t *)(void *)U; + uint16_t * RESTRICT U1 = (uint16_t *)(void *)(((uint8_t *)U0) + r); + uint16_t * RESTRICT U2 = (uint16_t *)(void *)(((uint8_t *)U1) + r); + + fast_uint_t i, p0 = *i0, p1 = *i1, p2 = *i2; + + for (i = 0; i != k; ++i) + { + uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while (bucket2[c0] <= p0); } p0 = P[p0]; U0[i] = libsais_bswap16(c0); + uint16_t c1 = fastbits[p1 >> shift]; if (bucket2[c1] <= p1) { do { c1++; } while 
(bucket2[c1] <= p1); } p1 = P[p1]; U1[i] = libsais_bswap16(c1); + uint16_t c2 = fastbits[p2 >> shift]; if (bucket2[c2] <= p2) { do { c2++; } while (bucket2[c2] <= p2); } p2 = P[p2]; U2[i] = libsais_bswap16(c2); + } + + *i0 = p0; *i1 = p1; *i2 = p2; +} + +static void libsais_unbwt_decode_4(uint8_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0, fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3, fast_uint_t k) +{ + uint16_t * RESTRICT U0 = (uint16_t *)(void *)U; + uint16_t * RESTRICT U1 = (uint16_t *)(void *)(((uint8_t *)U0) + r); + uint16_t * RESTRICT U2 = (uint16_t *)(void *)(((uint8_t *)U1) + r); + uint16_t * RESTRICT U3 = (uint16_t *)(void *)(((uint8_t *)U2) + r); + + fast_uint_t i, p0 = *i0, p1 = *i1, p2 = *i2, p3 = *i3; + + for (i = 0; i != k; ++i) + { + uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while (bucket2[c0] <= p0); } p0 = P[p0]; U0[i] = libsais_bswap16(c0); + uint16_t c1 = fastbits[p1 >> shift]; if (bucket2[c1] <= p1) { do { c1++; } while (bucket2[c1] <= p1); } p1 = P[p1]; U1[i] = libsais_bswap16(c1); + uint16_t c2 = fastbits[p2 >> shift]; if (bucket2[c2] <= p2) { do { c2++; } while (bucket2[c2] <= p2); } p2 = P[p2]; U2[i] = libsais_bswap16(c2); + uint16_t c3 = fastbits[p3 >> shift]; if (bucket2[c3] <= p3) { do { c3++; } while (bucket2[c3] <= p3); } p3 = P[p3]; U3[i] = libsais_bswap16(c3); + } + + *i0 = p0; *i1 = p1; *i2 = p2; *i3 = p3; +} + +static void libsais_unbwt_decode_5(uint8_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0, fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3, fast_uint_t * i4, fast_uint_t k) +{ + uint16_t * RESTRICT U0 = (uint16_t *)(void *)U; + uint16_t * RESTRICT U1 = (uint16_t *)(void *)(((uint8_t *)U0) + r); + uint16_t * RESTRICT U2 = (uint16_t *)(void *)(((uint8_t *)U1) + r); + uint16_t * RESTRICT U3 = (uint16_t *)(void *)(((uint8_t *)U2) + r); + uint16_t * RESTRICT U4 = (uint16_t *)(void *)(((uint8_t *)U3) + r); + + fast_uint_t i, p0 = *i0, p1 = *i1, p2 = *i2, p3 = *i3, p4 = *i4; + + for (i = 0; i != k; ++i) + { + uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while (bucket2[c0] <= p0); } p0 = P[p0]; U0[i] = libsais_bswap16(c0); + uint16_t c1 = fastbits[p1 >> shift]; if (bucket2[c1] <= p1) { do { c1++; } while (bucket2[c1] <= p1); } p1 = P[p1]; U1[i] = libsais_bswap16(c1); + uint16_t c2 = fastbits[p2 >> shift]; if (bucket2[c2] <= p2) { do { c2++; } while (bucket2[c2] <= p2); } p2 = P[p2]; U2[i] = libsais_bswap16(c2); + uint16_t c3 = fastbits[p3 >> shift]; if (bucket2[c3] <= p3) { do { c3++; } while (bucket2[c3] <= p3); } p3 = P[p3]; U3[i] = libsais_bswap16(c3); + uint16_t c4 = fastbits[p4 >> shift]; if (bucket2[c4] <= p4) { do { c4++; } while (bucket2[c4] <= p4); } p4 = P[p4]; U4[i] = libsais_bswap16(c4); + } + + *i0 = p0; *i1 = p1; *i2 = p2; *i3 = p3; *i4 = p4; +} + +static void libsais_unbwt_decode_6(uint8_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0, fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3, fast_uint_t * i4, fast_uint_t * i5, fast_uint_t k) +{ + uint16_t * RESTRICT U0 = (uint16_t *)(void *)U; + uint16_t * RESTRICT U1 = (uint16_t *)(void *)(((uint8_t *)U0) + r); + uint16_t * RESTRICT U2 = (uint16_t *)(void *)(((uint8_t *)U1) + r); + uint16_t * RESTRICT U3 = 
(uint16_t *)(void *)(((uint8_t *)U2) + r); + uint16_t * RESTRICT U4 = (uint16_t *)(void *)(((uint8_t *)U3) + r); + uint16_t * RESTRICT U5 = (uint16_t *)(void *)(((uint8_t *)U4) + r); + + fast_uint_t i, p0 = *i0, p1 = *i1, p2 = *i2, p3 = *i3, p4 = *i4, p5 = *i5; + + for (i = 0; i != k; ++i) + { + uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while (bucket2[c0] <= p0); } p0 = P[p0]; U0[i] = libsais_bswap16(c0); + uint16_t c1 = fastbits[p1 >> shift]; if (bucket2[c1] <= p1) { do { c1++; } while (bucket2[c1] <= p1); } p1 = P[p1]; U1[i] = libsais_bswap16(c1); + uint16_t c2 = fastbits[p2 >> shift]; if (bucket2[c2] <= p2) { do { c2++; } while (bucket2[c2] <= p2); } p2 = P[p2]; U2[i] = libsais_bswap16(c2); + uint16_t c3 = fastbits[p3 >> shift]; if (bucket2[c3] <= p3) { do { c3++; } while (bucket2[c3] <= p3); } p3 = P[p3]; U3[i] = libsais_bswap16(c3); + uint16_t c4 = fastbits[p4 >> shift]; if (bucket2[c4] <= p4) { do { c4++; } while (bucket2[c4] <= p4); } p4 = P[p4]; U4[i] = libsais_bswap16(c4); + uint16_t c5 = fastbits[p5 >> shift]; if (bucket2[c5] <= p5) { do { c5++; } while (bucket2[c5] <= p5); } p5 = P[p5]; U5[i] = libsais_bswap16(c5); + } + + *i0 = p0; *i1 = p1; *i2 = p2; *i3 = p3; *i4 = p4; *i5 = p5; +} + +static void libsais_unbwt_decode_7(uint8_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0, fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3, fast_uint_t * i4, fast_uint_t * i5, fast_uint_t * i6, fast_uint_t k) +{ + uint16_t * RESTRICT U0 = (uint16_t *)(void *)U; + uint16_t * RESTRICT U1 = (uint16_t *)(void *)(((uint8_t *)U0) + r); + uint16_t * RESTRICT U2 = (uint16_t *)(void *)(((uint8_t *)U1) + r); + uint16_t * RESTRICT U3 = (uint16_t *)(void *)(((uint8_t *)U2) + r); + uint16_t * RESTRICT U4 = (uint16_t *)(void *)(((uint8_t *)U3) + r); + uint16_t * RESTRICT U5 = (uint16_t *)(void *)(((uint8_t *)U4) + r); + uint16_t * RESTRICT U6 = (uint16_t *)(void *)(((uint8_t *)U5) + r); + + fast_uint_t i, p0 = *i0, p1 = *i1, p2 = *i2, p3 = *i3, p4 = *i4, p5 = *i5, p6 = *i6; + + for (i = 0; i != k; ++i) + { + uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while (bucket2[c0] <= p0); } p0 = P[p0]; U0[i] = libsais_bswap16(c0); + uint16_t c1 = fastbits[p1 >> shift]; if (bucket2[c1] <= p1) { do { c1++; } while (bucket2[c1] <= p1); } p1 = P[p1]; U1[i] = libsais_bswap16(c1); + uint16_t c2 = fastbits[p2 >> shift]; if (bucket2[c2] <= p2) { do { c2++; } while (bucket2[c2] <= p2); } p2 = P[p2]; U2[i] = libsais_bswap16(c2); + uint16_t c3 = fastbits[p3 >> shift]; if (bucket2[c3] <= p3) { do { c3++; } while (bucket2[c3] <= p3); } p3 = P[p3]; U3[i] = libsais_bswap16(c3); + uint16_t c4 = fastbits[p4 >> shift]; if (bucket2[c4] <= p4) { do { c4++; } while (bucket2[c4] <= p4); } p4 = P[p4]; U4[i] = libsais_bswap16(c4); + uint16_t c5 = fastbits[p5 >> shift]; if (bucket2[c5] <= p5) { do { c5++; } while (bucket2[c5] <= p5); } p5 = P[p5]; U5[i] = libsais_bswap16(c5); + uint16_t c6 = fastbits[p6 >> shift]; if (bucket2[c6] <= p6) { do { c6++; } while (bucket2[c6] <= p6); } p6 = P[p6]; U6[i] = libsais_bswap16(c6); + } + + *i0 = p0; *i1 = p1; *i2 = p2; *i3 = p3; *i4 = p4; *i5 = p5; *i6 = p6; +} + +static void libsais_unbwt_decode_8(uint8_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0, fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3, fast_uint_t * i4, fast_uint_t * 
i5, fast_uint_t * i6, fast_uint_t * i7, fast_uint_t k) +{ + uint16_t * RESTRICT U0 = (uint16_t *)(void *)U; + uint16_t * RESTRICT U1 = (uint16_t *)(void *)(((uint8_t *)U0) + r); + uint16_t * RESTRICT U2 = (uint16_t *)(void *)(((uint8_t *)U1) + r); + uint16_t * RESTRICT U3 = (uint16_t *)(void *)(((uint8_t *)U2) + r); + uint16_t * RESTRICT U4 = (uint16_t *)(void *)(((uint8_t *)U3) + r); + uint16_t * RESTRICT U5 = (uint16_t *)(void *)(((uint8_t *)U4) + r); + uint16_t * RESTRICT U6 = (uint16_t *)(void *)(((uint8_t *)U5) + r); + uint16_t * RESTRICT U7 = (uint16_t *)(void *)(((uint8_t *)U6) + r); + + fast_uint_t i, p0 = *i0, p1 = *i1, p2 = *i2, p3 = *i3, p4 = *i4, p5 = *i5, p6 = *i6, p7 = *i7; + + for (i = 0; i != k; ++i) + { + uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while (bucket2[c0] <= p0); } p0 = P[p0]; U0[i] = libsais_bswap16(c0); + uint16_t c1 = fastbits[p1 >> shift]; if (bucket2[c1] <= p1) { do { c1++; } while (bucket2[c1] <= p1); } p1 = P[p1]; U1[i] = libsais_bswap16(c1); + uint16_t c2 = fastbits[p2 >> shift]; if (bucket2[c2] <= p2) { do { c2++; } while (bucket2[c2] <= p2); } p2 = P[p2]; U2[i] = libsais_bswap16(c2); + uint16_t c3 = fastbits[p3 >> shift]; if (bucket2[c3] <= p3) { do { c3++; } while (bucket2[c3] <= p3); } p3 = P[p3]; U3[i] = libsais_bswap16(c3); + uint16_t c4 = fastbits[p4 >> shift]; if (bucket2[c4] <= p4) { do { c4++; } while (bucket2[c4] <= p4); } p4 = P[p4]; U4[i] = libsais_bswap16(c4); + uint16_t c5 = fastbits[p5 >> shift]; if (bucket2[c5] <= p5) { do { c5++; } while (bucket2[c5] <= p5); } p5 = P[p5]; U5[i] = libsais_bswap16(c5); + uint16_t c6 = fastbits[p6 >> shift]; if (bucket2[c6] <= p6) { do { c6++; } while (bucket2[c6] <= p6); } p6 = P[p6]; U6[i] = libsais_bswap16(c6); + uint16_t c7 = fastbits[p7 >> shift]; if (bucket2[c7] <= p7) { do { c7++; } while (bucket2[c7] <= p7); } p7 = P[p7]; U7[i] = libsais_bswap16(c7); + } + + *i0 = p0; *i1 = p1; *i2 = p2; *i3 = p3; *i4 = p4; *i5 = p5; *i6 = p6; *i7 = p7; +} + +static void libsais_unbwt_decode(uint8_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_sint_t n, sa_sint_t r, const sa_uint_t * RESTRICT I, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_sint_t blocks, fast_uint_t reminder) +{ + fast_uint_t shift = 0; while ((n >> shift) > (1 << UNBWT_FASTBITS)) { shift++; } + fast_uint_t offset = 0; + + while (blocks > 8) + { + fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3], i4 = I[4], i5 = I[5], i6 = I[6], i7 = I[7]; + libsais_unbwt_decode_8(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5, &i6, &i7, (fast_uint_t)r >> 1); + I += 8; blocks -= 8; offset += 8 * (fast_uint_t)r; + } + + if (blocks == 1) + { + fast_uint_t i0 = I[0]; + libsais_unbwt_decode_1(U + offset, P, bucket2, fastbits, shift, &i0, reminder >> 1); + } + else if (blocks == 2) + { + fast_uint_t i0 = I[0], i1 = I[1]; + libsais_unbwt_decode_2(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, reminder >> 1); + libsais_unbwt_decode_1(U + offset + 2 * (reminder >> 1), P, bucket2, fastbits, shift, &i0, ((fast_uint_t)r >> 1) - (reminder >> 1)); + } + else if (blocks == 3) + { + fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2]; + libsais_unbwt_decode_3(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, reminder >> 1); + libsais_unbwt_decode_2(U + offset + 2 * (reminder >> 1), P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, ((fast_uint_t)r >> 1) - (reminder >> 1)); + } + else if (blocks == 4) + { + fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], 
i3 = I[3]; + libsais_unbwt_decode_4(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, reminder >> 1); + libsais_unbwt_decode_3(U + offset + 2 * (reminder >> 1), P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, ((fast_uint_t)r >> 1) - (reminder >> 1)); + } + else if (blocks == 5) + { + fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3], i4 = I[4]; + libsais_unbwt_decode_5(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, reminder >> 1); + libsais_unbwt_decode_4(U + offset + 2 * (reminder >> 1), P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, ((fast_uint_t)r >> 1) - (reminder >> 1)); + } + else if (blocks == 6) + { + fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3], i4 = I[4], i5 = I[5]; + libsais_unbwt_decode_6(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5, reminder >> 1); + libsais_unbwt_decode_5(U + offset + 2 * (reminder >> 1), P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, ((fast_uint_t)r >> 1) - (reminder >> 1)); + } + else if (blocks == 7) + { + fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3], i4 = I[4], i5 = I[5], i6 = I[6]; + libsais_unbwt_decode_7(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5, &i6, reminder >> 1); + libsais_unbwt_decode_6(U + offset + 2 * (reminder >> 1), P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5, ((fast_uint_t)r >> 1) - (reminder >> 1)); + } + else + { + fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3], i4 = I[4], i5 = I[5], i6 = I[6], i7 = I[7]; + libsais_unbwt_decode_8(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5, &i6, &i7, reminder >> 1); + libsais_unbwt_decode_7(U + offset + 2 * (reminder >> 1), P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5, &i6, ((fast_uint_t)r >> 1) - (reminder >> 1)); + } +} + +static void libsais_unbwt_decode_omp(const uint8_t * RESTRICT T, uint8_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_sint_t n, sa_sint_t r, const sa_uint_t * RESTRICT I, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, sa_sint_t threads) +{ + fast_uint_t lastc = T[0]; + fast_sint_t blocks = 1 + (((fast_sint_t)n - 1) / (fast_sint_t)r); + fast_uint_t reminder = (fast_uint_t)n - ((fast_uint_t)r * ((fast_uint_t)blocks - 1)); + +#if defined(_OPENMP) + fast_sint_t max_threads = blocks < threads ? blocks : threads; + #pragma omp parallel num_threads(max_threads) if(max_threads > 1 && n >= 65536) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + + fast_sint_t omp_block_stride = blocks / omp_num_threads; + fast_sint_t omp_block_reminder = blocks % omp_num_threads; + fast_sint_t omp_block_size = omp_block_stride + (omp_thread_num < omp_block_reminder); + fast_sint_t omp_block_start = omp_block_stride * omp_thread_num + (omp_thread_num < omp_block_reminder ? omp_thread_num : omp_block_reminder); + + libsais_unbwt_decode(U + r * omp_block_start, P, n, r, I + omp_block_start, bucket2, fastbits, omp_block_size, omp_thread_num < omp_num_threads - 1 ? 
(fast_uint_t)r : reminder); + } + + U[n - 1] = (uint8_t)lastc; +} + +static sa_sint_t libsais_unbwt_core(const uint8_t * RESTRICT T, uint8_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_sint_t n, const sa_sint_t * freq, sa_sint_t r, const sa_uint_t * RESTRICT I, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, sa_uint_t * RESTRICT buckets, sa_sint_t threads) +{ +#if defined(_OPENMP) + if (threads > 1 && n >= 262144) + { + libsais_unbwt_init_parallel(T, P, n, freq, I, bucket2, fastbits, buckets, threads); + } + else +#else + UNUSED(buckets); +#endif + { + libsais_unbwt_init_single(T, P, n, freq, I, bucket2, fastbits); + } + + libsais_unbwt_decode_omp(T, U, P, n, r, I, bucket2, fastbits, threads); + return 0; +} + +static sa_sint_t libsais_unbwt_main(const uint8_t * T, uint8_t * U, sa_uint_t * P, sa_sint_t n, const sa_sint_t * freq, sa_sint_t r, const sa_uint_t * I, sa_sint_t threads) +{ + fast_uint_t shift = 0; while ((n >> shift) > (1 << UNBWT_FASTBITS)) { shift++; } + + sa_uint_t * RESTRICT bucket2 = (sa_uint_t *)libsais_alloc_aligned(ALPHABET_SIZE * ALPHABET_SIZE * sizeof(sa_uint_t), 4096); + uint16_t * RESTRICT fastbits = (uint16_t *)libsais_alloc_aligned(((size_t)1 + (size_t)(n >> shift)) * sizeof(uint16_t), 4096); + sa_uint_t * RESTRICT buckets = threads > 1 && n >= 262144 ? (sa_uint_t *)libsais_alloc_aligned((size_t)threads * (ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE)) * sizeof(sa_uint_t), 4096) : NULL; + + sa_sint_t index = bucket2 != NULL && fastbits != NULL && (buckets != NULL || threads == 1 || n < 262144) + ? libsais_unbwt_core(T, U, P, n, freq, r, I, bucket2, fastbits, buckets, threads) + : -2; + + libsais_free_aligned(buckets); + libsais_free_aligned(fastbits); + libsais_free_aligned(bucket2); + + return index; +} + +static sa_sint_t libsais_unbwt_main_ctx(const LIBSAIS_UNBWT_CONTEXT * ctx, const uint8_t * T, uint8_t * U, sa_uint_t * P, sa_sint_t n, const sa_sint_t * freq, sa_sint_t r, const sa_uint_t * I) +{ + return ctx != NULL && ctx->bucket2 != NULL && ctx->fastbits != NULL && (ctx->buckets != NULL || ctx->threads == 1) + ? 
libsais_unbwt_core(T, U, P, n, freq, r, I, ctx->bucket2, ctx->fastbits, ctx->buckets, (sa_sint_t)ctx->threads) + : -2; +} + +void * libsais_unbwt_create_ctx(void) +{ + return (void *)libsais_unbwt_create_ctx_main(1); +} + +void libsais_unbwt_free_ctx(void * ctx) +{ + libsais_unbwt_free_ctx_main((LIBSAIS_UNBWT_CONTEXT *)ctx); +} + +int32_t libsais_unbwt(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t i) +{ + return libsais_unbwt_aux(T, U, A, n, freq, n, &i); +} + +int32_t libsais_unbwt_ctx(const void * ctx, const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t i) +{ + return libsais_unbwt_aux_ctx(ctx, T, U, A, n, freq, n, &i); +} + +int32_t libsais_unbwt_aux(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t r, const int32_t * I) +{ + if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || ((r != n) && ((r < 2) || ((r & (r - 1)) != 0))) || (I == NULL)) + { + return -1; + } + else if (n <= 1) + { + if (I[0] != n) { return -1; } + if (n == 1) { U[0] = T[0]; } + return 0; + } + + fast_sint_t t; for (t = 0; t <= (n - 1) / r; ++t) { if (I[t] <= 0 || I[t] > n) { return -1; } } + + return libsais_unbwt_main(T, U, (sa_uint_t *)A, n, freq, r, (const sa_uint_t *)I, 1); +} + +int32_t libsais_unbwt_aux_ctx(const void * ctx, const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t r, const int32_t * I) +{ + if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || ((r != n) && ((r < 2) || ((r & (r - 1)) != 0))) || (I == NULL)) + { + return -1; + } + else if (n <= 1) + { + if (I[0] != n) { return -1; } + if (n == 1) { U[0] = T[0]; } + return 0; + } + + fast_sint_t t; for (t = 0; t <= (n - 1) / r; ++t) { if (I[t] <= 0 || I[t] > n) { return -1; } } + + return libsais_unbwt_main_ctx((const LIBSAIS_UNBWT_CONTEXT *)ctx, T, U, (sa_uint_t *)A, n, freq, r, (const sa_uint_t *)I); +} + +#if defined(_OPENMP) + +void * libsais_unbwt_create_ctx_omp(int32_t threads) +{ + if (threads < 0) { return NULL; } + + threads = threads > 0 ? threads : omp_get_max_threads(); + return (void *)libsais_unbwt_create_ctx_main(threads); +} + +int32_t libsais_unbwt_omp(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t i, int32_t threads) +{ + return libsais_unbwt_aux_omp(T, U, A, n, freq, n, &i, threads); +} + +int32_t libsais_unbwt_aux_omp(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t r, const int32_t * I, int32_t threads) +{ + if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || ((r != n) && ((r < 2) || ((r & (r - 1)) != 0))) || (I == NULL) || (threads < 0)) + { + return -1; + } + else if (n <= 1) + { + if (I[0] != n) { return -1; } + if (n == 1) { U[0] = T[0]; } + return 0; + } + + fast_sint_t t; for (t = 0; t <= (n - 1) / r; ++t) { if (I[t] <= 0 || I[t] > n) { return -1; } } + + threads = threads > 0 ? threads : omp_get_max_threads(); + return libsais_unbwt_main(T, U, (sa_uint_t *)A, n, freq, r, (const sa_uint_t *)I, threads); +} + +#endif diff --git a/libsais/libsais.h b/libsais/libsais.h new file mode 100644 index 0000000..c655d67 --- /dev/null +++ b/libsais/libsais.h @@ -0,0 +1,285 @@ +/*-- + +This file is a part of libsais, a library for linear time +suffix array and burrows wheeler transform construction. + + Copyright (c) 2021 Ilya Grebnov + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+
+Please see the file LICENSE for full copyright information.
+
+--*/
+
+#ifndef LIBSAIS_H
+#define LIBSAIS_H 1
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+    #include <stdint.h>
+
+    /**
+    * Creates the libsais context that allows reusing allocated memory with each libsais operation.
+    * In multi-threaded environments, use one context per thread for parallel executions.
+    * @return the libsais context, NULL otherwise.
+    */
+    void * libsais_create_ctx(void);
+
+#if defined(_OPENMP)
+    /**
+    * Creates the libsais context that allows reusing allocated memory with each parallel libsais operation using OpenMP.
+    * In multi-threaded environments, use one context per thread for parallel executions.
+    * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default).
+    * @return the libsais context, NULL otherwise.
+    */
+    void * libsais_create_ctx_omp(int32_t threads);
+#endif
+
+    /**
+    * Destroys the libsais context and frees previously allocated memory.
+    * @param ctx The libsais context (can be NULL).
+    */
+    void libsais_free_ctx(void * ctx);
+
+    /**
+    * Constructs the suffix array of a given string.
+    * @param T [0..n-1] The input string.
+    * @param SA [0..n-1+fs] The output array of suffixes.
+    * @param n The length of the given string.
+    * @param fs The extra space available at the end of SA array (can be 0).
+    * @param freq [0..255] The output symbol frequency table (can be NULL).
+    * @return 0 if no error occurred, -1 or -2 otherwise.
+    */
+    int32_t libsais(const uint8_t * T, int32_t * SA, int32_t n, int32_t fs, int32_t * freq);
+
+    /**
+    * Constructs the suffix array of a given string using libsais context.
+    * @param ctx The libsais context.
+    * @param T [0..n-1] The input string.
+    * @param SA [0..n-1+fs] The output array of suffixes.
+    * @param n The length of the given string.
+    * @param fs The extra space available at the end of SA array (can be 0).
+    * @param freq [0..255] The output symbol frequency table (can be NULL).
+    * @return 0 if no error occurred, -1 or -2 otherwise.
+    */
+    int32_t libsais_ctx(const void * ctx, const uint8_t * T, int32_t * SA, int32_t n, int32_t fs, int32_t * freq);
+
+#if defined(_OPENMP)
+    /**
+    * Constructs the suffix array of a given string in parallel using OpenMP.
+    * @param T [0..n-1] The input string.
+    * @param SA [0..n-1+fs] The output array of suffixes.
+    * @param n The length of the given string.
+    * @param fs The extra space available at the end of SA array (can be 0).
+    * @param freq [0..255] The output symbol frequency table (can be NULL).
+    * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default).
+    * @return 0 if no error occurred, -1 or -2 otherwise.
+    */
+    int32_t libsais_omp(const uint8_t * T, int32_t * SA, int32_t n, int32_t fs, int32_t * freq, int32_t threads);
+#endif
+
+    /**
+    * Constructs the burrows-wheeler transformed string of a given string.
+    * @param T [0..n-1] The input string.
+    * @param U [0..n-1] The output string (can be T).
+    * @param A [0..n-1+fs] The temporary array.
+    * @param n The length of the given string.
+    * @param fs The extra space available at the end of A array (can be 0).
+ * @param freq [0..255] The output symbol frequency table (can be NULL). + * @return The primary index if no error occurred, -1 or -2 otherwise. + */ + int32_t libsais_bwt(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq); + + /** + * Constructs the burrows-wheeler transformed string of a given string with auxiliary indexes. + * @param T [0..n-1] The input string. + * @param U [0..n-1] The output string (can be T). + * @param A [0..n-1+fs] The temporary array. + * @param n The length of the given string. + * @param fs The extra space available at the end of A array (can be 0). + * @param freq [0..255] The output symbol frequency table (can be NULL). + * @param r The sampling rate for auxiliary indexes (must be power of 2). + * @param I [0..(n-1)/r] The output auxiliary indexes. + * @return 0 if no error occurred, -1 or -2 otherwise. + */ + int32_t libsais_bwt_aux(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t r, int32_t * I); + + /** + * Constructs the burrows-wheeler transformed string of a given string using libsais context. + * @param ctx The libsais context. + * @param T [0..n-1] The input string. + * @param U [0..n-1] The output string (can be T). + * @param A [0..n-1+fs] The temporary array. + * @param n The length of the given string. + * @param fs The extra space available at the end of A array (can be 0). + * @param freq [0..255] The output symbol frequency table (can be NULL). + * @return The primary index if no error occurred, -1 or -2 otherwise. + */ + int32_t libsais_bwt_ctx(const void * ctx, const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq); + + /** + * Constructs the burrows-wheeler transformed string of a given string with auxiliary indexes using libsais context. + * @param ctx The libsais context. + * @param T [0..n-1] The input string. + * @param U [0..n-1] The output string (can be T). + * @param A [0..n-1+fs] The temporary array. + * @param n The length of the given string. + * @param fs The extra space available at the end of A array (can be 0). + * @param freq [0..255] The output symbol frequency table (can be NULL). + * @param r The sampling rate for auxiliary indexes (must be power of 2). + * @param I [0..(n-1)/r] The output auxiliary indexes. + * @return 0 if no error occurred, -1 or -2 otherwise. + */ + int32_t libsais_bwt_aux_ctx(const void * ctx, const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t r, int32_t * I); + +#if defined(_OPENMP) + /** + * Constructs the burrows-wheeler transformed string of a given string in parallel using OpenMP. + * @param T [0..n-1] The input string. + * @param U [0..n-1] The output string (can be T). + * @param A [0..n-1+fs] The temporary array. + * @param n The length of the given string. + * @param fs The extra space available at the end of A array (can be 0). + * @param freq [0..255] The output symbol frequency table (can be NULL). + * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). + * @return The primary index if no error occurred, -1 or -2 otherwise. + */ + int32_t libsais_bwt_omp(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t threads); + + /** + * Constructs the burrows-wheeler transformed string of a given string with auxiliary indexes in parallel using OpenMP. + * @param T [0..n-1] The input string. + * @param U [0..n-1] The output string (can be T). + * @param A [0..n-1+fs] The temporary array. 
+    * @param n The length of the given string.
+    * @param fs The extra space available at the end of A array (can be 0).
+    * @param freq [0..255] The output symbol frequency table (can be NULL).
+    * @param r The sampling rate for auxiliary indexes (must be power of 2).
+    * @param I [0..(n-1)/r] The output auxiliary indexes.
+    * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default).
+    * @return 0 if no error occurred, -1 or -2 otherwise.
+    */
+    int32_t libsais_bwt_aux_omp(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t r, int32_t * I, int32_t threads);
+#endif
+
+    /**
+    * Creates the libsais reverse BWT context that allows reusing allocated memory with each libsais_unbwt_* operation.
+    * In multi-threaded environments, use one context per thread for parallel executions.
+    * @return the libsais context, NULL otherwise.
+    */
+    void * libsais_unbwt_create_ctx(void);
+
+#if defined(_OPENMP)
+    /**
+    * Creates the libsais reverse BWT context that allows reusing allocated memory with each parallel libsais_unbwt_* operation using OpenMP.
+    * In multi-threaded environments, use one context per thread for parallel executions.
+    * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default).
+    * @return the libsais context, NULL otherwise.
+    */
+    void * libsais_unbwt_create_ctx_omp(int32_t threads);
+#endif
+
+    /**
+    * Destroys the libsais reverse BWT context and frees previously allocated memory.
+    * @param ctx The libsais context (can be NULL).
+    */
+    void libsais_unbwt_free_ctx(void * ctx);
+
+    /**
+    * Constructs the original string from a given burrows-wheeler transformed string with primary index.
+    * @param T [0..n-1] The input string.
+    * @param U [0..n-1] The output string (can be T).
+    * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1 size).
+    * @param n The length of the given string.
+    * @param freq [0..255] The input symbol frequency table (can be NULL).
+    * @param i The primary index.
+    * @return 0 if no error occurred, -1 or -2 otherwise.
+    */
+    int32_t libsais_unbwt(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t i);
+
+    /**
+    * Constructs the original string from a given burrows-wheeler transformed string with primary index using libsais reverse BWT context.
+    * @param ctx The libsais reverse BWT context.
+    * @param T [0..n-1] The input string.
+    * @param U [0..n-1] The output string (can be T).
+    * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1 size).
+    * @param n The length of the given string.
+    * @param freq [0..255] The input symbol frequency table (can be NULL).
+    * @param i The primary index.
+    * @return 0 if no error occurred, -1 or -2 otherwise.
+    */
+    int32_t libsais_unbwt_ctx(const void * ctx, const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t i);
+
+    /**
+    * Constructs the original string from a given burrows-wheeler transformed string with auxiliary indexes.
+    * @param T [0..n-1] The input string.
+    * @param U [0..n-1] The output string (can be T).
+    * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1 size).
+    * @param n The length of the given string.
+    * @param freq [0..255] The input symbol frequency table (can be NULL).
+    * @param r The sampling rate for auxiliary indexes (must be power of 2).
+    * @param I [0..(n-1)/r] The input auxiliary indexes.
+    * @return 0 if no error occurred, -1 or -2 otherwise.
+ */ + int32_t libsais_unbwt_aux(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t r, const int32_t * I); + + /** + * Constructs the original string from a given burrows-wheeler transformed string with auxiliary indexes using libsais reverse BWT context. + * @param ctx The libsais reverse BWT context. + * @param T [0..n-1] The input string. + * @param U [0..n-1] The output string (can be T). + * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1 size). + * @param n The length of the given string. + * @param freq [0..255] The input symbol frequency table (can be NULL). + * @param r The sampling rate for auxiliary indexes (must be power of 2). + * @param I [0..(n-1)/r] The input auxiliary indexes. + * @return 0 if no error occurred, -1 or -2 otherwise. + */ + int32_t libsais_unbwt_aux_ctx(const void * ctx, const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t r, const int32_t * I); + +#if defined(_OPENMP) + /** + * Constructs the original string from a given burrows-wheeler transformed string with primary index in parallel using OpenMP. + * @param T [0..n-1] The input string. + * @param U [0..n-1] The output string (can be T). + * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1 size). + * @param n The length of the given string. + * @param freq [0..255] The input symbol frequency table (can be NULL). + * @param i The primary index. + * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). + * @return 0 if no error occurred, -1 or -2 otherwise. + */ + int32_t libsais_unbwt_omp(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t i, int32_t threads); + + /** + * Constructs the original string from a given burrows-wheeler transformed string with auxiliary indexes in parallel using OpenMP. + * @param T [0..n-1] The input string. + * @param U [0..n-1] The output string (can be T). + * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1 size). + * @param n The length of the given string. + * @param freq [0..255] The input symbol frequency table (can be NULL). + * @param r The sampling rate for auxiliary indexes (must be power of 2). + * @param I [0..(n-1)/r] The input auxiliary indexes. + * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). + * @return 0 if no error occurred, -1 or -2 otherwise. + */ + int32_t libsais_unbwt_aux_omp(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t r, const int32_t * I, int32_t threads); +#endif + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/libsais/libsais16.c b/libsais/libsais16.c new file mode 100644 index 0000000..9a8d95b --- /dev/null +++ b/libsais/libsais16.c @@ -0,0 +1,7342 @@ +/*-- + +This file is a part of libsais, a library for linear time +suffix array and burrows wheeler transform construction. + + Copyright (c) 2021 Ilya Grebnov + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
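For reference, a minimal round-trip sketch (illustrative only, not part of the patch) showing how the libsais API declared above is meant to be used: libsais_bwt produces the transformed string and its primary index, and libsais_unbwt reverses it given that index. Buffer sizes follow the parameter comments; in particular, the temporary array needs n + 1 entries for the inverse transform.

#include <stdint.h>
#include <string.h>
#include "libsais.h"

int main(void)
{
    const uint8_t T[] = "abracadabra";   /* 11 input bytes; the trailing '\0' is not transformed */
    int32_t n = 11;

    uint8_t bwt[11];                     /* forward BWT output, same length as the input */
    uint8_t out[11];                     /* reconstructed text */
    int32_t A[12];                       /* temporary array; n + 1 entries as required by libsais_unbwt */

    int32_t index = libsais_bwt(T, bwt, A, n, 0, NULL);      /* returns the primary index, or -1/-2 on error */
    if (index < 0) { return 1; }

    if (libsais_unbwt(bwt, out, A, n, NULL, index) != 0)     /* inverse BWT driven by the primary index */
    {
        return 1;
    }

    return memcmp(T, out, (size_t)n) == 0 ? 0 : 1;           /* the round trip must reproduce T */
}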
+
+Please see the file LICENSE for full copyright information.
+
+--*/
+
+#include "libsais16.h"
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+
+#if defined(_OPENMP)
+    #include <omp.h>
+#else
+    #define UNUSED(_x) (void)(_x)
+#endif
+
+typedef int32_t sa_sint_t;
+typedef uint32_t sa_uint_t;
+typedef ptrdiff_t fast_sint_t;
+typedef size_t fast_uint_t;
+
+#define SAINT_BIT (32)
+#define SAINT_MAX INT32_MAX
+#define SAINT_MIN INT32_MIN
+
+#define ALPHABET_SIZE (1 << CHAR_BIT << CHAR_BIT)
+#define UNBWT_FASTBITS (17)
+
+#define SUFFIX_GROUP_BIT (SAINT_BIT - 1)
+#define SUFFIX_GROUP_MARKER (((sa_sint_t)1) << (SUFFIX_GROUP_BIT - 1))
+
+#define BUCKETS_INDEX2(_c, _s) (((_c) << 1) + (_s))
+#define BUCKETS_INDEX4(_c, _s) (((_c) << 2) + (_s))
+
+#define LIBSAIS_PER_THREAD_CACHE_SIZE (24576)
+
+typedef struct LIBSAIS_THREAD_CACHE
+{
+    sa_sint_t symbol;
+    sa_sint_t index;
+} LIBSAIS_THREAD_CACHE;
+
+typedef union LIBSAIS_THREAD_STATE
+{
+    struct
+    {
+        fast_sint_t position;
+        fast_sint_t count;
+
+        fast_sint_t m;
+        fast_sint_t last_lms_suffix;
+
+        sa_sint_t * buckets;
+        LIBSAIS_THREAD_CACHE * cache;
+    } state;
+
+    uint8_t padding[64];
+} LIBSAIS_THREAD_STATE;
+
+typedef struct LIBSAIS_CONTEXT
+{
+    sa_sint_t * buckets;
+    LIBSAIS_THREAD_STATE * thread_state;
+    fast_sint_t threads;
+} LIBSAIS_CONTEXT;
+
+typedef struct LIBSAIS_UNBWT_CONTEXT
+{
+    sa_uint_t * bucket2;
+    uint16_t * fastbits;
+    sa_uint_t * buckets;
+    fast_sint_t threads;
+} LIBSAIS_UNBWT_CONTEXT;
+
+#if defined(__GNUC__) || defined(__clang__)
+    #define RESTRICT __restrict__
+#elif defined(_MSC_VER) || defined(__INTEL_COMPILER)
+    #define RESTRICT __restrict
+#else
+    #error Your compiler, configuration or platform is not supported.
+#endif
+
+#if defined(__has_builtin)
+    #if __has_builtin(__builtin_prefetch)
+        #define HAS_BUILTIN_PREFECTCH
+    #endif
+#elif defined(__GNUC__) && __GNUC__ > 3
+    #define HAS_BUILTIN_PREFECTCH
+#endif
+
+#if defined(HAS_BUILTIN_PREFECTCH)
+    #define libsais16_prefetch(address) __builtin_prefetch((const void *)(address), 0, 0)
+    #define libsais16_prefetchw(address) __builtin_prefetch((const void *)(address), 1, 0)
+#elif defined (_M_IX86) || defined (_M_AMD64)
+    #include <intrin.h>
+    #define libsais16_prefetch(address) _mm_prefetch((const void *)(address), _MM_HINT_NTA)
+    #define libsais16_prefetchw(address) _m_prefetchw((const void *)(address))
+#elif defined (_M_ARM)
+    #include <intrin.h>
+    #define libsais16_prefetch(address) __prefetch((const void *)(address))
+    #define libsais16_prefetchw(address) __prefetchw((const void *)(address))
+#elif defined (_M_ARM64)
+    #include <intrin.h>
+    #define libsais16_prefetch(address) __prefetch2((const void *)(address), 1)
+    #define libsais16_prefetchw(address) __prefetch2((const void *)(address), 17)
+#else
+    #error Your compiler, configuration or platform is not supported.
+#endif + +#if !defined(__LITTLE_ENDIAN__) && !defined(__BIG_ENDIAN__) + #if defined(_LITTLE_ENDIAN) \ + || (defined(BYTE_ORDER) && defined(LITTLE_ENDIAN) && BYTE_ORDER == LITTLE_ENDIAN) \ + || (defined(_BYTE_ORDER) && defined(_LITTLE_ENDIAN) && _BYTE_ORDER == _LITTLE_ENDIAN) \ + || (defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && __BYTE_ORDER == __LITTLE_ENDIAN) \ + || (defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) + #define __LITTLE_ENDIAN__ + #elif defined(_BIG_ENDIAN) \ + || (defined(BYTE_ORDER) && defined(BIG_ENDIAN) && BYTE_ORDER == BIG_ENDIAN) \ + || (defined(_BYTE_ORDER) && defined(_BIG_ENDIAN) && _BYTE_ORDER == _BIG_ENDIAN) \ + || (defined(__BYTE_ORDER) && defined(__BIG_ENDIAN) && __BYTE_ORDER == __BIG_ENDIAN) \ + || (defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + #define __BIG_ENDIAN__ + #elif defined(_WIN32) + #define __LITTLE_ENDIAN__ + #endif +#endif + +static void * libsais16_align_up(const void * address, size_t alignment) +{ + return (void *)((((ptrdiff_t)address) + ((ptrdiff_t)alignment) - 1) & (-((ptrdiff_t)alignment))); +} + +static void * libsais16_alloc_aligned(size_t size, size_t alignment) +{ + void * address = malloc(size + sizeof(short) + alignment - 1); + if (address != NULL) + { + void * aligned_address = libsais16_align_up((void *)((ptrdiff_t)address + (ptrdiff_t)(sizeof(short))), alignment); + ((short *)aligned_address)[-1] = (short)((ptrdiff_t)aligned_address - (ptrdiff_t)address); + + return aligned_address; + } + + return NULL; +} + +static void libsais16_free_aligned(void * aligned_address) +{ + if (aligned_address != NULL) + { + free((void *)((ptrdiff_t)aligned_address - ((short *)aligned_address)[-1])); + } +} + +static LIBSAIS_THREAD_STATE * libsais16_alloc_thread_state(sa_sint_t threads) +{ + LIBSAIS_THREAD_STATE * RESTRICT thread_state = (LIBSAIS_THREAD_STATE *)libsais16_alloc_aligned((size_t)threads * sizeof(LIBSAIS_THREAD_STATE), 4096); + sa_sint_t * RESTRICT thread_buckets = (sa_sint_t *)libsais16_alloc_aligned((size_t)threads * 4 * ALPHABET_SIZE * sizeof(sa_sint_t), 4096); + LIBSAIS_THREAD_CACHE * RESTRICT thread_cache = (LIBSAIS_THREAD_CACHE *)libsais16_alloc_aligned((size_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE * sizeof(LIBSAIS_THREAD_CACHE), 4096); + + if (thread_state != NULL && thread_buckets != NULL && thread_cache != NULL) + { + fast_sint_t t; + for (t = 0; t < threads; ++t) + { + thread_state[t].state.buckets = thread_buckets; thread_buckets += 4 * ALPHABET_SIZE; + thread_state[t].state.cache = thread_cache; thread_cache += LIBSAIS_PER_THREAD_CACHE_SIZE; + } + + return thread_state; + } + + libsais16_free_aligned(thread_cache); + libsais16_free_aligned(thread_buckets); + libsais16_free_aligned(thread_state); + return NULL; +} + +static void libsais16_free_thread_state(LIBSAIS_THREAD_STATE * thread_state) +{ + if (thread_state != NULL) + { + libsais16_free_aligned(thread_state[0].state.cache); + libsais16_free_aligned(thread_state[0].state.buckets); + libsais16_free_aligned(thread_state); + } +} + +static LIBSAIS_CONTEXT * libsais16_create_ctx_main(sa_sint_t threads) +{ + LIBSAIS_CONTEXT * RESTRICT ctx = (LIBSAIS_CONTEXT *)libsais16_alloc_aligned(sizeof(LIBSAIS_CONTEXT), 64); + sa_sint_t * RESTRICT buckets = (sa_sint_t *)libsais16_alloc_aligned(8 * ALPHABET_SIZE * sizeof(sa_sint_t), 4096); + LIBSAIS_THREAD_STATE * RESTRICT thread_state = threads > 1 ? 
libsais16_alloc_thread_state(threads) : NULL; + + if (ctx != NULL && buckets != NULL && (thread_state != NULL || threads == 1)) + { + ctx->buckets = buckets; + ctx->threads = threads; + ctx->thread_state = thread_state; + + return ctx; + } + + libsais16_free_thread_state(thread_state); + libsais16_free_aligned(buckets); + libsais16_free_aligned(ctx); + return NULL; +} + +static void libsais16_free_ctx_main(LIBSAIS_CONTEXT * ctx) +{ + if (ctx != NULL) + { + libsais16_free_thread_state(ctx->thread_state); + libsais16_free_aligned(ctx->buckets); + libsais16_free_aligned(ctx); + } +} + +#if defined(_OPENMP) + +static sa_sint_t libsais16_count_negative_marked_suffixes(sa_sint_t * RESTRICT SA, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + sa_sint_t count = 0; + + fast_sint_t i; for (i = omp_block_start; i < omp_block_start + omp_block_size; ++i) { count += (SA[i] < 0); } + + return count; +} + +static sa_sint_t libsais16_count_zero_marked_suffixes(sa_sint_t * RESTRICT SA, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + sa_sint_t count = 0; + + fast_sint_t i; for (i = omp_block_start; i < omp_block_start + omp_block_size; ++i) { count += (SA[i] == 0); } + + return count; +} + +static void libsais16_place_cached_suffixes(sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4) + { + libsais16_prefetch(&cache[i + 2 * prefetch_distance]); + + libsais16_prefetchw(&SA[cache[i + prefetch_distance + 0].symbol]); + libsais16_prefetchw(&SA[cache[i + prefetch_distance + 1].symbol]); + libsais16_prefetchw(&SA[cache[i + prefetch_distance + 2].symbol]); + libsais16_prefetchw(&SA[cache[i + prefetch_distance + 3].symbol]); + + SA[cache[i + 0].symbol] = cache[i + 0].index; + SA[cache[i + 1].symbol] = cache[i + 1].index; + SA[cache[i + 2].symbol] = cache[i + 2].index; + SA[cache[i + 3].symbol] = cache[i + 3].index; + } + + for (j += prefetch_distance + 3; i < j; i += 1) + { + SA[cache[i].symbol] = cache[i].index; + } +} + +static void libsais16_compact_and_place_cached_suffixes(sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j, l; + for (i = omp_block_start, j = omp_block_start + omp_block_size - 3, l = omp_block_start; i < j; i += 4) + { + libsais16_prefetchw(&cache[i + prefetch_distance]); + + cache[l] = cache[i + 0]; l += cache[l].symbol >= 0; + cache[l] = cache[i + 1]; l += cache[l].symbol >= 0; + cache[l] = cache[i + 2]; l += cache[l].symbol >= 0; + cache[l] = cache[i + 3]; l += cache[l].symbol >= 0; + } + + for (j += 3; i < j; i += 1) + { + cache[l] = cache[i]; l += cache[l].symbol >= 0; + } + + libsais16_place_cached_suffixes(SA, cache, omp_block_start, l - omp_block_start); +} + +static void libsais16_accumulate_counts_s32_2(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size, fast_sint_t bucket_stride) +{ + sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride; + fast_sint_t s; for (s = 0; s < bucket_size; s += 1) { bucket00[s] = bucket00[s] + bucket01[s]; } +} + +static void libsais16_accumulate_counts_s32_3(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size, fast_sint_t bucket_stride) +{ + sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride; + sa_sint_t * RESTRICT bucket02 = bucket01 - 
bucket_stride; + fast_sint_t s; for (s = 0; s < bucket_size; s += 1) { bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s]; } +} + +static void libsais16_accumulate_counts_s32_4(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size, fast_sint_t bucket_stride) +{ + sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride; + sa_sint_t * RESTRICT bucket02 = bucket01 - bucket_stride; + sa_sint_t * RESTRICT bucket03 = bucket02 - bucket_stride; + fast_sint_t s; for (s = 0; s < bucket_size; s += 1) { bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s] + bucket03[s]; } +} + +static void libsais16_accumulate_counts_s32_5(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size, fast_sint_t bucket_stride) +{ + sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride; + sa_sint_t * RESTRICT bucket02 = bucket01 - bucket_stride; + sa_sint_t * RESTRICT bucket03 = bucket02 - bucket_stride; + sa_sint_t * RESTRICT bucket04 = bucket03 - bucket_stride; + fast_sint_t s; for (s = 0; s < bucket_size; s += 1) { bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s] + bucket03[s] + bucket04[s]; } +} + +static void libsais16_accumulate_counts_s32_6(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size, fast_sint_t bucket_stride) +{ + sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride; + sa_sint_t * RESTRICT bucket02 = bucket01 - bucket_stride; + sa_sint_t * RESTRICT bucket03 = bucket02 - bucket_stride; + sa_sint_t * RESTRICT bucket04 = bucket03 - bucket_stride; + sa_sint_t * RESTRICT bucket05 = bucket04 - bucket_stride; + fast_sint_t s; for (s = 0; s < bucket_size; s += 1) { bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s] + bucket03[s] + bucket04[s] + bucket05[s]; } +} + +static void libsais16_accumulate_counts_s32_7(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size, fast_sint_t bucket_stride) +{ + sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride; + sa_sint_t * RESTRICT bucket02 = bucket01 - bucket_stride; + sa_sint_t * RESTRICT bucket03 = bucket02 - bucket_stride; + sa_sint_t * RESTRICT bucket04 = bucket03 - bucket_stride; + sa_sint_t * RESTRICT bucket05 = bucket04 - bucket_stride; + sa_sint_t * RESTRICT bucket06 = bucket05 - bucket_stride; + fast_sint_t s; for (s = 0; s < bucket_size; s += 1) { bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s] + bucket03[s] + bucket04[s] + bucket05[s] + bucket06[s]; } +} + +static void libsais16_accumulate_counts_s32_8(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size, fast_sint_t bucket_stride) +{ + sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride; + sa_sint_t * RESTRICT bucket02 = bucket01 - bucket_stride; + sa_sint_t * RESTRICT bucket03 = bucket02 - bucket_stride; + sa_sint_t * RESTRICT bucket04 = bucket03 - bucket_stride; + sa_sint_t * RESTRICT bucket05 = bucket04 - bucket_stride; + sa_sint_t * RESTRICT bucket06 = bucket05 - bucket_stride; + sa_sint_t * RESTRICT bucket07 = bucket06 - bucket_stride; + fast_sint_t s; for (s = 0; s < bucket_size; s += 1) { bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s] + bucket03[s] + bucket04[s] + bucket05[s] + bucket06[s] + bucket07[s]; } +} + +static void libsais16_accumulate_counts_s32_9(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size, fast_sint_t bucket_stride) +{ + sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride; + sa_sint_t * RESTRICT bucket02 = bucket01 - bucket_stride; + sa_sint_t * RESTRICT bucket03 = bucket02 - bucket_stride; + sa_sint_t * RESTRICT bucket04 = bucket03 - bucket_stride; + sa_sint_t * RESTRICT bucket05 = bucket04 - bucket_stride; + sa_sint_t * RESTRICT 
bucket06 = bucket05 - bucket_stride; + sa_sint_t * RESTRICT bucket07 = bucket06 - bucket_stride; + sa_sint_t * RESTRICT bucket08 = bucket07 - bucket_stride; + fast_sint_t s; for (s = 0; s < bucket_size; s += 1) { bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s] + bucket03[s] + bucket04[s] + bucket05[s] + bucket06[s] + bucket07[s] + bucket08[s]; } +} + +static void libsais16_accumulate_counts_s32(sa_sint_t * RESTRICT buckets, fast_sint_t bucket_size, fast_sint_t bucket_stride, fast_sint_t num_buckets) +{ + while (num_buckets >= 9) + { + libsais16_accumulate_counts_s32_9(buckets - (num_buckets - 9) * bucket_stride, bucket_size, bucket_stride); num_buckets -= 8; + } + + switch (num_buckets) + { + case 1: break; + case 2: libsais16_accumulate_counts_s32_2(buckets, bucket_size, bucket_stride); break; + case 3: libsais16_accumulate_counts_s32_3(buckets, bucket_size, bucket_stride); break; + case 4: libsais16_accumulate_counts_s32_4(buckets, bucket_size, bucket_stride); break; + case 5: libsais16_accumulate_counts_s32_5(buckets, bucket_size, bucket_stride); break; + case 6: libsais16_accumulate_counts_s32_6(buckets, bucket_size, bucket_stride); break; + case 7: libsais16_accumulate_counts_s32_7(buckets, bucket_size, bucket_stride); break; + case 8: libsais16_accumulate_counts_s32_8(buckets, bucket_size, bucket_stride); break; + } +} + +#endif + +static void libsais16_gather_lms_suffixes_16u(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, fast_sint_t m, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + if (omp_block_size > 0) + { + const fast_sint_t prefetch_distance = 128; + + fast_sint_t i, j = omp_block_start + omp_block_size, c0 = T[omp_block_start + omp_block_size - 1], c1 = -1; + + while (j < n && (c1 = T[j]) == c0) { ++j; } + + fast_uint_t s = c0 >= c1; + + for (i = omp_block_start + omp_block_size - 2, j = omp_block_start + 3; i >= j; i -= 4) + { + libsais16_prefetch(&T[i - prefetch_distance]); + + c1 = T[i - 0]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((s & 3) == 1); + c0 = T[i - 1]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 0); m -= ((s & 3) == 1); + c1 = T[i - 2]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 1); m -= ((s & 3) == 1); + c0 = T[i - 3]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 2); m -= ((s & 3) == 1); + } + + for (j -= 3; i >= j; i -= 1) + { + c1 = c0; c0 = T[i]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((s & 3) == 1); + } + + SA[m] = (sa_sint_t)(i + 1); + } +} + +static void libsais16_gather_lms_suffixes_16u_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536 && omp_get_dynamic() == 0) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(thread_state); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : n - omp_block_start; + + if (omp_num_threads == 1) + { + libsais16_gather_lms_suffixes_16u(T, SA, n, (fast_sint_t)n - 1, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + fast_sint_t t, m = 0; for (t = omp_num_threads - 1; t > omp_thread_num; --t) { m += thread_state[t].state.m; } + + libsais16_gather_lms_suffixes_16u(T, SA, n, (fast_sint_t)n - 1 - m, omp_block_start, omp_block_size); + + #pragma omp barrier + + if (thread_state[omp_thread_num].state.m > 0) + { + SA[(fast_sint_t)n - 1 - m] = (sa_sint_t)thread_state[omp_thread_num].state.last_lms_suffix; + } + } +#endif + } +} + +static sa_sint_t libsais16_gather_lms_suffixes_32s(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n) +{ + const fast_sint_t prefetch_distance = 32; + + sa_sint_t i = n - 2; + sa_sint_t m = n - 1; + fast_uint_t s = 1; + fast_sint_t c0 = T[n - 1]; + fast_sint_t c1 = 0; + + for (; i >= 3; i -= 4) + { + libsais16_prefetch(&T[i - prefetch_distance]); + + c1 = T[i - 0]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = i + 1; m -= ((s & 3) == 1); + c0 = T[i - 1]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = i - 0; m -= ((s & 3) == 1); + c1 = T[i - 2]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = i - 1; m -= ((s & 3) == 1); + c0 = T[i - 3]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = i - 2; m -= ((s & 3) == 1); + } + + for (; i >= 0; i -= 1) + { + c1 = c0; c0 = T[i]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = i + 1; m -= ((s & 3) == 1); + } + + return n - 1 - m; +} + +static sa_sint_t libsais16_gather_compacted_lms_suffixes_32s(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n) +{ + const fast_sint_t prefetch_distance = 32; + + sa_sint_t i = n - 2; + sa_sint_t m = n - 1; + fast_uint_t s = 1; + fast_sint_t c0 = T[n - 1]; + fast_sint_t c1 = 0; + + for (; i >= 3; i -= 4) + { + libsais16_prefetch(&T[i - prefetch_distance]); + + c1 = T[i - 0]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = i + 1; m -= ((fast_sint_t)(s & 3) == (c0 >= 0)); + c0 = T[i - 1]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = i - 0; m -= ((fast_sint_t)(s & 3) == (c1 >= 0)); + c1 = T[i - 2]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = i - 1; m -= ((fast_sint_t)(s & 3) == (c0 >= 0)); + c0 = T[i - 3]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = i - 2; m -= ((fast_sint_t)(s & 3) == (c1 >= 0)); + } + + for (; i >= 0; i -= 1) + { + c1 = c0; c0 = T[i]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = i + 1; m -= ((fast_sint_t)(s & 3) == (c1 >= 0)); + } + + return n - 1 - m; +} + +#if defined(_OPENMP) + +static void libsais16_count_lms_suffixes_32s_4k(const sa_sint_t * RESTRICT T, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets) +{ + const fast_sint_t prefetch_distance = 32; + + memset(buckets, 0, 4 * (size_t)k * sizeof(sa_sint_t)); + + sa_sint_t i = n - 2; + fast_uint_t s = 1; + fast_sint_t c0 = T[n - 1]; + fast_sint_t c1 = 0; + + for (; i >= prefetch_distance + 3; i -= 4) + { + libsais16_prefetch(&T[i - 2 * prefetch_distance]); + + libsais16_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 0], 0)]); + libsais16_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 1], 0)]); + libsais16_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 2], 0)]); + 
libsais16_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 3], 0)]); + + c1 = T[i - 0]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); + buckets[BUCKETS_INDEX4((fast_uint_t)c0, s & 3)]++; + + c0 = T[i - 1]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); + buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]++; + + c1 = T[i - 2]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); + buckets[BUCKETS_INDEX4((fast_uint_t)c0, s & 3)]++; + + c0 = T[i - 3]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); + buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]++; + } + + for (; i >= 0; i -= 1) + { + c1 = c0; c0 = T[i]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); + buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]++; + } + + buckets[BUCKETS_INDEX4((fast_uint_t)c0, (s << 1) & 3)]++; +} + +#endif + +static void libsais16_count_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets) +{ + const fast_sint_t prefetch_distance = 32; + + memset(buckets, 0, 2 * (size_t)k * sizeof(sa_sint_t)); + + sa_sint_t i = n - 2; + fast_uint_t s = 1; + fast_sint_t c0 = T[n - 1]; + fast_sint_t c1 = 0; + + for (; i >= prefetch_distance + 3; i -= 4) + { + libsais16_prefetch(&T[i - 2 * prefetch_distance]); + + libsais16_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 0], 0)]); + libsais16_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 1], 0)]); + libsais16_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 2], 0)]); + libsais16_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 3], 0)]); + + c1 = T[i - 0]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); + buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++; + + c0 = T[i - 1]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); + buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++; + + c1 = T[i - 2]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); + buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++; + + c0 = T[i - 3]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); + buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++; + } + + for (; i >= 0; i -= 1) + { + c1 = c0; c0 = T[i]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); + buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++; + } + + buckets[BUCKETS_INDEX2((fast_uint_t)c0, 0)]++; +} + +#if defined(_OPENMP) + +static void libsais16_count_compacted_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets) +{ + const fast_sint_t prefetch_distance = 32; + + memset(buckets, 0, 2 * (size_t)k * sizeof(sa_sint_t)); + + sa_sint_t i = n - 2; + fast_uint_t s = 1; + fast_sint_t c0 = T[n - 1]; + fast_sint_t c1 = 0; + + for (; i >= prefetch_distance + 3; i -= 4) + { + libsais16_prefetch(&T[i - 2 * prefetch_distance]); + + libsais16_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 0] & SAINT_MAX, 0)]); + libsais16_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 1] & SAINT_MAX, 0)]); + libsais16_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 2] & SAINT_MAX, 0)]); + libsais16_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 3] & SAINT_MAX, 0)]); + + c1 = T[i - 0]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); + c0 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++; + + c0 = T[i - 1]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 
1))); + c1 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++; + + c1 = T[i - 2]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); + c0 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++; + + c0 = T[i - 3]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); + c1 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++; + } + + for (; i >= 0; i -= 1) + { + c1 = c0; c0 = T[i]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); + c1 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++; + } + + c0 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c0, 0)]++; +} + +#endif + +static sa_sint_t libsais16_count_and_gather_lms_suffixes_16u(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + memset(buckets, 0, 4 * ALPHABET_SIZE * sizeof(sa_sint_t)); + + fast_sint_t m = omp_block_start + omp_block_size - 1; + + if (omp_block_size > 0) + { + const fast_sint_t prefetch_distance = 128; + + fast_sint_t i, j = m + 1, c0 = T[m], c1 = -1; + + while (j < n && (c1 = T[j]) == c0) { ++j; } + + fast_uint_t s = c0 >= c1; + + for (i = m - 1, j = omp_block_start + 3; i >= j; i -= 4) + { + libsais16_prefetch(&T[i - prefetch_distance]); + + c1 = T[i - 0]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((s & 3) == 1); + buckets[BUCKETS_INDEX4((fast_uint_t)c0, s & 3)]++; + + c0 = T[i - 1]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 0); m -= ((s & 3) == 1); + buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]++; + + c1 = T[i - 2]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 1); m -= ((s & 3) == 1); + buckets[BUCKETS_INDEX4((fast_uint_t)c0, s & 3)]++; + + c0 = T[i - 3]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 2); m -= ((s & 3) == 1); + buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]++; + } + + for (j -= 3; i >= j; i -= 1) + { + c1 = c0; c0 = T[i]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((s & 3) == 1); + buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]++; + } + + c1 = (i >= 0) ? T[i] : -1; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((s & 3) == 1); + buckets[BUCKETS_INDEX4((fast_uint_t)c0, s & 3)]++; + } + + return (sa_sint_t)(omp_block_start + omp_block_size - 1 - m); +} + +static sa_sint_t libsais16_count_and_gather_lms_suffixes_16u_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + sa_sint_t m = 0; + +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536 && omp_get_dynamic() == 0) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(thread_state); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : n - omp_block_start; + + if (omp_num_threads == 1) + { + m = libsais16_count_and_gather_lms_suffixes_16u(T, SA, n, buckets, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + thread_state[omp_thread_num].state.position = omp_block_start + omp_block_size; + thread_state[omp_thread_num].state.m = libsais16_count_and_gather_lms_suffixes_16u(T, SA, n, thread_state[omp_thread_num].state.buckets, omp_block_start, omp_block_size); + + if (thread_state[omp_thread_num].state.m > 0) + { + thread_state[omp_thread_num].state.last_lms_suffix = SA[thread_state[omp_thread_num].state.position - 1]; + } + } + + #pragma omp barrier + + #pragma omp master + { + memset(buckets, 0, 4 * ALPHABET_SIZE * sizeof(sa_sint_t)); + + fast_sint_t t; + for (t = omp_num_threads - 1; t >= 0; --t) + { + m += (sa_sint_t)thread_state[t].state.m; + + if (t != omp_num_threads - 1 && thread_state[t].state.m > 0) + { + memcpy(&SA[n - m], &SA[thread_state[t].state.position - thread_state[t].state.m], (size_t)thread_state[t].state.m * sizeof(sa_sint_t)); + } + + { + sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets; + fast_sint_t s; for (s = 0; s < 4 * ALPHABET_SIZE; s += 1) { sa_sint_t A = buckets[s], B = temp_bucket[s]; buckets[s] = A + B; temp_bucket[s] = A; } + } + } + } + } +#endif + } + + return m; +} + +static sa_sint_t libsais16_count_and_gather_lms_suffixes_32s_4k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + memset(buckets, 0, 4 * (size_t)k * sizeof(sa_sint_t)); + + fast_sint_t m = omp_block_start + omp_block_size - 1; + + if (omp_block_size > 0) + { + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j = m + 1, c0 = T[m], c1 = -1; + + while (j < n && (c1 = T[j]) == c0) { ++j; } + + fast_uint_t s = c0 >= c1; + + for (i = m - 1, j = omp_block_start + prefetch_distance + 3; i >= j; i -= 4) + { + libsais16_prefetch(&T[i - 2 * prefetch_distance]); + + libsais16_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 0], 0)]); + libsais16_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 1], 0)]); + libsais16_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 2], 0)]); + libsais16_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 3], 0)]); + + c1 = T[i - 0]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((s & 3) == 1); + buckets[BUCKETS_INDEX4((fast_uint_t)c0, s & 3)]++; + + c0 = T[i - 1]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 0); m -= ((s & 3) == 1); + buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]++; + + c1 = T[i - 2]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 1); m -= ((s & 3) == 1); + buckets[BUCKETS_INDEX4((fast_uint_t)c0, s & 3)]++; + + c0 = T[i - 3]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 2); m -= ((s & 3) == 1); + buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]++; + } + + for (j -= prefetch_distance + 3; i >= j; i -= 1) + { + c1 = c0; c0 = T[i]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((s & 3) == 1); + buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]++; + } + + c1 = (i >= 0) ? 
T[i] : -1; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((s & 3) == 1); + buckets[BUCKETS_INDEX4((fast_uint_t)c0, s & 3)]++; + } + + return (sa_sint_t)(omp_block_start + omp_block_size - 1 - m); +} + +static sa_sint_t libsais16_count_and_gather_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + memset(buckets, 0, 2 * (size_t)k * sizeof(sa_sint_t)); + + fast_sint_t m = omp_block_start + omp_block_size - 1; + + if (omp_block_size > 0) + { + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j = m + 1, c0 = T[m], c1 = -1; + + while (j < n && (c1 = T[j]) == c0) { ++j; } + + fast_uint_t s = c0 >= c1; + + for (i = m - 1, j = omp_block_start + prefetch_distance + 3; i >= j; i -= 4) + { + libsais16_prefetch(&T[i - 2 * prefetch_distance]); + + libsais16_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 0], 0)]); + libsais16_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 1], 0)]); + libsais16_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 2], 0)]); + libsais16_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 3], 0)]); + + c1 = T[i - 0]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((s & 3) == 1); + buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++; + + c0 = T[i - 1]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 0); m -= ((s & 3) == 1); + buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++; + + c1 = T[i - 2]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 1); m -= ((s & 3) == 1); + buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++; + + c0 = T[i - 3]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 2); m -= ((s & 3) == 1); + buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++; + } + + for (j -= prefetch_distance + 3; i >= j; i -= 1) + { + c1 = c0; c0 = T[i]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((s & 3) == 1); + buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++; + } + + c1 = (i >= 0) ? 
T[i] : -1; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((s & 3) == 1); + buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++; + } + + return (sa_sint_t)(omp_block_start + omp_block_size - 1 - m); +} + +static sa_sint_t libsais16_count_and_gather_compacted_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + memset(buckets, 0, 2 * (size_t)k * sizeof(sa_sint_t)); + + fast_sint_t m = omp_block_start + omp_block_size - 1; + + if (omp_block_size > 0) + { + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j = m + 1, c0 = T[m], c1 = -1; + + while (j < n && (c1 = T[j]) == c0) { ++j; } + + fast_uint_t s = c0 >= c1; + + for (i = m - 1, j = omp_block_start + prefetch_distance + 3; i >= j; i -= 4) + { + libsais16_prefetch(&T[i - 2 * prefetch_distance]); + + libsais16_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 0] & SAINT_MAX, 0)]); + libsais16_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 1] & SAINT_MAX, 0)]); + libsais16_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 2] & SAINT_MAX, 0)]); + libsais16_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 3] & SAINT_MAX, 0)]); + + c1 = T[i - 0]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((fast_sint_t)(s & 3) == (c0 >= 0)); + c0 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++; + + c0 = T[i - 1]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 0); m -= ((fast_sint_t)(s & 3) == (c1 >= 0)); + c1 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++; + + c1 = T[i - 2]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 1); m -= ((fast_sint_t)(s & 3) == (c0 >= 0)); + c0 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++; + + c0 = T[i - 3]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 2); m -= ((fast_sint_t)(s & 3) == (c1 >= 0)); + c1 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++; + } + + for (j -= prefetch_distance + 3; i >= j; i -= 1) + { + c1 = c0; c0 = T[i]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((fast_sint_t)(s & 3) == (c1 >= 0)); + c1 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++; + } + + c1 = (i >= 0) ? 
T[i] : -1; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((fast_sint_t)(s & 3) == (c0 >= 0)); + c0 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++; + } + + return (sa_sint_t)(omp_block_start + omp_block_size - 1 - m); +} + +#if defined(_OPENMP) + +static fast_sint_t libsais16_get_bucket_stride(fast_sint_t free_space, fast_sint_t bucket_size, fast_sint_t num_buckets) +{ + fast_sint_t bucket_size_1024 = (bucket_size + 1023) & (-1024); if (free_space / (num_buckets - 1) >= bucket_size_1024) { return bucket_size_1024; } + fast_sint_t bucket_size_16 = (bucket_size + 15) & (-16); if (free_space / (num_buckets - 1) >= bucket_size_16) { return bucket_size_16; } + + return bucket_size; +} + +static sa_sint_t libsais16_count_and_gather_lms_suffixes_32s_4k_fs_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + sa_sint_t m = 0; + +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(thread_state); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start; + + if (omp_num_threads == 1) + { + m = libsais16_count_and_gather_lms_suffixes_32s_4k(T, SA, n, k, buckets, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + fast_sint_t bucket_size = 4 * (fast_sint_t)k; + fast_sint_t bucket_stride = libsais16_get_bucket_stride(buckets - &SA[n], bucket_size, omp_num_threads); + + { + thread_state[omp_thread_num].state.position = omp_block_start + omp_block_size; + thread_state[omp_thread_num].state.count = libsais16_count_and_gather_lms_suffixes_32s_4k(T, SA, n, k, buckets - (omp_thread_num * bucket_stride), omp_block_start, omp_block_size); + } + + #pragma omp barrier + + if (omp_thread_num == omp_num_threads - 1) + { + fast_sint_t t; + for (t = omp_num_threads - 1; t >= 0; --t) + { + m += (sa_sint_t)thread_state[t].state.count; + + if (t != omp_num_threads - 1 && thread_state[t].state.count > 0) + { + memcpy(&SA[n - m], &SA[thread_state[t].state.position - thread_state[t].state.count], (size_t)thread_state[t].state.count * sizeof(sa_sint_t)); + } + } + } + else + { + omp_num_threads = omp_num_threads - 1; + omp_block_stride = (bucket_size / omp_num_threads) & (-16); + omp_block_start = omp_thread_num * omp_block_stride; + omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : bucket_size - omp_block_start; + + libsais16_accumulate_counts_s32(buckets + omp_block_start, omp_block_size, bucket_stride, omp_num_threads + 1); + } + } +#endif + } + + return m; +} + +static sa_sint_t libsais16_count_and_gather_lms_suffixes_32s_2k_fs_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + sa_sint_t m = 0; + +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(thread_state); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start; + + if (omp_num_threads == 1) + { + m = libsais16_count_and_gather_lms_suffixes_32s_2k(T, SA, n, k, buckets, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + fast_sint_t bucket_size = 2 * (fast_sint_t)k; + fast_sint_t bucket_stride = libsais16_get_bucket_stride(buckets - &SA[n], bucket_size, omp_num_threads); + + { + thread_state[omp_thread_num].state.position = omp_block_start + omp_block_size; + thread_state[omp_thread_num].state.count = libsais16_count_and_gather_lms_suffixes_32s_2k(T, SA, n, k, buckets - (omp_thread_num * bucket_stride), omp_block_start, omp_block_size); + } + + #pragma omp barrier + + if (omp_thread_num == omp_num_threads - 1) + { + fast_sint_t t; + for (t = omp_num_threads - 1; t >= 0; --t) + { + m += (sa_sint_t)thread_state[t].state.count; + + if (t != omp_num_threads - 1 && thread_state[t].state.count > 0) + { + memcpy(&SA[n - m], &SA[thread_state[t].state.position - thread_state[t].state.count], (size_t)thread_state[t].state.count * sizeof(sa_sint_t)); + } + } + } + else + { + omp_num_threads = omp_num_threads - 1; + omp_block_stride = (bucket_size / omp_num_threads) & (-16); + omp_block_start = omp_thread_num * omp_block_stride; + omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : bucket_size - omp_block_start; + + libsais16_accumulate_counts_s32(buckets + omp_block_start, omp_block_size, bucket_stride, omp_num_threads + 1); + } + } +#endif + } + + return m; +} + +static void libsais16_count_and_gather_compacted_lms_suffixes_32s_2k_fs_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(thread_state); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : n - omp_block_start; + + if (omp_num_threads == 1) + { + libsais16_count_and_gather_compacted_lms_suffixes_32s_2k(T, SA, n, k, buckets, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + fast_sint_t bucket_size = 2 * (fast_sint_t)k; + fast_sint_t bucket_stride = libsais16_get_bucket_stride(buckets - &SA[n + n], bucket_size, omp_num_threads); + + { + thread_state[omp_thread_num].state.position = omp_block_start + omp_block_size; + thread_state[omp_thread_num].state.count = libsais16_count_and_gather_compacted_lms_suffixes_32s_2k(T, SA + n, n, k, buckets - (omp_thread_num * bucket_stride), omp_block_start, omp_block_size); + } + + #pragma omp barrier + + { + fast_sint_t t, m = 0; for (t = omp_num_threads - 1; t >= omp_thread_num; --t) { m += (sa_sint_t)thread_state[t].state.count; } + + if (thread_state[omp_thread_num].state.count > 0) + { + memcpy(&SA[n - m], &SA[n + thread_state[omp_thread_num].state.position - thread_state[omp_thread_num].state.count], (size_t)thread_state[omp_thread_num].state.count * sizeof(sa_sint_t)); + } + } + + { + omp_block_stride = (bucket_size / omp_num_threads) & (-16); + omp_block_start = omp_thread_num * omp_block_stride; + omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : bucket_size - omp_block_start; + + libsais16_accumulate_counts_s32(buckets + omp_block_start, omp_block_size, bucket_stride, omp_num_threads); + } + } +#endif + } +} + +#endif + +static sa_sint_t libsais16_count_and_gather_lms_suffixes_32s_4k_nofs_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads) +{ + sa_sint_t m = 0; + +#if defined(_OPENMP) + #pragma omp parallel num_threads(2) if(threads > 1 && n >= 65536) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); + + fast_sint_t omp_num_threads = 1; +#endif + if (omp_num_threads == 1) + { + m = libsais16_count_and_gather_lms_suffixes_32s_4k(T, SA, n, k, buckets, 0, n); + } +#if defined(_OPENMP) + else if (omp_thread_num == 0) + { + libsais16_count_lms_suffixes_32s_4k(T, n, k, buckets); + } + else + { + m = libsais16_gather_lms_suffixes_32s(T, SA, n); + } +#endif + } + + return m; +} + +static sa_sint_t libsais16_count_and_gather_lms_suffixes_32s_2k_nofs_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads) +{ + sa_sint_t m = 0; + +#if defined(_OPENMP) + #pragma omp parallel num_threads(2) if(threads > 1 && n >= 65536) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); + + fast_sint_t omp_num_threads = 1; +#endif + if (omp_num_threads == 1) + { + m = libsais16_count_and_gather_lms_suffixes_32s_2k(T, SA, n, k, buckets, 0, n); + } +#if defined(_OPENMP) + else if (omp_thread_num == 0) + { + libsais16_count_lms_suffixes_32s_2k(T, n, k, buckets); + } + else + { + m = libsais16_gather_lms_suffixes_32s(T, SA, n); + } +#endif + } + + return m; +} + +static sa_sint_t libsais16_count_and_gather_compacted_lms_suffixes_32s_2k_nofs_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads) +{ + sa_sint_t m = 0; + +#if defined(_OPENMP) + #pragma omp parallel num_threads(2) if(threads > 1 && n >= 65536) +#endif + { +#if 
defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); + + fast_sint_t omp_num_threads = 1; +#endif + if (omp_num_threads == 1) + { + m = libsais16_count_and_gather_compacted_lms_suffixes_32s_2k(T, SA, n, k, buckets, 0, n); + } +#if defined(_OPENMP) + else if (omp_thread_num == 0) + { + libsais16_count_compacted_lms_suffixes_32s_2k(T, n, k, buckets); + } + else + { + m = libsais16_gather_compacted_lms_suffixes_32s(T, SA, n); + } +#endif + } + + return m; +} + +static sa_sint_t libsais16_count_and_gather_lms_suffixes_32s_4k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + sa_sint_t m; + +#if defined(_OPENMP) + sa_sint_t max_threads = (sa_sint_t)((buckets - &SA[n]) / ((4 * (fast_sint_t)k + 15) & (-16))); if (max_threads > threads) { max_threads = threads; } + if (max_threads > 1 && n >= 65536 && n / k >= 2) + { + if (max_threads > n / 16 / k) { max_threads = n / 16 / k; } + m = libsais16_count_and_gather_lms_suffixes_32s_4k_fs_omp(T, SA, n, k, buckets, max_threads > 2 ? max_threads : 2, thread_state); + } + else +#else + UNUSED(thread_state); +#endif + { + m = libsais16_count_and_gather_lms_suffixes_32s_4k_nofs_omp(T, SA, n, k, buckets, threads); + } + + return m; +} + +static sa_sint_t libsais16_count_and_gather_lms_suffixes_32s_2k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + sa_sint_t m; + +#if defined(_OPENMP) + sa_sint_t max_threads = (sa_sint_t)((buckets - &SA[n]) / ((2 * (fast_sint_t)k + 15) & (-16))); if (max_threads > threads) { max_threads = threads; } + if (max_threads > 1 && n >= 65536 && n / k >= 2) + { + if (max_threads > n / 8 / k) { max_threads = n / 8 / k; } + m = libsais16_count_and_gather_lms_suffixes_32s_2k_fs_omp(T, SA, n, k, buckets, max_threads > 2 ? max_threads : 2, thread_state); + } + else +#else + UNUSED(thread_state); +#endif + { + m = libsais16_count_and_gather_lms_suffixes_32s_2k_nofs_omp(T, SA, n, k, buckets, threads); + } + + return m; +} + +static void libsais16_count_and_gather_compacted_lms_suffixes_32s_2k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ +#if defined(_OPENMP) + sa_sint_t max_threads = (sa_sint_t)((buckets - &SA[n + n]) / ((2 * (fast_sint_t)k + 15) & (-16))); if (max_threads > threads) { max_threads = threads; } + if (max_threads > 1 && n >= 65536 && n / k >= 2) + { + if (max_threads > n / 8 / k) { max_threads = n / 8 / k; } + libsais16_count_and_gather_compacted_lms_suffixes_32s_2k_fs_omp(T, SA, n, k, buckets, max_threads > 2 ? 
max_threads : 2, thread_state); + } + else +#else + UNUSED(thread_state); +#endif + { + libsais16_count_and_gather_compacted_lms_suffixes_32s_2k_nofs_omp(T, SA, n, k, buckets, threads); + } +} + +static void libsais16_count_suffixes_32s(const sa_sint_t * RESTRICT T, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets) +{ + const fast_sint_t prefetch_distance = 32; + + memset(buckets, 0, (size_t)k * sizeof(sa_sint_t)); + + fast_sint_t i, j; + for (i = 0, j = (fast_sint_t)n - 7; i < j; i += 8) + { + libsais16_prefetch(&T[i + prefetch_distance]); + + buckets[T[i + 0]]++; + buckets[T[i + 1]]++; + buckets[T[i + 2]]++; + buckets[T[i + 3]]++; + buckets[T[i + 4]]++; + buckets[T[i + 5]]++; + buckets[T[i + 6]]++; + buckets[T[i + 7]]++; + } + + for (j += 7; i < j; i += 1) + { + buckets[T[i]]++; + } +} + +static void libsais16_initialize_buckets_start_and_end_16u(sa_sint_t * RESTRICT buckets, sa_sint_t * RESTRICT freq) +{ + sa_sint_t * RESTRICT bucket_start = &buckets[6 * ALPHABET_SIZE]; + sa_sint_t * RESTRICT bucket_end = &buckets[7 * ALPHABET_SIZE]; + + if (freq != NULL) + { + fast_sint_t i, j; sa_sint_t sum = 0; + for (i = BUCKETS_INDEX4(0, 0), j = 0; i <= BUCKETS_INDEX4(ALPHABET_SIZE - 1, 0); i += BUCKETS_INDEX4(1, 0), j += 1) + { + bucket_start[j] = sum; + sum += (freq[j] = buckets[i + BUCKETS_INDEX4(0, 0)] + buckets[i + BUCKETS_INDEX4(0, 1)] + buckets[i + BUCKETS_INDEX4(0, 2)] + buckets[i + BUCKETS_INDEX4(0, 3)]); + bucket_end[j] = sum; + } + } + else + { + fast_sint_t i, j; sa_sint_t sum = 0; + for (i = BUCKETS_INDEX4(0, 0), j = 0; i <= BUCKETS_INDEX4(ALPHABET_SIZE - 1, 0); i += BUCKETS_INDEX4(1, 0), j += 1) + { + bucket_start[j] = sum; + sum += buckets[i + BUCKETS_INDEX4(0, 0)] + buckets[i + BUCKETS_INDEX4(0, 1)] + buckets[i + BUCKETS_INDEX4(0, 2)] + buckets[i + BUCKETS_INDEX4(0, 3)]; + bucket_end[j] = sum; + } + } +} + +static void libsais16_initialize_buckets_start_and_end_32s_6k(sa_sint_t k, sa_sint_t * RESTRICT buckets) +{ + sa_sint_t * RESTRICT bucket_start = &buckets[4 * k]; + sa_sint_t * RESTRICT bucket_end = &buckets[5 * k]; + + fast_sint_t i, j; sa_sint_t sum = 0; + for (i = BUCKETS_INDEX4(0, 0), j = 0; i <= BUCKETS_INDEX4((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX4(1, 0), j += 1) + { + bucket_start[j] = sum; + sum += buckets[i + BUCKETS_INDEX4(0, 0)] + buckets[i + BUCKETS_INDEX4(0, 1)] + buckets[i + BUCKETS_INDEX4(0, 2)] + buckets[i + BUCKETS_INDEX4(0, 3)]; + bucket_end[j] = sum; + } +} + +static void libsais16_initialize_buckets_start_and_end_32s_4k(sa_sint_t k, sa_sint_t * RESTRICT buckets) +{ + sa_sint_t * RESTRICT bucket_start = &buckets[2 * k]; + sa_sint_t * RESTRICT bucket_end = &buckets[3 * k]; + + fast_sint_t i, j; sa_sint_t sum = 0; + for (i = BUCKETS_INDEX2(0, 0), j = 0; i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX2(1, 0), j += 1) + { + bucket_start[j] = sum; + sum += buckets[i + BUCKETS_INDEX2(0, 0)] + buckets[i + BUCKETS_INDEX2(0, 1)]; + bucket_end[j] = sum; + } +} + +static void libsais16_initialize_buckets_end_32s_2k(sa_sint_t k, sa_sint_t * RESTRICT buckets) +{ + fast_sint_t i; sa_sint_t sum0 = 0; + for (i = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX2(1, 0)) + { + sum0 += buckets[i + BUCKETS_INDEX2(0, 0)] + buckets[i + BUCKETS_INDEX2(0, 1)]; buckets[i + BUCKETS_INDEX2(0, 0)] = sum0; + } +} + +static void libsais16_initialize_buckets_start_and_end_32s_2k(sa_sint_t k, sa_sint_t * RESTRICT buckets) +{ + fast_sint_t i, j; + for (i = BUCKETS_INDEX2(0, 0), j = 0; i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0); i += 
BUCKETS_INDEX2(1, 0), j += 1) + { + buckets[j] = buckets[i]; + } + + buckets[k] = 0; memcpy(&buckets[k + 1], buckets, ((size_t)k - 1) * sizeof(sa_sint_t)); +} + +static void libsais16_initialize_buckets_start_32s_1k(sa_sint_t k, sa_sint_t * RESTRICT buckets) +{ + fast_sint_t i; sa_sint_t sum = 0; + for (i = 0; i <= (fast_sint_t)k - 1; i += 1) { sa_sint_t tmp = buckets[i]; buckets[i] = sum; sum += tmp; } +} + +static void libsais16_initialize_buckets_end_32s_1k(sa_sint_t k, sa_sint_t * RESTRICT buckets) +{ + fast_sint_t i; sa_sint_t sum = 0; + for (i = 0; i <= (fast_sint_t)k - 1; i += 1) { sum += buckets[i]; buckets[i] = sum; } +} + +static sa_sint_t libsais16_initialize_buckets_for_lms_suffixes_radix_sort_16u(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix) +{ + { + fast_uint_t s = 0; + fast_sint_t c0 = T[first_lms_suffix]; + fast_sint_t c1 = 0; + + for (; --first_lms_suffix >= 0; ) + { + c1 = c0; c0 = T[first_lms_suffix]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); + buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]--; + } + + buckets[BUCKETS_INDEX4((fast_uint_t)c0, (s << 1) & 3)]--; + } + + { + sa_sint_t * RESTRICT temp_bucket = &buckets[4 * ALPHABET_SIZE]; + + fast_sint_t i, j; sa_sint_t sum = 0; + for (i = BUCKETS_INDEX4(0, 0), j = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX4(ALPHABET_SIZE - 1, 0); i += BUCKETS_INDEX4(1, 0), j += BUCKETS_INDEX2(1, 0)) + { + temp_bucket[j + BUCKETS_INDEX2(0, 1)] = sum; sum += buckets[i + BUCKETS_INDEX4(0, 1)] + buckets[i + BUCKETS_INDEX4(0, 3)]; temp_bucket[j] = sum; + } + + return sum; + } +} + +static void libsais16_initialize_buckets_for_lms_suffixes_radix_sort_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix) +{ + buckets[BUCKETS_INDEX2(T[first_lms_suffix], 0)]++; + buckets[BUCKETS_INDEX2(T[first_lms_suffix], 1)]--; + + fast_sint_t i; sa_sint_t sum0 = 0, sum1 = 0; + for (i = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX2(1, 0)) + { + sum0 += buckets[i + BUCKETS_INDEX2(0, 0)] + buckets[i + BUCKETS_INDEX2(0, 1)]; + sum1 += buckets[i + BUCKETS_INDEX2(0, 1)]; + + buckets[i + BUCKETS_INDEX2(0, 0)] = sum0; + buckets[i + BUCKETS_INDEX2(0, 1)] = sum1; + } +} + +static sa_sint_t libsais16_initialize_buckets_for_lms_suffixes_radix_sort_32s_6k(const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix) +{ + { + fast_uint_t s = 0; + fast_sint_t c0 = T[first_lms_suffix]; + fast_sint_t c1 = 0; + + for (; --first_lms_suffix >= 0; ) + { + c1 = c0; c0 = T[first_lms_suffix]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); + buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]--; + } + + buckets[BUCKETS_INDEX4((fast_uint_t)c0, (s << 1) & 3)]--; + } + + { + sa_sint_t * RESTRICT temp_bucket = &buckets[4 * k]; + + fast_sint_t i, j; sa_sint_t sum = 0; + for (i = BUCKETS_INDEX4(0, 0), j = 0; i <= BUCKETS_INDEX4((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX4(1, 0), j += 1) + { + sum += buckets[i + BUCKETS_INDEX4(0, 1)] + buckets[i + BUCKETS_INDEX4(0, 3)]; temp_bucket[j] = sum; + } + + return sum; + } +} + +static void libsais16_initialize_buckets_for_radix_and_partial_sorting_32s_4k(const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix) +{ + sa_sint_t * RESTRICT bucket_start = &buckets[2 * k]; + sa_sint_t * RESTRICT bucket_end = &buckets[3 * k]; + + buckets[BUCKETS_INDEX2(T[first_lms_suffix], 0)]++; + 
buckets[BUCKETS_INDEX2(T[first_lms_suffix], 1)]--; + + fast_sint_t i, j; sa_sint_t sum0 = 0, sum1 = 0; + for (i = BUCKETS_INDEX2(0, 0), j = 0; i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX2(1, 0), j += 1) + { + bucket_start[j] = sum1; + + sum0 += buckets[i + BUCKETS_INDEX2(0, 1)]; + sum1 += buckets[i + BUCKETS_INDEX2(0, 0)] + buckets[i + BUCKETS_INDEX2(0, 1)]; + buckets[i + BUCKETS_INDEX2(0, 1)] = sum0; + + bucket_end[j] = sum1; + } +} + +static void libsais16_radix_sort_lms_suffixes_16u(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 3; i >= j; i -= 4) + { + libsais16_prefetch(&SA[i - 2 * prefetch_distance]); + + libsais16_prefetch(&T[SA[i - prefetch_distance - 0]]); + libsais16_prefetch(&T[SA[i - prefetch_distance - 1]]); + libsais16_prefetch(&T[SA[i - prefetch_distance - 2]]); + libsais16_prefetch(&T[SA[i - prefetch_distance - 3]]); + + sa_sint_t p0 = SA[i - 0]; SA[--induction_bucket[BUCKETS_INDEX2(T[p0], 0)]] = p0; + sa_sint_t p1 = SA[i - 1]; SA[--induction_bucket[BUCKETS_INDEX2(T[p1], 0)]] = p1; + sa_sint_t p2 = SA[i - 2]; SA[--induction_bucket[BUCKETS_INDEX2(T[p2], 0)]] = p2; + sa_sint_t p3 = SA[i - 3]; SA[--induction_bucket[BUCKETS_INDEX2(T[p3], 0)]] = p3; + } + + for (j -= prefetch_distance + 3; i >= j; i -= 1) + { + sa_sint_t p = SA[i]; SA[--induction_bucket[BUCKETS_INDEX2(T[p], 0)]] = p; + } +} + +static void libsais16_radix_sort_lms_suffixes_16u_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536 && m >= 65536 && omp_get_dynamic() == 0) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(thread_state); + + fast_sint_t omp_num_threads = 1; +#endif + if (omp_num_threads == 1) + { + libsais16_radix_sort_lms_suffixes_16u(T, SA, &buckets[4 * ALPHABET_SIZE], (fast_sint_t)n - (fast_sint_t)m + 1, (fast_sint_t)m - 1); + } +#if defined(_OPENMP) + else + { + { + sa_sint_t * RESTRICT src_bucket = &buckets[4 * ALPHABET_SIZE]; + sa_sint_t * RESTRICT dst_bucket = thread_state[omp_thread_num].state.buckets; + + fast_sint_t i, j; + for (i = BUCKETS_INDEX2(0, 0), j = BUCKETS_INDEX4(0, 1); i <= BUCKETS_INDEX2(ALPHABET_SIZE - 1, 0); i += BUCKETS_INDEX2(1, 0), j += BUCKETS_INDEX4(1, 0)) + { + dst_bucket[i] = src_bucket[i] - dst_bucket[j]; + } + } + + { + fast_sint_t t, omp_block_start = 0, omp_block_size = thread_state[omp_thread_num].state.m; + for (t = omp_num_threads - 1; t >= omp_thread_num; --t) omp_block_start += thread_state[t].state.m; + + if (omp_block_start == (fast_sint_t)m && omp_block_size > 0) + { + omp_block_start -= 1; omp_block_size -= 1; + } + + libsais16_radix_sort_lms_suffixes_16u(T, SA, thread_state[omp_thread_num].state.buckets, (fast_sint_t)n - omp_block_start, omp_block_size); + } + } +#endif + } +} + +static void libsais16_radix_sort_lms_suffixes_32s_6k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + 
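+    /* Right-to-left pass over the gathered LMS-suffix positions: each position p is
+       written to the current end of its leading-symbol bucket (--induction_bucket[T[p]]),
+       i.e. a counting/radix sort of the LMS suffixes by their first symbol. */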
fast_sint_t i, j; + for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 3; i >= j; i -= 4) + { + libsais16_prefetch(&SA[i - 3 * prefetch_distance]); + + libsais16_prefetch(&T[SA[i - 2 * prefetch_distance - 0]]); + libsais16_prefetch(&T[SA[i - 2 * prefetch_distance - 1]]); + libsais16_prefetch(&T[SA[i - 2 * prefetch_distance - 2]]); + libsais16_prefetch(&T[SA[i - 2 * prefetch_distance - 3]]); + + libsais16_prefetchw(&induction_bucket[T[SA[i - prefetch_distance - 0]]]); + libsais16_prefetchw(&induction_bucket[T[SA[i - prefetch_distance - 1]]]); + libsais16_prefetchw(&induction_bucket[T[SA[i - prefetch_distance - 2]]]); + libsais16_prefetchw(&induction_bucket[T[SA[i - prefetch_distance - 3]]]); + + sa_sint_t p0 = SA[i - 0]; SA[--induction_bucket[T[p0]]] = p0; + sa_sint_t p1 = SA[i - 1]; SA[--induction_bucket[T[p1]]] = p1; + sa_sint_t p2 = SA[i - 2]; SA[--induction_bucket[T[p2]]] = p2; + sa_sint_t p3 = SA[i - 3]; SA[--induction_bucket[T[p3]]] = p3; + } + + for (j -= 2 * prefetch_distance + 3; i >= j; i -= 1) + { + sa_sint_t p = SA[i]; SA[--induction_bucket[T[p]]] = p; + } +} + +static void libsais16_radix_sort_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 3; i >= j; i -= 4) + { + libsais16_prefetch(&SA[i - 3 * prefetch_distance]); + + libsais16_prefetch(&T[SA[i - 2 * prefetch_distance - 0]]); + libsais16_prefetch(&T[SA[i - 2 * prefetch_distance - 1]]); + libsais16_prefetch(&T[SA[i - 2 * prefetch_distance - 2]]); + libsais16_prefetch(&T[SA[i - 2 * prefetch_distance - 3]]); + + libsais16_prefetchw(&induction_bucket[BUCKETS_INDEX2(T[SA[i - prefetch_distance - 0]], 0)]); + libsais16_prefetchw(&induction_bucket[BUCKETS_INDEX2(T[SA[i - prefetch_distance - 1]], 0)]); + libsais16_prefetchw(&induction_bucket[BUCKETS_INDEX2(T[SA[i - prefetch_distance - 2]], 0)]); + libsais16_prefetchw(&induction_bucket[BUCKETS_INDEX2(T[SA[i - prefetch_distance - 3]], 0)]); + + sa_sint_t p0 = SA[i - 0]; SA[--induction_bucket[BUCKETS_INDEX2(T[p0], 0)]] = p0; + sa_sint_t p1 = SA[i - 1]; SA[--induction_bucket[BUCKETS_INDEX2(T[p1], 0)]] = p1; + sa_sint_t p2 = SA[i - 2]; SA[--induction_bucket[BUCKETS_INDEX2(T[p2], 0)]] = p2; + sa_sint_t p3 = SA[i - 3]; SA[--induction_bucket[BUCKETS_INDEX2(T[p3], 0)]] = p3; + } + + for (j -= 2 * prefetch_distance + 3; i >= j; i -= 1) + { + sa_sint_t p = SA[i]; SA[--induction_bucket[BUCKETS_INDEX2(T[p], 0)]] = p; + } +} + +#if defined(_OPENMP) + +static void libsais16_radix_sort_lms_suffixes_32s_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4) + { + libsais16_prefetch(&SA[i + 2 * prefetch_distance]); + + libsais16_prefetch(&T[SA[i + prefetch_distance + 0]]); + libsais16_prefetch(&T[SA[i + prefetch_distance + 1]]); + libsais16_prefetch(&T[SA[i + prefetch_distance + 2]]); + libsais16_prefetch(&T[SA[i + prefetch_distance + 3]]); + + libsais16_prefetchw(&cache[i + prefetch_distance]); + + cache[i + 0].symbol = T[cache[i + 0].index = SA[i + 0]]; + cache[i + 1].symbol = T[cache[i + 1].index = 
SA[i + 1]]; + cache[i + 2].symbol = T[cache[i + 2].index = SA[i + 2]]; + cache[i + 3].symbol = T[cache[i + 3].index = SA[i + 3]]; + } + + for (j += prefetch_distance + 3; i < j; i += 1) + { + cache[i].symbol = T[cache[i].index = SA[i]]; + } +} + +static void libsais16_radix_sort_lms_suffixes_32s_6k_block_sort(sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 3; i >= j; i -= 4) + { + libsais16_prefetchw(&cache[i - 2 * prefetch_distance]); + + libsais16_prefetchw(&induction_bucket[cache[i - prefetch_distance - 0].symbol]); + libsais16_prefetchw(&induction_bucket[cache[i - prefetch_distance - 1].symbol]); + libsais16_prefetchw(&induction_bucket[cache[i - prefetch_distance - 2].symbol]); + libsais16_prefetchw(&induction_bucket[cache[i - prefetch_distance - 3].symbol]); + + cache[i - 0].symbol = --induction_bucket[cache[i - 0].symbol]; + cache[i - 1].symbol = --induction_bucket[cache[i - 1].symbol]; + cache[i - 2].symbol = --induction_bucket[cache[i - 2].symbol]; + cache[i - 3].symbol = --induction_bucket[cache[i - 3].symbol]; + } + + for (j -= prefetch_distance + 3; i >= j; i -= 1) + { + cache[i].symbol = --induction_bucket[cache[i].symbol]; + } +} + +static void libsais16_radix_sort_lms_suffixes_32s_2k_block_sort(sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 3; i >= j; i -= 4) + { + libsais16_prefetchw(&cache[i - 2 * prefetch_distance]); + + libsais16_prefetchw(&induction_bucket[BUCKETS_INDEX2(cache[i - prefetch_distance - 0].symbol, 0)]); + libsais16_prefetchw(&induction_bucket[BUCKETS_INDEX2(cache[i - prefetch_distance - 1].symbol, 0)]); + libsais16_prefetchw(&induction_bucket[BUCKETS_INDEX2(cache[i - prefetch_distance - 2].symbol, 0)]); + libsais16_prefetchw(&induction_bucket[BUCKETS_INDEX2(cache[i - prefetch_distance - 3].symbol, 0)]); + + cache[i - 0].symbol = --induction_bucket[BUCKETS_INDEX2(cache[i - 0].symbol, 0)]; + cache[i - 1].symbol = --induction_bucket[BUCKETS_INDEX2(cache[i - 1].symbol, 0)]; + cache[i - 2].symbol = --induction_bucket[BUCKETS_INDEX2(cache[i - 2].symbol, 0)]; + cache[i - 3].symbol = --induction_bucket[BUCKETS_INDEX2(cache[i - 3].symbol, 0)]; + } + + for (j -= prefetch_distance + 3; i >= j; i -= 1) + { + cache[i].symbol = --induction_bucket[BUCKETS_INDEX2(cache[i].symbol, 0)]; + } +} + +static void libsais16_radix_sort_lms_suffixes_32s_6k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(cache); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = 
omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start; + + omp_block_start += block_start; + + if (omp_num_threads == 1) + { + libsais16_radix_sort_lms_suffixes_32s_6k(T, SA, induction_bucket, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + libsais16_radix_sort_lms_suffixes_32s_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size); + } + + #pragma omp barrier + + #pragma omp master + { + libsais16_radix_sort_lms_suffixes_32s_6k_block_sort(induction_bucket, cache - block_start, block_start, block_size); + } + + #pragma omp barrier + + { + libsais16_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size); + } + } +#endif + } +} + +static void libsais16_radix_sort_lms_suffixes_32s_2k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(cache); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start; + + omp_block_start += block_start; + + if (omp_num_threads == 1) + { + libsais16_radix_sort_lms_suffixes_32s_2k(T, SA, induction_bucket, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + libsais16_radix_sort_lms_suffixes_32s_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size); + } + + #pragma omp barrier + + #pragma omp master + { + libsais16_radix_sort_lms_suffixes_32s_2k_block_sort(induction_bucket, cache - block_start, block_start, block_size); + } + + #pragma omp barrier + + { + libsais16_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size); + } + } +#endif + } +} + +#endif + +static void libsais16_radix_sort_lms_suffixes_32s_6k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + if (threads == 1 || m < 65536) + { + libsais16_radix_sort_lms_suffixes_32s_6k(T, SA, induction_bucket, (fast_sint_t)n - (fast_sint_t)m + 1, (fast_sint_t)m - 1); + } +#if defined(_OPENMP) + else + { + fast_sint_t block_start, block_end; + for (block_start = 0; block_start < (fast_sint_t)m - 1; block_start = block_end) + { + block_end = block_start + (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end >= m) { block_end = (fast_sint_t)m - 1; } + + libsais16_radix_sort_lms_suffixes_32s_6k_block_omp(T, SA, induction_bucket, thread_state[0].state.cache, (fast_sint_t)n - block_end, block_end - block_start, threads); + } + } +#else + UNUSED(thread_state); +#endif +} + +static void libsais16_radix_sort_lms_suffixes_32s_2k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + if (threads == 1 || m < 65536) + { + 
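/* sequential fallback: scan the m - 1 LMS positions stored at SA[n - m + 1 .. n - 1] from right to left and drop each one into its symbol's 2k induction bucket, filling each bucket from its end */ + 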
libsais16_radix_sort_lms_suffixes_32s_2k(T, SA, induction_bucket, (fast_sint_t)n - (fast_sint_t)m + 1, (fast_sint_t)m - 1); + } +#if defined(_OPENMP) + else + { + fast_sint_t block_start, block_end; + for (block_start = 0; block_start < (fast_sint_t)m - 1; block_start = block_end) + { + block_end = block_start + (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end >= m) { block_end = (fast_sint_t)m - 1; } + + libsais16_radix_sort_lms_suffixes_32s_2k_block_omp(T, SA, induction_bucket, thread_state[0].state.cache, (fast_sint_t)n - block_end, block_end - block_start, threads); + } + } +#else + UNUSED(thread_state); +#endif +} + +static sa_sint_t libsais16_radix_sort_lms_suffixes_32s_1k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets) +{ + const fast_sint_t prefetch_distance = 32; + + sa_sint_t i = n - 2; + sa_sint_t m = 0; + fast_uint_t s = 1; + fast_sint_t c0 = T[n - 1]; + fast_sint_t c1 = 0; + fast_sint_t c2 = 0; + + for (; i >= prefetch_distance + 3; i -= 4) + { + libsais16_prefetch(&T[i - 2 * prefetch_distance]); + + libsais16_prefetchw(&buckets[T[i - prefetch_distance - 0]]); + libsais16_prefetchw(&buckets[T[i - prefetch_distance - 1]]); + libsais16_prefetchw(&buckets[T[i - prefetch_distance - 2]]); + libsais16_prefetchw(&buckets[T[i - prefetch_distance - 3]]); + + c1 = T[i - 0]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); + if ((s & 3) == 1) { SA[--buckets[c2 = c0]] = i + 1; m++; } + + c0 = T[i - 1]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); + if ((s & 3) == 1) { SA[--buckets[c2 = c1]] = i - 0; m++; } + + c1 = T[i - 2]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); + if ((s & 3) == 1) { SA[--buckets[c2 = c0]] = i - 1; m++; } + + c0 = T[i - 3]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); + if ((s & 3) == 1) { SA[--buckets[c2 = c1]] = i - 2; m++; } + } + + for (; i >= 0; i -= 1) + { + c1 = c0; c0 = T[i]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); + if ((s & 3) == 1) { SA[--buckets[c2 = c1]] = i + 1; m++; } + } + + if (m > 1) + { + SA[buckets[c2]] = 0; + } + + return m; +} + +static void libsais16_radix_sort_set_markers_32s_6k(sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4) + { + libsais16_prefetch(&induction_bucket[i + 2 * prefetch_distance]); + + libsais16_prefetchw(&SA[induction_bucket[i + prefetch_distance + 0]]); + libsais16_prefetchw(&SA[induction_bucket[i + prefetch_distance + 1]]); + libsais16_prefetchw(&SA[induction_bucket[i + prefetch_distance + 2]]); + libsais16_prefetchw(&SA[induction_bucket[i + prefetch_distance + 3]]); + + SA[induction_bucket[i + 0]] |= SAINT_MIN; + SA[induction_bucket[i + 1]] |= SAINT_MIN; + SA[induction_bucket[i + 2]] |= SAINT_MIN; + SA[induction_bucket[i + 3]] |= SAINT_MIN; + } + + for (j += prefetch_distance + 3; i < j; i += 1) + { + SA[induction_bucket[i]] |= SAINT_MIN; + } +} + +static void libsais16_radix_sort_set_markers_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4) + { + 
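/* prefetch upcoming bucket entries and their SA slots, then set SUFFIX_GROUP_MARKER on the SA positions addressed by buckets i .. i + 3 */ + 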
libsais16_prefetch(&induction_bucket[BUCKETS_INDEX2(i + 2 * prefetch_distance, 0)]); + + libsais16_prefetchw(&SA[induction_bucket[BUCKETS_INDEX2(i + prefetch_distance + 0, 0)]]); + libsais16_prefetchw(&SA[induction_bucket[BUCKETS_INDEX2(i + prefetch_distance + 1, 0)]]); + libsais16_prefetchw(&SA[induction_bucket[BUCKETS_INDEX2(i + prefetch_distance + 2, 0)]]); + libsais16_prefetchw(&SA[induction_bucket[BUCKETS_INDEX2(i + prefetch_distance + 3, 0)]]); + + SA[induction_bucket[BUCKETS_INDEX2(i + 0, 0)]] |= SUFFIX_GROUP_MARKER; + SA[induction_bucket[BUCKETS_INDEX2(i + 1, 0)]] |= SUFFIX_GROUP_MARKER; + SA[induction_bucket[BUCKETS_INDEX2(i + 2, 0)]] |= SUFFIX_GROUP_MARKER; + SA[induction_bucket[BUCKETS_INDEX2(i + 3, 0)]] |= SUFFIX_GROUP_MARKER; + } + + for (j += prefetch_distance + 3; i < j; i += 1) + { + SA[induction_bucket[BUCKETS_INDEX2(i, 0)]] |= SUFFIX_GROUP_MARKER; + } +} + +static void libsais16_radix_sort_set_markers_32s_6k_omp(sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && k >= 65536) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); + fast_sint_t omp_block_stride = (((fast_sint_t)k - 1) / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : (fast_sint_t)k - 1 - omp_block_start; +#else + UNUSED(threads); + + fast_sint_t omp_block_start = 0; + fast_sint_t omp_block_size = (fast_sint_t)k - 1; +#endif + + libsais16_radix_sort_set_markers_32s_6k(SA, induction_bucket, omp_block_start, omp_block_size); + } +} + +static void libsais16_radix_sort_set_markers_32s_4k_omp(sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && k >= 65536) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); + fast_sint_t omp_block_stride = (((fast_sint_t)k - 1) / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : (fast_sint_t)k - 1 - omp_block_start; +#else + UNUSED(threads); + + fast_sint_t omp_block_start = 0; + fast_sint_t omp_block_size = (fast_sint_t)k - 1; +#endif + + libsais16_radix_sort_set_markers_32s_4k(SA, induction_bucket, omp_block_start, omp_block_size); + } +} + +static void libsais16_initialize_buckets_for_partial_sorting_16u(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count) +{ + sa_sint_t * RESTRICT temp_bucket = &buckets[4 * ALPHABET_SIZE]; + + buckets[BUCKETS_INDEX4((fast_uint_t)T[first_lms_suffix], 1)]++; + + fast_sint_t i, j; sa_sint_t sum0 = left_suffixes_count + 1, sum1 = 0; + for (i = BUCKETS_INDEX4(0, 0), j = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX4(ALPHABET_SIZE - 1, 0); i += BUCKETS_INDEX4(1, 0), j += BUCKETS_INDEX2(1, 0)) + { + temp_bucket[j + BUCKETS_INDEX2(0, 0)] = sum0; + + sum0 += buckets[i + BUCKETS_INDEX4(0, 0)] + buckets[i + BUCKETS_INDEX4(0, 2)]; + sum1 += buckets[i + BUCKETS_INDEX4(0, 1)]; + + buckets[j + BUCKETS_INDEX2(0, 0)] = sum0; + buckets[j + BUCKETS_INDEX2(0, 1)] = sum1; + } +} + +static void libsais16_initialize_buckets_for_partial_sorting_32s_6k(const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count) +{ + sa_sint_t * RESTRICT temp_bucket = &buckets[4 * k]; + + fast_sint_t i, j; sa_sint_t sum0 = left_suffixes_count + 1, sum1 = 0, sum2 = 0; + for (first_lms_suffix = T[first_lms_suffix], i = BUCKETS_INDEX4(0, 0), j = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX4((fast_sint_t)first_lms_suffix - 1, 0); i += BUCKETS_INDEX4(1, 0), j += BUCKETS_INDEX2(1, 0)) + { + sa_sint_t SS = buckets[i + BUCKETS_INDEX4(0, 0)]; + sa_sint_t LS = buckets[i + BUCKETS_INDEX4(0, 1)]; + sa_sint_t SL = buckets[i + BUCKETS_INDEX4(0, 2)]; + sa_sint_t LL = buckets[i + BUCKETS_INDEX4(0, 3)]; + + buckets[i + BUCKETS_INDEX4(0, 0)] = sum0; + buckets[i + BUCKETS_INDEX4(0, 1)] = sum2; + buckets[i + BUCKETS_INDEX4(0, 2)] = 0; + buckets[i + BUCKETS_INDEX4(0, 3)] = 0; + + sum0 += SS + SL; sum1 += LS; sum2 += LS + LL; + + temp_bucket[j + BUCKETS_INDEX2(0, 0)] = sum0; + temp_bucket[j + BUCKETS_INDEX2(0, 1)] = sum1; + } + + for (sum1 += 1; i <= BUCKETS_INDEX4((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX4(1, 0), j += BUCKETS_INDEX2(1, 0)) + { + sa_sint_t SS = buckets[i + BUCKETS_INDEX4(0, 0)]; + sa_sint_t LS = buckets[i + BUCKETS_INDEX4(0, 1)]; + sa_sint_t SL = buckets[i + BUCKETS_INDEX4(0, 2)]; + sa_sint_t LL = buckets[i + BUCKETS_INDEX4(0, 3)]; + + buckets[i + BUCKETS_INDEX4(0, 0)] = sum0; + buckets[i + BUCKETS_INDEX4(0, 1)] = sum2; + buckets[i + BUCKETS_INDEX4(0, 2)] = 0; + buckets[i + BUCKETS_INDEX4(0, 3)] = 0; + + sum0 += SS + SL; sum1 += LS; sum2 += LS + LL; + + temp_bucket[j + BUCKETS_INDEX2(0, 0)] = sum0; + temp_bucket[j + BUCKETS_INDEX2(0, 1)] = sum1; + } +} + +static sa_sint_t libsais16_partial_sorting_scan_left_to_right_16u(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + sa_sint_t * RESTRICT induction_bucket = &buckets[4 * ALPHABET_SIZE]; + sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE]; + + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) + { + libsais16_prefetch(&SA[i + 2 * prefetch_distance]); + + libsais16_prefetch(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 1); + 
libsais16_prefetch(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 2); + libsais16_prefetch(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 1); + libsais16_prefetch(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 2); + + sa_sint_t p0 = SA[i + 0]; d += (p0 < 0); p0 &= SAINT_MAX; sa_sint_t v0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] >= T[p0 - 1]); + SA[induction_bucket[v0]++] = (p0 - 1) | ((sa_sint_t)(distinct_names[v0] != d) << (SAINT_BIT - 1)); distinct_names[v0] = d; + + sa_sint_t p1 = SA[i + 1]; d += (p1 < 0); p1 &= SAINT_MAX; sa_sint_t v1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] >= T[p1 - 1]); + SA[induction_bucket[v1]++] = (p1 - 1) | ((sa_sint_t)(distinct_names[v1] != d) << (SAINT_BIT - 1)); distinct_names[v1] = d; + } + + for (j += prefetch_distance + 1; i < j; i += 1) + { + sa_sint_t p = SA[i]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] >= T[p - 1]); + SA[induction_bucket[v]++] = (p - 1) | ((sa_sint_t)(distinct_names[v] != d) << (SAINT_BIT - 1)); distinct_names[v] = d; + } + + return d; +} + +#if defined(_OPENMP) + +static void libsais16_partial_sorting_scan_left_to_right_16u_block_prepare(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size, LIBSAIS_THREAD_STATE * RESTRICT state) +{ + const fast_sint_t prefetch_distance = 32; + + sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE]; + sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE]; + + memset(buckets, 0, 4 * ALPHABET_SIZE * sizeof(sa_sint_t)); + + fast_sint_t i, j, count = 0; sa_sint_t d = 1; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) + { + libsais16_prefetch(&SA[i + 2 * prefetch_distance]); + + libsais16_prefetch(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 1); + libsais16_prefetch(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 2); + libsais16_prefetch(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 1); + libsais16_prefetch(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 2); + + sa_sint_t p0 = cache[count].index = SA[i + 0]; d += (p0 < 0); p0 &= SAINT_MAX; sa_sint_t v0 = cache[count++].symbol = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] >= T[p0 - 1]); induction_bucket[v0]++; distinct_names[v0] = d; + sa_sint_t p1 = cache[count].index = SA[i + 1]; d += (p1 < 0); p1 &= SAINT_MAX; sa_sint_t v1 = cache[count++].symbol = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] >= T[p1 - 1]); induction_bucket[v1]++; distinct_names[v1] = d; + } + + for (j += prefetch_distance + 1; i < j; i += 1) + { + sa_sint_t p = cache[count].index = SA[i]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = cache[count++].symbol = BUCKETS_INDEX2(T[p - 1], T[p - 2] >= T[p - 1]); induction_bucket[v]++; distinct_names[v] = d; + } + + state[0].state.position = (fast_sint_t)d - 1; + state[0].state.count = count; +} + +static void libsais16_partial_sorting_scan_left_to_right_16u_block_place(sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count, sa_sint_t d) +{ + const fast_sint_t prefetch_distance = 32; + + sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE]; + sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE]; + + fast_sint_t i, j; + for (i = 0, j = count - 1; i < j; i += 2) + { + libsais16_prefetch(&cache[i + prefetch_distance]); + + sa_sint_t p0 = cache[i + 0].index; d += (p0 < 0); sa_sint_t v0 = cache[i + 0].symbol; + SA[induction_bucket[v0]++] = (p0 
- 1) | ((sa_sint_t)(distinct_names[v0] != d) << (SAINT_BIT - 1)); distinct_names[v0] = d; + + sa_sint_t p1 = cache[i + 1].index; d += (p1 < 0); sa_sint_t v1 = cache[i + 1].symbol; + SA[induction_bucket[v1]++] = (p1 - 1) | ((sa_sint_t)(distinct_names[v1] != d) << (SAINT_BIT - 1)); distinct_names[v1] = d; + } + + for (j += 1; i < j; i += 1) + { + sa_sint_t p = cache[i].index; d += (p < 0); sa_sint_t v = cache[i].symbol; + SA[induction_bucket[v]++] = (p - 1) | ((sa_sint_t)(distinct_names[v] != d) << (SAINT_BIT - 1)); distinct_names[v] = d; + } +} + +static sa_sint_t libsais16_partial_sorting_scan_left_to_right_16u_block_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * ALPHABET_SIZE && omp_get_dynamic() == 0) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(thread_state); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start; + + omp_block_start += block_start; + + if (omp_num_threads == 1) + { + d = libsais16_partial_sorting_scan_left_to_right_16u(T, SA, buckets, d, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + libsais16_partial_sorting_scan_left_to_right_16u_block_prepare(T, SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size, &thread_state[omp_thread_num]); + } + + #pragma omp barrier + + #pragma omp master + { + sa_sint_t * RESTRICT induction_bucket = &buckets[4 * ALPHABET_SIZE]; + sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE]; + + fast_sint_t t; + for (t = 0; t < omp_num_threads; ++t) + { + sa_sint_t * RESTRICT temp_induction_bucket = &thread_state[t].state.buckets[0 * ALPHABET_SIZE]; + sa_sint_t * RESTRICT temp_distinct_names = &thread_state[t].state.buckets[2 * ALPHABET_SIZE]; + + fast_sint_t c; + for (c = 0; c < 2 * ALPHABET_SIZE; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_induction_bucket[c]; induction_bucket[c] = A + B; temp_induction_bucket[c] = A; } + + for (d -= 1, c = 0; c < 2 * ALPHABET_SIZE; c += 1) { sa_sint_t A = distinct_names[c], B = temp_distinct_names[c], D = B + d; distinct_names[c] = B > 0 ? 
D : A; temp_distinct_names[c] = A; } + d += 1 + (sa_sint_t)thread_state[t].state.position; thread_state[t].state.position = (fast_sint_t)d - thread_state[t].state.position; + } + } + + #pragma omp barrier + + { + libsais16_partial_sorting_scan_left_to_right_16u_block_place(SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count, (sa_sint_t)thread_state[omp_thread_num].state.position); + } + } +#endif + } + + return d; +} + +#endif + +static sa_sint_t libsais16_partial_sorting_scan_left_to_right_16u_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t left_suffixes_count, sa_sint_t d, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + sa_sint_t * RESTRICT induction_bucket = &buckets[4 * ALPHABET_SIZE]; + sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE]; + + SA[induction_bucket[BUCKETS_INDEX2(T[n - 1], T[n - 2] >= T[n - 1])]++] = (n - 1) | SAINT_MIN; + distinct_names[BUCKETS_INDEX2(T[n - 1], T[n - 2] >= T[n - 1])] = ++d; + + if (threads == 1 || left_suffixes_count < 65536) + { + d = libsais16_partial_sorting_scan_left_to_right_16u(T, SA, buckets, d, 0, left_suffixes_count); + } +#if defined(_OPENMP) + else + { + fast_sint_t block_start; + for (block_start = 0; block_start < left_suffixes_count; ) + { + if (SA[block_start] == 0) + { + block_start++; + } + else + { + fast_sint_t block_max_end = block_start + ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads); if (block_max_end > left_suffixes_count) { block_max_end = left_suffixes_count;} + fast_sint_t block_end = block_start + 1; while (block_end < block_max_end && SA[block_end] != 0) { block_end++; } + fast_sint_t block_size = block_end - block_start; + + if (block_size < 32) + { + for (; block_start < block_end; block_start += 1) + { + sa_sint_t p = SA[block_start]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] >= T[p - 1]); + SA[induction_bucket[v]++] = (p - 1) | ((sa_sint_t)(distinct_names[v] != d) << (SAINT_BIT - 1)); distinct_names[v] = d; + } + } + else + { + d = libsais16_partial_sorting_scan_left_to_right_16u_block_omp(T, SA, buckets, d, block_start, block_size, threads, thread_state); + block_start = block_end; + } + } + } + } +#else + UNUSED(thread_state); +#endif + + return d; +} + +static sa_sint_t libsais16_partial_sorting_scan_left_to_right_32s_6k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1; i < j; i += 2) + { + libsais16_prefetch(&SA[i + 3 * prefetch_distance]); + + libsais16_prefetch(&T[SA[i + 2 * prefetch_distance + 0] & SAINT_MAX] - 1); + libsais16_prefetch(&T[SA[i + 2 * prefetch_distance + 0] & SAINT_MAX] - 2); + libsais16_prefetch(&T[SA[i + 2 * prefetch_distance + 1] & SAINT_MAX] - 1); + libsais16_prefetch(&T[SA[i + 2 * prefetch_distance + 1] & SAINT_MAX] - 2); + + sa_sint_t p0 = SA[i + prefetch_distance + 0] & SAINT_MAX; sa_sint_t v0 = BUCKETS_INDEX4(T[p0 - (p0 > 0)], 0); libsais16_prefetchw(&buckets[v0]); + sa_sint_t p1 = SA[i + prefetch_distance + 1] & SAINT_MAX; sa_sint_t v1 = BUCKETS_INDEX4(T[p1 - (p1 > 0)], 0); libsais16_prefetchw(&buckets[v1]); + + sa_sint_t p2 = SA[i + 0]; d += (p2 < 0); p2 &= SAINT_MAX; sa_sint_t 
v2 = BUCKETS_INDEX4(T[p2 - 1], T[p2 - 2] >= T[p2 - 1]); + SA[buckets[v2]++] = (p2 - 1) | ((sa_sint_t)(buckets[2 + v2] != d) << (SAINT_BIT - 1)); buckets[2 + v2] = d; + + sa_sint_t p3 = SA[i + 1]; d += (p3 < 0); p3 &= SAINT_MAX; sa_sint_t v3 = BUCKETS_INDEX4(T[p3 - 1], T[p3 - 2] >= T[p3 - 1]); + SA[buckets[v3]++] = (p3 - 1) | ((sa_sint_t)(buckets[2 + v3] != d) << (SAINT_BIT - 1)); buckets[2 + v3] = d; + } + + for (j += 2 * prefetch_distance + 1; i < j; i += 1) + { + sa_sint_t p = SA[i]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = BUCKETS_INDEX4(T[p - 1], T[p - 2] >= T[p - 1]); + SA[buckets[v]++] = (p - 1) | ((sa_sint_t)(buckets[2 + v] != d) << (SAINT_BIT - 1)); buckets[2 + v] = d; + } + + return d; +} + +static sa_sint_t libsais16_partial_sorting_scan_left_to_right_32s_4k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + sa_sint_t * RESTRICT induction_bucket = &buckets[2 * k]; + sa_sint_t * RESTRICT distinct_names = &buckets[0 * k]; + + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1; i < j; i += 2) + { + libsais16_prefetchw(&SA[i + 3 * prefetch_distance]); + + sa_sint_t s0 = SA[i + 2 * prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0 & ~SUFFIX_GROUP_MARKER] - 1; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i + 2 * prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1 & ~SUFFIX_GROUP_MARKER] - 1; libsais16_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16_prefetch(s1 > 0 ? Ts1 : NULL); + sa_sint_t s2 = SA[i + 1 * prefetch_distance + 0]; if (s2 > 0) { const fast_sint_t Ts2 = T[(s2 & ~SUFFIX_GROUP_MARKER) - 1]; libsais16_prefetchw(&induction_bucket[Ts2]); libsais16_prefetchw(&distinct_names[BUCKETS_INDEX2(Ts2, 0)]); } + sa_sint_t s3 = SA[i + 1 * prefetch_distance + 1]; if (s3 > 0) { const fast_sint_t Ts3 = T[(s3 & ~SUFFIX_GROUP_MARKER) - 1]; libsais16_prefetchw(&induction_bucket[Ts3]); libsais16_prefetchw(&distinct_names[BUCKETS_INDEX2(Ts3, 0)]); } + + sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 & SAINT_MAX; + if (p0 > 0) + { + SA[i + 0] = 0; d += (p0 >> (SUFFIX_GROUP_BIT - 1)); p0 &= ~SUFFIX_GROUP_MARKER; sa_sint_t v0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] < T[p0 - 1]); + SA[induction_bucket[T[p0 - 1]]++] = (p0 - 1) | ((sa_sint_t)(T[p0 - 2] < T[p0 - 1]) << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v0] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v0] = d; + } + + sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 & SAINT_MAX; + if (p1 > 0) + { + SA[i + 1] = 0; d += (p1 >> (SUFFIX_GROUP_BIT - 1)); p1 &= ~SUFFIX_GROUP_MARKER; sa_sint_t v1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] < T[p1 - 1]); + SA[induction_bucket[T[p1 - 1]]++] = (p1 - 1) | ((sa_sint_t)(T[p1 - 2] < T[p1 - 1]) << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v1] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v1] = d; + } + } + + for (j += 2 * prefetch_distance + 1; i < j; i += 1) + { + sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; + if (p > 0) + { + SA[i] = 0; d += (p >> (SUFFIX_GROUP_BIT - 1)); p &= ~SUFFIX_GROUP_MARKER; sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] < T[p - 1]); + SA[induction_bucket[T[p - 1]]++] = (p - 1) | ((sa_sint_t)(T[p - 2] < T[p - 1]) << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v] = d; + } + } + + return d; +} + +static void 
libsais16_partial_sorting_scan_left_to_right_32s_1k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1; i < j; i += 2) + { + libsais16_prefetchw(&SA[i + 3 * prefetch_distance]); + + sa_sint_t s0 = SA[i + 2 * prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0] - 1; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i + 2 * prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1] - 1; libsais16_prefetch(s1 > 0 ? Ts1 : NULL); + sa_sint_t s2 = SA[i + 1 * prefetch_distance + 0]; if (s2 > 0) { libsais16_prefetchw(&induction_bucket[T[s2 - 1]]); libsais16_prefetch(&T[s2] - 2); } + sa_sint_t s3 = SA[i + 1 * prefetch_distance + 1]; if (s3 > 0) { libsais16_prefetchw(&induction_bucket[T[s3 - 1]]); libsais16_prefetch(&T[s3] - 2); } + + sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 & SAINT_MAX; if (p0 > 0) { SA[i + 0] = 0; SA[induction_bucket[T[p0 - 1]]++] = (p0 - 1) | ((sa_sint_t)(T[p0 - 2] < T[p0 - 1]) << (SAINT_BIT - 1)); } + sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 & SAINT_MAX; if (p1 > 0) { SA[i + 1] = 0; SA[induction_bucket[T[p1 - 1]]++] = (p1 - 1) | ((sa_sint_t)(T[p1 - 2] < T[p1 - 1]) << (SAINT_BIT - 1)); } + } + + for (j += 2 * prefetch_distance + 1; i < j; i += 1) + { + sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { SA[i] = 0; SA[induction_bucket[T[p - 1]]++] = (p - 1) | ((sa_sint_t)(T[p - 2] < T[p - 1]) << (SAINT_BIT - 1)); } + } +} + +#if defined(_OPENMP) + +static void libsais16_partial_sorting_scan_left_to_right_32s_6k_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) + { + libsais16_prefetch(&SA[i + 2 * prefetch_distance]); + + libsais16_prefetch(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 1); + libsais16_prefetch(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 2); + libsais16_prefetch(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 1); + libsais16_prefetch(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 2); + + libsais16_prefetchw(&cache[i + prefetch_distance]); + + sa_sint_t p0 = cache[i + 0].index = SA[i + 0]; sa_sint_t symbol0 = 0; p0 &= SAINT_MAX; if (p0 != 0) { symbol0 = BUCKETS_INDEX4(T[p0 - 1], T[p0 - 2] >= T[p0 - 1]); } cache[i + 0].symbol = symbol0; + sa_sint_t p1 = cache[i + 1].index = SA[i + 1]; sa_sint_t symbol1 = 0; p1 &= SAINT_MAX; if (p1 != 0) { symbol1 = BUCKETS_INDEX4(T[p1 - 1], T[p1 - 2] >= T[p1 - 1]); } cache[i + 1].symbol = symbol1; + } + + for (j += prefetch_distance + 1; i < j; i += 1) + { + sa_sint_t p = cache[i].index = SA[i]; sa_sint_t symbol = 0; p &= SAINT_MAX; if (p != 0) { symbol = BUCKETS_INDEX4(T[p - 1], T[p - 2] >= T[p - 1]); } cache[i].symbol = symbol; + } +} + +static void libsais16_partial_sorting_scan_left_to_right_32s_4k_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) + { + libsais16_prefetchw(&SA[i + 2 * 
prefetch_distance]); + + sa_sint_t s0 = SA[i + prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0 & ~SUFFIX_GROUP_MARKER] - 1; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i + prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1 & ~SUFFIX_GROUP_MARKER] - 1; libsais16_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16_prefetch(s1 > 0 ? Ts1 : NULL); + + libsais16_prefetchw(&cache[i + prefetch_distance]); + + sa_sint_t symbol0 = SAINT_MIN, p0 = SA[i + 0]; if (p0 > 0) { cache[i + 0].index = p0; p0 &= ~SUFFIX_GROUP_MARKER; symbol0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] < T[p0 - 1]); p0 = 0; } cache[i + 0].symbol = symbol0; SA[i + 0] = p0 & SAINT_MAX; + sa_sint_t symbol1 = SAINT_MIN, p1 = SA[i + 1]; if (p1 > 0) { cache[i + 1].index = p1; p1 &= ~SUFFIX_GROUP_MARKER; symbol1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] < T[p1 - 1]); p1 = 0; } cache[i + 1].symbol = symbol1; SA[i + 1] = p1 & SAINT_MAX; + } + + for (j += prefetch_distance + 1; i < j; i += 1) + { + sa_sint_t symbol = SAINT_MIN, p = SA[i]; if (p > 0) { cache[i].index = p; p &= ~SUFFIX_GROUP_MARKER; symbol = BUCKETS_INDEX2(T[p - 1], T[p - 2] < T[p - 1]); p = 0; } cache[i].symbol = symbol; SA[i] = p & SAINT_MAX; + } +} + +static void libsais16_partial_sorting_scan_left_to_right_32s_1k_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) + { + libsais16_prefetchw(&SA[i + 2 * prefetch_distance]); + + sa_sint_t s0 = SA[i + prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0] - 1; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i + prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1] - 1; libsais16_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16_prefetch(s1 > 0 ? 
Ts1 : NULL); + + libsais16_prefetchw(&cache[i + prefetch_distance]); + + sa_sint_t symbol0 = SAINT_MIN, p0 = SA[i + 0]; if (p0 > 0) { cache[i + 0].index = (p0 - 1) | ((sa_sint_t)(T[p0 - 2] < T[p0 - 1]) << (SAINT_BIT - 1)); symbol0 = T[p0 - 1]; p0 = 0; } cache[i + 0].symbol = symbol0; SA[i + 0] = p0 & SAINT_MAX; + sa_sint_t symbol1 = SAINT_MIN, p1 = SA[i + 1]; if (p1 > 0) { cache[i + 1].index = (p1 - 1) | ((sa_sint_t)(T[p1 - 2] < T[p1 - 1]) << (SAINT_BIT - 1)); symbol1 = T[p1 - 1]; p1 = 0; } cache[i + 1].symbol = symbol1; SA[i + 1] = p1 & SAINT_MAX; + } + + for (j += prefetch_distance + 1; i < j; i += 1) + { + sa_sint_t symbol = SAINT_MIN, p = SA[i]; if (p > 0) { cache[i].index = (p - 1) | ((sa_sint_t)(T[p - 2] < T[p - 1]) << (SAINT_BIT - 1)); symbol = T[p - 1]; p = 0; } cache[i].symbol = symbol; SA[i] = p & SAINT_MAX; + } +} + +static sa_sint_t libsais16_partial_sorting_scan_left_to_right_32s_6k_block_sort(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j, omp_block_end = omp_block_start + omp_block_size; + for (i = omp_block_start, j = omp_block_end - prefetch_distance - 1; i < j; i += 2) + { + libsais16_prefetchw(&cache[i + 2 * prefetch_distance]); + + libsais16_prefetchw(&buckets[cache[i + prefetch_distance + 0].symbol]); + libsais16_prefetchw(&buckets[cache[i + prefetch_distance + 1].symbol]); + + sa_sint_t v0 = cache[i + 0].symbol, p0 = cache[i + 0].index; d += (p0 < 0); cache[i + 0].symbol = buckets[v0]++; cache[i + 0].index = (p0 - 1) | ((sa_sint_t)(buckets[2 + v0] != d) << (SAINT_BIT - 1)); buckets[2 + v0] = d; + if (cache[i + 0].symbol < omp_block_end) { sa_sint_t s = cache[i + 0].symbol, q = (cache[s].index = cache[i + 0].index) & SAINT_MAX; cache[s].symbol = BUCKETS_INDEX4(T[q - 1], T[q - 2] >= T[q - 1]); } + + sa_sint_t v1 = cache[i + 1].symbol, p1 = cache[i + 1].index; d += (p1 < 0); cache[i + 1].symbol = buckets[v1]++; cache[i + 1].index = (p1 - 1) | ((sa_sint_t)(buckets[2 + v1] != d) << (SAINT_BIT - 1)); buckets[2 + v1] = d; + if (cache[i + 1].symbol < omp_block_end) { sa_sint_t s = cache[i + 1].symbol, q = (cache[s].index = cache[i + 1].index) & SAINT_MAX; cache[s].symbol = BUCKETS_INDEX4(T[q - 1], T[q - 2] >= T[q - 1]); } + } + + for (j += prefetch_distance + 1; i < j; i += 1) + { + sa_sint_t v = cache[i].symbol, p = cache[i].index; d += (p < 0); cache[i].symbol = buckets[v]++; cache[i].index = (p - 1) | ((sa_sint_t)(buckets[2 + v] != d) << (SAINT_BIT - 1)); buckets[2 + v] = d; + if (cache[i].symbol < omp_block_end) { sa_sint_t s = cache[i].symbol, q = (cache[s].index = cache[i].index) & SAINT_MAX; cache[s].symbol = BUCKETS_INDEX4(T[q - 1], T[q - 2] >= T[q - 1]); } + } + + return d; +} + +static sa_sint_t libsais16_partial_sorting_scan_left_to_right_32s_4k_block_sort(const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + sa_sint_t * RESTRICT induction_bucket = &buckets[2 * k]; + sa_sint_t * RESTRICT distinct_names = &buckets[0 * k]; + + fast_sint_t i, j, omp_block_end = omp_block_start + omp_block_size; + for (i = omp_block_start, j = omp_block_end - prefetch_distance - 1; i < j; i += 2) + { + libsais16_prefetchw(&cache[i + 2 * prefetch_distance]); + + sa_sint_t s0 = cache[i + prefetch_distance + 0].symbol; const 
sa_sint_t * Is0 = &induction_bucket[s0 >> 1]; libsais16_prefetchw(s0 >= 0 ? Is0 : NULL); const sa_sint_t * Ds0 = &distinct_names[s0]; libsais16_prefetchw(s0 >= 0 ? Ds0 : NULL); + sa_sint_t s1 = cache[i + prefetch_distance + 1].symbol; const sa_sint_t * Is1 = &induction_bucket[s1 >> 1]; libsais16_prefetchw(s1 >= 0 ? Is1 : NULL); const sa_sint_t * Ds1 = &distinct_names[s1]; libsais16_prefetchw(s1 >= 0 ? Ds1 : NULL); + + sa_sint_t v0 = cache[i + 0].symbol; + if (v0 >= 0) + { + sa_sint_t p0 = cache[i + 0].index; d += (p0 >> (SUFFIX_GROUP_BIT - 1)); cache[i + 0].symbol = induction_bucket[v0 >> 1]++; cache[i + 0].index = (p0 - 1) | (v0 << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v0] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v0] = d; + if (cache[i + 0].symbol < omp_block_end) { sa_sint_t ni = cache[i + 0].symbol, np = cache[i + 0].index; if (np > 0) { cache[ni].index = np; np &= ~SUFFIX_GROUP_MARKER; cache[ni].symbol = BUCKETS_INDEX2(T[np - 1], T[np - 2] < T[np - 1]); np = 0; } cache[i + 0].index = np & SAINT_MAX; } + } + + sa_sint_t v1 = cache[i + 1].symbol; + if (v1 >= 0) + { + sa_sint_t p1 = cache[i + 1].index; d += (p1 >> (SUFFIX_GROUP_BIT - 1)); cache[i + 1].symbol = induction_bucket[v1 >> 1]++; cache[i + 1].index = (p1 - 1) | (v1 << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v1] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v1] = d; + if (cache[i + 1].symbol < omp_block_end) { sa_sint_t ni = cache[i + 1].symbol, np = cache[i + 1].index; if (np > 0) { cache[ni].index = np; np &= ~SUFFIX_GROUP_MARKER; cache[ni].symbol = BUCKETS_INDEX2(T[np - 1], T[np - 2] < T[np - 1]); np = 0; } cache[i + 1].index = np & SAINT_MAX; } + } + } + + for (j += prefetch_distance + 1; i < j; i += 1) + { + sa_sint_t v = cache[i].symbol; + if (v >= 0) + { + sa_sint_t p = cache[i].index; d += (p >> (SUFFIX_GROUP_BIT - 1)); cache[i].symbol = induction_bucket[v >> 1]++; cache[i].index = (p - 1) | (v << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v] = d; + if (cache[i].symbol < omp_block_end) { sa_sint_t ni = cache[i].symbol, np = cache[i].index; if (np > 0) { cache[ni].index = np; np &= ~SUFFIX_GROUP_MARKER; cache[ni].symbol = BUCKETS_INDEX2(T[np - 1], T[np - 2] < T[np - 1]); np = 0; } cache[i].index = np & SAINT_MAX; } + } + } + + return d; +} + +static void libsais16_partial_sorting_scan_left_to_right_32s_1k_block_sort(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j, omp_block_end = omp_block_start + omp_block_size; + for (i = omp_block_start, j = omp_block_end - prefetch_distance - 1; i < j; i += 2) + { + libsais16_prefetchw(&cache[i + 2 * prefetch_distance]); + + sa_sint_t s0 = cache[i + prefetch_distance + 0].symbol; const sa_sint_t * Is0 = &induction_bucket[s0]; libsais16_prefetchw(s0 >= 0 ? Is0 : NULL); + sa_sint_t s1 = cache[i + prefetch_distance + 1].symbol; const sa_sint_t * Is1 = &induction_bucket[s1]; libsais16_prefetchw(s1 >= 0 ? 
Is1 : NULL); + + sa_sint_t v0 = cache[i + 0].symbol; + if (v0 >= 0) + { + cache[i + 0].symbol = induction_bucket[v0]++; + if (cache[i + 0].symbol < omp_block_end) { sa_sint_t ni = cache[i + 0].symbol, np = cache[i + 0].index; if (np > 0) { cache[ni].index = (np - 1) | ((sa_sint_t)(T[np - 2] < T[np - 1]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np - 1]; np = 0; } cache[i + 0].index = np & SAINT_MAX; } + } + + sa_sint_t v1 = cache[i + 1].symbol; + if (v1 >= 0) + { + cache[i + 1].symbol = induction_bucket[v1]++; + if (cache[i + 1].symbol < omp_block_end) { sa_sint_t ni = cache[i + 1].symbol, np = cache[i + 1].index; if (np > 0) { cache[ni].index = (np - 1) | ((sa_sint_t)(T[np - 2] < T[np - 1]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np - 1]; np = 0; } cache[i + 1].index = np & SAINT_MAX; } + } + } + + for (j += prefetch_distance + 1; i < j; i += 1) + { + sa_sint_t v = cache[i].symbol; + if (v >= 0) + { + cache[i].symbol = induction_bucket[v]++; + if (cache[i].symbol < omp_block_end) { sa_sint_t ni = cache[i].symbol, np = cache[i].index; if (np > 0) { cache[ni].index = (np - 1) | ((sa_sint_t)(T[np - 2] < T[np - 1]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np - 1]; np = 0; } cache[i].index = np & SAINT_MAX; } + } + } +} + +static sa_sint_t libsais16_partial_sorting_scan_left_to_right_32s_6k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(cache); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : block_size - omp_block_start; + + omp_block_start += block_start; + + if (omp_num_threads == 1) + { + d = libsais16_partial_sorting_scan_left_to_right_32s_6k(T, SA, buckets, d, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + libsais16_partial_sorting_scan_left_to_right_32s_6k_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size); + } + + #pragma omp barrier + + #pragma omp master + { + d = libsais16_partial_sorting_scan_left_to_right_32s_6k_block_sort(T, buckets, d, cache - block_start, block_start, block_size); + } + + #pragma omp barrier + + { + libsais16_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size); + } + } +#endif + } + + return d; +} + +static sa_sint_t libsais16_partial_sorting_scan_left_to_right_32s_4k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(cache); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start; + + omp_block_start += block_start; + + if (omp_num_threads == 1) + { + d = libsais16_partial_sorting_scan_left_to_right_32s_4k(T, SA, k, buckets, d, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + libsais16_partial_sorting_scan_left_to_right_32s_4k_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size); + } + + #pragma omp barrier + + #pragma omp master + { + d = libsais16_partial_sorting_scan_left_to_right_32s_4k_block_sort(T, k, buckets, d, cache - block_start, block_start, block_size); + } + + #pragma omp barrier + + { + libsais16_compact_and_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size); + } + } +#endif + } + + return d; +} + +static void libsais16_partial_sorting_scan_left_to_right_32s_1k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(cache); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : block_size - omp_block_start; + + omp_block_start += block_start; + + if (omp_num_threads == 1) + { + libsais16_partial_sorting_scan_left_to_right_32s_1k(T, SA, buckets, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + libsais16_partial_sorting_scan_left_to_right_32s_1k_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size); + } + + #pragma omp barrier + + #pragma omp master + { + libsais16_partial_sorting_scan_left_to_right_32s_1k_block_sort(T, buckets, cache - block_start, block_start, block_size); + } + + #pragma omp barrier + + { + libsais16_compact_and_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size); + } + } +#endif + } +} + +#endif + +static sa_sint_t libsais16_partial_sorting_scan_left_to_right_32s_6k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t left_suffixes_count, sa_sint_t d, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + SA[buckets[BUCKETS_INDEX4(T[n - 1], T[n - 2] >= T[n - 1])]++] = (n - 1) | SAINT_MIN; + buckets[2 + BUCKETS_INDEX4(T[n - 1], T[n - 2] >= T[n - 1])] = ++d; + + if (threads == 1 || left_suffixes_count < 65536) + { + d = libsais16_partial_sorting_scan_left_to_right_32s_6k(T, SA, buckets, d, 0, left_suffixes_count); + } +#if defined(_OPENMP) + else + { + fast_sint_t block_start, block_end; + for (block_start = 0; block_start < left_suffixes_count; block_start = block_end) + { + block_end = block_start + (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end > left_suffixes_count) { block_end = left_suffixes_count; } + + d = libsais16_partial_sorting_scan_left_to_right_32s_6k_block_omp(T, SA, buckets, d, thread_state[0].state.cache, block_start, block_end - block_start, threads); + } + } +#else + UNUSED(thread_state); +#endif + + return d; +} + +static sa_sint_t libsais16_partial_sorting_scan_left_to_right_32s_4k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + sa_sint_t * RESTRICT induction_bucket = &buckets[2 * k]; + sa_sint_t * RESTRICT distinct_names = &buckets[0 * k]; + + SA[induction_bucket[T[n - 1]]++] = (n - 1) | ((sa_sint_t)(T[n - 2] < T[n - 1]) << (SAINT_BIT - 1)) | SUFFIX_GROUP_MARKER; + distinct_names[BUCKETS_INDEX2(T[n - 1], T[n - 2] < T[n - 1])] = ++d; + + if (threads == 1 || n < 65536) + { + d = libsais16_partial_sorting_scan_left_to_right_32s_4k(T, SA, k, buckets, d, 0, n); + } +#if defined(_OPENMP) + else + { + fast_sint_t block_start, block_end; + for (block_start = 0; block_start < n; block_start = block_end) + { + block_end = block_start + (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end > n) { block_end = n; } + + d = libsais16_partial_sorting_scan_left_to_right_32s_4k_block_omp(T, SA, k, buckets, d, thread_state[0].state.cache, block_start, block_end - block_start, threads); + } + } +#else + UNUSED(thread_state); +#endif + + return d; +} + +static void libsais16_partial_sorting_scan_left_to_right_32s_1k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + SA[buckets[T[n - 1]]++] = (n - 1) | ((sa_sint_t)(T[n - 2] < T[n - 1]) << (SAINT_BIT - 1)); + + if (threads == 1 || n < 65536) + { + libsais16_partial_sorting_scan_left_to_right_32s_1k(T, SA, buckets, 0, n); + } +#if 
defined(_OPENMP) + else + { + fast_sint_t block_start, block_end; + for (block_start = 0; block_start < n; block_start = block_end) + { + block_end = block_start + (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end > n) { block_end = n; } + + libsais16_partial_sorting_scan_left_to_right_32s_1k_block_omp(T, SA, buckets, thread_state[0].state.cache, block_start, block_end - block_start, threads); + } + } +#else + UNUSED(thread_state); +#endif +} + +static void libsais16_partial_sorting_shift_markers_16u_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, const sa_sint_t * RESTRICT buckets, sa_sint_t threads) +{ + const fast_sint_t prefetch_distance = 32; + + const sa_sint_t * RESTRICT temp_bucket = &buckets[4 * ALPHABET_SIZE]; + + fast_sint_t c; + +#if defined(_OPENMP) + #pragma omp parallel for schedule(static, 1) num_threads(threads) if(threads > 1 && n >= 65536) +#else + UNUSED(threads); UNUSED(n); +#endif + for (c = BUCKETS_INDEX2(ALPHABET_SIZE - 1, 0); c >= BUCKETS_INDEX2(1, 0); c -= BUCKETS_INDEX2(1, 0)) + { + fast_sint_t i, j; sa_sint_t s = SAINT_MIN; + for (i = (fast_sint_t)temp_bucket[c] - 1, j = (fast_sint_t)buckets[c - BUCKETS_INDEX2(1, 0)] + 3; i >= j; i -= 4) + { + libsais16_prefetchw(&SA[i - prefetch_distance]); + + sa_sint_t p0 = SA[i - 0], q0 = (p0 & SAINT_MIN) ^ s; s = s ^ q0; SA[i - 0] = p0 ^ q0; + sa_sint_t p1 = SA[i - 1], q1 = (p1 & SAINT_MIN) ^ s; s = s ^ q1; SA[i - 1] = p1 ^ q1; + sa_sint_t p2 = SA[i - 2], q2 = (p2 & SAINT_MIN) ^ s; s = s ^ q2; SA[i - 2] = p2 ^ q2; + sa_sint_t p3 = SA[i - 3], q3 = (p3 & SAINT_MIN) ^ s; s = s ^ q3; SA[i - 3] = p3 ^ q3; + } + + for (j -= 3; i >= j; i -= 1) + { + sa_sint_t p = SA[i], q = (p & SAINT_MIN) ^ s; s = s ^ q; SA[i] = p ^ q; + } + } +} + +static void libsais16_partial_sorting_shift_markers_32s_6k_omp(sa_sint_t * RESTRICT SA, sa_sint_t k, const sa_sint_t * RESTRICT buckets, sa_sint_t threads) +{ + const fast_sint_t prefetch_distance = 32; + + const sa_sint_t * RESTRICT temp_bucket = &buckets[4 * k]; + + fast_sint_t c; + +#if defined(_OPENMP) + #pragma omp parallel for schedule(static, 1) num_threads(threads) if(threads > 1 && k >= 65536) +#else + UNUSED(threads); +#endif + for (c = (fast_sint_t)k - 1; c >= 1; c -= 1) + { + fast_sint_t i, j; sa_sint_t s = SAINT_MIN; + for (i = (fast_sint_t)buckets[BUCKETS_INDEX4(c, 0)] - 1, j = (fast_sint_t)temp_bucket[BUCKETS_INDEX2(c - 1, 0)] + 3; i >= j; i -= 4) + { + libsais16_prefetchw(&SA[i - prefetch_distance]); + + sa_sint_t p0 = SA[i - 0], q0 = (p0 & SAINT_MIN) ^ s; s = s ^ q0; SA[i - 0] = p0 ^ q0; + sa_sint_t p1 = SA[i - 1], q1 = (p1 & SAINT_MIN) ^ s; s = s ^ q1; SA[i - 1] = p1 ^ q1; + sa_sint_t p2 = SA[i - 2], q2 = (p2 & SAINT_MIN) ^ s; s = s ^ q2; SA[i - 2] = p2 ^ q2; + sa_sint_t p3 = SA[i - 3], q3 = (p3 & SAINT_MIN) ^ s; s = s ^ q3; SA[i - 3] = p3 ^ q3; + } + + for (j -= 3; i >= j; i -= 1) + { + sa_sint_t p = SA[i], q = (p & SAINT_MIN) ^ s; s = s ^ q; SA[i] = p ^ q; + } + } +} + +static void libsais16_partial_sorting_shift_markers_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t n) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i; sa_sint_t s = SUFFIX_GROUP_MARKER; + for (i = (fast_sint_t)n - 1; i >= 3; i -= 4) + { + libsais16_prefetchw(&SA[i - prefetch_distance]); + + sa_sint_t p0 = SA[i - 0], q0 = ((p0 & SUFFIX_GROUP_MARKER) ^ s) & ((sa_sint_t)(p0 > 0) << ((SUFFIX_GROUP_BIT - 1))); s = s ^ q0; SA[i - 0] = p0 ^ q0; + sa_sint_t p1 = SA[i - 1], q1 = ((p1 & SUFFIX_GROUP_MARKER) ^ s) & ((sa_sint_t)(p1 > 0) << ((SUFFIX_GROUP_BIT - 1))); s = s ^ q1; SA[i - 1] = p1 ^ q1; + 
sa_sint_t p2 = SA[i - 2], q2 = ((p2 & SUFFIX_GROUP_MARKER) ^ s) & ((sa_sint_t)(p2 > 0) << ((SUFFIX_GROUP_BIT - 1))); s = s ^ q2; SA[i - 2] = p2 ^ q2; + sa_sint_t p3 = SA[i - 3], q3 = ((p3 & SUFFIX_GROUP_MARKER) ^ s) & ((sa_sint_t)(p3 > 0) << ((SUFFIX_GROUP_BIT - 1))); s = s ^ q3; SA[i - 3] = p3 ^ q3; + } + + for (; i >= 0; i -= 1) + { + sa_sint_t p = SA[i], q = ((p & SUFFIX_GROUP_MARKER) ^ s) & ((sa_sint_t)(p > 0) << ((SUFFIX_GROUP_BIT - 1))); s = s ^ q; SA[i] = p ^ q; + } +} + +static void libsais16_partial_sorting_shift_buckets_32s_6k(sa_sint_t k, sa_sint_t * RESTRICT buckets) +{ + sa_sint_t * RESTRICT temp_bucket = &buckets[4 * k]; + + fast_sint_t i; + for (i = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX2(1, 0)) + { + buckets[2 * i + BUCKETS_INDEX4(0, 0)] = temp_bucket[i + BUCKETS_INDEX2(0, 0)]; + buckets[2 * i + BUCKETS_INDEX4(0, 1)] = temp_bucket[i + BUCKETS_INDEX2(0, 1)]; + } +} + +static sa_sint_t libsais16_partial_sorting_scan_right_to_left_16u(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE]; + sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE]; + + fast_sint_t i, j; + for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2) + { + libsais16_prefetch(&SA[i - 2 * prefetch_distance]); + + libsais16_prefetch(&T[SA[i - prefetch_distance - 0] & SAINT_MAX] - 1); + libsais16_prefetch(&T[SA[i - prefetch_distance - 0] & SAINT_MAX] - 2); + libsais16_prefetch(&T[SA[i - prefetch_distance - 1] & SAINT_MAX] - 1); + libsais16_prefetch(&T[SA[i - prefetch_distance - 1] & SAINT_MAX] - 2); + + sa_sint_t p0 = SA[i - 0]; d += (p0 < 0); p0 &= SAINT_MAX; sa_sint_t v0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] > T[p0 - 1]); + SA[--induction_bucket[v0]] = (p0 - 1) | ((sa_sint_t)(distinct_names[v0] != d) << (SAINT_BIT - 1)); distinct_names[v0] = d; + + sa_sint_t p1 = SA[i - 1]; d += (p1 < 0); p1 &= SAINT_MAX; sa_sint_t v1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] > T[p1 - 1]); + SA[--induction_bucket[v1]] = (p1 - 1) | ((sa_sint_t)(distinct_names[v1] != d) << (SAINT_BIT - 1)); distinct_names[v1] = d; + } + + for (j -= prefetch_distance + 1; i >= j; i -= 1) + { + sa_sint_t p = SA[i]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]); + SA[--induction_bucket[v]] = (p - 1) | ((sa_sint_t)(distinct_names[v] != d) << (SAINT_BIT - 1)); distinct_names[v] = d; + } + + return d; +} + +#if defined(_OPENMP) + +static void libsais16_partial_sorting_scan_right_to_left_16u_block_prepare(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size, LIBSAIS_THREAD_STATE * RESTRICT state) +{ + const fast_sint_t prefetch_distance = 32; + + sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE]; + sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE]; + + memset(buckets, 0, 4 * ALPHABET_SIZE * sizeof(sa_sint_t)); + + fast_sint_t i, j, count = 0; sa_sint_t d = 1; + for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2) + { + libsais16_prefetch(&SA[i - 2 * prefetch_distance]); + + libsais16_prefetch(&T[SA[i - prefetch_distance - 0] & SAINT_MAX] - 1); + libsais16_prefetch(&T[SA[i - 
prefetch_distance - 0] & SAINT_MAX] - 2); + libsais16_prefetch(&T[SA[i - prefetch_distance - 1] & SAINT_MAX] - 1); + libsais16_prefetch(&T[SA[i - prefetch_distance - 1] & SAINT_MAX] - 2); + + sa_sint_t p0 = cache[count].index = SA[i - 0]; d += (p0 < 0); p0 &= SAINT_MAX; sa_sint_t v0 = cache[count++].symbol = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] > T[p0 - 1]); induction_bucket[v0]++; distinct_names[v0] = d; + sa_sint_t p1 = cache[count].index = SA[i - 1]; d += (p1 < 0); p1 &= SAINT_MAX; sa_sint_t v1 = cache[count++].symbol = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] > T[p1 - 1]); induction_bucket[v1]++; distinct_names[v1] = d; + } + + for (j -= prefetch_distance + 1; i >= j; i -= 1) + { + sa_sint_t p = cache[count].index = SA[i]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = cache[count++].symbol = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]); induction_bucket[v]++; distinct_names[v] = d; + } + + state[0].state.position = (fast_sint_t)d - 1; + state[0].state.count = count; +} + +static void libsais16_partial_sorting_scan_right_to_left_16u_block_place(sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count, sa_sint_t d) +{ + const fast_sint_t prefetch_distance = 32; + + sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE]; + sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE]; + + fast_sint_t i, j; + for (i = 0, j = count - 1; i < j; i += 2) + { + libsais16_prefetch(&cache[i + prefetch_distance]); + + sa_sint_t p0 = cache[i + 0].index; d += (p0 < 0); sa_sint_t v0 = cache[i + 0].symbol; + SA[--induction_bucket[v0]] = (p0 - 1) | ((sa_sint_t)(distinct_names[v0] != d) << (SAINT_BIT - 1)); distinct_names[v0] = d; + + sa_sint_t p1 = cache[i + 1].index; d += (p1 < 0); sa_sint_t v1 = cache[i + 1].symbol; + SA[--induction_bucket[v1]] = (p1 - 1) | ((sa_sint_t)(distinct_names[v1] != d) << (SAINT_BIT - 1)); distinct_names[v1] = d; + } + + for (j += 1; i < j; i += 1) + { + sa_sint_t p = cache[i].index; d += (p < 0); sa_sint_t v = cache[i].symbol; + SA[--induction_bucket[v]] = (p - 1) | ((sa_sint_t)(distinct_names[v] != d) << (SAINT_BIT - 1)); distinct_names[v] = d; + } +} + +static sa_sint_t libsais16_partial_sorting_scan_right_to_left_16u_block_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * ALPHABET_SIZE && omp_get_dynamic() == 0) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(thread_state); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : block_size - omp_block_start; + + omp_block_start += block_start; + + if (omp_num_threads == 1) + { + d = libsais16_partial_sorting_scan_right_to_left_16u(T, SA, buckets, d, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + libsais16_partial_sorting_scan_right_to_left_16u_block_prepare(T, SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size, &thread_state[omp_thread_num]); + } + + #pragma omp barrier + + #pragma omp master + { + sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE]; + sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE]; + + fast_sint_t t; + for (t = omp_num_threads - 1; t >= 0; --t) + { + sa_sint_t * RESTRICT temp_induction_bucket = &thread_state[t].state.buckets[0 * ALPHABET_SIZE]; + sa_sint_t * RESTRICT temp_distinct_names = &thread_state[t].state.buckets[2 * ALPHABET_SIZE]; + + fast_sint_t c; + for (c = 0; c < 2 * ALPHABET_SIZE; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_induction_bucket[c]; induction_bucket[c] = A - B; temp_induction_bucket[c] = A; } + + for (d -= 1, c = 0; c < 2 * ALPHABET_SIZE; c += 1) { sa_sint_t A = distinct_names[c], B = temp_distinct_names[c], D = B + d; distinct_names[c] = B > 0 ? D : A; temp_distinct_names[c] = A; } + d += 1 + (sa_sint_t)thread_state[t].state.position; thread_state[t].state.position = (fast_sint_t)d - thread_state[t].state.position; + } + } + + #pragma omp barrier + + { + libsais16_partial_sorting_scan_right_to_left_16u_block_place(SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count, (sa_sint_t)thread_state[omp_thread_num].state.position); + } + } +#endif + } + + return d; +} + +#endif + +static void libsais16_partial_sorting_scan_right_to_left_16u_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count, sa_sint_t d, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + fast_sint_t scan_start = (fast_sint_t)left_suffixes_count + 1; + fast_sint_t scan_end = (fast_sint_t)n - (fast_sint_t)first_lms_suffix; + + if (threads == 1 || (scan_end - scan_start) < 65536) + { + libsais16_partial_sorting_scan_right_to_left_16u(T, SA, buckets, d, scan_start, scan_end - scan_start); + } +#if defined(_OPENMP) + else + { + sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE]; + sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE]; + + fast_sint_t block_start; + for (block_start = scan_end - 1; block_start >= scan_start; ) + { + if (SA[block_start] == 0) + { + block_start--; + } + else + { + fast_sint_t block_max_end = block_start - ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads); if (block_max_end < scan_start) { block_max_end = scan_start - 1; } + fast_sint_t block_end = block_start - 1; while (block_end > block_max_end && SA[block_end] != 0) { block_end--; } + fast_sint_t block_size = block_start - block_end; + + if (block_size < 32) + { + for (; block_start > block_end; block_start -= 1) + { + sa_sint_t p = SA[block_start]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]); + SA[--induction_bucket[v]] = (p - 1) | ((sa_sint_t)(distinct_names[v] != d) << (SAINT_BIT - 1)); distinct_names[v] = d; + } + } + else + { + d = libsais16_partial_sorting_scan_right_to_left_16u_block_omp(T, SA, buckets, d, 
block_end + 1, block_size, threads, thread_state); + block_start = block_end; + } + } + } + } +#else + UNUSED(thread_state); +#endif +} + +static sa_sint_t libsais16_partial_sorting_scan_right_to_left_32s_6k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 1; i >= j; i -= 2) + { + libsais16_prefetch(&SA[i - 3 * prefetch_distance]); + + libsais16_prefetch(&T[SA[i - 2 * prefetch_distance - 0] & SAINT_MAX] - 1); + libsais16_prefetch(&T[SA[i - 2 * prefetch_distance - 0] & SAINT_MAX] - 2); + libsais16_prefetch(&T[SA[i - 2 * prefetch_distance - 1] & SAINT_MAX] - 1); + libsais16_prefetch(&T[SA[i - 2 * prefetch_distance - 1] & SAINT_MAX] - 2); + + sa_sint_t p0 = SA[i - prefetch_distance - 0] & SAINT_MAX; sa_sint_t v0 = BUCKETS_INDEX4(T[p0 - (p0 > 0)], 0); libsais16_prefetchw(&buckets[v0]); + sa_sint_t p1 = SA[i - prefetch_distance - 1] & SAINT_MAX; sa_sint_t v1 = BUCKETS_INDEX4(T[p1 - (p1 > 0)], 0); libsais16_prefetchw(&buckets[v1]); + + sa_sint_t p2 = SA[i - 0]; d += (p2 < 0); p2 &= SAINT_MAX; sa_sint_t v2 = BUCKETS_INDEX4(T[p2 - 1], T[p2 - 2] > T[p2 - 1]); + SA[--buckets[v2]] = (p2 - 1) | ((sa_sint_t)(buckets[2 + v2] != d) << (SAINT_BIT - 1)); buckets[2 + v2] = d; + + sa_sint_t p3 = SA[i - 1]; d += (p3 < 0); p3 &= SAINT_MAX; sa_sint_t v3 = BUCKETS_INDEX4(T[p3 - 1], T[p3 - 2] > T[p3 - 1]); + SA[--buckets[v3]] = (p3 - 1) | ((sa_sint_t)(buckets[2 + v3] != d) << (SAINT_BIT - 1)); buckets[2 + v3] = d; + } + + for (j -= 2 * prefetch_distance + 1; i >= j; i -= 1) + { + sa_sint_t p = SA[i]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = BUCKETS_INDEX4(T[p - 1], T[p - 2] > T[p - 1]); + SA[--buckets[v]] = (p - 1) | ((sa_sint_t)(buckets[2 + v] != d) << (SAINT_BIT - 1)); buckets[2 + v] = d; + } + + return d; +} + +static sa_sint_t libsais16_partial_sorting_scan_right_to_left_32s_4k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + sa_sint_t * RESTRICT induction_bucket = &buckets[3 * k]; + sa_sint_t * RESTRICT distinct_names = &buckets[0 * k]; + + fast_sint_t i, j; + for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 1; i >= j; i -= 2) + { + libsais16_prefetchw(&SA[i - 3 * prefetch_distance]); + + sa_sint_t s0 = SA[i - 2 * prefetch_distance - 0]; const sa_sint_t * Ts0 = &T[s0 & ~SUFFIX_GROUP_MARKER] - 1; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i - 2 * prefetch_distance - 1]; const sa_sint_t * Ts1 = &T[s1 & ~SUFFIX_GROUP_MARKER] - 1; libsais16_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16_prefetch(s1 > 0 ? 
Ts1 : NULL); + sa_sint_t s2 = SA[i - 1 * prefetch_distance - 0]; if (s2 > 0) { const fast_sint_t Ts2 = T[(s2 & ~SUFFIX_GROUP_MARKER) - 1]; libsais16_prefetchw(&induction_bucket[Ts2]); libsais16_prefetchw(&distinct_names[BUCKETS_INDEX2(Ts2, 0)]); } + sa_sint_t s3 = SA[i - 1 * prefetch_distance - 1]; if (s3 > 0) { const fast_sint_t Ts3 = T[(s3 & ~SUFFIX_GROUP_MARKER) - 1]; libsais16_prefetchw(&induction_bucket[Ts3]); libsais16_prefetchw(&distinct_names[BUCKETS_INDEX2(Ts3, 0)]); } + + sa_sint_t p0 = SA[i - 0]; + if (p0 > 0) + { + SA[i - 0] = 0; d += (p0 >> (SUFFIX_GROUP_BIT - 1)); p0 &= ~SUFFIX_GROUP_MARKER; sa_sint_t v0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] > T[p0 - 1]); + SA[--induction_bucket[T[p0 - 1]]] = (p0 - 1) | ((sa_sint_t)(T[p0 - 2] > T[p0 - 1]) << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v0] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v0] = d; + } + + sa_sint_t p1 = SA[i - 1]; + if (p1 > 0) + { + SA[i - 1] = 0; d += (p1 >> (SUFFIX_GROUP_BIT - 1)); p1 &= ~SUFFIX_GROUP_MARKER; sa_sint_t v1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] > T[p1 - 1]); + SA[--induction_bucket[T[p1 - 1]]] = (p1 - 1) | ((sa_sint_t)(T[p1 - 2] > T[p1 - 1]) << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v1] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v1] = d; + } + } + + for (j -= 2 * prefetch_distance + 1; i >= j; i -= 1) + { + sa_sint_t p = SA[i]; + if (p > 0) + { + SA[i] = 0; d += (p >> (SUFFIX_GROUP_BIT - 1)); p &= ~SUFFIX_GROUP_MARKER; sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]); + SA[--induction_bucket[T[p - 1]]] = (p - 1) | ((sa_sint_t)(T[p - 2] > T[p - 1]) << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v] = d; + } + } + + return d; +} + +static void libsais16_partial_sorting_scan_right_to_left_32s_1k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 1; i >= j; i -= 2) + { + libsais16_prefetchw(&SA[i - 3 * prefetch_distance]); + + sa_sint_t s0 = SA[i - 2 * prefetch_distance - 0]; const sa_sint_t * Ts0 = &T[s0] - 1; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i - 2 * prefetch_distance - 1]; const sa_sint_t * Ts1 = &T[s1] - 1; libsais16_prefetch(s1 > 0 ? 
Ts1 : NULL); + sa_sint_t s2 = SA[i - 1 * prefetch_distance - 0]; if (s2 > 0) { libsais16_prefetchw(&induction_bucket[T[s2 - 1]]); libsais16_prefetch(&T[s2] - 2); } + sa_sint_t s3 = SA[i - 1 * prefetch_distance - 1]; if (s3 > 0) { libsais16_prefetchw(&induction_bucket[T[s3 - 1]]); libsais16_prefetch(&T[s3] - 2); } + + sa_sint_t p0 = SA[i - 0]; if (p0 > 0) { SA[i - 0] = 0; SA[--induction_bucket[T[p0 - 1]]] = (p0 - 1) | ((sa_sint_t)(T[p0 - 2] > T[p0 - 1]) << (SAINT_BIT - 1)); } + sa_sint_t p1 = SA[i - 1]; if (p1 > 0) { SA[i - 1] = 0; SA[--induction_bucket[T[p1 - 1]]] = (p1 - 1) | ((sa_sint_t)(T[p1 - 2] > T[p1 - 1]) << (SAINT_BIT - 1)); } + } + + for (j -= 2 * prefetch_distance + 1; i >= j; i -= 1) + { + sa_sint_t p = SA[i]; if (p > 0) { SA[i] = 0; SA[--induction_bucket[T[p - 1]]] = (p - 1) | ((sa_sint_t)(T[p - 2] > T[p - 1]) << (SAINT_BIT - 1)); } + } +} + +#if defined(_OPENMP) + +static void libsais16_partial_sorting_scan_right_to_left_32s_6k_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) + { + libsais16_prefetch(&SA[i + 2 * prefetch_distance]); + + libsais16_prefetch(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 1); + libsais16_prefetch(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 2); + libsais16_prefetch(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 1); + libsais16_prefetch(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 2); + + libsais16_prefetchw(&cache[i + prefetch_distance]); + + sa_sint_t p0 = cache[i + 0].index = SA[i + 0]; sa_sint_t symbol0 = 0; p0 &= SAINT_MAX; if (p0 != 0) { symbol0 = BUCKETS_INDEX4(T[p0 - 1], T[p0 - 2] > T[p0 - 1]); } cache[i + 0].symbol = symbol0; + sa_sint_t p1 = cache[i + 1].index = SA[i + 1]; sa_sint_t symbol1 = 0; p1 &= SAINT_MAX; if (p1 != 0) { symbol1 = BUCKETS_INDEX4(T[p1 - 1], T[p1 - 2] > T[p1 - 1]); } cache[i + 1].symbol = symbol1; + } + + for (j += prefetch_distance + 1; i < j; i += 1) + { + sa_sint_t p = cache[i].index = SA[i]; sa_sint_t symbol = 0; p &= SAINT_MAX; if (p != 0) { symbol = BUCKETS_INDEX4(T[p - 1], T[p - 2] > T[p - 1]); } cache[i].symbol = symbol; + } +} + +static void libsais16_partial_sorting_scan_right_to_left_32s_4k_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) + { + libsais16_prefetchw(&SA[i + 2 * prefetch_distance]); + + sa_sint_t s0 = SA[i + prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0 & ~SUFFIX_GROUP_MARKER] - 1; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i + prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1 & ~SUFFIX_GROUP_MARKER] - 1; libsais16_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16_prefetch(s1 > 0 ? 
Ts1 : NULL); + + libsais16_prefetchw(&cache[i + prefetch_distance]); + + sa_sint_t symbol0 = SAINT_MIN, p0 = SA[i + 0]; if (p0 > 0) { SA[i + 0] = 0; cache[i + 0].index = p0; p0 &= ~SUFFIX_GROUP_MARKER; symbol0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] > T[p0 - 1]); } cache[i + 0].symbol = symbol0; + sa_sint_t symbol1 = SAINT_MIN, p1 = SA[i + 1]; if (p1 > 0) { SA[i + 1] = 0; cache[i + 1].index = p1; p1 &= ~SUFFIX_GROUP_MARKER; symbol1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] > T[p1 - 1]); } cache[i + 1].symbol = symbol1; + } + + for (j += prefetch_distance + 1; i < j; i += 1) + { + sa_sint_t symbol = SAINT_MIN, p = SA[i]; if (p > 0) { SA[i] = 0; cache[i].index = p; p &= ~SUFFIX_GROUP_MARKER; symbol = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]); } cache[i].symbol = symbol; + } +} + +static void libsais16_partial_sorting_scan_right_to_left_32s_1k_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) + { + libsais16_prefetchw(&SA[i + 2 * prefetch_distance]); + + sa_sint_t s0 = SA[i + prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0] - 1; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i + prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1] - 1; libsais16_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16_prefetch(s1 > 0 ? Ts1 : NULL); + + libsais16_prefetchw(&cache[i + prefetch_distance]); + + sa_sint_t symbol0 = SAINT_MIN, p0 = SA[i + 0]; if (p0 > 0) { SA[i + 0] = 0; cache[i + 0].index = (p0 - 1) | ((sa_sint_t)(T[p0 - 2] > T[p0 - 1]) << (SAINT_BIT - 1)); symbol0 = T[p0 - 1]; } cache[i + 0].symbol = symbol0; + sa_sint_t symbol1 = SAINT_MIN, p1 = SA[i + 1]; if (p1 > 0) { SA[i + 1] = 0; cache[i + 1].index = (p1 - 1) | ((sa_sint_t)(T[p1 - 2] > T[p1 - 1]) << (SAINT_BIT - 1)); symbol1 = T[p1 - 1]; } cache[i + 1].symbol = symbol1; + } + + for (j += prefetch_distance + 1; i < j; i += 1) + { + sa_sint_t symbol = SAINT_MIN, p = SA[i]; if (p > 0) { SA[i] = 0; cache[i].index = (p - 1) | ((sa_sint_t)(T[p - 2] > T[p - 1]) << (SAINT_BIT - 1)); symbol = T[p - 1]; } cache[i].symbol = symbol; + } +} + +static sa_sint_t libsais16_partial_sorting_scan_right_to_left_32s_6k_block_sort(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2) + { + libsais16_prefetchw(&cache[i - 2 * prefetch_distance]); + + libsais16_prefetchw(&buckets[cache[i - prefetch_distance - 0].symbol]); + libsais16_prefetchw(&buckets[cache[i - prefetch_distance - 1].symbol]); + + sa_sint_t v0 = cache[i - 0].symbol, p0 = cache[i - 0].index; d += (p0 < 0); cache[i - 0].symbol = --buckets[v0]; cache[i - 0].index = (p0 - 1) | ((sa_sint_t)(buckets[2 + v0] != d) << (SAINT_BIT - 1)); buckets[2 + v0] = d; + if (cache[i - 0].symbol >= omp_block_start) { sa_sint_t s = cache[i - 0].symbol, q = (cache[s].index = cache[i - 0].index) & SAINT_MAX; cache[s].symbol = BUCKETS_INDEX4(T[q - 1], T[q - 2] > T[q - 1]); } + + sa_sint_t v1 = cache[i - 1].symbol, p1 = cache[i - 1].index; d += (p1 < 0); cache[i - 1].symbol = --buckets[v1]; 
cache[i - 1].index = (p1 - 1) | ((sa_sint_t)(buckets[2 + v1] != d) << (SAINT_BIT - 1)); buckets[2 + v1] = d; + if (cache[i - 1].symbol >= omp_block_start) { sa_sint_t s = cache[i - 1].symbol, q = (cache[s].index = cache[i - 1].index) & SAINT_MAX; cache[s].symbol = BUCKETS_INDEX4(T[q - 1], T[q - 2] > T[q - 1]); } + } + + for (j -= prefetch_distance + 1; i >= j; i -= 1) + { + sa_sint_t v = cache[i].symbol, p = cache[i].index; d += (p < 0); cache[i].symbol = --buckets[v]; cache[i].index = (p - 1) | ((sa_sint_t)(buckets[2 + v] != d) << (SAINT_BIT - 1)); buckets[2 + v] = d; + if (cache[i].symbol >= omp_block_start) { sa_sint_t s = cache[i].symbol, q = (cache[s].index = cache[i].index) & SAINT_MAX; cache[s].symbol = BUCKETS_INDEX4(T[q - 1], T[q - 2] > T[q - 1]); } + } + + return d; +} + +static sa_sint_t libsais16_partial_sorting_scan_right_to_left_32s_4k_block_sort(const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + sa_sint_t * RESTRICT induction_bucket = &buckets[3 * k]; + sa_sint_t * RESTRICT distinct_names = &buckets[0 * k]; + + fast_sint_t i, j; + for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2) + { + libsais16_prefetchw(&cache[i - 2 * prefetch_distance]); + + sa_sint_t s0 = cache[i - prefetch_distance - 0].symbol; const sa_sint_t * Is0 = &induction_bucket[s0 >> 1]; libsais16_prefetchw(s0 >= 0 ? Is0 : NULL); const sa_sint_t * Ds0 = &distinct_names[s0]; libsais16_prefetchw(s0 >= 0 ? Ds0 : NULL); + sa_sint_t s1 = cache[i - prefetch_distance - 1].symbol; const sa_sint_t * Is1 = &induction_bucket[s1 >> 1]; libsais16_prefetchw(s1 >= 0 ? Is1 : NULL); const sa_sint_t * Ds1 = &distinct_names[s1]; libsais16_prefetchw(s1 >= 0 ? 
Ds1 : NULL); + + sa_sint_t v0 = cache[i - 0].symbol; + if (v0 >= 0) + { + sa_sint_t p0 = cache[i - 0].index; d += (p0 >> (SUFFIX_GROUP_BIT - 1)); cache[i - 0].symbol = --induction_bucket[v0 >> 1]; cache[i - 0].index = (p0 - 1) | (v0 << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v0] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v0] = d; + if (cache[i - 0].symbol >= omp_block_start) { sa_sint_t ni = cache[i - 0].symbol, np = cache[i - 0].index; if (np > 0) { cache[i - 0].index = 0; cache[ni].index = np; np &= ~SUFFIX_GROUP_MARKER; cache[ni].symbol = BUCKETS_INDEX2(T[np - 1], T[np - 2] > T[np - 1]); } } + } + + sa_sint_t v1 = cache[i - 1].symbol; + if (v1 >= 0) + { + sa_sint_t p1 = cache[i - 1].index; d += (p1 >> (SUFFIX_GROUP_BIT - 1)); cache[i - 1].symbol = --induction_bucket[v1 >> 1]; cache[i - 1].index = (p1 - 1) | (v1 << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v1] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v1] = d; + if (cache[i - 1].symbol >= omp_block_start) { sa_sint_t ni = cache[i - 1].symbol, np = cache[i - 1].index; if (np > 0) { cache[i - 1].index = 0; cache[ni].index = np; np &= ~SUFFIX_GROUP_MARKER; cache[ni].symbol = BUCKETS_INDEX2(T[np - 1], T[np - 2] > T[np - 1]); } } + } + } + + for (j -= prefetch_distance + 1; i >= j; i -= 1) + { + sa_sint_t v = cache[i].symbol; + if (v >= 0) + { + sa_sint_t p = cache[i].index; d += (p >> (SUFFIX_GROUP_BIT - 1)); cache[i].symbol = --induction_bucket[v >> 1]; cache[i].index = (p - 1) | (v << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v] = d; + if (cache[i].symbol >= omp_block_start) { sa_sint_t ni = cache[i].symbol, np = cache[i].index; if (np > 0) { cache[i].index = 0; cache[ni].index = np; np &= ~SUFFIX_GROUP_MARKER; cache[ni].symbol = BUCKETS_INDEX2(T[np - 1], T[np - 2] > T[np - 1]); } } + } + } + + return d; +} + +static void libsais16_partial_sorting_scan_right_to_left_32s_1k_block_sort(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2) + { + libsais16_prefetchw(&cache[i - 2 * prefetch_distance]); + + sa_sint_t s0 = cache[i - prefetch_distance - 0].symbol; const sa_sint_t * Is0 = &induction_bucket[s0]; libsais16_prefetchw(s0 >= 0 ? Is0 : NULL); + sa_sint_t s1 = cache[i - prefetch_distance - 1].symbol; const sa_sint_t * Is1 = &induction_bucket[s1]; libsais16_prefetchw(s1 >= 0 ? 
Is1 : NULL); + + sa_sint_t v0 = cache[i - 0].symbol; + if (v0 >= 0) + { + cache[i - 0].symbol = --induction_bucket[v0]; + if (cache[i - 0].symbol >= omp_block_start) { sa_sint_t ni = cache[i - 0].symbol, np = cache[i - 0].index; if (np > 0) { cache[i - 0].index = 0; cache[ni].index = (np - 1) | ((sa_sint_t)(T[np - 2] > T[np - 1]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np - 1]; } } + } + + sa_sint_t v1 = cache[i - 1].symbol; + if (v1 >= 0) + { + cache[i - 1].symbol = --induction_bucket[v1]; + if (cache[i - 1].symbol >= omp_block_start) { sa_sint_t ni = cache[i - 1].symbol, np = cache[i - 1].index; if (np > 0) { cache[i - 1].index = 0; cache[ni].index = (np - 1) | ((sa_sint_t)(T[np - 2] > T[np - 1]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np - 1]; }} + } + } + + for (j -= prefetch_distance + 1; i >= j; i -= 1) + { + sa_sint_t v = cache[i].symbol; + if (v >= 0) + { + cache[i].symbol = --induction_bucket[v]; + if (cache[i].symbol >= omp_block_start) { sa_sint_t ni = cache[i].symbol, np = cache[i].index; if (np > 0) { cache[i].index = 0; cache[ni].index = (np - 1) | ((sa_sint_t)(T[np - 2] > T[np - 1]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np - 1]; } } + } + } +} + +static sa_sint_t libsais16_partial_sorting_scan_right_to_left_32s_6k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(cache); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : block_size - omp_block_start; + + omp_block_start += block_start; + + if (omp_num_threads == 1) + { + d = libsais16_partial_sorting_scan_right_to_left_32s_6k(T, SA, buckets, d, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + libsais16_partial_sorting_scan_right_to_left_32s_6k_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size); + } + + #pragma omp barrier + + #pragma omp master + { + d = libsais16_partial_sorting_scan_right_to_left_32s_6k_block_sort(T, buckets, d, cache - block_start, block_start, block_size); + } + + #pragma omp barrier + + { + libsais16_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size); + } + } +#endif + } + + return d; +} + +static sa_sint_t libsais16_partial_sorting_scan_right_to_left_32s_4k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(cache); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start; + + omp_block_start += block_start; + + if (omp_num_threads == 1) + { + d = libsais16_partial_sorting_scan_right_to_left_32s_4k(T, SA, k, buckets, d, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + libsais16_partial_sorting_scan_right_to_left_32s_4k_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size); + } + + #pragma omp barrier + + #pragma omp master + { + d = libsais16_partial_sorting_scan_right_to_left_32s_4k_block_sort(T, k, buckets, d, cache - block_start, block_start, block_size); + } + + #pragma omp barrier + + { + libsais16_compact_and_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size); + } + } +#endif + } + + return d; +} + +static void libsais16_partial_sorting_scan_right_to_left_32s_1k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(cache); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : block_size - omp_block_start; + + omp_block_start += block_start; + + if (omp_num_threads == 1) + { + libsais16_partial_sorting_scan_right_to_left_32s_1k(T, SA, buckets, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + libsais16_partial_sorting_scan_right_to_left_32s_1k_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size); + } + + #pragma omp barrier + + #pragma omp master + { + libsais16_partial_sorting_scan_right_to_left_32s_1k_block_sort(T, buckets, cache - block_start, block_start, block_size); + } + + #pragma omp barrier + + { + libsais16_compact_and_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size); + } + } +#endif + } +} + +#endif + +static sa_sint_t libsais16_partial_sorting_scan_right_to_left_32s_6k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count, sa_sint_t d, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + fast_sint_t scan_start = (fast_sint_t)left_suffixes_count + 1; + fast_sint_t scan_end = (fast_sint_t)n - (fast_sint_t)first_lms_suffix; + + if (threads == 1 || (scan_end - scan_start) < 65536) + { + d = libsais16_partial_sorting_scan_right_to_left_32s_6k(T, SA, buckets, d, scan_start, scan_end - scan_start); + } +#if defined(_OPENMP) + else + { + fast_sint_t block_start, block_end; + for (block_start = scan_end - 1; block_start >= scan_start; block_start = block_end) + { + block_end = block_start - (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end < scan_start) { block_end = scan_start - 1; } + + d = libsais16_partial_sorting_scan_right_to_left_32s_6k_block_omp(T, SA, buckets, d, thread_state[0].state.cache, block_end + 1, block_start - block_end, threads); + } + } +#else + UNUSED(thread_state); +#endif + + return d; +} + +static sa_sint_t libsais16_partial_sorting_scan_right_to_left_32s_4k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + if (threads == 1 || n < 65536) + { + d = libsais16_partial_sorting_scan_right_to_left_32s_4k(T, SA, k, buckets, d, 0, n); + } +#if defined(_OPENMP) + else + { + fast_sint_t block_start, block_end; + for (block_start = (fast_sint_t)n - 1; block_start >= 0; block_start = block_end) + { + block_end = block_start - (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end < 0) { block_end = -1; } + + d = libsais16_partial_sorting_scan_right_to_left_32s_4k_block_omp(T, SA, k, buckets, d, thread_state[0].state.cache, block_end + 1, block_start - block_end, threads); + } + } +#else + UNUSED(thread_state); +#endif + + return d; +} + +static void libsais16_partial_sorting_scan_right_to_left_32s_1k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + if (threads == 1 || n < 65536) + { + libsais16_partial_sorting_scan_right_to_left_32s_1k(T, SA, buckets, 0, n); + } +#if defined(_OPENMP) + else + { + fast_sint_t block_start, block_end; + for (block_start = (fast_sint_t)n - 1; block_start >= 0; block_start = block_end) + { + block_end = block_start - (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end < 0) { block_end = -1; } + + libsais16_partial_sorting_scan_right_to_left_32s_1k_block_omp(T, SA, buckets, 
thread_state[0].state.cache, block_end + 1, block_start - block_end, threads); + } + } +#else + UNUSED(thread_state); +#endif +} + +static fast_sint_t libsais16_partial_sorting_gather_lms_suffixes_32s_4k(sa_sint_t * RESTRICT SA, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j, l; + for (i = omp_block_start, j = omp_block_start + omp_block_size - 3, l = omp_block_start; i < j; i += 4) + { + libsais16_prefetch(&SA[i + prefetch_distance]); + + sa_sint_t s0 = SA[i + 0]; SA[l] = (s0 - SUFFIX_GROUP_MARKER) & (~SUFFIX_GROUP_MARKER); l += (s0 < 0); + sa_sint_t s1 = SA[i + 1]; SA[l] = (s1 - SUFFIX_GROUP_MARKER) & (~SUFFIX_GROUP_MARKER); l += (s1 < 0); + sa_sint_t s2 = SA[i + 2]; SA[l] = (s2 - SUFFIX_GROUP_MARKER) & (~SUFFIX_GROUP_MARKER); l += (s2 < 0); + sa_sint_t s3 = SA[i + 3]; SA[l] = (s3 - SUFFIX_GROUP_MARKER) & (~SUFFIX_GROUP_MARKER); l += (s3 < 0); + } + + for (j += 3; i < j; i += 1) + { + sa_sint_t s = SA[i]; SA[l] = (s - SUFFIX_GROUP_MARKER) & (~SUFFIX_GROUP_MARKER); l += (s < 0); + } + + return l; +} + +static fast_sint_t libsais16_partial_sorting_gather_lms_suffixes_32s_1k(sa_sint_t * RESTRICT SA, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j, l; + for (i = omp_block_start, j = omp_block_start + omp_block_size - 3, l = omp_block_start; i < j; i += 4) + { + libsais16_prefetch(&SA[i + prefetch_distance]); + + sa_sint_t s0 = SA[i + 0]; SA[l] = s0 & SAINT_MAX; l += (s0 < 0); + sa_sint_t s1 = SA[i + 1]; SA[l] = s1 & SAINT_MAX; l += (s1 < 0); + sa_sint_t s2 = SA[i + 2]; SA[l] = s2 & SAINT_MAX; l += (s2 < 0); + sa_sint_t s3 = SA[i + 3]; SA[l] = s3 & SAINT_MAX; l += (s3 < 0); + } + + for (j += 3; i < j; i += 1) + { + sa_sint_t s = SA[i]; SA[l] = s & SAINT_MAX; l += (s < 0); + } + + return l; +} + +static void libsais16_partial_sorting_gather_lms_suffixes_32s_4k_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(thread_state); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : n - omp_block_start; + + if (omp_num_threads == 1) + { + libsais16_partial_sorting_gather_lms_suffixes_32s_4k(SA, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + thread_state[omp_thread_num].state.position = omp_block_start; + thread_state[omp_thread_num].state.count = libsais16_partial_sorting_gather_lms_suffixes_32s_4k(SA, omp_block_start, omp_block_size) - omp_block_start; + } + + #pragma omp barrier + + #pragma omp master + { + fast_sint_t t, position = 0; + for (t = 0; t < omp_num_threads; ++t) + { + if (t > 0 && thread_state[t].state.count > 0) + { + memmove(&SA[position], &SA[thread_state[t].state.position], (size_t)thread_state[t].state.count * sizeof(sa_sint_t)); + } + + position += thread_state[t].state.count; + } + } + } +#endif + } +} + +static void libsais16_partial_sorting_gather_lms_suffixes_32s_1k_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(thread_state); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start; + + if (omp_num_threads == 1) + { + libsais16_partial_sorting_gather_lms_suffixes_32s_1k(SA, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + thread_state[omp_thread_num].state.position = omp_block_start; + thread_state[omp_thread_num].state.count = libsais16_partial_sorting_gather_lms_suffixes_32s_1k(SA, omp_block_start, omp_block_size) - omp_block_start; + } + + #pragma omp barrier + + #pragma omp master + { + fast_sint_t t, position = 0; + for (t = 0; t < omp_num_threads; ++t) + { + if (t > 0 && thread_state[t].state.count > 0) + { + memmove(&SA[position], &SA[thread_state[t].state.position], (size_t)thread_state[t].state.count * sizeof(sa_sint_t)); + } + + position += thread_state[t].state.count; + } + } + } +#endif + } +} + +static void libsais16_induce_partial_order_16u_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + memset(&buckets[2 * ALPHABET_SIZE], 0, 2 * ALPHABET_SIZE * sizeof(sa_sint_t)); + + sa_sint_t d = libsais16_partial_sorting_scan_left_to_right_16u_omp(T, SA, n, buckets, left_suffixes_count, 0, threads, thread_state); + libsais16_partial_sorting_shift_markers_16u_omp(SA, n, buckets, threads); + libsais16_partial_sorting_scan_right_to_left_16u_omp(T, SA, n, buckets, first_lms_suffix, left_suffixes_count, d, threads, thread_state); +} + +static void libsais16_induce_partial_order_32s_6k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + sa_sint_t d = libsais16_partial_sorting_scan_left_to_right_32s_6k_omp(T, SA, n, buckets, left_suffixes_count, 0, threads, thread_state); + 
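+    /* note (added comment): induction runs in two passes - the left-to-right scan above places L-type suffixes, then, after the marker/bucket shifts below, the right-to-left scan places S-type suffixes; 'd' carries the running distinct-name counter between the passes. */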
libsais16_partial_sorting_shift_markers_32s_6k_omp(SA, k, buckets, threads); + libsais16_partial_sorting_shift_buckets_32s_6k(k, buckets); + libsais16_partial_sorting_scan_right_to_left_32s_6k_omp(T, SA, n, buckets, first_lms_suffix, left_suffixes_count, d, threads, thread_state); +} + +static void libsais16_induce_partial_order_32s_4k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + memset(buckets, 0, 2 * (size_t)k * sizeof(sa_sint_t)); + + sa_sint_t d = libsais16_partial_sorting_scan_left_to_right_32s_4k_omp(T, SA, n, k, buckets, 0, threads, thread_state); + libsais16_partial_sorting_shift_markers_32s_4k(SA, n); + libsais16_partial_sorting_scan_right_to_left_32s_4k_omp(T, SA, n, k, buckets, d, threads, thread_state); + libsais16_partial_sorting_gather_lms_suffixes_32s_4k_omp(SA, n, threads, thread_state); +} + +static void libsais16_induce_partial_order_32s_2k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + libsais16_partial_sorting_scan_left_to_right_32s_1k_omp(T, SA, n, &buckets[1 * k], threads, thread_state); + libsais16_partial_sorting_scan_right_to_left_32s_1k_omp(T, SA, n, &buckets[0 * k], threads, thread_state); + libsais16_partial_sorting_gather_lms_suffixes_32s_1k_omp(SA, n, threads, thread_state); +} + +static void libsais16_induce_partial_order_32s_1k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + libsais16_count_suffixes_32s(T, n, k, buckets); + libsais16_initialize_buckets_start_32s_1k(k, buckets); + libsais16_partial_sorting_scan_left_to_right_32s_1k_omp(T, SA, n, buckets, threads, thread_state); + + libsais16_count_suffixes_32s(T, n, k, buckets); + libsais16_initialize_buckets_end_32s_1k(k, buckets); + libsais16_partial_sorting_scan_right_to_left_32s_1k_omp(T, SA, n, buckets, threads, thread_state); + + libsais16_partial_sorting_gather_lms_suffixes_32s_1k_omp(SA, n, threads, thread_state); +} + +static sa_sint_t libsais16_renumber_lms_suffixes_16u(sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t name, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + sa_sint_t * RESTRICT SAm = &SA[m]; + + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4) + { + libsais16_prefetch(&SA[i + 2 * prefetch_distance]); + + libsais16_prefetchw(&SAm[(SA[i + prefetch_distance + 0] & SAINT_MAX) >> 1]); + libsais16_prefetchw(&SAm[(SA[i + prefetch_distance + 1] & SAINT_MAX) >> 1]); + libsais16_prefetchw(&SAm[(SA[i + prefetch_distance + 2] & SAINT_MAX) >> 1]); + libsais16_prefetchw(&SAm[(SA[i + prefetch_distance + 3] & SAINT_MAX) >> 1]); + + sa_sint_t p0 = SA[i + 0]; SAm[(p0 & SAINT_MAX) >> 1] = name | SAINT_MIN; name += p0 < 0; + sa_sint_t p1 = SA[i + 1]; SAm[(p1 & SAINT_MAX) >> 1] = name | SAINT_MIN; name += p1 < 0; + sa_sint_t p2 = SA[i + 2]; SAm[(p2 & SAINT_MAX) >> 1] = name | SAINT_MIN; name += p2 < 0; + sa_sint_t p3 = SA[i + 3]; SAm[(p3 & SAINT_MAX) >> 1] = name | SAINT_MIN; name += p3 < 0; + } + + for (j += prefetch_distance + 3; i < j; i += 1) + { + sa_sint_t p = SA[i]; SAm[(p & SAINT_MAX) >> 1] = name | SAINT_MIN; name += p < 0; + } + + return name; +} + +static 
fast_sint_t libsais16_gather_marked_suffixes_16u(sa_sint_t * RESTRICT SA, sa_sint_t m, fast_sint_t l, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + l -= 1; + + fast_sint_t i, j; + for (i = (fast_sint_t)m + omp_block_start + omp_block_size - 1, j = (fast_sint_t)m + omp_block_start + 3; i >= j; i -= 4) + { + libsais16_prefetch(&SA[i - prefetch_distance]); + + sa_sint_t s0 = SA[i - 0]; SA[l] = s0 & SAINT_MAX; l -= s0 < 0; + sa_sint_t s1 = SA[i - 1]; SA[l] = s1 & SAINT_MAX; l -= s1 < 0; + sa_sint_t s2 = SA[i - 2]; SA[l] = s2 & SAINT_MAX; l -= s2 < 0; + sa_sint_t s3 = SA[i - 3]; SA[l] = s3 & SAINT_MAX; l -= s3 < 0; + } + + for (j -= 3; i >= j; i -= 1) + { + sa_sint_t s = SA[i]; SA[l] = s & SAINT_MAX; l -= s < 0; + } + + l += 1; + + return l; +} + +static sa_sint_t libsais16_renumber_lms_suffixes_16u_omp(sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + sa_sint_t name = 0; + +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && m >= 65536) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(thread_state); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (m / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : m - omp_block_start; + + if (omp_num_threads == 1) + { + name = libsais16_renumber_lms_suffixes_16u(SA, m, 0, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + thread_state[omp_thread_num].state.count = libsais16_count_negative_marked_suffixes(SA, omp_block_start, omp_block_size); + } + + #pragma omp barrier + + { + fast_sint_t t, count = 0; for (t = 0; t < omp_thread_num; ++t) { count += thread_state[t].state.count; } + + if (omp_thread_num == omp_num_threads - 1) + { + name = (sa_sint_t)(count + thread_state[omp_thread_num].state.count); + } + + libsais16_renumber_lms_suffixes_16u(SA, m, (sa_sint_t)count, omp_block_start, omp_block_size); + } + } +#endif + } + + return name; +} + +static void libsais16_gather_marked_lms_suffixes_16u_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 131072) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(thread_state); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (((fast_sint_t)n >> 1) / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : ((fast_sint_t)n >> 1) - omp_block_start; + + if (omp_num_threads == 1) + { + libsais16_gather_marked_suffixes_16u(SA, m, (fast_sint_t)n + (fast_sint_t)fs, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + if (omp_thread_num < omp_num_threads - 1) + { + thread_state[omp_thread_num].state.position = libsais16_gather_marked_suffixes_16u(SA, m, (fast_sint_t)m + omp_block_start + omp_block_size, omp_block_start, omp_block_size); + thread_state[omp_thread_num].state.count = (fast_sint_t)m + omp_block_start + omp_block_size - thread_state[omp_thread_num].state.position; + } + else + { + thread_state[omp_thread_num].state.position = libsais16_gather_marked_suffixes_16u(SA, m, (fast_sint_t)n + (fast_sint_t)fs, omp_block_start, omp_block_size); + thread_state[omp_thread_num].state.count = (fast_sint_t)n + (fast_sint_t)fs - thread_state[omp_thread_num].state.position; + } + } + + #pragma omp barrier + + #pragma omp master + { + fast_sint_t t, position = (fast_sint_t)n + (fast_sint_t)fs; + + for (t = omp_num_threads - 1; t >= 0; --t) + { + position -= thread_state[t].state.count; + if (t != omp_num_threads - 1 && thread_state[t].state.count > 0) + { + memmove(&SA[position], &SA[thread_state[t].state.position], (size_t)thread_state[t].state.count * sizeof(sa_sint_t)); + } + } + } + } +#endif + } +} + +static sa_sint_t libsais16_renumber_and_gather_lms_suffixes_16u_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + memset(&SA[m], 0, ((size_t)n >> 1) * sizeof(sa_sint_t)); + + sa_sint_t name = libsais16_renumber_lms_suffixes_16u_omp(SA, m, threads, thread_state); + if (name < m) + { + libsais16_gather_marked_lms_suffixes_16u_omp(SA, n, m, fs, threads, thread_state); + } + else + { + fast_sint_t i; for (i = 0; i < m; i += 1) { SA[i] &= SAINT_MAX; } + } + + return name; +} + +static sa_sint_t libsais16_renumber_distinct_lms_suffixes_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t name, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + sa_sint_t * RESTRICT SAm = &SA[m]; + + fast_sint_t i, j; sa_sint_t p0, p1, p2, p3 = 0; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4) + { + libsais16_prefetchw(&SA[i + 2 * prefetch_distance]); + + libsais16_prefetchw(&SAm[(SA[i + prefetch_distance + 0] & SAINT_MAX) >> 1]); + libsais16_prefetchw(&SAm[(SA[i + prefetch_distance + 1] & SAINT_MAX) >> 1]); + libsais16_prefetchw(&SAm[(SA[i + prefetch_distance + 2] & SAINT_MAX) >> 1]); + libsais16_prefetchw(&SAm[(SA[i + prefetch_distance + 3] & SAINT_MAX) >> 1]); + + p0 = SA[i + 0]; SAm[(SA[i + 0] = p0 & SAINT_MAX) >> 1] = name | (p0 & p3 & SAINT_MIN); name += p0 < 0; + p1 = SA[i + 1]; SAm[(SA[i + 1] = p1 & SAINT_MAX) >> 1] = name | (p1 & p0 & SAINT_MIN); name += p1 < 0; + p2 = SA[i + 2]; SAm[(SA[i + 2] = p2 & SAINT_MAX) >> 1] = name | (p2 & p1 & SAINT_MIN); name += p2 < 0; + p3 = SA[i + 3]; SAm[(SA[i + 3] = p3 & SAINT_MAX) >> 1] = name | (p3 & p2 & SAINT_MIN); name += p3 < 0; + } + + for (j += prefetch_distance + 3; i < j; i += 1) + { + p2 = p3; p3 = SA[i]; SAm[(SA[i] = p3 & SAINT_MAX) >> 1] = name | (p3 & p2 & SAINT_MIN); name += p3 < 0; + } + + return name; +} + +static void libsais16_mark_distinct_lms_suffixes_32s(sa_sint_t * RESTRICT SA, sa_sint_t m, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; 
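+    /* note (added comment): the carry p3 remembers the last non-zero entry read; an entry keeps its sign-bit marker only when that carry is itself negative, and zero entries simply forward the carry unchanged. */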
sa_sint_t p0, p1, p2, p3 = 0; + for (i = (fast_sint_t)m + omp_block_start, j = (fast_sint_t)m + omp_block_start + omp_block_size - 3; i < j; i += 4) + { + libsais16_prefetchw(&SA[i + prefetch_distance]); + + p0 = SA[i + 0]; SA[i + 0] = p0 & (p3 | SAINT_MAX); p0 = (p0 == 0) ? p3 : p0; + p1 = SA[i + 1]; SA[i + 1] = p1 & (p0 | SAINT_MAX); p1 = (p1 == 0) ? p0 : p1; + p2 = SA[i + 2]; SA[i + 2] = p2 & (p1 | SAINT_MAX); p2 = (p2 == 0) ? p1 : p2; + p3 = SA[i + 3]; SA[i + 3] = p3 & (p2 | SAINT_MAX); p3 = (p3 == 0) ? p2 : p3; + } + + for (j += 3; i < j; i += 1) + { + p2 = p3; p3 = SA[i]; SA[i] = p3 & (p2 | SAINT_MAX); p3 = (p3 == 0) ? p2 : p3; + } +} + +static void libsais16_clamp_lms_suffixes_length_32s(sa_sint_t * RESTRICT SA, sa_sint_t m, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + sa_sint_t * RESTRICT SAm = &SA[m]; + + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size - 3; i < j; i += 4) + { + libsais16_prefetchw(&SAm[i + prefetch_distance]); + + SAm[i + 0] = (SAm[i + 0] < 0 ? SAm[i + 0] : 0) & SAINT_MAX; + SAm[i + 1] = (SAm[i + 1] < 0 ? SAm[i + 1] : 0) & SAINT_MAX; + SAm[i + 2] = (SAm[i + 2] < 0 ? SAm[i + 2] : 0) & SAINT_MAX; + SAm[i + 3] = (SAm[i + 3] < 0 ? SAm[i + 3] : 0) & SAINT_MAX; + } + + for (j += 3; i < j; i += 1) + { + SAm[i] = (SAm[i] < 0 ? SAm[i] : 0) & SAINT_MAX; + } +} + +static sa_sint_t libsais16_renumber_distinct_lms_suffixes_32s_4k_omp(sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + sa_sint_t name = 0; + +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && m >= 65536) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(thread_state); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (m / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : m - omp_block_start; + + if (omp_num_threads == 1) + { + name = libsais16_renumber_distinct_lms_suffixes_32s_4k(SA, m, 1, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + thread_state[omp_thread_num].state.count = libsais16_count_negative_marked_suffixes(SA, omp_block_start, omp_block_size); + } + + #pragma omp barrier + + { + fast_sint_t t, count = 1; for (t = 0; t < omp_thread_num; ++t) { count += thread_state[t].state.count; } + + if (omp_thread_num == omp_num_threads - 1) + { + name = (sa_sint_t)(count + thread_state[omp_thread_num].state.count); + } + + libsais16_renumber_distinct_lms_suffixes_32s_4k(SA, m, (sa_sint_t)count, omp_block_start, omp_block_size); + } + } +#endif + } + + return name - 1; +} + +static void libsais16_mark_distinct_lms_suffixes_32s_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t threads) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 131072) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); + fast_sint_t omp_block_stride = (((fast_sint_t)n >> 1) / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : ((fast_sint_t)n >> 1) - omp_block_start; +#else + UNUSED(threads); + + fast_sint_t omp_block_start = 0; + fast_sint_t omp_block_size = (fast_sint_t)n >> 1; +#endif + libsais16_mark_distinct_lms_suffixes_32s(SA, m, omp_block_start, omp_block_size); + } +} + +static void libsais16_clamp_lms_suffixes_length_32s_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t threads) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 131072) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); + fast_sint_t omp_block_stride = (((fast_sint_t)n >> 1) / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : ((fast_sint_t)n >> 1) - omp_block_start; +#else + UNUSED(threads); + + fast_sint_t omp_block_start = 0; + fast_sint_t omp_block_size = (fast_sint_t)n >> 1; +#endif + libsais16_clamp_lms_suffixes_length_32s(SA, m, omp_block_start, omp_block_size); + } +} + +static sa_sint_t libsais16_renumber_and_mark_distinct_lms_suffixes_32s_4k_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + memset(&SA[m], 0, ((size_t)n >> 1) * sizeof(sa_sint_t)); + + sa_sint_t name = libsais16_renumber_distinct_lms_suffixes_32s_4k_omp(SA, m, threads, thread_state); + if (name < m) + { + libsais16_mark_distinct_lms_suffixes_32s_omp(SA, n, m, threads); + } + + return name; +} + +static sa_sint_t libsais16_renumber_and_mark_distinct_lms_suffixes_32s_1k_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t threads) +{ + const fast_sint_t prefetch_distance = 32; + + sa_sint_t * RESTRICT SAm = &SA[m]; + + { + libsais16_gather_lms_suffixes_32s(T, SA, n); + + memset(&SA[m], 0, ((size_t)n - (size_t)m - (size_t)m) * sizeof(sa_sint_t)); + + fast_sint_t i, j; + for (i = (fast_sint_t)n - (fast_sint_t)m, j = (fast_sint_t)n - 1 - prefetch_distance - 3; i < j; i += 4) + { + libsais16_prefetch(&SA[i + 2 * prefetch_distance]); + + libsais16_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 0]) >> 1]); + libsais16_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 1]) >> 1]); + libsais16_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 2]) >> 1]); + libsais16_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 3]) >> 1]); + + SAm[((sa_uint_t)SA[i + 0]) >> 1] = SA[i + 1] - SA[i + 0] + 1 + SAINT_MIN; + SAm[((sa_uint_t)SA[i + 1]) >> 1] = SA[i + 2] - SA[i + 1] + 1 + SAINT_MIN; + SAm[((sa_uint_t)SA[i + 2]) >> 1] = SA[i + 3] - SA[i + 2] + 1 + SAINT_MIN; + SAm[((sa_uint_t)SA[i + 3]) >> 1] = SA[i + 4] - SA[i + 3] + 1 + SAINT_MIN; + } + + for (j += prefetch_distance + 3; i < j; i += 1) + { + SAm[((sa_uint_t)SA[i]) >> 1] = SA[i + 1] - SA[i] + 1 + SAINT_MIN; + } + + SAm[((sa_uint_t)SA[n - 1]) >> 1] = 1 + SAINT_MIN; + } + + { + libsais16_clamp_lms_suffixes_length_32s_omp(SA, n, m, threads); + } + + sa_sint_t name = 1; + + { + fast_sint_t i, j, p = SA[0], plen = SAm[p >> 1]; sa_sint_t pdiff = SAINT_MIN; + for (i = 1, j = m - prefetch_distance - 1; i < j; i += 2) + { + libsais16_prefetch(&SA[i + 2 * prefetch_distance]); + + libsais16_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 0]) >> 1]); libsais16_prefetch(&T[((sa_uint_t)SA[i + prefetch_distance + 0])]); + libsais16_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 1]) >> 1]); 
libsais16_prefetch(&T[((sa_uint_t)SA[i + prefetch_distance + 1])]); + + fast_sint_t q = SA[i + 0], qlen = SAm[q >> 1]; sa_sint_t qdiff = SAINT_MIN; + if (plen == qlen) { fast_sint_t l = 0; do { if (T[p + l] != T[q + l]) { break; } } while (++l < qlen); qdiff = (sa_sint_t)(l - qlen) & SAINT_MIN; } + SAm[p >> 1] = name | (pdiff & qdiff); name += (qdiff < 0); + + p = SA[i + 1]; plen = SAm[p >> 1]; pdiff = SAINT_MIN; + if (qlen == plen) { fast_sint_t l = 0; do { if (T[q + l] != T[p + l]) { break; } } while (++l < plen); pdiff = (sa_sint_t)(l - plen) & SAINT_MIN; } + SAm[q >> 1] = name | (qdiff & pdiff); name += (pdiff < 0); + } + + for (j += prefetch_distance + 1; i < j; i += 1) + { + fast_sint_t q = SA[i], qlen = SAm[q >> 1]; sa_sint_t qdiff = SAINT_MIN; + if (plen == qlen) { fast_sint_t l = 0; do { if (T[p + l] != T[q + l]) { break; } } while (++l < plen); qdiff = (sa_sint_t)(l - plen) & SAINT_MIN; } + SAm[p >> 1] = name | (pdiff & qdiff); name += (qdiff < 0); + + p = q; plen = qlen; pdiff = qdiff; + } + + SAm[p >> 1] = name | pdiff; name++; + } + + if (name <= m) + { + libsais16_mark_distinct_lms_suffixes_32s_omp(SA, n, m, threads); + } + + return name - 1; +} + +static void libsais16_reconstruct_lms_suffixes(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + const sa_sint_t * RESTRICT SAnm = &SA[n - m]; + + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4) + { + libsais16_prefetchw(&SA[i + 2 * prefetch_distance]); + + libsais16_prefetch(&SAnm[SA[i + prefetch_distance + 0]]); + libsais16_prefetch(&SAnm[SA[i + prefetch_distance + 1]]); + libsais16_prefetch(&SAnm[SA[i + prefetch_distance + 2]]); + libsais16_prefetch(&SAnm[SA[i + prefetch_distance + 3]]); + + SA[i + 0] = SAnm[SA[i + 0]]; + SA[i + 1] = SAnm[SA[i + 1]]; + SA[i + 2] = SAnm[SA[i + 2]]; + SA[i + 3] = SAnm[SA[i + 3]]; + } + + for (j += prefetch_distance + 3; i < j; i += 1) + { + SA[i] = SAnm[SA[i]]; + } +} + +static void libsais16_reconstruct_lms_suffixes_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t threads) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && m >= 65536) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); + fast_sint_t omp_block_stride = (m / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : m - omp_block_start; +#else + UNUSED(threads); + + fast_sint_t omp_block_start = 0; + fast_sint_t omp_block_size = m; +#endif + + libsais16_reconstruct_lms_suffixes(SA, n, m, omp_block_start, omp_block_size); + } +} + +static void libsais16_place_lms_suffixes_interval_16u(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, const sa_sint_t * RESTRICT buckets) +{ + const sa_sint_t * RESTRICT bucket_end = &buckets[7 * ALPHABET_SIZE]; + + fast_sint_t c, j = n; + for (c = ALPHABET_SIZE - 2; c >= 0; --c) + { + fast_sint_t l = (fast_sint_t)buckets[BUCKETS_INDEX2(c, 1) + BUCKETS_INDEX2(1, 0)] - (fast_sint_t)buckets[BUCKETS_INDEX2(c, 1)]; + if (l > 0) + { + fast_sint_t i = bucket_end[c]; + if (j - i > 0) + { + memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t)); + } + + memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t)); + } + } + + memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t)); +} + +static void libsais16_place_lms_suffixes_interval_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m, const sa_sint_t * RESTRICT buckets) +{ + const sa_sint_t * RESTRICT bucket_end = &buckets[3 * k]; + + fast_sint_t c, j = n; + for (c = (fast_sint_t)k - 2; c >= 0; --c) + { + fast_sint_t l = (fast_sint_t)buckets[BUCKETS_INDEX2(c, 1) + BUCKETS_INDEX2(1, 0)] - (fast_sint_t)buckets[BUCKETS_INDEX2(c, 1)]; + if (l > 0) + { + fast_sint_t i = bucket_end[c]; + if (j - i > 0) + { + memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t)); + } + + memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t)); + } + } + + memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t)); +} + +static void libsais16_place_lms_suffixes_interval_32s_2k(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m, const sa_sint_t * RESTRICT buckets) +{ + fast_sint_t j = n; + + if (k > 1) + { + fast_sint_t c; + for (c = BUCKETS_INDEX2((fast_sint_t)k - 2, 0); c >= BUCKETS_INDEX2(0, 0); c -= BUCKETS_INDEX2(1, 0)) + { + fast_sint_t l = (fast_sint_t)buckets[c + BUCKETS_INDEX2(1, 1)] - (fast_sint_t)buckets[c + BUCKETS_INDEX2(0, 1)]; + if (l > 0) + { + fast_sint_t i = buckets[c]; + if (j - i > 0) + { + memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t)); + } + + memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t)); + } + } + } + + memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t)); +} + +static void libsais16_place_lms_suffixes_interval_32s_1k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t m, sa_sint_t * RESTRICT buckets) +{ + const fast_sint_t prefetch_distance = 32; + + sa_sint_t c = k - 1; fast_sint_t i, l = buckets[c]; + for (i = (fast_sint_t)m - 1; i >= prefetch_distance + 3; i -= 4) + { + libsais16_prefetch(&SA[i - 2 * prefetch_distance]); + + libsais16_prefetch(&T[SA[i - prefetch_distance - 0]]); + libsais16_prefetch(&T[SA[i - prefetch_distance - 1]]); + libsais16_prefetch(&T[SA[i - prefetch_distance - 2]]); + libsais16_prefetch(&T[SA[i - prefetch_distance - 3]]); + + sa_sint_t p0 = SA[i - 0]; if (T[p0] != c) { c = T[p0]; memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(sa_sint_t)); l = buckets[c]; } SA[--l] = p0; + sa_sint_t p1 = SA[i - 1]; if (T[p1] != c) { c = T[p1]; memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(sa_sint_t)); l = buckets[c]; } SA[--l] = p1; + sa_sint_t p2 = SA[i - 2]; if (T[p2] != c) { c = T[p2]; memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(sa_sint_t)); l = buckets[c]; } SA[--l] = p2; + sa_sint_t p3 = SA[i - 3]; if (T[p3] != c) { c = T[p3]; 
memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(sa_sint_t)); l = buckets[c]; } SA[--l] = p3; + } + + for (; i >= 0; i -= 1) + { + sa_sint_t p = SA[i]; if (T[p] != c) { c = T[p]; memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(sa_sint_t)); l = buckets[c]; } SA[--l] = p; + } + + memset(&SA[0], 0, (size_t)l * sizeof(sa_sint_t)); +} + +static void libsais16_place_lms_suffixes_histogram_32s_6k(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m, const sa_sint_t * RESTRICT buckets) +{ + const sa_sint_t * RESTRICT bucket_end = &buckets[5 * k]; + + fast_sint_t c, j = n; + for (c = (fast_sint_t)k - 2; c >= 0; --c) + { + fast_sint_t l = (fast_sint_t)buckets[BUCKETS_INDEX4(c, 1)]; + if (l > 0) + { + fast_sint_t i = bucket_end[c]; + if (j - i > 0) + { + memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t)); + } + + memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t)); + } + } + + memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t)); +} + +static void libsais16_place_lms_suffixes_histogram_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m, const sa_sint_t * RESTRICT buckets) +{ + const sa_sint_t * RESTRICT bucket_end = &buckets[3 * k]; + + fast_sint_t c, j = n; + for (c = (fast_sint_t)k - 2; c >= 0; --c) + { + fast_sint_t l = (fast_sint_t)buckets[BUCKETS_INDEX2(c, 1)]; + if (l > 0) + { + fast_sint_t i = bucket_end[c]; + if (j - i > 0) + { + memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t)); + } + + memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t)); + } + } + + memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t)); +} + +static void libsais16_place_lms_suffixes_histogram_32s_2k(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m, const sa_sint_t * RESTRICT buckets) +{ + fast_sint_t j = n; + + if (k > 1) + { + fast_sint_t c; + for (c = BUCKETS_INDEX2((fast_sint_t)k - 2, 0); c >= BUCKETS_INDEX2(0, 0); c -= BUCKETS_INDEX2(1, 0)) + { + fast_sint_t l = (fast_sint_t)buckets[c + BUCKETS_INDEX2(0, 1)]; + if (l > 0) + { + fast_sint_t i = buckets[c]; + if (j - i > 0) + { + memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t)); + } + + memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t)); + } + } + } + + memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t)); +} + +static void libsais16_final_bwt_scan_left_to_right_16u(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) + { + libsais16_prefetchw(&SA[i + 2 * prefetch_distance]); + + sa_sint_t s0 = SA[i + prefetch_distance + 0]; const uint16_t * Ts0 = &T[s0] - 1; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i + prefetch_distance + 1]; const uint16_t * Ts1 = &T[s1] - 1; libsais16_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16_prefetch(s1 > 0 ? 
Ts1 : NULL); + + sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; SA[i + 0] = T[p0] | SAINT_MIN; SA[induction_bucket[T[p0]]++] = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1)); } + sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; SA[i + 1] = T[p1] | SAINT_MIN; SA[induction_bucket[T[p1]]++] = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1)); } + } + + for (j += prefetch_distance + 1; i < j; i += 1) + { + sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; SA[i] = T[p] | SAINT_MIN; SA[induction_bucket[T[p]]++] = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); } + } +} + +static void libsais16_final_bwt_aux_scan_left_to_right_16u(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) + { + libsais16_prefetchw(&SA[i + 2 * prefetch_distance]); + + sa_sint_t s0 = SA[i + prefetch_distance + 0]; const uint16_t * Ts0 = &T[s0] - 1; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i + prefetch_distance + 1]; const uint16_t * Ts1 = &T[s1] - 1; libsais16_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16_prefetch(s1 > 0 ? Ts1 : NULL); + + sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; SA[i + 0] = T[p0] | SAINT_MIN; SA[induction_bucket[T[p0]]++] = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1)); if ((p0 & rm) == 0) { I[p0 / (rm + 1)] = induction_bucket[T[p0]]; }} + sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; SA[i + 1] = T[p1] | SAINT_MIN; SA[induction_bucket[T[p1]]++] = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1)); if ((p1 & rm) == 0) { I[p1 / (rm + 1)] = induction_bucket[T[p1]]; }} + } + + for (j += prefetch_distance + 1; i < j; i += 1) + { + sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; SA[i] = T[p] | SAINT_MIN; SA[induction_bucket[T[p]]++] = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); if ((p & rm) == 0) { I[p / (rm + 1)] = induction_bucket[T[p]]; } } + } +} + +static void libsais16_final_sorting_scan_left_to_right_16u(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) + { + libsais16_prefetchw(&SA[i + 2 * prefetch_distance]); + + sa_sint_t s0 = SA[i + prefetch_distance + 0]; const uint16_t * Ts0 = &T[s0] - 1; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i + prefetch_distance + 1]; const uint16_t * Ts1 = &T[s1] - 1; libsais16_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16_prefetch(s1 > 0 ? 
Ts1 : NULL); + + sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 ^ SAINT_MIN; if (p0 > 0) { p0--; SA[induction_bucket[T[p0]]++] = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1)); } + sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 ^ SAINT_MIN; if (p1 > 0) { p1--; SA[induction_bucket[T[p1]]++] = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1)); } + } + + for (j += prefetch_distance + 1; i < j; i += 1) + { + sa_sint_t p = SA[i]; SA[i] = p ^ SAINT_MIN; if (p > 0) { p--; SA[induction_bucket[T[p]]++] = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); } + } +} + +static void libsais16_final_sorting_scan_left_to_right_32s(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1; i < j; i += 2) + { + libsais16_prefetchw(&SA[i + 3 * prefetch_distance]); + + sa_sint_t s0 = SA[i + 2 * prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0] - 1; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i + 2 * prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1] - 1; libsais16_prefetch(s1 > 0 ? Ts1 : NULL); + sa_sint_t s2 = SA[i + 1 * prefetch_distance + 0]; if (s2 > 0) { libsais16_prefetchw(&induction_bucket[T[s2 - 1]]); libsais16_prefetch(&T[s2] - 2); } + sa_sint_t s3 = SA[i + 1 * prefetch_distance + 1]; if (s3 > 0) { libsais16_prefetchw(&induction_bucket[T[s3 - 1]]); libsais16_prefetch(&T[s3] - 2); } + + sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 ^ SAINT_MIN; if (p0 > 0) { p0--; SA[induction_bucket[T[p0]]++] = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1)); } + sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 ^ SAINT_MIN; if (p1 > 0) { p1--; SA[induction_bucket[T[p1]]++] = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1)); } + } + + for (j += 2 * prefetch_distance + 1; i < j; i += 1) + { + sa_sint_t p = SA[i]; SA[i] = p ^ SAINT_MIN; if (p > 0) { p--; SA[induction_bucket[T[p]]++] = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); } + } +} + +#if defined(_OPENMP) + +static fast_sint_t libsais16_final_bwt_scan_left_to_right_16u_block_prepare(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + memset(buckets, 0, ALPHABET_SIZE * sizeof(sa_sint_t)); + + fast_sint_t i, j, count = 0; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) + { + libsais16_prefetchw(&SA[i + 2 * prefetch_distance]); + + sa_sint_t s0 = SA[i + prefetch_distance + 0]; const uint16_t * Ts0 = &T[s0] - 1; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i + prefetch_distance + 1]; const uint16_t * Ts1 = &T[s1] - 1; libsais16_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16_prefetch(s1 > 0 ? 
Ts1 : NULL); + + sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; SA[i + 0] = T[p0] | SAINT_MIN; buckets[cache[count].symbol = T[p0]]++; cache[count++].index = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1)); } + sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; SA[i + 1] = T[p1] | SAINT_MIN; buckets[cache[count].symbol = T[p1]]++; cache[count++].index = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1)); } + } + + for (j += prefetch_distance + 1; i < j; i += 1) + { + sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; SA[i] = T[p] | SAINT_MIN; buckets[cache[count].symbol = T[p]]++; cache[count++].index = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); } + } + + return count; +} + +static fast_sint_t libsais16_final_sorting_scan_left_to_right_16u_block_prepare(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + memset(buckets, 0, ALPHABET_SIZE * sizeof(sa_sint_t)); + + fast_sint_t i, j, count = 0; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) + { + libsais16_prefetchw(&SA[i + 2 * prefetch_distance]); + + sa_sint_t s0 = SA[i + prefetch_distance + 0]; const uint16_t * Ts0 = &T[s0] - 1; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i + prefetch_distance + 1]; const uint16_t * Ts1 = &T[s1] - 1; libsais16_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16_prefetch(s1 > 0 ? Ts1 : NULL); + + sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 ^ SAINT_MIN; if (p0 > 0) { p0--; buckets[cache[count].symbol = T[p0]]++; cache[count++].index = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1)); } + sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 ^ SAINT_MIN; if (p1 > 0) { p1--; buckets[cache[count].symbol = T[p1]]++; cache[count++].index = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1)); } + } + + for (j += prefetch_distance + 1; i < j; i += 1) + { + sa_sint_t p = SA[i]; SA[i] = p ^ SAINT_MIN; if (p > 0) { p--; buckets[cache[count].symbol = T[p]]++; cache[count++].index = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); } + } + + return count; +} + +static void libsais16_final_order_scan_left_to_right_16u_block_place(sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = 0, j = count - 3; i < j; i += 4) + { + libsais16_prefetch(&cache[i + prefetch_distance]); + + SA[buckets[cache[i + 0].symbol]++] = cache[i + 0].index; + SA[buckets[cache[i + 1].symbol]++] = cache[i + 1].index; + SA[buckets[cache[i + 2].symbol]++] = cache[i + 2].index; + SA[buckets[cache[i + 3].symbol]++] = cache[i + 3].index; + } + + for (j += 3; i < j; i += 1) + { + SA[buckets[cache[i].symbol]++] = cache[i].index; + } +} + +static void libsais16_final_bwt_aux_scan_left_to_right_16u_block_place(sa_sint_t * RESTRICT SA, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = 0, j = count - 3; i < j; i += 4) + { + libsais16_prefetch(&cache[i + prefetch_distance]); + + SA[buckets[cache[i + 0].symbol]++] = cache[i + 0].index; if ((cache[i + 
0].index & rm) == 0) { I[(cache[i + 0].index & SAINT_MAX) / (rm + 1)] = buckets[cache[i + 0].symbol]; } + SA[buckets[cache[i + 1].symbol]++] = cache[i + 1].index; if ((cache[i + 1].index & rm) == 0) { I[(cache[i + 1].index & SAINT_MAX) / (rm + 1)] = buckets[cache[i + 1].symbol]; } + SA[buckets[cache[i + 2].symbol]++] = cache[i + 2].index; if ((cache[i + 2].index & rm) == 0) { I[(cache[i + 2].index & SAINT_MAX) / (rm + 1)] = buckets[cache[i + 2].symbol]; } + SA[buckets[cache[i + 3].symbol]++] = cache[i + 3].index; if ((cache[i + 3].index & rm) == 0) { I[(cache[i + 3].index & SAINT_MAX) / (rm + 1)] = buckets[cache[i + 3].symbol]; } + } + + for (j += 3; i < j; i += 1) + { + SA[buckets[cache[i].symbol]++] = cache[i].index; if ((cache[i].index & rm) == 0) { I[(cache[i].index & SAINT_MAX) / (rm + 1)] = buckets[cache[i].symbol]; } + } +} + +static void libsais16_final_sorting_scan_left_to_right_32s_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) + { + libsais16_prefetchw(&SA[i + 2 * prefetch_distance]); + + sa_sint_t s0 = SA[i + prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0] - 1; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i + prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1] - 1; libsais16_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16_prefetch(s1 > 0 ? Ts1 : NULL); + + libsais16_prefetchw(&cache[i + prefetch_distance]); + + sa_sint_t symbol0 = SAINT_MIN, p0 = SA[i + 0]; SA[i + 0] = p0 ^ SAINT_MIN; if (p0 > 0) { p0--; cache[i + 0].index = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1)); symbol0 = T[p0]; } cache[i + 0].symbol = symbol0; + sa_sint_t symbol1 = SAINT_MIN, p1 = SA[i + 1]; SA[i + 1] = p1 ^ SAINT_MIN; if (p1 > 0) { p1--; cache[i + 1].index = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1)); symbol1 = T[p1]; } cache[i + 1].symbol = symbol1; + } + + for (j += prefetch_distance + 1; i < j; i += 1) + { + sa_sint_t symbol = SAINT_MIN, p = SA[i]; SA[i] = p ^ SAINT_MIN; if (p > 0) { p--; cache[i].index = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); symbol = T[p]; } cache[i].symbol = symbol; + } +} + +static void libsais16_final_sorting_scan_left_to_right_32s_block_sort(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j, omp_block_end = omp_block_start + omp_block_size; + for (i = omp_block_start, j = omp_block_end - prefetch_distance - 1; i < j; i += 2) + { + libsais16_prefetchw(&cache[i + 2 * prefetch_distance]); + + sa_sint_t s0 = cache[i + prefetch_distance + 0].symbol; const sa_sint_t * Is0 = &induction_bucket[s0]; libsais16_prefetchw(s0 >= 0 ? Is0 : NULL); + sa_sint_t s1 = cache[i + prefetch_distance + 1].symbol; const sa_sint_t * Is1 = &induction_bucket[s1]; libsais16_prefetchw(s1 >= 0 ? 
Is1 : NULL); + + sa_sint_t v0 = cache[i + 0].symbol; + if (v0 >= 0) + { + cache[i + 0].symbol = induction_bucket[v0]++; + if (cache[i + 0].symbol < omp_block_end) { sa_sint_t ni = cache[i + 0].symbol, np = cache[i + 0].index; cache[i + 0].index = np ^ SAINT_MIN; if (np > 0) { np--; cache[ni].index = np | ((sa_sint_t)(T[np - (np > 0)] < T[np]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np]; } } + } + + sa_sint_t v1 = cache[i + 1].symbol; + if (v1 >= 0) + { + cache[i + 1].symbol = induction_bucket[v1]++; + if (cache[i + 1].symbol < omp_block_end) { sa_sint_t ni = cache[i + 1].symbol, np = cache[i + 1].index; cache[i + 1].index = np ^ SAINT_MIN; if (np > 0) { np--; cache[ni].index = np | ((sa_sint_t)(T[np - (np > 0)] < T[np]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np]; } } + } + } + + for (j += prefetch_distance + 1; i < j; i += 1) + { + sa_sint_t v = cache[i].symbol; + if (v >= 0) + { + cache[i].symbol = induction_bucket[v]++; + if (cache[i].symbol < omp_block_end) { sa_sint_t ni = cache[i].symbol, np = cache[i].index; cache[i].index = np ^ SAINT_MIN; if (np > 0) { np--; cache[ni].index = np | ((sa_sint_t)(T[np - (np > 0)] < T[np]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np]; } } + } + } +} + +static void libsais16_final_bwt_scan_left_to_right_16u_block_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * ALPHABET_SIZE && omp_get_dynamic() == 0) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(thread_state); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : block_size - omp_block_start; + + omp_block_start += block_start; + + if (omp_num_threads == 1) + { + libsais16_final_bwt_scan_left_to_right_16u(T, SA, induction_bucket, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + thread_state[omp_thread_num].state.count = libsais16_final_bwt_scan_left_to_right_16u_block_prepare(T, SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size); + } + + #pragma omp barrier + + #pragma omp master + { + fast_sint_t t; + for (t = 0; t < omp_num_threads; ++t) + { + sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets; + fast_sint_t c; for (c = 0; c < ALPHABET_SIZE; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_bucket[c]; induction_bucket[c] = A + B; temp_bucket[c] = A; } + } + } + + #pragma omp barrier + + { + libsais16_final_order_scan_left_to_right_16u_block_place(SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count); + } + } +#endif + } +} + +static void libsais16_final_bwt_aux_scan_left_to_right_16u_block_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * ALPHABET_SIZE && omp_get_dynamic() == 0) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(thread_state); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : block_size - omp_block_start; + + omp_block_start += block_start; + + if (omp_num_threads == 1) + { + libsais16_final_bwt_aux_scan_left_to_right_16u(T, SA, rm, I, induction_bucket, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + thread_state[omp_thread_num].state.count = libsais16_final_bwt_scan_left_to_right_16u_block_prepare(T, SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size); + } + + #pragma omp barrier + + #pragma omp master + { + fast_sint_t t; + for (t = 0; t < omp_num_threads; ++t) + { + sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets; + fast_sint_t c; for (c = 0; c < ALPHABET_SIZE; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_bucket[c]; induction_bucket[c] = A + B; temp_bucket[c] = A; } + } + } + + #pragma omp barrier + + { + libsais16_final_bwt_aux_scan_left_to_right_16u_block_place(SA, rm, I, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count); + } + } +#endif + } +} + +static void libsais16_final_sorting_scan_left_to_right_16u_block_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * ALPHABET_SIZE && omp_get_dynamic() == 0) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(thread_state); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : block_size - omp_block_start; + + omp_block_start += block_start; + + if (omp_num_threads == 1) + { + libsais16_final_sorting_scan_left_to_right_16u(T, SA, induction_bucket, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + thread_state[omp_thread_num].state.count = libsais16_final_sorting_scan_left_to_right_16u_block_prepare(T, SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size); + } + + #pragma omp barrier + + #pragma omp master + { + fast_sint_t t; + for (t = 0; t < omp_num_threads; ++t) + { + sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets; + fast_sint_t c; for (c = 0; c < ALPHABET_SIZE; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_bucket[c]; induction_bucket[c] = A + B; temp_bucket[c] = A; } + } + } + + #pragma omp barrier + + { + libsais16_final_order_scan_left_to_right_16u_block_place(SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count); + } + } +#endif + } +} + +static void libsais16_final_sorting_scan_left_to_right_32s_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(cache); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : block_size - omp_block_start; + + omp_block_start += block_start; + + if (omp_num_threads == 1) + { + libsais16_final_sorting_scan_left_to_right_32s(T, SA, buckets, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + libsais16_final_sorting_scan_left_to_right_32s_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size); + } + + #pragma omp barrier + + #pragma omp master + { + libsais16_final_sorting_scan_left_to_right_32s_block_sort(T, buckets, cache - block_start, block_start, block_size); + } + + #pragma omp barrier + + { + libsais16_compact_and_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size); + } + } +#endif + } +} + +#endif + +static void libsais16_final_bwt_scan_left_to_right_16u_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, fast_sint_t n, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + SA[induction_bucket[T[(sa_sint_t)n - 1]]++] = ((sa_sint_t)n - 1) | ((sa_sint_t)(T[(sa_sint_t)n - 2] < T[(sa_sint_t)n - 1]) << (SAINT_BIT - 1)); + + if (threads == 1 || n < 65536) + { + libsais16_final_bwt_scan_left_to_right_16u(T, SA, induction_bucket, 0, n); + } +#if defined(_OPENMP) + else + { + fast_sint_t block_start; + for (block_start = 0; block_start < n; ) + { + if (SA[block_start] == 0) + { + block_start++; + } + else + { + fast_sint_t block_max_end = block_start + ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads); if (block_max_end > n) { block_max_end = n;} + fast_sint_t block_end = block_start + 1; while (block_end < block_max_end && SA[block_end] != 0) { block_end++; } + fast_sint_t block_size = block_end - block_start; + + if (block_size < 32) + { + for (; block_start < block_end; block_start += 1) + { + sa_sint_t p = SA[block_start]; SA[block_start] = p & SAINT_MAX; if (p > 0) { p--; SA[block_start] = T[p] | SAINT_MIN; SA[induction_bucket[T[p]]++] = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); } + } + } + else + { + libsais16_final_bwt_scan_left_to_right_16u_block_omp(T, SA, induction_bucket, block_start, block_size, threads, thread_state); + block_start = block_end; + } + } + } + } +#else + UNUSED(thread_state); +#endif +} + +static void libsais16_final_bwt_aux_scan_left_to_right_16u_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, fast_sint_t n, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + SA[induction_bucket[T[(sa_sint_t)n - 1]]++] = ((sa_sint_t)n - 1) | ((sa_sint_t)(T[(sa_sint_t)n - 2] < T[(sa_sint_t)n - 1]) << (SAINT_BIT - 1)); + + if ((((sa_sint_t)n - 1) & rm) == 0) { I[((sa_sint_t)n - 1) / (rm + 1)] = induction_bucket[T[(sa_sint_t)n - 1]]; } + + if (threads == 1 || n < 65536) + { + libsais16_final_bwt_aux_scan_left_to_right_16u(T, SA, rm, I, induction_bucket, 0, n); + } +#if defined(_OPENMP) + else + { + fast_sint_t block_start; + for (block_start = 0; block_start < n; ) + { + if (SA[block_start] == 0) + { + block_start++; + } + else + { + fast_sint_t block_max_end = block_start + ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads); if (block_max_end > n) { block_max_end = n;} + fast_sint_t block_end = block_start + 1; while (block_end < block_max_end && SA[block_end] != 0) { block_end++; } + fast_sint_t block_size = block_end - block_start; + + if (block_size < 32) + { + for (; block_start < block_end; block_start += 1) + { + 
sa_sint_t p = SA[block_start]; SA[block_start] = p & SAINT_MAX; if (p > 0) { p--; SA[block_start] = T[p] | SAINT_MIN; SA[induction_bucket[T[p]]++] = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); if ((p & rm) == 0) { I[p / (rm + 1)] = induction_bucket[T[p]]; } } + } + } + else + { + libsais16_final_bwt_aux_scan_left_to_right_16u_block_omp(T, SA, rm, I, induction_bucket, block_start, block_size, threads, thread_state); + block_start = block_end; + } + } + } + } +#else + UNUSED(thread_state); +#endif +} + +static void libsais16_final_sorting_scan_left_to_right_16u_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, fast_sint_t n, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + SA[induction_bucket[T[(sa_sint_t)n - 1]]++] = ((sa_sint_t)n - 1) | ((sa_sint_t)(T[(sa_sint_t)n - 2] < T[(sa_sint_t)n - 1]) << (SAINT_BIT - 1)); + + if (threads == 1 || n < 65536) + { + libsais16_final_sorting_scan_left_to_right_16u(T, SA, induction_bucket, 0, n); + } +#if defined(_OPENMP) + else + { + fast_sint_t block_start; + for (block_start = 0; block_start < n; ) + { + if (SA[block_start] == 0) + { + block_start++; + } + else + { + fast_sint_t block_max_end = block_start + ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads); if (block_max_end > n) { block_max_end = n;} + fast_sint_t block_end = block_start + 1; while (block_end < block_max_end && SA[block_end] != 0) { block_end++; } + fast_sint_t block_size = block_end - block_start; + + if (block_size < 32) + { + for (; block_start < block_end; block_start += 1) + { + sa_sint_t p = SA[block_start]; SA[block_start] = p ^ SAINT_MIN; if (p > 0) { p--; SA[induction_bucket[T[p]]++] = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); } + } + } + else + { + libsais16_final_sorting_scan_left_to_right_16u_block_omp(T, SA, induction_bucket, block_start, block_size, threads, thread_state); + block_start = block_end; + } + } + } + } +#else + UNUSED(thread_state); +#endif +} + +static void libsais16_final_sorting_scan_left_to_right_32s_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + SA[induction_bucket[T[n - 1]]++] = (n - 1) | ((sa_sint_t)(T[n - 2] < T[n - 1]) << (SAINT_BIT - 1)); + + if (threads == 1 || n < 65536) + { + libsais16_final_sorting_scan_left_to_right_32s(T, SA, induction_bucket, 0, n); + } +#if defined(_OPENMP) + else + { + fast_sint_t block_start, block_end; + for (block_start = 0; block_start < n; block_start = block_end) + { + block_end = block_start + (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end > n) { block_end = n; } + + libsais16_final_sorting_scan_left_to_right_32s_block_omp(T, SA, induction_bucket, thread_state[0].state.cache, block_start, block_end - block_start, threads); + } + } +#else + UNUSED(thread_state); +#endif +} + +static sa_sint_t libsais16_final_bwt_scan_right_to_left_16u(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; sa_sint_t index = -1; + for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2) + { + libsais16_prefetchw(&SA[i - 2 * prefetch_distance]); + + sa_sint_t s0 = SA[i - prefetch_distance - 0]; const uint16_t * Ts0 = &T[s0] - 1; 
libsais16_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i - prefetch_distance - 1]; const uint16_t * Ts1 = &T[s1] - 1; libsais16_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16_prefetch(s1 > 0 ? Ts1 : NULL); + + sa_sint_t p0 = SA[i - 0]; index = (p0 == 0) ? (sa_sint_t)(i - 0) : index; + SA[i - 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; uint16_t c0 = T[p0 - (p0 > 0)], c1 = T[p0]; SA[i - 0] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? p0 : t; } + + sa_sint_t p1 = SA[i - 1]; index = (p1 == 0) ? (sa_sint_t)(i - 1) : index; + SA[i - 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; uint16_t c0 = T[p1 - (p1 > 0)], c1 = T[p1]; SA[i - 1] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? p1 : t; } + } + + for (j -= prefetch_distance + 1; i >= j; i -= 1) + { + sa_sint_t p = SA[i]; index = (p == 0) ? (sa_sint_t)i : index; + SA[i] = p & SAINT_MAX; if (p > 0) { p--; uint16_t c0 = T[p - (p > 0)], c1 = T[p]; SA[i] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? p : t; } + } + + return index; +} + +static void libsais16_final_bwt_aux_scan_right_to_left_16u(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2) + { + libsais16_prefetchw(&SA[i - 2 * prefetch_distance]); + + sa_sint_t s0 = SA[i - prefetch_distance - 0]; const uint16_t * Ts0 = &T[s0] - 1; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i - prefetch_distance - 1]; const uint16_t * Ts1 = &T[s1] - 1; libsais16_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16_prefetch(s1 > 0 ? Ts1 : NULL); + + sa_sint_t p0 = SA[i - 0]; + SA[i - 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; uint16_t c0 = T[p0 - (p0 > 0)], c1 = T[p0]; SA[i - 0] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? p0 : t; if ((p0 & rm) == 0) { I[p0 / (rm + 1)] = induction_bucket[T[p0]] + 1; } } + + sa_sint_t p1 = SA[i - 1]; + SA[i - 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; uint16_t c0 = T[p1 - (p1 > 0)], c1 = T[p1]; SA[i - 1] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? p1 : t; if ((p1 & rm) == 0) { I[p1 / (rm + 1)] = induction_bucket[T[p1]] + 1; } } + } + + for (j -= prefetch_distance + 1; i >= j; i -= 1) + { + sa_sint_t p = SA[i]; + SA[i] = p & SAINT_MAX; if (p > 0) { p--; uint16_t c0 = T[p - (p > 0)], c1 = T[p]; SA[i] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? p : t; if ((p & rm) == 0) { I[p / (rm + 1)] = induction_bucket[T[p]] + 1; } } + } +} + +static void libsais16_final_sorting_scan_right_to_left_16u(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2) + { + libsais16_prefetchw(&SA[i - 2 * prefetch_distance]); + + sa_sint_t s0 = SA[i - prefetch_distance - 0]; const uint16_t * Ts0 = &T[s0] - 1; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16_prefetch(s0 > 0 ? 
Ts0 : NULL); + sa_sint_t s1 = SA[i - prefetch_distance - 1]; const uint16_t * Ts1 = &T[s1] - 1; libsais16_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16_prefetch(s1 > 0 ? Ts1 : NULL); + + sa_sint_t p0 = SA[i - 0]; SA[i - 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; SA[--induction_bucket[T[p0]]] = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] > T[p0]) << (SAINT_BIT - 1)); } + sa_sint_t p1 = SA[i - 1]; SA[i - 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; SA[--induction_bucket[T[p1]]] = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] > T[p1]) << (SAINT_BIT - 1)); } + } + + for (j -= prefetch_distance + 1; i >= j; i -= 1) + { + sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; SA[--induction_bucket[T[p]]] = p | ((sa_sint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1)); } + } +} + +static void libsais16_final_sorting_scan_right_to_left_32s(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 1; i >= j; i -= 2) + { + libsais16_prefetchw(&SA[i - 3 * prefetch_distance]); + + sa_sint_t s0 = SA[i - 2 * prefetch_distance - 0]; const sa_sint_t * Ts0 = &T[s0] - 1; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i - 2 * prefetch_distance - 1]; const sa_sint_t * Ts1 = &T[s1] - 1; libsais16_prefetch(s1 > 0 ? Ts1 : NULL); + sa_sint_t s2 = SA[i - 1 * prefetch_distance - 0]; if (s2 > 0) { libsais16_prefetchw(&induction_bucket[T[s2 - 1]]); libsais16_prefetch(&T[s2] - 2); } + sa_sint_t s3 = SA[i - 1 * prefetch_distance - 1]; if (s3 > 0) { libsais16_prefetchw(&induction_bucket[T[s3 - 1]]); libsais16_prefetch(&T[s3] - 2); } + + sa_sint_t p0 = SA[i - 0]; SA[i - 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; SA[--induction_bucket[T[p0]]] = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] > T[p0]) << (SAINT_BIT - 1)); } + sa_sint_t p1 = SA[i - 1]; SA[i - 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; SA[--induction_bucket[T[p1]]] = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] > T[p1]) << (SAINT_BIT - 1)); } + } + + for (j -= 2 * prefetch_distance + 1; i >= j; i -= 1) + { + sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; SA[--induction_bucket[T[p]]] = p | ((sa_sint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1)); } + } +} + +#if defined(_OPENMP) + +static fast_sint_t libsais16_final_bwt_scan_right_to_left_16u_block_prepare(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + memset(buckets, 0, ALPHABET_SIZE * sizeof(sa_sint_t)); + + fast_sint_t i, j, count = 0; + for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2) + { + libsais16_prefetchw(&SA[i - 2 * prefetch_distance]); + + sa_sint_t s0 = SA[i - prefetch_distance - 0]; const uint16_t * Ts0 = &T[s0] - 1; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i - prefetch_distance - 1]; const uint16_t * Ts1 = &T[s1] - 1; libsais16_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16_prefetch(s1 > 0 ? Ts1 : NULL); + + sa_sint_t p0 = SA[i - 0]; SA[i - 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; uint16_t c0 = T[p0 - (p0 > 0)], c1 = T[p0]; SA[i - 0] = c1; sa_sint_t t = c0 | SAINT_MIN; buckets[cache[count].symbol = c1]++; cache[count++].index = (c0 <= c1) ? 
p0 : t; } + sa_sint_t p1 = SA[i - 1]; SA[i - 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; uint16_t c0 = T[p1 - (p1 > 0)], c1 = T[p1]; SA[i - 1] = c1; sa_sint_t t = c0 | SAINT_MIN; buckets[cache[count].symbol = c1]++; cache[count++].index = (c0 <= c1) ? p1 : t; } + } + + for (j -= prefetch_distance + 1; i >= j; i -= 1) + { + sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; uint16_t c0 = T[p - (p > 0)], c1 = T[p]; SA[i] = c1; sa_sint_t t = c0 | SAINT_MIN; buckets[cache[count].symbol = c1]++; cache[count++].index = (c0 <= c1) ? p : t; } + } + + return count; +} + +static fast_sint_t libsais16_final_bwt_aux_scan_right_to_left_16u_block_prepare(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + memset(buckets, 0, ALPHABET_SIZE * sizeof(sa_sint_t)); + + fast_sint_t i, j, count = 0; + for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2) + { + libsais16_prefetchw(&SA[i - 2 * prefetch_distance]); + + sa_sint_t s0 = SA[i - prefetch_distance - 0]; const uint16_t * Ts0 = &T[s0] - 1; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i - prefetch_distance - 1]; const uint16_t * Ts1 = &T[s1] - 1; libsais16_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16_prefetch(s1 > 0 ? Ts1 : NULL); + + sa_sint_t p0 = SA[i - 0]; SA[i - 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; uint16_t c0 = T[p0 - (p0 > 0)], c1 = T[p0]; SA[i - 0] = c1; sa_sint_t t = c0 | SAINT_MIN; buckets[cache[count].symbol = c1]++; cache[count].index = (c0 <= c1) ? p0 : t; cache[count + 1].index = p0; count += 2; } + sa_sint_t p1 = SA[i - 1]; SA[i - 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; uint16_t c0 = T[p1 - (p1 > 0)], c1 = T[p1]; SA[i - 1] = c1; sa_sint_t t = c0 | SAINT_MIN; buckets[cache[count].symbol = c1]++; cache[count].index = (c0 <= c1) ? p1 : t; cache[count + 1].index = p1; count += 2; } + } + + for (j -= prefetch_distance + 1; i >= j; i -= 1) + { + sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; uint16_t c0 = T[p - (p > 0)], c1 = T[p]; SA[i] = c1; sa_sint_t t = c0 | SAINT_MIN; buckets[cache[count].symbol = c1]++; cache[count].index = (c0 <= c1) ? p : t; cache[count + 1].index = p; count += 2; } + } + + return count; +} + +static fast_sint_t libsais16_final_sorting_scan_right_to_left_16u_block_prepare(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + memset(buckets, 0, ALPHABET_SIZE * sizeof(sa_sint_t)); + + fast_sint_t i, j, count = 0; + for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2) + { + libsais16_prefetchw(&SA[i - 2 * prefetch_distance]); + + sa_sint_t s0 = SA[i - prefetch_distance - 0]; const uint16_t * Ts0 = &T[s0] - 1; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i - prefetch_distance - 1]; const uint16_t * Ts1 = &T[s1] - 1; libsais16_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16_prefetch(s1 > 0 ? 
Ts1 : NULL); + + sa_sint_t p0 = SA[i - 0]; SA[i - 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; buckets[cache[count].symbol = T[p0]]++; cache[count++].index = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] > T[p0]) << (SAINT_BIT - 1)); } + sa_sint_t p1 = SA[i - 1]; SA[i - 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; buckets[cache[count].symbol = T[p1]]++; cache[count++].index = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] > T[p1]) << (SAINT_BIT - 1)); } + } + + for (j -= prefetch_distance + 1; i >= j; i -= 1) + { + sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; buckets[cache[count].symbol = T[p]]++; cache[count++].index = p | ((sa_sint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1)); } + } + + return count; +} + +static void libsais16_final_order_scan_right_to_left_16u_block_place(sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = 0, j = count - 3; i < j; i += 4) + { + libsais16_prefetch(&cache[i + prefetch_distance]); + + SA[--buckets[cache[i + 0].symbol]] = cache[i + 0].index; + SA[--buckets[cache[i + 1].symbol]] = cache[i + 1].index; + SA[--buckets[cache[i + 2].symbol]] = cache[i + 2].index; + SA[--buckets[cache[i + 3].symbol]] = cache[i + 3].index; + } + + for (j += 3; i < j; i += 1) + { + SA[--buckets[cache[i].symbol]] = cache[i].index; + } +} + +static void libsais16_final_bwt_aux_scan_right_to_left_16u_block_place(sa_sint_t * RESTRICT SA, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = 0, j = count - 6; i < j; i += 8) + { + libsais16_prefetch(&cache[i + prefetch_distance]); + + SA[--buckets[cache[i + 0].symbol]] = cache[i + 0].index; if ((cache[i + 1].index & rm) == 0) { I[cache[i + 1].index / (rm + 1)] = buckets[cache[i + 0].symbol] + 1; } + SA[--buckets[cache[i + 2].symbol]] = cache[i + 2].index; if ((cache[i + 3].index & rm) == 0) { I[cache[i + 3].index / (rm + 1)] = buckets[cache[i + 2].symbol] + 1; } + SA[--buckets[cache[i + 4].symbol]] = cache[i + 4].index; if ((cache[i + 5].index & rm) == 0) { I[cache[i + 5].index / (rm + 1)] = buckets[cache[i + 4].symbol] + 1; } + SA[--buckets[cache[i + 6].symbol]] = cache[i + 6].index; if ((cache[i + 7].index & rm) == 0) { I[cache[i + 7].index / (rm + 1)] = buckets[cache[i + 6].symbol] + 1; } + } + + for (j += 6; i < j; i += 2) + { + SA[--buckets[cache[i].symbol]] = cache[i].index; if ((cache[i + 1].index & rm) == 0) { I[(cache[i + 1].index & SAINT_MAX) / (rm + 1)] = buckets[cache[i].symbol] + 1; } + } +} + +static void libsais16_final_sorting_scan_right_to_left_32s_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) + { + libsais16_prefetchw(&SA[i + 2 * prefetch_distance]); + + sa_sint_t s0 = SA[i + prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0] - 1; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i + prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1] - 1; libsais16_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16_prefetch(s1 > 0 ? 
Ts1 : NULL); + + libsais16_prefetchw(&cache[i + prefetch_distance]); + + sa_sint_t symbol0 = SAINT_MIN, p0 = SA[i + 0]; SA[i + 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; cache[i + 0].index = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] > T[p0]) << (SAINT_BIT - 1)); symbol0 = T[p0]; } cache[i + 0].symbol = symbol0; + sa_sint_t symbol1 = SAINT_MIN, p1 = SA[i + 1]; SA[i + 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; cache[i + 1].index = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] > T[p1]) << (SAINT_BIT - 1)); symbol1 = T[p1]; } cache[i + 1].symbol = symbol1; + } + + for (j += prefetch_distance + 1; i < j; i += 1) + { + sa_sint_t symbol = SAINT_MIN, p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; cache[i].index = p | ((sa_sint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1)); symbol = T[p]; } cache[i].symbol = symbol; + } +} + +static void libsais16_final_sorting_scan_right_to_left_32s_block_sort(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2) + { + libsais16_prefetchw(&cache[i - 2 * prefetch_distance]); + + sa_sint_t s0 = cache[i - prefetch_distance - 0].symbol; const sa_sint_t * Is0 = &induction_bucket[s0]; libsais16_prefetchw(s0 >= 0 ? Is0 : NULL); + sa_sint_t s1 = cache[i - prefetch_distance - 1].symbol; const sa_sint_t * Is1 = &induction_bucket[s1]; libsais16_prefetchw(s1 >= 0 ? Is1 : NULL); + + sa_sint_t v0 = cache[i - 0].symbol; + if (v0 >= 0) + { + cache[i - 0].symbol = --induction_bucket[v0]; + if (cache[i - 0].symbol >= omp_block_start) { sa_sint_t ni = cache[i - 0].symbol, np = cache[i - 0].index; cache[i - 0].index = np & SAINT_MAX; if (np > 0) { np--; cache[ni].index = np | ((sa_sint_t)(T[np - (np > 0)] > T[np]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np]; } } + } + + sa_sint_t v1 = cache[i - 1].symbol; + if (v1 >= 0) + { + cache[i - 1].symbol = --induction_bucket[v1]; + if (cache[i - 1].symbol >= omp_block_start) { sa_sint_t ni = cache[i - 1].symbol, np = cache[i - 1].index; cache[i - 1].index = np & SAINT_MAX; if (np > 0) { np--; cache[ni].index = np | ((sa_sint_t)(T[np - (np > 0)] > T[np]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np]; } } + } + } + + for (j -= prefetch_distance + 1; i >= j; i -= 1) + { + sa_sint_t v = cache[i].symbol; + if (v >= 0) + { + cache[i].symbol = --induction_bucket[v]; + if (cache[i].symbol >= omp_block_start) { sa_sint_t ni = cache[i].symbol, np = cache[i].index; cache[i].index = np & SAINT_MAX; if (np > 0) { np--; cache[ni].index = np | ((sa_sint_t)(T[np - (np > 0)] > T[np]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np]; } } + } + } +} + +static void libsais16_final_bwt_scan_right_to_left_16u_block_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * ALPHABET_SIZE && omp_get_dynamic() == 0) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(thread_state); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (block_size / omp_num_threads) & 
(-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start; + + omp_block_start += block_start; + + if (omp_num_threads == 1) + { + libsais16_final_bwt_scan_right_to_left_16u(T, SA, induction_bucket, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + thread_state[omp_thread_num].state.count = libsais16_final_bwt_scan_right_to_left_16u_block_prepare(T, SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size); + } + + #pragma omp barrier + + #pragma omp master + { + fast_sint_t t; + for (t = omp_num_threads - 1; t >= 0; --t) + { + sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets; + fast_sint_t c; for (c = 0; c < ALPHABET_SIZE; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_bucket[c]; induction_bucket[c] = A - B; temp_bucket[c] = A; } + } + } + + #pragma omp barrier + + { + libsais16_final_order_scan_right_to_left_16u_block_place(SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count); + } + } +#endif + } +} + +static void libsais16_final_bwt_aux_scan_right_to_left_16u_block_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * ALPHABET_SIZE && omp_get_dynamic() == 0) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(thread_state); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : block_size - omp_block_start; + + omp_block_start += block_start; + + if (omp_num_threads == 1) + { + libsais16_final_bwt_aux_scan_right_to_left_16u(T, SA, rm, I, induction_bucket, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + thread_state[omp_thread_num].state.count = libsais16_final_bwt_aux_scan_right_to_left_16u_block_prepare(T, SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size); + } + + #pragma omp barrier + + #pragma omp master + { + fast_sint_t t; + for (t = omp_num_threads - 1; t >= 0; --t) + { + sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets; + fast_sint_t c; for (c = 0; c < ALPHABET_SIZE; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_bucket[c]; induction_bucket[c] = A - B; temp_bucket[c] = A; } + } + } + + #pragma omp barrier + + { + libsais16_final_bwt_aux_scan_right_to_left_16u_block_place(SA, rm, I, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count); + } + } +#endif + } +} + +static void libsais16_final_sorting_scan_right_to_left_16u_block_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * ALPHABET_SIZE && omp_get_dynamic() == 0) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(thread_state); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : block_size - omp_block_start; + + omp_block_start += block_start; + + if (omp_num_threads == 1) + { + libsais16_final_sorting_scan_right_to_left_16u(T, SA, induction_bucket, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + thread_state[omp_thread_num].state.count = libsais16_final_sorting_scan_right_to_left_16u_block_prepare(T, SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size); + } + + #pragma omp barrier + + #pragma omp master + { + fast_sint_t t; + for (t = omp_num_threads - 1; t >= 0; --t) + { + sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets; + fast_sint_t c; for (c = 0; c < ALPHABET_SIZE; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_bucket[c]; induction_bucket[c] = A - B; temp_bucket[c] = A; } + } + } + + #pragma omp barrier + + { + libsais16_final_order_scan_right_to_left_16u_block_place(SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count); + } + } +#endif + } +} + +static void libsais16_final_sorting_scan_right_to_left_32s_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(cache); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : block_size - omp_block_start; + + omp_block_start += block_start; + + if (omp_num_threads == 1) + { + libsais16_final_sorting_scan_right_to_left_32s(T, SA, buckets, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + libsais16_final_sorting_scan_right_to_left_32s_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size); + } + + #pragma omp barrier + + #pragma omp master + { + libsais16_final_sorting_scan_right_to_left_32s_block_sort(T, buckets, cache - block_start, block_start, block_size); + } + + #pragma omp barrier + + { + libsais16_compact_and_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size); + } + } +#endif + } +} + +#endif + +static sa_sint_t libsais16_final_bwt_scan_right_to_left_16u_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + sa_sint_t index = -1; + + if (threads == 1 || n < 65536) + { + index = libsais16_final_bwt_scan_right_to_left_16u(T, SA, induction_bucket, 0, n); + } +#if defined(_OPENMP) + else + { + fast_sint_t block_start; + for (block_start = (fast_sint_t)n - 1; block_start >= 0; ) + { + if (SA[block_start] == 0) + { + index = (sa_sint_t)block_start--; + } + else + { + fast_sint_t block_max_end = block_start - ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads); if (block_max_end < 0) { block_max_end = -1; } + fast_sint_t block_end = block_start - 1; while (block_end > block_max_end && SA[block_end] != 0) { block_end--; } + fast_sint_t block_size = block_start - block_end; + + if (block_size < 32) + { + for (; block_start > block_end; block_start -= 1) + { + sa_sint_t p = SA[block_start]; SA[block_start] = p & SAINT_MAX; if (p > 0) { p--; uint16_t c0 = T[p - (p > 0)], c1 = T[p]; SA[block_start] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? p : t; } + } + } + else + { + libsais16_final_bwt_scan_right_to_left_16u_block_omp(T, SA, induction_bucket, block_end + 1, block_size, threads, thread_state); + block_start = block_end; + } + } + } + } +#else + UNUSED(thread_state); +#endif + + return index; +} + +static void libsais16_final_bwt_aux_scan_right_to_left_16u_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + if (threads == 1 || n < 65536) + { + libsais16_final_bwt_aux_scan_right_to_left_16u(T, SA, rm, I, induction_bucket, 0, n); + } +#if defined(_OPENMP) + else + { + fast_sint_t block_start; + for (block_start = (fast_sint_t)n - 1; block_start >= 0; ) + { + if (SA[block_start] == 0) + { + block_start--; + } + else + { + fast_sint_t block_max_end = block_start - ((fast_sint_t)threads) * ((LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads) / 2); if (block_max_end < 0) { block_max_end = -1; } + fast_sint_t block_end = block_start - 1; while (block_end > block_max_end && SA[block_end] != 0) { block_end--; } + fast_sint_t block_size = block_start - block_end; + + if (block_size < 32) + { + for (; block_start > block_end; block_start -= 1) + { + sa_sint_t p = SA[block_start]; SA[block_start] = p & SAINT_MAX; if (p > 0) { p--; uint16_t c0 = T[p - (p > 0)], c1 = T[p]; SA[block_start] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? 
p : t; if ((p & rm) == 0) { I[p / (rm + 1)] = induction_bucket[T[p]] + 1; } } + } + } + else + { + libsais16_final_bwt_aux_scan_right_to_left_16u_block_omp(T, SA, rm, I, induction_bucket, block_end + 1, block_size, threads, thread_state); + block_start = block_end; + } + } + } + } +#else + UNUSED(thread_state); +#endif +} + +static void libsais16_final_sorting_scan_right_to_left_16u_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + if (threads == 1 || n < 65536) + { + libsais16_final_sorting_scan_right_to_left_16u(T, SA, induction_bucket, 0, n); + } +#if defined(_OPENMP) + else + { + fast_sint_t block_start; + for (block_start = (fast_sint_t)n - 1; block_start >= 0; ) + { + if (SA[block_start] == 0) + { + block_start--; + } + else + { + fast_sint_t block_max_end = block_start - ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads); if (block_max_end < -1) { block_max_end = -1; } + fast_sint_t block_end = block_start - 1; while (block_end > block_max_end && SA[block_end] != 0) { block_end--; } + fast_sint_t block_size = block_start - block_end; + + if (block_size < 32) + { + for (; block_start > block_end; block_start -= 1) + { + sa_sint_t p = SA[block_start]; SA[block_start] = p & SAINT_MAX; if (p > 0) { p--; SA[--induction_bucket[T[p]]] = p | ((sa_sint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1)); } + } + } + else + { + libsais16_final_sorting_scan_right_to_left_16u_block_omp(T, SA, induction_bucket, block_end + 1, block_size, threads, thread_state); + block_start = block_end; + } + } + } + } +#else + UNUSED(thread_state); +#endif +} + +static void libsais16_final_sorting_scan_right_to_left_32s_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + if (threads == 1 || n < 65536) + { + libsais16_final_sorting_scan_right_to_left_32s(T, SA, induction_bucket, 0, n); + } +#if defined(_OPENMP) + else + { + fast_sint_t block_start, block_end; + for (block_start = (fast_sint_t)n - 1; block_start >= 0; block_start = block_end) + { + block_end = block_start - (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end < 0) { block_end = -1; } + + libsais16_final_sorting_scan_right_to_left_32s_block_omp(T, SA, induction_bucket, thread_state[0].state.cache, block_end + 1, block_start - block_end, threads); + } + } +#else + UNUSED(thread_state); +#endif +} + +static void libsais16_clear_lms_suffixes_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT bucket_start, sa_sint_t * RESTRICT bucket_end, sa_sint_t threads) +{ + fast_sint_t c; + +#if defined(_OPENMP) + #pragma omp parallel for schedule(static, 1) num_threads(threads) if(threads > 1 && n >= 65536) +#else + UNUSED(threads); UNUSED(n); +#endif + for (c = 0; c < k; ++c) + { + if (bucket_end[c] > bucket_start[c]) + { + memset(&SA[bucket_start[c]], 0, ((size_t)bucket_end[c] - (size_t)bucket_start[c]) * sizeof(sa_sint_t)); + } + } +} + +static sa_sint_t libsais16_induce_final_order_16u_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t bwt, sa_sint_t r, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + if (!bwt) + { + libsais16_final_sorting_scan_left_to_right_16u_omp(T, SA, n, &buckets[6 * ALPHABET_SIZE], threads, thread_state); + if 
(threads > 1 && n >= 65536) { libsais16_clear_lms_suffixes_omp(SA, n, ALPHABET_SIZE, &buckets[6 * ALPHABET_SIZE], &buckets[7 * ALPHABET_SIZE], threads); } + libsais16_final_sorting_scan_right_to_left_16u_omp(T, SA, n, &buckets[7 * ALPHABET_SIZE], threads, thread_state); + return 0; + } + else if (I != NULL) + { + libsais16_final_bwt_aux_scan_left_to_right_16u_omp(T, SA, n, r - 1, I, &buckets[6 * ALPHABET_SIZE], threads, thread_state); + if (threads > 1 && n >= 65536) { libsais16_clear_lms_suffixes_omp(SA, n, ALPHABET_SIZE, &buckets[6 * ALPHABET_SIZE], &buckets[7 * ALPHABET_SIZE], threads); } + libsais16_final_bwt_aux_scan_right_to_left_16u_omp(T, SA, n, r - 1, I, &buckets[7 * ALPHABET_SIZE], threads, thread_state); + return 0; + } + else + { + libsais16_final_bwt_scan_left_to_right_16u_omp(T, SA, n, &buckets[6 * ALPHABET_SIZE], threads, thread_state); + if (threads > 1 && n >= 65536) { libsais16_clear_lms_suffixes_omp(SA, n, ALPHABET_SIZE, &buckets[6 * ALPHABET_SIZE], &buckets[7 * ALPHABET_SIZE], threads); } + return libsais16_final_bwt_scan_right_to_left_16u_omp(T, SA, n, &buckets[7 * ALPHABET_SIZE], threads, thread_state); + } +} + +static void libsais16_induce_final_order_32s_6k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + libsais16_final_sorting_scan_left_to_right_32s_omp(T, SA, n, &buckets[4 * k], threads, thread_state); + libsais16_final_sorting_scan_right_to_left_32s_omp(T, SA, n, &buckets[5 * k], threads, thread_state); +} + +static void libsais16_induce_final_order_32s_4k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + libsais16_final_sorting_scan_left_to_right_32s_omp(T, SA, n, &buckets[2 * k], threads, thread_state); + libsais16_final_sorting_scan_right_to_left_32s_omp(T, SA, n, &buckets[3 * k], threads, thread_state); +} + +static void libsais16_induce_final_order_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + libsais16_final_sorting_scan_left_to_right_32s_omp(T, SA, n, &buckets[1 * k], threads, thread_state); + libsais16_final_sorting_scan_right_to_left_32s_omp(T, SA, n, &buckets[0 * k], threads, thread_state); +} + +static void libsais16_induce_final_order_32s_1k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + libsais16_count_suffixes_32s(T, n, k, buckets); + libsais16_initialize_buckets_start_32s_1k(k, buckets); + libsais16_final_sorting_scan_left_to_right_32s_omp(T, SA, n, buckets, threads, thread_state); + + libsais16_count_suffixes_32s(T, n, k, buckets); + libsais16_initialize_buckets_end_32s_1k(k, buckets); + libsais16_final_sorting_scan_right_to_left_32s_omp(T, SA, n, buckets, threads, thread_state); +} + +static sa_sint_t libsais16_renumber_unique_and_nonunique_lms_suffixes_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t f, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + sa_sint_t * RESTRICT SAm = &SA[m]; + + sa_sint_t i, j; + for (i = (sa_sint_t)omp_block_start, j = (sa_sint_t)omp_block_start + (sa_sint_t)omp_block_size - 2 * 
(sa_sint_t)prefetch_distance - 3; i < j; i += 4) + { + libsais16_prefetch(&SA[i + 3 * prefetch_distance]); + + libsais16_prefetchw(&SAm[((sa_uint_t)SA[i + 2 * prefetch_distance + 0]) >> 1]); + libsais16_prefetchw(&SAm[((sa_uint_t)SA[i + 2 * prefetch_distance + 1]) >> 1]); + libsais16_prefetchw(&SAm[((sa_uint_t)SA[i + 2 * prefetch_distance + 2]) >> 1]); + libsais16_prefetchw(&SAm[((sa_uint_t)SA[i + 2 * prefetch_distance + 3]) >> 1]); + + sa_uint_t q0 = (sa_uint_t)SA[i + prefetch_distance + 0]; const sa_sint_t * Tq0 = &T[q0]; libsais16_prefetchw(SAm[q0 >> 1] < 0 ? Tq0 : NULL); + sa_uint_t q1 = (sa_uint_t)SA[i + prefetch_distance + 1]; const sa_sint_t * Tq1 = &T[q1]; libsais16_prefetchw(SAm[q1 >> 1] < 0 ? Tq1 : NULL); + sa_uint_t q2 = (sa_uint_t)SA[i + prefetch_distance + 2]; const sa_sint_t * Tq2 = &T[q2]; libsais16_prefetchw(SAm[q2 >> 1] < 0 ? Tq2 : NULL); + sa_uint_t q3 = (sa_uint_t)SA[i + prefetch_distance + 3]; const sa_sint_t * Tq3 = &T[q3]; libsais16_prefetchw(SAm[q3 >> 1] < 0 ? Tq3 : NULL); + + sa_uint_t p0 = (sa_uint_t)SA[i + 0]; sa_sint_t s0 = SAm[p0 >> 1]; if (s0 < 0) { T[p0] |= SAINT_MIN; f++; s0 = i + 0 + SAINT_MIN + f; } SAm[p0 >> 1] = s0 - f; + sa_uint_t p1 = (sa_uint_t)SA[i + 1]; sa_sint_t s1 = SAm[p1 >> 1]; if (s1 < 0) { T[p1] |= SAINT_MIN; f++; s1 = i + 1 + SAINT_MIN + f; } SAm[p1 >> 1] = s1 - f; + sa_uint_t p2 = (sa_uint_t)SA[i + 2]; sa_sint_t s2 = SAm[p2 >> 1]; if (s2 < 0) { T[p2] |= SAINT_MIN; f++; s2 = i + 2 + SAINT_MIN + f; } SAm[p2 >> 1] = s2 - f; + sa_uint_t p3 = (sa_uint_t)SA[i + 3]; sa_sint_t s3 = SAm[p3 >> 1]; if (s3 < 0) { T[p3] |= SAINT_MIN; f++; s3 = i + 3 + SAINT_MIN + f; } SAm[p3 >> 1] = s3 - f; + } + + for (j += 2 * (sa_sint_t)prefetch_distance + 3; i < j; i += 1) + { + sa_uint_t p = (sa_uint_t)SA[i]; sa_sint_t s = SAm[p >> 1]; if (s < 0) { T[p] |= SAINT_MIN; f++; s = i + SAINT_MIN + f; } SAm[p >> 1] = s - f; + } + + return f; +} + +static void libsais16_compact_unique_and_nonunique_lms_suffixes_32s(sa_sint_t * RESTRICT SA, sa_sint_t m, fast_sint_t * pl, fast_sint_t * pr, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + sa_sint_t * RESTRICT SAl = &SA[0]; + sa_sint_t * RESTRICT SAr = &SA[0]; + + fast_sint_t i, j, l = *pl - 1, r = *pr - 1; + for (i = (fast_sint_t)m + omp_block_start + omp_block_size - 1, j = (fast_sint_t)m + omp_block_start + 3; i >= j; i -= 4) + { + libsais16_prefetch(&SA[i - prefetch_distance]); + + sa_sint_t p0 = SA[i - 0]; SAl[l] = p0 & SAINT_MAX; l -= p0 < 0; SAr[r] = p0 - 1; r -= p0 > 0; + sa_sint_t p1 = SA[i - 1]; SAl[l] = p1 & SAINT_MAX; l -= p1 < 0; SAr[r] = p1 - 1; r -= p1 > 0; + sa_sint_t p2 = SA[i - 2]; SAl[l] = p2 & SAINT_MAX; l -= p2 < 0; SAr[r] = p2 - 1; r -= p2 > 0; + sa_sint_t p3 = SA[i - 3]; SAl[l] = p3 & SAINT_MAX; l -= p3 < 0; SAr[r] = p3 - 1; r -= p3 > 0; + } + + for (j -= 3; i >= j; i -= 1) + { + sa_sint_t p = SA[i]; SAl[l] = p & SAINT_MAX; l -= p < 0; SAr[r] = p - 1; r -= p > 0; + } + + *pl = l + 1; *pr = r + 1; +} + + +#if defined(_OPENMP) + +static sa_sint_t libsais16_count_unique_suffixes(sa_sint_t * RESTRICT SA, sa_sint_t m, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + sa_sint_t * RESTRICT SAm = &SA[m]; + + fast_sint_t i, j; sa_sint_t f0 = 0, f1 = 0, f2 = 0, f3 = 0; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4) + { + libsais16_prefetch(&SA[i + 2 * prefetch_distance]); + + libsais16_prefetch(&SAm[((sa_uint_t)SA[i + prefetch_distance + 0]) >> 
1]); + libsais16_prefetch(&SAm[((sa_uint_t)SA[i + prefetch_distance + 1]) >> 1]); + libsais16_prefetch(&SAm[((sa_uint_t)SA[i + prefetch_distance + 2]) >> 1]); + libsais16_prefetch(&SAm[((sa_uint_t)SA[i + prefetch_distance + 3]) >> 1]); + + f0 += SAm[((sa_uint_t)SA[i + 0]) >> 1] < 0; + f1 += SAm[((sa_uint_t)SA[i + 1]) >> 1] < 0; + f2 += SAm[((sa_uint_t)SA[i + 2]) >> 1] < 0; + f3 += SAm[((sa_uint_t)SA[i + 3]) >> 1] < 0; + } + + for (j += prefetch_distance + 3; i < j; i += 1) + { + f0 += SAm[((sa_uint_t)SA[i]) >> 1] < 0; + } + + return f0 + f1 + f2 + f3; +} + +#endif + +static sa_sint_t libsais16_renumber_unique_and_nonunique_lms_suffixes_32s_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + sa_sint_t f = 0; + +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && m >= 65536) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(thread_state); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (m / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : m - omp_block_start; + + if (omp_num_threads == 1) + { + f = libsais16_renumber_unique_and_nonunique_lms_suffixes_32s(T, SA, m, 0, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + thread_state[omp_thread_num].state.count = libsais16_count_unique_suffixes(SA, m, omp_block_start, omp_block_size); + } + + #pragma omp barrier + + { + fast_sint_t t, count = 0; for (t = 0; t < omp_thread_num; ++t) { count += thread_state[t].state.count; } + + if (omp_thread_num == omp_num_threads - 1) + { + f = (sa_sint_t)(count + thread_state[omp_thread_num].state.count); + } + + libsais16_renumber_unique_and_nonunique_lms_suffixes_32s(T, SA, m, (sa_sint_t)count, omp_block_start, omp_block_size); + } + } +#endif + } + + return f; +} + +static void libsais16_compact_unique_and_nonunique_lms_suffixes_32s_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs, sa_sint_t f, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 131072 && m < fs) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(thread_state); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (((fast_sint_t)n >> 1) / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : ((fast_sint_t)n >> 1) - omp_block_start; + + if (omp_num_threads == 1) + { + fast_sint_t l = m, r = (fast_sint_t)n + (fast_sint_t)fs; + libsais16_compact_unique_and_nonunique_lms_suffixes_32s(SA, m, &l, &r, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + thread_state[omp_thread_num].state.position = (fast_sint_t)m + ((fast_sint_t)n >> 1) + omp_block_start + omp_block_size; + thread_state[omp_thread_num].state.count = (fast_sint_t)m + omp_block_start + omp_block_size; + + libsais16_compact_unique_and_nonunique_lms_suffixes_32s(SA, m, &thread_state[omp_thread_num].state.position, &thread_state[omp_thread_num].state.count, omp_block_start, omp_block_size); + } + + #pragma omp barrier + + #pragma omp master + { + fast_sint_t t, position; + + for (position = m, t = omp_num_threads - 1; t >= 0; --t) + { + fast_sint_t omp_block_end = t < omp_num_threads - 1 ? omp_block_stride * (t + 1) : ((fast_sint_t)n >> 1); + fast_sint_t count = ((fast_sint_t)m + ((fast_sint_t)n >> 1) + omp_block_end - thread_state[t].state.position); + + if (count > 0) + { + position -= count; memcpy(&SA[position], &SA[thread_state[t].state.position], (size_t)count * sizeof(sa_sint_t)); + } + } + + for (position = (fast_sint_t)n + (fast_sint_t)fs, t = omp_num_threads - 1; t >= 0; --t) + { + fast_sint_t omp_block_end = t < omp_num_threads - 1 ? omp_block_stride * (t + 1) : ((fast_sint_t)n >> 1); + fast_sint_t count = ((fast_sint_t)m + omp_block_end - thread_state[t].state.count); + + if (count > 0) + { + position -= count; memcpy(&SA[position], &SA[thread_state[t].state.count], (size_t)count * sizeof(sa_sint_t)); + } + } + } + } +#endif + } + + memcpy(&SA[(fast_sint_t)n + (fast_sint_t)fs - (fast_sint_t)m], &SA[(fast_sint_t)m - (fast_sint_t)f], (size_t)f * sizeof(sa_sint_t)); +} + +static sa_sint_t libsais16_compact_lms_suffixes_32s_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + sa_sint_t f = libsais16_renumber_unique_and_nonunique_lms_suffixes_32s_omp(T, SA, m, threads, thread_state); + libsais16_compact_unique_and_nonunique_lms_suffixes_32s_omp(SA, n, m, fs, f, threads, thread_state); + + return f; +} + +static void libsais16_merge_unique_lms_suffixes_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, fast_sint_t l, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + const sa_sint_t * RESTRICT SAnm = &SA[(fast_sint_t)n - (fast_sint_t)m - 1 + l]; + + sa_sint_t i, j; fast_sint_t tmp = *SAnm++; + for (i = (sa_sint_t)omp_block_start, j = (sa_sint_t)omp_block_start + (sa_sint_t)omp_block_size - 6; i < j; i += 4) + { + libsais16_prefetch(&T[i + prefetch_distance]); + + sa_sint_t c0 = T[i + 0]; if (c0 < 0) { T[i + 0] = c0 & SAINT_MAX; SA[tmp] = i + 0; i++; tmp = *SAnm++; } + sa_sint_t c1 = T[i + 1]; if (c1 < 0) { T[i + 1] = c1 & SAINT_MAX; SA[tmp] = i + 1; i++; tmp = *SAnm++; } + sa_sint_t c2 = T[i + 2]; if (c2 < 0) { T[i + 2] = c2 & SAINT_MAX; SA[tmp] = i + 2; i++; tmp = *SAnm++; } + sa_sint_t c3 = T[i + 3]; if (c3 < 0) { T[i + 3] = c3 & SAINT_MAX; SA[tmp] = i + 3; i++; tmp = *SAnm++; } + } + + for (j += 6; i < j; i += 1) + { + sa_sint_t c = T[i]; if (c < 0) { T[i] = c & SAINT_MAX; SA[tmp] = i; i++; tmp = *SAnm++; } + } +} + +static void libsais16_merge_nonunique_lms_suffixes_32s(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, fast_sint_t l, fast_sint_t omp_block_start, fast_sint_t 
omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + const sa_sint_t * RESTRICT SAnm = &SA[(fast_sint_t)n - (fast_sint_t)m - 1 + l]; + + fast_sint_t i, j; sa_sint_t tmp = *SAnm++; + for (i = omp_block_start, j = omp_block_start + omp_block_size - 3; i < j; i += 4) + { + libsais16_prefetch(&SA[i + prefetch_distance]); + + if (SA[i + 0] == 0) { SA[i + 0] = tmp; tmp = *SAnm++; } + if (SA[i + 1] == 0) { SA[i + 1] = tmp; tmp = *SAnm++; } + if (SA[i + 2] == 0) { SA[i + 2] = tmp; tmp = *SAnm++; } + if (SA[i + 3] == 0) { SA[i + 3] = tmp; tmp = *SAnm++; } + } + + for (j += 3; i < j; i += 1) + { + if (SA[i] == 0) { SA[i] = tmp; tmp = *SAnm++; } + } +} + +static void libsais16_merge_unique_lms_suffixes_32s_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(thread_state); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start; + + if (omp_num_threads == 1) + { + libsais16_merge_unique_lms_suffixes_32s(T, SA, n, m, 0, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + thread_state[omp_thread_num].state.count = libsais16_count_negative_marked_suffixes(T, omp_block_start, omp_block_size); + } + + #pragma omp barrier + + { + fast_sint_t t, count = 0; for (t = 0; t < omp_thread_num; ++t) { count += thread_state[t].state.count; } + + libsais16_merge_unique_lms_suffixes_32s(T, SA, n, m, count, omp_block_start, omp_block_size); + } + } +#endif + } +} + +static void libsais16_merge_nonunique_lms_suffixes_32s_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t f, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && m >= 65536) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(thread_state); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (m / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : m - omp_block_start; + + if (omp_num_threads == 1) + { + libsais16_merge_nonunique_lms_suffixes_32s(SA, n, m, f, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + thread_state[omp_thread_num].state.count = libsais16_count_zero_marked_suffixes(SA, omp_block_start, omp_block_size); + } + + #pragma omp barrier + + { + fast_sint_t t, count = f; for (t = 0; t < omp_thread_num; ++t) { count += thread_state[t].state.count; } + + libsais16_merge_nonunique_lms_suffixes_32s(SA, n, m, count, omp_block_start, omp_block_size); + } + } +#endif + } +} + +static void libsais16_merge_compacted_lms_suffixes_32s_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t f, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + libsais16_merge_unique_lms_suffixes_32s_omp(T, SA, n, m, threads, thread_state); + libsais16_merge_nonunique_lms_suffixes_32s_omp(SA, n, m, f, threads, thread_state); +} + +static void libsais16_reconstruct_compacted_lms_suffixes_32s_2k_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m, sa_sint_t fs, sa_sint_t f, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + if (f > 0) + { + memmove(&SA[n - m - 1], &SA[n + fs - m], (size_t)f * sizeof(sa_sint_t)); + + libsais16_count_and_gather_compacted_lms_suffixes_32s_2k_omp(T, SA, n, k, buckets, threads, thread_state); + libsais16_reconstruct_lms_suffixes_omp(SA, n, m - f, threads); + + memcpy(&SA[n - m - 1 + f], &SA[0], ((size_t)m - (size_t)f) * sizeof(sa_sint_t)); + memset(&SA[0], 0, (size_t)m * sizeof(sa_sint_t)); + + libsais16_merge_compacted_lms_suffixes_32s_omp(T, SA, n, m, f, threads, thread_state); + } + else + { + libsais16_count_and_gather_lms_suffixes_32s_2k(T, SA, n, k, buckets, 0, n); + libsais16_reconstruct_lms_suffixes_omp(SA, n, m, threads); + } +} + +static void libsais16_reconstruct_compacted_lms_suffixes_32s_1k_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs, sa_sint_t f, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + if (f > 0) + { + memmove(&SA[n - m - 1], &SA[n + fs - m], (size_t)f * sizeof(sa_sint_t)); + + libsais16_gather_compacted_lms_suffixes_32s(T, SA, n); + libsais16_reconstruct_lms_suffixes_omp(SA, n, m - f, threads); + + memcpy(&SA[n - m - 1 + f], &SA[0], ((size_t)m - (size_t)f) * sizeof(sa_sint_t)); + memset(&SA[0], 0, (size_t)m * sizeof(sa_sint_t)); + + libsais16_merge_compacted_lms_suffixes_32s_omp(T, SA, n, m, f, threads, thread_state); + } + else + { + libsais16_gather_lms_suffixes_32s(T, SA, n); + libsais16_reconstruct_lms_suffixes_omp(SA, n, m, threads); + } +} + +static sa_sint_t libsais16_main_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t fs, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + if (k > 0 && fs / k >= 6) + { + sa_sint_t alignment = (fs - 1024) / k >= 6 ? 1024 : 16; + sa_sint_t * RESTRICT buckets = (fs - alignment) / k >= 6 ? 
(sa_sint_t *)libsais16_align_up(&SA[n + fs - 6 * k - alignment], (size_t)alignment * sizeof(sa_sint_t)) : &SA[n + fs - 6 * k]; + + sa_sint_t m = libsais16_count_and_gather_lms_suffixes_32s_4k_omp(T, SA, n, k, buckets, threads, thread_state); + if (m > 1) + { + memset(SA, 0, ((size_t)n - (size_t)m) * sizeof(sa_sint_t)); + + sa_sint_t first_lms_suffix = SA[n - m]; + sa_sint_t left_suffixes_count = libsais16_initialize_buckets_for_lms_suffixes_radix_sort_32s_6k(T, k, buckets, first_lms_suffix); + + libsais16_radix_sort_lms_suffixes_32s_6k_omp(T, SA, n, m, &buckets[4 * k], threads, thread_state); + libsais16_radix_sort_set_markers_32s_6k_omp(SA, k, &buckets[4 * k], threads); + + if (threads > 1 && n >= 65536) { memset(&SA[(fast_sint_t)n - (fast_sint_t)m], 0, (size_t)m * sizeof(sa_sint_t)); } + + libsais16_initialize_buckets_for_partial_sorting_32s_6k(T, k, buckets, first_lms_suffix, left_suffixes_count); + libsais16_induce_partial_order_32s_6k_omp(T, SA, n, k, buckets, first_lms_suffix, left_suffixes_count, threads, thread_state); + + sa_sint_t names = libsais16_renumber_and_mark_distinct_lms_suffixes_32s_4k_omp(SA, n, m, threads, thread_state); + if (names < m) + { + sa_sint_t f = libsais16_compact_lms_suffixes_32s_omp(T, SA, n, m, fs, threads, thread_state); + + if (libsais16_main_32s(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f, threads, thread_state) != 0) + { + return -2; + } + + libsais16_reconstruct_compacted_lms_suffixes_32s_2k_omp(T, SA, n, k, m, fs, f, buckets, threads, thread_state); + } + else + { + libsais16_count_lms_suffixes_32s_2k(T, n, k, buckets); + } + + libsais16_initialize_buckets_start_and_end_32s_4k(k, buckets); + libsais16_place_lms_suffixes_histogram_32s_4k(SA, n, k, m, buckets); + libsais16_induce_final_order_32s_4k(T, SA, n, k, buckets, threads, thread_state); + } + else + { + SA[0] = SA[n - 1]; + + libsais16_initialize_buckets_start_and_end_32s_6k(k, buckets); + libsais16_place_lms_suffixes_histogram_32s_6k(SA, n, k, m, buckets); + libsais16_induce_final_order_32s_6k(T, SA, n, k, buckets, threads, thread_state); + } + + return 0; + } + else if (k > 0 && fs / k >= 4) + { + sa_sint_t alignment = (fs - 1024) / k >= 4 ? 1024 : 16; + sa_sint_t * RESTRICT buckets = (fs - alignment) / k >= 4 ? 
(sa_sint_t *)libsais16_align_up(&SA[n + fs - 4 * k - alignment], (size_t)alignment * sizeof(sa_sint_t)) : &SA[n + fs - 4 * k]; + + sa_sint_t m = libsais16_count_and_gather_lms_suffixes_32s_2k_omp(T, SA, n, k, buckets, threads, thread_state); + if (m > 1) + { + libsais16_initialize_buckets_for_radix_and_partial_sorting_32s_4k(T, k, buckets, SA[n - m]); + + libsais16_radix_sort_lms_suffixes_32s_2k_omp(T, SA, n, m, &buckets[1], threads, thread_state); + libsais16_radix_sort_set_markers_32s_4k_omp(SA, k, &buckets[1], threads); + + libsais16_place_lms_suffixes_interval_32s_4k(SA, n, k, m - 1, buckets); + libsais16_induce_partial_order_32s_4k_omp(T, SA, n, k, buckets, threads, thread_state); + + sa_sint_t names = libsais16_renumber_and_mark_distinct_lms_suffixes_32s_4k_omp(SA, n, m, threads, thread_state); + if (names < m) + { + sa_sint_t f = libsais16_compact_lms_suffixes_32s_omp(T, SA, n, m, fs, threads, thread_state); + + if (libsais16_main_32s(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f, threads, thread_state) != 0) + { + return -2; + } + + libsais16_reconstruct_compacted_lms_suffixes_32s_2k_omp(T, SA, n, k, m, fs, f, buckets, threads, thread_state); + } + else + { + libsais16_count_lms_suffixes_32s_2k(T, n, k, buckets); + } + } + else + { + SA[0] = SA[n - 1]; + } + + libsais16_initialize_buckets_start_and_end_32s_4k(k, buckets); + libsais16_place_lms_suffixes_histogram_32s_4k(SA, n, k, m, buckets); + libsais16_induce_final_order_32s_4k(T, SA, n, k, buckets, threads, thread_state); + + return 0; + } + else if (k > 0 && fs / k >= 2) + { + sa_sint_t alignment = (fs - 1024) / k >= 2 ? 1024 : 16; + sa_sint_t * RESTRICT buckets = (fs - alignment) / k >= 2 ? (sa_sint_t *)libsais16_align_up(&SA[n + fs - 2 * k - alignment], (size_t)alignment * sizeof(sa_sint_t)) : &SA[n + fs - 2 * k]; + + sa_sint_t m = libsais16_count_and_gather_lms_suffixes_32s_2k_omp(T, SA, n, k, buckets, threads, thread_state); + if (m > 1) + { + libsais16_initialize_buckets_for_lms_suffixes_radix_sort_32s_2k(T, k, buckets, SA[n - m]); + + libsais16_radix_sort_lms_suffixes_32s_2k_omp(T, SA, n, m, &buckets[1], threads, thread_state); + libsais16_place_lms_suffixes_interval_32s_2k(SA, n, k, m - 1, buckets); + + libsais16_initialize_buckets_start_and_end_32s_2k(k, buckets); + libsais16_induce_partial_order_32s_2k_omp(T, SA, n, k, buckets, threads, thread_state); + + sa_sint_t names = libsais16_renumber_and_mark_distinct_lms_suffixes_32s_1k_omp(T, SA, n, m, threads); + if (names < m) + { + sa_sint_t f = libsais16_compact_lms_suffixes_32s_omp(T, SA, n, m, fs, threads, thread_state); + + if (libsais16_main_32s(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f, threads, thread_state) != 0) + { + return -2; + } + + libsais16_reconstruct_compacted_lms_suffixes_32s_2k_omp(T, SA, n, k, m, fs, f, buckets, threads, thread_state); + } + else + { + libsais16_count_lms_suffixes_32s_2k(T, n, k, buckets); + } + } + else + { + SA[0] = SA[n - 1]; + } + + libsais16_initialize_buckets_end_32s_2k(k, buckets); + libsais16_place_lms_suffixes_histogram_32s_2k(SA, n, k, m, buckets); + + libsais16_initialize_buckets_start_and_end_32s_2k(k, buckets); + libsais16_induce_final_order_32s_2k(T, SA, n, k, buckets, threads, thread_state); + + return 0; + } + else + { + sa_sint_t * buffer = fs < k ? (sa_sint_t *)libsais16_alloc_aligned((size_t)k * sizeof(sa_sint_t), 4096) : (sa_sint_t *)NULL; + + sa_sint_t alignment = fs - 1024 >= k ? 1024 : 16; + sa_sint_t * RESTRICT buckets = fs - alignment >= k ? 
(sa_sint_t *)libsais16_align_up(&SA[n + fs - k - alignment], (size_t)alignment * sizeof(sa_sint_t)) : fs >= k ? &SA[n + fs - k] : buffer; + + if (buckets == NULL) { return -2; } + + memset(SA, 0, (size_t)n * sizeof(sa_sint_t)); + + libsais16_count_suffixes_32s(T, n, k, buckets); + libsais16_initialize_buckets_end_32s_1k(k, buckets); + + sa_sint_t m = libsais16_radix_sort_lms_suffixes_32s_1k(T, SA, n, buckets); + if (m > 1) + { + libsais16_induce_partial_order_32s_1k_omp(T, SA, n, k, buckets, threads, thread_state); + + sa_sint_t names = libsais16_renumber_and_mark_distinct_lms_suffixes_32s_1k_omp(T, SA, n, m, threads); + if (names < m) + { + if (buffer != NULL) { libsais16_free_aligned(buffer); buckets = NULL; } + + sa_sint_t f = libsais16_compact_lms_suffixes_32s_omp(T, SA, n, m, fs, threads, thread_state); + + if (libsais16_main_32s(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f, threads, thread_state) != 0) + { + return -2; + } + + libsais16_reconstruct_compacted_lms_suffixes_32s_1k_omp(T, SA, n, m, fs, f, threads, thread_state); + + if (buckets == NULL) { buckets = buffer = (sa_sint_t *)libsais16_alloc_aligned((size_t)k * sizeof(sa_sint_t), 4096); } + if (buckets == NULL) { return -2; } + } + + libsais16_count_suffixes_32s(T, n, k, buckets); + libsais16_initialize_buckets_end_32s_1k(k, buckets); + libsais16_place_lms_suffixes_interval_32s_1k(T, SA, k, m, buckets); + } + + libsais16_induce_final_order_32s_1k(T, SA, n, k, buckets, threads, thread_state); + libsais16_free_aligned(buffer); + + return 0; + } +} + +static sa_sint_t libsais16_main_16u(const uint16_t * T, sa_sint_t * SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t bwt, sa_sint_t r, sa_sint_t * RESTRICT I, sa_sint_t fs, sa_sint_t * freq, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + sa_sint_t m = libsais16_count_and_gather_lms_suffixes_16u_omp(T, SA, n, buckets, threads, thread_state); + + libsais16_initialize_buckets_start_and_end_16u(buckets, freq); + + if (m > 0) + { + sa_sint_t first_lms_suffix = SA[n - m]; + sa_sint_t left_suffixes_count = libsais16_initialize_buckets_for_lms_suffixes_radix_sort_16u(T, buckets, first_lms_suffix); + + if (threads > 1 && n >= 65536) { memset(SA, 0, ((size_t)n - (size_t)m) * sizeof(sa_sint_t)); } + libsais16_radix_sort_lms_suffixes_16u_omp(T, SA, n, m, buckets, threads, thread_state); + if (threads > 1 && n >= 65536) { memset(&SA[(fast_sint_t)n - (fast_sint_t)m], 0, (size_t)m * sizeof(sa_sint_t)); } + + libsais16_initialize_buckets_for_partial_sorting_16u(T, buckets, first_lms_suffix, left_suffixes_count); + libsais16_induce_partial_order_16u_omp(T, SA, n, buckets, first_lms_suffix, left_suffixes_count, threads, thread_state); + + sa_sint_t names = libsais16_renumber_and_gather_lms_suffixes_16u_omp(SA, n, m, fs, threads, thread_state); + if (names < m) + { + if (libsais16_main_32s(SA + n + fs - m, SA, m, names, fs + n - 2 * m, threads, thread_state) != 0) + { + return -2; + } + + libsais16_gather_lms_suffixes_16u_omp(T, SA, n, threads, thread_state); + libsais16_reconstruct_lms_suffixes_omp(SA, n, m, threads); + } + + libsais16_place_lms_suffixes_interval_16u(SA, n, m, buckets); + } + else + { + memset(SA, 0, (size_t)n * sizeof(sa_sint_t)); + } + + return libsais16_induce_final_order_16u_omp(T, SA, n, bwt, r, I, buckets, threads, thread_state); +} + +static sa_sint_t libsais16_main(const uint16_t * T, sa_sint_t * SA, sa_sint_t n, sa_sint_t bwt, sa_sint_t r, sa_sint_t * I, sa_sint_t fs, sa_sint_t * freq, sa_sint_t threads) +{ + 
LIBSAIS_THREAD_STATE * RESTRICT thread_state = threads > 1 ? libsais16_alloc_thread_state(threads) : NULL; + sa_sint_t * RESTRICT buckets = (sa_sint_t *)libsais16_alloc_aligned(8 * ALPHABET_SIZE * sizeof(sa_sint_t), 4096); + + sa_sint_t index = buckets != NULL && (thread_state != NULL || threads == 1) + ? libsais16_main_16u(T, SA, n, buckets, bwt, r, I, fs, freq, threads, thread_state) + : -2; + + libsais16_free_aligned(buckets); + libsais16_free_thread_state(thread_state); + + return index; +} + +static sa_sint_t libsais16_main_ctx(const LIBSAIS_CONTEXT * ctx, const uint16_t * T, sa_sint_t * SA, sa_sint_t n, sa_sint_t bwt, sa_sint_t r, sa_sint_t * I, sa_sint_t fs, sa_sint_t * freq) +{ + return ctx != NULL && (ctx->buckets != NULL && (ctx->thread_state != NULL || ctx->threads == 1)) + ? libsais16_main_16u(T, SA, n, ctx->buckets, bwt, r, I, fs, freq, (sa_sint_t)ctx->threads, ctx->thread_state) + : -2; +} + +static void libsais16_bwt_copy_16u(uint16_t * RESTRICT U, sa_sint_t * RESTRICT A, sa_sint_t n) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = 0, j = (fast_sint_t)n - 7; i < j; i += 8) + { + libsais16_prefetch(&A[i + prefetch_distance]); + + U[i + 0] = (uint16_t)A[i + 0]; + U[i + 1] = (uint16_t)A[i + 1]; + U[i + 2] = (uint16_t)A[i + 2]; + U[i + 3] = (uint16_t)A[i + 3]; + U[i + 4] = (uint16_t)A[i + 4]; + U[i + 5] = (uint16_t)A[i + 5]; + U[i + 6] = (uint16_t)A[i + 6]; + U[i + 7] = (uint16_t)A[i + 7]; + } + + for (j += 7; i < j; i += 1) + { + U[i] = (uint16_t)A[i]; + } +} + +#if defined(_OPENMP) + +static void libsais16_bwt_copy_16u_omp(uint16_t * RESTRICT U, sa_sint_t * RESTRICT A, sa_sint_t n, sa_sint_t threads) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); + fast_sint_t omp_block_stride = ((fast_sint_t)n / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : (fast_sint_t)n - omp_block_start; +#else + UNUSED(threads); + + fast_sint_t omp_block_start = 0; + fast_sint_t omp_block_size = (fast_sint_t)n; +#endif + + libsais16_bwt_copy_16u(U + omp_block_start, A + omp_block_start, (sa_sint_t)omp_block_size); + } +} + +#endif + +void * libsais16_create_ctx(void) +{ + return (void *)libsais16_create_ctx_main(1); +} + +void libsais16_free_ctx(void * ctx) +{ + libsais16_free_ctx_main((LIBSAIS_CONTEXT *)ctx); +} + +int32_t libsais16(const uint16_t * T, int32_t * SA, int32_t n, int32_t fs, int32_t * freq) +{ + if ((T == NULL) || (SA == NULL) || (n < 0) || (fs < 0)) + { + return -1; + } + else if (n < 2) + { + if (n == 1) { SA[0] = 0; } + return 0; + } + + return libsais16_main(T, SA, n, 0, 0, NULL, fs, freq, 1); +} + +int32_t libsais16_ctx(const void * ctx, const uint16_t * T, int32_t * SA, int32_t n, int32_t fs, int32_t * freq) +{ + if ((ctx == NULL) || (T == NULL) || (SA == NULL) || (n < 0) || (fs < 0)) + { + return -1; + } + else if (n < 2) + { + if (n == 1) { SA[0] = 0; } + return 0; + } + + return libsais16_main_ctx((const LIBSAIS_CONTEXT *)ctx, T, SA, n, 0, 0, NULL, fs, freq); +} + +int32_t libsais16_bwt(const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq) +{ + if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0)) + { + return -1; + } + else if (n <= 1) + { + if (n == 1) { U[0] = T[0]; } + return n; + } + + sa_sint_t index = libsais16_main(T, A, n, 1, 0, NULL, fs, freq, 1); + if (index >= 0) + { + index++; + + U[0] = T[n - 1]; + libsais16_bwt_copy_16u(U + 1, A, index - 1); + libsais16_bwt_copy_16u(U + index, A + index, n - index); + } + + return index; +} + +int32_t libsais16_bwt_aux(const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t r, int32_t * I) +{ + if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0) || (r < 2) || ((r & (r - 1)) != 0) || (I == NULL)) + { + return -1; + } + else if (n <= 1) + { + if (n == 1) { U[0] = T[0]; } + + I[0] = n; + return 0; + } + + if (libsais16_main(T, A, n, 1, r, I, fs, freq, 1) != 0) + { + return -2; + } + + U[0] = T[n - 1]; + libsais16_bwt_copy_16u(U + 1, A, I[0] - 1); + libsais16_bwt_copy_16u(U + I[0], A + I[0], n - I[0]); + + return 0; +} + +int32_t libsais16_bwt_ctx(const void * ctx, const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq) +{ + if ((ctx == NULL) || (T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0)) + { + return -1; + } + else if (n <= 1) + { + if (n == 1) { U[0] = T[0]; } + return n; + } + + sa_sint_t index = libsais16_main_ctx((const LIBSAIS_CONTEXT *)ctx, T, A, n, 1, 0, NULL, fs, freq); + if (index >= 0) + { + index++; + + U[0] = T[n - 1]; + +#if defined(_OPENMP) + libsais16_bwt_copy_16u_omp(U + 1, A, index - 1, (sa_sint_t)((const LIBSAIS_CONTEXT *)ctx)->threads); + libsais16_bwt_copy_16u_omp(U + index, A + index, n - index, (sa_sint_t)((const LIBSAIS_CONTEXT *)ctx)->threads); +#else + libsais16_bwt_copy_16u(U + 1, A, index - 1); + libsais16_bwt_copy_16u(U + index, A + index, n - index); +#endif + } + + return index; +} + +int32_t libsais16_bwt_aux_ctx(const void * ctx, const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t r, int32_t * I) +{ + if ((ctx == NULL) || (T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0) || (r < 2) || ((r & (r - 1)) != 0) || (I == NULL)) + { + return -1; + } + else if (n <= 1) + { + if (n == 1) { U[0] = T[0]; } + + I[0] = n; + return 0; + } + + 
if (libsais16_main_ctx((const LIBSAIS_CONTEXT *)ctx, T, A, n, 1, r, I, fs, freq) != 0) + { + return -2; + } + + U[0] = T[n - 1]; + +#if defined(_OPENMP) + libsais16_bwt_copy_16u_omp(U + 1, A, I[0] - 1, (sa_sint_t)((const LIBSAIS_CONTEXT *)ctx)->threads); + libsais16_bwt_copy_16u_omp(U + I[0], A + I[0], n - I[0], (sa_sint_t)((const LIBSAIS_CONTEXT *)ctx)->threads); +#else + libsais16_bwt_copy_16u(U + 1, A, I[0] - 1); + libsais16_bwt_copy_16u(U + I[0], A + I[0], n - I[0]); +#endif + + return 0; +} + +#if defined(_OPENMP) + +void * libsais16_create_ctx_omp(int32_t threads) +{ + if (threads < 0) { return NULL; } + + threads = threads > 0 ? threads : omp_get_max_threads(); + return (void *)libsais16_create_ctx_main(threads); +} + +int32_t libsais16_omp(const uint16_t * T, int32_t * SA, int32_t n, int32_t fs, int32_t * freq, int32_t threads) +{ + if ((T == NULL) || (SA == NULL) || (n < 0) || (fs < 0) || (threads < 0)) + { + return -1; + } + else if (n < 2) + { + if (n == 1) { SA[0] = 0; } + return 0; + } + + threads = threads > 0 ? threads : omp_get_max_threads(); + + return libsais16_main(T, SA, n, 0, 0, NULL, fs, freq, threads); +} + +int32_t libsais16_bwt_omp(const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t threads) +{ + if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0) || (threads < 0)) + { + return -1; + } + else if (n <= 1) + { + if (n == 1) { U[0] = T[0]; } + return n; + } + + threads = threads > 0 ? threads : omp_get_max_threads(); + + sa_sint_t index = libsais16_main(T, A, n, 1, 0, NULL, fs, freq, threads); + if (index >= 0) + { + index++; + + U[0] = T[n - 1]; + libsais16_bwt_copy_16u_omp(U + 1, A, index - 1, threads); + libsais16_bwt_copy_16u_omp(U + index, A + index, n - index, threads); + } + + return index; +} + +int32_t libsais16_bwt_aux_omp(const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t r, int32_t * I, int32_t threads) +{ + if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0) || (r < 2) || ((r & (r - 1)) != 0) || (I == NULL) || (threads < 0)) + { + return -1; + } + else if (n <= 1) + { + if (n == 1) { U[0] = T[0];} + + I[0] = n; + return 0; + } + + threads = threads > 0 ? threads : omp_get_max_threads(); + + if (libsais16_main(T, A, n, 1, r, I, fs, freq, threads) != 0) + { + return -2; + } + + U[0] = T[n - 1]; + libsais16_bwt_copy_16u_omp(U + 1, A, I[0] - 1, threads); + libsais16_bwt_copy_16u_omp(U + I[0], A + I[0], n - I[0], threads); + + return 0; +} + +#endif + +static LIBSAIS_UNBWT_CONTEXT * libsais16_unbwt_create_ctx_main(sa_sint_t threads) +{ + LIBSAIS_UNBWT_CONTEXT * RESTRICT ctx = (LIBSAIS_UNBWT_CONTEXT *)libsais16_alloc_aligned(sizeof(LIBSAIS_UNBWT_CONTEXT), 64); + sa_uint_t * RESTRICT bucket2 = (sa_uint_t *)libsais16_alloc_aligned(ALPHABET_SIZE * sizeof(sa_uint_t), 4096); + uint16_t * RESTRICT fastbits = (uint16_t *)libsais16_alloc_aligned((1 + (1 << UNBWT_FASTBITS)) * sizeof(uint16_t), 4096); + sa_uint_t * RESTRICT buckets = threads > 1 ? 
(sa_uint_t *)libsais16_alloc_aligned((size_t)threads * ALPHABET_SIZE * sizeof(sa_uint_t), 4096) : NULL; + + if (ctx != NULL && bucket2 != NULL && fastbits != NULL && (buckets != NULL || threads == 1)) + { + ctx->bucket2 = bucket2; + ctx->fastbits = fastbits; + ctx->buckets = buckets; + ctx->threads = threads; + + return ctx; + } + + libsais16_free_aligned(buckets); + libsais16_free_aligned(fastbits); + libsais16_free_aligned(bucket2); + libsais16_free_aligned(ctx); + + return NULL; +} + +static void libsais16_unbwt_free_ctx_main(LIBSAIS_UNBWT_CONTEXT * ctx) +{ + if (ctx != NULL) + { + libsais16_free_aligned(ctx->buckets); + libsais16_free_aligned(ctx->fastbits); + libsais16_free_aligned(ctx->bucket2); + libsais16_free_aligned(ctx); + } +} + +static void libsais16_unbwt_compute_histogram(const uint16_t * RESTRICT T, fast_sint_t n, sa_uint_t * RESTRICT count) +{ + fast_sint_t i; for (i = 0; i < n; i += 1) { count[T[i]]++; } +} + +static void libsais16_unbwt_calculate_fastbits(sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift) +{ + fast_uint_t v, w, sum; + for (v = 0, sum = 1, w = 0; w < ALPHABET_SIZE; ++w) + { + fast_uint_t prev = sum; sum += bucket2[w]; bucket2[w] = (sa_uint_t)prev; + if (prev != sum) + { + for (; v <= ((sum - 1) >> shift); ++v) { fastbits[v] = (uint16_t)w; } + } + } +} + +static void libsais16_unbwt_calculate_P(const uint16_t * RESTRICT T, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, fast_uint_t index, fast_sint_t omp_block_start, fast_sint_t omp_block_end) +{ + { + fast_sint_t i = omp_block_start, j = (fast_sint_t)index; if (omp_block_end < j) { j = omp_block_end; } + for (; i < j; ++i) { fast_uint_t c = T[i]; P[bucket2[c]++] = (sa_uint_t)i; } + } + + { + fast_sint_t i = (fast_sint_t)index, j = omp_block_end; if (omp_block_start > i) { i = omp_block_start; } + for (T -= 1, i += 1; i <= j; ++i) { fast_uint_t c = T[i]; P[bucket2[c]++] = (sa_uint_t)i; } + } +} + +static void libsais16_unbwt_init_single(const uint16_t * RESTRICT T, sa_uint_t * RESTRICT P, sa_sint_t n, const sa_sint_t * freq, const sa_uint_t * RESTRICT I, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits) +{ + fast_uint_t index = I[0]; + fast_uint_t shift = 0; while ((n >> shift) > (1 << UNBWT_FASTBITS)) { shift++; } + + if (freq != NULL) + { + memcpy(bucket2, freq, ALPHABET_SIZE * sizeof(sa_uint_t)); + } + else + { + memset(bucket2, 0, ALPHABET_SIZE * sizeof(sa_uint_t)); + libsais16_unbwt_compute_histogram(T, n, bucket2); + } + + libsais16_unbwt_calculate_fastbits(bucket2, fastbits, shift); + libsais16_unbwt_calculate_P(T, P, bucket2, index, 0, n); +} + +#if defined(_OPENMP) + +static void libsais16_unbwt_init_parallel(const uint16_t * RESTRICT T, sa_uint_t * RESTRICT P, sa_sint_t n, const sa_sint_t * freq, const sa_uint_t * RESTRICT I, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, sa_uint_t * RESTRICT buckets, sa_sint_t threads) +{ + fast_uint_t index = I[0]; + fast_uint_t shift = 0; while ((n >> shift) > (1 << UNBWT_FASTBITS)) { shift++; } + + #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536) + { + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); + + if (omp_num_threads == 1) + { + libsais16_unbwt_init_single(T, P, n, freq, I, bucket2, fastbits); + } + else + { + { + sa_uint_t * RESTRICT bucket2_local = buckets + omp_thread_num * ALPHABET_SIZE; + fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * 
omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start; + + memset(bucket2_local, 0, ALPHABET_SIZE * sizeof(sa_uint_t)); + libsais16_unbwt_compute_histogram(T + omp_block_start, omp_block_size, bucket2_local); + } + + #pragma omp barrier + + { + sa_uint_t * RESTRICT bucket2_temp = buckets; + fast_sint_t omp_block_stride = (ALPHABET_SIZE / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : ALPHABET_SIZE - omp_block_start; + + memset(bucket2 + omp_block_start, 0, omp_block_size * sizeof(sa_uint_t)); + + fast_sint_t t; + for (t = 0; t < omp_num_threads; ++t, bucket2_temp += ALPHABET_SIZE) + { + fast_sint_t c; for (c = omp_block_start; c < omp_block_start + omp_block_size; c += 1) { sa_uint_t A = bucket2[c], B = bucket2_temp[c]; bucket2[c] = A + B; bucket2_temp[c] = A; } + } + } + + #pragma omp barrier + + #pragma omp master + { + libsais16_unbwt_calculate_fastbits(bucket2, fastbits, shift); + } + + #pragma omp barrier + + { + sa_uint_t * RESTRICT bucket2_local = buckets + omp_thread_num * ALPHABET_SIZE; + fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start; + + fast_sint_t c; for (c = 0; c < ALPHABET_SIZE; c += 1) { sa_uint_t A = bucket2[c], B = bucket2_local[c]; bucket2_local[c] = A + B; } + + libsais16_unbwt_calculate_P(T, P, bucket2_local, index, omp_block_start, omp_block_start + omp_block_size); + } + + #pragma omp barrier + + #pragma omp master + { + memcpy(bucket2, buckets + (omp_num_threads - 1) * ALPHABET_SIZE, ALPHABET_SIZE * sizeof(sa_uint_t)); + } + } + } +} + +#endif + +static void libsais16_unbwt_decode_1(uint16_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t * i0, fast_uint_t k) +{ + uint16_t * RESTRICT U0 = U; + + fast_uint_t i, p0 = *i0; + + for (i = 0; i != k; ++i) + { + uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while (bucket2[c0] <= p0); } p0 = P[p0]; U0[i] = c0; + } + + *i0 = p0; +} + +static void libsais16_unbwt_decode_2(uint16_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0, fast_uint_t * i1, fast_uint_t k) +{ + uint16_t * RESTRICT U0 = U; + uint16_t * RESTRICT U1 = U0 + r; + + fast_uint_t i, p0 = *i0, p1 = *i1; + + for (i = 0; i != k; ++i) + { + uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while (bucket2[c0] <= p0); } p0 = P[p0]; U0[i] = c0; + uint16_t c1 = fastbits[p1 >> shift]; if (bucket2[c1] <= p1) { do { c1++; } while (bucket2[c1] <= p1); } p1 = P[p1]; U1[i] = c1; + } + + *i0 = p0; *i1 = p1; +} + +static void libsais16_unbwt_decode_3(uint16_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0, fast_uint_t * i1, fast_uint_t * i2, fast_uint_t k) +{ + uint16_t * RESTRICT U0 = U; + uint16_t * RESTRICT U1 = U0 + r; + uint16_t * RESTRICT U2 = U1 + r; + + fast_uint_t i, p0 = *i0, p1 = *i1, p2 = *i2; + + for (i = 0; i != k; ++i) + { + uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while (bucket2[c0] <= p0); } p0 = P[p0]; 
U0[i] = c0; + uint16_t c1 = fastbits[p1 >> shift]; if (bucket2[c1] <= p1) { do { c1++; } while (bucket2[c1] <= p1); } p1 = P[p1]; U1[i] = c1; + uint16_t c2 = fastbits[p2 >> shift]; if (bucket2[c2] <= p2) { do { c2++; } while (bucket2[c2] <= p2); } p2 = P[p2]; U2[i] = c2; + } + + *i0 = p0; *i1 = p1; *i2 = p2; +} + +static void libsais16_unbwt_decode_4(uint16_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0, fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3, fast_uint_t k) +{ + uint16_t * RESTRICT U0 = U; + uint16_t * RESTRICT U1 = U0 + r; + uint16_t * RESTRICT U2 = U1 + r; + uint16_t * RESTRICT U3 = U2 + r; + + fast_uint_t i, p0 = *i0, p1 = *i1, p2 = *i2, p3 = *i3; + + for (i = 0; i != k; ++i) + { + uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while (bucket2[c0] <= p0); } p0 = P[p0]; U0[i] = c0; + uint16_t c1 = fastbits[p1 >> shift]; if (bucket2[c1] <= p1) { do { c1++; } while (bucket2[c1] <= p1); } p1 = P[p1]; U1[i] = c1; + uint16_t c2 = fastbits[p2 >> shift]; if (bucket2[c2] <= p2) { do { c2++; } while (bucket2[c2] <= p2); } p2 = P[p2]; U2[i] = c2; + uint16_t c3 = fastbits[p3 >> shift]; if (bucket2[c3] <= p3) { do { c3++; } while (bucket2[c3] <= p3); } p3 = P[p3]; U3[i] = c3; + } + + *i0 = p0; *i1 = p1; *i2 = p2; *i3 = p3; +} + +static void libsais16_unbwt_decode_5(uint16_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0, fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3, fast_uint_t * i4, fast_uint_t k) +{ + uint16_t * RESTRICT U0 = U; + uint16_t * RESTRICT U1 = U0 + r; + uint16_t * RESTRICT U2 = U1 + r; + uint16_t * RESTRICT U3 = U2 + r; + uint16_t * RESTRICT U4 = U3 + r; + + fast_uint_t i, p0 = *i0, p1 = *i1, p2 = *i2, p3 = *i3, p4 = *i4; + + for (i = 0; i != k; ++i) + { + uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while (bucket2[c0] <= p0); } p0 = P[p0]; U0[i] = c0; + uint16_t c1 = fastbits[p1 >> shift]; if (bucket2[c1] <= p1) { do { c1++; } while (bucket2[c1] <= p1); } p1 = P[p1]; U1[i] = c1; + uint16_t c2 = fastbits[p2 >> shift]; if (bucket2[c2] <= p2) { do { c2++; } while (bucket2[c2] <= p2); } p2 = P[p2]; U2[i] = c2; + uint16_t c3 = fastbits[p3 >> shift]; if (bucket2[c3] <= p3) { do { c3++; } while (bucket2[c3] <= p3); } p3 = P[p3]; U3[i] = c3; + uint16_t c4 = fastbits[p4 >> shift]; if (bucket2[c4] <= p4) { do { c4++; } while (bucket2[c4] <= p4); } p4 = P[p4]; U4[i] = c4; + } + + *i0 = p0; *i1 = p1; *i2 = p2; *i3 = p3; *i4 = p4; +} + +static void libsais16_unbwt_decode_6(uint16_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0, fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3, fast_uint_t * i4, fast_uint_t * i5, fast_uint_t k) +{ + uint16_t * RESTRICT U0 = U; + uint16_t * RESTRICT U1 = U0 + r; + uint16_t * RESTRICT U2 = U1 + r; + uint16_t * RESTRICT U3 = U2 + r; + uint16_t * RESTRICT U4 = U3 + r; + uint16_t * RESTRICT U5 = U4 + r; + + fast_uint_t i, p0 = *i0, p1 = *i1, p2 = *i2, p3 = *i3, p4 = *i4, p5 = *i5; + + for (i = 0; i != k; ++i) + { + uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while (bucket2[c0] <= p0); } p0 = P[p0]; U0[i] = c0; + uint16_t c1 = fastbits[p1 >> shift]; if (bucket2[c1] <= p1) { do { c1++; } while (bucket2[c1] <= p1); } p1 = P[p1]; U1[i] = c1; + uint16_t c2 = 
fastbits[p2 >> shift]; if (bucket2[c2] <= p2) { do { c2++; } while (bucket2[c2] <= p2); } p2 = P[p2]; U2[i] = c2; + uint16_t c3 = fastbits[p3 >> shift]; if (bucket2[c3] <= p3) { do { c3++; } while (bucket2[c3] <= p3); } p3 = P[p3]; U3[i] = c3; + uint16_t c4 = fastbits[p4 >> shift]; if (bucket2[c4] <= p4) { do { c4++; } while (bucket2[c4] <= p4); } p4 = P[p4]; U4[i] = c4; + uint16_t c5 = fastbits[p5 >> shift]; if (bucket2[c5] <= p5) { do { c5++; } while (bucket2[c5] <= p5); } p5 = P[p5]; U5[i] = c5; + } + + *i0 = p0; *i1 = p1; *i2 = p2; *i3 = p3; *i4 = p4; *i5 = p5; +} + +static void libsais16_unbwt_decode_7(uint16_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0, fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3, fast_uint_t * i4, fast_uint_t * i5, fast_uint_t * i6, fast_uint_t k) +{ + uint16_t * RESTRICT U0 = U; + uint16_t * RESTRICT U1 = U0 + r; + uint16_t * RESTRICT U2 = U1 + r; + uint16_t * RESTRICT U3 = U2 + r; + uint16_t * RESTRICT U4 = U3 + r; + uint16_t * RESTRICT U5 = U4 + r; + uint16_t * RESTRICT U6 = U5 + r; + + fast_uint_t i, p0 = *i0, p1 = *i1, p2 = *i2, p3 = *i3, p4 = *i4, p5 = *i5, p6 = *i6; + + for (i = 0; i != k; ++i) + { + uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while (bucket2[c0] <= p0); } p0 = P[p0]; U0[i] = c0; + uint16_t c1 = fastbits[p1 >> shift]; if (bucket2[c1] <= p1) { do { c1++; } while (bucket2[c1] <= p1); } p1 = P[p1]; U1[i] = c1; + uint16_t c2 = fastbits[p2 >> shift]; if (bucket2[c2] <= p2) { do { c2++; } while (bucket2[c2] <= p2); } p2 = P[p2]; U2[i] = c2; + uint16_t c3 = fastbits[p3 >> shift]; if (bucket2[c3] <= p3) { do { c3++; } while (bucket2[c3] <= p3); } p3 = P[p3]; U3[i] = c3; + uint16_t c4 = fastbits[p4 >> shift]; if (bucket2[c4] <= p4) { do { c4++; } while (bucket2[c4] <= p4); } p4 = P[p4]; U4[i] = c4; + uint16_t c5 = fastbits[p5 >> shift]; if (bucket2[c5] <= p5) { do { c5++; } while (bucket2[c5] <= p5); } p5 = P[p5]; U5[i] = c5; + uint16_t c6 = fastbits[p6 >> shift]; if (bucket2[c6] <= p6) { do { c6++; } while (bucket2[c6] <= p6); } p6 = P[p6]; U6[i] = c6; + } + + *i0 = p0; *i1 = p1; *i2 = p2; *i3 = p3; *i4 = p4; *i5 = p5; *i6 = p6; +} + +static void libsais16_unbwt_decode_8(uint16_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0, fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3, fast_uint_t * i4, fast_uint_t * i5, fast_uint_t * i6, fast_uint_t * i7, fast_uint_t k) +{ + uint16_t * RESTRICT U0 = U; + uint16_t * RESTRICT U1 = U0 + r; + uint16_t * RESTRICT U2 = U1 + r; + uint16_t * RESTRICT U3 = U2 + r; + uint16_t * RESTRICT U4 = U3 + r; + uint16_t * RESTRICT U5 = U4 + r; + uint16_t * RESTRICT U6 = U5 + r; + uint16_t * RESTRICT U7 = U6 + r; + + fast_uint_t i, p0 = *i0, p1 = *i1, p2 = *i2, p3 = *i3, p4 = *i4, p5 = *i5, p6 = *i6, p7 = *i7; + + for (i = 0; i != k; ++i) + { + uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while (bucket2[c0] <= p0); } p0 = P[p0]; U0[i] = c0; + uint16_t c1 = fastbits[p1 >> shift]; if (bucket2[c1] <= p1) { do { c1++; } while (bucket2[c1] <= p1); } p1 = P[p1]; U1[i] = c1; + uint16_t c2 = fastbits[p2 >> shift]; if (bucket2[c2] <= p2) { do { c2++; } while (bucket2[c2] <= p2); } p2 = P[p2]; U2[i] = c2; + uint16_t c3 = fastbits[p3 >> shift]; if (bucket2[c3] <= p3) { do { c3++; } while (bucket2[c3] <= p3); } p3 = P[p3]; U3[i] = c3; + uint16_t c4 = fastbits[p4 >> 
shift]; if (bucket2[c4] <= p4) { do { c4++; } while (bucket2[c4] <= p4); } p4 = P[p4]; U4[i] = c4; + uint16_t c5 = fastbits[p5 >> shift]; if (bucket2[c5] <= p5) { do { c5++; } while (bucket2[c5] <= p5); } p5 = P[p5]; U5[i] = c5; + uint16_t c6 = fastbits[p6 >> shift]; if (bucket2[c6] <= p6) { do { c6++; } while (bucket2[c6] <= p6); } p6 = P[p6]; U6[i] = c6; + uint16_t c7 = fastbits[p7 >> shift]; if (bucket2[c7] <= p7) { do { c7++; } while (bucket2[c7] <= p7); } p7 = P[p7]; U7[i] = c7; + } + + *i0 = p0; *i1 = p1; *i2 = p2; *i3 = p3; *i4 = p4; *i5 = p5; *i6 = p6; *i7 = p7; +} + +static void libsais16_unbwt_decode(uint16_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_sint_t n, sa_sint_t r, const sa_uint_t * RESTRICT I, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_sint_t blocks, fast_uint_t reminder) +{ + fast_uint_t shift = 0; while ((n >> shift) > (1 << UNBWT_FASTBITS)) { shift++; } + fast_uint_t offset = 0; + + while (blocks > 8) + { + fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3], i4 = I[4], i5 = I[5], i6 = I[6], i7 = I[7]; + libsais16_unbwt_decode_8(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5, &i6, &i7, (fast_uint_t)r); + I += 8; blocks -= 8; offset += 8 * (fast_uint_t)r; + } + + if (blocks == 1) + { + fast_uint_t i0 = I[0]; + libsais16_unbwt_decode_1(U + offset, P, bucket2, fastbits, shift, &i0, reminder); + } + else if (blocks == 2) + { + fast_uint_t i0 = I[0], i1 = I[1]; + libsais16_unbwt_decode_2(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, reminder); + libsais16_unbwt_decode_1(U + offset + reminder, P, bucket2, fastbits, shift, &i0, ((fast_uint_t)r) - reminder); + } + else if (blocks == 3) + { + fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2]; + libsais16_unbwt_decode_3(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, reminder); + libsais16_unbwt_decode_2(U + offset + reminder, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, ((fast_uint_t)r) - reminder); + } + else if (blocks == 4) + { + fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3]; + libsais16_unbwt_decode_4(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, reminder); + libsais16_unbwt_decode_3(U + offset + reminder, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, ((fast_uint_t)r) - reminder); + } + else if (blocks == 5) + { + fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3], i4 = I[4]; + libsais16_unbwt_decode_5(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, reminder); + libsais16_unbwt_decode_4(U + offset + reminder, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, ((fast_uint_t)r) - reminder); + } + else if (blocks == 6) + { + fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3], i4 = I[4], i5 = I[5]; + libsais16_unbwt_decode_6(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5, reminder); + libsais16_unbwt_decode_5(U + offset + reminder, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, ((fast_uint_t)r) - reminder); + } + else if (blocks == 7) + { + fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3], i4 = I[4], i5 = I[5], i6 = I[6]; + libsais16_unbwt_decode_7(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5, &i6, reminder); + libsais16_unbwt_decode_6(U + offset + reminder, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5, ((fast_uint_t)r) - reminder); + } + else + { + fast_uint_t i0 = I[0], 
i1 = I[1], i2 = I[2], i3 = I[3], i4 = I[4], i5 = I[5], i6 = I[6], i7 = I[7]; + libsais16_unbwt_decode_8(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5, &i6, &i7, reminder); + libsais16_unbwt_decode_7(U + offset + reminder, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5, &i6, ((fast_uint_t)r) - reminder); + } +} + +static void libsais16_unbwt_decode_omp(uint16_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_sint_t n, sa_sint_t r, const sa_uint_t * RESTRICT I, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, sa_sint_t threads) +{ + fast_sint_t blocks = 1 + (((fast_sint_t)n - 1) / (fast_sint_t)r); + fast_uint_t reminder = (fast_uint_t)n - ((fast_uint_t)r * ((fast_uint_t)blocks - 1)); + +#if defined(_OPENMP) + fast_sint_t max_threads = blocks < threads ? blocks : threads; + #pragma omp parallel num_threads(max_threads) if(max_threads > 1 && n >= 65536) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + + fast_sint_t omp_block_stride = blocks / omp_num_threads; + fast_sint_t omp_block_reminder = blocks % omp_num_threads; + fast_sint_t omp_block_size = omp_block_stride + (omp_thread_num < omp_block_reminder); + fast_sint_t omp_block_start = omp_block_stride * omp_thread_num + (omp_thread_num < omp_block_reminder ? omp_thread_num : omp_block_reminder); + + libsais16_unbwt_decode(U + r * omp_block_start, P, n, r, I + omp_block_start, bucket2, fastbits, omp_block_size, omp_thread_num < omp_num_threads - 1 ? (fast_uint_t)r : reminder); + } +} + +static sa_sint_t libsais16_unbwt_core(const uint16_t * RESTRICT T, uint16_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_sint_t n, const sa_sint_t * freq, sa_sint_t r, const sa_uint_t * RESTRICT I, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, sa_uint_t * RESTRICT buckets, sa_sint_t threads) +{ +#if defined(_OPENMP) + if (threads > 1 && n >= 262144) + { + libsais16_unbwt_init_parallel(T, P, n, freq, I, bucket2, fastbits, buckets, threads); + } + else +#else + UNUSED(buckets); +#endif + { + libsais16_unbwt_init_single(T, P, n, freq, I, bucket2, fastbits); + } + + libsais16_unbwt_decode_omp(U, P, n, r, I, bucket2, fastbits, threads); + return 0; +} + +static sa_sint_t libsais16_unbwt_main(const uint16_t * T, uint16_t * U, sa_uint_t * P, sa_sint_t n, const sa_sint_t * freq, sa_sint_t r, const sa_uint_t * I, sa_sint_t threads) +{ + fast_uint_t shift = 0; while ((n >> shift) > (1 << UNBWT_FASTBITS)) { shift++; } + + sa_uint_t * RESTRICT bucket2 = (sa_uint_t *)libsais16_alloc_aligned(ALPHABET_SIZE * sizeof(sa_uint_t), 4096); + uint16_t * RESTRICT fastbits = (uint16_t *)libsais16_alloc_aligned(((size_t)1 + (size_t)(n >> shift)) * sizeof(uint16_t), 4096); + sa_uint_t * RESTRICT buckets = threads > 1 && n >= 262144 ? (sa_uint_t *)libsais16_alloc_aligned((size_t)threads * ALPHABET_SIZE * sizeof(sa_uint_t), 4096) : NULL; + + sa_sint_t index = bucket2 != NULL && fastbits != NULL && (buckets != NULL || threads == 1 || n < 262144) + ? 
libsais16_unbwt_core(T, U, P, n, freq, r, I, bucket2, fastbits, buckets, threads) + : -2; + + libsais16_free_aligned(buckets); + libsais16_free_aligned(fastbits); + libsais16_free_aligned(bucket2); + + return index; +} + +static sa_sint_t libsais16_unbwt_main_ctx(const LIBSAIS_UNBWT_CONTEXT * ctx, const uint16_t * T, uint16_t * U, sa_uint_t * P, sa_sint_t n, const sa_sint_t * freq, sa_sint_t r, const sa_uint_t * I) +{ + return ctx != NULL && ctx->bucket2 != NULL && ctx->fastbits != NULL && (ctx->buckets != NULL || ctx->threads == 1) + ? libsais16_unbwt_core(T, U, P, n, freq, r, I, ctx->bucket2, ctx->fastbits, ctx->buckets, (sa_sint_t)ctx->threads) + : -2; +} + +void * libsais16_unbwt_create_ctx(void) +{ + return (void *)libsais16_unbwt_create_ctx_main(1); +} + +void libsais16_unbwt_free_ctx(void * ctx) +{ + libsais16_unbwt_free_ctx_main((LIBSAIS_UNBWT_CONTEXT *)ctx); +} + +int32_t libsais16_unbwt(const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t i) +{ + return libsais16_unbwt_aux(T, U, A, n, freq, n, &i); +} + +int32_t libsais16_unbwt_ctx(const void * ctx, const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t i) +{ + return libsais16_unbwt_aux_ctx(ctx, T, U, A, n, freq, n, &i); +} + +int32_t libsais16_unbwt_aux(const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t r, const int32_t * I) +{ + if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || ((r != n) && ((r < 2) || ((r & (r - 1)) != 0))) || (I == NULL)) + { + return -1; + } + else if (n <= 1) + { + if (I[0] != n) { return -1; } + if (n == 1) { U[0] = T[0]; } + return 0; + } + + fast_sint_t t; for (t = 0; t <= (n - 1) / r; ++t) { if (I[t] <= 0 || I[t] > n) { return -1; } } + + return libsais16_unbwt_main(T, U, (sa_uint_t *)A, n, freq, r, (const sa_uint_t *)I, 1); +} + +int32_t libsais16_unbwt_aux_ctx(const void * ctx, const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t r, const int32_t * I) +{ + if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || ((r != n) && ((r < 2) || ((r & (r - 1)) != 0))) || (I == NULL)) + { + return -1; + } + else if (n <= 1) + { + if (I[0] != n) { return -1; } + if (n == 1) { U[0] = T[0]; } + return 0; + } + + fast_sint_t t; for (t = 0; t <= (n - 1) / r; ++t) { if (I[t] <= 0 || I[t] > n) { return -1; } } + + return libsais16_unbwt_main_ctx((const LIBSAIS_UNBWT_CONTEXT *)ctx, T, U, (sa_uint_t *)A, n, freq, r, (const sa_uint_t *)I); +} + +#if defined(_OPENMP) + +void * libsais16_unbwt_create_ctx_omp(int32_t threads) +{ + if (threads < 0) { return NULL; } + + threads = threads > 0 ? threads : omp_get_max_threads(); + return (void *)libsais16_unbwt_create_ctx_main(threads); +} + +int32_t libsais16_unbwt_omp(const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t i, int32_t threads) +{ + return libsais16_unbwt_aux_omp(T, U, A, n, freq, n, &i, threads); +} + +int32_t libsais16_unbwt_aux_omp(const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t r, const int32_t * I, int32_t threads) +{ + if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || ((r != n) && ((r < 2) || ((r & (r - 1)) != 0))) || (I == NULL) || (threads < 0)) + { + return -1; + } + else if (n <= 1) + { + if (I[0] != n) { return -1; } + if (n == 1) { U[0] = T[0]; } + return 0; + } + + fast_sint_t t; for (t = 0; t <= (n - 1) / r; ++t) { if (I[t] <= 0 || I[t] > n) { return -1; } } + + threads = threads > 0 ? 
threads : omp_get_max_threads(); + return libsais16_unbwt_main(T, U, (sa_uint_t *)A, n, freq, r, (const sa_uint_t *)I, threads); +} + +#endif diff --git a/libsais/libsais16.h b/libsais/libsais16.h new file mode 100644 index 0000000..c577058 --- /dev/null +++ b/libsais/libsais16.h @@ -0,0 +1,285 @@ +/*-- + +This file is a part of libsais, a library for linear time +suffix array and burrows wheeler transform construction. + + Copyright (c) 2021 Ilya Grebnov + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + +Please see the file LICENSE for full copyright information. + +--*/ + +#ifndef LIBSAIS16_H +#define LIBSAIS16_H 1 + +#ifdef __cplusplus +extern "C" { +#endif + + #include + + /** + * Creates the libsais16 context that allows reusing allocated memory with each libsais16 operation. + * In multi-threaded environments, use one context per thread for parallel executions. + * @return the libsais16 context, NULL otherwise. + */ + void * libsais16_create_ctx(void); + +#if defined(_OPENMP) + /** + * Creates the libsais16 context that allows reusing allocated memory with each parallel libsais16 operation using OpenMP. + * In multi-threaded environments, use one context per thread for parallel executions. + * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). + * @return the libsais16 context, NULL otherwise. + */ + void * libsais16_create_ctx_omp(int32_t threads); +#endif + + /** + * Destroys the libsass context and free previusly allocated memory. + * @param ctx The libsais16 context (can be NULL). + */ + void libsais16_free_ctx(void * ctx); + + /** + * Constructs the suffix array of a given 16-bit string. + * @param T [0..n-1] The input 16-bit string. + * @param SA [0..n-1+fs] The output array of suffixes. + * @param n The length of the given 16-bit string. + * @param fs The extra space available at the end of SA array (can be 0). + * @param freq [0..65535] The output 16-bit symbol frequency table (can be NULL). + * @return 0 if no error occurred, -1 or -2 otherwise. + */ + int32_t libsais16(const uint16_t * T, int32_t * SA, int32_t n, int32_t fs, int32_t * freq); + + /** + * Constructs the suffix array of a given 16-bit string using libsais16 context. + * @param ctx The libsais16 context. + * @param T [0..n-1] The input 16-bit string. + * @param SA [0..n-1+fs] The output array of suffixes. + * @param n The length of the given 16-bit string. + * @param fs The extra space available at the end of SA array (can be 0). + * @param freq [0..65535] The output 16-bit symbol frequency table (can be NULL). + * @return 0 if no error occurred, -1 or -2 otherwise. + */ + int32_t libsais16_ctx(const void * ctx, const uint16_t * T, int32_t * SA, int32_t n, int32_t fs, int32_t * freq); + +#if defined(_OPENMP) + /** + * Constructs the suffix array of a given 16-bit string in parallel using OpenMP. + * @param T [0..n-1] The input 16-bit string. + * @param SA [0..n-1+fs] The output array of suffixes. + * @param n The length of the given 16-bit string. 
+ * @param fs The extra space available at the end of SA array (can be 0). + * @param freq [0..65535] The output 16-bit symbol frequency table (can be NULL). + * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). + * @return 0 if no error occurred, -1 or -2 otherwise. + */ + int32_t libsais16_omp(const uint16_t * T, int32_t * SA, int32_t n, int32_t fs, int32_t * freq, int32_t threads); +#endif + + /** + * Constructs the burrows-wheeler transformed 16-bit string of a given 16-bit string. + * @param T [0..n-1] The input 16-bit string. + * @param U [0..n-1] The output 16-bit string (can be T). + * @param A [0..n-1+fs] The temporary array. + * @param n The length of the given 16-bit string. + * @param fs The extra space available at the end of A array (can be 0). + * @param freq [0..65535] The output 16-bit symbol frequency table (can be NULL). + * @return The primary index if no error occurred, -1 or -2 otherwise. + */ + int32_t libsais16_bwt(const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq); + + /** + * Constructs the burrows-wheeler transformed 16-bit string of a given 16-bit string with auxiliary indexes. + * @param T [0..n-1] The input 16-bit string. + * @param U [0..n-1] The output 16-bit string (can be T). + * @param A [0..n-1+fs] The temporary array. + * @param n The length of the given 16-bit string. + * @param fs The extra space available at the end of A array (can be 0). + * @param freq [0..65535] The output 16-bit symbol frequency table (can be NULL). + * @param r The sampling rate for auxiliary indexes (must be power of 2). + * @param I [0..(n-1)/r] The output auxiliary indexes. + * @return 0 if no error occurred, -1 or -2 otherwise. + */ + int32_t libsais16_bwt_aux(const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t r, int32_t * I); + + /** + * Constructs the burrows-wheeler transformed 16-bit string of a given 16-bit string using libsais16 context. + * @param ctx The libsais16 context. + * @param T [0..n-1] The input 16-bit string. + * @param U [0..n-1] The output 16-bit string (can be T). + * @param A [0..n-1+fs] The temporary array. + * @param n The length of the given 16-bit string. + * @param fs The extra space available at the end of A array (can be 0). + * @param freq [0..65535] The output 16-bit symbol frequency table (can be NULL). + * @return The primary index if no error occurred, -1 or -2 otherwise. + */ + int32_t libsais16_bwt_ctx(const void * ctx, const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq); + + /** + * Constructs the burrows-wheeler transformed 16-bit string of a given 16-bit string with auxiliary indexes using libsais16 context. + * @param ctx The libsais16 context. + * @param T [0..n-1] The input 16-bit string. + * @param U [0..n-1] The output 16-bit string (can be T). + * @param A [0..n-1+fs] The temporary array. + * @param n The length of the given 16-bit string. + * @param fs The extra space available at the end of A array (can be 0). + * @param freq [0..65535] The output 16-bit symbol frequency table (can be NULL). + * @param r The sampling rate for auxiliary indexes (must be power of 2). + * @param I [0..(n-1)/r] The output auxiliary indexes. + * @return 0 if no error occurred, -1 or -2 otherwise. 
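+ *
+ * Editor's note (clarification, not part of the original header): with a
+ * sampling rate r, the I array holds 1 + (n - 1) / r auxiliary indexes, one
+ * per r-symbol block of the transformed string; for example, n = 1000000 and
+ * r = 131072 (a power of two) give 8 entries. The same r and I are later
+ * passed to libsais16_unbwt_aux (or libsais16_unbwt_aux_omp), which uses
+ * them to invert the blocks independently of each other.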
+ */ + int32_t libsais16_bwt_aux_ctx(const void * ctx, const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t r, int32_t * I); + +#if defined(_OPENMP) + /** + * Constructs the burrows-wheeler transformed 16-bit string of a given 16-bit string in parallel using OpenMP. + * @param T [0..n-1] The input 16-bit string. + * @param U [0..n-1] The output 16-bit string (can be T). + * @param A [0..n-1+fs] The temporary array. + * @param n The length of the given 16-bit string. + * @param fs The extra space available at the end of A array (can be 0). + * @param freq [0..65535] The output 16-bit symbol frequency table (can be NULL). + * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). + * @return The primary index if no error occurred, -1 or -2 otherwise. + */ + int32_t libsais16_bwt_omp(const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t threads); + + /** + * Constructs the burrows-wheeler transformed 16-bit string of a given 16-bit string with auxiliary indexes in parallel using OpenMP. + * @param T [0..n-1] The input 16-bit string. + * @param U [0..n-1] The output 16-bit string (can be T). + * @param A [0..n-1+fs] The temporary array. + * @param n The length of the given 16-bit string. + * @param fs The extra space available at the end of A array (can be 0). + * @param freq [0..65535] The output 16-bit symbol frequency table (can be NULL). + * @param r The sampling rate for auxiliary indexes (must be power of 2). + * @param I [0..(n-1)/r] The output auxiliary indexes. + * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). + * @return 0 if no error occurred, -1 or -2 otherwise. + */ + int32_t libsais16_bwt_aux_omp(const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t r, int32_t * I, int32_t threads); +#endif + + /** + * Creates the libsais16 reverse BWT context that allows reusing allocated memory with each libsais16_unbwt_* operation. + * In multi-threaded environments, use one context per thread for parallel executions. + * @return the libsais16 context, NULL otherwise. + */ + void * libsais16_unbwt_create_ctx(void); + +#if defined(_OPENMP) + /** + * Creates the libsais16 reverse BWT context that allows reusing allocated memory with each parallel libsais16_unbwt_* operation using OpenMP. + * In multi-threaded environments, use one context per thread for parallel executions. + * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). + * @return the libsais16 context, NULL otherwise. + */ + void * libsais16_unbwt_create_ctx_omp(int32_t threads); +#endif + + /** + * Destroys the libsass reverse BWT context and free previusly allocated memory. + * @param ctx The libsais16 context (can be NULL). + */ + void libsais16_unbwt_free_ctx(void * ctx); + + /** + * Constructs the original 16-bit string from a given burrows-wheeler transformed 16-bit string with primary index. + * @param T [0..n-1] The input 16-bit string. + * @param U [0..n-1] The output 16-bit string (can be T). + * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1 size). + * @param n The length of the given 16-bit string. + * @param freq [0..65535] The input 16-bit symbol frequency table (can be NULL). + * @param i The primary index. + * @return 0 if no error occurred, -1 or -2 otherwise. 
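+ *
+ * Example (editor's sketch, not part of the original header): a minimal BWT
+ * round trip built only from the declarations above. The buffers T, U and A
+ * are placeholders owned by the caller, n > 0 is assumed, and error handling
+ * is reduced to checking the returned primary index.
+ *
+ *   int32_t * A = (int32_t *)malloc(((size_t)n + 1) * sizeof(int32_t)); // n + 1 entries so A can also serve the inverse
+ *   int32_t idx = libsais16_bwt(T, U, A, n, 0, NULL);                   // forward BWT, returns the primary index
+ *   if (idx >= 0) { libsais16_unbwt(U, T, A, n, NULL, idx); }           // inverse BWT restores the original string
+ *   free(A);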
+ */ + int32_t libsais16_unbwt(const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t i); + + /** + * Constructs the original 16-bit string from a given burrows-wheeler transformed 16-bit string with primary index using libsais16 reverse BWT context. + * @param ctx The libsais16 reverse BWT context. + * @param T [0..n-1] The input 16-bit string. + * @param U [0..n-1] The output 16-bit string (can be T). + * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1 size). + * @param n The length of the given 16-bit string. + * @param freq [0..65535] The input 16-bit symbol frequency table (can be NULL). + * @param i The primary index. + * @return 0 if no error occurred, -1 or -2 otherwise. + */ + int32_t libsais16_unbwt_ctx(const void * ctx, const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t i); + + /** + * Constructs the original 16-bit string from a given burrows-wheeler transformed 16-bit string with auxiliary indexes. + * @param T [0..n-1] The input 16-bit string. + * @param U [0..n-1] The output 16-bit string (can be T). + * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1 size). + * @param n The length of the given 16-bit string. + * @param freq [0..65535] The input 16-bit symbol frequency table (can be NULL). + * @param r The sampling rate for auxiliary indexes (must be power of 2). + * @param I [0..(n-1)/r] The input auxiliary indexes. + * @return 0 if no error occurred, -1 or -2 otherwise. + */ + int32_t libsais16_unbwt_aux(const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t r, const int32_t * I); + + /** + * Constructs the original 16-bit string from a given burrows-wheeler transformed 16-bit string with auxiliary indexes using libsais16 reverse BWT context. + * @param ctx The libsais16 reverse BWT context. + * @param T [0..n-1] The input 16-bit string. + * @param U [0..n-1] The output 16-bit string (can be T). + * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1 size). + * @param n The length of the given 16-bit string. + * @param freq [0..65535] The input 16-bit symbol frequency table (can be NULL). + * @param r The sampling rate for auxiliary indexes (must be power of 2). + * @param I [0..(n-1)/r] The input auxiliary indexes. + * @return 0 if no error occurred, -1 or -2 otherwise. + */ + int32_t libsais16_unbwt_aux_ctx(const void * ctx, const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t r, const int32_t * I); + +#if defined(_OPENMP) + /** + * Constructs the original 16-bit string from a given burrows-wheeler transformed 16-bit string with primary index in parallel using OpenMP. + * @param T [0..n-1] The input 16-bit string. + * @param U [0..n-1] The output 16-bit string (can be T). + * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1 size). + * @param n The length of the given 16-bit string. + * @param freq [0..65535] The input 16-bit symbol frequency table (can be NULL). + * @param i The primary index. + * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). + * @return 0 if no error occurred, -1 or -2 otherwise. + */ + int32_t libsais16_unbwt_omp(const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t i, int32_t threads); + + /** + * Constructs the original 16-bit string from a given burrows-wheeler transformed 16-bit string with auxiliary indexes in parallel using OpenMP. 
+ * @param T [0..n-1] The input 16-bit string. + * @param U [0..n-1] The output 16-bit string (can be T). + * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1 size). + * @param n The length of the given 16-bit string. + * @param freq [0..65535] The input 16-bit symbol frequency table (can be NULL). + * @param r The sampling rate for auxiliary indexes (must be power of 2). + * @param I [0..(n-1)/r] The input auxiliary indexes. + * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). + * @return 0 if no error occurred, -1 or -2 otherwise. + */ + int32_t libsais16_unbwt_aux_omp(const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t r, const int32_t * I, int32_t threads); +#endif + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/libsais/libsais_internal.h b/libsais/libsais_internal.h new file mode 100644 index 0000000..d11a213 --- /dev/null +++ b/libsais/libsais_internal.h @@ -0,0 +1,49 @@ +/*-- + +This file is a part of libsais, a library for linear time +suffix array and burrows wheeler transform construction. + + Copyright (c) 2021 Ilya Grebnov + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + +Please see the file LICENSE for full copyright information. + +--*/ + +#ifndef LIBSAIS_INTERNAL_H +#define LIBSAIS_INTERNAL_H 1 + +#ifdef __cplusplus +extern "C" { +#endif + + #include + + /** + * Internal method to construct suffix array of an integer array. + * Note, during suffix array construction input array will be modified and restored at the end if no error occurred. + * @param T [0..n-1] The input integer array. + * @param SA [0..n-1+fs] The output array of suffixes. + * @param n The length of the integer array. + * @param k The alphabet size of the input integer array. + * @param fs Extra space available at the end of SA array (can be 0). + * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). + * @return 0 if no error occurred, -1 or -2 otherwise. + */ + int32_t libsais_main_32s_internal(int32_t * T, int32_t * SA, int32_t n, int32_t k, int32_t fs, int32_t threads); +#ifdef __cplusplus +} +#endif + +#endif diff --git a/m03_model.h b/m03_model.h new file mode 100644 index 0000000..0134b5b --- /dev/null +++ b/m03_model.h @@ -0,0 +1,408 @@ +/*-- + +This file is a part of bsc-m03 project. + + Copyright (c) 2021 Ilya Grebnov + + bsc-m03 is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + bsc-m03 is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with bsc-m03. If not, see . 
+ +--*/ + +#pragma once + +#include +#include +#include + +#include "common/platform.h" +#include "common/rangecoder.h" + +#include + +#pragma warning( push ) +#pragma warning( disable : 6385 ) +#pragma warning( disable : 6386 ) + +enum class m03_mode : int { encoding = 0, decoding = 1, }; + +class m03_model +{ +protected: + m03_mode mode; + + void initialize_model(RangeCoder * coder, m03_mode mode) + { + this->coder = coder; + this->mode = mode; + + for (int32_t s = 0; s < 1536; ++s) { T1_model[s][0] = T1_model[s][1] = 1; } + for (int32_t s = 0; s < 1536; ++s) { T2_model[s][0] = T2_model[s][1] = T2_model[s][2] = T2_model[s][3] = 1; } + for (int32_t s = 0; s < 768 ; ++s) { Ternary_model[s][0] = Ternary_model[s][1] = Ternary_model[s][2] = Ternary_model[s][3] = 1; } + for (int32_t s = 0; s < 96 ; ++s) { for (int32_t c = 0; c < 16; ++c) { Tree_model[s][c] = 1; } } + } + + void encode_root_frequencies(const int32_t * root_frequencies, int32_t k, int32_t n) + { + int64_t bit_freq[33]; + int64_t bit_freq_sum[33]; + + { + int64_t remaining_min = n, remaining_max = n, remaining_count = k; + + memset(bit_freq, 0, sizeof(bit_freq)); + + for (ptrdiff_t p = 0; p < k; ++p) + { + bit_freq[bit_scan_reverse(root_frequencies[p] + 1)]++; + } + + for (ptrdiff_t bit = 0; bit <= 32 && remaining_count > 0; ++bit) + { + int64_t min_value = (1ll << (bit + 0)) - 1; + int64_t max_value = (1ll << (bit + 1)) - 2; + int64_t min = std::max(remaining_count - (remaining_max / (max_value + 1)), 0ll); + int64_t max = remaining_count * max_value < remaining_min ? remaining_count - 1 : remaining_count; + + this->coder->EncodeValue((unsigned int)min, (unsigned int)bit_freq[bit], (unsigned int)max); + + remaining_min -= bit_freq[bit] * max_value; + remaining_max -= bit_freq[bit] * min_value; + remaining_count -= bit_freq[bit]; + } + } + + { + int64_t bit_sum = 0, remaining_min = 0, remaining_max = 0, remaining_total = n; + + for (ptrdiff_t bit = 32; bit >= 0; --bit) + { + int64_t min_value = (1ll << (bit + 0)) - 1; + int64_t max_value = (1ll << (bit + 1)) - 2; + + bit_freq_sum[bit] = bit_sum; bit_sum += bit_freq[bit]; + + remaining_min += min_value * bit_freq[bit]; + remaining_max += max_value * bit_freq[bit]; + } + + for (ptrdiff_t p = 0; p < k; ++p) + { + int32_t bit = bit_scan_reverse(root_frequencies[p] + 1); + + { + for (ptrdiff_t b = 0; b < bit; ++b) + { + if (bit_freq[b] > 0) + { + this->coder->Encode((unsigned int)bit_freq[b], (unsigned int)bit_freq_sum[b], (unsigned int)(bit_freq[b] + bit_freq_sum[b])); + } + + assert(bit_freq_sum[b] > 0); bit_freq_sum[b]--; + } + + if (bit_freq_sum[bit] > 0) + { + this->coder->Encode(0, (unsigned int)bit_freq[bit], (unsigned int)(bit_freq[bit] + bit_freq_sum[bit])); + } + + assert(bit_freq[bit] > 0); bit_freq[bit]--; + } + + { + int64_t min_value = (1ll << (bit + 0)) - 1; + int64_t max_value = (1ll << (bit + 1)) - 2; + remaining_min -= min_value; + remaining_max -= max_value; + int64_t min = std::max(min_value, remaining_total - remaining_max); + int64_t max = std::min(max_value, remaining_total - remaining_min); + + this->coder->EncodeValue((unsigned int)min, (unsigned int)root_frequencies[p], (unsigned int)max); + + remaining_total -= root_frequencies[p]; + } + } + } + } + + void decode_root_frequencies(int32_t * root_frequencies, int32_t k, int32_t n) + { + int64_t bit_freq[33]; + int64_t bit_freq_sum[33]; + + { + int64_t remaining_min = n, remaining_max = n, remaining_count = k; + + memset(bit_freq, 0, sizeof(bit_freq)); + + for (ptrdiff_t bit = 0; bit <= 32 && remaining_count > 
0; ++bit) + { + int64_t min_value = (1ll << (bit + 0)) - 1; + int64_t max_value = (1ll << (bit + 1)) - 2; + int64_t min = std::max(remaining_count - (remaining_max / (max_value + 1)), 0ll); + int64_t max = remaining_count * max_value < remaining_min ? remaining_count - 1 : remaining_count; + + bit_freq[bit] = this->coder->DecodeValue((unsigned int)min, (unsigned int)max); + + remaining_min -= bit_freq[bit] * max_value; + remaining_max -= bit_freq[bit] * min_value; + remaining_count -= bit_freq[bit]; + } + } + + { + int64_t bit_sum = 0, remaining_min = 0, remaining_max = 0, remaining_total = n; + + for (ptrdiff_t bit = 32; bit >= 0; --bit) + { + int64_t min_value = (1ll << (bit + 0)) - 1; + int64_t max_value = (1ll << (bit + 1)) - 2; + + bit_freq_sum[bit] = bit_sum; bit_sum += bit_freq[bit]; + + remaining_min += min_value * bit_freq[bit]; + remaining_max += max_value * bit_freq[bit]; + } + + for (ptrdiff_t p = 0; p < k; ++p) + { + int32_t bit = 0; + + while (bit_freq_sum[bit] > 0) + { + if (bit_freq[bit] > 0) + { + unsigned int cum_freq = this->coder->GetCumFreq((unsigned int)(bit_freq[bit] + bit_freq_sum[bit])); + if (cum_freq < bit_freq[bit]) + { + this->coder->Decode(0, (unsigned int)bit_freq[bit], (unsigned int)(bit_freq[bit] + bit_freq_sum[bit])); + break; + } + else + { + this->coder->Decode((unsigned int)bit_freq[bit], (unsigned int)bit_freq_sum[bit], (unsigned int)(bit_freq[bit] + bit_freq_sum[bit]));; + } + } + + bit_freq_sum[bit]--; bit++; + } + + assert(bit_freq[bit] > 0); bit_freq[bit]--; + + { + int64_t min_value = (1ll << (bit + 0)) - 1; + int64_t max_value = (1ll << (bit + 1)) - 2; + remaining_min -= min_value; + remaining_max -= max_value; + int64_t min = std::max(min_value, remaining_total - remaining_max); + int64_t max = std::min(max_value, remaining_total - remaining_min); + + root_frequencies[p] = this->coder->DecodeValue((unsigned int)min, (unsigned int)max); + + remaining_total -= root_frequencies[p]; + } + } + } + } + + int32_t predict(int32_t count, int32_t total, int32_t left_remaining, int32_t right_remaining, int32_t symbols_remaining) + { + int32_t inferred_right = std::max(total - left_remaining, 0); + right_remaining -= inferred_right; total -= inferred_right; + + assert(total <= right_remaining); + + if (total > 0) + { + if (total <= 2) + { + int32_t state = 0; + state += 1 * (std::min((int32_t)symbols_remaining - 2, 5)); + state += 8 * (std::min((int32_t)bit_scan_reverse(inferred_right + 1), 3)); + state += 32 * (left_remaining + right_remaining == symbols_remaining); + state += 64 * (left_remaining == total); + state += 128 * (((int64_t)left_remaining * 11) / ((int64_t)right_remaining)); + + if (total == 1) + { + static const int threshold[12] = { 147, 251, 374, 540, 761, 763, 1589, 2275, 2193, 3457, 3811, 1017 }; + + uint16_t * RESTRICT predictor = &this->T1_model[state][0]; + + if (predictor[0] + predictor[1] > threshold[state >> 7]) + { + predictor[0] = (predictor[0] + (predictor[0] < 2)) >> 1; + predictor[1] = (predictor[1] + (predictor[1] < 2)) >> 1; + } + + if (this->mode == m03_mode::encoding) + { + this->coder->Encode(count ? predictor[0] : 0, predictor[count], predictor[0] + predictor[1]); + } + else + { + unsigned int cum_freq = this->coder->GetCumFreq(predictor[0] + predictor[1]); + + count = cum_freq >= predictor[0]; + this->coder->Decode(count ? 
predictor[0] : 0, predictor[count], predictor[0] + predictor[1]); + } + + predictor[count]++; + } + else + { + static const int threshold[12] = { 149, 221, 255, 287, 292, 343, 494, 396, 655, 820, 2984, 225 }; + + uint16_t * RESTRICT predictor = &this->T2_model[state][0]; + + if (predictor[0] + predictor[1] + predictor[2] > threshold[state >> 7]) + { + predictor[0] = (predictor[0] + (predictor[0] < 2)) >> 1; + predictor[1] = (predictor[1] + (predictor[1] < 2)) >> 1; + predictor[2] = (predictor[2] + (predictor[2] < 2)) >> 1; + } + + if (this->mode == m03_mode::encoding) + { + unsigned int cum_freq = count == 0 ? 0 : count == 1 ? predictor[0] : predictor[0] + predictor[1]; + this->coder->Encode(cum_freq, predictor[count], predictor[0] + predictor[1] + predictor[2]); + } + else + { + unsigned int cum_freq = this->coder->GetCumFreq(predictor[0] + predictor[1] + predictor[2]); + + count = (cum_freq >= predictor[0]) + (cum_freq >= (unsigned int)(predictor[0] + predictor[1])); + cum_freq = count == 0 ? 0 : count == 1 ? predictor[0] : predictor[0] + predictor[1]; + + this->coder->Decode(cum_freq, predictor[count], predictor[0] + predictor[1] + predictor[2]); + } + + predictor[count]++; + } + } + else + { + int32_t pivot = (count > 0) + (count == total); + + { + static const int threshold[48] = + { + 142, 129, 115, 89 , 70 , 59 , 53 , 44, + 243, 167, 132, 105, 98 , 109, 107, 134, + 247, 200, 162, 134, 137, 149, 201, 262, + 339, 253, 184, 171, 235, 288, 299, 348, + 512, 396, 178, 357, 466, 484, 697, 587, + 220, 157, 144, 167, 219, 141, 228, 1076, + }; + + int32_t state = 0; + state += 1 * (std::min((int32_t)bit_scan_reverse(symbols_remaining - 1), 3)); + state += 4 * (inferred_right > 0); + state += 8 * (left_remaining == total); + state += 16 * (std::min((int32_t)bit_scan_reverse(total - 2), 7)); + state += 128 * (((int64_t)left_remaining * 9 + right_remaining) / ((int64_t)right_remaining * 2)); + + uint16_t * RESTRICT predictor = &this->Ternary_model[state][0]; + + if (predictor[0] + predictor[1] + predictor[2] > threshold[state >> 4]) + { + predictor[0] = (predictor[0] + (predictor[0] < 2)) >> 1; + predictor[1] = (predictor[1] + (predictor[1] < 2)) >> 1; + predictor[2] = (predictor[2] + (predictor[2] < 2)) >> 1; + } + + if (this->mode == m03_mode::encoding) + { + unsigned int cum_freq = pivot == 0 ? 0 : pivot == 1 ? predictor[0] : predictor[0] + predictor[1]; + this->coder->Encode(cum_freq, predictor[pivot], predictor[0] + predictor[1] + predictor[2]); + } + else + { + unsigned int cum_freq = this->coder->GetCumFreq(predictor[0] + predictor[1] + predictor[2]); + + pivot = (cum_freq >= predictor[0]) + (cum_freq >= (unsigned int)(predictor[0] + predictor[1])); + cum_freq = pivot == 0 ? 0 : pivot == 1 ? predictor[0] : predictor[0] + predictor[1]; + + this->coder->Decode(cum_freq, predictor[pivot], predictor[0] + predictor[1] + predictor[2]); + } + + predictor[pivot]++; if (pivot != 1) { count = pivot == 0 ? 
0 : total; } + } + + if (pivot == 1) + { + static const int threshold[48] = + { + 275 , 167 , 218 , 163, 200, 123, 143, 61, + 515 , 335 , 344 , 268, 320, 244, 235, 85, + 863 , 474 , 527 , 387, 401, 298, 263, 107, + 1920, 968 , 629 , 500, 554, 286, 358, 121, + 3655, 1157, 1021, 623, 591, 365, 317, 109, + 2922, 249 , 776 , 159, 537, 133, 253, 158, + }; + + int32_t state = 0; + state += 1 * (inferred_right >= total); + state += 2 * (std::min(total - 3, 7)); + state += 16 * (((int64_t)left_remaining * 5) / ((int64_t)right_remaining)); + + int32_t min = 1, max = total - 1, context = 1; + while (min != max && context < 8) + { + uint16_t * RESTRICT predictor = &this->Tree_model[state][2 * context]; + + if (predictor[0] + predictor[1] > threshold[state >> 1]) + { + predictor[0] = (predictor[0] + (predictor[0] < 2)) >> 1; + predictor[1] = (predictor[1] + (predictor[1] < 2)) >> 1; + } + + int32_t median = min + ((max - min + 1) >> 1), bit = count >= median; + + if (this->mode == m03_mode::encoding) + { + this->coder->Encode(bit ? predictor[0] : 0, predictor[bit], predictor[0] + predictor[1]); + } + else + { + unsigned int cum_freq = this->coder->GetCumFreq(predictor[0] + predictor[1]); + + bit = cum_freq >= predictor[0]; + this->coder->Decode(bit ? predictor[0] : 0, predictor[bit], predictor[0] + predictor[1]); + } + + predictor[bit]++; context += context + bit; min = bit ? median : min; max = bit ? max : median - 1; + } + + count = this->mode == m03_mode::encoding + ? this->coder->EncodeValue(min, count, max) + : this->coder->DecodeValue(min, max); + } + } + + return count; + } + + return 0; + } + +private: + RangeCoder * coder; + + uint16_t T1_model[1536][2]; + uint16_t T2_model[1536][4]; + uint16_t Ternary_model[768][4]; + uint16_t Tree_model[96][16]; +}; + +#pragma warning( pop ) \ No newline at end of file diff --git a/m03_parser.h b/m03_parser.h new file mode 100644 index 0000000..e296a13 --- /dev/null +++ b/m03_parser.h @@ -0,0 +1,709 @@ +/*-- + +This file is a part of bsc-m03 project. + + Copyright (c) 2021 Ilya Grebnov + + bsc-m03 is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + bsc-m03 is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with bsc-m03. If not, see . 
+ +--*/ + +#pragma once + +#include +#include +#include + +#include + +#include "common/platform.h" +#include "common/rangecoder.h" + +#include "hutucker/hu-tucker.h" + +#include "m03_model.h" + +#define OPTIMAL_ABT_SMALL_THRESHOLD (7) +#define OPTIMAL_ABT_LARGE_THRESHOLD (257) + +#pragma warning( push ) +#pragma warning( disable : 6385 ) +#pragma warning( disable : 6386 ) + +#pragma pack(push, 1) + +typedef struct symbol_context +{ + int32_t count; + int32_t offset; + uint16_t symbol; +} symbol_context; + +#pragma pack(pop) + +typedef struct offset_queue +{ + int32_t * offsets; + ptrdiff_t count; + ptrdiff_t size; + + bool initialize(ptrdiff_t size) + { + this->count = 0; + this->size = size; + this->offsets = (int32_t *)malloc(this->size * sizeof(int32_t)); + + return this->offsets != NULL; + } + + INLINE void push_offset(const int32_t offset) + { + if (this->count == this->size) + { + this->offsets = this->resize(); + } + + this->offsets[this->count++] = offset; + } + + INLINE void reset() { this->count = 0; } + + INLINE void sort() { std::stable_sort(this->offsets, this->offsets + this->count); } + + NOINLINE int32_t * resize() + { + return (int32_t *)realloc(this->offsets, (this->size += this->size) * sizeof(int32_t)); + } + + void destroy() + { + if (this->offsets != NULL) { free(this->offsets); this->offsets = NULL; } + } + +} offset_queue; + +class m03_parser: m03_model +{ +public: + + bool initialize(uint16_t * L, int32_t n, int32_t primary_index, int32_t * root_frequencies, int32_t k, RangeCoder * coder, m03_mode mode) + { + memset(this, 0, sizeof(m03_parser)); + + this->L = L; + this->n = n; + this->primary_index = primary_index; + this->root_frequencies = root_frequencies; + this->k = k; + + if ((this->contexts = (symbol_context *)malloc(n * sizeof(symbol_context))) == NULL) + { + this->destroy(); + return false; + } + + if ((this->hutucker_tmp = malloc(hutucker_tmp_size(MAX_ALPHABET_SIZE + 1))) == NULL) + { + this->destroy(); + return false; + } + + if (!current_segments.initialize(next_power_of_2(std::max(n / 4, 64)))) + { + this->destroy(); + return false; + } + + if (!next_segments.initialize(next_power_of_2(std::max(n / 4, 64)))) + { + this->destroy(); + return false; + } + + this->initialize_model(coder, mode); + this->initialize_alphabetic_tree_roots(); + + return true; + } + + void run() + { + if (this->mode == m03_mode::encoding) + { + this->encode_root_frequencies(this->root_frequencies, this->k, this->n - 1); + this->initialize_root_context(this->root_frequencies); + this->parse_contexts(); + + for (ptrdiff_t p = 0; p < n; ++p) + { + assert(p == this->primary_index || this->contexts[p].count == 1 ); + assert(p == this->primary_index || this->contexts[p].symbol == L[p]); + } + } + else + { + this->decode_root_frequencies(this->root_frequencies, this->k, this->n - 1); + this->initialize_root_context(this->root_frequencies); + this->parse_contexts(); + + for (ptrdiff_t p = 0; p < n; ++p) + { + L[p] = this->contexts[p].symbol; + } + } + } + + void destroy() + { + if (this->contexts != NULL) { free(this->contexts); this->contexts = NULL; } + if (this->hutucker_tmp != NULL) { free(this->hutucker_tmp); this->hutucker_tmp = NULL; } + + this->current_segments.destroy(); + this->next_segments.destroy(); + } + +private: + + uint16_t * L; + int32_t n; + int32_t primary_index; + int32_t * root_frequencies; + int32_t k; + + symbol_context * contexts; + offset_queue current_segments; + offset_queue next_segments; + void * hutucker_tmp; + + int32_t parent_frequencies 
[MAX_ALPHABET_SIZE + 1]; + int32_t left_frequencies [MAX_ALPHABET_SIZE + 1]; + symbol_context left_contexts [MAX_ALPHABET_SIZE + 1]; + + int32_t alphabetic_tree_keys[OPTIMAL_ABT_LARGE_THRESHOLD]; + int32_t alphabetic_tree_weight[OPTIMAL_ABT_LARGE_THRESHOLD]; + int64_t alphabetic_tree_cost[OPTIMAL_ABT_LARGE_THRESHOLD][OPTIMAL_ABT_LARGE_THRESHOLD]; + uint8_t alphabetic_tree_root[OPTIMAL_ABT_LARGE_THRESHOLD][OPTIMAL_ABT_LARGE_THRESHOLD]; + + void initialize_alphabetic_tree_roots() + { + for (int32_t l = 0; l < OPTIMAL_ABT_LARGE_THRESHOLD - 1; ++l) + { + this->alphabetic_tree_root[l][l + 1] = this->alphabetic_tree_root[l][l] = l; + } + } + + void initialize_root_context(const int32_t * root_frequencies) + { + int32_t unique_symbols = 0, total_symbols = 1; + + this->current_segments.push_offset(0); + + for (int32_t c = 0; c < this->k; ++c) + { + if (root_frequencies[c] > 0) + { + this->contexts[unique_symbols].count = root_frequencies[c]; + this->contexts[unique_symbols].offset = total_symbols; + this->contexts[unique_symbols].symbol = c; + + this->current_segments.push_offset(total_symbols); + + unique_symbols++; total_symbols += root_frequencies[c]; + } + } + + m03_parser::normalize_context(&this->contexts[0], unique_symbols, total_symbols); + } + + void parse_contexts() + { + while (this->current_segments.count > 0) + { + for (int32_t segment_start = 0; segment_start < this->current_segments.count;) + { + int32_t context_start = this->current_segments.offsets[segment_start]; + int32_t context_end = context_start + this->contexts[context_start].count; + int32_t segment_end = segment_start + 1; + + while (segment_end < this->current_segments.count && this->current_segments.offsets[segment_end] < context_end) + { + segment_end++; + } + + assert(context_end - context_start > 1); + assert(segment_end - segment_start > 1); + + if (this->is_trivial_context(context_start)) + { + m03_parser::split_trivial_context(this->contexts, this->next_segments, &this->current_segments.offsets[segment_start], &this->current_segments.offsets[segment_end]); + } + else + { + m03_parser::populate_context_frequencies(&this->contexts[context_start], &this->contexts[this->primary_index], &this->parent_frequencies[0]); + this->split_context_recursive(&this->current_segments.offsets[segment_start], &this->current_segments.offsets[segment_end]); + } + + segment_start = segment_end; + } + + this->next_segments.sort(); + this->current_segments.reset(); + + std::swap(this->current_segments, this->next_segments); + } + } + + void split_context_recursive(const int32_t * offsets, const int32_t * offsets_end) + { + assert(offsets_end - offsets > 0); + + if (offsets_end - offsets == 1) + { + m03_parser::populate_next_segments(&this->contexts[offsets[0]], &this->contexts[this->primary_index], &this->parent_frequencies[0], this->next_segments); + return; + } + + if (this->is_trivial_context(offsets[0])) + { + m03_parser::split_trivial_context(this->contexts, this->next_segments, offsets, offsets_end); + return; + } + + if (offsets_end - offsets >= OPTIMAL_ABT_SMALL_THRESHOLD && offsets_end - offsets <= OPTIMAL_ABT_LARGE_THRESHOLD) + { + this->build_optimal_alphabetic_tree(offsets, offsets_end); + this->traverse_alphabetic_tree(offsets, offsets_end, 0, (int32_t)(offsets_end - offsets) - 1); + return; + } + + const int32_t * offsets_pivot = (offsets_end - offsets) > 2 + ? 
this->choose_context_pivot_using_heuristic(offsets, offsets_end) + : &offsets[1]; + + this->split_context_by_pivot(offsets[0], offsets_pivot[0]); + this->split_context_recursive(offsets, offsets_pivot); + this->split_context_recursive(offsets_pivot, offsets_end); + } + + void traverse_alphabetic_tree(const int32_t * offsets, const int32_t * offsets_end, int32_t l, int32_t r) + { + assert(l <= r); + + if (l == r) + { + m03_parser::populate_next_segments(&this->contexts[offsets[l]], &this->contexts[this->primary_index], &this->parent_frequencies[0], this->next_segments); + return; + } + + if (this->is_trivial_context(offsets[l])) + { + m03_parser::split_trivial_context(this->contexts, this->next_segments, &offsets[l], &offsets[r + 1]); + return; + } + + int32_t offsets_pivot = this->alphabetic_tree_root[l][r]; + + this->split_context_by_pivot(offsets[l], offsets[offsets_pivot + 1]); + this->traverse_alphabetic_tree(offsets, offsets_end, l, offsets_pivot); + this->traverse_alphabetic_tree(offsets, offsets_end, offsets_pivot + 1, r); + } + + const int32_t * choose_context_pivot_using_heuristic(const int32_t * offsets, const int32_t * offsets_end) + { + assert(offsets_end - offsets > 2); + + int32_t context_begin = offsets[0]; + int32_t context_end = offsets[0] + this->contexts[offsets[0]].count; + size_t offsets_count = offsets_end - offsets; + + if (offsets_count == 3) + { + int64_t A = (int64_t)(offsets[1] ) - (int64_t)(context_begin); + int64_t C = (int64_t)(context_end) - (int64_t)(offsets[2]); + + return C <= A ? &offsets[1] : &offsets[2]; + } + else if (offsets_count == 4) + { + int64_t A = (int64_t)(offsets[1] ) - (int64_t)(context_begin); + int64_t B = (int64_t)(offsets[2] ) - (int64_t)(offsets[1]); + int64_t C = (int64_t)(offsets[3] ) - (int64_t)(offsets[2]); + int64_t D = (int64_t)(context_end) - (int64_t)(offsets[3]); + + const int32_t * offset1 = &offsets[1]; int64_t cost1 = pivot_cost3(B, C, D); + const int32_t * offset2 = &offsets[2]; int64_t cost2 = A + B + C + D; + const int32_t * offset3 = &offsets[3]; int64_t cost3 = pivot_cost3(A, B, C); + + if (cost2 <= cost1) { offset1 = offset2; cost1 = cost2; } + if (cost3 < cost1) { offset1 = offset3; } + + return offset1; + } + else if (offsets_count == 5) + { + int64_t A = (int64_t)(offsets[1] ) - (int64_t)(context_begin); + int64_t B = (int64_t)(offsets[2] ) - (int64_t)(offsets[1]); + int64_t C = (int64_t)(offsets[3] ) - (int64_t)(offsets[2]); + int64_t D = (int64_t)(offsets[4] ) - (int64_t)(offsets[3]); + int64_t E = (int64_t)(context_end) - (int64_t)(offsets[4]); + + const int32_t * offset1 = &offsets[1]; int64_t cost1 = pivot_cost4(B, C, D, E); + const int32_t * offset2 = &offsets[2]; int64_t cost2 = A + B + pivot_cost3(C, D, E); + const int32_t * offset3 = &offsets[3]; int64_t cost3 = pivot_cost3(A, B, C) + D + E; + const int32_t * offset4 = &offsets[4]; int64_t cost4 = pivot_cost4(A, B, C, D); + + if (cost2 <= cost1) { offset1 = offset2; cost1 = cost2; } + if (cost3 < cost1) { offset1 = offset3; cost1 = cost3; } + if (cost4 < cost1) { offset1 = offset4; } + + return offset1; + } + else if (offsets_count == 6) + { + int64_t A = (int64_t)(offsets[1] ) - (int64_t)(context_begin); + int64_t B = (int64_t)(offsets[2] ) - (int64_t)(offsets[1]); + int64_t C = (int64_t)(offsets[3] ) - (int64_t)(offsets[2]); + int64_t D = (int64_t)(offsets[4] ) - (int64_t)(offsets[3]); + int64_t E = (int64_t)(offsets[5] ) - (int64_t)(offsets[4]); + int64_t F = (int64_t)(context_end) - (int64_t)(offsets[5]); + + const int32_t * offset1 = &offsets[1]; 
int64_t cost1 = pivot_cost5(B, C, D, E, F); + const int32_t * offset2 = &offsets[2]; int64_t cost2 = A + B + pivot_cost4(C, D, E, F); + const int32_t * offset3 = &offsets[3]; int64_t cost3 = pivot_cost3(A, B, C) + pivot_cost3(D, E, F); + const int32_t * offset4 = &offsets[4]; int64_t cost4 = pivot_cost4(A, B, C, D) + E + F; + const int32_t * offset5 = &offsets[5]; int64_t cost5 = pivot_cost5(A, B, C, D, E); + + if (cost2 <= cost1) { offset1 = offset2; cost1 = cost2; } + if (cost3 <= cost1) { offset1 = offset3; cost1 = cost3; } + if (cost4 < cost1) { offset1 = offset4; cost1 = cost4; } + if (cost5 < cost1) { offset1 = offset5; } + + return offset1; + } + else + { + assert(offsets_count > OPTIMAL_ABT_LARGE_THRESHOLD); + + { + for (int32_t segment_end = context_end, offsets_index = (int32_t)offsets_count - 1; offsets_index >= 0; --offsets_index) + { + int32_t segment_start = offsets[offsets_index]; + + this->left_frequencies[offsets_index] = segment_end - segment_start; segment_end = segment_start; + } + + hutucker_get_lengths(offsets_count, (unsigned long *)this->left_frequencies, this->hutucker_tmp); + } + + { + uint8_t path[64] = { 0 }; + for (int32_t offsets_index = 0, length = 0; offsets_index < offsets_count; ++offsets_index) + { + for (; length < this->left_frequencies[offsets_index]; ++length) { path[length] = 0; } + + length = this->left_frequencies[offsets_index]; if (path[0] == 1) { return &offsets[offsets_index]; } + + for (int32_t k = length - 1; k >= 0; --k) { if (path[k] ^= 1) { break; } } + } + } + + return NULL; + } + } + + void build_optimal_alphabetic_tree(const int32_t * offsets, const int32_t * offsets_end) + { + ptrdiff_t offsets_count = (ptrdiff_t)(offsets_end - offsets); + + assert(offsets_count >= OPTIMAL_ABT_SMALL_THRESHOLD && offsets_count <= OPTIMAL_ABT_LARGE_THRESHOLD); + + this->alphabetic_tree_keys[offsets_count - 1] = offsets[0] + this->contexts[offsets[0]].count - offsets[offsets_count - 1]; + + for (ptrdiff_t offsets_index = offsets_count - 2; offsets_index >= 0; --offsets_index) + { + this->alphabetic_tree_keys[offsets_index] = offsets[offsets_index + 1] - offsets[offsets_index]; + this->alphabetic_tree_cost[offsets_index][offsets_index + 1] = this->alphabetic_tree_weight[offsets_index] = this->alphabetic_tree_keys[offsets_index] + this->alphabetic_tree_keys[offsets_index + 1]; + } + + for (ptrdiff_t length = 3; length <= offsets_count; ++length) + { + for (ptrdiff_t l = 0, r = length - 1; r < offsets_count; ++l, ++r) + { + uint8_t best_root = this->alphabetic_tree_root[l][r - 1]; + int64_t best_cost = this->alphabetic_tree_cost[l][best_root] + this->alphabetic_tree_cost[best_root + 1][r]; + + for (ptrdiff_t root = (ptrdiff_t)best_root + 1; root <= (ptrdiff_t)this->alphabetic_tree_root[l + 1][r]; ++root) + { + int64_t cost = this->alphabetic_tree_cost[l][root] + this->alphabetic_tree_cost[root + 1][r]; + if (cost < best_cost) { best_cost = cost; best_root = (uint8_t)root; } + } + + this->alphabetic_tree_weight[l] += this->alphabetic_tree_keys[r]; + this->alphabetic_tree_cost[l][r] = best_cost + this->alphabetic_tree_weight[l]; + this->alphabetic_tree_root[l][r] = best_root; + } + } + } + + void split_context_by_pivot(int32_t parent_context_offset, int32_t right_context_offset) + { + symbol_context * parent_context = &this->contexts[parent_context_offset]; + int32_t parent_interval_size = parent_context[0].count; + int32_t parent_unique_symbols = 1; + + symbol_context * left_context = &this->left_contexts[0]; + int32_t * left_frequencies = 
&this->left_frequencies[0]; + int32_t left_interval_size = right_context_offset - parent_context_offset; + int32_t left_unique_symbols = 0; + + int32_t right_interval_size = parent_interval_size - left_interval_size; + int32_t right_unique_symbols = 0; + + if (this->mode == m03_mode::encoding) + { + if (left_interval_size <= parent_interval_size - left_interval_size) + { + int32_t parent_total_symbols = parent_interval_size; + + parent_total_symbols -= ((uint32_t)(this->primary_index - parent_context_offset) < (uint32_t)parent_total_symbols); + + while (parent_total_symbols > 1 && parent_context[parent_unique_symbols].count > 0) + { + parent_total_symbols -= parent_context[parent_unique_symbols].count; + left_frequencies[parent_context[parent_unique_symbols].symbol] = 0; + parent_unique_symbols++; + } + + assert(parent_total_symbols > 0); parent_context[0].count = parent_total_symbols; + left_frequencies[parent_context[0].symbol] = 0; + + for (int32_t p = parent_context_offset; p < right_context_offset; ++p) { left_frequencies[L[p]]++; } + + left_frequencies[0] -= ((uint32_t)(this->primary_index - parent_context_offset) < (uint32_t)left_interval_size); + } + else + { + int32_t parent_total_symbols = parent_interval_size; + + parent_total_symbols -= ((uint32_t)(this->primary_index - parent_context_offset) < (uint32_t)parent_total_symbols); + + while (parent_total_symbols > 1 && parent_context[parent_unique_symbols].count > 0) + { + parent_total_symbols -= parent_context[parent_unique_symbols].count; + left_frequencies[parent_context[parent_unique_symbols].symbol] = parent_context[parent_unique_symbols].count; + parent_unique_symbols++; + } + + assert(parent_total_symbols > 0); parent_context[0].count = parent_total_symbols; + left_frequencies[parent_context[0].symbol] = parent_total_symbols; + + for (int32_t p = right_context_offset; p < parent_context_offset + parent_interval_size; ++p) { left_frequencies[L[p]]--; } + + left_frequencies[0] += ((uint32_t)(this->primary_index - right_context_offset) < (uint32_t)right_interval_size); + } + } + else + { + int32_t parent_total_symbols = parent_interval_size; + + parent_total_symbols -= ((uint32_t)(this->primary_index - parent_context_offset) < (uint32_t)parent_total_symbols); + + while (parent_total_symbols > 1 && parent_context[parent_unique_symbols].count > 0) + { + parent_total_symbols -= parent_context[parent_unique_symbols].count; + parent_unique_symbols++; + } + + assert(parent_total_symbols > 0); parent_context[0].count = parent_total_symbols; + } + + int32_t left_remaining = left_interval_size; + int32_t right_remaining = right_interval_size; + + left_remaining -= ((uint32_t)(this->primary_index - parent_context_offset) < (uint32_t)left_interval_size ); + right_remaining -= ((uint32_t)(this->primary_index - right_context_offset ) < (uint32_t)right_interval_size); + + for (int32_t parent_symbol_index = 0; parent_symbol_index < parent_unique_symbols; ++parent_symbol_index) + { + if (left_remaining > 0) + { + uint16_t symbol = parent_context[parent_symbol_index].symbol; + int32_t total = parent_context[parent_symbol_index].count; + int32_t count = left_frequencies[symbol]; + + if (total <= left_remaining + right_remaining - total) + { + count = left_remaining <= right_remaining + ? 
+    void split_context_by_pivot(int32_t parent_context_offset, int32_t right_context_offset)
+    {
+        symbol_context * parent_context = &this->contexts[parent_context_offset];
+        int32_t parent_interval_size = parent_context[0].count;
+        int32_t parent_unique_symbols = 1;
+
+        symbol_context * left_context = &this->left_contexts[0];
+        int32_t * left_frequencies = &this->left_frequencies[0];
+        int32_t left_interval_size = right_context_offset - parent_context_offset;
+        int32_t left_unique_symbols = 0;
+
+        int32_t right_interval_size = parent_interval_size - left_interval_size;
+        int32_t right_unique_symbols = 0;
+
+        if (this->mode == m03_mode::encoding)
+        {
+            if (left_interval_size <= parent_interval_size - left_interval_size)
+            {
+                int32_t parent_total_symbols = parent_interval_size;
+
+                parent_total_symbols -= ((uint32_t)(this->primary_index - parent_context_offset) < (uint32_t)parent_total_symbols);
+
+                while (parent_total_symbols > 1 && parent_context[parent_unique_symbols].count > 0)
+                {
+                    parent_total_symbols -= parent_context[parent_unique_symbols].count;
+                    left_frequencies[parent_context[parent_unique_symbols].symbol] = 0;
+                    parent_unique_symbols++;
+                }
+
+                assert(parent_total_symbols > 0); parent_context[0].count = parent_total_symbols;
+                left_frequencies[parent_context[0].symbol] = 0;
+
+                for (int32_t p = parent_context_offset; p < right_context_offset; ++p) { left_frequencies[L[p]]++; }
+
+                left_frequencies[0] -= ((uint32_t)(this->primary_index - parent_context_offset) < (uint32_t)left_interval_size);
+            }
+            else
+            {
+                int32_t parent_total_symbols = parent_interval_size;
+
+                parent_total_symbols -= ((uint32_t)(this->primary_index - parent_context_offset) < (uint32_t)parent_total_symbols);
+
+                while (parent_total_symbols > 1 && parent_context[parent_unique_symbols].count > 0)
+                {
+                    parent_total_symbols -= parent_context[parent_unique_symbols].count;
+                    left_frequencies[parent_context[parent_unique_symbols].symbol] = parent_context[parent_unique_symbols].count;
+                    parent_unique_symbols++;
+                }
+
+                assert(parent_total_symbols > 0); parent_context[0].count = parent_total_symbols;
+                left_frequencies[parent_context[0].symbol] = parent_total_symbols;
+
+                for (int32_t p = right_context_offset; p < parent_context_offset + parent_interval_size; ++p) { left_frequencies[L[p]]--; }
+
+                left_frequencies[0] += ((uint32_t)(this->primary_index - right_context_offset) < (uint32_t)right_interval_size);
+            }
+        }
+        else
+        {
+            int32_t parent_total_symbols = parent_interval_size;
+
+            parent_total_symbols -= ((uint32_t)(this->primary_index - parent_context_offset) < (uint32_t)parent_total_symbols);
+
+            while (parent_total_symbols > 1 && parent_context[parent_unique_symbols].count > 0)
+            {
+                parent_total_symbols -= parent_context[parent_unique_symbols].count;
+                parent_unique_symbols++;
+            }
+
+            assert(parent_total_symbols > 0); parent_context[0].count = parent_total_symbols;
+        }
+
+        int32_t left_remaining = left_interval_size;
+        int32_t right_remaining = right_interval_size;
+
+        left_remaining -= ((uint32_t)(this->primary_index - parent_context_offset) < (uint32_t)left_interval_size);
+        right_remaining -= ((uint32_t)(this->primary_index - right_context_offset) < (uint32_t)right_interval_size);
+
+        for (int32_t parent_symbol_index = 0; parent_symbol_index < parent_unique_symbols; ++parent_symbol_index)
+        {
+            if (left_remaining > 0)
+            {
+                uint16_t symbol = parent_context[parent_symbol_index].symbol;
+                int32_t total = parent_context[parent_symbol_index].count;
+                int32_t count = left_frequencies[symbol];
+
+                if (total <= left_remaining + right_remaining - total)
+                {
+                    count = left_remaining <= right_remaining
+                        ? this->predict(count, total, left_remaining, right_remaining, parent_unique_symbols - parent_symbol_index)
+                        : total - this->predict(total - count, total, right_remaining, left_remaining, parent_unique_symbols - parent_symbol_index);
+                }
+                else
+                {
+                    total = left_remaining + right_remaining - total;
+                    count = left_remaining - count;
+
+                    count = left_remaining <= right_remaining
+                        ? this->predict(count, total, left_remaining, right_remaining, parent_unique_symbols - parent_symbol_index)
+                        : total - this->predict(total - count, total, right_remaining, left_remaining, parent_unique_symbols - parent_symbol_index);
+
+                    count = left_remaining - count;
+                    total = left_remaining + right_remaining - total;
+                }
+
+                left_remaining = left_remaining - count;
+                right_remaining = right_remaining + count - total;
+
+                if (count > 0)
+                {
+                    left_context[left_unique_symbols].count = count;
+                    left_context[left_unique_symbols].offset = parent_context[parent_symbol_index].offset;
+                    left_context[left_unique_symbols].symbol = symbol;
+
+                    parent_context[parent_symbol_index].count -= count;
+                    parent_context[parent_symbol_index].offset += count;
+
+                    left_unique_symbols++;
+                }
+            }
+
+            if (parent_context[parent_symbol_index].count > 0)
+            {
+                parent_context[right_unique_symbols] = parent_context[parent_symbol_index];
+                right_unique_symbols++;
+            }
+        }
+
+        {
+            memmove(&this->contexts[right_context_offset], &parent_context[0], right_unique_symbols * sizeof(symbol_context));
+            m03_parser::normalize_context(&this->contexts[right_context_offset], right_unique_symbols, right_interval_size);
+
+            memcpy(&parent_context[0], &left_context[0], left_unique_symbols * sizeof(symbol_context));
+            m03_parser::normalize_context(&parent_context[0], left_unique_symbols, left_interval_size);
+        }
+    }
+
+    INLINE bool is_trivial_context(int32_t context_start)
+    {
+        return this->contexts[context_start + 1].count == 0 && ((uint32_t)(this->primary_index - context_start) >= (uint32_t)this->contexts[context_start].count);
+    }
+
+    static void split_trivial_context(symbol_context * contexts, offset_queue & queue, const int32_t * offsets, const int32_t * offsets_end)
+    {
+        int32_t context_start = *offsets++;
+        symbol_context parent_context = contexts[context_start];
+
+        for (; offsets < offsets_end;)
+        {
+            symbol_context * context = &contexts[context_start];
+            int32_t context_end = *offsets++;
+            int32_t context_size = context_end - context_start;
+
+            queue.push_offset(parent_context.offset);
+
+            context[0].count = context_size; parent_context.count -= context_size;
+            context[0].offset = parent_context.offset; parent_context.offset += context_size;
+            context[0].symbol = parent_context.symbol; if (context_size > 1) { context[1].count = 0; }
+
+            context_start = context_end;
+        }
+
+        queue.push_offset(parent_context.offset);
+
+        contexts[context_start] = parent_context; if (contexts[context_start].count > 1) { contexts[context_start + 1].count = 0; }
+    }
+
+    static void populate_context_frequencies(symbol_context * context, symbol_context * primary_index_context, int32_t * frequencies)
+    {
+        int32_t total_symbols = context[0].count;
+        int32_t unique_symbols = 1;
+
+        total_symbols -= ((uint32_t)(primary_index_context - context) < (uint32_t)total_symbols);
+
+        while (total_symbols > 1 && context[unique_symbols].count > 0)
+        {
+            frequencies[context[unique_symbols].symbol] = context[unique_symbols].count;
+            total_symbols -= context[unique_symbols].count; unique_symbols++;
+        }
+
+        assert(total_symbols > 0); frequencies[context[0].symbol] = total_symbols;
+    }
+
+    static void populate_next_segments(symbol_context * context, symbol_context * primary_index_context, int32_t * frequencies, offset_queue & queue)
+    {
+        int32_t total_symbols = context[0].count;
+        int32_t unique_symbols = 1;
+
+        total_symbols -= ((uint32_t)(primary_index_context - context) < (uint32_t)total_symbols);
+
+        while (total_symbols > 1 && context[unique_symbols].count > 0)
+        {
+            if (frequencies[context[unique_symbols].symbol] != context[unique_symbols].count)
+            {
+                queue.push_offset(context[unique_symbols].offset);
+            }
+
+            total_symbols -= context[unique_symbols].count; unique_symbols++;
+        }
+
+        if (total_symbols > 0 && frequencies[context[0].symbol] != total_symbols)
+        {
+            queue.push_offset(context[0].offset);
+        }
+    }
+
+    static void normalize_context(symbol_context * context, int32_t unique_symbols, int32_t total_symbols)
+    {
+        if (unique_symbols > 1)
+        {
+            for (int32_t i = 1; i < unique_symbols; ++i)
+            {
+                symbol_context temp = context[i];
+
+                int32_t j = i;
+                while (j > 0 && (context[j - 1].count < temp.count || (context[j - 1].count == temp.count && context[j - 1].symbol > temp.symbol)))
+                {
+                    context[j] = context[j - 1]; j--;
+                }
+
+                context[j] = temp;
+            }
+
+            {
+                symbol_context * contexts_start = &context[std::max(0, unique_symbols - 6)];
+                symbol_context * contexts_end = &context[unique_symbols - 1];
+
+                while (contexts_start < contexts_end) { std::swap(*contexts_start++, *contexts_end--); }
+            }
+        }
+
+        assert(total_symbols > 0); context[0].count = total_symbols; if (unique_symbols < total_symbols) { context[unique_symbols].count = 0; }
+    }
+
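+    // Estimated cost of recursively splitting 3..5 adjacent segments with binary pivots
+    // (used by the pivot selection above), where each split is charged the total size of
+    // the interval it divides. For example, pivot_cost3(A, B, C) = (A + B + C) + B + min(A, C)
+    // = (A + B + C) + min(A + B, B + C): the whole interval is charged once for the first
+    // split, plus the size of whichever remaining pair is cheaper to split again.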
+    INLINE static int64_t pivot_cost3(int64_t A, int64_t B, int64_t C)
+    {
+        return A + B + C + B + std::min(A, C);
+    }
+
+    INLINE static int64_t pivot_cost4(int64_t A, int64_t B, int64_t C, int64_t D)
+    {
+        return A + B + C + D + std::min(A + B + C + D, std::min(pivot_cost3(A, B, C), pivot_cost3(B, C, D)));
+    }
+
+    INLINE static int64_t pivot_cost5(int64_t A, int64_t B, int64_t C, int64_t D, int64_t E)
+    {
+        return A + B + C + D + E + std::min(std::min(pivot_cost4(B, C, D, E), A + B + pivot_cost3(C, D, E)), std::min(pivot_cost3(A, B, C) + D + E, pivot_cost4(A, B, C, D)));
+    }
+};
+
+#pragma warning( pop )
\ No newline at end of file