diff --git a/CHANGES b/CHANGES
new file mode 100644
index 0000000..d7dae51
--- /dev/null
+++ b/CHANGES
@@ -0,0 +1,2 @@
+* 2021-12-03 : Version 0.1.0
+ * Initial public release of the bsc-m03.
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..dd740e1
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,5 @@
+cmake_minimum_required (VERSION 3.9)
+
+project ("bsc-m03")
+
+add_executable (bsc-m03 bsc-m03.cpp hutucker/hu-tucker.c libsais/libsais.c libsais/libsais16.c)
\ No newline at end of file
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..f288702
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,674 @@
+ GNU GENERAL PUBLIC LICENSE
+ Version 3, 29 June 2007
+
+ Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+ Preamble
+
+ The GNU General Public License is a free, copyleft license for
+software and other kinds of works.
+
+ The licenses for most software and other practical works are designed
+to take away your freedom to share and change the works. By contrast,
+the GNU General Public License is intended to guarantee your freedom to
+share and change all versions of a program--to make sure it remains free
+software for all its users. We, the Free Software Foundation, use the
+GNU General Public License for most of our software; it applies also to
+any other work released this way by its authors. You can apply it to
+your programs, too.
+
+ When we speak of free software, we are referring to freedom, not
+price. Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+them if you wish), that you receive source code or can get it if you
+want it, that you can change the software or use pieces of it in new
+free programs, and that you know you can do these things.
+
+ To protect your rights, we need to prevent others from denying you
+these rights or asking you to surrender the rights. Therefore, you have
+certain responsibilities if you distribute copies of the software, or if
+you modify it: responsibilities to respect the freedom of others.
+
+ For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must pass on to the recipients the same
+freedoms that you received. You must make sure that they, too, receive
+or can get the source code. And you must show them these terms so they
+know their rights.
+
+ Developers that use the GNU GPL protect your rights with two steps:
+(1) assert copyright on the software, and (2) offer you this License
+giving you legal permission to copy, distribute and/or modify it.
+
+ For the developers' and authors' protection, the GPL clearly explains
+that there is no warranty for this free software. For both users' and
+authors' sake, the GPL requires that modified versions be marked as
+changed, so that their problems will not be attributed erroneously to
+authors of previous versions.
+
+ Some devices are designed to deny users access to install or run
+modified versions of the software inside them, although the manufacturer
+can do so. This is fundamentally incompatible with the aim of
+protecting users' freedom to change the software. The systematic
+pattern of such abuse occurs in the area of products for individuals to
+use, which is precisely where it is most unacceptable. Therefore, we
+have designed this version of the GPL to prohibit the practice for those
+products. If such problems arise substantially in other domains, we
+stand ready to extend this provision to those domains in future versions
+of the GPL, as needed to protect the freedom of users.
+
+ Finally, every program is threatened constantly by software patents.
+States should not allow patents to restrict development and use of
+software on general-purpose computers, but in those that do, we wish to
+avoid the special danger that patents applied to a free program could
+make it effectively proprietary. To prevent this, the GPL assures that
+patents cannot be used to render the program non-free.
+
+ The precise terms and conditions for copying, distribution and
+modification follow.
+
+ TERMS AND CONDITIONS
+
+ 0. Definitions.
+
+ "This License" refers to version 3 of the GNU General Public License.
+
+ "Copyright" also means copyright-like laws that apply to other kinds of
+works, such as semiconductor masks.
+
+ "The Program" refers to any copyrightable work licensed under this
+License. Each licensee is addressed as "you". "Licensees" and
+"recipients" may be individuals or organizations.
+
+ To "modify" a work means to copy from or adapt all or part of the work
+in a fashion requiring copyright permission, other than the making of an
+exact copy. The resulting work is called a "modified version" of the
+earlier work or a work "based on" the earlier work.
+
+ A "covered work" means either the unmodified Program or a work based
+on the Program.
+
+ To "propagate" a work means to do anything with it that, without
+permission, would make you directly or secondarily liable for
+infringement under applicable copyright law, except executing it on a
+computer or modifying a private copy. Propagation includes copying,
+distribution (with or without modification), making available to the
+public, and in some countries other activities as well.
+
+ To "convey" a work means any kind of propagation that enables other
+parties to make or receive copies. Mere interaction with a user through
+a computer network, with no transfer of a copy, is not conveying.
+
+ An interactive user interface displays "Appropriate Legal Notices"
+to the extent that it includes a convenient and prominently visible
+feature that (1) displays an appropriate copyright notice, and (2)
+tells the user that there is no warranty for the work (except to the
+extent that warranties are provided), that licensees may convey the
+work under this License, and how to view a copy of this License. If
+the interface presents a list of user commands or options, such as a
+menu, a prominent item in the list meets this criterion.
+
+ 1. Source Code.
+
+ The "source code" for a work means the preferred form of the work
+for making modifications to it. "Object code" means any non-source
+form of a work.
+
+ A "Standard Interface" means an interface that either is an official
+standard defined by a recognized standards body, or, in the case of
+interfaces specified for a particular programming language, one that
+is widely used among developers working in that language.
+
+ The "System Libraries" of an executable work include anything, other
+than the work as a whole, that (a) is included in the normal form of
+packaging a Major Component, but which is not part of that Major
+Component, and (b) serves only to enable use of the work with that
+Major Component, or to implement a Standard Interface for which an
+implementation is available to the public in source code form. A
+"Major Component", in this context, means a major essential component
+(kernel, window system, and so on) of the specific operating system
+(if any) on which the executable work runs, or a compiler used to
+produce the work, or an object code interpreter used to run it.
+
+ The "Corresponding Source" for a work in object code form means all
+the source code needed to generate, install, and (for an executable
+work) run the object code and to modify the work, including scripts to
+control those activities. However, it does not include the work's
+System Libraries, or general-purpose tools or generally available free
+programs which are used unmodified in performing those activities but
+which are not part of the work. For example, Corresponding Source
+includes interface definition files associated with source files for
+the work, and the source code for shared libraries and dynamically
+linked subprograms that the work is specifically designed to require,
+such as by intimate data communication or control flow between those
+subprograms and other parts of the work.
+
+ The Corresponding Source need not include anything that users
+can regenerate automatically from other parts of the Corresponding
+Source.
+
+ The Corresponding Source for a work in source code form is that
+same work.
+
+ 2. Basic Permissions.
+
+ All rights granted under this License are granted for the term of
+copyright on the Program, and are irrevocable provided the stated
+conditions are met. This License explicitly affirms your unlimited
+permission to run the unmodified Program. The output from running a
+covered work is covered by this License only if the output, given its
+content, constitutes a covered work. This License acknowledges your
+rights of fair use or other equivalent, as provided by copyright law.
+
+ You may make, run and propagate covered works that you do not
+convey, without conditions so long as your license otherwise remains
+in force. You may convey covered works to others for the sole purpose
+of having them make modifications exclusively for you, or provide you
+with facilities for running those works, provided that you comply with
+the terms of this License in conveying all material for which you do
+not control copyright. Those thus making or running the covered works
+for you must do so exclusively on your behalf, under your direction
+and control, on terms that prohibit them from making any copies of
+your copyrighted material outside their relationship with you.
+
+ Conveying under any other circumstances is permitted solely under
+the conditions stated below. Sublicensing is not allowed; section 10
+makes it unnecessary.
+
+ 3. Protecting Users' Legal Rights From Anti-Circumvention Law.
+
+ No covered work shall be deemed part of an effective technological
+measure under any applicable law fulfilling obligations under article
+11 of the WIPO copyright treaty adopted on 20 December 1996, or
+similar laws prohibiting or restricting circumvention of such
+measures.
+
+ When you convey a covered work, you waive any legal power to forbid
+circumvention of technological measures to the extent such circumvention
+is effected by exercising rights under this License with respect to
+the covered work, and you disclaim any intention to limit operation or
+modification of the work as a means of enforcing, against the work's
+users, your or third parties' legal rights to forbid circumvention of
+technological measures.
+
+ 4. Conveying Verbatim Copies.
+
+ You may convey verbatim copies of the Program's source code as you
+receive it, in any medium, provided that you conspicuously and
+appropriately publish on each copy an appropriate copyright notice;
+keep intact all notices stating that this License and any
+non-permissive terms added in accord with section 7 apply to the code;
+keep intact all notices of the absence of any warranty; and give all
+recipients a copy of this License along with the Program.
+
+ You may charge any price or no price for each copy that you convey,
+and you may offer support or warranty protection for a fee.
+
+ 5. Conveying Modified Source Versions.
+
+ You may convey a work based on the Program, or the modifications to
+produce it from the Program, in the form of source code under the
+terms of section 4, provided that you also meet all of these conditions:
+
+ a) The work must carry prominent notices stating that you modified
+ it, and giving a relevant date.
+
+ b) The work must carry prominent notices stating that it is
+ released under this License and any conditions added under section
+ 7. This requirement modifies the requirement in section 4 to
+ "keep intact all notices".
+
+ c) You must license the entire work, as a whole, under this
+ License to anyone who comes into possession of a copy. This
+ License will therefore apply, along with any applicable section 7
+ additional terms, to the whole of the work, and all its parts,
+ regardless of how they are packaged. This License gives no
+ permission to license the work in any other way, but it does not
+ invalidate such permission if you have separately received it.
+
+ d) If the work has interactive user interfaces, each must display
+ Appropriate Legal Notices; however, if the Program has interactive
+ interfaces that do not display Appropriate Legal Notices, your
+ work need not make them do so.
+
+ A compilation of a covered work with other separate and independent
+works, which are not by their nature extensions of the covered work,
+and which are not combined with it such as to form a larger program,
+in or on a volume of a storage or distribution medium, is called an
+"aggregate" if the compilation and its resulting copyright are not
+used to limit the access or legal rights of the compilation's users
+beyond what the individual works permit. Inclusion of a covered work
+in an aggregate does not cause this License to apply to the other
+parts of the aggregate.
+
+ 6. Conveying Non-Source Forms.
+
+ You may convey a covered work in object code form under the terms
+of sections 4 and 5, provided that you also convey the
+machine-readable Corresponding Source under the terms of this License,
+in one of these ways:
+
+ a) Convey the object code in, or embodied in, a physical product
+ (including a physical distribution medium), accompanied by the
+ Corresponding Source fixed on a durable physical medium
+ customarily used for software interchange.
+
+ b) Convey the object code in, or embodied in, a physical product
+ (including a physical distribution medium), accompanied by a
+ written offer, valid for at least three years and valid for as
+ long as you offer spare parts or customer support for that product
+ model, to give anyone who possesses the object code either (1) a
+ copy of the Corresponding Source for all the software in the
+ product that is covered by this License, on a durable physical
+ medium customarily used for software interchange, for a price no
+ more than your reasonable cost of physically performing this
+ conveying of source, or (2) access to copy the
+ Corresponding Source from a network server at no charge.
+
+ c) Convey individual copies of the object code with a copy of the
+ written offer to provide the Corresponding Source. This
+ alternative is allowed only occasionally and noncommercially, and
+ only if you received the object code with such an offer, in accord
+ with subsection 6b.
+
+ d) Convey the object code by offering access from a designated
+ place (gratis or for a charge), and offer equivalent access to the
+ Corresponding Source in the same way through the same place at no
+ further charge. You need not require recipients to copy the
+ Corresponding Source along with the object code. If the place to
+ copy the object code is a network server, the Corresponding Source
+ may be on a different server (operated by you or a third party)
+ that supports equivalent copying facilities, provided you maintain
+ clear directions next to the object code saying where to find the
+ Corresponding Source. Regardless of what server hosts the
+ Corresponding Source, you remain obligated to ensure that it is
+ available for as long as needed to satisfy these requirements.
+
+ e) Convey the object code using peer-to-peer transmission, provided
+ you inform other peers where the object code and Corresponding
+ Source of the work are being offered to the general public at no
+ charge under subsection 6d.
+
+ A separable portion of the object code, whose source code is excluded
+from the Corresponding Source as a System Library, need not be
+included in conveying the object code work.
+
+ A "User Product" is either (1) a "consumer product", which means any
+tangible personal property which is normally used for personal, family,
+or household purposes, or (2) anything designed or sold for incorporation
+into a dwelling. In determining whether a product is a consumer product,
+doubtful cases shall be resolved in favor of coverage. For a particular
+product received by a particular user, "normally used" refers to a
+typical or common use of that class of product, regardless of the status
+of the particular user or of the way in which the particular user
+actually uses, or expects or is expected to use, the product. A product
+is a consumer product regardless of whether the product has substantial
+commercial, industrial or non-consumer uses, unless such uses represent
+the only significant mode of use of the product.
+
+ "Installation Information" for a User Product means any methods,
+procedures, authorization keys, or other information required to install
+and execute modified versions of a covered work in that User Product from
+a modified version of its Corresponding Source. The information must
+suffice to ensure that the continued functioning of the modified object
+code is in no case prevented or interfered with solely because
+modification has been made.
+
+ If you convey an object code work under this section in, or with, or
+specifically for use in, a User Product, and the conveying occurs as
+part of a transaction in which the right of possession and use of the
+User Product is transferred to the recipient in perpetuity or for a
+fixed term (regardless of how the transaction is characterized), the
+Corresponding Source conveyed under this section must be accompanied
+by the Installation Information. But this requirement does not apply
+if neither you nor any third party retains the ability to install
+modified object code on the User Product (for example, the work has
+been installed in ROM).
+
+ The requirement to provide Installation Information does not include a
+requirement to continue to provide support service, warranty, or updates
+for a work that has been modified or installed by the recipient, or for
+the User Product in which it has been modified or installed. Access to a
+network may be denied when the modification itself materially and
+adversely affects the operation of the network or violates the rules and
+protocols for communication across the network.
+
+ Corresponding Source conveyed, and Installation Information provided,
+in accord with this section must be in a format that is publicly
+documented (and with an implementation available to the public in
+source code form), and must require no special password or key for
+unpacking, reading or copying.
+
+ 7. Additional Terms.
+
+ "Additional permissions" are terms that supplement the terms of this
+License by making exceptions from one or more of its conditions.
+Additional permissions that are applicable to the entire Program shall
+be treated as though they were included in this License, to the extent
+that they are valid under applicable law. If additional permissions
+apply only to part of the Program, that part may be used separately
+under those permissions, but the entire Program remains governed by
+this License without regard to the additional permissions.
+
+ When you convey a copy of a covered work, you may at your option
+remove any additional permissions from that copy, or from any part of
+it. (Additional permissions may be written to require their own
+removal in certain cases when you modify the work.) You may place
+additional permissions on material, added by you to a covered work,
+for which you have or can give appropriate copyright permission.
+
+ Notwithstanding any other provision of this License, for material you
+add to a covered work, you may (if authorized by the copyright holders of
+that material) supplement the terms of this License with terms:
+
+ a) Disclaiming warranty or limiting liability differently from the
+ terms of sections 15 and 16 of this License; or
+
+ b) Requiring preservation of specified reasonable legal notices or
+ author attributions in that material or in the Appropriate Legal
+ Notices displayed by works containing it; or
+
+ c) Prohibiting misrepresentation of the origin of that material, or
+ requiring that modified versions of such material be marked in
+ reasonable ways as different from the original version; or
+
+ d) Limiting the use for publicity purposes of names of licensors or
+ authors of the material; or
+
+ e) Declining to grant rights under trademark law for use of some
+ trade names, trademarks, or service marks; or
+
+ f) Requiring indemnification of licensors and authors of that
+ material by anyone who conveys the material (or modified versions of
+ it) with contractual assumptions of liability to the recipient, for
+ any liability that these contractual assumptions directly impose on
+ those licensors and authors.
+
+ All other non-permissive additional terms are considered "further
+restrictions" within the meaning of section 10. If the Program as you
+received it, or any part of it, contains a notice stating that it is
+governed by this License along with a term that is a further
+restriction, you may remove that term. If a license document contains
+a further restriction but permits relicensing or conveying under this
+License, you may add to a covered work material governed by the terms
+of that license document, provided that the further restriction does
+not survive such relicensing or conveying.
+
+ If you add terms to a covered work in accord with this section, you
+must place, in the relevant source files, a statement of the
+additional terms that apply to those files, or a notice indicating
+where to find the applicable terms.
+
+ Additional terms, permissive or non-permissive, may be stated in the
+form of a separately written license, or stated as exceptions;
+the above requirements apply either way.
+
+ 8. Termination.
+
+ You may not propagate or modify a covered work except as expressly
+provided under this License. Any attempt otherwise to propagate or
+modify it is void, and will automatically terminate your rights under
+this License (including any patent licenses granted under the third
+paragraph of section 11).
+
+ However, if you cease all violation of this License, then your
+license from a particular copyright holder is reinstated (a)
+provisionally, unless and until the copyright holder explicitly and
+finally terminates your license, and (b) permanently, if the copyright
+holder fails to notify you of the violation by some reasonable means
+prior to 60 days after the cessation.
+
+ Moreover, your license from a particular copyright holder is
+reinstated permanently if the copyright holder notifies you of the
+violation by some reasonable means, this is the first time you have
+received notice of violation of this License (for any work) from that
+copyright holder, and you cure the violation prior to 30 days after
+your receipt of the notice.
+
+ Termination of your rights under this section does not terminate the
+licenses of parties who have received copies or rights from you under
+this License. If your rights have been terminated and not permanently
+reinstated, you do not qualify to receive new licenses for the same
+material under section 10.
+
+ 9. Acceptance Not Required for Having Copies.
+
+ You are not required to accept this License in order to receive or
+run a copy of the Program. Ancillary propagation of a covered work
+occurring solely as a consequence of using peer-to-peer transmission
+to receive a copy likewise does not require acceptance. However,
+nothing other than this License grants you permission to propagate or
+modify any covered work. These actions infringe copyright if you do
+not accept this License. Therefore, by modifying or propagating a
+covered work, you indicate your acceptance of this License to do so.
+
+ 10. Automatic Licensing of Downstream Recipients.
+
+ Each time you convey a covered work, the recipient automatically
+receives a license from the original licensors, to run, modify and
+propagate that work, subject to this License. You are not responsible
+for enforcing compliance by third parties with this License.
+
+ An "entity transaction" is a transaction transferring control of an
+organization, or substantially all assets of one, or subdividing an
+organization, or merging organizations. If propagation of a covered
+work results from an entity transaction, each party to that
+transaction who receives a copy of the work also receives whatever
+licenses to the work the party's predecessor in interest had or could
+give under the previous paragraph, plus a right to possession of the
+Corresponding Source of the work from the predecessor in interest, if
+the predecessor has it or can get it with reasonable efforts.
+
+ You may not impose any further restrictions on the exercise of the
+rights granted or affirmed under this License. For example, you may
+not impose a license fee, royalty, or other charge for exercise of
+rights granted under this License, and you may not initiate litigation
+(including a cross-claim or counterclaim in a lawsuit) alleging that
+any patent claim is infringed by making, using, selling, offering for
+sale, or importing the Program or any portion of it.
+
+ 11. Patents.
+
+ A "contributor" is a copyright holder who authorizes use under this
+License of the Program or a work on which the Program is based. The
+work thus licensed is called the contributor's "contributor version".
+
+ A contributor's "essential patent claims" are all patent claims
+owned or controlled by the contributor, whether already acquired or
+hereafter acquired, that would be infringed by some manner, permitted
+by this License, of making, using, or selling its contributor version,
+but do not include claims that would be infringed only as a
+consequence of further modification of the contributor version. For
+purposes of this definition, "control" includes the right to grant
+patent sublicenses in a manner consistent with the requirements of
+this License.
+
+ Each contributor grants you a non-exclusive, worldwide, royalty-free
+patent license under the contributor's essential patent claims, to
+make, use, sell, offer for sale, import and otherwise run, modify and
+propagate the contents of its contributor version.
+
+ In the following three paragraphs, a "patent license" is any express
+agreement or commitment, however denominated, not to enforce a patent
+(such as an express permission to practice a patent or covenant not to
+sue for patent infringement). To "grant" such a patent license to a
+party means to make such an agreement or commitment not to enforce a
+patent against the party.
+
+ If you convey a covered work, knowingly relying on a patent license,
+and the Corresponding Source of the work is not available for anyone
+to copy, free of charge and under the terms of this License, through a
+publicly available network server or other readily accessible means,
+then you must either (1) cause the Corresponding Source to be so
+available, or (2) arrange to deprive yourself of the benefit of the
+patent license for this particular work, or (3) arrange, in a manner
+consistent with the requirements of this License, to extend the patent
+license to downstream recipients. "Knowingly relying" means you have
+actual knowledge that, but for the patent license, your conveying the
+covered work in a country, or your recipient's use of the covered work
+in a country, would infringe one or more identifiable patents in that
+country that you have reason to believe are valid.
+
+ If, pursuant to or in connection with a single transaction or
+arrangement, you convey, or propagate by procuring conveyance of, a
+covered work, and grant a patent license to some of the parties
+receiving the covered work authorizing them to use, propagate, modify
+or convey a specific copy of the covered work, then the patent license
+you grant is automatically extended to all recipients of the covered
+work and works based on it.
+
+ A patent license is "discriminatory" if it does not include within
+the scope of its coverage, prohibits the exercise of, or is
+conditioned on the non-exercise of one or more of the rights that are
+specifically granted under this License. You may not convey a covered
+work if you are a party to an arrangement with a third party that is
+in the business of distributing software, under which you make payment
+to the third party based on the extent of your activity of conveying
+the work, and under which the third party grants, to any of the
+parties who would receive the covered work from you, a discriminatory
+patent license (a) in connection with copies of the covered work
+conveyed by you (or copies made from those copies), or (b) primarily
+for and in connection with specific products or compilations that
+contain the covered work, unless you entered into that arrangement,
+or that patent license was granted, prior to 28 March 2007.
+
+ Nothing in this License shall be construed as excluding or limiting
+any implied license or other defenses to infringement that may
+otherwise be available to you under applicable patent law.
+
+ 12. No Surrender of Others' Freedom.
+
+ If conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot convey a
+covered work so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you may
+not convey it at all. For example, if you agree to terms that obligate you
+to collect a royalty for further conveying from those to whom you convey
+the Program, the only way you could satisfy both those terms and this
+License would be to refrain entirely from conveying the Program.
+
+ 13. Use with the GNU Affero General Public License.
+
+ Notwithstanding any other provision of this License, you have
+permission to link or combine any covered work with a work licensed
+under version 3 of the GNU Affero General Public License into a single
+combined work, and to convey the resulting work. The terms of this
+License will continue to apply to the part which is the covered work,
+but the special requirements of the GNU Affero General Public License,
+section 13, concerning interaction through a network will apply to the
+combination as such.
+
+ 14. Revised Versions of this License.
+
+ The Free Software Foundation may publish revised and/or new versions of
+the GNU General Public License from time to time. Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+ Each version is given a distinguishing version number. If the
+Program specifies that a certain numbered version of the GNU General
+Public License "or any later version" applies to it, you have the
+option of following the terms and conditions either of that numbered
+version or of any later version published by the Free Software
+Foundation. If the Program does not specify a version number of the
+GNU General Public License, you may choose any version ever published
+by the Free Software Foundation.
+
+ If the Program specifies that a proxy can decide which future
+versions of the GNU General Public License can be used, that proxy's
+public statement of acceptance of a version permanently authorizes you
+to choose that version for the Program.
+
+ Later license versions may give you additional or different
+permissions. However, no additional obligations are imposed on any
+author or copyright holder as a result of your choosing to follow a
+later version.
+
+ 15. Disclaimer of Warranty.
+
+ THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
+APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
+HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
+OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
+IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
+ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+ 16. Limitation of Liability.
+
+ IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
+THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
+GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
+USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
+DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
+PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
+EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGES.
+
+ 17. Interpretation of Sections 15 and 16.
+
+ If the disclaimer of warranty and limitation of liability provided
+above cannot be given local legal effect according to their terms,
+reviewing courts shall apply local law that most closely approximates
+an absolute waiver of all civil liability in connection with the
+Program, unless a warranty or assumption of liability accompanies a
+copy of the Program in return for a fee.
+
+ END OF TERMS AND CONDITIONS
+
+ How to Apply These Terms to Your New Programs
+
+ If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+ To do so, attach the following notices to the program. It is safest
+to attach them to the start of each source file to most effectively
+state the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+    <one line to give the program's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+Also add information on how to contact you by electronic and paper mail.
+
+ If the program does terminal interaction, make it output a short
+notice like this when it starts in an interactive mode:
+
+ <program> Copyright (C) <year> <name of author>
+ This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+ This is free software, and you are welcome to redistribute it
+ under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License. Of course, your program's commands
+might be different; for a GUI interface, you would use an "about box".
+
+ You should also get your employer (if you work as a programmer) or school,
+if any, to sign a "copyright disclaimer" for the program, if necessary.
+For more information on this, and how to apply and follow the GNU GPL, see
+<https://www.gnu.org/licenses/>.
+
+ The GNU General Public License does not permit incorporating your program
+into proprietary programs. If your program is a subroutine library, you
+may consider it more useful to permit linking proprietary applications with
+the library. If this is what you want to do, use the GNU Lesser General
+Public License instead of this License. But first, please read
+<https://www.gnu.org/licenses/why-not-lgpl.html>.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..d328b37
--- /dev/null
+++ b/README.md
@@ -0,0 +1,106 @@
+# bsc-m03
+
+The bsc-m03 is an experimental block-sorting compressor based on the M03 context-aware compression algorithm invented by Michael Maniscalco:
+* Michael Maniscalco *M03: A solution for context based blocksort (BWT) compression*, 2004
+* Jurgen Abel *Post BWT stages of the Burrows-Wheeler compression algorithm*, 2010
+
+Copyright (c) 2021 Ilya Grebnov <Ilya.Grebnov@gmail.com>
+
+## License
+The bsc-m03 is released under the [GNU General Public License](LICENSE "GNU General Public License")
+
+## Changes
+* 2021-12-03 : Version 0.1.0
+ * Initial public release of the bsc-m03.
+
+# Benchmarks
+
+### Calgary Corpus ###
+| File name | Input size (bytes) | Output size (bytes) | Bits per symbol |
+|:---------------:|:-----------:|:------------:|:-------:|
+| bib | 111261 | 25143 | 1.808 |
+| book1 | 768771 | 208157 | 2.166 |
+| book2 | 610856 | 141591 | 1.854 |
+| geo | 102400 | 52797 | 4.125 |
+| news | 377109 | 108387 | 2.299 |
+| obj1 | 21504 | 9901 | 3.683 |
+| obj2 | 246814 | 69689 | 2.259 |
+| paper1 | 53161 | 15384 | 2.315 |
+| paper2 | 82199 | 23161 | 2.254 |
+| pic | 513216 | 44920 | 0.700 |
+| progc | 39611 | 11525 | 2.328 |
+| progl | 71646 | 13921 | 1.554 |
+| progp | 49379 | 9530 | 1.544 |
+| trans | 93695 | 15759 | 1.346 |
+
+### Canterbury Corpus ###
+| File name | Input size (bytes) | Output size (bytes) | Bits per symbol |
+|:---------------:|:-----------:|:------------:|:-------:|
+| alice29.txt | 152089 | 39310 | 2.068 |
+| asyoulik.txt | 125179 | 36585 | 2.338 |
+| cp.html | 24603 | 7042 | 2.290 |
+| fields.c | 11150 | 2748 | 1.972 |
+| grammar.lsp | 3721 | 1142 | 2.455 |
+| kennedy.xls | 1029744 | 58440 | 0.454 |
+| lcet10.txt | 426754 | 96730 | 1.813 |
+| plrabn12.txt | 481861 | 131617 | 2.185 |
+| ptt5 | 513216 | 44920 | 0.700 |
+| sum | 38240 | 11599 | 2.427 |
+| xargs.1 | 4227 | 1618 | 3.062 |
+
+### Large Canterbury Corpus ###
+| File name | Input size (bytes) | Output size (bytes) | Bits per symbol |
+|:---------------:|:-----------:|:------------:|:-------:|
+| bible.txt | 4047392 | 708602 | 1.401 |
+| E.coli | 4638690 | 1137915 | 1.962 |
+| world192.txt | 2473400 | 384776 | 1.245 |
+
+### Silesia Corpus ###
+| File name | Input size (bytes) | Output size (bytes) | Bits per symbol |
+|:---------------:|:-----------:|:------------:|:-------:|
+| dickens | 10192446 | 2220939 | 1.743 |
+| mozilla | 51220480 | 15831237 | 2.473 |
+| mr | 9970564 | 2169223 | 1.741 |
+| nci | 33553445 | 1148550 | 0.274 |
+| ooffice | 6152192 | 2542258 | 3.306 |
+| osdb | 10085684 | 2251471 | 1.786 |
+| reymont | 6627202 | 972461 | 1.174 |
+| samba | 21606400 | 3881872 | 1.437 |
+| sao | 7251944 | 4672656 | 5.155 |
+| webster | 41458703 | 6318267 | 1.219 |
+| xml | 5345280 | 369196 | 0.553 |
+| x-ray | 8474240 | 3697722 | 3.491 |
+
+### Manzini Corpus ###
+| File name | Input size (bytes) | Output size (bytes) | Bits per symbol |
+|:---------------:|:-----------:|:------------:|:-------:|
+| chr22.dna | 34553758 | 7262753 | 1.681 |
+| etext99 | 105277340 | 21730495 | 1.651 |
+| gcc-3.0.tar | 86630400 | 10306097 | 0.952 |
+| howto | 39422105 | 7662880 | 1.555 |
+| jdk13c | 69728899 | 2692938 | 0.309 |
+| linux-2.4.5.tar | 116254720 | 16773180 | 1.154 |
+| rctail96 | 114711151 | 9949692 | 0.694 |
+| rfc | 116421901 | 15192366 | 1.044 |
+| sprot34.dat | 109617186 | 17534134 | 1.280 |
+| w3c2 | 104201579 | 5800775 | 0.445 |
+
+### Maximum Compression Corpus ###
+| File name | Input size (bytes) | Output size (bytes) | Bits per symbol |
+|:---------------:|:-----------:|:------------:|:-------:|
+| A10.jpg | 842468 | 825162 | 7.836 |
+| AcroRd32.exe | 3870784 | 1582677 | 3.271 |
+| english.dic | 465211 | 148582 | 2.555 |
+| FlashMX.pdf | 4526946 | 3735179 | 6.601 |
+| FP.LOG | 20617071 | 514554 | 0.200 |
+| MSO97.DLL | 3782416 | 1904460 | 4.028 |
+| ohs.doc | 4168192 | 817718 | 1.569 |
+| rafale.bmp | 4149414 | 750437 | 1.447 |
+| vcfiu.hlp | 4121418 | 620358 | 1.204 |
+| world95.txt | 2988578 | 452271 | 1.211 |
+
+### Large Text Compression Benchmark Corpus ###
+| File name | Input size (bytes) | Output size (bytes) | Bits per symbol |
+|:---------------:|:-----------:|:------------:|:-------:|
+| enwik8 | 100000000 | 20529360 | 1.642 |
+| enwik9 | 1000000000 | 162084133 | 1.297 |
diff --git a/VERSION b/VERSION
new file mode 100644
index 0000000..6c6aa7c
--- /dev/null
+++ b/VERSION
@@ -0,0 +1 @@
+0.1.0
\ No newline at end of file
diff --git a/bsc-m03.cpp b/bsc-m03.cpp
new file mode 100644
index 0000000..0279baa
--- /dev/null
+++ b/bsc-m03.cpp
@@ -0,0 +1,483 @@
+/*--
+
+This file is a part of bsc-m03 project.
+
+ Copyright (c) 2021 Ilya Grebnov <Ilya.Grebnov@gmail.com>
+
+ bsc-m03 is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ bsc-m03 is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with bsc-m03. If not, see <https://www.gnu.org/licenses/>.
+
+--*/
+
+#define _CRT_SECURE_NO_WARNINGS
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+#include <algorithm>
+
+#include "libsais/libsais.h"
+#include "libsais/libsais16.h"
+
+#include "common/platform.h"
+#include "common/rangecoder.h"
+
+#define MAX_ALPHABET_SIZE (256 * 256)
+
+#include "m03_parser.h"
+
+#pragma warning( push )
+#pragma warning( disable : 6385 )
+#pragma warning( disable : 6386 )
+
+int32_t root_frequencies[MAX_ALPHABET_SIZE + 1];
+
+static int32_t compress_memory_block(uint8_t * buffer, int32_t block_size, int32_t symbol_size)
+{
+ if (block_size % symbol_size != 0)
+ {
+ fprintf(stderr, "\nError: Block size of %d bytes is not a multiple of symbol width!\n", block_size);
+ return -2;
+ }
+
+ int32_t indexes[32] = { -1 };
+ int32_t comressed_size = -1;
+ int32_t block_symbols = block_size / symbol_size;
+ int32_t r = next_power_of_2(std::max(block_symbols / 16, 1048576));
+
+ if (int32_t * libsais_temp = (int32_t *)malloc(block_symbols * sizeof(int32_t)))
+ {
+ int32_t result = symbol_size == 1
+ ? libsais_bwt_aux(buffer, buffer, libsais_temp, block_symbols, 0, root_frequencies, r, indexes)
+ : libsais16_bwt_aux((uint16_t *)buffer, (uint16_t *)buffer, libsais_temp, block_symbols, 0, root_frequencies, r, indexes);
+
+ free(libsais_temp);
+
+ if (result == 0)
+ {
+ if (uint16_t * L = (uint16_t *)malloc(((size_t)block_symbols + 1) * sizeof(uint16_t)))
+ {
+ if (m03_parser * parser = (m03_parser *)malloc(sizeof(m03_parser)))
+ {
+ {
+ int32_t primary_index = indexes[0];
+
+ if (symbol_size == 1)
+ {
+ for (int32_t p = 0; p < primary_index; ++p) { L[p + 0] = ((uint16_t)buffer[p]); }
+ for (int32_t p = primary_index; p < block_symbols; ++p) { L[p + 1] = ((uint16_t)buffer[p]); }
+ }
+ else
+ {
+ for (int32_t p = 0; p < primary_index; ++p) { L[p + 0] = ((uint16_t *)buffer)[p]; }
+ for (int32_t p = primary_index; p < block_symbols; ++p) { L[p + 1] = ((uint16_t *)buffer)[p]; }
+ }
+
+ L[primary_index] = 0;
+ }
+
+ RangeCoder coder;
+ coder.InitEncoder(buffer, block_size);
+ coder.EncodeValue(1, symbol_size, 2);
+
+ for (int32_t t = 0; t <= (block_symbols - 1) / r; ++t)
+ {
+ coder.EncodeValue(1, indexes[t], block_symbols);
+ }
+
+ if (parser->initialize(L, block_symbols + 1, indexes[0], root_frequencies, symbol_size == 1 ? 256 : 256 * 256, &coder, m03_mode::encoding))
+ {
+ parser->run();
+ parser->destroy();
+
+ comressed_size = coder.FinishEncoder();
+ }
+ else
+ {
+ fprintf(stderr, "\nError: Not enough memory!\n");
+ }
+
+ free(parser);
+ }
+ else
+ {
+ fprintf(stderr, "\nError: Not enough memory!\n");
+ }
+
+ free(L);
+ }
+ else
+ {
+ fprintf(stderr, "\nError: Not enough memory!\n");
+ }
+ }
+ else
+ {
+ fprintf(stderr, "\nError: libsais_bwt failed, please contact the author!\n");
+ }
+ }
+ else
+ {
+ fprintf(stderr, "\nError: Not enough memory!\n");
+ }
+
+ return comressed_size;
+}
+
+static int32_t decompress_memory_block(uint8_t * buffer, int32_t comressed_size, int32_t block_size)
+{
+ RangeCoder coder;
+ coder.InitDecoder(buffer);
+ int32_t symbol_size = coder.DecodeValue(1, 2);
+
+ int32_t indexes[32] = { -1 };
+ int32_t primary_index = -1;
+ int32_t decomressed_size = -1;
+ int32_t block_symbols = block_size / symbol_size;
+ int32_t r = next_power_of_2(std::max(block_symbols / 16, 1048576));
+
+ for (int32_t t = 0; t <= (block_symbols - 1) / r; ++t)
+ {
+ indexes[t] = coder.DecodeValue(1, block_symbols);
+ }
+
+ if (uint16_t * L = (uint16_t *)malloc(((size_t)block_symbols + 1) * sizeof(uint16_t)))
+ {
+ if (m03_parser * parser = (m03_parser *)malloc(sizeof(m03_parser)))
+ {
+ if (parser->initialize(L, block_symbols + 1, indexes[0], root_frequencies, symbol_size == 1 ? 256 : 256 * 256, &coder, m03_mode::decoding))
+ {
+ parser->run();
+ parser->destroy();
+
+ {
+ primary_index = indexes[0];
+
+ if (symbol_size == 1)
+ {
+ for (int32_t p = 0; p < primary_index; ++p) { buffer[p] = (uint8_t)L[p + 0]; }
+ for (int32_t p = primary_index; p < block_symbols; ++p) { buffer[p] = (uint8_t)L[p + 1]; }
+ }
+ else
+ {
+ for (int32_t p = 0; p < primary_index; ++p) { ((uint16_t *)buffer)[p] = L[p + 0]; }
+ for (int32_t p = primary_index; p < block_symbols; ++p) { ((uint16_t *)buffer)[p] = L[p + 1]; }
+ }
+ }
+ }
+ else
+ {
+ fprintf(stderr, "\nError: Not enough memory!\n");
+ }
+
+ free(parser);
+ }
+ else
+ {
+ fprintf(stderr, "\nError: Not enough memory!\n");
+ }
+
+ free(L);
+ }
+ else
+ {
+ fprintf(stderr, "\nError: Not enough memory!\n");
+ }
+
+ if (primary_index > 0)
+ {
+ if (int32_t * libsais_temp = (int32_t *)malloc(((size_t)block_symbols + 1) * sizeof(int32_t)))
+ {
+ int32_t result = symbol_size == 1
+ ? libsais_unbwt_aux(buffer, buffer, libsais_temp, block_symbols, root_frequencies, r, indexes)
+ : libsais16_unbwt_aux((uint16_t *)buffer, (uint16_t *)buffer, libsais_temp, block_symbols, root_frequencies, r, indexes);
+
+ if (result == 0)
+ {
+ decomressed_size = block_size;
+ }
+ else
+ {
+ fprintf(stderr, "\nError: libsais_unbwt failed, please contact the author!\n");
+ }
+
+ free(libsais_temp);
+ }
+ else
+ {
+ fprintf(stderr, "\nError: Not enough memory!\n");
+ }
+ }
+
+ return decomressed_size;
+}
+
+static int compress_file(const char * input_file_name, const char * output_file_name, int32_t max_block_size, int32_t symbol_size)
+{
+ clock_t start_time = clock();
+ if (FILE * input_file = fopen(input_file_name, "rb"))
+ {
+ if (FILE * output_file = fopen(output_file_name, "wb"))
+ {
+ fseek(input_file, 0, SEEK_END); int64_t remaining_size = _ftelli64(input_file); rewind(input_file);
+
+ if (uint8_t * buffer = (uint8_t *)malloc(std::min(remaining_size, (int64_t)max_block_size) * sizeof(uint8_t)))
+ {
+ int64_t input_bytes = 0, output_bytes = 0;
+
+ while (remaining_size > 0)
+ {
+ fprintf(stdout, "\rCompressing %.55s(%02d%%)", input_file_name, (int)((input_bytes * 100) / (input_bytes + remaining_size)));
+
+ int32_t block_size = (int32_t)std::min(remaining_size, (int64_t)max_block_size);
+
+ if (fread(buffer, sizeof(uint8_t), block_size, input_file) != block_size)
+ {
+ fprintf(stderr, "\nError: Unable to read input file!\n");
+ break;
+ }
+
+ int32_t comressed_size = compress_memory_block(buffer, block_size, symbol_size);
+ if (comressed_size <= 0) { break; }
+
+ if (fwrite(&block_size, sizeof(uint8_t), sizeof(block_size), output_file) != sizeof(block_size))
+ {
+ fprintf(stderr, "\nError: Unable to write output file!\n");
+ break;
+ }
+
+ if (fwrite(&comressed_size, sizeof(uint8_t), sizeof(comressed_size), output_file) != sizeof(comressed_size))
+ {
+ fprintf(stderr, "\nError: Unable to write output file!\n");
+ break;
+ }
+
+ if (fwrite(buffer, sizeof(uint8_t), comressed_size, output_file) != comressed_size)
+ {
+ fprintf(stderr, "\nError: Unable to write output file\n");
+ break;
+ }
+
+ remaining_size -= block_size;
+ input_bytes += block_size;
+ output_bytes += sizeof(block_size) + sizeof(comressed_size) + comressed_size;
+ }
+
+ if (remaining_size == 0)
+ {
+ fprintf(stdout, "\r%.55s compressed from %lld into %lld in %.3f seconds (%.3f bps).\n", input_file_name, input_bytes, output_bytes, ((double)clock() - start_time) / CLOCKS_PER_SEC, (8.0 * symbol_size * output_bytes) / input_bytes);
+ }
+
+ free(buffer);
+ }
+ else
+ {
+ fprintf(stderr, "Error: Not enough memory!\n");
+ }
+
+ fclose(output_file);
+ }
+ else
+ {
+ fprintf(stderr, "Error: Unable to open output file!\n");
+ }
+
+ fclose(input_file);
+ }
+ else
+ {
+ fprintf(stderr, "Error: Unable to open input file!\n");
+ }
+
+ return 0;
+}
+
+static int decompress_file(const char * input_file_name, const char * output_file_name)
+{
+ clock_t start_time = clock();
+ if (FILE * input_file = fopen(input_file_name, "rb"))
+ {
+ if (FILE * output_file = fopen(output_file_name, "wb"))
+ {
+ int32_t max_block_size;
+ if (fread(&max_block_size, sizeof(uint8_t), sizeof(max_block_size), input_file) == sizeof(max_block_size))
+ {
+ fseek(input_file, 0, SEEK_END); int64_t remaining_size = _ftelli64(input_file); rewind(input_file);
+
+ if (uint8_t * buffer = (uint8_t *)malloc(max_block_size * sizeof(uint8_t)))
+ {
+ int64_t input_bytes = 0, output_bytes = 0;
+
+ while (remaining_size > 0)
+ {
+ fprintf(stdout, "\rDecompressing %.55s(%02d%%)", input_file_name, (int)((input_bytes * 100) / (input_bytes + remaining_size)));
+
+ int32_t block_size, comressed_size;
+ if (fread(&block_size, sizeof(uint8_t), sizeof(block_size), input_file) != sizeof(block_size))
+ {
+ fprintf(stderr, "\nError: Unable to read input file!\n");
+ break;
+ }
+
+ if (fread(&comressed_size, sizeof(uint8_t), sizeof(comressed_size), input_file) != sizeof(comressed_size))
+ {
+ fprintf(stderr, "\nError: Unable to read input file!\n");
+ break;
+ }
+
+ if (block_size > max_block_size || comressed_size > max_block_size)
+ {
+ fprintf(stderr, "\nError: The compressed data is corrupted!\n");
+ break;
+ }
+
+ if (fread(buffer, sizeof(uint8_t), comressed_size, input_file) != comressed_size)
+ {
+ fprintf(stderr, "\nError: Unable to read input file!\n");
+ break;
+ }
+
+ int32_t decomressed_size = decompress_memory_block(buffer, comressed_size, block_size);
+ if (decomressed_size != block_size) { break; }
+
+ if (fwrite(buffer, sizeof(uint8_t), decomressed_size, output_file) != decomressed_size)
+ {
+ fprintf(stderr, "\nError: Unable to write output file\n");
+ break;
+ }
+
+ remaining_size -= sizeof(block_size) + sizeof(comressed_size) + comressed_size;
+ input_bytes += sizeof(block_size) + sizeof(comressed_size) + comressed_size;
+ output_bytes += decomressed_size;
+ }
+
+ if (remaining_size == 0)
+ {
+ fprintf(stdout, "\r%.55s decompressed from %lld into %lld in %.3f seconds.\n", input_file_name, input_bytes, output_bytes, ((double)clock() - start_time) / CLOCKS_PER_SEC);
+ }
+
+ free(buffer);
+ }
+ else
+ {
+ fprintf(stderr, "Error: Not enough memory!\n");
+ }
+ }
+ else
+ {
+ fprintf(stderr, "Error: Unable to read input file!\n");
+ }
+
+ fclose(output_file);
+ }
+ else
+ {
+ fprintf(stderr, "Error: Unable to open output file!\n");
+ }
+
+ fclose(input_file);
+ }
+ else
+ {
+ fprintf(stderr, "Error: Unable to open input file!\n");
+ }
+
+ return 0;
+}
+
+static int print_usage()
+{
+ fprintf(stdout, "Usage: bsc-m03 <e|d> input-file output-file <options>\n");
+ fprintf(stdout, " -b<size> Block size in bytes, default 128MB (memory usage is ~15x).\n");
+ fprintf(stdout, " -w<8|16> Symbol width in bits.\n");
+
+ return 0;
+}
+
+int main(int argc, const char * argv[])
+{
+ fprintf(stdout, "bsc-m03 is experimental block sorting compressor. Version 0.1.0 (3 December 2021).\n");
+ fprintf(stdout, "Copyright (c) 2021 Ilya Grebnov <Ilya.Grebnov@gmail.com>. ABSOLUTELY NO WARRANTY.\n");
+ fprintf(stdout, "This program is based on (at least) the work of Michael Maniscalco and Atsushi Komiya.\n\n");
+
+ int32_t max_block_size = 128 * 1024 * 1024;
+ int32_t symbol_width = 8;
+
+ if (argc < 4 || strlen(argv[1]) != 1)
+ {
+ return print_usage();
+ }
+
+ for (int32_t i = 4; i < argc; ++i)
+ {
+ if (argv[i][0] != '-')
+ {
+ return print_usage();
+ }
+
+ switch (argv[i][1])
+ {
+ case 'b':
+ {
+ max_block_size = atoi(argv[i] + 2);
+ if (max_block_size <= 0) { return print_usage(); }
+
+ break;
+ }
+
+ case 'w':
+ {
+ symbol_width = atoi(argv[i] + 2);
+ if (symbol_width != 8 && symbol_width != 16) { return print_usage(); }
+
+ break;
+ }
+
+ default:
+ {
+ return print_usage();
+ }
+ }
+ }
+
+ switch (argv[1][0])
+ {
+ case 'c':
+ case 'C':
+ case 'e':
+ case 'E':
+ {
+ return compress_file(argv[2], argv[3], max_block_size, symbol_width / 8);
+ }
+
+ case 'd':
+ case 'D':
+ {
+ if (argc != 4) { return print_usage(); }
+
+ return decompress_file(argv[2], argv[3]);
+ }
+
+ default:
+ {
+ return print_usage();
+ }
+ }
+
+ return 0;
+}
+
+#pragma warning( pop )
\ No newline at end of file
diff --git a/common/LICENSE b/common/LICENSE
new file mode 100644
index 0000000..d645695
--- /dev/null
+++ b/common/LICENSE
@@ -0,0 +1,202 @@
+
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright [yyyy] [name of copyright owner]
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
diff --git a/common/platform.h b/common/platform.h
new file mode 100644
index 0000000..1252057
--- /dev/null
+++ b/common/platform.h
@@ -0,0 +1,125 @@
+/*-----------------------------------------------------------*/
+/* Block Sorting, Lossless Data Compression Library. */
+/* Interface to platform specific functions and constants */
+/*-----------------------------------------------------------*/
+
+/*--
+
+This file is a part of bsc and/or libbsc, a program and a library for
+lossless, block-sorting data compression.
+
+ Copyright (c) 2009-2021 Ilya Grebnov <Ilya.Grebnov@gmail.com>
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+
+Please see the file LICENSE for full copyright information.
+
+See also the bsc and libbsc web site:
+ http://libbsc.com/ for more information.
+
+--*/
+
+/*--
+
+NOTICE: This file has been modified for use in the bsc-m03 project.
+
+--*/
+
+#ifndef _LIBBSC_PLATFORM_H
+#define _LIBBSC_PLATFORM_H
+
+#if defined(_MSC_VER)
+ #include <intrin.h>
+#else
+ #include <x86intrin.h>
+#endif
+
+#if defined(__GNUC__)
+ #define INLINE __inline__
+#elif defined(_MSC_VER)
+ #define INLINE __forceinline
+#elif defined(__IBMC__)
+ #define INLINE _Inline
+#elif defined(__cplusplus)
+ #define INLINE inline
+#else
+ #define INLINE /* */
+#endif
+
+#if defined(_MSC_VER)
+ #define NOINLINE __declspec(noinline)
+#elif defined(__GNUC__)
+ #define NOINLINE __attribute__ ((noinline))
+#else
+ #define NOINLINE /* */
+#endif
+
+#if defined(_MSC_VER)
+ #define ALIGNED(x) __declspec(align(x))
+#elif defined(__GNUC__)
+ #define ALIGNED(x) __attribute__ ((aligned(x)))
+#endif
+
+#if defined(__GNUC__) || defined(__clang__) || defined(__CUDACC__)
+ #define RESTRICT __restrict__
+#elif defined(_MSC_VER) || defined(__INTEL_COMPILER)
+ #define RESTRICT __restrict
+#else
+ #define RESTRICT /* */
+#endif
+
+#if defined(__GNUC__) || defined(__clang__)
+ #define byteswap_uint64(x) (__builtin_bswap64(x))
+ #define bit_scan_reverse(x) (__builtin_clz(x) ^ 31)
+ #define bit_scan_forward(x) (__builtin_ctz(x))
+ #define bit_scan_forward64(x) (__builtin_ctzll(x))
+#elif defined(_MSC_VER)
+ #define byteswap_uint64(x) (_byteswap_uint64(x))
+
+ #pragma intrinsic(_BitScanReverse)
+ #pragma intrinsic(_BitScanForward)
+
+ static inline __forceinline unsigned long bit_scan_reverse(unsigned long x)
+ {
+ unsigned long index;
+ _BitScanReverse(&index, x);
+ return index;
+ }
+
+ static inline __forceinline unsigned long bit_scan_forward(unsigned long x)
+ {
+ unsigned long index;
+ _BitScanForward(&index, x);
+ return index;
+ }
+#endif
+
+ static INLINE unsigned int next_power_of_2(unsigned int v)
+ {
+ v--;
+
+ v |= v >> 1;
+ v |= v >> 2;
+ v |= v >> 4;
+ v |= v >> 8;
+ v |= v >> 16;
+ v++;
+
+ return v;
+ }
+
+#endif
+
+/*-----------------------------------------------------------*/
+/* End platform.h */
+/*-----------------------------------------------------------*/
diff --git a/common/rangecoder.h b/common/rangecoder.h
new file mode 100644
index 0000000..4a5d6a6
--- /dev/null
+++ b/common/rangecoder.h
@@ -0,0 +1,238 @@
+/*-----------------------------------------------------------*/
+/* Block Sorting, Lossless Data Compression Library. */
+/* Range coder */
+/*-----------------------------------------------------------*/
+
+/*--
+
+This file is a part of bsc and/or libbsc, a program and a library for
+lossless, block-sorting data compression.
+
+ Copyright (c) 2009-2021 Ilya Grebnov
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+
+Please see the file LICENSE for full copyright information.
+
+See also the bsc and libbsc web site:
+ http://libbsc.com/ for more information.
+
+--*/
+
+/*--
+
+NOTICE: This file has been modified for use in the bsc-m03 project.
+
+--*/
+
+#ifndef _LIBBSC_CODER_RANGECODER_H
+#define _LIBBSC_CODER_RANGECODER_H
+
+#include "platform.h"
+
+class RangeCoder // carry-counting range coder, 32-bit renormalization (libbsc-derived)
+{
+
+private:
+
+ union ari // 64-bit 'low' accumulator; u.carry receives overflow out of the low 32 bits
+ {
+ struct u
+ {
+ unsigned int low32;
+ unsigned int carry;
+ } u;
+ unsigned long long low;
+ } ari;
+
+ unsigned int ari_code; // decoder: current code value read from the stream
+ unsigned int ari_ffnum; // encoder: count of pending 0xff bytes whose carry is undecided
+ unsigned int ari_cache; // encoder: last byte held back until the carry is known
+ unsigned int ari_range;
+
+ const unsigned char * RESTRICT ari_input;
+ unsigned char * RESTRICT ari_output;
+ unsigned char * RESTRICT ari_outputEOB;
+ unsigned char * RESTRICT ari_outputStart;
+
+ INLINE void OutputByte(unsigned char s)
+ {
+ *ari_output++ = s;
+ };
+
+ INLINE unsigned char InputByte()
+ {
+ return *ari_input++;
+ };
+
+ NOINLINE unsigned int ShiftLow() // flushes the top byte of 'low' (propagating carry through the pending 0xff run); returns range << 8
+ {
+ if (ari.u.low32 < 0xff000000U || ari.u.carry)
+ {
+ OutputByte(ari_cache + ari.u.carry);
+ if (ari_ffnum)
+ {
+ unsigned char s = ari.u.carry - 1; // 0x00 if carry propagated, 0xff otherwise
+ do { OutputByte(s); } while (--ari_ffnum);
+ }
+ ari_cache = ari.u.low32 >> 24; ari.u.carry = 0;
+ }
+ else
+ {
+ ari_ffnum++; // byte is 0xff: carry still undecided, defer emission
+ }
+
+ ari.u.low32 <<= 8; return ari_range << 8;
+ }
+
+public:
+
+ INLINE void InitEncoder(unsigned char * output, int outputSize)
+ {
+ ari_outputStart = output;
+ ari_output = output;
+ ari_outputEOB = output + outputSize - 16; // end-of-buffer watermark with a 16-byte safety margin
+ ari.low = 0;
+ ari_ffnum = 0;
+ ari_cache = 0;
+ ari_range = 0xffffffff;
+ };
+
+ INLINE int FinishEncoder() // flushes the remaining bytes of 'low'; returns the compressed size in bytes
+ {
+ ShiftLow(); ShiftLow(); ShiftLow(); ShiftLow(); ShiftLow();
+ return (int)(ari_output - ari_outputStart);
+ }
+
+ INLINE void Encode(unsigned int cum_freq, unsigned int sym_freq, unsigned int total_freq) // symbol occupies [cum_freq, cum_freq + sym_freq) of total_freq
+ {
+ unsigned int range = ari_range / total_freq;
+ ari.low += (unsigned long long)cum_freq * range; ari_range = sym_freq * range;
+
+ while (ari_range < 0x1000000) { ari_range = ShiftLow(); }
+ }
+
+ template <int P> INLINE unsigned int EncodeBit(unsigned int bit, int probability) // FIX: restore missing '<int P>' (body uses P; callers pass EncodeBit<1>); probability of a 0-bit, scaled by 2^P
+ {
+ unsigned int range = (((unsigned long long)ari_range) * probability) >> P;
+ ari.low = ari.low + ((~bit + 1u) & range); // (~bit + 1u): all-ones mask when bit==1, zero when bit==0
+ ari_range = range + ((~bit + 1u) & (ari_range - range - range));
+
+ while (ari_range < 0x1000000) { ari_range = ShiftLow(); }
+
+ return bit;
+ }
+
+ INLINE unsigned int EncodeValue(unsigned int min, unsigned int value, unsigned int max) // binary-split down to a 16-bit span, then range-code the remainder
+ {
+ assert(min <= value && value <= max);
+
+ while (max - min >= 0x10000)
+ {
+ unsigned int median = min + ((max - min) >> 1);
+ if (value > median)
+ {
+ EncodeBit<1>(1, 1); // even split: probability 1/2
+ min = median + 1;
+ }
+ else
+ {
+ EncodeBit<1>(0, 1);
+ max = median;
+ }
+ }
+
+ if (min != max)
+ {
+ Encode(value - min, 1, max - min + 1);
+ }
+
+ return value;
+ }
+
+ INLINE void InitDecoder(const unsigned char * input) // primes ari_code with 5 bytes, matching FinishEncoder's 5 ShiftLow flushes
+ {
+ ari_input = input;
+ ari_code = 0;
+ ari_range = 0xffffffff;
+ ari_code = (ari_code << 8) | InputByte();
+ ari_code = (ari_code << 8) | InputByte();
+ ari_code = (ari_code << 8) | InputByte();
+ ari_code = (ari_code << 8) | InputByte();
+ ari_code = (ari_code << 8) | InputByte();
+ };
+
+ INLINE unsigned int GetCumFreq(unsigned int total_freq) // renormalizes, then returns the scaled cumulative frequency; must be followed by Decode()
+ {
+ while (ari_range < 0x1000000)
+ {
+ ari_range <<= 8; ari_code = (ari_code << 8) | InputByte();
+ }
+
+ return ari_code / (ari_range / total_freq);
+ }
+
+ INLINE void Decode(unsigned int cum_freq, unsigned int sym_freq, unsigned int total_freq) // mirrors Encode(); renormalization is done in GetCumFreq()
+ {
+ unsigned int range = ari_range / total_freq;
+ ari_code -= cum_freq * range; ari_range = sym_freq * range;
+ }
+
+ template <int P> INLINE int DecodeBit(int probability) // FIX: restore missing '<int P>' (body uses P; callers pass DecodeBit<1>); mirrors EncodeBit<P>
+ {
+ while (ari_range < 0x1000000)
+ {
+ ari_range <<= 8; ari_code = (ari_code << 8) | InputByte();
+ }
+
+ unsigned int range = (((unsigned long long)ari_range) * probability) >> P;
+ int bit = ari_code >= range;
+
+ ari_range = bit ? ari_range - range : range;
+ ari_code = bit ? ari_code - range : ari_code;
+
+ return bit;
+ }
+
+ INLINE unsigned int DecodeValue(unsigned int min, unsigned int max) // inverse of EncodeValue()
+ {
+ assert(min <= max);
+
+ while (max - min >= 0x10000)
+ {
+ unsigned int median = min + ((max - min) >> 1);
+ if (DecodeBit<1>(1))
+ {
+ min = median + 1;
+ }
+ else
+ {
+ max = median;
+ }
+ }
+
+ if (min != max)
+ {
+ unsigned int cum_freq = GetCumFreq(max - min + 1);
+ Decode(cum_freq, 1, max - min + 1); min += cum_freq;
+ }
+
+ return min;
+ }
+};
+
+#endif
+
+/*-----------------------------------------------------------*/
+/* End rangecoder.h */
+/*-----------------------------------------------------------*/
diff --git a/hutucker/LICENSE b/hutucker/LICENSE
new file mode 100644
index 0000000..f288702
--- /dev/null
+++ b/hutucker/LICENSE
@@ -0,0 +1,674 @@
+ GNU GENERAL PUBLIC LICENSE
+ Version 3, 29 June 2007
+
+ Copyright (C) 2007 Free Software Foundation, Inc.
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+ Preamble
+
+ The GNU General Public License is a free, copyleft license for
+software and other kinds of works.
+
+ The licenses for most software and other practical works are designed
+to take away your freedom to share and change the works. By contrast,
+the GNU General Public License is intended to guarantee your freedom to
+share and change all versions of a program--to make sure it remains free
+software for all its users. We, the Free Software Foundation, use the
+GNU General Public License for most of our software; it applies also to
+any other work released this way by its authors. You can apply it to
+your programs, too.
+
+ When we speak of free software, we are referring to freedom, not
+price. Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+them if you wish), that you receive source code or can get it if you
+want it, that you can change the software or use pieces of it in new
+free programs, and that you know you can do these things.
+
+ To protect your rights, we need to prevent others from denying you
+these rights or asking you to surrender the rights. Therefore, you have
+certain responsibilities if you distribute copies of the software, or if
+you modify it: responsibilities to respect the freedom of others.
+
+ For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must pass on to the recipients the same
+freedoms that you received. You must make sure that they, too, receive
+or can get the source code. And you must show them these terms so they
+know their rights.
+
+ Developers that use the GNU GPL protect your rights with two steps:
+(1) assert copyright on the software, and (2) offer you this License
+giving you legal permission to copy, distribute and/or modify it.
+
+ For the developers' and authors' protection, the GPL clearly explains
+that there is no warranty for this free software. For both users' and
+authors' sake, the GPL requires that modified versions be marked as
+changed, so that their problems will not be attributed erroneously to
+authors of previous versions.
+
+ Some devices are designed to deny users access to install or run
+modified versions of the software inside them, although the manufacturer
+can do so. This is fundamentally incompatible with the aim of
+protecting users' freedom to change the software. The systematic
+pattern of such abuse occurs in the area of products for individuals to
+use, which is precisely where it is most unacceptable. Therefore, we
+have designed this version of the GPL to prohibit the practice for those
+products. If such problems arise substantially in other domains, we
+stand ready to extend this provision to those domains in future versions
+of the GPL, as needed to protect the freedom of users.
+
+ Finally, every program is threatened constantly by software patents.
+States should not allow patents to restrict development and use of
+software on general-purpose computers, but in those that do, we wish to
+avoid the special danger that patents applied to a free program could
+make it effectively proprietary. To prevent this, the GPL assures that
+patents cannot be used to render the program non-free.
+
+ The precise terms and conditions for copying, distribution and
+modification follow.
+
+ TERMS AND CONDITIONS
+
+ 0. Definitions.
+
+ "This License" refers to version 3 of the GNU General Public License.
+
+ "Copyright" also means copyright-like laws that apply to other kinds of
+works, such as semiconductor masks.
+
+ "The Program" refers to any copyrightable work licensed under this
+License. Each licensee is addressed as "you". "Licensees" and
+"recipients" may be individuals or organizations.
+
+ To "modify" a work means to copy from or adapt all or part of the work
+in a fashion requiring copyright permission, other than the making of an
+exact copy. The resulting work is called a "modified version" of the
+earlier work or a work "based on" the earlier work.
+
+ A "covered work" means either the unmodified Program or a work based
+on the Program.
+
+ To "propagate" a work means to do anything with it that, without
+permission, would make you directly or secondarily liable for
+infringement under applicable copyright law, except executing it on a
+computer or modifying a private copy. Propagation includes copying,
+distribution (with or without modification), making available to the
+public, and in some countries other activities as well.
+
+ To "convey" a work means any kind of propagation that enables other
+parties to make or receive copies. Mere interaction with a user through
+a computer network, with no transfer of a copy, is not conveying.
+
+ An interactive user interface displays "Appropriate Legal Notices"
+to the extent that it includes a convenient and prominently visible
+feature that (1) displays an appropriate copyright notice, and (2)
+tells the user that there is no warranty for the work (except to the
+extent that warranties are provided), that licensees may convey the
+work under this License, and how to view a copy of this License. If
+the interface presents a list of user commands or options, such as a
+menu, a prominent item in the list meets this criterion.
+
+ 1. Source Code.
+
+ The "source code" for a work means the preferred form of the work
+for making modifications to it. "Object code" means any non-source
+form of a work.
+
+ A "Standard Interface" means an interface that either is an official
+standard defined by a recognized standards body, or, in the case of
+interfaces specified for a particular programming language, one that
+is widely used among developers working in that language.
+
+ The "System Libraries" of an executable work include anything, other
+than the work as a whole, that (a) is included in the normal form of
+packaging a Major Component, but which is not part of that Major
+Component, and (b) serves only to enable use of the work with that
+Major Component, or to implement a Standard Interface for which an
+implementation is available to the public in source code form. A
+"Major Component", in this context, means a major essential component
+(kernel, window system, and so on) of the specific operating system
+(if any) on which the executable work runs, or a compiler used to
+produce the work, or an object code interpreter used to run it.
+
+ The "Corresponding Source" for a work in object code form means all
+the source code needed to generate, install, and (for an executable
+work) run the object code and to modify the work, including scripts to
+control those activities. However, it does not include the work's
+System Libraries, or general-purpose tools or generally available free
+programs which are used unmodified in performing those activities but
+which are not part of the work. For example, Corresponding Source
+includes interface definition files associated with source files for
+the work, and the source code for shared libraries and dynamically
+linked subprograms that the work is specifically designed to require,
+such as by intimate data communication or control flow between those
+subprograms and other parts of the work.
+
+ The Corresponding Source need not include anything that users
+can regenerate automatically from other parts of the Corresponding
+Source.
+
+ The Corresponding Source for a work in source code form is that
+same work.
+
+ 2. Basic Permissions.
+
+ All rights granted under this License are granted for the term of
+copyright on the Program, and are irrevocable provided the stated
+conditions are met. This License explicitly affirms your unlimited
+permission to run the unmodified Program. The output from running a
+covered work is covered by this License only if the output, given its
+content, constitutes a covered work. This License acknowledges your
+rights of fair use or other equivalent, as provided by copyright law.
+
+ You may make, run and propagate covered works that you do not
+convey, without conditions so long as your license otherwise remains
+in force. You may convey covered works to others for the sole purpose
+of having them make modifications exclusively for you, or provide you
+with facilities for running those works, provided that you comply with
+the terms of this License in conveying all material for which you do
+not control copyright. Those thus making or running the covered works
+for you must do so exclusively on your behalf, under your direction
+and control, on terms that prohibit them from making any copies of
+your copyrighted material outside their relationship with you.
+
+ Conveying under any other circumstances is permitted solely under
+the conditions stated below. Sublicensing is not allowed; section 10
+makes it unnecessary.
+
+ 3. Protecting Users' Legal Rights From Anti-Circumvention Law.
+
+ No covered work shall be deemed part of an effective technological
+measure under any applicable law fulfilling obligations under article
+11 of the WIPO copyright treaty adopted on 20 December 1996, or
+similar laws prohibiting or restricting circumvention of such
+measures.
+
+ When you convey a covered work, you waive any legal power to forbid
+circumvention of technological measures to the extent such circumvention
+is effected by exercising rights under this License with respect to
+the covered work, and you disclaim any intention to limit operation or
+modification of the work as a means of enforcing, against the work's
+users, your or third parties' legal rights to forbid circumvention of
+technological measures.
+
+ 4. Conveying Verbatim Copies.
+
+ You may convey verbatim copies of the Program's source code as you
+receive it, in any medium, provided that you conspicuously and
+appropriately publish on each copy an appropriate copyright notice;
+keep intact all notices stating that this License and any
+non-permissive terms added in accord with section 7 apply to the code;
+keep intact all notices of the absence of any warranty; and give all
+recipients a copy of this License along with the Program.
+
+ You may charge any price or no price for each copy that you convey,
+and you may offer support or warranty protection for a fee.
+
+ 5. Conveying Modified Source Versions.
+
+ You may convey a work based on the Program, or the modifications to
+produce it from the Program, in the form of source code under the
+terms of section 4, provided that you also meet all of these conditions:
+
+ a) The work must carry prominent notices stating that you modified
+ it, and giving a relevant date.
+
+ b) The work must carry prominent notices stating that it is
+ released under this License and any conditions added under section
+ 7. This requirement modifies the requirement in section 4 to
+ "keep intact all notices".
+
+ c) You must license the entire work, as a whole, under this
+ License to anyone who comes into possession of a copy. This
+ License will therefore apply, along with any applicable section 7
+ additional terms, to the whole of the work, and all its parts,
+ regardless of how they are packaged. This License gives no
+ permission to license the work in any other way, but it does not
+ invalidate such permission if you have separately received it.
+
+ d) If the work has interactive user interfaces, each must display
+ Appropriate Legal Notices; however, if the Program has interactive
+ interfaces that do not display Appropriate Legal Notices, your
+ work need not make them do so.
+
+ A compilation of a covered work with other separate and independent
+works, which are not by their nature extensions of the covered work,
+and which are not combined with it such as to form a larger program,
+in or on a volume of a storage or distribution medium, is called an
+"aggregate" if the compilation and its resulting copyright are not
+used to limit the access or legal rights of the compilation's users
+beyond what the individual works permit. Inclusion of a covered work
+in an aggregate does not cause this License to apply to the other
+parts of the aggregate.
+
+ 6. Conveying Non-Source Forms.
+
+ You may convey a covered work in object code form under the terms
+of sections 4 and 5, provided that you also convey the
+machine-readable Corresponding Source under the terms of this License,
+in one of these ways:
+
+ a) Convey the object code in, or embodied in, a physical product
+ (including a physical distribution medium), accompanied by the
+ Corresponding Source fixed on a durable physical medium
+ customarily used for software interchange.
+
+ b) Convey the object code in, or embodied in, a physical product
+ (including a physical distribution medium), accompanied by a
+ written offer, valid for at least three years and valid for as
+ long as you offer spare parts or customer support for that product
+ model, to give anyone who possesses the object code either (1) a
+ copy of the Corresponding Source for all the software in the
+ product that is covered by this License, on a durable physical
+ medium customarily used for software interchange, for a price no
+ more than your reasonable cost of physically performing this
+ conveying of source, or (2) access to copy the
+ Corresponding Source from a network server at no charge.
+
+ c) Convey individual copies of the object code with a copy of the
+ written offer to provide the Corresponding Source. This
+ alternative is allowed only occasionally and noncommercially, and
+ only if you received the object code with such an offer, in accord
+ with subsection 6b.
+
+ d) Convey the object code by offering access from a designated
+ place (gratis or for a charge), and offer equivalent access to the
+ Corresponding Source in the same way through the same place at no
+ further charge. You need not require recipients to copy the
+ Corresponding Source along with the object code. If the place to
+ copy the object code is a network server, the Corresponding Source
+ may be on a different server (operated by you or a third party)
+ that supports equivalent copying facilities, provided you maintain
+ clear directions next to the object code saying where to find the
+ Corresponding Source. Regardless of what server hosts the
+ Corresponding Source, you remain obligated to ensure that it is
+ available for as long as needed to satisfy these requirements.
+
+ e) Convey the object code using peer-to-peer transmission, provided
+ you inform other peers where the object code and Corresponding
+ Source of the work are being offered to the general public at no
+ charge under subsection 6d.
+
+ A separable portion of the object code, whose source code is excluded
+from the Corresponding Source as a System Library, need not be
+included in conveying the object code work.
+
+ A "User Product" is either (1) a "consumer product", which means any
+tangible personal property which is normally used for personal, family,
+or household purposes, or (2) anything designed or sold for incorporation
+into a dwelling. In determining whether a product is a consumer product,
+doubtful cases shall be resolved in favor of coverage. For a particular
+product received by a particular user, "normally used" refers to a
+typical or common use of that class of product, regardless of the status
+of the particular user or of the way in which the particular user
+actually uses, or expects or is expected to use, the product. A product
+is a consumer product regardless of whether the product has substantial
+commercial, industrial or non-consumer uses, unless such uses represent
+the only significant mode of use of the product.
+
+ "Installation Information" for a User Product means any methods,
+procedures, authorization keys, or other information required to install
+and execute modified versions of a covered work in that User Product from
+a modified version of its Corresponding Source. The information must
+suffice to ensure that the continued functioning of the modified object
+code is in no case prevented or interfered with solely because
+modification has been made.
+
+ If you convey an object code work under this section in, or with, or
+specifically for use in, a User Product, and the conveying occurs as
+part of a transaction in which the right of possession and use of the
+User Product is transferred to the recipient in perpetuity or for a
+fixed term (regardless of how the transaction is characterized), the
+Corresponding Source conveyed under this section must be accompanied
+by the Installation Information. But this requirement does not apply
+if neither you nor any third party retains the ability to install
+modified object code on the User Product (for example, the work has
+been installed in ROM).
+
+ The requirement to provide Installation Information does not include a
+requirement to continue to provide support service, warranty, or updates
+for a work that has been modified or installed by the recipient, or for
+the User Product in which it has been modified or installed. Access to a
+network may be denied when the modification itself materially and
+adversely affects the operation of the network or violates the rules and
+protocols for communication across the network.
+
+ Corresponding Source conveyed, and Installation Information provided,
+in accord with this section must be in a format that is publicly
+documented (and with an implementation available to the public in
+source code form), and must require no special password or key for
+unpacking, reading or copying.
+
+ 7. Additional Terms.
+
+ "Additional permissions" are terms that supplement the terms of this
+License by making exceptions from one or more of its conditions.
+Additional permissions that are applicable to the entire Program shall
+be treated as though they were included in this License, to the extent
+that they are valid under applicable law. If additional permissions
+apply only to part of the Program, that part may be used separately
+under those permissions, but the entire Program remains governed by
+this License without regard to the additional permissions.
+
+ When you convey a copy of a covered work, you may at your option
+remove any additional permissions from that copy, or from any part of
+it. (Additional permissions may be written to require their own
+removal in certain cases when you modify the work.) You may place
+additional permissions on material, added by you to a covered work,
+for which you have or can give appropriate copyright permission.
+
+ Notwithstanding any other provision of this License, for material you
+add to a covered work, you may (if authorized by the copyright holders of
+that material) supplement the terms of this License with terms:
+
+ a) Disclaiming warranty or limiting liability differently from the
+ terms of sections 15 and 16 of this License; or
+
+ b) Requiring preservation of specified reasonable legal notices or
+ author attributions in that material or in the Appropriate Legal
+ Notices displayed by works containing it; or
+
+ c) Prohibiting misrepresentation of the origin of that material, or
+ requiring that modified versions of such material be marked in
+ reasonable ways as different from the original version; or
+
+ d) Limiting the use for publicity purposes of names of licensors or
+ authors of the material; or
+
+ e) Declining to grant rights under trademark law for use of some
+ trade names, trademarks, or service marks; or
+
+ f) Requiring indemnification of licensors and authors of that
+ material by anyone who conveys the material (or modified versions of
+ it) with contractual assumptions of liability to the recipient, for
+ any liability that these contractual assumptions directly impose on
+ those licensors and authors.
+
+ All other non-permissive additional terms are considered "further
+restrictions" within the meaning of section 10. If the Program as you
+received it, or any part of it, contains a notice stating that it is
+governed by this License along with a term that is a further
+restriction, you may remove that term. If a license document contains
+a further restriction but permits relicensing or conveying under this
+License, you may add to a covered work material governed by the terms
+of that license document, provided that the further restriction does
+not survive such relicensing or conveying.
+
+ If you add terms to a covered work in accord with this section, you
+must place, in the relevant source files, a statement of the
+additional terms that apply to those files, or a notice indicating
+where to find the applicable terms.
+
+ Additional terms, permissive or non-permissive, may be stated in the
+form of a separately written license, or stated as exceptions;
+the above requirements apply either way.
+
+ 8. Termination.
+
+ You may not propagate or modify a covered work except as expressly
+provided under this License. Any attempt otherwise to propagate or
+modify it is void, and will automatically terminate your rights under
+this License (including any patent licenses granted under the third
+paragraph of section 11).
+
+ However, if you cease all violation of this License, then your
+license from a particular copyright holder is reinstated (a)
+provisionally, unless and until the copyright holder explicitly and
+finally terminates your license, and (b) permanently, if the copyright
+holder fails to notify you of the violation by some reasonable means
+prior to 60 days after the cessation.
+
+ Moreover, your license from a particular copyright holder is
+reinstated permanently if the copyright holder notifies you of the
+violation by some reasonable means, this is the first time you have
+received notice of violation of this License (for any work) from that
+copyright holder, and you cure the violation prior to 30 days after
+your receipt of the notice.
+
+ Termination of your rights under this section does not terminate the
+licenses of parties who have received copies or rights from you under
+this License. If your rights have been terminated and not permanently
+reinstated, you do not qualify to receive new licenses for the same
+material under section 10.
+
+ 9. Acceptance Not Required for Having Copies.
+
+ You are not required to accept this License in order to receive or
+run a copy of the Program. Ancillary propagation of a covered work
+occurring solely as a consequence of using peer-to-peer transmission
+to receive a copy likewise does not require acceptance. However,
+nothing other than this License grants you permission to propagate or
+modify any covered work. These actions infringe copyright if you do
+not accept this License. Therefore, by modifying or propagating a
+covered work, you indicate your acceptance of this License to do so.
+
+ 10. Automatic Licensing of Downstream Recipients.
+
+ Each time you convey a covered work, the recipient automatically
+receives a license from the original licensors, to run, modify and
+propagate that work, subject to this License. You are not responsible
+for enforcing compliance by third parties with this License.
+
+ An "entity transaction" is a transaction transferring control of an
+organization, or substantially all assets of one, or subdividing an
+organization, or merging organizations. If propagation of a covered
+work results from an entity transaction, each party to that
+transaction who receives a copy of the work also receives whatever
+licenses to the work the party's predecessor in interest had or could
+give under the previous paragraph, plus a right to possession of the
+Corresponding Source of the work from the predecessor in interest, if
+the predecessor has it or can get it with reasonable efforts.
+
+ You may not impose any further restrictions on the exercise of the
+rights granted or affirmed under this License. For example, you may
+not impose a license fee, royalty, or other charge for exercise of
+rights granted under this License, and you may not initiate litigation
+(including a cross-claim or counterclaim in a lawsuit) alleging that
+any patent claim is infringed by making, using, selling, offering for
+sale, or importing the Program or any portion of it.
+
+ 11. Patents.
+
+ A "contributor" is a copyright holder who authorizes use under this
+License of the Program or a work on which the Program is based. The
+work thus licensed is called the contributor's "contributor version".
+
+ A contributor's "essential patent claims" are all patent claims
+owned or controlled by the contributor, whether already acquired or
+hereafter acquired, that would be infringed by some manner, permitted
+by this License, of making, using, or selling its contributor version,
+but do not include claims that would be infringed only as a
+consequence of further modification of the contributor version. For
+purposes of this definition, "control" includes the right to grant
+patent sublicenses in a manner consistent with the requirements of
+this License.
+
+ Each contributor grants you a non-exclusive, worldwide, royalty-free
+patent license under the contributor's essential patent claims, to
+make, use, sell, offer for sale, import and otherwise run, modify and
+propagate the contents of its contributor version.
+
+ In the following three paragraphs, a "patent license" is any express
+agreement or commitment, however denominated, not to enforce a patent
+(such as an express permission to practice a patent or covenant not to
+sue for patent infringement). To "grant" such a patent license to a
+party means to make such an agreement or commitment not to enforce a
+patent against the party.
+
+ If you convey a covered work, knowingly relying on a patent license,
+and the Corresponding Source of the work is not available for anyone
+to copy, free of charge and under the terms of this License, through a
+publicly available network server or other readily accessible means,
+then you must either (1) cause the Corresponding Source to be so
+available, or (2) arrange to deprive yourself of the benefit of the
+patent license for this particular work, or (3) arrange, in a manner
+consistent with the requirements of this License, to extend the patent
+license to downstream recipients. "Knowingly relying" means you have
+actual knowledge that, but for the patent license, your conveying the
+covered work in a country, or your recipient's use of the covered work
+in a country, would infringe one or more identifiable patents in that
+country that you have reason to believe are valid.
+
+ If, pursuant to or in connection with a single transaction or
+arrangement, you convey, or propagate by procuring conveyance of, a
+covered work, and grant a patent license to some of the parties
+receiving the covered work authorizing them to use, propagate, modify
+or convey a specific copy of the covered work, then the patent license
+you grant is automatically extended to all recipients of the covered
+work and works based on it.
+
+ A patent license is "discriminatory" if it does not include within
+the scope of its coverage, prohibits the exercise of, or is
+conditioned on the non-exercise of one or more of the rights that are
+specifically granted under this License. You may not convey a covered
+work if you are a party to an arrangement with a third party that is
+in the business of distributing software, under which you make payment
+to the third party based on the extent of your activity of conveying
+the work, and under which the third party grants, to any of the
+parties who would receive the covered work from you, a discriminatory
+patent license (a) in connection with copies of the covered work
+conveyed by you (or copies made from those copies), or (b) primarily
+for and in connection with specific products or compilations that
+contain the covered work, unless you entered into that arrangement,
+or that patent license was granted, prior to 28 March 2007.
+
+ Nothing in this License shall be construed as excluding or limiting
+any implied license or other defenses to infringement that may
+otherwise be available to you under applicable patent law.
+
+ 12. No Surrender of Others' Freedom.
+
+ If conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot convey a
+covered work so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you may
+not convey it at all. For example, if you agree to terms that obligate you
+to collect a royalty for further conveying from those to whom you convey
+the Program, the only way you could satisfy both those terms and this
+License would be to refrain entirely from conveying the Program.
+
+ 13. Use with the GNU Affero General Public License.
+
+ Notwithstanding any other provision of this License, you have
+permission to link or combine any covered work with a work licensed
+under version 3 of the GNU Affero General Public License into a single
+combined work, and to convey the resulting work. The terms of this
+License will continue to apply to the part which is the covered work,
+but the special requirements of the GNU Affero General Public License,
+section 13, concerning interaction through a network will apply to the
+combination as such.
+
+ 14. Revised Versions of this License.
+
+ The Free Software Foundation may publish revised and/or new versions of
+the GNU General Public License from time to time. Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+ Each version is given a distinguishing version number. If the
+Program specifies that a certain numbered version of the GNU General
+Public License "or any later version" applies to it, you have the
+option of following the terms and conditions either of that numbered
+version or of any later version published by the Free Software
+Foundation. If the Program does not specify a version number of the
+GNU General Public License, you may choose any version ever published
+by the Free Software Foundation.
+
+ If the Program specifies that a proxy can decide which future
+versions of the GNU General Public License can be used, that proxy's
+public statement of acceptance of a version permanently authorizes you
+to choose that version for the Program.
+
+ Later license versions may give you additional or different
+permissions. However, no additional obligations are imposed on any
+author or copyright holder as a result of your choosing to follow a
+later version.
+
+ 15. Disclaimer of Warranty.
+
+ THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
+APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
+HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
+OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
+IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
+ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+ 16. Limitation of Liability.
+
+ IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
+THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
+GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
+USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
+DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
+PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
+EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGES.
+
+ 17. Interpretation of Sections 15 and 16.
+
+ If the disclaimer of warranty and limitation of liability provided
+above cannot be given local legal effect according to their terms,
+reviewing courts shall apply local law that most closely approximates
+an absolute waiver of all civil liability in connection with the
+Program, unless a warranty or assumption of liability accompanies a
+copy of the Program in return for a fee.
+
+ END OF TERMS AND CONDITIONS
+
+ How to Apply These Terms to Your New Programs
+
+ If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+ To do so, attach the following notices to the program. It is safest
+to attach them to the start of each source file to most effectively
+state the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+    <one line to give the program's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+Also add information on how to contact you by electronic and paper mail.
+
+ If the program does terminal interaction, make it output a short
+notice like this when it starts in an interactive mode:
+
+    <program>  Copyright (C) <year>  <name of author>
+ This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+ This is free software, and you are welcome to redistribute it
+ under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License. Of course, your program's commands
+might be different; for a GUI interface, you would use an "about box".
+
+ You should also get your employer (if you work as a programmer) or school,
+if any, to sign a "copyright disclaimer" for the program, if necessary.
+For more information on this, and how to apply and follow the GNU GPL, see
+<https://www.gnu.org/licenses/>.
+
+ The GNU General Public License does not permit incorporating your program
+into proprietary programs. If your program is a subroutine library, you
+may consider it more useful to permit linking proprietary applications with
+the library. If this is what you want to do, use the GNU Lesser General
+Public License instead of this License. But first, please read
+<https://www.gnu.org/licenses/why-not-lgpl.html>.
diff --git a/hutucker/README b/hutucker/README
new file mode 100644
index 0000000..e104713
--- /dev/null
+++ b/hutucker/README
@@ -0,0 +1,88 @@
+This is an O(n log n) implementation of Hu-Tucker coding.[1]
+
+This is the algorithm:
+1. Label node 0, ..., n-1 'terminal'
+2. Repeat (n - 1) times:
+ (a) Find the pair (i, j) such that
+ (i) i < j,
+ (ii) neither node i nor j is labeled 'none',
+ (iii) none of node i+1, ..., j-1 is labeled 'terminal',
+ (iv) weight[i] + weight[j] are minimal,
+ (v) i is minimal if the selection is not unique after (iv), and
+ (vi) j is minimal if the selection is not unique after (v)
+ (b) Merge node i with node j, and saves it as new node i
+ (c) weight[i] += weight[j]
+ (d) Label node i 'internal'
+ (e) Label node j 'none'
+3. A tree has been built with root being node 0.
+ Traverse this tree for length of code.
+ This tree is not alphabetical.
+ Nevertheless, the length of code produced by the tree is correct.
+
+See example.c for computing the actual code from the length.
+
+We need a non-trivial data structure to implement 2(a) efficiently.
+This is the data structure:
+1. It is a perfect binary tree.
+ The nodes in this tree are called "segnodes" to distinguish them from
+ nodes in the coding tree.
+ This tree shall have at least n leaf segnodes.
+2. Each segnode is implicitly associated with a range [a, b).
+ The range of the leaf segnode i is [i, i+1).
+ The range of each internal segnode is union of the ranges of its children.
+ (Alternatively, the range of each internal nodes is the union of
+ the ranges of all leaf nodes in its subtree.)
+3. Each segnode also has 6 explicit fields (n, m, l, r, i, j).
+ n: The number of nodes [a, b) labeled 'terminal' or 'internal'
+ m: The number of nodes [a, b) labeled 'terminal'
+ l: The index such that:
+ (i) l in [a, b),
+ (ii) node l is not labeled 'none',
+ (iii) none of node l, ..., i-1 is labeled 'terminal',
+ (iv) weight[l] is minimal, and
+ (v) l is minimal if the selection is not unique after (iv)
+ r: The index such that:
+ (i) r in [a, b),
+ (ii) node r is not labeled 'none',
+ (iii) none of node r, ..., b-1 is labeled 'terminal',
+ (iv) weight[r] is minimal, and
+ (v) r is minimal if the selection is not unique after (iv)
+ i, j: The pair of indices such that:
+ (i) a <= i < j < b,
+ (ii) neither node i nor j is labeled 'none',
+ (iii) none of node i+1, ..., j-1 is labeled 'terminal',
+ (iv) weight[i] + weight[j] are minimal,
+ (v) i is minimal if the selection is not unique after (iv), and
+ (vi) j is minimal if the selection is not unique after (v)
+4. The explicit fields can be trivially computed for leaf segnodes:
+ (a) Leaf segnode i labeled 'terminal':
+ (n, m, l, r, i, j) = (1, 1, i, i, None, None)
+ (b) Leaf segnode i labeled 'internal':
+ (n, m, l, r, i, j) = (1, 0, i, i, None, None)
+ (c) Leaf segnode i labeled 'none':
+ (n, m, l, r, i, j) = (0, 0, None, None, None, None)
+5. The explicit fields can be efficiently computed for internal segnodes,
+ if we have access to correct labels of its children segnodes.
+ Let its left children be L, and its right children be R.
+ n: L.n + R.n
+ m: L.n + R.m
+ l: L.l if L.m > 0, otherwise the better of L.l and R.l
+ r: R.r if R.m > 0, otherwise the better of L.r and R.r
+ i, j: the best of (L.i, L.j), (L.r, R.l) and (R.i, R.j)
+
+Analysis:
+1. This data structure can be built in O(n).
+2. The (i, j) step 2(a) is the (i, j) of the root of the data structure,
+ which can be looked up in O(1).
+3. When the weight[i] and label of node i changed,
+ leaf segnode i and its ancestors need to be updated.
+ That's O(log n) updates and O(1) per update.
+ Same for node j.
+4. Step 2 is repeated O(n) times.
+ Other parts are trivial.
+ Therefore, the overall time is O(n log n).
+
+
+[1]: Hu, T. C.; Tucker, A. C. (1971) "Optimal Computer Search Trees
+ and Variable-Length Alphabetical Codes", SIAM Journal on
+ Applied Mathematics. 21 (4): 514.
diff --git a/hutucker/example.c b/hutucker/example.c
new file mode 100644
index 0000000..34dd53f
--- /dev/null
+++ b/hutucker/example.c
@@ -0,0 +1,84 @@
+/*
+ * Linearithmic Hu-Tucker Coding.
+ * Copyright (C) 2018 Pochang Chen
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include "hu-tucker.h"
+
+int main() {
+ size_t n;
+ if (scanf("%zu", &n) != 1)
+ return 1;
+ if (n < 1) {
+ errno = EINVAL;
+ perror(NULL);
+ return 1;
+ }
+
+ unsigned long *weight = calloc(n, sizeof(unsigned long));
+ if (!weight) {
+ perror("calloc");
+ return 1;
+ }
+
+ for (size_t i = 0; i < n; i++)
+ scanf("%lu", weight + i);
+
+ unsigned long sumweight = 0;
+ for (size_t i = 0; i < n; i++) {
+ sumweight += weight[i];
+ if (sumweight < weight[i]) {
+ errno = EOVERFLOW;
+ perror(NULL);
+ return 1;
+ }
+ }
+
+ unsigned long *tmp = malloc(hutucker_tmp_size(n));
+ if (!tmp) {
+ perror("malloc");
+ return 1;
+ }
+ hutucker_get_lengths(n, weight, tmp);
+ free(tmp);
+
+ unsigned long maxlength = 0;
+ for (size_t i = 0; i < n; i++)
+ if (weight[i] > maxlength)
+ maxlength = weight[i];
+
+ unsigned char *str = malloc(maxlength + 1);
+ if (!str) {
+ perror("malloc");
+ return 1;
+ }
+ for (size_t i = 0, l = 0; i < n; i++) {
+ if (l < weight[i])
+ memset(str + l, '0', weight[i] - l);
+ l = weight[i];
+ str[l] = '\0';
+ puts(str);
+ for (size_t j = l - 1; j != (size_t) -1; j--)
+ if ((str[j] ^= '0' ^ '1') == '1')
+ break;
+ }
+
+ free(str);
+ free(weight);
+}
diff --git a/hutucker/hu-tucker.c b/hutucker/hu-tucker.c
new file mode 100644
index 0000000..82fe712
--- /dev/null
+++ b/hutucker/hu-tucker.c
@@ -0,0 +1,128 @@
+/*
+ * Linearithmic Hu-Tucker Coding.
+ * Copyright (C) 2018 Pochang Chen
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ */
+
+/*--
+
+NOTICE: This file has been modified for use in the bsc-m03 project.
+
+--*/
+
+#include "hu-tucker.h"
+
+typedef struct {
+ // number of (terminal or internal) nodes under this segnode
+ size_t n;
+ // number of terminal node under this segnode, if n >= 1
+ size_t m;
+ // index of minimum weight in the leftmost block, if n >= 1
+ size_t l;
+ // index of minimum weight in the rightmost block, if n >= 1
+ size_t r;
+ // indices of minimum weight pair in the same block, if n >= 2
+ size_t i, j;
+} segnode;
+
+static void segupdate(segnode *pa, segnode *lc, segnode *rc, unsigned long *w) {
+ if (!lc->n) {
+ *pa = *rc;
+ return;
+ }
+ if (!rc->n) {
+ *pa = *lc;
+ return;
+ }
+ pa->n = lc->n + rc->n;
+ pa->m = lc->m + rc->m;
+ pa->l = ( lc->m || w[lc->l] <= w[rc->l]) ? lc->l : rc->l;
+ pa->r = (!rc->m && w[lc->r] <= w[rc->r]) ? lc->r : rc->r;
+ pa->i = lc->r;
+ pa->j = rc->l;
+ if (lc->n >= 2 && w[lc->i] + w[lc->j] <= w[pa->i] + w[pa->j]) {
+ pa->i = lc->i;
+ pa->j = lc->j;
+ }
+ if (rc->n >= 2 && w[rc->i] + w[rc->j] < w[pa->i] + w[pa->j]) {
+ pa->i = rc->i;
+ pa->j = rc->j;
+ }
+}
+static void segterminal(segnode *x, size_t id) {
+ x->n = x->m = 1;
+ x->l = x->r = id;
+}
+static void seginternal(segnode *x, size_t id) {
+ x->n = 1;
+ x->m = 0;
+ x->l = x->r = id;
+}
+static void segnone(segnode *x) {
+ x->n = 0;
+}
+static size_t raise_power_of_two(size_t n) {
+ size_t ans = 1;
+ while (ans < n)
+ ans *= 2;
+ return ans;
+}
+
+size_t hutucker_tmp_size(size_t n) {
+ // TODO check overflow for very large n
+ size_t m = raise_power_of_two(n);
+ return sizeof(segnode) * (2 * m - 1) +
+ sizeof(size_t) * (n + (2 * n - 1) + (2 * n - 1));
+}
+
+void hutucker_get_lengths(size_t n, unsigned long *weight, void *tmp) {
+ size_t m = raise_power_of_two(n);
+ segnode *seg = (segnode *) tmp;
+ size_t *cur = (size_t *) (seg + 2 * m - 1);
+ size_t *pa = (size_t *) (cur + n);
+ size_t *level = (size_t *) (pa + 2 * n - 1);
+
+ for (size_t i = 0; i < n; i++) {
+ segterminal(seg + m - 1 + i, i);
+ cur[i] = i;
+ }
+ for (size_t i = n; i < m; i++)
+ segnone(seg + m - 1 + i);
+
+ for (size_t i = m - 2; i != (size_t) -1; i--)
+ segupdate(seg + i, seg + 2 * i + 1, seg + 2 * i + 2, weight);
+
+ for (size_t k = 0; k < n - 1; k++) {
+ size_t i = seg->i, j = seg->j;
+
+ weight[i] += weight[j];
+ pa[cur[i]] = pa[cur[j]] = n + k;
+ cur[i] = n + k;
+
+ seginternal(seg + m - 1 + i, i);
+ for (size_t l = m + i; l /= 2; )
+ segupdate(seg + l - 1, seg + 2 * l - 1, seg + 2 * l, weight);
+
+ segnone(seg + m - 1 + j);
+ for (size_t l = m + j; l /= 2; )
+ segupdate(seg + l - 1, seg + 2 * l - 1, seg + 2 * l, weight);
+ }
+
+ level[2 * n - 2] = 0;
+ for (size_t i = 2 * n - 3; i != (size_t) -1; i--)
+ level[i] = level[pa[i]] + 1;
+ for (size_t i = 0; i < n; i++)
+ weight[i] = (unsigned long)level[i];
+}
diff --git a/hutucker/hu-tucker.h b/hutucker/hu-tucker.h
new file mode 100644
index 0000000..b637f4f
--- /dev/null
+++ b/hutucker/hu-tucker.h
@@ -0,0 +1,56 @@
+/*
+ * Linearithmic Hu-Tucker Coding.
+ * Copyright (C) 2018 Pochang Chen
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ */
+
+/*--
+
+NOTICE: This file has been modified for use in the bsc-m03 project.
+
+--*/
+
+#pragma once
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stddef.h>
+
+/**
+ * This algorithm needs some temporary memories to work.
+ * This function computes how much temporary memories are needed.
+ */
+size_t hutucker_tmp_size(size_t n);
+
+/**
+ * Given the weight of n symbols, determine the length of hu-tucker code
+ * of each symbols.
+ *
+ * Precondition:
+ * n: number of symbols
+ * weight[i] (0 <= i < n): the weight of symbol i
+ * weight[0] + ... + weight[n - 1] must not exceed ULONG_MAX
+ * tmp: buffer with size >= hutucker_tmp_size(n)
+ *
+ * Postcondition:
+ * weight[i] (0 <= i < n): the length of hu-tucker code of symbol i
+ */
+void hutucker_get_lengths(size_t n, unsigned long *weight, void *tmp);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/libsais/CHANGES b/libsais/CHANGES
new file mode 100644
index 0000000..6d0b176
--- /dev/null
+++ b/libsais/CHANGES
@@ -0,0 +1,23 @@
+Changes in 2.6.0 (October 21, 2021)
+- libsais16 for 16-bit inputs.
+
+Changes in 2.5.0 (October 15, 2021)
+- Support for optional symbol frequency tables.
+
+Changes in 2.4.0 (July 14, 2021)
+- Reverse Burrows-Wheeler transform.
+
+Changes in 2.3.0 (June 23, 2021)
+- Burrows-Wheeler transform with auxiliary indexes.
+
+Changes in 2.2.0 (April 27, 2021)
+- libsais64 for inputs larger than 2GB.
+
+Changes in 2.1.0 (April 19, 2021)
+- Additional OpenMP acceleration.
+
+Changes in 2.0.0 (April 4, 2021)
+- OpenMP acceleration.
+
+Changes in 1.0.0 (February 23, 2021)
+- Initial Release.
diff --git a/libsais/LICENSE b/libsais/LICENSE
new file mode 100644
index 0000000..d645695
--- /dev/null
+++ b/libsais/LICENSE
@@ -0,0 +1,202 @@
+
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright [yyyy] [name of copyright owner]
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
diff --git a/libsais/VERSION b/libsais/VERSION
new file mode 100644
index 0000000..914ec96
--- /dev/null
+++ b/libsais/VERSION
@@ -0,0 +1 @@
+2.6.0
\ No newline at end of file
diff --git a/libsais/libsais.c b/libsais/libsais.c
new file mode 100644
index 0000000..885bd82
--- /dev/null
+++ b/libsais/libsais.c
@@ -0,0 +1,7599 @@
+/*--
+
+This file is a part of libsais, a library for linear time
+suffix array and burrows wheeler transform construction.
+
+ Copyright (c) 2021 Ilya Grebnov
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+
+Please see the file LICENSE for full copyright information.
+
+--*/
+
+#include "libsais_internal.h"
+
+#include "libsais.h"
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+
+#if defined(_OPENMP)
+ #include <omp.h>
+#else
+ #define UNUSED(_x) (void)(_x)
+#endif
+
+typedef int32_t sa_sint_t;
+typedef uint32_t sa_uint_t;
+typedef ptrdiff_t fast_sint_t;
+typedef size_t fast_uint_t;
+
+#define SAINT_BIT (32)
+#define SAINT_MAX INT32_MAX
+#define SAINT_MIN INT32_MIN
+
+#define ALPHABET_SIZE (1 << CHAR_BIT)
+#define UNBWT_FASTBITS (17)
+
+#define SUFFIX_GROUP_BIT (SAINT_BIT - 1)
+#define SUFFIX_GROUP_MARKER (((sa_sint_t)1) << (SUFFIX_GROUP_BIT - 1))
+
+#define BUCKETS_INDEX2(_c, _s) (((_c) << 1) + (_s))
+#define BUCKETS_INDEX4(_c, _s) (((_c) << 2) + (_s))
+
+#define LIBSAIS_PER_THREAD_CACHE_SIZE (24576)
+
+typedef struct LIBSAIS_THREAD_CACHE
+{
+ sa_sint_t symbol;
+ sa_sint_t index;
+} LIBSAIS_THREAD_CACHE;
+
+typedef union LIBSAIS_THREAD_STATE
+{
+ struct
+ {
+ fast_sint_t position;
+ fast_sint_t count;
+
+ fast_sint_t m;
+ fast_sint_t last_lms_suffix;
+
+ sa_sint_t * buckets;
+ LIBSAIS_THREAD_CACHE * cache;
+ } state;
+
+ uint8_t padding[64];
+} LIBSAIS_THREAD_STATE;
+
+typedef struct LIBSAIS_CONTEXT
+{
+ sa_sint_t * buckets;
+ LIBSAIS_THREAD_STATE * thread_state;
+ fast_sint_t threads;
+} LIBSAIS_CONTEXT;
+
+typedef struct LIBSAIS_UNBWT_CONTEXT
+{
+ sa_uint_t * bucket2;
+ uint16_t * fastbits;
+ sa_uint_t * buckets;
+ fast_sint_t threads;
+} LIBSAIS_UNBWT_CONTEXT;
+
+#if defined(__GNUC__) || defined(__clang__)
+ #define RESTRICT __restrict__
+#elif defined(_MSC_VER) || defined(__INTEL_COMPILER)
+ #define RESTRICT __restrict
+#else
+ #error Your compiler, configuration or platform is not supported.
+#endif
+
+#if defined(__has_builtin)
+ #if __has_builtin(__builtin_prefetch)
+ #define HAS_BUILTIN_PREFECTCH
+ #endif
+#elif defined(__GNUC__) && __GNUC__ > 3
+ #define HAS_BUILTIN_PREFECTCH
+#endif
+
+#if defined(HAS_BUILTIN_PREFECTCH)
+ #define libsais_prefetch(address) __builtin_prefetch((const void *)(address), 0, 0)
+ #define libsais_prefetchw(address) __builtin_prefetch((const void *)(address), 1, 0)
+#elif defined (_M_IX86) || defined (_M_AMD64)
+ #include <intrin.h>
+ #define libsais_prefetch(address) _mm_prefetch((const void *)(address), _MM_HINT_NTA)
+ #define libsais_prefetchw(address) _m_prefetchw((const void *)(address))
+#elif defined (_M_ARM)
+ #include <intrin.h>
+ #define libsais_prefetch(address) __prefetch((const void *)(address))
+ #define libsais_prefetchw(address) __prefetchw((const void *)(address))
+#elif defined (_M_ARM64)
+ #include <intrin.h>
+ #define libsais_prefetch(address) __prefetch2((const void *)(address), 1)
+ #define libsais_prefetchw(address) __prefetch2((const void *)(address), 17)
+#else
+ #error Your compiler, configuration or platform is not supported.
+#endif
+
+#if !defined(__LITTLE_ENDIAN__) && !defined(__BIG_ENDIAN__)
+ #if defined(_LITTLE_ENDIAN) \
+ || (defined(BYTE_ORDER) && defined(LITTLE_ENDIAN) && BYTE_ORDER == LITTLE_ENDIAN) \
+ || (defined(_BYTE_ORDER) && defined(_LITTLE_ENDIAN) && _BYTE_ORDER == _LITTLE_ENDIAN) \
+ || (defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && __BYTE_ORDER == __LITTLE_ENDIAN) \
+ || (defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+ #define __LITTLE_ENDIAN__
+ #elif defined(_BIG_ENDIAN) \
+ || (defined(BYTE_ORDER) && defined(BIG_ENDIAN) && BYTE_ORDER == BIG_ENDIAN) \
+ || (defined(_BYTE_ORDER) && defined(_BIG_ENDIAN) && _BYTE_ORDER == _BIG_ENDIAN) \
+ || (defined(__BYTE_ORDER) && defined(__BIG_ENDIAN) && __BYTE_ORDER == __BIG_ENDIAN) \
+ || (defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ #define __BIG_ENDIAN__
+ #elif defined(_WIN32)
+ #define __LITTLE_ENDIAN__
+ #endif
+#endif
+
+#if defined(__LITTLE_ENDIAN__) && !defined(__BIG_ENDIAN__)
+ #if defined(__GNUC__) || defined(__clang__)
+ #define libsais_bswap16(x) (__builtin_bswap16(x))
+ #elif defined(_MSC_VER) && !defined(__INTEL_COMPILER)
+ #define libsais_bswap16(x) (_byteswap_ushort(x))
+ #else
+ #define libsais_bswap16(x) ((uint16_t)(x >> 8) | (uint16_t)(x << 8))
+ #endif
+#elif !defined(__LITTLE_ENDIAN__) && defined(__BIG_ENDIAN__)
+ #define libsais_bswap16(x) (x)
+#else
+ #error Your compiler, configuration or platform is not supported.
+#endif
+
+static void * libsais_align_up(const void * address, size_t alignment)
+{
+ return (void *)((((ptrdiff_t)address) + ((ptrdiff_t)alignment) - 1) & (-((ptrdiff_t)alignment)));
+}
+
+static void * libsais_alloc_aligned(size_t size, size_t alignment)
+{
+ void * address = malloc(size + sizeof(short) + alignment - 1);
+ if (address != NULL)
+ {
+ void * aligned_address = libsais_align_up((void *)((ptrdiff_t)address + (ptrdiff_t)(sizeof(short))), alignment);
+ ((short *)aligned_address)[-1] = (short)((ptrdiff_t)aligned_address - (ptrdiff_t)address);
+
+ return aligned_address;
+ }
+
+ return NULL;
+}
+
+static void libsais_free_aligned(void * aligned_address)
+{
+ if (aligned_address != NULL)
+ {
+ free((void *)((ptrdiff_t)aligned_address - ((short *)aligned_address)[-1]));
+ }
+}
+
+static LIBSAIS_THREAD_STATE * libsais_alloc_thread_state(sa_sint_t threads)
+{
+ LIBSAIS_THREAD_STATE * RESTRICT thread_state = (LIBSAIS_THREAD_STATE *)libsais_alloc_aligned((size_t)threads * sizeof(LIBSAIS_THREAD_STATE), 4096);
+ sa_sint_t * RESTRICT thread_buckets = (sa_sint_t *)libsais_alloc_aligned((size_t)threads * 4 * ALPHABET_SIZE * sizeof(sa_sint_t), 4096);
+ LIBSAIS_THREAD_CACHE * RESTRICT thread_cache = (LIBSAIS_THREAD_CACHE *)libsais_alloc_aligned((size_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE * sizeof(LIBSAIS_THREAD_CACHE), 4096);
+
+ if (thread_state != NULL && thread_buckets != NULL && thread_cache != NULL)
+ {
+ fast_sint_t t;
+ for (t = 0; t < threads; ++t)
+ {
+ thread_state[t].state.buckets = thread_buckets; thread_buckets += 4 * ALPHABET_SIZE;
+ thread_state[t].state.cache = thread_cache; thread_cache += LIBSAIS_PER_THREAD_CACHE_SIZE;
+ }
+
+ return thread_state;
+ }
+
+ libsais_free_aligned(thread_cache);
+ libsais_free_aligned(thread_buckets);
+ libsais_free_aligned(thread_state);
+ return NULL;
+}
+
+static void libsais_free_thread_state(LIBSAIS_THREAD_STATE * thread_state)
+{
+ if (thread_state != NULL)
+ {
+ libsais_free_aligned(thread_state[0].state.cache);
+ libsais_free_aligned(thread_state[0].state.buckets);
+ libsais_free_aligned(thread_state);
+ }
+}
+
+static LIBSAIS_CONTEXT * libsais_create_ctx_main(sa_sint_t threads)
+{
+ LIBSAIS_CONTEXT * RESTRICT ctx = (LIBSAIS_CONTEXT *)libsais_alloc_aligned(sizeof(LIBSAIS_CONTEXT), 64);
+ sa_sint_t * RESTRICT buckets = (sa_sint_t *)libsais_alloc_aligned(8 * ALPHABET_SIZE * sizeof(sa_sint_t), 4096);
+ LIBSAIS_THREAD_STATE * RESTRICT thread_state = threads > 1 ? libsais_alloc_thread_state(threads) : NULL;
+
+ if (ctx != NULL && buckets != NULL && (thread_state != NULL || threads == 1))
+ {
+ ctx->buckets = buckets;
+ ctx->threads = threads;
+ ctx->thread_state = thread_state;
+
+ return ctx;
+ }
+
+ libsais_free_thread_state(thread_state);
+ libsais_free_aligned(buckets);
+ libsais_free_aligned(ctx);
+ return NULL;
+}
+
+static void libsais_free_ctx_main(LIBSAIS_CONTEXT * ctx)
+{
+ if (ctx != NULL)
+ {
+ libsais_free_thread_state(ctx->thread_state);
+ libsais_free_aligned(ctx->buckets);
+ libsais_free_aligned(ctx);
+ }
+}
+
+#if defined(_OPENMP)
+
+static sa_sint_t libsais_count_negative_marked_suffixes(sa_sint_t * RESTRICT SA, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ sa_sint_t count = 0;
+
+ fast_sint_t i; for (i = omp_block_start; i < omp_block_start + omp_block_size; ++i) { count += (SA[i] < 0); }
+
+ return count;
+}
+
+static sa_sint_t libsais_count_zero_marked_suffixes(sa_sint_t * RESTRICT SA, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ sa_sint_t count = 0;
+
+ fast_sint_t i; for (i = omp_block_start; i < omp_block_start + omp_block_size; ++i) { count += (SA[i] == 0); }
+
+ return count;
+}
+
+static void libsais_place_cached_suffixes(sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ fast_sint_t i, j;
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4)
+ {
+ libsais_prefetch(&cache[i + 2 * prefetch_distance]);
+
+ libsais_prefetchw(&SA[cache[i + prefetch_distance + 0].symbol]);
+ libsais_prefetchw(&SA[cache[i + prefetch_distance + 1].symbol]);
+ libsais_prefetchw(&SA[cache[i + prefetch_distance + 2].symbol]);
+ libsais_prefetchw(&SA[cache[i + prefetch_distance + 3].symbol]);
+
+ SA[cache[i + 0].symbol] = cache[i + 0].index;
+ SA[cache[i + 1].symbol] = cache[i + 1].index;
+ SA[cache[i + 2].symbol] = cache[i + 2].index;
+ SA[cache[i + 3].symbol] = cache[i + 3].index;
+ }
+
+ for (j += prefetch_distance + 3; i < j; i += 1)
+ {
+ SA[cache[i].symbol] = cache[i].index;
+ }
+}
+
+static void libsais_compact_and_place_cached_suffixes(sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ fast_sint_t i, j, l;
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - 3, l = omp_block_start; i < j; i += 4)
+ {
+ libsais_prefetchw(&cache[i + prefetch_distance]);
+
+ cache[l] = cache[i + 0]; l += cache[l].symbol >= 0;
+ cache[l] = cache[i + 1]; l += cache[l].symbol >= 0;
+ cache[l] = cache[i + 2]; l += cache[l].symbol >= 0;
+ cache[l] = cache[i + 3]; l += cache[l].symbol >= 0;
+ }
+
+ for (j += 3; i < j; i += 1)
+ {
+ cache[l] = cache[i]; l += cache[l].symbol >= 0;
+ }
+
+ libsais_place_cached_suffixes(SA, cache, omp_block_start, l - omp_block_start);
+}
+
+static void libsais_accumulate_counts_s32_2(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size, fast_sint_t bucket_stride)
+{
+ sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride;
+ fast_sint_t s; for (s = 0; s < bucket_size; s += 1) { bucket00[s] = bucket00[s] + bucket01[s]; }
+}
+
+static void libsais_accumulate_counts_s32_3(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size, fast_sint_t bucket_stride)
+{
+ sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride;
+ sa_sint_t * RESTRICT bucket02 = bucket01 - bucket_stride;
+ fast_sint_t s; for (s = 0; s < bucket_size; s += 1) { bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s]; }
+}
+
+static void libsais_accumulate_counts_s32_4(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size, fast_sint_t bucket_stride)
+{
+ sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride;
+ sa_sint_t * RESTRICT bucket02 = bucket01 - bucket_stride;
+ sa_sint_t * RESTRICT bucket03 = bucket02 - bucket_stride;
+ fast_sint_t s; for (s = 0; s < bucket_size; s += 1) { bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s] + bucket03[s]; }
+}
+
+static void libsais_accumulate_counts_s32_5(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size, fast_sint_t bucket_stride)
+{
+ sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride;
+ sa_sint_t * RESTRICT bucket02 = bucket01 - bucket_stride;
+ sa_sint_t * RESTRICT bucket03 = bucket02 - bucket_stride;
+ sa_sint_t * RESTRICT bucket04 = bucket03 - bucket_stride;
+ fast_sint_t s; for (s = 0; s < bucket_size; s += 1) { bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s] + bucket03[s] + bucket04[s]; }
+}
+
+static void libsais_accumulate_counts_s32_6(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size, fast_sint_t bucket_stride)
+{
+ sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride;
+ sa_sint_t * RESTRICT bucket02 = bucket01 - bucket_stride;
+ sa_sint_t * RESTRICT bucket03 = bucket02 - bucket_stride;
+ sa_sint_t * RESTRICT bucket04 = bucket03 - bucket_stride;
+ sa_sint_t * RESTRICT bucket05 = bucket04 - bucket_stride;
+ fast_sint_t s; for (s = 0; s < bucket_size; s += 1) { bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s] + bucket03[s] + bucket04[s] + bucket05[s]; }
+}
+
+static void libsais_accumulate_counts_s32_7(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size, fast_sint_t bucket_stride)
+{
+ sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride;
+ sa_sint_t * RESTRICT bucket02 = bucket01 - bucket_stride;
+ sa_sint_t * RESTRICT bucket03 = bucket02 - bucket_stride;
+ sa_sint_t * RESTRICT bucket04 = bucket03 - bucket_stride;
+ sa_sint_t * RESTRICT bucket05 = bucket04 - bucket_stride;
+ sa_sint_t * RESTRICT bucket06 = bucket05 - bucket_stride;
+ fast_sint_t s; for (s = 0; s < bucket_size; s += 1) { bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s] + bucket03[s] + bucket04[s] + bucket05[s] + bucket06[s]; }
+}
+
+static void libsais_accumulate_counts_s32_8(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size, fast_sint_t bucket_stride)
+{
+ sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride;
+ sa_sint_t * RESTRICT bucket02 = bucket01 - bucket_stride;
+ sa_sint_t * RESTRICT bucket03 = bucket02 - bucket_stride;
+ sa_sint_t * RESTRICT bucket04 = bucket03 - bucket_stride;
+ sa_sint_t * RESTRICT bucket05 = bucket04 - bucket_stride;
+ sa_sint_t * RESTRICT bucket06 = bucket05 - bucket_stride;
+ sa_sint_t * RESTRICT bucket07 = bucket06 - bucket_stride;
+ fast_sint_t s; for (s = 0; s < bucket_size; s += 1) { bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s] + bucket03[s] + bucket04[s] + bucket05[s] + bucket06[s] + bucket07[s]; }
+}
+
+static void libsais_accumulate_counts_s32_9(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size, fast_sint_t bucket_stride)
+{
+ sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride;
+ sa_sint_t * RESTRICT bucket02 = bucket01 - bucket_stride;
+ sa_sint_t * RESTRICT bucket03 = bucket02 - bucket_stride;
+ sa_sint_t * RESTRICT bucket04 = bucket03 - bucket_stride;
+ sa_sint_t * RESTRICT bucket05 = bucket04 - bucket_stride;
+ sa_sint_t * RESTRICT bucket06 = bucket05 - bucket_stride;
+ sa_sint_t * RESTRICT bucket07 = bucket06 - bucket_stride;
+ sa_sint_t * RESTRICT bucket08 = bucket07 - bucket_stride;
+ fast_sint_t s; for (s = 0; s < bucket_size; s += 1) { bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s] + bucket03[s] + bucket04[s] + bucket05[s] + bucket06[s] + bucket07[s] + bucket08[s]; }
+}
+
+static void libsais_accumulate_counts_s32(sa_sint_t * RESTRICT buckets, fast_sint_t bucket_size, fast_sint_t bucket_stride, fast_sint_t num_buckets)
+{
+ while (num_buckets >= 9)
+ {
+ libsais_accumulate_counts_s32_9(buckets - (num_buckets - 9) * bucket_stride, bucket_size, bucket_stride); num_buckets -= 8;
+ }
+
+ switch (num_buckets)
+ {
+ case 1: break;
+ case 2: libsais_accumulate_counts_s32_2(buckets, bucket_size, bucket_stride); break;
+ case 3: libsais_accumulate_counts_s32_3(buckets, bucket_size, bucket_stride); break;
+ case 4: libsais_accumulate_counts_s32_4(buckets, bucket_size, bucket_stride); break;
+ case 5: libsais_accumulate_counts_s32_5(buckets, bucket_size, bucket_stride); break;
+ case 6: libsais_accumulate_counts_s32_6(buckets, bucket_size, bucket_stride); break;
+ case 7: libsais_accumulate_counts_s32_7(buckets, bucket_size, bucket_stride); break;
+ case 8: libsais_accumulate_counts_s32_8(buckets, bucket_size, bucket_stride); break;
+ }
+}
+
+#endif
+
+static void libsais_gather_lms_suffixes_8u(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, fast_sint_t m, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ if (omp_block_size > 0)
+ {
+ const fast_sint_t prefetch_distance = 128;
+
+ fast_sint_t i, j = omp_block_start + omp_block_size, c0 = T[omp_block_start + omp_block_size - 1], c1 = -1;
+
+ while (j < n && (c1 = T[j]) == c0) { ++j; }
+
+ fast_uint_t s = c0 >= c1;
+
+ for (i = omp_block_start + omp_block_size - 2, j = omp_block_start + 3; i >= j; i -= 4)
+ {
+ libsais_prefetch(&T[i - prefetch_distance]);
+
+ c1 = T[i - 0]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((s & 3) == 1);
+ c0 = T[i - 1]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 0); m -= ((s & 3) == 1);
+ c1 = T[i - 2]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 1); m -= ((s & 3) == 1);
+ c0 = T[i - 3]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 2); m -= ((s & 3) == 1);
+ }
+
+ for (j -= 3; i >= j; i -= 1)
+ {
+ c1 = c0; c0 = T[i]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((s & 3) == 1);
+ }
+
+ SA[m] = (sa_sint_t)(i + 1);
+ }
+}
+
+static void libsais_gather_lms_suffixes_8u_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+#if defined(_OPENMP)
+ #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536 && omp_get_dynamic() == 0)
+#endif
+ {
+#if defined(_OPENMP)
+ fast_sint_t omp_thread_num = omp_get_thread_num();
+ fast_sint_t omp_num_threads = omp_get_num_threads();
+#else
+ UNUSED(threads); UNUSED(thread_state);
+
+ fast_sint_t omp_thread_num = 0;
+ fast_sint_t omp_num_threads = 1;
+#endif
+ fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
+ fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+ fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;
+
+ if (omp_num_threads == 1)
+ {
+ libsais_gather_lms_suffixes_8u(T, SA, n, (fast_sint_t)n - 1, omp_block_start, omp_block_size);
+ }
+#if defined(_OPENMP)
+ else
+ {
+ fast_sint_t t, m = 0; for (t = omp_num_threads - 1; t > omp_thread_num; --t) { m += thread_state[t].state.m; }
+
+ libsais_gather_lms_suffixes_8u(T, SA, n, (fast_sint_t)n - 1 - m, omp_block_start, omp_block_size);
+
+ #pragma omp barrier
+
+ if (thread_state[omp_thread_num].state.m > 0)
+ {
+ SA[(fast_sint_t)n - 1 - m] = (sa_sint_t)thread_state[omp_thread_num].state.last_lms_suffix;
+ }
+ }
+#endif
+ }
+}
+
+static sa_sint_t libsais_gather_lms_suffixes_32s(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ sa_sint_t i = n - 2;
+ sa_sint_t m = n - 1;
+ fast_uint_t s = 1;
+ fast_sint_t c0 = T[n - 1];
+ fast_sint_t c1 = 0;
+
+ for (; i >= 3; i -= 4)
+ {
+ libsais_prefetch(&T[i - prefetch_distance]);
+
+ c1 = T[i - 0]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = i + 1; m -= ((s & 3) == 1);
+ c0 = T[i - 1]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = i - 0; m -= ((s & 3) == 1);
+ c1 = T[i - 2]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = i - 1; m -= ((s & 3) == 1);
+ c0 = T[i - 3]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = i - 2; m -= ((s & 3) == 1);
+ }
+
+ for (; i >= 0; i -= 1)
+ {
+ c1 = c0; c0 = T[i]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = i + 1; m -= ((s & 3) == 1);
+ }
+
+ return n - 1 - m;
+}
+
+static sa_sint_t libsais_gather_compacted_lms_suffixes_32s(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ sa_sint_t i = n - 2;
+ sa_sint_t m = n - 1;
+ fast_uint_t s = 1;
+ fast_sint_t c0 = T[n - 1];
+ fast_sint_t c1 = 0;
+
+ for (; i >= 3; i -= 4)
+ {
+ libsais_prefetch(&T[i - prefetch_distance]);
+
+ c1 = T[i - 0]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = i + 1; m -= ((fast_sint_t)(s & 3) == (c0 >= 0));
+ c0 = T[i - 1]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = i - 0; m -= ((fast_sint_t)(s & 3) == (c1 >= 0));
+ c1 = T[i - 2]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = i - 1; m -= ((fast_sint_t)(s & 3) == (c0 >= 0));
+ c0 = T[i - 3]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = i - 2; m -= ((fast_sint_t)(s & 3) == (c1 >= 0));
+ }
+
+ for (; i >= 0; i -= 1)
+ {
+ c1 = c0; c0 = T[i]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = i + 1; m -= ((fast_sint_t)(s & 3) == (c1 >= 0));
+ }
+
+ return n - 1 - m;
+}
+
+#if defined(_OPENMP)
+
+static void libsais_count_lms_suffixes_32s_4k(const sa_sint_t * RESTRICT T, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ memset(buckets, 0, 4 * (size_t)k * sizeof(sa_sint_t));
+
+ sa_sint_t i = n - 2;
+ fast_uint_t s = 1;
+ fast_sint_t c0 = T[n - 1];
+ fast_sint_t c1 = 0;
+
+ for (; i >= prefetch_distance + 3; i -= 4)
+ {
+ libsais_prefetch(&T[i - 2 * prefetch_distance]);
+
+ libsais_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 0], 0)]);
+ libsais_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 1], 0)]);
+ libsais_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 2], 0)]);
+ libsais_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 3], 0)]);
+
+ c1 = T[i - 0]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
+ buckets[BUCKETS_INDEX4((fast_uint_t)c0, s & 3)]++;
+
+ c0 = T[i - 1]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+ buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]++;
+
+ c1 = T[i - 2]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
+ buckets[BUCKETS_INDEX4((fast_uint_t)c0, s & 3)]++;
+
+ c0 = T[i - 3]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+ buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]++;
+ }
+
+ for (; i >= 0; i -= 1)
+ {
+ c1 = c0; c0 = T[i]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+ buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]++;
+ }
+
+ buckets[BUCKETS_INDEX4((fast_uint_t)c0, (s << 1) & 3)]++;
+}
+
+#endif
+
+static void libsais_count_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ memset(buckets, 0, 2 * (size_t)k * sizeof(sa_sint_t));
+
+ sa_sint_t i = n - 2;
+ fast_uint_t s = 1;
+ fast_sint_t c0 = T[n - 1];
+ fast_sint_t c1 = 0;
+
+ for (; i >= prefetch_distance + 3; i -= 4)
+ {
+ libsais_prefetch(&T[i - 2 * prefetch_distance]);
+
+ libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 0], 0)]);
+ libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 1], 0)]);
+ libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 2], 0)]);
+ libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 3], 0)]);
+
+ c1 = T[i - 0]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
+ buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++;
+
+ c0 = T[i - 1]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+ buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++;
+
+ c1 = T[i - 2]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
+ buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++;
+
+ c0 = T[i - 3]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+ buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++;
+ }
+
+ for (; i >= 0; i -= 1)
+ {
+ c1 = c0; c0 = T[i]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+ buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++;
+ }
+
+ buckets[BUCKETS_INDEX2((fast_uint_t)c0, 0)]++;
+}
+
+#if defined(_OPENMP)
+
+static void libsais_count_compacted_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ memset(buckets, 0, 2 * (size_t)k * sizeof(sa_sint_t));
+
+ sa_sint_t i = n - 2;
+ fast_uint_t s = 1;
+ fast_sint_t c0 = T[n - 1];
+ fast_sint_t c1 = 0;
+
+ for (; i >= prefetch_distance + 3; i -= 4)
+ {
+ libsais_prefetch(&T[i - 2 * prefetch_distance]);
+
+ libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 0] & SAINT_MAX, 0)]);
+ libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 1] & SAINT_MAX, 0)]);
+ libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 2] & SAINT_MAX, 0)]);
+ libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 3] & SAINT_MAX, 0)]);
+
+ c1 = T[i - 0]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
+ c0 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++;
+
+ c0 = T[i - 1]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+ c1 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++;
+
+ c1 = T[i - 2]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
+ c0 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++;
+
+ c0 = T[i - 3]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+ c1 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++;
+ }
+
+ for (; i >= 0; i -= 1)
+ {
+ c1 = c0; c0 = T[i]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+ c1 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++;
+ }
+
+ c0 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c0, 0)]++;
+}
+
+#endif
+
+static sa_sint_t libsais_count_and_gather_lms_suffixes_8u(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ memset(buckets, 0, 4 * ALPHABET_SIZE * sizeof(sa_sint_t));
+
+ fast_sint_t m = omp_block_start + omp_block_size - 1;
+
+ if (omp_block_size > 0)
+ {
+ const fast_sint_t prefetch_distance = 128;
+
+ fast_sint_t i, j = m + 1, c0 = T[m], c1 = -1;
+
+ while (j < n && (c1 = T[j]) == c0) { ++j; }
+
+ fast_uint_t s = c0 >= c1;
+
+ for (i = m - 1, j = omp_block_start + 3; i >= j; i -= 4)
+ {
+ libsais_prefetch(&T[i - prefetch_distance]);
+
+ c1 = T[i - 0]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((s & 3) == 1);
+ buckets[BUCKETS_INDEX4((fast_uint_t)c0, s & 3)]++;
+
+ c0 = T[i - 1]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 0); m -= ((s & 3) == 1);
+ buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]++;
+
+ c1 = T[i - 2]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 1); m -= ((s & 3) == 1);
+ buckets[BUCKETS_INDEX4((fast_uint_t)c0, s & 3)]++;
+
+ c0 = T[i - 3]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 2); m -= ((s & 3) == 1);
+ buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]++;
+ }
+
+ for (j -= 3; i >= j; i -= 1)
+ {
+ c1 = c0; c0 = T[i]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((s & 3) == 1);
+ buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]++;
+ }
+
+ c1 = (i >= 0) ? T[i] : -1; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((s & 3) == 1);
+ buckets[BUCKETS_INDEX4((fast_uint_t)c0, s & 3)]++;
+ }
+
+ return (sa_sint_t)(omp_block_start + omp_block_size - 1 - m);
+}
+
+static sa_sint_t libsais_count_and_gather_lms_suffixes_8u_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+ sa_sint_t m = 0;
+
+#if defined(_OPENMP)
+ #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536 && omp_get_dynamic() == 0)
+#endif
+ {
+#if defined(_OPENMP)
+ fast_sint_t omp_thread_num = omp_get_thread_num();
+ fast_sint_t omp_num_threads = omp_get_num_threads();
+#else
+ UNUSED(threads); UNUSED(thread_state);
+
+ fast_sint_t omp_thread_num = 0;
+ fast_sint_t omp_num_threads = 1;
+#endif
+ fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
+ fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+ fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;
+
+ if (omp_num_threads == 1)
+ {
+ m = libsais_count_and_gather_lms_suffixes_8u(T, SA, n, buckets, omp_block_start, omp_block_size);
+ }
+#if defined(_OPENMP)
+ else
+ {
+ {
+ thread_state[omp_thread_num].state.position = omp_block_start + omp_block_size;
+ thread_state[omp_thread_num].state.m = libsais_count_and_gather_lms_suffixes_8u(T, SA, n, thread_state[omp_thread_num].state.buckets, omp_block_start, omp_block_size);
+
+ if (thread_state[omp_thread_num].state.m > 0)
+ {
+ thread_state[omp_thread_num].state.last_lms_suffix = SA[thread_state[omp_thread_num].state.position - 1];
+ }
+ }
+
+ #pragma omp barrier
+
+ #pragma omp master
+ {
+ memset(buckets, 0, 4 * ALPHABET_SIZE * sizeof(sa_sint_t));
+
+ fast_sint_t t;
+ for (t = omp_num_threads - 1; t >= 0; --t)
+ {
+ m += (sa_sint_t)thread_state[t].state.m;
+
+ if (t != omp_num_threads - 1 && thread_state[t].state.m > 0)
+ {
+ memcpy(&SA[n - m], &SA[thread_state[t].state.position - thread_state[t].state.m], (size_t)thread_state[t].state.m * sizeof(sa_sint_t));
+ }
+
+ {
+ sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets;
+ fast_sint_t s; for (s = 0; s < 4 * ALPHABET_SIZE; s += 1) { sa_sint_t A = buckets[s], B = temp_bucket[s]; buckets[s] = A + B; temp_bucket[s] = A; }
+ }
+ }
+ }
+ }
+#endif
+ }
+
+ return m;
+}
+
+static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_4k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ memset(buckets, 0, 4 * (size_t)k * sizeof(sa_sint_t));
+
+ fast_sint_t m = omp_block_start + omp_block_size - 1;
+
+ if (omp_block_size > 0)
+ {
+ const fast_sint_t prefetch_distance = 32;
+
+ fast_sint_t i, j = m + 1, c0 = T[m], c1 = -1;
+
+ while (j < n && (c1 = T[j]) == c0) { ++j; }
+
+ fast_uint_t s = c0 >= c1;
+
+ for (i = m - 1, j = omp_block_start + prefetch_distance + 3; i >= j; i -= 4)
+ {
+ libsais_prefetch(&T[i - 2 * prefetch_distance]);
+
+ libsais_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 0], 0)]);
+ libsais_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 1], 0)]);
+ libsais_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 2], 0)]);
+ libsais_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 3], 0)]);
+
+ c1 = T[i - 0]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((s & 3) == 1);
+ buckets[BUCKETS_INDEX4((fast_uint_t)c0, s & 3)]++;
+
+ c0 = T[i - 1]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 0); m -= ((s & 3) == 1);
+ buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]++;
+
+ c1 = T[i - 2]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 1); m -= ((s & 3) == 1);
+ buckets[BUCKETS_INDEX4((fast_uint_t)c0, s & 3)]++;
+
+ c0 = T[i - 3]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 2); m -= ((s & 3) == 1);
+ buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]++;
+ }
+
+ for (j -= prefetch_distance + 3; i >= j; i -= 1)
+ {
+ c1 = c0; c0 = T[i]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((s & 3) == 1);
+ buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]++;
+ }
+
+ c1 = (i >= 0) ? T[i] : -1; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((s & 3) == 1);
+ buckets[BUCKETS_INDEX4((fast_uint_t)c0, s & 3)]++;
+ }
+
+ return (sa_sint_t)(omp_block_start + omp_block_size - 1 - m);
+}
+
+/* Counts symbols and gathers LMS-suffix positions for the 32-bit alphabet,
+   two-sub-buckets-per-symbol configuration, scanning T right to left over
+   [omp_block_start, omp_block_start + omp_block_size). The low bits of s form
+   a sliding window of successive suffix-type comparisons; a candidate written
+   to SA[m] is kept (m decremented) exactly when (s & 3) == 1, so the gathered
+   positions accumulate at descending indices from the end of the block region
+   of SA. Every scanned symbol is counted into
+   buckets[BUCKETS_INDEX2(symbol, (s & 3) == 1)].
+   Returns the number of positions gathered for this block. */
+static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ memset(buckets, 0, 2 * (size_t)k * sizeof(sa_sint_t));
+
+ fast_sint_t m = omp_block_start + omp_block_size - 1;
+
+ if (omp_block_size > 0)
+ {
+ const fast_sint_t prefetch_distance = 32;
+
+ fast_sint_t i, j = m + 1, c0 = T[m], c1 = -1;
+
+ /* Skip the run of equal symbols that may extend past the block end so the
+    type of the last in-block position can be resolved. */
+ while (j < n && (c1 = T[j]) == c0) { ++j; }
+
+ fast_uint_t s = c0 >= c1;
+
+ /* Main loop: unrolled x4 with software prefetch of T and the counters. */
+ for (i = m - 1, j = omp_block_start + prefetch_distance + 3; i >= j; i -= 4)
+ {
+ libsais_prefetch(&T[i - 2 * prefetch_distance]);
+
+ libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 0], 0)]);
+ libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 1], 0)]);
+ libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 2], 0)]);
+ libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 3], 0)]);
+
+ c1 = T[i - 0]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((s & 3) == 1);
+ buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++;
+
+ c0 = T[i - 1]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 0); m -= ((s & 3) == 1);
+ buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++;
+
+ c1 = T[i - 2]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 1); m -= ((s & 3) == 1);
+ buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++;
+
+ c0 = T[i - 3]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 2); m -= ((s & 3) == 1);
+ buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++;
+ }
+
+ /* Scalar tail down to the start of the block. */
+ for (j -= prefetch_distance + 3; i >= j; i -= 1)
+ {
+ c1 = c0; c0 = T[i]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((s & 3) == 1);
+ buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++;
+ }
+
+ /* Process the block's first position; c1 = -1 is the sentinel used when i
+    has run off the front of T. */
+ c1 = (i >= 0) ? T[i] : -1; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((s & 3) == 1);
+ buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++;
+ }
+
+ return (sa_sint_t)(omp_block_start + omp_block_size - 1 - m);
+}
+
+/* Variant of libsais_count_and_gather_lms_suffixes_32s_2k for "compacted"
+   input, where a symbol's sign bit may carry a tag. Counting always strips
+   the tag with (c & SAINT_MAX), and the keep test compares (s & 3) against
+   (c >= 0) instead of against 1, so positions whose deciding symbol is
+   tagged negative are filtered differently from untagged ones (presumably
+   to exclude suffixes already resolved elsewhere — confirm against callers).
+   Returns the number of positions gathered at the tail of the block region. */
+static sa_sint_t libsais_count_and_gather_compacted_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ memset(buckets, 0, 2 * (size_t)k * sizeof(sa_sint_t));
+
+ fast_sint_t m = omp_block_start + omp_block_size - 1;
+
+ if (omp_block_size > 0)
+ {
+ const fast_sint_t prefetch_distance = 32;
+
+ fast_sint_t i, j = m + 1, c0 = T[m], c1 = -1;
+
+ /* Resolve the type of the last in-block position by skipping the run of
+    equal symbols past the block boundary. */
+ while (j < n && (c1 = T[j]) == c0) { ++j; }
+
+ fast_uint_t s = c0 >= c1;
+
+ /* Main loop: unrolled x4; prefetches mask the tag bit off before indexing. */
+ for (i = m - 1, j = omp_block_start + prefetch_distance + 3; i >= j; i -= 4)
+ {
+ libsais_prefetch(&T[i - 2 * prefetch_distance]);
+
+ libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 0] & SAINT_MAX, 0)]);
+ libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 1] & SAINT_MAX, 0)]);
+ libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 2] & SAINT_MAX, 0)]);
+ libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 3] & SAINT_MAX, 0)]);
+
+ c1 = T[i - 0]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((fast_sint_t)(s & 3) == (c0 >= 0));
+ c0 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++;
+
+ c0 = T[i - 1]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 0); m -= ((fast_sint_t)(s & 3) == (c1 >= 0));
+ c1 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++;
+
+ c1 = T[i - 2]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 1); m -= ((fast_sint_t)(s & 3) == (c0 >= 0));
+ c0 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++;
+
+ c0 = T[i - 3]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 2); m -= ((fast_sint_t)(s & 3) == (c1 >= 0));
+ c1 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++;
+ }
+
+ /* Scalar tail down to the start of the block. */
+ for (j -= prefetch_distance + 3; i >= j; i -= 1)
+ {
+ c1 = c0; c0 = T[i]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((fast_sint_t)(s & 3) == (c1 >= 0));
+ c1 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++;
+ }
+
+ /* Final position of the block; c1 = -1 is the off-the-front sentinel. */
+ c1 = (i >= 0) ? T[i] : -1; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((fast_sint_t)(s & 3) == (c0 >= 0));
+ c0 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++;
+ }
+
+ return (sa_sint_t)(omp_block_start + omp_block_size - 1 - m);
+}
+
+#if defined(_OPENMP)
+
+/* Picks the stride (in elements) between per-thread bucket copies. Prefers a
+   1024-aligned padded stride, then a 16-aligned one, and falls back to a
+   tightly packed layout when the free space cannot accommodate padding for
+   the (num_buckets - 1) extra copies. */
+static fast_sint_t libsais_get_bucket_stride(fast_sint_t free_space, fast_sint_t bucket_size, fast_sint_t num_buckets)
+{
+    fast_sint_t space_per_copy = free_space / (num_buckets - 1);
+    fast_sint_t padded_stride;
+
+    padded_stride = (bucket_size + 1023) & (-1024);
+    if (space_per_copy >= padded_stride) { return padded_stride; }
+
+    padded_stride = (bucket_size + 15) & (-16);
+    if (space_per_copy >= padded_stride) { return padded_stride; }
+
+    return bucket_size;
+}
+
+/* Free-space parallel driver for the 4k count-and-gather. Each thread scans a
+   16-aligned slice of T into a private bucket copy placed
+   omp_thread_num * bucket_stride elements below `buckets` (using the scratch
+   area between &SA[n] and buckets). After the barrier, the last thread
+   concatenates the per-thread gathered suffixes at the tail of SA and totals
+   m, while the remaining threads cooperatively sum all bucket copies into
+   `buckets`. Returns the total number of gathered suffixes. */
+static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_4k_fs_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+ sa_sint_t m = 0;
+
+#if defined(_OPENMP)
+ #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536)
+#endif
+ {
+#if defined(_OPENMP)
+ fast_sint_t omp_thread_num = omp_get_thread_num();
+ fast_sint_t omp_num_threads = omp_get_num_threads();
+#else
+ UNUSED(threads); UNUSED(thread_state);
+
+ fast_sint_t omp_thread_num = 0;
+ fast_sint_t omp_num_threads = 1;
+#endif
+ fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
+ fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+ fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;
+
+ if (omp_num_threads == 1)
+ {
+ m = libsais_count_and_gather_lms_suffixes_32s_4k(T, SA, n, k, buckets, omp_block_start, omp_block_size);
+ }
+#if defined(_OPENMP)
+ else
+ {
+ fast_sint_t bucket_size = 4 * (fast_sint_t)k;
+ /* buckets - &SA[n] is the free space available for the extra copies. */
+ fast_sint_t bucket_stride = libsais_get_bucket_stride(buckets - &SA[n], bucket_size, omp_num_threads);
+
+ {
+ thread_state[omp_thread_num].state.position = omp_block_start + omp_block_size;
+ thread_state[omp_thread_num].state.count = libsais_count_and_gather_lms_suffixes_32s_4k(T, SA, n, k, buckets - (omp_thread_num * bucket_stride), omp_block_start, omp_block_size);
+ }
+
+ #pragma omp barrier
+
+ if (omp_thread_num == omp_num_threads - 1)
+ {
+ /* Stitch per-thread results together at the tail of SA, right to left;
+    the last thread's own chunk is already in place. */
+ fast_sint_t t;
+ for (t = omp_num_threads - 1; t >= 0; --t)
+ {
+ m += (sa_sint_t)thread_state[t].state.count;
+
+ if (t != omp_num_threads - 1 && thread_state[t].state.count > 0)
+ {
+ memcpy(&SA[n - m], &SA[thread_state[t].state.position - thread_state[t].state.count], (size_t)thread_state[t].state.count * sizeof(sa_sint_t));
+ }
+ }
+ }
+ else
+ {
+ /* All other threads split the bucket array and sum the per-thread
+    copies (omp_num_threads + 1 strided arrays incl. `buckets`). */
+ omp_num_threads = omp_num_threads - 1;
+ omp_block_stride = (bucket_size / omp_num_threads) & (-16);
+ omp_block_start = omp_thread_num * omp_block_stride;
+ omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : bucket_size - omp_block_start;
+
+ libsais_accumulate_counts_s32(buckets + omp_block_start, omp_block_size, bucket_stride, omp_num_threads + 1);
+ }
+ }
+#endif
+ }
+
+ return m;
+}
+
+/* Free-space parallel driver for the 2k count-and-gather; identical structure
+   to libsais_count_and_gather_lms_suffixes_32s_4k_fs_omp but with two
+   sub-buckets per symbol (bucket_size = 2 * k). Returns the total number of
+   gathered suffixes, concatenated at the tail of SA. */
+static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_2k_fs_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+ sa_sint_t m = 0;
+
+#if defined(_OPENMP)
+ #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536)
+#endif
+ {
+#if defined(_OPENMP)
+ fast_sint_t omp_thread_num = omp_get_thread_num();
+ fast_sint_t omp_num_threads = omp_get_num_threads();
+#else
+ UNUSED(threads); UNUSED(thread_state);
+
+ fast_sint_t omp_thread_num = 0;
+ fast_sint_t omp_num_threads = 1;
+#endif
+ fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
+ fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+ fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;
+
+ if (omp_num_threads == 1)
+ {
+ m = libsais_count_and_gather_lms_suffixes_32s_2k(T, SA, n, k, buckets, omp_block_start, omp_block_size);
+ }
+#if defined(_OPENMP)
+ else
+ {
+ fast_sint_t bucket_size = 2 * (fast_sint_t)k;
+ fast_sint_t bucket_stride = libsais_get_bucket_stride(buckets - &SA[n], bucket_size, omp_num_threads);
+
+ {
+ thread_state[omp_thread_num].state.position = omp_block_start + omp_block_size;
+ thread_state[omp_thread_num].state.count = libsais_count_and_gather_lms_suffixes_32s_2k(T, SA, n, k, buckets - (omp_thread_num * bucket_stride), omp_block_start, omp_block_size);
+ }
+
+ #pragma omp barrier
+
+ if (omp_thread_num == omp_num_threads - 1)
+ {
+ /* Last thread concatenates the per-thread gathered chunks at SA's tail. */
+ fast_sint_t t;
+ for (t = omp_num_threads - 1; t >= 0; --t)
+ {
+ m += (sa_sint_t)thread_state[t].state.count;
+
+ if (t != omp_num_threads - 1 && thread_state[t].state.count > 0)
+ {
+ memcpy(&SA[n - m], &SA[thread_state[t].state.position - thread_state[t].state.count], (size_t)thread_state[t].state.count * sizeof(sa_sint_t));
+ }
+ }
+ }
+ else
+ {
+ /* Other threads sum the per-thread bucket copies into `buckets`. */
+ omp_num_threads = omp_num_threads - 1;
+ omp_block_stride = (bucket_size / omp_num_threads) & (-16);
+ omp_block_start = omp_thread_num * omp_block_stride;
+ omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : bucket_size - omp_block_start;
+
+ libsais_accumulate_counts_s32(buckets + omp_block_start, omp_block_size, bucket_stride, omp_num_threads + 1);
+ }
+ }
+#endif
+ }
+
+ return m;
+}
+
+/* Free-space parallel driver for the compacted 2k count-and-gather. Unlike
+   the non-compacted drivers, each thread gathers into the upper half SA + n
+   (scratch) and then copies its own chunk into place at the tail of
+   SA[0 .. n); afterwards every thread helps sum the per-thread bucket copies
+   into `buckets`. No count is returned. */
+static void libsais_count_and_gather_compacted_lms_suffixes_32s_2k_fs_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+#if defined(_OPENMP)
+ #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536)
+#endif
+ {
+#if defined(_OPENMP)
+ fast_sint_t omp_thread_num = omp_get_thread_num();
+ fast_sint_t omp_num_threads = omp_get_num_threads();
+#else
+ UNUSED(threads); UNUSED(thread_state);
+
+ fast_sint_t omp_thread_num = 0;
+ fast_sint_t omp_num_threads = 1;
+#endif
+ fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
+ fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+ fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;
+
+ if (omp_num_threads == 1)
+ {
+ libsais_count_and_gather_compacted_lms_suffixes_32s_2k(T, SA, n, k, buckets, omp_block_start, omp_block_size);
+ }
+#if defined(_OPENMP)
+ else
+ {
+ fast_sint_t bucket_size = 2 * (fast_sint_t)k;
+ /* Free space starts after the SA + n scratch half used for gathering. */
+ fast_sint_t bucket_stride = libsais_get_bucket_stride(buckets - &SA[n + n], bucket_size, omp_num_threads);
+
+ {
+ thread_state[omp_thread_num].state.position = omp_block_start + omp_block_size;
+ thread_state[omp_thread_num].state.count = libsais_count_and_gather_compacted_lms_suffixes_32s_2k(T, SA + n, n, k, buckets - (omp_thread_num * bucket_stride), omp_block_start, omp_block_size);
+ }
+
+ #pragma omp barrier
+
+ {
+ /* Each thread computes its own destination offset (sum of counts of
+    itself and all later threads) and moves its chunk into SA[0 .. n). */
+ fast_sint_t t, m = 0; for (t = omp_num_threads - 1; t >= omp_thread_num; --t) { m += (sa_sint_t)thread_state[t].state.count; }
+
+ if (thread_state[omp_thread_num].state.count > 0)
+ {
+ memcpy(&SA[n - m], &SA[n + thread_state[omp_thread_num].state.position - thread_state[omp_thread_num].state.count], (size_t)thread_state[omp_thread_num].state.count * sizeof(sa_sint_t));
+ }
+ }
+
+ {
+ /* All threads (none excluded here) jointly accumulate the bucket copies. */
+ omp_block_stride = (bucket_size / omp_num_threads) & (-16);
+ omp_block_start = omp_thread_num * omp_block_stride;
+ omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : bucket_size - omp_block_start;
+
+ libsais_accumulate_counts_s32(buckets + omp_block_start, omp_block_size, bucket_stride, omp_num_threads);
+ }
+ }
+#endif
+ }
+}
+
+#endif
+
+/* No-free-space variant of the 4k count-and-gather. With one thread it does
+   both jobs in a single pass; with two threads, thread 0 builds the bucket
+   counts while thread 1 gathers the LMS-suffix positions, overlapping the
+   two passes over T. Returns the gathered count (from whichever thread ran
+   the gather). */
+static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_4k_nofs_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads)
+{
+ sa_sint_t m = 0;
+
+#if defined(_OPENMP)
+ #pragma omp parallel num_threads(2) if(threads > 1 && n >= 65536)
+#endif
+ {
+#if defined(_OPENMP)
+ fast_sint_t omp_thread_num = omp_get_thread_num();
+ fast_sint_t omp_num_threads = omp_get_num_threads();
+#else
+ UNUSED(threads);
+
+ fast_sint_t omp_num_threads = 1;
+#endif
+ if (omp_num_threads == 1)
+ {
+ m = libsais_count_and_gather_lms_suffixes_32s_4k(T, SA, n, k, buckets, 0, n);
+ }
+#if defined(_OPENMP)
+ else if (omp_thread_num == 0)
+ {
+ libsais_count_lms_suffixes_32s_4k(T, n, k, buckets);
+ }
+ else
+ {
+ m = libsais_gather_lms_suffixes_32s(T, SA, n);
+ }
+#endif
+ }
+
+ return m;
+}
+
+/* No-free-space variant of the 2k count-and-gather: single thread does both
+   in one pass; with two threads the counting (thread 0) and the gathering
+   (thread 1) run concurrently. Returns the gathered count. */
+static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_2k_nofs_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads)
+{
+ sa_sint_t m = 0;
+
+#if defined(_OPENMP)
+ #pragma omp parallel num_threads(2) if(threads > 1 && n >= 65536)
+#endif
+ {
+#if defined(_OPENMP)
+ fast_sint_t omp_thread_num = omp_get_thread_num();
+ fast_sint_t omp_num_threads = omp_get_num_threads();
+#else
+ UNUSED(threads);
+
+ fast_sint_t omp_num_threads = 1;
+#endif
+ if (omp_num_threads == 1)
+ {
+ m = libsais_count_and_gather_lms_suffixes_32s_2k(T, SA, n, k, buckets, 0, n);
+ }
+#if defined(_OPENMP)
+ else if (omp_thread_num == 0)
+ {
+ libsais_count_lms_suffixes_32s_2k(T, n, k, buckets);
+ }
+ else
+ {
+ m = libsais_gather_lms_suffixes_32s(T, SA, n);
+ }
+#endif
+ }
+
+ return m;
+}
+
+/* No-free-space variant for the compacted 2k configuration: one thread does
+   count+gather in a single pass; with two threads the compacted counting
+   (thread 0) and compacted gathering (thread 1) run concurrently.
+   Returns the gathered count. */
+static sa_sint_t libsais_count_and_gather_compacted_lms_suffixes_32s_2k_nofs_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads)
+{
+ sa_sint_t m = 0;
+
+#if defined(_OPENMP)
+ #pragma omp parallel num_threads(2) if(threads > 1 && n >= 65536)
+#endif
+ {
+#if defined(_OPENMP)
+ fast_sint_t omp_thread_num = omp_get_thread_num();
+ fast_sint_t omp_num_threads = omp_get_num_threads();
+#else
+ UNUSED(threads);
+
+ fast_sint_t omp_num_threads = 1;
+#endif
+ if (omp_num_threads == 1)
+ {
+ m = libsais_count_and_gather_compacted_lms_suffixes_32s_2k(T, SA, n, k, buckets, 0, n);
+ }
+#if defined(_OPENMP)
+ else if (omp_thread_num == 0)
+ {
+ libsais_count_compacted_lms_suffixes_32s_2k(T, n, k, buckets);
+ }
+ else
+ {
+ m = libsais_gather_compacted_lms_suffixes_32s(T, SA, n);
+ }
+#endif
+ }
+
+ return m;
+}
+
+/* Dispatcher for the 4k count-and-gather: takes the free-space parallel path
+   when the scratch between &SA[n] and `buckets` can hold extra 16-aligned
+   4k-sized bucket copies and the input is large relative to the alphabet;
+   otherwise uses the two-pass no-free-space path. */
+static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_4k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+ sa_sint_t m;
+
+#if defined(_OPENMP)
+ /* Number of 16-aligned bucket copies that fit in the scratch area. */
+ sa_sint_t max_threads = (sa_sint_t)((buckets - &SA[n]) / ((4 * (fast_sint_t)k + 15) & (-16))); if (max_threads > threads) { max_threads = threads; }
+ if (max_threads > 1 && n >= 65536 && n / k >= 2)
+ {
+ if (max_threads > n / 16 / k) { max_threads = n / 16 / k; }
+ m = libsais_count_and_gather_lms_suffixes_32s_4k_fs_omp(T, SA, n, k, buckets, max_threads > 2 ? max_threads : 2, thread_state);
+ }
+ else
+#else
+ UNUSED(thread_state);
+#endif
+ {
+ m = libsais_count_and_gather_lms_suffixes_32s_4k_nofs_omp(T, SA, n, k, buckets, threads);
+ }
+
+ return m;
+}
+
+/* Dispatcher for the 2k count-and-gather; same free-space vs no-free-space
+   selection as the 4k dispatcher, with 2k-sized bucket copies. */
+static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_2k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+ sa_sint_t m;
+
+#if defined(_OPENMP)
+ sa_sint_t max_threads = (sa_sint_t)((buckets - &SA[n]) / ((2 * (fast_sint_t)k + 15) & (-16))); if (max_threads > threads) { max_threads = threads; }
+ if (max_threads > 1 && n >= 65536 && n / k >= 2)
+ {
+ if (max_threads > n / 8 / k) { max_threads = n / 8 / k; }
+ m = libsais_count_and_gather_lms_suffixes_32s_2k_fs_omp(T, SA, n, k, buckets, max_threads > 2 ? max_threads : 2, thread_state);
+ }
+ else
+#else
+ UNUSED(thread_state);
+#endif
+ {
+ m = libsais_count_and_gather_lms_suffixes_32s_2k_nofs_omp(T, SA, n, k, buckets, threads);
+ }
+
+ return m;
+}
+
+/* Dispatcher for the compacted 2k count-and-gather. Note the free space is
+   measured from &SA[n + n] because the free-space path gathers into the
+   SA + n scratch half first. */
+static void libsais_count_and_gather_compacted_lms_suffixes_32s_2k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+#if defined(_OPENMP)
+ sa_sint_t max_threads = (sa_sint_t)((buckets - &SA[n + n]) / ((2 * (fast_sint_t)k + 15) & (-16))); if (max_threads > threads) { max_threads = threads; }
+ if (max_threads > 1 && n >= 65536 && n / k >= 2)
+ {
+ if (max_threads > n / 8 / k) { max_threads = n / 8 / k; }
+ libsais_count_and_gather_compacted_lms_suffixes_32s_2k_fs_omp(T, SA, n, k, buckets, max_threads > 2 ? max_threads : 2, thread_state);
+ }
+ else
+#else
+ UNUSED(thread_state);
+#endif
+ {
+ libsais_count_and_gather_compacted_lms_suffixes_32s_2k_nofs_omp(T, SA, n, k, buckets, threads);
+ }
+}
+
+/* Plain symbol histogram: zeroes k counters, then counts every symbol of
+   T[0 .. n) into buckets. The main loop is unrolled x8 with software
+   prefetch; the tail loop handles the remaining n % 8 symbols. */
+static void libsais_count_suffixes_32s(const sa_sint_t * RESTRICT T, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ memset(buckets, 0, (size_t)k * sizeof(sa_sint_t));
+
+ fast_sint_t i, j;
+ for (i = 0, j = (fast_sint_t)n - 7; i < j; i += 8)
+ {
+ libsais_prefetch(&T[i + prefetch_distance]);
+
+ buckets[T[i + 0]]++;
+ buckets[T[i + 1]]++;
+ buckets[T[i + 2]]++;
+ buckets[T[i + 3]]++;
+ buckets[T[i + 4]]++;
+ buckets[T[i + 5]]++;
+ buckets[T[i + 6]]++;
+ buckets[T[i + 7]]++;
+ }
+
+ for (j += 7; i < j; i += 1)
+ {
+ buckets[T[i]]++;
+ }
+}
+
+/* Builds per-symbol [start, end) offsets for the 8-bit alphabet from the
+   4-sub-bucket counts: for each symbol, start is the running total before it
+   and end the total including all four of its sub-buckets. When freq is
+   non-NULL, the per-symbol totals are also recorded there (the two loops are
+   identical apart from the freq side effect). */
+static void libsais_initialize_buckets_start_and_end_8u(sa_sint_t * RESTRICT buckets, sa_sint_t * RESTRICT freq)
+{
+ sa_sint_t * RESTRICT bucket_start = &buckets[6 * ALPHABET_SIZE];
+ sa_sint_t * RESTRICT bucket_end = &buckets[7 * ALPHABET_SIZE];
+
+ if (freq != NULL)
+ {
+ fast_sint_t i, j; sa_sint_t sum = 0;
+ for (i = BUCKETS_INDEX4(0, 0), j = 0; i <= BUCKETS_INDEX4(ALPHABET_SIZE - 1, 0); i += BUCKETS_INDEX4(1, 0), j += 1)
+ {
+ bucket_start[j] = sum;
+ sum += (freq[j] = buckets[i + BUCKETS_INDEX4(0, 0)] + buckets[i + BUCKETS_INDEX4(0, 1)] + buckets[i + BUCKETS_INDEX4(0, 2)] + buckets[i + BUCKETS_INDEX4(0, 3)]);
+ bucket_end[j] = sum;
+ }
+ }
+ else
+ {
+ fast_sint_t i, j; sa_sint_t sum = 0;
+ for (i = BUCKETS_INDEX4(0, 0), j = 0; i <= BUCKETS_INDEX4(ALPHABET_SIZE - 1, 0); i += BUCKETS_INDEX4(1, 0), j += 1)
+ {
+ bucket_start[j] = sum;
+ sum += buckets[i + BUCKETS_INDEX4(0, 0)] + buckets[i + BUCKETS_INDEX4(0, 1)] + buckets[i + BUCKETS_INDEX4(0, 2)] + buckets[i + BUCKETS_INDEX4(0, 3)];
+ bucket_end[j] = sum;
+ }
+ }
+}
+
+/* 32-bit-alphabet analogue of libsais_initialize_buckets_start_and_end_8u
+   (without the freq output): derives per-symbol [start, end) offsets from
+   the 4-sub-bucket counts, storing starts at buckets[4k .. 5k) and ends at
+   buckets[5k .. 6k). */
+static void libsais_initialize_buckets_start_and_end_32s_6k(sa_sint_t k, sa_sint_t * RESTRICT buckets)
+{
+ sa_sint_t * RESTRICT bucket_start = &buckets[4 * k];
+ sa_sint_t * RESTRICT bucket_end = &buckets[5 * k];
+
+ fast_sint_t i, j; sa_sint_t sum = 0;
+ for (i = BUCKETS_INDEX4(0, 0), j = 0; i <= BUCKETS_INDEX4((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX4(1, 0), j += 1)
+ {
+ bucket_start[j] = sum;
+ sum += buckets[i + BUCKETS_INDEX4(0, 0)] + buckets[i + BUCKETS_INDEX4(0, 1)] + buckets[i + BUCKETS_INDEX4(0, 2)] + buckets[i + BUCKETS_INDEX4(0, 3)];
+ bucket_end[j] = sum;
+ }
+}
+
+/* Two-sub-bucket variant: derives per-symbol [start, end) offsets from the
+   paired counts, storing starts at buckets[2k .. 3k) and ends at
+   buckets[3k .. 4k). */
+static void libsais_initialize_buckets_start_and_end_32s_4k(sa_sint_t k, sa_sint_t * RESTRICT buckets)
+{
+ sa_sint_t * RESTRICT bucket_start = &buckets[2 * k];
+ sa_sint_t * RESTRICT bucket_end = &buckets[3 * k];
+
+ fast_sint_t i, j; sa_sint_t sum = 0;
+ for (i = BUCKETS_INDEX2(0, 0), j = 0; i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX2(1, 0), j += 1)
+ {
+ bucket_start[j] = sum;
+ sum += buckets[i + BUCKETS_INDEX2(0, 0)] + buckets[i + BUCKETS_INDEX2(0, 1)];
+ bucket_end[j] = sum;
+ }
+}
+
+/* Inclusive prefix sum over the paired per-symbol counters; the running
+   total (the bucket end offset) is stored back into slot 0 of each pair. */
+static void libsais_initialize_buckets_end_32s_2k(sa_sint_t k, sa_sint_t * RESTRICT buckets)
+{
+    sa_sint_t running_total = 0;
+    fast_sint_t pos;
+
+    for (pos = BUCKETS_INDEX2(0, 0); pos <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0); pos += BUCKETS_INDEX2(1, 0))
+    {
+        running_total += buckets[pos + BUCKETS_INDEX2(0, 0)];
+        running_total += buckets[pos + BUCKETS_INDEX2(0, 1)];
+        buckets[pos + BUCKETS_INDEX2(0, 0)] = running_total;
+    }
+}
+
+/* Compacts slot 0 of each symbol pair down to a dense array buckets[0 .. k),
+   then writes buckets[k .. 2k) = { 0, buckets[0 .. k-1) } — i.e. the dense
+   values shifted right by one with a leading zero. Presumably the slot-0
+   values hold bucket end offsets at this point (see
+   libsais_initialize_buckets_end_32s_2k), making the shifted copy the bucket
+   starts — confirm against callers. Note the memcpy reads buckets[0 .. k-1)
+   while writing buckets[k+1 ..), so the two regions do not overlap. */
+static void libsais_initialize_buckets_start_and_end_32s_2k(sa_sint_t k, sa_sint_t * RESTRICT buckets)
+{
+ fast_sint_t i, j;
+ for (i = BUCKETS_INDEX2(0, 0), j = 0; i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX2(1, 0), j += 1)
+ {
+ buckets[j] = buckets[i];
+ }
+
+ buckets[k] = 0; memcpy(&buckets[k + 1], buckets, ((size_t)k - 1) * sizeof(sa_sint_t));
+}
+
+/* Exclusive prefix sum: bucket c becomes the total of the original
+   buckets[0 .. c), i.e. the start offset of bucket c. */
+static void libsais_initialize_buckets_start_32s_1k(sa_sint_t k, sa_sint_t * RESTRICT buckets)
+{
+    sa_sint_t running_total = 0;
+    fast_sint_t c;
+
+    for (c = 0; c < (fast_sint_t)k; ++c)
+    {
+        sa_sint_t current = buckets[c];
+        buckets[c] = running_total;
+        running_total += current;
+    }
+}
+
+/* Inclusive prefix sum: bucket c becomes the total of the original
+   buckets[0 .. c], i.e. the end offset of bucket c. */
+static void libsais_initialize_buckets_end_32s_1k(sa_sint_t k, sa_sint_t * RESTRICT buckets)
+{
+    sa_sint_t running_total = 0;
+    fast_sint_t c;
+
+    for (c = 0; c < (fast_sint_t)k; ++c)
+    {
+        running_total += buckets[c];
+        buckets[c] = running_total;
+    }
+}
+
+/* Walks T backwards from first_lms_suffix, re-deriving the suffix-type bit
+   window s to decrement the 4-sub-bucket counts for every position at or
+   before first_lms_suffix (removing those positions from the radix-sort
+   workload). Then builds temp_bucket at buckets[4 * ALPHABET_SIZE]: per
+   symbol, slot 1 receives the running total of sub-buckets 1 and 3 before
+   the symbol and slot 0 the total including it (sub-buckets 1/3 presumably
+   being the LMS-relevant counters — confirm against the counting routines).
+   Returns the overall sub-bucket-1/3 total. */
+static sa_sint_t libsais_initialize_buckets_for_lms_suffixes_radix_sort_8u(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix)
+{
+ {
+ fast_uint_t s = 0;
+ fast_sint_t c0 = T[first_lms_suffix];
+ fast_sint_t c1 = 0;
+
+ for (; --first_lms_suffix >= 0; )
+ {
+ c1 = c0; c0 = T[first_lms_suffix]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+ buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]--;
+ }
+
+ buckets[BUCKETS_INDEX4((fast_uint_t)c0, (s << 1) & 3)]--;
+ }
+
+ {
+ sa_sint_t * RESTRICT temp_bucket = &buckets[4 * ALPHABET_SIZE];
+
+ fast_sint_t i, j; sa_sint_t sum = 0;
+ for (i = BUCKETS_INDEX4(0, 0), j = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX4(ALPHABET_SIZE - 1, 0); i += BUCKETS_INDEX4(1, 0), j += BUCKETS_INDEX2(1, 0))
+ {
+ temp_bucket[j + BUCKETS_INDEX2(0, 1)] = sum; sum += buckets[i + BUCKETS_INDEX4(0, 1)] + buckets[i + BUCKETS_INDEX4(0, 3)]; temp_bucket[j] = sum;
+ }
+
+ return sum;
+ }
+}
+
+/* Adjusts the counts for the first LMS suffix's symbol (moving one unit from
+   sub-bucket 1 to sub-bucket 0), then converts the paired counters in place
+   to running sums: slot 0 becomes the inclusive total over both slots of all
+   symbols so far, slot 1 the inclusive total over slot-1 counts only. */
+static void libsais_initialize_buckets_for_lms_suffixes_radix_sort_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix)
+{
+ buckets[BUCKETS_INDEX2(T[first_lms_suffix], 0)]++;
+ buckets[BUCKETS_INDEX2(T[first_lms_suffix], 1)]--;
+
+ fast_sint_t i; sa_sint_t sum0 = 0, sum1 = 0;
+ for (i = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX2(1, 0))
+ {
+ sum0 += buckets[i + BUCKETS_INDEX2(0, 0)] + buckets[i + BUCKETS_INDEX2(0, 1)];
+ sum1 += buckets[i + BUCKETS_INDEX2(0, 1)];
+
+ buckets[i + BUCKETS_INDEX2(0, 0)] = sum0;
+ buckets[i + BUCKETS_INDEX2(0, 1)] = sum1;
+ }
+}
+
+/* 32-bit-alphabet analogue of the 8u variant: walks T backwards from
+   first_lms_suffix decrementing the 4-sub-bucket counts for every earlier
+   position, then builds a one-slot-per-symbol table at buckets[4k ..):
+   temp_bucket[c] = inclusive running total of sub-buckets 1 and 3 up to
+   symbol c. Returns the overall total. */
+static sa_sint_t libsais_initialize_buckets_for_lms_suffixes_radix_sort_32s_6k(const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix)
+{
+ {
+ fast_uint_t s = 0;
+ fast_sint_t c0 = T[first_lms_suffix];
+ fast_sint_t c1 = 0;
+
+ for (; --first_lms_suffix >= 0; )
+ {
+ c1 = c0; c0 = T[first_lms_suffix]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+ buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]--;
+ }
+
+ buckets[BUCKETS_INDEX4((fast_uint_t)c0, (s << 1) & 3)]--;
+ }
+
+ {
+ sa_sint_t * RESTRICT temp_bucket = &buckets[4 * k];
+
+ fast_sint_t i, j; sa_sint_t sum = 0;
+ for (i = BUCKETS_INDEX4(0, 0), j = 0; i <= BUCKETS_INDEX4((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX4(1, 0), j += 1)
+ {
+ sum += buckets[i + BUCKETS_INDEX4(0, 1)] + buckets[i + BUCKETS_INDEX4(0, 3)]; temp_bucket[j] = sum;
+ }
+
+ return sum;
+ }
+}
+
+/* Adjusts the first LMS suffix's symbol counts (one unit moved from
+   sub-bucket 1 to sub-bucket 0), then in one pass: records the exclusive
+   running total of all counts into bucket_start[c] (at buckets[2k ..)),
+   the inclusive total into bucket_end[c] (at buckets[3k ..)), and replaces
+   each symbol's slot 1 with the inclusive running total of slot-1 counts. */
+static void libsais_initialize_buckets_for_radix_and_partial_sorting_32s_4k(const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix)
+{
+ sa_sint_t * RESTRICT bucket_start = &buckets[2 * k];
+ sa_sint_t * RESTRICT bucket_end = &buckets[3 * k];
+
+ buckets[BUCKETS_INDEX2(T[first_lms_suffix], 0)]++;
+ buckets[BUCKETS_INDEX2(T[first_lms_suffix], 1)]--;
+
+ fast_sint_t i, j; sa_sint_t sum0 = 0, sum1 = 0;
+ for (i = BUCKETS_INDEX2(0, 0), j = 0; i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX2(1, 0), j += 1)
+ {
+ bucket_start[j] = sum1;
+
+ sum0 += buckets[i + BUCKETS_INDEX2(0, 1)];
+ sum1 += buckets[i + BUCKETS_INDEX2(0, 0)] + buckets[i + BUCKETS_INDEX2(0, 1)];
+ buckets[i + BUCKETS_INDEX2(0, 1)] = sum0;
+
+ bucket_end[j] = sum1;
+ }
+}
+
+/* Right-to-left pass over SA[omp_block_start .. omp_block_start +
+   omp_block_size): each stored position p is relocated to the slot obtained
+   by decrementing its first symbol's end pointer in induction_bucket
+   (2-sub-bucket indexing, slot 0), giving a stable bucketing of the
+   previously gathered positions. Unrolled x4 with software prefetch of SA
+   and of the symbols the positions point at. */
+static void libsais_radix_sort_lms_suffixes_8u(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ fast_sint_t i, j;
+ for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 3; i >= j; i -= 4)
+ {
+ libsais_prefetch(&SA[i - 2 * prefetch_distance]);
+
+ libsais_prefetch(&T[SA[i - prefetch_distance - 0]]);
+ libsais_prefetch(&T[SA[i - prefetch_distance - 1]]);
+ libsais_prefetch(&T[SA[i - prefetch_distance - 2]]);
+ libsais_prefetch(&T[SA[i - prefetch_distance - 3]]);
+
+ sa_sint_t p0 = SA[i - 0]; SA[--induction_bucket[BUCKETS_INDEX2(T[p0], 0)]] = p0;
+ sa_sint_t p1 = SA[i - 1]; SA[--induction_bucket[BUCKETS_INDEX2(T[p1], 0)]] = p1;
+ sa_sint_t p2 = SA[i - 2]; SA[--induction_bucket[BUCKETS_INDEX2(T[p2], 0)]] = p2;
+ sa_sint_t p3 = SA[i - 3]; SA[--induction_bucket[BUCKETS_INDEX2(T[p3], 0)]] = p3;
+ }
+
+ for (j -= prefetch_distance + 3; i >= j; i -= 1)
+ {
+ sa_sint_t p = SA[i]; SA[--induction_bucket[BUCKETS_INDEX2(T[p], 0)]] = p;
+ }
+}
+
+/* Parallel radix sort of the m gathered LMS positions for the 8-bit
+   alphabet. Single-threaded: sorts m - 1 positions from the tail of SA using
+   the temp buckets at buckets[4 * ALPHABET_SIZE]. Parallel: each thread
+   first derives private end pointers by subtracting counts previously staged
+   in thread_state[..].state.buckets from the shared temp buckets, then sorts
+   its own chunk of the gathered tail (its offset from the right end is the
+   sum of its own and all later threads' per-thread m). The
+   (omp_block_start == m) adjustment drops one position for the thread whose
+   slice reaches the very end, matching the m - 1 count of the serial path
+   (presumably the last LMS suffix is handled separately — confirm). */
+static void libsais_radix_sort_lms_suffixes_8u_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+#if defined(_OPENMP)
+ #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536 && m >= 65536 && omp_get_dynamic() == 0)
+#endif
+ {
+#if defined(_OPENMP)
+ fast_sint_t omp_thread_num = omp_get_thread_num();
+ fast_sint_t omp_num_threads = omp_get_num_threads();
+#else
+ UNUSED(threads); UNUSED(thread_state);
+
+ fast_sint_t omp_num_threads = 1;
+#endif
+ if (omp_num_threads == 1)
+ {
+ libsais_radix_sort_lms_suffixes_8u(T, SA, &buckets[4 * ALPHABET_SIZE], (fast_sint_t)n - (fast_sint_t)m + 1, (fast_sint_t)m - 1);
+ }
+#if defined(_OPENMP)
+ else
+ {
+ {
+ /* Derive this thread's private end pointers from the shared buckets. */
+ sa_sint_t * RESTRICT src_bucket = &buckets[4 * ALPHABET_SIZE];
+ sa_sint_t * RESTRICT dst_bucket = thread_state[omp_thread_num].state.buckets;
+
+ fast_sint_t i, j;
+ for (i = BUCKETS_INDEX2(0, 0), j = BUCKETS_INDEX4(0, 1); i <= BUCKETS_INDEX2(ALPHABET_SIZE - 1, 0); i += BUCKETS_INDEX2(1, 0), j += BUCKETS_INDEX4(1, 0))
+ {
+ dst_bucket[i] = src_bucket[i] - dst_bucket[j];
+ }
+ }
+
+ {
+ /* This thread's chunk sits omp_block_start entries from the right end. */
+ fast_sint_t t, omp_block_start = 0, omp_block_size = thread_state[omp_thread_num].state.m;
+ for (t = omp_num_threads - 1; t >= omp_thread_num; --t) omp_block_start += thread_state[t].state.m;
+
+ if (omp_block_start == (fast_sint_t)m && omp_block_size > 0)
+ {
+ omp_block_start -= 1; omp_block_size -= 1;
+ }
+
+ libsais_radix_sort_lms_suffixes_8u(T, SA, thread_state[omp_thread_num].state.buckets, (fast_sint_t)n - omp_block_start, omp_block_size);
+ }
+ }
+#endif
+ }
+}
+
+/* Right-to-left placement of gathered positions for the 32-bit alphabet with
+   one induction-bucket slot per symbol: each position p moves to
+   SA[--induction_bucket[T[p]]]. Uses a two-stage prefetch pipeline (first
+   the symbols behind SA, then the bucket entries they select) and an
+   x4-unrolled main loop. */
+static void libsais_radix_sort_lms_suffixes_32s_6k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ fast_sint_t i, j;
+ for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 3; i >= j; i -= 4)
+ {
+ libsais_prefetch(&SA[i - 3 * prefetch_distance]);
+
+ libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 0]]);
+ libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 1]]);
+ libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 2]]);
+ libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 3]]);
+
+ libsais_prefetchw(&induction_bucket[T[SA[i - prefetch_distance - 0]]]);
+ libsais_prefetchw(&induction_bucket[T[SA[i - prefetch_distance - 1]]]);
+ libsais_prefetchw(&induction_bucket[T[SA[i - prefetch_distance - 2]]]);
+ libsais_prefetchw(&induction_bucket[T[SA[i - prefetch_distance - 3]]]);
+
+ sa_sint_t p0 = SA[i - 0]; SA[--induction_bucket[T[p0]]] = p0;
+ sa_sint_t p1 = SA[i - 1]; SA[--induction_bucket[T[p1]]] = p1;
+ sa_sint_t p2 = SA[i - 2]; SA[--induction_bucket[T[p2]]] = p2;
+ sa_sint_t p3 = SA[i - 3]; SA[--induction_bucket[T[p3]]] = p3;
+ }
+
+ for (j -= 2 * prefetch_distance + 3; i >= j; i -= 1)
+ {
+ sa_sint_t p = SA[i]; SA[--induction_bucket[T[p]]] = p;
+ }
+}
+
+/* Same right-to-left placement as libsais_radix_sort_lms_suffixes_32s_6k,
+   but the induction buckets use paired (BUCKETS_INDEX2) slots and the end
+   pointer decremented is slot 0 of each pair. */
+static void libsais_radix_sort_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ fast_sint_t i, j;
+ for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 3; i >= j; i -= 4)
+ {
+ libsais_prefetch(&SA[i - 3 * prefetch_distance]);
+
+ libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 0]]);
+ libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 1]]);
+ libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 2]]);
+ libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 3]]);
+
+ libsais_prefetchw(&induction_bucket[BUCKETS_INDEX2(T[SA[i - prefetch_distance - 0]], 0)]);
+ libsais_prefetchw(&induction_bucket[BUCKETS_INDEX2(T[SA[i - prefetch_distance - 1]], 0)]);
+ libsais_prefetchw(&induction_bucket[BUCKETS_INDEX2(T[SA[i - prefetch_distance - 2]], 0)]);
+ libsais_prefetchw(&induction_bucket[BUCKETS_INDEX2(T[SA[i - prefetch_distance - 3]], 0)]);
+
+ sa_sint_t p0 = SA[i - 0]; SA[--induction_bucket[BUCKETS_INDEX2(T[p0], 0)]] = p0;
+ sa_sint_t p1 = SA[i - 1]; SA[--induction_bucket[BUCKETS_INDEX2(T[p1], 0)]] = p1;
+ sa_sint_t p2 = SA[i - 2]; SA[--induction_bucket[BUCKETS_INDEX2(T[p2], 0)]] = p2;
+ sa_sint_t p3 = SA[i - 3]; SA[--induction_bucket[BUCKETS_INDEX2(T[p3], 0)]] = p3;
+ }
+
+ for (j -= 2 * prefetch_distance + 3; i >= j; i -= 1)
+ {
+ sa_sint_t p = SA[i]; SA[--induction_bucket[BUCKETS_INDEX2(T[p], 0)]] = p;
+ }
+}
+
+#if defined(_OPENMP)
+
+/* Gather stage of the blocked parallel radix sort: for each entry of
+   SA[omp_block_start .. omp_block_start + omp_block_size), copies the stored
+   position into cache[i].index and its first symbol T[position] into
+   cache[i].symbol, so the subsequent sort stage can work on the cache alone.
+   Unrolled x4 with prefetch of SA, of the referenced symbols, and of the
+   cache lines being written. */
+static void libsais_radix_sort_lms_suffixes_32s_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ fast_sint_t i, j;
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4)
+ {
+ libsais_prefetch(&SA[i + 2 * prefetch_distance]);
+
+ libsais_prefetch(&T[SA[i + prefetch_distance + 0]]);
+ libsais_prefetch(&T[SA[i + prefetch_distance + 1]]);
+ libsais_prefetch(&T[SA[i + prefetch_distance + 2]]);
+ libsais_prefetch(&T[SA[i + prefetch_distance + 3]]);
+
+ libsais_prefetchw(&cache[i + prefetch_distance]);
+
+ cache[i + 0].symbol = T[cache[i + 0].index = SA[i + 0]];
+ cache[i + 1].symbol = T[cache[i + 1].index = SA[i + 1]];
+ cache[i + 2].symbol = T[cache[i + 2].index = SA[i + 2]];
+ cache[i + 3].symbol = T[cache[i + 3].index = SA[i + 3]];
+ }
+
+ for (j += prefetch_distance + 3; i < j; i += 1)
+ {
+ cache[i].symbol = T[cache[i].index = SA[i]];
+ }
+}
+
+/* Sort stage of the blocked parallel radix sort (one bucket slot per
+   symbol): walks the cache right to left and replaces each cached symbol
+   with the destination slot obtained by decrementing that symbol's
+   induction-bucket end pointer, preserving the stable order established by
+   the gather stage. Unrolled x4 with prefetch of the cache and the bucket
+   entries. */
+static void libsais_radix_sort_lms_suffixes_32s_6k_block_sort(sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ fast_sint_t i, j;
+ for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 3; i >= j; i -= 4)
+ {
+ libsais_prefetchw(&cache[i - 2 * prefetch_distance]);
+
+ libsais_prefetchw(&induction_bucket[cache[i - prefetch_distance - 0].symbol]);
+ libsais_prefetchw(&induction_bucket[cache[i - prefetch_distance - 1].symbol]);
+ libsais_prefetchw(&induction_bucket[cache[i - prefetch_distance - 2].symbol]);
+ libsais_prefetchw(&induction_bucket[cache[i - prefetch_distance - 3].symbol]);
+
+ cache[i - 0].symbol = --induction_bucket[cache[i - 0].symbol];
+ cache[i - 1].symbol = --induction_bucket[cache[i - 1].symbol];
+ cache[i - 2].symbol = --induction_bucket[cache[i - 2].symbol];
+ cache[i - 3].symbol = --induction_bucket[cache[i - 3].symbol];
+ }
+
+ for (j -= prefetch_distance + 3; i >= j; i -= 1)
+ {
+ cache[i].symbol = --induction_bucket[cache[i].symbol];
+ }
+}
+
+/* Same single-threaded sort phase as the 6k variant, but for the 2k bucket
+ * layout: the bucket counter for a symbol lives at BUCKETS_INDEX2(symbol, 0)
+ * (two slots per symbol) instead of at the symbol index directly. Each cached
+ * symbol is replaced by its final SA position via a pre-decremented counter. */
+static void libsais_radix_sort_lms_suffixes_32s_2k_block_sort(sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ fast_sint_t i, j;
+ for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 3; i >= j; i -= 4)
+ {
+ libsais_prefetchw(&cache[i - 2 * prefetch_distance]);
+
+ libsais_prefetchw(&induction_bucket[BUCKETS_INDEX2(cache[i - prefetch_distance - 0].symbol, 0)]);
+ libsais_prefetchw(&induction_bucket[BUCKETS_INDEX2(cache[i - prefetch_distance - 1].symbol, 0)]);
+ libsais_prefetchw(&induction_bucket[BUCKETS_INDEX2(cache[i - prefetch_distance - 2].symbol, 0)]);
+ libsais_prefetchw(&induction_bucket[BUCKETS_INDEX2(cache[i - prefetch_distance - 3].symbol, 0)]);
+
+ cache[i - 0].symbol = --induction_bucket[BUCKETS_INDEX2(cache[i - 0].symbol, 0)];
+ cache[i - 1].symbol = --induction_bucket[BUCKETS_INDEX2(cache[i - 1].symbol, 0)];
+ cache[i - 2].symbol = --induction_bucket[BUCKETS_INDEX2(cache[i - 2].symbol, 0)];
+ cache[i - 3].symbol = --induction_bucket[BUCKETS_INDEX2(cache[i - 3].symbol, 0)];
+ }
+
+ for (j -= prefetch_distance + 3; i >= j; i -= 1)
+ {
+ cache[i].symbol = --induction_bucket[BUCKETS_INDEX2(cache[i].symbol, 0)];
+ }
+}
+
+/* Threaded radix sort of one SA block (6k bucket layout), staged as:
+ *   1. all threads gather (index, symbol) pairs into the shared cache;
+ *   2. barrier; the master thread alone runs the bucket-decrement sort
+ *      over the whole block (bucket counters are not thread-safe);
+ *   3. barrier; all threads scatter the cached results back into SA.
+ * `cache - block_start` rebases the cache pointer so it can be indexed with
+ * the same absolute positions as SA. With one thread (or a small block, per
+ * the `if` clause on the pragma) the serial 6k routine is used instead.
+ * Each thread gets a stride rounded down to a multiple of 16; the last
+ * thread absorbs the remainder. */
+static void libsais_radix_sort_lms_suffixes_32s_6k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads)
+{
+#if defined(_OPENMP)
+ #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384)
+#endif
+ {
+#if defined(_OPENMP)
+ fast_sint_t omp_thread_num = omp_get_thread_num();
+ fast_sint_t omp_num_threads = omp_get_num_threads();
+#else
+ UNUSED(threads); UNUSED(cache);
+
+ fast_sint_t omp_thread_num = 0;
+ fast_sint_t omp_num_threads = 1;
+#endif
+ fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
+ fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+ fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
+
+ omp_block_start += block_start;
+
+ if (omp_num_threads == 1)
+ {
+ libsais_radix_sort_lms_suffixes_32s_6k(T, SA, induction_bucket, omp_block_start, omp_block_size);
+ }
+#if defined(_OPENMP)
+ else
+ {
+ {
+ libsais_radix_sort_lms_suffixes_32s_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size);
+ }
+
+ #pragma omp barrier
+
+ #pragma omp master
+ {
+ libsais_radix_sort_lms_suffixes_32s_6k_block_sort(induction_bucket, cache - block_start, block_start, block_size);
+ }
+
+ #pragma omp barrier
+
+ {
+ libsais_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size);
+ }
+ }
+#endif
+ }
+}
+
+/* Identical gather / master-sort / place pipeline as the 6k block version,
+ * except the single-threaded middle phase uses the 2k bucket layout
+ * (counters addressed via BUCKETS_INDEX2). See the 6k variant above for the
+ * staging, barrier and cache-rebasing details. */
+static void libsais_radix_sort_lms_suffixes_32s_2k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads)
+{
+#if defined(_OPENMP)
+ #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384)
+#endif
+ {
+#if defined(_OPENMP)
+ fast_sint_t omp_thread_num = omp_get_thread_num();
+ fast_sint_t omp_num_threads = omp_get_num_threads();
+#else
+ UNUSED(threads); UNUSED(cache);
+
+ fast_sint_t omp_thread_num = 0;
+ fast_sint_t omp_num_threads = 1;
+#endif
+ fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
+ fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+ fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
+
+ omp_block_start += block_start;
+
+ if (omp_num_threads == 1)
+ {
+ libsais_radix_sort_lms_suffixes_32s_2k(T, SA, induction_bucket, omp_block_start, omp_block_size);
+ }
+#if defined(_OPENMP)
+ else
+ {
+ {
+ libsais_radix_sort_lms_suffixes_32s_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size);
+ }
+
+ #pragma omp barrier
+
+ #pragma omp master
+ {
+ libsais_radix_sort_lms_suffixes_32s_2k_block_sort(induction_bucket, cache - block_start, block_start, block_size);
+ }
+
+ #pragma omp barrier
+
+ {
+ libsais_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size);
+ }
+ }
+#endif
+ }
+}
+
+#endif
+
+/* Top-level driver for the 6k LMS radix sort. The m - 1 LMS suffix indices
+ * live in SA[n - m + 1 .. n - 1]; for a single thread (or m < 65536) they are
+ * processed in one serial call. Otherwise the range is split into chunks of
+ * at most threads * LIBSAIS_PER_THREAD_CACHE_SIZE entries; note the chunk is
+ * handed over as (n - block_end, block_end - block_start), i.e. chunks are
+ * consumed from the high end of SA downward so the right-to-left,
+ * decrementing-bucket placement stays correct across chunks. */
+static void libsais_radix_sort_lms_suffixes_32s_6k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+ if (threads == 1 || m < 65536)
+ {
+ libsais_radix_sort_lms_suffixes_32s_6k(T, SA, induction_bucket, (fast_sint_t)n - (fast_sint_t)m + 1, (fast_sint_t)m - 1);
+ }
+#if defined(_OPENMP)
+ else
+ {
+ fast_sint_t block_start, block_end;
+ for (block_start = 0; block_start < (fast_sint_t)m - 1; block_start = block_end)
+ {
+ block_end = block_start + (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end >= m) { block_end = (fast_sint_t)m - 1; }
+
+ libsais_radix_sort_lms_suffixes_32s_6k_block_omp(T, SA, induction_bucket, thread_state[0].state.cache, (fast_sint_t)n - block_end, block_end - block_start, threads);
+ }
+ }
+#else
+ UNUSED(thread_state);
+#endif
+}
+
+/* Top-level driver for the 2k LMS radix sort; mirrors the 6k driver above:
+ * serial path for one thread or m < 65536, otherwise cache-sized chunks of
+ * SA[n - m + 1 .. n - 1] processed from the high end downward via the
+ * threaded 2k block routine. */
+static void libsais_radix_sort_lms_suffixes_32s_2k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+ if (threads == 1 || m < 65536)
+ {
+ libsais_radix_sort_lms_suffixes_32s_2k(T, SA, induction_bucket, (fast_sint_t)n - (fast_sint_t)m + 1, (fast_sint_t)m - 1);
+ }
+#if defined(_OPENMP)
+ else
+ {
+ fast_sint_t block_start, block_end;
+ for (block_start = 0; block_start < (fast_sint_t)m - 1; block_start = block_end)
+ {
+ block_end = block_start + (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end >= m) { block_end = (fast_sint_t)m - 1; }
+
+ libsais_radix_sort_lms_suffixes_32s_2k_block_omp(T, SA, induction_bucket, thread_state[0].state.cache, (fast_sint_t)n - block_end, block_end - block_start, threads);
+ }
+ }
+#else
+ UNUSED(thread_state);
+#endif
+}
+
+/* Single-pass (1k buckets) variant: detects LMS suffixes on the fly while
+ * scanning T right-to-left and bucket-places each one immediately.
+ * State: `s` is a sliding bit-history of suffix types, one bit per position
+ * (bit set = the suffix at that position compares greater than its successor,
+ * i.e. L-type, with equal symbols inheriting the previous bit via the
+ * `c? - (s & 1)` adjustment). `(s & 3) == 1` therefore means the current
+ * suffix is L-type and its successor is S-type — the successor is an LMS
+ * suffix, and it is placed at the pre-decremented bucket counter for its
+ * first symbol. c0/c1 alternate as "current"/"previous" symbol across the
+ * 4x-unrolled iterations; c2 remembers the bucket of the last placement.
+ * Returns m, the number of LMS suffixes placed. */
+static sa_sint_t libsais_radix_sort_lms_suffixes_32s_1k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ sa_sint_t i = n - 2;
+ sa_sint_t m = 0;
+ fast_uint_t s = 1;
+ fast_sint_t c0 = T[n - 1];
+ fast_sint_t c1 = 0;
+ fast_sint_t c2 = 0;
+
+ for (; i >= prefetch_distance + 3; i -= 4)
+ {
+ libsais_prefetch(&T[i - 2 * prefetch_distance]);
+
+ libsais_prefetchw(&buckets[T[i - prefetch_distance - 0]]);
+ libsais_prefetchw(&buckets[T[i - prefetch_distance - 1]]);
+ libsais_prefetchw(&buckets[T[i - prefetch_distance - 2]]);
+ libsais_prefetchw(&buckets[T[i - prefetch_distance - 3]]);
+
+ c1 = T[i - 0]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
+ if ((s & 3) == 1) { SA[--buckets[c2 = c0]] = i + 1; m++; }
+
+ c0 = T[i - 1]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+ if ((s & 3) == 1) { SA[--buckets[c2 = c1]] = i - 0; m++; }
+
+ c1 = T[i - 2]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
+ if ((s & 3) == 1) { SA[--buckets[c2 = c0]] = i - 1; m++; }
+
+ c0 = T[i - 3]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+ if ((s & 3) == 1) { SA[--buckets[c2 = c1]] = i - 2; m++; }
+ }
+
+ for (; i >= 0; i -= 1)
+ {
+ c1 = c0; c0 = T[i]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+ if ((s & 3) == 1) { SA[--buckets[c2 = c1]] = i + 1; m++; }
+ }
+
+ /* NOTE(review): overwrites the slot of the last-placed (leftmost-in-text)
+ * LMS suffix with 0 when more than one exists — presumably a sentinel for
+ * the next phase; confirm against the caller before changing. */
+ if (m > 1)
+ {
+ SA[buckets[c2]] = 0;
+ }
+
+ return m;
+}
+
+/* For each symbol index i in the block, sets the sign bit (SAINT_MIN) of the
+ * SA entry located at induction_bucket[i] — i.e. tags one SA position per
+ * bucket boundary. 4x unrolled; prefetches the bucket array (read) and the
+ * SA targets (write) ahead of use. */
+static void libsais_radix_sort_set_markers_32s_6k(sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ fast_sint_t i, j;
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4)
+ {
+ libsais_prefetch(&induction_bucket[i + 2 * prefetch_distance]);
+
+ libsais_prefetchw(&SA[induction_bucket[i + prefetch_distance + 0]]);
+ libsais_prefetchw(&SA[induction_bucket[i + prefetch_distance + 1]]);
+ libsais_prefetchw(&SA[induction_bucket[i + prefetch_distance + 2]]);
+ libsais_prefetchw(&SA[induction_bucket[i + prefetch_distance + 3]]);
+
+ SA[induction_bucket[i + 0]] |= SAINT_MIN;
+ SA[induction_bucket[i + 1]] |= SAINT_MIN;
+ SA[induction_bucket[i + 2]] |= SAINT_MIN;
+ SA[induction_bucket[i + 3]] |= SAINT_MIN;
+ }
+
+ for (j += prefetch_distance + 3; i < j; i += 1)
+ {
+ SA[induction_bucket[i]] |= SAINT_MIN;
+ }
+}
+
+/* 4k-layout counterpart of the marker pass above: bucket counters are
+ * addressed via BUCKETS_INDEX2(i, 0) (two slots per symbol) and the tag ORed
+ * into the SA entry is SUFFIX_GROUP_MARKER rather than the sign bit. */
+static void libsais_radix_sort_set_markers_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ fast_sint_t i, j;
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4)
+ {
+ libsais_prefetch(&induction_bucket[BUCKETS_INDEX2(i + 2 * prefetch_distance, 0)]);
+
+ libsais_prefetchw(&SA[induction_bucket[BUCKETS_INDEX2(i + prefetch_distance + 0, 0)]]);
+ libsais_prefetchw(&SA[induction_bucket[BUCKETS_INDEX2(i + prefetch_distance + 1, 0)]]);
+ libsais_prefetchw(&SA[induction_bucket[BUCKETS_INDEX2(i + prefetch_distance + 2, 0)]]);
+ libsais_prefetchw(&SA[induction_bucket[BUCKETS_INDEX2(i + prefetch_distance + 3, 0)]]);
+
+ SA[induction_bucket[BUCKETS_INDEX2(i + 0, 0)]] |= SUFFIX_GROUP_MARKER;
+ SA[induction_bucket[BUCKETS_INDEX2(i + 1, 0)]] |= SUFFIX_GROUP_MARKER;
+ SA[induction_bucket[BUCKETS_INDEX2(i + 2, 0)]] |= SUFFIX_GROUP_MARKER;
+ SA[induction_bucket[BUCKETS_INDEX2(i + 3, 0)]] |= SUFFIX_GROUP_MARKER;
+ }
+
+ for (j += prefetch_distance + 3; i < j; i += 1)
+ {
+ SA[induction_bucket[BUCKETS_INDEX2(i, 0)]] |= SUFFIX_GROUP_MARKER;
+ }
+}
+
+/* Parallel wrapper for the 6k marker pass: splits the symbol range
+ * [0, k - 1) across the OpenMP team (per-thread stride rounded down to a
+ * multiple of 16, last thread taking the remainder) and runs the serial
+ * marker routine on each slice. Falls back to a single full-range call when
+ * OpenMP is disabled or the parallel `if` clause rejects the region. */
+static void libsais_radix_sort_set_markers_32s_6k_omp(sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads)
+{
+#if defined(_OPENMP)
+ #pragma omp parallel num_threads(threads) if(threads > 1 && k >= 65536)
+#endif
+ {
+ fast_sint_t slice_start, slice_size;
+#if defined(_OPENMP)
+ fast_sint_t tid = omp_get_thread_num();
+ fast_sint_t team_size = omp_get_num_threads();
+ fast_sint_t total = (fast_sint_t)k - 1;
+ fast_sint_t stride = (total / team_size) & (-16); /* multiple of 16 */
+
+ slice_start = tid * stride;
+ if (tid < team_size - 1)
+ {
+ slice_size = stride;
+ }
+ else
+ {
+ slice_size = total - slice_start; /* last thread takes the tail */
+ }
+#else
+ UNUSED(threads);
+
+ slice_start = 0;
+ slice_size = (fast_sint_t)k - 1;
+#endif
+
+ libsais_radix_sort_set_markers_32s_6k(SA, induction_bucket, slice_start, slice_size);
+ }
+}
+
+/* Parallel wrapper for the 4k marker pass; identical work-splitting scheme
+ * to the 6k wrapper: the symbol range [0, k - 1) is divided into 16-aligned
+ * per-thread slices, the last thread absorbing the remainder, and each slice
+ * is handled by the serial 4k marker routine. */
+static void libsais_radix_sort_set_markers_32s_4k_omp(sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads)
+{
+#if defined(_OPENMP)
+ #pragma omp parallel num_threads(threads) if(threads > 1 && k >= 65536)
+#endif
+ {
+ fast_sint_t slice_start, slice_size;
+#if defined(_OPENMP)
+ fast_sint_t tid = omp_get_thread_num();
+ fast_sint_t team_size = omp_get_num_threads();
+ fast_sint_t total = (fast_sint_t)k - 1;
+ fast_sint_t stride = (total / team_size) & (-16); /* multiple of 16 */
+
+ slice_start = tid * stride;
+ if (tid < team_size - 1)
+ {
+ slice_size = stride;
+ }
+ else
+ {
+ slice_size = total - slice_start; /* last thread takes the tail */
+ }
+#else
+ UNUSED(threads);
+
+ slice_start = 0;
+ slice_size = (fast_sint_t)k - 1;
+#endif
+
+ libsais_radix_sort_set_markers_32s_4k(SA, induction_bucket, slice_start, slice_size);
+ }
+}
+
+/* Converts the 4-counters-per-symbol histogram in buckets[0 .. 4*ALPHABET_SIZE)
+ * into prefix-summed bucket offsets for the partial sorting scans.
+ * The counter at BUCKETS_INDEX4(c, 1) for the first LMS suffix's symbol is
+ * bumped first so that suffix is accounted for. sum0 starts one past the
+ * left-suffix region (left_suffixes_count + 1) and accumulates the 0- and
+ * 2-slot counters; sum1 accumulates the 1-slot counters. The running sums
+ * are written both into temp_bucket (sum0 before adding symbol c — the
+ * bucket's start) and into buckets at 2-slot indices (sum0/sum1 after —
+ * the bucket's end). NOTE(review): exact type meaning of slots 0..3
+ * (S/L sub-buckets) is defined where the histogram is built; not visible here. */
+static void libsais_initialize_buckets_for_partial_sorting_8u(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count)
+{
+ sa_sint_t * RESTRICT temp_bucket = &buckets[4 * ALPHABET_SIZE];
+
+ buckets[BUCKETS_INDEX4((fast_uint_t)T[first_lms_suffix], 1)]++;
+
+ fast_sint_t i, j; sa_sint_t sum0 = left_suffixes_count + 1, sum1 = 0;
+ for (i = BUCKETS_INDEX4(0, 0), j = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX4(ALPHABET_SIZE - 1, 0); i += BUCKETS_INDEX4(1, 0), j += BUCKETS_INDEX2(1, 0))
+ {
+ temp_bucket[j + BUCKETS_INDEX2(0, 0)] = sum0;
+
+ sum0 += buckets[i + BUCKETS_INDEX4(0, 0)] + buckets[i + BUCKETS_INDEX4(0, 2)];
+ sum1 += buckets[i + BUCKETS_INDEX4(0, 1)];
+
+ buckets[j + BUCKETS_INDEX2(0, 0)] = sum0;
+ buckets[j + BUCKETS_INDEX2(0, 1)] = sum1;
+ }
+}
+
+/* 32-bit/6k counterpart of the bucket initialization above. Reads the four
+ * per-symbol counters (named SS/LS/SL/LL here after their slot positions),
+ * rewrites slots 0 and 1 in place with the running sums (slot 0 = sum0, the
+ * offset past the left-suffix region; slot 1 = sum2), zeroes slots 2 and 3,
+ * and stores sum0/sum1 into temp_bucket at 2-slot indices. The loop is split
+ * at the symbol of the first LMS suffix: from that symbol onward sum1 is
+ * advanced by one extra (sum1 += 1) — presumably to reserve a slot for the
+ * first LMS suffix itself; confirm against the consuming scan. */
+static void libsais_initialize_buckets_for_partial_sorting_32s_6k(const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count)
+{
+ sa_sint_t * RESTRICT temp_bucket = &buckets[4 * k];
+
+ fast_sint_t i, j; sa_sint_t sum0 = left_suffixes_count + 1, sum1 = 0, sum2 = 0;
+ for (first_lms_suffix = T[first_lms_suffix], i = BUCKETS_INDEX4(0, 0), j = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX4((fast_sint_t)first_lms_suffix - 1, 0); i += BUCKETS_INDEX4(1, 0), j += BUCKETS_INDEX2(1, 0))
+ {
+ sa_sint_t SS = buckets[i + BUCKETS_INDEX4(0, 0)];
+ sa_sint_t LS = buckets[i + BUCKETS_INDEX4(0, 1)];
+ sa_sint_t SL = buckets[i + BUCKETS_INDEX4(0, 2)];
+ sa_sint_t LL = buckets[i + BUCKETS_INDEX4(0, 3)];
+
+ buckets[i + BUCKETS_INDEX4(0, 0)] = sum0;
+ buckets[i + BUCKETS_INDEX4(0, 1)] = sum2;
+ buckets[i + BUCKETS_INDEX4(0, 2)] = 0;
+ buckets[i + BUCKETS_INDEX4(0, 3)] = 0;
+
+ sum0 += SS + SL; sum1 += LS; sum2 += LS + LL;
+
+ temp_bucket[j + BUCKETS_INDEX2(0, 0)] = sum0;
+ temp_bucket[j + BUCKETS_INDEX2(0, 1)] = sum1;
+ }
+
+ for (sum1 += 1; i <= BUCKETS_INDEX4((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX4(1, 0), j += BUCKETS_INDEX2(1, 0))
+ {
+ sa_sint_t SS = buckets[i + BUCKETS_INDEX4(0, 0)];
+ sa_sint_t LS = buckets[i + BUCKETS_INDEX4(0, 1)];
+ sa_sint_t SL = buckets[i + BUCKETS_INDEX4(0, 2)];
+ sa_sint_t LL = buckets[i + BUCKETS_INDEX4(0, 3)];
+
+ buckets[i + BUCKETS_INDEX4(0, 0)] = sum0;
+ buckets[i + BUCKETS_INDEX4(0, 1)] = sum2;
+ buckets[i + BUCKETS_INDEX4(0, 2)] = 0;
+ buckets[i + BUCKETS_INDEX4(0, 3)] = 0;
+
+ sum0 += SS + SL; sum1 += LS; sum2 += LS + LL;
+
+ temp_bucket[j + BUCKETS_INDEX2(0, 0)] = sum0;
+ temp_bucket[j + BUCKETS_INDEX2(0, 1)] = sum1;
+ }
+}
+
+/* Left-to-right induction scan over SA[omp_block_start ..): for each entry p
+ * (sign bit = start of a new rank group, so d is incremented when p < 0),
+ * the preceding suffix p - 1 is appended to bucket
+ * v = BUCKETS_INDEX2(T[p-1], T[p-2] >= T[p-1]) at induction_bucket[v]++.
+ * The written entry carries the sign bit iff this bucket has not yet seen
+ * group d (distinct_names[v] != d), propagating group boundaries; the
+ * bucket's last-seen group is then updated. 2x unrolled with prefetch of SA
+ * and of T around each suffix. Returns the advanced group counter d. */
+static sa_sint_t libsais_partial_sorting_scan_left_to_right_8u(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ sa_sint_t * RESTRICT induction_bucket = &buckets[4 * ALPHABET_SIZE];
+ sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE];
+
+ fast_sint_t i, j;
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2)
+ {
+ libsais_prefetch(&SA[i + 2 * prefetch_distance]);
+
+ libsais_prefetch(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 1);
+ libsais_prefetch(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 2);
+ libsais_prefetch(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 1);
+ libsais_prefetch(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 2);
+
+ sa_sint_t p0 = SA[i + 0]; d += (p0 < 0); p0 &= SAINT_MAX; sa_sint_t v0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] >= T[p0 - 1]);
+ SA[induction_bucket[v0]++] = (p0 - 1) | ((sa_sint_t)(distinct_names[v0] != d) << (SAINT_BIT - 1)); distinct_names[v0] = d;
+
+ sa_sint_t p1 = SA[i + 1]; d += (p1 < 0); p1 &= SAINT_MAX; sa_sint_t v1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] >= T[p1 - 1]);
+ SA[induction_bucket[v1]++] = (p1 - 1) | ((sa_sint_t)(distinct_names[v1] != d) << (SAINT_BIT - 1)); distinct_names[v1] = d;
+ }
+
+ for (j += prefetch_distance + 1; i < j; i += 1)
+ {
+ sa_sint_t p = SA[i]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] >= T[p - 1]);
+ SA[induction_bucket[v]++] = (p - 1) | ((sa_sint_t)(distinct_names[v] != d) << (SAINT_BIT - 1)); distinct_names[v] = d;
+ }
+
+ return d;
+}
+
+#if defined(_OPENMP)
+
+/* Per-thread prepare phase of the parallel left-to-right scan: replays the
+ * same symbol/group computation as the serial scan, but instead of writing
+ * into SA it (a) records each (index, symbol) pair in the thread cache,
+ * (b) counts bucket occupancy in the thread-private `buckets` (zeroed here),
+ * and (c) tracks the local group counter d starting from 1. The final local
+ * d - 1 and the cached entry count are saved in the thread state so the
+ * master thread can merge counts and rebase d before the place phase. */
+static void libsais_partial_sorting_scan_left_to_right_8u_block_prepare(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size, LIBSAIS_THREAD_STATE * RESTRICT state)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE];
+ sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE];
+
+ memset(buckets, 0, 4 * ALPHABET_SIZE * sizeof(sa_sint_t));
+
+ fast_sint_t i, j, count = 0; sa_sint_t d = 1;
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2)
+ {
+ libsais_prefetch(&SA[i + 2 * prefetch_distance]);
+
+ libsais_prefetch(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 1);
+ libsais_prefetch(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 2);
+ libsais_prefetch(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 1);
+ libsais_prefetch(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 2);
+
+ sa_sint_t p0 = cache[count].index = SA[i + 0]; d += (p0 < 0); p0 &= SAINT_MAX; sa_sint_t v0 = cache[count++].symbol = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] >= T[p0 - 1]); induction_bucket[v0]++; distinct_names[v0] = d;
+ sa_sint_t p1 = cache[count].index = SA[i + 1]; d += (p1 < 0); p1 &= SAINT_MAX; sa_sint_t v1 = cache[count++].symbol = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] >= T[p1 - 1]); induction_bucket[v1]++; distinct_names[v1] = d;
+ }
+
+ for (j += prefetch_distance + 1; i < j; i += 1)
+ {
+ sa_sint_t p = cache[count].index = SA[i]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = cache[count++].symbol = BUCKETS_INDEX2(T[p - 1], T[p - 2] >= T[p - 1]); induction_bucket[v]++; distinct_names[v] = d;
+ }
+
+ state[0].state.position = (fast_sint_t)d - 1;
+ state[0].state.count = count;
+}
+
+/* Per-thread place phase: replays the thread's cached (index, symbol) pairs
+ * and performs the actual SA writes, using the thread's bucket offsets (as
+ * rebased by the master merge step) and the rebased group counter d. The
+ * write — position, sign-bit group boundary, distinct_names update — matches
+ * the serial scan exactly. 2x unrolled with cache prefetch. */
+static void libsais_partial_sorting_scan_left_to_right_8u_block_place(sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count, sa_sint_t d)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE];
+ sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE];
+
+ fast_sint_t i, j;
+ for (i = 0, j = count - 1; i < j; i += 2)
+ {
+ libsais_prefetch(&cache[i + prefetch_distance]);
+
+ sa_sint_t p0 = cache[i + 0].index; d += (p0 < 0); sa_sint_t v0 = cache[i + 0].symbol;
+ SA[induction_bucket[v0]++] = (p0 - 1) | ((sa_sint_t)(distinct_names[v0] != d) << (SAINT_BIT - 1)); distinct_names[v0] = d;
+
+ sa_sint_t p1 = cache[i + 1].index; d += (p1 < 0); sa_sint_t v1 = cache[i + 1].symbol;
+ SA[induction_bucket[v1]++] = (p1 - 1) | ((sa_sint_t)(distinct_names[v1] != d) << (SAINT_BIT - 1)); distinct_names[v1] = d;
+ }
+
+ for (j += 1; i < j; i += 1)
+ {
+ sa_sint_t p = cache[i].index; d += (p < 0); sa_sint_t v = cache[i].symbol;
+ SA[induction_bucket[v]++] = (p - 1) | ((sa_sint_t)(distinct_names[v] != d) << (SAINT_BIT - 1)); distinct_names[v] = d;
+ }
+}
+
+/* Parallel left-to-right induction over one SA block:
+ *   1. every thread runs the prepare phase on its slice (private bucket
+ *      counts + cached symbols, local d);
+ *   2. barrier; the master serializes a prefix-sum merge: for each thread t,
+ *      global induction_bucket counts are advanced by t's private counts
+ *      while t's private counters are rewritten to the global offsets its
+ *      slice starts at; distinct_names are similarly rebased onto the global
+ *      d (a private name B > 0 becomes B + d - 1, else the prior global
+ *      value is kept), and t's starting d is stored back in its state;
+ *   3. barrier; every thread runs the place phase with its rebased state.
+ * The statement order in the master loop is load-bearing (A/B swap pattern);
+ * do not reorder. Returns the final global d. Falls back to the serial scan
+ * when the team ends up with a single thread. */
+static sa_sint_t libsais_partial_sorting_scan_left_to_right_8u_block_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+#if defined(_OPENMP)
+ #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * ALPHABET_SIZE && omp_get_dynamic() == 0)
+#endif
+ {
+#if defined(_OPENMP)
+ fast_sint_t omp_thread_num = omp_get_thread_num();
+ fast_sint_t omp_num_threads = omp_get_num_threads();
+#else
+ UNUSED(threads); UNUSED(thread_state);
+
+ fast_sint_t omp_thread_num = 0;
+ fast_sint_t omp_num_threads = 1;
+#endif
+ fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
+ fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+ fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
+
+ omp_block_start += block_start;
+
+ if (omp_num_threads == 1)
+ {
+ d = libsais_partial_sorting_scan_left_to_right_8u(T, SA, buckets, d, omp_block_start, omp_block_size);
+ }
+#if defined(_OPENMP)
+ else
+ {
+ {
+ libsais_partial_sorting_scan_left_to_right_8u_block_prepare(T, SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size, &thread_state[omp_thread_num]);
+ }
+
+ #pragma omp barrier
+
+ #pragma omp master
+ {
+ sa_sint_t * RESTRICT induction_bucket = &buckets[4 * ALPHABET_SIZE];
+ sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE];
+
+ fast_sint_t t;
+ for (t = 0; t < omp_num_threads; ++t)
+ {
+ sa_sint_t * RESTRICT temp_induction_bucket = &thread_state[t].state.buckets[0 * ALPHABET_SIZE];
+ sa_sint_t * RESTRICT temp_distinct_names = &thread_state[t].state.buckets[2 * ALPHABET_SIZE];
+
+ fast_sint_t c;
+ for (c = 0; c < 2 * ALPHABET_SIZE; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_induction_bucket[c]; induction_bucket[c] = A + B; temp_induction_bucket[c] = A; }
+
+ for (d -= 1, c = 0; c < 2 * ALPHABET_SIZE; c += 1) { sa_sint_t A = distinct_names[c], B = temp_distinct_names[c], D = B + d; distinct_names[c] = B > 0 ? D : A; temp_distinct_names[c] = A; }
+ d += 1 + (sa_sint_t)thread_state[t].state.position; thread_state[t].state.position = (fast_sint_t)d - thread_state[t].state.position;
+ }
+ }
+
+ #pragma omp barrier
+
+ {
+ libsais_partial_sorting_scan_left_to_right_8u_block_place(SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count, (sa_sint_t)thread_state[omp_thread_num].state.position);
+ }
+ }
+#endif
+ }
+
+ return d;
+}
+
+#endif
+
+/* Driver for the left-to-right partial sorting scan over the 8-bit alphabet.
+ * Seeds the induction with the last suffix n - 1 (sign bit set, and its
+ * bucket gets a fresh group ++d), then scans SA[0 .. left_suffixes_count).
+ * Serial path for one thread or small inputs. The parallel path walks SA in
+ * chunks, but a chunk may only start at a non-empty slot and must stop at
+ * the first empty (== 0) slot inside it — entries after an empty slot can
+ * depend on values this very scan has yet to induce, so they cannot be
+ * pre-gathered. Tiny chunks (< 32) are processed inline serially instead of
+ * paying the parallel setup cost. Returns the final group counter d. */
+static sa_sint_t libsais_partial_sorting_scan_left_to_right_8u_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t left_suffixes_count, sa_sint_t d, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+ sa_sint_t * RESTRICT induction_bucket = &buckets[4 * ALPHABET_SIZE];
+ sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE];
+
+ SA[induction_bucket[BUCKETS_INDEX2(T[n - 1], T[n - 2] >= T[n - 1])]++] = (n - 1) | SAINT_MIN;
+ distinct_names[BUCKETS_INDEX2(T[n - 1], T[n - 2] >= T[n - 1])] = ++d;
+
+ if (threads == 1 || left_suffixes_count < 65536)
+ {
+ d = libsais_partial_sorting_scan_left_to_right_8u(T, SA, buckets, d, 0, left_suffixes_count);
+ }
+#if defined(_OPENMP)
+ else
+ {
+ fast_sint_t block_start;
+ for (block_start = 0; block_start < left_suffixes_count; )
+ {
+ if (SA[block_start] == 0)
+ {
+ block_start++;
+ }
+ else
+ {
+ fast_sint_t block_max_end = block_start + ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads); if (block_max_end > left_suffixes_count) { block_max_end = left_suffixes_count;}
+ fast_sint_t block_end = block_start + 1; while (block_end < block_max_end && SA[block_end] != 0) { block_end++; }
+ fast_sint_t block_size = block_end - block_start;
+
+ if (block_size < 32)
+ {
+ for (; block_start < block_end; block_start += 1)
+ {
+ sa_sint_t p = SA[block_start]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] >= T[p - 1]);
+ SA[induction_bucket[v]++] = (p - 1) | ((sa_sint_t)(distinct_names[v] != d) << (SAINT_BIT - 1)); distinct_names[v] = d;
+ }
+ }
+ else
+ {
+ d = libsais_partial_sorting_scan_left_to_right_8u_block_omp(T, SA, buckets, d, block_start, block_size, threads, thread_state);
+ block_start = block_end;
+ }
+ }
+ }
+ }
+#else
+ UNUSED(thread_state);
+#endif
+
+ return d;
+}
+
+/* 32-bit/6k left-to-right induction scan. Same contract as the 8u scan —
+ * d counts rank groups (incremented on sign-bit entries), each suffix p - 1
+ * is appended to bucket BUCKETS_INDEX4(T[p-1], T[p-2] >= T[p-1]) with the
+ * sign bit marking the first entry of group d in that bucket; the bucket's
+ * last-seen group lives in the adjacent slot (buckets[2 + v]). Uses a deeper
+ * two-stage prefetch pipeline: SA at 3x distance, T at 2x, and the bucket
+ * cells themselves at 1x (computed from a masked, zero-guarded index). */
+static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_6k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ fast_sint_t i, j;
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1; i < j; i += 2)
+ {
+ libsais_prefetch(&SA[i + 3 * prefetch_distance]);
+
+ libsais_prefetch(&T[SA[i + 2 * prefetch_distance + 0] & SAINT_MAX] - 1);
+ libsais_prefetch(&T[SA[i + 2 * prefetch_distance + 0] & SAINT_MAX] - 2);
+ libsais_prefetch(&T[SA[i + 2 * prefetch_distance + 1] & SAINT_MAX] - 1);
+ libsais_prefetch(&T[SA[i + 2 * prefetch_distance + 1] & SAINT_MAX] - 2);
+
+ sa_sint_t p0 = SA[i + prefetch_distance + 0] & SAINT_MAX; sa_sint_t v0 = BUCKETS_INDEX4(T[p0 - (p0 > 0)], 0); libsais_prefetchw(&buckets[v0]);
+ sa_sint_t p1 = SA[i + prefetch_distance + 1] & SAINT_MAX; sa_sint_t v1 = BUCKETS_INDEX4(T[p1 - (p1 > 0)], 0); libsais_prefetchw(&buckets[v1]);
+
+ sa_sint_t p2 = SA[i + 0]; d += (p2 < 0); p2 &= SAINT_MAX; sa_sint_t v2 = BUCKETS_INDEX4(T[p2 - 1], T[p2 - 2] >= T[p2 - 1]);
+ SA[buckets[v2]++] = (p2 - 1) | ((sa_sint_t)(buckets[2 + v2] != d) << (SAINT_BIT - 1)); buckets[2 + v2] = d;
+
+ sa_sint_t p3 = SA[i + 1]; d += (p3 < 0); p3 &= SAINT_MAX; sa_sint_t v3 = BUCKETS_INDEX4(T[p3 - 1], T[p3 - 2] >= T[p3 - 1]);
+ SA[buckets[v3]++] = (p3 - 1) | ((sa_sint_t)(buckets[2 + v3] != d) << (SAINT_BIT - 1)); buckets[2 + v3] = d;
+ }
+
+ for (j += 2 * prefetch_distance + 1; i < j; i += 1)
+ {
+ sa_sint_t p = SA[i]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = BUCKETS_INDEX4(T[p - 1], T[p - 2] >= T[p - 1]);
+ SA[buckets[v]++] = (p - 1) | ((sa_sint_t)(buckets[2 + v] != d) << (SAINT_BIT - 1)); buckets[2 + v] = d;
+ }
+
+ return d;
+}
+
+/* 32-bit/4k left-to-right induction scan. Entries here carry two tags:
+ * SUFFIX_GROUP_MARKER (advances d via the p >> (SUFFIX_GROUP_BIT - 1) add)
+ * and the sign bit. Only positive entries are induced: each is cleared from
+ * its slot (SA[i] = 0 after stripping tags), and suffix p - 1 is written at
+ * induction_bucket[T[p-1]]++ carrying (a) the sign bit iff T[p-2] < T[p-1]
+ * and (b) a fresh group marker iff bucket v = BUCKETS_INDEX2(T[p-1],
+ * T[p-2] < T[p-1]) has not seen group d yet. Non-positive entries are just
+ * stripped to p & SAINT_MAX in place. Prefetch uses NULL-guarded pointers
+ * so speculative loads never dereference out of range. */
+static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_4k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ sa_sint_t * RESTRICT induction_bucket = &buckets[2 * k];
+ sa_sint_t * RESTRICT distinct_names = &buckets[0 * k];
+
+ fast_sint_t i, j;
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1; i < j; i += 2)
+ {
+ libsais_prefetchw(&SA[i + 3 * prefetch_distance]);
+
+ sa_sint_t s0 = SA[i + 2 * prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0 & ~SUFFIX_GROUP_MARKER] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL);
+ sa_sint_t s1 = SA[i + 2 * prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1 & ~SUFFIX_GROUP_MARKER] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? Ts1 : NULL);
+ sa_sint_t s2 = SA[i + 1 * prefetch_distance + 0]; if (s2 > 0) { const fast_sint_t Ts2 = T[(s2 & ~SUFFIX_GROUP_MARKER) - 1]; libsais_prefetchw(&induction_bucket[Ts2]); libsais_prefetchw(&distinct_names[BUCKETS_INDEX2(Ts2, 0)]); }
+ sa_sint_t s3 = SA[i + 1 * prefetch_distance + 1]; if (s3 > 0) { const fast_sint_t Ts3 = T[(s3 & ~SUFFIX_GROUP_MARKER) - 1]; libsais_prefetchw(&induction_bucket[Ts3]); libsais_prefetchw(&distinct_names[BUCKETS_INDEX2(Ts3, 0)]); }
+
+ sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 & SAINT_MAX;
+ if (p0 > 0)
+ {
+ SA[i + 0] = 0; d += (p0 >> (SUFFIX_GROUP_BIT - 1)); p0 &= ~SUFFIX_GROUP_MARKER; sa_sint_t v0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] < T[p0 - 1]);
+ SA[induction_bucket[T[p0 - 1]]++] = (p0 - 1) | ((sa_sint_t)(T[p0 - 2] < T[p0 - 1]) << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v0] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v0] = d;
+ }
+
+ sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 & SAINT_MAX;
+ if (p1 > 0)
+ {
+ SA[i + 1] = 0; d += (p1 >> (SUFFIX_GROUP_BIT - 1)); p1 &= ~SUFFIX_GROUP_MARKER; sa_sint_t v1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] < T[p1 - 1]);
+ SA[induction_bucket[T[p1 - 1]]++] = (p1 - 1) | ((sa_sint_t)(T[p1 - 2] < T[p1 - 1]) << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v1] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v1] = d;
+ }
+ }
+
+ for (j += 2 * prefetch_distance + 1; i < j; i += 1)
+ {
+ sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX;
+ if (p > 0)
+ {
+ SA[i] = 0; d += (p >> (SUFFIX_GROUP_BIT - 1)); p &= ~SUFFIX_GROUP_MARKER; sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] < T[p - 1]);
+ SA[induction_bucket[T[p - 1]]++] = (p - 1) | ((sa_sint_t)(T[p - 2] < T[p - 1]) << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v] = d;
+ }
+ }
+
+ return d;
+}
+
+/* Minimal-memory (1k buckets) left-to-right induction scan: no group
+ * counting. Positive entries p are consumed (slot zeroed) and suffix p - 1
+ * is appended at induction_bucket[T[p-1]]++, its sign bit set iff
+ * T[p-2] < T[p-1]; non-positive entries are merely stripped of the sign bit
+ * in place. Prefetch pipeline mirrors the 4k scan (NULL-guarded T loads,
+ * conditional bucket prefetch). */
+static void libsais_partial_sorting_scan_left_to_right_32s_1k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ fast_sint_t i, j;
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1; i < j; i += 2)
+ {
+ libsais_prefetchw(&SA[i + 3 * prefetch_distance]);
+
+ sa_sint_t s0 = SA[i + 2 * prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL);
+ sa_sint_t s1 = SA[i + 2 * prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL);
+ sa_sint_t s2 = SA[i + 1 * prefetch_distance + 0]; if (s2 > 0) { libsais_prefetchw(&induction_bucket[T[s2 - 1]]); libsais_prefetch(&T[s2] - 2); }
+ sa_sint_t s3 = SA[i + 1 * prefetch_distance + 1]; if (s3 > 0) { libsais_prefetchw(&induction_bucket[T[s3 - 1]]); libsais_prefetch(&T[s3] - 2); }
+
+ sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 & SAINT_MAX; if (p0 > 0) { SA[i + 0] = 0; SA[induction_bucket[T[p0 - 1]]++] = (p0 - 1) | ((sa_sint_t)(T[p0 - 2] < T[p0 - 1]) << (SAINT_BIT - 1)); }
+ sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 & SAINT_MAX; if (p1 > 0) { SA[i + 1] = 0; SA[induction_bucket[T[p1 - 1]]++] = (p1 - 1) | ((sa_sint_t)(T[p1 - 2] < T[p1 - 1]) << (SAINT_BIT - 1)); }
+ }
+
+ for (j += 2 * prefetch_distance + 1; i < j; i += 1)
+ {
+ sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { SA[i] = 0; SA[induction_bucket[T[p - 1]]++] = (p - 1) | ((sa_sint_t)(T[p - 2] < T[p - 1]) << (SAINT_BIT - 1)); }
+ }
+}
+
+#if defined(_OPENMP)
+
+/* Gather phase for the parallel 32s/6k left-to-right scan: caches each SA
+ * entry (with its tag bits intact, cache[i].index = SA[i]) together with the
+ * bucket symbol BUCKETS_INDEX4(T[p-1], T[p-2] >= T[p-1]) of the suffix it
+ * will induce; entries whose masked index is 0 get symbol 0. No SA or bucket
+ * state is modified here. */
+static void libsais_partial_sorting_scan_left_to_right_32s_6k_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ fast_sint_t i, j;
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2)
+ {
+ libsais_prefetch(&SA[i + 2 * prefetch_distance]);
+
+ libsais_prefetch(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 1);
+ libsais_prefetch(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 2);
+ libsais_prefetch(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 1);
+ libsais_prefetch(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 2);
+
+ libsais_prefetchw(&cache[i + prefetch_distance]);
+
+ sa_sint_t p0 = cache[i + 0].index = SA[i + 0]; sa_sint_t symbol0 = 0; p0 &= SAINT_MAX; if (p0 != 0) { symbol0 = BUCKETS_INDEX4(T[p0 - 1], T[p0 - 2] >= T[p0 - 1]); } cache[i + 0].symbol = symbol0;
+ sa_sint_t p1 = cache[i + 1].index = SA[i + 1]; sa_sint_t symbol1 = 0; p1 &= SAINT_MAX; if (p1 != 0) { symbol1 = BUCKETS_INDEX4(T[p1 - 1], T[p1 - 2] >= T[p1 - 1]); } cache[i + 1].symbol = symbol1;
+ }
+
+ for (j += prefetch_distance + 1; i < j; i += 1)
+ {
+ sa_sint_t p = cache[i].index = SA[i]; sa_sint_t symbol = 0; p &= SAINT_MAX; if (p != 0) { symbol = BUCKETS_INDEX4(T[p - 1], T[p - 2] >= T[p - 1]); } cache[i].symbol = symbol;
+ }
+}
+
+static void libsais_partial_sorting_scan_left_to_right_32s_4k_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ fast_sint_t i, j;
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2)
+ {
+ libsais_prefetchw(&SA[i + 2 * prefetch_distance]);
+
+ sa_sint_t s0 = SA[i + prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0 & ~SUFFIX_GROUP_MARKER] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL);
+ sa_sint_t s1 = SA[i + prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1 & ~SUFFIX_GROUP_MARKER] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? Ts1 : NULL);
+
+ libsais_prefetchw(&cache[i + prefetch_distance]);
+
+ sa_sint_t symbol0 = SAINT_MIN, p0 = SA[i + 0]; if (p0 > 0) { cache[i + 0].index = p0; p0 &= ~SUFFIX_GROUP_MARKER; symbol0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] < T[p0 - 1]); p0 = 0; } cache[i + 0].symbol = symbol0; SA[i + 0] = p0 & SAINT_MAX;
+ sa_sint_t symbol1 = SAINT_MIN, p1 = SA[i + 1]; if (p1 > 0) { cache[i + 1].index = p1; p1 &= ~SUFFIX_GROUP_MARKER; symbol1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] < T[p1 - 1]); p1 = 0; } cache[i + 1].symbol = symbol1; SA[i + 1] = p1 & SAINT_MAX;
+ }
+
+ for (j += prefetch_distance + 1; i < j; i += 1)
+ {
+ sa_sint_t symbol = SAINT_MIN, p = SA[i]; if (p > 0) { cache[i].index = p; p &= ~SUFFIX_GROUP_MARKER; symbol = BUCKETS_INDEX2(T[p - 1], T[p - 2] < T[p - 1]); p = 0; } cache[i].symbol = symbol; SA[i] = p & SAINT_MAX;
+ }
+}
+
+static void libsais_partial_sorting_scan_left_to_right_32s_1k_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ fast_sint_t i, j;
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2)
+ {
+ libsais_prefetchw(&SA[i + 2 * prefetch_distance]);
+
+ sa_sint_t s0 = SA[i + prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL);
+ sa_sint_t s1 = SA[i + prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? Ts1 : NULL);
+
+ libsais_prefetchw(&cache[i + prefetch_distance]);
+
+ sa_sint_t symbol0 = SAINT_MIN, p0 = SA[i + 0]; if (p0 > 0) { cache[i + 0].index = (p0 - 1) | ((sa_sint_t)(T[p0 - 2] < T[p0 - 1]) << (SAINT_BIT - 1)); symbol0 = T[p0 - 1]; p0 = 0; } cache[i + 0].symbol = symbol0; SA[i + 0] = p0 & SAINT_MAX;
+ sa_sint_t symbol1 = SAINT_MIN, p1 = SA[i + 1]; if (p1 > 0) { cache[i + 1].index = (p1 - 1) | ((sa_sint_t)(T[p1 - 2] < T[p1 - 1]) << (SAINT_BIT - 1)); symbol1 = T[p1 - 1]; p1 = 0; } cache[i + 1].symbol = symbol1; SA[i + 1] = p1 & SAINT_MAX;
+ }
+
+ for (j += prefetch_distance + 1; i < j; i += 1)
+ {
+ sa_sint_t symbol = SAINT_MIN, p = SA[i]; if (p > 0) { cache[i].index = (p - 1) | ((sa_sint_t)(T[p - 2] < T[p - 1]) << (SAINT_BIT - 1)); symbol = T[p - 1]; p = 0; } cache[i].symbol = symbol; SA[i] = p & SAINT_MAX;
+ }
+}
+
+static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_6k_block_sort(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ fast_sint_t i, j, omp_block_end = omp_block_start + omp_block_size;
+ for (i = omp_block_start, j = omp_block_end - prefetch_distance - 1; i < j; i += 2)
+ {
+ libsais_prefetchw(&cache[i + 2 * prefetch_distance]);
+
+ libsais_prefetchw(&buckets[cache[i + prefetch_distance + 0].symbol]);
+ libsais_prefetchw(&buckets[cache[i + prefetch_distance + 1].symbol]);
+
+ sa_sint_t v0 = cache[i + 0].symbol, p0 = cache[i + 0].index; d += (p0 < 0); cache[i + 0].symbol = buckets[v0]++; cache[i + 0].index = (p0 - 1) | ((sa_sint_t)(buckets[2 + v0] != d) << (SAINT_BIT - 1)); buckets[2 + v0] = d;
+ if (cache[i + 0].symbol < omp_block_end) { sa_sint_t s = cache[i + 0].symbol, q = (cache[s].index = cache[i + 0].index) & SAINT_MAX; cache[s].symbol = BUCKETS_INDEX4(T[q - 1], T[q - 2] >= T[q - 1]); }
+
+ sa_sint_t v1 = cache[i + 1].symbol, p1 = cache[i + 1].index; d += (p1 < 0); cache[i + 1].symbol = buckets[v1]++; cache[i + 1].index = (p1 - 1) | ((sa_sint_t)(buckets[2 + v1] != d) << (SAINT_BIT - 1)); buckets[2 + v1] = d;
+ if (cache[i + 1].symbol < omp_block_end) { sa_sint_t s = cache[i + 1].symbol, q = (cache[s].index = cache[i + 1].index) & SAINT_MAX; cache[s].symbol = BUCKETS_INDEX4(T[q - 1], T[q - 2] >= T[q - 1]); }
+ }
+
+ for (j += prefetch_distance + 1; i < j; i += 1)
+ {
+ sa_sint_t v = cache[i].symbol, p = cache[i].index; d += (p < 0); cache[i].symbol = buckets[v]++; cache[i].index = (p - 1) | ((sa_sint_t)(buckets[2 + v] != d) << (SAINT_BIT - 1)); buckets[2 + v] = d;
+ if (cache[i].symbol < omp_block_end) { sa_sint_t s = cache[i].symbol, q = (cache[s].index = cache[i].index) & SAINT_MAX; cache[s].symbol = BUCKETS_INDEX4(T[q - 1], T[q - 2] >= T[q - 1]); }
+ }
+
+ return d;
+}
+
+static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_4k_block_sort(const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ sa_sint_t * RESTRICT induction_bucket = &buckets[2 * k];
+ sa_sint_t * RESTRICT distinct_names = &buckets[0 * k];
+
+ fast_sint_t i, j, omp_block_end = omp_block_start + omp_block_size;
+ for (i = omp_block_start, j = omp_block_end - prefetch_distance - 1; i < j; i += 2)
+ {
+ libsais_prefetchw(&cache[i + 2 * prefetch_distance]);
+
+ sa_sint_t s0 = cache[i + prefetch_distance + 0].symbol; const sa_sint_t * Is0 = &induction_bucket[s0 >> 1]; libsais_prefetchw(s0 >= 0 ? Is0 : NULL); const sa_sint_t * Ds0 = &distinct_names[s0]; libsais_prefetchw(s0 >= 0 ? Ds0 : NULL);
+ sa_sint_t s1 = cache[i + prefetch_distance + 1].symbol; const sa_sint_t * Is1 = &induction_bucket[s1 >> 1]; libsais_prefetchw(s1 >= 0 ? Is1 : NULL); const sa_sint_t * Ds1 = &distinct_names[s1]; libsais_prefetchw(s1 >= 0 ? Ds1 : NULL);
+
+ sa_sint_t v0 = cache[i + 0].symbol;
+ if (v0 >= 0)
+ {
+ sa_sint_t p0 = cache[i + 0].index; d += (p0 >> (SUFFIX_GROUP_BIT - 1)); cache[i + 0].symbol = induction_bucket[v0 >> 1]++; cache[i + 0].index = (p0 - 1) | (v0 << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v0] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v0] = d;
+ if (cache[i + 0].symbol < omp_block_end) { sa_sint_t ni = cache[i + 0].symbol, np = cache[i + 0].index; if (np > 0) { cache[ni].index = np; np &= ~SUFFIX_GROUP_MARKER; cache[ni].symbol = BUCKETS_INDEX2(T[np - 1], T[np - 2] < T[np - 1]); np = 0; } cache[i + 0].index = np & SAINT_MAX; }
+ }
+
+ sa_sint_t v1 = cache[i + 1].symbol;
+ if (v1 >= 0)
+ {
+ sa_sint_t p1 = cache[i + 1].index; d += (p1 >> (SUFFIX_GROUP_BIT - 1)); cache[i + 1].symbol = induction_bucket[v1 >> 1]++; cache[i + 1].index = (p1 - 1) | (v1 << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v1] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v1] = d;
+ if (cache[i + 1].symbol < omp_block_end) { sa_sint_t ni = cache[i + 1].symbol, np = cache[i + 1].index; if (np > 0) { cache[ni].index = np; np &= ~SUFFIX_GROUP_MARKER; cache[ni].symbol = BUCKETS_INDEX2(T[np - 1], T[np - 2] < T[np - 1]); np = 0; } cache[i + 1].index = np & SAINT_MAX; }
+ }
+ }
+
+ for (j += prefetch_distance + 1; i < j; i += 1)
+ {
+ sa_sint_t v = cache[i].symbol;
+ if (v >= 0)
+ {
+ sa_sint_t p = cache[i].index; d += (p >> (SUFFIX_GROUP_BIT - 1)); cache[i].symbol = induction_bucket[v >> 1]++; cache[i].index = (p - 1) | (v << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v] = d;
+ if (cache[i].symbol < omp_block_end) { sa_sint_t ni = cache[i].symbol, np = cache[i].index; if (np > 0) { cache[ni].index = np; np &= ~SUFFIX_GROUP_MARKER; cache[ni].symbol = BUCKETS_INDEX2(T[np - 1], T[np - 2] < T[np - 1]); np = 0; } cache[i].index = np & SAINT_MAX; }
+ }
+ }
+
+ return d;
+}
+
+static void libsais_partial_sorting_scan_left_to_right_32s_1k_block_sort(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ fast_sint_t i, j, omp_block_end = omp_block_start + omp_block_size;
+ for (i = omp_block_start, j = omp_block_end - prefetch_distance - 1; i < j; i += 2)
+ {
+ libsais_prefetchw(&cache[i + 2 * prefetch_distance]);
+
+ sa_sint_t s0 = cache[i + prefetch_distance + 0].symbol; const sa_sint_t * Is0 = &induction_bucket[s0]; libsais_prefetchw(s0 >= 0 ? Is0 : NULL);
+ sa_sint_t s1 = cache[i + prefetch_distance + 1].symbol; const sa_sint_t * Is1 = &induction_bucket[s1]; libsais_prefetchw(s1 >= 0 ? Is1 : NULL);
+
+ sa_sint_t v0 = cache[i + 0].symbol;
+ if (v0 >= 0)
+ {
+ cache[i + 0].symbol = induction_bucket[v0]++;
+ if (cache[i + 0].symbol < omp_block_end) { sa_sint_t ni = cache[i + 0].symbol, np = cache[i + 0].index; if (np > 0) { cache[ni].index = (np - 1) | ((sa_sint_t)(T[np - 2] < T[np - 1]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np - 1]; np = 0; } cache[i + 0].index = np & SAINT_MAX; }
+ }
+
+ sa_sint_t v1 = cache[i + 1].symbol;
+ if (v1 >= 0)
+ {
+ cache[i + 1].symbol = induction_bucket[v1]++;
+ if (cache[i + 1].symbol < omp_block_end) { sa_sint_t ni = cache[i + 1].symbol, np = cache[i + 1].index; if (np > 0) { cache[ni].index = (np - 1) | ((sa_sint_t)(T[np - 2] < T[np - 1]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np - 1]; np = 0; } cache[i + 1].index = np & SAINT_MAX; }
+ }
+ }
+
+ for (j += prefetch_distance + 1; i < j; i += 1)
+ {
+ sa_sint_t v = cache[i].symbol;
+ if (v >= 0)
+ {
+ cache[i].symbol = induction_bucket[v]++;
+ if (cache[i].symbol < omp_block_end) { sa_sint_t ni = cache[i].symbol, np = cache[i].index; if (np > 0) { cache[ni].index = (np - 1) | ((sa_sint_t)(T[np - 2] < T[np - 1]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np - 1]; np = 0; } cache[i].index = np & SAINT_MAX; }
+ }
+ }
+}
+
+static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_6k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads)
+{
+#if defined(_OPENMP)
+ #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384)
+#endif
+ {
+#if defined(_OPENMP)
+ fast_sint_t omp_thread_num = omp_get_thread_num();
+ fast_sint_t omp_num_threads = omp_get_num_threads();
+#else
+ UNUSED(threads); UNUSED(cache);
+
+ fast_sint_t omp_thread_num = 0;
+ fast_sint_t omp_num_threads = 1;
+#endif
+ fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
+ fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+ fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
+
+ omp_block_start += block_start;
+
+ if (omp_num_threads == 1)
+ {
+ d = libsais_partial_sorting_scan_left_to_right_32s_6k(T, SA, buckets, d, omp_block_start, omp_block_size);
+ }
+#if defined(_OPENMP)
+ else
+ {
+ {
+ libsais_partial_sorting_scan_left_to_right_32s_6k_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size);
+ }
+
+ #pragma omp barrier
+
+ #pragma omp master
+ {
+ d = libsais_partial_sorting_scan_left_to_right_32s_6k_block_sort(T, buckets, d, cache - block_start, block_start, block_size);
+ }
+
+ #pragma omp barrier
+
+ {
+ libsais_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size);
+ }
+ }
+#endif
+ }
+
+ return d;
+}
+
+static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_4k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads)
+{
+#if defined(_OPENMP)
+ #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384)
+#endif
+ {
+#if defined(_OPENMP)
+ fast_sint_t omp_thread_num = omp_get_thread_num();
+ fast_sint_t omp_num_threads = omp_get_num_threads();
+#else
+ UNUSED(threads); UNUSED(cache);
+
+ fast_sint_t omp_thread_num = 0;
+ fast_sint_t omp_num_threads = 1;
+#endif
+ fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
+ fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+ fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
+
+ omp_block_start += block_start;
+
+ if (omp_num_threads == 1)
+ {
+ d = libsais_partial_sorting_scan_left_to_right_32s_4k(T, SA, k, buckets, d, omp_block_start, omp_block_size);
+ }
+#if defined(_OPENMP)
+ else
+ {
+ {
+ libsais_partial_sorting_scan_left_to_right_32s_4k_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size);
+ }
+
+ #pragma omp barrier
+
+ #pragma omp master
+ {
+ d = libsais_partial_sorting_scan_left_to_right_32s_4k_block_sort(T, k, buckets, d, cache - block_start, block_start, block_size);
+ }
+
+ #pragma omp barrier
+
+ {
+ libsais_compact_and_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size);
+ }
+ }
+#endif
+ }
+
+ return d;
+}
+
+static void libsais_partial_sorting_scan_left_to_right_32s_1k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads)
+{
+#if defined(_OPENMP)
+ #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384)
+#endif
+ {
+#if defined(_OPENMP)
+ fast_sint_t omp_thread_num = omp_get_thread_num();
+ fast_sint_t omp_num_threads = omp_get_num_threads();
+#else
+ UNUSED(threads); UNUSED(cache);
+
+ fast_sint_t omp_thread_num = 0;
+ fast_sint_t omp_num_threads = 1;
+#endif
+ fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
+ fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+ fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
+
+ omp_block_start += block_start;
+
+ if (omp_num_threads == 1)
+ {
+ libsais_partial_sorting_scan_left_to_right_32s_1k(T, SA, buckets, omp_block_start, omp_block_size);
+ }
+#if defined(_OPENMP)
+ else
+ {
+ {
+ libsais_partial_sorting_scan_left_to_right_32s_1k_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size);
+ }
+
+ #pragma omp barrier
+
+ #pragma omp master
+ {
+ libsais_partial_sorting_scan_left_to_right_32s_1k_block_sort(T, buckets, cache - block_start, block_start, block_size);
+ }
+
+ #pragma omp barrier
+
+ {
+ libsais_compact_and_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size);
+ }
+ }
+#endif
+ }
+}
+
+#endif
+
+static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_6k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t left_suffixes_count, sa_sint_t d, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+ SA[buckets[BUCKETS_INDEX4(T[n - 1], T[n - 2] >= T[n - 1])]++] = (n - 1) | SAINT_MIN;
+ buckets[2 + BUCKETS_INDEX4(T[n - 1], T[n - 2] >= T[n - 1])] = ++d;
+
+ if (threads == 1 || left_suffixes_count < 65536)
+ {
+ d = libsais_partial_sorting_scan_left_to_right_32s_6k(T, SA, buckets, d, 0, left_suffixes_count);
+ }
+#if defined(_OPENMP)
+ else
+ {
+ fast_sint_t block_start, block_end;
+ for (block_start = 0; block_start < left_suffixes_count; block_start = block_end)
+ {
+ block_end = block_start + (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end > left_suffixes_count) { block_end = left_suffixes_count; }
+
+ d = libsais_partial_sorting_scan_left_to_right_32s_6k_block_omp(T, SA, buckets, d, thread_state[0].state.cache, block_start, block_end - block_start, threads);
+ }
+ }
+#else
+ UNUSED(thread_state);
+#endif
+
+ return d;
+}
+
+static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_4k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+ sa_sint_t * RESTRICT induction_bucket = &buckets[2 * k];
+ sa_sint_t * RESTRICT distinct_names = &buckets[0 * k];
+
+ SA[induction_bucket[T[n - 1]]++] = (n - 1) | ((sa_sint_t)(T[n - 2] < T[n - 1]) << (SAINT_BIT - 1)) | SUFFIX_GROUP_MARKER;
+ distinct_names[BUCKETS_INDEX2(T[n - 1], T[n - 2] < T[n - 1])] = ++d;
+
+ if (threads == 1 || n < 65536)
+ {
+ d = libsais_partial_sorting_scan_left_to_right_32s_4k(T, SA, k, buckets, d, 0, n);
+ }
+#if defined(_OPENMP)
+ else
+ {
+ fast_sint_t block_start, block_end;
+ for (block_start = 0; block_start < n; block_start = block_end)
+ {
+ block_end = block_start + (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end > n) { block_end = n; }
+
+ d = libsais_partial_sorting_scan_left_to_right_32s_4k_block_omp(T, SA, k, buckets, d, thread_state[0].state.cache, block_start, block_end - block_start, threads);
+ }
+ }
+#else
+ UNUSED(thread_state);
+#endif
+
+ return d;
+}
+
+static void libsais_partial_sorting_scan_left_to_right_32s_1k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+ SA[buckets[T[n - 1]]++] = (n - 1) | ((sa_sint_t)(T[n - 2] < T[n - 1]) << (SAINT_BIT - 1));
+
+ if (threads == 1 || n < 65536)
+ {
+ libsais_partial_sorting_scan_left_to_right_32s_1k(T, SA, buckets, 0, n);
+ }
+#if defined(_OPENMP)
+ else
+ {
+ fast_sint_t block_start, block_end;
+ for (block_start = 0; block_start < n; block_start = block_end)
+ {
+ block_end = block_start + (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end > n) { block_end = n; }
+
+ libsais_partial_sorting_scan_left_to_right_32s_1k_block_omp(T, SA, buckets, thread_state[0].state.cache, block_start, block_end - block_start, threads);
+ }
+ }
+#else
+ UNUSED(thread_state);
+#endif
+}
+
+static void libsais_partial_sorting_shift_markers_8u_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, const sa_sint_t * RESTRICT buckets, sa_sint_t threads)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ const sa_sint_t * RESTRICT temp_bucket = &buckets[4 * ALPHABET_SIZE];
+
+ fast_sint_t c;
+
+#if defined(_OPENMP)
+ #pragma omp parallel for schedule(static, 1) num_threads(threads) if(threads > 1 && n >= 65536)
+#else
+ UNUSED(threads); UNUSED(n);
+#endif
+ for (c = BUCKETS_INDEX2(ALPHABET_SIZE - 1, 0); c >= BUCKETS_INDEX2(1, 0); c -= BUCKETS_INDEX2(1, 0))
+ {
+ fast_sint_t i, j; sa_sint_t s = SAINT_MIN;
+ for (i = (fast_sint_t)temp_bucket[c] - 1, j = (fast_sint_t)buckets[c - BUCKETS_INDEX2(1, 0)] + 3; i >= j; i -= 4)
+ {
+ libsais_prefetchw(&SA[i - prefetch_distance]);
+
+ sa_sint_t p0 = SA[i - 0], q0 = (p0 & SAINT_MIN) ^ s; s = s ^ q0; SA[i - 0] = p0 ^ q0;
+ sa_sint_t p1 = SA[i - 1], q1 = (p1 & SAINT_MIN) ^ s; s = s ^ q1; SA[i - 1] = p1 ^ q1;
+ sa_sint_t p2 = SA[i - 2], q2 = (p2 & SAINT_MIN) ^ s; s = s ^ q2; SA[i - 2] = p2 ^ q2;
+ sa_sint_t p3 = SA[i - 3], q3 = (p3 & SAINT_MIN) ^ s; s = s ^ q3; SA[i - 3] = p3 ^ q3;
+ }
+
+ for (j -= 3; i >= j; i -= 1)
+ {
+ sa_sint_t p = SA[i], q = (p & SAINT_MIN) ^ s; s = s ^ q; SA[i] = p ^ q;
+ }
+ }
+}
+
+static void libsais_partial_sorting_shift_markers_32s_6k_omp(sa_sint_t * RESTRICT SA, sa_sint_t k, const sa_sint_t * RESTRICT buckets, sa_sint_t threads)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ const sa_sint_t * RESTRICT temp_bucket = &buckets[4 * k];
+
+ fast_sint_t c;
+
+#if defined(_OPENMP)
+ #pragma omp parallel for schedule(static, 1) num_threads(threads) if(threads > 1 && k >= 65536)
+#else
+ UNUSED(threads);
+#endif
+ for (c = (fast_sint_t)k - 1; c >= 1; c -= 1)
+ {
+ fast_sint_t i, j; sa_sint_t s = SAINT_MIN;
+ for (i = (fast_sint_t)buckets[BUCKETS_INDEX4(c, 0)] - 1, j = (fast_sint_t)temp_bucket[BUCKETS_INDEX2(c - 1, 0)] + 3; i >= j; i -= 4)
+ {
+ libsais_prefetchw(&SA[i - prefetch_distance]);
+
+ sa_sint_t p0 = SA[i - 0], q0 = (p0 & SAINT_MIN) ^ s; s = s ^ q0; SA[i - 0] = p0 ^ q0;
+ sa_sint_t p1 = SA[i - 1], q1 = (p1 & SAINT_MIN) ^ s; s = s ^ q1; SA[i - 1] = p1 ^ q1;
+ sa_sint_t p2 = SA[i - 2], q2 = (p2 & SAINT_MIN) ^ s; s = s ^ q2; SA[i - 2] = p2 ^ q2;
+ sa_sint_t p3 = SA[i - 3], q3 = (p3 & SAINT_MIN) ^ s; s = s ^ q3; SA[i - 3] = p3 ^ q3;
+ }
+
+ for (j -= 3; i >= j; i -= 1)
+ {
+ sa_sint_t p = SA[i], q = (p & SAINT_MIN) ^ s; s = s ^ q; SA[i] = p ^ q;
+ }
+ }
+}
+
+static void libsais_partial_sorting_shift_markers_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t n)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ fast_sint_t i; sa_sint_t s = SUFFIX_GROUP_MARKER;
+ for (i = (fast_sint_t)n - 1; i >= 3; i -= 4)
+ {
+ libsais_prefetchw(&SA[i - prefetch_distance]);
+
+ sa_sint_t p0 = SA[i - 0], q0 = ((p0 & SUFFIX_GROUP_MARKER) ^ s) & ((sa_sint_t)(p0 > 0) << ((SUFFIX_GROUP_BIT - 1))); s = s ^ q0; SA[i - 0] = p0 ^ q0;
+ sa_sint_t p1 = SA[i - 1], q1 = ((p1 & SUFFIX_GROUP_MARKER) ^ s) & ((sa_sint_t)(p1 > 0) << ((SUFFIX_GROUP_BIT - 1))); s = s ^ q1; SA[i - 1] = p1 ^ q1;
+ sa_sint_t p2 = SA[i - 2], q2 = ((p2 & SUFFIX_GROUP_MARKER) ^ s) & ((sa_sint_t)(p2 > 0) << ((SUFFIX_GROUP_BIT - 1))); s = s ^ q2; SA[i - 2] = p2 ^ q2;
+ sa_sint_t p3 = SA[i - 3], q3 = ((p3 & SUFFIX_GROUP_MARKER) ^ s) & ((sa_sint_t)(p3 > 0) << ((SUFFIX_GROUP_BIT - 1))); s = s ^ q3; SA[i - 3] = p3 ^ q3;
+ }
+
+ for (; i >= 0; i -= 1)
+ {
+ sa_sint_t p = SA[i], q = ((p & SUFFIX_GROUP_MARKER) ^ s) & ((sa_sint_t)(p > 0) << ((SUFFIX_GROUP_BIT - 1))); s = s ^ q; SA[i] = p ^ q;
+ }
+}
+
+static void libsais_partial_sorting_shift_buckets_32s_6k(sa_sint_t k, sa_sint_t * RESTRICT buckets)
+{
+ sa_sint_t * RESTRICT temp_bucket = &buckets[4 * k];
+
+ fast_sint_t i;
+ for (i = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX2(1, 0))
+ {
+ buckets[2 * i + BUCKETS_INDEX4(0, 0)] = temp_bucket[i + BUCKETS_INDEX2(0, 0)];
+ buckets[2 * i + BUCKETS_INDEX4(0, 1)] = temp_bucket[i + BUCKETS_INDEX2(0, 1)];
+ }
+}
+
+static sa_sint_t libsais_partial_sorting_scan_right_to_left_8u(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE];
+ sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE];
+
+ fast_sint_t i, j;
+ for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2)
+ {
+ libsais_prefetch(&SA[i - 2 * prefetch_distance]);
+
+ libsais_prefetch(&T[SA[i - prefetch_distance - 0] & SAINT_MAX] - 1);
+ libsais_prefetch(&T[SA[i - prefetch_distance - 0] & SAINT_MAX] - 2);
+ libsais_prefetch(&T[SA[i - prefetch_distance - 1] & SAINT_MAX] - 1);
+ libsais_prefetch(&T[SA[i - prefetch_distance - 1] & SAINT_MAX] - 2);
+
+ sa_sint_t p0 = SA[i - 0]; d += (p0 < 0); p0 &= SAINT_MAX; sa_sint_t v0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] > T[p0 - 1]);
+ SA[--induction_bucket[v0]] = (p0 - 1) | ((sa_sint_t)(distinct_names[v0] != d) << (SAINT_BIT - 1)); distinct_names[v0] = d;
+
+ sa_sint_t p1 = SA[i - 1]; d += (p1 < 0); p1 &= SAINT_MAX; sa_sint_t v1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] > T[p1 - 1]);
+ SA[--induction_bucket[v1]] = (p1 - 1) | ((sa_sint_t)(distinct_names[v1] != d) << (SAINT_BIT - 1)); distinct_names[v1] = d;
+ }
+
+ for (j -= prefetch_distance + 1; i >= j; i -= 1)
+ {
+ sa_sint_t p = SA[i]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]);
+ SA[--induction_bucket[v]] = (p - 1) | ((sa_sint_t)(distinct_names[v] != d) << (SAINT_BIT - 1)); distinct_names[v] = d;
+ }
+
+ return d;
+}
+
+#if defined(_OPENMP)
+
+static void libsais_partial_sorting_scan_right_to_left_8u_block_prepare(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size, LIBSAIS_THREAD_STATE * RESTRICT state)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE];
+ sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE];
+
+ memset(buckets, 0, 4 * ALPHABET_SIZE * sizeof(sa_sint_t));
+
+ fast_sint_t i, j, count = 0; sa_sint_t d = 1;
+ for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2)
+ {
+ libsais_prefetch(&SA[i - 2 * prefetch_distance]);
+
+ libsais_prefetch(&T[SA[i - prefetch_distance - 0] & SAINT_MAX] - 1);
+ libsais_prefetch(&T[SA[i - prefetch_distance - 0] & SAINT_MAX] - 2);
+ libsais_prefetch(&T[SA[i - prefetch_distance - 1] & SAINT_MAX] - 1);
+ libsais_prefetch(&T[SA[i - prefetch_distance - 1] & SAINT_MAX] - 2);
+
+ sa_sint_t p0 = cache[count].index = SA[i - 0]; d += (p0 < 0); p0 &= SAINT_MAX; sa_sint_t v0 = cache[count++].symbol = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] > T[p0 - 1]); induction_bucket[v0]++; distinct_names[v0] = d;
+ sa_sint_t p1 = cache[count].index = SA[i - 1]; d += (p1 < 0); p1 &= SAINT_MAX; sa_sint_t v1 = cache[count++].symbol = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] > T[p1 - 1]); induction_bucket[v1]++; distinct_names[v1] = d;
+ }
+
+ for (j -= prefetch_distance + 1; i >= j; i -= 1)
+ {
+ sa_sint_t p = cache[count].index = SA[i]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = cache[count++].symbol = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]); induction_bucket[v]++; distinct_names[v] = d;
+ }
+
+ state[0].state.position = (fast_sint_t)d - 1;
+ state[0].state.count = count;
+}
+
+static void libsais_partial_sorting_scan_right_to_left_8u_block_place(sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count, sa_sint_t d)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE];
+ sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE];
+
+ fast_sint_t i, j;
+ for (i = 0, j = count - 1; i < j; i += 2)
+ {
+ libsais_prefetch(&cache[i + prefetch_distance]);
+
+ sa_sint_t p0 = cache[i + 0].index; d += (p0 < 0); sa_sint_t v0 = cache[i + 0].symbol;
+ SA[--induction_bucket[v0]] = (p0 - 1) | ((sa_sint_t)(distinct_names[v0] != d) << (SAINT_BIT - 1)); distinct_names[v0] = d;
+
+ sa_sint_t p1 = cache[i + 1].index; d += (p1 < 0); sa_sint_t v1 = cache[i + 1].symbol;
+ SA[--induction_bucket[v1]] = (p1 - 1) | ((sa_sint_t)(distinct_names[v1] != d) << (SAINT_BIT - 1)); distinct_names[v1] = d;
+ }
+
+ for (j += 1; i < j; i += 1)
+ {
+ sa_sint_t p = cache[i].index; d += (p < 0); sa_sint_t v = cache[i].symbol;
+ SA[--induction_bucket[v]] = (p - 1) | ((sa_sint_t)(distinct_names[v] != d) << (SAINT_BIT - 1)); distinct_names[v] = d;
+ }
+}
+
+static sa_sint_t libsais_partial_sorting_scan_right_to_left_8u_block_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+#if defined(_OPENMP)
+ #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * ALPHABET_SIZE && omp_get_dynamic() == 0)
+#endif
+ {
+#if defined(_OPENMP)
+ fast_sint_t omp_thread_num = omp_get_thread_num();
+ fast_sint_t omp_num_threads = omp_get_num_threads();
+#else
+ UNUSED(threads); UNUSED(thread_state);
+
+ fast_sint_t omp_thread_num = 0;
+ fast_sint_t omp_num_threads = 1;
+#endif
+ fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
+ fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+ fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
+
+ omp_block_start += block_start;
+
+ if (omp_num_threads == 1)
+ {
+ d = libsais_partial_sorting_scan_right_to_left_8u(T, SA, buckets, d, omp_block_start, omp_block_size);
+ }
+#if defined(_OPENMP)
+ else
+ {
+ {
+ libsais_partial_sorting_scan_right_to_left_8u_block_prepare(T, SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size, &thread_state[omp_thread_num]);
+ }
+
+ #pragma omp barrier
+
+ #pragma omp master
+ {
+ sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE];
+ sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE];
+
+ fast_sint_t t;
+ for (t = omp_num_threads - 1; t >= 0; --t)
+ {
+ sa_sint_t * RESTRICT temp_induction_bucket = &thread_state[t].state.buckets[0 * ALPHABET_SIZE];
+ sa_sint_t * RESTRICT temp_distinct_names = &thread_state[t].state.buckets[2 * ALPHABET_SIZE];
+
+ fast_sint_t c;
+ for (c = 0; c < 2 * ALPHABET_SIZE; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_induction_bucket[c]; induction_bucket[c] = A - B; temp_induction_bucket[c] = A; }
+
+ for (d -= 1, c = 0; c < 2 * ALPHABET_SIZE; c += 1) { sa_sint_t A = distinct_names[c], B = temp_distinct_names[c], D = B + d; distinct_names[c] = B > 0 ? D : A; temp_distinct_names[c] = A; }
+ d += 1 + (sa_sint_t)thread_state[t].state.position; thread_state[t].state.position = (fast_sint_t)d - thread_state[t].state.position;
+ }
+ }
+
+ #pragma omp barrier
+
+ {
+ libsais_partial_sorting_scan_right_to_left_8u_block_place(SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count, (sa_sint_t)thread_state[omp_thread_num].state.position);
+ }
+ }
+#endif
+ }
+
+ return d;
+}
+
+#endif
+
+static void libsais_partial_sorting_scan_right_to_left_8u_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count, sa_sint_t d, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+ fast_sint_t scan_start = (fast_sint_t)left_suffixes_count + 1;
+ fast_sint_t scan_end = (fast_sint_t)n - (fast_sint_t)first_lms_suffix;
+
+ if (threads == 1 || (scan_end - scan_start) < 65536)
+ {
+ libsais_partial_sorting_scan_right_to_left_8u(T, SA, buckets, d, scan_start, scan_end - scan_start);
+ }
+#if defined(_OPENMP)
+ else
+ {
+ sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE];
+ sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE];
+
+ fast_sint_t block_start;
+ for (block_start = scan_end - 1; block_start >= scan_start; )
+ {
+ if (SA[block_start] == 0)
+ {
+ block_start--;
+ }
+ else
+ {
+ fast_sint_t block_max_end = block_start - ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads); if (block_max_end < scan_start) { block_max_end = scan_start - 1; }
+ fast_sint_t block_end = block_start - 1; while (block_end > block_max_end && SA[block_end] != 0) { block_end--; }
+ fast_sint_t block_size = block_start - block_end;
+
+ if (block_size < 32)
+ {
+ for (; block_start > block_end; block_start -= 1)
+ {
+ sa_sint_t p = SA[block_start]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]);
+ SA[--induction_bucket[v]] = (p - 1) | ((sa_sint_t)(distinct_names[v] != d) << (SAINT_BIT - 1)); distinct_names[v] = d;
+ }
+ }
+ else
+ {
+ d = libsais_partial_sorting_scan_right_to_left_8u_block_omp(T, SA, buckets, d, block_end + 1, block_size, threads, thread_state);
+ block_start = block_end;
+ }
+ }
+ }
+ }
+#else
+ UNUSED(thread_state);
+#endif
+}
+
+/* Serial right-to-left induction scan, 32-bit input, 6k-bucket layout.
+ * Each SA entry p (group marker in the sign bit) induces p - 1 into
+ * buckets[BUCKETS_INDEX4(T[p-1], T[p-2] > T[p-1])]; buckets[2 + v] remembers
+ * the last group id written to bucket v, so the stored entry's sign bit flags
+ * the start of a new suffix group.  Returns the updated group counter d. */
+static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_6k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ fast_sint_t i, j;
+ /* Main loop: unrolled by two, prefetching SA, T, and the target buckets
+ several iterations ahead of use. */
+ for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 1; i >= j; i -= 2)
+ {
+ libsais_prefetch(&SA[i - 3 * prefetch_distance]);
+
+ libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 0] & SAINT_MAX] - 1);
+ libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 0] & SAINT_MAX] - 2);
+ libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 1] & SAINT_MAX] - 1);
+ libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 1] & SAINT_MAX] - 2);
+
+ sa_sint_t p0 = SA[i - prefetch_distance - 0] & SAINT_MAX; sa_sint_t v0 = BUCKETS_INDEX4(T[p0 - (p0 > 0)], 0); libsais_prefetchw(&buckets[v0]);
+ sa_sint_t p1 = SA[i - prefetch_distance - 1] & SAINT_MAX; sa_sint_t v1 = BUCKETS_INDEX4(T[p1 - (p1 > 0)], 0); libsais_prefetchw(&buckets[v1]);
+
+ sa_sint_t p2 = SA[i - 0]; d += (p2 < 0); p2 &= SAINT_MAX; sa_sint_t v2 = BUCKETS_INDEX4(T[p2 - 1], T[p2 - 2] > T[p2 - 1]);
+ SA[--buckets[v2]] = (p2 - 1) | ((sa_sint_t)(buckets[2 + v2] != d) << (SAINT_BIT - 1)); buckets[2 + v2] = d;
+
+ sa_sint_t p3 = SA[i - 1]; d += (p3 < 0); p3 &= SAINT_MAX; sa_sint_t v3 = BUCKETS_INDEX4(T[p3 - 1], T[p3 - 2] > T[p3 - 1]);
+ SA[--buckets[v3]] = (p3 - 1) | ((sa_sint_t)(buckets[2 + v3] != d) << (SAINT_BIT - 1)); buckets[2 + v3] = d;
+ }
+
+ /* Remainder: entries too close to the left edge for safe prefetching. */
+ for (j -= 2 * prefetch_distance + 1; i >= j; i -= 1)
+ {
+ sa_sint_t p = SA[i]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = BUCKETS_INDEX4(T[p - 1], T[p - 2] > T[p - 1]);
+ SA[--buckets[v]] = (p - 1) | ((sa_sint_t)(buckets[2 + v] != d) << (SAINT_BIT - 1)); buckets[2 + v] = d;
+ }
+
+ return d;
+}
+
+/* Serial right-to-left induction scan, 32-bit input, 4k-bucket layout.
+ * Only positive SA entries are consumed (and cleared); each induces p - 1
+ * into induction_bucket[T[p-1]], encoding the L/S type of the preceding
+ * position in the sign bit and a new-group flag in the SUFFIX_GROUP bit.
+ * NOTE(review): here the group marker lives in bit SUFFIX_GROUP_BIT-1 rather
+ * than the sign bit (d is bumped via p >> (SUFFIX_GROUP_BIT - 1)); confirm
+ * against the matching left-to-right 4k scan.  Returns the updated d. */
+static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_4k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ /* 4k layout: induction counters live at buckets[3k..4k), the per-bucket
+ last-group table at buckets[0..2k) (indexed with BUCKETS_INDEX2). */
+ sa_sint_t * RESTRICT induction_bucket = &buckets[3 * k];
+ sa_sint_t * RESTRICT distinct_names = &buckets[0 * k];
+
+ fast_sint_t i, j;
+ /* Main loop, unrolled by two with staged prefetch of SA, T, and buckets. */
+ for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 1; i >= j; i -= 2)
+ {
+ libsais_prefetchw(&SA[i - 3 * prefetch_distance]);
+
+ sa_sint_t s0 = SA[i - 2 * prefetch_distance - 0]; const sa_sint_t * Ts0 = &T[s0 & ~SUFFIX_GROUP_MARKER] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL);
+ sa_sint_t s1 = SA[i - 2 * prefetch_distance - 1]; const sa_sint_t * Ts1 = &T[s1 & ~SUFFIX_GROUP_MARKER] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? Ts1 : NULL);
+ sa_sint_t s2 = SA[i - 1 * prefetch_distance - 0]; if (s2 > 0) { const fast_sint_t Ts2 = T[(s2 & ~SUFFIX_GROUP_MARKER) - 1]; libsais_prefetchw(&induction_bucket[Ts2]); libsais_prefetchw(&distinct_names[BUCKETS_INDEX2(Ts2, 0)]); }
+ sa_sint_t s3 = SA[i - 1 * prefetch_distance - 1]; if (s3 > 0) { const fast_sint_t Ts3 = T[(s3 & ~SUFFIX_GROUP_MARKER) - 1]; libsais_prefetchw(&induction_bucket[Ts3]); libsais_prefetchw(&distinct_names[BUCKETS_INDEX2(Ts3, 0)]); }
+
+ sa_sint_t p0 = SA[i - 0];
+ if (p0 > 0)
+ {
+ SA[i - 0] = 0; d += (p0 >> (SUFFIX_GROUP_BIT - 1)); p0 &= ~SUFFIX_GROUP_MARKER; sa_sint_t v0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] > T[p0 - 1]);
+ SA[--induction_bucket[T[p0 - 1]]] = (p0 - 1) | ((sa_sint_t)(T[p0 - 2] > T[p0 - 1]) << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v0] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v0] = d;
+ }
+
+ sa_sint_t p1 = SA[i - 1];
+ if (p1 > 0)
+ {
+ SA[i - 1] = 0; d += (p1 >> (SUFFIX_GROUP_BIT - 1)); p1 &= ~SUFFIX_GROUP_MARKER; sa_sint_t v1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] > T[p1 - 1]);
+ SA[--induction_bucket[T[p1 - 1]]] = (p1 - 1) | ((sa_sint_t)(T[p1 - 2] > T[p1 - 1]) << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v1] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v1] = d;
+ }
+ }
+
+ /* Remainder loop without prefetching. */
+ for (j -= 2 * prefetch_distance + 1; i >= j; i -= 1)
+ {
+ sa_sint_t p = SA[i];
+ if (p > 0)
+ {
+ SA[i] = 0; d += (p >> (SUFFIX_GROUP_BIT - 1)); p &= ~SUFFIX_GROUP_MARKER; sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]);
+ SA[--induction_bucket[T[p - 1]]] = (p - 1) | ((sa_sint_t)(T[p - 2] > T[p - 1]) << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v] = d;
+ }
+ }
+
+ return d;
+}
+
+/* Serial right-to-left induction scan, 32-bit input, single-bucket-table
+ * variant (no suffix-group tracking).  Positive SA entries are cleared and
+ * p - 1 is induced into induction_bucket[T[p-1]], with the L/S comparison
+ * T[p-2] > T[p-1] recorded in the sign bit of the stored value. */
+static void libsais_partial_sorting_scan_right_to_left_32s_1k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ fast_sint_t i, j;
+ /* Main loop, unrolled by two with staged prefetch. */
+ for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 1; i >= j; i -= 2)
+ {
+ libsais_prefetchw(&SA[i - 3 * prefetch_distance]);
+
+ sa_sint_t s0 = SA[i - 2 * prefetch_distance - 0]; const sa_sint_t * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL);
+ sa_sint_t s1 = SA[i - 2 * prefetch_distance - 1]; const sa_sint_t * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL);
+ sa_sint_t s2 = SA[i - 1 * prefetch_distance - 0]; if (s2 > 0) { libsais_prefetchw(&induction_bucket[T[s2 - 1]]); libsais_prefetch(&T[s2] - 2); }
+ sa_sint_t s3 = SA[i - 1 * prefetch_distance - 1]; if (s3 > 0) { libsais_prefetchw(&induction_bucket[T[s3 - 1]]); libsais_prefetch(&T[s3] - 2); }
+
+ sa_sint_t p0 = SA[i - 0]; if (p0 > 0) { SA[i - 0] = 0; SA[--induction_bucket[T[p0 - 1]]] = (p0 - 1) | ((sa_sint_t)(T[p0 - 2] > T[p0 - 1]) << (SAINT_BIT - 1)); }
+ sa_sint_t p1 = SA[i - 1]; if (p1 > 0) { SA[i - 1] = 0; SA[--induction_bucket[T[p1 - 1]]] = (p1 - 1) | ((sa_sint_t)(T[p1 - 2] > T[p1 - 1]) << (SAINT_BIT - 1)); }
+ }
+
+ /* Remainder loop. */
+ for (j -= 2 * prefetch_distance + 1; i >= j; i -= 1)
+ {
+ sa_sint_t p = SA[i]; if (p > 0) { SA[i] = 0; SA[--induction_bucket[T[p - 1]]] = (p - 1) | ((sa_sint_t)(T[p - 2] > T[p - 1]) << (SAINT_BIT - 1)); }
+ }
+}
+
+#if defined(_OPENMP)
+
+/* Parallel phase 1 (6k variant): each thread snapshots its SA sub-range into
+ * the shared cache as (index, symbol) pairs, precomputing the destination
+ * bucket index.  No shared bucket state is touched, so threads need no
+ * synchronization here; symbol stays 0 for empty (p == 0) entries. */
+static void libsais_partial_sorting_scan_right_to_left_32s_6k_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ fast_sint_t i, j;
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2)
+ {
+ libsais_prefetch(&SA[i + 2 * prefetch_distance]);
+
+ libsais_prefetch(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 1);
+ libsais_prefetch(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 2);
+ libsais_prefetch(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 1);
+ libsais_prefetch(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 2);
+
+ libsais_prefetchw(&cache[i + prefetch_distance]);
+
+ sa_sint_t p0 = cache[i + 0].index = SA[i + 0]; sa_sint_t symbol0 = 0; p0 &= SAINT_MAX; if (p0 != 0) { symbol0 = BUCKETS_INDEX4(T[p0 - 1], T[p0 - 2] > T[p0 - 1]); } cache[i + 0].symbol = symbol0;
+ sa_sint_t p1 = cache[i + 1].index = SA[i + 1]; sa_sint_t symbol1 = 0; p1 &= SAINT_MAX; if (p1 != 0) { symbol1 = BUCKETS_INDEX4(T[p1 - 1], T[p1 - 2] > T[p1 - 1]); } cache[i + 1].symbol = symbol1;
+ }
+
+ /* Remainder loop. */
+ for (j += prefetch_distance + 1; i < j; i += 1)
+ {
+ sa_sint_t p = cache[i].index = SA[i]; sa_sint_t symbol = 0; p &= SAINT_MAX; if (p != 0) { symbol = BUCKETS_INDEX4(T[p - 1], T[p - 2] > T[p - 1]); } cache[i].symbol = symbol;
+ }
+}
+
+/* Parallel phase 1 (4k variant): snapshot positive SA entries of this thread's
+ * sub-range into the cache, clearing them in SA and precomputing the
+ * BUCKETS_INDEX2 destination.  symbol is set to SAINT_MIN for entries that
+ * carry no work (p <= 0), which the sort phase later skips via v >= 0. */
+static void libsais_partial_sorting_scan_right_to_left_32s_4k_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ fast_sint_t i, j;
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2)
+ {
+ libsais_prefetchw(&SA[i + 2 * prefetch_distance]);
+
+ sa_sint_t s0 = SA[i + prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0 & ~SUFFIX_GROUP_MARKER] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL);
+ sa_sint_t s1 = SA[i + prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1 & ~SUFFIX_GROUP_MARKER] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? Ts1 : NULL);
+
+ libsais_prefetchw(&cache[i + prefetch_distance]);
+
+ sa_sint_t symbol0 = SAINT_MIN, p0 = SA[i + 0]; if (p0 > 0) { SA[i + 0] = 0; cache[i + 0].index = p0; p0 &= ~SUFFIX_GROUP_MARKER; symbol0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] > T[p0 - 1]); } cache[i + 0].symbol = symbol0;
+ sa_sint_t symbol1 = SAINT_MIN, p1 = SA[i + 1]; if (p1 > 0) { SA[i + 1] = 0; cache[i + 1].index = p1; p1 &= ~SUFFIX_GROUP_MARKER; symbol1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] > T[p1 - 1]); } cache[i + 1].symbol = symbol1;
+ }
+
+ /* Remainder loop. */
+ for (j += prefetch_distance + 1; i < j; i += 1)
+ {
+ sa_sint_t symbol = SAINT_MIN, p = SA[i]; if (p > 0) { SA[i] = 0; cache[i].index = p; p &= ~SUFFIX_GROUP_MARKER; symbol = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]); } cache[i].symbol = symbol;
+ }
+}
+
+/* Parallel phase 1 (1k variant): snapshot positive SA entries into the cache
+ * with the induced value (p - 1, L/S flag in the sign bit) already computed;
+ * symbol holds the destination symbol T[p-1], or SAINT_MIN for no work. */
+static void libsais_partial_sorting_scan_right_to_left_32s_1k_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ fast_sint_t i, j;
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2)
+ {
+ libsais_prefetchw(&SA[i + 2 * prefetch_distance]);
+
+ sa_sint_t s0 = SA[i + prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL);
+ sa_sint_t s1 = SA[i + prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? Ts1 : NULL);
+
+ libsais_prefetchw(&cache[i + prefetch_distance]);
+
+ sa_sint_t symbol0 = SAINT_MIN, p0 = SA[i + 0]; if (p0 > 0) { SA[i + 0] = 0; cache[i + 0].index = (p0 - 1) | ((sa_sint_t)(T[p0 - 2] > T[p0 - 1]) << (SAINT_BIT - 1)); symbol0 = T[p0 - 1]; } cache[i + 0].symbol = symbol0;
+ sa_sint_t symbol1 = SAINT_MIN, p1 = SA[i + 1]; if (p1 > 0) { SA[i + 1] = 0; cache[i + 1].index = (p1 - 1) | ((sa_sint_t)(T[p1 - 2] > T[p1 - 1]) << (SAINT_BIT - 1)); symbol1 = T[p1 - 1]; } cache[i + 1].symbol = symbol1;
+ }
+
+ /* Remainder loop. */
+ for (j += prefetch_distance + 1; i < j; i += 1)
+ {
+ sa_sint_t symbol = SAINT_MIN, p = SA[i]; if (p > 0) { SA[i] = 0; cache[i].index = (p - 1) | ((sa_sint_t)(T[p - 2] > T[p - 1]) << (SAINT_BIT - 1)); symbol = T[p - 1]; } cache[i].symbol = symbol;
+ }
+}
+
+/* Sequential phase 2 (6k variant), run by the master thread only: replays the
+ * cached block right-to-left against the shared buckets.  After placement the
+ * cache entry's symbol field is reused to hold the destination slot; when that
+ * slot falls inside the current block (>= omp_block_start), the induced value
+ * is patched straight into the cache so the later (leftward) visit of that
+ * slot sees it -- mirroring what the serial scan would have observed in SA.
+ * Returns the updated suffix-group counter d. */
+static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_6k_block_sort(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ fast_sint_t i, j;
+ for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2)
+ {
+ libsais_prefetchw(&cache[i - 2 * prefetch_distance]);
+
+ libsais_prefetchw(&buckets[cache[i - prefetch_distance - 0].symbol]);
+ libsais_prefetchw(&buckets[cache[i - prefetch_distance - 1].symbol]);
+
+ sa_sint_t v0 = cache[i - 0].symbol, p0 = cache[i - 0].index; d += (p0 < 0); cache[i - 0].symbol = --buckets[v0]; cache[i - 0].index = (p0 - 1) | ((sa_sint_t)(buckets[2 + v0] != d) << (SAINT_BIT - 1)); buckets[2 + v0] = d;
+ if (cache[i - 0].symbol >= omp_block_start) { sa_sint_t s = cache[i - 0].symbol, q = (cache[s].index = cache[i - 0].index) & SAINT_MAX; cache[s].symbol = BUCKETS_INDEX4(T[q - 1], T[q - 2] > T[q - 1]); }
+
+ sa_sint_t v1 = cache[i - 1].symbol, p1 = cache[i - 1].index; d += (p1 < 0); cache[i - 1].symbol = --buckets[v1]; cache[i - 1].index = (p1 - 1) | ((sa_sint_t)(buckets[2 + v1] != d) << (SAINT_BIT - 1)); buckets[2 + v1] = d;
+ if (cache[i - 1].symbol >= omp_block_start) { sa_sint_t s = cache[i - 1].symbol, q = (cache[s].index = cache[i - 1].index) & SAINT_MAX; cache[s].symbol = BUCKETS_INDEX4(T[q - 1], T[q - 2] > T[q - 1]); }
+ }
+
+ /* Remainder loop. */
+ for (j -= prefetch_distance + 1; i >= j; i -= 1)
+ {
+ sa_sint_t v = cache[i].symbol, p = cache[i].index; d += (p < 0); cache[i].symbol = --buckets[v]; cache[i].index = (p - 1) | ((sa_sint_t)(buckets[2 + v] != d) << (SAINT_BIT - 1)); buckets[2 + v] = d;
+ if (cache[i].symbol >= omp_block_start) { sa_sint_t s = cache[i].symbol, q = (cache[s].index = cache[i].index) & SAINT_MAX; cache[s].symbol = BUCKETS_INDEX4(T[q - 1], T[q - 2] > T[q - 1]); }
+ }
+
+ return d;
+}
+
+/* Sequential phase 2 (4k variant), master thread only: replays cached entries
+ * right-to-left against the shared 4k buckets.  Entries with symbol < 0
+ * (SAINT_MIN sentinel from the gather phase) are skipped.  v >> 1 recovers
+ * the plain symbol for the induction counter while v itself indexes the
+ * BUCKETS_INDEX2 distinct-names table; the low bit of v supplies the L/S
+ * flag shifted into the sign bit of the stored value.  As in the 6k variant,
+ * placements that land inside the current block are patched into the cache
+ * so later iterations observe them.  Returns the updated d. */
+static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_4k_block_sort(const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ sa_sint_t * RESTRICT induction_bucket = &buckets[3 * k];
+ sa_sint_t * RESTRICT distinct_names = &buckets[0 * k];
+
+ fast_sint_t i, j;
+ for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2)
+ {
+ libsais_prefetchw(&cache[i - 2 * prefetch_distance]);
+
+ sa_sint_t s0 = cache[i - prefetch_distance - 0].symbol; const sa_sint_t * Is0 = &induction_bucket[s0 >> 1]; libsais_prefetchw(s0 >= 0 ? Is0 : NULL); const sa_sint_t * Ds0 = &distinct_names[s0]; libsais_prefetchw(s0 >= 0 ? Ds0 : NULL);
+ sa_sint_t s1 = cache[i - prefetch_distance - 1].symbol; const sa_sint_t * Is1 = &induction_bucket[s1 >> 1]; libsais_prefetchw(s1 >= 0 ? Is1 : NULL); const sa_sint_t * Ds1 = &distinct_names[s1]; libsais_prefetchw(s1 >= 0 ? Ds1 : NULL);
+
+ sa_sint_t v0 = cache[i - 0].symbol;
+ if (v0 >= 0)
+ {
+ sa_sint_t p0 = cache[i - 0].index; d += (p0 >> (SUFFIX_GROUP_BIT - 1)); cache[i - 0].symbol = --induction_bucket[v0 >> 1]; cache[i - 0].index = (p0 - 1) | (v0 << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v0] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v0] = d;
+ if (cache[i - 0].symbol >= omp_block_start) { sa_sint_t ni = cache[i - 0].symbol, np = cache[i - 0].index; if (np > 0) { cache[i - 0].index = 0; cache[ni].index = np; np &= ~SUFFIX_GROUP_MARKER; cache[ni].symbol = BUCKETS_INDEX2(T[np - 1], T[np - 2] > T[np - 1]); } }
+ }
+
+ sa_sint_t v1 = cache[i - 1].symbol;
+ if (v1 >= 0)
+ {
+ sa_sint_t p1 = cache[i - 1].index; d += (p1 >> (SUFFIX_GROUP_BIT - 1)); cache[i - 1].symbol = --induction_bucket[v1 >> 1]; cache[i - 1].index = (p1 - 1) | (v1 << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v1] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v1] = d;
+ if (cache[i - 1].symbol >= omp_block_start) { sa_sint_t ni = cache[i - 1].symbol, np = cache[i - 1].index; if (np > 0) { cache[i - 1].index = 0; cache[ni].index = np; np &= ~SUFFIX_GROUP_MARKER; cache[ni].symbol = BUCKETS_INDEX2(T[np - 1], T[np - 2] > T[np - 1]); } }
+ }
+ }
+
+ /* Remainder loop. */
+ for (j -= prefetch_distance + 1; i >= j; i -= 1)
+ {
+ sa_sint_t v = cache[i].symbol;
+ if (v >= 0)
+ {
+ sa_sint_t p = cache[i].index; d += (p >> (SUFFIX_GROUP_BIT - 1)); cache[i].symbol = --induction_bucket[v >> 1]; cache[i].index = (p - 1) | (v << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v] = d;
+ if (cache[i].symbol >= omp_block_start) { sa_sint_t ni = cache[i].symbol, np = cache[i].index; if (np > 0) { cache[i].index = 0; cache[ni].index = np; np &= ~SUFFIX_GROUP_MARKER; cache[ni].symbol = BUCKETS_INDEX2(T[np - 1], T[np - 2] > T[np - 1]); } }
+ }
+ }
+
+ return d;
+}
+
+/* Sequential phase 2 (1k variant), master thread only: for each cached entry
+ * with a valid symbol (v >= 0), claims the next slot in its induction bucket.
+ * The gather phase already computed the induced value in .index, so this pass
+ * only assigns destinations; destinations inside the current block are
+ * expanded in the cache immediately so later iterations can induce from them. */
+static void libsais_partial_sorting_scan_right_to_left_32s_1k_block_sort(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ fast_sint_t i, j;
+ for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2)
+ {
+ libsais_prefetchw(&cache[i - 2 * prefetch_distance]);
+
+ sa_sint_t s0 = cache[i - prefetch_distance - 0].symbol; const sa_sint_t * Is0 = &induction_bucket[s0]; libsais_prefetchw(s0 >= 0 ? Is0 : NULL);
+ sa_sint_t s1 = cache[i - prefetch_distance - 1].symbol; const sa_sint_t * Is1 = &induction_bucket[s1]; libsais_prefetchw(s1 >= 0 ? Is1 : NULL);
+
+ sa_sint_t v0 = cache[i - 0].symbol;
+ if (v0 >= 0)
+ {
+ cache[i - 0].symbol = --induction_bucket[v0];
+ if (cache[i - 0].symbol >= omp_block_start) { sa_sint_t ni = cache[i - 0].symbol, np = cache[i - 0].index; if (np > 0) { cache[i - 0].index = 0; cache[ni].index = (np - 1) | ((sa_sint_t)(T[np - 2] > T[np - 1]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np - 1]; } }
+ }
+
+ sa_sint_t v1 = cache[i - 1].symbol;
+ if (v1 >= 0)
+ {
+ cache[i - 1].symbol = --induction_bucket[v1];
+ if (cache[i - 1].symbol >= omp_block_start) { sa_sint_t ni = cache[i - 1].symbol, np = cache[i - 1].index; if (np > 0) { cache[i - 1].index = 0; cache[ni].index = (np - 1) | ((sa_sint_t)(T[np - 2] > T[np - 1]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np - 1]; }}
+ }
+ }
+
+ /* Remainder loop. */
+ for (j -= prefetch_distance + 1; i >= j; i -= 1)
+ {
+ sa_sint_t v = cache[i].symbol;
+ if (v >= 0)
+ {
+ cache[i].symbol = --induction_bucket[v];
+ if (cache[i].symbol >= omp_block_start) { sa_sint_t ni = cache[i].symbol, np = cache[i].index; if (np > 0) { cache[i].index = 0; cache[ni].index = (np - 1) | ((sa_sint_t)(T[np - 2] > T[np - 1]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np - 1]; } }
+ }
+ }
+}
+
+/* Parallel driver for one right-to-left block (6k variant).  Pipeline:
+ * gather (all threads) -> barrier -> sort (master only, owns the shared
+ * buckets and the d counter) -> barrier -> place results back into SA (all
+ * threads).  Falls back to the plain serial scan when OpenMP ends up running
+ * a single thread.  Returns the updated group counter d. */
+static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_6k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads)
+{
+#if defined(_OPENMP)
+ #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384)
+#endif
+ {
+#if defined(_OPENMP)
+ fast_sint_t omp_thread_num = omp_get_thread_num();
+ fast_sint_t omp_num_threads = omp_get_num_threads();
+#else
+ UNUSED(threads); UNUSED(cache);
+
+ fast_sint_t omp_thread_num = 0;
+ fast_sint_t omp_num_threads = 1;
+#endif
+ /* Even split in 16-entry multiples; the last thread takes the remainder. */
+ fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
+ fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+ fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
+
+ omp_block_start += block_start;
+
+ if (omp_num_threads == 1)
+ {
+ d = libsais_partial_sorting_scan_right_to_left_32s_6k(T, SA, buckets, d, omp_block_start, omp_block_size);
+ }
+#if defined(_OPENMP)
+ else
+ {
+ {
+ /* cache - block_start lets phases index the cache by absolute SA position. */
+ libsais_partial_sorting_scan_right_to_left_32s_6k_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size);
+ }
+
+ #pragma omp barrier
+
+ #pragma omp master
+ {
+ d = libsais_partial_sorting_scan_right_to_left_32s_6k_block_sort(T, buckets, d, cache - block_start, block_start, block_size);
+ }
+
+ #pragma omp barrier
+
+ {
+ libsais_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size);
+ }
+ }
+#endif
+ }
+
+ return d;
+}
+
+/* Parallel driver for one right-to-left block (4k variant): gather -> barrier
+ * -> master-only sort -> barrier -> compact-and-place.  The compacting placer
+ * is used because gather marks workless entries with a SAINT_MIN symbol.
+ * Serial fallback when only one thread runs.  Returns the updated d. */
+static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_4k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads)
+{
+#if defined(_OPENMP)
+ #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384)
+#endif
+ {
+#if defined(_OPENMP)
+ fast_sint_t omp_thread_num = omp_get_thread_num();
+ fast_sint_t omp_num_threads = omp_get_num_threads();
+#else
+ UNUSED(threads); UNUSED(cache);
+
+ fast_sint_t omp_thread_num = 0;
+ fast_sint_t omp_num_threads = 1;
+#endif
+ /* Even split in 16-entry multiples; the last thread takes the remainder. */
+ fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
+ fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+ fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
+
+ omp_block_start += block_start;
+
+ if (omp_num_threads == 1)
+ {
+ d = libsais_partial_sorting_scan_right_to_left_32s_4k(T, SA, k, buckets, d, omp_block_start, omp_block_size);
+ }
+#if defined(_OPENMP)
+ else
+ {
+ {
+ libsais_partial_sorting_scan_right_to_left_32s_4k_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size);
+ }
+
+ #pragma omp barrier
+
+ #pragma omp master
+ {
+ d = libsais_partial_sorting_scan_right_to_left_32s_4k_block_sort(T, k, buckets, d, cache - block_start, block_start, block_size);
+ }
+
+ #pragma omp barrier
+
+ {
+ libsais_compact_and_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size);
+ }
+ }
+#endif
+ }
+
+ return d;
+}
+
+/* Parallel driver for one right-to-left block (1k variant): gather -> barrier
+ * -> master-only sort -> barrier -> compact-and-place.  No group counter is
+ * tracked in this variant, hence the void return.  Serial fallback when only
+ * one thread runs. */
+static void libsais_partial_sorting_scan_right_to_left_32s_1k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads)
+{
+#if defined(_OPENMP)
+ #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384)
+#endif
+ {
+#if defined(_OPENMP)
+ fast_sint_t omp_thread_num = omp_get_thread_num();
+ fast_sint_t omp_num_threads = omp_get_num_threads();
+#else
+ UNUSED(threads); UNUSED(cache);
+
+ fast_sint_t omp_thread_num = 0;
+ fast_sint_t omp_num_threads = 1;
+#endif
+ /* Even split in 16-entry multiples; the last thread takes the remainder. */
+ fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
+ fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+ fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
+
+ omp_block_start += block_start;
+
+ if (omp_num_threads == 1)
+ {
+ libsais_partial_sorting_scan_right_to_left_32s_1k(T, SA, buckets, omp_block_start, omp_block_size);
+ }
+#if defined(_OPENMP)
+ else
+ {
+ {
+ libsais_partial_sorting_scan_right_to_left_32s_1k_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size);
+ }
+
+ #pragma omp barrier
+
+ #pragma omp master
+ {
+ libsais_partial_sorting_scan_right_to_left_32s_1k_block_sort(T, buckets, cache - block_start, block_start, block_size);
+ }
+
+ #pragma omp barrier
+
+ {
+ libsais_compact_and_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size);
+ }
+ }
+#endif
+ }
+}
+
+#endif
+
+/* Top-level right-to-left scan dispatcher (32-bit, 6k variant).  Serial scan
+ * for a single thread or a small range; otherwise walks SA backwards in
+ * chunks of threads * LIBSAIS_PER_THREAD_CACHE_SIZE entries, each handled by
+ * the parallel block driver.  Returns the updated group counter d. */
+static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_6k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count, sa_sint_t d, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+ fast_sint_t scan_start = (fast_sint_t)left_suffixes_count + 1;
+ fast_sint_t scan_end = (fast_sint_t)n - (fast_sint_t)first_lms_suffix;
+
+ if (threads == 1 || (scan_end - scan_start) < 65536)
+ {
+ d = libsais_partial_sorting_scan_right_to_left_32s_6k(T, SA, buckets, d, scan_start, scan_end - scan_start);
+ }
+#if defined(_OPENMP)
+ else
+ {
+ fast_sint_t block_start, block_end;
+ for (block_start = scan_end - 1; block_start >= scan_start; block_start = block_end)
+ {
+ block_end = block_start - (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end < scan_start) { block_end = scan_start - 1; }
+
+ d = libsais_partial_sorting_scan_right_to_left_32s_6k_block_omp(T, SA, buckets, d, thread_state[0].state.cache, block_end + 1, block_start - block_end, threads);
+ }
+ }
+#else
+ UNUSED(thread_state);
+#endif
+
+ return d;
+}
+
+/* Top-level right-to-left scan dispatcher (32-bit, 4k variant).  Unlike the
+ * 6k variant, the whole of SA[0..n) is scanned.  Serial for one thread or
+ * small n; otherwise processed backwards in cache-sized chunks via the
+ * parallel block driver.  Returns the updated group counter d. */
+static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_4k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+ if (threads == 1 || n < 65536)
+ {
+ d = libsais_partial_sorting_scan_right_to_left_32s_4k(T, SA, k, buckets, d, 0, n);
+ }
+#if defined(_OPENMP)
+ else
+ {
+ fast_sint_t block_start, block_end;
+ for (block_start = (fast_sint_t)n - 1; block_start >= 0; block_start = block_end)
+ {
+ block_end = block_start - (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end < 0) { block_end = -1; }
+
+ d = libsais_partial_sorting_scan_right_to_left_32s_4k_block_omp(T, SA, k, buckets, d, thread_state[0].state.cache, block_end + 1, block_start - block_end, threads);
+ }
+ }
+#else
+ UNUSED(thread_state);
+#endif
+
+ return d;
+}
+
+/* Top-level right-to-left scan dispatcher (32-bit, 1k variant).  Scans all of
+ * SA[0..n): serial for one thread or small n, otherwise backwards in
+ * cache-sized chunks via the parallel block driver.  No group counter. */
+static void libsais_partial_sorting_scan_right_to_left_32s_1k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+ if (threads == 1 || n < 65536)
+ {
+ libsais_partial_sorting_scan_right_to_left_32s_1k(T, SA, buckets, 0, n);
+ }
+#if defined(_OPENMP)
+ else
+ {
+ fast_sint_t block_start, block_end;
+ for (block_start = (fast_sint_t)n - 1; block_start >= 0; block_start = block_end)
+ {
+ block_end = block_start - (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end < 0) { block_end = -1; }
+
+ libsais_partial_sorting_scan_right_to_left_32s_1k_block_omp(T, SA, buckets, thread_state[0].state.cache, block_end + 1, block_start - block_end, threads);
+ }
+ }
+#else
+ UNUSED(thread_state);
+#endif
+}
+
+/* Branchless compaction (4k variant): sweeps SA[omp_block_start ..
+ * omp_block_start + omp_block_size) left to right, keeping only marked
+ * (negative) entries, with marker bits stripped via the
+ * subtract-and-mask of SUFFIX_GROUP_MARKER.  Each slot is written
+ * unconditionally but the write cursor l only advances for kept entries.
+ * Returns l, one past the last kept entry. */
+static fast_sint_t libsais_partial_sorting_gather_lms_suffixes_32s_4k(sa_sint_t * RESTRICT SA, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ fast_sint_t i, j, l;
+ /* Unrolled by four; l <= i always holds, so writes never clobber unread input. */
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - 3, l = omp_block_start; i < j; i += 4)
+ {
+ libsais_prefetch(&SA[i + prefetch_distance]);
+
+ sa_sint_t s0 = SA[i + 0]; SA[l] = (s0 - SUFFIX_GROUP_MARKER) & (~SUFFIX_GROUP_MARKER); l += (s0 < 0);
+ sa_sint_t s1 = SA[i + 1]; SA[l] = (s1 - SUFFIX_GROUP_MARKER) & (~SUFFIX_GROUP_MARKER); l += (s1 < 0);
+ sa_sint_t s2 = SA[i + 2]; SA[l] = (s2 - SUFFIX_GROUP_MARKER) & (~SUFFIX_GROUP_MARKER); l += (s2 < 0);
+ sa_sint_t s3 = SA[i + 3]; SA[l] = (s3 - SUFFIX_GROUP_MARKER) & (~SUFFIX_GROUP_MARKER); l += (s3 < 0);
+ }
+
+ /* Remainder loop. */
+ for (j += 3; i < j; i += 1)
+ {
+ sa_sint_t s = SA[i]; SA[l] = (s - SUFFIX_GROUP_MARKER) & (~SUFFIX_GROUP_MARKER); l += (s < 0);
+ }
+
+ return l;
+}
+
+/* Branchless compaction (1k variant): keeps only negative SA entries of the
+ * range, clearing the sign bit with & SAINT_MAX.  Writes are unconditional;
+ * the cursor l advances only for kept entries.  Returns one past the last
+ * kept entry. */
+static fast_sint_t libsais_partial_sorting_gather_lms_suffixes_32s_1k(sa_sint_t * RESTRICT SA, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ fast_sint_t i, j, l;
+ /* Unrolled by four; l <= i always holds, so the compaction is in-place safe. */
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - 3, l = omp_block_start; i < j; i += 4)
+ {
+ libsais_prefetch(&SA[i + prefetch_distance]);
+
+ sa_sint_t s0 = SA[i + 0]; SA[l] = s0 & SAINT_MAX; l += (s0 < 0);
+ sa_sint_t s1 = SA[i + 1]; SA[l] = s1 & SAINT_MAX; l += (s1 < 0);
+ sa_sint_t s2 = SA[i + 2]; SA[l] = s2 & SAINT_MAX; l += (s2 < 0);
+ sa_sint_t s3 = SA[i + 3]; SA[l] = s3 & SAINT_MAX; l += (s3 < 0);
+ }
+
+ /* Remainder loop. */
+ for (j += 3; i < j; i += 1)
+ {
+ sa_sint_t s = SA[i]; SA[l] = s & SAINT_MAX; l += (s < 0);
+ }
+
+ return l;
+}
+
+/* Parallel wrapper for the 4k gather: each thread compacts its own sub-range
+ * in place and records (position, count) in its thread state; after a
+ * barrier, the master stitches the per-thread runs together at the front of
+ * SA with memmove.  Thread 0's run is already in place, hence the t > 0 test. */
+static void libsais_partial_sorting_gather_lms_suffixes_32s_4k_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+#if defined(_OPENMP)
+ #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536)
+#endif
+ {
+#if defined(_OPENMP)
+ fast_sint_t omp_thread_num = omp_get_thread_num();
+ fast_sint_t omp_num_threads = omp_get_num_threads();
+#else
+ UNUSED(threads); UNUSED(thread_state);
+
+ fast_sint_t omp_thread_num = 0;
+ fast_sint_t omp_num_threads = 1;
+#endif
+ /* Even split in 16-entry multiples; the last thread takes the remainder. */
+ fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
+ fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+ fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;
+
+ if (omp_num_threads == 1)
+ {
+ libsais_partial_sorting_gather_lms_suffixes_32s_4k(SA, omp_block_start, omp_block_size);
+ }
+#if defined(_OPENMP)
+ else
+ {
+ {
+ thread_state[omp_thread_num].state.position = omp_block_start;
+ thread_state[omp_thread_num].state.count = libsais_partial_sorting_gather_lms_suffixes_32s_4k(SA, omp_block_start, omp_block_size) - omp_block_start;
+ }
+
+ #pragma omp barrier
+
+ #pragma omp master
+ {
+ fast_sint_t t, position = 0;
+ for (t = 0; t < omp_num_threads; ++t)
+ {
+ if (t > 0 && thread_state[t].state.count > 0)
+ {
+ memmove(&SA[position], &SA[thread_state[t].state.position], (size_t)thread_state[t].state.count * sizeof(sa_sint_t));
+ }
+
+ position += thread_state[t].state.count;
+ }
+ }
+ }
+#endif
+ }
+}
+
+/* Parallel wrapper for the 1k gather; identical structure to the 4k wrapper:
+ * per-thread in-place compaction, barrier, then master memmoves the
+ * per-thread runs into one contiguous prefix of SA. */
+static void libsais_partial_sorting_gather_lms_suffixes_32s_1k_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+#if defined(_OPENMP)
+ #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536)
+#endif
+ {
+#if defined(_OPENMP)
+ fast_sint_t omp_thread_num = omp_get_thread_num();
+ fast_sint_t omp_num_threads = omp_get_num_threads();
+#else
+ UNUSED(threads); UNUSED(thread_state);
+
+ fast_sint_t omp_thread_num = 0;
+ fast_sint_t omp_num_threads = 1;
+#endif
+ /* Even split in 16-entry multiples; the last thread takes the remainder. */
+ fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
+ fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+ fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;
+
+ if (omp_num_threads == 1)
+ {
+ libsais_partial_sorting_gather_lms_suffixes_32s_1k(SA, omp_block_start, omp_block_size);
+ }
+#if defined(_OPENMP)
+ else
+ {
+ {
+ thread_state[omp_thread_num].state.position = omp_block_start;
+ thread_state[omp_thread_num].state.count = libsais_partial_sorting_gather_lms_suffixes_32s_1k(SA, omp_block_start, omp_block_size) - omp_block_start;
+ }
+
+ #pragma omp barrier
+
+ #pragma omp master
+ {
+ fast_sint_t t, position = 0;
+ for (t = 0; t < omp_num_threads; ++t)
+ {
+ if (t > 0 && thread_state[t].state.count > 0)
+ {
+ memmove(&SA[position], &SA[thread_state[t].state.position], (size_t)thread_state[t].state.count * sizeof(sa_sint_t));
+ }
+
+ position += thread_state[t].state.count;
+ }
+ }
+ }
+#endif
+ }
+}
+
+/* Induces the partial order of suffixes for 8-bit input: clears the
+ * distinct-names area of the bucket table (the 2 * ALPHABET_SIZE words after
+ * the induction counters), runs the left-to-right scan, shifts the group
+ * markers, then runs the right-to-left scan with the group counter d carried
+ * between the two scans. */
+static void libsais_induce_partial_order_8u_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+ memset(&buckets[2 * ALPHABET_SIZE], 0, 2 * ALPHABET_SIZE * sizeof(sa_sint_t));
+
+ sa_sint_t d = libsais_partial_sorting_scan_left_to_right_8u_omp(T, SA, n, buckets, left_suffixes_count, 0, threads, thread_state);
+ libsais_partial_sorting_shift_markers_8u_omp(SA, n, buckets, threads);
+ libsais_partial_sorting_scan_right_to_left_8u_omp(T, SA, n, buckets, first_lms_suffix, left_suffixes_count, d, threads, thread_state);
+}
+
+/* Induces the partial order for 32-bit input, 6k-bucket variant:
+ * left-to-right scan, marker shift, bucket shift, then the right-to-left
+ * scan reusing the group counter d from the first pass. */
+static void libsais_induce_partial_order_32s_6k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+ sa_sint_t d = libsais_partial_sorting_scan_left_to_right_32s_6k_omp(T, SA, n, buckets, left_suffixes_count, 0, threads, thread_state);
+ libsais_partial_sorting_shift_markers_32s_6k_omp(SA, k, buckets, threads);
+ libsais_partial_sorting_shift_buckets_32s_6k(k, buckets);
+ libsais_partial_sorting_scan_right_to_left_32s_6k_omp(T, SA, n, buckets, first_lms_suffix, left_suffixes_count, d, threads, thread_state);
+}
+
+/* Induces the partial order for 32-bit input, 4k-bucket variant: zeroes the
+ * distinct-names region (first 2k words of buckets), runs both scans with d
+ * carried across, then compacts the marked LMS suffixes to the front of SA. */
+static void libsais_induce_partial_order_32s_4k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+ memset(buckets, 0, 2 * (size_t)k * sizeof(sa_sint_t));
+
+ sa_sint_t d = libsais_partial_sorting_scan_left_to_right_32s_4k_omp(T, SA, n, k, buckets, 0, threads, thread_state);
+ libsais_partial_sorting_shift_markers_32s_4k(SA, n);
+ libsais_partial_sorting_scan_right_to_left_32s_4k_omp(T, SA, n, k, buckets, d, threads, thread_state);
+ libsais_partial_sorting_gather_lms_suffixes_32s_4k_omp(SA, n, threads, thread_state);
+}
+
+/* Induction for the 32-bit variant with only 2k words of bucket space:
+   the two halves of `buckets` serve the two scan directions separately. */
+static void libsais_induce_partial_order_32s_2k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+ libsais_partial_sorting_scan_left_to_right_32s_1k_omp(T, SA, n, &buckets[1 * k], threads, thread_state);
+ libsais_partial_sorting_scan_right_to_left_32s_1k_omp(T, SA, n, &buckets[0 * k], threads, thread_state);
+ libsais_partial_sorting_gather_lms_suffixes_32s_1k_omp(SA, n, threads, thread_state);
+}
+
+/* Induction for the most memory-constrained 32-bit variant (1k bucket
+   words): the single bucket array is recounted and re-initialized before
+   each scan direction instead of keeping two copies. */
+static void libsais_induce_partial_order_32s_1k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+ libsais_count_suffixes_32s(T, n, k, buckets);
+ libsais_initialize_buckets_start_32s_1k(k, buckets);
+ libsais_partial_sorting_scan_left_to_right_32s_1k_omp(T, SA, n, buckets, threads, thread_state);
+
+ /* recount: the left-to-right scan consumed the bucket start positions */
+ libsais_count_suffixes_32s(T, n, k, buckets);
+ libsais_initialize_buckets_end_32s_1k(k, buckets);
+ libsais_partial_sorting_scan_right_to_left_32s_1k_omp(T, SA, n, buckets, threads, thread_state);
+
+ libsais_partial_sorting_gather_lms_suffixes_32s_1k_omp(SA, n, threads, thread_state);
+}
+
+/* Assigns names (ranks) to the m sorted LMS suffixes held in SA[0..m).
+   A negative (sign-marked) entry starts a new name; the name is stored,
+   itself sign-marked, into the upper half at SA[m + (suffix >> 1)].
+   Returns the running name counter after this [omp_block_start,
+   omp_block_start + omp_block_size) slice. Loop is 4x unrolled with
+   software prefetch; the tail loop finishes the remainder. */
+static sa_sint_t libsais_renumber_lms_suffixes_8u(sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t name, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ sa_sint_t * RESTRICT SAm = &SA[m];
+
+ fast_sint_t i, j;
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4)
+ {
+ libsais_prefetch(&SA[i + 2 * prefetch_distance]);
+
+ libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 0] & SAINT_MAX) >> 1]);
+ libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 1] & SAINT_MAX) >> 1]);
+ libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 2] & SAINT_MAX) >> 1]);
+ libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 3] & SAINT_MAX) >> 1]);
+
+ /* `p < 0` is the "differs from predecessor" mark; branchless increment */
+ sa_sint_t p0 = SA[i + 0]; SAm[(p0 & SAINT_MAX) >> 1] = name | SAINT_MIN; name += p0 < 0;
+ sa_sint_t p1 = SA[i + 1]; SAm[(p1 & SAINT_MAX) >> 1] = name | SAINT_MIN; name += p1 < 0;
+ sa_sint_t p2 = SA[i + 2]; SAm[(p2 & SAINT_MAX) >> 1] = name | SAINT_MIN; name += p2 < 0;
+ sa_sint_t p3 = SA[i + 3]; SAm[(p3 & SAINT_MAX) >> 1] = name | SAINT_MIN; name += p3 < 0;
+ }
+
+ for (j += prefetch_distance + 3; i < j; i += 1)
+ {
+ sa_sint_t p = SA[i]; SAm[(p & SAINT_MAX) >> 1] = name | SAINT_MIN; name += p < 0;
+ }
+
+ return name;
+}
+
+/* Compacts sign-marked entries from SA[m + block) backwards into SA ending
+   just before index `l`, stripping the sign bit. Scans right-to-left so the
+   destination never overtakes the source. Returns the final (lowest) write
+   position. `l -= s < 0` advances the cursor only for marked entries, so
+   unmarked ones are overwritten by the next marked value. */
+static fast_sint_t libsais_gather_marked_suffixes_8u(sa_sint_t * RESTRICT SA, sa_sint_t m, fast_sint_t l, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ l -= 1;
+
+ fast_sint_t i, j;
+ for (i = (fast_sint_t)m + omp_block_start + omp_block_size - 1, j = (fast_sint_t)m + omp_block_start + 3; i >= j; i -= 4)
+ {
+ libsais_prefetch(&SA[i - prefetch_distance]);
+
+ sa_sint_t s0 = SA[i - 0]; SA[l] = s0 & SAINT_MAX; l -= s0 < 0;
+ sa_sint_t s1 = SA[i - 1]; SA[l] = s1 & SAINT_MAX; l -= s1 < 0;
+ sa_sint_t s2 = SA[i - 2]; SA[l] = s2 & SAINT_MAX; l -= s2 < 0;
+ sa_sint_t s3 = SA[i - 3]; SA[l] = s3 & SAINT_MAX; l -= s3 < 0;
+ }
+
+ for (j -= 3; i >= j; i -= 1)
+ {
+ sa_sint_t s = SA[i]; SA[l] = s & SAINT_MAX; l -= s < 0;
+ }
+
+ l += 1;
+
+ return l;
+}
+
+/* OpenMP driver for LMS renumbering. Single-threaded path calls the scalar
+   kernel directly. Multi-threaded path: each thread first counts its marked
+   entries, a barrier publishes the counts, then each thread renumbers its
+   slice starting from the prefix sum of earlier threads' counts. The last
+   thread computes the grand total `name`. Parallelism only kicks in for
+   m >= 65536. Returns the total number of names assigned. */
+static sa_sint_t libsais_renumber_lms_suffixes_8u_omp(sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+ sa_sint_t name = 0;
+
+#if defined(_OPENMP)
+ #pragma omp parallel num_threads(threads) if(threads > 1 && m >= 65536)
+#endif
+ {
+#if defined(_OPENMP)
+ fast_sint_t omp_thread_num = omp_get_thread_num();
+ fast_sint_t omp_num_threads = omp_get_num_threads();
+#else
+ UNUSED(threads); UNUSED(thread_state);
+
+ fast_sint_t omp_thread_num = 0;
+ fast_sint_t omp_num_threads = 1;
+#endif
+ /* 16-aligned slices keep the unrolled kernels on even boundaries */
+ fast_sint_t omp_block_stride = (m / omp_num_threads) & (-16);
+ fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+ fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : m - omp_block_start;
+
+ if (omp_num_threads == 1)
+ {
+ name = libsais_renumber_lms_suffixes_8u(SA, m, 0, omp_block_start, omp_block_size);
+ }
+#if defined(_OPENMP)
+ else
+ {
+ {
+ thread_state[omp_thread_num].state.count = libsais_count_negative_marked_suffixes(SA, omp_block_start, omp_block_size);
+ }
+
+ #pragma omp barrier
+
+ {
+ fast_sint_t t, count = 0; for (t = 0; t < omp_thread_num; ++t) { count += thread_state[t].state.count; }
+
+ if (omp_thread_num == omp_num_threads - 1)
+ {
+ name = (sa_sint_t)(count + thread_state[omp_thread_num].state.count);
+ }
+
+ libsais_renumber_lms_suffixes_8u(SA, m, (sa_sint_t)count, omp_block_start, omp_block_size);
+ }
+ }
+#endif
+ }
+
+ return name;
+}
+
+/* OpenMP driver that compacts marked LMS entries from the upper half of SA
+   toward the end of the SA+fs workspace. Multi-threaded path: each thread
+   compacts its slice in place (recording where its run starts and how many
+   entries it produced), then after a barrier the master thread memmoves the
+   per-thread runs so they form one contiguous run ending at n + fs. */
+static void libsais_gather_marked_lms_suffixes_8u_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+#if defined(_OPENMP)
+ #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 131072)
+#endif
+ {
+#if defined(_OPENMP)
+ fast_sint_t omp_thread_num = omp_get_thread_num();
+ fast_sint_t omp_num_threads = omp_get_num_threads();
+#else
+ UNUSED(threads); UNUSED(thread_state);
+
+ fast_sint_t omp_thread_num = 0;
+ fast_sint_t omp_num_threads = 1;
+#endif
+ /* the scanned region is the n/2 words above SA[m] */
+ fast_sint_t omp_block_stride = (((fast_sint_t)n >> 1) / omp_num_threads) & (-16);
+ fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+ fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : ((fast_sint_t)n >> 1) - omp_block_start;
+
+ if (omp_num_threads == 1)
+ {
+ libsais_gather_marked_suffixes_8u(SA, m, (fast_sint_t)n + (fast_sint_t)fs, omp_block_start, omp_block_size);
+ }
+#if defined(_OPENMP)
+ else
+ {
+ {
+ if (omp_thread_num < omp_num_threads - 1)
+ {
+ /* non-last threads compact within their own slice bounds */
+ thread_state[omp_thread_num].state.position = libsais_gather_marked_suffixes_8u(SA, m, (fast_sint_t)m + omp_block_start + omp_block_size, omp_block_start, omp_block_size);
+ thread_state[omp_thread_num].state.count = (fast_sint_t)m + omp_block_start + omp_block_size - thread_state[omp_thread_num].state.position;
+ }
+ else
+ {
+ /* last thread writes directly against the final end position */
+ thread_state[omp_thread_num].state.position = libsais_gather_marked_suffixes_8u(SA, m, (fast_sint_t)n + (fast_sint_t)fs, omp_block_start, omp_block_size);
+ thread_state[omp_thread_num].state.count = (fast_sint_t)n + (fast_sint_t)fs - thread_state[omp_thread_num].state.position;
+ }
+ }
+
+ #pragma omp barrier
+
+ #pragma omp master
+ {
+ fast_sint_t t, position = (fast_sint_t)n + (fast_sint_t)fs;
+
+ for (t = omp_num_threads - 1; t >= 0; --t)
+ {
+ position -= thread_state[t].state.count;
+ if (t != omp_num_threads - 1 && thread_state[t].state.count > 0)
+ {
+ memmove(&SA[position], &SA[thread_state[t].state.position], (size_t)thread_state[t].state.count * sizeof(sa_sint_t));
+ }
+ }
+ }
+ }
+#endif
+ }
+}
+
+/* Renumbers the m sorted LMS suffixes and, when names are not all distinct
+   (name < m), gathers the marked entries for the recursive round. When all
+   names are distinct no recursion is needed, so only the sign bits are
+   stripped. Returns the number of distinct names. */
+static sa_sint_t libsais_renumber_and_gather_lms_suffixes_8u_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+ /* clear the name area in the upper half before scatter writes */
+ memset(&SA[m], 0, ((size_t)n >> 1) * sizeof(sa_sint_t));
+
+ sa_sint_t name = libsais_renumber_lms_suffixes_8u_omp(SA, m, threads, thread_state);
+ if (name < m)
+ {
+ libsais_gather_marked_lms_suffixes_8u_omp(SA, n, m, fs, threads, thread_state);
+ }
+ else
+ {
+ fast_sint_t i; for (i = 0; i < m; i += 1) { SA[i] &= SAINT_MAX; }
+ }
+
+ return name;
+}
+
+/* 32-bit (4k) renumbering kernel. Like the 8u version, but each entry is
+   also cleared of its sign bit in place, and the stored name carries the
+   sign bit only when both the current and previous entries were marked
+   (p & prev & SAINT_MIN) — i.e. a run of distinct suffixes. Returns the
+   running name counter. */
+static sa_sint_t libsais_renumber_distinct_lms_suffixes_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t name, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ sa_sint_t * RESTRICT SAm = &SA[m];
+
+ /* p3 carries the previous entry's mark across iterations (0 = unmarked) */
+ fast_sint_t i, j; sa_sint_t p0, p1, p2, p3 = 0;
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4)
+ {
+ libsais_prefetchw(&SA[i + 2 * prefetch_distance]);
+
+ libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 0] & SAINT_MAX) >> 1]);
+ libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 1] & SAINT_MAX) >> 1]);
+ libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 2] & SAINT_MAX) >> 1]);
+ libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 3] & SAINT_MAX) >> 1]);
+
+ p0 = SA[i + 0]; SAm[(SA[i + 0] = p0 & SAINT_MAX) >> 1] = name | (p0 & p3 & SAINT_MIN); name += p0 < 0;
+ p1 = SA[i + 1]; SAm[(SA[i + 1] = p1 & SAINT_MAX) >> 1] = name | (p1 & p0 & SAINT_MIN); name += p1 < 0;
+ p2 = SA[i + 2]; SAm[(SA[i + 2] = p2 & SAINT_MAX) >> 1] = name | (p2 & p1 & SAINT_MIN); name += p2 < 0;
+ p3 = SA[i + 3]; SAm[(SA[i + 3] = p3 & SAINT_MAX) >> 1] = name | (p3 & p2 & SAINT_MIN); name += p3 < 0;
+ }
+
+ for (j += prefetch_distance + 3; i < j; i += 1)
+ {
+ p2 = p3; p3 = SA[i]; SAm[(SA[i] = p3 & SAINT_MAX) >> 1] = name | (p3 & p2 & SAINT_MIN); name += p3 < 0;
+ }
+
+ return name;
+}
+
+/* Sweeps the name area SA[m..m+block) left-to-right: clears the sign bit of
+   an entry whenever the previous non-zero entry was unmarked
+   (p & (prev | SAINT_MAX)), and propagates the last non-zero value through
+   zero gaps so the comparison always uses the preceding real entry. */
+static void libsais_mark_distinct_lms_suffixes_32s(sa_sint_t * RESTRICT SA, sa_sint_t m, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ fast_sint_t i, j; sa_sint_t p0, p1, p2, p3 = 0;
+ for (i = (fast_sint_t)m + omp_block_start, j = (fast_sint_t)m + omp_block_start + omp_block_size - 3; i < j; i += 4)
+ {
+ libsais_prefetchw(&SA[i + prefetch_distance]);
+
+ p0 = SA[i + 0]; SA[i + 0] = p0 & (p3 | SAINT_MAX); p0 = (p0 == 0) ? p3 : p0;
+ p1 = SA[i + 1]; SA[i + 1] = p1 & (p0 | SAINT_MAX); p1 = (p1 == 0) ? p0 : p1;
+ p2 = SA[i + 2]; SA[i + 2] = p2 & (p1 | SAINT_MAX); p2 = (p2 == 0) ? p1 : p2;
+ p3 = SA[i + 3]; SA[i + 3] = p3 & (p2 | SAINT_MAX); p3 = (p3 == 0) ? p2 : p3;
+ }
+
+ for (j += 3; i < j; i += 1)
+ {
+ p2 = p3; p3 = SA[i]; SA[i] = p3 & (p2 | SAINT_MAX); p3 = (p3 == 0) ? p2 : p3;
+ }
+}
+
+/* Normalizes the length area above SA[m]: entries with the sign bit set
+   keep their magnitude (sign bit stripped), all others are zeroed. The
+   select-then-mask form is branch-friendly. */
+static void libsais_clamp_lms_suffixes_length_32s(sa_sint_t * RESTRICT SA, sa_sint_t m, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ sa_sint_t * RESTRICT SAm = &SA[m];
+
+ fast_sint_t i, j;
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - 3; i < j; i += 4)
+ {
+ libsais_prefetchw(&SAm[i + prefetch_distance]);
+
+ SAm[i + 0] = (SAm[i + 0] < 0 ? SAm[i + 0] : 0) & SAINT_MAX;
+ SAm[i + 1] = (SAm[i + 1] < 0 ? SAm[i + 1] : 0) & SAINT_MAX;
+ SAm[i + 2] = (SAm[i + 2] < 0 ? SAm[i + 2] : 0) & SAINT_MAX;
+ SAm[i + 3] = (SAm[i + 3] < 0 ? SAm[i + 3] : 0) & SAINT_MAX;
+ }
+
+ for (j += 3; i < j; i += 1)
+ {
+ SAm[i] = (SAm[i] < 0 ? SAm[i] : 0) & SAINT_MAX;
+ }
+}
+
+/* OpenMP driver for 32-bit (4k) distinct-LMS renumbering. Mirrors the 8u
+   driver: per-thread mark counts, barrier, then renumbering from prefix
+   sums — but names start at 1 here, so the grand total is returned as
+   `name - 1`. */
+static sa_sint_t libsais_renumber_distinct_lms_suffixes_32s_4k_omp(sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+ sa_sint_t name = 0;
+
+#if defined(_OPENMP)
+ #pragma omp parallel num_threads(threads) if(threads > 1 && m >= 65536)
+#endif
+ {
+#if defined(_OPENMP)
+ fast_sint_t omp_thread_num = omp_get_thread_num();
+ fast_sint_t omp_num_threads = omp_get_num_threads();
+#else
+ UNUSED(threads); UNUSED(thread_state);
+
+ fast_sint_t omp_thread_num = 0;
+ fast_sint_t omp_num_threads = 1;
+#endif
+ fast_sint_t omp_block_stride = (m / omp_num_threads) & (-16);
+ fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+ fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : m - omp_block_start;
+
+ if (omp_num_threads == 1)
+ {
+ name = libsais_renumber_distinct_lms_suffixes_32s_4k(SA, m, 1, omp_block_start, omp_block_size);
+ }
+#if defined(_OPENMP)
+ else
+ {
+ {
+ thread_state[omp_thread_num].state.count = libsais_count_negative_marked_suffixes(SA, omp_block_start, omp_block_size);
+ }
+
+ #pragma omp barrier
+
+ {
+ /* count starts at 1 — names in this variant are 1-based */
+ fast_sint_t t, count = 1; for (t = 0; t < omp_thread_num; ++t) { count += thread_state[t].state.count; }
+
+ if (omp_thread_num == omp_num_threads - 1)
+ {
+ name = (sa_sint_t)(count + thread_state[omp_thread_num].state.count);
+ }
+
+ libsais_renumber_distinct_lms_suffixes_32s_4k(SA, m, (sa_sint_t)count, omp_block_start, omp_block_size);
+ }
+ }
+#endif
+ }
+
+ return name - 1;
+}
+
+/* Parallel wrapper over libsais_mark_distinct_lms_suffixes_32s: slices the
+   n/2-word name area across threads (16-aligned) and runs the scalar sweep
+   on each slice. */
+static void libsais_mark_distinct_lms_suffixes_32s_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t threads)
+{
+#if defined(_OPENMP)
+ #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 131072)
+#endif
+ {
+#if defined(_OPENMP)
+ fast_sint_t omp_thread_num = omp_get_thread_num();
+ fast_sint_t omp_num_threads = omp_get_num_threads();
+ fast_sint_t omp_block_stride = (((fast_sint_t)n >> 1) / omp_num_threads) & (-16);
+ fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+ fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : ((fast_sint_t)n >> 1) - omp_block_start;
+#else
+ UNUSED(threads);
+
+ fast_sint_t omp_block_start = 0;
+ fast_sint_t omp_block_size = (fast_sint_t)n >> 1;
+#endif
+ libsais_mark_distinct_lms_suffixes_32s(SA, m, omp_block_start, omp_block_size);
+ }
+}
+
+/* Parallel wrapper over libsais_clamp_lms_suffixes_length_32s; same slicing
+   scheme as the mark-distinct wrapper above. */
+static void libsais_clamp_lms_suffixes_length_32s_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t threads)
+{
+#if defined(_OPENMP)
+ #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 131072)
+#endif
+ {
+#if defined(_OPENMP)
+ fast_sint_t omp_thread_num = omp_get_thread_num();
+ fast_sint_t omp_num_threads = omp_get_num_threads();
+ fast_sint_t omp_block_stride = (((fast_sint_t)n >> 1) / omp_num_threads) & (-16);
+ fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+ fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : ((fast_sint_t)n >> 1) - omp_block_start;
+#else
+ UNUSED(threads);
+
+ fast_sint_t omp_block_start = 0;
+ fast_sint_t omp_block_size = (fast_sint_t)n >> 1;
+#endif
+ libsais_clamp_lms_suffixes_length_32s(SA, m, omp_block_start, omp_block_size);
+ }
+}
+
+/* 32-bit (4k) counterpart of renumber-and-gather: clears the name area,
+   renumbers, and when not all names are distinct fixes up the marks for
+   the recursive round. Returns the number of distinct names. */
+static sa_sint_t libsais_renumber_and_mark_distinct_lms_suffixes_32s_4k_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+ memset(&SA[m], 0, ((size_t)n >> 1) * sizeof(sa_sint_t));
+
+ sa_sint_t name = libsais_renumber_distinct_lms_suffixes_32s_4k_omp(SA, m, threads, thread_state);
+ if (name < m)
+ {
+ libsais_mark_distinct_lms_suffixes_32s_omp(SA, n, m, threads);
+ }
+
+ return name;
+}
+
+/* 1k-memory variant: names LMS substrings by direct comparison because no
+   marks survive from induction. Phase 1 gathers LMS positions and records
+   each LMS substring's length (difference of consecutive LMS positions,
+   sign-marked) at SAm[pos >> 1]. Phase 2 clamps that length area. Phase 3
+   walks the sorted LMS suffixes comparing each with its predecessor
+   (equal length, then symbol-by-symbol) and assigns names; `pdiff`/`qdiff`
+   hold SAINT_MIN when the pair differs. Phase 4 re-marks distinct entries
+   if any duplicates exist. Returns the distinct-name count. */
+static sa_sint_t libsais_renumber_and_mark_distinct_lms_suffixes_32s_1k_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t threads)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ sa_sint_t * RESTRICT SAm = &SA[m];
+
+ {
+ libsais_gather_lms_suffixes_32s(T, SA, n);
+
+ /* zero only the part of the name area not about to be overwritten */
+ memset(&SA[m], 0, ((size_t)n - (size_t)m - (size_t)m) * sizeof(sa_sint_t));
+
+ fast_sint_t i, j;
+ for (i = (fast_sint_t)n - (fast_sint_t)m, j = (fast_sint_t)n - 1 - prefetch_distance - 3; i < j; i += 4)
+ {
+ libsais_prefetch(&SA[i + 2 * prefetch_distance]);
+
+ libsais_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 0]) >> 1]);
+ libsais_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 1]) >> 1]);
+ libsais_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 2]) >> 1]);
+ libsais_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 3]) >> 1]);
+
+ SAm[((sa_uint_t)SA[i + 0]) >> 1] = SA[i + 1] - SA[i + 0] + 1 + SAINT_MIN;
+ SAm[((sa_uint_t)SA[i + 1]) >> 1] = SA[i + 2] - SA[i + 1] + 1 + SAINT_MIN;
+ SAm[((sa_uint_t)SA[i + 2]) >> 1] = SA[i + 3] - SA[i + 2] + 1 + SAINT_MIN;
+ SAm[((sa_uint_t)SA[i + 3]) >> 1] = SA[i + 4] - SA[i + 3] + 1 + SAINT_MIN;
+ }
+
+ for (j += prefetch_distance + 3; i < j; i += 1)
+ {
+ SAm[((sa_uint_t)SA[i]) >> 1] = SA[i + 1] - SA[i] + 1 + SAINT_MIN;
+ }
+
+ /* the last LMS suffix (sentinel) has length 1 */
+ SAm[((sa_uint_t)SA[n - 1]) >> 1] = 1 + SAINT_MIN;
+ }
+
+ {
+ libsais_clamp_lms_suffixes_length_32s_omp(SA, n, m, threads);
+ }
+
+ sa_sint_t name = 1;
+
+ {
+ /* 2x-unrolled pairwise comparison loop over the sorted LMS order */
+ fast_sint_t i, j, p = SA[0], plen = SAm[p >> 1]; sa_sint_t pdiff = SAINT_MIN;
+ for (i = 1, j = m - prefetch_distance - 1; i < j; i += 2)
+ {
+ libsais_prefetch(&SA[i + 2 * prefetch_distance]);
+
+ libsais_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 0]) >> 1]); libsais_prefetch(&T[((sa_uint_t)SA[i + prefetch_distance + 0])]);
+ libsais_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 1]) >> 1]); libsais_prefetch(&T[((sa_uint_t)SA[i + prefetch_distance + 1])]);
+
+ fast_sint_t q = SA[i + 0], qlen = SAm[q >> 1]; sa_sint_t qdiff = SAINT_MIN;
+ if (plen == qlen) { fast_sint_t l = 0; do { if (T[p + l] != T[q + l]) { break; } } while (++l < qlen); qdiff = (sa_sint_t)(l - qlen) & SAINT_MIN; }
+ SAm[p >> 1] = name | (pdiff & qdiff); name += (qdiff < 0);
+
+ p = SA[i + 1]; plen = SAm[p >> 1]; pdiff = SAINT_MIN;
+ if (qlen == plen) { fast_sint_t l = 0; do { if (T[q + l] != T[p + l]) { break; } } while (++l < plen); pdiff = (sa_sint_t)(l - plen) & SAINT_MIN; }
+ SAm[q >> 1] = name | (qdiff & pdiff); name += (pdiff < 0);
+ }
+
+ for (j += prefetch_distance + 1; i < j; i += 1)
+ {
+ fast_sint_t q = SA[i], qlen = SAm[q >> 1]; sa_sint_t qdiff = SAINT_MIN;
+ if (plen == qlen) { fast_sint_t l = 0; do { if (T[p + l] != T[q + l]) { break; } } while (++l < plen); qdiff = (sa_sint_t)(l - plen) & SAINT_MIN; }
+ SAm[p >> 1] = name | (pdiff & qdiff); name += (qdiff < 0);
+
+ p = q; plen = qlen; pdiff = qdiff;
+ }
+
+ SAm[p >> 1] = name | pdiff; name++;
+ }
+
+ if (name <= m)
+ {
+ libsais_mark_distinct_lms_suffixes_32s_omp(SA, n, m, threads);
+ }
+
+ return name - 1;
+}
+
+/* Maps the recursive solution back: replaces each rank in SA[0..m) with the
+   original LMS suffix position stored in the translation table at
+   SA[n - m ..). 4x unrolled with gather prefetch. */
+static void libsais_reconstruct_lms_suffixes(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ const sa_sint_t * RESTRICT SAnm = &SA[n - m];
+
+ fast_sint_t i, j;
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4)
+ {
+ libsais_prefetchw(&SA[i + 2 * prefetch_distance]);
+
+ libsais_prefetch(&SAnm[SA[i + prefetch_distance + 0]]);
+ libsais_prefetch(&SAnm[SA[i + prefetch_distance + 1]]);
+ libsais_prefetch(&SAnm[SA[i + prefetch_distance + 2]]);
+ libsais_prefetch(&SAnm[SA[i + prefetch_distance + 3]]);
+
+ SA[i + 0] = SAnm[SA[i + 0]];
+ SA[i + 1] = SAnm[SA[i + 1]];
+ SA[i + 2] = SAnm[SA[i + 2]];
+ SA[i + 3] = SAnm[SA[i + 3]];
+ }
+
+ for (j += prefetch_distance + 3; i < j; i += 1)
+ {
+ SA[i] = SAnm[SA[i]];
+ }
+}
+
+/* Parallel wrapper over libsais_reconstruct_lms_suffixes; slices SA[0..m)
+   across threads on 16-aligned boundaries. */
+static void libsais_reconstruct_lms_suffixes_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t threads)
+{
+#if defined(_OPENMP)
+ #pragma omp parallel num_threads(threads) if(threads > 1 && m >= 65536)
+#endif
+ {
+#if defined(_OPENMP)
+ fast_sint_t omp_thread_num = omp_get_thread_num();
+ fast_sint_t omp_num_threads = omp_get_num_threads();
+ fast_sint_t omp_block_stride = (m / omp_num_threads) & (-16);
+ fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+ fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : m - omp_block_start;
+#else
+ UNUSED(threads);
+
+ fast_sint_t omp_block_start = 0;
+ fast_sint_t omp_block_size = m;
+#endif
+
+ libsais_reconstruct_lms_suffixes(SA, n, m, omp_block_start, omp_block_size);
+ }
+}
+
+/* Distributes the m sorted LMS suffixes (packed at the front of SA) into
+   their final bucket-end positions, walking symbols high-to-low. For each
+   symbol the LMS count is derived from consecutive bucket-interval entries;
+   gaps between placed runs are zeroed so induction finds empty slots. */
+static void libsais_place_lms_suffixes_interval_8u(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, const sa_sint_t * RESTRICT buckets)
+{
+ const sa_sint_t * RESTRICT bucket_end = &buckets[7 * ALPHABET_SIZE];
+
+ fast_sint_t c, j = n;
+ for (c = ALPHABET_SIZE - 2; c >= 0; --c)
+ {
+ fast_sint_t l = (fast_sint_t)buckets[BUCKETS_INDEX2(c, 1) + BUCKETS_INDEX2(1, 0)] - (fast_sint_t)buckets[BUCKETS_INDEX2(c, 1)];
+ if (l > 0)
+ {
+ fast_sint_t i = bucket_end[c];
+ if (j - i > 0)
+ {
+ memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t));
+ }
+
+ /* move the last l pending LMS entries to the end of bucket c */
+ memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t));
+ }
+ }
+
+ memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t));
+}
+
+/* 32-bit (4k) variant of interval-based LMS placement; identical scheme to
+   the 8u version with bucket ends at buckets[3k]. */
+static void libsais_place_lms_suffixes_interval_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m, const sa_sint_t * RESTRICT buckets)
+{
+ const sa_sint_t * RESTRICT bucket_end = &buckets[3 * k];
+
+ fast_sint_t c, j = n;
+ for (c = (fast_sint_t)k - 2; c >= 0; --c)
+ {
+ fast_sint_t l = (fast_sint_t)buckets[BUCKETS_INDEX2(c, 1) + BUCKETS_INDEX2(1, 0)] - (fast_sint_t)buckets[BUCKETS_INDEX2(c, 1)];
+ if (l > 0)
+ {
+ fast_sint_t i = bucket_end[c];
+ if (j - i > 0)
+ {
+ memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t));
+ }
+
+ memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t));
+ }
+ }
+
+ memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t));
+}
+
+/* 32-bit (2k) variant: bucket starts and LMS counts are interleaved in one
+   array (BUCKETS_INDEX2 addressing); guarded for k <= 1 where there is
+   nothing to place. */
+static void libsais_place_lms_suffixes_interval_32s_2k(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m, const sa_sint_t * RESTRICT buckets)
+{
+ fast_sint_t j = n;
+
+ if (k > 1)
+ {
+ fast_sint_t c;
+ for (c = BUCKETS_INDEX2((fast_sint_t)k - 2, 0); c >= BUCKETS_INDEX2(0, 0); c -= BUCKETS_INDEX2(1, 0))
+ {
+ fast_sint_t l = (fast_sint_t)buckets[c + BUCKETS_INDEX2(1, 1)] - (fast_sint_t)buckets[c + BUCKETS_INDEX2(0, 1)];
+ if (l > 0)
+ {
+ fast_sint_t i = buckets[c];
+ if (j - i > 0)
+ {
+ memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t));
+ }
+
+ memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t));
+ }
+ }
+ }
+
+ memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t));
+}
+
+/* 1k variant: walks the m LMS suffixes right-to-left, looking up each
+   suffix's leading symbol in T to find its bucket. On a symbol change the
+   unreached tail of the previous bucket is zeroed before switching to the
+   new bucket's end. 4x unrolled with prefetch of SA and T. */
+static void libsais_place_lms_suffixes_interval_32s_1k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t m, sa_sint_t * RESTRICT buckets)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ sa_sint_t c = k - 1; fast_sint_t i, l = buckets[c];
+ for (i = (fast_sint_t)m - 1; i >= prefetch_distance + 3; i -= 4)
+ {
+ libsais_prefetch(&SA[i - 2 * prefetch_distance]);
+
+ libsais_prefetch(&T[SA[i - prefetch_distance - 0]]);
+ libsais_prefetch(&T[SA[i - prefetch_distance - 1]]);
+ libsais_prefetch(&T[SA[i - prefetch_distance - 2]]);
+ libsais_prefetch(&T[SA[i - prefetch_distance - 3]]);
+
+ sa_sint_t p0 = SA[i - 0]; if (T[p0] != c) { c = T[p0]; memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(sa_sint_t)); l = buckets[c]; } SA[--l] = p0;
+ sa_sint_t p1 = SA[i - 1]; if (T[p1] != c) { c = T[p1]; memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(sa_sint_t)); l = buckets[c]; } SA[--l] = p1;
+ sa_sint_t p2 = SA[i - 2]; if (T[p2] != c) { c = T[p2]; memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(sa_sint_t)); l = buckets[c]; } SA[--l] = p2;
+ sa_sint_t p3 = SA[i - 3]; if (T[p3] != c) { c = T[p3]; memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(sa_sint_t)); l = buckets[c]; } SA[--l] = p3;
+ }
+
+ for (; i >= 0; i -= 1)
+ {
+ sa_sint_t p = SA[i]; if (T[p] != c) { c = T[p]; memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(sa_sint_t)); l = buckets[c]; } SA[--l] = p;
+ }
+
+ /* clear everything below the last placed entry */
+ memset(&SA[0], 0, (size_t)l * sizeof(sa_sint_t));
+}
+
+/* Histogram-based LMS placement (6k layout): the per-symbol LMS counts are
+   read directly from buckets[BUCKETS_INDEX4(c, 1)] instead of being derived
+   from interval differences; otherwise identical to the interval variants. */
+static void libsais_place_lms_suffixes_histogram_32s_6k(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m, const sa_sint_t * RESTRICT buckets)
+{
+ const sa_sint_t * RESTRICT bucket_end = &buckets[5 * k];
+
+ fast_sint_t c, j = n;
+ for (c = (fast_sint_t)k - 2; c >= 0; --c)
+ {
+ fast_sint_t l = (fast_sint_t)buckets[BUCKETS_INDEX4(c, 1)];
+ if (l > 0)
+ {
+ fast_sint_t i = bucket_end[c];
+ if (j - i > 0)
+ {
+ memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t));
+ }
+
+ memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t));
+ }
+ }
+
+ memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t));
+}
+
+/* Histogram-based LMS placement, 4k layout (counts at BUCKETS_INDEX2(c, 1),
+   bucket ends at buckets[3k]). */
+static void libsais_place_lms_suffixes_histogram_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m, const sa_sint_t * RESTRICT buckets)
+{
+ const sa_sint_t * RESTRICT bucket_end = &buckets[3 * k];
+
+ fast_sint_t c, j = n;
+ for (c = (fast_sint_t)k - 2; c >= 0; --c)
+ {
+ fast_sint_t l = (fast_sint_t)buckets[BUCKETS_INDEX2(c, 1)];
+ if (l > 0)
+ {
+ fast_sint_t i = bucket_end[c];
+ if (j - i > 0)
+ {
+ memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t));
+ }
+
+ memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t));
+ }
+ }
+
+ memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t));
+}
+
+/* Histogram-based LMS placement, 2k layout: starts and counts interleaved
+   in one array; guarded for k <= 1. */
+static void libsais_place_lms_suffixes_histogram_32s_2k(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m, const sa_sint_t * RESTRICT buckets)
+{
+ fast_sint_t j = n;
+
+ if (k > 1)
+ {
+ fast_sint_t c;
+ for (c = BUCKETS_INDEX2((fast_sint_t)k - 2, 0); c >= BUCKETS_INDEX2(0, 0); c -= BUCKETS_INDEX2(1, 0))
+ {
+ fast_sint_t l = (fast_sint_t)buckets[c + BUCKETS_INDEX2(0, 1)];
+ if (l > 0)
+ {
+ fast_sint_t i = buckets[c];
+ if (j - i > 0)
+ {
+ memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t));
+ }
+
+ memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t));
+ }
+ }
+ }
+
+ memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t));
+}
+
+/* Final left-to-right induction pass producing BWT symbols in place: for
+   each positive entry p, stores T[p-1] (sign-marked) back into SA[i] and
+   appends p-1 into its symbol bucket, sign-marked if the preceding symbol
+   is smaller (i.e. p-1 is L-type boundary). Conditional-move style prefetch
+   of T avoids faulting on non-positive entries. */
+static void libsais_final_bwt_scan_left_to_right_8u(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ fast_sint_t i, j;
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2)
+ {
+ libsais_prefetchw(&SA[i + 2 * prefetch_distance]);
+
+ sa_sint_t s0 = SA[i + prefetch_distance + 0]; const uint8_t * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL);
+ sa_sint_t s1 = SA[i + prefetch_distance + 1]; const uint8_t * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? Ts1 : NULL);
+
+ sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; SA[i + 0] = T[p0] | SAINT_MIN; SA[induction_bucket[T[p0]]++] = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1)); }
+ sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; SA[i + 1] = T[p1] | SAINT_MIN; SA[induction_bucket[T[p1]]++] = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1)); }
+ }
+
+ for (j += prefetch_distance + 1; i < j; i += 1)
+ {
+ sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; SA[i] = T[p] | SAINT_MIN; SA[induction_bucket[T[p]]++] = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); }
+ }
+}
+
+/* Same as libsais_final_bwt_scan_left_to_right_8u, plus auxiliary index
+   support: whenever an induced position p is a multiple of the sampling
+   rate (rm is rate-1, a power-of-two mask), the current bucket position is
+   recorded into I[p / (rm + 1)]. */
+static void libsais_final_bwt_aux_scan_left_to_right_8u(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ fast_sint_t i, j;
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2)
+ {
+ libsais_prefetchw(&SA[i + 2 * prefetch_distance]);
+
+ sa_sint_t s0 = SA[i + prefetch_distance + 0]; const uint8_t * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL);
+ sa_sint_t s1 = SA[i + prefetch_distance + 1]; const uint8_t * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? Ts1 : NULL);
+
+ sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; SA[i + 0] = T[p0] | SAINT_MIN; SA[induction_bucket[T[p0]]++] = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1)); if ((p0 & rm) == 0) { I[p0 / (rm + 1)] = induction_bucket[T[p0]]; }}
+ sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; SA[i + 1] = T[p1] | SAINT_MIN; SA[induction_bucket[T[p1]]++] = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1)); if ((p1 & rm) == 0) { I[p1 / (rm + 1)] = induction_bucket[T[p1]]; }}
+ }
+
+ for (j += prefetch_distance + 1; i < j; i += 1)
+ {
+ sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; SA[i] = T[p] | SAINT_MIN; SA[induction_bucket[T[p]]++] = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); if ((p & rm) == 0) { I[p / (rm + 1)] = induction_bucket[T[p]]; } }
+ }
+}
+
+/* Final left-to-right induction for plain suffix-array output (no BWT
+   symbol stored): flips each entry's sign bit in place and, for positive
+   entries, appends p-1 to its symbol bucket with the L/S boundary mark in
+   the sign bit. */
+static void libsais_final_sorting_scan_left_to_right_8u(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ fast_sint_t i, j;
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2)
+ {
+ libsais_prefetchw(&SA[i + 2 * prefetch_distance]);
+
+ sa_sint_t s0 = SA[i + prefetch_distance + 0]; const uint8_t * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL);
+ sa_sint_t s1 = SA[i + prefetch_distance + 1]; const uint8_t * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? Ts1 : NULL);
+
+ sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 ^ SAINT_MIN; if (p0 > 0) { p0--; SA[induction_bucket[T[p0]]++] = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1)); }
+ sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 ^ SAINT_MIN; if (p1 > 0) { p1--; SA[induction_bucket[T[p1]]++] = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1)); }
+ }
+
+ for (j += prefetch_distance + 1; i < j; i += 1)
+ {
+ sa_sint_t p = SA[i]; SA[i] = p ^ SAINT_MIN; if (p > 0) { p--; SA[induction_bucket[T[p]]++] = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); }
+ }
+}
+
+/* 32-bit-alphabet version of the final left-to-right sorting scan. Uses a
+   two-stage prefetch pipeline (distance 2d for T entries, distance d for
+   the bucket cells those entries will touch) since bucket accesses are
+   data-dependent gathers. */
+static void libsais_final_sorting_scan_left_to_right_32s(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ fast_sint_t i, j;
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1; i < j; i += 2)
+ {
+ libsais_prefetchw(&SA[i + 3 * prefetch_distance]);
+
+ sa_sint_t s0 = SA[i + 2 * prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL);
+ sa_sint_t s1 = SA[i + 2 * prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL);
+ sa_sint_t s2 = SA[i + 1 * prefetch_distance + 0]; if (s2 > 0) { libsais_prefetchw(&induction_bucket[T[s2 - 1]]); libsais_prefetch(&T[s2] - 2); }
+ sa_sint_t s3 = SA[i + 1 * prefetch_distance + 1]; if (s3 > 0) { libsais_prefetchw(&induction_bucket[T[s3 - 1]]); libsais_prefetch(&T[s3] - 2); }
+
+ sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 ^ SAINT_MIN; if (p0 > 0) { p0--; SA[induction_bucket[T[p0]]++] = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1)); }
+ sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 ^ SAINT_MIN; if (p1 > 0) { p1--; SA[induction_bucket[T[p1]]++] = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1)); }
+ }
+
+ for (j += 2 * prefetch_distance + 1; i < j; i += 1)
+ {
+ sa_sint_t p = SA[i]; SA[i] = p ^ SAINT_MIN; if (p > 0) { p--; SA[induction_bucket[T[p]]++] = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); }
+ }
+}
+
+#if defined(_OPENMP)
+
+/* OpenMP block-pipeline "prepare" stage for the BWT scan: instead of
+   writing induced entries straight into their buckets, it records each
+   (symbol, index) pair into the per-thread cache and tallies per-symbol
+   counts into `buckets` so a later "place" stage can commit them after
+   bucket offsets are known. Returns the number of cached entries. */
+static fast_sint_t libsais_final_bwt_scan_left_to_right_8u_block_prepare(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ memset(buckets, 0, ALPHABET_SIZE * sizeof(sa_sint_t));
+
+ fast_sint_t i, j, count = 0;
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2)
+ {
+ libsais_prefetchw(&SA[i + 2 * prefetch_distance]);
+
+ sa_sint_t s0 = SA[i + prefetch_distance + 0]; const uint8_t * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL);
+ sa_sint_t s1 = SA[i + prefetch_distance + 1]; const uint8_t * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? Ts1 : NULL);
+
+ sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; SA[i + 0] = T[p0] | SAINT_MIN; buckets[cache[count].symbol = T[p0]]++; cache[count++].index = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1)); }
+ sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; SA[i + 1] = T[p1] | SAINT_MIN; buckets[cache[count].symbol = T[p1]]++; cache[count++].index = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1)); }
+ }
+
+ for (j += prefetch_distance + 1; i < j; i += 1)
+ {
+ sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; SA[i] = T[p] | SAINT_MIN; buckets[cache[count].symbol = T[p]]++; cache[count++].index = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); }
+ }
+
+ return count;
+}
+
+/* Per-thread "prepare" pass of the parallel left-to-right final sorting scan
+   (suffix-array output, no BWT symbol stored). Differs from the BWT variant:
+   SA entries are toggled with XOR SAINT_MIN instead of being overwritten with the
+   BWT symbol. Per-symbol counts go to the thread-local 'buckets'; the induced
+   (symbol, packed index) pairs go to 'cache'. Returns the number of cache entries. */
+static fast_sint_t libsais_final_sorting_scan_left_to_right_8u_block_prepare(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ memset(buckets, 0, ALPHABET_SIZE * sizeof(sa_sint_t));
+
+ fast_sint_t i, j, count = 0;
+ /* Main loop: two SA entries per iteration, with software prefetch of SA and T ahead of use. */
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2)
+ {
+ libsais_prefetchw(&SA[i + 2 * prefetch_distance]);
+
+ sa_sint_t s0 = SA[i + prefetch_distance + 0]; const uint8_t * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL);
+ sa_sint_t s1 = SA[i + prefetch_distance + 1]; const uint8_t * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? Ts1 : NULL);
+
+ sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 ^ SAINT_MIN; if (p0 > 0) { p0--; buckets[cache[count].symbol = T[p0]]++; cache[count++].index = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1)); }
+ sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 ^ SAINT_MIN; if (p1 > 0) { p1--; buckets[cache[count].symbol = T[p1]]++; cache[count++].index = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1)); }
+ }
+
+ /* Tail loop for the remaining entries. */
+ for (j += prefetch_distance + 1; i < j; i += 1)
+ {
+ sa_sint_t p = SA[i]; SA[i] = p ^ SAINT_MIN; if (p > 0) { p--; buckets[cache[count].symbol = T[p]]++; cache[count++].index = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); }
+ }
+
+ return count;
+}
+
+/* "Place" pass paired with the left-to-right prepare passes above: scatters the
+   'count' cached (symbol, index) entries into SA at ascending per-symbol offsets
+   taken from 'buckets' (which the caller has pre-adjusted to this thread's base
+   positions). Unrolled four entries per iteration with prefetch of upcoming cache. */
+static void libsais_final_order_scan_left_to_right_8u_block_place(sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ fast_sint_t i, j;
+ for (i = 0, j = count - 3; i < j; i += 4)
+ {
+ libsais_prefetch(&cache[i + prefetch_distance]);
+
+ SA[buckets[cache[i + 0].symbol]++] = cache[i + 0].index;
+ SA[buckets[cache[i + 1].symbol]++] = cache[i + 1].index;
+ SA[buckets[cache[i + 2].symbol]++] = cache[i + 2].index;
+ SA[buckets[cache[i + 3].symbol]++] = cache[i + 3].index;
+ }
+
+ /* Tail loop: up to three remaining entries. */
+ for (j += 3; i < j; i += 1)
+ {
+ SA[buckets[cache[i].symbol]++] = cache[i].index;
+ }
+}
+
+/* "Place" pass for the BWT-with-auxiliary-indexes variant: same scatter as
+   libsais_final_order_scan_left_to_right_8u_block_place, but whenever a placed
+   index is a multiple of (rm + 1) (tested via the mask 'rm'), the post-increment
+   bucket position is also recorded in the auxiliary index array I. */
+static void libsais_final_bwt_aux_scan_left_to_right_8u_block_place(sa_sint_t * RESTRICT SA, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ fast_sint_t i, j;
+ for (i = 0, j = count - 3; i < j; i += 4)
+ {
+ libsais_prefetch(&cache[i + prefetch_distance]);
+
+ SA[buckets[cache[i + 0].symbol]++] = cache[i + 0].index; if ((cache[i + 0].index & rm) == 0) { I[(cache[i + 0].index & SAINT_MAX) / (rm + 1)] = buckets[cache[i + 0].symbol]; }
+ SA[buckets[cache[i + 1].symbol]++] = cache[i + 1].index; if ((cache[i + 1].index & rm) == 0) { I[(cache[i + 1].index & SAINT_MAX) / (rm + 1)] = buckets[cache[i + 1].symbol]; }
+ SA[buckets[cache[i + 2].symbol]++] = cache[i + 2].index; if ((cache[i + 2].index & rm) == 0) { I[(cache[i + 2].index & SAINT_MAX) / (rm + 1)] = buckets[cache[i + 2].symbol]; }
+ SA[buckets[cache[i + 3].symbol]++] = cache[i + 3].index; if ((cache[i + 3].index & rm) == 0) { I[(cache[i + 3].index & SAINT_MAX) / (rm + 1)] = buckets[cache[i + 3].symbol]; }
+ }
+
+ /* Tail loop: up to three remaining entries. */
+ for (j += 3; i < j; i += 1)
+ {
+ SA[buckets[cache[i].symbol]++] = cache[i].index; if ((cache[i].index & rm) == 0) { I[(cache[i].index & SAINT_MAX) / (rm + 1)] = buckets[cache[i].symbol]; }
+ }
+}
+
+/* "Gather" pass of the parallel left-to-right 32-bit final sorting scan: for each
+   SA entry in the block, toggles its sign bit in place and writes the induced
+   (symbol, packed index) pair into the cache slot at the same position i.
+   Entries with p <= 0 get symbol = SAINT_MIN so the subsequent sort pass skips them. */
+static void libsais_final_sorting_scan_left_to_right_32s_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ fast_sint_t i, j;
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2)
+ {
+ libsais_prefetchw(&SA[i + 2 * prefetch_distance]);
+
+ sa_sint_t s0 = SA[i + prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL);
+ sa_sint_t s1 = SA[i + prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? Ts1 : NULL);
+
+ libsais_prefetchw(&cache[i + prefetch_distance]);
+
+ sa_sint_t symbol0 = SAINT_MIN, p0 = SA[i + 0]; SA[i + 0] = p0 ^ SAINT_MIN; if (p0 > 0) { p0--; cache[i + 0].index = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1)); symbol0 = T[p0]; } cache[i + 0].symbol = symbol0;
+ sa_sint_t symbol1 = SAINT_MIN, p1 = SA[i + 1]; SA[i + 1] = p1 ^ SAINT_MIN; if (p1 > 0) { p1--; cache[i + 1].index = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1)); symbol1 = T[p1]; } cache[i + 1].symbol = symbol1;
+ }
+
+ /* Tail loop for the remaining entries. */
+ for (j += prefetch_distance + 1; i < j; i += 1)
+ {
+ sa_sint_t symbol = SAINT_MIN, p = SA[i]; SA[i] = p ^ SAINT_MIN; if (p > 0) { p--; cache[i].index = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); symbol = T[p]; } cache[i].symbol = symbol;
+ }
+}
+
+/* Single-threaded "sort" pass over the gathered cache (run under omp master):
+   for each cache entry with a valid symbol (>= 0), replaces the symbol with the
+   destination slot induction_bucket[symbol]++. If that destination falls inside
+   the current block (< omp_block_end), the newly induced suffix is expanded
+   immediately into its cache slot, chaining the induction within the block. */
+static void libsais_final_sorting_scan_left_to_right_32s_block_sort(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ fast_sint_t i, j, omp_block_end = omp_block_start + omp_block_size;
+ for (i = omp_block_start, j = omp_block_end - prefetch_distance - 1; i < j; i += 2)
+ {
+ libsais_prefetchw(&cache[i + 2 * prefetch_distance]);
+
+ sa_sint_t s0 = cache[i + prefetch_distance + 0].symbol; const sa_sint_t * Is0 = &induction_bucket[s0]; libsais_prefetchw(s0 >= 0 ? Is0 : NULL);
+ sa_sint_t s1 = cache[i + prefetch_distance + 1].symbol; const sa_sint_t * Is1 = &induction_bucket[s1]; libsais_prefetchw(s1 >= 0 ? Is1 : NULL);
+
+ sa_sint_t v0 = cache[i + 0].symbol;
+ if (v0 >= 0)
+ {
+ cache[i + 0].symbol = induction_bucket[v0]++;
+ if (cache[i + 0].symbol < omp_block_end) { sa_sint_t ni = cache[i + 0].symbol, np = cache[i + 0].index; cache[i + 0].index = np ^ SAINT_MIN; if (np > 0) { np--; cache[ni].index = np | ((sa_sint_t)(T[np - (np > 0)] < T[np]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np]; } }
+ }
+
+ sa_sint_t v1 = cache[i + 1].symbol;
+ if (v1 >= 0)
+ {
+ cache[i + 1].symbol = induction_bucket[v1]++;
+ if (cache[i + 1].symbol < omp_block_end) { sa_sint_t ni = cache[i + 1].symbol, np = cache[i + 1].index; cache[i + 1].index = np ^ SAINT_MIN; if (np > 0) { np--; cache[ni].index = np | ((sa_sint_t)(T[np - (np > 0)] < T[np]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np]; } }
+ }
+ }
+
+ /* Tail loop for the remaining entries. */
+ for (j += prefetch_distance + 1; i < j; i += 1)
+ {
+ sa_sint_t v = cache[i].symbol;
+ if (v >= 0)
+ {
+ cache[i].symbol = induction_bucket[v]++;
+ if (cache[i].symbol < omp_block_end) { sa_sint_t ni = cache[i].symbol, np = cache[i].index; cache[i].index = np ^ SAINT_MIN; if (np > 0) { np--; cache[ni].index = np | ((sa_sint_t)(T[np - (np > 0)] < T[np]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np]; } }
+ }
+ }
+}
+
+/* Parallel driver for one block of the left-to-right BWT induction scan.
+   Splits [block_start, block_start + block_size) across OpenMP threads in
+   16-aligned stripes. With one thread it falls back to the sequential scan;
+   otherwise each thread runs the prepare pass into its thread-local state,
+   the master merges the per-thread symbol counts into 'induction_bucket'
+   (prefix-summing each thread's base offsets), and after a barrier each
+   thread places its cached entries into SA. */
+static void libsais_final_bwt_scan_left_to_right_8u_block_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+#if defined(_OPENMP)
+ #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * ALPHABET_SIZE && omp_get_dynamic() == 0)
+#endif
+ {
+#if defined(_OPENMP)
+ fast_sint_t omp_thread_num = omp_get_thread_num();
+ fast_sint_t omp_num_threads = omp_get_num_threads();
+#else
+ UNUSED(threads); UNUSED(thread_state);
+
+ fast_sint_t omp_thread_num = 0;
+ fast_sint_t omp_num_threads = 1;
+#endif
+ /* Stripe size rounded down to a multiple of 16; the last thread takes the remainder. */
+ fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
+ fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+ fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
+
+ omp_block_start += block_start;
+
+ if (omp_num_threads == 1)
+ {
+ libsais_final_bwt_scan_left_to_right_8u(T, SA, induction_bucket, omp_block_start, omp_block_size);
+ }
+#if defined(_OPENMP)
+ else
+ {
+ {
+ thread_state[omp_thread_num].state.count = libsais_final_bwt_scan_left_to_right_8u_block_prepare(T, SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size);
+ }
+
+ #pragma omp barrier
+
+ /* Master converts per-thread counts into per-thread base offsets within induction_bucket. */
+ #pragma omp master
+ {
+ fast_sint_t t;
+ for (t = 0; t < omp_num_threads; ++t)
+ {
+ sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets;
+ fast_sint_t c; for (c = 0; c < ALPHABET_SIZE; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_bucket[c]; induction_bucket[c] = A + B; temp_bucket[c] = A; }
+ }
+ }
+
+ #pragma omp barrier
+
+ {
+ libsais_final_order_scan_left_to_right_8u_block_place(SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count);
+ }
+ }
+#endif
+ }
+}
+
+/* Parallel driver for one block of the left-to-right BWT-with-auxiliary-indexes
+   scan. Same prepare / merge-counts / place scheme as
+   libsais_final_bwt_scan_left_to_right_8u_block_omp (it reuses the same prepare
+   pass), except the place pass additionally records auxiliary index entries
+   into I for positions that are multiples of (rm + 1). */
+static void libsais_final_bwt_aux_scan_left_to_right_8u_block_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+#if defined(_OPENMP)
+ #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * ALPHABET_SIZE && omp_get_dynamic() == 0)
+#endif
+ {
+#if defined(_OPENMP)
+ fast_sint_t omp_thread_num = omp_get_thread_num();
+ fast_sint_t omp_num_threads = omp_get_num_threads();
+#else
+ UNUSED(threads); UNUSED(thread_state);
+
+ fast_sint_t omp_thread_num = 0;
+ fast_sint_t omp_num_threads = 1;
+#endif
+ /* Stripe size rounded down to a multiple of 16; the last thread takes the remainder. */
+ fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
+ fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+ fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
+
+ omp_block_start += block_start;
+
+ if (omp_num_threads == 1)
+ {
+ libsais_final_bwt_aux_scan_left_to_right_8u(T, SA, rm, I, induction_bucket, omp_block_start, omp_block_size);
+ }
+#if defined(_OPENMP)
+ else
+ {
+ {
+ thread_state[omp_thread_num].state.count = libsais_final_bwt_scan_left_to_right_8u_block_prepare(T, SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size);
+ }
+
+ #pragma omp barrier
+
+ /* Master converts per-thread counts into per-thread base offsets within induction_bucket. */
+ #pragma omp master
+ {
+ fast_sint_t t;
+ for (t = 0; t < omp_num_threads; ++t)
+ {
+ sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets;
+ fast_sint_t c; for (c = 0; c < ALPHABET_SIZE; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_bucket[c]; induction_bucket[c] = A + B; temp_bucket[c] = A; }
+ }
+ }
+
+ #pragma omp barrier
+
+ {
+ libsais_final_bwt_aux_scan_left_to_right_8u_block_place(SA, rm, I, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count);
+ }
+ }
+#endif
+ }
+}
+
+/* Parallel driver for one block of the left-to-right final sorting scan (plain
+   suffix-array output). Same prepare / merge-counts / place scheme as the BWT
+   driver above, but uses the sorting-specific prepare pass and the shared
+   order place pass. */
+static void libsais_final_sorting_scan_left_to_right_8u_block_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+#if defined(_OPENMP)
+ #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * ALPHABET_SIZE && omp_get_dynamic() == 0)
+#endif
+ {
+#if defined(_OPENMP)
+ fast_sint_t omp_thread_num = omp_get_thread_num();
+ fast_sint_t omp_num_threads = omp_get_num_threads();
+#else
+ UNUSED(threads); UNUSED(thread_state);
+
+ fast_sint_t omp_thread_num = 0;
+ fast_sint_t omp_num_threads = 1;
+#endif
+ /* Stripe size rounded down to a multiple of 16; the last thread takes the remainder. */
+ fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
+ fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+ fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
+
+ omp_block_start += block_start;
+
+ if (omp_num_threads == 1)
+ {
+ libsais_final_sorting_scan_left_to_right_8u(T, SA, induction_bucket, omp_block_start, omp_block_size);
+ }
+#if defined(_OPENMP)
+ else
+ {
+ {
+ thread_state[omp_thread_num].state.count = libsais_final_sorting_scan_left_to_right_8u_block_prepare(T, SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size);
+ }
+
+ #pragma omp barrier
+
+ /* Master converts per-thread counts into per-thread base offsets within induction_bucket. */
+ #pragma omp master
+ {
+ fast_sint_t t;
+ for (t = 0; t < omp_num_threads; ++t)
+ {
+ sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets;
+ fast_sint_t c; for (c = 0; c < ALPHABET_SIZE; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_bucket[c]; induction_bucket[c] = A + B; temp_bucket[c] = A; }
+ }
+ }
+
+ #pragma omp barrier
+
+ {
+ libsais_final_order_scan_left_to_right_8u_block_place(SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count);
+ }
+ }
+#endif
+ }
+}
+
+/* Parallel driver for one block of the left-to-right 32-bit final sorting scan.
+   Pattern: all threads gather their stripe into the shared cache (offset by
+   block_start so cache indexing matches SA indexing), the master alone runs the
+   sequential induction over the whole cached block, then all threads compact and
+   place their stripe's results back into SA. */
+static void libsais_final_sorting_scan_left_to_right_32s_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads)
+{
+#if defined(_OPENMP)
+ #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384)
+#endif
+ {
+#if defined(_OPENMP)
+ fast_sint_t omp_thread_num = omp_get_thread_num();
+ fast_sint_t omp_num_threads = omp_get_num_threads();
+#else
+ UNUSED(threads); UNUSED(cache);
+
+ fast_sint_t omp_thread_num = 0;
+ fast_sint_t omp_num_threads = 1;
+#endif
+ /* Stripe size rounded down to a multiple of 16; the last thread takes the remainder. */
+ fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
+ fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+ fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
+
+ omp_block_start += block_start;
+
+ if (omp_num_threads == 1)
+ {
+ libsais_final_sorting_scan_left_to_right_32s(T, SA, buckets, omp_block_start, omp_block_size);
+ }
+#if defined(_OPENMP)
+ else
+ {
+ {
+ libsais_final_sorting_scan_left_to_right_32s_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size);
+ }
+
+ #pragma omp barrier
+
+ #pragma omp master
+ {
+ libsais_final_sorting_scan_left_to_right_32s_block_sort(T, buckets, cache - block_start, block_start, block_size);
+ }
+
+ #pragma omp barrier
+
+ {
+ libsais_compact_and_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size);
+ }
+ }
+#endif
+ }
+}
+
+#endif
+
+/* Top-level left-to-right BWT induction scan over the whole SA.
+   First seeds the scan by placing the suffix at position n - 1 (with its
+   suffix-type flag in the sign bit). Sequential path for one thread or small n;
+   otherwise SA is processed in chunks: zero entries are stepped over one at a
+   time, runs of non-zero entries are chunked up to a per-thread cache budget,
+   tiny runs (< 32) are handled inline, and larger runs go to the parallel
+   block driver. */
+static void libsais_final_bwt_scan_left_to_right_8u_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, fast_sint_t n, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+ SA[induction_bucket[T[(sa_sint_t)n - 1]]++] = ((sa_sint_t)n - 1) | ((sa_sint_t)(T[(sa_sint_t)n - 2] < T[(sa_sint_t)n - 1]) << (SAINT_BIT - 1));
+
+ if (threads == 1 || n < 65536)
+ {
+ libsais_final_bwt_scan_left_to_right_8u(T, SA, induction_bucket, 0, n);
+ }
+#if defined(_OPENMP)
+ else
+ {
+ fast_sint_t block_start;
+ for (block_start = 0; block_start < n; )
+ {
+ if (SA[block_start] == 0)
+ {
+ block_start++;
+ }
+ else
+ {
+ /* Block ends at the next zero entry or when the threads' cache budget is reached. */
+ fast_sint_t block_max_end = block_start + ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads); if (block_max_end > n) { block_max_end = n;}
+ fast_sint_t block_end = block_start + 1; while (block_end < block_max_end && SA[block_end] != 0) { block_end++; }
+ fast_sint_t block_size = block_end - block_start;
+
+ if (block_size < 32)
+ {
+ /* Too small to parallelize: induce inline, storing the BWT symbol back into SA. */
+ for (; block_start < block_end; block_start += 1)
+ {
+ sa_sint_t p = SA[block_start]; SA[block_start] = p & SAINT_MAX; if (p > 0) { p--; SA[block_start] = T[p] | SAINT_MIN; SA[induction_bucket[T[p]]++] = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); }
+ }
+ }
+ else
+ {
+ libsais_final_bwt_scan_left_to_right_8u_block_omp(T, SA, induction_bucket, block_start, block_size, threads, thread_state);
+ block_start = block_end;
+ }
+ }
+ }
+ }
+#else
+ UNUSED(thread_state);
+#endif
+}
+
+/* Top-level left-to-right BWT induction scan that also maintains the auxiliary
+   index array I: whenever a placed text position is a multiple of (rm + 1), the
+   corresponding post-increment bucket position is stored into I. Otherwise the
+   chunking/dispatch structure is identical to
+   libsais_final_bwt_scan_left_to_right_8u_omp. */
+static void libsais_final_bwt_aux_scan_left_to_right_8u_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, fast_sint_t n, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+ SA[induction_bucket[T[(sa_sint_t)n - 1]]++] = ((sa_sint_t)n - 1) | ((sa_sint_t)(T[(sa_sint_t)n - 2] < T[(sa_sint_t)n - 1]) << (SAINT_BIT - 1));
+
+ if ((((sa_sint_t)n - 1) & rm) == 0) { I[((sa_sint_t)n - 1) / (rm + 1)] = induction_bucket[T[(sa_sint_t)n - 1]]; }
+
+ if (threads == 1 || n < 65536)
+ {
+ libsais_final_bwt_aux_scan_left_to_right_8u(T, SA, rm, I, induction_bucket, 0, n);
+ }
+#if defined(_OPENMP)
+ else
+ {
+ fast_sint_t block_start;
+ for (block_start = 0; block_start < n; )
+ {
+ if (SA[block_start] == 0)
+ {
+ block_start++;
+ }
+ else
+ {
+ /* Block ends at the next zero entry or when the threads' cache budget is reached. */
+ fast_sint_t block_max_end = block_start + ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads); if (block_max_end > n) { block_max_end = n;}
+ fast_sint_t block_end = block_start + 1; while (block_end < block_max_end && SA[block_end] != 0) { block_end++; }
+ fast_sint_t block_size = block_end - block_start;
+
+ if (block_size < 32)
+ {
+ /* Too small to parallelize: induce inline, updating I for sampled positions. */
+ for (; block_start < block_end; block_start += 1)
+ {
+ sa_sint_t p = SA[block_start]; SA[block_start] = p & SAINT_MAX; if (p > 0) { p--; SA[block_start] = T[p] | SAINT_MIN; SA[induction_bucket[T[p]]++] = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); if ((p & rm) == 0) { I[p / (rm + 1)] = induction_bucket[T[p]]; } }
+ }
+ }
+ else
+ {
+ libsais_final_bwt_aux_scan_left_to_right_8u_block_omp(T, SA, rm, I, induction_bucket, block_start, block_size, threads, thread_state);
+ block_start = block_end;
+ }
+ }
+ }
+ }
+#else
+ UNUSED(thread_state);
+#endif
+}
+
+/* Top-level left-to-right final sorting scan (plain suffix-array output, no BWT
+   symbols written into SA). Seeds with the suffix at n - 1, then uses the same
+   zero-delimited chunking as the BWT variant; entries are toggled with XOR
+   SAINT_MIN rather than replaced by BWT symbols. */
+static void libsais_final_sorting_scan_left_to_right_8u_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, fast_sint_t n, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+ SA[induction_bucket[T[(sa_sint_t)n - 1]]++] = ((sa_sint_t)n - 1) | ((sa_sint_t)(T[(sa_sint_t)n - 2] < T[(sa_sint_t)n - 1]) << (SAINT_BIT - 1));
+
+ if (threads == 1 || n < 65536)
+ {
+ libsais_final_sorting_scan_left_to_right_8u(T, SA, induction_bucket, 0, n);
+ }
+#if defined(_OPENMP)
+ else
+ {
+ fast_sint_t block_start;
+ for (block_start = 0; block_start < n; )
+ {
+ if (SA[block_start] == 0)
+ {
+ block_start++;
+ }
+ else
+ {
+ /* Block ends at the next zero entry or when the threads' cache budget is reached. */
+ fast_sint_t block_max_end = block_start + ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads); if (block_max_end > n) { block_max_end = n;}
+ fast_sint_t block_end = block_start + 1; while (block_end < block_max_end && SA[block_end] != 0) { block_end++; }
+ fast_sint_t block_size = block_end - block_start;
+
+ if (block_size < 32)
+ {
+ /* Too small to parallelize: induce inline. */
+ for (; block_start < block_end; block_start += 1)
+ {
+ sa_sint_t p = SA[block_start]; SA[block_start] = p ^ SAINT_MIN; if (p > 0) { p--; SA[induction_bucket[T[p]]++] = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); }
+ }
+ }
+ else
+ {
+ libsais_final_sorting_scan_left_to_right_8u_block_omp(T, SA, induction_bucket, block_start, block_size, threads, thread_state);
+ block_start = block_end;
+ }
+ }
+ }
+ }
+#else
+ UNUSED(thread_state);
+#endif
+}
+
+/* Top-level left-to-right final sorting scan for 32-bit alphabets. Seeds with
+   the suffix at n - 1, then either runs sequentially (single thread or small n)
+   or walks SA in fixed-size chunks bounded by the threads' shared cache budget,
+   dispatching each chunk to the gather/sort/place block driver. */
+static void libsais_final_sorting_scan_left_to_right_32s_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+ SA[induction_bucket[T[n - 1]]++] = (n - 1) | ((sa_sint_t)(T[n - 2] < T[n - 1]) << (SAINT_BIT - 1));
+
+ if (threads == 1 || n < 65536)
+ {
+ libsais_final_sorting_scan_left_to_right_32s(T, SA, induction_bucket, 0, n);
+ }
+#if defined(_OPENMP)
+ else
+ {
+ fast_sint_t block_start, block_end;
+ for (block_start = 0; block_start < n; block_start = block_end)
+ {
+ block_end = block_start + (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end > n) { block_end = n; }
+
+ libsais_final_sorting_scan_left_to_right_32s_block_omp(T, SA, induction_bucket, thread_state[0].state.cache, block_start, block_end - block_start, threads);
+ }
+ }
+#else
+ UNUSED(thread_state);
+#endif
+}
+
+/* Sequential right-to-left BWT induction scan over one SA range. For each
+   positive entry p: clears the sign bit in place, writes the BWT symbol
+   c1 = T[p-1] into SA[i], and induces into the head of c1's bucket
+   (pre-decrement), storing either p - 1 or (c0 | SAINT_MIN) depending on
+   whether c0 = T[p-2] <= c1. Tracks and returns the SA position holding 0
+   (the sentinel/primary index), or -1 if it is not in this range. */
+static sa_sint_t libsais_final_bwt_scan_right_to_left_8u(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ fast_sint_t i, j; sa_sint_t index = -1;
+ /* Main loop: two entries per iteration, scanning downward with prefetch. */
+ for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2)
+ {
+ libsais_prefetchw(&SA[i - 2 * prefetch_distance]);
+
+ sa_sint_t s0 = SA[i - prefetch_distance - 0]; const uint8_t * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL);
+ sa_sint_t s1 = SA[i - prefetch_distance - 1]; const uint8_t * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? Ts1 : NULL);
+
+ sa_sint_t p0 = SA[i - 0]; index = (p0 == 0) ? (sa_sint_t)(i - 0) : index;
+ SA[i - 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; uint8_t c0 = T[p0 - (p0 > 0)], c1 = T[p0]; SA[i - 0] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? p0 : t; }
+
+ sa_sint_t p1 = SA[i - 1]; index = (p1 == 0) ? (sa_sint_t)(i - 1) : index;
+ SA[i - 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; uint8_t c0 = T[p1 - (p1 > 0)], c1 = T[p1]; SA[i - 1] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? p1 : t; }
+ }
+
+ /* Tail loop for the remaining entries at the bottom of the range. */
+ for (j -= prefetch_distance + 1; i >= j; i -= 1)
+ {
+ sa_sint_t p = SA[i]; index = (p == 0) ? (sa_sint_t)i : index;
+ SA[i] = p & SAINT_MAX; if (p > 0) { p--; uint8_t c0 = T[p - (p > 0)], c1 = T[p]; SA[i] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? p : t; }
+ }
+
+ return index;
+}
+
+/* Sequential right-to-left BWT induction scan that also maintains the auxiliary
+   index array I: after each pre-decrement induction, if the induced text
+   position is a multiple of (rm + 1), induction_bucket[T[p]] + 1 is recorded in
+   I[p / (rm + 1)]. Otherwise identical in structure to
+   libsais_final_bwt_scan_right_to_left_8u (but does not track the sentinel). */
+static void libsais_final_bwt_aux_scan_right_to_left_8u(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ fast_sint_t i, j;
+ for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2)
+ {
+ libsais_prefetchw(&SA[i - 2 * prefetch_distance]);
+
+ sa_sint_t s0 = SA[i - prefetch_distance - 0]; const uint8_t * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL);
+ sa_sint_t s1 = SA[i - prefetch_distance - 1]; const uint8_t * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? Ts1 : NULL);
+
+ sa_sint_t p0 = SA[i - 0];
+ SA[i - 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; uint8_t c0 = T[p0 - (p0 > 0)], c1 = T[p0]; SA[i - 0] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? p0 : t; if ((p0 & rm) == 0) { I[p0 / (rm + 1)] = induction_bucket[T[p0]] + 1; } }
+
+ sa_sint_t p1 = SA[i - 1];
+ SA[i - 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; uint8_t c0 = T[p1 - (p1 > 0)], c1 = T[p1]; SA[i - 1] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? p1 : t; if ((p1 & rm) == 0) { I[p1 / (rm + 1)] = induction_bucket[T[p1]] + 1; } }
+ }
+
+ /* Tail loop for the remaining entries at the bottom of the range. */
+ for (j -= prefetch_distance + 1; i >= j; i -= 1)
+ {
+ sa_sint_t p = SA[i];
+ SA[i] = p & SAINT_MAX; if (p > 0) { p--; uint8_t c0 = T[p - (p > 0)], c1 = T[p]; SA[i] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? p : t; if ((p & rm) == 0) { I[p / (rm + 1)] = induction_bucket[T[p]] + 1; } }
+ }
+}
+
+/* Sequential right-to-left final sorting scan (8-bit alphabet, plain suffix-array
+   output). For each positive entry p: clears the sign bit in place and induces
+   p - 1 into the head of T[p-1]'s bucket (pre-decrement), setting the induced
+   entry's sign bit when T[p-2] > T[p-1]. Note the '>' comparison here mirrors
+   the '<' used by the left-to-right scans. */
+static void libsais_final_sorting_scan_right_to_left_8u(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ fast_sint_t i, j;
+ for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2)
+ {
+ libsais_prefetchw(&SA[i - 2 * prefetch_distance]);
+
+ sa_sint_t s0 = SA[i - prefetch_distance - 0]; const uint8_t * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL);
+ sa_sint_t s1 = SA[i - prefetch_distance - 1]; const uint8_t * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? Ts1 : NULL);
+
+ sa_sint_t p0 = SA[i - 0]; SA[i - 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; SA[--induction_bucket[T[p0]]] = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] > T[p0]) << (SAINT_BIT - 1)); }
+ sa_sint_t p1 = SA[i - 1]; SA[i - 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; SA[--induction_bucket[T[p1]]] = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] > T[p1]) << (SAINT_BIT - 1)); }
+ }
+
+ /* Tail loop for the remaining entries at the bottom of the range. */
+ for (j -= prefetch_distance + 1; i >= j; i -= 1)
+ {
+ sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; SA[--induction_bucket[T[p]]] = p | ((sa_sint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1)); }
+ }
+}
+
+/* Sequential right-to-left final sorting scan for 32-bit alphabets. Same
+   induction as the 8u variant, but with a deeper two-stage prefetch pipeline:
+   at distance 2*prefetch_distance the text is prefetched, and at distance
+   prefetch_distance the target bucket slot and neighboring text are prefetched. */
+static void libsais_final_sorting_scan_right_to_left_32s(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ fast_sint_t i, j;
+ for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 1; i >= j; i -= 2)
+ {
+ libsais_prefetchw(&SA[i - 3 * prefetch_distance]);
+
+ sa_sint_t s0 = SA[i - 2 * prefetch_distance - 0]; const sa_sint_t * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL);
+ sa_sint_t s1 = SA[i - 2 * prefetch_distance - 1]; const sa_sint_t * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL);
+ sa_sint_t s2 = SA[i - 1 * prefetch_distance - 0]; if (s2 > 0) { libsais_prefetchw(&induction_bucket[T[s2 - 1]]); libsais_prefetch(&T[s2] - 2); }
+ sa_sint_t s3 = SA[i - 1 * prefetch_distance - 1]; if (s3 > 0) { libsais_prefetchw(&induction_bucket[T[s3 - 1]]); libsais_prefetch(&T[s3] - 2); }
+
+ sa_sint_t p0 = SA[i - 0]; SA[i - 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; SA[--induction_bucket[T[p0]]] = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] > T[p0]) << (SAINT_BIT - 1)); }
+ sa_sint_t p1 = SA[i - 1]; SA[i - 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; SA[--induction_bucket[T[p1]]] = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] > T[p1]) << (SAINT_BIT - 1)); }
+ }
+
+ /* Tail loop for the remaining entries at the bottom of the range. */
+ for (j -= 2 * prefetch_distance + 1; i >= j; i -= 1)
+ {
+ sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; SA[--induction_bucket[T[p]]] = p | ((sa_sint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1)); }
+ }
+}
+
+#if defined(_OPENMP)
+
+/* Per-thread "prepare" pass of the parallel right-to-left BWT induction scan.
+   Walks the block downward; for each positive entry p it clears the sign bit,
+   stores the BWT symbol c1 = T[p-1] into SA, tallies c1 in the thread-local
+   'buckets', and caches c1 plus the value to be placed (p - 1, or c0 | SAINT_MIN
+   when c0 = T[p-2] > c1). Returns the number of cache entries produced. */
+static fast_sint_t libsais_final_bwt_scan_right_to_left_8u_block_prepare(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ memset(buckets, 0, ALPHABET_SIZE * sizeof(sa_sint_t));
+
+ fast_sint_t i, j, count = 0;
+ /* Main loop: two entries per iteration, scanning downward with prefetch. */
+ for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2)
+ {
+ libsais_prefetchw(&SA[i - 2 * prefetch_distance]);
+
+ sa_sint_t s0 = SA[i - prefetch_distance - 0]; const uint8_t * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL);
+ sa_sint_t s1 = SA[i - prefetch_distance - 1]; const uint8_t * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? Ts1 : NULL);
+
+ sa_sint_t p0 = SA[i - 0]; SA[i - 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; uint8_t c0 = T[p0 - (p0 > 0)], c1 = T[p0]; SA[i - 0] = c1; sa_sint_t t = c0 | SAINT_MIN; buckets[cache[count].symbol = c1]++; cache[count++].index = (c0 <= c1) ? p0 : t; }
+ sa_sint_t p1 = SA[i - 1]; SA[i - 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; uint8_t c0 = T[p1 - (p1 > 0)], c1 = T[p1]; SA[i - 1] = c1; sa_sint_t t = c0 | SAINT_MIN; buckets[cache[count].symbol = c1]++; cache[count++].index = (c0 <= c1) ? p1 : t; }
+ }
+
+ /* Tail loop for the remaining entries at the bottom of the range. */
+ for (j -= prefetch_distance + 1; i >= j; i -= 1)
+ {
+ sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; uint8_t c0 = T[p - (p > 0)], c1 = T[p]; SA[i] = c1; sa_sint_t t = c0 | SAINT_MIN; buckets[cache[count].symbol = c1]++; cache[count++].index = (c0 <= c1) ? p : t; }
+ }
+
+ return count;
+}
+
+/* Per-thread "prepare" pass of the parallel right-to-left BWT scan with
+   auxiliary indexes. Like the plain BWT prepare, but each induced suffix
+   consumes TWO cache slots: slot k holds the symbol and the value to place,
+   slot k+1 holds the raw text position p (needed by the place pass to decide
+   whether an auxiliary index entry must be written). Returns the number of
+   cache slots used (2x the number of induced suffixes). */
+static fast_sint_t libsais_final_bwt_aux_scan_right_to_left_8u_block_prepare(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ memset(buckets, 0, ALPHABET_SIZE * sizeof(sa_sint_t));
+
+ fast_sint_t i, j, count = 0;
+ for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2)
+ {
+ libsais_prefetchw(&SA[i - 2 * prefetch_distance]);
+
+ sa_sint_t s0 = SA[i - prefetch_distance - 0]; const uint8_t * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL);
+ sa_sint_t s1 = SA[i - prefetch_distance - 1]; const uint8_t * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? Ts1 : NULL);
+
+ sa_sint_t p0 = SA[i - 0]; SA[i - 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; uint8_t c0 = T[p0 - (p0 > 0)], c1 = T[p0]; SA[i - 0] = c1; sa_sint_t t = c0 | SAINT_MIN; buckets[cache[count].symbol = c1]++; cache[count].index = (c0 <= c1) ? p0 : t; cache[count + 1].index = p0; count += 2; }
+ sa_sint_t p1 = SA[i - 1]; SA[i - 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; uint8_t c0 = T[p1 - (p1 > 0)], c1 = T[p1]; SA[i - 1] = c1; sa_sint_t t = c0 | SAINT_MIN; buckets[cache[count].symbol = c1]++; cache[count].index = (c0 <= c1) ? p1 : t; cache[count + 1].index = p1; count += 2; }
+ }
+
+ /* Tail loop for the remaining entries at the bottom of the range. */
+ for (j -= prefetch_distance + 1; i >= j; i -= 1)
+ {
+ sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; uint8_t c0 = T[p - (p > 0)], c1 = T[p]; SA[i] = c1; sa_sint_t t = c0 | SAINT_MIN; buckets[cache[count].symbol = c1]++; cache[count].index = (c0 <= c1) ? p : t; cache[count + 1].index = p; count += 2; }
+ }
+
+ return count;
+}
+
+/* Per-thread "prepare" pass of the parallel right-to-left final sorting scan
+   (plain suffix-array output). Clears each entry's sign bit in place, tallies
+   the induced symbol T[p-1] in the thread-local 'buckets', and caches the
+   symbol plus the packed index (sign bit set when T[p-2] > T[p-1]).
+   Returns the number of cache entries produced. */
+static fast_sint_t libsais_final_sorting_scan_right_to_left_8u_block_prepare(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ memset(buckets, 0, ALPHABET_SIZE * sizeof(sa_sint_t));
+
+ fast_sint_t i, j, count = 0;
+ for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2)
+ {
+ libsais_prefetchw(&SA[i - 2 * prefetch_distance]);
+
+ sa_sint_t s0 = SA[i - prefetch_distance - 0]; const uint8_t * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL);
+ sa_sint_t s1 = SA[i - prefetch_distance - 1]; const uint8_t * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? Ts1 : NULL);
+
+ sa_sint_t p0 = SA[i - 0]; SA[i - 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; buckets[cache[count].symbol = T[p0]]++; cache[count++].index = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] > T[p0]) << (SAINT_BIT - 1)); }
+ sa_sint_t p1 = SA[i - 1]; SA[i - 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; buckets[cache[count].symbol = T[p1]]++; cache[count++].index = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] > T[p1]) << (SAINT_BIT - 1)); }
+ }
+
+ /* Tail loop for the remaining entries at the bottom of the range. */
+ for (j -= prefetch_distance + 1; i >= j; i -= 1)
+ {
+ sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; buckets[cache[count].symbol = T[p]]++; cache[count++].index = p | ((sa_sint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1)); }
+ }
+
+ return count;
+}
+
+/* Per-thread "place" step matching the right-to-left prepare functions:
+   replays the thread-local cache and writes each cached index into SA at the
+   pre-decremented position of its symbol's bucket. The buckets were adjusted
+   by the master thread so that each thread writes a disjoint range. */
+static void libsais_final_order_scan_right_to_left_8u_block_place(sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ fast_sint_t i, j;
+ /* Unrolled four entries per iteration; cache is read sequentially. */
+ for (i = 0, j = count - 3; i < j; i += 4)
+ {
+ libsais_prefetch(&cache[i + prefetch_distance]);
+
+ SA[--buckets[cache[i + 0].symbol]] = cache[i + 0].index;
+ SA[--buckets[cache[i + 1].symbol]] = cache[i + 1].index;
+ SA[--buckets[cache[i + 2].symbol]] = cache[i + 2].index;
+ SA[--buckets[cache[i + 3].symbol]] = cache[i + 3].index;
+ }
+
+ /* Tail: remaining (count % 4) entries. */
+ for (j += 3; i < j; i += 1)
+ {
+ SA[--buckets[cache[i].symbol]] = cache[i].index;
+ }
+}
+
+/* "Place" step for the BWT-with-auxiliary-indexes variant. Cache entries come
+   in pairs (symbol/value entry followed by a raw position entry — see the
+   matching prepare function which advances count by 2), hence the stride of 2
+   in the tail loop and 8 (four pairs) in the unrolled loop. Whenever a placed
+   position is a multiple of (rm + 1) — rm is presumably r - 1 with r a power
+   of two, given the `& rm` mask; TODO confirm — its SA slot is recorded in I. */
+static void libsais_final_bwt_aux_scan_right_to_left_8u_block_place(sa_sint_t * RESTRICT SA, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ fast_sint_t i, j;
+ for (i = 0, j = count - 6; i < j; i += 8)
+ {
+ libsais_prefetch(&cache[i + prefetch_distance]);
+
+ SA[--buckets[cache[i + 0].symbol]] = cache[i + 0].index; if ((cache[i + 1].index & rm) == 0) { I[cache[i + 1].index / (rm + 1)] = buckets[cache[i + 0].symbol] + 1; }
+ SA[--buckets[cache[i + 2].symbol]] = cache[i + 2].index; if ((cache[i + 3].index & rm) == 0) { I[cache[i + 3].index / (rm + 1)] = buckets[cache[i + 2].symbol] + 1; }
+ SA[--buckets[cache[i + 4].symbol]] = cache[i + 4].index; if ((cache[i + 5].index & rm) == 0) { I[cache[i + 5].index / (rm + 1)] = buckets[cache[i + 4].symbol] + 1; }
+ SA[--buckets[cache[i + 6].symbol]] = cache[i + 6].index; if ((cache[i + 7].index & rm) == 0) { I[cache[i + 7].index / (rm + 1)] = buckets[cache[i + 6].symbol] + 1; }
+ }
+
+ /* Tail masks the position with SAINT_MAX before the division, unlike the
+    unrolled loop above — NOTE(review): presumably the unrolled path relies on
+    the index already being untagged there; verify against the prepare step. */
+ for (j += 6; i < j; i += 2)
+ {
+ SA[--buckets[cache[i].symbol]] = cache[i].index; if ((cache[i + 1].index & rm) == 0) { I[(cache[i + 1].index & SAINT_MAX) / (rm + 1)] = buckets[cache[i].symbol] + 1; }
+ }
+}
+
+/* "Gather" step of the parallel right-to-left final scan for 32-bit alphabets.
+   Walks the block left-to-right, untags each SA entry, and stores into the
+   cache (indexed by SA position, not compacted) the would-be induced suffix:
+   its L/S-tagged predecessor index and predecessor symbol. Entries that induce
+   nothing get symbol = SAINT_MIN so the later sort step can skip them. */
+static void libsais_final_sorting_scan_right_to_left_32s_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ fast_sint_t i, j;
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2)
+ {
+ libsais_prefetchw(&SA[i + 2 * prefetch_distance]);
+
+ sa_sint_t s0 = SA[i + prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL);
+ sa_sint_t s1 = SA[i + prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? Ts1 : NULL);
+
+ libsais_prefetchw(&cache[i + prefetch_distance]);
+
+ sa_sint_t symbol0 = SAINT_MIN, p0 = SA[i + 0]; SA[i + 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; cache[i + 0].index = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] > T[p0]) << (SAINT_BIT - 1)); symbol0 = T[p0]; } cache[i + 0].symbol = symbol0;
+ sa_sint_t symbol1 = SAINT_MIN, p1 = SA[i + 1]; SA[i + 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; cache[i + 1].index = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] > T[p1]) << (SAINT_BIT - 1)); symbol1 = T[p1]; } cache[i + 1].symbol = symbol1;
+ }
+
+ /* Tail of the unrolled loop. */
+ for (j += prefetch_distance + 1; i < j; i += 1)
+ {
+ sa_sint_t symbol = SAINT_MIN, p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; cache[i].index = p | ((sa_sint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1)); symbol = T[p]; } cache[i].symbol = symbol;
+ }
+}
+
+/* Serial "sort" step (run by the master thread) over the whole cached block:
+   scans the cache right-to-left and, for each valid entry (symbol >= 0),
+   replaces its symbol with the decremented induction-bucket position. When
+   that destination still lies inside the current block (>= omp_block_start),
+   the induced suffix is expanded in place within the cache so the induction
+   chain continues without touching SA. */
+static void libsais_final_sorting_scan_right_to_left_32s_block_sort(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ fast_sint_t i, j;
+ for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2)
+ {
+ libsais_prefetchw(&cache[i - 2 * prefetch_distance]);
+
+ sa_sint_t s0 = cache[i - prefetch_distance - 0].symbol; const sa_sint_t * Is0 = &induction_bucket[s0]; libsais_prefetchw(s0 >= 0 ? Is0 : NULL);
+ sa_sint_t s1 = cache[i - prefetch_distance - 1].symbol; const sa_sint_t * Is1 = &induction_bucket[s1]; libsais_prefetchw(s1 >= 0 ? Is1 : NULL);
+
+ sa_sint_t v0 = cache[i - 0].symbol;
+ if (v0 >= 0)
+ {
+ /* Symbol slot is repurposed to hold the destination SA position. */
+ cache[i - 0].symbol = --induction_bucket[v0];
+ if (cache[i - 0].symbol >= omp_block_start) { sa_sint_t ni = cache[i - 0].symbol, np = cache[i - 0].index; cache[i - 0].index = np & SAINT_MAX; if (np > 0) { np--; cache[ni].index = np | ((sa_sint_t)(T[np - (np > 0)] > T[np]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np]; } }
+ }
+
+ sa_sint_t v1 = cache[i - 1].symbol;
+ if (v1 >= 0)
+ {
+ cache[i - 1].symbol = --induction_bucket[v1];
+ if (cache[i - 1].symbol >= omp_block_start) { sa_sint_t ni = cache[i - 1].symbol, np = cache[i - 1].index; cache[i - 1].index = np & SAINT_MAX; if (np > 0) { np--; cache[ni].index = np | ((sa_sint_t)(T[np - (np > 0)] > T[np]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np]; } }
+ }
+ }
+
+ /* Tail of the unrolled loop. */
+ for (j -= prefetch_distance + 1; i >= j; i -= 1)
+ {
+ sa_sint_t v = cache[i].symbol;
+ if (v >= 0)
+ {
+ cache[i].symbol = --induction_bucket[v];
+ if (cache[i].symbol >= omp_block_start) { sa_sint_t ni = cache[i].symbol, np = cache[i].index; cache[i].index = np & SAINT_MAX; if (np > 0) { np--; cache[ni].index = np | ((sa_sint_t)(T[np - (np > 0)] > T[np]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np]; } }
+ }
+ }
+}
+
+/* Parallel driver for one block of the right-to-left BWT induction scan.
+   Splits the block across threads (stride rounded down to a multiple of 16);
+   with one thread it falls through to the serial routine. Otherwise:
+   1) each thread prepares its sub-block into thread-local cache/buckets,
+   2) after a barrier the master turns per-thread counts into exclusive
+      starting offsets (scanned from the last thread backwards),
+   3) after another barrier each thread places its cached entries. */
+static void libsais_final_bwt_scan_right_to_left_8u_block_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+#if defined(_OPENMP)
+ #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * ALPHABET_SIZE && omp_get_dynamic() == 0)
+#endif
+ {
+#if defined(_OPENMP)
+ fast_sint_t omp_thread_num = omp_get_thread_num();
+ fast_sint_t omp_num_threads = omp_get_num_threads();
+#else
+ UNUSED(threads); UNUSED(thread_state);
+
+ fast_sint_t omp_thread_num = 0;
+ fast_sint_t omp_num_threads = 1;
+#endif
+ fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
+ fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+ fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
+
+ omp_block_start += block_start;
+
+ if (omp_num_threads == 1)
+ {
+ libsais_final_bwt_scan_right_to_left_8u(T, SA, induction_bucket, omp_block_start, omp_block_size);
+ }
+#if defined(_OPENMP)
+ else
+ {
+ {
+ thread_state[omp_thread_num].state.count = libsais_final_bwt_scan_right_to_left_8u_block_prepare(T, SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size);
+ }
+
+ #pragma omp barrier
+
+ #pragma omp master
+ {
+ /* Convert per-thread symbol counts into starting bucket positions:
+    induction_bucket is decremented by each thread's counts, and each
+    thread's buckets receive the pre-decrement value to place from. */
+ fast_sint_t t;
+ for (t = omp_num_threads - 1; t >= 0; --t)
+ {
+ sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets;
+ fast_sint_t c; for (c = 0; c < ALPHABET_SIZE; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_bucket[c]; induction_bucket[c] = A - B; temp_bucket[c] = A; }
+ }
+ }
+
+ #pragma omp barrier
+
+ {
+ libsais_final_order_scan_right_to_left_8u_block_place(SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count);
+ }
+ }
+#endif
+ }
+}
+
+/* Same three-phase prepare / bucket-scan / place scheme as
+   libsais_final_bwt_scan_right_to_left_8u_block_omp, but for the BWT variant
+   that also records auxiliary indexes I at every (rm + 1)-aligned position. */
+static void libsais_final_bwt_aux_scan_right_to_left_8u_block_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+#if defined(_OPENMP)
+ #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * ALPHABET_SIZE && omp_get_dynamic() == 0)
+#endif
+ {
+#if defined(_OPENMP)
+ fast_sint_t omp_thread_num = omp_get_thread_num();
+ fast_sint_t omp_num_threads = omp_get_num_threads();
+#else
+ UNUSED(threads); UNUSED(thread_state);
+
+ fast_sint_t omp_thread_num = 0;
+ fast_sint_t omp_num_threads = 1;
+#endif
+ fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
+ fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+ fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
+
+ omp_block_start += block_start;
+
+ if (omp_num_threads == 1)
+ {
+ /* Single thread inside the parallel region: use the serial routine. */
+ libsais_final_bwt_aux_scan_right_to_left_8u(T, SA, rm, I, induction_bucket, omp_block_start, omp_block_size);
+ }
+#if defined(_OPENMP)
+ else
+ {
+ {
+ thread_state[omp_thread_num].state.count = libsais_final_bwt_aux_scan_right_to_left_8u_block_prepare(T, SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size);
+ }
+
+ #pragma omp barrier
+
+ #pragma omp master
+ {
+ /* Exclusive scan of per-thread bucket counts (see the non-aux variant). */
+ fast_sint_t t;
+ for (t = omp_num_threads - 1; t >= 0; --t)
+ {
+ sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets;
+ fast_sint_t c; for (c = 0; c < ALPHABET_SIZE; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_bucket[c]; induction_bucket[c] = A - B; temp_bucket[c] = A; }
+ }
+ }
+
+ #pragma omp barrier
+
+ {
+ libsais_final_bwt_aux_scan_right_to_left_8u_block_place(SA, rm, I, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count);
+ }
+ }
+#endif
+ }
+}
+
+/* Parallel driver for one block of the right-to-left final suffix-sorting
+   scan (8-bit alphabet, plain suffix array output). Identical structure to
+   libsais_final_bwt_scan_right_to_left_8u_block_omp: prepare per thread,
+   master scans buckets between barriers, then each thread places its cache. */
+static void libsais_final_sorting_scan_right_to_left_8u_block_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+#if defined(_OPENMP)
+ #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * ALPHABET_SIZE && omp_get_dynamic() == 0)
+#endif
+ {
+#if defined(_OPENMP)
+ fast_sint_t omp_thread_num = omp_get_thread_num();
+ fast_sint_t omp_num_threads = omp_get_num_threads();
+#else
+ UNUSED(threads); UNUSED(thread_state);
+
+ fast_sint_t omp_thread_num = 0;
+ fast_sint_t omp_num_threads = 1;
+#endif
+ fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
+ fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+ fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
+
+ omp_block_start += block_start;
+
+ if (omp_num_threads == 1)
+ {
+ libsais_final_sorting_scan_right_to_left_8u(T, SA, induction_bucket, omp_block_start, omp_block_size);
+ }
+#if defined(_OPENMP)
+ else
+ {
+ {
+ thread_state[omp_thread_num].state.count = libsais_final_sorting_scan_right_to_left_8u_block_prepare(T, SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size);
+ }
+
+ #pragma omp barrier
+
+ #pragma omp master
+ {
+ /* Exclusive scan of per-thread bucket counts into placement offsets. */
+ fast_sint_t t;
+ for (t = omp_num_threads - 1; t >= 0; --t)
+ {
+ sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets;
+ fast_sint_t c; for (c = 0; c < ALPHABET_SIZE; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_bucket[c]; induction_bucket[c] = A - B; temp_bucket[c] = A; }
+ }
+ }
+
+ #pragma omp barrier
+
+ {
+ libsais_final_order_scan_right_to_left_8u_block_place(SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count);
+ }
+ }
+#endif
+ }
+}
+
+/* Parallel driver for one block of the right-to-left final scan over a 32-bit
+   alphabet. Uses the gather / serial-sort / compact-and-place pipeline:
+   threads gather into a shared cache (offset by block_start so cache is
+   indexed by absolute SA position), the master runs the induction serially
+   over the whole block, then threads write the results back to SA. */
+static void libsais_final_sorting_scan_right_to_left_32s_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads)
+{
+#if defined(_OPENMP)
+ #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384)
+#endif
+ {
+#if defined(_OPENMP)
+ fast_sint_t omp_thread_num = omp_get_thread_num();
+ fast_sint_t omp_num_threads = omp_get_num_threads();
+#else
+ UNUSED(threads); UNUSED(cache);
+
+ fast_sint_t omp_thread_num = 0;
+ fast_sint_t omp_num_threads = 1;
+#endif
+ fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
+ fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+ fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
+
+ omp_block_start += block_start;
+
+ if (omp_num_threads == 1)
+ {
+ libsais_final_sorting_scan_right_to_left_32s(T, SA, buckets, omp_block_start, omp_block_size);
+ }
+#if defined(_OPENMP)
+ else
+ {
+ {
+ libsais_final_sorting_scan_right_to_left_32s_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size);
+ }
+
+ #pragma omp barrier
+
+ #pragma omp master
+ {
+ /* Induction is inherently sequential across the block; only the master
+    runs it, over the full [block_start, block_start + block_size) range. */
+ libsais_final_sorting_scan_right_to_left_32s_block_sort(T, buckets, cache - block_start, block_start, block_size);
+ }
+
+ #pragma omp barrier
+
+ {
+ libsais_compact_and_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size);
+ }
+ }
+#endif
+ }
+}
+
+#endif
+
+/* Top-level right-to-left BWT induction scan. Serial for one thread or small
+   inputs; otherwise walks SA from the end, splitting it into chunks delimited
+   by zero entries (the primary/terminator position — its index is returned).
+   Chunks shorter than 32 are handled inline; larger ones go through the
+   parallel block driver. Returns the BWT primary index, or -1 if the zero
+   entry was never seen (serial path returns it from the callee). */
+static sa_sint_t libsais_final_bwt_scan_right_to_left_8u_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+ sa_sint_t index = -1;
+
+ if (threads == 1 || n < 65536)
+ {
+ index = libsais_final_bwt_scan_right_to_left_8u(T, SA, induction_bucket, 0, n);
+ }
+#if defined(_OPENMP)
+ else
+ {
+ fast_sint_t block_start;
+ for (block_start = (fast_sint_t)n - 1; block_start >= 0; )
+ {
+ if (SA[block_start] == 0)
+ {
+ /* Zero marks the position of the primary index in the BWT output. */
+ index = (sa_sint_t)block_start--;
+ }
+ else
+ {
+ /* Bound the chunk so all entries fit the per-thread caches. */
+ fast_sint_t block_max_end = block_start - ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads); if (block_max_end < 0) { block_max_end = -1; }
+ fast_sint_t block_end = block_start - 1; while (block_end > block_max_end && SA[block_end] != 0) { block_end--; }
+ fast_sint_t block_size = block_start - block_end;
+
+ if (block_size < 32)
+ {
+ /* Tiny chunk: induce inline. SA[i] receives the BWT symbol c1; the
+    target bucket slot gets the predecessor tagged with SAINT_MIN when
+    it is L-type (c0 > c1). */
+ for (; block_start > block_end; block_start -= 1)
+ {
+ sa_sint_t p = SA[block_start]; SA[block_start] = p & SAINT_MAX; if (p > 0) { p--; uint8_t c0 = T[p - (p > 0)], c1 = T[p]; SA[block_start] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? p : t; }
+ }
+ }
+ else
+ {
+ libsais_final_bwt_scan_right_to_left_8u_block_omp(T, SA, induction_bucket, block_end + 1, block_size, threads, thread_state);
+ block_start = block_end;
+ }
+ }
+ }
+ }
+#else
+ UNUSED(thread_state);
+#endif
+
+ return index;
+}
+
+/* Top-level right-to-left BWT scan with auxiliary indexes: same chunking
+   strategy as libsais_final_bwt_scan_right_to_left_8u_omp, but additionally
+   records I[p / (rm + 1)] for (rm + 1)-aligned positions, and does not need
+   to return the primary index. Note the per-thread cache budget is halved
+   because the aux prepare step stores two cache entries per suffix. */
+static void libsais_final_bwt_aux_scan_right_to_left_8u_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+ if (threads == 1 || n < 65536)
+ {
+ libsais_final_bwt_aux_scan_right_to_left_8u(T, SA, rm, I, induction_bucket, 0, n);
+ }
+#if defined(_OPENMP)
+ else
+ {
+ fast_sint_t block_start;
+ for (block_start = (fast_sint_t)n - 1; block_start >= 0; )
+ {
+ if (SA[block_start] == 0)
+ {
+ block_start--;
+ }
+ else
+ {
+ fast_sint_t block_max_end = block_start - ((fast_sint_t)threads) * ((LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads) / 2); if (block_max_end < 0) { block_max_end = -1; }
+ fast_sint_t block_end = block_start - 1; while (block_end > block_max_end && SA[block_end] != 0) { block_end--; }
+ fast_sint_t block_size = block_start - block_end;
+
+ if (block_size < 32)
+ {
+ /* Tiny chunk handled inline, including the aux-index update. */
+ for (; block_start > block_end; block_start -= 1)
+ {
+ sa_sint_t p = SA[block_start]; SA[block_start] = p & SAINT_MAX; if (p > 0) { p--; uint8_t c0 = T[p - (p > 0)], c1 = T[p]; SA[block_start] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? p : t; if ((p & rm) == 0) { I[p / (rm + 1)] = induction_bucket[T[p]] + 1; } }
+ }
+ }
+ else
+ {
+ libsais_final_bwt_aux_scan_right_to_left_8u_block_omp(T, SA, rm, I, induction_bucket, block_end + 1, block_size, threads, thread_state);
+ block_start = block_end;
+ }
+ }
+ }
+ }
+#else
+ UNUSED(thread_state);
+#endif
+}
+
+/* Top-level right-to-left final sorting scan (plain suffix array, 8-bit
+   alphabet). Same zero-delimited chunking as the BWT variants; small chunks
+   are induced inline with branchless L/S tagging, larger chunks go through
+   the parallel block driver. NOTE(review): the clamp here compares
+   `block_max_end < -1` where the sibling functions use `< 0` — both clamp to
+   -1 and appear equivalent, but worth confirming the asymmetry is intended. */
+static void libsais_final_sorting_scan_right_to_left_8u_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+ if (threads == 1 || n < 65536)
+ {
+ libsais_final_sorting_scan_right_to_left_8u(T, SA, induction_bucket, 0, n);
+ }
+#if defined(_OPENMP)
+ else
+ {
+ fast_sint_t block_start;
+ for (block_start = (fast_sint_t)n - 1; block_start >= 0; )
+ {
+ if (SA[block_start] == 0)
+ {
+ block_start--;
+ }
+ else
+ {
+ fast_sint_t block_max_end = block_start - ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads); if (block_max_end < -1) { block_max_end = -1; }
+ fast_sint_t block_end = block_start - 1; while (block_end > block_max_end && SA[block_end] != 0) { block_end--; }
+ fast_sint_t block_size = block_start - block_end;
+
+ if (block_size < 32)
+ {
+ for (; block_start > block_end; block_start -= 1)
+ {
+ sa_sint_t p = SA[block_start]; SA[block_start] = p & SAINT_MAX; if (p > 0) { p--; SA[--induction_bucket[T[p]]] = p | ((sa_sint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1)); }
+ }
+ }
+ else
+ {
+ libsais_final_sorting_scan_right_to_left_8u_block_omp(T, SA, induction_bucket, block_end + 1, block_size, threads, thread_state);
+ block_start = block_end;
+ }
+ }
+ }
+ }
+#else
+ UNUSED(thread_state);
+#endif
+}
+
+/* Top-level right-to-left final scan for 32-bit alphabets. Serial for one
+   thread or small n; otherwise processes fixed-size blocks from the end of
+   SA, each sized to fit the shared thread cache, via the 32s block driver. */
+static void libsais_final_sorting_scan_right_to_left_32s_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+ if (threads == 1 || n < 65536)
+ {
+ libsais_final_sorting_scan_right_to_left_32s(T, SA, induction_bucket, 0, n);
+ }
+#if defined(_OPENMP)
+ else
+ {
+ fast_sint_t block_start, block_end;
+ for (block_start = (fast_sint_t)n - 1; block_start >= 0; block_start = block_end)
+ {
+ block_end = block_start - (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end < 0) { block_end = -1; }
+
+ libsais_final_sorting_scan_right_to_left_32s_block_omp(T, SA, induction_bucket, thread_state[0].state.cache, block_end + 1, block_start - block_end, threads);
+ }
+ }
+#else
+ UNUSED(thread_state);
+#endif
+}
+
+/* Zeroes, for each of the k symbols, the SA range [bucket_start[c],
+   bucket_end[c]). Parallelized per symbol with static,1 scheduling so
+   uneven bucket sizes are spread across threads. */
+static void libsais_clear_lms_suffixes_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT bucket_start, sa_sint_t * RESTRICT bucket_end, sa_sint_t threads)
+{
+ fast_sint_t c;
+
+#if defined(_OPENMP)
+ #pragma omp parallel for schedule(static, 1) num_threads(threads) if(threads > 1 && n >= 65536)
+#else
+ UNUSED(threads); UNUSED(n);
+#endif
+ for (c = 0; c < k; ++c)
+ {
+ if (bucket_end[c] > bucket_start[c])
+ {
+ memset(&SA[bucket_start[c]], 0, ((size_t)bucket_end[c] - (size_t)bucket_start[c]) * sizeof(sa_sint_t));
+ }
+ }
+}
+
+/* Final induction pass for 8-bit input: left-to-right then right-to-left
+   scans using the bucket tables at offsets 6k (L-type starts) and 7k (S-type
+   ends). Selects between plain suffix-array, BWT, and BWT-with-aux-indexes
+   modes. Returns the BWT primary index in plain-BWT mode, 0 otherwise.
+   The clear step between scans is only needed on the parallel path. */
+static sa_sint_t libsais_induce_final_order_8u_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t bwt, sa_sint_t r, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+ if (!bwt)
+ {
+ libsais_final_sorting_scan_left_to_right_8u_omp(T, SA, n, &buckets[6 * ALPHABET_SIZE], threads, thread_state);
+ if (threads > 1 && n >= 65536) { libsais_clear_lms_suffixes_omp(SA, n, ALPHABET_SIZE, &buckets[6 * ALPHABET_SIZE], &buckets[7 * ALPHABET_SIZE], threads); }
+ libsais_final_sorting_scan_right_to_left_8u_omp(T, SA, n, &buckets[7 * ALPHABET_SIZE], threads, thread_state);
+ return 0;
+ }
+ else if (I != NULL)
+ {
+ /* r is the aux-index sampling rate; the scans take r - 1 as a mask,
+    which presumes r is a power of two — TODO confirm at the call site. */
+ libsais_final_bwt_aux_scan_left_to_right_8u_omp(T, SA, n, r - 1, I, &buckets[6 * ALPHABET_SIZE], threads, thread_state);
+ if (threads > 1 && n >= 65536) { libsais_clear_lms_suffixes_omp(SA, n, ALPHABET_SIZE, &buckets[6 * ALPHABET_SIZE], &buckets[7 * ALPHABET_SIZE], threads); }
+ libsais_final_bwt_aux_scan_right_to_left_8u_omp(T, SA, n, r - 1, I, &buckets[7 * ALPHABET_SIZE], threads, thread_state);
+ return 0;
+ }
+ else
+ {
+ libsais_final_bwt_scan_left_to_right_8u_omp(T, SA, n, &buckets[6 * ALPHABET_SIZE], threads, thread_state);
+ if (threads > 1 && n >= 65536) { libsais_clear_lms_suffixes_omp(SA, n, ALPHABET_SIZE, &buckets[6 * ALPHABET_SIZE], &buckets[7 * ALPHABET_SIZE], threads); }
+ return libsais_final_bwt_scan_right_to_left_8u_omp(T, SA, n, &buckets[7 * ALPHABET_SIZE], threads, thread_state);
+ }
+}
+
+/* Final induction for 32-bit alphabets using the 6k-word bucket layout:
+   L-scan from buckets[4k], S-scan from buckets[5k]. */
+static void libsais_induce_final_order_32s_6k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+ libsais_final_sorting_scan_left_to_right_32s_omp(T, SA, n, &buckets[4 * k], threads, thread_state);
+ libsais_final_sorting_scan_right_to_left_32s_omp(T, SA, n, &buckets[5 * k], threads, thread_state);
+}
+
+/* Final induction for the 4k-word bucket layout: starts at 2k / ends at 3k. */
+static void libsais_induce_final_order_32s_4k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+ libsais_final_sorting_scan_left_to_right_32s_omp(T, SA, n, &buckets[2 * k], threads, thread_state);
+ libsais_final_sorting_scan_right_to_left_32s_omp(T, SA, n, &buckets[3 * k], threads, thread_state);
+}
+
+/* Final induction for the 2k-word bucket layout: starts at 1k / ends at 0. */
+static void libsais_induce_final_order_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+ libsais_final_sorting_scan_left_to_right_32s_omp(T, SA, n, &buckets[1 * k], threads, thread_state);
+ libsais_final_sorting_scan_right_to_left_32s_omp(T, SA, n, &buckets[0 * k], threads, thread_state);
+}
+
+/* Final induction with only k words of bucket space: the bucket table must be
+   recounted and re-initialized (starts, then ends) before each scan because
+   the single table is consumed by the preceding pass. */
+static void libsais_induce_final_order_32s_1k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+ libsais_count_suffixes_32s(T, n, k, buckets);
+ libsais_initialize_buckets_start_32s_1k(k, buckets);
+ libsais_final_sorting_scan_left_to_right_32s_omp(T, SA, n, buckets, threads, thread_state);
+
+ libsais_count_suffixes_32s(T, n, k, buckets);
+ libsais_initialize_buckets_end_32s_1k(k, buckets);
+ libsais_final_sorting_scan_right_to_left_32s_omp(T, SA, n, buckets, threads, thread_state);
+}
+
+/* Renumbers sorted LMS suffixes for the recursive step. For each of the m
+   entries in SA[0..m): if the name slot SAm[p >> 1] is still negative the
+   suffix is unique — its text position in T is sign-tagged and the running
+   unique count f is incremented. Every name slot is rebased by -f so that
+   unique and non-unique suffixes end up in disjoint number ranges. Starts
+   from the caller-provided f (per-thread offset) and returns the updated f. */
+static sa_sint_t libsais_renumber_unique_and_nonunique_lms_suffixes_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t f, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ sa_sint_t * RESTRICT SAm = &SA[m];
+
+ sa_sint_t i, j;
+ for (i = (sa_sint_t)omp_block_start, j = (sa_sint_t)omp_block_start + (sa_sint_t)omp_block_size - 2 * (sa_sint_t)prefetch_distance - 3; i < j; i += 4)
+ {
+ libsais_prefetch(&SA[i + 3 * prefetch_distance]);
+
+ libsais_prefetchw(&SAm[((sa_uint_t)SA[i + 2 * prefetch_distance + 0]) >> 1]);
+ libsais_prefetchw(&SAm[((sa_uint_t)SA[i + 2 * prefetch_distance + 1]) >> 1]);
+ libsais_prefetchw(&SAm[((sa_uint_t)SA[i + 2 * prefetch_distance + 2]) >> 1]);
+ libsais_prefetchw(&SAm[((sa_uint_t)SA[i + 2 * prefetch_distance + 3]) >> 1]);
+
+ sa_uint_t q0 = (sa_uint_t)SA[i + prefetch_distance + 0]; const sa_sint_t * Tq0 = &T[q0]; libsais_prefetchw(SAm[q0 >> 1] < 0 ? Tq0 : NULL);
+ sa_uint_t q1 = (sa_uint_t)SA[i + prefetch_distance + 1]; const sa_sint_t * Tq1 = &T[q1]; libsais_prefetchw(SAm[q1 >> 1] < 0 ? Tq1 : NULL);
+ sa_uint_t q2 = (sa_uint_t)SA[i + prefetch_distance + 2]; const sa_sint_t * Tq2 = &T[q2]; libsais_prefetchw(SAm[q2 >> 1] < 0 ? Tq2 : NULL);
+ sa_uint_t q3 = (sa_uint_t)SA[i + prefetch_distance + 3]; const sa_sint_t * Tq3 = &T[q3]; libsais_prefetchw(SAm[q3 >> 1] < 0 ? Tq3 : NULL);
+
+ sa_uint_t p0 = (sa_uint_t)SA[i + 0]; sa_sint_t s0 = SAm[p0 >> 1]; if (s0 < 0) { T[p0] |= SAINT_MIN; f++; s0 = i + 0 + SAINT_MIN + f; } SAm[p0 >> 1] = s0 - f;
+ sa_uint_t p1 = (sa_uint_t)SA[i + 1]; sa_sint_t s1 = SAm[p1 >> 1]; if (s1 < 0) { T[p1] |= SAINT_MIN; f++; s1 = i + 1 + SAINT_MIN + f; } SAm[p1 >> 1] = s1 - f;
+ sa_uint_t p2 = (sa_uint_t)SA[i + 2]; sa_sint_t s2 = SAm[p2 >> 1]; if (s2 < 0) { T[p2] |= SAINT_MIN; f++; s2 = i + 2 + SAINT_MIN + f; } SAm[p2 >> 1] = s2 - f;
+ sa_uint_t p3 = (sa_uint_t)SA[i + 3]; sa_sint_t s3 = SAm[p3 >> 1]; if (s3 < 0) { T[p3] |= SAINT_MIN; f++; s3 = i + 3 + SAINT_MIN + f; } SAm[p3 >> 1] = s3 - f;
+ }
+
+ /* Tail of the unrolled loop. */
+ for (j += 2 * (sa_sint_t)prefetch_distance + 3; i < j; i += 1)
+ {
+ sa_uint_t p = (sa_uint_t)SA[i]; sa_sint_t s = SAm[p >> 1]; if (s < 0) { T[p] |= SAINT_MIN; f++; s = i + SAINT_MIN + f; } SAm[p >> 1] = s - f;
+ }
+
+ return f;
+}
+
+/* Single right-to-left pass over SA[m + block] that writes two compacted
+   streams in place: negative-tagged entries (unique names) go to the left
+   cursor l, positive entries go (decremented) to the right cursor r. Both
+   writes happen unconditionally; the cursor only advances when the entry
+   matched, so non-matching writes are overwritten by later ones. Cursors are
+   passed and returned through pl/pr so parallel callers can stitch ranges. */
+static void libsais_compact_unique_and_nonunique_lms_suffixes_32s(sa_sint_t * RESTRICT SA, sa_sint_t m, fast_sint_t * pl, fast_sint_t * pr, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ sa_sint_t * RESTRICT SAl = &SA[0];
+ sa_sint_t * RESTRICT SAr = &SA[0];
+
+ fast_sint_t i, j, l = *pl - 1, r = *pr - 1;
+ for (i = (fast_sint_t)m + omp_block_start + omp_block_size - 1, j = (fast_sint_t)m + omp_block_start + 3; i >= j; i -= 4)
+ {
+ libsais_prefetch(&SA[i - prefetch_distance]);
+
+ sa_sint_t p0 = SA[i - 0]; SAl[l] = p0 & SAINT_MAX; l -= p0 < 0; SAr[r] = p0 - 1; r -= p0 > 0;
+ sa_sint_t p1 = SA[i - 1]; SAl[l] = p1 & SAINT_MAX; l -= p1 < 0; SAr[r] = p1 - 1; r -= p1 > 0;
+ sa_sint_t p2 = SA[i - 2]; SAl[l] = p2 & SAINT_MAX; l -= p2 < 0; SAr[r] = p2 - 1; r -= p2 > 0;
+ sa_sint_t p3 = SA[i - 3]; SAl[l] = p3 & SAINT_MAX; l -= p3 < 0; SAr[r] = p3 - 1; r -= p3 > 0;
+ }
+
+ for (j -= 3; i >= j; i -= 1)
+ {
+ sa_sint_t p = SA[i]; SAl[l] = p & SAINT_MAX; l -= p < 0; SAr[r] = p - 1; r -= p > 0;
+ }
+
+ *pl = l + 1; *pr = r + 1;
+}
+
+
+#if defined(_OPENMP)
+
+/* Counts, over one thread's slice of SA[0..m), how many entries point at a
+   still-negative name slot in SAm (i.e. unique LMS suffixes). Uses four
+   independent accumulators to break the dependency chain in the unrolled
+   loop. Used to compute per-thread offsets before the parallel renumber. */
+static sa_sint_t libsais_count_unique_suffixes(sa_sint_t * RESTRICT SA, sa_sint_t m, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ sa_sint_t * RESTRICT SAm = &SA[m];
+
+ fast_sint_t i, j; sa_sint_t f0 = 0, f1 = 0, f2 = 0, f3 = 0;
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4)
+ {
+ libsais_prefetch(&SA[i + 2 * prefetch_distance]);
+
+ libsais_prefetch(&SAm[((sa_uint_t)SA[i + prefetch_distance + 0]) >> 1]);
+ libsais_prefetch(&SAm[((sa_uint_t)SA[i + prefetch_distance + 1]) >> 1]);
+ libsais_prefetch(&SAm[((sa_uint_t)SA[i + prefetch_distance + 2]) >> 1]);
+ libsais_prefetch(&SAm[((sa_uint_t)SA[i + prefetch_distance + 3]) >> 1]);
+
+ f0 += SAm[((sa_uint_t)SA[i + 0]) >> 1] < 0;
+ f1 += SAm[((sa_uint_t)SA[i + 1]) >> 1] < 0;
+ f2 += SAm[((sa_uint_t)SA[i + 2]) >> 1] < 0;
+ f3 += SAm[((sa_uint_t)SA[i + 3]) >> 1] < 0;
+ }
+
+ for (j += prefetch_distance + 3; i < j; i += 1)
+ {
+ f0 += SAm[((sa_uint_t)SA[i]) >> 1] < 0;
+ }
+
+ return f0 + f1 + f2 + f3;
+}
+
+#endif
+
+/* Parallel wrapper for the renumber pass. Serial path runs the whole range
+   with f = 0. Parallel path: each thread first counts the unique suffixes in
+   its slice, then (after a barrier) renumbers starting from the prefix sum of
+   the earlier threads' counts; the last thread also computes the total f.
+   Returns the total number of unique LMS suffixes. */
+static sa_sint_t libsais_renumber_unique_and_nonunique_lms_suffixes_32s_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+ sa_sint_t f = 0;
+
+#if defined(_OPENMP)
+ #pragma omp parallel num_threads(threads) if(threads > 1 && m >= 65536)
+#endif
+ {
+#if defined(_OPENMP)
+ fast_sint_t omp_thread_num = omp_get_thread_num();
+ fast_sint_t omp_num_threads = omp_get_num_threads();
+#else
+ UNUSED(threads); UNUSED(thread_state);
+
+ fast_sint_t omp_thread_num = 0;
+ fast_sint_t omp_num_threads = 1;
+#endif
+ fast_sint_t omp_block_stride = (m / omp_num_threads) & (-16);
+ fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+ fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : m - omp_block_start;
+
+ if (omp_num_threads == 1)
+ {
+ f = libsais_renumber_unique_and_nonunique_lms_suffixes_32s(T, SA, m, 0, omp_block_start, omp_block_size);
+ }
+#if defined(_OPENMP)
+ else
+ {
+ {
+ thread_state[omp_thread_num].state.count = libsais_count_unique_suffixes(SA, m, omp_block_start, omp_block_size);
+ }
+
+ #pragma omp barrier
+
+ {
+ /* Prefix-sum of unique counts over preceding threads gives this
+    thread's starting f; writes to distinct name slots are disjoint. */
+ fast_sint_t t, count = 0; for (t = 0; t < omp_thread_num; ++t) { count += thread_state[t].state.count; }
+
+ if (omp_thread_num == omp_num_threads - 1)
+ {
+ f = (sa_sint_t)(count + thread_state[omp_thread_num].state.count);
+ }
+
+ libsais_renumber_unique_and_nonunique_lms_suffixes_32s(T, SA, m, (sa_sint_t)count, omp_block_start, omp_block_size);
+ }
+ }
+#endif
+ }
+
+ return f;
+}
+
+/* Parallel wrapper for the compaction pass over SA[m .. m + n/2). Serial path
+   compacts directly toward SA[m] (left/unique stream) and SA[n + fs]
+   (right/non-unique stream). Parallel path: each thread compacts its slice
+   in place using cursors anchored at its slice end, then the master stitches
+   the per-thread fragments into the two final contiguous streams with
+   right-to-left memcpy passes. Finally the f unique entries are moved to the
+   very end of the workspace at SA[n + fs - m]. */
+static void libsais_compact_unique_and_nonunique_lms_suffixes_32s_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs, sa_sint_t f, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+#if defined(_OPENMP)
+ #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 131072 && m < fs)
+#endif
+ {
+#if defined(_OPENMP)
+ fast_sint_t omp_thread_num = omp_get_thread_num();
+ fast_sint_t omp_num_threads = omp_get_num_threads();
+#else
+ UNUSED(threads); UNUSED(thread_state);
+
+ fast_sint_t omp_thread_num = 0;
+ fast_sint_t omp_num_threads = 1;
+#endif
+ fast_sint_t omp_block_stride = (((fast_sint_t)n >> 1) / omp_num_threads) & (-16);
+ fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+ fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : ((fast_sint_t)n >> 1) - omp_block_start;
+
+ if (omp_num_threads == 1)
+ {
+ fast_sint_t l = m, r = (fast_sint_t)n + (fast_sint_t)fs;
+ libsais_compact_unique_and_nonunique_lms_suffixes_32s(SA, m, &l, &r, omp_block_start, omp_block_size);
+ }
+#if defined(_OPENMP)
+ else
+ {
+ {
+ /* Cursors start at this thread's slice end; after the call,
+    state.position / state.count point at the first element of the
+    thread's unique / non-unique fragment respectively. */
+ thread_state[omp_thread_num].state.position = (fast_sint_t)m + ((fast_sint_t)n >> 1) + omp_block_start + omp_block_size;
+ thread_state[omp_thread_num].state.count = (fast_sint_t)m + omp_block_start + omp_block_size;
+
+ libsais_compact_unique_and_nonunique_lms_suffixes_32s(SA, m, &thread_state[omp_thread_num].state.position, &thread_state[omp_thread_num].state.count, omp_block_start, omp_block_size);
+ }
+
+ #pragma omp barrier
+
+ #pragma omp master
+ {
+ fast_sint_t t, position;
+
+ /* Stitch the unique-name fragments into a contiguous run ending at
+    SA[m], walking threads from last to first. */
+ for (position = m, t = omp_num_threads - 1; t >= 0; --t)
+ {
+ fast_sint_t omp_block_end = t < omp_num_threads - 1 ? omp_block_stride * (t + 1) : ((fast_sint_t)n >> 1);
+ fast_sint_t count = ((fast_sint_t)m + ((fast_sint_t)n >> 1) + omp_block_end - thread_state[t].state.position);
+
+ if (count > 0)
+ {
+ position -= count; memcpy(&SA[position], &SA[thread_state[t].state.position], (size_t)count * sizeof(sa_sint_t));
+ }
+ }
+
+ /* Stitch the non-unique fragments into a run ending at SA[n + fs]. */
+ for (position = (fast_sint_t)n + (fast_sint_t)fs, t = omp_num_threads - 1; t >= 0; --t)
+ {
+ fast_sint_t omp_block_end = t < omp_num_threads - 1 ? omp_block_stride * (t + 1) : ((fast_sint_t)n >> 1);
+ fast_sint_t count = ((fast_sint_t)m + omp_block_end - thread_state[t].state.count);
+
+ if (count > 0)
+ {
+ position -= count; memcpy(&SA[position], &SA[thread_state[t].state.count], (size_t)count * sizeof(sa_sint_t));
+ }
+ }
+ }
+ }
+#endif
+ }
+
+ memcpy(&SA[(fast_sint_t)n + (fast_sint_t)fs - (fast_sint_t)m], &SA[(fast_sint_t)m - (fast_sint_t)f], (size_t)f * sizeof(sa_sint_t));
+}
+
+/* Convenience wrapper: renumber LMS suffixes (returning the count f of unique
+   ones) and then compact the unique/non-unique streams into place. */
+static sa_sint_t libsais_compact_lms_suffixes_32s_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+ sa_sint_t f = libsais_renumber_unique_and_nonunique_lms_suffixes_32s_omp(T, SA, m, threads, thread_state);
+ libsais_compact_unique_and_nonunique_lms_suffixes_32s_omp(SA, n, m, fs, f, threads, thread_state);
+
+ return f;
+}
+
+/* Merges unique LMS suffixes back into SA. Scans T for sign-tagged positions
+   (set during renumbering), clears the tag, and writes each position into SA
+   at the slot given by the next value in the stash at SA[n - m - 1 + l].
+   Note the in-loop `i++` after a hit combined with the unroll-by-4 stride:
+   consecutive tagged positions are handled by skipping — LMS positions are
+   never adjacent, so this presumably cannot drop entries; TODO confirm. */
+static void libsais_merge_unique_lms_suffixes_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, fast_sint_t l, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ const sa_sint_t * RESTRICT SAnm = &SA[(fast_sint_t)n - (fast_sint_t)m - 1 + l];
+
+ sa_sint_t i, j; fast_sint_t tmp = *SAnm++;
+ for (i = (sa_sint_t)omp_block_start, j = (sa_sint_t)omp_block_start + (sa_sint_t)omp_block_size - 6; i < j; i += 4)
+ {
+ libsais_prefetch(&T[i + prefetch_distance]);
+
+ sa_sint_t c0 = T[i + 0]; if (c0 < 0) { T[i + 0] = c0 & SAINT_MAX; SA[tmp] = i + 0; i++; tmp = *SAnm++; }
+ sa_sint_t c1 = T[i + 1]; if (c1 < 0) { T[i + 1] = c1 & SAINT_MAX; SA[tmp] = i + 1; i++; tmp = *SAnm++; }
+ sa_sint_t c2 = T[i + 2]; if (c2 < 0) { T[i + 2] = c2 & SAINT_MAX; SA[tmp] = i + 2; i++; tmp = *SAnm++; }
+ sa_sint_t c3 = T[i + 3]; if (c3 < 0) { T[i + 3] = c3 & SAINT_MAX; SA[tmp] = i + 3; i++; tmp = *SAnm++; }
+ }
+
+ for (j += 6; i < j; i += 1)
+ {
+ sa_sint_t c = T[i]; if (c < 0) { T[i] = c & SAINT_MAX; SA[tmp] = i; i++; tmp = *SAnm++; }
+ }
+}
+
+/* Fills the zero placeholders in SA[omp_block_start .. +omp_block_size) with
+   the non-unique LMS suffixes stored sequentially in the tail region
+   SA[n - m - 1 + l ...].  l is this thread's starting offset (f plus the
+   prefix sum of zero counts of the preceding blocks). */
+static void libsais_merge_nonunique_lms_suffixes_32s(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, fast_sint_t l, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+    const fast_sint_t prefetch_distance = 32;
+
+    /* Sequential reader over the stored suffixes for this block. */
+    const sa_sint_t * RESTRICT SAnm = &SA[(fast_sint_t)n - (fast_sint_t)m - 1 + l];
+
+    /* 4x unrolled scan; the "- 3" bound keeps SA[i + 0..3] in range. */
+    fast_sint_t i, j; sa_sint_t tmp = *SAnm++;
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - 3; i < j; i += 4)
+    {
+        libsais_prefetch(&SA[i + prefetch_distance]);
+
+        if (SA[i + 0] == 0) { SA[i + 0] = tmp; tmp = *SAnm++; }
+        if (SA[i + 1] == 0) { SA[i + 1] = tmp; tmp = *SAnm++; }
+        if (SA[i + 2] == 0) { SA[i + 2] = tmp; tmp = *SAnm++; }
+        if (SA[i + 3] == 0) { SA[i + 3] = tmp; tmp = *SAnm++; }
+    }
+
+    /* Scalar tail. */
+    for (j += 3; i < j; i += 1)
+    {
+        if (SA[i] == 0) { SA[i] = tmp; tmp = *SAnm++; }
+    }
+}
+
+/* Parallel driver for libsais_merge_unique_lms_suffixes_32s.  Each thread
+   first counts the negative-marked suffixes in its block, then (after a
+   barrier) computes its starting offset l as the prefix sum of the counts of
+   all preceding threads and performs the merge on its own block. */
+static void libsais_merge_unique_lms_suffixes_32s_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+#if defined(_OPENMP)
+    #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536)
+#endif
+    {
+#if defined(_OPENMP)
+        fast_sint_t omp_thread_num = omp_get_thread_num();
+        fast_sint_t omp_num_threads = omp_get_num_threads();
+#else
+        UNUSED(threads); UNUSED(thread_state);
+
+        fast_sint_t omp_thread_num = 0;
+        fast_sint_t omp_num_threads = 1;
+#endif
+        /* Blocks are 16-aligned; the last thread absorbs the remainder. */
+        fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
+        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;
+
+        if (omp_num_threads == 1)
+        {
+            /* Single thread: merge the whole range with offset 0. */
+            libsais_merge_unique_lms_suffixes_32s(T, SA, n, m, 0, omp_block_start, omp_block_size);
+        }
+#if defined(_OPENMP)
+        else
+        {
+            /* Phase 1: per-thread count of marked (negative) suffixes. */
+            {
+                thread_state[omp_thread_num].state.count = libsais_count_negative_marked_suffixes(T, omp_block_start, omp_block_size);
+            }
+
+            #pragma omp barrier
+
+            /* Phase 2: prefix-sum the counts to get this thread's offset. */
+            {
+                fast_sint_t t, count = 0; for (t = 0; t < omp_thread_num; ++t) { count += thread_state[t].state.count; }
+
+                libsais_merge_unique_lms_suffixes_32s(T, SA, n, m, count, omp_block_start, omp_block_size);
+            }
+        }
+#endif
+    }
+}
+
+/* Parallel driver for libsais_merge_nonunique_lms_suffixes_32s.  Same
+   count / barrier / prefix-sum scheme as the unique-suffix variant, but the
+   offsets start at f (the unique-suffix count) and the counted items are the
+   zero placeholders in SA. */
+static void libsais_merge_nonunique_lms_suffixes_32s_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t f, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+#if defined(_OPENMP)
+    #pragma omp parallel num_threads(threads) if(threads > 1 && m >= 65536)
+#endif
+    {
+#if defined(_OPENMP)
+        fast_sint_t omp_thread_num = omp_get_thread_num();
+        fast_sint_t omp_num_threads = omp_get_num_threads();
+#else
+        UNUSED(threads); UNUSED(thread_state);
+
+        fast_sint_t omp_thread_num = 0;
+        fast_sint_t omp_num_threads = 1;
+#endif
+        /* Note: the partitioned range is [0, m), not [0, n). */
+        fast_sint_t omp_block_stride = (m / omp_num_threads) & (-16);
+        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : m - omp_block_start;
+
+        if (omp_num_threads == 1)
+        {
+            libsais_merge_nonunique_lms_suffixes_32s(SA, n, m, f, omp_block_start, omp_block_size);
+        }
+#if defined(_OPENMP)
+        else
+        {
+            /* Phase 1: per-thread count of zero placeholders. */
+            {
+                thread_state[omp_thread_num].state.count = libsais_count_zero_marked_suffixes(SA, omp_block_start, omp_block_size);
+            }
+
+            #pragma omp barrier
+
+            /* Phase 2: offset = f + prefix sum of preceding counts. */
+            {
+                fast_sint_t t, count = f; for (t = 0; t < omp_thread_num; ++t) { count += thread_state[t].state.count; }
+
+                libsais_merge_nonunique_lms_suffixes_32s(SA, n, m, count, omp_block_start, omp_block_size);
+            }
+        }
+#endif
+    }
+}
+
+/* Merges compacted LMS suffixes back into SA: first the unique suffixes
+   (addressed via the mark bits in T), then the non-unique ones (filling the
+   remaining zero placeholders).  The order matters: the non-unique pass
+   relies on the placeholders left untouched by the unique pass. */
+static void libsais_merge_compacted_lms_suffixes_32s_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t f, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+    libsais_merge_unique_lms_suffixes_32s_omp(T, SA, n, m, threads, thread_state);
+    libsais_merge_nonunique_lms_suffixes_32s_omp(SA, n, m, f, threads, thread_state);
+}
+
+/* Reconstructs the LMS suffix order after the recursive call, 2k-bucket
+   variant.  When f > 0 the f unique suffixes were kept compacted at
+   SA[n + fs - m ..]; they are moved back, the non-unique part is re-gathered
+   and reconstructed, and the two parts are merged.  When f == 0 no
+   compaction happened and a plain gather + reconstruct suffices. */
+static void libsais_reconstruct_compacted_lms_suffixes_32s_2k_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m, sa_sint_t fs, sa_sint_t f, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+    if (f > 0)
+    {
+        /* Move the f unique suffixes into the tail working area
+           (memmove: the regions may overlap). */
+        memmove(&SA[n - m - 1], &SA[n + fs - m], (size_t)f * sizeof(sa_sint_t));
+
+        libsais_count_and_gather_compacted_lms_suffixes_32s_2k_omp(T, SA, n, k, buckets, threads, thread_state);
+        libsais_reconstruct_lms_suffixes_omp(SA, n, m - f, threads);
+
+        /* Append the reconstructed non-unique suffixes after the unique
+           ones, then zero SA[0..m) as placeholders for the merge. */
+        memcpy(&SA[n - m - 1 + f], &SA[0], ((size_t)m - (size_t)f) * sizeof(sa_sint_t));
+        memset(&SA[0], 0, (size_t)m * sizeof(sa_sint_t));
+
+        libsais_merge_compacted_lms_suffixes_32s_omp(T, SA, n, m, f, threads, thread_state);
+    }
+    else
+    {
+        libsais_count_and_gather_lms_suffixes_32s_2k(T, SA, n, k, buckets, 0, n);
+        libsais_reconstruct_lms_suffixes_omp(SA, n, m, threads);
+    }
+}
+
+/* Reconstructs the LMS suffix order after the recursive call, 1k (bucket-less
+   gather) variant.  Mirrors the 2k variant but uses the plain gather helpers
+   that do not need a bucket table. */
+static void libsais_reconstruct_compacted_lms_suffixes_32s_1k_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs, sa_sint_t f, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+    if (f > 0)
+    {
+        /* Move the f unique suffixes into the tail working area
+           (memmove: the regions may overlap). */
+        memmove(&SA[n - m - 1], &SA[n + fs - m], (size_t)f * sizeof(sa_sint_t));
+
+        libsais_gather_compacted_lms_suffixes_32s(T, SA, n);
+        libsais_reconstruct_lms_suffixes_omp(SA, n, m - f, threads);
+
+        /* Append reconstructed non-unique suffixes, zero the merge area. */
+        memcpy(&SA[n - m - 1 + f], &SA[0], ((size_t)m - (size_t)f) * sizeof(sa_sint_t));
+        memset(&SA[0], 0, (size_t)m * sizeof(sa_sint_t));
+
+        libsais_merge_compacted_lms_suffixes_32s_omp(T, SA, n, m, f, threads, thread_state);
+    }
+    else
+    {
+        libsais_gather_lms_suffixes_32s(T, SA, n);
+        libsais_reconstruct_lms_suffixes_omp(SA, n, m, threads);
+    }
+}
+
+/* Core recursive suffix-array construction for 32-bit alphabets (SA-IS).
+   T is the text of length n over alphabet [0, k); fs is the extra free space
+   available after SA[n].  The amount of free space selects one of four
+   strategies, trading bucket-table size (6k / 4k / 2k / 1k counters) for
+   speed.  Returns 0 on success, -2 on allocation failure.
+   NOTE(review): the exact SA memory layout (buckets carved from the tail of
+   SA, LMS data compacted around them) is load-bearing throughout; statement
+   order must not be changed. */
+static sa_sint_t libsais_main_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t fs, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+    if (k > 0 && fs / k >= 6)
+    {
+        /* 6k strategy: bucket table of 6k counters at the end of the free
+           space, aligned to 1024 (or 16) entries when room permits. */
+        sa_sint_t alignment = (fs - 1024) / k >= 6 ? 1024 : 16;
+        sa_sint_t * RESTRICT buckets = (fs - alignment) / k >= 6 ? (sa_sint_t *)libsais_align_up(&SA[n + fs - 6 * k - alignment], (size_t)alignment * sizeof(sa_sint_t)) : &SA[n + fs - 6 * k];
+
+        sa_sint_t m = libsais_count_and_gather_lms_suffixes_32s_4k_omp(T, SA, n, k, buckets, threads, thread_state);
+        if (m > 1)
+        {
+            /* Clear everything before the gathered LMS suffixes. */
+            memset(SA, 0, ((size_t)n - (size_t)m) * sizeof(sa_sint_t));
+
+            sa_sint_t first_lms_suffix = SA[n - m];
+            sa_sint_t left_suffixes_count = libsais_initialize_buckets_for_lms_suffixes_radix_sort_32s_6k(T, k, buckets, first_lms_suffix);
+
+            libsais_radix_sort_lms_suffixes_32s_6k_omp(T, SA, n, m, &buckets[4 * k], threads, thread_state);
+            libsais_radix_sort_set_markers_32s_6k_omp(SA, k, &buckets[4 * k], threads);
+
+            /* Multi-threaded radix sort leaves stale data in the tail. */
+            if (threads > 1 && n >= 65536) { memset(&SA[(fast_sint_t)n - (fast_sint_t)m], 0, (size_t)m * sizeof(sa_sint_t)); }
+
+            libsais_initialize_buckets_for_partial_sorting_32s_6k(T, k, buckets, first_lms_suffix, left_suffixes_count);
+            libsais_induce_partial_order_32s_6k_omp(T, SA, n, k, buckets, first_lms_suffix, left_suffixes_count, threads, thread_state);
+
+            sa_sint_t names = libsais_renumber_and_mark_distinct_lms_suffixes_32s_4k_omp(SA, n, m, threads, thread_state);
+            if (names < m)
+            {
+                /* Names are not yet unique: compact and recurse on the
+                   reduced problem of size m - f with alphabet names - f. */
+                sa_sint_t f = libsais_compact_lms_suffixes_32s_omp(T, SA, n, m, fs, threads, thread_state);
+
+                if (libsais_main_32s(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f, threads, thread_state) != 0)
+                {
+                    return -2;
+                }
+
+                libsais_reconstruct_compacted_lms_suffixes_32s_2k_omp(T, SA, n, k, m, fs, f, buckets, threads, thread_state);
+            }
+            else
+            {
+                /* All names unique: LMS order is already final; just refresh
+                   the bucket counts for the final induction. */
+                libsais_count_lms_suffixes_32s_2k(T, n, k, buckets);
+            }
+
+            libsais_initialize_buckets_start_and_end_32s_4k(k, buckets);
+            libsais_place_lms_suffixes_histogram_32s_4k(SA, n, k, m, buckets);
+            libsais_induce_final_order_32s_4k(T, SA, n, k, buckets, threads, thread_state);
+        }
+        else
+        {
+            /* At most one LMS suffix: it is trivially sorted. */
+            SA[0] = SA[n - 1];
+
+            libsais_initialize_buckets_start_and_end_32s_6k(k, buckets);
+            libsais_place_lms_suffixes_histogram_32s_6k(SA, n, k, m, buckets);
+            libsais_induce_final_order_32s_6k(T, SA, n, k, buckets, threads, thread_state);
+        }
+
+        return 0;
+    }
+    else if (k > 0 && fs / k >= 4)
+    {
+        /* 4k strategy. */
+        sa_sint_t alignment = (fs - 1024) / k >= 4 ? 1024 : 16;
+        sa_sint_t * RESTRICT buckets = (fs - alignment) / k >= 4 ? (sa_sint_t *)libsais_align_up(&SA[n + fs - 4 * k - alignment], (size_t)alignment * sizeof(sa_sint_t)) : &SA[n + fs - 4 * k];
+
+        sa_sint_t m = libsais_count_and_gather_lms_suffixes_32s_2k_omp(T, SA, n, k, buckets, threads, thread_state);
+        if (m > 1)
+        {
+            libsais_initialize_buckets_for_radix_and_partial_sorting_32s_4k(T, k, buckets, SA[n - m]);
+
+            libsais_radix_sort_lms_suffixes_32s_2k_omp(T, SA, n, m, &buckets[1], threads, thread_state);
+            libsais_radix_sort_set_markers_32s_4k_omp(SA, k, &buckets[1], threads);
+
+            libsais_place_lms_suffixes_interval_32s_4k(SA, n, k, m - 1, buckets);
+            libsais_induce_partial_order_32s_4k_omp(T, SA, n, k, buckets, threads, thread_state);
+
+            sa_sint_t names = libsais_renumber_and_mark_distinct_lms_suffixes_32s_4k_omp(SA, n, m, threads, thread_state);
+            if (names < m)
+            {
+                /* Compact and recurse on the reduced problem. */
+                sa_sint_t f = libsais_compact_lms_suffixes_32s_omp(T, SA, n, m, fs, threads, thread_state);
+
+                if (libsais_main_32s(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f, threads, thread_state) != 0)
+                {
+                    return -2;
+                }
+
+                libsais_reconstruct_compacted_lms_suffixes_32s_2k_omp(T, SA, n, k, m, fs, f, buckets, threads, thread_state);
+            }
+            else
+            {
+                libsais_count_lms_suffixes_32s_2k(T, n, k, buckets);
+            }
+        }
+        else
+        {
+            SA[0] = SA[n - 1];
+        }
+
+        libsais_initialize_buckets_start_and_end_32s_4k(k, buckets);
+        libsais_place_lms_suffixes_histogram_32s_4k(SA, n, k, m, buckets);
+        libsais_induce_final_order_32s_4k(T, SA, n, k, buckets, threads, thread_state);
+
+        return 0;
+    }
+    else if (k > 0 && fs / k >= 2)
+    {
+        /* 2k strategy. */
+        sa_sint_t alignment = (fs - 1024) / k >= 2 ? 1024 : 16;
+        sa_sint_t * RESTRICT buckets = (fs - alignment) / k >= 2 ? (sa_sint_t *)libsais_align_up(&SA[n + fs - 2 * k - alignment], (size_t)alignment * sizeof(sa_sint_t)) : &SA[n + fs - 2 * k];
+
+        sa_sint_t m = libsais_count_and_gather_lms_suffixes_32s_2k_omp(T, SA, n, k, buckets, threads, thread_state);
+        if (m > 1)
+        {
+            libsais_initialize_buckets_for_lms_suffixes_radix_sort_32s_2k(T, k, buckets, SA[n - m]);
+
+            libsais_radix_sort_lms_suffixes_32s_2k_omp(T, SA, n, m, &buckets[1], threads, thread_state);
+            libsais_place_lms_suffixes_interval_32s_2k(SA, n, k, m - 1, buckets);
+
+            libsais_initialize_buckets_start_and_end_32s_2k(k, buckets);
+            libsais_induce_partial_order_32s_2k_omp(T, SA, n, k, buckets, threads, thread_state);
+
+            sa_sint_t names = libsais_renumber_and_mark_distinct_lms_suffixes_32s_1k_omp(T, SA, n, m, threads);
+            if (names < m)
+            {
+                /* Compact and recurse on the reduced problem. */
+                sa_sint_t f = libsais_compact_lms_suffixes_32s_omp(T, SA, n, m, fs, threads, thread_state);
+
+                if (libsais_main_32s(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f, threads, thread_state) != 0)
+                {
+                    return -2;
+                }
+
+                libsais_reconstruct_compacted_lms_suffixes_32s_2k_omp(T, SA, n, k, m, fs, f, buckets, threads, thread_state);
+            }
+            else
+            {
+                libsais_count_lms_suffixes_32s_2k(T, n, k, buckets);
+            }
+        }
+        else
+        {
+            SA[0] = SA[n - 1];
+        }
+
+        libsais_initialize_buckets_end_32s_2k(k, buckets);
+        libsais_place_lms_suffixes_histogram_32s_2k(SA, n, k, m, buckets);
+
+        libsais_initialize_buckets_start_and_end_32s_2k(k, buckets);
+        libsais_induce_final_order_32s_2k(T, SA, n, k, buckets, threads, thread_state);
+
+        return 0;
+    }
+    else
+    {
+        /* 1k strategy: not enough free space for even 2k counters; a separate
+           heap allocation is used for the bucket table when fs < k. */
+        sa_sint_t * buffer = fs < k ? (sa_sint_t *)libsais_alloc_aligned((size_t)k * sizeof(sa_sint_t), 4096) : (sa_sint_t *)NULL;
+
+        sa_sint_t alignment = fs - 1024 >= k ? 1024 : 16;
+        sa_sint_t * RESTRICT buckets = fs - alignment >= k ? (sa_sint_t *)libsais_align_up(&SA[n + fs - k - alignment], (size_t)alignment * sizeof(sa_sint_t)) : fs >= k ? &SA[n + fs - k] : buffer;
+
+        if (buckets == NULL) { return -2; }
+
+        memset(SA, 0, (size_t)n * sizeof(sa_sint_t));
+
+        libsais_count_suffixes_32s(T, n, k, buckets);
+        libsais_initialize_buckets_end_32s_1k(k, buckets);
+
+        sa_sint_t m = libsais_radix_sort_lms_suffixes_32s_1k(T, SA, n, buckets);
+        if (m > 1)
+        {
+            libsais_induce_partial_order_32s_1k_omp(T, SA, n, k, buckets, threads, thread_state);
+
+            sa_sint_t names = libsais_renumber_and_mark_distinct_lms_suffixes_32s_1k_omp(T, SA, n, m, threads);
+            if (names < m)
+            {
+                /* Release the heap table before the recursion: the recursion
+                   may need the memory, and the table is rebuilt afterwards. */
+                if (buffer != NULL) { libsais_free_aligned(buffer); buckets = NULL; }
+
+                sa_sint_t f = libsais_compact_lms_suffixes_32s_omp(T, SA, n, m, fs, threads, thread_state);
+
+                if (libsais_main_32s(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f, threads, thread_state) != 0)
+                {
+                    return -2;
+                }
+
+                libsais_reconstruct_compacted_lms_suffixes_32s_1k_omp(T, SA, n, m, fs, f, threads, thread_state);
+
+                /* Re-allocate the heap table if it was released above. */
+                if (buckets == NULL) { buckets = buffer = (sa_sint_t *)libsais_alloc_aligned((size_t)k * sizeof(sa_sint_t), 4096); }
+                if (buckets == NULL) { return -2; }
+            }
+
+            libsais_count_suffixes_32s(T, n, k, buckets);
+            libsais_initialize_buckets_end_32s_1k(k, buckets);
+            libsais_place_lms_suffixes_interval_32s_1k(T, SA, k, m, buckets);
+        }
+
+        libsais_induce_final_order_32s_1k(T, SA, n, k, buckets, threads, thread_state);
+        libsais_free_aligned(buffer);
+
+        return 0;
+    }
+}
+
+/* Entry point for 32-bit-alphabet construction: allocates per-thread scratch
+   state when running multi-threaded, runs the core algorithm, and releases
+   the state.  Returns the core result, or -2 if the state allocation fails. */
+int32_t libsais_main_32s_internal(int32_t * T, int32_t * SA, int32_t n, int32_t k, int32_t fs, int32_t threads)
+{
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state = NULL;
+    if (threads > 1) { thread_state = libsais_alloc_thread_state(threads); }
+
+    sa_sint_t index = -2;
+    if (thread_state != NULL || threads == 1)
+    {
+        index = libsais_main_32s(T, SA, n, k, fs, threads, thread_state);
+    }
+
+    libsais_free_thread_state(thread_state);
+
+    return index;
+}
+
+/* Suffix-array construction for 8-bit input T of length n.  buckets is a
+   preallocated 8 * ALPHABET_SIZE table.  bwt selects BWT output mode, r / I
+   carry the auxiliary-index parameters, freq optionally receives symbol
+   frequencies.  Returns the result of the final induction (index / 0 on
+   success, negative on failure). */
+static sa_sint_t libsais_main_8u(const uint8_t * T, sa_sint_t * SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t bwt, sa_sint_t r, sa_sint_t * RESTRICT I, sa_sint_t fs, sa_sint_t * freq, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+    sa_sint_t m = libsais_count_and_gather_lms_suffixes_8u_omp(T, SA, n, buckets, threads, thread_state);
+
+    libsais_initialize_buckets_start_and_end_8u(buckets, freq);
+
+    if (m > 0)
+    {
+        sa_sint_t first_lms_suffix = SA[n - m];
+        sa_sint_t left_suffixes_count = libsais_initialize_buckets_for_lms_suffixes_radix_sort_8u(T, buckets, first_lms_suffix);
+
+        /* Multi-threaded radix sort requires a zeroed work area before and
+           a cleaned tail after. */
+        if (threads > 1 && n >= 65536) { memset(SA, 0, ((size_t)n - (size_t)m) * sizeof(sa_sint_t)); }
+        libsais_radix_sort_lms_suffixes_8u_omp(T, SA, n, m, buckets, threads, thread_state);
+        if (threads > 1 && n >= 65536) { memset(&SA[(fast_sint_t)n - (fast_sint_t)m], 0, (size_t)m * sizeof(sa_sint_t)); }
+
+        libsais_initialize_buckets_for_partial_sorting_8u(T, buckets, first_lms_suffix, left_suffixes_count);
+        libsais_induce_partial_order_8u_omp(T, SA, n, buckets, first_lms_suffix, left_suffixes_count, threads, thread_state);
+
+        sa_sint_t names = libsais_renumber_and_gather_lms_suffixes_8u_omp(SA, n, m, fs, threads, thread_state);
+        if (names < m)
+        {
+            /* LMS names not unique: recurse on the reduced 32-bit problem
+               placed in the tail of SA. */
+            if (libsais_main_32s(SA + n + fs - m, SA, m, names, fs + n - 2 * m, threads, thread_state) != 0)
+            {
+                return -2;
+            }
+
+            libsais_gather_lms_suffixes_8u_omp(T, SA, n, threads, thread_state);
+            libsais_reconstruct_lms_suffixes_omp(SA, n, m, threads);
+        }
+
+        libsais_place_lms_suffixes_interval_8u(SA, n, m, buckets);
+    }
+    else
+    {
+        /* No LMS suffixes (e.g. non-increasing text): induction starts from
+           a cleared SA. */
+        memset(SA, 0, (size_t)n * sizeof(sa_sint_t));
+    }
+
+    return libsais_induce_final_order_8u_omp(T, SA, n, bwt, r, I, buckets, threads, thread_state);
+}
+
+/* Allocating wrapper around libsais_main_8u: provides the bucket table and,
+   for multi-threaded runs, the per-thread scratch state.  Returns -2 when
+   either allocation fails; otherwise forwards the core result. */
+static sa_sint_t libsais_main(const uint8_t * T, sa_sint_t * SA, sa_sint_t n, sa_sint_t bwt, sa_sint_t r, sa_sint_t * I, sa_sint_t fs, sa_sint_t * freq, sa_sint_t threads)
+{
+    /* Single-threaded runs need no per-thread scratch state. */
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state = NULL;
+    if (threads > 1) { thread_state = libsais_alloc_thread_state(threads); }
+
+    sa_sint_t * RESTRICT buckets = (sa_sint_t *)libsais_alloc_aligned(8 * ALPHABET_SIZE * sizeof(sa_sint_t), 4096);
+
+    sa_sint_t index = -2;
+    if (buckets != NULL && (thread_state != NULL || threads == 1))
+    {
+        index = libsais_main_8u(T, SA, n, buckets, bwt, r, I, fs, freq, threads, thread_state);
+    }
+
+    libsais_free_aligned(buckets);
+    libsais_free_thread_state(thread_state);
+
+    return index;
+}
+
+/* Context-based wrapper around libsais_main_8u: reuses the bucket table and
+   thread state owned by ctx instead of allocating.  Returns -2 when the
+   context is missing or incompletely initialized. */
+static sa_sint_t libsais_main_ctx(const LIBSAIS_CONTEXT * ctx, const uint8_t * T, sa_sint_t * SA, sa_sint_t n, sa_sint_t bwt, sa_sint_t r, sa_sint_t * I, sa_sint_t fs, sa_sint_t * freq)
+{
+    if (ctx == NULL) { return -2; }
+    if (ctx->buckets == NULL) { return -2; }
+    if (ctx->thread_state == NULL && ctx->threads != 1) { return -2; }
+
+    return libsais_main_8u(T, SA, n, ctx->buckets, bwt, r, I, fs, freq, (sa_sint_t)ctx->threads, ctx->thread_state);
+}
+
+/* Narrows n 32-bit values from A into the 8-bit output U (used to emit BWT
+   symbols).  The main loop is unrolled 8x with prefetching; the tail loop
+   handles the remaining 0..7 elements. */
+static void libsais_bwt_copy_8u(uint8_t * RESTRICT U, sa_sint_t * RESTRICT A, sa_sint_t n)
+{
+    const fast_sint_t prefetch_distance = 32;
+
+    fast_sint_t i, j;
+    for (i = 0, j = (fast_sint_t)n - 7; i < j; i += 8)
+    {
+        libsais_prefetch(&A[i + prefetch_distance]);
+
+        U[i + 0] = (uint8_t)A[i + 0];
+        U[i + 1] = (uint8_t)A[i + 1];
+        U[i + 2] = (uint8_t)A[i + 2];
+        U[i + 3] = (uint8_t)A[i + 3];
+        U[i + 4] = (uint8_t)A[i + 4];
+        U[i + 5] = (uint8_t)A[i + 5];
+        U[i + 6] = (uint8_t)A[i + 6];
+        U[i + 7] = (uint8_t)A[i + 7];
+    }
+
+    /* Scalar tail. */
+    for (j += 7; i < j; i += 1)
+    {
+        U[i] = (uint8_t)A[i];
+    }
+}
+
+#if defined(_OPENMP)
+
+/* Parallel variant of libsais_bwt_copy_8u: splits [0, n) into 16-aligned
+   per-thread blocks (last thread takes the remainder) and copies each block
+   independently.  Falls back to a single full-range copy without OpenMP. */
+static void libsais_bwt_copy_8u_omp(uint8_t * RESTRICT U, sa_sint_t * RESTRICT A, sa_sint_t n, sa_sint_t threads)
+{
+#if defined(_OPENMP)
+    #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536)
+#endif
+    {
+#if defined(_OPENMP)
+        fast_sint_t omp_thread_num = omp_get_thread_num();
+        fast_sint_t omp_num_threads = omp_get_num_threads();
+        fast_sint_t omp_block_stride = ((fast_sint_t)n / omp_num_threads) & (-16);
+        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : (fast_sint_t)n - omp_block_start;
+#else
+        UNUSED(threads);
+
+        fast_sint_t omp_block_start = 0;
+        fast_sint_t omp_block_size = (fast_sint_t)n;
+#endif
+
+        libsais_bwt_copy_8u(U + omp_block_start, A + omp_block_start, (sa_sint_t)omp_block_size);
+    }
+}
+
+#endif
+
+/* Creates a single-threaded reusable construction context (opaque handle).
+   Returns NULL on allocation failure. */
+void * libsais_create_ctx(void)
+{
+    return (void *)libsais_create_ctx_main(1);
+}
+
+/* Destroys a context created by libsais_create_ctx / libsais_create_ctx_omp.
+   Safe to call with NULL (the main free routine handles it). */
+void libsais_free_ctx(void * ctx)
+{
+    libsais_free_ctx_main((LIBSAIS_CONTEXT *)ctx);
+}
+
+/* Public API: builds the suffix array of T[0..n) into SA using fs extra
+   cells of free space after SA[n]; freq (optional) receives the symbol
+   frequencies.  Returns 0 on success, -1 on bad arguments, -2 on internal
+   allocation failure. */
+int32_t libsais(const uint8_t * T, int32_t * SA, int32_t n, int32_t fs, int32_t * freq)
+{
+    /* Reject null pointers and negative sizes. */
+    if (T == NULL || SA == NULL) { return -1; }
+    if (n < 0 || fs < 0)         { return -1; }
+
+    /* Strings of length 0 or 1 have a trivial suffix array. */
+    if (n < 2)
+    {
+        if (n == 1) { SA[0] = 0; }
+        return 0;
+    }
+
+    return libsais_main(T, SA, n, 0, 0, NULL, fs, freq, 1);
+}
+
+/* Public API: same as libsais() but reuses the buffers owned by a context
+   created with libsais_create_ctx / libsais_create_ctx_omp.  Returns 0 on
+   success, -1 on bad arguments, -2 on internal failure. */
+int32_t libsais_ctx(const void * ctx, const uint8_t * T, int32_t * SA, int32_t n, int32_t fs, int32_t * freq)
+{
+    /* Reject missing context, null pointers, and negative sizes. */
+    if (ctx == NULL || T == NULL || SA == NULL) { return -1; }
+    if (n < 0 || fs < 0)                        { return -1; }
+
+    /* Strings of length 0 or 1 have a trivial suffix array. */
+    if (n < 2)
+    {
+        if (n == 1) { SA[0] = 0; }
+        return 0;
+    }
+
+    return libsais_main_ctx((const LIBSAIS_CONTEXT *)ctx, T, SA, n, 0, 0, NULL, fs, freq);
+}
+
+/* Public API: computes the BWT of T[0..n) into U, using A as scratch.
+   Returns the primary index (1-based position of the sentinel row) on
+   success, -1 on bad arguments, -2 on internal failure.  n <= 1 is returned
+   as-is (trivial BWT). */
+int32_t libsais_bwt(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq)
+{
+    if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0))
+    {
+        return -1;
+    }
+    else if (n <= 1)
+    {
+        if (n == 1) { U[0] = T[0]; }
+        return n;
+    }
+
+    /* bwt = 1 switches the core into BWT output mode; it returns the
+       0-based primary index. */
+    sa_sint_t index = libsais_main(T, A, n, 1, 0, NULL, fs, freq, 1);
+    if (index >= 0)
+    {
+        index++;
+
+        /* U[0] is the last text symbol; the primary row itself is skipped,
+           so the remaining output is copied in two pieces around it. */
+        U[0] = T[n - 1];
+        libsais_bwt_copy_8u(U + 1, A, index - 1);
+        libsais_bwt_copy_8u(U + index, A + index, n - index);
+    }
+
+    return index;
+}
+
+/* Public API: BWT with auxiliary indexes.  r must be a power of two >= 2;
+   I receives one index per r symbols (I[0] is the primary index).  Returns
+   0 on success, -1 on bad arguments, -2 on internal failure. */
+int32_t libsais_bwt_aux(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t r, int32_t * I)
+{
+    /* (r & (r - 1)) != 0 rejects non-powers-of-two. */
+    if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0) || (r < 2) || ((r & (r - 1)) != 0) || (I == NULL))
+    {
+        return -1;
+    }
+    else if (n <= 1)
+    {
+        if (n == 1) { U[0] = T[0]; }
+
+        I[0] = n;
+        return 0;
+    }
+
+    if (libsais_main(T, A, n, 1, r, I, fs, freq, 1) != 0)
+    {
+        return -2;
+    }
+
+    /* Copy in two pieces around the primary row (stored in I[0]). */
+    U[0] = T[n - 1];
+    libsais_bwt_copy_8u(U + 1, A, I[0] - 1);
+    libsais_bwt_copy_8u(U + I[0], A + I[0], n - I[0]);
+
+    return 0;
+}
+
+/* Public API: context-based BWT.  Same contract as libsais_bwt(), but reuses
+   ctx buffers and, when built with OpenMP, parallelizes the output copy with
+   the context's thread count. */
+int32_t libsais_bwt_ctx(const void * ctx, const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq)
+{
+    if ((ctx == NULL) || (T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0))
+    {
+        return -1;
+    }
+    else if (n <= 1)
+    {
+        if (n == 1) { U[0] = T[0]; }
+        return n;
+    }
+
+    sa_sint_t index = libsais_main_ctx((const LIBSAIS_CONTEXT *)ctx, T, A, n, 1, 0, NULL, fs, freq);
+    if (index >= 0)
+    {
+        index++;
+
+        U[0] = T[n - 1];
+
+        /* Copy in two pieces around the primary row. */
+#if defined(_OPENMP)
+        libsais_bwt_copy_8u_omp(U + 1, A, index - 1, (sa_sint_t)((const LIBSAIS_CONTEXT *)ctx)->threads);
+        libsais_bwt_copy_8u_omp(U + index, A + index, n - index, (sa_sint_t)((const LIBSAIS_CONTEXT *)ctx)->threads);
+#else
+        libsais_bwt_copy_8u(U + 1, A, index - 1);
+        libsais_bwt_copy_8u(U + index, A + index, n - index);
+#endif
+    }
+
+    return index;
+}
+
+/* Public API: context-based BWT with auxiliary indexes.  Same contract as
+   libsais_bwt_aux(), reusing ctx buffers and (with OpenMP) the context's
+   thread count for the output copy. */
+int32_t libsais_bwt_aux_ctx(const void * ctx, const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t r, int32_t * I)
+{
+    /* (r & (r - 1)) != 0 rejects non-powers-of-two. */
+    if ((ctx == NULL) || (T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0) || (r < 2) || ((r & (r - 1)) != 0) || (I == NULL))
+    {
+        return -1;
+    }
+    else if (n <= 1)
+    {
+        if (n == 1) { U[0] = T[0]; }
+
+        I[0] = n;
+        return 0;
+    }
+
+    if (libsais_main_ctx((const LIBSAIS_CONTEXT *)ctx, T, A, n, 1, r, I, fs, freq) != 0)
+    {
+        return -2;
+    }
+
+    U[0] = T[n - 1];
+
+    /* Copy in two pieces around the primary row (stored in I[0]). */
+#if defined(_OPENMP)
+    libsais_bwt_copy_8u_omp(U + 1, A, I[0] - 1, (sa_sint_t)((const LIBSAIS_CONTEXT *)ctx)->threads);
+    libsais_bwt_copy_8u_omp(U + I[0], A + I[0], n - I[0], (sa_sint_t)((const LIBSAIS_CONTEXT *)ctx)->threads);
+#else
+    libsais_bwt_copy_8u(U + 1, A, I[0] - 1);
+    libsais_bwt_copy_8u(U + I[0], A + I[0], n - I[0]);
+#endif
+
+    return 0;
+}
+
+#if defined(_OPENMP)
+
+/* Creates a reusable construction context for the given thread count.
+   threads == 0 means "use OpenMP's maximum"; negative counts are rejected.
+   Returns NULL on bad arguments or allocation failure. */
+void * libsais_create_ctx_omp(int32_t threads)
+{
+    if (threads < 0) { return NULL; }
+    if (threads == 0) { threads = omp_get_max_threads(); }
+
+    return (void *)libsais_create_ctx_main(threads);
+}
+
+/* Public API: multi-threaded suffix-array construction.  threads == 0 means
+   "use OpenMP's maximum".  Returns 0 on success, -1 on bad arguments, -2 on
+   internal failure. */
+int32_t libsais_omp(const uint8_t * T, int32_t * SA, int32_t n, int32_t fs, int32_t * freq, int32_t threads)
+{
+    /* Reject null pointers and negative sizes / thread counts. */
+    if (T == NULL || SA == NULL)            { return -1; }
+    if (n < 0 || fs < 0 || threads < 0)     { return -1; }
+
+    /* Strings of length 0 or 1 have a trivial suffix array. */
+    if (n < 2)
+    {
+        if (n == 1) { SA[0] = 0; }
+        return 0;
+    }
+
+    if (threads == 0) { threads = omp_get_max_threads(); }
+
+    return libsais_main(T, SA, n, 0, 0, NULL, fs, freq, threads);
+}
+
+/* Public API: multi-threaded BWT.  threads == 0 means "use OpenMP's
+   maximum".  Returns the primary index on success, -1 on bad arguments,
+   -2 on internal failure; n <= 1 is returned as-is. */
+int32_t libsais_bwt_omp(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t threads)
+{
+    if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0) || (threads < 0))
+    {
+        return -1;
+    }
+    else if (n <= 1)
+    {
+        if (n == 1) { U[0] = T[0]; }
+        return n;
+    }
+
+    threads = threads > 0 ? threads : omp_get_max_threads();
+
+    sa_sint_t index = libsais_main(T, A, n, 1, 0, NULL, fs, freq, threads);
+    if (index >= 0)
+    {
+        index++;
+
+        /* Copy in two pieces around the primary row. */
+        U[0] = T[n - 1];
+        libsais_bwt_copy_8u_omp(U + 1, A, index - 1, threads);
+        libsais_bwt_copy_8u_omp(U + index, A + index, n - index, threads);
+    }
+
+    return index;
+}
+
+/* Public API: multi-threaded BWT with auxiliary indexes.  r must be a power
+   of two >= 2; threads == 0 means "use OpenMP's maximum".  Returns 0 on
+   success, -1 on bad arguments, -2 on internal failure. */
+int32_t libsais_bwt_aux_omp(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t r, int32_t * I, int32_t threads)
+{
+    /* (r & (r - 1)) != 0 rejects non-powers-of-two. */
+    if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0) || (r < 2) || ((r & (r - 1)) != 0) || (I == NULL) || (threads < 0))
+    {
+        return -1;
+    }
+    else if (n <= 1)
+    {
+        if (n == 1) { U[0] = T[0];}
+
+        I[0] = n;
+        return 0;
+    }
+
+    threads = threads > 0 ? threads : omp_get_max_threads();
+
+    if (libsais_main(T, A, n, 1, r, I, fs, freq, threads) != 0)
+    {
+        return -2;
+    }
+
+    /* Copy in two pieces around the primary row (stored in I[0]). */
+    U[0] = T[n - 1];
+    libsais_bwt_copy_8u_omp(U + 1, A, I[0] - 1, threads);
+    libsais_bwt_copy_8u_omp(U + I[0], A + I[0], n - I[0], threads);
+
+    return 0;
+}
+
+#endif
+
+/* Allocates an inverse-BWT context: the bigram bucket table (256*256), the
+   fastbits lookup table, and (for multi-threaded use) per-thread bucket
+   storage.  On any allocation failure everything is released and NULL is
+   returned. */
+static LIBSAIS_UNBWT_CONTEXT * libsais_unbwt_create_ctx_main(sa_sint_t threads)
+{
+    LIBSAIS_UNBWT_CONTEXT * RESTRICT ctx = (LIBSAIS_UNBWT_CONTEXT *)libsais_alloc_aligned(sizeof(LIBSAIS_UNBWT_CONTEXT), 64);
+    sa_uint_t * RESTRICT bucket2 = (sa_uint_t *)libsais_alloc_aligned(ALPHABET_SIZE * ALPHABET_SIZE * sizeof(sa_uint_t), 4096);
+    uint16_t * RESTRICT fastbits = (uint16_t *)libsais_alloc_aligned((1 + (1 << UNBWT_FASTBITS)) * sizeof(uint16_t), 4096);
+    /* Per-thread buckets are only needed when threads > 1. */
+    sa_uint_t * RESTRICT buckets = threads > 1 ? (sa_uint_t *)libsais_alloc_aligned((size_t)threads * (ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE)) * sizeof(sa_uint_t), 4096) : NULL;
+
+    if (ctx != NULL && bucket2 != NULL && fastbits != NULL && (buckets != NULL || threads == 1))
+    {
+        ctx->bucket2 = bucket2;
+        ctx->fastbits = fastbits;
+        ctx->buckets = buckets;
+        ctx->threads = threads;
+
+        return ctx;
+    }
+
+    /* Partial failure: release whatever was allocated (free handles NULL). */
+    libsais_free_aligned(buckets);
+    libsais_free_aligned(fastbits);
+    libsais_free_aligned(bucket2);
+    libsais_free_aligned(ctx);
+
+    return NULL;
+}
+
+/* Releases an inverse-BWT context and all buffers it owns.
+   Accepts NULL as a no-op. */
+static void libsais_unbwt_free_ctx_main(LIBSAIS_UNBWT_CONTEXT * ctx)
+{
+    if (ctx == NULL) { return; }
+
+    /* Free owned buffers first, then the context itself. */
+    libsais_free_aligned(ctx->buckets);
+    libsais_free_aligned(ctx->fastbits);
+    libsais_free_aligned(ctx->bucket2);
+    libsais_free_aligned(ctx);
+}
+
+/* Accumulates the symbol histogram of T[0..n) into count[0..255].  For large
+   inputs it uses four interleaved sub-histograms to reduce increment
+   dependencies, aligns the pointer to 64 bytes, and then consumes the text
+   as 32-bit words, peeling one byte at a time.
+   NOTE(review): the word-sized reads assume the platform tolerates this
+   access pattern (little-endian byte peeling via shifts) — matches how the
+   rest of this file reads T. */
+static void libsais_unbwt_compute_histogram(const uint8_t * RESTRICT T, fast_sint_t n, sa_uint_t * RESTRICT count)
+{
+    const fast_sint_t prefetch_distance = 256;
+
+    const uint8_t * RESTRICT T_p = T;
+
+    if (n >= 1024)
+    {
+        /* Four padded sub-histograms (+16 pad fights cache-line aliasing). */
+        sa_uint_t copy[4 * (ALPHABET_SIZE + 16)];
+
+        memset(copy, 0, 4 * (ALPHABET_SIZE + 16) * sizeof(sa_uint_t));
+
+        sa_uint_t * RESTRICT copy0 = copy + 0 * (ALPHABET_SIZE + 16);
+        sa_uint_t * RESTRICT copy1 = copy + 1 * (ALPHABET_SIZE + 16);
+        sa_uint_t * RESTRICT copy2 = copy + 2 * (ALPHABET_SIZE + 16);
+        sa_uint_t * RESTRICT copy3 = copy + 3 * (ALPHABET_SIZE + 16);
+
+        /* Byte-at-a-time until T_p is 64-byte aligned. */
+        for (; T_p < (uint8_t * )((ptrdiff_t)(T + 63) & (-64)); T_p += 1) { copy0[T_p[0]]++; }
+
+        /* Software-pipelined main loop: words are loaded one iteration ahead
+           of being counted. */
+        fast_uint_t x = ((const uint32_t *)(const void *)T_p)[0], y = ((const uint32_t *)(const void *)T_p)[1];
+
+        for (; T_p < (uint8_t * )((ptrdiff_t)(T + n - 8) & (-64)); T_p += 64)
+        {
+            libsais_prefetch(&T_p[prefetch_distance]);
+
+            fast_uint_t z = ((const uint32_t *)(const void *)T_p)[2], w = ((const uint32_t *)(const void *)T_p)[3];
+            copy0[(uint8_t)x]++; x >>= 8; copy1[(uint8_t)x]++; x >>= 8; copy2[(uint8_t)x]++; x >>= 8; copy3[x]++;
+            copy0[(uint8_t)y]++; y >>= 8; copy1[(uint8_t)y]++; y >>= 8; copy2[(uint8_t)y]++; y >>= 8; copy3[y]++;
+
+            x = ((const uint32_t *)(const void *)T_p)[4]; y = ((const uint32_t *)(const void *)T_p)[5];
+            copy0[(uint8_t)z]++; z >>= 8; copy1[(uint8_t)z]++; z >>= 8; copy2[(uint8_t)z]++; z >>= 8; copy3[z]++;
+            copy0[(uint8_t)w]++; w >>= 8; copy1[(uint8_t)w]++; w >>= 8; copy2[(uint8_t)w]++; w >>= 8; copy3[w]++;
+
+            z = ((const uint32_t *)(const void *)T_p)[6]; w = ((const uint32_t *)(const void *)T_p)[7];
+            copy0[(uint8_t)x]++; x >>= 8; copy1[(uint8_t)x]++; x >>= 8; copy2[(uint8_t)x]++; x >>= 8; copy3[x]++;
+            copy0[(uint8_t)y]++; y >>= 8; copy1[(uint8_t)y]++; y >>= 8; copy2[(uint8_t)y]++; y >>= 8; copy3[y]++;
+
+            x = ((const uint32_t *)(const void *)T_p)[8]; y = ((const uint32_t *)(const void *)T_p)[9];
+            copy0[(uint8_t)z]++; z >>= 8; copy1[(uint8_t)z]++; z >>= 8; copy2[(uint8_t)z]++; z >>= 8; copy3[z]++;
+            copy0[(uint8_t)w]++; w >>= 8; copy1[(uint8_t)w]++; w >>= 8; copy2[(uint8_t)w]++; w >>= 8; copy3[w]++;
+
+            z = ((const uint32_t *)(const void *)T_p)[10]; w = ((const uint32_t *)(const void *)T_p)[11];
+            copy0[(uint8_t)x]++; x >>= 8; copy1[(uint8_t)x]++; x >>= 8; copy2[(uint8_t)x]++; x >>= 8; copy3[x]++;
+            copy0[(uint8_t)y]++; y >>= 8; copy1[(uint8_t)y]++; y >>= 8; copy2[(uint8_t)y]++; y >>= 8; copy3[y]++;
+
+            x = ((const uint32_t *)(const void *)T_p)[12]; y = ((const uint32_t *)(const void *)T_p)[13];
+            copy0[(uint8_t)z]++; z >>= 8; copy1[(uint8_t)z]++; z >>= 8; copy2[(uint8_t)z]++; z >>= 8; copy3[z]++;
+            copy0[(uint8_t)w]++; w >>= 8; copy1[(uint8_t)w]++; w >>= 8; copy2[(uint8_t)w]++; w >>= 8; copy3[w]++;
+
+            z = ((const uint32_t *)(const void *)T_p)[14]; w = ((const uint32_t *)(const void *)T_p)[15];
+            copy0[(uint8_t)x]++; x >>= 8; copy1[(uint8_t)x]++; x >>= 8; copy2[(uint8_t)x]++; x >>= 8; copy3[x]++;
+            copy0[(uint8_t)y]++; y >>= 8; copy1[(uint8_t)y]++; y >>= 8; copy2[(uint8_t)y]++; y >>= 8; copy3[y]++;
+
+            x = ((const uint32_t *)(const void *)T_p)[16]; y = ((const uint32_t *)(const void *)T_p)[17];
+            copy0[(uint8_t)z]++; z >>= 8; copy1[(uint8_t)z]++; z >>= 8; copy2[(uint8_t)z]++; z >>= 8; copy3[z]++;
+            copy0[(uint8_t)w]++; w >>= 8; copy1[(uint8_t)w]++; w >>= 8; copy2[(uint8_t)w]++; w >>= 8; copy3[w]++;
+        }
+
+        /* Drain the last pipelined pair of words. */
+        copy0[(uint8_t)x]++; x >>= 8; copy1[(uint8_t)x]++; x >>= 8; copy2[(uint8_t)x]++; x >>= 8; copy3[x]++;
+        copy0[(uint8_t)y]++; y >>= 8; copy1[(uint8_t)y]++; y >>= 8; copy2[(uint8_t)y]++; y >>= 8; copy3[y]++;
+
+        T_p += 8;
+
+        /* Fold the four sub-histograms into the output. */
+        fast_uint_t i; for (i = 0; i < ALPHABET_SIZE; i++) { count[i] += copy0[i] + copy1[i] + copy2[i] + copy3[i]; }
+    }
+
+    /* Byte-at-a-time tail (and the whole input when n < 1024). */
+    for (; T_p < T + n; T_p += 1) { count[T_p[0]]++; }
+}
+
+/* Transposes the 256x256 bigram table in place (bucket2[(c << 8) + d] swaps
+   with bucket2[(d << 8) + c]) using 16x16 tiles: diagonal tiles are swapped
+   element-pair-wise, off-diagonal tile pairs are exchanged row by row with a
+   fully unrolled 16-element swap. */
+static void libsais_unbwt_transpose_bucket2(sa_uint_t * RESTRICT bucket2)
+{
+    fast_uint_t x, y, c, d;
+    for (x = 0; x != ALPHABET_SIZE; x += 16)
+    {
+        /* Diagonal tile: swap the upper triangle with the lower. */
+        for (c = x; c != x + 16; ++c)
+        {
+            for (d = c + 1; d != x + 16; ++d)
+            {
+                sa_uint_t tmp = bucket2[(d << 8) + c]; bucket2[(d << 8) + c] = bucket2[(c << 8) + d]; bucket2[(c << 8) + d] = tmp;
+            }
+        }
+
+        /* Off-diagonal tiles: exchange tile (y, x) with tile (x, y). */
+        for (y = x + 16; y != ALPHABET_SIZE; y += 16)
+        {
+            for (c = x; c != x + 16; ++c)
+            {
+                sa_uint_t * bucket2_yc = &bucket2[(y << 8) + c];
+                sa_uint_t * bucket2_cy = &bucket2[(c << 8) + y];
+
+                /* 16 row elements of one tile column vs. 16 consecutive
+                   elements of the mirrored tile row. */
+                sa_uint_t tmp00 = bucket2_yc[ 0 * 256]; bucket2_yc[ 0 * 256] = bucket2_cy[ 0]; bucket2_cy[ 0] = tmp00;
+                sa_uint_t tmp01 = bucket2_yc[ 1 * 256]; bucket2_yc[ 1 * 256] = bucket2_cy[ 1]; bucket2_cy[ 1] = tmp01;
+                sa_uint_t tmp02 = bucket2_yc[ 2 * 256]; bucket2_yc[ 2 * 256] = bucket2_cy[ 2]; bucket2_cy[ 2] = tmp02;
+                sa_uint_t tmp03 = bucket2_yc[ 3 * 256]; bucket2_yc[ 3 * 256] = bucket2_cy[ 3]; bucket2_cy[ 3] = tmp03;
+                sa_uint_t tmp04 = bucket2_yc[ 4 * 256]; bucket2_yc[ 4 * 256] = bucket2_cy[ 4]; bucket2_cy[ 4] = tmp04;
+                sa_uint_t tmp05 = bucket2_yc[ 5 * 256]; bucket2_yc[ 5 * 256] = bucket2_cy[ 5]; bucket2_cy[ 5] = tmp05;
+                sa_uint_t tmp06 = bucket2_yc[ 6 * 256]; bucket2_yc[ 6 * 256] = bucket2_cy[ 6]; bucket2_cy[ 6] = tmp06;
+                sa_uint_t tmp07 = bucket2_yc[ 7 * 256]; bucket2_yc[ 7 * 256] = bucket2_cy[ 7]; bucket2_cy[ 7] = tmp07;
+                sa_uint_t tmp08 = bucket2_yc[ 8 * 256]; bucket2_yc[ 8 * 256] = bucket2_cy[ 8]; bucket2_cy[ 8] = tmp08;
+                sa_uint_t tmp09 = bucket2_yc[ 9 * 256]; bucket2_yc[ 9 * 256] = bucket2_cy[ 9]; bucket2_cy[ 9] = tmp09;
+                sa_uint_t tmp10 = bucket2_yc[10 * 256]; bucket2_yc[10 * 256] = bucket2_cy[10]; bucket2_cy[10] = tmp10;
+                sa_uint_t tmp11 = bucket2_yc[11 * 256]; bucket2_yc[11 * 256] = bucket2_cy[11]; bucket2_cy[11] = tmp11;
+                sa_uint_t tmp12 = bucket2_yc[12 * 256]; bucket2_yc[12 * 256] = bucket2_cy[12]; bucket2_cy[12] = tmp12;
+                sa_uint_t tmp13 = bucket2_yc[13 * 256]; bucket2_yc[13 * 256] = bucket2_cy[13]; bucket2_cy[13] = tmp13;
+                sa_uint_t tmp14 = bucket2_yc[14 * 256]; bucket2_yc[14 * 256] = bucket2_cy[14]; bucket2_cy[14] = tmp14;
+                sa_uint_t tmp15 = bucket2_yc[15 * 256]; bucket2_yc[15 * 256] = bucket2_cy[15]; bucket2_cy[15] = tmp15;
+            }
+        }
+    }
+}
+
+/* Single-threaded setup for inverse BWT: turns the symbol counts in bucket1
+   into exclusive prefix sums (starting at 1 for the sentinel) and fills the
+   bigram histogram bucket2, then transposes it.  Each symbol's run in T is
+   split around the primary index, skipping the primary row itself (the
+   second sub-range reads from T[lo - 1], i.e. shifted by one). */
+static void libsais_unbwt_compute_bigram_histogram_single(const uint8_t * RESTRICT T, sa_uint_t * RESTRICT bucket1, sa_uint_t * RESTRICT bucket2, fast_uint_t index)
+{
+    fast_uint_t sum, c;
+    for (sum = 1, c = 0; c < ALPHABET_SIZE; ++c)
+    {
+        /* bucket1[c] becomes the starting offset of symbol c's run. */
+        fast_uint_t prev = sum; sum += bucket1[c]; bucket1[c] = (sa_uint_t)prev;
+        if (prev != sum)
+        {
+            sa_uint_t * RESTRICT bucket2_p = &bucket2[c << 8];
+
+            /* Part of the run before the primary index. */
+            {
+                fast_uint_t hi = index; if (sum < hi) { hi = sum; }
+                libsais_unbwt_compute_histogram(&T[prev], (fast_sint_t)(hi - prev), bucket2_p);
+            }
+
+            /* Part of the run after the primary index (shifted by one). */
+            {
+                fast_uint_t lo = index + 1; if (prev > lo) { lo = prev; }
+                libsais_unbwt_compute_histogram(&T[lo - 1], (fast_sint_t)(sum - lo), bucket2_p);
+            }
+        }
+    }
+
+    libsais_unbwt_transpose_bucket2(bucket2);
+}
+
+/* Converts the bucket2 bigram counts into exclusive prefix sums (start
+   offsets, 1-based) and fills the fastbits lookup table: fastbits[p >> shift]
+   is the first bigram whose range could contain position p, letting the
+   decoder begin its linear scan close to the target.  The extra increment at
+   c == lastc appears to reserve the slot belonging to the final character of
+   the original string (lastc is T[0] of the BWT) — TODO confirm. */
+static void libsais_unbwt_calculate_fastbits(sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t lastc, fast_uint_t shift)
+{
+ fast_uint_t v, w, sum, c, d;
+ for (v = 0, w = 0, sum = 1, c = 0; c < ALPHABET_SIZE; ++c)
+ {
+ if (c == lastc) { sum += 1; }
+
+ for (d = 0; d < ALPHABET_SIZE; ++d, ++w)
+ {
+ /* bucket2[w] becomes the start offset of bigram w's range. */
+ fast_uint_t prev = sum; sum += bucket2[w]; bucket2[w] = (sa_uint_t)prev;
+ if (prev != sum)
+ {
+ /* Point every fastbits slot that maps into [prev, sum) at bigram w. */
+ for (; v <= ((sum - 1) >> shift); ++v) { fastbits[v] = (uint16_t)w; }
+ }
+ }
+ }
+}
+
+/* Builds the bi-gram PSI permutation P over [omp_block_start, omp_block_end):
+   for each position i, the rank p of the current symbol is drawn from the
+   bucket1 cursors (advanced in place), the bigram index w combines the symbol
+   found at that rank with the current symbol, and i is written to the next
+   free slot of bigram w (advancing the bucket2 cursor).  The position whose
+   rank equals the primary index (t == 0) is skipped.  The two loops are
+   identical except for the one-position input shift at or past the index. */
+static void libsais_unbwt_calculate_biPSI(const uint8_t * RESTRICT T, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket1, sa_uint_t * RESTRICT bucket2, fast_uint_t index, fast_sint_t omp_block_start, fast_sint_t omp_block_end)
+{
+ {
+ /* Positions strictly below the primary index read T[i] directly. */
+ fast_sint_t i = omp_block_start, j = (fast_sint_t)index; if (omp_block_end < j) { j = omp_block_end; }
+ for (; i < j; ++i)
+ {
+ fast_uint_t c = T[i];
+ fast_uint_t p = bucket1[c]++;
+ fast_sint_t t = (fast_sint_t)(index - p);
+
+ if (t != 0)
+ {
+ /* Branch-free select: the arithmetic shift yields 0 when p < index
+ and -1 otherwise, so this reads T[p] or T[p - 1] respectively. */
+ fast_uint_t w = (((fast_uint_t)T[p + (fast_uint_t)(t >> ((sizeof(fast_sint_t) * 8) - 1))]) << 8) + c;
+ P[bucket2[w]++] = (sa_uint_t)i;
+ }
+ }
+ }
+
+ {
+ /* Positions at or past the primary index read T[i - 1] (BWT shift). */
+ fast_sint_t i = (fast_sint_t)index, j = omp_block_end; if (omp_block_start > i) { i = omp_block_start; }
+ for (i += 1; i <= j; ++i)
+ {
+ fast_uint_t c = T[i - 1];
+ fast_uint_t p = bucket1[c]++;
+ fast_sint_t t = (fast_sint_t)(index - p);
+
+ if (t != 0)
+ {
+ fast_uint_t w = (((fast_uint_t)T[p + (fast_uint_t)(t >> ((sizeof(fast_sint_t) * 8) - 1))]) << 8) + c;
+ P[bucket2[w]++] = (sa_uint_t)i;
+ }
+ }
+ }
+}
+
+/* Single-threaded initialization of the inverse-BWT tables: gathers symbol
+   counts (trusting the caller's freq table when supplied, otherwise scanning
+   T), then derives the bigram histogram, the fastbits acceleration table,
+   and the biPSI permutation P covering the whole string [0, n). */
+static void libsais_unbwt_init_single(const uint8_t * RESTRICT T, sa_uint_t * RESTRICT P, sa_sint_t n, const sa_sint_t * freq, const sa_uint_t * RESTRICT I, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits)
+{
+ sa_uint_t bucket1[ALPHABET_SIZE];
+
+ fast_uint_t index = I[0];
+ fast_uint_t lastc = T[0];
+ /* Pick shift so the fastbits table spans at most 2^UNBWT_FASTBITS slots. */
+ fast_uint_t shift = 0; while ((n >> shift) > (1 << UNBWT_FASTBITS)) { shift++; }
+
+ if (freq != NULL)
+ {
+ /* Caller-provided symbol frequencies; avoids a pass over T. */
+ memcpy(bucket1, freq, ALPHABET_SIZE * sizeof(sa_uint_t));
+ }
+ else
+ {
+ memset(bucket1, 0, ALPHABET_SIZE * sizeof(sa_uint_t));
+ libsais_unbwt_compute_histogram(T, n, bucket1);
+ }
+
+ memset(bucket2, 0, ALPHABET_SIZE * ALPHABET_SIZE * sizeof(sa_uint_t));
+ libsais_unbwt_compute_bigram_histogram_single(T, bucket1, bucket2, index);
+
+ libsais_unbwt_calculate_fastbits(bucket2, fastbits, lastc, shift);
+ libsais_unbwt_calculate_biPSI(T, P, bucket1, bucket2, index, 0, n);
+}
+
+#if defined(_OPENMP)
+
+/* Per-thread bigram counting for the parallel initialization: scans
+   T[omp_block_start, omp_block_start + omp_block_size) and accumulates
+   bigram occurrences into the thread-local bucket2.  bucket1 must hold this
+   thread's rank cursors (global start offsets plus the counts contributed by
+   all preceding threads) and is advanced in place.  The bigram addressing
+   mirrors libsais_unbwt_calculate_biPSI, including the skip at t == 0. */
+static void libsais_unbwt_compute_bigram_histogram_parallel(const uint8_t * RESTRICT T, fast_uint_t index, sa_uint_t * RESTRICT bucket1, sa_uint_t * RESTRICT bucket2, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ fast_sint_t i;
+ for (i = omp_block_start; i < omp_block_start + omp_block_size; ++i)
+ {
+ fast_uint_t c = T[i];
+ fast_uint_t p = bucket1[c]++;
+ fast_sint_t t = (fast_sint_t)(index - p);
+
+ if (t != 0)
+ {
+ /* Branch-free T[p] / T[p - 1] select around the primary index. */
+ fast_uint_t w = (((fast_uint_t)T[p + (fast_uint_t)(t >> ((sizeof(fast_sint_t) * 8) - 1))]) << 8) + c;
+ bucket2[w]++;
+ }
+ }
+}
+
+/* Multi-threaded counterpart of libsais_unbwt_init_single.  Each thread owns
+   a private (bucket1, bucket2) pair inside `buckets`; barrier-separated
+   stages: (1) per-thread symbol histograms; (2, master) reduce them into the
+   global bucket1 prefix sums, leaving each thread's exclusive count offsets
+   in its slot; (3) per-thread bigram histograms using per-thread cursors;
+   (4) parallel reduction of the private bucket2 tables into the global one,
+   again leaving exclusive offsets; (5, master) build fastbits and rotate the
+   saved bucket1 cursors one slot right so every thread starts biPSI from the
+   cursor state its predecessor finished with; (6) per-thread biPSI fill of
+   P; (7, master) restore the fully advanced bucket2 (range-end offsets, as
+   the decoder expects) from the last thread's cursor copy.
+   NOTE(review): the caller-supplied freq table is only honored on the
+   single-thread fallback path. */
+static void libsais_unbwt_init_parallel(const uint8_t * RESTRICT T, sa_uint_t * RESTRICT P, sa_sint_t n, const sa_sint_t * freq, const sa_uint_t * RESTRICT I, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, sa_uint_t * RESTRICT buckets, sa_sint_t threads)
+{
+ sa_uint_t bucket1[ALPHABET_SIZE];
+
+ fast_uint_t index = I[0];
+ fast_uint_t lastc = T[0];
+ fast_uint_t shift = 0; while ((n >> shift) > (1 << UNBWT_FASTBITS)) { shift++; }
+
+ memset(bucket1, 0, ALPHABET_SIZE * sizeof(sa_uint_t));
+ memset(bucket2, 0, ALPHABET_SIZE * ALPHABET_SIZE * sizeof(sa_uint_t));
+
+ #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536)
+ {
+ fast_sint_t omp_thread_num = omp_get_thread_num();
+ fast_sint_t omp_num_threads = omp_get_num_threads();
+
+ if (omp_num_threads == 1)
+ {
+ /* OpenMP gave us one thread after all: fall back to the serial path. */
+ libsais_unbwt_init_single(T, P, n, freq, I, bucket2, fastbits);
+ }
+ else
+ {
+ sa_uint_t * RESTRICT bucket1_local = buckets + omp_thread_num * (ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE));
+ sa_uint_t * RESTRICT bucket2_local = bucket1_local + ALPHABET_SIZE;
+
+ /* Blocks are 16-aligned; the last thread absorbs the remainder. */
+ fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
+ fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+ fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;
+
+ /* Stage 1: per-thread symbol histogram of this thread's slice. */
+ {
+ memset(bucket1_local, 0, ALPHABET_SIZE * sizeof(sa_uint_t));
+ libsais_unbwt_compute_histogram(T + omp_block_start, omp_block_size, bucket1_local);
+ }
+
+ #pragma omp barrier
+
+ /* Stage 2 (master): reduce per-thread histograms into bucket1. */
+ #pragma omp master
+ {
+ {
+ sa_uint_t * RESTRICT bucket1_temp = buckets;
+
+ fast_sint_t t;
+ for (t = 0; t < omp_num_threads; ++t, bucket1_temp += ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE))
+ {
+ /* Leave each thread's exclusive count offset in its own slot. */
+ fast_sint_t c; for (c = 0; c < ALPHABET_SIZE; c += 1) { sa_uint_t A = bucket1[c], B = bucket1_temp[c]; bucket1[c] = A + B; bucket1_temp[c] = A; }
+ }
+ }
+
+ {
+ /* Global counts -> exclusive prefix sums (1-based start offsets). */
+ fast_uint_t sum, c;
+ for (sum = 1, c = 0; c < ALPHABET_SIZE; ++c) { fast_uint_t prev = sum; sum += bucket1[c]; bucket1[c] = (sa_uint_t)prev; }
+ }
+ }
+
+ #pragma omp barrier
+
+ /* Stage 3: turn local offsets into absolute cursors, count bigrams. */
+ {
+ fast_sint_t c; for (c = 0; c < ALPHABET_SIZE; c += 1) { sa_uint_t A = bucket1[c], B = bucket1_local[c]; bucket1_local[c] = A + B; }
+
+ memset(bucket2_local, 0, ALPHABET_SIZE * ALPHABET_SIZE * sizeof(sa_uint_t));
+ libsais_unbwt_compute_bigram_histogram_parallel(T, index, bucket1_local, bucket2_local, omp_block_start, omp_block_size);
+ }
+
+ #pragma omp barrier
+
+ /* Stage 4: parallel reduction of private bucket2 tables into the
+ global one; each thread reduces a disjoint slice of the table. */
+ {
+ fast_sint_t omp_bucket2_stride = ((ALPHABET_SIZE * ALPHABET_SIZE) / omp_num_threads) & (-16);
+ fast_sint_t omp_bucket2_start = omp_thread_num * omp_bucket2_stride;
+ fast_sint_t omp_bucket2_size = omp_thread_num < omp_num_threads - 1 ? omp_bucket2_stride : (ALPHABET_SIZE * ALPHABET_SIZE) - omp_bucket2_start;
+
+ sa_uint_t * RESTRICT bucket2_temp = buckets + ALPHABET_SIZE;
+
+ fast_sint_t t;
+ for (t = 0; t < omp_num_threads; ++t, bucket2_temp += ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE))
+ {
+ fast_sint_t c; for (c = omp_bucket2_start; c < omp_bucket2_start + omp_bucket2_size; c += 1) { sa_uint_t A = bucket2[c], B = bucket2_temp[c]; bucket2[c] = A + B; bucket2_temp[c] = A; }
+ }
+ }
+
+ #pragma omp barrier
+
+ /* Stage 5 (master): fastbits table, then shift the saved bucket1
+ cursors one thread right so thread t resumes where t-1 stopped. */
+ #pragma omp master
+ {
+
+ libsais_unbwt_calculate_fastbits(bucket2, fastbits, lastc, shift);
+
+ {
+ fast_sint_t t;
+ for (t = omp_num_threads - 1; t >= 1; --t)
+ {
+ sa_uint_t * RESTRICT dst_bucket1 = buckets + t * (ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE));
+ sa_uint_t * RESTRICT src_bucket1 = dst_bucket1 - (ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE));
+
+ memcpy(dst_bucket1, src_bucket1, ALPHABET_SIZE * sizeof(sa_uint_t));
+ }
+
+ /* Thread 0 starts from the global start offsets. */
+ memcpy(buckets, bucket1, ALPHABET_SIZE * sizeof(sa_uint_t));
+ }
+ }
+
+ #pragma omp barrier
+
+ /* Stage 6: absolute bucket2 cursors for this thread, then fill P. */
+ {
+ fast_sint_t c; for (c = 0; c < ALPHABET_SIZE * ALPHABET_SIZE; c += 1) { sa_uint_t A = bucket2[c], B = bucket2_local[c]; bucket2_local[c] = A + B; }
+
+ libsais_unbwt_calculate_biPSI(T, P, bucket1_local, bucket2_local, index, omp_block_start, omp_block_start + omp_block_size);
+ }
+
+ #pragma omp barrier
+
+ /* Stage 7 (master): the decoder needs bucket2 advanced to range-end
+ offsets; the last thread's final cursors hold exactly that. */
+ #pragma omp master
+ {
+ memcpy(bucket2, buckets + ALPHABET_SIZE + (omp_num_threads - 1) * (ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE)), ALPHABET_SIZE * ALPHABET_SIZE * sizeof(sa_uint_t));
+ }
+ }
+ }
+}
+
+#endif
+
+/* Decodes k bigrams (2 * k output bytes) of a single block.  Each step uses
+   fastbits to guess the bigram containing position p0, refines the guess by
+   scanning bucket2 (which holds range-end offsets) until bucket2[c0] > p0,
+   follows P to the next position, and stores both symbols at once through a
+   uint16_t write (libsais_bswap16 arranges the two bytes in text order).
+   The final position is written back so the caller can resume the block. */
+static void libsais_unbwt_decode_1(uint8_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t * i0, fast_uint_t k)
+{
+ uint16_t * RESTRICT U0 = (uint16_t *)(void *)U;
+
+ fast_uint_t i, p0 = *i0;
+
+ for (i = 0; i != k; ++i)
+ {
+ uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while (bucket2[c0] <= p0); } p0 = P[p0]; U0[i] = libsais_bswap16(c0);
+ }
+
+ *i0 = p0;
+}
+
+/* Same per-step logic as libsais_unbwt_decode_1, but decodes 2 independent
+   blocks in lockstep (output streams r bytes apart in U) so multiple
+   dependency chains are in flight at once. */
+static void libsais_unbwt_decode_2(uint8_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0, fast_uint_t * i1, fast_uint_t k)
+{
+ uint16_t * RESTRICT U0 = (uint16_t *)(void *)U;
+ uint16_t * RESTRICT U1 = (uint16_t *)(void *)(((uint8_t *)U0) + r);
+
+ fast_uint_t i, p0 = *i0, p1 = *i1;
+
+ for (i = 0; i != k; ++i)
+ {
+ uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while (bucket2[c0] <= p0); } p0 = P[p0]; U0[i] = libsais_bswap16(c0);
+ uint16_t c1 = fastbits[p1 >> shift]; if (bucket2[c1] <= p1) { do { c1++; } while (bucket2[c1] <= p1); } p1 = P[p1]; U1[i] = libsais_bswap16(c1);
+ }
+
+ *i0 = p0; *i1 = p1;
+}
+
+/* Same per-step logic as libsais_unbwt_decode_1, interleaving 3 independent
+   blocks (output streams r bytes apart in U). */
+static void libsais_unbwt_decode_3(uint8_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0, fast_uint_t * i1, fast_uint_t * i2, fast_uint_t k)
+{
+ uint16_t * RESTRICT U0 = (uint16_t *)(void *)U;
+ uint16_t * RESTRICT U1 = (uint16_t *)(void *)(((uint8_t *)U0) + r);
+ uint16_t * RESTRICT U2 = (uint16_t *)(void *)(((uint8_t *)U1) + r);
+
+ fast_uint_t i, p0 = *i0, p1 = *i1, p2 = *i2;
+
+ for (i = 0; i != k; ++i)
+ {
+ uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while (bucket2[c0] <= p0); } p0 = P[p0]; U0[i] = libsais_bswap16(c0);
+ uint16_t c1 = fastbits[p1 >> shift]; if (bucket2[c1] <= p1) { do { c1++; } while (bucket2[c1] <= p1); } p1 = P[p1]; U1[i] = libsais_bswap16(c1);
+ uint16_t c2 = fastbits[p2 >> shift]; if (bucket2[c2] <= p2) { do { c2++; } while (bucket2[c2] <= p2); } p2 = P[p2]; U2[i] = libsais_bswap16(c2);
+ }
+
+ *i0 = p0; *i1 = p1; *i2 = p2;
+}
+
+/* Same per-step logic as libsais_unbwt_decode_1, interleaving 4 independent
+   blocks (output streams r bytes apart in U). */
+static void libsais_unbwt_decode_4(uint8_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0, fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3, fast_uint_t k)
+{
+ uint16_t * RESTRICT U0 = (uint16_t *)(void *)U;
+ uint16_t * RESTRICT U1 = (uint16_t *)(void *)(((uint8_t *)U0) + r);
+ uint16_t * RESTRICT U2 = (uint16_t *)(void *)(((uint8_t *)U1) + r);
+ uint16_t * RESTRICT U3 = (uint16_t *)(void *)(((uint8_t *)U2) + r);
+
+ fast_uint_t i, p0 = *i0, p1 = *i1, p2 = *i2, p3 = *i3;
+
+ for (i = 0; i != k; ++i)
+ {
+ uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while (bucket2[c0] <= p0); } p0 = P[p0]; U0[i] = libsais_bswap16(c0);
+ uint16_t c1 = fastbits[p1 >> shift]; if (bucket2[c1] <= p1) { do { c1++; } while (bucket2[c1] <= p1); } p1 = P[p1]; U1[i] = libsais_bswap16(c1);
+ uint16_t c2 = fastbits[p2 >> shift]; if (bucket2[c2] <= p2) { do { c2++; } while (bucket2[c2] <= p2); } p2 = P[p2]; U2[i] = libsais_bswap16(c2);
+ uint16_t c3 = fastbits[p3 >> shift]; if (bucket2[c3] <= p3) { do { c3++; } while (bucket2[c3] <= p3); } p3 = P[p3]; U3[i] = libsais_bswap16(c3);
+ }
+
+ *i0 = p0; *i1 = p1; *i2 = p2; *i3 = p3;
+}
+
+/* Same per-step logic as libsais_unbwt_decode_1, interleaving 5 independent
+   blocks (output streams r bytes apart in U). */
+static void libsais_unbwt_decode_5(uint8_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0, fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3, fast_uint_t * i4, fast_uint_t k)
+{
+ uint16_t * RESTRICT U0 = (uint16_t *)(void *)U;
+ uint16_t * RESTRICT U1 = (uint16_t *)(void *)(((uint8_t *)U0) + r);
+ uint16_t * RESTRICT U2 = (uint16_t *)(void *)(((uint8_t *)U1) + r);
+ uint16_t * RESTRICT U3 = (uint16_t *)(void *)(((uint8_t *)U2) + r);
+ uint16_t * RESTRICT U4 = (uint16_t *)(void *)(((uint8_t *)U3) + r);
+
+ fast_uint_t i, p0 = *i0, p1 = *i1, p2 = *i2, p3 = *i3, p4 = *i4;
+
+ for (i = 0; i != k; ++i)
+ {
+ uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while (bucket2[c0] <= p0); } p0 = P[p0]; U0[i] = libsais_bswap16(c0);
+ uint16_t c1 = fastbits[p1 >> shift]; if (bucket2[c1] <= p1) { do { c1++; } while (bucket2[c1] <= p1); } p1 = P[p1]; U1[i] = libsais_bswap16(c1);
+ uint16_t c2 = fastbits[p2 >> shift]; if (bucket2[c2] <= p2) { do { c2++; } while (bucket2[c2] <= p2); } p2 = P[p2]; U2[i] = libsais_bswap16(c2);
+ uint16_t c3 = fastbits[p3 >> shift]; if (bucket2[c3] <= p3) { do { c3++; } while (bucket2[c3] <= p3); } p3 = P[p3]; U3[i] = libsais_bswap16(c3);
+ uint16_t c4 = fastbits[p4 >> shift]; if (bucket2[c4] <= p4) { do { c4++; } while (bucket2[c4] <= p4); } p4 = P[p4]; U4[i] = libsais_bswap16(c4);
+ }
+
+ *i0 = p0; *i1 = p1; *i2 = p2; *i3 = p3; *i4 = p4;
+}
+
+/* Same per-step logic as libsais_unbwt_decode_1, interleaving 6 independent
+   blocks (output streams r bytes apart in U). */
+static void libsais_unbwt_decode_6(uint8_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0, fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3, fast_uint_t * i4, fast_uint_t * i5, fast_uint_t k)
+{
+ uint16_t * RESTRICT U0 = (uint16_t *)(void *)U;
+ uint16_t * RESTRICT U1 = (uint16_t *)(void *)(((uint8_t *)U0) + r);
+ uint16_t * RESTRICT U2 = (uint16_t *)(void *)(((uint8_t *)U1) + r);
+ uint16_t * RESTRICT U3 = (uint16_t *)(void *)(((uint8_t *)U2) + r);
+ uint16_t * RESTRICT U4 = (uint16_t *)(void *)(((uint8_t *)U3) + r);
+ uint16_t * RESTRICT U5 = (uint16_t *)(void *)(((uint8_t *)U4) + r);
+
+ fast_uint_t i, p0 = *i0, p1 = *i1, p2 = *i2, p3 = *i3, p4 = *i4, p5 = *i5;
+
+ for (i = 0; i != k; ++i)
+ {
+ uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while (bucket2[c0] <= p0); } p0 = P[p0]; U0[i] = libsais_bswap16(c0);
+ uint16_t c1 = fastbits[p1 >> shift]; if (bucket2[c1] <= p1) { do { c1++; } while (bucket2[c1] <= p1); } p1 = P[p1]; U1[i] = libsais_bswap16(c1);
+ uint16_t c2 = fastbits[p2 >> shift]; if (bucket2[c2] <= p2) { do { c2++; } while (bucket2[c2] <= p2); } p2 = P[p2]; U2[i] = libsais_bswap16(c2);
+ uint16_t c3 = fastbits[p3 >> shift]; if (bucket2[c3] <= p3) { do { c3++; } while (bucket2[c3] <= p3); } p3 = P[p3]; U3[i] = libsais_bswap16(c3);
+ uint16_t c4 = fastbits[p4 >> shift]; if (bucket2[c4] <= p4) { do { c4++; } while (bucket2[c4] <= p4); } p4 = P[p4]; U4[i] = libsais_bswap16(c4);
+ uint16_t c5 = fastbits[p5 >> shift]; if (bucket2[c5] <= p5) { do { c5++; } while (bucket2[c5] <= p5); } p5 = P[p5]; U5[i] = libsais_bswap16(c5);
+ }
+
+ *i0 = p0; *i1 = p1; *i2 = p2; *i3 = p3; *i4 = p4; *i5 = p5;
+}
+
+/* Same per-step logic as libsais_unbwt_decode_1, interleaving 7 independent
+   blocks (output streams r bytes apart in U). */
+static void libsais_unbwt_decode_7(uint8_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0, fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3, fast_uint_t * i4, fast_uint_t * i5, fast_uint_t * i6, fast_uint_t k)
+{
+ uint16_t * RESTRICT U0 = (uint16_t *)(void *)U;
+ uint16_t * RESTRICT U1 = (uint16_t *)(void *)(((uint8_t *)U0) + r);
+ uint16_t * RESTRICT U2 = (uint16_t *)(void *)(((uint8_t *)U1) + r);
+ uint16_t * RESTRICT U3 = (uint16_t *)(void *)(((uint8_t *)U2) + r);
+ uint16_t * RESTRICT U4 = (uint16_t *)(void *)(((uint8_t *)U3) + r);
+ uint16_t * RESTRICT U5 = (uint16_t *)(void *)(((uint8_t *)U4) + r);
+ uint16_t * RESTRICT U6 = (uint16_t *)(void *)(((uint8_t *)U5) + r);
+
+ fast_uint_t i, p0 = *i0, p1 = *i1, p2 = *i2, p3 = *i3, p4 = *i4, p5 = *i5, p6 = *i6;
+
+ for (i = 0; i != k; ++i)
+ {
+ uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while (bucket2[c0] <= p0); } p0 = P[p0]; U0[i] = libsais_bswap16(c0);
+ uint16_t c1 = fastbits[p1 >> shift]; if (bucket2[c1] <= p1) { do { c1++; } while (bucket2[c1] <= p1); } p1 = P[p1]; U1[i] = libsais_bswap16(c1);
+ uint16_t c2 = fastbits[p2 >> shift]; if (bucket2[c2] <= p2) { do { c2++; } while (bucket2[c2] <= p2); } p2 = P[p2]; U2[i] = libsais_bswap16(c2);
+ uint16_t c3 = fastbits[p3 >> shift]; if (bucket2[c3] <= p3) { do { c3++; } while (bucket2[c3] <= p3); } p3 = P[p3]; U3[i] = libsais_bswap16(c3);
+ uint16_t c4 = fastbits[p4 >> shift]; if (bucket2[c4] <= p4) { do { c4++; } while (bucket2[c4] <= p4); } p4 = P[p4]; U4[i] = libsais_bswap16(c4);
+ uint16_t c5 = fastbits[p5 >> shift]; if (bucket2[c5] <= p5) { do { c5++; } while (bucket2[c5] <= p5); } p5 = P[p5]; U5[i] = libsais_bswap16(c5);
+ uint16_t c6 = fastbits[p6 >> shift]; if (bucket2[c6] <= p6) { do { c6++; } while (bucket2[c6] <= p6); } p6 = P[p6]; U6[i] = libsais_bswap16(c6);
+ }
+
+ *i0 = p0; *i1 = p1; *i2 = p2; *i3 = p3; *i4 = p4; *i5 = p5; *i6 = p6;
+}
+
+/* Same per-step logic as libsais_unbwt_decode_1, interleaving 8 independent
+   blocks (output streams r bytes apart in U) — the widest variant, used for
+   full groups of eight blocks by libsais_unbwt_decode. */
+static void libsais_unbwt_decode_8(uint8_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0, fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3, fast_uint_t * i4, fast_uint_t * i5, fast_uint_t * i6, fast_uint_t * i7, fast_uint_t k)
+{
+ uint16_t * RESTRICT U0 = (uint16_t *)(void *)U;
+ uint16_t * RESTRICT U1 = (uint16_t *)(void *)(((uint8_t *)U0) + r);
+ uint16_t * RESTRICT U2 = (uint16_t *)(void *)(((uint8_t *)U1) + r);
+ uint16_t * RESTRICT U3 = (uint16_t *)(void *)(((uint8_t *)U2) + r);
+ uint16_t * RESTRICT U4 = (uint16_t *)(void *)(((uint8_t *)U3) + r);
+ uint16_t * RESTRICT U5 = (uint16_t *)(void *)(((uint8_t *)U4) + r);
+ uint16_t * RESTRICT U6 = (uint16_t *)(void *)(((uint8_t *)U5) + r);
+ uint16_t * RESTRICT U7 = (uint16_t *)(void *)(((uint8_t *)U6) + r);
+
+ fast_uint_t i, p0 = *i0, p1 = *i1, p2 = *i2, p3 = *i3, p4 = *i4, p5 = *i5, p6 = *i6, p7 = *i7;
+
+ for (i = 0; i != k; ++i)
+ {
+ uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while (bucket2[c0] <= p0); } p0 = P[p0]; U0[i] = libsais_bswap16(c0);
+ uint16_t c1 = fastbits[p1 >> shift]; if (bucket2[c1] <= p1) { do { c1++; } while (bucket2[c1] <= p1); } p1 = P[p1]; U1[i] = libsais_bswap16(c1);
+ uint16_t c2 = fastbits[p2 >> shift]; if (bucket2[c2] <= p2) { do { c2++; } while (bucket2[c2] <= p2); } p2 = P[p2]; U2[i] = libsais_bswap16(c2);
+ uint16_t c3 = fastbits[p3 >> shift]; if (bucket2[c3] <= p3) { do { c3++; } while (bucket2[c3] <= p3); } p3 = P[p3]; U3[i] = libsais_bswap16(c3);
+ uint16_t c4 = fastbits[p4 >> shift]; if (bucket2[c4] <= p4) { do { c4++; } while (bucket2[c4] <= p4); } p4 = P[p4]; U4[i] = libsais_bswap16(c4);
+ uint16_t c5 = fastbits[p5 >> shift]; if (bucket2[c5] <= p5) { do { c5++; } while (bucket2[c5] <= p5); } p5 = P[p5]; U5[i] = libsais_bswap16(c5);
+ uint16_t c6 = fastbits[p6 >> shift]; if (bucket2[c6] <= p6) { do { c6++; } while (bucket2[c6] <= p6); } p6 = P[p6]; U6[i] = libsais_bswap16(c6);
+ uint16_t c7 = fastbits[p7 >> shift]; if (bucket2[c7] <= p7) { do { c7++; } while (bucket2[c7] <= p7); } p7 = P[p7]; U7[i] = libsais_bswap16(c7);
+ }
+
+ *i0 = p0; *i1 = p1; *i2 = p2; *i3 = p3; *i4 = p4; *i5 = p5; *i6 = p6; *i7 = p7;
+}
+
+/* Decodes `blocks` consecutive output blocks of stride r bytes starting at U.
+   Full groups of eight blocks are decoded interleaved via decode_8; the final
+   group of N (1..8) blocks is handled in two phases: first all N streams run
+   for reminder / 2 steps (`reminder` is the byte length of the short last
+   block), then the first N - 1 streams continue for the rest of a full
+   r-byte block.  I supplies the biPSI start position of each block. */
+static void libsais_unbwt_decode(uint8_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_sint_t n, sa_sint_t r, const sa_uint_t * RESTRICT I, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_sint_t blocks, fast_uint_t reminder)
+{
+ fast_uint_t shift = 0; while ((n >> shift) > (1 << UNBWT_FASTBITS)) { shift++; }
+ fast_uint_t offset = 0;
+
+ /* Drain full groups of eight blocks, r / 2 bigrams each. */
+ while (blocks > 8)
+ {
+ fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3], i4 = I[4], i5 = I[5], i6 = I[6], i7 = I[7];
+ libsais_unbwt_decode_8(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5, &i6, &i7, (fast_uint_t)r >> 1);
+ I += 8; blocks -= 8; offset += 8 * (fast_uint_t)r;
+ }
+
+ /* Final group: the last block is only `reminder` bytes long. */
+ if (blocks == 1)
+ {
+ fast_uint_t i0 = I[0];
+ libsais_unbwt_decode_1(U + offset, P, bucket2, fastbits, shift, &i0, reminder >> 1);
+ }
+ else if (blocks == 2)
+ {
+ fast_uint_t i0 = I[0], i1 = I[1];
+ libsais_unbwt_decode_2(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, reminder >> 1);
+ libsais_unbwt_decode_1(U + offset + 2 * (reminder >> 1), P, bucket2, fastbits, shift, &i0, ((fast_uint_t)r >> 1) - (reminder >> 1));
+ }
+ else if (blocks == 3)
+ {
+ fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2];
+ libsais_unbwt_decode_3(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, reminder >> 1);
+ libsais_unbwt_decode_2(U + offset + 2 * (reminder >> 1), P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, ((fast_uint_t)r >> 1) - (reminder >> 1));
+ }
+ else if (blocks == 4)
+ {
+ fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3];
+ libsais_unbwt_decode_4(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, reminder >> 1);
+ libsais_unbwt_decode_3(U + offset + 2 * (reminder >> 1), P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, ((fast_uint_t)r >> 1) - (reminder >> 1));
+ }
+ else if (blocks == 5)
+ {
+ fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3], i4 = I[4];
+ libsais_unbwt_decode_5(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, reminder >> 1);
+ libsais_unbwt_decode_4(U + offset + 2 * (reminder >> 1), P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, ((fast_uint_t)r >> 1) - (reminder >> 1));
+ }
+ else if (blocks == 6)
+ {
+ fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3], i4 = I[4], i5 = I[5];
+ libsais_unbwt_decode_6(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5, reminder >> 1);
+ libsais_unbwt_decode_5(U + offset + 2 * (reminder >> 1), P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, ((fast_uint_t)r >> 1) - (reminder >> 1));
+ }
+ else if (blocks == 7)
+ {
+ fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3], i4 = I[4], i5 = I[5], i6 = I[6];
+ libsais_unbwt_decode_7(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5, &i6, reminder >> 1);
+ libsais_unbwt_decode_6(U + offset + 2 * (reminder >> 1), P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5, ((fast_uint_t)r >> 1) - (reminder >> 1));
+ }
+ else
+ {
+ fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3], i4 = I[4], i5 = I[5], i6 = I[6], i7 = I[7];
+ libsais_unbwt_decode_8(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5, &i6, &i7, reminder >> 1);
+ libsais_unbwt_decode_7(U + offset + 2 * (reminder >> 1), P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5, &i6, ((fast_uint_t)r >> 1) - (reminder >> 1));
+ }
+}
+
+/* Splits the r-byte output blocks across OpenMP threads (each thread gets a
+   contiguous range of whole blocks, remainder blocks spread one-per-thread)
+   and runs libsais_unbwt_decode on each range.  Only the thread owning the
+   final range passes the short last-block length; all others pass r.  The
+   very last output byte U[n - 1] is written directly from T[0] afterwards —
+   the pairwise decode loops do not produce it. */
+static void libsais_unbwt_decode_omp(const uint8_t * RESTRICT T, uint8_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_sint_t n, sa_sint_t r, const sa_uint_t * RESTRICT I, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, sa_sint_t threads)
+{
+ fast_uint_t lastc = T[0];
+ /* Number of r-sized blocks; the last one is `reminder` bytes long. */
+ fast_sint_t blocks = 1 + (((fast_sint_t)n - 1) / (fast_sint_t)r);
+ fast_uint_t reminder = (fast_uint_t)n - ((fast_uint_t)r * ((fast_uint_t)blocks - 1));
+
+#if defined(_OPENMP)
+ fast_sint_t max_threads = blocks < threads ? blocks : threads;
+ #pragma omp parallel num_threads(max_threads) if(max_threads > 1 && n >= 65536)
+#endif
+ {
+#if defined(_OPENMP)
+ fast_sint_t omp_thread_num = omp_get_thread_num();
+ fast_sint_t omp_num_threads = omp_get_num_threads();
+#else
+ UNUSED(threads);
+
+ fast_sint_t omp_thread_num = 0;
+ fast_sint_t omp_num_threads = 1;
+#endif
+
+ /* Even split of blocks; the first (blocks % threads) threads get one extra. */
+ fast_sint_t omp_block_stride = blocks / omp_num_threads;
+ fast_sint_t omp_block_reminder = blocks % omp_num_threads;
+ fast_sint_t omp_block_size = omp_block_stride + (omp_thread_num < omp_block_reminder);
+ fast_sint_t omp_block_start = omp_block_stride * omp_thread_num + (omp_thread_num < omp_block_reminder ? omp_thread_num : omp_block_reminder);
+
+ libsais_unbwt_decode(U + r * omp_block_start, P, n, r, I + omp_block_start, bucket2, fastbits, omp_block_size, omp_thread_num < omp_num_threads - 1 ? (fast_uint_t)r : reminder);
+ }
+
+ U[n - 1] = (uint8_t)lastc;
+}
+
+/* Core inverse-BWT driver: initializes the biPSI tables (parallel path only
+   when OpenMP is available, more than one thread is requested, and the input
+   is at least 256 KiB), then decodes into U.  Always returns 0; allocation
+   failures are reported by the callers that own the buffers. */
+static sa_sint_t libsais_unbwt_core(const uint8_t * RESTRICT T, uint8_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_sint_t n, const sa_sint_t * freq, sa_sint_t r, const sa_uint_t * RESTRICT I, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, sa_uint_t * RESTRICT buckets, sa_sint_t threads)
+{
+#if defined(_OPENMP)
+ if (threads > 1 && n >= 262144)
+ {
+ libsais_unbwt_init_parallel(T, P, n, freq, I, bucket2, fastbits, buckets, threads);
+ }
+ else
+#else
+ UNUSED(buckets);
+#endif
+ {
+ libsais_unbwt_init_single(T, P, n, freq, I, bucket2, fastbits);
+ }
+
+ libsais_unbwt_decode_omp(T, U, P, n, r, I, bucket2, fastbits, threads);
+ return 0;
+}
+
+/* Context-free entry point: allocates the bigram table, the fastbits table
+   (one slot per n >> shift positions, plus one), and — only when the parallel
+   path can be taken — the per-thread scratch buckets, then runs the core.
+   Returns the core's result (0) on success, or -2 when allocation fails. */
+static sa_sint_t libsais_unbwt_main(const uint8_t * T, uint8_t * U, sa_uint_t * P, sa_sint_t n, const sa_sint_t * freq, sa_sint_t r, const sa_uint_t * I, sa_sint_t threads)
+{
+ fast_uint_t shift = 0; while ((n >> shift) > (1 << UNBWT_FASTBITS)) { shift++; }
+
+ sa_uint_t * RESTRICT bucket2 = (sa_uint_t *)libsais_alloc_aligned(ALPHABET_SIZE * ALPHABET_SIZE * sizeof(sa_uint_t), 4096);
+ uint16_t * RESTRICT fastbits = (uint16_t *)libsais_alloc_aligned(((size_t)1 + (size_t)(n >> shift)) * sizeof(uint16_t), 4096);
+ sa_uint_t * RESTRICT buckets = threads > 1 && n >= 262144 ? (sa_uint_t *)libsais_alloc_aligned((size_t)threads * (ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE)) * sizeof(sa_uint_t), 4096) : NULL;
+
+ /* `buckets` may legitimately be NULL when the serial path will be used. */
+ sa_sint_t index = bucket2 != NULL && fastbits != NULL && (buckets != NULL || threads == 1 || n < 262144)
+ ? libsais_unbwt_core(T, U, P, n, freq, r, I, bucket2, fastbits, buckets, threads)
+ : -2;
+
+ libsais_free_aligned(buckets);
+ libsais_free_aligned(fastbits);
+ libsais_free_aligned(bucket2);
+
+ return index;
+}
+
+/* Context-based entry point: reuses the tables preallocated in ctx instead
+   of allocating per call.  Returns -2 when the context is missing any buffer
+   the configured thread count requires. */
+static sa_sint_t libsais_unbwt_main_ctx(const LIBSAIS_UNBWT_CONTEXT * ctx, const uint8_t * T, uint8_t * U, sa_uint_t * P, sa_sint_t n, const sa_sint_t * freq, sa_sint_t r, const sa_uint_t * I)
+{
+ return ctx != NULL && ctx->bucket2 != NULL && ctx->fastbits != NULL && (ctx->buckets != NULL || ctx->threads == 1)
+ ? libsais_unbwt_core(T, U, P, n, freq, r, I, ctx->bucket2, ctx->fastbits, ctx->buckets, (sa_sint_t)ctx->threads)
+ : -2;
+}
+
+/* Public API: creates a single-threaded unbwt context (see libsais.h). */
+void * libsais_unbwt_create_ctx(void)
+{
+ return (void *)libsais_unbwt_create_ctx_main(1);
+}
+
+/* Public API: destroys a context created by libsais_unbwt_create_ctx[_omp]. */
+void libsais_unbwt_free_ctx(void * ctx)
+{
+ libsais_unbwt_free_ctx_main((LIBSAIS_UNBWT_CONTEXT *)ctx);
+}
+
+/* Public API: whole-string inverse BWT — delegates to the auxiliary-index
+   variant with a single block (r = n) and i as the lone primary index. */
+int32_t libsais_unbwt(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t i)
+{
+ return libsais_unbwt_aux(T, U, A, n, freq, n, &i);
+}
+
+/* Public API: context-reusing variant of libsais_unbwt (single block, r = n). */
+int32_t libsais_unbwt_ctx(const void * ctx, const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t i)
+{
+ return libsais_unbwt_aux_ctx(ctx, T, U, A, n, freq, n, &i);
+}
+
+/* Public API: inverse BWT with auxiliary indexes.  Validates arguments:
+   r must equal n or be a power of two >= 2, and each of the 1 + (n-1)/r
+   stored indexes must lie in (0, n].  Returns 0 on success, -1 on invalid
+   input, -2 on allocation failure (via libsais_unbwt_main). */
+int32_t libsais_unbwt_aux(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t r, const int32_t * I)
+{
+ if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || ((r != n) && ((r < 2) || ((r & (r - 1)) != 0))) || (I == NULL))
+ {
+ return -1;
+ }
+ else if (n <= 1)
+ {
+ /* Trivial strings: just validate the index and copy the one symbol. */
+ if (I[0] != n) { return -1; }
+ if (n == 1) { U[0] = T[0]; }
+ return 0;
+ }
+
+ fast_sint_t t; for (t = 0; t <= (n - 1) / r; ++t) { if (I[t] <= 0 || I[t] > n) { return -1; } }
+
+ return libsais_unbwt_main(T, U, (sa_uint_t *)A, n, freq, r, (const sa_uint_t *)I, 1);
+}
+
+/* Public API: context-reusing variant of libsais_unbwt_aux.  Performs the
+   same argument validation, then runs with the context's buffers and thread
+   count.  Returns 0 on success, -1 on invalid input, -2 on a bad context. */
+int32_t libsais_unbwt_aux_ctx(const void * ctx, const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t r, const int32_t * I)
+{
+ if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || ((r != n) && ((r < 2) || ((r & (r - 1)) != 0))) || (I == NULL))
+ {
+ return -1;
+ }
+ else if (n <= 1)
+ {
+ if (I[0] != n) { return -1; }
+ if (n == 1) { U[0] = T[0]; }
+ return 0;
+ }
+
+ fast_sint_t t; for (t = 0; t <= (n - 1) / r; ++t) { if (I[t] <= 0 || I[t] > n) { return -1; } }
+
+ return libsais_unbwt_main_ctx((const LIBSAIS_UNBWT_CONTEXT *)ctx, T, U, (sa_uint_t *)A, n, freq, r, (const sa_uint_t *)I);
+}
+
+#if defined(_OPENMP)
+
+/* Public API (OpenMP build): creates a context sized for `threads` threads;
+   0 selects the OpenMP default, negative values are rejected. */
+void * libsais_unbwt_create_ctx_omp(int32_t threads)
+{
+ if (threads < 0) { return NULL; }
+
+ threads = threads > 0 ? threads : omp_get_max_threads();
+ return (void *)libsais_unbwt_create_ctx_main(threads);
+}
+
+/* Public API (OpenMP build): parallel whole-string inverse BWT — delegates
+   to the auxiliary-index variant with a single block (r = n). */
+int32_t libsais_unbwt_omp(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t i, int32_t threads)
+{
+ return libsais_unbwt_aux_omp(T, U, A, n, freq, n, &i, threads);
+}
+
+/* Public API (OpenMP build): parallel inverse BWT with auxiliary indexes.
+   Same validation as libsais_unbwt_aux, plus rejection of negative thread
+   counts; 0 threads selects the OpenMP default. */
+int32_t libsais_unbwt_aux_omp(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t r, const int32_t * I, int32_t threads)
+{
+ if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || ((r != n) && ((r < 2) || ((r & (r - 1)) != 0))) || (I == NULL) || (threads < 0))
+ {
+ return -1;
+ }
+ else if (n <= 1)
+ {
+ if (I[0] != n) { return -1; }
+ if (n == 1) { U[0] = T[0]; }
+ return 0;
+ }
+
+ fast_sint_t t; for (t = 0; t <= (n - 1) / r; ++t) { if (I[t] <= 0 || I[t] > n) { return -1; } }
+
+ threads = threads > 0 ? threads : omp_get_max_threads();
+ return libsais_unbwt_main(T, U, (sa_uint_t *)A, n, freq, r, (const sa_uint_t *)I, threads);
+}
+
+#endif
diff --git a/libsais/libsais.h b/libsais/libsais.h
new file mode 100644
index 0000000..c655d67
--- /dev/null
+++ b/libsais/libsais.h
@@ -0,0 +1,285 @@
+/*--
+
+This file is a part of libsais, a library for linear time
+suffix array and burrows wheeler transform construction.
+
+ Copyright (c) 2021 Ilya Grebnov
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+
+Please see the file LICENSE for full copyright information.
+
+--*/
+
+#ifndef LIBSAIS_H
+#define LIBSAIS_H 1
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+ #include <stdint.h>
+
+ /**
+ * Creates the libsais context that allows reusing allocated memory with each libsais operation.
+ * In multi-threaded environments, use one context per thread for parallel executions.
+ * @return the libsais context, NULL otherwise.
+ */
+ void * libsais_create_ctx(void);
+
+#if defined(_OPENMP)
+ /**
+ * Creates the libsais context that allows reusing allocated memory with each parallel libsais operation using OpenMP.
+ * In multi-threaded environments, use one context per thread for parallel executions.
+ * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default).
+ * @return the libsais context, NULL otherwise.
+ */
+ void * libsais_create_ctx_omp(int32_t threads);
+#endif
+
+ /**
+ * Destroys the libsais context and frees previously allocated memory.
+ * @param ctx The libsais context (can be NULL).
+ */
+ void libsais_free_ctx(void * ctx);
+
+ /**
+ * Constructs the suffix array of a given string.
+ * @param T [0..n-1] The input string.
+ * @param SA [0..n-1+fs] The output array of suffixes.
+ * @param n The length of the given string.
+ * @param fs The extra space available at the end of SA array (can be 0).
+ * @param freq [0..255] The output symbol frequency table (can be NULL).
+ * @return 0 if no error occurred, -1 or -2 otherwise.
+ */
+ int32_t libsais(const uint8_t * T, int32_t * SA, int32_t n, int32_t fs, int32_t * freq);
+
+ /**
+ * Constructs the suffix array of a given string using libsais context.
+ * @param ctx The libsais context.
+ * @param T [0..n-1] The input string.
+ * @param SA [0..n-1+fs] The output array of suffixes.
+ * @param n The length of the given string.
+ * @param fs The extra space available at the end of SA array (can be 0).
+ * @param freq [0..255] The output symbol frequency table (can be NULL).
+ * @return 0 if no error occurred, -1 or -2 otherwise.
+ */
+ int32_t libsais_ctx(const void * ctx, const uint8_t * T, int32_t * SA, int32_t n, int32_t fs, int32_t * freq);
+
+#if defined(_OPENMP)
+ /**
+ * Constructs the suffix array of a given string in parallel using OpenMP.
+ * @param T [0..n-1] The input string.
+ * @param SA [0..n-1+fs] The output array of suffixes.
+ * @param n The length of the given string.
+ * @param fs The extra space available at the end of SA array (can be 0).
+ * @param freq [0..255] The output symbol frequency table (can be NULL).
+ * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default).
+ * @return 0 if no error occurred, -1 or -2 otherwise.
+ */
+ int32_t libsais_omp(const uint8_t * T, int32_t * SA, int32_t n, int32_t fs, int32_t * freq, int32_t threads);
+#endif
+
+ /**
+ * Constructs the burrows-wheeler transformed string of a given string.
+ * @param T [0..n-1] The input string.
+ * @param U [0..n-1] The output string (can be T).
+ * @param A [0..n-1+fs] The temporary array.
+ * @param n The length of the given string.
+ * @param fs The extra space available at the end of A array (can be 0).
+ * @param freq [0..255] The output symbol frequency table (can be NULL).
+ * @return The primary index if no error occurred, -1 or -2 otherwise.
+ */
+ int32_t libsais_bwt(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq);
+
+ /**
+ * Constructs the burrows-wheeler transformed string of a given string with auxiliary indexes.
+ * @param T [0..n-1] The input string.
+ * @param U [0..n-1] The output string (can be T).
+ * @param A [0..n-1+fs] The temporary array.
+ * @param n The length of the given string.
+ * @param fs The extra space available at the end of A array (can be 0).
+ * @param freq [0..255] The output symbol frequency table (can be NULL).
+ * @param r The sampling rate for auxiliary indexes (must be power of 2).
+ * @param I [0..(n-1)/r] The output auxiliary indexes.
+ * @return 0 if no error occurred, -1 or -2 otherwise.
+ */
+ int32_t libsais_bwt_aux(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t r, int32_t * I);
+
+ /**
+ * Constructs the burrows-wheeler transformed string of a given string using libsais context.
+ * @param ctx The libsais context.
+ * @param T [0..n-1] The input string.
+ * @param U [0..n-1] The output string (can be T).
+ * @param A [0..n-1+fs] The temporary array.
+ * @param n The length of the given string.
+ * @param fs The extra space available at the end of A array (can be 0).
+ * @param freq [0..255] The output symbol frequency table (can be NULL).
+ * @return The primary index if no error occurred, -1 or -2 otherwise.
+ */
+ int32_t libsais_bwt_ctx(const void * ctx, const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq);
+
+ /**
+ * Constructs the burrows-wheeler transformed string of a given string with auxiliary indexes using libsais context.
+ * @param ctx The libsais context.
+ * @param T [0..n-1] The input string.
+ * @param U [0..n-1] The output string (can be T).
+ * @param A [0..n-1+fs] The temporary array.
+ * @param n The length of the given string.
+ * @param fs The extra space available at the end of A array (can be 0).
+ * @param freq [0..255] The output symbol frequency table (can be NULL).
+ * @param r The sampling rate for auxiliary indexes (must be power of 2).
+ * @param I [0..(n-1)/r] The output auxiliary indexes.
+ * @return 0 if no error occurred, -1 or -2 otherwise.
+ */
+ int32_t libsais_bwt_aux_ctx(const void * ctx, const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t r, int32_t * I);
+
+#if defined(_OPENMP)
+ /**
+ * Constructs the burrows-wheeler transformed string of a given string in parallel using OpenMP.
+ * @param T [0..n-1] The input string.
+ * @param U [0..n-1] The output string (can be T).
+ * @param A [0..n-1+fs] The temporary array.
+ * @param n The length of the given string.
+ * @param fs The extra space available at the end of A array (can be 0).
+ * @param freq [0..255] The output symbol frequency table (can be NULL).
+ * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default).
+ * @return The primary index if no error occurred, -1 or -2 otherwise.
+ */
+ int32_t libsais_bwt_omp(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t threads);
+
+ /**
+ * Constructs the burrows-wheeler transformed string of a given string with auxiliary indexes in parallel using OpenMP.
+ * @param T [0..n-1] The input string.
+ * @param U [0..n-1] The output string (can be T).
+ * @param A [0..n-1+fs] The temporary array.
+ * @param n The length of the given string.
+ * @param fs The extra space available at the end of A array (can be 0).
+ * @param freq [0..255] The output symbol frequency table (can be NULL).
+ * @param r The sampling rate for auxiliary indexes (must be power of 2).
+ * @param I [0..(n-1)/r] The output auxiliary indexes.
+ * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default).
+ * @return 0 if no error occurred, -1 or -2 otherwise.
+ */
+ int32_t libsais_bwt_aux_omp(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t r, int32_t * I, int32_t threads);
+#endif
+
+ /**
+ * Creates the libsais reverse BWT context that allows reusing allocated memory with each libsais_unbwt_* operation.
+ * In multi-threaded environments, use one context per thread for parallel executions.
+ * @return the libsais context, NULL otherwise.
+ */
+ void * libsais_unbwt_create_ctx(void);
+
+#if defined(_OPENMP)
+ /**
+ * Creates the libsais reverse BWT context that allows reusing allocated memory with each parallel libsais_unbwt_* operation using OpenMP.
+ * In multi-threaded environments, use one context per thread for parallel executions.
+ * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default).
+ * @return the libsais context, NULL otherwise.
+ */
+ void * libsais_unbwt_create_ctx_omp(int32_t threads);
+#endif
+
+ /**
+ * Destroys the libsais reverse BWT context and frees previously allocated memory.
+ * @param ctx The libsais context (can be NULL).
+ */
+ void libsais_unbwt_free_ctx(void * ctx);
+
+ /**
+ * Constructs the original string from a given burrows-wheeler transformed string with primary index.
+ * @param T [0..n-1] The input string.
+ * @param U [0..n-1] The output string (can be T).
+ * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1 size).
+ * @param n The length of the given string.
+ * @param freq [0..255] The input symbol frequency table (can be NULL).
+ * @param i The primary index.
+ * @return 0 if no error occurred, -1 or -2 otherwise.
+ */
+ int32_t libsais_unbwt(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t i);
+
+ /**
+ * Constructs the original string from a given burrows-wheeler transformed string with primary index using libsais reverse BWT context.
+ * @param ctx The libsais reverse BWT context.
+ * @param T [0..n-1] The input string.
+ * @param U [0..n-1] The output string (can be T).
+ * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1 size).
+ * @param n The length of the given string.
+ * @param freq [0..255] The input symbol frequency table (can be NULL).
+ * @param i The primary index.
+ * @return 0 if no error occurred, -1 or -2 otherwise.
+ */
+ int32_t libsais_unbwt_ctx(const void * ctx, const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t i);
+
+ /**
+ * Constructs the original string from a given burrows-wheeler transformed string with auxiliary indexes.
+ * @param T [0..n-1] The input string.
+ * @param U [0..n-1] The output string (can be T).
+ * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1 size).
+ * @param n The length of the given string.
+ * @param freq [0..255] The input symbol frequency table (can be NULL).
+ * @param r The sampling rate for auxiliary indexes (must be power of 2).
+ * @param I [0..(n-1)/r] The input auxiliary indexes.
+ * @return 0 if no error occurred, -1 or -2 otherwise.
+ */
+ int32_t libsais_unbwt_aux(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t r, const int32_t * I);
+
+ /**
+ * Constructs the original string from a given burrows-wheeler transformed string with auxiliary indexes using libsais reverse BWT context.
+ * @param ctx The libsais reverse BWT context.
+ * @param T [0..n-1] The input string.
+ * @param U [0..n-1] The output string (can be T).
+ * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1 size).
+ * @param n The length of the given string.
+ * @param freq [0..255] The input symbol frequency table (can be NULL).
+ * @param r The sampling rate for auxiliary indexes (must be power of 2).
+ * @param I [0..(n-1)/r] The input auxiliary indexes.
+ * @return 0 if no error occurred, -1 or -2 otherwise.
+ */
+ int32_t libsais_unbwt_aux_ctx(const void * ctx, const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t r, const int32_t * I);
+
+#if defined(_OPENMP)
+ /**
+ * Constructs the original string from a given burrows-wheeler transformed string with primary index in parallel using OpenMP.
+ * @param T [0..n-1] The input string.
+ * @param U [0..n-1] The output string (can be T).
+ * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1 size).
+ * @param n The length of the given string.
+ * @param freq [0..255] The input symbol frequency table (can be NULL).
+ * @param i The primary index.
+ * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default).
+ * @return 0 if no error occurred, -1 or -2 otherwise.
+ */
+ int32_t libsais_unbwt_omp(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t i, int32_t threads);
+
+ /**
+ * Constructs the original string from a given burrows-wheeler transformed string with auxiliary indexes in parallel using OpenMP.
+ * @param T [0..n-1] The input string.
+ * @param U [0..n-1] The output string (can be T).
+ * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1 size).
+ * @param n The length of the given string.
+ * @param freq [0..255] The input symbol frequency table (can be NULL).
+ * @param r The sampling rate for auxiliary indexes (must be power of 2).
+ * @param I [0..(n-1)/r] The input auxiliary indexes.
+ * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default).
+ * @return 0 if no error occurred, -1 or -2 otherwise.
+ */
+ int32_t libsais_unbwt_aux_omp(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t r, const int32_t * I, int32_t threads);
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/libsais/libsais16.c b/libsais/libsais16.c
new file mode 100644
index 0000000..9a8d95b
--- /dev/null
+++ b/libsais/libsais16.c
@@ -0,0 +1,7342 @@
+/*--
+
+This file is a part of libsais, a library for linear time
+suffix array and burrows wheeler transform construction.
+
+ Copyright (c) 2021 Ilya Grebnov
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+
+Please see the file LICENSE for full copyright information.
+
+--*/
+
+#include "libsais16.h"
+
+ #include <stddef.h>
+ #include <stdint.h>
+ #include <stdlib.h>
+ #include <string.h>
+ #include <limits.h>
+
+#if defined(_OPENMP)
+ #include <omp.h>
+#else
+ #define UNUSED(_x) (void)(_x)
+#endif
+
+typedef int32_t sa_sint_t;
+typedef uint32_t sa_uint_t;
+typedef ptrdiff_t fast_sint_t;
+typedef size_t fast_uint_t;
+
+#define SAINT_BIT (32)
+#define SAINT_MAX INT32_MAX
+#define SAINT_MIN INT32_MIN
+
+#define ALPHABET_SIZE (1 << CHAR_BIT << CHAR_BIT)
+#define UNBWT_FASTBITS (17)
+
+#define SUFFIX_GROUP_BIT (SAINT_BIT - 1)
+#define SUFFIX_GROUP_MARKER (((sa_sint_t)1) << (SUFFIX_GROUP_BIT - 1))
+
+#define BUCKETS_INDEX2(_c, _s) (((_c) << 1) + (_s))
+#define BUCKETS_INDEX4(_c, _s) (((_c) << 2) + (_s))
+
+#define LIBSAIS_PER_THREAD_CACHE_SIZE (24576)
+
+typedef struct LIBSAIS_THREAD_CACHE
+{
+ sa_sint_t symbol;
+ sa_sint_t index;
+} LIBSAIS_THREAD_CACHE;
+
+typedef union LIBSAIS_THREAD_STATE
+{
+ struct
+ {
+ fast_sint_t position;
+ fast_sint_t count;
+
+ fast_sint_t m;
+ fast_sint_t last_lms_suffix;
+
+ sa_sint_t * buckets;
+ LIBSAIS_THREAD_CACHE * cache;
+ } state;
+
+ uint8_t padding[64];
+} LIBSAIS_THREAD_STATE;
+
+typedef struct LIBSAIS_CONTEXT
+{
+ sa_sint_t * buckets;
+ LIBSAIS_THREAD_STATE * thread_state;
+ fast_sint_t threads;
+} LIBSAIS_CONTEXT;
+
+typedef struct LIBSAIS_UNBWT_CONTEXT
+{
+ sa_uint_t * bucket2;
+ uint16_t * fastbits;
+ sa_uint_t * buckets;
+ fast_sint_t threads;
+} LIBSAIS_UNBWT_CONTEXT;
+
+#if defined(__GNUC__) || defined(__clang__)
+ #define RESTRICT __restrict__
+#elif defined(_MSC_VER) || defined(__INTEL_COMPILER)
+ #define RESTRICT __restrict
+#else
+ #error Your compiler, configuration or platform is not supported.
+#endif
+
+#if defined(__has_builtin)
+ #if __has_builtin(__builtin_prefetch)
+ #define HAS_BUILTIN_PREFECTCH
+ #endif
+#elif defined(__GNUC__) && __GNUC__ > 3
+ #define HAS_BUILTIN_PREFECTCH
+#endif
+
+#if defined(HAS_BUILTIN_PREFECTCH)
+ #define libsais16_prefetch(address) __builtin_prefetch((const void *)(address), 0, 0)
+ #define libsais16_prefetchw(address) __builtin_prefetch((const void *)(address), 1, 0)
+#elif defined (_M_IX86) || defined (_M_AMD64)
+ #include <intrin.h>
+ #define libsais16_prefetch(address) _mm_prefetch((const void *)(address), _MM_HINT_NTA)
+ #define libsais16_prefetchw(address) _m_prefetchw((const void *)(address))
+#elif defined (_M_ARM)
+ #include <intrin.h>
+ #define libsais16_prefetch(address) __prefetch((const void *)(address))
+ #define libsais16_prefetchw(address) __prefetchw((const void *)(address))
+#elif defined (_M_ARM64)
+ #include <intrin.h>
+ #define libsais16_prefetch(address) __prefetch2((const void *)(address), 1)
+ #define libsais16_prefetchw(address) __prefetch2((const void *)(address), 17)
+#else
+ #error Your compiler, configuration or platform is not supported.
+#endif
+
+#if !defined(__LITTLE_ENDIAN__) && !defined(__BIG_ENDIAN__)
+ #if defined(_LITTLE_ENDIAN) \
+ || (defined(BYTE_ORDER) && defined(LITTLE_ENDIAN) && BYTE_ORDER == LITTLE_ENDIAN) \
+ || (defined(_BYTE_ORDER) && defined(_LITTLE_ENDIAN) && _BYTE_ORDER == _LITTLE_ENDIAN) \
+ || (defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && __BYTE_ORDER == __LITTLE_ENDIAN) \
+ || (defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+ #define __LITTLE_ENDIAN__
+ #elif defined(_BIG_ENDIAN) \
+ || (defined(BYTE_ORDER) && defined(BIG_ENDIAN) && BYTE_ORDER == BIG_ENDIAN) \
+ || (defined(_BYTE_ORDER) && defined(_BIG_ENDIAN) && _BYTE_ORDER == _BIG_ENDIAN) \
+ || (defined(__BYTE_ORDER) && defined(__BIG_ENDIAN) && __BYTE_ORDER == __BIG_ENDIAN) \
+ || (defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ #define __BIG_ENDIAN__
+ #elif defined(_WIN32)
+ #define __LITTLE_ENDIAN__
+ #endif
+#endif
+
+static void * libsais16_align_up(const void * address, size_t alignment)
+{
+ return (void *)((((ptrdiff_t)address) + ((ptrdiff_t)alignment) - 1) & (-((ptrdiff_t)alignment)));
+}
+
+static void * libsais16_alloc_aligned(size_t size, size_t alignment)
+{
+ void * address = malloc(size + sizeof(short) + alignment - 1);
+ if (address != NULL)
+ {
+ void * aligned_address = libsais16_align_up((void *)((ptrdiff_t)address + (ptrdiff_t)(sizeof(short))), alignment);
+ ((short *)aligned_address)[-1] = (short)((ptrdiff_t)aligned_address - (ptrdiff_t)address);
+
+ return aligned_address;
+ }
+
+ return NULL;
+}
+
+static void libsais16_free_aligned(void * aligned_address)
+{
+ if (aligned_address != NULL)
+ {
+ free((void *)((ptrdiff_t)aligned_address - ((short *)aligned_address)[-1]));
+ }
+}
+
+static LIBSAIS_THREAD_STATE * libsais16_alloc_thread_state(sa_sint_t threads)
+{
+ LIBSAIS_THREAD_STATE * RESTRICT thread_state = (LIBSAIS_THREAD_STATE *)libsais16_alloc_aligned((size_t)threads * sizeof(LIBSAIS_THREAD_STATE), 4096);
+ sa_sint_t * RESTRICT thread_buckets = (sa_sint_t *)libsais16_alloc_aligned((size_t)threads * 4 * ALPHABET_SIZE * sizeof(sa_sint_t), 4096);
+ LIBSAIS_THREAD_CACHE * RESTRICT thread_cache = (LIBSAIS_THREAD_CACHE *)libsais16_alloc_aligned((size_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE * sizeof(LIBSAIS_THREAD_CACHE), 4096);
+
+ if (thread_state != NULL && thread_buckets != NULL && thread_cache != NULL)
+ {
+ fast_sint_t t;
+ for (t = 0; t < threads; ++t)
+ {
+ thread_state[t].state.buckets = thread_buckets; thread_buckets += 4 * ALPHABET_SIZE;
+ thread_state[t].state.cache = thread_cache; thread_cache += LIBSAIS_PER_THREAD_CACHE_SIZE;
+ }
+
+ return thread_state;
+ }
+
+ libsais16_free_aligned(thread_cache);
+ libsais16_free_aligned(thread_buckets);
+ libsais16_free_aligned(thread_state);
+ return NULL;
+}
+
+static void libsais16_free_thread_state(LIBSAIS_THREAD_STATE * thread_state)
+{
+ if (thread_state != NULL)
+ {
+ libsais16_free_aligned(thread_state[0].state.cache);
+ libsais16_free_aligned(thread_state[0].state.buckets);
+ libsais16_free_aligned(thread_state);
+ }
+}
+
+static LIBSAIS_CONTEXT * libsais16_create_ctx_main(sa_sint_t threads)
+{
+ LIBSAIS_CONTEXT * RESTRICT ctx = (LIBSAIS_CONTEXT *)libsais16_alloc_aligned(sizeof(LIBSAIS_CONTEXT), 64);
+ sa_sint_t * RESTRICT buckets = (sa_sint_t *)libsais16_alloc_aligned(8 * ALPHABET_SIZE * sizeof(sa_sint_t), 4096);
+ LIBSAIS_THREAD_STATE * RESTRICT thread_state = threads > 1 ? libsais16_alloc_thread_state(threads) : NULL;
+
+ if (ctx != NULL && buckets != NULL && (thread_state != NULL || threads == 1))
+ {
+ ctx->buckets = buckets;
+ ctx->threads = threads;
+ ctx->thread_state = thread_state;
+
+ return ctx;
+ }
+
+ libsais16_free_thread_state(thread_state);
+ libsais16_free_aligned(buckets);
+ libsais16_free_aligned(ctx);
+ return NULL;
+}
+
+static void libsais16_free_ctx_main(LIBSAIS_CONTEXT * ctx)
+{
+ if (ctx != NULL)
+ {
+ libsais16_free_thread_state(ctx->thread_state);
+ libsais16_free_aligned(ctx->buckets);
+ libsais16_free_aligned(ctx);
+ }
+}
+
+#if defined(_OPENMP)
+
+static sa_sint_t libsais16_count_negative_marked_suffixes(sa_sint_t * RESTRICT SA, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ sa_sint_t count = 0;
+
+ fast_sint_t i; for (i = omp_block_start; i < omp_block_start + omp_block_size; ++i) { count += (SA[i] < 0); }
+
+ return count;
+}
+
+static sa_sint_t libsais16_count_zero_marked_suffixes(sa_sint_t * RESTRICT SA, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ sa_sint_t count = 0;
+
+ fast_sint_t i; for (i = omp_block_start; i < omp_block_start + omp_block_size; ++i) { count += (SA[i] == 0); }
+
+ return count;
+}
+
+static void libsais16_place_cached_suffixes(sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ fast_sint_t i, j;
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4)
+ {
+ libsais16_prefetch(&cache[i + 2 * prefetch_distance]);
+
+ libsais16_prefetchw(&SA[cache[i + prefetch_distance + 0].symbol]);
+ libsais16_prefetchw(&SA[cache[i + prefetch_distance + 1].symbol]);
+ libsais16_prefetchw(&SA[cache[i + prefetch_distance + 2].symbol]);
+ libsais16_prefetchw(&SA[cache[i + prefetch_distance + 3].symbol]);
+
+ SA[cache[i + 0].symbol] = cache[i + 0].index;
+ SA[cache[i + 1].symbol] = cache[i + 1].index;
+ SA[cache[i + 2].symbol] = cache[i + 2].index;
+ SA[cache[i + 3].symbol] = cache[i + 3].index;
+ }
+
+ for (j += prefetch_distance + 3; i < j; i += 1)
+ {
+ SA[cache[i].symbol] = cache[i].index;
+ }
+}
+
+static void libsais16_compact_and_place_cached_suffixes(sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ fast_sint_t i, j, l;
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - 3, l = omp_block_start; i < j; i += 4)
+ {
+ libsais16_prefetchw(&cache[i + prefetch_distance]);
+
+ cache[l] = cache[i + 0]; l += cache[l].symbol >= 0;
+ cache[l] = cache[i + 1]; l += cache[l].symbol >= 0;
+ cache[l] = cache[i + 2]; l += cache[l].symbol >= 0;
+ cache[l] = cache[i + 3]; l += cache[l].symbol >= 0;
+ }
+
+ for (j += 3; i < j; i += 1)
+ {
+ cache[l] = cache[i]; l += cache[l].symbol >= 0;
+ }
+
+ libsais16_place_cached_suffixes(SA, cache, omp_block_start, l - omp_block_start);
+}
+
+static void libsais16_accumulate_counts_s32_2(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size, fast_sint_t bucket_stride)
+{
+ sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride;
+ fast_sint_t s; for (s = 0; s < bucket_size; s += 1) { bucket00[s] = bucket00[s] + bucket01[s]; }
+}
+
+static void libsais16_accumulate_counts_s32_3(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size, fast_sint_t bucket_stride)
+{
+ sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride;
+ sa_sint_t * RESTRICT bucket02 = bucket01 - bucket_stride;
+ fast_sint_t s; for (s = 0; s < bucket_size; s += 1) { bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s]; }
+}
+
+static void libsais16_accumulate_counts_s32_4(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size, fast_sint_t bucket_stride)
+{
+ sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride;
+ sa_sint_t * RESTRICT bucket02 = bucket01 - bucket_stride;
+ sa_sint_t * RESTRICT bucket03 = bucket02 - bucket_stride;
+ fast_sint_t s; for (s = 0; s < bucket_size; s += 1) { bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s] + bucket03[s]; }
+}
+
+static void libsais16_accumulate_counts_s32_5(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size, fast_sint_t bucket_stride)
+{
+ sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride;
+ sa_sint_t * RESTRICT bucket02 = bucket01 - bucket_stride;
+ sa_sint_t * RESTRICT bucket03 = bucket02 - bucket_stride;
+ sa_sint_t * RESTRICT bucket04 = bucket03 - bucket_stride;
+ fast_sint_t s; for (s = 0; s < bucket_size; s += 1) { bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s] + bucket03[s] + bucket04[s]; }
+}
+
+static void libsais16_accumulate_counts_s32_6(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size, fast_sint_t bucket_stride)
+{
+ sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride;
+ sa_sint_t * RESTRICT bucket02 = bucket01 - bucket_stride;
+ sa_sint_t * RESTRICT bucket03 = bucket02 - bucket_stride;
+ sa_sint_t * RESTRICT bucket04 = bucket03 - bucket_stride;
+ sa_sint_t * RESTRICT bucket05 = bucket04 - bucket_stride;
+ fast_sint_t s; for (s = 0; s < bucket_size; s += 1) { bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s] + bucket03[s] + bucket04[s] + bucket05[s]; }
+}
+
+static void libsais16_accumulate_counts_s32_7(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size, fast_sint_t bucket_stride)
+{
+ sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride;
+ sa_sint_t * RESTRICT bucket02 = bucket01 - bucket_stride;
+ sa_sint_t * RESTRICT bucket03 = bucket02 - bucket_stride;
+ sa_sint_t * RESTRICT bucket04 = bucket03 - bucket_stride;
+ sa_sint_t * RESTRICT bucket05 = bucket04 - bucket_stride;
+ sa_sint_t * RESTRICT bucket06 = bucket05 - bucket_stride;
+ fast_sint_t s; for (s = 0; s < bucket_size; s += 1) { bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s] + bucket03[s] + bucket04[s] + bucket05[s] + bucket06[s]; }
+}
+
+static void libsais16_accumulate_counts_s32_8(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size, fast_sint_t bucket_stride)
+{
+ sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride;
+ sa_sint_t * RESTRICT bucket02 = bucket01 - bucket_stride;
+ sa_sint_t * RESTRICT bucket03 = bucket02 - bucket_stride;
+ sa_sint_t * RESTRICT bucket04 = bucket03 - bucket_stride;
+ sa_sint_t * RESTRICT bucket05 = bucket04 - bucket_stride;
+ sa_sint_t * RESTRICT bucket06 = bucket05 - bucket_stride;
+ sa_sint_t * RESTRICT bucket07 = bucket06 - bucket_stride;
+ fast_sint_t s; for (s = 0; s < bucket_size; s += 1) { bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s] + bucket03[s] + bucket04[s] + bucket05[s] + bucket06[s] + bucket07[s]; }
+}
+
+static void libsais16_accumulate_counts_s32_9(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size, fast_sint_t bucket_stride)
+{
+ sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride;
+ sa_sint_t * RESTRICT bucket02 = bucket01 - bucket_stride;
+ sa_sint_t * RESTRICT bucket03 = bucket02 - bucket_stride;
+ sa_sint_t * RESTRICT bucket04 = bucket03 - bucket_stride;
+ sa_sint_t * RESTRICT bucket05 = bucket04 - bucket_stride;
+ sa_sint_t * RESTRICT bucket06 = bucket05 - bucket_stride;
+ sa_sint_t * RESTRICT bucket07 = bucket06 - bucket_stride;
+ sa_sint_t * RESTRICT bucket08 = bucket07 - bucket_stride;
+ fast_sint_t s; for (s = 0; s < bucket_size; s += 1) { bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s] + bucket03[s] + bucket04[s] + bucket05[s] + bucket06[s] + bucket07[s] + bucket08[s]; }
+}
+
+static void libsais16_accumulate_counts_s32(sa_sint_t * RESTRICT buckets, fast_sint_t bucket_size, fast_sint_t bucket_stride, fast_sint_t num_buckets)
+{
+ while (num_buckets >= 9)
+ {
+ libsais16_accumulate_counts_s32_9(buckets - (num_buckets - 9) * bucket_stride, bucket_size, bucket_stride); num_buckets -= 8;
+ }
+
+ switch (num_buckets)
+ {
+ case 1: break;
+ case 2: libsais16_accumulate_counts_s32_2(buckets, bucket_size, bucket_stride); break;
+ case 3: libsais16_accumulate_counts_s32_3(buckets, bucket_size, bucket_stride); break;
+ case 4: libsais16_accumulate_counts_s32_4(buckets, bucket_size, bucket_stride); break;
+ case 5: libsais16_accumulate_counts_s32_5(buckets, bucket_size, bucket_stride); break;
+ case 6: libsais16_accumulate_counts_s32_6(buckets, bucket_size, bucket_stride); break;
+ case 7: libsais16_accumulate_counts_s32_7(buckets, bucket_size, bucket_stride); break;
+ case 8: libsais16_accumulate_counts_s32_8(buckets, bucket_size, bucket_stride); break;
+ }
+}
+
+#endif
+
+static void libsais16_gather_lms_suffixes_16u(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, fast_sint_t m, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ if (omp_block_size > 0)
+ {
+ const fast_sint_t prefetch_distance = 128;
+
+ fast_sint_t i, j = omp_block_start + omp_block_size, c0 = T[omp_block_start + omp_block_size - 1], c1 = -1;
+
+ while (j < n && (c1 = T[j]) == c0) { ++j; }
+
+ fast_uint_t s = c0 >= c1;
+
+ for (i = omp_block_start + omp_block_size - 2, j = omp_block_start + 3; i >= j; i -= 4)
+ {
+ libsais16_prefetch(&T[i - prefetch_distance]);
+
+ c1 = T[i - 0]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((s & 3) == 1);
+ c0 = T[i - 1]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 0); m -= ((s & 3) == 1);
+ c1 = T[i - 2]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 1); m -= ((s & 3) == 1);
+ c0 = T[i - 3]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 2); m -= ((s & 3) == 1);
+ }
+
+ for (j -= 3; i >= j; i -= 1)
+ {
+ c1 = c0; c0 = T[i]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((s & 3) == 1);
+ }
+
+ SA[m] = (sa_sint_t)(i + 1);
+ }
+}
+
+static void libsais16_gather_lms_suffixes_16u_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+#if defined(_OPENMP)
+ #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536 && omp_get_dynamic() == 0)
+#endif
+ {
+#if defined(_OPENMP)
+ fast_sint_t omp_thread_num = omp_get_thread_num();
+ fast_sint_t omp_num_threads = omp_get_num_threads();
+#else
+ UNUSED(threads); UNUSED(thread_state);
+
+ fast_sint_t omp_thread_num = 0;
+ fast_sint_t omp_num_threads = 1;
+#endif
+ fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
+ fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+ fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;
+
+ if (omp_num_threads == 1)
+ {
+ libsais16_gather_lms_suffixes_16u(T, SA, n, (fast_sint_t)n - 1, omp_block_start, omp_block_size);
+ }
+#if defined(_OPENMP)
+ else
+ {
+ fast_sint_t t, m = 0; for (t = omp_num_threads - 1; t > omp_thread_num; --t) { m += thread_state[t].state.m; }
+
+ libsais16_gather_lms_suffixes_16u(T, SA, n, (fast_sint_t)n - 1 - m, omp_block_start, omp_block_size);
+
+ #pragma omp barrier
+
+ if (thread_state[omp_thread_num].state.m > 0)
+ {
+ SA[(fast_sint_t)n - 1 - m] = (sa_sint_t)thread_state[omp_thread_num].state.last_lms_suffix;
+ }
+ }
+#endif
+ }
+}
+
+static sa_sint_t libsais16_gather_lms_suffixes_32s(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ sa_sint_t i = n - 2;
+ sa_sint_t m = n - 1;
+ fast_uint_t s = 1;
+ fast_sint_t c0 = T[n - 1];
+ fast_sint_t c1 = 0;
+
+ for (; i >= 3; i -= 4)
+ {
+ libsais16_prefetch(&T[i - prefetch_distance]);
+
+ c1 = T[i - 0]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = i + 1; m -= ((s & 3) == 1);
+ c0 = T[i - 1]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = i - 0; m -= ((s & 3) == 1);
+ c1 = T[i - 2]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = i - 1; m -= ((s & 3) == 1);
+ c0 = T[i - 3]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = i - 2; m -= ((s & 3) == 1);
+ }
+
+ for (; i >= 0; i -= 1)
+ {
+ c1 = c0; c0 = T[i]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = i + 1; m -= ((s & 3) == 1);
+ }
+
+ return n - 1 - m;
+}
+
+static sa_sint_t libsais16_gather_compacted_lms_suffixes_32s(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ sa_sint_t i = n - 2;
+ sa_sint_t m = n - 1;
+ fast_uint_t s = 1;
+ fast_sint_t c0 = T[n - 1];
+ fast_sint_t c1 = 0;
+
+ for (; i >= 3; i -= 4)
+ {
+ libsais16_prefetch(&T[i - prefetch_distance]);
+
+ c1 = T[i - 0]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = i + 1; m -= ((fast_sint_t)(s & 3) == (c0 >= 0));
+ c0 = T[i - 1]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = i - 0; m -= ((fast_sint_t)(s & 3) == (c1 >= 0));
+ c1 = T[i - 2]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = i - 1; m -= ((fast_sint_t)(s & 3) == (c0 >= 0));
+ c0 = T[i - 3]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = i - 2; m -= ((fast_sint_t)(s & 3) == (c1 >= 0));
+ }
+
+ for (; i >= 0; i -= 1)
+ {
+ c1 = c0; c0 = T[i]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = i + 1; m -= ((fast_sint_t)(s & 3) == (c1 >= 0));
+ }
+
+ return n - 1 - m;
+}
+
+#if defined(_OPENMP)
+
+static void libsais16_count_lms_suffixes_32s_4k(const sa_sint_t * RESTRICT T, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ memset(buckets, 0, 4 * (size_t)k * sizeof(sa_sint_t));
+
+ sa_sint_t i = n - 2;
+ fast_uint_t s = 1;
+ fast_sint_t c0 = T[n - 1];
+ fast_sint_t c1 = 0;
+
+ for (; i >= prefetch_distance + 3; i -= 4)
+ {
+ libsais16_prefetch(&T[i - 2 * prefetch_distance]);
+
+ libsais16_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 0], 0)]);
+ libsais16_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 1], 0)]);
+ libsais16_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 2], 0)]);
+ libsais16_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 3], 0)]);
+
+ c1 = T[i - 0]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
+ buckets[BUCKETS_INDEX4((fast_uint_t)c0, s & 3)]++;
+
+ c0 = T[i - 1]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+ buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]++;
+
+ c1 = T[i - 2]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
+ buckets[BUCKETS_INDEX4((fast_uint_t)c0, s & 3)]++;
+
+ c0 = T[i - 3]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+ buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]++;
+ }
+
+ for (; i >= 0; i -= 1)
+ {
+ c1 = c0; c0 = T[i]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+ buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]++;
+ }
+
+ buckets[BUCKETS_INDEX4((fast_uint_t)c0, (s << 1) & 3)]++;
+}
+
+#endif
+
+static void libsais16_count_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ memset(buckets, 0, 2 * (size_t)k * sizeof(sa_sint_t));
+
+ sa_sint_t i = n - 2;
+ fast_uint_t s = 1;
+ fast_sint_t c0 = T[n - 1];
+ fast_sint_t c1 = 0;
+
+ for (; i >= prefetch_distance + 3; i -= 4)
+ {
+ libsais16_prefetch(&T[i - 2 * prefetch_distance]);
+
+ libsais16_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 0], 0)]);
+ libsais16_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 1], 0)]);
+ libsais16_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 2], 0)]);
+ libsais16_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 3], 0)]);
+
+ c1 = T[i - 0]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
+ buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++;
+
+ c0 = T[i - 1]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+ buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++;
+
+ c1 = T[i - 2]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
+ buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++;
+
+ c0 = T[i - 3]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+ buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++;
+ }
+
+ for (; i >= 0; i -= 1)
+ {
+ c1 = c0; c0 = T[i]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+ buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++;
+ }
+
+ buckets[BUCKETS_INDEX2((fast_uint_t)c0, 0)]++;
+}
+
+#if defined(_OPENMP)
+
+static void libsais16_count_compacted_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ memset(buckets, 0, 2 * (size_t)k * sizeof(sa_sint_t));
+
+ sa_sint_t i = n - 2;
+ fast_uint_t s = 1;
+ fast_sint_t c0 = T[n - 1];
+ fast_sint_t c1 = 0;
+
+ for (; i >= prefetch_distance + 3; i -= 4)
+ {
+ libsais16_prefetch(&T[i - 2 * prefetch_distance]);
+
+ libsais16_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 0] & SAINT_MAX, 0)]);
+ libsais16_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 1] & SAINT_MAX, 0)]);
+ libsais16_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 2] & SAINT_MAX, 0)]);
+ libsais16_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 3] & SAINT_MAX, 0)]);
+
+ c1 = T[i - 0]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
+ c0 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++;
+
+ c0 = T[i - 1]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+ c1 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++;
+
+ c1 = T[i - 2]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
+ c0 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++;
+
+ c0 = T[i - 3]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+ c1 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++;
+ }
+
+ for (; i >= 0; i -= 1)
+ {
+ c1 = c0; c0 = T[i]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+ c1 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++;
+ }
+
+ c0 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c0, 0)]++;
+}
+
+#endif
+
+static sa_sint_t libsais16_count_and_gather_lms_suffixes_16u(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ memset(buckets, 0, 4 * ALPHABET_SIZE * sizeof(sa_sint_t));
+
+ fast_sint_t m = omp_block_start + omp_block_size - 1;
+
+ if (omp_block_size > 0)
+ {
+ const fast_sint_t prefetch_distance = 128;
+
+ fast_sint_t i, j = m + 1, c0 = T[m], c1 = -1;
+
+ while (j < n && (c1 = T[j]) == c0) { ++j; }
+
+ fast_uint_t s = c0 >= c1;
+
+ for (i = m - 1, j = omp_block_start + 3; i >= j; i -= 4)
+ {
+ libsais16_prefetch(&T[i - prefetch_distance]);
+
+ c1 = T[i - 0]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((s & 3) == 1);
+ buckets[BUCKETS_INDEX4((fast_uint_t)c0, s & 3)]++;
+
+ c0 = T[i - 1]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 0); m -= ((s & 3) == 1);
+ buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]++;
+
+ c1 = T[i - 2]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 1); m -= ((s & 3) == 1);
+ buckets[BUCKETS_INDEX4((fast_uint_t)c0, s & 3)]++;
+
+ c0 = T[i - 3]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 2); m -= ((s & 3) == 1);
+ buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]++;
+ }
+
+ for (j -= 3; i >= j; i -= 1)
+ {
+ c1 = c0; c0 = T[i]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((s & 3) == 1);
+ buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]++;
+ }
+
+ c1 = (i >= 0) ? T[i] : -1; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((s & 3) == 1);
+ buckets[BUCKETS_INDEX4((fast_uint_t)c0, s & 3)]++;
+ }
+
+ return (sa_sint_t)(omp_block_start + omp_block_size - 1 - m);
+}
+
+static sa_sint_t libsais16_count_and_gather_lms_suffixes_16u_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+ sa_sint_t m = 0;
+
+#if defined(_OPENMP)
+ #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536 && omp_get_dynamic() == 0)
+#endif
+ {
+#if defined(_OPENMP)
+ fast_sint_t omp_thread_num = omp_get_thread_num();
+ fast_sint_t omp_num_threads = omp_get_num_threads();
+#else
+ UNUSED(threads); UNUSED(thread_state);
+
+ fast_sint_t omp_thread_num = 0;
+ fast_sint_t omp_num_threads = 1;
+#endif
+ fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
+ fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+ fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;
+
+ if (omp_num_threads == 1)
+ {
+ m = libsais16_count_and_gather_lms_suffixes_16u(T, SA, n, buckets, omp_block_start, omp_block_size);
+ }
+#if defined(_OPENMP)
+ else
+ {
+ {
+ thread_state[omp_thread_num].state.position = omp_block_start + omp_block_size;
+ thread_state[omp_thread_num].state.m = libsais16_count_and_gather_lms_suffixes_16u(T, SA, n, thread_state[omp_thread_num].state.buckets, omp_block_start, omp_block_size);
+
+ if (thread_state[omp_thread_num].state.m > 0)
+ {
+ thread_state[omp_thread_num].state.last_lms_suffix = SA[thread_state[omp_thread_num].state.position - 1];
+ }
+ }
+
+ #pragma omp barrier
+
+ #pragma omp master
+ {
+ memset(buckets, 0, 4 * ALPHABET_SIZE * sizeof(sa_sint_t));
+
+ fast_sint_t t;
+ for (t = omp_num_threads - 1; t >= 0; --t)
+ {
+ m += (sa_sint_t)thread_state[t].state.m;
+
+ if (t != omp_num_threads - 1 && thread_state[t].state.m > 0)
+ {
+ memcpy(&SA[n - m], &SA[thread_state[t].state.position - thread_state[t].state.m], (size_t)thread_state[t].state.m * sizeof(sa_sint_t));
+ }
+
+ {
+ sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets;
+ fast_sint_t s; for (s = 0; s < 4 * ALPHABET_SIZE; s += 1) { sa_sint_t A = buckets[s], B = temp_bucket[s]; buckets[s] = A + B; temp_bucket[s] = A; }
+ }
+ }
+ }
+ }
+#endif
+ }
+
+ return m;
+}
+
+static sa_sint_t libsais16_count_and_gather_lms_suffixes_32s_4k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ memset(buckets, 0, 4 * (size_t)k * sizeof(sa_sint_t));
+
+ fast_sint_t m = omp_block_start + omp_block_size - 1;
+
+ if (omp_block_size > 0)
+ {
+ const fast_sint_t prefetch_distance = 32;
+
+ fast_sint_t i, j = m + 1, c0 = T[m], c1 = -1;
+
+ while (j < n && (c1 = T[j]) == c0) { ++j; }
+
+ fast_uint_t s = c0 >= c1;
+
+ for (i = m - 1, j = omp_block_start + prefetch_distance + 3; i >= j; i -= 4)
+ {
+ libsais16_prefetch(&T[i - 2 * prefetch_distance]);
+
+ libsais16_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 0], 0)]);
+ libsais16_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 1], 0)]);
+ libsais16_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 2], 0)]);
+ libsais16_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 3], 0)]);
+
+ c1 = T[i - 0]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((s & 3) == 1);
+ buckets[BUCKETS_INDEX4((fast_uint_t)c0, s & 3)]++;
+
+ c0 = T[i - 1]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 0); m -= ((s & 3) == 1);
+ buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]++;
+
+ c1 = T[i - 2]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 1); m -= ((s & 3) == 1);
+ buckets[BUCKETS_INDEX4((fast_uint_t)c0, s & 3)]++;
+
+ c0 = T[i - 3]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 2); m -= ((s & 3) == 1);
+ buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]++;
+ }
+
+ for (j -= prefetch_distance + 3; i >= j; i -= 1)
+ {
+ c1 = c0; c0 = T[i]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((s & 3) == 1);
+ buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]++;
+ }
+
+ c1 = (i >= 0) ? T[i] : -1; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((s & 3) == 1);
+ buckets[BUCKETS_INDEX4((fast_uint_t)c0, s & 3)]++;
+ }
+
+ return (sa_sint_t)(omp_block_start + omp_block_size - 1 - m);
+}
+
+static sa_sint_t libsais16_count_and_gather_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ memset(buckets, 0, 2 * (size_t)k * sizeof(sa_sint_t));
+
+ fast_sint_t m = omp_block_start + omp_block_size - 1;
+
+ if (omp_block_size > 0)
+ {
+ const fast_sint_t prefetch_distance = 32;
+
+ fast_sint_t i, j = m + 1, c0 = T[m], c1 = -1;
+
+ while (j < n && (c1 = T[j]) == c0) { ++j; }
+
+ fast_uint_t s = c0 >= c1;
+
+ for (i = m - 1, j = omp_block_start + prefetch_distance + 3; i >= j; i -= 4)
+ {
+ libsais16_prefetch(&T[i - 2 * prefetch_distance]);
+
+ libsais16_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 0], 0)]);
+ libsais16_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 1], 0)]);
+ libsais16_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 2], 0)]);
+ libsais16_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 3], 0)]);
+
+ c1 = T[i - 0]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((s & 3) == 1);
+ buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++;
+
+ c0 = T[i - 1]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 0); m -= ((s & 3) == 1);
+ buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++;
+
+ c1 = T[i - 2]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 1); m -= ((s & 3) == 1);
+ buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++;
+
+ c0 = T[i - 3]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 2); m -= ((s & 3) == 1);
+ buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++;
+ }
+
+ for (j -= prefetch_distance + 3; i >= j; i -= 1)
+ {
+ c1 = c0; c0 = T[i]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((s & 3) == 1);
+ buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++;
+ }
+
+ c1 = (i >= 0) ? T[i] : -1; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((s & 3) == 1);
+ buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++;
+ }
+
+ return (sa_sint_t)(omp_block_start + omp_block_size - 1 - m);
+}
+
+static sa_sint_t libsais16_count_and_gather_compacted_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ memset(buckets, 0, 2 * (size_t)k * sizeof(sa_sint_t));
+
+ fast_sint_t m = omp_block_start + omp_block_size - 1;
+
+ if (omp_block_size > 0)
+ {
+ const fast_sint_t prefetch_distance = 32;
+
+ fast_sint_t i, j = m + 1, c0 = T[m], c1 = -1;
+
+ while (j < n && (c1 = T[j]) == c0) { ++j; }
+
+ fast_uint_t s = c0 >= c1;
+
+ for (i = m - 1, j = omp_block_start + prefetch_distance + 3; i >= j; i -= 4)
+ {
+ libsais16_prefetch(&T[i - 2 * prefetch_distance]);
+
+ libsais16_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 0] & SAINT_MAX, 0)]);
+ libsais16_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 1] & SAINT_MAX, 0)]);
+ libsais16_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 2] & SAINT_MAX, 0)]);
+ libsais16_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 3] & SAINT_MAX, 0)]);
+
+ c1 = T[i - 0]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((fast_sint_t)(s & 3) == (c0 >= 0));
+ c0 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++;
+
+ c0 = T[i - 1]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 0); m -= ((fast_sint_t)(s & 3) == (c1 >= 0));
+ c1 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++;
+
+ c1 = T[i - 2]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 1); m -= ((fast_sint_t)(s & 3) == (c0 >= 0));
+ c0 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++;
+
+ c0 = T[i - 3]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 2); m -= ((fast_sint_t)(s & 3) == (c1 >= 0));
+ c1 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++;
+ }
+
+ for (j -= prefetch_distance + 3; i >= j; i -= 1)
+ {
+ c1 = c0; c0 = T[i]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((fast_sint_t)(s & 3) == (c1 >= 0));
+ c1 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++;
+ }
+
+ c1 = (i >= 0) ? T[i] : -1; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((fast_sint_t)(s & 3) == (c0 >= 0));
+ c0 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++;
+ }
+
+ return (sa_sint_t)(omp_block_start + omp_block_size - 1 - m);
+}
+
+#if defined(_OPENMP)
+
+static fast_sint_t libsais16_get_bucket_stride(fast_sint_t free_space, fast_sint_t bucket_size, fast_sint_t num_buckets)
+{
+ fast_sint_t bucket_size_1024 = (bucket_size + 1023) & (-1024); if (free_space / (num_buckets - 1) >= bucket_size_1024) { return bucket_size_1024; }
+ fast_sint_t bucket_size_16 = (bucket_size + 15) & (-16); if (free_space / (num_buckets - 1) >= bucket_size_16) { return bucket_size_16; }
+
+ return bucket_size;
+}
+
+static sa_sint_t libsais16_count_and_gather_lms_suffixes_32s_4k_fs_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+ sa_sint_t m = 0;
+
+#if defined(_OPENMP)
+ #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536)
+#endif
+ {
+#if defined(_OPENMP)
+ fast_sint_t omp_thread_num = omp_get_thread_num();
+ fast_sint_t omp_num_threads = omp_get_num_threads();
+#else
+ UNUSED(threads); UNUSED(thread_state);
+
+ fast_sint_t omp_thread_num = 0;
+ fast_sint_t omp_num_threads = 1;
+#endif
+ fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
+ fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+ fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;
+
+ if (omp_num_threads == 1)
+ {
+ m = libsais16_count_and_gather_lms_suffixes_32s_4k(T, SA, n, k, buckets, omp_block_start, omp_block_size);
+ }
+#if defined(_OPENMP)
+ else
+ {
+ fast_sint_t bucket_size = 4 * (fast_sint_t)k;
+ fast_sint_t bucket_stride = libsais16_get_bucket_stride(buckets - &SA[n], bucket_size, omp_num_threads);
+
+ {
+ thread_state[omp_thread_num].state.position = omp_block_start + omp_block_size;
+ thread_state[omp_thread_num].state.count = libsais16_count_and_gather_lms_suffixes_32s_4k(T, SA, n, k, buckets - (omp_thread_num * bucket_stride), omp_block_start, omp_block_size);
+ }
+
+ #pragma omp barrier
+
+ if (omp_thread_num == omp_num_threads - 1)
+ {
+ fast_sint_t t;
+ for (t = omp_num_threads - 1; t >= 0; --t)
+ {
+ m += (sa_sint_t)thread_state[t].state.count;
+
+ if (t != omp_num_threads - 1 && thread_state[t].state.count > 0)
+ {
+ memcpy(&SA[n - m], &SA[thread_state[t].state.position - thread_state[t].state.count], (size_t)thread_state[t].state.count * sizeof(sa_sint_t));
+ }
+ }
+ }
+ else
+ {
+ omp_num_threads = omp_num_threads - 1;
+ omp_block_stride = (bucket_size / omp_num_threads) & (-16);
+ omp_block_start = omp_thread_num * omp_block_stride;
+ omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : bucket_size - omp_block_start;
+
+ libsais16_accumulate_counts_s32(buckets + omp_block_start, omp_block_size, bucket_stride, omp_num_threads + 1);
+ }
+ }
+#endif
+ }
+
+ return m;
+}
+
+static sa_sint_t libsais16_count_and_gather_lms_suffixes_32s_2k_fs_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+ sa_sint_t m = 0;
+
+#if defined(_OPENMP)
+ #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536)
+#endif
+ {
+#if defined(_OPENMP)
+ fast_sint_t omp_thread_num = omp_get_thread_num();
+ fast_sint_t omp_num_threads = omp_get_num_threads();
+#else
+ UNUSED(threads); UNUSED(thread_state);
+
+ fast_sint_t omp_thread_num = 0;
+ fast_sint_t omp_num_threads = 1;
+#endif
+ fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
+ fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+ fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;
+
+ if (omp_num_threads == 1)
+ {
+ m = libsais16_count_and_gather_lms_suffixes_32s_2k(T, SA, n, k, buckets, omp_block_start, omp_block_size);
+ }
+#if defined(_OPENMP)
+ else
+ {
+ fast_sint_t bucket_size = 2 * (fast_sint_t)k;
+ fast_sint_t bucket_stride = libsais16_get_bucket_stride(buckets - &SA[n], bucket_size, omp_num_threads);
+
+ {
+ thread_state[omp_thread_num].state.position = omp_block_start + omp_block_size;
+ thread_state[omp_thread_num].state.count = libsais16_count_and_gather_lms_suffixes_32s_2k(T, SA, n, k, buckets - (omp_thread_num * bucket_stride), omp_block_start, omp_block_size);
+ }
+
+ #pragma omp barrier
+
+ if (omp_thread_num == omp_num_threads - 1)
+ {
+ fast_sint_t t;
+ for (t = omp_num_threads - 1; t >= 0; --t)
+ {
+ m += (sa_sint_t)thread_state[t].state.count;
+
+ if (t != omp_num_threads - 1 && thread_state[t].state.count > 0)
+ {
+ memcpy(&SA[n - m], &SA[thread_state[t].state.position - thread_state[t].state.count], (size_t)thread_state[t].state.count * sizeof(sa_sint_t));
+ }
+ }
+ }
+ else
+ {
+ omp_num_threads = omp_num_threads - 1;
+ omp_block_stride = (bucket_size / omp_num_threads) & (-16);
+ omp_block_start = omp_thread_num * omp_block_stride;
+ omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : bucket_size - omp_block_start;
+
+ libsais16_accumulate_counts_s32(buckets + omp_block_start, omp_block_size, bucket_stride, omp_num_threads + 1);
+ }
+ }
+#endif
+ }
+
+ return m;
+}
+
+static void libsais16_count_and_gather_compacted_lms_suffixes_32s_2k_fs_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+#if defined(_OPENMP)
+ #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536)
+#endif
+ {
+#if defined(_OPENMP)
+ fast_sint_t omp_thread_num = omp_get_thread_num();
+ fast_sint_t omp_num_threads = omp_get_num_threads();
+#else
+ UNUSED(threads); UNUSED(thread_state);
+
+ fast_sint_t omp_thread_num = 0;
+ fast_sint_t omp_num_threads = 1;
+#endif
+ fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
+ fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+ fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;
+
+ if (omp_num_threads == 1)
+ {
+ libsais16_count_and_gather_compacted_lms_suffixes_32s_2k(T, SA, n, k, buckets, omp_block_start, omp_block_size);
+ }
+#if defined(_OPENMP)
+ else
+ {
+ fast_sint_t bucket_size = 2 * (fast_sint_t)k;
+ fast_sint_t bucket_stride = libsais16_get_bucket_stride(buckets - &SA[n + n], bucket_size, omp_num_threads);
+
+ {
+ thread_state[omp_thread_num].state.position = omp_block_start + omp_block_size;
+ thread_state[omp_thread_num].state.count = libsais16_count_and_gather_compacted_lms_suffixes_32s_2k(T, SA + n, n, k, buckets - (omp_thread_num * bucket_stride), omp_block_start, omp_block_size);
+ }
+
+ #pragma omp barrier
+
+ {
+ fast_sint_t t, m = 0; for (t = omp_num_threads - 1; t >= omp_thread_num; --t) { m += (sa_sint_t)thread_state[t].state.count; }
+
+ if (thread_state[omp_thread_num].state.count > 0)
+ {
+ memcpy(&SA[n - m], &SA[n + thread_state[omp_thread_num].state.position - thread_state[omp_thread_num].state.count], (size_t)thread_state[omp_thread_num].state.count * sizeof(sa_sint_t));
+ }
+ }
+
+ {
+ omp_block_stride = (bucket_size / omp_num_threads) & (-16);
+ omp_block_start = omp_thread_num * omp_block_stride;
+ omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : bucket_size - omp_block_start;
+
+ libsais16_accumulate_counts_s32(buckets + omp_block_start, omp_block_size, bucket_stride, omp_num_threads);
+ }
+ }
+#endif
+ }
+}
+
+#endif
+
+static sa_sint_t libsais16_count_and_gather_lms_suffixes_32s_4k_nofs_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads)
+{
+ sa_sint_t m = 0;
+
+#if defined(_OPENMP)
+ #pragma omp parallel num_threads(2) if(threads > 1 && n >= 65536)
+#endif
+ {
+#if defined(_OPENMP)
+ fast_sint_t omp_thread_num = omp_get_thread_num();
+ fast_sint_t omp_num_threads = omp_get_num_threads();
+#else
+ UNUSED(threads);
+
+ fast_sint_t omp_num_threads = 1;
+#endif
+ if (omp_num_threads == 1)
+ {
+ m = libsais16_count_and_gather_lms_suffixes_32s_4k(T, SA, n, k, buckets, 0, n);
+ }
+#if defined(_OPENMP)
+ else if (omp_thread_num == 0)
+ {
+ libsais16_count_lms_suffixes_32s_4k(T, n, k, buckets);
+ }
+ else
+ {
+ m = libsais16_gather_lms_suffixes_32s(T, SA, n);
+ }
+#endif
+ }
+
+ return m;
+}
+
+static sa_sint_t libsais16_count_and_gather_lms_suffixes_32s_2k_nofs_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads)
+{
+ sa_sint_t m = 0;
+
+#if defined(_OPENMP)
+ #pragma omp parallel num_threads(2) if(threads > 1 && n >= 65536)
+#endif
+ {
+#if defined(_OPENMP)
+ fast_sint_t omp_thread_num = omp_get_thread_num();
+ fast_sint_t omp_num_threads = omp_get_num_threads();
+#else
+ UNUSED(threads);
+
+ fast_sint_t omp_num_threads = 1;
+#endif
+ if (omp_num_threads == 1)
+ {
+ m = libsais16_count_and_gather_lms_suffixes_32s_2k(T, SA, n, k, buckets, 0, n);
+ }
+#if defined(_OPENMP)
+ else if (omp_thread_num == 0)
+ {
+ libsais16_count_lms_suffixes_32s_2k(T, n, k, buckets);
+ }
+ else
+ {
+ m = libsais16_gather_lms_suffixes_32s(T, SA, n);
+ }
+#endif
+ }
+
+ return m;
+}
+
+static sa_sint_t libsais16_count_and_gather_compacted_lms_suffixes_32s_2k_nofs_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads)
+{
+ sa_sint_t m = 0;
+
+#if defined(_OPENMP)
+ #pragma omp parallel num_threads(2) if(threads > 1 && n >= 65536)
+#endif
+ {
+#if defined(_OPENMP)
+ fast_sint_t omp_thread_num = omp_get_thread_num();
+ fast_sint_t omp_num_threads = omp_get_num_threads();
+#else
+ UNUSED(threads);
+
+ fast_sint_t omp_num_threads = 1;
+#endif
+ if (omp_num_threads == 1)
+ {
+ m = libsais16_count_and_gather_compacted_lms_suffixes_32s_2k(T, SA, n, k, buckets, 0, n);
+ }
+#if defined(_OPENMP)
+ else if (omp_thread_num == 0)
+ {
+ libsais16_count_compacted_lms_suffixes_32s_2k(T, n, k, buckets);
+ }
+ else
+ {
+ m = libsais16_gather_compacted_lms_suffixes_32s(T, SA, n);
+ }
+#endif
+ }
+
+ return m;
+}
+
+static sa_sint_t libsais16_count_and_gather_lms_suffixes_32s_4k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+ sa_sint_t m;
+
+#if defined(_OPENMP)
+ sa_sint_t max_threads = (sa_sint_t)((buckets - &SA[n]) / ((4 * (fast_sint_t)k + 15) & (-16))); if (max_threads > threads) { max_threads = threads; }
+ if (max_threads > 1 && n >= 65536 && n / k >= 2)
+ {
+ if (max_threads > n / 16 / k) { max_threads = n / 16 / k; }
+ m = libsais16_count_and_gather_lms_suffixes_32s_4k_fs_omp(T, SA, n, k, buckets, max_threads > 2 ? max_threads : 2, thread_state);
+ }
+ else
+#else
+ UNUSED(thread_state);
+#endif
+ {
+ m = libsais16_count_and_gather_lms_suffixes_32s_4k_nofs_omp(T, SA, n, k, buckets, threads);
+ }
+
+ return m;
+}
+
+static sa_sint_t libsais16_count_and_gather_lms_suffixes_32s_2k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+ sa_sint_t m;
+
+#if defined(_OPENMP)
+ sa_sint_t max_threads = (sa_sint_t)((buckets - &SA[n]) / ((2 * (fast_sint_t)k + 15) & (-16))); if (max_threads > threads) { max_threads = threads; }
+ if (max_threads > 1 && n >= 65536 && n / k >= 2)
+ {
+ if (max_threads > n / 8 / k) { max_threads = n / 8 / k; }
+ m = libsais16_count_and_gather_lms_suffixes_32s_2k_fs_omp(T, SA, n, k, buckets, max_threads > 2 ? max_threads : 2, thread_state);
+ }
+ else
+#else
+ UNUSED(thread_state);
+#endif
+ {
+ m = libsais16_count_and_gather_lms_suffixes_32s_2k_nofs_omp(T, SA, n, k, buckets, threads);
+ }
+
+ return m;
+}
+
+static void libsais16_count_and_gather_compacted_lms_suffixes_32s_2k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+#if defined(_OPENMP)
+ sa_sint_t max_threads = (sa_sint_t)((buckets - &SA[n + n]) / ((2 * (fast_sint_t)k + 15) & (-16))); if (max_threads > threads) { max_threads = threads; }
+ if (max_threads > 1 && n >= 65536 && n / k >= 2)
+ {
+ if (max_threads > n / 8 / k) { max_threads = n / 8 / k; }
+ libsais16_count_and_gather_compacted_lms_suffixes_32s_2k_fs_omp(T, SA, n, k, buckets, max_threads > 2 ? max_threads : 2, thread_state);
+ }
+ else
+#else
+ UNUSED(thread_state);
+#endif
+ {
+ libsais16_count_and_gather_compacted_lms_suffixes_32s_2k_nofs_omp(T, SA, n, k, buckets, threads);
+ }
+}
+
+static void libsais16_count_suffixes_32s(const sa_sint_t * RESTRICT T, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ memset(buckets, 0, (size_t)k * sizeof(sa_sint_t));
+
+ fast_sint_t i, j;
+ for (i = 0, j = (fast_sint_t)n - 7; i < j; i += 8)
+ {
+ libsais16_prefetch(&T[i + prefetch_distance]);
+
+ buckets[T[i + 0]]++;
+ buckets[T[i + 1]]++;
+ buckets[T[i + 2]]++;
+ buckets[T[i + 3]]++;
+ buckets[T[i + 4]]++;
+ buckets[T[i + 5]]++;
+ buckets[T[i + 6]]++;
+ buckets[T[i + 7]]++;
+ }
+
+ for (j += 7; i < j; i += 1)
+ {
+ buckets[T[i]]++;
+ }
+}
+
+/*
+ * Derives per-symbol bucket start/end offsets from the 4-way (BUCKETS_INDEX4)
+ * type counters: for each symbol the four sub-type counts are summed and
+ * prefix-summed into bucket_start (exclusive) and bucket_end (inclusive).
+ * When freq is non-NULL the per-symbol totals are also exported to freq.
+ */
+static void libsais16_initialize_buckets_start_and_end_16u(sa_sint_t * RESTRICT buckets, sa_sint_t * RESTRICT freq)
+{
+ sa_sint_t * RESTRICT bucket_start = &buckets[6 * ALPHABET_SIZE];
+ sa_sint_t * RESTRICT bucket_end = &buckets[7 * ALPHABET_SIZE];
+
+ if (freq != NULL)
+ {
+ fast_sint_t i, j; sa_sint_t sum = 0;
+ for (i = BUCKETS_INDEX4(0, 0), j = 0; i <= BUCKETS_INDEX4(ALPHABET_SIZE - 1, 0); i += BUCKETS_INDEX4(1, 0), j += 1)
+ {
+ bucket_start[j] = sum;
+ sum += (freq[j] = buckets[i + BUCKETS_INDEX4(0, 0)] + buckets[i + BUCKETS_INDEX4(0, 1)] + buckets[i + BUCKETS_INDEX4(0, 2)] + buckets[i + BUCKETS_INDEX4(0, 3)]);
+ bucket_end[j] = sum;
+ }
+ }
+ else
+ {
+ /* Same computation without the frequency side-channel. */
+ fast_sint_t i, j; sa_sint_t sum = 0;
+ for (i = BUCKETS_INDEX4(0, 0), j = 0; i <= BUCKETS_INDEX4(ALPHABET_SIZE - 1, 0); i += BUCKETS_INDEX4(1, 0), j += 1)
+ {
+ bucket_start[j] = sum;
+ sum += buckets[i + BUCKETS_INDEX4(0, 0)] + buckets[i + BUCKETS_INDEX4(0, 1)] + buckets[i + BUCKETS_INDEX4(0, 2)] + buckets[i + BUCKETS_INDEX4(0, 3)];
+ bucket_end[j] = sum;
+ }
+ }
+}
+
+/*
+ * 32-bit/6k-layout variant: collapses the four per-symbol sub-type counters
+ * into prefix-summed start (exclusive, at buckets[4k..5k)) and end
+ * (inclusive, at buckets[5k..6k)) offsets.
+ */
+static void libsais16_initialize_buckets_start_and_end_32s_6k(sa_sint_t k, sa_sint_t * RESTRICT buckets)
+{
+ sa_sint_t * RESTRICT bucket_start = &buckets[4 * k];
+ sa_sint_t * RESTRICT bucket_end = &buckets[5 * k];
+
+ fast_sint_t i, j; sa_sint_t sum = 0;
+ for (i = BUCKETS_INDEX4(0, 0), j = 0; i <= BUCKETS_INDEX4((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX4(1, 0), j += 1)
+ {
+ bucket_start[j] = sum;
+ sum += buckets[i + BUCKETS_INDEX4(0, 0)] + buckets[i + BUCKETS_INDEX4(0, 1)] + buckets[i + BUCKETS_INDEX4(0, 2)] + buckets[i + BUCKETS_INDEX4(0, 3)];
+ bucket_end[j] = sum;
+ }
+}
+
+/*
+ * 32-bit/4k-layout variant: collapses the two per-symbol sub-type counters
+ * (BUCKETS_INDEX2 layout) into prefix-summed start (buckets[2k..3k)) and end
+ * (buckets[3k..4k)) offsets.
+ */
+static void libsais16_initialize_buckets_start_and_end_32s_4k(sa_sint_t k, sa_sint_t * RESTRICT buckets)
+{
+ sa_sint_t * RESTRICT bucket_start = &buckets[2 * k];
+ sa_sint_t * RESTRICT bucket_end = &buckets[3 * k];
+
+ fast_sint_t i, j; sa_sint_t sum = 0;
+ for (i = BUCKETS_INDEX2(0, 0), j = 0; i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX2(1, 0), j += 1)
+ {
+ bucket_start[j] = sum;
+ sum += buckets[i + BUCKETS_INDEX2(0, 0)] + buckets[i + BUCKETS_INDEX2(0, 1)];
+ bucket_end[j] = sum;
+ }
+}
+
+/*
+ * In-place inclusive prefix sum over the 2k counters: each symbol's slot 0
+ * is overwritten with the running total of both of its sub-type counts,
+ * i.e. the symbol's bucket end offset.  Slot 1 is left untouched.
+ */
+static void libsais16_initialize_buckets_end_32s_2k(sa_sint_t k, sa_sint_t * RESTRICT buckets)
+{
+ fast_sint_t i; sa_sint_t sum0 = 0;
+ for (i = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX2(1, 0))
+ {
+ sum0 += buckets[i + BUCKETS_INDEX2(0, 0)] + buckets[i + BUCKETS_INDEX2(0, 1)]; buckets[i + BUCKETS_INDEX2(0, 0)] = sum0;
+ }
+}
+
+/*
+ * Compacts the strided 2k layout into a dense one: slot-0 entries (already
+ * holding prefix-summed end offsets when called after
+ * libsais16_initialize_buckets_end_32s_2k) move to buckets[0..k), then the
+ * second half is rebuilt as the same sequence shifted right by one with a
+ * leading 0 — i.e. each symbol's start offset is the previous symbol's end.
+ */
+static void libsais16_initialize_buckets_start_and_end_32s_2k(sa_sint_t k, sa_sint_t * RESTRICT buckets)
+{
+ fast_sint_t i, j;
+ for (i = BUCKETS_INDEX2(0, 0), j = 0; i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX2(1, 0), j += 1)
+ {
+ buckets[j] = buckets[i];
+ }
+
+ buckets[k] = 0; memcpy(&buckets[k + 1], buckets, ((size_t)k - 1) * sizeof(sa_sint_t));
+}
+
+/*
+ * In-place exclusive prefix sum over the k counters: buckets[i] becomes the
+ * sum of all counts preceding symbol i (its bucket start offset).
+ */
+static void libsais16_initialize_buckets_start_32s_1k(sa_sint_t k, sa_sint_t * RESTRICT buckets)
+{
+ fast_sint_t i; sa_sint_t sum = 0;
+ for (i = 0; i <= (fast_sint_t)k - 1; i += 1) { sa_sint_t tmp = buckets[i]; buckets[i] = sum; sum += tmp; }
+}
+
+/*
+ * In-place inclusive prefix sum over the k counters: buckets[i] becomes the
+ * sum of all counts up to and including symbol i (its bucket end offset).
+ */
+static void libsais16_initialize_buckets_end_32s_1k(sa_sint_t k, sa_sint_t * RESTRICT buckets)
+{
+ fast_sint_t i; sa_sint_t sum = 0;
+ for (i = 0; i <= (fast_sint_t)k - 1; i += 1) { sum += buckets[i]; buckets[i] = sum; }
+}
+
+/*
+ * Prepares the induction buckets for the LMS-suffix radix sort (16-bit
+ * alphabet).  First pass: walks T backwards from first_lms_suffix, replaying
+ * the suffix-type state machine in s (bit 0 = current type) and decrementing
+ * the corresponding 4-way counters — presumably removing the positions that
+ * precede the first LMS suffix from the counts (TODO confirm against the
+ * matching count pass).  Second pass: prefix-sums the type-1 + type-3
+ * counters per symbol into the temp bucket area at buckets[4*ALPHABET_SIZE],
+ * writing both a pre-sum and a post-sum slot, and returns the grand total.
+ */
+static sa_sint_t libsais16_initialize_buckets_for_lms_suffixes_radix_sort_16u(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix)
+{
+ {
+ fast_uint_t s = 0;
+ fast_sint_t c0 = T[first_lms_suffix];
+ fast_sint_t c1 = 0;
+
+ for (; --first_lms_suffix >= 0; )
+ {
+ /* s accumulates L/S type bits; (s & 3) identifies the sub-type slot. */
+ c1 = c0; c0 = T[first_lms_suffix]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+ buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]--;
+ }
+
+ buckets[BUCKETS_INDEX4((fast_uint_t)c0, (s << 1) & 3)]--;
+ }
+
+ {
+ sa_sint_t * RESTRICT temp_bucket = &buckets[4 * ALPHABET_SIZE];
+
+ fast_sint_t i, j; sa_sint_t sum = 0;
+ for (i = BUCKETS_INDEX4(0, 0), j = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX4(ALPHABET_SIZE - 1, 0); i += BUCKETS_INDEX4(1, 0), j += BUCKETS_INDEX2(1, 0))
+ {
+ temp_bucket[j + BUCKETS_INDEX2(0, 1)] = sum; sum += buckets[i + BUCKETS_INDEX4(0, 1)] + buckets[i + BUCKETS_INDEX4(0, 3)]; temp_bucket[j] = sum;
+ }
+
+ return sum;
+ }
+}
+
+/*
+ * Prepares 2k-layout buckets for the LMS radix sort: adjusts the counters at
+ * the first LMS suffix's symbol (move one count from slot 1 to slot 0 —
+ * NOTE(review): presumably reclassifying that suffix; confirm against the
+ * gather pass), then prefix-sums slot 0 with the combined totals (sum0) and
+ * slot 1 with the slot-1-only totals (sum1), in place.
+ */
+static void libsais16_initialize_buckets_for_lms_suffixes_radix_sort_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix)
+{
+ buckets[BUCKETS_INDEX2(T[first_lms_suffix], 0)]++;
+ buckets[BUCKETS_INDEX2(T[first_lms_suffix], 1)]--;
+
+ fast_sint_t i; sa_sint_t sum0 = 0, sum1 = 0;
+ for (i = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX2(1, 0))
+ {
+ sum0 += buckets[i + BUCKETS_INDEX2(0, 0)] + buckets[i + BUCKETS_INDEX2(0, 1)];
+ sum1 += buckets[i + BUCKETS_INDEX2(0, 1)];
+
+ buckets[i + BUCKETS_INDEX2(0, 0)] = sum0;
+ buckets[i + BUCKETS_INDEX2(0, 1)] = sum1;
+ }
+}
+
+/*
+ * 32-bit/6k-layout counterpart of the 16u version above: walks T backwards
+ * from first_lms_suffix replaying the suffix-type state machine and
+ * decrementing the matching 4-way counters, then builds an inclusive prefix
+ * sum of the type-1 + type-3 counters per symbol into the temp bucket area
+ * at buckets[4k] and returns the total.
+ */
+static sa_sint_t libsais16_initialize_buckets_for_lms_suffixes_radix_sort_32s_6k(const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix)
+{
+ {
+ fast_uint_t s = 0;
+ fast_sint_t c0 = T[first_lms_suffix];
+ fast_sint_t c1 = 0;
+
+ for (; --first_lms_suffix >= 0; )
+ {
+ c1 = c0; c0 = T[first_lms_suffix]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+ buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]--;
+ }
+
+ buckets[BUCKETS_INDEX4((fast_uint_t)c0, (s << 1) & 3)]--;
+ }
+
+ {
+ sa_sint_t * RESTRICT temp_bucket = &buckets[4 * k];
+
+ fast_sint_t i, j; sa_sint_t sum = 0;
+ for (i = BUCKETS_INDEX4(0, 0), j = 0; i <= BUCKETS_INDEX4((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX4(1, 0), j += 1)
+ {
+ sum += buckets[i + BUCKETS_INDEX4(0, 1)] + buckets[i + BUCKETS_INDEX4(0, 3)]; temp_bucket[j] = sum;
+ }
+
+ return sum;
+ }
+}
+
+/*
+ * Prepares 4k-layout buckets for the combined radix + partial sorting pass:
+ * adjusts the counters at the first LMS suffix's symbol, then in one sweep
+ * fills bucket_start/bucket_end (at buckets[2k]/buckets[3k]) with exclusive/
+ * inclusive prefix sums of the combined counts, and overwrites slot 1 with
+ * the running slot-1-only prefix sum.
+ */
+static void libsais16_initialize_buckets_for_radix_and_partial_sorting_32s_4k(const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix)
+{
+ sa_sint_t * RESTRICT bucket_start = &buckets[2 * k];
+ sa_sint_t * RESTRICT bucket_end = &buckets[3 * k];
+
+ buckets[BUCKETS_INDEX2(T[first_lms_suffix], 0)]++;
+ buckets[BUCKETS_INDEX2(T[first_lms_suffix], 1)]--;
+
+ fast_sint_t i, j; sa_sint_t sum0 = 0, sum1 = 0;
+ for (i = BUCKETS_INDEX2(0, 0), j = 0; i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX2(1, 0), j += 1)
+ {
+ bucket_start[j] = sum1;
+
+ sum0 += buckets[i + BUCKETS_INDEX2(0, 1)];
+ sum1 += buckets[i + BUCKETS_INDEX2(0, 0)] + buckets[i + BUCKETS_INDEX2(0, 1)];
+ buckets[i + BUCKETS_INDEX2(0, 1)] = sum0;
+
+ bucket_end[j] = sum1;
+ }
+}
+
+/*
+ * Radix-scatters the gathered LMS suffix positions (stored in SA over
+ * [omp_block_start, omp_block_start + omp_block_size)) into their buckets,
+ * scanning right-to-left and placing each position at the pre-decremented
+ * tail of induction_bucket[T[p]].  4x unrolled with prefetch of both the SA
+ * stream and the T positions it references.
+ */
+static void libsais16_radix_sort_lms_suffixes_16u(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ fast_sint_t i, j;
+ for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 3; i >= j; i -= 4)
+ {
+ libsais16_prefetch(&SA[i - 2 * prefetch_distance]);
+
+ libsais16_prefetch(&T[SA[i - prefetch_distance - 0]]);
+ libsais16_prefetch(&T[SA[i - prefetch_distance - 1]]);
+ libsais16_prefetch(&T[SA[i - prefetch_distance - 2]]);
+ libsais16_prefetch(&T[SA[i - prefetch_distance - 3]]);
+
+ sa_sint_t p0 = SA[i - 0]; SA[--induction_bucket[BUCKETS_INDEX2(T[p0], 0)]] = p0;
+ sa_sint_t p1 = SA[i - 1]; SA[--induction_bucket[BUCKETS_INDEX2(T[p1], 0)]] = p1;
+ sa_sint_t p2 = SA[i - 2]; SA[--induction_bucket[BUCKETS_INDEX2(T[p2], 0)]] = p2;
+ sa_sint_t p3 = SA[i - 3]; SA[--induction_bucket[BUCKETS_INDEX2(T[p3], 0)]] = p3;
+ }
+
+ /* Scalar tail down to omp_block_start. */
+ for (j -= prefetch_distance + 3; i >= j; i -= 1)
+ {
+ sa_sint_t p = SA[i]; SA[--induction_bucket[BUCKETS_INDEX2(T[p], 0)]] = p;
+ }
+}
+
+/*
+ * Parallel driver for the 16u LMS radix sort.  Single-threaded path sorts
+ * the m-1 gathered suffixes directly.  Multi-threaded path first converts
+ * each thread's private counts into scatter offsets relative to the shared
+ * bucket ends, then each thread places its own contiguous share of the
+ * gathered suffixes (shares were produced back-to-front, hence the
+ * accumulation from the last thread down to this one).
+ */
+static void libsais16_radix_sort_lms_suffixes_16u_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+#if defined(_OPENMP)
+ #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536 && m >= 65536 && omp_get_dynamic() == 0)
+#endif
+ {
+#if defined(_OPENMP)
+ fast_sint_t omp_thread_num = omp_get_thread_num();
+ fast_sint_t omp_num_threads = omp_get_num_threads();
+#else
+ UNUSED(threads); UNUSED(thread_state);
+
+ fast_sint_t omp_num_threads = 1;
+#endif
+ if (omp_num_threads == 1)
+ {
+ libsais16_radix_sort_lms_suffixes_16u(T, SA, &buckets[4 * ALPHABET_SIZE], (fast_sint_t)n - (fast_sint_t)m + 1, (fast_sint_t)m - 1);
+ }
+#if defined(_OPENMP)
+ else
+ {
+ {
+ /* Turn this thread's private counters into absolute bucket offsets. */
+ sa_sint_t * RESTRICT src_bucket = &buckets[4 * ALPHABET_SIZE];
+ sa_sint_t * RESTRICT dst_bucket = thread_state[omp_thread_num].state.buckets;
+
+ fast_sint_t i, j;
+ for (i = BUCKETS_INDEX2(0, 0), j = BUCKETS_INDEX4(0, 1); i <= BUCKETS_INDEX2(ALPHABET_SIZE - 1, 0); i += BUCKETS_INDEX2(1, 0), j += BUCKETS_INDEX4(1, 0))
+ {
+ dst_bucket[i] = src_bucket[i] - dst_bucket[j];
+ }
+ }
+
+ {
+ /* Locate this thread's share of the gathered LMS suffixes in SA. */
+ fast_sint_t t, omp_block_start = 0, omp_block_size = thread_state[omp_thread_num].state.m;
+ for (t = omp_num_threads - 1; t >= omp_thread_num; --t) omp_block_start += thread_state[t].state.m;
+
+ if (omp_block_start == (fast_sint_t)m && omp_block_size > 0)
+ {
+ /* The very first gathered suffix is excluded from the sort. */
+ omp_block_start -= 1; omp_block_size -= 1;
+ }
+
+ libsais16_radix_sort_lms_suffixes_16u(T, SA, thread_state[omp_thread_num].state.buckets, (fast_sint_t)n - omp_block_start, omp_block_size);
+ }
+ }
+#endif
+ }
+}
+
+/*
+ * 32-bit/6k-layout LMS radix scatter: right-to-left scan of the gathered
+ * positions, each placed at the pre-decremented tail of its symbol bucket
+ * (buckets indexed directly by T[p]).  Three-stage prefetch pipeline: the SA
+ * stream, then the T positions, then the bucket slots they will touch.
+ */
+static void libsais16_radix_sort_lms_suffixes_32s_6k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ fast_sint_t i, j;
+ for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 3; i >= j; i -= 4)
+ {
+ libsais16_prefetch(&SA[i - 3 * prefetch_distance]);
+
+ libsais16_prefetch(&T[SA[i - 2 * prefetch_distance - 0]]);
+ libsais16_prefetch(&T[SA[i - 2 * prefetch_distance - 1]]);
+ libsais16_prefetch(&T[SA[i - 2 * prefetch_distance - 2]]);
+ libsais16_prefetch(&T[SA[i - 2 * prefetch_distance - 3]]);
+
+ libsais16_prefetchw(&induction_bucket[T[SA[i - prefetch_distance - 0]]]);
+ libsais16_prefetchw(&induction_bucket[T[SA[i - prefetch_distance - 1]]]);
+ libsais16_prefetchw(&induction_bucket[T[SA[i - prefetch_distance - 2]]]);
+ libsais16_prefetchw(&induction_bucket[T[SA[i - prefetch_distance - 3]]]);
+
+ sa_sint_t p0 = SA[i - 0]; SA[--induction_bucket[T[p0]]] = p0;
+ sa_sint_t p1 = SA[i - 1]; SA[--induction_bucket[T[p1]]] = p1;
+ sa_sint_t p2 = SA[i - 2]; SA[--induction_bucket[T[p2]]] = p2;
+ sa_sint_t p3 = SA[i - 3]; SA[--induction_bucket[T[p3]]] = p3;
+ }
+
+ for (j -= 2 * prefetch_distance + 3; i >= j; i -= 1)
+ {
+ sa_sint_t p = SA[i]; SA[--induction_bucket[T[p]]] = p;
+ }
+}
+
+/*
+ * Same LMS radix scatter as the 6k variant, but for the 2k bucket layout:
+ * the bucket slot is addressed via BUCKETS_INDEX2(T[p], 0) instead of T[p].
+ */
+static void libsais16_radix_sort_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ fast_sint_t i, j;
+ for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 3; i >= j; i -= 4)
+ {
+ libsais16_prefetch(&SA[i - 3 * prefetch_distance]);
+
+ libsais16_prefetch(&T[SA[i - 2 * prefetch_distance - 0]]);
+ libsais16_prefetch(&T[SA[i - 2 * prefetch_distance - 1]]);
+ libsais16_prefetch(&T[SA[i - 2 * prefetch_distance - 2]]);
+ libsais16_prefetch(&T[SA[i - 2 * prefetch_distance - 3]]);
+
+ libsais16_prefetchw(&induction_bucket[BUCKETS_INDEX2(T[SA[i - prefetch_distance - 0]], 0)]);
+ libsais16_prefetchw(&induction_bucket[BUCKETS_INDEX2(T[SA[i - prefetch_distance - 1]], 0)]);
+ libsais16_prefetchw(&induction_bucket[BUCKETS_INDEX2(T[SA[i - prefetch_distance - 2]], 0)]);
+ libsais16_prefetchw(&induction_bucket[BUCKETS_INDEX2(T[SA[i - prefetch_distance - 3]], 0)]);
+
+ sa_sint_t p0 = SA[i - 0]; SA[--induction_bucket[BUCKETS_INDEX2(T[p0], 0)]] = p0;
+ sa_sint_t p1 = SA[i - 1]; SA[--induction_bucket[BUCKETS_INDEX2(T[p1], 0)]] = p1;
+ sa_sint_t p2 = SA[i - 2]; SA[--induction_bucket[BUCKETS_INDEX2(T[p2], 0)]] = p2;
+ sa_sint_t p3 = SA[i - 3]; SA[--induction_bucket[BUCKETS_INDEX2(T[p3], 0)]] = p3;
+ }
+
+ for (j -= 2 * prefetch_distance + 3; i >= j; i -= 1)
+ {
+ sa_sint_t p = SA[i]; SA[--induction_bucket[BUCKETS_INDEX2(T[p], 0)]] = p;
+ }
+}
+
+#if defined(_OPENMP)
+
+/*
+ * Gather phase of the blocked parallel radix sort: for each SA entry in this
+ * thread's sub-range, records the pair (index = SA[i], symbol = T[SA[i]])
+ * into the shared cache so the single-threaded sort phase can run without
+ * re-reading T.  4x unrolled with prefetch on SA, T and the cache itself.
+ */
+static void libsais16_radix_sort_lms_suffixes_32s_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ fast_sint_t i, j;
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4)
+ {
+ libsais16_prefetch(&SA[i + 2 * prefetch_distance]);
+
+ libsais16_prefetch(&T[SA[i + prefetch_distance + 0]]);
+ libsais16_prefetch(&T[SA[i + prefetch_distance + 1]]);
+ libsais16_prefetch(&T[SA[i + prefetch_distance + 2]]);
+ libsais16_prefetch(&T[SA[i + prefetch_distance + 3]]);
+
+ libsais16_prefetchw(&cache[i + prefetch_distance]);
+
+ cache[i + 0].symbol = T[cache[i + 0].index = SA[i + 0]];
+ cache[i + 1].symbol = T[cache[i + 1].index = SA[i + 1]];
+ cache[i + 2].symbol = T[cache[i + 2].index = SA[i + 2]];
+ cache[i + 3].symbol = T[cache[i + 3].index = SA[i + 3]];
+ }
+
+ for (j += prefetch_distance + 3; i < j; i += 1)
+ {
+ cache[i].symbol = T[cache[i].index = SA[i]];
+ }
+}
+
+/*
+ * Sort phase (run on the master thread) of the blocked 6k radix sort:
+ * walks the cached (index, symbol) pairs right-to-left and replaces each
+ * cached symbol with its reserved destination slot, obtained by
+ * pre-decrementing that symbol's induction bucket.  The placement phase
+ * then scatters indices to these slots in parallel.
+ */
+static void libsais16_radix_sort_lms_suffixes_32s_6k_block_sort(sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ fast_sint_t i, j;
+ for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 3; i >= j; i -= 4)
+ {
+ libsais16_prefetchw(&cache[i - 2 * prefetch_distance]);
+
+ libsais16_prefetchw(&induction_bucket[cache[i - prefetch_distance - 0].symbol]);
+ libsais16_prefetchw(&induction_bucket[cache[i - prefetch_distance - 1].symbol]);
+ libsais16_prefetchw(&induction_bucket[cache[i - prefetch_distance - 2].symbol]);
+ libsais16_prefetchw(&induction_bucket[cache[i - prefetch_distance - 3].symbol]);
+
+ cache[i - 0].symbol = --induction_bucket[cache[i - 0].symbol];
+ cache[i - 1].symbol = --induction_bucket[cache[i - 1].symbol];
+ cache[i - 2].symbol = --induction_bucket[cache[i - 2].symbol];
+ cache[i - 3].symbol = --induction_bucket[cache[i - 3].symbol];
+ }
+
+ for (j -= prefetch_distance + 3; i >= j; i -= 1)
+ {
+ cache[i].symbol = --induction_bucket[cache[i].symbol];
+ }
+}
+
+/*
+ * 2k-layout counterpart of the block sort phase above: identical structure,
+ * but bucket slots are addressed via BUCKETS_INDEX2(symbol, 0).
+ */
+static void libsais16_radix_sort_lms_suffixes_32s_2k_block_sort(sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ fast_sint_t i, j;
+ for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 3; i >= j; i -= 4)
+ {
+ libsais16_prefetchw(&cache[i - 2 * prefetch_distance]);
+
+ libsais16_prefetchw(&induction_bucket[BUCKETS_INDEX2(cache[i - prefetch_distance - 0].symbol, 0)]);
+ libsais16_prefetchw(&induction_bucket[BUCKETS_INDEX2(cache[i - prefetch_distance - 1].symbol, 0)]);
+ libsais16_prefetchw(&induction_bucket[BUCKETS_INDEX2(cache[i - prefetch_distance - 2].symbol, 0)]);
+ libsais16_prefetchw(&induction_bucket[BUCKETS_INDEX2(cache[i - prefetch_distance - 3].symbol, 0)]);
+
+ cache[i - 0].symbol = --induction_bucket[BUCKETS_INDEX2(cache[i - 0].symbol, 0)];
+ cache[i - 1].symbol = --induction_bucket[BUCKETS_INDEX2(cache[i - 1].symbol, 0)];
+ cache[i - 2].symbol = --induction_bucket[BUCKETS_INDEX2(cache[i - 2].symbol, 0)];
+ cache[i - 3].symbol = --induction_bucket[BUCKETS_INDEX2(cache[i - 3].symbol, 0)];
+ }
+
+ for (j -= prefetch_distance + 3; i >= j; i -= 1)
+ {
+ cache[i].symbol = --induction_bucket[BUCKETS_INDEX2(cache[i].symbol, 0)];
+ }
+}
+
+/*
+ * Parallel orchestration of one block of the 6k radix sort:
+ * gather (all threads) -> barrier -> bucket sort (master only, since bucket
+ * decrements must be sequential) -> barrier -> scatter cached suffixes to
+ * their reserved slots (all threads).  Falls back to the direct sequential
+ * sort when the team has a single thread.
+ */
+static void libsais16_radix_sort_lms_suffixes_32s_6k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads)
+{
+#if defined(_OPENMP)
+ #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384)
+#endif
+ {
+#if defined(_OPENMP)
+ fast_sint_t omp_thread_num = omp_get_thread_num();
+ fast_sint_t omp_num_threads = omp_get_num_threads();
+#else
+ UNUSED(threads); UNUSED(cache);
+
+ fast_sint_t omp_thread_num = 0;
+ fast_sint_t omp_num_threads = 1;
+#endif
+ /* Even 16-element split; the last thread absorbs the remainder. */
+ fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
+ fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+ fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
+
+ omp_block_start += block_start;
+
+ if (omp_num_threads == 1)
+ {
+ libsais16_radix_sort_lms_suffixes_32s_6k(T, SA, induction_bucket, omp_block_start, omp_block_size);
+ }
+#if defined(_OPENMP)
+ else
+ {
+ {
+ /* cache - block_start lets the cache be indexed by absolute SA index. */
+ libsais16_radix_sort_lms_suffixes_32s_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size);
+ }
+
+ #pragma omp barrier
+
+ #pragma omp master
+ {
+ libsais16_radix_sort_lms_suffixes_32s_6k_block_sort(induction_bucket, cache - block_start, block_start, block_size);
+ }
+
+ #pragma omp barrier
+
+ {
+ libsais16_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size);
+ }
+ }
+#endif
+ }
+}
+
+/*
+ * 2k-layout counterpart of the blocked parallel radix sort above: same
+ * gather -> master sort -> place pipeline with barriers, differing only in
+ * the sequential-fallback / sort-phase routines invoked.
+ */
+static void libsais16_radix_sort_lms_suffixes_32s_2k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads)
+{
+#if defined(_OPENMP)
+ #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384)
+#endif
+ {
+#if defined(_OPENMP)
+ fast_sint_t omp_thread_num = omp_get_thread_num();
+ fast_sint_t omp_num_threads = omp_get_num_threads();
+#else
+ UNUSED(threads); UNUSED(cache);
+
+ fast_sint_t omp_thread_num = 0;
+ fast_sint_t omp_num_threads = 1;
+#endif
+ fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
+ fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+ fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
+
+ omp_block_start += block_start;
+
+ if (omp_num_threads == 1)
+ {
+ libsais16_radix_sort_lms_suffixes_32s_2k(T, SA, induction_bucket, omp_block_start, omp_block_size);
+ }
+#if defined(_OPENMP)
+ else
+ {
+ {
+ libsais16_radix_sort_lms_suffixes_32s_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size);
+ }
+
+ #pragma omp barrier
+
+ #pragma omp master
+ {
+ libsais16_radix_sort_lms_suffixes_32s_2k_block_sort(induction_bucket, cache - block_start, block_start, block_size);
+ }
+
+ #pragma omp barrier
+
+ {
+ libsais16_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size);
+ }
+ }
+#endif
+ }
+}
+
+#endif
+
+/*
+ * Top-level driver for the 6k LMS radix sort: sequential for small inputs,
+ * otherwise processes the m-1 gathered suffixes in cache-sized blocks
+ * (threads * LIBSAIS_PER_THREAD_CACHE_SIZE at a time), handing each block to
+ * the parallel gather/sort/place pipeline.  Blocks are addressed from the
+ * end of SA, hence the (n - block_end) start offset.
+ */
+static void libsais16_radix_sort_lms_suffixes_32s_6k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+ if (threads == 1 || m < 65536)
+ {
+ libsais16_radix_sort_lms_suffixes_32s_6k(T, SA, induction_bucket, (fast_sint_t)n - (fast_sint_t)m + 1, (fast_sint_t)m - 1);
+ }
+#if defined(_OPENMP)
+ else
+ {
+ fast_sint_t block_start, block_end;
+ for (block_start = 0; block_start < (fast_sint_t)m - 1; block_start = block_end)
+ {
+ block_end = block_start + (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end >= m) { block_end = (fast_sint_t)m - 1; }
+
+ libsais16_radix_sort_lms_suffixes_32s_6k_block_omp(T, SA, induction_bucket, thread_state[0].state.cache, (fast_sint_t)n - block_end, block_end - block_start, threads);
+ }
+ }
+#else
+ UNUSED(thread_state);
+#endif
+}
+
+/*
+ * Top-level driver for the 2k LMS radix sort; identical blocking strategy to
+ * the 6k driver above, dispatching to the 2k sequential / blocked variants.
+ */
+static void libsais16_radix_sort_lms_suffixes_32s_2k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+ if (threads == 1 || m < 65536)
+ {
+ libsais16_radix_sort_lms_suffixes_32s_2k(T, SA, induction_bucket, (fast_sint_t)n - (fast_sint_t)m + 1, (fast_sint_t)m - 1);
+ }
+#if defined(_OPENMP)
+ else
+ {
+ fast_sint_t block_start, block_end;
+ for (block_start = 0; block_start < (fast_sint_t)m - 1; block_start = block_end)
+ {
+ block_end = block_start + (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end >= m) { block_end = (fast_sint_t)m - 1; }
+
+ libsais16_radix_sort_lms_suffixes_32s_2k_block_omp(T, SA, induction_bucket, thread_state[0].state.cache, (fast_sint_t)n - block_end, block_end - block_start, threads);
+ }
+ }
+#else
+ UNUSED(thread_state);
+#endif
+}
+
+/*
+ * Single-pass 1k variant: detects LMS suffixes on the fly while scanning T
+ * right-to-left (the s state machine carries suffix types; (s & 3) == 1
+ * identifies an L-after-S transition, i.e. the position after it is LMS) and
+ * immediately scatters each LMS position into its bucket.  Returns the LMS
+ * count m; c2 tracks the bucket of the most recently placed (leftmost) LMS
+ * suffix, which is overwritten with 0 when m > 1 — NOTE(review): presumably
+ * a sentinel for the following induction pass; confirm against callers.
+ */
+static sa_sint_t libsais16_radix_sort_lms_suffixes_32s_1k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ sa_sint_t i = n - 2;
+ sa_sint_t m = 0;
+ fast_uint_t s = 1;
+ fast_sint_t c0 = T[n - 1];
+ fast_sint_t c1 = 0;
+ fast_sint_t c2 = 0;
+
+ for (; i >= prefetch_distance + 3; i -= 4)
+ {
+ libsais16_prefetch(&T[i - 2 * prefetch_distance]);
+
+ libsais16_prefetchw(&buckets[T[i - prefetch_distance - 0]]);
+ libsais16_prefetchw(&buckets[T[i - prefetch_distance - 1]]);
+ libsais16_prefetchw(&buckets[T[i - prefetch_distance - 2]]);
+ libsais16_prefetchw(&buckets[T[i - prefetch_distance - 3]]);
+
+ /* c0/c1 alternate roles across the 4 unrolled steps to avoid copies. */
+ c1 = T[i - 0]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
+ if ((s & 3) == 1) { SA[--buckets[c2 = c0]] = i + 1; m++; }
+
+ c0 = T[i - 1]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+ if ((s & 3) == 1) { SA[--buckets[c2 = c1]] = i - 0; m++; }
+
+ c1 = T[i - 2]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
+ if ((s & 3) == 1) { SA[--buckets[c2 = c0]] = i - 1; m++; }
+
+ c0 = T[i - 3]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+ if ((s & 3) == 1) { SA[--buckets[c2 = c1]] = i - 2; m++; }
+ }
+
+ for (; i >= 0; i -= 1)
+ {
+ c1 = c0; c0 = T[i]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+ if ((s & 3) == 1) { SA[--buckets[c2 = c1]] = i + 1; m++; }
+ }
+
+ if (m > 1)
+ {
+ SA[buckets[c2]] = 0;
+ }
+
+ return m;
+}
+
+/*
+ * Tags SA entries at each bucket boundary with the sign bit: for every index
+ * in [omp_block_start, omp_block_start + omp_block_size), ORs SAINT_MIN into
+ * SA at the position stored in induction_bucket[i].  4x unrolled with
+ * prefetch of the bucket stream and the SA slots it points at.
+ */
+static void libsais16_radix_sort_set_markers_32s_6k(sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ fast_sint_t i, j;
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4)
+ {
+ libsais16_prefetch(&induction_bucket[i + 2 * prefetch_distance]);
+
+ libsais16_prefetchw(&SA[induction_bucket[i + prefetch_distance + 0]]);
+ libsais16_prefetchw(&SA[induction_bucket[i + prefetch_distance + 1]]);
+ libsais16_prefetchw(&SA[induction_bucket[i + prefetch_distance + 2]]);
+ libsais16_prefetchw(&SA[induction_bucket[i + prefetch_distance + 3]]);
+
+ SA[induction_bucket[i + 0]] |= SAINT_MIN;
+ SA[induction_bucket[i + 1]] |= SAINT_MIN;
+ SA[induction_bucket[i + 2]] |= SAINT_MIN;
+ SA[induction_bucket[i + 3]] |= SAINT_MIN;
+ }
+
+ for (j += prefetch_distance + 3; i < j; i += 1)
+ {
+ SA[induction_bucket[i]] |= SAINT_MIN;
+ }
+}
+
+/*
+ * 4k-layout counterpart of the marker pass above: buckets are addressed via
+ * BUCKETS_INDEX2(i, 0) and the tag ORed into SA is SUFFIX_GROUP_MARKER
+ * instead of the sign bit.
+ */
+static void libsais16_radix_sort_set_markers_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ fast_sint_t i, j;
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4)
+ {
+ libsais16_prefetch(&induction_bucket[BUCKETS_INDEX2(i + 2 * prefetch_distance, 0)]);
+
+ libsais16_prefetchw(&SA[induction_bucket[BUCKETS_INDEX2(i + prefetch_distance + 0, 0)]]);
+ libsais16_prefetchw(&SA[induction_bucket[BUCKETS_INDEX2(i + prefetch_distance + 1, 0)]]);
+ libsais16_prefetchw(&SA[induction_bucket[BUCKETS_INDEX2(i + prefetch_distance + 2, 0)]]);
+ libsais16_prefetchw(&SA[induction_bucket[BUCKETS_INDEX2(i + prefetch_distance + 3, 0)]]);
+
+ SA[induction_bucket[BUCKETS_INDEX2(i + 0, 0)]] |= SUFFIX_GROUP_MARKER;
+ SA[induction_bucket[BUCKETS_INDEX2(i + 1, 0)]] |= SUFFIX_GROUP_MARKER;
+ SA[induction_bucket[BUCKETS_INDEX2(i + 2, 0)]] |= SUFFIX_GROUP_MARKER;
+ SA[induction_bucket[BUCKETS_INDEX2(i + 3, 0)]] |= SUFFIX_GROUP_MARKER;
+ }
+
+ for (j += prefetch_distance + 3; i < j; i += 1)
+ {
+ SA[induction_bucket[BUCKETS_INDEX2(i, 0)]] |= SUFFIX_GROUP_MARKER;
+ }
+}
+
+/*
+ * Parallel wrapper for the 6k marker pass: splits the symbol range
+ * [0, k - 1) into 16-aligned per-thread chunks (last thread takes the
+ * remainder) and runs the sequential marker routine on each.
+ */
+static void libsais16_radix_sort_set_markers_32s_6k_omp(sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads)
+{
+#if defined(_OPENMP)
+ #pragma omp parallel num_threads(threads) if(threads > 1 && k >= 65536)
+#endif
+ {
+#if defined(_OPENMP)
+ fast_sint_t omp_thread_num = omp_get_thread_num();
+ fast_sint_t omp_num_threads = omp_get_num_threads();
+ fast_sint_t omp_block_stride = (((fast_sint_t)k - 1) / omp_num_threads) & (-16);
+ fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+ fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : (fast_sint_t)k - 1 - omp_block_start;
+#else
+ UNUSED(threads);
+
+ fast_sint_t omp_block_start = 0;
+ fast_sint_t omp_block_size = (fast_sint_t)k - 1;
+#endif
+
+ libsais16_radix_sort_set_markers_32s_6k(SA, induction_bucket, omp_block_start, omp_block_size);
+ }
+}
+
+/*
+ * Parallel wrapper for the 4k marker pass; identical partitioning to the
+ * 6k wrapper above.
+ */
+static void libsais16_radix_sort_set_markers_32s_4k_omp(sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads)
+{
+#if defined(_OPENMP)
+ #pragma omp parallel num_threads(threads) if(threads > 1 && k >= 65536)
+#endif
+ {
+#if defined(_OPENMP)
+ fast_sint_t omp_thread_num = omp_get_thread_num();
+ fast_sint_t omp_num_threads = omp_get_num_threads();
+ fast_sint_t omp_block_stride = (((fast_sint_t)k - 1) / omp_num_threads) & (-16);
+ fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+ fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : (fast_sint_t)k - 1 - omp_block_start;
+#else
+ UNUSED(threads);
+
+ fast_sint_t omp_block_start = 0;
+ fast_sint_t omp_block_size = (fast_sint_t)k - 1;
+#endif
+
+ libsais16_radix_sort_set_markers_32s_4k(SA, induction_bucket, omp_block_start, omp_block_size);
+ }
+}
+
+/*
+ * Prepares buckets for the partial sorting (induction) stage over a 16-bit
+ * alphabet: bumps the counter of the first LMS suffix's symbol, then in one
+ * sweep builds the temp bucket's per-symbol scatter offsets (sum0, seeded
+ * with left_suffixes_count + 1) while compacting running totals of the
+ * remaining sub-type counts back into the low bucket area.
+ */
+static void libsais16_initialize_buckets_for_partial_sorting_16u(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count)
+{
+ sa_sint_t * RESTRICT temp_bucket = &buckets[4 * ALPHABET_SIZE];
+
+ buckets[BUCKETS_INDEX4((fast_uint_t)T[first_lms_suffix], 1)]++;
+
+ fast_sint_t i, j; sa_sint_t sum0 = left_suffixes_count + 1, sum1 = 0;
+ for (i = BUCKETS_INDEX4(0, 0), j = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX4(ALPHABET_SIZE - 1, 0); i += BUCKETS_INDEX4(1, 0), j += BUCKETS_INDEX2(1, 0))
+ {
+ temp_bucket[j + BUCKETS_INDEX2(0, 0)] = sum0;
+
+ sum0 += buckets[i + BUCKETS_INDEX4(0, 0)] + buckets[i + BUCKETS_INDEX4(0, 2)];
+ sum1 += buckets[i + BUCKETS_INDEX4(0, 1)];
+
+ /* j < i throughout, so these writes never clobber counts not yet read. */
+ buckets[j + BUCKETS_INDEX2(0, 0)] = sum0;
+ buckets[j + BUCKETS_INDEX2(0, 1)] = sum1;
+ }
+}
+
+/*
+ * 32-bit/6k counterpart of the partial-sorting bucket setup.  Processes the
+ * four per-symbol counters (SS/LS/SL/LL) in two loops split at the first LMS
+ * suffix's symbol — the second loop starts after bumping sum1 by one, which
+ * accounts for that suffix (NOTE(review): confirm intent against the 16u
+ * variant's explicit counter increment).  Each step converts the counters
+ * in place into scatter offsets (sum0 at slot 0, sum2 at slot 1, zeroing
+ * slots 2 and 3) and records sum0/sum1 in the temp bucket at buckets[4k].
+ */
+static void libsais16_initialize_buckets_for_partial_sorting_32s_6k(const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count)
+{
+ sa_sint_t * RESTRICT temp_bucket = &buckets[4 * k];
+
+ fast_sint_t i, j; sa_sint_t sum0 = left_suffixes_count + 1, sum1 = 0, sum2 = 0;
+ for (first_lms_suffix = T[first_lms_suffix], i = BUCKETS_INDEX4(0, 0), j = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX4((fast_sint_t)first_lms_suffix - 1, 0); i += BUCKETS_INDEX4(1, 0), j += BUCKETS_INDEX2(1, 0))
+ {
+ sa_sint_t SS = buckets[i + BUCKETS_INDEX4(0, 0)];
+ sa_sint_t LS = buckets[i + BUCKETS_INDEX4(0, 1)];
+ sa_sint_t SL = buckets[i + BUCKETS_INDEX4(0, 2)];
+ sa_sint_t LL = buckets[i + BUCKETS_INDEX4(0, 3)];
+
+ buckets[i + BUCKETS_INDEX4(0, 0)] = sum0;
+ buckets[i + BUCKETS_INDEX4(0, 1)] = sum2;
+ buckets[i + BUCKETS_INDEX4(0, 2)] = 0;
+ buckets[i + BUCKETS_INDEX4(0, 3)] = 0;
+
+ sum0 += SS + SL; sum1 += LS; sum2 += LS + LL;
+
+ temp_bucket[j + BUCKETS_INDEX2(0, 0)] = sum0;
+ temp_bucket[j + BUCKETS_INDEX2(0, 1)] = sum1;
+ }
+
+ for (sum1 += 1; i <= BUCKETS_INDEX4((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX4(1, 0), j += BUCKETS_INDEX2(1, 0))
+ {
+ sa_sint_t SS = buckets[i + BUCKETS_INDEX4(0, 0)];
+ sa_sint_t LS = buckets[i + BUCKETS_INDEX4(0, 1)];
+ sa_sint_t SL = buckets[i + BUCKETS_INDEX4(0, 2)];
+ sa_sint_t LL = buckets[i + BUCKETS_INDEX4(0, 3)];
+
+ buckets[i + BUCKETS_INDEX4(0, 0)] = sum0;
+ buckets[i + BUCKETS_INDEX4(0, 1)] = sum2;
+ buckets[i + BUCKETS_INDEX4(0, 2)] = 0;
+ buckets[i + BUCKETS_INDEX4(0, 3)] = 0;
+
+ sum0 += SS + SL; sum1 += LS; sum2 += LS + LL;
+
+ temp_bucket[j + BUCKETS_INDEX2(0, 0)] = sum0;
+ temp_bucket[j + BUCKETS_INDEX2(0, 1)] = sum1;
+ }
+}
+
+/*
+ * One left-to-right induction scan over SA[omp_block_start ..): for each
+ * entry p (sign bit carries a group-boundary flag that increments the
+ * distinct-name counter d), computes the predecessor suffix p-1's bucket key
+ * v from (T[p-1], type of p-2 vs p-1), appends p-1 at the bucket head, and
+ * sets its sign bit when this bucket has not yet seen the current name d
+ * (distinct_names[v] != d).  Returns the updated name counter.
+ */
+static sa_sint_t libsais16_partial_sorting_scan_left_to_right_16u(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ sa_sint_t * RESTRICT induction_bucket = &buckets[4 * ALPHABET_SIZE];
+ sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE];
+
+ fast_sint_t i, j;
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2)
+ {
+ libsais16_prefetch(&SA[i + 2 * prefetch_distance]);
+
+ libsais16_prefetch(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 1);
+ libsais16_prefetch(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 2);
+ libsais16_prefetch(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 1);
+ libsais16_prefetch(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 2);
+
+ sa_sint_t p0 = SA[i + 0]; d += (p0 < 0); p0 &= SAINT_MAX; sa_sint_t v0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] >= T[p0 - 1]);
+ SA[induction_bucket[v0]++] = (p0 - 1) | ((sa_sint_t)(distinct_names[v0] != d) << (SAINT_BIT - 1)); distinct_names[v0] = d;
+
+ sa_sint_t p1 = SA[i + 1]; d += (p1 < 0); p1 &= SAINT_MAX; sa_sint_t v1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] >= T[p1 - 1]);
+ SA[induction_bucket[v1]++] = (p1 - 1) | ((sa_sint_t)(distinct_names[v1] != d) << (SAINT_BIT - 1)); distinct_names[v1] = d;
+ }
+
+ for (j += prefetch_distance + 1; i < j; i += 1)
+ {
+ sa_sint_t p = SA[i]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] >= T[p - 1]);
+ SA[induction_bucket[v]++] = (p - 1) | ((sa_sint_t)(distinct_names[v] != d) << (SAINT_BIT - 1)); distinct_names[v] = d;
+ }
+
+ return d;
+}
+
+#if defined(_OPENMP)
+
+/*
+ * Per-thread prepare phase of the blocked left-to-right induction scan:
+ * replays the same per-entry computation as the sequential scan, but instead
+ * of writing to SA it records (index, bucket key) pairs into this thread's
+ * cache, counts keys into private buckets (zeroed here), and tracks the
+ * local distinct-name increments (d starts at 1).  The resulting local name
+ * delta and pair count are stored in state for the merge/place phase.
+ */
+static void libsais16_partial_sorting_scan_left_to_right_16u_block_prepare(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size, LIBSAIS_THREAD_STATE * RESTRICT state)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE];
+ sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE];
+
+ memset(buckets, 0, 4 * ALPHABET_SIZE * sizeof(sa_sint_t));
+
+ fast_sint_t i, j, count = 0; sa_sint_t d = 1;
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2)
+ {
+ libsais16_prefetch(&SA[i + 2 * prefetch_distance]);
+
+ libsais16_prefetch(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 1);
+ libsais16_prefetch(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 2);
+ libsais16_prefetch(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 1);
+ libsais16_prefetch(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 2);
+
+ sa_sint_t p0 = cache[count].index = SA[i + 0]; d += (p0 < 0); p0 &= SAINT_MAX; sa_sint_t v0 = cache[count++].symbol = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] >= T[p0 - 1]); induction_bucket[v0]++; distinct_names[v0] = d;
+ sa_sint_t p1 = cache[count].index = SA[i + 1]; d += (p1 < 0); p1 &= SAINT_MAX; sa_sint_t v1 = cache[count++].symbol = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] >= T[p1 - 1]); induction_bucket[v1]++; distinct_names[v1] = d;
+ }
+
+ for (j += prefetch_distance + 1; i < j; i += 1)
+ {
+ sa_sint_t p = cache[count].index = SA[i]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = cache[count++].symbol = BUCKETS_INDEX2(T[p - 1], T[p - 2] >= T[p - 1]); induction_bucket[v]++; distinct_names[v] = d;
+ }
+
+ /* Export this block's name-counter delta and cached-entry count. */
+ state[0].state.position = (fast_sint_t)d - 1;
+ state[0].state.count = count;
+}
+
+/* Thread-local second phase of the parallel 16u left-to-right scan: replays the
+   (index, symbol) pairs recorded by the "prepare" phase and writes the induced
+   entries into SA at this thread's (already rebased) bucket offsets.  The sign
+   bit of each stored entry marks a distinct-name group boundary, exactly as in
+   the sequential scan.  'd' arrives pre-adjusted by the master merge step. */
+static void libsais16_partial_sorting_scan_left_to_right_16u_block_place(sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count, sa_sint_t d)
+{
+    const fast_sint_t prefetch_distance = 32;
+
+    sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE];
+    sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE];
+
+    fast_sint_t i, j;
+    /* Unrolled x2 over the cached entries; tail loop below handles the rest. */
+    for (i = 0, j = count - 1; i < j; i += 2)
+    {
+        libsais16_prefetch(&cache[i + prefetch_distance]);
+
+        sa_sint_t p0 = cache[i + 0].index; d += (p0 < 0); sa_sint_t v0 = cache[i + 0].symbol;
+        SA[induction_bucket[v0]++] = (p0 - 1) | ((sa_sint_t)(distinct_names[v0] != d) << (SAINT_BIT - 1)); distinct_names[v0] = d;
+
+        sa_sint_t p1 = cache[i + 1].index; d += (p1 < 0); sa_sint_t v1 = cache[i + 1].symbol;
+        SA[induction_bucket[v1]++] = (p1 - 1) | ((sa_sint_t)(distinct_names[v1] != d) << (SAINT_BIT - 1)); distinct_names[v1] = d;
+    }
+
+    for (j += 1; i < j; i += 1)
+    {
+        sa_sint_t p = cache[i].index; d += (p < 0); sa_sint_t v = cache[i].symbol;
+        SA[induction_bucket[v]++] = (p - 1) | ((sa_sint_t)(distinct_names[v] != d) << (SAINT_BIT - 1)); distinct_names[v] = d;
+    }
+}
+
+/* Parallel driver for one block of the 16u left-to-right scan.  Splits the
+   block into 16-aligned per-thread stripes; with one thread it falls back to
+   the sequential scan.  Otherwise: (1) each thread runs "prepare" on its
+   stripe into private buckets/cache, (2) the master merges the per-thread
+   bucket counts into the shared offsets via exclusive prefix sums and rebases
+   each thread's distinct-name counter, (3) each thread runs "place".
+   Returns the updated distinct-name counter d. */
+static sa_sint_t libsais16_partial_sorting_scan_left_to_right_16u_block_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+#if defined(_OPENMP)
+    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * ALPHABET_SIZE && omp_get_dynamic() == 0)
+#endif
+    {
+#if defined(_OPENMP)
+        fast_sint_t omp_thread_num = omp_get_thread_num();
+        fast_sint_t omp_num_threads = omp_get_num_threads();
+#else
+        UNUSED(threads); UNUSED(thread_state);
+
+        fast_sint_t omp_thread_num = 0;
+        fast_sint_t omp_num_threads = 1;
+#endif
+        /* Stripe size is rounded down to a multiple of 16; the last thread
+           absorbs the remainder. */
+        fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
+        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
+
+        omp_block_start += block_start;
+
+        if (omp_num_threads == 1)
+        {
+            d = libsais16_partial_sorting_scan_left_to_right_16u(T, SA, buckets, d, omp_block_start, omp_block_size);
+        }
+#if defined(_OPENMP)
+        else
+        {
+            {
+                libsais16_partial_sorting_scan_left_to_right_16u_block_prepare(T, SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size, &thread_state[omp_thread_num]);
+            }
+
+            #pragma omp barrier
+
+            #pragma omp master
+            {
+                sa_sint_t * RESTRICT induction_bucket = &buckets[4 * ALPHABET_SIZE];
+                sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE];
+
+                fast_sint_t t;
+                for (t = 0; t < omp_num_threads; ++t)
+                {
+                    sa_sint_t * RESTRICT temp_induction_bucket = &thread_state[t].state.buckets[0 * ALPHABET_SIZE];
+                    sa_sint_t * RESTRICT temp_distinct_names = &thread_state[t].state.buckets[2 * ALPHABET_SIZE];
+
+                    /* Exclusive prefix sum: each thread gets the bucket offsets
+                       as they stood before its own contribution. */
+                    fast_sint_t c;
+                    for (c = 0; c < 2 * ALPHABET_SIZE; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_induction_bucket[c]; induction_bucket[c] = A + B; temp_induction_bucket[c] = A; }
+
+                    /* Rebase each thread's last-seen distinct names by the
+                       running d, and advance d by the thread's local delta. */
+                    for (d -= 1, c = 0; c < 2 * ALPHABET_SIZE; c += 1) { sa_sint_t A = distinct_names[c], B = temp_distinct_names[c], D = B + d; distinct_names[c] = B > 0 ? D : A; temp_distinct_names[c] = A; }
+                    d += 1 + (sa_sint_t)thread_state[t].state.position; thread_state[t].state.position = (fast_sint_t)d - thread_state[t].state.position;
+                }
+            }
+
+            #pragma omp barrier
+
+            {
+                libsais16_partial_sorting_scan_left_to_right_16u_block_place(SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count, (sa_sint_t)thread_state[omp_thread_num].state.position);
+            }
+        }
+#endif
+    }
+
+    return d;
+}
+
+#endif
+
+/* Top-level 16u left-to-right partial sorting scan.  Seeds the induction with
+   suffix n - 1 (sign bit set, new distinct name), then either runs the whole
+   range sequentially or, with OpenMP, walks it in parallel blocks.  Block
+   boundaries stop at SA entries equal to 0 (not-yet-induced slots), which are
+   skipped one at a time; very small blocks are handled inline to avoid the
+   parallel overhead.  Returns the updated distinct-name counter d. */
+static sa_sint_t libsais16_partial_sorting_scan_left_to_right_16u_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t left_suffixes_count, sa_sint_t d, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+    sa_sint_t * RESTRICT induction_bucket = &buckets[4 * ALPHABET_SIZE];
+    sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE];
+
+    SA[induction_bucket[BUCKETS_INDEX2(T[n - 1], T[n - 2] >= T[n - 1])]++] = (n - 1) | SAINT_MIN;
+    distinct_names[BUCKETS_INDEX2(T[n - 1], T[n - 2] >= T[n - 1])] = ++d;
+
+    if (threads == 1 || left_suffixes_count < 65536)
+    {
+        d = libsais16_partial_sorting_scan_left_to_right_16u(T, SA, buckets, d, 0, left_suffixes_count);
+    }
+#if defined(_OPENMP)
+    else
+    {
+        fast_sint_t block_start;
+        for (block_start = 0; block_start < left_suffixes_count; )
+        {
+            if (SA[block_start] == 0)
+            {
+                block_start++;
+            }
+            else
+            {
+                /* Cap the block by the combined per-thread cache capacity. */
+                fast_sint_t block_max_end = block_start + ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads); if (block_max_end > left_suffixes_count) { block_max_end = left_suffixes_count;}
+                fast_sint_t block_end = block_start + 1; while (block_end < block_max_end && SA[block_end] != 0) { block_end++; }
+                fast_sint_t block_size = block_end - block_start;
+
+                if (block_size < 32)
+                {
+                    /* Tiny block: induce sequentially right here. */
+                    for (; block_start < block_end; block_start += 1)
+                    {
+                        sa_sint_t p = SA[block_start]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] >= T[p - 1]);
+                        SA[induction_bucket[v]++] = (p - 1) | ((sa_sint_t)(distinct_names[v] != d) << (SAINT_BIT - 1)); distinct_names[v] = d;
+                    }
+                }
+                else
+                {
+                    d = libsais16_partial_sorting_scan_left_to_right_16u_block_omp(T, SA, buckets, d, block_start, block_size, threads, thread_state);
+                    block_start = block_end;
+                }
+            }
+        }
+    }
+#else
+    UNUSED(thread_state);
+#endif
+
+    return d;
+}
+
+/* Sequential left-to-right scan for the 32-bit-alphabet path using the 6k
+   bucket layout: BUCKETS_INDEX4 addresses a 4-wide slot per symbol, where
+   slot +0 holds the induction offset and slot +2 the last distinct name.
+   The sign bit of each SA entry advances d; the sign bit of each stored
+   entry marks a distinct-name boundary.  Returns the updated d. */
+static sa_sint_t libsais16_partial_sorting_scan_left_to_right_32s_6k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+    const fast_sint_t prefetch_distance = 32;
+
+    fast_sint_t i, j;
+    /* Two-stage prefetch: text at 2x distance, bucket slots at 1x distance. */
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1; i < j; i += 2)
+    {
+        libsais16_prefetch(&SA[i + 3 * prefetch_distance]);
+
+        libsais16_prefetch(&T[SA[i + 2 * prefetch_distance + 0] & SAINT_MAX] - 1);
+        libsais16_prefetch(&T[SA[i + 2 * prefetch_distance + 0] & SAINT_MAX] - 2);
+        libsais16_prefetch(&T[SA[i + 2 * prefetch_distance + 1] & SAINT_MAX] - 1);
+        libsais16_prefetch(&T[SA[i + 2 * prefetch_distance + 1] & SAINT_MAX] - 2);
+
+        /* p - (p > 0) keeps the lookup in range when the entry is still 0. */
+        sa_sint_t p0 = SA[i + prefetch_distance + 0] & SAINT_MAX; sa_sint_t v0 = BUCKETS_INDEX4(T[p0 - (p0 > 0)], 0); libsais16_prefetchw(&buckets[v0]);
+        sa_sint_t p1 = SA[i + prefetch_distance + 1] & SAINT_MAX; sa_sint_t v1 = BUCKETS_INDEX4(T[p1 - (p1 > 0)], 0); libsais16_prefetchw(&buckets[v1]);
+
+        sa_sint_t p2 = SA[i + 0]; d += (p2 < 0); p2 &= SAINT_MAX; sa_sint_t v2 = BUCKETS_INDEX4(T[p2 - 1], T[p2 - 2] >= T[p2 - 1]);
+        SA[buckets[v2]++] = (p2 - 1) | ((sa_sint_t)(buckets[2 + v2] != d) << (SAINT_BIT - 1)); buckets[2 + v2] = d;
+
+        sa_sint_t p3 = SA[i + 1]; d += (p3 < 0); p3 &= SAINT_MAX; sa_sint_t v3 = BUCKETS_INDEX4(T[p3 - 1], T[p3 - 2] >= T[p3 - 1]);
+        SA[buckets[v3]++] = (p3 - 1) | ((sa_sint_t)(buckets[2 + v3] != d) << (SAINT_BIT - 1)); buckets[2 + v3] = d;
+    }
+
+    for (j += 2 * prefetch_distance + 1; i < j; i += 1)
+    {
+        sa_sint_t p = SA[i]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = BUCKETS_INDEX4(T[p - 1], T[p - 2] >= T[p - 1]);
+        SA[buckets[v]++] = (p - 1) | ((sa_sint_t)(buckets[2 + v] != d) << (SAINT_BIT - 1)); buckets[2 + v] = d;
+    }
+
+    return d;
+}
+
+/* Sequential left-to-right scan for the 32-bit-alphabet path with the 4k
+   bucket layout: distinct names live at buckets[0..2k), induction offsets at
+   buckets[2k..).  Only positive SA entries are induced; each processed entry
+   is cleared to 0, and SUFFIX_GROUP_MARKER / the SUFFIX_GROUP_BIT carry the
+   group-boundary information instead of the plain sign bit.
+   Returns the updated distinct-name counter d. */
+static sa_sint_t libsais16_partial_sorting_scan_left_to_right_32s_4k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+    const fast_sint_t prefetch_distance = 32;
+
+    sa_sint_t * RESTRICT induction_bucket = &buckets[2 * k];
+    sa_sint_t * RESTRICT distinct_names = &buckets[0 * k];
+
+    fast_sint_t i, j;
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1; i < j; i += 2)
+    {
+        libsais16_prefetchw(&SA[i + 3 * prefetch_distance]);
+
+        /* Conditional prefetches: NULL pointer when the entry is not inducible. */
+        sa_sint_t s0 = SA[i + 2 * prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0 & ~SUFFIX_GROUP_MARKER] - 1; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16_prefetch(s0 > 0 ? Ts0 : NULL);
+        sa_sint_t s1 = SA[i + 2 * prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1 & ~SUFFIX_GROUP_MARKER] - 1; libsais16_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16_prefetch(s1 > 0 ? Ts1 : NULL);
+        sa_sint_t s2 = SA[i + 1 * prefetch_distance + 0]; if (s2 > 0) { const fast_sint_t Ts2 = T[(s2 & ~SUFFIX_GROUP_MARKER) - 1]; libsais16_prefetchw(&induction_bucket[Ts2]); libsais16_prefetchw(&distinct_names[BUCKETS_INDEX2(Ts2, 0)]); }
+        sa_sint_t s3 = SA[i + 1 * prefetch_distance + 1]; if (s3 > 0) { const fast_sint_t Ts3 = T[(s3 & ~SUFFIX_GROUP_MARKER) - 1]; libsais16_prefetchw(&induction_bucket[Ts3]); libsais16_prefetchw(&distinct_names[BUCKETS_INDEX2(Ts3, 0)]); }
+
+        sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 & SAINT_MAX;
+        if (p0 > 0)
+        {
+            /* The SUFFIX_GROUP_BIT of the loaded entry advances d. */
+            SA[i + 0] = 0; d += (p0 >> (SUFFIX_GROUP_BIT - 1)); p0 &= ~SUFFIX_GROUP_MARKER; sa_sint_t v0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] < T[p0 - 1]);
+            SA[induction_bucket[T[p0 - 1]]++] = (p0 - 1) | ((sa_sint_t)(T[p0 - 2] < T[p0 - 1]) << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v0] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v0] = d;
+        }
+
+        sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 & SAINT_MAX;
+        if (p1 > 0)
+        {
+            SA[i + 1] = 0; d += (p1 >> (SUFFIX_GROUP_BIT - 1)); p1 &= ~SUFFIX_GROUP_MARKER; sa_sint_t v1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] < T[p1 - 1]);
+            SA[induction_bucket[T[p1 - 1]]++] = (p1 - 1) | ((sa_sint_t)(T[p1 - 2] < T[p1 - 1]) << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v1] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v1] = d;
+        }
+    }
+
+    for (j += 2 * prefetch_distance + 1; i < j; i += 1)
+    {
+        sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX;
+        if (p > 0)
+        {
+            SA[i] = 0; d += (p >> (SUFFIX_GROUP_BIT - 1)); p &= ~SUFFIX_GROUP_MARKER; sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] < T[p - 1]);
+            SA[induction_bucket[T[p - 1]]++] = (p - 1) | ((sa_sint_t)(T[p - 2] < T[p - 1]) << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v] = d;
+        }
+    }
+
+    return d;
+}
+
+/* Sequential left-to-right scan for the most memory-frugal 32-bit path: a
+   single induction bucket per symbol and no distinct-name tracking.  Positive
+   SA entries are induced (stored with the T[p-2] < T[p-1] flag in the sign
+   bit) and the source slot is cleared to 0; non-positive entries just have
+   their sign bit stripped. */
+static void libsais16_partial_sorting_scan_left_to_right_32s_1k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+    const fast_sint_t prefetch_distance = 32;
+
+    fast_sint_t i, j;
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1; i < j; i += 2)
+    {
+        libsais16_prefetchw(&SA[i + 3 * prefetch_distance]);
+
+        sa_sint_t s0 = SA[i + 2 * prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0] - 1; libsais16_prefetch(s0 > 0 ? Ts0 : NULL);
+        sa_sint_t s1 = SA[i + 2 * prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1] - 1; libsais16_prefetch(s1 > 0 ? Ts1 : NULL);
+        sa_sint_t s2 = SA[i + 1 * prefetch_distance + 0]; if (s2 > 0) { libsais16_prefetchw(&induction_bucket[T[s2 - 1]]); libsais16_prefetch(&T[s2] - 2); }
+        sa_sint_t s3 = SA[i + 1 * prefetch_distance + 1]; if (s3 > 0) { libsais16_prefetchw(&induction_bucket[T[s3 - 1]]); libsais16_prefetch(&T[s3] - 2); }
+
+        sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 & SAINT_MAX; if (p0 > 0) { SA[i + 0] = 0; SA[induction_bucket[T[p0 - 1]]++] = (p0 - 1) | ((sa_sint_t)(T[p0 - 2] < T[p0 - 1]) << (SAINT_BIT - 1)); }
+        sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 & SAINT_MAX; if (p1 > 0) { SA[i + 1] = 0; SA[induction_bucket[T[p1 - 1]]++] = (p1 - 1) | ((sa_sint_t)(T[p1 - 2] < T[p1 - 1]) << (SAINT_BIT - 1)); }
+    }
+
+    for (j += 2 * prefetch_distance + 1; i < j; i += 1)
+    {
+        sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { SA[i] = 0; SA[induction_bucket[T[p - 1]]++] = (p - 1) | ((sa_sint_t)(T[p - 2] < T[p - 1]) << (SAINT_BIT - 1)); }
+    }
+}
+
+#if defined(_OPENMP)
+
+/* Per-thread gather phase of the parallel 32s/6k scan: copies each SA entry
+   into cache[i].index and precomputes its BUCKETS_INDEX4 symbol into
+   cache[i].symbol (0 when the masked index is 0, i.e. nothing to induce).
+   The bucket updates themselves happen later in the single-threaded sort
+   phase; this phase only touches T and the cache. */
+static void libsais16_partial_sorting_scan_left_to_right_32s_6k_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+    const fast_sint_t prefetch_distance = 32;
+
+    fast_sint_t i, j;
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2)
+    {
+        libsais16_prefetch(&SA[i + 2 * prefetch_distance]);
+
+        libsais16_prefetch(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 1);
+        libsais16_prefetch(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 2);
+        libsais16_prefetch(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 1);
+        libsais16_prefetch(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 2);
+
+        libsais16_prefetchw(&cache[i + prefetch_distance]);
+
+        sa_sint_t p0 = cache[i + 0].index = SA[i + 0]; sa_sint_t symbol0 = 0; p0 &= SAINT_MAX; if (p0 != 0) { symbol0 = BUCKETS_INDEX4(T[p0 - 1], T[p0 - 2] >= T[p0 - 1]); } cache[i + 0].symbol = symbol0;
+        sa_sint_t p1 = cache[i + 1].index = SA[i + 1]; sa_sint_t symbol1 = 0; p1 &= SAINT_MAX; if (p1 != 0) { symbol1 = BUCKETS_INDEX4(T[p1 - 1], T[p1 - 2] >= T[p1 - 1]); } cache[i + 1].symbol = symbol1;
+    }
+
+    for (j += prefetch_distance + 1; i < j; i += 1)
+    {
+        sa_sint_t p = cache[i].index = SA[i]; sa_sint_t symbol = 0; p &= SAINT_MAX; if (p != 0) { symbol = BUCKETS_INDEX4(T[p - 1], T[p - 2] >= T[p - 1]); } cache[i].symbol = symbol;
+    }
+}
+
+/* Per-thread gather phase of the parallel 32s/4k scan.  Positive entries get
+   their raw value cached plus a non-negative BUCKETS_INDEX2 symbol and their
+   SA slot cleared to 0; non-inducible entries are flagged with symbol
+   SAINT_MIN and merely have their sign bit stripped in SA. */
+static void libsais16_partial_sorting_scan_left_to_right_32s_4k_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+    const fast_sint_t prefetch_distance = 32;
+
+    fast_sint_t i, j;
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2)
+    {
+        libsais16_prefetchw(&SA[i + 2 * prefetch_distance]);
+
+        sa_sint_t s0 = SA[i + prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0 & ~SUFFIX_GROUP_MARKER] - 1; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16_prefetch(s0 > 0 ? Ts0 : NULL);
+        sa_sint_t s1 = SA[i + prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1 & ~SUFFIX_GROUP_MARKER] - 1; libsais16_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16_prefetch(s1 > 0 ? Ts1 : NULL);
+
+        libsais16_prefetchw(&cache[i + prefetch_distance]);
+
+        sa_sint_t symbol0 = SAINT_MIN, p0 = SA[i + 0]; if (p0 > 0) { cache[i + 0].index = p0; p0 &= ~SUFFIX_GROUP_MARKER; symbol0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] < T[p0 - 1]); p0 = 0; } cache[i + 0].symbol = symbol0; SA[i + 0] = p0 & SAINT_MAX;
+        sa_sint_t symbol1 = SAINT_MIN, p1 = SA[i + 1]; if (p1 > 0) { cache[i + 1].index = p1; p1 &= ~SUFFIX_GROUP_MARKER; symbol1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] < T[p1 - 1]); p1 = 0; } cache[i + 1].symbol = symbol1; SA[i + 1] = p1 & SAINT_MAX;
+    }
+
+    for (j += prefetch_distance + 1; i < j; i += 1)
+    {
+        sa_sint_t symbol = SAINT_MIN, p = SA[i]; if (p > 0) { cache[i].index = p; p &= ~SUFFIX_GROUP_MARKER; symbol = BUCKETS_INDEX2(T[p - 1], T[p - 2] < T[p - 1]); p = 0; } cache[i].symbol = symbol; SA[i] = p & SAINT_MAX;
+    }
+}
+
+/* Per-thread gather phase of the parallel 32s/1k scan.  For positive entries
+   the cache stores the fully-formed induced value ((p - 1) with the
+   T[p-2] < T[p-1] flag in the sign bit) and the destination symbol T[p - 1];
+   the SA slot is cleared.  Non-inducible entries get symbol SAINT_MIN. */
+static void libsais16_partial_sorting_scan_left_to_right_32s_1k_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+    const fast_sint_t prefetch_distance = 32;
+
+    fast_sint_t i, j;
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2)
+    {
+        libsais16_prefetchw(&SA[i + 2 * prefetch_distance]);
+
+        sa_sint_t s0 = SA[i + prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0] - 1; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16_prefetch(s0 > 0 ? Ts0 : NULL);
+        sa_sint_t s1 = SA[i + prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1] - 1; libsais16_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16_prefetch(s1 > 0 ? Ts1 : NULL);
+
+        libsais16_prefetchw(&cache[i + prefetch_distance]);
+
+        sa_sint_t symbol0 = SAINT_MIN, p0 = SA[i + 0]; if (p0 > 0) { cache[i + 0].index = (p0 - 1) | ((sa_sint_t)(T[p0 - 2] < T[p0 - 1]) << (SAINT_BIT - 1)); symbol0 = T[p0 - 1]; p0 = 0; } cache[i + 0].symbol = symbol0; SA[i + 0] = p0 & SAINT_MAX;
+        sa_sint_t symbol1 = SAINT_MIN, p1 = SA[i + 1]; if (p1 > 0) { cache[i + 1].index = (p1 - 1) | ((sa_sint_t)(T[p1 - 2] < T[p1 - 1]) << (SAINT_BIT - 1)); symbol1 = T[p1 - 1]; p1 = 0; } cache[i + 1].symbol = symbol1; SA[i + 1] = p1 & SAINT_MAX;
+    }
+
+    for (j += prefetch_distance + 1; i < j; i += 1)
+    {
+        sa_sint_t symbol = SAINT_MIN, p = SA[i]; if (p > 0) { cache[i].index = (p - 1) | ((sa_sint_t)(T[p - 2] < T[p - 1]) << (SAINT_BIT - 1)); symbol = T[p - 1]; p = 0; } cache[i].symbol = symbol; SA[i] = p & SAINT_MAX;
+    }
+}
+
+/* Master-only sort phase of the parallel 32s/6k scan: walks the concatenated
+   per-thread cache in order, performs the sequential bucket updates, and
+   overwrites cache[i].symbol with the destination SA position.  If that
+   destination falls inside the current block, the newly induced entry is
+   chased immediately (its cache slot is filled with the induced index and a
+   freshly computed symbol) so the in-block dependency chain is preserved.
+   Returns the updated distinct-name counter d. */
+static sa_sint_t libsais16_partial_sorting_scan_left_to_right_32s_6k_block_sort(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+    const fast_sint_t prefetch_distance = 32;
+
+    fast_sint_t i, j, omp_block_end = omp_block_start + omp_block_size;
+    for (i = omp_block_start, j = omp_block_end - prefetch_distance - 1; i < j; i += 2)
+    {
+        libsais16_prefetchw(&cache[i + 2 * prefetch_distance]);
+
+        libsais16_prefetchw(&buckets[cache[i + prefetch_distance + 0].symbol]);
+        libsais16_prefetchw(&buckets[cache[i + prefetch_distance + 1].symbol]);
+
+        sa_sint_t v0 = cache[i + 0].symbol, p0 = cache[i + 0].index; d += (p0 < 0); cache[i + 0].symbol = buckets[v0]++; cache[i + 0].index = (p0 - 1) | ((sa_sint_t)(buckets[2 + v0] != d) << (SAINT_BIT - 1)); buckets[2 + v0] = d;
+        if (cache[i + 0].symbol < omp_block_end) { sa_sint_t s = cache[i + 0].symbol, q = (cache[s].index = cache[i + 0].index) & SAINT_MAX; cache[s].symbol = BUCKETS_INDEX4(T[q - 1], T[q - 2] >= T[q - 1]); }
+
+        sa_sint_t v1 = cache[i + 1].symbol, p1 = cache[i + 1].index; d += (p1 < 0); cache[i + 1].symbol = buckets[v1]++; cache[i + 1].index = (p1 - 1) | ((sa_sint_t)(buckets[2 + v1] != d) << (SAINT_BIT - 1)); buckets[2 + v1] = d;
+        if (cache[i + 1].symbol < omp_block_end) { sa_sint_t s = cache[i + 1].symbol, q = (cache[s].index = cache[i + 1].index) & SAINT_MAX; cache[s].symbol = BUCKETS_INDEX4(T[q - 1], T[q - 2] >= T[q - 1]); }
+    }
+
+    for (j += prefetch_distance + 1; i < j; i += 1)
+    {
+        sa_sint_t v = cache[i].symbol, p = cache[i].index; d += (p < 0); cache[i].symbol = buckets[v]++; cache[i].index = (p - 1) | ((sa_sint_t)(buckets[2 + v] != d) << (SAINT_BIT - 1)); buckets[2 + v] = d;
+        if (cache[i].symbol < omp_block_end) { sa_sint_t s = cache[i].symbol, q = (cache[s].index = cache[i].index) & SAINT_MAX; cache[s].symbol = BUCKETS_INDEX4(T[q - 1], T[q - 2] >= T[q - 1]); }
+    }
+
+    return d;
+}
+
+/* Master-only sort phase of the parallel 32s/4k scan.  Entries flagged with a
+   negative symbol (SAINT_MIN from the gather phase) are skipped; for the rest
+   the sequential 4k bucket updates are replayed, the destination position is
+   written into cache[i].symbol, and in-block destinations are chased
+   immediately just as in the 6k variant.  Returns the updated d. */
+static sa_sint_t libsais16_partial_sorting_scan_left_to_right_32s_4k_block_sort(const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+    const fast_sint_t prefetch_distance = 32;
+
+    sa_sint_t * RESTRICT induction_bucket = &buckets[2 * k];
+    sa_sint_t * RESTRICT distinct_names = &buckets[0 * k];
+
+    fast_sint_t i, j, omp_block_end = omp_block_start + omp_block_size;
+    for (i = omp_block_start, j = omp_block_end - prefetch_distance - 1; i < j; i += 2)
+    {
+        libsais16_prefetchw(&cache[i + 2 * prefetch_distance]);
+
+        /* symbol >> 1 recovers the induction-bucket slot from the packed
+           BUCKETS_INDEX2 value; negative symbols prefetch NULL (skipped). */
+        sa_sint_t s0 = cache[i + prefetch_distance + 0].symbol; const sa_sint_t * Is0 = &induction_bucket[s0 >> 1]; libsais16_prefetchw(s0 >= 0 ? Is0 : NULL); const sa_sint_t * Ds0 = &distinct_names[s0]; libsais16_prefetchw(s0 >= 0 ? Ds0 : NULL);
+        sa_sint_t s1 = cache[i + prefetch_distance + 1].symbol; const sa_sint_t * Is1 = &induction_bucket[s1 >> 1]; libsais16_prefetchw(s1 >= 0 ? Is1 : NULL); const sa_sint_t * Ds1 = &distinct_names[s1]; libsais16_prefetchw(s1 >= 0 ? Ds1 : NULL);
+
+        sa_sint_t v0 = cache[i + 0].symbol;
+        if (v0 >= 0)
+        {
+            sa_sint_t p0 = cache[i + 0].index; d += (p0 >> (SUFFIX_GROUP_BIT - 1)); cache[i + 0].symbol = induction_bucket[v0 >> 1]++; cache[i + 0].index = (p0 - 1) | (v0 << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v0] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v0] = d;
+            if (cache[i + 0].symbol < omp_block_end) { sa_sint_t ni = cache[i + 0].symbol, np = cache[i + 0].index; if (np > 0) { cache[ni].index = np; np &= ~SUFFIX_GROUP_MARKER; cache[ni].symbol = BUCKETS_INDEX2(T[np - 1], T[np - 2] < T[np - 1]); np = 0; } cache[i + 0].index = np & SAINT_MAX; }
+        }
+
+        sa_sint_t v1 = cache[i + 1].symbol;
+        if (v1 >= 0)
+        {
+            sa_sint_t p1 = cache[i + 1].index; d += (p1 >> (SUFFIX_GROUP_BIT - 1)); cache[i + 1].symbol = induction_bucket[v1 >> 1]++; cache[i + 1].index = (p1 - 1) | (v1 << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v1] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v1] = d;
+            if (cache[i + 1].symbol < omp_block_end) { sa_sint_t ni = cache[i + 1].symbol, np = cache[i + 1].index; if (np > 0) { cache[ni].index = np; np &= ~SUFFIX_GROUP_MARKER; cache[ni].symbol = BUCKETS_INDEX2(T[np - 1], T[np - 2] < T[np - 1]); np = 0; } cache[i + 1].index = np & SAINT_MAX; }
+        }
+    }
+
+    for (j += prefetch_distance + 1; i < j; i += 1)
+    {
+        sa_sint_t v = cache[i].symbol;
+        if (v >= 0)
+        {
+            sa_sint_t p = cache[i].index; d += (p >> (SUFFIX_GROUP_BIT - 1)); cache[i].symbol = induction_bucket[v >> 1]++; cache[i].index = (p - 1) | (v << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v] = d;
+            if (cache[i].symbol < omp_block_end) { sa_sint_t ni = cache[i].symbol, np = cache[i].index; if (np > 0) { cache[ni].index = np; np &= ~SUFFIX_GROUP_MARKER; cache[ni].symbol = BUCKETS_INDEX2(T[np - 1], T[np - 2] < T[np - 1]); np = 0; } cache[i].index = np & SAINT_MAX; }
+        }
+    }
+
+    return d;
+}
+
+/* Master-only sort phase of the parallel 32s/1k scan: assigns each cached
+   entry its destination position from the single induction bucket and, when
+   the destination lies inside the current block, chases the induced entry
+   immediately (building its stored value and next symbol in place).  Entries
+   with negative symbols (not inducible) are skipped. */
+static void libsais16_partial_sorting_scan_left_to_right_32s_1k_block_sort(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+    const fast_sint_t prefetch_distance = 32;
+
+    fast_sint_t i, j, omp_block_end = omp_block_start + omp_block_size;
+    for (i = omp_block_start, j = omp_block_end - prefetch_distance - 1; i < j; i += 2)
+    {
+        libsais16_prefetchw(&cache[i + 2 * prefetch_distance]);
+
+        sa_sint_t s0 = cache[i + prefetch_distance + 0].symbol; const sa_sint_t * Is0 = &induction_bucket[s0]; libsais16_prefetchw(s0 >= 0 ? Is0 : NULL);
+        sa_sint_t s1 = cache[i + prefetch_distance + 1].symbol; const sa_sint_t * Is1 = &induction_bucket[s1]; libsais16_prefetchw(s1 >= 0 ? Is1 : NULL);
+
+        sa_sint_t v0 = cache[i + 0].symbol;
+        if (v0 >= 0)
+        {
+            cache[i + 0].symbol = induction_bucket[v0]++;
+            if (cache[i + 0].symbol < omp_block_end) { sa_sint_t ni = cache[i + 0].symbol, np = cache[i + 0].index; if (np > 0) { cache[ni].index = (np - 1) | ((sa_sint_t)(T[np - 2] < T[np - 1]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np - 1]; np = 0; } cache[i + 0].index = np & SAINT_MAX; }
+        }
+
+        sa_sint_t v1 = cache[i + 1].symbol;
+        if (v1 >= 0)
+        {
+            cache[i + 1].symbol = induction_bucket[v1]++;
+            if (cache[i + 1].symbol < omp_block_end) { sa_sint_t ni = cache[i + 1].symbol, np = cache[i + 1].index; if (np > 0) { cache[ni].index = (np - 1) | ((sa_sint_t)(T[np - 2] < T[np - 1]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np - 1]; np = 0; } cache[i + 1].index = np & SAINT_MAX; }
+        }
+    }
+
+    for (j += prefetch_distance + 1; i < j; i += 1)
+    {
+        sa_sint_t v = cache[i].symbol;
+        if (v >= 0)
+        {
+            cache[i].symbol = induction_bucket[v]++;
+            if (cache[i].symbol < omp_block_end) { sa_sint_t ni = cache[i].symbol, np = cache[i].index; if (np > 0) { cache[ni].index = (np - 1) | ((sa_sint_t)(T[np - 2] < T[np - 1]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np - 1]; np = 0; } cache[i].index = np & SAINT_MAX; }
+        }
+    }
+}
+
+/* Parallel driver for one block of the 32s/6k scan: gather (all threads) ->
+   sort (master only, does the sequential bucket work over the whole cache) ->
+   place cached results back into SA (all threads).  'cache - block_start'
+   rebases the cache so it can be indexed by absolute SA position.  Falls back
+   to the sequential scan when only one thread runs.  Returns updated d. */
+static sa_sint_t libsais16_partial_sorting_scan_left_to_right_32s_6k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads)
+{
+#if defined(_OPENMP)
+    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384)
+#endif
+    {
+#if defined(_OPENMP)
+        fast_sint_t omp_thread_num = omp_get_thread_num();
+        fast_sint_t omp_num_threads = omp_get_num_threads();
+#else
+        UNUSED(threads); UNUSED(cache);
+
+        fast_sint_t omp_thread_num = 0;
+        fast_sint_t omp_num_threads = 1;
+#endif
+        fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
+        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
+
+        omp_block_start += block_start;
+
+        if (omp_num_threads == 1)
+        {
+            d = libsais16_partial_sorting_scan_left_to_right_32s_6k(T, SA, buckets, d, omp_block_start, omp_block_size);
+        }
+#if defined(_OPENMP)
+        else
+        {
+            {
+                libsais16_partial_sorting_scan_left_to_right_32s_6k_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size);
+            }
+
+            #pragma omp barrier
+
+            #pragma omp master
+            {
+                d = libsais16_partial_sorting_scan_left_to_right_32s_6k_block_sort(T, buckets, d, cache - block_start, block_start, block_size);
+            }
+
+            #pragma omp barrier
+
+            {
+                libsais16_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size);
+            }
+        }
+#endif
+    }
+
+    return d;
+}
+
+/* Parallel driver for one block of the 32s/4k scan; same gather/sort/place
+   shape as the 6k variant, but the final phase uses the compacting placement
+   helper (gathered entries may have been skipped).  Returns updated d. */
+static sa_sint_t libsais16_partial_sorting_scan_left_to_right_32s_4k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads)
+{
+#if defined(_OPENMP)
+    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384)
+#endif
+    {
+#if defined(_OPENMP)
+        fast_sint_t omp_thread_num = omp_get_thread_num();
+        fast_sint_t omp_num_threads = omp_get_num_threads();
+#else
+        UNUSED(threads); UNUSED(cache);
+
+        fast_sint_t omp_thread_num = 0;
+        fast_sint_t omp_num_threads = 1;
+#endif
+        fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
+        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
+
+        omp_block_start += block_start;
+
+        if (omp_num_threads == 1)
+        {
+            d = libsais16_partial_sorting_scan_left_to_right_32s_4k(T, SA, k, buckets, d, omp_block_start, omp_block_size);
+        }
+#if defined(_OPENMP)
+        else
+        {
+            {
+                libsais16_partial_sorting_scan_left_to_right_32s_4k_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size);
+            }
+
+            #pragma omp barrier
+
+            #pragma omp master
+            {
+                d = libsais16_partial_sorting_scan_left_to_right_32s_4k_block_sort(T, k, buckets, d, cache - block_start, block_start, block_size);
+            }
+
+            #pragma omp barrier
+
+            {
+                libsais16_compact_and_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size);
+            }
+        }
+#endif
+    }
+
+    return d;
+}
+
+/* Parallel driver for one block of the 32s/1k scan; same gather/sort/place
+   shape as the 4k variant but without distinct-name tracking, so nothing is
+   returned. */
+static void libsais16_partial_sorting_scan_left_to_right_32s_1k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads)
+{
+#if defined(_OPENMP)
+    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384)
+#endif
+    {
+#if defined(_OPENMP)
+        fast_sint_t omp_thread_num = omp_get_thread_num();
+        fast_sint_t omp_num_threads = omp_get_num_threads();
+#else
+        UNUSED(threads); UNUSED(cache);
+
+        fast_sint_t omp_thread_num = 0;
+        fast_sint_t omp_num_threads = 1;
+#endif
+        fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
+        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
+
+        omp_block_start += block_start;
+
+        if (omp_num_threads == 1)
+        {
+            libsais16_partial_sorting_scan_left_to_right_32s_1k(T, SA, buckets, omp_block_start, omp_block_size);
+        }
+#if defined(_OPENMP)
+        else
+        {
+            {
+                libsais16_partial_sorting_scan_left_to_right_32s_1k_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size);
+            }
+
+            #pragma omp barrier
+
+            #pragma omp master
+            {
+                libsais16_partial_sorting_scan_left_to_right_32s_1k_block_sort(T, buckets, cache - block_start, block_start, block_size);
+            }
+
+            #pragma omp barrier
+
+            {
+                libsais16_compact_and_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size);
+            }
+        }
+#endif
+    }
+}
+
+#endif
+
+/* Top-level 32s/6k left-to-right scan: seeds the induction with suffix n - 1,
+   then processes the left suffixes sequentially or in fixed-size parallel
+   blocks bounded by the combined per-thread cache capacity.  Returns the
+   updated distinct-name counter d. */
+static sa_sint_t libsais16_partial_sorting_scan_left_to_right_32s_6k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t left_suffixes_count, sa_sint_t d, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+    SA[buckets[BUCKETS_INDEX4(T[n - 1], T[n - 2] >= T[n - 1])]++] = (n - 1) | SAINT_MIN;
+    buckets[2 + BUCKETS_INDEX4(T[n - 1], T[n - 2] >= T[n - 1])] = ++d;
+
+    if (threads == 1 || left_suffixes_count < 65536)
+    {
+        d = libsais16_partial_sorting_scan_left_to_right_32s_6k(T, SA, buckets, d, 0, left_suffixes_count);
+    }
+#if defined(_OPENMP)
+    else
+    {
+        fast_sint_t block_start, block_end;
+        for (block_start = 0; block_start < left_suffixes_count; block_start = block_end)
+        {
+            block_end = block_start + (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end > left_suffixes_count) { block_end = left_suffixes_count; }
+
+            d = libsais16_partial_sorting_scan_left_to_right_32s_6k_block_omp(T, SA, buckets, d, thread_state[0].state.cache, block_start, block_end - block_start, threads);
+        }
+    }
+#else
+    UNUSED(thread_state);
+#endif
+
+    return d;
+}
+
+/* Top-level 32s/4k left-to-right scan: seeds suffix n - 1 (marked with
+   SUFFIX_GROUP_MARKER and a fresh distinct name), then scans the whole array
+   sequentially or in cache-capacity-sized parallel blocks.  Returns the
+   updated distinct-name counter d. */
+static sa_sint_t libsais16_partial_sorting_scan_left_to_right_32s_4k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+    sa_sint_t * RESTRICT induction_bucket = &buckets[2 * k];
+    sa_sint_t * RESTRICT distinct_names = &buckets[0 * k];
+
+    SA[induction_bucket[T[n - 1]]++] = (n - 1) | ((sa_sint_t)(T[n - 2] < T[n - 1]) << (SAINT_BIT - 1)) | SUFFIX_GROUP_MARKER;
+    distinct_names[BUCKETS_INDEX2(T[n - 1], T[n - 2] < T[n - 1])] = ++d;
+
+    if (threads == 1 || n < 65536)
+    {
+        d = libsais16_partial_sorting_scan_left_to_right_32s_4k(T, SA, k, buckets, d, 0, n);
+    }
+#if defined(_OPENMP)
+    else
+    {
+        fast_sint_t block_start, block_end;
+        for (block_start = 0; block_start < n; block_start = block_end)
+        {
+            block_end = block_start + (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end > n) { block_end = n; }
+
+            d = libsais16_partial_sorting_scan_left_to_right_32s_4k_block_omp(T, SA, k, buckets, d, thread_state[0].state.cache, block_start, block_end - block_start, threads);
+        }
+    }
+#else
+    UNUSED(thread_state);
+#endif
+
+    return d;
+}
+
+/* Top-level 32s/1k left-to-right scan: seeds suffix n - 1 (with the
+   T[n-2] < T[n-1] flag in the sign bit), then scans sequentially or in
+   cache-capacity-sized parallel blocks.  No distinct-name tracking in this
+   variant. */
+static void libsais16_partial_sorting_scan_left_to_right_32s_1k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+    SA[buckets[T[n - 1]]++] = (n - 1) | ((sa_sint_t)(T[n - 2] < T[n - 1]) << (SAINT_BIT - 1));
+
+    if (threads == 1 || n < 65536)
+    {
+        libsais16_partial_sorting_scan_left_to_right_32s_1k(T, SA, buckets, 0, n);
+    }
+#if defined(_OPENMP)
+    else
+    {
+        fast_sint_t block_start, block_end;
+        for (block_start = 0; block_start < n; block_start = block_end)
+        {
+            block_end = block_start + (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end > n) { block_end = n; }
+
+            libsais16_partial_sorting_scan_left_to_right_32s_1k_block_omp(T, SA, buckets, thread_state[0].state.cache, block_start, block_end - block_start, threads);
+        }
+    }
+#else
+    UNUSED(thread_state);
+#endif
+}
+
<doc_update>
+static void libsais16_partial_sorting_shift_markers_16u_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, const sa_sint_t * RESTRICT buckets, sa_sint_t threads) /* Shifts the high-bit group markers one position within each symbol's bucket range, scanning right to left; buckets are independent so symbols can be processed in parallel. */
+{
+    const fast_sint_t prefetch_distance = 32;
+
+    const sa_sint_t * RESTRICT temp_bucket = &buckets[4 * ALPHABET_SIZE]; /* bucket end positions live in the temp copy at offset 4*ALPHABET_SIZE */
+
+    fast_sint_t c;
+
+#if defined(_OPENMP)
+    #pragma omp parallel for schedule(static, 1) num_threads(threads) if(threads > 1 && n >= 65536)
+#else
+    UNUSED(threads); UNUSED(n);
+#endif
+    for (c = BUCKETS_INDEX2(ALPHABET_SIZE - 1, 0); c >= BUCKETS_INDEX2(1, 0); c -= BUCKETS_INDEX2(1, 0)) /* one iteration per symbol, highest symbol first */
+    {
+        fast_sint_t i, j; sa_sint_t s = SAINT_MIN; /* s is the marker bit being carried one slot to the right-to-left neighbor */
+        for (i = (fast_sint_t)temp_bucket[c] - 1, j = (fast_sint_t)buckets[c - BUCKETS_INDEX2(1, 0)] + 3; i >= j; i -= 4) /* 4-way unrolled backward walk over this symbol's SA range */
+        {
+            libsais16_prefetchw(&SA[i - prefetch_distance]);
+
+            sa_sint_t p0 = SA[i - 0], q0 = (p0 & SAINT_MIN) ^ s; s = s ^ q0; SA[i - 0] = p0 ^ q0; /* XOR trick: swap the entry's high bit with the carried bit in branch-free form */
+            sa_sint_t p1 = SA[i - 1], q1 = (p1 & SAINT_MIN) ^ s; s = s ^ q1; SA[i - 1] = p1 ^ q1;
+            sa_sint_t p2 = SA[i - 2], q2 = (p2 & SAINT_MIN) ^ s; s = s ^ q2; SA[i - 2] = p2 ^ q2;
+            sa_sint_t p3 = SA[i - 3], q3 = (p3 & SAINT_MIN) ^ s; s = s ^ q3; SA[i - 3] = p3 ^ q3;
+        }
+
+        for (j -= 3; i >= j; i -= 1) /* scalar tail of the unrolled loop */
+        {
+            sa_sint_t p = SA[i], q = (p & SAINT_MIN) ^ s; s = s ^ q; SA[i] = p ^ q;
+        }
+    }
+}
+
+static void libsais16_partial_sorting_shift_markers_32s_6k_omp(sa_sint_t * RESTRICT SA, sa_sint_t k, const sa_sint_t * RESTRICT buckets, sa_sint_t threads) /* 32-bit/6k-layout variant of the marker shift: same right-to-left XOR carry per symbol bucket, parallel over symbols. */
+{
+    const fast_sint_t prefetch_distance = 32;
+
+    const sa_sint_t * RESTRICT temp_bucket = &buckets[4 * k]; /* temp bucket copy starts at offset 4k in the 6k layout */
+
+    fast_sint_t c;
+
+#if defined(_OPENMP)
+    #pragma omp parallel for schedule(static, 1) num_threads(threads) if(threads > 1 && k >= 65536)
+#else
+    UNUSED(threads);
+#endif
+    for (c = (fast_sint_t)k - 1; c >= 1; c -= 1) /* one iteration per symbol (buckets are disjoint, so iterations are independent) */
+    {
+        fast_sint_t i, j; sa_sint_t s = SAINT_MIN; /* carried marker bit */
+        for (i = (fast_sint_t)buckets[BUCKETS_INDEX4(c, 0)] - 1, j = (fast_sint_t)temp_bucket[BUCKETS_INDEX2(c - 1, 0)] + 3; i >= j; i -= 4) /* 4-way unrolled backward walk over symbol c's range */
+        {
+            libsais16_prefetchw(&SA[i - prefetch_distance]);
+
+            sa_sint_t p0 = SA[i - 0], q0 = (p0 & SAINT_MIN) ^ s; s = s ^ q0; SA[i - 0] = p0 ^ q0; /* branch-free swap of the entry's high bit with the carry */
+            sa_sint_t p1 = SA[i - 1], q1 = (p1 & SAINT_MIN) ^ s; s = s ^ q1; SA[i - 1] = p1 ^ q1;
+            sa_sint_t p2 = SA[i - 2], q2 = (p2 & SAINT_MIN) ^ s; s = s ^ q2; SA[i - 2] = p2 ^ q2;
+            sa_sint_t p3 = SA[i - 3], q3 = (p3 & SAINT_MIN) ^ s; s = s ^ q3; SA[i - 3] = p3 ^ q3;
+        }
+
+        for (j -= 3; i >= j; i -= 1) /* scalar tail */
+        {
+            sa_sint_t p = SA[i], q = (p & SAINT_MIN) ^ s; s = s ^ q; SA[i] = p ^ q;
+        }
+    }
+}
+
+static void libsais16_partial_sorting_shift_markers_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t n) /* Serial right-to-left shift of the SUFFIX_GROUP_MARKER bit across the whole SA; the carry only moves through positive (occupied) entries. */
+{
+    const fast_sint_t prefetch_distance = 32;
+
+    fast_sint_t i; sa_sint_t s = SUFFIX_GROUP_MARKER; /* carry starts set so the rightmost positive entry receives a marker */
+    for (i = (fast_sint_t)n - 1; i >= 3; i -= 4) /* 4-way unrolled backward pass */
+    {
+        libsais16_prefetchw(&SA[i - prefetch_distance]);
+
+        sa_sint_t p0 = SA[i - 0], q0 = ((p0 & SUFFIX_GROUP_MARKER) ^ s) & ((sa_sint_t)(p0 > 0) << ((SUFFIX_GROUP_BIT - 1))); s = s ^ q0; SA[i - 0] = p0 ^ q0; /* swap marker bit with carry, masked so non-positive entries are left untouched */
+        sa_sint_t p1 = SA[i - 1], q1 = ((p1 & SUFFIX_GROUP_MARKER) ^ s) & ((sa_sint_t)(p1 > 0) << ((SUFFIX_GROUP_BIT - 1))); s = s ^ q1; SA[i - 1] = p1 ^ q1;
+        sa_sint_t p2 = SA[i - 2], q2 = ((p2 & SUFFIX_GROUP_MARKER) ^ s) & ((sa_sint_t)(p2 > 0) << ((SUFFIX_GROUP_BIT - 1))); s = s ^ q2; SA[i - 2] = p2 ^ q2;
+        sa_sint_t p3 = SA[i - 3], q3 = ((p3 & SUFFIX_GROUP_MARKER) ^ s) & ((sa_sint_t)(p3 > 0) << ((SUFFIX_GROUP_BIT - 1))); s = s ^ q3; SA[i - 3] = p3 ^ q3;
+    }
+
+    for (; i >= 0; i -= 1) /* scalar tail */
+    {
+        sa_sint_t p = SA[i], q = ((p & SUFFIX_GROUP_MARKER) ^ s) & ((sa_sint_t)(p > 0) << ((SUFFIX_GROUP_BIT - 1))); s = s ^ q; SA[i] = p ^ q;
+    }
+}
+
+static void libsais16_partial_sorting_shift_buckets_32s_6k(sa_sint_t k, sa_sint_t * RESTRICT buckets) /* Copies the 2-entry-per-symbol temp bucket counts (at offset 4k) back into the first two slots of each symbol's 4-entry group. */
+{
+    sa_sint_t * RESTRICT temp_bucket = &buckets[4 * k];
+
+    fast_sint_t i;
+    for (i = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX2(1, 0)) /* iterate per symbol in BUCKETS_INDEX2 strides */
+    {
+        buckets[2 * i + BUCKETS_INDEX4(0, 0)] = temp_bucket[i + BUCKETS_INDEX2(0, 0)]; /* 2*i maps a 2-wide index into the 4-wide layout */
+        buckets[2 * i + BUCKETS_INDEX4(0, 1)] = temp_bucket[i + BUCKETS_INDEX2(0, 1)];
+    }
+}
+
+static sa_sint_t libsais16_partial_sorting_scan_right_to_left_16u(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t omp_block_start, fast_sint_t omp_block_size) /* Serial right-to-left induction over one SA range (16-bit text); induces predecessor suffixes into their buckets and returns the updated distinct-name counter d. */
+{
+    const fast_sint_t prefetch_distance = 32;
+
+    sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE];
+    sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE];
+
+    fast_sint_t i, j;
+    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2) /* 2-way unrolled backward walk; j leaves room for the prefetch window */
+    {
+        libsais16_prefetch(&SA[i - 2 * prefetch_distance]);
+
+        libsais16_prefetch(&T[SA[i - prefetch_distance - 0] & SAINT_MAX] - 1); /* warm the two text symbols each upcoming entry will read */
+        libsais16_prefetch(&T[SA[i - prefetch_distance - 0] & SAINT_MAX] - 2);
+        libsais16_prefetch(&T[SA[i - prefetch_distance - 1] & SAINT_MAX] - 1);
+        libsais16_prefetch(&T[SA[i - prefetch_distance - 1] & SAINT_MAX] - 2);
+
+        sa_sint_t p0 = SA[i - 0]; d += (p0 < 0); p0 &= SAINT_MAX; sa_sint_t v0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] > T[p0 - 1]); /* negative entry = start of a new group, so bump d; low bits are the suffix index */
+        SA[--induction_bucket[v0]] = (p0 - 1) | ((sa_sint_t)(distinct_names[v0] != d) << (SAINT_BIT - 1)); distinct_names[v0] = d; /* place predecessor; mark it when this bucket last saw a different group */
+
+        sa_sint_t p1 = SA[i - 1]; d += (p1 < 0); p1 &= SAINT_MAX; sa_sint_t v1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] > T[p1 - 1]);
+        SA[--induction_bucket[v1]] = (p1 - 1) | ((sa_sint_t)(distinct_names[v1] != d) << (SAINT_BIT - 1)); distinct_names[v1] = d;
+    }
+
+    for (j -= prefetch_distance + 1; i >= j; i -= 1) /* scalar tail without prefetch */
+    {
+        sa_sint_t p = SA[i]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]);
+        SA[--induction_bucket[v]] = (p - 1) | ((sa_sint_t)(distinct_names[v] != d) << (SAINT_BIT - 1)); distinct_names[v] = d;
+    }
+
+    return d;
+}
+
+#if defined(_OPENMP)
+
+static void libsais16_partial_sorting_scan_right_to_left_16u_block_prepare(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size, LIBSAIS_THREAD_STATE * RESTRICT state) /* Per-thread first pass: records each entry's target bucket in the cache and counts bucket usage into thread-private buckets; no SA writes yet. */
+{
+    const fast_sint_t prefetch_distance = 32;
+
+    sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE];
+    sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE];
+
+    memset(buckets, 0, 4 * ALPHABET_SIZE * sizeof(sa_sint_t)); /* thread-private buckets start from zero; merged by the master later */
+
+    fast_sint_t i, j, count = 0; sa_sint_t d = 1; /* d is a local group counter, rebased against the global d during the merge */
+    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2) /* 2-way unrolled backward walk */
+    {
+        libsais16_prefetch(&SA[i - 2 * prefetch_distance]);
+
+        libsais16_prefetch(&T[SA[i - prefetch_distance - 0] & SAINT_MAX] - 1);
+        libsais16_prefetch(&T[SA[i - prefetch_distance - 0] & SAINT_MAX] - 2);
+        libsais16_prefetch(&T[SA[i - prefetch_distance - 1] & SAINT_MAX] - 1);
+        libsais16_prefetch(&T[SA[i - prefetch_distance - 1] & SAINT_MAX] - 2);
+
+        sa_sint_t p0 = cache[count].index = SA[i - 0]; d += (p0 < 0); p0 &= SAINT_MAX; sa_sint_t v0 = cache[count++].symbol = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] > T[p0 - 1]); induction_bucket[v0]++; distinct_names[v0] = d; /* remember entry and bucket, count the bucket hit */
+        sa_sint_t p1 = cache[count].index = SA[i - 1]; d += (p1 < 0); p1 &= SAINT_MAX; sa_sint_t v1 = cache[count++].symbol = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] > T[p1 - 1]); induction_bucket[v1]++; distinct_names[v1] = d;
+    }
+
+    for (j -= prefetch_distance + 1; i >= j; i -= 1) /* scalar tail */
+    {
+        sa_sint_t p = cache[count].index = SA[i]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = cache[count++].symbol = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]); induction_bucket[v]++; distinct_names[v] = d;
+    }
+
+    state[0].state.position = (fast_sint_t)d - 1; /* hand the local group count and cached-entry count to the merge step */
+    state[0].state.count = count;
+}
+
+static void libsais16_partial_sorting_scan_right_to_left_16u_block_place(sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count, sa_sint_t d) /* Per-thread second pass: replays the cached entries and writes them into SA using the merged bucket positions. */
+{
+    const fast_sint_t prefetch_distance = 32;
+
+    sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE];
+    sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE];
+
+    fast_sint_t i, j;
+    for (i = 0, j = count - 1; i < j; i += 2) /* 2-way unrolled replay in the same order the cache was filled */
+    {
+        libsais16_prefetch(&cache[i + prefetch_distance]);
+
+        sa_sint_t p0 = cache[i + 0].index; d += (p0 < 0); sa_sint_t v0 = cache[i + 0].symbol;
+        SA[--induction_bucket[v0]] = (p0 - 1) | ((sa_sint_t)(distinct_names[v0] != d) << (SAINT_BIT - 1)); distinct_names[v0] = d; /* same placement rule as the serial scan, bucket already precomputed */
+
+        sa_sint_t p1 = cache[i + 1].index; d += (p1 < 0); sa_sint_t v1 = cache[i + 1].symbol;
+        SA[--induction_bucket[v1]] = (p1 - 1) | ((sa_sint_t)(distinct_names[v1] != d) << (SAINT_BIT - 1)); distinct_names[v1] = d;
+    }
+
+    for (j += 1; i < j; i += 1) /* scalar tail */
+    {
+        sa_sint_t p = cache[i].index; d += (p < 0); sa_sint_t v = cache[i].symbol;
+        SA[--induction_bucket[v]] = (p - 1) | ((sa_sint_t)(distinct_names[v] != d) << (SAINT_BIT - 1)); distinct_names[v] = d;
+    }
+}
+
+static sa_sint_t libsais16_partial_sorting_scan_right_to_left_16u_block_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) /* Parallel right-to-left induction over one block: per-thread prepare, single-threaded bucket merge, then per-thread place. Returns the updated d. */
+{
+#if defined(_OPENMP)
+    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * ALPHABET_SIZE && omp_get_dynamic() == 0) /* parallelism is skipped when dynamic teams are on, since the barrier logic needs a fixed team size */
+#endif
+    {
+#if defined(_OPENMP)
+        fast_sint_t omp_thread_num = omp_get_thread_num();
+        fast_sint_t omp_num_threads = omp_get_num_threads();
+#else
+        UNUSED(threads); UNUSED(thread_state);
+
+        fast_sint_t omp_thread_num = 0;
+        fast_sint_t omp_num_threads = 1;
+#endif
+        fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); /* equal 16-aligned slices; last thread takes the remainder */
+        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
+
+        omp_block_start += block_start;
+
+        if (omp_num_threads == 1) /* degenerate team: fall back to the serial scan */
+        {
+            d = libsais16_partial_sorting_scan_right_to_left_16u(T, SA, buckets, d, omp_block_start, omp_block_size);
+        }
+#if defined(_OPENMP)
+        else
+        {
+            { /* phase 1 (all threads): gather entries and count into thread-private buckets */
+                libsais16_partial_sorting_scan_right_to_left_16u_block_prepare(T, SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size, &thread_state[omp_thread_num]);
+            }
+
+            #pragma omp barrier
+
+            #pragma omp master
+            { /* phase 2 (master only): fold per-thread counts into the global buckets and rebase each thread's local d */
+                sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE];
+                sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE];
+
+                fast_sint_t t;
+                for (t = omp_num_threads - 1; t >= 0; --t) /* highest thread first: its slice is rightmost, so it is induced first in a right-to-left scan */
+                {
+                    sa_sint_t * RESTRICT temp_induction_bucket = &thread_state[t].state.buckets[0 * ALPHABET_SIZE];
+                    sa_sint_t * RESTRICT temp_distinct_names = &thread_state[t].state.buckets[2 * ALPHABET_SIZE];
+
+                    fast_sint_t c;
+                    for (c = 0; c < 2 * ALPHABET_SIZE; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_induction_bucket[c]; induction_bucket[c] = A - B; temp_induction_bucket[c] = A; } /* reserve this thread's slots: give it the current bucket tops, move the global tops down by its counts */
+
+                    for (d -= 1, c = 0; c < 2 * ALPHABET_SIZE; c += 1) { sa_sint_t A = distinct_names[c], B = temp_distinct_names[c], D = B + d; distinct_names[c] = B > 0 ? D : A; temp_distinct_names[c] = A; } /* rebase the thread's local group names onto the running global d */
+                    d += 1 + (sa_sint_t)thread_state[t].state.position; thread_state[t].state.position = (fast_sint_t)d - thread_state[t].state.position; /* advance d by the thread's group count and store its starting d */
+                }
+            }
+
+            #pragma omp barrier
+
+            { /* phase 3 (all threads): replay the cached entries into the reserved SA slots */
+                libsais16_partial_sorting_scan_right_to_left_16u_block_place(SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count, (sa_sint_t)thread_state[omp_thread_num].state.position);
+            }
+        }
+#endif
+    }
+
+    return d;
+}
+
+#endif
+
+static void libsais16_partial_sorting_scan_right_to_left_16u_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count, sa_sint_t d, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) /* Driver for the right-to-left induction scan (16-bit text): serial for small ranges, otherwise splits the range at zero (empty) SA entries into parallel blocks. */
+{
+    fast_sint_t scan_start = (fast_sint_t)left_suffixes_count + 1;
+    fast_sint_t scan_end = (fast_sint_t)n - (fast_sint_t)first_lms_suffix;
+
+    if (threads == 1 || (scan_end - scan_start) < 65536) /* small scans run serially */
+    {
+        libsais16_partial_sorting_scan_right_to_left_16u(T, SA, buckets, d, scan_start, scan_end - scan_start);
+    }
+#if defined(_OPENMP)
+    else
+    {
+        sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE];
+        sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE];
+
+        fast_sint_t block_start;
+        for (block_start = scan_end - 1; block_start >= scan_start; ) /* walk right to left, carving off one block per iteration */
+        {
+            if (SA[block_start] == 0) /* empty slot: nothing to induce here */
+            {
+                block_start--;
+            }
+            else
+            {
+                fast_sint_t block_max_end = block_start - ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads); if (block_max_end < scan_start) { block_max_end = scan_start - 1; } /* cap the block so all entries fit the per-thread caches */
+                fast_sint_t block_end = block_start - 1; while (block_end > block_max_end && SA[block_end] != 0) { block_end--; } /* extend down to the next empty slot; a block must not contain entries induced by itself */
+                fast_sint_t block_size = block_start - block_end;
+
+                if (block_size < 32) /* tiny block: induce inline, parallel setup is not worth it */
+                {
+                    for (; block_start > block_end; block_start -= 1)
+                    {
+                        sa_sint_t p = SA[block_start]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]);
+                        SA[--induction_bucket[v]] = (p - 1) | ((sa_sint_t)(distinct_names[v] != d) << (SAINT_BIT - 1)); distinct_names[v] = d;
+                    }
+                }
+                else
+                {
+                    d = libsais16_partial_sorting_scan_right_to_left_16u_block_omp(T, SA, buckets, d, block_end + 1, block_size, threads, thread_state);
+                    block_start = block_end;
+                }
+            }
+        }
+    }
+#else
+    UNUSED(thread_state);
+#endif
+}
+
+static sa_sint_t libsais16_partial_sorting_scan_right_to_left_32s_6k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t omp_block_start, fast_sint_t omp_block_size) /* Serial right-to-left induction over one SA range (32-bit text, 6k bucket layout); returns the updated distinct-name counter d. */
+{
+    const fast_sint_t prefetch_distance = 32;
+
+    fast_sint_t i, j;
+    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 1; i >= j; i -= 2) /* 2-way unrolled backward walk with a two-stage prefetch pipeline */
+    {
+        libsais16_prefetch(&SA[i - 3 * prefetch_distance]);
+
+        libsais16_prefetch(&T[SA[i - 2 * prefetch_distance - 0] & SAINT_MAX] - 1); /* stage 1: warm the text symbols */
+        libsais16_prefetch(&T[SA[i - 2 * prefetch_distance - 0] & SAINT_MAX] - 2);
+        libsais16_prefetch(&T[SA[i - 2 * prefetch_distance - 1] & SAINT_MAX] - 1);
+        libsais16_prefetch(&T[SA[i - 2 * prefetch_distance - 1] & SAINT_MAX] - 2);
+
+        sa_sint_t p0 = SA[i - prefetch_distance - 0] & SAINT_MAX; sa_sint_t v0 = BUCKETS_INDEX4(T[p0 - (p0 > 0)], 0); libsais16_prefetchw(&buckets[v0]); /* stage 2: warm the bucket cells (p0>0 guard keeps the index in range for empty slots) */
+        sa_sint_t p1 = SA[i - prefetch_distance - 1] & SAINT_MAX; sa_sint_t v1 = BUCKETS_INDEX4(T[p1 - (p1 > 0)], 0); libsais16_prefetchw(&buckets[v1]);
+
+        sa_sint_t p2 = SA[i - 0]; d += (p2 < 0); p2 &= SAINT_MAX; sa_sint_t v2 = BUCKETS_INDEX4(T[p2 - 1], T[p2 - 2] > T[p2 - 1]); /* negative entry starts a new group */
+        SA[--buckets[v2]] = (p2 - 1) | ((sa_sint_t)(buckets[2 + v2] != d) << (SAINT_BIT - 1)); buckets[2 + v2] = d; /* slot 2 of the 4-wide group holds the last group name seen for this bucket */
+
+        sa_sint_t p3 = SA[i - 1]; d += (p3 < 0); p3 &= SAINT_MAX; sa_sint_t v3 = BUCKETS_INDEX4(T[p3 - 1], T[p3 - 2] > T[p3 - 1]);
+        SA[--buckets[v3]] = (p3 - 1) | ((sa_sint_t)(buckets[2 + v3] != d) << (SAINT_BIT - 1)); buckets[2 + v3] = d;
+    }
+
+    for (j -= 2 * prefetch_distance + 1; i >= j; i -= 1) /* scalar tail */
+    {
+        sa_sint_t p = SA[i]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = BUCKETS_INDEX4(T[p - 1], T[p - 2] > T[p - 1]);
+        SA[--buckets[v]] = (p - 1) | ((sa_sint_t)(buckets[2 + v] != d) << (SAINT_BIT - 1)); buckets[2 + v] = d;
+    }
+
+    return d;
+}
+
+static sa_sint_t libsais16_partial_sorting_scan_right_to_left_32s_4k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t omp_block_start, fast_sint_t omp_block_size) /* Serial right-to-left induction (32-bit text, 4k layout): consumes positive entries, zeroes them, and induces each predecessor with type and group-boundary bits packed into the value. */
+{
+    const fast_sint_t prefetch_distance = 32;
+
+    sa_sint_t * RESTRICT induction_bucket = &buckets[3 * k];
+    sa_sint_t * RESTRICT distinct_names = &buckets[0 * k];
+
+    fast_sint_t i, j;
+    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 1; i >= j; i -= 2) /* 2-way unrolled backward walk with a two-stage prefetch pipeline */
+    {
+        libsais16_prefetchw(&SA[i - 3 * prefetch_distance]);
+
+        sa_sint_t s0 = SA[i - 2 * prefetch_distance - 0]; const sa_sint_t * Ts0 = &T[s0 & ~SUFFIX_GROUP_MARKER] - 1; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); /* NULL-guarded prefetch: only positive entries are live */
+        sa_sint_t s1 = SA[i - 2 * prefetch_distance - 1]; const sa_sint_t * Ts1 = &T[s1 & ~SUFFIX_GROUP_MARKER] - 1; libsais16_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16_prefetch(s1 > 0 ? Ts1 : NULL);
+        sa_sint_t s2 = SA[i - 1 * prefetch_distance - 0]; if (s2 > 0) { const fast_sint_t Ts2 = T[(s2 & ~SUFFIX_GROUP_MARKER) - 1]; libsais16_prefetchw(&induction_bucket[Ts2]); libsais16_prefetchw(&distinct_names[BUCKETS_INDEX2(Ts2, 0)]); } /* stage 2: warm the bucket cells */
+        sa_sint_t s3 = SA[i - 1 * prefetch_distance - 1]; if (s3 > 0) { const fast_sint_t Ts3 = T[(s3 & ~SUFFIX_GROUP_MARKER) - 1]; libsais16_prefetchw(&induction_bucket[Ts3]); libsais16_prefetchw(&distinct_names[BUCKETS_INDEX2(Ts3, 0)]); }
+
+        sa_sint_t p0 = SA[i - 0];
+        if (p0 > 0)
+        {
+            SA[i - 0] = 0; d += (p0 >> (SUFFIX_GROUP_BIT - 1)); p0 &= ~SUFFIX_GROUP_MARKER; sa_sint_t v0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] > T[p0 - 1]); /* clear the slot; the group-marker bit bumps d */
+            SA[--induction_bucket[T[p0 - 1]]] = (p0 - 1) | ((sa_sint_t)(T[p0 - 2] > T[p0 - 1]) << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v0] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v0] = d; /* pack type bit (top) and group-boundary bit into the placed entry */
+        }
+
+        sa_sint_t p1 = SA[i - 1];
+        if (p1 > 0)
+        {
+            SA[i - 1] = 0; d += (p1 >> (SUFFIX_GROUP_BIT - 1)); p1 &= ~SUFFIX_GROUP_MARKER; sa_sint_t v1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] > T[p1 - 1]);
+            SA[--induction_bucket[T[p1 - 1]]] = (p1 - 1) | ((sa_sint_t)(T[p1 - 2] > T[p1 - 1]) << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v1] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v1] = d;
+        }
+    }
+
+    for (j -= 2 * prefetch_distance + 1; i >= j; i -= 1) /* scalar tail */
+    {
+        sa_sint_t p = SA[i];
+        if (p > 0)
+        {
+            SA[i] = 0; d += (p >> (SUFFIX_GROUP_BIT - 1)); p &= ~SUFFIX_GROUP_MARKER; sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]);
+            SA[--induction_bucket[T[p - 1]]] = (p - 1) | ((sa_sint_t)(T[p - 2] > T[p - 1]) << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v] = d;
+        }
+    }
+
+    return d;
+}
+
+static void libsais16_partial_sorting_scan_right_to_left_32s_1k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size) /* Serial right-to-left induction in the minimal 1k layout: consumes positive entries, zeroes them, and induces each predecessor with only the type bit packed in. */
+{
+    const fast_sint_t prefetch_distance = 32;
+
+    fast_sint_t i, j;
+    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 1; i >= j; i -= 2) /* 2-way unrolled backward walk with a two-stage prefetch pipeline */
+    {
+        libsais16_prefetchw(&SA[i - 3 * prefetch_distance]);
+
+        sa_sint_t s0 = SA[i - 2 * prefetch_distance - 0]; const sa_sint_t * Ts0 = &T[s0] - 1; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); /* NULL-guarded: only positive entries are live */
+        sa_sint_t s1 = SA[i - 2 * prefetch_distance - 1]; const sa_sint_t * Ts1 = &T[s1] - 1; libsais16_prefetch(s1 > 0 ? Ts1 : NULL);
+        sa_sint_t s2 = SA[i - 1 * prefetch_distance - 0]; if (s2 > 0) { libsais16_prefetchw(&induction_bucket[T[s2 - 1]]); libsais16_prefetch(&T[s2] - 2); } /* stage 2: warm the bucket cell and second text symbol */
+        sa_sint_t s3 = SA[i - 1 * prefetch_distance - 1]; if (s3 > 0) { libsais16_prefetchw(&induction_bucket[T[s3 - 1]]); libsais16_prefetch(&T[s3] - 2); }
+
+        sa_sint_t p0 = SA[i - 0]; if (p0 > 0) { SA[i - 0] = 0; SA[--induction_bucket[T[p0 - 1]]] = (p0 - 1) | ((sa_sint_t)(T[p0 - 2] > T[p0 - 1]) << (SAINT_BIT - 1)); } /* place predecessor; top bit records its type */
+        sa_sint_t p1 = SA[i - 1]; if (p1 > 0) { SA[i - 1] = 0; SA[--induction_bucket[T[p1 - 1]]] = (p1 - 1) | ((sa_sint_t)(T[p1 - 2] > T[p1 - 1]) << (SAINT_BIT - 1)); }
+    }
+
+    for (j -= 2 * prefetch_distance + 1; i >= j; i -= 1) /* scalar tail */
+    {
+        sa_sint_t p = SA[i]; if (p > 0) { SA[i] = 0; SA[--induction_bucket[T[p - 1]]] = (p - 1) | ((sa_sint_t)(T[p - 2] > T[p - 1]) << (SAINT_BIT - 1)); }
+    }
+}
+
+#if defined(_OPENMP)
+
+static void libsais16_partial_sorting_scan_right_to_left_32s_6k_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) /* Parallel phase 1 (6k layout): copies each SA entry and its precomputed bucket index into the cache; SA is not modified. */
+{
+    const fast_sint_t prefetch_distance = 32;
+
+    fast_sint_t i, j;
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) /* forward 2-way unrolled gather */
+    {
+        libsais16_prefetch(&SA[i + 2 * prefetch_distance]);
+
+        libsais16_prefetch(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 1); /* warm the text symbols needed to classify upcoming entries */
+        libsais16_prefetch(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 2);
+        libsais16_prefetch(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 1);
+        libsais16_prefetch(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 2);
+
+        libsais16_prefetchw(&cache[i + prefetch_distance]);
+
+        sa_sint_t p0 = cache[i + 0].index = SA[i + 0]; sa_sint_t symbol0 = 0; p0 &= SAINT_MAX; if (p0 != 0) { symbol0 = BUCKETS_INDEX4(T[p0 - 1], T[p0 - 2] > T[p0 - 1]); } cache[i + 0].symbol = symbol0; /* empty slots get symbol 0 and are skipped later */
+        sa_sint_t p1 = cache[i + 1].index = SA[i + 1]; sa_sint_t symbol1 = 0; p1 &= SAINT_MAX; if (p1 != 0) { symbol1 = BUCKETS_INDEX4(T[p1 - 1], T[p1 - 2] > T[p1 - 1]); } cache[i + 1].symbol = symbol1;
+    }
+
+    for (j += prefetch_distance + 1; i < j; i += 1) /* scalar tail */
+    {
+        sa_sint_t p = cache[i].index = SA[i]; sa_sint_t symbol = 0; p &= SAINT_MAX; if (p != 0) { symbol = BUCKETS_INDEX4(T[p - 1], T[p - 2] > T[p - 1]); } cache[i].symbol = symbol;
+    }
+}
+
+static void libsais16_partial_sorting_scan_right_to_left_32s_4k_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) /* Parallel phase 1 (4k layout): moves positive SA entries into the cache (zeroing SA) along with their bucket index; non-positive slots are tagged SAINT_MIN. */
+{
+    const fast_sint_t prefetch_distance = 32;
+
+    fast_sint_t i, j;
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) /* forward 2-way unrolled gather */
+    {
+        libsais16_prefetchw(&SA[i + 2 * prefetch_distance]);
+
+        sa_sint_t s0 = SA[i + prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0 & ~SUFFIX_GROUP_MARKER] - 1; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); /* NULL-guarded prefetch of the two symbols each live entry reads */
+        sa_sint_t s1 = SA[i + prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1 & ~SUFFIX_GROUP_MARKER] - 1; libsais16_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16_prefetch(s1 > 0 ? Ts1 : NULL);
+
+        libsais16_prefetchw(&cache[i + prefetch_distance]);
+
+        sa_sint_t symbol0 = SAINT_MIN, p0 = SA[i + 0]; if (p0 > 0) { SA[i + 0] = 0; cache[i + 0].index = p0; p0 &= ~SUFFIX_GROUP_MARKER; symbol0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] > T[p0 - 1]); } cache[i + 0].symbol = symbol0; /* symbol stays SAINT_MIN for dead slots; the sort phase skips those */
+        sa_sint_t symbol1 = SAINT_MIN, p1 = SA[i + 1]; if (p1 > 0) { SA[i + 1] = 0; cache[i + 1].index = p1; p1 &= ~SUFFIX_GROUP_MARKER; symbol1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] > T[p1 - 1]); } cache[i + 1].symbol = symbol1;
+    }
+
+    for (j += prefetch_distance + 1; i < j; i += 1) /* scalar tail */
+    {
+        sa_sint_t symbol = SAINT_MIN, p = SA[i]; if (p > 0) { SA[i] = 0; cache[i].index = p; p &= ~SUFFIX_GROUP_MARKER; symbol = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]); } cache[i].symbol = symbol;
+    }
+}
+
+static void libsais16_partial_sorting_scan_right_to_left_32s_1k_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) /* Parallel phase 1 (1k layout): moves positive SA entries into the cache (zeroing SA), storing the already-formed induced value and the target symbol. */
+{
+    const fast_sint_t prefetch_distance = 32;
+
+    fast_sint_t i, j;
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) /* forward 2-way unrolled gather */
+    {
+        libsais16_prefetchw(&SA[i + 2 * prefetch_distance]);
+
+        sa_sint_t s0 = SA[i + prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0] - 1; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); /* NULL-guarded prefetch for live entries only */
+        sa_sint_t s1 = SA[i + prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1] - 1; libsais16_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16_prefetch(s1 > 0 ? Ts1 : NULL);
+
+        libsais16_prefetchw(&cache[i + prefetch_distance]);
+
+        sa_sint_t symbol0 = SAINT_MIN, p0 = SA[i + 0]; if (p0 > 0) { SA[i + 0] = 0; cache[i + 0].index = (p0 - 1) | ((sa_sint_t)(T[p0 - 2] > T[p0 - 1]) << (SAINT_BIT - 1)); symbol0 = T[p0 - 1]; } cache[i + 0].symbol = symbol0; /* index already carries the type bit; symbol picks the bucket */
+        sa_sint_t symbol1 = SAINT_MIN, p1 = SA[i + 1]; if (p1 > 0) { SA[i + 1] = 0; cache[i + 1].index = (p1 - 1) | ((sa_sint_t)(T[p1 - 2] > T[p1 - 1]) << (SAINT_BIT - 1)); symbol1 = T[p1 - 1]; } cache[i + 1].symbol = symbol1;
+    }
+
+    for (j += prefetch_distance + 1; i < j; i += 1) /* scalar tail */
+    {
+        sa_sint_t symbol = SAINT_MIN, p = SA[i]; if (p > 0) { SA[i] = 0; cache[i].index = (p - 1) | ((sa_sint_t)(T[p - 2] > T[p - 1]) << (SAINT_BIT - 1)); symbol = T[p - 1]; } cache[i].symbol = symbol;
+    }
+}
+
+static sa_sint_t libsais16_partial_sorting_scan_right_to_left_32s_6k_block_sort(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) /* Single-threaded phase 2 (6k layout): assigns final SA positions to cached entries right to left; entries induced into positions inside this same block are chained within the cache instead of touching SA. Returns the updated d. */
+{
+    const fast_sint_t prefetch_distance = 32;
+
+    fast_sint_t i, j;
+    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2) /* 2-way unrolled backward pass over the cache */
+    {
+        libsais16_prefetchw(&cache[i - 2 * prefetch_distance]);
+
+        libsais16_prefetchw(&buckets[cache[i - prefetch_distance - 0].symbol]); /* warm the bucket cells the upcoming entries will decrement */
+        libsais16_prefetchw(&buckets[cache[i - prefetch_distance - 1].symbol]);
+
+        sa_sint_t v0 = cache[i - 0].symbol, p0 = cache[i - 0].index; d += (p0 < 0); cache[i - 0].symbol = --buckets[v0]; cache[i - 0].index = (p0 - 1) | ((sa_sint_t)(buckets[2 + v0] != d) << (SAINT_BIT - 1)); buckets[2 + v0] = d; /* symbol now holds the destination SA position, index the value to store there */
+        if (cache[i - 0].symbol >= omp_block_start) { sa_sint_t s = cache[i - 0].symbol, q = (cache[s].index = cache[i - 0].index) & SAINT_MAX; cache[s].symbol = BUCKETS_INDEX4(T[q - 1], T[q - 2] > T[q - 1]); } /* destination falls inside this block: forward the induced entry through the cache so this pass also processes it */
+
+        sa_sint_t v1 = cache[i - 1].symbol, p1 = cache[i - 1].index; d += (p1 < 0); cache[i - 1].symbol = --buckets[v1]; cache[i - 1].index = (p1 - 1) | ((sa_sint_t)(buckets[2 + v1] != d) << (SAINT_BIT - 1)); buckets[2 + v1] = d;
+        if (cache[i - 1].symbol >= omp_block_start) { sa_sint_t s = cache[i - 1].symbol, q = (cache[s].index = cache[i - 1].index) & SAINT_MAX; cache[s].symbol = BUCKETS_INDEX4(T[q - 1], T[q - 2] > T[q - 1]); }
+    }
+
+    for (j -= prefetch_distance + 1; i >= j; i -= 1) /* scalar tail */
+    {
+        sa_sint_t v = cache[i].symbol, p = cache[i].index; d += (p < 0); cache[i].symbol = --buckets[v]; cache[i].index = (p - 1) | ((sa_sint_t)(buckets[2 + v] != d) << (SAINT_BIT - 1)); buckets[2 + v] = d;
+        if (cache[i].symbol >= omp_block_start) { sa_sint_t s = cache[i].symbol, q = (cache[s].index = cache[i].index) & SAINT_MAX; cache[s].symbol = BUCKETS_INDEX4(T[q - 1], T[q - 2] > T[q - 1]); }
+    }
+
+    return d;
+}
+
+static sa_sint_t libsais16_partial_sorting_scan_right_to_left_32s_4k_block_sort(const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) /* Single-threaded phase 2 (4k layout): resolves cached entries right to left (symbol < 0 means a dead slot), chaining in-block inductions through the cache. Returns the updated d. */
+{
+    const fast_sint_t prefetch_distance = 32;
+
+    sa_sint_t * RESTRICT induction_bucket = &buckets[3 * k];
+    sa_sint_t * RESTRICT distinct_names = &buckets[0 * k];
+
+    fast_sint_t i, j;
+    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2) /* 2-way unrolled backward pass over the cache */
+    {
+        libsais16_prefetchw(&cache[i - 2 * prefetch_distance]);
+
+        sa_sint_t s0 = cache[i - prefetch_distance - 0].symbol; const sa_sint_t * Is0 = &induction_bucket[s0 >> 1]; libsais16_prefetchw(s0 >= 0 ? Is0 : NULL); const sa_sint_t * Ds0 = &distinct_names[s0]; libsais16_prefetchw(s0 >= 0 ? Ds0 : NULL); /* NULL-guarded prefetch of both bucket cells; s>>1 maps the 2-wide index to the 1-wide induction bucket */
+        sa_sint_t s1 = cache[i - prefetch_distance - 1].symbol; const sa_sint_t * Is1 = &induction_bucket[s1 >> 1]; libsais16_prefetchw(s1 >= 0 ? Is1 : NULL); const sa_sint_t * Ds1 = &distinct_names[s1]; libsais16_prefetchw(s1 >= 0 ? Ds1 : NULL);
+
+        sa_sint_t v0 = cache[i - 0].symbol;
+        if (v0 >= 0) /* negative symbol marks an empty/dead slot */
+        {
+            sa_sint_t p0 = cache[i - 0].index; d += (p0 >> (SUFFIX_GROUP_BIT - 1)); cache[i - 0].symbol = --induction_bucket[v0 >> 1]; cache[i - 0].index = (p0 - 1) | (v0 << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v0] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v0] = d; /* low bit of v0 is the type flag and becomes the top bit of the stored value */
+            if (cache[i - 0].symbol >= omp_block_start) { sa_sint_t ni = cache[i - 0].symbol, np = cache[i - 0].index; if (np > 0) { cache[i - 0].index = 0; cache[ni].index = np; np &= ~SUFFIX_GROUP_MARKER; cache[ni].symbol = BUCKETS_INDEX2(T[np - 1], T[np - 2] > T[np - 1]); } } /* in-block destination: forward through the cache so this pass also consumes it */
+        }
+
+        sa_sint_t v1 = cache[i - 1].symbol;
+        if (v1 >= 0)
+        {
+            sa_sint_t p1 = cache[i - 1].index; d += (p1 >> (SUFFIX_GROUP_BIT - 1)); cache[i - 1].symbol = --induction_bucket[v1 >> 1]; cache[i - 1].index = (p1 - 1) | (v1 << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v1] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v1] = d;
+            if (cache[i - 1].symbol >= omp_block_start) { sa_sint_t ni = cache[i - 1].symbol, np = cache[i - 1].index; if (np > 0) { cache[i - 1].index = 0; cache[ni].index = np; np &= ~SUFFIX_GROUP_MARKER; cache[ni].symbol = BUCKETS_INDEX2(T[np - 1], T[np - 2] > T[np - 1]); } }
+        }
+    }
+
+    for (j -= prefetch_distance + 1; i >= j; i -= 1) /* scalar tail */
+    {
+        sa_sint_t v = cache[i].symbol;
+        if (v >= 0)
+        {
+            sa_sint_t p = cache[i].index; d += (p >> (SUFFIX_GROUP_BIT - 1)); cache[i].symbol = --induction_bucket[v >> 1]; cache[i].index = (p - 1) | (v << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v] = d;
+            if (cache[i].symbol >= omp_block_start) { sa_sint_t ni = cache[i].symbol, np = cache[i].index; if (np > 0) { cache[i].index = 0; cache[ni].index = np; np &= ~SUFFIX_GROUP_MARKER; cache[ni].symbol = BUCKETS_INDEX2(T[np - 1], T[np - 2] > T[np - 1]); } }
+        }
+    }
+
+    return d;
+}
+
+static void libsais16_partial_sorting_scan_right_to_left_32s_1k_block_sort(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) /* Single-threaded phase 2 (1k layout): assigns destination positions to cached entries right to left, chaining in-block inductions through the cache; no distinct-name tracking. */
+{
+    const fast_sint_t prefetch_distance = 32;
+
+    fast_sint_t i, j;
+    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2) /* 2-way unrolled backward pass over the cache */
+    {
+        libsais16_prefetchw(&cache[i - 2 * prefetch_distance]);
+
+        sa_sint_t s0 = cache[i - prefetch_distance - 0].symbol; const sa_sint_t * Is0 = &induction_bucket[s0]; libsais16_prefetchw(s0 >= 0 ? Is0 : NULL); /* NULL-guarded bucket prefetch */
+        sa_sint_t s1 = cache[i - prefetch_distance - 1].symbol; const sa_sint_t * Is1 = &induction_bucket[s1]; libsais16_prefetchw(s1 >= 0 ? Is1 : NULL);
+
+        sa_sint_t v0 = cache[i - 0].symbol;
+        if (v0 >= 0) /* negative symbol marks an empty/dead slot */
+        {
+            cache[i - 0].symbol = --induction_bucket[v0]; /* symbol now holds the destination SA position */
+            if (cache[i - 0].symbol >= omp_block_start) { sa_sint_t ni = cache[i - 0].symbol, np = cache[i - 0].index; if (np > 0) { cache[i - 0].index = 0; cache[ni].index = (np - 1) | ((sa_sint_t)(T[np - 2] > T[np - 1]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np - 1]; } } /* in-block destination: induce into the cache so this pass also consumes it */
+        }
+
+        sa_sint_t v1 = cache[i - 1].symbol;
+        if (v1 >= 0)
+        {
+            cache[i - 1].symbol = --induction_bucket[v1];
+            if (cache[i - 1].symbol >= omp_block_start) { sa_sint_t ni = cache[i - 1].symbol, np = cache[i - 1].index; if (np > 0) { cache[i - 1].index = 0; cache[ni].index = (np - 1) | ((sa_sint_t)(T[np - 2] > T[np - 1]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np - 1]; }}
+        }
+    }
+
+    for (j -= prefetch_distance + 1; i >= j; i -= 1) /* scalar tail */
+    {
+        sa_sint_t v = cache[i].symbol;
+        if (v >= 0)
+        {
+            cache[i].symbol = --induction_bucket[v];
+            if (cache[i].symbol >= omp_block_start) { sa_sint_t ni = cache[i].symbol, np = cache[i].index; if (np > 0) { cache[i].index = 0; cache[ni].index = (np - 1) | ((sa_sint_t)(T[np - 2] > T[np - 1]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np - 1]; } }
+        }
+    }
+}
+
+static sa_sint_t libsais16_partial_sorting_scan_right_to_left_32s_6k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads) /* Parallel right-to-left induction for one block (6k layout): parallel gather into the cache, serial sort by the master, parallel place back into SA. Returns the updated d. */
+{
+#if defined(_OPENMP)
+    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384)
+#endif
+    {
+#if defined(_OPENMP)
+        fast_sint_t omp_thread_num = omp_get_thread_num();
+        fast_sint_t omp_num_threads = omp_get_num_threads();
+#else
+        UNUSED(threads); UNUSED(cache);
+
+        fast_sint_t omp_thread_num = 0;
+        fast_sint_t omp_num_threads = 1;
+#endif
+        fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); /* equal 16-aligned slices; last thread takes the remainder */
+        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
+
+        omp_block_start += block_start;
+
+        if (omp_num_threads == 1) /* degenerate team: run the serial scan directly */
+        {
+            d = libsais16_partial_sorting_scan_right_to_left_32s_6k(T, SA, buckets, d, omp_block_start, omp_block_size);
+        }
+#if defined(_OPENMP)
+        else
+        {
+            { /* phase 1 (all threads): snapshot SA entries and bucket indices into the cache; cache is indexed by SA position, hence the -block_start rebase */
+                libsais16_partial_sorting_scan_right_to_left_32s_6k_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size);
+            }
+
+            #pragma omp barrier
+
+            #pragma omp master
+            { /* phase 2 (master only): sequential dependency resolution over the whole block */
+                d = libsais16_partial_sorting_scan_right_to_left_32s_6k_block_sort(T, buckets, d, cache - block_start, block_start, block_size);
+            }
+
+            #pragma omp barrier
+
+            { /* phase 3 (all threads): scatter the resolved entries back into SA */
+                libsais16_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size);
+            }
+        }
+#endif
+    }
+
+    return d;
+}
+
+/* OpenMP driver for one right-to-left block of the 4k partial-sorting scan.
+   Same gather / master-sort / place pipeline as the 6k variant, but the
+   sort pass also takes the alphabet size k, and the write-back compacts
+   the cached entries.  Returns the updated running counter d. */
+static sa_sint_t libsais16_partial_sorting_scan_right_to_left_32s_4k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads)
+{
+#if defined(_OPENMP)
+ #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384)
+#endif
+ {
+#if defined(_OPENMP)
+ fast_sint_t omp_thread_num = omp_get_thread_num();
+ fast_sint_t omp_num_threads = omp_get_num_threads();
+#else
+ UNUSED(threads); UNUSED(cache);
+
+ fast_sint_t omp_thread_num = 0;
+ fast_sint_t omp_num_threads = 1;
+#endif
+ /* Per-thread stripe; stride rounded down to a multiple of 16. */
+ fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
+ fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+ fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
+
+ omp_block_start += block_start;
+
+ if (omp_num_threads == 1)
+ {
+ d = libsais16_partial_sorting_scan_right_to_left_32s_4k(T, SA, k, buckets, d, omp_block_start, omp_block_size);
+ }
+#if defined(_OPENMP)
+ else
+ {
+ {
+ /* Phase 1: gather stripe into the shared cache. */
+ libsais16_partial_sorting_scan_right_to_left_32s_4k_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size);
+ }
+
+ #pragma omp barrier
+
+ #pragma omp master
+ {
+ /* Phase 2: single-threaded order-dependent sort over the whole block. */
+ d = libsais16_partial_sorting_scan_right_to_left_32s_4k_block_sort(T, k, buckets, d, cache - block_start, block_start, block_size);
+ }
+
+ #pragma omp barrier
+
+ {
+ /* Phase 3: compact and write cached results back into SA. */
+ libsais16_compact_and_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size);
+ }
+ }
+#endif
+ }
+
+ return d;
+}
+
+/* OpenMP driver for one right-to-left block of the 1k partial-sorting scan.
+   Identical pipeline to the 4k variant (gather / master sort / compact and
+   place) but the 1k scan carries no running counter, so nothing is
+   returned. */
+static void libsais16_partial_sorting_scan_right_to_left_32s_1k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads)
+{
+#if defined(_OPENMP)
+ #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384)
+#endif
+ {
+#if defined(_OPENMP)
+ fast_sint_t omp_thread_num = omp_get_thread_num();
+ fast_sint_t omp_num_threads = omp_get_num_threads();
+#else
+ UNUSED(threads); UNUSED(cache);
+
+ fast_sint_t omp_thread_num = 0;
+ fast_sint_t omp_num_threads = 1;
+#endif
+ /* Per-thread stripe; stride rounded down to a multiple of 16. */
+ fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
+ fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+ fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
+
+ omp_block_start += block_start;
+
+ if (omp_num_threads == 1)
+ {
+ libsais16_partial_sorting_scan_right_to_left_32s_1k(T, SA, buckets, omp_block_start, omp_block_size);
+ }
+#if defined(_OPENMP)
+ else
+ {
+ {
+ /* Phase 1: gather stripe into the shared cache. */
+ libsais16_partial_sorting_scan_right_to_left_32s_1k_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size);
+ }
+
+ #pragma omp barrier
+
+ #pragma omp master
+ {
+ /* Phase 2: single-threaded order-dependent sort over the whole block. */
+ libsais16_partial_sorting_scan_right_to_left_32s_1k_block_sort(T, buckets, cache - block_start, block_start, block_size);
+ }
+
+ #pragma omp barrier
+
+ {
+ /* Phase 3: compact and write cached results back into SA. */
+ libsais16_compact_and_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size);
+ }
+ }
+#endif
+ }
+}
+
+#endif
+
+/* Top-level right-to-left 6k partial-sorting scan.  The scanned range is
+   [left_suffixes_count + 1, n - first_lms_suffix).  Small ranges or a
+   single thread use the sequential scan; otherwise the range is consumed
+   from the top down in chunks of threads * LIBSAIS_PER_THREAD_CACHE_SIZE
+   entries, each handed to the parallel block driver.  Returns the updated
+   running counter d. */
+static sa_sint_t libsais16_partial_sorting_scan_right_to_left_32s_6k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count, sa_sint_t d, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+ fast_sint_t scan_start = (fast_sint_t)left_suffixes_count + 1;
+ fast_sint_t scan_end = (fast_sint_t)n - (fast_sint_t)first_lms_suffix;
+
+ if (threads == 1 || (scan_end - scan_start) < 65536)
+ {
+ d = libsais16_partial_sorting_scan_right_to_left_32s_6k(T, SA, buckets, d, scan_start, scan_end - scan_start);
+ }
+#if defined(_OPENMP)
+ else
+ {
+ fast_sint_t block_start, block_end;
+ for (block_start = scan_end - 1; block_start >= scan_start; block_start = block_end)
+ {
+ /* Clamp the chunk so it never reaches below scan_start. */
+ block_end = block_start - (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end < scan_start) { block_end = scan_start - 1; }
+
+ d = libsais16_partial_sorting_scan_right_to_left_32s_6k_block_omp(T, SA, buckets, d, thread_state[0].state.cache, block_end + 1, block_start - block_end, threads);
+ }
+ }
+#else
+ UNUSED(thread_state);
+#endif
+
+ return d;
+}
+
+/* Top-level right-to-left 4k partial-sorting scan over SA[0..n).  Small
+   inputs or a single thread run sequentially; otherwise the range is
+   consumed from the top down in per-chunk parallel blocks sized
+   threads * LIBSAIS_PER_THREAD_CACHE_SIZE.  Returns the updated counter d. */
+static sa_sint_t libsais16_partial_sorting_scan_right_to_left_32s_4k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+ if (threads == 1 || n < 65536)
+ {
+ d = libsais16_partial_sorting_scan_right_to_left_32s_4k(T, SA, k, buckets, d, 0, n);
+ }
+#if defined(_OPENMP)
+ else
+ {
+ fast_sint_t block_start, block_end;
+ for (block_start = (fast_sint_t)n - 1; block_start >= 0; block_start = block_end)
+ {
+ /* Clamp the chunk at the bottom of the array. */
+ block_end = block_start - (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end < 0) { block_end = -1; }
+
+ d = libsais16_partial_sorting_scan_right_to_left_32s_4k_block_omp(T, SA, k, buckets, d, thread_state[0].state.cache, block_end + 1, block_start - block_end, threads);
+ }
+ }
+#else
+ UNUSED(thread_state);
+#endif
+
+ return d;
+}
+
+/* Top-level right-to-left 1k partial-sorting scan over SA[0..n).  Same
+   chunking strategy as the 4k variant, but no running counter is kept. */
+static void libsais16_partial_sorting_scan_right_to_left_32s_1k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+ if (threads == 1 || n < 65536)
+ {
+ libsais16_partial_sorting_scan_right_to_left_32s_1k(T, SA, buckets, 0, n);
+ }
+#if defined(_OPENMP)
+ else
+ {
+ fast_sint_t block_start, block_end;
+ for (block_start = (fast_sint_t)n - 1; block_start >= 0; block_start = block_end)
+ {
+ /* Clamp the chunk at the bottom of the array. */
+ block_end = block_start - (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end < 0) { block_end = -1; }
+
+ libsais16_partial_sorting_scan_right_to_left_32s_1k_block_omp(T, SA, buckets, thread_state[0].state.cache, block_end + 1, block_start - block_end, threads);
+ }
+ }
+#else
+ UNUSED(thread_state);
+#endif
+}
+
+/* Compacts the entries of SA[omp_block_start .. +omp_block_size) that carry
+   the sign-bit mark down to the front of the same range, stripping the
+   suffix-group marker from the stored value.  The write index l advances
+   only for marked (negative) entries, so unmarked slots are overwritten by
+   later marked ones.  Returns one past the last compacted position.
+   Loop is unrolled 4x with software prefetch. */
+static fast_sint_t libsais16_partial_sorting_gather_lms_suffixes_32s_4k(sa_sint_t * RESTRICT SA, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ fast_sint_t i, j, l;
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - 3, l = omp_block_start; i < j; i += 4)
+ {
+ libsais16_prefetch(&SA[i + prefetch_distance]);
+
+ sa_sint_t s0 = SA[i + 0]; SA[l] = (s0 - SUFFIX_GROUP_MARKER) & (~SUFFIX_GROUP_MARKER); l += (s0 < 0);
+ sa_sint_t s1 = SA[i + 1]; SA[l] = (s1 - SUFFIX_GROUP_MARKER) & (~SUFFIX_GROUP_MARKER); l += (s1 < 0);
+ sa_sint_t s2 = SA[i + 2]; SA[l] = (s2 - SUFFIX_GROUP_MARKER) & (~SUFFIX_GROUP_MARKER); l += (s2 < 0);
+ sa_sint_t s3 = SA[i + 3]; SA[l] = (s3 - SUFFIX_GROUP_MARKER) & (~SUFFIX_GROUP_MARKER); l += (s3 < 0);
+ }
+
+ /* Scalar tail for the last (omp_block_size mod 4) entries. */
+ for (j += 3; i < j; i += 1)
+ {
+ sa_sint_t s = SA[i]; SA[l] = (s - SUFFIX_GROUP_MARKER) & (~SUFFIX_GROUP_MARKER); l += (s < 0);
+ }
+
+ return l;
+}
+
+/* 1k counterpart of the gather above: compacts sign-bit-marked entries to
+   the front of the range, clearing the sign bit (s & SAINT_MAX) instead of
+   removing a suffix-group marker.  Returns one past the last compacted
+   position. */
+static fast_sint_t libsais16_partial_sorting_gather_lms_suffixes_32s_1k(sa_sint_t * RESTRICT SA, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ fast_sint_t i, j, l;
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - 3, l = omp_block_start; i < j; i += 4)
+ {
+ libsais16_prefetch(&SA[i + prefetch_distance]);
+
+ sa_sint_t s0 = SA[i + 0]; SA[l] = s0 & SAINT_MAX; l += (s0 < 0);
+ sa_sint_t s1 = SA[i + 1]; SA[l] = s1 & SAINT_MAX; l += (s1 < 0);
+ sa_sint_t s2 = SA[i + 2]; SA[l] = s2 & SAINT_MAX; l += (s2 < 0);
+ sa_sint_t s3 = SA[i + 3]; SA[l] = s3 & SAINT_MAX; l += (s3 < 0);
+ }
+
+ /* Scalar tail. */
+ for (j += 3; i < j; i += 1)
+ {
+ sa_sint_t s = SA[i]; SA[l] = s & SAINT_MAX; l += (s < 0);
+ }
+
+ return l;
+}
+
+/* Parallel wrapper for the 4k gather: each thread compacts its own stripe
+   in place and records the stripe start and the number of gathered entries
+   in thread_state; after a barrier the master thread memmoves the
+   per-thread results together into one contiguous prefix of SA. */
+static void libsais16_partial_sorting_gather_lms_suffixes_32s_4k_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+#if defined(_OPENMP)
+ #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536)
+#endif
+ {
+#if defined(_OPENMP)
+ fast_sint_t omp_thread_num = omp_get_thread_num();
+ fast_sint_t omp_num_threads = omp_get_num_threads();
+#else
+ UNUSED(threads); UNUSED(thread_state);
+
+ fast_sint_t omp_thread_num = 0;
+ fast_sint_t omp_num_threads = 1;
+#endif
+ /* Per-thread stripe; stride rounded down to a multiple of 16. */
+ fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
+ fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+ fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;
+
+ if (omp_num_threads == 1)
+ {
+ libsais16_partial_sorting_gather_lms_suffixes_32s_4k(SA, omp_block_start, omp_block_size);
+ }
+#if defined(_OPENMP)
+ else
+ {
+ {
+ /* Each thread compacts its stripe and records where its results
+    start and how many entries it produced. */
+ thread_state[omp_thread_num].state.position = omp_block_start;
+ thread_state[omp_thread_num].state.count = libsais16_partial_sorting_gather_lms_suffixes_32s_4k(SA, omp_block_start, omp_block_size) - omp_block_start;
+ }
+
+ #pragma omp barrier
+
+ #pragma omp master
+ {
+ /* Concatenate the per-thread runs; thread 0's run is already in
+    place, so only t > 0 needs moving. */
+ fast_sint_t t, position = 0;
+ for (t = 0; t < omp_num_threads; ++t)
+ {
+ if (t > 0 && thread_state[t].state.count > 0)
+ {
+ memmove(&SA[position], &SA[thread_state[t].state.position], (size_t)thread_state[t].state.count * sizeof(sa_sint_t));
+ }
+
+ position += thread_state[t].state.count;
+ }
+ }
+ }
+#endif
+ }
+}
+
+/* Parallel wrapper for the 1k gather; structurally identical to the 4k
+   wrapper above: per-thread in-place compaction, barrier, then the master
+   thread concatenates the per-thread runs into a contiguous prefix. */
+static void libsais16_partial_sorting_gather_lms_suffixes_32s_1k_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+#if defined(_OPENMP)
+ #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536)
+#endif
+ {
+#if defined(_OPENMP)
+ fast_sint_t omp_thread_num = omp_get_thread_num();
+ fast_sint_t omp_num_threads = omp_get_num_threads();
+#else
+ UNUSED(threads); UNUSED(thread_state);
+
+ fast_sint_t omp_thread_num = 0;
+ fast_sint_t omp_num_threads = 1;
+#endif
+ /* Per-thread stripe; stride rounded down to a multiple of 16. */
+ fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
+ fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+ fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;
+
+ if (omp_num_threads == 1)
+ {
+ libsais16_partial_sorting_gather_lms_suffixes_32s_1k(SA, omp_block_start, omp_block_size);
+ }
+#if defined(_OPENMP)
+ else
+ {
+ {
+ /* Record stripe origin and number of gathered entries. */
+ thread_state[omp_thread_num].state.position = omp_block_start;
+ thread_state[omp_thread_num].state.count = libsais16_partial_sorting_gather_lms_suffixes_32s_1k(SA, omp_block_start, omp_block_size) - omp_block_start;
+ }
+
+ #pragma omp barrier
+
+ #pragma omp master
+ {
+ /* Concatenate per-thread runs; t == 0 is already in place. */
+ fast_sint_t t, position = 0;
+ for (t = 0; t < omp_num_threads; ++t)
+ {
+ if (t > 0 && thread_state[t].state.count > 0)
+ {
+ memmove(&SA[position], &SA[thread_state[t].state.position], (size_t)thread_state[t].state.count * sizeof(sa_sint_t));
+ }
+
+ position += thread_state[t].state.count;
+ }
+ }
+ }
+#endif
+ }
+}
+
+/* Induces the partial order of suffixes for the 16-bit alphabet case:
+   left-to-right scan, marker shift, then right-to-left scan, with the
+   running counter d passed from the first scan to the last. */
+static void libsais16_induce_partial_order_16u_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+ /* Clear the scratch region of the bucket table before scanning. */
+ memset(&buckets[2 * ALPHABET_SIZE], 0, 2 * ALPHABET_SIZE * sizeof(sa_sint_t));
+
+ sa_sint_t d = libsais16_partial_sorting_scan_left_to_right_16u_omp(T, SA, n, buckets, left_suffixes_count, 0, threads, thread_state);
+ libsais16_partial_sorting_shift_markers_16u_omp(SA, n, buckets, threads);
+ libsais16_partial_sorting_scan_right_to_left_16u_omp(T, SA, n, buckets, first_lms_suffix, left_suffixes_count, d, threads, thread_state);
+}
+
+/* 32-bit / 6k-bucket variant: left-to-right scan, shift markers, shift the
+   bucket table, then the right-to-left scan. */
+static void libsais16_induce_partial_order_32s_6k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+ sa_sint_t d = libsais16_partial_sorting_scan_left_to_right_32s_6k_omp(T, SA, n, buckets, left_suffixes_count, 0, threads, thread_state);
+ libsais16_partial_sorting_shift_markers_32s_6k_omp(SA, k, buckets, threads);
+ libsais16_partial_sorting_shift_buckets_32s_6k(k, buckets);
+ libsais16_partial_sorting_scan_right_to_left_32s_6k_omp(T, SA, n, buckets, first_lms_suffix, left_suffixes_count, d, threads, thread_state);
+}
+
+/* 32-bit / 4k-bucket variant: zero the 2k-entry bucket area, run both
+   scans, then gather the marked LMS suffixes to the front of SA. */
+static void libsais16_induce_partial_order_32s_4k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+ memset(buckets, 0, 2 * (size_t)k * sizeof(sa_sint_t));
+
+ sa_sint_t d = libsais16_partial_sorting_scan_left_to_right_32s_4k_omp(T, SA, n, k, buckets, 0, threads, thread_state);
+ libsais16_partial_sorting_shift_markers_32s_4k(SA, n);
+ libsais16_partial_sorting_scan_right_to_left_32s_4k_omp(T, SA, n, k, buckets, d, threads, thread_state);
+ libsais16_partial_sorting_gather_lms_suffixes_32s_4k_omp(SA, n, threads, thread_state);
+}
+
+/* 32-bit / 2k-bucket variant: the two scan directions use the two halves
+   of the bucket array, then marked LMS suffixes are gathered. */
+static void libsais16_induce_partial_order_32s_2k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+ libsais16_partial_sorting_scan_left_to_right_32s_1k_omp(T, SA, n, &buckets[1 * k], threads, thread_state);
+ libsais16_partial_sorting_scan_right_to_left_32s_1k_omp(T, SA, n, &buckets[0 * k], threads, thread_state);
+ libsais16_partial_sorting_gather_lms_suffixes_32s_1k_omp(SA, n, threads, thread_state);
+}
+
+/* 32-bit / 1k-bucket variant: only one bucket array is available, so the
+   symbol counts are recomputed and re-initialized (start offsets, then end
+   offsets) before each of the two scans. */
+static void libsais16_induce_partial_order_32s_1k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+ libsais16_count_suffixes_32s(T, n, k, buckets);
+ libsais16_initialize_buckets_start_32s_1k(k, buckets);
+ libsais16_partial_sorting_scan_left_to_right_32s_1k_omp(T, SA, n, buckets, threads, thread_state);
+
+ libsais16_count_suffixes_32s(T, n, k, buckets);
+ libsais16_initialize_buckets_end_32s_1k(k, buckets);
+ libsais16_partial_sorting_scan_right_to_left_32s_1k_omp(T, SA, n, buckets, threads, thread_state);
+
+ libsais16_partial_sorting_gather_lms_suffixes_32s_1k_omp(SA, n, threads, thread_state);
+}
+
+/* Assigns names to the LMS suffixes stored in SA[omp_block_start ..
+   +omp_block_size).  For each entry p, the current name (with the sign bit
+   set as a marker) is written into the upper half of SA at index
+   (p & SAINT_MAX) >> 1, and the name is incremented whenever p itself
+   carries the sign-bit mark (start of a new group).  Starts from the given
+   name and returns the final value, enabling per-thread prefix offsets in
+   the parallel wrapper.  Unrolled 4x with prefetch of both streams. */
+static sa_sint_t libsais16_renumber_lms_suffixes_16u(sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t name, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ sa_sint_t * RESTRICT SAm = &SA[m];
+
+ fast_sint_t i, j;
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4)
+ {
+ libsais16_prefetch(&SA[i + 2 * prefetch_distance]);
+
+ libsais16_prefetchw(&SAm[(SA[i + prefetch_distance + 0] & SAINT_MAX) >> 1]);
+ libsais16_prefetchw(&SAm[(SA[i + prefetch_distance + 1] & SAINT_MAX) >> 1]);
+ libsais16_prefetchw(&SAm[(SA[i + prefetch_distance + 2] & SAINT_MAX) >> 1]);
+ libsais16_prefetchw(&SAm[(SA[i + prefetch_distance + 3] & SAINT_MAX) >> 1]);
+
+ sa_sint_t p0 = SA[i + 0]; SAm[(p0 & SAINT_MAX) >> 1] = name | SAINT_MIN; name += p0 < 0;
+ sa_sint_t p1 = SA[i + 1]; SAm[(p1 & SAINT_MAX) >> 1] = name | SAINT_MIN; name += p1 < 0;
+ sa_sint_t p2 = SA[i + 2]; SAm[(p2 & SAINT_MAX) >> 1] = name | SAINT_MIN; name += p2 < 0;
+ sa_sint_t p3 = SA[i + 3]; SAm[(p3 & SAINT_MAX) >> 1] = name | SAINT_MIN; name += p3 < 0;
+ }
+
+ /* Scalar tail covering the last prefetch_distance + 3 entries. */
+ for (j += prefetch_distance + 3; i < j; i += 1)
+ {
+ sa_sint_t p = SA[i]; SAm[(p & SAINT_MAX) >> 1] = name | SAINT_MIN; name += p < 0;
+ }
+
+ return name;
+}
+
+/* Scans SA[m + omp_block_start .. m + omp_block_start + omp_block_size)
+   backwards and compacts the sign-bit-marked entries (sign stripped) into
+   descending positions ending just below index l.  Returns the index of
+   the first (lowest) gathered entry.  Unrolled 4x with prefetch. */
+static fast_sint_t libsais16_gather_marked_suffixes_16u(sa_sint_t * RESTRICT SA, sa_sint_t m, fast_sint_t l, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ /* l is treated as an inclusive write cursor inside the loop. */
+ l -= 1;
+
+ fast_sint_t i, j;
+ for (i = (fast_sint_t)m + omp_block_start + omp_block_size - 1, j = (fast_sint_t)m + omp_block_start + 3; i >= j; i -= 4)
+ {
+ libsais16_prefetch(&SA[i - prefetch_distance]);
+
+ sa_sint_t s0 = SA[i - 0]; SA[l] = s0 & SAINT_MAX; l -= s0 < 0;
+ sa_sint_t s1 = SA[i - 1]; SA[l] = s1 & SAINT_MAX; l -= s1 < 0;
+ sa_sint_t s2 = SA[i - 2]; SA[l] = s2 & SAINT_MAX; l -= s2 < 0;
+ sa_sint_t s3 = SA[i - 3]; SA[l] = s3 & SAINT_MAX; l -= s3 < 0;
+ }
+
+ /* Scalar tail. */
+ for (j -= 3; i >= j; i -= 1)
+ {
+ sa_sint_t s = SA[i]; SA[l] = s & SAINT_MAX; l -= s < 0;
+ }
+
+ /* Convert back to the exclusive boundary. */
+ l += 1;
+
+ return l;
+}
+
+/* Parallel wrapper for LMS renumbering.  Single thread: renumber directly
+   starting at 0.  Multi-threaded: each thread first counts the marked
+   (negative) entries in its stripe; after a barrier, each thread derives
+   its starting name as the prefix sum of earlier threads' counts and
+   renumbers its stripe.  The last thread also produces the total name
+   count, which is returned. */
+static sa_sint_t libsais16_renumber_lms_suffixes_16u_omp(sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+ sa_sint_t name = 0;
+
+#if defined(_OPENMP)
+ #pragma omp parallel num_threads(threads) if(threads > 1 && m >= 65536)
+#endif
+ {
+#if defined(_OPENMP)
+ fast_sint_t omp_thread_num = omp_get_thread_num();
+ fast_sint_t omp_num_threads = omp_get_num_threads();
+#else
+ UNUSED(threads); UNUSED(thread_state);
+
+ fast_sint_t omp_thread_num = 0;
+ fast_sint_t omp_num_threads = 1;
+#endif
+ /* Per-thread stripe; stride rounded down to a multiple of 16. */
+ fast_sint_t omp_block_stride = (m / omp_num_threads) & (-16);
+ fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+ fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : m - omp_block_start;
+
+ if (omp_num_threads == 1)
+ {
+ name = libsais16_renumber_lms_suffixes_16u(SA, m, 0, omp_block_start, omp_block_size);
+ }
+#if defined(_OPENMP)
+ else
+ {
+ {
+ /* Pass 1: count marked entries per stripe. */
+ thread_state[omp_thread_num].state.count = libsais16_count_negative_marked_suffixes(SA, omp_block_start, omp_block_size);
+ }
+
+ #pragma omp barrier
+
+ {
+ /* Pass 2: prefix-sum the counts to get this thread's first name. */
+ fast_sint_t t, count = 0; for (t = 0; t < omp_thread_num; ++t) { count += thread_state[t].state.count; }
+
+ if (omp_thread_num == omp_num_threads - 1)
+ {
+ /* Last thread knows the grand total. */
+ name = (sa_sint_t)(count + thread_state[omp_thread_num].state.count);
+ }
+
+ libsais16_renumber_lms_suffixes_16u(SA, m, (sa_sint_t)count, omp_block_start, omp_block_size);
+ }
+ }
+#endif
+ }
+
+ return name;
+}
+
+/* Parallel wrapper for gathering marked LMS suffixes from the upper half
+   of SA (n >> 1 entries starting at m) toward the end of the array
+   (boundary n + fs).  Single thread gathers directly.  Multi-threaded:
+   every thread except the last gathers into the tail of its own stripe,
+   the last thread gathers into the final boundary; after a barrier the
+   master thread memmoves the per-thread runs, walking from the last thread
+   downwards, so they end up contiguous at the top of SA. */
+static void libsais16_gather_marked_lms_suffixes_16u_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+#if defined(_OPENMP)
+ #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 131072)
+#endif
+ {
+#if defined(_OPENMP)
+ fast_sint_t omp_thread_num = omp_get_thread_num();
+ fast_sint_t omp_num_threads = omp_get_num_threads();
+#else
+ UNUSED(threads); UNUSED(thread_state);
+
+ fast_sint_t omp_thread_num = 0;
+ fast_sint_t omp_num_threads = 1;
+#endif
+ /* Stripes cover the n >> 1 entries of the upper half of SA. */
+ fast_sint_t omp_block_stride = (((fast_sint_t)n >> 1) / omp_num_threads) & (-16);
+ fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+ fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : ((fast_sint_t)n >> 1) - omp_block_start;
+
+ if (omp_num_threads == 1)
+ {
+ libsais16_gather_marked_suffixes_16u(SA, m, (fast_sint_t)n + (fast_sint_t)fs, omp_block_start, omp_block_size);
+ }
+#if defined(_OPENMP)
+ else
+ {
+ {
+ if (omp_thread_num < omp_num_threads - 1)
+ {
+ /* Gather into the tail of this thread's own stripe; record where
+    the gathered run starts and how long it is. */
+ thread_state[omp_thread_num].state.position = libsais16_gather_marked_suffixes_16u(SA, m, (fast_sint_t)m + omp_block_start + omp_block_size, omp_block_start, omp_block_size);
+ thread_state[omp_thread_num].state.count = (fast_sint_t)m + omp_block_start + omp_block_size - thread_state[omp_thread_num].state.position;
+ }
+ else
+ {
+ /* Last thread gathers straight into the final boundary n + fs. */
+ thread_state[omp_thread_num].state.position = libsais16_gather_marked_suffixes_16u(SA, m, (fast_sint_t)n + (fast_sint_t)fs, omp_block_start, omp_block_size);
+ thread_state[omp_thread_num].state.count = (fast_sint_t)n + (fast_sint_t)fs - thread_state[omp_thread_num].state.position;
+ }
+ }
+
+ #pragma omp barrier
+
+ #pragma omp master
+ {
+ /* Stitch the runs together from the top down; the last thread's
+    run is already in its final place. */
+ fast_sint_t t, position = (fast_sint_t)n + (fast_sint_t)fs;
+
+ for (t = omp_num_threads - 1; t >= 0; --t)
+ {
+ position -= thread_state[t].state.count;
+ if (t != omp_num_threads - 1 && thread_state[t].state.count > 0)
+ {
+ memmove(&SA[position], &SA[thread_state[t].state.position], (size_t)thread_state[t].state.count * sizeof(sa_sint_t));
+ }
+ }
+ }
+ }
+#endif
+ }
+}
+
+/* Renumbers the m LMS suffixes and, when not all names are distinct
+   (name < m), gathers the marked entries for the recursive step; when all
+   names are distinct only the sign-bit marks need stripping.  Returns the
+   number of distinct names. */
+static sa_sint_t libsais16_renumber_and_gather_lms_suffixes_16u_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+ /* Zero the upper half of SA used as the name table. */
+ memset(&SA[m], 0, ((size_t)n >> 1) * sizeof(sa_sint_t));
+
+ sa_sint_t name = libsais16_renumber_lms_suffixes_16u_omp(SA, m, threads, thread_state);
+ if (name < m)
+ {
+ libsais16_gather_marked_lms_suffixes_16u_omp(SA, n, m, fs, threads, thread_state);
+ }
+ else
+ {
+ /* All names unique: just clear the marker bits in place. */
+ fast_sint_t i; for (i = 0; i < m; i += 1) { SA[i] &= SAINT_MAX; }
+ }
+
+ return name;
+}
+
+/* 32-bit variant of LMS renumbering that also tracks distinctness.  For
+   each entry p the sign bit is cleared in SA itself, and the name written
+   into the upper half keeps the sign-bit mark only when both p and its
+   predecessor carry the mark (name | (p & prev & SAINT_MIN)).  The name is
+   incremented for every marked entry.  Starts at the given name and
+   returns the final value.  p3 carries the last value across loop
+   iterations and into the scalar tail. */
+static sa_sint_t libsais16_renumber_distinct_lms_suffixes_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t name, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ sa_sint_t * RESTRICT SAm = &SA[m];
+
+ fast_sint_t i, j; sa_sint_t p0, p1, p2, p3 = 0;
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4)
+ {
+ libsais16_prefetchw(&SA[i + 2 * prefetch_distance]);
+
+ libsais16_prefetchw(&SAm[(SA[i + prefetch_distance + 0] & SAINT_MAX) >> 1]);
+ libsais16_prefetchw(&SAm[(SA[i + prefetch_distance + 1] & SAINT_MAX) >> 1]);
+ libsais16_prefetchw(&SAm[(SA[i + prefetch_distance + 2] & SAINT_MAX) >> 1]);
+ libsais16_prefetchw(&SAm[(SA[i + prefetch_distance + 3] & SAINT_MAX) >> 1]);
+
+ p0 = SA[i + 0]; SAm[(SA[i + 0] = p0 & SAINT_MAX) >> 1] = name | (p0 & p3 & SAINT_MIN); name += p0 < 0;
+ p1 = SA[i + 1]; SAm[(SA[i + 1] = p1 & SAINT_MAX) >> 1] = name | (p1 & p0 & SAINT_MIN); name += p1 < 0;
+ p2 = SA[i + 2]; SAm[(SA[i + 2] = p2 & SAINT_MAX) >> 1] = name | (p2 & p1 & SAINT_MIN); name += p2 < 0;
+ p3 = SA[i + 3]; SAm[(SA[i + 3] = p3 & SAINT_MAX) >> 1] = name | (p3 & p2 & SAINT_MIN); name += p3 < 0;
+ }
+
+ /* Scalar tail; p2/p3 continue the predecessor chain. */
+ for (j += prefetch_distance + 3; i < j; i += 1)
+ {
+ p2 = p3; p3 = SA[i]; SAm[(SA[i] = p3 & SAINT_MAX) >> 1] = name | (p3 & p2 & SAINT_MIN); name += p3 < 0;
+ }
+
+ return name;
+}
+
+/* Sweeps the upper-half name table SA[m .. m + omp_block_size) left to
+   right, clearing the sign bit of an entry when the previous non-zero
+   value is non-negative (SA[i] & (prev | SAINT_MAX)); zero entries inherit
+   the previous value so the predecessor chain skips gaps.  p3 carries the
+   last non-zero value across iterations and into the scalar tail. */
+static void libsais16_mark_distinct_lms_suffixes_32s(sa_sint_t * RESTRICT SA, sa_sint_t m, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ fast_sint_t i, j; sa_sint_t p0, p1, p2, p3 = 0;
+ for (i = (fast_sint_t)m + omp_block_start, j = (fast_sint_t)m + omp_block_start + omp_block_size - 3; i < j; i += 4)
+ {
+ libsais16_prefetchw(&SA[i + prefetch_distance]);
+
+ p0 = SA[i + 0]; SA[i + 0] = p0 & (p3 | SAINT_MAX); p0 = (p0 == 0) ? p3 : p0;
+ p1 = SA[i + 1]; SA[i + 1] = p1 & (p0 | SAINT_MAX); p1 = (p1 == 0) ? p0 : p1;
+ p2 = SA[i + 2]; SA[i + 2] = p2 & (p1 | SAINT_MAX); p2 = (p2 == 0) ? p1 : p2;
+ p3 = SA[i + 3]; SA[i + 3] = p3 & (p2 | SAINT_MAX); p3 = (p3 == 0) ? p2 : p3;
+ }
+
+ /* Scalar tail continuing the chain through p2/p3. */
+ for (j += 3; i < j; i += 1)
+ {
+ p2 = p3; p3 = SA[i]; SA[i] = p3 & (p2 | SAINT_MAX); p3 = (p3 == 0) ? p2 : p3;
+ }
+}
+
+/* Clamps the upper-half table SA[m ..]: entries with the sign-bit mark
+   keep their value with the sign stripped; unmarked entries are zeroed
+   ((x < 0 ? x : 0) & SAINT_MAX). */
+static void libsais16_clamp_lms_suffixes_length_32s(sa_sint_t * RESTRICT SA, sa_sint_t m, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ sa_sint_t * RESTRICT SAm = &SA[m];
+
+ fast_sint_t i, j;
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - 3; i < j; i += 4)
+ {
+ libsais16_prefetchw(&SAm[i + prefetch_distance]);
+
+ SAm[i + 0] = (SAm[i + 0] < 0 ? SAm[i + 0] : 0) & SAINT_MAX;
+ SAm[i + 1] = (SAm[i + 1] < 0 ? SAm[i + 1] : 0) & SAINT_MAX;
+ SAm[i + 2] = (SAm[i + 2] < 0 ? SAm[i + 2] : 0) & SAINT_MAX;
+ SAm[i + 3] = (SAm[i + 3] < 0 ? SAm[i + 3] : 0) & SAINT_MAX;
+ }
+
+ /* Scalar tail. */
+ for (j += 3; i < j; i += 1)
+ {
+ SAm[i] = (SAm[i] < 0 ? SAm[i] : 0) & SAINT_MAX;
+ }
+}
+
+/* Parallel wrapper for the distinct renumbering pass.  Names start at 1
+   (the prefix-sum base is 1 in the multi-threaded branch and the
+   single-threaded call passes 1), so the function returns name - 1 as the
+   number of assigned names.  Structure mirrors
+   libsais16_renumber_lms_suffixes_16u_omp: count marked entries per
+   stripe, barrier, prefix-sum, renumber. */
+static sa_sint_t libsais16_renumber_distinct_lms_suffixes_32s_4k_omp(sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+ sa_sint_t name = 0;
+
+#if defined(_OPENMP)
+ #pragma omp parallel num_threads(threads) if(threads > 1 && m >= 65536)
+#endif
+ {
+#if defined(_OPENMP)
+ fast_sint_t omp_thread_num = omp_get_thread_num();
+ fast_sint_t omp_num_threads = omp_get_num_threads();
+#else
+ UNUSED(threads); UNUSED(thread_state);
+
+ fast_sint_t omp_thread_num = 0;
+ fast_sint_t omp_num_threads = 1;
+#endif
+ /* Per-thread stripe; stride rounded down to a multiple of 16. */
+ fast_sint_t omp_block_stride = (m / omp_num_threads) & (-16);
+ fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+ fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : m - omp_block_start;
+
+ if (omp_num_threads == 1)
+ {
+ name = libsais16_renumber_distinct_lms_suffixes_32s_4k(SA, m, 1, omp_block_start, omp_block_size);
+ }
+#if defined(_OPENMP)
+ else
+ {
+ {
+ /* Pass 1: count marked entries per stripe. */
+ thread_state[omp_thread_num].state.count = libsais16_count_negative_marked_suffixes(SA, omp_block_start, omp_block_size);
+ }
+
+ #pragma omp barrier
+
+ {
+ /* Pass 2: prefix sum (base 1) gives this thread's first name. */
+ fast_sint_t t, count = 1; for (t = 0; t < omp_thread_num; ++t) { count += thread_state[t].state.count; }
+
+ if (omp_thread_num == omp_num_threads - 1)
+ {
+ /* Last thread knows the grand total. */
+ name = (sa_sint_t)(count + thread_state[omp_thread_num].state.count);
+ }
+
+ libsais16_renumber_distinct_lms_suffixes_32s_4k(SA, m, (sa_sint_t)count, omp_block_start, omp_block_size);
+ }
+ }
+#endif
+ }
+
+ /* Names were 1-based; report the count of names assigned. */
+ return name - 1;
+}
+
+/* Parallel wrapper over the n >> 1 entries of the upper-half name table
+   for the distinct-marking sweep; plain stripe split, no cross-thread
+   combining is needed. */
+static void libsais16_mark_distinct_lms_suffixes_32s_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t threads)
+{
+#if defined(_OPENMP)
+ #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 131072)
+#endif
+ {
+#if defined(_OPENMP)
+ fast_sint_t omp_thread_num = omp_get_thread_num();
+ fast_sint_t omp_num_threads = omp_get_num_threads();
+ fast_sint_t omp_block_stride = (((fast_sint_t)n >> 1) / omp_num_threads) & (-16);
+ fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+ fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : ((fast_sint_t)n >> 1) - omp_block_start;
+#else
+ UNUSED(threads);
+
+ fast_sint_t omp_block_start = 0;
+ fast_sint_t omp_block_size = (fast_sint_t)n >> 1;
+#endif
+ libsais16_mark_distinct_lms_suffixes_32s(SA, m, omp_block_start, omp_block_size);
+ }
+}
+
+/* Parallel wrapper over the n >> 1 entries of the upper-half table for the
+   length-clamping sweep; same stripe split as above. */
+static void libsais16_clamp_lms_suffixes_length_32s_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t threads)
+{
+#if defined(_OPENMP)
+ #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 131072)
+#endif
+ {
+#if defined(_OPENMP)
+ fast_sint_t omp_thread_num = omp_get_thread_num();
+ fast_sint_t omp_num_threads = omp_get_num_threads();
+ fast_sint_t omp_block_stride = (((fast_sint_t)n >> 1) / omp_num_threads) & (-16);
+ fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+ fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : ((fast_sint_t)n >> 1) - omp_block_start;
+#else
+ UNUSED(threads);
+
+ fast_sint_t omp_block_start = 0;
+ fast_sint_t omp_block_size = (fast_sint_t)n >> 1;
+#endif
+ libsais16_clamp_lms_suffixes_length_32s(SA, m, omp_block_start, omp_block_size);
+ }
+}
+
+/* Zeroes the upper-half name table, renumbers the m LMS suffixes with
+   distinctness tracking, and — if not every name is distinct — runs the
+   marking sweep.  Returns the number of distinct names. */
+static sa_sint_t libsais16_renumber_and_mark_distinct_lms_suffixes_32s_4k_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+ memset(&SA[m], 0, ((size_t)n >> 1) * sizeof(sa_sint_t));
+
+ sa_sint_t name = libsais16_renumber_distinct_lms_suffixes_32s_4k_omp(SA, m, threads, thread_state);
+ if (name < m)
+ {
+ libsais16_mark_distinct_lms_suffixes_32s_omp(SA, n, m, threads);
+ }
+
+ return name;
+}
+
+/* 1k variant of LMS renumbering: names are assigned by explicitly
+   comparing consecutive LMS substrings of T.  Stage 1 gathers the LMS
+   positions and records each substring's length (difference of adjacent
+   positions, offset by SAINT_MIN as a marker) in the upper-half table SAm;
+   the final suffix gets length 1.  Stage 2 clamps the table so only the
+   marked lengths survive.  Stage 3 walks the m sorted positions, compares
+   each substring with its predecessor (first by length, then bytewise),
+   and writes names; a substring equal to its predecessor gets the
+   predecessor's name with a cleared sign bit (diff == 0), otherwise a new
+   name with the sign-bit mark.  Stage 4 propagates the marks when not all
+   names are distinct.  Returns the number of distinct names (name - 1,
+   since naming starts at 1). */
+static sa_sint_t libsais16_renumber_and_mark_distinct_lms_suffixes_32s_1k_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t threads)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ sa_sint_t * RESTRICT SAm = &SA[m];
+
+ {
+ /* Stage 1: gather LMS positions into SA[n - m ..], zero the middle,
+    then record each LMS substring's (marked) length keyed by position. */
+ libsais16_gather_lms_suffixes_32s(T, SA, n);
+
+ memset(&SA[m], 0, ((size_t)n - (size_t)m - (size_t)m) * sizeof(sa_sint_t));
+
+ fast_sint_t i, j;
+ for (i = (fast_sint_t)n - (fast_sint_t)m, j = (fast_sint_t)n - 1 - prefetch_distance - 3; i < j; i += 4)
+ {
+ libsais16_prefetch(&SA[i + 2 * prefetch_distance]);
+
+ libsais16_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 0]) >> 1]);
+ libsais16_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 1]) >> 1]);
+ libsais16_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 2]) >> 1]);
+ libsais16_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 3]) >> 1]);
+
+ SAm[((sa_uint_t)SA[i + 0]) >> 1] = SA[i + 1] - SA[i + 0] + 1 + SAINT_MIN;
+ SAm[((sa_uint_t)SA[i + 1]) >> 1] = SA[i + 2] - SA[i + 1] + 1 + SAINT_MIN;
+ SAm[((sa_uint_t)SA[i + 2]) >> 1] = SA[i + 3] - SA[i + 2] + 1 + SAINT_MIN;
+ SAm[((sa_uint_t)SA[i + 3]) >> 1] = SA[i + 4] - SA[i + 3] + 1 + SAINT_MIN;
+ }
+
+ for (j += prefetch_distance + 3; i < j; i += 1)
+ {
+ SAm[((sa_uint_t)SA[i]) >> 1] = SA[i + 1] - SA[i] + 1 + SAINT_MIN;
+ }
+
+ /* The last LMS suffix has no successor; its length entry is 1. */
+ SAm[((sa_uint_t)SA[n - 1]) >> 1] = 1 + SAINT_MIN;
+ }
+
+ {
+ /* Stage 2: keep only the marked length entries (others zeroed). */
+ libsais16_clamp_lms_suffixes_length_32s_omp(SA, n, m, threads);
+ }
+
+ sa_sint_t name = 1;
+
+ {
+ /* Stage 3: compare adjacent LMS substrings and assign names.  The
+    loop handles two entries per iteration; p/plen/pdiff describe the
+    previous substring, q/qlen/qdiff the current one.  The sign bit of
+    *diff records "distinct from predecessor". */
+ fast_sint_t i, j, p = SA[0], plen = SAm[p >> 1]; sa_sint_t pdiff = SAINT_MIN;
+ for (i = 1, j = m - prefetch_distance - 1; i < j; i += 2)
+ {
+ libsais16_prefetch(&SA[i + 2 * prefetch_distance]);
+
+ libsais16_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 0]) >> 1]); libsais16_prefetch(&T[((sa_uint_t)SA[i + prefetch_distance + 0])]);
+ libsais16_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 1]) >> 1]); libsais16_prefetch(&T[((sa_uint_t)SA[i + prefetch_distance + 1])]);
+
+ fast_sint_t q = SA[i + 0], qlen = SAm[q >> 1]; sa_sint_t qdiff = SAINT_MIN;
+ if (plen == qlen) { fast_sint_t l = 0; do { if (T[p + l] != T[q + l]) { break; } } while (++l < qlen); qdiff = (sa_sint_t)(l - qlen) & SAINT_MIN; }
+ SAm[p >> 1] = name | (pdiff & qdiff); name += (qdiff < 0);
+
+ p = SA[i + 1]; plen = SAm[p >> 1]; pdiff = SAINT_MIN;
+ if (qlen == plen) { fast_sint_t l = 0; do { if (T[q + l] != T[p + l]) { break; } } while (++l < plen); pdiff = (sa_sint_t)(l - plen) & SAINT_MIN; }
+ SAm[q >> 1] = name | (qdiff & pdiff); name += (pdiff < 0);
+ }
+
+ /* Scalar tail: one entry per iteration. */
+ for (j += prefetch_distance + 1; i < j; i += 1)
+ {
+ fast_sint_t q = SA[i], qlen = SAm[q >> 1]; sa_sint_t qdiff = SAINT_MIN;
+ if (plen == qlen) { fast_sint_t l = 0; do { if (T[p + l] != T[q + l]) { break; } } while (++l < plen); qdiff = (sa_sint_t)(l - plen) & SAINT_MIN; }
+ SAm[p >> 1] = name | (pdiff & qdiff); name += (qdiff < 0);
+
+ p = q; plen = qlen; pdiff = qdiff;
+ }
+
+ /* Name the final pending substring. */
+ SAm[p >> 1] = name | pdiff; name++;
+ }
+
+ if (name <= m)
+ {
+ /* Stage 4: not all names distinct — propagate the marks. */
+ libsais16_mark_distinct_lms_suffixes_32s_omp(SA, n, m, threads);
+ }
+
+ return name - 1;
+}
+
+/* Maps each entry of SA[omp_block_start .. +omp_block_size) through the
+   translation table stored at SA[n - m ..]: SA[i] = SAnm[SA[i]].  Unrolled
+   4x with prefetch of both the sequential and the gathered stream. */
+static void libsais16_reconstruct_lms_suffixes(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ const sa_sint_t * RESTRICT SAnm = &SA[n - m];
+
+ fast_sint_t i, j;
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4)
+ {
+ libsais16_prefetchw(&SA[i + 2 * prefetch_distance]);
+
+ libsais16_prefetch(&SAnm[SA[i + prefetch_distance + 0]]);
+ libsais16_prefetch(&SAnm[SA[i + prefetch_distance + 1]]);
+ libsais16_prefetch(&SAnm[SA[i + prefetch_distance + 2]]);
+ libsais16_prefetch(&SAnm[SA[i + prefetch_distance + 3]]);
+
+ SA[i + 0] = SAnm[SA[i + 0]];
+ SA[i + 1] = SAnm[SA[i + 1]];
+ SA[i + 2] = SAnm[SA[i + 2]];
+ SA[i + 3] = SAnm[SA[i + 3]];
+ }
+
+ /* Scalar tail. */
+ for (j += prefetch_distance + 3; i < j; i += 1)
+ {
+ SA[i] = SAnm[SA[i]];
+ }
+}
+
+/* OpenMP wrapper for libsais16_reconstruct_lms_suffixes: splits the first m entries of SA
+   into per-thread chunks (stride rounded down to a multiple of 16; the last thread takes
+   the remainder). The parallel region only engages when threads > 1 and m >= 65536;
+   without OpenMP a single call covers the whole range. */
+static void libsais16_reconstruct_lms_suffixes_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t threads)
+{
+#if defined(_OPENMP)
+    #pragma omp parallel num_threads(threads) if(threads > 1 && m >= 65536)
+#endif
+    {
+#if defined(_OPENMP)
+        fast_sint_t omp_thread_num = omp_get_thread_num();
+        fast_sint_t omp_num_threads = omp_get_num_threads();
+        fast_sint_t omp_block_stride = (m / omp_num_threads) & (-16);
+        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : m - omp_block_start;
+#else
+        UNUSED(threads);
+
+        fast_sint_t omp_block_start = 0;
+        fast_sint_t omp_block_size = m;
+#endif
+
+        libsais16_reconstruct_lms_suffixes(SA, n, m, omp_block_start, omp_block_size);
+    }
+}
+
+/* Scatters the m LMS suffixes packed at SA[0..m) to the tails of their first-symbol
+   buckets (16-bit alphabet; bucket end offsets live at &buckets[7 * ALPHABET_SIZE]).
+   Iterating symbols from high to low lets each memmove read the still-packed prefix
+   before it is overwritten. l — the size of symbol c's interval — is derived as the
+   difference of consecutive (c, 1) counters; gaps between placed intervals, and the
+   final SA[0..j) prefix, are zero-filled. */
+static void libsais16_place_lms_suffixes_interval_16u(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, const sa_sint_t * RESTRICT buckets)
+{
+    const sa_sint_t * RESTRICT bucket_end = &buckets[7 * ALPHABET_SIZE];
+
+    fast_sint_t c, j = n;
+    for (c = ALPHABET_SIZE - 2; c >= 0; --c)
+    {
+        fast_sint_t l = (fast_sint_t)buckets[BUCKETS_INDEX2(c, 1) + BUCKETS_INDEX2(1, 0)] - (fast_sint_t)buckets[BUCKETS_INDEX2(c, 1)];
+        if (l > 0)
+        {
+            fast_sint_t i = bucket_end[c];
+            if (j - i > 0)
+            {
+                memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t));
+            }
+
+            memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t));
+        }
+    }
+
+    memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t));
+}
+
+/* Same placement scheme as libsais16_place_lms_suffixes_interval_16u, but for a
+   k-symbol 32-bit alphabet with the 4k bucket layout: bucket end offsets live at
+   &buckets[3 * k]. Interval sizes come from differences of consecutive (c, 1)
+   counters; unplaced gaps and the SA[0..j) prefix are zero-filled. */
+static void libsais16_place_lms_suffixes_interval_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m, const sa_sint_t * RESTRICT buckets)
+{
+    const sa_sint_t * RESTRICT bucket_end = &buckets[3 * k];
+
+    fast_sint_t c, j = n;
+    for (c = (fast_sint_t)k - 2; c >= 0; --c)
+    {
+        fast_sint_t l = (fast_sint_t)buckets[BUCKETS_INDEX2(c, 1) + BUCKETS_INDEX2(1, 0)] - (fast_sint_t)buckets[BUCKETS_INDEX2(c, 1)];
+        if (l > 0)
+        {
+            fast_sint_t i = bucket_end[c];
+            if (j - i > 0)
+            {
+                memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t));
+            }
+
+            memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t));
+        }
+    }
+
+    memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t));
+}
+
+/* LMS placement for the compact 2k bucket layout: the destination offset is read
+   straight from buckets[c] and the interval size from the difference of the
+   adjacent (·, 1) counters. Walks symbols high to low like the other variants.
+   When k <= 1 there is nothing to place and the whole SA[0..n) is zero-filled. */
+static void libsais16_place_lms_suffixes_interval_32s_2k(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m, const sa_sint_t * RESTRICT buckets)
+{
+    fast_sint_t j = n;
+
+    if (k > 1)
+    {
+        fast_sint_t c;
+        for (c = BUCKETS_INDEX2((fast_sint_t)k - 2, 0); c >= BUCKETS_INDEX2(0, 0); c -= BUCKETS_INDEX2(1, 0))
+        {
+            fast_sint_t l = (fast_sint_t)buckets[c + BUCKETS_INDEX2(1, 1)] - (fast_sint_t)buckets[c + BUCKETS_INDEX2(0, 1)];
+            if (l > 0)
+            {
+                fast_sint_t i = buckets[c];
+                if (j - i > 0)
+                {
+                    memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t));
+                }
+
+                memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t));
+            }
+        }
+    }
+
+    memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t));
+}
+
+/* LMS placement with only one counter per symbol (1k layout): the first symbol of
+   each suffix is looked up in the text itself (T[p]). SA[0..m) is consumed right to
+   left; each entry is written at the decremented end cursor l of its symbol's bucket.
+   When the symbol changes, the region between the new bucket's end offset and the
+   previous write cursor is zero-filled before switching; the final memset clears
+   everything below the last write. 4x unrolled with prefetch of SA and T. */
+static void libsais16_place_lms_suffixes_interval_32s_1k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t m, sa_sint_t * RESTRICT buckets)
+{
+    const fast_sint_t prefetch_distance = 32;
+
+    sa_sint_t c = k - 1; fast_sint_t i, l = buckets[c];
+    for (i = (fast_sint_t)m - 1; i >= prefetch_distance + 3; i -= 4)
+    {
+        libsais16_prefetch(&SA[i - 2 * prefetch_distance]);
+
+        libsais16_prefetch(&T[SA[i - prefetch_distance - 0]]);
+        libsais16_prefetch(&T[SA[i - prefetch_distance - 1]]);
+        libsais16_prefetch(&T[SA[i - prefetch_distance - 2]]);
+        libsais16_prefetch(&T[SA[i - prefetch_distance - 3]]);
+
+        sa_sint_t p0 = SA[i - 0]; if (T[p0] != c) { c = T[p0]; memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(sa_sint_t)); l = buckets[c]; } SA[--l] = p0;
+        sa_sint_t p1 = SA[i - 1]; if (T[p1] != c) { c = T[p1]; memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(sa_sint_t)); l = buckets[c]; } SA[--l] = p1;
+        sa_sint_t p2 = SA[i - 2]; if (T[p2] != c) { c = T[p2]; memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(sa_sint_t)); l = buckets[c]; } SA[--l] = p2;
+        sa_sint_t p3 = SA[i - 3]; if (T[p3] != c) { c = T[p3]; memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(sa_sint_t)); l = buckets[c]; } SA[--l] = p3;
+    }
+
+    for (; i >= 0; i -= 1)
+    {
+        sa_sint_t p = SA[i]; if (T[p] != c) { c = T[p]; memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(sa_sint_t)); l = buckets[c]; } SA[--l] = p;
+    }
+
+    memset(&SA[0], 0, (size_t)l * sizeof(sa_sint_t));
+}
+
+/* LMS placement for the 6k bucket layout: per-symbol LMS counts are read directly
+   from the histogram slot buckets[BUCKETS_INDEX4(c, 1)] (no offset differencing),
+   and bucket end offsets live at &buckets[5 * k]. Otherwise identical to the
+   interval variants: high-to-low symbol walk, gap zero-fill, final prefix memset. */
+static void libsais16_place_lms_suffixes_histogram_32s_6k(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m, const sa_sint_t * RESTRICT buckets)
+{
+    const sa_sint_t * RESTRICT bucket_end = &buckets[5 * k];
+
+    fast_sint_t c, j = n;
+    for (c = (fast_sint_t)k - 2; c >= 0; --c)
+    {
+        fast_sint_t l = (fast_sint_t)buckets[BUCKETS_INDEX4(c, 1)];
+        if (l > 0)
+        {
+            fast_sint_t i = bucket_end[c];
+            if (j - i > 0)
+            {
+                memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t));
+            }
+
+            memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t));
+        }
+    }
+
+    memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t));
+}
+
+/* LMS placement for the 4k layout with histogram counts: the count for symbol c is
+   read from buckets[BUCKETS_INDEX2(c, 1)] and the bucket ends from &buckets[3 * k].
+   Same high-to-low walk with gap zero-fill as the other placement routines. */
+static void libsais16_place_lms_suffixes_histogram_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m, const sa_sint_t * RESTRICT buckets)
+{
+    const sa_sint_t * RESTRICT bucket_end = &buckets[3 * k];
+
+    fast_sint_t c, j = n;
+    for (c = (fast_sint_t)k - 2; c >= 0; --c)
+    {
+        fast_sint_t l = (fast_sint_t)buckets[BUCKETS_INDEX2(c, 1)];
+        if (l > 0)
+        {
+            fast_sint_t i = bucket_end[c];
+            if (j - i > 0)
+            {
+                memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t));
+            }
+
+            memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t));
+        }
+    }
+
+    memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t));
+}
+
+/* LMS placement for the 2k layout with histogram counts: destination offsets come
+   from buckets[c], counts from the adjacent (0, 1) slot. No-op apart from zeroing
+   SA[0..n) when k <= 1. */
+static void libsais16_place_lms_suffixes_histogram_32s_2k(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m, const sa_sint_t * RESTRICT buckets)
+{
+    fast_sint_t j = n;
+
+    if (k > 1)
+    {
+        fast_sint_t c;
+        for (c = BUCKETS_INDEX2((fast_sint_t)k - 2, 0); c >= BUCKETS_INDEX2(0, 0); c -= BUCKETS_INDEX2(1, 0))
+        {
+            fast_sint_t l = (fast_sint_t)buckets[c + BUCKETS_INDEX2(0, 1)];
+            if (l > 0)
+            {
+                fast_sint_t i = buckets[c];
+                if (j - i > 0)
+                {
+                    memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t));
+                }
+
+                memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t));
+            }
+        }
+    }
+
+    memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t));
+}
+
+/* Left-to-right induction step of the final pass when producing the BWT (16-bit text).
+   For each entry: the flag bit is first cleared in place (p & SAINT_MAX); if p > 0 the
+   predecessor index p-1 is appended at the head of bucket T[p-1] with the comparison
+   T[p-2] < T[p-1] packed into its sign bit (the read is clamped to T[0] via p - (p > 0)),
+   and SA[i] is then overwritten with T[p-1] | SAINT_MIN — i.e. the slot now holds a
+   BWT symbol tagged by the sign bit rather than a suffix index. Entries with p <= 0
+   are left as-is after the mask. 2x unrolled; the speculative T prefetch pointers
+   fall back to NULL when p <= 0 to avoid touching T[-1]. */
+static void libsais16_final_bwt_scan_left_to_right_16u(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+    const fast_sint_t prefetch_distance = 32;
+
+    fast_sint_t i, j;
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2)
+    {
+        libsais16_prefetchw(&SA[i + 2 * prefetch_distance]);
+
+        sa_sint_t s0 = SA[i + prefetch_distance + 0]; const uint16_t * Ts0 = &T[s0] - 1; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16_prefetch(s0 > 0 ? Ts0 : NULL);
+        sa_sint_t s1 = SA[i + prefetch_distance + 1]; const uint16_t * Ts1 = &T[s1] - 1; libsais16_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16_prefetch(s1 > 0 ? Ts1 : NULL);
+
+        sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; SA[i + 0] = T[p0] | SAINT_MIN; SA[induction_bucket[T[p0]]++] = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1)); }
+        sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; SA[i + 1] = T[p1] | SAINT_MIN; SA[induction_bucket[T[p1]]++] = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1)); }
+    }
+
+    for (j += prefetch_distance + 1; i < j; i += 1)
+    {
+        sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; SA[i] = T[p] | SAINT_MIN; SA[induction_bucket[T[p]]++] = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); }
+    }
+}
+
+/* Same left-to-right BWT induction as libsais16_final_bwt_scan_left_to_right_16u, with
+   auxiliary-index sampling: whenever the induced position p is a multiple of rm + 1
+   (tested as (p & rm) == 0 — rm is presumably a power-of-two rate minus one; TODO
+   confirm against callers), the post-increment bucket offset is recorded in
+   I[p / (rm + 1)]. */
+static void libsais16_final_bwt_aux_scan_left_to_right_16u(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+    const fast_sint_t prefetch_distance = 32;
+
+    fast_sint_t i, j;
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2)
+    {
+        libsais16_prefetchw(&SA[i + 2 * prefetch_distance]);
+
+        sa_sint_t s0 = SA[i + prefetch_distance + 0]; const uint16_t * Ts0 = &T[s0] - 1; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16_prefetch(s0 > 0 ? Ts0 : NULL);
+        sa_sint_t s1 = SA[i + prefetch_distance + 1]; const uint16_t * Ts1 = &T[s1] - 1; libsais16_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16_prefetch(s1 > 0 ? Ts1 : NULL);
+
+        sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; SA[i + 0] = T[p0] | SAINT_MIN; SA[induction_bucket[T[p0]]++] = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1)); if ((p0 & rm) == 0) { I[p0 / (rm + 1)] = induction_bucket[T[p0]]; }}
+        sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; SA[i + 1] = T[p1] | SAINT_MIN; SA[induction_bucket[T[p1]]++] = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1)); if ((p1 & rm) == 0) { I[p1 / (rm + 1)] = induction_bucket[T[p1]]; }}
+    }
+
+    for (j += prefetch_distance + 1; i < j; i += 1)
+    {
+        sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; SA[i] = T[p] | SAINT_MIN; SA[induction_bucket[T[p]]++] = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); if ((p & rm) == 0) { I[p / (rm + 1)] = induction_bucket[T[p]]; } }
+    }
+}
+
+/* Left-to-right induction step of the final pass when producing a plain suffix array
+   (no BWT output). Each slot is marked processed by flipping the sign bit in place
+   (p ^ SAINT_MIN); when p > 0 the predecessor p-1 is induced into bucket T[p-1] with
+   T[p-2] < T[p-1] packed into its sign bit (read clamped to T[0] via p - (p > 0)).
+   2x unrolled with NULL-guarded speculative prefetch. */
+static void libsais16_final_sorting_scan_left_to_right_16u(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+    const fast_sint_t prefetch_distance = 32;
+
+    fast_sint_t i, j;
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2)
+    {
+        libsais16_prefetchw(&SA[i + 2 * prefetch_distance]);
+
+        sa_sint_t s0 = SA[i + prefetch_distance + 0]; const uint16_t * Ts0 = &T[s0] - 1; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16_prefetch(s0 > 0 ? Ts0 : NULL);
+        sa_sint_t s1 = SA[i + prefetch_distance + 1]; const uint16_t * Ts1 = &T[s1] - 1; libsais16_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16_prefetch(s1 > 0 ? Ts1 : NULL);
+
+        sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 ^ SAINT_MIN; if (p0 > 0) { p0--; SA[induction_bucket[T[p0]]++] = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1)); }
+        sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 ^ SAINT_MIN; if (p1 > 0) { p1--; SA[induction_bucket[T[p1]]++] = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1)); }
+    }
+
+    for (j += prefetch_distance + 1; i < j; i += 1)
+    {
+        sa_sint_t p = SA[i]; SA[i] = p ^ SAINT_MIN; if (p > 0) { p--; SA[induction_bucket[T[p]]++] = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); }
+    }
+}
+
+/* 32-bit-symbol counterpart of libsais16_final_sorting_scan_left_to_right_16u.
+   The induction logic is identical; the prefetch pipeline is two-staged (text at
+   2 * prefetch_distance ahead, then the dependent induction_bucket entry and a
+   second text word at 1 * prefetch_distance) because the bucket address depends
+   on a value loaded from T. */
+static void libsais16_final_sorting_scan_left_to_right_32s(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+    const fast_sint_t prefetch_distance = 32;
+
+    fast_sint_t i, j;
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1; i < j; i += 2)
+    {
+        libsais16_prefetchw(&SA[i + 3 * prefetch_distance]);
+
+        sa_sint_t s0 = SA[i + 2 * prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0] - 1; libsais16_prefetch(s0 > 0 ? Ts0 : NULL);
+        sa_sint_t s1 = SA[i + 2 * prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1] - 1; libsais16_prefetch(s1 > 0 ? Ts1 : NULL);
+        sa_sint_t s2 = SA[i + 1 * prefetch_distance + 0]; if (s2 > 0) { libsais16_prefetchw(&induction_bucket[T[s2 - 1]]); libsais16_prefetch(&T[s2] - 2); }
+        sa_sint_t s3 = SA[i + 1 * prefetch_distance + 1]; if (s3 > 0) { libsais16_prefetchw(&induction_bucket[T[s3 - 1]]); libsais16_prefetch(&T[s3] - 2); }
+
+        sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 ^ SAINT_MIN; if (p0 > 0) { p0--; SA[induction_bucket[T[p0]]++] = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1)); }
+        sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 ^ SAINT_MIN; if (p1 > 0) { p1--; SA[induction_bucket[T[p1]]++] = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1)); }
+    }
+
+    for (j += 2 * prefetch_distance + 1; i < j; i += 1)
+    {
+        sa_sint_t p = SA[i]; SA[i] = p ^ SAINT_MIN; if (p > 0) { p--; SA[induction_bucket[T[p]]++] = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); }
+    }
+}
+
+#if defined(_OPENMP)
+
+/* Per-thread "prepare" half of the parallel BWT left-to-right scan: performs the same
+   in-place SA updates as the serial routine (mask, then store T[p-1] | SAINT_MIN), but
+   instead of writing induced entries into SA it tallies symbol frequencies into the
+   thread-local `buckets` (cleared on entry) and appends (symbol, flagged index) records
+   to the thread's `cache`. Returns the number of cached records; the caller later merges
+   bucket counts across threads and replays the cache via the *_block_place routine. */
+static fast_sint_t libsais16_final_bwt_scan_left_to_right_16u_block_prepare(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+    const fast_sint_t prefetch_distance = 32;
+
+    memset(buckets, 0, ALPHABET_SIZE * sizeof(sa_sint_t));
+
+    fast_sint_t i, j, count = 0;
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2)
+    {
+        libsais16_prefetchw(&SA[i + 2 * prefetch_distance]);
+
+        sa_sint_t s0 = SA[i + prefetch_distance + 0]; const uint16_t * Ts0 = &T[s0] - 1; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16_prefetch(s0 > 0 ? Ts0 : NULL);
+        sa_sint_t s1 = SA[i + prefetch_distance + 1]; const uint16_t * Ts1 = &T[s1] - 1; libsais16_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16_prefetch(s1 > 0 ? Ts1 : NULL);
+
+        sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; SA[i + 0] = T[p0] | SAINT_MIN; buckets[cache[count].symbol = T[p0]]++; cache[count++].index = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1)); }
+        sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; SA[i + 1] = T[p1] | SAINT_MIN; buckets[cache[count].symbol = T[p1]]++; cache[count++].index = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1)); }
+    }
+
+    for (j += prefetch_distance + 1; i < j; i += 1)
+    {
+        sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; SA[i] = T[p] | SAINT_MIN; buckets[cache[count].symbol = T[p]]++; cache[count++].index = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); }
+    }
+
+    return count;
+}
+
+/* Per-thread "prepare" half of the parallel sorting left-to-right scan: like the BWT
+   variant above but without the BWT symbol store — each slot is only marked processed
+   (p ^ SAINT_MIN). Induced entries are recorded as (symbol, flagged index) pairs in
+   the thread cache with frequencies tallied into the cleared thread-local `buckets`.
+   Returns the record count for the later *_block_place replay. */
+static fast_sint_t libsais16_final_sorting_scan_left_to_right_16u_block_prepare(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+    const fast_sint_t prefetch_distance = 32;
+
+    memset(buckets, 0, ALPHABET_SIZE * sizeof(sa_sint_t));
+
+    fast_sint_t i, j, count = 0;
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2)
+    {
+        libsais16_prefetchw(&SA[i + 2 * prefetch_distance]);
+
+        sa_sint_t s0 = SA[i + prefetch_distance + 0]; const uint16_t * Ts0 = &T[s0] - 1; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16_prefetch(s0 > 0 ? Ts0 : NULL);
+        sa_sint_t s1 = SA[i + prefetch_distance + 1]; const uint16_t * Ts1 = &T[s1] - 1; libsais16_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16_prefetch(s1 > 0 ? Ts1 : NULL);
+
+        sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 ^ SAINT_MIN; if (p0 > 0) { p0--; buckets[cache[count].symbol = T[p0]]++; cache[count++].index = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1)); }
+        sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 ^ SAINT_MIN; if (p1 > 0) { p1--; buckets[cache[count].symbol = T[p1]]++; cache[count++].index = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1)); }
+    }
+
+    for (j += prefetch_distance + 1; i < j; i += 1)
+    {
+        sa_sint_t p = SA[i]; SA[i] = p ^ SAINT_MIN; if (p > 0) { p--; buckets[cache[count].symbol = T[p]]++; cache[count++].index = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); }
+    }
+
+    return count;
+}
+
+/* Per-thread "place" half of the parallel left-to-right scan: replays `count` cached
+   (symbol, index) records into SA, writing each at buckets[symbol]++ — the caller has
+   already seeded `buckets` with this thread's starting offsets. 4x unrolled with cache
+   prefetch; the second loop drains the remainder. */
+static void libsais16_final_order_scan_left_to_right_16u_block_place(sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count)
+{
+    const fast_sint_t prefetch_distance = 32;
+
+    fast_sint_t i, j;
+    for (i = 0, j = count - 3; i < j; i += 4)
+    {
+        libsais16_prefetch(&cache[i + prefetch_distance]);
+
+        SA[buckets[cache[i + 0].symbol]++] = cache[i + 0].index;
+        SA[buckets[cache[i + 1].symbol]++] = cache[i + 1].index;
+        SA[buckets[cache[i + 2].symbol]++] = cache[i + 2].index;
+        SA[buckets[cache[i + 3].symbol]++] = cache[i + 3].index;
+    }
+
+    for (j += 3; i < j; i += 1)
+    {
+        SA[buckets[cache[i].symbol]++] = cache[i].index;
+    }
+}
+
+/* Variant of the block-place replay used for BWT with auxiliary indexes: in addition
+   to writing each cached record at buckets[symbol]++, it samples the auxiliary index
+   whenever the record's position (index with the type flag stripped via & SAINT_MAX)
+   is a multiple of rm + 1, storing the post-increment bucket offset in I. */
+static void libsais16_final_bwt_aux_scan_left_to_right_16u_block_place(sa_sint_t * RESTRICT SA, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count)
+{
+    const fast_sint_t prefetch_distance = 32;
+
+    fast_sint_t i, j;
+    for (i = 0, j = count - 3; i < j; i += 4)
+    {
+        libsais16_prefetch(&cache[i + prefetch_distance]);
+
+        SA[buckets[cache[i + 0].symbol]++] = cache[i + 0].index; if ((cache[i + 0].index & rm) == 0) { I[(cache[i + 0].index & SAINT_MAX) / (rm + 1)] = buckets[cache[i + 0].symbol]; }
+        SA[buckets[cache[i + 1].symbol]++] = cache[i + 1].index; if ((cache[i + 1].index & rm) == 0) { I[(cache[i + 1].index & SAINT_MAX) / (rm + 1)] = buckets[cache[i + 1].symbol]; }
+        SA[buckets[cache[i + 2].symbol]++] = cache[i + 2].index; if ((cache[i + 2].index & rm) == 0) { I[(cache[i + 2].index & SAINT_MAX) / (rm + 1)] = buckets[cache[i + 2].symbol]; }
+        SA[buckets[cache[i + 3].symbol]++] = cache[i + 3].index; if ((cache[i + 3].index & rm) == 0) { I[(cache[i + 3].index & SAINT_MAX) / (rm + 1)] = buckets[cache[i + 3].symbol]; }
+    }
+
+    for (j += 3; i < j; i += 1)
+    {
+        SA[buckets[cache[i].symbol]++] = cache[i].index; if ((cache[i].index & rm) == 0) { I[(cache[i].index & SAINT_MAX) / (rm + 1)] = buckets[cache[i].symbol]; }
+    }
+}
+
+/* Per-thread "gather" phase of the parallel 32-bit left-to-right scan: marks each SA
+   slot processed (p ^ SAINT_MIN) and records the would-be induced entry in cache[i]
+   (aligned 1:1 with the SA index, unlike the 16u prepare routines which compact).
+   Skipped entries (p <= 0) get symbol = SAINT_MIN so the subsequent sort phase can
+   filter them with a >= 0 test. */
+static void libsais16_final_sorting_scan_left_to_right_32s_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+    const fast_sint_t prefetch_distance = 32;
+
+    fast_sint_t i, j;
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2)
+    {
+        libsais16_prefetchw(&SA[i + 2 * prefetch_distance]);
+
+        sa_sint_t s0 = SA[i + prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0] - 1; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16_prefetch(s0 > 0 ? Ts0 : NULL);
+        sa_sint_t s1 = SA[i + prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1] - 1; libsais16_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16_prefetch(s1 > 0 ? Ts1 : NULL);
+
+        libsais16_prefetchw(&cache[i + prefetch_distance]);
+
+        sa_sint_t symbol0 = SAINT_MIN, p0 = SA[i + 0]; SA[i + 0] = p0 ^ SAINT_MIN; if (p0 > 0) { p0--; cache[i + 0].index = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1)); symbol0 = T[p0]; } cache[i + 0].symbol = symbol0;
+        sa_sint_t symbol1 = SAINT_MIN, p1 = SA[i + 1]; SA[i + 1] = p1 ^ SAINT_MIN; if (p1 > 0) { p1--; cache[i + 1].index = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1)); symbol1 = T[p1]; } cache[i + 1].symbol = symbol1;
+    }
+
+    for (j += prefetch_distance + 1; i < j; i += 1)
+    {
+        sa_sint_t symbol = SAINT_MIN, p = SA[i]; SA[i] = p ^ SAINT_MIN; if (p > 0) { p--; cache[i].index = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); symbol = T[p]; } cache[i].symbol = symbol;
+    }
+}
+
+/* Serial "sort" phase of the parallel 32-bit left-to-right scan (run by the master
+   thread over the whole block). For each gathered record with symbol >= 0, the symbol
+   field is replaced by the entry's destination position induction_bucket[v]++. If that
+   destination falls inside the current block (< omp_block_end), the induced entry would
+   itself be scanned by this very pass — so it is expanded immediately in-cache: the
+   record at the destination slot gets the next induced (index, symbol) pair, and the
+   source record's index is flag-flipped (np ^ SAINT_MIN) exactly as the serial scan
+   would have left it in SA. This chaining preserves serial-scan semantics. */
+static void libsais16_final_sorting_scan_left_to_right_32s_block_sort(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+    const fast_sint_t prefetch_distance = 32;
+
+    fast_sint_t i, j, omp_block_end = omp_block_start + omp_block_size;
+    for (i = omp_block_start, j = omp_block_end - prefetch_distance - 1; i < j; i += 2)
+    {
+        libsais16_prefetchw(&cache[i + 2 * prefetch_distance]);
+
+        sa_sint_t s0 = cache[i + prefetch_distance + 0].symbol; const sa_sint_t * Is0 = &induction_bucket[s0]; libsais16_prefetchw(s0 >= 0 ? Is0 : NULL);
+        sa_sint_t s1 = cache[i + prefetch_distance + 1].symbol; const sa_sint_t * Is1 = &induction_bucket[s1]; libsais16_prefetchw(s1 >= 0 ? Is1 : NULL);
+
+        sa_sint_t v0 = cache[i + 0].symbol;
+        if (v0 >= 0)
+        {
+            cache[i + 0].symbol = induction_bucket[v0]++;
+            if (cache[i + 0].symbol < omp_block_end) { sa_sint_t ni = cache[i + 0].symbol, np = cache[i + 0].index; cache[i + 0].index = np ^ SAINT_MIN; if (np > 0) { np--; cache[ni].index = np | ((sa_sint_t)(T[np - (np > 0)] < T[np]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np]; } }
+        }
+
+        sa_sint_t v1 = cache[i + 1].symbol;
+        if (v1 >= 0)
+        {
+            cache[i + 1].symbol = induction_bucket[v1]++;
+            if (cache[i + 1].symbol < omp_block_end) { sa_sint_t ni = cache[i + 1].symbol, np = cache[i + 1].index; cache[i + 1].index = np ^ SAINT_MIN; if (np > 0) { np--; cache[ni].index = np | ((sa_sint_t)(T[np - (np > 0)] < T[np]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np]; } }
+        }
+    }
+
+    for (j += prefetch_distance + 1; i < j; i += 1)
+    {
+        sa_sint_t v = cache[i].symbol;
+        if (v >= 0)
+        {
+            cache[i].symbol = induction_bucket[v]++;
+            if (cache[i].symbol < omp_block_end) { sa_sint_t ni = cache[i].symbol, np = cache[i].index; cache[i].index = np ^ SAINT_MIN; if (np > 0) { np--; cache[ni].index = np | ((sa_sint_t)(T[np - (np > 0)] < T[np]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np]; } }
+        }
+    }
+}
+
+/* Parallel driver for one contiguous block of the final BWT left-to-right scan.
+   The block is split across threads (stride rounded down to a multiple of 16; last
+   thread takes the remainder). With one thread it falls through to the serial scan.
+   Otherwise: (1) each thread runs *_block_prepare into its own cache/buckets;
+   (2) after a barrier, the master converts per-thread counts into starting offsets —
+   for each symbol it advances the shared induction_bucket by each thread's count
+   while handing that thread the pre-advance offset (an exclusive prefix sum across
+   threads); (3) after a second barrier, every thread replays its cache with
+   *_block_place. Parallelism engages only for blocks >= 64 * ALPHABET_SIZE and when
+   OpenMP dynamic adjustment is off (so the thread count is stable across barriers). */
+static void libsais16_final_bwt_scan_left_to_right_16u_block_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+#if defined(_OPENMP)
+    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * ALPHABET_SIZE && omp_get_dynamic() == 0)
+#endif
+    {
+#if defined(_OPENMP)
+        fast_sint_t omp_thread_num = omp_get_thread_num();
+        fast_sint_t omp_num_threads = omp_get_num_threads();
+#else
+        UNUSED(threads); UNUSED(thread_state);
+
+        fast_sint_t omp_thread_num = 0;
+        fast_sint_t omp_num_threads = 1;
+#endif
+        fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
+        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
+
+        omp_block_start += block_start;
+
+        if (omp_num_threads == 1)
+        {
+            libsais16_final_bwt_scan_left_to_right_16u(T, SA, induction_bucket, omp_block_start, omp_block_size);
+        }
+#if defined(_OPENMP)
+        else
+        {
+            {
+                thread_state[omp_thread_num].state.count = libsais16_final_bwt_scan_left_to_right_16u_block_prepare(T, SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size);
+            }
+
+            #pragma omp barrier
+
+            #pragma omp master
+            {
+                fast_sint_t t;
+                for (t = 0; t < omp_num_threads; ++t)
+                {
+                    sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets;
+                    fast_sint_t c; for (c = 0; c < ALPHABET_SIZE; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_bucket[c]; induction_bucket[c] = A + B; temp_bucket[c] = A; }
+                }
+            }
+
+            #pragma omp barrier
+
+            {
+                libsais16_final_order_scan_left_to_right_16u_block_place(SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count);
+            }
+        }
+#endif
+    }
+}
+
+/* Parallel driver for the BWT-with-auxiliary-indexes left-to-right scan. Same
+   prepare / prefix-sum / place orchestration as the non-aux driver; note that the
+   prepare step reuses the plain BWT *_block_prepare (the auxiliary I[] samples only
+   depend on the final bucket offsets, so they are produced during the aux-aware
+   place step instead). Serial fallback calls the aux scan directly. */
+static void libsais16_final_bwt_aux_scan_left_to_right_16u_block_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+#if defined(_OPENMP)
+    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * ALPHABET_SIZE && omp_get_dynamic() == 0)
+#endif
+    {
+#if defined(_OPENMP)
+        fast_sint_t omp_thread_num = omp_get_thread_num();
+        fast_sint_t omp_num_threads = omp_get_num_threads();
+#else
+        UNUSED(threads); UNUSED(thread_state);
+
+        fast_sint_t omp_thread_num = 0;
+        fast_sint_t omp_num_threads = 1;
+#endif
+        fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
+        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
+
+        omp_block_start += block_start;
+
+        if (omp_num_threads == 1)
+        {
+            libsais16_final_bwt_aux_scan_left_to_right_16u(T, SA, rm, I, induction_bucket, omp_block_start, omp_block_size);
+        }
+#if defined(_OPENMP)
+        else
+        {
+            {
+                thread_state[omp_thread_num].state.count = libsais16_final_bwt_scan_left_to_right_16u_block_prepare(T, SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size);
+            }
+
+            #pragma omp barrier
+
+            #pragma omp master
+            {
+                fast_sint_t t;
+                for (t = 0; t < omp_num_threads; ++t)
+                {
+                    sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets;
+                    fast_sint_t c; for (c = 0; c < ALPHABET_SIZE; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_bucket[c]; induction_bucket[c] = A + B; temp_bucket[c] = A; }
+                }
+            }
+
+            #pragma omp barrier
+
+            {
+                libsais16_final_bwt_aux_scan_left_to_right_16u_block_place(SA, rm, I, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count);
+            }
+        }
+#endif
+    }
+}
+
+/* Parallel driver for one block of the final sorting (suffix-array) left-to-right
+   scan: identical orchestration to the BWT block driver — per-thread prepare into
+   cache/buckets, barrier, master prefix-sums per-symbol counts across threads into
+   the shared induction_bucket while seeding each thread's starting offsets, barrier,
+   per-thread place replay. Serial fallback when the region runs on one thread. */
+static void libsais16_final_sorting_scan_left_to_right_16u_block_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+#if defined(_OPENMP)
+    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * ALPHABET_SIZE && omp_get_dynamic() == 0)
+#endif
+    {
+#if defined(_OPENMP)
+        fast_sint_t omp_thread_num = omp_get_thread_num();
+        fast_sint_t omp_num_threads = omp_get_num_threads();
+#else
+        UNUSED(threads); UNUSED(thread_state);
+
+        fast_sint_t omp_thread_num = 0;
+        fast_sint_t omp_num_threads = 1;
+#endif
+        fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
+        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
+
+        omp_block_start += block_start;
+
+        if (omp_num_threads == 1)
+        {
+            libsais16_final_sorting_scan_left_to_right_16u(T, SA, induction_bucket, omp_block_start, omp_block_size);
+        }
+#if defined(_OPENMP)
+        else
+        {
+            {
+                thread_state[omp_thread_num].state.count = libsais16_final_sorting_scan_left_to_right_16u_block_prepare(T, SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size);
+            }
+
+            #pragma omp barrier
+
+            #pragma omp master
+            {
+                fast_sint_t t;
+                for (t = 0; t < omp_num_threads; ++t)
+                {
+                    sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets;
+                    fast_sint_t c; for (c = 0; c < ALPHABET_SIZE; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_bucket[c]; induction_bucket[c] = A + B; temp_bucket[c] = A; }
+                }
+            }
+
+            #pragma omp barrier
+
+            {
+                libsais16_final_order_scan_left_to_right_16u_block_place(SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count);
+            }
+        }
+#endif
+    }
+}
+
+/* Parallel driver for one block of the final 32-bit sorting scan. Unlike the 16u
+   drivers (which compact records per thread), the cache here is indexed by absolute
+   SA position — hence the `cache - block_start` bias passed to the helpers. Phases:
+   per-thread gather, barrier, master-only sort over the whole block (chains induced
+   entries in-cache to preserve serial order), barrier, per-thread compact-and-place
+   back into SA. Engages for blocks >= 16384 entries. */
+static void libsais16_final_sorting_scan_left_to_right_32s_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads)
+{
+#if defined(_OPENMP)
+    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384)
+#endif
+    {
+#if defined(_OPENMP)
+        fast_sint_t omp_thread_num = omp_get_thread_num();
+        fast_sint_t omp_num_threads = omp_get_num_threads();
+#else
+        UNUSED(threads); UNUSED(cache);
+
+        fast_sint_t omp_thread_num = 0;
+        fast_sint_t omp_num_threads = 1;
+#endif
+        fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
+        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
+
+        omp_block_start += block_start;
+
+        if (omp_num_threads == 1)
+        {
+            libsais16_final_sorting_scan_left_to_right_32s(T, SA, buckets, omp_block_start, omp_block_size);
+        }
+#if defined(_OPENMP)
+        else
+        {
+            {
+                libsais16_final_sorting_scan_left_to_right_32s_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size);
+            }
+
+            #pragma omp barrier
+
+            #pragma omp master
+            {
+                libsais16_final_sorting_scan_left_to_right_32s_block_sort(T, buckets, cache - block_start, block_start, block_size);
+            }
+
+            #pragma omp barrier
+
+            {
+                libsais16_compact_and_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size);
+            }
+        }
+#endif
+    }
+}
+
+#endif
+
+/* Top-level left-to-right BWT induction over the whole SA. Seeds the scan with the
+   last suffix n-1 (type flag from T[n-2] < T[n-1]; note this reads T[n-2], so callers
+   must guarantee n >= 2). Serial path for one thread or n < 65536. The parallel path
+   walks SA left to right, splitting it into blocks at zero (empty) entries — an empty
+   slot is skipped one at a time because it may still be filled by induction from
+   entries before it. Non-zero runs are capped at threads * (cache size - 16*threads)
+   so every thread's records fit its cache; runs shorter than 32 are processed inline
+   serially to avoid parallel overhead. */
+static void libsais16_final_bwt_scan_left_to_right_16u_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, fast_sint_t n, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+    SA[induction_bucket[T[(sa_sint_t)n - 1]]++] = ((sa_sint_t)n - 1) | ((sa_sint_t)(T[(sa_sint_t)n - 2] < T[(sa_sint_t)n - 1]) << (SAINT_BIT - 1));
+
+    if (threads == 1 || n < 65536)
+    {
+        libsais16_final_bwt_scan_left_to_right_16u(T, SA, induction_bucket, 0, n);
+    }
+#if defined(_OPENMP)
+    else
+    {
+        fast_sint_t block_start;
+        for (block_start = 0; block_start < n; )
+        {
+            if (SA[block_start] == 0)
+            {
+                block_start++;
+            }
+            else
+            {
+                fast_sint_t block_max_end = block_start + ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads); if (block_max_end > n) { block_max_end = n;}
+                fast_sint_t block_end = block_start + 1; while (block_end < block_max_end && SA[block_end] != 0) { block_end++; }
+                fast_sint_t block_size = block_end - block_start;
+
+                if (block_size < 32)
+                {
+                    for (; block_start < block_end; block_start += 1)
+                    {
+                        sa_sint_t p = SA[block_start]; SA[block_start] = p & SAINT_MAX; if (p > 0) { p--; SA[block_start] = T[p] | SAINT_MIN; SA[induction_bucket[T[p]]++] = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); }
+                    }
+                }
+                else
+                {
+                    libsais16_final_bwt_scan_left_to_right_16u_block_omp(T, SA, induction_bucket, block_start, block_size, threads, thread_state);
+                    block_start = block_end;
+                }
+            }
+        }
+    }
+#else
+    UNUSED(thread_state);
+#endif
+}
+
+/* Top-level left-to-right BWT induction with auxiliary-index sampling. Identical
+   zero-entry block-splitting strategy to libsais16_final_bwt_scan_left_to_right_16u_omp;
+   additionally records the auxiliary sample for the seeded suffix n-1 when its
+   position is a multiple of rm + 1, and the inline (< 32 entries) path performs the
+   per-position I[] sampling itself. Reads T[n-2], so callers must guarantee n >= 2. */
+static void libsais16_final_bwt_aux_scan_left_to_right_16u_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, fast_sint_t n, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+    SA[induction_bucket[T[(sa_sint_t)n - 1]]++] = ((sa_sint_t)n - 1) | ((sa_sint_t)(T[(sa_sint_t)n - 2] < T[(sa_sint_t)n - 1]) << (SAINT_BIT - 1));
+
+    if ((((sa_sint_t)n - 1) & rm) == 0) { I[((sa_sint_t)n - 1) / (rm + 1)] = induction_bucket[T[(sa_sint_t)n - 1]]; }
+
+    if (threads == 1 || n < 65536)
+    {
+        libsais16_final_bwt_aux_scan_left_to_right_16u(T, SA, rm, I, induction_bucket, 0, n);
+    }
+#if defined(_OPENMP)
+    else
+    {
+        fast_sint_t block_start;
+        for (block_start = 0; block_start < n; )
+        {
+            if (SA[block_start] == 0)
+            {
+                block_start++;
+            }
+            else
+            {
+                fast_sint_t block_max_end = block_start + ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads); if (block_max_end > n) { block_max_end = n;}
+                fast_sint_t block_end = block_start + 1; while (block_end < block_max_end && SA[block_end] != 0) { block_end++; }
+                fast_sint_t block_size = block_end - block_start;
+
+                if (block_size < 32)
+                {
+                    for (; block_start < block_end; block_start += 1)
+                    {
+                        sa_sint_t p = SA[block_start]; SA[block_start] = p & SAINT_MAX; if (p > 0) { p--; SA[block_start] = T[p] | SAINT_MIN; SA[induction_bucket[T[p]]++] = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); if ((p & rm) == 0) { I[p / (rm + 1)] = induction_bucket[T[p]]; } }
+                    }
+                }
+                else
+                {
+                    libsais16_final_bwt_aux_scan_left_to_right_16u_block_omp(T, SA, rm, I, induction_bucket, block_start, block_size, threads, thread_state);
+                    block_start = block_end;
+                }
+            }
+        }
+    }
+#else
+    UNUSED(thread_state);
+#endif
+}
+
+/* Final left-to-right induction pass (plain suffix-array sorting variant, no BWT
+   output) for 16-bit input. Structure mirrors the BWT variant: seed with the
+   suffix at n - 1, then serial scan or parallel block scan. Unlike the BWT
+   variant, consumed entries are unmasked with p ^ SAINT_MIN instead of being
+   overwritten with the BWT symbol. */
+static void libsais16_final_sorting_scan_left_to_right_16u_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, fast_sint_t n, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+ SA[induction_bucket[T[(sa_sint_t)n - 1]]++] = ((sa_sint_t)n - 1) | ((sa_sint_t)(T[(sa_sint_t)n - 2] < T[(sa_sint_t)n - 1]) << (SAINT_BIT - 1));
+
+ if (threads == 1 || n < 65536)
+ {
+ libsais16_final_sorting_scan_left_to_right_16u(T, SA, induction_bucket, 0, n);
+ }
+#if defined(_OPENMP)
+ else
+ {
+ fast_sint_t block_start;
+ for (block_start = 0; block_start < n; )
+ {
+ if (SA[block_start] == 0)
+ {
+ block_start++;
+ }
+ else
+ {
+ fast_sint_t block_max_end = block_start + ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads); if (block_max_end > n) { block_max_end = n;}
+ fast_sint_t block_end = block_start + 1; while (block_end < block_max_end && SA[block_end] != 0) { block_end++; }
+ fast_sint_t block_size = block_end - block_start;
+
+ if (block_size < 32)
+ {
+ for (; block_start < block_end; block_start += 1)
+ {
+ sa_sint_t p = SA[block_start]; SA[block_start] = p ^ SAINT_MIN; if (p > 0) { p--; SA[induction_bucket[T[p]]++] = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); }
+ }
+ }
+ else
+ {
+ libsais16_final_sorting_scan_left_to_right_16u_block_omp(T, SA, induction_bucket, block_start, block_size, threads, thread_state);
+ block_start = block_end;
+ }
+ }
+ }
+ }
+#else
+ UNUSED(thread_state);
+#endif
+}
+
+/* Final left-to-right induction pass for the 32-bit (recursive) alphabet case.
+   Serial for one thread or small n; otherwise processes SA in fixed-size chunks,
+   each handled by the gather/sort/place block helper using thread 0's cache. */
+static void libsais16_final_sorting_scan_left_to_right_32s_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+ SA[induction_bucket[T[n - 1]]++] = (n - 1) | ((sa_sint_t)(T[n - 2] < T[n - 1]) << (SAINT_BIT - 1));
+
+ if (threads == 1 || n < 65536)
+ {
+ libsais16_final_sorting_scan_left_to_right_32s(T, SA, induction_bucket, 0, n);
+ }
+#if defined(_OPENMP)
+ else
+ {
+ fast_sint_t block_start, block_end;
+ for (block_start = 0; block_start < n; block_start = block_end)
+ {
+ block_end = block_start + (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end > n) { block_end = n; }
+
+ libsais16_final_sorting_scan_left_to_right_32s_block_omp(T, SA, induction_bucket, thread_state[0].state.cache, block_start, block_end - block_start, threads);
+ }
+ }
+#else
+ UNUSED(thread_state);
+#endif
+}
+
+/* Serial right-to-left induction over SA[omp_block_start .. +omp_block_size),
+   producing the BWT in place: each consumed slot is overwritten with the
+   preceding symbol T[p]. Returns the SA index at which the entry 0 (the
+   primary/sentinel position) was found, or -1 if it was not in this range.
+   The main loop is unrolled by two with software prefetch of SA writes and of
+   the two T positions each pending entry will touch. */
+static sa_sint_t libsais16_final_bwt_scan_right_to_left_16u(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ fast_sint_t i, j; sa_sint_t index = -1;
+ for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2)
+ {
+ libsais16_prefetchw(&SA[i - 2 * prefetch_distance]);
+
+ sa_sint_t s0 = SA[i - prefetch_distance - 0]; const uint16_t * Ts0 = &T[s0] - 1; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16_prefetch(s0 > 0 ? Ts0 : NULL);
+ sa_sint_t s1 = SA[i - prefetch_distance - 1]; const uint16_t * Ts1 = &T[s1] - 1; libsais16_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16_prefetch(s1 > 0 ? Ts1 : NULL);
+
+ /* t carries the sign-bit tag when c0 > c1 (an L-type predecessor must not
+ be re-induced by this pass). */
+ sa_sint_t p0 = SA[i - 0]; index = (p0 == 0) ? (sa_sint_t)(i - 0) : index;
+ SA[i - 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; uint16_t c0 = T[p0 - (p0 > 0)], c1 = T[p0]; SA[i - 0] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? p0 : t; }
+
+ sa_sint_t p1 = SA[i - 1]; index = (p1 == 0) ? (sa_sint_t)(i - 1) : index;
+ SA[i - 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; uint16_t c0 = T[p1 - (p1 > 0)], c1 = T[p1]; SA[i - 1] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? p1 : t; }
+ }
+
+ /* Remainder loop for the last few (un-prefetched) entries. */
+ for (j -= prefetch_distance + 1; i >= j; i -= 1)
+ {
+ sa_sint_t p = SA[i]; index = (p == 0) ? (sa_sint_t)i : index;
+ SA[i] = p & SAINT_MAX; if (p > 0) { p--; uint16_t c0 = T[p - (p > 0)], c1 = T[p]; SA[i] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? p : t; }
+ }
+
+ return index;
+}
+
+/* Serial right-to-left BWT induction identical to the function above, except it
+   does not track the primary index and instead records auxiliary indexes:
+   whenever the induced position p satisfies (p & rm) == 0, the bucket position
+   plus one is stored into I[p / (rm + 1)]. */
+static void libsais16_final_bwt_aux_scan_right_to_left_16u(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ fast_sint_t i, j;
+ for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2)
+ {
+ libsais16_prefetchw(&SA[i - 2 * prefetch_distance]);
+
+ sa_sint_t s0 = SA[i - prefetch_distance - 0]; const uint16_t * Ts0 = &T[s0] - 1; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16_prefetch(s0 > 0 ? Ts0 : NULL);
+ sa_sint_t s1 = SA[i - prefetch_distance - 1]; const uint16_t * Ts1 = &T[s1] - 1; libsais16_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16_prefetch(s1 > 0 ? Ts1 : NULL);
+
+ sa_sint_t p0 = SA[i - 0];
+ SA[i - 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; uint16_t c0 = T[p0 - (p0 > 0)], c1 = T[p0]; SA[i - 0] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? p0 : t; if ((p0 & rm) == 0) { I[p0 / (rm + 1)] = induction_bucket[T[p0]] + 1; } }
+
+ sa_sint_t p1 = SA[i - 1];
+ SA[i - 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; uint16_t c0 = T[p1 - (p1 > 0)], c1 = T[p1]; SA[i - 1] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? p1 : t; if ((p1 & rm) == 0) { I[p1 / (rm + 1)] = induction_bucket[T[p1]] + 1; } }
+ }
+
+ /* Remainder loop for the last few (un-prefetched) entries. */
+ for (j -= prefetch_distance + 1; i >= j; i -= 1)
+ {
+ sa_sint_t p = SA[i];
+ SA[i] = p & SAINT_MAX; if (p > 0) { p--; uint16_t c0 = T[p - (p > 0)], c1 = T[p]; SA[i] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? p : t; if ((p & rm) == 0) { I[p / (rm + 1)] = induction_bucket[T[p]] + 1; } }
+ }
+}
+
+/* Serial right-to-left induction for the plain sorting (non-BWT) 16-bit case:
+   consumed slots keep their suffix index (masked to clear the tag bit), and the
+   induced predecessor is tagged in its sign bit when T[p-1] > T[p]. Unrolled by
+   two with software prefetch, followed by a scalar remainder loop. */
+static void libsais16_final_sorting_scan_right_to_left_16u(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ fast_sint_t i, j;
+ for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2)
+ {
+ libsais16_prefetchw(&SA[i - 2 * prefetch_distance]);
+
+ sa_sint_t s0 = SA[i - prefetch_distance - 0]; const uint16_t * Ts0 = &T[s0] - 1; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16_prefetch(s0 > 0 ? Ts0 : NULL);
+ sa_sint_t s1 = SA[i - prefetch_distance - 1]; const uint16_t * Ts1 = &T[s1] - 1; libsais16_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16_prefetch(s1 > 0 ? Ts1 : NULL);
+
+ sa_sint_t p0 = SA[i - 0]; SA[i - 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; SA[--induction_bucket[T[p0]]] = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] > T[p0]) << (SAINT_BIT - 1)); }
+ sa_sint_t p1 = SA[i - 1]; SA[i - 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; SA[--induction_bucket[T[p1]]] = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] > T[p1]) << (SAINT_BIT - 1)); }
+ }
+
+ for (j -= prefetch_distance + 1; i >= j; i -= 1)
+ {
+ sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; SA[--induction_bucket[T[p]]] = p | ((sa_sint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1)); }
+ }
+}
+
+/* 32-bit-alphabet counterpart of the function above. Uses a deeper two-stage
+   prefetch pipeline (text at 2x distance, bucket slots at 1x distance) since the
+   bucket array is too large to stay cached for recursive-level alphabets. */
+static void libsais16_final_sorting_scan_right_to_left_32s(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ fast_sint_t i, j;
+ for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 1; i >= j; i -= 2)
+ {
+ libsais16_prefetchw(&SA[i - 3 * prefetch_distance]);
+
+ sa_sint_t s0 = SA[i - 2 * prefetch_distance - 0]; const sa_sint_t * Ts0 = &T[s0] - 1; libsais16_prefetch(s0 > 0 ? Ts0 : NULL);
+ sa_sint_t s1 = SA[i - 2 * prefetch_distance - 1]; const sa_sint_t * Ts1 = &T[s1] - 1; libsais16_prefetch(s1 > 0 ? Ts1 : NULL);
+ sa_sint_t s2 = SA[i - 1 * prefetch_distance - 0]; if (s2 > 0) { libsais16_prefetchw(&induction_bucket[T[s2 - 1]]); libsais16_prefetch(&T[s2] - 2); }
+ sa_sint_t s3 = SA[i - 1 * prefetch_distance - 1]; if (s3 > 0) { libsais16_prefetchw(&induction_bucket[T[s3 - 1]]); libsais16_prefetch(&T[s3] - 2); }
+
+ sa_sint_t p0 = SA[i - 0]; SA[i - 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; SA[--induction_bucket[T[p0]]] = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] > T[p0]) << (SAINT_BIT - 1)); }
+ sa_sint_t p1 = SA[i - 1]; SA[i - 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; SA[--induction_bucket[T[p1]]] = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] > T[p1]) << (SAINT_BIT - 1)); }
+ }
+
+ for (j -= 2 * prefetch_distance + 1; i >= j; i -= 1)
+ {
+ sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; SA[--induction_bucket[T[p]]] = p | ((sa_sint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1)); }
+ }
+}
+
+#if defined(_OPENMP)
+
+/* Phase 1 of the parallel right-to-left BWT block scan: each thread walks its
+   slice of SA, overwrites consumed slots with the BWT symbol, and instead of
+   writing induced entries directly into SA, records (symbol, index) pairs into
+   its private cache while counting per-symbol occurrences into its private
+   buckets. Returns the number of cached entries; a later place phase writes
+   them into SA once global bucket offsets are known. */
+static fast_sint_t libsais16_final_bwt_scan_right_to_left_16u_block_prepare(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ memset(buckets, 0, ALPHABET_SIZE * sizeof(sa_sint_t));
+
+ fast_sint_t i, j, count = 0;
+ for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2)
+ {
+ libsais16_prefetchw(&SA[i - 2 * prefetch_distance]);
+
+ sa_sint_t s0 = SA[i - prefetch_distance - 0]; const uint16_t * Ts0 = &T[s0] - 1; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16_prefetch(s0 > 0 ? Ts0 : NULL);
+ sa_sint_t s1 = SA[i - prefetch_distance - 1]; const uint16_t * Ts1 = &T[s1] - 1; libsais16_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16_prefetch(s1 > 0 ? Ts1 : NULL);
+
+ sa_sint_t p0 = SA[i - 0]; SA[i - 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; uint16_t c0 = T[p0 - (p0 > 0)], c1 = T[p0]; SA[i - 0] = c1; sa_sint_t t = c0 | SAINT_MIN; buckets[cache[count].symbol = c1]++; cache[count++].index = (c0 <= c1) ? p0 : t; }
+ sa_sint_t p1 = SA[i - 1]; SA[i - 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; uint16_t c0 = T[p1 - (p1 > 0)], c1 = T[p1]; SA[i - 1] = c1; sa_sint_t t = c0 | SAINT_MIN; buckets[cache[count].symbol = c1]++; cache[count++].index = (c0 <= c1) ? p1 : t; }
+ }
+
+ for (j -= prefetch_distance + 1; i >= j; i -= 1)
+ {
+ sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; uint16_t c0 = T[p - (p > 0)], c1 = T[p]; SA[i] = c1; sa_sint_t t = c0 | SAINT_MIN; buckets[cache[count].symbol = c1]++; cache[count++].index = (c0 <= c1) ? p : t; }
+ }
+
+ return count;
+}
+
+/* Prepare phase for the auxiliary-index BWT variant. Each induced entry occupies
+   TWO consecutive cache slots: slot count holds (symbol, possibly tagged index)
+   for the SA write, and slot count + 1 holds the raw position p so the place
+   phase can test (p & rm) == 0 and update I. Returns the number of cache slots
+   used (2x the number of induced entries). */
+static fast_sint_t libsais16_final_bwt_aux_scan_right_to_left_16u_block_prepare(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ memset(buckets, 0, ALPHABET_SIZE * sizeof(sa_sint_t));
+
+ fast_sint_t i, j, count = 0;
+ for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2)
+ {
+ libsais16_prefetchw(&SA[i - 2 * prefetch_distance]);
+
+ sa_sint_t s0 = SA[i - prefetch_distance - 0]; const uint16_t * Ts0 = &T[s0] - 1; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16_prefetch(s0 > 0 ? Ts0 : NULL);
+ sa_sint_t s1 = SA[i - prefetch_distance - 1]; const uint16_t * Ts1 = &T[s1] - 1; libsais16_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16_prefetch(s1 > 0 ? Ts1 : NULL);
+
+ sa_sint_t p0 = SA[i - 0]; SA[i - 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; uint16_t c0 = T[p0 - (p0 > 0)], c1 = T[p0]; SA[i - 0] = c1; sa_sint_t t = c0 | SAINT_MIN; buckets[cache[count].symbol = c1]++; cache[count].index = (c0 <= c1) ? p0 : t; cache[count + 1].index = p0; count += 2; }
+ sa_sint_t p1 = SA[i - 1]; SA[i - 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; uint16_t c0 = T[p1 - (p1 > 0)], c1 = T[p1]; SA[i - 1] = c1; sa_sint_t t = c0 | SAINT_MIN; buckets[cache[count].symbol = c1]++; cache[count].index = (c0 <= c1) ? p1 : t; cache[count + 1].index = p1; count += 2; }
+ }
+
+ for (j -= prefetch_distance + 1; i >= j; i -= 1)
+ {
+ sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; uint16_t c0 = T[p - (p > 0)], c1 = T[p]; SA[i] = c1; sa_sint_t t = c0 | SAINT_MIN; buckets[cache[count].symbol = c1]++; cache[count].index = (c0 <= c1) ? p : t; cache[count + 1].index = p; count += 2; }
+ }
+
+ return count;
+}
+
+/* Prepare phase for the plain-sorting right-to-left block scan: like the BWT
+   prepare above, but SA slots keep their (unmasked) suffix index and the cached
+   index carries the T[p-1] > T[p] tag in its sign bit. Returns the number of
+   cached entries. */
+static fast_sint_t libsais16_final_sorting_scan_right_to_left_16u_block_prepare(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ memset(buckets, 0, ALPHABET_SIZE * sizeof(sa_sint_t));
+
+ fast_sint_t i, j, count = 0;
+ for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2)
+ {
+ libsais16_prefetchw(&SA[i - 2 * prefetch_distance]);
+
+ sa_sint_t s0 = SA[i - prefetch_distance - 0]; const uint16_t * Ts0 = &T[s0] - 1; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16_prefetch(s0 > 0 ? Ts0 : NULL);
+ sa_sint_t s1 = SA[i - prefetch_distance - 1]; const uint16_t * Ts1 = &T[s1] - 1; libsais16_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16_prefetch(s1 > 0 ? Ts1 : NULL);
+
+ sa_sint_t p0 = SA[i - 0]; SA[i - 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; buckets[cache[count].symbol = T[p0]]++; cache[count++].index = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] > T[p0]) << (SAINT_BIT - 1)); }
+ sa_sint_t p1 = SA[i - 1]; SA[i - 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; buckets[cache[count].symbol = T[p1]]++; cache[count++].index = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] > T[p1]) << (SAINT_BIT - 1)); }
+ }
+
+ for (j -= prefetch_distance + 1; i >= j; i -= 1)
+ {
+ sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; buckets[cache[count].symbol = T[p]]++; cache[count++].index = p | ((sa_sint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1)); }
+ }
+
+ return count;
+}
+
+/* Place phase: flush one thread's cached (symbol, index) pairs into SA using
+   pre-decremented bucket positions. buckets must already hold this thread's
+   end offsets (set up by the master redistribution step in the block_omp
+   driver). Unrolled by four with a scalar tail. */
+static void libsais16_final_order_scan_right_to_left_16u_block_place(sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ fast_sint_t i, j;
+ for (i = 0, j = count - 3; i < j; i += 4)
+ {
+ libsais16_prefetch(&cache[i + prefetch_distance]);
+
+ SA[--buckets[cache[i + 0].symbol]] = cache[i + 0].index;
+ SA[--buckets[cache[i + 1].symbol]] = cache[i + 1].index;
+ SA[--buckets[cache[i + 2].symbol]] = cache[i + 2].index;
+ SA[--buckets[cache[i + 3].symbol]] = cache[i + 3].index;
+ }
+
+ for (j += 3; i < j; i += 1)
+ {
+ SA[--buckets[cache[i].symbol]] = cache[i].index;
+ }
+}
+
+/* Place phase for the auxiliary-index BWT variant. Cache entries come in pairs
+   (see the matching prepare function): even slots hold the (symbol, tagged
+   index) SA write, odd slots hold the raw position used for the I[] update,
+   hence the stride-2 iteration. Unrolled four pairs at a time with a
+   pair-stride tail loop. */
+static void libsais16_final_bwt_aux_scan_right_to_left_16u_block_place(sa_sint_t * RESTRICT SA, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ fast_sint_t i, j;
+ for (i = 0, j = count - 6; i < j; i += 8)
+ {
+ libsais16_prefetch(&cache[i + prefetch_distance]);
+
+ SA[--buckets[cache[i + 0].symbol]] = cache[i + 0].index; if ((cache[i + 1].index & rm) == 0) { I[cache[i + 1].index / (rm + 1)] = buckets[cache[i + 0].symbol] + 1; }
+ SA[--buckets[cache[i + 2].symbol]] = cache[i + 2].index; if ((cache[i + 3].index & rm) == 0) { I[cache[i + 3].index / (rm + 1)] = buckets[cache[i + 2].symbol] + 1; }
+ SA[--buckets[cache[i + 4].symbol]] = cache[i + 4].index; if ((cache[i + 5].index & rm) == 0) { I[cache[i + 5].index / (rm + 1)] = buckets[cache[i + 4].symbol] + 1; }
+ SA[--buckets[cache[i + 6].symbol]] = cache[i + 6].index; if ((cache[i + 7].index & rm) == 0) { I[cache[i + 7].index / (rm + 1)] = buckets[cache[i + 6].symbol] + 1; }
+ }
+
+ for (j += 6; i < j; i += 2)
+ {
+ /* NOTE(review): the & SAINT_MAX mask here is absent in the unrolled body
+ above; odd cache slots are written as raw nonnegative positions in the
+ prepare phase, so the mask appears to be redundant rather than a
+ behavioral difference — confirm against the prepare functions. */
+ SA[--buckets[cache[i].symbol]] = cache[i].index; if ((cache[i + 1].index & rm) == 0) { I[(cache[i + 1].index & SAINT_MAX) / (rm + 1)] = buckets[cache[i].symbol] + 1; }
+ }
+}
+
+/* Gather phase of the parallel 32-bit right-to-left block scan: each thread
+   scans its slice left-to-right, unmasks SA entries in place, and records into
+   the shared cache (indexed by SA position) the induced predecessor index and
+   its symbol; slots with nothing to induce get symbol SAINT_MIN so the serial
+   sort phase can skip them. */
+static void libsais16_final_sorting_scan_right_to_left_32s_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ fast_sint_t i, j;
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2)
+ {
+ libsais16_prefetchw(&SA[i + 2 * prefetch_distance]);
+
+ sa_sint_t s0 = SA[i + prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0] - 1; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16_prefetch(s0 > 0 ? Ts0 : NULL);
+ sa_sint_t s1 = SA[i + prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1] - 1; libsais16_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16_prefetch(s1 > 0 ? Ts1 : NULL);
+
+ libsais16_prefetchw(&cache[i + prefetch_distance]);
+
+ sa_sint_t symbol0 = SAINT_MIN, p0 = SA[i + 0]; SA[i + 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; cache[i + 0].index = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] > T[p0]) << (SAINT_BIT - 1)); symbol0 = T[p0]; } cache[i + 0].symbol = symbol0;
+ sa_sint_t symbol1 = SAINT_MIN, p1 = SA[i + 1]; SA[i + 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; cache[i + 1].index = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] > T[p1]) << (SAINT_BIT - 1)); symbol1 = T[p1]; } cache[i + 1].symbol = symbol1;
+ }
+
+ for (j += prefetch_distance + 1; i < j; i += 1)
+ {
+ sa_sint_t symbol = SAINT_MIN, p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; cache[i].index = p | ((sa_sint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1)); symbol = T[p]; } cache[i].symbol = symbol;
+ }
+}
+
+/* Serial sort phase (run by the master thread) over the gathered cache: walks
+   the block right-to-left, assigns each valid entry (symbol >= 0) its final SA
+   slot by decrementing the bucket counter, and stores that slot back into the
+   entry's symbol field for the later compact/place phase. When the target slot
+   falls inside this same block, the newly induced entry is expanded in place
+   into the cache (chaining the induction within the block). */
+static void libsais16_final_sorting_scan_right_to_left_32s_block_sort(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ fast_sint_t i, j;
+ for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2)
+ {
+ libsais16_prefetchw(&cache[i - 2 * prefetch_distance]);
+
+ sa_sint_t s0 = cache[i - prefetch_distance - 0].symbol; const sa_sint_t * Is0 = &induction_bucket[s0]; libsais16_prefetchw(s0 >= 0 ? Is0 : NULL);
+ sa_sint_t s1 = cache[i - prefetch_distance - 1].symbol; const sa_sint_t * Is1 = &induction_bucket[s1]; libsais16_prefetchw(s1 >= 0 ? Is1 : NULL);
+
+ sa_sint_t v0 = cache[i - 0].symbol;
+ if (v0 >= 0)
+ {
+ cache[i - 0].symbol = --induction_bucket[v0];
+ if (cache[i - 0].symbol >= omp_block_start) { sa_sint_t ni = cache[i - 0].symbol, np = cache[i - 0].index; cache[i - 0].index = np & SAINT_MAX; if (np > 0) { np--; cache[ni].index = np | ((sa_sint_t)(T[np - (np > 0)] > T[np]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np]; } }
+ }
+
+ sa_sint_t v1 = cache[i - 1].symbol;
+ if (v1 >= 0)
+ {
+ cache[i - 1].symbol = --induction_bucket[v1];
+ if (cache[i - 1].symbol >= omp_block_start) { sa_sint_t ni = cache[i - 1].symbol, np = cache[i - 1].index; cache[i - 1].index = np & SAINT_MAX; if (np > 0) { np--; cache[ni].index = np | ((sa_sint_t)(T[np - (np > 0)] > T[np]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np]; } }
+ }
+ }
+
+ for (j -= prefetch_distance + 1; i >= j; i -= 1)
+ {
+ sa_sint_t v = cache[i].symbol;
+ if (v >= 0)
+ {
+ cache[i].symbol = --induction_bucket[v];
+ if (cache[i].symbol >= omp_block_start) { sa_sint_t ni = cache[i].symbol, np = cache[i].index; cache[i].index = np & SAINT_MAX; if (np > 0) { np--; cache[ni].index = np | ((sa_sint_t)(T[np - (np > 0)] > T[np]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np]; } }
+ }
+ }
+}
+
+/* Parallel driver for one right-to-left BWT block: splits [block_start,
+   block_start + block_size) across the OpenMP team in 16-aligned strides. Each
+   thread runs the prepare phase into private cache/buckets; after a barrier the
+   master converts the per-thread symbol counts into per-thread end offsets by
+   walking threads from last to first (so the rightmost thread writes at the
+   current bucket tops, matching right-to-left order); after a second barrier
+   each thread places its cached entries. Falls back to the plain serial scan
+   when the team has a single thread. */
+static void libsais16_final_bwt_scan_right_to_left_16u_block_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+#if defined(_OPENMP)
+ #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * ALPHABET_SIZE && omp_get_dynamic() == 0)
+#endif
+ {
+#if defined(_OPENMP)
+ fast_sint_t omp_thread_num = omp_get_thread_num();
+ fast_sint_t omp_num_threads = omp_get_num_threads();
+#else
+ UNUSED(threads); UNUSED(thread_state);
+
+ fast_sint_t omp_thread_num = 0;
+ fast_sint_t omp_num_threads = 1;
+#endif
+ /* 16-aligned stride per thread; the last thread absorbs the remainder. */
+ fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
+ fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+ fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
+
+ omp_block_start += block_start;
+
+ if (omp_num_threads == 1)
+ {
+ libsais16_final_bwt_scan_right_to_left_16u(T, SA, induction_bucket, omp_block_start, omp_block_size);
+ }
+#if defined(_OPENMP)
+ else
+ {
+ {
+ thread_state[omp_thread_num].state.count = libsais16_final_bwt_scan_right_to_left_16u_block_prepare(T, SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size);
+ }
+
+ #pragma omp barrier
+
+ #pragma omp master
+ {
+ /* Convert counts to end offsets: thread t's bucket top becomes the
+ global top minus the counts of all threads after it. */
+ fast_sint_t t;
+ for (t = omp_num_threads - 1; t >= 0; --t)
+ {
+ sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets;
+ fast_sint_t c; for (c = 0; c < ALPHABET_SIZE; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_bucket[c]; induction_bucket[c] = A - B; temp_bucket[c] = A; }
+ }
+ }
+
+ #pragma omp barrier
+
+ {
+ libsais16_final_order_scan_right_to_left_16u_block_place(SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count);
+ }
+ }
+#endif
+ }
+}
+
+/* Parallel driver for one right-to-left BWT block with auxiliary indexes.
+   Identical prepare / master-redistribute / place choreography as the plain BWT
+   driver above, but using the aux prepare and place helpers which additionally
+   maintain the I[] sample array (see those helpers for the paired cache-slot
+   layout). */
+static void libsais16_final_bwt_aux_scan_right_to_left_16u_block_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+#if defined(_OPENMP)
+ #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * ALPHABET_SIZE && omp_get_dynamic() == 0)
+#endif
+ {
+#if defined(_OPENMP)
+ fast_sint_t omp_thread_num = omp_get_thread_num();
+ fast_sint_t omp_num_threads = omp_get_num_threads();
+#else
+ UNUSED(threads); UNUSED(thread_state);
+
+ fast_sint_t omp_thread_num = 0;
+ fast_sint_t omp_num_threads = 1;
+#endif
+ fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
+ fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+ fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
+
+ omp_block_start += block_start;
+
+ if (omp_num_threads == 1)
+ {
+ libsais16_final_bwt_aux_scan_right_to_left_16u(T, SA, rm, I, induction_bucket, omp_block_start, omp_block_size);
+ }
+#if defined(_OPENMP)
+ else
+ {
+ {
+ thread_state[omp_thread_num].state.count = libsais16_final_bwt_aux_scan_right_to_left_16u_block_prepare(T, SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size);
+ }
+
+ #pragma omp barrier
+
+ #pragma omp master
+ {
+ /* Redistribute bucket tops across threads (last thread highest). */
+ fast_sint_t t;
+ for (t = omp_num_threads - 1; t >= 0; --t)
+ {
+ sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets;
+ fast_sint_t c; for (c = 0; c < ALPHABET_SIZE; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_bucket[c]; induction_bucket[c] = A - B; temp_bucket[c] = A; }
+ }
+ }
+
+ #pragma omp barrier
+
+ {
+ libsais16_final_bwt_aux_scan_right_to_left_16u_block_place(SA, rm, I, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count);
+ }
+ }
+#endif
+ }
+}
+
+/* Parallel driver for one right-to-left plain-sorting block. Same three-phase
+   choreography (per-thread prepare, master bucket redistribution, per-thread
+   place) as the BWT driver; shares the generic place helper since no BWT/aux
+   output is produced. */
+static void libsais16_final_sorting_scan_right_to_left_16u_block_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+#if defined(_OPENMP)
+ #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * ALPHABET_SIZE && omp_get_dynamic() == 0)
+#endif
+ {
+#if defined(_OPENMP)
+ fast_sint_t omp_thread_num = omp_get_thread_num();
+ fast_sint_t omp_num_threads = omp_get_num_threads();
+#else
+ UNUSED(threads); UNUSED(thread_state);
+
+ fast_sint_t omp_thread_num = 0;
+ fast_sint_t omp_num_threads = 1;
+#endif
+ fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
+ fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+ fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
+
+ omp_block_start += block_start;
+
+ if (omp_num_threads == 1)
+ {
+ libsais16_final_sorting_scan_right_to_left_16u(T, SA, induction_bucket, omp_block_start, omp_block_size);
+ }
+#if defined(_OPENMP)
+ else
+ {
+ {
+ thread_state[omp_thread_num].state.count = libsais16_final_sorting_scan_right_to_left_16u_block_prepare(T, SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size);
+ }
+
+ #pragma omp barrier
+
+ #pragma omp master
+ {
+ /* Redistribute bucket tops across threads (last thread highest). */
+ fast_sint_t t;
+ for (t = omp_num_threads - 1; t >= 0; --t)
+ {
+ sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets;
+ fast_sint_t c; for (c = 0; c < ALPHABET_SIZE; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_bucket[c]; induction_bucket[c] = A - B; temp_bucket[c] = A; }
+ }
+ }
+
+ #pragma omp barrier
+
+ {
+ libsais16_final_order_scan_right_to_left_16u_block_place(SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count);
+ }
+ }
+#endif
+ }
+}
+
+/* Parallel driver for one right-to-left 32-bit block: parallel gather into the
+   shared cache, serial sort by the master (which must see the whole block to
+   chain induction), then parallel compact-and-place back into SA. The cache
+   pointer is rebased by -block_start so it can be indexed with absolute SA
+   positions. Falls back to the serial scan for a single-thread team. */
+static void libsais16_final_sorting_scan_right_to_left_32s_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads)
+{
+#if defined(_OPENMP)
+ #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384)
+#endif
+ {
+#if defined(_OPENMP)
+ fast_sint_t omp_thread_num = omp_get_thread_num();
+ fast_sint_t omp_num_threads = omp_get_num_threads();
+#else
+ UNUSED(threads); UNUSED(cache);
+
+ fast_sint_t omp_thread_num = 0;
+ fast_sint_t omp_num_threads = 1;
+#endif
+ fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
+ fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+ fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
+
+ omp_block_start += block_start;
+
+ if (omp_num_threads == 1)
+ {
+ libsais16_final_sorting_scan_right_to_left_32s(T, SA, buckets, omp_block_start, omp_block_size);
+ }
+#if defined(_OPENMP)
+ else
+ {
+ {
+ libsais16_final_sorting_scan_right_to_left_32s_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size);
+ }
+
+ #pragma omp barrier
+
+ #pragma omp master
+ {
+ libsais16_final_sorting_scan_right_to_left_32s_block_sort(T, buckets, cache - block_start, block_start, block_size);
+ }
+
+ #pragma omp barrier
+
+ {
+ libsais16_compact_and_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size);
+ }
+ }
+#endif
+ }
+}
+
+#endif
+
+/* Final right-to-left induction pass (BWT variant), optionally parallel.
+   Mirrors the left-to-right driver but walks SA from n - 1 downward: zero
+   entries mark the primary index (recorded and skipped), non-zero runs are
+   induced inline when shorter than 32 or via the parallel block driver.
+   Returns the SA position of the primary (entry 0), or -1 if the serial path
+   did not encounter it. */
+static sa_sint_t libsais16_final_bwt_scan_right_to_left_16u_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+ sa_sint_t index = -1;
+
+ if (threads == 1 || n < 65536)
+ {
+ index = libsais16_final_bwt_scan_right_to_left_16u(T, SA, induction_bucket, 0, n);
+ }
+#if defined(_OPENMP)
+ else
+ {
+ fast_sint_t block_start;
+ for (block_start = (fast_sint_t)n - 1; block_start >= 0; )
+ {
+ if (SA[block_start] == 0)
+ {
+ index = (sa_sint_t)block_start--;
+ }
+ else
+ {
+ /* Extend the run downward to the first zero entry or the cache budget. */
+ fast_sint_t block_max_end = block_start - ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads); if (block_max_end < 0) { block_max_end = -1; }
+ fast_sint_t block_end = block_start - 1; while (block_end > block_max_end && SA[block_end] != 0) { block_end--; }
+ fast_sint_t block_size = block_start - block_end;
+
+ if (block_size < 32)
+ {
+ for (; block_start > block_end; block_start -= 1)
+ {
+ sa_sint_t p = SA[block_start]; SA[block_start] = p & SAINT_MAX; if (p > 0) { p--; uint16_t c0 = T[p - (p > 0)], c1 = T[p]; SA[block_start] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? p : t; }
+ }
+ }
+ else
+ {
+ libsais16_final_bwt_scan_right_to_left_16u_block_omp(T, SA, induction_bucket, block_end + 1, block_size, threads, thread_state);
+ block_start = block_end;
+ }
+ }
+ }
+ }
+#else
+ UNUSED(thread_state);
+#endif
+
+ return index;
+}
+
+/* Final right-to-left induction pass (BWT + auxiliary indexes), optionally
+   parallel. Same downward block walk as the BWT driver, with the cache budget
+   halved because the aux prepare phase stores two cache slots per induced
+   entry. The I[] sample array is updated for every induced position p with
+   (p & rm) == 0. */
+static void libsais16_final_bwt_aux_scan_right_to_left_16u_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+ if (threads == 1 || n < 65536)
+ {
+ libsais16_final_bwt_aux_scan_right_to_left_16u(T, SA, rm, I, induction_bucket, 0, n);
+ }
+#if defined(_OPENMP)
+ else
+ {
+ fast_sint_t block_start;
+ for (block_start = (fast_sint_t)n - 1; block_start >= 0; )
+ {
+ if (SA[block_start] == 0)
+ {
+ block_start--;
+ }
+ else
+ {
+ /* Budget divided by 2: prepare uses paired cache slots per entry. */
+ fast_sint_t block_max_end = block_start - ((fast_sint_t)threads) * ((LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads) / 2); if (block_max_end < 0) { block_max_end = -1; }
+ fast_sint_t block_end = block_start - 1; while (block_end > block_max_end && SA[block_end] != 0) { block_end--; }
+ fast_sint_t block_size = block_start - block_end;
+
+ if (block_size < 32)
+ {
+ for (; block_start > block_end; block_start -= 1)
+ {
+ sa_sint_t p = SA[block_start]; SA[block_start] = p & SAINT_MAX; if (p > 0) { p--; uint16_t c0 = T[p - (p > 0)], c1 = T[p]; SA[block_start] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? p : t; if ((p & rm) == 0) { I[p / (rm + 1)] = induction_bucket[T[p]] + 1; } }
+ }
+ }
+ else
+ {
+ libsais16_final_bwt_aux_scan_right_to_left_16u_block_omp(T, SA, rm, I, induction_bucket, block_end + 1, block_size, threads, thread_state);
+ block_start = block_end;
+ }
+ }
+ }
+ }
+#else
+ UNUSED(thread_state);
+#endif
+}
+
+/* Final right-to-left induction pass (plain sorting variant), optionally
+   parallel. Same downward block walk as the BWT driver; zero entries are
+   simply skipped (no primary index to report), and induced entries keep their
+   suffix indexes with the T[p-1] > T[p] tag in the sign bit. */
+static void libsais16_final_sorting_scan_right_to_left_16u_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+ if (threads == 1 || n < 65536)
+ {
+ libsais16_final_sorting_scan_right_to_left_16u(T, SA, induction_bucket, 0, n);
+ }
+#if defined(_OPENMP)
+ else
+ {
+ fast_sint_t block_start;
+ for (block_start = (fast_sint_t)n - 1; block_start >= 0; )
+ {
+ if (SA[block_start] == 0)
+ {
+ block_start--;
+ }
+ else
+ {
+ fast_sint_t block_max_end = block_start - ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads); if (block_max_end < -1) { block_max_end = -1; }
+ fast_sint_t block_end = block_start - 1; while (block_end > block_max_end && SA[block_end] != 0) { block_end--; }
+ fast_sint_t block_size = block_start - block_end;
+
+ if (block_size < 32)
+ {
+ for (; block_start > block_end; block_start -= 1)
+ {
+ sa_sint_t p = SA[block_start]; SA[block_start] = p & SAINT_MAX; if (p > 0) { p--; SA[--induction_bucket[T[p]]] = p | ((sa_sint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1)); }
+ }
+ }
+ else
+ {
+ libsais16_final_sorting_scan_right_to_left_16u_block_omp(T, SA, induction_bucket, block_end + 1, block_size, threads, thread_state);
+ block_start = block_end;
+ }
+ }
+ }
+ }
+#else
+ UNUSED(thread_state);
+#endif
+}
+
+/* Multi-threaded right-to-left final induction for 32-bit alphabets.
+   Unlike the 16u variants, the scan is simply cut into fixed-size chunks
+   (per-thread cache budget); intra-chunk ordering is handled by the block
+   helper using thread_state[0]'s cache. */
+static void libsais16_final_sorting_scan_right_to_left_32s_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+ if (threads == 1 || n < 65536)
+ {
+ /* Single thread or small input: one sequential scan over [0, n). */
+ libsais16_final_sorting_scan_right_to_left_32s(T, SA, induction_bucket, 0, n);
+ }
+#if defined(_OPENMP)
+ else
+ {
+ fast_sint_t block_start, block_end;
+ for (block_start = (fast_sint_t)n - 1; block_start >= 0; block_start = block_end)
+ {
+ /* block_end is exclusive-below: the chunk is [block_end + 1, block_start]. */
+ block_end = block_start - (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end < 0) { block_end = -1; }
+
+ libsais16_final_sorting_scan_right_to_left_32s_block_omp(T, SA, induction_bucket, thread_state[0].state.cache, block_end + 1, block_start - block_end, threads);
+ }
+ }
+#else
+ UNUSED(thread_state);
+#endif
+}
+
+/* Zeroes the SA slots of every non-empty bucket interval
+   [bucket_start[i], bucket_end[i]). Buckets are cleared in parallel when
+   OpenMP is available and the input is large enough. */
+static void libsais16_clear_lms_suffixes_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT bucket_start, sa_sint_t * RESTRICT bucket_end, sa_sint_t threads)
+{
+ fast_sint_t i;
+
+#if defined(_OPENMP)
+ #pragma omp parallel for schedule(static, 1) num_threads(threads) if(threads > 1 && n >= 65536)
+#else
+ UNUSED(threads); UNUSED(n);
+#endif
+ for (i = 0; i < k; ++i)
+ {
+ fast_sint_t interval_begin = bucket_start[i];
+ fast_sint_t interval_end = bucket_end[i];
+
+ if (interval_end > interval_begin)
+ {
+ memset(&SA[interval_begin], 0, ((size_t)interval_end - (size_t)interval_begin) * sizeof(sa_sint_t));
+ }
+ }
+}
+
+/* Runs the two final induction passes over the 16-bit input, selecting the
+   suffix-array, BWT+aux-index, or plain BWT variant. Between the passes the
+   multi-threaded path re-clears the bucket intervals. Returns 0 for the
+   first two variants, or the result of the right-to-left BWT scan. */
+static sa_sint_t libsais16_induce_final_order_16u_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t bwt, sa_sint_t r, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+ sa_sint_t * RESTRICT ltr_buckets = &buckets[6 * ALPHABET_SIZE];
+ sa_sint_t * RESTRICT rtl_buckets = &buckets[7 * ALPHABET_SIZE];
+
+ if (!bwt)
+ {
+ /* Plain suffix array construction. */
+ libsais16_final_sorting_scan_left_to_right_16u_omp(T, SA, n, ltr_buckets, threads, thread_state);
+ if (threads > 1 && n >= 65536) { libsais16_clear_lms_suffixes_omp(SA, n, ALPHABET_SIZE, ltr_buckets, rtl_buckets, threads); }
+ libsais16_final_sorting_scan_right_to_left_16u_omp(T, SA, n, rtl_buckets, threads, thread_state);
+ return 0;
+ }
+
+ if (I != NULL)
+ {
+ /* BWT with auxiliary indexes sampled every r positions. */
+ libsais16_final_bwt_aux_scan_left_to_right_16u_omp(T, SA, n, r - 1, I, ltr_buckets, threads, thread_state);
+ if (threads > 1 && n >= 65536) { libsais16_clear_lms_suffixes_omp(SA, n, ALPHABET_SIZE, ltr_buckets, rtl_buckets, threads); }
+ libsais16_final_bwt_aux_scan_right_to_left_16u_omp(T, SA, n, r - 1, I, rtl_buckets, threads, thread_state);
+ return 0;
+ }
+
+ /* Plain BWT: the right-to-left scan produces the return value. */
+ libsais16_final_bwt_scan_left_to_right_16u_omp(T, SA, n, ltr_buckets, threads, thread_state);
+ if (threads > 1 && n >= 65536) { libsais16_clear_lms_suffixes_omp(SA, n, ALPHABET_SIZE, ltr_buckets, rtl_buckets, threads); }
+ return libsais16_final_bwt_scan_right_to_left_16u_omp(T, SA, n, rtl_buckets, threads, thread_state);
+}
+
+/* Final induction for the 32s/6k layout: the left-to-right pass uses bucket
+   row 4, the right-to-left pass uses bucket row 5. */
+static void libsais16_induce_final_order_32s_6k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+ sa_sint_t * RESTRICT ltr_buckets = &buckets[4 * k];
+ sa_sint_t * RESTRICT rtl_buckets = &buckets[5 * k];
+
+ libsais16_final_sorting_scan_left_to_right_32s_omp(T, SA, n, ltr_buckets, threads, thread_state);
+ libsais16_final_sorting_scan_right_to_left_32s_omp(T, SA, n, rtl_buckets, threads, thread_state);
+}
+
+/* Final induction for the 32s/4k layout: the left-to-right pass uses bucket
+   row 2, the right-to-left pass uses bucket row 3. */
+static void libsais16_induce_final_order_32s_4k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+ sa_sint_t * RESTRICT ltr_buckets = &buckets[2 * k];
+ sa_sint_t * RESTRICT rtl_buckets = &buckets[3 * k];
+
+ libsais16_final_sorting_scan_left_to_right_32s_omp(T, SA, n, ltr_buckets, threads, thread_state);
+ libsais16_final_sorting_scan_right_to_left_32s_omp(T, SA, n, rtl_buckets, threads, thread_state);
+}
+
+/* Final induction for the 32s/2k layout: the left-to-right pass uses bucket
+   row 1, the right-to-left pass uses bucket row 0. */
+static void libsais16_induce_final_order_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+ sa_sint_t * RESTRICT ltr_buckets = &buckets[1 * k];
+ sa_sint_t * RESTRICT rtl_buckets = &buckets[0 * k];
+
+ libsais16_final_sorting_scan_left_to_right_32s_omp(T, SA, n, ltr_buckets, threads, thread_state);
+ libsais16_final_sorting_scan_right_to_left_32s_omp(T, SA, n, rtl_buckets, threads, thread_state);
+}
+
+/* Final induction for the 32s/1k layout: only a single bucket row fits, so
+   the symbol histogram is recounted and re-initialized before each of the
+   two induction passes (start offsets for the left-to-right pass, end
+   offsets for the right-to-left pass). */
+static void libsais16_induce_final_order_32s_1k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+ libsais16_count_suffixes_32s(T, n, k, buckets);
+ libsais16_initialize_buckets_start_32s_1k(k, buckets);
+ libsais16_final_sorting_scan_left_to_right_32s_omp(T, SA, n, buckets, threads, thread_state);
+
+ libsais16_count_suffixes_32s(T, n, k, buckets);
+ libsais16_initialize_buckets_end_32s_1k(k, buckets);
+ libsais16_final_sorting_scan_right_to_left_32s_omp(T, SA, n, buckets, threads, thread_state);
+}
+
+/* Renumbers the LMS-substring names stored in SA[m + (p >> 1)] for every
+   LMS position p listed in SA[omp_block_start .. omp_block_start +
+   omp_block_size). A negative name marks a substring that is unique: its
+   text position gets the sign bit set on T[p] and the entry receives a
+   fresh name derived from its rank i. All names are shifted down by f, the
+   running count of unique substrings (seeded with the caller's prefix
+   count). Returns the updated f. */
+static sa_sint_t libsais16_renumber_unique_and_nonunique_lms_suffixes_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t f, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ sa_sint_t * RESTRICT SAm = &SA[m];
+
+ /* Main loop, unrolled by 4 with a two-stage prefetch pipeline:
+ first the SA entries, then the name slots they point at. */
+ sa_sint_t i, j;
+ for (i = (sa_sint_t)omp_block_start, j = (sa_sint_t)omp_block_start + (sa_sint_t)omp_block_size - 2 * (sa_sint_t)prefetch_distance - 3; i < j; i += 4)
+ {
+ libsais16_prefetch(&SA[i + 3 * prefetch_distance]);
+
+ libsais16_prefetchw(&SAm[((sa_uint_t)SA[i + 2 * prefetch_distance + 0]) >> 1]);
+ libsais16_prefetchw(&SAm[((sa_uint_t)SA[i + 2 * prefetch_distance + 1]) >> 1]);
+ libsais16_prefetchw(&SAm[((sa_uint_t)SA[i + 2 * prefetch_distance + 2]) >> 1]);
+ libsais16_prefetchw(&SAm[((sa_uint_t)SA[i + 2 * prefetch_distance + 3]) >> 1]);
+
+ /* Conditionally prefetch T[q] only for entries that will take the
+ unique branch (name slot negative); NULL prefetches are no-ops. */
+ sa_uint_t q0 = (sa_uint_t)SA[i + prefetch_distance + 0]; const sa_sint_t * Tq0 = &T[q0]; libsais16_prefetchw(SAm[q0 >> 1] < 0 ? Tq0 : NULL);
+ sa_uint_t q1 = (sa_uint_t)SA[i + prefetch_distance + 1]; const sa_sint_t * Tq1 = &T[q1]; libsais16_prefetchw(SAm[q1 >> 1] < 0 ? Tq1 : NULL);
+ sa_uint_t q2 = (sa_uint_t)SA[i + prefetch_distance + 2]; const sa_sint_t * Tq2 = &T[q2]; libsais16_prefetchw(SAm[q2 >> 1] < 0 ? Tq2 : NULL);
+ sa_uint_t q3 = (sa_uint_t)SA[i + prefetch_distance + 3]; const sa_sint_t * Tq3 = &T[q3]; libsais16_prefetchw(SAm[q3 >> 1] < 0 ? Tq3 : NULL);
+
+ sa_uint_t p0 = (sa_uint_t)SA[i + 0]; sa_sint_t s0 = SAm[p0 >> 1]; if (s0 < 0) { T[p0] |= SAINT_MIN; f++; s0 = i + 0 + SAINT_MIN + f; } SAm[p0 >> 1] = s0 - f;
+ sa_uint_t p1 = (sa_uint_t)SA[i + 1]; sa_sint_t s1 = SAm[p1 >> 1]; if (s1 < 0) { T[p1] |= SAINT_MIN; f++; s1 = i + 1 + SAINT_MIN + f; } SAm[p1 >> 1] = s1 - f;
+ sa_uint_t p2 = (sa_uint_t)SA[i + 2]; sa_sint_t s2 = SAm[p2 >> 1]; if (s2 < 0) { T[p2] |= SAINT_MIN; f++; s2 = i + 2 + SAINT_MIN + f; } SAm[p2 >> 1] = s2 - f;
+ sa_uint_t p3 = (sa_uint_t)SA[i + 3]; sa_sint_t s3 = SAm[p3 >> 1]; if (s3 < 0) { T[p3] |= SAINT_MIN; f++; s3 = i + 3 + SAINT_MIN + f; } SAm[p3 >> 1] = s3 - f;
+ }
+
+ /* Scalar tail for the last 2 * prefetch_distance + 3 entries. */
+ for (j += 2 * (sa_sint_t)prefetch_distance + 3; i < j; i += 1)
+ {
+ sa_uint_t p = (sa_uint_t)SA[i]; sa_sint_t s = SAm[p >> 1]; if (s < 0) { T[p] |= SAINT_MIN; f++; s = i + SAINT_MIN + f; } SAm[p >> 1] = s - f;
+ }
+
+ return f;
+}
+
+/* Compacts the renumbered entries of SA[m + block] into two streams that
+   both alias SA, using descending cursors l (unique: entries with the sign
+   bit set, stored masked) and r (non-unique: positive entries p stored as
+   p - 1). Writes are unconditional and the cursor only advances when its
+   predicate holds — a branch-free filtering idiom. The scan runs backwards
+   to keep relative order. Final cursor positions are returned via *pl/*pr. */
+static void libsais16_compact_unique_and_nonunique_lms_suffixes_32s(sa_sint_t * RESTRICT SA, sa_sint_t m, fast_sint_t * pl, fast_sint_t * pr, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ sa_sint_t * RESTRICT SAl = &SA[0];
+ sa_sint_t * RESTRICT SAr = &SA[0];
+
+ fast_sint_t i, j, l = *pl - 1, r = *pr - 1;
+ for (i = (fast_sint_t)m + omp_block_start + omp_block_size - 1, j = (fast_sint_t)m + omp_block_start + 3; i >= j; i -= 4)
+ {
+ libsais16_prefetch(&SA[i - prefetch_distance]);
+
+ sa_sint_t p0 = SA[i - 0]; SAl[l] = p0 & SAINT_MAX; l -= p0 < 0; SAr[r] = p0 - 1; r -= p0 > 0;
+ sa_sint_t p1 = SA[i - 1]; SAl[l] = p1 & SAINT_MAX; l -= p1 < 0; SAr[r] = p1 - 1; r -= p1 > 0;
+ sa_sint_t p2 = SA[i - 2]; SAl[l] = p2 & SAINT_MAX; l -= p2 < 0; SAr[r] = p2 - 1; r -= p2 > 0;
+ sa_sint_t p3 = SA[i - 3]; SAl[l] = p3 & SAINT_MAX; l -= p3 < 0; SAr[r] = p3 - 1; r -= p3 > 0;
+ }
+
+ /* Scalar tail for the last up-to-3 entries. */
+ for (j -= 3; i >= j; i -= 1)
+ {
+ sa_sint_t p = SA[i]; SAl[l] = p & SAINT_MAX; l -= p < 0; SAr[r] = p - 1; r -= p > 0;
+ }
+
+ *pl = l + 1; *pr = r + 1;
+}
+
+
+#if defined(_OPENMP)
+
+/* Counts how many entries of SA[omp_block_start .. +omp_block_size) point
+   at a negative (unique) name slot in SA[m + (p >> 1)]. Four independent
+   accumulators break the loop-carried dependency of a single counter. */
+static sa_sint_t libsais16_count_unique_suffixes(sa_sint_t * RESTRICT SA, sa_sint_t m, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ sa_sint_t * RESTRICT SAm = &SA[m];
+
+ fast_sint_t i, j; sa_sint_t f0 = 0, f1 = 0, f2 = 0, f3 = 0;
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4)
+ {
+ libsais16_prefetch(&SA[i + 2 * prefetch_distance]);
+
+ libsais16_prefetch(&SAm[((sa_uint_t)SA[i + prefetch_distance + 0]) >> 1]);
+ libsais16_prefetch(&SAm[((sa_uint_t)SA[i + prefetch_distance + 1]) >> 1]);
+ libsais16_prefetch(&SAm[((sa_uint_t)SA[i + prefetch_distance + 2]) >> 1]);
+ libsais16_prefetch(&SAm[((sa_uint_t)SA[i + prefetch_distance + 3]) >> 1]);
+
+ f0 += SAm[((sa_uint_t)SA[i + 0]) >> 1] < 0;
+ f1 += SAm[((sa_uint_t)SA[i + 1]) >> 1] < 0;
+ f2 += SAm[((sa_uint_t)SA[i + 2]) >> 1] < 0;
+ f3 += SAm[((sa_uint_t)SA[i + 3]) >> 1] < 0;
+ }
+
+ /* Scalar tail. */
+ for (j += prefetch_distance + 3; i < j; i += 1)
+ {
+ f0 += SAm[((sa_uint_t)SA[i]) >> 1] < 0;
+ }
+
+ return f0 + f1 + f2 + f3;
+}
+
+#endif
+
+/* Parallel driver for the renumbering pass. Each thread first counts the
+   unique names in its block; after a barrier each thread renumbers its
+   block starting from the prefix sum of the counts of the threads before
+   it, so per-thread f values line up globally. The last thread computes
+   the grand total f, which is returned. */
+static sa_sint_t libsais16_renumber_unique_and_nonunique_lms_suffixes_32s_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+ sa_sint_t f = 0;
+
+#if defined(_OPENMP)
+ #pragma omp parallel num_threads(threads) if(threads > 1 && m >= 65536)
+#endif
+ {
+#if defined(_OPENMP)
+ fast_sint_t omp_thread_num = omp_get_thread_num();
+ fast_sint_t omp_num_threads = omp_get_num_threads();
+#else
+ UNUSED(threads); UNUSED(thread_state);
+
+ fast_sint_t omp_thread_num = 0;
+ fast_sint_t omp_num_threads = 1;
+#endif
+ /* Equal 16-aligned blocks; the last thread absorbs the remainder. */
+ fast_sint_t omp_block_stride = (m / omp_num_threads) & (-16);
+ fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+ fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : m - omp_block_start;
+
+ if (omp_num_threads == 1)
+ {
+ f = libsais16_renumber_unique_and_nonunique_lms_suffixes_32s(T, SA, m, 0, omp_block_start, omp_block_size);
+ }
+#if defined(_OPENMP)
+ else
+ {
+ /* Phase 1: per-thread unique count. */
+ {
+ thread_state[omp_thread_num].state.count = libsais16_count_unique_suffixes(SA, m, omp_block_start, omp_block_size);
+ }
+
+ #pragma omp barrier
+
+ /* Phase 2: renumber from this thread's prefix count. */
+ {
+ fast_sint_t t, count = 0; for (t = 0; t < omp_thread_num; ++t) { count += thread_state[t].state.count; }
+
+ if (omp_thread_num == omp_num_threads - 1)
+ {
+ /* Only the last thread sees the full total; publish it. */
+ f = (sa_sint_t)(count + thread_state[omp_thread_num].state.count);
+ }
+
+ libsais16_renumber_unique_and_nonunique_lms_suffixes_32s(T, SA, m, (sa_sint_t)count, omp_block_start, omp_block_size);
+ }
+ }
+#endif
+ }
+
+ return f;
+}
+
+/* Parallel driver for the compaction pass over SA[m .. m + n/2). Each
+   thread compacts its block in place with cursors seeded at its block end;
+   after a barrier the master thread concatenates the per-thread unique and
+   non-unique streams downwards towards SA[m] and SA[n + fs] respectively.
+   Finally the f unique entries (which end at SA[m]) are copied to just
+   below SA[n + fs - m], in front of the non-unique group. */
+static void libsais16_compact_unique_and_nonunique_lms_suffixes_32s_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs, sa_sint_t f, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+#if defined(_OPENMP)
+ #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 131072 && m < fs)
+#endif
+ {
+#if defined(_OPENMP)
+ fast_sint_t omp_thread_num = omp_get_thread_num();
+ fast_sint_t omp_num_threads = omp_get_num_threads();
+#else
+ UNUSED(threads); UNUSED(thread_state);
+
+ fast_sint_t omp_thread_num = 0;
+ fast_sint_t omp_num_threads = 1;
+#endif
+ /* Blocks cover the n/2 name slots; 16-aligned, remainder to last thread. */
+ fast_sint_t omp_block_stride = (((fast_sint_t)n >> 1) / omp_num_threads) & (-16);
+ fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+ fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : ((fast_sint_t)n >> 1) - omp_block_start;
+
+ if (omp_num_threads == 1)
+ {
+ fast_sint_t l = m, r = (fast_sint_t)n + (fast_sint_t)fs;
+ libsais16_compact_unique_and_nonunique_lms_suffixes_32s(SA, m, &l, &r, omp_block_start, omp_block_size);
+ }
+#if defined(_OPENMP)
+ else
+ {
+ /* Phase 1: per-thread in-place compaction; the final cursor values are
+ kept in thread_state (position = unique, count = non-unique). */
+ {
+ thread_state[omp_thread_num].state.position = (fast_sint_t)m + ((fast_sint_t)n >> 1) + omp_block_start + omp_block_size;
+ thread_state[omp_thread_num].state.count = (fast_sint_t)m + omp_block_start + omp_block_size;
+
+ libsais16_compact_unique_and_nonunique_lms_suffixes_32s(SA, m, &thread_state[omp_thread_num].state.position, &thread_state[omp_thread_num].state.count, omp_block_start, omp_block_size);
+ }
+
+ #pragma omp barrier
+
+ /* Phase 2 (master only): stitch the per-thread streams together,
+ walking the threads from last to first. */
+ #pragma omp master
+ {
+ fast_sint_t t, position;
+
+ for (position = m, t = omp_num_threads - 1; t >= 0; --t)
+ {
+ fast_sint_t omp_block_end = t < omp_num_threads - 1 ? omp_block_stride * (t + 1) : ((fast_sint_t)n >> 1);
+ fast_sint_t count = ((fast_sint_t)m + ((fast_sint_t)n >> 1) + omp_block_end - thread_state[t].state.position);
+
+ if (count > 0)
+ {
+ position -= count; memcpy(&SA[position], &SA[thread_state[t].state.position], (size_t)count * sizeof(sa_sint_t));
+ }
+ }
+
+ for (position = (fast_sint_t)n + (fast_sint_t)fs, t = omp_num_threads - 1; t >= 0; --t)
+ {
+ fast_sint_t omp_block_end = t < omp_num_threads - 1 ? omp_block_stride * (t + 1) : ((fast_sint_t)n >> 1);
+ fast_sint_t count = ((fast_sint_t)m + omp_block_end - thread_state[t].state.count);
+
+ if (count > 0)
+ {
+ position -= count; memcpy(&SA[position], &SA[thread_state[t].state.count], (size_t)count * sizeof(sa_sint_t));
+ }
+ }
+ }
+ }
+#endif
+ }
+
+ /* Move the f unique entries in front of the non-unique group at the end
+ of the free space. */
+ memcpy(&SA[(fast_sint_t)n + (fast_sint_t)fs - (fast_sint_t)m], &SA[(fast_sint_t)m - (fast_sint_t)f], (size_t)f * sizeof(sa_sint_t));
+}
+
+/* Renumbers the LMS-substring names, then compacts the unique and
+   non-unique LMS suffixes to the tail of the SA workspace. Returns the
+   number of unique LMS suffixes found. */
+static sa_sint_t libsais16_compact_lms_suffixes_32s_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+ const sa_sint_t unique_count = libsais16_renumber_unique_and_nonunique_lms_suffixes_32s_omp(T, SA, m, threads, thread_state);
+
+ libsais16_compact_unique_and_nonunique_lms_suffixes_32s_omp(SA, n, m, fs, unique_count, threads, thread_state);
+
+ return unique_count;
+}
+
+/* Scatters the unique LMS suffixes back into SA. Scans T[block] for
+   positions whose sign bit was set by the renumbering pass, clears the
+   bit, and stores the position at the SA slot named by the next entry of
+   the stream starting at SA[n - m - 1 + l]. After a hit, i is advanced an
+   extra step — marked positions are evidently never adjacent (LMS
+   positions are at least two apart; TODO confirm). */
+static void libsais16_merge_unique_lms_suffixes_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, fast_sint_t l, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ const sa_sint_t * RESTRICT SAnm = &SA[(fast_sint_t)n - (fast_sint_t)m - 1 + l];
+
+ /* Unrolled by 4; the bound leaves a 6-entry safety margin because a hit
+ consumes two positions. */
+ sa_sint_t i, j; fast_sint_t tmp = *SAnm++;
+ for (i = (sa_sint_t)omp_block_start, j = (sa_sint_t)omp_block_start + (sa_sint_t)omp_block_size - 6; i < j; i += 4)
+ {
+ libsais16_prefetch(&T[i + prefetch_distance]);
+
+ sa_sint_t c0 = T[i + 0]; if (c0 < 0) { T[i + 0] = c0 & SAINT_MAX; SA[tmp] = i + 0; i++; tmp = *SAnm++; }
+ sa_sint_t c1 = T[i + 1]; if (c1 < 0) { T[i + 1] = c1 & SAINT_MAX; SA[tmp] = i + 1; i++; tmp = *SAnm++; }
+ sa_sint_t c2 = T[i + 2]; if (c2 < 0) { T[i + 2] = c2 & SAINT_MAX; SA[tmp] = i + 2; i++; tmp = *SAnm++; }
+ sa_sint_t c3 = T[i + 3]; if (c3 < 0) { T[i + 3] = c3 & SAINT_MAX; SA[tmp] = i + 3; i++; tmp = *SAnm++; }
+ }
+
+ /* Scalar tail. */
+ for (j += 6; i < j; i += 1)
+ {
+ sa_sint_t c = T[i]; if (c < 0) { T[i] = c & SAINT_MAX; SA[tmp] = i; i++; tmp = *SAnm++; }
+ }
+}
+
+/* Fills the zero slots of SA[omp_block_start .. +omp_block_size) — the
+   positions of non-unique LMS suffixes — with successive values of the
+   stream starting at SA[n - m - 1 + l]. */
+static void libsais16_merge_nonunique_lms_suffixes_32s(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, fast_sint_t l, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ const sa_sint_t * RESTRICT SAnm = &SA[(fast_sint_t)n - (fast_sint_t)m - 1 + l];
+
+ fast_sint_t i, j; sa_sint_t tmp = *SAnm++;
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - 3; i < j; i += 4)
+ {
+ libsais16_prefetch(&SA[i + prefetch_distance]);
+
+ if (SA[i + 0] == 0) { SA[i + 0] = tmp; tmp = *SAnm++; }
+ if (SA[i + 1] == 0) { SA[i + 1] = tmp; tmp = *SAnm++; }
+ if (SA[i + 2] == 0) { SA[i + 2] = tmp; tmp = *SAnm++; }
+ if (SA[i + 3] == 0) { SA[i + 3] = tmp; tmp = *SAnm++; }
+ }
+
+ /* Scalar tail for the last up-to-3 entries. */
+ for (j += 3; i < j; i += 1)
+ {
+ if (SA[i] == 0) { SA[i] = tmp; tmp = *SAnm++; }
+ }
+}
+
+/* Parallel driver for merging unique LMS suffixes: each thread counts the
+   negative-marked positions of T in its block, a barrier makes the counts
+   visible, then each thread merges its block starting at the prefix sum of
+   the preceding threads' counts (its offset into the SAnm stream). */
+static void libsais16_merge_unique_lms_suffixes_32s_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+#if defined(_OPENMP)
+ #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536)
+#endif
+ {
+#if defined(_OPENMP)
+ fast_sint_t omp_thread_num = omp_get_thread_num();
+ fast_sint_t omp_num_threads = omp_get_num_threads();
+#else
+ UNUSED(threads); UNUSED(thread_state);
+
+ fast_sint_t omp_thread_num = 0;
+ fast_sint_t omp_num_threads = 1;
+#endif
+ /* Equal 16-aligned blocks over T; remainder to the last thread. */
+ fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
+ fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+ fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;
+
+ if (omp_num_threads == 1)
+ {
+ libsais16_merge_unique_lms_suffixes_32s(T, SA, n, m, 0, omp_block_start, omp_block_size);
+ }
+#if defined(_OPENMP)
+ else
+ {
+ /* Phase 1: per-thread count of marked positions. */
+ {
+ thread_state[omp_thread_num].state.count = libsais16_count_negative_marked_suffixes(T, omp_block_start, omp_block_size);
+ }
+
+ #pragma omp barrier
+
+ /* Phase 2: merge from this thread's stream offset. */
+ {
+ fast_sint_t t, count = 0; for (t = 0; t < omp_thread_num; ++t) { count += thread_state[t].state.count; }
+
+ libsais16_merge_unique_lms_suffixes_32s(T, SA, n, m, count, omp_block_start, omp_block_size);
+ }
+ }
+#endif
+ }
+}
+
+/* Parallel driver for merging non-unique LMS suffixes: each thread counts
+   the zero slots in its block of SA, then merges its block with a stream
+   offset of f (the number of unique suffixes, which precede the
+   non-unique ones in the stream) plus the preceding threads' counts. */
+static void libsais16_merge_nonunique_lms_suffixes_32s_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t f, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+#if defined(_OPENMP)
+ #pragma omp parallel num_threads(threads) if(threads > 1 && m >= 65536)
+#endif
+ {
+#if defined(_OPENMP)
+ fast_sint_t omp_thread_num = omp_get_thread_num();
+ fast_sint_t omp_num_threads = omp_get_num_threads();
+#else
+ UNUSED(threads); UNUSED(thread_state);
+
+ fast_sint_t omp_thread_num = 0;
+ fast_sint_t omp_num_threads = 1;
+#endif
+ /* Equal 16-aligned blocks over SA[0..m); remainder to the last thread. */
+ fast_sint_t omp_block_stride = (m / omp_num_threads) & (-16);
+ fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+ fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : m - omp_block_start;
+
+ if (omp_num_threads == 1)
+ {
+ libsais16_merge_nonunique_lms_suffixes_32s(SA, n, m, f, omp_block_start, omp_block_size);
+ }
+#if defined(_OPENMP)
+ else
+ {
+ /* Phase 1: per-thread count of zero slots. */
+ {
+ thread_state[omp_thread_num].state.count = libsais16_count_zero_marked_suffixes(SA, omp_block_start, omp_block_size);
+ }
+
+ #pragma omp barrier
+
+ /* Phase 2: merge from f plus this thread's prefix count. */
+ {
+ fast_sint_t t, count = f; for (t = 0; t < omp_thread_num; ++t) { count += thread_state[t].state.count; }
+
+ libsais16_merge_nonunique_lms_suffixes_32s(SA, n, m, count, omp_block_start, omp_block_size);
+ }
+ }
+#endif
+ }
+}
+
+/* Merges the recursively sorted LMS suffixes back into SA: first the f
+   unique suffixes (marked in T), then the remaining non-unique ones. The
+   order matters — the unique pass consumes the head of the shared stream. */
+static void libsais16_merge_compacted_lms_suffixes_32s_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t f, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+ libsais16_merge_unique_lms_suffixes_32s_omp(T, SA, n, m, threads, thread_state);
+ libsais16_merge_nonunique_lms_suffixes_32s_omp(SA, n, m, f, threads, thread_state);
+}
+
+/* Restores the full LMS suffix ordering after the recursive call (2k bucket
+   layout). When f > 0 unique suffixes were compacted out, they are first
+   moved back next to the reduced problem, the compacted suffixes are
+   re-gathered and mapped through the recursion result, and both groups are
+   merged; otherwise a plain gather + reconstruction suffices. */
+static void libsais16_reconstruct_compacted_lms_suffixes_32s_2k_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m, sa_sint_t fs, sa_sint_t f, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+ if (f > 0)
+ {
+ memmove(&SA[n - m - 1], &SA[n + fs - m], (size_t)f * sizeof(sa_sint_t));
+
+ libsais16_count_and_gather_compacted_lms_suffixes_32s_2k_omp(T, SA, n, k, buckets, threads, thread_state);
+ libsais16_reconstruct_lms_suffixes_omp(SA, n, m - f, threads);
+
+ /* Stash the m - f reconstructed suffixes behind the f unique ones, then
+ clear the front of SA for the merge. */
+ memcpy(&SA[n - m - 1 + f], &SA[0], ((size_t)m - (size_t)f) * sizeof(sa_sint_t));
+ memset(&SA[0], 0, (size_t)m * sizeof(sa_sint_t));
+
+ libsais16_merge_compacted_lms_suffixes_32s_omp(T, SA, n, m, f, threads, thread_state);
+ }
+ else
+ {
+ libsais16_count_and_gather_lms_suffixes_32s_2k(T, SA, n, k, buckets, 0, n);
+ libsais16_reconstruct_lms_suffixes_omp(SA, n, m, threads);
+ }
+}
+
+/* Restores the full LMS suffix ordering after the recursive call (1k
+   layout — no bucket table available, so the plain gather variants are
+   used). Same structure as the 2k variant above. */
+static void libsais16_reconstruct_compacted_lms_suffixes_32s_1k_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs, sa_sint_t f, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+ if (f > 0)
+ {
+ memmove(&SA[n - m - 1], &SA[n + fs - m], (size_t)f * sizeof(sa_sint_t));
+
+ libsais16_gather_compacted_lms_suffixes_32s(T, SA, n);
+ libsais16_reconstruct_lms_suffixes_omp(SA, n, m - f, threads);
+
+ /* Stash the m - f reconstructed suffixes behind the f unique ones, then
+ clear the front of SA for the merge. */
+ memcpy(&SA[n - m - 1 + f], &SA[0], ((size_t)m - (size_t)f) * sizeof(sa_sint_t));
+ memset(&SA[0], 0, (size_t)m * sizeof(sa_sint_t));
+
+ libsais16_merge_compacted_lms_suffixes_32s_omp(T, SA, n, m, f, threads, thread_state);
+ }
+ else
+ {
+ libsais16_gather_lms_suffixes_32s(T, SA, n);
+ libsais16_reconstruct_lms_suffixes_omp(SA, n, m, threads);
+ }
+}
+
+/* Recursive SA-IS core for 32-bit alphabets. Chooses a strategy based on
+   how much free space fs is available relative to the alphabet size k:
+   6k, 4k or 2k bucket tables are carved out of the tail of SA; otherwise a
+   single-k table is used (heap-allocated if even k words do not fit).
+   Returns 0 on success, -2 on allocation failure in a nested call. */
+static sa_sint_t libsais16_main_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t fs, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+ if (k > 0 && fs / k >= 6)
+ {
+ /* 6k strategy. Align the bucket table to 1024 (or 16) words when the
+ extra space allows, to reduce cache aliasing. */
+ sa_sint_t alignment = (fs - 1024) / k >= 6 ? 1024 : 16;
+ sa_sint_t * RESTRICT buckets = (fs - alignment) / k >= 6 ? (sa_sint_t *)libsais16_align_up(&SA[n + fs - 6 * k - alignment], (size_t)alignment * sizeof(sa_sint_t)) : &SA[n + fs - 6 * k];
+
+ sa_sint_t m = libsais16_count_and_gather_lms_suffixes_32s_4k_omp(T, SA, n, k, buckets, threads, thread_state);
+ if (m > 1)
+ {
+ memset(SA, 0, ((size_t)n - (size_t)m) * sizeof(sa_sint_t));
+
+ sa_sint_t first_lms_suffix = SA[n - m];
+ sa_sint_t left_suffixes_count = libsais16_initialize_buckets_for_lms_suffixes_radix_sort_32s_6k(T, k, buckets, first_lms_suffix);
+
+ libsais16_radix_sort_lms_suffixes_32s_6k_omp(T, SA, n, m, &buckets[4 * k], threads, thread_state);
+ libsais16_radix_sort_set_markers_32s_6k_omp(SA, k, &buckets[4 * k], threads);
+
+ if (threads > 1 && n >= 65536) { memset(&SA[(fast_sint_t)n - (fast_sint_t)m], 0, (size_t)m * sizeof(sa_sint_t)); }
+
+ libsais16_initialize_buckets_for_partial_sorting_32s_6k(T, k, buckets, first_lms_suffix, left_suffixes_count);
+ libsais16_induce_partial_order_32s_6k_omp(T, SA, n, k, buckets, first_lms_suffix, left_suffixes_count, threads, thread_state);
+
+ /* If LMS-substring names are not yet unique, compact and recurse on
+ the reduced problem, then map the result back. */
+ sa_sint_t names = libsais16_renumber_and_mark_distinct_lms_suffixes_32s_4k_omp(SA, n, m, threads, thread_state);
+ if (names < m)
+ {
+ sa_sint_t f = libsais16_compact_lms_suffixes_32s_omp(T, SA, n, m, fs, threads, thread_state);
+
+ if (libsais16_main_32s(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f, threads, thread_state) != 0)
+ {
+ return -2;
+ }
+
+ libsais16_reconstruct_compacted_lms_suffixes_32s_2k_omp(T, SA, n, k, m, fs, f, buckets, threads, thread_state);
+ }
+ else
+ {
+ libsais16_count_lms_suffixes_32s_2k(T, n, k, buckets);
+ }
+
+ libsais16_initialize_buckets_start_and_end_32s_4k(k, buckets);
+ libsais16_place_lms_suffixes_histogram_32s_4k(SA, n, k, m, buckets);
+ libsais16_induce_final_order_32s_4k(T, SA, n, k, buckets, threads, thread_state);
+ }
+ else
+ {
+ /* At most one LMS suffix: place it and induce directly. */
+ SA[0] = SA[n - 1];
+
+ libsais16_initialize_buckets_start_and_end_32s_6k(k, buckets);
+ libsais16_place_lms_suffixes_histogram_32s_6k(SA, n, k, m, buckets);
+ libsais16_induce_final_order_32s_6k(T, SA, n, k, buckets, threads, thread_state);
+ }
+
+ return 0;
+ }
+ else if (k > 0 && fs / k >= 4)
+ {
+ /* 4k strategy. */
+ sa_sint_t alignment = (fs - 1024) / k >= 4 ? 1024 : 16;
+ sa_sint_t * RESTRICT buckets = (fs - alignment) / k >= 4 ? (sa_sint_t *)libsais16_align_up(&SA[n + fs - 4 * k - alignment], (size_t)alignment * sizeof(sa_sint_t)) : &SA[n + fs - 4 * k];
+
+ sa_sint_t m = libsais16_count_and_gather_lms_suffixes_32s_2k_omp(T, SA, n, k, buckets, threads, thread_state);
+ if (m > 1)
+ {
+ libsais16_initialize_buckets_for_radix_and_partial_sorting_32s_4k(T, k, buckets, SA[n - m]);
+
+ libsais16_radix_sort_lms_suffixes_32s_2k_omp(T, SA, n, m, &buckets[1], threads, thread_state);
+ libsais16_radix_sort_set_markers_32s_4k_omp(SA, k, &buckets[1], threads);
+
+ libsais16_place_lms_suffixes_interval_32s_4k(SA, n, k, m - 1, buckets);
+ libsais16_induce_partial_order_32s_4k_omp(T, SA, n, k, buckets, threads, thread_state);
+
+ /* Recurse when names are not yet unique (see 6k branch). */
+ sa_sint_t names = libsais16_renumber_and_mark_distinct_lms_suffixes_32s_4k_omp(SA, n, m, threads, thread_state);
+ if (names < m)
+ {
+ sa_sint_t f = libsais16_compact_lms_suffixes_32s_omp(T, SA, n, m, fs, threads, thread_state);
+
+ if (libsais16_main_32s(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f, threads, thread_state) != 0)
+ {
+ return -2;
+ }
+
+ libsais16_reconstruct_compacted_lms_suffixes_32s_2k_omp(T, SA, n, k, m, fs, f, buckets, threads, thread_state);
+ }
+ else
+ {
+ libsais16_count_lms_suffixes_32s_2k(T, n, k, buckets);
+ }
+ }
+ else
+ {
+ SA[0] = SA[n - 1];
+ }
+
+ libsais16_initialize_buckets_start_and_end_32s_4k(k, buckets);
+ libsais16_place_lms_suffixes_histogram_32s_4k(SA, n, k, m, buckets);
+ libsais16_induce_final_order_32s_4k(T, SA, n, k, buckets, threads, thread_state);
+
+ return 0;
+ }
+ else if (k > 0 && fs / k >= 2)
+ {
+ /* 2k strategy. */
+ sa_sint_t alignment = (fs - 1024) / k >= 2 ? 1024 : 16;
+ sa_sint_t * RESTRICT buckets = (fs - alignment) / k >= 2 ? (sa_sint_t *)libsais16_align_up(&SA[n + fs - 2 * k - alignment], (size_t)alignment * sizeof(sa_sint_t)) : &SA[n + fs - 2 * k];
+
+ sa_sint_t m = libsais16_count_and_gather_lms_suffixes_32s_2k_omp(T, SA, n, k, buckets, threads, thread_state);
+ if (m > 1)
+ {
+ libsais16_initialize_buckets_for_lms_suffixes_radix_sort_32s_2k(T, k, buckets, SA[n - m]);
+
+ libsais16_radix_sort_lms_suffixes_32s_2k_omp(T, SA, n, m, &buckets[1], threads, thread_state);
+ libsais16_place_lms_suffixes_interval_32s_2k(SA, n, k, m - 1, buckets);
+
+ libsais16_initialize_buckets_start_and_end_32s_2k(k, buckets);
+ libsais16_induce_partial_order_32s_2k_omp(T, SA, n, k, buckets, threads, thread_state);
+
+ /* Recurse when names are not yet unique (see 6k branch). */
+ sa_sint_t names = libsais16_renumber_and_mark_distinct_lms_suffixes_32s_1k_omp(T, SA, n, m, threads);
+ if (names < m)
+ {
+ sa_sint_t f = libsais16_compact_lms_suffixes_32s_omp(T, SA, n, m, fs, threads, thread_state);
+
+ if (libsais16_main_32s(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f, threads, thread_state) != 0)
+ {
+ return -2;
+ }
+
+ libsais16_reconstruct_compacted_lms_suffixes_32s_2k_omp(T, SA, n, k, m, fs, f, buckets, threads, thread_state);
+ }
+ else
+ {
+ libsais16_count_lms_suffixes_32s_2k(T, n, k, buckets);
+ }
+ }
+ else
+ {
+ SA[0] = SA[n - 1];
+ }
+
+ libsais16_initialize_buckets_end_32s_2k(k, buckets);
+ libsais16_place_lms_suffixes_histogram_32s_2k(SA, n, k, m, buckets);
+
+ libsais16_initialize_buckets_start_and_end_32s_2k(k, buckets);
+ libsais16_induce_final_order_32s_2k(T, SA, n, k, buckets, threads, thread_state);
+
+ return 0;
+ }
+ else
+ {
+ /* 1k strategy: fall back to a single-row bucket table, heap-allocated
+ when not even k words of free space remain in SA. */
+ sa_sint_t * buffer = fs < k ? (sa_sint_t *)libsais16_alloc_aligned((size_t)k * sizeof(sa_sint_t), 4096) : (sa_sint_t *)NULL;
+
+ sa_sint_t alignment = fs - 1024 >= k ? 1024 : 16;
+ sa_sint_t * RESTRICT buckets = fs - alignment >= k ? (sa_sint_t *)libsais16_align_up(&SA[n + fs - k - alignment], (size_t)alignment * sizeof(sa_sint_t)) : fs >= k ? &SA[n + fs - k] : buffer;
+
+ if (buckets == NULL) { return -2; }
+
+ memset(SA, 0, (size_t)n * sizeof(sa_sint_t));
+
+ libsais16_count_suffixes_32s(T, n, k, buckets);
+ libsais16_initialize_buckets_end_32s_1k(k, buckets);
+
+ sa_sint_t m = libsais16_radix_sort_lms_suffixes_32s_1k(T, SA, n, buckets);
+ if (m > 1)
+ {
+ libsais16_induce_partial_order_32s_1k_omp(T, SA, n, k, buckets, threads, thread_state);
+
+ sa_sint_t names = libsais16_renumber_and_mark_distinct_lms_suffixes_32s_1k_omp(T, SA, n, m, threads);
+ if (names < m)
+ {
+ /* Release the heap table before recursing so the nested call can
+ use the memory; it is re-allocated afterwards if needed. */
+ if (buffer != NULL) { libsais16_free_aligned(buffer); buckets = NULL; }
+
+ sa_sint_t f = libsais16_compact_lms_suffixes_32s_omp(T, SA, n, m, fs, threads, thread_state);
+
+ if (libsais16_main_32s(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f, threads, thread_state) != 0)
+ {
+ return -2;
+ }
+
+ libsais16_reconstruct_compacted_lms_suffixes_32s_1k_omp(T, SA, n, m, fs, f, threads, thread_state);
+
+ if (buckets == NULL) { buckets = buffer = (sa_sint_t *)libsais16_alloc_aligned((size_t)k * sizeof(sa_sint_t), 4096); }
+ if (buckets == NULL) { return -2; }
+ }
+
+ libsais16_count_suffixes_32s(T, n, k, buckets);
+ libsais16_initialize_buckets_end_32s_1k(k, buckets);
+ libsais16_place_lms_suffixes_interval_32s_1k(T, SA, k, m, buckets);
+ }
+
+ libsais16_induce_final_order_32s_1k(T, SA, n, k, buckets, threads, thread_state);
+ libsais16_free_aligned(buffer);
+
+ return 0;
+ }
+}
+
+/* Top-level SA-IS pass over the 16-bit input: gathers and radix-sorts the
+   LMS suffixes, induces a partial order, recurses via libsais16_main_32s
+   when LMS-substring names are not yet unique, places the sorted LMS
+   suffixes, and runs the final induction (which also emits the BWT and/or
+   auxiliary indexes when requested). Returns the final induction's result
+   (0 or the primary index), or -2 when the recursion fails. */
+static sa_sint_t libsais16_main_16u(const uint16_t * T, sa_sint_t * SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t bwt, sa_sint_t r, sa_sint_t * RESTRICT I, sa_sint_t fs, sa_sint_t * freq, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+ sa_sint_t m = libsais16_count_and_gather_lms_suffixes_16u_omp(T, SA, n, buckets, threads, thread_state);
+
+ libsais16_initialize_buckets_start_and_end_16u(buckets, freq);
+
+ if (m > 0)
+ {
+ sa_sint_t first_lms_suffix = SA[n - m];
+ sa_sint_t left_suffixes_count = libsais16_initialize_buckets_for_lms_suffixes_radix_sort_16u(T, buckets, first_lms_suffix);
+
+ /* The multi-threaded radix sort needs the scratch regions zeroed
+ before and after the pass. */
+ if (threads > 1 && n >= 65536) { memset(SA, 0, ((size_t)n - (size_t)m) * sizeof(sa_sint_t)); }
+ libsais16_radix_sort_lms_suffixes_16u_omp(T, SA, n, m, buckets, threads, thread_state);
+ if (threads > 1 && n >= 65536) { memset(&SA[(fast_sint_t)n - (fast_sint_t)m], 0, (size_t)m * sizeof(sa_sint_t)); }
+
+ libsais16_initialize_buckets_for_partial_sorting_16u(T, buckets, first_lms_suffix, left_suffixes_count);
+ libsais16_induce_partial_order_16u_omp(T, SA, n, buckets, first_lms_suffix, left_suffixes_count, threads, thread_state);
+
+ /* Recurse on the reduced 32-bit problem when names are not unique. */
+ sa_sint_t names = libsais16_renumber_and_gather_lms_suffixes_16u_omp(SA, n, m, fs, threads, thread_state);
+ if (names < m)
+ {
+ if (libsais16_main_32s(SA + n + fs - m, SA, m, names, fs + n - 2 * m, threads, thread_state) != 0)
+ {
+ return -2;
+ }
+
+ libsais16_gather_lms_suffixes_16u_omp(T, SA, n, threads, thread_state);
+ libsais16_reconstruct_lms_suffixes_omp(SA, n, m, threads);
+ }
+
+ libsais16_place_lms_suffixes_interval_16u(SA, n, m, buckets);
+ }
+ else
+ {
+ /* No LMS suffixes at all (e.g. non-increasing input): clear SA and go
+ straight to the final induction. */
+ memset(SA, 0, (size_t)n * sizeof(sa_sint_t));
+ }
+
+ return libsais16_induce_final_order_16u_omp(T, SA, n, bwt, r, I, buckets, threads, thread_state);
+}
+
+/* Allocates the working storage (an 8 * ALPHABET_SIZE bucket table and,
+   for multi-threaded runs, the per-thread state), delegates to
+   libsais16_main_16u, and releases the storage. Returns -2 when any
+   allocation fails. */
+static sa_sint_t libsais16_main(const uint16_t * T, sa_sint_t * SA, sa_sint_t n, sa_sint_t bwt, sa_sint_t r, sa_sint_t * I, sa_sint_t fs, sa_sint_t * freq, sa_sint_t threads)
+{
+ LIBSAIS_THREAD_STATE * RESTRICT thread_state = threads > 1 ? libsais16_alloc_thread_state(threads) : NULL;
+ sa_sint_t * RESTRICT buckets = (sa_sint_t *)libsais16_alloc_aligned(8 * ALPHABET_SIZE * sizeof(sa_sint_t), 4096);
+
+ sa_sint_t index = -2;
+ if (buckets != NULL && (threads == 1 || thread_state != NULL))
+ {
+ index = libsais16_main_16u(T, SA, n, buckets, bwt, r, I, fs, freq, threads, thread_state);
+ }
+
+ libsais16_free_aligned(buckets);
+ libsais16_free_thread_state(thread_state);
+
+ return index;
+}
+
+static sa_sint_t libsais16_main_ctx(const LIBSAIS_CONTEXT * ctx, const uint16_t * T, sa_sint_t * SA, sa_sint_t n, sa_sint_t bwt, sa_sint_t r, sa_sint_t * I, sa_sint_t fs, sa_sint_t * freq)
+{
+ return ctx != NULL && (ctx->buckets != NULL && (ctx->thread_state != NULL || ctx->threads == 1))
+ ? libsais16_main_16u(T, SA, n, ctx->buckets, bwt, r, I, fs, freq, (sa_sint_t)ctx->threads, ctx->thread_state)
+ : -2;
+}
+
+static void libsais16_bwt_copy_16u(uint16_t * RESTRICT U, sa_sint_t * RESTRICT A, sa_sint_t n)
+{
+ const fast_sint_t prefetch_distance = 32;
+
+ fast_sint_t i, j;
+ for (i = 0, j = (fast_sint_t)n - 7; i < j; i += 8)
+ {
+ libsais16_prefetch(&A[i + prefetch_distance]);
+
+ U[i + 0] = (uint16_t)A[i + 0];
+ U[i + 1] = (uint16_t)A[i + 1];
+ U[i + 2] = (uint16_t)A[i + 2];
+ U[i + 3] = (uint16_t)A[i + 3];
+ U[i + 4] = (uint16_t)A[i + 4];
+ U[i + 5] = (uint16_t)A[i + 5];
+ U[i + 6] = (uint16_t)A[i + 6];
+ U[i + 7] = (uint16_t)A[i + 7];
+ }
+
+ for (j += 7; i < j; i += 1)
+ {
+ U[i] = (uint16_t)A[i];
+ }
+}
+
+#if defined(_OPENMP)
+
+static void libsais16_bwt_copy_16u_omp(uint16_t * RESTRICT U, sa_sint_t * RESTRICT A, sa_sint_t n, sa_sint_t threads)
+{
+#if defined(_OPENMP)
+ #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536)
+#endif
+ {
+#if defined(_OPENMP)
+ fast_sint_t omp_thread_num = omp_get_thread_num();
+ fast_sint_t omp_num_threads = omp_get_num_threads();
+ fast_sint_t omp_block_stride = ((fast_sint_t)n / omp_num_threads) & (-16);
+ fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+ fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : (fast_sint_t)n - omp_block_start;
+#else
+ UNUSED(threads);
+
+ fast_sint_t omp_block_start = 0;
+ fast_sint_t omp_block_size = (fast_sint_t)n;
+#endif
+
+ libsais16_bwt_copy_16u(U + omp_block_start, A + omp_block_start, (sa_sint_t)omp_block_size);
+ }
+}
+
+#endif
+
+void * libsais16_create_ctx(void)
+{
+ return (void *)libsais16_create_ctx_main(1);
+}
+
+void libsais16_free_ctx(void * ctx)
+{
+ libsais16_free_ctx_main((LIBSAIS_CONTEXT *)ctx);
+}
+
+int32_t libsais16(const uint16_t * T, int32_t * SA, int32_t n, int32_t fs, int32_t * freq)
+{
+ if ((T == NULL) || (SA == NULL) || (n < 0) || (fs < 0))
+ {
+ return -1;
+ }
+ else if (n < 2)
+ {
+ if (n == 1) { SA[0] = 0; }
+ return 0;
+ }
+
+ return libsais16_main(T, SA, n, 0, 0, NULL, fs, freq, 1);
+}
+
+int32_t libsais16_ctx(const void * ctx, const uint16_t * T, int32_t * SA, int32_t n, int32_t fs, int32_t * freq)
+{
+ if ((ctx == NULL) || (T == NULL) || (SA == NULL) || (n < 0) || (fs < 0))
+ {
+ return -1;
+ }
+ else if (n < 2)
+ {
+ if (n == 1) { SA[0] = 0; }
+ return 0;
+ }
+
+ return libsais16_main_ctx((const LIBSAIS_CONTEXT *)ctx, T, SA, n, 0, 0, NULL, fs, freq);
+}
+
+int32_t libsais16_bwt(const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq)
+{
+ if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0))
+ {
+ return -1;
+ }
+ else if (n <= 1)
+ {
+ if (n == 1) { U[0] = T[0]; }
+ return n;
+ }
+
+ sa_sint_t index = libsais16_main(T, A, n, 1, 0, NULL, fs, freq, 1);
+ if (index >= 0)
+ {
+ index++;
+
+ U[0] = T[n - 1];
+ libsais16_bwt_copy_16u(U + 1, A, index - 1);
+ libsais16_bwt_copy_16u(U + index, A + index, n - index);
+ }
+
+ return index;
+}
+
+int32_t libsais16_bwt_aux(const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t r, int32_t * I)
+{
+ if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0) || (r < 2) || ((r & (r - 1)) != 0) || (I == NULL))
+ {
+ return -1;
+ }
+ else if (n <= 1)
+ {
+ if (n == 1) { U[0] = T[0]; }
+
+ I[0] = n;
+ return 0;
+ }
+
+ if (libsais16_main(T, A, n, 1, r, I, fs, freq, 1) != 0)
+ {
+ return -2;
+ }
+
+ U[0] = T[n - 1];
+ libsais16_bwt_copy_16u(U + 1, A, I[0] - 1);
+ libsais16_bwt_copy_16u(U + I[0], A + I[0], n - I[0]);
+
+ return 0;
+}
+
+int32_t libsais16_bwt_ctx(const void * ctx, const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq)
+{
+ if ((ctx == NULL) || (T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0))
+ {
+ return -1;
+ }
+ else if (n <= 1)
+ {
+ if (n == 1) { U[0] = T[0]; }
+ return n;
+ }
+
+ sa_sint_t index = libsais16_main_ctx((const LIBSAIS_CONTEXT *)ctx, T, A, n, 1, 0, NULL, fs, freq);
+ if (index >= 0)
+ {
+ index++;
+
+ U[0] = T[n - 1];
+
+#if defined(_OPENMP)
+ libsais16_bwt_copy_16u_omp(U + 1, A, index - 1, (sa_sint_t)((const LIBSAIS_CONTEXT *)ctx)->threads);
+ libsais16_bwt_copy_16u_omp(U + index, A + index, n - index, (sa_sint_t)((const LIBSAIS_CONTEXT *)ctx)->threads);
+#else
+ libsais16_bwt_copy_16u(U + 1, A, index - 1);
+ libsais16_bwt_copy_16u(U + index, A + index, n - index);
+#endif
+ }
+
+ return index;
+}
+
+int32_t libsais16_bwt_aux_ctx(const void * ctx, const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t r, int32_t * I)
+{
+ if ((ctx == NULL) || (T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0) || (r < 2) || ((r & (r - 1)) != 0) || (I == NULL))
+ {
+ return -1;
+ }
+ else if (n <= 1)
+ {
+ if (n == 1) { U[0] = T[0]; }
+
+ I[0] = n;
+ return 0;
+ }
+
+ if (libsais16_main_ctx((const LIBSAIS_CONTEXT *)ctx, T, A, n, 1, r, I, fs, freq) != 0)
+ {
+ return -2;
+ }
+
+ U[0] = T[n - 1];
+
+#if defined(_OPENMP)
+ libsais16_bwt_copy_16u_omp(U + 1, A, I[0] - 1, (sa_sint_t)((const LIBSAIS_CONTEXT *)ctx)->threads);
+ libsais16_bwt_copy_16u_omp(U + I[0], A + I[0], n - I[0], (sa_sint_t)((const LIBSAIS_CONTEXT *)ctx)->threads);
+#else
+ libsais16_bwt_copy_16u(U + 1, A, I[0] - 1);
+ libsais16_bwt_copy_16u(U + I[0], A + I[0], n - I[0]);
+#endif
+
+ return 0;
+}
+
+#if defined(_OPENMP)
+
+void * libsais16_create_ctx_omp(int32_t threads)
+{
+ if (threads < 0) { return NULL; }
+
+ threads = threads > 0 ? threads : omp_get_max_threads();
+ return (void *)libsais16_create_ctx_main(threads);
+}
+
+int32_t libsais16_omp(const uint16_t * T, int32_t * SA, int32_t n, int32_t fs, int32_t * freq, int32_t threads)
+{
+ if ((T == NULL) || (SA == NULL) || (n < 0) || (fs < 0) || (threads < 0))
+ {
+ return -1;
+ }
+ else if (n < 2)
+ {
+ if (n == 1) { SA[0] = 0; }
+ return 0;
+ }
+
+ threads = threads > 0 ? threads : omp_get_max_threads();
+
+ return libsais16_main(T, SA, n, 0, 0, NULL, fs, freq, threads);
+}
+
+int32_t libsais16_bwt_omp(const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t threads)
+{
+ if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0) || (threads < 0))
+ {
+ return -1;
+ }
+ else if (n <= 1)
+ {
+ if (n == 1) { U[0] = T[0]; }
+ return n;
+ }
+
+ threads = threads > 0 ? threads : omp_get_max_threads();
+
+ sa_sint_t index = libsais16_main(T, A, n, 1, 0, NULL, fs, freq, threads);
+ if (index >= 0)
+ {
+ index++;
+
+ U[0] = T[n - 1];
+ libsais16_bwt_copy_16u_omp(U + 1, A, index - 1, threads);
+ libsais16_bwt_copy_16u_omp(U + index, A + index, n - index, threads);
+ }
+
+ return index;
+}
+
+int32_t libsais16_bwt_aux_omp(const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t r, int32_t * I, int32_t threads)
+{
+ if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0) || (r < 2) || ((r & (r - 1)) != 0) || (I == NULL) || (threads < 0))
+ {
+ return -1;
+ }
+ else if (n <= 1)
+ {
+ if (n == 1) { U[0] = T[0];}
+
+ I[0] = n;
+ return 0;
+ }
+
+ threads = threads > 0 ? threads : omp_get_max_threads();
+
+ if (libsais16_main(T, A, n, 1, r, I, fs, freq, threads) != 0)
+ {
+ return -2;
+ }
+
+ U[0] = T[n - 1];
+ libsais16_bwt_copy_16u_omp(U + 1, A, I[0] - 1, threads);
+ libsais16_bwt_copy_16u_omp(U + I[0], A + I[0], n - I[0], threads);
+
+ return 0;
+}
+
+#endif
+
+static LIBSAIS_UNBWT_CONTEXT * libsais16_unbwt_create_ctx_main(sa_sint_t threads)
+{
+ LIBSAIS_UNBWT_CONTEXT * RESTRICT ctx = (LIBSAIS_UNBWT_CONTEXT *)libsais16_alloc_aligned(sizeof(LIBSAIS_UNBWT_CONTEXT), 64);
+ sa_uint_t * RESTRICT bucket2 = (sa_uint_t *)libsais16_alloc_aligned(ALPHABET_SIZE * sizeof(sa_uint_t), 4096);
+ uint16_t * RESTRICT fastbits = (uint16_t *)libsais16_alloc_aligned((1 + (1 << UNBWT_FASTBITS)) * sizeof(uint16_t), 4096);
+ sa_uint_t * RESTRICT buckets = threads > 1 ? (sa_uint_t *)libsais16_alloc_aligned((size_t)threads * ALPHABET_SIZE * sizeof(sa_uint_t), 4096) : NULL;
+
+ if (ctx != NULL && bucket2 != NULL && fastbits != NULL && (buckets != NULL || threads == 1))
+ {
+ ctx->bucket2 = bucket2;
+ ctx->fastbits = fastbits;
+ ctx->buckets = buckets;
+ ctx->threads = threads;
+
+ return ctx;
+ }
+
+ libsais16_free_aligned(buckets);
+ libsais16_free_aligned(fastbits);
+ libsais16_free_aligned(bucket2);
+ libsais16_free_aligned(ctx);
+
+ return NULL;
+}
+
+static void libsais16_unbwt_free_ctx_main(LIBSAIS_UNBWT_CONTEXT * ctx)
+{
+ if (ctx != NULL)
+ {
+ libsais16_free_aligned(ctx->buckets);
+ libsais16_free_aligned(ctx->fastbits);
+ libsais16_free_aligned(ctx->bucket2);
+ libsais16_free_aligned(ctx);
+ }
+}
+
+static void libsais16_unbwt_compute_histogram(const uint16_t * RESTRICT T, fast_sint_t n, sa_uint_t * RESTRICT count)
+{
+ fast_sint_t i; for (i = 0; i < n; i += 1) { count[T[i]]++; }
+}
+
+static void libsais16_unbwt_calculate_fastbits(sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift)
+{
+ fast_uint_t v, w, sum;
+ for (v = 0, sum = 1, w = 0; w < ALPHABET_SIZE; ++w)
+ {
+ fast_uint_t prev = sum; sum += bucket2[w]; bucket2[w] = (sa_uint_t)prev;
+ if (prev != sum)
+ {
+ for (; v <= ((sum - 1) >> shift); ++v) { fastbits[v] = (uint16_t)w; }
+ }
+ }
+}
+
+static void libsais16_unbwt_calculate_P(const uint16_t * RESTRICT T, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, fast_uint_t index, fast_sint_t omp_block_start, fast_sint_t omp_block_end)
+{
+ {
+ fast_sint_t i = omp_block_start, j = (fast_sint_t)index; if (omp_block_end < j) { j = omp_block_end; }
+ for (; i < j; ++i) { fast_uint_t c = T[i]; P[bucket2[c]++] = (sa_uint_t)i; }
+ }
+
+ {
+ fast_sint_t i = (fast_sint_t)index, j = omp_block_end; if (omp_block_start > i) { i = omp_block_start; }
+ for (T -= 1, i += 1; i <= j; ++i) { fast_uint_t c = T[i]; P[bucket2[c]++] = (sa_uint_t)i; }
+ }
+}
+
+static void libsais16_unbwt_init_single(const uint16_t * RESTRICT T, sa_uint_t * RESTRICT P, sa_sint_t n, const sa_sint_t * freq, const sa_uint_t * RESTRICT I, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits)
+{
+ fast_uint_t index = I[0];
+ fast_uint_t shift = 0; while ((n >> shift) > (1 << UNBWT_FASTBITS)) { shift++; }
+
+ if (freq != NULL)
+ {
+ memcpy(bucket2, freq, ALPHABET_SIZE * sizeof(sa_uint_t));
+ }
+ else
+ {
+ memset(bucket2, 0, ALPHABET_SIZE * sizeof(sa_uint_t));
+ libsais16_unbwt_compute_histogram(T, n, bucket2);
+ }
+
+ libsais16_unbwt_calculate_fastbits(bucket2, fastbits, shift);
+ libsais16_unbwt_calculate_P(T, P, bucket2, index, 0, n);
+}
+
+#if defined(_OPENMP)
+
+static void libsais16_unbwt_init_parallel(const uint16_t * RESTRICT T, sa_uint_t * RESTRICT P, sa_sint_t n, const sa_sint_t * freq, const sa_uint_t * RESTRICT I, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, sa_uint_t * RESTRICT buckets, sa_sint_t threads)
+{
+ fast_uint_t index = I[0];
+ fast_uint_t shift = 0; while ((n >> shift) > (1 << UNBWT_FASTBITS)) { shift++; }
+
+ #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536)
+ {
+ fast_sint_t omp_thread_num = omp_get_thread_num();
+ fast_sint_t omp_num_threads = omp_get_num_threads();
+
+ if (omp_num_threads == 1)
+ {
+ libsais16_unbwt_init_single(T, P, n, freq, I, bucket2, fastbits);
+ }
+ else
+ {
+ {
+ sa_uint_t * RESTRICT bucket2_local = buckets + omp_thread_num * ALPHABET_SIZE;
+ fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
+ fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+ fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;
+
+ memset(bucket2_local, 0, ALPHABET_SIZE * sizeof(sa_uint_t));
+ libsais16_unbwt_compute_histogram(T + omp_block_start, omp_block_size, bucket2_local);
+ }
+
+ #pragma omp barrier
+
+ {
+ sa_uint_t * RESTRICT bucket2_temp = buckets;
+ fast_sint_t omp_block_stride = (ALPHABET_SIZE / omp_num_threads) & (-16);
+ fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+ fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : ALPHABET_SIZE - omp_block_start;
+
+ memset(bucket2 + omp_block_start, 0, omp_block_size * sizeof(sa_uint_t));
+
+ fast_sint_t t;
+ for (t = 0; t < omp_num_threads; ++t, bucket2_temp += ALPHABET_SIZE)
+ {
+ fast_sint_t c; for (c = omp_block_start; c < omp_block_start + omp_block_size; c += 1) { sa_uint_t A = bucket2[c], B = bucket2_temp[c]; bucket2[c] = A + B; bucket2_temp[c] = A; }
+ }
+ }
+
+ #pragma omp barrier
+
+ #pragma omp master
+ {
+ libsais16_unbwt_calculate_fastbits(bucket2, fastbits, shift);
+ }
+
+ #pragma omp barrier
+
+ {
+ sa_uint_t * RESTRICT bucket2_local = buckets + omp_thread_num * ALPHABET_SIZE;
+ fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
+ fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+ fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;
+
+ fast_sint_t c; for (c = 0; c < ALPHABET_SIZE; c += 1) { sa_uint_t A = bucket2[c], B = bucket2_local[c]; bucket2_local[c] = A + B; }
+
+ libsais16_unbwt_calculate_P(T, P, bucket2_local, index, omp_block_start, omp_block_start + omp_block_size);
+ }
+
+ #pragma omp barrier
+
+ #pragma omp master
+ {
+ memcpy(bucket2, buckets + (omp_num_threads - 1) * ALPHABET_SIZE, ALPHABET_SIZE * sizeof(sa_uint_t));
+ }
+ }
+ }
+}
+
+#endif
+
+static void libsais16_unbwt_decode_1(uint16_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t * i0, fast_uint_t k)
+{
+ uint16_t * RESTRICT U0 = U;
+
+ fast_uint_t i, p0 = *i0;
+
+ for (i = 0; i != k; ++i)
+ {
+ uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while (bucket2[c0] <= p0); } p0 = P[p0]; U0[i] = c0;
+ }
+
+ *i0 = p0;
+}
+
+static void libsais16_unbwt_decode_2(uint16_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0, fast_uint_t * i1, fast_uint_t k)
+{
+ uint16_t * RESTRICT U0 = U;
+ uint16_t * RESTRICT U1 = U0 + r;
+
+ fast_uint_t i, p0 = *i0, p1 = *i1;
+
+ for (i = 0; i != k; ++i)
+ {
+ uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while (bucket2[c0] <= p0); } p0 = P[p0]; U0[i] = c0;
+ uint16_t c1 = fastbits[p1 >> shift]; if (bucket2[c1] <= p1) { do { c1++; } while (bucket2[c1] <= p1); } p1 = P[p1]; U1[i] = c1;
+ }
+
+ *i0 = p0; *i1 = p1;
+}
+
+static void libsais16_unbwt_decode_3(uint16_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0, fast_uint_t * i1, fast_uint_t * i2, fast_uint_t k)
+{
+ uint16_t * RESTRICT U0 = U;
+ uint16_t * RESTRICT U1 = U0 + r;
+ uint16_t * RESTRICT U2 = U1 + r;
+
+ fast_uint_t i, p0 = *i0, p1 = *i1, p2 = *i2;
+
+ for (i = 0; i != k; ++i)
+ {
+ uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while (bucket2[c0] <= p0); } p0 = P[p0]; U0[i] = c0;
+ uint16_t c1 = fastbits[p1 >> shift]; if (bucket2[c1] <= p1) { do { c1++; } while (bucket2[c1] <= p1); } p1 = P[p1]; U1[i] = c1;
+ uint16_t c2 = fastbits[p2 >> shift]; if (bucket2[c2] <= p2) { do { c2++; } while (bucket2[c2] <= p2); } p2 = P[p2]; U2[i] = c2;
+ }
+
+ *i0 = p0; *i1 = p1; *i2 = p2;
+}
+
+static void libsais16_unbwt_decode_4(uint16_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0, fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3, fast_uint_t k)
+{
+ uint16_t * RESTRICT U0 = U;
+ uint16_t * RESTRICT U1 = U0 + r;
+ uint16_t * RESTRICT U2 = U1 + r;
+ uint16_t * RESTRICT U3 = U2 + r;
+
+ fast_uint_t i, p0 = *i0, p1 = *i1, p2 = *i2, p3 = *i3;
+
+ for (i = 0; i != k; ++i)
+ {
+ uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while (bucket2[c0] <= p0); } p0 = P[p0]; U0[i] = c0;
+ uint16_t c1 = fastbits[p1 >> shift]; if (bucket2[c1] <= p1) { do { c1++; } while (bucket2[c1] <= p1); } p1 = P[p1]; U1[i] = c1;
+ uint16_t c2 = fastbits[p2 >> shift]; if (bucket2[c2] <= p2) { do { c2++; } while (bucket2[c2] <= p2); } p2 = P[p2]; U2[i] = c2;
+ uint16_t c3 = fastbits[p3 >> shift]; if (bucket2[c3] <= p3) { do { c3++; } while (bucket2[c3] <= p3); } p3 = P[p3]; U3[i] = c3;
+ }
+
+ *i0 = p0; *i1 = p1; *i2 = p2; *i3 = p3;
+}
+
+static void libsais16_unbwt_decode_5(uint16_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0, fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3, fast_uint_t * i4, fast_uint_t k)
+{
+ uint16_t * RESTRICT U0 = U;
+ uint16_t * RESTRICT U1 = U0 + r;
+ uint16_t * RESTRICT U2 = U1 + r;
+ uint16_t * RESTRICT U3 = U2 + r;
+ uint16_t * RESTRICT U4 = U3 + r;
+
+ fast_uint_t i, p0 = *i0, p1 = *i1, p2 = *i2, p3 = *i3, p4 = *i4;
+
+ for (i = 0; i != k; ++i)
+ {
+ uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while (bucket2[c0] <= p0); } p0 = P[p0]; U0[i] = c0;
+ uint16_t c1 = fastbits[p1 >> shift]; if (bucket2[c1] <= p1) { do { c1++; } while (bucket2[c1] <= p1); } p1 = P[p1]; U1[i] = c1;
+ uint16_t c2 = fastbits[p2 >> shift]; if (bucket2[c2] <= p2) { do { c2++; } while (bucket2[c2] <= p2); } p2 = P[p2]; U2[i] = c2;
+ uint16_t c3 = fastbits[p3 >> shift]; if (bucket2[c3] <= p3) { do { c3++; } while (bucket2[c3] <= p3); } p3 = P[p3]; U3[i] = c3;
+ uint16_t c4 = fastbits[p4 >> shift]; if (bucket2[c4] <= p4) { do { c4++; } while (bucket2[c4] <= p4); } p4 = P[p4]; U4[i] = c4;
+ }
+
+ *i0 = p0; *i1 = p1; *i2 = p2; *i3 = p3; *i4 = p4;
+}
+
+static void libsais16_unbwt_decode_6(uint16_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0, fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3, fast_uint_t * i4, fast_uint_t * i5, fast_uint_t k)
+{
+ uint16_t * RESTRICT U0 = U;
+ uint16_t * RESTRICT U1 = U0 + r;
+ uint16_t * RESTRICT U2 = U1 + r;
+ uint16_t * RESTRICT U3 = U2 + r;
+ uint16_t * RESTRICT U4 = U3 + r;
+ uint16_t * RESTRICT U5 = U4 + r;
+
+ fast_uint_t i, p0 = *i0, p1 = *i1, p2 = *i2, p3 = *i3, p4 = *i4, p5 = *i5;
+
+ for (i = 0; i != k; ++i)
+ {
+ uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while (bucket2[c0] <= p0); } p0 = P[p0]; U0[i] = c0;
+ uint16_t c1 = fastbits[p1 >> shift]; if (bucket2[c1] <= p1) { do { c1++; } while (bucket2[c1] <= p1); } p1 = P[p1]; U1[i] = c1;
+ uint16_t c2 = fastbits[p2 >> shift]; if (bucket2[c2] <= p2) { do { c2++; } while (bucket2[c2] <= p2); } p2 = P[p2]; U2[i] = c2;
+ uint16_t c3 = fastbits[p3 >> shift]; if (bucket2[c3] <= p3) { do { c3++; } while (bucket2[c3] <= p3); } p3 = P[p3]; U3[i] = c3;
+ uint16_t c4 = fastbits[p4 >> shift]; if (bucket2[c4] <= p4) { do { c4++; } while (bucket2[c4] <= p4); } p4 = P[p4]; U4[i] = c4;
+ uint16_t c5 = fastbits[p5 >> shift]; if (bucket2[c5] <= p5) { do { c5++; } while (bucket2[c5] <= p5); } p5 = P[p5]; U5[i] = c5;
+ }
+
+ *i0 = p0; *i1 = p1; *i2 = p2; *i3 = p3; *i4 = p4; *i5 = p5;
+}
+
+static void libsais16_unbwt_decode_7(uint16_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0, fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3, fast_uint_t * i4, fast_uint_t * i5, fast_uint_t * i6, fast_uint_t k)
+{
+ uint16_t * RESTRICT U0 = U;
+ uint16_t * RESTRICT U1 = U0 + r;
+ uint16_t * RESTRICT U2 = U1 + r;
+ uint16_t * RESTRICT U3 = U2 + r;
+ uint16_t * RESTRICT U4 = U3 + r;
+ uint16_t * RESTRICT U5 = U4 + r;
+ uint16_t * RESTRICT U6 = U5 + r;
+
+ fast_uint_t i, p0 = *i0, p1 = *i1, p2 = *i2, p3 = *i3, p4 = *i4, p5 = *i5, p6 = *i6;
+
+ for (i = 0; i != k; ++i)
+ {
+ uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while (bucket2[c0] <= p0); } p0 = P[p0]; U0[i] = c0;
+ uint16_t c1 = fastbits[p1 >> shift]; if (bucket2[c1] <= p1) { do { c1++; } while (bucket2[c1] <= p1); } p1 = P[p1]; U1[i] = c1;
+ uint16_t c2 = fastbits[p2 >> shift]; if (bucket2[c2] <= p2) { do { c2++; } while (bucket2[c2] <= p2); } p2 = P[p2]; U2[i] = c2;
+ uint16_t c3 = fastbits[p3 >> shift]; if (bucket2[c3] <= p3) { do { c3++; } while (bucket2[c3] <= p3); } p3 = P[p3]; U3[i] = c3;
+ uint16_t c4 = fastbits[p4 >> shift]; if (bucket2[c4] <= p4) { do { c4++; } while (bucket2[c4] <= p4); } p4 = P[p4]; U4[i] = c4;
+ uint16_t c5 = fastbits[p5 >> shift]; if (bucket2[c5] <= p5) { do { c5++; } while (bucket2[c5] <= p5); } p5 = P[p5]; U5[i] = c5;
+ uint16_t c6 = fastbits[p6 >> shift]; if (bucket2[c6] <= p6) { do { c6++; } while (bucket2[c6] <= p6); } p6 = P[p6]; U6[i] = c6;
+ }
+
+ *i0 = p0; *i1 = p1; *i2 = p2; *i3 = p3; *i4 = p4; *i5 = p5; *i6 = p6;
+}
+
+static void libsais16_unbwt_decode_8(uint16_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0, fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3, fast_uint_t * i4, fast_uint_t * i5, fast_uint_t * i6, fast_uint_t * i7, fast_uint_t k)
+{
+ uint16_t * RESTRICT U0 = U;
+ uint16_t * RESTRICT U1 = U0 + r;
+ uint16_t * RESTRICT U2 = U1 + r;
+ uint16_t * RESTRICT U3 = U2 + r;
+ uint16_t * RESTRICT U4 = U3 + r;
+ uint16_t * RESTRICT U5 = U4 + r;
+ uint16_t * RESTRICT U6 = U5 + r;
+ uint16_t * RESTRICT U7 = U6 + r;
+
+ fast_uint_t i, p0 = *i0, p1 = *i1, p2 = *i2, p3 = *i3, p4 = *i4, p5 = *i5, p6 = *i6, p7 = *i7;
+
+ for (i = 0; i != k; ++i)
+ {
+ uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while (bucket2[c0] <= p0); } p0 = P[p0]; U0[i] = c0;
+ uint16_t c1 = fastbits[p1 >> shift]; if (bucket2[c1] <= p1) { do { c1++; } while (bucket2[c1] <= p1); } p1 = P[p1]; U1[i] = c1;
+ uint16_t c2 = fastbits[p2 >> shift]; if (bucket2[c2] <= p2) { do { c2++; } while (bucket2[c2] <= p2); } p2 = P[p2]; U2[i] = c2;
+ uint16_t c3 = fastbits[p3 >> shift]; if (bucket2[c3] <= p3) { do { c3++; } while (bucket2[c3] <= p3); } p3 = P[p3]; U3[i] = c3;
+ uint16_t c4 = fastbits[p4 >> shift]; if (bucket2[c4] <= p4) { do { c4++; } while (bucket2[c4] <= p4); } p4 = P[p4]; U4[i] = c4;
+ uint16_t c5 = fastbits[p5 >> shift]; if (bucket2[c5] <= p5) { do { c5++; } while (bucket2[c5] <= p5); } p5 = P[p5]; U5[i] = c5;
+ uint16_t c6 = fastbits[p6 >> shift]; if (bucket2[c6] <= p6) { do { c6++; } while (bucket2[c6] <= p6); } p6 = P[p6]; U6[i] = c6;
+ uint16_t c7 = fastbits[p7 >> shift]; if (bucket2[c7] <= p7) { do { c7++; } while (bucket2[c7] <= p7); } p7 = P[p7]; U7[i] = c7;
+ }
+
+ *i0 = p0; *i1 = p1; *i2 = p2; *i3 = p3; *i4 = p4; *i5 = p5; *i6 = p6; *i7 = p7;
+}
+
+static void libsais16_unbwt_decode(uint16_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_sint_t n, sa_sint_t r, const sa_uint_t * RESTRICT I, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_sint_t blocks, fast_uint_t reminder)
+{
+ fast_uint_t shift = 0; while ((n >> shift) > (1 << UNBWT_FASTBITS)) { shift++; }
+ fast_uint_t offset = 0;
+
+ while (blocks > 8)
+ {
+ fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3], i4 = I[4], i5 = I[5], i6 = I[6], i7 = I[7];
+ libsais16_unbwt_decode_8(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5, &i6, &i7, (fast_uint_t)r);
+ I += 8; blocks -= 8; offset += 8 * (fast_uint_t)r;
+ }
+
+ if (blocks == 1)
+ {
+ fast_uint_t i0 = I[0];
+ libsais16_unbwt_decode_1(U + offset, P, bucket2, fastbits, shift, &i0, reminder);
+ }
+ else if (blocks == 2)
+ {
+ fast_uint_t i0 = I[0], i1 = I[1];
+ libsais16_unbwt_decode_2(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, reminder);
+ libsais16_unbwt_decode_1(U + offset + reminder, P, bucket2, fastbits, shift, &i0, ((fast_uint_t)r) - reminder);
+ }
+ else if (blocks == 3)
+ {
+ fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2];
+ libsais16_unbwt_decode_3(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, reminder);
+ libsais16_unbwt_decode_2(U + offset + reminder, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, ((fast_uint_t)r) - reminder);
+ }
+ else if (blocks == 4)
+ {
+ fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3];
+ libsais16_unbwt_decode_4(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, reminder);
+ libsais16_unbwt_decode_3(U + offset + reminder, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, ((fast_uint_t)r) - reminder);
+ }
+ else if (blocks == 5)
+ {
+ fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3], i4 = I[4];
+ libsais16_unbwt_decode_5(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, reminder);
+ libsais16_unbwt_decode_4(U + offset + reminder, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, ((fast_uint_t)r) - reminder);
+ }
+ else if (blocks == 6)
+ {
+ fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3], i4 = I[4], i5 = I[5];
+ libsais16_unbwt_decode_6(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5, reminder);
+ libsais16_unbwt_decode_5(U + offset + reminder, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, ((fast_uint_t)r) - reminder);
+ }
+ else if (blocks == 7)
+ {
+ fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3], i4 = I[4], i5 = I[5], i6 = I[6];
+ libsais16_unbwt_decode_7(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5, &i6, reminder);
+ libsais16_unbwt_decode_6(U + offset + reminder, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5, ((fast_uint_t)r) - reminder);
+ }
+ else
+ {
+ fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3], i4 = I[4], i5 = I[5], i6 = I[6], i7 = I[7];
+ libsais16_unbwt_decode_8(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5, &i6, &i7, reminder);
+ libsais16_unbwt_decode_7(U + offset + reminder, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5, &i6, ((fast_uint_t)r) - reminder);
+ }
+}
+
+static void libsais16_unbwt_decode_omp(uint16_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_sint_t n, sa_sint_t r, const sa_uint_t * RESTRICT I, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, sa_sint_t threads)
+{
+ fast_sint_t blocks = 1 + (((fast_sint_t)n - 1) / (fast_sint_t)r);
+ fast_uint_t reminder = (fast_uint_t)n - ((fast_uint_t)r * ((fast_uint_t)blocks - 1));
+
+#if defined(_OPENMP)
+ fast_sint_t max_threads = blocks < threads ? blocks : threads;
+ #pragma omp parallel num_threads(max_threads) if(max_threads > 1 && n >= 65536)
+#endif
+ {
+#if defined(_OPENMP)
+ fast_sint_t omp_thread_num = omp_get_thread_num();
+ fast_sint_t omp_num_threads = omp_get_num_threads();
+#else
+ UNUSED(threads);
+
+ fast_sint_t omp_thread_num = 0;
+ fast_sint_t omp_num_threads = 1;
+#endif
+
+ fast_sint_t omp_block_stride = blocks / omp_num_threads;
+ fast_sint_t omp_block_reminder = blocks % omp_num_threads;
+ fast_sint_t omp_block_size = omp_block_stride + (omp_thread_num < omp_block_reminder);
+ fast_sint_t omp_block_start = omp_block_stride * omp_thread_num + (omp_thread_num < omp_block_reminder ? omp_thread_num : omp_block_reminder);
+
+ libsais16_unbwt_decode(U + r * omp_block_start, P, n, r, I + omp_block_start, bucket2, fastbits, omp_block_size, omp_thread_num < omp_num_threads - 1 ? (fast_uint_t)r : reminder);
+ }
+}
+
+static sa_sint_t libsais16_unbwt_core(const uint16_t * RESTRICT T, uint16_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_sint_t n, const sa_sint_t * freq, sa_sint_t r, const sa_uint_t * RESTRICT I, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, sa_uint_t * RESTRICT buckets, sa_sint_t threads)
+{
+#if defined(_OPENMP)
+ if (threads > 1 && n >= 262144)
+ {
+ libsais16_unbwt_init_parallel(T, P, n, freq, I, bucket2, fastbits, buckets, threads);
+ }
+ else
+#else
+ UNUSED(buckets);
+#endif
+ {
+ libsais16_unbwt_init_single(T, P, n, freq, I, bucket2, fastbits);
+ }
+
+ libsais16_unbwt_decode_omp(U, P, n, r, I, bucket2, fastbits, threads);
+ return 0;
+}
+
+static sa_sint_t libsais16_unbwt_main(const uint16_t * T, uint16_t * U, sa_uint_t * P, sa_sint_t n, const sa_sint_t * freq, sa_sint_t r, const sa_uint_t * I, sa_sint_t threads)
+{
+ fast_uint_t shift = 0; while ((n >> shift) > (1 << UNBWT_FASTBITS)) { shift++; }
+
+ sa_uint_t * RESTRICT bucket2 = (sa_uint_t *)libsais16_alloc_aligned(ALPHABET_SIZE * sizeof(sa_uint_t), 4096);
+ uint16_t * RESTRICT fastbits = (uint16_t *)libsais16_alloc_aligned(((size_t)1 + (size_t)(n >> shift)) * sizeof(uint16_t), 4096);
+ sa_uint_t * RESTRICT buckets = threads > 1 && n >= 262144 ? (sa_uint_t *)libsais16_alloc_aligned((size_t)threads * ALPHABET_SIZE * sizeof(sa_uint_t), 4096) : NULL;
+
+ sa_sint_t index = bucket2 != NULL && fastbits != NULL && (buckets != NULL || threads == 1 || n < 262144)
+ ? libsais16_unbwt_core(T, U, P, n, freq, r, I, bucket2, fastbits, buckets, threads)
+ : -2;
+
+ libsais16_free_aligned(buckets);
+ libsais16_free_aligned(fastbits);
+ libsais16_free_aligned(bucket2);
+
+ return index;
+}
+
+static sa_sint_t libsais16_unbwt_main_ctx(const LIBSAIS_UNBWT_CONTEXT * ctx, const uint16_t * T, uint16_t * U, sa_uint_t * P, sa_sint_t n, const sa_sint_t * freq, sa_sint_t r, const sa_uint_t * I)
+{
+ return ctx != NULL && ctx->bucket2 != NULL && ctx->fastbits != NULL && (ctx->buckets != NULL || ctx->threads == 1)
+ ? libsais16_unbwt_core(T, U, P, n, freq, r, I, ctx->bucket2, ctx->fastbits, ctx->buckets, (sa_sint_t)ctx->threads)
+ : -2;
+}
+
+void * libsais16_unbwt_create_ctx(void)
+{
+ return (void *)libsais16_unbwt_create_ctx_main(1);
+}
+
+void libsais16_unbwt_free_ctx(void * ctx)
+{
+ libsais16_unbwt_free_ctx_main((LIBSAIS_UNBWT_CONTEXT *)ctx);
+}
+
+int32_t libsais16_unbwt(const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t i)
+{
+ return libsais16_unbwt_aux(T, U, A, n, freq, n, &i);
+}
+
+int32_t libsais16_unbwt_ctx(const void * ctx, const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t i)
+{
+ return libsais16_unbwt_aux_ctx(ctx, T, U, A, n, freq, n, &i);
+}
+
+int32_t libsais16_unbwt_aux(const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t r, const int32_t * I)
+{
+ if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || ((r != n) && ((r < 2) || ((r & (r - 1)) != 0))) || (I == NULL))
+ {
+ return -1;
+ }
+ else if (n <= 1)
+ {
+ if (I[0] != n) { return -1; }
+ if (n == 1) { U[0] = T[0]; }
+ return 0;
+ }
+
+ fast_sint_t t; for (t = 0; t <= (n - 1) / r; ++t) { if (I[t] <= 0 || I[t] > n) { return -1; } }
+
+ return libsais16_unbwt_main(T, U, (sa_uint_t *)A, n, freq, r, (const sa_uint_t *)I, 1);
+}
+
+int32_t libsais16_unbwt_aux_ctx(const void * ctx, const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t r, const int32_t * I)
+{
+ if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || ((r != n) && ((r < 2) || ((r & (r - 1)) != 0))) || (I == NULL))
+ {
+ return -1;
+ }
+ else if (n <= 1)
+ {
+ if (I[0] != n) { return -1; }
+ if (n == 1) { U[0] = T[0]; }
+ return 0;
+ }
+
+ fast_sint_t t; for (t = 0; t <= (n - 1) / r; ++t) { if (I[t] <= 0 || I[t] > n) { return -1; } }
+
+ return libsais16_unbwt_main_ctx((const LIBSAIS_UNBWT_CONTEXT *)ctx, T, U, (sa_uint_t *)A, n, freq, r, (const sa_uint_t *)I);
+}
+
+#if defined(_OPENMP)
+
+void * libsais16_unbwt_create_ctx_omp(int32_t threads)
+{
+ if (threads < 0) { return NULL; }
+
+ threads = threads > 0 ? threads : omp_get_max_threads();
+ return (void *)libsais16_unbwt_create_ctx_main(threads);
+}
+
+int32_t libsais16_unbwt_omp(const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t i, int32_t threads)
+{
+ return libsais16_unbwt_aux_omp(T, U, A, n, freq, n, &i, threads);
+}
+
+int32_t libsais16_unbwt_aux_omp(const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t r, const int32_t * I, int32_t threads)
+{
+ if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || ((r != n) && ((r < 2) || ((r & (r - 1)) != 0))) || (I == NULL) || (threads < 0))
+ {
+ return -1;
+ }
+ else if (n <= 1)
+ {
+ if (I[0] != n) { return -1; }
+ if (n == 1) { U[0] = T[0]; }
+ return 0;
+ }
+
+ fast_sint_t t; for (t = 0; t <= (n - 1) / r; ++t) { if (I[t] <= 0 || I[t] > n) { return -1; } }
+
+ threads = threads > 0 ? threads : omp_get_max_threads();
+ return libsais16_unbwt_main(T, U, (sa_uint_t *)A, n, freq, r, (const sa_uint_t *)I, threads);
+}
+
+#endif
diff --git a/libsais/libsais16.h b/libsais/libsais16.h
new file mode 100644
index 0000000..c577058
--- /dev/null
+++ b/libsais/libsais16.h
@@ -0,0 +1,285 @@
+/*--
+
+This file is a part of libsais, a library for linear time
+suffix array and burrows wheeler transform construction.
+
+ Copyright (c) 2021 Ilya Grebnov
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+
+Please see the file LICENSE for full copyright information.
+
+--*/
+
+#ifndef LIBSAIS16_H
+#define LIBSAIS16_H 1
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+ #include <stdint.h>
+
+ /**
+ * Creates the libsais16 context that allows reusing allocated memory with each libsais16 operation.
+ * In multi-threaded environments, use one context per thread for parallel executions.
+ * @return the libsais16 context, NULL otherwise.
+ */
+ void * libsais16_create_ctx(void);
+
+#if defined(_OPENMP)
+ /**
+ * Creates the libsais16 context that allows reusing allocated memory with each parallel libsais16 operation using OpenMP.
+ * In multi-threaded environments, use one context per thread for parallel executions.
+ * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default).
+ * @return the libsais16 context, NULL otherwise.
+ */
+ void * libsais16_create_ctx_omp(int32_t threads);
+#endif
+
+ /**
+ * Destroys the libsais16 context and frees previously allocated memory.
+ * @param ctx The libsais16 context (can be NULL).
+ */
+ void libsais16_free_ctx(void * ctx);
+
+ /**
+ * Constructs the suffix array of a given 16-bit string.
+ * @param T [0..n-1] The input 16-bit string.
+ * @param SA [0..n-1+fs] The output array of suffixes.
+ * @param n The length of the given 16-bit string.
+ * @param fs The extra space available at the end of SA array (can be 0).
+ * @param freq [0..65535] The output 16-bit symbol frequency table (can be NULL).
+ * @return 0 if no error occurred, -1 or -2 otherwise.
+ */
+ int32_t libsais16(const uint16_t * T, int32_t * SA, int32_t n, int32_t fs, int32_t * freq);
+
+ /**
+ * Constructs the suffix array of a given 16-bit string using libsais16 context.
+ * @param ctx The libsais16 context.
+ * @param T [0..n-1] The input 16-bit string.
+ * @param SA [0..n-1+fs] The output array of suffixes.
+ * @param n The length of the given 16-bit string.
+ * @param fs The extra space available at the end of SA array (can be 0).
+ * @param freq [0..65535] The output 16-bit symbol frequency table (can be NULL).
+ * @return 0 if no error occurred, -1 or -2 otherwise.
+ */
+ int32_t libsais16_ctx(const void * ctx, const uint16_t * T, int32_t * SA, int32_t n, int32_t fs, int32_t * freq);
+
+#if defined(_OPENMP)
+ /**
+ * Constructs the suffix array of a given 16-bit string in parallel using OpenMP.
+ * @param T [0..n-1] The input 16-bit string.
+ * @param SA [0..n-1+fs] The output array of suffixes.
+ * @param n The length of the given 16-bit string.
+ * @param fs The extra space available at the end of SA array (can be 0).
+ * @param freq [0..65535] The output 16-bit symbol frequency table (can be NULL).
+ * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default).
+ * @return 0 if no error occurred, -1 or -2 otherwise.
+ */
+ int32_t libsais16_omp(const uint16_t * T, int32_t * SA, int32_t n, int32_t fs, int32_t * freq, int32_t threads);
+#endif
+
+ /**
+ * Constructs the burrows-wheeler transformed 16-bit string of a given 16-bit string.
+ * @param T [0..n-1] The input 16-bit string.
+ * @param U [0..n-1] The output 16-bit string (can be T).
+ * @param A [0..n-1+fs] The temporary array.
+ * @param n The length of the given 16-bit string.
+ * @param fs The extra space available at the end of A array (can be 0).
+ * @param freq [0..65535] The output 16-bit symbol frequency table (can be NULL).
+ * @return The primary index if no error occurred, -1 or -2 otherwise.
+ */
+ int32_t libsais16_bwt(const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq);
+
+ /**
+ * Constructs the burrows-wheeler transformed 16-bit string of a given 16-bit string with auxiliary indexes.
+ * @param T [0..n-1] The input 16-bit string.
+ * @param U [0..n-1] The output 16-bit string (can be T).
+ * @param A [0..n-1+fs] The temporary array.
+ * @param n The length of the given 16-bit string.
+ * @param fs The extra space available at the end of A array (can be 0).
+ * @param freq [0..65535] The output 16-bit symbol frequency table (can be NULL).
+ * @param r The sampling rate for auxiliary indexes (must be power of 2).
+ * @param I [0..(n-1)/r] The output auxiliary indexes.
+ * @return 0 if no error occurred, -1 or -2 otherwise.
+ */
+ int32_t libsais16_bwt_aux(const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t r, int32_t * I);
+
+ /**
+ * Constructs the burrows-wheeler transformed 16-bit string of a given 16-bit string using libsais16 context.
+ * @param ctx The libsais16 context.
+ * @param T [0..n-1] The input 16-bit string.
+ * @param U [0..n-1] The output 16-bit string (can be T).
+ * @param A [0..n-1+fs] The temporary array.
+ * @param n The length of the given 16-bit string.
+ * @param fs The extra space available at the end of A array (can be 0).
+ * @param freq [0..65535] The output 16-bit symbol frequency table (can be NULL).
+ * @return The primary index if no error occurred, -1 or -2 otherwise.
+ */
+ int32_t libsais16_bwt_ctx(const void * ctx, const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq);
+
+ /**
+ * Constructs the burrows-wheeler transformed 16-bit string of a given 16-bit string with auxiliary indexes using libsais16 context.
+ * @param ctx The libsais16 context.
+ * @param T [0..n-1] The input 16-bit string.
+ * @param U [0..n-1] The output 16-bit string (can be T).
+ * @param A [0..n-1+fs] The temporary array.
+ * @param n The length of the given 16-bit string.
+ * @param fs The extra space available at the end of A array (can be 0).
+ * @param freq [0..65535] The output 16-bit symbol frequency table (can be NULL).
+ * @param r The sampling rate for auxiliary indexes (must be power of 2).
+ * @param I [0..(n-1)/r] The output auxiliary indexes.
+ * @return 0 if no error occurred, -1 or -2 otherwise.
+ */
+ int32_t libsais16_bwt_aux_ctx(const void * ctx, const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t r, int32_t * I);
+
+#if defined(_OPENMP)
+ /**
+ * Constructs the burrows-wheeler transformed 16-bit string of a given 16-bit string in parallel using OpenMP.
+ * @param T [0..n-1] The input 16-bit string.
+ * @param U [0..n-1] The output 16-bit string (can be T).
+ * @param A [0..n-1+fs] The temporary array.
+ * @param n The length of the given 16-bit string.
+ * @param fs The extra space available at the end of A array (can be 0).
+ * @param freq [0..65535] The output 16-bit symbol frequency table (can be NULL).
+ * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default).
+ * @return The primary index if no error occurred, -1 or -2 otherwise.
+ */
+ int32_t libsais16_bwt_omp(const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t threads);
+
+ /**
+ * Constructs the burrows-wheeler transformed 16-bit string of a given 16-bit string with auxiliary indexes in parallel using OpenMP.
+ * @param T [0..n-1] The input 16-bit string.
+ * @param U [0..n-1] The output 16-bit string (can be T).
+ * @param A [0..n-1+fs] The temporary array.
+ * @param n The length of the given 16-bit string.
+ * @param fs The extra space available at the end of A array (can be 0).
+ * @param freq [0..65535] The output 16-bit symbol frequency table (can be NULL).
+ * @param r The sampling rate for auxiliary indexes (must be power of 2).
+ * @param I [0..(n-1)/r] The output auxiliary indexes.
+ * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default).
+ * @return 0 if no error occurred, -1 or -2 otherwise.
+ */
+ int32_t libsais16_bwt_aux_omp(const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t r, int32_t * I, int32_t threads);
+#endif
+
+ /**
+ * Creates the libsais16 reverse BWT context that allows reusing allocated memory with each libsais16_unbwt_* operation.
+ * In multi-threaded environments, use one context per thread for parallel executions.
+ * @return the libsais16 context, NULL otherwise.
+ */
+ void * libsais16_unbwt_create_ctx(void);
+
+#if defined(_OPENMP)
+ /**
+ * Creates the libsais16 reverse BWT context that allows reusing allocated memory with each parallel libsais16_unbwt_* operation using OpenMP.
+ * In multi-threaded environments, use one context per thread for parallel executions.
+ * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default).
+ * @return the libsais16 context, NULL otherwise.
+ */
+ void * libsais16_unbwt_create_ctx_omp(int32_t threads);
+#endif
+
+ /**
+ * Destroys the libsais16 reverse BWT context and frees previously allocated memory.
+ * @param ctx The libsais16 context (can be NULL).
+ */
+ void libsais16_unbwt_free_ctx(void * ctx);
+
+ /**
+ * Constructs the original 16-bit string from a given burrows-wheeler transformed 16-bit string with primary index.
+ * @param T [0..n-1] The input 16-bit string.
+ * @param U [0..n-1] The output 16-bit string (can be T).
+ * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1 size).
+ * @param n The length of the given 16-bit string.
+ * @param freq [0..65535] The input 16-bit symbol frequency table (can be NULL).
+ * @param i The primary index.
+ * @return 0 if no error occurred, -1 or -2 otherwise.
+ */
+ int32_t libsais16_unbwt(const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t i);
+
+ /**
+ * Constructs the original 16-bit string from a given burrows-wheeler transformed 16-bit string with primary index using libsais16 reverse BWT context.
+ * @param ctx The libsais16 reverse BWT context.
+ * @param T [0..n-1] The input 16-bit string.
+ * @param U [0..n-1] The output 16-bit string (can be T).
+ * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1 size).
+ * @param n The length of the given 16-bit string.
+ * @param freq [0..65535] The input 16-bit symbol frequency table (can be NULL).
+ * @param i The primary index.
+ * @return 0 if no error occurred, -1 or -2 otherwise.
+ */
+ int32_t libsais16_unbwt_ctx(const void * ctx, const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t i);
+
+ /**
+ * Constructs the original 16-bit string from a given burrows-wheeler transformed 16-bit string with auxiliary indexes.
+ * @param T [0..n-1] The input 16-bit string.
+ * @param U [0..n-1] The output 16-bit string (can be T).
+ * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1 size).
+ * @param n The length of the given 16-bit string.
+ * @param freq [0..65535] The input 16-bit symbol frequency table (can be NULL).
+ * @param r The sampling rate for auxiliary indexes (must be power of 2).
+ * @param I [0..(n-1)/r] The input auxiliary indexes.
+ * @return 0 if no error occurred, -1 or -2 otherwise.
+ */
+ int32_t libsais16_unbwt_aux(const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t r, const int32_t * I);
+
+ /**
+ * Constructs the original 16-bit string from a given burrows-wheeler transformed 16-bit string with auxiliary indexes using libsais16 reverse BWT context.
+ * @param ctx The libsais16 reverse BWT context.
+ * @param T [0..n-1] The input 16-bit string.
+ * @param U [0..n-1] The output 16-bit string (can be T).
+ * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1 size).
+ * @param n The length of the given 16-bit string.
+ * @param freq [0..65535] The input 16-bit symbol frequency table (can be NULL).
+ * @param r The sampling rate for auxiliary indexes (must be power of 2).
+ * @param I [0..(n-1)/r] The input auxiliary indexes.
+ * @return 0 if no error occurred, -1 or -2 otherwise.
+ */
+ int32_t libsais16_unbwt_aux_ctx(const void * ctx, const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t r, const int32_t * I);
+
+#if defined(_OPENMP)
+ /**
+ * Constructs the original 16-bit string from a given burrows-wheeler transformed 16-bit string with primary index in parallel using OpenMP.
+ * @param T [0..n-1] The input 16-bit string.
+ * @param U [0..n-1] The output 16-bit string (can be T).
+ * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1 size).
+ * @param n The length of the given 16-bit string.
+ * @param freq [0..65535] The input 16-bit symbol frequency table (can be NULL).
+ * @param i The primary index.
+ * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default).
+ * @return 0 if no error occurred, -1 or -2 otherwise.
+ */
+ int32_t libsais16_unbwt_omp(const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t i, int32_t threads);
+
+ /**
+ * Constructs the original 16-bit string from a given burrows-wheeler transformed 16-bit string with auxiliary indexes in parallel using OpenMP.
+ * @param T [0..n-1] The input 16-bit string.
+ * @param U [0..n-1] The output 16-bit string (can be T).
+ * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1 size).
+ * @param n The length of the given 16-bit string.
+ * @param freq [0..65535] The input 16-bit symbol frequency table (can be NULL).
+ * @param r The sampling rate for auxiliary indexes (must be power of 2).
+ * @param I [0..(n-1)/r] The input auxiliary indexes.
+ * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default).
+ * @return 0 if no error occurred, -1 or -2 otherwise.
+ */
+ int32_t libsais16_unbwt_aux_omp(const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t r, const int32_t * I, int32_t threads);
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/libsais/libsais_internal.h b/libsais/libsais_internal.h
new file mode 100644
index 0000000..d11a213
--- /dev/null
+++ b/libsais/libsais_internal.h
@@ -0,0 +1,49 @@
+/*--
+
+This file is a part of libsais, a library for linear time
+suffix array and burrows wheeler transform construction.
+
+ Copyright (c) 2021 Ilya Grebnov
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+
+Please see the file LICENSE for full copyright information.
+
+--*/
+
+#ifndef LIBSAIS_INTERNAL_H
+#define LIBSAIS_INTERNAL_H 1
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+ #include <stdint.h>
+
+ /**
+ * Internal method to construct suffix array of an integer array.
+ * Note, during suffix array construction input array will be modified and restored at the end if no error occurred.
+ * @param T [0..n-1] The input integer array.
+ * @param SA [0..n-1+fs] The output array of suffixes.
+ * @param n The length of the integer array.
+ * @param k The alphabet size of the input integer array.
+ * @param fs Extra space available at the end of SA array (can be 0).
+ * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default).
+ * @return 0 if no error occurred, -1 or -2 otherwise.
+ */
+ int32_t libsais_main_32s_internal(int32_t * T, int32_t * SA, int32_t n, int32_t k, int32_t fs, int32_t threads);
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/m03_model.h b/m03_model.h
new file mode 100644
index 0000000..0134b5b
--- /dev/null
+++ b/m03_model.h
@@ -0,0 +1,408 @@
+/*--
+
+This file is a part of bsc-m03 project.
+
+ Copyright (c) 2021 Ilya Grebnov
+
+ bsc-m03 is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ bsc-m03 is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with bsc-m03. If not, see <https://www.gnu.org/licenses/>.
+
+--*/
+
+#pragma once
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "common/platform.h"
+#include "common/rangecoder.h"
+
+#include <algorithm>
+
+#pragma warning( push )
+#pragma warning( disable : 6385 )
+#pragma warning( disable : 6386 )
+
+enum class m03_mode : int { encoding = 0, decoding = 1, };
+
+class m03_model
+{
+protected:
+ m03_mode mode;
+
+ void initialize_model(RangeCoder * coder, m03_mode mode)
+ {
+ this->coder = coder;
+ this->mode = mode;
+
+ for (int32_t s = 0; s < 1536; ++s) { T1_model[s][0] = T1_model[s][1] = 1; }
+ for (int32_t s = 0; s < 1536; ++s) { T2_model[s][0] = T2_model[s][1] = T2_model[s][2] = T2_model[s][3] = 1; }
+ for (int32_t s = 0; s < 768 ; ++s) { Ternary_model[s][0] = Ternary_model[s][1] = Ternary_model[s][2] = Ternary_model[s][3] = 1; }
+ for (int32_t s = 0; s < 96 ; ++s) { for (int32_t c = 0; c < 16; ++c) { Tree_model[s][c] = 1; } }
+ }
+
+ void encode_root_frequencies(const int32_t * root_frequencies, int32_t k, int32_t n)
+ {
+ int64_t bit_freq[33];
+ int64_t bit_freq_sum[33];
+
+ {
+ int64_t remaining_min = n, remaining_max = n, remaining_count = k;
+
+ memset(bit_freq, 0, sizeof(bit_freq));
+
+ for (ptrdiff_t p = 0; p < k; ++p)
+ {
+ bit_freq[bit_scan_reverse(root_frequencies[p] + 1)]++;
+ }
+
+ for (ptrdiff_t bit = 0; bit <= 32 && remaining_count > 0; ++bit)
+ {
+ int64_t min_value = (1ll << (bit + 0)) - 1;
+ int64_t max_value = (1ll << (bit + 1)) - 2;
+ int64_t min = std::max(remaining_count - (remaining_max / (max_value + 1)), 0ll);
+ int64_t max = remaining_count * max_value < remaining_min ? remaining_count - 1 : remaining_count;
+
+ this->coder->EncodeValue((unsigned int)min, (unsigned int)bit_freq[bit], (unsigned int)max);
+
+ remaining_min -= bit_freq[bit] * max_value;
+ remaining_max -= bit_freq[bit] * min_value;
+ remaining_count -= bit_freq[bit];
+ }
+ }
+
+ {
+ int64_t bit_sum = 0, remaining_min = 0, remaining_max = 0, remaining_total = n;
+
+ for (ptrdiff_t bit = 32; bit >= 0; --bit)
+ {
+ int64_t min_value = (1ll << (bit + 0)) - 1;
+ int64_t max_value = (1ll << (bit + 1)) - 2;
+
+ bit_freq_sum[bit] = bit_sum; bit_sum += bit_freq[bit];
+
+ remaining_min += min_value * bit_freq[bit];
+ remaining_max += max_value * bit_freq[bit];
+ }
+
+ for (ptrdiff_t p = 0; p < k; ++p)
+ {
+ int32_t bit = bit_scan_reverse(root_frequencies[p] + 1);
+
+ {
+ for (ptrdiff_t b = 0; b < bit; ++b)
+ {
+ if (bit_freq[b] > 0)
+ {
+ this->coder->Encode((unsigned int)bit_freq[b], (unsigned int)bit_freq_sum[b], (unsigned int)(bit_freq[b] + bit_freq_sum[b]));
+ }
+
+ assert(bit_freq_sum[b] > 0); bit_freq_sum[b]--;
+ }
+
+ if (bit_freq_sum[bit] > 0)
+ {
+ this->coder->Encode(0, (unsigned int)bit_freq[bit], (unsigned int)(bit_freq[bit] + bit_freq_sum[bit]));
+ }
+
+ assert(bit_freq[bit] > 0); bit_freq[bit]--;
+ }
+
+ {
+ int64_t min_value = (1ll << (bit + 0)) - 1;
+ int64_t max_value = (1ll << (bit + 1)) - 2;
+ remaining_min -= min_value;
+ remaining_max -= max_value;
+ int64_t min = std::max(min_value, remaining_total - remaining_max);
+ int64_t max = std::min(max_value, remaining_total - remaining_min);
+
+ this->coder->EncodeValue((unsigned int)min, (unsigned int)root_frequencies[p], (unsigned int)max);
+
+ remaining_total -= root_frequencies[p];
+ }
+ }
+ }
+ }
+
+ void decode_root_frequencies(int32_t * root_frequencies, int32_t k, int32_t n)
+ {
+ int64_t bit_freq[33];
+ int64_t bit_freq_sum[33];
+
+ {
+ int64_t remaining_min = n, remaining_max = n, remaining_count = k;
+
+ memset(bit_freq, 0, sizeof(bit_freq));
+
+ for (ptrdiff_t bit = 0; bit <= 32 && remaining_count > 0; ++bit)
+ {
+ int64_t min_value = (1ll << (bit + 0)) - 1;
+ int64_t max_value = (1ll << (bit + 1)) - 2;
+ int64_t min = std::max(remaining_count - (remaining_max / (max_value + 1)), 0ll);
+ int64_t max = remaining_count * max_value < remaining_min ? remaining_count - 1 : remaining_count;
+
+ bit_freq[bit] = this->coder->DecodeValue((unsigned int)min, (unsigned int)max);
+
+ remaining_min -= bit_freq[bit] * max_value;
+ remaining_max -= bit_freq[bit] * min_value;
+ remaining_count -= bit_freq[bit];
+ }
+ }
+
+ {
+ int64_t bit_sum = 0, remaining_min = 0, remaining_max = 0, remaining_total = n;
+
+ for (ptrdiff_t bit = 32; bit >= 0; --bit)
+ {
+ int64_t min_value = (1ll << (bit + 0)) - 1;
+ int64_t max_value = (1ll << (bit + 1)) - 2;
+
+ bit_freq_sum[bit] = bit_sum; bit_sum += bit_freq[bit];
+
+ remaining_min += min_value * bit_freq[bit];
+ remaining_max += max_value * bit_freq[bit];
+ }
+
+ for (ptrdiff_t p = 0; p < k; ++p)
+ {
+ int32_t bit = 0;
+
+ while (bit_freq_sum[bit] > 0)
+ {
+ if (bit_freq[bit] > 0)
+ {
+ unsigned int cum_freq = this->coder->GetCumFreq((unsigned int)(bit_freq[bit] + bit_freq_sum[bit]));
+ if (cum_freq < bit_freq[bit])
+ {
+ this->coder->Decode(0, (unsigned int)bit_freq[bit], (unsigned int)(bit_freq[bit] + bit_freq_sum[bit]));
+ break;
+ }
+ else
+ {
+ this->coder->Decode((unsigned int)bit_freq[bit], (unsigned int)bit_freq_sum[bit], (unsigned int)(bit_freq[bit] + bit_freq_sum[bit]));;
+ }
+ }
+
+ bit_freq_sum[bit]--; bit++;
+ }
+
+ assert(bit_freq[bit] > 0); bit_freq[bit]--;
+
+ {
+ int64_t min_value = (1ll << (bit + 0)) - 1;
+ int64_t max_value = (1ll << (bit + 1)) - 2;
+ remaining_min -= min_value;
+ remaining_max -= max_value;
+ int64_t min = std::max(min_value, remaining_total - remaining_max);
+ int64_t max = std::min(max_value, remaining_total - remaining_min);
+
+ root_frequencies[p] = this->coder->DecodeValue((unsigned int)min, (unsigned int)max);
+
+ remaining_total -= root_frequencies[p];
+ }
+ }
+ }
+ }
+
+ int32_t predict(int32_t count, int32_t total, int32_t left_remaining, int32_t right_remaining, int32_t symbols_remaining)
+ {
+ int32_t inferred_right = std::max(total - left_remaining, 0);
+ right_remaining -= inferred_right; total -= inferred_right;
+
+ assert(total <= right_remaining);
+
+ if (total > 0)
+ {
+ if (total <= 2)
+ {
+ int32_t state = 0;
+ state += 1 * (std::min((int32_t)symbols_remaining - 2, 5));
+ state += 8 * (std::min((int32_t)bit_scan_reverse(inferred_right + 1), 3));
+ state += 32 * (left_remaining + right_remaining == symbols_remaining);
+ state += 64 * (left_remaining == total);
+ state += 128 * (((int64_t)left_remaining * 11) / ((int64_t)right_remaining));
+
+ if (total == 1)
+ {
+ static const int threshold[12] = { 147, 251, 374, 540, 761, 763, 1589, 2275, 2193, 3457, 3811, 1017 };
+
+ uint16_t * RESTRICT predictor = &this->T1_model[state][0];
+
+ if (predictor[0] + predictor[1] > threshold[state >> 7])
+ {
+ predictor[0] = (predictor[0] + (predictor[0] < 2)) >> 1;
+ predictor[1] = (predictor[1] + (predictor[1] < 2)) >> 1;
+ }
+
+ if (this->mode == m03_mode::encoding)
+ {
+ this->coder->Encode(count ? predictor[0] : 0, predictor[count], predictor[0] + predictor[1]);
+ }
+ else
+ {
+ unsigned int cum_freq = this->coder->GetCumFreq(predictor[0] + predictor[1]);
+
+ count = cum_freq >= predictor[0];
+ this->coder->Decode(count ? predictor[0] : 0, predictor[count], predictor[0] + predictor[1]);
+ }
+
+ predictor[count]++;
+ }
+ else
+ {
+ static const int threshold[12] = { 149, 221, 255, 287, 292, 343, 494, 396, 655, 820, 2984, 225 };
+
+ uint16_t * RESTRICT predictor = &this->T2_model[state][0];
+
+ if (predictor[0] + predictor[1] + predictor[2] > threshold[state >> 7])
+ {
+ predictor[0] = (predictor[0] + (predictor[0] < 2)) >> 1;
+ predictor[1] = (predictor[1] + (predictor[1] < 2)) >> 1;
+ predictor[2] = (predictor[2] + (predictor[2] < 2)) >> 1;
+ }
+
+ if (this->mode == m03_mode::encoding)
+ {
+ unsigned int cum_freq = count == 0 ? 0 : count == 1 ? predictor[0] : predictor[0] + predictor[1];
+ this->coder->Encode(cum_freq, predictor[count], predictor[0] + predictor[1] + predictor[2]);
+ }
+ else
+ {
+ unsigned int cum_freq = this->coder->GetCumFreq(predictor[0] + predictor[1] + predictor[2]);
+
+ count = (cum_freq >= predictor[0]) + (cum_freq >= (unsigned int)(predictor[0] + predictor[1]));
+ cum_freq = count == 0 ? 0 : count == 1 ? predictor[0] : predictor[0] + predictor[1];
+
+ this->coder->Decode(cum_freq, predictor[count], predictor[0] + predictor[1] + predictor[2]);
+ }
+
+ predictor[count]++;
+ }
+ }
+ else
+ {
+ int32_t pivot = (count > 0) + (count == total);
+
+ {
+ static const int threshold[48] =
+ {
+ 142, 129, 115, 89 , 70 , 59 , 53 , 44,
+ 243, 167, 132, 105, 98 , 109, 107, 134,
+ 247, 200, 162, 134, 137, 149, 201, 262,
+ 339, 253, 184, 171, 235, 288, 299, 348,
+ 512, 396, 178, 357, 466, 484, 697, 587,
+ 220, 157, 144, 167, 219, 141, 228, 1076,
+ };
+
+ int32_t state = 0;
+ state += 1 * (std::min((int32_t)bit_scan_reverse(symbols_remaining - 1), 3));
+ state += 4 * (inferred_right > 0);
+ state += 8 * (left_remaining == total);
+ state += 16 * (std::min((int32_t)bit_scan_reverse(total - 2), 7));
+ state += 128 * (((int64_t)left_remaining * 9 + right_remaining) / ((int64_t)right_remaining * 2));
+
+ uint16_t * RESTRICT predictor = &this->Ternary_model[state][0];
+
+ if (predictor[0] + predictor[1] + predictor[2] > threshold[state >> 4])
+ {
+ predictor[0] = (predictor[0] + (predictor[0] < 2)) >> 1;
+ predictor[1] = (predictor[1] + (predictor[1] < 2)) >> 1;
+ predictor[2] = (predictor[2] + (predictor[2] < 2)) >> 1;
+ }
+
+ if (this->mode == m03_mode::encoding)
+ {
+ unsigned int cum_freq = pivot == 0 ? 0 : pivot == 1 ? predictor[0] : predictor[0] + predictor[1];
+ this->coder->Encode(cum_freq, predictor[pivot], predictor[0] + predictor[1] + predictor[2]);
+ }
+ else
+ {
+ unsigned int cum_freq = this->coder->GetCumFreq(predictor[0] + predictor[1] + predictor[2]);
+
+ pivot = (cum_freq >= predictor[0]) + (cum_freq >= (unsigned int)(predictor[0] + predictor[1]));
+ cum_freq = pivot == 0 ? 0 : pivot == 1 ? predictor[0] : predictor[0] + predictor[1];
+
+ this->coder->Decode(cum_freq, predictor[pivot], predictor[0] + predictor[1] + predictor[2]);
+ }
+
+ predictor[pivot]++; if (pivot != 1) { count = pivot == 0 ? 0 : total; }
+ }
+
+ if (pivot == 1)
+ {
+ static const int threshold[48] =
+ {
+ 275 , 167 , 218 , 163, 200, 123, 143, 61,
+ 515 , 335 , 344 , 268, 320, 244, 235, 85,
+ 863 , 474 , 527 , 387, 401, 298, 263, 107,
+ 1920, 968 , 629 , 500, 554, 286, 358, 121,
+ 3655, 1157, 1021, 623, 591, 365, 317, 109,
+ 2922, 249 , 776 , 159, 537, 133, 253, 158,
+ };
+
+ int32_t state = 0;
+ state += 1 * (inferred_right >= total);
+ state += 2 * (std::min(total - 3, 7));
+ state += 16 * (((int64_t)left_remaining * 5) / ((int64_t)right_remaining));
+
+ int32_t min = 1, max = total - 1, context = 1;
+ while (min != max && context < 8)
+ {
+ uint16_t * RESTRICT predictor = &this->Tree_model[state][2 * context];
+
+ if (predictor[0] + predictor[1] > threshold[state >> 1])
+ {
+ predictor[0] = (predictor[0] + (predictor[0] < 2)) >> 1;
+ predictor[1] = (predictor[1] + (predictor[1] < 2)) >> 1;
+ }
+
+ int32_t median = min + ((max - min + 1) >> 1), bit = count >= median;
+
+ if (this->mode == m03_mode::encoding)
+ {
+ this->coder->Encode(bit ? predictor[0] : 0, predictor[bit], predictor[0] + predictor[1]);
+ }
+ else
+ {
+ unsigned int cum_freq = this->coder->GetCumFreq(predictor[0] + predictor[1]);
+
+ bit = cum_freq >= predictor[0];
+ this->coder->Decode(bit ? predictor[0] : 0, predictor[bit], predictor[0] + predictor[1]);
+ }
+
+ predictor[bit]++; context += context + bit; min = bit ? median : min; max = bit ? max : median - 1;
+ }
+
+ count = this->mode == m03_mode::encoding
+ ? this->coder->EncodeValue(min, count, max)
+ : this->coder->DecodeValue(min, max);
+ }
+ }
+
+ return count;
+ }
+
+ return 0;
+ }
+
+private:
+ RangeCoder * coder;
+
+ uint16_t T1_model[1536][2];
+ uint16_t T2_model[1536][4];
+ uint16_t Ternary_model[768][4];
+ uint16_t Tree_model[96][16];
+};
+
+#pragma warning( pop )
\ No newline at end of file
diff --git a/m03_parser.h b/m03_parser.h
new file mode 100644
index 0000000..e296a13
--- /dev/null
+++ b/m03_parser.h
@@ -0,0 +1,709 @@
+/*--
+
+This file is a part of bsc-m03 project.
+
+ Copyright (c) 2021 Ilya Grebnov
+
+ bsc-m03 is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ bsc-m03 is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+    along with bsc-m03.  If not, see <https://www.gnu.org/licenses/>.
+
+--*/
+
+#pragma once
+
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <algorithm>
+
+#include "common/platform.h"
+#include "common/rangecoder.h"
+
+#include "hutucker/hu-tucker.h"
+
+#include "m03_model.h"
+
+#define OPTIMAL_ABT_SMALL_THRESHOLD (7)
+#define OPTIMAL_ABT_LARGE_THRESHOLD (257)
+
+#pragma warning( push )
+#pragma warning( disable : 6385 )
+#pragma warning( disable : 6386 )
+
+#pragma pack(push, 1)
+
+/* One bookkeeping record per distinct symbol inside a BWT context interval.
+   Packed to 10 bytes (see pragma pack(push, 1) above) to keep the per-position
+   contexts array compact. */
+typedef struct symbol_context
+{
+    int32_t  count;   /* Occurrences of 'symbol' remaining in this context. */
+    int32_t  offset;  /* Start position of this symbol's run in the contexts array. */
+    uint16_t symbol;  /* The alphabet symbol itself (16-bit alphabets supported). */
+} symbol_context;
+
+#pragma pack(pop)
+
+/* Growable array of context-start offsets. Used as a work queue by the parser:
+   offsets are appended during one refinement pass, sorted, then consumed on the
+   next pass. Plain malloc/realloc storage; destroy() must be called to free it. */
+typedef struct offset_queue
+{
+    int32_t * offsets;   // backing storage (heap)
+    ptrdiff_t count;     // number of offsets currently stored
+    ptrdiff_t size;      // allocated capacity, in elements
+
+    /* Allocates initial capacity; returns false on allocation failure. */
+    bool initialize(ptrdiff_t size)
+    {
+        this->count = 0;
+        this->size = size;
+        this->offsets = (int32_t *)malloc(this->size * sizeof(int32_t));
+
+        return this->offsets != NULL;
+    }
+
+    /* Appends one offset, growing the buffer when full.
+       NOTE(review): the resize() result is not checked — if realloc fails this
+       stores NULL, leaks the old buffer, and the write below dereferences NULL.
+       There is no failure channel in this interface; TODO: propagate the error. */
+    INLINE void push_offset(const int32_t offset)
+    {
+        if (this->count == this->size)
+        {
+            this->offsets = this->resize();
+        }
+
+        this->offsets[this->count++] = offset;
+    }
+
+    /* Logically empties the queue; keeps the allocation for reuse. */
+    INLINE void reset() { this->count = 0; }
+
+    /* Sorts stored offsets ascending (stable, though duplicates are not expected). */
+    INLINE void sort() { std::stable_sort(this->offsets, this->offsets + this->count); }
+
+    /* Doubles capacity; kept out-of-line (NOINLINE) to keep push_offset small. */
+    NOINLINE int32_t * resize()
+    {
+        return (int32_t *)realloc(this->offsets, (this->size += this->size) * sizeof(int32_t));
+    }
+
+    void destroy()
+    {
+        if (this->offsets != NULL) { free(this->offsets); this->offsets = NULL; }
+    }
+
+} offset_queue;
+
+class m03_parser: m03_model
+{
+public:
+
+    /* Binds the parser to a BWT string and allocates all working storage.
+       L               : BWT output, n symbols (decoded in place in decoding mode)
+       primary_index   : BWT primary index (position of the implicit sentinel)
+       root_frequencies: order-0 frequency table for k symbols
+       coder, mode     : range coder and encode/decode direction, passed to the model
+       Returns false (after releasing partial allocations) on any malloc failure. */
+    bool initialize(uint16_t * L, int32_t n, int32_t primary_index, int32_t * root_frequencies, int32_t k, RangeCoder * coder, m03_mode mode)
+    {
+        // C-style zero-init of every member; assumes m03_parser has no virtual
+        // functions (none are visible) so there is no vtable pointer to clobber.
+        memset(this, 0, sizeof(m03_parser));
+
+        this->L = L;
+        this->n = n;
+        this->primary_index = primary_index;
+        this->root_frequencies = root_frequencies;
+        this->k = k;
+
+        // One symbol_context record per position of L.
+        if ((this->contexts = (symbol_context *)malloc(n * sizeof(symbol_context))) == NULL)
+        {
+            this->destroy();
+            return false;
+        }
+
+        // Scratch buffer for the Hu-Tucker code-length computation.
+        if ((this->hutucker_tmp = malloc(hutucker_tmp_size(MAX_ALPHABET_SIZE + 1))) == NULL)
+        {
+            this->destroy();
+            return false;
+        }
+
+        // Work queues start at a power-of-2 capacity proportional to n.
+        if (!current_segments.initialize(next_power_of_2(std::max(n / 4, 64))))
+        {
+            this->destroy();
+            return false;
+        }
+
+        if (!next_segments.initialize(next_power_of_2(std::max(n / 4, 64))))
+        {
+            this->destroy();
+            return false;
+        }
+
+        this->initialize_model(coder, mode);
+        this->initialize_alphabetic_tree_roots();
+
+        return true;
+    }
+
+    /* Drives a full encode or decode pass. Both directions run the same context
+       parse; encoding additionally transmits the root frequency table first and
+       then sanity-checks (via assert) that parsing fully resolved every position,
+       while decoding reconstructs L from the resolved per-position contexts. */
+    void run()
+    {
+        if (this->mode == m03_mode::encoding)
+        {
+            this->encode_root_frequencies(this->root_frequencies, this->k, this->n - 1);
+            this->initialize_root_context(this->root_frequencies);
+            this->parse_contexts();
+
+            // Post-condition: every non-primary position is a fully split
+            // single-symbol context matching the input string.
+            for (ptrdiff_t p = 0; p < n; ++p)
+            {
+                assert(p == this->primary_index || this->contexts[p].count == 1 );
+                assert(p == this->primary_index || this->contexts[p].symbol == L[p]);
+            }
+        }
+        else
+        {
+            this->decode_root_frequencies(this->root_frequencies, this->k, this->n - 1);
+            this->initialize_root_context(this->root_frequencies);
+            this->parse_contexts();
+
+            // Decoding output: each position's resolved symbol IS the string.
+            for (ptrdiff_t p = 0; p < n; ++p)
+            {
+                L[p] = this->contexts[p].symbol;
+            }
+        }
+    }
+
+    /* Releases all heap storage; safe to call on a partially initialized
+       parser (initialize() calls it on failure) and idempotent. */
+    void destroy()
+    {
+        if (this->contexts != NULL) { free(this->contexts); this->contexts = NULL; }
+        if (this->hutucker_tmp != NULL) { free(this->hutucker_tmp); this->hutucker_tmp = NULL; }
+
+        this->current_segments.destroy();
+        this->next_segments.destroy();
+    }
+
+private:
+
+    uint16_t * L;                 // BWT string (length n); overwritten in place when decoding
+    int32_t n;                    // length of L
+    int32_t primary_index;        // BWT primary index (implicit sentinel position)
+    int32_t * root_frequencies;   // order-0 frequency table, k entries
+    int32_t k;                    // alphabet size
+
+    symbol_context * contexts;    // n records; refined from one root context down to singletons
+    offset_queue current_segments; // context-start offsets processed at the current depth
+    offset_queue next_segments;   // offsets emitted for the next depth (sorted before swap)
+    void * hutucker_tmp;          // scratch for hutucker_get_lengths()
+
+    // Per-split scratch, sized for the full alphabet plus one slot.
+    int32_t parent_frequencies [MAX_ALPHABET_SIZE + 1];
+    int32_t left_frequencies [MAX_ALPHABET_SIZE + 1];
+    symbol_context left_contexts [MAX_ALPHABET_SIZE + 1];
+
+    // Dynamic-programming tables for optimal alphabetic split trees over
+    // offset ranges of size [OPTIMAL_ABT_SMALL_THRESHOLD, OPTIMAL_ABT_LARGE_THRESHOLD].
+    int32_t alphabetic_tree_keys[OPTIMAL_ABT_LARGE_THRESHOLD];     // gap size for each offset interval
+    int32_t alphabetic_tree_weight[OPTIMAL_ABT_LARGE_THRESHOLD];   // running weight of range starting at l
+    int64_t alphabetic_tree_cost[OPTIMAL_ABT_LARGE_THRESHOLD][OPTIMAL_ABT_LARGE_THRESHOLD]; // optimal cost of range [l, r]
+    uint8_t alphabetic_tree_root[OPTIMAL_ABT_LARGE_THRESHOLD][OPTIMAL_ABT_LARGE_THRESHOLD]; // chosen root of range [l, r]
+
+    /* Seeds the alphabetic-tree DP: every one-key range [l, l] and every
+       two-key range [l, l + 1] is trivially rooted at l. */
+    void initialize_alphabetic_tree_roots()
+    {
+        for (int32_t l = 0; l + 1 < OPTIMAL_ABT_LARGE_THRESHOLD; ++l)
+        {
+            this->alphabetic_tree_root[l][l]     = (uint8_t)l;
+            this->alphabetic_tree_root[l][l + 1] = (uint8_t)l;
+        }
+    }
+
+    /* Builds the root context covering the whole string from the order-0
+       frequency table, and queues one segment offset per present symbol.
+       total_symbols starts at 1 — one slot is reserved, presumably for the
+       implicit sentinel at the primary index (TODO confirm against the model). */
+    void initialize_root_context(const int32_t * root_frequencies)
+    {
+        int32_t unique_symbols = 0, total_symbols = 1;
+
+        this->current_segments.push_offset(0);
+
+        for (int32_t c = 0; c < this->k; ++c)
+        {
+            if (root_frequencies[c] > 0)
+            {
+                this->contexts[unique_symbols].count = root_frequencies[c];
+                this->contexts[unique_symbols].offset = total_symbols;
+                this->contexts[unique_symbols].symbol = c;
+
+                this->current_segments.push_offset(total_symbols);
+
+                unique_symbols++; total_symbols += root_frequencies[c];
+            }
+        }
+
+        // Sort records into the canonical order the model expects.
+        m03_parser::normalize_context(&this->contexts[0], unique_symbols, total_symbols);
+    }
+
+    /* Main refinement loop. Each iteration of the outer while processes one
+       "depth": it walks the sorted current_segments queue, groups consecutive
+       segment offsets that fall inside the same parent context, splits that
+       context across its segments, and collects the resulting child contexts
+       into next_segments for the following depth. Terminates when no context
+       produced children (all contexts resolved to single symbols). */
+    void parse_contexts()
+    {
+        while (this->current_segments.count > 0)
+        {
+            for (int32_t segment_start = 0; segment_start < this->current_segments.count;)
+            {
+                int32_t context_start = this->current_segments.offsets[segment_start];
+                int32_t context_end = context_start + this->contexts[context_start].count;
+                int32_t segment_end = segment_start + 1;
+
+                // Extend the group to every queued offset inside [context_start, context_end).
+                while (segment_end < this->current_segments.count && this->current_segments.offsets[segment_end] < context_end)
+                {
+                    segment_end++;
+                }
+
+                assert(context_end - context_start > 1);
+                assert(segment_end - segment_start > 1);
+
+                if (this->is_trivial_context(context_start))
+                {
+                    // Single-symbol context: children are determined without coding.
+                    m03_parser::split_trivial_context(this->contexts, this->next_segments, &this->current_segments.offsets[segment_start], &this->current_segments.offsets[segment_end]);
+                }
+                else
+                {
+                    // Snapshot parent frequencies, then recursively split across segments.
+                    m03_parser::populate_context_frequencies(&this->contexts[context_start], &this->contexts[this->primary_index], &this->parent_frequencies[0]);
+                    this->split_context_recursive(&this->current_segments.offsets[segment_start], &this->current_segments.offsets[segment_end]);
+                }
+
+                segment_start = segment_end;
+            }
+
+            // Children were pushed out of order; sort, then reuse the old queue.
+            this->next_segments.sort();
+            this->current_segments.reset();
+
+            std::swap(this->current_segments, this->next_segments);
+        }
+    }
+
+    /* Splits the context starting at offsets[0] across the segment boundaries
+       in [offsets, offsets_end) by recursive bisection. Strategy by width:
+       1 segment  -> leaf: queue children that differ from the parent snapshot;
+       trivial    -> cheap single-symbol split;
+       7..257     -> exact optimal alphabetic tree (DP);
+       otherwise  -> heuristic pivot (or the midpoint for 2 segments), then recurse. */
+    void split_context_recursive(const int32_t * offsets, const int32_t * offsets_end)
+    {
+        assert(offsets_end - offsets > 0);
+
+        if (offsets_end - offsets == 1)
+        {
+            m03_parser::populate_next_segments(&this->contexts[offsets[0]], &this->contexts[this->primary_index], &this->parent_frequencies[0], this->next_segments);
+            return;
+        }
+
+        if (this->is_trivial_context(offsets[0]))
+        {
+            m03_parser::split_trivial_context(this->contexts, this->next_segments, offsets, offsets_end);
+            return;
+        }
+
+        if (offsets_end - offsets >= OPTIMAL_ABT_SMALL_THRESHOLD && offsets_end - offsets <= OPTIMAL_ABT_LARGE_THRESHOLD)
+        {
+            this->build_optimal_alphabetic_tree(offsets, offsets_end);
+            this->traverse_alphabetic_tree(offsets, offsets_end, 0, (int32_t)(offsets_end - offsets) - 1);
+            return;
+        }
+
+        const int32_t * offsets_pivot = (offsets_end - offsets) > 2
+            ? this->choose_context_pivot_using_heuristic(offsets, offsets_end)
+            : &offsets[1];
+
+        this->split_context_by_pivot(offsets[0], offsets_pivot[0]);
+        this->split_context_recursive(offsets, offsets_pivot);
+        this->split_context_recursive(offsets_pivot, offsets_end);
+    }
+
+    /* Walks the precomputed optimal alphabetic tree for segment range [l, r]
+       (indices into offsets), splitting the context at each internal node's
+       root boundary, pre-order. Leaves and trivial contexts terminate as in
+       split_context_recursive. */
+    void traverse_alphabetic_tree(const int32_t * offsets, const int32_t * offsets_end, int32_t l, int32_t r)
+    {
+        assert(l <= r);
+
+        if (l == r)
+        {
+            m03_parser::populate_next_segments(&this->contexts[offsets[l]], &this->contexts[this->primary_index], &this->parent_frequencies[0], this->next_segments);
+            return;
+        }
+
+        if (this->is_trivial_context(offsets[l]))
+        {
+            m03_parser::split_trivial_context(this->contexts, this->next_segments, &offsets[l], &offsets[r + 1]);
+            return;
+        }
+
+        // Root of [l, r] was filled in by build_optimal_alphabetic_tree().
+        int32_t offsets_pivot = this->alphabetic_tree_root[l][r];
+
+        this->split_context_by_pivot(offsets[l], offsets[offsets_pivot + 1]);
+        this->traverse_alphabetic_tree(offsets, offsets_end, l, offsets_pivot);
+        this->traverse_alphabetic_tree(offsets, offsets_end, offsets_pivot + 1, r);
+    }
+
+    /* Picks a bisection pivot for a context spanning several segments.
+       For 3..6 segments the exact best pivot is found by enumerating the
+       pivot_cost* closed forms over the segment sizes (A, B, C, ...).
+       For > OPTIMAL_ABT_LARGE_THRESHOLD segments a near-optimal pivot is
+       derived from Hu-Tucker code lengths: the first segment whose root-level
+       bit is 1 marks the boundary between the tree's two top subtrees. */
+    const int32_t * choose_context_pivot_using_heuristic(const int32_t * offsets, const int32_t * offsets_end)
+    {
+        assert(offsets_end - offsets > 2);
+
+        int32_t context_begin = offsets[0];
+        int32_t context_end = offsets[0] + this->contexts[offsets[0]].count;
+        size_t offsets_count = offsets_end - offsets;
+
+        if (offsets_count == 3)
+        {
+            int64_t A = (int64_t)(offsets[1] ) - (int64_t)(context_begin);
+            int64_t C = (int64_t)(context_end) - (int64_t)(offsets[2]);
+
+            // Put the larger outer part alone; ties favor the later pivot.
+            return C <= A ? &offsets[1] : &offsets[2];
+        }
+        else if (offsets_count == 4)
+        {
+            int64_t A = (int64_t)(offsets[1] ) - (int64_t)(context_begin);
+            int64_t B = (int64_t)(offsets[2] ) - (int64_t)(offsets[1]);
+            int64_t C = (int64_t)(offsets[3] ) - (int64_t)(offsets[2]);
+            int64_t D = (int64_t)(context_end) - (int64_t)(offsets[3]);
+
+            const int32_t * offset1 = &offsets[1]; int64_t cost1 = pivot_cost3(B, C, D);
+            const int32_t * offset2 = &offsets[2]; int64_t cost2 = A + B + C + D;
+            const int32_t * offset3 = &offsets[3]; int64_t cost3 = pivot_cost3(A, B, C);
+
+            // Tie-breaking is deliberate (<= vs <) so both coder and decoder agree.
+            if (cost2 <= cost1) { offset1 = offset2; cost1 = cost2; }
+            if (cost3 <  cost1) { offset1 = offset3; }
+
+            return offset1;
+        }
+        else if (offsets_count == 5)
+        {
+            int64_t A = (int64_t)(offsets[1] ) - (int64_t)(context_begin);
+            int64_t B = (int64_t)(offsets[2] ) - (int64_t)(offsets[1]);
+            int64_t C = (int64_t)(offsets[3] ) - (int64_t)(offsets[2]);
+            int64_t D = (int64_t)(offsets[4] ) - (int64_t)(offsets[3]);
+            int64_t E = (int64_t)(context_end) - (int64_t)(offsets[4]);
+
+            const int32_t * offset1 = &offsets[1]; int64_t cost1 = pivot_cost4(B, C, D, E);
+            const int32_t * offset2 = &offsets[2]; int64_t cost2 = A + B + pivot_cost3(C, D, E);
+            const int32_t * offset3 = &offsets[3]; int64_t cost3 = pivot_cost3(A, B, C) + D + E;
+            const int32_t * offset4 = &offsets[4]; int64_t cost4 = pivot_cost4(A, B, C, D);
+
+            if (cost2 <= cost1) { offset1 = offset2; cost1 = cost2; }
+            if (cost3 <  cost1) { offset1 = offset3; cost1 = cost3; }
+            if (cost4 <  cost1) { offset1 = offset4; }
+
+            return offset1;
+        }
+        else if (offsets_count == 6)
+        {
+            int64_t A = (int64_t)(offsets[1] ) - (int64_t)(context_begin);
+            int64_t B = (int64_t)(offsets[2] ) - (int64_t)(offsets[1]);
+            int64_t C = (int64_t)(offsets[3] ) - (int64_t)(offsets[2]);
+            int64_t D = (int64_t)(offsets[4] ) - (int64_t)(offsets[3]);
+            int64_t E = (int64_t)(offsets[5] ) - (int64_t)(offsets[4]);
+            int64_t F = (int64_t)(context_end) - (int64_t)(offsets[5]);
+
+            const int32_t * offset1 = &offsets[1]; int64_t cost1 = pivot_cost5(B, C, D, E, F);
+            const int32_t * offset2 = &offsets[2]; int64_t cost2 = A + B + pivot_cost4(C, D, E, F);
+            const int32_t * offset3 = &offsets[3]; int64_t cost3 = pivot_cost3(A, B, C) + pivot_cost3(D, E, F);
+            const int32_t * offset4 = &offsets[4]; int64_t cost4 = pivot_cost4(A, B, C, D) + E + F;
+            const int32_t * offset5 = &offsets[5]; int64_t cost5 = pivot_cost5(A, B, C, D, E);
+
+            if (cost2 <= cost1) { offset1 = offset2; cost1 = cost2; }
+            if (cost3 <= cost1) { offset1 = offset3; cost1 = cost3; }
+            if (cost4 <  cost1) { offset1 = offset4; cost1 = cost4; }
+            if (cost5 <  cost1) { offset1 = offset5; }
+
+            return offset1;
+        }
+        else
+        {
+            // Widths 7..257 are handled by the exact DP before reaching here.
+            assert(offsets_count > OPTIMAL_ABT_LARGE_THRESHOLD);
+
+            {
+                // Reuse left_frequencies[] as the Hu-Tucker weight array:
+                // entry i = size of the i-th segment, computed right-to-left.
+                for (int32_t segment_end = context_end, offsets_index = (int32_t)offsets_count - 1; offsets_index >= 0; --offsets_index)
+                {
+                    int32_t segment_start = offsets[offsets_index];
+
+                    this->left_frequencies[offsets_index] = segment_end - segment_start; segment_end = segment_start;
+                }
+
+                // Replaces the weights in-place with Hu-Tucker code lengths.
+                hutucker_get_lengths(offsets_count, (unsigned long *)this->left_frequencies, this->hutucker_tmp);
+            }
+
+            {
+                // Walk the canonical alphabetic code: path[] holds the current
+                // codeword, advanced like a binary counter truncated to each
+                // leaf's depth. The first leaf whose top bit is 1 lies in the
+                // root's right subtree -> that segment is the pivot.
+                uint8_t path[64] = { 0 };
+                for (int32_t offsets_index = 0, length = 0; offsets_index < offsets_count; ++offsets_index)
+                {
+                    for (; length < this->left_frequencies[offsets_index]; ++length) { path[length] = 0; }
+
+                    length = this->left_frequencies[offsets_index]; if (path[0] == 1) { return &offsets[offsets_index]; }
+
+                    // Increment the codeword (binary counter from the deepest bit).
+                    for (int32_t k = length - 1; k >= 0; --k) { if (path[k] ^= 1) { break; } }
+                }
+            }
+
+            // Unreachable for a valid code: some leaf must fall in the right
+            // subtree, so path[0] becomes 1 before the loop ends.
+            return NULL;
+        }
+    }
+
+    /* O(n^2) DP for the optimal alphabetic split tree over the segment gaps,
+       using the Knuth monotonicity bound: the best root of [l, r] lies between
+       the best roots of [l, r-1] and [l+1, r]. Fills alphabetic_tree_root for
+       traverse_alphabetic_tree(); costs/weights are scratch local to this call. */
+    void build_optimal_alphabetic_tree(const int32_t * offsets, const int32_t * offsets_end)
+    {
+        ptrdiff_t offsets_count = (ptrdiff_t)(offsets_end - offsets);
+
+        assert(offsets_count >= OPTIMAL_ABT_SMALL_THRESHOLD && offsets_count <= OPTIMAL_ABT_LARGE_THRESHOLD);
+
+        // keys[i] = size of the i-th segment; the last one runs to the context end.
+        this->alphabetic_tree_keys[offsets_count - 1] = offsets[0] + this->contexts[offsets[0]].count - offsets[offsets_count - 1];
+
+        for (ptrdiff_t offsets_index = offsets_count - 2; offsets_index >= 0; --offsets_index)
+        {
+            this->alphabetic_tree_keys[offsets_index] = offsets[offsets_index + 1] - offsets[offsets_index];
+            // Base case, length 2: cost[l][l+1] = weight of both keys.
+            this->alphabetic_tree_cost[offsets_index][offsets_index + 1] = this->alphabetic_tree_weight[offsets_index] = this->alphabetic_tree_keys[offsets_index] + this->alphabetic_tree_keys[offsets_index + 1];
+        }
+
+        for (ptrdiff_t length = 3; length <= offsets_count; ++length)
+        {
+            for (ptrdiff_t l = 0, r = length - 1; r < offsets_count; ++l, ++r)
+            {
+                // Knuth bound: search roots only in [root[l][r-1], root[l+1][r]].
+                uint8_t best_root = this->alphabetic_tree_root[l][r - 1];
+                int64_t best_cost = this->alphabetic_tree_cost[l][best_root] + this->alphabetic_tree_cost[best_root + 1][r];
+
+                for (ptrdiff_t root = (ptrdiff_t)best_root + 1; root <= (ptrdiff_t)this->alphabetic_tree_root[l + 1][r]; ++root)
+                {
+                    int64_t cost = this->alphabetic_tree_cost[l][root] + this->alphabetic_tree_cost[root + 1][r];
+                    if (cost < best_cost) { best_cost = cost; best_root = (uint8_t)root; }
+                }
+
+                // weight[l] accumulates keys[l..r] as r grows with each length pass.
+                this->alphabetic_tree_weight[l] += this->alphabetic_tree_keys[r];
+                this->alphabetic_tree_cost[l][r] = best_cost + this->alphabetic_tree_weight[l];
+                this->alphabetic_tree_root[l][r] = best_root;
+            }
+        }
+    }
+
+    /* Splits the parent context [parent_context_offset, +count) into a left part
+       ending at right_context_offset and a right part after it, coding (or
+       decoding) how many of each parent symbol land in the left part via
+       this->predict(). On return, contexts[] holds the normalized left context
+       at parent_context_offset and the right context at right_context_offset.
+       Positions covering primary_index are consistently excluded from the
+       symbol totals (the "(uint32_t)(primary_index - X) < size" tests). */
+    void split_context_by_pivot(int32_t parent_context_offset, int32_t right_context_offset)
+    {
+        symbol_context * parent_context = &this->contexts[parent_context_offset];
+        int32_t parent_interval_size = parent_context[0].count;
+        int32_t parent_unique_symbols = 1;
+
+        symbol_context * left_context = &this->left_contexts[0];
+        int32_t * left_frequencies = &this->left_frequencies[0];
+        int32_t left_interval_size = right_context_offset - parent_context_offset;
+        int32_t left_unique_symbols = 0;
+
+        int32_t right_interval_size = parent_interval_size - left_interval_size;
+        int32_t right_unique_symbols = 0;
+
+        if (this->mode == m03_mode::encoding)
+        {
+            // When encoding we can see L[] directly; count left-half frequencies
+            // by scanning whichever half is shorter.
+            if (left_interval_size <= parent_interval_size - left_interval_size)
+            {
+                int32_t parent_total_symbols = parent_interval_size;
+
+                parent_total_symbols -= ((uint32_t)(this->primary_index - parent_context_offset) < (uint32_t)parent_total_symbols);
+
+                // Walk parent records (record 0 is implicit remainder): zero the
+                // left-count slot of every present symbol, then count the left half.
+                while (parent_total_symbols > 1 && parent_context[parent_unique_symbols].count > 0)
+                {
+                    parent_total_symbols -= parent_context[parent_unique_symbols].count;
+                    left_frequencies[parent_context[parent_unique_symbols].symbol] = 0;
+                    parent_unique_symbols++;
+                }
+
+                assert(parent_total_symbols > 0); parent_context[0].count = parent_total_symbols;
+                left_frequencies[parent_context[0].symbol] = 0;
+
+                for (int32_t p = parent_context_offset; p < right_context_offset; ++p) { left_frequencies[L[p]]++; }
+
+                // Discount the sentinel if it falls in the left half; slot 0 —
+                // presumably symbol 0 absorbs it (TODO confirm against the model).
+                left_frequencies[0] -= ((uint32_t)(this->primary_index - parent_context_offset) < (uint32_t)left_interval_size);
+            }
+            else
+            {
+                // Right half is shorter: start from full parent counts and
+                // subtract a scan of the right half instead.
+                int32_t parent_total_symbols = parent_interval_size;
+
+                parent_total_symbols -= ((uint32_t)(this->primary_index - parent_context_offset) < (uint32_t)parent_total_symbols);
+
+                while (parent_total_symbols > 1 && parent_context[parent_unique_symbols].count > 0)
+                {
+                    parent_total_symbols -= parent_context[parent_unique_symbols].count;
+                    left_frequencies[parent_context[parent_unique_symbols].symbol] = parent_context[parent_unique_symbols].count;
+                    parent_unique_symbols++;
+                }
+
+                assert(parent_total_symbols > 0); parent_context[0].count = parent_total_symbols;
+                left_frequencies[parent_context[0].symbol] = parent_total_symbols;
+
+                for (int32_t p = right_context_offset; p < parent_context_offset + parent_interval_size; ++p) { left_frequencies[L[p]]--; }
+
+                left_frequencies[0] += ((uint32_t)(this->primary_index - right_context_offset) < (uint32_t)right_interval_size);
+            }
+        }
+        else
+        {
+            // Decoding: only fix up record 0's remainder count; the left counts
+            // come from predict() below, not from left_frequencies.
+            int32_t parent_total_symbols = parent_interval_size;
+
+            parent_total_symbols -= ((uint32_t)(this->primary_index - parent_context_offset) < (uint32_t)parent_total_symbols);
+
+            while (parent_total_symbols > 1 && parent_context[parent_unique_symbols].count > 0)
+            {
+                parent_total_symbols -= parent_context[parent_unique_symbols].count;
+                parent_unique_symbols++;
+            }
+
+            assert(parent_total_symbols > 0); parent_context[0].count = parent_total_symbols;
+        }
+
+        // Remaining (still uncoded) symbol budget on each side, sentinel excluded.
+        int32_t left_remaining = left_interval_size;
+        int32_t right_remaining = right_interval_size;
+
+        left_remaining -= ((uint32_t)(this->primary_index - parent_context_offset) < (uint32_t)left_interval_size );
+        right_remaining -= ((uint32_t)(this->primary_index - right_context_offset ) < (uint32_t)right_interval_size);
+
+        for (int32_t parent_symbol_index = 0; parent_symbol_index < parent_unique_symbols; ++parent_symbol_index)
+        {
+            if (left_remaining > 0)
+            {
+                uint16_t symbol = parent_context[parent_symbol_index].symbol;
+                int32_t total = parent_context[parent_symbol_index].count;
+                int32_t count = left_frequencies[symbol];
+
+                // Symmetry reductions so predict() always sees the smaller of the
+                // equivalent problems: code "others" instead of this symbol when
+                // it dominates, and code the smaller side of left/right.
+                if (total <= left_remaining + right_remaining - total)
+                {
+                    count = left_remaining <= right_remaining
+                        ? this->predict(         count, total, left_remaining , right_remaining, parent_unique_symbols - parent_symbol_index)
+                        : total - this->predict(total - count, total, right_remaining, left_remaining , parent_unique_symbols - parent_symbol_index);
+                }
+                else
+                {
+                    total = left_remaining + right_remaining - total;
+                    count = left_remaining - count;
+
+                    count = left_remaining <= right_remaining
+                        ? this->predict(         count, total, left_remaining , right_remaining, parent_unique_symbols - parent_symbol_index)
+                        : total - this->predict(total - count, total, right_remaining, left_remaining , parent_unique_symbols - parent_symbol_index);
+
+                    // Undo the complement transforms.
+                    count = left_remaining - count;
+                    total = left_remaining + right_remaining - total;
+                }
+
+                left_remaining = left_remaining - count;
+                right_remaining = right_remaining + count - total;
+
+                if (count > 0)
+                {
+                    // 'count' occurrences move to the left child; the parent
+                    // record keeps the rest and its run start shifts right.
+                    left_context[left_unique_symbols].count = count;
+                    left_context[left_unique_symbols].offset = parent_context[parent_symbol_index].offset;
+                    left_context[left_unique_symbols].symbol = symbol;
+
+                    parent_context[parent_symbol_index].count -= count;
+                    parent_context[parent_symbol_index].offset += count;
+
+                    left_unique_symbols++;
+                }
+            }
+
+            // Compact surviving right-side records in place.
+            if (parent_context[parent_symbol_index].count > 0)
+            {
+                parent_context[right_unique_symbols] = parent_context[parent_symbol_index];
+                right_unique_symbols++;
+            }
+        }
+
+        {
+            // Install both children: right context moves to its own offset first
+            // (memmove: ranges may overlap), then the left context overwrites
+            // the parent's slot.
+            memmove(&this->contexts[right_context_offset], &parent_context[0], right_unique_symbols * sizeof(symbol_context));
+            m03_parser::normalize_context(&this->contexts[right_context_offset], right_unique_symbols, right_interval_size);
+
+            memcpy(&parent_context[0], &left_context[0], left_unique_symbols * sizeof(symbol_context));
+            m03_parser::normalize_context(&parent_context[0], left_unique_symbols, left_interval_size);
+        }
+    }
+
+    /* A context is trivial when it contains a single distinct symbol (the
+       second record's count is the 0 terminator) and does not cover the
+       primary index — then every child context is fully determined. */
+    INLINE bool is_trivial_context(int32_t context_start)
+    {
+        return this->contexts[context_start + 1].count == 0 && ((uint32_t)(this->primary_index - context_start) >= (uint32_t)this->contexts[context_start].count);
+    }
+
+    /* Splits a single-symbol context at each boundary in [offsets, offsets_end)
+       without coding anything: every child inherits the parent's symbol, child
+       counts follow from the boundaries, and each child is queued for the next
+       pass. Children of size > 1 get a 0-count terminator record. */
+    static void split_trivial_context(symbol_context * contexts, offset_queue & queue, const int32_t * offsets, const int32_t * offsets_end)
+    {
+        int32_t context_start = *offsets++;
+        symbol_context parent_context = contexts[context_start];   // running remainder by value
+
+        for (; offsets < offsets_end;)
+        {
+            symbol_context * context = &contexts[context_start];
+            int32_t context_end = *offsets++;
+            int32_t context_size = context_end - context_start;
+
+            queue.push_offset(parent_context.offset);
+
+            context[0].count = context_size;           parent_context.count -= context_size;
+            context[0].offset = parent_context.offset; parent_context.offset += context_size;
+            context[0].symbol = parent_context.symbol; if (context_size > 1) { context[1].count = 0; }
+
+            context_start = context_end;
+        }
+
+        // Last child receives whatever remains of the parent.
+        queue.push_offset(parent_context.offset);
+
+        contexts[context_start] = parent_context; if (contexts[context_start].count > 1) { contexts[context_start + 1].count = 0; }
+    }
+
+    /* Copies a context's per-symbol counts into frequencies[] (indexed by
+       symbol), recomputing record 0's implicit remainder count. Only slots for
+       symbols present in this context are written — other entries keep stale
+       values, which callers must tolerate. The sentinel position (when
+       primary_index_context lies inside the context) is excluded from totals. */
+    static void populate_context_frequencies(symbol_context * context, symbol_context * primary_index_context, int32_t * frequencies)
+    {
+        int32_t total_symbols = context[0].count;
+        int32_t unique_symbols = 1;
+
+        total_symbols -= ((uint32_t)(primary_index_context - context) < (uint32_t)total_symbols);
+
+        while (total_symbols > 1 && context[unique_symbols].count > 0)
+        {
+            frequencies[context[unique_symbols].symbol] = context[unique_symbols].count;
+            total_symbols -= context[unique_symbols].count; unique_symbols++;
+        }
+
+        // What is left after subtracting the explicit records belongs to record 0.
+        assert(total_symbols > 0); frequencies[context[0].symbol] = total_symbols;
+    }
+
+    /* Queues for the next pass every child context whose symbol count differs
+       from the parent snapshot in frequencies[] — i.e. only contexts that were
+       actually changed by the split need further refinement. Mirrors the record
+       walk of populate_context_frequencies(). */
+    static void populate_next_segments(symbol_context * context, symbol_context * primary_index_context, int32_t * frequencies, offset_queue & queue)
+    {
+        int32_t total_symbols = context[0].count;
+        int32_t unique_symbols = 1;
+
+        total_symbols -= ((uint32_t)(primary_index_context - context) < (uint32_t)total_symbols);
+
+        while (total_symbols > 1 && context[unique_symbols].count > 0)
+        {
+            if (frequencies[context[unique_symbols].symbol] != context[unique_symbols].count)
+            {
+                queue.push_offset(context[unique_symbols].offset);
+            }
+
+            total_symbols -= context[unique_symbols].count; unique_symbols++;
+        }
+
+        // Record 0's remainder gets the same changed-only treatment.
+        if (total_symbols > 0 && frequencies[context[0].symbol] != total_symbols)
+        {
+            queue.push_offset(context[0].offset);
+        }
+    }
+
+    /* Puts a context's records into the canonical order the model relies on:
+       insertion sort by descending count (ties broken by ascending symbol),
+       then the tail of up to 6 least-frequent records is reversed — presumably
+       to match the predictor's expected ordering of rare symbols (TODO confirm
+       against m03_model). Record 0's count is set to the interval total
+       (the implicit-remainder convention) and, when room exists, a 0-count
+       terminator record is written after the last symbol. */
+    static void normalize_context(symbol_context * context, int32_t unique_symbols, int32_t total_symbols)
+    {
+        if (unique_symbols > 1)
+        {
+            // Insertion sort: small unique_symbols makes this cheap.
+            for (int32_t i = 1; i < unique_symbols; ++i)
+            {
+                symbol_context temp = context[i];
+
+                int32_t j = i;
+                while (j > 0 && (context[j - 1].count < temp.count || (context[j - 1].count == temp.count && context[j - 1].symbol > temp.symbol)))
+                {
+                    context[j] = context[j - 1]; j--;
+                }
+
+                context[j] = temp;
+            }
+
+            {
+                // Reverse the final (up to 6) records in place.
+                symbol_context * contexts_start = &context[std::max(0, unique_symbols - 6)];
+                symbol_context * contexts_end = &context[unique_symbols - 1];
+
+                while (contexts_start < contexts_end) { std::swap(*contexts_start++, *contexts_end--); }
+            }
+        }
+
+        assert(total_symbols > 0); context[0].count = total_symbols; if (unique_symbols < total_symbols) { context[unique_symbols].count = 0; }
+    }
+
+    /* Coding cost of optimally bisecting three adjacent parts of sizes A, B, C:
+       every part is coded once at the top split, the middle part is coded a
+       second time, plus the cheaper outer part joins it on the second level. */
+    INLINE static int64_t pivot_cost3(int64_t A, int64_t B, int64_t C)
+    {
+        return (A + B + C) + (B + std::min(A, C));
+    }
+
+    /* Cost of four parts: top-level pass over all four, plus the best of the
+       three possible second-level arrangements (balanced 2+2, or 3+1 either way). */
+    INLINE static int64_t pivot_cost4(int64_t A, int64_t B, int64_t C, int64_t D)
+    {
+        int64_t balanced = A + B + C + D;
+        int64_t skewed   = std::min(pivot_cost3(A, B, C), pivot_cost3(B, C, D));
+
+        return (A + B + C + D) + std::min(balanced, skewed);
+    }
+
+    /* Cost of five parts: top-level pass plus the cheapest of the four
+       possible pivot placements (1+4, 2+3, 3+2, 4+1). */
+    INLINE static int64_t pivot_cost5(int64_t A, int64_t B, int64_t C, int64_t D, int64_t E)
+    {
+        int64_t split_1_4 = pivot_cost4(B, C, D, E);
+        int64_t split_2_3 = A + B + pivot_cost3(C, D, E);
+        int64_t split_3_2 = pivot_cost3(A, B, C) + D + E;
+        int64_t split_4_1 = pivot_cost4(A, B, C, D);
+
+        return (A + B + C + D + E) + std::min(std::min(split_1_4, split_2_3), std::min(split_3_2, split_4_1));
+    }
+};
+
+#pragma warning( pop )
\ No newline at end of file