diff --git a/CHANGES b/CHANGES new file mode 100644 index 0000000..d7dae51 --- /dev/null +++ b/CHANGES @@ -0,0 +1,2 @@ +* 2021-12-03 : Version 0.1.0 + * Initial public release of the bsc-m03. diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..dd740e1 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,5 @@ +cmake_minimum_required (VERSION 3.9) + +project ("bsc-m03") + +add_executable (bsc-m03 bsc-m03.cpp hutucker/hu-tucker.c libsais/libsais.c libsais/libsais16.c) \ No newline at end of file diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..f288702 --- /dev/null +++ b/LICENSE @@ -0,0 +1,674 @@ + GNU GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The GNU General Public License is a free, copyleft license for +software and other kinds of works. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. By contrast, +the GNU General Public License is intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. We, the Free Software Foundation, use the +GNU General Public License for most of our software; it applies also to +any other work released this way by its authors. You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + + To protect your rights, we need to prevent others from denying you +these rights or asking you to surrender the rights. Therefore, you have +certain responsibilities if you distribute copies of the software, or if +you modify it: responsibilities to respect the freedom of others. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must pass on to the recipients the same +freedoms that you received. You must make sure that they, too, receive +or can get the source code. And you must show them these terms so they +know their rights. + + Developers that use the GNU GPL protect your rights with two steps: +(1) assert copyright on the software, and (2) offer you this License +giving you legal permission to copy, distribute and/or modify it. + + For the developers' and authors' protection, the GPL clearly explains +that there is no warranty for this free software. For both users' and +authors' sake, the GPL requires that modified versions be marked as +changed, so that their problems will not be attributed erroneously to +authors of previous versions. + + Some devices are designed to deny users access to install or run +modified versions of the software inside them, although the manufacturer +can do so. This is fundamentally incompatible with the aim of +protecting users' freedom to change the software. The systematic +pattern of such abuse occurs in the area of products for individuals to +use, which is precisely where it is most unacceptable. Therefore, we +have designed this version of the GPL to prohibit the practice for those +products. 
If such problems arise substantially in other domains, we +stand ready to extend this provision to those domains in future versions +of the GPL, as needed to protect the freedom of users. + + Finally, every program is threatened constantly by software patents. +States should not allow patents to restrict development and use of +software on general-purpose computers, but in those that do, we wish to +avoid the special danger that patents applied to a free program could +make it effectively proprietary. To prevent this, the GPL assures that +patents cannot be used to render the program non-free. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. + + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. + + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. 
A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. + + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. + + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. 
+ + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. + + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. + + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. + + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. 
+ + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. + + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. + + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. 
+ + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. + + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. 
+ + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. + + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. + + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. + + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. 
If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. + + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. + + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). 
To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. + + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Use with the GNU Affero General Public License. 
+ + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU Affero General Public License into a single +combined work, and to convey the resulting work. The terms of this +License will continue to apply to the part which is the covered work, +but the special requirements of the GNU Affero General Public License, +section 13, concerning interaction through a network will apply to the +combination as such. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU General Public License, you may choose any version ever published +by the Free Software Foundation. + + If the Program specifies that a proxy can decide which future +versions of the GNU General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. + + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. 
+ + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. + +Also add information on how to contact you by electronic and paper mail. + + If the program does terminal interaction, make it output a short +notice like this when it starts in an interactive mode: + + <program> Copyright (C) <year> <name of author> + This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, your program's commands +might be different; for a GUI interface, you would use an "about box". + + You should also get your employer (if you work as a programmer) or school, +if any, to sign a "copyright disclaimer" for the program, if necessary. +For more information on this, and how to apply and follow the GNU GPL, see +<https://www.gnu.org/licenses/>. + + The GNU General Public License does not permit incorporating your program +into proprietary programs. If your program is a subroutine library, you +may consider it more useful to permit linking proprietary applications with +the library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. But first, please read +<https://www.gnu.org/philosophy/why-not-lgpl.html>. diff --git a/README.md b/README.md new file mode 100644 index 0000000..d328b37 --- /dev/null +++ b/README.md @@ -0,0 +1,106 @@ +# bsc-m03 + +The bsc-m03 is an experimental block sorting compressor based on the M03 context-aware compression algorithm invented by Michael Maniscalco: +* Michael Maniscalco, *M03: A solution for context based blocksort (BWT) compression*, 2004 +* Jurgen Abel, *Post BWT stages of the Burrows-Wheeler compression algorithm*, 2010 + +Copyright (c) 2021 Ilya Grebnov + +## License +The bsc-m03 is released under the [GNU General Public License](LICENSE "GNU General Public License") + +## Changes +* 2021-12-03 : Version 0.1.0 + * Initial public release of the bsc-m03.
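+
+## Example usage
+
+The command-line interface is defined by print_usage() and main() in bsc-m03.cpp later in this patch; the sketch below only illustrates it. File names are placeholders, and option values are appended directly to the switch, so -b134217728 selects the default 128MB block size and -w8 the default 8-bit symbol width:
+
+```
+bsc-m03 e input.dat output.m03 -b134217728 -w8
+bsc-m03 d output.m03 restored.dat
+```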
+ +# Benchmarks + +### Calgary Corpus ### +| File name | Input size (bytes) | Output size (bytes) | Bits per symbol | +|:---------------:|:-----------:|:------------:|:-------:| +| bib | 111261 | 25143 | 1.808 | +| book1 | 768771 | 208157 | 2.166 | +| book2 | 610856 | 141591 | 1.854 | +| geo | 102400 | 52797 | 4.125 | +| news | 377109 | 108387 | 2.299 | +| obj1 | 21504 | 9901 | 3.683 | +| obj2 | 246814 | 69689 | 2.259 | +| paper1 | 53161 | 15384 | 2.315 | +| paper2 | 82199 | 23161 | 2.254 | +| pic | 513216 | 44920 | 0.700 | +| progc | 39611 | 11525 | 2.328 | +| progl | 71646 | 13921 | 1.554 | +| progp | 49379 | 9530 | 1.544 | +| trans | 93695 | 15759 | 1.346 | + +### Canterbury Corpus ### +| File name | Input size (bytes) | Output size (bytes) | Bits per symbol | +|:---------------:|:-----------:|:------------:|:-------:| +| alice29.txt | 152089 | 39310 | 2.068 | +| asyoulik.txt | 125179 | 36585 | 2.338 | +| cp.html | 24603 | 7042 | 2.290 | +| fields.c | 11150 | 2748 | 1.972 | +| grammar.lsp | 3721 | 1142 | 2.455 | +| kennedy.xls | 1029744 | 58440 | 0.454 | +| lcet10.txt | 426754 | 96730 | 1.813 | +| plrabn12.txt | 481861 | 131617 | 2.185 | +| ptt5 | 513216 | 44920 | 0.700 | +| sum | 38240 | 11599 | 2.427 | +| xargs.1 | 4227 | 1618 | 3.062 | + +### Large Canterbury Corpus ### +| File name | Input size (bytes) | Output size (bytes) | Bits per symbol | +|:---------------:|:-----------:|:------------:|:-------:| +| bible.txt | 4047392 | 708602 | 1.401 | +| E.coli | 4638690 | 1137915 | 1.962 | +| world192.txt | 2473400 | 384776 | 1.245 | + +### Silesia Corpus ### +| File name | Input size (bytes) | Output size (bytes) | Bits per symbol | +|:---------------:|:-----------:|:------------:|:-------:| +| dickens | 10192446 | 2220939 | 1.743 | +| mozilla | 51220480 | 15831237 | 2.473 | +| mr | 9970564 | 2169223 | 1.741 | +| nci | 33553445 | 1148550 | 0.274 | +| ooffice | 6152192 | 2542258 | 3.306 | +| osdb | 10085684 | 2251471 | 1.786 | +| reymont | 6627202 | 972461 | 1.174 | +| samba | 21606400 | 3881872 | 1.437 | +| sao | 7251944 | 4672656 | 5.155 | +| webster | 41458703 | 6318267 | 1.219 | +| xml | 5345280 | 369196 | 0.553 | +| x-ray | 8474240 | 3697722 | 3.491 | + +### Manzini Corpus ### +| File name | Input size (bytes) | Output size (bytes) | Bits per symbol | +|:---------------:|:-----------:|:------------:|:-------:| +| chr22.dna | 34553758 | 7262753 | 1.681 | +| etext99 | 105277340 | 21730495 | 1.651 | +| gcc-3.0.tar | 86630400 | 10306097 | 0.952 | +| howto | 39422105 | 7662880 | 1.555 | +| jdk13c | 69728899 | 2692938 | 0.309 | +| linux-2.4.5.tar | 116254720 | 16773180 | 1.154 | +| rctail96 | 114711151 | 9949692 | 0.694 | +| rfc | 116421901 | 15192366 | 1.044 | +| sprot34.dat | 109617186 | 17534134 | 1.280 | +| w3c2 | 104201579 | 5800775 | 0.445 | + +### Maximum Compression Corpus ### +| File name | Input size (bytes) | Output size (bytes) | Bits per symbol | +|:---------------:|:-----------:|:------------:|:-------:| +| A10.jpg | 842468 | 825162 | 7.836 | +| AcroRd32.exe | 3870784 | 1582677 | 3.271 | +| english.dic | 465211 | 148582 | 2.555 | +| FlashMX.pdf | 4526946 | 3735179 | 6.601 | +| FP.LOG | 20617071 | 514554 | 0.200 | +| MSO97.DLL | 3782416 | 1904460 | 4.028 | +| ohs.doc | 4168192 | 817718 | 1.569 | +| rafale.bmp | 4149414 | 750437 | 1.447 | +| vcfiu.hlp | 4121418 | 620358 | 1.204 | +| world95.txt | 2988578 | 452271 | 1.211 | + +### Large Text Compression Benchmark Corpus ### +| File name | Input size (bytes) | Output size (bytes) | Bits per symbol | 
+|:---------------:|:-----------:|:------------:|:-------:| +| enwik8 | 100000000 | 20529360 | 1.642 | +| enwik9 | 1000000000 | 162084133 | 1.297 | diff --git a/VERSION b/VERSION new file mode 100644 index 0000000..6c6aa7c --- /dev/null +++ b/VERSION @@ -0,0 +1 @@ +0.1.0 \ No newline at end of file diff --git a/bsc-m03.cpp b/bsc-m03.cpp new file mode 100644 index 0000000..0279baa --- /dev/null +++ b/bsc-m03.cpp @@ -0,0 +1,483 @@ +/*-- + +This file is a part of bsc-m03 project. + + Copyright (c) 2021 Ilya Grebnov + + bsc-m03 is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + bsc-m03 is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with bsc-m03. If not, see . + +--*/ + +#define _CRT_SECURE_NO_WARNINGS + +#include +#include +#include +#include +#include + +#include + +#include "libsais/libsais.h" +#include "libsais/libsais16.h" + +#include "common/platform.h" +#include "common/rangecoder.h" + +#define MAX_ALPHABET_SIZE (256 * 256) + +#include "m03_parser.h" + +#pragma warning( push ) +#pragma warning( disable : 6385 ) +#pragma warning( disable : 6386 ) + +int32_t root_frequencies[MAX_ALPHABET_SIZE + 1]; + +static int32_t compress_memory_block(uint8_t * buffer, int32_t block_size, int32_t symbol_size) +{ + if (block_size % symbol_size != 0) + { + fprintf(stderr, "\nError: Block size of %d bytes is not a multiple of symbol width!\n", block_size); + return -2; + } + + int32_t indexes[32] = { -1 }; + int32_t comressed_size = -1; + int32_t block_symbols = block_size / symbol_size; + int32_t r = next_power_of_2(std::max(block_symbols / 16, 1048576)); + + if (int32_t * libsais_temp = (int32_t *)malloc(block_symbols * sizeof(int32_t))) + { + int32_t result = symbol_size == 1 + ? libsais_bwt_aux(buffer, buffer, libsais_temp, block_symbols, 0, root_frequencies, r, indexes) + : libsais16_bwt_aux((uint16_t *)buffer, (uint16_t *)buffer, libsais_temp, block_symbols, 0, root_frequencies, r, indexes); + + free(libsais_temp); + + if (result == 0) + { + if (uint16_t * L = (uint16_t *)malloc(((size_t)block_symbols + 1) * sizeof(uint16_t))) + { + if (m03_parser * parser = (m03_parser *)malloc(sizeof(m03_parser))) + { + { + int32_t primary_index = indexes[0]; + + if (symbol_size == 1) + { + for (int32_t p = 0; p < primary_index; ++p) { L[p + 0] = ((uint16_t)buffer[p]); } + for (int32_t p = primary_index; p < block_symbols; ++p) { L[p + 1] = ((uint16_t)buffer[p]); } + } + else + { + for (int32_t p = 0; p < primary_index; ++p) { L[p + 0] = ((uint16_t *)buffer)[p]; } + for (int32_t p = primary_index; p < block_symbols; ++p) { L[p + 1] = ((uint16_t *)buffer)[p]; } + } + + L[primary_index] = 0; + } + + RangeCoder coder; + coder.InitEncoder(buffer, block_size); + coder.EncodeValue(1, symbol_size, 2); + + for (int32_t t = 0; t <= (block_symbols - 1) / r; ++t) + { + coder.EncodeValue(1, indexes[t], block_symbols); + } + + if (parser->initialize(L, block_symbols + 1, indexes[0], root_frequencies, symbol_size == 1 ? 
256 : 256 * 256, &coder, m03_mode::encoding)) + { + parser->run(); + parser->destroy(); + + comressed_size = coder.FinishEncoder(); + } + else + { + fprintf(stderr, "\nError: Not enough memory!\n"); + } + + free(parser); + } + else + { + fprintf(stderr, "\nError: Not enough memory!\n"); + } + + free(L); + } + else + { + fprintf(stderr, "\nError: Not enough memory!\n"); + } + } + else + { + fprintf(stderr, "\nError: libsais_bwt failed, please contact the author!\n"); + } + } + else + { + fprintf(stderr, "\nError: Not enough memory!\n"); + } + + return comressed_size; +} + +static int32_t decompress_memory_block(uint8_t * buffer, int32_t comressed_size, int32_t block_size) +{ + RangeCoder coder; + coder.InitDecoder(buffer); + int32_t symbol_size = coder.DecodeValue(1, 2); + + int32_t indexes[32] = { -1 }; + int32_t primary_index = -1; + int32_t decomressed_size = -1; + int32_t block_symbols = block_size / symbol_size; + int32_t r = next_power_of_2(std::max(block_symbols / 16, 1048576)); + + for (int32_t t = 0; t <= (block_symbols - 1) / r; ++t) + { + indexes[t] = coder.DecodeValue(1, block_symbols); + } + + if (uint16_t * L = (uint16_t *)malloc(((size_t)block_symbols + 1) * sizeof(uint16_t))) + { + if (m03_parser * parser = (m03_parser *)malloc(sizeof(m03_parser))) + { + if (parser->initialize(L, block_symbols + 1, indexes[0], root_frequencies, symbol_size == 1 ? 256 : 256 * 256, &coder, m03_mode::decoding)) + { + parser->run(); + parser->destroy(); + + { + primary_index = indexes[0]; + + if (symbol_size == 1) + { + for (int32_t p = 0; p < primary_index; ++p) { buffer[p] = (uint8_t)L[p + 0]; } + for (int32_t p = primary_index; p < block_symbols; ++p) { buffer[p] = (uint8_t)L[p + 1]; } + } + else + { + for (int32_t p = 0; p < primary_index; ++p) { ((uint16_t *)buffer)[p] = L[p + 0]; } + for (int32_t p = primary_index; p < block_symbols; ++p) { ((uint16_t *)buffer)[p] = L[p + 1]; } + } + } + } + else + { + fprintf(stderr, "\nError: Not enough memory!\n"); + } + + free(parser); + } + else + { + fprintf(stderr, "\nError: Not enough memory!\n"); + } + + free(L); + } + else + { + fprintf(stderr, "\nError: Not enough memory!\n"); + } + + if (primary_index > 0) + { + if (int32_t * libsais_temp = (int32_t *)malloc(((size_t)block_symbols + 1) * sizeof(int32_t))) + { + int32_t result = symbol_size == 1 + ? 
libsais_unbwt_aux(buffer, buffer, libsais_temp, block_symbols, root_frequencies, r, indexes) + : libsais16_unbwt_aux((uint16_t *)buffer, (uint16_t *)buffer, libsais_temp, block_symbols, root_frequencies, r, indexes); + + if (result == 0) + { + decomressed_size = block_size; + } + else + { + fprintf(stderr, "\nError: libsais_unbwt failed, please contact the author!\n"); + } + + free(libsais_temp); + } + else + { + fprintf(stderr, "\nError: Not enough memory!\n"); + } + } + + return decomressed_size; +} + +static int compress_file(const char * input_file_name, const char * output_file_name, int32_t max_block_size, int32_t symbol_size) +{ + clock_t start_time = clock(); + if (FILE * input_file = fopen(input_file_name, "rb")) + { + if (FILE * output_file = fopen(output_file_name, "wb")) + { + fseek(input_file, 0, SEEK_END); int64_t remaining_size = _ftelli64(input_file); rewind(input_file); + + if (uint8_t * buffer = (uint8_t *)malloc(std::min(remaining_size, (int64_t)max_block_size) * sizeof(uint8_t))) + { + int64_t input_bytes = 0, output_bytes = 0; + + while (remaining_size > 0) + { + fprintf(stdout, "\rCompressing %.55s(%02d%%)", input_file_name, (int)((input_bytes * 100) / (input_bytes + remaining_size))); + + int32_t block_size = (int32_t)std::min(remaining_size, (int64_t)max_block_size); + + if (fread(buffer, sizeof(uint8_t), block_size, input_file) != block_size) + { + fprintf(stderr, "\nError: Unable to read input file!\n"); + break; + } + + int32_t comressed_size = compress_memory_block(buffer, block_size, symbol_size); + if (comressed_size <= 0) { break; } + + if (fwrite(&block_size, sizeof(uint8_t), sizeof(block_size), output_file) != sizeof(block_size)) + { + fprintf(stderr, "\nError: Unable to write output file!\n"); + break; + } + + if (fwrite(&comressed_size, sizeof(uint8_t), sizeof(comressed_size), output_file) != sizeof(comressed_size)) + { + fprintf(stderr, "\nError: Unable to write output file!\n"); + break; + } + + if (fwrite(buffer, sizeof(uint8_t), comressed_size, output_file) != comressed_size) + { + fprintf(stderr, "\nError: Unable to write output file\n"); + break; + } + + remaining_size -= block_size; + input_bytes += block_size; + output_bytes += sizeof(block_size) + sizeof(comressed_size) + comressed_size; + } + + if (remaining_size == 0) + { + fprintf(stdout, "\r%.55s compressed from %lld into %lld in %.3f seconds (%.3f bps).\n", input_file_name, input_bytes, output_bytes, ((double)clock() - start_time) / CLOCKS_PER_SEC, (8.0 * symbol_size * output_bytes) / input_bytes); + } + + free(buffer); + } + else + { + fprintf(stderr, "Error: Not enough memory!\n"); + } + + fclose(output_file); + } + else + { + fprintf(stderr, "Error: Unable to open output file!\n"); + } + + fclose(input_file); + } + else + { + fprintf(stderr, "Error: Unable to open input file!\n"); + } + + return 0; +} + +static int decompress_file(const char * input_file_name, const char * output_file_name) +{ + clock_t start_time = clock(); + if (FILE * input_file = fopen(input_file_name, "rb")) + { + if (FILE * output_file = fopen(output_file_name, "wb")) + { + int32_t max_block_size; + if (fread(&max_block_size, sizeof(uint8_t), sizeof(max_block_size), input_file) == sizeof(max_block_size)) + { + fseek(input_file, 0, SEEK_END); int64_t remaining_size = _ftelli64(input_file); rewind(input_file); + + if (uint8_t * buffer = (uint8_t *)malloc(max_block_size * sizeof(uint8_t))) + { + int64_t input_bytes = 0, output_bytes = 0; + + while (remaining_size > 0) + { + fprintf(stdout, "\rDecompressing 
%.55s(%02d%%)", input_file_name, (int)((input_bytes * 100) / (input_bytes + remaining_size))); + + int32_t block_size, comressed_size; + if (fread(&block_size, sizeof(uint8_t), sizeof(block_size), input_file) != sizeof(block_size)) + { + fprintf(stderr, "\nError: Unable to read input file!\n"); + break; + } + + if (fread(&comressed_size, sizeof(uint8_t), sizeof(comressed_size), input_file) != sizeof(comressed_size)) + { + fprintf(stderr, "\nError: Unable to read input file!\n"); + break; + } + + if (block_size > max_block_size || comressed_size > max_block_size) + { + fprintf(stderr, "\nError: The compressed data is corrupted!\n"); + break; + } + + if (fread(buffer, sizeof(uint8_t), comressed_size, input_file) != comressed_size) + { + fprintf(stderr, "\nError: Unable to read input file!\n"); + break; + } + + int32_t decomressed_size = decompress_memory_block(buffer, comressed_size, block_size); + if (decomressed_size != block_size) { break; } + + if (fwrite(buffer, sizeof(uint8_t), decomressed_size, output_file) != decomressed_size) + { + fprintf(stderr, "\nError: Unable to write output file\n"); + break; + } + + remaining_size -= sizeof(block_size) + sizeof(comressed_size) + comressed_size; + input_bytes += sizeof(block_size) + sizeof(comressed_size) + comressed_size; + output_bytes += decomressed_size; + } + + if (remaining_size == 0) + { + fprintf(stdout, "\r%.55s decompressed from %lld into %lld in %.3f seconds.\n", input_file_name, input_bytes, output_bytes, ((double)clock() - start_time) / CLOCKS_PER_SEC); + } + + free(buffer); + } + else + { + fprintf(stderr, "Error: Not enough memory!\n"); + } + } + else + { + fprintf(stderr, "Error: Unable to read input file!\n"); + } + + fclose(output_file); + } + else + { + fprintf(stderr, "Error: Unable to open output file!\n"); + } + + fclose(input_file); + } + else + { + fprintf(stderr, "Error: Unable to open input file!\n"); + } + + return 0; +} + +static int print_usage() +{ + fprintf(stdout, "Usage: bsc-m03 input-file output-file \n"); + fprintf(stdout, " -b Block size in bytes, default 128MB (memory usage is ~15x).\n"); + fprintf(stdout, " -w<8|16> Symbol width in bits.\n"); + + return 0; +} + +int main(int argc, const char * argv[]) +{ + fprintf(stdout, "bsc-m03 is experimental block sorting compressor. Version 0.1.0 (3 December 2021).\n"); + fprintf(stdout, "Copyright (c) 2021 Ilya Grebnov . 
ABSOLUTELY NO WARRANTY.\n"); + fprintf(stdout, "This program is based on (at least) the work of Michael Maniscalco and Atsushi Komiya.\n\n"); + + int32_t max_block_size = 128 * 1024 * 1024; + int32_t symbol_width = 8; + + if (argc < 4 || strlen(argv[1]) != 1) + { + return print_usage(); + } + + for (int32_t i = 4; i < argc; ++i) + { + if (argv[i][0] != '-') + { + return print_usage(); + } + + switch (argv[i][1]) + { + case 'b': + { + max_block_size = atoi(argv[i] + 2); + if (max_block_size <= 0) { return print_usage(); } + + break; + } + + case 'w': + { + symbol_width = atoi(argv[i] + 2); + if (symbol_width != 8 && symbol_width != 16) { return print_usage(); } + + break; + } + + default: + { + return print_usage(); + } + } + } + + switch (argv[1][0]) + { + case 'c': + case 'C': + case 'e': + case 'E': + { + return compress_file(argv[2], argv[3], max_block_size, symbol_width / 8); + } + + case 'd': + case 'D': + { + if (argc != 4) { return print_usage(); } + + return decompress_file(argv[2], argv[3]); + } + + default: + { + return print_usage(); + } + } + + return 0; +} + +#pragma warning( pop ) \ No newline at end of file diff --git a/common/LICENSE b/common/LICENSE new file mode 100644 index 0000000..d645695 --- /dev/null +++ b/common/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/common/platform.h b/common/platform.h new file mode 100644 index 0000000..1252057 --- /dev/null +++ b/common/platform.h @@ -0,0 +1,125 @@ +/*-----------------------------------------------------------*/ +/* Block Sorting, Lossless Data Compression Library. */ +/* Interface to platform specific functions and constants */ +/*-----------------------------------------------------------*/ + +/*-- + +This file is a part of bsc and/or libbsc, a program and a library for +lossless, block-sorting data compression. + + Copyright (c) 2009-2021 Ilya Grebnov + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ See the License for the specific language governing permissions and + limitations under the License. + +Please see the file LICENSE for full copyright information. + +See also the bsc and libbsc web site: + http://libbsc.com/ for more information. + +--*/ + +/*-- + +NOTICE: This file has been modified for use in the bsc-m03 project. + +--*/ + +#ifndef _LIBBSC_PLATFORM_H +#define _LIBBSC_PLATFORM_H + +#if defined(_MSC_VER) + #include +#else + #include +#endif + +#if defined(__GNUC__) + #define INLINE __inline__ +#elif defined(_MSC_VER) + #define INLINE __forceinline +#elif defined(__IBMC__) + #define INLINE _Inline +#elif defined(__cplusplus) + #define INLINE inline +#else + #define INLINE /* */ +#endif + +#if defined(_MSC_VER) + #define NOINLINE __declspec(noinline) +#elif defined(__GNUC__) + #define NOINLINE __attribute__ ((noinline)) +#else + #define NOINLINE /* */ +#endif + +#if defined(_MSC_VER) + #define ALIGNED(x) __declspec(align(x)) +#elif defined(__GNUC__) + #define ALIGNED(x) __attribute__ ((aligned(x))) +#endif + +#if defined(__GNUC__) || defined(__clang__) || defined(__CUDACC__) + #define RESTRICT __restrict__ +#elif defined(_MSC_VER) || defined(__INTEL_COMPILER) + #define RESTRICT __restrict +#else + #define RESTRICT /* */ +#endif + +#if defined(__GNUC__) || defined(__clang__) + #define byteswap_uint64(x) (__builtin_bswap64(x)) + #define bit_scan_reverse(x) (__builtin_clz(x) ^ 31) + #define bit_scan_forward(x) (__builtin_ctz(x)) + #define bit_scan_forward64(x) (__builtin_ctzll(x)) +#elif defined(_MSC_VER) + #define byteswap_uint64(x) (_byteswap_uint64(x)) + + #pragma intrinsic(_BitScanReverse) + #pragma intrinsic(_BitScanForward) + + static inline __forceinline unsigned long bit_scan_reverse(unsigned long x) + { + unsigned long index; + _BitScanReverse(&index, x); + return index; + } + + static inline __forceinline unsigned long bit_scan_forward(unsigned long x) + { + unsigned long index; + _BitScanForward(&index, x); + return index; + } +#endif + + static INLINE unsigned int next_power_of_2(unsigned int v) + { + v--; + + v |= v >> 1; + v |= v >> 2; + v |= v >> 4; + v |= v >> 8; + v |= v >> 16; + v++; + + return v; + } + +#endif + +/*-----------------------------------------------------------*/ +/* End platform.h */ +/*-----------------------------------------------------------*/ diff --git a/common/rangecoder.h b/common/rangecoder.h new file mode 100644 index 0000000..4a5d6a6 --- /dev/null +++ b/common/rangecoder.h @@ -0,0 +1,238 @@ +/*-----------------------------------------------------------*/ +/* Block Sorting, Lossless Data Compression Library. */ +/* Range coder */ +/*-----------------------------------------------------------*/ + +/*-- + +This file is a part of bsc and/or libbsc, a program and a library for +lossless, block-sorting data compression. + + Copyright (c) 2009-2021 Ilya Grebnov + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + +Please see the file LICENSE for full copyright information. + +See also the bsc and libbsc web site: + http://libbsc.com/ for more information. 
+ +--*/ + +/*-- + +NOTICE: This file has been modified for use in the bsc-m03 project. + +--*/ + +#ifndef _LIBBSC_CODER_RANGECODER_H +#define _LIBBSC_CODER_RANGECODER_H + +#include "platform.h" + +class RangeCoder +{ + +private: + + union ari + { + struct u + { + unsigned int low32; + unsigned int carry; + } u; + unsigned long long low; + } ari; + + unsigned int ari_code; + unsigned int ari_ffnum; + unsigned int ari_cache; + unsigned int ari_range; + + const unsigned char * RESTRICT ari_input; + unsigned char * RESTRICT ari_output; + unsigned char * RESTRICT ari_outputEOB; + unsigned char * RESTRICT ari_outputStart; + + INLINE void OutputByte(unsigned char s) + { + *ari_output++ = s; + }; + + INLINE unsigned char InputByte() + { + return *ari_input++; + }; + + NOINLINE unsigned int ShiftLow() + { + if (ari.u.low32 < 0xff000000U || ari.u.carry) + { + OutputByte(ari_cache + ari.u.carry); + if (ari_ffnum) + { + unsigned char s = ari.u.carry - 1; + do { OutputByte(s); } while (--ari_ffnum); + } + ari_cache = ari.u.low32 >> 24; ari.u.carry = 0; + } + else + { + ari_ffnum++; + } + + ari.u.low32 <<= 8; return ari_range << 8; + } + +public: + + INLINE void InitEncoder(unsigned char * output, int outputSize) + { + ari_outputStart = output; + ari_output = output; + ari_outputEOB = output + outputSize - 16; + ari.low = 0; + ari_ffnum = 0; + ari_cache = 0; + ari_range = 0xffffffff; + }; + + INLINE int FinishEncoder() + { + ShiftLow(); ShiftLow(); ShiftLow(); ShiftLow(); ShiftLow(); + return (int)(ari_output - ari_outputStart); + } + + INLINE void Encode(unsigned int cum_freq, unsigned int sym_freq, unsigned int total_freq) + { + unsigned int range = ari_range / total_freq; + ari.low += (unsigned long long)cum_freq * range; ari_range = sym_freq * range; + + while (ari_range < 0x1000000) { ari_range = ShiftLow(); } + } + + template INLINE unsigned int EncodeBit(unsigned int bit, int probability) + { + unsigned int range = (((unsigned long long)ari_range) * probability) >> P; + ari.low = ari.low + ((~bit + 1u) & range); + ari_range = range + ((~bit + 1u) & (ari_range - range - range)); + + while (ari_range < 0x1000000) { ari_range = ShiftLow(); } + + return bit; + } + + INLINE unsigned int EncodeValue(unsigned int min, unsigned int value, unsigned int max) + { + assert(min <= value && value <= max); + + while (max - min >= 0x10000) + { + unsigned int median = min + ((max - min) >> 1); + if (value > median) + { + EncodeBit<1>(1, 1); + min = median + 1; + } + else + { + EncodeBit<1>(0, 1); + max = median; + } + } + + if (min != max) + { + Encode(value - min, 1, max - min + 1); + } + + return value; + } + + INLINE void InitDecoder(const unsigned char * input) + { + ari_input = input; + ari_code = 0; + ari_range = 0xffffffff; + ari_code = (ari_code << 8) | InputByte(); + ari_code = (ari_code << 8) | InputByte(); + ari_code = (ari_code << 8) | InputByte(); + ari_code = (ari_code << 8) | InputByte(); + ari_code = (ari_code << 8) | InputByte(); + }; + + INLINE unsigned int GetCumFreq(unsigned int total_freq) + { + while (ari_range < 0x1000000) + { + ari_range <<= 8; ari_code = (ari_code << 8) | InputByte(); + } + + return ari_code / (ari_range / total_freq); + } + + INLINE void Decode(unsigned int cum_freq, unsigned int sym_freq, unsigned int total_freq) + { + unsigned int range = ari_range / total_freq; + ari_code -= cum_freq * range; ari_range = sym_freq * range; + } + + template INLINE int DecodeBit(int probability) + { + while (ari_range < 0x1000000) + { + ari_range <<= 8; ari_code = (ari_code << 8) | 
InputByte(); + } + + unsigned int range = (((unsigned long long)ari_range) * probability) >> P; + int bit = ari_code >= range; + + ari_range = bit ? ari_range - range : range; + ari_code = bit ? ari_code - range : ari_code; + + return bit; + } + + INLINE unsigned int DecodeValue(unsigned int min, unsigned int max) + { + assert(min <= max); + + while (max - min >= 0x10000) + { + unsigned int median = min + ((max - min) >> 1); + if (DecodeBit<1>(1)) + { + min = median + 1; + } + else + { + max = median; + } + } + + if (min != max) + { + unsigned int cum_freq = GetCumFreq(max - min + 1); + Decode(cum_freq, 1, max - min + 1); min += cum_freq; + } + + return min; + } +}; + +#endif + +/*-----------------------------------------------------------*/ +/* End rangecoder.h */ +/*-----------------------------------------------------------*/ diff --git a/hutucker/LICENSE b/hutucker/LICENSE new file mode 100644 index 0000000..f288702 --- /dev/null +++ b/hutucker/LICENSE @@ -0,0 +1,674 @@ + GNU GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The GNU General Public License is a free, copyleft license for +software and other kinds of works. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. By contrast, +the GNU General Public License is intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. We, the Free Software Foundation, use the +GNU General Public License for most of our software; it applies also to +any other work released this way by its authors. You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + + To protect your rights, we need to prevent others from denying you +these rights or asking you to surrender the rights. Therefore, you have +certain responsibilities if you distribute copies of the software, or if +you modify it: responsibilities to respect the freedom of others. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must pass on to the recipients the same +freedoms that you received. You must make sure that they, too, receive +or can get the source code. And you must show them these terms so they +know their rights. + + Developers that use the GNU GPL protect your rights with two steps: +(1) assert copyright on the software, and (2) offer you this License +giving you legal permission to copy, distribute and/or modify it. + + For the developers' and authors' protection, the GPL clearly explains +that there is no warranty for this free software. For both users' and +authors' sake, the GPL requires that modified versions be marked as +changed, so that their problems will not be attributed erroneously to +authors of previous versions. + + Some devices are designed to deny users access to install or run +modified versions of the software inside them, although the manufacturer +can do so. 
This is fundamentally incompatible with the aim of +protecting users' freedom to change the software. The systematic +pattern of such abuse occurs in the area of products for individuals to +use, which is precisely where it is most unacceptable. Therefore, we +have designed this version of the GPL to prohibit the practice for those +products. If such problems arise substantially in other domains, we +stand ready to extend this provision to those domains in future versions +of the GPL, as needed to protect the freedom of users. + + Finally, every program is threatened constantly by software patents. +States should not allow patents to restrict development and use of +software on general-purpose computers, but in those that do, we wish to +avoid the special danger that patents applied to a free program could +make it effectively proprietary. To prevent this, the GPL assures that +patents cannot be used to render the program non-free. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. + + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. 
+ + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. + + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. 
+ + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. + + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. + + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. + + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. + + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. 
+ + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. + + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. + + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. 
+ + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. + + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. 
+ + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. + + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. + + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. + + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. 
If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. + + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. + + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). 
To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. + + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Use with the GNU Affero General Public License. 
+ + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU Affero General Public License into a single +combined work, and to convey the resulting work. The terms of this +License will continue to apply to the part which is the covered work, +but the special requirements of the GNU Affero General Public License, +section 13, concerning interaction through a network will apply to the +combination as such. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU General Public License, you may choose any version ever published +by the Free Software Foundation. + + If the Program specifies that a proxy can decide which future +versions of the GNU General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. + + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. 
+ + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + + Copyright (C) + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . + +Also add information on how to contact you by electronic and paper mail. + + If the program does terminal interaction, make it output a short +notice like this when it starts in an interactive mode: + + Copyright (C) + This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, your program's commands +might be different; for a GUI interface, you would use an "about box". + + You should also get your employer (if you work as a programmer) or school, +if any, to sign a "copyright disclaimer" for the program, if necessary. +For more information on this, and how to apply and follow the GNU GPL, see +. + + The GNU General Public License does not permit incorporating your program +into proprietary programs. If your program is a subroutine library, you +may consider it more useful to permit linking proprietary applications with +the library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. But first, please read +. diff --git a/hutucker/README b/hutucker/README new file mode 100644 index 0000000..e104713 --- /dev/null +++ b/hutucker/README @@ -0,0 +1,88 @@ +This is an O(n log n) implementation of Hu-Tucker coding.[1] + +This is the algorithm: +1. Label node 0, ..., n-1 'terminal' +2. Repeat (n - 1) times: + (a) Find the pair (i, j) such that + (i) i < j, + (ii) neither node i nor j is labeled 'none', + (iii) none of node i+1, ..., j-1 is labeled 'terminal', + (iv) weight[i] + weight[j] are minimal, + (v) i is minimal if the selection is not unique after (iv), and + (vi) j is minimal if the selection is not unique after (v) + (b) Merge node i with node j, and saves it as new node i + (c) weight[i] += weight[j] + (d) Label node i 'internal' + (e) Label node j 'none' +3. A tree has been built with root being node 0. + Traverse this tree for length of code. + This tree is not alphabetical. + Nevertheless, the length of code produced by the tree is correct. + +See example.c for computing the actual code from the length. + +We need a non-trivial data structure to implement 2(a) efficiently. 
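Before the data structure is described, it may help to see step 2(a) spelled out directly. The C sketch below is illustrative only and is not part of the hutucker sources: the function name find_pair_naive, the label encoding (0 = 'none', 1 = 'terminal', 2 = 'internal') and the in/out parameters are assumptions made here. It performs the selection by brute force, which costs O(n^2) per merge; the segment tree described next replaces this with an O(1) lookup at the root plus O(log n) updates per merge.

    #include <stddef.h>

    /* Illustrative sketch of step 2(a): pick the valid pair (i, j) with the
     * smallest weight sum, breaking ties by smaller i, then smaller j.
     * label[k]: 0 = 'none', 1 = 'terminal', 2 = 'internal' (assumed encoding).
     * Returns 1 and stores the pair in (*pi, *pj), or 0 if no pair exists. */
    static int find_pair_naive(size_t n, const unsigned long weight[],
                               const int label[], size_t *pi, size_t *pj)
    {
        int found = 0;
        unsigned long best = 0;

        for (size_t i = 0; i < n; i++) {
            if (label[i] == 0)              /* rule (ii): node i must not be 'none' */
                continue;
            for (size_t j = i + 1; j < n; j++) {
                if (label[j] != 0) {        /* rule (ii) for node j */
                    unsigned long sum = weight[i] + weight[j];
                    /* rules (iv)-(vi): strict '<' plus the ascending (i, j)
                     * scan order keeps minimal sum, then minimal i, then j. */
                    if (!found || sum < best) {
                        found = 1;
                        best = sum;
                        *pi = i;
                        *pj = j;
                    }
                }
                if (label[j] == 1)          /* rule (iii): a 'terminal' at j blocks
                                               every k > j from pairing with i. */
                    break;
            }
        }
        return found;
    }

Calling this inside the loop of step 2 produces the same merges as the efficient version, just with a much worse running time.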
+This is the data structure:
+1. It is a perfect binary tree.
+   The nodes in this tree are called "segnodes" to distinguish them from
+   nodes in the coding tree.
+   This tree shall have at least n leaf segnodes.
+2. Each segnode is implicitly associated with a range [a, b).
+   The range of the leaf segnode i is [i, i+1).
+   The range of each internal segnode is the union of the ranges of its children.
+   (Alternatively, the range of each internal segnode is the union of
+   the ranges of all leaf segnodes in its subtree.)
+3. Each segnode also has 6 explicit fields (n, m, l, r, i, j).
+   n: The number of nodes in [a, b) labeled 'terminal' or 'internal'
+   m: The number of nodes in [a, b) labeled 'terminal'
+   l: The index such that:
+      (i) l in [a, b),
+      (ii) node l is not labeled 'none',
+      (iii) none of node a, ..., l-1 is labeled 'terminal',
+      (iv) weight[l] is minimal, and
+      (v) l is minimal if the selection is not unique after (iv)
+   r: The index such that:
+      (i) r in [a, b),
+      (ii) node r is not labeled 'none',
+      (iii) none of node r+1, ..., b-1 is labeled 'terminal',
+      (iv) weight[r] is minimal, and
+      (v) r is minimal if the selection is not unique after (iv)
+   i, j: The pair of indices such that:
+      (i) a <= i < j < b,
+      (ii) neither node i nor j is labeled 'none',
+      (iii) none of node i+1, ..., j-1 is labeled 'terminal',
+      (iv) weight[i] + weight[j] are minimal,
+      (v) i is minimal if the selection is not unique after (iv), and
+      (vi) j is minimal if the selection is not unique after (v)
+4. The explicit fields can be trivially computed for leaf segnodes:
+   (a) Leaf segnode i labeled 'terminal':
+       (n, m, l, r, i, j) = (1, 1, i, i, None, None)
+   (b) Leaf segnode i labeled 'internal':
+       (n, m, l, r, i, j) = (1, 0, i, i, None, None)
+   (c) Leaf segnode i labeled 'none':
+       (n, m, l, r, i, j) = (0, 0, None, None, None, None)
+5. The explicit fields can be efficiently computed for internal segnodes,
+   if we have access to the correct fields of its child segnodes.
+   Let its left child be L, and its right child be R.
+   n: L.n + R.n
+   m: L.m + R.m
+   l: L.l if L.m > 0, otherwise the better of L.l and R.l
+   r: R.r if R.m > 0, otherwise the better of L.r and R.r
+   i, j: the best of (L.i, L.j), (L.r, R.l) and (R.i, R.j)
+
+Analysis:
+1. This data structure can be built in O(n).
+2. The (i, j) in step 2(a) is the (i, j) of the root of the data structure,
+   which can be looked up in O(1).
+3. When the weight[i] and label of node i change,
+   leaf segnode i and its ancestors need to be updated.
+   That's O(log n) updates and O(1) per update.
+   Same for node j.
+4. Step 2 is repeated O(n) times.
+   Other parts are trivial.
+   Therefore, the overall time is O(n log n).
+
+
+[1]: Hu, T. C.; Tucker, A. C. (1971) "Optimal Computer Search Trees
+     and Variable-Length Alphabetical Codes", SIAM Journal on
+     Applied Mathematics, 21 (4): 514.
diff --git a/hutucker/example.c b/hutucker/example.c
new file mode 100644
index 0000000..34dd53f
--- /dev/null
+++ b/hutucker/example.c
@@ -0,0 +1,84 @@
+/*
+ * Linearithmic Hu-Tucker Coding.
+ * Copyright (C) 2018 Pochang Chen
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <https://www.gnu.org/licenses/>.
+ */
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "hu-tucker.h"
+
+int main() {
+    size_t n;
+    if (scanf("%zu", &n) != 1)
+        return 1;
+    if (n < 1) {
+        errno = EINVAL;
+        perror(NULL);
+        return 1;
+    }
+
+    unsigned long *weight = calloc(n, sizeof(unsigned long));
+    if (!weight) {
+        perror("calloc");
+        return 1;
+    }
+
+    for (size_t i = 0; i < n; i++)
+        scanf("%lu", weight + i);
+
+    unsigned long sumweight = 0;
+    for (size_t i = 0; i < n; i++) {
+        sumweight += weight[i];
+        if (sumweight < weight[i]) {
+            errno = EOVERFLOW;
+            perror(NULL);
+            return 1;
+        }
+    }
+
+    unsigned long *tmp = malloc(hutucker_tmp_size(n));
+    if (!tmp) {
+        perror("malloc");
+        return 1;
+    }
+    hutucker_get_lengths(n, weight, tmp);
+    free(tmp);
+
+    unsigned long maxlength = 0;
+    for (size_t i = 0; i < n; i++)
+        if (weight[i] > maxlength)
+            maxlength = weight[i];
+
+    unsigned char *str = malloc(maxlength + 1);
+    if (!str) {
+        perror("malloc");
+        return 1;
+    }
+    for (size_t i = 0, l = 0; i < n; i++) {
+        if (l < weight[i])
+            memset(str + l, '0', weight[i] - l);
+        l = weight[i];
+        str[l] = '\0';
+        puts(str);
+        for (size_t j = l - 1; j != (size_t) -1; j--)
+            if ((str[j] ^= '0' ^ '1') == '1')
+                break;
+    }
+
+    free(str);
+    free(weight);
+}
diff --git a/hutucker/hu-tucker.c b/hutucker/hu-tucker.c
new file mode 100644
index 0000000..82fe712
--- /dev/null
+++ b/hutucker/hu-tucker.c
@@ -0,0 +1,128 @@
+/*
+ * Linearithmic Hu-Tucker Coding.
+ * Copyright (C) 2018 Pochang Chen
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+/*--
+
+NOTICE: This file has been modified for use in the bsc-m03 project.
+
+--*/
+
+#include "hu-tucker.h"
+
+typedef struct {
+    // number of (terminal or internal) nodes under this segnode
+    size_t n;
+    // number of terminal nodes under this segnode, if n >= 1
+    size_t m;
+    // index of minimum weight in the leftmost block, if n >= 1
+    size_t l;
+    // index of minimum weight in the rightmost block, if n >= 1
+    size_t r;
+    // indices of minimum weight pair in the same block, if n >= 2
+    size_t i, j;
+} segnode;
+
+static void segupdate(segnode *pa, segnode *lc, segnode *rc, unsigned long *w) {
+    if (!lc->n) {
+        *pa = *rc;
+        return;
+    }
+    if (!rc->n) {
+        *pa = *lc;
+        return;
+    }
+    pa->n = lc->n + rc->n;
+    pa->m = lc->m + rc->m;
+    pa->l = ( lc->m || w[lc->l] <= w[rc->l]) ? lc->l : rc->l;
+    pa->r = (!rc->m && w[lc->r] <= w[rc->r]) ? lc->r : rc->r;
lc->r : rc->r; + pa->i = lc->r; + pa->j = rc->l; + if (lc->n >= 2 && w[lc->i] + w[lc->j] <= w[pa->i] + w[pa->j]) { + pa->i = lc->i; + pa->j = lc->j; + } + if (rc->n >= 2 && w[rc->i] + w[rc->j] < w[pa->i] + w[pa->j]) { + pa->i = rc->i; + pa->j = rc->j; + } +} +static void segterminal(segnode *x, size_t id) { + x->n = x->m = 1; + x->l = x->r = id; +} +static void seginternal(segnode *x, size_t id) { + x->n = 1; + x->m = 0; + x->l = x->r = id; +} +static void segnone(segnode *x) { + x->n = 0; +} +static size_t raise_power_of_two(size_t n) { + size_t ans = 1; + while (ans < n) + ans *= 2; + return ans; +} + +size_t hutucker_tmp_size(size_t n) { + // TODO check overflow for very large n + size_t m = raise_power_of_two(n); + return sizeof(segnode) * (2 * m - 1) + + sizeof(size_t) * (n + (2 * n - 1) + (2 * n - 1)); +} + +void hutucker_get_lengths(size_t n, unsigned long *weight, void *tmp) { + size_t m = raise_power_of_two(n); + segnode *seg = (segnode *) tmp; + size_t *cur = (size_t *) (seg + 2 * m - 1); + size_t *pa = (size_t *) (cur + n); + size_t *level = (size_t *) (pa + 2 * n - 1); + + for (size_t i = 0; i < n; i++) { + segterminal(seg + m - 1 + i, i); + cur[i] = i; + } + for (size_t i = n; i < m; i++) + segnone(seg + m - 1 + i); + + for (size_t i = m - 2; i != (size_t) -1; i--) + segupdate(seg + i, seg + 2 * i + 1, seg + 2 * i + 2, weight); + + for (size_t k = 0; k < n - 1; k++) { + size_t i = seg->i, j = seg->j; + + weight[i] += weight[j]; + pa[cur[i]] = pa[cur[j]] = n + k; + cur[i] = n + k; + + seginternal(seg + m - 1 + i, i); + for (size_t l = m + i; l /= 2; ) + segupdate(seg + l - 1, seg + 2 * l - 1, seg + 2 * l, weight); + + segnone(seg + m - 1 + j); + for (size_t l = m + j; l /= 2; ) + segupdate(seg + l - 1, seg + 2 * l - 1, seg + 2 * l, weight); + } + + level[2 * n - 2] = 0; + for (size_t i = 2 * n - 3; i != (size_t) -1; i--) + level[i] = level[pa[i]] + 1; + for (size_t i = 0; i < n; i++) + weight[i] = (unsigned long)level[i]; +} diff --git a/hutucker/hu-tucker.h b/hutucker/hu-tucker.h new file mode 100644 index 0000000..b637f4f --- /dev/null +++ b/hutucker/hu-tucker.h @@ -0,0 +1,56 @@ +/* + * Linearithmic Hu-Tucker Coding. + * Copyright (C) 2018 Pochang Chen + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +/*-- + +NOTICE: This file has been modified for use in the bsc-m03 project. + +--*/ + +#pragma once + +#ifdef __cplusplus +extern "C" { +#endif + +#include + +/** + * This algorithm needs some temporary memories to work. + * This function computes how much temporary memories are needed. + */ +size_t hutucker_tmp_size(size_t n); + +/** + * Given the weight of n symbols, determine the length of hu-tucker code + * of each symbols. + * + * Precondition: + * n: number of symbols + * weight[i] (0 <= i < n): the weight of symbol i + * weight[0] + ... 
+ weight[n - 1] must not exceed ULONG_MAX + * tmp: buffer with size >= hutucker_tmp_size(n) + * + * Postcondition: + * weight[i] (0 <= i < n): the length of hu-tucker code of symbol i + */ +void hutucker_get_lengths(size_t n, unsigned long *weight, void *tmp); + +#ifdef __cplusplus +} +#endif diff --git a/libsais/CHANGES b/libsais/CHANGES new file mode 100644 index 0000000..6d0b176 --- /dev/null +++ b/libsais/CHANGES @@ -0,0 +1,23 @@ +Changes in 2.6.0 (October 21, 2021) +- libsais16 for 16-bit inputs. + +Changes in 2.5.0 (October 15, 2021) +- Support for optional symbol frequency tables. + +Changes in 2.4.0 (July 14, 2021) +- Reverse Burrows-Wheeler transform. + +Changes in 2.3.0 (June 23, 2021) +- Burrows-Wheeler transform with auxiliary indexes. + +Changes in 2.2.0 (April 27, 2021) +- libsais64 for inputs larger than 2GB. + +Changes in 2.1.0 (April 19, 2021) +- Additional OpenMP acceleration. + +Changes in 2.0.0 (April 4, 2021) +- OpenMP acceleration. + +Changes in 1.0.0 (February 23, 2021) +- Initial Release. diff --git a/libsais/LICENSE b/libsais/LICENSE new file mode 100644 index 0000000..d645695 --- /dev/null +++ b/libsais/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/libsais/VERSION b/libsais/VERSION new file mode 100644 index 0000000..914ec96 --- /dev/null +++ b/libsais/VERSION @@ -0,0 +1 @@ +2.6.0 \ No newline at end of file diff --git a/libsais/libsais.c b/libsais/libsais.c new file mode 100644 index 0000000..885bd82 --- /dev/null +++ b/libsais/libsais.c @@ -0,0 +1,7599 @@ +/*-- + +This file is a part of libsais, a library for linear time +suffix array and burrows wheeler transform construction. + + Copyright (c) 2021 Ilya Grebnov + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ See the License for the specific language governing permissions and + limitations under the License. + +Please see the file LICENSE for full copyright information. + +--*/ + +#include "libsais_internal.h" + +#include "libsais.h" + +#include +#include +#include +#include +#include + +#if defined(_OPENMP) + #include +#else + #define UNUSED(_x) (void)(_x) +#endif + +typedef int32_t sa_sint_t; +typedef uint32_t sa_uint_t; +typedef ptrdiff_t fast_sint_t; +typedef size_t fast_uint_t; + +#define SAINT_BIT (32) +#define SAINT_MAX INT32_MAX +#define SAINT_MIN INT32_MIN + +#define ALPHABET_SIZE (1 << CHAR_BIT) +#define UNBWT_FASTBITS (17) + +#define SUFFIX_GROUP_BIT (SAINT_BIT - 1) +#define SUFFIX_GROUP_MARKER (((sa_sint_t)1) << (SUFFIX_GROUP_BIT - 1)) + +#define BUCKETS_INDEX2(_c, _s) (((_c) << 1) + (_s)) +#define BUCKETS_INDEX4(_c, _s) (((_c) << 2) + (_s)) + +#define LIBSAIS_PER_THREAD_CACHE_SIZE (24576) + +typedef struct LIBSAIS_THREAD_CACHE +{ + sa_sint_t symbol; + sa_sint_t index; +} LIBSAIS_THREAD_CACHE; + +typedef union LIBSAIS_THREAD_STATE +{ + struct + { + fast_sint_t position; + fast_sint_t count; + + fast_sint_t m; + fast_sint_t last_lms_suffix; + + sa_sint_t * buckets; + LIBSAIS_THREAD_CACHE * cache; + } state; + + uint8_t padding[64]; +} LIBSAIS_THREAD_STATE; + +typedef struct LIBSAIS_CONTEXT +{ + sa_sint_t * buckets; + LIBSAIS_THREAD_STATE * thread_state; + fast_sint_t threads; +} LIBSAIS_CONTEXT; + +typedef struct LIBSAIS_UNBWT_CONTEXT +{ + sa_uint_t * bucket2; + uint16_t * fastbits; + sa_uint_t * buckets; + fast_sint_t threads; +} LIBSAIS_UNBWT_CONTEXT; + +#if defined(__GNUC__) || defined(__clang__) + #define RESTRICT __restrict__ +#elif defined(_MSC_VER) || defined(__INTEL_COMPILER) + #define RESTRICT __restrict +#else + #error Your compiler, configuration or platform is not supported. +#endif + +#if defined(__has_builtin) + #if __has_builtin(__builtin_prefetch) + #define HAS_BUILTIN_PREFECTCH + #endif +#elif defined(__GNUC__) && __GNUC__ > 3 + #define HAS_BUILTIN_PREFECTCH +#endif + +#if defined(HAS_BUILTIN_PREFECTCH) + #define libsais_prefetch(address) __builtin_prefetch((const void *)(address), 0, 0) + #define libsais_prefetchw(address) __builtin_prefetch((const void *)(address), 1, 0) +#elif defined (_M_IX86) || defined (_M_AMD64) + #include + #define libsais_prefetch(address) _mm_prefetch((const void *)(address), _MM_HINT_NTA) + #define libsais_prefetchw(address) _m_prefetchw((const void *)(address)) +#elif defined (_M_ARM) + #include + #define libsais_prefetch(address) __prefetch((const void *)(address)) + #define libsais_prefetchw(address) __prefetchw((const void *)(address)) +#elif defined (_M_ARM64) + #include + #define libsais_prefetch(address) __prefetch2((const void *)(address), 1) + #define libsais_prefetchw(address) __prefetch2((const void *)(address), 17) +#else + #error Your compiler, configuration or platform is not supported. 
+#endif + +#if !defined(__LITTLE_ENDIAN__) && !defined(__BIG_ENDIAN__) + #if defined(_LITTLE_ENDIAN) \ + || (defined(BYTE_ORDER) && defined(LITTLE_ENDIAN) && BYTE_ORDER == LITTLE_ENDIAN) \ + || (defined(_BYTE_ORDER) && defined(_LITTLE_ENDIAN) && _BYTE_ORDER == _LITTLE_ENDIAN) \ + || (defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && __BYTE_ORDER == __LITTLE_ENDIAN) \ + || (defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) + #define __LITTLE_ENDIAN__ + #elif defined(_BIG_ENDIAN) \ + || (defined(BYTE_ORDER) && defined(BIG_ENDIAN) && BYTE_ORDER == BIG_ENDIAN) \ + || (defined(_BYTE_ORDER) && defined(_BIG_ENDIAN) && _BYTE_ORDER == _BIG_ENDIAN) \ + || (defined(__BYTE_ORDER) && defined(__BIG_ENDIAN) && __BYTE_ORDER == __BIG_ENDIAN) \ + || (defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + #define __BIG_ENDIAN__ + #elif defined(_WIN32) + #define __LITTLE_ENDIAN__ + #endif +#endif + +#if defined(__LITTLE_ENDIAN__) && !defined(__BIG_ENDIAN__) + #if defined(__GNUC__) || defined(__clang__) + #define libsais_bswap16(x) (__builtin_bswap16(x)) + #elif defined(_MSC_VER) && !defined(__INTEL_COMPILER) + #define libsais_bswap16(x) (_byteswap_ushort(x)) + #else + #define libsais_bswap16(x) ((uint16_t)(x >> 8) | (uint16_t)(x << 8)) + #endif +#elif !defined(__LITTLE_ENDIAN__) && defined(__BIG_ENDIAN__) + #define libsais_bswap16(x) (x) +#else + #error Your compiler, configuration or platform is not supported. +#endif + +static void * libsais_align_up(const void * address, size_t alignment) +{ + return (void *)((((ptrdiff_t)address) + ((ptrdiff_t)alignment) - 1) & (-((ptrdiff_t)alignment))); +} + +static void * libsais_alloc_aligned(size_t size, size_t alignment) +{ + void * address = malloc(size + sizeof(short) + alignment - 1); + if (address != NULL) + { + void * aligned_address = libsais_align_up((void *)((ptrdiff_t)address + (ptrdiff_t)(sizeof(short))), alignment); + ((short *)aligned_address)[-1] = (short)((ptrdiff_t)aligned_address - (ptrdiff_t)address); + + return aligned_address; + } + + return NULL; +} + +static void libsais_free_aligned(void * aligned_address) +{ + if (aligned_address != NULL) + { + free((void *)((ptrdiff_t)aligned_address - ((short *)aligned_address)[-1])); + } +} + +static LIBSAIS_THREAD_STATE * libsais_alloc_thread_state(sa_sint_t threads) +{ + LIBSAIS_THREAD_STATE * RESTRICT thread_state = (LIBSAIS_THREAD_STATE *)libsais_alloc_aligned((size_t)threads * sizeof(LIBSAIS_THREAD_STATE), 4096); + sa_sint_t * RESTRICT thread_buckets = (sa_sint_t *)libsais_alloc_aligned((size_t)threads * 4 * ALPHABET_SIZE * sizeof(sa_sint_t), 4096); + LIBSAIS_THREAD_CACHE * RESTRICT thread_cache = (LIBSAIS_THREAD_CACHE *)libsais_alloc_aligned((size_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE * sizeof(LIBSAIS_THREAD_CACHE), 4096); + + if (thread_state != NULL && thread_buckets != NULL && thread_cache != NULL) + { + fast_sint_t t; + for (t = 0; t < threads; ++t) + { + thread_state[t].state.buckets = thread_buckets; thread_buckets += 4 * ALPHABET_SIZE; + thread_state[t].state.cache = thread_cache; thread_cache += LIBSAIS_PER_THREAD_CACHE_SIZE; + } + + return thread_state; + } + + libsais_free_aligned(thread_cache); + libsais_free_aligned(thread_buckets); + libsais_free_aligned(thread_state); + return NULL; +} + +static void libsais_free_thread_state(LIBSAIS_THREAD_STATE * thread_state) +{ + if (thread_state != NULL) + { + libsais_free_aligned(thread_state[0].state.cache); + 
libsais_free_aligned(thread_state[0].state.buckets); + libsais_free_aligned(thread_state); + } +} + +static LIBSAIS_CONTEXT * libsais_create_ctx_main(sa_sint_t threads) +{ + LIBSAIS_CONTEXT * RESTRICT ctx = (LIBSAIS_CONTEXT *)libsais_alloc_aligned(sizeof(LIBSAIS_CONTEXT), 64); + sa_sint_t * RESTRICT buckets = (sa_sint_t *)libsais_alloc_aligned(8 * ALPHABET_SIZE * sizeof(sa_sint_t), 4096); + LIBSAIS_THREAD_STATE * RESTRICT thread_state = threads > 1 ? libsais_alloc_thread_state(threads) : NULL; + + if (ctx != NULL && buckets != NULL && (thread_state != NULL || threads == 1)) + { + ctx->buckets = buckets; + ctx->threads = threads; + ctx->thread_state = thread_state; + + return ctx; + } + + libsais_free_thread_state(thread_state); + libsais_free_aligned(buckets); + libsais_free_aligned(ctx); + return NULL; +} + +static void libsais_free_ctx_main(LIBSAIS_CONTEXT * ctx) +{ + if (ctx != NULL) + { + libsais_free_thread_state(ctx->thread_state); + libsais_free_aligned(ctx->buckets); + libsais_free_aligned(ctx); + } +} + +#if defined(_OPENMP) + +static sa_sint_t libsais_count_negative_marked_suffixes(sa_sint_t * RESTRICT SA, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + sa_sint_t count = 0; + + fast_sint_t i; for (i = omp_block_start; i < omp_block_start + omp_block_size; ++i) { count += (SA[i] < 0); } + + return count; +} + +static sa_sint_t libsais_count_zero_marked_suffixes(sa_sint_t * RESTRICT SA, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + sa_sint_t count = 0; + + fast_sint_t i; for (i = omp_block_start; i < omp_block_start + omp_block_size; ++i) { count += (SA[i] == 0); } + + return count; +} + +static void libsais_place_cached_suffixes(sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4) + { + libsais_prefetch(&cache[i + 2 * prefetch_distance]); + + libsais_prefetchw(&SA[cache[i + prefetch_distance + 0].symbol]); + libsais_prefetchw(&SA[cache[i + prefetch_distance + 1].symbol]); + libsais_prefetchw(&SA[cache[i + prefetch_distance + 2].symbol]); + libsais_prefetchw(&SA[cache[i + prefetch_distance + 3].symbol]); + + SA[cache[i + 0].symbol] = cache[i + 0].index; + SA[cache[i + 1].symbol] = cache[i + 1].index; + SA[cache[i + 2].symbol] = cache[i + 2].index; + SA[cache[i + 3].symbol] = cache[i + 3].index; + } + + for (j += prefetch_distance + 3; i < j; i += 1) + { + SA[cache[i].symbol] = cache[i].index; + } +} + +static void libsais_compact_and_place_cached_suffixes(sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j, l; + for (i = omp_block_start, j = omp_block_start + omp_block_size - 3, l = omp_block_start; i < j; i += 4) + { + libsais_prefetchw(&cache[i + prefetch_distance]); + + cache[l] = cache[i + 0]; l += cache[l].symbol >= 0; + cache[l] = cache[i + 1]; l += cache[l].symbol >= 0; + cache[l] = cache[i + 2]; l += cache[l].symbol >= 0; + cache[l] = cache[i + 3]; l += cache[l].symbol >= 0; + } + + for (j += 3; i < j; i += 1) + { + cache[l] = cache[i]; l += cache[l].symbol >= 0; + } + + libsais_place_cached_suffixes(SA, cache, omp_block_start, l - omp_block_start); +} + +static void libsais_accumulate_counts_s32_2(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size, fast_sint_t 
bucket_stride) +{ + sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride; + fast_sint_t s; for (s = 0; s < bucket_size; s += 1) { bucket00[s] = bucket00[s] + bucket01[s]; } +} + +static void libsais_accumulate_counts_s32_3(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size, fast_sint_t bucket_stride) +{ + sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride; + sa_sint_t * RESTRICT bucket02 = bucket01 - bucket_stride; + fast_sint_t s; for (s = 0; s < bucket_size; s += 1) { bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s]; } +} + +static void libsais_accumulate_counts_s32_4(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size, fast_sint_t bucket_stride) +{ + sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride; + sa_sint_t * RESTRICT bucket02 = bucket01 - bucket_stride; + sa_sint_t * RESTRICT bucket03 = bucket02 - bucket_stride; + fast_sint_t s; for (s = 0; s < bucket_size; s += 1) { bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s] + bucket03[s]; } +} + +static void libsais_accumulate_counts_s32_5(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size, fast_sint_t bucket_stride) +{ + sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride; + sa_sint_t * RESTRICT bucket02 = bucket01 - bucket_stride; + sa_sint_t * RESTRICT bucket03 = bucket02 - bucket_stride; + sa_sint_t * RESTRICT bucket04 = bucket03 - bucket_stride; + fast_sint_t s; for (s = 0; s < bucket_size; s += 1) { bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s] + bucket03[s] + bucket04[s]; } +} + +static void libsais_accumulate_counts_s32_6(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size, fast_sint_t bucket_stride) +{ + sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride; + sa_sint_t * RESTRICT bucket02 = bucket01 - bucket_stride; + sa_sint_t * RESTRICT bucket03 = bucket02 - bucket_stride; + sa_sint_t * RESTRICT bucket04 = bucket03 - bucket_stride; + sa_sint_t * RESTRICT bucket05 = bucket04 - bucket_stride; + fast_sint_t s; for (s = 0; s < bucket_size; s += 1) { bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s] + bucket03[s] + bucket04[s] + bucket05[s]; } +} + +static void libsais_accumulate_counts_s32_7(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size, fast_sint_t bucket_stride) +{ + sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride; + sa_sint_t * RESTRICT bucket02 = bucket01 - bucket_stride; + sa_sint_t * RESTRICT bucket03 = bucket02 - bucket_stride; + sa_sint_t * RESTRICT bucket04 = bucket03 - bucket_stride; + sa_sint_t * RESTRICT bucket05 = bucket04 - bucket_stride; + sa_sint_t * RESTRICT bucket06 = bucket05 - bucket_stride; + fast_sint_t s; for (s = 0; s < bucket_size; s += 1) { bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s] + bucket03[s] + bucket04[s] + bucket05[s] + bucket06[s]; } +} + +static void libsais_accumulate_counts_s32_8(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size, fast_sint_t bucket_stride) +{ + sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride; + sa_sint_t * RESTRICT bucket02 = bucket01 - bucket_stride; + sa_sint_t * RESTRICT bucket03 = bucket02 - bucket_stride; + sa_sint_t * RESTRICT bucket04 = bucket03 - bucket_stride; + sa_sint_t * RESTRICT bucket05 = bucket04 - bucket_stride; + sa_sint_t * RESTRICT bucket06 = bucket05 - bucket_stride; + sa_sint_t * RESTRICT bucket07 = bucket06 - bucket_stride; + fast_sint_t s; for (s = 0; s < bucket_size; s += 1) { bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s] + bucket03[s] + bucket04[s] + bucket05[s] + bucket06[s] + bucket07[s]; } +} + +static void 
libsais_accumulate_counts_s32_9(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size, fast_sint_t bucket_stride) +{ + sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride; + sa_sint_t * RESTRICT bucket02 = bucket01 - bucket_stride; + sa_sint_t * RESTRICT bucket03 = bucket02 - bucket_stride; + sa_sint_t * RESTRICT bucket04 = bucket03 - bucket_stride; + sa_sint_t * RESTRICT bucket05 = bucket04 - bucket_stride; + sa_sint_t * RESTRICT bucket06 = bucket05 - bucket_stride; + sa_sint_t * RESTRICT bucket07 = bucket06 - bucket_stride; + sa_sint_t * RESTRICT bucket08 = bucket07 - bucket_stride; + fast_sint_t s; for (s = 0; s < bucket_size; s += 1) { bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s] + bucket03[s] + bucket04[s] + bucket05[s] + bucket06[s] + bucket07[s] + bucket08[s]; } +} + +static void libsais_accumulate_counts_s32(sa_sint_t * RESTRICT buckets, fast_sint_t bucket_size, fast_sint_t bucket_stride, fast_sint_t num_buckets) +{ + while (num_buckets >= 9) + { + libsais_accumulate_counts_s32_9(buckets - (num_buckets - 9) * bucket_stride, bucket_size, bucket_stride); num_buckets -= 8; + } + + switch (num_buckets) + { + case 1: break; + case 2: libsais_accumulate_counts_s32_2(buckets, bucket_size, bucket_stride); break; + case 3: libsais_accumulate_counts_s32_3(buckets, bucket_size, bucket_stride); break; + case 4: libsais_accumulate_counts_s32_4(buckets, bucket_size, bucket_stride); break; + case 5: libsais_accumulate_counts_s32_5(buckets, bucket_size, bucket_stride); break; + case 6: libsais_accumulate_counts_s32_6(buckets, bucket_size, bucket_stride); break; + case 7: libsais_accumulate_counts_s32_7(buckets, bucket_size, bucket_stride); break; + case 8: libsais_accumulate_counts_s32_8(buckets, bucket_size, bucket_stride); break; + } +} + +#endif + +static void libsais_gather_lms_suffixes_8u(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, fast_sint_t m, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + if (omp_block_size > 0) + { + const fast_sint_t prefetch_distance = 128; + + fast_sint_t i, j = omp_block_start + omp_block_size, c0 = T[omp_block_start + omp_block_size - 1], c1 = -1; + + while (j < n && (c1 = T[j]) == c0) { ++j; } + + fast_uint_t s = c0 >= c1; + + for (i = omp_block_start + omp_block_size - 2, j = omp_block_start + 3; i >= j; i -= 4) + { + libsais_prefetch(&T[i - prefetch_distance]); + + c1 = T[i - 0]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((s & 3) == 1); + c0 = T[i - 1]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 0); m -= ((s & 3) == 1); + c1 = T[i - 2]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 1); m -= ((s & 3) == 1); + c0 = T[i - 3]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 2); m -= ((s & 3) == 1); + } + + for (j -= 3; i >= j; i -= 1) + { + c1 = c0; c0 = T[i]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((s & 3) == 1); + } + + SA[m] = (sa_sint_t)(i + 1); + } +} + +static void libsais_gather_lms_suffixes_8u_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536 && omp_get_dynamic() == 0) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = 
omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(thread_state); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start; + + if (omp_num_threads == 1) + { + libsais_gather_lms_suffixes_8u(T, SA, n, (fast_sint_t)n - 1, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + fast_sint_t t, m = 0; for (t = omp_num_threads - 1; t > omp_thread_num; --t) { m += thread_state[t].state.m; } + + libsais_gather_lms_suffixes_8u(T, SA, n, (fast_sint_t)n - 1 - m, omp_block_start, omp_block_size); + + #pragma omp barrier + + if (thread_state[omp_thread_num].state.m > 0) + { + SA[(fast_sint_t)n - 1 - m] = (sa_sint_t)thread_state[omp_thread_num].state.last_lms_suffix; + } + } +#endif + } +} + +static sa_sint_t libsais_gather_lms_suffixes_32s(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n) +{ + const fast_sint_t prefetch_distance = 32; + + sa_sint_t i = n - 2; + sa_sint_t m = n - 1; + fast_uint_t s = 1; + fast_sint_t c0 = T[n - 1]; + fast_sint_t c1 = 0; + + for (; i >= 3; i -= 4) + { + libsais_prefetch(&T[i - prefetch_distance]); + + c1 = T[i - 0]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = i + 1; m -= ((s & 3) == 1); + c0 = T[i - 1]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = i - 0; m -= ((s & 3) == 1); + c1 = T[i - 2]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = i - 1; m -= ((s & 3) == 1); + c0 = T[i - 3]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = i - 2; m -= ((s & 3) == 1); + } + + for (; i >= 0; i -= 1) + { + c1 = c0; c0 = T[i]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = i + 1; m -= ((s & 3) == 1); + } + + return n - 1 - m; +} + +static sa_sint_t libsais_gather_compacted_lms_suffixes_32s(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n) +{ + const fast_sint_t prefetch_distance = 32; + + sa_sint_t i = n - 2; + sa_sint_t m = n - 1; + fast_uint_t s = 1; + fast_sint_t c0 = T[n - 1]; + fast_sint_t c1 = 0; + + for (; i >= 3; i -= 4) + { + libsais_prefetch(&T[i - prefetch_distance]); + + c1 = T[i - 0]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = i + 1; m -= ((fast_sint_t)(s & 3) == (c0 >= 0)); + c0 = T[i - 1]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = i - 0; m -= ((fast_sint_t)(s & 3) == (c1 >= 0)); + c1 = T[i - 2]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = i - 1; m -= ((fast_sint_t)(s & 3) == (c0 >= 0)); + c0 = T[i - 3]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = i - 2; m -= ((fast_sint_t)(s & 3) == (c1 >= 0)); + } + + for (; i >= 0; i -= 1) + { + c1 = c0; c0 = T[i]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = i + 1; m -= ((fast_sint_t)(s & 3) == (c1 >= 0)); + } + + return n - 1 - m; +} + +#if defined(_OPENMP) + +static void libsais_count_lms_suffixes_32s_4k(const sa_sint_t * RESTRICT T, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets) +{ + const fast_sint_t prefetch_distance = 32; + + memset(buckets, 0, 4 * (size_t)k * sizeof(sa_sint_t)); + + sa_sint_t i = n - 2; + fast_uint_t s = 1; + fast_sint_t c0 = T[n - 1]; + fast_sint_t c1 = 0; + + for (; i >= prefetch_distance + 3; i -= 4) + { + 
libsais_prefetch(&T[i - 2 * prefetch_distance]); + + libsais_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 0], 0)]); + libsais_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 1], 0)]); + libsais_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 2], 0)]); + libsais_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 3], 0)]); + + c1 = T[i - 0]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); + buckets[BUCKETS_INDEX4((fast_uint_t)c0, s & 3)]++; + + c0 = T[i - 1]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); + buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]++; + + c1 = T[i - 2]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); + buckets[BUCKETS_INDEX4((fast_uint_t)c0, s & 3)]++; + + c0 = T[i - 3]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); + buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]++; + } + + for (; i >= 0; i -= 1) + { + c1 = c0; c0 = T[i]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); + buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]++; + } + + buckets[BUCKETS_INDEX4((fast_uint_t)c0, (s << 1) & 3)]++; +} + +#endif + +static void libsais_count_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets) +{ + const fast_sint_t prefetch_distance = 32; + + memset(buckets, 0, 2 * (size_t)k * sizeof(sa_sint_t)); + + sa_sint_t i = n - 2; + fast_uint_t s = 1; + fast_sint_t c0 = T[n - 1]; + fast_sint_t c1 = 0; + + for (; i >= prefetch_distance + 3; i -= 4) + { + libsais_prefetch(&T[i - 2 * prefetch_distance]); + + libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 0], 0)]); + libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 1], 0)]); + libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 2], 0)]); + libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 3], 0)]); + + c1 = T[i - 0]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); + buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++; + + c0 = T[i - 1]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); + buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++; + + c1 = T[i - 2]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); + buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++; + + c0 = T[i - 3]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); + buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++; + } + + for (; i >= 0; i -= 1) + { + c1 = c0; c0 = T[i]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); + buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++; + } + + buckets[BUCKETS_INDEX2((fast_uint_t)c0, 0)]++; +} + +#if defined(_OPENMP) + +static void libsais_count_compacted_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets) +{ + const fast_sint_t prefetch_distance = 32; + + memset(buckets, 0, 2 * (size_t)k * sizeof(sa_sint_t)); + + sa_sint_t i = n - 2; + fast_uint_t s = 1; + fast_sint_t c0 = T[n - 1]; + fast_sint_t c1 = 0; + + for (; i >= prefetch_distance + 3; i -= 4) + { + libsais_prefetch(&T[i - 2 * prefetch_distance]); + + libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 0] & SAINT_MAX, 0)]); + libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 1] & SAINT_MAX, 0)]); + libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 2] & SAINT_MAX, 0)]); + libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - 
prefetch_distance - 3] & SAINT_MAX, 0)]); + + c1 = T[i - 0]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); + c0 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++; + + c0 = T[i - 1]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); + c1 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++; + + c1 = T[i - 2]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); + c0 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++; + + c0 = T[i - 3]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); + c1 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++; + } + + for (; i >= 0; i -= 1) + { + c1 = c0; c0 = T[i]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); + c1 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++; + } + + c0 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c0, 0)]++; +} + +#endif + +static sa_sint_t libsais_count_and_gather_lms_suffixes_8u(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + memset(buckets, 0, 4 * ALPHABET_SIZE * sizeof(sa_sint_t)); + + fast_sint_t m = omp_block_start + omp_block_size - 1; + + if (omp_block_size > 0) + { + const fast_sint_t prefetch_distance = 128; + + fast_sint_t i, j = m + 1, c0 = T[m], c1 = -1; + + while (j < n && (c1 = T[j]) == c0) { ++j; } + + fast_uint_t s = c0 >= c1; + + for (i = m - 1, j = omp_block_start + 3; i >= j; i -= 4) + { + libsais_prefetch(&T[i - prefetch_distance]); + + c1 = T[i - 0]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((s & 3) == 1); + buckets[BUCKETS_INDEX4((fast_uint_t)c0, s & 3)]++; + + c0 = T[i - 1]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 0); m -= ((s & 3) == 1); + buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]++; + + c1 = T[i - 2]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 1); m -= ((s & 3) == 1); + buckets[BUCKETS_INDEX4((fast_uint_t)c0, s & 3)]++; + + c0 = T[i - 3]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 2); m -= ((s & 3) == 1); + buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]++; + } + + for (j -= 3; i >= j; i -= 1) + { + c1 = c0; c0 = T[i]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((s & 3) == 1); + buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]++; + } + + c1 = (i >= 0) ? 
T[i] : -1; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((s & 3) == 1); + buckets[BUCKETS_INDEX4((fast_uint_t)c0, s & 3)]++; + } + + return (sa_sint_t)(omp_block_start + omp_block_size - 1 - m); +} + +static sa_sint_t libsais_count_and_gather_lms_suffixes_8u_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + sa_sint_t m = 0; + +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536 && omp_get_dynamic() == 0) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(thread_state); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start; + + if (omp_num_threads == 1) + { + m = libsais_count_and_gather_lms_suffixes_8u(T, SA, n, buckets, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + thread_state[omp_thread_num].state.position = omp_block_start + omp_block_size; + thread_state[omp_thread_num].state.m = libsais_count_and_gather_lms_suffixes_8u(T, SA, n, thread_state[omp_thread_num].state.buckets, omp_block_start, omp_block_size); + + if (thread_state[omp_thread_num].state.m > 0) + { + thread_state[omp_thread_num].state.last_lms_suffix = SA[thread_state[omp_thread_num].state.position - 1]; + } + } + + #pragma omp barrier + + #pragma omp master + { + memset(buckets, 0, 4 * ALPHABET_SIZE * sizeof(sa_sint_t)); + + fast_sint_t t; + for (t = omp_num_threads - 1; t >= 0; --t) + { + m += (sa_sint_t)thread_state[t].state.m; + + if (t != omp_num_threads - 1 && thread_state[t].state.m > 0) + { + memcpy(&SA[n - m], &SA[thread_state[t].state.position - thread_state[t].state.m], (size_t)thread_state[t].state.m * sizeof(sa_sint_t)); + } + + { + sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets; + fast_sint_t s; for (s = 0; s < 4 * ALPHABET_SIZE; s += 1) { sa_sint_t A = buckets[s], B = temp_bucket[s]; buckets[s] = A + B; temp_bucket[s] = A; } + } + } + } + } +#endif + } + + return m; +} + +static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_4k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + memset(buckets, 0, 4 * (size_t)k * sizeof(sa_sint_t)); + + fast_sint_t m = omp_block_start + omp_block_size - 1; + + if (omp_block_size > 0) + { + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j = m + 1, c0 = T[m], c1 = -1; + + while (j < n && (c1 = T[j]) == c0) { ++j; } + + fast_uint_t s = c0 >= c1; + + for (i = m - 1, j = omp_block_start + prefetch_distance + 3; i >= j; i -= 4) + { + libsais_prefetch(&T[i - 2 * prefetch_distance]); + + libsais_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 0], 0)]); + libsais_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 1], 0)]); + libsais_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 2], 0)]); + libsais_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 3], 0)]); + + c1 = T[i - 0]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 
1); m -= ((s & 3) == 1); + buckets[BUCKETS_INDEX4((fast_uint_t)c0, s & 3)]++; + + c0 = T[i - 1]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 0); m -= ((s & 3) == 1); + buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]++; + + c1 = T[i - 2]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 1); m -= ((s & 3) == 1); + buckets[BUCKETS_INDEX4((fast_uint_t)c0, s & 3)]++; + + c0 = T[i - 3]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 2); m -= ((s & 3) == 1); + buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]++; + } + + for (j -= prefetch_distance + 3; i >= j; i -= 1) + { + c1 = c0; c0 = T[i]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((s & 3) == 1); + buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]++; + } + + c1 = (i >= 0) ? T[i] : -1; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((s & 3) == 1); + buckets[BUCKETS_INDEX4((fast_uint_t)c0, s & 3)]++; + } + + return (sa_sint_t)(omp_block_start + omp_block_size - 1 - m); +} + +static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + memset(buckets, 0, 2 * (size_t)k * sizeof(sa_sint_t)); + + fast_sint_t m = omp_block_start + omp_block_size - 1; + + if (omp_block_size > 0) + { + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j = m + 1, c0 = T[m], c1 = -1; + + while (j < n && (c1 = T[j]) == c0) { ++j; } + + fast_uint_t s = c0 >= c1; + + for (i = m - 1, j = omp_block_start + prefetch_distance + 3; i >= j; i -= 4) + { + libsais_prefetch(&T[i - 2 * prefetch_distance]); + + libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 0], 0)]); + libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 1], 0)]); + libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 2], 0)]); + libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 3], 0)]); + + c1 = T[i - 0]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((s & 3) == 1); + buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++; + + c0 = T[i - 1]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 0); m -= ((s & 3) == 1); + buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++; + + c1 = T[i - 2]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 1); m -= ((s & 3) == 1); + buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++; + + c0 = T[i - 3]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 2); m -= ((s & 3) == 1); + buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++; + } + + for (j -= prefetch_distance + 3; i >= j; i -= 1) + { + c1 = c0; c0 = T[i]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((s & 3) == 1); + buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++; + } + + c1 = (i >= 0) ? 
T[i] : -1; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((s & 3) == 1); + buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++; + } + + return (sa_sint_t)(omp_block_start + omp_block_size - 1 - m); +} + +static sa_sint_t libsais_count_and_gather_compacted_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + memset(buckets, 0, 2 * (size_t)k * sizeof(sa_sint_t)); + + fast_sint_t m = omp_block_start + omp_block_size - 1; + + if (omp_block_size > 0) + { + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j = m + 1, c0 = T[m], c1 = -1; + + while (j < n && (c1 = T[j]) == c0) { ++j; } + + fast_uint_t s = c0 >= c1; + + for (i = m - 1, j = omp_block_start + prefetch_distance + 3; i >= j; i -= 4) + { + libsais_prefetch(&T[i - 2 * prefetch_distance]); + + libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 0] & SAINT_MAX, 0)]); + libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 1] & SAINT_MAX, 0)]); + libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 2] & SAINT_MAX, 0)]); + libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 3] & SAINT_MAX, 0)]); + + c1 = T[i - 0]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((fast_sint_t)(s & 3) == (c0 >= 0)); + c0 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++; + + c0 = T[i - 1]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 0); m -= ((fast_sint_t)(s & 3) == (c1 >= 0)); + c1 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++; + + c1 = T[i - 2]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 1); m -= ((fast_sint_t)(s & 3) == (c0 >= 0)); + c0 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++; + + c0 = T[i - 3]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 2); m -= ((fast_sint_t)(s & 3) == (c1 >= 0)); + c1 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++; + } + + for (j -= prefetch_distance + 3; i >= j; i -= 1) + { + c1 = c0; c0 = T[i]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((fast_sint_t)(s & 3) == (c1 >= 0)); + c1 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++; + } + + c1 = (i >= 0) ? 
T[i] : -1; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((fast_sint_t)(s & 3) == (c0 >= 0)); + c0 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++; + } + + return (sa_sint_t)(omp_block_start + omp_block_size - 1 - m); +} + +#if defined(_OPENMP) + +static fast_sint_t libsais_get_bucket_stride(fast_sint_t free_space, fast_sint_t bucket_size, fast_sint_t num_buckets) +{ + fast_sint_t bucket_size_1024 = (bucket_size + 1023) & (-1024); if (free_space / (num_buckets - 1) >= bucket_size_1024) { return bucket_size_1024; } + fast_sint_t bucket_size_16 = (bucket_size + 15) & (-16); if (free_space / (num_buckets - 1) >= bucket_size_16) { return bucket_size_16; } + + return bucket_size; +} + +static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_4k_fs_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + sa_sint_t m = 0; + +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(thread_state); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start; + + if (omp_num_threads == 1) + { + m = libsais_count_and_gather_lms_suffixes_32s_4k(T, SA, n, k, buckets, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + fast_sint_t bucket_size = 4 * (fast_sint_t)k; + fast_sint_t bucket_stride = libsais_get_bucket_stride(buckets - &SA[n], bucket_size, omp_num_threads); + + { + thread_state[omp_thread_num].state.position = omp_block_start + omp_block_size; + thread_state[omp_thread_num].state.count = libsais_count_and_gather_lms_suffixes_32s_4k(T, SA, n, k, buckets - (omp_thread_num * bucket_stride), omp_block_start, omp_block_size); + } + + #pragma omp barrier + + if (omp_thread_num == omp_num_threads - 1) + { + fast_sint_t t; + for (t = omp_num_threads - 1; t >= 0; --t) + { + m += (sa_sint_t)thread_state[t].state.count; + + if (t != omp_num_threads - 1 && thread_state[t].state.count > 0) + { + memcpy(&SA[n - m], &SA[thread_state[t].state.position - thread_state[t].state.count], (size_t)thread_state[t].state.count * sizeof(sa_sint_t)); + } + } + } + else + { + omp_num_threads = omp_num_threads - 1; + omp_block_stride = (bucket_size / omp_num_threads) & (-16); + omp_block_start = omp_thread_num * omp_block_stride; + omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : bucket_size - omp_block_start; + + libsais_accumulate_counts_s32(buckets + omp_block_start, omp_block_size, bucket_stride, omp_num_threads + 1); + } + } +#endif + } + + return m; +} + +static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_2k_fs_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + sa_sint_t m = 0; + +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(thread_state); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start; + + if (omp_num_threads == 1) + { + m = libsais_count_and_gather_lms_suffixes_32s_2k(T, SA, n, k, buckets, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + fast_sint_t bucket_size = 2 * (fast_sint_t)k; + fast_sint_t bucket_stride = libsais_get_bucket_stride(buckets - &SA[n], bucket_size, omp_num_threads); + + { + thread_state[omp_thread_num].state.position = omp_block_start + omp_block_size; + thread_state[omp_thread_num].state.count = libsais_count_and_gather_lms_suffixes_32s_2k(T, SA, n, k, buckets - (omp_thread_num * bucket_stride), omp_block_start, omp_block_size); + } + + #pragma omp barrier + + if (omp_thread_num == omp_num_threads - 1) + { + fast_sint_t t; + for (t = omp_num_threads - 1; t >= 0; --t) + { + m += (sa_sint_t)thread_state[t].state.count; + + if (t != omp_num_threads - 1 && thread_state[t].state.count > 0) + { + memcpy(&SA[n - m], &SA[thread_state[t].state.position - thread_state[t].state.count], (size_t)thread_state[t].state.count * sizeof(sa_sint_t)); + } + } + } + else + { + omp_num_threads = omp_num_threads - 1; + omp_block_stride = (bucket_size / omp_num_threads) & (-16); + omp_block_start = omp_thread_num * omp_block_stride; + omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : bucket_size - omp_block_start; + + libsais_accumulate_counts_s32(buckets + omp_block_start, omp_block_size, bucket_stride, omp_num_threads + 1); + } + } +#endif + } + + return m; +} + +static void libsais_count_and_gather_compacted_lms_suffixes_32s_2k_fs_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(thread_state); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : n - omp_block_start; + + if (omp_num_threads == 1) + { + libsais_count_and_gather_compacted_lms_suffixes_32s_2k(T, SA, n, k, buckets, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + fast_sint_t bucket_size = 2 * (fast_sint_t)k; + fast_sint_t bucket_stride = libsais_get_bucket_stride(buckets - &SA[n + n], bucket_size, omp_num_threads); + + { + thread_state[omp_thread_num].state.position = omp_block_start + omp_block_size; + thread_state[omp_thread_num].state.count = libsais_count_and_gather_compacted_lms_suffixes_32s_2k(T, SA + n, n, k, buckets - (omp_thread_num * bucket_stride), omp_block_start, omp_block_size); + } + + #pragma omp barrier + + { + fast_sint_t t, m = 0; for (t = omp_num_threads - 1; t >= omp_thread_num; --t) { m += (sa_sint_t)thread_state[t].state.count; } + + if (thread_state[omp_thread_num].state.count > 0) + { + memcpy(&SA[n - m], &SA[n + thread_state[omp_thread_num].state.position - thread_state[omp_thread_num].state.count], (size_t)thread_state[omp_thread_num].state.count * sizeof(sa_sint_t)); + } + } + + { + omp_block_stride = (bucket_size / omp_num_threads) & (-16); + omp_block_start = omp_thread_num * omp_block_stride; + omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : bucket_size - omp_block_start; + + libsais_accumulate_counts_s32(buckets + omp_block_start, omp_block_size, bucket_stride, omp_num_threads); + } + } +#endif + } +} + +#endif + +static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_4k_nofs_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads) +{ + sa_sint_t m = 0; + +#if defined(_OPENMP) + #pragma omp parallel num_threads(2) if(threads > 1 && n >= 65536) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); + + fast_sint_t omp_num_threads = 1; +#endif + if (omp_num_threads == 1) + { + m = libsais_count_and_gather_lms_suffixes_32s_4k(T, SA, n, k, buckets, 0, n); + } +#if defined(_OPENMP) + else if (omp_thread_num == 0) + { + libsais_count_lms_suffixes_32s_4k(T, n, k, buckets); + } + else + { + m = libsais_gather_lms_suffixes_32s(T, SA, n); + } +#endif + } + + return m; +} + +static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_2k_nofs_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads) +{ + sa_sint_t m = 0; + +#if defined(_OPENMP) + #pragma omp parallel num_threads(2) if(threads > 1 && n >= 65536) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); + + fast_sint_t omp_num_threads = 1; +#endif + if (omp_num_threads == 1) + { + m = libsais_count_and_gather_lms_suffixes_32s_2k(T, SA, n, k, buckets, 0, n); + } +#if defined(_OPENMP) + else if (omp_thread_num == 0) + { + libsais_count_lms_suffixes_32s_2k(T, n, k, buckets); + } + else + { + m = libsais_gather_lms_suffixes_32s(T, SA, n); + } +#endif + } + + return m; +} + +static sa_sint_t libsais_count_and_gather_compacted_lms_suffixes_32s_2k_nofs_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads) +{ + sa_sint_t m = 0; + +#if defined(_OPENMP) + #pragma omp parallel num_threads(2) if(threads > 1 && n >= 65536) +#endif + { +#if defined(_OPENMP) + fast_sint_t 
omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); + + fast_sint_t omp_num_threads = 1; +#endif + if (omp_num_threads == 1) + { + m = libsais_count_and_gather_compacted_lms_suffixes_32s_2k(T, SA, n, k, buckets, 0, n); + } +#if defined(_OPENMP) + else if (omp_thread_num == 0) + { + libsais_count_compacted_lms_suffixes_32s_2k(T, n, k, buckets); + } + else + { + m = libsais_gather_compacted_lms_suffixes_32s(T, SA, n); + } +#endif + } + + return m; +} + +static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_4k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + sa_sint_t m; + +#if defined(_OPENMP) + sa_sint_t max_threads = (sa_sint_t)((buckets - &SA[n]) / ((4 * (fast_sint_t)k + 15) & (-16))); if (max_threads > threads) { max_threads = threads; } + if (max_threads > 1 && n >= 65536 && n / k >= 2) + { + if (max_threads > n / 16 / k) { max_threads = n / 16 / k; } + m = libsais_count_and_gather_lms_suffixes_32s_4k_fs_omp(T, SA, n, k, buckets, max_threads > 2 ? max_threads : 2, thread_state); + } + else +#else + UNUSED(thread_state); +#endif + { + m = libsais_count_and_gather_lms_suffixes_32s_4k_nofs_omp(T, SA, n, k, buckets, threads); + } + + return m; +} + +static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_2k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + sa_sint_t m; + +#if defined(_OPENMP) + sa_sint_t max_threads = (sa_sint_t)((buckets - &SA[n]) / ((2 * (fast_sint_t)k + 15) & (-16))); if (max_threads > threads) { max_threads = threads; } + if (max_threads > 1 && n >= 65536 && n / k >= 2) + { + if (max_threads > n / 8 / k) { max_threads = n / 8 / k; } + m = libsais_count_and_gather_lms_suffixes_32s_2k_fs_omp(T, SA, n, k, buckets, max_threads > 2 ? max_threads : 2, thread_state); + } + else +#else + UNUSED(thread_state); +#endif + { + m = libsais_count_and_gather_lms_suffixes_32s_2k_nofs_omp(T, SA, n, k, buckets, threads); + } + + return m; +} + +static void libsais_count_and_gather_compacted_lms_suffixes_32s_2k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ +#if defined(_OPENMP) + sa_sint_t max_threads = (sa_sint_t)((buckets - &SA[n + n]) / ((2 * (fast_sint_t)k + 15) & (-16))); if (max_threads > threads) { max_threads = threads; } + if (max_threads > 1 && n >= 65536 && n / k >= 2) + { + if (max_threads > n / 8 / k) { max_threads = n / 8 / k; } + libsais_count_and_gather_compacted_lms_suffixes_32s_2k_fs_omp(T, SA, n, k, buckets, max_threads > 2 ? 
max_threads : 2, thread_state); + } + else +#else + UNUSED(thread_state); +#endif + { + libsais_count_and_gather_compacted_lms_suffixes_32s_2k_nofs_omp(T, SA, n, k, buckets, threads); + } +} + +static void libsais_count_suffixes_32s(const sa_sint_t * RESTRICT T, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets) +{ + const fast_sint_t prefetch_distance = 32; + + memset(buckets, 0, (size_t)k * sizeof(sa_sint_t)); + + fast_sint_t i, j; + for (i = 0, j = (fast_sint_t)n - 7; i < j; i += 8) + { + libsais_prefetch(&T[i + prefetch_distance]); + + buckets[T[i + 0]]++; + buckets[T[i + 1]]++; + buckets[T[i + 2]]++; + buckets[T[i + 3]]++; + buckets[T[i + 4]]++; + buckets[T[i + 5]]++; + buckets[T[i + 6]]++; + buckets[T[i + 7]]++; + } + + for (j += 7; i < j; i += 1) + { + buckets[T[i]]++; + } +} + +static void libsais_initialize_buckets_start_and_end_8u(sa_sint_t * RESTRICT buckets, sa_sint_t * RESTRICT freq) +{ + sa_sint_t * RESTRICT bucket_start = &buckets[6 * ALPHABET_SIZE]; + sa_sint_t * RESTRICT bucket_end = &buckets[7 * ALPHABET_SIZE]; + + if (freq != NULL) + { + fast_sint_t i, j; sa_sint_t sum = 0; + for (i = BUCKETS_INDEX4(0, 0), j = 0; i <= BUCKETS_INDEX4(ALPHABET_SIZE - 1, 0); i += BUCKETS_INDEX4(1, 0), j += 1) + { + bucket_start[j] = sum; + sum += (freq[j] = buckets[i + BUCKETS_INDEX4(0, 0)] + buckets[i + BUCKETS_INDEX4(0, 1)] + buckets[i + BUCKETS_INDEX4(0, 2)] + buckets[i + BUCKETS_INDEX4(0, 3)]); + bucket_end[j] = sum; + } + } + else + { + fast_sint_t i, j; sa_sint_t sum = 0; + for (i = BUCKETS_INDEX4(0, 0), j = 0; i <= BUCKETS_INDEX4(ALPHABET_SIZE - 1, 0); i += BUCKETS_INDEX4(1, 0), j += 1) + { + bucket_start[j] = sum; + sum += buckets[i + BUCKETS_INDEX4(0, 0)] + buckets[i + BUCKETS_INDEX4(0, 1)] + buckets[i + BUCKETS_INDEX4(0, 2)] + buckets[i + BUCKETS_INDEX4(0, 3)]; + bucket_end[j] = sum; + } + } +} + +static void libsais_initialize_buckets_start_and_end_32s_6k(sa_sint_t k, sa_sint_t * RESTRICT buckets) +{ + sa_sint_t * RESTRICT bucket_start = &buckets[4 * k]; + sa_sint_t * RESTRICT bucket_end = &buckets[5 * k]; + + fast_sint_t i, j; sa_sint_t sum = 0; + for (i = BUCKETS_INDEX4(0, 0), j = 0; i <= BUCKETS_INDEX4((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX4(1, 0), j += 1) + { + bucket_start[j] = sum; + sum += buckets[i + BUCKETS_INDEX4(0, 0)] + buckets[i + BUCKETS_INDEX4(0, 1)] + buckets[i + BUCKETS_INDEX4(0, 2)] + buckets[i + BUCKETS_INDEX4(0, 3)]; + bucket_end[j] = sum; + } +} + +static void libsais_initialize_buckets_start_and_end_32s_4k(sa_sint_t k, sa_sint_t * RESTRICT buckets) +{ + sa_sint_t * RESTRICT bucket_start = &buckets[2 * k]; + sa_sint_t * RESTRICT bucket_end = &buckets[3 * k]; + + fast_sint_t i, j; sa_sint_t sum = 0; + for (i = BUCKETS_INDEX2(0, 0), j = 0; i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX2(1, 0), j += 1) + { + bucket_start[j] = sum; + sum += buckets[i + BUCKETS_INDEX2(0, 0)] + buckets[i + BUCKETS_INDEX2(0, 1)]; + bucket_end[j] = sum; + } +} + +static void libsais_initialize_buckets_end_32s_2k(sa_sint_t k, sa_sint_t * RESTRICT buckets) +{ + fast_sint_t i; sa_sint_t sum0 = 0; + for (i = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX2(1, 0)) + { + sum0 += buckets[i + BUCKETS_INDEX2(0, 0)] + buckets[i + BUCKETS_INDEX2(0, 1)]; buckets[i + BUCKETS_INDEX2(0, 0)] = sum0; + } +} + +static void libsais_initialize_buckets_start_and_end_32s_2k(sa_sint_t k, sa_sint_t * RESTRICT buckets) +{ + fast_sint_t i, j; + for (i = BUCKETS_INDEX2(0, 0), j = 0; i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX2(1, 
0), j += 1) + { + buckets[j] = buckets[i]; + } + + buckets[k] = 0; memcpy(&buckets[k + 1], buckets, ((size_t)k - 1) * sizeof(sa_sint_t)); +} + +static void libsais_initialize_buckets_start_32s_1k(sa_sint_t k, sa_sint_t * RESTRICT buckets) +{ + fast_sint_t i; sa_sint_t sum = 0; + for (i = 0; i <= (fast_sint_t)k - 1; i += 1) { sa_sint_t tmp = buckets[i]; buckets[i] = sum; sum += tmp; } +} + +static void libsais_initialize_buckets_end_32s_1k(sa_sint_t k, sa_sint_t * RESTRICT buckets) +{ + fast_sint_t i; sa_sint_t sum = 0; + for (i = 0; i <= (fast_sint_t)k - 1; i += 1) { sum += buckets[i]; buckets[i] = sum; } +} + +static sa_sint_t libsais_initialize_buckets_for_lms_suffixes_radix_sort_8u(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix) +{ + { + fast_uint_t s = 0; + fast_sint_t c0 = T[first_lms_suffix]; + fast_sint_t c1 = 0; + + for (; --first_lms_suffix >= 0; ) + { + c1 = c0; c0 = T[first_lms_suffix]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); + buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]--; + } + + buckets[BUCKETS_INDEX4((fast_uint_t)c0, (s << 1) & 3)]--; + } + + { + sa_sint_t * RESTRICT temp_bucket = &buckets[4 * ALPHABET_SIZE]; + + fast_sint_t i, j; sa_sint_t sum = 0; + for (i = BUCKETS_INDEX4(0, 0), j = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX4(ALPHABET_SIZE - 1, 0); i += BUCKETS_INDEX4(1, 0), j += BUCKETS_INDEX2(1, 0)) + { + temp_bucket[j + BUCKETS_INDEX2(0, 1)] = sum; sum += buckets[i + BUCKETS_INDEX4(0, 1)] + buckets[i + BUCKETS_INDEX4(0, 3)]; temp_bucket[j] = sum; + } + + return sum; + } +} + +static void libsais_initialize_buckets_for_lms_suffixes_radix_sort_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix) +{ + buckets[BUCKETS_INDEX2(T[first_lms_suffix], 0)]++; + buckets[BUCKETS_INDEX2(T[first_lms_suffix], 1)]--; + + fast_sint_t i; sa_sint_t sum0 = 0, sum1 = 0; + for (i = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX2(1, 0)) + { + sum0 += buckets[i + BUCKETS_INDEX2(0, 0)] + buckets[i + BUCKETS_INDEX2(0, 1)]; + sum1 += buckets[i + BUCKETS_INDEX2(0, 1)]; + + buckets[i + BUCKETS_INDEX2(0, 0)] = sum0; + buckets[i + BUCKETS_INDEX2(0, 1)] = sum1; + } +} + +static sa_sint_t libsais_initialize_buckets_for_lms_suffixes_radix_sort_32s_6k(const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix) +{ + { + fast_uint_t s = 0; + fast_sint_t c0 = T[first_lms_suffix]; + fast_sint_t c1 = 0; + + for (; --first_lms_suffix >= 0; ) + { + c1 = c0; c0 = T[first_lms_suffix]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); + buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]--; + } + + buckets[BUCKETS_INDEX4((fast_uint_t)c0, (s << 1) & 3)]--; + } + + { + sa_sint_t * RESTRICT temp_bucket = &buckets[4 * k]; + + fast_sint_t i, j; sa_sint_t sum = 0; + for (i = BUCKETS_INDEX4(0, 0), j = 0; i <= BUCKETS_INDEX4((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX4(1, 0), j += 1) + { + sum += buckets[i + BUCKETS_INDEX4(0, 1)] + buckets[i + BUCKETS_INDEX4(0, 3)]; temp_bucket[j] = sum; + } + + return sum; + } +} + +static void libsais_initialize_buckets_for_radix_and_partial_sorting_32s_4k(const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix) +{ + sa_sint_t * RESTRICT bucket_start = &buckets[2 * k]; + sa_sint_t * RESTRICT bucket_end = &buckets[3 * k]; + + buckets[BUCKETS_INDEX2(T[first_lms_suffix], 0)]++; + buckets[BUCKETS_INDEX2(T[first_lms_suffix], 1)]--; + + 
fast_sint_t i, j; sa_sint_t sum0 = 0, sum1 = 0; + for (i = BUCKETS_INDEX2(0, 0), j = 0; i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX2(1, 0), j += 1) + { + bucket_start[j] = sum1; + + sum0 += buckets[i + BUCKETS_INDEX2(0, 1)]; + sum1 += buckets[i + BUCKETS_INDEX2(0, 0)] + buckets[i + BUCKETS_INDEX2(0, 1)]; + buckets[i + BUCKETS_INDEX2(0, 1)] = sum0; + + bucket_end[j] = sum1; + } +} + +static void libsais_radix_sort_lms_suffixes_8u(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 3; i >= j; i -= 4) + { + libsais_prefetch(&SA[i - 2 * prefetch_distance]); + + libsais_prefetch(&T[SA[i - prefetch_distance - 0]]); + libsais_prefetch(&T[SA[i - prefetch_distance - 1]]); + libsais_prefetch(&T[SA[i - prefetch_distance - 2]]); + libsais_prefetch(&T[SA[i - prefetch_distance - 3]]); + + sa_sint_t p0 = SA[i - 0]; SA[--induction_bucket[BUCKETS_INDEX2(T[p0], 0)]] = p0; + sa_sint_t p1 = SA[i - 1]; SA[--induction_bucket[BUCKETS_INDEX2(T[p1], 0)]] = p1; + sa_sint_t p2 = SA[i - 2]; SA[--induction_bucket[BUCKETS_INDEX2(T[p2], 0)]] = p2; + sa_sint_t p3 = SA[i - 3]; SA[--induction_bucket[BUCKETS_INDEX2(T[p3], 0)]] = p3; + } + + for (j -= prefetch_distance + 3; i >= j; i -= 1) + { + sa_sint_t p = SA[i]; SA[--induction_bucket[BUCKETS_INDEX2(T[p], 0)]] = p; + } +} + +static void libsais_radix_sort_lms_suffixes_8u_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536 && m >= 65536 && omp_get_dynamic() == 0) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(thread_state); + + fast_sint_t omp_num_threads = 1; +#endif + if (omp_num_threads == 1) + { + libsais_radix_sort_lms_suffixes_8u(T, SA, &buckets[4 * ALPHABET_SIZE], (fast_sint_t)n - (fast_sint_t)m + 1, (fast_sint_t)m - 1); + } +#if defined(_OPENMP) + else + { + { + sa_sint_t * RESTRICT src_bucket = &buckets[4 * ALPHABET_SIZE]; + sa_sint_t * RESTRICT dst_bucket = thread_state[omp_thread_num].state.buckets; + + fast_sint_t i, j; + for (i = BUCKETS_INDEX2(0, 0), j = BUCKETS_INDEX4(0, 1); i <= BUCKETS_INDEX2(ALPHABET_SIZE - 1, 0); i += BUCKETS_INDEX2(1, 0), j += BUCKETS_INDEX4(1, 0)) + { + dst_bucket[i] = src_bucket[i] - dst_bucket[j]; + } + } + + { + fast_sint_t t, omp_block_start = 0, omp_block_size = thread_state[omp_thread_num].state.m; + for (t = omp_num_threads - 1; t >= omp_thread_num; --t) omp_block_start += thread_state[t].state.m; + + if (omp_block_start == (fast_sint_t)m && omp_block_size > 0) + { + omp_block_start -= 1; omp_block_size -= 1; + } + + libsais_radix_sort_lms_suffixes_8u(T, SA, thread_state[omp_thread_num].state.buckets, (fast_sint_t)n - omp_block_start, omp_block_size); + } + } +#endif + } +} + +static void libsais_radix_sort_lms_suffixes_32s_6k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 
2 * prefetch_distance + 3; i >= j; i -= 4) + { + libsais_prefetch(&SA[i - 3 * prefetch_distance]); + + libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 0]]); + libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 1]]); + libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 2]]); + libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 3]]); + + libsais_prefetchw(&induction_bucket[T[SA[i - prefetch_distance - 0]]]); + libsais_prefetchw(&induction_bucket[T[SA[i - prefetch_distance - 1]]]); + libsais_prefetchw(&induction_bucket[T[SA[i - prefetch_distance - 2]]]); + libsais_prefetchw(&induction_bucket[T[SA[i - prefetch_distance - 3]]]); + + sa_sint_t p0 = SA[i - 0]; SA[--induction_bucket[T[p0]]] = p0; + sa_sint_t p1 = SA[i - 1]; SA[--induction_bucket[T[p1]]] = p1; + sa_sint_t p2 = SA[i - 2]; SA[--induction_bucket[T[p2]]] = p2; + sa_sint_t p3 = SA[i - 3]; SA[--induction_bucket[T[p3]]] = p3; + } + + for (j -= 2 * prefetch_distance + 3; i >= j; i -= 1) + { + sa_sint_t p = SA[i]; SA[--induction_bucket[T[p]]] = p; + } +} + +static void libsais_radix_sort_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 3; i >= j; i -= 4) + { + libsais_prefetch(&SA[i - 3 * prefetch_distance]); + + libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 0]]); + libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 1]]); + libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 2]]); + libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 3]]); + + libsais_prefetchw(&induction_bucket[BUCKETS_INDEX2(T[SA[i - prefetch_distance - 0]], 0)]); + libsais_prefetchw(&induction_bucket[BUCKETS_INDEX2(T[SA[i - prefetch_distance - 1]], 0)]); + libsais_prefetchw(&induction_bucket[BUCKETS_INDEX2(T[SA[i - prefetch_distance - 2]], 0)]); + libsais_prefetchw(&induction_bucket[BUCKETS_INDEX2(T[SA[i - prefetch_distance - 3]], 0)]); + + sa_sint_t p0 = SA[i - 0]; SA[--induction_bucket[BUCKETS_INDEX2(T[p0], 0)]] = p0; + sa_sint_t p1 = SA[i - 1]; SA[--induction_bucket[BUCKETS_INDEX2(T[p1], 0)]] = p1; + sa_sint_t p2 = SA[i - 2]; SA[--induction_bucket[BUCKETS_INDEX2(T[p2], 0)]] = p2; + sa_sint_t p3 = SA[i - 3]; SA[--induction_bucket[BUCKETS_INDEX2(T[p3], 0)]] = p3; + } + + for (j -= 2 * prefetch_distance + 3; i >= j; i -= 1) + { + sa_sint_t p = SA[i]; SA[--induction_bucket[BUCKETS_INDEX2(T[p], 0)]] = p; + } +} + +#if defined(_OPENMP) + +static void libsais_radix_sort_lms_suffixes_32s_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4) + { + libsais_prefetch(&SA[i + 2 * prefetch_distance]); + + libsais_prefetch(&T[SA[i + prefetch_distance + 0]]); + libsais_prefetch(&T[SA[i + prefetch_distance + 1]]); + libsais_prefetch(&T[SA[i + prefetch_distance + 2]]); + libsais_prefetch(&T[SA[i + prefetch_distance + 3]]); + + libsais_prefetchw(&cache[i + prefetch_distance]); + + cache[i + 0].symbol = T[cache[i + 0].index = SA[i + 0]]; + cache[i + 1].symbol = T[cache[i + 1].index = SA[i + 1]]; + cache[i + 2].symbol = T[cache[i + 2].index = SA[i + 2]]; + cache[i + 3].symbol = T[cache[i + 3].index = SA[i + 3]]; + } + + for 
(j += prefetch_distance + 3; i < j; i += 1) + { + cache[i].symbol = T[cache[i].index = SA[i]]; + } +} + +static void libsais_radix_sort_lms_suffixes_32s_6k_block_sort(sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 3; i >= j; i -= 4) + { + libsais_prefetchw(&cache[i - 2 * prefetch_distance]); + + libsais_prefetchw(&induction_bucket[cache[i - prefetch_distance - 0].symbol]); + libsais_prefetchw(&induction_bucket[cache[i - prefetch_distance - 1].symbol]); + libsais_prefetchw(&induction_bucket[cache[i - prefetch_distance - 2].symbol]); + libsais_prefetchw(&induction_bucket[cache[i - prefetch_distance - 3].symbol]); + + cache[i - 0].symbol = --induction_bucket[cache[i - 0].symbol]; + cache[i - 1].symbol = --induction_bucket[cache[i - 1].symbol]; + cache[i - 2].symbol = --induction_bucket[cache[i - 2].symbol]; + cache[i - 3].symbol = --induction_bucket[cache[i - 3].symbol]; + } + + for (j -= prefetch_distance + 3; i >= j; i -= 1) + { + cache[i].symbol = --induction_bucket[cache[i].symbol]; + } +} + +static void libsais_radix_sort_lms_suffixes_32s_2k_block_sort(sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 3; i >= j; i -= 4) + { + libsais_prefetchw(&cache[i - 2 * prefetch_distance]); + + libsais_prefetchw(&induction_bucket[BUCKETS_INDEX2(cache[i - prefetch_distance - 0].symbol, 0)]); + libsais_prefetchw(&induction_bucket[BUCKETS_INDEX2(cache[i - prefetch_distance - 1].symbol, 0)]); + libsais_prefetchw(&induction_bucket[BUCKETS_INDEX2(cache[i - prefetch_distance - 2].symbol, 0)]); + libsais_prefetchw(&induction_bucket[BUCKETS_INDEX2(cache[i - prefetch_distance - 3].symbol, 0)]); + + cache[i - 0].symbol = --induction_bucket[BUCKETS_INDEX2(cache[i - 0].symbol, 0)]; + cache[i - 1].symbol = --induction_bucket[BUCKETS_INDEX2(cache[i - 1].symbol, 0)]; + cache[i - 2].symbol = --induction_bucket[BUCKETS_INDEX2(cache[i - 2].symbol, 0)]; + cache[i - 3].symbol = --induction_bucket[BUCKETS_INDEX2(cache[i - 3].symbol, 0)]; + } + + for (j -= prefetch_distance + 3; i >= j; i -= 1) + { + cache[i].symbol = --induction_bucket[BUCKETS_INDEX2(cache[i].symbol, 0)]; + } +} + +static void libsais_radix_sort_lms_suffixes_32s_6k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(cache); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : block_size - omp_block_start; + + omp_block_start += block_start; + + if (omp_num_threads == 1) + { + libsais_radix_sort_lms_suffixes_32s_6k(T, SA, induction_bucket, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + libsais_radix_sort_lms_suffixes_32s_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size); + } + + #pragma omp barrier + + #pragma omp master + { + libsais_radix_sort_lms_suffixes_32s_6k_block_sort(induction_bucket, cache - block_start, block_start, block_size); + } + + #pragma omp barrier + + { + libsais_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size); + } + } +#endif + } +} + +static void libsais_radix_sort_lms_suffixes_32s_2k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(cache); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start; + + omp_block_start += block_start; + + if (omp_num_threads == 1) + { + libsais_radix_sort_lms_suffixes_32s_2k(T, SA, induction_bucket, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + libsais_radix_sort_lms_suffixes_32s_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size); + } + + #pragma omp barrier + + #pragma omp master + { + libsais_radix_sort_lms_suffixes_32s_2k_block_sort(induction_bucket, cache - block_start, block_start, block_size); + } + + #pragma omp barrier + + { + libsais_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size); + } + } +#endif + } +} + +#endif + +static void libsais_radix_sort_lms_suffixes_32s_6k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + if (threads == 1 || m < 65536) + { + libsais_radix_sort_lms_suffixes_32s_6k(T, SA, induction_bucket, (fast_sint_t)n - (fast_sint_t)m + 1, (fast_sint_t)m - 1); + } +#if defined(_OPENMP) + else + { + fast_sint_t block_start, block_end; + for (block_start = 0; block_start < (fast_sint_t)m - 1; block_start = block_end) + { + block_end = block_start + (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end >= m) { block_end = (fast_sint_t)m - 1; } + + libsais_radix_sort_lms_suffixes_32s_6k_block_omp(T, SA, induction_bucket, thread_state[0].state.cache, (fast_sint_t)n - block_end, block_end - block_start, threads); + } + } +#else + UNUSED(thread_state); +#endif +} + +static void libsais_radix_sort_lms_suffixes_32s_2k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + if (threads == 1 || m < 65536) + { + libsais_radix_sort_lms_suffixes_32s_2k(T, SA, induction_bucket, (fast_sint_t)n - (fast_sint_t)m + 1, 
(fast_sint_t)m - 1); + } +#if defined(_OPENMP) + else + { + fast_sint_t block_start, block_end; + for (block_start = 0; block_start < (fast_sint_t)m - 1; block_start = block_end) + { + block_end = block_start + (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end >= m) { block_end = (fast_sint_t)m - 1; } + + libsais_radix_sort_lms_suffixes_32s_2k_block_omp(T, SA, induction_bucket, thread_state[0].state.cache, (fast_sint_t)n - block_end, block_end - block_start, threads); + } + } +#else + UNUSED(thread_state); +#endif +} + +static sa_sint_t libsais_radix_sort_lms_suffixes_32s_1k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets) +{ + const fast_sint_t prefetch_distance = 32; + + sa_sint_t i = n - 2; + sa_sint_t m = 0; + fast_uint_t s = 1; + fast_sint_t c0 = T[n - 1]; + fast_sint_t c1 = 0; + fast_sint_t c2 = 0; + + for (; i >= prefetch_distance + 3; i -= 4) + { + libsais_prefetch(&T[i - 2 * prefetch_distance]); + + libsais_prefetchw(&buckets[T[i - prefetch_distance - 0]]); + libsais_prefetchw(&buckets[T[i - prefetch_distance - 1]]); + libsais_prefetchw(&buckets[T[i - prefetch_distance - 2]]); + libsais_prefetchw(&buckets[T[i - prefetch_distance - 3]]); + + c1 = T[i - 0]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); + if ((s & 3) == 1) { SA[--buckets[c2 = c0]] = i + 1; m++; } + + c0 = T[i - 1]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); + if ((s & 3) == 1) { SA[--buckets[c2 = c1]] = i - 0; m++; } + + c1 = T[i - 2]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); + if ((s & 3) == 1) { SA[--buckets[c2 = c0]] = i - 1; m++; } + + c0 = T[i - 3]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); + if ((s & 3) == 1) { SA[--buckets[c2 = c1]] = i - 2; m++; } + } + + for (; i >= 0; i -= 1) + { + c1 = c0; c0 = T[i]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); + if ((s & 3) == 1) { SA[--buckets[c2 = c1]] = i + 1; m++; } + } + + if (m > 1) + { + SA[buckets[c2]] = 0; + } + + return m; +} + +static void libsais_radix_sort_set_markers_32s_6k(sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4) + { + libsais_prefetch(&induction_bucket[i + 2 * prefetch_distance]); + + libsais_prefetchw(&SA[induction_bucket[i + prefetch_distance + 0]]); + libsais_prefetchw(&SA[induction_bucket[i + prefetch_distance + 1]]); + libsais_prefetchw(&SA[induction_bucket[i + prefetch_distance + 2]]); + libsais_prefetchw(&SA[induction_bucket[i + prefetch_distance + 3]]); + + SA[induction_bucket[i + 0]] |= SAINT_MIN; + SA[induction_bucket[i + 1]] |= SAINT_MIN; + SA[induction_bucket[i + 2]] |= SAINT_MIN; + SA[induction_bucket[i + 3]] |= SAINT_MIN; + } + + for (j += prefetch_distance + 3; i < j; i += 1) + { + SA[induction_bucket[i]] |= SAINT_MIN; + } +} + +static void libsais_radix_sort_set_markers_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4) + { + libsais_prefetch(&induction_bucket[BUCKETS_INDEX2(i + 2 * prefetch_distance, 0)]); + + libsais_prefetchw(&SA[induction_bucket[BUCKETS_INDEX2(i + 
prefetch_distance + 0, 0)]]); + libsais_prefetchw(&SA[induction_bucket[BUCKETS_INDEX2(i + prefetch_distance + 1, 0)]]); + libsais_prefetchw(&SA[induction_bucket[BUCKETS_INDEX2(i + prefetch_distance + 2, 0)]]); + libsais_prefetchw(&SA[induction_bucket[BUCKETS_INDEX2(i + prefetch_distance + 3, 0)]]); + + SA[induction_bucket[BUCKETS_INDEX2(i + 0, 0)]] |= SUFFIX_GROUP_MARKER; + SA[induction_bucket[BUCKETS_INDEX2(i + 1, 0)]] |= SUFFIX_GROUP_MARKER; + SA[induction_bucket[BUCKETS_INDEX2(i + 2, 0)]] |= SUFFIX_GROUP_MARKER; + SA[induction_bucket[BUCKETS_INDEX2(i + 3, 0)]] |= SUFFIX_GROUP_MARKER; + } + + for (j += prefetch_distance + 3; i < j; i += 1) + { + SA[induction_bucket[BUCKETS_INDEX2(i, 0)]] |= SUFFIX_GROUP_MARKER; + } +} + +static void libsais_radix_sort_set_markers_32s_6k_omp(sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && k >= 65536) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); + fast_sint_t omp_block_stride = (((fast_sint_t)k - 1) / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : (fast_sint_t)k - 1 - omp_block_start; +#else + UNUSED(threads); + + fast_sint_t omp_block_start = 0; + fast_sint_t omp_block_size = (fast_sint_t)k - 1; +#endif + + libsais_radix_sort_set_markers_32s_6k(SA, induction_bucket, omp_block_start, omp_block_size); + } +} + +static void libsais_radix_sort_set_markers_32s_4k_omp(sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && k >= 65536) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); + fast_sint_t omp_block_stride = (((fast_sint_t)k - 1) / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : (fast_sint_t)k - 1 - omp_block_start; +#else + UNUSED(threads); + + fast_sint_t omp_block_start = 0; + fast_sint_t omp_block_size = (fast_sint_t)k - 1; +#endif + + libsais_radix_sort_set_markers_32s_4k(SA, induction_bucket, omp_block_start, omp_block_size); + } +} + +static void libsais_initialize_buckets_for_partial_sorting_8u(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count) +{ + sa_sint_t * RESTRICT temp_bucket = &buckets[4 * ALPHABET_SIZE]; + + buckets[BUCKETS_INDEX4((fast_uint_t)T[first_lms_suffix], 1)]++; + + fast_sint_t i, j; sa_sint_t sum0 = left_suffixes_count + 1, sum1 = 0; + for (i = BUCKETS_INDEX4(0, 0), j = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX4(ALPHABET_SIZE - 1, 0); i += BUCKETS_INDEX4(1, 0), j += BUCKETS_INDEX2(1, 0)) + { + temp_bucket[j + BUCKETS_INDEX2(0, 0)] = sum0; + + sum0 += buckets[i + BUCKETS_INDEX4(0, 0)] + buckets[i + BUCKETS_INDEX4(0, 2)]; + sum1 += buckets[i + BUCKETS_INDEX4(0, 1)]; + + buckets[j + BUCKETS_INDEX2(0, 0)] = sum0; + buckets[j + BUCKETS_INDEX2(0, 1)] = sum1; + } +} + +static void libsais_initialize_buckets_for_partial_sorting_32s_6k(const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count) +{ + sa_sint_t * RESTRICT temp_bucket = &buckets[4 * k]; + + fast_sint_t i, j; sa_sint_t sum0 = left_suffixes_count + 1, sum1 = 0, sum2 = 0; + for (first_lms_suffix = T[first_lms_suffix], i = BUCKETS_INDEX4(0, 0), j = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX4((fast_sint_t)first_lms_suffix - 1, 0); i += BUCKETS_INDEX4(1, 0), j += BUCKETS_INDEX2(1, 0)) + { + sa_sint_t SS = buckets[i + BUCKETS_INDEX4(0, 0)]; + sa_sint_t LS = buckets[i + BUCKETS_INDEX4(0, 1)]; + sa_sint_t SL = buckets[i + BUCKETS_INDEX4(0, 2)]; + sa_sint_t LL = buckets[i + BUCKETS_INDEX4(0, 3)]; + + buckets[i + BUCKETS_INDEX4(0, 0)] = sum0; + buckets[i + BUCKETS_INDEX4(0, 1)] = sum2; + buckets[i + BUCKETS_INDEX4(0, 2)] = 0; + buckets[i + BUCKETS_INDEX4(0, 3)] = 0; + + sum0 += SS + SL; sum1 += LS; sum2 += LS + LL; + + temp_bucket[j + BUCKETS_INDEX2(0, 0)] = sum0; + temp_bucket[j + BUCKETS_INDEX2(0, 1)] = sum1; + } + + for (sum1 += 1; i <= BUCKETS_INDEX4((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX4(1, 0), j += BUCKETS_INDEX2(1, 0)) + { + sa_sint_t SS = buckets[i + BUCKETS_INDEX4(0, 0)]; + sa_sint_t LS = buckets[i + BUCKETS_INDEX4(0, 1)]; + sa_sint_t SL = buckets[i + BUCKETS_INDEX4(0, 2)]; + sa_sint_t LL = buckets[i + BUCKETS_INDEX4(0, 3)]; + + buckets[i + BUCKETS_INDEX4(0, 0)] = sum0; + buckets[i + BUCKETS_INDEX4(0, 1)] = sum2; + buckets[i + BUCKETS_INDEX4(0, 2)] = 0; + buckets[i + BUCKETS_INDEX4(0, 3)] = 0; + + sum0 += SS + SL; sum1 += LS; sum2 += LS + LL; + + temp_bucket[j + BUCKETS_INDEX2(0, 0)] = sum0; + temp_bucket[j + BUCKETS_INDEX2(0, 1)] = sum1; + } +} + +static sa_sint_t libsais_partial_sorting_scan_left_to_right_8u(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + sa_sint_t * RESTRICT induction_bucket = &buckets[4 * ALPHABET_SIZE]; + sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE]; + + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) + { + libsais_prefetch(&SA[i + 2 * prefetch_distance]); + + libsais_prefetch(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 1); + libsais_prefetch(&T[SA[i + 
prefetch_distance + 0] & SAINT_MAX] - 2); + libsais_prefetch(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 1); + libsais_prefetch(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 2); + + sa_sint_t p0 = SA[i + 0]; d += (p0 < 0); p0 &= SAINT_MAX; sa_sint_t v0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] >= T[p0 - 1]); + SA[induction_bucket[v0]++] = (p0 - 1) | ((sa_sint_t)(distinct_names[v0] != d) << (SAINT_BIT - 1)); distinct_names[v0] = d; + + sa_sint_t p1 = SA[i + 1]; d += (p1 < 0); p1 &= SAINT_MAX; sa_sint_t v1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] >= T[p1 - 1]); + SA[induction_bucket[v1]++] = (p1 - 1) | ((sa_sint_t)(distinct_names[v1] != d) << (SAINT_BIT - 1)); distinct_names[v1] = d; + } + + for (j += prefetch_distance + 1; i < j; i += 1) + { + sa_sint_t p = SA[i]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] >= T[p - 1]); + SA[induction_bucket[v]++] = (p - 1) | ((sa_sint_t)(distinct_names[v] != d) << (SAINT_BIT - 1)); distinct_names[v] = d; + } + + return d; +} + +#if defined(_OPENMP) + +static void libsais_partial_sorting_scan_left_to_right_8u_block_prepare(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size, LIBSAIS_THREAD_STATE * RESTRICT state) +{ + const fast_sint_t prefetch_distance = 32; + + sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE]; + sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE]; + + memset(buckets, 0, 4 * ALPHABET_SIZE * sizeof(sa_sint_t)); + + fast_sint_t i, j, count = 0; sa_sint_t d = 1; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) + { + libsais_prefetch(&SA[i + 2 * prefetch_distance]); + + libsais_prefetch(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 1); + libsais_prefetch(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 2); + libsais_prefetch(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 1); + libsais_prefetch(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 2); + + sa_sint_t p0 = cache[count].index = SA[i + 0]; d += (p0 < 0); p0 &= SAINT_MAX; sa_sint_t v0 = cache[count++].symbol = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] >= T[p0 - 1]); induction_bucket[v0]++; distinct_names[v0] = d; + sa_sint_t p1 = cache[count].index = SA[i + 1]; d += (p1 < 0); p1 &= SAINT_MAX; sa_sint_t v1 = cache[count++].symbol = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] >= T[p1 - 1]); induction_bucket[v1]++; distinct_names[v1] = d; + } + + for (j += prefetch_distance + 1; i < j; i += 1) + { + sa_sint_t p = cache[count].index = SA[i]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = cache[count++].symbol = BUCKETS_INDEX2(T[p - 1], T[p - 2] >= T[p - 1]); induction_bucket[v]++; distinct_names[v] = d; + } + + state[0].state.position = (fast_sint_t)d - 1; + state[0].state.count = count; +} + +static void libsais_partial_sorting_scan_left_to_right_8u_block_place(sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count, sa_sint_t d) +{ + const fast_sint_t prefetch_distance = 32; + + sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE]; + sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE]; + + fast_sint_t i, j; + for (i = 0, j = count - 1; i < j; i += 2) + { + libsais_prefetch(&cache[i + prefetch_distance]); + + sa_sint_t p0 = cache[i + 0].index; d += (p0 < 0); sa_sint_t v0 = cache[i + 0].symbol; + SA[induction_bucket[v0]++] = (p0 - 1) | ((sa_sint_t)(distinct_names[v0] != d) << 
(SAINT_BIT - 1)); distinct_names[v0] = d; + + sa_sint_t p1 = cache[i + 1].index; d += (p1 < 0); sa_sint_t v1 = cache[i + 1].symbol; + SA[induction_bucket[v1]++] = (p1 - 1) | ((sa_sint_t)(distinct_names[v1] != d) << (SAINT_BIT - 1)); distinct_names[v1] = d; + } + + for (j += 1; i < j; i += 1) + { + sa_sint_t p = cache[i].index; d += (p < 0); sa_sint_t v = cache[i].symbol; + SA[induction_bucket[v]++] = (p - 1) | ((sa_sint_t)(distinct_names[v] != d) << (SAINT_BIT - 1)); distinct_names[v] = d; + } +} + +static sa_sint_t libsais_partial_sorting_scan_left_to_right_8u_block_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * ALPHABET_SIZE && omp_get_dynamic() == 0) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(thread_state); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start; + + omp_block_start += block_start; + + if (omp_num_threads == 1) + { + d = libsais_partial_sorting_scan_left_to_right_8u(T, SA, buckets, d, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + libsais_partial_sorting_scan_left_to_right_8u_block_prepare(T, SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size, &thread_state[omp_thread_num]); + } + + #pragma omp barrier + + #pragma omp master + { + sa_sint_t * RESTRICT induction_bucket = &buckets[4 * ALPHABET_SIZE]; + sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE]; + + fast_sint_t t; + for (t = 0; t < omp_num_threads; ++t) + { + sa_sint_t * RESTRICT temp_induction_bucket = &thread_state[t].state.buckets[0 * ALPHABET_SIZE]; + sa_sint_t * RESTRICT temp_distinct_names = &thread_state[t].state.buckets[2 * ALPHABET_SIZE]; + + fast_sint_t c; + for (c = 0; c < 2 * ALPHABET_SIZE; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_induction_bucket[c]; induction_bucket[c] = A + B; temp_induction_bucket[c] = A; } + + for (d -= 1, c = 0; c < 2 * ALPHABET_SIZE; c += 1) { sa_sint_t A = distinct_names[c], B = temp_distinct_names[c], D = B + d; distinct_names[c] = B > 0 ? 
D : A; temp_distinct_names[c] = A; } + d += 1 + (sa_sint_t)thread_state[t].state.position; thread_state[t].state.position = (fast_sint_t)d - thread_state[t].state.position; + } + } + + #pragma omp barrier + + { + libsais_partial_sorting_scan_left_to_right_8u_block_place(SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count, (sa_sint_t)thread_state[omp_thread_num].state.position); + } + } +#endif + } + + return d; +} + +#endif + +static sa_sint_t libsais_partial_sorting_scan_left_to_right_8u_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t left_suffixes_count, sa_sint_t d, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + sa_sint_t * RESTRICT induction_bucket = &buckets[4 * ALPHABET_SIZE]; + sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE]; + + SA[induction_bucket[BUCKETS_INDEX2(T[n - 1], T[n - 2] >= T[n - 1])]++] = (n - 1) | SAINT_MIN; + distinct_names[BUCKETS_INDEX2(T[n - 1], T[n - 2] >= T[n - 1])] = ++d; + + if (threads == 1 || left_suffixes_count < 65536) + { + d = libsais_partial_sorting_scan_left_to_right_8u(T, SA, buckets, d, 0, left_suffixes_count); + } +#if defined(_OPENMP) + else + { + fast_sint_t block_start; + for (block_start = 0; block_start < left_suffixes_count; ) + { + if (SA[block_start] == 0) + { + block_start++; + } + else + { + fast_sint_t block_max_end = block_start + ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads); if (block_max_end > left_suffixes_count) { block_max_end = left_suffixes_count;} + fast_sint_t block_end = block_start + 1; while (block_end < block_max_end && SA[block_end] != 0) { block_end++; } + fast_sint_t block_size = block_end - block_start; + + if (block_size < 32) + { + for (; block_start < block_end; block_start += 1) + { + sa_sint_t p = SA[block_start]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] >= T[p - 1]); + SA[induction_bucket[v]++] = (p - 1) | ((sa_sint_t)(distinct_names[v] != d) << (SAINT_BIT - 1)); distinct_names[v] = d; + } + } + else + { + d = libsais_partial_sorting_scan_left_to_right_8u_block_omp(T, SA, buckets, d, block_start, block_size, threads, thread_state); + block_start = block_end; + } + } + } + } +#else + UNUSED(thread_state); +#endif + + return d; +} + +static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_6k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1; i < j; i += 2) + { + libsais_prefetch(&SA[i + 3 * prefetch_distance]); + + libsais_prefetch(&T[SA[i + 2 * prefetch_distance + 0] & SAINT_MAX] - 1); + libsais_prefetch(&T[SA[i + 2 * prefetch_distance + 0] & SAINT_MAX] - 2); + libsais_prefetch(&T[SA[i + 2 * prefetch_distance + 1] & SAINT_MAX] - 1); + libsais_prefetch(&T[SA[i + 2 * prefetch_distance + 1] & SAINT_MAX] - 2); + + sa_sint_t p0 = SA[i + prefetch_distance + 0] & SAINT_MAX; sa_sint_t v0 = BUCKETS_INDEX4(T[p0 - (p0 > 0)], 0); libsais_prefetchw(&buckets[v0]); + sa_sint_t p1 = SA[i + prefetch_distance + 1] & SAINT_MAX; sa_sint_t v1 = BUCKETS_INDEX4(T[p1 - (p1 > 0)], 0); libsais_prefetchw(&buckets[v1]); + + sa_sint_t p2 = SA[i + 0]; d += (p2 < 0); p2 &= SAINT_MAX; sa_sint_t v2 = BUCKETS_INDEX4(T[p2 - 1], 
T[p2 - 2] >= T[p2 - 1]); + SA[buckets[v2]++] = (p2 - 1) | ((sa_sint_t)(buckets[2 + v2] != d) << (SAINT_BIT - 1)); buckets[2 + v2] = d; + + sa_sint_t p3 = SA[i + 1]; d += (p3 < 0); p3 &= SAINT_MAX; sa_sint_t v3 = BUCKETS_INDEX4(T[p3 - 1], T[p3 - 2] >= T[p3 - 1]); + SA[buckets[v3]++] = (p3 - 1) | ((sa_sint_t)(buckets[2 + v3] != d) << (SAINT_BIT - 1)); buckets[2 + v3] = d; + } + + for (j += 2 * prefetch_distance + 1; i < j; i += 1) + { + sa_sint_t p = SA[i]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = BUCKETS_INDEX4(T[p - 1], T[p - 2] >= T[p - 1]); + SA[buckets[v]++] = (p - 1) | ((sa_sint_t)(buckets[2 + v] != d) << (SAINT_BIT - 1)); buckets[2 + v] = d; + } + + return d; +} + +static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_4k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + sa_sint_t * RESTRICT induction_bucket = &buckets[2 * k]; + sa_sint_t * RESTRICT distinct_names = &buckets[0 * k]; + + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1; i < j; i += 2) + { + libsais_prefetchw(&SA[i + 3 * prefetch_distance]); + + sa_sint_t s0 = SA[i + 2 * prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0 & ~SUFFIX_GROUP_MARKER] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i + 2 * prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1 & ~SUFFIX_GROUP_MARKER] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? Ts1 : NULL); + sa_sint_t s2 = SA[i + 1 * prefetch_distance + 0]; if (s2 > 0) { const fast_sint_t Ts2 = T[(s2 & ~SUFFIX_GROUP_MARKER) - 1]; libsais_prefetchw(&induction_bucket[Ts2]); libsais_prefetchw(&distinct_names[BUCKETS_INDEX2(Ts2, 0)]); } + sa_sint_t s3 = SA[i + 1 * prefetch_distance + 1]; if (s3 > 0) { const fast_sint_t Ts3 = T[(s3 & ~SUFFIX_GROUP_MARKER) - 1]; libsais_prefetchw(&induction_bucket[Ts3]); libsais_prefetchw(&distinct_names[BUCKETS_INDEX2(Ts3, 0)]); } + + sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 & SAINT_MAX; + if (p0 > 0) + { + SA[i + 0] = 0; d += (p0 >> (SUFFIX_GROUP_BIT - 1)); p0 &= ~SUFFIX_GROUP_MARKER; sa_sint_t v0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] < T[p0 - 1]); + SA[induction_bucket[T[p0 - 1]]++] = (p0 - 1) | ((sa_sint_t)(T[p0 - 2] < T[p0 - 1]) << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v0] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v0] = d; + } + + sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 & SAINT_MAX; + if (p1 > 0) + { + SA[i + 1] = 0; d += (p1 >> (SUFFIX_GROUP_BIT - 1)); p1 &= ~SUFFIX_GROUP_MARKER; sa_sint_t v1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] < T[p1 - 1]); + SA[induction_bucket[T[p1 - 1]]++] = (p1 - 1) | ((sa_sint_t)(T[p1 - 2] < T[p1 - 1]) << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v1] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v1] = d; + } + } + + for (j += 2 * prefetch_distance + 1; i < j; i += 1) + { + sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; + if (p > 0) + { + SA[i] = 0; d += (p >> (SUFFIX_GROUP_BIT - 1)); p &= ~SUFFIX_GROUP_MARKER; sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] < T[p - 1]); + SA[induction_bucket[T[p - 1]]++] = (p - 1) | ((sa_sint_t)(T[p - 2] < T[p - 1]) << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v] = d; + } + } + + return d; +} + +static void libsais_partial_sorting_scan_left_to_right_32s_1k(const sa_sint_t * RESTRICT T, 
sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1; i < j; i += 2) + { + libsais_prefetchw(&SA[i + 3 * prefetch_distance]); + + sa_sint_t s0 = SA[i + 2 * prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i + 2 * prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); + sa_sint_t s2 = SA[i + 1 * prefetch_distance + 0]; if (s2 > 0) { libsais_prefetchw(&induction_bucket[T[s2 - 1]]); libsais_prefetch(&T[s2] - 2); } + sa_sint_t s3 = SA[i + 1 * prefetch_distance + 1]; if (s3 > 0) { libsais_prefetchw(&induction_bucket[T[s3 - 1]]); libsais_prefetch(&T[s3] - 2); } + + sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 & SAINT_MAX; if (p0 > 0) { SA[i + 0] = 0; SA[induction_bucket[T[p0 - 1]]++] = (p0 - 1) | ((sa_sint_t)(T[p0 - 2] < T[p0 - 1]) << (SAINT_BIT - 1)); } + sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 & SAINT_MAX; if (p1 > 0) { SA[i + 1] = 0; SA[induction_bucket[T[p1 - 1]]++] = (p1 - 1) | ((sa_sint_t)(T[p1 - 2] < T[p1 - 1]) << (SAINT_BIT - 1)); } + } + + for (j += 2 * prefetch_distance + 1; i < j; i += 1) + { + sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { SA[i] = 0; SA[induction_bucket[T[p - 1]]++] = (p - 1) | ((sa_sint_t)(T[p - 2] < T[p - 1]) << (SAINT_BIT - 1)); } + } +} + +#if defined(_OPENMP) + +static void libsais_partial_sorting_scan_left_to_right_32s_6k_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) + { + libsais_prefetch(&SA[i + 2 * prefetch_distance]); + + libsais_prefetch(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 1); + libsais_prefetch(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 2); + libsais_prefetch(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 1); + libsais_prefetch(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 2); + + libsais_prefetchw(&cache[i + prefetch_distance]); + + sa_sint_t p0 = cache[i + 0].index = SA[i + 0]; sa_sint_t symbol0 = 0; p0 &= SAINT_MAX; if (p0 != 0) { symbol0 = BUCKETS_INDEX4(T[p0 - 1], T[p0 - 2] >= T[p0 - 1]); } cache[i + 0].symbol = symbol0; + sa_sint_t p1 = cache[i + 1].index = SA[i + 1]; sa_sint_t symbol1 = 0; p1 &= SAINT_MAX; if (p1 != 0) { symbol1 = BUCKETS_INDEX4(T[p1 - 1], T[p1 - 2] >= T[p1 - 1]); } cache[i + 1].symbol = symbol1; + } + + for (j += prefetch_distance + 1; i < j; i += 1) + { + sa_sint_t p = cache[i].index = SA[i]; sa_sint_t symbol = 0; p &= SAINT_MAX; if (p != 0) { symbol = BUCKETS_INDEX4(T[p - 1], T[p - 2] >= T[p - 1]); } cache[i].symbol = symbol; + } +} + +static void libsais_partial_sorting_scan_left_to_right_32s_4k_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) + { + libsais_prefetchw(&SA[i + 2 * prefetch_distance]); + + sa_sint_t s0 = SA[i + prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0 & 
~SUFFIX_GROUP_MARKER] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i + prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1 & ~SUFFIX_GROUP_MARKER] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? Ts1 : NULL); + + libsais_prefetchw(&cache[i + prefetch_distance]); + + sa_sint_t symbol0 = SAINT_MIN, p0 = SA[i + 0]; if (p0 > 0) { cache[i + 0].index = p0; p0 &= ~SUFFIX_GROUP_MARKER; symbol0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] < T[p0 - 1]); p0 = 0; } cache[i + 0].symbol = symbol0; SA[i + 0] = p0 & SAINT_MAX; + sa_sint_t symbol1 = SAINT_MIN, p1 = SA[i + 1]; if (p1 > 0) { cache[i + 1].index = p1; p1 &= ~SUFFIX_GROUP_MARKER; symbol1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] < T[p1 - 1]); p1 = 0; } cache[i + 1].symbol = symbol1; SA[i + 1] = p1 & SAINT_MAX; + } + + for (j += prefetch_distance + 1; i < j; i += 1) + { + sa_sint_t symbol = SAINT_MIN, p = SA[i]; if (p > 0) { cache[i].index = p; p &= ~SUFFIX_GROUP_MARKER; symbol = BUCKETS_INDEX2(T[p - 1], T[p - 2] < T[p - 1]); p = 0; } cache[i].symbol = symbol; SA[i] = p & SAINT_MAX; + } +} + +static void libsais_partial_sorting_scan_left_to_right_32s_1k_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) + { + libsais_prefetchw(&SA[i + 2 * prefetch_distance]); + + sa_sint_t s0 = SA[i + prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i + prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? 
Ts1 : NULL); + + libsais_prefetchw(&cache[i + prefetch_distance]); + + sa_sint_t symbol0 = SAINT_MIN, p0 = SA[i + 0]; if (p0 > 0) { cache[i + 0].index = (p0 - 1) | ((sa_sint_t)(T[p0 - 2] < T[p0 - 1]) << (SAINT_BIT - 1)); symbol0 = T[p0 - 1]; p0 = 0; } cache[i + 0].symbol = symbol0; SA[i + 0] = p0 & SAINT_MAX; + sa_sint_t symbol1 = SAINT_MIN, p1 = SA[i + 1]; if (p1 > 0) { cache[i + 1].index = (p1 - 1) | ((sa_sint_t)(T[p1 - 2] < T[p1 - 1]) << (SAINT_BIT - 1)); symbol1 = T[p1 - 1]; p1 = 0; } cache[i + 1].symbol = symbol1; SA[i + 1] = p1 & SAINT_MAX; + } + + for (j += prefetch_distance + 1; i < j; i += 1) + { + sa_sint_t symbol = SAINT_MIN, p = SA[i]; if (p > 0) { cache[i].index = (p - 1) | ((sa_sint_t)(T[p - 2] < T[p - 1]) << (SAINT_BIT - 1)); symbol = T[p - 1]; p = 0; } cache[i].symbol = symbol; SA[i] = p & SAINT_MAX; + } +} + +static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_6k_block_sort(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j, omp_block_end = omp_block_start + omp_block_size; + for (i = omp_block_start, j = omp_block_end - prefetch_distance - 1; i < j; i += 2) + { + libsais_prefetchw(&cache[i + 2 * prefetch_distance]); + + libsais_prefetchw(&buckets[cache[i + prefetch_distance + 0].symbol]); + libsais_prefetchw(&buckets[cache[i + prefetch_distance + 1].symbol]); + + sa_sint_t v0 = cache[i + 0].symbol, p0 = cache[i + 0].index; d += (p0 < 0); cache[i + 0].symbol = buckets[v0]++; cache[i + 0].index = (p0 - 1) | ((sa_sint_t)(buckets[2 + v0] != d) << (SAINT_BIT - 1)); buckets[2 + v0] = d; + if (cache[i + 0].symbol < omp_block_end) { sa_sint_t s = cache[i + 0].symbol, q = (cache[s].index = cache[i + 0].index) & SAINT_MAX; cache[s].symbol = BUCKETS_INDEX4(T[q - 1], T[q - 2] >= T[q - 1]); } + + sa_sint_t v1 = cache[i + 1].symbol, p1 = cache[i + 1].index; d += (p1 < 0); cache[i + 1].symbol = buckets[v1]++; cache[i + 1].index = (p1 - 1) | ((sa_sint_t)(buckets[2 + v1] != d) << (SAINT_BIT - 1)); buckets[2 + v1] = d; + if (cache[i + 1].symbol < omp_block_end) { sa_sint_t s = cache[i + 1].symbol, q = (cache[s].index = cache[i + 1].index) & SAINT_MAX; cache[s].symbol = BUCKETS_INDEX4(T[q - 1], T[q - 2] >= T[q - 1]); } + } + + for (j += prefetch_distance + 1; i < j; i += 1) + { + sa_sint_t v = cache[i].symbol, p = cache[i].index; d += (p < 0); cache[i].symbol = buckets[v]++; cache[i].index = (p - 1) | ((sa_sint_t)(buckets[2 + v] != d) << (SAINT_BIT - 1)); buckets[2 + v] = d; + if (cache[i].symbol < omp_block_end) { sa_sint_t s = cache[i].symbol, q = (cache[s].index = cache[i].index) & SAINT_MAX; cache[s].symbol = BUCKETS_INDEX4(T[q - 1], T[q - 2] >= T[q - 1]); } + } + + return d; +} + +static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_4k_block_sort(const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + sa_sint_t * RESTRICT induction_bucket = &buckets[2 * k]; + sa_sint_t * RESTRICT distinct_names = &buckets[0 * k]; + + fast_sint_t i, j, omp_block_end = omp_block_start + omp_block_size; + for (i = omp_block_start, j = omp_block_end - prefetch_distance - 1; i < j; i += 2) + { + libsais_prefetchw(&cache[i + 2 * prefetch_distance]); + + sa_sint_t s0 = cache[i + prefetch_distance + 0].symbol; const sa_sint_t * Is0 
= &induction_bucket[s0 >> 1]; libsais_prefetchw(s0 >= 0 ? Is0 : NULL); const sa_sint_t * Ds0 = &distinct_names[s0]; libsais_prefetchw(s0 >= 0 ? Ds0 : NULL); + sa_sint_t s1 = cache[i + prefetch_distance + 1].symbol; const sa_sint_t * Is1 = &induction_bucket[s1 >> 1]; libsais_prefetchw(s1 >= 0 ? Is1 : NULL); const sa_sint_t * Ds1 = &distinct_names[s1]; libsais_prefetchw(s1 >= 0 ? Ds1 : NULL); + + sa_sint_t v0 = cache[i + 0].symbol; + if (v0 >= 0) + { + sa_sint_t p0 = cache[i + 0].index; d += (p0 >> (SUFFIX_GROUP_BIT - 1)); cache[i + 0].symbol = induction_bucket[v0 >> 1]++; cache[i + 0].index = (p0 - 1) | (v0 << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v0] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v0] = d; + if (cache[i + 0].symbol < omp_block_end) { sa_sint_t ni = cache[i + 0].symbol, np = cache[i + 0].index; if (np > 0) { cache[ni].index = np; np &= ~SUFFIX_GROUP_MARKER; cache[ni].symbol = BUCKETS_INDEX2(T[np - 1], T[np - 2] < T[np - 1]); np = 0; } cache[i + 0].index = np & SAINT_MAX; } + } + + sa_sint_t v1 = cache[i + 1].symbol; + if (v1 >= 0) + { + sa_sint_t p1 = cache[i + 1].index; d += (p1 >> (SUFFIX_GROUP_BIT - 1)); cache[i + 1].symbol = induction_bucket[v1 >> 1]++; cache[i + 1].index = (p1 - 1) | (v1 << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v1] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v1] = d; + if (cache[i + 1].symbol < omp_block_end) { sa_sint_t ni = cache[i + 1].symbol, np = cache[i + 1].index; if (np > 0) { cache[ni].index = np; np &= ~SUFFIX_GROUP_MARKER; cache[ni].symbol = BUCKETS_INDEX2(T[np - 1], T[np - 2] < T[np - 1]); np = 0; } cache[i + 1].index = np & SAINT_MAX; } + } + } + + for (j += prefetch_distance + 1; i < j; i += 1) + { + sa_sint_t v = cache[i].symbol; + if (v >= 0) + { + sa_sint_t p = cache[i].index; d += (p >> (SUFFIX_GROUP_BIT - 1)); cache[i].symbol = induction_bucket[v >> 1]++; cache[i].index = (p - 1) | (v << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v] = d; + if (cache[i].symbol < omp_block_end) { sa_sint_t ni = cache[i].symbol, np = cache[i].index; if (np > 0) { cache[ni].index = np; np &= ~SUFFIX_GROUP_MARKER; cache[ni].symbol = BUCKETS_INDEX2(T[np - 1], T[np - 2] < T[np - 1]); np = 0; } cache[i].index = np & SAINT_MAX; } + } + } + + return d; +} + +static void libsais_partial_sorting_scan_left_to_right_32s_1k_block_sort(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j, omp_block_end = omp_block_start + omp_block_size; + for (i = omp_block_start, j = omp_block_end - prefetch_distance - 1; i < j; i += 2) + { + libsais_prefetchw(&cache[i + 2 * prefetch_distance]); + + sa_sint_t s0 = cache[i + prefetch_distance + 0].symbol; const sa_sint_t * Is0 = &induction_bucket[s0]; libsais_prefetchw(s0 >= 0 ? Is0 : NULL); + sa_sint_t s1 = cache[i + prefetch_distance + 1].symbol; const sa_sint_t * Is1 = &induction_bucket[s1]; libsais_prefetchw(s1 >= 0 ? 
Is1 : NULL); + + sa_sint_t v0 = cache[i + 0].symbol; + if (v0 >= 0) + { + cache[i + 0].symbol = induction_bucket[v0]++; + if (cache[i + 0].symbol < omp_block_end) { sa_sint_t ni = cache[i + 0].symbol, np = cache[i + 0].index; if (np > 0) { cache[ni].index = (np - 1) | ((sa_sint_t)(T[np - 2] < T[np - 1]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np - 1]; np = 0; } cache[i + 0].index = np & SAINT_MAX; } + } + + sa_sint_t v1 = cache[i + 1].symbol; + if (v1 >= 0) + { + cache[i + 1].symbol = induction_bucket[v1]++; + if (cache[i + 1].symbol < omp_block_end) { sa_sint_t ni = cache[i + 1].symbol, np = cache[i + 1].index; if (np > 0) { cache[ni].index = (np - 1) | ((sa_sint_t)(T[np - 2] < T[np - 1]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np - 1]; np = 0; } cache[i + 1].index = np & SAINT_MAX; } + } + } + + for (j += prefetch_distance + 1; i < j; i += 1) + { + sa_sint_t v = cache[i].symbol; + if (v >= 0) + { + cache[i].symbol = induction_bucket[v]++; + if (cache[i].symbol < omp_block_end) { sa_sint_t ni = cache[i].symbol, np = cache[i].index; if (np > 0) { cache[ni].index = (np - 1) | ((sa_sint_t)(T[np - 2] < T[np - 1]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np - 1]; np = 0; } cache[i].index = np & SAINT_MAX; } + } + } +} + +static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_6k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(cache); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : block_size - omp_block_start; + + omp_block_start += block_start; + + if (omp_num_threads == 1) + { + d = libsais_partial_sorting_scan_left_to_right_32s_6k(T, SA, buckets, d, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + libsais_partial_sorting_scan_left_to_right_32s_6k_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size); + } + + #pragma omp barrier + + #pragma omp master + { + d = libsais_partial_sorting_scan_left_to_right_32s_6k_block_sort(T, buckets, d, cache - block_start, block_start, block_size); + } + + #pragma omp barrier + + { + libsais_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size); + } + } +#endif + } + + return d; +} + +static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_4k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(cache); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start; + + omp_block_start += block_start; + + if (omp_num_threads == 1) + { + d = libsais_partial_sorting_scan_left_to_right_32s_4k(T, SA, k, buckets, d, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + libsais_partial_sorting_scan_left_to_right_32s_4k_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size); + } + + #pragma omp barrier + + #pragma omp master + { + d = libsais_partial_sorting_scan_left_to_right_32s_4k_block_sort(T, k, buckets, d, cache - block_start, block_start, block_size); + } + + #pragma omp barrier + + { + libsais_compact_and_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size); + } + } +#endif + } + + return d; +} + +static void libsais_partial_sorting_scan_left_to_right_32s_1k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(cache); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : block_size - omp_block_start; + + omp_block_start += block_start; + + if (omp_num_threads == 1) + { + libsais_partial_sorting_scan_left_to_right_32s_1k(T, SA, buckets, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + libsais_partial_sorting_scan_left_to_right_32s_1k_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size); + } + + #pragma omp barrier + + #pragma omp master + { + libsais_partial_sorting_scan_left_to_right_32s_1k_block_sort(T, buckets, cache - block_start, block_start, block_size); + } + + #pragma omp barrier + + { + libsais_compact_and_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size); + } + } +#endif + } +} + +#endif + +static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_6k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t left_suffixes_count, sa_sint_t d, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + SA[buckets[BUCKETS_INDEX4(T[n - 1], T[n - 2] >= T[n - 1])]++] = (n - 1) | SAINT_MIN; + buckets[2 + BUCKETS_INDEX4(T[n - 1], T[n - 2] >= T[n - 1])] = ++d; + + if (threads == 1 || left_suffixes_count < 65536) + { + d = libsais_partial_sorting_scan_left_to_right_32s_6k(T, SA, buckets, d, 0, left_suffixes_count); + } +#if defined(_OPENMP) + else + { + fast_sint_t block_start, block_end; + for (block_start = 0; block_start < left_suffixes_count; block_start = block_end) + { + block_end = block_start + (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end > left_suffixes_count) { block_end = left_suffixes_count; } + + d = libsais_partial_sorting_scan_left_to_right_32s_6k_block_omp(T, SA, buckets, d, thread_state[0].state.cache, block_start, block_end - block_start, threads); + } + } +#else + UNUSED(thread_state); +#endif + + return d; +} + +static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_4k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + sa_sint_t * RESTRICT induction_bucket = &buckets[2 * k]; + sa_sint_t * RESTRICT distinct_names = &buckets[0 * k]; + + SA[induction_bucket[T[n - 1]]++] = (n - 1) | ((sa_sint_t)(T[n - 2] < T[n - 1]) << (SAINT_BIT - 1)) | SUFFIX_GROUP_MARKER; + distinct_names[BUCKETS_INDEX2(T[n - 1], T[n - 2] < T[n - 1])] = ++d; + + if (threads == 1 || n < 65536) + { + d = libsais_partial_sorting_scan_left_to_right_32s_4k(T, SA, k, buckets, d, 0, n); + } +#if defined(_OPENMP) + else + { + fast_sint_t block_start, block_end; + for (block_start = 0; block_start < n; block_start = block_end) + { + block_end = block_start + (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end > n) { block_end = n; } + + d = libsais_partial_sorting_scan_left_to_right_32s_4k_block_omp(T, SA, k, buckets, d, thread_state[0].state.cache, block_start, block_end - block_start, threads); + } + } +#else + UNUSED(thread_state); +#endif + + return d; +} + +static void libsais_partial_sorting_scan_left_to_right_32s_1k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + SA[buckets[T[n - 1]]++] = (n - 1) | ((sa_sint_t)(T[n - 2] < T[n - 1]) << (SAINT_BIT - 1)); + + if (threads == 1 || n < 65536) + { + libsais_partial_sorting_scan_left_to_right_32s_1k(T, SA, buckets, 0, n); + } +#if defined(_OPENMP) + else 
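+ /* Multi-threaded path: walk the SA range in blocks of threads * LIBSAIS_PER_THREAD_CACHE_SIZE entries, delegating each block to libsais_partial_sorting_scan_left_to_right_32s_1k_block_omp. */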
+ { + fast_sint_t block_start, block_end; + for (block_start = 0; block_start < n; block_start = block_end) + { + block_end = block_start + (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end > n) { block_end = n; } + + libsais_partial_sorting_scan_left_to_right_32s_1k_block_omp(T, SA, buckets, thread_state[0].state.cache, block_start, block_end - block_start, threads); + } + } +#else + UNUSED(thread_state); +#endif +} + +static void libsais_partial_sorting_shift_markers_8u_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, const sa_sint_t * RESTRICT buckets, sa_sint_t threads) +{ + const fast_sint_t prefetch_distance = 32; + + const sa_sint_t * RESTRICT temp_bucket = &buckets[4 * ALPHABET_SIZE]; + + fast_sint_t c; + +#if defined(_OPENMP) + #pragma omp parallel for schedule(static, 1) num_threads(threads) if(threads > 1 && n >= 65536) +#else + UNUSED(threads); UNUSED(n); +#endif + for (c = BUCKETS_INDEX2(ALPHABET_SIZE - 1, 0); c >= BUCKETS_INDEX2(1, 0); c -= BUCKETS_INDEX2(1, 0)) + { + fast_sint_t i, j; sa_sint_t s = SAINT_MIN; + for (i = (fast_sint_t)temp_bucket[c] - 1, j = (fast_sint_t)buckets[c - BUCKETS_INDEX2(1, 0)] + 3; i >= j; i -= 4) + { + libsais_prefetchw(&SA[i - prefetch_distance]); + + sa_sint_t p0 = SA[i - 0], q0 = (p0 & SAINT_MIN) ^ s; s = s ^ q0; SA[i - 0] = p0 ^ q0; + sa_sint_t p1 = SA[i - 1], q1 = (p1 & SAINT_MIN) ^ s; s = s ^ q1; SA[i - 1] = p1 ^ q1; + sa_sint_t p2 = SA[i - 2], q2 = (p2 & SAINT_MIN) ^ s; s = s ^ q2; SA[i - 2] = p2 ^ q2; + sa_sint_t p3 = SA[i - 3], q3 = (p3 & SAINT_MIN) ^ s; s = s ^ q3; SA[i - 3] = p3 ^ q3; + } + + for (j -= 3; i >= j; i -= 1) + { + sa_sint_t p = SA[i], q = (p & SAINT_MIN) ^ s; s = s ^ q; SA[i] = p ^ q; + } + } +} + +static void libsais_partial_sorting_shift_markers_32s_6k_omp(sa_sint_t * RESTRICT SA, sa_sint_t k, const sa_sint_t * RESTRICT buckets, sa_sint_t threads) +{ + const fast_sint_t prefetch_distance = 32; + + const sa_sint_t * RESTRICT temp_bucket = &buckets[4 * k]; + + fast_sint_t c; + +#if defined(_OPENMP) + #pragma omp parallel for schedule(static, 1) num_threads(threads) if(threads > 1 && k >= 65536) +#else + UNUSED(threads); +#endif + for (c = (fast_sint_t)k - 1; c >= 1; c -= 1) + { + fast_sint_t i, j; sa_sint_t s = SAINT_MIN; + for (i = (fast_sint_t)buckets[BUCKETS_INDEX4(c, 0)] - 1, j = (fast_sint_t)temp_bucket[BUCKETS_INDEX2(c - 1, 0)] + 3; i >= j; i -= 4) + { + libsais_prefetchw(&SA[i - prefetch_distance]); + + sa_sint_t p0 = SA[i - 0], q0 = (p0 & SAINT_MIN) ^ s; s = s ^ q0; SA[i - 0] = p0 ^ q0; + sa_sint_t p1 = SA[i - 1], q1 = (p1 & SAINT_MIN) ^ s; s = s ^ q1; SA[i - 1] = p1 ^ q1; + sa_sint_t p2 = SA[i - 2], q2 = (p2 & SAINT_MIN) ^ s; s = s ^ q2; SA[i - 2] = p2 ^ q2; + sa_sint_t p3 = SA[i - 3], q3 = (p3 & SAINT_MIN) ^ s; s = s ^ q3; SA[i - 3] = p3 ^ q3; + } + + for (j -= 3; i >= j; i -= 1) + { + sa_sint_t p = SA[i], q = (p & SAINT_MIN) ^ s; s = s ^ q; SA[i] = p ^ q; + } + } +} + +static void libsais_partial_sorting_shift_markers_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t n) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i; sa_sint_t s = SUFFIX_GROUP_MARKER; + for (i = (fast_sint_t)n - 1; i >= 3; i -= 4) + { + libsais_prefetchw(&SA[i - prefetch_distance]); + + sa_sint_t p0 = SA[i - 0], q0 = ((p0 & SUFFIX_GROUP_MARKER) ^ s) & ((sa_sint_t)(p0 > 0) << ((SUFFIX_GROUP_BIT - 1))); s = s ^ q0; SA[i - 0] = p0 ^ q0; + sa_sint_t p1 = SA[i - 1], q1 = ((p1 & SUFFIX_GROUP_MARKER) ^ s) & ((sa_sint_t)(p1 > 0) << ((SUFFIX_GROUP_BIT - 1))); s = s ^ q1; SA[i - 1] = p1 ^ q1; + sa_sint_t p2 = SA[i - 2], q2 = ((p2 & 
SUFFIX_GROUP_MARKER) ^ s) & ((sa_sint_t)(p2 > 0) << ((SUFFIX_GROUP_BIT - 1))); s = s ^ q2; SA[i - 2] = p2 ^ q2; + sa_sint_t p3 = SA[i - 3], q3 = ((p3 & SUFFIX_GROUP_MARKER) ^ s) & ((sa_sint_t)(p3 > 0) << ((SUFFIX_GROUP_BIT - 1))); s = s ^ q3; SA[i - 3] = p3 ^ q3; + } + + for (; i >= 0; i -= 1) + { + sa_sint_t p = SA[i], q = ((p & SUFFIX_GROUP_MARKER) ^ s) & ((sa_sint_t)(p > 0) << ((SUFFIX_GROUP_BIT - 1))); s = s ^ q; SA[i] = p ^ q; + } +} + +static void libsais_partial_sorting_shift_buckets_32s_6k(sa_sint_t k, sa_sint_t * RESTRICT buckets) +{ + sa_sint_t * RESTRICT temp_bucket = &buckets[4 * k]; + + fast_sint_t i; + for (i = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX2(1, 0)) + { + buckets[2 * i + BUCKETS_INDEX4(0, 0)] = temp_bucket[i + BUCKETS_INDEX2(0, 0)]; + buckets[2 * i + BUCKETS_INDEX4(0, 1)] = temp_bucket[i + BUCKETS_INDEX2(0, 1)]; + } +} + +static sa_sint_t libsais_partial_sorting_scan_right_to_left_8u(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE]; + sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE]; + + fast_sint_t i, j; + for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2) + { + libsais_prefetch(&SA[i - 2 * prefetch_distance]); + + libsais_prefetch(&T[SA[i - prefetch_distance - 0] & SAINT_MAX] - 1); + libsais_prefetch(&T[SA[i - prefetch_distance - 0] & SAINT_MAX] - 2); + libsais_prefetch(&T[SA[i - prefetch_distance - 1] & SAINT_MAX] - 1); + libsais_prefetch(&T[SA[i - prefetch_distance - 1] & SAINT_MAX] - 2); + + sa_sint_t p0 = SA[i - 0]; d += (p0 < 0); p0 &= SAINT_MAX; sa_sint_t v0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] > T[p0 - 1]); + SA[--induction_bucket[v0]] = (p0 - 1) | ((sa_sint_t)(distinct_names[v0] != d) << (SAINT_BIT - 1)); distinct_names[v0] = d; + + sa_sint_t p1 = SA[i - 1]; d += (p1 < 0); p1 &= SAINT_MAX; sa_sint_t v1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] > T[p1 - 1]); + SA[--induction_bucket[v1]] = (p1 - 1) | ((sa_sint_t)(distinct_names[v1] != d) << (SAINT_BIT - 1)); distinct_names[v1] = d; + } + + for (j -= prefetch_distance + 1; i >= j; i -= 1) + { + sa_sint_t p = SA[i]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]); + SA[--induction_bucket[v]] = (p - 1) | ((sa_sint_t)(distinct_names[v] != d) << (SAINT_BIT - 1)); distinct_names[v] = d; + } + + return d; +} + +#if defined(_OPENMP) + +static void libsais_partial_sorting_scan_right_to_left_8u_block_prepare(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size, LIBSAIS_THREAD_STATE * RESTRICT state) +{ + const fast_sint_t prefetch_distance = 32; + + sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE]; + sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE]; + + memset(buckets, 0, 4 * ALPHABET_SIZE * sizeof(sa_sint_t)); + + fast_sint_t i, j, count = 0; sa_sint_t d = 1; + for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2) + { + libsais_prefetch(&SA[i - 2 * prefetch_distance]); + + libsais_prefetch(&T[SA[i - prefetch_distance - 0] & SAINT_MAX] - 1); + libsais_prefetch(&T[SA[i - prefetch_distance - 0] & SAINT_MAX] - 2); + 
libsais_prefetch(&T[SA[i - prefetch_distance - 1] & SAINT_MAX] - 1); + libsais_prefetch(&T[SA[i - prefetch_distance - 1] & SAINT_MAX] - 2); + + sa_sint_t p0 = cache[count].index = SA[i - 0]; d += (p0 < 0); p0 &= SAINT_MAX; sa_sint_t v0 = cache[count++].symbol = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] > T[p0 - 1]); induction_bucket[v0]++; distinct_names[v0] = d; + sa_sint_t p1 = cache[count].index = SA[i - 1]; d += (p1 < 0); p1 &= SAINT_MAX; sa_sint_t v1 = cache[count++].symbol = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] > T[p1 - 1]); induction_bucket[v1]++; distinct_names[v1] = d; + } + + for (j -= prefetch_distance + 1; i >= j; i -= 1) + { + sa_sint_t p = cache[count].index = SA[i]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = cache[count++].symbol = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]); induction_bucket[v]++; distinct_names[v] = d; + } + + state[0].state.position = (fast_sint_t)d - 1; + state[0].state.count = count; +} + +static void libsais_partial_sorting_scan_right_to_left_8u_block_place(sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count, sa_sint_t d) +{ + const fast_sint_t prefetch_distance = 32; + + sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE]; + sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE]; + + fast_sint_t i, j; + for (i = 0, j = count - 1; i < j; i += 2) + { + libsais_prefetch(&cache[i + prefetch_distance]); + + sa_sint_t p0 = cache[i + 0].index; d += (p0 < 0); sa_sint_t v0 = cache[i + 0].symbol; + SA[--induction_bucket[v0]] = (p0 - 1) | ((sa_sint_t)(distinct_names[v0] != d) << (SAINT_BIT - 1)); distinct_names[v0] = d; + + sa_sint_t p1 = cache[i + 1].index; d += (p1 < 0); sa_sint_t v1 = cache[i + 1].symbol; + SA[--induction_bucket[v1]] = (p1 - 1) | ((sa_sint_t)(distinct_names[v1] != d) << (SAINT_BIT - 1)); distinct_names[v1] = d; + } + + for (j += 1; i < j; i += 1) + { + sa_sint_t p = cache[i].index; d += (p < 0); sa_sint_t v = cache[i].symbol; + SA[--induction_bucket[v]] = (p - 1) | ((sa_sint_t)(distinct_names[v] != d) << (SAINT_BIT - 1)); distinct_names[v] = d; + } +} + +static sa_sint_t libsais_partial_sorting_scan_right_to_left_8u_block_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * ALPHABET_SIZE && omp_get_dynamic() == 0) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(thread_state); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : block_size - omp_block_start; + + omp_block_start += block_start; + + if (omp_num_threads == 1) + { + d = libsais_partial_sorting_scan_right_to_left_8u(T, SA, buckets, d, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + libsais_partial_sorting_scan_right_to_left_8u_block_prepare(T, SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size, &thread_state[omp_thread_num]); + } + + #pragma omp barrier + + #pragma omp master + { + sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE]; + sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE]; + + fast_sint_t t; + for (t = omp_num_threads - 1; t >= 0; --t) + { + sa_sint_t * RESTRICT temp_induction_bucket = &thread_state[t].state.buckets[0 * ALPHABET_SIZE]; + sa_sint_t * RESTRICT temp_distinct_names = &thread_state[t].state.buckets[2 * ALPHABET_SIZE]; + + fast_sint_t c; + for (c = 0; c < 2 * ALPHABET_SIZE; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_induction_bucket[c]; induction_bucket[c] = A - B; temp_induction_bucket[c] = A; } + + for (d -= 1, c = 0; c < 2 * ALPHABET_SIZE; c += 1) { sa_sint_t A = distinct_names[c], B = temp_distinct_names[c], D = B + d; distinct_names[c] = B > 0 ? D : A; temp_distinct_names[c] = A; } + d += 1 + (sa_sint_t)thread_state[t].state.position; thread_state[t].state.position = (fast_sint_t)d - thread_state[t].state.position; + } + } + + #pragma omp barrier + + { + libsais_partial_sorting_scan_right_to_left_8u_block_place(SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count, (sa_sint_t)thread_state[omp_thread_num].state.position); + } + } +#endif + } + + return d; +} + +#endif + +static void libsais_partial_sorting_scan_right_to_left_8u_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count, sa_sint_t d, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + fast_sint_t scan_start = (fast_sint_t)left_suffixes_count + 1; + fast_sint_t scan_end = (fast_sint_t)n - (fast_sint_t)first_lms_suffix; + + if (threads == 1 || (scan_end - scan_start) < 65536) + { + libsais_partial_sorting_scan_right_to_left_8u(T, SA, buckets, d, scan_start, scan_end - scan_start); + } +#if defined(_OPENMP) + else + { + sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE]; + sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE]; + + fast_sint_t block_start; + for (block_start = scan_end - 1; block_start >= scan_start; ) + { + if (SA[block_start] == 0) + { + block_start--; + } + else + { + fast_sint_t block_max_end = block_start - ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads); if (block_max_end < scan_start) { block_max_end = scan_start - 1; } + fast_sint_t block_end = block_start - 1; while (block_end > block_max_end && SA[block_end] != 0) { block_end--; } + fast_sint_t block_size = block_start - block_end; + + if (block_size < 32) + { + for (; block_start > block_end; block_start -= 1) + { + sa_sint_t p = SA[block_start]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]); + SA[--induction_bucket[v]] = (p - 1) | ((sa_sint_t)(distinct_names[v] != d) << (SAINT_BIT - 1)); distinct_names[v] = d; + } + } + else + { + d = libsais_partial_sorting_scan_right_to_left_8u_block_omp(T, SA, buckets, d, block_end + 1, 
block_size, threads, thread_state); + block_start = block_end; + } + } + } + } +#else + UNUSED(thread_state); +#endif +} + +static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_6k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 1; i >= j; i -= 2) + { + libsais_prefetch(&SA[i - 3 * prefetch_distance]); + + libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 0] & SAINT_MAX] - 1); + libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 0] & SAINT_MAX] - 2); + libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 1] & SAINT_MAX] - 1); + libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 1] & SAINT_MAX] - 2); + + sa_sint_t p0 = SA[i - prefetch_distance - 0] & SAINT_MAX; sa_sint_t v0 = BUCKETS_INDEX4(T[p0 - (p0 > 0)], 0); libsais_prefetchw(&buckets[v0]); + sa_sint_t p1 = SA[i - prefetch_distance - 1] & SAINT_MAX; sa_sint_t v1 = BUCKETS_INDEX4(T[p1 - (p1 > 0)], 0); libsais_prefetchw(&buckets[v1]); + + sa_sint_t p2 = SA[i - 0]; d += (p2 < 0); p2 &= SAINT_MAX; sa_sint_t v2 = BUCKETS_INDEX4(T[p2 - 1], T[p2 - 2] > T[p2 - 1]); + SA[--buckets[v2]] = (p2 - 1) | ((sa_sint_t)(buckets[2 + v2] != d) << (SAINT_BIT - 1)); buckets[2 + v2] = d; + + sa_sint_t p3 = SA[i - 1]; d += (p3 < 0); p3 &= SAINT_MAX; sa_sint_t v3 = BUCKETS_INDEX4(T[p3 - 1], T[p3 - 2] > T[p3 - 1]); + SA[--buckets[v3]] = (p3 - 1) | ((sa_sint_t)(buckets[2 + v3] != d) << (SAINT_BIT - 1)); buckets[2 + v3] = d; + } + + for (j -= 2 * prefetch_distance + 1; i >= j; i -= 1) + { + sa_sint_t p = SA[i]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = BUCKETS_INDEX4(T[p - 1], T[p - 2] > T[p - 1]); + SA[--buckets[v]] = (p - 1) | ((sa_sint_t)(buckets[2 + v] != d) << (SAINT_BIT - 1)); buckets[2 + v] = d; + } + + return d; +} + +static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_4k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + sa_sint_t * RESTRICT induction_bucket = &buckets[3 * k]; + sa_sint_t * RESTRICT distinct_names = &buckets[0 * k]; + + fast_sint_t i, j; + for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 1; i >= j; i -= 2) + { + libsais_prefetchw(&SA[i - 3 * prefetch_distance]); + + sa_sint_t s0 = SA[i - 2 * prefetch_distance - 0]; const sa_sint_t * Ts0 = &T[s0 & ~SUFFIX_GROUP_MARKER] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i - 2 * prefetch_distance - 1]; const sa_sint_t * Ts1 = &T[s1 & ~SUFFIX_GROUP_MARKER] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? 
Ts1 : NULL); + sa_sint_t s2 = SA[i - 1 * prefetch_distance - 0]; if (s2 > 0) { const fast_sint_t Ts2 = T[(s2 & ~SUFFIX_GROUP_MARKER) - 1]; libsais_prefetchw(&induction_bucket[Ts2]); libsais_prefetchw(&distinct_names[BUCKETS_INDEX2(Ts2, 0)]); } + sa_sint_t s3 = SA[i - 1 * prefetch_distance - 1]; if (s3 > 0) { const fast_sint_t Ts3 = T[(s3 & ~SUFFIX_GROUP_MARKER) - 1]; libsais_prefetchw(&induction_bucket[Ts3]); libsais_prefetchw(&distinct_names[BUCKETS_INDEX2(Ts3, 0)]); } + + sa_sint_t p0 = SA[i - 0]; + if (p0 > 0) + { + SA[i - 0] = 0; d += (p0 >> (SUFFIX_GROUP_BIT - 1)); p0 &= ~SUFFIX_GROUP_MARKER; sa_sint_t v0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] > T[p0 - 1]); + SA[--induction_bucket[T[p0 - 1]]] = (p0 - 1) | ((sa_sint_t)(T[p0 - 2] > T[p0 - 1]) << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v0] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v0] = d; + } + + sa_sint_t p1 = SA[i - 1]; + if (p1 > 0) + { + SA[i - 1] = 0; d += (p1 >> (SUFFIX_GROUP_BIT - 1)); p1 &= ~SUFFIX_GROUP_MARKER; sa_sint_t v1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] > T[p1 - 1]); + SA[--induction_bucket[T[p1 - 1]]] = (p1 - 1) | ((sa_sint_t)(T[p1 - 2] > T[p1 - 1]) << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v1] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v1] = d; + } + } + + for (j -= 2 * prefetch_distance + 1; i >= j; i -= 1) + { + sa_sint_t p = SA[i]; + if (p > 0) + { + SA[i] = 0; d += (p >> (SUFFIX_GROUP_BIT - 1)); p &= ~SUFFIX_GROUP_MARKER; sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]); + SA[--induction_bucket[T[p - 1]]] = (p - 1) | ((sa_sint_t)(T[p - 2] > T[p - 1]) << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v] = d; + } + } + + return d; +} + +static void libsais_partial_sorting_scan_right_to_left_32s_1k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 1; i >= j; i -= 2) + { + libsais_prefetchw(&SA[i - 3 * prefetch_distance]); + + sa_sint_t s0 = SA[i - 2 * prefetch_distance - 0]; const sa_sint_t * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i - 2 * prefetch_distance - 1]; const sa_sint_t * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? 
Ts1 : NULL); + sa_sint_t s2 = SA[i - 1 * prefetch_distance - 0]; if (s2 > 0) { libsais_prefetchw(&induction_bucket[T[s2 - 1]]); libsais_prefetch(&T[s2] - 2); } + sa_sint_t s3 = SA[i - 1 * prefetch_distance - 1]; if (s3 > 0) { libsais_prefetchw(&induction_bucket[T[s3 - 1]]); libsais_prefetch(&T[s3] - 2); } + + sa_sint_t p0 = SA[i - 0]; if (p0 > 0) { SA[i - 0] = 0; SA[--induction_bucket[T[p0 - 1]]] = (p0 - 1) | ((sa_sint_t)(T[p0 - 2] > T[p0 - 1]) << (SAINT_BIT - 1)); } + sa_sint_t p1 = SA[i - 1]; if (p1 > 0) { SA[i - 1] = 0; SA[--induction_bucket[T[p1 - 1]]] = (p1 - 1) | ((sa_sint_t)(T[p1 - 2] > T[p1 - 1]) << (SAINT_BIT - 1)); } + } + + for (j -= 2 * prefetch_distance + 1; i >= j; i -= 1) + { + sa_sint_t p = SA[i]; if (p > 0) { SA[i] = 0; SA[--induction_bucket[T[p - 1]]] = (p - 1) | ((sa_sint_t)(T[p - 2] > T[p - 1]) << (SAINT_BIT - 1)); } + } +} + +#if defined(_OPENMP) + +static void libsais_partial_sorting_scan_right_to_left_32s_6k_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) + { + libsais_prefetch(&SA[i + 2 * prefetch_distance]); + + libsais_prefetch(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 1); + libsais_prefetch(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 2); + libsais_prefetch(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 1); + libsais_prefetch(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 2); + + libsais_prefetchw(&cache[i + prefetch_distance]); + + sa_sint_t p0 = cache[i + 0].index = SA[i + 0]; sa_sint_t symbol0 = 0; p0 &= SAINT_MAX; if (p0 != 0) { symbol0 = BUCKETS_INDEX4(T[p0 - 1], T[p0 - 2] > T[p0 - 1]); } cache[i + 0].symbol = symbol0; + sa_sint_t p1 = cache[i + 1].index = SA[i + 1]; sa_sint_t symbol1 = 0; p1 &= SAINT_MAX; if (p1 != 0) { symbol1 = BUCKETS_INDEX4(T[p1 - 1], T[p1 - 2] > T[p1 - 1]); } cache[i + 1].symbol = symbol1; + } + + for (j += prefetch_distance + 1; i < j; i += 1) + { + sa_sint_t p = cache[i].index = SA[i]; sa_sint_t symbol = 0; p &= SAINT_MAX; if (p != 0) { symbol = BUCKETS_INDEX4(T[p - 1], T[p - 2] > T[p - 1]); } cache[i].symbol = symbol; + } +} + +static void libsais_partial_sorting_scan_right_to_left_32s_4k_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) + { + libsais_prefetchw(&SA[i + 2 * prefetch_distance]); + + sa_sint_t s0 = SA[i + prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0 & ~SUFFIX_GROUP_MARKER] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i + prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1 & ~SUFFIX_GROUP_MARKER] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? 
Ts1 : NULL); + + libsais_prefetchw(&cache[i + prefetch_distance]); + + sa_sint_t symbol0 = SAINT_MIN, p0 = SA[i + 0]; if (p0 > 0) { SA[i + 0] = 0; cache[i + 0].index = p0; p0 &= ~SUFFIX_GROUP_MARKER; symbol0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] > T[p0 - 1]); } cache[i + 0].symbol = symbol0; + sa_sint_t symbol1 = SAINT_MIN, p1 = SA[i + 1]; if (p1 > 0) { SA[i + 1] = 0; cache[i + 1].index = p1; p1 &= ~SUFFIX_GROUP_MARKER; symbol1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] > T[p1 - 1]); } cache[i + 1].symbol = symbol1; + } + + for (j += prefetch_distance + 1; i < j; i += 1) + { + sa_sint_t symbol = SAINT_MIN, p = SA[i]; if (p > 0) { SA[i] = 0; cache[i].index = p; p &= ~SUFFIX_GROUP_MARKER; symbol = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]); } cache[i].symbol = symbol; + } +} + +static void libsais_partial_sorting_scan_right_to_left_32s_1k_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) + { + libsais_prefetchw(&SA[i + 2 * prefetch_distance]); + + sa_sint_t s0 = SA[i + prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i + prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? Ts1 : NULL); + + libsais_prefetchw(&cache[i + prefetch_distance]); + + sa_sint_t symbol0 = SAINT_MIN, p0 = SA[i + 0]; if (p0 > 0) { SA[i + 0] = 0; cache[i + 0].index = (p0 - 1) | ((sa_sint_t)(T[p0 - 2] > T[p0 - 1]) << (SAINT_BIT - 1)); symbol0 = T[p0 - 1]; } cache[i + 0].symbol = symbol0; + sa_sint_t symbol1 = SAINT_MIN, p1 = SA[i + 1]; if (p1 > 0) { SA[i + 1] = 0; cache[i + 1].index = (p1 - 1) | ((sa_sint_t)(T[p1 - 2] > T[p1 - 1]) << (SAINT_BIT - 1)); symbol1 = T[p1 - 1]; } cache[i + 1].symbol = symbol1; + } + + for (j += prefetch_distance + 1; i < j; i += 1) + { + sa_sint_t symbol = SAINT_MIN, p = SA[i]; if (p > 0) { SA[i] = 0; cache[i].index = (p - 1) | ((sa_sint_t)(T[p - 2] > T[p - 1]) << (SAINT_BIT - 1)); symbol = T[p - 1]; } cache[i].symbol = symbol; + } +} + +static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_6k_block_sort(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2) + { + libsais_prefetchw(&cache[i - 2 * prefetch_distance]); + + libsais_prefetchw(&buckets[cache[i - prefetch_distance - 0].symbol]); + libsais_prefetchw(&buckets[cache[i - prefetch_distance - 1].symbol]); + + sa_sint_t v0 = cache[i - 0].symbol, p0 = cache[i - 0].index; d += (p0 < 0); cache[i - 0].symbol = --buckets[v0]; cache[i - 0].index = (p0 - 1) | ((sa_sint_t)(buckets[2 + v0] != d) << (SAINT_BIT - 1)); buckets[2 + v0] = d; + if (cache[i - 0].symbol >= omp_block_start) { sa_sint_t s = cache[i - 0].symbol, q = (cache[s].index = cache[i - 0].index) & SAINT_MAX; cache[s].symbol = BUCKETS_INDEX4(T[q - 1], T[q - 2] > T[q - 1]); } + + sa_sint_t v1 = cache[i - 1].symbol, p1 = cache[i - 1].index; d += (p1 < 0); cache[i - 1].symbol = --buckets[v1]; cache[i - 1].index = (p1 - 
1) | ((sa_sint_t)(buckets[2 + v1] != d) << (SAINT_BIT - 1)); buckets[2 + v1] = d; + if (cache[i - 1].symbol >= omp_block_start) { sa_sint_t s = cache[i - 1].symbol, q = (cache[s].index = cache[i - 1].index) & SAINT_MAX; cache[s].symbol = BUCKETS_INDEX4(T[q - 1], T[q - 2] > T[q - 1]); } + } + + for (j -= prefetch_distance + 1; i >= j; i -= 1) + { + sa_sint_t v = cache[i].symbol, p = cache[i].index; d += (p < 0); cache[i].symbol = --buckets[v]; cache[i].index = (p - 1) | ((sa_sint_t)(buckets[2 + v] != d) << (SAINT_BIT - 1)); buckets[2 + v] = d; + if (cache[i].symbol >= omp_block_start) { sa_sint_t s = cache[i].symbol, q = (cache[s].index = cache[i].index) & SAINT_MAX; cache[s].symbol = BUCKETS_INDEX4(T[q - 1], T[q - 2] > T[q - 1]); } + } + + return d; +} + +static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_4k_block_sort(const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + sa_sint_t * RESTRICT induction_bucket = &buckets[3 * k]; + sa_sint_t * RESTRICT distinct_names = &buckets[0 * k]; + + fast_sint_t i, j; + for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2) + { + libsais_prefetchw(&cache[i - 2 * prefetch_distance]); + + sa_sint_t s0 = cache[i - prefetch_distance - 0].symbol; const sa_sint_t * Is0 = &induction_bucket[s0 >> 1]; libsais_prefetchw(s0 >= 0 ? Is0 : NULL); const sa_sint_t * Ds0 = &distinct_names[s0]; libsais_prefetchw(s0 >= 0 ? Ds0 : NULL); + sa_sint_t s1 = cache[i - prefetch_distance - 1].symbol; const sa_sint_t * Is1 = &induction_bucket[s1 >> 1]; libsais_prefetchw(s1 >= 0 ? Is1 : NULL); const sa_sint_t * Ds1 = &distinct_names[s1]; libsais_prefetchw(s1 >= 0 ? 
Ds1 : NULL); + + sa_sint_t v0 = cache[i - 0].symbol; + if (v0 >= 0) + { + sa_sint_t p0 = cache[i - 0].index; d += (p0 >> (SUFFIX_GROUP_BIT - 1)); cache[i - 0].symbol = --induction_bucket[v0 >> 1]; cache[i - 0].index = (p0 - 1) | (v0 << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v0] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v0] = d; + if (cache[i - 0].symbol >= omp_block_start) { sa_sint_t ni = cache[i - 0].symbol, np = cache[i - 0].index; if (np > 0) { cache[i - 0].index = 0; cache[ni].index = np; np &= ~SUFFIX_GROUP_MARKER; cache[ni].symbol = BUCKETS_INDEX2(T[np - 1], T[np - 2] > T[np - 1]); } } + } + + sa_sint_t v1 = cache[i - 1].symbol; + if (v1 >= 0) + { + sa_sint_t p1 = cache[i - 1].index; d += (p1 >> (SUFFIX_GROUP_BIT - 1)); cache[i - 1].symbol = --induction_bucket[v1 >> 1]; cache[i - 1].index = (p1 - 1) | (v1 << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v1] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v1] = d; + if (cache[i - 1].symbol >= omp_block_start) { sa_sint_t ni = cache[i - 1].symbol, np = cache[i - 1].index; if (np > 0) { cache[i - 1].index = 0; cache[ni].index = np; np &= ~SUFFIX_GROUP_MARKER; cache[ni].symbol = BUCKETS_INDEX2(T[np - 1], T[np - 2] > T[np - 1]); } } + } + } + + for (j -= prefetch_distance + 1; i >= j; i -= 1) + { + sa_sint_t v = cache[i].symbol; + if (v >= 0) + { + sa_sint_t p = cache[i].index; d += (p >> (SUFFIX_GROUP_BIT - 1)); cache[i].symbol = --induction_bucket[v >> 1]; cache[i].index = (p - 1) | (v << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v] = d; + if (cache[i].symbol >= omp_block_start) { sa_sint_t ni = cache[i].symbol, np = cache[i].index; if (np > 0) { cache[i].index = 0; cache[ni].index = np; np &= ~SUFFIX_GROUP_MARKER; cache[ni].symbol = BUCKETS_INDEX2(T[np - 1], T[np - 2] > T[np - 1]); } } + } + } + + return d; +} + +static void libsais_partial_sorting_scan_right_to_left_32s_1k_block_sort(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2) + { + libsais_prefetchw(&cache[i - 2 * prefetch_distance]); + + sa_sint_t s0 = cache[i - prefetch_distance - 0].symbol; const sa_sint_t * Is0 = &induction_bucket[s0]; libsais_prefetchw(s0 >= 0 ? Is0 : NULL); + sa_sint_t s1 = cache[i - prefetch_distance - 1].symbol; const sa_sint_t * Is1 = &induction_bucket[s1]; libsais_prefetchw(s1 >= 0 ? 
Is1 : NULL); + + sa_sint_t v0 = cache[i - 0].symbol; + if (v0 >= 0) + { + cache[i - 0].symbol = --induction_bucket[v0]; + if (cache[i - 0].symbol >= omp_block_start) { sa_sint_t ni = cache[i - 0].symbol, np = cache[i - 0].index; if (np > 0) { cache[i - 0].index = 0; cache[ni].index = (np - 1) | ((sa_sint_t)(T[np - 2] > T[np - 1]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np - 1]; } } + } + + sa_sint_t v1 = cache[i - 1].symbol; + if (v1 >= 0) + { + cache[i - 1].symbol = --induction_bucket[v1]; + if (cache[i - 1].symbol >= omp_block_start) { sa_sint_t ni = cache[i - 1].symbol, np = cache[i - 1].index; if (np > 0) { cache[i - 1].index = 0; cache[ni].index = (np - 1) | ((sa_sint_t)(T[np - 2] > T[np - 1]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np - 1]; }} + } + } + + for (j -= prefetch_distance + 1; i >= j; i -= 1) + { + sa_sint_t v = cache[i].symbol; + if (v >= 0) + { + cache[i].symbol = --induction_bucket[v]; + if (cache[i].symbol >= omp_block_start) { sa_sint_t ni = cache[i].symbol, np = cache[i].index; if (np > 0) { cache[i].index = 0; cache[ni].index = (np - 1) | ((sa_sint_t)(T[np - 2] > T[np - 1]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np - 1]; } } + } + } +} + +static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_6k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(cache); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : block_size - omp_block_start; + + omp_block_start += block_start; + + if (omp_num_threads == 1) + { + d = libsais_partial_sorting_scan_right_to_left_32s_6k(T, SA, buckets, d, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + libsais_partial_sorting_scan_right_to_left_32s_6k_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size); + } + + #pragma omp barrier + + #pragma omp master + { + d = libsais_partial_sorting_scan_right_to_left_32s_6k_block_sort(T, buckets, d, cache - block_start, block_start, block_size); + } + + #pragma omp barrier + + { + libsais_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size); + } + } +#endif + } + + return d; +} + +static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_4k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(cache); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start; + + omp_block_start += block_start; + + if (omp_num_threads == 1) + { + d = libsais_partial_sorting_scan_right_to_left_32s_4k(T, SA, k, buckets, d, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + libsais_partial_sorting_scan_right_to_left_32s_4k_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size); + } + + #pragma omp barrier + + #pragma omp master + { + d = libsais_partial_sorting_scan_right_to_left_32s_4k_block_sort(T, k, buckets, d, cache - block_start, block_start, block_size); + } + + #pragma omp barrier + + { + libsais_compact_and_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size); + } + } +#endif + } + + return d; +} + +static void libsais_partial_sorting_scan_right_to_left_32s_1k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(cache); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : block_size - omp_block_start; + + omp_block_start += block_start; + + if (omp_num_threads == 1) + { + libsais_partial_sorting_scan_right_to_left_32s_1k(T, SA, buckets, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + libsais_partial_sorting_scan_right_to_left_32s_1k_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size); + } + + #pragma omp barrier + + #pragma omp master + { + libsais_partial_sorting_scan_right_to_left_32s_1k_block_sort(T, buckets, cache - block_start, block_start, block_size); + } + + #pragma omp barrier + + { + libsais_compact_and_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size); + } + } +#endif + } +} + +#endif + +static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_6k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count, sa_sint_t d, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + fast_sint_t scan_start = (fast_sint_t)left_suffixes_count + 1; + fast_sint_t scan_end = (fast_sint_t)n - (fast_sint_t)first_lms_suffix; + + if (threads == 1 || (scan_end - scan_start) < 65536) + { + d = libsais_partial_sorting_scan_right_to_left_32s_6k(T, SA, buckets, d, scan_start, scan_end - scan_start); + } +#if defined(_OPENMP) + else + { + fast_sint_t block_start, block_end; + for (block_start = scan_end - 1; block_start >= scan_start; block_start = block_end) + { + block_end = block_start - (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end < scan_start) { block_end = scan_start - 1; } + + d = libsais_partial_sorting_scan_right_to_left_32s_6k_block_omp(T, SA, buckets, d, thread_state[0].state.cache, block_end + 1, block_start - block_end, threads); + } + } +#else + UNUSED(thread_state); +#endif + + return d; +} + +static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_4k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + if (threads == 1 || n < 65536) + { + d = libsais_partial_sorting_scan_right_to_left_32s_4k(T, SA, k, buckets, d, 0, n); + } +#if defined(_OPENMP) + else + { + fast_sint_t block_start, block_end; + for (block_start = (fast_sint_t)n - 1; block_start >= 0; block_start = block_end) + { + block_end = block_start - (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end < 0) { block_end = -1; } + + d = libsais_partial_sorting_scan_right_to_left_32s_4k_block_omp(T, SA, k, buckets, d, thread_state[0].state.cache, block_end + 1, block_start - block_end, threads); + } + } +#else + UNUSED(thread_state); +#endif + + return d; +} + +static void libsais_partial_sorting_scan_right_to_left_32s_1k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + if (threads == 1 || n < 65536) + { + libsais_partial_sorting_scan_right_to_left_32s_1k(T, SA, buckets, 0, n); + } +#if defined(_OPENMP) + else + { + fast_sint_t block_start, block_end; + for (block_start = (fast_sint_t)n - 1; block_start >= 0; block_start = block_end) + { + block_end = block_start - (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end < 0) { block_end = -1; } + + libsais_partial_sorting_scan_right_to_left_32s_1k_block_omp(T, SA, buckets, thread_state[0].state.cache, block_end + 
1, block_start - block_end, threads); + } + } +#else + UNUSED(thread_state); +#endif +} + +static fast_sint_t libsais_partial_sorting_gather_lms_suffixes_32s_4k(sa_sint_t * RESTRICT SA, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j, l; + for (i = omp_block_start, j = omp_block_start + omp_block_size - 3, l = omp_block_start; i < j; i += 4) + { + libsais_prefetch(&SA[i + prefetch_distance]); + + sa_sint_t s0 = SA[i + 0]; SA[l] = (s0 - SUFFIX_GROUP_MARKER) & (~SUFFIX_GROUP_MARKER); l += (s0 < 0); + sa_sint_t s1 = SA[i + 1]; SA[l] = (s1 - SUFFIX_GROUP_MARKER) & (~SUFFIX_GROUP_MARKER); l += (s1 < 0); + sa_sint_t s2 = SA[i + 2]; SA[l] = (s2 - SUFFIX_GROUP_MARKER) & (~SUFFIX_GROUP_MARKER); l += (s2 < 0); + sa_sint_t s3 = SA[i + 3]; SA[l] = (s3 - SUFFIX_GROUP_MARKER) & (~SUFFIX_GROUP_MARKER); l += (s3 < 0); + } + + for (j += 3; i < j; i += 1) + { + sa_sint_t s = SA[i]; SA[l] = (s - SUFFIX_GROUP_MARKER) & (~SUFFIX_GROUP_MARKER); l += (s < 0); + } + + return l; +} + +static fast_sint_t libsais_partial_sorting_gather_lms_suffixes_32s_1k(sa_sint_t * RESTRICT SA, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j, l; + for (i = omp_block_start, j = omp_block_start + omp_block_size - 3, l = omp_block_start; i < j; i += 4) + { + libsais_prefetch(&SA[i + prefetch_distance]); + + sa_sint_t s0 = SA[i + 0]; SA[l] = s0 & SAINT_MAX; l += (s0 < 0); + sa_sint_t s1 = SA[i + 1]; SA[l] = s1 & SAINT_MAX; l += (s1 < 0); + sa_sint_t s2 = SA[i + 2]; SA[l] = s2 & SAINT_MAX; l += (s2 < 0); + sa_sint_t s3 = SA[i + 3]; SA[l] = s3 & SAINT_MAX; l += (s3 < 0); + } + + for (j += 3; i < j; i += 1) + { + sa_sint_t s = SA[i]; SA[l] = s & SAINT_MAX; l += (s < 0); + } + + return l; +} + +static void libsais_partial_sorting_gather_lms_suffixes_32s_4k_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(thread_state); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : n - omp_block_start; + + if (omp_num_threads == 1) + { + libsais_partial_sorting_gather_lms_suffixes_32s_4k(SA, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + thread_state[omp_thread_num].state.position = omp_block_start; + thread_state[omp_thread_num].state.count = libsais_partial_sorting_gather_lms_suffixes_32s_4k(SA, omp_block_start, omp_block_size) - omp_block_start; + } + + #pragma omp barrier + + #pragma omp master + { + fast_sint_t t, position = 0; + for (t = 0; t < omp_num_threads; ++t) + { + if (t > 0 && thread_state[t].state.count > 0) + { + memmove(&SA[position], &SA[thread_state[t].state.position], (size_t)thread_state[t].state.count * sizeof(sa_sint_t)); + } + + position += thread_state[t].state.count; + } + } + } +#endif + } +} + +static void libsais_partial_sorting_gather_lms_suffixes_32s_1k_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(thread_state); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start; + + if (omp_num_threads == 1) + { + libsais_partial_sorting_gather_lms_suffixes_32s_1k(SA, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + thread_state[omp_thread_num].state.position = omp_block_start; + thread_state[omp_thread_num].state.count = libsais_partial_sorting_gather_lms_suffixes_32s_1k(SA, omp_block_start, omp_block_size) - omp_block_start; + } + + #pragma omp barrier + + #pragma omp master + { + fast_sint_t t, position = 0; + for (t = 0; t < omp_num_threads; ++t) + { + if (t > 0 && thread_state[t].state.count > 0) + { + memmove(&SA[position], &SA[thread_state[t].state.position], (size_t)thread_state[t].state.count * sizeof(sa_sint_t)); + } + + position += thread_state[t].state.count; + } + } + } +#endif + } +} + +static void libsais_induce_partial_order_8u_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + memset(&buckets[2 * ALPHABET_SIZE], 0, 2 * ALPHABET_SIZE * sizeof(sa_sint_t)); + + sa_sint_t d = libsais_partial_sorting_scan_left_to_right_8u_omp(T, SA, n, buckets, left_suffixes_count, 0, threads, thread_state); + libsais_partial_sorting_shift_markers_8u_omp(SA, n, buckets, threads); + libsais_partial_sorting_scan_right_to_left_8u_omp(T, SA, n, buckets, first_lms_suffix, left_suffixes_count, d, threads, thread_state); +} + +static void libsais_induce_partial_order_32s_6k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + sa_sint_t d = libsais_partial_sorting_scan_left_to_right_32s_6k_omp(T, SA, n, buckets, left_suffixes_count, 0, threads, thread_state); + libsais_partial_sorting_shift_markers_32s_6k_omp(SA, k, buckets, threads); 
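+ /* Copy the temporary per-symbol bucket positions stored at &buckets[4 * k] into the main 4-entries-per-symbol bucket layout used by the right-to-left pass below. */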
+ libsais_partial_sorting_shift_buckets_32s_6k(k, buckets); + libsais_partial_sorting_scan_right_to_left_32s_6k_omp(T, SA, n, buckets, first_lms_suffix, left_suffixes_count, d, threads, thread_state); +} + +static void libsais_induce_partial_order_32s_4k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + memset(buckets, 0, 2 * (size_t)k * sizeof(sa_sint_t)); + + sa_sint_t d = libsais_partial_sorting_scan_left_to_right_32s_4k_omp(T, SA, n, k, buckets, 0, threads, thread_state); + libsais_partial_sorting_shift_markers_32s_4k(SA, n); + libsais_partial_sorting_scan_right_to_left_32s_4k_omp(T, SA, n, k, buckets, d, threads, thread_state); + libsais_partial_sorting_gather_lms_suffixes_32s_4k_omp(SA, n, threads, thread_state); +} + +static void libsais_induce_partial_order_32s_2k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + libsais_partial_sorting_scan_left_to_right_32s_1k_omp(T, SA, n, &buckets[1 * k], threads, thread_state); + libsais_partial_sorting_scan_right_to_left_32s_1k_omp(T, SA, n, &buckets[0 * k], threads, thread_state); + libsais_partial_sorting_gather_lms_suffixes_32s_1k_omp(SA, n, threads, thread_state); +} + +static void libsais_induce_partial_order_32s_1k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + libsais_count_suffixes_32s(T, n, k, buckets); + libsais_initialize_buckets_start_32s_1k(k, buckets); + libsais_partial_sorting_scan_left_to_right_32s_1k_omp(T, SA, n, buckets, threads, thread_state); + + libsais_count_suffixes_32s(T, n, k, buckets); + libsais_initialize_buckets_end_32s_1k(k, buckets); + libsais_partial_sorting_scan_right_to_left_32s_1k_omp(T, SA, n, buckets, threads, thread_state); + + libsais_partial_sorting_gather_lms_suffixes_32s_1k_omp(SA, n, threads, thread_state); +} + +static sa_sint_t libsais_renumber_lms_suffixes_8u(sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t name, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + sa_sint_t * RESTRICT SAm = &SA[m]; + + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4) + { + libsais_prefetch(&SA[i + 2 * prefetch_distance]); + + libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 0] & SAINT_MAX) >> 1]); + libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 1] & SAINT_MAX) >> 1]); + libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 2] & SAINT_MAX) >> 1]); + libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 3] & SAINT_MAX) >> 1]); + + sa_sint_t p0 = SA[i + 0]; SAm[(p0 & SAINT_MAX) >> 1] = name | SAINT_MIN; name += p0 < 0; + sa_sint_t p1 = SA[i + 1]; SAm[(p1 & SAINT_MAX) >> 1] = name | SAINT_MIN; name += p1 < 0; + sa_sint_t p2 = SA[i + 2]; SAm[(p2 & SAINT_MAX) >> 1] = name | SAINT_MIN; name += p2 < 0; + sa_sint_t p3 = SA[i + 3]; SAm[(p3 & SAINT_MAX) >> 1] = name | SAINT_MIN; name += p3 < 0; + } + + for (j += prefetch_distance + 3; i < j; i += 1) + { + sa_sint_t p = SA[i]; SAm[(p & SAINT_MAX) >> 1] = name | SAINT_MIN; name += p < 0; + } + + return name; +} + +static fast_sint_t libsais_gather_marked_suffixes_8u(sa_sint_t * RESTRICT SA, sa_sint_t m, fast_sint_t l, fast_sint_t omp_block_start, 
fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + l -= 1; + + fast_sint_t i, j; + for (i = (fast_sint_t)m + omp_block_start + omp_block_size - 1, j = (fast_sint_t)m + omp_block_start + 3; i >= j; i -= 4) + { + libsais_prefetch(&SA[i - prefetch_distance]); + + sa_sint_t s0 = SA[i - 0]; SA[l] = s0 & SAINT_MAX; l -= s0 < 0; + sa_sint_t s1 = SA[i - 1]; SA[l] = s1 & SAINT_MAX; l -= s1 < 0; + sa_sint_t s2 = SA[i - 2]; SA[l] = s2 & SAINT_MAX; l -= s2 < 0; + sa_sint_t s3 = SA[i - 3]; SA[l] = s3 & SAINT_MAX; l -= s3 < 0; + } + + for (j -= 3; i >= j; i -= 1) + { + sa_sint_t s = SA[i]; SA[l] = s & SAINT_MAX; l -= s < 0; + } + + l += 1; + + return l; +} + +static sa_sint_t libsais_renumber_lms_suffixes_8u_omp(sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + sa_sint_t name = 0; + +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && m >= 65536) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(thread_state); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (m / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : m - omp_block_start; + + if (omp_num_threads == 1) + { + name = libsais_renumber_lms_suffixes_8u(SA, m, 0, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + thread_state[omp_thread_num].state.count = libsais_count_negative_marked_suffixes(SA, omp_block_start, omp_block_size); + } + + #pragma omp barrier + + { + fast_sint_t t, count = 0; for (t = 0; t < omp_thread_num; ++t) { count += thread_state[t].state.count; } + + if (omp_thread_num == omp_num_threads - 1) + { + name = (sa_sint_t)(count + thread_state[omp_thread_num].state.count); + } + + libsais_renumber_lms_suffixes_8u(SA, m, (sa_sint_t)count, omp_block_start, omp_block_size); + } + } +#endif + } + + return name; +} + +static void libsais_gather_marked_lms_suffixes_8u_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 131072) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(thread_state); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (((fast_sint_t)n >> 1) / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : ((fast_sint_t)n >> 1) - omp_block_start; + + if (omp_num_threads == 1) + { + libsais_gather_marked_suffixes_8u(SA, m, (fast_sint_t)n + (fast_sint_t)fs, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + if (omp_thread_num < omp_num_threads - 1) + { + thread_state[omp_thread_num].state.position = libsais_gather_marked_suffixes_8u(SA, m, (fast_sint_t)m + omp_block_start + omp_block_size, omp_block_start, omp_block_size); + thread_state[omp_thread_num].state.count = (fast_sint_t)m + omp_block_start + omp_block_size - thread_state[omp_thread_num].state.position; + } + else + { + thread_state[omp_thread_num].state.position = libsais_gather_marked_suffixes_8u(SA, m, (fast_sint_t)n + (fast_sint_t)fs, omp_block_start, omp_block_size); + thread_state[omp_thread_num].state.count = (fast_sint_t)n + (fast_sint_t)fs - thread_state[omp_thread_num].state.position; + } + } + + #pragma omp barrier + + #pragma omp master + { + fast_sint_t t, position = (fast_sint_t)n + (fast_sint_t)fs; + + for (t = omp_num_threads - 1; t >= 0; --t) + { + position -= thread_state[t].state.count; + if (t != omp_num_threads - 1 && thread_state[t].state.count > 0) + { + memmove(&SA[position], &SA[thread_state[t].state.position], (size_t)thread_state[t].state.count * sizeof(sa_sint_t)); + } + } + } + } +#endif + } +} + +static sa_sint_t libsais_renumber_and_gather_lms_suffixes_8u_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + memset(&SA[m], 0, ((size_t)n >> 1) * sizeof(sa_sint_t)); + + sa_sint_t name = libsais_renumber_lms_suffixes_8u_omp(SA, m, threads, thread_state); + if (name < m) + { + libsais_gather_marked_lms_suffixes_8u_omp(SA, n, m, fs, threads, thread_state); + } + else + { + fast_sint_t i; for (i = 0; i < m; i += 1) { SA[i] &= SAINT_MAX; } + } + + return name; +} + +static sa_sint_t libsais_renumber_distinct_lms_suffixes_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t name, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + sa_sint_t * RESTRICT SAm = &SA[m]; + + fast_sint_t i, j; sa_sint_t p0, p1, p2, p3 = 0; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4) + { + libsais_prefetchw(&SA[i + 2 * prefetch_distance]); + + libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 0] & SAINT_MAX) >> 1]); + libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 1] & SAINT_MAX) >> 1]); + libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 2] & SAINT_MAX) >> 1]); + libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 3] & SAINT_MAX) >> 1]); + + p0 = SA[i + 0]; SAm[(SA[i + 0] = p0 & SAINT_MAX) >> 1] = name | (p0 & p3 & SAINT_MIN); name += p0 < 0; + p1 = SA[i + 1]; SAm[(SA[i + 1] = p1 & SAINT_MAX) >> 1] = name | (p1 & p0 & SAINT_MIN); name += p1 < 0; + p2 = SA[i + 2]; SAm[(SA[i + 2] = p2 & SAINT_MAX) >> 1] = name | (p2 & p1 & SAINT_MIN); name += p2 < 0; + p3 = SA[i + 3]; SAm[(SA[i + 3] = p3 & SAINT_MAX) >> 1] = name | (p3 & p2 & SAINT_MIN); name += p3 < 0; + } + + for (j += prefetch_distance + 3; i < j; i += 1) + { + p2 = p3; p3 = SA[i]; SAm[(SA[i] = p3 & SAINT_MAX) >> 1] = name | (p3 & p2 & SAINT_MIN); name += p3 < 0; + } + + return name; +} + +static void libsais_mark_distinct_lms_suffixes_32s(sa_sint_t * RESTRICT SA, sa_sint_t m, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; sa_sint_t p0, p1, p2, p3 = 0; + for 
(i = (fast_sint_t)m + omp_block_start, j = (fast_sint_t)m + omp_block_start + omp_block_size - 3; i < j; i += 4) + { + libsais_prefetchw(&SA[i + prefetch_distance]); + + p0 = SA[i + 0]; SA[i + 0] = p0 & (p3 | SAINT_MAX); p0 = (p0 == 0) ? p3 : p0; + p1 = SA[i + 1]; SA[i + 1] = p1 & (p0 | SAINT_MAX); p1 = (p1 == 0) ? p0 : p1; + p2 = SA[i + 2]; SA[i + 2] = p2 & (p1 | SAINT_MAX); p2 = (p2 == 0) ? p1 : p2; + p3 = SA[i + 3]; SA[i + 3] = p3 & (p2 | SAINT_MAX); p3 = (p3 == 0) ? p2 : p3; + } + + for (j += 3; i < j; i += 1) + { + p2 = p3; p3 = SA[i]; SA[i] = p3 & (p2 | SAINT_MAX); p3 = (p3 == 0) ? p2 : p3; + } +} + +static void libsais_clamp_lms_suffixes_length_32s(sa_sint_t * RESTRICT SA, sa_sint_t m, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + sa_sint_t * RESTRICT SAm = &SA[m]; + + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size - 3; i < j; i += 4) + { + libsais_prefetchw(&SAm[i + prefetch_distance]); + + SAm[i + 0] = (SAm[i + 0] < 0 ? SAm[i + 0] : 0) & SAINT_MAX; + SAm[i + 1] = (SAm[i + 1] < 0 ? SAm[i + 1] : 0) & SAINT_MAX; + SAm[i + 2] = (SAm[i + 2] < 0 ? SAm[i + 2] : 0) & SAINT_MAX; + SAm[i + 3] = (SAm[i + 3] < 0 ? SAm[i + 3] : 0) & SAINT_MAX; + } + + for (j += 3; i < j; i += 1) + { + SAm[i] = (SAm[i] < 0 ? SAm[i] : 0) & SAINT_MAX; + } +} + +static sa_sint_t libsais_renumber_distinct_lms_suffixes_32s_4k_omp(sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + sa_sint_t name = 0; + +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && m >= 65536) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(thread_state); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (m / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : m - omp_block_start; + + if (omp_num_threads == 1) + { + name = libsais_renumber_distinct_lms_suffixes_32s_4k(SA, m, 1, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + thread_state[omp_thread_num].state.count = libsais_count_negative_marked_suffixes(SA, omp_block_start, omp_block_size); + } + + #pragma omp barrier + + { + fast_sint_t t, count = 1; for (t = 0; t < omp_thread_num; ++t) { count += thread_state[t].state.count; } + + if (omp_thread_num == omp_num_threads - 1) + { + name = (sa_sint_t)(count + thread_state[omp_thread_num].state.count); + } + + libsais_renumber_distinct_lms_suffixes_32s_4k(SA, m, (sa_sint_t)count, omp_block_start, omp_block_size); + } + } +#endif + } + + return name - 1; +} + +static void libsais_mark_distinct_lms_suffixes_32s_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t threads) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 131072) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); + fast_sint_t omp_block_stride = (((fast_sint_t)n >> 1) / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : ((fast_sint_t)n >> 1) - omp_block_start; +#else + UNUSED(threads); + + fast_sint_t omp_block_start = 0; + fast_sint_t omp_block_size = (fast_sint_t)n >> 1; +#endif + libsais_mark_distinct_lms_suffixes_32s(SA, m, omp_block_start, omp_block_size); + } +} + +static void libsais_clamp_lms_suffixes_length_32s_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t threads) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 131072) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); + fast_sint_t omp_block_stride = (((fast_sint_t)n >> 1) / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : ((fast_sint_t)n >> 1) - omp_block_start; +#else + UNUSED(threads); + + fast_sint_t omp_block_start = 0; + fast_sint_t omp_block_size = (fast_sint_t)n >> 1; +#endif + libsais_clamp_lms_suffixes_length_32s(SA, m, omp_block_start, omp_block_size); + } +} + +static sa_sint_t libsais_renumber_and_mark_distinct_lms_suffixes_32s_4k_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + memset(&SA[m], 0, ((size_t)n >> 1) * sizeof(sa_sint_t)); + + sa_sint_t name = libsais_renumber_distinct_lms_suffixes_32s_4k_omp(SA, m, threads, thread_state); + if (name < m) + { + libsais_mark_distinct_lms_suffixes_32s_omp(SA, n, m, threads); + } + + return name; +} + +static sa_sint_t libsais_renumber_and_mark_distinct_lms_suffixes_32s_1k_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t threads) +{ + const fast_sint_t prefetch_distance = 32; + + sa_sint_t * RESTRICT SAm = &SA[m]; + + { + libsais_gather_lms_suffixes_32s(T, SA, n); + + memset(&SA[m], 0, ((size_t)n - (size_t)m - (size_t)m) * sizeof(sa_sint_t)); + + fast_sint_t i, j; + for (i = (fast_sint_t)n - (fast_sint_t)m, j = (fast_sint_t)n - 1 - prefetch_distance - 3; i < j; i += 4) + { + libsais_prefetch(&SA[i + 2 * prefetch_distance]); + + libsais_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 0]) >> 1]); + libsais_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 1]) >> 1]); + libsais_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 2]) >> 1]); + libsais_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 3]) >> 1]); + + SAm[((sa_uint_t)SA[i + 0]) >> 1] = SA[i + 1] - SA[i + 0] + 1 + SAINT_MIN; + SAm[((sa_uint_t)SA[i + 1]) >> 1] = SA[i + 2] - SA[i + 1] + 1 + SAINT_MIN; + SAm[((sa_uint_t)SA[i + 2]) >> 1] = SA[i + 3] - SA[i + 2] + 1 + SAINT_MIN; + SAm[((sa_uint_t)SA[i + 3]) >> 1] = SA[i + 4] - SA[i + 3] + 1 + SAINT_MIN; + } + + for (j += prefetch_distance + 3; i < j; i += 1) + { + SAm[((sa_uint_t)SA[i]) >> 1] = SA[i + 1] - SA[i] + 1 + SAINT_MIN; + } + + SAm[((sa_uint_t)SA[n - 1]) >> 1] = 1 + SAINT_MIN; + } + + { + libsais_clamp_lms_suffixes_length_32s_omp(SA, n, m, threads); + } + + sa_sint_t name = 1; + + { + fast_sint_t i, j, p = SA[0], plen = SAm[p >> 1]; sa_sint_t pdiff = SAINT_MIN; + for (i = 1, j = m - prefetch_distance - 1; i < j; i += 2) + { + libsais_prefetch(&SA[i + 2 * prefetch_distance]); + + libsais_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 0]) >> 1]); libsais_prefetch(&T[((sa_uint_t)SA[i + prefetch_distance + 0])]); + libsais_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 1]) >> 1]); libsais_prefetch(&T[((sa_uint_t)SA[i + prefetch_distance 
+ 1])]); + + fast_sint_t q = SA[i + 0], qlen = SAm[q >> 1]; sa_sint_t qdiff = SAINT_MIN; + if (plen == qlen) { fast_sint_t l = 0; do { if (T[p + l] != T[q + l]) { break; } } while (++l < qlen); qdiff = (sa_sint_t)(l - qlen) & SAINT_MIN; } + SAm[p >> 1] = name | (pdiff & qdiff); name += (qdiff < 0); + + p = SA[i + 1]; plen = SAm[p >> 1]; pdiff = SAINT_MIN; + if (qlen == plen) { fast_sint_t l = 0; do { if (T[q + l] != T[p + l]) { break; } } while (++l < plen); pdiff = (sa_sint_t)(l - plen) & SAINT_MIN; } + SAm[q >> 1] = name | (qdiff & pdiff); name += (pdiff < 0); + } + + for (j += prefetch_distance + 1; i < j; i += 1) + { + fast_sint_t q = SA[i], qlen = SAm[q >> 1]; sa_sint_t qdiff = SAINT_MIN; + if (plen == qlen) { fast_sint_t l = 0; do { if (T[p + l] != T[q + l]) { break; } } while (++l < plen); qdiff = (sa_sint_t)(l - plen) & SAINT_MIN; } + SAm[p >> 1] = name | (pdiff & qdiff); name += (qdiff < 0); + + p = q; plen = qlen; pdiff = qdiff; + } + + SAm[p >> 1] = name | pdiff; name++; + } + + if (name <= m) + { + libsais_mark_distinct_lms_suffixes_32s_omp(SA, n, m, threads); + } + + return name - 1; +} + +static void libsais_reconstruct_lms_suffixes(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + const sa_sint_t * RESTRICT SAnm = &SA[n - m]; + + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4) + { + libsais_prefetchw(&SA[i + 2 * prefetch_distance]); + + libsais_prefetch(&SAnm[SA[i + prefetch_distance + 0]]); + libsais_prefetch(&SAnm[SA[i + prefetch_distance + 1]]); + libsais_prefetch(&SAnm[SA[i + prefetch_distance + 2]]); + libsais_prefetch(&SAnm[SA[i + prefetch_distance + 3]]); + + SA[i + 0] = SAnm[SA[i + 0]]; + SA[i + 1] = SAnm[SA[i + 1]]; + SA[i + 2] = SAnm[SA[i + 2]]; + SA[i + 3] = SAnm[SA[i + 3]]; + } + + for (j += prefetch_distance + 3; i < j; i += 1) + { + SA[i] = SAnm[SA[i]]; + } +} + +static void libsais_reconstruct_lms_suffixes_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t threads) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && m >= 65536) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); + fast_sint_t omp_block_stride = (m / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : m - omp_block_start; +#else + UNUSED(threads); + + fast_sint_t omp_block_start = 0; + fast_sint_t omp_block_size = m; +#endif + + libsais_reconstruct_lms_suffixes(SA, n, m, omp_block_start, omp_block_size); + } +} + +static void libsais_place_lms_suffixes_interval_8u(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, const sa_sint_t * RESTRICT buckets) +{ + const sa_sint_t * RESTRICT bucket_end = &buckets[7 * ALPHABET_SIZE]; + + fast_sint_t c, j = n; + for (c = ALPHABET_SIZE - 2; c >= 0; --c) + { + fast_sint_t l = (fast_sint_t)buckets[BUCKETS_INDEX2(c, 1) + BUCKETS_INDEX2(1, 0)] - (fast_sint_t)buckets[BUCKETS_INDEX2(c, 1)]; + if (l > 0) + { + fast_sint_t i = bucket_end[c]; + if (j - i > 0) + { + memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t)); + } + + memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t)); + } + } + + memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t)); +} + +static void libsais_place_lms_suffixes_interval_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m, const sa_sint_t * RESTRICT buckets) +{ + const sa_sint_t * RESTRICT bucket_end = &buckets[3 * k]; + + fast_sint_t c, j = n; + for (c = (fast_sint_t)k - 2; c >= 0; --c) + { + fast_sint_t l = (fast_sint_t)buckets[BUCKETS_INDEX2(c, 1) + BUCKETS_INDEX2(1, 0)] - (fast_sint_t)buckets[BUCKETS_INDEX2(c, 1)]; + if (l > 0) + { + fast_sint_t i = bucket_end[c]; + if (j - i > 0) + { + memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t)); + } + + memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t)); + } + } + + memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t)); +} + +static void libsais_place_lms_suffixes_interval_32s_2k(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m, const sa_sint_t * RESTRICT buckets) +{ + fast_sint_t j = n; + + if (k > 1) + { + fast_sint_t c; + for (c = BUCKETS_INDEX2((fast_sint_t)k - 2, 0); c >= BUCKETS_INDEX2(0, 0); c -= BUCKETS_INDEX2(1, 0)) + { + fast_sint_t l = (fast_sint_t)buckets[c + BUCKETS_INDEX2(1, 1)] - (fast_sint_t)buckets[c + BUCKETS_INDEX2(0, 1)]; + if (l > 0) + { + fast_sint_t i = buckets[c]; + if (j - i > 0) + { + memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t)); + } + + memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t)); + } + } + } + + memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t)); +} + +static void libsais_place_lms_suffixes_interval_32s_1k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t m, sa_sint_t * RESTRICT buckets) +{ + const fast_sint_t prefetch_distance = 32; + + sa_sint_t c = k - 1; fast_sint_t i, l = buckets[c]; + for (i = (fast_sint_t)m - 1; i >= prefetch_distance + 3; i -= 4) + { + libsais_prefetch(&SA[i - 2 * prefetch_distance]); + + libsais_prefetch(&T[SA[i - prefetch_distance - 0]]); + libsais_prefetch(&T[SA[i - prefetch_distance - 1]]); + libsais_prefetch(&T[SA[i - prefetch_distance - 2]]); + libsais_prefetch(&T[SA[i - prefetch_distance - 3]]); + + sa_sint_t p0 = SA[i - 0]; if (T[p0] != c) { c = T[p0]; memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(sa_sint_t)); l = buckets[c]; } SA[--l] = p0; + sa_sint_t p1 = SA[i - 1]; if (T[p1] != c) { c = T[p1]; memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(sa_sint_t)); l = buckets[c]; } SA[--l] = p1; + sa_sint_t p2 = SA[i - 2]; if (T[p2] != c) { c = T[p2]; memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(sa_sint_t)); l = buckets[c]; } SA[--l] = p2; + sa_sint_t p3 = SA[i - 3]; if (T[p3] != c) { c = T[p3]; 
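+    /* (added comment, not in upstream libsais) T[p3] starts a different bucket:
+       zero the slots between the new bucket boundary buckets[c] and the previous
+       write cursor l, then restart the cursor at buckets[c] before placing the
+       suffix */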
memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(sa_sint_t)); l = buckets[c]; } SA[--l] = p3; + } + + for (; i >= 0; i -= 1) + { + sa_sint_t p = SA[i]; if (T[p] != c) { c = T[p]; memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(sa_sint_t)); l = buckets[c]; } SA[--l] = p; + } + + memset(&SA[0], 0, (size_t)l * sizeof(sa_sint_t)); +} + +static void libsais_place_lms_suffixes_histogram_32s_6k(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m, const sa_sint_t * RESTRICT buckets) +{ + const sa_sint_t * RESTRICT bucket_end = &buckets[5 * k]; + + fast_sint_t c, j = n; + for (c = (fast_sint_t)k - 2; c >= 0; --c) + { + fast_sint_t l = (fast_sint_t)buckets[BUCKETS_INDEX4(c, 1)]; + if (l > 0) + { + fast_sint_t i = bucket_end[c]; + if (j - i > 0) + { + memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t)); + } + + memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t)); + } + } + + memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t)); +} + +static void libsais_place_lms_suffixes_histogram_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m, const sa_sint_t * RESTRICT buckets) +{ + const sa_sint_t * RESTRICT bucket_end = &buckets[3 * k]; + + fast_sint_t c, j = n; + for (c = (fast_sint_t)k - 2; c >= 0; --c) + { + fast_sint_t l = (fast_sint_t)buckets[BUCKETS_INDEX2(c, 1)]; + if (l > 0) + { + fast_sint_t i = bucket_end[c]; + if (j - i > 0) + { + memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t)); + } + + memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t)); + } + } + + memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t)); +} + +static void libsais_place_lms_suffixes_histogram_32s_2k(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m, const sa_sint_t * RESTRICT buckets) +{ + fast_sint_t j = n; + + if (k > 1) + { + fast_sint_t c; + for (c = BUCKETS_INDEX2((fast_sint_t)k - 2, 0); c >= BUCKETS_INDEX2(0, 0); c -= BUCKETS_INDEX2(1, 0)) + { + fast_sint_t l = (fast_sint_t)buckets[c + BUCKETS_INDEX2(0, 1)]; + if (l > 0) + { + fast_sint_t i = buckets[c]; + if (j - i > 0) + { + memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t)); + } + + memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t)); + } + } + } + + memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t)); +} + +static void libsais_final_bwt_scan_left_to_right_8u(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) + { + libsais_prefetchw(&SA[i + 2 * prefetch_distance]); + + sa_sint_t s0 = SA[i + prefetch_distance + 0]; const uint8_t * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i + prefetch_distance + 1]; const uint8_t * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? 
Ts1 : NULL); + + sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; SA[i + 0] = T[p0] | SAINT_MIN; SA[induction_bucket[T[p0]]++] = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1)); } + sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; SA[i + 1] = T[p1] | SAINT_MIN; SA[induction_bucket[T[p1]]++] = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1)); } + } + + for (j += prefetch_distance + 1; i < j; i += 1) + { + sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; SA[i] = T[p] | SAINT_MIN; SA[induction_bucket[T[p]]++] = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); } + } +} + +static void libsais_final_bwt_aux_scan_left_to_right_8u(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) + { + libsais_prefetchw(&SA[i + 2 * prefetch_distance]); + + sa_sint_t s0 = SA[i + prefetch_distance + 0]; const uint8_t * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i + prefetch_distance + 1]; const uint8_t * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? Ts1 : NULL); + + sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; SA[i + 0] = T[p0] | SAINT_MIN; SA[induction_bucket[T[p0]]++] = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1)); if ((p0 & rm) == 0) { I[p0 / (rm + 1)] = induction_bucket[T[p0]]; }} + sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; SA[i + 1] = T[p1] | SAINT_MIN; SA[induction_bucket[T[p1]]++] = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1)); if ((p1 & rm) == 0) { I[p1 / (rm + 1)] = induction_bucket[T[p1]]; }} + } + + for (j += prefetch_distance + 1; i < j; i += 1) + { + sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; SA[i] = T[p] | SAINT_MIN; SA[induction_bucket[T[p]]++] = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); if ((p & rm) == 0) { I[p / (rm + 1)] = induction_bucket[T[p]]; } } + } +} + +static void libsais_final_sorting_scan_left_to_right_8u(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) + { + libsais_prefetchw(&SA[i + 2 * prefetch_distance]); + + sa_sint_t s0 = SA[i + prefetch_distance + 0]; const uint8_t * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i + prefetch_distance + 1]; const uint8_t * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? 
Ts1 : NULL); + + sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 ^ SAINT_MIN; if (p0 > 0) { p0--; SA[induction_bucket[T[p0]]++] = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1)); } + sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 ^ SAINT_MIN; if (p1 > 0) { p1--; SA[induction_bucket[T[p1]]++] = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1)); } + } + + for (j += prefetch_distance + 1; i < j; i += 1) + { + sa_sint_t p = SA[i]; SA[i] = p ^ SAINT_MIN; if (p > 0) { p--; SA[induction_bucket[T[p]]++] = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); } + } +} + +static void libsais_final_sorting_scan_left_to_right_32s(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1; i < j; i += 2) + { + libsais_prefetchw(&SA[i + 3 * prefetch_distance]); + + sa_sint_t s0 = SA[i + 2 * prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i + 2 * prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); + sa_sint_t s2 = SA[i + 1 * prefetch_distance + 0]; if (s2 > 0) { libsais_prefetchw(&induction_bucket[T[s2 - 1]]); libsais_prefetch(&T[s2] - 2); } + sa_sint_t s3 = SA[i + 1 * prefetch_distance + 1]; if (s3 > 0) { libsais_prefetchw(&induction_bucket[T[s3 - 1]]); libsais_prefetch(&T[s3] - 2); } + + sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 ^ SAINT_MIN; if (p0 > 0) { p0--; SA[induction_bucket[T[p0]]++] = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1)); } + sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 ^ SAINT_MIN; if (p1 > 0) { p1--; SA[induction_bucket[T[p1]]++] = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1)); } + } + + for (j += 2 * prefetch_distance + 1; i < j; i += 1) + { + sa_sint_t p = SA[i]; SA[i] = p ^ SAINT_MIN; if (p > 0) { p--; SA[induction_bucket[T[p]]++] = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); } + } +} + +#if defined(_OPENMP) + +static fast_sint_t libsais_final_bwt_scan_left_to_right_8u_block_prepare(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + memset(buckets, 0, ALPHABET_SIZE * sizeof(sa_sint_t)); + + fast_sint_t i, j, count = 0; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) + { + libsais_prefetchw(&SA[i + 2 * prefetch_distance]); + + sa_sint_t s0 = SA[i + prefetch_distance + 0]; const uint8_t * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i + prefetch_distance + 1]; const uint8_t * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? 
Ts1 : NULL); + + sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; SA[i + 0] = T[p0] | SAINT_MIN; buckets[cache[count].symbol = T[p0]]++; cache[count++].index = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1)); } + sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; SA[i + 1] = T[p1] | SAINT_MIN; buckets[cache[count].symbol = T[p1]]++; cache[count++].index = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1)); } + } + + for (j += prefetch_distance + 1; i < j; i += 1) + { + sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; SA[i] = T[p] | SAINT_MIN; buckets[cache[count].symbol = T[p]]++; cache[count++].index = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); } + } + + return count; +} + +static fast_sint_t libsais_final_sorting_scan_left_to_right_8u_block_prepare(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + memset(buckets, 0, ALPHABET_SIZE * sizeof(sa_sint_t)); + + fast_sint_t i, j, count = 0; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) + { + libsais_prefetchw(&SA[i + 2 * prefetch_distance]); + + sa_sint_t s0 = SA[i + prefetch_distance + 0]; const uint8_t * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i + prefetch_distance + 1]; const uint8_t * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? Ts1 : NULL); + + sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 ^ SAINT_MIN; if (p0 > 0) { p0--; buckets[cache[count].symbol = T[p0]]++; cache[count++].index = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1)); } + sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 ^ SAINT_MIN; if (p1 > 0) { p1--; buckets[cache[count].symbol = T[p1]]++; cache[count++].index = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1)); } + } + + for (j += prefetch_distance + 1; i < j; i += 1) + { + sa_sint_t p = SA[i]; SA[i] = p ^ SAINT_MIN; if (p > 0) { p--; buckets[cache[count].symbol = T[p]]++; cache[count++].index = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); } + } + + return count; +} + +static void libsais_final_order_scan_left_to_right_8u_block_place(sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = 0, j = count - 3; i < j; i += 4) + { + libsais_prefetch(&cache[i + prefetch_distance]); + + SA[buckets[cache[i + 0].symbol]++] = cache[i + 0].index; + SA[buckets[cache[i + 1].symbol]++] = cache[i + 1].index; + SA[buckets[cache[i + 2].symbol]++] = cache[i + 2].index; + SA[buckets[cache[i + 3].symbol]++] = cache[i + 3].index; + } + + for (j += 3; i < j; i += 1) + { + SA[buckets[cache[i].symbol]++] = cache[i].index; + } +} + +static void libsais_final_bwt_aux_scan_left_to_right_8u_block_place(sa_sint_t * RESTRICT SA, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = 0, j = count - 3; i < j; i += 4) + { + libsais_prefetch(&cache[i + prefetch_distance]); + + SA[buckets[cache[i + 0].symbol]++] = cache[i + 0].index; if ((cache[i + 0].index & rm) == 0) { 
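+            /* (added comment, not in upstream libsais) this suffix position is a
+               multiple of the auxiliary-index sampling rate (rm is assumed to be
+               rate - 1, a power-of-two mask), so the current bucket offset is
+               recorded into I below */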
I[(cache[i + 0].index & SAINT_MAX) / (rm + 1)] = buckets[cache[i + 0].symbol]; } + SA[buckets[cache[i + 1].symbol]++] = cache[i + 1].index; if ((cache[i + 1].index & rm) == 0) { I[(cache[i + 1].index & SAINT_MAX) / (rm + 1)] = buckets[cache[i + 1].symbol]; } + SA[buckets[cache[i + 2].symbol]++] = cache[i + 2].index; if ((cache[i + 2].index & rm) == 0) { I[(cache[i + 2].index & SAINT_MAX) / (rm + 1)] = buckets[cache[i + 2].symbol]; } + SA[buckets[cache[i + 3].symbol]++] = cache[i + 3].index; if ((cache[i + 3].index & rm) == 0) { I[(cache[i + 3].index & SAINT_MAX) / (rm + 1)] = buckets[cache[i + 3].symbol]; } + } + + for (j += 3; i < j; i += 1) + { + SA[buckets[cache[i].symbol]++] = cache[i].index; if ((cache[i].index & rm) == 0) { I[(cache[i].index & SAINT_MAX) / (rm + 1)] = buckets[cache[i].symbol]; } + } +} + +static void libsais_final_sorting_scan_left_to_right_32s_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) + { + libsais_prefetchw(&SA[i + 2 * prefetch_distance]); + + sa_sint_t s0 = SA[i + prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i + prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? Ts1 : NULL); + + libsais_prefetchw(&cache[i + prefetch_distance]); + + sa_sint_t symbol0 = SAINT_MIN, p0 = SA[i + 0]; SA[i + 0] = p0 ^ SAINT_MIN; if (p0 > 0) { p0--; cache[i + 0].index = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1)); symbol0 = T[p0]; } cache[i + 0].symbol = symbol0; + sa_sint_t symbol1 = SAINT_MIN, p1 = SA[i + 1]; SA[i + 1] = p1 ^ SAINT_MIN; if (p1 > 0) { p1--; cache[i + 1].index = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1)); symbol1 = T[p1]; } cache[i + 1].symbol = symbol1; + } + + for (j += prefetch_distance + 1; i < j; i += 1) + { + sa_sint_t symbol = SAINT_MIN, p = SA[i]; SA[i] = p ^ SAINT_MIN; if (p > 0) { p--; cache[i].index = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); symbol = T[p]; } cache[i].symbol = symbol; + } +} + +static void libsais_final_sorting_scan_left_to_right_32s_block_sort(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j, omp_block_end = omp_block_start + omp_block_size; + for (i = omp_block_start, j = omp_block_end - prefetch_distance - 1; i < j; i += 2) + { + libsais_prefetchw(&cache[i + 2 * prefetch_distance]); + + sa_sint_t s0 = cache[i + prefetch_distance + 0].symbol; const sa_sint_t * Is0 = &induction_bucket[s0]; libsais_prefetchw(s0 >= 0 ? Is0 : NULL); + sa_sint_t s1 = cache[i + prefetch_distance + 1].symbol; const sa_sint_t * Is1 = &induction_bucket[s1]; libsais_prefetchw(s1 >= 0 ? 
Is1 : NULL); + + sa_sint_t v0 = cache[i + 0].symbol; + if (v0 >= 0) + { + cache[i + 0].symbol = induction_bucket[v0]++; + if (cache[i + 0].symbol < omp_block_end) { sa_sint_t ni = cache[i + 0].symbol, np = cache[i + 0].index; cache[i + 0].index = np ^ SAINT_MIN; if (np > 0) { np--; cache[ni].index = np | ((sa_sint_t)(T[np - (np > 0)] < T[np]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np]; } } + } + + sa_sint_t v1 = cache[i + 1].symbol; + if (v1 >= 0) + { + cache[i + 1].symbol = induction_bucket[v1]++; + if (cache[i + 1].symbol < omp_block_end) { sa_sint_t ni = cache[i + 1].symbol, np = cache[i + 1].index; cache[i + 1].index = np ^ SAINT_MIN; if (np > 0) { np--; cache[ni].index = np | ((sa_sint_t)(T[np - (np > 0)] < T[np]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np]; } } + } + } + + for (j += prefetch_distance + 1; i < j; i += 1) + { + sa_sint_t v = cache[i].symbol; + if (v >= 0) + { + cache[i].symbol = induction_bucket[v]++; + if (cache[i].symbol < omp_block_end) { sa_sint_t ni = cache[i].symbol, np = cache[i].index; cache[i].index = np ^ SAINT_MIN; if (np > 0) { np--; cache[ni].index = np | ((sa_sint_t)(T[np - (np > 0)] < T[np]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np]; } } + } + } +} + +static void libsais_final_bwt_scan_left_to_right_8u_block_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * ALPHABET_SIZE && omp_get_dynamic() == 0) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(thread_state); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : block_size - omp_block_start; + + omp_block_start += block_start; + + if (omp_num_threads == 1) + { + libsais_final_bwt_scan_left_to_right_8u(T, SA, induction_bucket, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + thread_state[omp_thread_num].state.count = libsais_final_bwt_scan_left_to_right_8u_block_prepare(T, SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size); + } + + #pragma omp barrier + + #pragma omp master + { + fast_sint_t t; + for (t = 0; t < omp_num_threads; ++t) + { + sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets; + fast_sint_t c; for (c = 0; c < ALPHABET_SIZE; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_bucket[c]; induction_bucket[c] = A + B; temp_bucket[c] = A; } + } + } + + #pragma omp barrier + + { + libsais_final_order_scan_left_to_right_8u_block_place(SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count); + } + } +#endif + } +} + +static void libsais_final_bwt_aux_scan_left_to_right_8u_block_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * ALPHABET_SIZE && omp_get_dynamic() == 0) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(thread_state); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : block_size - omp_block_start; + + omp_block_start += block_start; + + if (omp_num_threads == 1) + { + libsais_final_bwt_aux_scan_left_to_right_8u(T, SA, rm, I, induction_bucket, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + thread_state[omp_thread_num].state.count = libsais_final_bwt_scan_left_to_right_8u_block_prepare(T, SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size); + } + + #pragma omp barrier + + #pragma omp master + { + fast_sint_t t; + for (t = 0; t < omp_num_threads; ++t) + { + sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets; + fast_sint_t c; for (c = 0; c < ALPHABET_SIZE; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_bucket[c]; induction_bucket[c] = A + B; temp_bucket[c] = A; } + } + } + + #pragma omp barrier + + { + libsais_final_bwt_aux_scan_left_to_right_8u_block_place(SA, rm, I, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count); + } + } +#endif + } +} + +static void libsais_final_sorting_scan_left_to_right_8u_block_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * ALPHABET_SIZE && omp_get_dynamic() == 0) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(thread_state); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : block_size - omp_block_start; + + omp_block_start += block_start; + + if (omp_num_threads == 1) + { + libsais_final_sorting_scan_left_to_right_8u(T, SA, induction_bucket, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + thread_state[omp_thread_num].state.count = libsais_final_sorting_scan_left_to_right_8u_block_prepare(T, SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size); + } + + #pragma omp barrier + + #pragma omp master + { + fast_sint_t t; + for (t = 0; t < omp_num_threads; ++t) + { + sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets; + fast_sint_t c; for (c = 0; c < ALPHABET_SIZE; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_bucket[c]; induction_bucket[c] = A + B; temp_bucket[c] = A; } + } + } + + #pragma omp barrier + + { + libsais_final_order_scan_left_to_right_8u_block_place(SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count); + } + } +#endif + } +} + +static void libsais_final_sorting_scan_left_to_right_32s_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(cache); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : block_size - omp_block_start; + + omp_block_start += block_start; + + if (omp_num_threads == 1) + { + libsais_final_sorting_scan_left_to_right_32s(T, SA, buckets, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + libsais_final_sorting_scan_left_to_right_32s_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size); + } + + #pragma omp barrier + + #pragma omp master + { + libsais_final_sorting_scan_left_to_right_32s_block_sort(T, buckets, cache - block_start, block_start, block_size); + } + + #pragma omp barrier + + { + libsais_compact_and_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size); + } + } +#endif + } +} + +#endif + +static void libsais_final_bwt_scan_left_to_right_8u_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, fast_sint_t n, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + SA[induction_bucket[T[(sa_sint_t)n - 1]]++] = ((sa_sint_t)n - 1) | ((sa_sint_t)(T[(sa_sint_t)n - 2] < T[(sa_sint_t)n - 1]) << (SAINT_BIT - 1)); + + if (threads == 1 || n < 65536) + { + libsais_final_bwt_scan_left_to_right_8u(T, SA, induction_bucket, 0, n); + } +#if defined(_OPENMP) + else + { + fast_sint_t block_start; + for (block_start = 0; block_start < n; ) + { + if (SA[block_start] == 0) + { + block_start++; + } + else + { + fast_sint_t block_max_end = block_start + ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads); if (block_max_end > n) { block_max_end = n;} + fast_sint_t block_end = block_start + 1; while (block_end < block_max_end && SA[block_end] != 0) { block_end++; } + fast_sint_t block_size = block_end - block_start; + + if (block_size < 32) + { + for (; block_start < block_end; block_start += 1) + { + sa_sint_t p = SA[block_start]; SA[block_start] = p & SAINT_MAX; if (p > 0) { p--; SA[block_start] = T[p] | SAINT_MIN; SA[induction_bucket[T[p]]++] = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); } + } + } + else + { + libsais_final_bwt_scan_left_to_right_8u_block_omp(T, SA, induction_bucket, block_start, block_size, threads, thread_state); + block_start = block_end; + } + } + } + } +#else + UNUSED(thread_state); +#endif +} + +static void libsais_final_bwt_aux_scan_left_to_right_8u_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, fast_sint_t n, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + SA[induction_bucket[T[(sa_sint_t)n - 1]]++] = ((sa_sint_t)n - 1) | ((sa_sint_t)(T[(sa_sint_t)n - 2] < T[(sa_sint_t)n - 1]) << (SAINT_BIT - 1)); + + if ((((sa_sint_t)n - 1) & rm) == 0) { I[((sa_sint_t)n - 1) / (rm + 1)] = induction_bucket[T[(sa_sint_t)n - 1]]; } + + if (threads == 1 || n < 65536) + { + libsais_final_bwt_aux_scan_left_to_right_8u(T, SA, rm, I, induction_bucket, 0, n); + } +#if defined(_OPENMP) + else + { + fast_sint_t block_start; + for (block_start = 0; block_start < n; ) + { + if (SA[block_start] == 0) + { + block_start++; + } + else + { + fast_sint_t block_max_end = block_start + ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads); if (block_max_end > n) { block_max_end = n;} + fast_sint_t block_end = block_start + 1; while (block_end < block_max_end && SA[block_end] != 0) { block_end++; } + fast_sint_t block_size = block_end - block_start; + + if (block_size < 32) + { + for (; block_start < block_end; block_start += 1) + { + sa_sint_t p = 
SA[block_start]; SA[block_start] = p & SAINT_MAX; if (p > 0) { p--; SA[block_start] = T[p] | SAINT_MIN; SA[induction_bucket[T[p]]++] = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); if ((p & rm) == 0) { I[p / (rm + 1)] = induction_bucket[T[p]]; } } + } + } + else + { + libsais_final_bwt_aux_scan_left_to_right_8u_block_omp(T, SA, rm, I, induction_bucket, block_start, block_size, threads, thread_state); + block_start = block_end; + } + } + } + } +#else + UNUSED(thread_state); +#endif +} + +static void libsais_final_sorting_scan_left_to_right_8u_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, fast_sint_t n, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + SA[induction_bucket[T[(sa_sint_t)n - 1]]++] = ((sa_sint_t)n - 1) | ((sa_sint_t)(T[(sa_sint_t)n - 2] < T[(sa_sint_t)n - 1]) << (SAINT_BIT - 1)); + + if (threads == 1 || n < 65536) + { + libsais_final_sorting_scan_left_to_right_8u(T, SA, induction_bucket, 0, n); + } +#if defined(_OPENMP) + else + { + fast_sint_t block_start; + for (block_start = 0; block_start < n; ) + { + if (SA[block_start] == 0) + { + block_start++; + } + else + { + fast_sint_t block_max_end = block_start + ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads); if (block_max_end > n) { block_max_end = n;} + fast_sint_t block_end = block_start + 1; while (block_end < block_max_end && SA[block_end] != 0) { block_end++; } + fast_sint_t block_size = block_end - block_start; + + if (block_size < 32) + { + for (; block_start < block_end; block_start += 1) + { + sa_sint_t p = SA[block_start]; SA[block_start] = p ^ SAINT_MIN; if (p > 0) { p--; SA[induction_bucket[T[p]]++] = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); } + } + } + else + { + libsais_final_sorting_scan_left_to_right_8u_block_omp(T, SA, induction_bucket, block_start, block_size, threads, thread_state); + block_start = block_end; + } + } + } + } +#else + UNUSED(thread_state); +#endif +} + +static void libsais_final_sorting_scan_left_to_right_32s_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + SA[induction_bucket[T[n - 1]]++] = (n - 1) | ((sa_sint_t)(T[n - 2] < T[n - 1]) << (SAINT_BIT - 1)); + + if (threads == 1 || n < 65536) + { + libsais_final_sorting_scan_left_to_right_32s(T, SA, induction_bucket, 0, n); + } +#if defined(_OPENMP) + else + { + fast_sint_t block_start, block_end; + for (block_start = 0; block_start < n; block_start = block_end) + { + block_end = block_start + (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end > n) { block_end = n; } + + libsais_final_sorting_scan_left_to_right_32s_block_omp(T, SA, induction_bucket, thread_state[0].state.cache, block_start, block_end - block_start, threads); + } + } +#else + UNUSED(thread_state); +#endif +} + +static sa_sint_t libsais_final_bwt_scan_right_to_left_8u(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; sa_sint_t index = -1; + for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2) + { + libsais_prefetchw(&SA[i - 2 * prefetch_distance]); + + sa_sint_t s0 = SA[i - prefetch_distance - 0]; const uint8_t * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? 
Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i - prefetch_distance - 1]; const uint8_t * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? Ts1 : NULL); + + sa_sint_t p0 = SA[i - 0]; index = (p0 == 0) ? (sa_sint_t)(i - 0) : index; + SA[i - 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; uint8_t c0 = T[p0 - (p0 > 0)], c1 = T[p0]; SA[i - 0] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? p0 : t; } + + sa_sint_t p1 = SA[i - 1]; index = (p1 == 0) ? (sa_sint_t)(i - 1) : index; + SA[i - 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; uint8_t c0 = T[p1 - (p1 > 0)], c1 = T[p1]; SA[i - 1] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? p1 : t; } + } + + for (j -= prefetch_distance + 1; i >= j; i -= 1) + { + sa_sint_t p = SA[i]; index = (p == 0) ? (sa_sint_t)i : index; + SA[i] = p & SAINT_MAX; if (p > 0) { p--; uint8_t c0 = T[p - (p > 0)], c1 = T[p]; SA[i] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? p : t; } + } + + return index; +} + +static void libsais_final_bwt_aux_scan_right_to_left_8u(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2) + { + libsais_prefetchw(&SA[i - 2 * prefetch_distance]); + + sa_sint_t s0 = SA[i - prefetch_distance - 0]; const uint8_t * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i - prefetch_distance - 1]; const uint8_t * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? Ts1 : NULL); + + sa_sint_t p0 = SA[i - 0]; + SA[i - 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; uint8_t c0 = T[p0 - (p0 > 0)], c1 = T[p0]; SA[i - 0] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? p0 : t; if ((p0 & rm) == 0) { I[p0 / (rm + 1)] = induction_bucket[T[p0]] + 1; } } + + sa_sint_t p1 = SA[i - 1]; + SA[i - 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; uint8_t c0 = T[p1 - (p1 > 0)], c1 = T[p1]; SA[i - 1] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? p1 : t; if ((p1 & rm) == 0) { I[p1 / (rm + 1)] = induction_bucket[T[p1]] + 1; } } + } + + for (j -= prefetch_distance + 1; i >= j; i -= 1) + { + sa_sint_t p = SA[i]; + SA[i] = p & SAINT_MAX; if (p > 0) { p--; uint8_t c0 = T[p - (p > 0)], c1 = T[p]; SA[i] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? p : t; if ((p & rm) == 0) { I[p / (rm + 1)] = induction_bucket[T[p]] + 1; } } + } +} + +static void libsais_final_sorting_scan_right_to_left_8u(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2) + { + libsais_prefetchw(&SA[i - 2 * prefetch_distance]); + + sa_sint_t s0 = SA[i - prefetch_distance - 0]; const uint8_t * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i - prefetch_distance - 1]; const uint8_t * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? 
Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? Ts1 : NULL); + + sa_sint_t p0 = SA[i - 0]; SA[i - 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; SA[--induction_bucket[T[p0]]] = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] > T[p0]) << (SAINT_BIT - 1)); } + sa_sint_t p1 = SA[i - 1]; SA[i - 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; SA[--induction_bucket[T[p1]]] = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] > T[p1]) << (SAINT_BIT - 1)); } + } + + for (j -= prefetch_distance + 1; i >= j; i -= 1) + { + sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; SA[--induction_bucket[T[p]]] = p | ((sa_sint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1)); } + } +} + +static void libsais_final_sorting_scan_right_to_left_32s(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 1; i >= j; i -= 2) + { + libsais_prefetchw(&SA[i - 3 * prefetch_distance]); + + sa_sint_t s0 = SA[i - 2 * prefetch_distance - 0]; const sa_sint_t * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i - 2 * prefetch_distance - 1]; const sa_sint_t * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); + sa_sint_t s2 = SA[i - 1 * prefetch_distance - 0]; if (s2 > 0) { libsais_prefetchw(&induction_bucket[T[s2 - 1]]); libsais_prefetch(&T[s2] - 2); } + sa_sint_t s3 = SA[i - 1 * prefetch_distance - 1]; if (s3 > 0) { libsais_prefetchw(&induction_bucket[T[s3 - 1]]); libsais_prefetch(&T[s3] - 2); } + + sa_sint_t p0 = SA[i - 0]; SA[i - 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; SA[--induction_bucket[T[p0]]] = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] > T[p0]) << (SAINT_BIT - 1)); } + sa_sint_t p1 = SA[i - 1]; SA[i - 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; SA[--induction_bucket[T[p1]]] = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] > T[p1]) << (SAINT_BIT - 1)); } + } + + for (j -= 2 * prefetch_distance + 1; i >= j; i -= 1) + { + sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; SA[--induction_bucket[T[p]]] = p | ((sa_sint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1)); } + } +} + +#if defined(_OPENMP) + +static fast_sint_t libsais_final_bwt_scan_right_to_left_8u_block_prepare(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + memset(buckets, 0, ALPHABET_SIZE * sizeof(sa_sint_t)); + + fast_sint_t i, j, count = 0; + for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2) + { + libsais_prefetchw(&SA[i - 2 * prefetch_distance]); + + sa_sint_t s0 = SA[i - prefetch_distance - 0]; const uint8_t * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i - prefetch_distance - 1]; const uint8_t * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? Ts1 : NULL); + + sa_sint_t p0 = SA[i - 0]; SA[i - 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; uint8_t c0 = T[p0 - (p0 > 0)], c1 = T[p0]; SA[i - 0] = c1; sa_sint_t t = c0 | SAINT_MIN; buckets[cache[count].symbol = c1]++; cache[count++].index = (c0 <= c1) ? 
p0 : t; } + sa_sint_t p1 = SA[i - 1]; SA[i - 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; uint8_t c0 = T[p1 - (p1 > 0)], c1 = T[p1]; SA[i - 1] = c1; sa_sint_t t = c0 | SAINT_MIN; buckets[cache[count].symbol = c1]++; cache[count++].index = (c0 <= c1) ? p1 : t; } + } + + for (j -= prefetch_distance + 1; i >= j; i -= 1) + { + sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; uint8_t c0 = T[p - (p > 0)], c1 = T[p]; SA[i] = c1; sa_sint_t t = c0 | SAINT_MIN; buckets[cache[count].symbol = c1]++; cache[count++].index = (c0 <= c1) ? p : t; } + } + + return count; +} + +static fast_sint_t libsais_final_bwt_aux_scan_right_to_left_8u_block_prepare(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + memset(buckets, 0, ALPHABET_SIZE * sizeof(sa_sint_t)); + + fast_sint_t i, j, count = 0; + for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2) + { + libsais_prefetchw(&SA[i - 2 * prefetch_distance]); + + sa_sint_t s0 = SA[i - prefetch_distance - 0]; const uint8_t * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i - prefetch_distance - 1]; const uint8_t * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? Ts1 : NULL); + + sa_sint_t p0 = SA[i - 0]; SA[i - 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; uint8_t c0 = T[p0 - (p0 > 0)], c1 = T[p0]; SA[i - 0] = c1; sa_sint_t t = c0 | SAINT_MIN; buckets[cache[count].symbol = c1]++; cache[count].index = (c0 <= c1) ? p0 : t; cache[count + 1].index = p0; count += 2; } + sa_sint_t p1 = SA[i - 1]; SA[i - 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; uint8_t c0 = T[p1 - (p1 > 0)], c1 = T[p1]; SA[i - 1] = c1; sa_sint_t t = c0 | SAINT_MIN; buckets[cache[count].symbol = c1]++; cache[count].index = (c0 <= c1) ? p1 : t; cache[count + 1].index = p1; count += 2; } + } + + for (j -= prefetch_distance + 1; i >= j; i -= 1) + { + sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; uint8_t c0 = T[p - (p > 0)], c1 = T[p]; SA[i] = c1; sa_sint_t t = c0 | SAINT_MIN; buckets[cache[count].symbol = c1]++; cache[count].index = (c0 <= c1) ? p : t; cache[count + 1].index = p; count += 2; } + } + + return count; +} + +static fast_sint_t libsais_final_sorting_scan_right_to_left_8u_block_prepare(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + memset(buckets, 0, ALPHABET_SIZE * sizeof(sa_sint_t)); + + fast_sint_t i, j, count = 0; + for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2) + { + libsais_prefetchw(&SA[i - 2 * prefetch_distance]); + + sa_sint_t s0 = SA[i - prefetch_distance - 0]; const uint8_t * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i - prefetch_distance - 1]; const uint8_t * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? 
Ts1 : NULL); + + sa_sint_t p0 = SA[i - 0]; SA[i - 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; buckets[cache[count].symbol = T[p0]]++; cache[count++].index = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] > T[p0]) << (SAINT_BIT - 1)); } + sa_sint_t p1 = SA[i - 1]; SA[i - 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; buckets[cache[count].symbol = T[p1]]++; cache[count++].index = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] > T[p1]) << (SAINT_BIT - 1)); } + } + + for (j -= prefetch_distance + 1; i >= j; i -= 1) + { + sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; buckets[cache[count].symbol = T[p]]++; cache[count++].index = p | ((sa_sint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1)); } + } + + return count; +} + +static void libsais_final_order_scan_right_to_left_8u_block_place(sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = 0, j = count - 3; i < j; i += 4) + { + libsais_prefetch(&cache[i + prefetch_distance]); + + SA[--buckets[cache[i + 0].symbol]] = cache[i + 0].index; + SA[--buckets[cache[i + 1].symbol]] = cache[i + 1].index; + SA[--buckets[cache[i + 2].symbol]] = cache[i + 2].index; + SA[--buckets[cache[i + 3].symbol]] = cache[i + 3].index; + } + + for (j += 3; i < j; i += 1) + { + SA[--buckets[cache[i].symbol]] = cache[i].index; + } +} + +static void libsais_final_bwt_aux_scan_right_to_left_8u_block_place(sa_sint_t * RESTRICT SA, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = 0, j = count - 6; i < j; i += 8) + { + libsais_prefetch(&cache[i + prefetch_distance]); + + SA[--buckets[cache[i + 0].symbol]] = cache[i + 0].index; if ((cache[i + 1].index & rm) == 0) { I[cache[i + 1].index / (rm + 1)] = buckets[cache[i + 0].symbol] + 1; } + SA[--buckets[cache[i + 2].symbol]] = cache[i + 2].index; if ((cache[i + 3].index & rm) == 0) { I[cache[i + 3].index / (rm + 1)] = buckets[cache[i + 2].symbol] + 1; } + SA[--buckets[cache[i + 4].symbol]] = cache[i + 4].index; if ((cache[i + 5].index & rm) == 0) { I[cache[i + 5].index / (rm + 1)] = buckets[cache[i + 4].symbol] + 1; } + SA[--buckets[cache[i + 6].symbol]] = cache[i + 6].index; if ((cache[i + 7].index & rm) == 0) { I[cache[i + 7].index / (rm + 1)] = buckets[cache[i + 6].symbol] + 1; } + } + + for (j += 6; i < j; i += 2) + { + SA[--buckets[cache[i].symbol]] = cache[i].index; if ((cache[i + 1].index & rm) == 0) { I[(cache[i + 1].index & SAINT_MAX) / (rm + 1)] = buckets[cache[i].symbol] + 1; } + } +} + +static void libsais_final_sorting_scan_right_to_left_32s_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) + { + libsais_prefetchw(&SA[i + 2 * prefetch_distance]); + + sa_sint_t s0 = SA[i + prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i + prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? 
Ts1 : NULL); + + libsais_prefetchw(&cache[i + prefetch_distance]); + + sa_sint_t symbol0 = SAINT_MIN, p0 = SA[i + 0]; SA[i + 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; cache[i + 0].index = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] > T[p0]) << (SAINT_BIT - 1)); symbol0 = T[p0]; } cache[i + 0].symbol = symbol0; + sa_sint_t symbol1 = SAINT_MIN, p1 = SA[i + 1]; SA[i + 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; cache[i + 1].index = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] > T[p1]) << (SAINT_BIT - 1)); symbol1 = T[p1]; } cache[i + 1].symbol = symbol1; + } + + for (j += prefetch_distance + 1; i < j; i += 1) + { + sa_sint_t symbol = SAINT_MIN, p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; cache[i].index = p | ((sa_sint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1)); symbol = T[p]; } cache[i].symbol = symbol; + } +} + +static void libsais_final_sorting_scan_right_to_left_32s_block_sort(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2) + { + libsais_prefetchw(&cache[i - 2 * prefetch_distance]); + + sa_sint_t s0 = cache[i - prefetch_distance - 0].symbol; const sa_sint_t * Is0 = &induction_bucket[s0]; libsais_prefetchw(s0 >= 0 ? Is0 : NULL); + sa_sint_t s1 = cache[i - prefetch_distance - 1].symbol; const sa_sint_t * Is1 = &induction_bucket[s1]; libsais_prefetchw(s1 >= 0 ? Is1 : NULL); + + sa_sint_t v0 = cache[i - 0].symbol; + if (v0 >= 0) + { + cache[i - 0].symbol = --induction_bucket[v0]; + if (cache[i - 0].symbol >= omp_block_start) { sa_sint_t ni = cache[i - 0].symbol, np = cache[i - 0].index; cache[i - 0].index = np & SAINT_MAX; if (np > 0) { np--; cache[ni].index = np | ((sa_sint_t)(T[np - (np > 0)] > T[np]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np]; } } + } + + sa_sint_t v1 = cache[i - 1].symbol; + if (v1 >= 0) + { + cache[i - 1].symbol = --induction_bucket[v1]; + if (cache[i - 1].symbol >= omp_block_start) { sa_sint_t ni = cache[i - 1].symbol, np = cache[i - 1].index; cache[i - 1].index = np & SAINT_MAX; if (np > 0) { np--; cache[ni].index = np | ((sa_sint_t)(T[np - (np > 0)] > T[np]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np]; } } + } + } + + for (j -= prefetch_distance + 1; i >= j; i -= 1) + { + sa_sint_t v = cache[i].symbol; + if (v >= 0) + { + cache[i].symbol = --induction_bucket[v]; + if (cache[i].symbol >= omp_block_start) { sa_sint_t ni = cache[i].symbol, np = cache[i].index; cache[i].index = np & SAINT_MAX; if (np > 0) { np--; cache[ni].index = np | ((sa_sint_t)(T[np - (np > 0)] > T[np]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np]; } } + } + } +} + +static void libsais_final_bwt_scan_right_to_left_8u_block_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * ALPHABET_SIZE && omp_get_dynamic() == 0) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(thread_state); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); + 
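/* Split the block into 16-aligned per-thread slices; the last thread picks up the remainder. */ +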
fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start; + + omp_block_start += block_start; + + if (omp_num_threads == 1) + { + libsais_final_bwt_scan_right_to_left_8u(T, SA, induction_bucket, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + thread_state[omp_thread_num].state.count = libsais_final_bwt_scan_right_to_left_8u_block_prepare(T, SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size); + } + + #pragma omp barrier + + #pragma omp master + { + fast_sint_t t; + for (t = omp_num_threads - 1; t >= 0; --t) + { + sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets; + fast_sint_t c; for (c = 0; c < ALPHABET_SIZE; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_bucket[c]; induction_bucket[c] = A - B; temp_bucket[c] = A; } + } + } + + #pragma omp barrier + + { + libsais_final_order_scan_right_to_left_8u_block_place(SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count); + } + } +#endif + } +} + +static void libsais_final_bwt_aux_scan_right_to_left_8u_block_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * ALPHABET_SIZE && omp_get_dynamic() == 0) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(thread_state); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : block_size - omp_block_start; + + omp_block_start += block_start; + + if (omp_num_threads == 1) + { + libsais_final_bwt_aux_scan_right_to_left_8u(T, SA, rm, I, induction_bucket, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + thread_state[omp_thread_num].state.count = libsais_final_bwt_aux_scan_right_to_left_8u_block_prepare(T, SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size); + } + + #pragma omp barrier + + #pragma omp master + { + fast_sint_t t; + for (t = omp_num_threads - 1; t >= 0; --t) + { + sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets; + fast_sint_t c; for (c = 0; c < ALPHABET_SIZE; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_bucket[c]; induction_bucket[c] = A - B; temp_bucket[c] = A; } + } + } + + #pragma omp barrier + + { + libsais_final_bwt_aux_scan_right_to_left_8u_block_place(SA, rm, I, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count); + } + } +#endif + } +} + +static void libsais_final_sorting_scan_right_to_left_8u_block_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * ALPHABET_SIZE && omp_get_dynamic() == 0) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(thread_state); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : block_size - omp_block_start; + + omp_block_start += block_start; + + if (omp_num_threads == 1) + { + libsais_final_sorting_scan_right_to_left_8u(T, SA, induction_bucket, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + thread_state[omp_thread_num].state.count = libsais_final_sorting_scan_right_to_left_8u_block_prepare(T, SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size); + } + + #pragma omp barrier + + #pragma omp master + { + fast_sint_t t; + for (t = omp_num_threads - 1; t >= 0; --t) + { + sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets; + fast_sint_t c; for (c = 0; c < ALPHABET_SIZE; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_bucket[c]; induction_bucket[c] = A - B; temp_bucket[c] = A; } + } + } + + #pragma omp barrier + + { + libsais_final_order_scan_right_to_left_8u_block_place(SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count); + } + } +#endif + } +} + +static void libsais_final_sorting_scan_right_to_left_32s_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(cache); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : block_size - omp_block_start; + + omp_block_start += block_start; + + if (omp_num_threads == 1) + { + libsais_final_sorting_scan_right_to_left_32s(T, SA, buckets, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + libsais_final_sorting_scan_right_to_left_32s_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size); + } + + #pragma omp barrier + + #pragma omp master + { + libsais_final_sorting_scan_right_to_left_32s_block_sort(T, buckets, cache - block_start, block_start, block_size); + } + + #pragma omp barrier + + { + libsais_compact_and_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size); + } + } +#endif + } +} + +#endif + +static sa_sint_t libsais_final_bwt_scan_right_to_left_8u_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + sa_sint_t index = -1; + + if (threads == 1 || n < 65536) + { + index = libsais_final_bwt_scan_right_to_left_8u(T, SA, induction_bucket, 0, n); + } +#if defined(_OPENMP) + else + { + fast_sint_t block_start; + for (block_start = (fast_sint_t)n - 1; block_start >= 0; ) + { + if (SA[block_start] == 0) + { + index = (sa_sint_t)block_start--; + } + else + { + fast_sint_t block_max_end = block_start - ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads); if (block_max_end < 0) { block_max_end = -1; } + fast_sint_t block_end = block_start - 1; while (block_end > block_max_end && SA[block_end] != 0) { block_end--; } + fast_sint_t block_size = block_start - block_end; + + if (block_size < 32) + { + for (; block_start > block_end; block_start -= 1) + { + sa_sint_t p = SA[block_start]; SA[block_start] = p & SAINT_MAX; if (p > 0) { p--; uint8_t c0 = T[p - (p > 0)], c1 = T[p]; SA[block_start] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? p : t; } + } + } + else + { + libsais_final_bwt_scan_right_to_left_8u_block_omp(T, SA, induction_bucket, block_end + 1, block_size, threads, thread_state); + block_start = block_end; + } + } + } + } +#else + UNUSED(thread_state); +#endif + + return index; +} + +static void libsais_final_bwt_aux_scan_right_to_left_8u_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + if (threads == 1 || n < 65536) + { + libsais_final_bwt_aux_scan_right_to_left_8u(T, SA, rm, I, induction_bucket, 0, n); + } +#if defined(_OPENMP) + else + { + fast_sint_t block_start; + for (block_start = (fast_sint_t)n - 1; block_start >= 0; ) + { + if (SA[block_start] == 0) + { + block_start--; + } + else + { + fast_sint_t block_max_end = block_start - ((fast_sint_t)threads) * ((LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads) / 2); if (block_max_end < 0) { block_max_end = -1; } + fast_sint_t block_end = block_start - 1; while (block_end > block_max_end && SA[block_end] != 0) { block_end--; } + fast_sint_t block_size = block_start - block_end; + + if (block_size < 32) + { + for (; block_start > block_end; block_start -= 1) + { + sa_sint_t p = SA[block_start]; SA[block_start] = p & SAINT_MAX; if (p > 0) { p--; uint8_t c0 = T[p - (p > 0)], c1 = T[p]; SA[block_start] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? 
p : t; if ((p & rm) == 0) { I[p / (rm + 1)] = induction_bucket[T[p]] + 1; } } + } + } + else + { + libsais_final_bwt_aux_scan_right_to_left_8u_block_omp(T, SA, rm, I, induction_bucket, block_end + 1, block_size, threads, thread_state); + block_start = block_end; + } + } + } + } +#else + UNUSED(thread_state); +#endif +} + +static void libsais_final_sorting_scan_right_to_left_8u_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + if (threads == 1 || n < 65536) + { + libsais_final_sorting_scan_right_to_left_8u(T, SA, induction_bucket, 0, n); + } +#if defined(_OPENMP) + else + { + fast_sint_t block_start; + for (block_start = (fast_sint_t)n - 1; block_start >= 0; ) + { + if (SA[block_start] == 0) + { + block_start--; + } + else + { + fast_sint_t block_max_end = block_start - ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads); if (block_max_end < -1) { block_max_end = -1; } + fast_sint_t block_end = block_start - 1; while (block_end > block_max_end && SA[block_end] != 0) { block_end--; } + fast_sint_t block_size = block_start - block_end; + + if (block_size < 32) + { + for (; block_start > block_end; block_start -= 1) + { + sa_sint_t p = SA[block_start]; SA[block_start] = p & SAINT_MAX; if (p > 0) { p--; SA[--induction_bucket[T[p]]] = p | ((sa_sint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1)); } + } + } + else + { + libsais_final_sorting_scan_right_to_left_8u_block_omp(T, SA, induction_bucket, block_end + 1, block_size, threads, thread_state); + block_start = block_end; + } + } + } + } +#else + UNUSED(thread_state); +#endif +} + +static void libsais_final_sorting_scan_right_to_left_32s_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + if (threads == 1 || n < 65536) + { + libsais_final_sorting_scan_right_to_left_32s(T, SA, induction_bucket, 0, n); + } +#if defined(_OPENMP) + else + { + fast_sint_t block_start, block_end; + for (block_start = (fast_sint_t)n - 1; block_start >= 0; block_start = block_end) + { + block_end = block_start - (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end < 0) { block_end = -1; } + + libsais_final_sorting_scan_right_to_left_32s_block_omp(T, SA, induction_bucket, thread_state[0].state.cache, block_end + 1, block_start - block_end, threads); + } + } +#else + UNUSED(thread_state); +#endif +} + +static void libsais_clear_lms_suffixes_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT bucket_start, sa_sint_t * RESTRICT bucket_end, sa_sint_t threads) +{ + fast_sint_t c; + +#if defined(_OPENMP) + #pragma omp parallel for schedule(static, 1) num_threads(threads) if(threads > 1 && n >= 65536) +#else + UNUSED(threads); UNUSED(n); +#endif + for (c = 0; c < k; ++c) + { + if (bucket_end[c] > bucket_start[c]) + { + memset(&SA[bucket_start[c]], 0, ((size_t)bucket_end[c] - (size_t)bucket_start[c]) * sizeof(sa_sint_t)); + } + } +} + +static sa_sint_t libsais_induce_final_order_8u_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t bwt, sa_sint_t r, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + if (!bwt) + { + libsais_final_sorting_scan_left_to_right_8u_omp(T, SA, n, &buckets[6 * ALPHABET_SIZE], threads, thread_state); + if (threads > 1 && n >= 65536) 
{ libsais_clear_lms_suffixes_omp(SA, n, ALPHABET_SIZE, &buckets[6 * ALPHABET_SIZE], &buckets[7 * ALPHABET_SIZE], threads); } + libsais_final_sorting_scan_right_to_left_8u_omp(T, SA, n, &buckets[7 * ALPHABET_SIZE], threads, thread_state); + return 0; + } + else if (I != NULL) + { + libsais_final_bwt_aux_scan_left_to_right_8u_omp(T, SA, n, r - 1, I, &buckets[6 * ALPHABET_SIZE], threads, thread_state); + if (threads > 1 && n >= 65536) { libsais_clear_lms_suffixes_omp(SA, n, ALPHABET_SIZE, &buckets[6 * ALPHABET_SIZE], &buckets[7 * ALPHABET_SIZE], threads); } + libsais_final_bwt_aux_scan_right_to_left_8u_omp(T, SA, n, r - 1, I, &buckets[7 * ALPHABET_SIZE], threads, thread_state); + return 0; + } + else + { + libsais_final_bwt_scan_left_to_right_8u_omp(T, SA, n, &buckets[6 * ALPHABET_SIZE], threads, thread_state); + if (threads > 1 && n >= 65536) { libsais_clear_lms_suffixes_omp(SA, n, ALPHABET_SIZE, &buckets[6 * ALPHABET_SIZE], &buckets[7 * ALPHABET_SIZE], threads); } + return libsais_final_bwt_scan_right_to_left_8u_omp(T, SA, n, &buckets[7 * ALPHABET_SIZE], threads, thread_state); + } +} + +static void libsais_induce_final_order_32s_6k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + libsais_final_sorting_scan_left_to_right_32s_omp(T, SA, n, &buckets[4 * k], threads, thread_state); + libsais_final_sorting_scan_right_to_left_32s_omp(T, SA, n, &buckets[5 * k], threads, thread_state); +} + +static void libsais_induce_final_order_32s_4k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + libsais_final_sorting_scan_left_to_right_32s_omp(T, SA, n, &buckets[2 * k], threads, thread_state); + libsais_final_sorting_scan_right_to_left_32s_omp(T, SA, n, &buckets[3 * k], threads, thread_state); +} + +static void libsais_induce_final_order_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + libsais_final_sorting_scan_left_to_right_32s_omp(T, SA, n, &buckets[1 * k], threads, thread_state); + libsais_final_sorting_scan_right_to_left_32s_omp(T, SA, n, &buckets[0 * k], threads, thread_state); +} + +static void libsais_induce_final_order_32s_1k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + libsais_count_suffixes_32s(T, n, k, buckets); + libsais_initialize_buckets_start_32s_1k(k, buckets); + libsais_final_sorting_scan_left_to_right_32s_omp(T, SA, n, buckets, threads, thread_state); + + libsais_count_suffixes_32s(T, n, k, buckets); + libsais_initialize_buckets_end_32s_1k(k, buckets); + libsais_final_sorting_scan_right_to_left_32s_omp(T, SA, n, buckets, threads, thread_state); +} + +static sa_sint_t libsais_renumber_unique_and_nonunique_lms_suffixes_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t f, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + sa_sint_t * RESTRICT SAm = &SA[m]; + + sa_sint_t i, j; + for (i = (sa_sint_t)omp_block_start, j = (sa_sint_t)omp_block_start + (sa_sint_t)omp_block_size - 2 * (sa_sint_t)prefetch_distance - 3; i < j; i += 4) + { + libsais_prefetch(&SA[i + 3 * prefetch_distance]); + 
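+ /* Prefetch for write the SAm cells that will be updated two prefetch distances ahead. */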
+ libsais_prefetchw(&SAm[((sa_uint_t)SA[i + 2 * prefetch_distance + 0]) >> 1]); + libsais_prefetchw(&SAm[((sa_uint_t)SA[i + 2 * prefetch_distance + 1]) >> 1]); + libsais_prefetchw(&SAm[((sa_uint_t)SA[i + 2 * prefetch_distance + 2]) >> 1]); + libsais_prefetchw(&SAm[((sa_uint_t)SA[i + 2 * prefetch_distance + 3]) >> 1]); + + sa_uint_t q0 = (sa_uint_t)SA[i + prefetch_distance + 0]; const sa_sint_t * Tq0 = &T[q0]; libsais_prefetchw(SAm[q0 >> 1] < 0 ? Tq0 : NULL); + sa_uint_t q1 = (sa_uint_t)SA[i + prefetch_distance + 1]; const sa_sint_t * Tq1 = &T[q1]; libsais_prefetchw(SAm[q1 >> 1] < 0 ? Tq1 : NULL); + sa_uint_t q2 = (sa_uint_t)SA[i + prefetch_distance + 2]; const sa_sint_t * Tq2 = &T[q2]; libsais_prefetchw(SAm[q2 >> 1] < 0 ? Tq2 : NULL); + sa_uint_t q3 = (sa_uint_t)SA[i + prefetch_distance + 3]; const sa_sint_t * Tq3 = &T[q3]; libsais_prefetchw(SAm[q3 >> 1] < 0 ? Tq3 : NULL); + + sa_uint_t p0 = (sa_uint_t)SA[i + 0]; sa_sint_t s0 = SAm[p0 >> 1]; if (s0 < 0) { T[p0] |= SAINT_MIN; f++; s0 = i + 0 + SAINT_MIN + f; } SAm[p0 >> 1] = s0 - f; + sa_uint_t p1 = (sa_uint_t)SA[i + 1]; sa_sint_t s1 = SAm[p1 >> 1]; if (s1 < 0) { T[p1] |= SAINT_MIN; f++; s1 = i + 1 + SAINT_MIN + f; } SAm[p1 >> 1] = s1 - f; + sa_uint_t p2 = (sa_uint_t)SA[i + 2]; sa_sint_t s2 = SAm[p2 >> 1]; if (s2 < 0) { T[p2] |= SAINT_MIN; f++; s2 = i + 2 + SAINT_MIN + f; } SAm[p2 >> 1] = s2 - f; + sa_uint_t p3 = (sa_uint_t)SA[i + 3]; sa_sint_t s3 = SAm[p3 >> 1]; if (s3 < 0) { T[p3] |= SAINT_MIN; f++; s3 = i + 3 + SAINT_MIN + f; } SAm[p3 >> 1] = s3 - f; + } + + for (j += 2 * (sa_sint_t)prefetch_distance + 3; i < j; i += 1) + { + sa_uint_t p = (sa_uint_t)SA[i]; sa_sint_t s = SAm[p >> 1]; if (s < 0) { T[p] |= SAINT_MIN; f++; s = i + SAINT_MIN + f; } SAm[p >> 1] = s - f; + } + + return f; +} + +static void libsais_compact_unique_and_nonunique_lms_suffixes_32s(sa_sint_t * RESTRICT SA, sa_sint_t m, fast_sint_t * pl, fast_sint_t * pr, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + sa_sint_t * RESTRICT SAl = &SA[0]; + sa_sint_t * RESTRICT SAr = &SA[0]; + + fast_sint_t i, j, l = *pl - 1, r = *pr - 1; + for (i = (fast_sint_t)m + omp_block_start + omp_block_size - 1, j = (fast_sint_t)m + omp_block_start + 3; i >= j; i -= 4) + { + libsais_prefetch(&SA[i - prefetch_distance]); + + sa_sint_t p0 = SA[i - 0]; SAl[l] = p0 & SAINT_MAX; l -= p0 < 0; SAr[r] = p0 - 1; r -= p0 > 0; + sa_sint_t p1 = SA[i - 1]; SAl[l] = p1 & SAINT_MAX; l -= p1 < 0; SAr[r] = p1 - 1; r -= p1 > 0; + sa_sint_t p2 = SA[i - 2]; SAl[l] = p2 & SAINT_MAX; l -= p2 < 0; SAr[r] = p2 - 1; r -= p2 > 0; + sa_sint_t p3 = SA[i - 3]; SAl[l] = p3 & SAINT_MAX; l -= p3 < 0; SAr[r] = p3 - 1; r -= p3 > 0; + } + + for (j -= 3; i >= j; i -= 1) + { + sa_sint_t p = SA[i]; SAl[l] = p & SAINT_MAX; l -= p < 0; SAr[r] = p - 1; r -= p > 0; + } + + *pl = l + 1; *pr = r + 1; +} + + +#if defined(_OPENMP) + +static sa_sint_t libsais_count_unique_suffixes(sa_sint_t * RESTRICT SA, sa_sint_t m, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + sa_sint_t * RESTRICT SAm = &SA[m]; + + fast_sint_t i, j; sa_sint_t f0 = 0, f1 = 0, f2 = 0, f3 = 0; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4) + { + libsais_prefetch(&SA[i + 2 * prefetch_distance]); + + libsais_prefetch(&SAm[((sa_uint_t)SA[i + prefetch_distance + 0]) >> 1]); + libsais_prefetch(&SAm[((sa_uint_t)SA[i + prefetch_distance + 1]) >> 1]); + libsais_prefetch(&SAm[((sa_uint_t)SA[i + 
prefetch_distance + 2]) >> 1]); + libsais_prefetch(&SAm[((sa_uint_t)SA[i + prefetch_distance + 3]) >> 1]); + + f0 += SAm[((sa_uint_t)SA[i + 0]) >> 1] < 0; + f1 += SAm[((sa_uint_t)SA[i + 1]) >> 1] < 0; + f2 += SAm[((sa_uint_t)SA[i + 2]) >> 1] < 0; + f3 += SAm[((sa_uint_t)SA[i + 3]) >> 1] < 0; + } + + for (j += prefetch_distance + 3; i < j; i += 1) + { + f0 += SAm[((sa_uint_t)SA[i]) >> 1] < 0; + } + + return f0 + f1 + f2 + f3; +} + +#endif + +static sa_sint_t libsais_renumber_unique_and_nonunique_lms_suffixes_32s_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + sa_sint_t f = 0; + +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && m >= 65536) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(thread_state); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (m / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : m - omp_block_start; + + if (omp_num_threads == 1) + { + f = libsais_renumber_unique_and_nonunique_lms_suffixes_32s(T, SA, m, 0, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + thread_state[omp_thread_num].state.count = libsais_count_unique_suffixes(SA, m, omp_block_start, omp_block_size); + } + + #pragma omp barrier + + { + fast_sint_t t, count = 0; for (t = 0; t < omp_thread_num; ++t) { count += thread_state[t].state.count; } + + if (omp_thread_num == omp_num_threads - 1) + { + f = (sa_sint_t)(count + thread_state[omp_thread_num].state.count); + } + + libsais_renumber_unique_and_nonunique_lms_suffixes_32s(T, SA, m, (sa_sint_t)count, omp_block_start, omp_block_size); + } + } +#endif + } + + return f; +} + +static void libsais_compact_unique_and_nonunique_lms_suffixes_32s_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs, sa_sint_t f, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 131072 && m < fs) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(thread_state); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (((fast_sint_t)n >> 1) / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : ((fast_sint_t)n >> 1) - omp_block_start; + + if (omp_num_threads == 1) + { + fast_sint_t l = m, r = (fast_sint_t)n + (fast_sint_t)fs; + libsais_compact_unique_and_nonunique_lms_suffixes_32s(SA, m, &l, &r, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + thread_state[omp_thread_num].state.position = (fast_sint_t)m + ((fast_sint_t)n >> 1) + omp_block_start + omp_block_size; + thread_state[omp_thread_num].state.count = (fast_sint_t)m + omp_block_start + omp_block_size; + + libsais_compact_unique_and_nonunique_lms_suffixes_32s(SA, m, &thread_state[omp_thread_num].state.position, &thread_state[omp_thread_num].state.count, omp_block_start, omp_block_size); + } + + #pragma omp barrier + + #pragma omp master + { + fast_sint_t t, position; + + for (position = m, t = omp_num_threads - 1; t >= 0; --t) + { + fast_sint_t omp_block_end = t < omp_num_threads - 1 ? omp_block_stride * (t + 1) : ((fast_sint_t)n >> 1); + fast_sint_t count = ((fast_sint_t)m + ((fast_sint_t)n >> 1) + omp_block_end - thread_state[t].state.position); + + if (count > 0) + { + position -= count; memcpy(&SA[position], &SA[thread_state[t].state.position], (size_t)count * sizeof(sa_sint_t)); + } + } + + for (position = (fast_sint_t)n + (fast_sint_t)fs, t = omp_num_threads - 1; t >= 0; --t) + { + fast_sint_t omp_block_end = t < omp_num_threads - 1 ? omp_block_stride * (t + 1) : ((fast_sint_t)n >> 1); + fast_sint_t count = ((fast_sint_t)m + omp_block_end - thread_state[t].state.count); + + if (count > 0) + { + position -= count; memcpy(&SA[position], &SA[thread_state[t].state.count], (size_t)count * sizeof(sa_sint_t)); + } + } + } + } +#endif + } + + memcpy(&SA[(fast_sint_t)n + (fast_sint_t)fs - (fast_sint_t)m], &SA[(fast_sint_t)m - (fast_sint_t)f], (size_t)f * sizeof(sa_sint_t)); +} + +static sa_sint_t libsais_compact_lms_suffixes_32s_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + sa_sint_t f = libsais_renumber_unique_and_nonunique_lms_suffixes_32s_omp(T, SA, m, threads, thread_state); + libsais_compact_unique_and_nonunique_lms_suffixes_32s_omp(SA, n, m, fs, f, threads, thread_state); + + return f; +} + +static void libsais_merge_unique_lms_suffixes_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, fast_sint_t l, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + const sa_sint_t * RESTRICT SAnm = &SA[(fast_sint_t)n - (fast_sint_t)m - 1 + l]; + + sa_sint_t i, j; fast_sint_t tmp = *SAnm++; + for (i = (sa_sint_t)omp_block_start, j = (sa_sint_t)omp_block_start + (sa_sint_t)omp_block_size - 6; i < j; i += 4) + { + libsais_prefetch(&T[i + prefetch_distance]); + + sa_sint_t c0 = T[i + 0]; if (c0 < 0) { T[i + 0] = c0 & SAINT_MAX; SA[tmp] = i + 0; i++; tmp = *SAnm++; } + sa_sint_t c1 = T[i + 1]; if (c1 < 0) { T[i + 1] = c1 & SAINT_MAX; SA[tmp] = i + 1; i++; tmp = *SAnm++; } + sa_sint_t c2 = T[i + 2]; if (c2 < 0) { T[i + 2] = c2 & SAINT_MAX; SA[tmp] = i + 2; i++; tmp = *SAnm++; } + sa_sint_t c3 = T[i + 3]; if (c3 < 0) { T[i + 3] = c3 & SAINT_MAX; SA[tmp] = i + 3; i++; tmp = *SAnm++; } + } + + for (j += 6; i < j; i += 1) + { + sa_sint_t c = T[i]; if (c < 0) { T[i] = c & SAINT_MAX; SA[tmp] = i; i++; tmp = *SAnm++; } + } +} + +static void libsais_merge_nonunique_lms_suffixes_32s(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, fast_sint_t l, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ 
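+ /* Replace each zero slot in SA (a non-unique LMS suffix cleared earlier) with the next entry of the sorted LMS block stored near the end of SA, starting at offset l. */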
+ const fast_sint_t prefetch_distance = 32; + + const sa_sint_t * RESTRICT SAnm = &SA[(fast_sint_t)n - (fast_sint_t)m - 1 + l]; + + fast_sint_t i, j; sa_sint_t tmp = *SAnm++; + for (i = omp_block_start, j = omp_block_start + omp_block_size - 3; i < j; i += 4) + { + libsais_prefetch(&SA[i + prefetch_distance]); + + if (SA[i + 0] == 0) { SA[i + 0] = tmp; tmp = *SAnm++; } + if (SA[i + 1] == 0) { SA[i + 1] = tmp; tmp = *SAnm++; } + if (SA[i + 2] == 0) { SA[i + 2] = tmp; tmp = *SAnm++; } + if (SA[i + 3] == 0) { SA[i + 3] = tmp; tmp = *SAnm++; } + } + + for (j += 3; i < j; i += 1) + { + if (SA[i] == 0) { SA[i] = tmp; tmp = *SAnm++; } + } +} + +static void libsais_merge_unique_lms_suffixes_32s_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(thread_state); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start; + + if (omp_num_threads == 1) + { + libsais_merge_unique_lms_suffixes_32s(T, SA, n, m, 0, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + thread_state[omp_thread_num].state.count = libsais_count_negative_marked_suffixes(T, omp_block_start, omp_block_size); + } + + #pragma omp barrier + + { + fast_sint_t t, count = 0; for (t = 0; t < omp_thread_num; ++t) { count += thread_state[t].state.count; } + + libsais_merge_unique_lms_suffixes_32s(T, SA, n, m, count, omp_block_start, omp_block_size); + } + } +#endif + } +} + +static void libsais_merge_nonunique_lms_suffixes_32s_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t f, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && m >= 65536) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(thread_state); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (m / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : m - omp_block_start; + + if (omp_num_threads == 1) + { + libsais_merge_nonunique_lms_suffixes_32s(SA, n, m, f, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + thread_state[omp_thread_num].state.count = libsais_count_zero_marked_suffixes(SA, omp_block_start, omp_block_size); + } + + #pragma omp barrier + + { + fast_sint_t t, count = f; for (t = 0; t < omp_thread_num; ++t) { count += thread_state[t].state.count; } + + libsais_merge_nonunique_lms_suffixes_32s(SA, n, m, count, omp_block_start, omp_block_size); + } + } +#endif + } +} + +static void libsais_merge_compacted_lms_suffixes_32s_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t f, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + libsais_merge_unique_lms_suffixes_32s_omp(T, SA, n, m, threads, thread_state); + libsais_merge_nonunique_lms_suffixes_32s_omp(SA, n, m, f, threads, thread_state); +} + +static void libsais_reconstruct_compacted_lms_suffixes_32s_2k_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m, sa_sint_t fs, sa_sint_t f, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + if (f > 0) + { + memmove(&SA[n - m - 1], &SA[n + fs - m], (size_t)f * sizeof(sa_sint_t)); + + libsais_count_and_gather_compacted_lms_suffixes_32s_2k_omp(T, SA, n, k, buckets, threads, thread_state); + libsais_reconstruct_lms_suffixes_omp(SA, n, m - f, threads); + + memcpy(&SA[n - m - 1 + f], &SA[0], ((size_t)m - (size_t)f) * sizeof(sa_sint_t)); + memset(&SA[0], 0, (size_t)m * sizeof(sa_sint_t)); + + libsais_merge_compacted_lms_suffixes_32s_omp(T, SA, n, m, f, threads, thread_state); + } + else + { + libsais_count_and_gather_lms_suffixes_32s_2k(T, SA, n, k, buckets, 0, n); + libsais_reconstruct_lms_suffixes_omp(SA, n, m, threads); + } +} + +static void libsais_reconstruct_compacted_lms_suffixes_32s_1k_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs, sa_sint_t f, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + if (f > 0) + { + memmove(&SA[n - m - 1], &SA[n + fs - m], (size_t)f * sizeof(sa_sint_t)); + + libsais_gather_compacted_lms_suffixes_32s(T, SA, n); + libsais_reconstruct_lms_suffixes_omp(SA, n, m - f, threads); + + memcpy(&SA[n - m - 1 + f], &SA[0], ((size_t)m - (size_t)f) * sizeof(sa_sint_t)); + memset(&SA[0], 0, (size_t)m * sizeof(sa_sint_t)); + + libsais_merge_compacted_lms_suffixes_32s_omp(T, SA, n, m, f, threads, thread_state); + } + else + { + libsais_gather_lms_suffixes_32s(T, SA, n); + libsais_reconstruct_lms_suffixes_omp(SA, n, m, threads); + } +} + +static sa_sint_t libsais_main_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t fs, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + if (k > 0 && fs / k >= 6) + { + sa_sint_t alignment = (fs - 1024) / k >= 6 ? 1024 : 16; + sa_sint_t * RESTRICT buckets = (fs - alignment) / k >= 6 ? 
(sa_sint_t *)libsais_align_up(&SA[n + fs - 6 * k - alignment], (size_t)alignment * sizeof(sa_sint_t)) : &SA[n + fs - 6 * k]; + + sa_sint_t m = libsais_count_and_gather_lms_suffixes_32s_4k_omp(T, SA, n, k, buckets, threads, thread_state); + if (m > 1) + { + memset(SA, 0, ((size_t)n - (size_t)m) * sizeof(sa_sint_t)); + + sa_sint_t first_lms_suffix = SA[n - m]; + sa_sint_t left_suffixes_count = libsais_initialize_buckets_for_lms_suffixes_radix_sort_32s_6k(T, k, buckets, first_lms_suffix); + + libsais_radix_sort_lms_suffixes_32s_6k_omp(T, SA, n, m, &buckets[4 * k], threads, thread_state); + libsais_radix_sort_set_markers_32s_6k_omp(SA, k, &buckets[4 * k], threads); + + if (threads > 1 && n >= 65536) { memset(&SA[(fast_sint_t)n - (fast_sint_t)m], 0, (size_t)m * sizeof(sa_sint_t)); } + + libsais_initialize_buckets_for_partial_sorting_32s_6k(T, k, buckets, first_lms_suffix, left_suffixes_count); + libsais_induce_partial_order_32s_6k_omp(T, SA, n, k, buckets, first_lms_suffix, left_suffixes_count, threads, thread_state); + + sa_sint_t names = libsais_renumber_and_mark_distinct_lms_suffixes_32s_4k_omp(SA, n, m, threads, thread_state); + if (names < m) + { + sa_sint_t f = libsais_compact_lms_suffixes_32s_omp(T, SA, n, m, fs, threads, thread_state); + + if (libsais_main_32s(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f, threads, thread_state) != 0) + { + return -2; + } + + libsais_reconstruct_compacted_lms_suffixes_32s_2k_omp(T, SA, n, k, m, fs, f, buckets, threads, thread_state); + } + else + { + libsais_count_lms_suffixes_32s_2k(T, n, k, buckets); + } + + libsais_initialize_buckets_start_and_end_32s_4k(k, buckets); + libsais_place_lms_suffixes_histogram_32s_4k(SA, n, k, m, buckets); + libsais_induce_final_order_32s_4k(T, SA, n, k, buckets, threads, thread_state); + } + else + { + SA[0] = SA[n - 1]; + + libsais_initialize_buckets_start_and_end_32s_6k(k, buckets); + libsais_place_lms_suffixes_histogram_32s_6k(SA, n, k, m, buckets); + libsais_induce_final_order_32s_6k(T, SA, n, k, buckets, threads, thread_state); + } + + return 0; + } + else if (k > 0 && fs / k >= 4) + { + sa_sint_t alignment = (fs - 1024) / k >= 4 ? 1024 : 16; + sa_sint_t * RESTRICT buckets = (fs - alignment) / k >= 4 ? 
(sa_sint_t *)libsais_align_up(&SA[n + fs - 4 * k - alignment], (size_t)alignment * sizeof(sa_sint_t)) : &SA[n + fs - 4 * k]; + + sa_sint_t m = libsais_count_and_gather_lms_suffixes_32s_2k_omp(T, SA, n, k, buckets, threads, thread_state); + if (m > 1) + { + libsais_initialize_buckets_for_radix_and_partial_sorting_32s_4k(T, k, buckets, SA[n - m]); + + libsais_radix_sort_lms_suffixes_32s_2k_omp(T, SA, n, m, &buckets[1], threads, thread_state); + libsais_radix_sort_set_markers_32s_4k_omp(SA, k, &buckets[1], threads); + + libsais_place_lms_suffixes_interval_32s_4k(SA, n, k, m - 1, buckets); + libsais_induce_partial_order_32s_4k_omp(T, SA, n, k, buckets, threads, thread_state); + + sa_sint_t names = libsais_renumber_and_mark_distinct_lms_suffixes_32s_4k_omp(SA, n, m, threads, thread_state); + if (names < m) + { + sa_sint_t f = libsais_compact_lms_suffixes_32s_omp(T, SA, n, m, fs, threads, thread_state); + + if (libsais_main_32s(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f, threads, thread_state) != 0) + { + return -2; + } + + libsais_reconstruct_compacted_lms_suffixes_32s_2k_omp(T, SA, n, k, m, fs, f, buckets, threads, thread_state); + } + else + { + libsais_count_lms_suffixes_32s_2k(T, n, k, buckets); + } + } + else + { + SA[0] = SA[n - 1]; + } + + libsais_initialize_buckets_start_and_end_32s_4k(k, buckets); + libsais_place_lms_suffixes_histogram_32s_4k(SA, n, k, m, buckets); + libsais_induce_final_order_32s_4k(T, SA, n, k, buckets, threads, thread_state); + + return 0; + } + else if (k > 0 && fs / k >= 2) + { + sa_sint_t alignment = (fs - 1024) / k >= 2 ? 1024 : 16; + sa_sint_t * RESTRICT buckets = (fs - alignment) / k >= 2 ? (sa_sint_t *)libsais_align_up(&SA[n + fs - 2 * k - alignment], (size_t)alignment * sizeof(sa_sint_t)) : &SA[n + fs - 2 * k]; + + sa_sint_t m = libsais_count_and_gather_lms_suffixes_32s_2k_omp(T, SA, n, k, buckets, threads, thread_state); + if (m > 1) + { + libsais_initialize_buckets_for_lms_suffixes_radix_sort_32s_2k(T, k, buckets, SA[n - m]); + + libsais_radix_sort_lms_suffixes_32s_2k_omp(T, SA, n, m, &buckets[1], threads, thread_state); + libsais_place_lms_suffixes_interval_32s_2k(SA, n, k, m - 1, buckets); + + libsais_initialize_buckets_start_and_end_32s_2k(k, buckets); + libsais_induce_partial_order_32s_2k_omp(T, SA, n, k, buckets, threads, thread_state); + + sa_sint_t names = libsais_renumber_and_mark_distinct_lms_suffixes_32s_1k_omp(T, SA, n, m, threads); + if (names < m) + { + sa_sint_t f = libsais_compact_lms_suffixes_32s_omp(T, SA, n, m, fs, threads, thread_state); + + if (libsais_main_32s(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f, threads, thread_state) != 0) + { + return -2; + } + + libsais_reconstruct_compacted_lms_suffixes_32s_2k_omp(T, SA, n, k, m, fs, f, buckets, threads, thread_state); + } + else + { + libsais_count_lms_suffixes_32s_2k(T, n, k, buckets); + } + } + else + { + SA[0] = SA[n - 1]; + } + + libsais_initialize_buckets_end_32s_2k(k, buckets); + libsais_place_lms_suffixes_histogram_32s_2k(SA, n, k, m, buckets); + + libsais_initialize_buckets_start_and_end_32s_2k(k, buckets); + libsais_induce_final_order_32s_2k(T, SA, n, k, buckets, threads, thread_state); + + return 0; + } + else + { + sa_sint_t * buffer = fs < k ? (sa_sint_t *)libsais_alloc_aligned((size_t)k * sizeof(sa_sint_t), 4096) : (sa_sint_t *)NULL; + + sa_sint_t alignment = fs - 1024 >= k ? 1024 : 16; + sa_sint_t * RESTRICT buckets = fs - alignment >= k ? 
(sa_sint_t *)libsais_align_up(&SA[n + fs - k - alignment], (size_t)alignment * sizeof(sa_sint_t)) : fs >= k ? &SA[n + fs - k] : buffer; + + if (buckets == NULL) { return -2; } + + memset(SA, 0, (size_t)n * sizeof(sa_sint_t)); + + libsais_count_suffixes_32s(T, n, k, buckets); + libsais_initialize_buckets_end_32s_1k(k, buckets); + + sa_sint_t m = libsais_radix_sort_lms_suffixes_32s_1k(T, SA, n, buckets); + if (m > 1) + { + libsais_induce_partial_order_32s_1k_omp(T, SA, n, k, buckets, threads, thread_state); + + sa_sint_t names = libsais_renumber_and_mark_distinct_lms_suffixes_32s_1k_omp(T, SA, n, m, threads); + if (names < m) + { + if (buffer != NULL) { libsais_free_aligned(buffer); buckets = NULL; } + + sa_sint_t f = libsais_compact_lms_suffixes_32s_omp(T, SA, n, m, fs, threads, thread_state); + + if (libsais_main_32s(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f, threads, thread_state) != 0) + { + return -2; + } + + libsais_reconstruct_compacted_lms_suffixes_32s_1k_omp(T, SA, n, m, fs, f, threads, thread_state); + + if (buckets == NULL) { buckets = buffer = (sa_sint_t *)libsais_alloc_aligned((size_t)k * sizeof(sa_sint_t), 4096); } + if (buckets == NULL) { return -2; } + } + + libsais_count_suffixes_32s(T, n, k, buckets); + libsais_initialize_buckets_end_32s_1k(k, buckets); + libsais_place_lms_suffixes_interval_32s_1k(T, SA, k, m, buckets); + } + + libsais_induce_final_order_32s_1k(T, SA, n, k, buckets, threads, thread_state); + libsais_free_aligned(buffer); + + return 0; + } +} + +int32_t libsais_main_32s_internal(int32_t * T, int32_t * SA, int32_t n, int32_t k, int32_t fs, int32_t threads) +{ + LIBSAIS_THREAD_STATE * RESTRICT thread_state = threads > 1 ? libsais_alloc_thread_state(threads) : NULL; + + sa_sint_t index = thread_state != NULL || threads == 1 + ? 
libsais_main_32s(T, SA, n, k, fs, threads, thread_state) + : -2; + + libsais_free_thread_state(thread_state); + + return index; +} + +static sa_sint_t libsais_main_8u(const uint8_t * T, sa_sint_t * SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t bwt, sa_sint_t r, sa_sint_t * RESTRICT I, sa_sint_t fs, sa_sint_t * freq, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + sa_sint_t m = libsais_count_and_gather_lms_suffixes_8u_omp(T, SA, n, buckets, threads, thread_state); + + libsais_initialize_buckets_start_and_end_8u(buckets, freq); + + if (m > 0) + { + sa_sint_t first_lms_suffix = SA[n - m]; + sa_sint_t left_suffixes_count = libsais_initialize_buckets_for_lms_suffixes_radix_sort_8u(T, buckets, first_lms_suffix); + + if (threads > 1 && n >= 65536) { memset(SA, 0, ((size_t)n - (size_t)m) * sizeof(sa_sint_t)); } + libsais_radix_sort_lms_suffixes_8u_omp(T, SA, n, m, buckets, threads, thread_state); + if (threads > 1 && n >= 65536) { memset(&SA[(fast_sint_t)n - (fast_sint_t)m], 0, (size_t)m * sizeof(sa_sint_t)); } + + libsais_initialize_buckets_for_partial_sorting_8u(T, buckets, first_lms_suffix, left_suffixes_count); + libsais_induce_partial_order_8u_omp(T, SA, n, buckets, first_lms_suffix, left_suffixes_count, threads, thread_state); + + sa_sint_t names = libsais_renumber_and_gather_lms_suffixes_8u_omp(SA, n, m, fs, threads, thread_state); + if (names < m) + { + if (libsais_main_32s(SA + n + fs - m, SA, m, names, fs + n - 2 * m, threads, thread_state) != 0) + { + return -2; + } + + libsais_gather_lms_suffixes_8u_omp(T, SA, n, threads, thread_state); + libsais_reconstruct_lms_suffixes_omp(SA, n, m, threads); + } + + libsais_place_lms_suffixes_interval_8u(SA, n, m, buckets); + } + else + { + memset(SA, 0, (size_t)n * sizeof(sa_sint_t)); + } + + return libsais_induce_final_order_8u_omp(T, SA, n, bwt, r, I, buckets, threads, thread_state); +} + +static sa_sint_t libsais_main(const uint8_t * T, sa_sint_t * SA, sa_sint_t n, sa_sint_t bwt, sa_sint_t r, sa_sint_t * I, sa_sint_t fs, sa_sint_t * freq, sa_sint_t threads) +{ + LIBSAIS_THREAD_STATE * RESTRICT thread_state = threads > 1 ? libsais_alloc_thread_state(threads) : NULL; + sa_sint_t * RESTRICT buckets = (sa_sint_t *)libsais_alloc_aligned(8 * ALPHABET_SIZE * sizeof(sa_sint_t), 4096); + + sa_sint_t index = buckets != NULL && (thread_state != NULL || threads == 1) + ? libsais_main_8u(T, SA, n, buckets, bwt, r, I, fs, freq, threads, thread_state) + : -2; + + libsais_free_aligned(buckets); + libsais_free_thread_state(thread_state); + + return index; +} + +static sa_sint_t libsais_main_ctx(const LIBSAIS_CONTEXT * ctx, const uint8_t * T, sa_sint_t * SA, sa_sint_t n, sa_sint_t bwt, sa_sint_t r, sa_sint_t * I, sa_sint_t fs, sa_sint_t * freq) +{ + return ctx != NULL && (ctx->buckets != NULL && (ctx->thread_state != NULL || ctx->threads == 1)) + ? 
libsais_main_8u(T, SA, n, ctx->buckets, bwt, r, I, fs, freq, (sa_sint_t)ctx->threads, ctx->thread_state) + : -2; +} + +static void libsais_bwt_copy_8u(uint8_t * RESTRICT U, sa_sint_t * RESTRICT A, sa_sint_t n) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = 0, j = (fast_sint_t)n - 7; i < j; i += 8) + { + libsais_prefetch(&A[i + prefetch_distance]); + + U[i + 0] = (uint8_t)A[i + 0]; + U[i + 1] = (uint8_t)A[i + 1]; + U[i + 2] = (uint8_t)A[i + 2]; + U[i + 3] = (uint8_t)A[i + 3]; + U[i + 4] = (uint8_t)A[i + 4]; + U[i + 5] = (uint8_t)A[i + 5]; + U[i + 6] = (uint8_t)A[i + 6]; + U[i + 7] = (uint8_t)A[i + 7]; + } + + for (j += 7; i < j; i += 1) + { + U[i] = (uint8_t)A[i]; + } +} + +#if defined(_OPENMP) + +static void libsais_bwt_copy_8u_omp(uint8_t * RESTRICT U, sa_sint_t * RESTRICT A, sa_sint_t n, sa_sint_t threads) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); + fast_sint_t omp_block_stride = ((fast_sint_t)n / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : (fast_sint_t)n - omp_block_start; +#else + UNUSED(threads); + + fast_sint_t omp_block_start = 0; + fast_sint_t omp_block_size = (fast_sint_t)n; +#endif + + libsais_bwt_copy_8u(U + omp_block_start, A + omp_block_start, (sa_sint_t)omp_block_size); + } +} + +#endif + +void * libsais_create_ctx(void) +{ + return (void *)libsais_create_ctx_main(1); +} + +void libsais_free_ctx(void * ctx) +{ + libsais_free_ctx_main((LIBSAIS_CONTEXT *)ctx); +} + +int32_t libsais(const uint8_t * T, int32_t * SA, int32_t n, int32_t fs, int32_t * freq) +{ + if ((T == NULL) || (SA == NULL) || (n < 0) || (fs < 0)) + { + return -1; + } + else if (n < 2) + { + if (n == 1) { SA[0] = 0; } + return 0; + } + + return libsais_main(T, SA, n, 0, 0, NULL, fs, freq, 1); +} + +int32_t libsais_ctx(const void * ctx, const uint8_t * T, int32_t * SA, int32_t n, int32_t fs, int32_t * freq) +{ + if ((ctx == NULL) || (T == NULL) || (SA == NULL) || (n < 0) || (fs < 0)) + { + return -1; + } + else if (n < 2) + { + if (n == 1) { SA[0] = 0; } + return 0; + } + + return libsais_main_ctx((const LIBSAIS_CONTEXT *)ctx, T, SA, n, 0, 0, NULL, fs, freq); +} + +int32_t libsais_bwt(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq) +{ + if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0)) + { + return -1; + } + else if (n <= 1) + { + if (n == 1) { U[0] = T[0]; } + return n; + } + + sa_sint_t index = libsais_main(T, A, n, 1, 0, NULL, fs, freq, 1); + if (index >= 0) + { + index++; + + U[0] = T[n - 1]; + libsais_bwt_copy_8u(U + 1, A, index - 1); + libsais_bwt_copy_8u(U + index, A + index, n - index); + } + + return index; +} + +int32_t libsais_bwt_aux(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t r, int32_t * I) +{ + if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0) || (r < 2) || ((r & (r - 1)) != 0) || (I == NULL)) + { + return -1; + } + else if (n <= 1) + { + if (n == 1) { U[0] = T[0]; } + + I[0] = n; + return 0; + } + + if (libsais_main(T, A, n, 1, r, I, fs, freq, 1) != 0) + { + return -2; + } + + U[0] = T[n - 1]; + libsais_bwt_copy_8u(U + 1, A, I[0] - 1); + libsais_bwt_copy_8u(U + I[0], A + I[0], n - I[0]); + + return 0; +} + +int32_t 
libsais_bwt_ctx(const void * ctx, const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq) +{ + if ((ctx == NULL) || (T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0)) + { + return -1; + } + else if (n <= 1) + { + if (n == 1) { U[0] = T[0]; } + return n; + } + + sa_sint_t index = libsais_main_ctx((const LIBSAIS_CONTEXT *)ctx, T, A, n, 1, 0, NULL, fs, freq); + if (index >= 0) + { + index++; + + U[0] = T[n - 1]; + +#if defined(_OPENMP) + libsais_bwt_copy_8u_omp(U + 1, A, index - 1, (sa_sint_t)((const LIBSAIS_CONTEXT *)ctx)->threads); + libsais_bwt_copy_8u_omp(U + index, A + index, n - index, (sa_sint_t)((const LIBSAIS_CONTEXT *)ctx)->threads); +#else + libsais_bwt_copy_8u(U + 1, A, index - 1); + libsais_bwt_copy_8u(U + index, A + index, n - index); +#endif + } + + return index; +} + +int32_t libsais_bwt_aux_ctx(const void * ctx, const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t r, int32_t * I) +{ + if ((ctx == NULL) || (T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0) || (r < 2) || ((r & (r - 1)) != 0) || (I == NULL)) + { + return -1; + } + else if (n <= 1) + { + if (n == 1) { U[0] = T[0]; } + + I[0] = n; + return 0; + } + + if (libsais_main_ctx((const LIBSAIS_CONTEXT *)ctx, T, A, n, 1, r, I, fs, freq) != 0) + { + return -2; + } + + U[0] = T[n - 1]; + +#if defined(_OPENMP) + libsais_bwt_copy_8u_omp(U + 1, A, I[0] - 1, (sa_sint_t)((const LIBSAIS_CONTEXT *)ctx)->threads); + libsais_bwt_copy_8u_omp(U + I[0], A + I[0], n - I[0], (sa_sint_t)((const LIBSAIS_CONTEXT *)ctx)->threads); +#else + libsais_bwt_copy_8u(U + 1, A, I[0] - 1); + libsais_bwt_copy_8u(U + I[0], A + I[0], n - I[0]); +#endif + + return 0; +} + +#if defined(_OPENMP) + +void * libsais_create_ctx_omp(int32_t threads) +{ + if (threads < 0) { return NULL; } + + threads = threads > 0 ? threads : omp_get_max_threads(); + return (void *)libsais_create_ctx_main(threads); +} + +int32_t libsais_omp(const uint8_t * T, int32_t * SA, int32_t n, int32_t fs, int32_t * freq, int32_t threads) +{ + if ((T == NULL) || (SA == NULL) || (n < 0) || (fs < 0) || (threads < 0)) + { + return -1; + } + else if (n < 2) + { + if (n == 1) { SA[0] = 0; } + return 0; + } + + threads = threads > 0 ? threads : omp_get_max_threads(); + + return libsais_main(T, SA, n, 0, 0, NULL, fs, freq, threads); +} + +int32_t libsais_bwt_omp(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t threads) +{ + if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0) || (threads < 0)) + { + return -1; + } + else if (n <= 1) + { + if (n == 1) { U[0] = T[0]; } + return n; + } + + threads = threads > 0 ? threads : omp_get_max_threads(); + + sa_sint_t index = libsais_main(T, A, n, 1, 0, NULL, fs, freq, threads); + if (index >= 0) + { + index++; + + U[0] = T[n - 1]; + libsais_bwt_copy_8u_omp(U + 1, A, index - 1, threads); + libsais_bwt_copy_8u_omp(U + index, A + index, n - index, threads); + } + + return index; +} + +int32_t libsais_bwt_aux_omp(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t r, int32_t * I, int32_t threads) +{ + if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0) || (r < 2) || ((r & (r - 1)) != 0) || (I == NULL) || (threads < 0)) + { + return -1; + } + else if (n <= 1) + { + if (n == 1) { U[0] = T[0];} + + I[0] = n; + return 0; + } + + threads = threads > 0 ? 
threads : omp_get_max_threads(); + + if (libsais_main(T, A, n, 1, r, I, fs, freq, threads) != 0) + { + return -2; + } + + U[0] = T[n - 1]; + libsais_bwt_copy_8u_omp(U + 1, A, I[0] - 1, threads); + libsais_bwt_copy_8u_omp(U + I[0], A + I[0], n - I[0], threads); + + return 0; +} + +#endif + +static LIBSAIS_UNBWT_CONTEXT * libsais_unbwt_create_ctx_main(sa_sint_t threads) +{ + LIBSAIS_UNBWT_CONTEXT * RESTRICT ctx = (LIBSAIS_UNBWT_CONTEXT *)libsais_alloc_aligned(sizeof(LIBSAIS_UNBWT_CONTEXT), 64); + sa_uint_t * RESTRICT bucket2 = (sa_uint_t *)libsais_alloc_aligned(ALPHABET_SIZE * ALPHABET_SIZE * sizeof(sa_uint_t), 4096); + uint16_t * RESTRICT fastbits = (uint16_t *)libsais_alloc_aligned((1 + (1 << UNBWT_FASTBITS)) * sizeof(uint16_t), 4096); + sa_uint_t * RESTRICT buckets = threads > 1 ? (sa_uint_t *)libsais_alloc_aligned((size_t)threads * (ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE)) * sizeof(sa_uint_t), 4096) : NULL; + + if (ctx != NULL && bucket2 != NULL && fastbits != NULL && (buckets != NULL || threads == 1)) + { + ctx->bucket2 = bucket2; + ctx->fastbits = fastbits; + ctx->buckets = buckets; + ctx->threads = threads; + + return ctx; + } + + libsais_free_aligned(buckets); + libsais_free_aligned(fastbits); + libsais_free_aligned(bucket2); + libsais_free_aligned(ctx); + + return NULL; +} + +static void libsais_unbwt_free_ctx_main(LIBSAIS_UNBWT_CONTEXT * ctx) +{ + if (ctx != NULL) + { + libsais_free_aligned(ctx->buckets); + libsais_free_aligned(ctx->fastbits); + libsais_free_aligned(ctx->bucket2); + libsais_free_aligned(ctx); + } +} + +static void libsais_unbwt_compute_histogram(const uint8_t * RESTRICT T, fast_sint_t n, sa_uint_t * RESTRICT count) +{ + const fast_sint_t prefetch_distance = 256; + + const uint8_t * RESTRICT T_p = T; + + if (n >= 1024) + { + sa_uint_t copy[4 * (ALPHABET_SIZE + 16)]; + + memset(copy, 0, 4 * (ALPHABET_SIZE + 16) * sizeof(sa_uint_t)); + + sa_uint_t * RESTRICT copy0 = copy + 0 * (ALPHABET_SIZE + 16); + sa_uint_t * RESTRICT copy1 = copy + 1 * (ALPHABET_SIZE + 16); + sa_uint_t * RESTRICT copy2 = copy + 2 * (ALPHABET_SIZE + 16); + sa_uint_t * RESTRICT copy3 = copy + 3 * (ALPHABET_SIZE + 16); + + for (; T_p < (uint8_t * )((ptrdiff_t)(T + 63) & (-64)); T_p += 1) { copy0[T_p[0]]++; } + + fast_uint_t x = ((const uint32_t *)(const void *)T_p)[0], y = ((const uint32_t *)(const void *)T_p)[1]; + + for (; T_p < (uint8_t * )((ptrdiff_t)(T + n - 8) & (-64)); T_p += 64) + { + libsais_prefetch(&T_p[prefetch_distance]); + + fast_uint_t z = ((const uint32_t *)(const void *)T_p)[2], w = ((const uint32_t *)(const void *)T_p)[3]; + copy0[(uint8_t)x]++; x >>= 8; copy1[(uint8_t)x]++; x >>= 8; copy2[(uint8_t)x]++; x >>= 8; copy3[x]++; + copy0[(uint8_t)y]++; y >>= 8; copy1[(uint8_t)y]++; y >>= 8; copy2[(uint8_t)y]++; y >>= 8; copy3[y]++; + + x = ((const uint32_t *)(const void *)T_p)[4]; y = ((const uint32_t *)(const void *)T_p)[5]; + copy0[(uint8_t)z]++; z >>= 8; copy1[(uint8_t)z]++; z >>= 8; copy2[(uint8_t)z]++; z >>= 8; copy3[z]++; + copy0[(uint8_t)w]++; w >>= 8; copy1[(uint8_t)w]++; w >>= 8; copy2[(uint8_t)w]++; w >>= 8; copy3[w]++; + + z = ((const uint32_t *)(const void *)T_p)[6]; w = ((const uint32_t *)(const void *)T_p)[7]; + copy0[(uint8_t)x]++; x >>= 8; copy1[(uint8_t)x]++; x >>= 8; copy2[(uint8_t)x]++; x >>= 8; copy3[x]++; + copy0[(uint8_t)y]++; y >>= 8; copy1[(uint8_t)y]++; y >>= 8; copy2[(uint8_t)y]++; y >>= 8; copy3[y]++; + + x = ((const uint32_t *)(const void *)T_p)[8]; y = ((const uint32_t *)(const void *)T_p)[9]; + copy0[(uint8_t)z]++; z >>= 8; copy1[(uint8_t)z]++; z >>= 
8; copy2[(uint8_t)z]++; z >>= 8; copy3[z]++; + copy0[(uint8_t)w]++; w >>= 8; copy1[(uint8_t)w]++; w >>= 8; copy2[(uint8_t)w]++; w >>= 8; copy3[w]++; + + z = ((const uint32_t *)(const void *)T_p)[10]; w = ((const uint32_t *)(const void *)T_p)[11]; + copy0[(uint8_t)x]++; x >>= 8; copy1[(uint8_t)x]++; x >>= 8; copy2[(uint8_t)x]++; x >>= 8; copy3[x]++; + copy0[(uint8_t)y]++; y >>= 8; copy1[(uint8_t)y]++; y >>= 8; copy2[(uint8_t)y]++; y >>= 8; copy3[y]++; + + x = ((const uint32_t *)(const void *)T_p)[12]; y = ((const uint32_t *)(const void *)T_p)[13]; + copy0[(uint8_t)z]++; z >>= 8; copy1[(uint8_t)z]++; z >>= 8; copy2[(uint8_t)z]++; z >>= 8; copy3[z]++; + copy0[(uint8_t)w]++; w >>= 8; copy1[(uint8_t)w]++; w >>= 8; copy2[(uint8_t)w]++; w >>= 8; copy3[w]++; + + z = ((const uint32_t *)(const void *)T_p)[14]; w = ((const uint32_t *)(const void *)T_p)[15]; + copy0[(uint8_t)x]++; x >>= 8; copy1[(uint8_t)x]++; x >>= 8; copy2[(uint8_t)x]++; x >>= 8; copy3[x]++; + copy0[(uint8_t)y]++; y >>= 8; copy1[(uint8_t)y]++; y >>= 8; copy2[(uint8_t)y]++; y >>= 8; copy3[y]++; + + x = ((const uint32_t *)(const void *)T_p)[16]; y = ((const uint32_t *)(const void *)T_p)[17]; + copy0[(uint8_t)z]++; z >>= 8; copy1[(uint8_t)z]++; z >>= 8; copy2[(uint8_t)z]++; z >>= 8; copy3[z]++; + copy0[(uint8_t)w]++; w >>= 8; copy1[(uint8_t)w]++; w >>= 8; copy2[(uint8_t)w]++; w >>= 8; copy3[w]++; + } + + copy0[(uint8_t)x]++; x >>= 8; copy1[(uint8_t)x]++; x >>= 8; copy2[(uint8_t)x]++; x >>= 8; copy3[x]++; + copy0[(uint8_t)y]++; y >>= 8; copy1[(uint8_t)y]++; y >>= 8; copy2[(uint8_t)y]++; y >>= 8; copy3[y]++; + + T_p += 8; + + fast_uint_t i; for (i = 0; i < ALPHABET_SIZE; i++) { count[i] += copy0[i] + copy1[i] + copy2[i] + copy3[i]; } + } + + for (; T_p < T + n; T_p += 1) { count[T_p[0]]++; } +} + +static void libsais_unbwt_transpose_bucket2(sa_uint_t * RESTRICT bucket2) +{ + fast_uint_t x, y, c, d; + for (x = 0; x != ALPHABET_SIZE; x += 16) + { + for (c = x; c != x + 16; ++c) + { + for (d = c + 1; d != x + 16; ++d) + { + sa_uint_t tmp = bucket2[(d << 8) + c]; bucket2[(d << 8) + c] = bucket2[(c << 8) + d]; bucket2[(c << 8) + d] = tmp; + } + } + + for (y = x + 16; y != ALPHABET_SIZE; y += 16) + { + for (c = x; c != x + 16; ++c) + { + sa_uint_t * bucket2_yc = &bucket2[(y << 8) + c]; + sa_uint_t * bucket2_cy = &bucket2[(c << 8) + y]; + + sa_uint_t tmp00 = bucket2_yc[ 0 * 256]; bucket2_yc[ 0 * 256] = bucket2_cy[ 0]; bucket2_cy[ 0] = tmp00; + sa_uint_t tmp01 = bucket2_yc[ 1 * 256]; bucket2_yc[ 1 * 256] = bucket2_cy[ 1]; bucket2_cy[ 1] = tmp01; + sa_uint_t tmp02 = bucket2_yc[ 2 * 256]; bucket2_yc[ 2 * 256] = bucket2_cy[ 2]; bucket2_cy[ 2] = tmp02; + sa_uint_t tmp03 = bucket2_yc[ 3 * 256]; bucket2_yc[ 3 * 256] = bucket2_cy[ 3]; bucket2_cy[ 3] = tmp03; + sa_uint_t tmp04 = bucket2_yc[ 4 * 256]; bucket2_yc[ 4 * 256] = bucket2_cy[ 4]; bucket2_cy[ 4] = tmp04; + sa_uint_t tmp05 = bucket2_yc[ 5 * 256]; bucket2_yc[ 5 * 256] = bucket2_cy[ 5]; bucket2_cy[ 5] = tmp05; + sa_uint_t tmp06 = bucket2_yc[ 6 * 256]; bucket2_yc[ 6 * 256] = bucket2_cy[ 6]; bucket2_cy[ 6] = tmp06; + sa_uint_t tmp07 = bucket2_yc[ 7 * 256]; bucket2_yc[ 7 * 256] = bucket2_cy[ 7]; bucket2_cy[ 7] = tmp07; + sa_uint_t tmp08 = bucket2_yc[ 8 * 256]; bucket2_yc[ 8 * 256] = bucket2_cy[ 8]; bucket2_cy[ 8] = tmp08; + sa_uint_t tmp09 = bucket2_yc[ 9 * 256]; bucket2_yc[ 9 * 256] = bucket2_cy[ 9]; bucket2_cy[ 9] = tmp09; + sa_uint_t tmp10 = bucket2_yc[10 * 256]; bucket2_yc[10 * 256] = bucket2_cy[10]; bucket2_cy[10] = tmp10; + sa_uint_t tmp11 = bucket2_yc[11 * 256]; bucket2_yc[11 * 256] = 
bucket2_cy[11]; bucket2_cy[11] = tmp11; + sa_uint_t tmp12 = bucket2_yc[12 * 256]; bucket2_yc[12 * 256] = bucket2_cy[12]; bucket2_cy[12] = tmp12; + sa_uint_t tmp13 = bucket2_yc[13 * 256]; bucket2_yc[13 * 256] = bucket2_cy[13]; bucket2_cy[13] = tmp13; + sa_uint_t tmp14 = bucket2_yc[14 * 256]; bucket2_yc[14 * 256] = bucket2_cy[14]; bucket2_cy[14] = tmp14; + sa_uint_t tmp15 = bucket2_yc[15 * 256]; bucket2_yc[15 * 256] = bucket2_cy[15]; bucket2_cy[15] = tmp15; + } + } + } +} + +static void libsais_unbwt_compute_bigram_histogram_single(const uint8_t * RESTRICT T, sa_uint_t * RESTRICT bucket1, sa_uint_t * RESTRICT bucket2, fast_uint_t index) +{ + fast_uint_t sum, c; + for (sum = 1, c = 0; c < ALPHABET_SIZE; ++c) + { + fast_uint_t prev = sum; sum += bucket1[c]; bucket1[c] = (sa_uint_t)prev; + if (prev != sum) + { + sa_uint_t * RESTRICT bucket2_p = &bucket2[c << 8]; + + { + fast_uint_t hi = index; if (sum < hi) { hi = sum; } + libsais_unbwt_compute_histogram(&T[prev], (fast_sint_t)(hi - prev), bucket2_p); + } + + { + fast_uint_t lo = index + 1; if (prev > lo) { lo = prev; } + libsais_unbwt_compute_histogram(&T[lo - 1], (fast_sint_t)(sum - lo), bucket2_p); + } + } + } + + libsais_unbwt_transpose_bucket2(bucket2); +} + +static void libsais_unbwt_calculate_fastbits(sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t lastc, fast_uint_t shift) +{ + fast_uint_t v, w, sum, c, d; + for (v = 0, w = 0, sum = 1, c = 0; c < ALPHABET_SIZE; ++c) + { + if (c == lastc) { sum += 1; } + + for (d = 0; d < ALPHABET_SIZE; ++d, ++w) + { + fast_uint_t prev = sum; sum += bucket2[w]; bucket2[w] = (sa_uint_t)prev; + if (prev != sum) + { + for (; v <= ((sum - 1) >> shift); ++v) { fastbits[v] = (uint16_t)w; } + } + } + } +} + +static void libsais_unbwt_calculate_biPSI(const uint8_t * RESTRICT T, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket1, sa_uint_t * RESTRICT bucket2, fast_uint_t index, fast_sint_t omp_block_start, fast_sint_t omp_block_end) +{ + { + fast_sint_t i = omp_block_start, j = (fast_sint_t)index; if (omp_block_end < j) { j = omp_block_end; } + for (; i < j; ++i) + { + fast_uint_t c = T[i]; + fast_uint_t p = bucket1[c]++; + fast_sint_t t = (fast_sint_t)(index - p); + + if (t != 0) + { + fast_uint_t w = (((fast_uint_t)T[p + (fast_uint_t)(t >> ((sizeof(fast_sint_t) * 8) - 1))]) << 8) + c; + P[bucket2[w]++] = (sa_uint_t)i; + } + } + } + + { + fast_sint_t i = (fast_sint_t)index, j = omp_block_end; if (omp_block_start > i) { i = omp_block_start; } + for (i += 1; i <= j; ++i) + { + fast_uint_t c = T[i - 1]; + fast_uint_t p = bucket1[c]++; + fast_sint_t t = (fast_sint_t)(index - p); + + if (t != 0) + { + fast_uint_t w = (((fast_uint_t)T[p + (fast_uint_t)(t >> ((sizeof(fast_sint_t) * 8) - 1))]) << 8) + c; + P[bucket2[w]++] = (sa_uint_t)i; + } + } + } +} + +static void libsais_unbwt_init_single(const uint8_t * RESTRICT T, sa_uint_t * RESTRICT P, sa_sint_t n, const sa_sint_t * freq, const sa_uint_t * RESTRICT I, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits) +{ + sa_uint_t bucket1[ALPHABET_SIZE]; + + fast_uint_t index = I[0]; + fast_uint_t lastc = T[0]; + fast_uint_t shift = 0; while ((n >> shift) > (1 << UNBWT_FASTBITS)) { shift++; } + + if (freq != NULL) + { + memcpy(bucket1, freq, ALPHABET_SIZE * sizeof(sa_uint_t)); + } + else + { + memset(bucket1, 0, ALPHABET_SIZE * sizeof(sa_uint_t)); + libsais_unbwt_compute_histogram(T, n, bucket1); + } + + memset(bucket2, 0, ALPHABET_SIZE * ALPHABET_SIZE * sizeof(sa_uint_t)); + libsais_unbwt_compute_bigram_histogram_single(T, bucket1, bucket2, 
index); + + libsais_unbwt_calculate_fastbits(bucket2, fastbits, lastc, shift); + libsais_unbwt_calculate_biPSI(T, P, bucket1, bucket2, index, 0, n); +} + +#if defined(_OPENMP) + +static void libsais_unbwt_compute_bigram_histogram_parallel(const uint8_t * RESTRICT T, fast_uint_t index, sa_uint_t * RESTRICT bucket1, sa_uint_t * RESTRICT bucket2, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + fast_sint_t i; + for (i = omp_block_start; i < omp_block_start + omp_block_size; ++i) + { + fast_uint_t c = T[i]; + fast_uint_t p = bucket1[c]++; + fast_sint_t t = (fast_sint_t)(index - p); + + if (t != 0) + { + fast_uint_t w = (((fast_uint_t)T[p + (fast_uint_t)(t >> ((sizeof(fast_sint_t) * 8) - 1))]) << 8) + c; + bucket2[w]++; + } + } +} + +static void libsais_unbwt_init_parallel(const uint8_t * RESTRICT T, sa_uint_t * RESTRICT P, sa_sint_t n, const sa_sint_t * freq, const sa_uint_t * RESTRICT I, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, sa_uint_t * RESTRICT buckets, sa_sint_t threads) +{ + sa_uint_t bucket1[ALPHABET_SIZE]; + + fast_uint_t index = I[0]; + fast_uint_t lastc = T[0]; + fast_uint_t shift = 0; while ((n >> shift) > (1 << UNBWT_FASTBITS)) { shift++; } + + memset(bucket1, 0, ALPHABET_SIZE * sizeof(sa_uint_t)); + memset(bucket2, 0, ALPHABET_SIZE * ALPHABET_SIZE * sizeof(sa_uint_t)); + + #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536) + { + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); + + if (omp_num_threads == 1) + { + libsais_unbwt_init_single(T, P, n, freq, I, bucket2, fastbits); + } + else + { + sa_uint_t * RESTRICT bucket1_local = buckets + omp_thread_num * (ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE)); + sa_uint_t * RESTRICT bucket2_local = bucket1_local + ALPHABET_SIZE; + + fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start; + + { + memset(bucket1_local, 0, ALPHABET_SIZE * sizeof(sa_uint_t)); + libsais_unbwt_compute_histogram(T + omp_block_start, omp_block_size, bucket1_local); + } + + #pragma omp barrier + + #pragma omp master + { + { + sa_uint_t * RESTRICT bucket1_temp = buckets; + + fast_sint_t t; + for (t = 0; t < omp_num_threads; ++t, bucket1_temp += ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE)) + { + fast_sint_t c; for (c = 0; c < ALPHABET_SIZE; c += 1) { sa_uint_t A = bucket1[c], B = bucket1_temp[c]; bucket1[c] = A + B; bucket1_temp[c] = A; } + } + } + + { + fast_uint_t sum, c; + for (sum = 1, c = 0; c < ALPHABET_SIZE; ++c) { fast_uint_t prev = sum; sum += bucket1[c]; bucket1[c] = (sa_uint_t)prev; } + } + } + + #pragma omp barrier + + { + fast_sint_t c; for (c = 0; c < ALPHABET_SIZE; c += 1) { sa_uint_t A = bucket1[c], B = bucket1_local[c]; bucket1_local[c] = A + B; } + + memset(bucket2_local, 0, ALPHABET_SIZE * ALPHABET_SIZE * sizeof(sa_uint_t)); + libsais_unbwt_compute_bigram_histogram_parallel(T, index, bucket1_local, bucket2_local, omp_block_start, omp_block_size); + } + + #pragma omp barrier + + { + fast_sint_t omp_bucket2_stride = ((ALPHABET_SIZE * ALPHABET_SIZE) / omp_num_threads) & (-16); + fast_sint_t omp_bucket2_start = omp_thread_num * omp_bucket2_stride; + fast_sint_t omp_bucket2_size = omp_thread_num < omp_num_threads - 1 ? 
omp_bucket2_stride : (ALPHABET_SIZE * ALPHABET_SIZE) - omp_bucket2_start; + + sa_uint_t * RESTRICT bucket2_temp = buckets + ALPHABET_SIZE; + + fast_sint_t t; + for (t = 0; t < omp_num_threads; ++t, bucket2_temp += ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE)) + { + fast_sint_t c; for (c = omp_bucket2_start; c < omp_bucket2_start + omp_bucket2_size; c += 1) { sa_uint_t A = bucket2[c], B = bucket2_temp[c]; bucket2[c] = A + B; bucket2_temp[c] = A; } + } + } + + #pragma omp barrier + + #pragma omp master + { + + libsais_unbwt_calculate_fastbits(bucket2, fastbits, lastc, shift); + + { + fast_sint_t t; + for (t = omp_num_threads - 1; t >= 1; --t) + { + sa_uint_t * RESTRICT dst_bucket1 = buckets + t * (ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE)); + sa_uint_t * RESTRICT src_bucket1 = dst_bucket1 - (ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE)); + + memcpy(dst_bucket1, src_bucket1, ALPHABET_SIZE * sizeof(sa_uint_t)); + } + + memcpy(buckets, bucket1, ALPHABET_SIZE * sizeof(sa_uint_t)); + } + } + + #pragma omp barrier + + { + fast_sint_t c; for (c = 0; c < ALPHABET_SIZE * ALPHABET_SIZE; c += 1) { sa_uint_t A = bucket2[c], B = bucket2_local[c]; bucket2_local[c] = A + B; } + + libsais_unbwt_calculate_biPSI(T, P, bucket1_local, bucket2_local, index, omp_block_start, omp_block_start + omp_block_size); + } + + #pragma omp barrier + + #pragma omp master + { + memcpy(bucket2, buckets + ALPHABET_SIZE + (omp_num_threads - 1) * (ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE)), ALPHABET_SIZE * ALPHABET_SIZE * sizeof(sa_uint_t)); + } + } + } +} + +#endif + +static void libsais_unbwt_decode_1(uint8_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t * i0, fast_uint_t k) +{ + uint16_t * RESTRICT U0 = (uint16_t *)(void *)U; + + fast_uint_t i, p0 = *i0; + + for (i = 0; i != k; ++i) + { + uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while (bucket2[c0] <= p0); } p0 = P[p0]; U0[i] = libsais_bswap16(c0); + } + + *i0 = p0; +} + +static void libsais_unbwt_decode_2(uint8_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0, fast_uint_t * i1, fast_uint_t k) +{ + uint16_t * RESTRICT U0 = (uint16_t *)(void *)U; + uint16_t * RESTRICT U1 = (uint16_t *)(void *)(((uint8_t *)U0) + r); + + fast_uint_t i, p0 = *i0, p1 = *i1; + + for (i = 0; i != k; ++i) + { + uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while (bucket2[c0] <= p0); } p0 = P[p0]; U0[i] = libsais_bswap16(c0); + uint16_t c1 = fastbits[p1 >> shift]; if (bucket2[c1] <= p1) { do { c1++; } while (bucket2[c1] <= p1); } p1 = P[p1]; U1[i] = libsais_bswap16(c1); + } + + *i0 = p0; *i1 = p1; +} + +static void libsais_unbwt_decode_3(uint8_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0, fast_uint_t * i1, fast_uint_t * i2, fast_uint_t k) +{ + uint16_t * RESTRICT U0 = (uint16_t *)(void *)U; + uint16_t * RESTRICT U1 = (uint16_t *)(void *)(((uint8_t *)U0) + r); + uint16_t * RESTRICT U2 = (uint16_t *)(void *)(((uint8_t *)U1) + r); + + fast_uint_t i, p0 = *i0, p1 = *i1, p2 = *i2; + + for (i = 0; i != k; ++i) + { + uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while (bucket2[c0] <= p0); } p0 = P[p0]; U0[i] = libsais_bswap16(c0); + uint16_t c1 = fastbits[p1 >> shift]; if (bucket2[c1] <= p1) { do { c1++; } while 
(bucket2[c1] <= p1); } p1 = P[p1]; U1[i] = libsais_bswap16(c1); + uint16_t c2 = fastbits[p2 >> shift]; if (bucket2[c2] <= p2) { do { c2++; } while (bucket2[c2] <= p2); } p2 = P[p2]; U2[i] = libsais_bswap16(c2); + } + + *i0 = p0; *i1 = p1; *i2 = p2; +} + +static void libsais_unbwt_decode_4(uint8_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0, fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3, fast_uint_t k) +{ + uint16_t * RESTRICT U0 = (uint16_t *)(void *)U; + uint16_t * RESTRICT U1 = (uint16_t *)(void *)(((uint8_t *)U0) + r); + uint16_t * RESTRICT U2 = (uint16_t *)(void *)(((uint8_t *)U1) + r); + uint16_t * RESTRICT U3 = (uint16_t *)(void *)(((uint8_t *)U2) + r); + + fast_uint_t i, p0 = *i0, p1 = *i1, p2 = *i2, p3 = *i3; + + for (i = 0; i != k; ++i) + { + uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while (bucket2[c0] <= p0); } p0 = P[p0]; U0[i] = libsais_bswap16(c0); + uint16_t c1 = fastbits[p1 >> shift]; if (bucket2[c1] <= p1) { do { c1++; } while (bucket2[c1] <= p1); } p1 = P[p1]; U1[i] = libsais_bswap16(c1); + uint16_t c2 = fastbits[p2 >> shift]; if (bucket2[c2] <= p2) { do { c2++; } while (bucket2[c2] <= p2); } p2 = P[p2]; U2[i] = libsais_bswap16(c2); + uint16_t c3 = fastbits[p3 >> shift]; if (bucket2[c3] <= p3) { do { c3++; } while (bucket2[c3] <= p3); } p3 = P[p3]; U3[i] = libsais_bswap16(c3); + } + + *i0 = p0; *i1 = p1; *i2 = p2; *i3 = p3; +} + +static void libsais_unbwt_decode_5(uint8_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0, fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3, fast_uint_t * i4, fast_uint_t k) +{ + uint16_t * RESTRICT U0 = (uint16_t *)(void *)U; + uint16_t * RESTRICT U1 = (uint16_t *)(void *)(((uint8_t *)U0) + r); + uint16_t * RESTRICT U2 = (uint16_t *)(void *)(((uint8_t *)U1) + r); + uint16_t * RESTRICT U3 = (uint16_t *)(void *)(((uint8_t *)U2) + r); + uint16_t * RESTRICT U4 = (uint16_t *)(void *)(((uint8_t *)U3) + r); + + fast_uint_t i, p0 = *i0, p1 = *i1, p2 = *i2, p3 = *i3, p4 = *i4; + + for (i = 0; i != k; ++i) + { + uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while (bucket2[c0] <= p0); } p0 = P[p0]; U0[i] = libsais_bswap16(c0); + uint16_t c1 = fastbits[p1 >> shift]; if (bucket2[c1] <= p1) { do { c1++; } while (bucket2[c1] <= p1); } p1 = P[p1]; U1[i] = libsais_bswap16(c1); + uint16_t c2 = fastbits[p2 >> shift]; if (bucket2[c2] <= p2) { do { c2++; } while (bucket2[c2] <= p2); } p2 = P[p2]; U2[i] = libsais_bswap16(c2); + uint16_t c3 = fastbits[p3 >> shift]; if (bucket2[c3] <= p3) { do { c3++; } while (bucket2[c3] <= p3); } p3 = P[p3]; U3[i] = libsais_bswap16(c3); + uint16_t c4 = fastbits[p4 >> shift]; if (bucket2[c4] <= p4) { do { c4++; } while (bucket2[c4] <= p4); } p4 = P[p4]; U4[i] = libsais_bswap16(c4); + } + + *i0 = p0; *i1 = p1; *i2 = p2; *i3 = p3; *i4 = p4; +} + +static void libsais_unbwt_decode_6(uint8_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0, fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3, fast_uint_t * i4, fast_uint_t * i5, fast_uint_t k) +{ + uint16_t * RESTRICT U0 = (uint16_t *)(void *)U; + uint16_t * RESTRICT U1 = (uint16_t *)(void *)(((uint8_t *)U0) + r); + uint16_t * RESTRICT U2 = (uint16_t *)(void *)(((uint8_t *)U1) + r); + uint16_t * RESTRICT U3 = 
(uint16_t *)(void *)(((uint8_t *)U2) + r); + uint16_t * RESTRICT U4 = (uint16_t *)(void *)(((uint8_t *)U3) + r); + uint16_t * RESTRICT U5 = (uint16_t *)(void *)(((uint8_t *)U4) + r); + + fast_uint_t i, p0 = *i0, p1 = *i1, p2 = *i2, p3 = *i3, p4 = *i4, p5 = *i5; + + for (i = 0; i != k; ++i) + { + uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while (bucket2[c0] <= p0); } p0 = P[p0]; U0[i] = libsais_bswap16(c0); + uint16_t c1 = fastbits[p1 >> shift]; if (bucket2[c1] <= p1) { do { c1++; } while (bucket2[c1] <= p1); } p1 = P[p1]; U1[i] = libsais_bswap16(c1); + uint16_t c2 = fastbits[p2 >> shift]; if (bucket2[c2] <= p2) { do { c2++; } while (bucket2[c2] <= p2); } p2 = P[p2]; U2[i] = libsais_bswap16(c2); + uint16_t c3 = fastbits[p3 >> shift]; if (bucket2[c3] <= p3) { do { c3++; } while (bucket2[c3] <= p3); } p3 = P[p3]; U3[i] = libsais_bswap16(c3); + uint16_t c4 = fastbits[p4 >> shift]; if (bucket2[c4] <= p4) { do { c4++; } while (bucket2[c4] <= p4); } p4 = P[p4]; U4[i] = libsais_bswap16(c4); + uint16_t c5 = fastbits[p5 >> shift]; if (bucket2[c5] <= p5) { do { c5++; } while (bucket2[c5] <= p5); } p5 = P[p5]; U5[i] = libsais_bswap16(c5); + } + + *i0 = p0; *i1 = p1; *i2 = p2; *i3 = p3; *i4 = p4; *i5 = p5; +} + +static void libsais_unbwt_decode_7(uint8_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0, fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3, fast_uint_t * i4, fast_uint_t * i5, fast_uint_t * i6, fast_uint_t k) +{ + uint16_t * RESTRICT U0 = (uint16_t *)(void *)U; + uint16_t * RESTRICT U1 = (uint16_t *)(void *)(((uint8_t *)U0) + r); + uint16_t * RESTRICT U2 = (uint16_t *)(void *)(((uint8_t *)U1) + r); + uint16_t * RESTRICT U3 = (uint16_t *)(void *)(((uint8_t *)U2) + r); + uint16_t * RESTRICT U4 = (uint16_t *)(void *)(((uint8_t *)U3) + r); + uint16_t * RESTRICT U5 = (uint16_t *)(void *)(((uint8_t *)U4) + r); + uint16_t * RESTRICT U6 = (uint16_t *)(void *)(((uint8_t *)U5) + r); + + fast_uint_t i, p0 = *i0, p1 = *i1, p2 = *i2, p3 = *i3, p4 = *i4, p5 = *i5, p6 = *i6; + + for (i = 0; i != k; ++i) + { + uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while (bucket2[c0] <= p0); } p0 = P[p0]; U0[i] = libsais_bswap16(c0); + uint16_t c1 = fastbits[p1 >> shift]; if (bucket2[c1] <= p1) { do { c1++; } while (bucket2[c1] <= p1); } p1 = P[p1]; U1[i] = libsais_bswap16(c1); + uint16_t c2 = fastbits[p2 >> shift]; if (bucket2[c2] <= p2) { do { c2++; } while (bucket2[c2] <= p2); } p2 = P[p2]; U2[i] = libsais_bswap16(c2); + uint16_t c3 = fastbits[p3 >> shift]; if (bucket2[c3] <= p3) { do { c3++; } while (bucket2[c3] <= p3); } p3 = P[p3]; U3[i] = libsais_bswap16(c3); + uint16_t c4 = fastbits[p4 >> shift]; if (bucket2[c4] <= p4) { do { c4++; } while (bucket2[c4] <= p4); } p4 = P[p4]; U4[i] = libsais_bswap16(c4); + uint16_t c5 = fastbits[p5 >> shift]; if (bucket2[c5] <= p5) { do { c5++; } while (bucket2[c5] <= p5); } p5 = P[p5]; U5[i] = libsais_bswap16(c5); + uint16_t c6 = fastbits[p6 >> shift]; if (bucket2[c6] <= p6) { do { c6++; } while (bucket2[c6] <= p6); } p6 = P[p6]; U6[i] = libsais_bswap16(c6); + } + + *i0 = p0; *i1 = p1; *i2 = p2; *i3 = p3; *i4 = p4; *i5 = p5; *i6 = p6; +} + +static void libsais_unbwt_decode_8(uint8_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0, fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3, fast_uint_t * i4, fast_uint_t * 
i5, fast_uint_t * i6, fast_uint_t * i7, fast_uint_t k) +{ + uint16_t * RESTRICT U0 = (uint16_t *)(void *)U; + uint16_t * RESTRICT U1 = (uint16_t *)(void *)(((uint8_t *)U0) + r); + uint16_t * RESTRICT U2 = (uint16_t *)(void *)(((uint8_t *)U1) + r); + uint16_t * RESTRICT U3 = (uint16_t *)(void *)(((uint8_t *)U2) + r); + uint16_t * RESTRICT U4 = (uint16_t *)(void *)(((uint8_t *)U3) + r); + uint16_t * RESTRICT U5 = (uint16_t *)(void *)(((uint8_t *)U4) + r); + uint16_t * RESTRICT U6 = (uint16_t *)(void *)(((uint8_t *)U5) + r); + uint16_t * RESTRICT U7 = (uint16_t *)(void *)(((uint8_t *)U6) + r); + + fast_uint_t i, p0 = *i0, p1 = *i1, p2 = *i2, p3 = *i3, p4 = *i4, p5 = *i5, p6 = *i6, p7 = *i7; + + for (i = 0; i != k; ++i) + { + uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while (bucket2[c0] <= p0); } p0 = P[p0]; U0[i] = libsais_bswap16(c0); + uint16_t c1 = fastbits[p1 >> shift]; if (bucket2[c1] <= p1) { do { c1++; } while (bucket2[c1] <= p1); } p1 = P[p1]; U1[i] = libsais_bswap16(c1); + uint16_t c2 = fastbits[p2 >> shift]; if (bucket2[c2] <= p2) { do { c2++; } while (bucket2[c2] <= p2); } p2 = P[p2]; U2[i] = libsais_bswap16(c2); + uint16_t c3 = fastbits[p3 >> shift]; if (bucket2[c3] <= p3) { do { c3++; } while (bucket2[c3] <= p3); } p3 = P[p3]; U3[i] = libsais_bswap16(c3); + uint16_t c4 = fastbits[p4 >> shift]; if (bucket2[c4] <= p4) { do { c4++; } while (bucket2[c4] <= p4); } p4 = P[p4]; U4[i] = libsais_bswap16(c4); + uint16_t c5 = fastbits[p5 >> shift]; if (bucket2[c5] <= p5) { do { c5++; } while (bucket2[c5] <= p5); } p5 = P[p5]; U5[i] = libsais_bswap16(c5); + uint16_t c6 = fastbits[p6 >> shift]; if (bucket2[c6] <= p6) { do { c6++; } while (bucket2[c6] <= p6); } p6 = P[p6]; U6[i] = libsais_bswap16(c6); + uint16_t c7 = fastbits[p7 >> shift]; if (bucket2[c7] <= p7) { do { c7++; } while (bucket2[c7] <= p7); } p7 = P[p7]; U7[i] = libsais_bswap16(c7); + } + + *i0 = p0; *i1 = p1; *i2 = p2; *i3 = p3; *i4 = p4; *i5 = p5; *i6 = p6; *i7 = p7; +} + +static void libsais_unbwt_decode(uint8_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_sint_t n, sa_sint_t r, const sa_uint_t * RESTRICT I, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_sint_t blocks, fast_uint_t reminder) +{ + fast_uint_t shift = 0; while ((n >> shift) > (1 << UNBWT_FASTBITS)) { shift++; } + fast_uint_t offset = 0; + + while (blocks > 8) + { + fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3], i4 = I[4], i5 = I[5], i6 = I[6], i7 = I[7]; + libsais_unbwt_decode_8(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5, &i6, &i7, (fast_uint_t)r >> 1); + I += 8; blocks -= 8; offset += 8 * (fast_uint_t)r; + } + + if (blocks == 1) + { + fast_uint_t i0 = I[0]; + libsais_unbwt_decode_1(U + offset, P, bucket2, fastbits, shift, &i0, reminder >> 1); + } + else if (blocks == 2) + { + fast_uint_t i0 = I[0], i1 = I[1]; + libsais_unbwt_decode_2(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, reminder >> 1); + libsais_unbwt_decode_1(U + offset + 2 * (reminder >> 1), P, bucket2, fastbits, shift, &i0, ((fast_uint_t)r >> 1) - (reminder >> 1)); + } + else if (blocks == 3) + { + fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2]; + libsais_unbwt_decode_3(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, reminder >> 1); + libsais_unbwt_decode_2(U + offset + 2 * (reminder >> 1), P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, ((fast_uint_t)r >> 1) - (reminder >> 1)); + } + else if (blocks == 4) + { + fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], 
i3 = I[3]; + libsais_unbwt_decode_4(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, reminder >> 1); + libsais_unbwt_decode_3(U + offset + 2 * (reminder >> 1), P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, ((fast_uint_t)r >> 1) - (reminder >> 1)); + } + else if (blocks == 5) + { + fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3], i4 = I[4]; + libsais_unbwt_decode_5(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, reminder >> 1); + libsais_unbwt_decode_4(U + offset + 2 * (reminder >> 1), P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, ((fast_uint_t)r >> 1) - (reminder >> 1)); + } + else if (blocks == 6) + { + fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3], i4 = I[4], i5 = I[5]; + libsais_unbwt_decode_6(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5, reminder >> 1); + libsais_unbwt_decode_5(U + offset + 2 * (reminder >> 1), P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, ((fast_uint_t)r >> 1) - (reminder >> 1)); + } + else if (blocks == 7) + { + fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3], i4 = I[4], i5 = I[5], i6 = I[6]; + libsais_unbwt_decode_7(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5, &i6, reminder >> 1); + libsais_unbwt_decode_6(U + offset + 2 * (reminder >> 1), P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5, ((fast_uint_t)r >> 1) - (reminder >> 1)); + } + else + { + fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3], i4 = I[4], i5 = I[5], i6 = I[6], i7 = I[7]; + libsais_unbwt_decode_8(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5, &i6, &i7, reminder >> 1); + libsais_unbwt_decode_7(U + offset + 2 * (reminder >> 1), P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5, &i6, ((fast_uint_t)r >> 1) - (reminder >> 1)); + } +} + +static void libsais_unbwt_decode_omp(const uint8_t * RESTRICT T, uint8_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_sint_t n, sa_sint_t r, const sa_uint_t * RESTRICT I, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, sa_sint_t threads) +{ + fast_uint_t lastc = T[0]; + fast_sint_t blocks = 1 + (((fast_sint_t)n - 1) / (fast_sint_t)r); + fast_uint_t reminder = (fast_uint_t)n - ((fast_uint_t)r * ((fast_uint_t)blocks - 1)); + +#if defined(_OPENMP) + fast_sint_t max_threads = blocks < threads ? blocks : threads; + #pragma omp parallel num_threads(max_threads) if(max_threads > 1 && n >= 65536) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + + fast_sint_t omp_block_stride = blocks / omp_num_threads; + fast_sint_t omp_block_reminder = blocks % omp_num_threads; + fast_sint_t omp_block_size = omp_block_stride + (omp_thread_num < omp_block_reminder); + fast_sint_t omp_block_start = omp_block_stride * omp_thread_num + (omp_thread_num < omp_block_reminder ? omp_thread_num : omp_block_reminder); + + libsais_unbwt_decode(U + r * omp_block_start, P, n, r, I + omp_block_start, bucket2, fastbits, omp_block_size, omp_thread_num < omp_num_threads - 1 ? 
(fast_uint_t)r : reminder); + } + + U[n - 1] = (uint8_t)lastc; +} + +static sa_sint_t libsais_unbwt_core(const uint8_t * RESTRICT T, uint8_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_sint_t n, const sa_sint_t * freq, sa_sint_t r, const sa_uint_t * RESTRICT I, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, sa_uint_t * RESTRICT buckets, sa_sint_t threads) +{ +#if defined(_OPENMP) + if (threads > 1 && n >= 262144) + { + libsais_unbwt_init_parallel(T, P, n, freq, I, bucket2, fastbits, buckets, threads); + } + else +#else + UNUSED(buckets); +#endif + { + libsais_unbwt_init_single(T, P, n, freq, I, bucket2, fastbits); + } + + libsais_unbwt_decode_omp(T, U, P, n, r, I, bucket2, fastbits, threads); + return 0; +} + +static sa_sint_t libsais_unbwt_main(const uint8_t * T, uint8_t * U, sa_uint_t * P, sa_sint_t n, const sa_sint_t * freq, sa_sint_t r, const sa_uint_t * I, sa_sint_t threads) +{ + fast_uint_t shift = 0; while ((n >> shift) > (1 << UNBWT_FASTBITS)) { shift++; } + + sa_uint_t * RESTRICT bucket2 = (sa_uint_t *)libsais_alloc_aligned(ALPHABET_SIZE * ALPHABET_SIZE * sizeof(sa_uint_t), 4096); + uint16_t * RESTRICT fastbits = (uint16_t *)libsais_alloc_aligned(((size_t)1 + (size_t)(n >> shift)) * sizeof(uint16_t), 4096); + sa_uint_t * RESTRICT buckets = threads > 1 && n >= 262144 ? (sa_uint_t *)libsais_alloc_aligned((size_t)threads * (ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE)) * sizeof(sa_uint_t), 4096) : NULL; + + sa_sint_t index = bucket2 != NULL && fastbits != NULL && (buckets != NULL || threads == 1 || n < 262144) + ? libsais_unbwt_core(T, U, P, n, freq, r, I, bucket2, fastbits, buckets, threads) + : -2; + + libsais_free_aligned(buckets); + libsais_free_aligned(fastbits); + libsais_free_aligned(bucket2); + + return index; +} + +static sa_sint_t libsais_unbwt_main_ctx(const LIBSAIS_UNBWT_CONTEXT * ctx, const uint8_t * T, uint8_t * U, sa_uint_t * P, sa_sint_t n, const sa_sint_t * freq, sa_sint_t r, const sa_uint_t * I) +{ + return ctx != NULL && ctx->bucket2 != NULL && ctx->fastbits != NULL && (ctx->buckets != NULL || ctx->threads == 1) + ? 
libsais_unbwt_core(T, U, P, n, freq, r, I, ctx->bucket2, ctx->fastbits, ctx->buckets, (sa_sint_t)ctx->threads) + : -2; +} + +void * libsais_unbwt_create_ctx(void) +{ + return (void *)libsais_unbwt_create_ctx_main(1); +} + +void libsais_unbwt_free_ctx(void * ctx) +{ + libsais_unbwt_free_ctx_main((LIBSAIS_UNBWT_CONTEXT *)ctx); +} + +int32_t libsais_unbwt(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t i) +{ + return libsais_unbwt_aux(T, U, A, n, freq, n, &i); +} + +int32_t libsais_unbwt_ctx(const void * ctx, const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t i) +{ + return libsais_unbwt_aux_ctx(ctx, T, U, A, n, freq, n, &i); +} + +int32_t libsais_unbwt_aux(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t r, const int32_t * I) +{ + if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || ((r != n) && ((r < 2) || ((r & (r - 1)) != 0))) || (I == NULL)) + { + return -1; + } + else if (n <= 1) + { + if (I[0] != n) { return -1; } + if (n == 1) { U[0] = T[0]; } + return 0; + } + + fast_sint_t t; for (t = 0; t <= (n - 1) / r; ++t) { if (I[t] <= 0 || I[t] > n) { return -1; } } + + return libsais_unbwt_main(T, U, (sa_uint_t *)A, n, freq, r, (const sa_uint_t *)I, 1); +} + +int32_t libsais_unbwt_aux_ctx(const void * ctx, const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t r, const int32_t * I) +{ + if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || ((r != n) && ((r < 2) || ((r & (r - 1)) != 0))) || (I == NULL)) + { + return -1; + } + else if (n <= 1) + { + if (I[0] != n) { return -1; } + if (n == 1) { U[0] = T[0]; } + return 0; + } + + fast_sint_t t; for (t = 0; t <= (n - 1) / r; ++t) { if (I[t] <= 0 || I[t] > n) { return -1; } } + + return libsais_unbwt_main_ctx((const LIBSAIS_UNBWT_CONTEXT *)ctx, T, U, (sa_uint_t *)A, n, freq, r, (const sa_uint_t *)I); +} + +#if defined(_OPENMP) + +void * libsais_unbwt_create_ctx_omp(int32_t threads) +{ + if (threads < 0) { return NULL; } + + threads = threads > 0 ? threads : omp_get_max_threads(); + return (void *)libsais_unbwt_create_ctx_main(threads); +} + +int32_t libsais_unbwt_omp(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t i, int32_t threads) +{ + return libsais_unbwt_aux_omp(T, U, A, n, freq, n, &i, threads); +} + +int32_t libsais_unbwt_aux_omp(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t r, const int32_t * I, int32_t threads) +{ + if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || ((r != n) && ((r < 2) || ((r & (r - 1)) != 0))) || (I == NULL) || (threads < 0)) + { + return -1; + } + else if (n <= 1) + { + if (I[0] != n) { return -1; } + if (n == 1) { U[0] = T[0]; } + return 0; + } + + fast_sint_t t; for (t = 0; t <= (n - 1) / r; ++t) { if (I[t] <= 0 || I[t] > n) { return -1; } } + + threads = threads > 0 ? threads : omp_get_max_threads(); + return libsais_unbwt_main(T, U, (sa_uint_t *)A, n, freq, r, (const sa_uint_t *)I, threads); +} + +#endif diff --git a/libsais/libsais.h b/libsais/libsais.h new file mode 100644 index 0000000..c655d67 --- /dev/null +++ b/libsais/libsais.h @@ -0,0 +1,285 @@ +/*-- + +This file is a part of libsais, a library for linear time +suffix array and burrows wheeler transform construction. + + Copyright (c) 2021 Ilya Grebnov + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+
+Please see the file LICENSE for full copyright information.
+
+--*/
+
+#ifndef LIBSAIS_H
+#define LIBSAIS_H 1
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+    #include <stdint.h>
+
+    /**
+    * Creates the libsais context that allows reusing allocated memory with each libsais operation.
+    * In multi-threaded environments, use one context per thread for parallel executions.
+    * @return the libsais context, NULL otherwise.
+    */
+    void * libsais_create_ctx(void);
+
+#if defined(_OPENMP)
+    /**
+    * Creates the libsais context that allows reusing allocated memory with each parallel libsais operation using OpenMP.
+    * In multi-threaded environments, use one context per thread for parallel executions.
+    * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default).
+    * @return the libsais context, NULL otherwise.
+    */
+    void * libsais_create_ctx_omp(int32_t threads);
+#endif
+
+    /**
+    * Destroys the libsais context and frees previously allocated memory.
+    * @param ctx The libsais context (can be NULL).
+    */
+    void libsais_free_ctx(void * ctx);
+
+    /**
+    * Constructs the suffix array of a given string.
+    * @param T [0..n-1] The input string.
+    * @param SA [0..n-1+fs] The output array of suffixes.
+    * @param n The length of the given string.
+    * @param fs The extra space available at the end of SA array (can be 0).
+    * @param freq [0..255] The output symbol frequency table (can be NULL).
+    * @return 0 if no error occurred, -1 or -2 otherwise.
+    */
+    int32_t libsais(const uint8_t * T, int32_t * SA, int32_t n, int32_t fs, int32_t * freq);
+
+    /**
+    * Constructs the suffix array of a given string using libsais context.
+    * @param ctx The libsais context.
+    * @param T [0..n-1] The input string.
+    * @param SA [0..n-1+fs] The output array of suffixes.
+    * @param n The length of the given string.
+    * @param fs The extra space available at the end of SA array (can be 0).
+    * @param freq [0..255] The output symbol frequency table (can be NULL).
+    * @return 0 if no error occurred, -1 or -2 otherwise.
+    */
+    int32_t libsais_ctx(const void * ctx, const uint8_t * T, int32_t * SA, int32_t n, int32_t fs, int32_t * freq);
+
+#if defined(_OPENMP)
+    /**
+    * Constructs the suffix array of a given string in parallel using OpenMP.
+    * @param T [0..n-1] The input string.
+    * @param SA [0..n-1+fs] The output array of suffixes.
+    * @param n The length of the given string.
+    * @param fs The extra space available at the end of SA array (can be 0).
+    * @param freq [0..255] The output symbol frequency table (can be NULL).
+    * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default).
+    * @return 0 if no error occurred, -1 or -2 otherwise.
+    */
+    int32_t libsais_omp(const uint8_t * T, int32_t * SA, int32_t n, int32_t fs, int32_t * freq, int32_t threads);
+#endif
+
+    /**
+    * Constructs the burrows-wheeler transformed string of a given string.
+    * @param T [0..n-1] The input string.
+    * @param U [0..n-1] The output string (can be T).
+    * @param A [0..n-1+fs] The temporary array.
+    * @param n The length of the given string.
+    * @param fs The extra space available at the end of A array (can be 0).
+ * @param freq [0..255] The output symbol frequency table (can be NULL). + * @return The primary index if no error occurred, -1 or -2 otherwise. + */ + int32_t libsais_bwt(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq); + + /** + * Constructs the burrows-wheeler transformed string of a given string with auxiliary indexes. + * @param T [0..n-1] The input string. + * @param U [0..n-1] The output string (can be T). + * @param A [0..n-1+fs] The temporary array. + * @param n The length of the given string. + * @param fs The extra space available at the end of A array (can be 0). + * @param freq [0..255] The output symbol frequency table (can be NULL). + * @param r The sampling rate for auxiliary indexes (must be power of 2). + * @param I [0..(n-1)/r] The output auxiliary indexes. + * @return 0 if no error occurred, -1 or -2 otherwise. + */ + int32_t libsais_bwt_aux(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t r, int32_t * I); + + /** + * Constructs the burrows-wheeler transformed string of a given string using libsais context. + * @param ctx The libsais context. + * @param T [0..n-1] The input string. + * @param U [0..n-1] The output string (can be T). + * @param A [0..n-1+fs] The temporary array. + * @param n The length of the given string. + * @param fs The extra space available at the end of A array (can be 0). + * @param freq [0..255] The output symbol frequency table (can be NULL). + * @return The primary index if no error occurred, -1 or -2 otherwise. + */ + int32_t libsais_bwt_ctx(const void * ctx, const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq); + + /** + * Constructs the burrows-wheeler transformed string of a given string with auxiliary indexes using libsais context. + * @param ctx The libsais context. + * @param T [0..n-1] The input string. + * @param U [0..n-1] The output string (can be T). + * @param A [0..n-1+fs] The temporary array. + * @param n The length of the given string. + * @param fs The extra space available at the end of A array (can be 0). + * @param freq [0..255] The output symbol frequency table (can be NULL). + * @param r The sampling rate for auxiliary indexes (must be power of 2). + * @param I [0..(n-1)/r] The output auxiliary indexes. + * @return 0 if no error occurred, -1 or -2 otherwise. + */ + int32_t libsais_bwt_aux_ctx(const void * ctx, const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t r, int32_t * I); + +#if defined(_OPENMP) + /** + * Constructs the burrows-wheeler transformed string of a given string in parallel using OpenMP. + * @param T [0..n-1] The input string. + * @param U [0..n-1] The output string (can be T). + * @param A [0..n-1+fs] The temporary array. + * @param n The length of the given string. + * @param fs The extra space available at the end of A array (can be 0). + * @param freq [0..255] The output symbol frequency table (can be NULL). + * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). + * @return The primary index if no error occurred, -1 or -2 otherwise. + */ + int32_t libsais_bwt_omp(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t threads); + + /** + * Constructs the burrows-wheeler transformed string of a given string with auxiliary indexes in parallel using OpenMP. + * @param T [0..n-1] The input string. + * @param U [0..n-1] The output string (can be T). + * @param A [0..n-1+fs] The temporary array. 
+    * @param n The length of the given string.
+    * @param fs The extra space available at the end of A array (can be 0).
+    * @param freq [0..255] The output symbol frequency table (can be NULL).
+    * @param r The sampling rate for auxiliary indexes (must be power of 2).
+    * @param I [0..(n-1)/r] The output auxiliary indexes.
+    * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default).
+    * @return 0 if no error occurred, -1 or -2 otherwise.
+    */
+    int32_t libsais_bwt_aux_omp(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t r, int32_t * I, int32_t threads);
+#endif
+
+    /**
+    * Creates the libsais reverse BWT context that allows reusing allocated memory with each libsais_unbwt_* operation.
+    * In multi-threaded environments, use one context per thread for parallel executions.
+    * @return the libsais context, NULL otherwise.
+    */
+    void * libsais_unbwt_create_ctx(void);
+
+#if defined(_OPENMP)
+    /**
+    * Creates the libsais reverse BWT context that allows reusing allocated memory with each parallel libsais_unbwt_* operation using OpenMP.
+    * In multi-threaded environments, use one context per thread for parallel executions.
+    * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default).
+    * @return the libsais context, NULL otherwise.
+    */
+    void * libsais_unbwt_create_ctx_omp(int32_t threads);
+#endif
+
+    /**
+    * Destroys the libsais reverse BWT context and frees previously allocated memory.
+    * @param ctx The libsais context (can be NULL).
+    */
+    void libsais_unbwt_free_ctx(void * ctx);
+
+    /**
+    * Constructs the original string from a given burrows-wheeler transformed string with primary index.
+    * @param T [0..n-1] The input string.
+    * @param U [0..n-1] The output string (can be T).
+    * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1 size).
+    * @param n The length of the given string.
+    * @param freq [0..255] The input symbol frequency table (can be NULL).
+    * @param i The primary index.
+    * @return 0 if no error occurred, -1 or -2 otherwise.
+    */
+    int32_t libsais_unbwt(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t i);
+
+    /**
+    * Constructs the original string from a given burrows-wheeler transformed string with primary index using libsais reverse BWT context.
+    * @param ctx The libsais reverse BWT context.
+    * @param T [0..n-1] The input string.
+    * @param U [0..n-1] The output string (can be T).
+    * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1 size).
+    * @param n The length of the given string.
+    * @param freq [0..255] The input symbol frequency table (can be NULL).
+    * @param i The primary index.
+    * @return 0 if no error occurred, -1 or -2 otherwise.
+    */
+    int32_t libsais_unbwt_ctx(const void * ctx, const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t i);
+
+    /**
+    * Constructs the original string from a given burrows-wheeler transformed string with auxiliary indexes.
+    * @param T [0..n-1] The input string.
+    * @param U [0..n-1] The output string (can be T).
+    * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1 size).
+    * @param n The length of the given string.
+    * @param freq [0..255] The input symbol frequency table (can be NULL).
+    * @param r The sampling rate for auxiliary indexes (must be power of 2).
+    * @param I [0..(n-1)/r] The input auxiliary indexes.
+    * @return 0 if no error occurred, -1 or -2 otherwise.
+ */ + int32_t libsais_unbwt_aux(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t r, const int32_t * I); + + /** + * Constructs the original string from a given burrows-wheeler transformed string with auxiliary indexes using libsais reverse BWT context. + * @param ctx The libsais reverse BWT context. + * @param T [0..n-1] The input string. + * @param U [0..n-1] The output string (can be T). + * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1 size). + * @param n The length of the given string. + * @param freq [0..255] The input symbol frequency table (can be NULL). + * @param r The sampling rate for auxiliary indexes (must be power of 2). + * @param I [0..(n-1)/r] The input auxiliary indexes. + * @return 0 if no error occurred, -1 or -2 otherwise. + */ + int32_t libsais_unbwt_aux_ctx(const void * ctx, const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t r, const int32_t * I); + +#if defined(_OPENMP) + /** + * Constructs the original string from a given burrows-wheeler transformed string with primary index in parallel using OpenMP. + * @param T [0..n-1] The input string. + * @param U [0..n-1] The output string (can be T). + * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1 size). + * @param n The length of the given string. + * @param freq [0..255] The input symbol frequency table (can be NULL). + * @param i The primary index. + * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). + * @return 0 if no error occurred, -1 or -2 otherwise. + */ + int32_t libsais_unbwt_omp(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t i, int32_t threads); + + /** + * Constructs the original string from a given burrows-wheeler transformed string with auxiliary indexes in parallel using OpenMP. + * @param T [0..n-1] The input string. + * @param U [0..n-1] The output string (can be T). + * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1 size). + * @param n The length of the given string. + * @param freq [0..255] The input symbol frequency table (can be NULL). + * @param r The sampling rate for auxiliary indexes (must be power of 2). + * @param I [0..(n-1)/r] The input auxiliary indexes. + * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). + * @return 0 if no error occurred, -1 or -2 otherwise. + */ + int32_t libsais_unbwt_aux_omp(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t r, const int32_t * I, int32_t threads); +#endif + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/libsais/libsais16.c b/libsais/libsais16.c new file mode 100644 index 0000000..9a8d95b --- /dev/null +++ b/libsais/libsais16.c @@ -0,0 +1,7342 @@ +/*-- + +This file is a part of libsais, a library for linear time +suffix array and burrows wheeler transform construction. + + Copyright (c) 2021 Ilya Grebnov + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
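For reference, a minimal round-trip sketch (illustrative only, not part of the patch) showing how the libsais API declared above is meant to be used: libsais_bwt produces the transformed string and its primary index, and libsais_unbwt reverses it given that index. Buffer sizes follow the parameter comments; in particular, the temporary array needs n + 1 entries for the inverse transform.

#include <stdint.h>
#include <string.h>
#include "libsais.h"

int main(void)
{
    const uint8_t T[] = "abracadabra";   /* 11 input bytes; the trailing '\0' is not transformed */
    int32_t n = 11;

    uint8_t bwt[11];                     /* forward BWT output, same length as the input */
    uint8_t out[11];                     /* reconstructed text */
    int32_t A[12];                       /* temporary array; n + 1 entries as required by libsais_unbwt */

    int32_t index = libsais_bwt(T, bwt, A, n, 0, NULL);      /* returns the primary index, or -1/-2 on error */
    if (index < 0) { return 1; }

    if (libsais_unbwt(bwt, out, A, n, NULL, index) != 0)     /* inverse BWT driven by the primary index */
    {
        return 1;
    }

    return memcmp(T, out, (size_t)n) == 0 ? 0 : 1;           /* the round trip must reproduce T */
}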
+
+Please see the file LICENSE for full copyright information.
+
+--*/
+
+#include "libsais16.h"
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+
+#if defined(_OPENMP)
+    #include <omp.h>
+#else
+    #define UNUSED(_x) (void)(_x)
+#endif
+
+typedef int32_t sa_sint_t;
+typedef uint32_t sa_uint_t;
+typedef ptrdiff_t fast_sint_t;
+typedef size_t fast_uint_t;
+
+#define SAINT_BIT (32)
+#define SAINT_MAX INT32_MAX
+#define SAINT_MIN INT32_MIN
+
+#define ALPHABET_SIZE (1 << CHAR_BIT << CHAR_BIT)
+#define UNBWT_FASTBITS (17)
+
+#define SUFFIX_GROUP_BIT (SAINT_BIT - 1)
+#define SUFFIX_GROUP_MARKER (((sa_sint_t)1) << (SUFFIX_GROUP_BIT - 1))
+
+#define BUCKETS_INDEX2(_c, _s) (((_c) << 1) + (_s))
+#define BUCKETS_INDEX4(_c, _s) (((_c) << 2) + (_s))
+
+#define LIBSAIS_PER_THREAD_CACHE_SIZE (24576)
+
+typedef struct LIBSAIS_THREAD_CACHE
+{
+    sa_sint_t symbol;
+    sa_sint_t index;
+} LIBSAIS_THREAD_CACHE;
+
+typedef union LIBSAIS_THREAD_STATE
+{
+    struct
+    {
+        fast_sint_t position;
+        fast_sint_t count;
+
+        fast_sint_t m;
+        fast_sint_t last_lms_suffix;
+
+        sa_sint_t * buckets;
+        LIBSAIS_THREAD_CACHE * cache;
+    } state;
+
+    uint8_t padding[64];
+} LIBSAIS_THREAD_STATE;
+
+typedef struct LIBSAIS_CONTEXT
+{
+    sa_sint_t * buckets;
+    LIBSAIS_THREAD_STATE * thread_state;
+    fast_sint_t threads;
+} LIBSAIS_CONTEXT;
+
+typedef struct LIBSAIS_UNBWT_CONTEXT
+{
+    sa_uint_t * bucket2;
+    uint16_t * fastbits;
+    sa_uint_t * buckets;
+    fast_sint_t threads;
+} LIBSAIS_UNBWT_CONTEXT;
+
+#if defined(__GNUC__) || defined(__clang__)
+    #define RESTRICT __restrict__
+#elif defined(_MSC_VER) || defined(__INTEL_COMPILER)
+    #define RESTRICT __restrict
+#else
+    #error Your compiler, configuration or platform is not supported.
+#endif
+
+#if defined(__has_builtin)
+    #if __has_builtin(__builtin_prefetch)
+        #define HAS_BUILTIN_PREFECTCH
+    #endif
+#elif defined(__GNUC__) && __GNUC__ > 3
+    #define HAS_BUILTIN_PREFECTCH
+#endif
+
+#if defined(HAS_BUILTIN_PREFECTCH)
+    #define libsais16_prefetch(address) __builtin_prefetch((const void *)(address), 0, 0)
+    #define libsais16_prefetchw(address) __builtin_prefetch((const void *)(address), 1, 0)
+#elif defined (_M_IX86) || defined (_M_AMD64)
+    #include <intrin.h>
+    #define libsais16_prefetch(address) _mm_prefetch((const void *)(address), _MM_HINT_NTA)
+    #define libsais16_prefetchw(address) _m_prefetchw((const void *)(address))
+#elif defined (_M_ARM)
+    #include <intrin.h>
+    #define libsais16_prefetch(address) __prefetch((const void *)(address))
+    #define libsais16_prefetchw(address) __prefetchw((const void *)(address))
+#elif defined (_M_ARM64)
+    #include <intrin.h>
+    #define libsais16_prefetch(address) __prefetch2((const void *)(address), 1)
+    #define libsais16_prefetchw(address) __prefetch2((const void *)(address), 17)
+#else
+    #error Your compiler, configuration or platform is not supported.
+#endif + +#if !defined(__LITTLE_ENDIAN__) && !defined(__BIG_ENDIAN__) + #if defined(_LITTLE_ENDIAN) \ + || (defined(BYTE_ORDER) && defined(LITTLE_ENDIAN) && BYTE_ORDER == LITTLE_ENDIAN) \ + || (defined(_BYTE_ORDER) && defined(_LITTLE_ENDIAN) && _BYTE_ORDER == _LITTLE_ENDIAN) \ + || (defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && __BYTE_ORDER == __LITTLE_ENDIAN) \ + || (defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) + #define __LITTLE_ENDIAN__ + #elif defined(_BIG_ENDIAN) \ + || (defined(BYTE_ORDER) && defined(BIG_ENDIAN) && BYTE_ORDER == BIG_ENDIAN) \ + || (defined(_BYTE_ORDER) && defined(_BIG_ENDIAN) && _BYTE_ORDER == _BIG_ENDIAN) \ + || (defined(__BYTE_ORDER) && defined(__BIG_ENDIAN) && __BYTE_ORDER == __BIG_ENDIAN) \ + || (defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + #define __BIG_ENDIAN__ + #elif defined(_WIN32) + #define __LITTLE_ENDIAN__ + #endif +#endif + +static void * libsais16_align_up(const void * address, size_t alignment) +{ + return (void *)((((ptrdiff_t)address) + ((ptrdiff_t)alignment) - 1) & (-((ptrdiff_t)alignment))); +} + +static void * libsais16_alloc_aligned(size_t size, size_t alignment) +{ + void * address = malloc(size + sizeof(short) + alignment - 1); + if (address != NULL) + { + void * aligned_address = libsais16_align_up((void *)((ptrdiff_t)address + (ptrdiff_t)(sizeof(short))), alignment); + ((short *)aligned_address)[-1] = (short)((ptrdiff_t)aligned_address - (ptrdiff_t)address); + + return aligned_address; + } + + return NULL; +} + +static void libsais16_free_aligned(void * aligned_address) +{ + if (aligned_address != NULL) + { + free((void *)((ptrdiff_t)aligned_address - ((short *)aligned_address)[-1])); + } +} + +static LIBSAIS_THREAD_STATE * libsais16_alloc_thread_state(sa_sint_t threads) +{ + LIBSAIS_THREAD_STATE * RESTRICT thread_state = (LIBSAIS_THREAD_STATE *)libsais16_alloc_aligned((size_t)threads * sizeof(LIBSAIS_THREAD_STATE), 4096); + sa_sint_t * RESTRICT thread_buckets = (sa_sint_t *)libsais16_alloc_aligned((size_t)threads * 4 * ALPHABET_SIZE * sizeof(sa_sint_t), 4096); + LIBSAIS_THREAD_CACHE * RESTRICT thread_cache = (LIBSAIS_THREAD_CACHE *)libsais16_alloc_aligned((size_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE * sizeof(LIBSAIS_THREAD_CACHE), 4096); + + if (thread_state != NULL && thread_buckets != NULL && thread_cache != NULL) + { + fast_sint_t t; + for (t = 0; t < threads; ++t) + { + thread_state[t].state.buckets = thread_buckets; thread_buckets += 4 * ALPHABET_SIZE; + thread_state[t].state.cache = thread_cache; thread_cache += LIBSAIS_PER_THREAD_CACHE_SIZE; + } + + return thread_state; + } + + libsais16_free_aligned(thread_cache); + libsais16_free_aligned(thread_buckets); + libsais16_free_aligned(thread_state); + return NULL; +} + +static void libsais16_free_thread_state(LIBSAIS_THREAD_STATE * thread_state) +{ + if (thread_state != NULL) + { + libsais16_free_aligned(thread_state[0].state.cache); + libsais16_free_aligned(thread_state[0].state.buckets); + libsais16_free_aligned(thread_state); + } +} + +static LIBSAIS_CONTEXT * libsais16_create_ctx_main(sa_sint_t threads) +{ + LIBSAIS_CONTEXT * RESTRICT ctx = (LIBSAIS_CONTEXT *)libsais16_alloc_aligned(sizeof(LIBSAIS_CONTEXT), 64); + sa_sint_t * RESTRICT buckets = (sa_sint_t *)libsais16_alloc_aligned(8 * ALPHABET_SIZE * sizeof(sa_sint_t), 4096); + LIBSAIS_THREAD_STATE * RESTRICT thread_state = threads > 1 ? 
libsais16_alloc_thread_state(threads) : NULL; + + if (ctx != NULL && buckets != NULL && (thread_state != NULL || threads == 1)) + { + ctx->buckets = buckets; + ctx->threads = threads; + ctx->thread_state = thread_state; + + return ctx; + } + + libsais16_free_thread_state(thread_state); + libsais16_free_aligned(buckets); + libsais16_free_aligned(ctx); + return NULL; +} + +static void libsais16_free_ctx_main(LIBSAIS_CONTEXT * ctx) +{ + if (ctx != NULL) + { + libsais16_free_thread_state(ctx->thread_state); + libsais16_free_aligned(ctx->buckets); + libsais16_free_aligned(ctx); + } +} + +#if defined(_OPENMP) + +static sa_sint_t libsais16_count_negative_marked_suffixes(sa_sint_t * RESTRICT SA, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + sa_sint_t count = 0; + + fast_sint_t i; for (i = omp_block_start; i < omp_block_start + omp_block_size; ++i) { count += (SA[i] < 0); } + + return count; +} + +static sa_sint_t libsais16_count_zero_marked_suffixes(sa_sint_t * RESTRICT SA, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + sa_sint_t count = 0; + + fast_sint_t i; for (i = omp_block_start; i < omp_block_start + omp_block_size; ++i) { count += (SA[i] == 0); } + + return count; +} + +static void libsais16_place_cached_suffixes(sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4) + { + libsais16_prefetch(&cache[i + 2 * prefetch_distance]); + + libsais16_prefetchw(&SA[cache[i + prefetch_distance + 0].symbol]); + libsais16_prefetchw(&SA[cache[i + prefetch_distance + 1].symbol]); + libsais16_prefetchw(&SA[cache[i + prefetch_distance + 2].symbol]); + libsais16_prefetchw(&SA[cache[i + prefetch_distance + 3].symbol]); + + SA[cache[i + 0].symbol] = cache[i + 0].index; + SA[cache[i + 1].symbol] = cache[i + 1].index; + SA[cache[i + 2].symbol] = cache[i + 2].index; + SA[cache[i + 3].symbol] = cache[i + 3].index; + } + + for (j += prefetch_distance + 3; i < j; i += 1) + { + SA[cache[i].symbol] = cache[i].index; + } +} + +static void libsais16_compact_and_place_cached_suffixes(sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j, l; + for (i = omp_block_start, j = omp_block_start + omp_block_size - 3, l = omp_block_start; i < j; i += 4) + { + libsais16_prefetchw(&cache[i + prefetch_distance]); + + cache[l] = cache[i + 0]; l += cache[l].symbol >= 0; + cache[l] = cache[i + 1]; l += cache[l].symbol >= 0; + cache[l] = cache[i + 2]; l += cache[l].symbol >= 0; + cache[l] = cache[i + 3]; l += cache[l].symbol >= 0; + } + + for (j += 3; i < j; i += 1) + { + cache[l] = cache[i]; l += cache[l].symbol >= 0; + } + + libsais16_place_cached_suffixes(SA, cache, omp_block_start, l - omp_block_start); +} + +static void libsais16_accumulate_counts_s32_2(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size, fast_sint_t bucket_stride) +{ + sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride; + fast_sint_t s; for (s = 0; s < bucket_size; s += 1) { bucket00[s] = bucket00[s] + bucket01[s]; } +} + +static void libsais16_accumulate_counts_s32_3(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size, fast_sint_t bucket_stride) +{ + sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride; + sa_sint_t * RESTRICT bucket02 = bucket01 - 
bucket_stride; + fast_sint_t s; for (s = 0; s < bucket_size; s += 1) { bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s]; } +} + +static void libsais16_accumulate_counts_s32_4(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size, fast_sint_t bucket_stride) +{ + sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride; + sa_sint_t * RESTRICT bucket02 = bucket01 - bucket_stride; + sa_sint_t * RESTRICT bucket03 = bucket02 - bucket_stride; + fast_sint_t s; for (s = 0; s < bucket_size; s += 1) { bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s] + bucket03[s]; } +} + +static void libsais16_accumulate_counts_s32_5(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size, fast_sint_t bucket_stride) +{ + sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride; + sa_sint_t * RESTRICT bucket02 = bucket01 - bucket_stride; + sa_sint_t * RESTRICT bucket03 = bucket02 - bucket_stride; + sa_sint_t * RESTRICT bucket04 = bucket03 - bucket_stride; + fast_sint_t s; for (s = 0; s < bucket_size; s += 1) { bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s] + bucket03[s] + bucket04[s]; } +} + +static void libsais16_accumulate_counts_s32_6(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size, fast_sint_t bucket_stride) +{ + sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride; + sa_sint_t * RESTRICT bucket02 = bucket01 - bucket_stride; + sa_sint_t * RESTRICT bucket03 = bucket02 - bucket_stride; + sa_sint_t * RESTRICT bucket04 = bucket03 - bucket_stride; + sa_sint_t * RESTRICT bucket05 = bucket04 - bucket_stride; + fast_sint_t s; for (s = 0; s < bucket_size; s += 1) { bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s] + bucket03[s] + bucket04[s] + bucket05[s]; } +} + +static void libsais16_accumulate_counts_s32_7(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size, fast_sint_t bucket_stride) +{ + sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride; + sa_sint_t * RESTRICT bucket02 = bucket01 - bucket_stride; + sa_sint_t * RESTRICT bucket03 = bucket02 - bucket_stride; + sa_sint_t * RESTRICT bucket04 = bucket03 - bucket_stride; + sa_sint_t * RESTRICT bucket05 = bucket04 - bucket_stride; + sa_sint_t * RESTRICT bucket06 = bucket05 - bucket_stride; + fast_sint_t s; for (s = 0; s < bucket_size; s += 1) { bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s] + bucket03[s] + bucket04[s] + bucket05[s] + bucket06[s]; } +} + +static void libsais16_accumulate_counts_s32_8(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size, fast_sint_t bucket_stride) +{ + sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride; + sa_sint_t * RESTRICT bucket02 = bucket01 - bucket_stride; + sa_sint_t * RESTRICT bucket03 = bucket02 - bucket_stride; + sa_sint_t * RESTRICT bucket04 = bucket03 - bucket_stride; + sa_sint_t * RESTRICT bucket05 = bucket04 - bucket_stride; + sa_sint_t * RESTRICT bucket06 = bucket05 - bucket_stride; + sa_sint_t * RESTRICT bucket07 = bucket06 - bucket_stride; + fast_sint_t s; for (s = 0; s < bucket_size; s += 1) { bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s] + bucket03[s] + bucket04[s] + bucket05[s] + bucket06[s] + bucket07[s]; } +} + +static void libsais16_accumulate_counts_s32_9(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size, fast_sint_t bucket_stride) +{ + sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride; + sa_sint_t * RESTRICT bucket02 = bucket01 - bucket_stride; + sa_sint_t * RESTRICT bucket03 = bucket02 - bucket_stride; + sa_sint_t * RESTRICT bucket04 = bucket03 - bucket_stride; + sa_sint_t * RESTRICT bucket05 = bucket04 - bucket_stride; + sa_sint_t * RESTRICT 
bucket06 = bucket05 - bucket_stride; + sa_sint_t * RESTRICT bucket07 = bucket06 - bucket_stride; + sa_sint_t * RESTRICT bucket08 = bucket07 - bucket_stride; + fast_sint_t s; for (s = 0; s < bucket_size; s += 1) { bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s] + bucket03[s] + bucket04[s] + bucket05[s] + bucket06[s] + bucket07[s] + bucket08[s]; } +} + +static void libsais16_accumulate_counts_s32(sa_sint_t * RESTRICT buckets, fast_sint_t bucket_size, fast_sint_t bucket_stride, fast_sint_t num_buckets) +{ + while (num_buckets >= 9) + { + libsais16_accumulate_counts_s32_9(buckets - (num_buckets - 9) * bucket_stride, bucket_size, bucket_stride); num_buckets -= 8; + } + + switch (num_buckets) + { + case 1: break; + case 2: libsais16_accumulate_counts_s32_2(buckets, bucket_size, bucket_stride); break; + case 3: libsais16_accumulate_counts_s32_3(buckets, bucket_size, bucket_stride); break; + case 4: libsais16_accumulate_counts_s32_4(buckets, bucket_size, bucket_stride); break; + case 5: libsais16_accumulate_counts_s32_5(buckets, bucket_size, bucket_stride); break; + case 6: libsais16_accumulate_counts_s32_6(buckets, bucket_size, bucket_stride); break; + case 7: libsais16_accumulate_counts_s32_7(buckets, bucket_size, bucket_stride); break; + case 8: libsais16_accumulate_counts_s32_8(buckets, bucket_size, bucket_stride); break; + } +} + +#endif + +static void libsais16_gather_lms_suffixes_16u(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, fast_sint_t m, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + if (omp_block_size > 0) + { + const fast_sint_t prefetch_distance = 128; + + fast_sint_t i, j = omp_block_start + omp_block_size, c0 = T[omp_block_start + omp_block_size - 1], c1 = -1; + + while (j < n && (c1 = T[j]) == c0) { ++j; } + + fast_uint_t s = c0 >= c1; + + for (i = omp_block_start + omp_block_size - 2, j = omp_block_start + 3; i >= j; i -= 4) + { + libsais16_prefetch(&T[i - prefetch_distance]); + + c1 = T[i - 0]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((s & 3) == 1); + c0 = T[i - 1]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 0); m -= ((s & 3) == 1); + c1 = T[i - 2]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 1); m -= ((s & 3) == 1); + c0 = T[i - 3]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 2); m -= ((s & 3) == 1); + } + + for (j -= 3; i >= j; i -= 1) + { + c1 = c0; c0 = T[i]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((s & 3) == 1); + } + + SA[m] = (sa_sint_t)(i + 1); + } +} + +static void libsais16_gather_lms_suffixes_16u_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536 && omp_get_dynamic() == 0) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(thread_state); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : n - omp_block_start; + + if (omp_num_threads == 1) + { + libsais16_gather_lms_suffixes_16u(T, SA, n, (fast_sint_t)n - 1, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + fast_sint_t t, m = 0; for (t = omp_num_threads - 1; t > omp_thread_num; --t) { m += thread_state[t].state.m; } + + libsais16_gather_lms_suffixes_16u(T, SA, n, (fast_sint_t)n - 1 - m, omp_block_start, omp_block_size); + + #pragma omp barrier + + if (thread_state[omp_thread_num].state.m > 0) + { + SA[(fast_sint_t)n - 1 - m] = (sa_sint_t)thread_state[omp_thread_num].state.last_lms_suffix; + } + } +#endif + } +} + +static sa_sint_t libsais16_gather_lms_suffixes_32s(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n) +{ + const fast_sint_t prefetch_distance = 32; + + sa_sint_t i = n - 2; + sa_sint_t m = n - 1; + fast_uint_t s = 1; + fast_sint_t c0 = T[n - 1]; + fast_sint_t c1 = 0; + + for (; i >= 3; i -= 4) + { + libsais16_prefetch(&T[i - prefetch_distance]); + + c1 = T[i - 0]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = i + 1; m -= ((s & 3) == 1); + c0 = T[i - 1]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = i - 0; m -= ((s & 3) == 1); + c1 = T[i - 2]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = i - 1; m -= ((s & 3) == 1); + c0 = T[i - 3]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = i - 2; m -= ((s & 3) == 1); + } + + for (; i >= 0; i -= 1) + { + c1 = c0; c0 = T[i]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = i + 1; m -= ((s & 3) == 1); + } + + return n - 1 - m; +} + +static sa_sint_t libsais16_gather_compacted_lms_suffixes_32s(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n) +{ + const fast_sint_t prefetch_distance = 32; + + sa_sint_t i = n - 2; + sa_sint_t m = n - 1; + fast_uint_t s = 1; + fast_sint_t c0 = T[n - 1]; + fast_sint_t c1 = 0; + + for (; i >= 3; i -= 4) + { + libsais16_prefetch(&T[i - prefetch_distance]); + + c1 = T[i - 0]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = i + 1; m -= ((fast_sint_t)(s & 3) == (c0 >= 0)); + c0 = T[i - 1]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = i - 0; m -= ((fast_sint_t)(s & 3) == (c1 >= 0)); + c1 = T[i - 2]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = i - 1; m -= ((fast_sint_t)(s & 3) == (c0 >= 0)); + c0 = T[i - 3]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = i - 2; m -= ((fast_sint_t)(s & 3) == (c1 >= 0)); + } + + for (; i >= 0; i -= 1) + { + c1 = c0; c0 = T[i]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = i + 1; m -= ((fast_sint_t)(s & 3) == (c1 >= 0)); + } + + return n - 1 - m; +} + +#if defined(_OPENMP) + +static void libsais16_count_lms_suffixes_32s_4k(const sa_sint_t * RESTRICT T, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets) +{ + const fast_sint_t prefetch_distance = 32; + + memset(buckets, 0, 4 * (size_t)k * sizeof(sa_sint_t)); + + sa_sint_t i = n - 2; + fast_uint_t s = 1; + fast_sint_t c0 = T[n - 1]; + fast_sint_t c1 = 0; + + for (; i >= prefetch_distance + 3; i -= 4) + { + libsais16_prefetch(&T[i - 2 * prefetch_distance]); + + libsais16_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 0], 0)]); + libsais16_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 1], 0)]); + libsais16_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 2], 0)]); + 
libsais16_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 3], 0)]); + + c1 = T[i - 0]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); + buckets[BUCKETS_INDEX4((fast_uint_t)c0, s & 3)]++; + + c0 = T[i - 1]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); + buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]++; + + c1 = T[i - 2]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); + buckets[BUCKETS_INDEX4((fast_uint_t)c0, s & 3)]++; + + c0 = T[i - 3]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); + buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]++; + } + + for (; i >= 0; i -= 1) + { + c1 = c0; c0 = T[i]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); + buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]++; + } + + buckets[BUCKETS_INDEX4((fast_uint_t)c0, (s << 1) & 3)]++; +} + +#endif + +static void libsais16_count_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets) +{ + const fast_sint_t prefetch_distance = 32; + + memset(buckets, 0, 2 * (size_t)k * sizeof(sa_sint_t)); + + sa_sint_t i = n - 2; + fast_uint_t s = 1; + fast_sint_t c0 = T[n - 1]; + fast_sint_t c1 = 0; + + for (; i >= prefetch_distance + 3; i -= 4) + { + libsais16_prefetch(&T[i - 2 * prefetch_distance]); + + libsais16_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 0], 0)]); + libsais16_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 1], 0)]); + libsais16_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 2], 0)]); + libsais16_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 3], 0)]); + + c1 = T[i - 0]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); + buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++; + + c0 = T[i - 1]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); + buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++; + + c1 = T[i - 2]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); + buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++; + + c0 = T[i - 3]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); + buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++; + } + + for (; i >= 0; i -= 1) + { + c1 = c0; c0 = T[i]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); + buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++; + } + + buckets[BUCKETS_INDEX2((fast_uint_t)c0, 0)]++; +} + +#if defined(_OPENMP) + +static void libsais16_count_compacted_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets) +{ + const fast_sint_t prefetch_distance = 32; + + memset(buckets, 0, 2 * (size_t)k * sizeof(sa_sint_t)); + + sa_sint_t i = n - 2; + fast_uint_t s = 1; + fast_sint_t c0 = T[n - 1]; + fast_sint_t c1 = 0; + + for (; i >= prefetch_distance + 3; i -= 4) + { + libsais16_prefetch(&T[i - 2 * prefetch_distance]); + + libsais16_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 0] & SAINT_MAX, 0)]); + libsais16_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 1] & SAINT_MAX, 0)]); + libsais16_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 2] & SAINT_MAX, 0)]); + libsais16_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 3] & SAINT_MAX, 0)]); + + c1 = T[i - 0]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); + c0 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++; + + c0 = T[i - 1]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 
1))); + c1 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++; + + c1 = T[i - 2]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); + c0 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++; + + c0 = T[i - 3]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); + c1 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++; + } + + for (; i >= 0; i -= 1) + { + c1 = c0; c0 = T[i]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); + c1 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++; + } + + c0 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c0, 0)]++; +} + +#endif + +static sa_sint_t libsais16_count_and_gather_lms_suffixes_16u(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + memset(buckets, 0, 4 * ALPHABET_SIZE * sizeof(sa_sint_t)); + + fast_sint_t m = omp_block_start + omp_block_size - 1; + + if (omp_block_size > 0) + { + const fast_sint_t prefetch_distance = 128; + + fast_sint_t i, j = m + 1, c0 = T[m], c1 = -1; + + while (j < n && (c1 = T[j]) == c0) { ++j; } + + fast_uint_t s = c0 >= c1; + + for (i = m - 1, j = omp_block_start + 3; i >= j; i -= 4) + { + libsais16_prefetch(&T[i - prefetch_distance]); + + c1 = T[i - 0]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((s & 3) == 1); + buckets[BUCKETS_INDEX4((fast_uint_t)c0, s & 3)]++; + + c0 = T[i - 1]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 0); m -= ((s & 3) == 1); + buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]++; + + c1 = T[i - 2]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 1); m -= ((s & 3) == 1); + buckets[BUCKETS_INDEX4((fast_uint_t)c0, s & 3)]++; + + c0 = T[i - 3]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 2); m -= ((s & 3) == 1); + buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]++; + } + + for (j -= 3; i >= j; i -= 1) + { + c1 = c0; c0 = T[i]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((s & 3) == 1); + buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]++; + } + + c1 = (i >= 0) ? T[i] : -1; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((s & 3) == 1); + buckets[BUCKETS_INDEX4((fast_uint_t)c0, s & 3)]++; + } + + return (sa_sint_t)(omp_block_start + omp_block_size - 1 - m); +} + +static sa_sint_t libsais16_count_and_gather_lms_suffixes_16u_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + sa_sint_t m = 0; + +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536 && omp_get_dynamic() == 0) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(thread_state); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : n - omp_block_start; + + if (omp_num_threads == 1) + { + m = libsais16_count_and_gather_lms_suffixes_16u(T, SA, n, buckets, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + thread_state[omp_thread_num].state.position = omp_block_start + omp_block_size; + thread_state[omp_thread_num].state.m = libsais16_count_and_gather_lms_suffixes_16u(T, SA, n, thread_state[omp_thread_num].state.buckets, omp_block_start, omp_block_size); + + if (thread_state[omp_thread_num].state.m > 0) + { + thread_state[omp_thread_num].state.last_lms_suffix = SA[thread_state[omp_thread_num].state.position - 1]; + } + } + + #pragma omp barrier + + #pragma omp master + { + memset(buckets, 0, 4 * ALPHABET_SIZE * sizeof(sa_sint_t)); + + fast_sint_t t; + for (t = omp_num_threads - 1; t >= 0; --t) + { + m += (sa_sint_t)thread_state[t].state.m; + + if (t != omp_num_threads - 1 && thread_state[t].state.m > 0) + { + memcpy(&SA[n - m], &SA[thread_state[t].state.position - thread_state[t].state.m], (size_t)thread_state[t].state.m * sizeof(sa_sint_t)); + } + + { + sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets; + fast_sint_t s; for (s = 0; s < 4 * ALPHABET_SIZE; s += 1) { sa_sint_t A = buckets[s], B = temp_bucket[s]; buckets[s] = A + B; temp_bucket[s] = A; } + } + } + } + } +#endif + } + + return m; +} + +static sa_sint_t libsais16_count_and_gather_lms_suffixes_32s_4k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + memset(buckets, 0, 4 * (size_t)k * sizeof(sa_sint_t)); + + fast_sint_t m = omp_block_start + omp_block_size - 1; + + if (omp_block_size > 0) + { + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j = m + 1, c0 = T[m], c1 = -1; + + while (j < n && (c1 = T[j]) == c0) { ++j; } + + fast_uint_t s = c0 >= c1; + + for (i = m - 1, j = omp_block_start + prefetch_distance + 3; i >= j; i -= 4) + { + libsais16_prefetch(&T[i - 2 * prefetch_distance]); + + libsais16_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 0], 0)]); + libsais16_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 1], 0)]); + libsais16_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 2], 0)]); + libsais16_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 3], 0)]); + + c1 = T[i - 0]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((s & 3) == 1); + buckets[BUCKETS_INDEX4((fast_uint_t)c0, s & 3)]++; + + c0 = T[i - 1]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 0); m -= ((s & 3) == 1); + buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]++; + + c1 = T[i - 2]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 1); m -= ((s & 3) == 1); + buckets[BUCKETS_INDEX4((fast_uint_t)c0, s & 3)]++; + + c0 = T[i - 3]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 2); m -= ((s & 3) == 1); + buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]++; + } + + for (j -= prefetch_distance + 3; i >= j; i -= 1) + { + c1 = c0; c0 = T[i]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((s & 3) == 1); + buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]++; + } + + c1 = (i >= 0) ? 
T[i] : -1; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((s & 3) == 1); + buckets[BUCKETS_INDEX4((fast_uint_t)c0, s & 3)]++; + } + + return (sa_sint_t)(omp_block_start + omp_block_size - 1 - m); +} + +static sa_sint_t libsais16_count_and_gather_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + memset(buckets, 0, 2 * (size_t)k * sizeof(sa_sint_t)); + + fast_sint_t m = omp_block_start + omp_block_size - 1; + + if (omp_block_size > 0) + { + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j = m + 1, c0 = T[m], c1 = -1; + + while (j < n && (c1 = T[j]) == c0) { ++j; } + + fast_uint_t s = c0 >= c1; + + for (i = m - 1, j = omp_block_start + prefetch_distance + 3; i >= j; i -= 4) + { + libsais16_prefetch(&T[i - 2 * prefetch_distance]); + + libsais16_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 0], 0)]); + libsais16_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 1], 0)]); + libsais16_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 2], 0)]); + libsais16_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 3], 0)]); + + c1 = T[i - 0]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((s & 3) == 1); + buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++; + + c0 = T[i - 1]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 0); m -= ((s & 3) == 1); + buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++; + + c1 = T[i - 2]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 1); m -= ((s & 3) == 1); + buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++; + + c0 = T[i - 3]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 2); m -= ((s & 3) == 1); + buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++; + } + + for (j -= prefetch_distance + 3; i >= j; i -= 1) + { + c1 = c0; c0 = T[i]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((s & 3) == 1); + buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++; + } + + c1 = (i >= 0) ? 
T[i] : -1; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((s & 3) == 1); + buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++; + } + + return (sa_sint_t)(omp_block_start + omp_block_size - 1 - m); +} + +static sa_sint_t libsais16_count_and_gather_compacted_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + memset(buckets, 0, 2 * (size_t)k * sizeof(sa_sint_t)); + + fast_sint_t m = omp_block_start + omp_block_size - 1; + + if (omp_block_size > 0) + { + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j = m + 1, c0 = T[m], c1 = -1; + + while (j < n && (c1 = T[j]) == c0) { ++j; } + + fast_uint_t s = c0 >= c1; + + for (i = m - 1, j = omp_block_start + prefetch_distance + 3; i >= j; i -= 4) + { + libsais16_prefetch(&T[i - 2 * prefetch_distance]); + + libsais16_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 0] & SAINT_MAX, 0)]); + libsais16_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 1] & SAINT_MAX, 0)]); + libsais16_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 2] & SAINT_MAX, 0)]); + libsais16_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 3] & SAINT_MAX, 0)]); + + c1 = T[i - 0]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((fast_sint_t)(s & 3) == (c0 >= 0)); + c0 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++; + + c0 = T[i - 1]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 0); m -= ((fast_sint_t)(s & 3) == (c1 >= 0)); + c1 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++; + + c1 = T[i - 2]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 1); m -= ((fast_sint_t)(s & 3) == (c0 >= 0)); + c0 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++; + + c0 = T[i - 3]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 2); m -= ((fast_sint_t)(s & 3) == (c1 >= 0)); + c1 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++; + } + + for (j -= prefetch_distance + 3; i >= j; i -= 1) + { + c1 = c0; c0 = T[i]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((fast_sint_t)(s & 3) == (c1 >= 0)); + c1 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++; + } + + c1 = (i >= 0) ? 
T[i] : -1; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((fast_sint_t)(s & 3) == (c0 >= 0)); + c0 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++; + } + + return (sa_sint_t)(omp_block_start + omp_block_size - 1 - m); +} + +#if defined(_OPENMP) + +static fast_sint_t libsais16_get_bucket_stride(fast_sint_t free_space, fast_sint_t bucket_size, fast_sint_t num_buckets) +{ + fast_sint_t bucket_size_1024 = (bucket_size + 1023) & (-1024); if (free_space / (num_buckets - 1) >= bucket_size_1024) { return bucket_size_1024; } + fast_sint_t bucket_size_16 = (bucket_size + 15) & (-16); if (free_space / (num_buckets - 1) >= bucket_size_16) { return bucket_size_16; } + + return bucket_size; +} + +static sa_sint_t libsais16_count_and_gather_lms_suffixes_32s_4k_fs_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + sa_sint_t m = 0; + +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(thread_state); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start; + + if (omp_num_threads == 1) + { + m = libsais16_count_and_gather_lms_suffixes_32s_4k(T, SA, n, k, buckets, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + fast_sint_t bucket_size = 4 * (fast_sint_t)k; + fast_sint_t bucket_stride = libsais16_get_bucket_stride(buckets - &SA[n], bucket_size, omp_num_threads); + + { + thread_state[omp_thread_num].state.position = omp_block_start + omp_block_size; + thread_state[omp_thread_num].state.count = libsais16_count_and_gather_lms_suffixes_32s_4k(T, SA, n, k, buckets - (omp_thread_num * bucket_stride), omp_block_start, omp_block_size); + } + + #pragma omp barrier + + if (omp_thread_num == omp_num_threads - 1) + { + fast_sint_t t; + for (t = omp_num_threads - 1; t >= 0; --t) + { + m += (sa_sint_t)thread_state[t].state.count; + + if (t != omp_num_threads - 1 && thread_state[t].state.count > 0) + { + memcpy(&SA[n - m], &SA[thread_state[t].state.position - thread_state[t].state.count], (size_t)thread_state[t].state.count * sizeof(sa_sint_t)); + } + } + } + else + { + omp_num_threads = omp_num_threads - 1; + omp_block_stride = (bucket_size / omp_num_threads) & (-16); + omp_block_start = omp_thread_num * omp_block_stride; + omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : bucket_size - omp_block_start; + + libsais16_accumulate_counts_s32(buckets + omp_block_start, omp_block_size, bucket_stride, omp_num_threads + 1); + } + } +#endif + } + + return m; +} + +static sa_sint_t libsais16_count_and_gather_lms_suffixes_32s_2k_fs_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + sa_sint_t m = 0; + +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(thread_state); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start; + + if (omp_num_threads == 1) + { + m = libsais16_count_and_gather_lms_suffixes_32s_2k(T, SA, n, k, buckets, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + fast_sint_t bucket_size = 2 * (fast_sint_t)k; + fast_sint_t bucket_stride = libsais16_get_bucket_stride(buckets - &SA[n], bucket_size, omp_num_threads); + + { + thread_state[omp_thread_num].state.position = omp_block_start + omp_block_size; + thread_state[omp_thread_num].state.count = libsais16_count_and_gather_lms_suffixes_32s_2k(T, SA, n, k, buckets - (omp_thread_num * bucket_stride), omp_block_start, omp_block_size); + } + + #pragma omp barrier + + if (omp_thread_num == omp_num_threads - 1) + { + fast_sint_t t; + for (t = omp_num_threads - 1; t >= 0; --t) + { + m += (sa_sint_t)thread_state[t].state.count; + + if (t != omp_num_threads - 1 && thread_state[t].state.count > 0) + { + memcpy(&SA[n - m], &SA[thread_state[t].state.position - thread_state[t].state.count], (size_t)thread_state[t].state.count * sizeof(sa_sint_t)); + } + } + } + else + { + omp_num_threads = omp_num_threads - 1; + omp_block_stride = (bucket_size / omp_num_threads) & (-16); + omp_block_start = omp_thread_num * omp_block_stride; + omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : bucket_size - omp_block_start; + + libsais16_accumulate_counts_s32(buckets + omp_block_start, omp_block_size, bucket_stride, omp_num_threads + 1); + } + } +#endif + } + + return m; +} + +static void libsais16_count_and_gather_compacted_lms_suffixes_32s_2k_fs_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(thread_state); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : n - omp_block_start; + + if (omp_num_threads == 1) + { + libsais16_count_and_gather_compacted_lms_suffixes_32s_2k(T, SA, n, k, buckets, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + fast_sint_t bucket_size = 2 * (fast_sint_t)k; + fast_sint_t bucket_stride = libsais16_get_bucket_stride(buckets - &SA[n + n], bucket_size, omp_num_threads); + + { + thread_state[omp_thread_num].state.position = omp_block_start + omp_block_size; + thread_state[omp_thread_num].state.count = libsais16_count_and_gather_compacted_lms_suffixes_32s_2k(T, SA + n, n, k, buckets - (omp_thread_num * bucket_stride), omp_block_start, omp_block_size); + } + + #pragma omp barrier + + { + fast_sint_t t, m = 0; for (t = omp_num_threads - 1; t >= omp_thread_num; --t) { m += (sa_sint_t)thread_state[t].state.count; } + + if (thread_state[omp_thread_num].state.count > 0) + { + memcpy(&SA[n - m], &SA[n + thread_state[omp_thread_num].state.position - thread_state[omp_thread_num].state.count], (size_t)thread_state[omp_thread_num].state.count * sizeof(sa_sint_t)); + } + } + + { + omp_block_stride = (bucket_size / omp_num_threads) & (-16); + omp_block_start = omp_thread_num * omp_block_stride; + omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : bucket_size - omp_block_start; + + libsais16_accumulate_counts_s32(buckets + omp_block_start, omp_block_size, bucket_stride, omp_num_threads); + } + } +#endif + } +} + +#endif + +static sa_sint_t libsais16_count_and_gather_lms_suffixes_32s_4k_nofs_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads) +{ + sa_sint_t m = 0; + +#if defined(_OPENMP) + #pragma omp parallel num_threads(2) if(threads > 1 && n >= 65536) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); + + fast_sint_t omp_num_threads = 1; +#endif + if (omp_num_threads == 1) + { + m = libsais16_count_and_gather_lms_suffixes_32s_4k(T, SA, n, k, buckets, 0, n); + } +#if defined(_OPENMP) + else if (omp_thread_num == 0) + { + libsais16_count_lms_suffixes_32s_4k(T, n, k, buckets); + } + else + { + m = libsais16_gather_lms_suffixes_32s(T, SA, n); + } +#endif + } + + return m; +} + +static sa_sint_t libsais16_count_and_gather_lms_suffixes_32s_2k_nofs_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads) +{ + sa_sint_t m = 0; + +#if defined(_OPENMP) + #pragma omp parallel num_threads(2) if(threads > 1 && n >= 65536) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); + + fast_sint_t omp_num_threads = 1; +#endif + if (omp_num_threads == 1) + { + m = libsais16_count_and_gather_lms_suffixes_32s_2k(T, SA, n, k, buckets, 0, n); + } +#if defined(_OPENMP) + else if (omp_thread_num == 0) + { + libsais16_count_lms_suffixes_32s_2k(T, n, k, buckets); + } + else + { + m = libsais16_gather_lms_suffixes_32s(T, SA, n); + } +#endif + } + + return m; +} + +static sa_sint_t libsais16_count_and_gather_compacted_lms_suffixes_32s_2k_nofs_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads) +{ + sa_sint_t m = 0; + +#if defined(_OPENMP) + #pragma omp parallel num_threads(2) if(threads > 1 && n >= 65536) +#endif + { +#if 
defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); + + fast_sint_t omp_num_threads = 1; +#endif + if (omp_num_threads == 1) + { + m = libsais16_count_and_gather_compacted_lms_suffixes_32s_2k(T, SA, n, k, buckets, 0, n); + } +#if defined(_OPENMP) + else if (omp_thread_num == 0) + { + libsais16_count_compacted_lms_suffixes_32s_2k(T, n, k, buckets); + } + else + { + m = libsais16_gather_compacted_lms_suffixes_32s(T, SA, n); + } +#endif + } + + return m; +} + +static sa_sint_t libsais16_count_and_gather_lms_suffixes_32s_4k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + sa_sint_t m; + +#if defined(_OPENMP) + sa_sint_t max_threads = (sa_sint_t)((buckets - &SA[n]) / ((4 * (fast_sint_t)k + 15) & (-16))); if (max_threads > threads) { max_threads = threads; } + if (max_threads > 1 && n >= 65536 && n / k >= 2) + { + if (max_threads > n / 16 / k) { max_threads = n / 16 / k; } + m = libsais16_count_and_gather_lms_suffixes_32s_4k_fs_omp(T, SA, n, k, buckets, max_threads > 2 ? max_threads : 2, thread_state); + } + else +#else + UNUSED(thread_state); +#endif + { + m = libsais16_count_and_gather_lms_suffixes_32s_4k_nofs_omp(T, SA, n, k, buckets, threads); + } + + return m; +} + +static sa_sint_t libsais16_count_and_gather_lms_suffixes_32s_2k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + sa_sint_t m; + +#if defined(_OPENMP) + sa_sint_t max_threads = (sa_sint_t)((buckets - &SA[n]) / ((2 * (fast_sint_t)k + 15) & (-16))); if (max_threads > threads) { max_threads = threads; } + if (max_threads > 1 && n >= 65536 && n / k >= 2) + { + if (max_threads > n / 8 / k) { max_threads = n / 8 / k; } + m = libsais16_count_and_gather_lms_suffixes_32s_2k_fs_omp(T, SA, n, k, buckets, max_threads > 2 ? max_threads : 2, thread_state); + } + else +#else + UNUSED(thread_state); +#endif + { + m = libsais16_count_and_gather_lms_suffixes_32s_2k_nofs_omp(T, SA, n, k, buckets, threads); + } + + return m; +} + +static void libsais16_count_and_gather_compacted_lms_suffixes_32s_2k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ +#if defined(_OPENMP) + sa_sint_t max_threads = (sa_sint_t)((buckets - &SA[n + n]) / ((2 * (fast_sint_t)k + 15) & (-16))); if (max_threads > threads) { max_threads = threads; } + if (max_threads > 1 && n >= 65536 && n / k >= 2) + { + if (max_threads > n / 8 / k) { max_threads = n / 8 / k; } + libsais16_count_and_gather_compacted_lms_suffixes_32s_2k_fs_omp(T, SA, n, k, buckets, max_threads > 2 ? 
max_threads : 2, thread_state); + } + else +#else + UNUSED(thread_state); +#endif + { + libsais16_count_and_gather_compacted_lms_suffixes_32s_2k_nofs_omp(T, SA, n, k, buckets, threads); + } +} + +static void libsais16_count_suffixes_32s(const sa_sint_t * RESTRICT T, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets) +{ + const fast_sint_t prefetch_distance = 32; + + memset(buckets, 0, (size_t)k * sizeof(sa_sint_t)); + + fast_sint_t i, j; + for (i = 0, j = (fast_sint_t)n - 7; i < j; i += 8) + { + libsais16_prefetch(&T[i + prefetch_distance]); + + buckets[T[i + 0]]++; + buckets[T[i + 1]]++; + buckets[T[i + 2]]++; + buckets[T[i + 3]]++; + buckets[T[i + 4]]++; + buckets[T[i + 5]]++; + buckets[T[i + 6]]++; + buckets[T[i + 7]]++; + } + + for (j += 7; i < j; i += 1) + { + buckets[T[i]]++; + } +} + +static void libsais16_initialize_buckets_start_and_end_16u(sa_sint_t * RESTRICT buckets, sa_sint_t * RESTRICT freq) +{ + sa_sint_t * RESTRICT bucket_start = &buckets[6 * ALPHABET_SIZE]; + sa_sint_t * RESTRICT bucket_end = &buckets[7 * ALPHABET_SIZE]; + + if (freq != NULL) + { + fast_sint_t i, j; sa_sint_t sum = 0; + for (i = BUCKETS_INDEX4(0, 0), j = 0; i <= BUCKETS_INDEX4(ALPHABET_SIZE - 1, 0); i += BUCKETS_INDEX4(1, 0), j += 1) + { + bucket_start[j] = sum; + sum += (freq[j] = buckets[i + BUCKETS_INDEX4(0, 0)] + buckets[i + BUCKETS_INDEX4(0, 1)] + buckets[i + BUCKETS_INDEX4(0, 2)] + buckets[i + BUCKETS_INDEX4(0, 3)]); + bucket_end[j] = sum; + } + } + else + { + fast_sint_t i, j; sa_sint_t sum = 0; + for (i = BUCKETS_INDEX4(0, 0), j = 0; i <= BUCKETS_INDEX4(ALPHABET_SIZE - 1, 0); i += BUCKETS_INDEX4(1, 0), j += 1) + { + bucket_start[j] = sum; + sum += buckets[i + BUCKETS_INDEX4(0, 0)] + buckets[i + BUCKETS_INDEX4(0, 1)] + buckets[i + BUCKETS_INDEX4(0, 2)] + buckets[i + BUCKETS_INDEX4(0, 3)]; + bucket_end[j] = sum; + } + } +} + +static void libsais16_initialize_buckets_start_and_end_32s_6k(sa_sint_t k, sa_sint_t * RESTRICT buckets) +{ + sa_sint_t * RESTRICT bucket_start = &buckets[4 * k]; + sa_sint_t * RESTRICT bucket_end = &buckets[5 * k]; + + fast_sint_t i, j; sa_sint_t sum = 0; + for (i = BUCKETS_INDEX4(0, 0), j = 0; i <= BUCKETS_INDEX4((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX4(1, 0), j += 1) + { + bucket_start[j] = sum; + sum += buckets[i + BUCKETS_INDEX4(0, 0)] + buckets[i + BUCKETS_INDEX4(0, 1)] + buckets[i + BUCKETS_INDEX4(0, 2)] + buckets[i + BUCKETS_INDEX4(0, 3)]; + bucket_end[j] = sum; + } +} + +static void libsais16_initialize_buckets_start_and_end_32s_4k(sa_sint_t k, sa_sint_t * RESTRICT buckets) +{ + sa_sint_t * RESTRICT bucket_start = &buckets[2 * k]; + sa_sint_t * RESTRICT bucket_end = &buckets[3 * k]; + + fast_sint_t i, j; sa_sint_t sum = 0; + for (i = BUCKETS_INDEX2(0, 0), j = 0; i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX2(1, 0), j += 1) + { + bucket_start[j] = sum; + sum += buckets[i + BUCKETS_INDEX2(0, 0)] + buckets[i + BUCKETS_INDEX2(0, 1)]; + bucket_end[j] = sum; + } +} + +static void libsais16_initialize_buckets_end_32s_2k(sa_sint_t k, sa_sint_t * RESTRICT buckets) +{ + fast_sint_t i; sa_sint_t sum0 = 0; + for (i = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX2(1, 0)) + { + sum0 += buckets[i + BUCKETS_INDEX2(0, 0)] + buckets[i + BUCKETS_INDEX2(0, 1)]; buckets[i + BUCKETS_INDEX2(0, 0)] = sum0; + } +} + +static void libsais16_initialize_buckets_start_and_end_32s_2k(sa_sint_t k, sa_sint_t * RESTRICT buckets) +{ + fast_sint_t i, j; + for (i = BUCKETS_INDEX2(0, 0), j = 0; i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0); i += 
BUCKETS_INDEX2(1, 0), j += 1) + { + buckets[j] = buckets[i]; + } + + buckets[k] = 0; memcpy(&buckets[k + 1], buckets, ((size_t)k - 1) * sizeof(sa_sint_t)); +} + +static void libsais16_initialize_buckets_start_32s_1k(sa_sint_t k, sa_sint_t * RESTRICT buckets) +{ + fast_sint_t i; sa_sint_t sum = 0; + for (i = 0; i <= (fast_sint_t)k - 1; i += 1) { sa_sint_t tmp = buckets[i]; buckets[i] = sum; sum += tmp; } +} + +static void libsais16_initialize_buckets_end_32s_1k(sa_sint_t k, sa_sint_t * RESTRICT buckets) +{ + fast_sint_t i; sa_sint_t sum = 0; + for (i = 0; i <= (fast_sint_t)k - 1; i += 1) { sum += buckets[i]; buckets[i] = sum; } +} + +static sa_sint_t libsais16_initialize_buckets_for_lms_suffixes_radix_sort_16u(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix) +{ + { + fast_uint_t s = 0; + fast_sint_t c0 = T[first_lms_suffix]; + fast_sint_t c1 = 0; + + for (; --first_lms_suffix >= 0; ) + { + c1 = c0; c0 = T[first_lms_suffix]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); + buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]--; + } + + buckets[BUCKETS_INDEX4((fast_uint_t)c0, (s << 1) & 3)]--; + } + + { + sa_sint_t * RESTRICT temp_bucket = &buckets[4 * ALPHABET_SIZE]; + + fast_sint_t i, j; sa_sint_t sum = 0; + for (i = BUCKETS_INDEX4(0, 0), j = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX4(ALPHABET_SIZE - 1, 0); i += BUCKETS_INDEX4(1, 0), j += BUCKETS_INDEX2(1, 0)) + { + temp_bucket[j + BUCKETS_INDEX2(0, 1)] = sum; sum += buckets[i + BUCKETS_INDEX4(0, 1)] + buckets[i + BUCKETS_INDEX4(0, 3)]; temp_bucket[j] = sum; + } + + return sum; + } +} + +static void libsais16_initialize_buckets_for_lms_suffixes_radix_sort_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix) +{ + buckets[BUCKETS_INDEX2(T[first_lms_suffix], 0)]++; + buckets[BUCKETS_INDEX2(T[first_lms_suffix], 1)]--; + + fast_sint_t i; sa_sint_t sum0 = 0, sum1 = 0; + for (i = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX2(1, 0)) + { + sum0 += buckets[i + BUCKETS_INDEX2(0, 0)] + buckets[i + BUCKETS_INDEX2(0, 1)]; + sum1 += buckets[i + BUCKETS_INDEX2(0, 1)]; + + buckets[i + BUCKETS_INDEX2(0, 0)] = sum0; + buckets[i + BUCKETS_INDEX2(0, 1)] = sum1; + } +} + +static sa_sint_t libsais16_initialize_buckets_for_lms_suffixes_radix_sort_32s_6k(const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix) +{ + { + fast_uint_t s = 0; + fast_sint_t c0 = T[first_lms_suffix]; + fast_sint_t c1 = 0; + + for (; --first_lms_suffix >= 0; ) + { + c1 = c0; c0 = T[first_lms_suffix]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); + buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]--; + } + + buckets[BUCKETS_INDEX4((fast_uint_t)c0, (s << 1) & 3)]--; + } + + { + sa_sint_t * RESTRICT temp_bucket = &buckets[4 * k]; + + fast_sint_t i, j; sa_sint_t sum = 0; + for (i = BUCKETS_INDEX4(0, 0), j = 0; i <= BUCKETS_INDEX4((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX4(1, 0), j += 1) + { + sum += buckets[i + BUCKETS_INDEX4(0, 1)] + buckets[i + BUCKETS_INDEX4(0, 3)]; temp_bucket[j] = sum; + } + + return sum; + } +} + +static void libsais16_initialize_buckets_for_radix_and_partial_sorting_32s_4k(const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix) +{ + sa_sint_t * RESTRICT bucket_start = &buckets[2 * k]; + sa_sint_t * RESTRICT bucket_end = &buckets[3 * k]; + + buckets[BUCKETS_INDEX2(T[first_lms_suffix], 0)]++; + 
buckets[BUCKETS_INDEX2(T[first_lms_suffix], 1)]--; + + fast_sint_t i, j; sa_sint_t sum0 = 0, sum1 = 0; + for (i = BUCKETS_INDEX2(0, 0), j = 0; i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX2(1, 0), j += 1) + { + bucket_start[j] = sum1; + + sum0 += buckets[i + BUCKETS_INDEX2(0, 1)]; + sum1 += buckets[i + BUCKETS_INDEX2(0, 0)] + buckets[i + BUCKETS_INDEX2(0, 1)]; + buckets[i + BUCKETS_INDEX2(0, 1)] = sum0; + + bucket_end[j] = sum1; + } +} + +static void libsais16_radix_sort_lms_suffixes_16u(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 3; i >= j; i -= 4) + { + libsais16_prefetch(&SA[i - 2 * prefetch_distance]); + + libsais16_prefetch(&T[SA[i - prefetch_distance - 0]]); + libsais16_prefetch(&T[SA[i - prefetch_distance - 1]]); + libsais16_prefetch(&T[SA[i - prefetch_distance - 2]]); + libsais16_prefetch(&T[SA[i - prefetch_distance - 3]]); + + sa_sint_t p0 = SA[i - 0]; SA[--induction_bucket[BUCKETS_INDEX2(T[p0], 0)]] = p0; + sa_sint_t p1 = SA[i - 1]; SA[--induction_bucket[BUCKETS_INDEX2(T[p1], 0)]] = p1; + sa_sint_t p2 = SA[i - 2]; SA[--induction_bucket[BUCKETS_INDEX2(T[p2], 0)]] = p2; + sa_sint_t p3 = SA[i - 3]; SA[--induction_bucket[BUCKETS_INDEX2(T[p3], 0)]] = p3; + } + + for (j -= prefetch_distance + 3; i >= j; i -= 1) + { + sa_sint_t p = SA[i]; SA[--induction_bucket[BUCKETS_INDEX2(T[p], 0)]] = p; + } +} + +static void libsais16_radix_sort_lms_suffixes_16u_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536 && m >= 65536 && omp_get_dynamic() == 0) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(thread_state); + + fast_sint_t omp_num_threads = 1; +#endif + if (omp_num_threads == 1) + { + libsais16_radix_sort_lms_suffixes_16u(T, SA, &buckets[4 * ALPHABET_SIZE], (fast_sint_t)n - (fast_sint_t)m + 1, (fast_sint_t)m - 1); + } +#if defined(_OPENMP) + else + { + { + sa_sint_t * RESTRICT src_bucket = &buckets[4 * ALPHABET_SIZE]; + sa_sint_t * RESTRICT dst_bucket = thread_state[omp_thread_num].state.buckets; + + fast_sint_t i, j; + for (i = BUCKETS_INDEX2(0, 0), j = BUCKETS_INDEX4(0, 1); i <= BUCKETS_INDEX2(ALPHABET_SIZE - 1, 0); i += BUCKETS_INDEX2(1, 0), j += BUCKETS_INDEX4(1, 0)) + { + dst_bucket[i] = src_bucket[i] - dst_bucket[j]; + } + } + + { + fast_sint_t t, omp_block_start = 0, omp_block_size = thread_state[omp_thread_num].state.m; + for (t = omp_num_threads - 1; t >= omp_thread_num; --t) omp_block_start += thread_state[t].state.m; + + if (omp_block_start == (fast_sint_t)m && omp_block_size > 0) + { + omp_block_start -= 1; omp_block_size -= 1; + } + + libsais16_radix_sort_lms_suffixes_16u(T, SA, thread_state[omp_thread_num].state.buckets, (fast_sint_t)n - omp_block_start, omp_block_size); + } + } +#endif + } +} + +static void libsais16_radix_sort_lms_suffixes_32s_6k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + 
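+    /* Right-to-left pass over the gathered LMS-suffix positions: each position p is
+       written to the current end of its leading-symbol bucket (--induction_bucket[T[p]]),
+       i.e. a counting/radix sort of the LMS suffixes by their first symbol. */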
fast_sint_t i, j; + for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 3; i >= j; i -= 4) + { + libsais16_prefetch(&SA[i - 3 * prefetch_distance]); + + libsais16_prefetch(&T[SA[i - 2 * prefetch_distance - 0]]); + libsais16_prefetch(&T[SA[i - 2 * prefetch_distance - 1]]); + libsais16_prefetch(&T[SA[i - 2 * prefetch_distance - 2]]); + libsais16_prefetch(&T[SA[i - 2 * prefetch_distance - 3]]); + + libsais16_prefetchw(&induction_bucket[T[SA[i - prefetch_distance - 0]]]); + libsais16_prefetchw(&induction_bucket[T[SA[i - prefetch_distance - 1]]]); + libsais16_prefetchw(&induction_bucket[T[SA[i - prefetch_distance - 2]]]); + libsais16_prefetchw(&induction_bucket[T[SA[i - prefetch_distance - 3]]]); + + sa_sint_t p0 = SA[i - 0]; SA[--induction_bucket[T[p0]]] = p0; + sa_sint_t p1 = SA[i - 1]; SA[--induction_bucket[T[p1]]] = p1; + sa_sint_t p2 = SA[i - 2]; SA[--induction_bucket[T[p2]]] = p2; + sa_sint_t p3 = SA[i - 3]; SA[--induction_bucket[T[p3]]] = p3; + } + + for (j -= 2 * prefetch_distance + 3; i >= j; i -= 1) + { + sa_sint_t p = SA[i]; SA[--induction_bucket[T[p]]] = p; + } +} + +static void libsais16_radix_sort_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 3; i >= j; i -= 4) + { + libsais16_prefetch(&SA[i - 3 * prefetch_distance]); + + libsais16_prefetch(&T[SA[i - 2 * prefetch_distance - 0]]); + libsais16_prefetch(&T[SA[i - 2 * prefetch_distance - 1]]); + libsais16_prefetch(&T[SA[i - 2 * prefetch_distance - 2]]); + libsais16_prefetch(&T[SA[i - 2 * prefetch_distance - 3]]); + + libsais16_prefetchw(&induction_bucket[BUCKETS_INDEX2(T[SA[i - prefetch_distance - 0]], 0)]); + libsais16_prefetchw(&induction_bucket[BUCKETS_INDEX2(T[SA[i - prefetch_distance - 1]], 0)]); + libsais16_prefetchw(&induction_bucket[BUCKETS_INDEX2(T[SA[i - prefetch_distance - 2]], 0)]); + libsais16_prefetchw(&induction_bucket[BUCKETS_INDEX2(T[SA[i - prefetch_distance - 3]], 0)]); + + sa_sint_t p0 = SA[i - 0]; SA[--induction_bucket[BUCKETS_INDEX2(T[p0], 0)]] = p0; + sa_sint_t p1 = SA[i - 1]; SA[--induction_bucket[BUCKETS_INDEX2(T[p1], 0)]] = p1; + sa_sint_t p2 = SA[i - 2]; SA[--induction_bucket[BUCKETS_INDEX2(T[p2], 0)]] = p2; + sa_sint_t p3 = SA[i - 3]; SA[--induction_bucket[BUCKETS_INDEX2(T[p3], 0)]] = p3; + } + + for (j -= 2 * prefetch_distance + 3; i >= j; i -= 1) + { + sa_sint_t p = SA[i]; SA[--induction_bucket[BUCKETS_INDEX2(T[p], 0)]] = p; + } +} + +#if defined(_OPENMP) + +static void libsais16_radix_sort_lms_suffixes_32s_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4) + { + libsais16_prefetch(&SA[i + 2 * prefetch_distance]); + + libsais16_prefetch(&T[SA[i + prefetch_distance + 0]]); + libsais16_prefetch(&T[SA[i + prefetch_distance + 1]]); + libsais16_prefetch(&T[SA[i + prefetch_distance + 2]]); + libsais16_prefetch(&T[SA[i + prefetch_distance + 3]]); + + libsais16_prefetchw(&cache[i + prefetch_distance]); + + cache[i + 0].symbol = T[cache[i + 0].index = SA[i + 0]]; + cache[i + 1].symbol = T[cache[i + 1].index = 
SA[i + 1]]; + cache[i + 2].symbol = T[cache[i + 2].index = SA[i + 2]]; + cache[i + 3].symbol = T[cache[i + 3].index = SA[i + 3]]; + } + + for (j += prefetch_distance + 3; i < j; i += 1) + { + cache[i].symbol = T[cache[i].index = SA[i]]; + } +} + +static void libsais16_radix_sort_lms_suffixes_32s_6k_block_sort(sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 3; i >= j; i -= 4) + { + libsais16_prefetchw(&cache[i - 2 * prefetch_distance]); + + libsais16_prefetchw(&induction_bucket[cache[i - prefetch_distance - 0].symbol]); + libsais16_prefetchw(&induction_bucket[cache[i - prefetch_distance - 1].symbol]); + libsais16_prefetchw(&induction_bucket[cache[i - prefetch_distance - 2].symbol]); + libsais16_prefetchw(&induction_bucket[cache[i - prefetch_distance - 3].symbol]); + + cache[i - 0].symbol = --induction_bucket[cache[i - 0].symbol]; + cache[i - 1].symbol = --induction_bucket[cache[i - 1].symbol]; + cache[i - 2].symbol = --induction_bucket[cache[i - 2].symbol]; + cache[i - 3].symbol = --induction_bucket[cache[i - 3].symbol]; + } + + for (j -= prefetch_distance + 3; i >= j; i -= 1) + { + cache[i].symbol = --induction_bucket[cache[i].symbol]; + } +} + +static void libsais16_radix_sort_lms_suffixes_32s_2k_block_sort(sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 3; i >= j; i -= 4) + { + libsais16_prefetchw(&cache[i - 2 * prefetch_distance]); + + libsais16_prefetchw(&induction_bucket[BUCKETS_INDEX2(cache[i - prefetch_distance - 0].symbol, 0)]); + libsais16_prefetchw(&induction_bucket[BUCKETS_INDEX2(cache[i - prefetch_distance - 1].symbol, 0)]); + libsais16_prefetchw(&induction_bucket[BUCKETS_INDEX2(cache[i - prefetch_distance - 2].symbol, 0)]); + libsais16_prefetchw(&induction_bucket[BUCKETS_INDEX2(cache[i - prefetch_distance - 3].symbol, 0)]); + + cache[i - 0].symbol = --induction_bucket[BUCKETS_INDEX2(cache[i - 0].symbol, 0)]; + cache[i - 1].symbol = --induction_bucket[BUCKETS_INDEX2(cache[i - 1].symbol, 0)]; + cache[i - 2].symbol = --induction_bucket[BUCKETS_INDEX2(cache[i - 2].symbol, 0)]; + cache[i - 3].symbol = --induction_bucket[BUCKETS_INDEX2(cache[i - 3].symbol, 0)]; + } + + for (j -= prefetch_distance + 3; i >= j; i -= 1) + { + cache[i].symbol = --induction_bucket[BUCKETS_INDEX2(cache[i].symbol, 0)]; + } +} + +static void libsais16_radix_sort_lms_suffixes_32s_6k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(cache); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = 
omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start; + + omp_block_start += block_start; + + if (omp_num_threads == 1) + { + libsais16_radix_sort_lms_suffixes_32s_6k(T, SA, induction_bucket, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + libsais16_radix_sort_lms_suffixes_32s_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size); + } + + #pragma omp barrier + + #pragma omp master + { + libsais16_radix_sort_lms_suffixes_32s_6k_block_sort(induction_bucket, cache - block_start, block_start, block_size); + } + + #pragma omp barrier + + { + libsais16_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size); + } + } +#endif + } +} + +static void libsais16_radix_sort_lms_suffixes_32s_2k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(cache); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start; + + omp_block_start += block_start; + + if (omp_num_threads == 1) + { + libsais16_radix_sort_lms_suffixes_32s_2k(T, SA, induction_bucket, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + libsais16_radix_sort_lms_suffixes_32s_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size); + } + + #pragma omp barrier + + #pragma omp master + { + libsais16_radix_sort_lms_suffixes_32s_2k_block_sort(induction_bucket, cache - block_start, block_start, block_size); + } + + #pragma omp barrier + + { + libsais16_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size); + } + } +#endif + } +} + +#endif + +static void libsais16_radix_sort_lms_suffixes_32s_6k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + if (threads == 1 || m < 65536) + { + libsais16_radix_sort_lms_suffixes_32s_6k(T, SA, induction_bucket, (fast_sint_t)n - (fast_sint_t)m + 1, (fast_sint_t)m - 1); + } +#if defined(_OPENMP) + else + { + fast_sint_t block_start, block_end; + for (block_start = 0; block_start < (fast_sint_t)m - 1; block_start = block_end) + { + block_end = block_start + (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end >= m) { block_end = (fast_sint_t)m - 1; } + + libsais16_radix_sort_lms_suffixes_32s_6k_block_omp(T, SA, induction_bucket, thread_state[0].state.cache, (fast_sint_t)n - block_end, block_end - block_start, threads); + } + } +#else + UNUSED(thread_state); +#endif +} + +static void libsais16_radix_sort_lms_suffixes_32s_2k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + if (threads == 1 || m < 65536) + { + 
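/* sequential fallback: scan the m - 1 LMS positions stored at SA[n - m + 1 .. n - 1] from right to left and drop each one into its symbol's 2k induction bucket, filling each bucket from its end */ + 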
libsais16_radix_sort_lms_suffixes_32s_2k(T, SA, induction_bucket, (fast_sint_t)n - (fast_sint_t)m + 1, (fast_sint_t)m - 1); + } +#if defined(_OPENMP) + else + { + fast_sint_t block_start, block_end; + for (block_start = 0; block_start < (fast_sint_t)m - 1; block_start = block_end) + { + block_end = block_start + (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end >= m) { block_end = (fast_sint_t)m - 1; } + + libsais16_radix_sort_lms_suffixes_32s_2k_block_omp(T, SA, induction_bucket, thread_state[0].state.cache, (fast_sint_t)n - block_end, block_end - block_start, threads); + } + } +#else + UNUSED(thread_state); +#endif +} + +static sa_sint_t libsais16_radix_sort_lms_suffixes_32s_1k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets) +{ + const fast_sint_t prefetch_distance = 32; + + sa_sint_t i = n - 2; + sa_sint_t m = 0; + fast_uint_t s = 1; + fast_sint_t c0 = T[n - 1]; + fast_sint_t c1 = 0; + fast_sint_t c2 = 0; + + for (; i >= prefetch_distance + 3; i -= 4) + { + libsais16_prefetch(&T[i - 2 * prefetch_distance]); + + libsais16_prefetchw(&buckets[T[i - prefetch_distance - 0]]); + libsais16_prefetchw(&buckets[T[i - prefetch_distance - 1]]); + libsais16_prefetchw(&buckets[T[i - prefetch_distance - 2]]); + libsais16_prefetchw(&buckets[T[i - prefetch_distance - 3]]); + + c1 = T[i - 0]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); + if ((s & 3) == 1) { SA[--buckets[c2 = c0]] = i + 1; m++; } + + c0 = T[i - 1]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); + if ((s & 3) == 1) { SA[--buckets[c2 = c1]] = i - 0; m++; } + + c1 = T[i - 2]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); + if ((s & 3) == 1) { SA[--buckets[c2 = c0]] = i - 1; m++; } + + c0 = T[i - 3]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); + if ((s & 3) == 1) { SA[--buckets[c2 = c1]] = i - 2; m++; } + } + + for (; i >= 0; i -= 1) + { + c1 = c0; c0 = T[i]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); + if ((s & 3) == 1) { SA[--buckets[c2 = c1]] = i + 1; m++; } + } + + if (m > 1) + { + SA[buckets[c2]] = 0; + } + + return m; +} + +static void libsais16_radix_sort_set_markers_32s_6k(sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4) + { + libsais16_prefetch(&induction_bucket[i + 2 * prefetch_distance]); + + libsais16_prefetchw(&SA[induction_bucket[i + prefetch_distance + 0]]); + libsais16_prefetchw(&SA[induction_bucket[i + prefetch_distance + 1]]); + libsais16_prefetchw(&SA[induction_bucket[i + prefetch_distance + 2]]); + libsais16_prefetchw(&SA[induction_bucket[i + prefetch_distance + 3]]); + + SA[induction_bucket[i + 0]] |= SAINT_MIN; + SA[induction_bucket[i + 1]] |= SAINT_MIN; + SA[induction_bucket[i + 2]] |= SAINT_MIN; + SA[induction_bucket[i + 3]] |= SAINT_MIN; + } + + for (j += prefetch_distance + 3; i < j; i += 1) + { + SA[induction_bucket[i]] |= SAINT_MIN; + } +} + +static void libsais16_radix_sort_set_markers_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4) + { + 
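/* prefetch upcoming bucket entries and their SA slots, then set SUFFIX_GROUP_MARKER on the SA positions addressed by buckets i .. i + 3 */ + 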
libsais16_prefetch(&induction_bucket[BUCKETS_INDEX2(i + 2 * prefetch_distance, 0)]); + + libsais16_prefetchw(&SA[induction_bucket[BUCKETS_INDEX2(i + prefetch_distance + 0, 0)]]); + libsais16_prefetchw(&SA[induction_bucket[BUCKETS_INDEX2(i + prefetch_distance + 1, 0)]]); + libsais16_prefetchw(&SA[induction_bucket[BUCKETS_INDEX2(i + prefetch_distance + 2, 0)]]); + libsais16_prefetchw(&SA[induction_bucket[BUCKETS_INDEX2(i + prefetch_distance + 3, 0)]]); + + SA[induction_bucket[BUCKETS_INDEX2(i + 0, 0)]] |= SUFFIX_GROUP_MARKER; + SA[induction_bucket[BUCKETS_INDEX2(i + 1, 0)]] |= SUFFIX_GROUP_MARKER; + SA[induction_bucket[BUCKETS_INDEX2(i + 2, 0)]] |= SUFFIX_GROUP_MARKER; + SA[induction_bucket[BUCKETS_INDEX2(i + 3, 0)]] |= SUFFIX_GROUP_MARKER; + } + + for (j += prefetch_distance + 3; i < j; i += 1) + { + SA[induction_bucket[BUCKETS_INDEX2(i, 0)]] |= SUFFIX_GROUP_MARKER; + } +} + +static void libsais16_radix_sort_set_markers_32s_6k_omp(sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && k >= 65536) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); + fast_sint_t omp_block_stride = (((fast_sint_t)k - 1) / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : (fast_sint_t)k - 1 - omp_block_start; +#else + UNUSED(threads); + + fast_sint_t omp_block_start = 0; + fast_sint_t omp_block_size = (fast_sint_t)k - 1; +#endif + + libsais16_radix_sort_set_markers_32s_6k(SA, induction_bucket, omp_block_start, omp_block_size); + } +} + +static void libsais16_radix_sort_set_markers_32s_4k_omp(sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && k >= 65536) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); + fast_sint_t omp_block_stride = (((fast_sint_t)k - 1) / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : (fast_sint_t)k - 1 - omp_block_start; +#else + UNUSED(threads); + + fast_sint_t omp_block_start = 0; + fast_sint_t omp_block_size = (fast_sint_t)k - 1; +#endif + + libsais16_radix_sort_set_markers_32s_4k(SA, induction_bucket, omp_block_start, omp_block_size); + } +} + +static void libsais16_initialize_buckets_for_partial_sorting_16u(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count) +{ + sa_sint_t * RESTRICT temp_bucket = &buckets[4 * ALPHABET_SIZE]; + + buckets[BUCKETS_INDEX4((fast_uint_t)T[first_lms_suffix], 1)]++; + + fast_sint_t i, j; sa_sint_t sum0 = left_suffixes_count + 1, sum1 = 0; + for (i = BUCKETS_INDEX4(0, 0), j = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX4(ALPHABET_SIZE - 1, 0); i += BUCKETS_INDEX4(1, 0), j += BUCKETS_INDEX2(1, 0)) + { + temp_bucket[j + BUCKETS_INDEX2(0, 0)] = sum0; + + sum0 += buckets[i + BUCKETS_INDEX4(0, 0)] + buckets[i + BUCKETS_INDEX4(0, 2)]; + sum1 += buckets[i + BUCKETS_INDEX4(0, 1)]; + + buckets[j + BUCKETS_INDEX2(0, 0)] = sum0; + buckets[j + BUCKETS_INDEX2(0, 1)] = sum1; + } +} + +static void libsais16_initialize_buckets_for_partial_sorting_32s_6k(const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count) +{ + sa_sint_t * RESTRICT temp_bucket = &buckets[4 * k]; + + fast_sint_t i, j; sa_sint_t sum0 = left_suffixes_count + 1, sum1 = 0, sum2 = 0; + for (first_lms_suffix = T[first_lms_suffix], i = BUCKETS_INDEX4(0, 0), j = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX4((fast_sint_t)first_lms_suffix - 1, 0); i += BUCKETS_INDEX4(1, 0), j += BUCKETS_INDEX2(1, 0)) + { + sa_sint_t SS = buckets[i + BUCKETS_INDEX4(0, 0)]; + sa_sint_t LS = buckets[i + BUCKETS_INDEX4(0, 1)]; + sa_sint_t SL = buckets[i + BUCKETS_INDEX4(0, 2)]; + sa_sint_t LL = buckets[i + BUCKETS_INDEX4(0, 3)]; + + buckets[i + BUCKETS_INDEX4(0, 0)] = sum0; + buckets[i + BUCKETS_INDEX4(0, 1)] = sum2; + buckets[i + BUCKETS_INDEX4(0, 2)] = 0; + buckets[i + BUCKETS_INDEX4(0, 3)] = 0; + + sum0 += SS + SL; sum1 += LS; sum2 += LS + LL; + + temp_bucket[j + BUCKETS_INDEX2(0, 0)] = sum0; + temp_bucket[j + BUCKETS_INDEX2(0, 1)] = sum1; + } + + for (sum1 += 1; i <= BUCKETS_INDEX4((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX4(1, 0), j += BUCKETS_INDEX2(1, 0)) + { + sa_sint_t SS = buckets[i + BUCKETS_INDEX4(0, 0)]; + sa_sint_t LS = buckets[i + BUCKETS_INDEX4(0, 1)]; + sa_sint_t SL = buckets[i + BUCKETS_INDEX4(0, 2)]; + sa_sint_t LL = buckets[i + BUCKETS_INDEX4(0, 3)]; + + buckets[i + BUCKETS_INDEX4(0, 0)] = sum0; + buckets[i + BUCKETS_INDEX4(0, 1)] = sum2; + buckets[i + BUCKETS_INDEX4(0, 2)] = 0; + buckets[i + BUCKETS_INDEX4(0, 3)] = 0; + + sum0 += SS + SL; sum1 += LS; sum2 += LS + LL; + + temp_bucket[j + BUCKETS_INDEX2(0, 0)] = sum0; + temp_bucket[j + BUCKETS_INDEX2(0, 1)] = sum1; + } +} + +static sa_sint_t libsais16_partial_sorting_scan_left_to_right_16u(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + sa_sint_t * RESTRICT induction_bucket = &buckets[4 * ALPHABET_SIZE]; + sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE]; + + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) + { + libsais16_prefetch(&SA[i + 2 * prefetch_distance]); + + libsais16_prefetch(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 1); + 
libsais16_prefetch(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 2); + libsais16_prefetch(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 1); + libsais16_prefetch(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 2); + + sa_sint_t p0 = SA[i + 0]; d += (p0 < 0); p0 &= SAINT_MAX; sa_sint_t v0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] >= T[p0 - 1]); + SA[induction_bucket[v0]++] = (p0 - 1) | ((sa_sint_t)(distinct_names[v0] != d) << (SAINT_BIT - 1)); distinct_names[v0] = d; + + sa_sint_t p1 = SA[i + 1]; d += (p1 < 0); p1 &= SAINT_MAX; sa_sint_t v1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] >= T[p1 - 1]); + SA[induction_bucket[v1]++] = (p1 - 1) | ((sa_sint_t)(distinct_names[v1] != d) << (SAINT_BIT - 1)); distinct_names[v1] = d; + } + + for (j += prefetch_distance + 1; i < j; i += 1) + { + sa_sint_t p = SA[i]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] >= T[p - 1]); + SA[induction_bucket[v]++] = (p - 1) | ((sa_sint_t)(distinct_names[v] != d) << (SAINT_BIT - 1)); distinct_names[v] = d; + } + + return d; +} + +#if defined(_OPENMP) + +static void libsais16_partial_sorting_scan_left_to_right_16u_block_prepare(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size, LIBSAIS_THREAD_STATE * RESTRICT state) +{ + const fast_sint_t prefetch_distance = 32; + + sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE]; + sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE]; + + memset(buckets, 0, 4 * ALPHABET_SIZE * sizeof(sa_sint_t)); + + fast_sint_t i, j, count = 0; sa_sint_t d = 1; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) + { + libsais16_prefetch(&SA[i + 2 * prefetch_distance]); + + libsais16_prefetch(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 1); + libsais16_prefetch(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 2); + libsais16_prefetch(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 1); + libsais16_prefetch(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 2); + + sa_sint_t p0 = cache[count].index = SA[i + 0]; d += (p0 < 0); p0 &= SAINT_MAX; sa_sint_t v0 = cache[count++].symbol = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] >= T[p0 - 1]); induction_bucket[v0]++; distinct_names[v0] = d; + sa_sint_t p1 = cache[count].index = SA[i + 1]; d += (p1 < 0); p1 &= SAINT_MAX; sa_sint_t v1 = cache[count++].symbol = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] >= T[p1 - 1]); induction_bucket[v1]++; distinct_names[v1] = d; + } + + for (j += prefetch_distance + 1; i < j; i += 1) + { + sa_sint_t p = cache[count].index = SA[i]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = cache[count++].symbol = BUCKETS_INDEX2(T[p - 1], T[p - 2] >= T[p - 1]); induction_bucket[v]++; distinct_names[v] = d; + } + + state[0].state.position = (fast_sint_t)d - 1; + state[0].state.count = count; +} + +static void libsais16_partial_sorting_scan_left_to_right_16u_block_place(sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count, sa_sint_t d) +{ + const fast_sint_t prefetch_distance = 32; + + sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE]; + sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE]; + + fast_sint_t i, j; + for (i = 0, j = count - 1; i < j; i += 2) + { + libsais16_prefetch(&cache[i + prefetch_distance]); + + sa_sint_t p0 = cache[i + 0].index; d += (p0 < 0); sa_sint_t v0 = cache[i + 0].symbol; + SA[induction_bucket[v0]++] = (p0 
- 1) | ((sa_sint_t)(distinct_names[v0] != d) << (SAINT_BIT - 1)); distinct_names[v0] = d; + + sa_sint_t p1 = cache[i + 1].index; d += (p1 < 0); sa_sint_t v1 = cache[i + 1].symbol; + SA[induction_bucket[v1]++] = (p1 - 1) | ((sa_sint_t)(distinct_names[v1] != d) << (SAINT_BIT - 1)); distinct_names[v1] = d; + } + + for (j += 1; i < j; i += 1) + { + sa_sint_t p = cache[i].index; d += (p < 0); sa_sint_t v = cache[i].symbol; + SA[induction_bucket[v]++] = (p - 1) | ((sa_sint_t)(distinct_names[v] != d) << (SAINT_BIT - 1)); distinct_names[v] = d; + } +} + +static sa_sint_t libsais16_partial_sorting_scan_left_to_right_16u_block_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * ALPHABET_SIZE && omp_get_dynamic() == 0) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(thread_state); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start; + + omp_block_start += block_start; + + if (omp_num_threads == 1) + { + d = libsais16_partial_sorting_scan_left_to_right_16u(T, SA, buckets, d, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + libsais16_partial_sorting_scan_left_to_right_16u_block_prepare(T, SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size, &thread_state[omp_thread_num]); + } + + #pragma omp barrier + + #pragma omp master + { + sa_sint_t * RESTRICT induction_bucket = &buckets[4 * ALPHABET_SIZE]; + sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE]; + + fast_sint_t t; + for (t = 0; t < omp_num_threads; ++t) + { + sa_sint_t * RESTRICT temp_induction_bucket = &thread_state[t].state.buckets[0 * ALPHABET_SIZE]; + sa_sint_t * RESTRICT temp_distinct_names = &thread_state[t].state.buckets[2 * ALPHABET_SIZE]; + + fast_sint_t c; + for (c = 0; c < 2 * ALPHABET_SIZE; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_induction_bucket[c]; induction_bucket[c] = A + B; temp_induction_bucket[c] = A; } + + for (d -= 1, c = 0; c < 2 * ALPHABET_SIZE; c += 1) { sa_sint_t A = distinct_names[c], B = temp_distinct_names[c], D = B + d; distinct_names[c] = B > 0 ? 
D : A; temp_distinct_names[c] = A; } + d += 1 + (sa_sint_t)thread_state[t].state.position; thread_state[t].state.position = (fast_sint_t)d - thread_state[t].state.position; + } + } + + #pragma omp barrier + + { + libsais16_partial_sorting_scan_left_to_right_16u_block_place(SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count, (sa_sint_t)thread_state[omp_thread_num].state.position); + } + } +#endif + } + + return d; +} + +#endif + +static sa_sint_t libsais16_partial_sorting_scan_left_to_right_16u_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t left_suffixes_count, sa_sint_t d, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + sa_sint_t * RESTRICT induction_bucket = &buckets[4 * ALPHABET_SIZE]; + sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE]; + + SA[induction_bucket[BUCKETS_INDEX2(T[n - 1], T[n - 2] >= T[n - 1])]++] = (n - 1) | SAINT_MIN; + distinct_names[BUCKETS_INDEX2(T[n - 1], T[n - 2] >= T[n - 1])] = ++d; + + if (threads == 1 || left_suffixes_count < 65536) + { + d = libsais16_partial_sorting_scan_left_to_right_16u(T, SA, buckets, d, 0, left_suffixes_count); + } +#if defined(_OPENMP) + else + { + fast_sint_t block_start; + for (block_start = 0; block_start < left_suffixes_count; ) + { + if (SA[block_start] == 0) + { + block_start++; + } + else + { + fast_sint_t block_max_end = block_start + ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads); if (block_max_end > left_suffixes_count) { block_max_end = left_suffixes_count;} + fast_sint_t block_end = block_start + 1; while (block_end < block_max_end && SA[block_end] != 0) { block_end++; } + fast_sint_t block_size = block_end - block_start; + + if (block_size < 32) + { + for (; block_start < block_end; block_start += 1) + { + sa_sint_t p = SA[block_start]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] >= T[p - 1]); + SA[induction_bucket[v]++] = (p - 1) | ((sa_sint_t)(distinct_names[v] != d) << (SAINT_BIT - 1)); distinct_names[v] = d; + } + } + else + { + d = libsais16_partial_sorting_scan_left_to_right_16u_block_omp(T, SA, buckets, d, block_start, block_size, threads, thread_state); + block_start = block_end; + } + } + } + } +#else + UNUSED(thread_state); +#endif + + return d; +} + +static sa_sint_t libsais16_partial_sorting_scan_left_to_right_32s_6k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1; i < j; i += 2) + { + libsais16_prefetch(&SA[i + 3 * prefetch_distance]); + + libsais16_prefetch(&T[SA[i + 2 * prefetch_distance + 0] & SAINT_MAX] - 1); + libsais16_prefetch(&T[SA[i + 2 * prefetch_distance + 0] & SAINT_MAX] - 2); + libsais16_prefetch(&T[SA[i + 2 * prefetch_distance + 1] & SAINT_MAX] - 1); + libsais16_prefetch(&T[SA[i + 2 * prefetch_distance + 1] & SAINT_MAX] - 2); + + sa_sint_t p0 = SA[i + prefetch_distance + 0] & SAINT_MAX; sa_sint_t v0 = BUCKETS_INDEX4(T[p0 - (p0 > 0)], 0); libsais16_prefetchw(&buckets[v0]); + sa_sint_t p1 = SA[i + prefetch_distance + 1] & SAINT_MAX; sa_sint_t v1 = BUCKETS_INDEX4(T[p1 - (p1 > 0)], 0); libsais16_prefetchw(&buckets[v1]); + + sa_sint_t p2 = SA[i + 0]; d += (p2 < 0); p2 &= SAINT_MAX; sa_sint_t 
v2 = BUCKETS_INDEX4(T[p2 - 1], T[p2 - 2] >= T[p2 - 1]); + SA[buckets[v2]++] = (p2 - 1) | ((sa_sint_t)(buckets[2 + v2] != d) << (SAINT_BIT - 1)); buckets[2 + v2] = d; + + sa_sint_t p3 = SA[i + 1]; d += (p3 < 0); p3 &= SAINT_MAX; sa_sint_t v3 = BUCKETS_INDEX4(T[p3 - 1], T[p3 - 2] >= T[p3 - 1]); + SA[buckets[v3]++] = (p3 - 1) | ((sa_sint_t)(buckets[2 + v3] != d) << (SAINT_BIT - 1)); buckets[2 + v3] = d; + } + + for (j += 2 * prefetch_distance + 1; i < j; i += 1) + { + sa_sint_t p = SA[i]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = BUCKETS_INDEX4(T[p - 1], T[p - 2] >= T[p - 1]); + SA[buckets[v]++] = (p - 1) | ((sa_sint_t)(buckets[2 + v] != d) << (SAINT_BIT - 1)); buckets[2 + v] = d; + } + + return d; +} + +static sa_sint_t libsais16_partial_sorting_scan_left_to_right_32s_4k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + sa_sint_t * RESTRICT induction_bucket = &buckets[2 * k]; + sa_sint_t * RESTRICT distinct_names = &buckets[0 * k]; + + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1; i < j; i += 2) + { + libsais16_prefetchw(&SA[i + 3 * prefetch_distance]); + + sa_sint_t s0 = SA[i + 2 * prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0 & ~SUFFIX_GROUP_MARKER] - 1; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i + 2 * prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1 & ~SUFFIX_GROUP_MARKER] - 1; libsais16_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16_prefetch(s1 > 0 ? Ts1 : NULL); + sa_sint_t s2 = SA[i + 1 * prefetch_distance + 0]; if (s2 > 0) { const fast_sint_t Ts2 = T[(s2 & ~SUFFIX_GROUP_MARKER) - 1]; libsais16_prefetchw(&induction_bucket[Ts2]); libsais16_prefetchw(&distinct_names[BUCKETS_INDEX2(Ts2, 0)]); } + sa_sint_t s3 = SA[i + 1 * prefetch_distance + 1]; if (s3 > 0) { const fast_sint_t Ts3 = T[(s3 & ~SUFFIX_GROUP_MARKER) - 1]; libsais16_prefetchw(&induction_bucket[Ts3]); libsais16_prefetchw(&distinct_names[BUCKETS_INDEX2(Ts3, 0)]); } + + sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 & SAINT_MAX; + if (p0 > 0) + { + SA[i + 0] = 0; d += (p0 >> (SUFFIX_GROUP_BIT - 1)); p0 &= ~SUFFIX_GROUP_MARKER; sa_sint_t v0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] < T[p0 - 1]); + SA[induction_bucket[T[p0 - 1]]++] = (p0 - 1) | ((sa_sint_t)(T[p0 - 2] < T[p0 - 1]) << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v0] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v0] = d; + } + + sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 & SAINT_MAX; + if (p1 > 0) + { + SA[i + 1] = 0; d += (p1 >> (SUFFIX_GROUP_BIT - 1)); p1 &= ~SUFFIX_GROUP_MARKER; sa_sint_t v1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] < T[p1 - 1]); + SA[induction_bucket[T[p1 - 1]]++] = (p1 - 1) | ((sa_sint_t)(T[p1 - 2] < T[p1 - 1]) << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v1] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v1] = d; + } + } + + for (j += 2 * prefetch_distance + 1; i < j; i += 1) + { + sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; + if (p > 0) + { + SA[i] = 0; d += (p >> (SUFFIX_GROUP_BIT - 1)); p &= ~SUFFIX_GROUP_MARKER; sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] < T[p - 1]); + SA[induction_bucket[T[p - 1]]++] = (p - 1) | ((sa_sint_t)(T[p - 2] < T[p - 1]) << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v] = d; + } + } + + return d; +} + +static void 
libsais16_partial_sorting_scan_left_to_right_32s_1k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1; i < j; i += 2) + { + libsais16_prefetchw(&SA[i + 3 * prefetch_distance]); + + sa_sint_t s0 = SA[i + 2 * prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0] - 1; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i + 2 * prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1] - 1; libsais16_prefetch(s1 > 0 ? Ts1 : NULL); + sa_sint_t s2 = SA[i + 1 * prefetch_distance + 0]; if (s2 > 0) { libsais16_prefetchw(&induction_bucket[T[s2 - 1]]); libsais16_prefetch(&T[s2] - 2); } + sa_sint_t s3 = SA[i + 1 * prefetch_distance + 1]; if (s3 > 0) { libsais16_prefetchw(&induction_bucket[T[s3 - 1]]); libsais16_prefetch(&T[s3] - 2); } + + sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 & SAINT_MAX; if (p0 > 0) { SA[i + 0] = 0; SA[induction_bucket[T[p0 - 1]]++] = (p0 - 1) | ((sa_sint_t)(T[p0 - 2] < T[p0 - 1]) << (SAINT_BIT - 1)); } + sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 & SAINT_MAX; if (p1 > 0) { SA[i + 1] = 0; SA[induction_bucket[T[p1 - 1]]++] = (p1 - 1) | ((sa_sint_t)(T[p1 - 2] < T[p1 - 1]) << (SAINT_BIT - 1)); } + } + + for (j += 2 * prefetch_distance + 1; i < j; i += 1) + { + sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { SA[i] = 0; SA[induction_bucket[T[p - 1]]++] = (p - 1) | ((sa_sint_t)(T[p - 2] < T[p - 1]) << (SAINT_BIT - 1)); } + } +} + +#if defined(_OPENMP) + +static void libsais16_partial_sorting_scan_left_to_right_32s_6k_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) + { + libsais16_prefetch(&SA[i + 2 * prefetch_distance]); + + libsais16_prefetch(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 1); + libsais16_prefetch(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 2); + libsais16_prefetch(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 1); + libsais16_prefetch(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 2); + + libsais16_prefetchw(&cache[i + prefetch_distance]); + + sa_sint_t p0 = cache[i + 0].index = SA[i + 0]; sa_sint_t symbol0 = 0; p0 &= SAINT_MAX; if (p0 != 0) { symbol0 = BUCKETS_INDEX4(T[p0 - 1], T[p0 - 2] >= T[p0 - 1]); } cache[i + 0].symbol = symbol0; + sa_sint_t p1 = cache[i + 1].index = SA[i + 1]; sa_sint_t symbol1 = 0; p1 &= SAINT_MAX; if (p1 != 0) { symbol1 = BUCKETS_INDEX4(T[p1 - 1], T[p1 - 2] >= T[p1 - 1]); } cache[i + 1].symbol = symbol1; + } + + for (j += prefetch_distance + 1; i < j; i += 1) + { + sa_sint_t p = cache[i].index = SA[i]; sa_sint_t symbol = 0; p &= SAINT_MAX; if (p != 0) { symbol = BUCKETS_INDEX4(T[p - 1], T[p - 2] >= T[p - 1]); } cache[i].symbol = symbol; + } +} + +static void libsais16_partial_sorting_scan_left_to_right_32s_4k_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) + { + libsais16_prefetchw(&SA[i + 2 * 
prefetch_distance]); + + sa_sint_t s0 = SA[i + prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0 & ~SUFFIX_GROUP_MARKER] - 1; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i + prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1 & ~SUFFIX_GROUP_MARKER] - 1; libsais16_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16_prefetch(s1 > 0 ? Ts1 : NULL); + + libsais16_prefetchw(&cache[i + prefetch_distance]); + + sa_sint_t symbol0 = SAINT_MIN, p0 = SA[i + 0]; if (p0 > 0) { cache[i + 0].index = p0; p0 &= ~SUFFIX_GROUP_MARKER; symbol0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] < T[p0 - 1]); p0 = 0; } cache[i + 0].symbol = symbol0; SA[i + 0] = p0 & SAINT_MAX; + sa_sint_t symbol1 = SAINT_MIN, p1 = SA[i + 1]; if (p1 > 0) { cache[i + 1].index = p1; p1 &= ~SUFFIX_GROUP_MARKER; symbol1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] < T[p1 - 1]); p1 = 0; } cache[i + 1].symbol = symbol1; SA[i + 1] = p1 & SAINT_MAX; + } + + for (j += prefetch_distance + 1; i < j; i += 1) + { + sa_sint_t symbol = SAINT_MIN, p = SA[i]; if (p > 0) { cache[i].index = p; p &= ~SUFFIX_GROUP_MARKER; symbol = BUCKETS_INDEX2(T[p - 1], T[p - 2] < T[p - 1]); p = 0; } cache[i].symbol = symbol; SA[i] = p & SAINT_MAX; + } +} + +static void libsais16_partial_sorting_scan_left_to_right_32s_1k_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) + { + libsais16_prefetchw(&SA[i + 2 * prefetch_distance]); + + sa_sint_t s0 = SA[i + prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0] - 1; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i + prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1] - 1; libsais16_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16_prefetch(s1 > 0 ? 
Ts1 : NULL); + + libsais16_prefetchw(&cache[i + prefetch_distance]); + + sa_sint_t symbol0 = SAINT_MIN, p0 = SA[i + 0]; if (p0 > 0) { cache[i + 0].index = (p0 - 1) | ((sa_sint_t)(T[p0 - 2] < T[p0 - 1]) << (SAINT_BIT - 1)); symbol0 = T[p0 - 1]; p0 = 0; } cache[i + 0].symbol = symbol0; SA[i + 0] = p0 & SAINT_MAX; + sa_sint_t symbol1 = SAINT_MIN, p1 = SA[i + 1]; if (p1 > 0) { cache[i + 1].index = (p1 - 1) | ((sa_sint_t)(T[p1 - 2] < T[p1 - 1]) << (SAINT_BIT - 1)); symbol1 = T[p1 - 1]; p1 = 0; } cache[i + 1].symbol = symbol1; SA[i + 1] = p1 & SAINT_MAX; + } + + for (j += prefetch_distance + 1; i < j; i += 1) + { + sa_sint_t symbol = SAINT_MIN, p = SA[i]; if (p > 0) { cache[i].index = (p - 1) | ((sa_sint_t)(T[p - 2] < T[p - 1]) << (SAINT_BIT - 1)); symbol = T[p - 1]; p = 0; } cache[i].symbol = symbol; SA[i] = p & SAINT_MAX; + } +} + +static sa_sint_t libsais16_partial_sorting_scan_left_to_right_32s_6k_block_sort(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j, omp_block_end = omp_block_start + omp_block_size; + for (i = omp_block_start, j = omp_block_end - prefetch_distance - 1; i < j; i += 2) + { + libsais16_prefetchw(&cache[i + 2 * prefetch_distance]); + + libsais16_prefetchw(&buckets[cache[i + prefetch_distance + 0].symbol]); + libsais16_prefetchw(&buckets[cache[i + prefetch_distance + 1].symbol]); + + sa_sint_t v0 = cache[i + 0].symbol, p0 = cache[i + 0].index; d += (p0 < 0); cache[i + 0].symbol = buckets[v0]++; cache[i + 0].index = (p0 - 1) | ((sa_sint_t)(buckets[2 + v0] != d) << (SAINT_BIT - 1)); buckets[2 + v0] = d; + if (cache[i + 0].symbol < omp_block_end) { sa_sint_t s = cache[i + 0].symbol, q = (cache[s].index = cache[i + 0].index) & SAINT_MAX; cache[s].symbol = BUCKETS_INDEX4(T[q - 1], T[q - 2] >= T[q - 1]); } + + sa_sint_t v1 = cache[i + 1].symbol, p1 = cache[i + 1].index; d += (p1 < 0); cache[i + 1].symbol = buckets[v1]++; cache[i + 1].index = (p1 - 1) | ((sa_sint_t)(buckets[2 + v1] != d) << (SAINT_BIT - 1)); buckets[2 + v1] = d; + if (cache[i + 1].symbol < omp_block_end) { sa_sint_t s = cache[i + 1].symbol, q = (cache[s].index = cache[i + 1].index) & SAINT_MAX; cache[s].symbol = BUCKETS_INDEX4(T[q - 1], T[q - 2] >= T[q - 1]); } + } + + for (j += prefetch_distance + 1; i < j; i += 1) + { + sa_sint_t v = cache[i].symbol, p = cache[i].index; d += (p < 0); cache[i].symbol = buckets[v]++; cache[i].index = (p - 1) | ((sa_sint_t)(buckets[2 + v] != d) << (SAINT_BIT - 1)); buckets[2 + v] = d; + if (cache[i].symbol < omp_block_end) { sa_sint_t s = cache[i].symbol, q = (cache[s].index = cache[i].index) & SAINT_MAX; cache[s].symbol = BUCKETS_INDEX4(T[q - 1], T[q - 2] >= T[q - 1]); } + } + + return d; +} + +static sa_sint_t libsais16_partial_sorting_scan_left_to_right_32s_4k_block_sort(const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + sa_sint_t * RESTRICT induction_bucket = &buckets[2 * k]; + sa_sint_t * RESTRICT distinct_names = &buckets[0 * k]; + + fast_sint_t i, j, omp_block_end = omp_block_start + omp_block_size; + for (i = omp_block_start, j = omp_block_end - prefetch_distance - 1; i < j; i += 2) + { + libsais16_prefetchw(&cache[i + 2 * prefetch_distance]); + + sa_sint_t s0 = cache[i + prefetch_distance + 0].symbol; const 
sa_sint_t * Is0 = &induction_bucket[s0 >> 1]; libsais16_prefetchw(s0 >= 0 ? Is0 : NULL); const sa_sint_t * Ds0 = &distinct_names[s0]; libsais16_prefetchw(s0 >= 0 ? Ds0 : NULL); + sa_sint_t s1 = cache[i + prefetch_distance + 1].symbol; const sa_sint_t * Is1 = &induction_bucket[s1 >> 1]; libsais16_prefetchw(s1 >= 0 ? Is1 : NULL); const sa_sint_t * Ds1 = &distinct_names[s1]; libsais16_prefetchw(s1 >= 0 ? Ds1 : NULL); + + sa_sint_t v0 = cache[i + 0].symbol; + if (v0 >= 0) + { + sa_sint_t p0 = cache[i + 0].index; d += (p0 >> (SUFFIX_GROUP_BIT - 1)); cache[i + 0].symbol = induction_bucket[v0 >> 1]++; cache[i + 0].index = (p0 - 1) | (v0 << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v0] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v0] = d; + if (cache[i + 0].symbol < omp_block_end) { sa_sint_t ni = cache[i + 0].symbol, np = cache[i + 0].index; if (np > 0) { cache[ni].index = np; np &= ~SUFFIX_GROUP_MARKER; cache[ni].symbol = BUCKETS_INDEX2(T[np - 1], T[np - 2] < T[np - 1]); np = 0; } cache[i + 0].index = np & SAINT_MAX; } + } + + sa_sint_t v1 = cache[i + 1].symbol; + if (v1 >= 0) + { + sa_sint_t p1 = cache[i + 1].index; d += (p1 >> (SUFFIX_GROUP_BIT - 1)); cache[i + 1].symbol = induction_bucket[v1 >> 1]++; cache[i + 1].index = (p1 - 1) | (v1 << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v1] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v1] = d; + if (cache[i + 1].symbol < omp_block_end) { sa_sint_t ni = cache[i + 1].symbol, np = cache[i + 1].index; if (np > 0) { cache[ni].index = np; np &= ~SUFFIX_GROUP_MARKER; cache[ni].symbol = BUCKETS_INDEX2(T[np - 1], T[np - 2] < T[np - 1]); np = 0; } cache[i + 1].index = np & SAINT_MAX; } + } + } + + for (j += prefetch_distance + 1; i < j; i += 1) + { + sa_sint_t v = cache[i].symbol; + if (v >= 0) + { + sa_sint_t p = cache[i].index; d += (p >> (SUFFIX_GROUP_BIT - 1)); cache[i].symbol = induction_bucket[v >> 1]++; cache[i].index = (p - 1) | (v << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v] = d; + if (cache[i].symbol < omp_block_end) { sa_sint_t ni = cache[i].symbol, np = cache[i].index; if (np > 0) { cache[ni].index = np; np &= ~SUFFIX_GROUP_MARKER; cache[ni].symbol = BUCKETS_INDEX2(T[np - 1], T[np - 2] < T[np - 1]); np = 0; } cache[i].index = np & SAINT_MAX; } + } + } + + return d; +} + +static void libsais16_partial_sorting_scan_left_to_right_32s_1k_block_sort(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j, omp_block_end = omp_block_start + omp_block_size; + for (i = omp_block_start, j = omp_block_end - prefetch_distance - 1; i < j; i += 2) + { + libsais16_prefetchw(&cache[i + 2 * prefetch_distance]); + + sa_sint_t s0 = cache[i + prefetch_distance + 0].symbol; const sa_sint_t * Is0 = &induction_bucket[s0]; libsais16_prefetchw(s0 >= 0 ? Is0 : NULL); + sa_sint_t s1 = cache[i + prefetch_distance + 1].symbol; const sa_sint_t * Is1 = &induction_bucket[s1]; libsais16_prefetchw(s1 >= 0 ? 
Is1 : NULL); + + sa_sint_t v0 = cache[i + 0].symbol; + if (v0 >= 0) + { + cache[i + 0].symbol = induction_bucket[v0]++; + if (cache[i + 0].symbol < omp_block_end) { sa_sint_t ni = cache[i + 0].symbol, np = cache[i + 0].index; if (np > 0) { cache[ni].index = (np - 1) | ((sa_sint_t)(T[np - 2] < T[np - 1]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np - 1]; np = 0; } cache[i + 0].index = np & SAINT_MAX; } + } + + sa_sint_t v1 = cache[i + 1].symbol; + if (v1 >= 0) + { + cache[i + 1].symbol = induction_bucket[v1]++; + if (cache[i + 1].symbol < omp_block_end) { sa_sint_t ni = cache[i + 1].symbol, np = cache[i + 1].index; if (np > 0) { cache[ni].index = (np - 1) | ((sa_sint_t)(T[np - 2] < T[np - 1]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np - 1]; np = 0; } cache[i + 1].index = np & SAINT_MAX; } + } + } + + for (j += prefetch_distance + 1; i < j; i += 1) + { + sa_sint_t v = cache[i].symbol; + if (v >= 0) + { + cache[i].symbol = induction_bucket[v]++; + if (cache[i].symbol < omp_block_end) { sa_sint_t ni = cache[i].symbol, np = cache[i].index; if (np > 0) { cache[ni].index = (np - 1) | ((sa_sint_t)(T[np - 2] < T[np - 1]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np - 1]; np = 0; } cache[i].index = np & SAINT_MAX; } + } + } +} + +static sa_sint_t libsais16_partial_sorting_scan_left_to_right_32s_6k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(cache); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : block_size - omp_block_start; + + omp_block_start += block_start; + + if (omp_num_threads == 1) + { + d = libsais16_partial_sorting_scan_left_to_right_32s_6k(T, SA, buckets, d, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + libsais16_partial_sorting_scan_left_to_right_32s_6k_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size); + } + + #pragma omp barrier + + #pragma omp master + { + d = libsais16_partial_sorting_scan_left_to_right_32s_6k_block_sort(T, buckets, d, cache - block_start, block_start, block_size); + } + + #pragma omp barrier + + { + libsais16_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size); + } + } +#endif + } + + return d; +} + +static sa_sint_t libsais16_partial_sorting_scan_left_to_right_32s_4k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(cache); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start; + + omp_block_start += block_start; + + if (omp_num_threads == 1) + { + d = libsais16_partial_sorting_scan_left_to_right_32s_4k(T, SA, k, buckets, d, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + libsais16_partial_sorting_scan_left_to_right_32s_4k_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size); + } + + #pragma omp barrier + + #pragma omp master + { + d = libsais16_partial_sorting_scan_left_to_right_32s_4k_block_sort(T, k, buckets, d, cache - block_start, block_start, block_size); + } + + #pragma omp barrier + + { + libsais16_compact_and_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size); + } + } +#endif + } + + return d; +} + +static void libsais16_partial_sorting_scan_left_to_right_32s_1k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(cache); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : block_size - omp_block_start; + + omp_block_start += block_start; + + if (omp_num_threads == 1) + { + libsais16_partial_sorting_scan_left_to_right_32s_1k(T, SA, buckets, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + libsais16_partial_sorting_scan_left_to_right_32s_1k_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size); + } + + #pragma omp barrier + + #pragma omp master + { + libsais16_partial_sorting_scan_left_to_right_32s_1k_block_sort(T, buckets, cache - block_start, block_start, block_size); + } + + #pragma omp barrier + + { + libsais16_compact_and_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size); + } + } +#endif + } +} + +#endif + +static sa_sint_t libsais16_partial_sorting_scan_left_to_right_32s_6k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t left_suffixes_count, sa_sint_t d, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + SA[buckets[BUCKETS_INDEX4(T[n - 1], T[n - 2] >= T[n - 1])]++] = (n - 1) | SAINT_MIN; + buckets[2 + BUCKETS_INDEX4(T[n - 1], T[n - 2] >= T[n - 1])] = ++d; + + if (threads == 1 || left_suffixes_count < 65536) + { + d = libsais16_partial_sorting_scan_left_to_right_32s_6k(T, SA, buckets, d, 0, left_suffixes_count); + } +#if defined(_OPENMP) + else + { + fast_sint_t block_start, block_end; + for (block_start = 0; block_start < left_suffixes_count; block_start = block_end) + { + block_end = block_start + (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end > left_suffixes_count) { block_end = left_suffixes_count; } + + d = libsais16_partial_sorting_scan_left_to_right_32s_6k_block_omp(T, SA, buckets, d, thread_state[0].state.cache, block_start, block_end - block_start, threads); + } + } +#else + UNUSED(thread_state); +#endif + + return d; +} + +static sa_sint_t libsais16_partial_sorting_scan_left_to_right_32s_4k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + sa_sint_t * RESTRICT induction_bucket = &buckets[2 * k]; + sa_sint_t * RESTRICT distinct_names = &buckets[0 * k]; + + SA[induction_bucket[T[n - 1]]++] = (n - 1) | ((sa_sint_t)(T[n - 2] < T[n - 1]) << (SAINT_BIT - 1)) | SUFFIX_GROUP_MARKER; + distinct_names[BUCKETS_INDEX2(T[n - 1], T[n - 2] < T[n - 1])] = ++d; + + if (threads == 1 || n < 65536) + { + d = libsais16_partial_sorting_scan_left_to_right_32s_4k(T, SA, k, buckets, d, 0, n); + } +#if defined(_OPENMP) + else + { + fast_sint_t block_start, block_end; + for (block_start = 0; block_start < n; block_start = block_end) + { + block_end = block_start + (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end > n) { block_end = n; } + + d = libsais16_partial_sorting_scan_left_to_right_32s_4k_block_omp(T, SA, k, buckets, d, thread_state[0].state.cache, block_start, block_end - block_start, threads); + } + } +#else + UNUSED(thread_state); +#endif + + return d; +} + +static void libsais16_partial_sorting_scan_left_to_right_32s_1k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + SA[buckets[T[n - 1]]++] = (n - 1) | ((sa_sint_t)(T[n - 2] < T[n - 1]) << (SAINT_BIT - 1)); + + if (threads == 1 || n < 65536) + { + libsais16_partial_sorting_scan_left_to_right_32s_1k(T, SA, buckets, 0, n); + } +#if 
defined(_OPENMP) + else + { + fast_sint_t block_start, block_end; + for (block_start = 0; block_start < n; block_start = block_end) + { + block_end = block_start + (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end > n) { block_end = n; } + + libsais16_partial_sorting_scan_left_to_right_32s_1k_block_omp(T, SA, buckets, thread_state[0].state.cache, block_start, block_end - block_start, threads); + } + } +#else + UNUSED(thread_state); +#endif +} + +static void libsais16_partial_sorting_shift_markers_16u_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, const sa_sint_t * RESTRICT buckets, sa_sint_t threads) +{ + const fast_sint_t prefetch_distance = 32; + + const sa_sint_t * RESTRICT temp_bucket = &buckets[4 * ALPHABET_SIZE]; + + fast_sint_t c; + +#if defined(_OPENMP) + #pragma omp parallel for schedule(static, 1) num_threads(threads) if(threads > 1 && n >= 65536) +#else + UNUSED(threads); UNUSED(n); +#endif + for (c = BUCKETS_INDEX2(ALPHABET_SIZE - 1, 0); c >= BUCKETS_INDEX2(1, 0); c -= BUCKETS_INDEX2(1, 0)) + { + fast_sint_t i, j; sa_sint_t s = SAINT_MIN; + for (i = (fast_sint_t)temp_bucket[c] - 1, j = (fast_sint_t)buckets[c - BUCKETS_INDEX2(1, 0)] + 3; i >= j; i -= 4) + { + libsais16_prefetchw(&SA[i - prefetch_distance]); + + sa_sint_t p0 = SA[i - 0], q0 = (p0 & SAINT_MIN) ^ s; s = s ^ q0; SA[i - 0] = p0 ^ q0; + sa_sint_t p1 = SA[i - 1], q1 = (p1 & SAINT_MIN) ^ s; s = s ^ q1; SA[i - 1] = p1 ^ q1; + sa_sint_t p2 = SA[i - 2], q2 = (p2 & SAINT_MIN) ^ s; s = s ^ q2; SA[i - 2] = p2 ^ q2; + sa_sint_t p3 = SA[i - 3], q3 = (p3 & SAINT_MIN) ^ s; s = s ^ q3; SA[i - 3] = p3 ^ q3; + } + + for (j -= 3; i >= j; i -= 1) + { + sa_sint_t p = SA[i], q = (p & SAINT_MIN) ^ s; s = s ^ q; SA[i] = p ^ q; + } + } +} + +static void libsais16_partial_sorting_shift_markers_32s_6k_omp(sa_sint_t * RESTRICT SA, sa_sint_t k, const sa_sint_t * RESTRICT buckets, sa_sint_t threads) +{ + const fast_sint_t prefetch_distance = 32; + + const sa_sint_t * RESTRICT temp_bucket = &buckets[4 * k]; + + fast_sint_t c; + +#if defined(_OPENMP) + #pragma omp parallel for schedule(static, 1) num_threads(threads) if(threads > 1 && k >= 65536) +#else + UNUSED(threads); +#endif + for (c = (fast_sint_t)k - 1; c >= 1; c -= 1) + { + fast_sint_t i, j; sa_sint_t s = SAINT_MIN; + for (i = (fast_sint_t)buckets[BUCKETS_INDEX4(c, 0)] - 1, j = (fast_sint_t)temp_bucket[BUCKETS_INDEX2(c - 1, 0)] + 3; i >= j; i -= 4) + { + libsais16_prefetchw(&SA[i - prefetch_distance]); + + sa_sint_t p0 = SA[i - 0], q0 = (p0 & SAINT_MIN) ^ s; s = s ^ q0; SA[i - 0] = p0 ^ q0; + sa_sint_t p1 = SA[i - 1], q1 = (p1 & SAINT_MIN) ^ s; s = s ^ q1; SA[i - 1] = p1 ^ q1; + sa_sint_t p2 = SA[i - 2], q2 = (p2 & SAINT_MIN) ^ s; s = s ^ q2; SA[i - 2] = p2 ^ q2; + sa_sint_t p3 = SA[i - 3], q3 = (p3 & SAINT_MIN) ^ s; s = s ^ q3; SA[i - 3] = p3 ^ q3; + } + + for (j -= 3; i >= j; i -= 1) + { + sa_sint_t p = SA[i], q = (p & SAINT_MIN) ^ s; s = s ^ q; SA[i] = p ^ q; + } + } +} + +static void libsais16_partial_sorting_shift_markers_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t n) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i; sa_sint_t s = SUFFIX_GROUP_MARKER; + for (i = (fast_sint_t)n - 1; i >= 3; i -= 4) + { + libsais16_prefetchw(&SA[i - prefetch_distance]); + + sa_sint_t p0 = SA[i - 0], q0 = ((p0 & SUFFIX_GROUP_MARKER) ^ s) & ((sa_sint_t)(p0 > 0) << ((SUFFIX_GROUP_BIT - 1))); s = s ^ q0; SA[i - 0] = p0 ^ q0; + sa_sint_t p1 = SA[i - 1], q1 = ((p1 & SUFFIX_GROUP_MARKER) ^ s) & ((sa_sint_t)(p1 > 0) << ((SUFFIX_GROUP_BIT - 1))); s = s ^ q1; SA[i - 1] = p1 ^ q1; + 
sa_sint_t p2 = SA[i - 2], q2 = ((p2 & SUFFIX_GROUP_MARKER) ^ s) & ((sa_sint_t)(p2 > 0) << ((SUFFIX_GROUP_BIT - 1))); s = s ^ q2; SA[i - 2] = p2 ^ q2; + sa_sint_t p3 = SA[i - 3], q3 = ((p3 & SUFFIX_GROUP_MARKER) ^ s) & ((sa_sint_t)(p3 > 0) << ((SUFFIX_GROUP_BIT - 1))); s = s ^ q3; SA[i - 3] = p3 ^ q3; + } + + for (; i >= 0; i -= 1) + { + sa_sint_t p = SA[i], q = ((p & SUFFIX_GROUP_MARKER) ^ s) & ((sa_sint_t)(p > 0) << ((SUFFIX_GROUP_BIT - 1))); s = s ^ q; SA[i] = p ^ q; + } +} + +static void libsais16_partial_sorting_shift_buckets_32s_6k(sa_sint_t k, sa_sint_t * RESTRICT buckets) +{ + sa_sint_t * RESTRICT temp_bucket = &buckets[4 * k]; + + fast_sint_t i; + for (i = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX2(1, 0)) + { + buckets[2 * i + BUCKETS_INDEX4(0, 0)] = temp_bucket[i + BUCKETS_INDEX2(0, 0)]; + buckets[2 * i + BUCKETS_INDEX4(0, 1)] = temp_bucket[i + BUCKETS_INDEX2(0, 1)]; + } +} + +static sa_sint_t libsais16_partial_sorting_scan_right_to_left_16u(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE]; + sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE]; + + fast_sint_t i, j; + for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2) + { + libsais16_prefetch(&SA[i - 2 * prefetch_distance]); + + libsais16_prefetch(&T[SA[i - prefetch_distance - 0] & SAINT_MAX] - 1); + libsais16_prefetch(&T[SA[i - prefetch_distance - 0] & SAINT_MAX] - 2); + libsais16_prefetch(&T[SA[i - prefetch_distance - 1] & SAINT_MAX] - 1); + libsais16_prefetch(&T[SA[i - prefetch_distance - 1] & SAINT_MAX] - 2); + + sa_sint_t p0 = SA[i - 0]; d += (p0 < 0); p0 &= SAINT_MAX; sa_sint_t v0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] > T[p0 - 1]); + SA[--induction_bucket[v0]] = (p0 - 1) | ((sa_sint_t)(distinct_names[v0] != d) << (SAINT_BIT - 1)); distinct_names[v0] = d; + + sa_sint_t p1 = SA[i - 1]; d += (p1 < 0); p1 &= SAINT_MAX; sa_sint_t v1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] > T[p1 - 1]); + SA[--induction_bucket[v1]] = (p1 - 1) | ((sa_sint_t)(distinct_names[v1] != d) << (SAINT_BIT - 1)); distinct_names[v1] = d; + } + + for (j -= prefetch_distance + 1; i >= j; i -= 1) + { + sa_sint_t p = SA[i]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]); + SA[--induction_bucket[v]] = (p - 1) | ((sa_sint_t)(distinct_names[v] != d) << (SAINT_BIT - 1)); distinct_names[v] = d; + } + + return d; +} + +#if defined(_OPENMP) + +static void libsais16_partial_sorting_scan_right_to_left_16u_block_prepare(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size, LIBSAIS_THREAD_STATE * RESTRICT state) +{ + const fast_sint_t prefetch_distance = 32; + + sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE]; + sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE]; + + memset(buckets, 0, 4 * ALPHABET_SIZE * sizeof(sa_sint_t)); + + fast_sint_t i, j, count = 0; sa_sint_t d = 1; + for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2) + { + libsais16_prefetch(&SA[i - 2 * prefetch_distance]); + + libsais16_prefetch(&T[SA[i - prefetch_distance - 0] & SAINT_MAX] - 1); + libsais16_prefetch(&T[SA[i - 
prefetch_distance - 0] & SAINT_MAX] - 2); + libsais16_prefetch(&T[SA[i - prefetch_distance - 1] & SAINT_MAX] - 1); + libsais16_prefetch(&T[SA[i - prefetch_distance - 1] & SAINT_MAX] - 2); + + sa_sint_t p0 = cache[count].index = SA[i - 0]; d += (p0 < 0); p0 &= SAINT_MAX; sa_sint_t v0 = cache[count++].symbol = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] > T[p0 - 1]); induction_bucket[v0]++; distinct_names[v0] = d; + sa_sint_t p1 = cache[count].index = SA[i - 1]; d += (p1 < 0); p1 &= SAINT_MAX; sa_sint_t v1 = cache[count++].symbol = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] > T[p1 - 1]); induction_bucket[v1]++; distinct_names[v1] = d; + } + + for (j -= prefetch_distance + 1; i >= j; i -= 1) + { + sa_sint_t p = cache[count].index = SA[i]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = cache[count++].symbol = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]); induction_bucket[v]++; distinct_names[v] = d; + } + + state[0].state.position = (fast_sint_t)d - 1; + state[0].state.count = count; +} + +static void libsais16_partial_sorting_scan_right_to_left_16u_block_place(sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count, sa_sint_t d) +{ + const fast_sint_t prefetch_distance = 32; + + sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE]; + sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE]; + + fast_sint_t i, j; + for (i = 0, j = count - 1; i < j; i += 2) + { + libsais16_prefetch(&cache[i + prefetch_distance]); + + sa_sint_t p0 = cache[i + 0].index; d += (p0 < 0); sa_sint_t v0 = cache[i + 0].symbol; + SA[--induction_bucket[v0]] = (p0 - 1) | ((sa_sint_t)(distinct_names[v0] != d) << (SAINT_BIT - 1)); distinct_names[v0] = d; + + sa_sint_t p1 = cache[i + 1].index; d += (p1 < 0); sa_sint_t v1 = cache[i + 1].symbol; + SA[--induction_bucket[v1]] = (p1 - 1) | ((sa_sint_t)(distinct_names[v1] != d) << (SAINT_BIT - 1)); distinct_names[v1] = d; + } + + for (j += 1; i < j; i += 1) + { + sa_sint_t p = cache[i].index; d += (p < 0); sa_sint_t v = cache[i].symbol; + SA[--induction_bucket[v]] = (p - 1) | ((sa_sint_t)(distinct_names[v] != d) << (SAINT_BIT - 1)); distinct_names[v] = d; + } +} + +static sa_sint_t libsais16_partial_sorting_scan_right_to_left_16u_block_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * ALPHABET_SIZE && omp_get_dynamic() == 0) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(thread_state); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : block_size - omp_block_start; + + omp_block_start += block_start; + + if (omp_num_threads == 1) + { + d = libsais16_partial_sorting_scan_right_to_left_16u(T, SA, buckets, d, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + libsais16_partial_sorting_scan_right_to_left_16u_block_prepare(T, SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size, &thread_state[omp_thread_num]); + } + + #pragma omp barrier + + #pragma omp master + { + sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE]; + sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE]; + + fast_sint_t t; + for (t = omp_num_threads - 1; t >= 0; --t) + { + sa_sint_t * RESTRICT temp_induction_bucket = &thread_state[t].state.buckets[0 * ALPHABET_SIZE]; + sa_sint_t * RESTRICT temp_distinct_names = &thread_state[t].state.buckets[2 * ALPHABET_SIZE]; + + fast_sint_t c; + for (c = 0; c < 2 * ALPHABET_SIZE; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_induction_bucket[c]; induction_bucket[c] = A - B; temp_induction_bucket[c] = A; } + + for (d -= 1, c = 0; c < 2 * ALPHABET_SIZE; c += 1) { sa_sint_t A = distinct_names[c], B = temp_distinct_names[c], D = B + d; distinct_names[c] = B > 0 ? D : A; temp_distinct_names[c] = A; } + d += 1 + (sa_sint_t)thread_state[t].state.position; thread_state[t].state.position = (fast_sint_t)d - thread_state[t].state.position; + } + } + + #pragma omp barrier + + { + libsais16_partial_sorting_scan_right_to_left_16u_block_place(SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count, (sa_sint_t)thread_state[omp_thread_num].state.position); + } + } +#endif + } + + return d; +} + +#endif + +static void libsais16_partial_sorting_scan_right_to_left_16u_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count, sa_sint_t d, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + fast_sint_t scan_start = (fast_sint_t)left_suffixes_count + 1; + fast_sint_t scan_end = (fast_sint_t)n - (fast_sint_t)first_lms_suffix; + + if (threads == 1 || (scan_end - scan_start) < 65536) + { + libsais16_partial_sorting_scan_right_to_left_16u(T, SA, buckets, d, scan_start, scan_end - scan_start); + } +#if defined(_OPENMP) + else + { + sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE]; + sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE]; + + fast_sint_t block_start; + for (block_start = scan_end - 1; block_start >= scan_start; ) + { + if (SA[block_start] == 0) + { + block_start--; + } + else + { + fast_sint_t block_max_end = block_start - ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads); if (block_max_end < scan_start) { block_max_end = scan_start - 1; } + fast_sint_t block_end = block_start - 1; while (block_end > block_max_end && SA[block_end] != 0) { block_end--; } + fast_sint_t block_size = block_start - block_end; + + if (block_size < 32) + { + for (; block_start > block_end; block_start -= 1) + { + sa_sint_t p = SA[block_start]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]); + SA[--induction_bucket[v]] = (p - 1) | ((sa_sint_t)(distinct_names[v] != d) << (SAINT_BIT - 1)); distinct_names[v] = d; + } + } + else + { + d = libsais16_partial_sorting_scan_right_to_left_16u_block_omp(T, SA, buckets, d, 
block_end + 1, block_size, threads, thread_state); + block_start = block_end; + } + } + } + } +#else + UNUSED(thread_state); +#endif +} + +static sa_sint_t libsais16_partial_sorting_scan_right_to_left_32s_6k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 1; i >= j; i -= 2) + { + libsais16_prefetch(&SA[i - 3 * prefetch_distance]); + + libsais16_prefetch(&T[SA[i - 2 * prefetch_distance - 0] & SAINT_MAX] - 1); + libsais16_prefetch(&T[SA[i - 2 * prefetch_distance - 0] & SAINT_MAX] - 2); + libsais16_prefetch(&T[SA[i - 2 * prefetch_distance - 1] & SAINT_MAX] - 1); + libsais16_prefetch(&T[SA[i - 2 * prefetch_distance - 1] & SAINT_MAX] - 2); + + sa_sint_t p0 = SA[i - prefetch_distance - 0] & SAINT_MAX; sa_sint_t v0 = BUCKETS_INDEX4(T[p0 - (p0 > 0)], 0); libsais16_prefetchw(&buckets[v0]); + sa_sint_t p1 = SA[i - prefetch_distance - 1] & SAINT_MAX; sa_sint_t v1 = BUCKETS_INDEX4(T[p1 - (p1 > 0)], 0); libsais16_prefetchw(&buckets[v1]); + + sa_sint_t p2 = SA[i - 0]; d += (p2 < 0); p2 &= SAINT_MAX; sa_sint_t v2 = BUCKETS_INDEX4(T[p2 - 1], T[p2 - 2] > T[p2 - 1]); + SA[--buckets[v2]] = (p2 - 1) | ((sa_sint_t)(buckets[2 + v2] != d) << (SAINT_BIT - 1)); buckets[2 + v2] = d; + + sa_sint_t p3 = SA[i - 1]; d += (p3 < 0); p3 &= SAINT_MAX; sa_sint_t v3 = BUCKETS_INDEX4(T[p3 - 1], T[p3 - 2] > T[p3 - 1]); + SA[--buckets[v3]] = (p3 - 1) | ((sa_sint_t)(buckets[2 + v3] != d) << (SAINT_BIT - 1)); buckets[2 + v3] = d; + } + + for (j -= 2 * prefetch_distance + 1; i >= j; i -= 1) + { + sa_sint_t p = SA[i]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = BUCKETS_INDEX4(T[p - 1], T[p - 2] > T[p - 1]); + SA[--buckets[v]] = (p - 1) | ((sa_sint_t)(buckets[2 + v] != d) << (SAINT_BIT - 1)); buckets[2 + v] = d; + } + + return d; +} + +static sa_sint_t libsais16_partial_sorting_scan_right_to_left_32s_4k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + sa_sint_t * RESTRICT induction_bucket = &buckets[3 * k]; + sa_sint_t * RESTRICT distinct_names = &buckets[0 * k]; + + fast_sint_t i, j; + for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 1; i >= j; i -= 2) + { + libsais16_prefetchw(&SA[i - 3 * prefetch_distance]); + + sa_sint_t s0 = SA[i - 2 * prefetch_distance - 0]; const sa_sint_t * Ts0 = &T[s0 & ~SUFFIX_GROUP_MARKER] - 1; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i - 2 * prefetch_distance - 1]; const sa_sint_t * Ts1 = &T[s1 & ~SUFFIX_GROUP_MARKER] - 1; libsais16_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16_prefetch(s1 > 0 ? 
Ts1 : NULL); + sa_sint_t s2 = SA[i - 1 * prefetch_distance - 0]; if (s2 > 0) { const fast_sint_t Ts2 = T[(s2 & ~SUFFIX_GROUP_MARKER) - 1]; libsais16_prefetchw(&induction_bucket[Ts2]); libsais16_prefetchw(&distinct_names[BUCKETS_INDEX2(Ts2, 0)]); } + sa_sint_t s3 = SA[i - 1 * prefetch_distance - 1]; if (s3 > 0) { const fast_sint_t Ts3 = T[(s3 & ~SUFFIX_GROUP_MARKER) - 1]; libsais16_prefetchw(&induction_bucket[Ts3]); libsais16_prefetchw(&distinct_names[BUCKETS_INDEX2(Ts3, 0)]); } + + sa_sint_t p0 = SA[i - 0]; + if (p0 > 0) + { + SA[i - 0] = 0; d += (p0 >> (SUFFIX_GROUP_BIT - 1)); p0 &= ~SUFFIX_GROUP_MARKER; sa_sint_t v0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] > T[p0 - 1]); + SA[--induction_bucket[T[p0 - 1]]] = (p0 - 1) | ((sa_sint_t)(T[p0 - 2] > T[p0 - 1]) << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v0] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v0] = d; + } + + sa_sint_t p1 = SA[i - 1]; + if (p1 > 0) + { + SA[i - 1] = 0; d += (p1 >> (SUFFIX_GROUP_BIT - 1)); p1 &= ~SUFFIX_GROUP_MARKER; sa_sint_t v1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] > T[p1 - 1]); + SA[--induction_bucket[T[p1 - 1]]] = (p1 - 1) | ((sa_sint_t)(T[p1 - 2] > T[p1 - 1]) << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v1] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v1] = d; + } + } + + for (j -= 2 * prefetch_distance + 1; i >= j; i -= 1) + { + sa_sint_t p = SA[i]; + if (p > 0) + { + SA[i] = 0; d += (p >> (SUFFIX_GROUP_BIT - 1)); p &= ~SUFFIX_GROUP_MARKER; sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]); + SA[--induction_bucket[T[p - 1]]] = (p - 1) | ((sa_sint_t)(T[p - 2] > T[p - 1]) << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v] = d; + } + } + + return d; +} + +static void libsais16_partial_sorting_scan_right_to_left_32s_1k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 1; i >= j; i -= 2) + { + libsais16_prefetchw(&SA[i - 3 * prefetch_distance]); + + sa_sint_t s0 = SA[i - 2 * prefetch_distance - 0]; const sa_sint_t * Ts0 = &T[s0] - 1; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i - 2 * prefetch_distance - 1]; const sa_sint_t * Ts1 = &T[s1] - 1; libsais16_prefetch(s1 > 0 ? 
Ts1 : NULL); + sa_sint_t s2 = SA[i - 1 * prefetch_distance - 0]; if (s2 > 0) { libsais16_prefetchw(&induction_bucket[T[s2 - 1]]); libsais16_prefetch(&T[s2] - 2); } + sa_sint_t s3 = SA[i - 1 * prefetch_distance - 1]; if (s3 > 0) { libsais16_prefetchw(&induction_bucket[T[s3 - 1]]); libsais16_prefetch(&T[s3] - 2); } + + sa_sint_t p0 = SA[i - 0]; if (p0 > 0) { SA[i - 0] = 0; SA[--induction_bucket[T[p0 - 1]]] = (p0 - 1) | ((sa_sint_t)(T[p0 - 2] > T[p0 - 1]) << (SAINT_BIT - 1)); } + sa_sint_t p1 = SA[i - 1]; if (p1 > 0) { SA[i - 1] = 0; SA[--induction_bucket[T[p1 - 1]]] = (p1 - 1) | ((sa_sint_t)(T[p1 - 2] > T[p1 - 1]) << (SAINT_BIT - 1)); } + } + + for (j -= 2 * prefetch_distance + 1; i >= j; i -= 1) + { + sa_sint_t p = SA[i]; if (p > 0) { SA[i] = 0; SA[--induction_bucket[T[p - 1]]] = (p - 1) | ((sa_sint_t)(T[p - 2] > T[p - 1]) << (SAINT_BIT - 1)); } + } +} + +#if defined(_OPENMP) + +static void libsais16_partial_sorting_scan_right_to_left_32s_6k_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) + { + libsais16_prefetch(&SA[i + 2 * prefetch_distance]); + + libsais16_prefetch(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 1); + libsais16_prefetch(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 2); + libsais16_prefetch(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 1); + libsais16_prefetch(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 2); + + libsais16_prefetchw(&cache[i + prefetch_distance]); + + sa_sint_t p0 = cache[i + 0].index = SA[i + 0]; sa_sint_t symbol0 = 0; p0 &= SAINT_MAX; if (p0 != 0) { symbol0 = BUCKETS_INDEX4(T[p0 - 1], T[p0 - 2] > T[p0 - 1]); } cache[i + 0].symbol = symbol0; + sa_sint_t p1 = cache[i + 1].index = SA[i + 1]; sa_sint_t symbol1 = 0; p1 &= SAINT_MAX; if (p1 != 0) { symbol1 = BUCKETS_INDEX4(T[p1 - 1], T[p1 - 2] > T[p1 - 1]); } cache[i + 1].symbol = symbol1; + } + + for (j += prefetch_distance + 1; i < j; i += 1) + { + sa_sint_t p = cache[i].index = SA[i]; sa_sint_t symbol = 0; p &= SAINT_MAX; if (p != 0) { symbol = BUCKETS_INDEX4(T[p - 1], T[p - 2] > T[p - 1]); } cache[i].symbol = symbol; + } +} + +static void libsais16_partial_sorting_scan_right_to_left_32s_4k_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) + { + libsais16_prefetchw(&SA[i + 2 * prefetch_distance]); + + sa_sint_t s0 = SA[i + prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0 & ~SUFFIX_GROUP_MARKER] - 1; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i + prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1 & ~SUFFIX_GROUP_MARKER] - 1; libsais16_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16_prefetch(s1 > 0 ? 
Ts1 : NULL); + + libsais16_prefetchw(&cache[i + prefetch_distance]); + + sa_sint_t symbol0 = SAINT_MIN, p0 = SA[i + 0]; if (p0 > 0) { SA[i + 0] = 0; cache[i + 0].index = p0; p0 &= ~SUFFIX_GROUP_MARKER; symbol0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] > T[p0 - 1]); } cache[i + 0].symbol = symbol0; + sa_sint_t symbol1 = SAINT_MIN, p1 = SA[i + 1]; if (p1 > 0) { SA[i + 1] = 0; cache[i + 1].index = p1; p1 &= ~SUFFIX_GROUP_MARKER; symbol1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] > T[p1 - 1]); } cache[i + 1].symbol = symbol1; + } + + for (j += prefetch_distance + 1; i < j; i += 1) + { + sa_sint_t symbol = SAINT_MIN, p = SA[i]; if (p > 0) { SA[i] = 0; cache[i].index = p; p &= ~SUFFIX_GROUP_MARKER; symbol = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]); } cache[i].symbol = symbol; + } +} + +static void libsais16_partial_sorting_scan_right_to_left_32s_1k_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) + { + libsais16_prefetchw(&SA[i + 2 * prefetch_distance]); + + sa_sint_t s0 = SA[i + prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0] - 1; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i + prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1] - 1; libsais16_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16_prefetch(s1 > 0 ? Ts1 : NULL); + + libsais16_prefetchw(&cache[i + prefetch_distance]); + + sa_sint_t symbol0 = SAINT_MIN, p0 = SA[i + 0]; if (p0 > 0) { SA[i + 0] = 0; cache[i + 0].index = (p0 - 1) | ((sa_sint_t)(T[p0 - 2] > T[p0 - 1]) << (SAINT_BIT - 1)); symbol0 = T[p0 - 1]; } cache[i + 0].symbol = symbol0; + sa_sint_t symbol1 = SAINT_MIN, p1 = SA[i + 1]; if (p1 > 0) { SA[i + 1] = 0; cache[i + 1].index = (p1 - 1) | ((sa_sint_t)(T[p1 - 2] > T[p1 - 1]) << (SAINT_BIT - 1)); symbol1 = T[p1 - 1]; } cache[i + 1].symbol = symbol1; + } + + for (j += prefetch_distance + 1; i < j; i += 1) + { + sa_sint_t symbol = SAINT_MIN, p = SA[i]; if (p > 0) { SA[i] = 0; cache[i].index = (p - 1) | ((sa_sint_t)(T[p - 2] > T[p - 1]) << (SAINT_BIT - 1)); symbol = T[p - 1]; } cache[i].symbol = symbol; + } +} + +static sa_sint_t libsais16_partial_sorting_scan_right_to_left_32s_6k_block_sort(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2) + { + libsais16_prefetchw(&cache[i - 2 * prefetch_distance]); + + libsais16_prefetchw(&buckets[cache[i - prefetch_distance - 0].symbol]); + libsais16_prefetchw(&buckets[cache[i - prefetch_distance - 1].symbol]); + + sa_sint_t v0 = cache[i - 0].symbol, p0 = cache[i - 0].index; d += (p0 < 0); cache[i - 0].symbol = --buckets[v0]; cache[i - 0].index = (p0 - 1) | ((sa_sint_t)(buckets[2 + v0] != d) << (SAINT_BIT - 1)); buckets[2 + v0] = d; + if (cache[i - 0].symbol >= omp_block_start) { sa_sint_t s = cache[i - 0].symbol, q = (cache[s].index = cache[i - 0].index) & SAINT_MAX; cache[s].symbol = BUCKETS_INDEX4(T[q - 1], T[q - 2] > T[q - 1]); } + + sa_sint_t v1 = cache[i - 1].symbol, p1 = cache[i - 1].index; d += (p1 < 0); cache[i - 1].symbol = --buckets[v1]; 
cache[i - 1].index = (p1 - 1) | ((sa_sint_t)(buckets[2 + v1] != d) << (SAINT_BIT - 1)); buckets[2 + v1] = d; + if (cache[i - 1].symbol >= omp_block_start) { sa_sint_t s = cache[i - 1].symbol, q = (cache[s].index = cache[i - 1].index) & SAINT_MAX; cache[s].symbol = BUCKETS_INDEX4(T[q - 1], T[q - 2] > T[q - 1]); } + } + + for (j -= prefetch_distance + 1; i >= j; i -= 1) + { + sa_sint_t v = cache[i].symbol, p = cache[i].index; d += (p < 0); cache[i].symbol = --buckets[v]; cache[i].index = (p - 1) | ((sa_sint_t)(buckets[2 + v] != d) << (SAINT_BIT - 1)); buckets[2 + v] = d; + if (cache[i].symbol >= omp_block_start) { sa_sint_t s = cache[i].symbol, q = (cache[s].index = cache[i].index) & SAINT_MAX; cache[s].symbol = BUCKETS_INDEX4(T[q - 1], T[q - 2] > T[q - 1]); } + } + + return d; +} + +static sa_sint_t libsais16_partial_sorting_scan_right_to_left_32s_4k_block_sort(const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + sa_sint_t * RESTRICT induction_bucket = &buckets[3 * k]; + sa_sint_t * RESTRICT distinct_names = &buckets[0 * k]; + + fast_sint_t i, j; + for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2) + { + libsais16_prefetchw(&cache[i - 2 * prefetch_distance]); + + sa_sint_t s0 = cache[i - prefetch_distance - 0].symbol; const sa_sint_t * Is0 = &induction_bucket[s0 >> 1]; libsais16_prefetchw(s0 >= 0 ? Is0 : NULL); const sa_sint_t * Ds0 = &distinct_names[s0]; libsais16_prefetchw(s0 >= 0 ? Ds0 : NULL); + sa_sint_t s1 = cache[i - prefetch_distance - 1].symbol; const sa_sint_t * Is1 = &induction_bucket[s1 >> 1]; libsais16_prefetchw(s1 >= 0 ? Is1 : NULL); const sa_sint_t * Ds1 = &distinct_names[s1]; libsais16_prefetchw(s1 >= 0 ? 
Ds1 : NULL); + + sa_sint_t v0 = cache[i - 0].symbol; + if (v0 >= 0) + { + sa_sint_t p0 = cache[i - 0].index; d += (p0 >> (SUFFIX_GROUP_BIT - 1)); cache[i - 0].symbol = --induction_bucket[v0 >> 1]; cache[i - 0].index = (p0 - 1) | (v0 << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v0] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v0] = d; + if (cache[i - 0].symbol >= omp_block_start) { sa_sint_t ni = cache[i - 0].symbol, np = cache[i - 0].index; if (np > 0) { cache[i - 0].index = 0; cache[ni].index = np; np &= ~SUFFIX_GROUP_MARKER; cache[ni].symbol = BUCKETS_INDEX2(T[np - 1], T[np - 2] > T[np - 1]); } } + } + + sa_sint_t v1 = cache[i - 1].symbol; + if (v1 >= 0) + { + sa_sint_t p1 = cache[i - 1].index; d += (p1 >> (SUFFIX_GROUP_BIT - 1)); cache[i - 1].symbol = --induction_bucket[v1 >> 1]; cache[i - 1].index = (p1 - 1) | (v1 << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v1] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v1] = d; + if (cache[i - 1].symbol >= omp_block_start) { sa_sint_t ni = cache[i - 1].symbol, np = cache[i - 1].index; if (np > 0) { cache[i - 1].index = 0; cache[ni].index = np; np &= ~SUFFIX_GROUP_MARKER; cache[ni].symbol = BUCKETS_INDEX2(T[np - 1], T[np - 2] > T[np - 1]); } } + } + } + + for (j -= prefetch_distance + 1; i >= j; i -= 1) + { + sa_sint_t v = cache[i].symbol; + if (v >= 0) + { + sa_sint_t p = cache[i].index; d += (p >> (SUFFIX_GROUP_BIT - 1)); cache[i].symbol = --induction_bucket[v >> 1]; cache[i].index = (p - 1) | (v << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v] = d; + if (cache[i].symbol >= omp_block_start) { sa_sint_t ni = cache[i].symbol, np = cache[i].index; if (np > 0) { cache[i].index = 0; cache[ni].index = np; np &= ~SUFFIX_GROUP_MARKER; cache[ni].symbol = BUCKETS_INDEX2(T[np - 1], T[np - 2] > T[np - 1]); } } + } + } + + return d; +} + +static void libsais16_partial_sorting_scan_right_to_left_32s_1k_block_sort(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2) + { + libsais16_prefetchw(&cache[i - 2 * prefetch_distance]); + + sa_sint_t s0 = cache[i - prefetch_distance - 0].symbol; const sa_sint_t * Is0 = &induction_bucket[s0]; libsais16_prefetchw(s0 >= 0 ? Is0 : NULL); + sa_sint_t s1 = cache[i - prefetch_distance - 1].symbol; const sa_sint_t * Is1 = &induction_bucket[s1]; libsais16_prefetchw(s1 >= 0 ? 
Is1 : NULL); + + sa_sint_t v0 = cache[i - 0].symbol; + if (v0 >= 0) + { + cache[i - 0].symbol = --induction_bucket[v0]; + if (cache[i - 0].symbol >= omp_block_start) { sa_sint_t ni = cache[i - 0].symbol, np = cache[i - 0].index; if (np > 0) { cache[i - 0].index = 0; cache[ni].index = (np - 1) | ((sa_sint_t)(T[np - 2] > T[np - 1]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np - 1]; } } + } + + sa_sint_t v1 = cache[i - 1].symbol; + if (v1 >= 0) + { + cache[i - 1].symbol = --induction_bucket[v1]; + if (cache[i - 1].symbol >= omp_block_start) { sa_sint_t ni = cache[i - 1].symbol, np = cache[i - 1].index; if (np > 0) { cache[i - 1].index = 0; cache[ni].index = (np - 1) | ((sa_sint_t)(T[np - 2] > T[np - 1]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np - 1]; }} + } + } + + for (j -= prefetch_distance + 1; i >= j; i -= 1) + { + sa_sint_t v = cache[i].symbol; + if (v >= 0) + { + cache[i].symbol = --induction_bucket[v]; + if (cache[i].symbol >= omp_block_start) { sa_sint_t ni = cache[i].symbol, np = cache[i].index; if (np > 0) { cache[i].index = 0; cache[ni].index = (np - 1) | ((sa_sint_t)(T[np - 2] > T[np - 1]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np - 1]; } } + } + } +} + +static sa_sint_t libsais16_partial_sorting_scan_right_to_left_32s_6k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(cache); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : block_size - omp_block_start; + + omp_block_start += block_start; + + if (omp_num_threads == 1) + { + d = libsais16_partial_sorting_scan_right_to_left_32s_6k(T, SA, buckets, d, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + libsais16_partial_sorting_scan_right_to_left_32s_6k_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size); + } + + #pragma omp barrier + + #pragma omp master + { + d = libsais16_partial_sorting_scan_right_to_left_32s_6k_block_sort(T, buckets, d, cache - block_start, block_start, block_size); + } + + #pragma omp barrier + + { + libsais16_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size); + } + } +#endif + } + + return d; +} + +static sa_sint_t libsais16_partial_sorting_scan_right_to_left_32s_4k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(cache); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start; + + omp_block_start += block_start; + + if (omp_num_threads == 1) + { + d = libsais16_partial_sorting_scan_right_to_left_32s_4k(T, SA, k, buckets, d, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + libsais16_partial_sorting_scan_right_to_left_32s_4k_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size); + } + + #pragma omp barrier + + #pragma omp master + { + d = libsais16_partial_sorting_scan_right_to_left_32s_4k_block_sort(T, k, buckets, d, cache - block_start, block_start, block_size); + } + + #pragma omp barrier + + { + libsais16_compact_and_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size); + } + } +#endif + } + + return d; +} + +static void libsais16_partial_sorting_scan_right_to_left_32s_1k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(cache); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : block_size - omp_block_start; + + omp_block_start += block_start; + + if (omp_num_threads == 1) + { + libsais16_partial_sorting_scan_right_to_left_32s_1k(T, SA, buckets, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + libsais16_partial_sorting_scan_right_to_left_32s_1k_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size); + } + + #pragma omp barrier + + #pragma omp master + { + libsais16_partial_sorting_scan_right_to_left_32s_1k_block_sort(T, buckets, cache - block_start, block_start, block_size); + } + + #pragma omp barrier + + { + libsais16_compact_and_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size); + } + } +#endif + } +} + +#endif + +static sa_sint_t libsais16_partial_sorting_scan_right_to_left_32s_6k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count, sa_sint_t d, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + fast_sint_t scan_start = (fast_sint_t)left_suffixes_count + 1; + fast_sint_t scan_end = (fast_sint_t)n - (fast_sint_t)first_lms_suffix; + + if (threads == 1 || (scan_end - scan_start) < 65536) + { + d = libsais16_partial_sorting_scan_right_to_left_32s_6k(T, SA, buckets, d, scan_start, scan_end - scan_start); + } +#if defined(_OPENMP) + else + { + fast_sint_t block_start, block_end; + for (block_start = scan_end - 1; block_start >= scan_start; block_start = block_end) + { + block_end = block_start - (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end < scan_start) { block_end = scan_start - 1; } + + d = libsais16_partial_sorting_scan_right_to_left_32s_6k_block_omp(T, SA, buckets, d, thread_state[0].state.cache, block_end + 1, block_start - block_end, threads); + } + } +#else + UNUSED(thread_state); +#endif + + return d; +} + +static sa_sint_t libsais16_partial_sorting_scan_right_to_left_32s_4k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + if (threads == 1 || n < 65536) + { + d = libsais16_partial_sorting_scan_right_to_left_32s_4k(T, SA, k, buckets, d, 0, n); + } +#if defined(_OPENMP) + else + { + fast_sint_t block_start, block_end; + for (block_start = (fast_sint_t)n - 1; block_start >= 0; block_start = block_end) + { + block_end = block_start - (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end < 0) { block_end = -1; } + + d = libsais16_partial_sorting_scan_right_to_left_32s_4k_block_omp(T, SA, k, buckets, d, thread_state[0].state.cache, block_end + 1, block_start - block_end, threads); + } + } +#else + UNUSED(thread_state); +#endif + + return d; +} + +static void libsais16_partial_sorting_scan_right_to_left_32s_1k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + if (threads == 1 || n < 65536) + { + libsais16_partial_sorting_scan_right_to_left_32s_1k(T, SA, buckets, 0, n); + } +#if defined(_OPENMP) + else + { + fast_sint_t block_start, block_end; + for (block_start = (fast_sint_t)n - 1; block_start >= 0; block_start = block_end) + { + block_end = block_start - (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end < 0) { block_end = -1; } + + libsais16_partial_sorting_scan_right_to_left_32s_1k_block_omp(T, SA, buckets, 
thread_state[0].state.cache, block_end + 1, block_start - block_end, threads); + } + } +#else + UNUSED(thread_state); +#endif +} + +static fast_sint_t libsais16_partial_sorting_gather_lms_suffixes_32s_4k(sa_sint_t * RESTRICT SA, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j, l; + for (i = omp_block_start, j = omp_block_start + omp_block_size - 3, l = omp_block_start; i < j; i += 4) + { + libsais16_prefetch(&SA[i + prefetch_distance]); + + sa_sint_t s0 = SA[i + 0]; SA[l] = (s0 - SUFFIX_GROUP_MARKER) & (~SUFFIX_GROUP_MARKER); l += (s0 < 0); + sa_sint_t s1 = SA[i + 1]; SA[l] = (s1 - SUFFIX_GROUP_MARKER) & (~SUFFIX_GROUP_MARKER); l += (s1 < 0); + sa_sint_t s2 = SA[i + 2]; SA[l] = (s2 - SUFFIX_GROUP_MARKER) & (~SUFFIX_GROUP_MARKER); l += (s2 < 0); + sa_sint_t s3 = SA[i + 3]; SA[l] = (s3 - SUFFIX_GROUP_MARKER) & (~SUFFIX_GROUP_MARKER); l += (s3 < 0); + } + + for (j += 3; i < j; i += 1) + { + sa_sint_t s = SA[i]; SA[l] = (s - SUFFIX_GROUP_MARKER) & (~SUFFIX_GROUP_MARKER); l += (s < 0); + } + + return l; +} + +static fast_sint_t libsais16_partial_sorting_gather_lms_suffixes_32s_1k(sa_sint_t * RESTRICT SA, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j, l; + for (i = omp_block_start, j = omp_block_start + omp_block_size - 3, l = omp_block_start; i < j; i += 4) + { + libsais16_prefetch(&SA[i + prefetch_distance]); + + sa_sint_t s0 = SA[i + 0]; SA[l] = s0 & SAINT_MAX; l += (s0 < 0); + sa_sint_t s1 = SA[i + 1]; SA[l] = s1 & SAINT_MAX; l += (s1 < 0); + sa_sint_t s2 = SA[i + 2]; SA[l] = s2 & SAINT_MAX; l += (s2 < 0); + sa_sint_t s3 = SA[i + 3]; SA[l] = s3 & SAINT_MAX; l += (s3 < 0); + } + + for (j += 3; i < j; i += 1) + { + sa_sint_t s = SA[i]; SA[l] = s & SAINT_MAX; l += (s < 0); + } + + return l; +} + +static void libsais16_partial_sorting_gather_lms_suffixes_32s_4k_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(thread_state); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : n - omp_block_start; + + if (omp_num_threads == 1) + { + libsais16_partial_sorting_gather_lms_suffixes_32s_4k(SA, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + thread_state[omp_thread_num].state.position = omp_block_start; + thread_state[omp_thread_num].state.count = libsais16_partial_sorting_gather_lms_suffixes_32s_4k(SA, omp_block_start, omp_block_size) - omp_block_start; + } + + #pragma omp barrier + + #pragma omp master + { + fast_sint_t t, position = 0; + for (t = 0; t < omp_num_threads; ++t) + { + if (t > 0 && thread_state[t].state.count > 0) + { + memmove(&SA[position], &SA[thread_state[t].state.position], (size_t)thread_state[t].state.count * sizeof(sa_sint_t)); + } + + position += thread_state[t].state.count; + } + } + } +#endif + } +} + +static void libsais16_partial_sorting_gather_lms_suffixes_32s_1k_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(thread_state); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start; + + if (omp_num_threads == 1) + { + libsais16_partial_sorting_gather_lms_suffixes_32s_1k(SA, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + thread_state[omp_thread_num].state.position = omp_block_start; + thread_state[omp_thread_num].state.count = libsais16_partial_sorting_gather_lms_suffixes_32s_1k(SA, omp_block_start, omp_block_size) - omp_block_start; + } + + #pragma omp barrier + + #pragma omp master + { + fast_sint_t t, position = 0; + for (t = 0; t < omp_num_threads; ++t) + { + if (t > 0 && thread_state[t].state.count > 0) + { + memmove(&SA[position], &SA[thread_state[t].state.position], (size_t)thread_state[t].state.count * sizeof(sa_sint_t)); + } + + position += thread_state[t].state.count; + } + } + } +#endif + } +} + +static void libsais16_induce_partial_order_16u_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + memset(&buckets[2 * ALPHABET_SIZE], 0, 2 * ALPHABET_SIZE * sizeof(sa_sint_t)); + + sa_sint_t d = libsais16_partial_sorting_scan_left_to_right_16u_omp(T, SA, n, buckets, left_suffixes_count, 0, threads, thread_state); + libsais16_partial_sorting_shift_markers_16u_omp(SA, n, buckets, threads); + libsais16_partial_sorting_scan_right_to_left_16u_omp(T, SA, n, buckets, first_lms_suffix, left_suffixes_count, d, threads, thread_state); +} + +static void libsais16_induce_partial_order_32s_6k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + sa_sint_t d = libsais16_partial_sorting_scan_left_to_right_32s_6k_omp(T, SA, n, buckets, left_suffixes_count, 0, threads, thread_state); + 
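+    /* note (added comment): induction runs in two passes - the left-to-right scan above places L-type suffixes, then, after the marker/bucket shifts below, the right-to-left scan places S-type suffixes; 'd' carries the running distinct-name counter between the passes. */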
libsais16_partial_sorting_shift_markers_32s_6k_omp(SA, k, buckets, threads); + libsais16_partial_sorting_shift_buckets_32s_6k(k, buckets); + libsais16_partial_sorting_scan_right_to_left_32s_6k_omp(T, SA, n, buckets, first_lms_suffix, left_suffixes_count, d, threads, thread_state); +} + +static void libsais16_induce_partial_order_32s_4k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + memset(buckets, 0, 2 * (size_t)k * sizeof(sa_sint_t)); + + sa_sint_t d = libsais16_partial_sorting_scan_left_to_right_32s_4k_omp(T, SA, n, k, buckets, 0, threads, thread_state); + libsais16_partial_sorting_shift_markers_32s_4k(SA, n); + libsais16_partial_sorting_scan_right_to_left_32s_4k_omp(T, SA, n, k, buckets, d, threads, thread_state); + libsais16_partial_sorting_gather_lms_suffixes_32s_4k_omp(SA, n, threads, thread_state); +} + +static void libsais16_induce_partial_order_32s_2k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + libsais16_partial_sorting_scan_left_to_right_32s_1k_omp(T, SA, n, &buckets[1 * k], threads, thread_state); + libsais16_partial_sorting_scan_right_to_left_32s_1k_omp(T, SA, n, &buckets[0 * k], threads, thread_state); + libsais16_partial_sorting_gather_lms_suffixes_32s_1k_omp(SA, n, threads, thread_state); +} + +static void libsais16_induce_partial_order_32s_1k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + libsais16_count_suffixes_32s(T, n, k, buckets); + libsais16_initialize_buckets_start_32s_1k(k, buckets); + libsais16_partial_sorting_scan_left_to_right_32s_1k_omp(T, SA, n, buckets, threads, thread_state); + + libsais16_count_suffixes_32s(T, n, k, buckets); + libsais16_initialize_buckets_end_32s_1k(k, buckets); + libsais16_partial_sorting_scan_right_to_left_32s_1k_omp(T, SA, n, buckets, threads, thread_state); + + libsais16_partial_sorting_gather_lms_suffixes_32s_1k_omp(SA, n, threads, thread_state); +} + +static sa_sint_t libsais16_renumber_lms_suffixes_16u(sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t name, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + sa_sint_t * RESTRICT SAm = &SA[m]; + + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4) + { + libsais16_prefetch(&SA[i + 2 * prefetch_distance]); + + libsais16_prefetchw(&SAm[(SA[i + prefetch_distance + 0] & SAINT_MAX) >> 1]); + libsais16_prefetchw(&SAm[(SA[i + prefetch_distance + 1] & SAINT_MAX) >> 1]); + libsais16_prefetchw(&SAm[(SA[i + prefetch_distance + 2] & SAINT_MAX) >> 1]); + libsais16_prefetchw(&SAm[(SA[i + prefetch_distance + 3] & SAINT_MAX) >> 1]); + + sa_sint_t p0 = SA[i + 0]; SAm[(p0 & SAINT_MAX) >> 1] = name | SAINT_MIN; name += p0 < 0; + sa_sint_t p1 = SA[i + 1]; SAm[(p1 & SAINT_MAX) >> 1] = name | SAINT_MIN; name += p1 < 0; + sa_sint_t p2 = SA[i + 2]; SAm[(p2 & SAINT_MAX) >> 1] = name | SAINT_MIN; name += p2 < 0; + sa_sint_t p3 = SA[i + 3]; SAm[(p3 & SAINT_MAX) >> 1] = name | SAINT_MIN; name += p3 < 0; + } + + for (j += prefetch_distance + 3; i < j; i += 1) + { + sa_sint_t p = SA[i]; SAm[(p & SAINT_MAX) >> 1] = name | SAINT_MIN; name += p < 0; + } + + return name; +} + +static 
fast_sint_t libsais16_gather_marked_suffixes_16u(sa_sint_t * RESTRICT SA, sa_sint_t m, fast_sint_t l, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + l -= 1; + + fast_sint_t i, j; + for (i = (fast_sint_t)m + omp_block_start + omp_block_size - 1, j = (fast_sint_t)m + omp_block_start + 3; i >= j; i -= 4) + { + libsais16_prefetch(&SA[i - prefetch_distance]); + + sa_sint_t s0 = SA[i - 0]; SA[l] = s0 & SAINT_MAX; l -= s0 < 0; + sa_sint_t s1 = SA[i - 1]; SA[l] = s1 & SAINT_MAX; l -= s1 < 0; + sa_sint_t s2 = SA[i - 2]; SA[l] = s2 & SAINT_MAX; l -= s2 < 0; + sa_sint_t s3 = SA[i - 3]; SA[l] = s3 & SAINT_MAX; l -= s3 < 0; + } + + for (j -= 3; i >= j; i -= 1) + { + sa_sint_t s = SA[i]; SA[l] = s & SAINT_MAX; l -= s < 0; + } + + l += 1; + + return l; +} + +static sa_sint_t libsais16_renumber_lms_suffixes_16u_omp(sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + sa_sint_t name = 0; + +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && m >= 65536) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(thread_state); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (m / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : m - omp_block_start; + + if (omp_num_threads == 1) + { + name = libsais16_renumber_lms_suffixes_16u(SA, m, 0, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + thread_state[omp_thread_num].state.count = libsais16_count_negative_marked_suffixes(SA, omp_block_start, omp_block_size); + } + + #pragma omp barrier + + { + fast_sint_t t, count = 0; for (t = 0; t < omp_thread_num; ++t) { count += thread_state[t].state.count; } + + if (omp_thread_num == omp_num_threads - 1) + { + name = (sa_sint_t)(count + thread_state[omp_thread_num].state.count); + } + + libsais16_renumber_lms_suffixes_16u(SA, m, (sa_sint_t)count, omp_block_start, omp_block_size); + } + } +#endif + } + + return name; +} + +static void libsais16_gather_marked_lms_suffixes_16u_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 131072) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(thread_state); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (((fast_sint_t)n >> 1) / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : ((fast_sint_t)n >> 1) - omp_block_start; + + if (omp_num_threads == 1) + { + libsais16_gather_marked_suffixes_16u(SA, m, (fast_sint_t)n + (fast_sint_t)fs, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + if (omp_thread_num < omp_num_threads - 1) + { + thread_state[omp_thread_num].state.position = libsais16_gather_marked_suffixes_16u(SA, m, (fast_sint_t)m + omp_block_start + omp_block_size, omp_block_start, omp_block_size); + thread_state[omp_thread_num].state.count = (fast_sint_t)m + omp_block_start + omp_block_size - thread_state[omp_thread_num].state.position; + } + else + { + thread_state[omp_thread_num].state.position = libsais16_gather_marked_suffixes_16u(SA, m, (fast_sint_t)n + (fast_sint_t)fs, omp_block_start, omp_block_size); + thread_state[omp_thread_num].state.count = (fast_sint_t)n + (fast_sint_t)fs - thread_state[omp_thread_num].state.position; + } + } + + #pragma omp barrier + + #pragma omp master + { + fast_sint_t t, position = (fast_sint_t)n + (fast_sint_t)fs; + + for (t = omp_num_threads - 1; t >= 0; --t) + { + position -= thread_state[t].state.count; + if (t != omp_num_threads - 1 && thread_state[t].state.count > 0) + { + memmove(&SA[position], &SA[thread_state[t].state.position], (size_t)thread_state[t].state.count * sizeof(sa_sint_t)); + } + } + } + } +#endif + } +} + +static sa_sint_t libsais16_renumber_and_gather_lms_suffixes_16u_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + memset(&SA[m], 0, ((size_t)n >> 1) * sizeof(sa_sint_t)); + + sa_sint_t name = libsais16_renumber_lms_suffixes_16u_omp(SA, m, threads, thread_state); + if (name < m) + { + libsais16_gather_marked_lms_suffixes_16u_omp(SA, n, m, fs, threads, thread_state); + } + else + { + fast_sint_t i; for (i = 0; i < m; i += 1) { SA[i] &= SAINT_MAX; } + } + + return name; +} + +static sa_sint_t libsais16_renumber_distinct_lms_suffixes_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t name, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + sa_sint_t * RESTRICT SAm = &SA[m]; + + fast_sint_t i, j; sa_sint_t p0, p1, p2, p3 = 0; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4) + { + libsais16_prefetchw(&SA[i + 2 * prefetch_distance]); + + libsais16_prefetchw(&SAm[(SA[i + prefetch_distance + 0] & SAINT_MAX) >> 1]); + libsais16_prefetchw(&SAm[(SA[i + prefetch_distance + 1] & SAINT_MAX) >> 1]); + libsais16_prefetchw(&SAm[(SA[i + prefetch_distance + 2] & SAINT_MAX) >> 1]); + libsais16_prefetchw(&SAm[(SA[i + prefetch_distance + 3] & SAINT_MAX) >> 1]); + + p0 = SA[i + 0]; SAm[(SA[i + 0] = p0 & SAINT_MAX) >> 1] = name | (p0 & p3 & SAINT_MIN); name += p0 < 0; + p1 = SA[i + 1]; SAm[(SA[i + 1] = p1 & SAINT_MAX) >> 1] = name | (p1 & p0 & SAINT_MIN); name += p1 < 0; + p2 = SA[i + 2]; SAm[(SA[i + 2] = p2 & SAINT_MAX) >> 1] = name | (p2 & p1 & SAINT_MIN); name += p2 < 0; + p3 = SA[i + 3]; SAm[(SA[i + 3] = p3 & SAINT_MAX) >> 1] = name | (p3 & p2 & SAINT_MIN); name += p3 < 0; + } + + for (j += prefetch_distance + 3; i < j; i += 1) + { + p2 = p3; p3 = SA[i]; SAm[(SA[i] = p3 & SAINT_MAX) >> 1] = name | (p3 & p2 & SAINT_MIN); name += p3 < 0; + } + + return name; +} + +static void libsais16_mark_distinct_lms_suffixes_32s(sa_sint_t * RESTRICT SA, sa_sint_t m, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; 
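+    /* note (added comment): the carry p3 remembers the last non-zero entry read; an entry keeps its sign-bit marker only when that carry is itself negative, and zero entries simply forward the carry unchanged. */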
sa_sint_t p0, p1, p2, p3 = 0; + for (i = (fast_sint_t)m + omp_block_start, j = (fast_sint_t)m + omp_block_start + omp_block_size - 3; i < j; i += 4) + { + libsais16_prefetchw(&SA[i + prefetch_distance]); + + p0 = SA[i + 0]; SA[i + 0] = p0 & (p3 | SAINT_MAX); p0 = (p0 == 0) ? p3 : p0; + p1 = SA[i + 1]; SA[i + 1] = p1 & (p0 | SAINT_MAX); p1 = (p1 == 0) ? p0 : p1; + p2 = SA[i + 2]; SA[i + 2] = p2 & (p1 | SAINT_MAX); p2 = (p2 == 0) ? p1 : p2; + p3 = SA[i + 3]; SA[i + 3] = p3 & (p2 | SAINT_MAX); p3 = (p3 == 0) ? p2 : p3; + } + + for (j += 3; i < j; i += 1) + { + p2 = p3; p3 = SA[i]; SA[i] = p3 & (p2 | SAINT_MAX); p3 = (p3 == 0) ? p2 : p3; + } +} + +static void libsais16_clamp_lms_suffixes_length_32s(sa_sint_t * RESTRICT SA, sa_sint_t m, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + sa_sint_t * RESTRICT SAm = &SA[m]; + + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size - 3; i < j; i += 4) + { + libsais16_prefetchw(&SAm[i + prefetch_distance]); + + SAm[i + 0] = (SAm[i + 0] < 0 ? SAm[i + 0] : 0) & SAINT_MAX; + SAm[i + 1] = (SAm[i + 1] < 0 ? SAm[i + 1] : 0) & SAINT_MAX; + SAm[i + 2] = (SAm[i + 2] < 0 ? SAm[i + 2] : 0) & SAINT_MAX; + SAm[i + 3] = (SAm[i + 3] < 0 ? SAm[i + 3] : 0) & SAINT_MAX; + } + + for (j += 3; i < j; i += 1) + { + SAm[i] = (SAm[i] < 0 ? SAm[i] : 0) & SAINT_MAX; + } +} + +static sa_sint_t libsais16_renumber_distinct_lms_suffixes_32s_4k_omp(sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + sa_sint_t name = 0; + +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && m >= 65536) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(thread_state); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (m / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : m - omp_block_start; + + if (omp_num_threads == 1) + { + name = libsais16_renumber_distinct_lms_suffixes_32s_4k(SA, m, 1, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + thread_state[omp_thread_num].state.count = libsais16_count_negative_marked_suffixes(SA, omp_block_start, omp_block_size); + } + + #pragma omp barrier + + { + fast_sint_t t, count = 1; for (t = 0; t < omp_thread_num; ++t) { count += thread_state[t].state.count; } + + if (omp_thread_num == omp_num_threads - 1) + { + name = (sa_sint_t)(count + thread_state[omp_thread_num].state.count); + } + + libsais16_renumber_distinct_lms_suffixes_32s_4k(SA, m, (sa_sint_t)count, omp_block_start, omp_block_size); + } + } +#endif + } + + return name - 1; +} + +static void libsais16_mark_distinct_lms_suffixes_32s_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t threads) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 131072) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); + fast_sint_t omp_block_stride = (((fast_sint_t)n >> 1) / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : ((fast_sint_t)n >> 1) - omp_block_start; +#else + UNUSED(threads); + + fast_sint_t omp_block_start = 0; + fast_sint_t omp_block_size = (fast_sint_t)n >> 1; +#endif + libsais16_mark_distinct_lms_suffixes_32s(SA, m, omp_block_start, omp_block_size); + } +} + +static void libsais16_clamp_lms_suffixes_length_32s_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t threads) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 131072) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); + fast_sint_t omp_block_stride = (((fast_sint_t)n >> 1) / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : ((fast_sint_t)n >> 1) - omp_block_start; +#else + UNUSED(threads); + + fast_sint_t omp_block_start = 0; + fast_sint_t omp_block_size = (fast_sint_t)n >> 1; +#endif + libsais16_clamp_lms_suffixes_length_32s(SA, m, omp_block_start, omp_block_size); + } +} + +static sa_sint_t libsais16_renumber_and_mark_distinct_lms_suffixes_32s_4k_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + memset(&SA[m], 0, ((size_t)n >> 1) * sizeof(sa_sint_t)); + + sa_sint_t name = libsais16_renumber_distinct_lms_suffixes_32s_4k_omp(SA, m, threads, thread_state); + if (name < m) + { + libsais16_mark_distinct_lms_suffixes_32s_omp(SA, n, m, threads); + } + + return name; +} + +static sa_sint_t libsais16_renumber_and_mark_distinct_lms_suffixes_32s_1k_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t threads) +{ + const fast_sint_t prefetch_distance = 32; + + sa_sint_t * RESTRICT SAm = &SA[m]; + + { + libsais16_gather_lms_suffixes_32s(T, SA, n); + + memset(&SA[m], 0, ((size_t)n - (size_t)m - (size_t)m) * sizeof(sa_sint_t)); + + fast_sint_t i, j; + for (i = (fast_sint_t)n - (fast_sint_t)m, j = (fast_sint_t)n - 1 - prefetch_distance - 3; i < j; i += 4) + { + libsais16_prefetch(&SA[i + 2 * prefetch_distance]); + + libsais16_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 0]) >> 1]); + libsais16_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 1]) >> 1]); + libsais16_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 2]) >> 1]); + libsais16_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 3]) >> 1]); + + SAm[((sa_uint_t)SA[i + 0]) >> 1] = SA[i + 1] - SA[i + 0] + 1 + SAINT_MIN; + SAm[((sa_uint_t)SA[i + 1]) >> 1] = SA[i + 2] - SA[i + 1] + 1 + SAINT_MIN; + SAm[((sa_uint_t)SA[i + 2]) >> 1] = SA[i + 3] - SA[i + 2] + 1 + SAINT_MIN; + SAm[((sa_uint_t)SA[i + 3]) >> 1] = SA[i + 4] - SA[i + 3] + 1 + SAINT_MIN; + } + + for (j += prefetch_distance + 3; i < j; i += 1) + { + SAm[((sa_uint_t)SA[i]) >> 1] = SA[i + 1] - SA[i] + 1 + SAINT_MIN; + } + + SAm[((sa_uint_t)SA[n - 1]) >> 1] = 1 + SAINT_MIN; + } + + { + libsais16_clamp_lms_suffixes_length_32s_omp(SA, n, m, threads); + } + + sa_sint_t name = 1; + + { + fast_sint_t i, j, p = SA[0], plen = SAm[p >> 1]; sa_sint_t pdiff = SAINT_MIN; + for (i = 1, j = m - prefetch_distance - 1; i < j; i += 2) + { + libsais16_prefetch(&SA[i + 2 * prefetch_distance]); + + libsais16_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 0]) >> 1]); libsais16_prefetch(&T[((sa_uint_t)SA[i + prefetch_distance + 0])]); + libsais16_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 1]) >> 1]); 
libsais16_prefetch(&T[((sa_uint_t)SA[i + prefetch_distance + 1])]); + + fast_sint_t q = SA[i + 0], qlen = SAm[q >> 1]; sa_sint_t qdiff = SAINT_MIN; + if (plen == qlen) { fast_sint_t l = 0; do { if (T[p + l] != T[q + l]) { break; } } while (++l < qlen); qdiff = (sa_sint_t)(l - qlen) & SAINT_MIN; } + SAm[p >> 1] = name | (pdiff & qdiff); name += (qdiff < 0); + + p = SA[i + 1]; plen = SAm[p >> 1]; pdiff = SAINT_MIN; + if (qlen == plen) { fast_sint_t l = 0; do { if (T[q + l] != T[p + l]) { break; } } while (++l < plen); pdiff = (sa_sint_t)(l - plen) & SAINT_MIN; } + SAm[q >> 1] = name | (qdiff & pdiff); name += (pdiff < 0); + } + + for (j += prefetch_distance + 1; i < j; i += 1) + { + fast_sint_t q = SA[i], qlen = SAm[q >> 1]; sa_sint_t qdiff = SAINT_MIN; + if (plen == qlen) { fast_sint_t l = 0; do { if (T[p + l] != T[q + l]) { break; } } while (++l < plen); qdiff = (sa_sint_t)(l - plen) & SAINT_MIN; } + SAm[p >> 1] = name | (pdiff & qdiff); name += (qdiff < 0); + + p = q; plen = qlen; pdiff = qdiff; + } + + SAm[p >> 1] = name | pdiff; name++; + } + + if (name <= m) + { + libsais16_mark_distinct_lms_suffixes_32s_omp(SA, n, m, threads); + } + + return name - 1; +} + +static void libsais16_reconstruct_lms_suffixes(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + const sa_sint_t * RESTRICT SAnm = &SA[n - m]; + + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4) + { + libsais16_prefetchw(&SA[i + 2 * prefetch_distance]); + + libsais16_prefetch(&SAnm[SA[i + prefetch_distance + 0]]); + libsais16_prefetch(&SAnm[SA[i + prefetch_distance + 1]]); + libsais16_prefetch(&SAnm[SA[i + prefetch_distance + 2]]); + libsais16_prefetch(&SAnm[SA[i + prefetch_distance + 3]]); + + SA[i + 0] = SAnm[SA[i + 0]]; + SA[i + 1] = SAnm[SA[i + 1]]; + SA[i + 2] = SAnm[SA[i + 2]]; + SA[i + 3] = SAnm[SA[i + 3]]; + } + + for (j += prefetch_distance + 3; i < j; i += 1) + { + SA[i] = SAnm[SA[i]]; + } +} + +static void libsais16_reconstruct_lms_suffixes_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t threads) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && m >= 65536) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); + fast_sint_t omp_block_stride = (m / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : m - omp_block_start; +#else + UNUSED(threads); + + fast_sint_t omp_block_start = 0; + fast_sint_t omp_block_size = m; +#endif + + libsais16_reconstruct_lms_suffixes(SA, n, m, omp_block_start, omp_block_size); + } +} + +static void libsais16_place_lms_suffixes_interval_16u(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, const sa_sint_t * RESTRICT buckets) +{ + const sa_sint_t * RESTRICT bucket_end = &buckets[7 * ALPHABET_SIZE]; + + fast_sint_t c, j = n; + for (c = ALPHABET_SIZE - 2; c >= 0; --c) + { + fast_sint_t l = (fast_sint_t)buckets[BUCKETS_INDEX2(c, 1) + BUCKETS_INDEX2(1, 0)] - (fast_sint_t)buckets[BUCKETS_INDEX2(c, 1)]; + if (l > 0) + { + fast_sint_t i = bucket_end[c]; + if (j - i > 0) + { + memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t)); + } + + memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t)); + } + } + + memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t)); +} + +static void libsais16_place_lms_suffixes_interval_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m, const sa_sint_t * RESTRICT buckets) +{ + const sa_sint_t * RESTRICT bucket_end = &buckets[3 * k]; + + fast_sint_t c, j = n; + for (c = (fast_sint_t)k - 2; c >= 0; --c) + { + fast_sint_t l = (fast_sint_t)buckets[BUCKETS_INDEX2(c, 1) + BUCKETS_INDEX2(1, 0)] - (fast_sint_t)buckets[BUCKETS_INDEX2(c, 1)]; + if (l > 0) + { + fast_sint_t i = bucket_end[c]; + if (j - i > 0) + { + memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t)); + } + + memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t)); + } + } + + memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t)); +} + +static void libsais16_place_lms_suffixes_interval_32s_2k(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m, const sa_sint_t * RESTRICT buckets) +{ + fast_sint_t j = n; + + if (k > 1) + { + fast_sint_t c; + for (c = BUCKETS_INDEX2((fast_sint_t)k - 2, 0); c >= BUCKETS_INDEX2(0, 0); c -= BUCKETS_INDEX2(1, 0)) + { + fast_sint_t l = (fast_sint_t)buckets[c + BUCKETS_INDEX2(1, 1)] - (fast_sint_t)buckets[c + BUCKETS_INDEX2(0, 1)]; + if (l > 0) + { + fast_sint_t i = buckets[c]; + if (j - i > 0) + { + memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t)); + } + + memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t)); + } + } + } + + memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t)); +} + +static void libsais16_place_lms_suffixes_interval_32s_1k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t m, sa_sint_t * RESTRICT buckets) +{ + const fast_sint_t prefetch_distance = 32; + + sa_sint_t c = k - 1; fast_sint_t i, l = buckets[c]; + for (i = (fast_sint_t)m - 1; i >= prefetch_distance + 3; i -= 4) + { + libsais16_prefetch(&SA[i - 2 * prefetch_distance]); + + libsais16_prefetch(&T[SA[i - prefetch_distance - 0]]); + libsais16_prefetch(&T[SA[i - prefetch_distance - 1]]); + libsais16_prefetch(&T[SA[i - prefetch_distance - 2]]); + libsais16_prefetch(&T[SA[i - prefetch_distance - 3]]); + + sa_sint_t p0 = SA[i - 0]; if (T[p0] != c) { c = T[p0]; memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(sa_sint_t)); l = buckets[c]; } SA[--l] = p0; + sa_sint_t p1 = SA[i - 1]; if (T[p1] != c) { c = T[p1]; memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(sa_sint_t)); l = buckets[c]; } SA[--l] = p1; + sa_sint_t p2 = SA[i - 2]; if (T[p2] != c) { c = T[p2]; memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(sa_sint_t)); l = buckets[c]; } SA[--l] = p2; + sa_sint_t p3 = SA[i - 3]; if (T[p3] != c) { c = T[p3]; 
memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(sa_sint_t)); l = buckets[c]; } SA[--l] = p3; + } + + for (; i >= 0; i -= 1) + { + sa_sint_t p = SA[i]; if (T[p] != c) { c = T[p]; memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(sa_sint_t)); l = buckets[c]; } SA[--l] = p; + } + + memset(&SA[0], 0, (size_t)l * sizeof(sa_sint_t)); +} + +static void libsais16_place_lms_suffixes_histogram_32s_6k(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m, const sa_sint_t * RESTRICT buckets) +{ + const sa_sint_t * RESTRICT bucket_end = &buckets[5 * k]; + + fast_sint_t c, j = n; + for (c = (fast_sint_t)k - 2; c >= 0; --c) + { + fast_sint_t l = (fast_sint_t)buckets[BUCKETS_INDEX4(c, 1)]; + if (l > 0) + { + fast_sint_t i = bucket_end[c]; + if (j - i > 0) + { + memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t)); + } + + memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t)); + } + } + + memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t)); +} + +static void libsais16_place_lms_suffixes_histogram_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m, const sa_sint_t * RESTRICT buckets) +{ + const sa_sint_t * RESTRICT bucket_end = &buckets[3 * k]; + + fast_sint_t c, j = n; + for (c = (fast_sint_t)k - 2; c >= 0; --c) + { + fast_sint_t l = (fast_sint_t)buckets[BUCKETS_INDEX2(c, 1)]; + if (l > 0) + { + fast_sint_t i = bucket_end[c]; + if (j - i > 0) + { + memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t)); + } + + memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t)); + } + } + + memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t)); +} + +static void libsais16_place_lms_suffixes_histogram_32s_2k(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m, const sa_sint_t * RESTRICT buckets) +{ + fast_sint_t j = n; + + if (k > 1) + { + fast_sint_t c; + for (c = BUCKETS_INDEX2((fast_sint_t)k - 2, 0); c >= BUCKETS_INDEX2(0, 0); c -= BUCKETS_INDEX2(1, 0)) + { + fast_sint_t l = (fast_sint_t)buckets[c + BUCKETS_INDEX2(0, 1)]; + if (l > 0) + { + fast_sint_t i = buckets[c]; + if (j - i > 0) + { + memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t)); + } + + memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t)); + } + } + } + + memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t)); +} + +static void libsais16_final_bwt_scan_left_to_right_16u(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) + { + libsais16_prefetchw(&SA[i + 2 * prefetch_distance]); + + sa_sint_t s0 = SA[i + prefetch_distance + 0]; const uint16_t * Ts0 = &T[s0] - 1; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i + prefetch_distance + 1]; const uint16_t * Ts1 = &T[s1] - 1; libsais16_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16_prefetch(s1 > 0 ? 
Ts1 : NULL); + + sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; SA[i + 0] = T[p0] | SAINT_MIN; SA[induction_bucket[T[p0]]++] = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1)); } + sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; SA[i + 1] = T[p1] | SAINT_MIN; SA[induction_bucket[T[p1]]++] = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1)); } + } + + for (j += prefetch_distance + 1; i < j; i += 1) + { + sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; SA[i] = T[p] | SAINT_MIN; SA[induction_bucket[T[p]]++] = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); } + } +} + +static void libsais16_final_bwt_aux_scan_left_to_right_16u(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) + { + libsais16_prefetchw(&SA[i + 2 * prefetch_distance]); + + sa_sint_t s0 = SA[i + prefetch_distance + 0]; const uint16_t * Ts0 = &T[s0] - 1; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i + prefetch_distance + 1]; const uint16_t * Ts1 = &T[s1] - 1; libsais16_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16_prefetch(s1 > 0 ? Ts1 : NULL); + + sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; SA[i + 0] = T[p0] | SAINT_MIN; SA[induction_bucket[T[p0]]++] = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1)); if ((p0 & rm) == 0) { I[p0 / (rm + 1)] = induction_bucket[T[p0]]; }} + sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; SA[i + 1] = T[p1] | SAINT_MIN; SA[induction_bucket[T[p1]]++] = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1)); if ((p1 & rm) == 0) { I[p1 / (rm + 1)] = induction_bucket[T[p1]]; }} + } + + for (j += prefetch_distance + 1; i < j; i += 1) + { + sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; SA[i] = T[p] | SAINT_MIN; SA[induction_bucket[T[p]]++] = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); if ((p & rm) == 0) { I[p / (rm + 1)] = induction_bucket[T[p]]; } } + } +} + +static void libsais16_final_sorting_scan_left_to_right_16u(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) + { + libsais16_prefetchw(&SA[i + 2 * prefetch_distance]); + + sa_sint_t s0 = SA[i + prefetch_distance + 0]; const uint16_t * Ts0 = &T[s0] - 1; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i + prefetch_distance + 1]; const uint16_t * Ts1 = &T[s1] - 1; libsais16_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16_prefetch(s1 > 0 ? 
Ts1 : NULL); + + sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 ^ SAINT_MIN; if (p0 > 0) { p0--; SA[induction_bucket[T[p0]]++] = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1)); } + sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 ^ SAINT_MIN; if (p1 > 0) { p1--; SA[induction_bucket[T[p1]]++] = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1)); } + } + + for (j += prefetch_distance + 1; i < j; i += 1) + { + sa_sint_t p = SA[i]; SA[i] = p ^ SAINT_MIN; if (p > 0) { p--; SA[induction_bucket[T[p]]++] = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); } + } +} + +static void libsais16_final_sorting_scan_left_to_right_32s(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1; i < j; i += 2) + { + libsais16_prefetchw(&SA[i + 3 * prefetch_distance]); + + sa_sint_t s0 = SA[i + 2 * prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0] - 1; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i + 2 * prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1] - 1; libsais16_prefetch(s1 > 0 ? Ts1 : NULL); + sa_sint_t s2 = SA[i + 1 * prefetch_distance + 0]; if (s2 > 0) { libsais16_prefetchw(&induction_bucket[T[s2 - 1]]); libsais16_prefetch(&T[s2] - 2); } + sa_sint_t s3 = SA[i + 1 * prefetch_distance + 1]; if (s3 > 0) { libsais16_prefetchw(&induction_bucket[T[s3 - 1]]); libsais16_prefetch(&T[s3] - 2); } + + sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 ^ SAINT_MIN; if (p0 > 0) { p0--; SA[induction_bucket[T[p0]]++] = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1)); } + sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 ^ SAINT_MIN; if (p1 > 0) { p1--; SA[induction_bucket[T[p1]]++] = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1)); } + } + + for (j += 2 * prefetch_distance + 1; i < j; i += 1) + { + sa_sint_t p = SA[i]; SA[i] = p ^ SAINT_MIN; if (p > 0) { p--; SA[induction_bucket[T[p]]++] = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); } + } +} + +#if defined(_OPENMP) + +static fast_sint_t libsais16_final_bwt_scan_left_to_right_16u_block_prepare(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + memset(buckets, 0, ALPHABET_SIZE * sizeof(sa_sint_t)); + + fast_sint_t i, j, count = 0; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) + { + libsais16_prefetchw(&SA[i + 2 * prefetch_distance]); + + sa_sint_t s0 = SA[i + prefetch_distance + 0]; const uint16_t * Ts0 = &T[s0] - 1; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i + prefetch_distance + 1]; const uint16_t * Ts1 = &T[s1] - 1; libsais16_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16_prefetch(s1 > 0 ? 
Ts1 : NULL); + + sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; SA[i + 0] = T[p0] | SAINT_MIN; buckets[cache[count].symbol = T[p0]]++; cache[count++].index = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1)); } + sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; SA[i + 1] = T[p1] | SAINT_MIN; buckets[cache[count].symbol = T[p1]]++; cache[count++].index = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1)); } + } + + for (j += prefetch_distance + 1; i < j; i += 1) + { + sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; SA[i] = T[p] | SAINT_MIN; buckets[cache[count].symbol = T[p]]++; cache[count++].index = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); } + } + + return count; +} + +static fast_sint_t libsais16_final_sorting_scan_left_to_right_16u_block_prepare(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + memset(buckets, 0, ALPHABET_SIZE * sizeof(sa_sint_t)); + + fast_sint_t i, j, count = 0; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) + { + libsais16_prefetchw(&SA[i + 2 * prefetch_distance]); + + sa_sint_t s0 = SA[i + prefetch_distance + 0]; const uint16_t * Ts0 = &T[s0] - 1; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i + prefetch_distance + 1]; const uint16_t * Ts1 = &T[s1] - 1; libsais16_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16_prefetch(s1 > 0 ? Ts1 : NULL); + + sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 ^ SAINT_MIN; if (p0 > 0) { p0--; buckets[cache[count].symbol = T[p0]]++; cache[count++].index = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1)); } + sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 ^ SAINT_MIN; if (p1 > 0) { p1--; buckets[cache[count].symbol = T[p1]]++; cache[count++].index = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1)); } + } + + for (j += prefetch_distance + 1; i < j; i += 1) + { + sa_sint_t p = SA[i]; SA[i] = p ^ SAINT_MIN; if (p > 0) { p--; buckets[cache[count].symbol = T[p]]++; cache[count++].index = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); } + } + + return count; +} + +static void libsais16_final_order_scan_left_to_right_16u_block_place(sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = 0, j = count - 3; i < j; i += 4) + { + libsais16_prefetch(&cache[i + prefetch_distance]); + + SA[buckets[cache[i + 0].symbol]++] = cache[i + 0].index; + SA[buckets[cache[i + 1].symbol]++] = cache[i + 1].index; + SA[buckets[cache[i + 2].symbol]++] = cache[i + 2].index; + SA[buckets[cache[i + 3].symbol]++] = cache[i + 3].index; + } + + for (j += 3; i < j; i += 1) + { + SA[buckets[cache[i].symbol]++] = cache[i].index; + } +} + +static void libsais16_final_bwt_aux_scan_left_to_right_16u_block_place(sa_sint_t * RESTRICT SA, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = 0, j = count - 3; i < j; i += 4) + { + libsais16_prefetch(&cache[i + prefetch_distance]); + + SA[buckets[cache[i + 0].symbol]++] = cache[i + 0].index; if ((cache[i + 
0].index & rm) == 0) { I[(cache[i + 0].index & SAINT_MAX) / (rm + 1)] = buckets[cache[i + 0].symbol]; } + SA[buckets[cache[i + 1].symbol]++] = cache[i + 1].index; if ((cache[i + 1].index & rm) == 0) { I[(cache[i + 1].index & SAINT_MAX) / (rm + 1)] = buckets[cache[i + 1].symbol]; } + SA[buckets[cache[i + 2].symbol]++] = cache[i + 2].index; if ((cache[i + 2].index & rm) == 0) { I[(cache[i + 2].index & SAINT_MAX) / (rm + 1)] = buckets[cache[i + 2].symbol]; } + SA[buckets[cache[i + 3].symbol]++] = cache[i + 3].index; if ((cache[i + 3].index & rm) == 0) { I[(cache[i + 3].index & SAINT_MAX) / (rm + 1)] = buckets[cache[i + 3].symbol]; } + } + + for (j += 3; i < j; i += 1) + { + SA[buckets[cache[i].symbol]++] = cache[i].index; if ((cache[i].index & rm) == 0) { I[(cache[i].index & SAINT_MAX) / (rm + 1)] = buckets[cache[i].symbol]; } + } +} + +static void libsais16_final_sorting_scan_left_to_right_32s_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) + { + libsais16_prefetchw(&SA[i + 2 * prefetch_distance]); + + sa_sint_t s0 = SA[i + prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0] - 1; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i + prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1] - 1; libsais16_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16_prefetch(s1 > 0 ? Ts1 : NULL); + + libsais16_prefetchw(&cache[i + prefetch_distance]); + + sa_sint_t symbol0 = SAINT_MIN, p0 = SA[i + 0]; SA[i + 0] = p0 ^ SAINT_MIN; if (p0 > 0) { p0--; cache[i + 0].index = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1)); symbol0 = T[p0]; } cache[i + 0].symbol = symbol0; + sa_sint_t symbol1 = SAINT_MIN, p1 = SA[i + 1]; SA[i + 1] = p1 ^ SAINT_MIN; if (p1 > 0) { p1--; cache[i + 1].index = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1)); symbol1 = T[p1]; } cache[i + 1].symbol = symbol1; + } + + for (j += prefetch_distance + 1; i < j; i += 1) + { + sa_sint_t symbol = SAINT_MIN, p = SA[i]; SA[i] = p ^ SAINT_MIN; if (p > 0) { p--; cache[i].index = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); symbol = T[p]; } cache[i].symbol = symbol; + } +} + +static void libsais16_final_sorting_scan_left_to_right_32s_block_sort(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j, omp_block_end = omp_block_start + omp_block_size; + for (i = omp_block_start, j = omp_block_end - prefetch_distance - 1; i < j; i += 2) + { + libsais16_prefetchw(&cache[i + 2 * prefetch_distance]); + + sa_sint_t s0 = cache[i + prefetch_distance + 0].symbol; const sa_sint_t * Is0 = &induction_bucket[s0]; libsais16_prefetchw(s0 >= 0 ? Is0 : NULL); + sa_sint_t s1 = cache[i + prefetch_distance + 1].symbol; const sa_sint_t * Is1 = &induction_bucket[s1]; libsais16_prefetchw(s1 >= 0 ? 
Is1 : NULL); + + sa_sint_t v0 = cache[i + 0].symbol; + if (v0 >= 0) + { + cache[i + 0].symbol = induction_bucket[v0]++; + if (cache[i + 0].symbol < omp_block_end) { sa_sint_t ni = cache[i + 0].symbol, np = cache[i + 0].index; cache[i + 0].index = np ^ SAINT_MIN; if (np > 0) { np--; cache[ni].index = np | ((sa_sint_t)(T[np - (np > 0)] < T[np]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np]; } } + } + + sa_sint_t v1 = cache[i + 1].symbol; + if (v1 >= 0) + { + cache[i + 1].symbol = induction_bucket[v1]++; + if (cache[i + 1].symbol < omp_block_end) { sa_sint_t ni = cache[i + 1].symbol, np = cache[i + 1].index; cache[i + 1].index = np ^ SAINT_MIN; if (np > 0) { np--; cache[ni].index = np | ((sa_sint_t)(T[np - (np > 0)] < T[np]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np]; } } + } + } + + for (j += prefetch_distance + 1; i < j; i += 1) + { + sa_sint_t v = cache[i].symbol; + if (v >= 0) + { + cache[i].symbol = induction_bucket[v]++; + if (cache[i].symbol < omp_block_end) { sa_sint_t ni = cache[i].symbol, np = cache[i].index; cache[i].index = np ^ SAINT_MIN; if (np > 0) { np--; cache[ni].index = np | ((sa_sint_t)(T[np - (np > 0)] < T[np]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np]; } } + } + } +} + +static void libsais16_final_bwt_scan_left_to_right_16u_block_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * ALPHABET_SIZE && omp_get_dynamic() == 0) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(thread_state); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : block_size - omp_block_start; + + omp_block_start += block_start; + + if (omp_num_threads == 1) + { + libsais16_final_bwt_scan_left_to_right_16u(T, SA, induction_bucket, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + thread_state[omp_thread_num].state.count = libsais16_final_bwt_scan_left_to_right_16u_block_prepare(T, SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size); + } + + #pragma omp barrier + + #pragma omp master + { + fast_sint_t t; + for (t = 0; t < omp_num_threads; ++t) + { + sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets; + fast_sint_t c; for (c = 0; c < ALPHABET_SIZE; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_bucket[c]; induction_bucket[c] = A + B; temp_bucket[c] = A; } + } + } + + #pragma omp barrier + + { + libsais16_final_order_scan_left_to_right_16u_block_place(SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count); + } + } +#endif + } +} + +static void libsais16_final_bwt_aux_scan_left_to_right_16u_block_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * ALPHABET_SIZE && omp_get_dynamic() == 0) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(thread_state); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : block_size - omp_block_start; + + omp_block_start += block_start; + + if (omp_num_threads == 1) + { + libsais16_final_bwt_aux_scan_left_to_right_16u(T, SA, rm, I, induction_bucket, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + thread_state[omp_thread_num].state.count = libsais16_final_bwt_scan_left_to_right_16u_block_prepare(T, SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size); + } + + #pragma omp barrier + + #pragma omp master + { + fast_sint_t t; + for (t = 0; t < omp_num_threads; ++t) + { + sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets; + fast_sint_t c; for (c = 0; c < ALPHABET_SIZE; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_bucket[c]; induction_bucket[c] = A + B; temp_bucket[c] = A; } + } + } + + #pragma omp barrier + + { + libsais16_final_bwt_aux_scan_left_to_right_16u_block_place(SA, rm, I, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count); + } + } +#endif + } +} + +static void libsais16_final_sorting_scan_left_to_right_16u_block_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * ALPHABET_SIZE && omp_get_dynamic() == 0) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(thread_state); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : block_size - omp_block_start; + + omp_block_start += block_start; + + if (omp_num_threads == 1) + { + libsais16_final_sorting_scan_left_to_right_16u(T, SA, induction_bucket, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + thread_state[omp_thread_num].state.count = libsais16_final_sorting_scan_left_to_right_16u_block_prepare(T, SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size); + } + + #pragma omp barrier + + #pragma omp master + { + fast_sint_t t; + for (t = 0; t < omp_num_threads; ++t) + { + sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets; + fast_sint_t c; for (c = 0; c < ALPHABET_SIZE; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_bucket[c]; induction_bucket[c] = A + B; temp_bucket[c] = A; } + } + } + + #pragma omp barrier + + { + libsais16_final_order_scan_left_to_right_16u_block_place(SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count); + } + } +#endif + } +} + +static void libsais16_final_sorting_scan_left_to_right_32s_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(cache); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : block_size - omp_block_start; + + omp_block_start += block_start; + + if (omp_num_threads == 1) + { + libsais16_final_sorting_scan_left_to_right_32s(T, SA, buckets, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + libsais16_final_sorting_scan_left_to_right_32s_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size); + } + + #pragma omp barrier + + #pragma omp master + { + libsais16_final_sorting_scan_left_to_right_32s_block_sort(T, buckets, cache - block_start, block_start, block_size); + } + + #pragma omp barrier + + { + libsais16_compact_and_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size); + } + } +#endif + } +} + +#endif + +static void libsais16_final_bwt_scan_left_to_right_16u_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, fast_sint_t n, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + SA[induction_bucket[T[(sa_sint_t)n - 1]]++] = ((sa_sint_t)n - 1) | ((sa_sint_t)(T[(sa_sint_t)n - 2] < T[(sa_sint_t)n - 1]) << (SAINT_BIT - 1)); + + if (threads == 1 || n < 65536) + { + libsais16_final_bwt_scan_left_to_right_16u(T, SA, induction_bucket, 0, n); + } +#if defined(_OPENMP) + else + { + fast_sint_t block_start; + for (block_start = 0; block_start < n; ) + { + if (SA[block_start] == 0) + { + block_start++; + } + else + { + fast_sint_t block_max_end = block_start + ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads); if (block_max_end > n) { block_max_end = n;} + fast_sint_t block_end = block_start + 1; while (block_end < block_max_end && SA[block_end] != 0) { block_end++; } + fast_sint_t block_size = block_end - block_start; + + if (block_size < 32) + { + for (; block_start < block_end; block_start += 1) + { + sa_sint_t p = SA[block_start]; SA[block_start] = p & SAINT_MAX; if (p > 0) { p--; SA[block_start] = T[p] | SAINT_MIN; SA[induction_bucket[T[p]]++] = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); } + } + } + else + { + libsais16_final_bwt_scan_left_to_right_16u_block_omp(T, SA, induction_bucket, block_start, block_size, threads, thread_state); + block_start = block_end; + } + } + } + } +#else + UNUSED(thread_state); +#endif +} + +static void libsais16_final_bwt_aux_scan_left_to_right_16u_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, fast_sint_t n, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + SA[induction_bucket[T[(sa_sint_t)n - 1]]++] = ((sa_sint_t)n - 1) | ((sa_sint_t)(T[(sa_sint_t)n - 2] < T[(sa_sint_t)n - 1]) << (SAINT_BIT - 1)); + + if ((((sa_sint_t)n - 1) & rm) == 0) { I[((sa_sint_t)n - 1) / (rm + 1)] = induction_bucket[T[(sa_sint_t)n - 1]]; } + + if (threads == 1 || n < 65536) + { + libsais16_final_bwt_aux_scan_left_to_right_16u(T, SA, rm, I, induction_bucket, 0, n); + } +#if defined(_OPENMP) + else + { + fast_sint_t block_start; + for (block_start = 0; block_start < n; ) + { + if (SA[block_start] == 0) + { + block_start++; + } + else + { + fast_sint_t block_max_end = block_start + ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads); if (block_max_end > n) { block_max_end = n;} + fast_sint_t block_end = block_start + 1; while (block_end < block_max_end && SA[block_end] != 0) { block_end++; } + fast_sint_t block_size = block_end - block_start; + + if (block_size < 32) + { + for (; block_start < block_end; block_start += 1) + { + 
sa_sint_t p = SA[block_start]; SA[block_start] = p & SAINT_MAX; if (p > 0) { p--; SA[block_start] = T[p] | SAINT_MIN; SA[induction_bucket[T[p]]++] = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); if ((p & rm) == 0) { I[p / (rm + 1)] = induction_bucket[T[p]]; } } + } + } + else + { + libsais16_final_bwt_aux_scan_left_to_right_16u_block_omp(T, SA, rm, I, induction_bucket, block_start, block_size, threads, thread_state); + block_start = block_end; + } + } + } + } +#else + UNUSED(thread_state); +#endif +} + +static void libsais16_final_sorting_scan_left_to_right_16u_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, fast_sint_t n, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + SA[induction_bucket[T[(sa_sint_t)n - 1]]++] = ((sa_sint_t)n - 1) | ((sa_sint_t)(T[(sa_sint_t)n - 2] < T[(sa_sint_t)n - 1]) << (SAINT_BIT - 1)); + + if (threads == 1 || n < 65536) + { + libsais16_final_sorting_scan_left_to_right_16u(T, SA, induction_bucket, 0, n); + } +#if defined(_OPENMP) + else + { + fast_sint_t block_start; + for (block_start = 0; block_start < n; ) + { + if (SA[block_start] == 0) + { + block_start++; + } + else + { + fast_sint_t block_max_end = block_start + ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads); if (block_max_end > n) { block_max_end = n;} + fast_sint_t block_end = block_start + 1; while (block_end < block_max_end && SA[block_end] != 0) { block_end++; } + fast_sint_t block_size = block_end - block_start; + + if (block_size < 32) + { + for (; block_start < block_end; block_start += 1) + { + sa_sint_t p = SA[block_start]; SA[block_start] = p ^ SAINT_MIN; if (p > 0) { p--; SA[induction_bucket[T[p]]++] = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); } + } + } + else + { + libsais16_final_sorting_scan_left_to_right_16u_block_omp(T, SA, induction_bucket, block_start, block_size, threads, thread_state); + block_start = block_end; + } + } + } + } +#else + UNUSED(thread_state); +#endif +} + +static void libsais16_final_sorting_scan_left_to_right_32s_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + SA[induction_bucket[T[n - 1]]++] = (n - 1) | ((sa_sint_t)(T[n - 2] < T[n - 1]) << (SAINT_BIT - 1)); + + if (threads == 1 || n < 65536) + { + libsais16_final_sorting_scan_left_to_right_32s(T, SA, induction_bucket, 0, n); + } +#if defined(_OPENMP) + else + { + fast_sint_t block_start, block_end; + for (block_start = 0; block_start < n; block_start = block_end) + { + block_end = block_start + (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end > n) { block_end = n; } + + libsais16_final_sorting_scan_left_to_right_32s_block_omp(T, SA, induction_bucket, thread_state[0].state.cache, block_start, block_end - block_start, threads); + } + } +#else + UNUSED(thread_state); +#endif +} + +static sa_sint_t libsais16_final_bwt_scan_right_to_left_16u(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; sa_sint_t index = -1; + for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2) + { + libsais16_prefetchw(&SA[i - 2 * prefetch_distance]); + + sa_sint_t s0 = SA[i - prefetch_distance - 0]; const uint16_t * Ts0 = &T[s0] - 1; 
libsais16_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i - prefetch_distance - 1]; const uint16_t * Ts1 = &T[s1] - 1; libsais16_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16_prefetch(s1 > 0 ? Ts1 : NULL); + + sa_sint_t p0 = SA[i - 0]; index = (p0 == 0) ? (sa_sint_t)(i - 0) : index; + SA[i - 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; uint16_t c0 = T[p0 - (p0 > 0)], c1 = T[p0]; SA[i - 0] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? p0 : t; } + + sa_sint_t p1 = SA[i - 1]; index = (p1 == 0) ? (sa_sint_t)(i - 1) : index; + SA[i - 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; uint16_t c0 = T[p1 - (p1 > 0)], c1 = T[p1]; SA[i - 1] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? p1 : t; } + } + + for (j -= prefetch_distance + 1; i >= j; i -= 1) + { + sa_sint_t p = SA[i]; index = (p == 0) ? (sa_sint_t)i : index; + SA[i] = p & SAINT_MAX; if (p > 0) { p--; uint16_t c0 = T[p - (p > 0)], c1 = T[p]; SA[i] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? p : t; } + } + + return index; +} + +static void libsais16_final_bwt_aux_scan_right_to_left_16u(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2) + { + libsais16_prefetchw(&SA[i - 2 * prefetch_distance]); + + sa_sint_t s0 = SA[i - prefetch_distance - 0]; const uint16_t * Ts0 = &T[s0] - 1; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i - prefetch_distance - 1]; const uint16_t * Ts1 = &T[s1] - 1; libsais16_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16_prefetch(s1 > 0 ? Ts1 : NULL); + + sa_sint_t p0 = SA[i - 0]; + SA[i - 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; uint16_t c0 = T[p0 - (p0 > 0)], c1 = T[p0]; SA[i - 0] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? p0 : t; if ((p0 & rm) == 0) { I[p0 / (rm + 1)] = induction_bucket[T[p0]] + 1; } } + + sa_sint_t p1 = SA[i - 1]; + SA[i - 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; uint16_t c0 = T[p1 - (p1 > 0)], c1 = T[p1]; SA[i - 1] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? p1 : t; if ((p1 & rm) == 0) { I[p1 / (rm + 1)] = induction_bucket[T[p1]] + 1; } } + } + + for (j -= prefetch_distance + 1; i >= j; i -= 1) + { + sa_sint_t p = SA[i]; + SA[i] = p & SAINT_MAX; if (p > 0) { p--; uint16_t c0 = T[p - (p > 0)], c1 = T[p]; SA[i] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? p : t; if ((p & rm) == 0) { I[p / (rm + 1)] = induction_bucket[T[p]] + 1; } } + } +} + +static void libsais16_final_sorting_scan_right_to_left_16u(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2) + { + libsais16_prefetchw(&SA[i - 2 * prefetch_distance]); + + sa_sint_t s0 = SA[i - prefetch_distance - 0]; const uint16_t * Ts0 = &T[s0] - 1; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16_prefetch(s0 > 0 ? 
Ts0 : NULL); + sa_sint_t s1 = SA[i - prefetch_distance - 1]; const uint16_t * Ts1 = &T[s1] - 1; libsais16_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16_prefetch(s1 > 0 ? Ts1 : NULL); + + sa_sint_t p0 = SA[i - 0]; SA[i - 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; SA[--induction_bucket[T[p0]]] = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] > T[p0]) << (SAINT_BIT - 1)); } + sa_sint_t p1 = SA[i - 1]; SA[i - 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; SA[--induction_bucket[T[p1]]] = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] > T[p1]) << (SAINT_BIT - 1)); } + } + + for (j -= prefetch_distance + 1; i >= j; i -= 1) + { + sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; SA[--induction_bucket[T[p]]] = p | ((sa_sint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1)); } + } +} + +static void libsais16_final_sorting_scan_right_to_left_32s(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 1; i >= j; i -= 2) + { + libsais16_prefetchw(&SA[i - 3 * prefetch_distance]); + + sa_sint_t s0 = SA[i - 2 * prefetch_distance - 0]; const sa_sint_t * Ts0 = &T[s0] - 1; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i - 2 * prefetch_distance - 1]; const sa_sint_t * Ts1 = &T[s1] - 1; libsais16_prefetch(s1 > 0 ? Ts1 : NULL); + sa_sint_t s2 = SA[i - 1 * prefetch_distance - 0]; if (s2 > 0) { libsais16_prefetchw(&induction_bucket[T[s2 - 1]]); libsais16_prefetch(&T[s2] - 2); } + sa_sint_t s3 = SA[i - 1 * prefetch_distance - 1]; if (s3 > 0) { libsais16_prefetchw(&induction_bucket[T[s3 - 1]]); libsais16_prefetch(&T[s3] - 2); } + + sa_sint_t p0 = SA[i - 0]; SA[i - 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; SA[--induction_bucket[T[p0]]] = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] > T[p0]) << (SAINT_BIT - 1)); } + sa_sint_t p1 = SA[i - 1]; SA[i - 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; SA[--induction_bucket[T[p1]]] = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] > T[p1]) << (SAINT_BIT - 1)); } + } + + for (j -= 2 * prefetch_distance + 1; i >= j; i -= 1) + { + sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; SA[--induction_bucket[T[p]]] = p | ((sa_sint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1)); } + } +} + +#if defined(_OPENMP) + +static fast_sint_t libsais16_final_bwt_scan_right_to_left_16u_block_prepare(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + memset(buckets, 0, ALPHABET_SIZE * sizeof(sa_sint_t)); + + fast_sint_t i, j, count = 0; + for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2) + { + libsais16_prefetchw(&SA[i - 2 * prefetch_distance]); + + sa_sint_t s0 = SA[i - prefetch_distance - 0]; const uint16_t * Ts0 = &T[s0] - 1; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i - prefetch_distance - 1]; const uint16_t * Ts1 = &T[s1] - 1; libsais16_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16_prefetch(s1 > 0 ? Ts1 : NULL); + + sa_sint_t p0 = SA[i - 0]; SA[i - 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; uint16_t c0 = T[p0 - (p0 > 0)], c1 = T[p0]; SA[i - 0] = c1; sa_sint_t t = c0 | SAINT_MIN; buckets[cache[count].symbol = c1]++; cache[count++].index = (c0 <= c1) ? 
p0 : t; } + sa_sint_t p1 = SA[i - 1]; SA[i - 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; uint16_t c0 = T[p1 - (p1 > 0)], c1 = T[p1]; SA[i - 1] = c1; sa_sint_t t = c0 | SAINT_MIN; buckets[cache[count].symbol = c1]++; cache[count++].index = (c0 <= c1) ? p1 : t; } + } + + for (j -= prefetch_distance + 1; i >= j; i -= 1) + { + sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; uint16_t c0 = T[p - (p > 0)], c1 = T[p]; SA[i] = c1; sa_sint_t t = c0 | SAINT_MIN; buckets[cache[count].symbol = c1]++; cache[count++].index = (c0 <= c1) ? p : t; } + } + + return count; +} + +static fast_sint_t libsais16_final_bwt_aux_scan_right_to_left_16u_block_prepare(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + memset(buckets, 0, ALPHABET_SIZE * sizeof(sa_sint_t)); + + fast_sint_t i, j, count = 0; + for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2) + { + libsais16_prefetchw(&SA[i - 2 * prefetch_distance]); + + sa_sint_t s0 = SA[i - prefetch_distance - 0]; const uint16_t * Ts0 = &T[s0] - 1; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i - prefetch_distance - 1]; const uint16_t * Ts1 = &T[s1] - 1; libsais16_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16_prefetch(s1 > 0 ? Ts1 : NULL); + + sa_sint_t p0 = SA[i - 0]; SA[i - 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; uint16_t c0 = T[p0 - (p0 > 0)], c1 = T[p0]; SA[i - 0] = c1; sa_sint_t t = c0 | SAINT_MIN; buckets[cache[count].symbol = c1]++; cache[count].index = (c0 <= c1) ? p0 : t; cache[count + 1].index = p0; count += 2; } + sa_sint_t p1 = SA[i - 1]; SA[i - 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; uint16_t c0 = T[p1 - (p1 > 0)], c1 = T[p1]; SA[i - 1] = c1; sa_sint_t t = c0 | SAINT_MIN; buckets[cache[count].symbol = c1]++; cache[count].index = (c0 <= c1) ? p1 : t; cache[count + 1].index = p1; count += 2; } + } + + for (j -= prefetch_distance + 1; i >= j; i -= 1) + { + sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; uint16_t c0 = T[p - (p > 0)], c1 = T[p]; SA[i] = c1; sa_sint_t t = c0 | SAINT_MIN; buckets[cache[count].symbol = c1]++; cache[count].index = (c0 <= c1) ? p : t; cache[count + 1].index = p; count += 2; } + } + + return count; +} + +static fast_sint_t libsais16_final_sorting_scan_right_to_left_16u_block_prepare(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + memset(buckets, 0, ALPHABET_SIZE * sizeof(sa_sint_t)); + + fast_sint_t i, j, count = 0; + for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2) + { + libsais16_prefetchw(&SA[i - 2 * prefetch_distance]); + + sa_sint_t s0 = SA[i - prefetch_distance - 0]; const uint16_t * Ts0 = &T[s0] - 1; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i - prefetch_distance - 1]; const uint16_t * Ts1 = &T[s1] - 1; libsais16_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16_prefetch(s1 > 0 ? 
Ts1 : NULL); + + sa_sint_t p0 = SA[i - 0]; SA[i - 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; buckets[cache[count].symbol = T[p0]]++; cache[count++].index = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] > T[p0]) << (SAINT_BIT - 1)); } + sa_sint_t p1 = SA[i - 1]; SA[i - 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; buckets[cache[count].symbol = T[p1]]++; cache[count++].index = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] > T[p1]) << (SAINT_BIT - 1)); } + } + + for (j -= prefetch_distance + 1; i >= j; i -= 1) + { + sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; buckets[cache[count].symbol = T[p]]++; cache[count++].index = p | ((sa_sint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1)); } + } + + return count; +} + +static void libsais16_final_order_scan_right_to_left_16u_block_place(sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = 0, j = count - 3; i < j; i += 4) + { + libsais16_prefetch(&cache[i + prefetch_distance]); + + SA[--buckets[cache[i + 0].symbol]] = cache[i + 0].index; + SA[--buckets[cache[i + 1].symbol]] = cache[i + 1].index; + SA[--buckets[cache[i + 2].symbol]] = cache[i + 2].index; + SA[--buckets[cache[i + 3].symbol]] = cache[i + 3].index; + } + + for (j += 3; i < j; i += 1) + { + SA[--buckets[cache[i].symbol]] = cache[i].index; + } +} + +static void libsais16_final_bwt_aux_scan_right_to_left_16u_block_place(sa_sint_t * RESTRICT SA, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = 0, j = count - 6; i < j; i += 8) + { + libsais16_prefetch(&cache[i + prefetch_distance]); + + SA[--buckets[cache[i + 0].symbol]] = cache[i + 0].index; if ((cache[i + 1].index & rm) == 0) { I[cache[i + 1].index / (rm + 1)] = buckets[cache[i + 0].symbol] + 1; } + SA[--buckets[cache[i + 2].symbol]] = cache[i + 2].index; if ((cache[i + 3].index & rm) == 0) { I[cache[i + 3].index / (rm + 1)] = buckets[cache[i + 2].symbol] + 1; } + SA[--buckets[cache[i + 4].symbol]] = cache[i + 4].index; if ((cache[i + 5].index & rm) == 0) { I[cache[i + 5].index / (rm + 1)] = buckets[cache[i + 4].symbol] + 1; } + SA[--buckets[cache[i + 6].symbol]] = cache[i + 6].index; if ((cache[i + 7].index & rm) == 0) { I[cache[i + 7].index / (rm + 1)] = buckets[cache[i + 6].symbol] + 1; } + } + + for (j += 6; i < j; i += 2) + { + SA[--buckets[cache[i].symbol]] = cache[i].index; if ((cache[i + 1].index & rm) == 0) { I[(cache[i + 1].index & SAINT_MAX) / (rm + 1)] = buckets[cache[i].symbol] + 1; } + } +} + +static void libsais16_final_sorting_scan_right_to_left_32s_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) + { + libsais16_prefetchw(&SA[i + 2 * prefetch_distance]); + + sa_sint_t s0 = SA[i + prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0] - 1; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16_prefetch(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i + prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1] - 1; libsais16_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16_prefetch(s1 > 0 ? 
Ts1 : NULL); + + libsais16_prefetchw(&cache[i + prefetch_distance]); + + sa_sint_t symbol0 = SAINT_MIN, p0 = SA[i + 0]; SA[i + 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; cache[i + 0].index = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] > T[p0]) << (SAINT_BIT - 1)); symbol0 = T[p0]; } cache[i + 0].symbol = symbol0; + sa_sint_t symbol1 = SAINT_MIN, p1 = SA[i + 1]; SA[i + 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; cache[i + 1].index = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] > T[p1]) << (SAINT_BIT - 1)); symbol1 = T[p1]; } cache[i + 1].symbol = symbol1; + } + + for (j += prefetch_distance + 1; i < j; i += 1) + { + sa_sint_t symbol = SAINT_MIN, p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; cache[i].index = p | ((sa_sint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1)); symbol = T[p]; } cache[i].symbol = symbol; + } +} + +static void libsais16_final_sorting_scan_right_to_left_32s_block_sort(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2) + { + libsais16_prefetchw(&cache[i - 2 * prefetch_distance]); + + sa_sint_t s0 = cache[i - prefetch_distance - 0].symbol; const sa_sint_t * Is0 = &induction_bucket[s0]; libsais16_prefetchw(s0 >= 0 ? Is0 : NULL); + sa_sint_t s1 = cache[i - prefetch_distance - 1].symbol; const sa_sint_t * Is1 = &induction_bucket[s1]; libsais16_prefetchw(s1 >= 0 ? Is1 : NULL); + + sa_sint_t v0 = cache[i - 0].symbol; + if (v0 >= 0) + { + cache[i - 0].symbol = --induction_bucket[v0]; + if (cache[i - 0].symbol >= omp_block_start) { sa_sint_t ni = cache[i - 0].symbol, np = cache[i - 0].index; cache[i - 0].index = np & SAINT_MAX; if (np > 0) { np--; cache[ni].index = np | ((sa_sint_t)(T[np - (np > 0)] > T[np]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np]; } } + } + + sa_sint_t v1 = cache[i - 1].symbol; + if (v1 >= 0) + { + cache[i - 1].symbol = --induction_bucket[v1]; + if (cache[i - 1].symbol >= omp_block_start) { sa_sint_t ni = cache[i - 1].symbol, np = cache[i - 1].index; cache[i - 1].index = np & SAINT_MAX; if (np > 0) { np--; cache[ni].index = np | ((sa_sint_t)(T[np - (np > 0)] > T[np]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np]; } } + } + } + + for (j -= prefetch_distance + 1; i >= j; i -= 1) + { + sa_sint_t v = cache[i].symbol; + if (v >= 0) + { + cache[i].symbol = --induction_bucket[v]; + if (cache[i].symbol >= omp_block_start) { sa_sint_t ni = cache[i].symbol, np = cache[i].index; cache[i].index = np & SAINT_MAX; if (np > 0) { np--; cache[ni].index = np | ((sa_sint_t)(T[np - (np > 0)] > T[np]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np]; } } + } + } +} + +static void libsais16_final_bwt_scan_right_to_left_16u_block_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * ALPHABET_SIZE && omp_get_dynamic() == 0) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(thread_state); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (block_size / omp_num_threads) & 
(-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start; + + omp_block_start += block_start; + + if (omp_num_threads == 1) + { + libsais16_final_bwt_scan_right_to_left_16u(T, SA, induction_bucket, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + thread_state[omp_thread_num].state.count = libsais16_final_bwt_scan_right_to_left_16u_block_prepare(T, SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size); + } + + #pragma omp barrier + + #pragma omp master + { + fast_sint_t t; + for (t = omp_num_threads - 1; t >= 0; --t) + { + sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets; + fast_sint_t c; for (c = 0; c < ALPHABET_SIZE; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_bucket[c]; induction_bucket[c] = A - B; temp_bucket[c] = A; } + } + } + + #pragma omp barrier + + { + libsais16_final_order_scan_right_to_left_16u_block_place(SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count); + } + } +#endif + } +} + +static void libsais16_final_bwt_aux_scan_right_to_left_16u_block_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * ALPHABET_SIZE && omp_get_dynamic() == 0) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(thread_state); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : block_size - omp_block_start; + + omp_block_start += block_start; + + if (omp_num_threads == 1) + { + libsais16_final_bwt_aux_scan_right_to_left_16u(T, SA, rm, I, induction_bucket, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + thread_state[omp_thread_num].state.count = libsais16_final_bwt_aux_scan_right_to_left_16u_block_prepare(T, SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size); + } + + #pragma omp barrier + + #pragma omp master + { + fast_sint_t t; + for (t = omp_num_threads - 1; t >= 0; --t) + { + sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets; + fast_sint_t c; for (c = 0; c < ALPHABET_SIZE; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_bucket[c]; induction_bucket[c] = A - B; temp_bucket[c] = A; } + } + } + + #pragma omp barrier + + { + libsais16_final_bwt_aux_scan_right_to_left_16u_block_place(SA, rm, I, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count); + } + } +#endif + } +} + +static void libsais16_final_sorting_scan_right_to_left_16u_block_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * ALPHABET_SIZE && omp_get_dynamic() == 0) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(thread_state); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : block_size - omp_block_start; + + omp_block_start += block_start; + + if (omp_num_threads == 1) + { + libsais16_final_sorting_scan_right_to_left_16u(T, SA, induction_bucket, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + thread_state[omp_thread_num].state.count = libsais16_final_sorting_scan_right_to_left_16u_block_prepare(T, SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size); + } + + #pragma omp barrier + + #pragma omp master + { + fast_sint_t t; + for (t = omp_num_threads - 1; t >= 0; --t) + { + sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets; + fast_sint_t c; for (c = 0; c < ALPHABET_SIZE; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_bucket[c]; induction_bucket[c] = A - B; temp_bucket[c] = A; } + } + } + + #pragma omp barrier + + { + libsais16_final_order_scan_right_to_left_16u_block_place(SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count); + } + } +#endif + } +} + +static void libsais16_final_sorting_scan_right_to_left_32s_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(cache); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : block_size - omp_block_start; + + omp_block_start += block_start; + + if (omp_num_threads == 1) + { + libsais16_final_sorting_scan_right_to_left_32s(T, SA, buckets, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + libsais16_final_sorting_scan_right_to_left_32s_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size); + } + + #pragma omp barrier + + #pragma omp master + { + libsais16_final_sorting_scan_right_to_left_32s_block_sort(T, buckets, cache - block_start, block_start, block_size); + } + + #pragma omp barrier + + { + libsais16_compact_and_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size); + } + } +#endif + } +} + +#endif + +static sa_sint_t libsais16_final_bwt_scan_right_to_left_16u_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + sa_sint_t index = -1; + + if (threads == 1 || n < 65536) + { + index = libsais16_final_bwt_scan_right_to_left_16u(T, SA, induction_bucket, 0, n); + } +#if defined(_OPENMP) + else + { + fast_sint_t block_start; + for (block_start = (fast_sint_t)n - 1; block_start >= 0; ) + { + if (SA[block_start] == 0) + { + index = (sa_sint_t)block_start--; + } + else + { + fast_sint_t block_max_end = block_start - ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads); if (block_max_end < 0) { block_max_end = -1; } + fast_sint_t block_end = block_start - 1; while (block_end > block_max_end && SA[block_end] != 0) { block_end--; } + fast_sint_t block_size = block_start - block_end; + + if (block_size < 32) + { + for (; block_start > block_end; block_start -= 1) + { + sa_sint_t p = SA[block_start]; SA[block_start] = p & SAINT_MAX; if (p > 0) { p--; uint16_t c0 = T[p - (p > 0)], c1 = T[p]; SA[block_start] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? p : t; } + } + } + else + { + libsais16_final_bwt_scan_right_to_left_16u_block_omp(T, SA, induction_bucket, block_end + 1, block_size, threads, thread_state); + block_start = block_end; + } + } + } + } +#else + UNUSED(thread_state); +#endif + + return index; +} + +static void libsais16_final_bwt_aux_scan_right_to_left_16u_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + if (threads == 1 || n < 65536) + { + libsais16_final_bwt_aux_scan_right_to_left_16u(T, SA, rm, I, induction_bucket, 0, n); + } +#if defined(_OPENMP) + else + { + fast_sint_t block_start; + for (block_start = (fast_sint_t)n - 1; block_start >= 0; ) + { + if (SA[block_start] == 0) + { + block_start--; + } + else + { + fast_sint_t block_max_end = block_start - ((fast_sint_t)threads) * ((LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads) / 2); if (block_max_end < 0) { block_max_end = -1; } + fast_sint_t block_end = block_start - 1; while (block_end > block_max_end && SA[block_end] != 0) { block_end--; } + fast_sint_t block_size = block_start - block_end; + + if (block_size < 32) + { + for (; block_start > block_end; block_start -= 1) + { + sa_sint_t p = SA[block_start]; SA[block_start] = p & SAINT_MAX; if (p > 0) { p--; uint16_t c0 = T[p - (p > 0)], c1 = T[p]; SA[block_start] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? 
p : t; if ((p & rm) == 0) { I[p / (rm + 1)] = induction_bucket[T[p]] + 1; } } + } + } + else + { + libsais16_final_bwt_aux_scan_right_to_left_16u_block_omp(T, SA, rm, I, induction_bucket, block_end + 1, block_size, threads, thread_state); + block_start = block_end; + } + } + } + } +#else + UNUSED(thread_state); +#endif +} + +static void libsais16_final_sorting_scan_right_to_left_16u_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + if (threads == 1 || n < 65536) + { + libsais16_final_sorting_scan_right_to_left_16u(T, SA, induction_bucket, 0, n); + } +#if defined(_OPENMP) + else + { + fast_sint_t block_start; + for (block_start = (fast_sint_t)n - 1; block_start >= 0; ) + { + if (SA[block_start] == 0) + { + block_start--; + } + else + { + fast_sint_t block_max_end = block_start - ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads); if (block_max_end < -1) { block_max_end = -1; } + fast_sint_t block_end = block_start - 1; while (block_end > block_max_end && SA[block_end] != 0) { block_end--; } + fast_sint_t block_size = block_start - block_end; + + if (block_size < 32) + { + for (; block_start > block_end; block_start -= 1) + { + sa_sint_t p = SA[block_start]; SA[block_start] = p & SAINT_MAX; if (p > 0) { p--; SA[--induction_bucket[T[p]]] = p | ((sa_sint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1)); } + } + } + else + { + libsais16_final_sorting_scan_right_to_left_16u_block_omp(T, SA, induction_bucket, block_end + 1, block_size, threads, thread_state); + block_start = block_end; + } + } + } + } +#else + UNUSED(thread_state); +#endif +} + +static void libsais16_final_sorting_scan_right_to_left_32s_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + if (threads == 1 || n < 65536) + { + libsais16_final_sorting_scan_right_to_left_32s(T, SA, induction_bucket, 0, n); + } +#if defined(_OPENMP) + else + { + fast_sint_t block_start, block_end; + for (block_start = (fast_sint_t)n - 1; block_start >= 0; block_start = block_end) + { + block_end = block_start - (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end < 0) { block_end = -1; } + + libsais16_final_sorting_scan_right_to_left_32s_block_omp(T, SA, induction_bucket, thread_state[0].state.cache, block_end + 1, block_start - block_end, threads); + } + } +#else + UNUSED(thread_state); +#endif +} + +static void libsais16_clear_lms_suffixes_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT bucket_start, sa_sint_t * RESTRICT bucket_end, sa_sint_t threads) +{ + fast_sint_t c; + +#if defined(_OPENMP) + #pragma omp parallel for schedule(static, 1) num_threads(threads) if(threads > 1 && n >= 65536) +#else + UNUSED(threads); UNUSED(n); +#endif + for (c = 0; c < k; ++c) + { + if (bucket_end[c] > bucket_start[c]) + { + memset(&SA[bucket_start[c]], 0, ((size_t)bucket_end[c] - (size_t)bucket_start[c]) * sizeof(sa_sint_t)); + } + } +} + +static sa_sint_t libsais16_induce_final_order_16u_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t bwt, sa_sint_t r, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + if (!bwt) + { + libsais16_final_sorting_scan_left_to_right_16u_omp(T, SA, n, &buckets[6 * ALPHABET_SIZE], threads, thread_state); + if 
(threads > 1 && n >= 65536) { libsais16_clear_lms_suffixes_omp(SA, n, ALPHABET_SIZE, &buckets[6 * ALPHABET_SIZE], &buckets[7 * ALPHABET_SIZE], threads); } + libsais16_final_sorting_scan_right_to_left_16u_omp(T, SA, n, &buckets[7 * ALPHABET_SIZE], threads, thread_state); + return 0; + } + else if (I != NULL) + { + libsais16_final_bwt_aux_scan_left_to_right_16u_omp(T, SA, n, r - 1, I, &buckets[6 * ALPHABET_SIZE], threads, thread_state); + if (threads > 1 && n >= 65536) { libsais16_clear_lms_suffixes_omp(SA, n, ALPHABET_SIZE, &buckets[6 * ALPHABET_SIZE], &buckets[7 * ALPHABET_SIZE], threads); } + libsais16_final_bwt_aux_scan_right_to_left_16u_omp(T, SA, n, r - 1, I, &buckets[7 * ALPHABET_SIZE], threads, thread_state); + return 0; + } + else + { + libsais16_final_bwt_scan_left_to_right_16u_omp(T, SA, n, &buckets[6 * ALPHABET_SIZE], threads, thread_state); + if (threads > 1 && n >= 65536) { libsais16_clear_lms_suffixes_omp(SA, n, ALPHABET_SIZE, &buckets[6 * ALPHABET_SIZE], &buckets[7 * ALPHABET_SIZE], threads); } + return libsais16_final_bwt_scan_right_to_left_16u_omp(T, SA, n, &buckets[7 * ALPHABET_SIZE], threads, thread_state); + } +} + +static void libsais16_induce_final_order_32s_6k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + libsais16_final_sorting_scan_left_to_right_32s_omp(T, SA, n, &buckets[4 * k], threads, thread_state); + libsais16_final_sorting_scan_right_to_left_32s_omp(T, SA, n, &buckets[5 * k], threads, thread_state); +} + +static void libsais16_induce_final_order_32s_4k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + libsais16_final_sorting_scan_left_to_right_32s_omp(T, SA, n, &buckets[2 * k], threads, thread_state); + libsais16_final_sorting_scan_right_to_left_32s_omp(T, SA, n, &buckets[3 * k], threads, thread_state); +} + +static void libsais16_induce_final_order_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + libsais16_final_sorting_scan_left_to_right_32s_omp(T, SA, n, &buckets[1 * k], threads, thread_state); + libsais16_final_sorting_scan_right_to_left_32s_omp(T, SA, n, &buckets[0 * k], threads, thread_state); +} + +static void libsais16_induce_final_order_32s_1k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + libsais16_count_suffixes_32s(T, n, k, buckets); + libsais16_initialize_buckets_start_32s_1k(k, buckets); + libsais16_final_sorting_scan_left_to_right_32s_omp(T, SA, n, buckets, threads, thread_state); + + libsais16_count_suffixes_32s(T, n, k, buckets); + libsais16_initialize_buckets_end_32s_1k(k, buckets); + libsais16_final_sorting_scan_right_to_left_32s_omp(T, SA, n, buckets, threads, thread_state); +} + +static sa_sint_t libsais16_renumber_unique_and_nonunique_lms_suffixes_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t f, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + sa_sint_t * RESTRICT SAm = &SA[m]; + + sa_sint_t i, j; + for (i = (sa_sint_t)omp_block_start, j = (sa_sint_t)omp_block_start + (sa_sint_t)omp_block_size - 2 * 
(sa_sint_t)prefetch_distance - 3; i < j; i += 4) + { + libsais16_prefetch(&SA[i + 3 * prefetch_distance]); + + libsais16_prefetchw(&SAm[((sa_uint_t)SA[i + 2 * prefetch_distance + 0]) >> 1]); + libsais16_prefetchw(&SAm[((sa_uint_t)SA[i + 2 * prefetch_distance + 1]) >> 1]); + libsais16_prefetchw(&SAm[((sa_uint_t)SA[i + 2 * prefetch_distance + 2]) >> 1]); + libsais16_prefetchw(&SAm[((sa_uint_t)SA[i + 2 * prefetch_distance + 3]) >> 1]); + + sa_uint_t q0 = (sa_uint_t)SA[i + prefetch_distance + 0]; const sa_sint_t * Tq0 = &T[q0]; libsais16_prefetchw(SAm[q0 >> 1] < 0 ? Tq0 : NULL); + sa_uint_t q1 = (sa_uint_t)SA[i + prefetch_distance + 1]; const sa_sint_t * Tq1 = &T[q1]; libsais16_prefetchw(SAm[q1 >> 1] < 0 ? Tq1 : NULL); + sa_uint_t q2 = (sa_uint_t)SA[i + prefetch_distance + 2]; const sa_sint_t * Tq2 = &T[q2]; libsais16_prefetchw(SAm[q2 >> 1] < 0 ? Tq2 : NULL); + sa_uint_t q3 = (sa_uint_t)SA[i + prefetch_distance + 3]; const sa_sint_t * Tq3 = &T[q3]; libsais16_prefetchw(SAm[q3 >> 1] < 0 ? Tq3 : NULL); + + sa_uint_t p0 = (sa_uint_t)SA[i + 0]; sa_sint_t s0 = SAm[p0 >> 1]; if (s0 < 0) { T[p0] |= SAINT_MIN; f++; s0 = i + 0 + SAINT_MIN + f; } SAm[p0 >> 1] = s0 - f; + sa_uint_t p1 = (sa_uint_t)SA[i + 1]; sa_sint_t s1 = SAm[p1 >> 1]; if (s1 < 0) { T[p1] |= SAINT_MIN; f++; s1 = i + 1 + SAINT_MIN + f; } SAm[p1 >> 1] = s1 - f; + sa_uint_t p2 = (sa_uint_t)SA[i + 2]; sa_sint_t s2 = SAm[p2 >> 1]; if (s2 < 0) { T[p2] |= SAINT_MIN; f++; s2 = i + 2 + SAINT_MIN + f; } SAm[p2 >> 1] = s2 - f; + sa_uint_t p3 = (sa_uint_t)SA[i + 3]; sa_sint_t s3 = SAm[p3 >> 1]; if (s3 < 0) { T[p3] |= SAINT_MIN; f++; s3 = i + 3 + SAINT_MIN + f; } SAm[p3 >> 1] = s3 - f; + } + + for (j += 2 * (sa_sint_t)prefetch_distance + 3; i < j; i += 1) + { + sa_uint_t p = (sa_uint_t)SA[i]; sa_sint_t s = SAm[p >> 1]; if (s < 0) { T[p] |= SAINT_MIN; f++; s = i + SAINT_MIN + f; } SAm[p >> 1] = s - f; + } + + return f; +} + +static void libsais16_compact_unique_and_nonunique_lms_suffixes_32s(sa_sint_t * RESTRICT SA, sa_sint_t m, fast_sint_t * pl, fast_sint_t * pr, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + sa_sint_t * RESTRICT SAl = &SA[0]; + sa_sint_t * RESTRICT SAr = &SA[0]; + + fast_sint_t i, j, l = *pl - 1, r = *pr - 1; + for (i = (fast_sint_t)m + omp_block_start + omp_block_size - 1, j = (fast_sint_t)m + omp_block_start + 3; i >= j; i -= 4) + { + libsais16_prefetch(&SA[i - prefetch_distance]); + + sa_sint_t p0 = SA[i - 0]; SAl[l] = p0 & SAINT_MAX; l -= p0 < 0; SAr[r] = p0 - 1; r -= p0 > 0; + sa_sint_t p1 = SA[i - 1]; SAl[l] = p1 & SAINT_MAX; l -= p1 < 0; SAr[r] = p1 - 1; r -= p1 > 0; + sa_sint_t p2 = SA[i - 2]; SAl[l] = p2 & SAINT_MAX; l -= p2 < 0; SAr[r] = p2 - 1; r -= p2 > 0; + sa_sint_t p3 = SA[i - 3]; SAl[l] = p3 & SAINT_MAX; l -= p3 < 0; SAr[r] = p3 - 1; r -= p3 > 0; + } + + for (j -= 3; i >= j; i -= 1) + { + sa_sint_t p = SA[i]; SAl[l] = p & SAINT_MAX; l -= p < 0; SAr[r] = p - 1; r -= p > 0; + } + + *pl = l + 1; *pr = r + 1; +} + + +#if defined(_OPENMP) + +static sa_sint_t libsais16_count_unique_suffixes(sa_sint_t * RESTRICT SA, sa_sint_t m, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + sa_sint_t * RESTRICT SAm = &SA[m]; + + fast_sint_t i, j; sa_sint_t f0 = 0, f1 = 0, f2 = 0, f3 = 0; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4) + { + libsais16_prefetch(&SA[i + 2 * prefetch_distance]); + + libsais16_prefetch(&SAm[((sa_uint_t)SA[i + prefetch_distance + 0]) >> 
1]); + libsais16_prefetch(&SAm[((sa_uint_t)SA[i + prefetch_distance + 1]) >> 1]); + libsais16_prefetch(&SAm[((sa_uint_t)SA[i + prefetch_distance + 2]) >> 1]); + libsais16_prefetch(&SAm[((sa_uint_t)SA[i + prefetch_distance + 3]) >> 1]); + + f0 += SAm[((sa_uint_t)SA[i + 0]) >> 1] < 0; + f1 += SAm[((sa_uint_t)SA[i + 1]) >> 1] < 0; + f2 += SAm[((sa_uint_t)SA[i + 2]) >> 1] < 0; + f3 += SAm[((sa_uint_t)SA[i + 3]) >> 1] < 0; + } + + for (j += prefetch_distance + 3; i < j; i += 1) + { + f0 += SAm[((sa_uint_t)SA[i]) >> 1] < 0; + } + + return f0 + f1 + f2 + f3; +} + +#endif + +static sa_sint_t libsais16_renumber_unique_and_nonunique_lms_suffixes_32s_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + sa_sint_t f = 0; + +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && m >= 65536) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(thread_state); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (m / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : m - omp_block_start; + + if (omp_num_threads == 1) + { + f = libsais16_renumber_unique_and_nonunique_lms_suffixes_32s(T, SA, m, 0, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + thread_state[omp_thread_num].state.count = libsais16_count_unique_suffixes(SA, m, omp_block_start, omp_block_size); + } + + #pragma omp barrier + + { + fast_sint_t t, count = 0; for (t = 0; t < omp_thread_num; ++t) { count += thread_state[t].state.count; } + + if (omp_thread_num == omp_num_threads - 1) + { + f = (sa_sint_t)(count + thread_state[omp_thread_num].state.count); + } + + libsais16_renumber_unique_and_nonunique_lms_suffixes_32s(T, SA, m, (sa_sint_t)count, omp_block_start, omp_block_size); + } + } +#endif + } + + return f; +} + +static void libsais16_compact_unique_and_nonunique_lms_suffixes_32s_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs, sa_sint_t f, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 131072 && m < fs) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(thread_state); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (((fast_sint_t)n >> 1) / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : ((fast_sint_t)n >> 1) - omp_block_start; + + if (omp_num_threads == 1) + { + fast_sint_t l = m, r = (fast_sint_t)n + (fast_sint_t)fs; + libsais16_compact_unique_and_nonunique_lms_suffixes_32s(SA, m, &l, &r, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + thread_state[omp_thread_num].state.position = (fast_sint_t)m + ((fast_sint_t)n >> 1) + omp_block_start + omp_block_size; + thread_state[omp_thread_num].state.count = (fast_sint_t)m + omp_block_start + omp_block_size; + + libsais16_compact_unique_and_nonunique_lms_suffixes_32s(SA, m, &thread_state[omp_thread_num].state.position, &thread_state[omp_thread_num].state.count, omp_block_start, omp_block_size); + } + + #pragma omp barrier + + #pragma omp master + { + fast_sint_t t, position; + + for (position = m, t = omp_num_threads - 1; t >= 0; --t) + { + fast_sint_t omp_block_end = t < omp_num_threads - 1 ? omp_block_stride * (t + 1) : ((fast_sint_t)n >> 1); + fast_sint_t count = ((fast_sint_t)m + ((fast_sint_t)n >> 1) + omp_block_end - thread_state[t].state.position); + + if (count > 0) + { + position -= count; memcpy(&SA[position], &SA[thread_state[t].state.position], (size_t)count * sizeof(sa_sint_t)); + } + } + + for (position = (fast_sint_t)n + (fast_sint_t)fs, t = omp_num_threads - 1; t >= 0; --t) + { + fast_sint_t omp_block_end = t < omp_num_threads - 1 ? omp_block_stride * (t + 1) : ((fast_sint_t)n >> 1); + fast_sint_t count = ((fast_sint_t)m + omp_block_end - thread_state[t].state.count); + + if (count > 0) + { + position -= count; memcpy(&SA[position], &SA[thread_state[t].state.count], (size_t)count * sizeof(sa_sint_t)); + } + } + } + } +#endif + } + + memcpy(&SA[(fast_sint_t)n + (fast_sint_t)fs - (fast_sint_t)m], &SA[(fast_sint_t)m - (fast_sint_t)f], (size_t)f * sizeof(sa_sint_t)); +} + +static sa_sint_t libsais16_compact_lms_suffixes_32s_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + sa_sint_t f = libsais16_renumber_unique_and_nonunique_lms_suffixes_32s_omp(T, SA, m, threads, thread_state); + libsais16_compact_unique_and_nonunique_lms_suffixes_32s_omp(SA, n, m, fs, f, threads, thread_state); + + return f; +} + +static void libsais16_merge_unique_lms_suffixes_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, fast_sint_t l, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + const sa_sint_t * RESTRICT SAnm = &SA[(fast_sint_t)n - (fast_sint_t)m - 1 + l]; + + sa_sint_t i, j; fast_sint_t tmp = *SAnm++; + for (i = (sa_sint_t)omp_block_start, j = (sa_sint_t)omp_block_start + (sa_sint_t)omp_block_size - 6; i < j; i += 4) + { + libsais16_prefetch(&T[i + prefetch_distance]); + + sa_sint_t c0 = T[i + 0]; if (c0 < 0) { T[i + 0] = c0 & SAINT_MAX; SA[tmp] = i + 0; i++; tmp = *SAnm++; } + sa_sint_t c1 = T[i + 1]; if (c1 < 0) { T[i + 1] = c1 & SAINT_MAX; SA[tmp] = i + 1; i++; tmp = *SAnm++; } + sa_sint_t c2 = T[i + 2]; if (c2 < 0) { T[i + 2] = c2 & SAINT_MAX; SA[tmp] = i + 2; i++; tmp = *SAnm++; } + sa_sint_t c3 = T[i + 3]; if (c3 < 0) { T[i + 3] = c3 & SAINT_MAX; SA[tmp] = i + 3; i++; tmp = *SAnm++; } + } + + for (j += 6; i < j; i += 1) + { + sa_sint_t c = T[i]; if (c < 0) { T[i] = c & SAINT_MAX; SA[tmp] = i; i++; tmp = *SAnm++; } + } +} + +static void libsais16_merge_nonunique_lms_suffixes_32s(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, fast_sint_t l, fast_sint_t omp_block_start, fast_sint_t 
omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + const sa_sint_t * RESTRICT SAnm = &SA[(fast_sint_t)n - (fast_sint_t)m - 1 + l]; + + fast_sint_t i, j; sa_sint_t tmp = *SAnm++; + for (i = omp_block_start, j = omp_block_start + omp_block_size - 3; i < j; i += 4) + { + libsais16_prefetch(&SA[i + prefetch_distance]); + + if (SA[i + 0] == 0) { SA[i + 0] = tmp; tmp = *SAnm++; } + if (SA[i + 1] == 0) { SA[i + 1] = tmp; tmp = *SAnm++; } + if (SA[i + 2] == 0) { SA[i + 2] = tmp; tmp = *SAnm++; } + if (SA[i + 3] == 0) { SA[i + 3] = tmp; tmp = *SAnm++; } + } + + for (j += 3; i < j; i += 1) + { + if (SA[i] == 0) { SA[i] = tmp; tmp = *SAnm++; } + } +} + +static void libsais16_merge_unique_lms_suffixes_32s_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(thread_state); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start; + + if (omp_num_threads == 1) + { + libsais16_merge_unique_lms_suffixes_32s(T, SA, n, m, 0, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + thread_state[omp_thread_num].state.count = libsais16_count_negative_marked_suffixes(T, omp_block_start, omp_block_size); + } + + #pragma omp barrier + + { + fast_sint_t t, count = 0; for (t = 0; t < omp_thread_num; ++t) { count += thread_state[t].state.count; } + + libsais16_merge_unique_lms_suffixes_32s(T, SA, n, m, count, omp_block_start, omp_block_size); + } + } +#endif + } +} + +static void libsais16_merge_nonunique_lms_suffixes_32s_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t f, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && m >= 65536) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); UNUSED(thread_state); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (m / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : m - omp_block_start; + + if (omp_num_threads == 1) + { + libsais16_merge_nonunique_lms_suffixes_32s(SA, n, m, f, omp_block_start, omp_block_size); + } +#if defined(_OPENMP) + else + { + { + thread_state[omp_thread_num].state.count = libsais16_count_zero_marked_suffixes(SA, omp_block_start, omp_block_size); + } + + #pragma omp barrier + + { + fast_sint_t t, count = f; for (t = 0; t < omp_thread_num; ++t) { count += thread_state[t].state.count; } + + libsais16_merge_nonunique_lms_suffixes_32s(SA, n, m, count, omp_block_start, omp_block_size); + } + } +#endif + } +} + +static void libsais16_merge_compacted_lms_suffixes_32s_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t f, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + libsais16_merge_unique_lms_suffixes_32s_omp(T, SA, n, m, threads, thread_state); + libsais16_merge_nonunique_lms_suffixes_32s_omp(SA, n, m, f, threads, thread_state); +} + +static void libsais16_reconstruct_compacted_lms_suffixes_32s_2k_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m, sa_sint_t fs, sa_sint_t f, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + if (f > 0) + { + memmove(&SA[n - m - 1], &SA[n + fs - m], (size_t)f * sizeof(sa_sint_t)); + + libsais16_count_and_gather_compacted_lms_suffixes_32s_2k_omp(T, SA, n, k, buckets, threads, thread_state); + libsais16_reconstruct_lms_suffixes_omp(SA, n, m - f, threads); + + memcpy(&SA[n - m - 1 + f], &SA[0], ((size_t)m - (size_t)f) * sizeof(sa_sint_t)); + memset(&SA[0], 0, (size_t)m * sizeof(sa_sint_t)); + + libsais16_merge_compacted_lms_suffixes_32s_omp(T, SA, n, m, f, threads, thread_state); + } + else + { + libsais16_count_and_gather_lms_suffixes_32s_2k(T, SA, n, k, buckets, 0, n); + libsais16_reconstruct_lms_suffixes_omp(SA, n, m, threads); + } +} + +static void libsais16_reconstruct_compacted_lms_suffixes_32s_1k_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs, sa_sint_t f, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + if (f > 0) + { + memmove(&SA[n - m - 1], &SA[n + fs - m], (size_t)f * sizeof(sa_sint_t)); + + libsais16_gather_compacted_lms_suffixes_32s(T, SA, n); + libsais16_reconstruct_lms_suffixes_omp(SA, n, m - f, threads); + + memcpy(&SA[n - m - 1 + f], &SA[0], ((size_t)m - (size_t)f) * sizeof(sa_sint_t)); + memset(&SA[0], 0, (size_t)m * sizeof(sa_sint_t)); + + libsais16_merge_compacted_lms_suffixes_32s_omp(T, SA, n, m, f, threads, thread_state); + } + else + { + libsais16_gather_lms_suffixes_32s(T, SA, n); + libsais16_reconstruct_lms_suffixes_omp(SA, n, m, threads); + } +} + +static sa_sint_t libsais16_main_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t fs, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + if (k > 0 && fs / k >= 6) + { + sa_sint_t alignment = (fs - 1024) / k >= 6 ? 1024 : 16; + sa_sint_t * RESTRICT buckets = (fs - alignment) / k >= 6 ? 
(sa_sint_t *)libsais16_align_up(&SA[n + fs - 6 * k - alignment], (size_t)alignment * sizeof(sa_sint_t)) : &SA[n + fs - 6 * k]; + + sa_sint_t m = libsais16_count_and_gather_lms_suffixes_32s_4k_omp(T, SA, n, k, buckets, threads, thread_state); + if (m > 1) + { + memset(SA, 0, ((size_t)n - (size_t)m) * sizeof(sa_sint_t)); + + sa_sint_t first_lms_suffix = SA[n - m]; + sa_sint_t left_suffixes_count = libsais16_initialize_buckets_for_lms_suffixes_radix_sort_32s_6k(T, k, buckets, first_lms_suffix); + + libsais16_radix_sort_lms_suffixes_32s_6k_omp(T, SA, n, m, &buckets[4 * k], threads, thread_state); + libsais16_radix_sort_set_markers_32s_6k_omp(SA, k, &buckets[4 * k], threads); + + if (threads > 1 && n >= 65536) { memset(&SA[(fast_sint_t)n - (fast_sint_t)m], 0, (size_t)m * sizeof(sa_sint_t)); } + + libsais16_initialize_buckets_for_partial_sorting_32s_6k(T, k, buckets, first_lms_suffix, left_suffixes_count); + libsais16_induce_partial_order_32s_6k_omp(T, SA, n, k, buckets, first_lms_suffix, left_suffixes_count, threads, thread_state); + + sa_sint_t names = libsais16_renumber_and_mark_distinct_lms_suffixes_32s_4k_omp(SA, n, m, threads, thread_state); + if (names < m) + { + sa_sint_t f = libsais16_compact_lms_suffixes_32s_omp(T, SA, n, m, fs, threads, thread_state); + + if (libsais16_main_32s(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f, threads, thread_state) != 0) + { + return -2; + } + + libsais16_reconstruct_compacted_lms_suffixes_32s_2k_omp(T, SA, n, k, m, fs, f, buckets, threads, thread_state); + } + else + { + libsais16_count_lms_suffixes_32s_2k(T, n, k, buckets); + } + + libsais16_initialize_buckets_start_and_end_32s_4k(k, buckets); + libsais16_place_lms_suffixes_histogram_32s_4k(SA, n, k, m, buckets); + libsais16_induce_final_order_32s_4k(T, SA, n, k, buckets, threads, thread_state); + } + else + { + SA[0] = SA[n - 1]; + + libsais16_initialize_buckets_start_and_end_32s_6k(k, buckets); + libsais16_place_lms_suffixes_histogram_32s_6k(SA, n, k, m, buckets); + libsais16_induce_final_order_32s_6k(T, SA, n, k, buckets, threads, thread_state); + } + + return 0; + } + else if (k > 0 && fs / k >= 4) + { + sa_sint_t alignment = (fs - 1024) / k >= 4 ? 1024 : 16; + sa_sint_t * RESTRICT buckets = (fs - alignment) / k >= 4 ? 
(sa_sint_t *)libsais16_align_up(&SA[n + fs - 4 * k - alignment], (size_t)alignment * sizeof(sa_sint_t)) : &SA[n + fs - 4 * k]; + + sa_sint_t m = libsais16_count_and_gather_lms_suffixes_32s_2k_omp(T, SA, n, k, buckets, threads, thread_state); + if (m > 1) + { + libsais16_initialize_buckets_for_radix_and_partial_sorting_32s_4k(T, k, buckets, SA[n - m]); + + libsais16_radix_sort_lms_suffixes_32s_2k_omp(T, SA, n, m, &buckets[1], threads, thread_state); + libsais16_radix_sort_set_markers_32s_4k_omp(SA, k, &buckets[1], threads); + + libsais16_place_lms_suffixes_interval_32s_4k(SA, n, k, m - 1, buckets); + libsais16_induce_partial_order_32s_4k_omp(T, SA, n, k, buckets, threads, thread_state); + + sa_sint_t names = libsais16_renumber_and_mark_distinct_lms_suffixes_32s_4k_omp(SA, n, m, threads, thread_state); + if (names < m) + { + sa_sint_t f = libsais16_compact_lms_suffixes_32s_omp(T, SA, n, m, fs, threads, thread_state); + + if (libsais16_main_32s(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f, threads, thread_state) != 0) + { + return -2; + } + + libsais16_reconstruct_compacted_lms_suffixes_32s_2k_omp(T, SA, n, k, m, fs, f, buckets, threads, thread_state); + } + else + { + libsais16_count_lms_suffixes_32s_2k(T, n, k, buckets); + } + } + else + { + SA[0] = SA[n - 1]; + } + + libsais16_initialize_buckets_start_and_end_32s_4k(k, buckets); + libsais16_place_lms_suffixes_histogram_32s_4k(SA, n, k, m, buckets); + libsais16_induce_final_order_32s_4k(T, SA, n, k, buckets, threads, thread_state); + + return 0; + } + else if (k > 0 && fs / k >= 2) + { + sa_sint_t alignment = (fs - 1024) / k >= 2 ? 1024 : 16; + sa_sint_t * RESTRICT buckets = (fs - alignment) / k >= 2 ? (sa_sint_t *)libsais16_align_up(&SA[n + fs - 2 * k - alignment], (size_t)alignment * sizeof(sa_sint_t)) : &SA[n + fs - 2 * k]; + + sa_sint_t m = libsais16_count_and_gather_lms_suffixes_32s_2k_omp(T, SA, n, k, buckets, threads, thread_state); + if (m > 1) + { + libsais16_initialize_buckets_for_lms_suffixes_radix_sort_32s_2k(T, k, buckets, SA[n - m]); + + libsais16_radix_sort_lms_suffixes_32s_2k_omp(T, SA, n, m, &buckets[1], threads, thread_state); + libsais16_place_lms_suffixes_interval_32s_2k(SA, n, k, m - 1, buckets); + + libsais16_initialize_buckets_start_and_end_32s_2k(k, buckets); + libsais16_induce_partial_order_32s_2k_omp(T, SA, n, k, buckets, threads, thread_state); + + sa_sint_t names = libsais16_renumber_and_mark_distinct_lms_suffixes_32s_1k_omp(T, SA, n, m, threads); + if (names < m) + { + sa_sint_t f = libsais16_compact_lms_suffixes_32s_omp(T, SA, n, m, fs, threads, thread_state); + + if (libsais16_main_32s(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f, threads, thread_state) != 0) + { + return -2; + } + + libsais16_reconstruct_compacted_lms_suffixes_32s_2k_omp(T, SA, n, k, m, fs, f, buckets, threads, thread_state); + } + else + { + libsais16_count_lms_suffixes_32s_2k(T, n, k, buckets); + } + } + else + { + SA[0] = SA[n - 1]; + } + + libsais16_initialize_buckets_end_32s_2k(k, buckets); + libsais16_place_lms_suffixes_histogram_32s_2k(SA, n, k, m, buckets); + + libsais16_initialize_buckets_start_and_end_32s_2k(k, buckets); + libsais16_induce_final_order_32s_2k(T, SA, n, k, buckets, threads, thread_state); + + return 0; + } + else + { + sa_sint_t * buffer = fs < k ? (sa_sint_t *)libsais16_alloc_aligned((size_t)k * sizeof(sa_sint_t), 4096) : (sa_sint_t *)NULL; + + sa_sint_t alignment = fs - 1024 >= k ? 1024 : 16; + sa_sint_t * RESTRICT buckets = fs - alignment >= k ? 
(sa_sint_t *)libsais16_align_up(&SA[n + fs - k - alignment], (size_t)alignment * sizeof(sa_sint_t)) : fs >= k ? &SA[n + fs - k] : buffer; + + if (buckets == NULL) { return -2; } + + memset(SA, 0, (size_t)n * sizeof(sa_sint_t)); + + libsais16_count_suffixes_32s(T, n, k, buckets); + libsais16_initialize_buckets_end_32s_1k(k, buckets); + + sa_sint_t m = libsais16_radix_sort_lms_suffixes_32s_1k(T, SA, n, buckets); + if (m > 1) + { + libsais16_induce_partial_order_32s_1k_omp(T, SA, n, k, buckets, threads, thread_state); + + sa_sint_t names = libsais16_renumber_and_mark_distinct_lms_suffixes_32s_1k_omp(T, SA, n, m, threads); + if (names < m) + { + if (buffer != NULL) { libsais16_free_aligned(buffer); buckets = NULL; } + + sa_sint_t f = libsais16_compact_lms_suffixes_32s_omp(T, SA, n, m, fs, threads, thread_state); + + if (libsais16_main_32s(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f, threads, thread_state) != 0) + { + return -2; + } + + libsais16_reconstruct_compacted_lms_suffixes_32s_1k_omp(T, SA, n, m, fs, f, threads, thread_state); + + if (buckets == NULL) { buckets = buffer = (sa_sint_t *)libsais16_alloc_aligned((size_t)k * sizeof(sa_sint_t), 4096); } + if (buckets == NULL) { return -2; } + } + + libsais16_count_suffixes_32s(T, n, k, buckets); + libsais16_initialize_buckets_end_32s_1k(k, buckets); + libsais16_place_lms_suffixes_interval_32s_1k(T, SA, k, m, buckets); + } + + libsais16_induce_final_order_32s_1k(T, SA, n, k, buckets, threads, thread_state); + libsais16_free_aligned(buffer); + + return 0; + } +} + +static sa_sint_t libsais16_main_16u(const uint16_t * T, sa_sint_t * SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t bwt, sa_sint_t r, sa_sint_t * RESTRICT I, sa_sint_t fs, sa_sint_t * freq, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + sa_sint_t m = libsais16_count_and_gather_lms_suffixes_16u_omp(T, SA, n, buckets, threads, thread_state); + + libsais16_initialize_buckets_start_and_end_16u(buckets, freq); + + if (m > 0) + { + sa_sint_t first_lms_suffix = SA[n - m]; + sa_sint_t left_suffixes_count = libsais16_initialize_buckets_for_lms_suffixes_radix_sort_16u(T, buckets, first_lms_suffix); + + if (threads > 1 && n >= 65536) { memset(SA, 0, ((size_t)n - (size_t)m) * sizeof(sa_sint_t)); } + libsais16_radix_sort_lms_suffixes_16u_omp(T, SA, n, m, buckets, threads, thread_state); + if (threads > 1 && n >= 65536) { memset(&SA[(fast_sint_t)n - (fast_sint_t)m], 0, (size_t)m * sizeof(sa_sint_t)); } + + libsais16_initialize_buckets_for_partial_sorting_16u(T, buckets, first_lms_suffix, left_suffixes_count); + libsais16_induce_partial_order_16u_omp(T, SA, n, buckets, first_lms_suffix, left_suffixes_count, threads, thread_state); + + sa_sint_t names = libsais16_renumber_and_gather_lms_suffixes_16u_omp(SA, n, m, fs, threads, thread_state); + if (names < m) + { + if (libsais16_main_32s(SA + n + fs - m, SA, m, names, fs + n - 2 * m, threads, thread_state) != 0) + { + return -2; + } + + libsais16_gather_lms_suffixes_16u_omp(T, SA, n, threads, thread_state); + libsais16_reconstruct_lms_suffixes_omp(SA, n, m, threads); + } + + libsais16_place_lms_suffixes_interval_16u(SA, n, m, buckets); + } + else + { + memset(SA, 0, (size_t)n * sizeof(sa_sint_t)); + } + + return libsais16_induce_final_order_16u_omp(T, SA, n, bwt, r, I, buckets, threads, thread_state); +} + +static sa_sint_t libsais16_main(const uint16_t * T, sa_sint_t * SA, sa_sint_t n, sa_sint_t bwt, sa_sint_t r, sa_sint_t * I, sa_sint_t fs, sa_sint_t * freq, sa_sint_t threads) +{ + 
LIBSAIS_THREAD_STATE * RESTRICT thread_state = threads > 1 ? libsais16_alloc_thread_state(threads) : NULL; + sa_sint_t * RESTRICT buckets = (sa_sint_t *)libsais16_alloc_aligned(8 * ALPHABET_SIZE * sizeof(sa_sint_t), 4096); + + sa_sint_t index = buckets != NULL && (thread_state != NULL || threads == 1) + ? libsais16_main_16u(T, SA, n, buckets, bwt, r, I, fs, freq, threads, thread_state) + : -2; + + libsais16_free_aligned(buckets); + libsais16_free_thread_state(thread_state); + + return index; +} + +static sa_sint_t libsais16_main_ctx(const LIBSAIS_CONTEXT * ctx, const uint16_t * T, sa_sint_t * SA, sa_sint_t n, sa_sint_t bwt, sa_sint_t r, sa_sint_t * I, sa_sint_t fs, sa_sint_t * freq) +{ + return ctx != NULL && (ctx->buckets != NULL && (ctx->thread_state != NULL || ctx->threads == 1)) + ? libsais16_main_16u(T, SA, n, ctx->buckets, bwt, r, I, fs, freq, (sa_sint_t)ctx->threads, ctx->thread_state) + : -2; +} + +static void libsais16_bwt_copy_16u(uint16_t * RESTRICT U, sa_sint_t * RESTRICT A, sa_sint_t n) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = 0, j = (fast_sint_t)n - 7; i < j; i += 8) + { + libsais16_prefetch(&A[i + prefetch_distance]); + + U[i + 0] = (uint16_t)A[i + 0]; + U[i + 1] = (uint16_t)A[i + 1]; + U[i + 2] = (uint16_t)A[i + 2]; + U[i + 3] = (uint16_t)A[i + 3]; + U[i + 4] = (uint16_t)A[i + 4]; + U[i + 5] = (uint16_t)A[i + 5]; + U[i + 6] = (uint16_t)A[i + 6]; + U[i + 7] = (uint16_t)A[i + 7]; + } + + for (j += 7; i < j; i += 1) + { + U[i] = (uint16_t)A[i]; + } +} + +#if defined(_OPENMP) + +static void libsais16_bwt_copy_16u_omp(uint16_t * RESTRICT U, sa_sint_t * RESTRICT A, sa_sint_t n, sa_sint_t threads) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); + fast_sint_t omp_block_stride = ((fast_sint_t)n / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : (fast_sint_t)n - omp_block_start; +#else + UNUSED(threads); + + fast_sint_t omp_block_start = 0; + fast_sint_t omp_block_size = (fast_sint_t)n; +#endif + + libsais16_bwt_copy_16u(U + omp_block_start, A + omp_block_start, (sa_sint_t)omp_block_size); + } +} + +#endif + +void * libsais16_create_ctx(void) +{ + return (void *)libsais16_create_ctx_main(1); +} + +void libsais16_free_ctx(void * ctx) +{ + libsais16_free_ctx_main((LIBSAIS_CONTEXT *)ctx); +} + +int32_t libsais16(const uint16_t * T, int32_t * SA, int32_t n, int32_t fs, int32_t * freq) +{ + if ((T == NULL) || (SA == NULL) || (n < 0) || (fs < 0)) + { + return -1; + } + else if (n < 2) + { + if (n == 1) { SA[0] = 0; } + return 0; + } + + return libsais16_main(T, SA, n, 0, 0, NULL, fs, freq, 1); +} + +int32_t libsais16_ctx(const void * ctx, const uint16_t * T, int32_t * SA, int32_t n, int32_t fs, int32_t * freq) +{ + if ((ctx == NULL) || (T == NULL) || (SA == NULL) || (n < 0) || (fs < 0)) + { + return -1; + } + else if (n < 2) + { + if (n == 1) { SA[0] = 0; } + return 0; + } + + return libsais16_main_ctx((const LIBSAIS_CONTEXT *)ctx, T, SA, n, 0, 0, NULL, fs, freq); +} + +int32_t libsais16_bwt(const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq) +{ + if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0)) + { + return -1; + } + else if (n <= 1) + { + if (n == 1) { U[0] = T[0]; } + return n; + } + + sa_sint_t index = libsais16_main(T, A, n, 1, 0, NULL, fs, freq, 1); + if (index >= 0) + { + index++; + + U[0] = T[n - 1]; + libsais16_bwt_copy_16u(U + 1, A, index - 1); + libsais16_bwt_copy_16u(U + index, A + index, n - index); + } + + return index; +} + +int32_t libsais16_bwt_aux(const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t r, int32_t * I) +{ + if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0) || (r < 2) || ((r & (r - 1)) != 0) || (I == NULL)) + { + return -1; + } + else if (n <= 1) + { + if (n == 1) { U[0] = T[0]; } + + I[0] = n; + return 0; + } + + if (libsais16_main(T, A, n, 1, r, I, fs, freq, 1) != 0) + { + return -2; + } + + U[0] = T[n - 1]; + libsais16_bwt_copy_16u(U + 1, A, I[0] - 1); + libsais16_bwt_copy_16u(U + I[0], A + I[0], n - I[0]); + + return 0; +} + +int32_t libsais16_bwt_ctx(const void * ctx, const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq) +{ + if ((ctx == NULL) || (T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0)) + { + return -1; + } + else if (n <= 1) + { + if (n == 1) { U[0] = T[0]; } + return n; + } + + sa_sint_t index = libsais16_main_ctx((const LIBSAIS_CONTEXT *)ctx, T, A, n, 1, 0, NULL, fs, freq); + if (index >= 0) + { + index++; + + U[0] = T[n - 1]; + +#if defined(_OPENMP) + libsais16_bwt_copy_16u_omp(U + 1, A, index - 1, (sa_sint_t)((const LIBSAIS_CONTEXT *)ctx)->threads); + libsais16_bwt_copy_16u_omp(U + index, A + index, n - index, (sa_sint_t)((const LIBSAIS_CONTEXT *)ctx)->threads); +#else + libsais16_bwt_copy_16u(U + 1, A, index - 1); + libsais16_bwt_copy_16u(U + index, A + index, n - index); +#endif + } + + return index; +} + +int32_t libsais16_bwt_aux_ctx(const void * ctx, const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t r, int32_t * I) +{ + if ((ctx == NULL) || (T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0) || (r < 2) || ((r & (r - 1)) != 0) || (I == NULL)) + { + return -1; + } + else if (n <= 1) + { + if (n == 1) { U[0] = T[0]; } + + I[0] = n; + return 0; + } + + 
if (libsais16_main_ctx((const LIBSAIS_CONTEXT *)ctx, T, A, n, 1, r, I, fs, freq) != 0) + { + return -2; + } + + U[0] = T[n - 1]; + +#if defined(_OPENMP) + libsais16_bwt_copy_16u_omp(U + 1, A, I[0] - 1, (sa_sint_t)((const LIBSAIS_CONTEXT *)ctx)->threads); + libsais16_bwt_copy_16u_omp(U + I[0], A + I[0], n - I[0], (sa_sint_t)((const LIBSAIS_CONTEXT *)ctx)->threads); +#else + libsais16_bwt_copy_16u(U + 1, A, I[0] - 1); + libsais16_bwt_copy_16u(U + I[0], A + I[0], n - I[0]); +#endif + + return 0; +} + +#if defined(_OPENMP) + +void * libsais16_create_ctx_omp(int32_t threads) +{ + if (threads < 0) { return NULL; } + + threads = threads > 0 ? threads : omp_get_max_threads(); + return (void *)libsais16_create_ctx_main(threads); +} + +int32_t libsais16_omp(const uint16_t * T, int32_t * SA, int32_t n, int32_t fs, int32_t * freq, int32_t threads) +{ + if ((T == NULL) || (SA == NULL) || (n < 0) || (fs < 0) || (threads < 0)) + { + return -1; + } + else if (n < 2) + { + if (n == 1) { SA[0] = 0; } + return 0; + } + + threads = threads > 0 ? threads : omp_get_max_threads(); + + return libsais16_main(T, SA, n, 0, 0, NULL, fs, freq, threads); +} + +int32_t libsais16_bwt_omp(const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t threads) +{ + if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0) || (threads < 0)) + { + return -1; + } + else if (n <= 1) + { + if (n == 1) { U[0] = T[0]; } + return n; + } + + threads = threads > 0 ? threads : omp_get_max_threads(); + + sa_sint_t index = libsais16_main(T, A, n, 1, 0, NULL, fs, freq, threads); + if (index >= 0) + { + index++; + + U[0] = T[n - 1]; + libsais16_bwt_copy_16u_omp(U + 1, A, index - 1, threads); + libsais16_bwt_copy_16u_omp(U + index, A + index, n - index, threads); + } + + return index; +} + +int32_t libsais16_bwt_aux_omp(const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t r, int32_t * I, int32_t threads) +{ + if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0) || (r < 2) || ((r & (r - 1)) != 0) || (I == NULL) || (threads < 0)) + { + return -1; + } + else if (n <= 1) + { + if (n == 1) { U[0] = T[0];} + + I[0] = n; + return 0; + } + + threads = threads > 0 ? threads : omp_get_max_threads(); + + if (libsais16_main(T, A, n, 1, r, I, fs, freq, threads) != 0) + { + return -2; + } + + U[0] = T[n - 1]; + libsais16_bwt_copy_16u_omp(U + 1, A, I[0] - 1, threads); + libsais16_bwt_copy_16u_omp(U + I[0], A + I[0], n - I[0], threads); + + return 0; +} + +#endif + +static LIBSAIS_UNBWT_CONTEXT * libsais16_unbwt_create_ctx_main(sa_sint_t threads) +{ + LIBSAIS_UNBWT_CONTEXT * RESTRICT ctx = (LIBSAIS_UNBWT_CONTEXT *)libsais16_alloc_aligned(sizeof(LIBSAIS_UNBWT_CONTEXT), 64); + sa_uint_t * RESTRICT bucket2 = (sa_uint_t *)libsais16_alloc_aligned(ALPHABET_SIZE * sizeof(sa_uint_t), 4096); + uint16_t * RESTRICT fastbits = (uint16_t *)libsais16_alloc_aligned((1 + (1 << UNBWT_FASTBITS)) * sizeof(uint16_t), 4096); + sa_uint_t * RESTRICT buckets = threads > 1 ? 
(sa_uint_t *)libsais16_alloc_aligned((size_t)threads * ALPHABET_SIZE * sizeof(sa_uint_t), 4096) : NULL; + + if (ctx != NULL && bucket2 != NULL && fastbits != NULL && (buckets != NULL || threads == 1)) + { + ctx->bucket2 = bucket2; + ctx->fastbits = fastbits; + ctx->buckets = buckets; + ctx->threads = threads; + + return ctx; + } + + libsais16_free_aligned(buckets); + libsais16_free_aligned(fastbits); + libsais16_free_aligned(bucket2); + libsais16_free_aligned(ctx); + + return NULL; +} + +static void libsais16_unbwt_free_ctx_main(LIBSAIS_UNBWT_CONTEXT * ctx) +{ + if (ctx != NULL) + { + libsais16_free_aligned(ctx->buckets); + libsais16_free_aligned(ctx->fastbits); + libsais16_free_aligned(ctx->bucket2); + libsais16_free_aligned(ctx); + } +} + +static void libsais16_unbwt_compute_histogram(const uint16_t * RESTRICT T, fast_sint_t n, sa_uint_t * RESTRICT count) +{ + fast_sint_t i; for (i = 0; i < n; i += 1) { count[T[i]]++; } +} + +static void libsais16_unbwt_calculate_fastbits(sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift) +{ + fast_uint_t v, w, sum; + for (v = 0, sum = 1, w = 0; w < ALPHABET_SIZE; ++w) + { + fast_uint_t prev = sum; sum += bucket2[w]; bucket2[w] = (sa_uint_t)prev; + if (prev != sum) + { + for (; v <= ((sum - 1) >> shift); ++v) { fastbits[v] = (uint16_t)w; } + } + } +} + +static void libsais16_unbwt_calculate_P(const uint16_t * RESTRICT T, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, fast_uint_t index, fast_sint_t omp_block_start, fast_sint_t omp_block_end) +{ + { + fast_sint_t i = omp_block_start, j = (fast_sint_t)index; if (omp_block_end < j) { j = omp_block_end; } + for (; i < j; ++i) { fast_uint_t c = T[i]; P[bucket2[c]++] = (sa_uint_t)i; } + } + + { + fast_sint_t i = (fast_sint_t)index, j = omp_block_end; if (omp_block_start > i) { i = omp_block_start; } + for (T -= 1, i += 1; i <= j; ++i) { fast_uint_t c = T[i]; P[bucket2[c]++] = (sa_uint_t)i; } + } +} + +static void libsais16_unbwt_init_single(const uint16_t * RESTRICT T, sa_uint_t * RESTRICT P, sa_sint_t n, const sa_sint_t * freq, const sa_uint_t * RESTRICT I, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits) +{ + fast_uint_t index = I[0]; + fast_uint_t shift = 0; while ((n >> shift) > (1 << UNBWT_FASTBITS)) { shift++; } + + if (freq != NULL) + { + memcpy(bucket2, freq, ALPHABET_SIZE * sizeof(sa_uint_t)); + } + else + { + memset(bucket2, 0, ALPHABET_SIZE * sizeof(sa_uint_t)); + libsais16_unbwt_compute_histogram(T, n, bucket2); + } + + libsais16_unbwt_calculate_fastbits(bucket2, fastbits, shift); + libsais16_unbwt_calculate_P(T, P, bucket2, index, 0, n); +} + +#if defined(_OPENMP) + +static void libsais16_unbwt_init_parallel(const uint16_t * RESTRICT T, sa_uint_t * RESTRICT P, sa_sint_t n, const sa_sint_t * freq, const sa_uint_t * RESTRICT I, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, sa_uint_t * RESTRICT buckets, sa_sint_t threads) +{ + fast_uint_t index = I[0]; + fast_uint_t shift = 0; while ((n >> shift) > (1 << UNBWT_FASTBITS)) { shift++; } + + #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536) + { + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); + + if (omp_num_threads == 1) + { + libsais16_unbwt_init_single(T, P, n, freq, I, bucket2, fastbits); + } + else + { + { + sa_uint_t * RESTRICT bucket2_local = buckets + omp_thread_num * ALPHABET_SIZE; + fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * 
omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start; + + memset(bucket2_local, 0, ALPHABET_SIZE * sizeof(sa_uint_t)); + libsais16_unbwt_compute_histogram(T + omp_block_start, omp_block_size, bucket2_local); + } + + #pragma omp barrier + + { + sa_uint_t * RESTRICT bucket2_temp = buckets; + fast_sint_t omp_block_stride = (ALPHABET_SIZE / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : ALPHABET_SIZE - omp_block_start; + + memset(bucket2 + omp_block_start, 0, omp_block_size * sizeof(sa_uint_t)); + + fast_sint_t t; + for (t = 0; t < omp_num_threads; ++t, bucket2_temp += ALPHABET_SIZE) + { + fast_sint_t c; for (c = omp_block_start; c < omp_block_start + omp_block_size; c += 1) { sa_uint_t A = bucket2[c], B = bucket2_temp[c]; bucket2[c] = A + B; bucket2_temp[c] = A; } + } + } + + #pragma omp barrier + + #pragma omp master + { + libsais16_unbwt_calculate_fastbits(bucket2, fastbits, shift); + } + + #pragma omp barrier + + { + sa_uint_t * RESTRICT bucket2_local = buckets + omp_thread_num * ALPHABET_SIZE; + fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start; + + fast_sint_t c; for (c = 0; c < ALPHABET_SIZE; c += 1) { sa_uint_t A = bucket2[c], B = bucket2_local[c]; bucket2_local[c] = A + B; } + + libsais16_unbwt_calculate_P(T, P, bucket2_local, index, omp_block_start, omp_block_start + omp_block_size); + } + + #pragma omp barrier + + #pragma omp master + { + memcpy(bucket2, buckets + (omp_num_threads - 1) * ALPHABET_SIZE, ALPHABET_SIZE * sizeof(sa_uint_t)); + } + } + } +} + +#endif + +static void libsais16_unbwt_decode_1(uint16_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t * i0, fast_uint_t k) +{ + uint16_t * RESTRICT U0 = U; + + fast_uint_t i, p0 = *i0; + + for (i = 0; i != k; ++i) + { + uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while (bucket2[c0] <= p0); } p0 = P[p0]; U0[i] = c0; + } + + *i0 = p0; +} + +static void libsais16_unbwt_decode_2(uint16_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0, fast_uint_t * i1, fast_uint_t k) +{ + uint16_t * RESTRICT U0 = U; + uint16_t * RESTRICT U1 = U0 + r; + + fast_uint_t i, p0 = *i0, p1 = *i1; + + for (i = 0; i != k; ++i) + { + uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while (bucket2[c0] <= p0); } p0 = P[p0]; U0[i] = c0; + uint16_t c1 = fastbits[p1 >> shift]; if (bucket2[c1] <= p1) { do { c1++; } while (bucket2[c1] <= p1); } p1 = P[p1]; U1[i] = c1; + } + + *i0 = p0; *i1 = p1; +} + +static void libsais16_unbwt_decode_3(uint16_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0, fast_uint_t * i1, fast_uint_t * i2, fast_uint_t k) +{ + uint16_t * RESTRICT U0 = U; + uint16_t * RESTRICT U1 = U0 + r; + uint16_t * RESTRICT U2 = U1 + r; + + fast_uint_t i, p0 = *i0, p1 = *i1, p2 = *i2; + + for (i = 0; i != k; ++i) + { + uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while (bucket2[c0] <= p0); } p0 = P[p0]; 
U0[i] = c0; + uint16_t c1 = fastbits[p1 >> shift]; if (bucket2[c1] <= p1) { do { c1++; } while (bucket2[c1] <= p1); } p1 = P[p1]; U1[i] = c1; + uint16_t c2 = fastbits[p2 >> shift]; if (bucket2[c2] <= p2) { do { c2++; } while (bucket2[c2] <= p2); } p2 = P[p2]; U2[i] = c2; + } + + *i0 = p0; *i1 = p1; *i2 = p2; +} + +static void libsais16_unbwt_decode_4(uint16_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0, fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3, fast_uint_t k) +{ + uint16_t * RESTRICT U0 = U; + uint16_t * RESTRICT U1 = U0 + r; + uint16_t * RESTRICT U2 = U1 + r; + uint16_t * RESTRICT U3 = U2 + r; + + fast_uint_t i, p0 = *i0, p1 = *i1, p2 = *i2, p3 = *i3; + + for (i = 0; i != k; ++i) + { + uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while (bucket2[c0] <= p0); } p0 = P[p0]; U0[i] = c0; + uint16_t c1 = fastbits[p1 >> shift]; if (bucket2[c1] <= p1) { do { c1++; } while (bucket2[c1] <= p1); } p1 = P[p1]; U1[i] = c1; + uint16_t c2 = fastbits[p2 >> shift]; if (bucket2[c2] <= p2) { do { c2++; } while (bucket2[c2] <= p2); } p2 = P[p2]; U2[i] = c2; + uint16_t c3 = fastbits[p3 >> shift]; if (bucket2[c3] <= p3) { do { c3++; } while (bucket2[c3] <= p3); } p3 = P[p3]; U3[i] = c3; + } + + *i0 = p0; *i1 = p1; *i2 = p2; *i3 = p3; +} + +static void libsais16_unbwt_decode_5(uint16_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0, fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3, fast_uint_t * i4, fast_uint_t k) +{ + uint16_t * RESTRICT U0 = U; + uint16_t * RESTRICT U1 = U0 + r; + uint16_t * RESTRICT U2 = U1 + r; + uint16_t * RESTRICT U3 = U2 + r; + uint16_t * RESTRICT U4 = U3 + r; + + fast_uint_t i, p0 = *i0, p1 = *i1, p2 = *i2, p3 = *i3, p4 = *i4; + + for (i = 0; i != k; ++i) + { + uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while (bucket2[c0] <= p0); } p0 = P[p0]; U0[i] = c0; + uint16_t c1 = fastbits[p1 >> shift]; if (bucket2[c1] <= p1) { do { c1++; } while (bucket2[c1] <= p1); } p1 = P[p1]; U1[i] = c1; + uint16_t c2 = fastbits[p2 >> shift]; if (bucket2[c2] <= p2) { do { c2++; } while (bucket2[c2] <= p2); } p2 = P[p2]; U2[i] = c2; + uint16_t c3 = fastbits[p3 >> shift]; if (bucket2[c3] <= p3) { do { c3++; } while (bucket2[c3] <= p3); } p3 = P[p3]; U3[i] = c3; + uint16_t c4 = fastbits[p4 >> shift]; if (bucket2[c4] <= p4) { do { c4++; } while (bucket2[c4] <= p4); } p4 = P[p4]; U4[i] = c4; + } + + *i0 = p0; *i1 = p1; *i2 = p2; *i3 = p3; *i4 = p4; +} + +static void libsais16_unbwt_decode_6(uint16_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0, fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3, fast_uint_t * i4, fast_uint_t * i5, fast_uint_t k) +{ + uint16_t * RESTRICT U0 = U; + uint16_t * RESTRICT U1 = U0 + r; + uint16_t * RESTRICT U2 = U1 + r; + uint16_t * RESTRICT U3 = U2 + r; + uint16_t * RESTRICT U4 = U3 + r; + uint16_t * RESTRICT U5 = U4 + r; + + fast_uint_t i, p0 = *i0, p1 = *i1, p2 = *i2, p3 = *i3, p4 = *i4, p5 = *i5; + + for (i = 0; i != k; ++i) + { + uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while (bucket2[c0] <= p0); } p0 = P[p0]; U0[i] = c0; + uint16_t c1 = fastbits[p1 >> shift]; if (bucket2[c1] <= p1) { do { c1++; } while (bucket2[c1] <= p1); } p1 = P[p1]; U1[i] = c1; + uint16_t c2 = 
fastbits[p2 >> shift]; if (bucket2[c2] <= p2) { do { c2++; } while (bucket2[c2] <= p2); } p2 = P[p2]; U2[i] = c2; + uint16_t c3 = fastbits[p3 >> shift]; if (bucket2[c3] <= p3) { do { c3++; } while (bucket2[c3] <= p3); } p3 = P[p3]; U3[i] = c3; + uint16_t c4 = fastbits[p4 >> shift]; if (bucket2[c4] <= p4) { do { c4++; } while (bucket2[c4] <= p4); } p4 = P[p4]; U4[i] = c4; + uint16_t c5 = fastbits[p5 >> shift]; if (bucket2[c5] <= p5) { do { c5++; } while (bucket2[c5] <= p5); } p5 = P[p5]; U5[i] = c5; + } + + *i0 = p0; *i1 = p1; *i2 = p2; *i3 = p3; *i4 = p4; *i5 = p5; +} + +static void libsais16_unbwt_decode_7(uint16_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0, fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3, fast_uint_t * i4, fast_uint_t * i5, fast_uint_t * i6, fast_uint_t k) +{ + uint16_t * RESTRICT U0 = U; + uint16_t * RESTRICT U1 = U0 + r; + uint16_t * RESTRICT U2 = U1 + r; + uint16_t * RESTRICT U3 = U2 + r; + uint16_t * RESTRICT U4 = U3 + r; + uint16_t * RESTRICT U5 = U4 + r; + uint16_t * RESTRICT U6 = U5 + r; + + fast_uint_t i, p0 = *i0, p1 = *i1, p2 = *i2, p3 = *i3, p4 = *i4, p5 = *i5, p6 = *i6; + + for (i = 0; i != k; ++i) + { + uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while (bucket2[c0] <= p0); } p0 = P[p0]; U0[i] = c0; + uint16_t c1 = fastbits[p1 >> shift]; if (bucket2[c1] <= p1) { do { c1++; } while (bucket2[c1] <= p1); } p1 = P[p1]; U1[i] = c1; + uint16_t c2 = fastbits[p2 >> shift]; if (bucket2[c2] <= p2) { do { c2++; } while (bucket2[c2] <= p2); } p2 = P[p2]; U2[i] = c2; + uint16_t c3 = fastbits[p3 >> shift]; if (bucket2[c3] <= p3) { do { c3++; } while (bucket2[c3] <= p3); } p3 = P[p3]; U3[i] = c3; + uint16_t c4 = fastbits[p4 >> shift]; if (bucket2[c4] <= p4) { do { c4++; } while (bucket2[c4] <= p4); } p4 = P[p4]; U4[i] = c4; + uint16_t c5 = fastbits[p5 >> shift]; if (bucket2[c5] <= p5) { do { c5++; } while (bucket2[c5] <= p5); } p5 = P[p5]; U5[i] = c5; + uint16_t c6 = fastbits[p6 >> shift]; if (bucket2[c6] <= p6) { do { c6++; } while (bucket2[c6] <= p6); } p6 = P[p6]; U6[i] = c6; + } + + *i0 = p0; *i1 = p1; *i2 = p2; *i3 = p3; *i4 = p4; *i5 = p5; *i6 = p6; +} + +static void libsais16_unbwt_decode_8(uint16_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0, fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3, fast_uint_t * i4, fast_uint_t * i5, fast_uint_t * i6, fast_uint_t * i7, fast_uint_t k) +{ + uint16_t * RESTRICT U0 = U; + uint16_t * RESTRICT U1 = U0 + r; + uint16_t * RESTRICT U2 = U1 + r; + uint16_t * RESTRICT U3 = U2 + r; + uint16_t * RESTRICT U4 = U3 + r; + uint16_t * RESTRICT U5 = U4 + r; + uint16_t * RESTRICT U6 = U5 + r; + uint16_t * RESTRICT U7 = U6 + r; + + fast_uint_t i, p0 = *i0, p1 = *i1, p2 = *i2, p3 = *i3, p4 = *i4, p5 = *i5, p6 = *i6, p7 = *i7; + + for (i = 0; i != k; ++i) + { + uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while (bucket2[c0] <= p0); } p0 = P[p0]; U0[i] = c0; + uint16_t c1 = fastbits[p1 >> shift]; if (bucket2[c1] <= p1) { do { c1++; } while (bucket2[c1] <= p1); } p1 = P[p1]; U1[i] = c1; + uint16_t c2 = fastbits[p2 >> shift]; if (bucket2[c2] <= p2) { do { c2++; } while (bucket2[c2] <= p2); } p2 = P[p2]; U2[i] = c2; + uint16_t c3 = fastbits[p3 >> shift]; if (bucket2[c3] <= p3) { do { c3++; } while (bucket2[c3] <= p3); } p3 = P[p3]; U3[i] = c3; + uint16_t c4 = fastbits[p4 >> 
shift]; if (bucket2[c4] <= p4) { do { c4++; } while (bucket2[c4] <= p4); } p4 = P[p4]; U4[i] = c4; + uint16_t c5 = fastbits[p5 >> shift]; if (bucket2[c5] <= p5) { do { c5++; } while (bucket2[c5] <= p5); } p5 = P[p5]; U5[i] = c5; + uint16_t c6 = fastbits[p6 >> shift]; if (bucket2[c6] <= p6) { do { c6++; } while (bucket2[c6] <= p6); } p6 = P[p6]; U6[i] = c6; + uint16_t c7 = fastbits[p7 >> shift]; if (bucket2[c7] <= p7) { do { c7++; } while (bucket2[c7] <= p7); } p7 = P[p7]; U7[i] = c7; + } + + *i0 = p0; *i1 = p1; *i2 = p2; *i3 = p3; *i4 = p4; *i5 = p5; *i6 = p6; *i7 = p7; +} + +static void libsais16_unbwt_decode(uint16_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_sint_t n, sa_sint_t r, const sa_uint_t * RESTRICT I, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_sint_t blocks, fast_uint_t reminder) +{ + fast_uint_t shift = 0; while ((n >> shift) > (1 << UNBWT_FASTBITS)) { shift++; } + fast_uint_t offset = 0; + + while (blocks > 8) + { + fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3], i4 = I[4], i5 = I[5], i6 = I[6], i7 = I[7]; + libsais16_unbwt_decode_8(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5, &i6, &i7, (fast_uint_t)r); + I += 8; blocks -= 8; offset += 8 * (fast_uint_t)r; + } + + if (blocks == 1) + { + fast_uint_t i0 = I[0]; + libsais16_unbwt_decode_1(U + offset, P, bucket2, fastbits, shift, &i0, reminder); + } + else if (blocks == 2) + { + fast_uint_t i0 = I[0], i1 = I[1]; + libsais16_unbwt_decode_2(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, reminder); + libsais16_unbwt_decode_1(U + offset + reminder, P, bucket2, fastbits, shift, &i0, ((fast_uint_t)r) - reminder); + } + else if (blocks == 3) + { + fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2]; + libsais16_unbwt_decode_3(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, reminder); + libsais16_unbwt_decode_2(U + offset + reminder, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, ((fast_uint_t)r) - reminder); + } + else if (blocks == 4) + { + fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3]; + libsais16_unbwt_decode_4(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, reminder); + libsais16_unbwt_decode_3(U + offset + reminder, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, ((fast_uint_t)r) - reminder); + } + else if (blocks == 5) + { + fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3], i4 = I[4]; + libsais16_unbwt_decode_5(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, reminder); + libsais16_unbwt_decode_4(U + offset + reminder, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, ((fast_uint_t)r) - reminder); + } + else if (blocks == 6) + { + fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3], i4 = I[4], i5 = I[5]; + libsais16_unbwt_decode_6(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5, reminder); + libsais16_unbwt_decode_5(U + offset + reminder, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, ((fast_uint_t)r) - reminder); + } + else if (blocks == 7) + { + fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3], i4 = I[4], i5 = I[5], i6 = I[6]; + libsais16_unbwt_decode_7(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5, &i6, reminder); + libsais16_unbwt_decode_6(U + offset + reminder, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5, ((fast_uint_t)r) - reminder); + } + else + { + fast_uint_t i0 = I[0], 
i1 = I[1], i2 = I[2], i3 = I[3], i4 = I[4], i5 = I[5], i6 = I[6], i7 = I[7]; + libsais16_unbwt_decode_8(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5, &i6, &i7, reminder); + libsais16_unbwt_decode_7(U + offset + reminder, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5, &i6, ((fast_uint_t)r) - reminder); + } +} + +static void libsais16_unbwt_decode_omp(uint16_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_sint_t n, sa_sint_t r, const sa_uint_t * RESTRICT I, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, sa_sint_t threads) +{ + fast_sint_t blocks = 1 + (((fast_sint_t)n - 1) / (fast_sint_t)r); + fast_uint_t reminder = (fast_uint_t)n - ((fast_uint_t)r * ((fast_uint_t)blocks - 1)); + +#if defined(_OPENMP) + fast_sint_t max_threads = blocks < threads ? blocks : threads; + #pragma omp parallel num_threads(max_threads) if(max_threads > 1 && n >= 65536) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + + fast_sint_t omp_block_stride = blocks / omp_num_threads; + fast_sint_t omp_block_reminder = blocks % omp_num_threads; + fast_sint_t omp_block_size = omp_block_stride + (omp_thread_num < omp_block_reminder); + fast_sint_t omp_block_start = omp_block_stride * omp_thread_num + (omp_thread_num < omp_block_reminder ? omp_thread_num : omp_block_reminder); + + libsais16_unbwt_decode(U + r * omp_block_start, P, n, r, I + omp_block_start, bucket2, fastbits, omp_block_size, omp_thread_num < omp_num_threads - 1 ? (fast_uint_t)r : reminder); + } +} + +static sa_sint_t libsais16_unbwt_core(const uint16_t * RESTRICT T, uint16_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_sint_t n, const sa_sint_t * freq, sa_sint_t r, const sa_uint_t * RESTRICT I, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, sa_uint_t * RESTRICT buckets, sa_sint_t threads) +{ +#if defined(_OPENMP) + if (threads > 1 && n >= 262144) + { + libsais16_unbwt_init_parallel(T, P, n, freq, I, bucket2, fastbits, buckets, threads); + } + else +#else + UNUSED(buckets); +#endif + { + libsais16_unbwt_init_single(T, P, n, freq, I, bucket2, fastbits); + } + + libsais16_unbwt_decode_omp(U, P, n, r, I, bucket2, fastbits, threads); + return 0; +} + +static sa_sint_t libsais16_unbwt_main(const uint16_t * T, uint16_t * U, sa_uint_t * P, sa_sint_t n, const sa_sint_t * freq, sa_sint_t r, const sa_uint_t * I, sa_sint_t threads) +{ + fast_uint_t shift = 0; while ((n >> shift) > (1 << UNBWT_FASTBITS)) { shift++; } + + sa_uint_t * RESTRICT bucket2 = (sa_uint_t *)libsais16_alloc_aligned(ALPHABET_SIZE * sizeof(sa_uint_t), 4096); + uint16_t * RESTRICT fastbits = (uint16_t *)libsais16_alloc_aligned(((size_t)1 + (size_t)(n >> shift)) * sizeof(uint16_t), 4096); + sa_uint_t * RESTRICT buckets = threads > 1 && n >= 262144 ? (sa_uint_t *)libsais16_alloc_aligned((size_t)threads * ALPHABET_SIZE * sizeof(sa_uint_t), 4096) : NULL; + + sa_sint_t index = bucket2 != NULL && fastbits != NULL && (buckets != NULL || threads == 1 || n < 262144) + ? 
libsais16_unbwt_core(T, U, P, n, freq, r, I, bucket2, fastbits, buckets, threads) + : -2; + + libsais16_free_aligned(buckets); + libsais16_free_aligned(fastbits); + libsais16_free_aligned(bucket2); + + return index; +} + +static sa_sint_t libsais16_unbwt_main_ctx(const LIBSAIS_UNBWT_CONTEXT * ctx, const uint16_t * T, uint16_t * U, sa_uint_t * P, sa_sint_t n, const sa_sint_t * freq, sa_sint_t r, const sa_uint_t * I) +{ + return ctx != NULL && ctx->bucket2 != NULL && ctx->fastbits != NULL && (ctx->buckets != NULL || ctx->threads == 1) + ? libsais16_unbwt_core(T, U, P, n, freq, r, I, ctx->bucket2, ctx->fastbits, ctx->buckets, (sa_sint_t)ctx->threads) + : -2; +} + +void * libsais16_unbwt_create_ctx(void) +{ + return (void *)libsais16_unbwt_create_ctx_main(1); +} + +void libsais16_unbwt_free_ctx(void * ctx) +{ + libsais16_unbwt_free_ctx_main((LIBSAIS_UNBWT_CONTEXT *)ctx); +} + +int32_t libsais16_unbwt(const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t i) +{ + return libsais16_unbwt_aux(T, U, A, n, freq, n, &i); +} + +int32_t libsais16_unbwt_ctx(const void * ctx, const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t i) +{ + return libsais16_unbwt_aux_ctx(ctx, T, U, A, n, freq, n, &i); +} + +int32_t libsais16_unbwt_aux(const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t r, const int32_t * I) +{ + if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || ((r != n) && ((r < 2) || ((r & (r - 1)) != 0))) || (I == NULL)) + { + return -1; + } + else if (n <= 1) + { + if (I[0] != n) { return -1; } + if (n == 1) { U[0] = T[0]; } + return 0; + } + + fast_sint_t t; for (t = 0; t <= (n - 1) / r; ++t) { if (I[t] <= 0 || I[t] > n) { return -1; } } + + return libsais16_unbwt_main(T, U, (sa_uint_t *)A, n, freq, r, (const sa_uint_t *)I, 1); +} + +int32_t libsais16_unbwt_aux_ctx(const void * ctx, const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t r, const int32_t * I) +{ + if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || ((r != n) && ((r < 2) || ((r & (r - 1)) != 0))) || (I == NULL)) + { + return -1; + } + else if (n <= 1) + { + if (I[0] != n) { return -1; } + if (n == 1) { U[0] = T[0]; } + return 0; + } + + fast_sint_t t; for (t = 0; t <= (n - 1) / r; ++t) { if (I[t] <= 0 || I[t] > n) { return -1; } } + + return libsais16_unbwt_main_ctx((const LIBSAIS_UNBWT_CONTEXT *)ctx, T, U, (sa_uint_t *)A, n, freq, r, (const sa_uint_t *)I); +} + +#if defined(_OPENMP) + +void * libsais16_unbwt_create_ctx_omp(int32_t threads) +{ + if (threads < 0) { return NULL; } + + threads = threads > 0 ? threads : omp_get_max_threads(); + return (void *)libsais16_unbwt_create_ctx_main(threads); +} + +int32_t libsais16_unbwt_omp(const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t i, int32_t threads) +{ + return libsais16_unbwt_aux_omp(T, U, A, n, freq, n, &i, threads); +} + +int32_t libsais16_unbwt_aux_omp(const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t r, const int32_t * I, int32_t threads) +{ + if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || ((r != n) && ((r < 2) || ((r & (r - 1)) != 0))) || (I == NULL) || (threads < 0)) + { + return -1; + } + else if (n <= 1) + { + if (I[0] != n) { return -1; } + if (n == 1) { U[0] = T[0]; } + return 0; + } + + fast_sint_t t; for (t = 0; t <= (n - 1) / r; ++t) { if (I[t] <= 0 || I[t] > n) { return -1; } } + + threads = threads > 0 ? 
threads : omp_get_max_threads(); + return libsais16_unbwt_main(T, U, (sa_uint_t *)A, n, freq, r, (const sa_uint_t *)I, threads); +} + +#endif diff --git a/libsais/libsais16.h b/libsais/libsais16.h new file mode 100644 index 0000000..c577058 --- /dev/null +++ b/libsais/libsais16.h @@ -0,0 +1,285 @@ +/*-- + +This file is a part of libsais, a library for linear time +suffix array and burrows wheeler transform construction. + + Copyright (c) 2021 Ilya Grebnov + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + +Please see the file LICENSE for full copyright information. + +--*/ + +#ifndef LIBSAIS16_H +#define LIBSAIS16_H 1 + +#ifdef __cplusplus +extern "C" { +#endif + + #include + + /** + * Creates the libsais16 context that allows reusing allocated memory with each libsais16 operation. + * In multi-threaded environments, use one context per thread for parallel executions. + * @return the libsais16 context, NULL otherwise. + */ + void * libsais16_create_ctx(void); + +#if defined(_OPENMP) + /** + * Creates the libsais16 context that allows reusing allocated memory with each parallel libsais16 operation using OpenMP. + * In multi-threaded environments, use one context per thread for parallel executions. + * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). + * @return the libsais16 context, NULL otherwise. + */ + void * libsais16_create_ctx_omp(int32_t threads); +#endif + + /** + * Destroys the libsass context and free previusly allocated memory. + * @param ctx The libsais16 context (can be NULL). + */ + void libsais16_free_ctx(void * ctx); + + /** + * Constructs the suffix array of a given 16-bit string. + * @param T [0..n-1] The input 16-bit string. + * @param SA [0..n-1+fs] The output array of suffixes. + * @param n The length of the given 16-bit string. + * @param fs The extra space available at the end of SA array (can be 0). + * @param freq [0..65535] The output 16-bit symbol frequency table (can be NULL). + * @return 0 if no error occurred, -1 or -2 otherwise. + */ + int32_t libsais16(const uint16_t * T, int32_t * SA, int32_t n, int32_t fs, int32_t * freq); + + /** + * Constructs the suffix array of a given 16-bit string using libsais16 context. + * @param ctx The libsais16 context. + * @param T [0..n-1] The input 16-bit string. + * @param SA [0..n-1+fs] The output array of suffixes. + * @param n The length of the given 16-bit string. + * @param fs The extra space available at the end of SA array (can be 0). + * @param freq [0..65535] The output 16-bit symbol frequency table (can be NULL). + * @return 0 if no error occurred, -1 or -2 otherwise. + */ + int32_t libsais16_ctx(const void * ctx, const uint16_t * T, int32_t * SA, int32_t n, int32_t fs, int32_t * freq); + +#if defined(_OPENMP) + /** + * Constructs the suffix array of a given 16-bit string in parallel using OpenMP. + * @param T [0..n-1] The input 16-bit string. + * @param SA [0..n-1+fs] The output array of suffixes. + * @param n The length of the given 16-bit string. 
+ * @param fs The extra space available at the end of SA array (can be 0). + * @param freq [0..65535] The output 16-bit symbol frequency table (can be NULL). + * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). + * @return 0 if no error occurred, -1 or -2 otherwise. + */ + int32_t libsais16_omp(const uint16_t * T, int32_t * SA, int32_t n, int32_t fs, int32_t * freq, int32_t threads); +#endif + + /** + * Constructs the burrows-wheeler transformed 16-bit string of a given 16-bit string. + * @param T [0..n-1] The input 16-bit string. + * @param U [0..n-1] The output 16-bit string (can be T). + * @param A [0..n-1+fs] The temporary array. + * @param n The length of the given 16-bit string. + * @param fs The extra space available at the end of A array (can be 0). + * @param freq [0..65535] The output 16-bit symbol frequency table (can be NULL). + * @return The primary index if no error occurred, -1 or -2 otherwise. + */ + int32_t libsais16_bwt(const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq); + + /** + * Constructs the burrows-wheeler transformed 16-bit string of a given 16-bit string with auxiliary indexes. + * @param T [0..n-1] The input 16-bit string. + * @param U [0..n-1] The output 16-bit string (can be T). + * @param A [0..n-1+fs] The temporary array. + * @param n The length of the given 16-bit string. + * @param fs The extra space available at the end of A array (can be 0). + * @param freq [0..65535] The output 16-bit symbol frequency table (can be NULL). + * @param r The sampling rate for auxiliary indexes (must be power of 2). + * @param I [0..(n-1)/r] The output auxiliary indexes. + * @return 0 if no error occurred, -1 or -2 otherwise. + */ + int32_t libsais16_bwt_aux(const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t r, int32_t * I); + + /** + * Constructs the burrows-wheeler transformed 16-bit string of a given 16-bit string using libsais16 context. + * @param ctx The libsais16 context. + * @param T [0..n-1] The input 16-bit string. + * @param U [0..n-1] The output 16-bit string (can be T). + * @param A [0..n-1+fs] The temporary array. + * @param n The length of the given 16-bit string. + * @param fs The extra space available at the end of A array (can be 0). + * @param freq [0..65535] The output 16-bit symbol frequency table (can be NULL). + * @return The primary index if no error occurred, -1 or -2 otherwise. + */ + int32_t libsais16_bwt_ctx(const void * ctx, const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq); + + /** + * Constructs the burrows-wheeler transformed 16-bit string of a given 16-bit string with auxiliary indexes using libsais16 context. + * @param ctx The libsais16 context. + * @param T [0..n-1] The input 16-bit string. + * @param U [0..n-1] The output 16-bit string (can be T). + * @param A [0..n-1+fs] The temporary array. + * @param n The length of the given 16-bit string. + * @param fs The extra space available at the end of A array (can be 0). + * @param freq [0..65535] The output 16-bit symbol frequency table (can be NULL). + * @param r The sampling rate for auxiliary indexes (must be power of 2). + * @param I [0..(n-1)/r] The output auxiliary indexes. + * @return 0 if no error occurred, -1 or -2 otherwise. 
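+ *
+ * Editor's note (clarification, not part of the original header): with a
+ * sampling rate r, the I array holds 1 + (n - 1) / r auxiliary indexes, one
+ * per r-symbol block of the transformed string; for example, n = 1000000 and
+ * r = 131072 (a power of two) give 8 entries. The same r and I are later
+ * passed to libsais16_unbwt_aux (or libsais16_unbwt_aux_omp), which uses
+ * them to invert the blocks independently of each other.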
+ */ + int32_t libsais16_bwt_aux_ctx(const void * ctx, const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t r, int32_t * I); + +#if defined(_OPENMP) + /** + * Constructs the burrows-wheeler transformed 16-bit string of a given 16-bit string in parallel using OpenMP. + * @param T [0..n-1] The input 16-bit string. + * @param U [0..n-1] The output 16-bit string (can be T). + * @param A [0..n-1+fs] The temporary array. + * @param n The length of the given 16-bit string. + * @param fs The extra space available at the end of A array (can be 0). + * @param freq [0..65535] The output 16-bit symbol frequency table (can be NULL). + * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). + * @return The primary index if no error occurred, -1 or -2 otherwise. + */ + int32_t libsais16_bwt_omp(const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t threads); + + /** + * Constructs the burrows-wheeler transformed 16-bit string of a given 16-bit string with auxiliary indexes in parallel using OpenMP. + * @param T [0..n-1] The input 16-bit string. + * @param U [0..n-1] The output 16-bit string (can be T). + * @param A [0..n-1+fs] The temporary array. + * @param n The length of the given 16-bit string. + * @param fs The extra space available at the end of A array (can be 0). + * @param freq [0..65535] The output 16-bit symbol frequency table (can be NULL). + * @param r The sampling rate for auxiliary indexes (must be power of 2). + * @param I [0..(n-1)/r] The output auxiliary indexes. + * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). + * @return 0 if no error occurred, -1 or -2 otherwise. + */ + int32_t libsais16_bwt_aux_omp(const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t r, int32_t * I, int32_t threads); +#endif + + /** + * Creates the libsais16 reverse BWT context that allows reusing allocated memory with each libsais16_unbwt_* operation. + * In multi-threaded environments, use one context per thread for parallel executions. + * @return the libsais16 context, NULL otherwise. + */ + void * libsais16_unbwt_create_ctx(void); + +#if defined(_OPENMP) + /** + * Creates the libsais16 reverse BWT context that allows reusing allocated memory with each parallel libsais16_unbwt_* operation using OpenMP. + * In multi-threaded environments, use one context per thread for parallel executions. + * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). + * @return the libsais16 context, NULL otherwise. + */ + void * libsais16_unbwt_create_ctx_omp(int32_t threads); +#endif + + /** + * Destroys the libsass reverse BWT context and free previusly allocated memory. + * @param ctx The libsais16 context (can be NULL). + */ + void libsais16_unbwt_free_ctx(void * ctx); + + /** + * Constructs the original 16-bit string from a given burrows-wheeler transformed 16-bit string with primary index. + * @param T [0..n-1] The input 16-bit string. + * @param U [0..n-1] The output 16-bit string (can be T). + * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1 size). + * @param n The length of the given 16-bit string. + * @param freq [0..65535] The input 16-bit symbol frequency table (can be NULL). + * @param i The primary index. + * @return 0 if no error occurred, -1 or -2 otherwise. 
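+ *
+ * Example (editor's sketch, not part of the original header): a minimal BWT
+ * round trip built only from the declarations above. The buffers T, U and A
+ * are placeholders owned by the caller, n > 0 is assumed, and error handling
+ * is reduced to checking the returned primary index.
+ *
+ *   int32_t * A = (int32_t *)malloc(((size_t)n + 1) * sizeof(int32_t)); // n + 1 entries so A can also serve the inverse
+ *   int32_t idx = libsais16_bwt(T, U, A, n, 0, NULL);                   // forward BWT, returns the primary index
+ *   if (idx >= 0) { libsais16_unbwt(U, T, A, n, NULL, idx); }           // inverse BWT restores the original string
+ *   free(A);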
+ */ + int32_t libsais16_unbwt(const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t i); + + /** + * Constructs the original 16-bit string from a given burrows-wheeler transformed 16-bit string with primary index using libsais16 reverse BWT context. + * @param ctx The libsais16 reverse BWT context. + * @param T [0..n-1] The input 16-bit string. + * @param U [0..n-1] The output 16-bit string (can be T). + * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1 size). + * @param n The length of the given 16-bit string. + * @param freq [0..65535] The input 16-bit symbol frequency table (can be NULL). + * @param i The primary index. + * @return 0 if no error occurred, -1 or -2 otherwise. + */ + int32_t libsais16_unbwt_ctx(const void * ctx, const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t i); + + /** + * Constructs the original 16-bit string from a given burrows-wheeler transformed 16-bit string with auxiliary indexes. + * @param T [0..n-1] The input 16-bit string. + * @param U [0..n-1] The output 16-bit string (can be T). + * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1 size). + * @param n The length of the given 16-bit string. + * @param freq [0..65535] The input 16-bit symbol frequency table (can be NULL). + * @param r The sampling rate for auxiliary indexes (must be power of 2). + * @param I [0..(n-1)/r] The input auxiliary indexes. + * @return 0 if no error occurred, -1 or -2 otherwise. + */ + int32_t libsais16_unbwt_aux(const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t r, const int32_t * I); + + /** + * Constructs the original 16-bit string from a given burrows-wheeler transformed 16-bit string with auxiliary indexes using libsais16 reverse BWT context. + * @param ctx The libsais16 reverse BWT context. + * @param T [0..n-1] The input 16-bit string. + * @param U [0..n-1] The output 16-bit string (can be T). + * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1 size). + * @param n The length of the given 16-bit string. + * @param freq [0..65535] The input 16-bit symbol frequency table (can be NULL). + * @param r The sampling rate for auxiliary indexes (must be power of 2). + * @param I [0..(n-1)/r] The input auxiliary indexes. + * @return 0 if no error occurred, -1 or -2 otherwise. + */ + int32_t libsais16_unbwt_aux_ctx(const void * ctx, const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t r, const int32_t * I); + +#if defined(_OPENMP) + /** + * Constructs the original 16-bit string from a given burrows-wheeler transformed 16-bit string with primary index in parallel using OpenMP. + * @param T [0..n-1] The input 16-bit string. + * @param U [0..n-1] The output 16-bit string (can be T). + * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1 size). + * @param n The length of the given 16-bit string. + * @param freq [0..65535] The input 16-bit symbol frequency table (can be NULL). + * @param i The primary index. + * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). + * @return 0 if no error occurred, -1 or -2 otherwise. + */ + int32_t libsais16_unbwt_omp(const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t i, int32_t threads); + + /** + * Constructs the original 16-bit string from a given burrows-wheeler transformed 16-bit string with auxiliary indexes in parallel using OpenMP. 
+ * @param T [0..n-1] The input 16-bit string. + * @param U [0..n-1] The output 16-bit string (can be T). + * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1 size). + * @param n The length of the given 16-bit string. + * @param freq [0..65535] The input 16-bit symbol frequency table (can be NULL). + * @param r The sampling rate for auxiliary indexes (must be power of 2). + * @param I [0..(n-1)/r] The input auxiliary indexes. + * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). + * @return 0 if no error occurred, -1 or -2 otherwise. + */ + int32_t libsais16_unbwt_aux_omp(const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t r, const int32_t * I, int32_t threads); +#endif + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/libsais/libsais_internal.h b/libsais/libsais_internal.h new file mode 100644 index 0000000..d11a213 --- /dev/null +++ b/libsais/libsais_internal.h @@ -0,0 +1,49 @@ +/*-- + +This file is a part of libsais, a library for linear time +suffix array and burrows wheeler transform construction. + + Copyright (c) 2021 Ilya Grebnov + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + +Please see the file LICENSE for full copyright information. + +--*/ + +#ifndef LIBSAIS_INTERNAL_H +#define LIBSAIS_INTERNAL_H 1 + +#ifdef __cplusplus +extern "C" { +#endif + + #include + + /** + * Internal method to construct suffix array of an integer array. + * Note, during suffix array construction input array will be modified and restored at the end if no error occurred. + * @param T [0..n-1] The input integer array. + * @param SA [0..n-1+fs] The output array of suffixes. + * @param n The length of the integer array. + * @param k The alphabet size of the input integer array. + * @param fs Extra space available at the end of SA array (can be 0). + * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). + * @return 0 if no error occurred, -1 or -2 otherwise. + */ + int32_t libsais_main_32s_internal(int32_t * T, int32_t * SA, int32_t n, int32_t k, int32_t fs, int32_t threads); +#ifdef __cplusplus +} +#endif + +#endif diff --git a/m03_model.h b/m03_model.h new file mode 100644 index 0000000..0134b5b --- /dev/null +++ b/m03_model.h @@ -0,0 +1,408 @@ +/*-- + +This file is a part of bsc-m03 project. + + Copyright (c) 2021 Ilya Grebnov + + bsc-m03 is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + bsc-m03 is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with bsc-m03. If not, see . 
+ +--*/ + +#pragma once + +#include +#include +#include + +#include "common/platform.h" +#include "common/rangecoder.h" + +#include + +#pragma warning( push ) +#pragma warning( disable : 6385 ) +#pragma warning( disable : 6386 ) + +enum class m03_mode : int { encoding = 0, decoding = 1, }; + +class m03_model +{ +protected: + m03_mode mode; + + void initialize_model(RangeCoder * coder, m03_mode mode) + { + this->coder = coder; + this->mode = mode; + + for (int32_t s = 0; s < 1536; ++s) { T1_model[s][0] = T1_model[s][1] = 1; } + for (int32_t s = 0; s < 1536; ++s) { T2_model[s][0] = T2_model[s][1] = T2_model[s][2] = T2_model[s][3] = 1; } + for (int32_t s = 0; s < 768 ; ++s) { Ternary_model[s][0] = Ternary_model[s][1] = Ternary_model[s][2] = Ternary_model[s][3] = 1; } + for (int32_t s = 0; s < 96 ; ++s) { for (int32_t c = 0; c < 16; ++c) { Tree_model[s][c] = 1; } } + } + + void encode_root_frequencies(const int32_t * root_frequencies, int32_t k, int32_t n) + { + int64_t bit_freq[33]; + int64_t bit_freq_sum[33]; + + { + int64_t remaining_min = n, remaining_max = n, remaining_count = k; + + memset(bit_freq, 0, sizeof(bit_freq)); + + for (ptrdiff_t p = 0; p < k; ++p) + { + bit_freq[bit_scan_reverse(root_frequencies[p] + 1)]++; + } + + for (ptrdiff_t bit = 0; bit <= 32 && remaining_count > 0; ++bit) + { + int64_t min_value = (1ll << (bit + 0)) - 1; + int64_t max_value = (1ll << (bit + 1)) - 2; + int64_t min = std::max(remaining_count - (remaining_max / (max_value + 1)), 0ll); + int64_t max = remaining_count * max_value < remaining_min ? remaining_count - 1 : remaining_count; + + this->coder->EncodeValue((unsigned int)min, (unsigned int)bit_freq[bit], (unsigned int)max); + + remaining_min -= bit_freq[bit] * max_value; + remaining_max -= bit_freq[bit] * min_value; + remaining_count -= bit_freq[bit]; + } + } + + { + int64_t bit_sum = 0, remaining_min = 0, remaining_max = 0, remaining_total = n; + + for (ptrdiff_t bit = 32; bit >= 0; --bit) + { + int64_t min_value = (1ll << (bit + 0)) - 1; + int64_t max_value = (1ll << (bit + 1)) - 2; + + bit_freq_sum[bit] = bit_sum; bit_sum += bit_freq[bit]; + + remaining_min += min_value * bit_freq[bit]; + remaining_max += max_value * bit_freq[bit]; + } + + for (ptrdiff_t p = 0; p < k; ++p) + { + int32_t bit = bit_scan_reverse(root_frequencies[p] + 1); + + { + for (ptrdiff_t b = 0; b < bit; ++b) + { + if (bit_freq[b] > 0) + { + this->coder->Encode((unsigned int)bit_freq[b], (unsigned int)bit_freq_sum[b], (unsigned int)(bit_freq[b] + bit_freq_sum[b])); + } + + assert(bit_freq_sum[b] > 0); bit_freq_sum[b]--; + } + + if (bit_freq_sum[bit] > 0) + { + this->coder->Encode(0, (unsigned int)bit_freq[bit], (unsigned int)(bit_freq[bit] + bit_freq_sum[bit])); + } + + assert(bit_freq[bit] > 0); bit_freq[bit]--; + } + + { + int64_t min_value = (1ll << (bit + 0)) - 1; + int64_t max_value = (1ll << (bit + 1)) - 2; + remaining_min -= min_value; + remaining_max -= max_value; + int64_t min = std::max(min_value, remaining_total - remaining_max); + int64_t max = std::min(max_value, remaining_total - remaining_min); + + this->coder->EncodeValue((unsigned int)min, (unsigned int)root_frequencies[p], (unsigned int)max); + + remaining_total -= root_frequencies[p]; + } + } + } + } + + void decode_root_frequencies(int32_t * root_frequencies, int32_t k, int32_t n) + { + int64_t bit_freq[33]; + int64_t bit_freq_sum[33]; + + { + int64_t remaining_min = n, remaining_max = n, remaining_count = k; + + memset(bit_freq, 0, sizeof(bit_freq)); + + for (ptrdiff_t bit = 0; bit <= 32 && remaining_count > 
0; ++bit) + { + int64_t min_value = (1ll << (bit + 0)) - 1; + int64_t max_value = (1ll << (bit + 1)) - 2; + int64_t min = std::max(remaining_count - (remaining_max / (max_value + 1)), 0ll); + int64_t max = remaining_count * max_value < remaining_min ? remaining_count - 1 : remaining_count; + + bit_freq[bit] = this->coder->DecodeValue((unsigned int)min, (unsigned int)max); + + remaining_min -= bit_freq[bit] * max_value; + remaining_max -= bit_freq[bit] * min_value; + remaining_count -= bit_freq[bit]; + } + } + + { + int64_t bit_sum = 0, remaining_min = 0, remaining_max = 0, remaining_total = n; + + for (ptrdiff_t bit = 32; bit >= 0; --bit) + { + int64_t min_value = (1ll << (bit + 0)) - 1; + int64_t max_value = (1ll << (bit + 1)) - 2; + + bit_freq_sum[bit] = bit_sum; bit_sum += bit_freq[bit]; + + remaining_min += min_value * bit_freq[bit]; + remaining_max += max_value * bit_freq[bit]; + } + + for (ptrdiff_t p = 0; p < k; ++p) + { + int32_t bit = 0; + + while (bit_freq_sum[bit] > 0) + { + if (bit_freq[bit] > 0) + { + unsigned int cum_freq = this->coder->GetCumFreq((unsigned int)(bit_freq[bit] + bit_freq_sum[bit])); + if (cum_freq < bit_freq[bit]) + { + this->coder->Decode(0, (unsigned int)bit_freq[bit], (unsigned int)(bit_freq[bit] + bit_freq_sum[bit])); + break; + } + else + { + this->coder->Decode((unsigned int)bit_freq[bit], (unsigned int)bit_freq_sum[bit], (unsigned int)(bit_freq[bit] + bit_freq_sum[bit]));; + } + } + + bit_freq_sum[bit]--; bit++; + } + + assert(bit_freq[bit] > 0); bit_freq[bit]--; + + { + int64_t min_value = (1ll << (bit + 0)) - 1; + int64_t max_value = (1ll << (bit + 1)) - 2; + remaining_min -= min_value; + remaining_max -= max_value; + int64_t min = std::max(min_value, remaining_total - remaining_max); + int64_t max = std::min(max_value, remaining_total - remaining_min); + + root_frequencies[p] = this->coder->DecodeValue((unsigned int)min, (unsigned int)max); + + remaining_total -= root_frequencies[p]; + } + } + } + } + + int32_t predict(int32_t count, int32_t total, int32_t left_remaining, int32_t right_remaining, int32_t symbols_remaining) + { + int32_t inferred_right = std::max(total - left_remaining, 0); + right_remaining -= inferred_right; total -= inferred_right; + + assert(total <= right_remaining); + + if (total > 0) + { + if (total <= 2) + { + int32_t state = 0; + state += 1 * (std::min((int32_t)symbols_remaining - 2, 5)); + state += 8 * (std::min((int32_t)bit_scan_reverse(inferred_right + 1), 3)); + state += 32 * (left_remaining + right_remaining == symbols_remaining); + state += 64 * (left_remaining == total); + state += 128 * (((int64_t)left_remaining * 11) / ((int64_t)right_remaining)); + + if (total == 1) + { + static const int threshold[12] = { 147, 251, 374, 540, 761, 763, 1589, 2275, 2193, 3457, 3811, 1017 }; + + uint16_t * RESTRICT predictor = &this->T1_model[state][0]; + + if (predictor[0] + predictor[1] > threshold[state >> 7]) + { + predictor[0] = (predictor[0] + (predictor[0] < 2)) >> 1; + predictor[1] = (predictor[1] + (predictor[1] < 2)) >> 1; + } + + if (this->mode == m03_mode::encoding) + { + this->coder->Encode(count ? predictor[0] : 0, predictor[count], predictor[0] + predictor[1]); + } + else + { + unsigned int cum_freq = this->coder->GetCumFreq(predictor[0] + predictor[1]); + + count = cum_freq >= predictor[0]; + this->coder->Decode(count ? 
predictor[0] : 0, predictor[count], predictor[0] + predictor[1]); + } + + predictor[count]++; + } + else + { + static const int threshold[12] = { 149, 221, 255, 287, 292, 343, 494, 396, 655, 820, 2984, 225 }; + + uint16_t * RESTRICT predictor = &this->T2_model[state][0]; + + if (predictor[0] + predictor[1] + predictor[2] > threshold[state >> 7]) + { + predictor[0] = (predictor[0] + (predictor[0] < 2)) >> 1; + predictor[1] = (predictor[1] + (predictor[1] < 2)) >> 1; + predictor[2] = (predictor[2] + (predictor[2] < 2)) >> 1; + } + + if (this->mode == m03_mode::encoding) + { + unsigned int cum_freq = count == 0 ? 0 : count == 1 ? predictor[0] : predictor[0] + predictor[1]; + this->coder->Encode(cum_freq, predictor[count], predictor[0] + predictor[1] + predictor[2]); + } + else + { + unsigned int cum_freq = this->coder->GetCumFreq(predictor[0] + predictor[1] + predictor[2]); + + count = (cum_freq >= predictor[0]) + (cum_freq >= (unsigned int)(predictor[0] + predictor[1])); + cum_freq = count == 0 ? 0 : count == 1 ? predictor[0] : predictor[0] + predictor[1]; + + this->coder->Decode(cum_freq, predictor[count], predictor[0] + predictor[1] + predictor[2]); + } + + predictor[count]++; + } + } + else + { + int32_t pivot = (count > 0) + (count == total); + + { + static const int threshold[48] = + { + 142, 129, 115, 89 , 70 , 59 , 53 , 44, + 243, 167, 132, 105, 98 , 109, 107, 134, + 247, 200, 162, 134, 137, 149, 201, 262, + 339, 253, 184, 171, 235, 288, 299, 348, + 512, 396, 178, 357, 466, 484, 697, 587, + 220, 157, 144, 167, 219, 141, 228, 1076, + }; + + int32_t state = 0; + state += 1 * (std::min((int32_t)bit_scan_reverse(symbols_remaining - 1), 3)); + state += 4 * (inferred_right > 0); + state += 8 * (left_remaining == total); + state += 16 * (std::min((int32_t)bit_scan_reverse(total - 2), 7)); + state += 128 * (((int64_t)left_remaining * 9 + right_remaining) / ((int64_t)right_remaining * 2)); + + uint16_t * RESTRICT predictor = &this->Ternary_model[state][0]; + + if (predictor[0] + predictor[1] + predictor[2] > threshold[state >> 4]) + { + predictor[0] = (predictor[0] + (predictor[0] < 2)) >> 1; + predictor[1] = (predictor[1] + (predictor[1] < 2)) >> 1; + predictor[2] = (predictor[2] + (predictor[2] < 2)) >> 1; + } + + if (this->mode == m03_mode::encoding) + { + unsigned int cum_freq = pivot == 0 ? 0 : pivot == 1 ? predictor[0] : predictor[0] + predictor[1]; + this->coder->Encode(cum_freq, predictor[pivot], predictor[0] + predictor[1] + predictor[2]); + } + else + { + unsigned int cum_freq = this->coder->GetCumFreq(predictor[0] + predictor[1] + predictor[2]); + + pivot = (cum_freq >= predictor[0]) + (cum_freq >= (unsigned int)(predictor[0] + predictor[1])); + cum_freq = pivot == 0 ? 0 : pivot == 1 ? predictor[0] : predictor[0] + predictor[1]; + + this->coder->Decode(cum_freq, predictor[pivot], predictor[0] + predictor[1] + predictor[2]); + } + + predictor[pivot]++; if (pivot != 1) { count = pivot == 0 ? 
0 : total; } + } + + if (pivot == 1) + { + static const int threshold[48] = + { + 275 , 167 , 218 , 163, 200, 123, 143, 61, + 515 , 335 , 344 , 268, 320, 244, 235, 85, + 863 , 474 , 527 , 387, 401, 298, 263, 107, + 1920, 968 , 629 , 500, 554, 286, 358, 121, + 3655, 1157, 1021, 623, 591, 365, 317, 109, + 2922, 249 , 776 , 159, 537, 133, 253, 158, + }; + + int32_t state = 0; + state += 1 * (inferred_right >= total); + state += 2 * (std::min(total - 3, 7)); + state += 16 * (((int64_t)left_remaining * 5) / ((int64_t)right_remaining)); + + int32_t min = 1, max = total - 1, context = 1; + while (min != max && context < 8) + { + uint16_t * RESTRICT predictor = &this->Tree_model[state][2 * context]; + + if (predictor[0] + predictor[1] > threshold[state >> 1]) + { + predictor[0] = (predictor[0] + (predictor[0] < 2)) >> 1; + predictor[1] = (predictor[1] + (predictor[1] < 2)) >> 1; + } + + int32_t median = min + ((max - min + 1) >> 1), bit = count >= median; + + if (this->mode == m03_mode::encoding) + { + this->coder->Encode(bit ? predictor[0] : 0, predictor[bit], predictor[0] + predictor[1]); + } + else + { + unsigned int cum_freq = this->coder->GetCumFreq(predictor[0] + predictor[1]); + + bit = cum_freq >= predictor[0]; + this->coder->Decode(bit ? predictor[0] : 0, predictor[bit], predictor[0] + predictor[1]); + } + + predictor[bit]++; context += context + bit; min = bit ? median : min; max = bit ? max : median - 1; + } + + count = this->mode == m03_mode::encoding + ? this->coder->EncodeValue(min, count, max) + : this->coder->DecodeValue(min, max); + } + } + + return count; + } + + return 0; + } + +private: + RangeCoder * coder; + + uint16_t T1_model[1536][2]; + uint16_t T2_model[1536][4]; + uint16_t Ternary_model[768][4]; + uint16_t Tree_model[96][16]; +}; + +#pragma warning( pop ) \ No newline at end of file diff --git a/m03_parser.h b/m03_parser.h new file mode 100644 index 0000000..e296a13 --- /dev/null +++ b/m03_parser.h @@ -0,0 +1,709 @@ +/*-- + +This file is a part of bsc-m03 project. + + Copyright (c) 2021 Ilya Grebnov + + bsc-m03 is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + bsc-m03 is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with bsc-m03. If not, see . 
+ +--*/ + +#pragma once + +#include +#include +#include + +#include + +#include "common/platform.h" +#include "common/rangecoder.h" + +#include "hutucker/hu-tucker.h" + +#include "m03_model.h" + +#define OPTIMAL_ABT_SMALL_THRESHOLD (7) +#define OPTIMAL_ABT_LARGE_THRESHOLD (257) + +#pragma warning( push ) +#pragma warning( disable : 6385 ) +#pragma warning( disable : 6386 ) + +#pragma pack(push, 1) + +typedef struct symbol_context +{ + int32_t count; + int32_t offset; + uint16_t symbol; +} symbol_context; + +#pragma pack(pop) + +typedef struct offset_queue +{ + int32_t * offsets; + ptrdiff_t count; + ptrdiff_t size; + + bool initialize(ptrdiff_t size) + { + this->count = 0; + this->size = size; + this->offsets = (int32_t *)malloc(this->size * sizeof(int32_t)); + + return this->offsets != NULL; + } + + INLINE void push_offset(const int32_t offset) + { + if (this->count == this->size) + { + this->offsets = this->resize(); + } + + this->offsets[this->count++] = offset; + } + + INLINE void reset() { this->count = 0; } + + INLINE void sort() { std::stable_sort(this->offsets, this->offsets + this->count); } + + NOINLINE int32_t * resize() + { + return (int32_t *)realloc(this->offsets, (this->size += this->size) * sizeof(int32_t)); + } + + void destroy() + { + if (this->offsets != NULL) { free(this->offsets); this->offsets = NULL; } + } + +} offset_queue; + +class m03_parser: m03_model +{ +public: + + bool initialize(uint16_t * L, int32_t n, int32_t primary_index, int32_t * root_frequencies, int32_t k, RangeCoder * coder, m03_mode mode) + { + memset(this, 0, sizeof(m03_parser)); + + this->L = L; + this->n = n; + this->primary_index = primary_index; + this->root_frequencies = root_frequencies; + this->k = k; + + if ((this->contexts = (symbol_context *)malloc(n * sizeof(symbol_context))) == NULL) + { + this->destroy(); + return false; + } + + if ((this->hutucker_tmp = malloc(hutucker_tmp_size(MAX_ALPHABET_SIZE + 1))) == NULL) + { + this->destroy(); + return false; + } + + if (!current_segments.initialize(next_power_of_2(std::max(n / 4, 64)))) + { + this->destroy(); + return false; + } + + if (!next_segments.initialize(next_power_of_2(std::max(n / 4, 64)))) + { + this->destroy(); + return false; + } + + this->initialize_model(coder, mode); + this->initialize_alphabetic_tree_roots(); + + return true; + } + + void run() + { + if (this->mode == m03_mode::encoding) + { + this->encode_root_frequencies(this->root_frequencies, this->k, this->n - 1); + this->initialize_root_context(this->root_frequencies); + this->parse_contexts(); + + for (ptrdiff_t p = 0; p < n; ++p) + { + assert(p == this->primary_index || this->contexts[p].count == 1 ); + assert(p == this->primary_index || this->contexts[p].symbol == L[p]); + } + } + else + { + this->decode_root_frequencies(this->root_frequencies, this->k, this->n - 1); + this->initialize_root_context(this->root_frequencies); + this->parse_contexts(); + + for (ptrdiff_t p = 0; p < n; ++p) + { + L[p] = this->contexts[p].symbol; + } + } + } + + void destroy() + { + if (this->contexts != NULL) { free(this->contexts); this->contexts = NULL; } + if (this->hutucker_tmp != NULL) { free(this->hutucker_tmp); this->hutucker_tmp = NULL; } + + this->current_segments.destroy(); + this->next_segments.destroy(); + } + +private: + + uint16_t * L; + int32_t n; + int32_t primary_index; + int32_t * root_frequencies; + int32_t k; + + symbol_context * contexts; + offset_queue current_segments; + offset_queue next_segments; + void * hutucker_tmp; + + int32_t parent_frequencies 
[MAX_ALPHABET_SIZE + 1]; + int32_t left_frequencies [MAX_ALPHABET_SIZE + 1]; + symbol_context left_contexts [MAX_ALPHABET_SIZE + 1]; + + int32_t alphabetic_tree_keys[OPTIMAL_ABT_LARGE_THRESHOLD]; + int32_t alphabetic_tree_weight[OPTIMAL_ABT_LARGE_THRESHOLD]; + int64_t alphabetic_tree_cost[OPTIMAL_ABT_LARGE_THRESHOLD][OPTIMAL_ABT_LARGE_THRESHOLD]; + uint8_t alphabetic_tree_root[OPTIMAL_ABT_LARGE_THRESHOLD][OPTIMAL_ABT_LARGE_THRESHOLD]; + + void initialize_alphabetic_tree_roots() + { + for (int32_t l = 0; l < OPTIMAL_ABT_LARGE_THRESHOLD - 1; ++l) + { + this->alphabetic_tree_root[l][l + 1] = this->alphabetic_tree_root[l][l] = l; + } + } + + void initialize_root_context(const int32_t * root_frequencies) + { + int32_t unique_symbols = 0, total_symbols = 1; + + this->current_segments.push_offset(0); + + for (int32_t c = 0; c < this->k; ++c) + { + if (root_frequencies[c] > 0) + { + this->contexts[unique_symbols].count = root_frequencies[c]; + this->contexts[unique_symbols].offset = total_symbols; + this->contexts[unique_symbols].symbol = c; + + this->current_segments.push_offset(total_symbols); + + unique_symbols++; total_symbols += root_frequencies[c]; + } + } + + m03_parser::normalize_context(&this->contexts[0], unique_symbols, total_symbols); + } + + void parse_contexts() + { + while (this->current_segments.count > 0) + { + for (int32_t segment_start = 0; segment_start < this->current_segments.count;) + { + int32_t context_start = this->current_segments.offsets[segment_start]; + int32_t context_end = context_start + this->contexts[context_start].count; + int32_t segment_end = segment_start + 1; + + while (segment_end < this->current_segments.count && this->current_segments.offsets[segment_end] < context_end) + { + segment_end++; + } + + assert(context_end - context_start > 1); + assert(segment_end - segment_start > 1); + + if (this->is_trivial_context(context_start)) + { + m03_parser::split_trivial_context(this->contexts, this->next_segments, &this->current_segments.offsets[segment_start], &this->current_segments.offsets[segment_end]); + } + else + { + m03_parser::populate_context_frequencies(&this->contexts[context_start], &this->contexts[this->primary_index], &this->parent_frequencies[0]); + this->split_context_recursive(&this->current_segments.offsets[segment_start], &this->current_segments.offsets[segment_end]); + } + + segment_start = segment_end; + } + + this->next_segments.sort(); + this->current_segments.reset(); + + std::swap(this->current_segments, this->next_segments); + } + } + + void split_context_recursive(const int32_t * offsets, const int32_t * offsets_end) + { + assert(offsets_end - offsets > 0); + + if (offsets_end - offsets == 1) + { + m03_parser::populate_next_segments(&this->contexts[offsets[0]], &this->contexts[this->primary_index], &this->parent_frequencies[0], this->next_segments); + return; + } + + if (this->is_trivial_context(offsets[0])) + { + m03_parser::split_trivial_context(this->contexts, this->next_segments, offsets, offsets_end); + return; + } + + if (offsets_end - offsets >= OPTIMAL_ABT_SMALL_THRESHOLD && offsets_end - offsets <= OPTIMAL_ABT_LARGE_THRESHOLD) + { + this->build_optimal_alphabetic_tree(offsets, offsets_end); + this->traverse_alphabetic_tree(offsets, offsets_end, 0, (int32_t)(offsets_end - offsets) - 1); + return; + } + + const int32_t * offsets_pivot = (offsets_end - offsets) > 2 + ? 
this->choose_context_pivot_using_heuristic(offsets, offsets_end) + : &offsets[1]; + + this->split_context_by_pivot(offsets[0], offsets_pivot[0]); + this->split_context_recursive(offsets, offsets_pivot); + this->split_context_recursive(offsets_pivot, offsets_end); + } + + void traverse_alphabetic_tree(const int32_t * offsets, const int32_t * offsets_end, int32_t l, int32_t r) + { + assert(l <= r); + + if (l == r) + { + m03_parser::populate_next_segments(&this->contexts[offsets[l]], &this->contexts[this->primary_index], &this->parent_frequencies[0], this->next_segments); + return; + } + + if (this->is_trivial_context(offsets[l])) + { + m03_parser::split_trivial_context(this->contexts, this->next_segments, &offsets[l], &offsets[r + 1]); + return; + } + + int32_t offsets_pivot = this->alphabetic_tree_root[l][r]; + + this->split_context_by_pivot(offsets[l], offsets[offsets_pivot + 1]); + this->traverse_alphabetic_tree(offsets, offsets_end, l, offsets_pivot); + this->traverse_alphabetic_tree(offsets, offsets_end, offsets_pivot + 1, r); + } + + const int32_t * choose_context_pivot_using_heuristic(const int32_t * offsets, const int32_t * offsets_end) + { + assert(offsets_end - offsets > 2); + + int32_t context_begin = offsets[0]; + int32_t context_end = offsets[0] + this->contexts[offsets[0]].count; + size_t offsets_count = offsets_end - offsets; + + if (offsets_count == 3) + { + int64_t A = (int64_t)(offsets[1] ) - (int64_t)(context_begin); + int64_t C = (int64_t)(context_end) - (int64_t)(offsets[2]); + + return C <= A ? &offsets[1] : &offsets[2]; + } + else if (offsets_count == 4) + { + int64_t A = (int64_t)(offsets[1] ) - (int64_t)(context_begin); + int64_t B = (int64_t)(offsets[2] ) - (int64_t)(offsets[1]); + int64_t C = (int64_t)(offsets[3] ) - (int64_t)(offsets[2]); + int64_t D = (int64_t)(context_end) - (int64_t)(offsets[3]); + + const int32_t * offset1 = &offsets[1]; int64_t cost1 = pivot_cost3(B, C, D); + const int32_t * offset2 = &offsets[2]; int64_t cost2 = A + B + C + D; + const int32_t * offset3 = &offsets[3]; int64_t cost3 = pivot_cost3(A, B, C); + + if (cost2 <= cost1) { offset1 = offset2; cost1 = cost2; } + if (cost3 < cost1) { offset1 = offset3; } + + return offset1; + } + else if (offsets_count == 5) + { + int64_t A = (int64_t)(offsets[1] ) - (int64_t)(context_begin); + int64_t B = (int64_t)(offsets[2] ) - (int64_t)(offsets[1]); + int64_t C = (int64_t)(offsets[3] ) - (int64_t)(offsets[2]); + int64_t D = (int64_t)(offsets[4] ) - (int64_t)(offsets[3]); + int64_t E = (int64_t)(context_end) - (int64_t)(offsets[4]); + + const int32_t * offset1 = &offsets[1]; int64_t cost1 = pivot_cost4(B, C, D, E); + const int32_t * offset2 = &offsets[2]; int64_t cost2 = A + B + pivot_cost3(C, D, E); + const int32_t * offset3 = &offsets[3]; int64_t cost3 = pivot_cost3(A, B, C) + D + E; + const int32_t * offset4 = &offsets[4]; int64_t cost4 = pivot_cost4(A, B, C, D); + + if (cost2 <= cost1) { offset1 = offset2; cost1 = cost2; } + if (cost3 < cost1) { offset1 = offset3; cost1 = cost3; } + if (cost4 < cost1) { offset1 = offset4; } + + return offset1; + } + else if (offsets_count == 6) + { + int64_t A = (int64_t)(offsets[1] ) - (int64_t)(context_begin); + int64_t B = (int64_t)(offsets[2] ) - (int64_t)(offsets[1]); + int64_t C = (int64_t)(offsets[3] ) - (int64_t)(offsets[2]); + int64_t D = (int64_t)(offsets[4] ) - (int64_t)(offsets[3]); + int64_t E = (int64_t)(offsets[5] ) - (int64_t)(offsets[4]); + int64_t F = (int64_t)(context_end) - (int64_t)(offsets[5]); + + const int32_t * offset1 = &offsets[1]; 
int64_t cost1 = pivot_cost5(B, C, D, E, F); + const int32_t * offset2 = &offsets[2]; int64_t cost2 = A + B + pivot_cost4(C, D, E, F); + const int32_t * offset3 = &offsets[3]; int64_t cost3 = pivot_cost3(A, B, C) + pivot_cost3(D, E, F); + const int32_t * offset4 = &offsets[4]; int64_t cost4 = pivot_cost4(A, B, C, D) + E + F; + const int32_t * offset5 = &offsets[5]; int64_t cost5 = pivot_cost5(A, B, C, D, E); + + if (cost2 <= cost1) { offset1 = offset2; cost1 = cost2; } + if (cost3 <= cost1) { offset1 = offset3; cost1 = cost3; } + if (cost4 < cost1) { offset1 = offset4; cost1 = cost4; } + if (cost5 < cost1) { offset1 = offset5; } + + return offset1; + } + else + { + assert(offsets_count > OPTIMAL_ABT_LARGE_THRESHOLD); + + { + for (int32_t segment_end = context_end, offsets_index = (int32_t)offsets_count - 1; offsets_index >= 0; --offsets_index) + { + int32_t segment_start = offsets[offsets_index]; + + this->left_frequencies[offsets_index] = segment_end - segment_start; segment_end = segment_start; + } + + hutucker_get_lengths(offsets_count, (unsigned long *)this->left_frequencies, this->hutucker_tmp); + } + + { + uint8_t path[64] = { 0 }; + for (int32_t offsets_index = 0, length = 0; offsets_index < offsets_count; ++offsets_index) + { + for (; length < this->left_frequencies[offsets_index]; ++length) { path[length] = 0; } + + length = this->left_frequencies[offsets_index]; if (path[0] == 1) { return &offsets[offsets_index]; } + + for (int32_t k = length - 1; k >= 0; --k) { if (path[k] ^= 1) { break; } } + } + } + + return NULL; + } + } + + void build_optimal_alphabetic_tree(const int32_t * offsets, const int32_t * offsets_end) + { + ptrdiff_t offsets_count = (ptrdiff_t)(offsets_end - offsets); + + assert(offsets_count >= OPTIMAL_ABT_SMALL_THRESHOLD && offsets_count <= OPTIMAL_ABT_LARGE_THRESHOLD); + + this->alphabetic_tree_keys[offsets_count - 1] = offsets[0] + this->contexts[offsets[0]].count - offsets[offsets_count - 1]; + + for (ptrdiff_t offsets_index = offsets_count - 2; offsets_index >= 0; --offsets_index) + { + this->alphabetic_tree_keys[offsets_index] = offsets[offsets_index + 1] - offsets[offsets_index]; + this->alphabetic_tree_cost[offsets_index][offsets_index + 1] = this->alphabetic_tree_weight[offsets_index] = this->alphabetic_tree_keys[offsets_index] + this->alphabetic_tree_keys[offsets_index + 1]; + } + + for (ptrdiff_t length = 3; length <= offsets_count; ++length) + { + for (ptrdiff_t l = 0, r = length - 1; r < offsets_count; ++l, ++r) + { + uint8_t best_root = this->alphabetic_tree_root[l][r - 1]; + int64_t best_cost = this->alphabetic_tree_cost[l][best_root] + this->alphabetic_tree_cost[best_root + 1][r]; + + for (ptrdiff_t root = (ptrdiff_t)best_root + 1; root <= (ptrdiff_t)this->alphabetic_tree_root[l + 1][r]; ++root) + { + int64_t cost = this->alphabetic_tree_cost[l][root] + this->alphabetic_tree_cost[root + 1][r]; + if (cost < best_cost) { best_cost = cost; best_root = (uint8_t)root; } + } + + this->alphabetic_tree_weight[l] += this->alphabetic_tree_keys[r]; + this->alphabetic_tree_cost[l][r] = best_cost + this->alphabetic_tree_weight[l]; + this->alphabetic_tree_root[l][r] = best_root; + } + } + } + + void split_context_by_pivot(int32_t parent_context_offset, int32_t right_context_offset) + { + symbol_context * parent_context = &this->contexts[parent_context_offset]; + int32_t parent_interval_size = parent_context[0].count; + int32_t parent_unique_symbols = 1; + + symbol_context * left_context = &this->left_contexts[0]; + int32_t * left_frequencies = 
&this->left_frequencies[0]; + int32_t left_interval_size = right_context_offset - parent_context_offset; + int32_t left_unique_symbols = 0; + + int32_t right_interval_size = parent_interval_size - left_interval_size; + int32_t right_unique_symbols = 0; + + if (this->mode == m03_mode::encoding) + { + if (left_interval_size <= parent_interval_size - left_interval_size) + { + int32_t parent_total_symbols = parent_interval_size; + + parent_total_symbols -= ((uint32_t)(this->primary_index - parent_context_offset) < (uint32_t)parent_total_symbols); + + while (parent_total_symbols > 1 && parent_context[parent_unique_symbols].count > 0) + { + parent_total_symbols -= parent_context[parent_unique_symbols].count; + left_frequencies[parent_context[parent_unique_symbols].symbol] = 0; + parent_unique_symbols++; + } + + assert(parent_total_symbols > 0); parent_context[0].count = parent_total_symbols; + left_frequencies[parent_context[0].symbol] = 0; + + for (int32_t p = parent_context_offset; p < right_context_offset; ++p) { left_frequencies[L[p]]++; } + + left_frequencies[0] -= ((uint32_t)(this->primary_index - parent_context_offset) < (uint32_t)left_interval_size); + } + else + { + int32_t parent_total_symbols = parent_interval_size; + + parent_total_symbols -= ((uint32_t)(this->primary_index - parent_context_offset) < (uint32_t)parent_total_symbols); + + while (parent_total_symbols > 1 && parent_context[parent_unique_symbols].count > 0) + { + parent_total_symbols -= parent_context[parent_unique_symbols].count; + left_frequencies[parent_context[parent_unique_symbols].symbol] = parent_context[parent_unique_symbols].count; + parent_unique_symbols++; + } + + assert(parent_total_symbols > 0); parent_context[0].count = parent_total_symbols; + left_frequencies[parent_context[0].symbol] = parent_total_symbols; + + for (int32_t p = right_context_offset; p < parent_context_offset + parent_interval_size; ++p) { left_frequencies[L[p]]--; } + + left_frequencies[0] += ((uint32_t)(this->primary_index - right_context_offset) < (uint32_t)right_interval_size); + } + } + else + { + int32_t parent_total_symbols = parent_interval_size; + + parent_total_symbols -= ((uint32_t)(this->primary_index - parent_context_offset) < (uint32_t)parent_total_symbols); + + while (parent_total_symbols > 1 && parent_context[parent_unique_symbols].count > 0) + { + parent_total_symbols -= parent_context[parent_unique_symbols].count; + parent_unique_symbols++; + } + + assert(parent_total_symbols > 0); parent_context[0].count = parent_total_symbols; + } + + int32_t left_remaining = left_interval_size; + int32_t right_remaining = right_interval_size; + + left_remaining -= ((uint32_t)(this->primary_index - parent_context_offset) < (uint32_t)left_interval_size ); + right_remaining -= ((uint32_t)(this->primary_index - right_context_offset ) < (uint32_t)right_interval_size); + + for (int32_t parent_symbol_index = 0; parent_symbol_index < parent_unique_symbols; ++parent_symbol_index) + { + if (left_remaining > 0) + { + uint16_t symbol = parent_context[parent_symbol_index].symbol; + int32_t total = parent_context[parent_symbol_index].count; + int32_t count = left_frequencies[symbol]; + + if (total <= left_remaining + right_remaining - total) + { + count = left_remaining <= right_remaining + ? 
+    void split_context_by_pivot(int32_t parent_context_offset, int32_t right_context_offset)
+    {
+        symbol_context * parent_context = &this->contexts[parent_context_offset];
+        int32_t parent_interval_size = parent_context[0].count;
+        int32_t parent_unique_symbols = 1;
+
+        symbol_context * left_context = &this->left_contexts[0];
+        int32_t * left_frequencies = &this->left_frequencies[0];
+        int32_t left_interval_size = right_context_offset - parent_context_offset;
+        int32_t left_unique_symbols = 0;
+
+        int32_t right_interval_size = parent_interval_size - left_interval_size;
+        int32_t right_unique_symbols = 0;
+
+        if (this->mode == m03_mode::encoding)
+        {
+            if (left_interval_size <= parent_interval_size - left_interval_size)
+            {
+                int32_t parent_total_symbols = parent_interval_size;
+
+                parent_total_symbols -= ((uint32_t)(this->primary_index - parent_context_offset) < (uint32_t)parent_total_symbols);
+
+                while (parent_total_symbols > 1 && parent_context[parent_unique_symbols].count > 0)
+                {
+                    parent_total_symbols -= parent_context[parent_unique_symbols].count;
+                    left_frequencies[parent_context[parent_unique_symbols].symbol] = 0;
+                    parent_unique_symbols++;
+                }
+
+                assert(parent_total_symbols > 0); parent_context[0].count = parent_total_symbols;
+                left_frequencies[parent_context[0].symbol] = 0;
+
+                for (int32_t p = parent_context_offset; p < right_context_offset; ++p) { left_frequencies[L[p]]++; }
+
+                left_frequencies[0] -= ((uint32_t)(this->primary_index - parent_context_offset) < (uint32_t)left_interval_size);
+            }
+            else
+            {
+                int32_t parent_total_symbols = parent_interval_size;
+
+                parent_total_symbols -= ((uint32_t)(this->primary_index - parent_context_offset) < (uint32_t)parent_total_symbols);
+
+                while (parent_total_symbols > 1 && parent_context[parent_unique_symbols].count > 0)
+                {
+                    parent_total_symbols -= parent_context[parent_unique_symbols].count;
+                    left_frequencies[parent_context[parent_unique_symbols].symbol] = parent_context[parent_unique_symbols].count;
+                    parent_unique_symbols++;
+                }
+
+                assert(parent_total_symbols > 0); parent_context[0].count = parent_total_symbols;
+                left_frequencies[parent_context[0].symbol] = parent_total_symbols;
+
+                for (int32_t p = right_context_offset; p < parent_context_offset + parent_interval_size; ++p) { left_frequencies[L[p]]--; }
+
+                left_frequencies[0] += ((uint32_t)(this->primary_index - right_context_offset) < (uint32_t)right_interval_size);
+            }
+        }
+        else
+        {
+            int32_t parent_total_symbols = parent_interval_size;
+
+            parent_total_symbols -= ((uint32_t)(this->primary_index - parent_context_offset) < (uint32_t)parent_total_symbols);
+
+            while (parent_total_symbols > 1 && parent_context[parent_unique_symbols].count > 0)
+            {
+                parent_total_symbols -= parent_context[parent_unique_symbols].count;
+                parent_unique_symbols++;
+            }
+
+            assert(parent_total_symbols > 0); parent_context[0].count = parent_total_symbols;
+        }
+
+        int32_t left_remaining = left_interval_size;
+        int32_t right_remaining = right_interval_size;
+
+        left_remaining -= ((uint32_t)(this->primary_index - parent_context_offset) < (uint32_t)left_interval_size);
+        right_remaining -= ((uint32_t)(this->primary_index - right_context_offset) < (uint32_t)right_interval_size);
+
+        for (int32_t parent_symbol_index = 0; parent_symbol_index < parent_unique_symbols; ++parent_symbol_index)
+        {
+            if (left_remaining > 0)
+            {
+                uint16_t symbol = parent_context[parent_symbol_index].symbol;
+                int32_t total = parent_context[parent_symbol_index].count;
+                int32_t count = left_frequencies[symbol];
+
+                if (total <= left_remaining + right_remaining - total)
+                {
+                    count = left_remaining <= right_remaining
+                        ? this->predict(count, total, left_remaining, right_remaining, parent_unique_symbols - parent_symbol_index)
+                        : total - this->predict(total - count, total, right_remaining, left_remaining, parent_unique_symbols - parent_symbol_index);
+                }
+                else
+                {
+                    total = left_remaining + right_remaining - total;
+                    count = left_remaining - count;
+
+                    count = left_remaining <= right_remaining
+                        ? this->predict(count, total, left_remaining, right_remaining, parent_unique_symbols - parent_symbol_index)
+                        : total - this->predict(total - count, total, right_remaining, left_remaining, parent_unique_symbols - parent_symbol_index);
+
+                    count = left_remaining - count;
+                    total = left_remaining + right_remaining - total;
+                }
+
+                left_remaining = left_remaining - count;
+                right_remaining = right_remaining + count - total;
+
+                if (count > 0)
+                {
+                    left_context[left_unique_symbols].count = count;
+                    left_context[left_unique_symbols].offset = parent_context[parent_symbol_index].offset;
+                    left_context[left_unique_symbols].symbol = symbol;
+
+                    parent_context[parent_symbol_index].count -= count;
+                    parent_context[parent_symbol_index].offset += count;
+
+                    left_unique_symbols++;
+                }
+            }
+
+            if (parent_context[parent_symbol_index].count > 0)
+            {
+                parent_context[right_unique_symbols] = parent_context[parent_symbol_index];
+                right_unique_symbols++;
+            }
+        }
+
+        {
+            memmove(&this->contexts[right_context_offset], &parent_context[0], right_unique_symbols * sizeof(symbol_context));
+            m03_parser::normalize_context(&this->contexts[right_context_offset], right_unique_symbols, right_interval_size);
+
+            memcpy(&parent_context[0], &left_context[0], left_unique_symbols * sizeof(symbol_context));
+            m03_parser::normalize_context(&parent_context[0], left_unique_symbols, left_interval_size);
+        }
+    }
+
+    INLINE bool is_trivial_context(int32_t context_start)
+    {
+        return this->contexts[context_start + 1].count == 0 && ((uint32_t)(this->primary_index - context_start) >= (uint32_t)this->contexts[context_start].count);
+    }
+
+    static void split_trivial_context(symbol_context * contexts, offset_queue & queue, const int32_t * offsets, const int32_t * offsets_end)
+    {
+        int32_t context_start = *offsets++;
+        symbol_context parent_context = contexts[context_start];
+
+        for (; offsets < offsets_end;)
+        {
+            symbol_context * context = &contexts[context_start];
+            int32_t context_end = *offsets++;
+            int32_t context_size = context_end - context_start;
+
+            queue.push_offset(parent_context.offset);
+
+            context[0].count = context_size; parent_context.count -= context_size;
+            context[0].offset = parent_context.offset; parent_context.offset += context_size;
+            context[0].symbol = parent_context.symbol; if (context_size > 1) { context[1].count = 0; }
+
+            context_start = context_end;
+        }
+
+        queue.push_offset(parent_context.offset);
+
+        contexts[context_start] = parent_context; if (contexts[context_start].count > 1) { contexts[context_start + 1].count = 0; }
+    }
+
+    static void populate_context_frequencies(symbol_context * context, symbol_context * primary_index_context, int32_t * frequencies)
+    {
+        int32_t total_symbols = context[0].count;
+        int32_t unique_symbols = 1;
+
+        total_symbols -= ((uint32_t)(primary_index_context - context) < (uint32_t)total_symbols);
+
+        while (total_symbols > 1 && context[unique_symbols].count > 0)
+        {
+            frequencies[context[unique_symbols].symbol] = context[unique_symbols].count;
+            total_symbols -= context[unique_symbols].count; unique_symbols++;
+        }
+
+        assert(total_symbols > 0); frequencies[context[0].symbol] = total_symbols;
+    }
+
+    static void populate_next_segments(symbol_context * context, symbol_context * primary_index_context, int32_t * frequencies, offset_queue & queue)
+    {
+        int32_t total_symbols = context[0].count;
+        int32_t unique_symbols = 1;
+
+        total_symbols -= ((uint32_t)(primary_index_context - context) < (uint32_t)total_symbols);
+
+        while (total_symbols > 1 && context[unique_symbols].count > 0)
+        {
+            if (frequencies[context[unique_symbols].symbol] != context[unique_symbols].count)
+            {
+                queue.push_offset(context[unique_symbols].offset);
+            }
+
+            total_symbols -= context[unique_symbols].count; unique_symbols++;
+        }
+
+        if (total_symbols > 0 && frequencies[context[0].symbol] != total_symbols)
+        {
+            queue.push_offset(context[0].offset);
+        }
+    }
+
+    static void normalize_context(symbol_context * context, int32_t unique_symbols, int32_t total_symbols)
+    {
+        if (unique_symbols > 1)
+        {
+            for (int32_t i = 1; i < unique_symbols; ++i)
+            {
+                symbol_context temp = context[i];
+
+                int32_t j = i;
+                while (j > 0 && (context[j - 1].count < temp.count || (context[j - 1].count == temp.count && context[j - 1].symbol > temp.symbol)))
+                {
+                    context[j] = context[j - 1]; j--;
+                }
+
+                context[j] = temp;
+            }
+
+            {
+                symbol_context * contexts_start = &context[std::max(0, unique_symbols - 6)];
+                symbol_context * contexts_end = &context[unique_symbols - 1];
+
+                while (contexts_start < contexts_end) { std::swap(*contexts_start++, *contexts_end--); }
+            }
+        }
+
+        assert(total_symbols > 0); context[0].count = total_symbols; if (unique_symbols < total_symbols) { context[unique_symbols].count = 0; }
+    }
+
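+    // Estimated cost of recursively splitting 3..5 adjacent segments with binary pivots
+    // (used by the pivot selection above), where each split is charged the total size of
+    // the interval it divides. For example, pivot_cost3(A, B, C) = (A + B + C) + B + min(A, C)
+    // = (A + B + C) + min(A + B, B + C): the whole interval is charged once for the first
+    // split, plus the size of whichever remaining pair is cheaper to split again.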
+    INLINE static int64_t pivot_cost3(int64_t A, int64_t B, int64_t C)
+    {
+        return A + B + C + B + std::min(A, C);
+    }
+
+    INLINE static int64_t pivot_cost4(int64_t A, int64_t B, int64_t C, int64_t D)
+    {
+        return A + B + C + D + std::min(A + B + C + D, std::min(pivot_cost3(A, B, C), pivot_cost3(B, C, D)));
+    }
+
+    INLINE static int64_t pivot_cost5(int64_t A, int64_t B, int64_t C, int64_t D, int64_t E)
+    {
+        return A + B + C + D + E + std::min(std::min(pivot_cost4(B, C, D, E), A + B + pivot_cost3(C, D, E)), std::min(pivot_cost3(A, B, C) + D + E, pivot_cost4(A, B, C, D)));
+    }
+};
+
+#pragma warning( pop )
\ No newline at end of file