From 93177e3f87b8f76d7def5e5eab8a138ce80a84a3 Mon Sep 17 00:00:00 2001 From: lcdpuydt Date: Thu, 9 Feb 2023 15:34:18 +0100 Subject: [PATCH] New version: more memory-efficient etc. --- CMakeLists.txt | 17 +- README.md | 494 +++++---- libdivsufsort/include/config.h | 81 ++ libdivsufsort/include/divsufsort.h | 180 ++++ libdivsufsort/include/divsufsort64.h | 180 ++++ libdivsufsort/include/lfs.h | 56 + libdivsufsort/lib/libdivsufsort.so | 1 + libdivsufsort/lib/libdivsufsort.so.3 | 1 + libdivsufsort/lib/libdivsufsort.so.3.0.1 | Bin 0 -> 49984 bytes libdivsufsort/lib/libdivsufsort64.so | 1 + libdivsufsort/lib/libdivsufsort64.so.3 | 1 + libdivsufsort/lib/libdivsufsort64.so.3.0.1 | Bin 0 -> 50000 bytes longestcommonprefix/longestCommonPrefix.cpp | 482 +++++++++ longestcommonprefix/longestCommonPrefix.h | 392 +++++++ src/bandmatrix.h | 9 +- src/benchmarking.cpp | 523 ++++++---- src/benchmarking.h | 18 +- src/bitvec.h | 218 +--- src/buildDBG.cpp | 104 +- src/buildIndexAuxiliary.cpp | 6 +- src/buildIndexAuxiliary.h | 2 +- src/bwtrepr.h | 33 + src/createStyles.cpp | 16 +- src/encodedtext.h | 331 ++++++ src/fmindex.cpp | 39 +- src/fmindex.h | 146 +-- src/fmindexDBG.cpp | 1032 +++++++++---------- src/fmindexDBG.h | 607 ++++++----- src/fmocc.h | 31 +- src/fmpos.cpp | 37 +- src/fmpos.h | 4 +- src/main.cpp | 154 ++- src/mainDBG.cpp | 282 ++--- src/mainstats.cpp | 213 ++++ src/mappingpair.h | 2 +- src/node.h | 11 +- src/range.h | 11 + src/searchstrategy.cpp | 32 +- src/searchstrategy.h | 118 ++- src/strainfreemapper.cpp | 5 + src/suffixarray.h | 33 +- src/textoccurrence.h | 24 +- src/tkmer.h | 4 +- src/visualizePath.cpp | 162 +-- src/visualizeRead.cpp | 240 +++-- sux/bits/Rank.hpp | 1 + 46 files changed, 4376 insertions(+), 1958 deletions(-) create mode 100644 libdivsufsort/include/config.h create mode 100644 libdivsufsort/include/divsufsort.h create mode 100644 libdivsufsort/include/divsufsort64.h create mode 100644 libdivsufsort/include/lfs.h create mode 120000 
libdivsufsort/lib/libdivsufsort.so create mode 120000 libdivsufsort/lib/libdivsufsort.so.3 create mode 100755 libdivsufsort/lib/libdivsufsort.so.3.0.1 create mode 120000 libdivsufsort/lib/libdivsufsort64.so create mode 120000 libdivsufsort/lib/libdivsufsort64.so.3 create mode 100755 libdivsufsort/lib/libdivsufsort64.so.3.0.1 create mode 100644 longestcommonprefix/longestCommonPrefix.cpp create mode 100644 longestcommonprefix/longestCommonPrefix.h create mode 100644 src/encodedtext.h create mode 100644 src/mainstats.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 1cc6c66..00e7c17 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,7 +5,19 @@ project(Nexus) # add pop count, c++ version and compiler warnings add_compile_options(-mpopcnt -std=gnu++14 -Wall -pedantic) -add_library(AuxiliaryClasses src/range.cpp src/fmindex.cpp src/nucleotide.cpp src/searchstrategy.cpp src/buildIndexAuxiliary.cpp src/fmpos.cpp src/fmocc.cpp src/search.cpp src/fmindexDBG.cpp src/strainfreemapper.cpp) +include_directories("${PROJECT_SOURCE_DIR}/radixSA64/") + +include_directories("${PROJECT_SOURCE_DIR}/longestcommonprefix/") + +include_directories("${PROJECT_SOURCE_DIR}/libdivsufsort/include/") +find_library(SA_LIB NAMES divsufsort64 libdivsufsort64 libdivsufsort64.so PATHS "${PROJECT_SOURCE_DIR}/libdivsufsort/lib" "${PROJECT_SOURCE_DIR}/libdivsufsort" NO_DEFAULT_PATH REQUIRED) +if(NOT SA_LIB) + message(FATAL_ERROR "divsufsort64 library not found") +endif() + +add_library(AuxiliaryClasses src/range.cpp src/fmindex.cpp src/nucleotide.cpp src/searchstrategy.cpp src/buildIndexAuxiliary.cpp src/fmpos.cpp src/fmocc.cpp src/search.cpp src/fmindexDBG.cpp src/strainfreemapper.cpp longestcommonprefix/longestCommonPrefix.cpp) + +target_link_libraries(AuxiliaryClasses "${SA_LIB}" ) #add_compile_definitions(ALPHABET=6) add_definitions(-DALPHABET=6) @@ -16,6 +28,9 @@ target_link_libraries(columba AuxiliaryClasses ) add_executable(nexus src/mainDBG.cpp src/benchmarking.cpp) 
target_link_libraries(nexus AuxiliaryClasses ) +add_executable(nexusStats src/mainstats.cpp) +target_link_libraries(nexusStats AuxiliaryClasses ) + add_executable(nexusBuild src/buildDBG.cpp) target_link_libraries(nexusBuild AuxiliaryClasses ) diff --git a/README.md b/README.md index dd5238f..58cb72b 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ # Nexus -Pan-genome compacted de Bruijn graphs with support for approximate pattern matching using search schemes +Pan-genome compacted de Bruijn graph using the Bidirectional FM-index with support for lossless approximate pattern matching using search schemes and subgraph visualization. --- @@ -137,65 +166,98 @@ Nexus can align reads in a fasta (`.FASTA`, `.fasta`, `.fa`) or fastq (`.fq`, `. To align your reads, use the following format: ```bash -./nexus [options] basefilename readfile.[ext] +./nexus [options] + + Following input parameters are required: + base filename of the input index + the de Bruijn parameter of the index + the file containing the input reads to be + aligned (single end). + + [ext] + one of the following: fq, fastq, FASTA, fasta, fa + ``` -options: +Details: ``` - [options] - -sfr --strain-free strain-free matching - -e --max-ed maximum edit distance [default = 0] - -s --sa-sparseness suffix array sparseness factor [default = 1] - -c --cp-sparseness sparseness factor that indicates how many checkpoints must be stored to identify nodes. Use "none" to use no checkpoints. [default = 128] - -f --filter filtering type that should be used to filter the occurrences. This option is only valid in case of strain-free matching. Options: - linear linear filtering is efficient but does not filter out all redundant occurrences. Additionally, in some exceptional cases, a non-optimal replacement occurrence can be chosen. This is the default option. - complete complete filtering leads to a set of occurrences with no redundancy. This option is very slow however and thus not recommended. 
- -p --partitioning Add flag to do uniform/static/dynamic partitioning. Dynamic partitioning cannot be used with strain-free matching. [default = static] - -m --metric Add flag to set distance metric (editnaive/editopt/hamming) [default = editopt] - -ss --search-scheme Choose the search scheme - options: - kuch1 Kucherov k + 1 - kuch2 Kucherov k + 2 - kianfar Optimal Kianfar scheme - manbest Manual best improvement for Kianfar scheme (only for ed = 4) - pigeon Pigeonhole scheme - 01*0 01*0 search scheme - naive naive backtracking - custom custom search scheme, the next parameter should be a path to the folder containing this search scheme - -[ext] - one of the following: fq, fastq, FASTA, fasta, fa -Following input files are required: - .txt: input text T - .cct: character counts table - .sa.[saSF]: sparse suffix array, with suffix array sparseness factor [saSF] elements - .sa.bv.[saSF]: bitvector indicating which elements of the suffix array are stored. - .bwt: BWT of T - .rev.bwt: BWT of the reverse of T - .brt: Prefix occurrence table of T - .rev.brt: Prefix occurrence table of the reverse of T - .DBG: variable k and the compressed de Bruijn graph. - .B.left: bitvector B_left for the compressed de Bruijn graph. - .B.right.[cpSF]: bitvector B_right for the compressed de Bruijn graph, with checkpoint sparseness factor [cpSF]. - .B.right.full.[cpSF]: bitvector B_right_full for the compressed de Bruijn graph, with checkpoint sparseness factor [cpSF]. - .left.map: node identifier mapping corresponding to B_left. - .right.map.[cpSF]: node identifier mapping corresponding to B_right, with checkpoint sparseness factor [cpSF]. + Following input parameters are required: + base filename of the input index + the de Bruijn parameter of the index + the file containing the input reads to be + aligned (single end). 
+ [ext] + one of the following: fq, fastq, FASTA, fasta, fa + [options] + -e/--max-ed maximum edit distance [default = 0] + + -s/--sa-sparseness suffix array sparseness factor [default = 16] + + -c/--cp-sparseness sparseness factor that indicates how many + checkpoints must be stored to identify nodes. + Use "none" to use no checkpoints. Choose a + value that was also used during the building + process. [default = 128] + + -p/--partitioning Add flag to do uniform/static/dynamic + partitioning of the seeds for search schemes. + Dynamic partitioning cannot be used with + strain-free matching. [default = dynamic] + + -m/--metric Add flag to set distance metric (editnaive/ + editopt/ hamming) [default = editopt] + + -ss/--search-scheme Choose the search scheme. Options: + * kuch1 Kucherov k + 1 [default] + * kuch2 Kucherov k + 2 + * kianfar Optimal Kianfar scheme + * manbest Manual best improvement for Kianfar + scheme (only for ed = 4) + * pigeon Pigeonhole scheme + * 01*0 01*0 search scheme + * naive naive backtracking + * custom custom search scheme, the next + parameter should be a path to the + folder containing this search scheme + + -sfr/--strain-free strain-free matching: occurrences can be + identified as any path of connected nodes. In + other words, they do not have to occur exactly + in one of the input genomes of the pan-genome. + This option is not activated by default and + is slower than the default implementation. + + -f/--filter filtering type that should be used to filter + the occurrences. This option is only valid in + case of strain-free matching. Options: + * linear: linear filtering is efficient but + does not filter out all redundant + occurrences. Additionally, in some + exceptional cases, a non-optimal replacement + occurrence can be chosen. This is the + default option. + * complete: complete filtering leads to a set + of occurrences with no redundancy. This + option is very slow however and thus not + recommended. 
``` -### Strain-Fixed versus Strain-Free Matching +### Pattern Matching -The `-sfr` or `--strain-free` option indicates a crucial difference in functionality. If this option is not added, Nexus defaults to strain-fixed matching. This means that only substrings of the original strains in the pan-genome can be found as occurrences. If the option is included, Nexus switches to strain-free matching. In this case, occurrences can be non-contiguous with respect to the strains in the pan-genome. In other words, an occurrence does not necessarily entirely occur in one of the original strains of the pan-genome. The choice of which type of matching is used can influence the other options as is mentioned in their description above. + -The matching duration, the number of index nodes visited, the number of graph nodes visited, and the number of reported/unique matches (strain-fixed) or node paths (strain-free) will be printed to stdout along with some additional metrics. For strain-fixed matching specifically, the number of reported node paths is included as an additional metric since multiple matches can correspond to the same node path. + -The matches will be written to a custom output file in the folder where your readfile was. This output file for strain-fixed matching will be a tab-separated file with the fields: `Identifier` (identifies the read), `SubgraphID` (identifies the node paths that were found for this read), `Path` (the node path corresponding to this occurrence), `Strain` (the number of the strain in which this occurrence lies), `Position` (the position of this occurrence in the pan-genome), `Length` (the length of this occurrence), `ED` (the edit distance of this occurrence) and `reverseComplement` (1 if this occurrence was found on the reverse complement of the reference, 0 otherwise). 
Similarly for strain-free matching, a tab-separated file is generated with the fields: `Identifier`, `SubgraphID`, `Path`, `DistanceFromLeftEnd`, `Length`, `ED` and `reverseComplement`. The `Strain` and `Position` fields are not present since an occurrence does not necessarily belong to one strain/correspond to a certain position in the original pan-genome anymore. The `DistanceFromLeftEnd` field is new and indicates the distance from the start of the match to the start of the first node of the node path, partly substituting the previous `Position` field. For each optimal alignment under the maximum given edit distance a line will be present. This output file will be called `readfile_output.txt`. +In default pattern matching mode, only substrings of the original strains in the pan-genome can be found as occurrences. The matches will be written to a custom output file in the folder where your readfile was. This output file will be a tab-separated file with the fields: `Identifier` (identifies the read), `SubgraphID` (identifies the node paths that were found for this read), `Path` (the node path corresponding to this occurrence), `DistanceFromLeftEnd` (the distance from the start of the match to the start of the first node of the node path), `Strain` (the number of the strain in which this occurrence lies), `Position` (the position of this occurrence in the pan-genome), `Length` (the length of this occurrence), `ED` (the edit distance of this occurrence) and `reverseComplement` (1 if this occurrence was found on the reverse complement of the reference, 0 otherwise). For each alignment under the maximum given edit distance a line will be present. This output file will be called `readfile_output.tsv`. + +The matching duration, the number of index nodes visited, the number of graph nodes visited, and the number of reported/unique matches and node paths will be printed to stdout along with some additional metrics. 
@@ -204,7 +266,7 @@ The matches will be written to a custom output file in the folder where your rea Consider the final directory structure from [example 1](##Example-1). Copy this [file](https://github.com/biointec/nexus/releases/download/v1.0.0/EscherichiaColi4Strains.reads.fasta) to this `example` directory. This file contains 100 000 reads of length 100 all sampled from the pan-genome of four E. coli strains (i.e., each read is sampled from one of the four strains). Thus, each read will have at least one exact occurrence. -If you want to align these reads in a strain-fixed way using the Pigeonhole scheme up to an edit distance of 3 to our reference pan-genome, run the following command in the `build` folder: +If you want to align these reads using the Pigeonhole scheme up to an edit distance of 3 to our reference pan-genome, run the following command in the `build` folder: ```bash ./nexus -e 3 -ss pigeon ../example/EscherichiaColi4Strains ../example/EscherichiaColi4Strains.reads.fasta ``` @@ -215,45 +277,49 @@ After this operation your directory structure will look like this: ├── build ├── cmake ├── example - | ├── EscherichiaColi4Strains.B.left + | ├── EscherichiaColi4Strains.B.left.k20 | ├── EscherichiaColi4Strains.brt - | ├── EscherichiaColi4Strains.DBG + | ├── EscherichiaColi4Strains.DBG.k20 | ├── EscherichiaColi4Strains.rev.bwt - | ├── EscherichiaColi4Strains.sa.bv.1 - | ├── EscherichiaColi4Strains.B.right.128 + | ├── EscherichiaColi4Strains.sa.bv.16 + | ├── EscherichiaColi4Strains.B.right.128.k20 | ├── EscherichiaColi4Strains.bwt - | ├── EscherichiaColi4Strains.left.map + | ├── EscherichiaColi4Strains.left.map.k20 | ├── EscherichiaColi4Strains.reads.fasta - | ├── EscherichiaColi4Strains.reads.fasta_output.txt - | ├── EscherichiaColi4Strains.right.map.128 - | ├── EscherichiaColi4Strains.txt + | ├── EscherichiaColi4Strains.reads.fasta_output.tsv + | ├── EscherichiaColi4Strains.right.map.128.k20 + | ├── EscherichiaColi4Strains.compressed.txt | ├── 
EscherichiaColi4Strains.B.right.full.128 | ├── EscherichiaColi4Strains.cct | ├── EscherichiaColi4Strains.rev.brt - | └── EscherichiaColi4Strains.sa.1 + | └── EscherichiaColi4Strains.sa.16 + ├── libdivsufsort + ├── longestcommonprefix ├── radixSA64 ├── search_schemes ├── src - └── sux + ├── sux + ├── CMakeLists.txt + └── README.md ``` -The results can be found in `EscherichiaColi4Strains.reads.fasta_output.txt`. +The results can be found in `EscherichiaColi4Strains.reads.fasta_output.tsv`. -To align the reads in a strain-free way, run + --- **NOTE** -This second alignment of reads will overwrite the `EscherichiaColi4Strains.reads.fasta_output.txt`. Before running a second time, make sure to back up the original file if you would like to keep it stored. +Another alignment of the same reads would overwrite the `EscherichiaColi4Strains.reads.fasta_output.tsv`. Before running a second time, make sure to back up the original file if you would like to keep it stored. --- -Congratulations! You are now able to use Nexus to align reads to a pan-genome of four E. coli strains! + ## Custom Search Schemes @@ -275,12 +341,12 @@ If you want to provide optimal static partitioning, you can create a file named Similarly, to provide values for dynamic partitioning you can create a file called `dynamic_partitioning.txt`. This file should contain two lines. The first line contains percentages (again between 0 and 1) that correspond to the seeding positions, relative to the size of the pattern, of all parts, except the first and last part. The second line should contain space-separated integers corresponding to the weights of each part. ---- + ### Folder Structure Example Consider a search scheme which supports maximal edit/hamming distances 1, 2 and 4. For distance 1 no static or dynamic partitioning values are known. For distance 2 only static partitioning values are known and for distance 4 both static and dynamic partitioning values are known. 
The folder structure of this search scheme should look like this: @@ -316,52 +382,60 @@ In the `search_schemes` folder the hardcoded search schemes of Nexus are availab As part of the code of Nexus is based on [Columba 1.0](https://github.com/biointec/columba/releases/tag/v1.0), Nexus also implements lossless approximate pattern matching directly to the underlying linear reference. In this case, no node path in the pan-genome graph is found. We refer to this functionality as Nexus's built-in version of Columba. -Just as all of the other functionality, it is assumed here that the linear reference is a pan-genome containing DNA characters `A`, `C`, `G` and `T`; and separation characters `$` and `%`. Both separation characters must be present to garantee correct performance. +Just as all of the other functionality, it is assumed here that the linear reference is a pan-genome containing DNA characters `A`, `C`, `G` and `T`; and separation characters `$` and `%`. Both separation characters must be present to guarantee correct performance. Matching a read file to the linear reference text can be done by running the following command from the `build` folder. ```bash -./columba [options] basefilename readfile.[ext] +./columba [options] ``` -options: +Details: ``` + Following input parameters are required: + base filename of the input index + the file containing the input reads to be + aligned (single end). 
+ + [ext] + one of the following: fq, fastq, FASTA, fasta, fa + [options] - -e --max-ed maximum edit distance [default = 0] - -s --sa-sparseness suffix array sparseness factor [default = 1] - -p --partitioning Add flag to do uniform/static/dynamic partitioning [default = dynamic] - -m --metric Add flag to set distance metric (editnaive/editopt/hamming) [default = editopt] - -ss --search-scheme Choose the search scheme - options: - kuch1 Kucherov k + 1 - kuch2 Kucherov k + 2 - kianfar Optimal Kianfar scheme - manbest Manual best improvement for Kianfar scheme (only for ed = 4) - pigeon Pigeonhole scheme - 01*0 01*0 search scheme - custom custom search scheme, the next parameter should be a path to the folder containing this search scheme - -[ext] - one of the following: fq, fastq, FASTA, fasta, fa -Following input files are required: - .txt: input text T - .cct: character counts table - .sa.[saSF]: suffix array sample every [saSF] elements - .bwt: BWT of T - .brt: Prefix occurrence table of T - .rev.brt: Prefix occurrence table of the reverse of T + -e/--max-ed maximum edit distance [default = 0] + -s/--sa-sparseness suffix array sparseness factor [default = 16] - + -p/--partitioning Add flag to do uniform/static/dynamic + partitioning of the seeds for search schemes. + Dynamic partitioning cannot be used with + strain-free matching. [default = dynamic] + + -m/--metric Add flag to set distance metric (editnaive/ + editopt/ hamming) [default = editopt] + + -ss/--search-scheme Choose the search scheme. Options: + * kuch1 Kucherov k + 1 [default] + * kuch2 Kucherov k + 2 + * kianfar Optimal Kianfar scheme + * manbest Manual best improvement for Kianfar + scheme (only for ed = 4) + * pigeon Pigeonhole scheme + * 01*0 01*0 search scheme + * naive naive backtracking + * custom custom search scheme, the next + parameter should be a path to the + folder containing this search scheme ``` -As a result, a custom output file is written in the folder where your readfile was. 
This `.txt` file contains for each occurrence the following fields: `identifier`, `Position`, `Length`, `ED` and `reverseComplement`. `identifier` identifies the read, the other fields have been discussed above. +As a result, a custom output file is written in the folder where your readfile was. This `.txt` file contains for each occurrence the following fields: `identifier`, `Position`, `Length`, `ED` and `reverseComplement`. These fields have already been discussed above. + +For more information regarding columba's options, we refer to Columba's [GitHub page](https://github.com/biointec/columba). # Visualizing Subgraphs We also foresee functionality to visualize node paths along with their surrounding neighborhood. -The visualization functionality is implemented such that every connection between two nodes is shown separately. This means that it is possible that multiple edges have the same source and target nodes if this particular combination occurs multiple times in the pan-genome reference. Note that in the `Lossless Approximate Pattern Matching on Pan-genome de Bruijn Graphs` paper, edges are defined differently: each edge must have a different combination of source and target nodes. In other words, all parallel connections are considered together as one edge. We choose not to simplify the visualized subgraphs as such, to provide a more complete overview. +The visualization functionality is implemented such that every connection between two nodes can be shown separately. This means that it is possible that multiple edges have the same source and target nodes if this particular combination occurs multiple times in the pan-genome reference. 
## Prerequisites ### Cytoscape @@ -379,7 +453,7 @@ Additionally, for smooth visualization we recommend installing `yFiles Layout Al To import a clear graph style, you can create a styles file using the following command (run from the `build` folder): ```bash -./createStyles [numberOfStrains] +./createStyles ``` As a parameter, enter the number of strains that your pan-genome graph contains. This way, the edge colors are spread out nicely. @@ -396,52 +470,81 @@ After running this program, a file `PanGenomeSubgraph.xml` is created in the `bu It is possible to input a DNA read, which is then matched to the pan-genome graph. The resulting occurrences are then visualized. To do so, execute the following in the `build` directory: ``` -./visualizeRead [options] basefilename read -``` - -There are pattern matching options, which are identical to the ones described above for matching read files. Additionally, two new visualization options are included. - -``` - [Pattern matching options] - -sfr --strain-free strain-free matching - -e --max-ed maximum edit distance [default = 0] - -s --sa-sparseness suffix array sparseness factor [default = 1] - -c --cp-sparseness sparseness factor that indicates how many checkpoints must be stored to identify nodes. Use "none" to use no checkpoints. Choose a value that was also used during the building process. [default = 128] - -f --filter filtering type that should be used to filter the occurrences. This option is only valid in case of strain-free matching. Options: - linear linear filtering is efficient but does not filter out all redundant occurrences. Additionally, in some exceptional cases, a non-optimal replacement occurrence can be chosen. This is the default option. - complete complete filtering leads to a set of occurrences with no redundancy. This option is very slow however and thus not recommended. - -p --partitioning Add flag to do uniform/static/dynamic partitioning. 
Dynamic partitioning cannot be used with strain-free matching. [default = static] - -m --metric Add flag to set distance metric (editnaive/editopt/hamming) [default = editopt] - -ss --search-scheme Choose the search scheme - options: - kuch1 Kucherov k + 1 - kuch2 Kucherov k + 2 - kianfar Optimal Kianfar scheme - manbest Manual best improvement for kianfar scheme (only for ed = 4) - pigeon Pigeonhole scheme - 01*0 01*0 search scheme - naive naive backtracking - custom custom search scheme, the next parameter should be a path to the folder containing this search scheme - - [Visualization options] - -d --visualization-depth Depth of the visualized neighborhood around the paths of interest [default = 3] - -o --output-files Prefix of the output files that will be created during the visualization process [default = basefilename] - -Following input files are required: - .txt: input text T - .cct: character counts table - .sa.[saSF]: sparse suffix array, with suffix array sparseness factor [saSF] elements - .sa.bv.[saSF]: bitvector indicating which elements of the suffix array are stored. - .bwt: BWT of T - .rev.bwt: BWT of the reverse of T - .brt: Prefix occurrence table of T - .rev.brt: Prefix occurrence table of the reverse of T - .DBG: variable k and the compressed de Bruijn graph. - .B.left: bitvector B_left for the compressed de Bruijn graph. - .B.right.[cpSF]: bitvector B_right for the compressed de Bruijn graph, with checkpoint sparseness factor [cpSF]. - .B.right.full.[cpSF]: bitvector B_right_full for the compressed de Bruijn graph, with checkpoint sparseness factor [cpSF]. - .left.map: node identifier mapping corresponding to B_left. - .right.map.[cpSF]: node identifier mapping corresponding to B_right, with checkpoint sparseness factor [cpSF]. +./visualizeRead [options] +``` + +There are pattern matching options, which are identical to the ones described above for matching read files. Additionally, three new visualization options are included at the end. 
+ +``` + Following input parameters are required: + base filename of the input index + the de Bruijn parameter of the index + the read that must be aligned and visualized. + + + [options] + -e/--max-ed maximum edit distance [default = 0] + + -s/--sa-sparseness suffix array sparseness factor [default = 16] + + -c/--cp-sparseness sparseness factor that indicates how many + checkpoints must be stored to identify nodes. + Use "none" to use no checkpoints. Choose a + value that was also used during the building + process. [default = 128] + + -p/--partitioning Add flag to do uniform/static/dynamic + partitioning of the seeds for search schemes. + Dynamic partitioning cannot be used with + strain-free matching. [default = dynamic] + + -m/--metric Add flag to set distance metric (editnaive/ + editopt/ hamming) [default = editopt] + + -ss/--search-scheme Choose the search scheme. Options: + * kuch1 Kucherov k + 1 [default] + * kuch2 Kucherov k + 2 + * kianfar Optimal Kianfar scheme + * manbest Manual best improvement for Kianfar + scheme (only for ed = 4) + * pigeon Pigeonhole scheme + * 01*0 01*0 search scheme + * naive naive backtracking + * custom custom search scheme, the next + parameter should be a path to the + folder containing this search scheme + + -sfr/--strain-free strain-free matching: occurrences can be + identified as any path of connected nodes. In + other words, they do not have to occur exactly + in one of the input genomes of the pan-genome. + This option is not activated by default and + is slower than the default implementation. + + -f/--filter filtering type that should be used to filter + the occurrences. This option is only valid in + case of strain-free matching. Options: + * linear: linear filtering is efficient but + does not filter out all redundant + occurrences. Additionally, in some + exceptional cases, a non-optimal replacement + occurrence can be chosen. This is the + default option. 
+ * complete: complete filtering leads to a set + of occurrences with no redundancy. This + option is very slow however and thus not + recommended. + + -d/--depth Depth of the visualized neighborhood around the + paths of interest [default = 3] + + -b/--bundle-edges Bundle edges stemming from different strains + together. Recommended when many strains are + present [default = false] + + -o/--output-files Prefix of the output files that will be created + during the visualization process [default = + basefilename] ``` @@ -449,7 +552,9 @@ This procedure outputs two files: `outputfilename_SubgraphOverview.tsv` and `out Note however that if your read does not occur in the graph respecting your search parameters, these two files will be empty except for the header. -`outputfilename_SubgraphOverview.tsv` contains an overview of the occurrences corresponding to the input read. This overview is similar to what is reported when mapping a readfile to the graph. For strain-fixed matching, `SubgraphID`, `Path`, `Strain`, `Position`, `Length` and `ED` are reported. For strain-free matching, `SubgraphID`, `Path`, `DistanceFromLeftEnd`, `Length` and `ED` are reported. Note that only the read itself is matched to the graph, not its reverse complement. +`outputfilename_SubgraphOverview.tsv` contains an overview of the occurrences corresponding to the input read. This overview is similar to what is reported when mapping a readfile to the graph. Fields `SubgraphID`, `Path`, `DistanceFromLeftEnd`, `Strain`, `Position`, `Length` and `ED` are reported. Note that only the read itself is matched to the graph, not its reverse complement. + + `outputfilename_SubgraphEdges.tsv` contains all edges for all subgraphs and can be used to visualize the subgraphs in Cytoscape. This can be done as follows: * either drag the `outputfilename_SubgraphEdges.tsv` into the `Network` pane or import it as a `Network from File` via the `File` dropdown in the top menu bar. 
Check that the columns are interpreted by Cytoscape as follows: @@ -463,6 +568,7 @@ Note however that if your read does not occur in the graph respecting your searc * OmegaFull = Target Node Attribute * PartOfPath = Target Node Attribute * Color = Edge Attribute + * EdgeMultiplicity = Edge Attribute * Select the network that is shown, then navigate to `Style` in the left menu bar * In the styles dropdown menu, choose `PanGenomeSubgraph` which you imported earlier * Select `Layout` in the top menu @@ -474,7 +580,7 @@ You should now see a clear set of subgraphs corresponding to your input read. Use again the pan-genome from [example 1](##Example-1). -If you want to align and visualize a read (e.g., `CGGCATCCAGGTCGTTAATGATGATAGTTGGTCTGGACATTTTTACTCCATGTCGTCGGTACTGCGAGTGTCGCAGATAAACATACCCAAAAGAAAACCC`) in a strain-fixed way using the Pigeonhole scheme up to an edit distance of 3 to our reference pan-genome, run the following command in the `build` folder: +If you want to align and visualize a read (e.g., `CGGCATCCAGGTCGTTAATGATGATAGTTGGTCTGGACATTTTTACTCCATGTCGTCGGTACTGCGAGTGTCGCAGATAAACATACCCAAAAGAAAACCC`) using the Pigeonhole scheme up to an edit distance of 3 to our reference pan-genome, run the following command in the `build` folder: ```bash ./visualizeRead -e 3 -ss pigeon ../example/EscherichiaColi4Strains CGGCATCCAGGTCGTTAATGATGATAGTTGGTCTGGACATTTTTACTCCATGTCGTCGGTACTGCGAGTGTCGCAGATAAACATACCCAAAAGAAAACCC ``` @@ -485,19 +591,19 @@ The default neighborhood depth of 3 is used. The results can now be found in `Es The darker nodes are part of the node path. Lighter nodes are part of the neighborhood. -To align the read in a strain-free way, run + ## Visualizing a Node Path You can also directly visualize a node path of interest. To do so, execute the following in the `build` directory: ``` -./visualizePath [options] basefilename path +./visualizePath [options] ``` The path should be a comma-separated list of nodes (no whitespaces). 
@@ -505,27 +611,35 @@ The path should be a comma-separated list of nodes (no whitespaces). In this case, there are similar but less options: ``` - [Visualization options] - -d --visualization-depth Depth of the visualized neighborhood around the paths of interest [default = 3] - -o --output-files Prefix of the output files that will be created during the visualization process [default = basefilename] - -s --sa-sparseness suffix array sparseness factor [default = 1] - -c --cp-sparseness sparseness factor that indicates how many checkpoints must be stored to identify nodes. Use "none" to use no checkpoints. Choose a value that was also used during the building process.[default = 128] + Following input parameters are required: + base filename of the input index + the de Bruijn parameter of the index + a comma-separated list of node identifiers + (e.g., 1,9,20) + + + [options] + -e/--max-ed maximum edit distance [default = 0] + + -s/--sa-sparseness suffix array sparseness factor [default = 16] + + -c/--cp-sparseness sparseness factor that indicates how many + checkpoints must be stored to identify nodes. + Use "none" to use no checkpoints. Choose a + value that was also used during the building + process. [default = 128] + + -d/--depth Depth of the visualized neighborhood around the + paths of interest [default = 3] + + -b/--bundle-edges Bundle edges stemming from different strains + together. Recommended when many strains are + present [default = false] + + -o/--output-files Prefix of the output files that will be created + during the visualization process [default = + basefilename] -Following input files are required: - .txt: input text T - .cct: character counts table - .sa.[saSF]: sparse suffix array, with suffix array sparseness factor [saSF] elements - .sa.bv.[saSF]: bitvector indicating which elements of the suffix array are stored. 
- .bwt: BWT of T - .rev.bwt: BWT of the reverse of T - .brt: Prefix occurrence table of T - .rev.brt: Prefix occurrence table of the reverse of T - .DBG: variable k and the compressed de Bruijn graph. - .B.left: bitvector B_left for the compressed de Bruijn graph. - .B.right.[cpSF]: bitvector B_right for the compressed de Bruijn graph, with checkpoint sparseness factor [cpSF]. - .B.right.full.[cpSF]: bitvector B_right_full for the compressed de Bruijn graph, with checkpoint sparseness factor [cpSF]. - .left.map: node identifier mapping corresponding to B_left. - .right.map.[cpSF]: node identifier mapping corresponding to B_right, with checkpoint sparseness factor [cpSF]. ``` @@ -541,8 +655,12 @@ We now want to visualize the subgraph around node path `{551,73827}` (which was ``` The result can now be found in `EscherichiaColi4Strains_SubgraphEdges.tsv`. You can now visualize the subgraph in Cytoscape as described above. This should show one of the subgraphs from the image above. -# Reproducing Results -The results from our paper 'Lossless Approximate Pattern Matching on Pan-genome de Bruijn Graphs' can be reproduced by using the following instructions. + + + + + \ No newline at end of file diff --git a/libdivsufsort/include/config.h b/libdivsufsort/include/config.h new file mode 100644 index 0000000..fb4e712 --- /dev/null +++ b/libdivsufsort/include/config.h @@ -0,0 +1,81 @@ +/* + * config.h for libdivsufsort + * Copyright (c) 2003-2008 Yuta Mori All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef _CONFIG_H +#define _CONFIG_H 1 + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + +/** Define to the version of this package. **/ +#define PROJECT_VERSION_FULL "2.0.1-14-g5f60d6f" + +/** Define to 1 if you have the header files. 
**/ +#define HAVE_INTTYPES_H 1 +#define HAVE_STDDEF_H 1 +#define HAVE_STDINT_H 1 +#define HAVE_STDLIB_H 1 +#define HAVE_STRING_H 1 +#define HAVE_STRINGS_H 1 +#define HAVE_MEMORY_H 1 +#define HAVE_SYS_TYPES_H 1 + +/** for WinIO **/ +/* #undef HAVE_IO_H */ +/* #undef HAVE_FCNTL_H */ +/* #undef HAVE__SETMODE */ +/* #undef HAVE_SETMODE */ +/* #undef HAVE__FILENO */ +/* #undef HAVE_FOPEN_S */ +/* #undef HAVE__O_BINARY */ +#ifndef HAVE__SETMODE +# if HAVE_SETMODE +# define _setmode setmode +# define HAVE__SETMODE 1 +# endif +# if HAVE__SETMODE && !HAVE__O_BINARY +# define _O_BINARY 0 +# define HAVE__O_BINARY 1 +# endif +#endif + +/** for inline **/ +#ifndef INLINE +# define INLINE inline +#endif + +/** for VC++ warning **/ +#ifdef _MSC_VER +#pragma warning(disable: 4127) +#endif + + +#ifdef __cplusplus +} /* extern "C" */ +#endif /* __cplusplus */ + +#endif /* _CONFIG_H */ diff --git a/libdivsufsort/include/divsufsort.h b/libdivsufsort/include/divsufsort.h new file mode 100644 index 0000000..6d3e648 --- /dev/null +++ b/libdivsufsort/include/divsufsort.h @@ -0,0 +1,180 @@ +/* + * divsufsort.h for libdivsufsort + * Copyright (c) 2003-2008 Yuta Mori All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef _DIVSUFSORT_H +#define _DIVSUFSORT_H 1 + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + +#include <inttypes.h> + +#ifndef DIVSUFSORT_API +# ifdef DIVSUFSORT_BUILD_DLL +# define DIVSUFSORT_API +# else +# define DIVSUFSORT_API +# endif +#endif + +/*- Datatypes -*/ +#ifndef SAUCHAR_T +#define SAUCHAR_T +typedef uint8_t sauchar_t; +#endif /* SAUCHAR_T */ +#ifndef SAINT_T +#define SAINT_T +typedef int32_t saint_t; +#endif /* SAINT_T */ +#ifndef SAIDX_T +#define SAIDX_T +typedef int32_t saidx_t; +#endif /* SAIDX_T */ +#ifndef PRIdSAINT_T +#define PRIdSAINT_T PRId32 +#endif /* PRIdSAINT_T */ +#ifndef PRIdSAIDX_T +#define PRIdSAIDX_T PRId32 +#endif /* PRIdSAIDX_T */ + + +/*- Prototypes -*/ + +/** + * Constructs the suffix array of a given string. + * @param T[0..n-1] The input string. + * @param SA[0..n-1] The output array of suffixes. + * @param n The length of the given string. + * @return 0 if no error occurred, -1 or -2 otherwise. + */ +DIVSUFSORT_API +saint_t +divsufsort(const sauchar_t *T, saidx_t *SA, saidx_t n); + +/** + * Constructs the burrows-wheeler transformed string of a given string. + * @param T[0..n-1] The input string. + * @param U[0..n-1] The output string. (can be T) + * @param A[0..n-1] The temporary array. (can be NULL) + * @param n The length of the given string. + * @return The primary index if no error occurred, -1 or -2 otherwise.
+ */ +DIVSUFSORT_API +saidx_t +divbwt(const sauchar_t *T, sauchar_t *U, saidx_t *A, saidx_t n); + +/** + * Returns the version of the divsufsort library. + * @return The version number string. + */ +DIVSUFSORT_API +const char * +divsufsort_version(void); + + +/** + * Constructs the burrows-wheeler transformed string of a given string and suffix array. + * @param T[0..n-1] The input string. + * @param U[0..n-1] The output string. (can be T) + * @param SA[0..n-1] The suffix array. (can be NULL) + * @param n The length of the given string. + * @param idx The output primary index. + * @return 0 if no error occurred, -1 or -2 otherwise. + */ +DIVSUFSORT_API +saint_t +bw_transform(const sauchar_t *T, sauchar_t *U, + saidx_t *SA /* can NULL */, + saidx_t n, saidx_t *idx); + +/** + * Inverse BW-transforms a given BWTed string. + * @param T[0..n-1] The input string. + * @param U[0..n-1] The output string. (can be T) + * @param A[0..n-1] The temporary array. (can be NULL) + * @param n The length of the given string. + * @param idx The primary index. + * @return 0 if no error occurred, -1 or -2 otherwise. + */ +DIVSUFSORT_API +saint_t +inverse_bw_transform(const sauchar_t *T, sauchar_t *U, + saidx_t *A /* can NULL */, + saidx_t n, saidx_t idx); + +/** + * Checks the correctness of a given suffix array. + * @param T[0..n-1] The input string. + * @param SA[0..n-1] The input suffix array. + * @param n The length of the given string. + * @param verbose The verbose mode. + * @return 0 if no error occurred. + */ +DIVSUFSORT_API +saint_t +sufcheck(const sauchar_t *T, const saidx_t *SA, saidx_t n, saint_t verbose); + +/** + * Search for the pattern P in the string T. + * @param T[0..Tsize-1] The input string. + * @param Tsize The length of the given string. + * @param P[0..Psize-1] The input pattern string. + * @param Psize The length of the given pattern string. + * @param SA[0..SAsize-1] The input suffix array. + * @param SAsize The length of the given suffix array. 
+ * @param left The output index. + * @return The count of matches if no error occurred, -1 otherwise. + */ +DIVSUFSORT_API +saidx_t +sa_search(const sauchar_t *T, saidx_t Tsize, + const sauchar_t *P, saidx_t Psize, + const saidx_t *SA, saidx_t SAsize, + saidx_t *left); + +/** + * Search for the character c in the string T. + * @param T[0..Tsize-1] The input string. + * @param Tsize The length of the given string. + * @param SA[0..SAsize-1] The input suffix array. + * @param SAsize The length of the given suffix array. + * @param c The input character. + * @param left The output index. + * @return The count of matches if no error occurred, -1 otherwise. + */ +DIVSUFSORT_API +saidx_t +sa_simplesearch(const sauchar_t *T, saidx_t Tsize, + const saidx_t *SA, saidx_t SAsize, + saint_t c, saidx_t *left); + + +#ifdef __cplusplus +} /* extern "C" */ +#endif /* __cplusplus */ + +#endif /* _DIVSUFSORT_H */ diff --git a/libdivsufsort/include/divsufsort64.h b/libdivsufsort/include/divsufsort64.h new file mode 100644 index 0000000..2f1c375 --- /dev/null +++ b/libdivsufsort/include/divsufsort64.h @@ -0,0 +1,180 @@ +/* + * divsufsort64.h for libdivsufsort64 + * Copyright (c) 2003-2008 Yuta Mori All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software.
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef _DIVSUFSORT64_H +#define _DIVSUFSORT64_H 1 + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + +#include <inttypes.h> + +#ifndef DIVSUFSORT_API +# ifdef DIVSUFSORT_BUILD_DLL +# define DIVSUFSORT_API +# else +# define DIVSUFSORT_API +# endif +#endif + +/*- Datatypes -*/ +#ifndef SAUCHAR_T +#define SAUCHAR_T +typedef uint8_t sauchar_t; +#endif /* SAUCHAR_T */ +#ifndef SAINT_T +#define SAINT_T +typedef int32_t saint_t; +#endif /* SAINT_T */ +#ifndef SAIDX64_T +#define SAIDX64_T +typedef int64_t saidx64_t; +#endif /* SAIDX64_T */ +#ifndef PRIdSAINT_T +#define PRIdSAINT_T PRId32 +#endif /* PRIdSAINT_T */ +#ifndef PRIdSAIDX64_T +#define PRIdSAIDX64_T PRId64 +#endif /* PRIdSAIDX64_T */ + + +/*- Prototypes -*/ + +/** + * Constructs the suffix array of a given string. + * @param T[0..n-1] The input string. + * @param SA[0..n-1] The output array of suffixes. + * @param n The length of the given string. + * @return 0 if no error occurred, -1 or -2 otherwise. + */ +DIVSUFSORT_API +saint_t +divsufsort64(const sauchar_t *T, saidx64_t *SA, saidx64_t n); + +/** + * Constructs the burrows-wheeler transformed string of a given string. + * @param T[0..n-1] The input string. + * @param U[0..n-1] The output string. (can be T) + * @param A[0..n-1] The temporary array. (can be NULL) + * @param n The length of the given string. + * @return The primary index if no error occurred, -1 or -2 otherwise.
+ */ +DIVSUFSORT_API +saidx64_t +divbwt64(const sauchar_t *T, sauchar_t *U, saidx64_t *A, saidx64_t n); + +/** + * Returns the version of the divsufsort library. + * @return The version number string. + */ +DIVSUFSORT_API +const char * +divsufsort64_version(void); + + +/** + * Constructs the burrows-wheeler transformed string of a given string and suffix array. + * @param T[0..n-1] The input string. + * @param U[0..n-1] The output string. (can be T) + * @param SA[0..n-1] The suffix array. (can be NULL) + * @param n The length of the given string. + * @param idx The output primary index. + * @return 0 if no error occurred, -1 or -2 otherwise. + */ +DIVSUFSORT_API +saint_t +bw_transform64(const sauchar_t *T, sauchar_t *U, + saidx64_t *SA /* can NULL */, + saidx64_t n, saidx64_t *idx); + +/** + * Inverse BW-transforms a given BWTed string. + * @param T[0..n-1] The input string. + * @param U[0..n-1] The output string. (can be T) + * @param A[0..n-1] The temporary array. (can be NULL) + * @param n The length of the given string. + * @param idx The primary index. + * @return 0 if no error occurred, -1 or -2 otherwise. + */ +DIVSUFSORT_API +saint_t +inverse_bw_transform64(const sauchar_t *T, sauchar_t *U, + saidx64_t *A /* can NULL */, + saidx64_t n, saidx64_t idx); + +/** + * Checks the correctness of a given suffix array. + * @param T[0..n-1] The input string. + * @param SA[0..n-1] The input suffix array. + * @param n The length of the given string. + * @param verbose The verbose mode. + * @return 0 if no error occurred. + */ +DIVSUFSORT_API +saint_t +sufcheck64(const sauchar_t *T, const saidx64_t *SA, saidx64_t n, saint_t verbose); + +/** + * Search for the pattern P in the string T. + * @param T[0..Tsize-1] The input string. + * @param Tsize The length of the given string. + * @param P[0..Psize-1] The input pattern string. + * @param Psize The length of the given pattern string. + * @param SA[0..SAsize-1] The input suffix array. 
+ * @param SAsize The length of the given suffix array. + * @param left The output index. + * @return The count of matches if no error occurred, -1 otherwise. + */ +DIVSUFSORT_API +saidx64_t +sa_search64(const sauchar_t *T, saidx64_t Tsize, + const sauchar_t *P, saidx64_t Psize, + const saidx64_t *SA, saidx64_t SAsize, + saidx64_t *left); + +/** + * Search for the character c in the string T. + * @param T[0..Tsize-1] The input string. + * @param Tsize The length of the given string. + * @param SA[0..SAsize-1] The input suffix array. + * @param SAsize The length of the given suffix array. + * @param c The input character. + * @param left The output index. + * @return The count of matches if no error occurred, -1 otherwise. + */ +DIVSUFSORT_API +saidx64_t +sa_simplesearch64(const sauchar_t *T, saidx64_t Tsize, + const saidx64_t *SA, saidx64_t SAsize, + saint_t c, saidx64_t *left); + + +#ifdef __cplusplus +} /* extern "C" */ +#endif /* __cplusplus */ + +#endif /* _DIVSUFSORT64_H */ diff --git a/libdivsufsort/include/lfs.h b/libdivsufsort/include/lfs.h new file mode 100644 index 0000000..7ef88f0 --- /dev/null +++ b/libdivsufsort/include/lfs.h @@ -0,0 +1,56 @@ +/* + * lfs.h for libdivsufsort + * Copyright (c) 2003-2008 Yuta Mori All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software.
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef _LFS_H +#define _LFS_H 1 + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + +#ifndef __STRICT_ANSI__ +# define LFS_OFF_T off_t +# define LFS_FOPEN fopen +# define LFS_FTELL ftello +# define LFS_FSEEK fseeko +# define LFS_PRId PRIdMAX +#else +# define LFS_OFF_T long +# define LFS_FOPEN fopen +# define LFS_FTELL ftell +# define LFS_FSEEK fseek +# define LFS_PRId "ld" +#endif +#ifndef PRIdOFF_T +# define PRIdOFF_T LFS_PRId +#endif + + +#ifdef __cplusplus +} /* extern "C" */ +#endif /* __cplusplus */ + +#endif /* _LFS_H */ diff --git a/libdivsufsort/lib/libdivsufsort.so b/libdivsufsort/lib/libdivsufsort.so new file mode 120000 index 0000000..f0aefcf --- /dev/null +++ b/libdivsufsort/lib/libdivsufsort.so @@ -0,0 +1 @@ +libdivsufsort.so.3 \ No newline at end of file diff --git a/libdivsufsort/lib/libdivsufsort.so.3 b/libdivsufsort/lib/libdivsufsort.so.3 new file mode 120000 index 0000000..9f02362 --- /dev/null +++ b/libdivsufsort/lib/libdivsufsort.so.3 @@ -0,0 +1 @@ +libdivsufsort.so.3.0.1 \ No newline at end of file diff --git a/libdivsufsort/lib/libdivsufsort.so.3.0.1 b/libdivsufsort/lib/libdivsufsort.so.3.0.1 new file mode 100755 index 0000000000000000000000000000000000000000..916f7895e095151051e72a059b97b8231be40235 GIT binary patch literal 49984 zcmeFadwf*Yx$r-e3@|{<4jMIT)SMdMl4@;Yv1TgTY}f;Pa0XFPX+^<~P&h@PFad12 z=_G;m4jX92OHVJ_^S$$II?O7C!kH`xK1G&HazzqSpnmEX-sqt?#{>?w(P!p&Q z4CMby0vGGHp`Kf(oIMuys&dCbk#{Zc=p(28^~uK%=XmXM$J~8+PQP!ZcRz5_aqZb& 
zeeT%%xxU=${JnR-^SKj`GxNObdHyl?oLtIt;eYe)ho3+3xXnjh?&uF!!zge*>GyZ? zvCQixkUO6I+y?)-laJ;8bGf6&1$DQ+m%8`=$@|9m&xL>Hb$s%1mH)onG55Zkc~0>1 zU;WEX^DX}SK5rhv{IN|P7T%8>#3>>)M zt8Nj%`THG3>KlKcF9W~w=RN7y2j+S8ANcLZ`}LJKdiCG!?;YRw>;L$GcfHythy#B8 z%zCeWy5CPzu706cf34qM9@lVQ=+$57GvrdgzQ~_#p%eN0XNt6+S)wlki+%V56DEA; zz8TXe%ud}iD>Y$4U_xwcV#2+XW=;Ce)Y+*?v&JTdPn$7)(%5^xJ#CVA@AP*jB@ z-=3S2gMW9zoJq50Pn|Ix$N(z&-3iGl-<>e|o~hFUQ>V+VNfW+3cS35`J=14To-ymb z08b^SOiF$?FgpdXrzXpDfyr}cO-)S-%)V#B>`C{`N>0&dQ}3HOZIXXI zZR)p^SIwSr)z<=?^#CDNUk!|m$A%4`aP?JRy=qAJdC%F^S6w5x6h!9h|MU1?_*#y^ z)BKaxLE&iTR`^Q(QWs2xGD3li{kr0*Q_qtAD*Wr(3*~-Z;HzBAUrrC?Ps5Ei+&vZA?b>;UiChS>-xdH>T|MOZdCTFmoeo2svEsC>Ax^n$+4!(|F}l)LfW6>ou2vs3$-*PL|y<&&;Ye*WTQ(L{4DzG!xPv@d-WF!JRIN8}Gsni%E4SI|vs-Po`I*JicY6Og zTg@*I+iK@w`?t0lTV7+iaC8uQ65^ zs%A^AtKU8^VrQh2TB)v$HB-!W5sWInL21E51Q)b^uC~(4*e;Z zK9;ThyPZju7e%W!Mynd15mGe&CjVz{Xwimj)sg(`CSPS-T1Q;~h?@IvvXk3wr}&}# ztJT$NQ@SbG^mf?NwfRFIdf=$o(X~76OuW1(mV7y?c9~6k!nFUP>D(B!G6T%u(O`%A zx}{#T`Zv>6X6Qp!aI5*ir&;<>O^B-9rrO;8i0WwiAl(0`o!poXWrD-i*d<-2bL(+4 z{b?}u6+3y%&S$vorA>Ro3`WM-pRwt-49cim1CywO3_o&k!RxVfTTmTzUj~sNsj}i@ z@IAz+8$%mAxib!^09f#_3@}!)-MIs*iI-O=oRD#&GvrY_xrr`QSKG;<%S~f-afJWo zM`YCfk7-m4eJFE2K&bP%GF8E&cG?Z5k7jGP$dpuDnS0BJFp+Wd`Qw5?-c}h`opv&Q zLzkYjs57lRYpH4FHk}*B>bRl%_Rw7LP$uHskyV4}JF2dHbhuMo4nf7#8)ikTr4BzQ zSZjJGtX_*HU+lk6T4nBiIG{FJ6>q1H1sC2DOTJ?B_Q6_8IR#6ZiIr8^j8#4+u8v3Y zYK&DacW_lR=AZKfn!y%BuZiOohd=TAcx&j)N{br=ZB>Kl!v zTX>-}25MNxiJfeY2RF2z?a37=Leq)niY+{&b~1&LEeqe%)l2mQ+uH})GO^oa{?Afd znD~N0YFlK>g0}i>st{<7RjlvWo<5#u#E-?2W4a);kg0|~+WtT^106V{K+wi{U$E7t zSTg=-$oW2_9J-X=YTxSEVXAB0OSGt{38>Zz_>v~zX?cGtAv#-aabJOCW67E>vn8|y zkOGXX(xW4l?#ku|V$R4enclLT{G&mUg=i>wK?p<6Du-pPKZZbqr4GP}G!FOhv+6X} zc4`OGul<n#$1+3GR^%gN#eMK}BKbxlxye%QDCbHC&IyciihuSA zpjI5To#+8OvTw8#`kCA?ji0TJM;dF4pRF^)`;9t*Bz-jBXgDG#O~H8ZSUlK?xG~j6 zv&qeib?lBc?F|{1)&vd@GiHSHh8crG0dQHAh&0viH`NA9ZL`!qGrgzFs%W$-n$5_D z)Cf=%&4i$?4NPWKg;s#>$d*M{AqMQn();q_PRQYf0W(vuBJLFENEYn5B(h`CfZ-Xd 
zU)OMD8C~jtRncZS(~+xV^RmbhE7D=qzw3z#p`=RH8cTl^6m;0mkTf+VR}`ijgA7FK zR6$yu7fT<{8ug3hA**}{oHFjtyUg+CUFO~91k^)~3?^hZ?afP6?7qPKQgpknNLDE~Unqg6+QPwTcai?%v!rn`bh-G$uXEYGNu*!v3-DQdf_ zGn;)u6i-a8Yx^fI;O0dAheq9J@+2Hp_lfS}&8XUAtERZxXRD9fN{(i;TC~w%`Dzeu zse8){JpE&;gU*QHUl~c(L{wY-H`tHGlA$GWr!*Z)TK^HUop`zn9U0j?_q*s(D>KMa zJ3Dr&2ohuA7}NQFHlF-ogt6MXJOIlTjQF~Bd3JUmY4esD*_IlNv=}7Hr(zdU1}%@= z2t8^E#WY*}8@-naS*vP6GIA!wtL8jrGo!Yiv~K~Li0A?YrKR3Sw4a;pcoWUK6_$0d zBnmP$U!Q;Oaid`uLxYYUQdPENhHUj_JhFM#9qm(Pel1bT$z1?C_ToTXwMMZ{`i=Pd z*o(7s`gFWzs%>WESgJ%rnf0Kwo(q5gc|j;`r`Lxp1>Sbr=|%uNIHx#sUj6n(zwX#B zx76)j)Xm+`JjPYFZQKzNw;#8S;Rvafz}LN~EvNTUBzFX>TkQ)P?T&5fJ)!E>mXKEH z)NsiKQyFM~-k&EYRi4$-$h-D*fBO^W^D=eqk0Ep9xY(mVwZWsn+l2N8Lwnk4$q}Gf zCM9T*0;7H-rl+kw)s*-g=jo$CWBJp7VW;Pp7X^&EwVZpj_wWDcroEQ7Uvd#sYZF4j z-<0M;drPDVO$(O;vc5xZSR3>!HL|T5xU3k$s)*SoKUZ!|$fWxuuz^`Xh@TNqyY0L&#Ho zPgCeVZyv<3vD2`B{ng+-LX zn$Dc89t#-D589DqHO50Nc4VFLP-9KF#i$F>G=03#X!w+uSn9}RW5&5FX9=;jHGPDn zeZQ|A{7Cy-Gw^9Nia`zSCq38iwLKQ2Q8Qr}rE%kE}P0I$LC9a+@AxCiQTj zW@+l2pfe_DMs^s>uh2KQ_o~m+_2-;YpRembg$SXiUhPA={+(0m`{?@3r_}e=^?yF4 zzMrm_g|id;59|8$DfRuSUpV8$wgaT@&J*hfO5Mm4>k6dqt0&f-C3Tllmu@WV9-bLl zzi=RS$JqM7(y{r$l@Spm%dbLLK{5sA-1 zE4HSyg{JW+?P`q2Tg*Cc&Ei&M^YbD!p@wrWn+e?+qjx~~9>Im|;Eu`o^P1jfLT}BQ z=7jii^tK)n+VY>#8^oyT?T!DO-VO=9A=3{8;mmDDouO%`$s@S=pIN!eZ8*x zbA4B@>j7LRdR-6X`ubki1zdl**Y#OkUyO~{ROk;ivUTBEn0*=a>C(_Z_&%;aXsi5K z^klYeFh@_{)Lx?P6|UQhJ^4yI^Epv%ZF#!YTKu23UfMLKcDR?}Xyo*Psi~@Vhp{@} zd`@U#Y2WsZywABGIf`cgAg|5+NE8`_YGXwGU0-+>6~I)Xf0+6n##L&X zD7X5-7*Qy7ADCL2xno4)t3zrx{b#KIU}RTg4dBZ!I_Q>iPxPC(Y$a8V-TKO7oPI%D zH6!s5oQG=nyMMut^ZkRii|D!MJ!i>#+8BTL_`1cjKfHX+IqpKbFrVY4S$8MRL8}kx z<`1b_OO=ODu@4B=yf;mUWYE8N#AteZOdTX&7{P7}IgL85I_dw7C`#Zs*L^(?C zMHyc{Sjb4#mA6q_)o7~?wi;hvW>;(zlj+D>jDZ8h!i?2xGof_4QM)oNR(UB;G2!pW zD?Y-cD_e9U|A&kfUnk5ZmO?4&+Nl3j8xLOB_u60e-euI^eXfA9R>0^r>plJv=0SN0 zI1b~mh%xdwdMB2=4Zo}O5h67qJ2_Bb9E@}d8>_Djr`v;Ejx&~iz}sy!T+A6?-lYyn zqspdzmj(~DO3WbUj71qI2gKAS?G-o$g|TGBSY2BEz;RjsG*$@LG8kjUrHr_y`VFJu 
zZbJ*fP(roS8L^dkIj-Lp6!CJblWCs$T;ZanxPOR-pDMe3!5z~Idfa28=J ze3nqIsqPY?iobeM`$f6=k0l4;gFk{jP!!o|)ZYae$JNGIx+#>1yf_zgYZFFS`#*Kl zTRb_4Skz5Cf{9dIWXVHwfEd8Un6V?9X8m3JKaq__!$xqTkw^2I>S;w;|}qLn(EzizofRP z-RiJBR2`>Hbxa-PywZY*Dovq}9dTHe`LgZgwM3p)b2>hjtTa{+sjR8qGW)`s>Nkz_ z`2e9GSG)C-Vt3ni%Nn>dYqzY|hbzW|l`B$$w0Gjc_wD3)cJev_R2`M6jU|UZFH>c# z9txctn|mPv0CfzN8CA_uwIPbPt6sxPjH?|9OwBHNPfhm(o7LNA4>GHd8R=uBa;RfG zfg@B?{nFeDQ9py#ON5GuUyd)Y=CV?-A(23-D2nqR@uAH%)%%R}?9YIhCe?6|FiuVN zt4714^eKn1ylhI42UPo8&D9}Bngyz$_4(H{p< zr#Fb(R!Q!0Bu6X(n;irh*K4c6Sk<4 zP{6o2NUe6ppB;=j^NFP*+tFfr+_9tq`N8VPgjR%5)SE)SR^*)pvvn`+U-JCU4caV4 z%qQU0$hrl$IjL_1!L}87-*6@&o`^h4h;;thLaq)p*q`Z>9`VLP1oF2+Xi|-W*_;#d z#J%SKZ=%1VKInA*Zvmz#J*_+xjHN#gaT3l`7Ycu*KgDId33Qtoq`X9&q0{O(gR5aU zq56$RT|O8gxJm@YSpHoY&rZU>f&>0TzQxpA(9ey@vCjpfs>@Wj1)%J(ryQ;AGX zD2J=@&k^3ci}2dVGS4A5&X5KFuw(m9;OFS>LPUU?QYZ@Co zORMK;CB$U10HIM0sjP6orVvw+5Yu2!oezB^=G2A|YSp~XXm}S1E(nb~17o*0i&f?xI8zrWjIv~-Wibk7gb7X_D z;v*~*Y`-vYV%&FzjOE9O83_lL0&>Y@+nBz7a?F^%f3h)cotTd-ud(!Ti{12gzH<}A zQam%gCfH&Xr=$m&dSj4PJW(5&+sG&rQC)_m37eP3)F%nGuE*wdSLpB*GJmV>e5)%_ zv6&&E8|{Y267v~Jv`bVpCDevka9bj>F5yIr z@JO5Dk@e!3M#D{(GZI6yFEE(t&OD=`3#-XgMY({s1isAl+g(xgJ|NuW#QOx|!S$xvX#M`;fhw_t z{Vpd_)l}uJ59l#3rVEMXga+_msbLH%9l(eUIHo8T*;xA?&CT>)?3IoB?ZHiU@Te$O zz=$-ZhFFMh)b;ftf7WGEB{Xai%Pr)z1?8hvOi} z?U`a$l@MUb*M+zlED}wPU7Lcm@6>c}OlzURtAZ=AvjcZaA?Ua@j-!aUu-ke_j3|um zn6scOu2@2^At2d;if@@c9Mi?j48eTeWI-;G)bh}RuSgWSC8X}?qQ$H?b?lO+|D>Nw z7Sa3>TeW)49|_I3{06h$iA6q}9Wpy^1BH9+iq38)t`!XA_$#J9>%~|H5BqW{bACsw zDpH1<6z<#i(s?WtDfi@QpBpW zvHXin1k1D`U*(CdHorV^wW!5l^p03KH_i?uB#jqg(JnEwlPwEb6}5Nno+GgYfI zFK9lnSBT0^A4U2}BDU6VLKlrfR>VZ7t!McHZMQdH%`cdVfGe2j*g=*gjrsvXyq=7M zs2&oc5>Y3#loPA&lLtsVh-@}i45V|PQndz!R)tQ5UQeRc-7?D4(&|gyv?@KhIfBg* zEB(DBoU{(w-*F1bUIHZ2tu3AbHt{-b!&6Z$riwcgisr_6y=+a61C`q0cK6DS`C{6A z+E*utU&>LO+n^V2J_?IXeTBQ1?z-*a z)7#aDjP&1V^?7!6`sdt~Py#s)d5ltMw_A@sf`#pCQfD=EFQdYav56fVz<$BBjzyYg z56Ap8GebST-)TnXdwegqO?5MtL({A`gs-J7Yt4`vd_UCVdua~em;3i;y^}yer;DI} zvjf*)QinW{@zpsrt9qX-HT~1CrNr@ds=lrFc}q0dIx-S( 
z5ILlm$J9LJ&^D1n`*c%Z4t<>!_vd5iw_s~D5`R84ztnFq>m57NWTbCn5w&BGJ6f-$ z1~Szqwhl=sf%av>4|NjQ^`~$>EWLsG!G53Ek=7o)y906Fb3Elf2(Us7 zR}1lryXCIMiego2MJO}C%JkD3Nc*Y7y3|cj+|k+(+#f(Wqa7xnkR@#}Sj1SArWWeR zQX^=0u=YLoUa`>`2xx0QVfHkh_Oy53Z9KuA(|4PD?(RSmrklg=M-YiG#i)8zsv3ag zq^e7`4+_Qw$)7Os8uK#yk}oq(W}6s5!JQu3_vQ=AHKH~B&~B?Y0V+Nu#aY2jQf~@U zno=|^EJla>gA%};Z-qsD3mL3secltNP^ueb0N%@b5uu(|d5660-5fdU&V^*4yqVm6 zQQnL30Fa_2sMHlX7-|>$@86y&eW?8b8_Ls0NNA<|33Kbo8245Yti5iH;ud04eaRgF zhT5Ll3HR6Ek9>6>;=jo~&I)qGyd%M?s!nGhEd0QJ8~t%YD7cymLcz(p^2MB?n)FT; zjJUZI3`sImG8%#ROtc%(aSU9YnpC_WSVS>bp zaqI{-5o^34^|r2!d9}updD@+-lku_+P+rXY>k5DsqnlwCla|u7rw>D2!^H0P@YKahn z;31|Od+>072vk2t9J@Cj;z2iu2Lz81OOEpKz;dyXhp9e$u*mNd@er5z@EziJ^~Qs) zHN9G+A%H1}-)mL}jrva*j$6qcU$5Aix^f|vUZMRo@p3}@LQ%a!`+j>FCaEvy(2al( zo8~|5m;3NZMo|vF(r)-# zy5Sp0{N01ijXM_DP2MPQ3S2?fVSId_0ADkqqNX5V84WMc4-jeiE- z;k)i3$!oFD@nI83t{;wELD0nF>xZu*aSuGX#@ctHq=u^3#eJ3ZiHetnx`?gxvx6I= zk=IgRi)KgxDTyjFU&Js^T^`Mh!Ic|_>pBj_Fox(?UaE*ne7SMBuH*3M#t@7N3g`Yt z$jP~~AzJnN(`Yxi*z6kZ3~9huz5gtX0#mI?BpanhWLh}x)O2NPx?rQh+Ehv26^bPb z<5*a6=iW;}_}w8P_~1UgP1_ky4Tz8Io?VrQygmEMxEfO4eu1WoX2vIYCwMthR5e*h z65FYYR$mQWWQLUMTUUCwjE08*Eo!Vj+gRP7HEq!cD;84)$%Kp1~bolTgj3 z8enEd5gWxkqo{%Kkg?=JA(G^cnxJpxnYc4#xqH2^kmOYh-@4Z!p8fftFaL2<*&#C% z=KoBX+{O3J^kHHr@}BlbCa>hp!P-$~rhi1hL^)i z>P1MXqt#4rCiupZ&XvpEzf#kq0_aJel79Rb6Xxap{dx5}yo0v~d%tJaV@%3mQ}uI4 zqN69T1dYa$4~0$v%%~S)B(NPR+Gtu3(aPw*OD);nZ;hwhWz_p7*wlnlZT7GF`X^%vU1v3DK2Zutn>qn(f|aq*WiT zj3?I+J&3h5%j!ow*oYXb=T0C|(L6d?FbLE3{=Q~MYXTUP#Se@|sPO`++MY;mAMMC; zP%t6FEt!`PJr-=k+%k}H98qL;urn6yL@La#j7K)izQR_;IeKPsnFVGfpV6=c^1?9? 
z{Z(XE7p2~|GsTcIcfRM{@lGJ;fXMGiqoM8*D(-mMnRYRW=5{ip^+ZM`aim4)?l|EXWjl_&Q8#Z z<65!C1;ls7q4I2nH{G3Fbhj7GGbCflXg7qpd(P{RcayMN+p<&Dnx zYsH{h{u_uU?hG>$6`e7}xaHI!#z$Zfm|Bhxl{3}UD+uY)rNkkT<7Dj?=Mjc1?k7>m zJTD5lp?0q+(g`NkfyijZ%U&$9pdYbMGt!)@HZwtlIT0FD@ghX||1dLVNW{5BAx#*E zT4Xu3VUg$55^t$N@Ud`VWmx~&mSFKRb3r26596miJlZL5NF+CtqKx(~C{HBs#r;l_ zahAMKtdP;pynVA6!OkM3(>L~wW!0_i^BIKgfP;+>`?jN}Bwx2@f3~}ysn9>96f+zt~z!PXH zyXgyH6V51LF@fbxM|#@NB@p&iJrwu%`V}>d8I!k}=Z1B`t`ZkkCOpM6 zIW+K|xx3rf`|#xObtR4dGkie@?7h-R+dSt6NmWpd5|`AglR3PN0oEdmFh(!DZ9WBW z8Rt7Az$4^J@p-|Yix9PGCFU5R7PI@ayQ{ToL)rI>;n zB*34jSd*xK-86pg#rUH{yJtWQ9(zraGv)STFaP~IA6|#~OM|dhsgCjsW$GwD{I$ZYDB?XfqlX zLdRAnU#4s6KTCiS;gg=KD_N}EAb}G#0ML8ub5^_{hWZcZeIg4)a4NC1Xw~-T#MVkw ztaY#D@t*b`9>3C!M{R)KPdLq;$A6PoVvp(FNRoShWy90J5O7CnvaS=d7W36;xJ`V^ zvEo~<^zG)`#ew(j`kdW-e;@ra&#?cm8P1hr*ngqDq6c2F?Yr&gpD?D=*w1xdwb8H* zN#1Qgzrcn2JpXBZ?#XDm-|@1gE^t@r8#QdQ$aAZ;S>}z;&p6ot2CF_4pB`bk8;yxv z+$-@L+3ZnJF7bxqsPj$4)%(VZBEc}hhj*lTj6G5B5qj9$M{*`5dN94w8dzN3za95~ zt&YwR0HZx_>6ZD161T=#mgq&juzvjwt3$MiL$M ztXpav7S|Z~Yz;w~4D2Gt7qX1@O}f2qoU@N^#y%o8KZdAc%dGcw2!VA`ao<^9g@j|l zCKLp!!L#{-?yX*^)Pj=e7yIzP(r^2((0?^Y*5#$njOj>x0{=MD7U6V}pdhZm_ECkj15fTs5(rmnd4cS|9JT9; zzV-Qdv@`xevX9VKIlSSUbmL8MW+C2V8gG|svOq%T!*8<|XcEx5h7bX$cmJWCFpoZv zlWthkj(4r24y<{9f>EG(!v+R-B>#DG3OE)sf1a2Gk4Oh(gm5fWEkc3$ySXb7*~R>I zK!-H-=b0cdhPs%ihZB-r0$V?aa1NM%4e>ug1$QDw+X?LM-Uy^^&#cog_u%vP57^GI zkOX~*m78PP4fzk*M=&n)RQVHhC0&ps#SF;4f-SNit;{wolA~mgQpI}nfun4E3M`VP zE(mkHXp>1ttjJjY156j0vr2CIse)udNN+{>58S|+tSeus z-%Mf{iDql;KJut$g4f8})NR|Uc8c~KOJWmCaeDEaXG`N1$JB9I=p*wjekftYKQMLj zC%GlzCxfX(R2@%{W2)AA_>U#mvj&a>Yh?ng)(vqAWPpcf**%o&K(8G|X!! 
z`Qk>06>)C^0hVEM5uVL977DKyPl1H**qg zjyWY#^%L!ed-)bCe0~-p1$?MV`1Yg?U#!;4=4*pA&UIkFqa{G-%BZV>T~w!B1-SZ@ ztERRR+c~#0Dq1%5pqXwgalb~jhZf0Fd|Su8bJV%J?f0(;i)l&phUQB<8P)N~5bu%4 zigH9wkS;A1m3r-UmxTM+5yBFAYp0m|B|;U+pK8pK@+EzinJ)k|YI9gf@3Kh*Uz1Rh zGrtc(yJM#DqxHzkSP~brfUR4Da3lM$Ff3bvbuOzH2Mz9Gmd^FCe85QG1*68*_GsM^ zqwZ$T!CF|yL(mxD9o$FO<3GZ=c4X(Ae#Ytn)#21TkOmeAHzent8%9GPo<+1W5f$4E zC2O4>tug1Dq4eIMS+T)lSmYfsvW>=yZ^;-k3?~-(yU`%uzlzppYx6qJ@ev_emIQ~{ zC=##OBaADkQ9IRoLCddcPN-%1FE~bM_?l5C;bI<*Iq~wKr2l*xgD)h)Jgt1NP6dps zS4Br6Q8ve^B*SrC4}C_C8_PG)8H>XRJ$A#gN0hKctfJL?o{rgHvr2+1-*b zh949u%cK|1_k>d*s z<+Mv|Xp&*K;$b2?2CSt4U`ta3y0uWtr}OKO;|qdj%oFdU?e5LgMHY5o@n~M;&COEV@e2JlGryu2lyrHbf4l zzQ&?E0WeAND=3#-l*==tFj7+_M9xAd#Ytx)7o{jOENsfC)F^yFbg+E|vn-ZsWr|qN z-x1BFcKSnh(AHiMi|jY*_kci6$?jq_nJ0fUQJ9rBjrae(`*hyE_N_ViJ<{80Ecq#_ z3%ZI{W!)!w>D!w`t~@OxeFhGKZ3o0~Zex*Mma{MzGln;dm&0Y;7}gqhW@<;3`P#!m zvo~K_=ZnNb&1MFn_u2P3%N4!CY-L$EI6EFZWT`L{viuH=2rFZU+MfY<55Gd{I^WJ6 z092D#!dz}Ew+YwNOoSNR1-#&?*PNpJ0y^01(oXln0~!Um0AD zv6@_$bDycniO){MSpJ;MimXwb`-`EF>Yn3{eN1(}+{viz$6AjfX47uJm-n!zO>Ovb z$5AI947MJ{2|IsbLGK#a>kmy>6^}QPeO%GR=(SxTTDF=!No9ufQ*0_eb^uDAmj8^= zO+Z?v7i045j$Q6+K&v`|;{xDo>I~bD`7%E2-U6dFbrx~&vv0^8`8SH(2sePG-@>cB zA-s7V_El?V&nTY3mXQZ4ni$B-og{PU&SdL})8~)0&J;Y|bTpR+$^6Rx3)a{4{tI&0 zu(o9LMZ<7S7L}&$>VY`i$HMXI@GBdA^>Me7gEp!4FB?L(`}WP&tla0&wVIo)nX09sf)nHdtzuT z#EP@j+vE*d!MFX57efeno)=H{B}xBLq6Z};{#f)?6IbiRY0nJ0$tmb3>fI(o4q%3w z4|Iv7G*(m-7&Oye?8zu3=Q484Sn+4sVgc`{W7x@(*{F_?!`OvzhKg52S{D?V*Y!7? 
zUzRX2(?mmbFc7kLu1cr_?88jSOM?g!wUwbb_W zeSF4~1IRiYeqF%|qkbDr$ujiP9Gd68tNk-#iA%&h#)_2>CO8O~iZ$_y4zl!5N>5yW zVtS&eBQJI#kqR_SurU!?pKwN;Ha&6Jd4dZ`Ppp=+lhPB}$1icgz7oACko3fYz7iMg z&Q2t50nhEl*Vr6udWY=9&?WI88)-j{R%|vSM^hvx^4W~Ss0vGRB4NRDGcyloakK=% zyyQfpf+VWV!;2bC#KM;3MD{p{`ChRzAsdFW*w*VM8!!EKOxz** zkMGL(BaT&&%lTTIMV!bs+Zp3$ChjmEYBs~IB)=MUSFlegeT?CLffJyw+MY|s^z@44 zCCc8?;`0cC7BN;56`EPiYOQ_GAFCx>eRy|p8yf|QSypV4kzS<-io91h@U@QArT#E6 zDrK0wF;zg=GqW(P=7rVmXp)b}C%%M?QfP^h+L8I?MMhmA^eH*UV(*#H2zNnDX}(y3 z0Ln3{QTGH3qCT_gZBOXBDR8WnDR{t4Hs`KH#EMo3xH)VW&eVhmiZ0K)0N$nBvS{WU zSl6>I0} z*@EA33O)+r>USY lSEEF=x!?`rjINE*;%NP2YdL!WAquBBbryHc__EA7g7>G{H3Hhf zx>d19odeR%j~hbi0=Nz5 zui~y_@v^OY*5&S%x|;neXJKlIq_b3^`aZ^i>`1}EDByma^;l0PF5IOnJAf=k=DEME zY7_lhtZ4ED2#L5CO3!8OU)J<4`(VNXA1r7}W*Mj4pKZitKbID?3tks|>KEdY{Di)H zUbyJ|+V@TXtbKE{@{hZu^_eAk3qfB-wdJo(er8GOA%b?W3q<7L;E%GkxQHTg%TQ zK(9HmkgQ-4IjC=(w_k@HaOjr2AetyLB%P{?vuguQ;Fv)a5E!bv8kOPeqUZQ-tvsS- z%+PeY1x*CMH3R&s#*&fxF2W4$4nl|{Bh{oYQ7ISsk3v)fWig^|5(b&pC@vbsQUkR@ z8IMB2XRjatRB$N?CdP_YG?QfMYONcXRz9e)HK$N8h57yisnckNUh1HwPRm=ouIx$b z09^x(RJ(V`T37vY@ly0>TeK1{0_#S7luGvj{)3K)Z634Hv6Xszz!bzF`&0Jxv3!AU zcgy)@S`*2#Vcj(%!2x(2_%)=p4~-xgGS@2GcF2b8R@du|C3pGQsf?=*&nS3S+Ut3X z6BTo2qQ_gzikHV z$eyTLOZU3#kLmj0+-Kw)MpF?hSSi&6s${mFfK$!jjRlbZIFe0@HF{4%tu{jb9qFrW zj}_lQlCn6C6*8KK@u}QF$$gnsLt?t@R+RvIQ-Z{Y z0DeDxw#Z15q#FfmvBZy)Wj~aiLiY8 zwV&hZQ_?@I>p~A2_3|qOvg}JlB@uZ^{G|Jrg+8XnA**9 zRU*O;#?j(GtRX*KmVKk)%~8FzscI0`uHM`9Mb1RA%I2o_JRG1uye>YLg6n9|M0oB;QL3g(Y5C z9ySJ%n80XpFq)8>t&#`0$x8xk#iYk`+7J&m$rkmBZPCc46dod@VEvFK(Ej+MB#%h2 z%*wE=$eJQ~5-gHR+|bI@&@jeOZHgI|!8ipsuyH>YVH5QvtaDs#qv2meLU+wtnC1j2 zD~3i|aYH*qEb({y;o1)xi60~ZyaTSR;EUdZ7t6$~9v*Uk#rmhE-Z6fG8g=JF zYlI{)c}j9&2@G+JlLBMuX6Q*$M${#V67w>03QaRy7zQV^NtnHU# z&wRLGf(WmI_VURqWgGwQ82eD!fF1dC!3ZZVn{UGr*KmG_d0?~r#2%znK=LUGgvch2 zN5xd;^P)N#>}N_H8OSCHNhQa{X7B`C52ac=7W8Cj1V zF;q+>Q~)p&Q?8p0yYp0uWQaD(uozKMJjs$R0s}l@LJ*@7?!Pe^`jyG9Q-V{O(j~-K zf(uhsjruz~?U$W8o|NpU3S*Hhm-HH&^}pl}Cw}U9*j^>cZ6}Q9U~W8?8Z7V+SD{}q 
zqM-ijUarD)tnw@$uzFtx#3ok{UKEs(9xmH_lG?UHmOd(VOTI##%zF|p+GJigWJT>O zqW#7C(pv3(_bf?d$jM9mC3m4#!gYWzqN~yUId(zrzTSWO2aqRb*%Uu)sQvS>JCV36 zFpaZ=E=t^f5T&1 z;FAQg7J09Q{R;v$Jux`XVPk zmdpFf5FArGp_jXzzCP;tegOWbqTjYUdeZiOBNQM3-U|)lC zP?Yl<`Fz6qxn;7w-@>o$`F;ZW^8RkWu^^r-@)gd4*L^Q_*8gxH*5Jr<__4X~CaC*; zKNhz#MFX~X`HzYpyMAuky_;H3--$kCx#dU|gKZSkoatRSU-qO|OW31X4B3xx)5b}v z0=5BaN@5`RNaO6sMzz%GSO=4<;~k?_UBMHaOTEN1DlBp0b3`x$*U9^u_JF8 z%g!O(spZ@9I-$H%qJexnaoRfJh$GnO|HXjd`p*dn+ED)!5b%Ez4V>@+aEZ8PjYEQ| z?q38@Pktlf{P=2-`-D`B@4>AC?Rn~mr9>ko=R^XLUdWOA<9d0X)cy_#a?!_}yV>^s zN!=as`owkB!#=skUL!eOcR!{(y|K2%y+a1%hv2^C?na7t>#x(wFXC`d(db0^MdjmE z`rA@}gLLNf1mO-78R-W}=F`HR$OH69*iB@#*j3n*vkr)1w&XjsaYFZe1W z1kExhhvb2AAbhBZN2%cpYeWRps-aI%Ax#B^xrlO(bl})(?&)B1F8b`gidvKIs^#{<2d5X3^q!Z-BwYi2!@r zrQT+ed5!ijU+M<C59`k9ox-g^@MK>?ME(B6I8_p^b4K2knF$X7WaICsI4L7K{q!EHw&42oV`lJsrk+QXP|jT!dz|?qt~;sXAxmKd9<&} zXbG_BRViNI@?RY70Q!&RMyrRA+$hk?*OVEw9;_Z@Zp0$WdRRUuKIDxj4-{l*1axC| z0K>?!>-n;jC==ZD5u3<8J}Pnl%*b+P*OlEZKBk2y8cN(fi07Q}kW9>+@aV-oUa+TX zkA9hET1g`E8Xm_h?=}g&hG80xa>g&`gNJ-T)z5#C4GFU@ASay-aIoB6cjFCoAoKS2 z$)Q?WG9APO*T{fH}_)g1X|@&q(b`Ukxb}m?VSA}*SP6; zQI0H%+@I=7D_#D$3Pw=v9yG>mJGT6p&meKD8atZBAxf30ZE_~=(ANmZ=={%X zIxj_!h8kF0Z9nhGdO|9B+~Pw9{D{pP-mPFGW9ECud~(iC9%$;kl*?j!>XF-TyX`i9 z-KLCmfF!B%?01sSe=EG??fC6Q9lH|FkX3>t;&G1Ry(`JK33DXw@J>}@`?V|il6hY~ z0IotAK1R=%Xq;(vcs_0Np?VSEE3{x-)kpvuf#0z!7I|~dMVW!bkm7#&4(maGBg2G^ zN0B$@zUOtfO@$s}Lc&~~PYRC!#y8Vq!4~~B=VkK{H!@*^do7MV0!(DlR`*&NV;Igg zf4{^}p3(>k>uu~hb?TaC?no3FycM~x9~W;A|5}nWCHO$%ED0Zcl&heeA?vj7wga!} z_(^FTcmS<92#5@_blq$jy}@M{?*^NmDHlAB$_6XuHSM( zKy>N<;+T*X{D0v@f9HNFW6}v%>HS5vEWCe1o*@^dZlR1e*~E8Q`em2)C8tL(UX9^4 zDgJsKeEKJDV2^Jbu{3@jVYlWW+c|$kv)cF4jpC0+HY~WnevJK}p$BP8Ki1L^-+XHf zMw%Aw1OK6e~Cx-q)G|7MEUl-l_`D@pzdd}h24WTw)dXecO{== z-UyPg6UkuAB@h4d!vERsbjy1U zv3phbeObs3x9xbC?V*jv((|a7@3@9}J=55aOFupo+buH=bmHt{CepXa+Y8TUu9M zrkxC)xZe9cCJoBZg~FVP*(&zP)$vBa<)eN1qmuDeW&Cc2vYhYb-ZVm`R*^#eV(lAb zPDWl@_(ch|2G`L39R2BvHNjCyYgL)$Obn8>`@jfw@2awhTl*G2(s89Tfj7jRRT>Cy 
z97F&M@-p#dyrv-CJ(%6KA1Lyv507%DT@`Ho6ur&kW7KX&r#go@L-@AA5$ihZsj`J+ zYb+LcJGMogu>*qXJ;915wMA`cJR+9iAPTu^u23_YI@|2apFdl)9W*&#cn(kC-afpQf&z#rnOFNKQ(TVz;$z zbv(d zRBd!uT{t_lRk%S}!!bs?qGf z9vSjINyxANvry@FW9e@>8g;S(7?t4vZkQR_jeHStvc_00I~LetR*(E>6mEapd{*F& z^6~QT^J8iUYB1h!%<7;LIjuKe44-7Gs~X!3!OS=<&*GWuH`45{!~+9CiH8zKAV6e6Dg?xuh2JdG~Jm$egeAmy*aLG@lhnTwxy}50V_Z$>SM*ecZd`kBNzAZVXydmECt4rEv7O0)kLy*@i5Z=SyWp7bL2E1 z zmpzU$S;|)~ptOD_0Im??g$OkYjQT-5*o$GjYBJ(e`+U4G492mcR~z}gw@(mjxL~RB zPh?kFr+hv601JEaig%6WPv~b-pv?BK}I<2YuFeslY!E6whKL${1vYsF~YX5^PQ^0*%Pct&2an_;H57|U03 zCn|&ApMz}CVgY%-0Ca#^-w$-3$meT>8jQMsKx_zVv+P?`M7-zP}!ZhB|Goz%6#^m{?G;^UszZKaE-dZm4ZFYh{D8R4X^VJ&yNnSPh|kS4v~9S=;JnA|_p zT=Tp89nN~jEBu>aTKkzT!ER2y2|R17Y6K_3oj1r4rW7AbEH>hWniroJj3UOznytrf zyA{i~luvEE>Ios5$ z;i8>Yjet};UZ^BzZ+El~bF31t@n%c^q@~&Vsn{wboZ=Gs_DZt&NBpeP!W~$=#pg#W znn?*>xUpBuMLR0mWTd~whP>$wQ(d6D*XFwgpcQ<>@)bU|6jX1>Zzr;c6k&XM`=dU6 z(rf9Zq=d}-LR7siEp-E9xy&=;x-yY`eRQ()U#!TRi(cVlFs3u@N&R7&wE0GZ^@9jDf4Ks=TV|%T+_Z{GDqi zf35P~uT4G)7g<)Pd;dG>w!SRlfA+TpaC!q}l*=ezqEwE62b2Y2qlQ znX<+&YdNm-%X*G2e%ZirqhB_0-0YVZIZAnnvc)f3Id1byD@Q5YDKGow6^>F~rR?y_ zPL5J`QFi;~HI7nV_sicozTuZQIZAnpvd1rbIZAokFYj=a@-F2)zr4>;$_IXF<0!?Y z?DI=IM=AUL@*zhlANl2DjtBg5kfW4CemTrh$|rvL2S+JK{L;ZuN~d3ra+Gq+FQ0Oh z(&d-WI7&J0mn=u|VL9kUiXg|llS)3<|5*uf|Nl+t!}I?qOJCmq-?{W-{AbF4UmTp_ z|15`eWmBh@ z&7C!4`gh7^%({2dENM6P&hD1dy6lFNYkHpVd3xrI*;7+fXG}lgMf7>{dxIC{U$=_) z52BRJ&Su5teVU`#fEzf9ja5>c&4w{@rm(I(jbqyaV*DJ#4cTl6!>5E}3CHT6X0!Kl z9Q<$Dto$C%V}HqJ2V(?vlD{*LW82@e*$o_@ev^8PpDCnD&gA$ciNx>9HEFif#Hit~ z*gFJ*-@hdgoL3aQsGwh1)`QBaFQrVJmCZgOZ3>G<6qbxK2F?vH4t(RnuYc*Q44YwvMJw`Rg(b`LqlINoXjtLkWqqu| z%7(sHVRgD+yl`bOe@b5AW5IESPXwcdPX>n-{xSG%sr^#^fWm6Cu+l6XJgl&6SYZi0 z4J!;s2L^1q6}*1J3f^My6Br@ypvCfhTAuO_t8h$ySCGzl?Ma=D94LL2gO9=Uv)K#7 zxxTCu`vQuFzNWxy6;90G()YA|nF9qsGwG`3qcmFZAJ{<6Hh z2MWuE=kAy%+_|SP%(cf$tLT3zy!lS+pAaqY z?&bgusLb8p>E{5lKt2cg`Qz4e7c1Nr&hLMECKxp^@Akr7VXJUYctl}a*e*N}zD=5+ z6QuT5YH#6aQd7rk#vosweJh(?eVTV_-tM8l{370WGRK(%zjeCGy!KOT?SXlRsU3bI z|Bv$dA2>cC7k|3UQQk 
za0UZsFmMI~XE1OE17|RB1_S>o28z!1Zf_Gmls_@U^`C!I=wA--uXFxzK%8X$&hid{ zJq_N;1^#uXEb#Gnk#`7)OD=yGdIwUSy%ULD$Y1V0Ijb||`hfUU{Q2RJz!?8TLJ{&; zBYq2i66=va*^(rGo_C7h=UwVGCw;wloml)v-6L!i;G$kpq`4esUO zZ1mfSyDfjY_mTLi;cxT-B`&V7b9m*`H~M;|fB&d|?GMN6?Ek?3VSheOes!fk&o%yW zjDMWyA7}c<#r|=ne|*9}{?R{9Z1j2_Iehrn%Ld=}?b_+7+Oo)1L$0d)a#gLqs#Q9E`)a!hx^~v0X_PU+V>f?sf*5`NARj=mINT{2hPOR_KO?M~O_YLIc z@5K6kf!w^GSP#{76H?Ee{6PP1I_s&GbyBF>$J>eZ0|RBJtuF{Pp0@t1z_!!Y7X~K! z{C;BpMmHaySWhP_d)Ij9`9w*>KRaFh*?}icTYpX{D@8$G~%NqGJzzdk2#xzvBx@_(=wJUMwWLf4;)pZlnn{&VtMcfx=7>-MMO`7!EG2mddr zmww88J)mQ~)IIe%?M1BX_Lp(OpN@rc+)N2Vv^A@wmOtV;^7jfw+Sf$%WkAO}Io(Uq z?Q?YHw{NHZbZ~w`z4Y_MSWfuUF-MNw@09_{pWn|}AnIfm8XVLickjy-5at@HF)7>8(@w=oY`9lj-wrU4JT_r>M_6jokP?^=Av6 z`16(1-;S<7VVL^jaq7!Xtq!d5`#B&4#-EOj2N^WOhY`<#0DNuaXc4gK?9==Ozy${W239k=IH z^x6O2$@RNPQbgerPdMZPL2cIalnr-gf8E0F9^9>g7`*N2u9a2^S$o(_4F1ehy^oWy;tvhuYPq;zpiDaKTq`* z)ShLq{)+XlD0c|Ius*uQbj_IyKJX!0k{LcXv$N6%>nPNeReTob4N>0EtZF7{MR{nP zr*wH~vZh{CHrPDb-g@WFXB*M9oR^e3m#&D9_uR@%U0Pepmvy~j%0MEv4A)Ha@-ZJ3 z`o;zBm;i;|(Sot?wuqwy}?PHrb} z8#+2Wq0PFmXIYiAc9`W{c$Ov?6!?tPPa9Wrbe^K@A!WrwnxSiwPc=E~AX;n2@MApMx(@pJQNM?HB$`B8RrVGb2Kb78XJiV3N z>BfoOcWsp2VvFm^Md0COm#HiWpB#S>x%Yu5%5D70@J!-YHSY?>;5sZ&n>WEGv-vWS z)weFJOwy|+%d*PfD09;q6sa#vlCS3Mti#~C7OY%rrUG%BvsEettu3lxl6KV-t|qgF zMX7k3nIzat7Ug1L>n2Y_Y7$%MKrPqOLoSt#WVM^^)vC_@D{J- ze0_L3_}_KO)8rAAZzpZRb2ad$>H)}uASYZI^6~r)yahu={h98yBd+m(M{5ut`5*8& z;>GdhHu@rez~9k?3iz@%NE-PZf51oH_JDOl0U!A$5cwsWv_*Uj%lj-yt_MEy1K{gw zjB&#bSa7@Kj*yW@0C$uh_g~fR-zST2@cVp2^NzI|B7QvnkGNgh#%i|)Pw~lsFFA6| z|AciiCpXlt&tEkEzDRFALg@&zg7bv?BND7@T-WAyb1Uz zh>G9=zlr$BuYeyeuR8z_+HWI1-lqWZo(1;t{9P%(Uw?F1@kZp}CLAgL81a$U0Y8uI zJytb^L8bf7aeJxq$C?&ij7f z@BMwh!@$bk`&nzRz1Fjy_1xC8_Fh(Qj?M6TJ*i)&=SGiQ-82K=OR_s*OPV1+3)-gD0TGYIM?!w0dneI_diCy>y%3!Q~7eAdfr+m-_!5-%d;JL z>e&0ffn;_6$;t2jTi@fn4Ci{GdraLYmvW!~&rZJo`M$@kF6vT8x4Wv1($3$x_4Pj% zIMsMk$Nu+ibnokb47m5DjtUpRZhbd8?SJQalimCBf96!&|M-aeywoxEyqmdC@baJi 
zrH1(y_jzA54t}@asiTWtg-@wNXlC5R+dLF&l(uBL4UqF(_geBl9C+%7O$z30zMtDiqO^RfFL%egl5M}_B8^b3CvljD+NpEA;if<6%E;$JwNuKaCZ z0GIX`kox~*+MKSQ55EAu=L_Ih0xux-|BVIk<^DjfcadC{(I@vMuFm#kdpt1*{8e|r zs$6*KVh6s!9rwFk_}cLf{1=0rDD9DU+OQt%z?`|%5M%yJvUEXdSY(< zD_r>63mo_p?!;T?!j~_1;EUbzGTo(~#-$GYFM@daaa3^2xm=QhS5(|Td+wZyn&{N( zXhnsm!k7}Nm{wI?b^pwoXjS!;$e3Ak=TuFZdf%)nC-3w*6_pF8R!pBcXX>n(4_0|* zSIw@eih62lYUWl)uNmcuR=X$DX3npvo$g-Do;qvR+)B^%>Z&Ra8TT!aqPA8r%^gl z&7ITdRnMFgoi6u!rZ1?T8LjfvOs%M?np#~sL&|YJbN0MhRcXkqnfFzWteHFV8V_f^ znWqQtLEK>X?I&3Bs zT6~1eU^`K<;Ri zzfM`W#+lz444lEh84R4kz!?mj!NC6q7&xk3d_s?X;Md~=+W*z#(d!$d8Og1BY`wol zJuP|le!xePUwNB@c8N#I&ybuW_P%5?xmI0^qH)A-PhX3|bHr{+UyEXK#QwMRwI~!v z>|dp?MG-n;e?NUKip3H8q4c#V2}kVjq_0I`IATvtUDLX+sMdu&s(oDF(C_-{e%D{? zcisQ~i~8lC+wb~I{jLY}yZ-#of-TykCvNbPrzZxS0Hg=IqvwL^d*mEc4^HmYE?G`* zJT2~#^6iqDxO%f33Qy|RA-(C~O?p#zrtWRk+fPPwso-mF1^(pT>FPPD`(-?q-;f~N zQ#;}|J$A!ca;jVJMzi$z4Rg5Y*mN?P?3f1fS_f2d?fo|Qrk^kO+Y88a+Y`FIxG`e2 zg}!N6G-YS#wn$=bGBm}oUVKLM%iC_%t)s=qprAXB>VMR&+<hV#5h;>2-SgNhdF|7`|zi4P@Af&C!8W`NHt$BiGTATIC&AQd5$2z=v%(sO< z-O1WF^|&ukUfdXZM(C_K`B!z=q{oZ&HUJ)vkvS@$T~epct?t2F^+%0_Z@r%B*Db%b zHP-BH+UpMwZUme;YW+heq}{bU_4w$3QP~tq-0am8zP7OSvKjX@z3(@?hji~2-6{%O z8wYoDPkhvR)7u()=y;L`MDH%QHioSB&Yy&=jYjN%H>B!r)+@IeG2biR@s@A9o)~vh zk0rge7k6I1wrRg#dNY`wIVGe^T3sO+wl=$+U-G)O+5P}b>2Yvo7H$bF_4wjIk!j5e=%Hs%Qnwj51AeLJJpN2M54_Ax8rJ7JB}$}Dh1gu` z%y8<=(5)+5K;BKJ)fKVcGYXqbtJ|>lgsnG1)|-D5e1)3c@rSMbrezO400tuQD|hJD zX0z}B&G0V0T{zE#MD9mMBI{*69u1W0!K+HOM_QqgW8c)R)?_r0_YWDSWd?Mir&2*o ziEh#Cjvs!OOiF`w6ZDkAunGiEdD5qx*oWTWTeT|^-%dvAKh)OD1=)I>PMG}BBi6^J z^{FCfo5sJO$4We!wO?>+ZCtCzL6g<0cWrYJ*5Ng*tgQ-Kzir_@J$5id0j$G|n)UdU zK#^|kHmx>An$pCaj?JGs16e8!%hA?(7>5pN$I4fEhC$nm*yqXG9J`gvwQ{c2iF4Iz z9s}w}f+{H!Y+m|~Dwk*fkW41IL*J?UqeI;C!R95Mdi^H2g7vaO?MKq~STaM`%*}fI zJX*{=$l7i%5-eDoJI`Y9pve-?!pmd|S#5@udnipA`}MANrj1>oSU@Q_*{TGV$HU#U4@~Ka$;s(!twV;jRn4gmXBwL$XhTleI;2QCK6)yEZK#il(~+YGn`=Kbt#;FD)?s0> zeaywp;KvNMM-ibuNQe1vb?b#4YVstec#^S$-f-}wR{v$%S3dX?-D)>tA9;<+4KhE~ z)Ej=bnbx7u 
z6j-|G1NFzWN1FAfj+^|=TK!lgx(+9(hgPrSyu6s1bxf;kWf%|uM(m_ls~buNXBk>u z8{of-S>N#|Mg>oWpFvvR@k=fYYeS^|9j)#&xwF1et2?Ty*d0O4S@4hjy7f`VisQ-T z^FmEhV)egqlBR#FD>2slS2~b{S3CSUyum}?Ta0-4W7wqG(p}tW1UD_1tK^~?9~QB8 z8C_dV%NwbGW9j6G_4-(CUHGD;aPR1s#$Fe`=w!`6qw5VaLcy)k;Y{0s>kXMTg{FPiSs=8MvH0XI5Wxi^SJpxW}-2OCmFLlXk8hc`06B#QSaIVH``&v zb6&o0 zpx(6$Xef9ndY)eRx?XuqZ#qHSaz8fWz84{Qn0uSww9XF0ykgBX>G*t@A=9>T*&p>T zG`cpPGOio=f)0ky1kGEI!9uO~J2r9@jW29<_o6UIuh7f=;?e;=*~ziP1-{0sd?X~igx}YrBD2wykzHExCKYMmOs;bX4`b7Plj%X z6NkPTy3MdYap|!Hdep5c@Y57g>IeltxQ5Df>r+LUrkux)d$rYL(HQj@lE9xgb0>^Lenr)hFK9ai)evbq!F zyw6Cd992NnjJR*TnV7P{14*8QB%>P?NgkO$$4;c~W1P)%lI7aE*Nm?1rggDlZ8u__ z5M=l|^Bf2gMg%An+!QUW-?VUOcrX*YL$BQAP^9n0u+U=~<;4n-bh z_>U|I2=}kww(ODUhe{-f9QdZ>%zv*3<2t+uX#v${lpPZOj{Hie6AtHN+ zqp=B#F3{FB)3;V8*#-{;9zF+^unnpxEjeLmW1Gx`!6Uo|47%ZQK`Dwu9(=NMYR~hS z7DEt2FVp{4(_)Z5@w=LK&ss)pQ9_WP!k;poCGJ8TR z%i$akdWF}m9zcU9B*Hg9!4a<>+yF+e_LOQrY}12H+7BB`{cT#^8mT*Wa-p`mUe2tY z)3v!5yfJ z{h|YV;*Ct$rROD{?%x_t3FsO{*&rmK$dbLo3YzXn8vQ-K?#?)B(Sw zz!`uu6!@&(;7q`o3VakwrV^*L53o;x-|P(@0C<1`Z|Dsk2za0ZE88Sh$soXk6!<5- zOEZCVzXC7s4IT`5FyN)L)2%oQ&{+byGYuL7Xo!Hur9oMMvII0L4f+zGFA3<<1AcK#*CvA0X3)f}(N=#)gk=2gfd6@6hoNWSC6Jd-SUp5#p38g+ z4}etm=!HA=;OppTT@Vb|eBIC@$4ze=q@xFq>xt1WG1IE}rJ9rN;9C}*C~9)Kp1SIdQzgbo)%hpj?K ziUj4)Jm^p;>;IV!cer%;(tm*tkEiJHkkFy-ZPtT_v^rUaVPZ0+1#zL3;d7C#Wzb_w ziXP!>0)|Y>Kr~>79$heGS_TFIJM`#+A=5JG5wJs#E*LT`gB}4p^yq>i(=zB0utSe7 z7&0w`9sxV_=z<~BGUyTTat4;hI}L>=p+^@4iIzc+Ekchj2of!W9$SPST@WN% z20gY2J-Q%Bv7>ehyib?QQ?eV7-m9lw!Bilwn$um3RmK%_nyoh|~m{t;9{(@LhAb|dBN z5RL4q-+E28gI>R?2mwNFhCbSteQ0|qSS(Em%Q&R@nJCl2X7VRvB6f6s*yjmva%t}Y z52^l&RB(ERU=WF*+za+MxR+M(grxm4#ldf?kw29xo>f%{pT~ce81NsnlnbZS6T027 zlA=x@!&PE!5KqY~Xw(mV@Ti)fW*}!8cEVSN3+Z4gtpr-!s9PI#vHH=sPh=izMaeGl zk>At}ZCvz^%w}yJW$q4?#C1CJ$|j&-KCh< zer?^AMaXR~Cu=MBDoMCS+1+~6Nna$9|1nbCABhi<8}1mN7$rs*go>>mN%*cY;w9R; z(IurTvEucIj>}4|wnj~AE}$@NO%?va(vr`$71w1u_Z27`Upx_aizDFfKwiXp+j(Mq zQa7sg7HwT_PHD-3n&C#gNSepF_VDd2bV{i^L{}`ZUX}Lr*rU-v0gCzO5o@1ejY3xv 
z6~RK0hz4>x^V9Ta#D`M<4yix5Q>$OXxS*RDu_nx_EelW%v9TF}&I??6GUMI}@lmbX zx=}^JF0KAgbj6@d%oY(=y>Bh1g4WQ%jG@+m)LdFJD<`QvJc&hIBeub7kASfBrsMc) zUe=rTWg;Pb((Z+(_kB6u%V1=gacx3k)D|;-9;QrgPDna?DMJ~yUKI_lRB&BjSg+}p zxKyOqrnLhOAP$Rr_4wJ5L@sWaDS>%L9NjB7XXTW@48b1k<~*ZFb zeX1UeBpNWp!0M%1x!HL1)IW?&)UNc))t=SW2R;K!zE1% z0wL>#u+>yE(j(jRD${Cfq)pDBU7(h^&XP5hjm#m{%f<|VCR zp{jf_g&76zg)Y?a*k6%?i2>7LN7_1ESh&mLzO}+IY~`1t=>?`W4Hkb$N;Vxw_T{Si z>7s}knyVMK>zNQWGgT0j2brG4!v0*QvGnpfx}r7Y(*~h7vT;D>VxGC(z~_~lqsJ#Q zF-I{mp@xzp^Up)&rpL}y6q@0T8K*|Iv}DKpE2XMCoI+3ro&aXK5x-8=8bNubA$O!2 z!Dwa6KnZRho=RARwFT<3Y;-H@6VqyqSbG_XeD{H1NIY|SW;~m5qD5EcSer{rUe#9o z3`%loo3_{;;<>Z7XsgbqYw7t6h2!sR_M6sudT_^rOFBo1+^x-yz!`)MSZ7D9P`8vI zd;a-mJYy1eikg_=;LasGT=^t!xZn=0;Zq)B#s~H;v+RP}y=Hu1$y)+GFS~-MY%K4Am|pxK(Sojch%B zR%%RwT}xiELkfoCWmx$i=y)BG&f1z|a1s-e0%5aYBk}w|B<|ad1{?5W^yRXM0d35` zUb#2dT08V!9P&u?9Qc-5_&ypcg6I;4hLr+TPpq_!3%2ucsq2{7jvjG6kV)HCol7|x zEUZifA>Oksq8zRnj{HH+`9%ohHb=$I#xbNbN*W5|!flioNZXi3dACGHG=;DXUlJbklLzHXI9&tDd-%z9KB)l5a{uRK>e5 zRQDlq)P$S`ZG-Hh>G+?adozB$ZoMw21(5nlD3;-T%?Q4=bguHzcaBWUr?9o#u(sht ze-YvjezIiZgoNp#?(2-;OZY*Mrx6@UEONl}U&IE7OZrd;tS;G7`^+ca187 zV)OID!QG3WFszMIVepg1|3i<>3i!QBIhWHs9LLZE9{KC^*rzxRA|O~OMAS}QQO#uO z2$P#`YIVhopmZJMdzOeL?sf4QiI4suVvP@ROxedph^=X*!qXb}q~BcJ8&s|7uu=q;piY_&-#?=u3+H zRZpbl_P2#_M6bjx+_{T(aI`-{G}HYny?Q0um)^8@fF6I$w-VYiUI`tqDVb?B9nUm_yLD6(ZQ{%0F(g2lEO|yyTvxB*JoGfmp~qmk3nA~_X#_u` z!+{*Jj`iTlv}ydLs8fkW@Q9>WYxp%+Xc@&xqrJmu+Us-pukz!M#>&mTF{BO69gRo- zMtq_>X{PeyR+XCVWT8xf1n--v9`3(F}prByddO{mHinmt)fJ=jW)R)|MMCM@hG9)LOkqxlk#2iECG)2K5wQJju88UJ@%Vu)(HIb%`Tnw!{2VRz8$_YWvYeG||C2~Xt67bbS`=sxJ+x281z#mX^LcEL z^my1ZC~XYJ2E#g%nCLZ-os5T)os5SgDrExJKPOb+jK?-~NEwe_BaxEJ_ID2>0fO7L zH4VKV^PeB~hMvcyhJCuB&3RGUnLN$d({v`>-T@R^x?U?ZzhL|i2};t+?p7KpTJk(? 
z9hQx#p9pXC!d{y5mygOFuKqnQY;7^EopODz{cCbCrcCQS`!_H&xd$7-M2UaNQM#X0 zwWGcYW8;=0TLyR~C&~aYtiw`H$6t)q5wTiW;k276k-{$Yky6r+OI3IXAswE9;6x58 zd1xeZ-(+cBG&n0OAo4warPz-L#mSCEK03$xyt7bvlC=$)spR)d8qQkrYC8hcz#v^c z48VU`X#Ng%*ZG8zsK%dsq`J_)mR7oQyf(Y9=Bk#PgGaR3Ih5(A=PupG?b^e?r|b5| z43Oy2bU{K!Py|s7cUVUp)exmH(J8_sF|He(kogOTLx*gvDK+9u$WhGSgGMmw&R>;j zO~%>Z=*(X!*=zpx6kmLa%wH+@9$Zn2=`v5d&PTr3-&OJq|ACs1n5#LR-_zCeUswEt z^@QM7;UBm*d|M*kjz}=PMe&dHVgm)_`3F?V)br2kU0j~8va8=iC~L7takTM#do4#t z--yfVg<{kG&hJ^T+PC7=llfTpq43ufOTo2tdX@D9oKtXNHQ%*0|3ddUIH#?+TOlRdtCKEHSOY5)1tka*3i3Yy;>y`)6vlW{R5g7XD&IW577YD6+Qak zwQAz0ayymtNmn+i;v@E-=$H`EHSh{i(UGhoZj}n9Y{xR_co77;srFraiE?_rZ^j3q zTIexkk6OA#bFjuDgN{(-`(4r+B@Af@^*IBcViN;ZW1Vc*d{L!R`R{eg?XLJq%O1N# zs7*%A?1h?#K#jecCY?U#0lBC0XqqRpW}zqu{^*cT|It!|U@_`$15OOV6A{n-Vd74| zNNUxQ^$v+>JCK}qkjBc$cJ(LH^dnM)z-mRBKr4@?=_l0^mv&R7l~y69vucQF75Q-| z3dTXKh#UKTaf9@_9?i8@pUrUFU*%87i#ust&+_Hk(CrcH<*+!5n>huRsv+H6i~i^)OxCk8>sLl!$ME(&O|Vn*VhG>Q%+m+p%e*j zt{H3w597#$X&<95f-F8@Cg5UF0Sy@f^J#S}DQv{Yc1H;IA^;?jyFs3S4oDbNyK*Ad zHyRM5W}ppr=-$IR<^ukPxZ!7+TFWrx#Ct&Z;>k8OR#`TRC9$n~Sm(uSd+dm9TEiOY zL$GQ7AZ=Y<$pz83jUd1W9iX=2I~WjaL1n&#b>z}c;@+6``TC=dsWiohJT!gi(@|e( zJdAxz^X+csNCzK|3tUtw6?D<4r;+ifs0gf3K72Wka`Etp!UMjUtrFZ|IjD#}gZ#N-9HCGU{ zRDE&hWhwuZ6M|&0+THM5@!JG9%Qz1!$&+yg_yY&1H8jW+kqoV_mQ#B^f71R4x4pB| z?LFJD8hhHCzXQs}DwFma@l2ZBBxbC{Tw?YjSxf!2aZpAiJ_eGsY4t>9>LSl+@21}E zZGyXseTzT+a@Dj(hc1UndjX#90JVmFs=b~PX3tB~-uB+@jU>#kcjNY2TCFESnu+r% z)_t+oP!28Qk;u_jU(Z>@Y8C_R0Lrq@)q9jZIS6|)%ZojkHOPq&Ys@i&Tg9GiF=B6H zPx{`HF~s#kSir_euqB!miD#H`mM<pyODKXi_>%G#Tv}doS#*yXr=**Et&^)Y{8-wl zt;2yZI8^vTdHoyOniwFh?qW)Wjx}f0vc8E)vq5RQLyY(Y@QU6gk%v}>3J3?_Kv>NFQ0xfytURUjQHPFs zxVLt^5g%+?EqLyoCk}}OU#^*{(+Y7@7?#tZM85OfXM|SOb6I50-(vrs>oh%6le$OR z;5;9%D$N+Ip3g&Qg166us(&K#WVm4svUR1^EeA7#RBgq6YE})L8Ho=wXcvt&o^TV6l*mU^dl}Klkg%&$&Le{XAN_Kro!b zA1;9x>>5z*;Ey0^W#KAcbO>e0OCwCHG+kcsZO=@X*RP)id1#Y%nbsS6`=`{ZtsVps zlo!9F^jRXY^9^Td#w1=-p(5^UiWD|ajQfUSiNM1Y)a7Fl_?TNR0;4A6&FI<&J7ekz z&j7a7iIJ5M 
z$2H%PHC&m=f^a6V(|g47Zt0TKAGa^ktyE{|mvqGGQm_7~uFxsu(eNI5gz2OBplbBj z5tMF!@0snX9XG(bXRU@?60a0~vN`Bei%UNH) zq}HH`sxGpnMKYC6vB&-k=RN&XBO>x=#R{Qj>ka$+9DC)fEVO0hO2isyFXmLmMWFrW zOi0{)Nw-9OE-(}0wMgM{BX%6-{-ijlBsviGE^%r|)E)<0iocJ+&o)T>Y`Ga@a2O5} zKU)UaPWjoI#bY?p4eyzh$QopLw;I7_@fad-(btC>gtrN|Cjgb<)P!s2pkq&jw~fcQ z8xhMl9`QtAurRdQaO(pRyi(p16BE-eC$I%>Zg`JJg8OT(F<{-rQh{GBP2-nVq08{? zNbnV{VGq;F6s}F&zBg2o5#1*&2tp=rqnmdHM0_@Ba5keZVCDU*c=K;ERQ5H%`SZ+v{0^+g|u$ zHLD-KHlx(_;48yEf&V0pFGsd9e?u#IuJ>O1Fpk)v|L*InGblLPlu^a6c5Dp^@Njt>N3yT_m0fm)BO_0;!265Yqwbu^{tsqh{>`n)3I^CWT`d zN#bl2)yS|mp&glp8|<$N{R;h|rfLlb@%Gvu@rU(@uC1Y`slpRdaXg|{*?)q-QuKlA zM{8)sO1C%j2mMFr?S1653d0>{zo|raX-e}P9U!`yG!1(KJvPj#UCy7i5UBkJmT^=4 z6I99ggG&cn@WDKGI{L>h1gubcnq4XGDjwLQ|U? ziw67y@tycXtbD}86e#VDGF%wV;>YL^^-rM;+@ZZSr!W^9{3d?mtWq%OV#~B%QzHUr z&$kca>02vm2K$lfW{agVNy*IR&UoS!>}LHR(Gnl#;7@`N1b;V!zYf7)f`tIFudGcG zi>S!v>S00-46GOxjD-D>WBSOv5k;6eC-dtCDFw4*&M^BudLi*RNYBof=s$eU9#6F? z`ijJLnB-Rb8hI3Rr{Ng{uKid3ID8bwp~h=sJgaeHJhy40mD@Pc%3?2r=v?b--?u-8 zTIGQ~6WYFmW_8@xD)Z6VkD?P|DH{MJeC{)~A?1#6{E@&frFHnbhRaGEUa^w$`fU>0 zQL1ZUHcXf}^tme13tJ7g>%j$DnYcI}DYxb6JYT0S%7qG~4ymNg1rjd<}q9_XmV16dKe<5-KgwnZw<(?`&DJx29j{RY*`HP@n= zihGmk0}`qDS$VFw1M^I`Yo>L=w2pOLO&?5auW21OEcRcDj?aCiw5{_?3Lg>gJ9_)) z(X!HG8@$mYZfgzeC2o#b$7$toTI@=qJrtHz&*8LIn)luu$ zHi=r%>dKgpZ2gr>50`nOvNm;NcFA@VP{+bMOTA1X zpWL*1sG^rL2xyf{pD>DNA*-aBd6p@|6!Gq3&DDqA>rk)}JRH(~(468QxY<6(@ga&Z z`&ZR+yYOx05!z;IXkXqvMtqPFza6=GD3Vy5E5l&2jUhIb1{3~NUI%I1#<^}~1v)QE z;aAx~Z0BQDrD26kbVdU?dRB=)inl&1pfakR46R`R_rq!#sKTw9b-vVwA1yJ%AKS-D zHybIBOZ;VfQXCU&bTw`0s1fYc8eW7*>))vL>0Lx+h~?Lz7j~L#7h`o!SODczK!8LG zi8U++q{7{DIk659#I4n_ILpnujKl;2b8hXD$e29A1IL00{@}!2ECx)I35R=leZ+b- zg+JmRwAHe|PQp7AWywe!>4^`)C~OZsFJovXmJtrHuiNM%U}!@`8*^B(dN#DQH6l)u zrdGDmh#kyH;SD6}G{oBI@b541w_r;nXK^zXip?`P!rr&w$)#t@(@pQ;&S%p4s0w%3 zsk+Rjnk{l7aj@kIh0;mLt3Z58rAA_mrWYcz#osnU1|~5!$(2`YxJWVTuHwexO?o@q zX_$9-+6a4KGiDZ{7ENr(%*w#;=HsG~ro3DW?hmW|m=NV z>{rN8dg_PVy~N{Lgv-x8U3j2#U3$Dg8lNwA%#<;0Ai 
zWhRy~bo;TiI@IP7LmPvYDSP=k7`1jYF)wGWZpnHn#Kx8%#gCXuqDYCoCaeu}41LOE zfs^r~XkijB^5-cuR0KN-e+MCp(K)iM(ekdb{eH(>{wBmgDwE zK%D0Zo9Q)(_8+A>e1}Ea!)2nBDjyVxDo~Q8t*B2Wi$^_GMx57l+8w*jquj!W>&`81&Kc-*ovMp zl$OChnQ2*@L$4)hjp3w%(uWUF^>>y~z&)AnwwiK)dVL#Nur zfJ-PG&w5&783B%>HrBrkt4sDbY!U4&Vy$;JH%L4scENC24Mlp;yFM#oE6JB_4u>SV zlMdc%vM5jpN{K0Il1&c?nakKW3?hU?nofR0tIHM%*wk@71M`zH0uKl7(CSwLWh9Tq zeNV<3i!5KjKxbRSTk;TD11^aqvaU7a zLF1v%JgAM@nk0{u3nrGf=9Vwf_~7T-noRawNO3=1L`9eV)N(VMu8sH-ti~)B?(HQP z8`lrk67RBZC^H6fQ6dE;G)3<)A=!+=8yz60Jlz^-#QC4qCXSp^ZW?4zi31D&}pzHt6cP~{#_<#sq>3C#=MPss&#JFNJv8jq7`eP-eg;`ff} zg}gX{7#F2eOZ=WL@q4Xb7{B*d(BWZ2zQoda+l=5zwg;aQzSnpjWzyk$|AVXE;d^d- z)OyQ}vfd)`ds#Qa`&IlNQ5>g>-@Ebj@p}Z|9MlVUn1y@5g~anpyqr)0d$-MCE75z0 z%!Ka~HL%3H$-Yog97D>D5-DUwVHX}*xT~2cbtCth(~)}*A+=0o7&8tTR>KvMKRu2` zhky3?Q*tGAFIV|eGK?O-;-S9rdmAN$FW4$ltcWp_Ewk2DN<+xKSOrG5(jP+6w7L@_ zG{xTtGY&l?a{)qViH?<7hisHys!4VyZ!V-^j2@HhT(GTj?d!F=-||jOF$V5n+LeUk zSq~BESK{KIup)B=6I@jFeHkixpoWr0=9nCL0Bj;+onu&Il*8~|73un^ zw&p2b_A0v2s+(k{#6G3|TUb~Z=Y{N13X37$D1VM%gtv+(gBkvU_%9M;yjo#h)^f&G zz-_S>uv(WTccktjc;+u81^L-F5?#Vd_^AwEc?CwKyeWq za)%o;NU3~EQD8)LHbquYge+Ag);L^h`AnQcYW@qEV5PWz%Tl^sjYjb*K_weR1wmL) zyQ(-xZfxO3SnMeERJCJ-r>>Qp-#a-Ms2tc8{5S4f8EY(%%vF-vbU=2O#2QNk@*t4u zYt`FrRxp0C(SeiDm$=tsH}Zm&YIsQ`_=&dS=bXf0(eUOv2~BjO1WKJK0WXIL@mrRGI)SFI5 zP(~M$vd$4M%lR%4Q=rxT3_D0d@0(7bApS@e<|U`2M60V3g5!eXYJMmTO2=KjD3d)n zHkNNnPqOww;{zR<%VY&Oo5(0%Bz}Pm+C0| zFsBgTi$K5wwC?4*E%oPDAKLlvF#;JaRae3QWj_D^;tWY zyly|FY6&y56BmeHg8_ygsQRn6Y=D({D{+*(ndbma%r-4H7_0q4hqPD?1&E^*VZHQq z*%lL<3Iw97V|dizNq!E~c~OTPRW!k5Mx%9phYF*! 
z7P2d|wf0%XaaOJtmWkbf@gVQ8aNe_~v@PDPL(m=ir&0iYJA^Ma%%Hg3xadt3Dqi~G z%xh(ox9bU3m#wmHmU9U_Xk2Iu8z>?Uwe9aBX!YQzmT0+IsNO08fp~>@n@+$_-bMjZ zK9nT-3Q#uzi zbCFcc?FD9hhzbvV@f-k3X4)b%O$&=xWTx68iHl+@_`F0`Kh~TlY>gc(rAqu+yavsB zIVCcWFxf?5itOtp@!FjIEXa>!KZ%DlWxAKzYn1E06kDrWY!-P16)0vvczF|JqSapp z*skP{gI9W1YL38IF!0VM?jXbvyzCtijytkd=eSalcb!ipbGbyoHcC(q4G%K{nv`~% zB||_YfjNR<)`^r5#T2Ipx=E|H;&JdI{syd)W9$)?vU=MFIk@)oje3H(66-{y@WjGV zlM?x_M6MrFJ3>|$SfTw)+(0yQxVUtT-~p3e#9H}IJ+UxHEIo0Lo}+pS4L-&577+I$ z{xew{(cAE7r~0eMOW6HDxZUSM4{Vu~O>)XH&HGNMFt{U%GJ3X(b)hass}`xfasCvg zaHU(!m?Kl#FzXiI9ZB4a2{D?t673bo?{Kz2D(Vh?BzA^Y&Y}R7 z%Zp()E_fEVRNWX%dNwnaW++mym%#%J?}R+2N|?x+T!rU;I2I1S=DQ-Ol)iP5h*+T` zv?E1IO3M-rN+bruFk+D1rHC{8gv0va zqbA$7U1=hlSc!l$3R!|XEG}KI&6eJs;}iMHy(l6spRIbr3NW^iAq48O`9WD$4B=r3 zh>6#Ba}Uxuw~OsjO9@&#%EqM#5`^ieqy&b>(yvM#lO+HTM=lLR5FzT2a_q#WqVyBv zzDq=+top5ZZ849O=t17($$Mgm`IOBChnb5@E^sSsWDm5MgxH+YPh$IdAJhiSD!I@m z9ab8F&1*RE2JHI1;~_quf#6iU!#|aYL3`jO-FZqeQJUop_;Z8>1eq5R;>PgpE1PYE z?AT_0h(Fwjd(7*zc*R#2ZIy+6k|@z1C2MLiy*+zGAzJf)~?adxyfrR_!N zN?Y;IstycR94*RTGz-Bra|CmN7ilTpg*Dk4(I#%hjrbC+_n3vP!oOC3pYz^!mRdq? 
zrP*R(&E_-^d>*Yt{oDJFB^N zpS$CWWpPC#Xkr(e-rZ`)7b~g^Cmv1&DsE8_2xS?CyQKN-Eml^eVlIaXdtx9Dj!$H% zp0I6I_2hf>L>Y`|TxCVRY3IJ-CH6_>h0IR`(qQnvpQ@dGgAthDxqzm0EP~ z9cM^4{FuFgY;7n@QA2$Mj5i@6HrKwJ-nEvc`ao<2p}ndL%2p%-4(3}-1L6EPf@awf zx?-@3ibM@>L`N6X5Glyk+NFz4v7E$UYL;ny5NZuGm07-gf$&l>njp-u{^OlGCX>si;r&VLk$#guL!^gn#aQ#O(u-u<@YOIDjA%@{ zS0tQX2oZ5A_Uj3pL@yw|L>eml^~a39fs;t9YZ2k<%z+MAEe;}Hm7xrik69UmC>;F; zJ2f42?}WGL7CY60pI{%gi{zq;0^y+W_c4BZHw|#*gqR75G3@?*fJvzGcg6N1>2bnn zbz5k>uwMN4-1ru)^}))Ct&>F=%NK}9G-o30VNrU6s!z5SaD%uZoEe|bjQ?=y3^VQv zbQbkmFPGSnCf>*a7uiji$da*oV#-ZC7PH}1ZS}cAL-;TfrK}@l?ci-JGAeBFEoSuu zcNLDIE;f!)cl+X}(4nOwYFYeO=_g7i#<{`U{%~Cr*N1b<$4*|}Pg?(N#tJn^7UuF0 zpQ;xuTih)SjVtB`yN|55rW1bN{apQko zZel?J6K9@N&#R%6Bs}eCN{P{xHv65&1UcD4YS(Lo1;*LBT zhi0R#+#`F}M9t|Ze(hGtkd5`LpQ8HSqW731e^K1nm~g7})tPppMSX$j-F`s&)LYKp ziyf<^mXc5Q3fWEqRa0h$SW8f>E*Fy>sr+MwlQ0|WfGsp+HAy)VCfi<-Sah5@0Pwgy zTS+h|4~cFMq*fxUm0Kz0P9j!Ni*1t4G*ne2CY=+2C4MDY)Hq~6L8fD&57+7>UO@&` zTo$e)qy{7%YeOh*CQn@cR7AFVZw_9HDP+SEk-yMqane4H3jq$Jt=8x=%A>OFF1Nj0 zsd*~zQz#Ol$c!TZxW~TRgDN7-LM0xT=D4QaWws3r`w)%9eX|2QfO_n@ z7u5YXicod!x@V+3@g2HV;@ES%Z{kq(C7o9~_MF60(GFfaDf(W@p3_h)rH&@KCWU1l!^;Arzp816OwlHkZruM1qcBEeAWNTSO`75+CA%GBMeC z|A8=_#CX41*uC(p?0s+2ufMkTBAI=nZjV5*Rn1`ED`vR7TR}9IR+*afvFxeANZclR zv(}KO0tabOhSYUgs;&zn2^LJPQk0>suAJpA!|^uv6H`eNwDV4$pG`y53=ym?LV|3#a+4k2F_+f+*w+^Q~d)vAfCLQzlg zSsoC{D^r?S3a0aa%5B+tdbee-sV_cFX6`9I%U9l`S*QV^uYmpje*1^Q4^RzoH)?gi zlC^QN2*6Pe;o-7ZsZxI#Z&X*W&bYyS|vpNkNrYb)ACB zLKj=+gr&&}m;FniyfFpM!sTTGj041q>BBu)?$%0A5@nW!?M*k$`mp=sm z^ph=qtsjcJgI9553=B_kUqXDf*ls3{x*u0bzxy|G|KG`$b`%K7h>G-6!q!}=*9t+n zCdp3WAa2j|;^N-OvkNE*sFa`w<+|@u>{8^}F7%4{=zzP=uub9I-DkLgrAJ)!?%qD- z(HCchc=GYOj}oRLK*DSBT3C;$t*R6j6Vr6v`l*@W5WjNhr&a*$`|xQ{Uuj|zZDMLZ zBs48lx@NuffW4}Yn?#z_p-4b&iB0XCbs7-IAC30xZd;{`vL{kn5Vo>-?LAv{cNkWi z{lMFT@~x(@F=Y^mhRBZShFoE?;;91VX@9mFH`DU*E}Z5ChwMWl_uQE#9(;E?=Ggbi zEQ%|d*#vMUi#7#zL;;zXZb^Hz02EQnnuqk}B)!2>N3-5PK(|(@Hvr(!05iFQRK@Y# zh1vSuV8vO{(CYq!v2s>2w7LqV*eI5tyfB{oY^*s?lor7NDmq$+@ElADP^0vIB(1!? 
zPe~G4@RhY$tZ^J6tVONn!qMel7aP+do{kcX=h8m7jZzt&)|mD@EFsl}-76ku?g-sfIM$yS zTzi$cJE>~24mqYcW{VwBvJ5Z+uk#&CynV0Y@UwSIU`A6nO4O|)jRf0jnhv)Y!C4f4 zzao{Pty{11crT7HD8GIcpeYD%Ua zs(6YrWt|Bs%z;TtG1X+5x%OZ8NNQ=o?b=6v5xM@_;J7v%njkXIbbY@|?=*5Bu2j&Pg;p{o~SdsPu7HKv*W?!SOT{$*- zZLF=pzAObIdLtPh{dBCY$ku?lqC-OPT+uO2HIbWT&N1`L;=bhsI*QOM+?~GRZ?x0B z;e^N$;ehZFZeSuk?Uq0OmY~`pCa|oq3K1zhLPa1tjblZ*;JB}>X%tyQad?SCSrGYu zLAoqb&+jcm9Ih+RcE>T*0Q)bP9J~eJ<^PTsN?gmL^h=DqiJt<406lU#Q^ZA^f&K4d zA*p7oi-j~xNnzrnH{E-pliczuHczl7Xm*)wFH}Fz4dkE!4oy|ur_gX2kDpRVu?kVs zVD5?r++waq*kgLlqA(*wjlWhk3da$VG2Grr*VSuJld76#m%>tS4v01T?G-55&dsa( zHpdxl)tVvcChwshRf)s0hpU~K>It8xa;R(wq_|Vf0l!@?O%SDauoPf5RH)g-;we4( z^&6?iOiwi?MRuwo50mfUZdp&hOUJ6fR0^o4Id6q^o<oi;X#Wt~a)$bxZqRd2R=DPPWVluXWOO<9$yNGae{#oTfImqAiJuBV7 zfQHmUUL!P)<0R+G|_+@hIWF&@}i1ZbB>*tda}GWA~!A%xGeS1Uf4UV zU|>QTl137dX-b>_Zt8L567l5-c-7IVsT^?R=r)4!2|duJl*XfqU)7UAkfT$NdcrWt za69dgPu~t7-<9q#H#$C1b>m(#Qsm;0Q{NjKdv2tG@N%hzaJtW zYxrEYYkcf%*XR}g7Y^tW`zK~LE26S4tGQ9$MB$Uqhp?R@F>5`Zh7S4qhiF#nJ2ToN zw7!#(l2331chA32Cz?3**3|k_M;n5M?cni7ers4K)#nG*5yFt}T@>!b4g_II_qwkfXwQNoo#)F_w{@;e_eZtVzA2q=GP(R6zga6j0?rsZ zZKb@ZfhI6==cmibV_)xFvDRplC|J+a@<+XQ{b`CMi7{~fk7~b{d~@ib4~ho&dymd= z03c3Aw_N0?jP7VTk!?nedXs&VeEEe>6j?9J=Q-*N$Ya@5TXnuF{o&HVYWN4sm0Fuv zLo4KElrqSF=A7mDKtXY%`jQfI^KG>U#ciyp zV>w+C#zdUxrlseam2A_>YL2zB`E^s-%DO;-*v4|}4T6E%9ehU*I={lz?-t0@zyq#@iZh7jNC@xX!wc=x)g+25k>mP|| zwpW(A3Xw|00d6$fk4u+XzuLqbnTsc04{ny>;b6b8${_M&W9RoD>hY5SI0BZGvHlUds5g<4Z;wW|;ex;h?nkKB9+&7$ka#fQCICm#^kS2CuR} z^r4_!NH9+%01jYw!*v+siuZ(I7lH8t2`DqqU>jZih471~Ny*Niy8Tq-*BS5lI3aAx zd4Ior|B7gK>m_HqAuIH^%3FmW`XG;n_Y!1v0Y3nt$YUWsQYVD@NF(3us^3&Que|P< zR{twSciXg;qSk43zo)UX3U;pW!{BR6&PSEU;lr0BTFIFxUo2q3c+oqd=cI|{d>LU0 zpLV=&G<1`g;JgNxi2dd~uHz-ejU8)gbLXATJT>Dt1S$AeT=5?Df3ilrODvs)bQujnMA+KfC!EXW z&#_OvOTVMX-R~xJruP-8uQY6reiJ86mQUURq(0yv7on0(^Y@0H!{}6fw&ZhTX5tp7 z!#A^89uLNVyRY|nZlU)dpP%ae4^}bEvGoP^ZQQ6^`_-&vFx0$F&sU*)mR@DX`d(zh zwk-QlLI;@~9%t-$qRJkA#Y(0qmy8a)$+lX=oRwSO8E+xJl5hE+_l90Ob2Y($4ABFBUvxf 
zY+@GVRXZ;%J(us^y#z;VT=sK(*kM^OJKp&?B%rOXffzu*UDrv+{q{sIf!E#?dhRnS zuy1#+qleU2+7jnp2!a#=IQfFR0(GlU%sugB@{T)giM=w~8l?f98$-{@(bfb8F%TY= zCWh;XNprG0a=Fv)WQ;mCfA94Ap7Uj{#YlAV;ct})b=ls+jvIcW{%mCm7~(?^c3aAc z;>Oo1$T4l@NOgjo{))=tB;U=dlX)be6m`lxl9PX-m8SJldGUwPX*nMpZU5Yv2a({r zETi+es;fPQHgUtGcz!p#IoV*tHaWH74&9zttGt1fUi=k%4&?0Ir&wyL;O#=?5{W*M z6F6>eCNI>?2nA0>cWEn60AtnOke#?M@mw0V%!2OdAse;pWk*=AdEtOj6`}`H_@L>u zePgdevi-J0vN7ss1z4M#6=1P`S^%#9u6n{d&b|Y7*8{@^u~qJ1&90QUx|vrafs`)r6F?BGwt6)uB3Z7N&R$nu z*iL+1^c8LOeBkKuhV?qsh5WvT>>BhywI&0naP(-IwU}+_dS$YpMdUU3A zJ|kTCa&!;=)jRnD4SPNLDvh>!s`Q5aE`o$)=l9pjYe*8&ku-OGh`phvyspo-k1#Rr zDsJrg9AmnD#jNmcEY*J0)*h25;nN=z3*h zB6>Caqnp#`kcEgF1EC9t+F*)y8L^nPS+;AWmv=1162J=t>!sqA=`P! zc!{f%zy*odl9y9$TXtFJYE_Qv1DBTv*Dt(;FMCMzoyyi$&jyJE>x`Hf59~LC?=9OY z?`lfS8o|4s_Io3R+bP|-)g6cyFsJMf`Ceu$^~G#}XMSfea0UZsF!28a3|uv`Xk_u% zibs9z{;Q{7Q#9?G>HXTAWCL~YfBmx6rTEj(96dPgJO!l7NSBkYAbpKgM7oM}HR)PX zkc4{gxq(zl3X#T;%1Pr$25ADxBuykuCf!WBl{AI)P15b8Z;|dK-9`E~=^oO(q^YF) zNR^~2lJx0*(hSl+kRBk-BF!evA zxtkv1_%JC(63eKb)IeH6T1m1<-y=zQr1&Beq}8M~q_u965a=I}en@(R^dr)b-SiWV zkCJ{$`bW|~xk;8+e@6N_=@+D5y6IOOe@*&lH$BerUr4_p{nkyt<0v-jAKdguj!%&O z8~80b<=Yk|K_IWIsTp0=%yx) zlA1{^Zd%WAgPU48wz+8|$4ze9%<%;`y~t70OQbDsdYR)^H??z=w2idgO|Nj2^eSnG zn|5-Pw2Snbn_lNA=?yo%$?+{W?dB+H4{5KP_HmT7-%W3Gl=KeiT{peQQPTTv>fk8J zCLM57Cr3#ixamWVk`B7*BaVmMbeN;0BW^m%QPRh5`h=sTV{Yo=D5={`$2m$m;igYH zO8U%ApL3LS(oMWtK^c1-l#|cPF{59~S8&HX$3c)3QJYw)YDVTyty?qqci8u>Gxb} zKeZv=3p1}5-Fzs?SCdSNZu~e$(HEcMC_0(1HktIH_`LIAGC6}|bV)M#6vsdVd=X`6 z6vteSxsN52_i~K>DVcmyuK$uu22hLwdw5F-$HxizT+eYDF~Xt{y+b&wd;~g|5V@a9j|RW5b$)YH^8FZ0$xr>#pLBKVm{UHNs%WESW0lltp`6rj?&GLRbf32MA$cG_ z>BA#Dv*X{B$$`VUI9(rlo(VpNW#*oB+A38aq`r5k?{~zrE$B-NJ>%C?pR_QuM2O*3 z%2G`%p}tYft6gs;lYar%o|^2OSVlOzAY*=ZPAJ=7)-pM}AeNC)ot-l#+dsj*nwsrz z;XaDbO2;kVsaJ11IO5F9ybWYO1(Yt>cp(yNL_7L6KNT8US#K$`CQ!UuX%H~TKJbdTTctm_T1~v_rCRO>3Q52P93NtI90m0P|O`03jDF>qDS6i)alA0x-~ zbESjwbL;sM2s~Xqg4ff*CG6PiIotEtuT?F5{+zD-Wt7kGoH}0$RXwND+cgS)Dm~u> zTvitP&!b!1divAnJqms*okxLZh&1ds-ya0-4sD-4tx|C4(0JDq 
zTYmX+=UVweIo;^Oiy9ob@;P!U8^Ony`U1EQJ}&i?y7ZRP zKZXG>=v{$(K3tVQHAD7`Nj)ksi)`hORL-3} zZ&p>bYTC$>qHBszU#MdG%sDeFrdC%^T~xvHUiBi+^y;azt170|&Yr!9TY6npP%_%5 z=(L&hSt_iVTOFlga;S(dnpag;6P;S^si{f9Ybs{Xq~^JD?U{9dMRe+Yv#Q9is97*| z-t4OC`?>hecU*9^+Py7W&GX#5Mpn+9Qxlz6J$K&7B2R5}=Byf?S5YzcmQbX;qI_bw zqQay4AziGQTQOtmoM}9vBK)n1p@=aC=(vfuRg~-Q4SM(%KvN=PQn!vXCzXZFib-R~ z-daATVoIpYET=;0Ey$>}KMU<7%n(1?^XLC!<)QXy_snwM;bb9` z#sksMmKAr01Nr_&A-8HQBaUg zg2~`V6HL!wcH@GWt!vZfJv-+&3>O6l-k!O0&zX5&`*zOoG#)SW!aN)7yB)w65pPAM z)iWOl*B~1Gvt3(vqZa?6?1=I4gWy5Z z8)9&r^!O<76rm9vc%Ah4E%03v^`@Ys=_V@e@w2`HvhD&upTB$Sx&GC^dB;~%2Yos+ zJ|;c$Q}8_T^Y|}Gj~@hQNx=O>x35u=hthXY1>g8}TjI!;FYAeAbj4x+M_tK`{?Pk$6jFTZT`09}*OpR2zmeL}w^;AphI{Gp2Yw!D{csQ) m$HVhW>vHUSBkxUgV@LT+oilF9mO1rr`sb&@kiTq83VT0U0E=k= literal 0 HcmV?d00001 diff --git a/longestcommonprefix/longestCommonPrefix.cpp b/longestcommonprefix/longestCommonPrefix.cpp new file mode 100644 index 0000000..48354c7 --- /dev/null +++ b/longestcommonprefix/longestCommonPrefix.cpp @@ -0,0 +1,482 @@ +// Copyright (c) 2018, Nicola Prezza. All rights reserved. +// Use of this source code is governed +// by a MIT license that can be found in the LICENSE file. 
+ +#include "longestCommonPrefix.h" +#include "../src/fmindexDBG.h" + +struct LCPnode { + + FMPos pos; + + std::vector boundaries; + + friend std::ostream& operator<<(std::ostream& o, const LCPnode& t); + + LCPnode() { + } + + LCPnode(const FMPos& pos, const std::vector& boundaries) { + this->pos = pos; + this->boundaries = boundaries; + } +}; + +std::ifstream::pos_type filesize(std::string filename) { + std::ifstream in(filename.c_str(), + std::ifstream::ate | std::ifstream::binary); + return in.tellg(); +} + +/* + * file contains 'N' characters + */ +bool hasN(std::string filename) { + + std::ifstream i(filename); + + char c; + + while (i.get(c)) { + + if (c == 'N') + return true; + } + + return false; +} + +uint64_t node_size(sa_node s) { + return s.last - s.first_TERM; +} + +uint64_t node_size(std::pair p) { + return node_size(p.first) + node_size(p.second); +} + +uint64_t node_size(sa_node_n s) { + return s.last - s.first_TERM; +} + +uint64_t node_size(std::pair p) { + return node_size(p.first) + node_size(p.second); +} + +void print_node(sa_node n) { + + std::cout << "[" << n.first_TERM << ", " << n.first_A << ", " << n.first_C + << ", " << n.first_G << ", " << n.first_T << ", " << n.last << "]" + << std::endl; +} + +void print_node(sa_node_n n) { + + std::cout << "[" << n.first_TERM << ", " << n.first_A << ", " << n.first_C + << ", " << n.first_G << ", " << n.first_N << ", " << n.first_T + << ", " << n.last << "]" << std::endl; +} + +sa_node merge_nodes(sa_node a, sa_node b) { + + assert(a.depth == b.depth); + + return {a.first_TERM + b.first_TERM, + a.first_A + b.first_A, + a.first_C + b.first_C, + a.first_G + b.first_G, + a.first_T + b.first_T, + a.last + b.last, + a.depth}; +} + +sa_node_n merge_nodes(sa_node_n a, sa_node_n b) { + + assert(a.depth == b.depth); + + return {a.first_TERM + b.first_TERM, + a.first_A + b.first_A, + a.first_C + b.first_C, + a.first_G + b.first_G, + a.first_N + b.first_N, + a.first_T + b.first_T, + a.last + b.last, + a.depth}; 
+} + +inline uint64_t range_length(range_t r) { + assert(r.second >= r.first); + return r.second - r.first; +} + +inline uint64_t leaf_size(sa_leaf L) { + return range_length(L.rn); +} + +inline uint64_t leaf_size(std::pair P) { + return leaf_size(P.first) + leaf_size(P.second); +} + +void print_nodes(p_node p) { + + print_node(p.A); + print_node(p.C); + print_node(p.G); + print_node(p.T); +} + +range_t child_TERM(sa_node x) { + return {x.first_TERM, x.first_A}; +} +range_t child_A(sa_node x) { + return {x.first_A, x.first_C}; +} +range_t child_C(sa_node x) { + return {x.first_C, x.first_G}; +} +range_t child_G(sa_node x) { + return {x.first_G, x.first_T}; +} +range_t child_T(sa_node x) { + return {x.first_T, x.last}; +} + +range_t child_TERM(sa_node_n x) { + return {x.first_TERM, x.first_A}; +} +range_t child_A(sa_node_n x) { + return {x.first_A, x.first_C}; +} +range_t child_C(sa_node_n x) { + return {x.first_C, x.first_G}; +} +range_t child_G(sa_node_n x) { + return {x.first_G, x.first_N}; +} +range_t child_N(sa_node_n x) { + return {x.first_N, x.first_T}; +} +range_t child_T(sa_node_n x) { + return {x.first_T, x.last}; +} + +uint8_t number_of_children(sa_node N) { + + return uint8_t(N.last > N.first_T) + uint8_t(N.first_T > N.first_G) + + uint8_t(N.first_G > N.first_C) + uint8_t(N.first_C > N.first_A) + + uint8_t(N.first_A > N.first_TERM); +} + +uint8_t number_of_children(sa_node_n N) { + + return uint8_t(N.last > N.first_T) + uint8_t(N.first_T > N.first_N) + + uint8_t(N.first_N > N.first_G) + uint8_t(N.first_G > N.first_C) + + uint8_t(N.first_C > N.first_A) + uint8_t(N.first_A > N.first_TERM); +} + +/* + * number of children in the union of the two nodes + */ +uint8_t number_of_children(sa_node N1, sa_node N2) { + + return uint8_t((N1.last > N1.first_T) or (N2.last > N2.first_T)) + + uint8_t((N1.first_T > N1.first_G) or (N2.first_T > N2.first_G)) + + uint8_t((N1.first_G > N1.first_C) or (N2.first_G > N2.first_C)) + + uint8_t((N1.first_C > N1.first_A) or 
(N2.first_C > N2.first_A)) + + uint8_t((N1.first_A > N1.first_TERM) or + (N2.first_A > N2.first_TERM)); +} + +/* + * number of children in the union of the two nodes + */ +uint8_t number_of_children(sa_node_n N1, sa_node_n N2) { + + return uint8_t((N1.last > N1.first_T) or (N2.last > N2.first_T)) + + uint8_t((N1.first_T > N1.first_N) or (N2.first_T > N2.first_N)) + + uint8_t((N1.first_N > N1.first_G) or (N2.first_N > N2.first_G)) + + uint8_t((N1.first_G > N1.first_C) or (N2.first_G > N2.first_C)) + + uint8_t((N1.first_C > N1.first_A) or (N2.first_C > N2.first_A)) + + uint8_t((N1.first_A > N1.first_TERM) or + (N2.first_A > N2.first_TERM)); +} + +/* + * number of children in the union of the two nodes + */ +uint8_t number_of_children(std::pair P) { + + return number_of_children(P.first, P.second); +} + +/* + * number of children in the union of the two nodes + */ +uint8_t number_of_children(std::pair P) { + + return number_of_children(P.first, P.second); +} + +template +void computeLCPPrezza(FMIndexDBG* index, Bitvec2& LCP, + bool progress, uint& k) { + // Initialize the LCP array with one more element than the length of the + // original text + LCP = (index->getTextLength() + 1); + + // cout << "\nNow navigating suffix tree leaves of size >= 2 to compute " + // "internal LCP values." + // << endl; + + uint64_t m = 0; // portion of text covered by visited leaves + uint64_t leaves = 0; // number of visited leaves + uint64_t max_stack = 0; + uint64_t lcp_values = 1; // number of filled LCP values + + { + std::vector TMP_LEAVES; + // TMP_LEAVES.reserve(5); + + std::vector S; + S.reserve(1000); + + SARangePair rp = SARangePair(Range(0, index->getCounts()[1]), + Range(0, index->getCounts()[1])); + FMPos root = FMPos(rp, 0); + S.push_back(root); + + int last_perc = -1; + int perc = 0; + + while (!S.empty()) { + max_stack = S.size() > max_stack ? 
S.size() : max_stack; + + FMPos L = S.back(); + S.pop_back(); + leaves++; + + assert(L.getRanges().getRangeSA().getEnd() > + L.getRanges().getRangeSA().getBegin()); + + for (uint64_t i = L.getRanges().getRangeSA().getBegin() + 1; + i < L.getRanges().getRangeSA().getEnd(); ++i) { + + // assert(LCP[i] == nil); + + length_t h = L.getDepth(); + + if (h < k) { + LCP[i] = 0; + } else if (h == k) { + LCP[i] = 1; + } else { + LCP[i] = 2; + } + + lcp_values++; + m++; + } + + m++; + + assert(m <= index->getTextLength()); + + next_leaves(index, L, TMP_LEAVES, 2); + + for (int i = TMP_LEAVES.size() - 1; i >= 0; --i) { + S.push_back(TMP_LEAVES[i]); + } + + if (progress) { + perc = (100 * lcp_values) / index->getTextLength(); + + if (perc > last_perc) { + + std::cout << "LCP: " << perc << "%." + << "\r"; + std::cout.flush(); + last_perc = perc; + } + } + } + } + + // cout << "Visited leaves cover " << m << "/" << index->getTextLength() + // << " input characters." << endl; + // cout << "Computed " << lcp_values << "/" << index->getTextLength() + // << " LCP values." << endl; + + // cout << "Max stack size = " << max_stack << endl; + // cout << "Processed " << leaves << " suffix-tree leaves of size >= 2." + // << endl; + + // cout + // << "\nNow navigating suffix tree nodes to compute remaining LCP + // values." + // << endl; + + { + std::vector TMP_NODES; + // TMP_NODES.reserve(5); + + uint64_t nodes = 0; // visited ST nodes + max_stack = 0; + + std::vector S; + S.reserve(1000); + + LCPnode root; + SARangePair rp = SARangePair(Range(0, index->getTextLength()), + Range(0, index->getTextLength())); + root.pos = FMPos(rp, 0); + for (size_t i = 1; i < index->getSigma().size(); i++) { + root.boundaries.emplace_back(index->getCounts()[i]); + } + S.push_back(root); + + int last_perc = -1; + int perc = 0; + + while (!S.empty()) { + + max_stack = S.size() > max_stack ? 
S.size() : max_stack; + + LCPnode N = S.back(); + S.pop_back(); + nodes++; + + update_lcp(index, N, LCP, lcp_values, k); + + next_nodes(index, N, TMP_NODES); + + for (int i = TMP_NODES.size() - 1; i >= 0; --i) { + S.push_back(TMP_NODES[i]); + } + + TMP_NODES.clear(); + + if (progress) { + perc = (100 * lcp_values) / index->getTextLength(); + + if (perc > last_perc) { + + std::cout << "LCP: " << perc << "%." + << "\r"; + std::cout.flush(); + last_perc = perc; + } + } + } + } + if (progress) { + + std::cout << "LCP: 100%." + << "\n"; + std::cout << "Maximum stack size: " << max_stack << std::endl; + } +} + +template +void next_leaves(FMIndexDBG* index, FMPos& L, + std::vector& TMP_LEAVES, length_t min_n_children) { + + for (length_t i = 1; i < index->getSigma().size(); i++) { + SARangePair newRanges; + index->findRangesWithExtraCharBackward(i, L.getRanges(), newRanges); + if (newRanges.getRangeSA().width() >= min_n_children) { + TMP_LEAVES.emplace_back(newRanges, L.getDepth() + 1); + } + } + + std::sort(TMP_LEAVES.begin(), TMP_LEAVES.end(), + [](const FMPos& lhs, const FMPos& rhs) { + return lhs.getRanges().getRangeSA().width() < + rhs.getRanges().getRangeSA().width(); + }); +} + +template +void update_lcp(FMIndexDBG* index, LCPnode& x, Bitvec2& LCP, + uint64_t& lcp_values, uint& k) { + for (size_t i = 0; i < index->getSigma().size() - 1; i++) { + if ((x.boundaries[i] > ((i == 0) + ? 
x.pos.getRanges().getRangeSA().getBegin() + : x.boundaries[i - 1])) && + (x.boundaries[i] != x.pos.getRanges().getRangeSA().getEnd())) { + length_t h = x.pos.getDepth(); + + if (h < k) { + LCP[x.boundaries[i]] = 0; + } else if (h == k) { + LCP[x.boundaries[i]] = 1; + } else { + LCP[x.boundaries[i]] = 2; + } + + lcp_values++; + } + } +} + +template +void next_nodes(FMIndexDBG* index, LCPnode& x, + std::vector& TMP_NODES) { + for (size_t i = 1; i < index->getSigma().size(); i++) { + SARangePair newRanges; + index->findRangesWithExtraCharBackward(i, x.pos.getRanges(), newRanges); + FMPos newPos(newRanges, x.pos.getDepth() + 1); + // LCPnode result; + // result.pos = newPos; + size_t childrenCounter = 0; + std::vector boundaries(5, 0); + for (size_t j = 1; j < index->getSigma().size(); j++) { + childrenCounter += index->findRangesWithExtraCharForward( + j, newPos.getRanges(), newRanges); + boundaries[j - 1] = newRanges.getRangeSA().getBegin(); + } + childrenCounter += + newPos.getRanges().getRangeSA().getBegin() != boundaries[0]; + if (childrenCounter >= 2) { + TMP_NODES.emplace_back(newPos, boundaries); + } + } + + std::sort(TMP_NODES.begin(), TMP_NODES.end(), + [](const LCPnode& lhs, const LCPnode& rhs) { + return lhs.pos.getRanges().width() < + rhs.pos.getRanges().width(); + }); + + // for (size_t i = 1; i < index->getSigma().size(); i++) { + // Range r = Range(x.front().getRanges().getRangeSA().getBegin(), + // x.back().getRanges().getRangeSA().getEnd()); + // SARangePair rp = SARangePair(r, r); + // FMPos currentPos = x[i]; + // size_t childrenCounter = 0; + // toBeNamed childVector; + // for (size_t j = 0; j < index->getSigma().size(); j++) { + // SARangePair newRanges; + // childrenCounter += index->findRangesWithExtraCharForward( + // j, currentPos.getRanges(), newRanges); + // childVector.emplace_back(newRanges, currentPos.getDepth() + 1); + // } + // if (childrenCounter >= 2) { + // TMP_NODES.push_back(childVector); + // } + // } + + // 
std::sort(TMP_NODES.begin(), TMP_NODES.end(), + // [](const toBeNamed& lhs, const toBeNamed& rhs) { + // return lhs.back().getRanges().getRangeSA().getEnd() - + // lhs.front().getRanges().getRangeSA().getBegin() + // < + // rhs.back().getRanges().getRangeSA().getEnd() - + // rhs.front().getRanges().getRangeSA().getBegin(); + // }); + + // auto stub = 0; +} + +template void computeLCPPrezza(FMIndexDBG* index, Bitvec2& LCP, + bool progress, uint& k); +template void computeLCPPrezza(FMIndexDBG* index, Bitvec2& LCP, + bool progress, uint& k); \ No newline at end of file diff --git a/longestcommonprefix/longestCommonPrefix.h b/longestcommonprefix/longestCommonPrefix.h new file mode 100644 index 0000000..497cc1d --- /dev/null +++ b/longestcommonprefix/longestCommonPrefix.h @@ -0,0 +1,392 @@ +// Copyright (c) 2018, Nicola Prezza. All rights reserved. +// Use of this source code is governed +// by a MIT license that can be found in the LICENSE file. + +#ifndef INCLUDE_HPP_ +#define INCLUDE_HPP_ + +#include +#include +#include +#include +#include + +struct LCPnode; +template class FMIndexDBG; +class Bitvec2; +class FMPos; + +typedef std::pair range_t; +typedef uint64_t length_t; + +std::ifstream::pos_type filesize(std::string filename); + +/* + * representation of a right-maximal substring (SA node) as a list of BWT + * intervals + */ +struct sa_node { + + // right-maximal substring: string W such that Wa_1, ..., Wa_k occur in the + // text for at least k>=2 characters a_1, ..., a_k + + uint64_t first_TERM; + uint64_t first_A; + uint64_t first_C; + uint64_t first_G; + uint64_t first_T; + uint64_t last; + + // depth = |W| + uint64_t depth; + + uint64_t key() { + return first_TERM; + } +}; + +struct sa_node_n { + + // right-maximal substring: string W such that Wa_1, ..., Wa_k occur in the + // text for at least k>=2 characters a_1, ..., a_k + + uint64_t first_TERM; + uint64_t first_A; + uint64_t first_C; + uint64_t first_G; + uint64_t first_N; + uint64_t first_T; + uint64_t 
last; + + // depth = |W| + uint64_t depth; + + uint64_t key() { + return first_TERM; + } +}; + +/* + * file contains 'N' characters + */ +bool hasN(std::string filename); + +uint64_t node_size(sa_node s); + +uint64_t node_size(std::pair p); + +uint64_t node_size(sa_node_n s); + +uint64_t node_size(std::pair p); + +void print_node(sa_node n); + +void print_node(sa_node_n n); + +sa_node merge_nodes(sa_node a, sa_node b); + +sa_node_n merge_nodes(sa_node_n a, sa_node_n b); + +/* + * suffix array leaf = BWT range (inclusive) of W.TERM, for some string W. + * + */ +struct sa_leaf { + + // rn.first = first position of range. Equivalently, number of suffixes + // smaller than W.TERM (valid also if W.TERM does not occur) rn.second = + // last position (excluded) of interval. Equivalently, number of suffixes + // smaller than W.TERM + number of occurrences of W.TERM if last == first, + // then W.TERM does not occur (however, 'first' is in any case number of + // suffixes smaller than W.TERM) + range_t rn; + + // depth = |W.TERM| + uint64_t depth; + + uint64_t key() { + return rn.first; + } +}; + +inline uint64_t range_length(range_t r); + +inline uint64_t leaf_size(sa_leaf L); + +inline uint64_t leaf_size(std::pair P); + +struct p_range { + + range_t A; + range_t C; + range_t G; + range_t T; +}; + +struct p_node { + + sa_node A; + sa_node C; + sa_node G; + sa_node T; +}; + +struct p_range_n { + + range_t A; + range_t C; + range_t G; + range_t N; + range_t T; +}; + +struct p_node_n { + + sa_node_n A; + sa_node_n C; + sa_node_n G; + sa_node_n N; + sa_node_n T; +}; + +void print_nodes(p_node p); + +struct p_rank { + + public: + uint64_t A; + uint64_t C; + uint64_t G; + uint64_t T; + + p_rank operator+(const p_rank& a) const { + + return {a.A + A, a.C + C, a.G + G, a.T + T}; + } + + bool operator==(const p_rank& a) const { + + return a.A == A and a.C == C and a.G == G and a.T == T; + } + + bool operator!=(const p_rank& a) const { + + return a.A != A or a.C != C or a.G != G or 
a.T != T; + } + + bool operator<=(const p_rank& a) const { + + return A <= a.A and C <= a.C and G <= a.G and T <= a.T; + } +}; + +struct p_rank_n { + + public: + uint64_t A; + uint64_t C; + uint64_t G; + uint64_t N; + uint64_t T; + + p_rank_n operator+(const p_rank_n& a) const { + + return {a.A + A, a.C + C, a.G + G, a.N + N, a.T + T}; + } + + bool operator==(const p_rank_n& a) const { + + return a.A == A and a.C == C and a.G == G and a.N == N and a.T == T; + } + + bool operator!=(const p_rank_n& a) const { + + return a.A != A or a.C != C or a.G != G or a.N != N or a.T != T; + } + + bool operator<=(const p_rank_n& a) const { + + return A <= a.A and C <= a.C and G <= a.G and N <= a.N and T <= a.T; + } +}; + +inline p_range fold_ranks(p_rank& a, p_rank& b) { + + return {{a.A, b.A}, {a.C, b.C}, {a.G, b.G}, {a.T, b.T}}; +} + +inline p_range_n fold_ranks(p_rank_n& a, p_rank_n& b) { + + return {{a.A, b.A}, {a.C, b.C}, {a.G, b.G}, {a.N, b.N}, {a.T, b.T}}; +} + +inline uint64_t popcount128(__uint128_t x) { + + return __builtin_popcountll(uint64_t(x >> 64)) + + __builtin_popcountll(x & 0xFFFFFFFFFFFFFFFF); +} + +range_t child_TERM(sa_node x); +range_t child_A(sa_node x); +range_t child_C(sa_node x); +range_t child_G(sa_node x); +range_t child_T(sa_node x); + +range_t child_TERM(sa_node_n x); +range_t child_A(sa_node_n x); +range_t child_C(sa_node_n x); +range_t child_G(sa_node_n x); +range_t child_N(sa_node_n x); +range_t child_T(sa_node_n x); + +inline bool has_child_TERM(sa_node N) { + return N.first_A > N.first_TERM; +} +inline bool has_child_A(sa_node N) { + return N.first_C > N.first_A; +} +inline bool has_child_C(sa_node N) { + return N.first_G > N.first_C; +} +inline bool has_child_G(sa_node N) { + return N.first_T > N.first_G; +} +inline bool has_child_T(sa_node N) { + return N.last > N.first_T; +} + +inline bool has_child_TERM(sa_node_n N) { + return N.first_A > N.first_TERM; +} +inline bool has_child_A(sa_node_n N) { + return N.first_C > N.first_A; +} +inline bool 
has_child_C(sa_node_n N) { + return N.first_G > N.first_C; +} +inline bool has_child_G(sa_node_n N) { + return N.first_N > N.first_G; +} +inline bool has_child_N(sa_node_n N) { + return N.first_T > N.first_N; +} +inline bool has_child_T(sa_node_n N) { + return N.last > N.first_T; +} + +uint8_t number_of_children(sa_node N); + +uint8_t number_of_children(sa_node_n N); + +/* + * number of children in the union of the two nodes + */ +uint8_t number_of_children(sa_node N1, sa_node N2); + +/* + * number of children in the union of the two nodes + */ +uint8_t number_of_children(sa_node_n N1, sa_node_n N2); + +/* + * number of children in the union of the two nodes + */ +uint8_t number_of_children(std::pair P); + +/* + * number of children in the union of the two nodes + */ +uint8_t number_of_children(std::pair P); + +template +void update_lcp(sa_node& x, std::vector& LCP, uint64_t& lcp_values) { + + assert(x.first_A >= x.first_TERM); + assert(x.first_C >= x.first_A); + assert(x.first_G >= x.first_C); + assert(x.first_T >= x.first_G); + + assert(number_of_children(x) >= 2); + + lcp_int_t nil = ~lcp_int_t(0); + + if (has_child_TERM(x) and x.first_A != x.last) { + assert(LCP[x.first_A] == nil); + LCP[x.first_A] = x.depth; + lcp_values++; + } + if (has_child_A(x) and x.first_C != x.last) { + assert(LCP[x.first_C] == nil); + LCP[x.first_C] = x.depth; + lcp_values++; + } + if (has_child_C(x) and x.first_G != x.last) { + assert(LCP[x.first_G] == nil); + LCP[x.first_G] = x.depth; + lcp_values++; + } + if (has_child_G(x) and x.first_T != x.last) { + assert(LCP[x.first_T] == nil); + LCP[x.first_T] = x.depth; + lcp_values++; + } +} + +template +void update_lcp(sa_node_n& x, std::vector& LCP, + uint64_t& lcp_values) { + + assert(x.first_A >= x.first_TERM); + assert(x.first_C >= x.first_A); + assert(x.first_G >= x.first_C); + assert(x.first_N >= x.first_G); + assert(x.first_T >= x.first_N); + + assert(number_of_children(x) >= 2); + + lcp_int_t nil = ~lcp_int_t(0); + + if 
(has_child_TERM(x) and x.first_A != x.last) { + assert(LCP[x.first_A] == nil); + LCP[x.first_A] = x.depth; + lcp_values++; + } + if (has_child_A(x) and x.first_C != x.last) { + assert(LCP[x.first_C] == nil); + LCP[x.first_C] = x.depth; + lcp_values++; + } + if (has_child_C(x) and x.first_G != x.last) { + assert(LCP[x.first_G] == nil); + LCP[x.first_G] = x.depth; + lcp_values++; + } + if (has_child_G(x) and x.first_N != x.last) { + assert(LCP[x.first_N] == nil); + LCP[x.first_N] = x.depth; + lcp_values++; + } + if (has_child_N(x) and x.first_T != x.last) { + assert(LCP[x.first_T] == nil); + LCP[x.first_T] = x.depth; + lcp_values++; + } +} + +template +void computeLCPPrezza(FMIndexDBG* index, Bitvec2& LCP, + bool progress, uint& k); + +template +void next_leaves(FMIndexDBG* index, FMPos& L, + std::vector& TMP_LEAVES, length_t min_n_children); + +template +void update_lcp(FMIndexDBG* index, LCPnode& x, Bitvec2& LCP, + uint64_t& lcp_values, uint& k); + +template +void next_nodes(FMIndexDBG* index, LCPnode& x, + std::vector& TMP_NODES); + +#endif /* INCLUDE_HPP_ */ diff --git a/src/bandmatrix.h b/src/bandmatrix.h index e3844f2..40e7ad4 100644 --- a/src/bandmatrix.h +++ b/src/bandmatrix.h @@ -24,8 +24,11 @@ #define BANDMATRIX_H #include // used for reversing CIGAR string -#include // used for taking the log -#include // used for printing (debugging reasons) +#include +#include +#include // used for taking the log +#include // used for printing (debugging reasons) +#include #include "substring.h" @@ -498,7 +501,7 @@ class BitParallelED { * Find the minimum edit distance value and its position in a row * @param i Row index * @param jMin Column index at which minimum value is found (output) - * @param minScore Mimumum value (output) + * @param minScore Minimum value (output) */ void findMinimumAtRow(uint i, uint& jMin, uint& minScore) const { jMin = getFirstColumn(i); diff --git a/src/benchmarking.cpp b/src/benchmarking.cpp index ef16b9c..60c9709 100644 --- 
a/src/benchmarking.cpp +++ b/src/benchmarking.cpp @@ -32,30 +32,6 @@ using namespace std; vector schemes = {"kuch1", "kuch2", "kianfar", "manbest", "pigeon", "01*0", "custom", "naive"}; -template -void printMatches(vector matches, string text, bool printLine, - string duration, FMIndex& index, string name) { - - cout << endl; - - cout << name << ":\tduration: " << duration - << "µs\t nodes visited: " << index.getNodes() - << "\t matrix elements written: " << index.getMatrixElements() - << "\t startpositions reported: " << index.getTotalReported() - << " #matches: " << matches.size() << endl; - - for (auto match : matches) { - cout << "Found match at position " << match.getRange().getBegin() - << " with ED " << match.getDistance() << endl; - - cout << "\tCorresponding substring:\t" - << text.substr(match.getRange().getBegin(), - match.getRange().getEnd() - - match.getRange().getBegin()) - << endl; - } -} - string getFileExt(const string& s) { size_t i = s.rfind('.', s.length()); @@ -66,9 +42,23 @@ string getFileExt(const string& s) { return (""); } -vector> getReads(const string& file) { - vector> reads; - reads.reserve(200000); +size_t getReads(vector>& reads, string& file, + ifstream& ifile, size_t chunkSize, string& line, + bool readWithN) { + + string read = ""; + string p = ""; + + size_t chunkCounter = 0; + bool readLine = false; + + // Read first line of chunk + if (!line.empty() && (line[0] == '>' || line[0] == '@')) { + p = (line.substr(1)); + readLine = true; + } + + reads.reserve(2 * chunkSize); const auto& extension = getFileExt(file); @@ -76,7 +66,6 @@ vector> getReads(const string& file) { (extension == "FASTA") || (extension == "fasta") || (extension == "fa"); bool fastq = (extension == "fq") || (extension == "fastq"); - ifstream ifile(file.c_str()); if (!ifile) { throw runtime_error("Cannot open file " + file); } @@ -88,9 +77,9 @@ vector> getReads(const string& file) { } string line; // get the first line we do not need this - getline(ifile, line); + 
std::getline(ifile, line); - while (getline(ifile, line)) { + while (std::getline(ifile, line)) { istringstream iss{line}; vector tokens; @@ -107,19 +96,20 @@ vector> getReads(const string& file) { reads.push_back(make_pair(p, read)); } } else if (fasta) { - // fasta file - string read = ""; - string p = ""; - string line; - while (getline(ifile, line)) { + while (chunkCounter < chunkSize && std::getline(ifile, line)) { if (!line.empty() && line[0] == '>') { if (!read.empty()) { - - reads.push_back(make_pair(p, read)); - reads.push_back( - make_pair(p, Nucleotide::getRevCompl(read))); - read.clear(); + // Ignore reads containing N + if (read.find('N') == std::string::npos) { + reads.push_back(make_pair(p, read)); + reads.push_back( + make_pair(p, Nucleotide::getRevCompl(read))); + read.clear(); + chunkCounter++; + } else { + readWithN = true; + } } p = (line.substr(1)); @@ -129,27 +119,32 @@ vector> getReads(const string& file) { } } if (!read.empty()) { - - reads.push_back(make_pair(p, read)); - reads.push_back(make_pair(p, Nucleotide::getRevCompl(read))); - read.clear(); + // Ignore reads containing N + if (read.find('N') == std::string::npos) { + reads.push_back(make_pair(p, read)); + reads.push_back(make_pair(p, Nucleotide::getRevCompl(read))); + read.clear(); + chunkCounter++; + } else { + readWithN = true; + } } } else { // fastQ - string read = ""; - string id = ""; - string line; - bool readLine = false; - while (getline(ifile, line)) { + while (chunkCounter < chunkSize && std::getline(ifile, line)) { if (!line.empty() && line[0] == '@') { if (!read.empty()) { - - reads.push_back(make_pair(id, read)); - reads.push_back( - make_pair(id, Nucleotide::getRevCompl(read))); - read.clear(); + if (read.find('N') == std::string::npos) { + reads.push_back(make_pair(p, read)); + reads.push_back( + make_pair(p, Nucleotide::getRevCompl(read))); + read.clear(); + chunkCounter++; + } else { + readWithN = true; + } } - id = (line.substr(1)); + p = (line.substr(1)); 
readLine = true; } else if (readLine) { read = line; @@ -157,14 +152,17 @@ vector> getReads(const string& file) { } } if (!read.empty()) { - - reads.push_back(make_pair(id, read)); - reads.push_back(make_pair(id, Nucleotide::getRevCompl(read))); - read.clear(); + if (read.find('N') == std::string::npos) { + reads.push_back(make_pair(p, read)); + reads.push_back(make_pair(p, Nucleotide::getRevCompl(read))); + read.clear(); + chunkCounter++; + } else { + readWithN = true; + } } } - - return reads; + return chunkCounter; } double @@ -182,25 +180,34 @@ void writeToOutputSFI( const string& file, const vector, std::vector>>& mPerRead, - const vector>& reads) { + const vector>& reads, bool& firstChunk, ofstream& f2) { - cout << "Writing to output file " << file << " ..." << endl; - ofstream f2; - f2.open(file); + if (firstChunk) { + // Write header + firstChunk = false; + + // cout << "Writing to output file " << file << " ..." << endl; + + f2 << "Identifier\tSubgraphID\tPath\tDistanceFromLeftEnd\tStrain\t" + "Position\tLength\tED\treverseComplement\n"; + } - f2 << "Identifier\tSubgraphID\tPath\tStrain\tPosition\tLength\tED\treverseC" - "omplement\n"; for (unsigned int i = 0; i < reads.size(); i += 2) { auto id = reads[i].first; int counter = 0; for (const auto& path : mPerRead[i]) { for (length_t i = 0; i < path.second.size(); i++) { + // For occurrences shorter than k, use / to show that the nodes + // do not form a path, but a set of possibilities + char separationchar = + path.second[i].getRange().width() < k_DBG ? 
'/' : ','; f2 << id << "\t" << counter << "\t" << path.first[0]; for (length_t i = 1; i < path.first.size(); i++) { - f2 << "," << path.first[i]; + f2 << separationchar << path.first[i]; } - f2 << "\t" << path.second[i].getStrain() << "\t" + f2 << "\t" << path.second[i].getDistanceFromLeftEnd() << "\t" + << path.second[i].getStrain() << "\t" << path.second[i].getRange().getBegin() << "\t" << path.second[i].getRange().width() << "\t" << path.second[i].getDistance() << "\t0\n"; @@ -210,11 +217,16 @@ void writeToOutputSFI( for (const auto& path : mPerRead[i + 1]) { for (length_t i = 0; i < path.second.size(); i++) { + // For occurrences shorter than k, use / to show that the nodes + // do not form a path, but a set of possibilities + char separationchar = + path.second[i].getRange().width() < k_DBG ? '/' : ','; f2 << id << "\t" << counter << "\t" << path.first[0]; for (length_t i = 1; i < path.first.size(); i++) { - f2 << "," << path.first[i]; + f2 << separationchar << path.first[i]; } - f2 << "\t" << path.second[i].getStrain() << "\t" + f2 << "\t" << path.second[i].getDistanceFromLeftEnd() << "\t" + << path.second[i].getStrain() << "\t" << path.second[i].getRange().getBegin() << "\t" << path.second[i].getRange().width() << "\t" << path.second[i].getDistance() << "\t1\n"; @@ -222,30 +234,36 @@ void writeToOutputSFI( counter++; } } - f2.close(); } void writeToOutputSFR(const string& file, const std::vector>& mPerRead, const vector>& reads, - FMIndexDBG& index) { + bool& firstChunk, ofstream& f2) { - cout << "Writing to output file " << file << " ..." << endl; - ofstream f2; - f2.open(file); + if (firstChunk) { + // Write header + firstChunk = false; + + // cout << "Writing to output file " << file << " ..." 
<< endl; + + f2 << "Identifier\tSubgraphID\tPath\tDistanceFromLeftEnd\tLength\tED\t" + "reverseComplement\n"; + } - f2 << "Identifier\tSubgraphID\tPath\tDistanceFromLeftEnd\tLength\tED\trever" - "seC" - "omplement\n"; for (unsigned int i = 0; i < reads.size(); i += 2) { auto id = reads[i].first; int counter = 0; for (const auto& path : mPerRead[i]) { + // For occurrences shorter than k, use / to show that the nodes + // do not form a path, but a set of possibilities + char separationchar = + path.getPosition().getTrueDepth() < k_DBG ? '/' : ','; vector nodepath = path.getPosition().getNodePath(); f2 << id << "\t" << counter << "\t" << nodepath[0]; for (length_t i = 1; i < nodepath.size(); i++) { - f2 << "," << nodepath[i]; + f2 << separationchar << nodepath[i]; } f2 << "\t" << path.getPosition().getDistanceFromLeftEnd() << "\t" << path.getPosition().getTrueDepth() << "\t" @@ -254,21 +272,21 @@ void writeToOutputSFR(const string& file, } for (const auto& path : mPerRead[i + 1]) { + // For occurrences shorter than k, use / to show that the nodes + // do not form a path, but a set of possibilities + char separationchar = + path.getPosition().getTrueDepth() < k_DBG ? 
'/' : ','; vector nodepath = path.getPosition().getNodePath(); f2 << id << "\t" << counter << "\t" << nodepath[0]; for (length_t i = 1; i < nodepath.size(); i++) { - f2 << "," << nodepath[i]; + f2 << separationchar << nodepath[i]; } - // int id, l; - // index.findID(path.getPosition().getRanges().getRangeSA().getBegin(), - // id, l); f2 << "\t" << path.getPosition().getDistanceFromLeftEnd() << "\t" << path.getPosition().getTrueDepth() << "\t" << path.getDistance() << "\t1\n"; counter++; } } - f2.close(); } double findMedian(vector a, int n) { @@ -303,13 +321,12 @@ double findMedian(vector a, int n) { } template -double doBenchSFI(vector>& reads, T& index, - SearchStrategyDBG* strategy, +double doBenchSFI(T& index, SearchStrategyDBG* strategy, string readsFile, length_t ED, string cpSparse, string outputFile) { if (outputFile == "") { - outputFile = readsFile + "_output.txt"; + outputFile = readsFile + "_output.tsv"; } size_t totalNodes = 0; @@ -322,6 +339,11 @@ double doBenchSFI(vector>& reads, T& index, size_t totalDBGNodes = 0; size_t totalNodePaths = 0; size_t allReportedNodePaths = 0; + chrono::duration elapsedNodepaths = + std::chrono::duration::zero(); + chrono::duration elapsedSAtoText = + std::chrono::duration::zero(); + chrono::duration elapsed = std::chrono::duration::zero(); cout << "Strain-fixed read mapping with " << strategy->getName() << " strategy for max distance " << ED << " with " @@ -331,106 +353,160 @@ double doBenchSFI(vector>& reads, T& index, << endl; cout.precision(2); - vector, std::vector>> - matchesPerRead = {}; - matchesPerRead.reserve(reads.size()); + // Read and write input in chunks + size_t chunkSize = 10000; + bool firstChunk = true; + size_t nrOfReads = 0; + string currentLine = ""; + + ifstream ifile(readsFile.c_str()); + ofstream f2; + f2.open(outputFile); + + bool readWithN = false; std::vector numberMatchesPerRead; - numberMatchesPerRead.reserve(reads.size()); - auto start = chrono::high_resolution_clock::now(); - for 
(unsigned int i = 0; i < reads.size(); i += 2) { + while (ifile) { + + // cout << "Reading in reads from " << readsFile << endl; + vector> reads; + reads.reserve(chunkSize * 2); + try { + nrOfReads += getReads(reads, readsFile, ifile, chunkSize, + currentLine, readWithN); + } catch (const exception& e) { + string er = e.what(); + er += " Did you provide a valid reads file?"; + throw runtime_error(er); + } - const auto& p = reads[i]; + auto start = chrono::high_resolution_clock::now(); - auto originalPos = p.first; - string read = p.second; - string revCompl = reads[i + 1].second; + vector, std::vector>> + matchesPerRead = {}; + matchesPerRead.reserve(reads.size()); - if (((i >> 1) - 1) % (8192 / (1 << ED)) == 0) { - cout << "Progress: " << i / 2 << "/" << reads.size() / 2 << "\r"; - cout.flush(); - } + numberMatchesPerRead.reserve(reads.size() + + numberMatchesPerRead.size()); - std::map, std::vector> - matches = strategy->matchApproxSFI(read, ED); - int nr_of_matches = 0; - for (auto const& p : matches) { - nr_of_matches += p.second.size(); - } + for (unsigned int i = 0; i < reads.size(); i += 2) { - totalNodes += index.getNodes(); - totalDBGNodes += index.getDBGNodes(); - totalMatrixElements += index.getMatrixElements(); - allReportedMatches += index.getTotalReported(); - totalUniqueMatches += nr_of_matches; - mappedReadsForward += !matches.empty(); - totalNodePaths += matches.size(); - allReportedNodePaths += index.getTotalReportedNodePaths(); - - // do the same for the reverse complement - auto matchesRevCompl = strategy->matchApproxSFI(revCompl, ED); - int nr_of_matchesRevCompl = 0; - for (auto const& p : matchesRevCompl) { - nr_of_matchesRevCompl += p.second.size(); - } + const auto& p = reads[i]; + + auto originalPos = p.first; + string read = p.second; + string revCompl = reads[i + 1].second; + + if ((((i + 2 * nrOfReads - reads.size()) >> 1) - 1) % + (8192 / (1 << ED)) == + 0) { + cout << "Progress: " << (i + 2 * nrOfReads - reads.size()) / 2 + << " reads 
done.\r"; + cout.flush(); + } + + const auto& matches = strategy->matchApproxSFI(read, ED); + int nr_of_matches = 0; + for (auto const& p : matches) { + nr_of_matches += p.second.size(); + } - totalNodes += index.getNodes(); - totalDBGNodes += index.getDBGNodes(); - totalMatrixElements += index.getMatrixElements(); - allReportedMatches += index.getTotalReported(); - totalUniqueMatches += nr_of_matchesRevCompl; - mappedReadsBackward += !matchesRevCompl.empty(); - totalNodePaths += matchesRevCompl.size(); - allReportedNodePaths += index.getTotalReportedNodePaths(); + totalNodes += index.getNodes(); + totalDBGNodes += index.getDBGNodes(); + totalMatrixElements += index.getMatrixElements(); + allReportedMatches += index.getTotalReported(); + totalUniqueMatches += nr_of_matches; + mappedReadsForward += !matches.empty(); + totalNodePaths += matches.size(); + allReportedNodePaths += index.getTotalReportedNodePaths(); + elapsedNodepaths += index.getNodePathDuration(); + elapsedSAtoText += index.getSADuration(); + + // do the same for the reverse complement + const auto& matchesRevCompl = + strategy->matchApproxSFI(revCompl, ED); + int nr_of_matchesRevCompl = 0; + for (auto const& p : matchesRevCompl) { + nr_of_matchesRevCompl += p.second.size(); + } - mappedReads += !(matchesRevCompl.empty() && matches.empty()); + totalNodes += index.getNodes(); + totalDBGNodes += index.getDBGNodes(); + totalMatrixElements += index.getMatrixElements(); + allReportedMatches += index.getTotalReported(); + totalUniqueMatches += nr_of_matchesRevCompl; + mappedReadsBackward += !matchesRevCompl.empty(); + totalNodePaths += matchesRevCompl.size(); + allReportedNodePaths += index.getTotalReportedNodePaths(); + elapsedNodepaths += index.getNodePathDuration(); + elapsedSAtoText += index.getSADuration(); + + mappedReads += !(matchesRevCompl.empty() && matches.empty()); + + matchesPerRead.push_back(matches); + matchesPerRead.push_back(matchesRevCompl); + numberMatchesPerRead.push_back(nr_of_matches + 
+ nr_of_matchesRevCompl); + } - matchesPerRead.push_back(matches); - matchesPerRead.push_back(matchesRevCompl); - numberMatchesPerRead.push_back(nr_of_matches + nr_of_matchesRevCompl); + auto finish = chrono::high_resolution_clock::now(); + elapsed += finish - start; + writeToOutputSFI(outputFile, matchesPerRead, reads, firstChunk, f2); } + if (readWithN) { + cout << "Caution, reads containing an N were ignored.\n"; + } + + f2.close(); + if (ED == 0) { allReportedMatches = totalUniqueMatches; } - auto finish = chrono::high_resolution_clock::now(); - chrono::duration elapsed = finish - start; - cout << "Progress: " << reads.size() << "/" << reads.size() << "\n"; + elapsedSAtoText -= elapsedNodepaths; + chrono::duration FMIndexElapsed = + elapsed - elapsedSAtoText - elapsedNodepaths; + cout << "Progress: " << nrOfReads << "/" << nrOfReads << "\n"; cout << "Results for " << strategy->getName() << endl; + cout << "Time for finding the occurrences in the bidirectional FM-index: " + << fixed << FMIndexElapsed.count() << "s\n"; + cout << "Time for finding the node path: " << fixed + << elapsedNodepaths.count() << "s\n"; + cout << "Time for finding the occurrences in the reference text along with " + "the corresponding strain: " + << fixed << elapsedSAtoText.count() << "s\n"; cout << "Total duration: " << fixed << elapsed.count() << "s\n"; - cout << "Average no. index nodes: " << totalNodes / (double)(reads.size()) + cout << "Average no. index nodes: " << totalNodes / (double)(nrOfReads) << endl; cout << "Total no. index nodes: " << totalNodes << "\n"; cout << "Average no. unique matches: " - << totalUniqueMatches / (double)(reads.size()) << endl; + << totalUniqueMatches / (double)(nrOfReads) << endl; cout << "Total no. unique matches: " << totalUniqueMatches << "\n"; cout << "Average no. reported matches " - << allReportedMatches / (double)(reads.size()) << endl; + << allReportedMatches / (double)(nrOfReads) << endl; cout << "Total no. 
reported matches: " << allReportedMatches << "\n"; cout << "Average no. unique node paths " - << totalNodePaths / (double)(reads.size()) << endl; + << totalNodePaths / (double)(nrOfReads) << endl; cout << "Total no. unique node paths: " << totalNodePaths << "\n"; cout << "Average no. reported node paths: " - << allReportedNodePaths / (double)(reads.size()) << endl; + << allReportedNodePaths / (double)(nrOfReads) << endl; cout << "Total no. reported node paths: " << allReportedNodePaths << "\n"; cout << "Mapped reads :" << mappedReads << endl; cout << "Median number of unique matches per read " << findMedian(numberMatchesPerRead, numberMatchesPerRead.size()) << endl; - cout << "Average no. graph nodes: " - << totalDBGNodes / (double)(reads.size()) << endl; + cout << "Average no. graph nodes: " << totalDBGNodes / (double)(nrOfReads) + << endl; cout << "Total no. graph nodes: " << totalDBGNodes << "\n"; - writeToOutputSFI(outputFile, matchesPerRead, reads); return elapsed.count(); } -double doBenchSFR(vector>& reads, - FMIndexDBG& index, +double doBenchSFR(FMIndexDBG& index, SearchStrategyDBG, FMPosSFR>* strategy, string readsFile, length_t ED, string cpSparse, string outputFile) { @@ -438,18 +514,19 @@ double doBenchSFR(vector>& reads, StrainFreeMapper mapper(strategy); if (outputFile == "") { - outputFile = readsFile + "_output.txt"; + outputFile = readsFile + "_output.tsv"; } size_t totalNodes = 0; - size_t totalMatrixElements = 0; + // size_t totalMatrixElements = 0; size_t allReportedMatches = 0; size_t totalUniqueMatches = 0; size_t mappedReads = 0; - size_t mappedReadsForward = 0; - size_t mappedReadsBackward = 0; + // size_t mappedReadsForward = 0; + // size_t mappedReadsBackward = 0; size_t totalDBGNodes = 0; - size_t totalFilterSpecialCases = 0; + // size_t totalFilterSpecialCases = 0; + chrono::duration elapsed = std::chrono::duration::zero(); cout << "Strain-free read mapping with " << index.getFilteringOption() << " filtering, with " << 
strategy->getName() @@ -460,91 +537,131 @@ double doBenchSFR(vector>& reads, << endl; cout.precision(2); - std::vector> matchesPerRead = {}; - matchesPerRead.reserve(reads.size()); + size_t chunkSize = 10000; + bool firstChunk = true; + size_t nrOfReads = 0; + string currentLine = ""; + + ifstream ifile(readsFile.c_str()); + ofstream f2; + f2.open(outputFile); + + bool readWithN = false; std::vector numberMatchesPerRead; - numberMatchesPerRead.reserve(reads.size()); - auto start = chrono::high_resolution_clock::now(); - for (unsigned int i = 0; i < reads.size(); i += 2) { + while (ifile) { + + // cout << "Reading in reads from " << readsFile << endl; + vector> reads; + reads.reserve(chunkSize * 2); + try { + nrOfReads += getReads(reads, readsFile, ifile, chunkSize, + currentLine, readWithN); + } catch (const exception& e) { + string er = e.what(); + er += " Did you provide a valid reads file?"; + throw runtime_error(er); + } + auto start = chrono::high_resolution_clock::now(); + + std::vector> matchesPerRead = {}; + matchesPerRead.reserve(reads.size()); + + numberMatchesPerRead.reserve(reads.size() + + numberMatchesPerRead.size()); + + for (unsigned int i = 0; i < reads.size(); i += 2) { - const auto& p = reads[i]; + const auto& p = reads[i]; - auto originalPos = p.first; - string read = p.second; - string revCompl = reads[i + 1].second; + auto originalPos = p.first; + string read = p.second; + string revCompl = reads[i + 1].second; - if (((i >> 1) - 1) % (8192 / (1 << ED)) == 0) { - cout << "Progress: " << i / 2 << "/" << reads.size() / 2 << "\r"; - cout.flush(); + if ((((i + 2 * nrOfReads - reads.size()) >> 1) - 1) % + (8192 / (1 << ED)) == + 0) { + cout << "Progress: " << (i + 2 * nrOfReads - reads.size()) / 2 + << " reads done.\r"; + cout.flush(); + } + + const auto& matches = mapper.matchApproxSFR(read, ED); + int nr_of_matches = matches.size(); + + totalNodes += index.getNodes(); + totalDBGNodes += index.getDBGNodes(); + // totalFilterSpecialCases += 
index.getFilterSpecialCases(); + // totalMatrixElements += index.getMatrixElements(); + allReportedMatches += index.getTotalReported(); + totalUniqueMatches += nr_of_matches; + // mappedReadsForward += !matches.empty(); + + // do the same for the reverse complement + const auto& matchesRevCompl = mapper.matchApproxSFR(revCompl, ED); + int nr_of_matchesRevCompl = matchesRevCompl.size(); + + totalNodes += index.getNodes(); + totalDBGNodes += index.getDBGNodes(); + // totalFilterSpecialCases += index.getFilterSpecialCases(); + // totalMatrixElements += index.getMatrixElements(); + allReportedMatches += index.getTotalReported(); + totalUniqueMatches += nr_of_matchesRevCompl; + // mappedReadsBackward += !matchesRevCompl.empty(); + + mappedReads += !(matchesRevCompl.empty() && matches.empty()); + + matchesPerRead.push_back(matches); + matchesPerRead.push_back(matchesRevCompl); + numberMatchesPerRead.push_back(nr_of_matches + + nr_of_matchesRevCompl); } - std::vector matches = mapper.matchApproxSFR(read, ED); - int nr_of_matches = matches.size(); - - totalNodes += index.getNodes(); - totalDBGNodes += index.getDBGNodes(); - totalFilterSpecialCases += index.getFilterSpecialCases(); - totalMatrixElements += index.getMatrixElements(); - allReportedMatches += index.getTotalReported(); - totalUniqueMatches += nr_of_matches; - mappedReadsForward += !matches.empty(); - - // do the same for the reverse complement - auto matchesRevCompl = mapper.matchApproxSFR(revCompl, ED); - int nr_of_matchesRevCompl = matchesRevCompl.size(); - - totalNodes += index.getNodes(); - totalDBGNodes += index.getDBGNodes(); - totalFilterSpecialCases += index.getFilterSpecialCases(); - totalMatrixElements += index.getMatrixElements(); - allReportedMatches += index.getTotalReported(); - totalUniqueMatches += nr_of_matchesRevCompl; - mappedReadsBackward += !matchesRevCompl.empty(); - - mappedReads += !(matchesRevCompl.empty() && matches.empty()); - - matchesPerRead.push_back(matches); - 
matchesPerRead.push_back(matchesRevCompl); - numberMatchesPerRead.push_back(nr_of_matches + nr_of_matchesRevCompl); + auto finish = chrono::high_resolution_clock::now(); + elapsed += finish - start; + + writeToOutputSFR(outputFile, matchesPerRead, reads, firstChunk, f2); + } + + if (readWithN) { + cout << "Caution, reads containing an N were ignored.\n"; } + f2.close(); + if (ED == 0) { allReportedMatches = totalUniqueMatches; } - auto finish = chrono::high_resolution_clock::now(); - chrono::duration elapsed = finish - start; - cout << "Progress: " << reads.size() << "/" << reads.size() << "\n"; + cout << "Progress: " << nrOfReads << "/" << nrOfReads << "\n"; cout << "Results for " << strategy->getName() << endl; cout << "Total duration: " << fixed << elapsed.count() << "s\n"; - cout << "Average no. index nodes: " << totalNodes / (double)(reads.size()) + cout << "Average no. index nodes: " << totalNodes / (double)(nrOfReads) << endl; cout << "Total no. index nodes: " << totalNodes << "\n"; cout << "Average no. unique node paths: " - << totalUniqueMatches / (double)(reads.size()) << endl; + << totalUniqueMatches / (double)(nrOfReads) << endl; cout << "Total no. unique node paths: " << totalUniqueMatches << "\n"; cout << "Average no. reported node paths: " - << allReportedMatches / (double)(reads.size()) << endl; + << allReportedMatches / (double)(nrOfReads) << endl; cout << "Total no. reported node paths: " << allReportedMatches << "\n"; cout << "Mapped reads :" << mappedReads << endl; cout << "Median number of unique node paths per read " << findMedian(numberMatchesPerRead, numberMatchesPerRead.size()) << endl; - cout << "Average no. graph nodes: " - << totalDBGNodes / (double)(reads.size()) << endl; + cout << "Average no. graph nodes: " << totalDBGNodes / (double)(nrOfReads) + << endl; cout << "Total no. graph nodes: " << totalDBGNodes << "\n"; // cout << "Total no. 
special filter cases: " << totalFilterSpecialCases // << "\n"; - writeToOutputSFR(outputFile, matchesPerRead, reads, index); return elapsed.count(); } template double doBenchSFI, FMPos>( - vector>& reads, FMIndexDBG& index, + FMIndexDBG& index, SearchStrategyDBG, FMPos>* strategy, string readsFile, length_t ED, string cpSparse, string outputFile); \ No newline at end of file diff --git a/src/benchmarking.h b/src/benchmarking.h index b56a43b..3ca3ff3 100644 --- a/src/benchmarking.h +++ b/src/benchmarking.h @@ -29,13 +29,11 @@ using namespace std; extern vector schemes; -template -void printMatches(vector matches, string text, bool printLine, - string duration, FMIndex& mapper, string name); - string getFileExt(const string& s); -vector> getReads(const string& file); +size_t getReads(vector>& reads, string& file, + ifstream& ifile, size_t chunkSize, string& line, + bool readWithN); double avgVec(vector const& v); // note: the average must not be an integer @@ -46,23 +44,21 @@ void writeToOutputSFI( const string& file, const vector, std::vector>>& mPerRead, - const vector>& reads); + const vector>& reads, bool& firstChunk, ofstream& f2); void writeToOutputSFR(const string& file, const std::vector>>& mPerRead, const vector>& reads, - FMIndexDBG& index); + bool& firstChunk, ofstream& f2); double findMedian(vector a, int n); template -double doBenchSFI(vector>& reads, T& mapper, - SearchStrategyDBG* strategy, +double doBenchSFI(T& mapper, SearchStrategyDBG* strategy, string readsFile, length_t ED, string cpSparse, std::string outputFile = ""); -double doBenchSFR(vector>& reads, - FMIndexDBG& mapper, +double doBenchSFR(FMIndexDBG& mapper, SearchStrategyDBG, FMPosSFR>* strategy, string readsFile, length_t ED, std::string cpSparse, std::string outputFile = ""); diff --git a/src/bitvec.h b/src/bitvec.h index f9f9973..5206c1e 100644 --- a/src/bitvec.h +++ b/src/bitvec.h @@ -217,6 +217,20 @@ class Bitvec { */ Bitvec(size_t N) : N(N), bv((N + 63) / 64, 0ull) { } + + /** + * 
@brief Clear the bitvector + * + */ + void clear() { + N = 0; + bv.clear(); + bv.resize(0); + bv.shrink_to_fit(); + counts.clear(); + counts.resize(0); + counts.shrink_to_fit(); + } }; // ============================================================================ @@ -237,11 +251,15 @@ class BitvecIntl { * Allocate memory for bv and counts */ void allocateMem() { - // free existing allocations - free(bv); - bv = NULL; - free(counts); - counts = NULL; + // free existing allocations if any + if (bv) { + free(bv); + bv = NULL; + } + if (counts) { + free(counts); + counts = NULL; + } if (N == 0) { // special case for N == 0 bvSize = countsSize = 0; @@ -422,6 +440,24 @@ class BitvecIntl { free(bv); free(counts); } + + /** + * @brief Clear the bitvector + * + */ + void clear() { + N = 0; + bvSize = 0; + countsSize = 0; + if (bv) { + free(bv); + bv = NULL; + } + if (counts) { + free(counts); + counts = NULL; + } + } }; // ============================================================================ @@ -902,176 +938,4 @@ class BitrefNConst { } }; -// ============================================================================ -// BIT VECTOR CLASS -// ============================================================================ - -class BitvecN { - - private: - size_t N; // number of elements - uint8_t len; // length of entries - size_t bitmask; - std::vector bv; // actual bitvector - // std::vector counts; // interleaved 1st and 2nd level counts - - public: - /** - * Get a bit reference at a certain position - * @param p Position - * @return Bit reference object - */ - BitrefN operator[](size_t p) { - assert(p < N); - if (len == 0) { - return BitrefN(std::vector(), bitmask, 0); - } - size_t w = p * len / 8; - uint8_t b = p * len % 8; - std::vector words = {&bv[w]}; - size_t extra = std::ceil((float)(len - (8 - b)) / (float)8); - for (size_t i = 1; i <= extra; i++) { - words.emplace_back(&bv[w + i]); - } - return BitrefN(words, bitmask, b); - } - - /** - * Get a bit reference 
at a certain position - * @param p Position - * @return Bit reference object - */ - const BitrefNConst operator[](size_t p) const { - assert(p < N); - if (len == 0) { - return BitrefNConst(std::vector(), bitmask, 0); - } - size_t w = p * len / 8; - uint8_t b = p * len % 8; - std::vector words = {&bv[w]}; - size_t extra = std::ceil((float)(len - (8 - b)) / (float)8); - for (size_t i = 1; i <= extra; i++) { - words.emplace_back(&bv[w + i]); - } - return BitrefNConst(words, bitmask, b); - } - - // /** - // * Create an index for the bitvector to support fast rank operations - // */ - // void index() { - // counts = std::vector((bv.size() + 7) / 4, 0ull); - - // size_t countL1 = 0, countL2 = 0; - // for (size_t w = 0, q = 0; w < bv.size(); w++) { - // if (w % 8 == 0) { // store the L1 counts - // countL1 += countL2; - // counts[q] = countL1; - // countL2 = __builtin_popcountll(bv[w]); - // q += 2; - // } else { // store the L2 counts - // counts[q - 1] |= (countL2 << (((w % 8) - 1) * 9)); - // countL2 += __builtin_popcountll(bv[w]); - // } - // } - // } - - // /** - // * Get the number of 1-bits within the range [0...p[ (preceding pos p) - // * @param p Position - // */ - // size_t rank(size_t p) const { - // assert(p < N); - // size_t w = p / 64; // word index - // size_t b = p % 64; // bit offset - // size_t q = (w / 8) * 2; // counts index - - // // add the first-level counts - // size_t rv = counts[q]; - - // // add the second-level counts - // int64_t t = (w % 8) - 1; - // rv += counts[q + 1] >> (t + (t >> 60 & 8)) * 9 & 0x1FF; - - // // add the popcount in the final word - // return rv + __builtin_popcountll((bv[w] << 1) << (63 - b)); - // } - - /** - * Write the bitvector to an open filestream - * @param ofs Open output filestream - */ - void write(std::ofstream& ofs) const { - ofs.write((char*)&len, sizeof(len)); - ofs.write((char*)bv.data(), bv.size() * sizeof(uint8_t)); - // ofs.write((char*)counts.data(), counts.size() * sizeof(size_t)); - } - - /** - * Read 
the bitvector from an open filestream - * @param ifs Open input filestream - */ - void read(std::ifstream& ifs, size_t size) { - N = size; - - ifs.read((char*)&len, sizeof(len)); - - bv.resize((N * len + 7) / 8); - ifs.read((char*)bv.data(), bv.size() * sizeof(uint8_t)); - - bitmask = std::pow(2, len) - 1; - - // counts.resize((bv.size() + 7) / 4); - // ifs.read((char*)counts.data(), counts.size() * sizeof(size_t)); - } - - /** - * Return the number of elements of the bitvector - * @return The number of elements of the bitvector - */ - size_t nrOfElements() const { - return N; - } - - /** - * Return the length of the elements of the bitvector - * @return The length of the elements of the bitvector - */ - uint8_t elementLength() const { - return len; - } - - /** - * Return the size of the bitvector - * @return The size of the bitvector - */ - size_t size() const { - return N * len; - } - - /** - * Default constructor, move constructor and move assignment operator - */ - BitvecN() : N(0), len(0){}; - - // // TODO do we need the code below? 
It gives errors - // BitvecN(BitvecN&& rhs) = default; - // BitvecN& operator=(BitvecN&& rhs) = default; - - // /** - // * Deleted copy constructor and copy assignment operator - // */ - // BitvecN(const BitvecN&) = delete; - // BitvecN& operator=(const BitvecN&) = delete; - - /** - * Constructor - * @param N Number of bits in the bitvector - */ - BitvecN(size_t N) - : N(N), len(std::ceil(std::log2(N))), bitmask(std::pow(2, len) - 1), - bv((N * len + 7) / 8, 0) { - } -}; - #endif \ No newline at end of file diff --git a/src/buildDBG.cpp b/src/buildDBG.cpp index c3a5492..113e0f2 100644 --- a/src/buildDBG.cpp +++ b/src/buildDBG.cpp @@ -22,43 +22,73 @@ #include "fmindexDBG.h" +#include + using namespace std; typedef uint64_t length_t; void showUsage() { - cout << "Usage: ./nexusBuild \n\n"; - - cout << "Following input files are required:\n"; - cout << "\t.txt: input text T\n\n"; - - cout << "Following parameters are required:\n"; - cout << "\t is the de Bruijn parameter\n\n"; - - cout << " [options]\n"; - cout << " -s --sa-sparseness\tSuffix array sparseness factors to be " - "used. This option can be repeated multiple times for multiple " - "versions of the suffix array. This option takes values in {1, 2, " - "4, 8, 16, 32, 64, 128, 256}. Use \"all\" to use all options. " - "[default = 1]\n"; - cout << " -c --cp-sparseness\tSparseness factor that indicates " - "how many checkpoints must be stored to identify nodes. This " - "option can be repeated multiple times for multiple " - "versions of the checkpoint sparseness. Use \"none\" to use no " - "checkpoints. 
" - "[default = 128]\n"; - cout << " -p --progress\tReport extra progress updates\n"; + cout + << "This program constructs a new implicit pan-genome de Bruijn\n" + "graph, along with the underlying bidirectional FM-index.\n\n\n" + + "Usage: ./nexusBuild \n\n" + + " Following input parameters are required:\n" + " base filename of the input text\n" + " the de Bruijn parameter: a " + "comma-separated\n" + " list of integers is required (e.g.,\n" + " 20,21,23)\n\n" + + " Following input files are required:\n" + " .txt: input text with all genomes readily\n" + " concatenated, containing only the " + "following\n" + " characters: A, C, G, T, % and $ (at the\n" + " very end). No newlines are allowed.\n\n\n" + + " [options]\n" + " --skip Skip the building process of the data\n" + " structures that are independent of the de\n" + " Bruijn k parameter (i.e., the " + "bidirectional\n" + " FM-index). These data structures must be\n" + " available in the directory.\n\n" + " -s/--sa-sparseness Suffix array sparseness factors to be\n" + " used. This option can be repeated " + "multiple\n" + " times for multiple versions of the suffix\n" + " array. This option takes values in {1, 2,\n" + " 4, 8, 16, 32, 64, 128, 256}. Use \"all\"\n" + " to use all options. [default = 16]\n\n" + " -c/--cp-sparseness Sparseness factor that indicates how many\n" + " checkpoints must be stored to identify\n" + " nodes. This option can be repeated\n" + " multiple times for multiple versions of\n" + " the checkpoint sparseness. Use \"none\" " + "to\n" + " use no checkpoints. 
[default = 128]\n\n" + " -p --progress Report extra progress updates\n\n\n"; } -bool parseArguments(int argc, char* argv[], string& baseFN, uint& k, - vector& saSF, vector& cpSF, bool& progress) { +bool parseArguments(int argc, char* argv[], string& baseFN, vector& k, + vector& saSF, vector& cpSF, bool& progress, + bool& skip) { const int reqArguments = 2; progress = false; - if (argc == 2 && strcmp("help", argv[1]) == 0) { - return false; + + if (argc == 2) { + string firstArg(argv[1]); + if (firstArg.find("help") != std::string::npos) { + showUsage(); + return EXIT_SUCCESS; + } } + if (argc <= reqArguments) { - cerr << "Fatal error: insufficient number of arguments" << endl; + cerr << "Fatal error: insufficient number of arguments.\n" << endl; return false; } // process optional arguments @@ -107,6 +137,8 @@ bool parseArguments(int argc, char* argv[], string& baseFN, uint& k, i++; } else if (((arg == "-p") || (arg == "--progress"))) { progress = true; + } else if (((arg == "--skip"))) { + skip = true; } else { cerr << "Unknown argument: " << argv[i] << endl; return false; @@ -114,7 +146,7 @@ bool parseArguments(int argc, char* argv[], string& baseFN, uint& k, } if (saSF.empty()) { - saSF.push_back(1); + saSF.push_back(16); } if (cpSF.empty()) { @@ -128,31 +160,39 @@ bool parseArguments(int argc, char* argv[], string& baseFN, uint& k, cpSF.erase(unique(cpSF.begin(), cpSF.end()), cpSF.end()); baseFN = argv[argc - 2]; - k = atoi(argv[argc - 1]); + string k_list = argv[argc - 1]; + + std::stringstream ss(k_list); + string tmp; + while (getline(ss, tmp, ',')) { + k.push_back(stoi(tmp)); + } return true; } int main(int argc, char* argv[]) { string baseFilename; - uint k; + vector k; vector saSF = {}; vector cpSF = {}; bool progress; + bool skip = false; - if (!parseArguments(argc, argv, baseFilename, k, saSF, cpSF, progress)) { + if (!parseArguments(argc, argv, baseFilename, k, saSF, cpSF, progress, + skip)) { showUsage(); return EXIT_FAILURE; } cout << "Welcome to 
Nexus!\n"; cout << "Alphabet size is " << ALPHABET - 1 << " + 1\n"; - cout << "k is " << k << "\n"; + // cout << "k is " << k << "\n"; try { // cout << "Start creation of BWT approximate matcher" << endl; FMIndexDBG::buildFMIndexDBG(baseFilename, k, saSF, cpSF, - progress); + progress, skip); } catch (const std::exception& e) { cerr << "Fatal error: " << e.what() << endl; return EXIT_FAILURE; diff --git a/src/buildIndexAuxiliary.cpp b/src/buildIndexAuxiliary.cpp index b6e233c..32cebad 100644 --- a/src/buildIndexAuxiliary.cpp +++ b/src/buildIndexAuxiliary.cpp @@ -26,10 +26,10 @@ using namespace std; -void readText(const string& filename, string& buf) { - ifstream ifs(filename); +void readTextOriginal(const string& filenamebase, string& buf) { + ifstream ifs(filenamebase + ".txt"); if (!ifs) - throw runtime_error("Cannot open file: " + filename); + throw runtime_error("Cannot open file: " + filenamebase + ".txt"); ifs.seekg(0, ios::end); buf.resize(ifs.tellg()); diff --git a/src/buildIndexAuxiliary.h b/src/buildIndexAuxiliary.h index 22be0dd..403bee5 100644 --- a/src/buildIndexAuxiliary.h +++ b/src/buildIndexAuxiliary.h @@ -29,7 +29,7 @@ typedef uint64_t length_t; -void readText(const std::string& filename, std::string& buf); +void readTextOriginal(const std::string& filename, std::string& buf); void readSATextMode(const std::string& filename, std::vector& sa, size_t saSizeHint); diff --git a/src/bwtrepr.h b/src/bwtrepr.h index 71d027f..d32daf1 100644 --- a/src/bwtrepr.h +++ b/src/bwtrepr.h @@ -27,6 +27,7 @@ #include "alphabet.h" #include "bitvec.h" +#include "encodedtext.h" #include "selectinterface.h" // ============================================================================ @@ -73,6 +74,30 @@ class BWTRepr { // e.g. 
S = 5 for DNA (A,C,G,T + $) bv.index(); } + /** + * Constructor + * @param sigma Alphabet + * @param BWT Encoded Burrows-Wheeler transformation + */ + BWTRepr(const Alphabet& sigma, const EncodedText& BWT) + : bv(BWT.size() + 1), dollarPos(BWT.size()) { + // The $-character (cIdx == 0) is not encoded in the bitvector. + // Hence, use index cIdx-1 in the bitvector. + + for (size_t i = 0; i < BWT.size(); i++) { + if (BWT[i] == 0) { + // smallest character is sentinel + dollarPos = i; + continue; + } + + for (size_t cIdx = BWT[i]; cIdx < S; cIdx++) + bv(cIdx - 1, i) = true; + } + + bv.index(); + } + /** * Get occurrence count of character c in the range BWT[0...k[ * @param cIdx Character index @@ -135,6 +160,14 @@ class BWTRepr { // e.g. S = 5 for DNA (A,C,G,T + $) return true; } + + /** + * @brief Clear the bit vectors + * + */ + void clear() { + bv.clear(); + } }; template // S is the size of the alphabet (including '$') diff --git a/src/createStyles.cpp b/src/createStyles.cpp index 8d0df76..acd35a8 100644 --- a/src/createStyles.cpp +++ b/src/createStyles.cpp @@ -31,12 +31,21 @@ void showUsage() { cout << "This program creates a Cytoscape styles file that can be used to " "visualize subgraphs. 
The result wil be written to the file " "PanGenomeSubgraph.xml in this directory.\n\n"; - cout << "Usage: ./createStyles numberOfStrains\n\n"; + + cout << "Usage: ./createStyles \n\n"; } int main(int argc, char* argv[]) { int requiredArguments = 1; // number of strains + if (argc == 2) { + string firstArg(argv[1]); + if (firstArg.find("help") != std::string::npos) { + showUsage(); + return EXIT_SUCCESS; + } + } + if (argc != requiredArguments + 1) { std::cerr << "Error: one argument is required: the number of strains " "in the pan-genome.\n" @@ -46,11 +55,6 @@ int main(int argc, char* argv[]) { } std::string parameter = argv[1]; - if (parameter.find("help") != std::string::npos) { - showUsage(); - return EXIT_SUCCESS; - } - uint32_t nr_of_strains = 0; try { diff --git a/src/encodedtext.h b/src/encodedtext.h new file mode 100644 index 0000000..61bd69b --- /dev/null +++ b/src/encodedtext.h @@ -0,0 +1,331 @@ +/****************************************************************************** + * Columba: Approximate Pattern Matching using Search Schemes * + * Copyright (C) 2020-2022 - Luca Renders and * + * Jan Fostier * + * * + * This program is free software: you can redistribute it and/or modify * + * it under the terms of the GNU Affero General Public License as * + * published by the Free Software Foundation, either version 3 of the * + * License, or (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU Affero General Public License for more details. * + * * + * You should have received a copy of the GNU Affero General Public License * + * along with this program. If not, see . 
* + ******************************************************************************/ + +#ifndef ENCODEDTEXT_H +#define ENCODEDTEXT_H + +#include "alphabet.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +/** + * ENCODEDTEXT CLASS + * + * Encodes a string where each character takes up B = ceil(log2(S)) bits + */ +template // S is the size of the alphabet (including '$') +class EncodedText { + + private: + // the number of bits per character + const static uint64_t B; + // bitmask where the first B bits are 1, and other bits are 0 + const static uint64_t bitmask; + // 64-bit word where the bit at index i indicates whether a symbol starting + // at index i overflows into the next word + const static uint64_t hasOverflowBits; + // masks for overflow (either all 0's or all 1's) + const static std::array overflowMasks; + + // the encoded text, where each symbol takes up B bits + std::vector encodedText; + // Size (in symbols) of the text + size_t tSize; + + /** + * Helper function to get a 64-bit containing either all 1's or all 0's + * @param index the index in the word + * @returns all 1's if a symbol starting at index overflows into next word, + * all 0's if the symbol starting at index does not overflow into the next + * word + */ + static uint64_t hasOverflow(uint64_t index) { + assert(index < 64); + // A) get the value at the correct bit + uint64_t maskIndex = + ((1ull << (63 - index)) & hasOverflowBits) >> (63 - index); + // B) return the mask + return overflowMasks[maskIndex]; + } + + /** + * Encode the letter with charIndex at place index in the text into the + * bitvector + * @param charIndex the index of the character in the alphabet + * @param index the index of this character in the text + */ + void encodeLetter(const uint64_t charIndex, const uint64_t index) { + + uint64_t bits = charIndex; + + // A) find word index of first bit + uint64_t w = (index * B) / 64; + + // B) find bit index of first bit + uint64_t b = (index 
* B) % 64; + + // C) split bits in bits for w1 and bits for w2 + + // find the mask for the bits that are in the second word + uint64_t mask = hasOverflow(b) & ((-1ull) ^ (-1ull << (B - (64 - b)))); + + // create the bits to be set in the second word + uint64_t bits2 = bits & mask; + + // indicates the number of bits in the next word + uint64_t maskSize = __builtin_popcountll(mask); + // shift the bits of 1st word to the right (shift out bits for 2nd word) + bits >>= maskSize; + + // D) add bits to word 1 + uint64_t startLocation = 64 - B + maskSize; + uint64_t shift = startLocation - b; + encodedText[w] |= (bits << shift); + + // E) add bits to word 2 (in case of overflow) + uint64_t shift2 = 64 - maskSize; + encodedText[w + 1] |= (bits2 << shift2); + } + + public: + /** + * Default constructor + */ + EncodedText() { + tSize = 0; + } + + /** + * Constructor: encodes the text given the alphabet + * @param sigma the alphabet to use + * @param text the text to encode + */ + EncodedText(const Alphabet& sigma, const std::string& text) + : tSize(text.size()) { + encodedText.resize(((text.size() * B) / 64) + 1); + + for (uint64_t i = 0; i < text.size(); i++) { + encodeLetter(sigma.c2i(text[i]), i); + } + } + + /** + * Constructor for a text filled with first character of alphabet (= symbol + * with only 0 bits) + * @param size the size of the text + */ + EncodedText(const uint64_t size) : tSize(size) { + encodedText.resize(((size * B) / 64) + 1); + } + + // ---------------------------------------------------------------------------- + // DECODING: convert symbolindex to character + // ---------------------------------------------------------------------------- + + /** + * Gets the letter at index in the text + * @param sigma the alphabet + * @param index the index in the text + * @returns a char which contains the character at index index in the + * original text according to the passed alphabet + */ + char decodeLetter(const Alphabet& sigma, const uint64_t index) const 
{ + uint64_t cIdex = getEncodedLetter(index % tSize); + return sigma.i2c(cIdex); + } + + /** + * Decode the entire text + * @param sigma the alphabet + * @returns the decoded text + */ + std::string decodeText(const Alphabet& sigma) const { + std::string text; + text.resize(tSize); + + for (uint64_t i = 0; i < tSize; i++) { + text[i] = decodeLetter(sigma, i); + } + + return text; + } + + /** + * Decode the entire text + * @param sigma the alphabet + * @returns the original text + */ + std::string decodeSubstring(const Alphabet& sigma, length_t begin, + length_t end) const { + std::string text; + text.resize(end - begin); + + for (uint64_t i = begin; i < end; i++) { + text[i - begin] = decodeLetter(sigma, i); + } + + return text; + } + + // ---------------------------------------------------------------------------- + // ACCESS OPERATIONS + // ---------------------------------------------------------------------------- + + /** + * @returns the size (in characters) of the original text + */ + size_t size() const { + return tSize; + } + + /** + * Operator overloading, gets the characterindex (according to the used + * alphabet) of the character that was present at index index in the + * original text + * @param index the index to find the character of in the original text + */ + uint64_t operator[](const uint64_t index) const { + // A) find word index of first bit of this symbol + uint64_t w = (index * B) / 64; + + // B) find bit index of first bit of this symbol + uint64_t b = (index * B) % 64; + + // C) get the bits of the symbol that are in the first word + uint64_t bits = (encodedText[w] & (bitmask >> b)) << b; + + // D) get bitmask for bits that flow over into next word + uint64_t mask = hasOverflow(b) & ((-1ull) ^ (-1ull >> (B - (64 - b)))); + + // E) get bits in next word + uint64_t bitsNext = encodedText[w + 1] & mask; + + return (bits >> (64 - B)) + + (bitsNext >> (64 - __builtin_popcountll(mask))); + } + + /** + * @brief Set the letter with charIndex at 
place index in the text into the + * bitvector + * + * @param index the index of this character in the text + * @param charIndex the index of the character in the alphabet + */ + void set(const uint64_t index, const uint64_t charIndex) { + encodeLetter(charIndex, index); + } + + /** + * Gets the characterindex (according to the used + * alphabet) of the character that was present at index index in the + * original text + * @param index the index to find the character of in the original text + */ + uint64_t getEncodedLetter(const uint64_t index) const { + return operator[](index); + } + + // ---------------------------------------------------------------------------- + // I/O operations + // ---------------------------------------------------------------------------- + + /** + * Write encoded text to disk + * @param filename File name + */ + void write(const std::string& filename) { + std::ofstream ofs(filename); + if (!ofs) + throw std::runtime_error("Cannot open file: " + filename); + + ofs.write((char*)&tSize, sizeof(size_t)); + size_t vectorSize = encodedText.size(); + ofs.write((char*)&vectorSize, sizeof(size_t)); + ofs.write((char*)encodedText.data(), + encodedText.size() * sizeof(uint64_t)); + } + + /** + * Load encoded text from disk + * @param filename File name + */ + bool read(const std::string& filename) { + std::ifstream ifs(filename); + if (!ifs) + return false; + + ifs.read((char*)&tSize, sizeof(size_t)); + size_t vectorSize; + ifs.read((char*)&vectorSize, sizeof(size_t)); + encodedText.resize(vectorSize); + ifs.read((char*)&encodedText[0], encodedText.size() * sizeof(uint64_t)); + + return true; + } + + /** + * @brief Clear the text (free up memory) + * + */ + void clear() { + encodedText.clear(); + encodedText.resize(0); + encodedText.shrink_to_fit(); + tSize = 0; + } + + /** + * @brief Resize the text + * + * @param size new size + */ + void resize(const size_t size) { + tSize = size; + encodedText.resize(((size * B) / 64) + 1); + } + + /** + * 
@brief Check whether the text is empty + * + * @return true if empty + * @return false otherwise + */ + bool empty() const { + return tSize == 0; + } +}; + +template +const std::array EncodedText::overflowMasks = {0ull, -1ull}; +template const uint64_t EncodedText::B = ceil(log2(S)); +template +const uint64_t EncodedText::bitmask = (-1ull) ^ (-1ull >> B); +template +const uint64_t EncodedText::hasOverflowBits = + ~(-1ull << (B - 1)); // the bit at index i indicates whether a symbol + // starting at index i overflows into the next word + +#endif \ No newline at end of file diff --git a/src/fmindex.cpp b/src/fmindex.cpp index b00edd5..af0d1fa 100644 --- a/src/fmindex.cpp +++ b/src/fmindex.cpp @@ -56,7 +56,23 @@ thread_local ExtraCharPtr FMIndex::extraChar; // ---------------------------------------------------------------------------- template -void FMIndex::fromFiles(const string& baseFile, bool verbose) { +void FMIndex::fromFiles(const string& baseFile, bool verbose, + bool strainFree) { + + if (verbose) { + + // read the text + cout << "Reading " << baseFile << ".compressed.txt" + << "..."; + cout.flush(); + } + + if (!text.read(baseFile + ".compressed.txt")) { + throw runtime_error("Problem reading: " + baseFile + ".compressed.txt"); + } + + textLength = text.size(); + // (text[text.size() - 1] == '\n') ? text.size() - 1 : text.size(); if (verbose) { cout << "Reading in files with baseFile " << baseFile << endl; @@ -74,6 +90,7 @@ void FMIndex::fromFiles(const string& baseFile, bool verbose) { // TODO why not in construction process?
length_t cumCount = 0; // cumulative character counts + counts.clear(); for (size_t i = 0; i < charCounts.size(); i++) { if (charCounts[i] == 0) continue; @@ -86,20 +103,20 @@ void FMIndex::fromFiles(const string& baseFile, bool verbose) { // read the BWT cout << "Reading " << baseFile << ".bwt" << endl; } - if (!readText(baseFile + ".bwt", bwt)) { + if (!bwt.read(baseFile + ".bwt")) { throw runtime_error("Cannot open file: " + baseFile + ".bwt"); } - textLength = (bwt[bwt.size() - 1] == '\n') ? bwt.size() - 1 : bwt.size(); - if (verbose) { cout << "Done reading BWT (size: " << bwt.size() << ")" << endl; // read the reverse BWT cout << "Reading " << baseFile << ".rev.bwt" << endl; } - if (!readText(baseFile + ".rev.bwt", revbwt)) { - throw runtime_error("Cannot open file: " + baseFile + ".rev.bwt"); + if (strainFree) { + if (!revbwt.read(baseFile + ".rev.bwt")) { + throw runtime_error("Cannot open file: " + baseFile + ".rev.bwt"); + } } if (verbose) { cout << "Done reading reverse BWT (size: " << revbwt.size() << ")" @@ -191,12 +208,12 @@ length_t FMIndex::getTotalReportedNodePaths() const { template length_t FMIndex::findLF(length_t k, bool reversed) const { if (reversed) { - length_t posInAlphabet = sigma.c2i((unsigned char)revbwt[k]); + length_t posInAlphabet = revbwt[k]; return counts[posInAlphabet] + getNumberOfOccRev(posInAlphabet, k); } - length_t posInAlphabet = sigma.c2i((unsigned char)bwt[k]); + length_t posInAlphabet = bwt[k]; return counts[posInAlphabet] + getNumberOfOcc(posInAlphabet, k); } @@ -904,7 +921,7 @@ void FMIndex::recApproxMatchHamming( template bool FMIndex::extendFMPosIntermediary( const SARangePair& parentRanges, vector>& stack, - int row, length_t i, int trueDepth) { + int row, length_t i, int trueDepth) const { SARangePair pairForNewChar; @@ -923,7 +940,7 @@ bool FMIndex::extendFMPosIntermediary( template void FMIndex::extendFMPos(const SARangePair& parentRanges, vector>& stack, - int row, int trueDepth) { + int row, int trueDepth) const 
{ // iterate over the entire alphabet for (length_t i = 2; i < sigma.size(); i++) { @@ -934,7 +951,7 @@ void FMIndex::extendFMPos(const SARangePair& parentRanges, template void FMIndex::extendFMPos( - const positionClass& pos, vector>& stack) { + const positionClass& pos, vector>& stack) const { extendFMPos(pos.getRanges(), stack, pos.getDepth()); } diff --git a/src/fmindex.h b/src/fmindex.h index e77b1da..e4e825b 100644 --- a/src/fmindex.h +++ b/src/fmindex.h @@ -28,6 +28,7 @@ #include "buildIndexAuxiliary.h" #include "bwtrepr.h" #include "cluster.h" +#include "encodedtext.h" #include "search.h" #include "suffixarray.h" #include "textoccurrence.h" @@ -63,7 +64,7 @@ template class FMIndex { // The length of the reference text length_t textLength; // The reference text - std::string text; + EncodedText text; // The number of separation characters int numberOfSeparationCharacters = 1; @@ -74,9 +75,9 @@ template class FMIndex { // bidirectional FM-index data structures // the bwt string of the reference genome - std::string bwt; + EncodedText bwt; // the bwt string of the reverse reference genome - std::string revbwt; + EncodedText revbwt; // the counts array of the reference genome std::vector counts; // the (sparse) suffix array of the reference genome @@ -148,8 +149,10 @@ template class FMIndex { * * @param baseFile the baseFile of the files that will be read in * @param verbose if true the steps will be written to cout + * @param strainFree bool indicating whether strainfree matching is required */ - void fromFiles(const std::string& baseFile, bool verbose); + void fromFiles(const std::string& baseFile, bool verbose, + bool strainFree = false); /** * @brief Read a binary file and stores content in array (e.g. 
suffix array) @@ -205,17 +208,6 @@ template class FMIndex { // ROUTINES FOR ACCESSING DATA STRUCTURE // ---------------------------------------------------------------------------- - /** - * @brief Finds the LF mapping of the character at index k in the bwt string - * - * @param k the index to find the LF mapping of - * @param reversed Only present for backwards compatibility. Throws an error - * if true. Defaults to false. - * @return length_t - the row that is the LF mapping of k. It is so that the - * entry in the suffix array of this return value is one less than the entry - * in the suffix array at index k - */ - /** * @brief Finds the LF mapping of the character at index k in the (reverse) * bwt string @@ -314,40 +306,6 @@ template class FMIndex { */ Range matchString(const std::string& s); - // ---------------------------------------------------------------------------- - // HELP ROUTINES FOR APPROXIMATE PATTERN MATCHING - // ---------------------------------------------------------------------------- - - /** - * @brief Find the ranges of cP using the principle explained in the paper - * of Lahm - * - * @param positionInAlphabet the position in the alphabet of the character - * that is added in the front - * @param rangesOfP the ranges of pattern P - * @param childRanges the ranges cP, this will be overwritten - * @return true if the new ranges are not empty - * @return false otherwise - */ - bool findRangesWithExtraCharBackward(length_t positionInAlphabet, - const SARangePair& rangesOfP, - SARangePair& childRanges) const; - - /** - * @brief Find the ranges of Pc using the principle explained in the paper - * of Lahm - * - * @param positionInAlphabet the position in the alphabet of the character - * that is added in the back - * @param rangesOfP the ranges of pattern P - * @param childRanges the ranges Pc, this will be overwritten - * @return true if the new ranges are not empty - * @return false otherwise - */ - bool findRangesWithExtraCharForward(length_t 
positionInAlphabet, - const SARangePair& rangesOfP, - SARangePair& childRanges) const; - // ---------------------------------------------------------------------------- // HELPER ROUTINES FOR APPROXIMATE MATCHING (ITERATIVELY) // ---------------------------------------------------------------------------- @@ -368,7 +326,7 @@ template class FMIndex { * @param lowerBound the lowerbound for this partition * @param descendantsOtherD the descendants of the other direction, defaults * to empty vector - * @param initEdsOtherDthe initialization eds of the other direction, + * @param initEdsOtherD the initialization eds of the other direction, * defaults to empty vector * @param remainingDesc the remaining descendants on the current branch, * that are already created but aren't checked yet and need to be checked @@ -444,7 +402,7 @@ template class FMIndex { /** * Updates the node path stacks according to the current position. The new * path is always either a subpath of the current path or the current path - * wiht one extra node. The entire path is reversed(leftNodes) + rightNodes + * with one extra node. 
The entire path is reversed(leftNodes) + rightNodes * @param pos the current position * @param leftNodes the left nodepath (from origin to leftmost point) * @param rightNodes the right node path (from origin to rightmost point) @@ -470,7 +428,7 @@ template class FMIndex { */ bool extendFMPosIntermediary(const SARangePair& parentRanges, std::vector>& stack, - int row, length_t i, int trueDepth = -1); + int row, length_t i, int trueDepth = -1) const; /** * @brief Pushes all the children corresponding to the node with ranges @@ -484,7 +442,7 @@ template class FMIndex { */ virtual void extendFMPos(const SARangePair& ranges, std::vector>& stack, - int row = 0, int trueDepth = -1); + int row = 0, int trueDepth = -1) const; /** * @brief Pushes all the children corresponding to the this position onto @@ -494,7 +452,7 @@ template class FMIndex { * @param stack the stack to push the children on */ virtual void extendFMPos(const positionClass& pos, - std::vector>& stack); + std::vector>& stack) const; /** * @brief Converts a match in the suffix array to matches in the text @@ -568,11 +526,14 @@ template class FMIndex { * @param sa_sparse sparseness factor of suffix array. 
It is assumed this is * a power of two * @param verbose will write to cout + * @param strainFree bool indicating whether strain free matching is + * required */ - FMIndex(const std::string& baseFile, int sa_sparse = 1, bool verbose = true) + FMIndex(const std::string& baseFile, int sa_sparse = 1, bool verbose = true, + bool strainFree = false) : baseFile(baseFile), sparseSA(baseFile, sa_sparse) { // read in files - fromFiles(baseFile, verbose); + fromFiles(baseFile, verbose, strainFree); // populate table populateTable(verbose); @@ -605,11 +566,11 @@ template class FMIndex { /** * @brief Get a reference to the original text * - * @return const std::string& - a reference to the original text + * @return const EncodedText& - a reference to the original text */ - const std::string& getText() { + const EncodedText& getText() { if (text.empty()) { - if (!readText(baseFile + ".txt", text)) { + if (!text.read(baseFile + ".compressed.txt")) { throw std::runtime_error("Problem reading: " + baseFile + ".txt"); } @@ -696,6 +657,33 @@ template class FMIndex { return SARangePair(); } + /** + * @brief Get the text length + * + * @return length_t - text length + */ + const length_t& getTextLength() const { + return textLength; + } + + /** + * @brief Get the counts + * + * @return std::vector& - counts + */ + const std::vector& getCounts() const { + return counts; + } + + /** + * @brief Get the alphabet + * + * @return const Alphabet - the alphabet + */ + const Alphabet getSigma() const { + return sigma; + } + // ---------------------------------------------------------------------------- // ROUTINES FOR EXACT MATCHING // ---------------------------------------------------------------------------- @@ -774,6 +762,40 @@ template class FMIndex { } } + // ---------------------------------------------------------------------------- + // HELP ROUTINES FOR APPROXIMATE PATTERN MATCHING + // ---------------------------------------------------------------------------- + + /** + * @brief 
Find the ranges of cP using the principle explained in the paper + * of Lam + * + * @param positionInAlphabet the position in the alphabet of the character + * that is added in the front + * @param rangesOfP the ranges of pattern P + * @param childRanges the ranges cP, this will be overwritten + * @return true if the new ranges are not empty + * @return false otherwise + */ + bool findRangesWithExtraCharBackward(length_t positionInAlphabet, + const SARangePair& rangesOfP, + SARangePair& childRanges) const; + + /** + * @brief Find the ranges of Pc using the principle explained in the paper + * of Lam + * + * @param positionInAlphabet the position in the alphabet of the character + * that is added in the back + * @param rangesOfP the ranges of pattern P + * @param childRanges the ranges Pc, this will be overwritten + * @return true if the new ranges are not empty + * @return false otherwise + */ + bool findRangesWithExtraCharForward(length_t positionInAlphabet, + const SARangePair& rangesOfP, + SARangePair& childRanges) const; + // ---------------------------------------------------------------------------- // ROUTINES FOR APPROXIMATE MATCHING // ---------------------------------------------------------------------------- @@ -793,7 +815,7 @@ template class FMIndex { length_t maxED); /** - * @brief Private elper function for the naive approximate pattern matching + * @brief Private helper function for the naive approximate pattern matching * method * * @param pattern the pattern to match @@ -838,7 +860,7 @@ template class FMIndex { * for eliminating redundancy in the edit distance metric * * @param search the search to follow - * @param startMatch the approximate match found for all previous partions + * @param startMatch the approximate match found for all previous partitions * of the search * @param occ a vector with matches of the complete search, if such a match * is found is a pushed upon this vector @@ -878,7 +900,7 @@ template class FMIndex { * to start a 
search for the next part * * @param s the search to follow - * @param startMatch the approximate match found for all previous partions + * @param startMatch the approximate match found for all previous partitions * of the search * @param occ a vector with matches of the complete search, if such a match * is found is a pushed upon this vector diff --git a/src/fmindexDBG.cpp b/src/fmindexDBG.cpp index 320fb14..66cfc8c 100644 --- a/src/fmindexDBG.cpp +++ b/src/fmindexDBG.cpp @@ -21,7 +21,9 @@ ******************************************************************************/ #include "fmindexDBG.h" -#include "../radixSA64/radix.h" +#include "divsufsort64.h" +#include "longestCommonPrefix.h" +#include "radix.h" #include #include @@ -52,25 +54,28 @@ void FMIndexDBG::createFMIndex( // read the text file from disk std::cout << "Reading " << baseFN << ".txt..." << std::endl; - readText(baseFN + ".txt", this->text); + // Store the full text in a temporary buffer + string buf; + readTextOriginal(baseFN, buf); // Find the length of the original text - this->textLength = (this->text[this->text.size() - 1] == '\n') - ? this->text.size() - 1 - : this->text.size(); + textLength = (buf[buf.size() - 1] == '\n') ? buf.size() - 1 : buf.size(); + + bool newLine = false; // Remove return character if necessary - if (this->textLength != this->text.size()) { + if (textLength != buf.size()) { + newLine = true; std::cout << "WARNING: the input text contained a tailing return, which was " "removed." 
<< std::endl; - this->text = this->text.substr(0, this->textLength); + buf = buf.substr(0, textLength); } // count the frequency of each characters in T std::vector charCounts(256, 0); - for (char c : this->text) + for (char c : buf) charCounts[(unsigned char)c]++; // count the number of unique characters in T @@ -79,7 +84,7 @@ void FMIndexDBG::createFMIndex( if (count > 0) nUniqueChar++; - std::cout << "\tText has length " << this->textLength << "\n"; + std::cout << "\tText has length " << textLength << "\n"; std::cout << "\tText has " << nUniqueChar << " unique characters\n"; if (nUniqueChar > ALPHABET) { @@ -96,7 +101,12 @@ void FMIndexDBG::createFMIndex( } // Create the alphabet - this->sigma = Alphabet(charCounts); + sigma = Alphabet(charCounts); + + text = EncodedText(sigma, buf); + text.write(baseFN + ".compressed.txt"); + + std::cout << "Wrote file " << baseFN << ".compressed.txt\n"; // write the character counts table { @@ -113,148 +123,137 @@ void FMIndexDBG::createFMIndex( for (size_t i = 0; i < charCounts.size(); i++) { if (charCounts[i] == 0) continue; - this->counts.push_back(cumCount); + counts.push_back(cumCount); cumCount += charCounts[i]; } // build the SA - std::cout << "Generating the suffix array using radixSA64...\n"; + std::cout << "Generating the suffix array using divsufsort64...\n"; clock_t startTime = clock(); - length_t* radixSA = - Radix((uchar*)&(this->text)[0], this->textLength, 0).build(); - - std::vector SA(radixSA, radixSA + this->textLength); - - delete[] radixSA; + int64_t* SA = (int64_t*)malloc((size_t)textLength * sizeof(int64_t)); + divsufsort64((uchar*)&(buf)[0], SA, textLength); clock_t endTime = clock(); - float mseconds = clock_diff_to_msec(endTime - startTime); - printf("radixSA64 took [%.2fs]\n", mseconds / 1000.0); + float milliseconds = clock_diff_to_msec(endTime - startTime); + printf("divsufsort64 took [%.2fs]\n", milliseconds / 1000.0); // perform a sanity check on the suffix array std::cout << "\tPerforming sanity 
checks..." << std::endl; - sanityCheck(this->text, SA); - std::cout << "\tSanity checks OK" << std::endl; + // sanityCheck(text, SA); + // std::cout << "\tSanity checks OK" << std::endl; + + buf.clear(); + buf.resize(0); + buf.shrink_to_fit(); // build the BWT std::cout << "Generating BWT..." << std::endl; - this->bwt = std::string(this->textLength, '\0'); - for (size_t i = 0; i < SA.size(); i++) + bwt.resize(textLength); + for (size_t i = 0; i < textLength; i++) if (SA[i] > 0) - this->bwt[i] = this->text[SA[i] - 1]; + bwt.set(i, text[SA[i] - 1]); else - this->bwt[i] = this->text.back(); - - std::ofstream ofs(baseFN + ".bwt"); - ofs.write((char*)this->bwt.data(), this->bwt.size()); - ofs.close(); + bwt.set(i, text[textLength - 1]); + // Write bwt + bwt.write(baseFN + ".bwt"); std::cout << "Wrote file " << baseFN << ".bwt\n"; + // create succinct BWT bitvector table + fwdRepr = BWTRepr(sigma, bwt); + fwdRepr.write(baseFN + ".brt"); + std::cout << "Wrote file: " << baseFN << ".brt" << std::endl; + + // Count the number of strains + numberOfStrains = fwdRepr.occ(sigma.c2i('%'), textLength) + 1; + bwt.clear(); + fwdRepr.clear(); + // create sparse suffix arrays for (int saSF : sparse_sa) { - this->sparseSA = SparseSuffixArray(SA, saSF); - this->sparseSA.write(baseFN); + sparseSA.clear(); + sparseSA = SparseSuffixArray(SA, saSF, textLength); + sparseSA.write(baseFN); std::cout << "Wrote sparse suffix array with factor " << saSF << std::endl; } - SA.clear(); + free(SA); // SA was allocated with malloc, so it must be released with free - // create succint BWT bitstd::vector table - this->fwdRepr = BWTRepr(this->sigma, this->bwt); - this->fwdRepr.write(baseFN + ".brt"); - std::cout << "Wrote file: " << baseFN << ".brt" << std::endl; + sparseSA.clear(); // Reverse the original text std::cout << "Reversing the original text...\n"; startTime = clock(); - std::string reverseText = this->text; - reverse(reverseText.begin(), reverseText.end()); + + readTextOriginal(baseFN, buf); + + if (newLine) { + buf.pop_back(); + } + +
reverse(buf.begin(), buf.end()); endTime = clock(); - mseconds = clock_diff_to_msec(endTime - startTime); - printf("Reversing took [%.2fs]\n", mseconds / 1000.0); + milliseconds = clock_diff_to_msec(endTime - startTime); + printf("Reversing took [%.2fs]\n", milliseconds / 1000.0); // build the reverse SA - std::cout << "Generating the reverse suffix array using radixSA64...\n"; + std::cout << "Generating the reverse suffix array using divsufsort64...\n"; startTime = clock(); - length_t* radixRevSA = - Radix((uchar*)&(reverseText)[0], this->textLength, 0).build(); - - std::vector revSA(radixRevSA, radixRevSA + this->textLength); - - delete[] radixRevSA; + int64_t* revSA = (int64_t*)malloc((size_t)textLength * sizeof(int64_t)); + divsufsort64((uchar*)&(buf)[0], revSA, textLength); endTime = clock(); - mseconds = clock_diff_to_msec(endTime - startTime); - printf("radixSA64 took [%.2fs]\n", mseconds / 1000.0); + milliseconds = clock_diff_to_msec(endTime - startTime); + printf("divsufsort64 took [%.2fs]\n", milliseconds / 1000.0); - reverseText.clear(); + buf.clear(); + buf.resize(0); + buf.shrink_to_fit(); // perform a sanity check on the suffix array std::cout << "\tPerforming sanity checks..." << std::endl; - sanityCheck(this->text, revSA); - std::cout << "\tSanity checks OK" << std::endl; + // sanityCheck(text, revSA); + // std::cout << "\tSanity checks OK" << std::endl; // build the reverse BWT std::cout << "Generating reverse BWT..."
<< std::endl; - this->revbwt = std::string(this->textLength, '\0'); - this->revbwt.resize(this->textLength); - for (size_t i = 0; i < revSA.size(); i++) + revbwt.resize(textLength); + for (size_t i = 0; i < textLength; i++) if (revSA[i] > 0) - this->revbwt[i] = this->text[this->textLength - revSA[i]]; + revbwt.set(i, text[textLength - revSA[i]]); else - this->revbwt[i] = this->text.front(); + revbwt.set(i, text[0]); - std::ofstream ofsrev(baseFN + ".rev.bwt"); - ofsrev.write((char*)this->revbwt.data(), this->revbwt.size()); - ofsrev.close(); + // Encode reverse bwt + revbwt.write(baseFN + ".rev.bwt"); + free(revSA); // revSA was allocated with malloc, so it must be released with free std::cout << "Wrote file " << baseFN << ".rev.bwt\n"; - // create sparse suffix array - int saSF = sparse_sa.back(); - this->sparseRevSA = SparseSuffixArray(revSA, saSF); - revSA.clear(); - - // create succint reverse BWT bitstd::vector table - this->revRepr = BWTRepr(this->sigma, this->revbwt); - this->revRepr.write(baseFN + ".rev.brt"); + // create succinct reverse BWT bitvector table + revRepr = BWTRepr(sigma, revbwt); + revRepr.write(baseFN + ".rev.brt"); std::cout << "Wrote file: " << baseFN << ".rev.brt" << std::endl; - // populate table - this->populateTable(true); + revbwt.clear(); + revRepr.clear(); + text.clear(); } template -length_t FMIndexDBG::findRevSA(length_t index) const { - - if (sparseRevSA.getSparsenessFactor() == 0) { - // The sparse reverse SA is not present in memory - throw runtime_error("Reverse suffix array is not available."); - } - - // Iterate until a position is reached for which the entry is stored - length_t l = 0; - while (!sparseRevSA[index]) { - index = this->findLF(index, true); - l++; - } - return sparseRevSA.get(index) + l; -} - -template -void FMIndexDBG::computeLCPKasai(Bitvec2& LCP, bool progress, + uint& k) { std::cout << "Computing the LCP..."
<< std::endl; // Initialize the LCP array with one more element than the length of the // original text - LCP = (this->textLength + 1); + LCP = (textLength + 1); // Create a rank array, which will contain the inverse of the suffix array - std::vector rank(this->textLength); + std::vector rank(textLength); // Fill in the rank array - for (length_t i = 0; i < this->textLength; i++) { - rank[this->findSA(i)] = i; + for (length_t i = 0; i < textLength; i++) { + rank[findSA(i)] = i; if (progress) { - if (i % (this->textLength / 100) == 0) { - std::cout << "Progress part 1/5: " - << i / (this->textLength / 100) << "%" + if (i % (textLength / 100) == 0) { + std::cout << "Progress part 1/5: " << i / (textLength / 100) + << "%" << "\r"; std::cout.flush(); } @@ -267,17 +266,17 @@ void FMIndexDBG::computeLCP(Bitvec2& LCP, bool progress) { uint h = 0; // Fill in the LCP array by iterating over all suffixes in sorted order // (this is the order in the suffix array) - for (length_t i = 0; i < this->textLength; i++) { + for (length_t i = 0; i < textLength; i++) { // Verify that the suffix starting at index i is not the // lexicographically smallest suffix. In practice, the lexicographically - // smalles suffix is "$", so this one is skipped. + // smallest suffix is "$", so this one is skipped. if (rank[i] > 0) { // The suffix starting at index j is successor of the suffix // starting at index i in the suffix array. - length_t j = this->findSA(rank[i] - 1); - // Increment h until it represents the length of the longets common + length_t j = findSA(rank[i] - 1); + // Increment h until it represents the length of the longest common // prefix between the two suffixes - while (this->text[i + h] == this->text[j + h]) { + while (text[i + h] == text[j + h]) { h++; } // Update the compacted LCP array. 
@@ -296,9 +295,9 @@ void FMIndexDBG::computeLCP(Bitvec2& LCP, bool progress) { } // Print progress if (progress) { - if (i % (this->textLength / 100) == 0) { - std::cout << "Progress part 2/5: " - << i / (this->textLength / 100) << "%" + if (i % (textLength / 100) == 0) { + std::cout << "Progress part 2/5: " << i / (textLength / 100) + << "%" << "\r"; std::cout.flush(); } @@ -319,16 +318,16 @@ template void FMIndexDBG::computeBitVectors(std::queue& Q, Bitvec& Br_right, Bitvec& Bl_right, - bool progress) { + bool progress, uint& k) { // Build the longest common prefix array Bitvec2 LCP; - computeLCP(LCP, progress); + computeLCPPrezza(this, LCP, progress, k); // Build the bit vectors std::cout << "Computing Br_right and Bl_right..." << std::endl; // Initialize all values to false - for (length_t i = 0; i < this->textLength; i++) { + for (length_t i = 0; i < textLength; i++) { Br_right[i] = false; Bl_right[i] = false; } @@ -341,7 +340,7 @@ void FMIndexDBG::computeBitVectors(std::queue& Q, length_t index = i; end_values.insert(index); for (uint j = 1; j < k; j++) { - index = this->findLF(index, false); + index = findLF(index, false); end_values.insert(index); } } @@ -349,9 +348,9 @@ void FMIndexDBG::computeBitVectors(std::queue& Q, length_t lb = 0; // kIndex stores the last index for which the LCP value is equal to k length_t kIndex = 0; - // lastdiff stores the last index at which the characters BWT[lastdiff-1] - // and BWT[lastdiff] differ - length_t lastdiff = 0; + // lastDiff stores the last index at which the characters BWT[lastDiff-1] + // and BWT[lastDiff] differ + length_t lastDiff = 0; // if open is true, we are in an area for which the LCP values are larger // than k bool open = false; @@ -359,14 +358,14 @@ void FMIndexDBG::computeBitVectors(std::queue& Q, int counter = 0; // create a copy of the cumulative counts array corresponding to the // bidirectional FM-index - std::vector counts_copy = this->counts; + std::vector counts_copy = counts; // Iterate over 
all entries in the LCP array - for (length_t i = 1; i < this->textLength + 1; i++) { + for (length_t i = 1; i < textLength + 1; i++) { // the counts array is adjusted every iteration so that it would contain // the following information: counts[BWT[j]] =LF(j)+1, where j is the // index of the last occurrence of character BWT[j] in BWT before the // current index i of the algorithm - counts_copy[this->sigma.c2i(this->bwt[i - 1])]++; + counts_copy[bwt[i - 1]]++; if (LCP[i] >= 1) { // We enter a range with LCP entries bigger than or equal to k @@ -377,7 +376,7 @@ void FMIndexDBG::computeBitVectors(std::queue& Q, } } else { if (open) { - // We are leaving a k-mer range that must be analysed + // We are leaving a k-mer range that must be analyzed // Check if the k-mer is right-maximal if (kIndex > lb) { @@ -398,11 +397,11 @@ void FMIndexDBG::computeBitVectors(std::queue& Q, Br_right[i - 1] = true; // We get the range in the reverse SA that corresponds // to this k-mer - Range reverseRange = getReverseRange(lb, false); + Range reverseRange = getReverseRangeKMer(lb, false, k); length_t lb_rev = reverseRange.getBegin(); // Add the new node to the graph G.emplace_back(k, i - lb, lb, lb, lb_rev, lb_rev); - // Push the node identifier to the queu + // Push the node identifier to the queue Q.push(counter); // Increment the node ID creator counter++; @@ -410,21 +409,22 @@ void FMIndexDBG::computeBitVectors(std::queue& Q, } // check whether the k-mer corresponding to the current interval // is a left-maximal repeat by checking if the last index - // lastdiff at which the characters BWT[lastdiff-1] and - // BWT[lastdiff] differ, is higher than the left boundary (lb) + // lastDiff at which the characters BWT[lastDiff-1] and + // BWT[lastDiff] differ, is higher than the left boundary (lb) // of the SA interval - if (lastdiff > lb) { + if (lastDiff > lb) { // Iterate over all characters in the BWT corresponding to // this left-maximal k-mer interval for (length_t j = lb; j < i; j++) 
{ // Get the corresponding character - char c = this->bwt[j]; + + char c = sigma.i2c(bwt[j]); // Check if the preceding character is a separation // character, which means we are situated in a start // node if (c != '$' && c != '%') { // This is not a start node - int cIdx = this->sigma.c2i(c); + int cIdx = sigma.c2i(c); // the counts array can directly be used to find the // right boundary of the closed SA interval of the // preceding k-mer, which needs to be set to true in @@ -446,16 +446,16 @@ void FMIndexDBG::computeBitVectors(std::queue& Q, // update the last index for which the LCP value is lower then k lb = i; } - if (this->bwt[i] != this->bwt[i - 1]) { - // the last index at which the characters BWT[lastdiff-1] and - // BWT[lastdiff] differ - lastdiff = i; + if (bwt[i] != bwt[i - 1]) { + // the last index at which the characters BWT[lastDiff-1] and + // BWT[lastDiff] differ + lastDiff = i; } // Print progress if (progress) { - if (i % (this->textLength / 100) == 0) { - std::cout << "Progress part 3/5: " - << i / (this->textLength / 100) << "%" + if (i % (textLength / 100) == 0) { + std::cout << "Progress part 1/3: " << i / (textLength / 100) + << "%" << "\r"; std::cout.flush(); } @@ -463,13 +463,13 @@ void FMIndexDBG::computeBitVectors(std::queue& Q, } // Print progress if (progress) { - std::cout << "Progress part 3/5: " << 100 << "%" + std::cout << "Progress part 1/3: " << 100 << "%" << "\n"; } open = false; // the one-bits in Bl_right that also correspond to a right-maximal k-mer // must be reset to zero - for (length_t i = 0; i < this->textLength; i++) { + for (length_t i = 0; i < textLength; i++) { if (open) { // We are in a right-maximal k-mer interval Bl_right[i] = false; @@ -484,9 +484,9 @@ void FMIndexDBG::computeBitVectors(std::queue& Q, } // Print progress if (progress) { - if (i % (this->textLength / 100) == 0) { - std::cout << "Progress part 4/5: " - << i / (this->textLength / 100) << "%" + if (i % (textLength / 100) == 0) { + std::cout << 
"Progress part 2/3: " << i / (textLength / 100) + << "%" << "\r"; std::cout.flush(); } @@ -499,7 +499,7 @@ void FMIndexDBG::computeBitVectors(std::queue& Q, // Print progress if (progress) { - std::cout << "Progress part 4/5: " << 100 << "%" + std::cout << "Progress part 2/3: " << 100 << "%" << "\n"; } @@ -515,29 +515,29 @@ std::vector> FMIndexDBG::getIntervals(length_t i, length_t j) { std::vector> result; - result.reserve(this->sigma.size()); + result.reserve(sigma.size()); // Iterate over all characters - for (length_t k = 0; k < this->sigma.size(); k++) { + for (length_t k = 0; k < sigma.size(); k++) { SARangePair child = SARangePair(Range(i, j), Range(0, 0)); // Find the range over the SA after appending the extra character to the // front of the current suffix - if (this->findRangesWithExtraCharBackward( + if (findRangesWithExtraCharBackward( k, SARangePair(Range(i, j), Range(0, 0)), child)) { - result.emplace_back(this->sigma.i2c(k), child.getRangeSA()); + result.emplace_back(sigma.i2c(k), child.getRangeSA()); } } return result; } template -Range FMIndexDBG::getReverseRange(length_t indexInSA, - bool isEndNode) { +Range FMIndexDBG::getReverseRangeKMer(length_t indexInSA, + bool isEndNode, uint& k) { // Find the index of the substring of interest in the original text - length_t indexInText = this->findSA(indexInSA); + length_t indexInText = findSA(indexInSA); // Get the substring of interest: the first k-mer of the suffix at the // indexInSA'th position in the SA. Match this k-mer to the reference - SARangePair p = this->matchStringBidirectionally( - Substring(&this->text, indexInText, (indexInText + k))); + SARangePair p = matchStringBidirectionally(Substring( + (text.decodeSubstring(sigma, indexInText, (indexInText + k))))); // If the node is not an end node, all computations are done if (!isEndNode) { // Return the reverse SA range @@ -547,20 +547,20 @@ Range FMIndexDBG::getReverseRange(length_t indexInSA, // corresponding substring. 
Hence, we need to distinguish between these as // well. Following variable keeps track of the number of extra characters we // use to distinguish the nodes. - length_t extra = k; + length_t extra = textLength / 100 + k; // Keep adding sets of characters until the match is unique or the start of // the reference was reached while (p.width() > 1 && extra <= indexInText) { // Exactly match the substring with extra characters - p = this->matchStringBidirectionally( - Substring(&this->text, indexInText - extra, indexInText + k)); - extra += k; + p = matchStringBidirectionally(Substring( + text.decodeSubstring(sigma, indexInText - extra, indexInText + k))); + extra += textLength / 100 + k; } // If too much characters were added in the last set, match again until the // start of the reference if (extra > indexInText) { - p = this->matchStringBidirectionally( - Substring(&this->text, 0, indexInText + k)); + p = matchStringBidirectionally( + Substring(text.decodeSubstring(sigma, 0, indexInText + k))); } // Return the first entry of the found reverse SA range. 
We don't return the // complete reverse SA range because it could contain more than one entry if @@ -569,46 +569,11 @@ Range FMIndexDBG::getReverseRange(length_t indexInSA, p.getRangeSARev().getBegin() + 1); } -template -void FMIndexDBG::setEdgeMapping(length_t id) { - Node node = G[id]; - // If only one edge goes through the node, nothing needs to be done - if (node.multiplicity > 1) { - // Create a std::vector of pairs of edges, defined by their index in the - // original text, along with their original rank in the SA - std::vector> strains_forward; - for (size_t i = 0; i < node.multiplicity; i++) { - strains_forward.emplace_back( - this->findSA(node.left_kmer_forward + i), i); - } - // Sort the std::vector based on the indexes of the edges in the - // original text - sort(strains_forward.begin(), strains_forward.end()); - // Create a std::vector of pairs of edges, defined by their index in the - // original text, along with their original rank in the reverse SA - std::vector> strains_reverse; - for (size_t i = 0; i < node.multiplicity; i++) { - strains_reverse.emplace_back( - this->textLength - findRevSA(node.left_kmer_reverse + i), i); - } - // Sort the std::vector based on the indexes of the edges in the - // original text - sort(strains_reverse.begin(), strains_reverse.end()); - // Create the edge mapping based on the two sorted std::vectors. The - // mapping maps from regular to reverse. 
- for (size_t i = 0; i < node.multiplicity; i++) { - G[id].edgeMapping[strains_forward[i].second] = - strains_reverse[i].second; - } - } -} - template void FMIndexDBG::buildCompressedGraph( const std::vector& checkpoint_sparseness, bool progress, - std::vector& B_rights, - std::vector& B_right_fulls, - std::vector>& mapping_rights) { + std::vector& B_rights, + std::vector>& mapping_rights, uint& k) { std::cout << "Start building the implicit graph:" << std::endl; // Counter for progress printing @@ -616,41 +581,41 @@ void FMIndexDBG::buildCompressedGraph( // Initialize the Br_right and Bl_right bit vectors to the length of the // text + 1. This way, rank(textLength) can be called as well. These are // temporary bit vectors for the build process. - Bitvec Br_right = (this->textLength + 1); - Bitvec Bl_right = (this->textLength + 1); + Bitvec Br_right = (textLength + 1); + Bitvec Bl_right = (textLength + 1); const int cp_len = checkpoint_sparseness.size(); - // Set the start values of B_right, B_left and B_right_full to 0. These are + // Set the start values of B_right and B_left to 0. These are // the bit vectors that will be stored. 
- for (length_t i = 0; i < this->textLength; i++) { + for (length_t i = 0; i < textLength; i++) { B_right[i] = false; B_left[i] = false; - B_right_full[i] = false; for (int j = 0; j < cp_len; j++) { B_rights[j][i] = false; - B_right_fulls[j][i] = false; } } // queue Q stores which nodes still need to be handled by the construction // algorithm std::queue Q; - // Store a temportary mapping of node IDs with index of the leftmost k-mer + // Store a temporary mapping of node IDs with index of the leftmost k-mer // in the reverse SA std::vector> temporary_mapping_left; - // Store temportary mappings of node IDs and offsets with index of the + // Store temporary mappings of node IDs and offsets with index of the // corresponding k-mer in the SA std::vector>> temporary_mapping_rights(checkpoint_sparseness.size()); - // Compute remporary bit vectors Br_right and Bl_right - computeBitVectors(Q, Br_right, Bl_right, progress); + // Compute temporary bit vectors Br_right and Bl_right + computeBitVectors(Q, Br_right, Bl_right, progress, k); // Number of right-maximal nodes - int rightMax = Br_right.rank(this->textLength) / 2; + int rightMax = Br_right.rank(textLength) / 2; // Number of nodes that are not right-maximal and precede a left-maximal // node - int leftMax = Bl_right.rank(this->textLength); + int leftMax = Bl_right.rank(textLength); + + G.reserve(rightMax + leftMax + numberOfStrains); std::cout << "Constructing the compressed graph..." 
<< std::endl; @@ -663,7 +628,7 @@ void FMIndexDBG::buildCompressedGraph( // Add the end nodes (they can still be extended later) for (length_t s = 0; s < numberOfStrains; s++) { int id = rightMax + leftMax + s; - Range reverseRange = getReverseRange(s, true); + Range reverseRange = getReverseRangeKMer(s, true, k); G.emplace_back(1, 1, s, s, reverseRange.getBegin(), 0); Q.push(id); Bl_right[s] = 0; @@ -672,7 +637,7 @@ void FMIndexDBG::buildCompressedGraph( // Find the number of graph nodes numberOfGraphNodes = G.size(); // Forward matching - this->setDirection(FORWARD); + setDirection(FORWARD); // Keep iterating over the queue until all nodes are handled while (!Q.empty()) { @@ -742,15 +707,6 @@ void FMIndexDBG::buildCompressedGraph( node.left_kmer_forward + node.multiplicity - 1, MappingPair(id, node.len - k)); - - // Set the correct bits in B_right_full - for (size_t i = 0; - i < node.multiplicity; i++) { - B_right_fulls - [j] - [node.left_kmer_forward + i] = - true; - } } } } @@ -763,8 +719,8 @@ void FMIndexDBG::buildCompressedGraph( // The predecessor of this node needs to be // initialized int newID = rightMax + Bl_right.rank(i); - Range reverseRange = - getReverseRange(i, newID >= rightMax + leftMax); + Range reverseRange = getReverseRangeKMer( + i, newID >= rightMax + leftMax, k); G[newID] = Node(k, j - i, i, i, reverseRange.getBegin(), reverseRange.getBegin()); @@ -778,8 +734,8 @@ void FMIndexDBG::buildCompressedGraph( // If the node was extended or the node is an end node, index of the // leftmost k-mer in the reverse suffix array must be updated. 
if (node.len > k || id >= rightMax + leftMax) { - Range reverseRange = getReverseRange(node.left_kmer_forward, - id >= rightMax + leftMax); + Range reverseRange = getReverseRangeKMer( + node.left_kmer_forward, id >= rightMax + leftMax, k); node.left_kmer_reverse = reverseRange.getBegin(); } @@ -796,23 +752,29 @@ void FMIndexDBG::buildCompressedGraph( // Update the temporary right mapping temporary_mapping_rights[j].emplace_back(indexForBit, MappingPair(id, 0)); + } - // Set the correct bits in B_right_full corresponding to this node - for (size_t i = 0; i < node.multiplicity; i++) { - B_right_fulls[j][node.right_kmer_forward + i] = true; + if (node.len > k) { + // Set the correct bit in B_right corresponding to this node + indexForBit = node.left_kmer_forward + node.multiplicity - 1; + for (int j = 0; j < cp_len; j++) { + + if ((node.len - k) % checkpoint_sparseness[j] > 0) { + B_rights[j][indexForBit] = true; + // Update the temporary right mapping + temporary_mapping_rights[j].emplace_back( + indexForBit, MappingPair(id, node.len - k)); + } } } // Update the node attributes in the graph G[id] = node; - // Set the edge mapping of the node - setEdgeMapping(id); - // Print progress - if (progress) { + if (progress && numberOfGraphNodes > 100) { if (counter % (numberOfGraphNodes / 100) == 0) { - std::cout << "Progress part 5/5: " + std::cout << "Progress part 3/3: " << counter / (numberOfGraphNodes / 100) << "%" << "\r"; std::cout.flush(); @@ -822,7 +784,7 @@ void FMIndexDBG::buildCompressedGraph( } // Print progress if (progress) { - std::cout << "Progress part 5/5: " << 100 << "%" + std::cout << "Progress part 3/3: " << 100 << "%" << "\n"; } @@ -831,19 +793,18 @@ void FMIndexDBG::buildCompressedGraph( B_left.indexInterface(); for (int j = 0; j < cp_len; j++) { B_rights[j].indexInterface(); - B_right_fulls[j].indexInterface(); } // Sanity check if (size_t(rightMax + leftMax + numberOfStrains) != - B_left.rank(this->textLength)) { + B_left.rank(textLength)) { 
std::cout << "Warning: node count is incorrect." << std::endl; } // // Debugging - // for (length_t i = 0; i < this->textLength; i++) { - // std::cout << i << " B_right " << B_right[i] << " B_right_full " << - // B_right_full[i] << std::endl; + // for (length_t i = 0; i < textLength; i++) { + // std::cout << i << " B_right " << B_rights[0][i] << " B_left " + // << B_left[i] << std::endl; // } // Sanity check @@ -885,12 +846,14 @@ length_t FMIndexDBG::getFilterSpecialCases() const { } template void FMIndexDBG::resetCounters() { - this->nodeCounter = 0; - this->matrixElementCounter = 0; - this->positionsInPostProcessingCounter = 0; - this->redundantNodePathsCounter = 0; + nodeCounter = 0; + matrixElementCounter = 0; + positionsInPostProcessingCounter = 0; + redundantNodePathsCounter = 0; nodeDBGCounter = 0; filterSpecialCaseCounter = 0; + elapsedNodePaths = std::chrono::duration::zero(); + elapsedSAtoText = std::chrono::duration::zero(); } template @@ -912,124 +875,26 @@ int FMIndexDBG::findIDFirst(length_t i) const { } template -void FMIndexDBG::findID(length_t i, uint32_t& id, +void FMIndexDBG::findID(length_t j, uint32_t& id, uint32_t& l) const { nodeDBGCounter++; - bool id_found = false; + j--; l = 0; // The offset of the checkpoint in the node uint32_t l_offset, id_right; // Move to the left until an identifier is found - while (!id_found) { - if (B_right_full[i] == 1) { - // Find the number of ones up to i in B_right - id_right = B_right.rank(i); - // Retrieve the correct node ID using the node mapping - id = mapping_right[id_right].id; - id_found = true; - // Retrieve the offset of the checkpoint in the node - l_offset = mapping_right[id_right].distanceFromRightEnd; - } else { - // Shift the k-length window one character to the left - i = this->findLF(i, false); - l++; - } - } - if (l_offset != 0) { - // A checkpoint that is not at the end of the node was used to identify - // the k-mer - l = G[id].len - k - l_offset + l; - return; - } - - if (l == 0) { - // 
The input k-mer already was the rightmost k-mer of the node - l = G[id].len - k; - return; - } - - // TODO: benchmark the two options below - // TODO: is there a good option using BWT? - - // Find the rank in the SA of the edge through which we entered the - // predecessor - uint32_t offset = i - G[id].right_kmer_forward; - // Convert to a rank in the reverse SA - uint32_t offset_reverse = G[id].edgeMapping[offset]; - // Jump to the successor - id = jumpToSuccessorThroughEdge(id, offset_reverse); - - // // Alternative using suffix array: - // // But: the text is used here, which shouldn't be done - // char c = this->text[findSA(i)+k]; - // jumpToSuccessorWithChar(id, id, this->sigma.c2i(c)); - - // Decrement l because jumped one character to the right when moving to the - // successor - l--; - return; -} - -template -void FMIndexDBG::findIDandOffset( - length_t i, uint32_t& id, uint32_t& l, uint32_t& offset_reverse) const { - nodeDBGCounter++; - bool id_found = false; - l = 0; - // The offset of the checkpoint in the node - uint32_t l_offset, id_right; - // Move to the left until an identifier is found - while (!id_found) { - if (B_right_full[i] == 1) { - // Find the number of ones up to i in B_right - id_right = B_right.rank(i); - // Retrieve the correct node ID using the node mapping - id = mapping_right[id_right].id; - id_found = true; - // Retrieve the offset of the checkpoint in the node - l_offset = mapping_right[id_right].distanceFromRightEnd; - } else { - // Shift the k-length window one character to the left - i = this->findLF(i, false); - l++; - } - } - if (l_offset != 0) { - // A checkpoint that is not at the end of the node was used to identify - // the k-mer - l = G[id].len - k - l_offset + l; - // Find the regular offset of this edge in SA using rank and select - // operations - // B_right.rank(i) or id_right can never be 0, since the - // character $ cannot appear in the middle of a node. 
- uint32_t offset = B_right_full.rank(i) - - B_right_full.rank(B_right.select(id_right - 1) + 1); - // Convert to a rank in the reverse SA - offset_reverse = G[id].edgeMapping[offset]; - return; - } - - if (l == 0) { - // The input k-mer already was the rightmost k-mer of the node - l = G[id].len - k; - // Find the rank in the SA of the edge through which we entered the - // predecessor - uint32_t offset = i - G[id].right_kmer_forward; - // Convert to a rank in the reverse SA - offset_reverse = G[id].edgeMapping[offset]; - return; + while (B_right[j] == 0) { + // Shift the k-length window one character to the left + j = findLF(j, false); + l++; } - - // Find the rank in the SA of the edge through which we entered the - // predecessor - uint32_t offset = i - G[id].right_kmer_forward; - // Convert to a rank in the reverse SA - offset_reverse = G[id].edgeMapping[offset]; - // Jump to the successor - id = jumpToSuccessorThroughEdge(id, offset_reverse); - // Decrement l because jumped one character to the right when moving to the - // successor - l--; + // Find the number of ones up to i in B_right + id_right = B_right.rank(j); + // Retrieve the correct node ID using the node mapping + id = mapping_right[id_right].id; + // Retrieve the offset of the checkpoint in the node + l_offset = mapping_right[id_right].distanceFromRightEnd; + l = G[id].len - k_DBG - l_offset + l; return; } @@ -1038,7 +903,7 @@ int FMIndexDBG::jumpToSuccessorThroughEdge( uint32_t id, uint32_t& offset_reverse) const { // Move one step to the right length_t left_kmer_reverse = - this->findLF(G[id].right_kmer_reverse + offset_reverse, true); + findLF(G[id].right_kmer_reverse + offset_reverse, true); // Find the ID of the successor id = findIDFirst(left_kmer_reverse); // Find the new reverse offset @@ -1051,7 +916,7 @@ int FMIndexDBG::jumpToPredecessorThroughEdge( uint32_t id, uint32_t& offset) const { // Move one step to the right length_t right_kmer_forward = - this->findLF(G[id].left_kmer_forward + 
offset, false); + findLF(G[id].left_kmer_forward + offset, false); // Find the ID of the successor id = findIDLast(right_kmer_forward); // Find the new offset @@ -1072,7 +937,7 @@ bool FMIndexDBG::jumpToSuccessorWithChar( node.right_kmer_reverse + node.multiplicity)); // Append the next character to the substring and find the corresponding // range pair - if (!this->findRangesWithExtraCharForward(posInAlphabet, p, p)) { + if (!findRangesWithExtraCharForward(posInAlphabet, p, p)) { // No successor was found return false; } @@ -1080,7 +945,6 @@ bool FMIndexDBG::jumpToSuccessorWithChar( // that the offset is not larger than the number of such end nodes that can // be accessed. - // TODO: why is reverse offset a signed int? if (reverse_offset >= p.width()) { // No extra successor was found return false; @@ -1103,7 +967,7 @@ bool FMIndexDBG::jumpToPredecessorWithChar( node.right_kmer_reverse + node.multiplicity)); // Prepend the next character to the substring and find the corresponding // range pair - if (!this->findRangesWithExtraCharBackward(posInAlphabet, p, p)) { + if (!findRangesWithExtraCharBackward(posInAlphabet, p, p)) { // No predecessor was found return false; } @@ -1116,11 +980,9 @@ template std::vector FMIndexDBG::convertToMatchesInTextSFI( const FMOccSFI& saMatch) { - if (saMatch.getNodePath().empty()) { - return std::vector{}; - } return convertToMatchesInTextSFI(saMatch.getRanges(), saMatch.getNodePath(), + saMatch.getDistanceFromLeftEnd(), saMatch.getDepth(), saMatch.getDistance(), saMatch.getShift()); } @@ -1129,22 +991,23 @@ template std::vector FMIndexDBG::convertToMatchesInTextSFI( const SARangePair& ranges, const std::vector& nodepath, - const int& patternLength, const int& distance, const length_t& shift) { + const uint32_t& distanceFromLeftEnd, const int& patternLength, + const int& distance, const length_t& shift) { std::vector textMatches; textMatches.reserve(ranges.width()); for (length_t i = ranges.getRangeSA().getBegin(); i < 
ranges.getRangeSA().getEnd(); i++) { // find the startPosition in the text by looking at the SA - length_t startPos = this->findSA(i) + shift; + length_t startPos = findSA(i) + shift; // cap startPos at textLength - startPos = startPos % this->textLength; + startPos = startPos % textLength; length_t endPos = startPos + patternLength; textMatches.emplace_back(Range(startPos, endPos), distance, nodepath, - findStrain(startPos)); + findStrain(startPos), distanceFromLeftEnd); } return textMatches; } @@ -1158,18 +1021,19 @@ FMIndexDBG::separationIsNext(positionClass pos) const { template void FMIndexDBG::extendFMPos( const SARangePair& parentRanges, - std::vector>& stack, int row, int trueDepth) { + std::vector>& stack, int row, int trueDepth) const { // iterate over the entire alphabet, excluding the separation characters - for (length_t i = 2; i < this->sigma.size(); i++) { + for (length_t i = 2; i < sigma.size(); i++) { - this->extendFMPosIntermediary(parentRanges, stack, row, i, trueDepth); + extendFMPosIntermediary(parentRanges, stack, row, i, trueDepth); } } template void FMIndexDBG::extendFMPos( - const positionClass& pos, std::vector>& stack) { + const positionClass& pos, + std::vector>& stack) const { pos.extendFMPos(stack); } @@ -1177,73 +1041,162 @@ void FMIndexDBG::extendFMPos( template FMOccSFI FMIndexDBG::findNodePathForMatch( const FMOcc& occ) { + auto start = chrono::high_resolution_clock::now(); // Refer to the correct path finding function std::vector path = {}; - findNodePathForMatchForward(occ.getPosition(), occ.getShift(), path); - this->redundantNodePathsCounter++; - return FMOccSFI(occ, path); + uint32_t distanceFromLeftEnd = 0; + findNodePathForMatchForward(occ.getPosition(), occ.getShift(), path, + distanceFromLeftEnd); + redundantNodePathsCounter++; + auto finish = chrono::high_resolution_clock::now(); + elapsedNodePaths += finish - start; + return FMOccSFI(occ, path, distanceFromLeftEnd); +} + +template +void FMIndexDBG::findNodeUnderK( + 
const positionClass& pos, const int& shift, + std::vector& + path) { // TODO: greedy version, could use some more tweaking + length_t originalDepth = pos.getDepth(); + + setDirection(BACKWARD); + + std::vector> stack; + std::vector> candidatesAfterSearch; + std::vector> candidatesNearStartOfGenome; + stack.push_back(FMPosExt((char)0, pos)); + length_t originalWidth = pos.getRanges().width(); + while (!stack.empty()) { + FMPosExt p = stack.back(); + stack.pop_back(); + if (p.getTrueDepth() == k_DBG) { + candidatesAfterSearch.push_back(p); + originalWidth -= p.getRanges().width(); + } else { + extendFMPos(p, stack); + } + length_t newWidth = 0; + for (auto tempPos : stack) { + newWidth += tempPos.getRanges().width(); + } + if (newWidth != originalWidth) { + candidatesNearStartOfGenome.push_back(p); + } + } + + setDirection(FORWARD); + + for (FMPosExt p : candidatesNearStartOfGenome) { + stack.push_back(FMPosExt((char)0, p)); + } + + while (!stack.empty()) { + FMPosExt p = stack.back(); + stack.pop_back(); + if (p.getTrueDepth() == k_DBG) { + candidatesAfterSearch.push_back(p); + originalWidth -= p.getRanges().width(); + } else { + extendFMPos(p, stack); + } + length_t newWidth = 0; + for (auto tempPos : stack) { + newWidth += tempPos.getRanges().width(); + } + if (newWidth != originalWidth) { + candidatesNearStartOfGenome.push_back(p); + } + } + + std::vector> stack2; + + for (FMPosExt p : candidatesAfterSearch) { + // Get the left boundary of the SA interval of the match + length_t rb = p.getRanges().getRangeSA().getEnd(); + // Find the corresponding node of the path along with the + // distance of the k-mer of the match to the beginning of the node + uint32_t id, l; + findID(rb, id, l); + if (!std::count(path.begin(), path.end(), id)) { + path.push_back(id); + auto len = G[id].len; + auto var = len - l - k_DBG + originalDepth; + if (var < k_DBG) { + stack2.push_back(make_pair(id, var - originalDepth)); + } + } + } + + while (!stack2.empty()) { + auto pair = 
stack2.back(); + stack2.pop_back(); + for (size_t i = 2; i < sigma.size(); i++) { + // Check if there exists a successor that is the result of + // extension with the current character + uint32_t id_successor; + if (jumpToSuccessorWithChar(pair.first, id_successor, i)) { + if (!std::count(path.begin(), path.end(), id_successor)) { + path.push_back(id_successor); + auto var = pair.second + G[id_successor].len - k_DBG + 1 + + originalDepth; + if (var < k_DBG) { + stack2.push_back( + make_pair(id_successor, var - originalDepth)); + } + } + } + } + } } template void FMIndexDBG::findNodePathForMatchForward( - const positionClass& occ, const int& shift, - std::vector& path) const { - // A match shorter than k does not have a corresponding node path - if (occ.getDepth() < k) { - std::cout - << "WARNING: a match shorter than k was detected and will not be " - "reported." - << std::endl; + const positionClass& occ, const int& shift, std::vector& path, + uint32_t& distanceFromLeftEnd) { + // A match shorter than k does not have a unique node path + if (occ.getTrueDepth() < k_DBG) { + // Find all corresponding nodes + findNodeUnderK(occ, shift, path); return; } // Get the left boundary of the SA interval of the match length_t lb = occ.getRanges().getRangeSA().getBegin(); + // Get the start position of an occurrence in the reference text + length_t positionInText = findSA(lb) + shift; + // Get the string corresponding to the first k-mer of the occurrence + string firstKmer = + text.decodeSubstring(sigma, positionInText, positionInText + k_DBG); + // Get the SA range corresponding to this first k-mer + Range kmerRange = matchString(firstKmer); // Find the first node of the path along with the distance of the first // k-mer of the match to the beginning of the first node - - // TODO: why are id, l andn offset_reverse signed ints? 
- // This gives compiler warnings for comparison with unsigned ints - uint32_t id, l, offset_reverse; - findIDandOffset(lb, id, l, offset_reverse); - // Find the distance of the first k-mer of the match to the end of the first - // node - l = G[id].len - l - k; - // Check if the current node is an end node - bool endNode = id >= numberOfGraphNodes - numberOfStrains; - // Store how much characters of the match still need to be handled - int numberOfSteps = occ.getDepth() - k - 1; - int pos = numberOfSteps + shift - l; - // In case there is a shift present, check if the node is already part of - // the path - if (pos <= numberOfSteps) { - path.emplace_back(id); - } - while (pos >= 0) { - // If the previous node was an end node, it should have been the last - // one. Otherwise, an empty path is returned - // TODO: remove this if fixed in underlying code - if (endNode) { - path = {}; - return; - } - // Find the ID of the next node along with the new offset - id = jumpToSuccessorThroughEdge(id, offset_reverse); - // Check if the new node is an end node - endNode = id >= numberOfGraphNodes - numberOfStrains; + uint32_t id, pos; + findID(kmerRange.getEnd(), id, pos); + // Set distanceFromLeftEnd + distanceFromLeftEnd = pos; + // Find the distance of the the beginning of first k-mer of the match to the + // end of the first node. This reflects how much of the occurrence has been + // handled. + pos = G[id].len - pos; + // Store the start node in the node path + path.emplace_back(id); + while (pos < occ.getDepth()) { + // Copy the old id + uint32_t oldID = id; + // Find the ID of the successor using the next character of the + // occurrence + jumpToSuccessorWithChar(oldID, id, text[positionInText + pos]); // Find the new position in the match based on the length of the new // node - l = G[id].len - k; - pos -= l + 1; - // In case there is a shift present... 
- if (pos <= numberOfSteps) { - // Add the new node to the path - path.emplace_back(id); - } + pos += G[id].len - (k_DBG - 1); + // Add the new node to the path + path.emplace_back(id); } } template -int FMIndexDBG::findStrain(length_t input) { +int FMIndexDBG::findStrain(length_t input) const { int b = 0; int e = sorted_startpositions.size(); if (input >= sorted_startpositions[e - 1]) { @@ -1269,60 +1222,6 @@ int FMIndexDBG::findStrain(length_t input) { // ROUTINES FOR MAPPING // ---------------------------------------------------------------------------- -// ---------------------------------------------------------------------------- -// ROUTINES FOR EXACT PATTERN MATCHING -// ---------------------------------------------------------------------------- - -template -std::vector -FMIndexDBG::ExactMatchSFI(const string& pattern) { - // A pattern shorter than k does not have a corresponding node path - uint m = pattern.length(); - if (m < k) { - throw std::runtime_error("Pattern length cannot be smaller than k"); - } - this->setDirection(BACKWARD); - // Find the SA ranges for the rightmost k-mer of the pattern - SARangePair p = this->matchStringBidirectionally( - Substring(pattern, m - k, m, BACKWARD)); - - if (p.getRangeSA().getBegin() >= p.getRangeSA().getEnd()) { - // The rightmost k-mer does not occur in the reference - return std::vector{}; - } - // Find the last node of the path along with the distance of the last - // k-mer of the pattern to the beginning of the last node - uint32_t id, l; - findID(p.getRangeSA().getBegin(), id, l); - // Initialize the node path with the last node - std::vector resList = {id}; - // Store how much characters of the match still need to be handled - int pos = m - k - 1; - while (p.getRangeSA().getBegin() < p.getRangeSA().getEnd() && pos >= 0) { - // Add the next character of the pattern to the current match - this->addChar(pattern[pos], p); - pos--; - - if (l > 0) { - // We stay within the current node - l--; - } else { - // We 
jump to a predecessor and need to find its ID - id = findIDLast(p.getRangeSA().getBegin()); - // Add the new node to the path - resList.insert(resList.begin(), id); - // Find the current distance to the beginning of the current node - l = G[id].len - k; - } - } - if (p.getRangeSA().getBegin() >= p.getRangeSA().getEnd()) { - // The complete pattern does not occur in the reference - return std::vector{}; - } else { - return convertToMatchesInTextSFI(p, resList, pattern.length()); - } -} - // ---------------------------------------------------------------------------- // ROUTINES FOR APPROXIMATE PATTERN MATCHING // ---------------------------------------------------------------------------- @@ -1333,7 +1232,7 @@ FMIndexDBG::approxMatchesNaiveSFI(const std::string& pattern, length_t maxED) { std::vector> occurrences = - this->approxMatchesNaiveIntermediate(pattern, maxED); + approxMatchesNaiveIntermediate(pattern, maxED); return mapOccurrencesInSAToOccurrencesInTextSFI(occurrences, maxED); } @@ -1342,6 +1241,7 @@ template std::map, std::vector> FMIndexDBG::mapOccurrencesInSAToOccurrencesInTextSFI( std::vector>& occ, const int& maxED) { + auto start = chrono::high_resolution_clock::now(); sort(occ.begin(), occ.end()); occ.erase(unique(occ.begin(), occ.end()), occ.end()); @@ -1351,20 +1251,24 @@ FMIndexDBG::mapOccurrencesInSAToOccurrencesInTextSFI( map posToBestMatch; if (occ.size() == 0) { + auto finish = chrono::high_resolution_clock::now(); + elapsedSAtoText += finish - start; return {}; } if (occ.size() == 1) { // all occ are distinct - this->positionsInPostProcessingCounter = occ[0].getRanges().width(); - auto m = convertToMatchesInTextSFI(findNodePathForMatch(occ[0])); - sort(m.begin(), m.end()); - for (auto& occ : m) { - occ.generateOutput(); - } + positionsInPostProcessingCounter = occ[0].getRanges().width(); + const auto& m = convertToMatchesInTextSFI(findNodePathForMatch(occ[0])); std::map, std::vector> paths; - for (auto occ : m) { - 
paths[occ.getNodePath()].emplace_back(occ); + for (const auto& mOcc : m) { + paths[mOcc.getNodePath()].emplace_back(mOcc); + paths[mOcc.getNodePath()].back().generateOutput(); } + for (const auto& myPair : paths) { + sort(paths[myPair.first].begin(), paths[myPair.first].end()); + } + auto finish = chrono::high_resolution_clock::now(); + elapsedSAtoText += finish - start; return paths; } @@ -1372,9 +1276,9 @@ FMIndexDBG::mapOccurrencesInSAToOccurrencesInTextSFI( for (const auto& it : occ) { const Range& range = it.getRanges().getRangeSA(); - this->positionsInPostProcessingCounter += range.width(); + positionsInPostProcessingCounter += range.width(); - auto matchesInTextToCheck = + const auto& matchesInTextToCheck = convertToMatchesInTextSFI(findNodePathForMatch(it)); occurrencesInText.insert(occurrencesInText.end(), matchesInTextToCheck.begin(), @@ -1424,7 +1328,8 @@ FMIndexDBG::mapOccurrencesInSAToOccurrencesInTextSFI( for (TextOccurrenceSFI occ : nonRedundantOcc) { paths[occ.getNodePath()].emplace_back(occ); } - + auto finish = chrono::high_resolution_clock::now(); + elapsedSAtoText += finish - start; return paths; } @@ -1642,7 +1547,7 @@ bool FMIndexDBG::handleIfPrefix( // A complete prefix was found // Now check if one of these two occurrences can actually replace - // eachother. This is only the case if their starting point wrt the + // each other. This is only the case if their starting point wrt the // first node is within a maximum distance. if (!(FMOccSFR::checkProximity)( previousMatches[len - minLen - decrease], m, maxED)) { @@ -1818,7 +1723,7 @@ void FMIndexDBG::filterLinearInOneDirection( // Create the previousMatches vector, which will store the prefixbranch that // is considered at a given time in the algorithm. A prefix branch is a set - // of occurrences, all of which are prefixes of eachother. + // of occurrences, all of which are prefixes of each other. 
std::vector previousMatches(maxLen - minLen + 1, nullptr); // Create the previousMinimums vector, which will store the minimum // occurrences that correspond to the occurrences in previousMatches. A @@ -2023,7 +1928,7 @@ void FMIndexDBG::filterDifferentNodePathsComplete( // other // Now check if one of these two occurrences can - // actually replace eachother. This is only the + // actually replace each other. This is only the // case if their starting point wrt the graph // is within a maximum distance. @@ -2050,7 +1955,7 @@ void FMIndexDBG::filterDifferentNodePathsComplete( .getDistanceFromLeftEnd(); } // Also take the nodes of the longest path that - // occure before the shortest path in + // occurs before the shortest path in // consideration for (size_t nodeIterator = 0; nodeIterator < index; nodeIterator++) { @@ -2116,7 +2021,17 @@ std::vector FMIndexDBG::filterStrainFreeMatches( bool linear = !filteringOptionComplete; // Increase the total number of reported matches - this->positionsInPostProcessingCounter += occ.size(); + positionsInPostProcessingCounter += occ.size(); + + for (auto& o : occ) { + if (o.getPosition().getNodePath().empty()) { + vector nodePath; + FMPosSFR pos = o.getPosition(); + findNodeUnderK(pos, 0, nodePath); + pos.setNodePath(nodePath); + o.setPosition(pos); + } + } // Sort the occurrences if (linear) { @@ -2193,8 +2108,9 @@ void FMIndexDBG::initializeFilesForVisualization( if (!multipleSubgraphs) { // Write out the headers if this was not yet done - edgefile << "EdgeKey\tSource\tOmegaShort\tOmegaFull\tPartOfPath\tTarget" - "\tOmegaShort\tOmegaFull\tPartOfPath\tColor\n"; + edgefile + << "EdgeKey\tSource\tOmegaShort\tOmegaFull\tPartOfPath\tTarget" + "\tOmegaShort\tOmegaFull\tPartOfPath\tColor\tEdgeMultiplicity\n"; } } @@ -2217,9 +2133,11 @@ void FMIndexDBG::fillInVisualizationNode( // Check if this node is part of the original node path bool part_of_path = std::count(path.begin(), path.end(), id) > 0; + // Get the index of an occurrence 
of this node in the text + length_t indexInText = findSA(node.left_kmer_forward); // Get the string corresponding to the node string omega = - this->text.substr(this->findSA(node.left_kmer_forward), node.len); + text.decodeSubstring(sigma, indexInText, indexInText + node.len); string omega_short; // Get the short version of the string corresponding to the node @@ -2245,8 +2163,10 @@ template void FMIndexDBG::visualizeSubgraphIntermediary( std::vector& path, std::string subgraph_id, std::vector& visited_nodes, - std::queue>& node_queue, std::ofstream& edgefile, - size_t& edgecounter, std::vector& visualizedNodes) { + std::queue>& node_queue, + std::map>& edges, + size_t& edgecounter, std::vector& visualizedNodes, + bool separateEdges, std::set& subgraphNodes) { // Take the next node ID from the queue along with its neighborhood depth std::pair p = node_queue.front(); @@ -2254,6 +2174,7 @@ void FMIndexDBG::visualizeSubgraphIntermediary( // Find the corresponding node uint32_t id = p.first; Node node = G[id]; + subgraphNodes.insert(id); uint32_t current_depth = p.second; // Create the visualization node with extra attributes such as omega, if @@ -2291,21 +2212,31 @@ void FMIndexDBG::visualizeSubgraphIntermediary( *visualizedNodes[id_predecessor]; // Report the edge between the current node and its predecessor, - // along with the node attributes - edgefile << subgraph_id << "_Edge" << edgecounter << "\t" - << subgraph_id << id_predecessor << "\t" - << id_predecessor << ":" - << visNodePredecessor.omega_short << "\t" - << visNodePredecessor.omega << "\t" - << visNodePredecessor.part_of_path << "\t" - << subgraph_id << id << "\t" << id << ":" - << visNode.omega_short << "\t" << visNode.omega << "\t" - << visNode.part_of_path << "\t" - << findStrain(this->findSA(i + node.left_kmer_forward)) - << "\n"; + // along with the node attributes to a temporary buffer + std::stringstream buffer; + buffer << subgraph_id << "_Edge" << edgecounter << "\t" + << subgraph_id << 
id_predecessor << "\t" + << id_predecessor << ":" + << visNodePredecessor.omega_short << "\t" + << visNodePredecessor.omega << "\t" + << visNodePredecessor.part_of_path << "\t" << subgraph_id + << id << "\t" << id << ":" << visNode.omega_short << "\t" + << visNode.omega << "\t" << visNode.part_of_path; + if (separateEdges) { + // Each edge is reported separately with its corresponding + // strain ID + buffer << "\t" + << findStrain(findSA(i + node.left_kmer_forward)); + edges[buffer.str()][0]++; + } else { + // All edges between the same nodes are bundled into one. + // Strain IDs and multiplicities are stored as an attribute + edges[buffer.str()] + [findStrain(findSA(i + node.left_kmer_forward))]++; + } // Make sure the predecessor is visited if it is within the // neighborhood depth - if (!nodePredecessor.visited && current_depth > 0) { + if (!nodePredecessor.visited) { visitNode(id_predecessor, current_depth - 1, visited_nodes, node_queue); } @@ -2314,10 +2245,9 @@ void FMIndexDBG::visualizeSubgraphIntermediary( } // Check that the current node is not an end note and that its successors // still belong to the neighborhood - if (!(node.right_kmer_forward < (length_t)numberOfStrains) && - current_depth > 0) { + if (id < numberOfGraphNodes - numberOfStrains && current_depth > 0) { // Iterate over all possible characters - for (size_t i = 0; i < this->sigma.size(); i++) { + for (size_t i = 0; i < sigma.size(); i++) { uint32_t id_successor; // Try to jump to a new character by appending a character to the // substring of the current node. 
@@ -2353,17 +2283,18 @@ void FMIndexDBG::visualizeSubgraphIntermediary( // ---------------------------------------------------------------------------- template -void FMIndexDBG::visualizeSubgraph(std::vector& path, - uint32_t depth, - std::string filename, - bool multipleSubgraphs, - string subgraph_id) { +std::set FMIndexDBG::visualizeSubgraph( + std::vector& path, uint32_t depth, std::string filename, + bool separateEdges, bool multipleSubgraphs, string subgraph_id) { std::cout << "Constructing subgraph..." << std::endl; // Initialize the output file std::ofstream edgefile; initializeFilesForVisualization(filename, multipleSubgraphs, edgefile); + // Save all nodes in the subgraph + std::set subgraphNodes; + // Initialize a std::vector that keeps track of all visited nodes std::vector visited_nodes{}; // Create a node queue that keeps track of all nodes that still need to be @@ -2384,18 +2315,57 @@ void FMIndexDBG::visualizeSubgraph(std::vector& path, // Create an empty array that will eventually contain all nodes visualized std::vector visualizedNodes(G.size(), nullptr); + // Create an empty map that will eventually contain all edges visualized + std::map> edges; + // Iterate over all nodes in the neighborhood of the node path while (!node_queue.empty()) { - visualizeSubgraphIntermediary(path, subgraph_id, visited_nodes, - node_queue, edgefile, edgecounter, - visualizedNodes); + visualizeSubgraphIntermediary( + path, subgraph_id, visited_nodes, node_queue, edges, edgecounter, + visualizedNodes, separateEdges, subgraphNodes); + } + + // Write out the edges to the actual output file + for (const auto& edgePair : edges) { + // Write out all attributes except for the strain ID and multiplicity + // data + edgefile << edgePair.first << "\t"; + // Initialize bool necessary for bundled edges + bool firstEntry = true; + // Initialize the total edge multiplicity, also necessary for bundled + // edges + length_t edgeMultiplicity = 0; + // Iterate over all pairs + for 
(const auto& subPair : edgePair.second) { + if (separateEdges) { + // No bundled edges, so there is only one edge pair present + edgefile << subPair.second; + } else { + // Bundled edges + if (firstEntry) { + firstEntry = false; + } else { + // Separation character between pairs + edgefile << ","; + } + // Write out the strain ID with its multiplicity as color + edgefile << subPair.first << ":" << subPair.second; + // Keep track of the total multiplicity + edgeMultiplicity += subPair.second; + } + } + if (!separateEdges) { + // For bundled edges: write out the total multiplicity + edgefile << "\t" << edgeMultiplicity; + } + edgefile << "\n"; } // Close output file edgefile.close(); // Memory cleanup - for (auto visNode : visualizedNodes) { + for (const auto& visNode : visualizedNodes) { if (visNode) { delete visNode; } @@ -2407,23 +2377,27 @@ void FMIndexDBG::visualizeSubgraph(std::vector& path, for (size_t i = 0; i < visited_nodes.size(); i++) { G[visited_nodes[i]].visited = false; } + + return subgraphNodes; } // For strain-fixed matching: template void FMIndexDBG::visualizeSubgraphs( - std::map, std::vector>& paths, - uint32_t depth, std::string filename) { + const std::map, std::vector>& + paths, + uint32_t depth, std::string filename, bool separateEdges) { std::ofstream edgefile; std::ofstream overviewfile; - this->getText(); // TODO: is this efficient? 
- // Create the outputfiles + getText(); + // Create the output files edgefile.open(filename + "_SubgraphEdges.tsv"); overviewfile.open(filename + "_SubgraphOverview.tsv"); - // Initialize the headers of the outputfiles + // Initialize the headers of the output files edgefile << "EdgeKey\tSource\tOmegaShort\tOmegaFull\tPartOfPath\tTarget\tOm" - "egaShort\tOmegaFull\tPartOfPath\tColor\n"; - overviewfile << "SubgraphID\tPath\tStrain\tPosition\tLength\tED\n"; + "egaShort\tOmegaFull\tPartOfPath\tColor\tEdgeMultiplicity\n"; + overviewfile << "SubgraphID\tPath\tDistanceFromLeftEnd\tStrain\tPosition\tL" + "ength\tED\n"; edgefile.close(); @@ -2432,17 +2406,21 @@ void FMIndexDBG::visualizeSubgraphs( for (const auto& path : paths) { // Visualize the subgraph corresponding to this path std::vector nodepath = path.first; - visualizeSubgraph(nodepath, depth, filename, true, + visualizeSubgraph(nodepath, depth, filename, separateEdges, true, "Subgraph" + to_string(counter) + "_"); // Report all matches in the reference text that correspond to this path for (length_t i = 0; i < path.second.size(); i++) { + // Determine the separation character based on the occurrence length + char separationchar = + path.second[i].getRange().width() < k_DBG ? 
'/' : ','; // Report the path overviewfile << counter << "\t" << nodepath[0]; for (length_t i = 1; i < nodepath.size(); i++) { - overviewfile << "," << nodepath[i]; + overviewfile << separationchar << nodepath[i]; } // Report information on the match in the reference - overviewfile << "\t" << path.second[i].getStrain() << "\t" + overviewfile << "\t" << path.second[i].getDistanceFromLeftEnd() + << "\t" << path.second[i].getStrain() << "\t" << path.second[i].getRange().getBegin() << "\t" << path.second[i].getRange().width() << "\t" << path.second[i].getDistance() << "\n"; @@ -2454,19 +2432,19 @@ void FMIndexDBG::visualizeSubgraphs( // For strain-free matching: template -void FMIndexDBG::visualizeSubgraphs(std::vector& paths, - uint32_t depth, - std::string filename) { +void FMIndexDBG::visualizeSubgraphs( + const std::vector& paths, uint32_t depth, std::string filename, + bool separateEdges) { std::ofstream nodefile; std::ofstream edgefile; std::ofstream overviewfile; - this->getText(); // TODO: is this efficient? 
- // Create the outputfiles + getText(); + // Create the output files edgefile.open(filename + "_SubgraphEdges.tsv"); overviewfile.open(filename + "_SubgraphOverview.tsv"); - // Initialize the headers of the outputfiles + // Initialize the headers of the output files edgefile << "EdgeKey\tSource\tOmegaShort\tOmegaFull\tPartOfPath\tTarget\tOm" - "egaShort\tOmegaFull\tPartOfPath\tColor\n"; + "egaShort\tOmegaFull\tPartOfPath\tColor\tEdgeMultiplicity\n"; overviewfile << "SubgraphID\tPath\tDistanceFromLeftEnd\tLength\tED\n"; edgefile.close(); @@ -2476,7 +2454,7 @@ void FMIndexDBG::visualizeSubgraphs(std::vector& paths, for (const auto& path : paths) { // Visualize the subgraph corresponding to this path std::vector nodepath = path.getPosition().getNodePath(); - visualizeSubgraph(nodepath, depth, filename, true, + visualizeSubgraph(nodepath, depth, filename, separateEdges, true, "Subgraph" + to_string(counter) + "_"); // Report the match in the reference text that corresponds to this path overviewfile << counter << "\t" << nodepath[0]; diff --git a/src/fmindexDBG.h b/src/fmindexDBG.h index 5f9a847..4f15994 100644 --- a/src/fmindexDBG.h +++ b/src/fmindexDBG.h @@ -27,13 +27,14 @@ #include "mappingpair.h" #include "node.h" #include "rankinterface.h" -#include "rankselectinterface.h" +#include #include #include +#include // ============================================================================ -// CLASS FMINDEXDBG +// CLASS FM-INDEX DBG // ============================================================================ template @@ -42,9 +43,31 @@ class FMIndexDBG : public FMIndex { friend class FMPosSFR; private: - // De Bruijn parameter - uint k; - // Number of strains in the pan-gemome + using FMIndex::textLength; + using FMIndex::sigma; + using FMIndex::text; + using FMIndex::counts; + using FMIndex::bwt; + using FMIndex::revbwt; + using FMIndex::fwdRepr; + using FMIndex::revRepr; + using FMIndex::sparseSA; + using FMIndex::numberOfSeparationCharacters; + using 
FMIndex::strainFree; + using FMIndex::nodeCounter; + using FMIndex::matrixElementCounter; + using FMIndex::positionsInPostProcessingCounter; + using FMIndex::redundantNodePathsCounter; + + using FMIndex::findSA; + using FMIndex::findLF; + using FMIndex::setDirection; + using FMIndex::extendFMPosIntermediary; + using FMIndex::matchString; + using FMIndex::fromFiles; + using FMIndex::populateTable; + + // Number of strains in the pan-genome uint numberOfStrains; // Number of nodes in the graph; uint numberOfGraphNodes; @@ -53,14 +76,11 @@ class FMIndexDBG : public FMIndex { // graph // Set the bits to one for the last entry in the range in the SA for the - // rightmost k-mer of every node. This bit vector supports rank and select. - RankSelectInterface B_right; + // rightmost k-mer of every node. This bit vector supports rank. + RankInterface B_right; // Set the bits to one for the last entry in the range in the reverse SA for // the leftmost k-mer of every node. This bit vector supports rank. RankInterface B_left; - // Set the bits to one for every entry in the range in the SA for the - // rightmost k-mer of every node. This bit vector supports rank. 
- RankInterface B_right_full; // Mapping of right node IDs std::vector mapping_right; @@ -78,14 +98,17 @@ class FMIndexDBG : public FMIndex { // filtering process thread_local static length_t filterSpecialCaseCounter; - // the (sparse) reverse suffix array of the reference genome, this is only - // initialized in the build process - SparseSuffixArray sparseRevSA; - // Boolean that stores the filtering option in case of strain-free matching: // linear or complete bool filteringOptionComplete = false; + // Timer for finding the node paths + std::chrono::duration elapsedNodePaths = + std::chrono::duration::zero(); + // Timer for accessing the SA and finding the strains + std::chrono::duration elapsedSAtoText = + std::chrono::duration::zero(); + // ---------------------------------------------------------------------------- // ROUTINES FOR THE BUILDING PROCESS // ---------------------------------------------------------------------------- @@ -95,101 +118,124 @@ class FMIndexDBG : public FMIndex { * bidirectional FM-index. * * @param baseFile Base filename for FM-index - * @param k k-mer size + * @param k_list List of the required k-mer sizes * @param sa_sparse Suffix array sparseness factor * @param checkpoint_sparseness Sparseness factors for the checkpoints that * aid in finding node identifiers. 
* @param progress Prints progress if true + * @param skip If true, the construction of the bidirectional FM-index can + * be skipped * @param option Select algorithm option */ - FMIndexDBG(const std::string& baseFile, const int k, + FMIndexDBG(const std::string& baseFile, const std::vector& k_list, const std::vector& sa_sparse, const std::vector& checkpoint_sparseness, - const bool progress, + const bool progress, const bool skip, const SelectOption& option = SelectOption::SIMPLE) - : FMIndex(0, baseFile), k(k) { + : FMIndex(0, baseFile) { - createFMIndex(baseFile, sa_sparse); + if (!skip) { + // The bidirectional FM-index must still be built + createFMIndex(baseFile, sa_sparse); + } - k_DBG = k; + // Set global k variable to 0 here, since there can be multiple k values + // asked for the construction process + k_DBG = 0; + + // Load all required FM-index files, as some are discarded during its + // construction process to save memory + fromFiles(baseFile, false); - this->numberOfSeparationCharacters = 2; + // Load the sparse suffix array + sparseSA = SparseSuffixArray(baseFile, sa_sparse.back()); + + // populate table + populateTable(true); // Count the number of strains - numberOfStrains = - (std::count(this->bwt.begin(), this->bwt.end(), '%') + 1); + numberOfStrains = fwdRepr.occ(sigma.c2i('%'), textLength) + 1; + + numberOfSeparationCharacters = 2; - this->numberOfSeparationCharacters = 2; // Initialize the bit vectors - B_right.setN(this->textLength + 1); - B_right.setOption(RankSelectOption::RANK9SELECT); - B_left.setN(this->textLength + 1); + B_right.setN(textLength + 1); + B_right.setOption(RankOption::RANK9); + B_left.setN(textLength + 1); B_left.setOption(RankOption::RANK9); - B_right_full.setN(this->textLength + 1); - B_right_full.setOption(RankOption::RANK9); - std::vector B_rights(checkpoint_sparseness.size()); - std::vector B_right_fulls(checkpoint_sparseness.size()); + for (uint k : k_list) { - for (uint j = 0; j < checkpoint_sparseness.size(); j++) {
- B_rights[j].setN(this->textLength + 1); - B_rights[j].setOption(RankSelectOption::RANK9SELECT); - B_right_fulls[j].setN(this->textLength + 1); - B_right_fulls[j].setOption(RankOption::RANK9); - } + // Clear the data corresponding to the previous k value + G.clear(); + mapping_left.clear(); + mapping_right.clear(); + + // Initialize the B_right vectors for the different checkpoint + // sparseness factors + std::vector B_rights(checkpoint_sparseness.size()); + for (uint j = 0; j < checkpoint_sparseness.size(); j++) { + B_rights[j].setN(textLength + 1); + B_rights[j].setOption(RankOption::RANK9); + } + + // Initialize the mappings for different checkpoint sparseness + // factors + std::vector> mapping_rights( + checkpoint_sparseness.size()); - std::vector> mapping_rights( - checkpoint_sparseness.size()); + // Reset matching direction + setDirection(FORWARD); - // Create the implicit compressed De Bruijn graph - buildCompressedGraph(checkpoint_sparseness, progress, B_rights, - B_right_fulls, mapping_rights); + // Create the implicit compressed De Bruijn graph + buildCompressedGraph(checkpoint_sparseness, progress, B_rights, + mapping_rights, k); - // Write the bitvectors to output files + // Write out the results + writeDBGfilesToOutput(baseFile, k, checkpoint_sparseness, B_rights, + mapping_rights); + } + } + + void writeDBGfilesToOutput( + const std::string& baseFile, const int k, + const std::vector& checkpoint_sparseness, + const std::vector& B_rights, + std::vector>& mapping_rights) { + // Write the bit vectors to output files { for (uint j = 0; j < checkpoint_sparseness.size(); j++) { std::string suffix = checkpoint_sparseness[j] == INT32_MAX ? "none" : std::to_string(checkpoint_sparseness[j]); - std::ofstream ofs(baseFile + ".B.right." + suffix); + std::string filename = baseFile + ".B.right.k"; + filename += std::to_string(k) + ".cp" + suffix; + std::ofstream ofs(filename); if (!ofs) { - throw std::runtime_error("Cannot open file: " + baseFile + - ".B.right." 
+ suffix); + throw std::runtime_error("Cannot open file: " + filename); } B_rights[j].write(ofs); ofs.close(); } } { - std::ofstream ofs(baseFile + ".B.left"); + std::string filename = baseFile + ".B.left.k"; + filename += std::to_string(k); + std::ofstream ofs(filename); if (!ofs) { - throw std::runtime_error("Cannot open file: " + baseFile + - ".B.left"); + throw std::runtime_error("Cannot open file: " + filename); } B_left.write(ofs); ofs.close(); } - // Write the bitvectors to output files - { - for (size_t j = 0; j < checkpoint_sparseness.size(); j++) { - std::string suffix = - checkpoint_sparseness[j] == INT32_MAX - ? "none" - : std::to_string(checkpoint_sparseness[j]); - std::ofstream ofs(baseFile + ".B.right.full." + suffix); - if (!ofs) { - throw std::runtime_error("Cannot open file: " + baseFile + - ".B.right.full." + suffix); - } - B_right_fulls[j].write(ofs); - ofs.close(); - } - } - // Write the compressed De Bruijn graph to an output file + // Write the compressed De Bruijn graph to an output file, including the + // corresponding k value { - std::ofstream ofs(baseFile + ".DBG"); + std::string filename = baseFile + ".DBG.k"; + filename += std::to_string(k); + std::ofstream ofs(filename); ofs.write((char*)&k, sizeof(k)); for (std::vector::iterator itr = G.begin(); itr != G.end(); ++itr) { @@ -205,11 +251,11 @@ class FMIndexDBG : public FMIndex { checkpoint_sparseness[j] == INT32_MAX ? "none" : std::to_string(checkpoint_sparseness[j]); - std::ofstream ofs(baseFile + ".right.map." + suffix, - std::ios::binary); + std::string filename = baseFile + ".right.map.k"; + filename += std::to_string(k) + ".cp" + suffix; + std::ofstream ofs(filename, std::ios::binary); if (!ofs) { - throw std::runtime_error("Cannot open file: " + baseFile + - ".right.map." 
+ suffix); + throw std::runtime_error("Cannot open file: " + filename); } for (std::vector::iterator itr = mapping_rights[j].begin(); @@ -220,10 +266,11 @@ class FMIndexDBG : public FMIndex { } } { - std::ofstream ofs(baseFile + ".left.map", std::ios::binary); + std::string filename = baseFile + ".left.map.k"; + filename += std::to_string(k); + std::ofstream ofs(filename, std::ios::binary); if (!ofs) { - throw std::runtime_error("Cannot open file: " + baseFile + - ".left.map"); + throw std::runtime_error("Cannot open file: " + filename); } ofs.write((char*)&mapping_left[0], mapping_left.size() * sizeof(int)); @@ -244,32 +291,17 @@ class FMIndexDBG : public FMIndex { const std::vector& sparse_sa); /** - * @brief Find the entry in the reverse suffix array of this index. This is - * computed from the sparse reverse suffix array and the reverse bwt. This - * function only works when the reverse SA is stored, which is only during - * the build process. - * - * @param index the index to find the entry in the reverse SA of - * @return length_t - the entry in the SA of the index - */ - length_t findRevSA(length_t index) const; - - /** - * @brief Build the compacted longest common prefix array. An entry of 0 - * means that the LCP is shorter than k, an entry of 1 means that the LCP - * has length k and an entry of 2 means that the LCP is longer than k. + * @brief Build the compacted longest common prefix array corresponding to + * the Kasai algorithm. An entry of 0 means that the LCP is shorter than k, + * an entry of 1 means that the LCP has length k and an entry of 2 means + * that the LCP is longer than k. * * @param LCP A 2-bitvector object in which the compacted LCP will be * stored. It will be overwritten. 
* @param progress Prints progress if true + * @param k The k value determining the compacted LCP */ - void computeLCP(Bitvec2& LCP, bool progress); - - /** - * @brief Compute the Br_right and Bl_right bitvectors - * - * @param Q Node queue - */ + void computeLCPKasai(Bitvec2& LCP, bool progress, uint& k); /** * @brief Build the Br_right and Bl_right bitvectors @@ -280,9 +312,10 @@ class FMIndexDBG : public FMIndex { * @param Bl_right bit vector in which Br_right will be stored, will be * overwritten * @param progress Prints progress if true + * @param k De Bruijn k parameter corresponding to the bitvectors */ void computeBitVectors(std::queue& Q, Bitvec& Br_right, - Bitvec& Bl_right, bool progress); + Bitvec& Bl_right, bool progress, uint& k); /** * @brief For an ω-interval [i,j[, the function call getIntervals(i,j) @@ -302,19 +335,11 @@ class FMIndexDBG : public FMIndex { * first k-mer is of interest * @param isEndNode indicates whether the node we are looking at is an end * node, since these have a different convention + * @param k length of the k-mer * @return Range - the range over the reverse suffix array corresponding to * the k-mer of interest */ - Range getReverseRange(length_t indexInSA, bool isEndNode); - - /** - * @brief Set the edge mapping of a node. The edge mapping maps the ranks of - * the edges in the SA to their ranks in the reverse SA. - * - * @param id The identifier of the node for which the mapping must be - * constructed. 
- */ - void setEdgeMapping(length_t id); + Range getReverseRangeKMer(length_t indexInSA, bool isEndNode, uint& k); /** * @brief Build the implicit compressed de Bruijn graph @@ -324,17 +349,15 @@ class FMIndexDBG : public FMIndex { * @param progress Prints progress if true * @param B_rights B_right bit vectors for different checkpoint sparseness * factors - * @param B_right_fulls B_right_full bit vectors for different checkpoint - * sparseness factors * @param mapping_rights mapping_right mappings for different checkpoint * sparseness factors + * @param k current de Bruijn parameter k */ void buildCompressedGraph(const std::vector& checkpoint_sparseness, - bool progress, - std::vector& B_rights, - std::vector& B_right_fulls, - std::vector>& mapping_rights); + bool progress, std::vector& B_rights, + std::vector>& mapping_rights, + uint& k); // ---------------------------------------------------------------------------- // HELPER ROUTINES FOR MAPPING @@ -362,36 +385,6 @@ class FMIndexDBG : public FMIndex { */ int findIDFirst(length_t i) const; - /** - * @brief If i is an index for the suffix array, find the node in the - * compressed de Bruijn graph that corresponds to T[SA[i]..SA[i]+k[. - * T[SA[j-1]..SA[j-1]+k[ can be at any location in the node. - * - * @param i Index in the suffix array indicating a certain k-mer of a - * node - * @param id the ID of the corresponding node in the compressed de Bruijn - * graph (to be filled in) - * @param l the number of characters that are before the k-mer in the node - * (to be filled in) - */ - - /** - * @brief If i is an index for the suffix array, find the node in the - * compressed de Bruijn graph that corresponds to T[SA[i]..SA[i]+k[. - * T[SA[j-1]..SA[j-1]+k[ can be at any location in the node. 
- * - * @param i Index in the suffix array indicating a certain k-mer of a - * node - * @param id the ID of the corresponding node in the compressed de Bruijn - * graph (to be filled in) - * @param l the number of characters that are before the k-mer in the node - * (to be filled in) - * @param offset the offset of the edge in the reverse SA corresponding to - * index i in the regular SA (to be filled in) - */ - void findIDandOffset(length_t i, uint32_t& id, uint32_t& l, - uint32_t& offset) const; - /** * @brief Find the identifier of the successor of node id by following the * 'offset'th edge (with respect to the reverse SA) through node id. Also @@ -422,7 +415,7 @@ class FMIndexDBG : public FMIndex { * * @param id Identifier of the current node * @param id_successor Identifier of the successor (to be filled in) - * @param posInAlphabet Position in the alfabet of the character to add + * @param posInAlphabet Position in the alphabet of the character to add * @param reverse_offset The offset in the new interval (with respect to the * reverse SA). This is needed for end nodes. * @return true - a successor was found @@ -438,7 +431,7 @@ class FMIndexDBG : public FMIndex { * * @param id Identifier of the current node * @param id_predecessor Identifier of the predecessor (to be filled in) - * @param posInAlphabet Position in the alfabet of the character to add + * @param posInAlphabet Position in the alphabet of the character to add * @param offset Placeholder variable such that jumpToPredecessorWithChar * and jumpToSuccessorWithChar have the same parameters. This variable is * not used. 
@@ -475,17 +468,18 @@ class FMIndexDBG : public FMIndex { * * @param ranges the ranges in the SA and revSA of the approximate match * @param nodepath the node path of the approximate match + * @param distanceFromLeftEnd the distance from the occurrence to the left + * end of the first node of the node path * @param patternLength the length of the approximate match * @param distance the error distance of the approximate match * @param shift the shit of the approximate match * @return std::vector - a vector of matches in the text * containing a range and the edit distance */ - std::vector - convertToMatchesInTextSFI(const SARangePair& ranges, - const std::vector& nodepath, - const int& patternLength, const int& distance = 0, - const length_t& shift = 0); + std::vector convertToMatchesInTextSFI( + const SARangePair& ranges, const std::vector& nodepath, + const uint32_t& distanceFromLeftEnd, const int& patternLength, + const int& distance = 0, const length_t& shift = 0); /** * @brief Check if one of the children of a position is a separation @@ -496,7 +490,7 @@ class FMIndexDBG : public FMIndex { * character * @return false otherwise */ - const bool separationIsNext(positionClass pos) const; + const bool separationIsNext(positionClass pos) const override; /** * @brief Pushes all the children corresponding to the node with ranges @@ -510,7 +504,7 @@ class FMIndexDBG : public FMIndex { */ void extendFMPos(const SARangePair& parentRanges, std::vector>& stack, int row, - int trueDepth = -1) override; + int trueDepth = -1) const override; /** * @brief Pushes all the children corresponding to the this position onto @@ -519,8 +513,9 @@ class FMIndexDBG : public FMIndex { * @param pos the position to get the children of * @param stack the stack to push the children on */ - void extendFMPos(const positionClass& pos, - std::vector>& stack) override; + void + extendFMPos(const positionClass& pos, + std::vector>& stack) const override; /** * @brief Find the node path in the 
compressed De Bruijn graph corresponding @@ -535,16 +530,19 @@ class FMIndexDBG : public FMIndex { findNodePathForMatch(const FMOcc& occ); /** - * @brief Find the node path in the compressed De Bruijn graph corresponding - * to a certain match in a forward way + * @brief Find the node path in the compressed De Bruijn graph + * corresponding to a certain match in a forward way * * @param pos The position corresponding to the match of interest * @param shift The shift corresponding to the match of interest - * @param path The node path corresponding to the match of interest (to be - * filled in) + * @param path The node path corresponding to the match of interest (to + * be filled in) + * @param distanceFromLeftEnd the distance from the occurrence to the left + * end of the first node of the node path */ void findNodePathForMatchForward(const positionClass& pos, const int& shift, - std::vector& path) const; + std::vector& path, + uint32_t& distanceFromLeftEnd); /** * @brief Find to which strain a certain position in the text belongs @@ -552,7 +550,7 @@ class FMIndexDBG : public FMIndex { * @param input Position in the original text * @return int - Strain ID */ - int findStrain(length_t input); + int findStrain(length_t input) const; // ---------------------------------------------------------------------------- // ROUTINES FOR FILTERING STRAIN-FREE OCCURRENCES @@ -571,7 +569,7 @@ class FMIndexDBG : public FMIndex { /** * @brief Alternative compare function used for sorting purposes. This - * function first distinghuishes based on node path size. + * function first distinguishes based on node path size. 
* * @param occ1 First occurrence to compare * @param occ2 Second occurrence to compare @@ -633,7 +631,7 @@ class FMIndexDBG : public FMIndex { /** * @brief This function is called when a new minimum is found such that the - * replacement related paramters for all occurrences in the current prefix + * replacement related parameters for all occurrences in the current prefix * branch can be updated. * * @param previousMatches The vector containing the current prefix branch @@ -724,7 +722,7 @@ class FMIndexDBG : public FMIndex { /** * @brief This function iterates over all occurrences and analyzes them by - * checking if the node paths are prefixes of eachother. If so, certain + * checking if the node paths are prefixes of each other. If so, certain * attributes are set. The result of this function depends on the direction * of the node path (regular or reverse). * @@ -787,7 +785,7 @@ class FMIndexDBG : public FMIndex { // ---------------------------------------------------------------------------- /** - * @brief Initialize the outputfiles for the subgraph visualization process + * @brief Initialize the output files for the subgraph visualization process * * @param filename The base filename for all output files * @param multipleSubgraphs Indicates whether there are multiple subgraphs @@ -802,7 +800,7 @@ class FMIndexDBG : public FMIndex { * @brief Make sure a node is visited in the visualization process * * @param id The node that needs to be visited - * @param depth The current neigborhood depth + * @param depth The current neighborhood depth * @param visited_nodes Array of nodes that have been visited * @param node_queue Queue of nodes that still need to be visited */ @@ -826,34 +824,44 @@ class FMIndexDBG : public FMIndex { std::vector& path); /** - * @brief Intermadiary function in the visualization process. It takes a + * @brief Intermediary function in the visualization process. 
It takes a * node and executes everything that is necessary to visualize this node - * and its surroudings. + * and its surroundings. * * @param path node path of the original match * @param subgraph_id ID of the subgraph (a match can have multiple * subgraphs) * @param visited_nodes array of nodes that have been visited * @param node_queue queue of nodes that still need to be visited - * @param edgefile the output stream corresponding to the edge file + * @param edges Map of edges, along with their corresponding colors and + * respective multiplicities * @param edgecounter counter for the key column * @param visualizedNodes Vector containing all initialized visualization - * nodes + * @param separateEdges if false, parallel edges must be bundled together + * irrespective of their color + * @param subgraphNodes to be filled in: all nodes in the node path */ void visualizeSubgraphIntermediary( std::vector& path, std::string subgraph_id, std::vector& visited_nodes, - std::queue>& node_queue, std::ofstream& edgefile, - size_t& edgecounter, std::vector& visualizedNodes); + std::queue>& node_queue, + std::map>& edges, + size_t& edgecounter, std::vector& visualizedNodes, + bool separateEdges, std::set& subgraphNodes); public: + using FMIndex::matchStringBidirectionally; + using FMIndex::findRangesWithExtraCharBackward; + using FMIndex::findRangesWithExtraCharForward; + using FMIndex::approxMatchesNaiveIntermediate; + using FMIndex::getText; // ---------------------------------------------------------------------------- // ROUTINES FOR THE BUILDING AND LOADING PROCESS // ---------------------------------------------------------------------------- /** * @brief Build the implicit representation of the compressed De Bruijn - * graph based on the bidirectional FM-index + * graph based on the bidirectional FM-index. 
* * @param baseFile base for the filenames * @param k parameter for the De Bruijn graph @@ -861,12 +869,16 @@ class FMIndexDBG : public FMIndex { * @param checkpoint_sparseness sparseness factors for the checkpoints that * aid in finding node identifiers. * @param progress prints progress if true + * @param skip if true, the bidirectional FM-index is already present and + * must not be rebuilt */ - static void buildFMIndexDBG(const std::string& baseFile, const int k, + static void buildFMIndexDBG(const std::string& baseFile, + const std::vector& k, const std::vector& sa_sparse, const std::vector& checkpoint_sparseness, - const bool progress) { - FMIndexDBG(baseFile, k, sa_sparse, checkpoint_sparseness, progress); + const bool progress, const bool skip) { + FMIndexDBG(baseFile, k, sa_sparse, checkpoint_sparseness, progress, + skip); } /** @@ -875,117 +887,126 @@ class FMIndexDBG : public FMIndex { * @param baseFile base for the filenames * @param sa_sparse sparseness factor for the SA * @param cp_sparse sparseness factor for the checkpoint k-mers + * @param k de Bruijn parameter * @param strainFree bool that indicates whether strain-free matching * will be used or not + * @param filteringOptionComplete filtering option for strain-free matching * @param option select algorithm option */ FMIndexDBG(const std::string& baseFile, int sa_sparse, int cp_sparse, - bool strainFree = false, bool filteringOptionComplete = false, + uint k, bool strainFree = false, + bool filteringOptionComplete = false, const SelectOption& option = SelectOption::SIMPLE) - : FMIndex(baseFile, sa_sparse) { + : FMIndex(baseFile, sa_sparse, true, strainFree) { std::cout << "Constructing the compressed de Bruijn graph..." 
<< std::endl; + // Store the de Bruijn parameter in a global variable so that it can be + // used everywhere + k_DBG = k; + this->strainFree = strainFree; this->filteringOptionComplete = filteringOptionComplete; // Find the number of strains - numberOfStrains = - (std::count(this->bwt.begin(), this->bwt.end(), '%') + 1); - this->numberOfSeparationCharacters = 2; + numberOfStrains = fwdRepr.occ(sigma.c2i('%'), textLength) + 1; + numberOfSeparationCharacters = 2; // Find all start positions of the different strains and sort them along // with the position of the sentinel character - sorted_startpositions.emplace_back(this->findSA(0)); + sorted_startpositions.emplace_back(findSA(0)); for (length_t i = 1; i < numberOfStrains; i++) { - sorted_startpositions.emplace_back(this->findSA(i) + 1); + sorted_startpositions.emplace_back(findSA(i) + 1); } std::sort(sorted_startpositions.begin(), sorted_startpositions.end()); std::string suffix = cp_sparse == INT32_MAX ? "none" : std::to_string(cp_sparse); - // read the bit vectors for the implicit representation of the - // compressed De Bruijn graph from files + // read implicit representation of the compressed De Bruijn graph from + // files { - std::ifstream ifs(baseFile + ".B.right." + suffix); + G = std::vector(); + Node n = Node(); + std::string filename = baseFile + ".DBG.k"; + filename += std::to_string(k_DBG); + std::ifstream ifs(filename); if (!ifs) { - throw std::runtime_error("Cannot open file " + baseFile + - ".B.right." + suffix); + std::string filename2 = baseFile + ".DBG"; + std::ifstream ifs2(filename2); + if (ifs2) { + std::cout + << "\nIt seems like you are trying to run Nexus v1.1.0 " + "on an index that was built using Nexus v1.0.0, " + "which is not possible. Please rebuild your index." 
+ << std::endl; + exit(0); + } + throw std::runtime_error("Cannot open file " + filename); + } + ifs.read((char*)&k_DBG, sizeof(k_DBG)); + while (!n.read(ifs).eof()) { + G.emplace_back(n); } - B_right.read(ifs); ifs.close(); } + std::cout << "k = " << k_DBG << std::endl; + numberOfGraphNodes = G.size(); + + std::cout << "The pan-genome graph contains " << numberOfGraphNodes + << " nodes." << std::endl; + + // read the bit vectors for the implicit representation of the + // compressed De Bruijn graph from files { - std::ifstream ifs(baseFile + ".B.left"); + std::string filename = baseFile + ".B.right.k"; + filename += std::to_string(k_DBG) + ".cp" + suffix; + std::ifstream ifs(filename); if (!ifs) { - throw std::runtime_error("Cannot open file " + baseFile + - ".B.left"); + throw std::runtime_error("Cannot open file " + filename); } - B_left.read(ifs); + B_right.read(ifs); ifs.close(); } { - std::ifstream ifs(baseFile + ".B.right.full." + suffix); + std::string filename = baseFile + ".B.left.k"; + filename += std::to_string(k_DBG); + std::ifstream ifs(filename); if (!ifs) { - throw std::runtime_error("Cannot open file " + baseFile + - ".B.right.full." 
+ suffix); + throw std::runtime_error("Cannot open file " + filename); } - B_right_full.read(ifs); + B_left.read(ifs); ifs.close(); } // // Debugging // for (length_t i = 0; i < textLength; i++) // { - // std::cout << i << " B_left " << B_left[i] + // std::cout << i << " B_left " << B_left[i] << " B_right " << + // B_right[i] // << std::endl; // } - // read implicit representation of the compressed De Bruijn graph from - // files - { - G = std::vector(); - Node n = Node(); - std::ifstream ifs(baseFile + ".DBG"); - if (!ifs) { - throw std::runtime_error("Cannot open file " + baseFile + - ".DBG"); - } - ifs.read((char*)&k, sizeof(k)); - while (!n.read(ifs).eof()) { - G.emplace_back(n); - } - ifs.close(); - } - std::cout << "k = " << k << std::endl; - k_DBG = k; - numberOfGraphNodes = G.size(); - - std::cout << "The pan-genome graph contains " << numberOfGraphNodes - << " nodes." << std::endl; - // read the node identifier mappings from files { - std::ifstream ifs(baseFile + ".right.map." + suffix, - std::ios::binary); + std::string filename = baseFile + ".right.map.k"; + filename += std::to_string(k_DBG) + ".cp" + suffix; + std::ifstream ifs(filename, std::ios::binary); if (!ifs) { - throw std::runtime_error("Cannot open file " + baseFile + - ".right.map." + suffix); - } - // ifs.seekg(0, std::ios::end); - // mapping_right.resize(ifs.tellg() / sizeof(int)); //TODO can this - // still be used? 
- MappingPair mp = MappingPair(); - while (!mp.read(ifs).eof()) { - mapping_right.emplace_back(mp); + throw std::runtime_error("Cannot open file " + filename); } + ifs.seekg(0, std::ios::end); + mapping_right.resize(ifs.tellg() / (2 * sizeof(int))); + ifs.seekg(0, std::ios::beg); + ifs.read((char*)&mapping_right[0], + mapping_right.size() * 2 * sizeof(int)); ifs.close(); } { - std::ifstream ifs(baseFile + ".left.map", std::ios::binary); + std::string filename = baseFile + ".left.map.k"; + filename += std::to_string(k_DBG); + std::ifstream ifs(filename, std::ios::binary); if (!ifs) { - throw std::runtime_error("Cannot open file " + baseFile + - ".left.map"); + throw std::runtime_error("Cannot open file " + filename); } ifs.seekg(0, std::ios::end); mapping_left.resize(ifs.tellg() / sizeof(int)); @@ -1002,6 +1023,25 @@ class FMIndexDBG : public FMIndex { // GENERAL HELPER ROUTINES // ---------------------------------------------------------------------------- + /** + * @brief Get the timer for finding the node paths in the graph + * + * @return std::chrono::duration - timer + */ + std::chrono::duration getNodePathDuration() const { + return elapsedNodePaths; + } + + /** + * @brief Get the timer for postprocessing (accessing the suffix array, + * filtering and identifying the strains) + * + * @return std::chrono::duration - timer + */ + std::chrono::duration getSADuration() const { + return elapsedSAtoText; + } + /** * @brief return whether we are matching strain-free or not * @@ -1009,7 +1049,7 @@ class FMIndexDBG : public FMIndex { * @return false otherwise */ bool isStrainFree() const { - return this->strainFree; + return strainFree; } /** @@ -1057,37 +1097,82 @@ class FMIndexDBG : public FMIndex { FMPosSFR::setDirection(d); } + /** + * @brief Return a set of statistics concerning the graph topology. All + * parameters are to be filled in. 
+ * + * @param numberOfNodes total number of nodes + * @param numberOfEdges total number of edges + * @param totalLength total length summed over all nodes + * @param totalMultiplicity total multiplicity summed over all nodes (only + * differs from numberOfEdges because of the end nodes) + * @param lengths vector of the lengths of all nodes + * @param multiplicities vector of the multiplicities of all nodes + */ + void stats(uint32_t& numberOfNodes, uint64_t& numberOfEdges, + uint64_t& totalLength, uint64_t& totalMultiplicity, + std::vector& lengths, + std::vector& multiplicities) { + numberOfNodes = G.size(); + for (Node n : G) { + if (!(n.right_kmer_forward < numberOfStrains)) { + numberOfEdges += n.multiplicity; + } + totalMultiplicity += n.multiplicity; + multiplicities.push_back(n.multiplicity); + totalLength += n.len; + lengths.push_back(n.len); + } + } + + /** + * @brief Get the node with a certain identifier + * + * @param nodeID identifier of the node of interest + * @return const Node& - output node + */ + const Node& getNode(const uint32_t& nodeID) const { + return G[nodeID]; + } + + /** + * @brief Get the implicit graph vector + * + * @return const std::vector& - implicit graph vector + */ + const std::vector& getGraph() const { + return G; + } + // ---------------------------------------------------------------------------- // ROUTINES FOR MAPPING // ---------------------------------------------------------------------------- /** - * @brief If i is an index for the suffix array, find the node in the - * compressed de Bruijn graph that corresponds to T[SA[i]..SA[i]+k[. - * T[SA[j-1]..SA[j-1]+k[ can be at any location in the node. + * @brief If j is the (exclusive) right bound of the SA interval corresponding + * to a certain k-mer, find the node in the compressed de Bruijn graph that + * corresponds to T[SA[j-1]..SA[j-1]+k[. T[SA[j-1]..SA[j-1]+k[ can be at any + * location in the node.
* - * @param i Index in the suffix array indicating a certain k-mer of a + * @param j Index in the suffix array indicating a certain k-mer of a * node * @param id the ID of the corresponding node in the compressed de Bruijn * graph (to be filled in) * @param l the number of characters that are before the k-mer in the node * (to be filled in) */ - void findID(length_t i, uint32_t& id, uint32_t& l) const; - - // ---------------------------------------------------------------------------- - // ROUTINES FOR EXACT PATTERN MATCHING - // ---------------------------------------------------------------------------- + void findID(length_t j, uint32_t& id, uint32_t& l) const; /** - * @brief Exactly match the pattern to the compressed De Bruijn graph in a - * strain-fixed way + * @brief For an occurrence shorter than k, find all possible nodes in which + * it occurs * - * @param pattern the pattern to be matched - * @return std::vector - the result of the matching - * procedure + * @param occ The occurrence of interest + * @param shift Shift caused by the optimized edit distance + * @param path Output node path (to be filled in) */ - std::vector ExactMatchSFI(const std::string& pattern); + void findNodeUnderK(const positionClass& occ, const int& shift, + std::vector& path); // ---------------------------------------------------------------------------- // ROUTINES FOR APPROXIMATE PATTERN MATCHING @@ -1153,13 +1238,18 @@ class FMIndexDBG : public FMIndex { * @param path the path that needs to visualized * @param depth the depth of the subgraph * @param filename the base for the filenames of the output files + * @param separateEdges if false, parallel edges must be bundled together + * irrespective of their color * @param multipleSubgraphs bool indicating whether there are multiple * subgraphs * @param subgraph_id id of the subgraph + * @return std::set - all nodes in the subgraph */ - void visualizeSubgraph(std::vector& path, uint32_t depth, - std::string filename, bool 
multipleSubgraphs = false, - std::string subgraph_id = ""); + std::set visualizeSubgraph(std::vector& path, + uint32_t depth, std::string filename, + bool separateEdges, + bool multipleSubgraphs = false, + std::string subgraph_id = ""); /** * @brief Visualize the subgraphs corresponding to the (approximate) matches @@ -1170,10 +1260,13 @@ class FMIndexDBG : public FMIndex { * text occurrences as values * @param depth the depth of the subgraph * @param filename the base for the filenames of the output files + * @param separateEdges if false, parallel edges must be bundled together + * irrespective of their color */ void visualizeSubgraphs( - std::map, std::vector>& paths, - uint32_t depth, std::string filename); + const std::map, std::vector>& + paths, + uint32_t depth, std::string filename, bool separateEdges); /** * @brief Visualize the subgraphs corresponding to the (approximate) matches @@ -1183,7 +1276,9 @@ class FMIndexDBG : public FMIndex { * @param paths vector containing the strain-free matches * @param depth the depth of the subgraph * @param filename the base for the filenames of the output files + * @param separateEdges if false, parallel edges must be bundled together + * irrespective of their color */ - void visualizeSubgraphs(std::vector& paths, uint32_t depth, - std::string filename); + void visualizeSubgraphs(const std::vector& paths, uint32_t depth, + std::string filename, bool separateEdges); }; diff --git a/src/fmocc.h b/src/fmocc.h index aab4625..3a22d60 100644 --- a/src/fmocc.h +++ b/src/fmocc.h @@ -208,7 +208,7 @@ template class FMOcc { } /** - * @brief Operator overloading. Two FMocc are equal if their position, + * @brief Operator overloading. Two FMOcc are equal if their position, * distance and shift are all equal. 
* * @param rhs @@ -254,6 +254,8 @@ template class FMOccSFI : public FMOcc { private: // The node path in the graph that corresponds to this FMOcc std::vector nodepath; + // The distance of the occurrence from the left end of the start node + uint32_t distanceFromLeftEnd; public: /** @@ -261,9 +263,14 @@ template class FMOccSFI : public FMOcc { * * @param occ The FMOcc object on which this object will be built * @param nodepath The node path in the graph to be added + * @param distanceFromLeftEnd The distance of the occurrence from the left + * end of the start node */ - FMOccSFI(FMOcc occ, const std::vector& nodepath) - : FMOcc(occ), nodepath(nodepath) { + FMOccSFI(const FMOcc& occ, + const std::vector& nodepath, + const uint32_t& distanceFromLeftEnd) + : FMOcc(occ), nodepath(nodepath), + distanceFromLeftEnd(distanceFromLeftEnd) { } /** @@ -274,6 +281,16 @@ template class FMOccSFI : public FMOcc { const std::vector& getNodePath() const { return nodepath; } + + /** + * @brief Get the distance from the left end of the start node + * + * @return const uint32_t& - the distance from the left end of the start + * node + */ + const uint32_t& getDistanceFromLeftEnd() const { + return distanceFromLeftEnd; + } }; // ============================================================================ @@ -352,7 +369,7 @@ class FMOccSFR : public FMOcc { /** * @brief Construct a new FMOccSFR object or a strain-free approximate - * matche in the pan-genome compressed de Bruijn graph + * match in the pan-genome compressed de Bruijn graph * * @param pos the position in the pan-genome compressed de Bruijn graph of * this approximate match @@ -457,7 +474,7 @@ class FMOccSFR : public FMOcc { } /** - * @brief Static function. Object relpacement replaces thisOcc. Therefore, + * @brief Static function. Object replacement replaces thisOcc. Therefore, * the replacements attributes of thisOcc must be updated based on object * replacement.
This is only done if object replacement is effectively * better than any other replacement that was found up until now (in both @@ -471,7 +488,7 @@ class FMOccSFR : public FMOcc { static setReplacementPtr setReplacement; /** - * @brief Static function. Object relpacement replaces thisOcc. Therefore, + * @brief Static function. Object replacement replaces thisOcc. Therefore, * the replacements attributes of thisOcc must be updated based on object * replacement. This is only done if object replacement is effectively * better than any other replacement that was found up until now (in both @@ -534,7 +551,7 @@ class FMOccSFR : public FMOcc { } /** - * @brief Static function. Object relpacement replaces thisOcc. Therefore, + * @brief Static function. Object replacement replaces thisOcc. Therefore, * the replacements attributes of thisOcc must be updated based on object * replacement. This is only done if object replacement is effectively * better than any other replacement that was found up until now (in both diff --git a/src/fmpos.cpp b/src/fmpos.cpp index 873841e..f04ac92 100644 --- a/src/fmpos.cpp +++ b/src/fmpos.cpp @@ -61,7 +61,7 @@ bool FMPos::compare(const FMOcc& rhs, int distance, length_t shift) const { if (getRanges().getRangeSA().getBegin() != rhs.getRanges().getRangeSA().getBegin()) { - // the positions do not beging at the same position, first position is + // the positions do not begin at the same position, first position is // smaller return getRanges().getRangeSA().getBegin() < rhs.getRanges().getRangeSA().getBegin(); @@ -93,11 +93,11 @@ ExtendFMPosAboveKPtr FMPosSFR::ExtendFMPosAboveK; AddCharAboveKPtr FMPosSFR::AddCharAboveK; void FMPosSFR::changeNodeRepresentation() { - // Get the left boundary of the SA interval of the match - length_t lb = ranges.getRangeSA().getBegin(); + // Get the right boundary of the SA interval of the match + length_t rb = ranges.getRangeSA().getEnd(); // Find the first node of the path uint32_t id, l; - 
((FMIndexDBG*)index)->findID(lb, id, l); + ((FMIndexDBG*)index)->findID(rb, id, l); // Add the node to the path appendToNodePath(id); // this is also the final node on the left (no left extension has happened @@ -148,7 +148,10 @@ void FMPosSFR::extendFMPosAboveKBackward( if (indexInSA != node.left_kmer_forward) { // We can add a character whilst staying in the current front node // Get the character that was added - char c = ((FMIndexDBG*)index)->bwt[indexInSA]; + + uint64_t bwtSymbol = ((FMIndexDBG*)index)->bwt[indexInSA]; + char c = ((FMIndexDBG*)index)->sigma.i2c(bwtSymbol); + // Check that it is no separation character if (!(c == '$' || c == '%')) { // Push this position onto the stack, along with the character that @@ -239,7 +242,10 @@ void FMPosSFR::extendFMPosAboveKForward( if (indexInRevSA != node.right_kmer_reverse) { // We can add a character whilst staying in the current end node // Get the character that was added - char c = ((FMIndexDBG*)index)->revbwt[indexInRevSA]; + uint64_t bwtSymbol = + ((FMIndexDBG*)index)->revbwt[indexInRevSA]; + char c = ((FMIndexDBG*)index)->sigma.i2c(bwtSymbol); + // Check that it is no separation character if (!(c == '$' || c == '%')) { // Push this position onto the stack, along with the character that @@ -327,7 +333,9 @@ bool FMPosSFR::addCharAboveKBackward(const char& c) { // We can add a character whilst staying in the current front node // Check if the next character in the current front node is equal to the // character c we want to add - if (((FMIndexDBG*)index)->bwt[indexInSA] != c) { + uint64_t bwtSymbol = ((FMIndexDBG*)index)->bwt[indexInSA]; + char character = ((FMIndexDBG*)index)->sigma.i2c(bwtSymbol); + if (character != c) { // The characters are not equal, so character c cannot be added. To // indicate that this position is invalid, we set the ranges to // empty. 
@@ -356,7 +364,7 @@ bool FMPosSFR::addCharAboveKBackward(const char& c) { // Hence, we need to investigate the possible predecessors of the front // node. - // Get the index in alfabet sigma that corresponds to character c + // Get the index in alphabet sigma that corresponds to character c int posInAlphabet = ((FMIndexDBG*)index)->sigma.c2i((unsigned char)c); // Check that this index is valid @@ -395,7 +403,7 @@ bool FMPosSFR::addCharAboveKBackward(const char& c) { return true; } } - // The index in alfabet sigma is not valid or there exists no + // The index in alphabet sigma is not valid or there exists no // predecessor that is the result of the extension of the current match // with character c. To indicate that this position is invalid, we set // the ranges to empty. @@ -415,7 +423,10 @@ bool FMPosSFR::addCharAboveKForward(const char& c) { // We can add a character whilst staying in the current end node // Check if the next character in the current end node is equal to the // character c we want to add - if (((FMIndexDBG*)index)->revbwt[indexInRevSA] != c) { + uint64_t bwtSymbol = + ((FMIndexDBG*)index)->revbwt[indexInRevSA]; + char charac = ((FMIndexDBG*)index)->sigma.i2c(bwtSymbol); + if (charac != c) { // The characters are not equal, so character c cannot be added. To // indicate that this position is invalid, we set the ranges to // empty. @@ -444,7 +455,7 @@ bool FMPosSFR::addCharAboveKForward(const char& c) { // Hence, we need to investigate the possible successors of the end // node. 
- // Get the index in alfabet sigma that corresponds to character c + // Get the index in alphabet sigma that corresponds to character c int posInAlphabet = ((FMIndexDBG*)index)->sigma.c2i((unsigned char)c); // Check that this index is valid @@ -482,7 +493,7 @@ bool FMPosSFR::addCharAboveKForward(const char& c) { return true; } } - // The index in alfabet sigma is not valid or there exists no + // The index in alphabet sigma is not valid or there exists no // successor that is the result of the extension of the current match // with character c. To indicate that this position is invalid, we set // the ranges to empty. @@ -519,7 +530,7 @@ const bool FMPosSFR::separationIsNext() const { for (int i = 0; i < ((FMIndexDBG*)index)->numberOfSeparationCharacters; i++) { - // Find the separation character that corresponds to index i in alfabet + // Find the separation character that corresponds to index i in alphabet // sigma. unsigned char c = ((FMIndexDBG*)index)->sigma.i2c(i); // Create a copy of the current position to execute the character adding diff --git a/src/fmpos.h b/src/fmpos.h index 8705ebd..89c6712 100644 --- a/src/fmpos.h +++ b/src/fmpos.h @@ -270,8 +270,8 @@ class FMPosSFR : public FMPos { // Pointer to The node path in the graph that represents the current match std::shared_ptr> ptrToNodePath; - uint32_t finalNodeLeft; // the final node of the path on the left side - uint32_t finalNodeRight; // the final node of the path on the right side + uint32_t finalNodeLeft = 0; // the final node of the path on the left side + uint32_t finalNodeRight = 0; // the final node of the path on the right side uint32_t numberOfNodesLeft = 0; // the number of nodes on the left side uint32_t numberOfNodesRight = 0; // the number of nodes on the right side diff --git a/src/main.cpp b/src/main.cpp index 2f2d9a8..9a8faae 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -60,29 +60,6 @@ int editDistDP(string P, string O, int maxED) { return mat(m, n); } -void printMatches(vector 
matches, string text, bool printLine, - string duration, FMIndex& mapper, string name) { - - cout << endl; - - cout << name << ":\tduration: " << duration - << "µs\t nodes visited: " << mapper.getNodes() - << "\t matrix elements written: " << mapper.getMatrixElements() - << "\t startpositions reported: " << mapper.getTotalReported() - << " #matches: " << matches.size() << endl; - - for (auto match : matches) { - cout << "Found match at position " << match.getRange().getBegin() - << " with ED " << match.getDistance() << endl; - - cout << "\tCorresponding substring:\t" - << text.substr(match.getRange().getBegin(), - match.getRange().getEnd() - - match.getRange().getBegin()) - << endl; - } -} - string getFileExt(const string& s) { size_t i = s.rfind('.', s.length()); @@ -270,8 +247,8 @@ void doBench(vector>& reads, FMIndex& mapper, size_t allReportedMatches = 0; size_t totalUniqueMatches = 0; size_t mappedReads = 0; - size_t mappedReadsForward = 0; - size_t mappedReadsBackward = 0; + // size_t mappedReadsForward = 0; + // size_t mappedReadsBackward = 0; cout << "Benchmarking with " << strategy->getName() << " strategy for max distance " << ED << " with " @@ -305,7 +282,7 @@ void doBench(vector>& reads, FMIndex& mapper, totalMatrixElements += mapper.getMatrixElements(); allReportedMatches += mapper.getTotalReported(); totalUniqueMatches += matches.size(); - mappedReadsForward += !matches.empty(); + // mappedReadsForward += !matches.empty(); // do the same for the reverse complement vector matchesRevCompl = @@ -314,67 +291,15 @@ void doBench(vector>& reads, FMIndex& mapper, totalMatrixElements += mapper.getMatrixElements(); allReportedMatches += +mapper.getTotalReported(); totalUniqueMatches += matchesRevCompl.size(); - mappedReadsBackward += !matchesRevCompl.empty(); + // mappedReadsBackward += !matchesRevCompl.empty(); mappedReads += !(matchesRevCompl.empty() && matches.empty()); matchesPerRead.push_back(matches); matchesPerRead.push_back(matchesRevCompl); 
numberMatchesPerRead.push_back(matches.size() + matchesRevCompl.size()); - // correctness check, comment this out if you want to check - // For each reported match the reported edit distance is checked and - // compared to a recalculated value using a single banded matrix this - // is slow WARNING: this checks the EDIT DISTANCE, for it might be that - // the hamming distance is higher - /*for (auto match : matches) { - - string O = text.substr(match.getRange().getBegin(), - match.getRange().getEnd() - - match.getRange().getBegin()); - - int trueED = editDistDP(read, O, ED); - int foundED = match.getDistance(); - if (foundED != trueED) { - cout << i << "\n"; - cout << "Wrong ED!!" - << "\n"; - cout << "P: " << read << "\n"; - cout << "O: " << O << "\n"; - cout << "true ED " << trueED << ", found ED " << foundED << - "\n" - << match.getRange().getBegin() << "\n"; - } - }*/ - - // this block checks if at least one occurrence is found and if the - // identifier is a number and then checks if this position is found as - // a match (for checking correctness) if you want to check if the - // position is found as a match make sure that the identifier of the - // read is the position. Out comment this block for the check - /* bool originalFound = true; - try { - length_t pos = stoull(originalPos); - originalFound = false; - - for (auto match : matches) { - - if (match.getRange().getBegin() >= pos - (ED + 2) && - match.getRange().getBegin() <= pos + (ED + 2)) { - originalFound = true; - break; - } - } - } catch (const std::exception& e) { - // nothing to do, identifier is not the orignal position - } - - // check if at least one occurrence was found (for reads that were - // sampled from actual reference) Out-cooment this block if you want - // to do this. 
- if (matches.size() == 0 || (!originalFound)) { - cout << "Could not find occurrence for " << originalPos << endl; - }*/ } + auto finish = chrono::high_resolution_clock::now(); chrono::duration elapsed = finish - start; cout << "Progress: " << reads.size() << "/" << reads.size() << "\n"; @@ -399,11 +324,12 @@ void doBench(vector>& reads, FMIndex& mapper, << findMedian(numberMatchesPerRead, numberMatchesPerRead.size()) << endl; - writeToOutput(readsFile + "_output.txt", matchesPerRead, reads); + writeToOutput(readsFile + "_output.tsv", matchesPerRead, reads); } void showUsage() { - cout << "Usage: ./columba [options] basefilename readfile.[ext]\n\n"; + cout + << "Usage: ./columba [options] \n\n"; cout << " [options]\n"; cout << " -e --max-ed\t\tmaximum edit distance [default = 0]\n"; cout << " -s --sa-sparseness\tsuffix array sparseness factor " @@ -439,6 +365,68 @@ void showUsage() { cout << "\t.rev.brt: Prefix occurrence table of the " "reverse " "of T\n"; + cout << + + "This program aligns short, single end reads to a pan-genome in the\n" + "form of a linear concatenation. It reports the corresponding\n" + "coordinates in the original genomes.\n\n\n" + + "Usage: ./columba [options] \n\n" + + " Following input parameters are required:\n" + " base filename of the input index\n" + " the file containing the input reads to be\n" + " aligned (single end).\n\n" + + " [ext]\n" + " one of the following: fq, fastq, FASTA, fasta, fa\n\n\n" + + " [options]\n" + " -e/--max-ed maximum edit distance [default = 0]\n\n" + + " -s/--sa-sparseness suffix array sparseness factor [default = " + "16]\n\n" + + " -p/--partitioning Add flag to do uniform/static/dynamic\n" + " partitioning of the seeds for search schemes.\n" + " Dynamic partitioning cannot be used with\n" + " strain-free matching. [default = dynamic]\n\n" + + " -m/--metric Add flag to set distance metric (editnaive/\n" + " editopt/ hamming) [default = editopt]\n\n" + + " -ss/--search-scheme Choose the search scheme. 
Options:\n" + " * kuch1 Kucherov k + 1 [default]\n" + " * kuch2 Kucherov k + 2\n" + " * kianfar Optimal Kianfar scheme\n" + " * manbest Manual best improvement for " + "Kianfar\n" + " scheme (only for ed = 4)\n" + " * pigeon Pigeonhole scheme\n" + " * 01*0 01*0 search scheme\n" + " * naive naive backtracking\n" + " * custom custom search scheme, the next\n" + " parameter should be a path to the\n" + " folder containing this " + "searchscheme\n\n\n" + + " Following input files are required:\n" + " .compressed.txt: compressed version of the\n" + " input text T\n\n" + " .cct: character counts table\n\n" + " .sa.: sparse suffix array, with\n" + " suffix array sparseness\n" + " factor elements\n\n" + " .sa.bv.: bitvector indicating " + "which\n" + " elements of the suffix\n" + " array are stored.\n\n" + " .bwt: BWT of T\n\n" + " .rev.bwt: BWT of the reverse of T\n\n" + " .brt: Prefix occurrence table of " + "T\n\n" + " .rev.brt: Prefix occurrence table " + "of\n\n\n"; } int main(int argc, char* argv[]) { @@ -541,7 +529,7 @@ int main(int argc, char* argv[]) { else { cerr << "Unknown argument: " << arg << " is not an option" << endl; - return false; + return EXIT_FAILURE; } } diff --git a/src/mainDBG.cpp b/src/mainDBG.cpp index 17faa3f..d6348cd 100644 --- a/src/mainDBG.cpp +++ b/src/mainDBG.cpp @@ -29,99 +29,148 @@ * */ void showUsage() { - cout << "Usage: ./nexus [options] basefilename readfile.[ext]\n\n"; - cout << " [options]\n"; - cout << " -sfr --strain-free\tstrain-free matching\n"; - cout << " -e --max-ed\t\tmaximum edit distance [default = 0]\n"; - cout << " -s --sa-sparseness\tsuffix array sparseness factor " - "[default = " - "1]\n"; - cout << " -c --cp-sparseness\tsparseness factor that indicates " - "how many checkpoints must be stored to identify nodes. Use " - "\"none\" to use no checkpoints. Choose a value that was also used " - "during the building process. 
" - "[default = 128]\n"; - cout - << " -f --filter\t\tfiltering type that should be used to filter the " - "occurrences. This option is only valid in case of strain-free " - "matching. Options:\n\t" - << "linear\t\tlinear filtering is efficient but does not filter out " - "all redundant occurrences. Additionally, in some exceptional " - "cases, a non-optimal replacement occurrence can be chosen. This " - "is the default option.\n\t" - << "complete\tcomplete filtering leads to a set of occurrences with " - "no redundancy. This option is very slow however and thus not " - "recommended.\n"; - cout << " -p --partitioning\t\tAdd flag to do uniform/static/dynamic " - "partitioning. Dynamic partitioning cannot be used with " - "strain-free matching. [default = static]\n"; - cout << " -m --metric\t\tAdd flag to set distance metric " - "(editnaive/editopt/hamming) [default = editopt]\n"; - cout << " -ss --search-scheme\tChoose the search scheme\n options:\n\t" - << "kuch1\tKucherov k + 1\n\t" - << "kuch2\tKucherov k + 2\n\t" - << "kianfar\tOptimal Kianfar scheme\n\t" - << "manbest\tManual best improvement for Kianfar scheme (only for ed " - "= 4)\n\t" - << "pigeon\tPigeonhole scheme\n\t" - << "01*0\t01*0 search scheme\n\t" - << "naive\tnaive backtracking\n\t" - << "custom\tcustom search scheme, the next parameter should be a path " - "to the folder containing this search scheme\n\n"; - - cout << "[ext]\n" - << "\tone of the following: fq, fastq, FASTA, fasta, fa\n"; - - cout << "Following input files are required:\n"; - cout << "\t.txt: input text T\n"; - cout << "\t.cct: character counts table\n"; - cout << "\t.sa.[saSF]: sparse suffix array, with suffix " - "array sparseness factor [saSF] " - "elements\n"; - cout << "\t.sa.bv.[saSF]: bitvector indicating which " - "elements of the suffix array are stored.\n"; - cout << "\t.bwt: BWT of T\n"; - cout << "\t.rev.bwt: BWT of the reverse of T\n"; - cout << "\t.brt: Prefix occurrence table of T\n"; - cout << "\t.rev.brt: Prefix 
occurrence table of the " - "reverse " - "of T\n"; - cout << "\t.DBG: variable k and the compressed de " - "Bruijn graph.\n"; - cout << "\t.B.left: bitvector B_left for the compressed de " - "Bruijn graph.\n"; - cout << "\t.B.right.[cpSF]: bitvector B_right for the " - "compressed " - "de " - "Bruijn graph, with checkpoint sparseness factor [cpSF].\n"; - cout << "\t.B.right.full.[cpSF]: bitvector B_right_full for " - "the " - "compressed de Bruijn graph, with checkpoint sparseness factor " - "[cpSF].\n"; - cout << "\t.left.map: node identifier mapping corresponding " - "to B_left.\n"; - cout << "\t.right.map.[cpSF]: node identifier mapping " - "corresponding " - "to B_right, with checkpoint sparseness factor [cpSF].\n"; + cout << + + "This program aligns short, single end reads to a pan-genome de " + "Bruijn\n" + "graph. It reports the corresponding node paths as well as the\n" + "coordinates in the original genomes.\n\n\n" + + "Usage: ./nexus [options] \n\n" + + " Following input parameters are required:\n" + " base filename of the input index\n" + " the de Bruijn parameter of the index\n" + " the file containing the input reads to be\n" + " aligned (single end).\n\n" + + " [ext]\n" + " one of the following: fq, fastq, FASTA, fasta, fa\n\n\n" + + " [options]\n" + " -e/--max-ed maximum edit distance [default = 0]\n\n" + + " -s/--sa-sparseness suffix array sparseness factor [default = " + "16]\n\n" + + " -c/--cp-sparseness sparseness factor that indicates how many\n" + " checkpoints must be stored to identify nodes.\n" + " Use \"none\" to use no checkpoints. Choose a\n" + " value that was also used during the building\n" + " process. [default = 128]\n\n" + + " -p/--partitioning Add flag to do uniform/static/dynamic\n" + " partitioning of the seeds for search schemes.\n" + " Dynamic partitioning cannot be used with\n" + " strain-free matching. 
[default = dynamic]\n\n" + + " -m/--metric Add flag to set distance metric (editnaive/\n" + " editopt/ hamming) [default = editopt]\n\n" + + " -ss/--search-scheme Choose the search scheme. Options:\n" + " * kuch1 Kucherov k + 1 [default]\n" + " * kuch2 Kucherov k + 2\n" + " * kianfar Optimal Kianfar scheme\n" + " * manbest Manual best improvement for " + "Kianfar\n" + " scheme (only for ed = 4)\n" + " * pigeon Pigeonhole scheme\n" + " * 01*0 01*0 search scheme\n" + " * naive naive backtracking\n" + " * custom custom search scheme, the next\n" + " parameter should be a path to the\n" + " folder containing this " + "searchscheme\n\n" + + " -sfr/--strain-free strain-free matching: occurrences can be\n" + " identified as any path of connected nodes. In\n" + " other words, they do not have to occur exactly\n" + " in one of the input genomes of the pan-genome.\n" + " This is option is not activated by default and\n" + " is slower than the default implementation.\n\n" + + " -f/--filter filtering type that should be used to filter\n" + " the occurrences. This option is only valid in\n" + " case of strain-free matching. Options:\n" + " * linear: linear filtering is efficient but\n" + " does not filter out all redundant\n" + " occurrences. Additionally, in some\n" + " exceptional cases, a non-optimal " + "replacement\n" + " occurrence can be chosen. This is the\n" + " default option.\n" + " * complete: complete filtering leads to a set\n" + " of occurrences with no redundancy. 
This\n" + " option is very slow however and thus not\n" + " recommended.\n\n\n" + + " Following input files are required:\n" + " .compressed.txt: compressed version of the\n" + " input text T\n\n" + " .cct: character counts table\n\n" + " .sa.: sparse suffix array, with\n" + " suffix array sparseness\n" + " factor elements\n\n" + " .sa.bv.: bitvector indicating " + "which\n" + " elements of the suffix\n" + " array are stored.\n\n" + " .bwt: BWT of T\n\n" + " .rev.bwt: BWT of the reverse of T\n\n" + " .brt: Prefix occurrence table of " + "T\n\n" + " .rev.brt: Prefix occurrence table " + "of\n" + " the reverse of T\n\n" + " .DBG.k: the compressed de Bruijn\n" + " graph for the requested " + "de\n" + " Bruijn parameter\n\n" + " .B.right.k.cp: first bitvector of the\n" + " implicit representation " + "for\n" + " the requested de Bruijn\n" + " parameter, with " + "checkpoint\n" + " sparseness factor " + "\n\n" + " .B.left.k: second bitvector of the\n" + " implicit representation\n" + " for the requested de\n" + " Bruijn parameter\n\n" + " .right.map.k.cp: node identifier mapping\n" + " corresponding to the " + "first\n" + " bitvector, with " + "checkpoint\n" + " sparseness factor " + "\n\n" + " .left.map.k: node identifier mapping\n" + " corresponding to the\n" + " second bitvector\n\n\n"; } int main(int argc, char* argv[]) { - int requiredArguments = 2; // baseFile of files and file containing reads + int requiredArguments = 3; // baseFile of files, k and file containing reads + + if (argc == 2) { + string firstArg(argv[1]); + if (firstArg.find("help") != std::string::npos) { + showUsage(); + return EXIT_SUCCESS; + } + } if (argc < requiredArguments) { - cerr << "Insufficient number of arguments" << endl; + cerr << "Insufficient number of arguments.\n" << endl; showUsage(); return EXIT_FAILURE; } - if (argc == 2 && strcmp("help", argv[1]) == 0) { - showUsage(); - return EXIT_SUCCESS; - } cout << "Welcome to Nexus!\n"; - string saSparse = "1"; + string saSparse = "16"; 
string cpSparse = "128"; string maxED = "0"; string searchscheme = "kuch1"; @@ -130,7 +179,7 @@ int main(int argc, char* argv[]) { bool filteringIsChosen = false; bool filteringOptionComplete = false; - PartitionStrategy pStrat = STATIC; + PartitionStrategy pStrat = DYNAMIC; DistanceMetric metric = EDITOPTIMIZED; // process optional arguments @@ -145,9 +194,11 @@ int main(int argc, char* argv[]) { } else if (s == "dynamic") { pStrat = DYNAMIC; if (strainFree == true) { - throw runtime_error( - "Dynamic partitioning cannot be used with " - "strain-free matching."); + std::cout + << "Dynamic partitioning cannot be used with " + "strain-free matching, static is used instead." + << std::endl; + pStrat = STATIC; } } else if (s == "static") { pStrat = STATIC; @@ -247,14 +298,16 @@ int main(int argc, char* argv[]) { "combination with strain-free matching."); } if (pStrat == DYNAMIC) { - throw runtime_error("Dynamic partitioning cannot be used with " - "strain-free matching."); + std::cout << "Dynamic partitioning cannot be used with " + "strain-free matching, static is used instead." 
+ << std::endl; + pStrat = STATIC; } } else { cerr << "Unknown argument: " << arg << " is not an option" << endl; - return false; + return EXIT_FAILURE; } } @@ -293,23 +346,15 @@ int main(int argc, char* argv[]) { throw runtime_error("manbest only supports 4 allowed errors"); } - string baseFile = argv[argc - 2]; + string baseFile = argv[argc - 3]; + uint k = atoi(argv[argc - 2]); string readsFile = argv[argc - 1]; - cout << "Reading in reads from " << readsFile << endl; - vector> reads; - try { - reads = getReads(readsFile); - } catch (const exception& e) { - string er = e.what(); - er += " Did you provide a valid reads file?"; - throw runtime_error(er); - } cout << "Start creation of BWT approximate matcher on graphs" << endl; if (strainFree) { - FMIndexDBG bwt(baseFile, saSF, cpSF, strainFree, + FMIndexDBG bwt(baseFile, saSF, cpSF, k, strainFree, filteringOptionComplete); SearchStrategyDBG, FMPosSFR>* strategy; @@ -347,27 +392,12 @@ int main(int argc, char* argv[]) { " is not on option as search scheme"); } StrainFreeMapper mapper(strategy); - doBenchSFR(reads, bwt, strategy, readsFile, ed, cpSparse); + doBenchSFR(bwt, strategy, readsFile, ed, cpSparse); delete strategy; - // // try { - // // auto results = mapper.matchApproxSFR("ACGAATCACCAA", ed); - // auto results = mapper.matchApproxSFR( - // "AGGCCTGATAAGACGCGCTGGCGTCACATCAGGCAACGGCTGTCGGATGCAGCGTGAACGCCTTAT" - // "CCGACCTACTGTTCTACTCCTGCGTAGGCCTGAT", - // ed); - // bwt.visualizeSubgraphs(results, 3, "test"); - - // delete strategy; - - // // } catch (const std::exception& e) { - // // cerr << "Fatal error: " << e.what() << endl; - // // return EXIT_FAILURE; - // // } - } else { - FMIndexDBG bwt(baseFile, saSF, cpSF, strainFree); + FMIndexDBG bwt(baseFile, saSF, cpSF, k, strainFree); SearchStrategyDBG, FMPos>* strategy; if (searchscheme == "kuch1") { @@ -401,26 +431,8 @@ int main(int argc, char* argv[]) { throw runtime_error(searchscheme + " is not on option as search scheme"); } - doBenchSFI(reads, bwt, 
strategy, readsFile, ed, cpSparse); + doBenchSFI(bwt, strategy, readsFile, ed, cpSparse); delete strategy; - - // try { - // // auto results = strategy->matchApproxSFI("GAATCACCAA", ed); - // auto results = strategy->matchApproxSFI( - // "GGTGGATAGGGTGGATAGGGTGGATAGGGTGGTTAGGGTGGATAGGGTGGATAGGGTGGATA" - // "GGGTGGATAGGGTGGATAGGGTGGATAGGGTGGATAGGA", - // ed); - // bwt.visualizeSubgraphs(results, 3, "test"); - // // bwt.getText(); - // // std::vector path = {551, 73827}; - // // bwt.visualizeSubgraph(path, 3, "testgraph"); - - // delete strategy; - - // } catch (const std::exception& e) { - // cerr << "Fatal error: " << e.what() << endl; - // return EXIT_FAILURE; - // } } cout << "Bye...\n"; diff --git a/src/mainstats.cpp b/src/mainstats.cpp new file mode 100644 index 0000000..ea510af --- /dev/null +++ b/src/mainstats.cpp @@ -0,0 +1,213 @@ +/****************************************************************************** + * Nexus: Pan-genome compacted de Bruijn graphs with support for approximate * + * pattern matching using search schemes * + * * + * Copyright (C) 2022 - Lore Depuydt , * + * Luca Renders and * + * Jan Fostier * + * * + * This program is free software: you can redistribute it and/or modify * + * it under the terms of the GNU Affero General Public License as * + * published by the Free Software Foundation, either version 3 of the * + * License, or (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU Affero General Public License for more details. * + * * + * You should have received a copy of the GNU Affero General Public License * + * along with this program. If not, see . 
* + ******************************************************************************/ + +// #include "benchmarking.h" +// #include "searchstrategy.h" +#include "strainfreemapper.h" + +using namespace std; + +/** + * @brief Show the usage in terminal + * + */ +void showUsage() { + cout << + + "This program reports some statistics regarding the pan-genome graph\n" + "topology.\n\n\n" + + "Usage: ./nexusStats [options] \n\n" + + " Following input parameters are required:\n" + " base filename of the input index\n" + " the de Bruijn parameter of the index\n\n\n" + + " [options]\n" + + " -s/--sa-sparseness suffix array sparseness factor [default =\n" + " 256 to limit memory usage]\n\n" + + " -c/--cp-sparseness sparseness factor that indicates how many\n" + " checkpoints must be stored to identify nodes.\n" + " Use \"none\" to use no checkpoints. Choose a\n" + " value that was also used during the building\n" + " process. [default = 128]\n\n\n" + + " Following input files are required:\n" + " .compressed.txt: compressed version of the\n" + " input text T\n\n" + " .cct: character counts table\n\n" + " .sa.: sparse suffix array, with\n" + " suffix array sparseness\n" + " factor elements\n\n" + " .sa.bv.: bitvector indicating " + "which\n" + " elements of the suffix\n" + " array are stored.\n\n" + " .bwt: BWT of T\n\n" + " .rev.bwt: BWT of the reverse of T\n\n" + " .brt: Prefix occurrence table of " + "T\n\n" + " .rev.brt: Prefix occurrence table " + "of\n" + " the reverse of T\n\n" + " .DBG.k: the compressed de Bruijn\n" + " graph for the requested " + "de\n" + " Bruijn parameter\n\n" + " .B.right.k.cp: first bitvector of the\n" + " implicit representation " + "for\n" + " the requested de Bruijn\n" + " parameter, with " + "checkpoint\n" + " sparseness factor " + "\n\n" + " .B.left.k: second bitvector of the\n" + " implicit representation\n" + " for the requested de\n" + " Bruijn parameter\n\n" + " .right.map.k.cp: node identifier mapping\n" + " corresponding to the " + 
"first\n" + " bitvector, with " + "checkpoint\n" + " sparseness factor " + "\n\n" + " .left.map.k: node identifier mapping\n" + " corresponding to the\n" + " second bitvector\n\n\n"; +} + +int main(int argc, char* argv[]) { + + int requiredArguments = 2; // baseFile of files and k + + if (argc == 2) { + string firstArg(argv[1]); + if (firstArg.find("help") != std::string::npos) { + showUsage(); + return EXIT_SUCCESS; + } + } + + if (argc < requiredArguments) { + cerr << "Insufficient number of arguments.\n" << endl; + showUsage(); + return EXIT_FAILURE; + } + + string saSparse = "256"; + string cpSparse = "128"; + + // process optional arguments + for (int i = 1; i < argc - requiredArguments; i++) { + const string& arg = argv[i]; + + if (arg == "-s" || arg == "--sa-sparseness") { + if (i + 1 < argc) { + saSparse = argv[++i]; + + } else { + throw runtime_error(arg + " takes 1 argument as input"); + } + } else if (arg == "-c" || arg == "--cp-sparseness") { + if (i + 1 < argc) { + cpSparse = argv[++i]; + + } else { + throw runtime_error(arg + " takes 1 argument as input"); + } + } + } + + string baseFile = argv[argc - 2]; + uint k = atoi(argv[argc - 1]); + length_t saSF = stoi(saSparse); + if (saSF == 0 || saSF > 256 || (saSF & (saSF - 1)) != 0) { + cerr << saSF + << " is not allowed as sparse factor, should be in 2^[0, 8]" + << endl; + return EXIT_FAILURE; + } + length_t cpSF; + if (cpSparse == "none") { + cpSF = INT32_MAX; + } else { + cpSF = stoi(cpSparse); + double logcpSF = log2(cpSF); + double value; + if (modf(logcpSF, &value) != 0.0) { + cerr << "Checkpoint sparseness should be a power of 2." 
<< endl; + return EXIT_FAILURE; + } + } + + cout << "Welcome to Nexus Stats!\n"; + + FMIndexDBG bwt(baseFile, saSF, cpSF, k, false); + + uint32_t numberOfNodes = 0; + uint64_t numberOfEdges = 0; + uint64_t totalLength = 0; + uint64_t totalMultiplicity = 0; + vector lengths; + vector multiplicities; + + bwt.stats(numberOfNodes, numberOfEdges, totalLength, totalMultiplicity, + lengths, multiplicities); + + uint32_t medianLength; + + sort(lengths.begin(), lengths.end()); + if (lengths.size() % 2 == 0) { + medianLength = + (lengths[lengths.size() / 2 - 1] + lengths[lengths.size() / 2]) / 2; + } else { + medianLength = lengths[lengths.size() / 2]; + } + + uint32_t medianMultiplicity; + + sort(multiplicities.begin(), multiplicities.end()); + if (multiplicities.size() % 2 == 0) { + medianMultiplicity = (multiplicities[multiplicities.size() / 2 - 1] + + multiplicities[multiplicities.size() / 2]) / + 2; + } else { + medianMultiplicity = multiplicities[multiplicities.size() / 2]; + } + + cout << "Total no. graph nodes: " << numberOfNodes << "\n"; + cout << "Total no. 
graph edges: " << numberOfEdges << "\n"; + cout << "Total node multiplicity: " << totalMultiplicity << "\n"; + cout << "Average node multiplicity: " + << totalMultiplicity / (double)(numberOfNodes) << endl; + cout << "Median node multiplicity: " << medianMultiplicity << endl; + cout << "Total node length: " << totalLength << "\n"; + cout << "Average node length: " << totalLength / (double)(numberOfNodes) + << endl; + cout << "Median node length: " << medianLength << endl; + + cout << "Bye...\n"; +} diff --git a/src/mappingpair.h b/src/mappingpair.h index e0813c4..a0aec85 100644 --- a/src/mappingpair.h +++ b/src/mappingpair.h @@ -54,7 +54,7 @@ struct MappingPair { /** * @brief Writes the node to a file * - * @param ofs output strean + * @param ofs output stream * @return std::ofstream& - output stream */ std::ofstream& write(std::ofstream& ofs) { diff --git a/src/node.h b/src/node.h index 8f23805..ed4ee0a 100644 --- a/src/node.h +++ b/src/node.h @@ -53,9 +53,6 @@ struct Node { // left boundary of the suffix interval in the reverse SA of the k-length // suffix of the substring corresponding to the node length_t left_kmer_reverse; - // Mapping of regular and reverse ranks of edges passing through the node. - // Specifically, regular ranks are mapped to revers ranks. 
- BitvecN edgeMapping; // for visualization bool visited = false; @@ -65,7 +62,7 @@ struct Node { */ Node() : len(0), multiplicity(0), left_kmer_forward(0), right_kmer_forward(0), - right_kmer_reverse(0), left_kmer_reverse(0), edgeMapping(0) { + right_kmer_reverse(0), left_kmer_reverse(0) { } /** @@ -90,13 +87,13 @@ struct Node { left_kmer_forward(left_kmer_forward), right_kmer_forward(right_kmer_forward), right_kmer_reverse(right_kmer_reverse), - left_kmer_reverse(left_kmer_reverse), edgeMapping(multiplicity) { + left_kmer_reverse(left_kmer_reverse) { } /** * @brief Writes the node to a file * - * @param ofs output strean + * @param ofs output stream * @return std::ofstream& - output stream */ std::ofstream& write(std::ofstream& ofs) { @@ -106,7 +103,6 @@ struct Node { ofs.write((char*)&right_kmer_forward, sizeof(right_kmer_forward)); ofs.write((char*)&right_kmer_reverse, sizeof(right_kmer_reverse)); ofs.write((char*)&left_kmer_reverse, sizeof(left_kmer_reverse)); - edgeMapping.write(ofs); return ofs; } @@ -123,7 +119,6 @@ struct Node { ifs.read((char*)&right_kmer_forward, sizeof(right_kmer_forward)); ifs.read((char*)&right_kmer_reverse, sizeof(right_kmer_reverse)); ifs.read((char*)&left_kmer_reverse, sizeof(left_kmer_reverse)); - edgeMapping.read(ifs, multiplicity); visited = false; return ifs; } diff --git a/src/range.h b/src/range.h index 8d5a427..8b75b69 100644 --- a/src/range.h +++ b/src/range.h @@ -106,6 +106,17 @@ class Range { return o.getBegin() == begin && o.getEnd() == end; } + /** + * @brief Check if the range contains a certain index + * + * @param position index to be checked + * @return true if the index lies within the range + * @return false otherwise + */ + bool contains(const length_t& position) const { + return begin <= position && position < end; + } + /** * @brief Operator overloading. 
Outputs the range as [begin, end) to the * outputstream diff --git a/src/searchstrategy.cpp b/src/searchstrategy.cpp index ae72bda..987ab3d 100644 --- a/src/searchstrategy.cpp +++ b/src/searchstrategy.cpp @@ -152,7 +152,6 @@ bool SearchStrategy::coversPatterns( // and error pattern is simply a vector containing // the number of errors in each partition P for (const Pattern& pattern : patterns) { - int numberOfCovers = 0; // check if a search covers the pattern bool patternCovered = false; @@ -172,7 +171,6 @@ bool SearchStrategy::coversPatterns( // print the pattern and the search that covers it if (thisCover) { - numberOfCovers++; numCover[si]++; if (verbose) { @@ -498,15 +496,19 @@ SearchStrategyDBG::matchApproxSFI(const string& pattern, SearchStrategy::index.resetCounters(); if (maxED == 0) { - SearchStrategy::index.setDirection(BACKWARD); - auto result = - SearchStrategy::index.ExactMatchSFI(pattern); - std::map, std::vector> paths; - for (auto occ : result) { - occ.generateOutput(); - paths[occ.getNodePath()].push_back(occ); + SARangePair finalRange = + SearchStrategy::index.matchStringBidirectionally( + pattern); + if (!finalRange.empty()) { + positionClass finalPos = + positionClass(finalRange, pattern.size(), pattern.size()); + FMOcc finalOcc(finalPos, 0); + vector> occs = {finalOcc}; + return SearchStrategy::index + .mapOccurrencesInSAToOccurrencesInTextSFI(occs, maxED); + } else { + return {}; } - return paths; } // create the parts of the pattern vector parts; @@ -662,22 +664,22 @@ void CustomSearchStrategy::getSearchSchemeFromFolder( if (stream_static) { // a file with static partitioning positions exists getline(stream_static, line); - vector postionsAsString = {}; + vector positionsAsString = {}; stringstream ss(line); string token; while (ss >> token) { - postionsAsString.push_back(token); + positionsAsString.push_back(token); } - if ((int)postionsAsString.size() != calculateNumParts(i) - 1) { + if ((int)positionsAsString.size() != 
calculateNumParts(i) - 1) { throw runtime_error( "Not enough static positions provided in " + pathToFolder + to_string(i) + "/static_partitioning.txt\nExpected: " + to_string(calculateNumParts(i) - 1) + " parts\nProvided: " + - to_string(postionsAsString.size()) + " parts"); + to_string(positionsAsString.size()) + " parts"); } - for (auto str : postionsAsString) { + for (auto str : positionsAsString) { staticPositions[i - 1].push_back(stod(str)); } diff --git a/src/searchstrategy.h b/src/searchstrategy.h index c2f19cd..c5611d2 100644 --- a/src/searchstrategy.h +++ b/src/searchstrategy.h @@ -27,7 +27,7 @@ #define Pattern std::vector -// An enum for partion strategy +// An enum for partition strategy enum PartitionStrategy { UNIFORM, STATIC, DYNAMIC }; // An enum for which distance metric to use enum DistanceMetric { HAMMING, EDITNAIVE, EDITOPTIMIZED }; @@ -448,18 +448,18 @@ template class SearchStrategy { /** * Retrieves the text of the index (for debugging purposes) */ - std::string getText() const { + EncodedText getText() const { return index.getText(); } /** - * Mathces a pattern approximately using this strategy + * Matches a pattern approximately using this strategy * @param pattern, the pattern to match * @param maxED, the maximal allowed edit distance (or hamming * distance) */ - virtual std::vector matchApprox(const std::string& pattern, - length_t maxED) const; + std::vector matchApprox(const std::string& pattern, + length_t maxED) const; length_t getNodes() const { return index.getNodes(); @@ -503,7 +503,7 @@ class SearchStrategyDBG : virtual public SearchStrategy { strainFree(index.isStrainFree()) { } - virtual std::map, std::vector> + std::map, std::vector> matchApproxSFI(const std::string& pattern, length_t maxED) const; }; @@ -601,7 +601,7 @@ class CustomSearchStrategy : virtual public SearchStrategy { &CustomSearchStrategy::getWeightsDefault, &CustomSearchStrategy::getWeightsDefault, &CustomSearchStrategy::getWeightsDefault}; // pointer to the 
correct - // getWeigths() function, + // getWeights() function, // either default or custom /** @@ -761,11 +761,12 @@ class CustomSearchStrategy : virtual public SearchStrategy { getSearchSchemeFromFolder(pathToFolder, verbose); } - int calculateNumParts(unsigned int maxED) const { + virtual int calculateNumParts(unsigned int maxED) const override { assert(supportsMaxScore[maxED - 1]); return schemePerED[maxED - 1][0].getNumParts(); } - const std::vector& createSearches(unsigned int maxED) const { + virtual const std::vector& + createSearches(unsigned int maxED) const override { assert(supportsMaxScore[maxED - 1]); return schemePerED[maxED - 1]; } @@ -775,11 +776,12 @@ template class CustomSearchStrategyDBG : public SearchStrategyDBG, public CustomSearchStrategy { private: - int calculateNumParts(unsigned int maxED) const { + virtual int calculateNumParts(unsigned int maxED) const override { return CustomSearchStrategy::calculateNumParts(maxED); } - const std::vector& createSearches(unsigned int maxED) const { + virtual const std::vector& + createSearches(unsigned int maxED) const override { return CustomSearchStrategy::createSearches(maxED); } @@ -805,10 +807,11 @@ class NaiveBackTrackingStrategy : virtual public SearchStrategy { protected: std::vector searches = {}; - int calculateNumParts(unsigned int maxED) const { + virtual int calculateNumParts(unsigned int maxED) const override { return 1; } - const std::vector& createSearches(unsigned int maxED) const { + virtual const std::vector& + createSearches(unsigned int maxED) const override { return searches; } @@ -845,12 +848,13 @@ class NaiveBackTrackingStrategyDBG : public SearchStrategyDBG, public NaiveBackTrackingStrategy { private: - int calculateNumParts(unsigned int maxED) const { + virtual int calculateNumParts(unsigned int maxED) const override { return NaiveBackTrackingStrategy::calculateNumParts( maxED); } - const std::vector& createSearches(unsigned int maxED) const { + virtual const std::vector& + 
createSearches(unsigned int maxED) const override { return NaiveBackTrackingStrategy::createSearches( maxED); } @@ -861,16 +865,18 @@ class NaiveBackTrackingStrategyDBG SearchStrategy::index.resetCounters(); if (maxED == 0) { - - auto result = - SearchStrategy::index.ExactMatchSFI(pattern); - std::map, std::vector> - paths; - for (auto occ : result) { - occ.generateOutput(); - paths[occ.getNodePath()].emplace_back(occ); + SARangePair finalRange = SearchStrategy::index + .matchStringBidirectionally(pattern); + if (!finalRange.empty()) { + positionClass finalPos = + positionClass(finalRange, pattern.size(), pattern.size()); + FMOcc finalOcc(finalPos, 0); + std::vector> occs = {finalOcc}; + return SearchStrategy::index + .mapOccurrencesInSAToOccurrencesInTextSFI(occs, maxED); + } else { + return {}; } - return paths; } return SearchStrategy::index.approxMatchesNaiveSFI( pattern, maxED); @@ -925,10 +931,11 @@ class KucherovKplus1 : virtual public SearchStrategy { const std::vector> staticPositions = { {0.5}, {0.41, 0.7}, {0.25, 0.50, 0.75}, {0.27, 0.47, 0.62, 0.81}}; - int calculateNumParts(unsigned int maxED) const { + virtual int calculateNumParts(unsigned int maxED) const override { return maxED + 1; } - const std::vector& createSearches(unsigned int maxED) const { + virtual const std::vector& + createSearches(unsigned int maxED) const override { assert(maxED >= 1); assert(maxED <= 4); @@ -961,11 +968,12 @@ template class KucherovKplus1DBG : public SearchStrategyDBG, public KucherovKplus1 { private: - int calculateNumParts(unsigned int maxED) const { + virtual int calculateNumParts(unsigned int maxED) const override { return KucherovKplus1::calculateNumParts(maxED); } - const std::vector& createSearches(unsigned int maxED) const { + virtual const std::vector& + createSearches(unsigned int maxED) const override { return KucherovKplus1::createSearches(maxED); } @@ -1020,10 +1028,11 @@ class KucherovKplus2 : virtual public SearchStrategy { const std::vector> 
schemePerED = {ED1, ED2, ED3, ED4}; - int calculateNumParts(unsigned int maxED) const { + virtual int calculateNumParts(unsigned int maxED) const override { return maxED + 2; } - const std::vector& createSearches(unsigned int maxED) const { + virtual const std::vector& + createSearches(unsigned int maxED) const override { return schemePerED[maxED - 1]; } @@ -1068,11 +1077,12 @@ template class KucherovKplus2DBG : public SearchStrategyDBG, public KucherovKplus2 { private: - int calculateNumParts(unsigned int maxED) const { + virtual int calculateNumParts(unsigned int maxED) const override { return KucherovKplus2::calculateNumParts(maxED); } - const std::vector& createSearches(unsigned int maxED) const { + virtual const std::vector& + createSearches(unsigned int maxED) const override { return KucherovKplus2::createSearches(maxED); } @@ -1115,10 +1125,11 @@ class OptimalKianfar : virtual public SearchStrategy { const std::vector> staticPositions = { {0.5}, {0.30, 0.60}, {0.17, 0.69, 0.96}, {0.2, 0.5, 0.6, 0.8}}; - int calculateNumParts(unsigned int maxED) const { + virtual int calculateNumParts(unsigned int maxED) const override { return maxED + 1; } - const std::vector& createSearches(unsigned int maxED) const { + virtual const std::vector& + createSearches(unsigned int maxED) const override { if (maxED < 1 || maxED > 5) { throw std::invalid_argument("max ED should be between 1 and 4"); } @@ -1152,11 +1163,12 @@ template class OptimalKianfarDBG : public SearchStrategyDBG, public OptimalKianfar { private: - int calculateNumParts(unsigned int maxED) const { + virtual int calculateNumParts(unsigned int maxED) const override { return OptimalKianfar::calculateNumParts(maxED); } - const std::vector& createSearches(unsigned int maxED) const { + virtual const std::vector& + createSearches(unsigned int maxED) const override { return OptimalKianfar::createSearches(maxED); } @@ -1176,9 +1188,9 @@ class OptimalKianfarDBG : public SearchStrategyDBG, // A concrete derived class of 
SearchStrategy. The strategy here is founded // on this observation: if x errors are allowed and the pattern is divided // up in (x -// + 2) parts then every match with max x erros contains a seed consisting +// + 2) parts then every match with max x errors contains a seed consisting // of n parts, where the first and last part of the seed contain no errors -// and all parts inbetween these contain exacly one error. (2 <= n <= x + +// and all parts in between these contain exactly one error. (2 <= n <= x + // 2) template class O1StarSearchStrategy : virtual public SearchStrategy { @@ -1212,10 +1224,11 @@ class O1StarSearchStrategy : virtual public SearchStrategy { const std::vector> schemePerED = {ED1, ED2, ED3, ED4}; - int calculateNumParts(unsigned int maxED) const { + virtual int calculateNumParts(unsigned int maxED) const override { return maxED + 2; } - const std::vector& createSearches(unsigned int maxED) const { + virtual const std::vector& + createSearches(unsigned int maxED) const override { return schemePerED[maxED - 1]; } @@ -1258,11 +1271,12 @@ template class O1StarSearchStrategyDBG : public SearchStrategyDBG, public O1StarSearchStrategy { private: - int calculateNumParts(unsigned int maxED) const { + virtual int calculateNumParts(unsigned int maxED) const override { return O1StarSearchStrategy::calculateNumParts(maxED); } - const std::vector& createSearches(unsigned int maxED) const { + virtual const std::vector& + createSearches(unsigned int maxED) const override { return O1StarSearchStrategy::createSearches(maxED); } @@ -1290,10 +1304,11 @@ class ManBestStrategy : virtual public SearchStrategy { Search::makeSearch({5, 4, 3, 2, 1, 0}, {0, 0, 0, 0, 3, 3}, {0, 0, 4, 4, 4, 4})}; - int calculateNumParts(unsigned int maxED) const { + virtual int calculateNumParts(unsigned int maxED) const override { return maxED + 2; } - const std::vector& createSearches(unsigned int maxED) const { + virtual const std::vector& + createSearches(unsigned int maxED) const 
override { assert(maxED == 4); return ED4; } @@ -1334,11 +1349,12 @@ template class ManBestStrategyDBG : public SearchStrategyDBG, public ManBestStrategy { private: - int calculateNumParts(unsigned int maxED) const { + virtual int calculateNumParts(unsigned int maxED) const override { return ManBestStrategy::calculateNumParts(maxED); } - const std::vector& createSearches(unsigned int maxED) const { + virtual const std::vector& + createSearches(unsigned int maxED) const override { return ManBestStrategy::createSearches(maxED); } @@ -1389,10 +1405,11 @@ class PigeonHoleSearchStrategy Search::makeSearch({4, 3, 2, 1, 0}, {0, 0, 0, 0, 0}, {0, 4, 4, 4, 4})}; const std::vector> schemePerED = {ED1, ED2, ED3, ED4}; - int calculateNumParts(unsigned int maxED) const { + virtual int calculateNumParts(unsigned int maxED) const override { return maxED + 1; } - const std::vector& createSearches(unsigned int maxED) const { + virtual const std::vector& + createSearches(unsigned int maxED) const override { return schemePerED[maxED - 1]; } @@ -1409,12 +1426,13 @@ class PigeonHoleSearchStrategyDBG : public SearchStrategyDBG, public PigeonHoleSearchStrategy { private: - int calculateNumParts(unsigned int maxED) const { + virtual int calculateNumParts(unsigned int maxED) const override { return PigeonHoleSearchStrategy::calculateNumParts( maxED); } - const std::vector& createSearches(unsigned int maxED) const { + virtual const std::vector& + createSearches(unsigned int maxED) const override { return PigeonHoleSearchStrategy::createSearches( maxED); } diff --git a/src/strainfreemapper.cpp b/src/strainfreemapper.cpp index b77ffb2..5abc530 100644 --- a/src/strainfreemapper.cpp +++ b/src/strainfreemapper.cpp @@ -42,6 +42,11 @@ StrainFreeMapper::matchApproxSFR(const std::string& pattern, std::vector result = {}; if (pos.isValid()) { pos.setNodePath(nodePathRight); + if (pos.getNodePath().empty()) { + vector nodePath; + strategy->index.findNodeUnderK(pos, 0, nodePath); + 
pos.setNodePath(nodePath); + } FMOccSFR occ(pos, 0); result.emplace_back(occ); } diff --git a/src/suffixarray.h b/src/suffixarray.h index 81ee16b..f2a36de 100644 --- a/src/suffixarray.h +++ b/src/suffixarray.h @@ -54,12 +54,27 @@ class SparseSuffixArray { return sparseSA[bitvector.rank(i)]; } - SparseSuffixArray(const std::vector& sa, - const length_t sparseNessFactor) + SparseSuffixArray(const int64_t* sa, const length_t sparseNessFactor, + const length_t textLength) : sparseNessFactor(sparseNessFactor) { - bitvector = Bitvec(sa.size()); - sparseSA.reserve(sa.size() / sparseNessFactor); - for (length_t i = 0; i < sa.size(); i++) { + bitvector = Bitvec(textLength); + sparseSA.reserve(textLength / sparseNessFactor); + for (length_t i = 0; i < textLength; i++) { + const auto& el = sa[i]; + if (el % sparseNessFactor == 0) { + sparseSA.emplace_back(el); + bitvector[i] = true; + } + } + bitvector.index(); + } + + SparseSuffixArray(const length_t* sa, const length_t sparseNessFactor, + const length_t textLength) + : sparseNessFactor(sparseNessFactor) { + bitvector = Bitvec(textLength); + sparseSA.reserve(textLength / sparseNessFactor); + for (length_t i = 0; i < textLength; i++) { const auto& el = sa[i]; if (el % sparseNessFactor == 0) { sparseSA.emplace_back(el); @@ -110,6 +125,14 @@ class SparseSuffixArray { sparseSA.size() * sizeof(length_t)); } } + + void clear() { + sparseNessFactor = 0; + bitvector.clear(); + sparseSA.clear(); + sparseSA.resize(0); + sparseSA.shrink_to_fit(); + } }; #endif \ No newline at end of file diff --git a/src/textoccurrence.h b/src/textoccurrence.h index bd0cf62..4845088 100644 --- a/src/textoccurrence.h +++ b/src/textoccurrence.h @@ -23,6 +23,7 @@ #pragma once #include "range.h" +#include // ============================================================================ // CLASS TextOccurrence @@ -50,7 +51,7 @@ class TextOccurrence { /** * @brief Generates the output of this occurrence, for now in format: * startposition\twidth\tdistance, 
where startposition is the beginning of - * the textoccurrence, width is the length of this occurrence, distance is + * the text occurrence, width is the length of this occurrence, distance is * the (edit or hamming) distance to the mapped read. * */ @@ -144,6 +145,9 @@ class TextOccurrenceSFI : public TextOccurrence { std::vector nodepath; // The strain in the pan-genome to which this TextOccurrence belongs int strain; + // The distance from the left end of the start node to the start of the + // occurrence + uint32_t distanceFromLeftEnd; public: /** @@ -153,10 +157,13 @@ class TextOccurrenceSFI : public TextOccurrence { * @param distance the distance to this range (edit or hamming) * @param nodepath the node path in the graph * @param strain the strain in the pan-genome + * @param distanceFromLeftEnd The distance from the left end of the start + * node to the start of the occurrence */ TextOccurrenceSFI(Range range, int distance, std::vector nodepath, - int strain) - : TextOccurrence(range, distance), nodepath(nodepath), strain(strain) { + int strain, uint32_t distanceFromLeftEnd) + : TextOccurrence(range, distance), nodepath(nodepath), strain(strain), + distanceFromLeftEnd(distanceFromLeftEnd) { } /** @@ -176,4 +183,15 @@ class TextOccurrenceSFI : public TextOccurrence { const int& getStrain() const { return strain; } + + /** + * @brief Get the distance from the left end of the first node in the + * pan-genome + * + * @return const uint32_t& - the distance from the left end of the first + * node in the pan-genome + */ + const uint32_t& getDistanceFromLeftEnd() const { + return distanceFromLeftEnd; + } }; \ No newline at end of file diff --git a/src/tkmer.h b/src/tkmer.h index 83c8d76..fbc57a2 100644 --- a/src/tkmer.h +++ b/src/tkmer.h @@ -284,7 +284,7 @@ template class TKmer { /** * Write a kmer to file - * @param ofs Openen output file stream + * @param ofs Opened output file stream */ void write(std::ofstream& ofs) const { ofs.write((char*)buf, kMSB + 1); @@ 
-292,7 +292,7 @@ template class TKmer { /** * Write a kmer to file - * @param ofs Openen output file stream + * @param ofs Opened output file stream */ void writeBytes() const { for (int i = 0; i < numBytes; i++) diff --git a/src/visualizePath.cpp b/src/visualizePath.cpp index 7bb1b25..af8f264 100644 --- a/src/visualizePath.cpp +++ b/src/visualizePath.cpp @@ -31,88 +31,126 @@ * */ void showUsage() { - cout << "Usage: ./visualizePath [options] basefilename path\n\n"; - cout << " [path] should be a comma-separated list of node identifiers\n\n"; - cout << " [Visualization options]\n"; - cout << " -d --visualization-depth\t\tDepth of the visualized " - "neighborhood around the paths of interest [default = 3]\n"; - cout << " -o --output-files\t\tPrefix of the output files that will be " - "created during the visualization process [default = " - "basefilename]\n"; - cout << " -s --sa-sparseness\tsuffix array sparseness factor " - "[default = " - "1]\n"; - cout << " -c --cp-sparseness\tsparseness factor that indicates " - "how many checkpoints must be stored to identify nodes. Use " - "\"none\" to use no checkpoints. Choose a value that was also used " - "during the building process." 
- "[default = 128]\n\n"; - - cout << "Following input files are required:\n"; - cout << "\t.txt: input text T\n"; - cout << "\t.cct: character counts table\n"; - cout << "\t.sa.[saSF]: sparse suffix array, with suffix " - "array sparseness factor [saSF] " - "elements\n"; - cout << "\t.sa.bv.[saSF]: bitvector indicating which " - "elements of the suffix array are stored.\n"; - cout << "\t.bwt: BWT of T\n"; - cout << "\t.rev.bwt: BWT of the reverse of T\n"; - cout << "\t.brt: Prefix occurrence table of T\n"; - cout << "\t.rev.brt: Prefix occurrence table of the " - "reverse " - "of T\n"; - cout << "\t.DBG: variable k and the compressed de " - "Bruijn graph.\n"; - cout << "\t.B.left: bitvector B_left for the compressed de " - "Bruijn graph.\n"; - cout << "\t.B.right.[cpSF]: bitvector B_right for the " - "compressed " - "de " - "Bruijn graph, with checkpoint sparseness factor [cpSF].\n"; - cout << "\t.B.right.full.[cpSF]: bitvector B_right_full for " - "the " - "compressed de Bruijn graph, with checkpoint sparseness factor " - "[cpSF].\n"; - cout << "\t.left.map: node identifier mapping corresponding " - "to B_left.\n"; - cout << "\t.right.map.[cpSF]: node identifier mapping " - "corresponding " - "to B_right, with checkpoint sparseness factor [cpSF].\n"; + cout << + + "This program visualizes a node, a node path or a set of nodes of\n" + "interest.\n\n\n" + + "Usage: ./visualizePath [options] \n\n" + + " Following input parameters are required:\n" + " base filename of the input index\n" + " the de Bruijn parameter of the index\n" + " a comma-separated list of node identifiers\n" + " (e.g., 1,9,20)\n\n\n" + + " [options]\n" + " -e/--max-ed maximum edit distance [default = 0]\n\n" + + " -s/--sa-sparseness suffix array sparseness factor [default = " + "16]\n\n" + + " -c/--cp-sparseness sparseness factor that indicates how many\n" + " checkpoints must be stored to identify nodes.\n" + " Use \"none\" to use no checkpoints. 
Choose a\n" + " value that was also used during the building\n" + " process. [default = 128]\n\n" + " -d/--depth Depth of the visualized neighborhood around " + "the\n" + " paths of interest [default = 3]\n\n" + " -b/--bundle-edges Bundle edges stemming from different strains\n" + " together. Recommended when many strains are\n" + " present [default = false]\n\n" + " -o/--output-files Prefix of the output files that will be " + "created\n" + " during the visualization process [default =\n" + " basefilename]\n\n\n" + + " Following input files are required:\n" + " .compressed.txt: compressed version of the\n" + " input text T\n\n" + " .cct: character counts table\n\n" + " .sa.: sparse suffix array, with\n" + " suffix array sparseness\n" + " factor elements\n\n" + " .sa.bv.: bitvector indicating " + "which\n" + " elements of the suffix\n" + " array are stored.\n\n" + " .bwt: BWT of T\n\n" + " .rev.bwt: BWT of the reverse of T\n\n" + " .brt: Prefix occurrence table of " + "T\n\n" + " .rev.brt: Prefix occurrence table " + "of\n" + " the reverse of T\n\n" + " .DBG.k: the compressed de Bruijn\n" + " graph for the requested " + "de\n" + " Bruijn parameter\n\n" + " .B.right.k.cp: first bitvector of the\n" + " implicit representation " + "for\n" + " the requested de Bruijn\n" + " parameter, with " + "checkpoint\n" + " sparseness factor " + "\n\n" + " .B.left.k: second bitvector of the\n" + " implicit representation\n" + " for the requested de\n" + " Bruijn parameter\n\n" + " .right.map.k.cp: node identifier mapping\n" + " corresponding to the " + "first\n" + " bitvector, with " + "checkpoint\n" + " sparseness factor " + "\n\n" + " .left.map.k: node identifier mapping\n" + " corresponding to the\n" + " second bitvector\n\n\n"; } int main(int argc, char* argv[]) { - int requiredArguments = 2; // baseFile of files and file containing reads + int requiredArguments = 3; // baseFile of files, k and the node path + + if (argc == 2) { + string firstArg(argv[1]); + if 
(firstArg.find("help") != std::string::npos) { + showUsage(); + return EXIT_SUCCESS; + } + } if (argc < requiredArguments) { - cerr << "Insufficient number of arguments" << endl; + cerr << "Insufficient number of arguments.\n" << endl; showUsage(); return EXIT_FAILURE; } - if (argc == 2 && strcmp("help", argv[1]) == 0) { - showUsage(); - return EXIT_SUCCESS; - } cout << "Welcome to Nexus!\n"; string visDepthString = "3"; string outputFile = ""; string cpSparse = "128"; - string saSparse = "1"; + string saSparse = "16"; + bool separateEdges = true; // process optional arguments for (int i = 1; i < argc - requiredArguments; i++) { const string& arg = argv[i]; - if (arg == "-d" || arg == "--visualization-depth") { + if (arg == "-d" || arg == "--depth") { if (i + 1 < argc) { visDepthString = argv[++i]; } else { throw runtime_error(arg + " takes 1 argument as input"); } + } else if (arg == "-b" || arg == "--bundle-edges") { + separateEdges = false; } else if (arg == "-o" || arg == "--output-files") { if (i + 1 < argc) { outputFile = argv[++i]; @@ -136,7 +174,7 @@ int main(int argc, char* argv[]) { } } else { cerr << "Unknown argument: " << arg << " is not an option" << endl; - return false; + return EXIT_FAILURE; } } length_t visDepth = stoi(visDepthString); @@ -160,7 +198,8 @@ int main(int argc, char* argv[]) { return EXIT_FAILURE; } - string baseFile = argv[argc - 2]; + string baseFile = argv[argc - 3]; + uint k = atoi(argv[argc - 2]); string pathString = argv[argc - 1]; if (outputFile == "") { @@ -169,7 +208,7 @@ int main(int argc, char* argv[]) { cout << "Start creation of BWT approximate matcher on graphs" << endl; - FMIndexDBG bwt(baseFile, saSF, cpSF); + FMIndexDBG bwt(baseFile, saSF, cpSF, k); std::vector path; @@ -182,12 +221,13 @@ int main(int argc, char* argv[]) { ss.ignore(); } } catch (const std::exception& e) { - std::cerr << "Something went wrong whilst parsing the path." << '\n'; + std::cerr << "Something went wrong whilst parsing the node path." 
+ << '\n'; } bwt.getText(); - bwt.visualizeSubgraph(path, visDepth, outputFile); + bwt.visualizeSubgraph(path, visDepth, outputFile, separateEdges); cout << "Bye...\n"; } diff --git a/src/visualizeRead.cpp b/src/visualizeRead.cpp index 5f3457c..0eb229f 100644 --- a/src/visualizeRead.cpp +++ b/src/visualizeRead.cpp @@ -32,103 +32,155 @@ vector schemes = {"kuch1", "kuch2", "kianfar", "manbest", * */ void showUsage() { - cout << "Usage: ./visualizeRead [options] basefilename read\n\n"; - cout << " [Pattern matching options]\n"; - cout << " -sfr --strain-free\tstrain-free matching\n"; - cout << " -e --max-ed\t\tmaximum edit distance [default = 0]\n"; - cout << " -s --sa-sparseness\tsuffix array sparseness factor " - "[default = " - "1]\n"; - cout << " -c --cp-sparseness\tsparseness factor that indicates " - "how many checkpoints must be stored to identify nodes. Use " - "\"none\" to use no checkpoints. Choose a value that was also used " - "during the building process. " - "[default = 128]\n"; - cout - << " -f --filter\t\tfiltering type that should be used to filter the " - "occurrences. This option is only valid in case of strain-free " - "matching. Options:\n\t" - << "linear\t\tlinear filtering is efficient but does not filter out " - "all redundant occurrences. Additionally, in some exceptional " - "cases, a non-optimal replacement occurrence can be chosen. This " - "is the default option.\n\t" - << "complete\tcomplete filtering leads to a set of occurrences with " - "no redundancy. This option is very slow however and thus not " - "recommended.\n"; - cout << " -p --partitioning\t\tAdd flag to do uniform/static/dynamic " - "partitioning. Dynamic partitioning cannot be used with " - "strain-free matching. 
[default = static]\n"; - cout << " -m --metric\t\tAdd flag to set distance metric " - "(editnaive/editopt/hamming) [default = editopt]\n"; - cout << " -ss --search-scheme\tChoose the search scheme\n options:\n\t" - << "kuch1\tKucherov k + 1\n\t" - << "kuch2\tKucherov k + 2\n\t" - << "kianfar\tOptimal Kianfar scheme\n\t" - << "manbest\tManual best improvement for Kianfar scheme (only for ed " - "= 4)\n\t" - << "pigeon\tPigeonhole scheme\n\t" - << "01*0\t01*0 search scheme\n\t" - << "naive\tnaive backtracking\n\t" - << "custom\tcustom search scheme, the next parameter should be a path " - "to the folder containing this search scheme\n\n"; - - cout << " [Visualization options]\n"; - cout << " -d --visualization-depth\t\tDepth of the visualized " - "neighborhood around the paths of interest [default = 3]\n"; - cout << " -o --output-files\t\tPrefix of the output files that will be " - "created during the visualization process [default = " - "basefilename]\n\n"; - - cout << "Following input files are required:\n"; - cout << "\t.txt: input text T\n"; - cout << "\t.cct: character counts table\n"; - cout << "\t.sa.[saSF]: sparse suffix array, with suffix " - "array sparseness factor [saSF] " - "elements\n"; - cout << "\t.sa.bv.[saSF]: bitvector indicating which " - "elements of the suffix array are stored.\n"; - cout << "\t.bwt: BWT of T\n"; - cout << "\t.rev.bwt: BWT of the reverse of T\n"; - cout << "\t.brt: Prefix occurrence table of T\n"; - cout << "\t.rev.brt: Prefix occurrence table of the " - "reverse " - "of T\n"; - cout << "\t.DBG: variable k and the compressed de " - "Bruijn graph.\n"; - cout << "\t.B.left: bitvector B_left for the compressed de " - "Bruijn graph.\n"; - cout << "\t.B.right.[cpSF]: bitvector B_right for the " - "compressed " - "de " - "Bruijn graph, with checkpoint sparseness factor [cpSF].\n"; - cout << "\t.B.right.full.[cpSF]: bitvector B_right_full for " - "the " - "compressed de Bruijn graph, with checkpoint sparseness factor " - "[cpSF].\n"; - 
cout << "\t.left.map: node identifier mapping corresponding " - "to B_left.\n"; - cout << "\t.right.map.[cpSF]: node identifier mapping " - "corresponding " - "to B_right, with checkpoint sparseness factor [cpSF].\n"; + cout << + + "This program aligns a short, single end read to a pan-genome de\n" + "Bruijn graph. It visualizes the corresponding node paths with their\n" + "surroundings in the graph.\n\n\n" + + "Usage: ./visualizeRead [options] \n\n" + + " Following input parameters are required:\n" + " base filename of the input index\n" + " the de Bruijn parameter of the index\n" + " the read that must be aligned and " + "visualized.\n\n\n" + + " [options]\n" + " -e/--max-ed maximum edit distance [default = 0]\n\n" + + " -s/--sa-sparseness suffix array sparseness factor [default = " + "16]\n\n" + + " -c/--cp-sparseness sparseness factor that indicates how many\n" + " checkpoints must be stored to identify nodes.\n" + " Use \"none\" to use no checkpoints. Choose a\n" + " value that was also used during the building\n" + " process. [default = 128]\n\n" + + " -p/--partitioning Add flag to do uniform/static/dynamic\n" + " partitioning of the seeds for search schemes.\n" + " Dynamic partitioning cannot be used with\n" + " strain-free matching. [default = static]\n\n" + + " -m/--metric Add flag to set distance metric (editnaive/\n" + " editopt/ hamming) [default = editopt]\n\n" + + " -ss/--search-scheme Choose the search scheme.
Options:\n" + " * kuch1 Kucherov k + 1 [default]\n" + " * kuch2 Kucherov k + 2\n" + " * kianfar Optimal Kianfar scheme\n" + " * manbest Manual best improvement for " + "Kianfar\n" + " scheme (only for ed = 4)\n" + " * pigeon Pigeonhole scheme\n" + " * 01*0 01*0 search scheme\n" + " * naive naive backtracking\n" + " * custom custom search scheme, the next\n" + " parameter should be a path to the\n" + " folder containing this " + "search scheme\n\n" + + " -sfr/--strain-free strain-free matching: occurrences can be\n" + " identified as any path of connected nodes. In\n" + " other words, they do not have to occur exactly\n" + " in one of the input genomes of the pan-genome.\n" + " This option is not activated by default and\n" + " is slower than the default implementation.\n\n" + + " -f/--filter filtering type that should be used to filter\n" + " the occurrences. This option is only valid in\n" + " case of strain-free matching. Options:\n" + " * linear: linear filtering is efficient but\n" + " does not filter out all redundant\n" + " occurrences. Additionally, in some\n" + " exceptional cases, a non-optimal " + "replacement\n" + " occurrence can be chosen. This is the\n" + " default option.\n" + " * complete: complete filtering leads to a set\n" + " of occurrences with no redundancy. This\n" + " option is very slow however and thus not\n" + " recommended.\n\n" + + " -d/--depth Depth of the visualized neighborhood around " + "the\n" + " paths of interest [default = 3]\n\n" + " -b/--bundle-edges Bundle edges stemming from different strains\n" + " together.
Recommended when many strains are\n" + " present [default = false]\n\n" + " -o/--output-files Prefix of the output files that will be " + "created\n" + " during the visualization process [default =\n" + " basefilename]\n\n\n" + + " Following input files are required:\n" + " .compressed.txt: compressed version of the\n" + " input text T\n\n" + " .cct: character counts table\n\n" + " .sa.: sparse suffix array, with\n" + " suffix array sparseness\n" + " factor elements\n\n" + " .sa.bv.: bitvector indicating " + "which\n" + " elements of the suffix\n" + " array are stored.\n\n" + " .bwt: BWT of T\n\n" + " .rev.bwt: BWT of the reverse of T\n\n" + " .brt: Prefix occurrence table of " + "T\n\n" + " .rev.brt: Prefix occurrence table " + "of\n" + " the reverse of T\n\n" + " .DBG.k: the compressed de Bruijn\n" + " graph for the requested " + "de\n" + " Bruijn parameter\n\n" + " .B.right.k.cp: first bitvector of the\n" + " implicit representation " + "for\n" + " the requested de Bruijn\n" + " parameter, with " + "checkpoint\n" + " sparseness factor " + "\n\n" + " .B.left.k: second bitvector of the\n" + " implicit representation\n" + " for the requested de\n" + " Bruijn parameter\n\n" + " .right.map.k.cp: node identifier mapping\n" + " corresponding to the " + "first\n" + " bitvector, with " + "checkpoint\n" + " sparseness factor " + "\n\n" + " .left.map.k: node identifier mapping\n" + " corresponding to the\n" + " second bitvector\n\n\n"; } int main(int argc, char* argv[]) { - int requiredArguments = 2; // baseFile of files and file containing reads + int requiredArguments = 3; // base filename of the index, k and the read + + if (argc == 2) { + string firstArg(argv[1]); + if (firstArg.find("help") != std::string::npos) { + showUsage(); + return EXIT_SUCCESS; + } + } if (argc < requiredArguments) { - cerr << "Insufficient number of arguments" << endl; + cerr << "Insufficient number of arguments.\n" << endl; showUsage(); return EXIT_FAILURE; } - if (argc == 2 &&
strcmp("help", argv[1]) == 0) { - showUsage(); - return EXIT_SUCCESS; - } cout << "Welcome to Nexus!\n"; - string saSparse = "1"; + string saSparse = "16"; string cpSparse = "128"; string maxED = "0"; string visDepthString = "3"; @@ -138,6 +190,7 @@ int main(int argc, char* argv[]) { bool strainFree = false; bool filteringIsChosen = false; bool filteringOptionComplete = false; + bool separateEdges = true; PartitionStrategy pStrat = STATIC; DistanceMetric metric = EDITOPTIMIZED; @@ -193,13 +246,15 @@ int main(int argc, char* argv[]) { } else { throw runtime_error(arg + " takes 1 argument as input"); } - } else if (arg == "-d" || arg == "--visualization-depth") { + } else if (arg == "-d" || arg == "--depth") { if (i + 1 < argc) { visDepthString = argv[++i]; } else { throw runtime_error(arg + " takes 1 argument as input"); } + } else if (arg == "-b" || arg == "--bundle-edges") { + separateEdges = false; } else if (arg == "-o" || arg == "--output-files") { if (i + 1 < argc) { outputFile = argv[++i]; @@ -277,7 +332,7 @@ int main(int argc, char* argv[]) { else { cerr << "Unknown argument: " << arg << " is not an option" << endl; - return false; + return EXIT_FAILURE; } } @@ -317,7 +372,8 @@ int main(int argc, char* argv[]) { throw runtime_error("manbest only supports 4 allowed errors"); } - string baseFile = argv[argc - 2]; + string baseFile = argv[argc - 3]; + uint k = atoi(argv[argc - 2]); string read = argv[argc - 1]; if (outputFile == "") { @@ -328,7 +384,7 @@ int main(int argc, char* argv[]) { if (strainFree) { - FMIndexDBG bwt(baseFile, saSF, cpSF, strainFree, + FMIndexDBG bwt(baseFile, saSF, cpSF, k, strainFree, filteringOptionComplete); SearchStrategyDBG, FMPosSFR>* strategy; @@ -367,13 +423,13 @@ int main(int argc, char* argv[]) { } StrainFreeMapper mapper(strategy); auto results = mapper.matchApproxSFR(read, ed); - bwt.visualizeSubgraphs(results, visDepth, outputFile); + bwt.visualizeSubgraphs(results, visDepth, outputFile, separateEdges); delete strategy; } 
else { - FMIndexDBG bwt(baseFile, saSF, cpSF, strainFree); + FMIndexDBG bwt(baseFile, saSF, cpSF, k, strainFree); SearchStrategyDBG, FMPos>* strategy; if (searchscheme == "kuch1") { @@ -409,7 +465,7 @@ int main(int argc, char* argv[]) { } auto results = strategy->matchApproxSFI(read, ed); - bwt.visualizeSubgraphs(results, visDepth, outputFile); + bwt.visualizeSubgraphs(results, visDepth, outputFile, separateEdges); delete strategy; } diff --git a/sux/bits/Rank.hpp b/sux/bits/Rank.hpp index 8accd2c..8e22ed8 100644 --- a/sux/bits/Rank.hpp +++ b/sux/bits/Rank.hpp @@ -31,6 +31,7 @@ #include #include +#include namespace sux {