From ee09fafab6f67dbc1df0dc78a7897b16f57774aa Mon Sep 17 00:00:00 2001 From: Alexander Klassmann Date: Sun, 1 Nov 2020 09:00:02 +0000 Subject: [PATCH] version 3.2.0 --- DESCRIPTION | 19 +- MD5 | 103 +++--- NAMESPACE | 2 + NEWS.md | 13 +- R/as.newick.R | 14 +- R/calc_candidate_regions.R | 15 +- R/calc_ehh.R | 5 +- R/calc_ehhs.R | 5 +- R/calc_pairwise_haplen.R | 36 ++- R/calc_sfs_tests.R | 155 +++++++++ R/data2haplohh.R | 31 +- R/haplohh2sweepfinder.R | 65 ++++ R/make.example.files.R | 15 + R/manhattanplot.R | 33 +- R/plot.haplohh.R | 53 ++- R/rehh.R | 4 + R/scan_hh.R | 24 +- R/scan_hh_full.R | 89 +++-- R/subset.haplohh.R | 40 +++ build/vignette.rds | Bin 227 -> 239 bytes inst/doc/examples.R | 50 ++- inst/doc/examples.Rmd | 70 +++- inst/doc/examples.html | 120 ++++--- inst/doc/rehh.R | 8 +- inst/doc/rehh.Rmd | 183 ++++++----- inst/doc/rehh.html | 231 +++++++------ inst/extdata/example_neutral.vcf | 17 + inst/extdata/example_sweep.vcf | 17 + .../example_sweep_with_recombination.vcf | 17 + man/calc_ehh.Rd | 5 +- man/calc_ehhs.Rd | 5 +- man/calc_pairwise_haplen.Rd | 21 +- man/calc_sfs_tests.Rd | 94 ++++++ man/data2haplohh.Rd | 2 +- man/haplohh2sweepfinder.Rd | 58 ++++ man/make.example.files.Rd | 9 + man/manhattanplot.Rd | 6 + man/plot.haplohh.Rd | 14 +- man/rehh-package.Rd | 15 +- man/scan_hh.Rd | 11 +- man/scan_hh_full.Rd | 71 ++-- man/subset.haplohh.Rd | 4 + src/CALL_PAIRWISE_HAPLEN.c | 9 +- src/{CALL_SCAN_HH2.c => CALL_SCAN_HH_FULL.c} | 7 +- src/CALL_SFS_TESTS.c | 33 ++ src/calc_pairwise_haplen.c | 31 +- src/calc_pairwise_haplen.h | 5 +- src/calc_sfs_tests.c | 180 +++++++++++ src/calc_sfs_tests.h | 6 + src/init.c | 11 +- src/sfs_moments.c | 303 ++++++++++++++++++ src/sfs_moments.h | 13 + tests/testthat/test_data2haplohh.R | 40 ++- tests/testthat/test_furcation.R | 16 + tests/testthat/test_scan_full.R | 49 ++- tests/testthat/test_sfs_tests.R | 58 ++++ vignettes/examples.Rmd | 70 +++- vignettes/rehh.Rmd | 183 ++++++----- vignettes/vignette.bib | 48 ++- 59 files 
changed, 2242 insertions(+), 569 deletions(-) create mode 100644 R/calc_sfs_tests.R create mode 100644 R/haplohh2sweepfinder.R create mode 100644 inst/extdata/example_neutral.vcf create mode 100644 inst/extdata/example_sweep.vcf create mode 100644 inst/extdata/example_sweep_with_recombination.vcf create mode 100644 man/calc_sfs_tests.Rd create mode 100644 man/haplohh2sweepfinder.Rd rename src/{CALL_SCAN_HH2.c => CALL_SCAN_HH_FULL.c} (95%) create mode 100644 src/CALL_SFS_TESTS.c create mode 100644 src/calc_sfs_tests.c create mode 100644 src/calc_sfs_tests.h create mode 100644 src/sfs_moments.c create mode 100644 src/sfs_moments.h create mode 100644 tests/testthat/test_sfs_tests.R diff --git a/DESCRIPTION b/DESCRIPTION index bb3b81b..dea09be 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: rehh Maintainer: Alexander Klassmann Author: Mathieu Gautier, Alexander Klassmann and Renaud Vitalis -Version: 3.1.2 +Version: 3.2.0 License: GPL (>= 2) Title: Searching for Footprints of Selection using 'Extended Haplotype Homozygosity' Based Tests @@ -18,23 +18,18 @@ Description: Population genetic data such as 'Single Nucleotide 'Rsb' (Tang 2007) and 'XP-EHH' (Sabeti 2007) , targeted at differential selection between two populations. - Various plotting functions are also included to facilitate - visualization and interpretation of these statistics. - Due to changes in the API, albeit mostly minor, - versions 3.X are not compatible with versions 2.0.X. - Note: optionally, vcf files can be imported using package vcfR. That package - is currently removed from CRAN, but can still be installed from - following instructions there. + Various plotting functions are included to facilitate + visualization and interpretation of these statistics. 
Depends: R (>= 2.10) Imports: methods, rehh.data Suggests: ape, bookdown, data.table, gap, knitr, qqman, rmarkdown, R.utils, testthat, vcfR VignetteBuilder: knitr NeedsCompilation: yes -Packaged: 2020-07-17 09:16:17 UTC; alex -RoxygenNote: 7.1.0 +Packaged: 2020-11-01 09:31:22 UTC; alex +RoxygenNote: 7.1.1 URL: https://CRAN.R-project.org/package=rehh, https://gitlab.com/oneoverx/rehh -BugReports: https://gitlab.com/oneoverx/rehh/issues +BugReports: https://gitlab.com/oneoverx/rehh/-/issues Repository: CRAN -Date/Publication: 2020-07-17 10:30:03 UTC +Date/Publication: 2020-11-01 10:00:02 UTC diff --git a/MD5 b/MD5 index d1b14ac..9e1a3b2 100644 --- a/MD5 +++ b/MD5 @@ -1,45 +1,47 @@ -4be95bfe5393b4ad0d988f9076587813 *DESCRIPTION -479cda305f23caf8bd3f42caa2013c3e *NAMESPACE -c3bdce8419097ef74d75dedfadbb2264 *NEWS.md -d778a01442f62a331f42f43636d35c9a *R/as.newick.R -791804c313fb1e1bc68e7c5b95ecc92d *R/calc_candidate_regions.R -f64accc80d3c07213985e6eebd0969a7 *R/calc_ehh.R -adf2e4d9478c23d89b6a1f64cbb8e645 *R/calc_ehhs.R +8e4fd944b954c1f38c17b5b3ce560700 *DESCRIPTION +60535cbfb654e4815b8af2877be13a28 *NAMESPACE +481b516368c4630b1471c79c49cf4f20 *NEWS.md +49fb7dde709b7d56d950cfa481eee11e *R/as.newick.R +c489a95c70ee3f7a9b21c5e4e1701e00 *R/calc_candidate_regions.R +a5184787be940979ca4d87c32a69a6c5 *R/calc_ehh.R +d846b662a796c555a11a05a253237219 *R/calc_ehhs.R 7fc6137f3a9052c916c600103630f747 *R/calc_furcation.R 8652c8e385335d536e351d10ccba7767 *R/calc_haplen.R -647a736449be2f7cb9ef08fc31691ad5 *R/calc_pairwise_haplen.R +1b9d3c5e76519a3101f08b58125167d3 *R/calc_pairwise_haplen.R c0a7134ed354a1cc6e2d92cff7cb528b *R/calc_region_stats.R -aebc3fea50b8605b1d1aa3aaf21776bd *R/data2haplohh.R +dc4dfe02470f10724b4702d7bad63153 *R/calc_sfs_tests.R +db41e0abae0d42d719592911a8db10d7 *R/data2haplohh.R 3caea4a19997c6f248290697f797b4a1 *R/distribplot.R ef5e101f1c8c4891d5991bf2fd0257b4 *R/extract_regions.R e5d4611c5eeb4e3f839683a3f665e214 *R/freqbinplot.R 810c2aca58d223d852d957d0fe236b0a 
*R/furcation-class.R b9dd5b7b760f2fdb3441fcf332fd8f9a *R/haplohh-class.R +10eddc406dc337e263f6e745a1c8d002 *R/haplohh2sweepfinder.R 6eebd3fac16bb154d1ea834dd0289c26 *R/ies2xpehh.R 0b91ea1dff6639b4c3b4f13dea539637 *R/ihh2ihs.R 1fc5288e0bd273f896e5daef79a8838e *R/ines2rsb.R -1ab4dd6448b5781c7abf510cd5be6750 *R/make.example.files.R -f1138821f4d4882da57c5499764342fd *R/manhattanplot.R +39ac154b20c0400cd5632b5268de2892 *R/make.example.files.R +14aae5afa388353eb0abee02d930773c *R/manhattanplot.R de8aebb14b907da33b6e6483acb05467 *R/plot.ehh.R c461be69e8d0960110dbd2e2c214c809 *R/plot.ehhs.R f8018a6adb6442dd197b00f6f50d62d4 *R/plot.furcation.R 562bf90c94616972a745c618e3002078 *R/plot.haplen.R -46f5794ea7ee74e36c779522aa2d6ab2 *R/plot.haplohh.R -d0c64f19c95d71847c72b56a14db8de9 *R/rehh.R -e232bea3e5d1d83fdf560f24d01deb18 *R/scan_hh.R -f31185006802c7467a9f6a8cc8c92fa8 *R/scan_hh_full.R -d857d3be60093cc3900f1d221fd867fc *R/subset.haplohh.R +f0e6d5dc46aa06ec2aa75a551187cee9 *R/plot.haplohh.R +b0695663334e80f7f38654bd355b7c32 *R/rehh.R +eb17387a546704c5510ba46856b6ca93 *R/scan_hh.R +50902c106c1f1a7e23f3534f6f639290 *R/scan_hh_full.R +c8aaf488db328f7ea21ecb7ea309e4ee *R/subset.haplohh.R 9d53dfc94e3a08334b06091ab986217d *R/update_haplohh.R -8228a460ed62d771a05ec3e1e9a7cb59 *build/vignette.rds +28612d69e6e3c076a5bac689ddabec2b *build/vignette.rds bbd50c8eb813216188dce7896d1490d3 *data/datalist 91be48fb8754c805321891b9374459f3 *data/haplohh_cgu_bta12.RData 8439f1531106aeabce40b21067a1ac5e *inst/CITATION -ae04d6c09732f0e29be34d966896d3d4 *inst/doc/examples.R -347ed7fd5246ab0a51b07eb65c551f51 *inst/doc/examples.Rmd -a66b153a90d0c7ba1cfc0e6d7d993e13 *inst/doc/examples.html -0aca4270ed980c2b91db059f72ce8512 *inst/doc/rehh.R -0cb9f06cbea7141d8522641178da4b54 *inst/doc/rehh.Rmd -4c42ce24a6e5e3c0f331df7518413c4d *inst/doc/rehh.html +ec7fb3bdabc787abe3be7512729a1e2f *inst/doc/examples.R +4d69844ff06315aff3356db7f85edd4c *inst/doc/examples.Rmd +dec90acf8e63ab73cfe08ef0fc212cfa 
*inst/doc/examples.html +e4ac37fc09893fb1d5297d231923ba02 *inst/doc/rehh.R +602994925b401f0fe98262d30ff9d6e8 *inst/doc/rehh.Rmd +b9da2d6fdd34b4a4b7b0f4e4addbac14 *inst/doc/rehh.html 529c33971c081bacc8d00072aac22b20 *inst/extdata/bta12_cgu.vcf.gz 1a13676197023c7796532d96aeff7f76 *inst/extdata/example1.hap 98aedc74c294ec4769b8e3272200fbbe *inst/extdata/example1.map @@ -47,17 +49,21 @@ ef932b5c063861aae607ee0a93f33fb1 *inst/extdata/example1.vcf 259361a2879d15a50f494d8cb2322f95 *inst/extdata/example2.hap 05fcdbaf97aff8c6125dff9c40770538 *inst/extdata/example2.map 5e14d8117c9a49dac08ad3970a046373 *inst/extdata/example2.vcf +47a95eddad7ca90f68a4c35e671957b2 *inst/extdata/example_neutral.vcf +9adc5ae193051f554e00eda74567486e *inst/extdata/example_sweep.vcf +ab2d39dc100928b7c1e3a510287bb4c1 *inst/extdata/example_sweep_with_recombination.vcf e444d999cd7277d08690d6031446baf1 *inst/extdata/ms.out 3f1bb468e3b3346d7925ed893f668d22 *man/allelefurcation-class.Rd e43b5b616e972b190f596b0791aafdcb *man/as.newick.Rd 7d199b2def3e6d19e90ffa4128531956 *man/calc_candidate_regions.Rd -9a1d4d2573383b1af915cd9740433948 *man/calc_ehh.Rd -877765521fdd1bc21f0fad429f434980 *man/calc_ehhs.Rd +8ce616089d0acc6279c8acaaf2b30cb7 *man/calc_ehh.Rd +35d1ec2b344079cf92df525bf932c1d1 *man/calc_ehhs.Rd c933fc808090cbc1cade18247471df48 *man/calc_furcation.Rd 12a719a71c961a21e8de7f5146ac1dc2 *man/calc_haplen.Rd -36e5218744298abe44f67dbfb6d4c599 *man/calc_pairwise_haplen.Rd +e6f80fc8df2c011ea38bd46d54b06a74 *man/calc_pairwise_haplen.Rd 2acaa7c3c89fcb46f3c7e4f370160986 *man/calc_region_stats.Rd -aae72c0cb9f1bffb913dd8b13d534125 *man/data2haplohh.Rd +d2af8949d1a198ab26bfe45434be5924 *man/calc_sfs_tests.Rd +827eda0a35e51d45916607f10511a564 *man/data2haplohh.Rd de43ae62050ec49c91c44138fe93b67c *man/distribplot.Rd d557df62cd52e701005da0a5f0d8ec7c *man/extract_regions.Rd 17a00bb024774a6d4ef9491306f79872 *man/freqbinplot.Rd @@ -65,31 +71,33 @@ d557df62cd52e701005da0a5f0d8ec7c *man/extract_regions.Rd 
33ddd89f9f2b47889d3be980b70747a0 *man/furcation-class.Rd 128fede3b0de4f84512e35c295433af2 *man/haplen-class.Rd 9917b0686748d15925dc890f045a076d *man/haplohh-class.Rd +c4211206daf63a9bfea41eb968dc3fc6 *man/haplohh2sweepfinder.Rd b240d6b987257834ba5e9815aadfc537 *man/haplohh_cgu_bta12.Rd 8f39cb0fa8ff33094b34f5cc094afa8d *man/ies2xpehh.Rd 53e588ceb4a78d806505251b4fb6f763 *man/ihh2ihs.Rd 5823cdec432bbd985be9063d46fc6af0 *man/ines2rsb.Rd -2449dbe55eb56665554d7fc8ee255766 *man/make.example.files.Rd -704045fb4408191c2962598b09080c7c *man/manhattanplot.Rd +9bf74e16a8816a0fd0f39fa67e879924 *man/make.example.files.Rd +af0baef3fee7a181c0b995d3bc65b5e0 *man/manhattanplot.Rd 96922dd9dbc33bcb024a69e3ec308be7 *man/plot.ehh.Rd 0091288c2de9b5c909c4f135dd87b1ad *man/plot.ehhs.Rd c7933322276a4b8fa37433e050766c4d *man/plot.furcation.Rd 939fa1dcc75eacda1eef877ee99c9e7b *man/plot.haplen.Rd -cd8682e4686cbe6130490e25a8e79301 *man/plot.haplohh.Rd -9d5eba020ec22ee48ef012d9f1a54d70 *man/rehh-package.Rd +090eca9f26b95024ee2aca3f483caf52 *man/plot.haplohh.Rd +07fab15f3017f348fcafac0dec82ac39 *man/rehh-package.Rd 0b3af48394811f7b7e481ca6fd7c63fd *man/remove.example.files.Rd -a67e3e8c460d7f55dde5243edf351c6c *man/scan_hh.Rd -9894dd029348f74ae46e2fb658289e24 *man/scan_hh_full.Rd -ce6e4d831dabbd3ded45ea6a4cdbf433 *man/subset.haplohh.Rd +c7d8274a12c1b590ad9f210db5bfa52c *man/scan_hh.Rd +f0afc4cb433d9c6b7fddb600605d1ef7 *man/scan_hh_full.Rd +3940df1de7b4ad97554a1d03ce8c64e1 *man/subset.haplohh.Rd 35d96dc7734635751107a32802c872c7 *man/update_haplohh.Rd ea195efe5e62727303defe92c7dfd337 *src/CALL_ASNEWICK.c 3bbb1385892442008a3efa83ed14f25a *src/CALL_EHH.c 733a9d7f96da43cdd8127c4323bca3dc *src/CALL_EHHS.c af8abfa9ecc355a7bd825358ef3afe5e *src/CALL_FURCATION.c 28bb39004e7d75cb01ff08c952148e6e *src/CALL_INTEGRAL.c -d5c7bc8c7d80203979aa6a9237e9346d *src/CALL_PAIRWISE_HAPLEN.c +60d65393e0384e62cb72f308058ef007 *src/CALL_PAIRWISE_HAPLEN.c 997866feb635416dba2745e1275b33f0 *src/CALL_SCAN_HH.c 
-34549fcae56fa1cf4325ed1c9870b376 *src/CALL_SCAN_HH2.c +fe0c7dcc2bb11ae05b3ea7080d5aa7d7 *src/CALL_SCAN_HH_FULL.c +63feb5cece836f3ef515492d15a166af *src/CALL_SFS_TESTS.c 95e3011e37d9dde0d75f3a3819b2acd3 *src/Makevars fba8f42eba26752e28b32338665d7348 *src/asnewick.c c091d180d8864d7f3eea26b915784877 *src/asnewick.h @@ -99,8 +107,10 @@ c091d180d8864d7f3eea26b915784877 *src/asnewick.h 27c58a2a27960458af14d7c61f0d155e *src/calc_ehhs.h 08642573b73421d2a0a401769bf8248b *src/calc_furcation.c 965b71b2edfd728a6a32472cda3e7713 *src/calc_furcation.h -8ea79b7de0fcb35f571962e17f7567fb *src/calc_pairwise_haplen.c -00d18c8e73beb8a0c8351110060f1219 *src/calc_pairwise_haplen.h +a05f2b0184ce0b035a956723f6d92eda *src/calc_pairwise_haplen.c +3839049affccee8d202be5eb505d366d *src/calc_pairwise_haplen.h +bbde67f562edd6b5cc23a49ca23ad99a *src/calc_sfs_tests.c +b779fcb22524d83657d325d4245cbc99 *src/calc_sfs_tests.h 4c1d8febd6cd04df5b4d22a3e2bd2c09 *src/definitions.h d42625e6350dcad51095cf22bf571db8 *src/haplotypes.c 976fa1c7e90f686dd57ee6229b12c43a *src/haplotypes.h @@ -110,9 +120,11 @@ d515074159b54f4e8fa71d847f0cc4e3 *src/haplotypes_with_nodes.c b8718089aed1c9aadf83d877f9762a5c *src/haplotypes_with_nodes.h 29441637b9f2e8c074323b102c802683 *src/homozygosity.c 8a0372f7038e294975eba1db483598ca *src/homozygosity.h -b6ca1e604a504ee2fdb7aff7621ce855 *src/init.c +625ba8904f1e83f49b398277e7a524d3 *src/init.c 77f8dc940762ed75f438205732206fc7 *src/integrate.c 40f968c6eb88192b40c247c6b9cd6492 *src/integrate.h +72a80b3450d690a6ac58609f94dd7fcb *src/sfs_moments.c +a3b3aeea2920660c6e293769c631b068 *src/sfs_moments.h 7d20b8af53a3a33835b45ed2dde1f2ff *tests/testthat.R d10abdbe610dee90be8e7c6a1baf28da *tests/testthat/furcation_F1205400.newick 26f4662e29cc03d705c9c9e5d81046d6 *tests/testthat/serialized_furcation_F1205400.txt @@ -121,14 +133,15 @@ d10abdbe610dee90be8e7c6a1baf28da *tests/testthat/furcation_F1205400.newick 7f87b223f6153b8926956609ba293241 *tests/testthat/setup.R 
a177583dabfe62a659b2d0ff4e5829d8 *tests/testthat/teardown.R 0f2d4d7ffafcef03cde572357aa3508e *tests/testthat/test_calc_candidate_regions.R -b333feb96e38279b41a60f9fcb93dd65 *tests/testthat/test_data2haplohh.R +f8a26c6ede3ff1c7bf965b3d3c9797ea *tests/testthat/test_data2haplohh.R a57d331f176ddaa7e7a19fb056cd2626 *tests/testthat/test_error_messages.R 3c553b425c68132fe5a74a06fc85d3e4 *tests/testthat/test_examples.R -6adc3c44917659c38b0adddbbf254f18 *tests/testthat/test_furcation.R +c5d51b0340ebb2dc628e911bcc259f82 *tests/testthat/test_furcation.R 83461c197932114fbb6185821cdae353 *tests/testthat/test_ihh2ihs.R -09bd1fc643230fc24c90aac2c2d2f035 *tests/testthat/test_scan_full.R +c94ab72e812c9daf632b2acb919aa436 *tests/testthat/test_scan_full.R 06e74a8f326b2099d3ef7b77c1664a89 *tests/testthat/test_scan_hh.R -347ed7fd5246ab0a51b07eb65c551f51 *vignettes/examples.Rmd +2641e6bbf68bdd6c97280b3a86dfedcc *tests/testthat/test_sfs_tests.R +4d69844ff06315aff3356db7f85edd4c *vignettes/examples.Rmd 80ac2e78110aa3e54fb5ce824bd5574f *vignettes/genetics.csl -0cb9f06cbea7141d8522641178da4b54 *vignettes/rehh.Rmd -0ae5b0965175c9ae6cbc11dd9bf5adc5 *vignettes/vignette.bib +602994925b401f0fe98262d30ff9d6e8 *vignettes/rehh.Rmd +dc13674ff5df93035a03384f762a946f *vignettes/vignette.bib diff --git a/NAMESPACE b/NAMESPACE index 407cd57..ead9057 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -14,10 +14,12 @@ export(calc_furcation) export(calc_haplen) export(calc_pairwise_haplen) export(calc_region_stats) +export(calc_sfs_tests) export(data2haplohh) export(distribplot) export(extract_regions) export(freqbinplot) +export(haplohh2sweepfinder) export(ies2xpehh) export(ihh2ihs) export(ines2rsb) diff --git a/NEWS.md b/NEWS.md index e255067..f39bdca 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,6 +1,17 @@ +### rehh 3.2.0 (October 30, 2020) + +* corrections and extensions of the vignettes +* added inset between chromosomes in function manhattanplot() +* added option to highlight specific markers in plot.haplohh() +* 
function subset() can filter markers on maximum number of alleles +* option to set maximal haplotype extension (in base pairs) in function scan_hh_full() +* added function haplohh2sweepfinder() that extracts allele frequencies which +can serve as input for the programs SweepFinder or SweeD +* added function calc_sfs_tests() to calculate Tajima's D and Fay & Wu's H, allowing for data with missing values + ### rehh 3.1.2 (July 17, 2020) -* added option to parse vcf files using package 'data.table' in order to avoid reliance on package 'vcfR' which is currently removed from CRAN (but still available on github: https://github.com/knausb/vcfR) +* added option to parse vcf files using package 'data.table' in order to avoid reliance on package 'vcfR' ### rehh 3.1.1 (June 19, 2020) diff --git a/R/as.newick.R b/R/as.newick.R index 5b876bc..bc5640b 100644 --- a/R/as.newick.R +++ b/R/as.newick.R @@ -40,26 +40,24 @@ as.newick <- } if (!is.null(hap.names)) { if (length(hap.names) != furcation@nhap) { - stop( - "Number of specified haplotype names must match the number of haplotypes.", - call. = FALSE - ) + stop("Number of specified haplotype names must match the number of haplotypes.", + call. = FALSE) } } else{ hap.names <- seq_len(furcation@nhap) } - + ##calculations if (side == "left") { ftree <- furcation[[allele]]@left } else{ ftree <- furcation[[allele]]@right } - + ## usage of temporary file is a work-around for ## C string streams are missing under Windows tmp_file_name <- tempfile() - + #calculation and output done by C if (.Call( "CALL_ASNEWICK", @@ -76,7 +74,7 @@ as.newick <- } else{ stop("Could not write to a temporary file.", call. 
= FALSE) } - + unlink(tmp_file_name) return(newick) } diff --git a/R/calc_candidate_regions.R b/R/calc_candidate_regions.R index 6652cca..400b137 100644 --- a/R/calc_candidate_regions.R +++ b/R/calc_candidate_regions.R @@ -109,7 +109,7 @@ calc_candidate_regions <- function(scan, # perform calculation ## remove NA - scan <- scan[!is.na(scan[[score_column_nr]]), ] + scan <- scan[!is.na(scan[[score_column_nr]]),] if (ignore_sign) { score <- abs(scan[[score_column_nr]]) @@ -148,9 +148,12 @@ calc_candidate_regions <- function(scan, } for (offset in offsets) { - breaks <- seq(offset, - max(chr_pos) + window_size - 1, - window_size) + breaks <- seq( + floor(min(chr_pos) / window_size) * window_size + offset, + max(chr_pos) + window_size - 1, + window_size + ) + windows <- cut(chr_pos, breaks = breaks, right = right, @@ -213,7 +216,7 @@ calc_candidate_regions <- function(scan, if (nrow(chr_cand_reg) > 1) { # sort different offset windows by position - chr_cand_reg <- chr_cand_reg[order(chr_cand_reg[[2]]), ] + chr_cand_reg <- chr_cand_reg[order(chr_cand_reg[[2]]),] # join neighboring windows if (join_neighbors) { @@ -246,7 +249,7 @@ calc_candidate_regions <- function(scan, threshold]) #e liminate all right neighbors chr_cand_reg <- chr_cand_reg[-((i + 1):(i + - j)), ] + j)),] } i <- i + 1 } diff --git a/R/calc_ehh.R b/R/calc_ehh.R index 8edb288..7e30b9b 100644 --- a/R/calc_ehh.R +++ b/R/calc_ehh.R @@ -44,8 +44,9 @@ #'} #'@references Gautier, M. and Naves, M. (2011). Footprints of selection in the ancestral admixture of a New World Creole cattle breed. \emph{Molecular Ecology}, \strong{20}, 3128-3143. #' -#'Klassmann, A. et al. (2020). Detecting selection using Extended Haplotype -#'Homozygosity (EHH)-based statistics on unphased or unpolarized data. (submitted). +#'Klassmann, A. and Gautier, M. (2020). Detecting selection using Extended Haplotype +#'Homozygosity-based statistics on unphased or unpolarized data (preprint). 
+#'https://doi.org/10.22541/au.160405572.29972398/v1 #' #'Sabeti, P.C. et al. (2002). Detecting recent positive selection in the human genome from haplotype structure. \emph{Nature}, \strong{419}, 832-837. #' diff --git a/R/calc_ehhs.R b/R/calc_ehhs.R index 488627b..50cd29e 100644 --- a/R/calc_ehhs.R +++ b/R/calc_ehhs.R @@ -42,8 +42,9 @@ #'} #'@references Gautier, M. and Naves, M. (2011). Footprints of selection in the ancestral admixture of a New World Creole cattle breed. \emph{Molecular Ecology}, \strong{20}, 3128-3143. #' -#'Klassmann, A. et al. (2020). Detecting selection using Extended Haplotype -#'Homozygosity (EHH)-based statistics on unphased or unpolarized data. (submitted). +#'Klassmann, A. and Gautier, M. (2020). Detecting selection using Extended Haplotype +#'Homozygosity-based statistics on unphased or unpolarized data (preprint). +#'https://doi.org/10.22541/au.160405572.29972398/v1 #' #'Sabeti, P.C. et al. (2002). Detecting recent positive selection in the human genome from haplotype structure. \emph{Nature}, \strong{419}, 832-837. #' diff --git a/R/calc_pairwise_haplen.R b/R/calc_pairwise_haplen.R index 4d41b97..c712872 100644 --- a/R/calc_pairwise_haplen.R +++ b/R/calc_pairwise_haplen.R @@ -4,12 +4,16 @@ #'@param mrk integer representing the number of the focal marker within the haplohh object #'or string representing its ID/name. #'@param phased logical. If \code{TRUE} (default) chromosomes are expected to be phased. If \code{FALSE}, the haplotype data is assumed to -#'consist of pairwise ordered chromosomes belonging to diploid individuals and only the two chromosomes of +#'consist of pairwise ordered chromosomes belonging to diploid individuals and only the two chromosomes of #'each individual are compared. #'@param maxgap maximum allowed gap in bp between two markers. If exceeded, further calculation is stopped at the gap -#'(default=\code{NA}, i.e no limitation). +#'(default=\code{NA}, i.e. no limitation). 
+#'@param max_extend maximum distance in bp to extend shared haplotypes away from the focal marker. +#'(default \code{NA}, i.e. no limitation). +#'@param side side to consider, either "left" (positions lower than focal position), "right" (positions higher than focal position) +#'or "both" (default). #'@details The function computes the length of shared haplotypes (stretches of identical sequence) around -#'the focal marker. +#'the focal marker. #' #'Note that the function \code{\link{calc_haplen}} calculates for each chromosome #'the boundaries of its longest shared haplotype; separately upstream and downstream of @@ -28,7 +32,9 @@ calc_pairwise_haplen <- function(haplohh, mrk, phased = TRUE, - maxgap = NA) { + maxgap = NA, + max_extend = NA, + side = "both") { ##check parameters if (!(is.haplohh(haplohh))) { stop("Data is not a valid haplohh object.", call. = FALSE) @@ -58,9 +64,23 @@ calc_pairwise_haplen <- } mrk <- which(mrk.names(haplohh) == mrk) } + if (!is.na(maxgap) & (!is.numeric(maxgap) | maxgap < 1)) { + stop("maxgap must be a positive integer number.", call. = FALSE) + } + if (!is.na(max_extend) & + (!is.numeric(max_extend) | max_extend < 1)) { + stop("max_extend must be a positive integer number.", call. = FALSE) + } if (is.na(maxgap)) { - maxgap <- (max(positions(haplohh)) + 1) + maxgap <- diff(range(positions(haplohh))) + 1 + } + if (is.na(max_extend)) { + max_extend <- diff(range(positions(haplohh))) + 1 + } + + if (!(side %in% c("both", "left", "right"))) { + stop("side must be either \"both\", \"left\" or \"right\".", call. 
= FALSE) } ##perform calculations @@ -79,8 +99,10 @@ calc_pairwise_haplen <- nmrk(haplohh), positions(haplohh), mrk, - maxgap, - phased, + as.integer(maxgap), + as.integer(max_extend), + ifelse(side == "both", 0L, ifelse(side == "left", 1L, 2L)), + as.integer(phased), pairwise_haplen ) diff --git a/R/calc_sfs_tests.R b/R/calc_sfs_tests.R new file mode 100644 index 0000000..94d0582 --- /dev/null +++ b/R/calc_sfs_tests.R @@ -0,0 +1,155 @@ +#'Calculate site frequency spectrum test statistics +#'@description Calculate site frequency spectrum (SFS) tests Tajima's D, Fay & Wu's H and Zeng's E. +#'@param haplohh an object of class \code{haplohh} (see \code{\link{data2haplohh}}) +#'@param polarized logical. \code{TRUE} by default. If \code{FALSE}, use major and minor allele instead of ancestral and derived. If there +#'are more than two alleles then the minor allele refers to the second-most frequent allele. +#'Note that Tajima's D remains unchanged, Fay & Wu's H is always zero for folded spectra and Zeng's E becomes equal to Tajima's D. +#'@param window_size size of sliding windows. If \code{NA} (default), there will be only +#'one window covering the whole length of the chromosome. +#'@param overlap size of window overlap (default 0, i.e. no overlap). +#'@param right logical, indicating if the windows should be closed on the right and open on the left (default) or vice versa. +#'@param min_n_mrk minimum number of (polymorphic) markers per window. +#'@param verbose logical. \code{TRUE} by default; reports if multi-allelic sites are removed. +#'@details Neutrality tests based on the site frequency spectrum (SFS) are +#'largely unrelated to EHH-based methods. The tests provided here are implemented +#'elsewhere, too (e.g. in package \href{https://cran.r-project.org/package=PopGenome}{PopGenome}). +#' +#'Each test compares two estimations of the \emph{scaled mutation rate} theta, +#'which all have the same expected value under neutrality. 
Deviations from zero indicate +#'violations of the neutral null model, typically population size changes, population subdivision or selection. +#'Tajima's D and Fay & Wu's H become negative in presence of an almost completed sweep, Zeng's E becomes +#'positive for some time after it. Significance can typically be assigned only by +#'simulations. +#' +#'The standard definition of the tests cannot cope with missing values and typically markers +#'with missing genotypes must be discarded. Ferretti (2012) provides an extension +#'that can handle missing values (without discarding any non-missing values). In this package, +#'only the first moments (the theta-estimators themselves) are adapted accordingly, +#'but not the second moments (their variances), because the latter is computationally demanding +#'and the resulting bias relatively small. It is recommended, though, to discard markers or haplotypes +#'with more than 20\% missing values. +#' +#'Multi-allelic markers are always removed since the tests rely on the "infinite sites model" which +#'implies that all polymorphic markers are bi-allelic. +#'Monomorphic markers can be present, but are irrelevant for the tests. +#' +#'@return A data frame with window coordinates, the number of contained (polymorphic) markers, Watterson's, Tajima's and Zeng's +#'estimators of theta and the test statistics of Tajima's D, Fay & Wu's H and Zeng's E. +#'@examples +#'make.example.files() +#'# neutral evolution +#'hh <- data2haplohh("example_neutral.vcf", verbose = FALSE) +#'calc_sfs_tests(hh) +#'# strong selective sweep +#'hh <- data2haplohh("example_sweep.vcf", verbose = FALSE) +#'calc_sfs_tests(hh) +#'remove.example.files() +#'@references Watterson, G.A. (1975). On the number of segregating sites in genetical models without recombination. +#'\emph{Theoretical Population Biology} \strong{7}(2) 256-276. +#' +#'Tajima, F. (1983). Evolutionary relationship of DNA sequences in finite populations. 
+#'\emph{Genetics} \strong{105}(2) 437-60. +#' +#'Tajima, F. (1989). Statistical method for testing the neutral mutation hypothesis by DNA polymorphism. +#'\emph{Genetics} \strong{123}(3) 585-95. +#' +#'Fay, J. and Wu, C. (2000). Hitchhiking under positive Darwinian selection. \emph{Genetics} +#'\strong{155}(3) 1405-13. +#' +#'Zeng, K. et al. (2006). Statistical tests for detecting positive selection by utilizing high-frequency variants. +#'\emph{Genetics} \strong{174}(3) 1431-9. +#' +#'Ferretti, L. and Raineri, E. and Ramos-Onsins, S. (2012). Neutrality tests for sequences with missing data. +#'\emph{Genetics} \strong{191}(4) 1397-401. +#'@export +calc_sfs_tests <- + function(haplohh, + polarized = TRUE, + window_size = NA, + overlap = 0, + right = TRUE, + min_n_mrk = 1, + verbose = TRUE) { + # need integer numbers, otherwise "%%" causes trouble with i386 + overlap <- as.integer(overlap) + min_n_mrk <- as.integer(min_n_mrk) + + if (is.na(window_size)) { + window_size <- ceiling(max(haplohh@positions)) + } + window_size <- as.integer(window_size) + + if (window_size < 1) { + stop("Window size has to be a positive integer number.", call. = FALSE) + } + if (is.na(overlap) | + overlap < 0 | + overlap >= window_size | + overlap %% 1L != 0L | + (overlap != 0L & window_size %% overlap != 0L)) { + stop("'overlap' has to be zero or an integer factor of 'window_size'.", + call. = FALSE) + } + if (min_n_mrk < 1 | min_n_mrk %% 1 != 0L) { + stop("'min_n_mrk' has to be a positive integer number.", + call. 
= FALSE) + } + + # if present, remove multi-allelic sites + if (max(apply(haplohh@haplo, 2, function(x) { + length(na.omit(unique(x))) + })) > 2) { + haplohh <- + subset( + haplohh, + max_alleles = 2, + min_perc_geno.mrk = floor(2 / nhap(haplohh) * 100), + verbose = verbose + ) + } + + step <- ifelse(overlap != 0, overlap, window_size) + window_left <- + seq( + floor(min(positions(haplohh) - right * 1) / window_size) * window_size, + ceiling(max(positions(haplohh))) - window_size + 1 - right * 1, + step + ) + window_right <- window_left + window_size + windows <- cbind(window_left, window_right) + + n_mrk <- vector(mode = "integer", length = nrow(windows)) + results <- matrix(0, nrow = nrow(windows), ncol = 6) + + .Call( + "CALL_SFS_TESTS", + haplo(haplohh), + nhap(haplohh), + nmrk(haplohh), + positions(haplohh), + polarized, + windows, + nrow(windows), + right, + min_n_mrk, + n_mrk, + results + ) + + df <- data.frame(chr.name(haplohh), windows, n_mrk, results) + colnames(df) <- + c( + "CHR", + "START", + "END", + "N_MRK", + "THETA_S", + "THETA_PI", + "THETA_L", + "TAJIMA_D", + "FAY_WU_H", + "ZENG_E" + ) + + return(df) + } diff --git a/R/data2haplohh.R b/R/data2haplohh.R index 490fcf3..e4771d9 100644 --- a/R/data2haplohh.R +++ b/R/data2haplohh.R @@ -33,9 +33,9 @@ #'Low confidence ancestral alleles are usually coded by lower-case letters. If \code{TRUE} (default), these are #'changed to upper case before the alleles of the sample are matched for polarization. #'@param vcf_reader library used to read vcf. By default, low-level parsing is -#'performed using the generic package \code{data.table}. In order to read compressed files, +#'performed using the generic package \code{data.table}. In order to read compressed files, #'the package \code{R.utils} must be installed, too. -#'If the specialized package \code{vcfR} is available, set this parameter to \code{"vcfR"}. +#'If the specialized package \code{vcfR} is available, set this parameter to \code{"vcfR"}. 
#'@param position_scaling_factor intended primarily for output of ms where #'positions lie in the interval [0,1]. These can be rescaled to sizes #'of typical markers in real data. @@ -262,7 +262,7 @@ data2haplohh <- check_chromosome_names(map_file, unique(as.character(map[, 1])), chr.name) ### subset map data frame to specified chromosome - map <- map[as.character(map[, 1]) == chr.name,] + map <- map[as.character(map[, 1]) == chr.name, ] ### set first slots of haplohh hh <- new("haplohh") @@ -419,12 +419,17 @@ data2haplohh <- call. = FALSE) } + ## scale positions + if (!is.na(position_scaling_factor)) { + hh@positions <- hh@positions * position_scaling_factor + } + # ## check for multiple markers multiple_markers <- duplicated(hh@positions) if (sum(multiple_markers) > 0) { if (remove_multiple_markers) { hh@positions <- hh@positions[!multiple_markers] - hh@haplo <- hh@haplo[,!multiple_markers, drop = FALSE] + hh@haplo <- hh@haplo[, !multiple_markers, drop = FALSE] warning(paste( "Removed", sum(multiple_markers), @@ -439,10 +444,6 @@ data2haplohh <- } } - ## scale positions - if (!is.na(position_scaling_factor)) { - hh@positions <- hh@positions * position_scaling_factor - } # filtering hh <- subset( @@ -568,7 +569,7 @@ read.standard <- function(hap_file, verbose) { } rownames(tmp_haplo) <- rownames - return(tmp_haplo[,-1]) + return(tmp_haplo[, -1]) } read.transposed <- function(hap_file, verbose) { @@ -673,10 +674,10 @@ read.fastPhase <- function(hap_file, popsel, verbose) { } hap1 <- unlist(strsplit(out_fphase[hap1_line], split = " ")) - tmp_haplo[hap1_index,] <- hap1 + tmp_haplo[hap1_index, ] <- hap1 hap2 <- unlist(strsplit(out_fphase[hap2_line], split = " ")) - tmp_haplo[hap2_index,] <- hap2 + tmp_haplo[hap2_index, ] <- hap2 } if (!anyDuplicated(hapnames)) { @@ -788,7 +789,7 @@ read.vcf <- chr.name <- check_chromosome_names(vcf_file, as.character(unique(map[, 1])), chr.name) selected <- map[, 1] == chr.name - map <- map[selected,] + map <- map[selected, ] if 
(polarize_vcf) { if (verbose) @@ -845,7 +846,7 @@ read.vcf <- ), stringsAsFactors = FALSE, showProgress = FALSE, - )[selected,] + )[selected, ] )) } @@ -978,9 +979,9 @@ read.vcf <- # if only one marker then matrix must be explicitly coerxed if (nrow(hh@haplo) == 1) { - hh@haplo <- as.matrix(hh@haplo[,!is.na(aan)]) + hh@haplo <- as.matrix(hh@haplo[, !is.na(aan)]) } else{ - hh@haplo <- hh@haplo[,!is.na(aan)] + hh@haplo <- hh@haplo[, !is.na(aan)] } hh@positions <- hh@positions[!is.na(aan)] diff --git a/R/haplohh2sweepfinder.R b/R/haplohh2sweepfinder.R new file mode 100644 index 0000000..bb3d79d --- /dev/null +++ b/R/haplohh2sweepfinder.R @@ -0,0 +1,65 @@ +#'Translate object of \code{\link{haplohh-class}} into SweepFinder format +#'@description Extract allele frequencies of an object of class \code{\link{haplohh-class}} +#'and returns a table in SweepFinder input format. +#'@param haplohh object of class \code{\link{haplohh-class}}. +#'@param polarized logical. If \code{TRUE} (default), flag "folded" is set to 0, otherwise to 1. +#'@param verbose logical. If \code{TRUE} (default), prints filter statements. +#'@details SweepFinder and SweeD are two stand-alone programs which +#'implement the same method to detect selective sweeps using the +#'allele frequency at each site. This function calculates these frequencies +#'from a \code{\link{haplohh-class}} and returns a table which +#'can be saved into a file (with tabs as separators, without row names and quotes) that can +#'be used as input for the two programs. +#' +#'Sites with less than two haplotypes genotyped or with more than two alleles are removed. +#'If \code{polarized}, sites monomorphic for the ancestral allele are removed, too. 
+#' +#'@return A dataframe with four columns: +#'\itemize{ +#'\item \strong{position} marker position +#'\item \strong{x} (absolute) frequency of the alternative (derived) variant +#'\item \strong{n} number of non-missing genotypes +#'\item \strong{folded} a flag marking polarization +#'} +#'@seealso \code{\link{haplohh-class}}, \code{\link{data2haplohh}} +#'@references DeGiorgio, M., Huber, C.D., Hubisz, M.J., Hellmann, I. and Nielsen, R. (2016). +#'SweepFinder2: increased robustness and flexibility. \emph{Bioinformatics} \strong{32}:1895-1897. +#' +#'Pavlidis, P., Zivkovic, D., Stamatakis, A. and Alachiotis, N. (2013). +#'SweeD: likelihood-based detection of selective sweeps in thousands of genomes. +#'\emph{Molecular Biology and Evolution} \strong{30}: 2224-34. +#'@examples #example +#'# sweepfinder example from vignette +#'make.example.files() +#'hh <- data2haplohh("example_sweep_with_recombination.vcf") +#'haplohh2sweepfinder(hh) +#'remove.example.files() +#'@export +haplohh2sweepfinder <- + function(haplohh, + polarized = TRUE, + verbose = TRUE) { + # remove multi-allelic sites (sites with more than two alleles) + # and sites with less than two sequences genotyped + haplohh <- + subset( + haplohh, + max_alleles = 2, + min_perc_geno.mrk = floor(2 / nhap(haplohh) * 100), + verbose = verbose + ) + + n <- colSums(!is.na(haplohh@haplo)) + x <- colSums(!is.na(haplohh@haplo) & haplohh@haplo != 0L) + df <- data.frame( + position = haplohh@positions, + x = x, + n = n, + folded = (!polarized) * 1L + ) + # sites that are monomorphic for the ancestral allele are not allowed + if (polarized) { + df <- df[df$x > 0, ] + } + return(df) + } diff --git a/R/make.example.files.R b/R/make.example.files.R index 8dbc928..ea7f57c 100644 --- a/R/make.example.files.R +++ b/R/make.example.files.R @@ -17,6 +17,9 @@ #'\item \code{example2.hap} "example 2" haplotype file in "standard format" #'\item \code{example2.map} "example 2" marker information file #'\item 
\code{example2.vcf} 
"example 2" as vcf file +#'\item \code{example_neutral.vcf} "example neutral evolution" as vcf file +#'\item \code{example_sweep.vcf} "example for a selective sweep (without recombination)" +#'\item \code{example_sweep_with_recombination.vcf} "example for a selective sweep with recombination" #'\item \code{ms.out output} from a small simulation by the program 'ms' #'\item \code{bta12_cgu.hap} an haplotype file in "standard format" #'\item \code{bta12_cgu.thap} an haplotype file in "transposed format" @@ -26,11 +29,17 @@ #'Example 1 was used in (Gautier 2017) to explain the various EHH derived statistics calculated by this package. #'Example 2 is an extension containing multi-allelic markers and missing values. #' +#'Examples for neutral data and sweeps are discussed in a supplement of Klassmann (2020). +#' #'The bta12 files contain data for 280 haplotypes, originating from 140 individuals belonging to the #'Creole cattle breed from Guadeloupe, at 1.424 markers mapping to bovine chromosome 12 (BTA12) (Gautier 2011). #'@references Gautier, M. and Naves, M. (2011). Footprints of selection in the ancestral admixture of a New World Creole cattle breed. \emph{Molecular Ecology}, \strong{20}, 3128-3143. #' #'Gautier, M., Klassmann, A. and Vitalis, R. (2017). rehh 2.0: a reimplementation of the R package rehh to detect positive selection from haplotype structure. \emph{Molecular Ecology Resources}, \strong{17}, 78-90. #' +#'Klassmann, A. and Gautier, M. (2020). Detecting selection using Extended Haplotype +#'Homozygosity-based statistics on unphased or unpolarized data (preprint). 
+#'https://doi.org/10.22541/au.160405572.29972398/v1 #'@seealso \code{\link{data2haplohh}}, \code{\link{remove.example.files}} #'@export #'@import rehh.data @@ -60,6 +69,9 @@ make.example.files <- function() { "example2.hap", "example2.map", "example2.vcf", + "example_neutral.vcf", + "example_sweep.vcf", + "example_sweep_with_recombination.vcf", "ms.out", "bta12_cgu.vcf.gz" ) @@ -98,6 +110,9 @@ remove.example.files <- function() { "example2.hap", "example2.map", "example2.vcf", + "example_neutral.vcf", + "example_sweep.vcf", + "example_sweep_with_recombination.vcf", "ms.out", "bta12_cgu.vcf.gz" ) diff --git a/R/manhattanplot.R b/R/manhattanplot.R index 1fcebcd..800ffc5 100644 --- a/R/manhattanplot.R +++ b/R/manhattanplot.R @@ -26,13 +26,18 @@ #'@param cex size of the points representing markers in the plot(s) (see \code{\link[graphics]{par}}). #'@param las orientation of axis labels (see \code{\link[graphics]{par}}). #'@param pch type of the points representing markers in the plot(s) (see \code{\link[graphics]{points}}). +#'@param inset inset (in bases) between chromosomes to avoid overlap of data points. Default: 5,000,000 bases. #'@param resolution Rasterize data points to the specified resolution and remove #'duplicate points. Defaults to NULL, i.e. no rasterization. A typical value might be \code{c(1E5, 0.01)}, #'meaning that resolution on the x-axis (chromosomal position) is 100000 and on the y-axis (score or p-value) is 0.01. #'@param ... further arguments to be passed to \code{\link[graphics]{plot.default}}. #'@details The color of chromosomes is taken from the "Graphics Palette", see \code{\link[grDevices]{palette}}. -#'@details If a single chromosome is plotted, a genomic region can be specified by +#' +#'If a single chromosome is plotted, a genomic region can be specified by #'argument \code{xlim}. +#' +#'Other statistics can be plotted as well, although a warning is issued. 
They must be given by a data.frame +#'with columns CHR and POSITION and the statistic in the third column. #'@return The function returns a plot. #'@seealso \code{\link{ihh2ihs}}, \code{\link{ies2xpehh}}, \code{\link{ines2rsb}}, \code{\link{calc_candidate_regions}}. #'@examples library(rehh.data) @@ -65,6 +70,7 @@ manhattanplot <- cex = 0.5, las = 1, pch = 20, + inset = 5E+6, resolution = NULL, ...) { # check parameters @@ -131,6 +137,13 @@ manhattanplot <- call. = FALSE) } + if (is.na(inset)) { + inset <- 0 + } else if (!is.numeric(inset)) { + stop("Inset has to be specified by a number.", + call. = FALSE) + } + # perform plot ## try to identify statistic by column name @@ -183,8 +196,8 @@ manhattanplot <- "]") } else { ylab <- - bquote("-" ~ log[10] ~ "[2" * Phi[ ~ "-" ~ "|" ~ scriptstyle(italic(.(statistic))) ~ - "|"] * "]") + bquote("-" ~ log[10] ~ "[2" * Phi[~ "-" ~ "|" ~ scriptstyle(italic(.(statistic))) ~ + "|"] * "]") } } else{ if (ignore_sign) { @@ -216,7 +229,7 @@ manhattanplot <- if (!is.null(mrk)) { if (is.vector(mrk)) { nmrk <- length(mrk) - data_highlighted <- data[mrk, ] + data_highlighted <- data[mrk,] } else{ nmrk <- nrow(mrk) ## merge erases row.names; duplicate them as column @@ -232,7 +245,7 @@ manhattanplot <- ## remove rows with NAs (arising by empty subset) data_highlighted <- - data_highlighted[!is.na(data_highlighted$CHR), ] + data_highlighted[!is.na(data_highlighted$CHR),] if (nrow(data_highlighted) < nmrk) { warning(paste( @@ -257,15 +270,15 @@ manhattanplot <- stop("Specified chromosomes not contained in data.", call. 
= FALSE) } chromosomes <- chr.name - data <- data[data$CHR %in% chromosomes, ] + data <- data[data$CHR %in% chromosomes,] } chr_max <- vapply(split(data, data$CHR, drop = TRUE), function(x) { - max(x$POSITION) + max(x$POSITION) + inset }, FUN.VALUE = 0) cum <- cumsum(c(0, chr_max[chromosomes])) - label_pos <- (cum[-length(cum)] + cum[-1]) / 2 + label_pos <- (cum[-length(cum)] + cum[-1] - inset) / 2 cum <- cum[-length(cum)] names(cum) <- chromosomes @@ -294,7 +307,7 @@ manhattanplot <- if (!is.null(dot.args$xlim)) { # subset to specified positions data <- data[data$POSITION >= dot.args$xlim[1] & - data$POSITION <= dot.args$xlim[2], ] + data$POSITION <= dot.args$xlim[2],] dot.args$xlim <- dot.args$xlim / scale } @@ -348,7 +361,7 @@ manhattanplot <- } if (!is.null(cr)) { - cr <- cr[cr$CHR %in% chromosomes, ] + cr <- cr[cr$CHR %in% chromosomes,] if (nrow(cr) > 0) { col <- adjustcolor(cr.col, alpha.f = cr.opacity) diff --git a/R/plot.haplohh.R b/R/plot.haplohh.R index 2a1460d..e93b263 100644 --- a/R/plot.haplohh.R +++ b/R/plot.haplohh.R @@ -34,19 +34,21 @@ #'@param pos.lab.mrk position of marker labels. Either \code{"top"} (default) or \code{"none"}. #'@param srt.hap rotation of haplotype labels (see \code{\link[graphics]{par}}). #'@param srt.mrk rotation of marker labels (see \code{\link[graphics]{par}}). +#'@param highlight.mrk vector of markers to be highlighted +#'@param highlight.mrk.col color for each allele (as coded internally) at highlighted markers. #'@param ... other parameters to be passed to \code{\link[graphics]{plot.default}}. #'@details Specifying a haplohh-object with more than 4096 haplotypes or #'markers produces an error. #'@seealso \code{\link{calc_haplen}}, \code{\link{plot.furcation}}. 
#'@examples #example haplohh object #'make.example.files() -#'hh <- data2haplohh(hap_file = "example1.hap", +#'hh <- data2haplohh(hap_file = "example1.hap", #' map_file = "example1.map", #' allele_coding = "01") #'plot(hh) -#'hh <- data2haplohh(hap_file = "example2.hap", -#' map_file = "example2.map", -#' allele_coding = "01", +#'hh <- data2haplohh(hap_file = "example2.hap", +#' map_file = "example2.map", +#' allele_coding = "01", #' min_perc_geno.mrk = 50) #'plot(hh) #'remove.example.files() @@ -75,6 +77,8 @@ plot.haplohh <- pos.lab.mrk = "top", srt.hap = 0, srt.mrk = 0, + highlight.mrk = NULL, + highlight.mrk.col = c("lightgray", "black", "darkgray"), ...) { # arbitrary limit on haplotypes and markers MAX <- 4096L @@ -122,6 +126,35 @@ plot.haplohh <- } } + if (!is.null(highlight.mrk)) { + if (is.numeric(highlight.mrk)) { + highlight.mrk <- as.integer(highlight.mrk) + if (any(highlight.mrk < 1)) { + stop(paste0("No marker numbers smaller than 1 allowed."), + call. = FALSE) + } + if (any(highlight.mrk > nmrk(x))) { + stop( + paste0( + "The marker number ", + highlight.mrk[which(highlight.mrk > nmrk(x))], + " is bigger than the number of markers in the data set (", + nmrk(x), + ")" + ), + call. = FALSE + ) + } + } else{ + highlight.mrk <- as.character(highlight.mrk) + if (!all(highlight.mrk %in% mrk.names(x))) { + stop(paste0("Marker '", highlight.mrk[which(!(highlight.mrk %in% mrk.names(x)))], "' not found."), + call. = FALSE) + } + highlight.mrk <- which(mrk.names(x) %in% highlight.mrk) + } + } + dot_args <- list(...) 
if (!is.null(dot_args$xlim)) { @@ -265,18 +298,26 @@ plot.haplohh <- for (i in seq_len(nrow(hh_subset@haplo))) { #draw haplo-lines y <- 1 - (i - 0.5) / nrow(hh_subset@haplo) + #if marker is set lines are colored by allele, otherwise by number + #if colored by allele and allele is NA then use color for allele 0 lines(range(hh_subset@positions) / scale, rep(y, 2), - col = linecol[ifelse(is.na(mrk), i, hh_subset@haplo[i, mrk]) %% length(linecol) + + col = linecol[ifelse(is.na(mrk), i, ifelse(is.na(hh_subset@haplo[i, mrk]), 0, hh_subset@haplo[i, mrk])) %% length(linecol) + 1L], lwd = lwd) + pcol <- col[((hh_subset@haplo[i, ]) %% length(col)) + 1L] + if (!is.null(highlight.mrk)) { + pcol[highlight.mrk] <- + highlight.mrk.col[((hh_subset@haplo[i, highlight.mrk]) %% length(highlight.mrk.col)) + 1L] + } + #add markers points( x = xcoord / scale, y = rep(y, ncol(hh_subset@haplo)), cex = cex, - col = col[((hh_subset@haplo[i, ]) %% length(col)) + 1L], + col = pcol, pch = pch[((hh_subset@haplo[i, ]) %% length(pch)) + 1L] ) diff --git a/R/rehh.R b/R/rehh.R index 6a50fdb..899e240 100644 --- a/R/rehh.R +++ b/R/rehh.R @@ -10,6 +10,10 @@ #'Gautier M., Klassmann A., and Vitalis R. (2017). rehh 2.0: a reimplementation of the R package #'rehh to detect positive selection from haplotype structure. \emph{Molecular Ecology Resources}, \strong{17}, 78-90. #' +#'Klassmann, A. and Gautier, M. (2020). Detecting selection using Extended Haplotype +#'Homozygosity-based statistics on unphased or unpolarized data (preprint). +#'https://doi.org/10.22541/au.160405572.29972398/v1 #' #'Sabeti, P.C. et al. (2002). Detecting recent positive selection in the human genome from haplotype structure. \emph{Nature}, \strong{419}, 832-837. #' #'Sabeti, P.C. et al. (2007). Genome-wide detection and characterization of positive selection in human populations. \emph{Nature}, \strong{449}, 913-918. 
diff --git a/R/scan_hh.R b/R/scan_hh.R index a4976d9..e2ade72 100644 --- a/R/scan_hh.R +++ b/R/scan_hh.R @@ -9,7 +9,7 @@ #'the less haplotypes contribute to EHH(S). #'@param limhomohaplo if there are less than \code{limhomohaplo} homozygous chromosomes, the #'calculation is stopped. This option is intended for unphased data and should be invoked only -#'if relatively low frequency variants are not filtered subsequently (see main vignette and Klassmann et al. 2020). +#'if relatively low frequency variants are not filtered subsequently (see main vignette and Klassmann et al. 2020). #'@param limehh limit at which EHH stops to be evaluated. #'@param limehhs limit at which EHHS stops to be evaluated. #'@param phased logical. If \code{TRUE} (default) chromosomes are expected to be phased. If \code{FALSE}, the haplotype data is assumed to @@ -37,7 +37,7 @@ #'for all markers. To perform a whole genome-scan this function needs #'to be called for each chromosome and results concatenated. #' -#'Note that setting \code{limehh} or \code{limehhs} to zero is likely to reduce power, +#'Note that setting \code{limehh} or \code{limehhs} to zero is likely to reduce power, #'since even under neutrality a tiny fraction (<<0.05) of extremely long shared haplotypes is expected #'which, if fully accounted for, would obfuscate the signal at selected sites. 
#' @@ -48,21 +48,22 @@ #'\item sample frequency of the ancestral / major allele #'\item sample frequency of the second-most frequent remaining allele #'\item number of evaluated haplotypes at the focal marker for the ancestral / major allele -#'\item number of evaluated haplotypes at the focal marker for the second-most frequent remaining allele +#'\item number of evaluated haplotypes at the focal marker for the second-most frequent remaining allele #'\item iHH of the ancestral / major allele #'\item iHH of the second-most frequent remaining allele #'\item iES (used by Sabeti et al 2007) #'\item inES (used by Tang et al 2007)} #'Note that in case of unphased data the evaluation is restricted to #'haplotypes of homozygous individuals which reduces the power -#'to detect selection, particularly for iHS (for appropriate parameter setting +#'to detect selection, particularly for iHS (for appropriate parameter setting #'see the main vignette and Klassmann et al (2020)). #' # #'@references Gautier, M. and Naves, M. (2011). Footprints of selection in the ancestral admixture of a New World Creole cattle breed. \emph{Molecular Ecology}, \strong{20}, 3128-3143. #' -#'Klassmann, A. et al. (2020). Detecting selection using Extended Haplotype -#'Homozygosity (EHH)-based statistics on unphased or unpolarized data. (submitted). +#'Klassmann, A. and Gautier, M. (2020). Detecting selection using Extended Haplotype +#'Homozygosity-based statistics on unphased or unpolarized data (preprint). +#'https://doi.org/10.22541/au.160405572.29972398/v1 #' #'Sabeti, P.C. et al. (2002). Detecting recent positive selection in the human genome from haplotype structure. \emph{Nature}, \strong{419}, 832-837. #' @@ -112,12 +113,19 @@ scan_hh <- limehhs > 1) { stop("limehhs must lie between 0 and 1.", call. = FALSE) } + if (!is.na(maxgap) & (!is.numeric(maxgap) | maxgap < 1)) { + stop("maxgap must be a positive integer number.", call. 
= FALSE) + } + if (!is.na(scalegap) & (!is.numeric(scalegap) | scalegap < 1)) { + stop("scalegap must be a positive integer number.", call. = FALSE) + } + if (is.na(maxgap)) { - maxgap <- (max(positions(haplohh)) + 1) + maxgap <- diff(range(positions(haplohh))) + 1 } if (is.na(scalegap)) { - scalegap <- (max(positions(haplohh)) + 1) + scalegap <- diff(range(positions(haplohh))) + 1 } else if (scalegap > maxgap) { stop("scalegap has to be smaller than maxgap in order to have an effect.", call. = FALSE) diff --git a/R/scan_hh_full.R b/R/scan_hh_full.R index 55f7841..2f515a5 100644 --- a/R/scan_hh_full.R +++ b/R/scan_hh_full.R @@ -1,12 +1,10 @@ #'Compute iHH, iES and inES over a whole chromosome without cut-offs #'@description Compute integrated EHH (iHH), integrated EHHS (iES) and integrated normalized EHHS (inES) for all markers of a chromosome (or linkage group). -#'This function computes the statistics by a slightly different algorithm than \code{\link{scan_hh}}: it sidesteps the calculation of EHH and EHHS values and their subsequent integration and -#'consequently no cut-offs relying on these values can be specified. Instead -#'it computes the full lengths of pairwise shared haplotypes and averages them afterwords. -#' -#'This function is (as yet) exclusively intended for the study of general properties of these statistics -#'using simulated data. The omission of all cut-offs is not recommended for a scan on experimental data. +#'This function computes the statistics by a slightly different algorithm than \code{\link{scan_hh}}: it sidesteps the calculation of EHH and EHHS values and their subsequent integration and +#'consequently no cut-offs relying on these values can be specified. Instead, +#'it computes the (full) lengths of pairwise shared haplotypes and averages them afterwords. #' +#'This function is primarily intended for the study of general properties of these statistics using simulated data. 
#'@param haplohh an object of class \code{haplohh} (see \code{\link{data2haplohh}}) #'@param phased logical. If \code{TRUE} (default) chromosomes are expected to be phased. If \code{FALSE}, the haplotype data is assumed to #'consist of pairwise ordered chromosomes belonging to diploid individuals. @@ -14,12 +12,14 @@ #'@param polarized logical. \code{TRUE} by default. If \code{FALSE}, use major and minor allele instead of ancestral and derived. If there #'are more than two alleles then the minor allele refers to the second-most frequent allele. #'@param maxgap maximum allowed gap in bp between two markers. If exceeded, further calculation of EHH(S) is stopped at the gap -#'(default=\code{NA}, i.e no limitation). -#'@param discard_integration_at_border logical. If \code{TRUE} (default) and computation of any of the statistics reaches first or last +#'(default=\code{NA}, i.e. no limitation). +#'@param max_extend maximum distance in bp to extend shared haplotypes away from the focal marker. +#'(default \code{NA}, i.e. no limitation). +#'@param discard_integration_at_border logical. If \code{TRUE} (default) and computation of any of the statistics reaches first or last #'marker or a gap larger than \code{maxgap}, iHH, iES and inES are set to \code{NA}. #'@param geometric.mean logical. If \code{FALSE} (default), the standard arithmetic mean is used to average -#'shared haplotype lengths. If \code{TRUE} -#'the geometric mean is used instead (IES values are undefined in this case). Note that usage of the geometric mean has not +#'shared haplotype lengths. If \code{TRUE} +#'the geometric mean is used instead (IES values are undefined in this case). Note that usage of the geometric mean has not #'yet been studied formally and should be considered experimental! #'@param threads number of threads to parallelize computation #' @@ -28,18 +28,24 @@ #'the computation of EHH and EHHS values and their stepwise integration. 
Instead, the length of all shared haplotypes #'is computed and afterwords averaged. In the absence of missing values the #'statistics are identical to those calculated by \code{\link{scan_hh}} with settings -#'\code{limehh = 0}, \code{limehhs = 0} and \code{interpolate = FALSE}, yet this function is faster. -#'The former two settings are however not recommended for the application on experimental data -#'(see vignette). +#'\code{limehh = 0}, \code{limehhs = 0}, \code{lower_ehh_y_bound = 0} and \code{interpolate = FALSE}, yet this function is faster. +#' +#'Application of a cut-off is necessary for reducing the spurious signals +#'of selection caused by single shared haplotypes of extreme length. Hence, e.g. for human experimental data +#'it might be reasonable to set \code{max_extend} to 1 or 2 Mb. #' -#'If \code{discard_integration_at_border} is set to \code{TRUE} and the extension of shared haplotypes -#'reaches a border (i.e. chromosomal boundaries or a gap larger than \code{maxgap}), this function discards all statistics, -#'while \code{\link{scan_hh}} handles each statistic independently. +#'\code{\link{scan_hh}} computes the statistics iHH_A, iHH_D and iES/inES separately, +#'while this function calculates them simultaneously. Hence, +#'if \code{discard_integration_at_border} is set to \code{TRUE} and the extension of shared haplotypes +#'reaches a border (i.e. chromosomal boundaries or a gap larger than \code{maxgap}), this function discards +#'all statistics. #' -#'\code{\link{scan_hh}} "removes" chromosomes with missing values from further calculations, -#'while this function treats each missing value -#'as a different allele. This yields a somewhat faster decay of all statistics with respect to the -#'distance to the focal marker. +#'The handling of missing values is different, too: \code{\link{scan_hh}} "removes" chromosomes with missing values from further calculations.
+#'EHH and EHHS are then calculated for the remaining chromosomes which can accidentally yield an increase in EHH or EHHS. +#'This can not happen with \code{scan_hh_full()} which treats each missing value of a marker +#'as if it were a new allele - terminating any shared haplotype, but does not change the +#'set of considered chromosomes. Thus, missing values +#'cause a faster decay of EHH(S) with function \code{scan_hh_full()}. #'@return The returned value is a dataframe with markers in rows and the following columns #'\enumerate{ #'\item chromosome name @@ -47,20 +53,22 @@ #'\item sample frequency of the ancestral / major allele #'\item sample frequency of the second-most frequent remaining allele #'\item number of evaluated haplotypes at the focal marker for the ancestral / major allele -#'\item number of evaluated haplotypes at the focal marker for the second-most frequent remaining allele +#'\item number of evaluated haplotypes at the focal marker for the second-most frequent remaining allele #'\item iHH of the ancestral / major allele #'\item iHH of the second-most frequent remaining allele #'\item iES (used by Sabeti et al 2007) #'\item inES (used by Tang et al 2007)} #'Note that in case of unphased data the evaluation is restricted to #'haplotypes of homozygous individuals which reduces the power -#'to detect selection, particularly for iHS (for appropriate parameter setting +#'to detect selection, particularly for iHS (for appropriate parameter setting #'see the main vignette and Klassmann et al (2020)). #' # #'@references Gautier, M. and Naves, M. (2011). Footprints of selection in the ancestral admixture of a New World Creole cattle breed. \emph{Molecular Ecology}, \strong{20}, 3128-3143. #' -#'Klassmann A., Vitalis R., and Gautier M. Detecting selection using Extended Haplotype Homozygosity (EHH)-based statistics on unphased or unpolarized data. Preprint. https://doi.org/10.22541/au.158584282.24875401. #' +#'Klassmann, A. and Gautier, M. (2020).
Detecting selection using Extended Haplotype +#'Homozygosity-based statistics on unphased or unpolarized data (preprint). +#'https://doi.org/10.22541/au.160405572.29972398/v1 #' #'Sabeti, P.C. et al. (2002). Detecting recent positive selection in the human genome from haplotype structure. \emph{Nature}, \strong{419}, 832-837. #' @@ -69,19 +77,26 @@ #'Tang, K. and Thornton, K.R. and Stoneking, M. (2007). A New Approach for Using Genome Scans to Detect Recent Positive Selection in the Human Genome. \emph{Plos Biology}, \strong{7}, e171. #' #'Voight, B.F. and Kudaravalli, S. and Wen, X. and Pritchard, J.K. (2006). A map of recent positive selection in the human genome. \emph{Plos Biology}, \strong{4}, e72. -#'@seealso \code{\link{data2haplohh}}, code{\link{scan_hh}}, -#'\code{\link{ihh2ihs}},\code{\link{ines2rsb}}, \code{\link{ies2xpehh}} +#'@seealso \code{\link{data2haplohh}}, \code{\link{scan_hh}}, +#'\code{\link{ihh2ihs}}, \code{\link{ines2rsb}}, \code{\link{ies2xpehh}} #'@examples #'#example haplohh object (280 haplotypes, 1424 SNPs) #'#see ?haplohh_cgu_bta12 for details #'data(haplohh_cgu_bta12) -#'scan <- scan_hh_full(haplohh_cgu_bta12) +#'#using function scan_hh() with no cut-offs +#'scan <- scan_hh(haplohh_cgu_bta12, discard_integration_at_border = FALSE, +#'limehh = 0, limehhs = 0, lower_ehh_y_bound = 0, interpolate = FALSE) +#'#using function scan_hh_full() +#'scan_full <- scan_hh_full(haplohh_cgu_bta12, discard_integration_at_border = FALSE) +#'#both yield identical results within numerical precision +#'all.equal(scan, scan_full) #'@export scan_hh_full <- function(haplohh, phased = TRUE, polarized = TRUE, maxgap = NA, + max_extend = NA, discard_integration_at_border = TRUE, geometric.mean = FALSE, threads = 1) { @@ -89,8 +104,19 @@ scan_hh_full <- if (!(is.haplohh(haplohh))) { stop("Data is not a valid haplohh object.", call. = FALSE) } + if (!is.na(maxgap) & (!is.numeric(maxgap) | maxgap < 1)) { + stop("maxgap must be a positive integer number.", call. 
= FALSE) + } + if (!is.na(max_extend) & + (!is.numeric(max_extend) | max_extend < 1)) { + stop("max_extend must be a positive integer number.", call. = FALSE) + } + if (is.na(maxgap)) { - maxgap <- (max(positions(haplohh)) + 1) + maxgap <- diff(range(positions(haplohh))) + 1 + } + if (is.na(max_extend)) { + max_extend <- diff(range(positions(haplohh))) + 1 } ##perform calculation @@ -123,7 +149,7 @@ scan_hh_full <- }) res.list <- .Call( - "CALL_SCAN_HH2", + "CALL_SCAN_HH_FULL", haplo(haplohh), nhap(haplohh), nmrk(haplohh), @@ -133,6 +159,7 @@ scan_hh_full <- allele_stats[2, ], positions(haplohh), as.integer(maxgap), + as.integer(max_extend), as.integer(phased), as.integer(discard_integration_at_border), as.integer(geometric.mean), @@ -163,14 +190,16 @@ scan_hh_full <- #if not polarized, change column names if (!polarized) { colnames(res)[3:10] <- - c("FREQ_MAJ", + c( + "FREQ_MAJ", "FREQ_MIN", "NHAPLO_MAJ", "NHAPLO_MIN", "IHH_MAJ", "IHH_MIN", "IES", - "INES") + "INES" + ) } return(res) diff --git a/R/subset.haplohh.R b/R/subset.haplohh.R index 003f92f..92163db 100644 --- a/R/subset.haplohh.R +++ b/R/subset.haplohh.R @@ -15,6 +15,8 @@ #'In case of multi-allelic markers the second-most frequent allele is referred to as minor allele. #'Setting this value to zero eliminates monomorphic sites. Default is \code{NA}, #'hence no constraint. +#'@param max_alleles threshold for the maximum number of different alleles at a site. Default is \code{NA}, +#'hence no restriction. In order to retain only bi-allelic markers, set this parameter to 2. #'@param verbose logical. If \code{TRUE} (default), report verbose progress. #'@param ... further arguments are ignored. #'@seealso \code{\link{haplohh-class}}, \code{\link{data2haplohh}} @@ -31,6 +33,7 @@ subset.haplohh <- min_perc_geno.hap = NA, min_perc_geno.mrk = 100, min_maf = NA, + max_alleles = NA, verbose = TRUE, ...) 
{ # check parameters @@ -53,6 +56,12 @@ subset.haplohh <- stop("min_maf should lie in the interval [0,0.5].", call. = FALSE) } } + ### max_alleles must be at least 2 + if (!is.na(max_alleles)) { + if (!is.numeric(max_alleles) | max_alleles < 2) { + stop("max_alleles should be at least 2.", call. = FALSE) + } + } ### check if object is valid haplohh if (!is.haplohh(x)) { stop("Data is not a valid object of class haplohh.", call. = FALSE) @@ -224,6 +233,37 @@ subset.haplohh <- } } + if (!is.na(max_alleles)) { + if (verbose) + cat("Discard markers with more than", + max_alleles, + "different alleles.\n") + mrk_sel <- + apply(x@haplo, 2, function(x) { + length(unique(na.omit(x))) <= max_alleles + }) + if (sum(mrk_sel) == nmrk(x)) { + if (verbose) + cat("No marker discarded.\n") + } else{ + if (verbose) + cat(nmrk(x) - sum(mrk_sel), "markers discarded.\n") + x@haplo <- x@haplo[, mrk_sel, drop = FALSE] + x@positions <- x@positions[mrk_sel] + if (verbose) + cat(nmrk(x), "markers remaining.\n") + + if (nmrk(x) == 0) { + warning( + "No marker left after filtering on maximal number of different alleles.\n", + call. = FALSE, + immediate. 
= TRUE + ) + return(x) + } + } + } + if (verbose) cat("Data consists of", nhap(x), diff --git a/build/vignette.rds b/build/vignette.rds index af7cc9262523cdc72193e57a86db81c16851c984..78514cf732df874e8553d51ecda0e11443d977af 100644 GIT binary patch literal 239 zcmVoJGftY>3s2N+y@6XHG0Be;pQNT3Y2X++#mK^(WZtDP=M z%VowR&_X8EneF8{1FeCda>WaAyofXKd9!E!vt;}}80*x`d+o)R=&JpXJ!0IKIjn+K z_tLYk{D*fM&Y~^-Or%g!(bkCe9UT)<8b^loBcVljQaF#OdIAhW5Ejw_002;fZ$SV6 literal 227 zcmV<90381xiwFP!0000019ebK4uUWct$-**qLB;q1e`*Ri5p{FxNT$%Bzz35L3f@U z9AJt<;$nVp-s`;d;xfhzW?>j6wx*BC##j;VU|qmdr1u15a7shM<^@DiI}f?*Epd6A z@eq`V6WT2IdC67I9};*Y9tpkvNTgD&aoQ^_Fl4)5ON7{+A?*B?*YVj;{exQ#HlbIh znMgs>(k9T_#R+J_Iad8(9ydE`C)&qs19!+mssHH!uQD%i9v*UvybSa)uW|OqRM&Mi dc63dI)Q&XhhC+$pR-hiI>H{{#GVjI#005?dW(NQO diff --git a/inst/doc/examples.R b/inst/doc/examples.R index 2c8d098..1863434 100644 --- a/inst/doc/examples.R +++ b/inst/doc/examples.R @@ -1,5 +1,39 @@ ## ----setup, include=FALSE----------------------------------------------------- -knitr::opts_chunk$set(comment = ">", fig.height = 4.5, fig.width = 4.5, fig.show = "hold") +knitr::opts_chunk$set(comment = ">", fig.align = 'center', fig.height = 4.5, fig.width = 4.5, fig.show = "hold") + +## ----echo = FALSE, fig.height = 3, fig.width = 3------------------------------ +oldpar = par(mar = rep(0.1, 4)) +plot.new() +seq = c( + "AACTCAGACGA", + "AAGCGACAACT", + "ACGTCACACCA", + "AACCCAGCACT", + "AAGCCGGACCA", + "AAGCCGGACCA", + "GAGCCGGACCT", + "AAGCCGGACCT" +) +for (i in seq_along(seq)) { + n = strsplit(seq[i], "")[[1]] + text(((0:10) + 0.5) / 11, (8 - i) / 8 + 1 / 16, n) +} +transparent_red <- adjustcolor("red", alpha.f = 0.5) +transparent_blue <- adjustcolor("blue", alpha.f = 0.5) +polygon( + c(0, 11, 11, 0, 0, 1, 1, 0, 0) / 11, + c(4, 4, 0, 0, 1, 1, 2, 2, 4) / 8, + border = transparent_red, + col = transparent_red +) +polygon( + c(3, 7, 7, 8, 8, 7, 7, 4, 4, 3, 3, 5, 5, 3) / 11, + c(8, 8, 7, 7, 5, 5, 4, 4, 5, 5, 6, 6, 7, 7) / 8, + border = transparent_blue, + col 
= transparent_blue +) +polygon(c(5, 6, 6, 5) / 11, c(8, 8, 0, 0) / 8, border = "black") +par(oldpar) ## ----library, message = FALSE------------------------------------------------- library(rehh) @@ -40,7 +74,7 @@ hh_vcf <- data2haplohh(hap_file = "example1.vcf", verbose = FALSE) identical(hh, hh_vcf) -## ----------------------------------------------------------------------------- +## ----hhplot1, fig.cap = "Graphical output of the plot.haplohh() function"----- plot(hh) ## ----------------------------------------------------------------------------- @@ -88,7 +122,7 @@ cr ## ----manhattan11, fig.align = 'center', fig.cap = "Graphical output of the manhattanplot() function", fig.pos = '!h', fig.lp = 'fig:'---- manhattanplot(ihs, threshold = c(-1.5,1.5), cr = cr, ylim = c(-2.5,2.5), pch = 20) -## ----furcation11, fig.align = 'center', fig.cap = "Graphical output of the plot.furcation() function", fig.pos = '!h', fig.lp = 'fig:'---- +## ----furcation11, fig.cap = "Graphical output of the plot.furcation() function", fig.pos = '!h', fig.lp = 'fig:'---- f <- calc_furcation(hh, mrk = "rs6") # set equal plot margins on left and right side and save old ones oldpar <- par(mar = (c(5, 3, 4, 3) + 0.1)) @@ -103,7 +137,7 @@ plot(f, # reset old margins par(oldpar) -## ----newick1, , eval = requireNamespace("ape", quietly = TRUE), fig.align = 'center', fig.cap = "Graphical output of the plot.phylo() function of package ape", fig.pos = '!h', fig.lp = 'fig:'---- +## ----newick1, eval = requireNamespace("ape", quietly = TRUE), fig.align = 'center', fig.cap = "Graphical output of the plot.phylo() function of package ape", fig.pos = '!h', fig.lp = 'fig:'---- newick <- as.newick(f, allele = 0, side = "left", @@ -183,7 +217,7 @@ hh_vcf <- data2haplohh(hap_file = "example2.vcf", verbose = FALSE) identical(hh, hh_vcf) -## ----------------------------------------------------------------------------- +## ----hhplot2, fig.cap = "Graphical output of the plot.furcation() function"---- plot(hh) 
## ----------------------------------------------------------------------------- @@ -233,11 +267,11 @@ hh_subset = subset(hh, scan <- scan_hh(hh_subset, discard_integration_at_border = FALSE) scan -## ----manhattan22, fig.align = 'center', fig.cap = "Graphical output of the manhattanplot() function", fig.pos = '!h', fig.lp = 'fig:'---- +## ----manhattan22, fig.cap = "Graphical output of the manhattanplot() function", fig.pos = '!h', fig.lp = 'fig:'---- ihs <- ihh2ihs(scan, freqbin = 1, verbose = FALSE) manhattanplot(ihs, threshold = c(-1.5, 1.5), ylim = c(-2.5,2.5), pch = 20) -## ----furcation21, fig.align = 'center', fig.cap = "Graphical output of the plot.furcation() function", fig.pos = '!h', fig.lp = 'fig:'---- +## ----furcation21, fig.cap = "Graphical output of the plot.furcation() function", fig.pos = '!h', fig.lp = 'fig:'---- f <- calc_furcation(hh, mrk = "rs6") # set equal plot margins on left and right side and save old ones oldpar <- par(mar = (c(5, 3, 4, 3) + 0.1)) @@ -248,7 +282,7 @@ plot(f, legend.xy.coords = "none") par(oldpar) -## ----newick2, , eval = requireNamespace("ape", quietly = TRUE), fig.align = 'center', fig.cap = "Graphical output of the plot.phylo() function of package ape", fig.pos = '!h', fig.lp = 'fig:'---- +## ----newick2, eval = requireNamespace("ape", quietly = TRUE), fig.align = 'center', fig.cap = "Graphical output of the plot.phylo() function of package ape", fig.pos = '!h', fig.lp = 'fig:'---- newick <- as.newick(f, allele = 0, side = "left", diff --git a/inst/doc/examples.Rmd b/inst/doc/examples.Rmd index cb5a5b8..89f0db2 100644 --- a/inst/doc/examples.Rmd +++ b/inst/doc/examples.Rmd @@ -6,24 +6,72 @@ output: bookdown::html_document2: base_format: rmarkdown::html_vignette toc: yes + bookdown::pdf_document2: + toc: yes + fig_caption: yes + number_sections: yes +fontsize: 12 pt +urlcolor: blue bibliography: vignette.bib csl: genetics.csl +header-includes: + - \numberwithin{equation}{section} vignette: > 
\usepackage[utf8]{inputenc} + \usepackage{amsmath} %\VignetteIndexEntry{Examples in detail} %\VignetteEncoding{UTF-8} %\VignetteEngine{knitr::rmarkdown} --- ```{r setup, include=FALSE} -knitr::opts_chunk$set(comment = ">", fig.height = 4.5, fig.width = 4.5, fig.show = "hold") +knitr::opts_chunk$set(comment = ">", fig.align = 'center', fig.height = 4.5, fig.width = 4.5, fig.show = "hold") ``` \clearpage # Overview -This vignette focuses on two small example data sets delivered with the package *rehh* (see main vignette). They have been constructed to ease comprehension of the relevant statistics and functionality of the package. The first example has been already discussed in [@Gautier2017] while the second set is an extension to include multiple markers and missing values. +Despite a bewildering nomenclature, the idea of *Extended Haplotype Homozygosity* is simple. Consider the following alignment of nucleotide sequences where only bi-allelic sites have been retained: + +```{r echo = FALSE, fig.height = 3, fig.width = 3} +oldpar = par(mar = rep(0.1, 4)) +plot.new() +seq = c( + "AACTCAGACGA", + "AAGCGACAACT", + "ACGTCACACCA", + "AACCCAGCACT", + "AAGCCGGACCA", + "AAGCCGGACCA", + "GAGCCGGACCT", + "AAGCCGGACCT" +) +for (i in seq_along(seq)) { + n = strsplit(seq[i], "")[[1]] + text(((0:10) + 0.5) / 11, (8 - i) / 8 + 1 / 16, n) +} +transparent_red <- adjustcolor("red", alpha.f = 0.5) +transparent_blue <- adjustcolor("blue", alpha.f = 0.5) +polygon( + c(0, 11, 11, 0, 0, 1, 1, 0, 0) / 11, + c(4, 4, 0, 0, 1, 1, 2, 2, 4) / 8, + border = transparent_red, + col = transparent_red +) +polygon( + c(3, 7, 7, 8, 8, 7, 7, 4, 4, 3, 3, 5, 5, 3) / 11, + c(8, 8, 7, 7, 5, 5, 4, 4, 5, 5, 6, 6, 7, 7) / 8, + border = transparent_blue, + col = transparent_blue +) +polygon(c(5, 6, 6, 5) / 11, c(8, 8, 0, 0) / 8, border = "black") +par(oldpar) +``` -The pattern of variation seen in the sets is intended to reflect an evolutionary scenario of an "on-going selective sweep" with the derived allele 
of the central marker experiencing strong selection. +The colored areas mark the maximal extension to which at least two sequences carrying the same *focal allele* are identical, i.e. homozygous to each other. The average length of all sequence-pairwise *shared haplotypes* yields the *iHH* scores for the two central alleles, respectively. The (unstandardized) *iHS* value is the log ratio of them. The statistics *XP-EHH* and *Rsb* are constructed in the same way with the two alleles replaced by two populations and while *Rsb* is normalized to 1 at the focal position, *XP-EHH* is not. That's all! + +This vignette analyses in great detail two small example data sets delivered with the package *rehh* (see main vignette). They have been constructed to ease comprehension of the relevant statistics and functionality of the package. The first example has been already discussed in [@Gautier2017] while the second set is an extension to include multiple markers and missing values. The modifications for unphased or unpolarized data have been described in [@Klassmann2020]. + +The pattern of variation seen in the sets and in the alignment above is intended to reflect an evolutionary scenario of an "on-going selective sweep" with one allele of the central marker experiencing strong selection. The package has to be installed and then loaded by ```{r library, message = FALSE} @@ -96,8 +144,8 @@ identical(hh, hh_vcf) ### Visualizing the sequences -The haplohh-object can be visualized by a simple plot command: -```{r} +The haplohh-object can be visualized by a simple plot that shows ancestral alleles in blue and derived ones in red: +```{r hhplot1, fig.cap = "Graphical output of the plot.haplohh() function"} plot(hh) ``` @@ -213,7 +261,7 @@ manhattanplot(ihs, threshold = c(-1.5,1.5), cr = cr, ylim = c(-2.5,2.5), pch = 2 A furcation plot represents a more fine-grained visualization of the homozygosity decay. 
In particular, individual haplotypes can be discerned which may instigate further investigations. The labels plotted in Figure \@ref(fig:furcation11) are set in bold face, if the branches with which they are associated encompass further haplotypes. -```{r furcation11, fig.align = 'center', fig.cap = "Graphical output of the plot.furcation() function", fig.pos = '!h', fig.lp = 'fig:'} +```{r furcation11, fig.cap = "Graphical output of the plot.furcation() function", fig.pos = '!h', fig.lp = 'fig:'} f <- calc_furcation(hh, mrk = "rs6") # set equal plot margins on left and right side and save old ones oldpar <- par(mar = (c(5, 3, 4, 3) + 0.1)) @@ -231,7 +279,7 @@ par(oldpar) A furcation diagram consists of trees for each allele and both sides ("left" and "right") of the marker. The individual trees can be exported into a string in *Newick* format to be rendered by external programs, e.g. the phylogenetic R-package [ape](https://cran.r-project.org/package=ape), see Figure \@ref(fig:newick1). -```{r newick1, , eval = requireNamespace("ape", quietly = TRUE), fig.align = 'center', fig.cap = "Graphical output of the plot.phylo() function of package ape", fig.pos = '!h', fig.lp = 'fig:'} +```{r newick1, eval = requireNamespace("ape", quietly = TRUE), fig.align = 'center', fig.cap = "Graphical output of the plot.phylo() function of package ape", fig.pos = '!h', fig.lp = 'fig:'} newick <- as.newick(f, allele = 0, side = "left", @@ -357,7 +405,7 @@ identical(hh, hh_vcf) ### Visualizing the sequences The haplohh-object can be visualized by a simple plot command: -```{r} +```{r hhplot2, fig.cap = "Graphical output of the plot.furcation() function"} plot(hh) ``` @@ -447,7 +495,7 @@ Note that the value of *EHH_D*, now representing the allele with internal coding However, with so few *EHH* values due to missing values, there is not much signal left and a standardization by `ihh2ihs()` averages the alleged outlier away as can be observed in Figure \@ref(fig:manhattan22). 
-```{r manhattan22, fig.align = 'center', fig.cap = "Graphical output of the manhattanplot() function", fig.pos = '!h', fig.lp = 'fig:'} +```{r manhattan22, fig.cap = "Graphical output of the manhattanplot() function", fig.pos = '!h', fig.lp = 'fig:'} ihs <- ihh2ihs(scan, freqbin = 1, verbose = FALSE) manhattanplot(ihs, threshold = c(-1.5, 1.5), ylim = c(-2.5,2.5), pch = 20) ``` @@ -457,7 +505,7 @@ manhattanplot(ihs, threshold = c(-1.5, 1.5), ylim = c(-2.5,2.5), pch = 20) A furcation diagram can show the pattern for all three alleles of the focal marker `rs6`. (Pseudo-)furcations that arise from the removal of chromosomes due to missing values are marked by dashed lines as depicted in Figure \@ref(fig:furcation21). -```{r furcation21, fig.align = 'center', fig.cap = "Graphical output of the plot.furcation() function", fig.pos = '!h', fig.lp = 'fig:'} +```{r furcation21, fig.cap = "Graphical output of the plot.furcation() function", fig.pos = '!h', fig.lp = 'fig:'} f <- calc_furcation(hh, mrk = "rs6") # set equal plot margins on left and right side and save old ones oldpar <- par(mar = (c(5, 3, 4, 3) + 0.1)) @@ -471,7 +519,7 @@ par(oldpar) Again, it is possible to export each tree into Newick format. This format, however, has no option to mark different kinds of branches. We let package *ape* render the Newick string to yield Figure \@ref(fig:newick2). 
-```{r newick2, , eval = requireNamespace("ape", quietly = TRUE), fig.align = 'center', fig.cap = "Graphical output of the plot.phylo() function of package ape", fig.pos = '!h', fig.lp = 'fig:'} +```{r newick2, eval = requireNamespace("ape", quietly = TRUE), fig.align = 'center', fig.cap = "Graphical output of the plot.phylo() function of package ape", fig.pos = '!h', fig.lp = 'fig:'} newick <- as.newick(f, allele = 0, side = "left", diff --git a/inst/doc/examples.html b/inst/doc/examples.html index d6279a5..19e5590 100644 --- a/inst/doc/examples.html +++ b/inst/doc/examples.html @@ -12,7 +12,7 @@ - + Examples in detail @@ -29,6 +29,22 @@ } }); + @@ -315,7 +331,7 @@

Examples in detail

Alexander Klassmann

-

2020-07-17

+

2020-11-01

@@ -359,8 +375,11 @@

2020-07-17

1 Overview

-

This vignette focuses on two small example data sets delivered with the package rehh (see main vignette). They have been constructed to ease comprehension of the relevant statistics and functionality of the package. The first example has been already discussed in (Gautier et al. 2017) while the second set is an extension to include multiple markers and missing values.

-

The pattern of variation seen in the sets is intended to reflect an evolutionary scenario of an “on-going selective sweep” with the derived allele of the central marker experiencing strong selection.

+

Despite a bewildering nomenclature, the idea of Extended Haplotype Homozygosity is simple. Consider the following alignment of nucleotide sequences where only bi-allelic sites have been retained:

+

+

The colored areas mark the maximal extension to which at least two sequences carrying the same focal allele are identical, i.e. homozygous to each other. The average length of all sequence-pairwise shared haplotypes yields the iHH score of each of the two central alleles. The (unstandardized) iHS value is the log ratio of these two scores. The statistics XP-EHH and Rsb are constructed in the same way, with the two alleles replaced by two populations; Rsb is normalized to 1 at the focal position, whereas XP-EHH is not. That’s all!

+

This vignette analyses two small example data sets delivered with the package rehh in great detail (see main vignette). They have been constructed to ease comprehension of the relevant statistics and functionality of the package. The first example has already been discussed in (Gautier et al. 2017), while the second set is an extension to include multiple markers and missing values. The modifications for unphased or unpolarized data have been described in (Klassmann and Gautier 2020).

+

The pattern of variation seen in the sets and in the alignment above is intended to reflect an evolutionary scenario of an “on-going selective sweep” with one allele of the central marker experiencing strong selection.

The package has to be installed and then loaded by

library(rehh)
@@ -463,9 +482,14 @@

2.1.2 Input options

2.2 Calculations and visualizations

2.2.1 Visualizing the sequences

-

The haplohh-object can be visualized by a simple plot command:

+

The haplohh-object can be visualized by a simple plot that shows ancestral alleles in blue and derived ones in red:

plot(hh)
-

+
+Graphical output of the plot.haplohh() function +

+Figure 2.1: Graphical output of the plot.haplohh() function +

+

Note, however, that this kind of plot is intended only for relatively small data sets.

@@ -523,12 +547,12 @@

2.2.2 EHH

The starting set for the derived allele consists of {HG3_1, HG3_2, HG4_1 and HG4_2}. Extending to the right, the corresponding haplotypes remain homozygous and consequently the set is not split until marker rs11. In particular we have \[EHH^d_{rs6,rs7}=EHH^d_{rs6,rs8}=\frac{1}{4\cdot3}\sum_{k=1}^14\cdot3=1\] and essentially the same situation on the left side of the focal marker.

-

The corresponding plot (Figure 2.1) shows that EHH of the ancestral allele decays more rapidly than that of the derived allele.

+

The corresponding plot (Figure 2.2) shows that EHH of the ancestral allele decays more rapidly than that of the derived allele.

plot(res)
Graphical output of the plot.ehh() function

-Figure 2.1: Graphical output of the plot.ehh() function +Figure 2.2: Graphical output of the plot.ehh() function

Assume now that the haplotypes are not phased. That means, at each marker for which a diploid individual is heterozygous, it is unknown which allele belongs to chromosome ‘1’ and which to chromosome ‘2’. In this case the concept of extended haplotypes is not well-defined across individuals. However, we can still measure the decay of extended homozygosity within individuals. This is done by setting option phased to FALSE while assuming that the haplotypes in the input files are ordered as pairs belonging to individuals.

@@ -566,12 +590,12 @@

2.2.2 EHH

Again we can retrace the calculations by hand keeping in mind that EHH is now estimated by the fraction of homozygous individuals at each marker, see the corresponding formula in the main vignette.

Extending the shared haplotypes to the right, we find that both individuals HG1 and HG2 become heterozygous already at marker rs7 and hence EHH at this position becomes 0. Extending to the left, HG1 becomes heterozygous at marker rs5, while HG2 is still homozygous in the region spanning from rs6 to rs5. Hence the proportion of homozygous individuals at this marker is \(\frac{1}{2}\). At marker rs4 the second individual becomes heterozygous, too, and EHH yields 0.

By contrast, the individuals carrying the derived focal allele are homozygous for the entire chromosome except for marker rs1 where HG4 becomes heterozygous.

-

Figure 2.2 shows again the difference between EHH of the two core alleles:

+

Figure 2.3 shows again the difference between EHH of the two core alleles:

plot(res)
Graphical output of the plot.ehh() function

-Figure 2.2: Graphical output of the plot.ehh() function +Figure 2.3: Graphical output of the plot.ehh() function

@@ -610,12 +634,12 @@

2.2.3 EHHS

\[\mathrm{EHHS}_{rs6,rs6}=\frac{1}{n_s(n_s-1)}\left(\sum\limits_{k=1}^{K_{rs6,rs6}}n_k(n_k-1)\right)=\frac{1}{8\cdot7}(4\cdot3+4\cdot3)=\frac{3}{7}\] \[\mathrm{EHHS}_{rs6,rs7}=\frac{1}{n_s(n_s-1)}\left(\sum\limits_{k=1}^{K_{rs6,rs7}}n_k(n_k-1)\right)=\frac{1}{8\cdot7}(2\cdot1+2\cdot1+4\cdot3)=\frac{2}{7}\] \[\mathrm{EHHS}_{rs6,rs8}=\frac{1}{n_s(n_s-1)}\left(\sum\limits_{k=1}^{K_{rs6,rs8}}n_k(n_k-1)\right)=\frac{1}{8\cdot7}(1\cdot0+2\cdot1+1\cdot0+4\cdot3)=\frac{1}{4}\;.\]

-

By default, the following command shows the (un-normalized) EHHS values as in Figure 2.3. In order to draw the normalized values one can toggle the option nehhs to TRUE.

+

By default, the following command shows the (un-normalized) EHHS values as in Figure 2.4. In order to draw the normalized values one can toggle the option nehhs to TRUE.

plot(res)
Graphical output of the plot.ehhs() function

-Figure 2.3: Graphical output of the plot.ehhs() function +Figure 2.4: Graphical output of the plot.ehhs() function

@@ -674,18 +698,18 @@

2.2.4 “Genome-wide” scan

>      CHR START   END  EXTR_MRK
 > rs6 chr1 60000 60000 -1.938961

Under the assumption that most sites evolve neutrally, the standardized iHS values should follow a normal distribution with the sites under selection as outliers.

-

Obviously we do not have enough markers to fit a distribution, but a “genome-wide” plot of the ihs values shows clearly that the central marker is rather an outlier (as much as is possible for such a small sample), see Figure 2.4.

+

Obviously we do not have enough markers to fit a distribution, but a “genome-wide” plot of the ihs values shows clearly that the central marker is rather an outlier (as much as is possible for such a small sample), see Figure 2.5.

manhattanplot(ihs, threshold = c(-1.5,1.5), cr = cr, ylim = c(-2.5,2.5), pch = 20)
-Graphical output of the manhattanplot() function +Graphical output of the manhattanplot() function

-Figure 2.4: Graphical output of the manhattanplot() function +Figure 2.5: Graphical output of the manhattanplot() function

2.2.5 Furcations and haplotype length

-

A furcation plot represents a more fine-grained visualization of the homozygosity decay. In particular, individual haplotypes can be discerned which may instigate further investigations. The labels plotted in Figure 2.5 are set in bold face, if the branches with which they are associated encompass further haplotypes.

+

A furcation plot represents a more fine-grained visualization of the homozygosity decay. In particular, individual haplotypes can be discerned, which may prompt further investigation. The labels plotted in Figure 2.6 are set in bold face if the branches with which they are associated encompass further haplotypes.

f <- calc_furcation(hh, mrk = "rs6")
 # set equal plot margins on left and right side and save old ones
 oldpar <- par(mar = (c(5, 3, 4, 3) + 0.1))
@@ -702,10 +726,10 @@ 

2.2.5 Furcations and haplotype le
Graphical output of the plot.furcation() function

-Figure 2.5: Graphical output of the plot.furcation() function +Figure 2.6: Graphical output of the plot.furcation() function

-

A furcation diagram consists of trees for each allele and both sides (“left” and “right”) of the marker. The individual trees can be exported into a string in Newick format to be rendered by external programs, e.g. the phylogenetic R-package ape, see Figure ??.

+

A furcation diagram consists of trees for each allele and both sides (“left” and “right”) of the marker. The individual trees can be exported into a string in Newick format to be rendered by external programs, e.g. the phylogenetic R-package ape, see Figure 2.7.

newick <- as.newick(f, 
                     allele = 0, 
                     side = "left", 
@@ -719,13 +743,13 @@ 

2.2.5 Furcations and haplotype le edge.color = "blue", underscore = TRUE, no.margin = TRUE)

-
+
Graphical output of the plot.phylo() function of package ape

-(#fig:newick1, )Graphical output of the plot.phylo() function of package ape +Figure 2.7: Graphical output of the plot.phylo() function of package ape

-

The end points of shared extended haplotypes can be defined as the “last split” in a furcation, i.e. the positions until which at least two different chromosomes of the sample are homozygous. Calculation of shared haplotype length and its visualization in Figure 2.6 are called by:

+

The end points of shared extended haplotypes can be defined as the “last split” in a furcation, i.e. the positions until which at least two different chromosomes of the sample are homozygous. Calculation of shared haplotype length and its visualization in Figure 2.8 are called by:

h <- calc_haplen(f)
 plot(h, 
      hap.names = hap.names(hh), 
@@ -733,10 +757,10 @@ 

2.2.5 Furcations and haplotype le
Graphical output of the plot.haplen() function

-Figure 2.6: Graphical output of the plot.haplen() function +Figure 2.8: Graphical output of the plot.haplen() function

-

In case of unphased haplotypes, furcations can only occur within individuals which limits the informative value of furcation diagrams as in Figure 2.7.

+

In case of unphased haplotypes, furcations can only occur within individuals which limits the informative value of furcation diagrams as in Figure 2.9.

f <- calc_furcation(hh, mrk = "rs6", phased = FALSE)
 # set equal plot margins on left and right side and save old ones
 oldpar <- par(mar = (c(5, 3, 4, 3) + 0.1))
@@ -752,16 +776,16 @@ 

2.2.5 Furcations and haplotype le
Graphical output of the plot.haplen() function

-Figure 2.7: Graphical output of the plot.haplen() function +Figure 2.9: Graphical output of the plot.furcation() function

-

Nevertheless, the length of shared haplotypes, now identical to the ranges of individual homozygosity, can be calculated as before to yield Figure 2.8.

+

Nevertheless, the length of shared haplotypes, now identical to the ranges of individual homozygosity, can be calculated as before to yield Figure 2.10.

h <- calc_haplen(f)
 plot(h, hap.names = hap.names(hh))
Graphical output of the plot.haplen() function

-Figure 2.8: Graphical output of the plot.haplen() function +Figure 2.10: Graphical output of the plot.haplen() function

@@ -893,7 +917,12 @@

3.2 Calculations and visualizatio

3.2.1 Visualizing the sequences

The haplohh-object can be visualized by a simple plot command:

plot(hh)
-

+
+Graphical output of the plot.haplohh() function +

+Figure 3.1: Graphical output of the plot.haplohh() function +

+

3.2.2 EHH

@@ -930,12 +959,12 @@

3.2.2 EHH

> IHH_A IHH_D1 IHH_D2 > 18816.67 90012.50 43216.67

Note that the derived alleles are ordered by their internal coding.

-

Figure 3.1 shows clearly that the first derived allele has a strong extended homozygosity while the second derived allele is not that different from the ancestral allele.

+

Figure 3.2 shows clearly that the first derived allele has a strong extended homozygosity while the second derived allele is not that different from the ancestral allele.

plot(res)
Graphical output of the plot.ehh() function

-Figure 3.1: Graphical output of the plot.ehh() function +Figure 3.2: Graphical output of the plot.ehh() function

@@ -971,12 +1000,12 @@

3.2.3 EHHS

> > [[4]] > [1] 49005.95
-

Note that the number of evaluated haplotypes NHAPLO decreases with distance to the focal marker due to missing values which lead at each calculation step to subsequent removals of the respective chromosomes. This can sometimes yield a transient increase of EHHS (and in general, EHH, too) as can be seen at position 30 kb in Figure 3.2.

+

Note that the number of evaluated haplotypes NHAPLO decreases with distance to the focal marker due to missing values which lead at each calculation step to subsequent removals of the respective chromosomes. This can sometimes yield a transient increase of EHHS (and in general, EHH, too) as can be seen at position 30 kb in Figure 3.3.

plot(res)
Graphical output of the plot.ehh() function

-Figure 3.2: Graphical output of the plot.ehh() function +Figure 3.3: Graphical output of the plot.ehhs() function

@@ -1060,12 +1089,12 @@

3.2.4 “Genome-wide” scan

> $frequency.class > N_MRK MEAN_UNIHS SD_UNIHS LOWER_QT UPPER_QT > [0.05,0.95) 11 0.1787203 0.6140553 -0.7234433 0.9454923 -

The “genome-wide” ihs values are depicted in Figure 3.3.

+

The “genome-wide” ihs values are depicted in Figure 3.4.

manhattanplot(ihs, threshold = c(-1.5, 1.5), ylim = c(-2.5,2.5), pch = 20)
-Graphical output of the manhattanplot() function +Graphical output of the manhattanplot() function

-Figure 3.3: Graphical output of the manhattanplot() function +Figure 3.4: Graphical output of the manhattanplot() function

Now we know that actually the first derived allele at marker rs6 is of interest, but it is not present in the scan because it is not the major derived allele. How can we modify the scan to include it?

@@ -1111,19 +1140,19 @@

3.2.4 “Genome-wide” scan

> rs10 14500.000 27500.00 > rs11 4267.857 13050.00

Note that the value of EHH_D, now representing the allele with internal coding 1, is much higher at marker rs6 than before.

-

However, with so few EHH values due to missing values, there is not much signal left and a standardization by ihh2ihs() averages the alleged outlier away as can be observed in Figure 3.4.

+

However, with so few EHH values due to missing values, there is not much signal left and a standardization by ihh2ihs() averages the alleged outlier away as can be observed in Figure 3.5.

ihs <- ihh2ihs(scan, freqbin = 1, verbose = FALSE)
 manhattanplot(ihs, threshold = c(-1.5, 1.5), ylim = c(-2.5,2.5), pch = 20)
-Graphical output of the manhattanplot() function +Graphical output of the manhattanplot() function

-Figure 3.4: Graphical output of the manhattanplot() function +Figure 3.5: Graphical output of the manhattanplot() function

3.2.5 Furcations and haplotype length

-

A furcation diagram can show the pattern for all three alleles of the focal marker rs6. (Pseudo-)furcations that arise from the removal of chromosomes due to missing values are marked by dashed lines as depicted in Figure 3.5.

+

A furcation diagram can show the pattern for all three alleles of the focal marker rs6. (Pseudo-)furcations that arise from the removal of chromosomes due to missing values are marked by dashed lines as depicted in Figure 3.6.

f <- calc_furcation(hh, mrk = "rs6")
 # set equal plot margins on left and right side and save old ones
 oldpar <- par(mar = (c(5, 3, 4, 3) + 0.1))
@@ -1136,10 +1165,10 @@ 

3.2.5 Furcations and haplotype le
Graphical output of the plot.furcation() function

-Figure 3.5: Graphical output of the plot.furcation() function +Figure 3.6: Graphical output of the plot.furcation() function

-

Again, it is possible to export each tree into Newick format. This format, however, has no option to mark different kinds of branches. We let package ape render the Newick string to yield Figure ??.

+

Again, it is possible to export each tree into Newick format. This format, however, has no option to mark different kinds of branches. We let package ape render the Newick string to yield Figure 3.7.

newick <- as.newick(f, 
                     allele = 0, 
                     side = "left", 
@@ -1150,19 +1179,19 @@ 

3.2.5 Furcations and haplotype le edge.color = "blue", underscore = TRUE, no.margin = TRUE)

-
+
Graphical output of the plot.phylo() function of package ape

-(#fig:newick2, )Graphical output of the plot.phylo() function of package ape +Figure 3.7: Graphical output of the plot.phylo() function of package ape

-

Likewise, Figure 3.6 of the shared haplotype lengths covers all alleles of the focal marker.

+

Likewise, Figure 3.8 of the shared haplotype lengths covers all alleles of the focal marker.

h <- calc_haplen(f)
 plot(h, hap.names = hap.names(hh), legend.xy.coords = "none")
Graphical output of the plot.haplen() function

-Figure 3.6: Graphical output of the plot.haplen() function +Figure 3.8: Graphical output of the plot.haplen() function

Finally, to clean-up the working directory, we call

@@ -1176,6 +1205,9 @@

References

Gautier M., Klassmann A., Vitalis R., 2017 rehh 2.0: a reimplementation of the R package rehh to detect positive selection from haplotype structure. Molecular Ecology Resources 17: 78–90.

+
+

Klassmann A., Gautier M., 2020 Detecting selection using extended haplotype homozygosity-based statistics on unphased or unpolarized data (preprint). https://doi.org/10.22541/au.160405572.29972398/v1.

+

diff --git a/inst/doc/rehh.R b/inst/doc/rehh.R index 9fa05ea..34a02e5 100644 --- a/inst/doc/rehh.R +++ b/inst/doc/rehh.R @@ -7,7 +7,7 @@ library(rehh) ## ----make_examples, results = 'hide'------------------------------------------ make.example.files() -## ----minimalcodeexample, results = "hide"------------------------------------- +## ----minimalcodeexample, fig.align = 'center', results = "hide"--------------- hh <- # data input data2haplohh( hap_file = "bta12_cgu.hap", @@ -16,7 +16,8 @@ hh <- # data input allele_coding = "map" ) scan <- scan_hh(hh) # calculation of EHH and integration -ihs <- ihh2ihs(scan) # log ratio for alleles and normalization + # (combine results from different chromosomes) +ihs <- ihh2ihs(scan) # log ratio for alleles and standardization manhattanplot(ihs) # plot of the statistics ## ----map_inp------------------------------------------------------------------ @@ -298,6 +299,7 @@ manhattanplot(wgscan.ihs.cgu, main = "iHS (CGU cattle breed)", cr = cr.cgu, mrk = "F1205400", + inset = 1E+7, resolution = c(200000, 0.05)) # set back to default colors palette("default") @@ -362,7 +364,7 @@ newick <- as.newick(furcation, side = "left", hap.names = hap.names(haplohh_cgu_bta12)) -## ----newick, , eval = requireNamespace("ape", quietly = TRUE), fig.align = 'center', fig.width = 6, fig.height = 6, fig.lp = 'fig:', fig.cap = 'Graphical output of the ape::plot.phylo() function', fig.pos = "!h"---- +## ----newick, eval = requireNamespace("ape", quietly = TRUE), fig.align = 'center', fig.width = 6, fig.height = 6, fig.lp = 'fig:', fig.cap = 'Graphical output of the ape::plot.phylo() function', fig.pos = "!h"---- library(ape) tree <- ape::read.tree(text = newick) plot(tree, diff --git a/inst/doc/rehh.Rmd b/inst/doc/rehh.Rmd index 6cf7e53..d105989 100644 --- a/inst/doc/rehh.Rmd +++ b/inst/doc/rehh.Rmd @@ -1,20 +1,25 @@ --- -title: "Vignette for package *rehh* (version 3.1.2)" +title: "Vignette for package *rehh*" author: "Alexander Klassmann, Renaud 
Vitalis and Mathieu Gautier" date: "`r Sys.Date()`" output: bookdown::html_document2: base_format: rmarkdown::html_vignette toc: yes - bookdown::pdf_book: + bookdown::pdf_document2: toc: yes fig_caption: yes number_sections: yes +fontsize: 12 pt +urlcolor: blue bibliography: vignette.bib csl: genetics.csl +header-includes: + - \numberwithin{equation}{section} vignette: > \usepackage[utf8]{inputenc} - %\VignetteIndexEntry{How to use rehh} + \usepackage{amsmath} + %\VignetteIndexEntry{Vignette for package *rehh*} %\VignetteEncoding{UTF-8} %\VignetteEngine{knitr::rmarkdown} --- @@ -25,7 +30,7 @@ knitr::opts_chunk$set(comment = ">",fig.height = 4.5, fig.width = 4.5, fig.show \clearpage # About the package -This vignette describes how the R package *rehh* can be applied to perform whole genome scans for footprints of selection using statistics related to *Extended Haplotype Homozygosity (EHH)* [@Sabeti2002]. +This vignette describes comprehensively how the R package *rehh* can be applied to perform whole genome scans for footprints of selection using statistics related to *Extended Haplotype Homozygosity (EHH)* [@Sabeti2002]. The vignette *Examples in detail* explains basic usage and methodology with the help of two tiny artificial data sets. The package accepts multi-allelic genetic markers as input. Typically, albeit not necessarily, these will be bi-allelic SNPs. @@ -35,28 +40,14 @@ library(rehh) ``` ## Background -The analysis of molecular population genetic data often comprises the search for genomic regions that might have experienced recent selection. Diverse approaches have been developed, reviewed e.g. in [@Oleksyk2010] and [@Vitti2013], however only a few have found wide-spread application [@Cadzow2014], [@Haasl2016]. 
To the latter belong *iHS* [Voight2006], *Rsb* [@Tang2007] and *XP-EHH* [@Sabeti2007], all of which are *summary statistics* aimed to distill a certain aspect of the genetic data into a single score and constructed in a way that extreme values are indicative of positive or "Darwinian" selection. *iHS* is intended for application on a single (presumably homogeneous) population, while *XP-EHH* and *Rsb* are targeted to differential selection between two populations. All three statistics are based on the concept of *Extended Haplotype Homozygosity (EHH)* as formulated by [@Sabeti2002]. +The analysis of molecular population genetic data often comprises the search for genomic regions that might have experienced recent selection. Diverse approaches have been developed; for reviews on methodology see [@Sabeti2006], [@Oleksyk2010] and [@Vitti2013] and for +practical advice [@Cadzow2014], [@Utsunomiya2015] and [@Weigand2018]. However only a few have found wide-spread application [@Haasl2016]. To the latter belong *iHS* [@Voight2006], *Rsb* [@Tang2007] and *XP-EHH* [@Sabeti2007], all of which are *summary statistics* aimed to distill a certain aspect of the genetic data into a single score and constructed in a way that extreme values are indicative of positive or "Darwinian" selection. *iHS* is intended for application on a single (presumably homogeneous) population, while *XP-EHH* and *Rsb* are targeted to differential selection between two populations. All three statistics are based on the concept of *Extended Haplotype Homozygosity (EHH)* as formulated by [@Sabeti2002]. *iHS* and *XP-EHH* can be calculated by the independent C++ command line tool *Hapbin* [@Maclean2015], which has been optimized for speed by exploiting bit-wise machine-level operations. The package *rehh* cannot compete on performance, but has the advantage of being able to work with multi-allelic markers and missing values. 
Moreover, it possesses a broader range of input and output options, including several graphical representations. ## Changes between versions 2.X and 3.X -The C routines responsible for the bulk of calculations have been rewritten and all of the R code has been largely revised and streamlined yielding the following new features: - -- the package accepts multi-allelic markers. -- support for input files in *variant call format* as well as the output format of the simulation program *ms* [@Hudson2002]. -- computation of statistics and their visualization has become separated into different functions. -- graphical plots are more customizable. -- furcation diagrams can be labeled and a related visualization of "haplotype length" was added. -- a function to define candidate regions of selection. -- adaptations for unphased haplotypes or unpolarized alleles. -- output in form of matrices has been replaced by data frames; some columns have become optional. -- all names of data frames are now in lower case letters while all column names contain exclusively capital letters and underscores. -- a new internal representation of alleles: previously, an ancestral allele was coded by 1, a derived allele by 2 and a missing value by 0. These codings have been replaced by 0, 1 and `NA`, respectively. Furthermore, the numbers are now explicitly of type "integer" instead of "numeric". -- a second vignette to explain the statistics involved and the functionality of the package using an invented tiny data set. -- additional options to yield virtually identical results with the program *hapbin* (see section \@ref(hapbin)). -- an inconsistency between implementation and documentation concerning the calculation of one-sided p-values has been cleared. - -**Due to changes in the API, although mostly small, the versions 3.X are not compatible with versions 2.X!** + +Due to changes in the API, although mostly small, the versions 3.X are not compatible with versions 2.X. 
Data objects of class `haplohh` (see below) created by versions up to 2.0.4 must be updated via the command `update_haplohh()` (see its documentation by typing `?update_haplohh`) in order to be accepted by the functions of the current version. @@ -68,9 +59,11 @@ For illustration purposes, several example input files as well as R objects are - Two tiny invented examples, each in our "standard" haplotype format (see section \@ref(input)) and in *variant call format* (*vcf*). The first example was used in [@Gautier2017] for the explanation of the various statistics calculated by this package. The second example is an extension of the former including multi-allelic markers and missing values. Both sets are discussed in depth in a second vignette. + - Further three tiny examples used for the supplement on Site Frequency Spectrum-based methods of [@Klassmann2020]. + - An output file of the program *ms* containing two small simulated haplotype samples. - - Input files in different formats that originate from a study on the "Creole cattle breed from Guadeloupe" (CGU) [@Gautier2011]. All files contain the same set of phased SNPs of *Bos taurus* chromosome 12 from 140 individuals. + - A data set in various formats that originated from a study on the "Creole cattle breed from Guadeloupe" (CGU) [@Gautier2011]. All files contain the same set of phased SNPs of *Bos taurus* chromosome 12 from 140 individuals. All of the above files are copied into the current working directory via the command ```{r make_examples, results = 'hide'} @@ -81,7 +74,7 @@ make.example.files() ### R objects - - The data for chromosome 12 of the cattle study mentioned above as object of the *rehh* data class `haplohh`. This object becomes available by the command `data(haplohh_cgu_bta12)`. + - The data set for chromosome 12 of the cattle study mentioned above as object of the *rehh* data class `haplohh`. This object becomes available by the command `data(haplohh_cgu_bta12)`. 
- The scores *iHH* and *iES* as obtained by the function `scan_hh()` applied on SNPs of the whole genome for the population CGU (defined above) and another population EUT ("European taurine"). They reside in the accompanying package `rehh.data` and are obtained by `library(rehh.data)` followed by `data(wgscan.cgu)` resp. `data(wgscan.eut)`. @@ -91,15 +84,16 @@ The sequence of alleles on a chromosome is referred to as its *haplotype* and so ## Overview -The package calculates three statistics by the following steps: +The package calculates three statistics which can be used to perform whole-genome scans for selection: *iHS*, *XP-EHH* and *Rsb*. *iHS* compares alleles within a single population while the other two compare sites between populations. The calculation proceeds technically in five steps which are performed by running two commands and combining tables: -- each marker is taken in turn as a "focal marker" around which the extended haplotype homozygosity (*EHH*) is measured -- *EHH* is summerized into a number by integration -- two integrals are compared by taking their log ratio (two alleles for within-population or two populations for cross-population statistics) -- the genome-wide distribution of these log-ratios is normalized +- each marker is taken in turn as a "focal marker" around which the extended haplotype homozygosity is computed at further markers in increasing distance to the focal marker up to some stop criterion +- for each focal marker these quantities are "integrated" over the surrounding markers +- these integrals have to be calculated for each chromosome separately and the resulting tables to be combined to yield whole-genome statistics +- at each focal position, two such integrals are compared (either from two alleles or from two populations) by taking their log ratio +- the distribution of these log-ratios is standardized -Here is a minimal code example for a single population and a single chromossome: -```{r minimalcodeexample, results 
= "hide"} +Here is a minimal code example for calculating *iHS* on a single chromosome: +```{r minimalcodeexample, fig.align = 'center', results = "hide"} hh <- # data input data2haplohh( hap_file = "bta12_cgu.hap", @@ -108,14 +102,15 @@ hh <- # data input allele_coding = "map" ) scan <- scan_hh(hh) # calculation of EHH and integration -ihs <- ihh2ihs(scan) # log ratio for alleles and normalization + # (combine results from different chromosomes) +ihs <- ihh2ihs(scan) # log ratio for alleles and standardization manhattanplot(ihs) # plot of the statistics ``` # Data input {#input} The package *rehh* requires as input: - - a haplotype data file for each population of interest (see section \@ref(hapfile)). + - a haplotype data file (see section \@ref(hapfile)). and, if the haplotype data file is neither in *variant call format* nor in the format of *ms* output, @@ -130,8 +125,8 @@ Five haplotype input file formats are supported: - a *standard* haplotype format. Each row represents a haplotype with marker genotype in columns as in the example file `bta12_cgu.hap` containing 280 haplotypes with 1424 SNPs each (see section \@ref(LoadDataEx1)). The first element of each row is taken as a haplotype identifier. - a *transposed* format with haplotypes in columns and markers in rows as in the example file `bta12_cgu.thap`. This format is similar to the one produced by the phasing program *SHAPEIT2* [@OConnell2014]. This format assumes neither row nor column names and hence no haplotype identifiers can be specified (see section \@ref(LoadDataEx2)). - the output file format from the phasing program *fastPHASE* [@Scheet2006] as in the `bta12_hapguess_switch.out` example file. Note that this file format allows to include haplotypes from several populations (if the -u *fastPHASE* option was used) (see section \@ref(LoadDataEx3)). - - *variant call format (vcf)*, comprising both haplotype and marker information. 
In order to read files in this format, the package *vcfR* or the packages *data.table* and *R.utils* (the latter is needed for compressed files) have to be installed (see section \@ref(LoadDataEx4)). - - The output of the simulation program *ms* [@Hudson2002] and its derivatives *msHOT* [@Hellenthal2007], *ms^2^* [@Ewing2010] and *ms'* [@Kelleher2016]. In order to read these files, the package *gap* has to be installed (see section \@ref(LoadDataEx5)). + - *variant call format (vcf)*, comprising both haplotype and marker information. In order to read files in this format, the package [vcfR](https://cran.r-project.org/package=vcfR) or the packages [data.table](https://cran.r-project.org/package=data.table) and [R.utils](https://cran.r-project.org/package=R.utils) (the latter is needed for compressed files) have to be installed (see section \@ref(LoadDataEx4)). + - The output of the simulation program *ms* [@Hudson2002] and its derivatives *msHOT* [@Hellenthal2007], *ms^2^* [@Ewing2010] and *ms'* [@Kelleher2016]. In order to read these files, the package [gap](https://cran.r-project.org/package=gap) has to be installed (see section \@ref(LoadDataEx5)). Alleles in standard or transposed haplotype format can be provided either coded (by integer numbers) or without coding (e.g. as nucleotides) (see section \@ref(LoadData)). @@ -224,7 +219,7 @@ hh <- data2haplohh(hap_file = "bta12_cgu.hap", ### Example 2: reading haplotype file in transposed format (*SHAPEIT2*--like) {#LoadDataEx2} -If the haplotype input file `bta12_cgu.thap` is in "transposed" format, the option `haplotype.in.columns` has to be set to `TRUE` while all other parameters remain unaffected with respect to example 1. Note that this is the only format that has to be explicitly declared by the user. +If the haplotype input file is in "transposed" format (like `bta12_cgu.thap`), the option `haplotype.in.columns` has to be set to `TRUE` while all other parameters remain unaffected with respect to example 1. 
This is the only format which is not recognized automatically, but has to be explicitly declared by the user. ```{r example2} hh <- data2haplohh(hap_file = "bta12_cgu.thap", @@ -247,7 +242,7 @@ hh <- data2haplohh(hap_file = "bta12_hapguess_switch.out", allele_coding = "map") ``` -If no value is specified for the `popsel` argument and more than one population is detected in the *fastPHASE* output file, an error in produced and the available population numbers printed: +If no value is specified for the `popsel` argument and more than one population is detected in the *fastPHASE* output file, an error is produced and the available population numbers printed: ```{r error = TRUE} hh <- data2haplohh(hap_file = "bta12_hapguess_switch.out", @@ -263,11 +258,10 @@ Ancestral alleles are sometimes marked by upper case as "high confident" and by If the `AA` key is absent, the option `polarize_vcf` should be set to `FALSE` and the allele coding of the *vcf* file is directly used as internal coding. -If there is data for more than one chromosome in the file, the chromosome of interest has to be specified by `chr.name`. Since always the whole file is read in, it may be advisable for large data sets to create chromosome-specific files. +If there is data for more than one chromosome in the file, the chromosome of interest has to be specified by `chr.name`. Since always the whole file is read in, it is advisable to split large data sets into chromosome-specific files. In order to process *vcf* files, the package [vcfR](https://cran.r-project.org/package=vcfR) or the package -[data.table](https://cran.r-project.org/package=data.table) (which in turn needs [R.utils](https://cran.r-project.org/package=R.utils) to read compressed files) have to be installed. The parameter `vcf_reader` has to be set to either `"vcfR"` or `"data.table"`. 
[Note: at the time of writing, the package *vcfR* has been removed from CRAN, but can still be installed from https://github.com/knausb/vcfR, following instructions there.] - +[data.table](https://cran.r-project.org/package=data.table) (which in turn needs [R.utils](https://cran.r-project.org/package=R.utils) to read compressed files) have to be installed. The parameter `vcf_reader` has to be set to either `"vcfR"` or `"data.table"`. In the file `bta12_cgu.vcf.gz` the ancestral allele was set as reference and hence no further polarizing is necessary. @@ -280,11 +274,13 @@ hh <- data2haplohh(hap_file = "bta12_cgu.vcf.gz", ### Example 5: reading ms output {#LoadDataEx5} The function `data2haplohh()` automatically checks whether the haplotype file is in the output format of the simulation program *ms* [@Hudson2002]. If this is the case, the parameters `map_file` and `allele_coding` are ignored. If the file contains several 'runs' (as referred to by the parameter `nrep` of *ms*), it is necessary to specify the number of the run in option `chr.name`. Note that always the whole file is read, so that it might be advisable to spread large simulations over separate files. -One argument of the `data2haplohh` function is specifically dedicated to *ms* output, although it works with other formats as well: *ms* gives chromosomal positions as fractions of the interval [0,1] and in order to obtain more realistic values, these positions can be multiplied by a factor, set by `position_scaling_factor`. +One argument of the `data2haplohh` function is specifically dedicated to *ms* output, although it works with other formats as well: *ms* gives chromosomal positions as fractions of the interval [0,1] and in order to obtain more realistic values, these positions can be multiplied by a factor, set by `position_scaling_factor`. + +Note that *ms* output can contain multiple markers with the same (rounded) position, which *rehh* does not accept. 
In this case the numerical precision for chromosomal positions in the *ms* output should be increased (option `-p` of *ms*, option `-oformat` of *msms*). -Note that *rehh* does not accept multiple markers with the same position and hence it is highly recommended to increase the numerical precision for chromosomal positions in the *ms* output. +Setting `remove_multiple_markers` to `TRUE` entails that from consecutive markers with the same position only the first one is retained and a warning containing the number of removed markers is printed. Note that this effectively transforms the "infinite sites model" used for simulations by *ms* into a "finite sites model". -In order to read this format, the package [gap](https://cran.r-project.org/package=gap) has to be installed. +In order to read the *ms* format, the package [gap](https://cran.r-project.org/package=gap) has to be installed. ```{r ms_example, eval = requireNamespace("gap", quietly = TRUE)} hh <- data2haplohh(hap_file = "ms.out", @@ -309,7 +305,7 @@ hh_subset = subset(hh, select.mrk = -1) ## Definition and computation ### The (allele-specific) *Extended Haplotype Homozygosity (EHH)* {#EHH} -For any given allele $a$ of a focal marker $s$, sometimes referred to as a *core* allele, the *Extended Haplotype Homozygosity (EHH)* is defined as the probability that two randomly chosen chromosomes, carrying the core allele, are homozygous over a given surrounding chromosomal region [@Sabeti2002]. It is estimated from a sample by calculating the homozygosity of the chromosomal chunk between the focal marker and another marker $t$ by the formula +For an allele $a$ of a focal marker $s$, sometimes referred to as a *core* allele, the *Extended Haplotype Homozygosity (EHH)* is defined as the probability that two randomly chosen chromosomes, carrying the core allele, are homozygous over a given surrounding chromosomal region [@Sabeti2002]. 
It is estimated from a sample by calculating the homozygosity of the chromosomal chunk between the focal marker and another marker $t$ by the formula \begin{equation} \mathrm{EHH}_{s,t}^a=\frac{1}{n_{a}(n_a-1)}\sum\limits_{k=1}^{K^a_{s,t}}n_k(n_k-1) (\#eq:ehh) @@ -317,19 +313,21 @@ For any given allele $a$ of a focal marker $s$, sometimes referred to as a *core where $n_a$ represents the number of chromosomes carrying the core allele $a$, $K^a_{s,t}$ represents the number of **different** shared haplotypes and $n_k$ refers to the number of chromosomes pertaining to the $k$-th such shared haplotype. If there is no missing data, it holds that $n_a=\sum\limits_{k=1}^{K^a_{s,t}}n_k$. In the case of unphased chromosomes from diploid individuals (see section \@ref(phasing)) the extended haplotype homozygosity can be calculated as follows [@Tang2007]: -we consider only chromosomes from individuals that are homozygous for the allele $a$ at the focal marker $s$ and estimate *EHH* at some marker $t$ by the fraction of individuals that are (still) homozygous over the entire chromosomal stretch between $s$ and $t$. Let $I^a_{s,t}$ denote the number of individuals that are homozygous from marker $s$ til marker $t$. We can reformulate the fraction of individuals in terms of the fraction of shared haplotypes: since haplotypes of different individuals are not compared they can be regarded as distinct by definition, hence $K_{s,s}^a=I_{s,s}^a=\frac{1}{2}n_a$. With increasing distance of $t$ from $s$, any increase in $K_{s,t}^a$ is tantamount to a decrease of the number of homozygous individuals, yielding +we consider only chromosomes from individuals that are homozygous for the allele $a$ at the focal marker $s$ and estimate *EHH* at some marker $t$ by the fraction of individuals that are (still) homozygous over the entire chromosomal stretch between $s$ and $t$. Let $I^a_{s,t}$ denote the number of individuals that are homozygous from marker $s$ until marker $t$. 
\begin{equation} -\mathrm{EHH}_{s,t}^a=\frac{I_{s,t}^a}{I_{s,s}^a}=\frac{n_a-K_{s,t}^a}{\frac{1}{2}n_a}\;. +\mathrm{EHH}_{s,t}^a=\frac{I_{s,t}^a}{I_{s,s}^a}\;. (\#eq:ehh2) \end{equation} -No matter which of the two definitions is used, it is common practice to stop computation when *EHH* reaches a certain lower threshold, e.g. 0.05. +*EHH* is usually computed only for the region where it surpasses a given threshold (e.g., $EHH > 0.05$). ### The integrated *EHH* (*iHH*) {#iHH} -By definition, irrespective of the allele considered, *EHH* starts at 1 and decays to 0 with increasing distance of *t* from the focal marker *s*. For a given core allele, the integrated *EHH* (*iHH*) is defined as the area under the *EHH* curve which, in turn, is defined by the *EHH* values and associated chromosomal positions [@Voight2006]. The integral is computed with a simple standard method, called the *trapezoidal rule*. +By definition, *EHH* starts at 1 and decays to 0 with increasing distance of *t* from the focal marker *s*. For a given core allele, the integrated *EHH* (*iHH*) is defined as the area under the *EHH* curve which, in turn, is defined by the *EHH* values and associated chromosomal positions [@Voight2006]. The integral is computed with a simple numerical method, called the *trapezoidal rule*. + +Note that, technical details aside, the *iHH* value is nothing other than the average length of shared haplotypes. ### The (site-specific) Extended Haplotype Homozygosity (*EHHS*) {#EHHS} -An extended haplotype homozygosity can be defined as well without reference to core alleles. In this case, +An extended haplotype homozygosity can be defined as well without regard to core alleles. In this case, the quantity is aimed to reflect the probability that any two randomly chosen chromosomes from a population are homozygous over a given surrounding chromosomal region of a focal marker. 
In contrast to the allele-specific *EHH* defined in the previous section, the chromosomes are not required to carry a specific allele at the focal marker. We adopt the naming by [@Tang2007] as *site--specific* EHH, abbreviated by *EHHS*. Note, however that this quantity is sometimes referred to as *EHH*, too, and there is no agreed notation in the literature. *EHHS* was used in genome scans in two versions: un-normalized by [@Sabeti2007] and normalized by [@Tang2007]. @@ -357,7 +355,7 @@ $\mathrm{EHHS}_{s,t}=1-h_{s,t}$ and hence \begin{equation*} \mathrm{nEHHS}_{s,t}=\frac{\mathrm{EHHS}_{s,t}}{\mathrm{EHHS}_{s,s}}\;. \end{equation*} -Thus $\mathrm{nEHHS}_{s,t}$ is just normalized in order to yield 1 at the focal marker $s$. Note that the normalization factor depends on the frequency of the alleles at the focal marker and consequently is in general not the same for different focal markers. +Thus $\mathrm{nEHHS}_{s,t}$ is just normalized in order to yield 1 at the focal marker $s$. Note that the normalization factor depends on the frequency of the alleles at the focal marker and consequently is not necessarily the same for different focal markers. Furthermore, we note that *EHHS* and *EHH* are related by \begin{equation*} @@ -365,9 +363,9 @@ Furthermore, we note that *EHHS* and *EHH* are related by \end{equation*} where for the sake of simplicity we assume that the focal marker has only two alleles $a1$ and $a2$. *EHHS* might hence be viewed as a linear combination of the *EHH*'s of the focal alleles, weighted by roughly the square of the focal allele frequencies. 
-In the case of unphased chromosomes from diploid individuals (see section \@ref(phasing)) *EHHS* can be calculated like *EHH* using equation \@ref(eq:ehh2) without the restriction to core alleles: +In the case of unphased chromosomes from diploid individuals (see section \@ref(phasing)) *EHHS* can be calculated like *EHH* in Equation \@ref(eq:ehh2), just without reference to core alleles: \begin{equation} -\mathrm{EHHS}_{s,t}=\frac{I_{s,t}}{I_{s,s}}=\frac{n-K_{s,t}}{\frac{1}{2}n}\;. +\mathrm{EHHS}_{s,t}=\frac{I_{s,t}}{I_{s,s}}\;. (\#eq:ehhs2) \end{equation} Note that defined this way, *EHHS* is always 1 at the focal marker. Hence there is no distinction between $\mathrm{EHHS}$ and $\mathrm{nEHHS}$. @@ -378,21 +376,23 @@ Again, unphased *EHHS* can be related to unphased *EHH* by \end{equation} where for the sake of simplicity we assumed a bi-allelic focal marker with alleles $a1$ and $a2$. -As with *EHH*, the *EHHS* is usually computed only for the region where its value surpasses a given threshold (e.g., *EHHS*>0.05). +As with *EHH*, the *EHHS* is usually computed only for the region where its value surpasses a given threshold (e.g., $EHHS > 0.05$). ### The integrated *EHHS* (*iES*) {#iES} Like *EHH*, *EHHS* has its maximum at the focal marker and decays to 0 with increasing distance from the focal marker. For a given focal marker, analogously to *iHH*, *iES* is defined as the integrated *EHHS* [@Tang2007]. Depending on whether *EHHS* or *nEHHS* is integrated, we yield *iES* and *inES* respectively. As with *iHH*, the numerical integration uses the *trapezoidal rule*. +Note that, technical details aside, the *iES* and *inES* values represent the average length of shared haplotypes. The length of shared haplotypes with different core alleles yields zero and these are included in the former but not the latter. 
+ ## The function `calc_ehh()` {#calcehh} The function `calc_ehh()` computes *EHH* for all alleles of a focal marker $s$ relative to markers $t$ upstream and downstream. For each allele the corresponding integral *iHH* of the *EHH* curve is calculated as well. Three options can be specified to constrain the computation of *EHH*: -- `limehh` sets a threshold below which further calculation of *EHH* is stopped. Its default value is 0.05. Note that lowering this cut-off, although increasing the accuracy of *EHH* estimates, might actually decrease the power to detect selective events since under neutrality a tiny fraction (<<0.05) of very long shared haplotypes can be expected, too. +- `limehh` sets a threshold below which further calculation of *EHH* is stopped. Its default value is 0.05. Note that lowering this cut-off might actually decrease the power to detect selective events since under neutrality a tiny fraction of sequences has very long shared haplotypes which, if not capped, confound the signal of selection [@Klassmann2020]. - `limhaplo` defines the smallest acceptable number of evaluated chromosomes and has a default (and mimimum) value of 2. This parameter might be increased if missing values are suspected to be non-randomly distributed leading to a biased drop-out of evaluated chromosomes. -- `limhomohaplo` sets a minimum number of homozygous chromosomes below which calculation of *EHH* is stopped (or not even started). Its default (and minimum) value is 2. This number should be increased to 4 for small samples of unphased haplotypes in order to limit the influence of a single pair of shared haplotypes (see section \@ref(phasing).) +- `limhomohaplo` sets a minimum number of homozygous chromosomes below which calculation of *EHH* is stopped (or not even started). Its default (and minimum) value is 2. 
This number should be increased to 4 for small samples of unphased haplotypes in order to limit the influence of a single shared haplotype (see section \@ref(phasing).) Several parameters influence the *IHH* values (the integral over *EHH*): @@ -404,7 +404,7 @@ Several parameters influence the *IHH* values (the integral over *EHH*): - integration is performed by default over the area between the graph defined by the *EHH* values and the horizontal line y = `limehh`. If numerical agreement with the program *Hapbin* is wanted, this area should be extended to the x-axis by setting `lower_y_bound` to zero. -- by default the *EHH* curve is defined by linearly interpolating *EHH* values between consecutive markers, yielding a continuous curve. However in particular for full re-sequencing data, it is more accurate to let this function decrease step-wise at each marker by setting `interpolate`to `FALSE` (although the effect is likely to be minor). +- by default, the *EHH* curve is defined by linearly interpolating *EHH* values between consecutive markers, yielding a continuous curve. However, in particular for full re-sequencing data, it is more accurate to let this function decrease step-wise at each marker by setting `interpolate` to `FALSE` (although the difference is likely to be minor). The option `polarized`, `TRUE` by default, in this function merely affects the order and labeling of alleles. @@ -464,7 +464,7 @@ plot(calc_ehh(haplohh_cgu_bta12, ``` ## The function `calc_ehhs()` -The `calc_ehhs()` function computes $\mathrm{EHHS}$ and $\mathrm{nEHHS}$ around the focal marker $s$ relative to another marker $t$. This function also computes the corresponding integrals $\mathrm{iES}$ and $\mathrm{inES}$ respectively. The options are identical to those of the function `calc_ehh` (see previous section), except that `polarized` is not needed here. 
Details are available by the command: +The `calc_ehhs()` function computes *EHHS* and normalized *EHHS* around the focal marker $s$ relative to another marker $t$. This function also computes the corresponding integrals *iES* and *inES* respectively. The options are identical to those of the function `calc_ehh` (see previous section), except that `polarized` is absent, because variant ancestry status does not figure in the formulas. Details are available by the command: ```{r, eval=FALSE} ?calc_ehhs ``` @@ -477,7 +477,7 @@ res <- calc_ehhs(haplohh_cgu_bta12, include_nhaplo = TRUE) ``` -The output is similar to that of `calc_ehh` except that there are no alleles to be distinguished, but instead the normalized and non-normalized versions of *EHHS*. A list with four elements is obtained: +The output is similar to that of `calc_ehh()`, except that there are no alleles to be distinguished, but instead the distinction of whether *EHHS* is normalized or not. A list with four elements is obtained: 1. `mrk.name`: the name/identifier of the focal marker. 2. `ehhs`: a data frame with *EHHS* and *nEHHS* values along the chromosome around the focal marker. Optionally, the column `NHAPLO` can be included to show how many chromosomes were evaluated at each marker. @@ -610,14 +610,16 @@ p^\text{right}_\text{iHS}=-\log_{10}\left(1-\Phi\left(\text{iHS}\right)\right) \end{equation*} for the opposite case. -In case of unpolarized alleles, the *iHH* values of major and minor alleles are opposed to obtain *uniHS*. Since derived allele frequency cannot be accounted for, no binning should be performed. The resulting standardized *iHS* cannot be expected to follow a normal distribution and p-values are not meaningful. +Note that this procedure is controversial, because we identify the empirical distribution with the distribution under the null hypothesis of neutrality.
This is an approximation at best and only warranted when it can be assumed that there are so few selected sites that their influence on the overall shape of the distribution can be neglected. + +In case of unpolarized alleles, the *uniHS* is taken as the ratio of *iHH* from minor to major allele. Since derived allele frequency cannot be accounted for, no binning should be performed. The resulting standardized *iHS* cannot be expected to follow a normal distribution and p-values as defined above are not meaningful. ### The function `ihh2ihs()` {#ihh2ihs} The `ihh2ihs()` function computes *iHS* using a data frame containing the *iHH* values for ancestral and derived (resp. major and minor) alleles as obtained by the `scan_hh()` function (see section \@ref(scanhh)). The argument `min_maf` allows to exclude focal markers according to their minor allele frequency (by default `min_maf`=0.05). The argument `freqbin` controls the size (or number) of the allele frequency bins used to perform standardization (see section \@ref(ihs)). More precisely, allele frequency bins are built from `min_maf` to 1-`min_maf` in steps of size `freqbin` (by default `freqbin`=0.025). If an integer of 1 or greater is specified, a corresponding number of equally spaced bins is created. If `freqbin` is set to 0, standardization is performed considering each observed frequency as a discrete frequency class, which is useful when there are only a few distinct haplotypes. For unphased data, *iHH* is calculated using only haplotypes of individuals which are homozygous at the focal marker. This number can be considerably lower than the absolute allele frequency. Hence, in addition to `min_maf`, the option `min_nhaplo` (default `NA`) should be used to reduce statistical noise arising from too few evaluated haplotypes. -Optionally, the allele frequencies of the input data frame can be included into the output data frame by setting `include_freq` to `TRUE`. 
+Optionally, the allele frequencies of the input data frame can be included into the output by setting `include_freq` to `TRUE`. A p-value is calculated for standardized *iHS* values. By default, it is two-sided, but a side can be chosen by setting argument `p.side` to `"left"` or `"right"`. @@ -840,6 +842,8 @@ The colors of the chromosomes in Figures \@ref(fig:manhattanplot) and \@ref(fig: Candidate regions as obtained by the function `calc_candidate_regions()` can be added to the plot as parameter `cr`. Individual markers can be highlighted by setting argument `mrk` to a vector of marker IDs or a data frame with positions (containing columns with name `CHR` and `POSITION`). +By default, chromosomes are separated by an inset of 5,000,000 bases. This value can be increased by the corresponding parameter in order to further reduce overlap between data points of neighboring chromosomes. + In order to reduce the number of plotted data points, the data set can be rasterized in both dimensions by parameter `resolution`. The data points are then rounded to the specified resolution and duplicate points removed. Furthermore, it is possible to specify a subset or a re-ordering of chromosomes with help of parameter `chr.name` as in Figure \@ref(fig:manhattanplotsub). @@ -853,6 +857,7 @@ manhattanplot(wgscan.ihs.cgu, main = "iHS (CGU cattle breed)", cr = cr.cgu, mrk = "F1205400", + inset = 1E+7, resolution = c(200000, 0.05)) # set back to default colors palette("default") @@ -860,7 +865,7 @@ palette("default") ## Genome wide score plots: the function `manhattan()` of package `qqman` -The package [qqman](https://cran.r-project.org/package=qqman) contains a function `manhattan()` which is similar to the function `manhattanplot()` of this package. The input data frame is expected to have a slightly different format, though. 
Hence, before plotting we need to "translate" our data as in the following example with *ihs* values: +The package [qqman](https://cran.r-project.org/package=qqman) contains a function `manhattan()` which is similar to the function `manhattanplot()` of this package. The input data frame is expected to have a slightly different format, though. Hence, before plotting we need to "translate" our data as in the following example with *iHS* values: ```{r rehh2qqman} # extract data frame from result list @@ -954,7 +959,7 @@ newick <- as.newick(furcation, hap.names = hap.names(haplohh_cgu_bta12)) ``` Such a string can be rendered graphically e.g. by the R package [ape](https://cran.r-project.org/package=ape) yielding Figure \@ref(fig:newick): -```{r newick, , eval = requireNamespace("ape", quietly = TRUE), fig.align = 'center', fig.width = 6, fig.height = 6, fig.lp = 'fig:', fig.cap = 'Graphical output of the ape::plot.phylo() function', fig.pos = "!h"} +```{r newick, eval = requireNamespace("ape", quietly = TRUE), fig.align = 'center', fig.width = 6, fig.height = 6, fig.lp = 'fig:', fig.cap = 'Graphical output of the ape::plot.phylo() function', fig.pos = "!h"} library(ape) tree <- ape::read.tree(text = newick) plot(tree, @@ -967,7 +972,7 @@ plot(tree, ## The functions `calc_haplen()` and `plot.haplen()` -The length of a particular extended shared haplotype in a sample can be defined as the range that a chromosome shares a haplotype with at least one other chromosome. For a given chromosome it corresponds to the maximal extension of the inner branches to both sides of the focal marker in a furcation diagram. The function `calc_haplen()` calculates this quantity: +To each haplotype in the sample the length of its longest shared haplotype is assigned, i.e. the range over which it is identical to at least one other haplotype (the latter might be different left and right to the focal marker). 
It corresponds to the maximal extension of the inner branches to both sides of the focal marker in a furcation diagram. The function `calc_haplen()` calculates this quantity: ```{r} haplen <- calc_haplen(furcation) ``` @@ -1033,12 +1038,12 @@ remove.example.files() # Data considerations ## Multi-allelic markers {#multiallelic} -For many species, a low per-site mutation rate ensures that the vast majority of Single Nucleotide Polymorphisms (SNPs) appears only with two alleles in a sample. Hence bi-allelic SNPs will constitute the foremost kind of data to apply our package onto. However, we think the ability to calculate the statistics for multi-allelic markers might be useful +For many species, a low per-site mutation rate ensures that the vast majority of Single Nucleotide Polymorphisms (SNPs) is observed with only two alleles in a sample. Hence bi-allelic SNPs will constitute the foremost kind of data to apply our package onto. However, we think the ability to calculate the statistics for multi-allelic markers might be useful - for species and/or genomic regions with a high per-site mutation rate - for genetic variation in form of (short) tandem repeats or (larger) copy number variations which are multi-allelic by definition and may carry information not captured by SNP markers - because the relative rarity of multi-allelic SNPs might make these particularly interesting -- since the original approach of [@Sabeti2002] was not to compare *extended haplotype homozygosity* between *core alleles* of a single SNP, but between multiple *core haplotypes* defined by several neighboring SNPs; such a partition could be created in *rehh* by adding an artificial multi-allelic marker. +- since the original approach of [@Sabeti2002] did not compare *EHH* on the two *core alleles* of a single SNP, but for multiple *core haplotypes* defined by several neighboring SNPs; such a partition could be created in *rehh* by adding an artificial multi-allelic marker. 
## Dealing with gaps {#gaps} Certain genomic regions such as centromeres are difficult to sequence and can give rise to large gaps between consecutive markers. If not accounted for, these will cause spuriously inflated values of the "integrals" *iHH* and *iES*. [@Voight2006] applied two corrections for gaps. First, they introduced a penalty proportional to physical distance that resulted in any gap being greater than 20 kb to be re-scaled to this number. Second, they discarded integration, if two consecutive markers with a distance greater than 200 kb were encountered. Both methods are implemented in *rehh*, yet turned off by default, since the corresponding thresholds should be adapted manually to the specific data set. @@ -1059,23 +1064,21 @@ Errors or typos aside, there are several possibilities how multiple markers with - The *variant call format* allows to specify different kinds of markers in the same file. Hence at a certain position one might observe a SNP as well as an Insertion/Deletion or a tandem repeat. - In output of *ms* the positions are given with a pre-set precision and consequently the positions of two different segregating sites might be rounded to the same number. -Since it is unclear how *rehh* should handle such markers, they are not accepted as input. Ideally, multiple markers should be dealt with by a pre-processing of the data outside of the package. As a quick-and-dirty work-around, we offer the option `remove_multiple_markers`, which, if set to `TRUE`, removes all but the first marker with identical positions. +Since it is unclear how *rehh* should handle such markers, they are not accepted as input. Ideally, multiple markers should be dealt with by a pre-processing of the data outside of the package. As a quick-and-dirty work-around, we offer the option `remove_multiple_markers` in function `data2haplohh()`, which, if set to `TRUE`, removes all but the first marker with identical positions. 
## Dealing with unphased data {#phasing} -Notwithstanding expensive experimental methods, current high-throughput genotyping / sequencing technologies cannot directly assign alleles to specific chromosomes of a heterozygous diploid (or multiploid) individual. Instead, this task of *phasing* is typically performed by specialized bioinformatic tools like the previously mentioned *SHAPEIT* [@OConnell2014] and *fastPHASE* [@Scheet2006]. Although computationally demanding, the application of these tools is straight-forward and the results usually of sufficient quality for the calculation of *EHH* based statistics. Typically, the tools interpolate missing values away. +Notwithstanding expensive experimental methods, current high-throughput genotyping / sequencing technologies cannot directly assign alleles to specific chromosomes of a heterozygous diploid (or multiploid) individual. Instead, this task of *phasing* is typically performed by specialized bioinformatic tools like *SHAPEIT* [@OConnell2014] and *fastPHASE* [@Scheet2006]. Although computationally demanding, the application of these tools is straight-forward and the results usually of sufficient quality for the calculation of *EHH* based statistics. Typically, the tools interpolate missing values away. In the presumably rare cases where phasing is not feasible, *EHH* or *EHHS* can only be meaningfully estimated by reducing the set of compared chromosomes to those of homozygous (at the focal marker) individuals (assuming that the input data is ordered correspondingly). However, this reduction entails a substantial loss of power; even by an adapted parameter setting (see below) at the very minimum 10, but better up to 30 sequences are needed to obtain at least moderately accurate estimations. For the within-population statistic *iHS*, the latter requirement concerns both major and minor alleles of a marker and scans should not be performed on samples comprising less than 100 sequences. 
Even for sample sizes of 200 sequences, meaningful estimation of *iHS* is hence restricted to variants of intermediate frequencies, i.e. high minor frequency. For the cross-population statistics *Rsb* and *XP-EHH* a minimum number of 30 sequences from homozygous individuals is usually fulfilled if the sample size of each population exceeds 60 sequences. - - -For unphased sequences +Hence, for unphased sequences the following parameters should be set: - the option `phased` of the functions `calc_ehh()`, `calc_ehhs()` and `scan_hh()` must be set to `FALSE`. However, if the data is actually phased, this entails a substantial loss of power to detect selection! -Most of the variance (and hence "noise") comes at any marker from the longest shared haplotypes. To limit their contribution +A few shared haplotypes of extreme length are usually encountered in neutrally evolving regions. In order to limit this "statistical noise", cut-off rules are for unphased sequences even more important than they are for phased ones - the cut-off for the calculation of *EHH* resp. *EHHS* defined by option `limehh` resp. `limehhs` should be increased from the default value of 0.05 to 0.1. - in function `ihh2ihs()`, hence for a within-population scan using *iHH* values, in addition to the filtering by the MAF of core alleles (parameter `min_maf`, default 0.05), a minimum absolute number of evaluated haplotypes should be set by parameter `min_nhaplo`; this value should never be lower than 10 and, if the sample size allows, be as high as 30. @@ -1085,9 +1088,9 @@ See [@Klassmann2020] for a study on estimating *iHS*, *Rsb* and *XP-EHH* using u ## Dealing with unpolarized data {#polarizing} -The designation of alleles as 'ancestral' or 'derived' is referred to as *polarization*.
Since sequences of ancient genomes are available only for a few species and restricted to a limited time back to the past, the 'ancestral' allele is usually inferred to be the one carried by one or more outgroup species such as chimpanzees or gorillas for humans. However, this presupposes the existence of a reference sequence of suitable 'neighbor' species of sufficient quality as well as reliable genome-wide alignments. These requirements are not trivial and even if they are fulfilled, any alignment will not cover the whole genome and the covered part will contain mis-specified alleles due to invisible secondary or back-mutations [@Baudry2003], possibly causing spurious signals of selection [@Hernandez2007]. +The designation of alleles as 'ancestral' or 'derived' is referred to as *polarization*. Since sequences of ancient genomes are available only for a few species and restricted to a limited time back to the past, the 'ancestral' allele is usually inferred to be the one carried by one or more outgroup species such as chimpanzees or gorillas for humans. However, this presupposes the existence of a reference sequence of a suitable 'neighbor' species of sufficient quality as well as reliable genome-wide alignments. These requirements are not trivial and even if they are fulfilled, any alignment will not cover the whole genome and the covered part will contain mis-specified alleles due to invisible secondary or back-mutations [@Baudry2003], possibly causing spurious signals of selection [@Hernandez2007]. -Note that the bin-wise standardization of $iHS$ is the only calculation step within our package where the information about ancestry is exploited. The information of ancestry status is valuable since the expected values under neutral evolution depend on the respective allele frequencies at the focal marker (see Figure \@ref(fig:freqbin) of this vignette and Figure 4 of [@Voight2006]). 
The binning of markers by frequency before its standardization (see section \@ref(ihh2ihs)) is aimed to eliminate most of this dependence. For unpolarized alleles this correction cannot be done. +Note that the bin-wise standardization of *iHS* is the only calculation step within our package where the information about ancestry is exploited. The information of ancestry status is valuable since the expected values under neutral evolution depend on the respective allele frequencies at the focal marker (see Figure \@ref(fig:freqbin) of this vignette and Figure 4 of [@Voight2006]). The binning of markers by frequency before its standardization (see section \@ref(ihh2ihs)) is aimed to eliminate most of this dependence. For unpolarized alleles this correction cannot be done. Hence two parameters are important when dealing with unpolarized data: @@ -1113,19 +1116,39 @@ $\sqrt{\frac{1}{n}\sum(x_i-\bar{x})^2}$ for the standard deviation while *rehh* 6. The default number of bins is 50 in *hapbin*, yielding a bin width of 0.02. The default width in *rehh* is 0.025 (yielding 36 bins, see point above!). Setting the number of bins in *hapbin* to 40 with option `-b` or `--bin` yields a bin width of 0.025. -7. If run in default mode, *hapbin* calculates *EHH* by (notation as in section \@ref(EHH)) -\begin{equation*} -\mathrm{EHH}^a_{s,t}=\sum_{k=1}^{K^a_{s,t}}\left(\frac{n_k}{n_a}\right)^2\;. -\end{equation*} -For a set of $n$ chromosomes this estimator reaches its minimum value of $\frac{1}{n}$ if all of them are distinct. Yet formula \@ref(eq:ehh) used by *rehh* and applied by *hapbin* if run with option `-a` or `--binom` returns zero in this case. The difference reflects distinct sampling strategies, either with replacement or without. For increasing sample size both converge. -The same holds for *EHHS*. +7. *hapbin* uses by default another estimator for homozygosity than *rehh* (see section \@ref(estimation)). +If run with option `-a` or `--binom`, it uses the same as *rehh*. 
8. Integration over *EHH* resp. *EHHS* is performed by *hapbin* on the area between the curve spanned by these quantities and the x-axis (y=0) while *rehh* by default integrates only over the part of that area that is above the threshold set by the parameters `limehh` resp. `limehhs`, i.e. the area between the curve and the line y=threshold. This is not to be confused with the condition for truncation at left and right ends of the curve which is (for all practical purposes) handled identically by both programs. Setting in *rehh* the parameter `lower_y_bound` to zero makes the integration identical to that of *hapbin*. As mentioned above, `limehh(s)` of *rehh* corresponds to `-c` or `--cutoff` of *hapbin*. 9. By default, the parameter `discard_integration_at_border` is `TRUE` in *rehh*. It has to be set to `FALSE` in order to conform to *hapbin*. 10. Large differences can arise from different handling of gaps during the integration of *EHH* resp. *EHHS* yielding *iHH* resp. *iES*. *Hapbin* has a parameter `-s` or `--scale` to "down-weight" large gaps by capping them to the specified value. Its default value is 20000 while the corresponding option in *rehh* is turned off by default, but can be set by `scalegap`. -The option `maxgap` within *rehh* leads to a stop of the integration and if the parameter `discard_integration_at_border` is set to `TRUE`, then no value is reported. This has no counterpart in *hapbin*. Instead, *hapbin* allows to specify a maximum length of Extended Haplotypes (disabled by default) which is not possible in *rehh*. +The option `maxgap` within *rehh* leads to a stop of the integration and if the parameter `discard_integration_at_border` is set to `TRUE`, then no value is reported. This has no counterpart in *hapbin*. Instead, *hapbin* allows to specify a maximum length of Extended Haplotypes (disabled by default) which is available as option of the function `scan_hh_full()` in *rehh*. 
+ \clearpage +# About estimating homozygosity {#estimation} + +The term *homozygosity* as component of the abbreviation *EHH* refers to the probability that two arbitrarily chosen chromosomes from a large population are identical at a given locus or in a given region. It does not make any statement about homozygosity of individuals or even presuppose that individuals are diploid. One might even argue whether the term *homogeneity* would have been more appropriate. + +If there are $K$ alleles in the population and each allele has a population frequency of $f_k$, then this probability is given by +$$H=\sum_{k=1}^{K}f_k^2\;.$$ +For each allele $k$, its population frequency can be estimated by its sample frequency $x_k$: if the sample contains $n$ chromosomes and allele $k$ is observed $n_k$ times, then +$$\hat{f_k}=\frac{n_k}{n}=x_k\;.$$ +It seems straightforward to estimate the population homozygosity from a sample by +$$\hat{H_1}=\sum_{k=1}^Kx_k^2=\sum_{k=1}^K\left(\frac{n_k}{n}\right)^2\;.$$ +However, it turns out that this estimator is biased (it yields values that tend to be slightly too high). The following estimator, instead, is unbiased [@Nei1974]: +$$\hat{H_2}=\frac{n}{n-1}\sum_{k=1}^{K}x_k^2-\frac{1}{n-1}\;.$$ +The latter is used by *rehh*. We can see this e.g. in Equation \@ref(eq:ehhssab), when we consider each (shared) haplotype in the region between markers $s$ and $t$ as an allele. We get + +$$EHHS=\frac{1}{n(n-1)}\sum_{k=1}^{K_{s,t}}n_k(n_k-1)=\frac{n}{n-1}\frac{1}{n^2}\sum_{k=1}^{K_{s,t}}(n_k^2-n_k)=\frac{n}{n-1}\left(\sum_{k=1}^{K_{s,t}}\frac{n_k^2}{n^2}-\frac{n}{n^2}\right)=\hat{H_2}\;.$$ +*hapbin*, in contrast, uses by default estimator $\hat{H_1}$ and refers to $\hat{H_2}$ as the "alternative" estimator. Evidently, for large $n$, the difference between the two becomes negligible. For small $n$ this is not necessarily so.
If we consider a minimal sample of two non-identical chromosomes, hence $n=2$ and $K=2$, then we have +$$\hat{H_1}=\left(\frac{1}{2}\right)^2+\left(\frac{1}{2}\right)^2=\frac{1}{2}$$ +and +$$\hat{H_2}=\frac{1}{2\cdot 1}(1\cdot 0+1\cdot 0)=0\;.$$ +Interestingly, although $\hat{H_1}$ is biased, it yields values which are on average closer to the population value than $\hat{H_2}$, since the variance of $\hat{H_1}$ is smaller than that of $\hat{H_2}$ [@Nei1974]. + +It is unlikely, though, that the choice of the estimator has a major effect on detecting selection. \clearpage # References diff --git a/inst/doc/rehh.html b/inst/doc/rehh.html index 9a4c055..ab9a042 100644 --- a/inst/doc/rehh.html +++ b/inst/doc/rehh.html @@ -12,9 +12,9 @@ - + -Vignette for package rehh (version 3.1.2) +Vignette for package rehh + @@ -313,9 +329,9 @@ -

Vignette for package rehh (version 3.1.2)

+

Vignette for package rehh

Alexander Klassmann, Renaud Vitalis and Mathieu Gautier

-

2020-07-17

+

2020-11-01

1 About the package

-

This vignette describes how the R package rehh can be applied to perform whole genome scans for footprints of selection using statistics related to Extended Haplotype Homozygosity (EHH) (Sabeti et al. 2002).

+

This vignette describes comprehensively how the R package rehh can be applied to perform whole genome scans for footprints of selection using statistics related to Extended Haplotype Homozygosity (EHH) (Sabeti et al. 2002). The vignette Examples in detail explains basic usage and methodology with the help of two tiny artificial data sets.

The package accepts multi-allelic genetic markers as input. Typically, albeit not necessarily, these will be bi-allelic SNPs.

The package is available for Linux, Windows and MacOS X from the CRAN repository https://cran.r-project.org/package=rehh and may be installed using a standard procedure. Once the package has been successfully installed, it can be loaded by:

library(rehh)

1.1 Background

-

The analysis of molecular population genetic data often comprises the search for genomic regions that might have experienced recent selection. Diverse approaches have been developed, reviewed e.g. in (Oleksyk et al. 2010) and (Vitti et al. 2013), however only a few have found wide-spread application (Cadzow et al. 2014), (Haasl and Payseur 2016). To the latter belong iHS [Voight2006], Rsb (Tang et al. 2007) and XP-EHH (Sabeti et al. 2007), all of which are summary statistics aimed to distill a certain aspect of the genetic data into a single score and constructed in a way that extreme values are indicative of positive or “Darwinian” selection. iHS is intended for application on a single (presumably homogeneous) population, while XP-EHH and Rsb are targeted to differential selection between two populations. All three statistics are based on the concept of Extended Haplotype Homozygosity (EHH) as formulated by (Sabeti et al. 2002).

+

The analysis of molecular population genetic data often comprises the search for genomic regions that might have experienced recent selection. Diverse approaches have been developed; for reviews on methodology see (Sabeti 2006), (Oleksyk et al. 2010) and (Vitti et al. 2013) and for +practical advice (Cadzow et al. 2014), (Utsunomiya et al. 2015) and (Weigand and Leese 2018). However only a few have found wide-spread application (Haasl and Payseur 2016). To the latter belong iHS (Voight et al. 2006), Rsb (Tang et al. 2007) and XP-EHH (Sabeti et al. 2007), all of which are summary statistics aimed to distill a certain aspect of the genetic data into a single score and constructed in a way that extreme values are indicative of positive or “Darwinian” selection. iHS is intended for application on a single (presumably homogeneous) population, while XP-EHH and Rsb are targeted to differential selection between two populations. All three statistics are based on the concept of Extended Haplotype Homozygosity (EHH) as formulated by (Sabeti et al. 2002).

iHS and XP-EHH can be calculated by the independent C++ command line tool Hapbin (Maclean et al. 2015), which has been optimized for speed by exploiting bit-wise machine-level operations. The package rehh cannot compete on performance, but has the advantage of being able to work with multi-allelic markers and missing values. Moreover, it possesses a broader range of input and output options, including several graphical representations.

1.2 Changes between versions 2.X and 3.X

-

The C routines responsible for the bulk of calculations have been rewritten and all of the R code has been largely revised and streamlined yielding the following new features:

-
    -
  • the package accepts multi-allelic markers.
  • -
  • support for input files in variant call format as well as the output format of the simulation program ms (Hudson 2002).
  • -
  • computation of statistics and their visualization has become separated into different functions.
  • -
  • graphical plots are more customizable.
  • -
  • furcation diagrams can be labeled and a related visualization of “haplotype length” was added.
  • -
  • a function to define candidate regions of selection.
  • -
  • adaptations for unphased haplotypes or unpolarized alleles.
  • -
  • output in form of matrices has been replaced by data frames; some columns have become optional.
  • -
  • all names of data frames are now in lower case letters while all column names contain exclusively capital letters and underscores.
  • -
  • a new internal representation of alleles: previously, an ancestral allele was coded by 1, a derived allele by 2 and a missing value by 0. These codings have been replaced by 0, 1 and NA, respectively. Furthermore, the numbers are now explicitly of type “integer” instead of “numeric”.
  • -
  • a second vignette to explain the statistics involved and the functionality of the package using an invented tiny data set.
  • -
  • additional options to yield virtually identical results with the program hapbin (see section 8).
  • -
  • an inconsistency between implementation and documentation concerning the calculation of one-sided p-values has been cleared.
  • -
-

Due to changes in the API, although mostly small, the versions 3.X are not compatible with versions 2.X!

+

Due to changes in the API, although mostly small, the versions 3.X are not compatible with versions 2.X.

Data objects of class haplohh (see below) created by versions up to 2.0.4 must be updated via the command update_haplohh() (see its documentation by typing ?update_haplohh) in order to be accepted by the functions of the current version.

@@ -445,8 +447,9 @@

1.3 Example files

1.3.1 Input files

  • Two tiny invented examples, each in our “standard” haplotype format (see section 2) and in variant call format (vcf). The first example was used in (Gautier et al. 2017) for the explanation of the various statistics calculated by this package. The second example is an extension of the former including multi-allelic markers and missing values. Both sets are discussed in depth in a second vignette.

  • +
  • Three further tiny examples used for the supplement on Site Frequency Spectrum-based methods of (Klassmann and Gautier 2020).

  • An output file of the program ms containing two small simulated haplotype samples.

  • -
  • Input files in different formats that originate from a study on the “Creole cattle breed from Guadeloupe” (CGU) (Gautier and Naves 2011). All files contain the same set of phased SNPs of Bos taurus chromosome 12 from 140 individuals.

  • +
  • A data set in various formats that originated from a study on the “Creole cattle breed from Guadeloupe” (CGU) (Gautier and Naves 2011). All files contain the same set of phased SNPs of Bos taurus chromosome 12 from 140 individuals.

All of the above files are copied into the current working directory via the command

make.example.files()
@@ -455,7 +458,7 @@

1.3.1 Input files

1.3.2 R objects

    -
  • The data for chromosome 12 of the cattle study mentioned above as object of the rehh data class haplohh. This object becomes available by the command data(haplohh_cgu_bta12).

  • +
  • The data set for chromosome 12 of the cattle study mentioned above as object of the rehh data class haplohh. This object becomes available by the command data(haplohh_cgu_bta12).

  • The scores iHH and iES as obtained by the function scan_hh() applied on SNPs of the whole genome for the population CGU (defined above) and another population EUT (“European taurine”). They reside in the accompanying package rehh.data and are obtained by library(rehh.data) followed by data(wgscan.cgu) resp. data(wgscan.eut).

@@ -467,14 +470,15 @@

1.4 Terminology

1.5 Overview

-

The package calculates three statistics by the following steps:

+

The package calculates three statistics which can be used to perform whole-genome scans for selection: iHS, XP-EHH and Rsb. iHS compares alleles within a single population while the other two compare sites between populations. The calculation proceeds technically in five steps which are performed by running two commands and combining tables:

    -
  • each marker is taken in turn as a “focal marker” around which the extended haplotype homozygosity (EHH) is measured
  • -
  • EHH is summerized into a number by integration
  • -
  • two integrals are compared by taking their log ratio (two alleles for within-population or two populations for cross-population statistics)
  • -
  • the genome-wide distribution of these log-ratios is normalized
  • +
  • each marker is taken in turn as a “focal marker” around which the extended haplotype homozygosity is computed at further markers in increasing distance to the focal marker up to some stop criterion
  • +
  • for each focal marker these quantities are “integrated” over the surrounding markers
  • +
  • these integrals have to be calculated for each chromosome separately and the resulting tables have to be combined to yield whole-genome statistics
  • +
  • at each focal position, two such integrals are compared (either from two alleles or from two populations) by taking their log ratio
  • +
  • the distribution of these log-ratios is standardized
-

Here is a minimal code example for a single population and a single chromossome:

+

Here is a minimal code example for calculating iHS on a single chromosome:

hh <-                  # data input
   data2haplohh(
     hap_file = "bta12_cgu.hap",
@@ -483,16 +487,17 @@ 

1.5 Overview

allele_coding = "map" ) scan <- scan_hh(hh) # calculation of EHH and integration -ihs <- ihh2ihs(scan) # log ratio for alleles and normalization -manhattanplot(ihs) # plot of the statistics
-

+ # (combine results from different chromosomes) +ihs <- ihh2ihs(scan) # log ratio for alleles and standardization +manhattanplot(ihs) # plot of the statistics
+

2 Data input

The package rehh requires as input:

    -
  • a haplotype data file for each population of interest (see section 2.1).
  • +
  • a haplotype data file (see section 2.1).

and, if the haplotype data file is neither in variant call format nor in the format of ms output,

    @@ -506,8 +511,8 @@

    2.1 Haplotype data file

  • a standard haplotype format. Each row represents a haplotype with marker genotype in columns as in the example file bta12_cgu.hap containing 280 haplotypes with 1424 SNPs each (see section 2.3.1). The first element of each row is taken as a haplotype identifier.
  • a transposed format with haplotypes in columns and markers in rows as in the example file bta12_cgu.thap. This format is similar to the one produced by the phasing program SHAPEIT2 (O’Connell et al. 2014). This format assumes neither row nor column names and hence no haplotype identifiers can be specified (see section 2.3.2).
  • the output file format from the phasing program fastPHASE (Scheet and Stephens 2006) as in the bta12_hapguess_switch.out example file. Note that this file format allows to include haplotypes from several populations (if the -u fastPHASE option was used) (see section 2.3.3).
  • -
  • variant call format (vcf), comprising both haplotype and marker information. In order to read files in this format, the package vcfR or the packages data.table and R.utils (the latter is needed for compressed files) have to be installed (see section 2.3.4).
  • -
  • The output of the simulation program ms (Hudson 2002) and its derivatives msHOT (Hellenthal and Stephens 2007), ms2 (Ewing and Hermisson 2010) and ms’ (Kelleher et al. 2016). In order to read these files, the package gap has to be installed (see section 2.3.5).
  • +
  • variant call format (vcf), comprising both haplotype and marker information. In order to read files in this format, the package vcfR or the packages data.table and R.utils (the latter is needed for compressed files) have to be installed (see section 2.3.4).
  • +
  • The output of the simulation program ms (Hudson 2002) and its derivatives msHOT (Hellenthal and Stephens 2007), ms2 (Ewing and Hermisson 2010) and ms’ (Kelleher et al. 2016). In order to read these files, the package gap has to be installed (see section 2.3.5).

Alleles in standard or transposed haplotype format can be provided either coded (by integer numbers) or without coding (e.g. as nucleotides) (see section 2.3).

If alleles are not polarized, i.e. their ancestral/derived status is not known, some arbitrary assignation can be done either beforehand by the user or during input by the package; appropriate parameters must be set in subsequent functions (see section 7.6).

@@ -629,7 +634,7 @@

2.3.1 Example 1: reading haplotyp

2.3.2 Example 2: reading haplotype file in transposed format (SHAPEIT2–like)

-

If the haplotype input file bta12_cgu.thap is in “transposed” format, the option haplotype.in.columns has to be set to TRUE while all other parameters remain unaffected with respect to example 1. Note that this is the only format that has to be explicitly declared by the user.

+

If the haplotype input file is in “transposed” format (like bta12_cgu.thap), the option haplotype.in.columns has to be set to TRUE while all other parameters remain unaffected with respect to example 1. This is the only format which is not recognized automatically, but has to be explicitly declared by the user.

hh <- data2haplohh(hap_file = "bta12_cgu.thap",
                    map_file = "map.inp",
                    chr.name = 12,
@@ -669,7 +674,7 @@ 

2.3.3 Example 3: reading haplotyp > Number of mono-, bi-, multi-allelic markers: > 1 2 > 27 1397

-

If no value is specified for the popsel argument and more than one population is detected in the fastPHASE output file, an error in produced and the available population numbers printed:

+

If no value is specified for the popsel argument and more than one population is detected in the fastPHASE output file, an error is produced and the available population numbers printed:

hh <- data2haplohh(hap_file = "bta12_hapguess_switch.out",
                    map_file = "map.inp",
                    chr.name = 12,
@@ -687,9 +692,9 @@ 

2.3.4 Example 4: reading vcf file

The function data2haplohh() checks automatically whether the specified haplotype input file is in variant call format (vcf). If this is the case, the parameters map_file and allele_coding are ignored. By default, the function tries to polarize (see section 7.6) the alleles of each marker using the ancestral allele, expected to be given by key AA of the INFO field.

Ancestral alleles are sometimes marked by upper case as “high confident” and by lower case as “low confident”. The default setting capitalize_AA = TRUE lifts this distinction before polarization.

If the AA key is absent, the option polarize_vcf should be set to FALSE and the allele coding of the vcf file is directly used as internal coding.

-

If there is data for more than one chromosome in the file, the chromosome of interest has to be specified by chr.name. Since always the whole file is read in, it may be advisable for large data sets to create chromosome-specific files.

+

If there is data for more than one chromosome in the file, the chromosome of interest has to be specified by chr.name. Since the whole file is always read in, it is advisable to split large data sets into chromosome-specific files.

In order to process vcf files, the package vcfR or the package -data.table (which in turn needs R.utils to read compressed files) have to be installed. The parameter vcf_reader has to be set to either "vcfR" or "data.table". [Note: at the time of writing, the package vcfR has been removed from CRAN, but can still be installed from https://github.com/knausb/vcfR, following instructions there.]

+data.table (which in turn needs R.utils to read compressed files) have to be installed. The parameter vcf_reader has to be set to either "vcfR" or "data.table".

In the file bta12_cgu.vcf.gz the ancestral allele was set as reference and hence no further polarizing is necessary.

hh <- data2haplohh(hap_file = "bta12_cgu.vcf.gz",
                    polarize_vcf = FALSE,
@@ -714,8 +719,9 @@ 

2.3.4 Example 4: reading vcf file

2.3.5 Example 5: reading ms output

The function data2haplohh() automatically checks whether the haplotype file is in the output format of the simulation program ms (Hudson 2002). If this is the case, the parameters map_file and allele_coding are ignored. If the file contains several ‘runs’ (as referred to by the parameter nrep of ms), it is necessary to specify the number of the run in option chr.name. Note that the whole file is always read, so it might be advisable to spread large simulations over separate files.

One argument of the data2haplohh function is specifically dedicated to ms output, although it works with other formats as well: ms gives chromosomal positions as fractions of the interval [0,1] and in order to obtain more realistic values, these positions can be multiplied by a factor, set by position_scaling_factor.

-

Note that rehh does not accept multiple markers with the same position and hence it is highly recommended to increase the numerical precision for chromosomal positions in the ms output.

-

In order to read this format, the package gap has to be installed.

+

Note that ms output can contain multiple markers with the same (rounded) position, which rehh does not accept. In this case the numerical precision for chromosomal positions in the ms output should be increased (option -p of ms, option -oformat of msms).

+

Setting remove_multiple_markers to TRUE entails that from consecutive markers with the same position only the first one is retained and a warning containing the number of removed markers is printed. Note that this effectively transforms the “infinite sites model” used for simulations by ms into a “finite sites model”.

+

In order to read the ms format, the package gap has to be installed.

hh <- data2haplohh(hap_file = "ms.out",
                    chr.name = 2,
                    position_scaling_factor = 1000)
@@ -771,27 +777,28 @@

3 Computing EHH, EHH

3.1 Definition and computation

3.1.1 The (allele-specific) Extended Haplotype Homozygosity (EHH)

-

For any given allele \(a\) of a focal marker \(s\), sometimes referred to as a core allele, the Extended Haplotype Homozygosity (EHH) is defined as the probability that two randomly chosen chromosomes, carrying the core allele, are homozygous over a given surrounding chromosomal region (Sabeti et al. 2002). It is estimated from a sample by calculating the homozygosity of the chromosomal chunk between the focal marker and another marker \(t\) by the formula +

For an allele \(a\) of a focal marker \(s\), sometimes referred to as a core allele, the Extended Haplotype Homozygosity (EHH) is defined as the probability that two randomly chosen chromosomes, carrying the core allele, are homozygous over a given surrounding chromosomal region (Sabeti et al. 2002). It is estimated from a sample by calculating the homozygosity of the chromosomal chunk between the focal marker and another marker \(t\) by the formula \[\begin{equation} \mathrm{EHH}_{s,t}^a=\frac{1}{n_{a}(n_a-1)}\sum\limits_{k=1}^{K^a_{s,t}}n_k(n_k-1) \tag{3.1} \end{equation}\] where \(n_a\) represents the number of chromosomes carrying the core allele \(a\), \(K^a_{s,t}\) represents the number of different shared haplotypes and \(n_k\) refers to the number of chromosomes pertaining to the \(k\)-th such shared haplotype. If there is no missing data, it holds that \(n_a=\sum\limits_{k=1}^{K^a_{s,t}}n_k\).

In the case of unphased chromosomes from diploid individuals (see section 7.5) the extended haplotype homozygosity can be calculated as follows (Tang et al. 2007): -we consider only chromosomes from individuals that are homozygous for the allele \(a\) at the focal marker \(s\) and estimate EHH at some marker \(t\) by the fraction of individuals that are (still) homozygous over the entire chromosomal stretch between \(s\) and \(t\). Let \(I^a_{s,t}\) denote the number of individuals that are homozygous from marker \(s\) til marker \(t\). We can reformulate the fraction of individuals in terms of the fraction of shared haplotypes: since haplotypes of different individuals are not compared they can be regarded as distinct by definition, hence \(K_{s,s}^a=I_{s,s}^a=\frac{1}{2}n_a\). With increasing distance of \(t\) from \(s\), any increase in \(K_{s,t}^a\) is tantamount to a decrease of the number of homozygous individuals, yielding +we consider only chromosomes from individuals that are homozygous for the allele \(a\) at the focal marker \(s\) and estimate EHH at some marker \(t\) by the fraction of individuals that are (still) homozygous over the entire chromosomal stretch between \(s\) and \(t\). Let \(I^a_{s,t}\) denote the number of individuals that are homozygous from marker \(s\) til marker \(t\). \[\begin{equation} -\mathrm{EHH}_{s,t}^a=\frac{I_{s,t}^a}{I_{s,s}^a}=\frac{n_a-K_{s,t}^a}{\frac{1}{2}n_a}\;. +\mathrm{EHH}_{s,t}^a=\frac{I_{s,t}^a}{I_{s,s}^a}\;. \tag{3.2} \end{equation}\]

-

No matter which of the two definitions is used, it is common practice to stop computation when EHH reaches a certain lower threshold, e.g. 0.05.

+

EHH is usually computed only for the region where its value surpasses a given threshold (e.g., \(EHH > 0.05\)).

3.1.2 The integrated EHH (iHH)

-

By definition, irrespective of the allele considered, EHH starts at 1 and decays to 0 with increasing distance of t from the focal marker s. For a given core allele, the integrated EHH (iHH) is defined as the area under the EHH curve which, in turn, is defined by the EHH values and associated chromosomal positions (Voight et al. 2006). The integral is computed with a simple standard method, called the trapezoidal rule.

+

By definition, EHH starts at 1 and decays to 0 with increasing distance of t from the focal marker s. For a given core allele, the integrated EHH (iHH) is defined as the area under the EHH curve which, in turn, is defined by the EHH values and associated chromosomal positions (Voight et al. 2006). The integral is computed with a simple numerical method, called the trapezoidal rule.

+

Note that, technical details aside, the iHH value is nothing else than the average length of shared haplotypes.

3.1.3 The (site-specific) Extended Haplotype Homozygosity (EHHS)

-

An extended haplotype homozygosity can be defined as well without reference to core alleles. In this case, +

An extended haplotype homozygosity can be defined as well without regard to core alleles. In this case, the quantity is aimed to reflect the probability that any two randomly chosen chromosomes from a population are homozygous over a given surrounding chromosomal region of a focal marker. In contrast to the allele-specific EHH defined in the previous section, the chromosomes are not required to carry a specific allele at the focal marker. We adopt the naming by (Tang et al. 2007) as site–specific EHH, abbreviated by EHHS. Note, however that this quantity is sometimes referred to as EHH, too, and there is no agreed notation in the literature.

EHHS was used in genome scans in two versions: un-normalized by (Sabeti et al. 2007) and normalized by (Tang et al. 2007).

In line with (Sabeti et al. 2007) we define @@ -815,15 +822,15 @@

3.1.3 The (site-specific) Extende \[\begin{equation*} \mathrm{nEHHS}_{s,t}=\frac{\mathrm{EHHS}_{s,t}}{\mathrm{EHHS}_{s,s}}\;. \end{equation*}\] -Thus \(\mathrm{nEHHS}_{s,t}\) is just normalized in order to yield 1 at the focal marker \(s\). Note that the normalization factor depends on the frequency of the alleles at the focal marker and consequently is in general not the same for different focal markers.

+Thus \(\mathrm{nEHHS}_{s,t}\) is just normalized in order to yield 1 at the focal marker \(s\). Note that the normalization factor depends on the frequency of the alleles at the focal marker and consequently is not necessarily the same for different focal markers.

Furthermore, we note that EHHS and EHH are related by \[\begin{equation*} \mathrm{EHHS}_{s,t}=\frac{n_{a1}(n_{a1}-1)}{n_s(n_s-1)}\mathrm{EHH}_{s,t}^{a1}+\frac{n_{a2}(n_{a2}-1)}{n_s(n_s-1)}\mathrm{EHH}^{a2}_{s,t}\;, \end{equation*}\] where for the sake of simplicity we assume that the focal marker has only two alleles \(a1\) and \(a2\). EHHS might hence be viewed as a linear combination of the EHH’s of the focal alleles, weighted by roughly the square of the focal allele frequencies.

-

In the case of unphased chromosomes from diploid individuals (see section 7.5) EHHS can be calculated like EHH using equation (3.2) without the restriction to core alleles: +

In the case of unphased chromosomes from diploid individuals (see section 7.5) EHHS can be calculated like EHH in Equation (3.2), just without reference to core alleles: \[\begin{equation} -\mathrm{EHHS}_{s,t}=\frac{I_{s,t}}{I_{s,s}}=\frac{n-K_{s,t}}{\frac{1}{2}n}\;. +\mathrm{EHHS}_{s,t}=\frac{I_{s,t}}{I_{s,s}}\;. \tag{3.5} \end{equation}\] Note that defined this way, EHHS is always 1 at the focal marker. Hence there is no distinction between \(\mathrm{EHHS}\) and \(\mathrm{nEHHS}\).

@@ -832,11 +839,12 @@

3.1.3 The (site-specific) Extende \mathrm{EHHS}_{s,t}=\frac{I_{s,t}}{I_{s,s}}=\frac{I_{s,t}^{a1}+I_{s,t}^{a2}}{I_{s,s}^{a1}+I_{s,s}^{a2}}=\frac{I_{s,s}^{a1}}{I_{s,s}^{a1}+I_{s,s}^{a2}}\mathrm{EHH}_{s,t}^{a1}+\frac{I_{s,s}^{a2}}{I_{s,s}^{a1}+I_{s,s}^{a2}}\mathrm{EHH}_{s,t}^{a2}\; \end{equation}\] where for the sake of simplicity we assumed a bi-allelic focal marker with alleles \(a1\) and \(a2\).

-

As with EHH, the EHHS is usually computed only for the region where its value surpasses a given threshold (e.g., EHHS>0.05).

+

As with EHH, the EHHS is usually computed only for the region where its value surpasses a given threshold (e.g., \(EHHS > 0.05\)).

3.1.4 The integrated EHHS (iES)

Like EHH, EHHS has its maximum at the focal marker and decays to 0 with increasing distance from the focal marker. For a given focal marker, analogously to iHH, iES is defined as the integrated EHHS (Tang et al. 2007). Depending on whether EHHS or nEHHS is integrated, we obtain iES and inES, respectively. As with iHH, the numerical integration uses the trapezoidal rule.

+

Note that, technical details aside, the iES and inES values represent the average length of shared haplotypes. Pairs of haplotypes that carry different core alleles have a shared length of zero; such pairs are included in the average for iES, but not in that for inES.

@@ -844,9 +852,9 @@

3.2 The function calc_ehh()

The function calc_ehh() computes EHH for all alleles of a focal marker \(s\) relative to markers \(t\) upstream and downstream. For each allele the corresponding integral iHH of the EHH curve is calculated as well.

Three options can be specified to constrain the computation of EHH:

    -
  • limehh sets a threshold below which further calculation of EHH is stopped. Its default value is 0.05. Note that lowering this cut-off, although increasing the accuracy of EHH estimates, might actually decrease the power to detect selective events since under neutrality a tiny fraction (<<0.05) of very long shared haplotypes can be expected, too.

  • +
  • limehh sets a threshold below which further calculation of EHH is stopped. Its default value is 0.05. Note that lowering this cut-off might actually decrease the power to detect selective events since under neutrality a tiny fraction of sequences has very long shared haplotypes which, if not capped, confound the signal of selection (Klassmann and Gautier 2020).

  • limhaplo defines the smallest acceptable number of evaluated chromosomes and has a default (and minimum) value of 2. This parameter might be increased if missing values are suspected to be non-randomly distributed leading to a biased drop-out of evaluated chromosomes.

  • -
  • limhomohaplo sets a minimum number of homozygous chromosomes below which calculation of EHH is stopped (or not even started). Its default (and minimum) value is 2. This number should be increased to 4 for small samples of unphased haplotypes in order to limit the influence of a single pair of shared haplotypes (see section 7.5.)

  • +
  • limhomohaplo sets a minimum number of homozygous chromosomes below which calculation of EHH is stopped (or not even started). Its default (and minimum) value is 2. This number should be increased to 4 for small samples of unphased haplotypes in order to limit the influence of a single shared haplotype (see section 7.5.)

Several parameters influence the IHH values (the integral over EHH):

    @@ -857,7 +865,7 @@

    3.2 The function calc_ehh()
  • integration is stopped/discarded if a gap greater than maxgap is encountered.
  • integration is performed by default over the area between the graph defined by the EHH values and the horizontal line y = limehh. If numerical agreement with the program Hapbin is wanted, this area should be extended to the x-axis by setting lower_y_bound to zero.

  • -
  • by default the EHH curve is defined by linearly interpolating EHH values between consecutive markers, yielding a continuous curve. However in particular for full re-sequencing data, it is more accurate to let this function decrease step-wise at each marker by setting interpolateto FALSE (although the effect is likely to be minor).

  • +
  • by default, the EHH curve is defined by linearly interpolating EHH values between consecutive markers, yielding a continuous curve. However, in particular for full re-sequencing data, it is more accurate to let this function decrease step-wise at each marker by setting interpolate to FALSE (although the difference is likely to be minor).

  • The option polarized, TRUE by default, in this function merely affects the order and labeling of alleles.

    The parameter phased can be toggled to FALSE in order to calculate EHH by the formula for unphased data (see section 3.1.1 and 7.5).

    @@ -991,14 +999,14 @@

    3.2 The function calc_ehh()

    3.3 The function calc_ehhs()

    -

    The calc_ehhs() function computes \(\mathrm{EHHS}\) and \(\mathrm{nEHHS}\) around the focal marker \(s\) relative to another marker \(t\). This function also computes the corresponding integrals \(\mathrm{iES}\) and \(\mathrm{inES}\) respectively. The options are identical to those of the function calc_ehh (see previous section), except that polarized is not needed here. Details are available by the command:

    +

    The calc_ehhs() function computes EHHS and normalized EHHS around the focal marker \(s\) relative to another marker \(t\). This function also computes the corresponding integrals iES and inES respectively. The options are identical to those of the function calc_ehh (see previous section), except that polarized is absent, because variant ancestry status does not figure in the formulas. Details are available by the command:

    ?calc_ehhs

    In the following example, the EHHS statistics are computed around the SNP F1205400.

    data(haplohh_cgu_bta12)
     res <- calc_ehhs(haplohh_cgu_bta12, 
                      mrk = "F1205400", 
                      include_nhaplo = TRUE)
    -

    The output is similar to that of calc_ehh except that there are no alleles to be distinguished, but instead the normalized and non-normalized versions of EHHS. A list with four elements is obtained:

    +

    The output is similar to that of calc_ehh(), except that the distinction is not between alleles, but between the normalized and non-normalized versions of EHHS. A list with four elements is obtained:

    1. mrk.name: the name/identifier of the focal marker.
    2. ehhs: a data frame with EHHS and nEHHS values along the chromosome around the focal marker. Optionally, the column NHAPLO can be included to show how many chromosomes were evaluated at each marker.
    3. @@ -1158,7 +1166,7 @@

      3.4 The function scan_hh()<
      # perform scan using scan_hh
       system.time(scan <- scan_hh(haplohh_cgu_bta12))
      >    user  system elapsed 
      ->   0.840   0.000   0.839
      +> 0.833 0.000 0.834

    # perform scan applying calc_ehh and calc_ehhs to each marker
     slow_scan_hh <- function(haplohh) {
       # create empty vectors of size nmrk
    @@ -1180,7 +1188,7 @@ 

    3.4 The function scan_hh()< } system.time(slow_scan <- slow_scan_hh(haplohh_cgu_bta12))

    >    user  system elapsed 
    ->    5.05    0.00    5.05
    +> 5.094 0.000 5.095

    Comparing columns shows that the computed values are identical:

    identical(slow_scan[, "IHH_A"], scan[, "IHH_A"])
    > [1] TRUE
    @@ -1217,13 +1225,14 @@

    4.1.1 Definition

    p^\text{right}_\text{iHS}=-\log_{10}\left(1-\Phi\left(\text{iHS}\right)\right) \end{equation*}\] for the opposite case.

    -

    In case of unpolarized alleles, the iHH values of major and minor alleles are opposed to obtain uniHS. Since derived allele frequency cannot be accounted for, no binning should be performed. The resulting standardized iHS cannot be expected to follow a normal distribution and p-values are not meaningful.

    +

    Note that this procedure is controversial, because we identify the empirical distribution with the distribution under the null hypothesis of neutrality. This is an approximation at best and only warranted when it can be assumed that there are so few selected sites that their influence on the overall shape of the distribution can be neglected.

    +

    In case of unpolarized alleles, the uniHS is taken as the ratio of iHH from minor to major allele. Since derived allele frequency cannot be accounted for, no binning should be performed. The resulting standardized iHS cannot be expected to follow a normal distribution and p-values as defined above are not meaningful.

    4.1.2 The function ihh2ihs()

    The ihh2ihs() function computes iHS using a data frame containing the iHH values for ancestral and derived (resp. major and minor) alleles as obtained by the scan_hh() function (see section 3.4). The argument min_maf allows to exclude focal markers according to their minor allele frequency (by default min_maf=0.05). The argument freqbin controls the size (or number) of the allele frequency bins used to perform standardization (see section 4.1.1). More precisely, allele frequency bins are built from min_maf to 1-min_maf in steps of size freqbin (by default freqbin=0.025). If an integer of 1 or greater is specified, a corresponding number of equally spaced bins is created. If freqbin is set to 0, standardization is performed considering each observed frequency as a discrete frequency class, which is useful when there are only a few distinct haplotypes.

    For unphased data, iHH is calculated using only haplotypes of individuals which are homozygous at the focal marker. This number can be considerably lower than the absolute allele frequency. Hence, in addition to min_maf, the option min_nhaplo (default NA) should be used to reduce statistical noise arising from too few evaluated haplotypes.

    -

    Optionally, the allele frequencies of the input data frame can be included into the output data frame by setting include_freq to TRUE.

    +

    Optionally, the allele frequencies of the input data frame can be included into the output by setting include_freq to TRUE.

    A p-value is calculated for standardized iHS values. By default, it is two-sided, but a side can be chosen by setting argument p.side to "left" or "right".

    As a typical workflow for performing a whole genome scan one might run scan_hh() on haplotype data from each chromosome and concatenate the resulting data frames before standardization. In the following example, we assume that the haplotype files are named as hap_chr_i.cgu with \(i=1,...,29\) and the marker information file is named map.inp. The data frame wgscan contains the \(iHH^A\) and \(iHH^D\) values for the whole genome and serves as input for the ihh2ihs() function which calculates iHS, i.e. the standardized log ratio of the two \(iHH\) values, for each marker.

    ## demo code - no data files for all chromosomes provided
    @@ -1462,7 +1471,7 @@ 

    5.4 Genome wide score plots: the
    manhattanplot(wgscan.ihs.cgu,
                   main = "iHS (CGU cattle breed)")
    -Graphical output of the manhattanplot() function +Graphical output of the manhattanplot() function

    Figure 5.5: Graphical output of the manhattanplot() function

    @@ -1473,13 +1482,14 @@

    5.4 Genome wide score plots: the threshold = 4, main = "p-value of iHS (CGU cattle breed)")

    -Graphical output of the manhattanplot() function +Graphical output of the manhattanplot() function

    Figure 5.6: Graphical output of the manhattanplot() function

    The colors of the chromosomes in Figures 5.5 and 5.6 are the default colors of R as obtained by the command palette(). It is possible to change them by the same command. Note that the colors are associated with the order of chromosomes in the scan and not their order in the plot.

    Candidate regions as obtained by the function calc_candidate_regions() can be added to the plot as parameter cr. Individual markers can be highlighted by setting argument mrk to a vector of marker IDs or a data frame with positions (containing columns with name CHR and POSITION).

    +

    By default, chromosomes are separated by an inset of 5,000,000 bases. This value can be increased by the corresponding parameter in order to further reduce overlap between data points of neighboring chromosomes.

    In order to reduce the number of plotted data points, the data set can be rasterized in both dimensions by parameter resolution. The data points are then rounded to the specified resolution and duplicate points removed.

    Furthermore, it is possible to specify a subset or a re-ordering of chromosomes with help of parameter chr.name as in Figure 5.7.

    # re-define colors
    @@ -1491,12 +1501,13 @@ 

    5.4 Genome wide score plots: the main = "iHS (CGU cattle breed)", cr = cr.cgu, mrk = "F1205400", - resolution = c(200000, 0.05))

    + inset = 1E+7, + resolution = c(200000, 0.05))
    > Rasterization reduced 6958 data points to 6377 .
    # set back to default colors
     palette("default")
    -Graphical output of the manhattanplot() function +Graphical output of the manhattanplot() function

    Figure 5.7: Graphical output of the manhattanplot() function

    @@ -1504,7 +1515,7 @@

    5.4 Genome wide score plots: the

    5.5 Genome wide score plots: the function manhattan() of package qqman

    -

    The package qqman contains a function manhattan() which is similar to the function manhattanplot() of this package. The input data frame is expected to have a slightly different format, though. Hence, before plotting we need to “translate” our data as in the following example with ihs values:

    +

    The package qqman contains a function manhattan() which is similar to the function manhattanplot() of this package. The input data frame is expected to have a slightly different format, though. Hence, before plotting we need to “translate” our data as in the following example with iHS values:

    # extract data frame from result list
     ihs <- wgscan.ihs.cgu$ihs
     # create new data frame
    @@ -1596,7 +1607,7 @@ 

    6.2 The functions calc_furc allele = 0, side = "left", hap.names = hap.names(haplohh_cgu_bta12))

    -

    Such a string can be rendered graphically e.g. by the R package ape yielding Figure ??:

    +

    Such a string can be rendered graphically e.g. by the R package ape yielding Figure 6.4:

    library(ape)
     tree <- ape::read.tree(text = newick)
     plot(tree, 
    @@ -1605,16 +1616,16 @@ 

    6.2 The functions calc_furc edge.color = "blue", underscore = TRUE, no.margin = TRUE)

    -
    +
    Graphical output of the ape::plot.phylo() function

    -(#fig:newick, )Graphical output of the ape::plot.phylo() function +Figure 6.4: Graphical output of the ape::plot.phylo() function

    6.3 The functions calc_haplen() and plot.haplen()

    -

    The length of a particular extended shared haplotype in a sample can be defined as the range that a chromosome shares a haplotype with at least one other chromosome. For a given chromosome it corresponds to the maximal extension of the inner branches to both sides of the focal marker in a furcation diagram. The function calc_haplen() calculates this quantity:

    +

    To each haplotype in the sample the length of its longest shared haplotype is assigned, i.e. the range over which it is identical to at least one other haplotype (the latter might be different left and right to the focal marker). It corresponds to the maximal extension of the inner branches to both sides of the focal marker in a furcation diagram. The function calc_haplen() calculates this quantity:

    haplen <- calc_haplen(furcation)

    The haplen object is a list with four elements:

      @@ -1637,15 +1648,15 @@

      6.3 The functions calc_hapl > 4 1 Derived 25932431 31009388 > 5 1 Derived 24259278 32960675 > 6 1 Derived 26683271 31457115 -

      Haplotype length is visualized in Figure 6.4 obtained by the command

      +

      Haplotype length is visualized in Figure 6.5 obtained by the command

      plot(haplen)
      Graphical output of the plot.haplen() function

      -Figure 6.4: Graphical output of the plot.haplen() function +Figure 6.5: Graphical output of the plot.haplen() function

      -

      As with a furcation plot, one can zoom into the picture using the parameter xlim and select alleles using parameter allele. The following command yields Figure 6.5 showing only the region to the “left” of the focal marker and only the chromosomes of the ancestral core allele.

      +

      As with a furcation plot, one can zoom into the picture using the parameter xlim and select alleles using parameter allele. The following command yields Figure 6.6 showing only the region to the “left” of the focal marker and only the chromosomes of the ancestral core allele.

      plot(haplen,
            allele = 0,
            xlim = c(haplen$xlim[1], haplen$position),
      @@ -1655,10 +1666,10 @@ 

      6.3 The functions calc_hapl
      Graphical output of the plot.haplen() function

      -Figure 6.5: Graphical output of the plot.haplen() function +Figure 6.6: Graphical output of the plot.haplen() function

      -

      We give a small example on how to check visual results by directly accessing the data of the haplohh-class object (see also its documentation by ?"haplohh-class"): from Figures ?? or 6.5 it appears that two extended shared haplotypes reach the left border and we might identify them by their labels as CGU_MN147_2 and CGU_MN153_2. We can prove that there are indeed exactly two haplotypes in the sample which are identical within the complete region to the “left” of the focal marker:

      +

      We give a small example on how to check visual results by directly accessing the data of the haplohh-class object (see also its documentation by ?"haplohh-class"): from Figures 6.4 or 6.6 it appears that two extended shared haplotypes reach the left border and we might identify them by their labels as CGU_MN147_2 and CGU_MN153_2. We can prove that there are indeed exactly two haplotypes in the sample which are identical within the complete region to the “left” of the focal marker:

      # finding the index number of marker "F1205400"
       mrk.nr = which(mrk.names(haplohh_cgu_bta12) == "F1205400")
       # subset of all markers on the "left" of the focal one
      @@ -1693,12 +1704,12 @@ 

      6.3 The functions calc_hapl

      7 Data considerations

      7.1 Multi-allelic markers

      -

      For many species, a low per-site mutation rate ensures that the vast majority of Single Nucleotide Polymorphisms (SNPs) appears only with two alleles in a sample. Hence bi-allelic SNPs will constitute the foremost kind of data to apply our package onto. However, we think the ability to calculate the statistics for multi-allelic markers might be useful

      +

      For many species, a low per-site mutation rate ensures that the vast majority of Single Nucleotide Polymorphisms (SNPs) is observed with only two alleles in a sample. Hence bi-allelic SNPs will constitute the foremost kind of data to apply our package onto. However, we think the ability to calculate the statistics for multi-allelic markers might be useful

      • for species and/or genomic regions with a high per-site mutation rate
      • for genetic variation in form of (short) tandem repeats or (larger) copy number variations which are multi-allelic by definition and may carry information not captured by SNP markers
      • because the relative rarity of multi-allelic SNPs might make these particularly interesting
      • -
      • since the original approach of (Sabeti et al. 2002) was not to compare extended haplotype homozygosity between core alleles of a single SNP, but between multiple core haplotypes defined by several neighboring SNPs; such a partition could be created in rehh by adding an artificial multi-allelic marker.
      • +
      • since the original approach of (Sabeti et al. 2002) did not compare EHH on the two core alleles of a single SNP, but for multiple core haplotypes defined by several neighboring SNPs; such a partition could be created in rehh by adding an artificial multi-allelic marker.
      @@ -1720,36 +1731,36 @@

      7.4 Dealing with multiple markers
    1. The variant call format allows to specify different kinds of markers in the same file. Hence at a certain position one might observe a SNP as well as an Insertion/Deletion or a tandem repeat.
    2. In output of ms the positions are given with a pre-set precision and consequently the positions of two different segregating sites might be rounded to the same number.
    3. -

      Since it is unclear how rehh should handle such markers, they are not accepted as input. Ideally, multiple markers should be dealt with by a pre-processing of the data outside of the package. As a quick-and-dirty work-around, we offer the option remove_multiple_markers, which, if set to TRUE, removes all but the first marker with identical positions.

      +

      Since it is unclear how rehh should handle such markers, they are not accepted as input. Ideally, multiple markers should be dealt with by a pre-processing of the data outside of the package. As a quick-and-dirty work-around, we offer the option remove_multiple_markers in function data2haplohh(), which, if set to TRUE, removes all but the first marker with identical positions.

      7.5 Dealing with unphased data

      -

      Notwithstanding expensive experimental methods, current high-throughput genotyping / sequencing technologies cannot directly assign alleles to specific chromosomes of a heterozygous diploid (or multiploid) individual. Instead, this task of phasing is typically performed by specialized bioinformatic tools like the previously mentioned SHAPEIT (O’Connell et al. 2014) and fastPHASE (Scheet and Stephens 2006). Although computationally demanding, the application of these tools is straight-forward and the results usually of sufficient quality for the calculation of EHH based statistics. Typically, the tools interpolate missing values away.

      +

      Notwithstanding expensive experimental methods, current high-throughput genotyping / sequencing technologies cannot directly assign alleles to specific chromosomes of a heterozygous diploid (or multiploid) individual. Instead, this task of phasing is typically performed by specialized bioinformatic tools like SHAPEIT (O’Connell et al. 2014) and fastPHASE (Scheet and Stephens 2006). Although computationally demanding, the application of these tools is straight-forward and the results usually of sufficient quality for the calculation of EHH based statistics. Typically, the tools interpolate missing values away.

      In the presumably rare cases where phasing is not feasible, EHH or EHHS can only be meaningfully estimated by reducing the set of compared chromosomes to those of homozygous (at the focal marker) individuals (assuming that the input data is ordered correspondingly). However, this reduction entails a substantial loss of power; even by an adapted parameter setting (see below) at the very minimum 10, but better up to 30 sequences are needed to obtain at least moderately accurate estimations.

      For the within-population statistic iHS, the latter requirement concerns both major and minor alleles of a marker and scans should not be performed on samples comprising less than 100 sequences. Even for sample sizes of 200 sequences, meaningful estimation of iHS is hence restricted to variants of intermediate frequencies, i.e. high minor frequency.

      -

      For the cross-population statistics Rsb and XP-EHH a minimum number of 30 sequences from homozygous individuals is usually fulfilled if the sample size of each population exceeds 60 sequences.

      -

      For unphased sequences

      +

      For the cross-population statistics Rsb and XP-EHH a minimum number of 30 sequences from homozygous individuals is usually fulfilled if the sample size of each population exceeds 60 sequences. +Hence, for unphased sequences the following parameters should be set:

      • the option phased of the functions calc_ehh(), calc_ehhs() and scan_hh() must be set to FALSE. However, if the data is actually phased, this entails a substantial loss of power to detect selection!
      -

      Most of the variance (and hence “noise”) comes at any marker from the longest shared haplotypes. To limit their contribution

      +

      A few shared haplotypes of extreme length are usually encountered in neutrally evolving regions. In order to limit this “statistical noise”, cut-off rules are for unphased sequences even more important than they are for phased ones

      • the cut-off for the calculation of EHH resp. EHHS defined by option limehh resp. limehhs should be increased from the default value of 0.05 to 0.1.
      • in function ihh2ihs(), hence for a within-population scan using iHH values, in addition to the filtering by the MAF of core alleles (parameter min_maf, default 0.05), a minimum absolute number of evaluated haplotypes should be set by parameter min_nhaplo; this value should never be lower than 10 and, if the sample size allows, be as high as 30.
      • if the latter option is set to 20 or lower, a further cut-off, supplementing limehh, namely limhomohaplo should be set to 4, meaning that calculation of EHH/EHHS is stopped when less than 4 sequences (two individuals) remain homozygous.
      -

      See (Klassmann et al. 2020) for a study on estimating iHS, Rsb and XP-EHH using unphased sequences.

      +

      See (Klassmann and Gautier 2020) for a study on estimating iHS, Rsb and XP-EHH using unphased sequences.

      7.6 Dealing with unpolarized data

      -

      The designation of alleles as ‘ancestral’ or ‘derived’ is referred to as polarization. Since sequences of ancient genomes are available only for a few species and restricted to a limited time back to the past, the ‘ancestral’ allele is usually inferred to be the one carried by one or more outgroup species such as chimpanzees or gorillas for humans. However, this presupposes the existence of a reference sequence of suitable ‘neighbor’ species of sufficient quality as well as reliable genome-wide alignments. These requirements are not trivial and even if they are fulfilled, any alignment will not cover the whole genome and the covered part will contain mis-specified alleles due to invisible secondary or back-mutations (Baudry and Depaulis 2003), possibly causing spurious signals of selection (Hernandez et al. 2007).

      -

      Note that the bin-wise standardization of \(iHS\) is the only calculation step within our package where the information about ancestry is exploited. The information of ancestry status is valuable since the expected values under neutral evolution depend on the respective allele frequencies at the focal marker (see Figure 5.1 of this vignette and Figure 4 of (Voight et al. 2006)). The binning of markers by frequency before its standardization (see section 4.1.2) is aimed to eliminate most of this dependence. For unpolarized alleles this correction cannot be done.

      +

      The designation of alleles as ‘ancestral’ or ‘derived’ is referred to as polarization. Since sequences of ancient genomes are available only for a few species and restricted to a limited time back to the past, the ‘ancestral’ allele is usually inferred to be the one carried by one or more outgroup species such as chimpanzees or gorillas for humans. However, this presupposes the existence of a reference sequence of a suitable ‘neighbor’ species of sufficient quality as well as reliable genome-wide alignments. These requirements are not trivial and even if they are fulfilled, any alignment will not cover the whole genome and the covered part will contain mis-specified alleles due to invisible secondary or back-mutations (Baudry and Depaulis 2003), possibly causing spurious signals of selection (Hernandez et al. 2007).

      +

      Note that the bin-wise standardization of iHS is the only calculation step within our package where the information about ancestry is exploited. The information of ancestry status is valuable since the expected values under neutral evolution depend on the respective allele frequencies at the focal marker (see Figure 5.1 of this vignette and Figure 4 of (Voight et al. 2006)). The binning of markers by frequency before its standardization (see section 4.1.2) is aimed to eliminate most of this dependence. For unpolarized alleles this correction cannot be done.

      Hence two parameters are important when dealing with unpolarized data:

      • since in this case the internal coding of alleles as 0,1,2 etc. conveys no information about ancestral status, it is appropriate to set the parameter polarized in function scan_hh to FALSE: iHH values are then computed for the major (most frequent) and minor (second-most frequent) alleles.
      • the standardization of iHS for such data should NOT be done in a frequency-dependent manner and consequently the parameter freqbin in the function ihh2ihs() should be set to 1 (one bin).
      -

      Simulations have shown that neglecting ancestry status reduces the strength of the signal, yet conspicuous values remain clustered; consequently, a delineation of candidate regions of selection should rely primarily on this latter feature. For a more detailed analysis see (Klassmann et al. 2020).

      +

      Simulations have shown that neglecting ancestry status reduces the strength of the signal, yet conspicuous values remain clustered; consequently, a delineation of candidate regions of selection should rely primarily on this latter feature. For a more detailed analysis see (Klassmann and Gautier 2020).

      @@ -1763,19 +1774,35 @@

      8 Differences to the program \(\sqrt{\frac{1}{n}\sum(x_i-\bar{x})^2}\) for the standard deviation while rehh uses \(\sqrt{\frac{1}{n-1}\sum(x_i-\bar{x})^2}\).

    4. The bins used by hapbin for the standardization of iHS cover the whole interval ]0,1], while in rehh they span the interval [min_maf,1-min_maf[. Hapbin includes the upper endpoint into each bin, while rehh includes by default the lower endpoint. The latter can be changed by setting the parameter right of ihh2ihs() to TRUE.

    5. The default number of bins is 50 in hapbin, yielding a bin width of 0.02. The default width in rehh is 0.025 (yielding 36 bins, see point above!). Setting the number of bins in hapbin to 40 with option -b or --bin yields a bin width of 0.025.

    6. -
    7. If run in default mode, hapbin calculates EHH by (notation as in section 3.1.1) -\[\begin{equation*} -\mathrm{EHH}^a_{s,t}=\sum_{k=1}^{K^a_{s,t}}\left(\frac{n_k}{n_a}\right)^2\;. -\end{equation*}\] -For a set of \(n\) chromosomes this estimator reaches its minimum value of \(\frac{1}{n}\) if all of them are distinct. Yet formula (3.1) used by rehh and applied by hapbin if run with option -a or --binom returns zero in this case. The difference reflects distinct sampling strategies, either with replacement or without. For increasing sample size both converge. -The same holds for EHHS.

    8. +
    9. hapbin uses by default another estimator for homozygosity than rehh (see section 9). +If run with option -a or --binom, it uses the same as rehh.

    10. Integration over EHH resp. EHHS is performed by hapbin on the area between the curve spanned by these quantities and the x-axis (y=0) while rehh by default integrates only over the part of that area that is above the threshold set by the parameters limehh resp. limehhs, i.e. the area between the curve and the line y=threshold. This is not to be confused with the condition for truncation at left and right ends of the curve which is (for all practical purposes) handled identically by both programs. Setting in rehh the parameter lower_y_bound to zero makes the integration identical to that of hapbin. As mentioned above, limehh(s) of rehh corresponds to -c or --cutoff of hapbin.

    11. By default, the parameter discard_integration_at_border is TRUE in rehh. It has to be set to FALSE in order to conform to hapbin.

    12. Large differences can arise from different handling of gaps during the integration of EHH resp. EHHS yielding iHH resp. iES. Hapbin has a parameter -s or --scale to “down-weight” large gaps by capping them to the specified value. Its default value is 20000 while the corresponding option in rehh is turned off by default, but can be set by scalegap. -The option maxgap within rehh leads to a stop of the integration and if the parameter discard_integration_at_border is set to TRUE, then no value is reported. This has no counterpart in hapbin. Instead, hapbin allows to specify a maximum length of Extended Haplotypes (disabled by default) which is not possible in rehh. -

    13. +The option maxgap within rehh leads to a stop of the integration and if the parameter discard_integration_at_border is set to TRUE, then no value is reported. This has no counterpart in hapbin. Instead, hapbin allows to specify a maximum length of Extended Haplotypes (disabled by default) which is available as option of the function scan_hh_full() in rehh.

    +
    +

    9 About estimating homozygosity

    +

    The term homozygosity as component of the abbreviation EHH refers to the probability that two arbitrarily chosen chromosomes from a large population are identical at a given locus or in a given region. It does not make any statement about homozygosity of individuals or even presuppose that individuals are diploid. One might even argue whether the term homogeneity would have been more appropriate.

    +

    If there are \(K\) alleles in the population and each allele has a population frequency of \(f_k\), then this probability is given by +\[H=\sum_{k=1}^{K}f_k^2\;.\] +For each allele \(k\), its population frequency can be estimated by its sample frequency \(x_k\): if the sample contains \(n\) chromosomes and allele \(k\) is observed \(n_k\) times, then +\[\hat{f_k}=\frac{n_k}{n}=x_k\;.\] +It seems straightforward to estimate the population homozygosity from a sample by +\[\hat{H_1}=\sum_{k=1}^Kx_k^2=\sum_{k=1}^K\left(\frac{n_k}{n}\right)^2\;.\] +However, it turns out that this estimator is biased (it yields values that tend to be slightly too high). The following estimator, instead, is unbiased (Nei and Roychoudhury 1974): +\[\hat{H_2}=\frac{n}{n-1}\sum_{k=1}^{K}x_k^2-\frac{1}{n-1}\;.\] +The latter is used by rehh. We can see this e.g. in Equation (3.3), when we consider each (shared) haplotype in the region between markers \(s\) and \(t\) as an allele. We get

    +

    \[EHHS=\frac{1}{n(n-1)}\sum_{k=1}^{K_{s,t}}n_k(n_k-1)=\frac{n}{n-1}\frac{1}{n^2}\sum_{k=1}^{K_{s,t}}(n_k^2-n_k)=\frac{n}{n-1}\left(\sum_{k=1}^{K_{s,t}}\frac{n_k^2}{n^2}-\frac{n}{n^2}\right)=\hat{H_2}\;.\] +hapbin, in contrast, uses by default estimator \(\hat{H_1}\) and refers to \(\hat{H_2}\) as the “alternative” estimator. Evidently, for large \(n\), the difference between the two becomes negligible. For small \(n\) this is not necessarily so. If we consider a minimal sample of two non-identical chromosomes, hence \(n=2\) and \(K=2\), then we have +\[\hat{H_1}=\left(\frac{1}{2}\right)^2+\left(\frac{1}{2}\right)^2=\frac{1}{2}\] +and +\[\hat{H_2}=\frac{1}{2\cdot 1}(1\cdot 0+1\cdot 0)=0\;.\] +Interestingly, although \(\hat{H_1}\) is biased, it yields values which are on average closer to the population value than \(\hat{H_2}\), since the variance of \(\hat{H_1}\) is smaller than that of \(\hat{H_2}\) (Nei and Roychoudhury 1974).

    +

    It is unlikely, though, that the choice of the estimator has a major effect on detecting selection. +

    +

    References

    @@ -1810,17 +1837,23 @@

    References

    Kelleher J., Etheridge A. M., McVean G., 2016 Efficient coalescent simulation and genealogical analysis for large sample sizes. PLoS Comput Biol 12: 1–22.

    -

    Klassmann A., Vitalis R., Gautier M., 2020 Detecting selection using extended haplotype homozygosity (EHH)-based statistics on unphased or unpolarized data (preprint).

    +

    Klassmann A., Gautier M., 2020 Detecting selection using extended haplotype homozygosity-based statistics on unphased or unpolarized data (preprint). https://doi.org/10.22541/au.160405572.29972398/v1.

    Maclean C. A., Hong N. P. C., Prendergast J. G. D., 2015 Hapbin: An efficient program for performing haplotype-based scans for positive selection in large genomic datasets. Mol Biol Evol 32: 3027–3029.

    +
    +

    Nei M., Roychoudhury K., 1974 Sampling Variances of Heterozygosity and Genetic Distance. Genetics 76: 379–390.

    +

    O’Connell J., Gurdasani D., Delaneau O., Pirastu N., Ulivi S., others, 2014 A general approach for haplotype phasing across the full spectrum of relatedness. PLoS Genet 10: e1004234.

    Oleksyk T. K., Smith M. W., O’Brien S. J., 2010 Genome-wide scans for footprints of natural selection. Philosophical Transactions of the Royal Society B: Biological Sciences 365: 185–205.

    +
    +

    Sabeti P. C., 2006 Positive natural selection in the human lineage. Science 312: 1614–1620.

    +

    Sabeti P. C., Reich D. E., Higgins J. M., Levine H. Z. P., Richter D. J., others, 2002 Detecting recent positive selection in the human genome from haplotype structure. Nature 419: 832–837.

    @@ -1833,12 +1866,18 @@

    References

    Tang K., Thornton K. R., Stoneking M., 2007 A new approach for using genome scans to detect recent positive selection in the human genome. PLoS Biol 5: e171.

    +
    +

    Utsunomiya Y. T., Pérez O’Brien A. M. P., Sonstegard T. S., Sölkner J., Garcia J. F., 2015 Genomic data as the "hitchhiker’s guide" to cattle adaptation: Tracking the milestones of past selection in the bovine genome. Frontiers in Genetics 5: 1–13.

    +

    Vitti J. J., Grossman S. R., Sabeti P. C., 2013 Detecting natural selection in genomic data. Annual Review of Genetics 47: 97–120.

    Voight B. F., Kudaravalli S., Wen X., Pritchard J. K., 2006 A map of recent positive selection in the human genome. PLoS Biol 4: e72.

    +
    +

    Weigand H., Leese F., 2018 Detecting signatures of positive selection in non-model species using genomic data. Zoological Journal of the Linnean Society 184: 528–583.

    +
    diff --git a/inst/extdata/example_neutral.vcf b/inst/extdata/example_neutral.vcf new file mode 100644 index 0000000..bca990d --- /dev/null +++ b/inst/extdata/example_neutral.vcf @@ -0,0 +1,17 @@ +##fileformat=VCFv4.2 +##reference=NCBI38 +##contig= +##INFO= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT HG1 HG2 HG3 HG4 +chr1 10000 s1 G T 100 PASS AA=G GT 0|1 0|0 0|0 0|0 +chr1 20000 s2 G A 100 PASS AA=G GT 0|0 0|1 1|0 0|1 +chr1 30000 s3 T C 100 PASS AA=T GT 1|0 0|0 0|0 0|0 +chr1 40000 s4 A C 100 PASS AA=A GT 0|0 1|0 0|0 0|1 +chr1 50000 s5 T G 100 PASS AA=T GT 1|0 0|1 0|1 1|0 +chr1 60000 s6 G A 100 PASS AA=A GT 1|1 1|1 1|1 1|0 +chr1 70000 s7 C T 100 PASS AA=C GT 0|0 0|0 0|0 0|1 +chr1 80000 s8 C T 100 PASS AA=C GT 1|0 0|0 1|0 1|0 +chr1 90000 s9 T A 100 PASS AA=T GT 0|0 1|0 0|0 0|1 +chr1 100000 s10 A G 100 PASS AA=A GT 1|0 0|0 0|0 0|0 +chr1 110000 s11 C T 100 PASS AA=T GT 1|1 0|1 0|1 1|1 diff --git a/inst/extdata/example_sweep.vcf b/inst/extdata/example_sweep.vcf new file mode 100644 index 0000000..0a87a12 --- /dev/null +++ b/inst/extdata/example_sweep.vcf @@ -0,0 +1,17 @@ +##fileformat=VCFv4.2 +##reference=NCBI38 +##contig= +##INFO= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT HG1 HG2 HG3 HG4 +chr1 10000 s1 G T 100 PASS AA=G GT 0|0 0|0 0|0 0|0 +chr1 20000 s2 G A 100 PASS AA=G GT 0|1 1|1 1|1 1|1 +chr1 30000 s3 T C 100 PASS AA=T GT 1|0 0|0 0|0 0|0 +chr1 40000 s4 A C 100 PASS AA=A GT 0|1 1|1 1|1 1|1 +chr1 50000 s5 T G 100 PASS AA=T GT 1|0 0|0 0|0 0|0 +chr1 60000 s6 G A 100 PASS AA=A GT 1|0 0|0 0|0 0|0 +chr1 70000 s7 C T 100 PASS AA=C GT 0|1 1|1 1|1 1|1 +chr1 80000 s8 C T 100 PASS AA=C GT 1|0 0|0 0|0 0|0 +chr1 90000 s9 T A 100 PASS AA=T GT 0|1 1|1 1|1 1|1 +chr1 100000 s10 A G 100 PASS AA=A GT 1|0 0|0 0|0 0|0 +chr1 110000 s11 C T 100 PASS AA=T GT 1|1 1|1 1|1 1|1 diff --git a/inst/extdata/example_sweep_with_recombination.vcf b/inst/extdata/example_sweep_with_recombination.vcf new file mode 100644 index 0000000..1649e9d --- /dev/null +++ 
b/inst/extdata/example_sweep_with_recombination.vcf @@ -0,0 +1,17 @@ +##fileformat=VCFv4.2 +##reference=NCBI38 +##contig= +##INFO= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT HG1 HG2 HG3 HG4 +chr1 10000 s1 G T 100 PASS AA=G GT 0|1 0|0 0|0 0|0 +chr1 20000 s2 G A 100 PASS AA=G GT 0|0 0|1 1|0 1|1 +chr1 30000 s3 T C 100 PASS AA=T GT 1|0 0|0 0|0 0|0 +chr1 40000 s4 A C 100 PASS AA=A GT 0|0 1|0 1|1 1|1 +chr1 50000 s5 T G 100 PASS AA=T GT 1|0 0|0 0|0 0|0 +chr1 60000 s6 G A 100 PASS AA=A GT 1|0 0|0 0|0 0|0 +chr1 70000 s7 C T 100 PASS AA=C GT 0|0 1|1 1|1 1|1 +chr1 80000 s8 C T 100 PASS AA=C GT 1|0 0|0 0|0 0|0 +chr1 90000 s9 T A 100 PASS AA=T GT 0|0 1|0 0|1 1|1 +chr1 100000 s10 A G 100 PASS AA=A GT 1|0 0|0 0|0 0|0 +chr1 110000 s11 C T 100 PASS AA=T GT 1|1 0|1 0|1 1|1 diff --git a/man/calc_ehh.Rd b/man/calc_ehh.Rd index 3fc849d..ec08e00 100644 --- a/man/calc_ehh.Rd +++ b/man/calc_ehh.Rd @@ -99,8 +99,9 @@ ehh <- calc_ehh(haplohh_cgu_bta12, mrk = "F1205400") \references{ Gautier, M. and Naves, M. (2011). Footprints of selection in the ancestral admixture of a New World Creole cattle breed. \emph{Molecular Ecology}, \strong{20}, 3128-3143. -Klassmann, A. et al. (2020). Detecting selection using Extended Haplotype -Homozygosity (EHH)-based statistics on unphased or unpolarized data. (submitted). +Klassmann, A. and Gautier, M. (2020). Detecting selection using Extended Haplotype +Homozygosity-based statistics on unphased or unpolarized data (preprint). +https://doi.org/10.22541/au.160405572.29972398/v1 Sabeti, P.C. et al. (2002). Detecting recent positive selection in the human genome from haplotype structure. \emph{Nature}, \strong{419}, 832-837. diff --git a/man/calc_ehhs.Rd b/man/calc_ehhs.Rd index adb3b25..bfb929b 100644 --- a/man/calc_ehhs.Rd +++ b/man/calc_ehhs.Rd @@ -95,8 +95,9 @@ ehhs <- calc_ehhs(haplohh_cgu_bta12, mrk = "F1205400") \references{ Gautier, M. and Naves, M. (2011). 
Footprints of selection in the ancestral admixture of a New World Creole cattle breed. \emph{Molecular Ecology}, \strong{20}, 3128-3143. -Klassmann, A. et al. (2020). Detecting selection using Extended Haplotype -Homozygosity (EHH)-based statistics on unphased or unpolarized data. (submitted). +Klassmann, A. and Gautier, M. (2020). Detecting selection using Extended Haplotype +Homozygosity-based statistics on unphased or unpolarized data (preprint). +https://doi.org/10.22541/au.160405572.29972398/v1 Sabeti, P.C. et al. (2002). Detecting recent positive selection in the human genome from haplotype structure. \emph{Nature}, \strong{419}, 832-837. diff --git a/man/calc_pairwise_haplen.Rd b/man/calc_pairwise_haplen.Rd index d5a1c58..2f0e700 100644 --- a/man/calc_pairwise_haplen.Rd +++ b/man/calc_pairwise_haplen.Rd @@ -4,7 +4,14 @@ \alias{calc_pairwise_haplen} \title{Calculate pairwise shared haplotype length between all chromosomes} \usage{ -calc_pairwise_haplen(haplohh, mrk, phased = TRUE, maxgap = NA) +calc_pairwise_haplen( + haplohh, + mrk, + phased = TRUE, + maxgap = NA, + max_extend = NA, + side = "both" +) } \arguments{ \item{haplohh}{an object of class \code{haplohh} (see \code{\link{data2haplohh}}).} @@ -13,11 +20,17 @@ calc_pairwise_haplen(haplohh, mrk, phased = TRUE, maxgap = NA) or string representing its ID/name.} \item{phased}{logical. If \code{TRUE} (default) chromosomes are expected to be phased. If \code{FALSE}, the haplotype data is assumed to -consist of pairwise ordered chromosomes belonging to diploid individuals and only the two chromosomes of +consist of pairwise ordered chromosomes belonging to diploid individuals and only the two chromosomes of each individual are compared.} \item{maxgap}{maximum allowed gap in bp between two markers. If exceeded, further calculation is stopped at the gap -(default=\code{NA}, i.e no limitation).} +(default=\code{NA}, i.e. 
no limitation).} + +\item{max_extend}{maximum distance in bp to extend shared haplotypes away from the focal marker. +(default \code{NA}, i.e. no limitation).} + +\item{side}{side to consider, either "left" (positions lower than focal position), "right" (positions higher than focal position) +or "both" (default).} } \value{ The returned value is a matrix with pairwise shared haplotype lengths. @@ -27,7 +40,7 @@ Calculate pairwise shared haplotype length between all chromosomes at a focal ma } \details{ The function computes the length of shared haplotypes (stretches of identical sequence) around -the focal marker. +the focal marker. Note that the function \code{\link{calc_haplen}} calculates for each chromosome the boundaries of its longest shared haplotype; separately upstream and downstream of diff --git a/man/calc_sfs_tests.Rd b/man/calc_sfs_tests.Rd new file mode 100644 index 0000000..99c74bf --- /dev/null +++ b/man/calc_sfs_tests.Rd @@ -0,0 +1,94 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/calc_sfs_tests.R +\name{calc_sfs_tests} +\alias{calc_sfs_tests} +\title{Calculate site frequency spectrum test statistics} +\usage{ +calc_sfs_tests( + haplohh, + polarized = TRUE, + window_size = NA, + overlap = 0, + right = TRUE, + min_n_mrk = 1, + verbose = TRUE +) +} +\arguments{ +\item{haplohh}{an object of class \code{haplohh} (see \code{\link{data2haplohh}})} + +\item{polarized}{logical. \code{TRUE} by default. If \code{FALSE}, use major and minor allele instead of ancestral and derived. If there +are more than two alleles then the minor allele refers to the second-most frequent allele. +Note that Tajima's D remains unchanged, Fay & Wu's H is always zero for folded spectra and Zeng's E becomes equal to Tajima's D.} + +\item{window_size}{size of sliding windows. If \code{NA} (default), there will be only +one window covering the whole length of the chromosome.} + +\item{overlap}{size of window overlap (default 0, i.e. 
no overlap).} + +\item{right}{logical, indicating if the windows should be closed on the right and open on the left (default) or vice versa.} + +\item{min_n_mrk}{minimum number of (polymorphic) markers per window.} + +\item{verbose}{logical. \code{TRUE} by default; reports if multi-allelic sites are removed.} +} +\value{ +A data frame with window coordinates, the number of contained (polymorphic) markers, Watterson's, Tajima's and Zeng's +estimators of theta and the test statistics of Tajima's D, Fay & Wu's H and Zeng's E. +} +\description{ +Calculate site frequency spectrum (SFS) tests Tajima's D, Fay & Wu's H and Zeng's E. +} +\details{ +Neutrality tests based on the site frequency spectrum (SFS) are +largely unrelated to EHH-based methods. The tests provided here are implemented +elsewhere, too (e.g. in package \href{https://cran.r-project.org/package=PopGenome}{PopGenome}). + +Each test compares two estimations of the \emph{scaled mutation rate} theta, +which all have the same expected value under neutrality. Deviations from zero indicate +violations of the neutral null model, typically population size changes, population subdivision or selection. +Tajima's D and Fay & Wu's H become negative in presence of an almost completed sweep, Zeng's E becomes +positive for some time after it. Significance can typically be assigned only by +simulations. + +The standard definition of the tests cannot cope with missing values and typically markers +with missing genotypes must be discarded. Ferretti (2012) provides an extension +that can handle missing values (without discarding any non-missing values). In this package, +only the first moments (the theta-estimators themselves) are adapted accordingly, +but not the second moments (their variances), because the latter is computationally demanding +and the resulting bias relatively small. It is recommended, though, to discard markers or haplotypes +with more than 20\% missing values. 
+ +Multi-allelic markers are always removed since the tests rely on the "infinite sites model" which +implies that all polymorphic markers are bi-allelic. +Monomorphic markers can be present, but are irrelevant for the tests. +} +\examples{ +make.example.files() +# neutral evolution +hh <- data2haplohh("example_neutral.vcf", verbose = FALSE) +calc_sfs_tests(hh) +# strong selective sweep +hh <- data2haplohh("example_sweep.vcf", verbose = FALSE) +calc_sfs_tests(hh) +remove.example.files() +} +\references{ +Watterson, G.A. (1975). On the number of segregating sites in genetical models without recombination. +\emph{Theoretical Population Biology} \strong{7}(2) 256-276. + +Tajima, F. (1983). Evolutionary relationship of DNA sequences in finite populations. +\emph{Genetics} \strong{105}(2) 437-60. + +Tajima, F. (1989). Statistical method for testing the neutral mutation hypothesis by DNA polymorphism. +\emph{Genetics} \strong{123}(3) 585-95. + +Fay, J. and Wu, C. (2000). Hitchhiking under positive Darwinian selection. \emph{Genetics} +\strong{155}(3) 1405-13. + +Zeng, K. et al. (2006). Statistical tests for detecting positive selection by utilizing high-frequency variants. +\emph{Genetics} \strong{174}(3) 1431-9. + +Ferretti, L. and Raineri, E. and Ramos-Onsins, S. (2012). Neutrality tests for sequences with missing data. +\emph{Genetics} \strong{191}(4) 1397-401. +} diff --git a/man/data2haplohh.Rd b/man/data2haplohh.Rd index 1d4c4cc..f99a92a 100644 --- a/man/data2haplohh.Rd +++ b/man/data2haplohh.Rd @@ -70,7 +70,7 @@ Low confidence ancestral alleles are usually coded by lower-case letters. If \co changed to upper case before the alleles of the sample are matched for polarization.} \item{vcf_reader}{library used to read vcf. By default, low-level parsing is -performed using the generic package \code{data.table}. In order to read compressed files, +performed using the generic package \code{data.table}.
In order to read compressed files, the package \code{R.utils} must be installed, too. If the specialized package \code{vcfR} is available, set this parameter to \code{"vcfR"}.} diff --git a/man/haplohh2sweepfinder.Rd b/man/haplohh2sweepfinder.Rd new file mode 100644 index 0000000..131e915 --- /dev/null +++ b/man/haplohh2sweepfinder.Rd @@ -0,0 +1,58 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/haplohh2sweepfinder.R +\name{haplohh2sweepfinder} +\alias{haplohh2sweepfinder} +\title{Translate object of \code{\link{haplohh-class}} into SweepFinder format} +\usage{ +haplohh2sweepfinder(haplohh, polarized = TRUE, verbose = TRUE) +} +\arguments{ +\item{haplohh}{object of class \code{\link{haplohh-class}}.} + +\item{polarized}{logical. If \code{TRUE} (default), flag "folded" is set to 0, otherwise to 1.} + +\item{verbose}{logical. If \code{TRUE} (default), prints filter statements.} +} +\value{ +A dataframe with four columns: +\itemize{ +\item \strong{position} marker position +\item \strong{x} (absolute) frequency of the alternative (derived) variant +\item \strong{n} number of non-missing genotypes +\item \strong{folded} a flag marking polarization +} +} +\description{ +Extract allele frequencies of an object of class \code{\link{haplohh-class}} +and returns a table in SweepFinder input format. +} +\details{ +SweepFinder and SweeD are two stand-alone programs which +implement the same method to detect selective sweeps using the +allele frequency at each site. This function calculates these frequencies +from a \code{\link{haplohh-class}} and returns a table which +can be saved into a file (with tabs as separators, without row names and quotes) that can +be used as input for the two programs. + +Sites with less than two haplotypes genotyped or with more than two alleles are removed. +If \code{polarized}, sites monomorphic for the ancestral allele are removed, too. 
+} +\examples{ +#example +# sweepfinder example from vignette +make.example.files() +hh <- data2haplohh("example_sweep_with_recombination.vcf") +haplohh2sweepfinder(hh) +remove.example.files() +} +\references{ +DeGiorgio, M. and Huber, C.D. and Hubisz, M.J. and Hellmann, I. and Nielsen, R. (2016) +SweepFinder2: increased robustness and flexibility. \emph{Bioinformatics} \strong{32}:1895-1897 + +Pavlidis, P., D. Zivkovic, A. Stamatakis, and N. Alachiotis, (2013) +SweeD: likelihood-based detection of selective sweeps in thousands of genomes. +\emph{Molecular Biology and Evolution} \strong{30}: 2224-34. +} +\seealso{ +\code{\link{haplohh-class}}, \code{\link{data2haplohh}} +} diff --git a/man/make.example.files.Rd b/man/make.example.files.Rd index a117b01..d9e4ed0 100644 --- a/man/make.example.files.Rd +++ b/man/make.example.files.Rd @@ -15,6 +15,9 @@ This function copies the following example files to the current working director \item \code{example2.hap} "example 2" haplotype file in "standard format" \item \code{example2.map} "example 2" marker information file \item \code{example2.vcf} "example 2" as vcf file +\item \code{example_neutral.vcf} "example neutral evolution" as vcf file +\item \code{example_sweep.vcf} "example for a selective sweep (without recombination)" +\item \code{example_sweep_with_recombination.vcf} "example for a selective sweep with recombination" \item \code{ms.out output} from a small simulation by the program 'ms' \item \code{bta12_cgu.hap} an haplotype file in "standard format" \item \code{bta12_cgu.thap} an haplotype file in "transposed format" @@ -24,6 +27,8 @@ This function copies the following example files to the current working director Example 1 was used in (Gautier 2017) to explain the various EHH derived statistics calculated by this package. Example 2 is an extension containing multi-allelic markers and missing values. +Examples for neutral data and sweeps are discussed in a supplement of Klassmann (2020).
+ The bta12 files contain data for 280 haplotypes, originating from 140 individuals belonging to the Creole cattle breed from Guadeloupe, at 1.424 markers mapping to bovine chromosome 12 (BTA12) (Gautier 2011). } @@ -31,6 +36,10 @@ Creole cattle breed from Guadeloupe, at 1.424 markers mapping to bovine chromoso Gautier, M. and Naves, M. (2011). Footprints of selection in the ancestral admixture of a New World Creole cattle breed. \emph{Molecular Ecology}, \strong{20}, 3128-3143. Gautier, M., Klassmann, A. and Vitalis, R. (2017). rehh 2.0: a reimplementation of the R package rehh to detect positive selection from haplotype structure. \emph{Molecular Ecology Resources}, \strong{17}, 78-90. + +Klassmann, A. and Gautier, M. (2020). Detecting selection using Extended Haplotype +Homozygosity-based statistics on unphased or unpolarized data (preprint). +https://doi.org/10.22541/au.160405572.29972398/v1 } \seealso{ \code{\link{data2haplohh}}, \code{\link{remove.example.files}} diff --git a/man/manhattanplot.Rd b/man/manhattanplot.Rd index b185abd..43c459b 100644 --- a/man/manhattanplot.Rd +++ b/man/manhattanplot.Rd @@ -25,6 +25,7 @@ manhattanplot( cex = 0.5, las = 1, pch = 20, + inset = 5e+06, resolution = NULL, ... ) @@ -76,6 +77,8 @@ Values of 1, 2, 3 and 4, respectively indicate positions below, to the left of, \item{pch}{type of the points representing markers in the plot(s) (see \code{\link[graphics]{points}}).} +\item{inset}{inset (in bases) between chromosomes to avoid overlap of data points. Default: 5,000,000 bases.} + \item{resolution}{Rasterize data points to the specified resolution and remove duplicate points. Defaults to NULL, i.e. no rasterization. 
A typical value might be \code{c(1E5, 0.01)}, meaning that resolution on the x-axis (chromosomal position) is 100000 and on the y-axis (score or p-value) is 0.01.} @@ -93,6 +96,9 @@ The color of chromosomes is taken from the "Graphics Palette", see \code{\link[g If a single chromosome is plotted, a genomic region can be specified by argument \code{xlim}. + +Other statistics can be plotted as well, although a warning is issued. They must be given by a data.frame +with columns CHR and POSITION and the statistic in the third column. } \examples{ library(rehh.data) diff --git a/man/plot.haplohh.Rd b/man/plot.haplohh.Rd index 9e2aed4..54dc6cb 100644 --- a/man/plot.haplohh.Rd +++ b/man/plot.haplohh.Rd @@ -27,6 +27,8 @@ pos.lab.mrk = "top", srt.hap = 0, srt.mrk = 0, + highlight.mrk = NULL, + highlight.mrk.col = c("lightgray", "black", "darkgray"), ... ) } @@ -85,6 +87,10 @@ consecutive sequences are set into the specified colors.} \item{srt.mrk}{rotation of marker labels (see \code{\link[graphics]{par}}).} +\item{highlight.mrk}{vector of markers to be highlighted} + +\item{highlight.mrk.col}{color for each allele (as coded internally) at highlighted markers.} + \item{...}{other parameters to be passed to \code{\link[graphics]{plot.default}}.} } \description{ @@ -99,13 +105,13 @@ markers produces an error. 
\examples{ #example haplohh object make.example.files() -hh <- data2haplohh(hap_file = "example1.hap", +hh <- data2haplohh(hap_file = "example1.hap", map_file = "example1.map", allele_coding = "01") plot(hh) -hh <- data2haplohh(hap_file = "example2.hap", - map_file = "example2.map", - allele_coding = "01", +hh <- data2haplohh(hap_file = "example2.hap", + map_file = "example2.map", + allele_coding = "01", min_perc_geno.mrk = 50) plot(hh) remove.example.files() diff --git a/man/rehh-package.Rd b/man/rehh-package.Rd index d137f79..adc64e2 100644 --- a/man/rehh-package.Rd +++ b/man/rehh-package.Rd @@ -20,13 +20,8 @@ Population genetic data such as 'Single Nucleotide 'Rsb' (Tang 2007) and 'XP-EHH' (Sabeti 2007) , targeted at differential selection between two populations. - Various plotting functions are also included to facilitate - visualization and interpretation of these statistics. - Due to changes in the API, albeit mostly minor, - versions 3.X are not compatible with versions 2.0.X. - Note: optionally, vcf files can be imported using package vcfR. That package - is currently removed from CRAN, but can still be installed from - following instructions there. + Various plotting functions are included to facilitate + visualization and interpretation of these statistics. } \details{ See \code{vignette("rehh", package = "rehh")} for an overview of the package and @@ -42,6 +37,10 @@ genome-wide SNP data from haplotype structure. \emph{Bioinformatics}, \strong{28 Gautier M., Klassmann A., and Vitalis R. (2017). rehh 2.0: a reimplementation of the R package rehh to detect positive selection from haplotype structure. \emph{Molecular Ecology Resources}, \strong{17}, 78-90. +Klassmann, A. and Gautier, M. (2020). Detecting selection using Extended Haplotype +Homozygosity-based statistics on unphased or unpolarized data (preprint). +https://doi.org/10.22541/au.160405572.29972398/v1 + Sabeti, P.C. et al. (2002). 
Detecting recent positive selection in the human genome from haplotype structure. \emph{Nature}, \strong{419}, 832-837. Sabeti, P.C. et al. (2007). Genome-wide detection and characterization of positive selection in human populations. \emph{Nature}, \strong{449}, 913-918. @@ -55,7 +54,7 @@ Useful links: \itemize{ \item \url{https://CRAN.R-project.org/package=rehh} \item \url{https://gitlab.com/oneoverx/rehh} - \item Report bugs at \url{https://gitlab.com/oneoverx/rehh/issues} + \item Report bugs at \url{https://gitlab.com/oneoverx/rehh/-/issues} } } diff --git a/man/scan_hh.Rd b/man/scan_hh.Rd index 36b20ed..17894ec 100644 --- a/man/scan_hh.Rd +++ b/man/scan_hh.Rd @@ -72,14 +72,14 @@ The returned value is a dataframe with markers in rows and the following columns \item sample frequency of the ancestral / major allele \item sample frequency of the second-most frequent remaining allele \item number of evaluated haplotypes at the focal marker for the ancestral / major allele -\item number of evaluated haplotypes at the focal marker for the second-most frequent remaining allele +\item number of evaluated haplotypes at the focal marker for the second-most frequent remaining allele \item iHH of the ancestral / major allele \item iHH of the second-most frequent remaining allele \item iES (used by Sabeti et al 2007) \item inES (used by Tang et al 2007)} Note that in case of unphased data the evaluation is restricted to haplotypes of homozygous individuals which reduces the power -to detect selection, particularly for iHS (for appropriate parameter setting +to detect selection, particularly for iHS (for appropriate parameter setting see the main vignette and Klassmann et al (2020)). } \description{ @@ -93,7 +93,7 @@ times faster as a procedure calling in turn \code{calc_ehh} and \code{calc_ehhs} for all markers. To perform a whole genome-scan this function needs to be called for each chromosome and results concatenated. 
-Note that setting \code{limehh} or \code{limehhs} to zero is likely to reduce power, +Note that setting \code{limehh} or \code{limehhs} to zero is likely to reduce power, since even under neutrality a tiny fraction (<<0.05) of extremely long shared haplotypes is expected which, if fully accounted for, would obfuscate the signal at selected sites. } @@ -106,8 +106,9 @@ scan <- scan_hh(haplohh_cgu_bta12) \references{ Gautier, M. and Naves, M. (2011). Footprints of selection in the ancestral admixture of a New World Creole cattle breed. \emph{Molecular Ecology}, \strong{20}, 3128-3143. -Klassmann, A. et al. (2020). Detecting selection using Extended Haplotype -Homozygosity (EHH)-based statistics on unphased or unpolarized data. (submitted). +Klassmann, A. and Gautier, M. (2020). Detecting selection using Extended Haplotype +Homozygosity-based statistics on unphased or unpolarized data (preprint). +https://doi.org/10.22541/au.160405572.29972398/v1 Sabeti, P.C. et al. (2002). Detecting recent positive selection in the human genome from haplotype structure. \emph{Nature}, \strong{419}, 832-837. diff --git a/man/scan_hh_full.Rd b/man/scan_hh_full.Rd index 9d7d0c6..7c6f092 100644 --- a/man/scan_hh_full.Rd +++ b/man/scan_hh_full.Rd @@ -9,6 +9,7 @@ scan_hh_full( phased = TRUE, polarized = TRUE, maxgap = NA, + max_extend = NA, discard_integration_at_border = TRUE, geometric.mean = FALSE, threads = 1 @@ -25,14 +26,17 @@ EHH(S) is then estimated over individuals which are homozygous at the focal mark are more than two alleles then the minor allele refers to the second-most frequent allele.} \item{maxgap}{maximum allowed gap in bp between two markers. If exceeded, further calculation of EHH(S) is stopped at the gap -(default=\code{NA}, i.e no limitation).} +(default=\code{NA}, i.e. no limitation).} -\item{discard_integration_at_border}{logical. 
If \code{TRUE} (default) and computation of any of the statistics reaches first or last +\item{max_extend}{maximum distance in bp to extend shared haplotypes away from the focal marker. +(default \code{NA}, i.e. no limitation).} + +\item{discard_integration_at_border}{logical. If \code{TRUE} (default) and computation of any of the statistics reaches first or last marker or a gap larger than \code{maxgap}, iHH, iES and inES are set to \code{NA}.} \item{geometric.mean}{logical. If \code{FALSE} (default), the standard arithmetic mean is used to average -shared haplotype lengths. If \code{TRUE} -the geometric mean is used instead (IES values are undefined in this case). Note that usage of the geometric mean has not +shared haplotype lengths. If \code{TRUE} +the geometric mean is used instead (IES values are undefined in this case). Note that usage of the geometric mean has not yet been studied formally and should be considered experimental!} \item{threads}{number of threads to parallelize computation} @@ -45,24 +49,23 @@ The returned value is a dataframe with markers in rows and the following columns \item sample frequency of the ancestral / major allele \item sample frequency of the second-most frequent remaining allele \item number of evaluated haplotypes at the focal marker for the ancestral / major allele -\item number of evaluated haplotypes at the focal marker for the second-most frequent remaining allele +\item number of evaluated haplotypes at the focal marker for the second-most frequent remaining allele \item iHH of the ancestral / major allele \item iHH of the second-most frequent remaining allele \item iES (used by Sabeti et al 2007) \item inES (used by Tang et al 2007)} Note that in case of unphased data the evaluation is restricted to haplotypes of homozygous individuals which reduces the power -to detect selection, particularly for iHS (for appropriate parameter setting +to detect selection, particularly for iHS (for appropriate parameter setting see the 
main vignette and Klassmann et al (2020)). } \description{ Compute integrated EHH (iHH), integrated EHHS (iES) and integrated normalized EHHS (inES) for all markers of a chromosome (or linkage group). -This function computes the statistics by a slightly different algorithm than \code{\link{scan_hh}}: it sidesteps the calculation of EHH and EHHS values and their subsequent integration and -consequently no cut-offs relying on these values can be specified. Instead -it computes the full lengths of pairwise shared haplotypes and averages them afterwords. +This function computes the statistics by a slightly different algorithm than \code{\link{scan_hh}}: it sidesteps the calculation of EHH and EHHS values and their subsequent integration and +consequently no cut-offs relying on these values can be specified. Instead, +it computes the (full) lengths of pairwise shared haplotypes and averages them afterwards. -This function is (as yet) exclusively intended for the study of general properties of these statistics -using simulated data. The omission of all cut-offs is not recommended for a scan on experimental data. +This function is primarily intended for the study of general properties of these statistics using simulated data. } \details{ Integrated EHH (iHH), integrated EHHS (iES) and integrated normalized EHHS (inES) @@ -70,29 +73,43 @@ are computed for all markers of the chromosome (or linkage group). This function the computation of EHH and EHHS values and their stepwise integration. Instead, the length of all shared haplotypes is computed and afterwords averaged. In the absence of missing values the statistics are identical to those calculated by \code{\link{scan_hh}} with settings -\code{limehh = 0}, \code{limehhs = 0} and \code{interpolate = FALSE}, yet this function is faster. -The former two settings are however not recommended for the application on experimental data -(see vignette).
+ -If \code{discard_integration_at_border} is set to \code{TRUE} and the extension of shared haplotypes -reaches a border (i.e. chromosomal boundaries or a gap larger than \code{maxgap}), this function discards all statistics, -while \code{\link{scan_hh}} handles each statistic independently. - -\code{\link{scan_hh}} "removes" chromosomes with missing values from further calculations, -while this function treats each missing value -as a different allele. This yields a somewhat faster decay of all statistics with respect to the -distance to the focal marker. +\code{limehh = 0}, \code{limehhs = 0}, \code{lower_ehh_y_bound = 0} and \code{interpolate = FALSE}, yet this function is faster. + +Application of a cut-off is necessary for reducing the spurious signals +of selection caused by single shared haplotypes of extreme length. Hence, e.g. for human experimental data +it might be reasonable to set \code{max_extend} to 1 or 2 Mb. + +\code{\link{scan_hh}} computes the statistics iHH_A, iHH_D and iES/inES separately, +while this function calculates them simultaneously. Hence, +if \code{discard_integration_at_border} is set to \code{TRUE} and the extension of shared haplotypes +reaches a border (i.e. chromosomal boundaries or a gap larger than \code{maxgap}), this function discards +all statistics. + +The handling of missing values is different, too: \code{\link{scan_hh}} "removes" chromosomes with missing values from further calculations. +EHH and EHHS are then calculated for the remaining chromosomes which can accidentally yield an increase in EHH or EHHS. +This cannot happen with \code{scan_hh_full()} which treats each missing value of a marker +as if it were a new allele - terminating any shared haplotype, but does not change the +set of considered chromosomes. Thus, missing values +cause a faster decay of EHH(S) with function \code{scan_hh_full()}.
} \examples{ #example haplohh object (280 haplotypes, 1424 SNPs) #see ?haplohh_cgu_bta12 for details data(haplohh_cgu_bta12) -scan <- scan_hh_full(haplohh_cgu_bta12) +#using function scan_hh() with no cut-offs +scan <- scan_hh(haplohh_cgu_bta12, discard_integration_at_border = FALSE, +limehh = 0, limehhs = 0, lower_ehh_y_bound = 0, interpolate = FALSE) +#using function scan_hh_full() +scan_full <- scan_hh_full(haplohh_cgu_bta12, discard_integration_at_border = FALSE) +#both yield identical results within numerical precision +all.equal(scan, scan_full) } \references{ Gautier, M. and Naves, M. (2011). Footprints of selection in the ancestral admixture of a New World Creole cattle breed. \emph{Molecular Ecology}, \strong{20}, 3128-3143. -Klassmann A., Vitalis R., and Gautier M. Detecting selection using Extended Haplotype Homozygosity (EHH)-based statistics on unphased or unpolarized data. Preprint. https://doi.org/10.22541/au.158584282.24875401. +Klassmann, A. and Gautier, M. (2020). Detecting selection using Extended Haplotype +Homozygosity-based statistics on unphased or unpolarized data (preprint). +https://doi.org/10.22541/au.160405572.29972398/v1 Sabeti, P.C. et al. (2002). Detecting recent positive selection in the human genome from haplotype structure. \emph{Nature}, \strong{419}, 832-837. @@ -103,6 +120,6 @@ Tang, K. and Thornton, K.R. and Stoneking, M. (2007). A New Approach for Using G Voight, B.F. and Kudaravalli, S. and Wen, X. and Pritchard, J.K. (2006). A map of recent positive selection in the human genome. \emph{Plos Biology}, \strong{4}, e72. 
} \seealso{ -\code{\link{data2haplohh}}, code{\link{scan_hh}}, -\code{\link{ihh2ihs}},\code{\link{ines2rsb}}, \code{\link{ies2xpehh}} +\code{\link{data2haplohh}}, \code{\link{scan_hh}}, +\code{\link{ihh2ihs}}, \code{\link{ines2rsb}}, \code{\link{ies2xpehh}} } diff --git a/man/subset.haplohh.Rd b/man/subset.haplohh.Rd index fd94093..87a99b6 100644 --- a/man/subset.haplohh.Rd +++ b/man/subset.haplohh.Rd @@ -11,6 +11,7 @@ min_perc_geno.hap = NA, min_perc_geno.mrk = 100, min_maf = NA, + max_alleles = NA, verbose = TRUE, ... ) @@ -36,6 +37,9 @@ In case of multi-allelic markers the second-most frequent allele is referred to Setting this value to zero eliminates monomorphic sites. Default is \code{NA}, hence no constraint.} +\item{max_alleles}{threshold for the maximum number of different alleles at a site. Default is \code{NA}, +hence no restriction. In order to retain only bi-allelic markers, set this parameter to 2.} + \item{verbose}{logical. If \code{TRUE} (default), report verbose progress.} \item{...}{further arguments are ignored.} diff --git a/src/CALL_PAIRWISE_HAPLEN.c b/src/CALL_PAIRWISE_HAPLEN.c index b048d6f..f09acde 100644 --- a/src/CALL_PAIRWISE_HAPLEN.c +++ b/src/CALL_PAIRWISE_HAPLEN.c @@ -7,7 +7,8 @@ * R objects are marked by a trailing underscore. */ SEXP CALL_PAIRWISE_HAPLEN(SEXP data_, SEXP nbr_chr_, SEXP nbr_mrk_, SEXP map_, - SEXP foc_mrk_, SEXP max_gap_, SEXP phased_, SEXP pairwise_haplen_) { + SEXP foc_mrk_, SEXP maxgap_, SEXP max_extend_, SEXP side_, + SEXP phased_, SEXP pairwise_haplen_) { //get pointer to R data vector int* data = INTEGER(data_); @@ -16,14 +17,16 @@ SEXP CALL_PAIRWISE_HAPLEN(SEXP data_, SEXP nbr_chr_, SEXP nbr_mrk_, SEXP map_, int nbr_chr = asInteger(nbr_chr_); int nbr_mrk = asInteger(nbr_mrk_); int foc_mrk = asInteger(foc_mrk_) - 1; //change to C indexing! 
- double max_gap = asReal(max_gap_); + int maxgap = asInteger(maxgap_); + int max_extend = asInteger(max_extend_); + int side = asInteger(side_); double* map = REAL(map_); bool phased = asLogical(phased_); double* pairwise_haplen = REAL(pairwise_haplen_); //perform calculation - calc_pairwise_haplen(data, nbr_chr, nbr_mrk, map, foc_mrk, max_gap, phased, 0, pairwise_haplen); + calc_pairwise_haplen(data, nbr_chr, nbr_mrk, map, foc_mrk, maxgap, max_extend, side, phased, false, pairwise_haplen); return ScalarLogical(1); } diff --git a/src/CALL_SCAN_HH2.c b/src/CALL_SCAN_HH_FULL.c similarity index 95% rename from src/CALL_SCAN_HH2.c rename to src/CALL_SCAN_HH_FULL.c index 772fb77..4798a4e 100644 --- a/src/CALL_SCAN_HH2.c +++ b/src/CALL_SCAN_HH_FULL.c @@ -10,8 +10,8 @@ * Interface between R and C. * R objects are marked by a trailing underscore. */ -SEXP CALL_SCAN_HH2(SEXP data_, SEXP nbr_chr_, SEXP nbr_mrk_, SEXP first_allele_, SEXP second_allele_, - SEXP map_, SEXP max_gap_, SEXP phased_, SEXP discard_integration_at_border_, +SEXP CALL_SCAN_HH_FULL(SEXP data_, SEXP nbr_chr_, SEXP nbr_mrk_, SEXP first_allele_, SEXP second_allele_, + SEXP map_, SEXP max_gap_, SEXP max_extend_, SEXP phased_, SEXP discard_integration_at_border_, SEXP geometric_mean_, SEXP nbr_threads_) { //get pointer to R data vectors @@ -24,6 +24,7 @@ SEXP CALL_SCAN_HH2(SEXP data_, SEXP nbr_chr_, SEXP nbr_mrk_, SEXP first_allele_, int nbr_chr = asInteger(nbr_chr_); int nbr_mrk = asInteger(nbr_mrk_); int max_gap = asInteger(max_gap_); + int max_extend = asInteger(max_extend_); bool phased = asLogical(phased_); bool discard_integration_at_border = asLogical(discard_integration_at_border_); bool geometric_mean = asLogical(geometric_mean_); @@ -73,7 +74,7 @@ SEXP CALL_SCAN_HH2(SEXP data_, SEXP nbr_chr_, SEXP nbr_mrk_, SEXP first_allele_, pairwise_haplen[i] = 0; } - bool discard = calc_pairwise_haplen(data, nbr_chr, nbr_mrk, map, j, max_gap, + bool discard = calc_pairwise_haplen(data, nbr_chr, nbr_mrk, map, 
j, max_gap, max_extend, BOTH, phased, discard_integration_at_border, pairwise_haplen) == 1; if(discard){ diff --git a/src/CALL_SFS_TESTS.c b/src/CALL_SFS_TESTS.c new file mode 100644 index 0000000..1e85926 --- /dev/null +++ b/src/CALL_SFS_TESTS.c @@ -0,0 +1,33 @@ +#include +#include "definitions.h" +#include "calc_sfs_tests.h" + +/** + * Interface between R and C: unpacks the arguments and delegates to calc_sfs_tests(). + * R objects are marked by a trailing underscore. + */ +SEXP CALL_SFS_TESTS(SEXP data_, SEXP nbr_chr_, SEXP nbr_mrk_, SEXP map_, + SEXP polarized_, SEXP windows_, SEXP n_windows_, SEXP right_, + SEXP min_n_mrk_, SEXP n_mrk_, SEXP results_) { + + //get pointer to R data vectors + int* data = INTEGER(data_); + double* windows = REAL(windows_); + double* map = REAL(map_); + int* n_mrk = INTEGER(n_mrk_); + double *results = REAL(results_); + + //translate R vectors of size 1 to numbers + int nbr_chr = asInteger(nbr_chr_); + int nbr_mrk = asInteger(nbr_mrk_); + bool polarized = asLogical(polarized_); + bool right = asLogical(right_); + int n_windows = asInteger(n_windows_); + int min_n_mrk = asInteger(min_n_mrk_); + /* n_mrk and results point into the R vectors n_mrk_ and results_, so the outcome is written in place; the returned TRUE carries no information */ + calc_sfs_tests(data, nbr_chr, nbr_mrk, map, polarized, + windows, n_windows, right, min_n_mrk, + n_mrk, results); + + return ScalarLogical(1); +} diff --git a/src/calc_pairwise_haplen.c b/src/calc_pairwise_haplen.c index bc03229..fd418dd 100644 --- a/src/calc_pairwise_haplen.c +++ b/src/calc_pairwise_haplen.c @@ -8,19 +8,23 @@ */ int extend_haplen(const int* const data, const int nbr_chr, const double* map, const int foc_mrk, const int end_mrk, int* const hap, int* const nbr_hap, int* const nbr_chr_with_hap, - const int max_gap, const bool discard_integration_at_border, double* const pairwise_haplen) { + const int maxgap, const int max_extend, const bool discard_integration_at_border, double* const pairwise_haplen) { int increment = foc_mrk <= end_mrk ?
1 : -1; int mrk; for (mrk = foc_mrk + increment; mrk != end_mrk + increment; mrk += increment) { // walk along the chromosome, away from the focal SNP double gap = increment * (map[mrk] - map[mrk - increment]); - if(gap > max_gap){ + if(gap > maxgap){ if(discard_integration_at_border){ return(1); } break; } + double length = increment * (map[mrk] - map[foc_mrk]); + if(length > max_extend){ + break; + } if (update_hap_with_lengths(data, nbr_chr, mrk, hap, nbr_hap, nbr_chr_with_hap, length, pairwise_haplen)) { int tot_nbr_chr_in_hap = 0; @@ -65,8 +69,9 @@ int extend_haplen(const int* const data, const int nbr_chr, const double* map, c * Computes furcation trees on both sides of the focal marker. */ int calc_pairwise_haplen(const int* const data, const int nbr_chr, const int nbr_mrk, - const double* map, const int foc_mrk, const int max_gap, - const bool phased, const bool discard_integration_at_border, + const double* map, const int foc_mrk, const int maxgap, + const int max_extend, const int side, const bool phased, + const bool discard_integration_at_border, double* const pairwise_haplen) { int nbr_hap; //number of distinct haplotypes @@ -75,16 +80,20 @@ int calc_pairwise_haplen(const int* const data, const int nbr_chr, const int nbr int *hap = (int*) malloc(nbr_chr * sizeof(int)); //vector index to chromosomes, ordered by haplotype int *nbr_chr_with_hap = (int*) malloc(nbr_chr * sizeof(int)); //for each haplotype gives the number of chromosomes sharing it - init_site_hap(data, nbr_chr, foc_mrk, phased, hap, &nbr_hap, nbr_chr_with_hap); - - discard = extend_haplen(data, nbr_chr, map, foc_mrk, 0, hap, &nbr_hap, nbr_chr_with_hap, - max_gap, discard_integration_at_border, pairwise_haplen); - if(!discard){ + if(side == BOTH || side == LEFT){ init_site_hap(data, nbr_chr, foc_mrk, phased, hap, &nbr_hap, nbr_chr_with_hap); - discard = extend_haplen(data, nbr_chr, map, foc_mrk, nbr_mrk - 1, hap, &nbr_hap, nbr_chr_with_hap, - max_gap, discard_integration_at_border, 
pairwise_haplen); + + discard = extend_haplen(data, nbr_chr, map, foc_mrk, 0, hap, &nbr_hap, nbr_chr_with_hap, + maxgap, max_extend, discard_integration_at_border, pairwise_haplen); } + if(side == BOTH || side == RIGHT){ + if(!discard){ + init_site_hap(data, nbr_chr, foc_mrk, phased, hap, &nbr_hap, nbr_chr_with_hap); + discard = extend_haplen(data, nbr_chr, map, foc_mrk, nbr_mrk - 1, hap, &nbr_hap, nbr_chr_with_hap, + maxgap, max_extend, discard_integration_at_border, pairwise_haplen); + } + } free(hap); free(nbr_chr_with_hap); return(discard ? 1 : 0); diff --git a/src/calc_pairwise_haplen.h b/src/calc_pairwise_haplen.h index f626c92..a3fd88e 100644 --- a/src/calc_pairwise_haplen.h +++ b/src/calc_pairwise_haplen.h @@ -1,5 +1,8 @@ #include "definitions.h" +#define BOTH 0 +#define LEFT 1 +#define RIGHT 2 int calc_pairwise_haplen(const int* const data, const int nbr_chr, const int nbr_mrk, const double* map, - const int foc_mrk, const int max_gap, + const int foc_mrk, const int max_gap, const int max_extend, const int side, const bool phased, const bool discard_integration_at_border, double* const haplength); diff --git a/src/calc_sfs_tests.c b/src/calc_sfs_tests.c new file mode 100644 index 0000000..528d9b4 --- /dev/null +++ b/src/calc_sfs_tests.c @@ -0,0 +1,180 @@ +#include "calc_sfs_tests.h" +#include "sfs_moments.h" + +/* Sliding-window SFS statistics: for every window, writes the theta estimates (S, PI, L) and the three test statistics T[0..2] into results (column-wise, n_windows entries each) and the polymorphic marker count into n_mrk. windows holds the n_windows left boundaries followed by the n_windows right boundaries. Windows that fail the min_n_mrk check leave their results entries untouched. */ +void calc_sfs_tests(int* data, int nbr_chr, int nbr_mrk, double* map, + bool polarized, double* windows, int n_windows, bool right, + int min_n_mrk, int* n_mrk, double* results){ + int n_tests = 3; + int fs_size; + + double* T = (double*) malloc(n_tests * sizeof(double)); + double* alpha = (double*) malloc(n_tests * sizeof(double)); + double* beta = (double*) malloc(n_tests * sizeof(double)); + double** secMom0; + + double*** Omega = (double***) malloc(n_tests * sizeof(double**)); + + for(int test = 0; test < n_tests; test++){ + Omega[test] = (double**) malloc(nbr_chr * sizeof(double*)); + for(int n = 1; n < nbr_chr; n++){ + /* weights for sample
size n */ + Omega[test][n] = (double*) malloc(n * sizeof(double)); + } + } + + double** omega_S = (double**) malloc(nbr_chr * sizeof(double*)); + double** omega_PI = (double**) malloc(nbr_chr * sizeof(double*)); + double** omega_L = (double**) malloc(nbr_chr * sizeof(double*)); + double** fs0 = (double**) malloc(nbr_chr * sizeof(double*)); + + /* get weights and null spectrum for each possible (sub-)sample size */ + if (polarized) { + for(int n = 2; n <= nbr_chr; n++){ + omega_S[n - 1] = getOmega(n, 'S'); + omega_PI[n - 1] = getOmega(n, 'P'); + omega_L[n - 1] = getOmega(n, 'L'); + fs0[n - 1] = getXi0(n); + } + secMom0 = getSigma(nbr_chr); + }else{ + for(int n = 2; n <= nbr_chr; n++){ + omega_S[n - 1] = getOmegaStar(n,'S'); + omega_PI[n - 1] = getOmegaStar(n, 'P'); + omega_L[n - 1] = getOmegaStar(n, 'L'); + fs0[n - 1] = getEta0(n); + } + secMom0 = getRho(nbr_chr); + } + + for(int n = 1; n < nbr_chr; n++){ + for(int i = 0; i < n; i++){ /* NOTE(review): reads n entries per sample size also in the folded case - confirm getOmegaStar() allocates that many */ + Omega[0][n][i] = omega_PI[n][i] - omega_S[n][i]; + Omega[1][n][i] = omega_PI[n][i] - omega_L[n][i]; + Omega[2][n][i] = omega_L[n][i] - omega_S[n][i]; + } + } + + if (polarized) { + fs_size = nbr_chr - 1; + }else{ + fs_size = nbr_chr / 2; + } + + /* + * compute coefficients for theta-square estimation + */ + double an = getWeightedFirstMoment(omega_S[nbr_chr - 1], fs0[nbr_chr - 1], fs_size); + double bn = getWeightedSecondMoment(omega_S[nbr_chr - 1], fs0[nbr_chr - 1], secMom0, fs_size); + + /* + * compute coefficients for the variance of the numerator + */ + for (int test = 0; test < n_tests; test++) { + alpha[test] = getWeightedFirstMoment(Omega[test][nbr_chr - 1], fs0[nbr_chr - 1], fs_size); + beta[test] = getWeightedSecondMoment(Omega[test][nbr_chr - 1], fs0[nbr_chr - 1], secMom0, fs_size); + } + + int start_mrk = 0; + for(int window = 0; window < n_windows; window++){ + /* increase marker until it exceeds left window boundary; the index is bounds-checked before map[] is read */ + while(start_mrk < nbr_mrk + && ((right && map[start_mrk] <= windows[window])
+ ||(!right && map[start_mrk] < windows[window]))){ + start_mrk++; + } + + double theta_S = 0.0, theta_PI = 0.0, theta_L = 0.0; + + int mrk = start_mrk; + int n_polymorphic_mrk = 0; + + /* increase marker until it exceeds right window boundary; the index is bounds-checked before map[] is read */ + while(mrk < nbr_mrk + && ((right && map[mrk] <= windows[n_windows + window]) + || (!right && map[mrk] < windows[n_windows + window]))){ + + int x = 0; + int n = 0; + int first_non_missing_allele = -1; + for(int chr = 0; chr < nbr_chr; chr++){ + if(data[mrk * nbr_chr + chr] != MISSING_VALUE){ + n++; + if(polarized){ + /* any non-zero allele is taken as derived */ + if(data[mrk * nbr_chr + chr] > 0){ + x++; + } + /* count first non-missing allele */ + }else{ + if(first_non_missing_allele == -1){ + first_non_missing_allele = data[mrk * nbr_chr + chr]; + x++; + }else if(data[mrk * nbr_chr + chr] == first_non_missing_allele){ + x++; + } + } + } + } + /* fold spectrum: x becomes min(x, n - x), also for odd n */ + if(!polarized && x > n / 2){ + x = n - x; + } + + /* if polymorphic ... */ + if(x > 0 && x < n){ + n_polymorphic_mrk++; + theta_S += (1. / fs0[n - 1][x - 1] ) * omega_S[n - 1][x - 1]; + theta_PI += (1. / fs0[n - 1][x - 1]) * omega_PI[n - 1][x - 1]; + theta_L += (1. / fs0[n - 1][x - 1]) * omega_L[n - 1][x - 1]; + } + mrk++; + } + + if(n_polymorphic_mrk > min_n_mrk){ /* NOTE(review): strict '>' skips windows holding exactly min_n_mrk markers although the Rd describes min_n_mrk as the minimum - confirm intent */ + results[window] = theta_S; + results[n_windows + window] = theta_PI; + results[n_windows * 2 + window] = theta_L; + + double theta_S_squared = (theta_S * theta_S - an * theta_S) / (1.
+ bn); + + T[0] = theta_PI - theta_S; + T[1] = theta_PI - theta_L; + T[2] = theta_L - theta_S; + + for(int test = 0; test < n_tests; test++){ + if (T[test] != 0) { + T[test] /= sqrt(alpha[test] * theta_S + beta[test] * theta_S_squared); + } + results[n_windows * (test + 3) + window] = T[test]; + } + } + + n_mrk[window] = n_polymorphic_mrk; + } + + for(int test = 0; test < n_tests; test++){ + for(int n = 1; n < nbr_chr; n++){ + free(Omega[test][n]); + } + free(Omega[test]); + } + free(Omega); + for(int n = 1; n < nbr_chr; n++){ + free(omega_S[n]); + free(omega_PI[n]); + free(omega_L[n]); + free(fs0[n]); + } + free(omega_S); + free(omega_PI); + free(omega_L); + free(fs0); + for(int i = 0; i < fs_size; i++){ + free(secMom0[i]); + } + free(secMom0); + free(alpha); + free(beta); + free(T); +} diff --git a/src/calc_sfs_tests.h b/src/calc_sfs_tests.h new file mode 100644 index 0000000..2cf77ec --- /dev/null +++ b/src/calc_sfs_tests.h @@ -0,0 +1,6 @@ +#include +#include "definitions.h" + +void calc_sfs_tests(int* data, int nbr_chr, int nbr_mrk, double* map, + bool polarized, double* windows, int n_windows, bool right, + int min_n_mrk, int* n_mrk, double* results); diff --git a/src/init.c b/src/init.c index e2b8823..edfa943 100644 --- a/src/init.c +++ b/src/init.c @@ -9,10 +9,12 @@ extern SEXP CALL_EHH(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); extern SEXP CALL_EHHS(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); extern SEXP CALL_FURCATION(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); extern SEXP CALL_INTEGRAL(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); -extern SEXP CALL_PAIRWISE_HAPLEN(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); +extern SEXP CALL_PAIRWISE_HAPLEN(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); extern SEXP CALL_SCAN_HH(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); -extern SEXP CALL_SCAN_HH2(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
+extern SEXP CALL_SCAN_HH_FULL(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); extern SEXP CALL_ASNEWICK(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); +extern SEXP CALL_SFS_TESTS(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); + static const R_CallMethodDef CallEntries[] = { @@ -20,10 +22,11 @@ static const R_CallMethodDef CallEntries[] = { {"CALL_EHHS", (DL_FUNC) &CALL_EHHS, 8}, {"CALL_FURCATION", (DL_FUNC) &CALL_FURCATION, 7}, {"CALL_INTEGRAL", (DL_FUNC) &CALL_INTEGRAL, 9}, - {"CALL_PAIRWISE_HAPLEN",(DL_FUNC) &CALL_PAIRWISE_HAPLEN,8}, + {"CALL_PAIRWISE_HAPLEN",(DL_FUNC) &CALL_PAIRWISE_HAPLEN,10}, {"CALL_SCAN_HH", (DL_FUNC) &CALL_SCAN_HH, 18}, - {"CALL_SCAN_HH2", (DL_FUNC) &CALL_SCAN_HH2, 11}, + {"CALL_SCAN_HH_FULL", (DL_FUNC) &CALL_SCAN_HH_FULL, 12}, {"CALL_ASNEWICK", (DL_FUNC) &CALL_ASNEWICK, 6}, + {"CALL_SFS_TESTS", (DL_FUNC) &CALL_SFS_TESTS, 11}, {NULL, NULL, 0} }; diff --git a/src/sfs_moments.c b/src/sfs_moments.c new file mode 100644 index 0000000..7fbb5b4 --- /dev/null +++ b/src/sfs_moments.c @@ -0,0 +1,303 @@ +#include "sfs_moments.h" + +/** + * Harmonic numbers: 1+1/2+1/3+...+1/i + */ +double* getHarmonicNumbers(int n) { + double* HarmonicNumbers = (double*) malloc(n*sizeof(double)); + + if (!HarmonicNumbers) + return 0; + + HarmonicNumbers[0] = 0; + + for (int i = 1; i < n; i++) { + HarmonicNumbers[i] = HarmonicNumbers[i - 1] + 1.0 / i; + } + + return HarmonicNumbers; +} + +/** + * The expected Xi spectrum under constant population size + * cf. equation (1) of Fu 1995. + */ +double* getXi0(int n) { + double* xi0 = (double*) malloc((n-1)*sizeof(double)); + for (int i = 1; i < n; i++) { + xi0[i - 1] = 1. / i; + } + return xi0; +} + +/** + * the expected Eta spectrum under constant population size + * cf. equations (6) and (7) of Fu 1995. + */ +double* getEta0(int n) { + double* eta0 = (double*) malloc(n / 2 * sizeof(double)); + for (int i = 1; i <= n / 2; i++) { + eta0[i - 1] = (1. / i + 1. / (n - i)) / (1. 
+ KRONECKER(i, n - i)); + } + return eta0; +} + +/* + * cf. equation (6) of Fu 1995. + */ +double getBeta(int i, double *HarmonicNumbers, int n) { + double ai = HarmonicNumbers[i - 1], an = HarmonicNumbers[n - 1]; + double beta = 0; + + beta = 2.0 * n * (an + (1.0 / n) - ai) / ((n - i + 1.0) * (n - i)) + - 2.0 / (n - i); + + return beta; +} + +/* + * cf. equation (2) of Fu 1995. + */ +double getSigma_ii(int i, double *HarmonicNumbers, int n) { + double sigma_ii = 0; + double ai = HarmonicNumbers[i - 1], an = HarmonicNumbers[n - 1]; + + if (2 * i < n) { + sigma_ii = getBeta(i + 1, HarmonicNumbers, n); + } else { + if (2 * i == n) { + sigma_ii = 2.0 * (an - ai) / (n - i) - 1.0 / (i * i); + } else { + sigma_ii = getBeta(i, HarmonicNumbers, n) - 1.0 / (i * i); + } + + } + + return sigma_ii; +} + +/* + * cf. equation (3) of Fu 1995. + */ +double getSigma_ij(int i, int j, double *HarmonicNumbers, int n) { + double sigma_ij = 0; + + if (i == j) { + return getSigma_ii(i, HarmonicNumbers, n); + } + + if (i < j) { + int tmp = i; + i = j; + j = tmp; + } + + double ai = HarmonicNumbers[i - 1], aj = HarmonicNumbers[j - 1], an = + HarmonicNumbers[n - 1]; + + if (i + j < n) { + sigma_ij = (getBeta(i + 1, HarmonicNumbers, n) + - getBeta(i, HarmonicNumbers, n)) / 2.0; + } else { + if (i + j == n) { + sigma_ij = ((an - ai) / (n - i) + (an - aj) / (n - j)) + - ((getBeta(i, HarmonicNumbers, n) + + getBeta(j + 1, HarmonicNumbers, n)) / 2.0) + - (1.0 / (i * j)); + } else { + sigma_ij = ((getBeta(j, HarmonicNumbers, n) + - getBeta(j + 1, HarmonicNumbers, n)) / 2.0) + - (1.0 / (i * j)); + } + } + return sigma_ij; +} + + +/** + * sigma matrix = (sigma)ij + */ +double** getSigma(int n) { + int i, j; + double **sigma = (double**) malloc((n - 1) * sizeof(double*)); + double *HarmonicNumbers = getHarmonicNumbers(n); + + for (i = 0; i < n - 1; i++) { + sigma[i] = malloc((n - 1) * sizeof(double)); + } + + for (i = 0; i < n - 1; i++) { + for (j = i; j < n - 1; j++) { + if (i == j) { + sigma[i][i] 
= getSigma_ii(i + 1, HarmonicNumbers, n); + } else { + sigma[j][i] = sigma[i][j] = getSigma_ij(i + 1, j + 1, + HarmonicNumbers, n); + } + } + } + free(HarmonicNumbers); + return sigma; +} + + +/* + * cf. equation (9) of Fu 1995 + */ +double getRho_ii(int i, double *HarmonicSums, int n) { + double rho_ii; + + rho_ii = getSigma_ii(i, HarmonicSums, n) + + getSigma_ii(n - i, HarmonicSums, n) + + 2 * getSigma_ij(i, n - i, HarmonicSums, n); + rho_ii /= (1.0 + KRONECKER(i, n - i)) * (1.0 + KRONECKER(i, n - i)); + + return rho_ii; +} + +/* + * cf. equation (9) of Fu 1995 + */ +double getRho_ij(int i, int j, double *HarmonicSums, int n) { + double rho_ij; + + rho_ij = getSigma_ij(i, j, HarmonicSums, n) + + getSigma_ij(i, n - j, HarmonicSums, n) + + getSigma_ij(n - i, j, HarmonicSums, n) + + getSigma_ij(n - i, n - j, HarmonicSums, n); + rho_ij /= ((1.0 + KRONECKER(i, n - i)) * (1.0 + KRONECKER(j, n - j))); + + return rho_ij; +} + +/** + * rho matrix = (rho)ij + * + * Returns a newly allocated, symmetric (n/2 x n/2) matrix; the caller + * must free each row and the row-pointer array. + */ +double** getRho(int n) { + int i, j; + double **rho = (double**) malloc(n / 2 * sizeof(double*)); + double *HarmonicNumbers = getHarmonicNumbers(n); + + for (i = 0; i < n / 2; i++) { + rho[i] = (double*) malloc(n / 2 * sizeof(double)); + } + + for (i = 0; i < n / 2; i++) { + for (j = i; j < n / 2; j++) { + if (i == j) { + rho[i][i] = getRho_ii(i + 1, HarmonicNumbers, n); + } else { + rho[j][i] = rho[i][j] = getRho_ij(i + 1, j + 1, HarmonicNumbers, + n); + } + } + } + /* the harmonic numbers are only needed while filling the matrix */ + free(HarmonicNumbers); + return rho; +} + +/* + * first coefficient in nominator of test statistic + */ +double getWeightedFirstMoment(double* weight, double* fs0, int fs_size) { + int i; + double alpha = 0; + + for (i = 0; i < fs_size; i++) { + alpha += weight[i] * weight[i] / fs0[i]; + } + + return alpha; +} + +/* + * second coefficient in nominator of test statistic + */ +double getWeightedSecondMoment(double* weight, double* fs0, double** secMom0, + int fs_size) { + int i, j; + double beta = 0; + + for (i = 0; i < fs_size; i++) { + for (j = 0; j < fs_size; j++) { + beta 
+= (weight[i] / fs0[i]) * secMom0[i][j] * (weight[j] / fs0[j]); + } + } + + return beta; +} + +/* + * compute Omega for folded theta estimators + * cf. Table 1 of Achaz (2009) + */ +double* getOmegaStar(int n, char type) { + double* omega = (double*) malloc((n / 2) * sizeof(double)); + + switch(type){ + case 'S': + for (int i = 1; i <= n / 2; i++) { + omega[i - 1] = n / (double) (i * (n - i) * (1 + KRONECKER(i, n - i))) ; + } + break; + case 'P': + for (int i = 1; i <= n / 2; i++) { + omega[i - 1] = n / (double) (1 + KRONECKER(i, n - i)); + } + break; + case 'L': + for (int i = 1; i <= n / 2; i++) { + omega[i - 1] = 1. / (double) (1 + KRONECKER(i, n - i)); + } + break; + default: return NULL; + } + + double sum = 0.; + for (int i = 0; i < n / 2; i++) { + sum += omega[i]; + } + + for (int i = 0; i < n / 2; i++) { + omega[i] /= sum; + } + + return omega; +} + +/* + * compute omega for unfolded theta estimators + * cf. Table 1 of Achaz (2009) + */ +double* getOmega(int n, char type) { + double* omega = (double*) malloc((n - 1) * sizeof(double)); + + switch(type){ + case 'S': + for (int i = 1; i < n; i++) { + omega[i - 1] = 1.0 / i; + } + break; + case 'P': + for (int i = 1; i < n; i++) { + omega[i - 1] = (double) (n - i); + } + break; + case 'L': + for (int i = 1; i < n; i++) { + omega[i - 1] = 1.0; + } + break; + default: return NULL; + } + + double sum = 0.; + for (int i = 0; i < n - 1; i++) { + sum += omega[i]; + } + + for (int i = 0; i < n - 1; i++) { + omega[i] /= sum; + } + + return omega; +} diff --git a/src/sfs_moments.h b/src/sfs_moments.h new file mode 100644 index 0000000..684feea --- /dev/null +++ b/src/sfs_moments.h @@ -0,0 +1,13 @@ +#include +#include "definitions.h" + +#define KRONECKER( A, B ) (((A)==(B))?1:0) + +double* getXi0(int n); +double* getEta0(int n); +double** getSigma(int n); +double** getRho(int n); +double getWeightedFirstMoment(double* weight, double* fs0, int fs_size); +double getWeightedSecondMoment(double* weight, double* fs0, double** 
secMom0, int fs_size); +double* getOmega(int n, char test); +double* getOmegaStar(int n, char test); diff --git a/tests/testthat/test_data2haplohh.R b/tests/testthat/test_data2haplohh.R index 8356663..0020a94 100644 --- a/tests/testthat/test_data2haplohh.R +++ b/tests/testthat/test_data2haplohh.R @@ -109,7 +109,8 @@ test_that("checked_bta", { hap4 <- data2haplohh(hap_file = "bta12_cgu.vcf.gz", - polarize_vcf = FALSE, vcf_reader = "data.table") + polarize_vcf = FALSE, + vcf_reader = "data.table") expect_equal(hap3, hap4) @@ -117,7 +118,8 @@ test_that("checked_bta", { hap5 <- data2haplohh(hap_file = "bta12_cgu.vcf.gz", - polarize_vcf = FALSE, vcf_reader = "vcfR") + polarize_vcf = FALSE, + vcf_reader = "vcfR") expect_equal(hap3, hap5) @@ -125,3 +127,37 @@ test_that("checked_bta", { file.remove("test_bta.log") }) + +test_that("data2haplohh_AA_scan", { + skip_if_not_installed("data.table") + skip_if_not_installed("vcfR") + sink(file = "test_AA_scan.log") + + tmp <- tempfile() + ## check if ancestral allele is correctly parsed + writeLines( + c( + "##fileformat=VCFv4.2", + "##INFO=", + "##INFO=", + "##INFO=", + "##FORMAT=", + "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tHG1\tHG2", + "chr1\t10000\trs1\tG\tT\t100\tPASS\tAA=G\tGT\t.\t0|0", + "chr1\t20000\trs2\tG\tA\t100\tPASS\t.\tGT\t0\t1|0", + "chr1\t30000\trs3\tA\tC\t100\tPASS\tAC=2;AF=0.5;AA=C;VT=SNP\tGT\t0\t1|0" + ), + tmp + ) + + hh1 <- + suppressWarnings(data2haplohh(tmp, vcf_reader = "data.table", verbose = FALSE)) + hh2 <- + suppressWarnings(data2haplohh(tmp, vcf_reader = "vcfR", verbose = FALSE)) + + expect_equal(hh1, hh2) + + unlink(tmp) + sink() + file.remove("test_AA_scan.log") +}) diff --git a/tests/testthat/test_furcation.R b/tests/testthat/test_furcation.R index 2e3d3ba..2992e9c 100644 --- a/tests/testthat/test_furcation.R +++ b/tests/testthat/test_furcation.R @@ -106,6 +106,22 @@ test_that("checked_furcation", { expect_identical(as.newick(f, allele = 1, side = "left"), expected_newick[3]) 
expect_identical(as.newick(f, allele = 1, side = "right"), expected_newick[4]) + ## haplen calculates longest shared haplotype per chromosome, + ## for left and right side independently + h <- calc_haplen(f) + lengths_haplen_left <- positions(hh)["F1205400"] - h$haplen$MIN + lengths_haplen_right <- h$haplen$MAX - positions(hh)["F1205400"] + + ## pairwise haplen calculates length of all mutually shared haplotypes + ph <- calc_pairwise_haplen(hh, mrk = "F1205400", side = "left") + lengths_longest_pairwise_haplen_left <- apply(ph, 1, max) + expect_equivalent(lengths_haplen_left, lengths_longest_pairwise_haplen_left) + + ph <- calc_pairwise_haplen(hh, mrk = "F1205400", side = "right") + lengths_longest_pairwise_haplen_right <- apply(ph, 1, max) + expect_equivalent(lengths_haplen_right, + lengths_longest_pairwise_haplen_right) + sink() file.remove("test_furcation.log") }) diff --git a/tests/testthat/test_scan_full.R b/tests/testthat/test_scan_full.R index 78e52a1..b2b2a83 100644 --- a/tests/testthat/test_scan_full.R +++ b/tests/testthat/test_scan_full.R @@ -1,5 +1,5 @@ -#compares iHH calculated from file with pre-calculated values in dataset -#compares iHH and iES from the functions calc_ehh(s) with scan_hh +#checks equivalence between scan_hh_full and scan_hh +#and scan_hh_full and calc_pairwise_haplen context("scan_hh_full") test_that("checked_scan_hh_full", { @@ -29,6 +29,13 @@ test_that("checked_scan_hh_full", { expect_equal(scan1, scan2) + #compare with element wise calculation + IES <- vapply(1:nmrk(hh), function(x) { + sum(calc_pairwise_haplen(hh, mrk = x)) / (nhap(hh) * (nhap(hh) - 1)) + }, FUN.VALUE = 0.0) + + expect_equal(scan2$IES, IES) + #unphased scan1 <- scan_hh( hh, @@ -45,13 +52,49 @@ test_that("checked_scan_hh_full", { expect_equal(scan1, scan2) - #test maxgap + #compare with element wise calculation + IES <- vapply(1:nmrk(hh), function(x) { + sum(calc_pairwise_haplen(hh, mrk = x, phased = FALSE)) / nhap(hh) + }, FUN.VALUE = 0.0) + #for unphased data 
IES is set to INES (normalized by focal marker) + INES <- IES * nhap(hh) / (scan2$NHAPLO_A + scan2$NHAPLO_D) + expect_equal(scan2$INES, INES) + + #test max_extend and maxgap hh <- data2haplohh( hap_file = "example1.hap", map_file = "example1.map", allele_coding = "01", verbose = FALSE ) + + #test max_extend + scan1 <- + scan_hh_full(hh, + discard_integration_at_border = FALSE, + max_extend = 20000) + + IHH_A <- + c( + 17142.8571428571, + 25238.0952380952, + 33333.3333333333, + 33333.3333333333, + 29523.8095238095, + 28333.3333333333, + 31333.3333333333, + 32380.9523809524, + 36666.6666666667, + 25238.0952380952, + 20000 + ) + IHH_D <- c(0, 0, 30, 30, 0, 40, 40, 0, 30, 0, 15) * 1000 + + expect_equal(scan1$IHH_A, IHH_A) + expect_equal(scan1$IHH_D, IHH_D) + + + #test maxgap hh@positions[5] <- 55000 hh@positions[11] <- 115000 for (maxgap in c(4000, 5000, 10000, 15000, NA)) { diff --git a/tests/testthat/test_sfs_tests.R b/tests/testthat/test_sfs_tests.R new file mode 100644 index 0000000..64b294e --- /dev/null +++ b/tests/testthat/test_sfs_tests.R @@ -0,0 +1,58 @@ +#checks sfs statistics of example 1 +context("sfs_test_statistics") + +test_that("checked_example1", { + sink(file = "test_sfs_example1.log") + hh <- + data2haplohh(hap_file = "example1.hap", + map_file = "example1.map", + allele_coding = "map") + + res <- calc_sfs_tests(hh, window_size = 110000) + + expect_equal(res$THETA_S, 4.24242424242424) + expect_equal(res$TAJIMA_D,-0.159251163979525) + expect_equal(res$FAY_WU_H, 0.928627986893831) + expect_equal(res$ZENG_E,-1.07358846844319) + + res <- calc_sfs_tests(hh, + window_size = 50000, + overlap = 25000, + right = FALSE) + + expect_equal( + res$THETA_S, + c( + 1.54269972451791, + 1.92837465564738, + 1.92837465564738 + ) + ) + expect_equal( + res$TAJIMA_D, + c( + -0.524737350335118, + 0.420384850289377, + 0.000462724105986242 + ) + ) + expect_equal( + res$FAY_WU_H, + c( + 0.898992052528498, + 0.822978681944425, + 0.768113436481463 + ) + ) + expect_equal( + 
res$ZENG_E, + c( + -1.16431403321423, + -0.527192758957502, + -0.738186112321971 + ) + ) + + sink() + file.remove("test_sfs_example1.log") +}) diff --git a/vignettes/examples.Rmd b/vignettes/examples.Rmd index cb5a5b8..89f0db2 100644 --- a/vignettes/examples.Rmd +++ b/vignettes/examples.Rmd @@ -6,24 +6,72 @@ output: bookdown::html_document2: base_format: rmarkdown::html_vignette toc: yes + bookdown::pdf_document2: + toc: yes + fig_caption: yes + number_sections: yes +fontsize: 12 pt +urlcolor: blue bibliography: vignette.bib csl: genetics.csl +header-includes: + - \numberwithin{equation}{section} vignette: > \usepackage[utf8]{inputenc} + \usepackage{amsmath} %\VignetteIndexEntry{Examples in detail} %\VignetteEncoding{UTF-8} %\VignetteEngine{knitr::rmarkdown} --- ```{r setup, include=FALSE} -knitr::opts_chunk$set(comment = ">", fig.height = 4.5, fig.width = 4.5, fig.show = "hold") +knitr::opts_chunk$set(comment = ">", fig.align = 'center', fig.height = 4.5, fig.width = 4.5, fig.show = "hold") ``` \clearpage # Overview -This vignette focuses on two small example data sets delivered with the package *rehh* (see main vignette). They have been constructed to ease comprehension of the relevant statistics and functionality of the package. The first example has been already discussed in [@Gautier2017] while the second set is an extension to include multiple markers and missing values. +Despite a bewildering nomenclature, the idea of *Extended Haplotype Homozygosity* is simple. 
Consider the following alignment of nucleotide sequences where only bi-allelic sites have been retained: + +```{r echo = FALSE, fig.height = 3, fig.width = 3} +oldpar = par(mar = rep(0.1, 4)) +plot.new() +seq = c( + "AACTCAGACGA", + "AAGCGACAACT", + "ACGTCACACCA", + "AACCCAGCACT", + "AAGCCGGACCA", + "AAGCCGGACCA", + "GAGCCGGACCT", + "AAGCCGGACCT" +) +for (i in seq_along(seq)) { + n = strsplit(seq[i], "")[[1]] + text(((0:10) + 0.5) / 11, (8 - i) / 8 + 1 / 16, n) +} +transparent_red <- adjustcolor("red", alpha.f = 0.5) +transparent_blue <- adjustcolor("blue", alpha.f = 0.5) +polygon( + c(0, 11, 11, 0, 0, 1, 1, 0, 0) / 11, + c(4, 4, 0, 0, 1, 1, 2, 2, 4) / 8, + border = transparent_red, + col = transparent_red +) +polygon( + c(3, 7, 7, 8, 8, 7, 7, 4, 4, 3, 3, 5, 5, 3) / 11, + c(8, 8, 7, 7, 5, 5, 4, 4, 5, 5, 6, 6, 7, 7) / 8, + border = transparent_blue, + col = transparent_blue +) +polygon(c(5, 6, 6, 5) / 11, c(8, 8, 0, 0) / 8, border = "black") +par(oldpar) +``` -The pattern of variation seen in the sets is intended to reflect an evolutionary scenario of an "on-going selective sweep" with the derived allele of the central marker experiencing strong selection. +The colored areas mark the maximal extension to which at least two sequences carrying the same *focal allele* are identical, i.e. homozygous to each other. The average length of all sequence-pairwise *shared haplotypes* yields the *iHH* scores for the two central alleles, respectively. The (unstandardized) *iHS* value is the log ratio of them. The statistics *XP-EHH* and *Rsb* are constructed in the same way with the two alleles replaced by two populations and while *Rsb* is normalized to 1 at the focal position, *XP-EHH* is not. That's all! + +This vignette analyses in great detail two small example data sets delivered with the package *rehh* (see main vignette). They have been constructed to ease comprehension of the relevant statistics and functionality of the package. 
The first example has been already discussed in [@Gautier2017] while the second set is an extension to include multiple markers and missing values. The modifications for unphased or unpolarized data have been described in [@Klassmann2020]. + +The pattern of variation seen in the sets and in the alignment above is intended to reflect an evolutionary scenario of an "on-going selective sweep" with one allele of the central marker experiencing strong selection. The package has to be installed and then loaded by ```{r library, message = FALSE} @@ -96,8 +144,8 @@ identical(hh, hh_vcf) ### Visualizing the sequences -The haplohh-object can be visualized by a simple plot command: -```{r} +The haplohh-object can be visualized by a simple plot that shows ancestral alleles in blue and derived ones in red: +```{r hhplot1, fig.cap = "Graphical output of the plot.haplohh() function"} plot(hh) ``` @@ -213,7 +261,7 @@ manhattanplot(ihs, threshold = c(-1.5,1.5), cr = cr, ylim = c(-2.5,2.5), pch = 2 A furcation plot represents a more fine-grained visualization of the homozygosity decay. In particular, individual haplotypes can be discerned which may instigate further investigations. The labels plotted in Figure \@ref(fig:furcation11) are set in bold face, if the branches with which they are associated encompass further haplotypes. -```{r furcation11, fig.align = 'center', fig.cap = "Graphical output of the plot.furcation() function", fig.pos = '!h', fig.lp = 'fig:'} +```{r furcation11, fig.cap = "Graphical output of the plot.furcation() function", fig.pos = '!h', fig.lp = 'fig:'} f <- calc_furcation(hh, mrk = "rs6") # set equal plot margins on left and right side and save old ones oldpar <- par(mar = (c(5, 3, 4, 3) + 0.1)) @@ -231,7 +279,7 @@ par(oldpar) A furcation diagram consists of trees for each allele and both sides ("left" and "right") of the marker. The individual trees can be exported into a string in *Newick* format to be rendered by external programs, e.g. 
the phylogenetic R-package [ape](https://cran.r-project.org/package=ape), see Figure \@ref(fig:newick1). -```{r newick1, , eval = requireNamespace("ape", quietly = TRUE), fig.align = 'center', fig.cap = "Graphical output of the plot.phylo() function of package ape", fig.pos = '!h', fig.lp = 'fig:'} +```{r newick1, eval = requireNamespace("ape", quietly = TRUE), fig.align = 'center', fig.cap = "Graphical output of the plot.phylo() function of package ape", fig.pos = '!h', fig.lp = 'fig:'} newick <- as.newick(f, allele = 0, side = "left", @@ -357,7 +405,7 @@ identical(hh, hh_vcf) ### Visualizing the sequences The haplohh-object can be visualized by a simple plot command: -```{r} +```{r hhplot2, fig.cap = "Graphical output of the plot.haplohh() function"} plot(hh) ``` @@ -447,7 +495,7 @@ Note that the value of *EHH_D*, now representing the allele with internal coding However, with so few *EHH* values due to missing values, there is not much signal left and a standardization by `ihh2ihs()` averages the alleged outlier away as can be observed in Figure \@ref(fig:manhattan22). -```{r manhattan22, fig.align = 'center', fig.cap = "Graphical output of the manhattanplot() function", fig.pos = '!h', fig.lp = 'fig:'} +```{r manhattan22, fig.cap = "Graphical output of the manhattanplot() function", fig.pos = '!h', fig.lp = 'fig:'} ihs <- ihh2ihs(scan, freqbin = 1, verbose = FALSE) manhattanplot(ihs, threshold = c(-1.5, 1.5), ylim = c(-2.5,2.5), pch = 20) ``` @@ -457,7 +505,7 @@ manhattanplot(ihs, threshold = c(-1.5, 1.5), ylim = c(-2.5,2.5), pch = 20) A furcation diagram can show the pattern for all three alleles of the focal marker `rs6`. (Pseudo-)furcations that arise from the removal of chromosomes due to missing values are marked by dashed lines as depicted in Figure \@ref(fig:furcation21). 
-```{r furcation21, fig.align = 'center', fig.cap = "Graphical output of the plot.furcation() function", fig.pos = '!h', fig.lp = 'fig:'} +```{r furcation21, fig.cap = "Graphical output of the plot.furcation() function", fig.pos = '!h', fig.lp = 'fig:'} f <- calc_furcation(hh, mrk = "rs6") # set equal plot margins on left and right side and save old ones oldpar <- par(mar = (c(5, 3, 4, 3) + 0.1)) @@ -471,7 +519,7 @@ par(oldpar) Again, it is possible to export each tree into Newick format. This format, however, has no option to mark different kinds of branches. We let package *ape* render the Newick string to yield Figure \@ref(fig:newick2). -```{r newick2, , eval = requireNamespace("ape", quietly = TRUE), fig.align = 'center', fig.cap = "Graphical output of the plot.phylo() function of package ape", fig.pos = '!h', fig.lp = 'fig:'} +```{r newick2, eval = requireNamespace("ape", quietly = TRUE), fig.align = 'center', fig.cap = "Graphical output of the plot.phylo() function of package ape", fig.pos = '!h', fig.lp = 'fig:'} newick <- as.newick(f, allele = 0, side = "left", diff --git a/vignettes/rehh.Rmd b/vignettes/rehh.Rmd index 6cf7e53..d105989 100644 --- a/vignettes/rehh.Rmd +++ b/vignettes/rehh.Rmd @@ -1,20 +1,25 @@ --- -title: "Vignette for package *rehh* (version 3.1.2)" +title: "Vignette for package *rehh*" author: "Alexander Klassmann, Renaud Vitalis and Mathieu Gautier" date: "`r Sys.Date()`" output: bookdown::html_document2: base_format: rmarkdown::html_vignette toc: yes - bookdown::pdf_book: + bookdown::pdf_document2: toc: yes fig_caption: yes number_sections: yes +fontsize: 12 pt +urlcolor: blue bibliography: vignette.bib csl: genetics.csl +header-includes: + - \numberwithin{equation}{section} vignette: > \usepackage[utf8]{inputenc} - %\VignetteIndexEntry{How to use rehh} + \usepackage{amsmath} + %\VignetteIndexEntry{Vignette for package *rehh*} %\VignetteEncoding{UTF-8} %\VignetteEngine{knitr::rmarkdown} --- @@ -25,7 +30,7 @@ 
knitr::opts_chunk$set(comment = ">",fig.height = 4.5, fig.width = 4.5, fig.show \clearpage # About the package -This vignette describes how the R package *rehh* can be applied to perform whole genome scans for footprints of selection using statistics related to *Extended Haplotype Homozygosity (EHH)* [@Sabeti2002]. +This vignette describes comprehensively how the R package *rehh* can be applied to perform whole genome scans for footprints of selection using statistics related to *Extended Haplotype Homozygosity (EHH)* [@Sabeti2002]. The vignette *Examples in detail* explains basic usage and methodology with the help of two tiny artificial data sets. The package accepts multi-allelic genetic markers as input. Typically, albeit not necessarily, these will be bi-allelic SNPs. @@ -35,28 +40,14 @@ library(rehh) ``` ## Background -The analysis of molecular population genetic data often comprises the search for genomic regions that might have experienced recent selection. Diverse approaches have been developed, reviewed e.g. in [@Oleksyk2010] and [@Vitti2013], however only a few have found wide-spread application [@Cadzow2014], [@Haasl2016]. To the latter belong *iHS* [Voight2006], *Rsb* [@Tang2007] and *XP-EHH* [@Sabeti2007], all of which are *summary statistics* aimed to distill a certain aspect of the genetic data into a single score and constructed in a way that extreme values are indicative of positive or "Darwinian" selection. *iHS* is intended for application on a single (presumably homogeneous) population, while *XP-EHH* and *Rsb* are targeted to differential selection between two populations. All three statistics are based on the concept of *Extended Haplotype Homozygosity (EHH)* as formulated by [@Sabeti2002]. +The analysis of molecular population genetic data often comprises the search for genomic regions that might have experienced recent selection. 
Diverse approaches have been developed; for reviews on methodology see [@Sabeti2006], [@Oleksyk2010] and [@Vitti2013] and for +practical advice [@Cadzow2014], [@Utsunomiya2015] and [@Weigand2018]. However only a few have found wide-spread application [@Haasl2016]. To the latter belong *iHS* [@Voight2006], *Rsb* [@Tang2007] and *XP-EHH* [@Sabeti2007], all of which are *summary statistics* aimed to distill a certain aspect of the genetic data into a single score and constructed in a way that extreme values are indicative of positive or "Darwinian" selection. *iHS* is intended for application on a single (presumably homogeneous) population, while *XP-EHH* and *Rsb* are targeted to differential selection between two populations. All three statistics are based on the concept of *Extended Haplotype Homozygosity (EHH)* as formulated by [@Sabeti2002]. *iHS* and *XP-EHH* can be calculated by the independent C++ command line tool *Hapbin* [@Maclean2015], which has been optimized for speed by exploiting bit-wise machine-level operations. The package *rehh* cannot compete on performance, but has the advantage of being able to work with multi-allelic markers and missing values. Moreover, it possesses a broader range of input and output options, including several graphical representations. ## Changes between versions 2.X and 3.X -The C routines responsible for the bulk of calculations have been rewritten and all of the R code has been largely revised and streamlined yielding the following new features: - -- the package accepts multi-allelic markers. -- support for input files in *variant call format* as well as the output format of the simulation program *ms* [@Hudson2002]. -- computation of statistics and their visualization has become separated into different functions. -- graphical plots are more customizable. -- furcation diagrams can be labeled and a related visualization of "haplotype length" was added. -- a function to define candidate regions of selection. 
-- adaptations for unphased haplotypes or unpolarized alleles. -- output in form of matrices has been replaced by data frames; some columns have become optional. -- all names of data frames are now in lower case letters while all column names contain exclusively capital letters and underscores. -- a new internal representation of alleles: previously, an ancestral allele was coded by 1, a derived allele by 2 and a missing value by 0. These codings have been replaced by 0, 1 and `NA`, respectively. Furthermore, the numbers are now explicitly of type "integer" instead of "numeric". -- a second vignette to explain the statistics involved and the functionality of the package using an invented tiny data set. -- additional options to yield virtually identical results with the program *hapbin* (see section \@ref(hapbin)). -- an inconsistency between implementation and documentation concerning the calculation of one-sided p-values has been cleared. - -**Due to changes in the API, although mostly small, the versions 3.X are not compatible with versions 2.X!** + +Due to changes in the API, although mostly small, the versions 3.X are not compatible with versions 2.X. Data objects of class `haplohh` (see below) created by versions up to 2.0.4 must be updated via the command `update_haplohh()` (see its documentation by typing `?update_haplohh`) in order to be accepted by the functions of the current version. @@ -68,9 +59,11 @@ For illustration purposes, several example input files as well as R objects are - Two tiny invented examples, each in our "standard" haplotype format (see section \@ref(input)) and in *variant call format* (*vcf*). The first example was used in [@Gautier2017] for the explanation of the various statistics calculated by this package. The second example is an extension of the former including multi-allelic markers and missing values. Both sets are discussed in depth in a second vignette. 
+ - Further three tiny examples used for the supplement on Site Frequency Spectrum-based methods of [@Klassmann2020]. + - An output file of the program *ms* containing two small simulated haplotype samples. - - Input files in different formats that originate from a study on the "Creole cattle breed from Guadeloupe" (CGU) [@Gautier2011]. All files contain the same set of phased SNPs of *Bos taurus* chromosome 12 from 140 individuals. + - A data set in various formats that originated from a study on the "Creole cattle breed from Guadeloupe" (CGU) [@Gautier2011]. All files contain the same set of phased SNPs of *Bos taurus* chromosome 12 from 140 individuals. All of the above files are copied into the current working directory via the command ```{r make_examples, results = 'hide'} @@ -81,7 +74,7 @@ make.example.files() ### R objects - - The data for chromosome 12 of the cattle study mentioned above as object of the *rehh* data class `haplohh`. This object becomes available by the command `data(haplohh_cgu_bta12)`. + - The data set for chromosome 12 of the cattle study mentioned above as object of the *rehh* data class `haplohh`. This object becomes available by the command `data(haplohh_cgu_bta12)`. - The scores *iHH* and *iES* as obtained by the function `scan_hh()` applied on SNPs of the whole genome for the population CGU (defined above) and another population EUT ("European taurine"). They reside in the accompanying package `rehh.data` and are obtained by `library(rehh.data)` followed by `data(wgscan.cgu)` resp. `data(wgscan.eut)`. @@ -91,15 +84,16 @@ The sequence of alleles on a chromosome is referred to as its *haplotype* and so ## Overview -The package calculates three statistics by the following steps: +The package calculates three statistics which can be used to perform whole-genome scans for selection: *iHS*, *XP-EHH* and *Rsb*. *iHS* compares alleles within a single population while the other two compare sites between populations. 
The calculation proceeds technically in five steps which are performed by running two commands and combining tables: -- each marker is taken in turn as a "focal marker" around which the extended haplotype homozygosity (*EHH*) is measured -- *EHH* is summerized into a number by integration -- two integrals are compared by taking their log ratio (two alleles for within-population or two populations for cross-population statistics) -- the genome-wide distribution of these log-ratios is normalized +- each marker is taken in turn as a "focal marker" around which the extended haplotype homozygosity is computed at further markers in increasing distance to the focal marker up to some stop criterion +- for each focal marker these quantities are "integrated" over the surrounding markers +- these integrals have to be calculated for each chromosome separately and the resulting tables to be combined to yield whole-genome statistics +- at each focal position, two such integrals are compared (either from two alleles or from two populations) by taking their log ratio +- the distribution of these log-ratios is standardized -Here is a minimal code example for a single population and a single chromossome: -```{r minimalcodeexample, results = "hide"} +Here is a minimal code example for calculating *iHS* on a single chromosome: +```{r minimalcodeexample, fig.align = 'center', results = "hide"} hh <- # data input data2haplohh( hap_file = "bta12_cgu.hap", @@ -108,14 +102,15 @@ hh <- # data input allele_coding = "map" ) scan <- scan_hh(hh) # calculation of EHH and integration -ihs <- ihh2ihs(scan) # log ratio for alleles and normalization + # (combine results from different chromosomes) +ihs <- ihh2ihs(scan) # log ratio for alleles and standardization manhattanplot(ihs) # plot of the statistics ``` # Data input {#input} The package *rehh* requires as input: - - a haplotype data file for each population of interest (see section \@ref(hapfile)). 
+ - a haplotype data file (see section \@ref(hapfile)). and, if the haplotype data file is neither in *variant call format* nor in the format of *ms* output, @@ -130,8 +125,8 @@ Five haplotype input file formats are supported: - a *standard* haplotype format. Each row represents a haplotype with marker genotype in columns as in the example file `bta12_cgu.hap` containing 280 haplotypes with 1424 SNPs each (see section \@ref(LoadDataEx1)). The first element of each row is taken as a haplotype identifier. - a *transposed* format with haplotypes in columns and markers in rows as in the example file `bta12_cgu.thap`. This format is similar to the one produced by the phasing program *SHAPEIT2* [@OConnell2014]. This format assumes neither row nor column names and hence no haplotype identifiers can be specified (see section \@ref(LoadDataEx2)). - the output file format from the phasing program *fastPHASE* [@Scheet2006] as in the `bta12_hapguess_switch.out` example file. Note that this file format allows to include haplotypes from several populations (if the -u *fastPHASE* option was used) (see section \@ref(LoadDataEx3)). - - *variant call format (vcf)*, comprising both haplotype and marker information. In order to read files in this format, the package *vcfR* or the packages *data.table* and *R.utils* (the latter is needed for compressed files) have to be installed (see section \@ref(LoadDataEx4)). - - The output of the simulation program *ms* [@Hudson2002] and its derivatives *msHOT* [@Hellenthal2007], *ms^2^* [@Ewing2010] and *ms'* [@Kelleher2016]. In order to read these files, the package *gap* has to be installed (see section \@ref(LoadDataEx5)). + - *variant call format (vcf)*, comprising both haplotype and marker information. 
In order to read files in this format, the package [vcfR](https://cran.r-project.org/package=vcfR) or the packages [data.table](https://cran.r-project.org/package=data.table) and [R.utils](https://cran.r-project.org/package=R.utils) (the latter is needed for compressed files) have to be installed (see section \@ref(LoadDataEx4)). + - The output of the simulation program *ms* [@Hudson2002] and its derivatives *msHOT* [@Hellenthal2007], *ms^2^* [@Ewing2010] and *ms'* [@Kelleher2016]. In order to read these files, the package [gap](https://cran.r-project.org/package=gap) has to be installed (see section \@ref(LoadDataEx5)). Alleles in standard or transposed haplotype format can be provided either coded (by integer numbers) or without coding (e.g. as nucleotides) (see section \@ref(LoadData)). @@ -224,7 +219,7 @@ hh <- data2haplohh(hap_file = "bta12_cgu.hap", ### Example 2: reading haplotype file in transposed format (*SHAPEIT2*--like) {#LoadDataEx2} -If the haplotype input file `bta12_cgu.thap` is in "transposed" format, the option `haplotype.in.columns` has to be set to `TRUE` while all other parameters remain unaffected with respect to example 1. Note that this is the only format that has to be explicitly declared by the user. +If the haplotype input file is in "transposed" format (like `bta12_cgu.thap`), the option `haplotype.in.columns` has to be set to `TRUE` while all other parameters remain unaffected with respect to example 1. This is the only format which is not recognized automatically, but has to be explicitly declared by the user. 
```{r example2} hh <- data2haplohh(hap_file = "bta12_cgu.thap", @@ -247,7 +242,7 @@ hh <- data2haplohh(hap_file = "bta12_hapguess_switch.out", allele_coding = "map") ``` -If no value is specified for the `popsel` argument and more than one population is detected in the *fastPHASE* output file, an error in produced and the available population numbers printed: +If no value is specified for the `popsel` argument and more than one population is detected in the *fastPHASE* output file, an error is produced and the available population numbers printed: ```{r error = TRUE} hh <- data2haplohh(hap_file = "bta12_hapguess_switch.out", @@ -263,11 +258,10 @@ Ancestral alleles are sometimes marked by upper case as "high confident" and by If the `AA` key is absent, the option `polarize_vcf` should be set to `FALSE` and the allele coding of the *vcf* file is directly used as internal coding. -If there is data for more than one chromosome in the file, the chromosome of interest has to be specified by `chr.name`. Since always the whole file is read in, it may be advisable for large data sets to create chromosome-specific files. +If there is data for more than one chromosome in the file, the chromosome of interest has to be specified by `chr.name`. Since always the whole file is read in, it is advisable to split large data sets into chromosome-specific files. In order to process *vcf* files, the package [vcfR](https://cran.r-project.org/package=vcfR) or the package -[data.table](https://cran.r-project.org/package=data.table) (which in turn needs [R.utils](https://cran.r-project.org/package=R.utils) to read compressed files) have to be installed. The parameter `vcf_reader` has to be set to either `"vcfR"` or `"data.table"`. [Note: at the time of writing, the package *vcfR* has been removed from CRAN, but can still be installed from https://github.com/knausb/vcfR, following instructions there.] 
- +[data.table](https://cran.r-project.org/package=data.table) (which in turn needs [R.utils](https://cran.r-project.org/package=R.utils) to read compressed files) have to be installed. The parameter `vcf_reader` has to be set to either `"vcfR"` or `"data.table"`. In the file `bta12_cgu.vcf.gz` the ancestral allele was set as reference and hence no further polarizing is necessary. @@ -280,11 +274,13 @@ hh <- data2haplohh(hap_file = "bta12_cgu.vcf.gz", ### Example 5: reading ms output {#LoadDataEx5} The function `data2haplohh()` automatically checks whether the haplotype file is in the output format of the simulation program *ms* [@Hudson2002]. If this is the case, the parameters `map_file` and `allele_coding` are ignored. If the file contains several 'runs' (as referred to by the parameter `nrep` of *ms*), it is necessary to specify the number of the run in option `chr.name`. Note that always the whole file is read, so that it might be advisable to spread large simulations over separate files. -One argument of the `data2haplohh` function is specifically dedicated to *ms* output, although it works with other formats as well: *ms* gives chromosomal positions as fractions of the interval [0,1] and in order to obtain more realistic values, these positions can be multiplied by a factor, set by `position_scaling_factor`. +One argument of the `data2haplohh` function is specifically dedicated to *ms* output, although it works with other formats as well: *ms* gives chromosomal positions as fractions of the interval [0,1] and in order to obtain more realistic values, these positions can be multiplied by a factor, set by `position_scaling_factor`. + +Note that *ms* output can contain multiple markers with the same (rounded) position, which *rehh* does not accept. In this case the numerical precision for chromosomal positions in the *ms* output should be increased (option `-p` of *ms*, option `-oformat` of *msms*). 
-Note that *rehh* does not accept multiple markers with the same position and hence it is highly recommended to increase the numerical precision for chromosomal positions in the *ms* output. +Setting `remove_multiple_markers` to `TRUE` entails that from consecutive markers with the same position only the first one is retained and a warning containing the number of removed markers is printed. Note that this effectively transforms the "infinite sites model" used for simulations by *ms* into a "finite sites model". -In order to read this format, the package [gap](https://cran.r-project.org/package=gap) has to be installed. +In order to read the *ms* format, the package [gap](https://cran.r-project.org/package=gap) has to be installed. ```{r ms_example, eval = requireNamespace("gap", quietly = TRUE)} hh <- data2haplohh(hap_file = "ms.out", @@ -309,7 +305,7 @@ hh_subset = subset(hh, select.mrk = -1) ## Definition and computation ### The (allele-specific) *Extended Haplotype Homozygosity (EHH)* {#EHH} -For any given allele $a$ of a focal marker $s$, sometimes referred to as a *core* allele, the *Extended Haplotype Homozygosity (EHH)* is defined as the probability that two randomly chosen chromosomes, carrying the core allele, are homozygous over a given surrounding chromosomal region [@Sabeti2002]. It is estimated from a sample by calculating the homozygosity of the chromosomal chunk between the focal marker and another marker $t$ by the formula +For an allele $a$ of a focal marker $s$, sometimes referred to as a *core* allele, the *Extended Haplotype Homozygosity (EHH)* is defined as the probability that two randomly chosen chromosomes, carrying the core allele, are homozygous over a given surrounding chromosomal region [@Sabeti2002]. 
It is estimated from a sample by calculating the homozygosity of the chromosomal chunk between the focal marker and another marker $t$ by the formula \begin{equation} \mathrm{EHH}_{s,t}^a=\frac{1}{n_{a}(n_a-1)}\sum\limits_{k=1}^{K^a_{s,t}}n_k(n_k-1) (\#eq:ehh) @@ -317,19 +313,21 @@ For any given allele $a$ of a focal marker $s$, sometimes referred to as a *core where $n_a$ represents the number of chromosomes carrying the core allele $a$, $K^a_{s,t}$ represents the number of **different** shared haplotypes and $n_k$ refers to the number of chromosomes pertaining to the $k$-th such shared haplotype. If there is no missing data, it holds that $n_a=\sum\limits_{k=1}^{K^a_{s,t}}n_k$. In the case of unphased chromosomes from diploid individuals (see section \@ref(phasing)) the extended haplotype homozygosity can be calculated as follows [@Tang2007]: -we consider only chromosomes from individuals that are homozygous for the allele $a$ at the focal marker $s$ and estimate *EHH* at some marker $t$ by the fraction of individuals that are (still) homozygous over the entire chromosomal stretch between $s$ and $t$. Let $I^a_{s,t}$ denote the number of individuals that are homozygous from marker $s$ til marker $t$. We can reformulate the fraction of individuals in terms of the fraction of shared haplotypes: since haplotypes of different individuals are not compared they can be regarded as distinct by definition, hence $K_{s,s}^a=I_{s,s}^a=\frac{1}{2}n_a$. With increasing distance of $t$ from $s$, any increase in $K_{s,t}^a$ is tantamount to a decrease of the number of homozygous individuals, yielding +we consider only chromosomes from individuals that are homozygous for the allele $a$ at the focal marker $s$ and estimate *EHH* at some marker $t$ by the fraction of individuals that are (still) homozygous over the entire chromosomal stretch between $s$ and $t$. Let $I^a_{s,t}$ denote the number of individuals that are homozygous from marker $s$ until marker $t$. Then 
\begin{equation} -\mathrm{EHH}_{s,t}^a=\frac{I_{s,t}^a}{I_{s,s}^a}=\frac{n_a-K_{s,t}^a}{\frac{1}{2}n_a}\;. +\mathrm{EHH}_{s,t}^a=\frac{I_{s,t}^a}{I_{s,s}^a}\;. (\#eq:ehh2) \end{equation} -No matter which of the two definitions is used, it is common practice to stop computation when *EHH* reaches a certain lower threshold, e.g. 0.05. +*EHH* is usually computed only for the region where it surpasses a given threshold (e.g., $EHH > 0.05$). ### The integrated *EHH* (*iHH*) {#iHH} -By definition, irrespective of the allele considered, *EHH* starts at 1 and decays to 0 with increasing distance of *t* from the focal marker *s*. For a given core allele, the integrated *EHH* (*iHH*) is defined as the area under the *EHH* curve which, in turn, is defined by the *EHH* values and associated chromosomal positions [@Voight2006]. The integral is computed with a simple standard method, called the *trapezoidal rule*. +By definition, *EHH* starts at 1 and decays to 0 with increasing distance of *t* from the focal marker *s*. For a given core allele, the integrated *EHH* (*iHH*) is defined as the area under the *EHH* curve which, in turn, is defined by the *EHH* values and associated chromosomal positions [@Voight2006]. The integral is computed with a simple numerical method, called the *trapezoidal rule*. + +Note that, technical details aside, the *iHH* value is nothing else than the average length of shared haplotypes. ### The (site-specific) Extended Haplotype Homozygosity (*EHHS*) {#EHHS} -An extended haplotype homozygosity can be defined as well without reference to core alleles. In this case, +An extended haplotype homozygosity can be defined as well without regard to core alleles. In this case, the quantity is aimed to reflect the probability that any two randomly chosen chromosomes from a population are homozygous over a given surrounding chromosomal region of a focal marker. 
In contrast to the allele-specific *EHH* defined in the previous section, the chromosomes are not required to carry a specific allele at the focal marker. We adopt the naming by [@Tang2007] as *site--specific* EHH, abbreviated by *EHHS*. Note, however that this quantity is sometimes referred to as *EHH*, too, and there is no agreed notation in the literature. *EHHS* was used in genome scans in two versions: un-normalized by [@Sabeti2007] and normalized by [@Tang2007]. @@ -357,7 +355,7 @@ $\mathrm{EHHS}_{s,t}=1-h_{s,t}$ and hence \begin{equation*} \mathrm{nEHHS}_{s,t}=\frac{\mathrm{EHHS}_{s,t}}{\mathrm{EHHS}_{s,s}}\;. \end{equation*} -Thus $\mathrm{nEHHS}_{s,t}$ is just normalized in order to yield 1 at the focal marker $s$. Note that the normalization factor depends on the frequency of the alleles at the focal marker and consequently is in general not the same for different focal markers. +Thus $\mathrm{nEHHS}_{s,t}$ is just normalized in order to yield 1 at the focal marker $s$. Note that the normalization factor depends on the frequency of the alleles at the focal marker and consequently is not necessarily the same for different focal markers. Furthermore, we note that *EHHS* and *EHH* are related by \begin{equation*} @@ -365,9 +363,9 @@ Furthermore, we note that *EHHS* and *EHH* are related by \end{equation*} where for the sake of simplicity we assume that the focal marker has only two alleles $a1$ and $a2$. *EHHS* might hence be viewed as a linear combination of the *EHH*'s of the focal alleles, weighted by roughly the square of the focal allele frequencies. 
-In the case of unphased chromosomes from diploid individuals (see section \@ref(phasing)) *EHHS* can be calculated like *EHH* using equation \@ref(eq:ehh2) without the restriction to core alleles: +In the case of unphased chromosomes from diploid individuals (see section \@ref(phasing)) *EHHS* can be calculated like *EHH* in Equation \@ref(eq:ehh2), just without reference to core alleles: \begin{equation} -\mathrm{EHHS}_{s,t}=\frac{I_{s,t}}{I_{s,s}}=\frac{n-K_{s,t}}{\frac{1}{2}n}\;. +\mathrm{EHHS}_{s,t}=\frac{I_{s,t}}{I_{s,s}}\;. (\#eq:ehhs2) \end{equation} Note that defined this way, *EHHS* is always 1 at the focal marker. Hence there is no distinction between $\mathrm{EHHS}$ and $\mathrm{nEHHS}$. @@ -378,21 +376,23 @@ Again, unphased *EHHS* can be related to unphased *EHH* by \end{equation} where for the sake of simplicity we assumed a bi-allelic focal marker with alleles $a1$ and $a2$. -As with *EHH*, the *EHHS* is usually computed only for the region where its value surpasses a given threshold (e.g., *EHHS*>0.05). +As with *EHH*, the *EHHS* is usually computed only for the region where its value surpasses a given threshold (e.g., $EHHS > 0.05$). ### The integrated *EHHS* (*iES*) {#iES} Like *EHH*, *EHHS* has its maximum at the focal marker and decays to 0 with increasing distance from the focal marker. For a given focal marker, analogously to *iHH*, *iES* is defined as the integrated *EHHS* [@Tang2007]. Depending on whether *EHHS* or *nEHHS* is integrated, we yield *iES* and *inES* respectively. As with *iHH*, the numerical integration uses the *trapezoidal rule*. +Note that, technical details aside, the *iES* and *inES* values represent the average length of shared haplotypes. The length of shared haplotypes with different core alleles yields zero and these are included in the former but not the latter. 
+ ## The function `calc_ehh()` {#calcehh} The function `calc_ehh()` computes *EHH* for all alleles of a focal marker $s$ relative to markers $t$ upstream and downstream. For each allele the corresponding integral *iHH* of the *EHH* curve is calculated as well. Three options can be specified to constrain the computation of *EHH*: -- `limehh` sets a threshold below which further calculation of *EHH* is stopped. Its default value is 0.05. Note that lowering this cut-off, although increasing the accuracy of *EHH* estimates, might actually decrease the power to detect selective events since under neutrality a tiny fraction (<<0.05) of very long shared haplotypes can be expected, too. +- `limehh` sets a threshold below which further calculation of *EHH* is stopped. Its default value is 0.05. Note that lowering this cut-off might actually decrease the power to detect selective events since under neutrality a tiny fraction of sequences has very long shared haplotypes which, if not capped, confound the signal of selection [@Klassmann2020]. - `limhaplo` defines the smallest acceptable number of evaluated chromosomes and has a default (and mimimum) value of 2. This parameter might be increased if missing values are suspected to be non-randomly distributed leading to a biased drop-out of evaluated chromosomes. -- `limhomohaplo` sets a minimum number of homozygous chromosomes below which calculation of *EHH* is stopped (or not even started). Its default (and minimum) value is 2. This number should be increased to 4 for small samples of unphased haplotypes in order to limit the influence of a single pair of shared haplotypes (see section \@ref(phasing).) +- `limhomohaplo` sets a minimum number of homozygous chromosomes below which calculation of *EHH* is stopped (or not even started). Its default (and minimum) value is 2. 
This number should be increased to 4 for small samples of unphased haplotypes in order to limit the influence of a single shared haplotype (see section \@ref(phasing).) Several parameters influence the *IHH* values (the integral over *EHH*): @@ -404,7 +404,7 @@ Several parameters influence the *IHH* values (the integral over *EHH*): - integration is performed by default over the area between the graph defined by the *EHH* values and the horizontal line y = `limehh`. If numerical agreement with the program *Hapbin* is wanted, this area should be extended to the x-axis by setting `lower_y_bound` to zero. -- by default the *EHH* curve is defined by linearly interpolating *EHH* values between consecutive markers, yielding a continuous curve. However in particular for full re-sequencing data, it is more accurate to let this function decrease step-wise at each marker by setting `interpolate`to `FALSE` (although the effect is likely to be minor). +- by default, the *EHH* curve is defined by linearly interpolating *EHH* values between consecutive markers, yielding a continuous curve. However, in particular for full re-sequencing data, it is more accurate to let this function decrease step-wise at each marker by setting `interpolate` to `FALSE` (although the difference is likely to be minor). The option `polarized`, `TRUE` by default, in this function merely affects the order and labeling of alleles. @@ -464,7 +464,7 @@ plot(calc_ehh(haplohh_cgu_bta12, ``` ## The function `calc_ehhs()` -The `calc_ehhs()` function computes $\mathrm{EHHS}$ and $\mathrm{nEHHS}$ around the focal marker $s$ relative to another marker $t$. This function also computes the corresponding integrals $\mathrm{iES}$ and $\mathrm{inES}$ respectively. The options are identical to those of the function `calc_ehh` (see previous section), except that `polarized` is not needed here. 
Details are available by the command: +The `calc_ehhs()` function computes *EHHS* and normalized *EHHS* around the focal marker $s$ relative to another marker $t$. This function also computes the corresponding integrals *iES* and *inES* respectively. The options are identical to those of the function `calc_ehh` (see previous section), except that `polarized` is absent, because variant ancestry status does not figure in the formulas. Details are available by the command: ```{r, eval=FALSE} ?calc_ehhs ``` @@ -477,7 +477,7 @@ res <- calc_ehhs(haplohh_cgu_bta12, include_nhaplo = TRUE) ``` -The output is similar to that of `calc_ehh` except that there are no alleles to be distinguished, but instead the normalized and non-normalized versions of *EHHS*. A list with four elements is obtained: +The output is similar to that of `calc_ehh()`, except that there are no alleles to be distinguished, but instead the wether *EHHS* is normalized or not. A list with four elements is obtained: 1. `mrk.name`: the name/identifier of the focal marker. 2. `ehhs`: a data frame with *EHHS* and *nEHHS* values along the chromosome around the focal marker. Optionally, the column `NHAPLO` can be included to show how many chromosomes were evaluated at each marker. @@ -610,14 +610,16 @@ p^\text{right}_\text{iHS}=-\log_{10}\left(1-\Phi\left(\text{iHS}\right)\right) \end{equation*} for the opposite case. -In case of unpolarized alleles, the *iHH* values of major and minor alleles are opposed to obtain *uniHS*. Since derived allele frequency cannot be accounted for, no binning should be performed. The resulting standardized *iHS* cannot be expected to follow a normal distribution and p-values are not meaningful. +Note that this procedure is controversial, because we identify the empirical distribution with the distribution under the null hypothesis of neutrality. 
This is an approximation at best and only warranted when it can be assumed that there are so few selected sites that their influence on the overall shape of the distribution can be neglected. + +In case of unpolarized alleles, the *uniHS* is taken as the ratio of *iHH* from minor to major allele. Since derived allele frequency cannot be accounted for, no binning should be performed. The resulting standardized *iHS* cannot be expected to follow a normal distribution and p-values as defined above are not meaningful. ### The function `ihh2ihs()` {#ihh2ihs} The `ihh2ihs()` function computes *iHS* using a data frame containing the *iHH* values for ancestral and derived (resp. major and minor) alleles as obtained by the `scan_hh()` function (see section \@ref(scanhh)). The argument `min_maf` allows to exclude focal markers according to their minor allele frequency (by default `min_maf`=0.05). The argument `freqbin` controls the size (or number) of the allele frequency bins used to perform standardization (see section \@ref(ihs)). More precisely, allele frequency bins are built from `min_maf` to 1-`min_maf` in steps of size `freqbin` (by default `freqbin`=0.025). If an integer of 1 or greater is specified, a corresponding number of equally spaced bins is created. If `freqbin` is set to 0, standardization is performed considering each observed frequency as a discrete frequency class, which is useful when there are only a few distinct haplotypes. For unphased data, *iHH* is calculated using only haplotypes of individuals which are homozygous at the focal marker. This number can be considerably lower than the absolute allele frequency. Hence, in addition to `min_maf`, the option `min_nhaplo` (default `NA`) should be used to reduce statistical noise arising from too few evaluated haplotypes. -Optionally, the allele frequencies of the input data frame can be included into the output data frame by setting `include_freq` to `TRUE`. 
+Optionally, the allele frequencies of the input data frame can be included into the output by setting `include_freq` to `TRUE`. A p-value is calculated for standardized *iHS* values. By default, it is two-sided, but a side can be chosen by setting argument `p.side` to `"left"` or `"right"`. @@ -840,6 +842,8 @@ The colors of the chromosomes in Figures \@ref(fig:manhattanplot) and \@ref(fig: Candidate regions as obtained by the function `calc_candidate_regions()` can be added to the plot as parameter `cr`. Individual markers can be highlighted by setting argument `mrk` to a vector of marker IDs or a data frame with positions (containing columns with name `CHR` and `POSITION`). +By default, chromosomes are separated by an inset of 5,000,000 bases. This value can be increased by the corresponding parameter in order to further reduce overlap between data points of neighboring chromosomes. + In order to reduce the number of plotted data points, the data set can be rasterized in both dimensions by parameter `resolution`. The data points are then rounded to the specified resolution and duplicate points removed. Furthermore, it is possible to specify a subset or a re-ordering of chromosomes with help of parameter `chr.name` as in Figure \@ref(fig:manhattanplotsub). @@ -853,6 +857,7 @@ manhattanplot(wgscan.ihs.cgu, main = "iHS (CGU cattle breed)", cr = cr.cgu, mrk = "F1205400", + inset = 1E+7, resolution = c(200000, 0.05)) # set back to default colors palette("default") @@ -860,7 +865,7 @@ palette("default") ## Genome wide score plots: the function `manhattan()` of package `qqman` -The package [qqman](https://cran.r-project.org/package=qqman) contains a function `manhattan()` which is similar to the function `manhattanplot()` of this package. The input data frame is expected to have a slightly different format, though. 
Hence, before plotting we need to "translate" our data as in the following example with *ihs* values: +The package [qqman](https://cran.r-project.org/package=qqman) contains a function `manhattan()` which is similar to the function `manhattanplot()` of this package. The input data frame is expected to have a slightly different format, though. Hence, before plotting we need to "translate" our data as in the following example with *iHS* values: ```{r rehh2qqman} # extract data frame from result list @@ -954,7 +959,7 @@ newick <- as.newick(furcation, hap.names = hap.names(haplohh_cgu_bta12)) ``` Such a string can be rendered graphically e.g. by the R package [ape](https://cran.r-project.org/package=ape) yielding Figure \@ref(fig:newick): -```{r newick, , eval = requireNamespace("ape", quietly = TRUE), fig.align = 'center', fig.width = 6, fig.height = 6, fig.lp = 'fig:', fig.cap = 'Graphical output of the ape::plot.phylo() function', fig.pos = "!h"} +```{r newick, eval = requireNamespace("ape", quietly = TRUE), fig.align = 'center', fig.width = 6, fig.height = 6, fig.lp = 'fig:', fig.cap = 'Graphical output of the ape::plot.phylo() function', fig.pos = "!h"} library(ape) tree <- ape::read.tree(text = newick) plot(tree, @@ -967,7 +972,7 @@ plot(tree, ## The functions `calc_haplen()` and `plot.haplen()` -The length of a particular extended shared haplotype in a sample can be defined as the range that a chromosome shares a haplotype with at least one other chromosome. For a given chromosome it corresponds to the maximal extension of the inner branches to both sides of the focal marker in a furcation diagram. The function `calc_haplen()` calculates this quantity: +To each haplotype in the sample the length of its longest shared haplotype is assigned, i.e. the range over which it is identical to at least one other haplotype (the latter might be different left and right to the focal marker). 
It corresponds to the maximal extension of the inner branches to both sides of the focal marker in a furcation diagram. The function `calc_haplen()` calculates this quantity: ```{r} haplen <- calc_haplen(furcation) ``` @@ -1033,12 +1038,12 @@ remove.example.files() # Data considerations ## Multi-allelic markers {#multiallelic} -For many species, a low per-site mutation rate ensures that the vast majority of Single Nucleotide Polymorphisms (SNPs) appears only with two alleles in a sample. Hence bi-allelic SNPs will constitute the foremost kind of data to apply our package onto. However, we think the ability to calculate the statistics for multi-allelic markers might be useful +For many species, a low per-site mutation rate ensures that the vast majority of Single Nucleotide Polymorphisms (SNPs) is observed with only two alleles in a sample. Hence bi-allelic SNPs will constitute the foremost kind of data to apply our package onto. However, we think the ability to calculate the statistics for multi-allelic markers might be useful - for species and/or genomic regions with a high per-site mutation rate - for genetic variation in form of (short) tandem repeats or (larger) copy number variations which are multi-allelic by definition and may carry information not captured by SNP markers - because the relative rarity of multi-allelic SNPs might make these particularly interesting -- since the original approach of [@Sabeti2002] was not to compare *extended haplotype homozygosity* between *core alleles* of a single SNP, but between multiple *core haplotypes* defined by several neighboring SNPs; such a partition could be created in *rehh* by adding an artificial multi-allelic marker. +- since the original approach of [@Sabeti2002] did not compare *EHH* on the two *core alleles* of a single SNP, but for multiple *core haplotypes* defined by several neighboring SNPs; such a partition could be created in *rehh* by adding an artificial multi-allelic marker. 
## Dealing with gaps {#gaps} Certain genomic regions such as centromeres are difficult to sequence and can give rise to large gaps between consecutive markers. If not accounted for, these will cause spuriously inflated values of the "integrals" *iHH* and *iES*. [@Voight2006] applied two corrections for gaps. First, they introduced a penalty proportional to physical distance that resulted in any gap being greater than 20 kb to be re-scaled to this number. Second, they discarded integration, if two consecutive markers with a distance greater than 200 kb were encountered. Both methods are implemented in *rehh*, yet turned off by default, since the corresponding thresholds should be adapted manually to the specific data set. @@ -1059,23 +1064,21 @@ Errors or typos aside, there are several possibilities how multiple markers with - The *variant call format* allows to specify different kinds of markers in the same file. Hence at a certain position one might observe a SNP as well as an Insertion/Deletion or a tandem repeat. - In output of *ms* the positions are given with a pre-set precision and consequently the positions of two different segregating sites might be rounded to the same number. -Since it is unclear how *rehh* should handle such markers, they are not accepted as input. Ideally, multiple markers should be dealt with by a pre-processing of the data outside of the package. As a quick-and-dirty work-around, we offer the option `remove_multiple_markers`, which, if set to `TRUE`, removes all but the first marker with identical positions. +Since it is unclear how *rehh* should handle such markers, they are not accepted as input. Ideally, multiple markers should be dealt with by a pre-processing of the data outside of the package. As a quick-and-dirty work-around, we offer the option `remove_multiple_markers` in function `data2haplohh()`, which, if set to `TRUE`, removes all but the first marker with identical positions. 
## Dealing with unphased data {#phasing} -Notwithstanding expensive experimental methods, current high-throughput genotyping / sequencing technologies cannot directly assign alleles to specific chromosomes of a heterozygous diploid (or multiploid) individual. Instead, this task of *phasing* is typically performed by specialized bioinformatic tools like the previously mentioned *SHAPEIT* [@OConnell2014] and *fastPHASE* [@Scheet2006]. Although computationally demanding, the application of these tools is straight-forward and the results usually of sufficient quality for the calculation of *EHH* based statistics. Typically, the tools interpolate missing values away. +Notwithstanding expensive experimental methods, current high-throughput genotyping / sequencing technologies cannot directly assign alleles to specific chromosomes of a heterozygous diploid (or multiploid) individual. Instead, this task of *phasing* is typically performed by specialized bioinformatic tools like *SHAPEIT* [@OConnell2014] and *fastPHASE* [@Scheet2006]. Although computationally demanding, the application of these tools is straight-forward and the results usually of sufficient quality for the calculation of *EHH* based statistics. Typically, the tools interpolate missing values away. In the presumably rare cases where phasing is not feasible, *EHH* or *EHHS* can only be meaningfully estimated by reducing the set of compared chromosomes to those of homozygous (at the focal marker) individuals (assuming that the input data is ordered correspondingly). However, this reduction entails a substantial loss of power; even by an adapted parameter setting (see below) at the very minimum 10, but better up to 30 sequences are needed to obtain at least moderately accurate estimations. For the within-population statistic *iHS*, the latter requirement concerns both major and minor alleles of a marker and scans should not be performed on samples comprising less than 100 sequences. 
Even for sample sizes of 200 sequences, meaningful estimation of *iHS* is hence restricted to variants of intermediate frequencies, i.e. high minor frequency. For the cross-population statistics *Rsb* and *XP-EHH* a minimum number of 30 sequences from homozygous individuals is usually fulfilled if the sample size of each population exceeds 60 sequences. - - -For unphased sequences +Hence, for unphased sequences the following parameters should be set: - the option `phased` of the functions `calc_ehh()`, `calc_ehhs()` and `scan_hh()` must be set to `FALSE`. However, if the data is actually phased, this entails a substantial loss of power to detect selection! -Most of the variance (and hence "noise") comes at any marker from the longest shared haplotypes. To limit their contribution +A few shared haplotypes of extreme length are usually encountered in neutrally evolving regions. In order to limit this "statistical noise", cut-off rules are for unphased sequences even more important than they are for phased ones: - the cut-off for the calculation of *EHH* resp. *EHHS* defined by option `limehh` resp. `limehhs` should be increased from the default value of 0.05 to 0.1. - in function `ihh2ihs()`, hence for a within-population scan using *iHH* values, in addition to the filtering by the MAF of core alleles (parameter `min_maf`, default 0.05), a minimum absolute number of evaluated haplotypes should be set by parameter `min_nhaplo`; this value should never be lower than 10 and, if the sample size allows, be as high as 30. @@ -1085,9 +1088,9 @@ See [@Klassmann2020] for a study on estimating *iHS*, *Rsb* and *XP-EHH* using u ## Dealing with unpolarized data {#polarizing} -The designation of alleles as 'ancestral' or 'derived' is referred to as *polarization*.
Since sequences of ancient genomes are available only for a few species and restricted to a limited time back to the past, the 'ancestral' allele is usually inferred to be the one carried by one or more outgroup species such as chimpanzees or gorillas for humans. However, this presupposes the existence of a reference sequence of suitable 'neighbor' species of sufficient quality as well as reliable genome-wide alignments. These requirements are not trivial and even if they are fulfilled, any alignment will not cover the whole genome and the covered part will contain mis-specified alleles due to invisible secondary or back-mutations [@Baudry2003], possibly causing spurious signals of selection [@Hernandez2007]. +The designation of alleles as 'ancestral' or 'derived' is referred to as *polarization*. Since sequences of ancient genomes are available only for a few species and restricted to a limited time back to the past, the 'ancestral' allele is usually inferred to be the one carried by one or more outgroup species such as chimpanzees or gorillas for humans. However, this presupposes the existence of a reference sequence of a suitable 'neighbor' species of sufficient quality as well as reliable genome-wide alignments. These requirements are not trivial and even if they are fulfilled, any alignment will not cover the whole genome and the covered part will contain mis-specified alleles due to invisible secondary or back-mutations [@Baudry2003], possibly causing spurious signals of selection [@Hernandez2007]. -Note that the bin-wise standardization of $iHS$ is the only calculation step within our package where the information about ancestry is exploited. The information of ancestry status is valuable since the expected values under neutral evolution depend on the respective allele frequencies at the focal marker (see Figure \@ref(fig:freqbin) of this vignette and Figure 4 of [@Voight2006]). 
The binning of markers by frequency before its standardization (see section \@ref(ihh2ihs)) is aimed to eliminate most of this dependence. For unpolarized alleles this correction cannot be done. +Note that the bin-wise standardization of *iHS* is the only calculation step within our package where the information about ancestry is exploited. The information of ancestry status is valuable since the expected values under neutral evolution depend on the respective allele frequencies at the focal marker (see Figure \@ref(fig:freqbin) of this vignette and Figure 4 of [@Voight2006]). The binning of markers by frequency before its standardization (see section \@ref(ihh2ihs)) is aimed to eliminate most of this dependence. For unpolarized alleles this correction cannot be done. Hence two parameters are important when dealing with unpolarized data: @@ -1113,19 +1116,39 @@ $\sqrt{\frac{1}{n}\sum(x_i-\bar{x})^2}$ for the standard deviation while *rehh* 6. The default number of bins is 50 in *hapbin*, yielding a bin width of 0.02. The default width in *rehh* is 0.025 (yielding 36 bins, see point above!). Setting the number of bins in *hapbin* to 40 with option `-b` or `--bin` yields a bin width of 0.025. -7. If run in default mode, *hapbin* calculates *EHH* by (notation as in section \@ref(EHH)) -\begin{equation*} -\mathrm{EHH}^a_{s,t}=\sum_{k=1}^{K^a_{s,t}}\left(\frac{n_k}{n_a}\right)^2\;. -\end{equation*} -For a set of $n$ chromosomes this estimator reaches its minimum value of $\frac{1}{n}$ if all of them are distinct. Yet formula \@ref(eq:ehh) used by *rehh* and applied by *hapbin* if run with option `-a` or `--binom` returns zero in this case. The difference reflects distinct sampling strategies, either with replacement or without. For increasing sample size both converge. -The same holds for *EHHS*. +7. *hapbin* uses by default another estimator for homozygosity than *rehh* (see section \@ref(estimation)). +If run with option `-a` or `--binom`, it uses the same as *rehh*. 
8. Integration over *EHH* resp. *EHHS* is performed by *hapbin* on the area between the curve spanned by these quantities and the x-axis (y=0) while *rehh* by default integrates only over the part of that area that is above the threshold set by the parameters `limehh` resp. `limehhs`, i.e. the area between the curve and the line y=threshold. This is not to be confused with the condition for truncation at left and right ends of the curve which is (for all practical purposes) handled identically by both programs. Setting in *rehh* the parameter `lower_y_bound` to zero makes the integration identical to that of *hapbin*. As mentioned above, `limehh(s)` of *rehh* corresponds to `-c` or `--cutoff` of *hapbin*. 9. By default, the parameter `discard_integration_at_border` is `TRUE` in *rehh*. It has to be set to `FALSE` in order to conform to *hapbin*. 10. Large differences can arise from different handling of gaps during the integration of *EHH* resp. *EHHS* yielding *iHH* resp. *iES*. *Hapbin* has a parameter `-s` or `--scale` to "down-weight" large gaps by capping them to the specified value. Its default value is 20000 while the corresponding option in *rehh* is turned off by default, but can be set by `scalegap`. -The option `maxgap` within *rehh* leads to a stop of the integration and if the parameter `discard_integration_at_border` is set to `TRUE`, then no value is reported. This has no counterpart in *hapbin*. Instead, *hapbin* allows to specify a maximum length of Extended Haplotypes (disabled by default) which is not possible in *rehh*. +The option `maxgap` within *rehh* leads to a stop of the integration and if the parameter `discard_integration_at_border` is set to `TRUE`, then no value is reported. This has no counterpart in *hapbin*. Instead, *hapbin* allows to specify a maximum length of Extended Haplotypes (disabled by default) which is available as option of the function `scan_hh_full()` in *rehh*. 
+ +\clearpage +# About estimating homozygosity {#estimation} + +The term *homozygosity* as a component of the abbreviation *EHH* refers to the probability that two arbitrarily chosen chromosomes from a large population are identical at a given locus or in a given region. It does not make any statement about homozygosity of individuals or even presuppose that individuals are diploid. One might even argue whether the term *homogeneity* would have been more appropriate. + +If there are $K$ alleles in the population and each allele has a population frequency of $f_k$, then this probability is given by +$$H=\sum_{k=1}^{K}f_k^2\;.$$ +For each allele $k$, its population frequency can be estimated by its sample frequency $x_k$: if the sample contains $n$ chromosomes and allele $k$ is observed $n_k$ times, then +$$\hat{f_k}=\frac{n_k}{n}=x_k\;.$$ +It seems straightforward to estimate the population homozygosity from a sample by +$$\hat{H_1}=\sum_{k=1}^Kx_k^2=\sum_{k=1}^K\left(\frac{n_k}{n}\right)^2\;.$$ +However, it turns out that this estimator is biased (it yields values that tend to be slightly too high). The following estimator, instead, is unbiased [@Nei1974]: +$$\hat{H_2}=\frac{n}{n-1}\sum_{k=1}^{K}x_k^2-\frac{1}{n-1}\;.$$ +The latter is used by *rehh*. We can see this e.g. in Equation \@ref(eq:ehhssab), when we consider each (shared) haplotype in the region between markers $s$ and $t$ as an allele. We get + +$$EHHS=\frac{1}{n(n-1)}\sum_{k=1}^{K_{s,t}}n_k(n_k-1)=\frac{n}{n-1}\frac{1}{n^2}\sum_{k=1}^{K_{s,t}}(n_k^2-n_k)=\frac{n}{n-1}\left(\sum_{k=1}^{K_{s,t}}\frac{n_k^2}{n^2}-\frac{n}{n^2}\right)=\hat{H_2}\;.$$ +*hapbin*, in contrast, uses by default estimator $\hat{H_1}$ and refers to $\hat{H_2}$ as the "alternative" estimator. Evidently, for large $n$, the difference between the two becomes negligible. For small $n$ this is not necessarily so.
If we consider a minimal sample of two non-identical chromosomes, hence $n=2$ and $K=2$, then we have +$$\hat{H_1}=\left(\frac{1}{2}\right)^2+\left(\frac{1}{2}\right)^2=\frac{1}{2}$$ +and +$$\hat{H_2}=\frac{1}{2\cdot 1}(1\cdot 0+1\cdot 0)=0\;.$$ +Interestingly, although $\hat{H_1}$ is biased, it yields values which are on average closer to the population value than $\hat{H_2}$, since the variance of $\hat{H_1}$ is smaller than that of $\hat{H_2}$ [@Nei1974]. + +It is unlikely, though, that the choice of the estimator has a major effect on detecting selection. \clearpage # References diff --git a/vignettes/vignette.bib b/vignettes/vignette.bib index 9e107ea..364e41b 100644 --- a/vignettes/vignette.bib +++ b/vignettes/vignette.bib @@ -218,6 +218,17 @@ @Manual{Rkey Url = {http://www.R-project.org} } +@article{Sabeti2006, +author = {Sabeti, Pardis C.}, +doi = {10.1126/science.1124309}, +journal = {Science}, +number = {5780}, +pages = {1614--1620}, +title = {{Positive natural selection in the human lineage}}, +volume = {312}, +year = {2006} +} + @Article{Sabeti2002, Title = {Detecting recent positive selection in the human genome from haplotype structure.}, Author = {Pardis C Sabeti and David E Reich and John M Higgins and Haninah Z P Levine and Daniel J Richter and others}, @@ -381,6 +392,16 @@ @article{Baudry2003 volume = {165}, year = {2003} } +@article{Weigand2018, + author = {Weigand, Hannah and Leese, Florian}, + doi = {10.1093/zoolinnean/zly007}, + journal = {Zoological Journal of the Linnean Society}, + number = {2}, + pages = {528--583}, + title = {{Detecting signatures of positive selection in non-model species using genomic data}}, + volume = {184}, + year = {2018} +} @Article{Voight2006, Title = {A map of recent positive selection in the human genome.}, Author = {Benjamin F Voight and Sridhar Kudaravalli and Xiaoquan Wen and Jonathan K Pritchard}, @@ -404,6 +425,16 @@ @Article{Voight2006 Timestamp = {2011.12.01}, Url = 
{http://dx.doi.org/10.1371/journal.pbio.0040072} } +@article{Utsunomiya2015, +author = {Utsunomiya, Yuri T. and {P{\'{e}}rez O'Brien}, Ana M.P. and Sonstegard, Tad S. and S{\"{o}}lkner, Johann and Garcia, Jos{\'{e}} F.}, +doi = {10.3389/fgene.2015.00036}, +journal = {Frontiers in Genetics}, +number = {FEB}, +pages = {1--13}, +title = {{Genomic data as the "hitchhiker's guide" to cattle adaptation: Tracking the milestones of past selection in the bovine genome}}, +volume = {5}, +year = {2015} +} @article{Vitti2013, abstract = {The past fifty years have seen the development and application of nu-merous statistical methods to identify genomic regions that appear to be shaped by natural selection. These methods have been used to in-vestigate the macro-and microevolution of a broad range of organisms, including humans. Here, we provide a comprehensive outline of these methods, explaining their conceptual motivations and statistical inter-pretations. We highlight areas of recent and future development in evolutionary genomics methods and discuss ongoing challenges for re-searchers employing such tests. 
In particular, we emphasize the impor-tance of functional follow-up studies to characterize putative selected alleles and the use of selection scans as hypothesis-generating tools for investigating evolutionary histories.}, author = {Vitti, Joseph J and Grossman, Sharon R and Sabeti, Pardis C.}, @@ -419,12 +450,21 @@ @article{Vitti2013 volume = {47}, year = {2013} } +@article{Nei1974, +author = {Nei, Masatoshi and Roychoudhury, A. K.}, +journal = {Genetics}, +number = {2}, +pages = {379--390}, +title = {{Sampling Variances of Heterozygosity and Genetic Distance}}, +volume = {76}, +year = {1974} +} @misc{Klassmann2020, -doi = {10.22541/au.158584282.24875401}, -url = {https://doi.org/10.22541%2Fau.158584282.24875401}, +doi = {10.22541/au.160405572.29972398/v1}, +journal = {https://doi.org/10.22541/au.160405572.29972398/v1}, publisher = {Authorea, Inc.}, -author = {Alexander Klassmann and Renaud Vitalis and Mathieu Gautier}, -title = {Detecting selection using Extended Haplotype Homozygosity ({EHH})-based statistics on unphased or unpolarized data (preprint)}, +author = {Alexander Klassmann and Mathieu Gautier}, +title = {Detecting selection using Extended Haplotype Homozygosity-based statistics on unphased or unpolarized data (preprint)}, year = {2020} }