-
Notifications
You must be signed in to change notification settings - Fork 10
/
calculate_association.pl
executable file
·1732 lines (1427 loc) · 84.4 KB
/
calculate_association.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env perl
use warnings;
use strict;
use Carp;
use Getopt::Long;
use Pod::Usage;
BEGIN {($_=$0)=~s{[^\\\/]+$}{};$_||="./"}
use lib $_, $_."kext";
use GWA;
our $VERSION = '$Revision: bbb13c8a31de6a6e9a1e71ca347a7d02a855a27b $';
our $LAST_CHANGED_DATE = '$LastChangedDate: 2009-07-10 12:30:41 -0700 (Fri, 10 Jul 2009) $';
our ($verbose, $help, $man);
our ($pedheaderfile, $gtfile);
our ($qt, $tdt, $tsp, $pdt, $remove, $keep, $exclude, $extract, $output, $mind_threshold, $geno_threshold, $maf_threshold, $mme_threshold, $fme_threshold, $ime_threshold,
$hwe_threshold, $perfam, $force_tdt, $alleleAB, $cc, $cycle, $seed, $cellsize, $canonical_gtfile_format, $illumina_gtfile_format, $affymetrix_gtfile_format,
$plink_tpedfile_format, $simple_format,
$marker_start, $marker_end, $snppropfile, $allmarker, $allind, $flush_output, $perm_method);
our ($orig_command_line);
processArgument ();
main ();
sub processArgument {
$orig_command_line = $0 . " @ARGV";
GetOptions('verbose|v'=>\$verbose, 'help|h'=>\$help, 'man|m'=>\$man, 'qt'=>\$qt, 'tdt'=>\$tdt, 'tsp'=>\$tsp, 'pdt'=>\$pdt, 'cc|assoc'=>\$cc, 'force_tdt'=>\$force_tdt,
'remove=s@'=>\$remove, 'keep=s@'=>\$keep, 'exclude=s@'=>\$exclude, 'extract=s@'=>\$extract, 'output=s'=>\$output,
'allmarker'=>\$allmarker, 'allind'=>\$allind, 'mind_threshold=f'=>\$mind_threshold,
'geno_threshold=f'=>\$geno_threshold, 'maf_threshold=f'=>\$maf_threshold, 'mme_threshold=f'=>\$mme_threshold,
'fme_threshold=f'=>\$fme_threshold, 'ime_threshold=f'=>\$ime_threshold, 'hwe_threshold=f'=>\$hwe_threshold,
'perfam'=>\$perfam, 'alleleAB|ab'=>\$alleleAB, 'cycle=i'=>\$cycle, 'seed=i'=>\$seed,
'cellsize=i'=>\$cellsize, 'canonical_gtfile_format'=>\$canonical_gtfile_format, 'illumina_gtfile_format'=>\$illumina_gtfile_format,
'affymetrix_gtfile_format'=>\$affymetrix_gtfile_format, 'plink_tpedfile_format'=>\$plink_tpedfile_format, 'simple_format'=>\$simple_format,
'mstart=i'=>\$marker_start, 'mend=i'=>\$marker_end,
'snppropfile=s'=>\$snppropfile, 'flush!'=>\$flush_output, 'perm_method=i'=>\$perm_method) or pod2usage ();
$help and pod2usage (-verbose=>1, -exitval=>1, -output=>\*STDOUT);
$man and pod2usage (-verbose=>2, -exitval=>1, -output=>\*STDOUT);
@ARGV or pod2usage (-verbose=>0, -exitval=>1, -output=>\*STDOUT);
@ARGV == 2 or pod2usage ("Syntax error");
if (not defined $flush_output or $flush_output == 1) {
$| = 1; #by default force a flush after every read or write to minotor program progress in real-time (it however increase disk overhead when running the program in parallel in multiple machines so --noflush can be used)
}
($pedheaderfile, $gtfile) = @ARGV;
if ($allmarker) {
defined $geno_threshold || defined $maf_threshold || defined $mme_threshold || defined $hwe_threshold and pod2usage ("Error in argument: please do not specify any marker exclusion criteria when --allmarker is specified");
$geno_threshold = 1;
$maf_threshold = 0;
$mme_threshold = 1;
$hwe_threshold = 0;
print STDERR "NOTICE: the --allmarker argument set all marker exclusion criteria as: --geno 1 --maf 0 --mme 0 --hwe 0\n";
}
if ($allind) {
defined $mind_threshold || defined $fme_threshold || defined $ime_threshold and pod2usage ("Error in argument: please do not specify any sample exclusion criteria when the --allsample argument is specified");
$mind_threshold = 1;
$fme_threshold = 1;
$ime_threshold = 1;
print STDERR "NOTICE: the --allind argument set all individual/sample exclusion criteria as: --mind 1 --fme 1 --ime 1\n";
}
defined $mind_threshold or $mind_threshold = 0.1; #missingness (non-call rate) for individual (same as PLINK)
defined $geno_threshold or $geno_threshold = 0.1; #genotyping call rate threshold for SNP (same as PLINK)
defined $maf_threshold or $maf_threshold = 0.01; #minor allele frequency threshold for SNP (same as PLINK)
defined $mme_threshold or $mme_threshold = 0.1; #marker mendelian error threshold for SNP (same as PLINK)
defined $fme_threshold or $fme_threshold = 1; #(OPTION NOT WELL-DEFINED YET) #family mendelian error threshold, where one error in one individual is counted as one error for the family (different as PLINK)
defined $ime_threshold or $ime_threshold = 0.02; #individual (offspring) mendelian error threshold (for 500K marker, this equals to less than 10K mendel errors for an offspring)
defined $hwe_threshold or $hwe_threshold = 0.001; #hardy-weinburg equilibrium threshold for SNP with exact test (same as PLINK)
#prepare various output file handles
if (defined $output) {
open (STDOUT, ">$output") or confess "Error: cannot write to outputfile $output: $!";
eval {
open(STDERR, " | tee $output.log 1>&2");
};
if ($@) {
print STDERR "WARNING: Failed to redirect standard error to log file (Error message will be printed in STDERR only but not recorded in log file)\n";
} else {
print STDERR "NOTICE: program notification/warning messages that appear in STDERR will be also recorded in log file $output.log\n";
}
open (MINFO, ">$output.minfo") or confess "Error: cannot write to marker information file $output.minfo: $!";
print STDERR "NOTICE: detailed genotyping summary information for all markers (even if not passing quality control threshold) will be written to $output.minfo\n";
outputMarkerInfo ('Name', 'Chr', 'Position', 'genotype_missing', 'minor_allele_frequency', 'hwe_p_value', 'marker_mendel_error', 'comments');
open (SINFO, ">$output.sinfo") or confess "Error: cannot write to sample information file $output.sinfo: $!";
print SINFO "family_id\tindividual_id\tnum_analyzed_marker\tnocall_count\tnocall_rate\tmendelian_error_count\tmendelian_error_rate\n";
print STDERR "NOTICE: detailed genotyping summary information for all samples will be written to $output.sinfo\n";
open (FINFO, ">$output.finfo") or confess "Error: cannot write to family information file $output.finfo: $!";
print FINFO "family_id\tnum_analyzed_marker\tmendelian_error_count\tmendelian_error_rate\n";
print STDERR "NOTICE: detailed genotyping summary information for all families with Mendelian errors will be written to $output.finfo\n";
if ($perfam) {
$tdt or $tsp or pod2usage ("Error: the --evidence argument is supported only for --tdt and --tsp analysis");
open (EVI, ">$output.evidence") or confess "Error: cannot write to evidence file $output.evidence";
print STDERR "NOTICE: detailed statistical evidence on association for each family will be written to $output.evidence\n";
print EVI ''; #prevent Perl from complaining "file handle used only once"
}
} else {
$perfam and pod2usage ("Error in argument: please specify --output argument when using --evidence argument to record detailed association evidence");
}
$maf_threshold >= 0 and $maf_threshold <= 1 or pod2usage ("Error in argument: the --maf_threshold argument must be between 0 and 1 inclusive");
$hwe_threshold >= 0 and $hwe_threshold <= 1 or pod2usage ("Error in argument: the --hwe_threshold argument must be between 0 and 1 inclusive");
$mme_threshold >= 0 and $mme_threshold <= 1 or pod2usage ("Error in argument: the --mme_threshold argument must be between 0 and 1 inclusive");
$geno_threshold >= 0 and $geno_threshold <= 1 or pod2usage ("Error in argument: the --geno_threshold argument must be between 0 and 1 inclusive");
$fme_threshold >= 0 and $fme_threshold <= 1 or pod2usage ("Error in argument: the --fme_threshold argument must be between 0 and 1 inclusive");
$ime_threshold >= 0 and $ime_threshold <= 1 or pod2usage ("Error in argument: the --ime_threshold argument must be between 0 and 1 inclusive");
$mind_threshold >= 0 and $mind_threshold <= 1 or pod2usage ("Error in argument: the --mind_threshold argument must be between 0 and 1 inclusive");
defined $marker_start and $marker_start >= 1 || pod2usage ("Error in argument: the --mstart argument must be greater or equal to 1 (firts marker)");
$marker_start ||= 1;
defined $marker_end and $marker_end >= $marker_start || pod2usage ("Error in argument: the --mend argument must be greater than or equal to --mstart argument");
$allmarker or print STDERR "NOTICE: the following marker exclusion criteria is in effect: --geno $geno_threshold --maf $maf_threshold --mme $mme_threshold --hwe $hwe_threshold\n";
$allind or print STDERR "NOTICE: the following individual exclusion criteria will *NOT* be used in association test\n However, individuals failing these criteria will be recorded for re-running program (with --remove argument): --mind $mind_threshold --fme $fme_threshold --ime $ime_threshold\n";
defined $cycle or $cycle = 0; #by default no permutation cycle is used
$cycle >= 0 or pod2usage ("Error in argument: --cycle must be a positive integer or zero");
defined $seed or $seed = 1; #default randomization seed is 1
defined $cellsize or $cellsize = 5; #default minimum cellsize (in contingency table) is five (same as PLINK)
$cellsize >= 0 or pod2usage ("Error in argument: --cellsize must be a positive integer or zero");
my $count_format = 0;
$illumina_gtfile_format and $count_format++;
$affymetrix_gtfile_format and $count_format++;
$plink_tpedfile_format and $count_format++;
$canonical_gtfile_format and $count_format++;
$simple_format and $count_format++;
if ($count_format > 1) {
pod2usage ("Error in argument: please specify only one argument from --illumina_gtfile_format, --affymetrix_gtfile_format, --plink_tpedfile_format, --simple_format and --canonical_gtfile_format");
} elsif ($count_format == 0) {
print STDERR "NOTICE: Assuming the input data is in canonical gtfile format, with columns as SNP, Chr, Position and genotypes\n";
$canonical_gtfile_format = 1;
}
my $analysis = 0;
$tdt and $analysis++;
$tsp and $analysis++;
$pdt and $analysis++;
$cc and $analysis++;
$qt and $analysis++;
$analysis > 1 and pod2usage ("Error in argument: please specify only one of the --tdt, --tsp, --pdt or --cc analysis options");
if (not $analysis) {
$cc = 1; #default is to perform case-control association tests
print STDERR "NOTICE: The default analysis type is case-control comparisons\n";
}
$force_tdt and $tdt || pod2usage ("Error in argument: the --force_tdt argument can only be used in conjunction with --tdt argument");
if ($perfam) {
$tdt or $tsp or pod2usage ("Error in argument: the --evidence argument is only applicable for --tdt or --tsp operation to record per-family transmission ratio");
}
$perm_method ||= 1;
$perm_method >= 1 and $perm_method <= 5 or pod2usage ("Error in argument: --perm_method should be an integer between 1-5");
}
sub main {
#reading ped_header file to make sure that this file is specified correctly, before doing any analysis
my ($ped, $ped_index) = GWA::readPedHeaderFile ($pedheaderfile, $qt);
my $ped_stat;
my (%extract_snp, %exclude_snp);
#read snppropfile
my ($snpprop, $proplen);
$snppropfile and ($snpprop, $proplen) = readSNPPropFile ($snppropfile);
#handle the --keep, --remove, --extract and --exclude argument (they specify the initial inclusion/exclusion criteria for samples and markers)
if ($keep) {
my @keepfile = split (/,/, join (',', @$keep));
my ($newped, %keep_not_found);
my $count_keep = 0;
for my $nextfile (@keepfile) {
open (IND, $nextfile) or confess "Error: cannot read from file $nextfile: $!";
while (<IND>) {
s/^\s*|\s*[\r\n]+$//g; #delete heading and trailing spaces and return characters
m/\S/ or next; #skip empty lines
m/^(\S+)\s+(\S+)$/ or confess "Error: invalid record in $nextfile (two non-blank fields expected): <$_>";
if ($ped->{$1}{$2}) {
$newped->{$1}{$2} and next; #duplicate sample identifiers found in the keep files
$newped->{$1}{$2} = $ped->{$1}{$2};
$count_keep++;
} else {
$keep_not_found{"$1:$2"}++;
}
}
close (IND);
}
$newped or print STDERR "FATAL ERROR: no individual found in ped-header-file $pedheaderfile based on the --keep argument (@keepfile)\n" and exit (1000);
if (%keep_not_found) {
my @example = keys %keep_not_found;
@example = splice (@example, 0, 5);
print STDERR "WARNING: ${\(scalar keys %keep_not_found)} individuals (for example: @example) specified by --keep argument are not found in pedheader file $pedheaderfile\n";
}
$ped = $newped;
print STDERR "NOTICE: Finished keeping $count_keep individuals in analysis based on --keep argument (@keepfile)\n";
}
if ($remove) {
my @removefile = split (/,/, join (',', @$remove));
my (%removed_ind, %remove_not_found);
for my $nextfile (@removefile) {
open (IND, $nextfile) or confess "Error: cannot read from file $nextfile: $!";
while (<IND>) {
s/^\s*|\s*[\r\n]+$//g; #delete heading and trailing spaces and return characters
m/\S/ or next; #skip empty lines
m/^(\S+)\s+(\S+)$/ or confess "Error: invalid record in $remove (two fields expected): <$_>";
if ($ped->{$1}{$2}) {
delete $ped->{$1}{$2};
$removed_ind{$1, $2}++;
} elsif ($removed_ind{$1, $2}) {
1;
} else {
$remove_not_found{"$1:$2"}++;
}
}
close (IND);
}
if (%remove_not_found) {
my @example = keys %remove_not_found;
@example = splice (@example, 0, 5);
print STDERR "WARNING: ${\(scalar keys %remove_not_found)} individuals (for example: @example) specified by --remove argument are not found in pedheader file $pedheaderfile\n";
}
print STDERR "NOTICE: Finished removing ${\(scalar keys %removed_ind)} individuals from analysis based on --remove argument (@removefile)\n";
}
if ($extract) {
my @extractfile = split (/,/, join (',', @$extract));
for my $nextfile (@extractfile) {
open (EXTRACT, $nextfile) or confess "Error: cannot read from file $nextfile: $!";
while (<EXTRACT>) {
s/^\s*|\s*[\r\n]+$//g;
m/\S/ or next; #skip empty lines
m/^(\S+)$/ or confess "Error: invalid record found in $nextfile (one marker name expected per line): <$_>";
$extract_snp{$1}++;
}
close (EXTRACT);
}
print STDERR "NOTICE: Finished reading ${\(scalar keys %extract_snp)} markers to use in analysis based on --extract argument (@extractfile)\n";
}
if ($exclude) {
my @excludefile = split (/,/, join (',', @$exclude));
for my $nextfile (@excludefile) {
open (EXCLUDE, $nextfile) or confess "Error: cannot read from file $nextfile: $!";
while (<EXCLUDE>) {
s/^\s*|\s*[\r\n]+$//;
m/\S/ or next; #skip empty lines
m/^(\S+)$/ or confess "Error: invalid record found in $nextfile (one marker name expected per line): <$_>";
$exclude_snp{$1}++;
}
close (EXCLUDE);
}
print STDERR "NOTICE: Finished reading ${\(scalar keys %exclude_snp)} markers to exclude from analysis based on --exclude argument (@excludefile)\n";
}
#inspect the pedigree, analyze family structure and calculate certain statistics
$ped_stat = GWA::inspectPedigree ($ped, $qt);
#perform actual association analysis
my ($fam_mendel_error, $ind_mendel_error, $ind_missing, $num_analyzed_marker, $nf_index) = calculateAssociation ($gtfile, $ped, $ped_index, $ped_stat, \%extract_snp, \%exclude_snp, $snpprop);
#output sample statistics (missing genotypes and mendelian errors for offsprings)
outputSampleStat ($ped_index, $fam_mendel_error, $ind_mendel_error, $ind_missing, $num_analyzed_marker);
#identifying individuals failing the inclusion criteria and print them out (to be included in --remove argument to re-run the program)
outputRemoveIndividual ($ped, $ped_index, $fam_mendel_error, $ind_mendel_error, $ind_missing, $num_analyzed_marker);
print STDERR "NOTICE: Program finished!\n";
close (STDOUT); close (STDERR);
}
#write individual remove files to help re-execute the program
sub outputRemoveIndividual {
my ($ped, $ped_index, $fam_mendel_error, $ind_mendel_error, $ind_missing, $num_analyzed_marker) = @_;
my %bad_ind;
if ($output) {
open (IND, ">$output.remove") or confess "Error: cannot write individuals that should be removed from analysis to the file $output.remove: $!";
print STDERR "NOTICE: Writting individuals who fail to meet the --mind, --fme_threshold or --ime_threshold criteria to $output.remove file\n";
} else {
*IND = *STDERR;
print STDERR "NOTICE: Identifying individuals who fail to meet the --mind --fme_threshold or --ime_threshold criteria\n";
}
for my $i (0 .. @$ped_index-1) {
my ($fam, $ind) = ($ped_index->[$i][0], $ped_index->[$i][1]);
$ped->{$fam}{$ind} or next; #this individual is excluded from analysis (not in %ped), possibly due to --remove argument
if ($ind_missing->[$i] and $ind_missing->[$i]/$num_analyzed_marker > $mind_threshold or $ind_mendel_error->[$i] and $ind_mendel_error->[$i]/$num_analyzed_marker > $ime_threshold) {
$bad_ind{$i} and next; #this individual is already written to file
$bad_ind{$i}++;
print IND $ped_index->[$i][0], "\t", $ped_index->[$i][1], "\n";
}
if ($fam_mendel_error->{$fam} and $fam_mendel_error->{$fam} / $num_analyzed_marker > $fme_threshold) {
$bad_ind{$i} and next; #this individual is already written to file
$bad_ind{$i}++;
print IND $ped_index->[$i][0], "\t", $ped_index->[$i][1], "\n";
}
}
scalar (keys %bad_ind) and print STDERR "NOTICE: A total of ", scalar (keys %bad_ind), " individuals that fail to meet the --mind, --fme_threshold or --ime_threshold criteria\n";
scalar (keys %bad_ind) and print STDERR "NOTICE: Please consider to re-run the program using the --remove argument to remove these individuals\n";
if ($output) {
scalar (keys %bad_ind) and print STDERR "\tCommand to execute: $orig_command_line --remove $output.remove\n";
close (IND);
}
}
sub analyzeHeaderLine {
my ($header) = @_;
$header =~ s/\s*[\r\n]+$//;
my @header = split (/\t/, $header);
my ($name_index, $chr_index, $pos_index, @ac_index, @gt_index, @allindex);
if ($canonical_gtfile_format) {
($name_index, $chr_index, $pos_index) = (0, 1, 2);
if ($alleleAB) {
@gt_index = (3..@header-1);
} else {
@ac_index = (3..@header-1);
}
} elsif ($illumina_gtfile_format) {
for my $i (0 .. @header-1) {
lc $header[$i] eq 'name' and $name_index = $i;
lc $header[$i] eq 'chr' and $chr_index = $i;
lc $header[$i] eq 'position' || lc $header[$i] eq 'pos' and $pos_index = $i;
$header[$i] =~ m/\.Allele Calls$/i and push @ac_index, $i;
$header[$i] =~ m/\.GType/i and push @gt_index, $i;
}
} elsif ($affymetrix_gtfile_format) { #this format is created by apt-probeset-genotype, with header lines starting with #, and with genotype denoted as 0,1,2 and -1.
1;
} elsif ($plink_tpedfile_format) {
@header = split (/\s+/, $header); #separated by space rather than tab
($name_index, $chr_index, $pos_index) = (1, 0, 3);
if ($alleleAB) {
@gt_index = (4..@header-1);
} else {
@ac_index = (4..@header-1);
}
} elsif ($simple_format) {
($name_index, $chr_index, $pos_index) = (0, undef, undef);
if ($alleleAB) {
@gt_index = (1..@header-1);
} else {
@ac_index = (1..@header-1);
}
}
return ($name_index, $chr_index, $pos_index, \@ac_index, \@gt_index, scalar (@header));
}
#this subroutine generate the sample index for all permutation cycles
#during each permutation cycle, the cells in the @perm_index array tells what two groups to compare (for case-control study), or what families to flip transmission/untransmission status (for family-based study), or how to scramble quantitative traits (for QT study)
sub calculatePermIndex {
my ($cycle, $nf_index) = @_;
my @perm_index;
if ($cc) {
my @allindex = (@{$nf_index->[0]}, @{$nf_index->[1]});
for my $current_cycle (1 .. $cycle) {
srand ($current_cycle + ($seed-1)*$cycle); #this ganrantee that when doing 100 cycles in 10 machines, each of 1000 permutation get a differnt seed
fisher_yates_shuffle (\@allindex);
my @case_index = @allindex[0 .. @{$nf_index->[0]}-1];
my @control_index = @allindex[@{$nf_index->[0]} .. @allindex-1];
push @perm_index, [\@case_index, \@control_index];
}
} elsif ($tdt or $tsp) {
for my $current_cycle (1 .. $cycle) {
srand ($current_cycle + ($seed-1)*$cycle);
my @switch;
for (1.. @$nf_index) {
push @switch, (rand () > 0.5) ? 1: 0;
}
push @perm_index, \@switch;
}
} elsif ($qt) {
my @allindex = (0 .. @{$nf_index->[0]}-1); #this is not a random shuffle of first element of nf_index, instead it shuffle the position (so that qt is equally shuffled!)
for my $current_cycle (1 .. $cycle) {
srand ($current_cycle + ($seed-1)*$cycle);
fisher_yates_shuffle (\@allindex); #the allindex contains shuffled number ranging from 0 to arrayelement(nfindex)-1, but the nf_index position index in the ped_index array
push @perm_index, [@allindex];
}
}
return \@perm_index;
}
sub generateNFIndex {
my ($ped) = @_;
my ($nf_index);
if ($tdt) {
$nf_index = $force_tdt ? GWA::identifyTrioFromPed ($ped, 'affected') : GWA::identifyIndepentTrioFromPed ($ped, 'affected');
@$nf_index = sort {$a->[0] <=> $b->[0]} @$nf_index; #sort by index of father in pedheaderfile
@$nf_index or confess "Error: unable to find suitable trios from ped_header file $pedheaderfile";
} elsif ($tsp) {
$nf_index = GWA::identifyNuclearFamilyFromPed ($ped, 'affected');
@$nf_index = sort {$a->[0] <=> $b->[0]} @$nf_index; #sort by index of father in pedheaderfile
@$nf_index or confess "Error: unable to find suitable trios or quartets from ped_header file $pedheaderfile";
} elsif ($cc) {
$nf_index = GWA::identifyCaseControlFromPed ($ped); #first element: case; second element: control
scalar (@{$nf_index->[0]}) or confess "Error: unable to find suitable cases from ped_header file $pedheaderfile";
scalar (@{$nf_index->[1]}) or confess "Error: unable to find suitable controls from ped_header file $pedheaderfile";
} elsif ($qt) {
$nf_index = GWA::identifyQTIndFromPed ($ped); #first element: posindex; second element: qtvalue
scalar (@{$nf_index->[0]}) or confess "Error: unable to find suitable individuals for quantitative trait analysis from ped_header file $pedheaderfile";
}
return ($nf_index);
}
sub calculateAssociation {
my ($gtfile, $ped, $ped_index, $ped_stat, $extract_snp, $exclude_snp, $snpprop) = @_;
my ($name_index, $chr_index, $pos_index, $ac_index, $gt_index, $num_record_per_line);
my ($nf_index, $perm_index);
my (@dm_invalidgt, @dm_invalidchr, @dm_gt2allele, @dm_geno, @dm_maf, @dm_mme, @dm_hwe);
my ($num_analyzed_marker) = (0);
my (%fam_mendel_error, @ind_mendel_error, @ind_missing);
my ($counter_line) = (0);
my ($name_length); #the length of the marker name in the association output file (sometimes markers such as CNVs have extraordinarily long names)
#read the header line of the genotype file to make sure the number of samples in the file is the same as the ped file
#determine whether the 'Allele Calls' or the 'GType' records in the gtfile will be used for association analysis
open (GT, $gtfile) or confess "Error: cannot read from GT file $gtfile: $!";
defined ($_ = <GT>) or confess "Error: cannot read anything from GT file $gtfile";
chomp;
$counter_line++;
($name_index, $chr_index, $pos_index, $ac_index, $gt_index, $num_record_per_line) = analyzeHeaderLine ($_);
$num_record_per_line >= 2 or confess "Error: the GT file must contain at least 2 space-delimited records per line, but the header line is: $_";
if ($plink_tpedfile_format) {
open (GT, $gtfile); #open GT file again to start reading from first line
$counter_line = 0;
}
if ($simple_format) {
defined $name_index or confess "Error: unable to find columns for SNP name in the header line ($_) of GT file $gtfile";
} else {
defined $name_index and defined $chr_index and defined $pos_index or confess "Error: unable to find columns for 'Name', 'Chr' and 'Position' in the header line ($_) of GT file $gtfile";
}
if ($alleleAB) {
$gt_index or confess "Error: --alleleAB argument is set but GType column is not found in header line ($_) of GT file $gtfile";
if ($plink_tpedfile_format) {
@$gt_index == 2*@$ped_index or confess "Error: Discordant number of samples: ${\(scalar @$ped_index)} in pedheaderfile $pedheaderfile versus ${\(scalar @$gt_index)} in GT file $gtfile";
} else {
@$gt_index == @$ped_index or confess "Error: Discordant number of samples: ${\(scalar @$ped_index)} in pedheaderfile $pedheaderfile versus ${\(scalar @$gt_index)} in GT file $gtfile";
}
} else {
if (@$ac_index) {
if ($plink_tpedfile_format) {
@$ac_index == 2*@$ped_index or confess "Error: Discordant number of samples: $ped_stat->{num_ind} in pedheaderfile $pedheaderfile versus ${\(scalar @$ac_index)} in genotype file $gtfile";
} else {
@$ac_index == @$ped_index or confess "Error: Discordant number of samples: $ped_stat->{num_ind} in pedheaderfile $pedheaderfile versus ${\(scalar @$ac_index)} in genotype file $gtfile";
}
} elsif (@$gt_index) {
confess "Error: Unable to find 'Allele Calls' records in header line of gtfile $gtfile but found 'GType' records (did you forget the --alleleAB argument?)";
} else {
confess "Error: Unable to find genotype records in the header line of gtfile $gtfile";
}
}
$nf_index = generateNFIndex ($ped); #nf_index has different meaning for different study designs
$cycle and $perm_index = calculatePermIndex ($cycle, $nf_index); #--cycle argument determines the number of permutation cycles
#record the nuclear families to the first line of the EVIDENCE file
if ($tdt || $tsp and $perfam) { #--evidence is only supported for --tdt and --tsp analysis
print EVI "Name";
for my $index (@$nf_index) {
print EVI "\t", $ped_index->[$index->[0]][0], ":"; #family id
for my $i (0 .. @$index-1) {
$i and print EVI ",";
print EVI $ped_index->[$index->[$i]][1]; #individual id
}
}
print EVI "\n";
}
#read and process each marker sequentially
while (<GT>) {
my (@record, @ac, @gt); #ac: allele call (such as ACGT), gt: genotype call (AB only)
my ($name, $chr, $pos);
my ($marker_bad_flag); #a flag showing whether marker failed one criteria (GENO, MAF, HWE, MME, etc);
my ($marker_comment) = ''; #a comment to be printed at the end of each line of the marker info file
#decide whether to process this marker based on --mstart and --mend arguments, and then read in this line
$counter_line++; #current line counter for GT file, which is one more than the counter for markers
if ($marker_start) {
if ($plink_tpedfile_format) {
$counter_line >= $marker_start or next;
} else {
$counter_line-1 >= $marker_start or next;
}
}
if ($marker_end) {
if ($plink_tpedfile_format) {
$counter_line <= $marker_end or last;
} else {
$counter_line-1 <= $marker_end or last;
}
}
s/\s*[\r\n]+$//; #discard trailing newline or return character
if ($plink_tpedfile_format) {
@record = split (/\s+/, $_); #all records are separated by space
} else {
@record = split (/\t/, $_); #all records are separated by tab
}
@record == $num_record_per_line or confess "Error: Discordant number of space/tab-delimited records in $gtfile: $num_record_per_line in header line but ${\(scalar @record)} in line $counter_line: <$_>";
if (defined $chr_index) {
($name, $chr, $pos) = @record[$name_index, $chr_index, $pos_index];
} else {
($name, $chr, $pos) = ($record[$name_index], 99, "NA"); #the chr is treated as a special "autosome"
}
$chr eq '23' and $chr = 'X';
$chr eq '24' and $chr = 'Y'; #for compatibilit with PLINK-formatted TFAM/TPED files
$verbose and print STDERR "NOTICE: Processing marker $name at chr$chr position $pos\n";
#read genotypes for this marker on all individuals
%$extract_snp and $extract_snp->{$name} || next; #this SNP is not specifed in the --extract file so skipped from analysis
$exclude_snp->{$name} and next; #this SNP is excluded from analysis
if ($alleleAB) {
@ac = @record[@$gt_index];
} else {
@ac = @record[@$ac_index];
}
if ($plink_tpedfile_format) {
my @ac1 = @ac;
@ac = ();
for my $i (0 .. @ac1/2-1) {
push @ac, $ac1[$i*2].$ac1[$i*2+1];
}
}
map {s/^NC/00/} @ac;
map {s/^\-\-/00/} @ac;
map {s/^0$/00/} @ac; #all NO_CALL genotypes are converted in the form of 00
map {s/ //} @ac; #get rid of spaces in genotype calls (sometimes there is a space between the two alleles in a genotype call)
#check the --geno_threshold and make sure all genotype calls are recognizable
my @nc_geno = grep {m/^00$/} @ac;
my @normal_geno = grep {m/^[ABCGT1234][ABCGT1234]$/} @ac;
my $geno_missing = @nc_geno/@ac;
my (%ac_count, %allele_count);
if (@nc_geno+@normal_geno != @ac) { #unrecognized genotype call (less than or more than 2 letters)
my @invalid_geno = grep {!m/^[ABCGT01234][ABCGT01234]$/} @ac;
print STDERR "WARNING: marker $name (chr=$chr pos=$pos) has non-recognized genotype calls (@invalid_geno)\n";
$marker_comment .= "(invlid_genotype=" . join (",", @invalid_geno) . ")";
@ac = (('00') x scalar (@ac)); #treat all genotypes as NULL for this marker
$allmarker or push @dm_invalidgt, $name and $marker_bad_flag++;
}
$geno_missing <= $geno_threshold or push (@dm_geno, $name) and $marker_bad_flag++; #flag this marker as bad marker (does not pass QC criteria)
#convert ACGT allele calls to AB genotypes, identify unique alleles
my $allac = join ('', @ac);
$ac_count{$_}++ for (@ac);
$allele_count{$_}++ for (split (//, $allac));
delete $allele_count{0};
my @uniq_allele = sort keys %allele_count;
my (%ac2gt, %gt2ac);
if (not @uniq_allele) { #completely missing genotype (all individuals have NoCall genotype)
$marker_comment .= '(all_genotype_missing)';
@gt = @ac;
} elsif (@uniq_allele == 1) { #only one allele found for this marker
$marker_comment .= "(single_allele=$uniq_allele[0])";
@gt = (('AA') x scalar (@ac));
} elsif (@uniq_allele > 2) {
print STDERR "WARNING: marker $name (chr=$chr pos=$pos) has more than two alleles (@uniq_allele)\n";
$marker_comment .= "(more_than_two_allele=" . join (",", @uniq_allele) . ")";
$allmarker or push @dm_gt2allele, $name and $marker_bad_flag++;
@gt = (('00') x scalar (@ac)); #consider all marker genotypes to be NULL when more than 2 allele is present
} else { #the normal scenario
if ($alleleAB) {
@gt = @ac;
map {s/^BA/AB/} @gt; #make sure A is always listed before B in the genotype calls
} else {
$uniq_allele[1] eq 'B' and print STDERR "WARNING: the 'B' allele is found in genotype calls for marker $name (did you forget the --alleleAB argument?)\n";
@ac2gt{'00', $uniq_allele[0].$uniq_allele[0], $uniq_allele[0].$uniq_allele[1], $uniq_allele[1].$uniq_allele[0], $uniq_allele[1].$uniq_allele[1]} = ('00', 'AA', 'AB', 'AB', 'BB');
%gt2ac = reverse %ac2gt;
@gt = map {$ac2gt{$_}} @ac;
}
}
#calculate minor allele frequency (for founders only)
my (%founder_allele, $maf_founder);
if ($chr eq 'X') {
for my $indgt (@gt[@{$ped_stat->{male_founder_index}}]) {
$indgt eq '00' and next;
$indgt =~ m/^(\w)\1$/ or next; #invalid genotype calls for male chrX
$founder_allele{$1}++;
}
for my $indgt (@gt[@{$ped_stat->{female_founder_index}}]) {
$indgt eq '00' and next;
$founder_allele{$_}++ for (split (//, $indgt));
}
} elsif ($chr eq 'Y') {
for my $indgt (@gt[@{$ped_stat->{male_founder_index}}]) {
$indgt eq '00' and next;
$indgt =~ m/^(\w)\1$/ or next; #invalid genotype calls for male chrY
$founder_allele{$1}++;
}
} else {
for my $indgt (@gt[@{$ped_stat->{founder_index}}]) {
$indgt eq '00' and next;
$founder_allele{$_}++ for (split (//, $indgt));
}
}
if (keys %founder_allele == 0) {
$maf_founder = 0;
} elsif (keys %founder_allele == 1) {
$maf_founder = 0;
} else {
$maf_founder = $founder_allele{A}/($founder_allele{A}+$founder_allele{B});
$maf_founder > 0.5 and $maf_founder = 1-$maf_founder;
}
$maf_founder >= $maf_threshold or push (@dm_maf, $name) and $marker_bad_flag++; #flag this marker as bad marker (does not pass QC criteria)
#print the header line of the association results (the length of the header line depends on the marker name length)
if (not defined $name_length) {
$name_length = length ($name) + 5; #the idea is that: given the first marker, add its length by 5, it should suffice for all subsequent markers (if not, the full marker name is still printed with mis-alignment)
$name_length < 12 and $name_length = 12;
outputAssocHeaderline ($name_length);
}
#check the Hardy-Weinberg equilibrium threshold
my (%founder_gt, $hwe_p_value);
if ($chr eq 'X') {
my @founder_gt = @gt[@{$ped_stat->{female_founder_index}}];
$founder_gt{$_}++ for (@founder_gt);
if (grep {!m/00/} @founder_gt) {
$hwe_p_value = kc::SNPHWE ($founder_gt{'AB'}||0, $founder_gt{'AA'}||0, $founder_gt{'BB'}||0);
} else {
$hwe_p_value = 'NA'; #no genotype information in founders so cannot calculate HWE
}
} elsif ($chr =~ m/^\d+$/) {
my @founder_gt = @gt[@{$ped_stat->{founder_index}}];
$founder_gt{$_}++ for (@founder_gt);
if (grep {!m/00/} @founder_gt) {
$hwe_p_value = kc::SNPHWE ($founder_gt{'AB'}||0, $founder_gt{'AA'}||0, $founder_gt{'BB'}||0);
} else {
$hwe_p_value = 'NA'; #no genotype information in founders
}
} else {
$hwe_p_value = 'NA'; #this statement is reserved for Y chromosome or Mitochondria (non diploid chromosome)
$marker_comment .= "(non_autosome_or_X)";
$allmarker or push @dm_invalidchr, $name and $marker_bad_flag++;
}
if ($hwe_p_value ne 'NA' and $hwe_p_value < $hwe_threshold) {
push @dm_hwe, $name;
$marker_bad_flag++;
}
#calculate association test statistics
my ($marker_mendel_error, %fam_mendel_error_recorded) = ('NA');
if ($tdt or $tsp) {
my ($chi2, $chi2_p, $t_count, $u_count, $index_mendel_error, $tu_trio, $permstring1, $permstring2);
my ($hx11, $hx12, $hx22, $hy11, $hy12, $hy22, $sx1122, $sx2211, $sy12, $sy21, $hx_star);
if ($tdt) {
($chi2, $chi2_p, $t_count, $u_count, $index_mendel_error, $tu_trio) = GWA::calTDT (\@gt, $nf_index, $ped_index, ($chr eq 'X'), $perfam);
$cycle and ($permstring1, $permstring2) = GWA::calTDT_perm (\@gt, $nf_index, $ped_index, ($chr eq 'X'), 0, $perm_index, $cycle);
} elsif ($tsp) {
($chi2, $chi2_p, $sx1122, $sx2211, $sy12, $sy21, $index_mendel_error, $tu_trio) = GWA::calTSP (\@gt, $nf_index, $ped_index, ($chr eq 'X'), $perfam);
$cycle and ($permstring1, $permstring2) = GWA::calTSP_perm (\@gt, $nf_index, $ped_index, ($chr eq 'X'), 0, $perm_index, $cycle);
}
#check the marker mendel error threshold (the $index_mendel_error contains index for offsprings with mendelian inconsistency)
$marker_mendel_error = @$index_mendel_error / @$nf_index;
$marker_mendel_error <= $mme_threshold or push @dm_mme, $name and $marker_bad_flag++;
#mark the family and individual as having mendel error, if this marker is still considered as a good marker
if (not $marker_bad_flag) {
for my $index (@$index_mendel_error) {
$ind_mendel_error[$index]++; #mark the child has mendelian error (but do not mark the parent as having mendelian error)
my $fam = $ped_index->[$index][0];
$fam_mendel_error_recorded{$fam} and next; #this family is already processed for this marker (but two different ind has mendel error)
$fam_mendel_error{$fam}++; #mark the family as having one additional mendel error
$fam_mendel_error_recorded{$fam}++; #mark the family as having mendel error and already recorded (several individuals in same family may have mendel error)
}
}
#prepare the output formatting
if (not $marker_bad_flag) {
if ($tdt) {
outputTDTResult ($name_length, $name, $chr, $pos, $chi2, $chi2_p, $t_count, $u_count, \@uniq_allele, $tu_trio, $permstring1, $permstring2, $snpprop->{$name}||'');
} elsif ($tsp) {
outputTSPResult ($name_length, $name, $chr, $pos, $chi2, $chi2_p, $sx1122, $sx2211, $sy12, $sy21, \@uniq_allele, $tu_trio, $permstring1, $permstring2, $snpprop->{$name}||'');
}
}
} elsif ($cc) {
my ($chi2, $chi2_p, $case_af, $control_af, $permstring1, $permstring2);
($chi2, $chi2_p, $case_af, $control_af) = GWA::calCC (\@gt, $nf_index, ($chr eq 'X'), $cellsize, undef, $ped_stat);
$cycle and ($permstring1, $permstring2) = GWA::calCC_perm (\@gt, $perm_index, ($chr eq 'X'), $cycle, $perm_method, $ped_stat);
if (not $marker_bad_flag) {
outputCCResult ($name_length, $name, $chr, $pos, $chi2, $chi2_p, $case_af, $control_af, \@uniq_allele, $permstring1, $permstring2, $snpprop->{$name}||'');
}
} elsif ($qt) {
my ($a, $b, $f, $f_p, $permstring1, $permstring2);
if ($marker_comment) { #something is wrong with the marker
($a, $b, $f, $f_p) = qw/NA NA NA NA/;
$cycle and $permstring1 = ',NA'x$cycle and $permstring2 = $permstring1;
} else {
($a, $b, $f, $f_p) = GWA::calQT (\@gt, $nf_index, ($chr eq 'X'));
$cycle and ($permstring1, $permstring2) = GWA::calQT_perm (\@gt, $nf_index, $perm_index, ($chr eq 'X'), $cycle);
}
if (not $marker_bad_flag) {
outputQTResult ($name_length, $name, $chr, $pos, $a, $b, $f, $f_p, \@uniq_allele, $permstring1, $permstring2, $snpprop->{$name}||'');
}
}
#write marker information regardless of whether marker_bad_flag is set (regardless of the exclusion criteria)
outputMarkerInfo ($name, $chr, $pos, $geno_missing, $maf_founder, $hwe_p_value, $marker_mendel_error, $marker_comment);
if (not $marker_bad_flag) {
for my $i (0 .. @ac-1) {
$ac[$i] eq '00' and $ind_missing[$i]++; #record individual that has NC genotype at this marker (the marker already passed quality threshold so it is a good marker)
}
$num_analyzed_marker++; #number of analyzed markers (those passing threshold)
}
}
print STDERR "NOTICE: Finished association analysis on $num_analyzed_marker markers that pass inclusion criteria\n";
outputDiscardMarker (\@dm_invalidgt, \@dm_invalidchr, \@dm_gt2allele, \@dm_geno, \@dm_maf, \@dm_hwe, \@dm_mme);
return (\%fam_mendel_error, \@ind_mendel_error, \@ind_missing, $num_analyzed_marker, $nf_index);
}
#print the header line in association results
sub outputAssocHeaderline {
my ($name_length) = @_;
my $name = "Name" . (' ' x $name_length);
$name = substr ($name, 0, $name_length);
if ($tdt) {
print "$name Chr Position A1:A2 T:U CHI2_TDT CHI2_P", $cycle ? "\tCHI2_PERM\tCHI2_P_PERM" : "", $snppropfile ? "\tSNP_property" : "";
} elsif ($tsp) {
print "$name Chr Position T:U_TRIO T:U_QURT CHI2 CHI2_P", $cycle ? "\tCHI2_PERM\tCHI2_P_PERM" : "", $snppropfile ? "\tSNP_property" : "";
} elsif ($cc) {
print "$name Chr Position A1:A2 case_AF cont_AF ALLELIC_chi2 A_P TREND_chi2 T_P GENO_chi2 G_P DOM_chi2 D_P REC_chi2 R_P";
if ($cycle) {
my %permcode = (1=>'A', 2=>'T', 3=>'G', 4=>'D', 5=>'R');
print "\t$permcode{$perm_method}_CHI2_PERM\t$permcode{$perm_method}_CHI2_P_PERM";
}
$snppropfile and print "\tSNP_property";
} elsif ($qt) {
print "$name Chr Position A1:A2 INTERCEPT SLOPE F F_P", $cycle ? "\tF_PERM\tF_P_PERM": "", $snppropfile ? "\tSNP_property":"";
}
print "\n";
}
sub outputMarkerInfo {
my (@info) = @_;
defined $output or return; #when --output is not set, do not write any marker information to file
print MINFO join ("\t", @info), "\n";
}
sub outputDiscardMarker {
my ($dm_invalidgt, $dm_invalidchr, $dm_gt2allele, $dm_geno, $dm_maf, $dm_hwe, $dm_mme) = @_;
@$dm_invalidgt and print STDERR "NOTICE: Total of ", scalar (@$dm_invalidgt), " markers are discarded from analysis due to having invalid (unrecognizable) genotype calls\n";
$output and @$dm_invalidgt and print STDERR "#dm_invalidgt#\n", join ("\n", @$dm_invalidgt), "\n#dm_invalidgt#\n";
@$dm_invalidchr and print STDERR "NOTICE: Total of ", scalar (@$dm_invalidchr), " markers are discarded from analysis due to being on non-autosome and non-X chromosomes\n";
$output and @$dm_invalidchr and print STDERR "#dm_invalidchr#\n", join ("\n", @$dm_invalidchr), "\n#dm_invalidchr#\n";
@$dm_gt2allele and print STDERR "NOTICE: Total of ", scalar (@$dm_gt2allele), " markers are discarded from analysis due to having more than 2 alleles in genotype calls\n";
$output and @$dm_gt2allele and print STDERR "#dm_gt2allele#\n", join ("\n", @$dm_gt2allele), "\n#dm_gt2allele#\n";
@$dm_geno and print STDERR "NOTICE: Total of ", scalar (@$dm_geno), " markers are discarded from analysis due to having genotype frequency lower than geno_threshold $geno_threshold\n";
$output and @$dm_geno and print STDERR "#dm_geno#\n", join ("\n", @$dm_geno), "\n#dm_geno#\n";
@$dm_maf and print STDERR "NOTICE: Total of ", scalar (@$dm_maf), " markers are discarded from analysis due to having minor allele frequency in founders lower than maf_threshold $maf_threshold\n";
$output and @$dm_maf and print STDERR "#dm_maf#\n", join ("\n", @$dm_maf), "\n#dm_maf#\n";
@$dm_hwe and print STDERR "NOTICE: Total of ", scalar (@$dm_hwe), " markers are discarded from analysis due to having Hardy-Weinberg equilibium P value lower than hwe_threshold $hwe_threshold\n";
$output and @$dm_hwe and print STDERR "#dm_hwe#\n", join ("\n", @$dm_hwe), "\n#dm_hwe#\n";
@$dm_mme and print STDERR "NOTICE: Total of ", scalar (@$dm_mme), " markers are discarded from analysis due to having mendelian errors more than marker_mendel_error_threshold $mme_threshold\n";
$output and @$dm_mme and print STDERR "#dm_mme#\n", join ("\n", @$dm_mme), "\n#dm_mme#\n";
}
sub outputQTResult {
my ($name_length, $name, $chr, $pos, $a, $b, $f, $f_p, $uniq_allele, $permstring1, $permstring2, $property) = @_;
my ($allele1, $allele2) = @$uniq_allele;
if (length ($name) < $name_length) {
$name = substr ($name . (' 'x$name_length), 0, $name_length);
}
$allele1 ||= '-';
$allele2 ||= '-';
$chr = substr (" $chr", -4, 4);
$pos = substr (" $pos", -11, 11);
print $name, $chr, $pos, " $allele1:$allele2 ";
if ($a eq 'NA') {
print ' NA', ' ', ' NA';
} else {
print sprintf ("%10.4g", $a), ' ', sprintf ("%10.4g", $b); #the b value might be -3.469e-17 (so there won't be space between a and b)
}
print ' '; #some people reported lack of space between SLOPE and F
if ($f eq 'NA') {
print ' NA', ' ', ' NA';
} else {
print sprintf ("%10.4g", $f), ' ', sprintf ("%10.4g", $f_p);
}
$permstring1 and print "\t", $permstring1;
$permstring2 and print "\t", $permstring2;
if ($snppropfile) {
print $property ? "\t$property" : "\tNA";
}
print "\n";
}
sub outputCCResult {
my ($name_length, $marker, $chr, $pos, $chi2, $chi2_p, $case_af, $control_af, $uniq_allele, $permstring1, $permstring2, $property) = @_;
my ($allele1, $allele2) = @$uniq_allele;
if (length ($marker) < $name_length) {
$marker = substr ($marker . (' 'x$name_length), 0, $name_length);
}
$allele1 ||= '-';
$allele2 ||= '-';
$chr = substr (" $chr", -4, 4);
$pos = substr (" $pos", -11, 11);
print $marker, $chr, $pos, " $allele1:$allele2", sprintf ("%10.4g", $case_af), sprintf ("%10.4g", $control_af);
for my $i (0 .. @$chi2-1) {
if ($chi2->[$i] eq 'NA') {
print ' NA', ' ', ' NA';
} else {
print sprintf ("%10.4g", $chi2->[$i]), ' ', sprintf ("%10.4g", $chi2_p->[$i]);
}
}
$permstring1 and print "\t", $permstring1;
$permstring2 and print "\t", $permstring2;
if ($snppropfile) {
print $property ? "\t$property" : "\tNA";
}
print "\n";
}
sub outputTDTResult {
my ($name_length, $name, $chr, $pos, $chi2, $chi2_p, $t_count, $u_count, $uniq_allele, $tu_trio, $permstring1, $permstring2, $property) = @_;
if (length ($name) < $name_length) {
$name = substr ($name . (' 'x$name_length), 0, $name_length);
}
$chr = substr (" $chr", -4, 4);
$pos = substr (" $pos", -11, 11);
if ($chi2 eq 'NA') {
($chi2, $chi2_p) = (' NA', ' NA');
} else {
$chi2 = sprintf ("%10.4g", $chi2);
$chi2_p = sprintf ("%10.4g", $chi2_p);
}
my ($allele1, $allele2) = @$uniq_allele;
$allele2 ||= '-';
my $transmission_status = substr (" $t_count:$u_count", -10, 10);
print $name, $chr, $pos, " $allele1:$allele2", $transmission_status, $chi2, ' ', $chi2_p;
$permstring1 and print "\t", $permstring1;
$permstring2 and print "\t", $permstring2;
if (@$tu_trio) { #when --evidence argument is set, write the detailed output evidence to the file handle
print EVI "$name\t", join ("\t", @$tu_trio), "\n";
}
if ($snppropfile) {
print $property ? "\t$property" : "\tNA";
}
print "\n";
}
sub outputTSPResult {
my ($name_length, $name, $chr, $pos, $chi2, $chi2_p, $sx1122, $sx2211, $sy12, $sy21, $uniq_allele, $tu_trio, $permstring1, $permstring2, $property) = @_;
my ($trio_tu_ratio, $quartet_tu_ratio);
if (length ($name) < $name_length) {
$name = substr ($name . (' 'x$name_length), 0, $name_length);
}
$chr = substr (" $chr", -4, 4);
$pos = substr (" $pos", -11, 11);
if ($chi2 eq 'NA') {
($chi2, $chi2_p) = (' NA', ' NA');
} else {
$chi2 = sprintf ("%10.4g", $chi2);
$chi2_p = sprintf ("%10.4g", $chi2_p);
}
my ($allele1, $allele2) = @$uniq_allele;
$allele2 ||= '-';
$trio_tu_ratio = substr (" $allele1($sy12):$allele2($sy21)", -15, 15);
$quartet_tu_ratio = substr (" $allele1($sx1122):$allele2($sx2211)", -15, 15);
print $name, $chr, $pos, $trio_tu_ratio, $quartet_tu_ratio, $chi2, ' ', $chi2_p;
$permstring1 and print "\t", $permstring1;
$permstring2 and print "\t", $permstring2;
if (@$tu_trio) { #when --evidence argument is set, write the detailed output evidence to the file handle
print EVI "$name\t", join ("\t", @$tu_trio), "\n";
}
if ($snppropfile) {
print $property ? "\t$property" : "\tNA";
}
print "\n";
}
sub outputSampleStat {
my ($ped_index, $fam_mendel_error, $ind_mendel_error, $ind_missing, $num_analyzed_marker) = @_;
if ($output) {
for my $i (0 .. @$ped_index-1) {
my ($famid, $indid, $fatid, $motid) = ($ped_index->[$i][0], $ped_index->[$i][1], $ped_index->[$i][2], $ped_index->[$i][3]);
$ind_mendel_error->[$i] ||= 0;
$ind_missing->[$i] ||= 0;
print SINFO "$famid\t$indid\t$num_analyzed_marker\t$ind_missing->[$i]\t", $ind_missing->[$i]/$num_analyzed_marker, "\t$ind_mendel_error->[$i]\t", $ind_mendel_error->[$i]/$num_analyzed_marker, "\n";
}
for my $famid (keys %$fam_mendel_error) {
print FINFO $famid, "\t", $fam_mendel_error->{$famid}, "\t", $num_analyzed_marker, "\t", $fam_mendel_error->{$famid} / $num_analyzed_marker, "\n";
}
}
}
sub fisher_yates_shuffle {
my $array = shift;
my $i;
for ($i = @$array; --$i; ) {
my $j = int rand ($i+1);
next if $i == $j;
@$array[$i, $j] = @$array[$j, $i];
}
}
sub readSNPPropFile {
my ($snppropfile) = @_;
my (%snpprop, $maxlength);
open (SNPPROP, $snppropfile) or confess "Error: cannot read from snppropfile $snppropfile: $!";
while (<SNPPROP>) {
m/^(\S+)\t(.+)/ or confess "Error: invalid record found in snppropfile (at least 2 tab-delimited records expected): <$_>";
$snpprop{$1} = $2;
$maxlength ||= length ($2);
$maxlength < length ($2) and $maxlength = length ($2);
}
return (\%snpprop, $maxlength);
}
=head1 SYNOPSIS
calculate_association.pl [arguments] <ped-header-file> <GTfile>
Optional arguments:
-v, --verbose use verbose output
-h, --help print help message
-m, --man print complete documentation
Analysis Type:
--tdt use TDT analysis for independent trios
--tsp use TSP analysis for combined trio and quartet
--pdt use PDT analysis for general pedigree (not implemented yet)
--cc case-control study (default analysis option)
--force_tdt force TDT analysis for non-independent trios in multiplex families
--qt perform quantitative trait analysis (default is binary trait)
Output Control:
--output <file> output root file name (default=STDOUT)
--perfam record detailed transmission ratio for each family
--snppropfile <file> a SNP property file to annotate association results
--(no)flush flush cache after each read/write operation (default=ON)
Inclusion/Exclusion Criteria for Markers:
--allmarker consider all markers (no exclusion on maf, hwe, geno, mme)
--exclude <file(s)> specify markers to exclude from analysis
--extract <file(s)> specify markers to extract (use) in analysis
--geno_threshold <float> genotype frequency threshold for SNP (default=0.1)
--maf_threshold <float> minor allele frequency threshold for SNP (default=0.01)
--mme_threshold <float> marker mendelian error threshold (default=0.1)
--hwe_threshold <float> Hardy-Weinberg equilibrium P-value threshold (default=0.001)
--mstart <int> index (1-based) of first marker to include analysis
--mend <int> index (1-based) of last marker to include in analysis
--cellsize <int> minimum number (inclusive) in cell in contingency table
Inclusion/Exclusion Criteria for Samples:
--allind consider all individuals (no exclusion on mind, fme, ime)
--remove <file(s)> specify individuals to remove from analysis
--keep <file(s)> specify individuals to keep in analysis
--mind_threshold <float> missing genotype threshold for individuals (default=0.1, re-run program)
--fme_threshold <float> family mendelian error threshold (default=1, OPTION NOT DEFINED YET!)
--ime_threshold <float> individual offspring mendelian error threshold (default=0.02, re-run program)
File Format Specification
--alleleAB GT file use AB alphabet for alleles (rather than ACGT)
--ab GT file use AB alphabet for alleles (rather than ACGT)
--canonical_gtfile_format data file in canonical format (columns are name, chr, pos, genotypes)
--simple_format data file in simple format (columns are name, genotypes)
--illumina_gtfile_format data file in Illumina FullDataTable format
--affymetrix_gtfile_format data file in Affymetrix Power Tools genotype call format (to be implemented)
--plink_tpedfile_format data file in TFAM/TPED file format generated by PLINK
Analysis-specific Options: