forked from eyashiro/AmpProc
-
Notifications
You must be signed in to change notification settings - Fork 0
/
otutab_sintax_to_ampvis.v1.2.sh
executable file
·173 lines (136 loc) · 5.46 KB
/
otutab_sintax_to_ampvis.v1.2.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
#!/bin/bash
#######################################################
#
# Script: otutab_sintax_to_ampvis.v1.1.sh
#
# Author: Erika Yashiro, Ph.D.
#
# Last modified: 3 September 2019
#
# script.sh -i [otu table_notax] -t [sintax.out.txt] -r [which reference database used for sintax]
#
# REFDATABASE: MiDAS / SILVA (or silva) / RDP / UNITE
#
#######################################################
# Set error check
set -e
set -o pipefail
while getopts :i:t:r: option
do
case "${option}"
in
i) OTUTABLE_NOTAX=${OPTARG} ;;
t) SINTAX=${OPTARG} ;;
r) REFDATABASE=${OPTARG} ;;
\?) echo ""
echo "Invalid option: -$OPTARG" >&2
echo ""
echo "Usage: script.sh -i [otutable_notax.txt] -t [sintax.out.txt] -r [which reference database used for sintax]"
echo ""
echo "Exiting script."
echo ""
exit 1 ;;
:) echo ""
echo "Option -$OPTARG requires an argument"
echo ""
echo "Usage: script.sh -i [otutable_notax.txt] -t [sintax.out.txt] -r [which reference database used for sintax]"
echo ""
echo "Exiting script."
echo ""
exit 1 ;;
esac
done
####################
# FUNCTIONS
####################
REMOVE_FUNCTION () {
rm -f $SINTAX.trimmed label_otu label_sintax label_uniques
rm -f $SINTAX.sorted
rm -f $SINTAX.sorted.tmp*
#rm otutable_notax_sorted.txt
}
####################
# Input files:
#OTUTABLE_NOTAX=$1
#SINTAX=$2
#REFDATABASE=$3
# REFDATABASE: MiDAS / SILVA / RDP / UNITE
# Remove file extension from input otu table name.
FILERAD=`echo $OTUTABLE_NOTAX | sed 's/\..*//g'`
# Sort otu table (-V is version sort)
awk 'NR<2{print $0; next}{print $0 | "sort -k1 -V"}' $OTUTABLE_NOTAX | sed 's/#OTU ID/OTU/g' > otutable_notax_sorted.txt
# In some cases, especially with zotus, a couple raw reads match the same otus/zotus. In such a case some of the otus/zotus get eliminated from the final z/otu table and cause the joining of taxonomy and frequency table to fail.
# Remove entries that don't match up between z/otu table and sintax output.
# z/otu list from z/otu table
cut -f1 $OTUTABLE_NOTAX > label_otu
# z/otu list from sintax output
cut -f1 $SINTAX > label_sintax
# Check that z/otu table and sintax output are congruent sample sets
CONGRUENTLABELS=`cat label_otu label_sintax | sort | uniq -d | wc -l`
if [ $CONGRUENTLABELS -lt 1 ]
then
echo ""
echo "The input table and fasta file do not contain the same samples."
echo "Exiting script"
REMOVE_FUNCTION
exit 1
fi
# remove non-congruent z/otus from sintax file
cat label_otu label_sintax | sort | uniq -u > label_uniques
grep -v -w -F -f label_uniques $SINTAX | sed '/^--$/d' > $SINTAX.trimmed
# Extract columns, sort
#awk -F "\t" '{ print $1"\t"$4 }' $SINTAX | sed 's/,/\t/g' | sort -k1 -V >> $SINTAX.sorted.tmp
awk -F "\t" '{ print $1"\t"$4 }' $SINTAX.trimmed | sort -k1 -V > $SINTAX.sorted.tmp0
# Extract just the taxonomy column
awk -F "\t" '{ print $2 }' $SINTAX.sorted.tmp0 > $SINTAX.sorted.tmp
#echo "Hello, $SINTAX.sorted.tmp"
#head $SINTAX.sorted.tmp
# Swap domain for kingdom if presented as such
#BACTDOMAIN=$(head -n 10 $SINTAX.sorted.tmp | grep -c "^d")
#head -n10 $SINTAX.sorted.tmp | grep "d:" | wc -l
#echo "domain: $BACTDOMAIN"
#if [ -n $DOMAIN ]
#if [ "$BACTDOMAIN" -le 0 ]
# then
# sed -i 's/d:/k:/g' $SINTAX.sorted.tmp
# else
# echo ""
#fi
# Check presence of all taxonomy levels
# use domain or kingdom designation
# count number of occurences of d__ domain, vs k__ kingdom, if grep -c finds nothing, and has exit status 1, then just echo zero.
KDNUM=`grep -c "k:" $SINTAX.sorted.tmp || true`
KDNUM2=`grep -c "d:" $SINTAX.sorted.tmp || true`
#echo "KDNUM: $KDNUM; KDNUM2: $KDNUM2"
if [ $KDNUM -ge 1 ]
then
awk -F ',' 'OFS="," { if ($1 !~ /k:/) {$1="k:,"$1; print $0} else {print $0} }' $SINTAX.sorted.tmp > $SINTAX.sorted.tmpa
else
if [ $KDNUM2 -ge 1 ]
then
awk -F ',' 'OFS="," { if ($1 !~ /d:/) {$1="d:,"$1; print $0} else {print $0} }' $SINTAX.sorted.tmp > $SINTAX.sorted.tmpa
fi
fi
awk -F ',' 'OFS="," { if ($2 !~ /p:/) {$2="p:,"$2; print $0} else {print $0} }' $SINTAX.sorted.tmpa > $SINTAX.sorted.tmpb
awk -F ',' 'OFS="," { if ($3 !~ /c:/) {$3="c:,"$3; print $0} else {print $0} }' $SINTAX.sorted.tmpb > $SINTAX.sorted.tmpc
awk -F ',' 'OFS="," { if ($4 !~ /o:/) {$4="o:,"$4; print $0} else {print $0} }' $SINTAX.sorted.tmpc > $SINTAX.sorted.tmpd
awk -F ',' 'OFS="," { if ($5 !~ /f:/) {$5="f:,"$5; print $0} else {print $0} }' $SINTAX.sorted.tmpd > $SINTAX.sorted.tmpe
awk -F ',' 'OFS="," { if ($6 !~ /g:/) {$6="g:,"$6; print $0} else {print $0} }' $SINTAX.sorted.tmpe > $SINTAX.sorted.tmpf
awk -F ',' 'OFS="," { if ($7 !~ /s:/) {$7="s:"$7; print $0} else {print $0} }' $SINTAX.sorted.tmpf > $SINTAX.sorted.tmpg
# replace kingdom with domain if necessary
#if [ -n $DOMAIN ]
#if [ $DOMAINpresent -gt 0 ]
# then
# sed -i 's/k:/d:/g' $SINTAX.sorted.tmpg
#fi
# Reappend the OTU column
awk -F "\t" '{ print $1 }' $SINTAX.sorted.tmp0 > $SINTAX.sorted.tmp00
paste -d "\t" $SINTAX.sorted.tmp00 $SINTAX.sorted.tmpg > $SINTAX.sorted.tmph
# Append all columns and header
printf "OTU\tKingdom\tPhylum\tClass\tOrder\tFamily\tGenus\tSpecies\n" > $SINTAX.sorted
sed -e 's/:/__/g' -e 's/,/\t/g' $SINTAX.sorted.tmph >> $SINTAX.sorted
# Append tax to otutable
#join -t ' ' otutable_notax_sorted.txt $SINTAX.sorted > ${FILERAD}_${REFDATABASE}.txt
join -t $'\t' otutable_notax_sorted.txt $SINTAX.sorted > ${FILERAD}_${REFDATABASE}.txt
mv otutable_notax_sorted.txt ${FILERAD}.sorted.txt
REMOVE_FUNCTION