-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathsort_revdocs.hadoop
executable file
·44 lines (40 loc) · 1.71 KB
/
sort_revdocs.hadoop
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
#!/bin/bash
# Packages the Python virtualenv, uploads it to HDFS and launches a Hadoop
# streaming job over the given input.
#
# Usage: sort_revdocs.hadoop <job_name> <input> <output>
#   job_name  value used for mapreduce.job.name
#   input     path handed to the streaming job's -input option
#   output    path handed to the streaming job's -output option
#
# Exit status: 2 on a usage error, otherwise that of the first failing command
# (or of the hadoop job when every earlier step succeeds).

# Stop at the first failing command, unset variable or failed pipeline stage so
# a broken packaging step can never be followed by a job launch.
set -euo pipefail

# Gather command line args
if (( $# < 3 )); then
  printf 'Usage: %s <job_name> <input> <output>\n' "${0##*/}" >&2
  exit 2
fi
job_name=$1
input=$2
output=$3
# Refresh the virtualenv's packages, zip the virtualenv directory and publish
# the archive to HDFS. Every step is checked: continuing after a failure would
# upload a stale or incomplete archive.
echo "Updating virtualenv"
pip install -r requirements.txt --upgrade || {
  echo "pip install failed; not packaging the virtualenv" >&2
  exit 1
}

echo "Zipping up virtualenv"
venv_dir=/home/halfak/venv/3.4
venv_zip=/home/halfak/venv/3.4.zip
# Rebuild the archive from scratch: zip adds to an existing file, so anything
# removed from the virtualenv since the last run would otherwise keep shipping.
rm -f -- "$venv_zip"
# The subshell keeps the caller's working directory untouched, and the checked
# cd prevents zip from archiving whatever directory we happen to be in when the
# virtualenv directory is missing.
(
  cd "$venv_dir" || exit 1
  # The bare glob keeps the archive entries rooted at the virtualenv's top
  # level, exactly as before.
  # shellcheck disable=SC2035
  zip -rq "$venv_zip" *
) || {
  echo "Failed to zip ${venv_dir} into ${venv_zip}" >&2
  exit 1
}
cp -- "$venv_zip" virtualenv.zip || {
  echo "Failed to copy ${venv_zip} to virtualenv.zip" >&2
  exit 1
}

echo "Moving virtualenv.zip to HDFS"
# -f overwrites the copy left on HDFS by a previous run.
hdfs dfs -put -f virtualenv.zip /user/halfak/virtualenv.zip || {
  echo "Failed to upload virtualenv.zip to HDFS" >&2
  exit 1
}
echo "Running hadoop job"

# Resolve the streaming jar before launching: expanding the glob inline would
# turn a second match into a stray argument and hide a missing jar behind an
# obscure hadoop error.
streaming_jars=(/opt/hadoop/share/hadoop/tools/lib/hadoop-*streaming*.jar)
if [[ ! -f "${streaming_jars[0]}" ]]; then
  echo "No hadoop streaming jar found in /opt/hadoop/share/hadoop/tools/lib" >&2
  exit 1
fi

# Arguments are collected in an array so user-supplied values stay single words
# (no word-splitting or local glob expansion). Generic options (-D, -files,
# -archives) come first, followed by the streaming options.
job_args=(
  -D "mapreduce.job.name=${job_name}"

  # Write block-compressed bzip2 output.
  -D mapreduce.output.fileoutputformat.compress=true
  -D mapreduce.output.fileoutputformat.compress.type=BLOCK
  -D mapreduce.output.fileoutputformat.compress.codec=org.apache.hadoop.io.compress.BZip2Codec

  # Task timeout, in milliseconds per the Hadoop configuration reference.
  -D mapreduce.task.timeout=6000000

  # The first three fields of each mapper output line form the key. Records are
  # partitioned on field 1 and ordered by field 1 (numeric), field 2, then
  # field 3 (numeric).
  -D stream.num.map.output.key.fields=3
  -D mapreduce.partition.keypartitioner.options='-k1,1n'
  -D mapreduce.job.output.key.comparator.class="org.apache.hadoop.mapred.lib.KeyFieldBasedComparator"
  -D mapreduce.partition.keycomparator.options='-k1,1n -k2,2 -k3,3n'

  # NOTE(review): this option used to be passed twice; the redundant copy was
  # dropped. If one of them was meant to be mapreduce.map.speculative, add it.
  -D mapreduce.reduce.speculative=false

  # Point tasks at the lib/ directory of the unpacked virtualenv archive.
  -D mapreduce.reduce.env="LD_LIBRARY_PATH=virtualenv/lib/"
  -D mapreduce.map.env="LD_LIBRARY_PATH=virtualenv/lib/"

  -D mapreduce.map.memory.mb=5120
  -D mapreduce.reduce.memory.mb=5120
  -D mapreduce.reduce.vcores=2
  -D mapreduce.job.reduces=2000

  # Ship the mapper executable and expose the virtualenv archive to tasks under
  # the name "virtualenv".
  -files hadoop/json2tsv
  -archives 'hdfs:///user/halfak/virtualenv.zip#virtualenv'

  -input "${input}"
  -output "${output}"

  # The mapper emits page.id, timestamp and id as the three key fields ahead of
  # a fourth field; the reducer keeps only that fourth field.
  -mapper "bash -c './json2tsv page.id timestamp id -'"
  -reducer "bash -c 'cut -f4'"
  -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner
)

hadoop jar "${streaming_jars[0]}" "${job_args[@]}"