scripts for handling the TPCDS data generation and the uploads into S3 storage (#146)


* adding scripts that handle the parallelism of TPCDS data generation and the upload into S3 storage

Signed-off-by: galsalomon66 <[email protected]>

---------

Signed-off-by: galsalomon66 <[email protected]>
galsalomon66 authored Dec 27, 2023
1 parent 381985a commit f31f46e
Showing 2 changed files with 201 additions and 0 deletions.
170 changes: 170 additions & 0 deletions TPCDS/generate_upload_and_remove_infra.bash
@@ -0,0 +1,170 @@
#!/bin/bash

## this script resides in the [galsl/fedora_38:tpcds_v2] docker container.
## the container uses the following repo [ https://github.com/galsalomon66/tpc-ds-datagen-to-aws-s3 ] for the dsdgen application.
## the purpose of this script is to launch multiple instances of the dsdgen application (depending on the number of cores).
## the flow is split between the very big tables and the small tables.
## num_of_cpu defines the degree of parallelism; num_of_partitions defines the number of chunks that together make up a single table (which can be huge).
## each cycle of parallel generation ends with a flow that uploads the generated files into S3 (also done in parallel); once all files are uploaded,
## they are removed, i.e. for a 3TB scale there is no need for 3TB of local disk space (the S3 storage, of course, still needs the full capacity).
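##
## for illustration only (assuming SCALE=3000 and num_of_cpu=56, see the formulas in the functions below):
## the parent tables would be split into int((3000/10)*56) = 16800 partitions each, and the standalone
## tables into int((3000/1000)*56) = 168 partitions each; dsdgen is launched once per partition (child),
## with num_of_cpu generator processes running in parallel before each wait.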


## TODO: set by the user
TPCDS_DIR=/tpcds_output/


all_tables="call_center
catalog_page
customer_address
customer
customer_demographics
date_dim
household_demographics
income_band
item
promotion
reason
ship_mode
store
time_dim
warehouse
web_page
web_site
catalog_returns
catalog_sales
web_returns
web_sales
store_returns
store_sales"

# big tables that are also parents
# a parent table has a child table, i.e. there is a relation between them.
parent_tables="store_sales catalog_sales web_sales inventory"

## standalone tables (not parent tables)
standalone_tables="call_center catalog_page customer_address customer customer_demographics date_dim household_demographics income_band
item promotion reason ship_mode store time_dim warehouse web_page web_site"
#small_tables=""

num_of_cpu=56
num_of_partitions=0

create_dsdgen_workers_non_parent_tables()
{

[ ! -d ${TPCDS_DIR} ] && echo "${TPCDS_DIR} does not exist" && exit
num_of_partitions=$(echo 1 | awk -v sc=${SCALE} -v c=${num_of_cpu} '{print int((sc/1000)*c);}')
if [ $num_of_partitions -le 1 ]
then
num_of_partitions=2
fi

echo "small tables="num_of_partitions=${num_of_partitions}

((i=1))

for t in ${standalone_tables}
do
for c in $(seq 1 ${num_of_partitions})
do
## the command line defines which table, what scale (size), the partition count, which partition to produce, and where to produce it.
echo "time ./dsdgen -dir ${TPCDS_DIR} -table ${t} -scale ${SCALE} -force -parallel ${num_of_partitions} -child ${c} &" >> generate_upload_and_remove_exec.bash
## insert a wait after every num_of_cpu launched processes
if [ $(( i++ % ${num_of_cpu} )) -eq 0 ]
then
echo wait >> generate_upload_and_remove_exec.bash
# once the wait completes, loop over the generated .dat files and upload each one in parallel;
# as each upload finishes, remove the file (upload && remove)
echo upload_and_remove_worker_func >> generate_upload_and_remove_exec.bash
fi
done
done
echo wait >> generate_upload_and_remove_exec.bash
echo upload_and_remove_worker_func >> generate_upload_and_remove_exec.bash
echo "echo small tables done." >> generate_upload_and_remove_exec.bash

chmod +x generate_upload_and_remove_exec.bash
}

create_dsdgen_workers()
{

[ ! -d ${TPCDS_DIR} ] && echo "${TPCDS_DIR} does not exist" && exit
num_of_partitions=$(echo 1 | awk -v sc=${SCALE} -v c=${num_of_cpu} '{print int((sc/10)*c);}')
echo "big tables="num_of_partitions=${num_of_partitions}
if [ $num_of_partitions -le 1 ]
then
num_of_partitions=2
fi

((i=1))
touch generate_upload_and_remove_exec.bash
rm -f generate_upload_and_remove_exec.bash

echo "#!/bin/bash" >> generate_upload_and_remove_exec.bash
## the sourced generate_upload_and_remove_infra.bash includes the functions for upload and remove
echo ". generate_upload_and_remove_infra.bash" >> generate_upload_and_remove_exec.bash
echo "cd /tpc-ds-datagen-to-aws-s3/tpc-ds/v2.11.0rc2/tools" >> generate_upload_and_remove_exec.bash

for t in ${parent_tables}
do
for c in $(seq 1 ${num_of_partitions})
do
echo "time ./dsdgen -dir ${TPCDS_DIR} -table ${t} -scale ${SCALE} -force -parallel ${num_of_partitions} -child ${c} &" >> generate_upload_and_remove_exec.bash
## insert a wait after every num_of_cpu launched processes
if [ $(( i++ % ${num_of_cpu} )) -eq 0 ]
then
echo wait >> generate_upload_and_remove_exec.bash
# once the wait completes, loop over the generated .dat files and upload each one in parallel;
# as each upload finishes, remove the file (upload && remove)
echo upload_and_remove_worker_func >> generate_upload_and_remove_exec.bash
fi
done
done
echo wait >> generate_upload_and_remove_exec.bash
echo upload_and_remove_worker_func >> generate_upload_and_remove_exec.bash
echo "echo big tables done." >> generate_upload_and_remove_exec.bash

## add the generation of the remaining (non-parent) tables
create_dsdgen_workers_non_parent_tables

chmod +x generate_upload_and_remove_exec.bash

## the generated script below contains everything needed to create the TPCDS tables in S3 storage.
## it should be executed by the user
#./generate_upload_and_remove_exec.bash

}
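
## for illustration only, the generated generate_upload_and_remove_exec.bash looks roughly like the
## following (assuming SCALE=3000, hence -parallel 16800 for the parent tables):
##   #!/bin/bash
##   . generate_upload_and_remove_infra.bash
##   cd /tpc-ds-datagen-to-aws-s3/tpc-ds/v2.11.0rc2/tools
##   time ./dsdgen -dir /tpcds_output/ -table store_sales -scale 3000 -force -parallel 16800 -child 1 &
##   ...
##   wait
##   upload_and_remove_worker_func
##   ...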

upload_and_remove_worker_func()
{
# create list of tasks to run in background, remove each uploaded file upon completion
((i=1))
touch upload_and_remove_exec.bash
rm -f upload_and_remove_exec.bash

echo "#!/bin/bash" >> upload_and_remove_exec.bash

for f in $(ls ${TPCDS_DIR}/*.dat)
do
#echo $f
table_name=$(basename $f | sed 's/_[0-9]\+_[0-9]\+/ /' | awk '{print $1;}')
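# for illustration: a file named e.g. store_sales_7_168.dat (the pattern the sed above expects) maps to table_name=store_sales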
echo "(aws s3api put-object --bucket hive --key scale_${SCALE}/${table_name}/$(basename $f) --body ${f} --endpoint-url ${S3_ENDPOINT} > /dev/null 2>&1 && echo upload ${f} && rm -f ${f}) &" >> upload_and_remove_exec.bash
if [ $(( i++ % ${num_of_cpu} )) -eq 0 ]
then
echo wait >> upload_and_remove_exec.bash
fi
done

echo wait >> upload_and_remove_exec.bash
#upload and remove all generated files
chmod +x upload_and_remove_exec.bash
cp upload_and_remove_exec.bash upload_and_remove.bash_${RANDOM} ## debug

## start upload and remove in parallel
./upload_and_remove_exec.bash

}
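
## for illustration only, each line that upload_and_remove_worker_func writes into upload_and_remove_exec.bash
## looks roughly like the following (hypothetical endpoint, scale and file name):
##   (aws s3api put-object --bucket hive --key scale_3000/store_sales/store_sales_7_168.dat \
##    --body /tpcds_output/store_sales_7_168.dat --endpoint-url http://s3.example.local:8000 > /dev/null 2>&1 \
##    && echo upload /tpcds_output/store_sales_7_168.dat && rm -f /tpcds_output/store_sales_7_168.dat) &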

31 changes: 31 additions & 0 deletions TPCDS/run_tpcds.bash
@@ -0,0 +1,31 @@
#!/bin/bash

## this script is the entry point of the TPCDS data generation
## the first and only argument is the SCALE factor

[ $# -lt 1 ] && echo "type a single number for the scale (2 --> 3000)" && exit

re='^[0-9]+$'
[[ ! $1 =~ $re ]] && echo "SCALE should be a number" && exit

## the following lines require the S3 connection details to be provided as environment variables
[ -z ${S3_ENDPOINT} ] && echo "missing env-variable S3_ENDPOINT" && exit
[ -z ${S3_ACCESS_KEY} ] && echo "missing env-variable S3_ACCESS_KEY" && exit
[ -z ${S3_SECRET_KEY} ] && echo "missing env-variable S3_SECRET_KEY" && exit

## updating AWS credentials
cat ~/.aws/credentials | \
awk -v acc=${S3_ACCESS_KEY} '{if($0 ~ /aws_access_key_id/){print "aws_access_key_id = ",acc;} else{print $0;}}' | \
awk -v acc=${S3_SECRET_KEY} '{if($0 ~ /aws_secret_access_key/){print "aws_secret_access_key = ",acc;} else{print $0;}}' > /tmp/credentials

cat /tmp/credentials > ~/.aws/credentials
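
## the awk pipeline above rewrites the two credential keys in place; the resulting ~/.aws/credentials
## is expected to look roughly like this (values are placeholders):
##   [default]
##   aws_access_key_id = <S3_ACCESS_KEY>
##   aws_secret_access_key = <S3_SECRET_KEY>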

export SCALE=$1

. ./generate_upload_and_remove_infra.bash

## create generate_upload_and_remove_exec.bash
create_dsdgen_workers
## running tpcds data generator script
time ./generate_upload_and_remove_exec.bash
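
## example invocation (hypothetical endpoint and credentials), generating and uploading a 3TB data set:
##   export S3_ENDPOINT=http://s3.example.local:8000
##   export S3_ACCESS_KEY=myaccesskey
##   export S3_SECRET_KEY=mysecretkey
##   ./run_tpcds.bash 3000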
