This project is an experiment with Flink CDC and Doris.
CDC (Change Data Capture) is made up of two components: CDD and CDT. CDD stands for Change Data Detection, and CDT stands for Change Data Transfer.
Extract, Load, Transform (ELT) is a data integration process for transferring raw data from a source server to a data system (such as a data warehouse or data lake) on a target server and then preparing the information for downstream uses.
Streaming ETL (Extract, Transform, Load) is the processing and movement of real-time data from one place to another. ETL is short for the database functions extract, transform, and load.
vm | role | ip | xxx_home |
---|---|---|---|
vm101 | zookeeper | 192.168.56.101 | /opt/zookeeper |
vm102 | zookeeper | 192.168.56.102 | /opt/zookeeper |
vm103 | zookeeper | 192.168.56.103 | /opt/zookeeper |
cd /opt/zookeeper
./bin/zkServer.sh status
echo stat | nc 127.0.0.1 2181
Zookeeper version: 3.8.4-9316c2a7a97e1666d8f4593f34dd6fc36ecc436c, built on 2024-02-12 22:16 UTC
Clients:
/127.0.0.1:39370[0](queued=0,recved=1,sent=0)
Latency min/avg/max: 0/6.375/41
Received: 10
Sent: 9
Connections: 1
Outstanding: 0
Zxid: 0x100000003
Mode: follower
Node count: 5
vm | role | ip | xxx_home |
---|---|---|---|
vm116 | NameNode, zkfc, JournalNode | 192.168.56.116 | /opt/hadoop |
vm117 | NameNode, zkfc, JournalNode | 192.168.56.117 | /opt/hadoop |
vm118 | NameNode, zkfc, JournalNode | 192.168.56.118 | /opt/hadoop |
vm119 | DataNode | 192.168.56.119 | /opt/hadoop |
vm120 | DataNode | 192.168.56.120 | /opt/hadoop |
vm121 | DataNode | 192.168.56.121 | /opt/hadoop |
vm | role | ip | xxx_home |
---|---|---|---|
vm116 | yarn RM | 192.168.56.116 | /opt/hadoop |
vm117 | yarn RM | 192.168.56.117 | /opt/hadoop |
vm118 | yarn RM | 192.168.56.118 | /opt/hadoop |
vm119 | yarn NM | 192.168.56.119 | /opt/hadoop |
vm120 | yarn NM | 192.168.56.120 | /opt/hadoop |
vm121 | yarn NM | 192.168.56.121 | /opt/hadoop |
###########################################################
# 以下所有操作都需要在hduser用户下执行
# su -l hduser
###########################################################
# vm116
# hdfs --daemon start journalnode
# hdfs namenode -format (执行一次)
# hdfs zkfc -formatZK (执行一次)
# hdfs --daemon start namenode && hdfs --daemon start zkfc
# vm117 vm118
# hdfs --daemon start journalnode
# hdfs namenode -bootstrapStandby (执行一次)
# hdfs --daemon start namenode && hdfs --daemon start zkfc
## test hdfs HA
(
hdfs haadmin -getServiceState nn1
hdfs haadmin -getServiceState nn2
hdfs haadmin -getServiceState nn3
)
active
standby
standby
# vm119 vm120 vm121
# hdfs --daemon start datanode
# yarn --daemon start resourcemanager //vm116 vm117 vm118
# yarn --daemon start nodemanager //vm119 vm120 vm121
vm | role | ip | xxx_home |
---|---|---|---|
vm116 | minio | 192.168.56.116 | /opt/minio |
vm117 | minio | 192.168.56.117 | /opt/minio |
vm118 | minio | 192.168.56.118 | /opt/minio |
vm119 | minio | 192.168.56.119 | /opt/minio |
vm | role | ip | xxx_home |
---|---|---|---|
vm116 | sidekick, flink(masters+workers) | 192.168.56.116 | /opt/flink |
vm117 | sidekick, flink(masters+workers) | 192.168.56.117 | /opt/flink |
vm118 | sidekick, flink(masters+workers) | 192.168.56.118 | /opt/flink |
vm119 | sidekick, flink(workers) | 192.168.56.119 | /opt/flink |
vm120 | sidekick, flink(workers) | 192.168.56.120 | /opt/flink |
vm121 | sidekick, flink(workers) | 192.168.56.121 | /opt/flink |
minio client
curl -o /usr/local/bin/mc -# -fSL https://dl.min.io/client/mc/release/linux-amd64/mc
chmod +x /usr/local/bin/mc
mc --help
mc alias set myminio http://localhost:9000 minioadmin minioadmin
mc admin user svcacct add --access-key "u5SybesIDVX9b6Pk" --secret-key "lOpH1v7kdM6H8NkPu1H2R6gLc9jcsmWM" myminio minioadmin
# mc admin user svcacct add --access-key "myuserserviceaccount" --secret-key "myuserserviceaccountpassword" myminio minioadmin
minio load balancer
bash /vagrant/scripts/install-minio-sidekick.sh --port "18000" --sites "http://vm{116...119}:9000"
mc mb myminio/flink
mc mb myminio/flink-state
# vm116 vm117 vm118 vm119 vm120 vm121
su -l root
bash /vagrant/scripts/install-flink.sh --version 1.20.0 --flink-home /opt/flink
bash /vagrant/scripts/install-flink-cdc.sh --version 3.2.0 --flink-cdc-home /opt/flink-cdc
# https://blog.csdn.net/hiliang521/article/details/126860098
su -l hduser
cd /opt/flink
## start-cluster
bin/start-cluster.sh
## stop-cluster
bin/stop-cluster.sh
bin/flink run /opt/flink/examples/streaming/WordCount.jar
vagrant ssh vm211
cd /opt/test-flink-cdc
docker compose exec doris-fe mysql -uroot -P9030 -h127.0.0.1 -e "show backends; show frontends;"
vagrant ssh vm116
cat <<EOF > ~/mysql-to-doris.yaml
source:
  type: mysql
  hostname: 192.168.56.211
  port: 3306
  username: root
  password: 123456
  tables: app_db.\.*
  server-id: 5400-5404
  #server-time-zone: UTC
  server-time-zone: 'Asia/Shanghai'
sink:
  type: doris
  fenodes: 192.168.56.211:8030
  username: root
  password: ""
  table.create.properties.light_schema_change: true
  table.create.properties.replication_num: 1
pipeline:
  name: Sync MySQL Database to Doris
  parallelism: 2
EOF
flink_cdc_home="/opt/flink-cdc"
pushd $flink_cdc_home || exit 1
./bin/flink-cdc.sh ~/mysql-to-doris.yaml --jar lib/mysql-connector-java-8.0.27.jar
popd || exit 1
bin/sql-client.sh
SET execution.checkpointing.interval = 6000;