DataX全量数据采集
前置操作
DataX安装:https://blog.csdn.net/Tonystark_lz/article/details/126393252?spm=1001.2014.3001.5501
DataX配置文件生成脚本
1)在~/bin目录下创建gen_import_config.py脚本(生成datax json文件)
vim ~/bin/gen_import_config.py
脚本内容如下(已配置HA):
#!/usr/bin/python
# coding=utf-8
import json
import getopt
import os
import sys
import MySQLdb
# MySQL connection settings; adjust to match your environment
mysql_host = "hadoop102"
mysql_port = "3306"  # kept as a string: concatenated into the JDBC URL, converted with int() when connecting
mysql_user = "root"
mysql_passwd = "000000"
# HDFS single-NameNode settings; disabled in favour of the HA settings below
#hdfs_nn_host = "hadoop102"
#hdfs_nn_port = "8020"
# HDFS HA settings; adjust to match your environment
my_nameservices = "mycluster"
my_namenodes_1 = "nn1"
my_namenodes_2 = "nn2"
my_namenodes_3 = "nn3"
my_rpc_address_1 = "hadoop102:8020"
my_rpc_address_2 = "hadoop103:8020"
my_rpc_address_3 = "hadoop104:8020"
# Directory the generated job files are written to; adjust as needed
output_path = "/opt/module/datax/job/import"
def get_connection():
    """Open a new MySQL connection from the module-level MySQL settings."""
    settings = {
        "host": mysql_host,
        "port": int(mysql_port),  # the port is configured as a string
        "user": mysql_user,
        "passwd": mysql_passwd,
    }
    return MySQLdb.connect(**settings)
def get_mysql_meta(database, table):
    """Fetch the column names and data types of one MySQL table.

    Queries information_schema.COLUMNS for the given schema and table,
    ordered by ORDINAL_POSITION.

    Args:
        database: schema name, matched against TABLE_SCHEMA.
        table: table name, matched against TABLE_NAME.

    Returns:
        The rows from cursor.fetchall(), one (COLUMN_NAME, DATA_TYPE) pair
        per column.
    """
    sql = ("SELECT COLUMN_NAME,DATA_TYPE from information_schema.COLUMNS "
           "WHERE TABLE_SCHEMA=%s AND TABLE_NAME=%s ORDER BY ORDINAL_POSITION")
    connection = get_connection()
    # try/finally so the cursor and connection are released even when the
    # query fails.
    try:
        cursor = connection.cursor()
        try:
            cursor.execute(sql, [database, table])
            return cursor.fetchall()
        finally:
            cursor.close()
    finally:
        connection.close()
def get_mysql_columns(database, table):
    """Return the column names of a MySQL table as a list.

    A real list (not a lazy map object) is returned so the result can be
    serialised with json.dump on both Python 2 and Python 3.
    """
    return [row[0] for row in get_mysql_meta(database, table)]
def get_hive_columns(database, table):
    """Return the writer column definitions for a MySQL table.

    Each column becomes {"name": <column name>, "type": <mapped type>},
    where the MySQL data type (lower-cased) is translated through the
    mapping table below.

    Args:
        database: MySQL schema name.
        table: MySQL table name.

    Returns:
        A list of dicts, one per column. A real list is returned so the
        result can be serialised with json.dump on Python 2 and Python 3.

    Raises:
        KeyError: if a column has a MySQL type with no entry in the mapping.
    """
    # Built once per call instead of once per column.
    mappings = {
        "bigint": "bigint",
        "int": "bigint",
        "smallint": "bigint",
        "tinyint": "bigint",
        "decimal": "string",
        "double": "double",
        "float": "float",
        "binary": "string",
        "char": "string",
        "varchar": "string",
        "datetime": "string",
        "time": "string",
        "timestamp": "string",
        "date": "string",
        "text": "string"
    }

    def type_mapping(column_name, mysql_type):
        try:
            return mappings[mysql_type]
        except KeyError:
            # Same exception type as before, but say which column failed.
            raise KeyError("no type mapping for MySQL type '%s' (column %s of %s.%s)"
                           % (mysql_type, column_name, database, table))

    return [
        {"name": name, "type": type_mapping(name, data_type.lower())}
        for name, data_type in get_mysql_meta(database, table)
    ]
def _mysql_reader(source_database, source_table, columns):
    """Build the mysqlreader section of the job for one source table."""
    return {
        "name": "mysqlreader",
        "parameter": {
            "username": mysql_user,
            "password": mysql_passwd,
            "column": columns,
            "splitPk": "",
            "connection": [{
                "table": [source_table],
                "jdbcUrl": ["jdbc:mysql://" + mysql_host + ":" + mysql_port + "/" + source_database]
            }]
        }
    }


def _hdfs_writer(source_table, columns):
    """Build the hdfswriter section of the job, including the HDFS HA client settings."""
    rpc_prefix = "dfs.namenode.rpc-address." + my_nameservices + "."
    hadoop_config = {
        "dfs.nameservices": my_nameservices,
        "dfs.ha.namenodes." + my_nameservices: ",".join([my_namenodes_1, my_namenodes_2, my_namenodes_3]),
        rpc_prefix + my_namenodes_1: my_rpc_address_1,
        rpc_prefix + my_namenodes_2: my_rpc_address_2,
        rpc_prefix + my_namenodes_3: my_rpc_address_3,
        "dfs.client.failover.proxy.provider." + my_nameservices: "org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider"
    }
    return {
        "name": "hdfswriter",
        "parameter": {
            "defaultFS": "hdfs://" + my_nameservices,
            "hadoopConfig": hadoop_config,
            "fileType": "text",
            # Placeholder filled in at run time through -Dtargetdir=...
            "path": "${targetdir}",
            "fileName": source_table,
            "column": columns,
            "writeMode": "append",
            "fieldDelimiter": "\t",
            "compress": "gzip"
        }
    }


def generate_json(source_database, source_table):
    """Write the DataX job file for one MySQL table.

    The file is written to <output_path>/<database>.<table>.json; the
    output directory is created when it does not exist.

    Args:
        source_database: MySQL schema name.
        source_table: MySQL table name.

    Raises:
        ValueError: if no columns are found for the table (for example a
            misspelled database or table name), instead of writing a job
            file with an empty column list.
    """
    # list() keeps the job serialisable even if the helpers hand back a lazy
    # iterable (map() on Python 3).
    mysql_columns = list(get_mysql_columns(source_database, source_table))
    if not mysql_columns:
        raise ValueError("no columns found for %s.%s; check the database and table names"
                         % (source_database, source_table))
    hive_columns = list(get_hive_columns(source_database, source_table))

    job = {
        "job": {
            "setting": {
                "speed": {
                    "channel": 3
                },
                "errorLimit": {
                    "record": 0,
                    "percentage": 0.02
                }
            },
            "content": [{
                "reader": _mysql_reader(source_database, source_table, mysql_columns),
                "writer": _hdfs_writer(source_table, hive_columns)
            }]
        }
    }

    if not os.path.exists(output_path):
        os.makedirs(output_path)
    job_file = os.path.join(output_path, ".".join([source_database, source_table, "json"]))
    with open(job_file, "w") as f:
        json.dump(job, f)
def main(args):
    """Parse the command line and generate the job file for one table.

    Args:
        args: command-line arguments without the program name. Expects
            -d/--sourcedb <database> and -t/--sourcetbl <table>.

    Exits with status 2 (after printing a usage message to stderr) when an
    option is unknown, when the database or table is missing, or when
    unexpected positional arguments are present (e.g. a table name that
    was accidentally split by a space).
    """
    usage = "usage: gen_import_config.py -d <database> -t <table>\n"
    try:
        options, arguments = getopt.getopt(args, 'd:t:', ['sourcedb=', 'sourcetbl='])
    except getopt.GetoptError as err:
        sys.stderr.write("%s\n%s" % (err, usage))
        sys.exit(2)

    source_database = ""
    source_table = ""
    for opt_name, opt_value in options:
        if opt_name in ('-d', '--sourcedb'):
            source_database = opt_value
        elif opt_name in ('-t', '--sourcetbl'):
            source_table = opt_value

    if arguments or not source_database or not source_table:
        sys.stderr.write(usage)
        sys.exit(2)

    generate_json(source_database, source_table)


if __name__ == '__main__':
    main(sys.argv[1:])
注:
(1)安装Python Mysql驱动
由于需要使用Python访问Mysql数据库,故需安装驱动,命令如下:
[atguigu@hadoop102 bin]$ sudo yum install -y MySQL-python
(2)脚本使用说明
python gen_import_config.py -d database -t table
通过-d传入数据库名,-t传入表名,执行上述命令即可生成该表的DataX同步配置文件。
2)在~/bin目录下创建gen_import_config.sh脚本(批量生成配置文件)
vim ~/bin/gen_import_config.sh
#!/bin/bash
# Generate one DataX job file per table of the edu database.
# The list must contain every table that mysql_to_hdfs_full.sh imports.
tables=(
    base_category_info
    base_province
    base_source
    base_subject_info
    cart_info
    chapter_info
    comment_info
    course_info
    favor_info
    knowledge_point
    order_detail
    order_info
    payment_info
    review_info
    test_exam
    test_exam_question
    test_paper
    test_paper_question
    test_point_question
    test_question_info
    test_question_option
    user_chapter_process
    user_info
    video_info
    vip_change_detail
)

for table in "${tables[@]}"; do
    python ~/bin/gen_import_config.py -d edu -t "$table"
done
3)执行gen_import_config.sh脚本,生成配置文件
4)测试生成的DataX配置文件
以base_category_info为例,测试用脚本生成的配置文件是否可用。
1)创建目标路径
由于DataX同步任务要求目标路径提前存在,故需手动创建路径,当前base_category_info表的目标路径应为/origin_data/edu/db/base_category_info_full/2021-09-25。
[atguigu@hadoop102 bin]$ hadoop fs -mkdir -p /origin_data/edu/db/base_category_info_full/2021-09-25
2)执行DataX同步命令
[atguigu@hadoop102 bin]$ python /opt/module/datax/bin/datax.py -p"-Dtargetdir=/origin_data/edu/db/base_category_info_full/2021-09-25" /opt/module/datax/job/import/edu.base_category_info.json
3)观察同步结果
观察HDFS目标路径是否出现数据。
全量表数据同步脚本
为方便使用以及后续的任务调度,此处编写一个全量表数据同步脚本。
1)在~/bin目录创建mysql_to_hdfs_full.sh
vim ~/bin/mysql_to_hdfs_full.sh
#!/bin/bash
DATAX_HOME=/opt/module/datax
# Sync date: the second argument when one is given, otherwise yesterday's date
do_date=${2:-$(date -d "-1 day" +%F)}
# Prepare the HDFS target directory: remove it recursively if it exists, then
# recreate it empty, so that the sync job can be re-run safely.
# $1 - HDFS target directory
handle_targetdir() {
    if [ -z "$1" ]; then
        echo "handle_targetdir: missing target directory" >&2
        return 1
    fi
    # Quoted so the path is never word-split or glob-expanded by the shell.
    hadoop fs -rm -f -r "$1"
    hadoop fs -mkdir -p "$1"
}
# Run one DataX sync job.
# $1 - DataX job file
# $2 - HDFS target directory, handed to the job as -Dtargetdir
import_data() {
    local datax_config=$1
    local target_dir=$2
    # Check the job file first so existing data is not wiped for a job that
    # cannot run.
    if [ ! -f "$datax_config" ]; then
        echo "DataX job file not found: $datax_config" >&2
        return 1
    fi
    # Do not start the job when the target directory could not be prepared.
    handle_targetdir "$target_dir" || return 1
    python "$DATAX_HOME/bin/datax.py" -p"-Dtargetdir=$target_dir" "$datax_config"
}
TABLE_NAME=$1
# Directory holding the generated job files (output_path of gen_import_config.py)
job_dir=/opt/module/datax/job/import
# Tables that can be synced; kept in one place for both the single-table and "all" modes
tables=(
    base_category_info base_province base_source base_subject_info
    cart_info chapter_info comment_info course_info favor_info
    knowledge_point order_detail order_info payment_info review_info
    test_exam test_exam_question test_paper test_paper_question
    test_point_question test_question_info test_question_option
    user_chapter_process user_info video_info vip_change_detail
)

# Sync one table into /origin_data/edu/db/<table>_full/<do_date>.
sync_table() {
    import_data "$job_dir/edu.${1}.json" "/origin_data/edu/db/${1}_full/$do_date"
}

case "${TABLE_NAME}" in
"all")
    for i in "${tables[@]}"; do
        sync_table "$i"
    done
    ;;
*)
    known=0
    for i in "${tables[@]}"; do
        if [ "$i" = "$TABLE_NAME" ]; then
            known=1
            break
        fi
    done
    if [ "$known" -eq 1 ]; then
        sync_table "$TABLE_NAME"
    else
        # A missing or misspelled table name used to be a silent no-op.
        echo "Usage: $0 <table_name|all> [date]" >&2
        exit 1
    fi
    ;;
esac
2)测试同步脚本
mysql_to_hdfs_full.sh all 2021-09-25
3)检查同步结果
Maxwell增量表首日全量同步
1)在~/bin目录创建mysql_to_kafka_inc_init.sh
[atguigu@hadoop102 bin]$ vim mysql_to_kafka_inc_init.sh
脚本内容如下
#!/bin/bash
# Initialises the incremental tables with maxwell-bootstrap; only needs to be run once
MAXWELL_HOME=/opt/module/maxwell

# Run maxwell-bootstrap for one table of the edu database.
# $1 - table name
import_data() {
    "$MAXWELL_HOME/bin/maxwell-bootstrap" --database edu --table "$1" --config "$MAXWELL_HOME/config.properties"
}
# Tables that can be bootstrapped; kept in one place for both the single-table and "all" modes
tables=(
    base_category_info base_province base_source base_subject_info
    cart_info chapter_info comment_info course_info favor_info
    knowledge_point order_detail order_info payment_info review_info
    test_exam test_exam_question test_paper test_paper_question
    test_point_question test_question_info test_question_option
    user_chapter_process user_info video_info vip_change_detail
)

case "$1" in
"all")
    for table in "${tables[@]}"; do
        import_data "$table"
    done
    ;;
*)
    known=0
    for table in "${tables[@]}"; do
        if [ "$table" = "$1" ]; then
            known=1
            break
        fi
    done
    if [ "$known" -eq 1 ]; then
        import_data "$1"
    else
        # A missing or misspelled table name used to be a silent no-op.
        echo "Usage: $0 <table_name|all>" >&2
        exit 1
    fi
    ;;
esac
2)测试同步脚本
(1)清理历史数据
为方便查看结果,现将HDFS上之前同步的增量表数据删除
[atguigu@hadoop102 ~]$ hadoop fs -ls /origin_data/edu/db | grep _inc | awk '{print $8}' | xargs hadoop fs -rm -r -f
(2)执行同步脚本
[atguigu@hadoop102 bin]$ mysql_to_kafka_inc_init.sh all
(3)检查同步结果