##################
 # SQOOP 1.4.6 Installation #
 ##################
    #Extract and rename
    [root@single01 download]# tar -zxvf /opt/download/sqoop-1.4.6.bin__hadoop-2.0.4-alpha.tar.gz -C /opt/software/
     [root@single01 download]# cd /opt/software/
     [root@single01 software]# mv sqoop-1.4.6.bin__hadoop-2.0.4-alpha/ sqoop-1.4.6
   #Configure environment variables
     [root@single01 conf]# vim /etc/profile.d/my.sh
     #-----------------------------------------------------
     #sqoop1.4.6
     export SQOOP_HOME=/opt/software/sqoop-1.4.6
     export PATH=$PATH:$SQOOP_HOME/bin
     #--------------------------------------------
     source /etc/profile
    #sqoop-env.sh configuration
         [root@single01 ~]# cd /opt/software/sqoop-1.4.6/lib/
         [root@single01 lib]# pwd    =>/opt/software/sqoop-1.4.6/lib
         [root@single01 lib]# mv ../conf/sqoop-env-template.sh ../conf/sqoop-env.sh
         [root@single01 sqoop-1.4.6]# echo $HADOOP_HOME    =>/opt/software/hadoop313
         [root@single01 sqoop-1.4.6]# echo $HIVE_HOME    =>/opt/software/hive312
         [root@single01 lib]# vim ../conf/sqoop-env.sh
         #-------------------------------------------------------------
        export HADOOP_COMMON_HOME=/opt/software/hadoop313    #set to the $HADOOP_HOME path
        export HADOOP_MAPRED_HOME=/opt/software/hadoop313    #set to the $HADOOP_HOME path
        #export HBASE_HOME=                                   #set to the $HBASE_HOME path; not installed, leave unchanged
        export HIVE_HOME=/opt/software/hive312                #set to the $HIVE_HOME path
        #export ZOOCFGDIR=                                    #set to the $ZOOKEEPER_HOME conf path; not installed, leave unchanged
        export LOGDIR=$SQOOP_HOME/logs                        #log directory
         #---------------------------------------------------------------------
     
    #Copy dependency jars
     [root@single01 ~]# cd /opt/software/sqoop-1.4.6/lib/
    #MySQL JDBC driver jar
     cp /opt/software/hive312/lib/mysql-connector-java-5.1.47.jar ./
    #Three Hadoop jars
     cp /opt/software/hadoop313/share/hadoop/common/hadoop-common-3.1.3.jar ./
     cp /opt/software/hadoop313/share/hadoop/hdfs/hadoop-hdfs-3.1.3.jar ./
     cp /opt/software/hadoop313/share/hadoop/mapreduce/hadoop-mapreduce-client-core-3.1.3.jar ./
    #If installing Sqoop 1.4.7:
    #Exception: ERROR hive.HiveConfig: Could not load org.apache.hadoop.hive.conf.HiveConf.
    #Fix: create a symlink so the jar is visible in Sqoop's lib directory
    ln -s /opt/software/hive312/lib/hive-exec-3.1.2.jar ./
    #Exception: ERROR sqoop.Sqoop: Got exception running Sqoop: java.lang.NullPointerException
    #at org.json.JSONObject.<init>(JSONObject.java:144)
    #Fix: put java-json.jar into Sqoop's lib directory
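    #For example (java-json.jar is not shipped with Sqoop; the download path below is an assumption):
    cp /opt/download/java-json.jar /opt/software/sqoop-1.4.6/lib/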
    #Verify the copied jars
    ls | grep mysql        =>mysql-connector-java-5.1.47.jar
     ls |grep hadoop
     #---------------------------------------------------------------
     avro-mapred-1.7.5-hadoop2.jar
     hadoop-common-3.1.3.jar
     hadoop-hdfs-3.1.3.jar
     hadoop-mapreduce-client-core-3.1.3.jar
     kite-hadoop-compatibility-1.0.0.jar
     parquet-hadoop-1.4.1.jar
     #-------------------------------------------------------------
     
    #Common commands
        #Test the connection
         sqoop list-databases --connect jdbc:mysql://single01:3306 --username root --password ok
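        #Listing the tables of one database works the same way (same connection settings; test is the database used throughout these notes)
        sqoop list-tables --connect jdbc:mysql://single01:3306/test --username root --password ok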
#hive=>mysql
    # Direct export from a Hive table to an RDBMS table
# Not recommended in production: when the Hive table is large, or the RDBMS target has many partitioned tables, the export cannot be controlled at a fine grain
#hive=>hdfs=>mysql (Hive data already lives on HDFS)
# Exporting from Hive to HDFS first lets you filter columns, transform fields, and filter rows,
    # so the data on HDFS gets "closer to" (or equal to) what will actually be loaded into the RDBMS table
# Importing a small, pre-filtered dataset from HDFS into the RDBMS also speeds up the export
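# A minimal sketch of that Hive=>HDFS staging step, run from the shell (the Hive source table kb16.score_kb16 and the staging directory are assumptions; the columns match the export below):
hive -e "
insert overwrite directory '/test/hive/kb16/kb16_scores_export'
row format delimited fields terminated by ','
select stu_name,stu_gender,java_score,mysql_score
from kb16.score_kb16
where java_score is not null;"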
#Export from the data warehouse
    #hdfs->mysql (create the MySQL table first, with types matching the data on HDFS)
    #--connect/--username/--password are the JDBC settings; --export-dir points the MapReduce job at the source data
    sqoop export \
    --connect jdbc:mysql://single01:3306/test \
    --username root \
    --password ok \
    --table score_kb16 \
    --columns stu_name,stu_gender,java_score,mysql_score \
    --export-dir /test/hive/kb16/kb16_scores.txt \
    --fields-terminated-by ',';
 #mysql ->hdfs
 sqoop import \
 --connect jdbc:mysql://single01:3306/test \
 --username root \
 --password ok \
 --table order_info \
 --columns order_id,order_user_id,order_dt,order_money,order_status \
 --where "order_dt between '2019-01-05' and '2019-01-10'" \
 -m 1 \
 --delete-target-dir \
 --target-dir /test/hive/order_info \
 --fields-terminated-by ',' ;
#With --query the import runs in parallel mode, so --split-by must name the split column; -m 2 means two map tasks
 sqoop import \
 --connect jdbc:mysql://single01:3306/test \
 --username root \
 --password ok \
 --query "select order_id,order_user_id,order_dt,order_money,order_status from order_info where order_user_id<=1900 and \$CONDITIONS" \
 -m 2 \
 --split-by order_user_id \
 --delete-target-dir \
 --target-dir /test/hive/order_info2 \
 --fields-terminated-by ',' ;
#mysql->hive
 #mysql> source /root/order_info.sql    (run the SQL file inside the MySQL client)
 #mysql -> hdfs
 #--incremental append only handles inserts, not updates
 #--table TABLE_NAME or --query SELECT_COMMAND
 #--split-by combined with -m controls the number of parallel map tasks
 #--check-column combined with --last-value acts as a filter, e.g. --check-column sid --last-value 5 behaves like: where sid>5
Full load
 sqoop import \
 --connect jdbc:mysql://single01:3306/test \
 --username root \
 --password ok \
 --table order_info \
 --columns order_id,order_user_id,order_dt,order_money,order_status \
 --delete-target-dir \
 --fields-terminated-by ',' \
 -m 2 \
 --hive-import \
 --create-hive-table \
 --hive-database kb16 \
 --hive-table full_order_info 
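 #Second copy, full_order_info2, this time also importing the id column (used below for the id-based append increments)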
 sqoop import \
 --connect jdbc:mysql://single01:3306/test \
 --username root \
 --password ok \
 --table order_info \
 --columns id,order_id,order_user_id,order_dt,order_money,order_status \
 --delete-target-dir \
 --fields-terminated-by ',' \
 -m 1 \
 --hive-import \
 --create-hive-table \
 --hive-database kb16 \
 --hive-table full_order_info2 
 Incremental (partitioned)
 #On Hive, sqoop-1.4.7 supports incremental append,
 #but sqoop-1.4.6 does not:
 #"Append mode for hive imports is not yet supported."
 #Workaround: create a partitioned table, add the partitions manually, then load the data into those partitions
--incremental append|lastmodified
    append        keyed on a primary or unique key (handles newly inserted rows only: inserts, no updates)
    lastmodified  supports updates, keyed on a date column (date|datetime|timestamp); combine it with --append or with
    --merge-key order_id (to merge rows whose order_id already exists)
By id value
 #mysql-->hive: append incremental keyed on id
 sqoop import \
 --connect jdbc:mysql://single01:3306/test \
 --username root \
 --password ok \
 --table order_info \
 --columns id,order_id,order_user_id,order_dt,order_money,order_status \
 --fields-terminated-by ',' \
 -m 1 \
 --incremental append \
 --check-column id \
 --last-value 79979 \
 --hive-import \
 --hive-database kb16 \
 --hive-table full_order_info2
 #partitioned by(id_range int) 10000 20000
#mysql-->hdfs: append incremental on the id value (update --last-value before the next incremental run)
 sqoop import \
 --connect jdbc:mysql://single01:3306/test \
 --username root \
 --password ok \
 --table order_info \
 --columns id,order_id,order_user_id,order_dt,order_money,order_status \
 --target-dir /test/hive/order_id_append  \
 --fields-terminated-by ',' \
 -m 2 \
 --split-by id \
 --incremental append \
 --check-column id \
 --last-value 0
 #799979
 By day (time window)
 sqoop import \
 --connect jdbc:mysql://single01:3306/test \
 --username root \
 --password ok \
 --table order_info \
 --columns order_id,order_user_id,order_dt,order_money,order_status \
 --where "order_dt>'2019-06-21'"
 --fields-terminated-by ',' \
 -m 1 \
 --incremental append \
 --check-column order_dt \
 --last-value '2019-06-21 21:41:22' \
 --hive-import \
 --hive-database kb16 \
 --hive-table full_order_info
 mysql->hdfs (lastmodified incremental on order_dt, merging on id)
 sqoop import \
 --connect jdbc:mysql://single01:3306/test \
 --username root \
 --password ok \
 --query "select order_id,order_user_id,cast(order_dt as datetime) as order_dt,order_money,order_status from order_info where order_user_id<=1000 and \$CONDITIONS" \
 --fields-terminated-by ',' \
 --target-dir /test/hive/order_dt_lastmodified \
 -m 1 \
 --incremental lastmodified \
 --check-column order_dt \
 --last-value '2019-01-01 00:00:00' \
 --merge-key id
 #finish with --merge-key <key> to merge updated rows, or with --append to only append new rows (choose one)
create external table kb16.sqoop_order_info_par_cluster(
     id bigint ,
     order_id bigint ,
     order_user_id bigint ,
     order_dt string,
     order_money string,
     order_status int
 )
 partitioned by(ym string)
 clustered by (id) sorted by (order_dt) into 4 buckets
 row format delimited
 fields terminated by ','
 stored as textfile;
#1. Manually add a partition
 alter table kb16.sqoop_order_info_par_cluster add partition (ym='2019-01');
#Drop a partition
 alter table kb16.sqoop_order_info_par_cluster drop partition (ym='2019-03');
#Show a partition
 show partitions kb16.sqoop_order_info_par_cluster partition(ym='2019-02');
#add_order_par_by_ym_sqoop_data.sh --hive kb16.table -mysql test.order_info -par 2019-03
[root@single01 ~]# rst=`hive -e "show partitions kb16.sqoop_order_info_par_cluster partition(ym='2019-03')"`
 sqoop import \
 --connect jdbc:mysql://single01:3306/test \
 --username root \
 --password ok \
 --table order_info \
 --where "date_format(order_dt,'%Y-%m')='2019-01'" \
 -m 1 \
 --fields-terminated-by ',' \
 --delete-target-dir \
 --target-dir /hive312/warehouse/kb16.db/sqoop_order_info_par_cluster/ym=2019-01 
 alter table kb16.sqoop_order_info_par_cluster add partition (ym='2019-02');
sqoop import \
 --connect jdbc:mysql://single01:3306/test \
 --username root \
 --password ok \
 --table order_info \
 --where "date_format(order_dt,'%Y-%m')='2019-02'" \
 -m 1 \
 --fields-terminated-by ',' \
 --delete-target-dir \
 --target-dir /hive312/warehouse/kb16.db/sqoop_order_info_par_cluster/ym=2019-02 
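 #A sketch of what add_order_par_by_ym_sqoop_data.sh might contain, automating the per-month runs above (argument handling is simplified; paths and table names follow the commands above)
 #usage: bash add_order_par_by_ym_sqoop_data.sh 2019-03
 YM=$1
 HIVE_TABLE=kb16.sqoop_order_info_par_cluster
 WAREHOUSE_DIR=/hive312/warehouse/kb16.db/sqoop_order_info_par_cluster
 #add the partition only if it does not exist yet (hive prints nothing when the partition is missing)
 rst=$(hive -e "show partitions ${HIVE_TABLE} partition(ym='${YM}')")
 if [ -z "$rst" ]; then
     hive -e "alter table ${HIVE_TABLE} add partition (ym='${YM}')"
 fi
 #import that month's rows from MySQL straight into the partition directory
 sqoop import \
 --connect jdbc:mysql://single01:3306/test \
 --username root \
 --password ok \
 --table order_info \
 --where "date_format(order_dt,'%Y-%m')='${YM}'" \
 -m 1 \
 --fields-terminated-by ',' \
 --delete-target-dir \
 --target-dir ${WAREHOUSE_DIR}/ym=${YM}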
serde
job encapsulation
 #List saved jobs
 sqoop job --list
 #Delete a job
 sqoop job --delete JOB_NAME
 #Create a job (see the sketch after the lastmodified import below)
 sqoop job --create JOB_NAME \
 ...
 #Run a job
 sqoop job --exec JOB_NAME
sqoop import \
 --connect jdbc:mysql://single01:3306/test \
 --username root \
 --password ok \
 --query "select order_id,order_user_id,cast(order_dt as datetime) as order_dt,order_money,order_status from order_info where order_user_id<=1000 and \$CONDITIONS" \
 --fields-terminated-by ',' \
 --delete-target-dir \
 --target-dir /test/hive/order_dt_lastmodified \
 -m 1 \
 --incremental lastmodified \
 --check-column order_dt \
 --last-value '2019-01-01 00:00:00' \
 --merge-key id
 #finish with --merge-key <key> to merge updated rows, or with --append to only append new rows (choose one)
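 #A minimal sketch of wrapping the import above in a saved job (the job name my_order_lastmodified is made up; Sqoop's metastore then tracks --last-value between runs):
 sqoop job --create my_order_lastmodified \
 -- import \
 --connect jdbc:mysql://single01:3306/test \
 --username root \
 --password ok \
 --query "select order_id,order_user_id,cast(order_dt as datetime) as order_dt,order_money,order_status from order_info where order_user_id<=1000 and \$CONDITIONS" \
 --fields-terminated-by ',' \
 --target-dir /test/hive/order_dt_lastmodified \
 -m 1 \
 --incremental lastmodified \
 --check-column order_dt \
 --last-value '2019-01-01 00:00:00' \
 --merge-key id
 #then run it with:
 sqoop job --exec my_order_lastmodified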
 Zipper table (slowly changing history table)
     Use case:
     a large volume of historical data + newly inserted data + a small amount of data updated within a bounded window (up to the data-pull cutoff time)
         
         
 mysql->hbase
 hive -e
 mysql -u root -pok -e "select count..."
shell + crontab as the scheduling tool
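 #A sketch of that scheduling idea (the wrapper-script path, log path and schedule are assumptions):
 #contents of /root/run_order_import.sh -- re-run the saved Sqoop job and append its output to a log
 sqoop job --exec my_order_lastmodified >> /var/log/sqoop_order_import.log 2>&1
 #crontab -e entry: run the wrapper every day at 01:00
 0 1 * * * /bin/bash /root/run_order_import.sh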
 mysql
 truncate table mysql_order;
     create table mysql_order(
         id bigint auto_increment primary key,
         order_id bigint not null,
         user_id bigint not null,
         order_dt datetime not null,
         order_money decimal(10,2),
         order_status int 
     );
insert into mysql_order(order_id,user_id,order_dt,order_money,order_status) values
 (1,1,'2019-01-01 08:35:44',38.45,0),
 (2,2,'2019-01-01 09:12:31',123.45,0),
 (3,3,'2019-01-01 11:05:02',49.45,0),
 (4,1,'2019-01-01 13:19:12',58.65,0),
 (5,3,'2019-01-01 20:01:27',360.38,0),
 (6,4,'2019-01-01 22:30:00',99.33,0),
 (1,1,'2019-01-01 08:50:30',38.45,2),
 (2,2,'2019-01-01 09:35:05',123.45,2),
 (3,3,'2019-01-01 11:40:44',49.45,1),
 (4,1,'2019-01-01 13:32:11',58.65,0);
insert into mysql_order(order_id,user_id,order_dt,order_money,order_status) values
 (5,3,'2019-01-02 08:01:22',360.38,1),
 (6,4,'2019-01-02 08:18:20',99.33,2),
 (7,2,'2019-01-02 08:52:09',1200.00,0),
 (8,4,'2019-01-02 09:35:05',560.00,0),
 (1,1,'2019-01-02 12:22:33',38.45,3),
 (9,5,'2019-01-02 23:45:10',32.00,0),
 (7,2,'2019-01-02 09:20:22',1200.00,2),
 (8,4,'2019-01-02 10:02:09',560.00,2);
hive
     ods
 #Full order-history table: kb16.hive_order
 #mysql-->hive: copy the MySQL table structure into Hive (the matching directory is created automatically on HDFS)
 sqoop create-hive-table \
 --connect jdbc:mysql://single01:3306/test \
 --username root \
 --password ok \
 --table mysql_order \
 --fields-terminated-by ',' \
 --hive-table kb16.hive_order    
 #mysql-->hdfs: incremental import keyed on order_dt
 sqoop import \
 --connect jdbc:mysql://single01:3306/test \
 --username root \
 --password ok \
 --table mysql_order \
 --target-dir /hive312/warehouse/kb16.db/hive_order \
 --incremental lastmodified \
 --check-column order_dt \
 --last-value '2019-01-01 00:00:00' \
 --merge-key id \
 -m 1
 # --last-value 2022-01-21 11:49:45.0   
 #2019-01-01 22:30:00
 sqoop import \
 --connect jdbc:mysql://single01:3306/test \
 --username root \
 --password ok \
 --table mysql_order \
 --target-dir /hive312/warehouse/kb16.db/hive_order \
 --incremental lastmodified \
 --check-column order_dt \
 --last-value '2019-01-01 22:30:00' \
 --merge-key id \
 -m 1
 #Zipper table
 #Table requirements
 #Hive has specific requirements for tables that support UPDATE:
 #(1) the table must be created with the buckets (clustered by) property
 #(2) the storage format must be ORC (AcidOutputFormat); other formats, e.g. Parquet, are not yet supported
 #(3) the table must be created with ('transactional'='true')
 #stored as orc tblproperties("transactional"="true");
Bulk update (MERGE) syntax
 #MERGE INTO <target table> AS T USING <source expression/table> AS S
 #ON <boolean expression1>
 #WHEN MATCHED [AND <boolean expression2>] THEN UPDATE SET <set clause list>
 #WHEN MATCHED [AND <boolean expression3>] THEN DELETE
 #WHEN NOT MATCHED [AND <boolean expression4>] THEN INSERT VALUES <value list>
 #Then merge the current day's data against the history: update rows that already exist, insert the rest
 drop table if exists kb16.zipper_hive_order;
set hive.support.concurrency = true;
 set hive.enforce.bucketing = true;
 set hive.exec.dynamic.partition.mode = nonstrict;
 set hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager;
 set hive.compactor.initiator.on = true;
 set hive.compactor.worker.threads = 1;
 set hive.auto.convert.join=false;
 set hive.merge.cardinality.check=false;
create table kb16.zipper_hive_order(
 order_id bigint,
 user_id bigint,
 order_create_dt timestamp,
 order_modify_dt timestamp,
 order_money decimal(10,2),
 current_status int 
 )
 clustered by(order_create_dt) into 2 buckets
 row format delimited
 fields terminated by ','
 stored as orc tblproperties("transactional"="true");
#First aggregate the current day's data
 select 
 order_id,user_id,order_money,
 min(order_dt) as order_create_dt,
 if(max(order_dt)==min(order_dt),'9999-12-31 00:00:00',max(order_dt)) as order_modify_dt,
 max(order_status) as current_status
 from kb16.hive_order
 where to_date(order_dt)='2019-01-01'
 group by order_id,user_id,order_money;
 merge into kb16.zipper_hive_order as Z using (
 select 
 order_id,user_id,order_money,
 min(order_dt) as order_create_dt,
 if(max(order_dt)==min(order_dt),'9999-12-31 00:00:00',max(order_dt)) as order_modify_dt,
 max(order_status) as current_status
 from kb16.hive_order
 where to_date(order_dt)='2019-01-01'
 group by order_id,user_id,order_money
 ) as O
 on Z.order_id=O.order_id 
 when matched and O.current_status=1 then delete 
 when matched and O.current_status!=1 then update set order_modify_dt=O.order_modify_dt,current_status=O.current_status
 when not matched then insert values(O.order_id,O.user_id,O.order_create_dt,O.order_modify_dt,O.order_money,O.current_status);
 #for the next day, rerun the merge with: where to_date(order_dt)='2019-01-02'
    dwd    
     
#RDBMS model -> denormalization (flattening dimensions)
 #DATA WAREHOUSE modeling: [star] (most common), snowflake, and constellation schemas
 #fact tables and dimension tables










