- 组件总览
- Zookeeper部署
- 创建目录
- 解压
- 环境变量
- 配置服务器编号
- 配置
- 同步
- 启动
- Hadoop部署
- 解压
- 环境变量
- 创建目录
- 集群配置
- core-site.xml
- hdfs-site.xml
- mapred-site.xml
- yarn-site.xml
- workers
- 启动
- 初始化zkfc
- 启动集群
- 检验HDFS
- 验证HA
- PostgreSQL部署
- 依赖
- yum 安装
- systemctl 管理 PostgreSQL 服务
- 自定义数据目录
- 修改postgresql.conf
- 修改pg_hba.conf
- 启动
- 登录
- 卸载
- Hive部署
- 解压
- 环境变量
- 解决依赖问题
- 配置
- hive-log4j2.properties
- hive-env.sh
- hive-site.xml
- 初始化数据库
- 创建目录
- 启动
- 启动脚本
- 验证
- Hadoop权限问题
- 刷新权限
- HBase部署
- Zookeeper兼容
- 解压
- 环境变量
- 配置
- regionservers
- hbase-env.sh
- hbase-site.xml
- backup-masters
- 启动
- 验证
- Kafka部署
- 解压
- 环境变量
- 创建目录
- 配置
- server.properties
- producer.properties
- consumer.properties
- 启动
- 检验
- Spark部署
- 解压
- 环境变量
- 配置
- spark-env.sh
- slaves
- spark-defaults.conf
- 启动
- HA
- 检验
- Local
- Standalone
- YARN
- Flume部署
- 解压
- 环境变量
- 配置
- 检验
- flume-conf.properties示例
- 启动Agent
- 测试
- Maven部署
- 解压
- 环境变量
- 配置
- 检验
- Kudu部署
- Git
- 解压
- 编译
- 依赖
- 编译
- 拷贝文件
- 环境变量
- 安装
- 创建目录
- 配置
- master.gflagfile
- tserver.gflagfile
- 启动
- 检验
- WebUI
- master
- tserver
- 停止
- Impala部署
- 解压
- 编译
- 准备
- 执行编译
- 拷贝编译文件
- 环境变量
- 配置
- 启动
- 验证
- Statestored WebUI
- Catalog WebUI
- impalad WebUI
- 编译问题
- Impala负载均衡
- 安装 haproxy
- 配置 haproxy.cfg
- 连接
- Hue集成Impala
- Tez部署
- 解压
- 修改pom
- 编译
- 编译工具
- 编译 protobuf-2.5.0
- 编译Tez
- 解压
- 上传
- 配置
- tez-site.xml
- hadoop-env.sh
- hive-site.xml
- hive-env.sh
- mapred-site.xml
- 检验
- Lzo编译安装
组件总览
| 组件 | 版本 |
| --- | --- |
| JDK | 1.8.0_211 |
| Scala | 2.12.14 |
| Zookeeper | 3.5.9 |
| Hadoop | 3.2.2 |
| Hive | 3.1.2 |
| HBase | 2.4.9 |
| Kafka | 2.6.3 |
| Spark | 2.4.8 |
| Kudu | 1.14.0 |
| Impala | 3.4.0 |
Zookeeper部署
下载 apache-zookeeper-3.5.9-bin.tar.gz
https://mirrors.cnnic.cn/apache/zookeeper/zookeeper-3.5.9/apache-zookeeper-3.5.9-bin.tar.gz
创建目录
# 切换到hdfs用户安装
su hdfs
# 安装目录
sudo mkdir /opt/apps/
# 数据目录
sudo chmod 755 /data
sudo mkdir /data/zookeeper
sudo chown hdfs:hdfs /data/zookeeper
解压
tar -zxvf apache-zookeeper-3.5.9-bin.tar.gz -C /opt/apps/
cd /opt/apps
mv apache-zookeeper-3.5.9-bin/ zookeeper-3.5.9
环境变量
sudo vim /etc/profile.d/hdfs_env.sh
# zookeeper
export ZK_HOME=/opt/apps/zookeeper-3.5.9
export PATH=$PATH:$ZK_HOME/bin
source /etc/profile.d/hdfs_env.sh
配置服务器编号
echo "1" > /data/zookeeper/myid
配置
cd /opt/apps/zookeeper-3.5.9/conf/
sudo mv zoo_sample.cfg zoo.cfg
sudo vim zoo.cfg
# 修改data存储路径
dataDir=/data/zookeeper
# 添加
server.1=hadoop-master:2888:3888
server.2=hadoop-slave01:2888:3888
server.3=hadoop-slave02:2888:3888
vim /opt/apps/zookeeper-3.5.9/bin/zkEnv.sh
# 添加
export JAVA_HOME=/opt/apps/jdk
同步
lsync /opt/apps/zookeeper-3.5.9
# 修改其他服务 myid
[root@hadoop-slave01 /]$ echo "2" > /data/zookeeper/myid
[root@hadoop-slave02 /]$ echo "3" > /data/zookeeper/myid
启动
启动脚本
vim zkCluster.sh
sudo chmod +x zkCluster.sh
脚本内容
#!/bin/bash
hosts=(hadoop-master hadoop-slave01 hadoop-slave02)
path=/opt/apps/zookeeper-3.5.9
case $1 in
"start"){
for i in ${hosts[@]}
do
echo ---------- $i zookeeper startting ------------
ssh $i "$path/bin/zkServer.sh start"
done
};;
"stop"){
for i in ${hosts[@]}
do
echo ---------- $i zookeeper stopping ------------
ssh $i "$path/bin/zkServer.sh stop"
done
};;
"status"){
for i in ${hosts[@]}
do
echo ---------- $i zookeeper status ------------
ssh $i "$path/bin/zkServer.sh status"
done
};;
esac
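脚本用法示例(假设脚本已按上面方式创建并赋予执行权限):
./zkCluster.sh start
./zkCluster.sh status
./zkCluster.sh stop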
Hadoop部署
下载 hadoop-3.2.2.tar.gz
https://archive.apache.org/dist/hadoop/common/hadoop-3.2.2/hadoop-3.2.2.tar.gz
解压
tar -zxvf hadoop-3.2.2.tar.gz -C /opt/apps/
环境变量
vim /etc/profile.d/hdfs_env.sh
# hadoop
export HADOOP_HOME=/opt/apps/hadoop-3.2.2
export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
export LD_LIBRARY_PATH=$HADOOP_HOME/lib/native:/usr/lib64
export PATH=$PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin
source /etc/profile.d/hdfs_env.sh
创建目录
sudo mkdir -p /data/hadoop/tmp
sudo mkdir -p /data/hadoop/nn
sudo mkdir -p /data/hadoop/dn
sudo mkdir -p /data/hadoop/jn
sudo chown hdfs:hdfs -R /data/hadoop
集群配置
cd /opt/apps/hadoop-3.2.2/etc/hadoop/
core-site.xml
https://hadoop.apache.org/docs/r3.1.4/hadoop-project-dist/hadoop-common/core-default.xml
fs.defaultFS
hdfs://nameservice1
hadoop.tmp.dir
/data/hadoop/tmp
fs.trash.interval
4320
ha.zookeeper.quorum
hadoop-master:2181,hadoop-slave01:2181,hadoop-slave02:2181
io.native.lib.available
true
io.compression.codecs
org.apache.hadoop.io.compress.GzipCodec,org.apache.hadoop.io.compress.DefaultCodec,org.apache.hadoop.io.compress.BZip2Codec,org.apache.hadoop.io.compress.SnappyCodec
io.file.buffer.size
131072
ipc.client.connection.maxidletime
60000
hadoop.proxyuser.hdfs.hosts
*
hadoop.proxyuser.hdfs.groups
*
hadoop.proxyuser.root.hosts
*
hadoop.proxyuser.root.groups
*
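以上均为"属性名/属性值"对,写入 core-site.xml 时每一对对应一个 property 节点,示意如下(以 fs.defaultFS 为例,其余属性同理逐项追加;后面的 hdfs-site.xml、yarn-site.xml 等写法相同):
<configuration>
  <property>
    <name>fs.defaultFS</name>
    <value>hdfs://nameservice1</value>
  </property>
</configuration>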
hdfs-site.xml
https://hadoop.apache.org/docs/r3.1.4/hadoop-project-dist/hadoop-hdfs/hdfs-default.xml
dfs.nameservices
nameservice1
dfs.ha.namenodes.nameservice1
nn1,nn2
dfs.namenode.rpc-address.nameservice1.nn1
hadoop-master:8020
dfs.namenode.servicerpc-address.nameservice1.nn1
hadoop-master:8022
dfs.namenode.http-address.nameservice1.nn1
hadoop-master:9870
dfs.namenode.https-address.nameservice1.nn1
hadoop-master:9871
dfs.namenode.rpc-address.nameservice1.nn2
hadoop-slave01:8020
dfs.namenode.servicerpc-address.nameservice1.nn2
hadoop-slave01:8022
dfs.namenode.http-address.nameservice1.nn2
hadoop-slave01:9870
dfs.namenode.https-address.nameservice1.nn2
hadoop-slave01:9871
dfs.namenode.name.dir
/data/hadoop/nn
dfs.journalnode.edits.dir
/data/hadoop/jn
dfs.namenode.shared.edits.dir
qjournal://hadoop-master:8485;hadoop-slave01:8485;hadoop-slave02:8485/nameservice1
dfs.datanode.data.dir
/data/hadoop/dn
dfs.datanode.fsdataset.volume.choosing.policy
org.apache.hadoop.hdfs.server.datanode.fsdataset.AvailableSpaceVolumeChoosingPolicy
dfs.datanode.available-space-volume-choosing-policy.balanced-space-threshold
2147483648
dfs.datanode.available-space-volume-choosing-policy.balanced-space-preference-fraction
0.85f
dfs.replication
3
dfs.ha.automatic-failover.enabled.nameservice1
true
dfs.client.failover.proxy.provider.nameservice1
org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider
dfs.ha.fencing.methods
sshfence
shell(/bin/true)
dfs.ha.fencing.ssh.private-key-files
/home/hdfs/.ssh/id_rsa
dfs.ha.fencing.ssh.connect-timeout
30000
ha.zookeeper.session-timeout.ms
5000
dfs.permissions.enabled
false
dfs.webhdfs.enabled
true
dfs.support.append
true
dfs.datanode.balance.bandwidthPerSec
50m
dfs.namenode.handler.count
50
dfs.blocksize
134217728
dfs.datanode.max.transfer.threads
8192
dfs.datanode.du.reserved
2147483648
dfs.client.use.datanode.hostname
true
仅在客户端配置(only config on clients)
mapred-site.xml
https://hadoop.apache.org/docs/r3.1.4/hadoop-mapreduce-client/hadoop-mapreduce-client-core/mapred-default.xml
mapreduce.framework.name
yarn
mapreduce.jobhistory.address
hadoop-slave02:10020
mapreduce.jobhistory.webapp.address
hadoop-slave02:19888
mapreduce.jobhistory.webapp.https.address
hadoop-slave02:19890
mapreduce.jobhistory.done-dir
/data/yarn/history/done
mapreduce.jobhistory.intermediate-done-dir
/data/yarn/history/done_intermediate
yarn.app.mapreduce.am.resource.mb
1536
yarn.app.mapreduce.am.command-opts
-Xmx1024m
yarn.app.mapreduce.am.resource.cpu-vcores
1
mapreduce.map.memory.mb
-1
mapreduce.map.java.opts
-Xmx300m
mapreduce.map.cpu.vcores
1
mapreduce.reduce.memory.mb
-1
mapreduce.reduce.java.opts
-Xmx300m
mapreduce.reduce.cpu.vcores
1
yarn-site.xml
https://hadoop.apache.org/docs/r3.1.4/hadoop-yarn/hadoop-yarn-common/yarn-default.xml
yarn.resourcemanager.ha.enabled
true
yarn.resourcemanager.cluster-id
yarnRM
yarn.resourcemanager.ha.rm-ids
rm1,rm2
yarn.resourcemanager.hostname.rm1
hadoop-master
yarn.resourcemanager.hostname.rm2
hadoop-slave01
yarn.resourcemanager.address.rm1
hadoop-master:8032
yarn.resourcemanager.scheduler.address.rm1
hadoop-master:8030
yarn.resourcemanager.resource-tracker.address.rm1
hadoop-master:8031
yarn.resourcemanager.admin.address.rm1
hadoop-master:8033
yarn.resourcemanager.webapp.address.rm1
hadoop-master:8088
yarn.resourcemanager.webapp.https.address.rm1
hadoop-master:8090
yarn.resourcemanager.address.rm2
hadoop-slave01:8032
yarn.resourcemanager.scheduler.address.rm2
hadoop-slave01:8030
yarn.resourcemanager.resource-tracker.address.rm2
hadoop-slave01:8031
yarn.resourcemanager.admin.address.rm2
hadoop-slave01:8033
yarn.resourcemanager.webapp.address.rm2
hadoop-slave01:8088
yarn.resourcemanager.webapp.https.address.rm2
hadoop-slave01:8090
yarn.nodemanager.address
0.0.0.0:9103
yarn.nodemanager.aux-services
mapreduce_shuffle
yarn.nodemanager.webapp.address
0.0.0.0:8042
yarn.nodemanager.localizer.address
0.0.0.0:8040
yarn.nodemanager.aux-services.mapreduce.shuffle.class
org.apache.hadoop.mapred.ShuffleHandler
mapreduce.shuffle.port
23080
yarn.app.mapreduce.am.staging-dir
/user
yarn.web-proxy.address
hadoop-master:8041
yarn.nodemanager.env-whitelist
JAVA_HOME,HADOOP_COMMON_HOME,HADOOP_HDFS_HOME,HADOOP_CONF_DIR,CLASSPATH_PREPEND_DISTCACHE,HADOOP_YARN_HOME,HADOOP_MAPRED_HOME
yarn.log-aggregation-enable
true
yarn.log.server.url
http://hadoop-slave02:19888/jobhistory/logs
yarn.log-aggregation.retain-seconds
604800
yarn.nodemanager.local-dirs
/data/yarn/local
yarn.nodemanager.log-dirs
/data/yarn/logs
yarn.nodemanager.remote-app-log-dir
/tmp/app-logs
yarn.log-aggregation.retain-seconds
1209600
yarn.app.mapreduce.am.scheduler.connection.wait.interval-ms
5000
yarn.resourcemanager.ha.automatic-failover.enabled
true
yarn.resourcemanager.ha.automatic-failover.embedded
true
yarn.resourcemanager.recovery.enabled
true
yarn.resourcemanager.store.class
org.apache.hadoop.yarn.server.resourcemanager.recovery.ZKRMStateStore
yarn.resourcemanager.zk-address
hadoop-master:2181,hadoop-slave01:2181,hadoop-slave02:2181
yarn.scheduler.minimum-allocation-mb
1024
yarn.scheduler.maximum-allocation-mb
8192
yarn.scheduler.minimum-allocation-vcores
1
yarn.scheduler.maximum-allocation-vcores
16
yarn.nodemanager.resource.memory-mb
3072
yarn.nodemanager.resource.cpu-vcores
6
yarn.resourcemanager.scheduler.class
org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler
yarn.nodemanager.vmem-check-enabled
false
yarn.nodemanager.pmem-check-enabled
false
workers
hadoop-master
hadoop-slave01
hadoop-slave02
启动
初始化zkfc
hdfs zkfc -formatZK
# 两个namenode节点,启动zkfc
hdfs --daemon start zkfc
首次启动要初始化NameNode,先手动启动各个组件
# 每个节点,启动 JournalNode集群
hdfs --daemon start journalnode
# 格式化NameNode
# 在一个NameNode节点格式化
hdfs namenode -format
# 启动NameNode
hdfs --daemon start namenode
# 同步NameNode
# 另一个NameNode同步
hdfs namenode -bootstrapStandby
hdfs --daemon start namenode
# webUI
10.0.11.110:9870
# 切换NameNode active
# 启动后两个都是 standby,将一台强制切换为 active
hdfs haadmin -transitionToActive nn1 --forcemanual
# 查看NameNode状态
hdfs haadmin -getServiceState nn1
hdfs haadmin -getServiceState nn2
# 其中一台,启动DataNode
vim $HADOOP_CONF_DIR/hadoop-env.sh
export JAVA_HOME=/opt/apps/jdk
hdfs --workers --daemon start datanode
# 两个主节点,启动ResourceManager
yarn --daemon start resourcemanager
# ResourceManager webUI
http://10.0.11.110:8088
# 每个节点,启动NodeManager
yarn --daemon start nodemanager
# 第三个节点,启动Historyserver
mapred --daemon start historyserver
# Historyserver webUI
http://10.0.11.112:19888
启动集群
start-dfs.sh
start-yarn.sh
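启动完成后,可在各节点用 jps 粗略确认进程(节点角色不同,进程也不同,以下仅为预期进程名示意):
jps
# 两台主节点:NameNode、DFSZKFailoverController、ResourceManager
# 各节点:DataNode、NodeManager、JournalNode、QuorumPeerMain(Zookeeper)
# 第三个节点的 JobHistoryServer 需单独执行 mapred --daemon start historyserver 启动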
检验HDFS
# HDFS
hdfs dfs -ls /
hdfs dfs -put core-site.xml /tmp
# YARN
hadoop jar /opt/apps/hadoop-3.2.2/share/hadoop/mapreduce/hadoop-mapreduce-examples-3.2.2.jar \
wordcount \
/tmp/core-site.xml /tmp/out
# 查看计算结果
hdfs dfs -text /tmp/out/*
验证HA
# HDFS,kill 掉 active,standby节点会自动切换成 active
jps
15065 NameNode
kill -9 15065
# 重启后会自动成为 standby
hdfs --daemon start namenode
# YARN,从任一RM节点进入WebUI都会进入 hadoop-master:8088,kill 后从该节点无法打开 Web
jps
13743 ResourceManager
kill 13743
# 重启后能继续访问
yarn --daemon start resourcemanager
PostgreSQL部署
官网:PostgreSQL: Linux downloads (Red Hat family)
yum:RepoView: PostgreSQL PGDG 13 Updates RPMs
离线下载:PostgreSQL JDBC Download
依赖
yum install -y python-devel perl-ExtUtils-Embed python-devel gcc-c++ openssl-devel readline readline-devel bzip2 zlib zlib-devel openssl openssl-devel pam pam-devel libxml2 libxml2-devel libxslt libxslt-devel openldap openldap-devel libgeos-dev libproj-dev libgdal-dev xsltproc docbook-xsl docbook-xml imagemagick libmagickcore-dev dblatex tcl tcl-devel unixODBC unixODBC-devel libpng12 libpng12-devel libtiff libtiff-devel curl-devel
yum 安装
sudo yum install -y https://download.postgresql.org/pub/repos/yum/reporpms/EL-7-x86_64/pgdg-redhat-repo-latest.noarch.rpm
sudo yum install -y postgresql13-server
rpm -qa|grep postgres
systemctl 管理 PostgreSQL 服务
cp /lib/systemd/system/postgresql-13.service /etc/systemd/system/postgresql-13.service
自定义数据目录
# 两种方式
# 1. 单实例,直接修改文件
vim /etc/systemd/system/postgresql-13.service
Environment=PGDATA=/data/pgsql/13/data/
# 初始化
sudo /usr/pgsql-13/bin/postgresql-13-setup initdb postgresql-13
ll /data
# 2.导入环境变量
export PGHOME=/usr/pgsql-13
export PGDATA=/data/pgsql/13/data
export PGUSER=postgres
export LD_LIBRARY_PATH=$PGHOME/lib:$LD_LIBRARY_PATH
export PATH=$PGHOME/bin:$PATH
# 初始化
sudo /usr/pgsql-13/bin/postgresql-13-setup initdb
修改postgresql.conf
vim $PGDATA/postgresql.conf
# 修改
listen_addresses = '*' # what IP address(es) to listen on;
port = 5432 # (change requires restart)
max_connections = 1000 # (change requires restart)
superuser_reserved_connections = 5 # (change requires restart)
shared_buffers = 8192MB # min 128kB
work_mem = 16MB # min 64kB
maintenance_work_mem = 512MB # min 1MB
vacuum_cost_delay = 0 # 0-100 milliseconds (0 disables)
max_worker_processes = 128 # (change requires restart)
max_parallel_maintenance_workers = 8 # taken from max_parallel_workers
max_parallel_workers_per_gather = 16 # taken from max_parallel_workers
fsync = on # flush data to disk for crash safety
commit_delay = 1000 # range 0-100000, in microseconds
commit_siblings = 100 # range 1-1000
checkpoint_completion_target = 0.8 # checkpoint target duration, 0.0 - 1.0
effective_cache_size = 4GB
datestyle = 'iso, mdy'
lc_messages = 'en_US.UTF-8' # locale for system error message
lc_monetary = 'en_US.UTF-8' # locale for monetary formatting
lc_numeric = 'en_US.UTF-8' # locale for number formatting
lc_time = 'en_US.UTF-8' # locale for time formatting
default_text_search_config = 'pg_catalog.english'
修改pg_hba.conf
vim $PGDATA/pg_hba.conf
# 允许所有访问,新增一条
# host all all 0.0.0.0/0 md5
# 只允许内网 10.0.11.x 网段访问,增加一条(CIDR 必须合法,这里按 /24 网段放行)
host all all 10.0.11.0/24 trust
启动
sudo systemctl enable postgresql-13
sudo systemctl start postgresql-13
sudo systemctl status postgresql-13
登录
psql -h 127.0.0.1 -p 5432 -U postgres
# postgres用户下直接输入psql 就可以进入psql的命令界面
su - postgres
psql -U postgres
alter user postgres with password '123456';
# 创建角色hive
select rolname from pg_roles;
create user hive with password '123456';
create database metastore owner hive;
grant all privileges on database metastore to hive;
alter role hive with createdb;
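创建完成后可在 psql 中简单验证(示意):
-- 查看角色列表,应包含 hive
\du
-- 查看数据库列表,metastore 的 owner 应为 hive
\l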
卸载
yum remove postgresql13-server
Hive部署
兼容性:通过下载页可以看到兼容的 Hadoop 版本
https://hive.apache.org/downloads.html
下载 hive-3.1.2
https://dlcdn.apache.org/hive/hive-3.1.2/apache-hive-3.1.2-bin.tar.gz
解压
tar -zxvf apache-hive-3.1.2-bin.tar.gz -C /opt/apps
mv /opt/apps/apache-hive-3.1.2-bin /opt/apps/hive-3.1.2
环境变量
vim /etc/profile.d/hdfs_env.sh
# hive
export HIVE_HOME=/opt/apps/hive-3.1.2
export HIVE_CONF_DIR=$HIVE_HOME/conf
export PATH=$PATH:$HIVE_HOME/bin
source /etc/profile.d/hdfs_env.sh
解决依赖问题
# 上传 jdbc connector
cp postgresql-42.3.1.jar /opt/apps/hive-3.1.2/lib/
mv $HIVE_HOME/lib/postgresql-9.4.1208.jre7.jar $HIVE_HOME/lib/postgresql-9.4.1208.jre7.jar.bak
# Hadoop3.2.2的guava比Hive的版本高,拷贝到Hive的lib下
cp $HADOOP_HOME/share/hadoop/common/lib/guava-27.0-jre.jar $HIVE_HOME/lib/
mv $HIVE_HOME/lib/guava-19.0.jar $HIVE_HOME/lib/guava-19.0.jar.bak
for f in $HIVE_HOME/lib/log4j-slf4j-impl-*.jar; do mv "$f" "$f.bak"; done
配置
cd /opt/apps/hive-3.1.2/conf
cp hive-default.xml.template hive-site.xml
mv hive-env.sh.template hive-env.sh
mv hive-log4j2.properties.template hive-log4j2.properties
hive-log4j2.properties
cat > hive-log4j2.properties << 'EOL'
log4j.rootLogger=WARN, CA
log4j.appender.CA=org.apache.log4j.ConsoleAppender
log4j.appender.CA.layout=org.apache.log4j.PatternLayout
log4j.appender.CA.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n
EOL
hive-env.sh
vim hive-env.sh
# 修改
HADOOP_HOME=/opt/apps/hadoop-3.2.2
export HIVE_CONF_DIR=/opt/apps/hive-3.1.2/conf
export HIVE_AUX_JARS_PATH=/opt/apps/hive-3.1.2/lib
hive-site.xml
Configuration Properties - Apache Hive - Apache Software Foundation
vim hive-site.xml
hive.metastore.db.type
postgres
javax.jdo.option.ConnectionURL
jdbc:postgresql://hadoop-master:5432/metastore
javax.jdo.option.ConnectionDriverName
org.postgresql.Driver
javax.jdo.option.ConnectionUserName
hive
javax.jdo.option.ConnectionPassword
123456
hive.metastore.warehouse.dir
/user/hive/warehouse
datanucleus.schema.autoCreateAll
true
hive.metastore.schema.verification
false
hive.metastore.uris
thrift://hadoop-master:9083,thrift://hadoop-slave01:9083
hive.server2.support.dynamic.service.discovery
true
hive.server2.zookeeper.namespace
hiveserver2
hive.zookeeper.quorum
hadoop-master,hadoop-slave01,hadoop-slave02
hive.zookeeper.client.port
2181
hive.server2.thrift.bind.host
0.0.0.0
hive.server2.thrift.port
10000
hive.server2.authentication
NONE
hive.server2.enable.doAs
true
datanucleus.autoCreateSchema
true
datanucleus.fixedDatastore
true
datanucleus.autoStartMechanismMode
checked
hive.cli.print.current.db
true
hive.cli.print.header
true
hive.execution.engine
mr
hive.exec.scratchdir
/tmp/hive
hive.exec.local.scratchdir
/data/hive/scratchdir
hive.querylog.location
/data/hive/querylog
hive.downloaded.resources.dir
/data/hive/resources/${hive.session.id}_resources
hive.server2.webui.host
0.0.0.0
hive.server2.webui.port
10002
初始化数据库
# PG
schematool -initSchema -dbType postgres
# 查看数据库
psql -h localhost -Uhive -p 5432 -W -d metastore
\d
# MySQL
cp /usr/share/java/mysql-connector-java.jar /opt/apps/hive/lib/
CREATE DATABASE metastore DEFAULT CHARACTER SET utf8 DEFAULT COLLATE utf8_general_ci;
create user 'hive'@'%' identified by '123456';
grant all on metastore.* to 'hive'@'%';
show grants for 'hive'@'%';
schematool -initSchema -dbType mysql
创建目录
# HDFS
hdfs dfs -mkdir -p /user/hive/warehouse
hdfs dfs -ls /user/hive
# 本地
sudo mkdir /data/hive
sudo chown hdfs:hdfs /data/hive
启动
# 分别启动两台的metaStore和hiveServer2
# metastore启动后可以直接通过hive命令打开hive客户端
hive --service metastore
# hiveserver2启动后可以通过beeline连接hive
# !connect jdbc:hive2://hadoop-master:10000 hdfs 123456
hiveserver2
mkdir -p $HIVE_HOME/logs
nohup hive --service metastore >> $HIVE_HOME/logs/metastore_$(date +%Y_%m_%d).log 2>&1 &
nohup hiveserver2 >> $HIVE_HOME/logs/hiveserver2_$(date +%Y_%m_%d).log 2>&1 &
验证
su hdfs
hive
zkCli.sh
ls /hiveserver2
# 两个都启动后会注册到zk
# [serverUri=0.0.0.0:10000;version=3.1.2;sequence=0000000004, serverUri=0.0.0.0:10000;version=3.1.2;sequence=0000000008]
beeline
!connect jdbc:hive2://hadoop-master,hadoop-slave01/;serviceDiscoveryMode=zooKeeper;zooKeeperNamespace=hiveserver2 hdfs 123456
# kill 掉其中一台 metastore
jps
# 27090 RunJar
kill 27090
# beeline再次查看,剩下另外一台metastore同样提供服务
Hadoop权限问题
vim core-site.xml
hadoop.proxyuser.hdfs.groups
*
hadoop.proxyuser.hdfs.hosts
*
刷新权限
yarn rmadmin -refreshSuperUserGroupsConfiguration
hdfs dfsadmin -refreshSuperUserGroupsConfiguration
HBase部署
兼容性
Apache HBase ™ Reference Guide
搜索“Java support”,查看JDK兼容
Hadoop兼容
Apache HBase ™ Reference Guide
Apache HBase ™ Reference Guide
Zookeeper兼容
What version of ZooKeeper should I use?
The newer version, the better. ZooKeeper 3.4.x is required as of HBase 1.0.0
搜索“ZooKeeper Requirements”,如下:
An Apache ZooKeeper quorum is required. The exact version depends on your version of HBase, though the minimum ZooKeeper version is 3.4.x due to the useMulti
feature made default in 1.0.0 (see HBASE-16598).
下载 HBase2.4.9
https://dlcdn.apache.org/hbase/2.4.9/hbase-2.4.9-bin.tar.gz
解压
tar -zxvf hbase-2.4.9-bin.tar.gz -C /opt/apps
# 解压后的目录名即为 hbase-2.4.9,无需重命名
环境变量
vim /etc/profile.d/hdfs_env.sh
# hbase
export HBASE_HOME=/opt/apps/hbase-2.4.9
export PATH=$PATH:$HBASE_HOME/bin
source /etc/profile.d/hdfs_env.sh
配置
cd /opt/apps/hbase-2.4.9/conf
# 链接hadoop配置文件
ln -s /opt/apps/hadoop-3.2.2/etc/hadoop/core-site.xml core-site.xml
ln -s /opt/apps/hadoop-3.2.2/etc/hadoop/hdfs-site.xml hdfs-site.xml
Apache HBase ™ Reference Guide
regionservers
hadoop-master
hadoop-slave01
hadoop-slave02
hbase-env.sh
cat > hbase-env.sh << 'EOF'
export JAVA_HOME=/opt/apps/jdk1.8.0_211
export HBASE_MANAGES_ZK=false
EOF
hbase-site.xml
hbase.cluster.distributed
true
hbase.rootdir
hdfs://nameservice1/hbase
hbase.tmp.dir
/data/hbase
hbase.master.port
16000
hbase.master.info.bindAddress
0.0.0.0
hbase.master.info.port
16010
hbase.regionserver.port
16020
hbase.regionserver.info.bindAddress
0.0.0.0
hbase.regionserver.info.port
16030
hbase.regionserver.handler.count
100
hbase.regionserver.codecs
snappy,gz
hbase.hregion.memstore.flush.size
128
hbase.hregion.memstore.block.multiplier
4
hbase.hstore.compaction.max
10
hbase.hregion.max.filesize
10737418240
hbase.client.scanner.caching
1000
hbase.client.scanner.timeout.period
300000
hfile.block.cache.size
0.4
hbase.rpc.timeout
300000
hbase.zookeeper.quorum
hadoop-master,hadoop-slave01,hadoop-slave02
hbase.zookeeper.property.clientPort
2181
zookeeper.znode.parent
/hbase
zookeeper.session.timeout
120000
hbase.zookeeper.property.dataDir
/data/zookeeper
backup-masters
vim backup-masters
# 备用master的host,可多台
hadoop-slave02
启动
start-hbase.sh
stop-hbase.sh
hbase-daemon.sh start master
hbase-daemon.sh start regionserver
# Master WebUI
http://hadoop-master:16010
# 备用 Master
http://hadoop-slave02:16010
# RegionServer WebUI
http://hadoop-master:16030
验证
zkCli.sh
ls /hbase
ls /hbase/backup-masters
# [hadoop-slave02,16000,1642670777382]
# 在 master节点下线 HMaster
hbase-daemon.sh stop master
用 jps 确认 HMaster 进程已退出后,zookeeper 中 /hbase/backup-masters 下的 hadoop-slave02 节点随之消失:
hadoop-master 的 WebUI 无法打开,hadoop-slave02 自动切换为 HMaster。之后在任意一台重新启动 HMaster,它会自动注册为 backup-master。
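也可以用 hbase shell 做一次简单读写验证(示意,表名自拟):
hbase shell
create 'smoke_test','cf'
put 'smoke_test','r1','cf:a','1'
scan 'smoke_test'
disable 'smoke_test'
drop 'smoke_test'
exit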
Kafka部署
下载
Apache Kafka
解压
tar -zxvf kafka_2.12-2.6.3.tgz -C /opt/apps/
环境变量
vim /etc/profile.d/hdfs_env.sh
# kafka
export KAFKA_HOME=/opt/apps/kafka_2.12-2.6.3
export PATH=$PATH:$KAFKA_HOME/bin
source /etc/profile.d/hdfs_env.sh
创建目录
# 日志目录
mkdir $KAFKA_HOME/logs
# 数据
mkdir /data/kafka
配置
Apache Kafka
server.properties
broker.id=1
port=9092
delete.topic.enable=true
log.dirs=/data/kafka
zookeeper.connect=hadoop-master:2181,hadoop-slave01:2181,hadoop-slave02:2181
producer.properties
bootstrap.servers=hadoop-master:9092,hadoop-slave01:9092,hadoop-slave02:9092
consumer.properties
bootstrap.servers=hadoop-master:9092,hadoop-slave01:9092,hadoop-slave02:9092
将 kafka 分发到 hadoop-slave01,hadoop-slave02,并修改 broker.id
broker.id=2
broker.id=3
启动
# 在三个节点上分别执行
nohup kafka-server-start.sh -daemon /opt/apps/kafka_2.12-2.6.3/config/server.properties 2>&1 &
脚本
for i in hadoop-master hadoop-slave01 hadoop-slave02
do
echo "========== $i =========="
ssh $i 'kafka-server-start.sh -daemon /opt/apps/kafka_2.12-2.6.3/config/server.properties 2>&1'
echo $?
done
检验
# 查看 topic 列表
kafka-topics.sh --zookeeper localhost:2181 --list
# 创建 topic
kafka-topics.sh --zookeeper localhost:2181 --create --topic test --partitions 3 --replication-factor 3
# 查看topic
kafka-topics.sh --zookeeper localhost:2181 --describe --topic test
# producer
kafka-console-producer.sh --topic test --broker-list localhost:9092
# consumer
kafka-console-consumer.sh --topic test --bootstrap-server localhost:9092
Spark部署
下载
Downloads | Apache Spark
解压
tar -zxvf spark-2.4.8-bin-hadoop2.7.tgz -C /opt/apps/
mv spark-2.4.8-bin-hadoop2.7/ spark-2.4.8
环境变量
sudo vim /etc/profile.d/hdfs_env.sh
# spark
export SPARK_HOME=/opt/apps/spark-2.4.8
export PATH=$PATH:$SPARK_HOME/bin
# Hadoop与Spark都有脚本,使用别名区别
alias spark-start-all='/opt/apps/spark-2.4.8/sbin/start-all.sh'
alias spark-stop-all='/opt/apps/spark-2.4.8/sbin/stop-all.sh'
source /etc/profile.d/hdfs_env.sh
配置
cd $SPARK_HOME/conf
cp spark-defaults.conf.template spark-defaults.conf
mv spark-env.sh.template spark-env.sh
mv slaves.template slaves
cp log4j.properties.template log4j.properties
spark-env.sh
export JAVA_HOME=/opt/apps/jdk1.8.0_211
export SCALA_HOME=/opt/apps/scala-2.12.14
export HADOOP_HOME=/opt/apps/hadoop-3.2.2
export HADOOP_CONF_DIR=/opt/apps/hadoop-3.2.2/etc/hadoop
export SPARK_MASTER_WEBUI_PORT=8078
export SPARK_WORKER_WEBUI_PORT=8079
# Master通信端口7077
export SPARK_MASTER_PORT=7077
export SPARK_DAEMON_JAVA_OPTS="-Dspark.deploy.recoveryMode=ZOOKEEPER -Dspark.deploy.zookeeper.url=hadoop-master:2181,hadoop-slave01:2181,hadoop-slave02:2181 -Dspark.deploy.zookeeper.dir=/spark2"
slaves
hadoop-master
hadoop-slave01
hadoop-slave02
spark-defaults.conf
Spark on YARN 每次提交都会把本地的 Spark jar 上传到 HDFS 再分发到各个 NodeManager;预先把 jar 上传到 HDFS 并通过 spark.yarn.jars 指定,可以省去每次上传的过程。
hdfs dfs -mkdir -p /spark/jars
hdfs dfs -put /opt/apps/spark-2.4.8/jars/* /spark/jars/
vim spark-defaults.conf
spark.yarn.jars=hdfs://nameservice1/spark/jars/*
spark.yarn.historyServer.address=hadoop-slave02:18080
spark.history.ui.port=18080
spark.master.rest.port 16066
spark.eventLog.enabled true
spark.eventLog.dir hdfs://nameservice1/spark/directory
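spark.eventLog.dir 指向的 HDFS 目录需要预先创建,否则开启 eventLog 后提交任务会因目录不存在而报错(路径与上面配置保持一致):
hdfs dfs -mkdir -p /spark/directory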
启动
# standalone
spark-start-all
# jps 查看
# Master
# worker
# Master WebUI
http://hadoop-master:8078
# History
mkdir /tmp/spark-events
$SPARK_HOME/sbin/start-history-server.sh
# History WebUI
http://hadoop-slave02:18080
HA
# 在其他节点启动 Master
sh $SPARK_HOME/sbin/start-master.sh
# 检查
zkCli.sh
ls /spark2/leader_election
# [_c_7c35ab9e-e333-4e18-aea4-25501fca6a22-latch-0000000002, _c_c853231d-86b2-45ae-8734-fbfa1b25fe40-latch-0000000001]
# WebUI 可以看到状态是 Status: STANDBY
http://hadoop-slave01:8078
# kill 掉 active Master,再次查看会变成Active
jps
# 13436 Master
kill -9 13436
检验
检验部署是否成功
run-example SparkPi
# Pi is roughly 3.1407357036785184
Local
spark-shell
val textFile = sc.textFile("file:///opt/apps/spark-2.4.8/README.md")
textFile.first()
textFile.count()
textFile.map(line=>line.split(" ").size).reduce((a,b)=>if(a>b) a else b)
res1.collect
val textFile = sc.textFile("hdfs://nameservice1/tmp/README.md")
textFile.first
textFile.collect
:quit
Standalone
spark-submit \
--master spark://10.0.11.111:7077 \
--executor-memory 1G \
--total-executor-cores 1 \
--class org.apache.spark.examples.SparkPi \
/opt/apps/spark-2.4.8/examples/jars/spark-examples_2.11-2.4.8.jar
查看任务运行(Spark History):
http://hadoop-slave02:18080
YARN
spark-submit \
--master yarn \
--deploy-mode client \
--class org.apache.spark.examples.SparkPi \
/opt/apps/spark-2.4.8/examples/jars/spark-examples_2.11-2.4.8.jar
查看任务运行(YARN History):
http://hadoop-slave02:19888
Flume部署
下载Flume1.9.0
Apache Downloads
解压
tar -zxvf apache-flume-1.9.0-bin.tar.gz -C /opt/apps/
mv apache-flume-1.9.0-bin/ flume-1.9.0
环境变量
vim /etc/profile.d/hdfs_env.sh
# flume
export FLUME_HOME=/opt/apps/flume-1.9.0
export PATH=$PATH:$FLUME_HOME/bin
source /etc/profile.d/hdfs_env.sh
配置
cd /opt/apps/flume-1.9.0/conf/
cp flume-env.sh.template flume-env.sh
vim flume-env.sh
export JAVA_HOME=/opt/apps/jdk1.8.0_211
# 冲突
cd $FLUME_HOME/lib
mv guava-11.0.2.jar guava-11.0.2.jar.bak
cp $HADOOP_HOME/share/hadoop/common/lib/guava-27.0-jre.jar $FLUME_HOME/lib
检验
flume-ng version
Flume 1.9.0
Source code repository: https://git-wip-us.apache.org/repos/asf/flume.git
Revision: d4fcab4f501d41597bc616921329a4339f73585e
Compiled by fszabo on Mon Dec 17 20:45:25 CET 2018
From source with checksum 35db629a3bda49d23e9b3690c80737f9
Flume 1.9.0 User Guide — Apache Flume
flume-conf.properties示例
# 1.Name the components on this agent
a1.sources = r1
a1.sinks = k1
a1.channels = c1
# 2.Describe/configure the source
a1.sources.r1.type = netcat
a1.sources.r1.bind = hadoop-master
a1.sources.r1.port = 44444
# 3.Use a channel which buffers events in memory
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100
# 4.Describe the sink
a1.sinks.k1.type = logger
# 5.Bind the source and sink to the channel
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
启动Agent
flume-ng agent \
--name a1 \
--conf $FLUME_HOME/conf \
--conf-file $FLUME_HOME/conf/flume-conf.properties \
-Dflume.root.logger=INFO,console
测试
telnet hadoop-master 44444
Maven部署
下载Maven-3.6.3
Index of /maven/maven-3/3.6.3/binaries
解压
tar -zxvf apache-maven-3.6.3-bin.tar.gz -C /opt/apps/
mv /opt/apps/apache-maven-3.6.3/ /opt/apps/maven-3.6.3
环境变量
vim /etc/profile.d/hdfs_env.sh
# maven
export M2_HOME=/opt/apps/maven-3.6.3
export PATH=$PATH:$M2_HOME/bin
source /etc/profile.d/hdfs_env.sh
配置
vim /opt/apps/maven-3.6.3/conf/settings.xml
/data/repo
ali-public
https://maven.aliyun.com/repository/public
public
ali-central
https://maven.aliyun.com/repository/central
central
ali-apache-snapshots
https://maven.aliyun.com/repository/apache-snapshots
apache snapshots
ali-snapshots
https://maven.aliyun.com/repository/snapshots
snapshots
ali-releases
https://maven.aliyun.com/repository/releases
releases
ali-mapr-public
https://maven.aliyun.com/repository/mapr-public
mapr-public
ali-google
https://maven.aliyun.com/repository/google
ali-gradle-plugin
https://maven.aliyun.com/repository/gradle-plugin
gradle-plugin
ali-spring
https://maven.aliyun.com/repository/spring
spring
ali-spring-plugin
https://maven.aliyun.com/repository/spring-plugin
spring-plugin
ali-grails-core
https://maven.aliyun.com/repository/grails-core
grails-core
nexus-hortonworks
Nexus hortonworks
https://repo.hortonworks.com/content/groups/public/
central
cloudera
cloudera
https://repository.cloudera.com/artifactory/cloudera-repos
central
mavenl
Maven Repository Switchboard
http://repo1.maven.org/maven2/
central
maven2
Maven Repository Switchboard
http://repo1.maven.apache.org/maven2/
central
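以上是按"id / url / mirrorOf(或名称)"扁平列出的镜像仓库,写入 settings.xml 时形如下面的示意(以 ali-public 为例);开头的 /data/repo 推测对应 <localRepository>:
<localRepository>/data/repo</localRepository>
<mirrors>
  <mirror>
    <id>ali-public</id>
    <name>ali public</name>
    <mirrorOf>public</mirrorOf>
    <url>https://maven.aliyun.com/repository/public</url>
  </mirror>
</mirrors>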
检验
mvn -v
Kudu部署
下载Kudu1.14.0
https://dlcdn.apache.org/kudu/1.14.0/apache-kudu-1.14.0.tar.gz
Git
git clone https://github.com/apache/kudu
解压
tar -zxvf apache-kudu-1.14.0.tar.gz -C /opt/apps
mv apache-kudu-1.14.0/ kudu-1.14.0
编译
Apache Kudu - Installing Apache Kudu
依赖
sudo yum install -y autoconf automake cyrus-sasl-devel cyrus-sasl-gssapi \
cyrus-sasl-plain flex gcc gcc-c++ gdb git java-1.8.0-openjdk-devel \
krb5-server krb5-workstation libtool make openssl-devel patch \
pkgconfig redhat-lsb-core rsync unzip vim-common which
# Centos7,添加依赖
sudo yum install -y centos-release-scl-rh
sudo yum install -y devtoolset-8
# 安装memkind,用于支持Kudu NVM (non-volatile memory)
sudo yum remove memkind
sudo yum install -y numactl-libs numactl-devel
git clone https://github.com/memkind/memkind.git
cd memkind
./build.sh --prefix=/usr
sudo make install
sudo ldconfig
# 如果需要编译文档,需要安装依赖,RHEL/centos 7以上用 rubygems 替换 gem
sudo yum install -y gem graphviz zlib-devel rh-ruby23
build-support/enable_devtoolset.sh thirdparty/build-if-necessary.sh
编译
mkdir -p build/release
cd build/release
../../build-support/enable_devtoolset.sh \
../../thirdparty/installed/common/bin/cmake \
-DCMAKE_BUILD_TYPE=release ../.. \
-DNO_TESTS=1
make -j4
拷贝文件
拷贝 /build/release/bin/ 下的3个文件 kudu,kudu-master,kudu-tserver,和 www 目录,并创建 conf 目录
├── bin
│ ├── kudu
│ ├── kudu-master
│ └── kudu-tserver
├── conf
│ ├── master.gflagfile
│ └── tserver.gflagfile
└── www
├── bootstrap
│ ├── css
│ │ ├── bootstrap.min.css
│ │ ├── bootstrap.min.css.map
│ │ ├── bootstrap-table.min.css
│ │ ├── bootstrap-theme.min.css
│ │ └── bootstrap-theme.min.css.map
│ ├── fonts
│ │ └── glyphicons-halflings-regular.woff
│ └── js
│ ├── bootstrap.min.js
│ └── bootstrap-table.min.js
├── config.mustache
├── d3.v2.js
├── dashboards.mustache
├── epoch.0.5.2.min.css
├── epoch.0.5.2.min.js
├── favicon.ico
├── home.mustache
├── index.html
├── jquery-3.5.1.min.js
├── key.png
├── kudu.css
├── kudu.js
├── log-anchors.mustache
├── logo.png
├── logs.mustache
├── maintenance-manager.mustache
├── masters.mustache
├── metrics-epoch.js
├── metrics.html
├── scans.mustache
├── table.mustache
├── tables.mustache
├── tablet-consensus-status.mustache
├── tablet.mustache
├── tablet-rowsetlayout-svg.mustache
├── tablet-servers.mustache
├── tablets.mustache
├── threadz.mustache
├── tracing.html
└── tracing.js
环境变量
sudo vim /etc/profile.d/hdfs_env.sh
# kudu
export KUDU_HOME=/opt/apps/kudu-1.14.0
export PATH=$PATH:$KUDU_HOME/bin
source /etc/profile.d/hdfs_env.sh
安装
如果完成拷贝和环境变量,这一步可以不用
# 默认安装到 /usr/local/bin和 /usr/local/sbin
cd /opt/apps/kudu-1.14.0-src/build/release
make DESTDIR=/opt/apps/kudu-1.14.0 install
创建目录
mkdir -p /opt/apps/kudu-1.14.0/logs /data/kudu/master /data/kudu/tserver
配置
Apache Kudu - Apache Kudu Configuration Reference
mkdir /opt/apps/kudu-1.14.0/conf
cd /opt/apps/kudu-1.14.0/conf
touch master.gflagfile tserver.gflagfile
简易配置
master.gflagfile
--rpc_bind_addresses=0.0.0.0:7051
--master_addresses=hadoop-master:7051
--fs_data_dirs=/data/kudu/master
--fs_metadata_dir=/data/kudu/master
--fs_wal_dir=/data/kudu/master
--log_dir=/opt/apps/kudu-1.14.0/logs
--webserver_doc_root=/opt/apps/kudu-1.14.0/www
tserver.gflagfile
--rpc_bind_addresses=0.0.0.0:7050
--tserver_master_addrs=hadoop-master:7051
--fs_wal_dir=/data/kudu/tserver
--fs_metadata_dir=/data/kudu/tserver
--fs_data_dirs=/data/kudu/tserver
--log_dir=/opt/apps/kudu-1.14.0/logs
--webserver_doc_root=/opt/apps/kudu-1.14.0/www
集群配置
master.gflagfile
--rpc_bind_addresses=hadoop-master:7051
--master_addresses=hadoop-master:7051,hadoop-slave01:7051,hadoop-slave02:7051
--webserver_enabled=true
--webserver_port=8051
--metrics_log_interval_ms=60000
--webserver_doc_root=/opt/apps/kudu-1.14.0/www
--fs_wal_dir=/data/kudu/master
--fs_metadata_dir=/data/kudu/master
--fs_data_dirs=/data/kudu/master
--log_dir=/opt/apps/kudu-1.14.0/logs
--colorlogtostderr=true
--enable_process_lifetime_heap_profiling=true
--heap_profile_path=/data/kudu/master/heap
--rpc_authentication=disabled
--unlock_unsafe_flags=true
--unlock_experimental_flags=true
--max_log_size=2048
--flush_threshold_secs=86400
--budgeted_compaction_target_rowset_size=67100000
--tablet_delta_store_minor_compact_max=100
--tablet_delta_store_major_compact_min_ratio=0.01
--memory_limit_hard_bytes=1073741824
--block_cache_capacity_mb=256
--default_num_replicas=3
--max_clock_sync_error_usec=10000000
--consensus_rpc_timeout_ms=30000
--follower_unavailable_considered_failed_sec=300
--leader_failure_max_missed_heartbeat_periods=3
--tserver_unresponsive_timeout_ms=60000
--rpc_num_service_threads=10
--max_negotiation_threads=50
--min_negotiation_threads=0
--rpc_negotiation_timeout_ms=3000
--rpc_default_keepalive_time_ms=65000
--rpc_num_acceptors_per_address=1
--master_ts_rpc_timeout_ms=60000
--remember_clients_ttl_ms=3600000
--remember_responses_ttl_ms=600000
--rpc_service_queue_length=500
--raft_heartbeat_interval_ms=500
--heartbeat_interval_ms=1000
--heartbeat_max_failures_before_backoff=3
tserver.gflagfile
--rpc_bind_addresses=0.0.0.0:7050
--tserver_master_addrs=hadoop-master:7051,hadoop-slave01:7051,hadoop-slave02:7051
--webserver_enabled=true
--webserver_port=8050
--metrics_log_interval_ms=60000
--webserver_doc_root=/opt/apps/kudu-1.14.0/www
--fs_wal_dir=/data/kudu/tserver
--fs_metadata_dir=/data/kudu/tserver
--fs_data_dirs=/data/kudu/tserver
--log_dir=/opt/apps/kudu-1.14.0/logs
--colorlogtostderr=true
--enable_process_lifetime_heap_profiling=true
--heap_profile_path=/data/kudu/tserver/heap
--rpc_authentication=disabled
--unlock_unsafe_flags=true
--unlock_experimental_flags=true
--max_log_size=1800
--flush_threshold_secs=86400
--budgeted_compaction_target_rowset_size=67100000
--tablet_delta_store_minor_compact_max=100
--tablet_delta_store_major_compact_min_ratio=0.01
--memory_limit_hard_bytes=1073741824
--block_cache_capacity_mb=536870912
--default_num_replicas=3
--consensus_rpc_timeout_ms=30000
--follower_unavailable_considered_failed_sec=300
--leader_failure_max_missed_heartbeat_periods=3
--tserver_unresponsive_timeout_ms=60000
--rpc_num_service_threads=10
--max_negotiation_threads=50
--min_negotiation_threads=0
--rpc_negotiation_timeout_ms=3000
--rpc_default_keepalive_time_ms=65000
--rpc_num_acceptors_per_address=1
--master_ts_rpc_timeout_ms=60000
--remember_clients_ttl_ms=3600000
--remember_responses_ttl_ms=600000
--rpc_service_queue_length=500
--raft_heartbeat_interval_ms=500
--heartbeat_interval_ms=1000
--heartbeat_max_failures_before_backoff=3
启动
# master
nohup kudu-master --flagfile=/opt/apps/kudu-1.14.0/conf/master.gflagfile &
# tserver
nohup kudu-tserver --flagfile=/opt/apps/kudu-1.14.0/conf/tserver.gflagfile &
检验
kudu cluster ksck localhost:7051
kudu master list localhost:7051
kudu tserver list localhost:7051
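健康检查通过后,也可以列出当前的表,新集群应返回空列表:
kudu table list localhost:7051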
WebUI
master
http://hadoop-master:8051
tserver
http://hadoop-master:8050
停止
jobs -l
# [1]+ 21795 运行中 kudu-master --flagfile=/opt/apps/kudu-1.14.0/conf/master.gflagfile &
kill 21795
Impala部署
注意:最好在干净的环境下编译,编译过程会下载大量依赖并修改本地环境变量。
下载 Impala-3.4.0
https://archive.apache.org/dist/impala/3.4.0/apache-impala-3.4.0.tar.gz
解压
tar -zxvf apache-impala-3.4.0.tar.gz -C /opt/apps/
mv /opt/apps/apache-impala-3.4.0 impala-3.4.0
编译
Building Impala - Impala - Apache Software Foundation
准备
修改 Ant 版本
cd /opt/apps/impala-3.4.0
vim bin/bootstrap_system.sh
# 243行,修改Ant版本为1.9.16,注释sha512验证
redhat sudo wget -nv \
https://downloads.apache.org/ant/binaries/apache-ant-1.9.16-bin.tar.gz
#redhat sha512sum -c - <<< 'b9324cffeb5b113fa289126db1408b9a0125757b598d763f076fc5deec97fb43f27979974cadcac79b6573d8
#4dcb2d1d5bf59b7972fb2abe5ed3d9fed445b04e apache-ant-1.9.16-bin.tar.gz'
redhat sudo tar -C /usr/local -xzf apache-ant-1.9.16-bin.tar.gz
redhat sudo ln -s /usr/local/apache-ant-1.9.16/bin/ant /usr/local/bin
如果编译过程中已经建立软链,编译失败,那么把这几行都注释掉,避免重复下载
预下载 m2_archive.tar.gz
m2_archive.tar.gz下载过程中总是中断,手动下载并上传到 /tmp
https://jenkins.impala.io/job/all-build-options-ub1604/7919//artifact/Impala/logs/m2_archive.tar.gz
vim /opt/jars/impala-4.0.0/bin/jenkins/populate_m2_directory.py
# 修改和注释下面的
tmp_tarball_location = "/tmp/tarball_name"
#subprocess.check_call(["wget", "-q", url, "-O", tmp_tarball_location])
impala-3.4 需要修改pom文件
vim impala-parent/pom.xml
# 修改
cdh.rcs.releases.repo
https://repository.cloudera.com/artifactory/cdh-releases-rcs
CDH Releases Repository
true
# 删除
cloudera.thirdparty.repo
https://repository.cloudera.com/content/repositories/third-party
Cloudera Third Party Repository
false
# 修改
cloudera.thirdparty.repo
https://repository.cloudera.com/artifactory/cdh-releases-rcs
Cloudera Third Party Repository
false
执行编译
export IMPALA_HOME=`pwd`
bin/bootstrap_system.sh
source $IMPALA_HOME/bin/impala-config.sh
# 编译下载依赖在 toolchain 下,需要很久
./buildall.sh -noclean -notests -skiptests
拷贝编译文件
#!/bin/bash
IMPALA_SRC_HOME=/opt/jars/impala-4.0.0-src
IMPALA_HOME=/opt/apps/impala-4.0.0
# if exist the dest directory then clear it.
if [ -d "${IMPALA_HOME}" ]; then
rm -rf ${IMPALA_HOME}/*
else
mkdir -p ${IMPALA_HOME}
fi
mkdir ${IMPALA_HOME}/be
mkdir ${IMPALA_HOME}/lib
mkdir ${IMPALA_HOME}/dependency
mkdir ${IMPALA_HOME}/sbin
mkdir ${IMPALA_HOME}/shell
cp -rf ${IMPALA_SRC_HOME}/be/build/debug/* ${IMPALA_HOME}/be/
cp -rf ${IMPALA_SRC_HOME}/toolchain/toolchain-packages-gcc7.5.0/gcc-7.5.0/lib64/* ${IMPALA_HOME}/lib/
cp -rf ${IMPALA_SRC_HOME}/fe/target/impala-frontend-0.1-SNAPSHOT.jar ${IMPALA_HOME}/lib/
cp -rf ${IMPALA_SRC_HOME}/fe/target/dependency/* ${IMPALA_HOME}/dependency/
cp -rf ${IMPALA_SRC_HOME}/shell/build/impala-shell-4.0.0-RELEASE/* ${IMPALA_HOME}/shell/
cp -r ${IMPALA_SRC_HOME}/www ${IMPALA_HOME}/
echo "Finished"
环境变量
sudo vim /etc/profile.d/hdfs_env.sh
# impala
export IMPALA_HOME=/opt/apps/impala-3.4.0
export PATH=$PATH:$IMPALA_HOME/bin
source /etc/profile.d/hdfs_env.sh
配置
默认配置文件为 /etc/default/impala,没有则创建
IMPALA_CATALOG_SERVICE_HOST=hadoop-slave02
IMPALA_STATE_STORE_HOST=hadoop-slave02
IMPALA_CATALOG_SERVICE_PORT=26000
IMPALA_STATE_STORE_PORT=24000
IMPALA_BACKEND_PORT=22000
IMPALA_LOG_DIR=/opt/apps/impala-4.0.0/logs
export IMPALA_CATALOG_ARGS=" -log_dir=${IMPALA_LOG_DIR} -catalog_service_port=${IMPALA_CATALOG_SERVICE_PORT}"
export IMPALA_STATE_STORE_ARGS=" -log_dir=${IMPALA_LOG_DIR} -state_store_port=${IMPALA_STATE_STORE_PORT}"
export IMPALA_SERVER_ARGS=" \
-log_dir=${IMPALA_LOG_DIR} \
-catalog_service_host=${IMPALA_CATALOG_SERVICE_HOST} \
-state_store_port=${IMPALA_STATE_STORE_PORT} \
-state_store_host=${IMPALA_STATE_STORE_HOST} \
-use_statestore \
-be_port=${IMPALA_BACKEND_PORT} -mem_limit=60%"
# -kudu_master_hosts=hadoop-master:7051
export ENABLE_CORE_DUMPS=${ENABLE_COREDUMPS:-false}
export JAVA_HOME=/opt/apps/jdk1.8.0_211
export IMPALA_HOME=/opt/jars/impala
export IMPALA_CONF_DIR=/etc/impala/conf
export HADOOP_CONF_DIR=/etc/hadoop/conf
export HIVE_CONF_DIR=/etc/hive/conf
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$IMPALA_HOME/lib:$HADOOP_HOME/lib/native
for f in /opt/jars/impala/lib/*.jar; do
export CLASSPATH=$CLASSPATH:$f
done
export CLASSPATH=$CLASSPATH:/etc/impala/conf
MYSQL_CONNECTOR_JAR=/usr/share/java/mysql-connector-java.jar
# 注意:变量中不要再包含 nohup 和 &,由启动时的命令行统一添加
export IMPALAD_START="${IMPALA_HOME}/bin/impalad ${IMPALA_SERVER_ARGS}"
export CATALOG_START="${IMPALA_HOME}/bin/catalogd ${IMPALA_CATALOG_ARGS}"
export STATESTORE_START="${IMPALA_HOME}/bin/statestored ${IMPALA_STATE_STORE_ARGS}"
创建配置文件软链
ln -s /opt/apps/impala-4.0.0/conf /etc/impala/conf
ln -s /opt/apps/hadoop-3.2.2/etc/hadoop/core-site.xml /etc/impala/conf/core-site.xml
ln -s /opt/apps/hadoop-3.2.2/etc/hadoop/hdfs-site.xml /etc/impala/conf/hdfs-site.xml
ln -s /opt/apps/hive-3.1.2/conf/hive-site.xml /etc/impala/conf/hive-site.xml
ln -s /opt/apps/hbase-2.3.7/conf/hbase-site.xml /etc/impala/conf/hbase-site.xml
mkdir -p /etc/hadoop/conf
ln -s /opt/apps/hadoop-3.2.2/etc/hadoop/core-site.xml /etc/hadoop/conf/core-site.xml
ln -s /opt/apps/hadoop-3.2.2/etc/hadoop/hdfs-site.xml /etc/hadoop/conf/hdfs-site.xml
ln -s /opt/apps/hadoop-3.2.2/etc/hadoop/yarn-site.xml /etc/hadoop/conf/yarn-site.xml
mkdir -p /etc/hive/conf
ln -s /opt/apps/hive-3.1.2/conf/hive-site.xml /etc/hive/conf/hive-site.xml
mkdir -p /etc/hbase/conf
ln -s /opt/apps/hbase-2.3.7/conf/hbase-site.xml /etc/hbase/conf/hbase-site.xml
ln -s /opt/apps/impala-4.0.0/shell/impala-shell /bin/impala-shell
所有节点创建短路读取路径
mkdir -p /var/run/hdfs-sockets
配置hdfs-site.xml
dfs.client.read.shortcircuit
true
dfs.domain.socket.path
/var/run/hdfs-sockets/dn
dfs.datanode.hdfs-blocks-metadata.enabled
true
dfs.client.file-block-storage-locations.timeout
30000
修改 /var/run/hdfs-sockets 目录权限,用户为hdfs,组为root
sudo chown hdfs:root -R /var/run/hdfs-sockets
启动
source /etc/default/impala
nohup $STATESTORE_START &
nohup $CATALOG_START &
nohup $IMPALAD_START &
# 如需接入 Kudu,impalad 追加 -kudu_master_hosts 参数启动
nohup $IMPALAD_START -kudu_master_hosts=hadoop-master:7051 &
# 或者把输出重定向到日志文件
nohup ${STATESTORE_START} > $IMPALA_HOME/logs/statestore.log 2>&1 &
nohup ${CATALOG_START} > $IMPALA_HOME/logs/catalog.log 2>&1 &
nohup ${IMPALAD_START} -kudu_master_hosts=hadoop-master:7051 > $IMPALA_HOME/logs/impalad.log 2>&1 &
验证
ps -ef | grep impala
#root 29741 26654 0 10:20 pts/0 00:00:00 /opt/jars/impala/bin/statestored -log_dir=/opt/jars/impala/logs -state_store_port=24000
#root 29799 27425 16 10:20 pts/1 00:00:07 /opt/jars/impala/bin/catalogd -log_dir=/opt/jars/impala/logs -catalog_service_port=26000
#root 29932 29834 24 10:21 pts/2 00:00:09 /opt/jars/impala/bin/impalad -log_dir=/opt/jars/impala/logs -catalog_service_host=hadoop-slave02 -state_store_port=24000 -state_store_host=hadoop-slave02 -use_statestore -be_port=22000 -mem_limit=60%
jobs -l
Statestored WebUI
http://hadoop-slave02:25010/
Catalog WebUI
http://hadoop-slave02:25020
impalad WebUI
http://hadoop-slave02:25000/
编译问题
执行 sudo service postgresql initdb 报错:
Hint: the preferred way to do this is now "postgresql-setup initdb"
Data directory is not empty!
# 解决:清空旧的数据目录后重新初始化
cd /var/lib/pgsql
rm -rf data/
启动报错
impalad: /usr/lib64/libstdc++.so.6: version `GLIBCXX_3.4.21' not found (required by impalad)
impalad: /usr/lib64/libstdc++.so.6: version `GLIBCXX_3.4.22' not found (required by impalad)
impalad: /usr/lib64/libstdc++.so.6: version `CXXABI_1.3.8' not found (required by impalad)
impalad: /usr/lib64/libstdc++.so.6: version `GLIBCXX_3.4.20' not found (required by impalad)
impalad: /usr/lib64/libstdc++.so.6: version `CXXABI_1.3.11' not found (required by impalad)
impalad: /usr/lib64/libstdc++.so.6: version `GLIBCXX_3.4.21' not found (required by /opt/apps/impala-4.0.0/lib/libkudu_client.so.0)
impalad: /usr/lib64/libstdc++.so.6: version `CXXABI_1.3.8' not found (required by /opt/apps/impala-4.0.0/lib/libkudu_client.so.0)
impalad: /usr/lib64/libstdc++.so.6: version `GLIBCXX_3.4.20' not found (required by /opt/apps/impala-4.0.0/lib/libkudu_client.so.0)
impalad: /usr/lib64/libstdc++.so.6: version `CXXABI_1.3.9' not found (required by /opt/apps/impala-4.0.0/lib/libkudu_client.so.0)
impalad: /usr/lib64/libstdc++.so.6: version `GLIBCXX_3.4.22' not found (required by /opt/apps/impala-4.0.0/lib/libkudu_client.so.0)
cp /opt/apps/impala-4.0.0/lib/libstdc++.so.6.0.24 /usr/lib64
cd /usr/lib64
ln -snf libstdc++.so.6.0.24 libstdc++.so.6
报错
Environment variable CLASSPATH not set!
getJNIEnv: getGlobalJNIEnv failed
# 通常是启动前未 source /etc/default/impala(其中导出了 CLASSPATH)导致,source 之后再启动即可
Impala负载均衡
安装 haproxy
yum install -y haproxy
配置 haproxy.cfg
vim /etc/haproxy/haproxy.cfg
listen impalashell # 监听 impala-shell
bind 0.0.0.0:21000 # proxy绑定的IP和端口
mode tcp # 以四层(TCP)方式代理,重要
option tcplog
balance roundrobin # 调度算法 'leastconn' 最少连接数分配,或者 'roundrobin'轮询
server impalashell_1 hadoop-master:21000 check # 所有impalad节点,别名,主机名,端口, check检查正常才转发给impalad
server impalashell_2 hadoop-slave01:21000 check
server impalashell_3 hadoop-slave02:21000 check
listen impalajdbc # 监听jdbc的请求,通过客户端界面连接就是用的jdbc
bind 0.0.0.0:21050
mode tcp
option tcplog
balance roundrobin
server impalajdbc_1 hadoop-master:21050 check
server impalajdbc_2 hadoop-slave01:21050 check
server impalajdbc_3 hadoop-slave02:21050 check
连接
impala-shell -i hadoop-proxy:21000
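连接成功后可执行简单查询验证(示意):
show databases;
select version();
-- 多次重新连接,观察连到的 impalad 主机是否在三台之间轮询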
Hue集成Impala
vim /opt/apps/hue/desktop/conf/pseudo-distributed.ini
[impala] # 1140行左右
server_host=hadoop-proxy # 任意一台impala-server主机,使用了haproxy实现负载均衡,则填haproxy绑定的主机
server_port=21050 # hue是通过jdbc的方式连接的impalad
impala_conf_dir=/etc/impala/conf # impala的配置文件目录
Tez部署
Apache Tez – Install and Deployment Instructions
下载 Tez-0.10.1 源码
Apache Downloads
解压
tar -zxvf apache-tez-0.10.1-src.tar.gz
修改pom
vim pom.xml
# 将 pom.xml 中的 hadoop.version 修改为 3.2.2
# 编译 tez-ui 需要翻墙,很麻烦且基本不会使用,可以跳过 tez-ui 模块
...
编译
编译工具
yum -y install autoconf automake libtool cmake ncurses-devel openssl-devel lzo-devel zlib-devel gcc gcc-c++
编译 protobuf-2.5.0
https://codeload.github.com/protocolbuffers/protobuf/tar.gz/refs/tags/v2.5.0
tar -zxvf protobuf-2.5.0.tar.gz
cd protobuf-2.5.0
./configure
make install
编译Tez
cd apache-tez-0.10.1-src
mvn clean package -DskipTests=true -Dmaven.javadoc.skip=true
编译完成,apache-tez-0.10.1-src/tez-dist/target/ 下
解压
解压 tez-0.10.1-minimal.tar.gz
mkdir /opt/apps/tez-0.10.1
tar -zxvf /opt/jars/tez-0.10.1-minimal.tar.gz -C /opt/apps/tez-0.10.1/
上传
上传 tez-0.10.1.tar.gz 到 HDFS
su hdfs
hdfs dfs -mkdir /tez
hdfs dfs -put /opt/apps/tez-0.10.1.tar.gz /tez/
配置
tez-site.xml
在 $HADOOP_HOME/etc/hadoop 下新建 tez-site.xml 文件
tez.lib.uris
${fs.defaultFS}/tez/tez-0.10.1.tar.gz
tez.use.cluster.hadoop-libs
true
tez.am.resource.memory.mb
1024
tez.am.resource.cpu.vcores
1
tez.container.max.java.heap.fraction
0.4
tez.task.resource.memory.mb
1024
tez.task.resource.cpu.vcores
1
tez.history.logging.service.class
org.apache.tez.dag.history.logging.ats.ATSHistoryLoggingService
hadoop-env.sh
修改 Hadoop 启动环境
TEZ_CONF_DIR=/opt/apps/hadoop-3.2.2/etc/hadoop/
TEZ_JARS=/opt/apps/tez-0.10.1
export HADOOP_CLASSPATH=${HADOOP_CLASSPATH}:${TEZ_CONF_DIR}:${TEZ_JARS}/*:${TEZ_JARS}/lib/*
hive-site.xml
修改 hive 配置计算引擎
vim /opt/apps/hive-3.1.2/conf/hive-site.xml
hive.execution.engine
tez
hive.tez.container.size
2048
hive-env.sh
修改 Hive 启动环境
export TEZ_HOME=/opt/apps/tez-0.10.1
export TEZ_JARS=""
for jar in `ls $TEZ_HOME |grep jar`; do
export TEZ_JARS=$TEZ_JARS:$TEZ_HOME/$jar
done
for jar in `ls $TEZ_HOME/lib`; do
export TEZ_JARS=$TEZ_JARS:$TEZ_HOME/lib/$jar
done
export HIVE_AUX_JARS_PATH=${HIVE_HOME}/lib$TEZ_JARS
mapred-site.xml
mapreduce.framework.name
yarn-tez
检验
cd /opt/apps/tez-0.10.1
hdfs dfs -put LICENSE /tez
yarn jar /opt/apps/tez-0.10.1/tez-examples-0.10.1.jar orderedwordcount /tez/LICENSE /tez/output
hive
show databases;
create database test_db;
use test_db;
create table test_tb(id int, name string);
insert into test_tb values(1,"aaa");
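insert 本身就会提交一次 Tez 作业;也可以再跑一个聚合查询确认执行引擎已切换(示意):
select count(*) from test_tb;
-- 控制台应显示 Tez 的 DAG/进度信息,而不再是 MapReduce Job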
Lzo编译安装
下载
wget http://www.oberhumer.com/opensource/lzo/download/lzo-2.10.tar.gz
tar -zxvf lzo-2.10.tar.gz
cd lzo-2.10
./configure --prefix=/usr/local/hadoop/lzo/
make
make install
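编译安装完成后可确认产物(假设 prefix 与上面一致):
ls /usr/local/hadoop/lzo/include/lzo /usr/local/hadoop/lzo/lib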