(1) Clone three virtual machines
(1) IP addresses
```bash
# Set the IP of hadoop181 to 192.168.207.181
[root@hadoop181 ~]# vim /etc/sysconfig/network-scripts/ifcfg-ens33
# Set the IP of hadoop182 to 192.168.207.182
[root@hadoop182 ~]# vim /etc/sysconfig/network-scripts/ifcfg-ens33
# Set the IP of hadoop183 to 192.168.207.183
[root@hadoop183 ~]# vim /etc/sysconfig/network-scripts/ifcfg-ens33
```
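For reference, a static-IP `ifcfg-ens33` for hadoop181 could look roughly like the sketch below; the gateway and DNS values are assumptions that depend on your VM network, so adjust them to your environment.

```bash
# /etc/sysconfig/network-scripts/ifcfg-ens33 (sketch for hadoop181)
TYPE=Ethernet
NAME=ens33
DEVICE=ens33
BOOTPROTO=static          # static address instead of dhcp
ONBOOT=yes                # bring the interface up at boot
IPADDR=192.168.207.181    # use .182 / .183 on the other two hosts
NETMASK=255.255.255.0
GATEWAY=192.168.207.2     # assumption: your VMware/VirtualBox NAT gateway
DNS1=192.168.207.2        # assumption: your DNS server
```

After editing, restart the network service (e.g. `systemctl restart network`) so the new address takes effect.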
(2) Disable the firewall
```bash
# Disable the firewall on hadoop181
[root@hadoop181 ~]# systemctl disable firewalld && systemctl stop firewalld
# Disable the firewall on hadoop182
[root@hadoop182 ~]# systemctl disable firewalld && systemctl stop firewalld
# Disable the firewall on hadoop183
[root@hadoop183 ~]# systemctl disable firewalld && systemctl stop firewalld
```
(3) Disable SELinux
```bash
# Disable SELinux on hadoop181: set SELINUX=disabled
[root@hadoop181 ~]# vim /etc/selinux/config
# Disable SELinux on hadoop182: set SELINUX=disabled
[root@hadoop182 ~]# vim /etc/selinux/config
# Disable SELinux on hadoop183: set SELINUX=disabled
[root@hadoop183 ~]# vim /etc/selinux/config
```
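The change in /etc/selinux/config only takes effect after a reboot. To stop enforcement immediately as well, an optional extra step is:

```bash
# Switch SELinux to permissive mode for the current boot
[root@hadoop181 ~]# setenforce 0
# Verify: should print Permissive (or Disabled after a reboot)
[root@hadoop181 ~]# getenforce
```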
(4) Set the hostname
```bash
# Set the hostname of hadoop181 to hadoop181
[root@hadoop181 ~]# hostnamectl set-hostname hadoop181   # first machine
# Set the hostname of hadoop182 to hadoop182
[root@hadoop182 ~]# hostnamectl set-hostname hadoop182   # second machine
# Set the hostname of hadoop183 to hadoop183
[root@hadoop183 ~]# hostnamectl set-hostname hadoop183   # third machine
```
Note: the prompts above already show the hostnames because these machines were configured earlier; on a machine that has not been set up yet, it looks like this:
```bash
[root@localhost ~]# hostnamectl set-hostname hadoop183
```
(5) Add entries to the hosts file
```bash
[root@hadoop181 ~]# vim /etc/hosts
[root@hadoop182 ~]# vim /etc/hosts
[root@hadoop183 ~]# vim /etc/hosts
```
Append the following on all three hosts:
```
192.168.207.181 hadoop181
192.168.207.182 hadoop182
192.168.207.183 hadoop183
```
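An optional quick check that the names now resolve between the hosts:

```bash
# Each hostname should answer from the address configured above
[root@hadoop181 ~]# ping -c 1 hadoop182
[root@hadoop181 ~]# ping -c 1 hadoop183
```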
(1) Create the user
```bash
# Create the user and group on hadoop181
[root@hadoop181 ~]# groupadd hadoop
[root@hadoop181 ~]# useradd -s /bin/bash -d /home/hadoop -g hadoop hadoop
[root@hadoop181 ~]# passwd hadoop    # set the password to Hadoop
# Create the user and group on hadoop182
[root@hadoop182 ~]# groupadd hadoop
[root@hadoop182 ~]# useradd -s /bin/bash -d /home/hadoop -g hadoop hadoop
[root@hadoop182 ~]# passwd hadoop    # set the password to Hadoop
# Create the user and group on hadoop183
[root@hadoop183 ~]# groupadd hadoop
[root@hadoop183 ~]# useradd -s /bin/bash -d /home/hadoop -g hadoop hadoop
[root@hadoop183 ~]# passwd hadoop    # set the password to Hadoop
```
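An optional check that the account and group were created as intended:

```bash
# Should print something like: uid=...(hadoop) gid=...(hadoop) groups=...(hadoop)
[root@hadoop181 ~]# id hadoop
```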
(2) Give the new hadoop user sudo privileges
```bash
[root@hadoop181 ~]# sudo vim /etc/sudoers
[root@hadoop182 ~]# sudo vim /etc/sudoers
[root@hadoop183 ~]# sudo vim /etc/sudoers
```
Add the following line on all three hosts:
```bash
hadoop  ALL=(ALL)  NOPASSWD:ALL    # add this line
```
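An optional check, run as the hadoop user, that the entry took effect:

```bash
# Should print "root" without prompting for a password
[hadoop@hadoop181 ~]$ sudo whoami
```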
(3) Prepare helper scripts for the hadoop user
```bash
# Create a bin directory under the hadoop user's home directory and add the two helper
# scripts, named xsync and xssh (these are the scripts used in the rest of this document)
[hadoop@hadoop181 ~]$ mkdir bin
[hadoop@hadoop181 ~]$ cd bin
```
Script for running a command on every cluster node over SSH: see my other post, "ssh集群批量操作脚本".
Script for distributing files across the cluster: see my other post, "集群间文件分发".
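The actual xsync and xssh scripts are in the linked posts. Purely for orientation, minimal sketches of what such helpers typically look like are shown below; the host list and paths are assumptions for this three-node setup.

```bash
#!/bin/bash
# xsync (sketch): rsync the given files/directories to the same path on the other nodes
HOSTS="hadoop182 hadoop183"                       # assumption: the nodes to copy to
for host in $HOSTS; do
  for path in "$@"; do
    dir=$(cd -P "$(dirname "$path")" && pwd)      # resolve the absolute parent directory
    name=$(basename "$path")
    ssh "$host" "mkdir -p $dir"                   # make sure the target directory exists
    rsync -av "$dir/$name" "$host:$dir"           # copy, preserving permissions and times
  done
done
```

```bash
#!/bin/bash
# xssh (sketch): run the given command on every node and show which host it ran on
for host in hadoop181 hadoop182 hadoop183; do
  echo "[DEBUG] ssh to $host to execute commands [ $* ]"
  ssh "$host" "$*"
done
```

Remember to make them executable (`chmod +x ~/bin/xsync ~/bin/xssh`); ~/bin is normally already on the PATH of the hadoop user's login shell.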
(4) Passwordless SSH between the cluster nodes
| Source host | Target 1 | Target 2 | Target 3 |
| --- | --- | --- | --- |
| hadoop181 | hadoop181 | hadoop182 | hadoop183 |
| hadoop182 | hadoop181 | hadoop182 | hadoop183 |
| hadoop183 | hadoop181 | hadoop182 | hadoop183 |

```bash
# Generate a key pair on each host
[hadoop@hadoop181 ~]$ ssh-keygen -t rsa
[hadoop@hadoop182 ~]$ ssh-keygen -t rsa
[hadoop@hadoop183 ~]$ ssh-keygen -t rsa
# Copy hadoop181's key to every machine
[hadoop@hadoop181 ~]$ ssh-copy-id hadoop@hadoop181
[hadoop@hadoop181 ~]$ ssh-copy-id hadoop@hadoop182
[hadoop@hadoop181 ~]$ ssh-copy-id hadoop@hadoop183
# Copy hadoop182's key to every machine
[hadoop@hadoop182 ~]$ ssh-copy-id hadoop@hadoop181
[hadoop@hadoop182 ~]$ ssh-copy-id hadoop@hadoop182
[hadoop@hadoop182 ~]$ ssh-copy-id hadoop@hadoop183
# Copy hadoop183's key to every machine
[hadoop@hadoop183 ~]$ ssh-copy-id hadoop@hadoop181
[hadoop@hadoop183 ~]$ ssh-copy-id hadoop@hadoop182
[hadoop@hadoop183 ~]$ ssh-copy-id hadoop@hadoop183
```
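An optional way to confirm the passwordless login works from each source host:

```bash
# Should log in and print the remote hostname without asking for a password
# (expected output here: hadoop182)
[hadoop@hadoop181 ~]$ ssh hadoop182 hostname
```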
(1) Upload the installation tarball to the hadoop user's home directory (this setup uses hadoop-3.1.3.tar.gz)
(2) Extract the tarball
```bash
[hadoop@hadoop181 ~]$ tar -zxvf hadoop-3.1.3.tar.gz
```
(3) Configure the Hadoop environment variables
```bash
[hadoop@hadoop181 hadoop-3.1.3]$ pwd
/home/hadoop/hadoop-3.1.3
[hadoop@hadoop181 hadoop-3.1.3]$ cd ~
[hadoop@hadoop181 ~]$ vim .bashrc
```
Append the following:
```bash
# JAVA HOME
export JAVA_HOME=/home/hadoop/jdk1.8.0_144
export PATH=$PATH:$JAVA_HOME/bin
# HADOOP HOME -- this is the path returned by pwd above
export HADOOP_HOME=/home/hadoop/hadoop-3.1.3
export PATH=$PATH:$HADOOP_HOME/sbin
export PATH=$PATH:$HADOOP_HOME/bin
```
(4) Reload the environment variables
```bash
[hadoop@hadoop181 ~]$ source .bashrc
```
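An optional check that the variables are picked up:

```bash
# Both should resolve without errors
[hadoop@hadoop181 ~]$ echo $HADOOP_HOME
[hadoop@hadoop181 ~]$ hadoop version
```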
(5) Distribute the environment variables
```bash
[hadoop@hadoop181 ~]$ xsync .bashrc
```
For the ZooKeeper installation, see my other note "zookeeper搭建实录": https://blog.csdn.net/SnowXu01/article/details/108372971
(1) Edit hadoop-env.sh
```bash
# Back up hadoop-env.sh, then set the JAVA_HOME path in it
[hadoop@hadoop181 ~]$ cp $HADOOP_HOME/etc/hadoop/hadoop-env.sh $HADOOP_HOME/etc/hadoop/hadoop-env.sh.back
[hadoop@hadoop181 ~]$ vim $HADOOP_HOME/etc/hadoop/hadoop-env.sh
```
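The line to set in hadoop-env.sh, matching the JDK path used in .bashrc above, would look like this:

```bash
# JDK location used by all Hadoop daemons (same path as JAVA_HOME in .bashrc)
export JAVA_HOME=/home/hadoop/jdk1.8.0_144
```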
(2) Back up and edit core-site.xml
```bash
# Back up core-site.xml before editing it
[hadoop@hadoop181 ~]$ cp $HADOOP_HOME/etc/hadoop/core-site.xml $HADOOP_HOME/etc/hadoop/core-site.xml.back
[hadoop@hadoop181 ~]$ vim $HADOOP_HOME/etc/hadoop/core-site.xml
```
(3) Configuration content
```xml
<configuration>
    <!-- Group the NameNodes into one logical cluster, hdfscluster -->
    <property>
        <name>fs.defaultFS</name>
        <value>hdfs://hdfscluster</value>
    </property>
    <property>
        <name>hadoop.tmp.dir</name>
        <value>/home/hadoop/hadoop-3.1.3/data/tmp</value>
    </property>
    <!-- Storage directory for the JournalNode servers -->
    <property>
        <name>dfs.journalnode.edits.dir</name>
        <!-- Note: only Hadoop 3.x accepts this form; on 2.x you must use an absolute path -->
        <value>file://${hadoop.tmp.dir}/jn</value>
    </property>
</configuration>
```
(1) Back up and edit hdfs-site.xml
(2) Back up and edit hdfs-site.xml
```bash
# Back up hdfs-site.xml before editing it
[hadoop@hadoop181 ~]$ cp $HADOOP_HOME/etc/hadoop/hdfs-site.xml $HADOOP_HOME/etc/hadoop/hdfs-site.xml.back
[hadoop@hadoop181 ~]$ vim $HADOOP_HOME/etc/hadoop/hdfs-site.xml
```
(2) Configuration content
```xml
<configuration>
    <!-- Logical name of the distributed cluster -->
    <property>
        <name>dfs.nameservices</name>
        <value>hdfscluster</value>
    </property>
    <!-- NameNode data directory -->
    <property>
        <name>dfs.namenode.name.dir</name>
        <value>file://${hadoop.tmp.dir}/name</value>
    </property>
    <!-- DataNode data directory -->
    <property>
        <name>dfs.datanode.data.dir</name>
        <value>file://${hadoop.tmp.dir}/data</value>
    </property>
    <!-- The NameNodes that make up the cluster -->
    <property>
        <name>dfs.ha.namenodes.hdfscluster</name>
        <value>nn1,nn2,nn3</value>
    </property>
    <!-- RPC address of nn1 (common ports: 9820, 9000, 8020) -->
    <property>
        <name>dfs.namenode.rpc-address.hdfscluster.nn1</name>
        <value>hadoop181:9000</value>
    </property>
    <!-- RPC address of nn2 -->
    <property>
        <name>dfs.namenode.rpc-address.hdfscluster.nn2</name>
        <value>hadoop182:9000</value>
    </property>
    <!-- RPC address of nn3 -->
    <property>
        <name>dfs.namenode.rpc-address.hdfscluster.nn3</name>
        <value>hadoop183:9000</value>
    </property>
    <!-- HTTP address of nn1 -->
    <property>
        <name>dfs.namenode.http-address.hdfscluster.nn1</name>
        <value>hadoop181:9870</value>
    </property>
    <!-- HTTP address of nn2 -->
    <property>
        <name>dfs.namenode.http-address.hdfscluster.nn2</name>
        <value>hadoop182:9870</value>
    </property>
    <!-- HTTP address of nn3 -->
    <property>
        <name>dfs.namenode.http-address.hdfscluster.nn3</name>
        <value>hadoop183:9870</value>
    </property>
    <!-- Where the NameNode edit log is shared on the JournalNodes -->
    <property>
        <name>dfs.namenode.shared.edits.dir</name>
        <value>qjournal://hadoop181:8485;hadoop182:8485;hadoop183:8485/hdfscluster</value>
    </property>
    <!-- Fencing method: only one NameNode may serve clients at a time (prevents split-brain) -->
    <property>
        <name>dfs.ha.fencing.methods</name>
        <value>sshfence</value>
    </property>
    <!-- sshfence requires passwordless SSH with this private key -->
    <property>
        <name>dfs.ha.fencing.ssh.private-key-files</name>
        <value>/home/hadoop/.ssh/id_rsa</value>
    </property>
    <!-- Proxy provider the client uses to find the active NameNode -->
    <property>
        <name>dfs.client.failover.proxy.provider.hdfscluster</name>
        <value>org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider</value>
    </property>
    <!-- Disable permission checking -->
    <property>
        <name>dfs.permissions.enabled</name>
        <value>false</value>
    </property>
</configuration>
```
(1) Delete old logs and data
```bash
[hadoop@hadoop181 ~]$ xssh rm -rf $HADOOP_HOME/logs/*
[hadoop@hadoop181 ~]$ xssh rm -rf $HADOOP_HOME/data/*
```
(2) Confirm that everything was deleted; if anything is left, clean it up as well
```bash
# Check that the logs directory is empty on every node
[hadoop@hadoop181 ~]$ xssh ls -l $HADOOP_HOME/logs/
[DEBUG] 1 command is :ls -l /home/hadoop/hadoop-3.1.3/logs/
[DEBUG] 1 command is :ls -l /home/hadoop/hadoop-3.1.3/logs/
[DEBUG] ssh to hadoop181 to execute commands [ ls -l /home/hadoop/hadoop-3.1.3/logs/]
total 0
[DEBUG] ssh to hadoop182 to execute commands [ ls -l /home/hadoop/hadoop-3.1.3/logs/]
total 0
[DEBUG] ssh to hadoop183 to execute commands [ ls -l /home/hadoop/hadoop-3.1.3/logs/]
total 0
[hadoop@hadoop181 ~]$
# Check that the data directory is empty on every node
[hadoop@hadoop181 ~]$ xssh ls -l $HADOOP_HOME/data/
[DEBUG] 1 command is :ls -l /home/hadoop/hadoop-3.1.3/data/
[DEBUG] 1 command is :ls -l /home/hadoop/hadoop-3.1.3/data/
[DEBUG] ssh to hadoop181 to execute commands [ ls -l /home/hadoop/hadoop-3.1.3/data/]
total 0
[DEBUG] ssh to hadoop182 to execute commands [ ls -l /home/hadoop/hadoop-3.1.3/data/]
total 0
[DEBUG] ssh to hadoop183 to execute commands [ ls -l /home/hadoop/hadoop-3.1.3/data/]
total 0
[hadoop@hadoop181 ~]$
```
(1) Distribute the files
```bash
[hadoop@hadoop181 ~]$ xsync hadoop-3.1.3
```
(2) Start ZooKeeper on all nodes
```bash
# Start
[hadoop@hadoop181 ~]$ xssh /home/hadoop/apache-zookeeper/bin/zkServer.sh start
# Check the cluster status; on my setup all three are already running
[hadoop@hadoop181 ~]$ xssh /home/hadoop/apache-zookeeper/bin/zkServer.sh status
[DEBUG] 1 command is :/home/hadoop/apache-zookeeper/bin/zkServer.sh status
[DEBUG] ssh to hadoop181 to execute commands [ /home/hadoop/apache-zookeeper/bin/zkServer.sh status]
ZooKeeper JMX enabled by default
Using config: /home/hadoop/apache-zookeeper/bin/../conf/zoo.cfg
Client port found: 2181. Client address: localhost.
Mode: follower
[DEBUG] ssh to hadoop182 to execute commands [ /home/hadoop/apache-zookeeper/bin/zkServer.sh status]
ZooKeeper JMX enabled by default
Using config: /home/hadoop/apache-zookeeper/bin/../conf/zoo.cfg
Client port found: 2181. Client address: localhost.
Mode: leader
[DEBUG] ssh to hadoop183 to execute commands [ /home/hadoop/apache-zookeeper/bin/zkServer.sh status]
ZooKeeper JMX enabled by default
Using config: /home/hadoop/apache-zookeeper/bin/../conf/zoo.cfg
Client port found: 2181. Client address: localhost.
Mode: follower
[hadoop@hadoop181 ~]$
```
(3) Start the JournalNodes on all nodes
```bash
# Start a JournalNode on every node
[hadoop@hadoop181 ~]$ xssh $HADOOP_HOME/sbin/hadoop-daemon.sh start journalnode
# Alternative way to start them
[hadoop@hadoop181 ~]$ xssh hdfs --daemon start journalnode
# Check that a JournalNode process exists on each node
[hadoop@hadoop181 ~]$ xssh jps -l
[DEBUG] 1 command is :jps -l
[DEBUG] ssh to hadoop181 to execute commands [ jps -l]
2065 org.apache.zookeeper.server.quorum.QuorumPeerMain
3430 org.apache.hadoop.hdfs.qjournal.server.JournalNode
3487 sun.tools.jps.Jps
[DEBUG] ssh to hadoop182 to execute commands [ jps -l]
1910 org.apache.zookeeper.server.quorum.QuorumPeerMain
3014 org.apache.hadoop.hdfs.qjournal.server.JournalNode
3066 sun.tools.jps.Jps
[DEBUG] ssh to hadoop183 to execute commands [ jps -l]
1907 org.apache.zookeeper.server.quorum.QuorumPeerMain
2955 org.apache.hadoop.hdfs.qjournal.server.JournalNode
3007 sun.tools.jps.Jps
```
(4) Format the NameNode
```bash
[hadoop@hadoop181 ~]$ hdfs namenode -format
```
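Note: the bootstrapStandby step below copies the metadata from the freshly formatted nn1, which therefore has to be running. If it is not up yet, start it on hadoop181 first (an extra step, shown here for completeness):

```bash
# Start the formatted NameNode (nn1) on hadoop181 so that nn2/nn3 can bootstrap from it
[hadoop@hadoop181 ~]$ hdfs --daemon start namenode
```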
(5) Sync nn1's metadata to nn2 and nn3
```bash
[hadoop@hadoop182 ~]$ hdfs namenode -bootstrapStandby
[hadoop@hadoop183 ~]$ hdfs namenode -bootstrapStandby
```
(6) Start nn2 and nn3
```bash
# Start nn2 on the hadoop182 node
[hadoop@hadoop182 ~]$ hdfs --daemon start namenode
# Start nn3 on the hadoop183 node
[hadoop@hadoop183 ~]$ hdfs --daemon start namenode
```
(7) Start the DataNodes on all nodes
```bash
[hadoop@hadoop181 ~]$ xssh hdfs --daemon start datanode
```
(8) Make nn1 the active NameNode
```bash
# This can be run on any NameNode host
[hadoop@hadoop181 ~]$ hdfs haadmin -transitionToActive nn1
# Check whether it is active; this also works from any NameNode host, here run on hadoop183
[hadoop@hadoop183 ~]$ hdfs haadmin -getServiceState nn1
active
[hadoop@hadoop183 ~]$
```
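To see the state of all three NameNodes in one call, the following optional command can also be used:

```bash
# Lists the HA state of every configured NameNode at once
[hadoop@hadoop181 ~]$ hdfs haadmin -getAllServiceState
```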
(1) Service layout

| Service | Runs on |
| --- | --- |
| DataNode | hadoop181, hadoop182, hadoop183 |
| JournalNode | hadoop181, hadoop182, hadoop183 |
| NodeManager | hadoop181, hadoop182, hadoop183 |
| ZooKeeper | hadoop181, hadoop182, hadoop183 |
| ZKFC | hadoop181, hadoop182, hadoop183 |
| NameNode | two of the nodes |
| ResourceManager | two of the nodes |
| HistoryServer | one node |
| Secondary NameNode | one node |

(2) Edit core-site.xml
```bash
# Back up before editing
[hadoop@hadoop181 ~]$ cp $HADOOP_HOME/etc/hadoop/core-site.xml $HADOOP_HOME/etc/hadoop/no-automatic-failover.core-site.xml.back
# Edit the file
[hadoop@hadoop181 ~]$ vim $HADOOP_HOME/etc/hadoop/core-site.xml
```
Append the following:
```xml
<!-- Location of the ZooKeeper ensemble -->
<property>
    <name>ha.zookeeper.quorum</name>
    <value>hadoop181:2181,hadoop182:2181,hadoop183:2181</value>
</property>
```
(3) Edit hdfs-site.xml
```bash
[hadoop@hadoop181 ~]$ cp $HADOOP_HOME/etc/hadoop/hdfs-site.xml $HADOOP_HOME/etc/hadoop/no-automatic-failover.hdfs-site.xml.back
[hadoop@hadoop181 ~]$ vim $HADOOP_HOME/etc/hadoop/hdfs-site.xml
```
Add the following:
```xml
<!-- Enable automatic failover -->
<property>
    <name>dfs.ha.automatic-failover.enabled</name>
    <value>true</value>
</property>
```
(4) Distribute these two configuration files
```bash
[hadoop@hadoop181 ~]$ xsync $HADOOP_HOME/etc/hadoop/hdfs-site.xml
[hadoop@hadoop181 ~]$ xsync $HADOOP_HOME/etc/hadoop/core-site.xml
```
(1) First stop all HDFS services
```bash
# Stop every Hadoop-related service
[hadoop@hadoop181 ~]$ stop-all.sh
# Stop the ZooKeeper services
[hadoop@hadoop181 ~]$ xssh /home/hadoop/apache-zookeeper/bin/zkServer.sh stop
# Verify that no services are left running on any node
[hadoop@hadoop181 logs]$ xssh jps -l
[DEBUG] 1 command is :jps -l
[DEBUG] ssh to hadoop181 to execute commands [ jps -l]
21683 sun.tools.jps.Jps
[DEBUG] ssh to hadoop182 to execute commands [ jps -l]
8943 sun.tools.jps.Jps
[DEBUG] ssh to hadoop183 to execute commands [ jps -l]
9009 sun.tools.jps.Jps
```
(2) Start ZooKeeper on all nodes
```bash
# Start the ZooKeeper services
[hadoop@hadoop181 ~]$ xssh /home/hadoop/apache-zookeeper/bin/zkServer.sh start
# Check that they are up
[hadoop@hadoop181 ~]$ xssh jps -l
[DEBUG] 1 command is :jps -l
[DEBUG] ssh to hadoop181 to execute commands [ jps -l]
21745 org.apache.zookeeper.server.quorum.QuorumPeerMain
21802 sun.tools.jps.Jps
[DEBUG] ssh to hadoop182 to execute commands [ jps -l]
9058 sun.tools.jps.Jps
8999 org.apache.zookeeper.server.quorum.QuorumPeerMain
[DEBUG] ssh to hadoop183 to execute commands [ jps -l]
9065 org.apache.zookeeper.server.quorum.QuorumPeerMain
9117 sun.tools.jps.Jps
[hadoop@hadoop181 logs]$
```
(3) Initialize the HA state in ZooKeeper
```bash
[hadoop@hadoop181 ~]$ hdfs zkfc -formatZK
```
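An optional way to confirm the HA znode was created is to look into ZooKeeper; /hadoop-ha/hdfscluster is the default path for the nameservice configured above:

```bash
# Should list the hdfscluster znode under /hadoop-ha
[hadoop@hadoop181 ~]$ /home/hadoop/apache-zookeeper/bin/zkCli.sh -server hadoop181:2181 ls /hadoop-ha
```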
Start all HDFS services
```bash
[hadoop@hadoop181 ~]$ start-dfs.sh
```
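With automatic failover enabled, start-dfs.sh should also bring up a zkfc process next to each NameNode; an optional quick check:

```bash
# Look for org.apache.hadoop.hdfs.tools.DFSZKFailoverController on each NameNode host
[hadoop@hadoop181 ~]$ xssh jps -l
```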
(4) Test it
Kill the NameNode that is currently active, then check the state in ZooKeeper.
```bash
[hadoop@hadoop181 ~]$ start-dfs.sh
```
(5) If automatic failover does not kick in, check the zkfc log
In my case it was because a system utility was missing. The error message:
```
2020-09-03 16:12:20,020 INFO org.apache.hadoop.ha.SshFenceByTcpPort: Looking for process running on port 9000
2020-09-03 16:12:20,086 WARN org.apache.hadoop.ha.SshFenceByTcpPort: PATH=$PATH:/sbin:/usr/sbin fuser -v -k -n tcp 9000 via ssh: bash: fuser: command not found
2020-09-03 16:12:20,086 INFO org.apache.hadoop.ha.SshFenceByTcpPort: rc: 127
2020-09-03 16:12:20,086 INFO org.apache.hadoop.ha.SshFenceByTcpPort.jsch: Disconnecting from hadoop182 port 22
2020-09-03 16:12:20,086 WARN org.apache.hadoop.ha.NodeFencer: Fencing method org.apache.hadoop.ha.SshFenceByTcpPort(null) was unsuccessful.
2020-09-03 16:12:20,086 ERROR org.apache.hadoop.ha.NodeFencer: Unable to fence service by any configured method.
2020-09-03 16:12:20,087 WARN org.apache.hadoop.ha.ActiveStandbyElector: Exception handling the winning of election
```
Solution
```bash
# Install the package that provides the fuser command; it is needed on every host
[hadoop@hadoop181 ~]$ xssh sudo yum -y install psmisc
```
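An optional check that the command is now available everywhere:

```bash
# Each host should report a path such as /usr/sbin/fuser
[hadoop@hadoop181 ~]$ xssh which fuser
```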
(6) Test again
```bash
# See which NameNode is currently active
[hadoop@hadoop181 ~]$ hdfs haadmin -getServiceState nn3
active
[hadoop@hadoop181 ~]$ hdfs haadmin -getServiceState nn2
standby
[hadoop@hadoop181 ~]$ hdfs haadmin -getServiceState nn1
standby
[hadoop@hadoop181 ~]$
# kill -9 the NameNode process on that host
[hadoop@hadoop181 ~]$ ssh hadoop@hadoop183 "sudo kill -9 16130"
# After the kill, check which node is active; the test passed, active moved from nn3 to nn2
[hadoop@hadoop181 ~]$ hdfs haadmin -getServiceState nn1
standby
[hadoop@hadoop181 ~]$ hdfs haadmin -getServiceState nn2
active
[hadoop@hadoop181 ~]$
# kill -9 nn2's NameNode and try once more
[hadoop@hadoop181 ~]$ ssh hadoop@hadoop182 "sudo kill -9 23413"
[hadoop@hadoop181 ~]$ hdfs haadmin -getServiceState nn1
active
```
Done, that's a wrap! (PS: that WARN really burned me. The component isn't even there, couldn't you log an ERROR instead? It cost me a whole afternoon.)