使环境变量生效或者
[root@node09 06_hive]# source /etc/profile.d/my_env.sh
6)解决日志Jar包冲突,进入/opt/module/hive/lib目录
[root@node09 lib]# mv log4j-slf4j-impl-2.10.0.jar log4j-slf4j-impl-2.10.0.jar.bak
将MySQL的JDBC驱动拷贝到Hive的lib目录下
[root@node09 07_mysql]# cp mysql-connector-java-5.1.48.jar /opt/module/hive/lib/
在$HIVE_HOME/conf目录下新建hive-site.xml文件
[root@node09 conf]# vim hive-site.xml
添加如下内容
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
    <property>
        <name>javax.jdo.option.ConnectionURL</name>
        <value>jdbc:mysql://node09:3306/metastore?useSSL=false</value>
    </property>
    <property>
        <name>javax.jdo.option.ConnectionDriverName</name>
        <value>com.mysql.jdbc.Driver</value>
    </property>
    <property>
        <name>javax.jdo.option.ConnectionUserName</name>
        <value>root</value>
    </property>
    <property>
        <name>javax.jdo.option.ConnectionPassword</name>
        <value>000000</value>
    </property>
    <property>
        <name>hive.metastore.warehouse.dir</name>
        <value>/user/hive/warehouse</value>
    </property>
    <property>
        <name>hive.metastore.schema.verification</name>
        <value>false</value>
    </property>
    <property>
        <name>hive.metastore.uris</name>
        <value>thrift://node09:9083</value>
    </property>
    <property>
        <name>hive.server2.thrift.port</name>
        <value>10000</value>
    </property>
    <property>
        <name>hive.server2.thrift.bind.host</name>
        <value>node09</value>
    </property>
    <property>
        <name>hive.metastore.event.db.notification.api.auth</name>
        <value>false</value>
    </property>
    <property>
        <name>hive.cli.print.header</name>
        <value>true</value>
    </property>
    <property>
        <name>hive.cli.print.current.db</name>
        <value>true</value>
    </property>
</configuration>
hiveservices.sh脚本内容如下:
#!/bin/bash
HIVE_LOG_DIR=$HIVE_HOME/logs
mkdir -p $HIVE_LOG_DIR
#检查进程是否运行正常,参数1为进程名,参数2为进程端口
function check_process() {
    pid=$(ps -ef 2>/dev/null | grep -v grep | grep -i $1 | awk '{print $2}')
    ppid=$(netstat -nltp 2>/dev/null | grep $2 | awk '{print $7}' | cut -d '/' -f 1)
    echo $pid
    [[ "$pid" =~ "$ppid" ]] && [ "$ppid" ] && return 0 || return 1
}
function hive_start() {
    metapid=$(check_process HiveMetastore 9083)
    cmd="nohup hive --service metastore >$HIVE_LOG_DIR/metastore.log 2>&1 &"
    cmd=$cmd" sleep 4; hdfs dfsadmin -safemode wait >/dev/null 2>&1"
    [ -z "$metapid" ] && eval $cmd || echo "Metastroe服务已启动"
    server2pid=$(check_process HiveServer2 10000)
    cmd="nohup hive --service hiveserver2 >$HIVE_LOG_DIR/hiveServer2.log 2>&1 &"
    [ -z "$server2pid" ] && eval $cmd || echo "HiveServer2服务已启动"
}
function hive_stop() {
    metapid=$(check_process HiveMetastore 9083)
    [ "$metapid" ] && kill $metapid || echo "Metastore服务未启动"
    server2pid=$(check_process HiveServer2 10000)
    [ "$server2pid" ] && kill $server2pid || echo "HiveServer2服务未启动"
}
case $1 in
"start")
    hive_start
    ;;
"stop")
    hive_stop
    ;;
"restart")
    hive_stop
    sleep 2
    hive_start
    ;;
"status")
    check_process HiveMetastore 9083 >/dev/null && echo "Metastore服务运行正常" || echo "Metastore服务运行异常"
    check_process HiveServer2 10000 >/dev/null && echo "HiveServer2服务运行正常" || echo "HiveServer2服务运行异常"
    ;;
*)
    echo Invalid Args!
    echo 'Usage: '$(basename $0)' start|stop|restart|status'
    ;;
esac
3)添加执行权限
[root@node09 bin]# chmod +x hiveservices.sh
4)启动Hive后台服务
[root@node09 bin]# hiveservices.sh start
5)查看Hive后台服务运行情况
[root@node09 bin]# hiveservices.sh status
Metastore服务运行正常
HiveServer2服务运行异常
(说明:HiveServer2启动较慢,在其开始监听10000端口之前,status会显示"运行异常",稍等片刻后再次查看即可。)
6)启动Hive客户端
[root@node09 bin]# hive
官方文档 https://cwiki.apache.org/confluence/display/Hive/Hive+on+Spark%3A+Getting+Started#:~:text=Version%20Compatibility%20%20%20%20Hive%20Version%20,%20%201.6.0%20%204%20more%20rows%20
1)从官网下载Spark源码并解压
下载地址: https://www.apache.org/dyn/closer.lua/spark/spark-2.4.5/spark-2.4.5.tgz
2)上传并解压spark
3)进入spark解压后的目录
4)执行编译命令
[root@node09 spark-2.4.5]# ./dev/make-distribution.sh --name without-hive --tgz -Pyarn -Phadoop-3.1 -Dhadoop.version=3.1.3 -Pparquet-provided -Porc-provided -Phadoop-provided
5)等待编译完成,spark-2.4.5-bin-without-hive.tgz为最终文件
在/etc/profile.d/my_env.sh中添加如下内容
export SPARK_HOME=/opt/module/spark
export PATH=$PATH:$SPARK_HOME/bin
source 使其生效
[root@node09 spark]# source /etc/profile.d/my_env.sh
3)配置spark运行环境
[root@node09 spark]# mv /opt/module/spark/conf/spark-env.sh.template /opt/module/spark/conf/spark-env.sh
[root@node09 spark]# vim /opt/module/spark/conf/spark-env.sh
添加如下内容
export SPARK_DIST_CLASSPATH=$(hadoop classpath)
4)连接spark jar包到hive,如果hive中已存在则跳过
[root@node09 spark]# ln -s /opt/module/spark/jars/scala-library-2.11.12.jar /opt/module/hive/lib/scala-library-2.11.12.jar
[root@node09 spark]# ln -s /opt/module/spark/jars/spark-core_2.11-2.4.5.jar /opt/module/hive/lib/spark-core_2.11-2.4.5.jar
[root@node09 spark]# ln -s /opt/module/spark/jars/spark-network-common_2.11-2.4.5.jar /opt/module/hive/lib/spark-network-common_2.11-2.4.5.jar
5)新建spark配置文件
[root@node09 spark]# vim /opt/module/hive/conf/spark-defaults.conf
添加如下内容
spark.master yarn
spark.eventLog.enabled true
spark.eventLog.dir hdfs://hadoop102:8020/spark-history
spark.driver.memory 2g
spark.executor.memory 2g
(注意:本文其他步骤使用的主机名均为node09,此处以及下文spark.yarn.jars中的hadoop102:8020应替换为本集群NameNode的实际地址。)
6)在HDFS创建如下路径
[root@node09 spark]# hadoop fs -mkdir /spark-history
7)上传Spark依赖到HDFS
[root@node09 spark]# hadoop fs -mkdir /spark-jars
[root@node09 spark]# hadoop fs -put /opt/module/spark/jars/* /spark-jars
8)修改hive-site.xml
<!--Spark依赖位置-->
<property>
    <name>spark.yarn.jars</name>
    <value>hdfs://hadoop102:8020/spark-jars/*</value>
</property>
<!--Hive执行引擎-->
<property>
    <name>hive.execution.engine</name>
    <value>spark</value>
</property>
9)Hive on Spark 测试
a. 启动hive客户端
b. 创建一张测试表
hive (default)> create external table student(id int, name string) location '/student';
c. 通过insert测试效果
hive (default)> insert into table student values(1,'abc');
默认Yarn的配置下,容量调度器只有一条Default队列。在capacity-scheduler.xml中可以配置多条队列,修改以下属性,增加hive队列。
<property>
    <name>yarn.scheduler.capacity.root.queues</name>
    <value>default,hive</value>
    <description>
      The queues at this level (root is the root queue).
    </description>
</property>
<property>
    <name>yarn.scheduler.capacity.root.default.capacity</name>
    <value>50</value>
    <description>
      default队列的容量为50%
    </description>
</property>
同时为新加队列添加必要属性:
<property>
    <name>yarn.scheduler.capacity.root.hive.capacity</name>
    <value>50</value>
    <description>
      hive队列的容量为50%
    </description>
</property>
<property>
    <name>yarn.scheduler.capacity.root.hive.user-limit-factor</name>
    <value>1</value>
    <description>
      一个用户最多能够获取该队列资源容量的比例
    </description>
</property>
<property>
    <name>yarn.scheduler.capacity.root.hive.maximum-capacity</name>
    <value>80</value>
    <description>
      hive队列的最大容量
    </description>
</property>
<property>
    <name>yarn.scheduler.capacity.root.hive.state</name>
    <value>RUNNING</value>
</property>
<property>
    <name>yarn.scheduler.capacity.root.hive.acl_submit_applications</name>
    <value>*</value>
    <description>
      访问控制,控制谁可以将任务提交到该队列
    </description>
</property>
<property>
    <name>yarn.scheduler.capacity.root.hive.acl_administer_queue</name>
    <value>*</value>
    <description>
      访问控制,控制谁可以管理(包括提交和取消)该队列的任务
    </description>
</property>
<property>
    <name>yarn.scheduler.capacity.root.hive.acl_application_max_priority</name>
    <value>*</value>
    <description>
      访问控制,控制用户可以提交到该队列的任务的最大优先级
    </description>
</property>
<property>
    <name>yarn.scheduler.capacity.root.hive.maximum-application-lifetime</name>
    <value>-1</value>
    <description>
      hive队列中任务的最大生命时长
    </description>
</property>
<property>
    <name>yarn.scheduler.capacity.root.hive.default-application-lifetime</name>
    <value>-1</value>
    <description>
      hive队列中任务的默认生命时长
    </description>
</property>
为方便后续hive客户端的测试和shell脚本中的任务能同时执行,我们将hive客户端的测试任务提交到hive队列,让shell脚本中的任务使用默认值,提交到default队列。
每次进入hive客户端时,执行以下命令
hive (default)> set mapreduce.job.queuename=hive;