hive-创建数据库-创建表--hive版本3.1.2

tech2022-08-04 214

1. 数据库

1.1 创建数据库

create database if not exists database_name;

1.2 查看所有的数据库

show databases;

1.3 切换数据库

use database_name;

1.4 查看当前所使用的数据库

select current_database();

1.5 查看数据库信息

describe database extended database_name;

1.6 删除数据库

drop database database_name; 如果数据库不为空,可以使用cascade强制删除 drop database database_name cascade;

2. 表

2.1 创建表

2.1.1 常用数据类型

int bigint(long) double string timestamp(时间类型)

2.1.2 数据类型转换

cast(字段 as 数据类型) 如果无法转换返回null

2.1.3 语法

CREATE [EXTERNAL] TABLE [IF NOT EXISTS] table_name
  [(col_name data_type [COMMENT col_comment], ...)]
  [COMMENT table_comment]
  [PARTITIONED BY (col_name data_type [COMMENT col_comment], ...)]   -- 分区
  [CLUSTERED BY (col_name, col_name, ...)                            -- 分桶
    [SORTED BY (col_name [ASC|DESC], ...)] INTO num_buckets BUCKETS]
  [ROW FORMAT row_format]   -- 例如: row format delimited fields terminated by '分隔符'
  [STORED AS file_format]
  [LOCATION hdfs_path]

2.1.4 内部表(管理表)和外部表

内部表 : 默认创建的表都是内部表 , 删除表时hive会删除表中的数据(即使是location指向数据位置)

外部表 : external , location指向数据的位置 , 删除表不会删除数据 , 用于共享数据

create external table table_name(
  ...
)
row format delimited fields terminated by ','   -- 格式化数据,分隔符为 ,
location '...';

2.1.5 静态分区表

hive中数据存储在表的目录下 , 查询数据时会加载所有的文件

为了提高效率 , 对表进行分区 , 一个分区就是一个目录例如 : 同一天的数据放在一个文件夹下 , 要查询这天的数据时只需要加载这一天文件夹下的文件即可

静态一级分区

create table tb_static_partition_1(
  id int ,
  ctime string ,
  name string
)
partitioned by (dt string)
row format delimited fields terminated by "," ;

load data local inpath "/doit17/log/2020-09-02_01.log" into table tb_static_partition_1 partition(dt='2020-09-02') ;
load data local inpath "/doit17/log/2020-09-02_02.log" into table tb_static_partition_1 partition(dt='2020-09-02') ;
load data local inpath "/doit17/log/2020-09-01_01.log" into table tb_static_partition_1 partition(dt='2020-09-01') ;
load data local inpath "/doit17/log/2020-09-01_02.log" into table tb_static_partition_1 partition(dt='2020-09-01') ;
load data local inpath "/doit17/log/2020-08-31_01.log" into table tb_static_partition_1 partition(dt='2020-08-31') ;
load data local inpath "/doit17/log/2020-08-31_02.log" into table tb_static_partition_1 partition(dt='2020-08-31') ;

例如: 查询2020-08-31这天的数据 , 只加载这一个目录下的数据

select * from tb_static_partition_1 where dt = '2020-08-31';

静态二级分区

create table tb_static_partition_2( id int , ctime string , name string ) partitioned by (mt string , dt string) row format delimited fields terminated by "," ; load data local inpath "/doit17/log/2020-09-02_01.log" into table tb_static_partition_2 partition(mt='09' , dt='02') ; load data local inpath "/doit17/log/2020-09-02_02.log" into table tb_static_partition_2 partition(mt='09' , dt='02') ; load data local inpath "/doit17/log/2020-09-01_01.log" into table tb_static_partition_2 partition(mt='09' , dt='01') ; load data local inpath "/doit17/log/2020-09-01_02.log" into table tb_static_partition_2 partition(mt='09' , dt='01') ; load data local inpath "/doit17/log/2020-08-31_01.log" into table tb_static_partition_2 partition(mt='08' , dt='31') ; load data local inpath "/doit17/log/2020-08-31_02.log" into table tb_static_partition_2 partition(mt='08' , dt='31') ; 查询时可以只查询mt 也可以查询mt和dt

也可以根据年月日静态三级分区

2.1.6 删除和添加分区

删除分区

alter table tb_static_partition_1 drop partition(dt='') , partition(dt='');

添加分区

alter table tb_static_partition_1 add partition(dt='');
alter table tb_static_partition_1 add partition(dt='') partition(dt='');

二级分区表添加分区时需要同时指定所有分区字段 , 例如 :

alter table tb_static_partition_2 add partition(mt='' , dt='');

2.1.7 动态分区表

按照表中某个字段的值进行分区

create table tb_addr_dynamic_partition(
  id int ,
  name string ,
  address string
)
partitioned by(addr string) ;   -- 按照地址分区

开启动态分区

set hive.exec.dynamic.partition=true;   -- 使用动态分区
set hive.exec.dynamic.partition.mode=nonstrict;   -- 无限制模式，如果模式是strict，则必须有一个静态分区且放在最前面
set hive.exec.max.dynamic.partitions.pernode=10000;   -- 每个节点生成动态分区的最大个数
set hive.exec.max.dynamic.partitions=100000;   -- 生成动态分区的最大个数
set hive.exec.max.created.files=150000;   -- 一个任务最多可以创建的文件数目
set dfs.datanode.max.xcievers=8192;   -- 限定一次最多打开的文件数
set hive.merge.mapfiles=true;   -- map端的结果进行合并
set mapred.reduce.tasks=20000;   -- 设置reduce task个数

将查询到的数据插入表中并按照address分区

insert into table tb_addr_dynamic_partition partition(addr)
select id,name,address,address from tb_addr ;

2.1.8 查询表结构

desc table_name; desc formatted table_name;

2.1.9 创建分桶表

create table tb_bucket(
  sid string ,
  name string
)
clustered by(sid)    -- 按照sid分桶
into 4 buckets       -- 分4个桶 , 实际为4个文件
row format delimited fields terminated by '\t';

set hive.enforce.bucketing=true;   -- 开启分桶 (仅Hive 1.x需要 ; Hive 2.0起该参数已移除 , 始终强制分桶 , 在3.1.2中无需设置)
set mapreduce.job.reduces=-1;      -- 默认reduce个数

insert into table tb_bucket select sid,name from tb_name2;   -- 查询tb_name2的数据导入tb_bucket

抽样查询

select * from tb_bucket tablesample(bucket 1 out of 4 on sid); ---数据分为4个桶 , 查询第一个桶中的数据 ---不是直接查询表的4个文件 ---查询到的数据分布不是很均匀

最新回复(0)