1.数据去重
distinct
group by
row_number()over()
2.分组排序
hive
colomn1. colomn2
2020-11-05 a
2020-11-06 b
2020-11-07 a
2020-11-08 c
2020-11-09 b
select colomn1,colomn2,row_number() over(partition by colomn2 order by colomn1 ) rank from tmp.tmp1177
a 2020-11-05 1
a 2020-11-07 2
b 2020-11-06 1
b 2020-11-09 2
c 2020-11-08 1
mysql
SELECT colomn1, colomn2, if(@colomn2 = colomn2, @i := @i+1,@i :=1) rk, @colomn2 := colomn2 FROM tmp.tmp_1117 a ,
(SELECT @i := 1)b
ORDER BY colomn1,colomn2
a 2020-11-05 1
a 2020-11-07 2
b 2020-11-06 1
b 2020-11-09 2
c 2020-11-08 1
3.分组聚合
group by
with cube
hive中可使用
SELECT dt,
city,
count(DISTINCT udid) AS uv
FROM analysis.tmp_gmm_0910
GROUP BY dt,
city WITH CUBE
20200101 . a1 . 100
20200101 . a2 . 200
20200101 . b1 . 300
20200101 . b2 . 400
20200102 . a1 . 100
20200102 . a2 . 200
20200102 . b1 . 300
20200102 . b2 . 400
20200101 . a . 300
20200101 . b . 700
20200102 . a . 300
20200102 . b . 700
20200101 . null 1000
20200102 . null 1000
null . null 2000
4.获取偏移量
lead()over 获取下一个元素
lag()over 获取上一个元素
lead(field, num, default,value) field 需要查找的字段,num 往后查找的 num
行的数据,defaultvalue 没有符合条件的默认值
a
udid . time
udid . 8:20
udid . 8:21
udid . 8.22
select udid, time ,time1,time1-time
(select udid,time,lead(time,1,0) over(partition by udid) time1
from a )
where time1 !=0
udid . 8:20 . 8.21 0.01
udid . 8:21 . 8.22 0.01
udid . 8.22 . 0 8.22
5.计算日期
计算相隔天数datediff
日期格式:yyyy-mm-dd
大日期在前,小日期在后
select datediff('2020-09-01','2020-08-31')
1
计算往前或者往后多少天date_add,date_sub
current_date():2020.11.06
hive:
select date_add(current_date(),1)
2020-11-17
select date_sub(current_date(),1)
2020-11-15
select date_add(current_date(),-1)
2020-11-15
mysql:
select date_add(current_date(),interval 1 DAY)
2020-11-17
select date_sub(current_date(),interval 1 DAY)
2020-11-15
select date_add(current_date(),interval -1 DAY)
6.日期格式转换
yyyy-mm-dd与yyyymmdd转换
hive或者impala或者spark通用
select from_unixtime(unix_timestamp('20201101','yyyyMMdd'),'yyyy-MM-dd')
2020-11-01
select from_unixtime(unix_timestamp('2020-11-01','yyyy-MM-dd'),'yyyyMMdd')
20201101
mysql使用
select dt,date_format(str_to_date(dt, '%Y%m%d'),'%Y-%m-%d')
20201108 2020-11-08
时间戳与日期格式转换
hive, spark,impala可用,mysql不可
select to_date(time),time from tmp.tmp_1117 limit 4
2020-10-29 2020-10-29 11:54:56
2020-10-29 2020-10-29 11:54:56
2020-10-29 2020-10-29 11:54:55
2020-10-29 2020-10-29 11:54:55
得到年,月,日,时,分,秒,星期,当年第几周
select weekofyear('2020-11-06')
45
select dayofweek('2020-11-06')
6
select month('2020-11-06')
11
select year('2020-11-06')
2020
select day('2020-11-06')
6
select hour('2020-11-06 22:35:23')
22
select minute('2020-11-06 22:35:23'))
35
select second('2020-11-06 22:35:23')
23
7.截取固定字符串之前或者之后的数据
split_part/substring_index
hive/spark:
h5_url='https://www.xxx.com/xxx/xxx/index.html?/fdsf/dfds&sdf/dfsd'
select substring_index(h5_url,'?',1)
https://www.xxx.com/xxx/xxx/index.html
impala:
select split_part(h5_url,'?',1)
https://www.xxx.com/xxx/xxx/index.html
8.替换某个字符
replace
select replace(split_part(h5_url,'?',1),'www','xxx')
https://xxx.xxx.com/xxx/xxx/index.html
9.模糊查询
like
select* from tmp.tmp_1108 where sdk like '%金额%'
rlike
select* from tmp.tmp_1108 where sdk rlike '金额'
10.拼接字符
concat
select concat('今天是',dt)
今天是2020-11-07
11.解析url
parse_url
如果原url格式正确可直接解析,不需要拼接,如果原url缺少前面的请求头,则需要拼接
select parse_url(concat('http://www.xx.cn',url), 'QUERY', 'enter_referer') AS entrance from tmp.tmp1109 limit 10
版权声明:本文为qq_45802130原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。