常见SQL语句编写
# 常用语法
# 排名函数
DENSE_RANK()。如果使用 DENSE_RANK() 进行排名会得到:1,1,2,3,4。
RANK()。如果使用 RANK() 进行排名会得到:1,1,3,4,5。
ROW_NUMBER()。如果使用 ROW_NUMBER() 进行排名会得到:1,2,3,4,5。
# 求连续3天都有销售量的销售人员名单。
select a.sales
from t1 a
left join t1 b on a.sales=b.sales
and datediff(a.date,1)=b.date
left join t1 c on b.sales=b.sales
and datediff(b.date,1)=c.date
where c.sales is not null
# 求部门工资最高的员工。
select Department.Name as 'Department', Employee.Name as 'Employee', Salary
from
Department join Employee on Employee.DepartmentId = Department.Id
where
(Employee.DepartmentId, Salary) in
(
select DepartmentId, max(Salary) as Salary
from Employee
group by DepartmentId
);
# 求部门工资前3高的员工。
/**
解题思路:先对Employee表进行部门分组工资排名,再关联Department表查询部门名称,再使用WHERE筛选出排名小于等于3的数据(也就是每个部门排名前3的工资)。
**/
select b.Name as Department, e2.Name as Employee, e2.Salary
from
Department b join
(
select DepartmentId, Name, Salary, dense_rank() over(partition by DepartmentId order by Salary desc) as rankId
from Employee
) as e2
on e2.DepartmentId = b.Id
where e2.rankId <= 3;
# 要求使用SQL统计出每个用户的累积访问次数?
我们有如下的用户访问数据
userId visitDate visitCount
u01 2017/1/21 5
u02 2017/1/23 6
u03 2017/1/22 8
u04 2017/1/20 3
u01 2017/1/23 6
u01 2017/2/21 8
U02 2017/1/23 6
U01 2017/2/22 4
要求使用SQL统计出每个用户的累积访问次数,如下表所示:
用户id 月份 小计 累积
u01 2017-01 11 11
u01 2017-02 12 23
u02 2017-01 12 12
u03 2017-01 8 8
u04 2017-01 3 3
SELECT t2.userid,
t2.visitmonth,
subtotal_visit_cnt,
sum(subtotal_visit_cnt) over (partition BY userid ORDER BY visitmonth) AS total_visit_cnt
FROM
(SELECT userid,
visitmonth,
sum(visitcount) AS subtotal_visit_cnt
FROM
(SELECT userid,
date_format(regexp_replace(visitdate,'/','-'),'yyyy-MM') AS visitmonth,
visitcount
FROM test_sql.test1) t1
GROUP BY userid, visitmonth) t2
ORDER BY t2.userid, t2.visitmonth;
# 注意sum() over ()的使用,这类似于累加的作用
# 求每个店铺访问次数top3的访客信息。
输出店铺名称、访客id、访问次数。注:有50W个京东店铺,每个顾客访客访问任何一个店铺的任何一个商品时都会产生一条访问日志,访问日志存储的表名为Visit,访客的用户id为user_id,被访问的店铺名称为shop。
# 首先以店铺和访客id进行分组求出用户在各个店铺的访问次数
# 接着对每个店铺的按照访问次数访客进行分组排序
# 最后取top3的访客信息
select t2.userid, t2.shop
from (
select v1.userid, v1.shop, rank() over (partition by shop order by cnt desc) as ranks
from (
select userid, shop, count(*) as cnt
from visit
group by userid, shop
) as v1
) as t2
where t2.ranks <=3;
# 2017年每个月的订单数、用户数、总成交金额
已知一个表STG.ORDER,有如下字段:Date,Order_id,User_id,amount。
数据样例:2017-01-01,10029028,1000003251,33.57。
select t1.mondate, count(t1.order_id) as orderCount, count(t1.user_id) as userCount, sum(t1.amount) as totalAmount
from (
select date_format(date, 'yyyy-MM') as mondate, order_id, user_id, amount
from order
where date_format(date, 'yyyy') = '2017'
) as t1
group by t1.mondate;
# 求得所有用户和活跃用户的总数及平均年龄。
(活跃用户指连续两天都有访问记录的用户)
CREATE TABLE log(
dt string,
user_id string,
age int)
ROW format delimited fields terminated BY ',';
# 求活跃用户的总数及平均年龄
# 首先按照日期和用户分组,并且根据用户进行分区区内排序
select dt, user_id, min(age) as age, row_number() over (partition by user_id order by dt) as ranking
from log
group by dt, user_id
# 接着求出日期和排名的差异
select t1.user_id, t1.age, date_sub(t1.dt, ranking) as flag
from t1
# 以用户和flag分组,将连续登陆的用户结合在一起,求出连续两天登陆的用户
select t2.user_id, min(t2.age) as age
from t2
group by t2.user_id, t2.flag
having count(*) >= 2
# 过滤掉可能在不同时间段连续登陆的用户
select t3.user_id, min(t3.age) as age
from t3
group by t3.user_id
# union all合并所有用户和活跃用户
# 所有用户中在今年10月份第一次购买商品的金额
表ordertable 字段: ( 购买用户:userid,金额:money,购买时间:paymenttime ( 格式:2017-10-01 ),订单id:orderid
select userid, money, orderid, paymenttime
from (
select userid, money, orderid, paymenttime, rank() over(partition by userid order by paymenttime) as ranking
from ordertable
where date_format(paymenttime, 'yyyy-MM') == '2021-10'
) as t1
where t1.ranking = 1;
# 求11月9号下午14点(14-15点),访问/api/user/login接口的top10的ip地址
有一个线上服务器访问日志格式如下(用sql答题)
时间 接口 ip地址
2016-11-09 14:22:05 /api/user/login 110.23.5.33
2016-11-09 14:23:10 /api/user/detail 57.3.2.16
2016-11-09 15:59:40 /api/user/login 200.6.5.166
select ip
from (
select ip, count(*) as cnt
from log
where date_format(dat, 'MM') == '11' and date_format(dat, 'dd') == '09' and date_format(dat, 'hh') == '14' and inter = '/api/user/login'
group by ip
) as t1
order by cnt desc
limit 10;
# 查询充值日志表2019年01月02号每个区组下充值额最大的账号
有一个充值日志表credit_log,字段如下:
dist_id
int '区组id',
account
string '账号',
money
int '充值金额',
create_time
string '订单时间'
请写出SQL语句,查询充值日志表2019年01月02号每个区组下充值额最大的账号,要求结果: 区组id,账号,金额,充值时间
select dist_id, account, money, create_time
from (
select dist_id, account, money, create_time, rank() over(partition by dist_id order by money desc) as ranking
from (
select dist_id, account, sum(money) as money, create_time
from credit_log
where date_format(create_time, 'yyyy-MM-dd') = '20190102'
group by disi_id, account # 这里根据区号和账号分组,求出某个账号同时在这一天充值多次的和
) as t1
) as t2
where t2.ranking = 1;
(8)考虑到数据安全的需要,需定时将“借阅记录”中数据进行备份,请使用一条SQL语句,在备份用户bak下创建与“借阅记录”表结构完全一致的数据表BORROW_LOG_BAK.井且将“借阅记录”中现有数据全部复制到BORROW_L0G_ BAK中。 (9)现在需要将原Oracle数据库中数据迁移至Hive仓库,请写出“图书”在Hive中的建表语句(Hive实现,提示:列分隔符|;数据表数据需要外部导入:分区分别以month_part、day_part 命名) (10)Hive中有表A,现在需要将表A的月分区 201505 中 user_id为20000的user_dinner字段更新为bonc8920,其他用户user_dinner字段数据不变,请列出更新的方法步骤。(Hive实现,提示:Hlive中无update语法,请通过其他办法进行数据更新)