分享

大数据项目之电商数仓(总结)(三):系统业务数据仓库

问题导读:
1、如何设计ODS层?
2、如何使用DWD层对ODS层数据进行判空过滤?
3、DWS层之用户行为宽表如何设计?
4、用户行为数据宽表如何导入脚本数据?


上一篇:大数据项目之电商数仓(总结)(二):系统业务数据仓库

3.3 ODS层
完全仿照业务数据库中的表字段,一模一样的创建ODS层对应表。

3.3.1 创建订单表
  1. hive (gmall)>
  2. drop table if exists ods_order_info;
  3. create external table ods_order_info (
  4.     `id` string COMMENT '订单编号',
  5.     `total_amount` decimal(10,2) COMMENT '订单金额',
  6.     `order_status` string COMMENT '订单状态',
  7.     `user_id` string COMMENT '用户id',
  8.     `payment_way` string COMMENT '支付方式',
  9.     `out_trade_no` string COMMENT '支付流水号',
  10.     `create_time` string COMMENT '创建时间',
  11.     `operate_time` string COMMENT '操作时间'
  12. ) COMMENT '订单表'
  13. PARTITIONED BY (`dt` string)
  14. row format delimited fields terminated by '\t'
  15. location '/warehouse/gmall/ods/ods_order_info/';
复制代码

3.3.2 创建订单详情表
  1. hive (gmall)>
  2. drop table if exists ods_order_detail;
  3. create external table ods_order_detail(
  4.     `id` string COMMENT '订单详情编号',
  5.     `order_id` string  COMMENT '订单号',
  6.     `user_id` string COMMENT '用户id',
  7.     `sku_id` string COMMENT '商品id',
  8.     `sku_name` string COMMENT '商品名称',
  9.     `order_price` string COMMENT '商品单价',
  10.     `sku_num` string COMMENT '商品数量',
  11.     `create_time` string COMMENT '创建时间'
  12. ) COMMENT '订单明细表'
  13. PARTITIONED BY (`dt` string)
  14. row format delimited fields terminated by '\t'
  15. location '/warehouse/gmall/ods/ods_order_detail/';
复制代码

3.3.3 创建商品表
  1. hive (gmall)>
  2. drop table if exists ods_sku_info;
  3. create external table ods_sku_info(
  4.     `id` string COMMENT 'skuId',
  5.     `spu_id` string   COMMENT 'spuid',
  6.     `price` decimal(10,2) COMMENT '价格',
  7.     `sku_name` string COMMENT '商品名称',
  8.     `sku_desc` string COMMENT '商品描述',
  9.     `weight` string COMMENT '重量',
  10.     `tm_id` string COMMENT '品牌id',
  11.     `category3_id` string COMMENT '品类id',
  12.     `create_time` string COMMENT '创建时间'
  13. ) COMMENT '商品表'
  14. PARTITIONED BY (`dt` string)
  15. row format delimited fields terminated by '\t'
  16. location '/warehouse/gmall/ods/ods_sku_info/';
复制代码

3.3.4 创建用户表
  1. hive (gmall)>
  2. drop table if exists ods_user_info;
  3. create external table ods_user_info(
  4.     `id` string COMMENT '用户id',
  5.     `name`  string COMMENT '姓名',
  6.     `birthday` string COMMENT '生日',
  7.     `gender` string COMMENT '性别',
  8.     `email` string COMMENT '邮箱',
  9.     `user_level` string COMMENT '用户等级',
  10.     `create_time` string COMMENT '创建时间'
  11. ) COMMENT '用户信息'
  12. PARTITIONED BY (`dt` string)
  13. row format delimited fields terminated by '\t'
  14. location '/warehouse/gmall/ods/ods_user_info/';
复制代码

3.3.5 创建商品一级分类表
  1. hive (gmall)>
  2. drop table if exists ods_base_category1;
  3. create external table ods_base_category1(
  4.     `id` string COMMENT 'id',
  5.     `name`  string COMMENT '名称'
  6. ) COMMENT '商品一级分类'
  7. PARTITIONED BY (`dt` string)
  8. row format delimited fields terminated by '\t'
  9. location '/warehouse/gmall/ods/ods_base_category1/';
复制代码

3.3.6 创建商品二级分类表
  1. hive (gmall)>
  2. drop table if exists ods_base_category2;
  3. create external table ods_base_category2(
  4.     `id` string COMMENT ' id',
  5.     `name` string COMMENT '名称',
  6.     category1_id string COMMENT '一级品类id'
  7. ) COMMENT '商品二级分类'
  8. PARTITIONED BY (`dt` string)
  9. row format delimited fields terminated by '\t'
  10. location '/warehouse/gmall/ods/ods_base_category2/';
复制代码


3.3.7 创建商品三级分类表
  1. hive (gmall)>
  2. drop table if exists ods_base_category3;
  3. create external table ods_base_category3(
  4.     `id` string COMMENT ' id',
  5.     `name`  string COMMENT '名称',
  6.     category2_id string COMMENT '二级品类id'
  7. ) COMMENT '商品三级分类'
  8. PARTITIONED BY (`dt` string)
  9. row format delimited fields terminated by '\t'
  10. location '/warehouse/gmall/ods/ods_base_category3/';
复制代码


3.3.8 创建支付流水表
  1. hive (gmall)>
  2. drop table if exists ods_payment_info;
  3. create external table ods_payment_info(
  4.     `id`   bigint COMMENT '编号',
  5.     `out_trade_no`    string COMMENT '对外业务编号',
  6.     `order_id`        string COMMENT '订单编号',
  7.     `user_id`         string COMMENT '用户编号',
  8.     `alipay_trade_no` string COMMENT '支付宝交易流水编号',
  9.     `total_amount`    decimal(16,2) COMMENT '支付金额',
  10.     `subject`         string COMMENT '交易内容',
  11.     `payment_type`    string COMMENT '支付类型',
  12.     `payment_time`    string COMMENT '支付时间'
  13.    )  COMMENT '支付流水表'
  14. PARTITIONED BY (`dt` string)
  15. row format delimited fields terminated by '\t'
  16. location '/warehouse/gmall/ods/ods_payment_info/';
复制代码


3.3.9 ODS层数据导入脚本
1)在/home/kgg/bin目录下创建脚本ods_db.sh
  1. [kgg@hadoop102 bin]$ vim ods_db.sh
  2.     在脚本中填写如下内容
  3. #!/bin/bash
  4.    APP=gmall
  5.    hive=/opt/module/hive/bin/hive
  6. # 如果是输入的日期按照取输入日期;如果没输入日期取当前时间的前一天
  7. if [ -n "$1" ] ;then
  8.     do_date=$1
  9. else
  10.     do_date=`date -d "-1 day" +%F`
  11. fi
  12. sql="
  13. load data inpath '/origin_data/$APP/db/order_info/$do_date' OVERWRITE into table "$APP".ods_order_info partition(dt='$do_date');
  14. load data inpath '/origin_data/$APP/db/order_detail/$do_date' OVERWRITE into table "$APP".ods_order_detail partition(dt='$do_date');
  15. load data inpath '/origin_data/$APP/db/sku_info/$do_date' OVERWRITE into table "$APP".ods_sku_info partition(dt='$do_date');
  16. load data inpath '/origin_data/$APP/db/user_info/$do_date' OVERWRITE into table "$APP".ods_user_info partition(dt='$do_date');
  17. load data inpath '/origin_data/$APP/db/payment_info/$do_date' OVERWRITE into table "$APP".ods_payment_info partition(dt='$do_date');
  18. load data inpath '/origin_data/$APP/db/base_category1/$do_date' OVERWRITE into table "$APP".ods_base_category1 partition(dt='$do_date');
  19. load data inpath '/origin_data/$APP/db/base_category2/$do_date' OVERWRITE into table "$APP".ods_base_category2 partition(dt='$do_date');
  20. load data inpath '/origin_data/$APP/db/base_category3/$do_date' OVERWRITE into table "$APP".ods_base_category3 partition(dt='$do_date');
  21. "
  22. $hive -e "$sql"
复制代码

2)增加脚本执行权限
  1. [kgg@hadoop102 bin]$ chmod 777 ods_db.sh
复制代码

3)执行脚本导入数据
  1. [kgg@hadoop102 bin]$ ods_db.sh 2019-02-10
  2. [kgg@hadoop102 bin]$ ods_db.sh 2019-02-11
复制代码

4)查询导入数据
  1. hive (gmall)>
  2. select * from ods_order_info where dt='2019-02-10' limit 1;
  3. select * from ods_order_info where dt='2019-02-11' limit 1;
复制代码



3.4 DWD层
对ODS层数据进行判空过滤。对商品分类表进行维度退化(降维)。

3.4.1 创建订单表
  1. hive (gmall)>
  2. drop table if exists dwd_order_info;
  3. create external table dwd_order_info (
  4.     `id` string COMMENT '',
  5.     `total_amount` decimal(10,2) COMMENT '',
  6.     `order_status` string COMMENT ' 1 2 3 4 5',
  7.     `user_id` string COMMENT 'id',
  8.     `payment_way` string COMMENT '',
  9.     `out_trade_no` string COMMENT '',
  10.     `create_time` string COMMENT '',
  11.     `operate_time` string COMMENT ''
  12. )
  13. PARTITIONED BY (`dt` string)
  14. stored as parquet
  15. location '/warehouse/gmall/dwd/dwd_order_info/'
  16. tblproperties ("parquet.compression"="snappy");
复制代码


3.4.2 创建订单详情表
  1. hive (gmall)>
  2. drop table if exists dwd_order_detail;
  3. create external table dwd_order_detail(
  4.     `id` string COMMENT '',
  5.     `order_id` decimal(10,2) COMMENT '',
  6.     `user_id` string COMMENT 'id',
  7.     `sku_id` string COMMENT 'id',
  8.     `sku_name` string COMMENT '',
  9.     `order_price` string COMMENT '',
  10.     `sku_num` string COMMENT '',
  11.     `create_time` string COMMENT ''
  12. )
  13. PARTITIONED BY (`dt` string)
  14. stored as parquet
  15. location '/warehouse/gmall/dwd/dwd_order_detail/'
  16. tblproperties ("parquet.compression"="snappy");
复制代码


3.4.3 创建用户表
  1. hive (gmall)>
  2. drop table if exists dwd_user_info;
  3. create external table dwd_user_info(
  4.     `id` string COMMENT 'id',
  5.     `name` string COMMENT '',
  6.     `birthday` string COMMENT '',
  7.     `gender` string COMMENT '',
  8.     `email` string COMMENT '',
  9.     `user_level` string COMMENT '',
  10.     `create_time` string COMMENT ''
  11. )
  12. PARTITIONED BY (`dt` string)
  13. stored as parquet
  14. location '/warehouse/gmall/dwd/dwd_user_info/'
  15. tblproperties ("parquet.compression"="snappy");
复制代码


3.4.4 创建支付流水表
  1. hive (gmall)>
  2. drop table if exists dwd_payment_info;
  3. create external table dwd_payment_info(
  4.     `id`   bigint COMMENT '',
  5.     `out_trade_no`    string COMMENT '',
  6.     `order_id`        string COMMENT '',
  7.     `user_id`         string COMMENT '',
  8.     `alipay_trade_no` string COMMENT '',
  9.     `total_amount`    decimal(16,2) COMMENT '',
  10.     `subject`         string COMMENT '',
  11.     `payment_tpe`    string COMMENT '',
  12.     `payment_time`    string COMMENT ''
  13.    )  
  14. PARTITIONED BY (`dt` string)
  15. stored as parquet
  16. location '/warehouse/gmall/dwd/dwd_payment_info/'
  17. tblproperties ("parquet.compression"="snappy");
复制代码


3.4.5 创建商品表(增加分类)

2021-01-12_195335.jpg

  1. hive (gmall)>
  2. drop table if exists dwd_sku_info;
  3. create external table dwd_sku_info(
  4.     `id` string COMMENT 'skuId',
  5.     `spu_id` string COMMENT 'spuid',
  6.     `price` decimal(10,2) COMMENT '',
  7.     `sku_name` string COMMENT '',
  8.     `sku_desc` string COMMENT '',
  9.     `weight` string COMMENT '',
  10.     `tm_id` string COMMENT 'id',
  11.     `category3_id` string COMMENT '1id',
  12.     `category2_id` string COMMENT '2id',
  13.     `category1_id` string COMMENT '3id',
  14.     `category3_name` string COMMENT '3',
  15.     `category2_name` string COMMENT '2',
  16.     `category1_name` string COMMENT '1',
  17.     `create_time` string COMMENT ''
  18. )
  19. PARTITIONED BY (`dt` string)
  20. stored as parquet
  21. location '/warehouse/gmall/dwd/dwd_sku_info/'
  22. tblproperties ("parquet.compression"="snappy");
复制代码


3.4.6 DWD层数据导入脚本
1)在/home/kgg/bin目录下创建脚本dwd_db.sh
  1. [kgg@hadoop102 bin]$ vim dwd_db.sh
  2.     在脚本中填写如下内容
  3. #!/bin/bash
  4. # 定义变量方便修改
  5. APP=gmall
  6. hive=/opt/module/hive/bin/hive
  7. # 如果是输入的日期按照取输入日期;如果没输入日期取当前时间的前一天
  8. if [ -n "$1" ] ;then
  9.     do_date=$1
  10. else
  11.     do_date=`date -d "-1 day" +%F`  
  12. fi
  13. sql="
  14. set hive.exec.dynamic.partition.mode=nonstrict;
  15. insert overwrite table "$APP".dwd_order_info partition(dt)
  16. select * from "$APP".ods_order_info
  17. where dt='$do_date' and id is not null;
  18. insert overwrite table "$APP".dwd_order_detail partition(dt)
  19. select * from "$APP".ods_order_detail
  20. where dt='$do_date'   and id is not null;
  21. insert overwrite table "$APP".dwd_user_info partition(dt)
  22. select * from "$APP".ods_user_info
  23. where dt='$do_date' and id is not null;
  24. insert overwrite table "$APP".dwd_payment_info partition(dt)
  25. select * from "$APP".ods_payment_info
  26. where dt='$do_date' and id is not null;
  27. insert overwrite table "$APP".dwd_sku_info partition(dt)
  28. select  
  29.     sku.id,
  30.     sku.spu_id,
  31.     sku.price,
  32.     sku.sku_name,
  33.     sku.sku_desc,
  34.     sku.weight,
  35.     sku.tm_id,
  36.     sku.category3_id,
  37.     c2.id category2_id,
  38.     c1.id category1_id,
  39.     c3.name category3_name,
  40.     c2.name category2_name,
  41.     c1.name category1_name,
  42.     sku.create_time,
  43.     sku.dt
  44. from
  45.     "$APP".ods_sku_info sku
  46. join "$APP".ods_base_category3 c3 on sku.category3_id=c3.id
  47.     join "$APP".ods_base_category2 c2 on c3.category2_id=c2.id
  48.     join "$APP".ods_base_category1 c1 on c2.category1_id=c1.id
  49. where sku.dt='$do_date'  and c2.dt='$do_date'
  50. and c3.dt='$do_date' and c1.dt='$do_date'
  51. and sku.id is not null;
  52. "
  53. $hive -e "$sql"
复制代码

2)增加脚本执行权限
  1. [kgg@hadoop102 bin]$ chmod 777 dwd_db.sh
复制代码

3)执行脚本导入数据
  1. [kgg@hadoop102 bin]$ dwd_db.sh 2019-02-10
  2. [kgg@hadoop102 bin]$ dwd_db.sh 2019-02-11
复制代码

4)查看导入数据
  1. hive (gmall)>
  2. select * from dwd_sku_info where dt='2019-02-10' limit 2;
  3. select * from dwd_sku_info where dt='2019-02-11' limit 2;
复制代码

3.4.3 小结
思考:
1)维度退化要付出什么代价?
    如果被退化的维度,还有其他业务表使用,退化后处理起来就麻烦些。
2)想想在实际业务中还有那些维度表可以退化
    城市的三级分类(省、市、县)等


3.5 DWS层之用户行为宽表
1)为什么要建宽表
需求目标,把每个用户单日的行为聚合起来组成一张多列宽表,以便之后关联用户维度信息后进行,不同角度的统计分析。

3.5.1 创建用户行为宽表
  1. hive (gmall)>
  2. drop table if exists dws_user_action;
  3. create external table dws_user_action
  4. (   
  5.     user_id          string      comment '用户 id',
  6.     order_count     bigint      comment '下单次数 ',
  7.     order_amount    decimal(16,2)  comment '下单金额 ',
  8.     payment_count   bigint      comment '支付次数',
  9.     payment_amount  decimal(16,2) comment '支付金额 ',
  10.     comment_count   bigint      comment '评论次数'
  11. ) COMMENT '每日用户行为宽表'
  12. PARTITIONED BY (`dt` string)
  13. stored as parquet
  14. location '/warehouse/gmall/dws/dws_user_action/';
复制代码

3.5.2 向用户行为宽表导入数据
1)导入数据
  1. hive (gmall)>
  2. with
  3. tmp_order as
  4. (
  5.     select
  6.         user_id,
  7. count(*)  order_count,
  8.         sum(oi.total_amount) order_amount
  9.     from dwd_order_info oi
  10.     where date_format(oi.create_time,'yyyy-MM-dd')='2019-02-10'
  11.     group by user_id
  12. ) ,
  13. tmp_payment as
  14. (
  15.     select
  16.         user_id,
  17.         sum(pi.total_amount) payment_amount,
  18.         count(*) payment_count
  19.     from dwd_payment_info pi
  20.     where date_format(pi.payment_time,'yyyy-MM-dd')='2019-02-10'
  21.     group by user_id
  22. ),
  23. tmp_comment as
  24. (
  25.     select
  26.         user_id,
  27.         count(*) comment_count
  28.     from dwd_comment_log c
  29.     where date_format(c.dt,'yyyy-MM-dd')='2019-02-10'
  30.     group by user_id
  31. )
  32. insert overwrite table dws_user_action partition(dt='2019-02-10')
  33. select
  34.     user_actions.user_id,
  35.     sum(user_actions.order_count),
  36.     sum(user_actions.order_amount),
  37.     sum(user_actions.payment_count),
  38.     sum(user_actions.payment_amount),
  39.     sum(user_actions.comment_count)
  40. from
  41. (
  42.     select
  43.         user_id,
  44.         order_count,
  45.         order_amount,
  46.         0 payment_count,
  47.         0 payment_amount,
  48.         0 comment_count
  49.     from tmp_order
  50.     union all
  51.     select
  52.         user_id,
  53.         0,
  54.         0,
  55.         payment_count,
  56.         payment_amount,
  57.         0
  58.     from tmp_payment
  59.     union all
  60.     select
  61.         user_id,
  62.         0,
  63.         0,
  64.         0,
  65.         0,
  66.         comment_count
  67.     from tmp_comment
  68. ) user_actions
  69. group by user_id;
复制代码

2)查询导入结果
  1. hive (gmall)> select * from dws_user_action;
复制代码

3.5.3 用户行为数据宽表导入脚本
1)在/home/kgg/bin目录下创建脚本dws_db_wide.sh
  1. [kgg@hadoop102 bin]$ vim dws_db_wide.sh
  2.     在脚本中填写如下内容
  3. #!/bin/bash
  4. # 定义变量方便修改
  5. APP=gmall
  6. hive=/opt/module/hive/bin/hive
  7. # 如果是输入的日期按照取输入日期;如果没输入日期取当前时间的前一天
  8. if [ -n "$1" ] ;then
  9.     do_date=$1
  10. else
  11.     do_date=`date -d "-1 day" +%F`  
  12. fi
  13. sql="
  14. with  
  15. tmp_order as
  16. (
  17.     select
  18.         user_id,
  19.         sum(oi.total_amount) order_amount,
  20.         count(*)  order_count
  21.     from "$APP".dwd_order_info  oi
  22.     where date_format(oi.create_time,'yyyy-MM-dd')='$do_date'
  23.     group by user_id
  24. )  ,
  25. tmp_payment as
  26. (
  27.     select
  28.         user_id,
  29.         sum(pi.total_amount) payment_amount,
  30.         count(*) payment_count
  31.     from "$APP".dwd_payment_info pi
  32.     where date_format(pi.payment_time,'yyyy-MM-dd')='$do_date'
  33.     group by user_id
  34. ),
  35. tmp_comment as
  36. (  
  37.     select  
  38.         user_id,
  39.         count(*) comment_count
  40.     from "$APP".dwd_comment_log c
  41.     where date_format(c.dt,'yyyy-MM-dd')='$do_date'
  42.     group by user_id
  43. )
  44. Insert overwrite table "$APP".dws_user_action partition(dt='$do_date')
  45. select
  46.     user_actions.user_id,
  47.     sum(user_actions.order_count),
  48.     sum(user_actions.order_amount),
  49.     sum(user_actions.payment_count),
  50.     sum(user_actions.payment_amount),
  51.     sum(user_actions.comment_count)
  52. from
  53. (
  54.     select
  55.         user_id,
  56.         order_count,
  57.         order_amount,
  58.         0 payment_count,
  59.         0 payment_amount,
  60.         0 comment_count
  61.     from tmp_order
  62.     union all
  63.     select
  64.         user_id,
  65.         0,
  66.         0,
  67.         payment_count,
  68.         payment_amount,
  69.         0
  70.     from tmp_payment
  71.     union all
  72.     select
  73.         user_id,
  74.         0,
  75.         0,
  76.         0,
  77.         0,
  78.         comment_count
  79.     from tmp_comment
  80. ) user_actions
  81. group by user_id;
  82. "
  83. $hive -e "$sql"
复制代码

2)增加脚本执行权限
  1. [kgg@hadoop102 bin]$ chmod 777 dws_db_wide.sh
复制代码

3)执行脚本导入数据
  1. [kgg@hadoop102 bin]$ dws_db_wide.sh 2019-02-11
复制代码

4)查看导入数据
  1. hive (gmall)>
  2. select * from dws_user_action where dt='2019-02-11' limit 2;
复制代码

最新经典文章,欢迎关注公众号


没找到任何评论,期待你打破沉寂

您需要登录后才可以回帖 登录 | 立即注册

本版积分规则

关闭

推荐上一条 /5 下一条