讨论一个比较有意思的业务需求

业务需求描述：

有一个用户登录表，用户每次登陆时，就会向这个表中插入一条数据，这个表记录了用户的用户ID和登录时间，表的数据量有几千万，

现在需要求出从今天开始算，用户持续登录的时间（也就是用户今天登陆了，昨天也登陆了，但是前天没有登录，那用户的持续登录时间就

是一天）。

分析：

看似蛮简单的一需求，在数据库里面实际操作起来不是那么简单的，并非一个简单的Select能够搞定的，从业务的描述我们起码可以得到如下

的分析结论：

1. 业务需要统计这样的数据，应该并不需要实时的数据，所以我们可以获取某个时间的快照数据来做计算；

2. 表数据量比较大，如果直接在这个表上操作，势必会对产品的使用造成影响（因为每个用户登录时，都需要再往里面插数据的）；

3. 需要统计每个用户的持续登录时间，那意味着如果没有持续登录时间的用户就是不需要的用户，这里面应该可以筛选掉一大批用户；

4. 每个用户都需要做统计计算，这个肯定是一个循环计算的过程，我们最好前期能过滤掉一部分用户，那后面的统计计算无疑可以节省很多的时间；

5. 用户持续登录，一般时间不可能很长（很少有人天天去登陆一个网站，持续100天的吧），意味着我们可以用持续天数来做为循环条件，

而不必以用户来作为循环条件（用户做循环条件的话，循环次数应该会比较大）；

6. 大数据量做统计，而且是定位到每个用户的，性能是必须要重点考虑的因素；

造测试数据：

测试数据其实有一个比较难的要求是能够尽量的接近真实数据，这样的测试效果才是最好的；我们预计造一个2千5百万的表，造几十万的用户，然后随机

的生成登陆时间，但是要求登录时间尽量能贴合真实的情况；

我们先创建测试表（其实最好是分区表）：

 --  Create Test Table  
  create   table  dbo.UserLoginInfo
(
     userid  int 
    ,logintime  datetime 
    , CONSTRAINT   [  PK_UserInfo  ]   PRIMARY   KEY   CLUSTERED  
    ( 
         userid  ASC 
        ,logintime  ASC  
    ) WITH  
    (
         PAD_INDEX   =   OFF 
        ,STATISTICS_NORECOMPUTE   =   OFF 
        ,IGNORE_DUP_KEY  =   OFF 
        ,ALLOW_ROW_LOCKS   =   ON 
        ,ALLOW_PAGE_LOCKS   =   ON 
        , FILLFACTOR   =   90 
    )  ON   [  PRIMARY  ]  
)  ON   [  PRIMARY  ] 

 --  DST Table  
  create   table  dbo.DST_UserLoginInfo
(
     userid  int 
    ,logindate  varchar ( 10 )
    ,ContinueDays  smallint 
    ,IsOver  bit 
    , CONSTRAINT   [  PK_UserLoginInfo  ]   PRIMARY   KEY   CLUSTERED  
    ( 
         userid  ASC 
        ,logindate  ASC  
    ) WITH  
    (
         PAD_INDEX   =   OFF 
        ,STATISTICS_NORECOMPUTE   =   OFF 
        ,IGNORE_DUP_KEY  =   OFF 
        ,ALLOW_ROW_LOCKS   =   ON 
        ,ALLOW_PAGE_LOCKS   =   ON 
        , FILLFACTOR   =   90 
    )  ON   [  PRIMARY  ]  
)  ON   [  PRIMARY  ]

复制代码

说明：我们创建了两个表，一个是模拟真实的用户登录数据的表，另外一个是我们准备做数据统计的表；

我们按时间由远到近，分几个批次来生成测试数据：

 /*  -----------造二千五百万用户登录记录的过程---------------------  */ 
 --  第一批次  
  set  nocount  on 
 go 
 --  1000*5000=500W  
  declare   @u_count   int 
 set   @u_count  =  0 
 --  1000  
  while   @u_count  <  1000 
 begin 
 declare   @userid   int , @count   int 
 set   @userid  =   rand () *  100000 
 set   @count  =  0 
 --  5000  
  while   @count  <  5000 
 begin 
 insert   into  dbo.UserLoginInfo(userid,logintime)
 --  4000  
  select   @userid , CAST ( convert ( varchar ( 10 ), dateadd (d, abs (checksum( newid ()) %  4000 ), getdate () -  4000 ), 120 ) +  '   '  +   str ( 5  +  abs (checksum( newid ()) %  6 ), 2 ) +  '  :  '  +  replace ( str ( abs (checksum( newid ()) %  60 ), 2 ), '  ' , '  0  ' ) +  '  :  '  +  replace ( str ( abs (checksum( newid ()) %  60 ), 2 ), '  ' , '  0  ' )  as   datetime )
 set   @count  =  @count  +  1 
 end 
 set   @u_count  =  @u_count  +  1 
 end 
 go 
 set  nocount  off 

 --  第二批次  
  set  nocount  on 
 go 
 --  1000*6000=600W  
  declare   @u_count   int 
 set   @u_count  =  0 
 while   @u_count  <  1000 
 begin 
 declare   @userid   int , @count   int 
 set   @userid  =   rand () *  1000000 
 set   @count  =  0 
 while   @count  <  6000 
 begin 
 --  3000  
  insert   into  dbo.UserLoginInfo(userid,logintime)
 select   @userid , CAST ( convert ( varchar ( 10 ), dateadd (d, abs (checksum( newid ()) %  3000 ), getdate () -  3000 ), 120 ) +  '   '  +   str ( 5  +  abs (checksum( newid ()) %  6 ), 2 ) +  '  :  '  +  replace ( str ( abs (checksum( newid ()) %  60 ), 2 ), '  ' , '  0  ' ) +  '  :  '  +  replace ( str ( abs (checksum( newid ()) %  60 ), 2 ), '  ' , '  0  ' )  as   datetime )
 set   @count  =  @count  +  1 
 end 
 set   @u_count  =  @u_count  +  1 
 end 
 go 
 set  nocount  off 

 --  第三批次  
  set  nocount  on 
 go 
 --  1000*4000=400W  
  declare   @u_count   int 
 set   @u_count  =  0 
 while   @u_count  <  1000 
 begin 
 declare   @userid   int , @count   int 
 set   @userid  =   rand () *  1000000 
 set   @count  =  0 
 while   @count  <  4000 
 begin 
 --  2000  
  insert   into  dbo.UserLoginInfo(userid,logintime)
 select   @userid , CAST ( convert ( varchar ( 10 ), dateadd (d, abs (checksum( newid ()) %  2000 ), getdate () -  2000 ), 120 ) +  '   '  +   str ( 5  +  abs (checksum( newid ()) %  6 ), 2 ) +  '  :  '  +  replace ( str ( abs (checksum( newid ()) %  60 ), 2 ), '  ' , '  0  ' ) +  '  :  '  +  replace ( str ( abs (checksum( newid ()) %  60 ), 2 ), '  ' , '  0  ' )  as   datetime )
 set   @count  =  @count  +  1 
 end 
 set   @u_count  =  @u_count  +  1 
 end 
 go 
 set  nocount  off 

 --  第四批次  
  set  nocount  on 
 go 
 --  10000*700=700W  
  declare   @u_count   int 
 set   @u_count  =  0 
 while   @u_count  <  10000 
 begin 
 declare   @userid   int , @count   int 
 set   @userid  =   rand () *  100000000 
 --  select @userid  
  set   @count  =  0 
 while   @count  <  700 
 begin 
 --  1000  
  insert   into  dbo.UserLoginInfo(userid,logintime)
 select   @userid , CAST ( convert ( varchar ( 10 ), dateadd (d, abs (checksum( newid ()) %  1000 ), getdate () -  1000 ), 120 ) +  '   '  +   str ( 5  +  abs (checksum( newid ()) %  6 ), 2 ) +  '  :  '  +  replace ( str ( abs (checksum( newid ()) %  60 ), 2 ), '  ' , '  0  ' ) +  '  :  '  +  replace ( str ( abs (checksum( newid ()) %  60 ), 2 ), '  ' , '  0  ' )  as   datetime )
 set   @count  =  @count  +  1 
 end 
 set   @u_count  =  @u_count  +  1 
 end 
 go 
 set  nocount  off 

 --  第五批次  
  set  nocount  on 
 go 
 --  100000*10=100W  
  declare   @u_count   int 
 set   @u_count  =  0 
 while   @u_count  <  100000 
 begin 
 declare   @userid   int , @count   int 
 set   @userid  =   rand () *  100000000 
 set   @count  =  0 
 while   @count  <  10 
 begin 
 --  500  
  insert   into  dbo.UserLoginInfo(userid,logintime)
 select   @userid , CAST ( convert ( varchar ( 10 ), dateadd (d, abs (checksum( newid ()) %  500 ), getdate () -  500 ), 120 ) +  '   '  +   str ( 5  +  abs (checksum( newid ()) %  6 ), 2 ) +  '  :  '  +  replace ( str ( abs (checksum( newid ()) %  60 ), 2 ), '  ' , '  0  ' ) +  '  :  '  +  replace ( str ( abs (checksum( newid ()) %  60 ), 2 ), '  ' , '  0  ' )  as   datetime )
 set   @count  =  @count  +  1 
 end 
 set   @u_count  =  @u_count  +  1 
 end 
 go 
 set  nocount  off 

 --  第六批次  
  set  nocount  on 
 go 
 --  100000*20=200W  
  declare   @u_count   int 
 set   @u_count  =  0 
 while   @u_count  <  100000 
 begin 
 declare   @userid   int , @count   int 
 set   @userid  =   rand () *  100000000 
 set   @count  =  0 
 while   @count  <  20 
 begin 
 --  365  
  insert   into  dbo.UserLoginInfo(userid,logintime)
 select   @userid , CAST ( convert ( varchar ( 10 ), dateadd (d, abs (checksum( newid ()) %  365 ), getdate () -  365 ), 120 ) +  '   '  +   str ( 5  +  abs (checksum( newid ()) %  6 ), 2 ) +  '  :  '  +  replace ( str ( abs (checksum( newid ()) %  60 ), 2 ), '  ' , '  0  ' ) +  '  :  '  +  replace ( str ( abs (checksum( newid ()) %  60 ), 2 ), '  ' , '  0  ' )  as   datetime )
 set   @count  =  @count  +  1 
 end 
 set   @u_count  =  @u_count  +  1 
 end 
 go 
 set  nocount  off

复制代码

造这些数据，在我本机上花了大半天时间才完成（痛苦呀），这些脚本运行完之后，我本机生成的数据量情况如下：

用户数量：

随机生成的时间情况：

现在我们完成了一个两千五百万记录，用户数量二十二万，时间从2001-03-05到2012-02-15的用户登录记录；但是有个问题，就是今天的时间，

没有加上去，我们再补充一下今天登陆的记录（假定今天每个用户都登陆了）

 --  add today login recode for every user  
  insert   into  dbo.UserLoginInfo(userid,logintime)
 select  userid, DATEADD (MINUTE, -  20 , GETDATE ())   from  
(
 select    distinct  userid  from  dbo.UserLoginInfo  with (nolock)
) a

复制代码

加完之后就应该有今天的数据了：

到这里造数据的过程就完了。

持续天数计算：

思路：1. 我们将UserLoginInfo的记录先做筛选，筛选掉那些今天有登陆，但是昨天没有登录的用户（也就是没有持续登录的用户）；

2. 将筛选后的数据放入到中间表，我们通过中间表来计算用户持续登录的天数；

3. 计算完成后，通过查询中间表，输出用户和持续登录天数；

以下就按照前面的思路来开展步骤：

1. 筛选掉不需要的记录，并将其导入中间表：

 --  find data into other table  
  insert   into  dbo.DST_UserLoginInfo(userid,logindate,ContinueDays,IsOver)
 select   distinct  userid, convert ( varchar ( 10 ),logintime, 23 )  as  logindate, 0 , 0 
 from   dbo.UserLoginInfo a  with (nolock) 
 where   exists 
 --  login today  
 (  select   1   from  dbo.UserLoginInfo b  with (nolock)  where  b.userid = a.userid
   and  b.logintime <=  GETDATE ()  and  b.logintime >=  convert ( varchar ( 10 ), GETDATE (), 23 ))
 --  login yesterday  
  and   exists 
( select   1   from  dbo.UserLoginInfo c  with (nolock)  where  a.userid = c.userid
  and  c.logintime <  convert ( varchar ( 10 ), GETDATE (), 23 )  and  c.logintime >=  convert ( varchar ( 10 ), GETDATE () -  1 , 23 )
)

复制代码

说明：在这里我们将时间字段变成了日期型的字符串，方便后面计算时做判断，另外中间表增加了持续时间和标识位字段，也是为了方便后面的计算。

2. 通过天数来循环中间表，计算持续天数：
我们先来看一下，筛选完成后的数据量为800多万，节省了持续天数计算时大量的计算量；

接着我们来计算持续天数：

这里有两种方式：

1. 按用户来循环计算：

每次取一个用户，然后根据用户ID来循环计算这个用户的持续登录天数；但是设想一下，如果我们有10万个用户，那我们第一层次取用户的循环将要

循环10万次，而且每个用户又需要在第一层的循环里面做持续时间天数的循环计算，可以想象计算量是非常大的，不可取；

2. 按持续天数来循环计算：

前面分析阶段已经提及过，正常情况下很少有用户能持续100天，每天都登陆到一个网站上面的，那意味着我们循环的天数不会是一个非常大的量，而

且这种循环一次性的将同一个持续天数的用户一次性计算完成了，效率应该是比较高的，我们采用这种方式来进行计算；

 --  Update ContinueDays  
  declare   @days   smallint  
 set   @days  =  1 
 while   exists  ( select   1   from  dbo.DST_UserLoginInfo  where  IsOver =  0 )
 begin 
  update  a  set  ContinueDays =  @days ,IsOver =  1 
  from  dbo.DST_UserLoginInfo a
  where  IsOver =  0 
  and   exists  
 ( select   1   from  dbo.DST_UserLoginInfo b  with (nolock)  where  a.userid = b.userid  and  IsOver =  0 
    and  b.logindate =  convert ( varchar ( 10 ), GETDATE () -  @days , 23 ))
  and   not   exists 
  ( select   1   from  dbo.DST_UserLoginInfo c  with (nolock)  where  a.userid = c.userid  and  IsOver =  0 
     and  c.logindate =  convert ( varchar ( 10 ), GETDATE () -  @days  -  1 , 23 ))

  set   @days  =  @days  +  1 
 end

复制代码

运行完成后，我们来查看下运行的结果：

一共是14639个用户有持续登录，最长的登录时间为259天（真实情况应该不会有这么大，计算的时候最耗时的就是这两个持续时间大的用户）。

验证结果：

我现在来抽查几条数据，看是否正确：

先在中间表中随便找一个持续时间为一天的记录，再到原登录表中找到他实际的登录数据：

打钩的是持续时间，正好是一天（其实这里的理解好像有点问题，用户明明是连续两天登录，而此处的持续时间只算做一天）；

接下来抽个三天的记录：

结果也是正确的。

总结：

分析、造数据、测试和计算我们都做完了，现在将上面的计算过程做成一个存储过程，这样就方便随时调用了：

 Create   Proc  usp_UserContinueDays
 as 
 begin 
 set  nocount  on 
 --  truncate dst table  
  if   OBJECT_ID ( '  dbo.DST_UserLoginInfo  ' )  is   null 
    begin 
      --  DST Table  
      create   table  dbo.DST_UserLoginInfo
    (
         userid  int 
        ,logindate  varchar ( 10 )
        ,ContinueDays  smallint 
        ,IsOver  bit 
        , CONSTRAINT   [  PK_UserLoginInfo  ]   PRIMARY   KEY   CLUSTERED  
        ( 
             userid  ASC 
            ,logindate  ASC  
        ) WITH  
        (
             PAD_INDEX   =   OFF 
            ,STATISTICS_NORECOMPUTE   =   OFF 
            ,IGNORE_DUP_KEY  =   OFF 
            ,ALLOW_ROW_LOCKS   =   ON 
            ,ALLOW_PAGE_LOCKS   =   ON 
            , FILLFACTOR   =   90 
        )  ON   [  PRIMARY  ]  
    )  ON   [  PRIMARY  ] 

    end 
 else 
    truncate   table  dbo.DST_UserLoginInfo

 --  find data to other table  
  insert   into  dbo.DST_UserLoginInfo(userid,logindate,ContinueDays,IsOver)
 select   distinct  userid, convert ( varchar ( 10 ),logintime, 23 )  as  logindate, 0 , 0 
 from   dbo.UserLoginInfo a  with (nolock) 
 where   exists 
 --  login today  
 (  select   1   from  dbo.UserLoginInfo b  with (nolock)  where  b.userid = a.userid
   and  b.logintime <=  GETDATE ()  and  b.logintime >=  convert ( varchar ( 10 ), GETDATE (), 23 ))
 --  login yesterday  
  and   exists 
( select   1   from  dbo.UserLoginInfo c  with (nolock)  where  a.userid = c.userid
  and  c.logintime <  convert ( varchar ( 10 ), GETDATE (), 23 )  and  c.logintime >=  convert ( varchar ( 10 ), GETDATE () -  1 , 23 )
)

 --  Update ContinueDays  
  declare   @days   smallint  
 set   @days  =  1 
 while   exists  ( select   1   from  dbo.DST_UserLoginInfo  where  IsOver =  0 )
 begin 
  update  a  set  ContinueDays =  @days ,IsOver =  1 
  from  dbo.DST_UserLoginInfo a
  where  IsOver =  0 
  and   exists  
 ( select   1   from  dbo.DST_UserLoginInfo b  with (nolock)  where  a.userid = b.userid  and  IsOver =  0 
    and  b.logindate =  convert ( varchar ( 10 ), GETDATE () -  @days , 23 ))
  and   not   exists 
  ( select   1   from  dbo.DST_UserLoginInfo c  with (nolock)  where  a.userid = c.userid  and  IsOver =  0 
     and  c.logindate =  convert ( varchar ( 10 ), GETDATE () -  @days  -  1 , 23 ))

  set   @days  =  @days  +  1 
 end 

 --  Result  
  select  userid, MIN (logindate)  as  LoginDate ,ContinueDays   from  dbo.DST_UserLoginInfo 
 group   by  userid,ContinueDays 
 order   by  ContinueDays  desc 

 set  nocount  off 
 end

复制代码

到此，测试计算的过程完成，不过有点遗憾的是，这个业务需求是另外一个公司的朋友提供的，我没办法拿到他们原始的计算方法，所以就没用办法比较

算法的最终效果了；如果大家有更好的方法，欢迎讨论。

作者： Leo_wl

出处： http://www.cnblogs.com/Leo_wl/

本文版权归作者和博客园共有，欢迎转载，但未经作者同意必须保留此段声明，且在文章页面明显位置给出原文连接，否则保留追究法律责任的权利。

版权信息

查看更多关于讨论一个比较有意思的业务需求的详细内容...

声明：本文来自网络，不代表【好得很程序员自学网】立场，转载请注明出处：http://haodehen.cn/did49867

更新时间：2022-09-24 阅读：48次