Toronto, Ontario, Canada

Eastern Time Zone

info@maximconsulting.ca

Statistical function in T-SQL

In my current project I need to calculate mean, mode, median and percentiles (95th percentile, 99th percentile) of a given data set.

In this article I’ve put together a summary of what I’ve discovered during my research.

The following code snippets are taken from the following blog – please visit it for more detailed information:
http://blogs.lessthandot.com/index.php/DataMgmt/DataDesign/calculating-mean-median-and-mode-with-sq

MEAN Calculation

Mean is another name for average so we can use AVG function to calculate mean.

MEDIAN Calculation

Median is middle point in the set

DECLARE @Temp TABLE(Id INT IDENTITY(1,1), DATA DECIMAL(10,5))


INSERT INTO @Temp VALUES(1)
INSERT INTO @Temp VALUES(2)
INSERT INTO @Temp VALUES(5)
INSERT INTO @Temp VALUES(5)
INSERT INTO @Temp VALUES(5)
INSERT INTO @Temp VALUES(6)
INSERT INTO @Temp VALUES(6)
INSERT INTO @Temp VALUES(6)
INSERT INTO @Temp VALUES(7)
INSERT INTO @Temp VALUES(9)
INSERT INTO @Temp VALUES(10)
INSERT INTO @Temp VALUES(NULL)


SELECT ((
        SELECT TOP 1 DATA
        FROM   (
                SELECT  TOP 50 PERCENT DATA
                FROM    @Temp
                WHERE   DATA IS NOT NULL
                ORDER BY DATA
                ) AS A
        ORDER BY DATA DESC) + 
        (
        SELECT TOP 1 DATA
        FROM   (
                SELECT  TOP 50 PERCENT DATA
                FROM    @Temp
                WHERE   DATA IS NOT NULL
                ORDER BY DATA DESC
                ) AS A
        ORDER BY DATA ASC)) / 2

–MODE Calculation

DECLARE @Temp TABLE(Id INT IDENTITY(1,1), DATA DECIMAL(10,5))

INSERT INTO @Temp VALUES(1)
INSERT INTO @Temp VALUES(2)
INSERT INTO @Temp VALUES(5)
INSERT INTO @Temp VALUES(5)
INSERT INTO @Temp VALUES(5)
INSERT INTO @Temp VALUES(6)
INSERT INTO @Temp VALUES(6)
INSERT INTO @Temp VALUES(6)
INSERT INTO @Temp VALUES(7)
INSERT INTO @Temp VALUES(9)
INSERT INTO @Temp VALUES(10)
INSERT INTO @Temp VALUES(NULL)


SELECT TOP 1 WITH ties DATA
FROM   @Temp
WHERE  DATA IS Not NULL
GROUP  BY DATA
ORDER  BY COUNT(*) DESC


Percentile Calculation


Please reference the following blog for details:

http://www.sqlteam.com/article/computing-percentiles-in-sql-server


— function for floating point division
CREATE FUNCTION dbo.FDIV 
(@numerator float, 
@denominator float)
RETURNS float
AS
BEGIN
RETURN CASE WHEN @denominator = 0.0 
THEN 0.0
ELSE @numerator / @denominator
END
END
GO

— function for linear interpolation
CREATE FUNCTION dbo.LERP 
(@value float, — between low and high
@low float,
@high float,
@newlow float,
@newhigh float)
RETURNS float — between newlow and newhigh
AS
BEGIN
  RETURN CASE 
      WHEN @value between @low and @high and @newlow <= @newhigh THEN @newlow + dbo.FDIV((@value-@low), (@high-@low)) * (@newhigh – @newlow)
      WHEN @value = @low and @newlow is not NULL THEN @newlow
      WHEN @value = @high and @newhigh is not NULL THEN @newhigh
      ELSE NULL
END
END
GO


Declare @TestScores table (StudentID int, Score int)
insert @TestScores (StudentID, Score) Values (1,  20)
insert @TestScores (StudentID, Score) Values (2,  03)
insert @TestScores (StudentID, Score) Values (3,  40)
insert @TestScores (StudentID, Score) Values (4,  45)
insert @TestScores (StudentID, Score) Values (5,  50)
insert @TestScores (StudentID, Score) Values (6,  20)
insert @TestScores (StudentID, Score) Values (7,  90)
insert @TestScores (StudentID, Score) Values (8,  20)
insert @TestScores (StudentID, Score) Values (9,  11)
insert @TestScores (StudentID, Score) Values (10, 30)


–Find the percentile rank of a given score 
— The derived table makes one scan over the data values to compute some aggregates. The outer select interpolates 
— between the pth percentile of the nearest samples below and above the given value.
— The result is 25, 0.5. That means a score of 25 is at the 50th percentile, the median, of the distribution.

declare @val float
set @val = 25

select 
@val as val,
dbo.LERP(@val, scoreLT, scoreGE, 
dbo.FDIV(countLT-1,countMinus1), 
dbo.FDIV(countLT,countMinus1)) as percentrank
from (
select 
SUM(CASE WHEN Score < @val 
THEN 1 ELSE 0 END) as countLT,
count(*)-1 as countMinus1,
MAX(CASE WHEN Score < @val
THEN Score END) as scoreLT,
MIN(CASE WHEN Score >= @val
THEN Score END) as scoreGE
from @TestScores
) as x1

— Find the percentile (the score) that characterizes a given percentage

declare @pp float
set @pp = .75

select 
@pp as factor, 
dbo.LERP(max(d), 0.0, 1.0, max(a.Score), max(b.Score)) as percentile
from
(
select floor(kf) as k, kf-floor(kf) as d
from (
select 1+@pp*(count(*)-1) as kf from @TestScores
) as x1
) as x2
join @TestScores a
on 
(select count(*) from @TestScores aa
where aa.Score < a.Score) < k
join @TestScores b
on 
(select count(*) from @TestScores bb
where bb.Score < b.Score) < k+1