The US$ 3 million Heritage Health Price competition is on so we take a look at how to get started using the R statistical computing and analysis platform.

We do not have the full set of data yet, so this is a simple warm-up
session to predict the days in hospital in year 2 based on the year 1
data.

Prerequisites
Obviously you need to have R installed,
and you should also have signed up for the competition (be sure to
read the terms carefully) and downloaded and extracted the release 1
data file.

Data preparation
Let’s load the data into R and do some basic housekeeping:

#!/usr/bin/Rscript
## example001.R - simple benchmarks for the HHP
## Copyright © 2011 CYBAEA Limited - http://www.cybaea.net/
##############################
#### DATA PREPARATION
##++++
## Members
members <- read.csv ( file = "HHP_release1/Members_Y1.csv" ,
colClasses = rep ( "factor" , 3 ),
comment.char = "" )
##----
##++++
## Claims
claims.Y1 <- read.csv ( file = "HHP_release1/Claims_Y1.csv" ,
colClasses = c (
rep ( "factor" , 7 ),
"integer" , # paydelay
"character" , # LengthOfStay
"character" , # dsfs
"factor" , # PrimaryConditionGroup
"character" # CharlsonIndex
),
comment.char = "" )
## Utility function
make.numeric <- function ( cv , FUN = mean ) {
### make a character vector numeric by splitting on '-'
sapply ( strsplit ( gsub ( "[^[:digit:]]+" ,
" " ,
cv ,
perl = TRUE ),
" " ,
fixed = TRUE ),
function ( x ) FUN ( as.numeric ( x )))
}
## Length of stay as days
{
z <- make.numeric ( claims.Y1 $ LengthOfStay )
z.week <- grepl ( "week" , claims.Y1 $ LengthOfStay , fixed = TRUE )
z [ z.week ] <- z [ z.week ] * 7 # Weeks are 7 days
z [ is.nan ( z )] <- 0
claims.Y1 $ LengthOfStay.days <- z
}
los.levels <- c ( "" , "1 day" , sprintf ( "%d days" , 2 : 6 ),
"1- 2 weeks" , "2- 4 weeks" , "4- 8 weeks" , "8-12 weeks" ,
"12-26 weeks" , "26+ weeks" )
stopifnot ( all ( claims.Y1 $ LengthOfStay %in% los.levels ))
claims.Y1 $ LengthOfStay <- factor ( claims.Y1 $ LengthOfStay ,
levels = los.levels ,
labels = c ( "0 days" , los.levels [ -1 ]),
ordered = TRUE )
## Months since first claim
claims.Y1 $ dsfs.months <- make.numeric ( claims.Y1 $ dsfs )
## dsfs is an ordered factor and gives the ordering of the claims
dsfs.levels <- c ( "0- 1 month" , sprintf ( "%d-%2d months" , 1 : 11 , 2 : 12 ))
claims.Y1 $ dsfs <- factor ( claims.Y1 $ dsfs , levels = dsfs.levels , ordered = TRUE )
## Index as numeric
claims.Y1 $ CharlsonIndex.numeric <- make.numeric ( claims.Y1 $ CharlsonIndex )
claims.Y1 $ CharlsonIndex <- factor ( claims.Y1 $ CharlsonIndex , ordered = TRUE )
##----
##++++
## Days in hospital
dih.Y2 <- read.csv ( file = "HHP_release1/DayInHospital_Y2.csv" ,
colClasses = c ( "factor" , "integer" ),
comment.char = "" )
names ( dih.Y2 )[ 1 ] <- "MemberID" # Fix broken file
##----
save ( members , claims.Y1 , dih.Y2 ,
file = "HHPR1.RData" )
##############################

Scoring
We will need a function to score our predictions p against the actual
values a. The formula is on the evaluation page and we implement it
as:

#!/usr/bin/Rscript
## example001.R - simple benchmarks for the HHP
## Copyright © 2011 CYBAEA Limited - http://www.cybaea.net/
##############################
#### FUNCTION TO CALCULATE SCORE
HPPScore <- function ( p , a ) {
### Scorng function after
### http://www.heritagehealthprize.com/c/hhp/Details/Evaluation
### Base 10 log from http://www.heritagehealthprize.com/forums/default.aspx?g=posts&m=2226#post2226
sqrt ( mean (( log ( 1 + p , 10 ) - log ( 1 + a , 10 )) ^ 2 ))
}
##############################

The simplest benchmarks
The simplest models don’t really model at all: they just use the
average and are simple benchmarks.

#!/usr/bin/Rscript
## example001.R - simple benchmarks for the HHP
## Copyright © 2011 CYBAEA Limited - http://www.cybaea.net/
y <- dih.Y2 $ DaysInHospital_Y2 # Actual
p <- rep ( mean ( y ), NROW ( dih.Y2 ))
cat ( sprintf ( "Score using mean : %8.6f\n" , HPPScore ( p , y )))
# Score using mean : 0.278725
p <- rep ( median ( y ), NROW ( dih.Y2 ))
cat ( sprintf ( "Score using median: %8.6f\n" , HPPScore ( p , y )))
# Score using median : 0.267969

Simple single-variable linear models
OK, a model that doesn’t use past data isn’t much of a model, so let’s
improve on that:

#!/usr/bin/Rscript
## example001.R - simple benchmarks for the HHP
## Copyright © 2011 CYBAEA Limited - http://www.cybaea.net/
library ( "reshape2" )
vars <- dcast ( claims.Y1 , MemberID ~ . , sum , value_var = "LengthOfStay.days" )
names ( vars )[ 2 ] <- "LengthOfStay"
data <- merge ( vars , dih.Y2 )
model <- lm ( DaysInHospital_Y2 ~ LengthOfStay , data = data )
p <- predict ( model )
cat ( sprintf ( "Score using lm(LengthOfStay): %8.6f\n" , HPPScore ( p , y )))
# Score using lm(LengthOfStay): 0.279062
model <- glm ( DaysInHospital_Y2 ~ LengthOfStay ,
family = quasipoisson (),
data = data )
p <- predict ( model , type = "response" )
cat ( sprintf ( "Score using glm(LengthOfStay): %8.6f\n" , HPPScore ( p , y )))
# Score using glm ( LengthOfStay ) : 0.278914

Let the competition begin.