setwd("/Users/jameshanley/Documents/Courses/634/JUPITER") 
  # fill in the path to YOUR datafile

ds=read.table("JUPITERdataset.txt") # ds: jh shorthand for "dataset"

n=length(ds$tx) ; n # number of observations in ds

ds[seq(1,n,500),] # list every 500th observation

# events & person days in tx=1 (statin) and tx=0 (placebo) groups
 
sum(ds$terminated.by.event[ds$tx==1]);length(ds$last.day[ds$tx==1]);
sum(ds$terminated.by.event[ds$tx==0]);length(ds$last.day[ds$tx==0]);

# Kaplan-Meier curves

library(survival)

# have a look at programs within survival package..
# JH uses the Package Manager in Packages and Data menu
# to look at contents of different packages...

# survfit is used to fit Kaplan-Meier curve(s)
# you must supply the time and event-indicator as a pair, as Surv object
# in survival analysis, event indicator = 0 usually used for a censored observation,
# and 1 for an observation terminated by the event of interest.

fit=survfit( Surv(ds$last.day,ds$terminated.by.event) ~ ds$tx)

plot(fit)  # why so messy ... ??

# structure of fit [object.oriented] helps to know
# what the various parts of the results are stored under what name..

str(fit) 

# if use the Surv(time,status)~grouping variable form, it fits as many
# curves as there are groups; they can then be accessed as [1], [2] etc.. 

# now, by being able to get at the relevant components, and using
# general plot function, have control over the appearance of plot..

plot(fit[1]$time/365,1-fit[1]$surv,type="s",
  xlab="Year", ylab="Cumulative Incidence")
points(fit[2]$time/365,1-fit[2]$surv,type="s")

# ======= viewed from an "event-rate" perspective ============

# overall numbers of cases, person years, and event rates by treatment

cases.tx1  = sum(ds$terminated.by.event[ds$tx==1]) # sum of 0's and 1s
py.tx1 = (1/365)*sum(ds$last.day[ds$tx==1]) # easier to deal in p-years than p-days

rate.tx1 = 100*(cases.tx1/py.tx1);
round(c(cases.tx1,py.tx1,rate.tx1),2)

cases.tx0  = sum(ds$terminated.by.event[ds$tx==0])
py.tx0 = (1/365)*sum(ds$last.day[ds$tx==0])
rate.tx0 = 100*(cases.tx0/py.tx0);
round(c(cases.tx0,py.tx0,rate.tx0),2)

# *********  WINDOW-SPECIFIC RATES .. window width = 1 year

# printed excerpts are in "last in, first out" order

# creates 4 vectors

# WARNING: this nest piece of code, which splits each observation
# into the year by year contributions made by each subject, 
# was easy to code, but is somewhat inneficient.. and may take
# a minute or so to run.. still faster than by hand!
# to let you know where it is at, it prints selected messages..

# printed excerpts are in "last in, first out" order

# IF this code takes too long on your computer,
# you can always read in the already-baked result (the time.slice.ds dataset)
# from the course website...

verbose=T
Tx=c(); years=c(); e=c(); year=c(); # 4 vectors to be created
for (i in 1:n){
	yrs=ceiling(ds$last.day[i]/365)
   if(verbose & i == (100*floor(i/100)) ) print(paste(
   "subject",toString(i),"contributes to the first",toString(yrs),"1-year-wide window(s)"))
	for (yr in 1:yrs){
	if (yr<yrs) {
		year=c(year,yr);
		years=c(years,1); 
		e=c(e,0);
		Tx=c(Tx,ds$tx[i])
	}
	if (yr==yrs) {
		year=c(year,yr);
		years=c(years, round(ds$last.day[i]/365-(yrs-1),3) ); 
		e=c(e,ds$terminated.by.event[i]);
		Tx=c(Tx,ds$tx[i])
	} 
 }
 }

time.slice.ds=data.frame(Tx,year,years,e) # create dataset from the 4 generated vectors

# this new (longer) datset has as few as 1 record per person (for those who entered last) 
# and as many as 5 for those who entered first (and were events free going in to year 5...

# excerpts from this dataset...

n.obs=length(time.slice.ds$year) ; n.obs # total number of time slices contrinbuted by subjects

# first 10
time.slice.ds[1:10,]
# last 20
time.slice.ds[(n.obs-19):n.obs,]

# you might want to save this dataframe, since it takes a while to create
# if so, use write.table(time.slice.ds,"somefilename.txt",sep=" ")

# now, divie the dataset by year and Tx-group, and calculate
# number of events and person years for each of (5 x 2 = ) 10 "cells" .. 

stats=aggregate(time.slice.ds[,3:4],list(Tx,year),sum) # cols 3 and 4 are p-y and event (0/1)

stats # look at this dataset 

names(stats)[1]="Tx"; names(stats)[2]="Year" ;        # rename cols to be more meaningfull.. 
names(stats)[3]="P-Years" ; names(stats)[4]="events"

stats  # more readable..  but pairs of numbers to create 
       # compared year-specific rates are still on different rows.... 

# so, a "not-elegant but it works" solution.. put compared items side by side on same row
# will appreciate this side by side format later when do Mantel-Haenszel calculations in R

summarystats=cbind(stats[seq(2,10,2),2:4],stats[seq(1,9,2),3:4]) 
names(summarystats)=c("Year","py.tx1","cases.tx1","py.tx0","cases.tx0") # better names..

summarystats # finally...  you can now get R to calculate year-specific rates for each tx.

# AND calculate 5 year-specific rate ratios and rate differences.. 

# (might also wish to see if rates are highere in higher years, as subjects are <<presumably>>
# a year older... << >> not quite since not all 18000 have contributed to all years
# Dr Genest, in his seminar on this study, in RVH in Fall, told us there was an outcry
# and protest against this study in the US after the 1st year, and so Astra Zeneca
# then began to recruit manymore of its patients from overseas.. so those now in year 5
# may be of a different provenance from those who who are coming along behind them...)

R CODE to compute year-specific statistics # for you to supply ..

# question: 

# how similar are the year-specific rate ratios comparing statin vs placebo. ?
# are they homogeneous enough that you think it makes sense to combine them 
# into 1 summary rate ratio?
# (remember than some of the jumpig around will be because of random variation.. 
# the Poisson numerators are not all that big in higher years, and so the rate 
# ratio is not that stable.) 

# If you consider them the rate ratios "poolable", then pool them
# the dataset format makes it easy to program the summary ratio given 
# in Rothman 2002, eqn # 8-5 for summary Incidence Ratio 
# [there shouldn't be "confounding" by year, but just the same... ]

# since RAMQ will (and mds) be more interested -- for cost effectiveness --
# in rate differences, while you are at it, use equation 8-4 to calculate 
# a summary Incidence difference
# [again, the summary one probably won't be v. different from the crude one.. 
# but its worth being aware of how these calculations would need to be 
# done in a non-experimental situation, or one where an rct dataset is smaller,
# and there could be major treatment imbalances on a stronger variable than 
# year...)  

# === last point... (should actually be sooner) ===

# double check that nothing got lost in the programming of the time-
# slicing.. compare overall totals with those at beginning of this section..
# can also use for crude rate ratios etc ...

sapply(summarystats[2:5],sum)

# ========