setwd("/Users/jameshanley/Documents/Courses/634/JUPITER") # fill in the path to YOUR datafile ds=read.table("JUPITERdataset.txt") # ds: jh shorthand for "dataset" n=length(ds$tx) ; n # number of observations in ds ds[seq(1,n,500),] # list every 500th observation # events & person days in tx=1 (statin) and tx=0 (placebo) groups sum(ds$terminated.by.event[ds$tx==1]);length(ds$last.day[ds$tx==1]); sum(ds$terminated.by.event[ds$tx==0]);length(ds$last.day[ds$tx==0]); # Kaplan-Meier curves library(survival) # have a look at programs within survival package.. # JH uses the Package Manager in Packages and Data menu # to look at contents of different packages... # survfit is used to fit Kaplan-Meier curve(s) # you must supply the time and event-indicator as a pair, as Surv object # in survival analysis, event indicator = 0 usually used for a censored observation, # and 1 for an observation terminated by the event of interest. fit=survfit( Surv(ds$last.day,ds$terminated.by.event) ~ ds$tx) plot(fit) # why so messy ... ?? # structure of fit [object.oriented] helps to know # what the various parts of the results are stored under what name.. str(fit) # if use the Surv(time,status)~grouping variable form, it fits as many # curves as there are groups; they can then be accessed as [1], [2] etc.. # now, by being able to get at the relevant components, and using # general plot function, have control over the appearance of plot.. plot(fit[1]$time/365,1-fit[1]$surv,type="s", xlab="Year", ylab="Cumulative Incidence") points(fit[2]$time/365,1-fit[2]$surv,type="s") # ======= viewed from an "event-rate" perspective ============ # overall numbers of cases, person years, and event rates by treatment cases.tx1 = sum(ds$terminated.by.event[ds$tx==1]) # sum of 0's and 1s py.tx1 = (1/365)*sum(ds$last.day[ds$tx==1]) # easier to deal in p-years than p-days rate.tx1 = 100*(cases.tx1/py.tx1); round(c(cases.tx1,py.tx1,rate.tx1),2) cases.tx0 = sum(ds$terminated.by.event[ds$tx==0]) py.tx0 = (1/365)*sum(ds$last.day[ds$tx==0]) rate.tx0 = 100*(cases.tx0/py.tx0); round(c(cases.tx0,py.tx0,rate.tx0),2) # ********* WINDOW-SPECIFIC RATES .. window width = 1 year # printed excerpts are in "last in, first out" order # creates 4 vectors # WARNING: this nest piece of code, which splits each observation # into the year by year contributions made by each subject, # was easy to code, but is somewhat inneficient.. and may take # a minute or so to run.. still faster than by hand! # to let you know where it is at, it prints selected messages.. # printed excerpts are in "last in, first out" order # IF this code takes too long on your computer, # you can always read in the already-baked result (the time.slice.ds dataset) # from the course website... verbose=T Tx=c(); years=c(); e=c(); year=c(); # 4 vectors to be created for (i in 1:n){ yrs=ceiling(ds$last.day[i]/365) if(verbose & i == (100*floor(i/100)) ) print(paste( "subject",toString(i),"contributes to the first",toString(yrs),"1-year-wide window(s)")) for (yr in 1:yrs){ if (yr<yrs) { year=c(year,yr); years=c(years,1); e=c(e,0); Tx=c(Tx,ds$tx[i]) } if (yr==yrs) { year=c(year,yr); years=c(years, round(ds$last.day[i]/365-(yrs-1),3) ); e=c(e,ds$terminated.by.event[i]); Tx=c(Tx,ds$tx[i]) } } } time.slice.ds=data.frame(Tx,year,years,e) # create dataset from the 4 generated vectors # this new (longer) datset has as few as 1 record per person (for those who entered last) # and as many as 5 for those who entered first (and were events free going in to year 5... # excerpts from this dataset... n.obs=length(time.slice.ds$year) ; n.obs # total number of time slices contrinbuted by subjects # first 10 time.slice.ds[1:10,] # last 20 time.slice.ds[(n.obs-19):n.obs,] # you might want to save this dataframe, since it takes a while to create # if so, use write.table(time.slice.ds,"somefilename.txt",sep=" ") # now, divie the dataset by year and Tx-group, and calculate # number of events and person years for each of (5 x 2 = ) 10 "cells" .. stats=aggregate(time.slice.ds[,3:4],list(Tx,year),sum) # cols 3 and 4 are p-y and event (0/1) stats # look at this dataset names(stats)[1]="Tx"; names(stats)[2]="Year" ; # rename cols to be more meaningfull.. names(stats)[3]="P-Years" ; names(stats)[4]="events" stats # more readable.. but pairs of numbers to create # compared year-specific rates are still on different rows.... # so, a "not-elegant but it works" solution.. put compared items side by side on same row # will appreciate this side by side format later when do Mantel-Haenszel calculations in R summarystats=cbind(stats[seq(2,10,2),2:4],stats[seq(1,9,2),3:4]) names(summarystats)=c("Year","py.tx1","cases.tx1","py.tx0","cases.tx0") # better names.. summarystats # finally... you can now get R to calculate year-specific rates for each tx. # AND calculate 5 year-specific rate ratios and rate differences.. # (might also wish to see if rates are highere in higher years, as subjects are <<presumably>> # a year older... << >> not quite since not all 18000 have contributed to all years # Dr Genest, in his seminar on this study, in RVH in Fall, told us there was an outcry # and protest against this study in the US after the 1st year, and so Astra Zeneca # then began to recruit manymore of its patients from overseas.. so those now in year 5 # may be of a different provenance from those who who are coming along behind them...) R CODE to compute year-specific statistics # for you to supply .. # question: # how similar are the year-specific rate ratios comparing statin vs placebo. ? # are they homogeneous enough that you think it makes sense to combine them # into 1 summary rate ratio? # (remember than some of the jumpig around will be because of random variation.. # the Poisson numerators are not all that big in higher years, and so the rate # ratio is not that stable.) # If you consider them the rate ratios "poolable", then pool them # the dataset format makes it easy to program the summary ratio given # in Rothman 2002, eqn # 8-5 for summary Incidence Ratio # [there shouldn't be "confounding" by year, but just the same... ] # since RAMQ will (and mds) be more interested -- for cost effectiveness -- # in rate differences, while you are at it, use equation 8-4 to calculate # a summary Incidence difference # [again, the summary one probably won't be v. different from the crude one.. # but its worth being aware of how these calculations would need to be # done in a non-experimental situation, or one where an rct dataset is smaller, # and there could be major treatment imbalances on a stronger variable than # year...) # === last point... (should actually be sooner) === # double check that nothing got lost in the programming of the time- # slicing.. compare overall totals with those at beginning of this section.. # can also use for crude rate ratios etc ... sapply(summarystats[2:5],sum) # ========