# Comparing two samples using the kernel smoothers of the # NPMLEs of their distribution functions # Author: Wei Pan, weip@biostat.umn.edu # INPUT: 1) datafile: name of the file containing the data # 2) B: number of bootstrap replications # 3) lbandw: lower bound of the bandwidth to be searched # 4) ubandw: upper bound of the bandwidth to be searched # 5) nfold: nfold-fold crossvaludation to be used in selecting bandwidth # 6) range.x: range of the survival time to be considered kernel2test<-function(datafile, B=200, lbandw=0.1, ubandw=1.2, nfold=10, range.x=c(0,2), debug=F){ x0<-seq(0,range.x[2],length=101*range.x[2]/2) dat<-read.table(datafile) n<-length(dat$V2) nseq<-1:length(dat$V2) sam1<-nseq[dat$V3==0] sam2<-nseq[dat$V3==1] LC1<-dat$V1[sam1] RC1<-dat$V2[sam1] LC2<-dat$V1[sam2] RC2<-dat$V2[sam2] n1<-length(LC1) n2<-length(LC2) v1<-CVbw(LC1, RC1, n1,lbandw=lbandw, ubandw=ubandw, nfold=10, range.x=range.x) v2<-CVbw(LC2, RC2, n2,lbandw=lbandw, ubandw=ubandw, nfold=10, range.x=range.x) if (debug) cat("bw1=",v1," bw2=", v2,"\n") y1<-KSestimator(LC1, RC1, n1, v1, range.x=range.x) y2<-KSestimator(LC2, RC2, n2, v2, range.x=range.x) KSstat0<-max(abs(y1-y2)) Dstat0<-sum(y1-y2)*(x0[2]-x0[1]) if (debug==T){ cat("stat0=",KSstat0," ", Dstat0, "\n") # plot(x0, y1, type="l") # lines(x0, y2, lty=2) } #bootstrap b<-1 KSngt<-KSnlt<-0 Dngt<-Dnlt<-0 for (b in 1:B){ subsam1<-sample(1:n, n1, replace=T) subsam2<-sample(1:n, n2, replace=T) lc1<-dat$V1[subsam1] rc1<-dat$V2[subsam1] lc2<-dat$V1[subsam2] rc2<-dat$V2[subsam2] y1<-KSestimator(lc1, rc1, n1, v1, range.x=range.x) y2<-KSestimator(lc2, rc2, n2, v2, range.x=range.x) KSstat<-max(abs(y1-y2)) Dstat<-sum(y1-y2)*(x0[2]-x0[1]) if (debug==T){ cat(" ", KSstat, " ", Dstat," "); #plot(x0, y1, type="l") #lines(x0, y2, lty=2) } if (KSstat0KSstat) KSngt<-KSngt+1 if (Dstat0Dstat) Dngt<-Dngt+1 } if (debug==F){ sink(paste("KERNEL/2TestB",B,sep=""), append=T) cat(KSnlt," ", KSngt," ", KSstat0, " ", Dnlt," ",Dngt," ", Dstat0," ", v1," ", v2,"\n") sink() } } CVbw<-function(LC,RC, N, lbandw=0.1, ubandw=1.2, nfold=10, range.x=c(0,2), debug=F){ bandw.list<-seq(lbandw, ubandw, by=0.1) LKLHD<-rep(0,length(bandw.list)) #CV for(v in 1:length(bandw.list)){ NCV<-N-nfold CVtime<-N/nfold for (i in 1:CVtime){ noind<-seq((i-1)*nfold+1, (i-1)*nfold+nfold, by=1) lc<-c(0, LC[-noind]) rc<-c(0, RC[-noind]) storage.mode(lc)<-"double" storage.mode(rc)<-"double" storage.mode(NCV)<-"integer" res<-.C("IcmNPMLE", x=lc, P=rc, n=NCV) x<-rep(res$x[2:(res$n+1)], res$P[2:(res$n+1)]*N+0.5) ks<-ksmooth(x, ker="parzen", bandwidth=bandw.list[v], n.points=101*(range.x[2]/2), range.x=range.x) for (j in noind) LKLHD[v]<-LKLHD[v] + lklhd(ks, LC[j], RC[j]) } if (debug==T) cat("LKLHD[",v,"]=", LKLHD[v]," ") } maxlkl<-max(LKLHD) v<-bandw.list[LKLHD==maxlkl] return(v) } KSestimator<-function(LC, RC, N, v, range.x=c(0,2)){ lc<-c(0, LC) rc<-c(0, RC) storage.mode(lc)<-"double" storage.mode(rc)<-"double" storage.mode(N)<-"integer" res<-.C("IcmNPMLE", x=lc, P=rc, n=N) x<-rep(res$x[2:(res$n+1)], res$P[2:(res$n+1)]*N+0.5) npts<-101*range.x[2]/2 ks<-ksmooth(x, ker="parzen", bandwidth=v, n.points=npts, range.x=range.x) surv<-rep(1, length(ks$y)); for (i in 2:length(ks$y)) surv[i]<-surv[i-1]-(ks$y[i-1]+ks$y[i])*(ks$x[i]-ks$x[i-1])/2 return(surv) } lklhd<-function(res, lc, rc){ lkl<-0 lkl<-sum(res$y[(res$x>=lc)&(res$x<=rc)])*(res$x[3]-res$x[2]); # +sum(res$y[(ilc-1):(irc-1)]))* if (lkl>1) { lkl<-1 # cat("lkl=",lkl,"\n") } if (lkl<=0) return(-100000000) else return(log(lkl)) }