# Logistische und Poisson Regression # Beispiele aus Cohen et al. (2003, pp. 475-535) # HINWEIS: Für die Beispielsyntax muss das Paket 'nnet' installiert worden sein. # ------------------------------------------------------------------------------ BaseURL="http://www2.jura.uni-hamburg.de/instkrim/kriminologie/Mitarbeiter/Enzmann/Lehre/StatIIKrim/" Pfad = '' # ------------------------------------------------------------------------------ # Gegebenfalls die folgende Anweisung auskommentieren und den Pfadnamen (mit ab- # schließendem Schrägstrich-Vorwärts) eintragen, z.B.: Pfad = 'd:/winword/texte/S2_ss07/StatIIKrim/' # ------------------------------------------------------------------------------ # Nur bei Internet-Anschluss verwenden: # download.file(paste(BaseURL,'c13e01dt.txt',sep=""),paste(Pfad,'c13e01dt.txt',sep=""),mode='wb') # download.file(paste(BaseURL,'c13e02dt.txt',sep=""),paste(Pfad,'c13e02dt.txt',sep=""),mode='wb') # ------------------------------------------------------------------------------ # Folgende Bibliotheken (Pakete) müssen geladen (und ggf. erst installiert) # werden: library(nnet) library(MASS) # Grafiken aufzeichnen: windows(record=T) # ============================================================================== # Table 13.2.1 (p. 489): Logits, odds, and probabilities pubs = c(0:15,15.38,16:30) logit.prom = -6.0 + 0.39*pubs odds.prom = exp(logit.prom) prob.prom = odds.prom/(1+odds.prom) cbind(pubs,logit=round(logit.prom,2), odds=round(odds.prom,2), prob=round(prob.prom,2)) # ------------------------------------------------------------------------------ # Figure 13.2.1 (p. 491): Logit as a function of probability p logit = function(p) { log(p/(1-p)) } p = seq(0,1,.0001) plot(p,logit(p),type='l',col='blue',main='Logit as a function of probability') abline(h=0) # ------------------------------------------------------------------------------ # p. 492f: The odds(/ratios) of promotion # odds.3.pubs = exp(-6.00)*exp(.39*3) odds.2.pubs = exp(-6.00)*exp(.39*2) odds.ratio=odds.3.pubs/odds.2.pubs cbind("Odds of 3 pubs"=odds.3.pubs, "Odds of 2 pubs"=odds.2.pubs, "Odds ratio"=odds.ratio) # 5 vs. 4 publications: odds.5.pubs = exp(-6.00)*exp(.39*5) odds.4.pubs = exp(-6.00)*exp(.39*4) odds.ratio=odds.5.pubs/odds.4.pubs cbind("Odds of 5 pubs"=odds.5.pubs, "Odds of 4 pubs"=odds.4.pubs, "Odds ratio"=odds.ratio) # 11 vs. 10 publications: odds.11.pubs = exp(-6.00)*exp(.39*11) odds.10.pubs = exp(-6.00)*exp(.39*10) odds.ratio=odds.11.pubs/odds.10.pubs cbind("Odds of 11 pubs"=odds.11.pubs, "Odds of 10 pubs"=odds.10.pubs, "Odds ratio"=odds.ratio) # Increase from 10 to 15 publications: odds.10.pubs*odds.ratio**5 # ------------------------------------------------------------------------------ # Example Table 13.2.2, I. Logistic Regression (p. 495): # Die Datei 'c13e01dt.txt' muss im aktuellen Arbeitsverzeichnis vorliegen! # Einlesen der Beispieldaten: ch13ex01 = read.fwf(file=paste(Pfad,'c13e01dt.txt',sep=''), widths=c(5,5,5,8,8,8), col.names=c('case','physrec','comply','knowledg','benefits','barriers'), colClasses=c('integer','factor','factor','numeric','numeric','numeric')) model01 = glm(comply~physrec+knowledg+benefits+barriers,family='binomial',data=ch13ex01) summary(model01) # 95% Confidence interval for model parameters (asymptotic normality assumed): round(confint.default(model01),2) # 95% Confidence interval for model parameters based on "profile likelihoods" # (setzt library MASS voraus, s.o.): round(confint(model01),2) # Model chi-square: Chi = model01$null-model01$dev Chi # Model df: Df = model01$df.null-model01$df.res Df # p: 1-pchisq(Chi,Df) # Mit Hilfe von anova() und update() kannn der Modelltest direkt durchgeführt # werden; anova() erstellt eine Tabelle der Effekte eines Modells oder ver- # gleicht verschiedene, sog. "geschachtelte" (d.h. aufeinander aufbauende) # Modelle. Mit update() kann ein Modell abgeändert und neu geschätzt werden, # die Funktion lässt sich hier benutzen, um das sog. "Null"-Modell (keine UVs) # zu spezifizieren. Vergleicht man dieses Nullmodell mit dem geschätzten Mo- # dell mittels anova() und spezifizert zugleich einen Chi²-Test, erhält man # das in Cohen berichtete Modell-Chi² inkl. Signifikanztest, da die Differenz # der "deviances" des Nullmodells und des geschätzten Modells Chi²-verteilt ist # mit den Freiheitsgraden der Differenz der DF von Nullmodell und geschätztem # Modell (vgl. die Ergebnisse mit obigem Test): anova(update(model01,comply~1),model01,test='Chisq') # Odds ratios and 95% confidence intervals: round(cbind(exp(model01$coef),exp(confint.default(model01))),3) # ------------------------------------------------------------------------------ # Multiple R² analogs (pseudo R²) (p. 502 ff.): LogRegR2 = function(model) { n = dim(model$model)[1] Chi2 = model$null - model$dev Df = model$df.null - model$df.res p = 1-pchisq(Chi2,Df) RL2 = Chi2/model$null # also called McFaddens R² Cox = 1-exp(-Chi2/n) # Cox & Snell Index Nag = Cox/(1-exp(-model$null/n)) # Nagelkerke Index list('Chi2'=Chi2,'df'=Df,'p'=p,'RL2'=RL2,'CoxR2'=Cox,'NagelkerkeR2'=Nag) } LogRegR2(model01) # ------------------------------------------------------------------------------ # Table 13.2.3 (p. 503): model01a = glm(comply~physrec,family='binomial',data=ch13ex01) model01b = glm(comply~physrec+knowledg,family='binomial',data=ch13ex01) R2.01a = LogRegR2(model01a) R2.01b = LogRegR2(model01b) R2.01 = LogRegR2(model01) tab13.2.3 = rbind(cbind(R2.01a$RL2,R2.01a$CoxR2,R2.01a$NagelkerkeR2), cbind(R2.01b$RL2,R2.01b$CoxR2,R2.01b$NagelkerkeR2), cbind(R2.01$RL2, R2.01$CoxR2, R2.01$NagelkerkeR2)) colnames(tab13.2.3)=c('RL²','Cox & Snell','Nagelkerke') rownames(tab13.2.3)=c('PHYSREC alone', 'PHYSREC,KNOWLEDG', 'PHYSREC,KNOWLEDG,BENEFITS,BARRIERS') round(tab13.2.3,2) # ------------------------------------------------------------------------------ # Wald statistic (Chi²) (p. 507 f.) # Anmerkung: R berechnet die Wald-statistic als "Bj/SE.Bj" (vgl. Fußnote 3 in # Cohen, S. 508). Die z-Werte quadriert ergeben die Chi²-Werte der Wald-Statis- # tik gemäß Formel 13.2.28 in Cohen. erg = summary(model01) round(cbind(B=erg$coeff[,1],SE=erg$coeff[,2], "Wald Chi2"=erg$coeff[,1]**2/erg$coeff[,2]**2),3) # ------------------------------------------------------------------------------ # Impact of predictor scaling (p. 510): source("D:/Statist/R/work/SPSS/freq.r") comply = ch13ex01$comply benefits.A = as.numeric(ch13ex01$benefits) # A. physrec(1,0), benefits (5,0): physrec.A = as.numeric(ch13ex01$physrec) physrec.A[which(physrec.A == 1)]=0 physrec.A[which(physrec.A == 2)]=1 freq(physrec.A) freq(benefits.A) mod.A = glm(comply~physrec.A+benefits.A,family='binomial') summary(mod.A) round(cbind("Exp(B) (odds ratio)"=exp(mod.A$coeff)),3) # B. physrec(1,-1), benefits (5,0): physrec.B = as.numeric(ch13ex01$physrec) physrec.B[which(physrec.B == 1)]=-1 physrec.B[which(physrec.B == 2)]=1 freq(physrec.B) benefits.B = benefits.A freq(benefits.B) mod.B = glm(comply~physrec.B+benefits.B,family='binomial') summary(mod.B) round(cbind("Exp(B) (odds ratio)"=exp(mod.B$coeff)),3) # C. physrec(1,0), benefits (1, 0): physrec.C = physrec.A freq(physrec.C) benefits.C = benefits.A/5 freq(benefits.C) mod.C = glm(comply~physrec.C+benefits.C,family='binomial') summary(mod.C) round(cbind("Exp(B) (odds ratio)"=exp(mod.C$coeff)),3) # D. physrec(z-values), benefits(z-values): physrec.D = scale(physrec.A) freq(physrec.D) benefits.D = scale(benefits.A) freq(benefits.D) mod.D = glm(comply~physrec.D+benefits.D,family='binomial') summary(mod.D) round(cbind("Exp(B) (odds ratio)"=exp(mod.D$coeff)),3) # ------------------------------------------------------------------------------ # Beispiel Table 13.3.1 (p. 521f.): # Die Datei 'c13e02dt.txt' muss im aktuellen Arbeitsverzeichnis vorliegen! # Einlesen der Beispieldaten: ch13ex02 = read.fwf(file=paste(Pfad,'c13e02dt.txt',sep=''), widths=c(5,5,5), col.names=c('case','steps4','interven'), colClasses=c('integer','factor','numeric')) steps4 = ch13ex02$steps4 interven = ch13ex02$interven Recode = function (var, recodes, as.factor.result) { recode.list <- rev(strsplit(recodes, ";")[[1]]) is.fac <- is.factor(var) if (missing(as.factor.result)) as.factor.result <- is.fac if (is.fac) var <- as.character(var) result <- var if (is.numeric(var)) { lo <- min(var, na.rm = TRUE) hi <- max(var, na.rm = TRUE) } for (term in recode.list) { if (0 < length(grep(":", term))) { range <- strsplit(strsplit(term, "=")[[1]][1], ":") low <- eval(parse(text = range[[1]][1])) high <- eval(parse(text = range[[1]][2])) target <- eval(parse(text = strsplit(term, "=")[[1]][2])) result[(var >= low) & (var <= high)] <- target } else if (0 < length(grep("else", term))) { target <- eval(parse(text = strsplit(term, "=")[[1]][2])) result[1:length(var)] <- target } else { set <- eval(parse(text = strsplit(term, "=")[[1]][1])) target <- eval(parse(text = strsplit(term, "=")[[1]][2])) for (val in set) { if (is.na(val)) result[is.na(var)] <- target else result[var == val] <- target } } } if (as.factor.result) result <- as.factor(result) else if (!is.numeric(result)) { result.valid <- na.omit(result) if (length(result.valid) == length(grep("[0-9]", result.valid))) result <- as.numeric(result) } result } # Nested Dichotomies: steps123v4=Recode(as.numeric(steps4),"c(1,2,3)=0;else=1") mod.A=glm(steps123v4 ~ interven,family='binomial') steps12v3=Recode(as.numeric(steps4),"c(1,2)=0;4=4;else=1") mod.B=glm(steps12v3[which(steps12v3 < 4)] ~ interven[which(steps12v3 < 4)],family='binomial') steps1v2=Recode(as.numeric(steps4),"1=0;2=1;else=2") mod.C=glm(steps1v2[which(steps1v2 < 2)] ~ interven[which(steps1v2 < 2)],family='binomial') LogRegR2(mod.A)$Chi2 + LogRegR2(mod.B)$Chi2 + LogRegR2(mod.C)$Chi2 mod.mult0 = multinom(steps4 ~ 1) mod.mult = multinom(steps4 ~ interven) summary(mod.mult,cor=F,Wald=T) anova(mod.mult0,mod.mult) mod.mult0$dev - mod.mult$dev # Ordinal logistic regression: mod.pol0 = polr(steps4 ~ 1) mod.pol = polr(steps4 ~ interven) summary(mod.pol) anova(mod.pol0,mod.pol) mod.pol0$dev - mod.pol$dev # Score Test: mod.mult0$dev - mod.mult$dev - (mod.pol0$dev - mod.pol$dev) 1-pchisq(mod.mult0$dev - mod.mult$dev - (mod.pol0$dev - mod.pol$dev),2) cbind(B=round(summary(mod.pol)$coef[,1],3), SE=round(summary(mod.pol)$coef[,2],3), Wald=round(summary(mod.pol)$coef[,1]**2/summary(mod.pol)$coef[,2]**2,3), df=c(1,1,1,1), sig=round(1-pchisq(summary(mod.pol)$coef[,1]**2/summary(mod.pol)$coef[,2]**2,c(1,1,1,1)),3)) round(exp(mod.pol$coef),3) # ============================================================================== # Figure 13.4.1 (p. 527): Probability of Y events according to Poisson distrib- # ution as a function of the expected (mean) number of events: Y.counts = 0:10 tab = round(cbind(Y.counts,dpois(Y.counts, .50), dpois(Y.counts, 1.50), dpois(Y.counts, 2.00), dpois(Y.counts, 4.00), dpois(Y.counts,10.00)),2) colnames(tab)=c('Y','µ=.50','µ=1.25','µ=2.00','µ=4.00','µ=10.00') tab plot(tab[,2]~tab[,1],type='b',ylim=c(0,.7),col='black', ylab='Poisson probability',xlab='Number of events', main='Figure 13.4.1: Poisson distributions') lines(tab[,1],tab[,3],type='b',col='blue') lines(tab[,1],tab[,4],type='b',col='red') lines(tab[,1],tab[,5],type='b',lty=2,col='black') lines(tab[,1],tab[,6],type='b',lty=2,col='blue') legend(x='topright',legend=colnames(tab)[2:6], lty=c(1,1,1,2,2),col=c('black','blue','red','black','blue')) # ------------------------------------------------------------------------------ # Figure 13.4.2 (p. 529): Poisson regression versus linear regression for a # fictitious example predicting number of aggressive acts in 30-minute recess as # a function of teacher rating of aggressiveness to a Poisson regression # equation: case = 100:110 aggress = 0:10 b0 = -1.68 b1 = .35 mu.Pois = exp(b0+b1*aggress) mu.OLS = predict(lm(mu.Pois~aggress)) round(cbind(case,aggress,'µ.Pois'=mu.Pois, 'µ.OLS'=mu.OLS, 'ln(µ)'=b0+b1*aggress),2) plot(mu.Pois~aggress,ylim=c(-1,7),type='b',col='blue', ylab='Predicted aggressive acts',xlab='Teacher aggressiveness rating', main='Figure 13.4.2: Poisson regression vs. linear regression') lines(aggress,mu.OLS,type='b',col='red') abline(h=0,lty=2,col='gray') legend(x='topleft',legend=c('linear','Poisson'), lty=c(1,1),col=c('red','blue')) # ------------------------------------------------------------------------------