. * do-file for lecture 1a of VHM 802/812, Winter 2016 . version 14 /* works also with version 13 */ . set more off . cd "h:\vhm\vhm802\data_stata" h:\vhm\vhm802\data_stata . . use daisy2red, clear . twoway (scatter milk120 parity) (lfit milk120 parity) . twoway (scatter milk120 parity, msymbol(circle_hollow)) (lfit milk120 parity, range(0 8)), ytitle("mil > k120") xtitle("parity") . histogram milk120 (bin=31, start=1110.2, width=145.80967) . . * regression model with prediction . regress milk120 parity Source | SS df MS Number of obs = 1,536 -------------+---------------------------------- F(1, 1534) = 262.27 Model | 109234227 1 109234227 Prob > F = 0.0000 Residual | 638905966 1,534 416496.718 R-squared = 0.1460 -------------+---------------------------------- Adj R-squared = 0.1455 Total | 748140192 1,535 487387.748 Root MSE = 645.37 ------------------------------------------------------------------------------ milk120 | Coef. Std. Err. t P>|t| [95% Conf. Interval] -------------+---------------------------------------------------------------- parity | 178.347 11.01266 16.19 0.000 156.7455 199.9484 _cons | 2727.08 34.33991 79.41 0.000 2659.722 2794.438 ------------------------------------------------------------------------------ . display invttail(1534,0.025) /* t* obtained from Stata */ 1.9615116 . predict fit (option xb assumed; fitted values) . predict se_mean, stdp /* SE(yhat), for confidence interval */ . predict se_ind, stdf /* prediction error, for prediction interval */ . generate pred_low = fit - invttail(1534,0.025)*se_ind . generate pred_upp = fit + invttail(1534,0.025)*se_ind . list cow parity milk120 fit se_mean se_ind pred_low pred_upp in 1/10 +-------------------------------------------------------------------------------+ | cow parity milk120 fit se_mean se_ind pred_low pred_upp | |-------------------------------------------------------------------------------| 1. | 3 5 4173.0 3618.815 29.87665 646.0568 2351.567 4886.063 | 2. | 4 5 3727.3 3618.815 29.87665 646.0568 2351.567 4886.063 | 3. | 5 5 3090.8 3618.815 29.87665 646.0568 2351.567 4886.063 | 4. | 8 5 4228.4 3618.815 29.87665 646.0568 2351.567 4886.063 | 5. | 9 6 3431.1 3797.162 39.53432 646.5754 2528.896 5065.427 | |-------------------------------------------------------------------------------| 6. | 11 5 3984.7 3618.815 29.87665 646.0568 2351.567 4886.063 | 7. | 12 5 4064.7 3618.815 29.87665 646.0568 2351.567 4886.063 | 8. | 15 4 4241.5 3440.468 21.55974 645.7256 2173.869 4707.066 | 9. | 17 2 3416.6 3083.774 18.35515 645.6265 1817.37 4350.178 | 10. | 17 3 3529.5 3262.121 16.7209 645.5822 1995.804 4528.438 | +-------------------------------------------------------------------------------+ . * code modified from VER2 Fig 14.1, works for simple linear regression only . twoway (scatter milk120 parity, msymbol(circle_hollow)) (lfit milk120 parity, range(0,8)) /// > (lfitci milk120 parity, ciplot(rline) blcolor(black) blpattern(dash) range(0,8)) /// > (lfitci milk120 parity, stdf ciplot(rline) blcolor(black) blwidth(medium) blpattern(shortdash_dot_dot > ) range(0,8)), /// > legend(off) ytitle("milk120") xtitle("parity") . . * residuals: individual observations . predict rawres, residuals (38 missing values generated) . predict stdres, rstandard (38 missing values generated) . predict delres, rstudent /* note: rstudent option for deletion residuals */ (38 missing values generated) . summarize rawres stdres delres Variable | Obs Mean Std. Dev. Min Max -------------+--------------------------------------------------------- rawres | 1,536 -2.65e-06 645.1553 -1973.574 2137.779 stdres | 1,536 -.0000951 1.000306 -3.059309 3.313621 delres | 1,536 -.0000529 1.000989 -3.067684 3.324461 . list cow parity milk120 fit rawres stdres delres in 1/10 +-----------------------------------------------------------------------+ | cow parity milk120 fit rawres stdres delres | |-----------------------------------------------------------------------| 1. | 3 5 4173.0 3618.815 554.1854 .8596371 .8595639 | 2. | 4 5 3727.3 3618.815 108.4854 .1682796 .1682263 | 3. | 5 5 3090.8 3618.815 -528.0146 -.8190417 -.8189538 | 4. | 8 5 4228.4 3618.815 609.5853 .9455719 .9455392 | 5. | 9 6 3431.1 3797.162 -366.0615 -.568283 -.5681576 | |-----------------------------------------------------------------------| 6. | 11 5 3984.7 3618.815 365.8853 .5675513 .5674258 | 7. | 12 5 4064.7 3618.815 445.8853 .691645 .6915274 | 8. | 15 4 4241.5 3440.468 801.0323 1.2419 1.24212 | 9. | 17 2 3416.6 3083.774 332.8264 .5159264 .5158029 | 10. | 17 3 3529.5 3262.121 267.3793 .4144459 .414334 | +-----------------------------------------------------------------------+ . list cow parity milk120 fit rawres stdres delres if stdres~=. & abs(stdres)>3 +------------------------------------------------------------------------+ | cow parity milk120 fit rawres stdres delres | |------------------------------------------------------------------------| 669. | 765 2 1110.2 3083.774 -1973.574 -3.059309 -3.067684 | 1319. | 2440 5 5630.3 3618.815 2011.485 3.12016 3.129088 | 1341. | 2460 3 5399.9 3262.121 2137.779 3.313621 3.324461 | 1370. | 2489 3 5278.0 3262.121 2015.879 3.124673 3.133643 | +------------------------------------------------------------------------+ . display 2*1536*ttail(1533,3.324461) /* Bonferroni-corrected P-value */ 1.3928477 . display invttail(1533,.025/1536) /* cut-off for signif at 5% level */ 4.1672429 . . * assessing normality using residuals . qnorm stdres . histogram stdres, normal (bin=31, start=-3.0593085, width=.20557838) . summarize stdres, detail Standardized residuals ------------------------------------------------------------- Percentiles Smallest 1% -2.23909 -3.059309 5% -1.601332 -2.949112 10% -1.267381 -2.927706 Obs 1,536 25% -.670562 -2.849384 Sum of Wgt. 1,536 50% -.0180296 Mean -.0000951 Largest Std. Dev. 1.000306 75% .6480753 2.97556 90% 1.281694 3.12016 Variance 1.000612 95% 1.698061 3.124673 Skewness .129289 99% 2.410189 3.313621 Kurtosis 3.089667 . swilk stdres Shapiro-Wilk W test for normal data Variable | Obs W V z Prob>z -------------+------------------------------------------------------ stdres | 1,536 0.99804 1.826 1.517 0.06468 . . * assessing linearity using residuals . scatter rawres fit . rvfplot /* same plot without generating variables; raw residuals only */ . scatter stdres fit /* most interesting residual plot */ . scatter stdres parity /* in simple regression, essentially the same as above */ . lowess stdres parity, xtitle("parity") . lowess milk120 parity, ytitle ("milk120") xtitle("parity") . . * assessing homoscedasticity . scatter stdres fit /* same as above */ . tabstat stdres, statistics(mean sd count) by(parity) Summary for variables: stdres by categories of: parity (Lactation number) parity | mean sd N ---------+------------------------------ 1 | -.4121462 .7542639 403 2 | .4093676 .9711145 369 3 | .2594253 1.015435 308 4 | .0739393 1.017689 219 5 | -.2971432 1.074628 164 6 | -.4336789 .8422888 69 7 | -.6370535 .1613464 4 ---------+------------------------------ Total | -.0000951 1.000306 1536 ---------------------------------------- . estat hettest Breusch-Pagan / Cook-Weisberg test for heteroskedasticity Ho: Constant variance Variables: fitted values of milk120 chi2(1) = 7.29 Prob > chi2 = 0.0069 . estat imtest Cameron & Trivedi's decomposition of IM-test --------------------------------------------------- Source | chi2 df p ---------------------+----------------------------- Heteroskedasticity | 16.48 2 0.0003 Skewness | 7.22 1 0.0072 Kurtosis | 0.75 1 0.3874 ---------------------+----------------------------- Total | 24.46 4 0.0001 --------------------------------------------------- . * note: several other tests and versions exist => beyond the scope of course . . * transformation . boxcox milk120 parity, model(lhs) /* lhs=left-hand-side is the default */ Fitting comparison model Iteration 0: log likelihood = -12237.344 Iteration 1: log likelihood = -12235.143 Iteration 2: log likelihood = -12235.143 Fitting full model Iteration 0: log likelihood = -12116.128 Iteration 1: log likelihood = -12112.932 Iteration 2: log likelihood = -12112.931 Number of obs = 1,536 LR chi2(1) = 244.42 Log likelihood = -12112.931 Prob > chi2 = 0.000 ------------------------------------------------------------------------------ milk120 | Coef. Std. Err. z P>|z| [95% Conf. Interval] -------------+---------------------------------------------------------------- /theta | .7656256 .0921533 8.31 0.000 .5850085 .9462428 ------------------------------------------------------------------------------ Estimates of scale-variant parameters ---------------------------- | Coef. -------------+-------------- Notrans | parity | 27.0975 _cons | 554.488 -------------+-------------- /sigma | 97.53378 ---------------------------- --------------------------------------------------------- Test Restricted LR statistic P-value H0: log likelihood chi2 Prob > chi2 --------------------------------------------------------- theta = -1 -12314.672 403.48 0.000 theta = 0 -12148.834 71.81 0.000 theta = 1 -12116.128 6.39 0.011 --------------------------------------------------------- . * analysis of power-transformed outcome . generate tmilk = milk120^.75 (38 missing values generated) . regress tmilk parity Source | SS df MS Number of obs = 1,536 -------------+---------------------------------- F(1, 1534) = 264.86 Model | 1103464.63 1 1103464.63 Prob > F = 0.0000 Residual | 6390947.69 1,534 4166.19798 R-squared = 0.1472 -------------+---------------------------------- Adj R-squared = 0.1467 Total | 7494412.32 1,535 4882.3533 Root MSE = 64.546 ------------------------------------------------------------------------------ tmilk | Coef. Std. Err. t P>|t| [95% Conf. Interval] -------------+---------------------------------------------------------------- parity | 17.92527 1.101429 16.27 0.000 15.7648 20.08573 _cons | 375.9898 3.434499 109.47 0.000 369.253 382.7267 ------------------------------------------------------------------------------ . predict fit1 (option xb assumed; fitted values) . predict stdres1, rstandard (38 missing values generated) . scatter stdres1 fit1 . qnorm stdres1 . summarize stdres1, detail Standardized residuals ------------------------------------------------------------- Percentiles Smallest 1% -2.313608 -3.402181 5% -1.658406 -3.083472 10% -1.283043 -3.058562 Obs 1,536 25% -.6623973 -3.01349 Sum of Wgt. 1,536 50% .0068648 Mean -.0000956 Largest Std. Dev. 1.000307 75% .6662298 2.804832 90% 1.270427 2.859347 Variance 1.000614 95% 1.657495 2.936321 Skewness -.0226933 99% 2.27907 3.102081 Kurtosis 3.070858 . swilk stdres1 Shapiro-Wilk W test for normal data Variable | Obs W V z Prob>z -------------+------------------------------------------------------ stdres1 | 1,536 0.99904 0.891 -0.291 0.61429 . estat hettest Breusch-Pagan / Cook-Weisberg test for heteroskedasticity Ho: Constant variance Variables: fitted values of tmilk chi2(1) = 3.31 Prob > chi2 = 0.0689 . * backtransformation of fitted values . generate btfit = fit1^(1/0.75) . predict se_ind1, stdf /* prediction error, for prediction interval */ . generate pred_low1 = fit1 - invttail(1534,0.025)*se_ind1 . generate pred_upp1 = fit1 + invttail(1534,0.025)*se_ind1 . generate btpred_low = pred_low1^(1/0.75) . generate btpred_upp = pred_upp1^(1/0.75) . list cow parity milk120 fit pred_low pred_upp btfit btpred_low btpred_upp in 1/10 +------------------------------------------------------------------------------------------+ | cow parity milk120 fit pred_low pred_upp btfit btpred~w btpred~p | |------------------------------------------------------------------------------------------| 1. | 3 5 4173.0 3618.815 2351.567 4886.063 3608.866 2362.556 4974.857 | 2. | 4 5 3727.3 3618.815 2351.567 4886.063 3608.866 2362.556 4974.857 | 3. | 5 5 3090.8 3618.815 2351.567 4886.063 3608.866 2362.556 4974.857 | 4. | 8 5 4228.4 3618.815 2351.567 4886.063 3608.866 2362.556 4974.857 | 5. | 9 6 3431.1 3797.162 2528.896 5065.427 3795.29 2529.675 5177.738 | |------------------------------------------------------------------------------------------| 6. | 11 5 3984.7 3618.815 2351.567 4886.063 3608.866 2362.556 4974.857 | 7. | 12 5 4064.7 3618.815 2351.567 4886.063 3608.866 2362.556 4974.857 | 8. | 15 4 4241.5 3440.468 2173.869 4707.066 3424.82 2198.007 4774.433 | 9. | 17 2 3416.6 3083.774 1817.37 4350.178 3064.116 1877.132 4381.072 | 10. | 17 3 3529.5 3262.121 1995.804 4528.438 3243.214 2036.153 4576.494 | +------------------------------------------------------------------------------------------+ . . * analysis of square-root transformed outcome . generate rmilk = milk120^.5 (38 missing values generated) . regress rmilk parity Source | SS df MS Number of obs = 1,536 -------------+---------------------------------- F(1, 1534) = 266.23 Model | 8851.08257 1 8851.08257 Prob > F = 0.0000 Residual | 50998.9065 1,534 33.2457018 R-squared = 0.1479 -------------+---------------------------------- Adj R-squared = 0.1473 Total | 59849.9891 1,535 38.9902209 Root MSE = 5.7659 ------------------------------------------------------------------------------ rmilk | Coef. Std. Err. t P>|t| [95% Conf. Interval] -------------+---------------------------------------------------------------- parity | 1.605405 .0983907 16.32 0.000 1.41241 1.798399 _cons | 51.96426 .3068041 169.37 0.000 51.36246 52.56606 ------------------------------------------------------------------------------ . predict fit2 (option xb assumed; fitted values) . predict stdres2, rstandard (38 missing values generated) . scatter stdres2 fit2 . qnorm stdres2 . summarize stdres2, detail Standardized residuals ------------------------------------------------------------- Percentiles Smallest 1% -2.486428 -3.791986 5% -1.729319 -3.220177 10% -1.306212 -3.191329 Obs 1,536 25% -.6381913 -3.184865 Sum of Wgt. 1,536 50% .0332434 Mean -.0000959 Largest Std. Dev. 1.000308 75% .6782325 2.611937 90% 1.257776 2.637807 Variance 1.000616 95% 1.624026 2.753201 Skewness -.1790609 99% 2.184529 2.897921 Kurtosis 3.137836 . swilk stdres2 Shapiro-Wilk W test for normal data Variable | Obs W V z Prob>z -------------+------------------------------------------------------ stdres2 | 1,536 0.99739 2.436 2.243 0.01246 . estat hettest Breusch-Pagan / Cook-Weisberg test for heteroskedasticity Ho: Constant variance Variables: fitted values of rmilk chi2(1) = 0.83 Prob > chi2 = 0.3616 . . * analysis of natural log-transformed outcome (for demonstration purposes only) . generate lmilk = ln(milk120) (38 missing values generated) . regress lmilk parity Source | SS df MS Number of obs = 1,536 -------------+---------------------------------- F(1, 1534) = 264.94 Model | 11.7075975 1 11.7075975 Prob > F = 0.0000 Residual | 67.7880022 1,534 .044190353 R-squared = 0.1473 -------------+---------------------------------- Adj R-squared = 0.1467 Total | 79.4955996 1,535 .051788664 Root MSE = .21022 ------------------------------------------------------------------------------ lmilk | Coef. Std. Err. t P>|t| [95% Conf. Interval] -------------+---------------------------------------------------------------- parity | .0583876 .0035872 16.28 0.000 .0513513 .0654238 _cons | 7.890967 .0111855 705.46 0.000 7.869027 7.912908 ------------------------------------------------------------------------------ . di "multiplicative effect of 1 added year: "exp(0.0583876) multiplicative effect of 1 added year: 1.0601258 . generate lparity = ln(parity) . regress lmilk lparity Source | SS df MS Number of obs = 1,536 -------------+---------------------------------- F(1, 1534) = 391.32 Model | 16.1575251 1 16.1575251 Prob > F = 0.0000 Residual | 63.3380745 1,534 .041289488 R-squared = 0.2033 -------------+---------------------------------- Adj R-squared = 0.2027 Total | 79.4955996 1,535 .051788664 Root MSE = .2032 ------------------------------------------------------------------------------ lmilk | Coef. Std. Err. t P>|t| [95% Conf. Interval] -------------+---------------------------------------------------------------- lparity | .1725382 .008722 19.78 0.000 .1554299 .1896466 _cons | 7.905481 .0089887 879.49 0.000 7.887849 7.923112 ------------------------------------------------------------------------------ . di "multiplicative effect of doubling parity: "exp(ln(2)*.1725382) multiplicative effect of doubling parity: 1.1270396