dad_age : age of father of baby (years)
mom_age : age of mother of baby (years)
maturity : classify mother as of advanced maternal age or not (advanced / younger)
len_preg : length of pregnancy (weeks)
is_premie : classify baby as either premature or full-term (premie / fullterm)
num_visits : number of visits to hospital during pregnancy
marital : marital status of mother at time of birth (married / unmarried)
mom_wt_gain : mother’s weight gain during pregnancy (pounds)
bwt : birth weight of baby (pounds)
low_bwt : classify baby as either low birthweight or not (low / notlow)
sex : sex of baby (female / male) smoke : smoking status of mother (smoker / nonsmoker)
mom_white : classify mother as either white or not (white / nonwhite)
mom_age_level : age level of mother of baby (teens, early20s, late20s, early30s, 35+)
library(car)
## Loading required package: carData
library("pwr")
library(pscl)
## Classes and Methods for R developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University
## Simon Jackman
## hurdle and zeroinfl functions by Achim Zeileis
setwd("C:/Users/DELL/Desktop/HS 631 - Statistical Computing")
load(file = "births.Rdat")
Checking if data needs to be cleaned:
summary(births)
## dad_age mom_age maturity len_preg is_premie
## Min. : 14.0 Min. :13 advnced:133 Min. : 20.00 fullterm:846
## 1st Qu.: 26.0 1st Qu.:22 younger:867 1st Qu.: 37.00 premie :152
## Median : 32.0 Median :27 Median : 39.00 unknown : 2
## Mean :195.9 Mean :27 Mean : 40.26
## 3rd Qu.: 39.0 3rd Qu.:32 3rd Qu.: 40.00
## Max. :999.0 Max. :50 Max. :999.00
## num_visits marital mom_wt_gain bwt
## Min. : 0.00 married :613 Min. : 0.00 Min. : 1.000
## 1st Qu.: 10.00 unknown : 1 1st Qu.: 21.00 1st Qu.: 6.380
## Median : 12.00 unmarried:386 Median : 30.00 Median : 7.310
## Mean : 20.99 Mean : 56.48 Mean : 7.101
## 3rd Qu.: 15.00 3rd Qu.: 40.00 3rd Qu.: 8.060
## Max. :999.00 Max. :999.00 Max. :11.750
## low_bwt sex smoke mom_white mom_age_level
## low :111 female:503 nonsmoker:873 nonwhite:284 35+ :133
## notlow:889 male :497 smoker :126 unknown : 2 early20s:281
## unknown : 1 white :714 early30s:219
## late20s :257
## teens :110
##
Cleaning data i.e. setting impossible values to NA:
births$dad_age[births$dad_age == 999] <- NA
births$len_preg[births$len_preg == 999] <- NA
births$num_visits[births$num_visits == 999] <- NA
births$mom_wt_gain[births$mom_wt_gain == 999] <- NA
births$is_premie[births$is_premie == "unknown"] <- NA
births$marital[births$marital == "unknown"] <- NA
births$smoke[births$smoke == "unknown"] <- NA
births$mom_white[births$mom_white == "unknown"] <- NA
Reordering levels of variables and Renaming level advnced to advanced in maturity variable:
levels(births$maturity)[levels(births$maturity)=="advnced"] <- "advanced"
maturity_level <- c("younger","advanced")
births$maturity <- factor(births$maturity, levels = maturity_level)
is_premie_level <- c("fullterm","premie")
births$is_premie <- factor(births$is_premie, levels = is_premie_level)
low_bwt_level <- c("notlow","low")
births$low_bwt <- factor(births$low_bwt, levels = low_bwt_level)
smoke_level <- c("nonsmoker","smoker")
births$smoke <- factor(births$smoke, levels = smoke_level)
lev <- c("teens","early20s","late20s","early30s","35+")
births$mom_age_level <- factor(births$mom_age_level, levels = lev)
Dropping unused levels:
births$marital <- droplevels(births$marital)
births$mom_white <- droplevels(births$mom_white)
summary(births)
## dad_age mom_age maturity len_preg is_premie
## Min. :14.00 Min. :13 younger :867 Min. :20.00 fullterm:846
## 1st Qu.:25.00 1st Qu.:22 advanced:133 1st Qu.:37.00 premie :152
## Median :30.00 Median :27 Median :39.00 NA's : 2
## Mean :30.26 Mean :27 Mean :38.33
## 3rd Qu.:35.00 3rd Qu.:32 3rd Qu.:40.00
## Max. :55.00 Max. :50 Max. :45.00
## NA's :171 NA's :2
## num_visits marital mom_wt_gain bwt low_bwt
## Min. : 0.0 married :613 Min. : 0.00 Min. : 1.000 notlow:889
## 1st Qu.:10.0 unmarried:386 1st Qu.:20.00 1st Qu.: 6.380 low :111
## Median :12.0 NA's : 1 Median :30.00 Median : 7.310
## Mean :12.1 Mean :30.33 Mean : 7.101
## 3rd Qu.:15.0 3rd Qu.:38.00 3rd Qu.: 8.060
## Max. :30.0 Max. :85.00 Max. :11.750
## NA's :9 NA's :27
## sex smoke mom_white mom_age_level
## female:503 nonsmoker:873 nonwhite:284 teens :110
## male :497 smoker :126 white :714 early20s:281
## NA's : 1 NA's : 2 late20s :257
## early30s:219
## 35+ :133
##
##
Correlation between variables using ranks for factor variables:
births_numeric <- births[,c(1,2,4,6,8,9)]
births_numeric$maturity <- as.numeric(births$maturity)
births_numeric$is_premie <- as.numeric(births$is_premie, levels = is_premie_level)
births_numeric$low_bwt <- as.numeric(births$low_bwt)
births_numeric$smoke <- as.numeric(births$smoke, levels = smoke_level)
births_numeric$mom_age_level <- as.numeric(births$mom_age_level, levels = lev)
births_numeric$marital <- as.numeric(births$marital, levels = levels(births$marital))
births_numeric$mom_white <- as.numeric(births$mom_white, levels = levels(births$mom_white))
births_numeric$sex <- as.numeric(births$sex, levels = levels(births$sex))
c <- cor(births_numeric, use= "pairwise.complete.obs", method = "spearman")
c
## dad_age mom_age len_preg num_visits mom_wt_gain
## dad_age 1.000000000 0.794262548 -0.031710050 0.05900601 -0.04552655
## mom_age 0.794262548 1.000000000 -0.042773785 0.14341061 -0.05028151
## len_preg -0.031710050 -0.042773785 1.000000000 0.13787177 0.06576266
## num_visits 0.059006014 0.143410609 0.137871772 1.00000000 0.07083079
## mom_wt_gain -0.045526551 -0.050281512 0.065762657 0.07083079 1.00000000
## bwt 0.086371777 0.076181800 0.452590772 0.10020952 0.18096286
## maturity 0.481485708 0.588819814 -0.034195286 0.04824250 -0.04737845
## is_premie -0.004102886 -0.012752560 -0.630651495 -0.14326482 -0.14590753
## low_bwt -0.005760923 -0.006005699 -0.416282434 -0.12284366 -0.12083229
## smoke -0.103885980 -0.108385376 -0.008966556 -0.06872315 0.04326961
## mom_age_level 0.770025057 0.974049888 -0.041156960 0.14038995 -0.04123870
## marital -0.358941381 -0.453100372 -0.020186892 -0.21859213 -0.04019315
## mom_white 0.130724383 0.179408733 0.053478896 0.07975005 0.08262090
## sex 0.059692174 -0.002254230 0.025143334 -0.04755096 0.05119186
## bwt maturity is_premie low_bwt smoke
## dad_age 0.08637178 0.481485708 -0.004102886 -0.005760923 -0.103885980
## mom_age 0.07618180 0.588819814 -0.012752560 -0.006005699 -0.108385376
## len_preg 0.45259077 -0.034195286 -0.630651495 -0.416282434 -0.008966556
## num_visits 0.10020952 0.048242501 -0.143264817 -0.122843661 -0.068723152
## mom_wt_gain 0.18096286 -0.047378454 -0.145907527 -0.120832286 0.043269610
## bwt 1.00000000 0.020012654 -0.445992768 -0.544169401 -0.087658394
## maturity 0.02001265 1.000000000 0.023836636 0.030345546 -0.050294575
## is_premie -0.44599277 0.023836636 1.000000000 0.563192923 -0.001598465
## low_bwt -0.54416940 0.030345546 0.563192923 1.000000000 0.039743707
## smoke -0.08765839 -0.050294575 -0.001598465 0.039743707 1.000000000
## mom_age_level 0.07334022 0.604506834 -0.010809773 -0.001966450 -0.100951595
## marital -0.15149952 -0.157858725 0.087095443 0.121480719 0.107220940
## mom_white 0.17073331 0.056131765 -0.078280577 -0.080610599 0.039403578
## sex 0.16461252 -0.006484698 0.030435326 -0.020163852 0.038083607
## mom_age_level marital mom_white sex
## dad_age 0.770025057 -0.358941381 0.13072438 0.059692174
## mom_age 0.974049888 -0.453100372 0.17940873 -0.002254230
## len_preg -0.041156960 -0.020186892 0.05347890 0.025143334
## num_visits 0.140389948 -0.218592128 0.07975005 -0.047550965
## mom_wt_gain -0.041238699 -0.040193147 0.08262090 0.051191864
## bwt 0.073340224 -0.151499522 0.17073331 0.164612523
## maturity 0.604506834 -0.157858725 0.05613177 -0.006484698
## is_premie -0.010809773 0.087095443 -0.07828058 0.030435326
## low_bwt -0.001966450 0.121480719 -0.08061060 -0.020163852
## smoke -0.100951595 0.107220940 0.03940358 0.038083607
## mom_age_level 1.000000000 -0.443794113 0.18999029 0.001196308
## marital -0.443794113 1.000000000 -0.33014869 -0.004251541
## mom_white 0.189990292 -0.330148687 1.00000000 -0.031705094
## sex 0.001196308 -0.004251541 -0.03170509 1.000000000
mom_age and mom_age_level variables have a correlation of 0.974049888
bwt and low_bwt variables have a correlation of -0.5441694
mom_age and dad_age variables have a high correlation of 0.794262548
is_premie and len_preg variable have a high correlation of -0.630651495
library(corrplot)
## corrplot 0.84 loaded
corrplot(c)
library("ggplot2")
g <- ggplot(births, aes(x=dad_age))
g + geom_histogram(binwidth = 3, color="cyan") +
ggtitle("Distribution of dad_age variable") +
xlab("Age of father of baby (years)") + ylab("Count")
## Warning: Removed 171 rows containing non-finite values (stat_bin).
Age of father of baby has the highest count for ages between 32-35years in this dataset.
g1 <- ggplot(births, aes(x=mom_age))
g1 + geom_histogram(binwidth = 3, color="red") +
ggtitle("Distribution of mom_age variable") +
xlab("Age of mother of baby (years)") + ylab("Count")
Age of mother of baby has the highest count for ages between 20-23years in this datasest.
g2 <- ggplot(births, aes(x=maturity))
g2 + geom_bar(color="cyan") +
ggtitle("Distribution of maturity variable") +
xlab("Classifying mother as of advanced maternal age or not") + ylab("Count")
Mother generally tend to be of younger maternal age for this dataset with a count of around 800 in this dataset.
g3 <- ggplot(births, aes(x=len_preg))
g3 + geom_histogram(binwidth = 2, color="purple") +
ggtitle("Distribution of len_preg variable") +
xlab("Length of pregnancy (weeks)") + ylab("Count")
## Warning: Removed 2 rows containing non-finite values (stat_bin).
Length of pregnancy (weeks) has the highest count for 37-39 weeks for this datsest.
g4 <- ggplot(births, aes(x=is_premie))
g4 + geom_bar(color="cyan") +
ggtitle("Distribution of is_premie variable") +
xlab("Classifying baby as either premature or full-term") + ylab("Count")
In this dataset, babies are mostly full term (800 out of 1000).
g5 <- ggplot(births, aes(x=num_visits))
g5 + geom_histogram(binwidth = 2, color="red") +
ggtitle("Distribution of num_visits variable") +
xlab("Number of visits to hospital during pregnancy") + ylab("Count")
## Warning: Removed 9 rows containing non-finite values (stat_bin).
Number of visits to hospital during pregnancy is around 10-15 for most of the mothers having the baby in this dataset.
g6 <- ggplot(births, aes(x=marital))
g6 + geom_bar(color="cyan") +
ggtitle("Distribution of marital variable") +
xlab("Marital status of mother at time of birth") + ylab("Count")
Mother at time of birth tend to be married more than unmarried for this dataset.
g7 <- ggplot(births, aes(x=mom_wt_gain))
g7 + geom_histogram(binwidth = 7, color="purple") +
ggtitle("Distribution of mom_wt_gain variable") +
xlab("Mother's weight gain during pregnancy (pounds)") + ylab("Count")
## Warning: Removed 27 rows containing non-finite values (stat_bin).
Mothers gaining 25-30 pounds during pregnancy have the highest count for this dataset.
g8 <- ggplot(births, aes(x=bwt))
g8 + geom_histogram(binwidth = 1, color="cyan") +
ggtitle("Distribution of bwt variable") +
xlab("Birth weight of baby (pounds)") + ylab("Count")
Babies with birth weight 6.5-7.5 pounds have the highest count for this dataset.
g9 <- ggplot(births, aes(x=low_bwt))
g9 + geom_bar(color="red") +
ggtitle("Distribution of low_bwt variable") +
xlab("Classifying baby as either low birthweight or not") + ylab("Count")
Babies tend not to have a low birth weight in this dataset (around 800 out of 1000).
g10 <- ggplot(births, aes(x=sex))
g10 + geom_bar(color="orange") +
ggtitle("Distribution of sex variable") +
xlab("Sex of baby") + ylab("Count")
There are almost as many male in this dataset as there are female.
g11 <- ggplot(births, aes(x=smoke))
g11 + geom_bar(color="red") +
ggtitle("Distribution of smoke variable") +
xlab("Smoking status of mother") + ylab("Count")
This dataset has a majority of non smoking mother rather than smoking mothers (800 out of 1000).
g12 <- ggplot(births, aes(x=mom_white))
g12 + geom_bar(color="cyan") +
ggtitle("Distribution of mom_white variable") +
xlab("Classifying mother as either white or not") + ylab("Count")
For this dataset, white mother has the highest count (700 out of 1000).
g13 <- ggplot(births, aes(x=mom_age_level))
g13 + geom_bar(color="purple") +
ggtitle("Distribution of mom_age_level variable") +
xlab("Age level of mother of baby (years)") + ylab("Count")
Early 20s age level has the highest count for age level of mother of baby followed by late 20s, early 30s, 35+, teens respectively.
g14 <- ggplot(births)
g14 + geom_density(aes(x=bwt, color=smoke))+
ggtitle("Distribution of bwt variable with respect to smoking status of mother") +
xlab("Birthweight of baby (pounds)") + ylab("Count")
## Warning: Groups with fewer than two data points have been dropped.
Non smoking mothers tend to give birth to babies with birtweight (pounds) slightly higher than smoking mothers on average for this dataset.
g15 <- ggplot(births)
g15 + geom_boxplot(aes(x=factor(mom_age_level), y=bwt)) +
ggtitle("Distribution of mom_age_level and bwt variable") +
xlab("Age level of mother of baby (years)") + ylab("Birthweight of baby (pounds)")
Median birthweight of baby for teens is the lowest while for late20s is the highest for this dataset. On average, 75th percentile of each level of mother’s age have around the same birthweight of baby (pounds) for this dataset.
g16 <- ggplot(births)
g16 + geom_bar(aes(x=factor(mom_white), fill=low_bwt), position = "dodge") +
ggtitle("Distribution of mom_white and low_bwt variable") +
xlab("Mother as either white or not") + ylab("Count")
Count for White mothers giving birth to low birthweight baby (pounds) is higher than for nonwhite mother for this dataset.
g17 <- ggplot(births)
g17 + geom_point(aes(x=bwt, y= mom_wt_gain,color=factor(maturity))) +
ggtitle("Distribution of mom_wt_gain and bwt variable with respect to maturity variable") +
xlab("Birthweight of baby (pounds)") + ylab("Mother's weight gain during pregnancy (pounds)")
## Warning: Removed 27 rows containing missing values (geom_point).
Mother’s weight gain during pregnancy (pounds) tends to be slightly higher on average, for younger mothers than for the advanced mothers, for the same birthweight of baby (pounds), for this dataset.
g18 <- ggplot(births)
g18 + geom_point(aes(x=bwt, y=num_visits, color=factor(is_premie))) +
ggtitle("Distribution of bwt and num_visits variable") +
xlab("Birthweight of baby (pounds)") + ylab("Number of visits to hospital during pregnancy")
## Warning: Removed 9 rows containing missing values (geom_point).
Premature babies tend to have lesser birtweight of baby (pounds) which is usually observed. On average, mother of premature babies tend to have lesser number of visits to hospital during pregnancy than mother of fullterm babies for this dataset.
t.test(births$bwt ~ births$smoke)
##
## Welch Two Sample t-test
##
## data: births$bwt by births$smoke
## t = 2.359, df = 171.32, p-value = 0.01945
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.05151165 0.57957328
## sample estimates:
## mean in group nonsmoker mean in group smoker
## 7.144273 6.828730
With a p-value equal to 0.01945, there is sufficient evidence to reject the null hypothesis and accept the alternate hypothesis
Null Hypothesis: There is no difference in true means of birthweight of baby (pounds) due to smoking or non smoking mother
Alternate Hypothesis: There is difference in true means of birthweight of baby (pounds) due to smoking or non smoking mother
t.test(births$len_preg ~ births$low_bwt)
##
## Welch Two Sample t-test
##
## data: births$len_preg by births$low_bwt
## t = 12.188, df = 113.43, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 4.618782 6.411808
## sample estimates:
## mean in group notlow mean in group low
## 38.94257 33.42727
With a p-value equal to < 2.2e-16, there is sufficient evidence to reject the null hypothesis and accept the alternate hypothesis
Null Hypothesis: There is no difference in true means of length of pregnancy (weeks) (len_preg variable) due to notlow or low birthweight of baby (low_bwt variable)
Alternate Hypothesis: There is difference in true means of length of pregnancy (weeks) (len_preg variable) due to notlow or low birthweight of baby (low_bwt variable)
t.test(births$mom_wt_gain ~ births$smoke)
##
## Welch Two Sample t-test
##
## data: births$mom_wt_gain by births$smoke
## t = -1.2229, df = 150.17, p-value = 0.2233
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -4.786411 1.126667
## sample estimates:
## mean in group nonsmoker mean in group smoker
## 30.09636 31.92623
With a p-value equal to 0.2233, there is not sufficient evidence to reject the null hypothesis and accept the alternate hypothesis
Null Hypothesis: There is no difference in true means of mother’s weight gain during pregnancy (pounds) due to whether mother of baby smokes or not.
Alternate Hypothesis: There is difference in true means of mother’s weight gain during pregnancy (pounds) due to whether mother of baby smokes or not.
Power test for the above t-test:
pwr.t2n.test(n1=873, n2=126, d = .2, sig.level = .05)
##
## t test power calculation
##
## n1 = 873
## n2 = 126
## d = 0.2
## sig.level = 0.05
## power = 0.5543761
## alternative = two.sided
Since power equals to 0.5543761, there is 55.43% probability of finding a small effect if it is there because of smoking status of mother due to random sampling alone and that explains why we did not find enough evidence to reject the null hypothesis.
pwr.t2n.test(n1=873, n2=126, d = .5, sig.level = .05)
##
## t test power calculation
##
## n1 = 873
## n2 = 126
## d = 0.5
## sig.level = 0.05
## power = 0.9994839
## alternative = two.sided
Since power equals to 0.9994839, there is 99.94% probability of finding a medium size effect if it is there because of smoking status of mother due to random sampling alone.
pwr.t2n.test(n1=873, n2=126, d = .8, sig.level = .05)
##
## t test power calculation
##
## n1 = 873
## n2 = 126
## d = 0.8
## sig.level = 0.05
## power = 1
## alternative = two.sided
Since power equals to 1, there is 100% probability of finding a large size effect if it is there because of smoking status of mother due to random sampling alone.
summary(aov(births$mom_wt_gain ~ births$mom_age_level))
## Df Sum Sq Mean Sq F value Pr(>F)
## births$mom_age_level 4 632 157.9 0.778 0.54
## Residuals 968 196504 203.0
## 27 observations deleted due to missingness
With a p-value equal to 0.54, there is not sufficient evidence to reject the null hypothesis and accept the alternate hypothesis
Null Hypothesis: All true means of mother’s weight gain during pregnancy (pounds) for different mom age levels is same
Alternate Hypothesis: Atleast one true mean of mother’s weight gain during pregnancy (pounds) for different mom age levels is different
We could do power test to see if there really is an effect to be detected in the sample.
summary(aov(births$bwt ~ births$mom_age_level))
## Df Sum Sq Mean Sq F value Pr(>F)
## births$mom_age_level 4 14.8 3.699 1.629 0.165
## Residuals 995 2259.6 2.271
With a p-value equal to 0.165, there is not sufficient evidence to reject the null hypothesis and accept the alternate hypothesis
Null Hypothesis: All true means of birth weight of baby (pounds) for different mom age levels is same
Alternate Hypothesis: Atleast one true mean of birth weight of baby (pounds) for different mom age levels is different
We could do power test to see if there really is an effect to be detected in the sample.
wilcox.test(births_numeric$mom_age_level ~ births_numeric$smoke)
##
## Wilcoxon rank sum test with continuity correction
##
## data: births_numeric$mom_age_level by births_numeric$smoke
## W = 64393, p-value = 0.001428
## alternative hypothesis: true location shift is not equal to 0
With a p-value equal to 0.001428 there is sufficient evidence to reject the null hypothesis and accept the alternate hypothesis
Null hypothesis: True median age for non smoking mothers is not different from true median age for smoking mothers.
Alternate Hypothesis: True median age for non smoking mothers is different from true median age for smoking mothers.
wilcox.test(births_numeric$mom_age_level ~ births_numeric$is_premie)
##
## Wilcoxon rank sum test with continuity correction
##
## data: births_numeric$mom_age_level by births_numeric$is_premie
## W = 65383, p-value = 0.733
## alternative hypothesis: true location shift is not equal to 0
With a p-value equal to 0.733 there is not sufficient evidence to reject the null hypothesis and accept the alternate hypothesis
Null hypothesis: True median age of mothers giving birth to premature babies is not different from true median age of mothers giving birth to fullterm babies
Alternate Hypothesis: True median age of mothers giving birth to premature babies is different from true median age of mothers giving birth to fullterm babies
We could do power test to see if there really is an effect to be detected in the sample.
wilcox.test(births_numeric$mom_age_level ~ births_numeric$low_bwt)
##
## Wilcoxon rank sum test with continuity correction
##
## data: births_numeric$mom_age_level by births_numeric$low_bwt
## W = 49513, p-value = 0.9506
## alternative hypothesis: true location shift is not equal to 0
With a p-value equal to 0.9506 there is not sufficient evidence to reject the null hypothesis and accept the alternate hypothesis
Null hypothesis: True median age of mothers giving birth to babies with low birthweight (pounds) is not different from true median age of mothers giving birth to babies with not low birthweight (pounds)
Alternate Hypothesis: True median age of mothers giving birth to babies with low birthweight (pounds) is different from true median age of mothers giving birth to babies with not low birthweight (pounds)
We could do power test to see if there really is an effect to be detected in the sample.
chisq.test(table(births$mom_white, births$is_premie))
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: table(births$mom_white, births$is_premie)
## X-squared = 5.6307, df = 1, p-value = 0.01765
With a p-value equal to 0.01765, there is enough evidence to reject the null hypothesis and accept the alternate hypothesis
Null Hypothesis: mom_white and is_premie variables are independent
Alternate Hypothesis: mom_white and is_premie variables are not independent
chisq.test(table(births$low_bwt, births$mom_white))
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: table(births$low_bwt, births$mom_white)
## X-squared = 5.9293, df = 1, p-value = 0.01489
With a p-value equal to p-value = 0.01489, there is enough evidence to reject the null hypothesis and accept the alternate hypothesis
Null Hypothesis: low_bwt and mom_white variables are independent
Alternate Hypothesis: low_bwt and mom_white variables are not independent
chisq.test(table(births$maturity, births$smoke))
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: table(births$maturity, births$smoke)
## X-squared = 2.0994, df = 1, p-value = 0.1474
With a p-value equal to p-value = 0.1474, there is not enough evidence to reject the null hypothesis and accept the alternate hypothesis
Null Hypothesis: Maturity and smoke variables are independent
Alternate Hypothesis: Maturity and smoke variables are not independent
Power test for the above chi square test:
cohen.ES(test = "chisq", size = "small")
##
## Conventional effect size from Cohen (1982)
##
## test = chisq
## size = small
## effect.size = 0.1
pwr.chisq.test(w = 0.1, power = 0.9, df = 1, sig.level = 0.05)
##
## Chi squared power calculation
##
## w = 0.1
## N = 1050.742
## df = 1
## sig.level = 0.05
## power = 0.9
##
## NOTE: N is the number of observations
With N=1050.742, we would need a slightly bigger sample size for detecting a small effect if it is there through random sampling alone.
cohen.ES(test = "chisq", size = "medium")
##
## Conventional effect size from Cohen (1982)
##
## test = chisq
## size = medium
## effect.size = 0.3
pwr.chisq.test(w = 0.3, power = 0.9, df = 1, sig.level = 0.05)
##
## Chi squared power calculation
##
## w = 0.3
## N = 116.7491
## df = 1
## sig.level = 0.05
## power = 0.9
##
## NOTE: N is the number of observations
N = 116.7491, for detecting a medium effect if there is any through random sampling alone
chisq.test(table(births$low_bwt, births$marital))
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: table(births$low_bwt, births$marital)
## X-squared = 13.957, df = 1, p-value = 0.0001871
With a p-value equal to p-value = 0.0001871, there is enough evidence to reject the null hypothesis and accept the alternate hypothesis
Null Hypothesis: low_bwt and marital variables are independent
Alternate Hypothesis: low_bwt and marital variables are not independent
chisq.test(table(births$smoke, births$mom_age_level))
##
## Pearson's Chi-squared test
##
## data: table(births$smoke, births$mom_age_level)
## X-squared = 12.648, df = 4, p-value = 0.01313
With a p-value equal to p-value = 0.01313, there is enough evidence to reject the null hypothesis and accept the alternate hypothesis
Null Hypothesis: smoke and mom_age_level variables are independent
Alternate Hypothesis: smoke and mom_age_level variables are not independent
chisq.test(table(births$low_bwt, births$mom_age_level))
##
## Pearson's Chi-squared test
##
## data: table(births$low_bwt, births$mom_age_level)
## X-squared = 3.3818, df = 4, p-value = 0.4961
With a p-value equal to p-value = 0.4961, there is not enough evidence to reject the null hypothesis and accept the alternate hypothesis
Null Hypothesis: low_bwt and mom_age_level variables are independent
Alternate Hypothesis: low_bwt and mom_age_level variables are not independent
Power test for the above chi square test:
cohen.ES(test = "chisq", size = "small")
##
## Conventional effect size from Cohen (1982)
##
## test = chisq
## size = small
## effect.size = 0.1
pwr.chisq.test(w = 0.1, power = 0.9, df = 4, sig.level = 0.05)
##
## Chi squared power calculation
##
## w = 0.1
## N = 1540.505
## df = 4
## sig.level = 0.05
## power = 0.9
##
## NOTE: N is the number of observations
With N=1540.505, we would need a bigger sample size for detecting a small effect if it is there through random sampling alone.
cohen.ES(test = "chisq", size = "medium")
##
## Conventional effect size from Cohen (1982)
##
## test = chisq
## size = medium
## effect.size = 0.3
pwr.chisq.test(w = 0.3, power = 0.9, df = 4, sig.level = 0.05)
##
## Chi squared power calculation
##
## w = 0.3
## N = 171.1672
## df = 4
## sig.level = 0.05
## power = 0.9
##
## NOTE: N is the number of observations
N = 171.1672, for detecting a medium effect if there is any through random sampling alone
fit1 <- lm(bwt ~ dad_age+mom_age+maturity+len_preg+is_premie+num_visits+marital+mom_wt_gain+low_bwt+sex+smoke+mom_white+mom_age_level, data=births)
summary(fit1)
##
## Call:
## lm(formula = bwt ~ dad_age + mom_age + maturity + len_preg +
## is_premie + num_visits + marital + mom_wt_gain + low_bwt +
## sex + smoke + mom_white + mom_age_level, data = births)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.4770 -0.6012 -0.0021 0.5494 3.5563
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.2179472 0.8220275 -0.265 0.79098
## dad_age 0.0080555 0.0077365 1.041 0.29809
## mom_age 0.0090994 0.0218822 0.416 0.67764
## maturityadvanced -0.0731887 0.4196130 -0.174 0.86158
## len_preg 0.1718275 0.0181833 9.450 < 2e-16 ***
## is_premiepremie -0.0442754 0.1440740 -0.307 0.75869
## num_visits 0.0003972 0.0089395 0.044 0.96457
## maritalunmarried -0.1172818 0.0853089 -1.375 0.16959
## mom_wt_gain 0.0071629 0.0023449 3.055 0.00233 **
## low_bwtlow -2.3881336 0.1431235 -16.686 < 2e-16 ***
## sexmale 0.3821920 0.0653943 5.844 7.45e-09 ***
## smokesmoker -0.2143158 0.1068650 -2.005 0.04525 *
## mom_whitewhite 0.2458025 0.0807928 3.042 0.00243 **
## mom_age_levelearly20s 0.0825564 0.1538582 0.537 0.59171
## mom_age_levellate20s 0.0187617 0.2309455 0.081 0.93527
## mom_age_levelearly30s -0.0630545 0.3222707 -0.196 0.84493
## mom_age_level35+ NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9136 on 784 degrees of freedom
## (200 observations deleted due to missingness)
## Multiple R-squared: 0.6055, Adjusted R-squared: 0.598
## F-statistic: 80.24 on 15 and 784 DF, p-value: < 2.2e-16
plot(fit1,1)
Removing mom_age_level variable as it is a dichotomised version of mom_age variable with a high correlation as shown above in the script (0.974049888)
fit2 <- lm(bwt ~ dad_age+mom_age+maturity+len_preg+is_premie+num_visits+marital+mom_wt_gain+low_bwt+sex+smoke+mom_white, data=births)
summary(fit2)
##
## Call:
## lm(formula = bwt ~ dad_age + mom_age + maturity + len_preg +
## is_premie + num_visits + marital + mom_wt_gain + low_bwt +
## sex + smoke + mom_white, data = births)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.4327 -0.6086 -0.0148 0.5625 3.5285
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.0049431 0.7583592 -0.007 0.99480
## dad_age 0.0084262 0.0077106 1.093 0.27481
## mom_age 0.0003153 0.0101018 0.031 0.97511
## maturityadvanced 0.0068293 0.1215602 0.056 0.95521
## len_preg 0.1723625 0.0181543 9.494 < 2e-16 ***
## is_premiepremie -0.0442005 0.1438363 -0.307 0.75870
## num_visits 0.0009585 0.0089045 0.108 0.91431
## maritalunmarried -0.1272476 0.0839416 -1.516 0.12994
## mom_wt_gain 0.0070000 0.0023375 2.995 0.00283 **
## low_bwtlow -2.3906454 0.1428852 -16.731 < 2e-16 ***
## sexmale 0.3803505 0.0652462 5.829 8.11e-09 ***
## smokesmoker -0.2139060 0.1066228 -2.006 0.04518 *
## mom_whitewhite 0.2441426 0.0802893 3.041 0.00244 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9126 on 787 degrees of freedom
## (200 observations deleted due to missingness)
## Multiple R-squared: 0.6049, Adjusted R-squared: 0.5989
## F-statistic: 100.4 on 12 and 787 DF, p-value: < 2.2e-16
plot(fit2,1)
Removing low_bwt variable as it is a dichotomised version of bwt variable with a high correlation as shown above (-0.5441694)
fit3 <- lm(bwt ~ dad_age+mom_age+maturity+len_preg+is_premie+num_visits+marital+mom_wt_gain+sex+smoke+mom_white, data=births)
summary(fit3)
##
## Call:
## lm(formula = bwt ~ dad_age + mom_age + maturity + len_preg +
## is_premie + num_visits + marital + mom_wt_gain + sex + smoke +
## mom_white, data = births)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.7553 -0.6560 -0.0184 0.6780 3.6461
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.833457 0.841302 -4.557 6.02e-06 ***
## dad_age 0.004544 0.008968 0.507 0.612484
## mom_age 0.004895 0.011750 0.417 0.677078
## maturityadvanced -0.053106 0.141387 -0.376 0.707308
## len_preg 0.264976 0.020119 13.171 < 2e-16 ***
## is_premiepremie -0.617364 0.162552 -3.798 0.000157 ***
## num_visits 0.004374 0.010359 0.422 0.672980
## maritalunmarried -0.208628 0.097511 -2.140 0.032699 *
## mom_wt_gain 0.009118 0.002716 3.357 0.000825 ***
## sexmale 0.398622 0.075910 5.251 1.95e-07 ***
## smokesmoker -0.304327 0.123907 -2.456 0.014261 *
## mom_whitewhite 0.290618 0.093369 3.113 0.001922 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.062 on 788 degrees of freedom
## (200 observations deleted due to missingness)
## Multiple R-squared: 0.4644, Adjusted R-squared: 0.457
## F-statistic: 62.12 on 11 and 788 DF, p-value: < 2.2e-16
plot(fit3,1)
Removing maturity variable as it is the most insignificant variable with a p-value of 0.707308 (Significance level of 0.05)
fit4 <- lm(bwt ~ dad_age+mom_age+len_preg+is_premie+num_visits+marital+mom_wt_gain+sex+smoke+mom_white, data=births)
summary(fit4)
##
## Call:
## lm(formula = bwt ~ dad_age + mom_age + len_preg + is_premie +
## num_visits + marital + mom_wt_gain + sex + smoke + mom_white,
## data = births)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.7438 -0.6576 -0.0143 0.6790 3.6576
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.778629 0.828091 -4.563 5.84e-06 ***
## dad_age 0.004543 0.008963 0.507 0.612403
## mom_age 0.002701 0.010190 0.265 0.791002
## len_preg 0.264946 0.020107 13.177 < 2e-16 ***
## is_premiepremie -0.619446 0.162369 -3.815 0.000147 ***
## num_visits 0.004483 0.010349 0.433 0.665026
## maritalunmarried -0.215704 0.095622 -2.256 0.024356 *
## mom_wt_gain 0.009093 0.002714 3.351 0.000844 ***
## sexmale 0.399191 0.075854 5.263 1.83e-07 ***
## smokesmoker -0.304207 0.123840 -2.456 0.014246 *
## mom_whitewhite 0.291236 0.093304 3.121 0.001866 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.061 on 789 degrees of freedom
## (200 observations deleted due to missingness)
## Multiple R-squared: 0.4643, Adjusted R-squared: 0.4575
## F-statistic: 68.39 on 10 and 789 DF, p-value: < 2.2e-16
plot(fit4,1)
Removing mom_age variable as it is the most insignificant variable with a p-value of 0.791002 (Significance level of 0.05)
fit5 <- lm(bwt ~ dad_age+len_preg+is_premie+num_visits+marital+mom_wt_gain+sex+smoke+mom_white, data=births)
summary(fit5)
##
## Call:
## lm(formula = bwt ~ dad_age + len_preg + is_premie + num_visits +
## marital + mom_wt_gain + sex + smoke + mom_white, data = births)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.7360 -0.6565 -0.0124 0.6773 3.6639
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.741310 0.815557 -4.587 5.22e-06 ***
## dad_age 0.006310 0.005989 1.054 0.292364
## len_preg 0.264527 0.020033 13.204 < 2e-16 ***
## is_premiepremie -0.620669 0.162208 -3.826 0.000140 ***
## num_visits 0.004749 0.010294 0.461 0.644700
## maritalunmarried -0.221589 0.092954 -2.384 0.017368 *
## mom_wt_gain 0.009059 0.002709 3.344 0.000865 ***
## sexmale 0.398450 0.075758 5.260 1.86e-07 ***
## smokesmoker -0.306345 0.123504 -2.480 0.013329 *
## mom_whitewhite 0.291292 0.093249 3.124 0.001850 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.061 on 790 degrees of freedom
## (200 observations deleted due to missingness)
## Multiple R-squared: 0.4643, Adjusted R-squared: 0.4582
## F-statistic: 76.07 on 9 and 790 DF, p-value: < 2.2e-16
plot(fit5,1)
Removing num_visits variable as it is the most insignificant variable with a p-value of 0.644700 (Significance level of 0.05)
fit6 <- lm(bwt ~ dad_age+len_preg+is_premie+marital+mom_wt_gain+sex+smoke+mom_white, data=births)
summary(fit6)
##
## Call:
## lm(formula = bwt ~ dad_age + len_preg + is_premie + marital +
## mom_wt_gain + sex + smoke + mom_white, data = births)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.7653 -0.6583 -0.0149 0.6669 3.6883
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.955459 0.790216 -5.006 6.86e-07 ***
## dad_age 0.007065 0.005940 1.189 0.234621
## len_preg 0.270831 0.019219 14.092 < 2e-16 ***
## is_premiepremie -0.606265 0.161020 -3.765 0.000179 ***
## maritalunmarried -0.218020 0.090826 -2.400 0.016605 *
## mom_wt_gain 0.009433 0.002676 3.526 0.000447 ***
## sexmale 0.390730 0.075147 5.200 2.54e-07 ***
## smokesmoker -0.306601 0.123186 -2.489 0.013016 *
## mom_whitewhite 0.288232 0.092247 3.125 0.001845 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.058 on 798 degrees of freedom
## (193 observations deleted due to missingness)
## Multiple R-squared: 0.4828, Adjusted R-squared: 0.4776
## F-statistic: 93.11 on 8 and 798 DF, p-value: < 2.2e-16
plot(fit6,1)
Removing dad_age variable as it is the most insignificant variable with a p-value of 0.234621 (Significance level of 0.05)
fit7 <- lm(bwt ~ len_preg+is_premie+marital+mom_wt_gain+sex+smoke+mom_white, data=births)
summary(fit7)
##
## Call:
## lm(formula = bwt ~ len_preg + is_premie + marital + mom_wt_gain +
## sex + smoke + mom_white, data = births)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.7412 -0.6448 -0.0237 0.6784 4.1543
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.096698 0.687181 -5.962 3.50e-09 ***
## len_preg 0.282229 0.017408 16.212 < 2e-16 ***
## is_premiepremie -0.523456 0.142239 -3.680 0.000246 ***
## maritalunmarried -0.247769 0.075388 -3.287 0.001051 **
## mom_wt_gain 0.008355 0.002427 3.442 0.000602 ***
## sexmale 0.396837 0.068709 5.776 1.03e-08 ***
## smokesmoker -0.383683 0.104092 -3.686 0.000241 ***
## mom_whitewhite 0.218897 0.080821 2.708 0.006880 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.064 on 962 degrees of freedom
## (30 observations deleted due to missingness)
## Multiple R-squared: 0.494, Adjusted R-squared: 0.4903
## F-statistic: 134.2 on 7 and 962 DF, p-value: < 2.2e-16
plot(fit7)
bwt = 0.282229 * len_preg + (-0.523456) * is_premiepremie + (-0.247769) * maritalunmarried + 0.008355 * mom_wt_gain + 0.396837 * sexmale + (-0.383683) * smokesmoker + 0.218897 * mom_whitewhite + (-4.096698)
For every one week increase in length of pregnancy (len_preg variable), the birthweight of baby (bwt variable) increases by 0.282229 pounds on average when all other variables in the model are held constant.
For premature (is_premiepremie variable) as compared with fullterm, birthweight of baby (bwt variable) decreases by 0.523456 pounds on average when all other variables in the model are held constant.
For ummarried mother at time of birth (maritalunmarried variable) as compared with married mother at time of birth, birthweight of baby (bwt variable) decreases by 0.247769 pounds on average when all other variables in the model are held constant.
For every one pound increase in mother’s weight during pregnancy (mom_wt_gain variable), the birthweight of baby (bwt variable) increases by 0.008355 pounds on average when all other variables in the model are held constant.
For male baby (sexmale variable) as compared with female baby, birthweight of baby (bwt variable) increases by 0.396837 pounds on average when all other variables in the model are held constant.
For white mother (mom_whitewhite variable) as compared with nonwhite mother, birthweight of baby (bwt variable) increases by 0.218897 pounds on average when all other variables in the model are held constant.
Even though is_premie variable is a dichotomised version of len_preg variable and has a high correlation, both the variables are present in the final model. Removing either one of the variable, resulted in a slightly worse Adjusted R-squared (0.4837) and the value of fitted coefficients were around same and hence this final model has been chosen.
fit_all <- glm(low_bwt ~ ., family = binomial(), data = births)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(fit_all)
##
## Call:
## glm(formula = low_bwt ~ ., family = binomial(), data = births)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.431e-04 -2.100e-08 -2.100e-08 -2.100e-08 2.778e-04
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 9.548e+02 1.458e+05 0.007 0.995
## dad_age 3.359e+00 9.207e+02 0.004 0.997
## mom_age 3.718e+00 2.668e+03 0.001 0.999
## maturityadvanced -1.473e+02 6.438e+04 -0.002 0.998
## len_preg -3.282e+00 2.690e+03 -0.001 0.999
## is_premiepremie -2.840e+01 1.373e+04 -0.002 0.998
## num_visits 1.576e+00 1.079e+03 0.001 0.999
## maritalunmarried 1.859e+01 6.864e+03 0.003 0.998
## mom_wt_gain 5.582e-02 2.668e+02 0.000 1.000
## bwt -1.758e+02 1.299e+04 -0.014 0.989
## sexmale -5.662e+00 5.273e+03 -0.001 0.999
## smokesmoker 6.726e+00 6.180e+03 0.001 0.999
## mom_whitewhite 1.349e+01 7.363e+03 0.002 0.999
## mom_age_levelearly20s -3.415e+01 1.116e+04 -0.003 0.998
## mom_age_levellate20s -9.080e+01 2.875e+04 -0.003 0.997
## mom_age_levelearly30s -1.363e+02 4.292e+04 -0.003 0.997
## mom_age_level35+ NA NA NA NA
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4.9325e+02 on 799 degrees of freedom
## Residual deviance: 4.5427e-07 on 784 degrees of freedom
## (200 observations deleted due to missingness)
## AIC: 32
##
## Number of Fisher Scoring iterations: 25
Warnings show a case of perfect linear separation.
pR2(fit_all)
## llh llhNull G2 McFadden r2ML
## -2.271326e-07 -3.486010e+02 6.972020e+02 1.000000e+00 5.816774e-01
## r2CU
## 1.000000e+00
It shows a perfect fit but the p-values of the fitted coefficients showed that the fitted coefficients are not significant.
McFadden = 1.000000e+00
r2CU = 1.000000e+00
exp(coef(fit_all))
## (Intercept) dad_age mom_age
## Inf 2.874839e+01 4.117821e+01
## maturityadvanced len_preg is_premiepremie
## 1.101887e-64 3.753741e-02 4.627576e-13
## num_visits maritalunmarried mom_wt_gain
## 4.833817e+00 1.181766e+08 1.057407e+00
## bwt sexmale smokesmoker
## 4.705249e-77 3.473805e-03 8.336749e+02
## mom_whitewhite mom_age_levelearly20s mom_age_levellate20s
## 7.219851e+05 1.479959e-15 3.689081e-40
## mom_age_levelearly30s mom_age_level35+
## 6.404289e-60 NA
The Odds Ratio for variables bwt and mom_age_level (mom_age_levelearly20s, mom_age_levellate20s, mom_age_levelearly30s, mom_age_level35+) is too large.
vif(fit_all)
Error: There are aliased coefficients in the model
c1 <- cor(x = births_numeric$bwt, y = births_numeric$low_bwt, use = "pairwise.complete.obs", method = "spearman")
c1
## [1] -0.5441694
Removing bwt variable as it has high negative correlation with low_bwt variable (-0.5441694) and low_bwt variable is dichotomised version of bwt variables
fit_all2 <- glm(low_bwt ~ dad_age+mom_age+maturity+len_preg+is_premie+num_visits+marital+mom_wt_gain+sex+smoke+mom_white+mom_age_level, family = binomial(), data = births)
summary(fit_all2)
##
## Call:
## glm(formula = low_bwt ~ dad_age + mom_age + maturity + len_preg +
## is_premie + num_visits + marital + mom_wt_gain + sex + smoke +
## mom_white + mom_age_level, family = binomial(), data = births)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6657 -0.2811 -0.1827 -0.1100 3.5119
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 24.79866 5.46594 4.537 5.71e-06 ***
## dad_age 0.04050 0.03957 1.024 0.306
## mom_age -0.10523 0.11832 -0.889 0.374
## maturityadvanced 1.68805 2.22231 0.760 0.447
## len_preg -0.69667 0.13195 -5.280 1.29e-07 ***
## is_premiepremie 0.35991 0.58387 0.616 0.538
## num_visits 0.01975 0.04143 0.477 0.634
## maritalunmarried 0.51847 0.42012 1.234 0.217
## mom_wt_gain -0.01940 0.01314 -1.476 0.140
## sexmale -0.25239 0.34547 -0.731 0.465
## smokesmoker 0.74374 0.45724 1.627 0.104
## mom_whitewhite -0.33757 0.40330 -0.837 0.403
## mom_age_levelearly20s 0.30859 0.77343 0.399 0.690
## mom_age_levellate20s 0.39519 1.23468 0.320 0.749
## mom_age_levelearly30s 0.94004 1.73235 0.543 0.587
## mom_age_level35+ NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 493.25 on 799 degrees of freedom
## Residual deviance: 262.63 on 785 degrees of freedom
## (200 observations deleted due to missingness)
## AIC: 292.63
##
## Number of Fisher Scoring iterations: 7
pR2(fit_all2)
## llh llhNull G2 McFadden r2ML r2CU
## -131.3145492 -348.6009843 434.5728701 0.6233099 0.4191229 0.7205418
vif(fit_all2) Error still showing
c2 <- cor(x=births_numeric$mom_age_level, y=births_numeric$mom_age, use = "pairwise.complete.obs", method = "spearman")
c2
## [1] 0.9740499
Hence removing mom_age_level variable, since it is a dichotomised version of mom_age # and has a very high positive correlation as shown (0.9740499)
fit_all3 <- glm(low_bwt ~ dad_age+mom_age+maturity+len_preg+is_premie+num_visits+marital+mom_wt_gain+sex+smoke+mom_white, family = binomial(), data = births)
summary(fit_all3)
##
## Call:
## glm(formula = low_bwt ~ dad_age + mom_age + maturity + len_preg +
## is_premie + num_visits + marital + mom_wt_gain + sex + smoke +
## mom_white, family = binomial(), data = births)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.7338 -0.2814 -0.1824 -0.1110 3.4843
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 24.15270 5.20897 4.637 3.54e-06 ***
## dad_age 0.03820 0.03926 0.973 0.3306
## mom_age -0.05194 0.05300 -0.980 0.3271
## maturityadvanced 0.64422 0.62817 1.026 0.3051
## len_preg -0.69969 0.13248 -5.282 1.28e-07 ***
## is_premiepremie 0.33354 0.58311 0.572 0.5673
## num_visits 0.01745 0.04098 0.426 0.6702
## maritalunmarried 0.47216 0.40792 1.157 0.2471
## mom_wt_gain -0.02004 0.01311 -1.529 0.1263
## sexmale -0.26192 0.34342 -0.763 0.4457
## smokesmoker 0.76346 0.45429 1.681 0.0928 .
## mom_whitewhite -0.33514 0.39941 -0.839 0.4014
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 493.25 on 799 degrees of freedom
## Residual deviance: 263.19 on 788 degrees of freedom
## (200 observations deleted due to missingness)
## AIC: 287.19
##
## Number of Fisher Scoring iterations: 7
pR2(fit_all3)
## llh llhNull G2 McFadden r2ML r2CU
## -131.5953141 -348.6009843 434.0113404 0.6225045 0.4187150 0.7198406
vif(fit_all3)
## dad_age mom_age maturity len_preg is_premie num_visits
## 2.880344 4.175761 2.028880 3.033071 3.035933 1.137955
## marital mom_wt_gain sex smoke mom_white
## 1.339829 1.051122 1.053447 1.051109 1.163147
No warning message of linear separation is found anymore
VIF is < 5
McFadden and r2CU values are not equal to 1
exp(cbind(OR=coef(fit_all3), confint(fit_all3)))
## Waiting for profiling to be done...
## OR 2.5 % 97.5 %
## (Intercept) 3.085906e+10 2.328560e+06 1.744716e+15
## dad_age 1.038934e+00 9.595182e-01 1.119562e+00
## mom_age 9.493872e-01 8.550548e-01 1.053308e+00
## maturityadvanced 1.904506e+00 5.465061e-01 6.524037e+00
## len_preg 4.967378e-01 3.752908e-01 6.308689e-01
## is_premiepremie 1.395895e+00 4.360910e-01 4.307565e+00
## num_visits 1.017606e+00 9.370674e-01 1.100695e+00
## maritalunmarried 1.603449e+00 7.119340e-01 3.550942e+00
## mom_wt_gain 9.801595e-01 9.547152e-01 1.005131e+00
## sexmale 7.695744e-01 3.884274e-01 1.503356e+00
## smokesmoker 2.145690e+00 8.509763e-01 5.112089e+00
## mom_whitewhite 7.152374e-01 3.305702e-01 1.595971e+00
Removing num_visits variable as it is the most insignificant variable with p-value equal to 0.6702 (Significance level = 0.05)
fit_all4 <- glm(low_bwt ~ dad_age+mom_age+maturity+len_preg+is_premie+marital+mom_wt_gain+sex+smoke+mom_white, family = binomial(), data = births)
summary(fit_all4)
##
## Call:
## glm(formula = low_bwt ~ dad_age + mom_age + maturity + len_preg +
## is_premie + marital + mom_wt_gain + sex + smoke + mom_white,
## family = binomial(), data = births)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.7814 -0.2812 -0.1829 -0.1116 3.4334
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 24.10001 5.19906 4.635 3.56e-06 ***
## dad_age 0.03759 0.03939 0.954 0.3400
## mom_age -0.05063 0.05286 -0.958 0.3382
## maturityadvanced 0.66123 0.62709 1.054 0.2917
## len_preg -0.69287 0.13070 -5.301 1.15e-07 ***
## is_premiepremie 0.35156 0.58223 0.604 0.5460
## maritalunmarried 0.43629 0.40243 1.084 0.2783
## mom_wt_gain -0.01992 0.01294 -1.539 0.1237
## sexmale -0.27373 0.34250 -0.799 0.4242
## smokesmoker 0.75881 0.45405 1.671 0.0947 .
## mom_whitewhite -0.33746 0.39851 -0.847 0.3971
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 503.72 on 806 degrees of freedom
## Residual deviance: 263.65 on 796 degrees of freedom
## (193 observations deleted due to missingness)
## AIC: 285.65
##
## Number of Fisher Scoring iterations: 7
pR2(fit_all4)
## llh llhNull G2 McFadden r2ML r2CU
## -131.8265837 -348.6009843 433.5488012 0.6218410 0.4156383 0.7184720
vif(fit_all4)
## dad_age mom_age maturity len_preg is_premie marital
## 2.879465 4.134529 2.018674 2.964980 3.024291 1.305635
## mom_wt_gain sex smoke mom_white
## 1.038430 1.047072 1.047911 1.157161
Removing is_premie variable as it is the most insignificant variable with p-value equal to 0.5460 (Significance level = 0.05)
fit_all5 <- glm(low_bwt ~ dad_age+mom_age+maturity+len_preg+marital+mom_wt_gain+sex+smoke+mom_white, family = binomial(), data = births)
summary(fit_all5)
##
## Call:
## glm(formula = low_bwt ~ dad_age + mom_age + maturity + len_preg +
## marital + mom_wt_gain + sex + smoke + mom_white, family = binomial(),
## data = births)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8231 -0.2876 -0.1821 -0.1065 3.5148
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 26.67656 3.14293 8.488 <2e-16 ***
## dad_age 0.03599 0.03916 0.919 0.3580
## mom_age -0.05013 0.05270 -0.951 0.3415
## maturityadvanced 0.68319 0.62472 1.094 0.2741
## len_preg -0.75773 0.07950 -9.531 <2e-16 ***
## maritalunmarried 0.42427 0.40273 1.053 0.2921
## mom_wt_gain -0.02093 0.01282 -1.633 0.1026
## sexmale -0.25607 0.34075 -0.752 0.4524
## smokesmoker 0.75689 0.45296 1.671 0.0947 .
## mom_whitewhite -0.32764 0.39728 -0.825 0.4095
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 503.72 on 806 degrees of freedom
## Residual deviance: 264.01 on 797 degrees of freedom
## (193 observations deleted due to missingness)
## AIC: 284.01
##
## Number of Fisher Scoring iterations: 7
pR2(fit_all5)
## llh llhNull G2 McFadden r2ML r2CU
## -132.0069775 -348.6009843 433.1880136 0.6213236 0.4153770 0.7180203
Removing sex variable as it is the most insignificant variable with p-value equal to 0.4524 (Significance level = 0.05)
fit_all6 <- glm(low_bwt ~ dad_age+mom_age+maturity+len_preg+marital+mom_wt_gain+smoke+mom_white, family = binomial(), data = births)
summary(fit_all6)
##
## Call:
## glm(formula = low_bwt ~ dad_age + mom_age + maturity + len_preg +
## marital + mom_wt_gain + smoke + mom_white, family = binomial(),
## data = births)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.7466 -0.2847 -0.1825 -0.1097 3.5446
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 26.26630 3.05920 8.586 <2e-16 ***
## dad_age 0.03360 0.03903 0.861 0.389
## mom_age -0.04590 0.05249 -0.874 0.382
## maturityadvanced 0.65276 0.62248 1.049 0.294
## len_preg -0.75172 0.07846 -9.581 <2e-16 ***
## maritalunmarried 0.43878 0.40339 1.088 0.277
## mom_wt_gain -0.02039 0.01273 -1.602 0.109
## smokesmoker 0.73449 0.45202 1.625 0.104
## mom_whitewhite -0.31376 0.39631 -0.792 0.429
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 503.72 on 806 degrees of freedom
## Residual deviance: 264.58 on 798 degrees of freedom
## (193 observations deleted due to missingness)
## AIC: 282.58
##
## Number of Fisher Scoring iterations: 7
pR2(fit_all6)
## llh llhNull G2 McFadden r2ML r2CU
## -132.2911031 -348.6009843 432.6197624 0.6205085 0.4149652 0.7173085
Removing mom_white variable as it is the most insignificant variable with p-value equal to 0.429 (Significance level = 0.05)
fit_all6 <- glm(low_bwt ~ dad_age+mom_age+maturity+len_preg+marital+mom_wt_gain+smoke, family = binomial(), data = births)
summary(fit_all6)
##
## Call:
## glm(formula = low_bwt ~ dad_age + mom_age + maturity + len_preg +
## marital + mom_wt_gain + smoke, family = binomial(), data = births)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.7954 -0.2909 -0.1823 -0.1128 3.4917
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 26.16053 3.06171 8.544 <2e-16 ***
## dad_age 0.03473 0.03894 0.892 0.3725
## mom_age -0.05262 0.05195 -1.013 0.3111
## maturityadvanced 0.69194 0.62123 1.114 0.2654
## len_preg -0.75095 0.07827 -9.595 <2e-16 ***
## maritalunmarried 0.49784 0.39582 1.258 0.2085
## mom_wt_gain -0.02121 0.01276 -1.663 0.0964 .
## smokesmoker 0.70157 0.44951 1.561 0.1186
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 504.12 on 808 degrees of freedom
## Residual deviance: 265.24 on 801 degrees of freedom
## (191 observations deleted due to missingness)
## AIC: 281.24
##
## Number of Fisher Scoring iterations: 7
pR2(fit_all6)
## llh llhNull G2 McFadden r2ML r2CU
## -132.6180444 -348.6009843 431.9658796 0.6195707 0.4137157 0.7162643
Removing dad_age variable as it is the most insignificant variable with p-value equal to 0.3725 (Significance level = 0.05)
fit_all7 <- glm(low_bwt ~ mom_age+maturity+len_preg+marital+mom_wt_gain+smoke, family = binomial(), data = births)
summary(fit_all7)
##
## Call:
## glm(formula = low_bwt ~ mom_age + maturity + len_preg + marital +
## mom_wt_gain + smoke, family = binomial(), data = births)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8705 -0.3379 -0.2146 -0.1384 3.4401
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 22.979044 2.438407 9.424 <2e-16 ***
## mom_age 0.001178 0.031688 0.037 0.9704
## maturityadvanced 0.285770 0.538977 0.530 0.5960
## len_preg -0.680030 0.061842 -10.996 <2e-16 ***
## maritalunmarried 0.653076 0.306767 2.129 0.0333 *
## mom_wt_gain -0.012138 0.010231 -1.186 0.2354
## smokesmoker 0.597802 0.359837 1.661 0.0967 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 661.33 on 971 degrees of freedom
## Residual deviance: 380.75 on 965 degrees of freedom
## (28 observations deleted due to missingness)
## AIC: 394.75
##
## Number of Fisher Scoring iterations: 6
pR2(fit_all7)
## llh llhNull G2 McFadden r2ML r2CU
## -190.3740946 -348.6009843 316.4537793 0.4538911 0.2778842 0.5428222
Removing mom_age variable as it is the most insignificant variable with p-value equal to 0.9704 (Significance level = 0.05)
fit_all8 <- glm(low_bwt ~ maturity+len_preg+marital+mom_wt_gain+smoke, family = binomial(), data = births)
summary(fit_all8)
##
## Call:
## glm(formula = low_bwt ~ maturity + len_preg + marital + mom_wt_gain +
## smoke, family = binomial(), data = births)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8708 -0.3387 -0.2143 -0.1382 3.4388
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 23.01361 2.25468 10.207 <2e-16 ***
## maturityadvanced 0.29902 0.40417 0.740 0.4594
## len_preg -0.68009 0.06182 -11.001 <2e-16 ***
## maritalunmarried 0.64856 0.28165 2.303 0.0213 *
## mom_wt_gain -0.01216 0.01022 -1.190 0.2341
## smokesmoker 0.59728 0.35952 1.661 0.0966 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 661.33 on 971 degrees of freedom
## Residual deviance: 380.75 on 966 degrees of freedom
## (28 observations deleted due to missingness)
## AIC: 392.75
##
## Number of Fisher Scoring iterations: 6
pR2(fit_all8)
## llh llhNull G2 McFadden r2ML r2CU
## -190.3747849 -348.6009843 316.4523988 0.4538891 0.2778832 0.5428202
Removing maturity variable as it is the most insignificant variable with p-value equal to 0.4594 (Significance level = 0.05)
fit_all9 <- glm(low_bwt ~ len_preg+marital+mom_wt_gain+smoke, family = binomial(), data = births)
summary(fit_all9)
##
## Call:
## glm(formula = low_bwt ~ len_preg + marital + mom_wt_gain + smoke,
## family = binomial(), data = births)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8805 -0.3293 -0.2149 -0.1401 3.4303
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 23.04652 2.24313 10.274 <2e-16 ***
## len_preg -0.67944 0.06148 -11.052 <2e-16 ***
## maritalunmarried 0.61950 0.27820 2.227 0.0260 *
## mom_wt_gain -0.01225 0.01019 -1.202 0.2292
## smokesmoker 0.59078 0.35904 1.645 0.0999 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 661.33 on 971 degrees of freedom
## Residual deviance: 381.28 on 967 degrees of freedom
## (28 observations deleted due to missingness)
## AIC: 391.28
##
## Number of Fisher Scoring iterations: 6
pR2(fit_all9)
## llh llhNull G2 McFadden r2ML r2CU
## -190.6387935 -348.6009843 315.9243815 0.4531318 0.2774908 0.5420537
Removing mom_wt_gain variable as it is the most insignificant variable with p-value equal to 0.2292 (Significance level = 0.05)
fit_all10 <- glm(low_bwt ~ len_preg+marital+smoke, family = binomial(), data = births)
summary(fit_all10)
##
## Call:
## glm(formula = low_bwt ~ len_preg + marital + smoke, family = binomial(),
## data = births)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.9426 -0.2910 -0.2058 -0.1408 3.4692
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 23.58983 2.24347 10.515 <2e-16 ***
## len_preg -0.70344 0.06145 -11.447 <2e-16 ***
## maritalunmarried 0.64288 0.27224 2.361 0.0182 *
## smokesmoker 0.52240 0.35765 1.461 0.1441
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 692.56 on 997 degrees of freedom
## Residual deviance: 394.05 on 994 degrees of freedom
## (2 observations deleted due to missingness)
## AIC: 402.05
##
## Number of Fisher Scoring iterations: 6
pR2(fit_all10)
## llh llhNull G2 McFadden r2ML r2CU
## -197.0256110 -348.6009843 303.1507466 0.4348105 0.2619607 0.5210883
Removing smoke variable as it is the most insignificant variable with p-value equal to 0.1441 (Significance level = 0.05)
fit_all11 <- glm(low_bwt ~ len_preg+marital, family = binomial(), data = births)
summary(fit_all11)
##
## Call:
## glm(formula = low_bwt ~ len_preg + marital, family = binomial(),
## data = births)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.9834 -0.3012 -0.2132 -0.1479 3.4403
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 23.63838 2.24101 10.548 <2e-16 ***
## len_preg -0.70286 0.06131 -11.464 <2e-16 ***
## maritalunmarried 0.66923 0.27075 2.472 0.0134 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 692.56 on 997 degrees of freedom
## Residual deviance: 396.08 on 995 degrees of freedom
## (2 observations deleted due to missingness)
## AIC: 402.08
##
## Number of Fisher Scoring iterations: 6
pR2(fit_all11)
## llh llhNull G2 McFadden r2ML r2CU
## -198.0396885 -348.6009843 301.1225915 0.4319015 0.2604594 0.5181017
exp(cbind(OR=coef(fit_all11), confint(fit_all11)))
## Waiting for profiling to be done...
## OR 2.5 % 97.5 %
## (Intercept) 1.845089e+10 2.798761e+08 1.854117e+12
## len_preg 4.951682e-01 4.363693e-01 5.551420e-01
## maritalunmarried 1.952728e+00 1.149141e+00 3.333546e+00
exp(coef(fit_all11))
## (Intercept) len_preg maritalunmarried
## 1.845089e+10 4.951682e-01 1.952728e+00
plot(fit_all11)
McFadden: 0.4319015
r2CU: 0.5181017
logit(low_bwt) = -0.70286 * (len_preg) + 0.66923 * (maritalunmarried) + 23.63838
For every one-unit increase in length of pregnancy (weeks), the odds of having baby with low birthweight (pounds) is multiplied by exp(-0.70286)= 4.951682e-01 (ie: a decrease of 49.5%) on average, when all other variables in the model are held constant.
For umarried mother at time of birth as compared with married mother at time of birth, the odds of having baby with low birthweight (pounds) is multiplied by exp(0.66923)=1.952728 (ie: an increase of 95%) on average when all other variables in the model are held constant.
Creating new dataframe without dad_age column as it has lots of missing information:
birth_2 <- births[,2:14]
birth_2 <- na.omit(birth_2)
summary(birth_2)
## mom_age maturity len_preg is_premie num_visits
## Min. :13.00 younger :835 Min. :20.0 fullterm:819 Min. : 0.0
## 1st Qu.:22.00 advanced:127 1st Qu.:37.0 premie :143 1st Qu.:10.0
## Median :27.00 Median :39.0 Median :12.0
## Mean :27.01 Mean :38.4 Mean :12.2
## 3rd Qu.:32.00 3rd Qu.:40.0 3rd Qu.:15.0
## Max. :50.00 Max. :45.0 Max. :30.0
## marital mom_wt_gain bwt low_bwt sex
## married :599 Min. : 0.00 Min. : 1.000 notlow:860 female:483
## unmarried:363 1st Qu.:20.00 1st Qu.: 6.380 low :102 male :479
## Median :30.00 Median : 7.310
## Mean :30.34 Mean : 7.128
## 3rd Qu.:38.00 3rd Qu.: 8.060
## Max. :85.00 Max. :11.750
## smoke mom_white mom_age_level
## nonsmoker:841 nonwhite:272 teens :103
## smoker :121 white :690 early20s:269
## late20s :250
## early30s:213
## 35+ :127
##
fit_null <- glm(low_bwt ~ 1, family = binomial(), data = birth_2)
summary(fit_null)
##
## Call:
## glm(formula = low_bwt ~ 1, family = binomial(), data = birth_2)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.4735 -0.4735 -0.4735 -0.4735 2.1185
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.1320 0.1047 -20.36 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 650.57 on 961 degrees of freedom
## Residual deviance: 650.57 on 961 degrees of freedom
## AIC: 652.57
##
## Number of Fisher Scoring iterations: 4
Removing mom_age_level variable, since it is a dichotomised version of mom_age and has a very high positive correlation as shown (0.9740499)
Removing bwt variable as it has high negative correlation with low_bwt variable (-0.5441694) and low_bwt variable is dichotomised version of bwt variable
fit_main <- glm(low_bwt ~ mom_age+len_preg+num_visits+mom_wt_gain+maturity+is_premie+smoke+marital+mom_white+sex, family = binomial(), data = birth_2)
summary(fit_main)
##
## Call:
## glm(formula = low_bwt ~ mom_age + len_preg + num_visits + mom_wt_gain +
## maturity + is_premie + smoke + marital + mom_white + sex,
## family = binomial(), data = birth_2)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8322 -0.3232 -0.2171 -0.1328 3.4597
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 21.013120 4.152910 5.060 4.2e-07 ***
## mom_age -0.002514 0.032108 -0.078 0.9376
## len_preg -0.638696 0.105039 -6.081 1.2e-09 ***
## num_visits 0.040347 0.032789 1.230 0.2185
## mom_wt_gain -0.012158 0.010494 -1.159 0.2466
## maturityadvanced 0.275132 0.543351 0.506 0.6126
## is_premiepremie 0.340374 0.481983 0.706 0.4801
## smokesmoker 0.698477 0.365566 1.911 0.0560 .
## maritalunmarried 0.729563 0.325191 2.243 0.0249 *
## mom_whitewhite 0.034356 0.318194 0.108 0.9140
## sexmale -0.299270 0.283064 -1.057 0.2904
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 650.57 on 961 degrees of freedom
## Residual deviance: 376.60 on 951 degrees of freedom
## AIC: 398.6
##
## Number of Fisher Scoring iterations: 6
fit_step1 <- step(fit_null, scope=list(lower=fit_null, upper=fit_main),direction="both")
## Start: AIC=652.57
## low_bwt ~ 1
##
## Df Deviance AIC
## + len_preg 1 390.59 394.59
## + is_premie 1 455.53 459.53
## + marital 1 636.71 640.71
## + num_visits 1 640.42 644.42
## + mom_wt_gain 1 640.42 644.42
## + mom_white 1 646.24 650.24
## + smoke 1 648.12 652.12
## <none> 650.57 652.57
## + maturity 1 649.44 653.44
## + sex 1 650.22 654.22
## + mom_age 1 650.56 654.56
##
## Step: AIC=394.59
## low_bwt ~ len_preg
##
## Df Deviance AIC
## + marital 1 384.26 390.26
## + smoke 1 387.41 393.41
## <none> 390.59 394.59
## + mom_wt_gain 1 388.72 394.72
## + sex 1 389.71 395.71
## + is_premie 1 389.91 395.91
## + mom_white 1 390.11 396.11
## + mom_age 1 390.34 396.34
## + num_visits 1 390.37 396.37
## + maturity 1 390.46 396.46
## - len_preg 1 650.57 652.57
##
## Step: AIC=390.26
## low_bwt ~ len_preg + marital
##
## Df Deviance AIC
## + smoke 1 381.67 389.67
## <none> 384.26 390.26
## + mom_wt_gain 1 383.00 391.00
## + num_visits 1 383.00 391.00
## + sex 1 383.42 391.42
## + is_premie 1 383.75 391.75
## + maturity 1 383.76 391.76
## + mom_age 1 383.99 391.99
## + mom_white 1 384.24 392.24
## - marital 1 390.59 394.59
## - len_preg 1 636.71 640.71
##
## Step: AIC=389.67
## low_bwt ~ len_preg + marital + smoke
##
## Df Deviance AIC
## <none> 381.67 389.67
## + num_visits 1 380.09 390.09
## - smoke 1 384.26 390.26
## + mom_wt_gain 1 380.30 390.30
## + sex 1 380.56 390.56
## + is_premie 1 381.05 391.05
## + maturity 1 381.12 391.12
## + mom_age 1 381.33 391.33
## + mom_white 1 381.67 391.67
## - marital 1 387.41 393.41
## - len_preg 1 635.35 641.35
summary(fit_step1)
##
## Call:
## glm(formula = low_bwt ~ len_preg + marital + smoke, family = binomial(),
## data = birth_2)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.9055 -0.2905 -0.2072 -0.1457 3.4396
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 22.94011 2.24980 10.196 <2e-16 ***
## len_preg -0.68642 0.06158 -11.146 <2e-16 ***
## maritalunmarried 0.66335 0.27683 2.396 0.0166 *
## smokesmoker 0.59549 0.35969 1.656 0.0978 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 650.57 on 961 degrees of freedom
## Residual deviance: 381.67 on 958 degrees of freedom
## AIC: 389.67
##
## Number of Fisher Scoring iterations: 6
Comment:
Even if we keep mom_age_level variable as the predictor variable for step function, we would still be getting the same results (AIC, significant variables) as we got without mom_age_level variable. This result of the step function is found to be consistent with my final logistic model (with same significant variables and similar fitted coefficients) except for the smoke variable which is not a significant variable in my final model as its p-value is greater than significance level of 0.05