Dataset variables (1000 births):

dad_age : age of father of baby (years)
mom_age : age of mother of baby (years)
maturity : classify mother as of advanced maternal age or not (advanced / younger)
len_preg : length of pregnancy (weeks)
is_premie : classify baby as either premature or full-term (premie / fullterm)
num_visits : number of visits to hospital during pregnancy
marital : marital status of mother at time of birth (married / unmarried)
mom_wt_gain : mother’s weight gain during pregnancy (pounds)
bwt : birth weight of baby (pounds)
low_bwt : classify baby as either low birthweight or not (low / notlow)
sex : sex of baby (female / male) smoke : smoking status of mother (smoker / nonsmoker)
mom_white : classify mother as either white or not (white / nonwhite)
mom_age_level : age level of mother of baby (teens, early20s, late20s, early30s, 35+)

library(car) 
## Loading required package: carData
library("pwr")
library(pscl)
## Classes and Methods for R developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University
## Simon Jackman
## hurdle and zeroinfl functions by Achim Zeileis

Loading data:

setwd("C:/Users/DELL/Desktop/HS 631 - Statistical Computing")
load(file = "births.Rdat")

Exploratory Data Analysis:

Checking if data needs to be cleaned:

summary(births)
##     dad_age         mom_age      maturity      len_preg         is_premie  
##  Min.   : 14.0   Min.   :13   advnced:133   Min.   : 20.00   fullterm:846  
##  1st Qu.: 26.0   1st Qu.:22   younger:867   1st Qu.: 37.00   premie  :152  
##  Median : 32.0   Median :27                 Median : 39.00   unknown :  2  
##  Mean   :195.9   Mean   :27                 Mean   : 40.26                 
##  3rd Qu.: 39.0   3rd Qu.:32                 3rd Qu.: 40.00                 
##  Max.   :999.0   Max.   :50                 Max.   :999.00                 
##    num_visits          marital     mom_wt_gain          bwt        
##  Min.   :  0.00   married  :613   Min.   :  0.00   Min.   : 1.000  
##  1st Qu.: 10.00   unknown  :  1   1st Qu.: 21.00   1st Qu.: 6.380  
##  Median : 12.00   unmarried:386   Median : 30.00   Median : 7.310  
##  Mean   : 20.99                   Mean   : 56.48   Mean   : 7.101  
##  3rd Qu.: 15.00                   3rd Qu.: 40.00   3rd Qu.: 8.060  
##  Max.   :999.00                   Max.   :999.00   Max.   :11.750  
##    low_bwt        sex            smoke        mom_white    mom_age_level
##  low   :111   female:503   nonsmoker:873   nonwhite:284   35+     :133  
##  notlow:889   male  :497   smoker   :126   unknown :  2   early20s:281  
##                            unknown  :  1   white   :714   early30s:219  
##                                                           late20s :257  
##                                                           teens   :110  
## 

Cleaning data i.e. setting impossible values to NA:

births$dad_age[births$dad_age == 999] <- NA
births$len_preg[births$len_preg == 999] <- NA
births$num_visits[births$num_visits == 999] <- NA
births$mom_wt_gain[births$mom_wt_gain == 999] <- NA
births$is_premie[births$is_premie == "unknown"] <- NA
births$marital[births$marital == "unknown"] <- NA
births$smoke[births$smoke == "unknown"] <- NA
births$mom_white[births$mom_white == "unknown"] <- NA

Reordering levels of variables and Renaming level advnced to advanced in maturity variable:

levels(births$maturity)[levels(births$maturity)=="advnced"] <- "advanced"
maturity_level <- c("younger","advanced")
births$maturity <- factor(births$maturity, levels = maturity_level)
is_premie_level <- c("fullterm","premie")
births$is_premie <- factor(births$is_premie, levels = is_premie_level)
low_bwt_level <- c("notlow","low")
births$low_bwt <- factor(births$low_bwt, levels = low_bwt_level)
smoke_level <- c("nonsmoker","smoker")
births$smoke <- factor(births$smoke, levels = smoke_level)
lev <- c("teens","early20s","late20s","early30s","35+")
births$mom_age_level <- factor(births$mom_age_level, levels = lev)

Dropping unused levels:

births$marital <- droplevels(births$marital)
births$mom_white <- droplevels(births$mom_white)

Cleaned data:

summary(births)
##     dad_age         mom_age       maturity      len_preg        is_premie  
##  Min.   :14.00   Min.   :13   younger :867   Min.   :20.00   fullterm:846  
##  1st Qu.:25.00   1st Qu.:22   advanced:133   1st Qu.:37.00   premie  :152  
##  Median :30.00   Median :27                  Median :39.00   NA's    :  2  
##  Mean   :30.26   Mean   :27                  Mean   :38.33                 
##  3rd Qu.:35.00   3rd Qu.:32                  3rd Qu.:40.00                 
##  Max.   :55.00   Max.   :50                  Max.   :45.00                 
##  NA's   :171                                 NA's   :2                     
##    num_visits        marital     mom_wt_gain         bwt           low_bwt   
##  Min.   : 0.0   married  :613   Min.   : 0.00   Min.   : 1.000   notlow:889  
##  1st Qu.:10.0   unmarried:386   1st Qu.:20.00   1st Qu.: 6.380   low   :111  
##  Median :12.0   NA's     :  1   Median :30.00   Median : 7.310               
##  Mean   :12.1                   Mean   :30.33   Mean   : 7.101               
##  3rd Qu.:15.0                   3rd Qu.:38.00   3rd Qu.: 8.060               
##  Max.   :30.0                   Max.   :85.00   Max.   :11.750               
##  NA's   :9                      NA's   :27                                   
##      sex            smoke        mom_white    mom_age_level
##  female:503   nonsmoker:873   nonwhite:284   teens   :110  
##  male  :497   smoker   :126   white   :714   early20s:281  
##               NA's     :  1   NA's    :  2   late20s :257  
##                                              early30s:219  
##                                              35+     :133  
##                                                            
## 

Correlation between variables using ranks for factor variables:

births_numeric <- births[,c(1,2,4,6,8,9)]
births_numeric$maturity <- as.numeric(births$maturity)
births_numeric$is_premie <- as.numeric(births$is_premie, levels = is_premie_level)
births_numeric$low_bwt <- as.numeric(births$low_bwt)
births_numeric$smoke <- as.numeric(births$smoke, levels = smoke_level)
births_numeric$mom_age_level <- as.numeric(births$mom_age_level, levels = lev)
births_numeric$marital <- as.numeric(births$marital, levels = levels(births$marital))
births_numeric$mom_white <- as.numeric(births$mom_white, levels = levels(births$mom_white))
births_numeric$sex <- as.numeric(births$sex, levels = levels(births$sex))

c <- cor(births_numeric, use= "pairwise.complete.obs", method = "spearman")
c
##                    dad_age      mom_age     len_preg  num_visits mom_wt_gain
## dad_age        1.000000000  0.794262548 -0.031710050  0.05900601 -0.04552655
## mom_age        0.794262548  1.000000000 -0.042773785  0.14341061 -0.05028151
## len_preg      -0.031710050 -0.042773785  1.000000000  0.13787177  0.06576266
## num_visits     0.059006014  0.143410609  0.137871772  1.00000000  0.07083079
## mom_wt_gain   -0.045526551 -0.050281512  0.065762657  0.07083079  1.00000000
## bwt            0.086371777  0.076181800  0.452590772  0.10020952  0.18096286
## maturity       0.481485708  0.588819814 -0.034195286  0.04824250 -0.04737845
## is_premie     -0.004102886 -0.012752560 -0.630651495 -0.14326482 -0.14590753
## low_bwt       -0.005760923 -0.006005699 -0.416282434 -0.12284366 -0.12083229
## smoke         -0.103885980 -0.108385376 -0.008966556 -0.06872315  0.04326961
## mom_age_level  0.770025057  0.974049888 -0.041156960  0.14038995 -0.04123870
## marital       -0.358941381 -0.453100372 -0.020186892 -0.21859213 -0.04019315
## mom_white      0.130724383  0.179408733  0.053478896  0.07975005  0.08262090
## sex            0.059692174 -0.002254230  0.025143334 -0.04755096  0.05119186
##                       bwt     maturity    is_premie      low_bwt        smoke
## dad_age        0.08637178  0.481485708 -0.004102886 -0.005760923 -0.103885980
## mom_age        0.07618180  0.588819814 -0.012752560 -0.006005699 -0.108385376
## len_preg       0.45259077 -0.034195286 -0.630651495 -0.416282434 -0.008966556
## num_visits     0.10020952  0.048242501 -0.143264817 -0.122843661 -0.068723152
## mom_wt_gain    0.18096286 -0.047378454 -0.145907527 -0.120832286  0.043269610
## bwt            1.00000000  0.020012654 -0.445992768 -0.544169401 -0.087658394
## maturity       0.02001265  1.000000000  0.023836636  0.030345546 -0.050294575
## is_premie     -0.44599277  0.023836636  1.000000000  0.563192923 -0.001598465
## low_bwt       -0.54416940  0.030345546  0.563192923  1.000000000  0.039743707
## smoke         -0.08765839 -0.050294575 -0.001598465  0.039743707  1.000000000
## mom_age_level  0.07334022  0.604506834 -0.010809773 -0.001966450 -0.100951595
## marital       -0.15149952 -0.157858725  0.087095443  0.121480719  0.107220940
## mom_white      0.17073331  0.056131765 -0.078280577 -0.080610599  0.039403578
## sex            0.16461252 -0.006484698  0.030435326 -0.020163852  0.038083607
##               mom_age_level      marital   mom_white          sex
## dad_age         0.770025057 -0.358941381  0.13072438  0.059692174
## mom_age         0.974049888 -0.453100372  0.17940873 -0.002254230
## len_preg       -0.041156960 -0.020186892  0.05347890  0.025143334
## num_visits      0.140389948 -0.218592128  0.07975005 -0.047550965
## mom_wt_gain    -0.041238699 -0.040193147  0.08262090  0.051191864
## bwt             0.073340224 -0.151499522  0.17073331  0.164612523
## maturity        0.604506834 -0.157858725  0.05613177 -0.006484698
## is_premie      -0.010809773  0.087095443 -0.07828058  0.030435326
## low_bwt        -0.001966450  0.121480719 -0.08061060 -0.020163852
## smoke          -0.100951595  0.107220940  0.03940358  0.038083607
## mom_age_level   1.000000000 -0.443794113  0.18999029  0.001196308
## marital        -0.443794113  1.000000000 -0.33014869 -0.004251541
## mom_white       0.189990292 -0.330148687  1.00000000 -0.031705094
## sex             0.001196308 -0.004251541 -0.03170509  1.000000000

mom_age and mom_age_level variables have a correlation of 0.974049888
bwt and low_bwt variables have a correlation of -0.5441694
mom_age and dad_age variables have a high correlation of 0.794262548
is_premie and len_preg variable have a high correlation of -0.630651495

Data Visualization:

library(corrplot)
## corrplot 0.84 loaded
corrplot(c)

Univariate distribution:

library("ggplot2")
g <- ggplot(births, aes(x=dad_age))
g + geom_histogram(binwidth = 3, color="cyan") +
  ggtitle("Distribution of dad_age variable") +
  xlab("Age of father of baby (years)") + ylab("Count")
## Warning: Removed 171 rows containing non-finite values (stat_bin).

Age of father of baby has the highest count for ages between 32-35years in this dataset.

g1 <- ggplot(births, aes(x=mom_age))
g1 + geom_histogram(binwidth = 3, color="red") +
  ggtitle("Distribution of mom_age variable") +
  xlab("Age of mother of baby (years)") + ylab("Count")

Age of mother of baby has the highest count for ages between 20-23years in this datasest.

g2 <- ggplot(births, aes(x=maturity))
g2 + geom_bar(color="cyan") +
  ggtitle("Distribution of maturity variable") +
  xlab("Classifying mother as of advanced maternal age or not") + ylab("Count")

Mother generally tend to be of younger maternal age for this dataset with a count of around 800 in this dataset.

g3 <- ggplot(births, aes(x=len_preg))
g3 + geom_histogram(binwidth = 2, color="purple") +
  ggtitle("Distribution of len_preg variable") +
  xlab("Length of pregnancy (weeks)") + ylab("Count")
## Warning: Removed 2 rows containing non-finite values (stat_bin).

Length of pregnancy (weeks) has the highest count for 37-39 weeks for this datsest.

g4 <- ggplot(births, aes(x=is_premie))
g4 + geom_bar(color="cyan") +
  ggtitle("Distribution of is_premie variable") +
  xlab("Classifying baby as either premature or full-term") + ylab("Count")

In this dataset, babies are mostly full term (800 out of 1000).

g5 <- ggplot(births, aes(x=num_visits))
g5 + geom_histogram(binwidth = 2, color="red") +
  ggtitle("Distribution of num_visits variable") +
  xlab("Number of visits to hospital during pregnancy") + ylab("Count")
## Warning: Removed 9 rows containing non-finite values (stat_bin).

Number of visits to hospital during pregnancy is around 10-15 for most of the mothers having the baby in this dataset.

g6 <- ggplot(births, aes(x=marital))
g6 + geom_bar(color="cyan") +
  ggtitle("Distribution of marital variable") +
  xlab("Marital status of mother at time of birth") + ylab("Count")

Mother at time of birth tend to be married more than unmarried for this dataset.

g7 <- ggplot(births, aes(x=mom_wt_gain))
g7 + geom_histogram(binwidth = 7, color="purple") +
  ggtitle("Distribution of mom_wt_gain variable") +
  xlab("Mother's weight gain during pregnancy (pounds)") + ylab("Count")
## Warning: Removed 27 rows containing non-finite values (stat_bin).

Mothers gaining 25-30 pounds during pregnancy have the highest count for this dataset.

g8 <- ggplot(births, aes(x=bwt))
g8 + geom_histogram(binwidth = 1, color="cyan") +
  ggtitle("Distribution of bwt variable") +
  xlab("Birth weight of baby (pounds)") + ylab("Count")

Babies with birth weight 6.5-7.5 pounds have the highest count for this dataset.

g9 <- ggplot(births, aes(x=low_bwt))
g9 + geom_bar(color="red") +
  ggtitle("Distribution of low_bwt variable") +
  xlab("Classifying baby as either low birthweight or not") + ylab("Count")

Babies tend not to have a low birth weight in this dataset (around 800 out of 1000).

g10 <- ggplot(births, aes(x=sex))
g10 + geom_bar(color="orange") +
  ggtitle("Distribution of sex variable") +
  xlab("Sex of baby") + ylab("Count")

There are almost as many male in this dataset as there are female.

g11 <- ggplot(births, aes(x=smoke))
g11 + geom_bar(color="red") +
  ggtitle("Distribution of smoke variable") +
  xlab("Smoking status of mother") + ylab("Count")

This dataset has a majority of non smoking mother rather than smoking mothers (800 out of 1000).

g12 <- ggplot(births, aes(x=mom_white))
g12 + geom_bar(color="cyan") +
  ggtitle("Distribution of mom_white variable") +
  xlab("Classifying mother as either white or not") + ylab("Count")

For this dataset, white mother has the highest count (700 out of 1000).

g13 <- ggplot(births, aes(x=mom_age_level))
g13 + geom_bar(color="purple") +
  ggtitle("Distribution of mom_age_level variable") +
  xlab("Age level of mother of baby (years)") + ylab("Count")

Early 20s age level has the highest count for age level of mother of baby followed by late 20s, early 30s, 35+, teens respectively.

Bivariate Distribution:

g14 <- ggplot(births)
g14 + geom_density(aes(x=bwt, color=smoke))+
  ggtitle("Distribution of bwt variable with respect to smoking status of mother") +
  xlab("Birthweight of baby (pounds)") + ylab("Count")
## Warning: Groups with fewer than two data points have been dropped.

Non smoking mothers tend to give birth to babies with birtweight (pounds) slightly higher than smoking mothers on average for this dataset.

g15 <- ggplot(births)
g15 + geom_boxplot(aes(x=factor(mom_age_level), y=bwt)) + 
  ggtitle("Distribution of mom_age_level and bwt variable") +
  xlab("Age level of mother of baby (years)") + ylab("Birthweight of baby (pounds)")

Median birthweight of baby for teens is the lowest while for late20s is the highest for this dataset. On average, 75th percentile of each level of mother’s age have around the same birthweight of baby (pounds) for this dataset.

g16 <- ggplot(births)
g16 + geom_bar(aes(x=factor(mom_white), fill=low_bwt), position = "dodge") + 
  ggtitle("Distribution of mom_white and low_bwt variable") +
  xlab("Mother as either white or not") + ylab("Count")

Count for White mothers giving birth to low birthweight baby (pounds) is higher than for nonwhite mother for this dataset.

Trivariate Distribution:

g17 <- ggplot(births)
g17 + geom_point(aes(x=bwt, y= mom_wt_gain,color=factor(maturity))) + 
  ggtitle("Distribution of mom_wt_gain and bwt variable with respect to maturity variable") +
  xlab("Birthweight of baby (pounds)") + ylab("Mother's weight gain during pregnancy (pounds)")
## Warning: Removed 27 rows containing missing values (geom_point).

Mother’s weight gain during pregnancy (pounds) tends to be slightly higher on average, for younger mothers than for the advanced mothers, for the same birthweight of baby (pounds), for this dataset.

g18 <- ggplot(births)
g18 + geom_point(aes(x=bwt, y=num_visits, color=factor(is_premie))) +
  ggtitle("Distribution of bwt and num_visits variable") +
  xlab("Birthweight of baby (pounds)") + ylab("Number of visits to hospital during pregnancy")
## Warning: Removed 9 rows containing missing values (geom_point).

Premature babies tend to have lesser birtweight of baby (pounds) which is usually observed. On average, mother of premature babies tend to have lesser number of visits to hospital during pregnancy than mother of fullterm babies for this dataset.

Statistical Tests (Significance level = 0.05):

T-test:

t.test(births$bwt ~ births$smoke)
## 
##  Welch Two Sample t-test
## 
## data:  births$bwt by births$smoke
## t = 2.359, df = 171.32, p-value = 0.01945
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.05151165 0.57957328
## sample estimates:
## mean in group nonsmoker    mean in group smoker 
##                7.144273                6.828730

With a p-value equal to 0.01945, there is sufficient evidence to reject the null hypothesis and accept the alternate hypothesis
Null Hypothesis: There is no difference in true means of birthweight of baby (pounds) due to smoking or non smoking mother
Alternate Hypothesis: There is difference in true means of birthweight of baby (pounds) due to smoking or non smoking mother

t.test(births$len_preg ~ births$low_bwt)
## 
##  Welch Two Sample t-test
## 
## data:  births$len_preg by births$low_bwt
## t = 12.188, df = 113.43, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  4.618782 6.411808
## sample estimates:
## mean in group notlow    mean in group low 
##             38.94257             33.42727

With a p-value equal to < 2.2e-16, there is sufficient evidence to reject the null hypothesis and accept the alternate hypothesis
Null Hypothesis: There is no difference in true means of length of pregnancy (weeks) (len_preg variable) due to notlow or low birthweight of baby (low_bwt variable)
Alternate Hypothesis: There is difference in true means of length of pregnancy (weeks) (len_preg variable) due to notlow or low birthweight of baby (low_bwt variable)

t.test(births$mom_wt_gain ~ births$smoke)
## 
##  Welch Two Sample t-test
## 
## data:  births$mom_wt_gain by births$smoke
## t = -1.2229, df = 150.17, p-value = 0.2233
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -4.786411  1.126667
## sample estimates:
## mean in group nonsmoker    mean in group smoker 
##                30.09636                31.92623

With a p-value equal to 0.2233, there is not sufficient evidence to reject the null hypothesis and accept the alternate hypothesis
Null Hypothesis: There is no difference in true means of mother’s weight gain during pregnancy (pounds) due to whether mother of baby smokes or not.
Alternate Hypothesis: There is difference in true means of mother’s weight gain during pregnancy (pounds) due to whether mother of baby smokes or not.

Power test for the above t-test:

pwr.t2n.test(n1=873, n2=126, d = .2, sig.level = .05)
## 
##      t test power calculation 
## 
##              n1 = 873
##              n2 = 126
##               d = 0.2
##       sig.level = 0.05
##           power = 0.5543761
##     alternative = two.sided

Since power equals to 0.5543761, there is 55.43% probability of finding a small effect if it is there because of smoking status of mother due to random sampling alone and that explains why we did not find enough evidence to reject the null hypothesis.

pwr.t2n.test(n1=873, n2=126, d = .5, sig.level = .05)
## 
##      t test power calculation 
## 
##              n1 = 873
##              n2 = 126
##               d = 0.5
##       sig.level = 0.05
##           power = 0.9994839
##     alternative = two.sided

Since power equals to 0.9994839, there is 99.94% probability of finding a medium size effect if it is there because of smoking status of mother due to random sampling alone.

pwr.t2n.test(n1=873, n2=126, d = .8, sig.level = .05)
## 
##      t test power calculation 
## 
##              n1 = 873
##              n2 = 126
##               d = 0.8
##       sig.level = 0.05
##           power = 1
##     alternative = two.sided

Since power equals to 1, there is 100% probability of finding a large size effect if it is there because of smoking status of mother due to random sampling alone.

ANOVA:

summary(aov(births$mom_wt_gain ~ births$mom_age_level))
##                       Df Sum Sq Mean Sq F value Pr(>F)
## births$mom_age_level   4    632   157.9   0.778   0.54
## Residuals            968 196504   203.0               
## 27 observations deleted due to missingness

With a p-value equal to 0.54, there is not sufficient evidence to reject the null hypothesis and accept the alternate hypothesis
Null Hypothesis: All true means of mother’s weight gain during pregnancy (pounds) for different mom age levels is same
Alternate Hypothesis: Atleast one true mean of mother’s weight gain during pregnancy (pounds) for different mom age levels is different

We could do power test to see if there really is an effect to be detected in the sample.

summary(aov(births$bwt ~ births$mom_age_level))
##                       Df Sum Sq Mean Sq F value Pr(>F)
## births$mom_age_level   4   14.8   3.699   1.629  0.165
## Residuals            995 2259.6   2.271

With a p-value equal to 0.165, there is not sufficient evidence to reject the null hypothesis and accept the alternate hypothesis
Null Hypothesis: All true means of birth weight of baby (pounds) for different mom age levels is same
Alternate Hypothesis: Atleast one true mean of birth weight of baby (pounds) for different mom age levels is different

We could do power test to see if there really is an effect to be detected in the sample.

Wilcoxon-Mann Whitney test:

wilcox.test(births_numeric$mom_age_level ~ births_numeric$smoke)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  births_numeric$mom_age_level by births_numeric$smoke
## W = 64393, p-value = 0.001428
## alternative hypothesis: true location shift is not equal to 0

With a p-value equal to 0.001428 there is sufficient evidence to reject the null hypothesis and accept the alternate hypothesis
Null hypothesis: True median age for non smoking mothers is not different from true median age for smoking mothers.
Alternate Hypothesis: True median age for non smoking mothers is different from true median age for smoking mothers.

wilcox.test(births_numeric$mom_age_level ~ births_numeric$is_premie)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  births_numeric$mom_age_level by births_numeric$is_premie
## W = 65383, p-value = 0.733
## alternative hypothesis: true location shift is not equal to 0

With a p-value equal to 0.733 there is not sufficient evidence to reject the null hypothesis and accept the alternate hypothesis
Null hypothesis: True median age of mothers giving birth to premature babies is not different from true median age of mothers giving birth to fullterm babies
Alternate Hypothesis: True median age of mothers giving birth to premature babies is different from true median age of mothers giving birth to fullterm babies

We could do power test to see if there really is an effect to be detected in the sample.

wilcox.test(births_numeric$mom_age_level ~ births_numeric$low_bwt)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  births_numeric$mom_age_level by births_numeric$low_bwt
## W = 49513, p-value = 0.9506
## alternative hypothesis: true location shift is not equal to 0

With a p-value equal to 0.9506 there is not sufficient evidence to reject the null hypothesis and accept the alternate hypothesis
Null hypothesis: True median age of mothers giving birth to babies with low birthweight (pounds) is not different from true median age of mothers giving birth to babies with not low birthweight (pounds)
Alternate Hypothesis: True median age of mothers giving birth to babies with low birthweight (pounds) is different from true median age of mothers giving birth to babies with not low birthweight (pounds)

We could do power test to see if there really is an effect to be detected in the sample.

Chi-square test:

chisq.test(table(births$mom_white, births$is_premie))
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(births$mom_white, births$is_premie)
## X-squared = 5.6307, df = 1, p-value = 0.01765

With a p-value equal to 0.01765, there is enough evidence to reject the null hypothesis and accept the alternate hypothesis
Null Hypothesis: mom_white and is_premie variables are independent
Alternate Hypothesis: mom_white and is_premie variables are not independent

chisq.test(table(births$low_bwt, births$mom_white))
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(births$low_bwt, births$mom_white)
## X-squared = 5.9293, df = 1, p-value = 0.01489

With a p-value equal to p-value = 0.01489, there is enough evidence to reject the null hypothesis and accept the alternate hypothesis
Null Hypothesis: low_bwt and mom_white variables are independent
Alternate Hypothesis: low_bwt and mom_white variables are not independent

chisq.test(table(births$maturity, births$smoke))
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(births$maturity, births$smoke)
## X-squared = 2.0994, df = 1, p-value = 0.1474

With a p-value equal to p-value = 0.1474, there is not enough evidence to reject the null hypothesis and accept the alternate hypothesis
Null Hypothesis: Maturity and smoke variables are independent
Alternate Hypothesis: Maturity and smoke variables are not independent

Power test for the above chi square test:

cohen.ES(test = "chisq", size = "small")
## 
##      Conventional effect size from Cohen (1982) 
## 
##            test = chisq
##            size = small
##     effect.size = 0.1
pwr.chisq.test(w = 0.1, power = 0.9, df = 1, sig.level = 0.05)
## 
##      Chi squared power calculation 
## 
##               w = 0.1
##               N = 1050.742
##              df = 1
##       sig.level = 0.05
##           power = 0.9
## 
## NOTE: N is the number of observations

With N=1050.742, we would need a slightly bigger sample size for detecting a small effect if it is there through random sampling alone.

cohen.ES(test = "chisq", size = "medium")
## 
##      Conventional effect size from Cohen (1982) 
## 
##            test = chisq
##            size = medium
##     effect.size = 0.3
pwr.chisq.test(w = 0.3, power = 0.9, df = 1, sig.level = 0.05)
## 
##      Chi squared power calculation 
## 
##               w = 0.3
##               N = 116.7491
##              df = 1
##       sig.level = 0.05
##           power = 0.9
## 
## NOTE: N is the number of observations

N = 116.7491, for detecting a medium effect if there is any through random sampling alone

chisq.test(table(births$low_bwt, births$marital))
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(births$low_bwt, births$marital)
## X-squared = 13.957, df = 1, p-value = 0.0001871

With a p-value equal to p-value = 0.0001871, there is enough evidence to reject the null hypothesis and accept the alternate hypothesis
Null Hypothesis: low_bwt and marital variables are independent
Alternate Hypothesis: low_bwt and marital variables are not independent

chisq.test(table(births$smoke, births$mom_age_level))
## 
##  Pearson's Chi-squared test
## 
## data:  table(births$smoke, births$mom_age_level)
## X-squared = 12.648, df = 4, p-value = 0.01313

With a p-value equal to p-value = 0.01313, there is enough evidence to reject the null hypothesis and accept the alternate hypothesis
Null Hypothesis: smoke and mom_age_level variables are independent
Alternate Hypothesis: smoke and mom_age_level variables are not independent

chisq.test(table(births$low_bwt, births$mom_age_level))
## 
##  Pearson's Chi-squared test
## 
## data:  table(births$low_bwt, births$mom_age_level)
## X-squared = 3.3818, df = 4, p-value = 0.4961

With a p-value equal to p-value = 0.4961, there is not enough evidence to reject the null hypothesis and accept the alternate hypothesis
Null Hypothesis: low_bwt and mom_age_level variables are independent
Alternate Hypothesis: low_bwt and mom_age_level variables are not independent

Power test for the above chi square test:

cohen.ES(test = "chisq", size = "small")
## 
##      Conventional effect size from Cohen (1982) 
## 
##            test = chisq
##            size = small
##     effect.size = 0.1
pwr.chisq.test(w = 0.1, power = 0.9, df = 4, sig.level = 0.05)
## 
##      Chi squared power calculation 
## 
##               w = 0.1
##               N = 1540.505
##              df = 4
##       sig.level = 0.05
##           power = 0.9
## 
## NOTE: N is the number of observations

With N=1540.505, we would need a bigger sample size for detecting a small effect if it is there through random sampling alone.

cohen.ES(test = "chisq", size = "medium")
## 
##      Conventional effect size from Cohen (1982) 
## 
##            test = chisq
##            size = medium
##     effect.size = 0.3
pwr.chisq.test(w = 0.3, power = 0.9, df = 4, sig.level = 0.05)
## 
##      Chi squared power calculation 
## 
##               w = 0.3
##               N = 171.1672
##              df = 4
##       sig.level = 0.05
##           power = 0.9
## 
## NOTE: N is the number of observations

N = 171.1672, for detecting a medium effect if there is any through random sampling alone

Linear regression model fitting birth weight as a function of predictor variables:

fit1 <- lm(bwt ~ dad_age+mom_age+maturity+len_preg+is_premie+num_visits+marital+mom_wt_gain+low_bwt+sex+smoke+mom_white+mom_age_level, data=births)
summary(fit1)
## 
## Call:
## lm(formula = bwt ~ dad_age + mom_age + maturity + len_preg + 
##     is_premie + num_visits + marital + mom_wt_gain + low_bwt + 
##     sex + smoke + mom_white + mom_age_level, data = births)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.4770 -0.6012 -0.0021  0.5494  3.5563 
## 
## Coefficients: (1 not defined because of singularities)
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           -0.2179472  0.8220275  -0.265  0.79098    
## dad_age                0.0080555  0.0077365   1.041  0.29809    
## mom_age                0.0090994  0.0218822   0.416  0.67764    
## maturityadvanced      -0.0731887  0.4196130  -0.174  0.86158    
## len_preg               0.1718275  0.0181833   9.450  < 2e-16 ***
## is_premiepremie       -0.0442754  0.1440740  -0.307  0.75869    
## num_visits             0.0003972  0.0089395   0.044  0.96457    
## maritalunmarried      -0.1172818  0.0853089  -1.375  0.16959    
## mom_wt_gain            0.0071629  0.0023449   3.055  0.00233 ** 
## low_bwtlow            -2.3881336  0.1431235 -16.686  < 2e-16 ***
## sexmale                0.3821920  0.0653943   5.844 7.45e-09 ***
## smokesmoker           -0.2143158  0.1068650  -2.005  0.04525 *  
## mom_whitewhite         0.2458025  0.0807928   3.042  0.00243 ** 
## mom_age_levelearly20s  0.0825564  0.1538582   0.537  0.59171    
## mom_age_levellate20s   0.0187617  0.2309455   0.081  0.93527    
## mom_age_levelearly30s -0.0630545  0.3222707  -0.196  0.84493    
## mom_age_level35+              NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9136 on 784 degrees of freedom
##   (200 observations deleted due to missingness)
## Multiple R-squared:  0.6055, Adjusted R-squared:  0.598 
## F-statistic: 80.24 on 15 and 784 DF,  p-value: < 2.2e-16
plot(fit1,1)

Removing mom_age_level variable as it is a dichotomised version of mom_age variable with a high correlation as shown above in the script (0.974049888)

fit2 <- lm(bwt ~ dad_age+mom_age+maturity+len_preg+is_premie+num_visits+marital+mom_wt_gain+low_bwt+sex+smoke+mom_white, data=births)
summary(fit2)
## 
## Call:
## lm(formula = bwt ~ dad_age + mom_age + maturity + len_preg + 
##     is_premie + num_visits + marital + mom_wt_gain + low_bwt + 
##     sex + smoke + mom_white, data = births)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.4327 -0.6086 -0.0148  0.5625  3.5285 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -0.0049431  0.7583592  -0.007  0.99480    
## dad_age           0.0084262  0.0077106   1.093  0.27481    
## mom_age           0.0003153  0.0101018   0.031  0.97511    
## maturityadvanced  0.0068293  0.1215602   0.056  0.95521    
## len_preg          0.1723625  0.0181543   9.494  < 2e-16 ***
## is_premiepremie  -0.0442005  0.1438363  -0.307  0.75870    
## num_visits        0.0009585  0.0089045   0.108  0.91431    
## maritalunmarried -0.1272476  0.0839416  -1.516  0.12994    
## mom_wt_gain       0.0070000  0.0023375   2.995  0.00283 ** 
## low_bwtlow       -2.3906454  0.1428852 -16.731  < 2e-16 ***
## sexmale           0.3803505  0.0652462   5.829 8.11e-09 ***
## smokesmoker      -0.2139060  0.1066228  -2.006  0.04518 *  
## mom_whitewhite    0.2441426  0.0802893   3.041  0.00244 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9126 on 787 degrees of freedom
##   (200 observations deleted due to missingness)
## Multiple R-squared:  0.6049, Adjusted R-squared:  0.5989 
## F-statistic: 100.4 on 12 and 787 DF,  p-value: < 2.2e-16
plot(fit2,1)

Removing low_bwt variable as it is a dichotomised version of bwt variable with a high correlation as shown above (-0.5441694)

fit3 <- lm(bwt ~ dad_age+mom_age+maturity+len_preg+is_premie+num_visits+marital+mom_wt_gain+sex+smoke+mom_white, data=births)
summary(fit3)
## 
## Call:
## lm(formula = bwt ~ dad_age + mom_age + maturity + len_preg + 
##     is_premie + num_visits + marital + mom_wt_gain + sex + smoke + 
##     mom_white, data = births)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.7553 -0.6560 -0.0184  0.6780  3.6461 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -3.833457   0.841302  -4.557 6.02e-06 ***
## dad_age           0.004544   0.008968   0.507 0.612484    
## mom_age           0.004895   0.011750   0.417 0.677078    
## maturityadvanced -0.053106   0.141387  -0.376 0.707308    
## len_preg          0.264976   0.020119  13.171  < 2e-16 ***
## is_premiepremie  -0.617364   0.162552  -3.798 0.000157 ***
## num_visits        0.004374   0.010359   0.422 0.672980    
## maritalunmarried -0.208628   0.097511  -2.140 0.032699 *  
## mom_wt_gain       0.009118   0.002716   3.357 0.000825 ***
## sexmale           0.398622   0.075910   5.251 1.95e-07 ***
## smokesmoker      -0.304327   0.123907  -2.456 0.014261 *  
## mom_whitewhite    0.290618   0.093369   3.113 0.001922 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.062 on 788 degrees of freedom
##   (200 observations deleted due to missingness)
## Multiple R-squared:  0.4644, Adjusted R-squared:  0.457 
## F-statistic: 62.12 on 11 and 788 DF,  p-value: < 2.2e-16
plot(fit3,1)

Removing maturity variable as it is the most insignificant variable with a p-value of 0.707308 (Significance level of 0.05)

fit4 <- lm(bwt ~ dad_age+mom_age+len_preg+is_premie+num_visits+marital+mom_wt_gain+sex+smoke+mom_white, data=births)
summary(fit4)
## 
## Call:
## lm(formula = bwt ~ dad_age + mom_age + len_preg + is_premie + 
##     num_visits + marital + mom_wt_gain + sex + smoke + mom_white, 
##     data = births)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.7438 -0.6576 -0.0143  0.6790  3.6576 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -3.778629   0.828091  -4.563 5.84e-06 ***
## dad_age           0.004543   0.008963   0.507 0.612403    
## mom_age           0.002701   0.010190   0.265 0.791002    
## len_preg          0.264946   0.020107  13.177  < 2e-16 ***
## is_premiepremie  -0.619446   0.162369  -3.815 0.000147 ***
## num_visits        0.004483   0.010349   0.433 0.665026    
## maritalunmarried -0.215704   0.095622  -2.256 0.024356 *  
## mom_wt_gain       0.009093   0.002714   3.351 0.000844 ***
## sexmale           0.399191   0.075854   5.263 1.83e-07 ***
## smokesmoker      -0.304207   0.123840  -2.456 0.014246 *  
## mom_whitewhite    0.291236   0.093304   3.121 0.001866 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.061 on 789 degrees of freedom
##   (200 observations deleted due to missingness)
## Multiple R-squared:  0.4643, Adjusted R-squared:  0.4575 
## F-statistic: 68.39 on 10 and 789 DF,  p-value: < 2.2e-16
plot(fit4,1)

Removing mom_age variable as it is the most insignificant variable with a p-value of 0.791002 (Significance level of 0.05)

fit5 <- lm(bwt ~ dad_age+len_preg+is_premie+num_visits+marital+mom_wt_gain+sex+smoke+mom_white, data=births)
summary(fit5)
## 
## Call:
## lm(formula = bwt ~ dad_age + len_preg + is_premie + num_visits + 
##     marital + mom_wt_gain + sex + smoke + mom_white, data = births)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.7360 -0.6565 -0.0124  0.6773  3.6639 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -3.741310   0.815557  -4.587 5.22e-06 ***
## dad_age           0.006310   0.005989   1.054 0.292364    
## len_preg          0.264527   0.020033  13.204  < 2e-16 ***
## is_premiepremie  -0.620669   0.162208  -3.826 0.000140 ***
## num_visits        0.004749   0.010294   0.461 0.644700    
## maritalunmarried -0.221589   0.092954  -2.384 0.017368 *  
## mom_wt_gain       0.009059   0.002709   3.344 0.000865 ***
## sexmale           0.398450   0.075758   5.260 1.86e-07 ***
## smokesmoker      -0.306345   0.123504  -2.480 0.013329 *  
## mom_whitewhite    0.291292   0.093249   3.124 0.001850 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.061 on 790 degrees of freedom
##   (200 observations deleted due to missingness)
## Multiple R-squared:  0.4643, Adjusted R-squared:  0.4582 
## F-statistic: 76.07 on 9 and 790 DF,  p-value: < 2.2e-16
plot(fit5,1)

Removing num_visits variable as it is the most insignificant variable with a p-value of 0.644700 (Significance level of 0.05)

fit6 <- lm(bwt ~ dad_age+len_preg+is_premie+marital+mom_wt_gain+sex+smoke+mom_white, data=births)
summary(fit6)
## 
## Call:
## lm(formula = bwt ~ dad_age + len_preg + is_premie + marital + 
##     mom_wt_gain + sex + smoke + mom_white, data = births)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.7653 -0.6583 -0.0149  0.6669  3.6883 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -3.955459   0.790216  -5.006 6.86e-07 ***
## dad_age           0.007065   0.005940   1.189 0.234621    
## len_preg          0.270831   0.019219  14.092  < 2e-16 ***
## is_premiepremie  -0.606265   0.161020  -3.765 0.000179 ***
## maritalunmarried -0.218020   0.090826  -2.400 0.016605 *  
## mom_wt_gain       0.009433   0.002676   3.526 0.000447 ***
## sexmale           0.390730   0.075147   5.200 2.54e-07 ***
## smokesmoker      -0.306601   0.123186  -2.489 0.013016 *  
## mom_whitewhite    0.288232   0.092247   3.125 0.001845 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.058 on 798 degrees of freedom
##   (193 observations deleted due to missingness)
## Multiple R-squared:  0.4828, Adjusted R-squared:  0.4776 
## F-statistic: 93.11 on 8 and 798 DF,  p-value: < 2.2e-16
plot(fit6,1)

Final linear model

Removing dad_age variable as it is the most insignificant variable with a p-value of 0.234621 (Significance level of 0.05)

fit7 <- lm(bwt ~ len_preg+is_premie+marital+mom_wt_gain+sex+smoke+mom_white, data=births)
summary(fit7)
## 
## Call:
## lm(formula = bwt ~ len_preg + is_premie + marital + mom_wt_gain + 
##     sex + smoke + mom_white, data = births)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.7412 -0.6448 -0.0237  0.6784  4.1543 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -4.096698   0.687181  -5.962 3.50e-09 ***
## len_preg          0.282229   0.017408  16.212  < 2e-16 ***
## is_premiepremie  -0.523456   0.142239  -3.680 0.000246 ***
## maritalunmarried -0.247769   0.075388  -3.287 0.001051 ** 
## mom_wt_gain       0.008355   0.002427   3.442 0.000602 ***
## sexmale           0.396837   0.068709   5.776 1.03e-08 ***
## smokesmoker      -0.383683   0.104092  -3.686 0.000241 ***
## mom_whitewhite    0.218897   0.080821   2.708 0.006880 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.064 on 962 degrees of freedom
##   (30 observations deleted due to missingness)
## Multiple R-squared:  0.494,  Adjusted R-squared:  0.4903 
## F-statistic: 134.2 on 7 and 962 DF,  p-value: < 2.2e-16
plot(fit7)

Equation of fitted line for linear model:

bwt = 0.282229 * len_preg + (-0.523456) * is_premiepremie + (-0.247769) * maritalunmarried + 0.008355 * mom_wt_gain + 0.396837 * sexmale + (-0.383683) * smokesmoker + 0.218897 * mom_whitewhite + (-4.096698)

Effect of significant variables in the final linear model on bwt variable:

For every one week increase in length of pregnancy (len_preg variable), the birthweight of baby (bwt variable) increases by 0.282229 pounds on average when all other variables in the model are held constant.

For premature (is_premiepremie variable) as compared with fullterm, birthweight of baby (bwt variable) decreases by 0.523456 pounds on average when all other variables in the model are held constant.

For ummarried mother at time of birth (maritalunmarried variable) as compared with married mother at time of birth, birthweight of baby (bwt variable) decreases by 0.247769 pounds on average when all other variables in the model are held constant.

For every one pound increase in mother’s weight during pregnancy (mom_wt_gain variable), the birthweight of baby (bwt variable) increases by 0.008355 pounds on average when all other variables in the model are held constant.

For male baby (sexmale variable) as compared with female baby, birthweight of baby (bwt variable) increases by 0.396837 pounds on average when all other variables in the model are held constant.

For white mother (mom_whitewhite variable) as compared with nonwhite mother, birthweight of baby (bwt variable) increases by 0.218897 pounds on average when all other variables in the model are held constant.

Comment:

Even though is_premie variable is a dichotomised version of len_preg variable and has a high correlation, both the variables are present in the final model. Removing either one of the variable, resulted in a slightly worse Adjusted R-squared (0.4837) and the value of fitted coefficients were around same and hence this final model has been chosen.

Logistic regression model fitting the dichotomous low birth weight variable as a function of predictor variables:

fit_all <- glm(low_bwt ~ ., family = binomial(), data = births)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(fit_all)
## 
## Call:
## glm(formula = low_bwt ~ ., family = binomial(), data = births)
## 
## Deviance Residuals: 
##        Min          1Q      Median          3Q         Max  
## -2.431e-04  -2.100e-08  -2.100e-08  -2.100e-08   2.778e-04  
## 
## Coefficients: (1 not defined because of singularities)
##                         Estimate Std. Error z value Pr(>|z|)
## (Intercept)            9.548e+02  1.458e+05   0.007    0.995
## dad_age                3.359e+00  9.207e+02   0.004    0.997
## mom_age                3.718e+00  2.668e+03   0.001    0.999
## maturityadvanced      -1.473e+02  6.438e+04  -0.002    0.998
## len_preg              -3.282e+00  2.690e+03  -0.001    0.999
## is_premiepremie       -2.840e+01  1.373e+04  -0.002    0.998
## num_visits             1.576e+00  1.079e+03   0.001    0.999
## maritalunmarried       1.859e+01  6.864e+03   0.003    0.998
## mom_wt_gain            5.582e-02  2.668e+02   0.000    1.000
## bwt                   -1.758e+02  1.299e+04  -0.014    0.989
## sexmale               -5.662e+00  5.273e+03  -0.001    0.999
## smokesmoker            6.726e+00  6.180e+03   0.001    0.999
## mom_whitewhite         1.349e+01  7.363e+03   0.002    0.999
## mom_age_levelearly20s -3.415e+01  1.116e+04  -0.003    0.998
## mom_age_levellate20s  -9.080e+01  2.875e+04  -0.003    0.997
## mom_age_levelearly30s -1.363e+02  4.292e+04  -0.003    0.997
## mom_age_level35+              NA         NA      NA       NA
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4.9325e+02  on 799  degrees of freedom
## Residual deviance: 4.5427e-07  on 784  degrees of freedom
##   (200 observations deleted due to missingness)
## AIC: 32
## 
## Number of Fisher Scoring iterations: 25

Warnings show a case of perfect linear separation.

pR2(fit_all)
##           llh       llhNull            G2      McFadden          r2ML 
## -2.271326e-07 -3.486010e+02  6.972020e+02  1.000000e+00  5.816774e-01 
##          r2CU 
##  1.000000e+00

It shows a perfect fit but the p-values of the fitted coefficients showed that the fitted coefficients are not significant.
McFadden = 1.000000e+00
r2CU = 1.000000e+00

exp(coef(fit_all))
##           (Intercept)               dad_age               mom_age 
##                   Inf          2.874839e+01          4.117821e+01 
##      maturityadvanced              len_preg       is_premiepremie 
##          1.101887e-64          3.753741e-02          4.627576e-13 
##            num_visits      maritalunmarried           mom_wt_gain 
##          4.833817e+00          1.181766e+08          1.057407e+00 
##                   bwt               sexmale           smokesmoker 
##          4.705249e-77          3.473805e-03          8.336749e+02 
##        mom_whitewhite mom_age_levelearly20s  mom_age_levellate20s 
##          7.219851e+05          1.479959e-15          3.689081e-40 
## mom_age_levelearly30s      mom_age_level35+ 
##          6.404289e-60                    NA

The Odds Ratio for variables bwt and mom_age_level (mom_age_levelearly20s, mom_age_levellate20s, mom_age_levelearly30s, mom_age_level35+) is too large.

vif(fit_all)
Error: There are aliased coefficients in the model

c1 <- cor(x = births_numeric$bwt, y = births_numeric$low_bwt, use = "pairwise.complete.obs", method = "spearman")
c1
## [1] -0.5441694

Removing bwt variable as it has high negative correlation with low_bwt variable (-0.5441694) and low_bwt variable is dichotomised version of bwt variables

fit_all2 <- glm(low_bwt ~ dad_age+mom_age+maturity+len_preg+is_premie+num_visits+marital+mom_wt_gain+sex+smoke+mom_white+mom_age_level, family = binomial(), data = births)
summary(fit_all2)
## 
## Call:
## glm(formula = low_bwt ~ dad_age + mom_age + maturity + len_preg + 
##     is_premie + num_visits + marital + mom_wt_gain + sex + smoke + 
##     mom_white + mom_age_level, family = binomial(), data = births)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6657  -0.2811  -0.1827  -0.1100   3.5119  
## 
## Coefficients: (1 not defined because of singularities)
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)           24.79866    5.46594   4.537 5.71e-06 ***
## dad_age                0.04050    0.03957   1.024    0.306    
## mom_age               -0.10523    0.11832  -0.889    0.374    
## maturityadvanced       1.68805    2.22231   0.760    0.447    
## len_preg              -0.69667    0.13195  -5.280 1.29e-07 ***
## is_premiepremie        0.35991    0.58387   0.616    0.538    
## num_visits             0.01975    0.04143   0.477    0.634    
## maritalunmarried       0.51847    0.42012   1.234    0.217    
## mom_wt_gain           -0.01940    0.01314  -1.476    0.140    
## sexmale               -0.25239    0.34547  -0.731    0.465    
## smokesmoker            0.74374    0.45724   1.627    0.104    
## mom_whitewhite        -0.33757    0.40330  -0.837    0.403    
## mom_age_levelearly20s  0.30859    0.77343   0.399    0.690    
## mom_age_levellate20s   0.39519    1.23468   0.320    0.749    
## mom_age_levelearly30s  0.94004    1.73235   0.543    0.587    
## mom_age_level35+            NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 493.25  on 799  degrees of freedom
## Residual deviance: 262.63  on 785  degrees of freedom
##   (200 observations deleted due to missingness)
## AIC: 292.63
## 
## Number of Fisher Scoring iterations: 7
pR2(fit_all2)
##          llh      llhNull           G2     McFadden         r2ML         r2CU 
## -131.3145492 -348.6009843  434.5728701    0.6233099    0.4191229    0.7205418

vif(fit_all2) Error still showing

c2 <- cor(x=births_numeric$mom_age_level, y=births_numeric$mom_age, use = "pairwise.complete.obs", method = "spearman")
c2
## [1] 0.9740499

Hence removing mom_age_level variable, since it is a dichotomised version of mom_age # and has a very high positive correlation as shown (0.9740499)

fit_all3 <- glm(low_bwt ~ dad_age+mom_age+maturity+len_preg+is_premie+num_visits+marital+mom_wt_gain+sex+smoke+mom_white, family = binomial(), data = births)
summary(fit_all3)
## 
## Call:
## glm(formula = low_bwt ~ dad_age + mom_age + maturity + len_preg + 
##     is_premie + num_visits + marital + mom_wt_gain + sex + smoke + 
##     mom_white, family = binomial(), data = births)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.7338  -0.2814  -0.1824  -0.1110   3.4843  
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      24.15270    5.20897   4.637 3.54e-06 ***
## dad_age           0.03820    0.03926   0.973   0.3306    
## mom_age          -0.05194    0.05300  -0.980   0.3271    
## maturityadvanced  0.64422    0.62817   1.026   0.3051    
## len_preg         -0.69969    0.13248  -5.282 1.28e-07 ***
## is_premiepremie   0.33354    0.58311   0.572   0.5673    
## num_visits        0.01745    0.04098   0.426   0.6702    
## maritalunmarried  0.47216    0.40792   1.157   0.2471    
## mom_wt_gain      -0.02004    0.01311  -1.529   0.1263    
## sexmale          -0.26192    0.34342  -0.763   0.4457    
## smokesmoker       0.76346    0.45429   1.681   0.0928 .  
## mom_whitewhite   -0.33514    0.39941  -0.839   0.4014    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 493.25  on 799  degrees of freedom
## Residual deviance: 263.19  on 788  degrees of freedom
##   (200 observations deleted due to missingness)
## AIC: 287.19
## 
## Number of Fisher Scoring iterations: 7
pR2(fit_all3)
##          llh      llhNull           G2     McFadden         r2ML         r2CU 
## -131.5953141 -348.6009843  434.0113404    0.6225045    0.4187150    0.7198406
vif(fit_all3)  
##     dad_age     mom_age    maturity    len_preg   is_premie  num_visits 
##    2.880344    4.175761    2.028880    3.033071    3.035933    1.137955 
##     marital mom_wt_gain         sex       smoke   mom_white 
##    1.339829    1.051122    1.053447    1.051109    1.163147

No warning message of linear separation is found anymore
VIF is < 5
McFadden and r2CU values are not equal to 1

exp(cbind(OR=coef(fit_all3), confint(fit_all3)))
## Waiting for profiling to be done...
##                            OR        2.5 %       97.5 %
## (Intercept)      3.085906e+10 2.328560e+06 1.744716e+15
## dad_age          1.038934e+00 9.595182e-01 1.119562e+00
## mom_age          9.493872e-01 8.550548e-01 1.053308e+00
## maturityadvanced 1.904506e+00 5.465061e-01 6.524037e+00
## len_preg         4.967378e-01 3.752908e-01 6.308689e-01
## is_premiepremie  1.395895e+00 4.360910e-01 4.307565e+00
## num_visits       1.017606e+00 9.370674e-01 1.100695e+00
## maritalunmarried 1.603449e+00 7.119340e-01 3.550942e+00
## mom_wt_gain      9.801595e-01 9.547152e-01 1.005131e+00
## sexmale          7.695744e-01 3.884274e-01 1.503356e+00
## smokesmoker      2.145690e+00 8.509763e-01 5.112089e+00
## mom_whitewhite   7.152374e-01 3.305702e-01 1.595971e+00

Removing num_visits variable as it is the most insignificant variable with p-value equal to 0.6702 (Significance level = 0.05)

fit_all4 <- glm(low_bwt ~ dad_age+mom_age+maturity+len_preg+is_premie+marital+mom_wt_gain+sex+smoke+mom_white, family = binomial(), data = births)
summary(fit_all4)
## 
## Call:
## glm(formula = low_bwt ~ dad_age + mom_age + maturity + len_preg + 
##     is_premie + marital + mom_wt_gain + sex + smoke + mom_white, 
##     family = binomial(), data = births)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.7814  -0.2812  -0.1829  -0.1116   3.4334  
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      24.10001    5.19906   4.635 3.56e-06 ***
## dad_age           0.03759    0.03939   0.954   0.3400    
## mom_age          -0.05063    0.05286  -0.958   0.3382    
## maturityadvanced  0.66123    0.62709   1.054   0.2917    
## len_preg         -0.69287    0.13070  -5.301 1.15e-07 ***
## is_premiepremie   0.35156    0.58223   0.604   0.5460    
## maritalunmarried  0.43629    0.40243   1.084   0.2783    
## mom_wt_gain      -0.01992    0.01294  -1.539   0.1237    
## sexmale          -0.27373    0.34250  -0.799   0.4242    
## smokesmoker       0.75881    0.45405   1.671   0.0947 .  
## mom_whitewhite   -0.33746    0.39851  -0.847   0.3971    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 503.72  on 806  degrees of freedom
## Residual deviance: 263.65  on 796  degrees of freedom
##   (193 observations deleted due to missingness)
## AIC: 285.65
## 
## Number of Fisher Scoring iterations: 7
pR2(fit_all4)
##          llh      llhNull           G2     McFadden         r2ML         r2CU 
## -131.8265837 -348.6009843  433.5488012    0.6218410    0.4156383    0.7184720
vif(fit_all4)
##     dad_age     mom_age    maturity    len_preg   is_premie     marital 
##    2.879465    4.134529    2.018674    2.964980    3.024291    1.305635 
## mom_wt_gain         sex       smoke   mom_white 
##    1.038430    1.047072    1.047911    1.157161

Removing is_premie variable as it is the most insignificant variable with p-value equal to 0.5460 (Significance level = 0.05)

fit_all5 <- glm(low_bwt ~ dad_age+mom_age+maturity+len_preg+marital+mom_wt_gain+sex+smoke+mom_white, family = binomial(), data = births)
summary(fit_all5)
## 
## Call:
## glm(formula = low_bwt ~ dad_age + mom_age + maturity + len_preg + 
##     marital + mom_wt_gain + sex + smoke + mom_white, family = binomial(), 
##     data = births)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8231  -0.2876  -0.1821  -0.1065   3.5148  
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      26.67656    3.14293   8.488   <2e-16 ***
## dad_age           0.03599    0.03916   0.919   0.3580    
## mom_age          -0.05013    0.05270  -0.951   0.3415    
## maturityadvanced  0.68319    0.62472   1.094   0.2741    
## len_preg         -0.75773    0.07950  -9.531   <2e-16 ***
## maritalunmarried  0.42427    0.40273   1.053   0.2921    
## mom_wt_gain      -0.02093    0.01282  -1.633   0.1026    
## sexmale          -0.25607    0.34075  -0.752   0.4524    
## smokesmoker       0.75689    0.45296   1.671   0.0947 .  
## mom_whitewhite   -0.32764    0.39728  -0.825   0.4095    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 503.72  on 806  degrees of freedom
## Residual deviance: 264.01  on 797  degrees of freedom
##   (193 observations deleted due to missingness)
## AIC: 284.01
## 
## Number of Fisher Scoring iterations: 7
pR2(fit_all5)
##          llh      llhNull           G2     McFadden         r2ML         r2CU 
## -132.0069775 -348.6009843  433.1880136    0.6213236    0.4153770    0.7180203

Removing sex variable as it is the most insignificant variable with p-value equal to 0.4524 (Significance level = 0.05)

fit_all6 <- glm(low_bwt ~ dad_age+mom_age+maturity+len_preg+marital+mom_wt_gain+smoke+mom_white, family = binomial(), data = births)
summary(fit_all6)
## 
## Call:
## glm(formula = low_bwt ~ dad_age + mom_age + maturity + len_preg + 
##     marital + mom_wt_gain + smoke + mom_white, family = binomial(), 
##     data = births)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.7466  -0.2847  -0.1825  -0.1097   3.5446  
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      26.26630    3.05920   8.586   <2e-16 ***
## dad_age           0.03360    0.03903   0.861    0.389    
## mom_age          -0.04590    0.05249  -0.874    0.382    
## maturityadvanced  0.65276    0.62248   1.049    0.294    
## len_preg         -0.75172    0.07846  -9.581   <2e-16 ***
## maritalunmarried  0.43878    0.40339   1.088    0.277    
## mom_wt_gain      -0.02039    0.01273  -1.602    0.109    
## smokesmoker       0.73449    0.45202   1.625    0.104    
## mom_whitewhite   -0.31376    0.39631  -0.792    0.429    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 503.72  on 806  degrees of freedom
## Residual deviance: 264.58  on 798  degrees of freedom
##   (193 observations deleted due to missingness)
## AIC: 282.58
## 
## Number of Fisher Scoring iterations: 7
pR2(fit_all6)
##          llh      llhNull           G2     McFadden         r2ML         r2CU 
## -132.2911031 -348.6009843  432.6197624    0.6205085    0.4149652    0.7173085

Removing mom_white variable as it is the most insignificant variable with p-value equal to 0.429 (Significance level = 0.05)

fit_all6 <- glm(low_bwt ~ dad_age+mom_age+maturity+len_preg+marital+mom_wt_gain+smoke, family = binomial(), data = births)
summary(fit_all6)
## 
## Call:
## glm(formula = low_bwt ~ dad_age + mom_age + maturity + len_preg + 
##     marital + mom_wt_gain + smoke, family = binomial(), data = births)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.7954  -0.2909  -0.1823  -0.1128   3.4917  
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      26.16053    3.06171   8.544   <2e-16 ***
## dad_age           0.03473    0.03894   0.892   0.3725    
## mom_age          -0.05262    0.05195  -1.013   0.3111    
## maturityadvanced  0.69194    0.62123   1.114   0.2654    
## len_preg         -0.75095    0.07827  -9.595   <2e-16 ***
## maritalunmarried  0.49784    0.39582   1.258   0.2085    
## mom_wt_gain      -0.02121    0.01276  -1.663   0.0964 .  
## smokesmoker       0.70157    0.44951   1.561   0.1186    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 504.12  on 808  degrees of freedom
## Residual deviance: 265.24  on 801  degrees of freedom
##   (191 observations deleted due to missingness)
## AIC: 281.24
## 
## Number of Fisher Scoring iterations: 7
pR2(fit_all6)
##          llh      llhNull           G2     McFadden         r2ML         r2CU 
## -132.6180444 -348.6009843  431.9658796    0.6195707    0.4137157    0.7162643

Removing dad_age variable as it is the most insignificant variable with p-value equal to 0.3725 (Significance level = 0.05)

fit_all7 <- glm(low_bwt ~ mom_age+maturity+len_preg+marital+mom_wt_gain+smoke, family = binomial(), data = births)
summary(fit_all7)
## 
## Call:
## glm(formula = low_bwt ~ mom_age + maturity + len_preg + marital + 
##     mom_wt_gain + smoke, family = binomial(), data = births)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8705  -0.3379  -0.2146  -0.1384   3.4401  
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      22.979044   2.438407   9.424   <2e-16 ***
## mom_age           0.001178   0.031688   0.037   0.9704    
## maturityadvanced  0.285770   0.538977   0.530   0.5960    
## len_preg         -0.680030   0.061842 -10.996   <2e-16 ***
## maritalunmarried  0.653076   0.306767   2.129   0.0333 *  
## mom_wt_gain      -0.012138   0.010231  -1.186   0.2354    
## smokesmoker       0.597802   0.359837   1.661   0.0967 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 661.33  on 971  degrees of freedom
## Residual deviance: 380.75  on 965  degrees of freedom
##   (28 observations deleted due to missingness)
## AIC: 394.75
## 
## Number of Fisher Scoring iterations: 6
pR2(fit_all7)
##          llh      llhNull           G2     McFadden         r2ML         r2CU 
## -190.3740946 -348.6009843  316.4537793    0.4538911    0.2778842    0.5428222

Removing mom_age variable as it is the most insignificant variable with p-value equal to 0.9704 (Significance level = 0.05)

fit_all8 <- glm(low_bwt ~ maturity+len_preg+marital+mom_wt_gain+smoke, family = binomial(), data = births)
summary(fit_all8)
## 
## Call:
## glm(formula = low_bwt ~ maturity + len_preg + marital + mom_wt_gain + 
##     smoke, family = binomial(), data = births)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8708  -0.3387  -0.2143  -0.1382   3.4388  
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      23.01361    2.25468  10.207   <2e-16 ***
## maturityadvanced  0.29902    0.40417   0.740   0.4594    
## len_preg         -0.68009    0.06182 -11.001   <2e-16 ***
## maritalunmarried  0.64856    0.28165   2.303   0.0213 *  
## mom_wt_gain      -0.01216    0.01022  -1.190   0.2341    
## smokesmoker       0.59728    0.35952   1.661   0.0966 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 661.33  on 971  degrees of freedom
## Residual deviance: 380.75  on 966  degrees of freedom
##   (28 observations deleted due to missingness)
## AIC: 392.75
## 
## Number of Fisher Scoring iterations: 6
pR2(fit_all8)
##          llh      llhNull           G2     McFadden         r2ML         r2CU 
## -190.3747849 -348.6009843  316.4523988    0.4538891    0.2778832    0.5428202

Removing maturity variable as it is the most insignificant variable with p-value equal to 0.4594 (Significance level = 0.05)

fit_all9 <- glm(low_bwt ~ len_preg+marital+mom_wt_gain+smoke, family = binomial(), data = births)
summary(fit_all9)
## 
## Call:
## glm(formula = low_bwt ~ len_preg + marital + mom_wt_gain + smoke, 
##     family = binomial(), data = births)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8805  -0.3293  -0.2149  -0.1401   3.4303  
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      23.04652    2.24313  10.274   <2e-16 ***
## len_preg         -0.67944    0.06148 -11.052   <2e-16 ***
## maritalunmarried  0.61950    0.27820   2.227   0.0260 *  
## mom_wt_gain      -0.01225    0.01019  -1.202   0.2292    
## smokesmoker       0.59078    0.35904   1.645   0.0999 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 661.33  on 971  degrees of freedom
## Residual deviance: 381.28  on 967  degrees of freedom
##   (28 observations deleted due to missingness)
## AIC: 391.28
## 
## Number of Fisher Scoring iterations: 6
pR2(fit_all9)
##          llh      llhNull           G2     McFadden         r2ML         r2CU 
## -190.6387935 -348.6009843  315.9243815    0.4531318    0.2774908    0.5420537

Removing mom_wt_gain variable as it is the most insignificant variable with p-value equal to 0.2292 (Significance level = 0.05)

fit_all10 <- glm(low_bwt ~ len_preg+marital+smoke, family = binomial(), data = births)
summary(fit_all10)
## 
## Call:
## glm(formula = low_bwt ~ len_preg + marital + smoke, family = binomial(), 
##     data = births)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.9426  -0.2910  -0.2058  -0.1408   3.4692  
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      23.58983    2.24347  10.515   <2e-16 ***
## len_preg         -0.70344    0.06145 -11.447   <2e-16 ***
## maritalunmarried  0.64288    0.27224   2.361   0.0182 *  
## smokesmoker       0.52240    0.35765   1.461   0.1441    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 692.56  on 997  degrees of freedom
## Residual deviance: 394.05  on 994  degrees of freedom
##   (2 observations deleted due to missingness)
## AIC: 402.05
## 
## Number of Fisher Scoring iterations: 6
pR2(fit_all10)
##          llh      llhNull           G2     McFadden         r2ML         r2CU 
## -197.0256110 -348.6009843  303.1507466    0.4348105    0.2619607    0.5210883

Final logistic model

Removing smoke variable as it is the most insignificant variable with p-value equal to 0.1441 (Significance level = 0.05)

fit_all11 <- glm(low_bwt ~ len_preg+marital, family = binomial(), data = births)
summary(fit_all11)
## 
## Call:
## glm(formula = low_bwt ~ len_preg + marital, family = binomial(), 
##     data = births)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.9834  -0.3012  -0.2132  -0.1479   3.4403  
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      23.63838    2.24101  10.548   <2e-16 ***
## len_preg         -0.70286    0.06131 -11.464   <2e-16 ***
## maritalunmarried  0.66923    0.27075   2.472   0.0134 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 692.56  on 997  degrees of freedom
## Residual deviance: 396.08  on 995  degrees of freedom
##   (2 observations deleted due to missingness)
## AIC: 402.08
## 
## Number of Fisher Scoring iterations: 6
pR2(fit_all11)
##          llh      llhNull           G2     McFadden         r2ML         r2CU 
## -198.0396885 -348.6009843  301.1225915    0.4319015    0.2604594    0.5181017
exp(cbind(OR=coef(fit_all11), confint(fit_all11)))
## Waiting for profiling to be done...
##                            OR        2.5 %       97.5 %
## (Intercept)      1.845089e+10 2.798761e+08 1.854117e+12
## len_preg         4.951682e-01 4.363693e-01 5.551420e-01
## maritalunmarried 1.952728e+00 1.149141e+00 3.333546e+00
exp(coef(fit_all11))
##      (Intercept)         len_preg maritalunmarried 
##     1.845089e+10     4.951682e-01     1.952728e+00
plot(fit_all11)

McFadden: 0.4319015
r2CU: 0.5181017

Equation for fitted coefficients:

logit(low_bwt) = -0.70286 * (len_preg) + 0.66923 * (maritalunmarried) + 23.63838

Effect of significant variables on low_bwt variable:

For every one-unit increase in length of pregnancy (weeks), the odds of having baby with low birthweight (pounds) is multiplied by exp(-0.70286)= 4.951682e-01 (ie: a decrease of 49.5%) on average, when all other variables in the model are held constant.

For umarried mother at time of birth as compared with married mother at time of birth, the odds of having baby with low birthweight (pounds) is multiplied by exp(0.66923)=1.952728 (ie: an increase of 95%) on average when all other variables in the model are held constant.

Using step function to check AIC and if we get the same significant variables as my final logistics model:

Creating new dataframe without dad_age column as it has lots of missing information:

birth_2 <- births[,2:14] 
birth_2 <- na.omit(birth_2)
summary(birth_2)
##     mom_age          maturity      len_preg       is_premie     num_visits  
##  Min.   :13.00   younger :835   Min.   :20.0   fullterm:819   Min.   : 0.0  
##  1st Qu.:22.00   advanced:127   1st Qu.:37.0   premie  :143   1st Qu.:10.0  
##  Median :27.00                  Median :39.0                  Median :12.0  
##  Mean   :27.01                  Mean   :38.4                  Mean   :12.2  
##  3rd Qu.:32.00                  3rd Qu.:40.0                  3rd Qu.:15.0  
##  Max.   :50.00                  Max.   :45.0                  Max.   :30.0  
##       marital     mom_wt_gain         bwt           low_bwt        sex     
##  married  :599   Min.   : 0.00   Min.   : 1.000   notlow:860   female:483  
##  unmarried:363   1st Qu.:20.00   1st Qu.: 6.380   low   :102   male  :479  
##                  Median :30.00   Median : 7.310                            
##                  Mean   :30.34   Mean   : 7.128                            
##                  3rd Qu.:38.00   3rd Qu.: 8.060                            
##                  Max.   :85.00   Max.   :11.750                            
##        smoke        mom_white    mom_age_level
##  nonsmoker:841   nonwhite:272   teens   :103  
##  smoker   :121   white   :690   early20s:269  
##                                 late20s :250  
##                                 early30s:213  
##                                 35+     :127  
## 
fit_null <- glm(low_bwt ~ 1, family = binomial(), data = birth_2)
summary(fit_null)
## 
## Call:
## glm(formula = low_bwt ~ 1, family = binomial(), data = birth_2)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.4735  -0.4735  -0.4735  -0.4735   2.1185  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -2.1320     0.1047  -20.36   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 650.57  on 961  degrees of freedom
## Residual deviance: 650.57  on 961  degrees of freedom
## AIC: 652.57
## 
## Number of Fisher Scoring iterations: 4

Removing mom_age_level variable, since it is a dichotomised version of mom_age and has a very high positive correlation as shown (0.9740499)
Removing bwt variable as it has high negative correlation with low_bwt variable (-0.5441694) and low_bwt variable is dichotomised version of bwt variable

fit_main <- glm(low_bwt ~ mom_age+len_preg+num_visits+mom_wt_gain+maturity+is_premie+smoke+marital+mom_white+sex, family = binomial(), data = birth_2)
summary(fit_main)
## 
## Call:
## glm(formula = low_bwt ~ mom_age + len_preg + num_visits + mom_wt_gain + 
##     maturity + is_premie + smoke + marital + mom_white + sex, 
##     family = binomial(), data = birth_2)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8322  -0.3232  -0.2171  -0.1328   3.4597  
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      21.013120   4.152910   5.060  4.2e-07 ***
## mom_age          -0.002514   0.032108  -0.078   0.9376    
## len_preg         -0.638696   0.105039  -6.081  1.2e-09 ***
## num_visits        0.040347   0.032789   1.230   0.2185    
## mom_wt_gain      -0.012158   0.010494  -1.159   0.2466    
## maturityadvanced  0.275132   0.543351   0.506   0.6126    
## is_premiepremie   0.340374   0.481983   0.706   0.4801    
## smokesmoker       0.698477   0.365566   1.911   0.0560 .  
## maritalunmarried  0.729563   0.325191   2.243   0.0249 *  
## mom_whitewhite    0.034356   0.318194   0.108   0.9140    
## sexmale          -0.299270   0.283064  -1.057   0.2904    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 650.57  on 961  degrees of freedom
## Residual deviance: 376.60  on 951  degrees of freedom
## AIC: 398.6
## 
## Number of Fisher Scoring iterations: 6
fit_step1 <- step(fit_null, scope=list(lower=fit_null, upper=fit_main),direction="both")
## Start:  AIC=652.57
## low_bwt ~ 1
## 
##               Df Deviance    AIC
## + len_preg     1   390.59 394.59
## + is_premie    1   455.53 459.53
## + marital      1   636.71 640.71
## + num_visits   1   640.42 644.42
## + mom_wt_gain  1   640.42 644.42
## + mom_white    1   646.24 650.24
## + smoke        1   648.12 652.12
## <none>             650.57 652.57
## + maturity     1   649.44 653.44
## + sex          1   650.22 654.22
## + mom_age      1   650.56 654.56
## 
## Step:  AIC=394.59
## low_bwt ~ len_preg
## 
##               Df Deviance    AIC
## + marital      1   384.26 390.26
## + smoke        1   387.41 393.41
## <none>             390.59 394.59
## + mom_wt_gain  1   388.72 394.72
## + sex          1   389.71 395.71
## + is_premie    1   389.91 395.91
## + mom_white    1   390.11 396.11
## + mom_age      1   390.34 396.34
## + num_visits   1   390.37 396.37
## + maturity     1   390.46 396.46
## - len_preg     1   650.57 652.57
## 
## Step:  AIC=390.26
## low_bwt ~ len_preg + marital
## 
##               Df Deviance    AIC
## + smoke        1   381.67 389.67
## <none>             384.26 390.26
## + mom_wt_gain  1   383.00 391.00
## + num_visits   1   383.00 391.00
## + sex          1   383.42 391.42
## + is_premie    1   383.75 391.75
## + maturity     1   383.76 391.76
## + mom_age      1   383.99 391.99
## + mom_white    1   384.24 392.24
## - marital      1   390.59 394.59
## - len_preg     1   636.71 640.71
## 
## Step:  AIC=389.67
## low_bwt ~ len_preg + marital + smoke
## 
##               Df Deviance    AIC
## <none>             381.67 389.67
## + num_visits   1   380.09 390.09
## - smoke        1   384.26 390.26
## + mom_wt_gain  1   380.30 390.30
## + sex          1   380.56 390.56
## + is_premie    1   381.05 391.05
## + maturity     1   381.12 391.12
## + mom_age      1   381.33 391.33
## + mom_white    1   381.67 391.67
## - marital      1   387.41 393.41
## - len_preg     1   635.35 641.35
summary(fit_step1)
## 
## Call:
## glm(formula = low_bwt ~ len_preg + marital + smoke, family = binomial(), 
##     data = birth_2)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.9055  -0.2905  -0.2072  -0.1457   3.4396  
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      22.94011    2.24980  10.196   <2e-16 ***
## len_preg         -0.68642    0.06158 -11.146   <2e-16 ***
## maritalunmarried  0.66335    0.27683   2.396   0.0166 *  
## smokesmoker       0.59549    0.35969   1.656   0.0978 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 650.57  on 961  degrees of freedom
## Residual deviance: 381.67  on 958  degrees of freedom
## AIC: 389.67
## 
## Number of Fisher Scoring iterations: 6

Comment:

Even if we keep mom_age_level variable as the predictor variable for step function, we would still be getting the same results (AIC, significant variables) as we got without mom_age_level variable. This result of the step function is found to be consistent with my final logistic model (with same significant variables and similar fitted coefficients) except for the smoke variable which is not a significant variable in my final model as its p-value is greater than significance level of 0.05