p299
The Wage data set contains a number of other features not explored in this chapter, such as marital status (maritl), job class (jobclass), and others. Explore the relationships between some of these other predictors and wage, and use non-linear fitting techniques in order to fit flexible models to the data. Create plots of the results obtained, and write a summary of your findings.
library(tidyverse)
library(ISLR)
attach(Wage)
glimpse(Wage)
## Rows: 3,000
## Columns: 11
## $ year <int> 2006, 2004, 2003, 2003, 2005, 2008, 2009, 2008, 2006, 2004…
## $ age <int> 18, 24, 45, 43, 50, 54, 44, 30, 41, 52, 45, 34, 35, 39, 54…
## $ maritl <fct> 1. Never Married, 1. Never Married, 2. Married, 2. Married…
## $ race <fct> 1. White, 1. White, 1. White, 3. Asian, 1. White, 1. White…
## $ education <fct> 1. < HS Grad, 4. College Grad, 3. Some College, 4. College…
## $ region <fct> 2. Middle Atlantic, 2. Middle Atlantic, 2. Middle Atlantic…
## $ jobclass <fct> 1. Industrial, 2. Information, 1. Industrial, 2. Informati…
## $ health <fct> 1. <=Good, 2. >=Very Good, 1. <=Good, 2. >=Very Good, 1. <…
## $ health_ins <fct> 2. No, 2. No, 1. Yes, 1. Yes, 1. Yes, 1. Yes, 1. Yes, 1. Y…
## $ logwage <dbl> 4.318063, 4.255273, 4.875061, 5.041393, 4.318063, 4.845098…
## $ wage <dbl> 75.04315, 70.47602, 130.98218, 154.68529, 75.04315, 127.11…
library(skimr)
skim(Wage)
Name | Wage |
Number of rows | 3000 |
Number of columns | 11 |
_______________________ | |
Column type frequency: | |
factor | 7 |
numeric | 4 |
________________________ | |
Group variables | None |
Variable type: factor
skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
---|---|---|---|---|---|
maritl | 0 | 1 | FALSE | 5 | 2. : 2074, 1. : 648, 4. : 204, 5. : 55 |
race | 0 | 1 | FALSE | 4 | 1. : 2480, 2. : 293, 3. : 190, 4. : 37 |
education | 0 | 1 | FALSE | 5 | 2. : 971, 4. : 685, 3. : 650, 5. : 426 |
region | 0 | 1 | FALSE | 1 | 2. : 3000, 1. : 0, 3. : 0, 4. : 0 |
jobclass | 0 | 1 | FALSE | 2 | 1. : 1544, 2. : 1456 |
health | 0 | 1 | FALSE | 2 | 2. : 2142, 1. : 858 |
health_ins | 0 | 1 | FALSE | 2 | 1. : 2083, 2. : 917 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
year | 0 | 1 | 2005.79 | 2.03 | 2003.00 | 2004.00 | 2006.00 | 2008.00 | 2009.00 | ▇▃▃▃▆ |
age | 0 | 1 | 42.41 | 11.54 | 18.00 | 33.75 | 42.00 | 51.00 | 80.00 | ▃▇▇▃▁ |
logwage | 0 | 1 | 4.65 | 0.35 | 3.00 | 4.45 | 4.65 | 4.86 | 5.76 | ▁▁▇▇▁ |
wage | 0 | 1 | 111.70 | 41.73 | 20.09 | 85.38 | 104.92 | 128.68 | 318.34 | ▂▇▂▁▁ |
plot(jobclass,wage)
plot(maritl,wage)
# jobclass.fit = lm(wage\~I(jobclass), data = Wage)
g1 <- ggplot(Wage, aes(x = maritl, y = wage, fill = maritl)) + geom_boxplot() + theme(legend.position = "none")
g1
Married has the highest average salary.
g2 <- ggplot(Wage, aes(x = jobclass, y = wage, fill = jobclass)) + geom_boxplot() + theme(legend.position = "none")
g2
require(gridExtra)
## Loading required package: gridExtra
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
#grid.arrange(g1, g2, g3, g4, g5, g6, g7, g8, ncol = 2)
grid.arrange(g1, g2, ncol = 2)
ggplot(Wage, aes(x = race, y = wage, fill = race)) + geom_boxplot() + theme(legend.position = "none")
ggplot(Wage, aes(x = education, y = wage, fill = education)) + geom_boxplot() + theme(legend.position = "none")
ggplot(Wage, aes(x = health_ins, y = wage, fill = health_ins)) + geom_boxplot() + theme(legend.position = "none")
ggplot(Wage, aes(x = region, y = wage, fill = region)) + geom_boxplot() + theme(legend.position = "none")
g3 <- ggplot(Wage, aes(x = age, y = wage)) +
geom_point(alpha = 0.5) +
geom_smooth()
g3
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
Just a line. Not points.
g3 <- ggplot(Wage, aes(x = age, y = wage)) +
# geom_point(alpha = 0.5) +
geom_smooth()
g3
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# jobclass.fit = lm(wage \~ jobclass + cut(age,4), data = Wage)
# plot(jobclass.fit)
# summary(jobclass.fit)
#anova(fit1,fit2,fit3)
#cut(Wage\$jobclass, 4)
model <- lm(wage ~ poly(age, 3, raw = T) + cut(year, breaks = 2002:2009) + maritl + race + education + jobclass + health + health_ins, data = Wage)
summary(model)
##
## Call:
## lm(formula = wage ~ poly(age, 3, raw = T) + cut(year, breaks = 2002:2009) +
## maritl + race + education + jobclass + health + health_ins,
## data = Wage)
##
## Residuals:
## Min 1Q Median 3Q Max
## -101.439 -18.601 -3.308 13.816 209.106
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.247e+00 1.973e+01 0.165 0.869297
## poly(age, 3, raw = T)1 3.589e+00 1.409e+00 2.546 0.010931
## poly(age, 3, raw = T)2 -4.877e-02 3.192e-02 -1.528 0.126684
## poly(age, 3, raw = T)3 1.629e-04 2.323e-04 0.701 0.483087
## cut(year, breaks = 2002:2009)(2003,2004] 2.596e+00 2.146e+00 1.209 0.226587
## cut(year, breaks = 2002:2009)(2004,2005] 3.120e+00 2.191e+00 1.424 0.154457
## cut(year, breaks = 2002:2009)(2005,2006] 7.621e+00 2.271e+00 3.356 0.000801
## cut(year, breaks = 2002:2009)(2006,2007] 5.199e+00 2.282e+00 2.278 0.022796
## cut(year, breaks = 2002:2009)(2007,2008] 6.409e+00 2.275e+00 2.817 0.004884
## cut(year, breaks = 2002:2009)(2008,2009] 8.390e+00 2.276e+00 3.686 0.000232
## maritl2. Married 1.338e+01 1.813e+00 7.382 2.02e-13
## maritl3. Widowed 4.343e-01 7.970e+00 0.054 0.956547
## maritl4. Divorced 1.464e-01 2.930e+00 0.050 0.960159
## maritl5. Separated 7.147e+00 4.860e+00 1.471 0.141523
## race2. Black -4.788e+00 2.134e+00 -2.244 0.024904
## race3. Asian -2.686e+00 2.589e+00 -1.038 0.299564
## race4. Other -5.402e+00 5.633e+00 -0.959 0.337673
## education2. HS Grad 7.549e+00 2.356e+00 3.205 0.001366
## education3. Some College 1.798e+01 2.506e+00 7.175 9.08e-13
## education4. College Grad 3.062e+01 2.535e+00 12.079 < 2e-16
## education5. Advanced Degree 5.313e+01 2.796e+00 19.001 < 2e-16
## jobclass2. Information 3.453e+00 1.319e+00 2.618 0.008877
## health2. >=Very Good 6.307e+00 1.413e+00 4.464 8.35e-06
## health_ins2. No -1.634e+01 1.405e+00 -11.630 < 2e-16
##
## (Intercept)
## poly(age, 3, raw = T)1 *
## poly(age, 3, raw = T)2
## poly(age, 3, raw = T)3
## cut(year, breaks = 2002:2009)(2003,2004]
## cut(year, breaks = 2002:2009)(2004,2005]
## cut(year, breaks = 2002:2009)(2005,2006] ***
## cut(year, breaks = 2002:2009)(2006,2007] *
## cut(year, breaks = 2002:2009)(2007,2008] **
## cut(year, breaks = 2002:2009)(2008,2009] ***
## maritl2. Married ***
## maritl3. Widowed
## maritl4. Divorced
## maritl5. Separated
## race2. Black *
## race3. Asian
## race4. Other
## education2. HS Grad **
## education3. Some College ***
## education4. College Grad ***
## education5. Advanced Degree ***
## jobclass2. Information **
## health2. >=Very Good ***
## health_ins2. No ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 33.77 on 2976 degrees of freedom
## Multiple R-squared: 0.3503, Adjusted R-squared: 0.3452
## F-statistic: 69.75 on 23 and 2976 DF, p-value: < 2.2e-16