p299
Fit some of the non-linear models investigated in this chapter to the Auto data set. Is there evidence for non-linear relationships in this data set? Create some informative plots to justify your answer.
# Compact overview of Auto: row/column counts, then each column's type and first values.
glimpse(Auto)
## Rows: 392
## Columns: 9
## $ mpg <dbl> 18, 15, 18, 16, 17, 15, 14, 14, 14, 15, 15, 14, 15, 14, …
## $ cylinders <dbl> 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 4, 6, 6, 6, 4,…
## $ displacement <dbl> 307, 350, 318, 304, 302, 429, 454, 440, 455, 390, 383, 3…
## $ horsepower <dbl> 130, 165, 150, 150, 140, 198, 220, 215, 225, 190, 170, 1…
## $ weight <dbl> 3504, 3693, 3436, 3433, 3449, 4341, 4354, 4312, 4425, 38…
## $ acceleration <dbl> 12.0, 11.5, 11.0, 12.0, 10.5, 10.0, 9.0, 8.5, 10.0, 8.5,…
## $ year <dbl> 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, …
## $ origin <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 3,…
## $ name <fct> chevrolet chevelle malibu, buick skylark 320, plymouth s…
library(skimr)
# Per-column summary of Auto (missingness, mean/sd, quantiles, inline histograms).
skim(Auto)
Name | Auto |
Number of rows | 392 |
Number of columns | 9 |
_______________________ | |
Column type frequency: | |
factor | 1 |
numeric | 8 |
________________________ | |
Group variables | None |
Variable type: factor
skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
---|---|---|---|---|---|
name | 0 | 1 | FALSE | 301 | amc: 5, for: 5, toy: 5, amc: 4 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
mpg | 0 | 1 | 23.45 | 7.81 | 9 | 17.00 | 22.75 | 29.00 | 46.6 | ▆▇▆▃▁ |
cylinders | 0 | 1 | 5.47 | 1.71 | 3 | 4.00 | 4.00 | 8.00 | 8.0 | ▇▁▃▁▅ |
displacement | 0 | 1 | 194.41 | 104.64 | 68 | 105.00 | 151.00 | 275.75 | 455.0 | ▇▂▂▃▁ |
horsepower | 0 | 1 | 104.47 | 38.49 | 46 | 75.00 | 93.50 | 126.00 | 230.0 | ▆▇▃▁▁ |
weight | 0 | 1 | 2977.58 | 849.40 | 1613 | 2225.25 | 2803.50 | 3614.75 | 5140.0 | ▇▇▅▅▂ |
acceleration | 0 | 1 | 15.54 | 2.76 | 8 | 13.78 | 15.50 | 17.02 | 24.8 | ▁▆▇▂▁ |
year | 0 | 1 | 75.98 | 3.68 | 70 | 73.00 | 76.00 | 79.00 | 82.0 | ▇▆▇▆▇ |
origin | 0 | 1 | 1.58 | 0.81 | 1 | 1.00 | 1.00 | 2.00 | 3.0 | ▇▁▂▁▂ |
## The following object is masked from package:ggplot2:
##
## mpg
# List the column names of Auto to pick predictors for the plots below.
names(Auto)
## [1] "mpg" "cylinders" "displacement" "horsepower" "weight"
## [6] "acceleration" "year" "origin" "name"
# Box plot of mpg for each cylinder count; `group` makes ggplot draw one box
# per distinct value of the numeric `cylinders` column. Stored in g1, then printed.
g1 <- ggplot(
  data = Auto,
  mapping = aes(x = cylinders, y = mpg, group = cylinders)
) +
  geom_boxplot() +
  theme(legend.position = "none")
g1
# mpg against displacement: semi-transparent points plus a smoother.
# The smoother's method and formula are left to geom_smooth()'s defaults.
ggplot(data = Auto, mapping = aes(x = displacement, y = mpg)) +
  geom_point(alpha = 0.5) +
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# mpg against horsepower: semi-transparent points plus a smoother.
# The smoother's method and formula are left to geom_smooth()'s defaults.
ggplot(data = Auto, mapping = aes(x = horsepower, y = mpg)) +
  geom_point(alpha = 0.5) +
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# mpg against weight: semi-transparent points plus a smoother.
# The smoother's method and formula are left to geom_smooth()'s defaults.
ggplot(data = Auto, mapping = aes(x = weight, y = mpg)) +
  geom_point(alpha = 0.5) +
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# mpg against acceleration: semi-transparent points plus a loess smoother.
# Method and formula are given explicitly, so no default-selection message is printed.
ggplot(data = Auto, mapping = aes(x = acceleration, y = mpg)) +
  geom_point(alpha = 0.5) +
  geom_smooth(
    method = "loess",
    formula = y ~ x
  )
# mpg against model year: semi-transparent points plus a loess smoother.
# The explicit method/formula match what geom_smooth() reported choosing by
# default for the earlier plots of this data set.
ggplot(data = Auto, mapping = aes(x = year, y = mpg)) +
  geom_point(alpha = 0.5) +
  geom_smooth(
    method = "loess",
    formula = y ~ x
  )
# Box plot of mpg for each origin code; `group` yields one box per distinct
# value of the numeric `origin` column.
ggplot(data = Auto, mapping = aes(x = origin, y = mpg, group = origin)) +
  geom_boxplot()
# Disabled option: + theme(legend.position = "none")
library(gam)
# Baseline model: mpg regressed on cylinders as a single parametric term
# (no smooth terms), fitted with gam() so it can be compared with richer fits.
# Alternative kept for reference, adding s() terms for acceleration and weight:
# gam.fit <- gam(mpg ~ cylinders + s(acceleration, 4) + s(weight, 4), data = Auto)
gam.fit <- gam(mpg ~ cylinders, data = Auto)

# Print deviance, AIC and the ANOVA table for the parametric effects.
summary(gam.fit)
##
## Call: gam(formula = mpg ~ cylinders, data = Auto)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -14.2413 -3.1832 -0.6332 2.5491 17.9168
##
## (Dispersion Parameter for gaussian family taken to be 24.1434)
##
## Null Deviance: 23818.99 on 391 degrees of freedom
## Residual Deviance: 9415.91 on 390 degrees of freedom
## AIC: 2364.574
##
## Number of Local Scoring Iterations: 2
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## cylinders 1 14403.1 14403.1 596.57 < 2.2e-16 ***
## Residuals 390 9415.9 24.1
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1