This assignment is ungraded. I encourage you to review the problems to see if you (1) know how to do them or (2) know how to google how to do them. If either path forward escapes you, I suggest that you complete this assignment.
Part 1
Exercise 7.2.3 from Data Science for Public Policy. The data can be loaded from the URL in the code below.
- Graph and regress sale price against gross square feet; interpret the results.
library(tidyverse)

# Load the NYC home sales data
sale_df <- read_csv("https://raw.githubusercontent.com/DataScienceForPublicPolicy/diys/main/data/home_sales_nyc.csv")

# Scatterplot of sale price against gross square feet
ggplot(data = sale_df, aes(x = gross.square.feet, y = sale.price)) +
  geom_point(alpha = 0.15,
             size = 1.2,
             colour = "blue") +
  scale_x_continuous("Property size (gross square feet)", labels = scales::comma) +
  scale_y_continuous("Sale price (USD)", labels = scales::comma)

# Regress sale price on gross square feet
reg_est <- lm(sale.price ~ gross.square.feet, data = sale_df)
summary(reg_est)
##
## Call:
## lm(formula = sale.price ~ gross.square.feet, data = sale_df)
##
## Residuals:
##      Min       1Q   Median       3Q      Max
## -1700116  -212264   -44958   138638  8661923
##
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)
## (Intercept)       -42584.389  11534.260  -3.692 0.000223 ***
## gross.square.feet    466.176      7.097  65.684  < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 463900 on 12666 degrees of freedom
## Multiple R-squared: 0.2541, Adjusted R-squared: 0.254
## F-statistic: 4314 on 1 and 12666 DF, p-value: < 2.2e-16
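Interpreting the results: each additional gross square foot is associated with about $466 more in sale price (p < 2e-16), while the intercept (-$42,584) is the extrapolated price of a zero-square-foot property and has no practical meaning on its own. The R-squared of 0.254 means building size alone explains roughly a quarter of the variation in sale price. To attach an interval estimate to the slope, confint() works directly on the fitted model:

# 95% confidence intervals for the regression coefficients
confint(reg_est, level = 0.95)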
Part 2
Reproduce this figure from tidymodels section 3.3 using the data from Part 1, replacing mpg with sale price and correlating it against the remaining numeric variables.
# Run cor.test of each numeric column against sale price,
# dropping the outcome itself and the categorical ID fields
corr_res <- map(sale_df %>%
                  select(where(is.numeric), -c(sale.price, borough, zip.code)),
                cor.test,
                y = sale_df$sale.price)

# Tidy each test and plot the estimates with confidence intervals
corr_res %>%
  map_dfr(broom::tidy, .id = "predictor") %>%
  ggplot(aes(x = fct_reorder(predictor, estimate))) +
  geom_point(aes(y = estimate)) +
  geom_errorbar(aes(ymin = conf.low, ymax = conf.high), width = .1) +
  labs(x = NULL, y = "Correlation with sale price") +
  theme_bw() +
  coord_flip()
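Each element of corr_res is a full htest object, so an individual test can be inspected before (or instead of) building the plot. For example, the square-footage correlation used throughout this assignment:

# Tidy a single correlation test from the list
broom::tidy(corr_res$gross.square.feet)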
Part 3
Exercise 7.4.5
Estimate a set of regressions, evaluate the pros and cons of each, and select the “best” specification.
Create and analyze the following four models from the textbook and one of your own:
- Model 1 (mod1) regresses sale price on building area
- Model 2 (mod2) adds borough as a categorical variable
- Model 3 (mod3) incorporates an interaction to estimate borough-specific slopes for building area
- Model 4 (mod4) adds land area and building age
library(gridExtra)

# Model 1: simple regression
mod1 <- lm(sale.price ~ gross.square.feet,
           data = sale_df)

# Model 2: add borough as a categorical variable
mod2 <- lm(sale.price ~ gross.square.feet + factor(borough),
           data = sale_df)

# Model 3: interaction terms give borough-specific slopes
mod3 <- lm(sale.price ~ gross.square.feet * factor(borough),
           data = sale_df)

# Model 4: add land area and building age
mod4 <-
  lm(sale.price ~ gross.square.feet * factor(borough) + land.square.feet + age,
     data = sale_df)

# Model 5 (my own): add quarter-of-sale fixed effects for seasonality
sale_df <- sale_df %>%
  mutate(quarter = lubridate::floor_date(sale.date, 'quarter'))
mod5 <- lm(sale.price ~ gross.square.feet * factor(borough) + land.square.feet +
             age + factor(quarter),
           data = sale_df)
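With all five models fit, their in-sample fit can be compared in one step before plotting. BIC() accepts multiple models at once, and broom::glance() output can be collected into a single table (note that BIC/AIC comparisons assume every model is fit to the same rows):

# Compare information criteria across all five specifications
BIC(mod1, mod2, mod3, mod4, mod5)

# Or collect broom::glance() summaries into one table
list(mod1 = mod1, mod2 = mod2, mod3 = mod3, mod4 = mod4, mod5 = mod5) %>%
  map_dfr(broom::glance, .id = "model") %>%
  select(model, r.squared, adj.r.squared, AIC, BIC)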
# Model 1: observed prices (black) with fitted values (red)
base1 <-
  ggplot(sale_df, aes(x = gross.square.feet, y = sale.price / 1000000)) +
  geom_point(colour = rgb(0, 0, 0, 0.1), size = 0.8) +
  geom_point(aes(y = predict(mod1) / 1000000),
             colour = rgb(1, 0, 0, 0.2),
             size = 0.6) +
  xlab("Gross Square Feet") + ylab("Sales Price ($MM)") +
  ggtitle(paste0("Model 1 (BIC = ", round(BIC(mod1)), ")")) +
  xlim(0, 3000) + ylim(0, 3)

# Model 2
base2 <-
  ggplot(sale_df, aes(x = gross.square.feet, y = sale.price / 1000000)) +
  geom_point(colour = rgb(0, 0, 0, 0.1), size = 0.8) +
  geom_point(aes(y = predict(mod2) / 1000000),
             colour = rgb(1, 0, 0, 0.2),
             size = 0.6) +
  xlab("Gross Square Feet") + ylab("Sales Price ($MM)") +
  ggtitle(paste0("Model 2 (BIC = ", round(BIC(mod2)), ")")) +
  xlim(0, 3000) + ylim(0, 3)

# Model 3
base3 <-
  ggplot(sale_df, aes(x = gross.square.feet, y = sale.price / 1000000)) +
  geom_point(colour = rgb(0, 0, 0, 0.1), size = 0.8) +
  geom_point(aes(y = predict(mod3) / 1000000),
             colour = rgb(1, 0, 0, 0.2),
             size = 0.6) +
  xlab("Gross Square Feet") + ylab("Sales Price ($MM)") +
  ggtitle(paste0("Model 3 (BIC = ", round(BIC(mod3)), ")")) +
  xlim(0, 3000) + ylim(0, 3)

# Model 4
base4 <-
  ggplot(sale_df, aes(x = gross.square.feet, y = sale.price / 1000000)) +
  geom_point(colour = rgb(0, 0, 0, 0.1), size = 0.8) +
  geom_point(aes(y = predict(mod4) / 1000000),
             colour = rgb(1, 0, 0, 0.2),
             size = 0.6) +
  xlab("Gross Square Feet") + ylab("Sales Price ($MM)") +
  ggtitle(paste0("Model 4 (BIC = ", round(BIC(mod4)), ")")) +
  xlim(0, 3000) + ylim(0, 3)
grid.arrange(base1, base2, base3, base4, ncol = 2)
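The four base plots above differ only in the model object and title, so the repetition could be factored into a small helper. A sketch, using the hypothetical name plot_fit:

# Hypothetical helper reproducing any one of the panels above
plot_fit <- function(mod, label) {
  ggplot(sale_df, aes(x = gross.square.feet, y = sale.price / 1000000)) +
    geom_point(colour = rgb(0, 0, 0, 0.1), size = 0.8) +
    geom_point(aes(y = predict(mod) / 1000000),
               colour = rgb(1, 0, 0, 0.2), size = 0.6) +
    xlab("Gross Square Feet") + ylab("Sales Price ($MM)") +
    ggtitle(paste0(label, " (BIC = ", round(BIC(mod)), ")")) +
    xlim(0, 3000) + ylim(0, 3)
}
grid.arrange(plot_fit(mod1, "Model 1"), plot_fit(mod2, "Model 2"),
             plot_fit(mod3, "Model 3"), plot_fit(mod4, "Model 4"), ncol = 2)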
broom::glance(mod5)
## # A tibble: 1 × 12
##   r.squared adj.r.squared   sigma statistic p.value    df   logLik    AIC    BIC
##       <dbl>         <dbl>   <dbl>     <dbl>   <dbl> <dbl>    <dbl>  <dbl>  <dbl>
## 1     0.489         0.489 384076.      808.       0    15 -180860. 3.62e5 3.62e5
## # ℹ 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
broom::tidy(mod5)
## # A tibble: 16 × 5
##    term                                 estimate std.error statistic  p.value
##    <chr>                                   <dbl>     <dbl>     <dbl>    <dbl>
##  1 (Intercept)                           794057.   192345.    4.13   3.68e- 5
##  2 gross.square.feet                       1032.       56.4  18.3    9.11e-74
##  3 factor(borough)2                     -652250.   194929.   -3.35   8.22e- 4
##  4 factor(borough)3                    -1098488.   192639.   -5.70   1.21e- 8
##  5 factor(borough)4                     -611850.   192018.   -3.19   1.44e- 3
##  6 factor(borough)5                     -596887.   192362.   -3.10   1.92e- 3
##  7 land.square.feet                          37.3       1.86  20.1   3.31e-88
##  8 age                                     -721.      154.    -4.69  2.71e- 6
##  9 factor(quarter)2017-10-01                431.    13121.     0.0329 9.74e- 1
## 10 factor(quarter)2018-01-01               3506.    13441.     0.261  7.94e- 1
## 11 factor(quarter)2018-04-01               8800.    13278.     0.663  5.07e- 1
## 12 factor(quarter)2018-07-01              52101.    14444.     3.61   3.11e- 4
## 13 gross.square.feet:factor(borough)2      -866.       60.4  -14.3   3.18e-46
## 14 gross.square.feet:factor(borough)3      -291.       57.8   -5.04  4.76e- 7
## 15 gross.square.feet:factor(borough)4      -761.       57.4  -13.3   7.37e-40
## 16 gross.square.feet:factor(borough)5      -890.       57.6  -15.4   2.58e-53
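Because mod5 simply adds the quarter dummies to mod4, the two models are nested, so a partial F-test can check whether the seasonal terms are jointly significant:

# Joint F-test of the quarter fixed effects
# (assumes both models are fit to the same rows)
anova(mod4, mod5)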
Part 4
In the class divvy example (see the lectures page for code/files), we had a lot of missing values in our data, and we didn't have a very rigorous treatment of time/seasonality. Explore how impactful these issues are by creating a few different models and comparing their predictions, using the workflows we saw in class: rsample (splitting data), parsnip (linear_reg, set_engine, set_mode, fit), yardstick (mape, rmse), and broom (augment).
library(tidymodels)
library(lubridate)

divvy_data <- read_csv('https://github.com/erhla/pa470spring2023/raw/main/static/lectures/week_3_data.csv')

# split data into training and testing sets
grouped <- rsample::initial_split(divvy_data)
train <- training(grouped)
test <- testing(grouped)

# create parsnip model with hour, weekday, and month effects
lm_model <-
  parsnip::linear_reg() %>%
  set_engine("lm") %>%
  set_mode('regression') %>%
  fit(rides ~ solar_rad + factor(hour(started_hour)) +
        factor(wday(started_hour)) +
        factor(month(started_hour)) +
        temp + wind + interval_rain + avg_speed, data = train)

# predict on the test set (May onward) and augment with predictions
preds <-
  predict(lm_model, test %>% filter(month(started_hour) >= 5))
test_preds <- lm_model %>%
  augment(test %>% filter(month(started_hour) >= 5))

# evaluate
yardstick::mape(test_preds,
                truth = rides,
                estimate = .pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 mape    standard        119.
yardstick::rmse(test_preds,
                truth = rides,
                estimate = .pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard        359.
ggplot(test_preds, aes(x=.pred)) +
geom_density()
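The prompt flags missing values, and lm() silently drops incomplete rows, so it is worth counting missingness by column before engineering any features. A quick sketch:

# count missing values in each column of the divvy data
divvy_data %>%
  summarize(across(everything(), ~ sum(is.na(.x)))) %>%
  pivot_longer(everything(), names_to = "column", values_to = "n_missing") %>%
  arrange(desc(n_missing))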
# engineer weather features
divvy_data <- divvy_data %>%
  mutate(
    day = floor_date(started_hour, 'day'),
    bad_weather = if_else(solar_rad <= 5 & temp <= 5, 1, 0),
    nice_weather = if_else(solar_rad >= 25 & temp >= 15, 1, 0)
  )

# total daily precipitation, then flag rainy days
total_rain <- divvy_data %>%
  group_by(day) %>%
  summarize(total_precip = sum(interval_rain, na.rm = TRUE))

divvy_data <- divvy_data %>%
  left_join(total_rain, by = "day") %>%
  mutate(rainy_weather = if_else(total_precip > 0, 1, 0))
# re-split with the new weather features and refit
grouped <- rsample::initial_split(divvy_data)
train <- training(grouped)
test <- testing(grouped)

lm_model <-
  parsnip::linear_reg() %>%
  set_engine("lm") %>%
  fit(rides ~ solar_rad + factor(hour(started_hour)) +
        factor(nice_weather) + factor(bad_weather) + factor(rainy_weather) +
        temp + avg_speed, data = train)

preds <-
  predict(lm_model, test %>% filter(month(started_hour) >= 5))
test_preds <- lm_model %>%
  augment(test %>% filter(month(started_hour) >= 5))

yardstick::mape(test_preds,
                truth = rides,
                estimate = .pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 mape    standard        104.
yardstick::rmse(test_preds,
                truth = rides,
                estimate = .pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard        350.
ggplot(test_preds, aes(x=.pred)) +
geom_density()
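Rather than calling each metric separately, yardstick's metric_set() bundles mape and rmse into a single evaluation function; running it on the test_preds from each specification gives a side-by-side comparison:

# evaluate both metrics in one call
eval_metrics <- yardstick::metric_set(yardstick::mape, yardstick::rmse)
eval_metrics(test_preds, truth = rides, estimate = .pred)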