1. Use today's setup, but add a religiosity variable for this sample of persons of immigrant origin using PCA. z-standardize religiosity and visualize its distribution.
# Add packages to library
library(tidyverse) # Add the tidyverse package to my current library.
library(haven) # Handle labelled data.
library(essurvey) # Add ESS API package.
library(ggplot2) # Allows us to create nice figures.
library(estimatr) # Allows us to estimate (cluster-)robust standard errors.
library(texreg) # Allows us to make nicely-formatted Html & Latex regression tables.
library(broom) # Allows us to turn parts of model objects into tibbles.
library(modelr) # Model predictions (can also do resampling).
library(lmtest) # Allows us to estimate (cluster-)robust standard errors for GLMs. 
library(sandwich) # Allows us to estimate (cluster-)robust standard errors for GLMs. 
library(margins) # Averaged marginal effects
ESS <- ESS %>% transmute( # Create new variables and keep only those
  cntry = as_factor(cntry), # Country of interview
  gndr = as_factor(gndr),
  facntr = as_factor(facntr), # Father born in cntry
  mocntr = as_factor(mocntr), # Mother born in cntry
  dscrgrp = as_factor(dscrgrp), # Belonging to discriminated group #<<
  dscrrlg = as_factor(dscrrlg), # Discriminated because of religion #<<
  dscrrce = as_factor(dscrrce), # Discriminated because of race #<<
  dscrlng = as_factor(dscrlng), # Discriminated because of language #<<
  dscretn = as_factor(dscretn), # Discriminated because of ethnicity #<<
  discr = case_when( #<<
    dscrgrp == "Yes" & (dscrrlg == "Marked" | dscrrce == "Marked" | #<<
                          dscrlng == "Marked" | dscretn == "Marked") ~ "Yes", #<<
    dscrgrp == "No" | (dscrgrp == "Yes" & dscrrlg != "Marked" & dscrrce != "Marked" #<<
                       & dscrlng != "Marked" & dscretn != "Marked") ~ "No", #<<
    TRUE ~ as.character(NA)) %>% as_factor(), # all others missing, then change to factor #<<
  pspwght = zap_label(pspwght),
  # Regigion
    rlgblg = as_factor(rlgblg), # Belonging to a religion
    rlgdgr = zap_labels(rlgdgr), # How religios
    rlgatnd = max(rlgatnd, na.rm = TRUE) - zap_labels(rlgatnd), # Service attendance
    pray = max(pray, na.rm = TRUE) - zap_labels(pray), # Prayer frequency
  agea = zap_label(agea),
  eduyrs = case_when( # Education
    eduyrs > 21 ~ 21, # Recode to max 21 years of edu.
    eduyrs < 9 ~ 9, # Recode to min 9 years of edu.
    TRUE ~ zap_labels(eduyrs))) %>% # make it numeric, then
  dplyr::filter(# Keep only respondents of immigrant origin,
    facntr == "No" & mocntr == "No" &
      # filter cases from four countries, then
      (cntry == "Denmark" | cntry == "Sweden" | cntry == "Norway" | cntry == "Germany"))

# Conduct PCA of climate change concerns
(pca <- ESS %>% prcomp(
  formula = ~ rlgdgr + rlgatnd + pray,
  data = ., na.action = na.exclude,
  center = TRUE, scale = TRUE))
# Standard deviations (1, .., p=3):
# [1] 1.49 0.69 0.54
# 
# Rotation (n x k) = (3 x 3):
#           PC1   PC2    PC3
# rlgdgr  -0.59  0.37  0.714
# rlgatnd -0.55 -0.84 -0.017
# pray    -0.59  0.40 -0.700
# Show importance of single components.
summary(pca)
# Importance of components:
#                          PC1   PC2    PC3
# Standard deviation     1.494 0.686 0.5435
# Proportion of Variance 0.744 0.157 0.0985
# Cumulative Proportion  0.744 0.902 1.0000
# Add new religiosity variable to ESS; beware to turn it around using "-"
ESS$relig <- -pca$x[, "PC1"] %>% scale() %>% as.numeric()

# Make histogram of religiosity
ggplot(data = ESS, aes(x = relig)) +
  geom_histogram() +
  labs(y = "Count", x = "Religiosity") +
  theme_minimal()

  1. Drop unused factor levels in cntry, and make a design matrix like I did in my slides (i.e., keep only discr, relig, cntry, agea, gndr, eduyrs, pspwght, and apply casewise deletion).
ESS <- ESS %>%
  dplyr::select(discr, relig, cntry, agea, gndr, eduyrs, pspwght) %>%
  mutate( # Drop unused factor levels
    cntry = fct_drop(cntry),
    gndr = fct_drop(gndr)) %>%
  drop_na()

(n <- nrow(ESS))
# [1] 818

That design matrix has observations left.

  1. Regress discrimination on religiosity; compare the results of an LPM and a logistic regression in a regression table.
lpm <- lm_robust(as.numeric(discr) - 1 ~ relig, weights = pspwght, data = ESS)
log_model <- glm(discr ~ relig, weights = pspwght, data = ESS, family = binomial(link="logit"))
log_model_robSE <- coeftest(log_model, sandwich::vcovHC(log_model))

# Summary of LPMs, plug in SEs by hand
screenreg(list(lpm, log_model_robSE), include.ci = FALSE, custom.model.names = c("LPM", "Logistic"))
# 
# ==================================
#              LPM         Logistic 
# ----------------------------------
# (Intercept)    0.14 ***  -1.81 ***
#               (0.01)     (0.11)   
# relig          0.02       0.16    
#               (0.01)     (0.11)   
# ----------------------------------
# R^2            0.00               
# Adj. R^2       0.00               
# Num. obs.    818                  
# RMSE           0.37               
# ==================================
# *** p < 0.001; ** p < 0.01; * p < 0.05
  1. Model 3: Add cntry, agea, and gndr as control variables to your logistic regression, what happens to our two religiosity estimates?
log_model2 <- glm(discr ~ relig + agea + gndr + cntry, weights = pspwght, data = ESS, family = binomial(link="logit"))
log_model2_robSE <- coeftest(log_model2, sandwich::vcovHC(log_model2))

# Summary
screenreg(list(log_model_robSE, log_model2_robSE), include.ci = FALSE)
# 
# ==================================
#               Model 1    Model 2  
# ----------------------------------
# (Intercept)   -1.81 ***  -0.76 *  
#               (0.11)     (0.30)   
# relig          0.16       0.14    
#               (0.11)     (0.12)   
# agea                     -0.03 ***
#                          (0.01)   
# gndrFemale               -0.05    
#                          (0.24)   
# cntryDenmark              0.32    
#                          (0.37)   
# cntryNorway               0.09    
#                          (0.32)   
# cntrySweden               0.11    
#                          (0.29)   
# ==================================
# *** p < 0.001; ** p < 0.01; * p < 0.05
  1. Estimate the Averaged Marginal Effects for Model 2 and compare them to the coefficients of an LPM. Do they differ?
AME_log_model2 <- margins(log_model2, vcov = sandwich::vcovHC(log_model2))
lpm2 <- lm_robust(as.numeric(discr) - 1 ~ relig + agea + gndr + cntry, weights = pspwght, data = ESS)

# Summary of LPMs versus AMEs
screenreg(list(lpm2, AME_log_model2), include.ci = FALSE, custom.model.names = c("LPM", "AME"))
# 
# ========================================
#                  LPM         AME        
# ----------------------------------------
# (Intercept)        0.26 ***             
#                   (0.04)                
# relig              0.02         0.02    
#                   (0.01)       (0.01)   
# agea              -0.00 ***    -0.00 ***
#                   (0.00)       (0.00)   
# gndrFemale        -0.01        -0.01    
#                   (0.03)       (0.03)   
# cntryDenmark       0.04         0.04    
#                   (0.05)       (0.05)   
# cntryNorway        0.01         0.01    
#                   (0.04)       (0.04)   
# cntrySweden        0.02         0.01    
#                   (0.03)       (0.03)   
# ----------------------------------------
# R^2                0.03                 
# Adj. R^2           0.02                 
# Num. obs.        818                    
# RMSE               0.37                 
# Deviance (Null)               753.78    
# df.null                       817       
# Log Likelihood               -368.58    
# AIC                           751.16    
# BIC                           784.11    
# Deviance                      724.80    
# DF Resid.                     811       
# nobs                          818       
# ========================================
# *** p < 0.001; ** p < 0.01; * p < 0.05
  1. Add an interaction between religiosity and country of residence.
log_model3 <- glm(discr ~ relig*cntry + agea + gndr, weights = pspwght, data = ESS, family = binomial(link="logit")) %>%
  coeftest(., sandwich::vcovHC(.))

# Summary of LPMs, plug in SEs by hand
screenreg(list(log_model, log_model2, log_model3), 
          include.ci = FALSE)
# 
# =======================================================
#                     Model 1      Model 2      Model 3  
# -------------------------------------------------------
# (Intercept)           -1.81 ***    -0.76 **   -0.83 *  
#                       (0.10)       (0.29)     (0.32)   
# relig                  0.16         0.14       0.47 *  
#                       (0.09)       (0.09)     (0.20)   
# agea                               -0.03 ***  -0.03 ***
#                                    (0.01)     (0.01)   
# gndrFemale                         -0.05      -0.05    
#                                    (0.19)     (0.24)   
# cntryDenmark                        0.32       0.45    
#                                    (0.32)     (0.38)   
# cntryNorway                         0.09       0.21    
#                                    (0.28)     (0.33)   
# cntrySweden                         0.11       0.24    
#                                    (0.23)     (0.29)   
# relig:cntryDenmark                            -0.37    
#                                               (0.34)   
# relig:cntryNorway                             -0.73 *  
#                                               (0.36)   
# relig:cntrySweden                             -0.53    
#                                               (0.28)   
# -------------------------------------------------------
# AIC                  767.24       751.16               
# BIC                  776.66       784.11               
# Log Likelihood      -381.62      -368.58               
# Deviance             750.69       724.80               
# Num. obs.            818          818                  
# =======================================================
# *** p < 0.001; ** p < 0.01; * p < 0.05
  1. Although only the difference in slopes between Germany and Norway is significant, visualize the association between religiosity and discrimination by country as implied in your model.
log_model3 <- glm(discr ~ relig*cntry + agea + gndr, weights = pspwght, data = ESS, family = binomial(link="logit"))
synth_data <- ESS %>% # Use ESS_minority, then
   # Create synthetic data, then
   data_grid(agea, gndr, cntry, relig) %>% 
   mutate( # Set confounders to a constant value, then
     agea = mean(ESS$agea, na.rm = TRUE),
     gndr = "Female") %>%
   unique() %>% # Drop duplicates, then
   mutate( # Get the predictions
     fit = predict(log_model3, 
                    type = "response", 
                    newdata = .),
     fit_se = predict(log_model3, 
                       type = "response", 
                       newdata = .,
                       se.fit = TRUE)$se.fit,
     min95 = fit - fit_se*qt(0.975, df = nrow(ESS)),
     max95 = fit + fit_se*qt(0.975, df = nrow(ESS)))

ggplot(data = synth_data, 
       aes(y = fit, x = relig)) +
  geom_ribbon(aes(ymin = min95, ymax = max95,
                  fill = cntry), alpha = 1/10) +
  geom_line(aes(color = cntry)) +
  labs(x = "Religiosity",
  y = "Probability of belonging to a group \n discriminated based on \n race, ethnicity, language, or religion",
  caption = "(Covariates set to: Women at mean age)") + 
  theme_minimal() +
  theme(legend.position="bottom",
        legend.title=element_blank())

  1. Hide Sweden and Denmark from the plot and show only Germany and Norway.
ggplot(data = synth_data %>% dplyr::filter(cntry != "Denmark" & cntry != "Sweden"), 
       aes(y = fit, x = relig)) +
  geom_ribbon(aes(ymin = min95, ymax = max95,
                  fill = cntry), alpha = 1/10) +
  geom_line(aes(color = cntry)) +
  labs(x = "Religiosity",
  y = "Probability of belonging to a group \n discriminated based on \n race, ethnicity, language, or religion",
  caption = "(Covariates set to: Women at mean age)") + 
  theme_minimal() +
  theme(legend.position="bottom",
        legend.title=element_blank())