Retention

at20 <- read_csv("raw_data/active_wd_2020.csv") %>% 
  select(employee_id, hire_date) %>% 
  mutate(year20=T)

active_21 <- read_csv("raw_data/active_wd_2021.csv")  %>% 
  select(employee_id=`Employee ID`, hire_date=`Hire Date`) %>% 
  mutate(hire_date=mdy(hire_date))

terminated_21 <- read_csv("raw_data/terminated_wd_2021.csv") %>% 
  select(employee_id=`Employee ID`, hire_date=`Hire Date`) %>% 
  mutate(hire_date=mdy(hire_date))

at21 <- rbind(active_21, terminated_21)  %>% 
  mutate(year21=T)

at2122 <- anti_join(at20, at21)
cat(paste0("Missing employees between 2020 and 2021 data: ", nrow(at2122)))
## Missing employees between 2020 and 2021 data: 23
df <- read_csv("raw_data/active_wd_2021.csv")

colnames(df) <- c("department","employee_id","gender",   
                  "race_ethnicity","education", "military_status",
                  "date_of_birth", "original_hire_date","hire_date","pay_rate_type", "current_base_pay", 
                  "job_profile_current","time_type_current", "cost_center_current",  
                  "effective_date1", "business_process_type1", "business_process_reason1", "pay_rate_type1", "base_pay_change1", "job_profile1", "time_type1", "cost_center1", 
                  "effective_date2", "business_process_type2", "business_process_reason2", "pay_rate_type2", "base_pay_change2", "job_profile2", "time_type2", "cost_center2", 
                  "effective_date3", "business_process_type3", "business_process_reason3", "pay_rate_type3", "base_pay_change3", "job_profile3", "time_type3", "cost_center3",
                  "effective_date4", "business_process_type4", "business_process_reason4", "pay_rate_type4", "base_pay_change4", "job_profile4", "time_type4", "cost_center4", 
                  "effective_date5", "business_process_type5", "business_process_reason5", "pay_rate_type5", "base_pay_change5", "job_profile5", "time_type5", "cost_center5", 
                  "effective_date6", "business_process_type6", "business_process_reason6", "pay_rate_type6", "base_pay_change6", "job_profile6", "time_type6", "cost_center6",
                  "effective_date7", "business_process_type7", "business_process_reason7", "pay_rate_type7", "base_pay_change7", "job_profile7", "time_type7", "cost_center7", 
                  "effective_date8", "business_process_type8", "business_process_reason8", "pay_rate_type8", "base_pay_change8", "job_profile8", "time_type8", "cost_center8", 
                  "effective_date9", "business_process_type9", "business_process_reason9", "pay_rate_type9", "base_pay_change9", "job_profile9", "time_type9", "cost_center9",
                  "effective_date10", "business_process_type10", "business_process_reason10", "pay_rate_type10", "base_pay_change10", "job_profile10", "time_type10", "cost_center10",
                  "effective_date11", "business_process_type11", "business_process_reason11", "pay_rate_type11", "base_pay_change11", "job_profile11", "time_type11", "cost_center11",
                  "effective_date12", "business_process_type12", "business_process_reason12", "pay_rate_type12", "base_pay_change12", "job_profile12", "time_type12", "cost_center12", 
                  "effective_date13", "business_process_type13", "business_process_reason13", "pay_rate_type13", "base_pay_change13", "job_profile13", "time_type13", "cost_center13", 
                  "effective_date14", "business_process_type14", "business_process_reason14", "pay_rate_type14", "base_pay_change14", "job_profile14", "time_type14", "cost_center14",
                  "effective_date15", "business_process_type15", "business_process_reason15", "pay_rate_type15", "base_pay_change15", "job_profile15", "time_type15", "cost_center15", 
                  "effective_date16", "business_process_type16", "business_process_reason16", "pay_rate_type16", "base_pay_change16", "job_profile16", "time_type16", "cost_center16", 
                  "effective_date17", "business_process_type17", "business_process_reason17", "pay_rate_type17", "base_pay_change17", "job_profile17", "time_type17", "cost_center17", 
                  "effective_date18", "business_process_type18", "business_process_reason18", "pay_rate_type18", "base_pay_change18", "job_profile18", "time_type18", "cost_center18", 
                  "effective_date19", "business_process_type19", "business_process_reason19", "pay_rate_type19", "base_pay_change19", "job_profile19", "time_type19", "cost_center19", 
                  "effective_date20", "business_process_type20", "business_process_reason20", "pay_rate_type20", "base_pay_change20", "job_profile20", "time_type20", "cost_center20", 
                  "effective_date21", "business_process_type21", "business_process_reason21", "pay_rate_type21", "base_pay_change21", "job_profile21", "time_type21", "cost_center21", 
                  "effective_date22", "business_process_type22", "business_process_reason22", "pay_rate_type22", "base_pay_change22", "job_profile22", "time_type22", "cost_center22", 
                  "effective_date23", "business_process_type23", "business_process_reason23", "pay_rate_type23", "base_pay_change23", "job_profile23", "time_type23", "cost_center23", 
                  "effective_date24", "business_process_type24", "business_process_reason24", "pay_rate_type24", "base_pay_change24", "job_profile24", "time_type24", "cost_center24", 
                  "effective_date25", "business_process_type25", "business_process_reason25", #"pay_rate_type_25", "base_pay_change25", 
                  "job_profile25", #"time_type25", 
                  "cost_center25", 
                  "effective_date_26", "business_process_type_26", "business_process_reason_26", #"pay_rate_type_26", "base_pay_change_26", 
                  "job_profile_26", #"time_type_26", 
                  "cost_center_26", "2008_annual_performance_rating", "2009_annual_performance_rating", "2010_annual_performance_rating",
                  "2011_annual_performance_rating", "2012_annual_performance_rating", "2013_annual_performance_rating",
                  "2014_annual_performance_rating", "2015_annual_performance_rating", "2016_annual_performance_rating",
                  "2017_annual_performance_rating", "2018_annual_performance_rating", "2019_annual_performance_rating",
                  "2020_annual_performance_rating")

for (i in 1:26) {
  
  df_year <- df %>% 
    select(department, employee_id, gender, race_ethnicity,
           military_status, date_of_birth,
           original_hire_date, hire_date, pay_rate_type,
           current_base_pay,
           job_profile_current, time_type_current, cost_center_current,
           ends_with(paste0("e", as.character(i))),
           ends_with(paste0("n", as.character(i))),
           ends_with(paste0("r", as.character(i)))
    ) %>% 
    rename(pay_rate_type_current=pay_rate_type) %>% 
    mutate(termination_date=ymd("1900-01-01"))
  
  colnames(df_year) <- gsub(paste0("n", as.character(i)), "n", colnames(df_year))
  colnames(df_year) <- gsub(paste0("e", as.character(i)), "e", colnames(df_year))
  colnames(df_year) <- gsub(paste0("r", as.character(i)), "r", colnames(df_year))
  

  if(!"effective_date" %in% colnames(df_year)) {
    df_year$effective_date <- NA
  }
  
  
  if(!"business_process_type" %in% colnames(df_year)) {
    df_year$business_process_type <- NA
  }
  
  
  if(!"business_process_reason" %in% colnames(df_year)) {
    df_year$business_process_reason <- NA
  }
  
  
  if(!"pay_rate_type" %in% colnames(df_year)) {
    df_year$pay_rate_type <- NA
  }
  
  
  if(!"base_pay_change" %in% colnames(df_year)) {
    df_year$base_pay_change <- NA
  }
  
  
  if(!"job_profile" %in% colnames(df_year)) {
    df_year$job_profile <- NA
  }
  
  if(!"time_type" %in% colnames(df_year)) {
    df_year$time_type<- NA
  }
  
  
  if(!"cost_center" %in% colnames(df_year)) {
    df_year$cost_center<- NA
  }
  
  df_year <- df_year %>% 
    select(department, employee_id, gender, race_ethnicity,
           military_status,
           date_of_birth,
           original_hire_date, hire_date, termination_date,
           pay_rate_type_current,
           current_base_pay,
           job_profile_current, time_type_current, cost_center_current,
           effective_date,
           business_process_type, business_process_reason, pay_rate_type,
           base_pay_change, job_profile, time_type, cost_center)
  df_year$workday <- i
  
  if (i==1) {
    df_mega <- df_year
  } else {
    df_mega <- rbind(df_mega, df_year)
  }
  
}

df_mega <- filter(df_mega, !is.na(cost_center))

data_date <- ymd("2021-05-28")
 
df_mega <- df_mega %>% mutate(date_of_birth = mdy(date_of_birth),
                    age = floor(decimal_date(data_date) - decimal_date(date_of_birth)))


df_mega <- df_mega %>% 
  mutate(date_of_birth = case_when(
    age < 0 ~ date_of_birth-years(100),
    TRUE ~ date_of_birth),
    age = floor(decimal_date(data_date) - decimal_date(date_of_birth)))

df_mega <- df_mega %>% mutate(hire_date = mdy(hire_date),
                    years_of_service = floor(decimal_date(data_date) - decimal_date(hire_date)))

df_mega <- df_mega %>% 
  mutate(hire_date = case_when(
    years_of_service < 0 ~ hire_date-years(100),
    TRUE ~ hire_date),
    years_of_service = case_when(
      years_of_service < 0 ~ floor(decimal_date(data_date) - decimal_date(hire_date)),
      TRUE ~ years_of_service
    ))

df_mega <- df_mega %>% 
  mutate(original_hire_date=mdy(original_hire_date),
         termination_date=ymd(termination_date),
         effective_date=mdy(effective_date))


df_mega <- df_mega %>% 
  mutate(age_group_5=case_when(
    age < 25 ~ "<25",
    age >= 25 & age < 30 ~ "25-29",
    age >= 30 & age < 35 ~ "30-34",
    age >= 35 & age < 40 ~ "35-39",
    age >= 40 & age < 45 ~ "40-44",
    age >= 45 & age < 50 ~ "45-49",
    age >= 50 & age < 55 ~ "50-54",
    age >= 55 & age < 60 ~ "55-59",
    age >= 60 & age < 65 ~ "60-64",
    age >= 65 ~ "65+",
    TRUE ~ NA_character_))


df_mega <- df_mega %>% 
  mutate(age_group_10=case_when(
    age < 25 ~ "<25",
    age >= 25 & age < 35 ~ "25-34",
    age >= 35 & age < 45 ~ "35-44",
    age >= 45 & age < 55 ~ "45-55",
    age >= 55 & age < 65 ~ "55-64",
    age >= 65 ~ "65+",
    TRUE ~ NA_character_))


df_mega <- df_mega %>% 
  mutate(years_of_service_grouped=case_when(
    years_of_service==0 ~ "0",
    years_of_service>=1  & years_of_service<3 ~ "1-2",
    years_of_service>=3  & years_of_service<3 ~ "3-5",
    years_of_service>=6  & years_of_service<3 ~ "6-10",
    years_of_service>=11  & years_of_service<3 ~ "11-15",
    years_of_service>=16  & years_of_service<3 ~ "16-20",
    years_of_service>=21  & years_of_service<3 ~ "21-25",
    years_of_service>=25 ~ "25+",
    TRUE ~ NA_character_))


df_mega <- df_mega %>%
  mutate(dept=case_when(
    department == 'News' ~ 'News', 
    department == 'Editorial' ~ 'News',
    department == 'Client Solutions' ~ 'Commercial',
    department == 'Circulation' ~ 'Commercial',
    department == 'Finance' ~ 'Commercial',
    department == 'Marketing' ~ 'Commercial',
    department == 'WP News Media Services' ~ 'Commercial',
    department == 'Production' ~ 'Commercial',
    department == 'Public Relations' ~ 'Commercial',
    department == 'Administration' ~ 'Commercial',
    department == 'Product'~ 'Commercial',
    TRUE ~ 'Other'))

df_mega <- df_mega %>%
  mutate(desk=case_when(
    cost_center_current == '110000 News Operations' ~ 'Operations',
    cost_center_current == '110001 News Digital Operations' ~ 'Operations',
    cost_center_current == '110610 Audience Development and Engagement' ~ 'Audience Development and Engagement',
    cost_center_current == '110620 News Audio' ~ 'Audio',
    cost_center_current == '110604 Presentation Design' ~ 'Design',
    cost_center_current == '110605 Presentation' ~ 'Photography',
    cost_center_current == '110664 News National Apps' ~ 'Emerging News Products',
    cost_center_current == '110665 News The Lily' ~ 'Emerging News Products',
    cost_center_current == '110666 News Snapchat' ~ 'Emerging News Products',
    cost_center_current == '110667 News By The Way' ~ 'Emerging News Products',
    cost_center_current == '113210 Economy and Business' ~ 'Financial',
    cost_center_current == '114000 Foreign Administration' ~ 'Foreign',
    cost_center_current == '114095 News Foreign Brazil' ~ 'Foreign',
    cost_center_current == '114100 Foreign Latam' ~ 'Foreign',
    cost_center_current == '114220 News Foreign Istanbul' ~ 'Foreign',
    cost_center_current == '114235 Foreign Western Europe' ~ 'Foreign',
    cost_center_current == '114300 News Foreign West Africa' ~ 'Foreign',
    cost_center_current == '114415 Foreign Hong Kong' ~ 'Foreign',
    cost_center_current == '114405 Foreign Beijing Bureau' ~ 'Foreign',
    cost_center_current == '114105 Foreign Mexico Bureau' ~ 'Foreign',
    cost_center_current == '114005 Foreign Beirut Bureau' ~ 'Foreign',
    cost_center_current == '114400 Foreign India Bureau' ~ 'Foreign',
    cost_center_current == '114410 Foreign Tokyo Bureau' ~ 'Foreign',
    cost_center_current == '114205 Foreign Islamabad Bureau' ~ 'Foreign',
    cost_center_current == '114305 Foreign Nairobi Bureau' ~ 'Foreign',
    cost_center_current == '114240 Foreign Rome Bureau' ~ 'Foreign',
    cost_center_current == '114200 Foreign London Bureau' ~ 'Foreign',
    cost_center_current == '114230 Foreign Moscow Bureau' ~ 'Foreign',
    cost_center_current == '114225 Foreign Cairo Bureau' ~ 'Foreign',
    cost_center_current == '114215 Foreign Berlin Bureau' ~ 'Foreign',
    cost_center_current == "114310 Foreign Baghdad Bureau" ~ "Foreign",
    cost_center_current == "114315 Foreign Jerusalem Bureau" ~ "Foreign",
    cost_center_current == '110603 Presentation Graphics' ~ 'Graphics',
    cost_center_current == '110450 Investigative' ~ 'Investigative',
    cost_center_current == '112300 Local Politics and Government' ~ 'Local',
    cost_center_current == '110601 Multiplatform Desk' ~ 'Multiplatform',
    cost_center_current == '110500 Magazine' ~ 'National',
    cost_center_current == '113200 National Politics and Government' ~ 'National',
    cost_center_current == '113205 National Security' ~ 'National',
    cost_center_current == '113215 News National Health & Science' ~ 'National',
    cost_center_current == '113220 National Enterprise' ~ 'National',
    cost_center_current == '113235 National America' ~ 'National',
    cost_center_current == '113240 News National Environment' ~ 'National',
    cost_center_current == '110006 News Content & Research' ~ 'News Content and Research',
    cost_center_current == '110455 News Logistics' ~ 'News Logistics',
    cost_center_current == '110410 Book World' ~ 'Outlook',
    cost_center_current == '110460 Outlook' ~ 'Outlook',
    cost_center_current == '110475 Polling' ~ 'Polling',
    cost_center_current == '110015 Sports Main' ~ 'Sports',
    cost_center_current == '110300 Style' ~ 'Style',
    cost_center_current == '110435 Food' ~ 'Style',
    cost_center_current == '110485 Travel' ~ 'Style',
    cost_center_current == '110495 Local Living' ~ 'Style',
    cost_center_current == '110505 Weekend' ~ 'Style',
    cost_center_current == '110600 Universal Desk' ~ 'Universal Desk',
    cost_center_current == '110652 News Video - General' ~ 'Video',
    cost_center_current == '110663 Wake Up Report' ~ 'Other',
    cost_center_current == '115000 Editorial Administration' ~ 'Editorial',
    TRUE ~ 'non-newsroom'))


df_mega <- df_mega %>%
  mutate(tier=case_when(
    desk == 'National' ~ 'Tier 1',
    desk == 'Foreign' ~ 'Tier 1',
    desk == 'Financial' ~ 'Tier 1',
    desk == 'Investigative' ~ 'Tier 1',
    desk == 'Style' ~ 'Tier 2',
    desk == 'Local' ~ 'Tier 2',
    desk == 'Graphics' ~ 'Tier 2',
    desk == 'Universal Desk' ~ 'Tier 2',
    desk == 'Sports' ~ 'Tier 2',
    desk == 'Outlook' ~ 'Tier 2',
    desk == 'Editorial' ~ 'Tier 2',
    desk == 'Audio' ~ 'Tier 3',
    desk == 'Polling' ~ 'Tier 3',
    desk == 'Design' ~ 'Tier 3',
    desk == 'Operations' ~ 'Tier 3',
    desk == 'Multiplatform' ~ 'Tier 3',
    desk == 'Video' ~ 'Tier 3',
    desk == 'Audience Development and Engagement' ~ 'Tier 3',
    desk == 'News Logistics' ~ 'Tier 4',
    desk == 'News Content and Research' ~ 'Tier 4',
    desk == 'Emerging News Products' ~ 'Tier 4',
    desk == 'Other' ~ 'Tier 4',
    TRUE ~ 'Other'))


df_mega <- df_mega %>%
  mutate(race_grouping=case_when(
    race_ethnicity == 'White (United States of America)' ~ 'white',
    race_ethnicity == 'Black or African American (United States of America)' ~ 'person of color',
    race_ethnicity == 'Asian (United States of America)' ~ 'person of color',
    race_ethnicity == 'Hispanic or Latino (United States of America)' ~ 'person of color',
    race_ethnicity == 'Two or More Races (United States of America)' ~ 'person of color',
    race_ethnicity == 'American Indian or Alaska Native (United States of America)' ~ 'person of color',
    race_ethnicity == 'Native Hawaiian or Other Pacific Islander (United States of America)' ~ 'person of color',
    TRUE ~ 'unknown'))


mega_df_columns <- data.frame(colnames(df_mega))

df_columns <- data.frame(colnames(df))

df2 <- read_csv("raw_data/terminated_wd_2021.csv")

colnames(df2) <- c("department","employee_id","gender",   
                   "race_ethnicity","education", "military_status",
                   "date_of_birth", "original_hire_date","hire_date", "termination_date",
                   "pay_rate_type", "current_base_pay", 
                   "job_profile_current","time_type_current", "cost_center_current",  
                   "effective_date1", "business_process_type1", "business_process_reason1", "pay_rate_type1", "base_pay_change1", "job_profile1", "time_type1", "cost_center1", 
                   "effective_date2", "business_process_type2", "business_process_reason2", "pay_rate_type2", "base_pay_change2", "job_profile2", "time_type2", "cost_center2", 
                   "effective_date3", "business_process_type3", "business_process_reason3", "pay_rate_type3", "base_pay_change3", "job_profile3", "time_type3", "cost_center3",
                   "effective_date4", "business_process_type4", "business_process_reason4", "pay_rate_type_4", "base_pay_change4", "job_profile4", "time_type4", "cost_center4", 
                   "effective_date5", "business_process_type5", "business_process_reason5", "pay_rate_type_5", "base_pay_change5", "job_profile5", "time_type5", "cost_center5", 
                   "effective_date6", "business_process_type6", "business_process_reason6", "pay_rate_type_6", "base_pay_change6", "job_profile6", "time_type6", "cost_center6",
                   "effective_date7", "business_process_type7", "business_process_reason7", "pay_rate_type_7", "base_pay_change7", "job_profile7", "time_type7", "cost_center7", 
                   "effective_date8", "business_process_type8", "business_process_reason8", "pay_rate_type_8", "base_pay_change8", "job_profile8", "time_type8", "cost_center8", 
                   "effective_date9", "business_process_type9", "business_process_reason9", "pay_rate_type_9", "base_pay_change9", "job_profile9", "time_type9", "cost_center_9",
                   "effective_date10", "business_process_type10", "business_process_reason10", "pay_rate_type10", "base_pay_change10", "job_profile10", "time_type10", "cost_center10",
                   "effective_date11", "business_process_type11", "business_process_reason11", "pay_rate_type11", "base_pay_change11", "job_profile11", "time_type11", "cost_center11",
                   "effective_date12", "business_process_type12", "business_process_reason12", "pay_rate_type12", "base_pay_change12", "job_profile12", "time_type12", "cost_center12", 
                   "effective_date13", "business_process_type13", "business_process_reason13", "pay_rate_type13", "base_pay_change13", "job_profile13", "time_type13", "cost_center13", 
                   "effective_date14", "business_process_type14", "business_process_reason14", "pay_rate_type14", "base_pay_change14", "job_profile14", "time_type14", "cost_center14",
                   "effective_date15", "business_process_type15", "business_process_reason15", "pay_rate_type15", "base_pay_change15", "job_profile15", "time_type15", "cost_center15", 
                   "2008_annual_performance_rating", "2009_annual_performance_rating", "2010_annual_performance_rating",
                   "2011_annual_performance_rating", "2012_annual_performance_rating", "2013_annual_performance_rating",
                   "2014_annual_performance_rating", "2015_annual_performance_rating", "2016_annual_performance_rating",
                   "2017_annual_performance_rating", "2018_annual_performance_rating", "2019_annual_performance_rating",
                   "2020_annual_performance_rating")



for (i in 1:15) {
  
  df2_year <- df2 %>% 
    select(department, employee_id, gender, race_ethnicity,
           military_status, date_of_birth,
           original_hire_date, hire_date, termination_date, 
           pay_rate_type,
           current_base_pay,
           job_profile_current, time_type_current, cost_center_current,
           ends_with(paste0("e", as.character(i))),
           ends_with(paste0("n", as.character(i))),
           ends_with(paste0("r", as.character(i)))
    ) %>% 
    rename(pay_rate_type_current=pay_rate_type) 
  
  colnames(df2_year) <- gsub(paste0("n", as.character(i)), "n", colnames(df2_year))
  colnames(df2_year) <- gsub(paste0("e", as.character(i)), "e", colnames(df2_year))
  colnames(df2_year) <- gsub(paste0("r", as.character(i)), "r", colnames(df2_year))
  
  if(!"effective_date" %in% colnames(df2_year)) {
    df2_year$effective_date <- NA
  }
  
  
  if(!"business_process_type" %in% colnames(df2_year)) {
    df2_year$business_process_type <- NA
  }
  
  
  if(!"business_process_reason" %in% colnames(df2_year)) {
    df2_year$business_process_reason <- NA
  }
  
  
  if(!"pay_rate_type" %in% colnames(df2_year)) {
    df2_year$pay_rate_type <- NA
  }
  
  
  if(!"base_pay_change" %in% colnames(df2_year)) {
    df2_year$base_pay_change <- NA
  }
  
  
  if(!"job_profile" %in% colnames(df2_year)) {
    df2_year$job_profile <- NA
  }
  
  if(!"time_type" %in% colnames(df2_year)) {
    df2_year$time_type<- NA
  }
  
  
  if(!"cost_center" %in% colnames(df2_year)) {
    df2_year$cost_center<- NA
  }
  
  df2_year <- df2_year %>% 
    select(department, employee_id, gender, military_status,
           race_ethnicity,
           date_of_birth,
           original_hire_date, hire_date, termination_date,
           pay_rate_type_current,
           current_base_pay,
           job_profile_current, time_type_current, cost_center_current,
           effective_date,
           business_process_type, business_process_reason, pay_rate_type,
           base_pay_change, job_profile, time_type, cost_center)
  df2_year$workday <- i
  
  if (i==1) {
    df2_mega <- df2_year
  } else {
    df2_mega <- rbind(df2_mega, df2_year)
  }
  
}


data_date <- ymd("2021-05-28")

df2_mega <- df2_mega %>% mutate(date_of_birth = mdy(date_of_birth),
                              age = floor(decimal_date(data_date) - decimal_date(date_of_birth)))


df2_mega <- df2_mega %>% 
  mutate(date_of_birth = case_when(
    age < 0 ~ date_of_birth-years(100),
    TRUE ~ date_of_birth),
    age = floor(decimal_date(data_date) - decimal_date(date_of_birth)))



df2_mega <- df2_mega %>% mutate(hire_date = mdy(hire_date),
                              years_of_service = floor(decimal_date(data_date) - decimal_date(hire_date)))

df2_mega <- df2_mega %>% 
  mutate(hire_date = case_when(
    years_of_service < 0 ~ hire_date-years(100),
    TRUE ~ hire_date),
    years_of_service = case_when(
      years_of_service < 0 ~ floor(decimal_date(data_date) - decimal_date(hire_date)),
      TRUE ~ years_of_service
    ))

df2_mega <- df2_mega %>% 
  mutate(original_hire_date=mdy(original_hire_date),
         termination_date=mdy(termination_date),
         effective_date=mdy(effective_date))

df2_mega <- df2_mega %>% 
  mutate(age_group_5=case_when(
    age < 25 ~ "<25",
    age >= 25 & age < 30 ~ "25-29",
    age >= 30 & age < 35 ~ "30-34",
    age >= 35 & age < 40 ~ "35-39",
    age >= 40 & age < 45 ~ "40-44",
    age >= 45 & age < 50 ~ "45-49",
    age >= 50 & age < 55 ~ "50-54",
    age >= 55 & age < 60 ~ "55-59",
    age >= 60 & age < 65 ~ "60-64",
    age >= 65 ~ "65+",
    TRUE ~ NA_character_))

df2_mega <- df2_mega %>% 
  mutate(age_group_10=case_when(
    age < 25 ~ "<25",
    age >= 25 & age < 35 ~ "25-34",
    age >= 35 & age < 45 ~ "35-44",
    age >= 45 & age < 55 ~ "45-55",
    age >= 55 & age < 65 ~ "55-64",
    age >= 65 ~ "65+",
    TRUE ~ NA_character_))

df2_mega <- df2_mega %>% 
  mutate(years_of_service_grouped=case_when(
    years_of_service==0 ~ "0",
    years_of_service>=1  & years_of_service<3 ~ "1-2",
    years_of_service>=3  & years_of_service<3 ~ "3-5",
    years_of_service>=6  & years_of_service<3 ~ "6-10",
    years_of_service>=11  & years_of_service<3 ~ "11-15",
    years_of_service>=16  & years_of_service<3 ~ "16-20",
    years_of_service>=21  & years_of_service<3 ~ "21-25",
    years_of_service>=25 ~ "25+",
    TRUE ~ NA_character_))


df2_mega <- df2_mega %>%
  mutate(dept=case_when(
    department == 'News' ~ 'News', 
    department == 'Editorial' ~ 'News',
    department == 'Client Solutions' ~ 'Commercial',
    department == 'Circulation' ~ 'Commercial',
    department == 'Finance' ~ 'Commercial',
    department == 'Marketing' ~ 'Commercial',
    department == 'WP News Media Services' ~ 'Commercial',
    department == 'Production' ~ 'Commercial',
    department == 'Public Relations' ~ 'Commercial',
    department == 'Administration' ~ 'Commercial',
    department == 'Product'~ 'Commercial',
    TRUE ~ 'Other'))


df2_mega <- df2_mega %>%
  mutate(desk=case_when(
    cost_center_current == '110000 News Operations' ~ 'Operations',
    cost_center_current == '110001 News Digital Operations' ~ 'Operations',
    cost_center_current == '110610 Audience Development and Engagement' ~ 'Audience Development and Engagement',
    cost_center_current == '110620 News Audio' ~ 'Audio',
    cost_center_current == '110604 Presentation Design' ~ 'Design',
    cost_center_current == '110605 Presentation' ~ 'Photography',
    cost_center_current == '110664 News National Apps' ~ 'Emerging News Products',
    cost_center_current == '110665 News The Lily' ~ 'Emerging News Products',
    cost_center_current == '110666 News Snapchat' ~ 'Emerging News Products',
    cost_center_current == '110667 News By The Way' ~ 'Emerging News Products',
    cost_center_current == '113210 Economy and Business' ~ 'Financial',
    cost_center_current == '114000 Foreign Administration' ~ 'Foreign',
    cost_center_current == '114095 News Foreign Brazil' ~ 'Foreign',
    cost_center_current == '114100 Foreign Latam' ~ 'Foreign',
    cost_center_current == '114220 News Foreign Istanbul' ~ 'Foreign',
    cost_center_current == '114235 Foreign Western Europe' ~ 'Foreign',
    cost_center_current == '114300 News Foreign West Africa' ~ 'Foreign',
    cost_center_current == '114415 Foreign Hong Kong' ~ 'Foreign',
    cost_center_current == '114405 Foreign Beijing Bureau' ~ 'Foreign',
    cost_center_current == '114105 Foreign Mexico Bureau' ~ 'Foreign',
    cost_center_current == '114005 Foreign Beirut Bureau' ~ 'Foreign',
    cost_center_current == '114400 Foreign India Bureau' ~ 'Foreign',
    cost_center_current == '114410 Foreign Tokyo Bureau' ~ 'Foreign',
    cost_center_current == '114205 Foreign Islamabad Bureau' ~ 'Foreign',
    cost_center_current == '114305 Foreign Nairobi Bureau' ~ 'Foreign',
    cost_center_current == '114240 Foreign Rome Bureau' ~ 'Foreign',
    cost_center_current == '114200 Foreign London Bureau' ~ 'Foreign',
    cost_center_current == '114230 Foreign Moscow Bureau' ~ 'Foreign',
    cost_center_current == '114225 Foreign Cairo Bureau' ~ 'Foreign',
    cost_center_current == '114215 Foreign Berlin Bureau' ~ 'Foreign',
    cost_center_current == "114310 Foreign Baghdad Bureau" ~ "Foreign",
    cost_center_current == "114315 Foreign Jerusalem Bureau" ~ "Foreign",
    cost_center_current == '110603 Presentation Graphics' ~ 'Graphics',
    cost_center_current == '110450 Investigative' ~ 'Investigative',
    cost_center_current == '112300 Local Politics and Government' ~ 'Local',
    cost_center_current == '110601 Multiplatform Desk' ~ 'Multiplatform',
    cost_center_current == '110500 Magazine' ~ 'National',
    cost_center_current == '113200 National Politics and Government' ~ 'National',
    cost_center_current == '113205 National Security' ~ 'National',
    cost_center_current == '113215 News National Health & Science' ~ 'National',
    cost_center_current == '113220 National Enterprise' ~ 'National',
    cost_center_current == '113235 National America' ~ 'National',
    cost_center_current == '113240 News National Environment' ~ 'National',
    cost_center_current == '110006 News Content & Research' ~ 'News Content and Research',
    cost_center_current == '110455 News Logistics' ~ 'News Logistics',
    cost_center_current == '110410 Book World' ~ 'Outlook',
    cost_center_current == '110460 Outlook' ~ 'Outlook',
    cost_center_current == '110475 Polling' ~ 'Polling',
    cost_center_current == '110015 Sports Main' ~ 'Sports',
    cost_center_current == '110300 Style' ~ 'Style',
    cost_center_current == '110435 Food' ~ 'Style',
    cost_center_current == '110485 Travel' ~ 'Style',
    cost_center_current == '110495 Local Living' ~ 'Style',
    cost_center_current == '110505 Weekend' ~ 'Style',
    cost_center_current == '110600 Universal Desk' ~ 'Universal Desk',
    cost_center_current == '110652 News Video - General' ~ 'Video',
    cost_center_current == '110663 Wake Up Report' ~ 'Other',
    cost_center_current == '115000 Editorial Administration' ~ 'Editorial',
    TRUE ~ 'non-newsroom'))


df2_mega <- df2_mega %>%
  mutate(tier=case_when(
    desk == 'National' ~ 'Tier 1',
    desk == 'Foreign' ~ 'Tier 1',
    desk == 'Financial' ~ 'Tier 1',
    desk == 'Investigative' ~ 'Tier 1',
    desk == 'Style' ~ 'Tier 2',
    desk == 'Local' ~ 'Tier 2',
    desk == 'Graphics' ~ 'Tier 2',
    desk == 'Universal Desk' ~ 'Tier 2',
    desk == 'Sports' ~ 'Tier 2',
    desk == 'Outlook' ~ 'Tier 2',
    desk == 'Editorial' ~ 'Tier 2',
    desk == 'Audio' ~ 'Tier 3',
    desk == 'Polling' ~ 'Tier 3',
    desk == 'Design' ~ 'Tier 3',
    desk == 'Operations' ~ 'Tier 3',
    desk == 'Multiplatform' ~ 'Tier 3',
    desk == 'Video' ~ 'Tier 3',
    desk == 'Audience Development and Engagement' ~ 'Tier 3',
    desk == 'News Logistics' ~ 'Tier 4',
    desk == 'News Content and Research' ~ 'Tier 4',
    desk == 'Emerging News Products' ~ 'Tier 4',
    desk == 'Other' ~ 'Tier 4',
    TRUE ~ 'Other'))


df2_mega <- df2_mega %>%
  mutate(race_grouping=case_when(
    race_ethnicity == 'White (United States of America)' ~ 'white',
    race_ethnicity == 'Black or African American (United States of America)' ~ 'person of color',
    race_ethnicity == 'Asian (United States of America)' ~ 'person of color',
    race_ethnicity == 'Hispanic or Latino (United States of America)' ~ 'person of color',
    race_ethnicity == 'Two or More Races (United States of America)' ~ 'person of color',
    race_ethnicity == 'American Indian or Alaska Native (United States of America)' ~ 'person of color',
    race_ethnicity == 'Native Hawaiian or Other Pacific Islander (United States of America)' ~ 'person of color',
    TRUE ~ 'unknown'))

df2_mega %>% count(year(termination_date))


df_mega$data <- "active"
df2_mega$data <- "terminated"
df3_mega <- rbind(df_mega, df2_mega)
df <- df3_mega

comes_goes <- df %>% 
  rename(data_type=data) %>% 
  mutate(termination_year=year(termination_date))

comes_goes$termination_year <- ifelse(comes_goes$termination_year==1900, NA, comes_goes$termination_year)

comes_goes$eff_year = ifelse(comes_goes$data_type=="active", 2022-comes_goes$years_of_service, comes_goes$termination_year)

comes_goes$data_year= year(comes_goes$effective_date)
comes_goes_active <- comes_goes %>% filter(data_type=="active")
comes_goes_active$year_type <- ifelse(year(comes_goes_active$hire_date)==year(comes_goes_active$effective_date), "hired", "active")
comes_goes_term <- comes_goes %>% filter(data_type=="terminated")

comes_goes_term <- comes_goes_term %>% 
  group_by(employee_id) %>% 
  arrange(employee_id, effective_date) %>% 
  mutate(row=row_number()) 

comes_goes_term_solo <- comes_goes_term %>% 
  filter(row_number()==n()) %>% 
  mutate(year_type="terminated")

comes_goes_term <- left_join(comes_goes_term, comes_goes_term_solo)
comes_goes_term$year_type <- ifelse(is.na(comes_goes_term$year_type), "active", comes_goes_term$year_type)
comes_goes_term$row <- NULL

comes_goes <- rbind(comes_goes_active, comes_goes_term)

comes_goes2 <- comes_goes %>% 
  group_by(employee_id, data_year) %>%
  arrange(employee_id, data_year, desc(year_type)) %>% 
  slice(1)

uniques <- comes_goes2 %>% 
  ungroup() %>% 
  select(employee_id, hire_date, termination_date, data_type, termination_year, data_year, year_type) %>% 
  unique() %>% 
  filter(year_type=="hired")

ids <- comes_goes %>% 
  ungroup() %>% 
  select(employee_id, hire_date, termination_date) %>% 
  unique() %>% 
  mutate(start_year=year(hire_date), end_year=year(termination_date))

ids$end_year <- ifelse(ids$end_year==1900, 2021, ids$end_year)


for(i in 1:nrow(ids)) {
  
  data_year <- ids$start_year[i]:ids$end_year[i]
  df_solo <- data.frame(data_year)
  df_solo$employee_id <- ids$employee_id[i]
  

  df_solo <- left_join(df_solo, uniques) %>% 
    select(employee_id, data_year, year_type)
  
  if (i==1) {
    df <- df_solo
  } else {
    df <- rbind(df, df_solo)
  }
  print(i)
}


cg_narrow <- comes_goes %>% 
  group_by(employee_id) %>% 
  arrange(employee_id, effective_date) %>% 
  filter(row_number()==n()) %>% 
  select(department, employee_id, gender, race_ethnicity,
         date_of_birth, original_hire_date, hire_date,
         termination_date, pay_rate_type_current,
         current_base_pay, job_profile_current,
         time_type_current, cost_center_current,
         cost_center, age, years_of_service,
         age_group_5, age_group_10, years_of_service_grouped,
         dept, desk, tier, race_grouping) %>% 
  unique()

# 
df<- df %>% 
  left_join(cg_narrow) %>% 
  mutate(year_type=case_when(
    year(termination_date)==data_year~ "terminated",
    TRUE ~ year_type
  ))

df$year_type <- ifelse(is.na(df$year_type), "active", df$year_type)
#### hires versus terminations by year (total company)

#From 2016 to 2021, The Post hired more women than men across the company. Much of that growth happened in the Newsroom.
df %>%   
  filter(data_year >=2016) %>% 
  count(gender, year_type)  %>%
  filter(year_type!="active") %>% 
  pivot_wider(names_from="year_type", values_from="n") %>% 
  kable()
gender hired terminated
Female 312 261
Male 203 201
Prefer not to disclose 1 1
#The number of women in Commercial, on the other hand, dramatically shrank.
#More than twice as many women as men in Commercial have departed. Our analysis showed that 2019 was a standout year for turnover among female staffers in Commercial, with nearly 40 women leaving The Post.
df %>% 
  filter(data_year >=2016) %>% 
  count(dept, gender, year_type) %>% 
  filter(year_type!="active") %>% 
  pivot_wider(names_from="year_type", values_from="n") %>% 
  mutate(category=case_when(
    terminated==hired ~ "Same",
    hired > terminated ~ "More hired than left",
    terminated > hired ~ "More left than hired",
    TRUE ~ ""
  )) %>% 
  kable()
dept gender hired terminated category
Commercial Female 81 127 More left than hired
Commercial Male 35 64 More left than hired
News Female 215 115 More hired than left
News Male 154 89 More hired than left
News Prefer not to disclose 1 1 Same
Other Female 16 19 More left than hired
Other Male 14 48 More left than hired
#From 2016 to 2021, more workers of color left in Commercial than were hired, with the exception of Hispanic or Latino workers, whose numbers remained consistent. Demographic groups of fewer than five were excluded to protect individuals’ privacy.
df %>% 
  filter(data_year >=2016) %>% 
  count(dept, race_ethnicity, year_type) %>% 
  filter(year_type!="active") %>% 
  pivot_wider(names_from="year_type", values_from="n") %>% 
  filter(hired>=5 & terminated>=5) %>% 
  mutate(category=case_when(
    hired>terminated ~ "More hired than left",
    terminated>hired ~ "More left than hired",
    terminated==hired ~ "Same",
    TRUE ~ ""
  )) %>% 
  kable()
dept race_ethnicity hired terminated category
Commercial Asian (United States of America) 14 18 More left than hired
Commercial Black or African American (United States of America) 30 51 More left than hired
Commercial Hispanic or Latino (United States of America) 6 7 More left than hired
Commercial Two or More Races (United States of America) 5 8 More left than hired
Commercial White (United States of America) 59 102 More left than hired
News Asian (United States of America) 42 24 More hired than left
News Black or African American (United States of America) 37 31 More hired than left
News Hispanic or Latino (United States of America) 28 12 More hired than left
News Two or More Races (United States of America) 17 5 More hired than left
News White (United States of America) 221 127 More hired than left
Other Black or African American (United States of America) 6 37 More left than hired
Other White (United States of America) 19 23 More left than hired
#In the Newsroom, on the other hand, more workers were hired than left. For every employee that left, 1.8 employees were hired. 
df %>% 
  filter(data_year >=2016) %>% 
  count(dept,year_type) %>% 
  filter(year_type!="active") %>% 
  pivot_wider(names_from="year_type", values_from="n") %>% 
  mutate(category=case_when(
    hired>terminated ~ "More hired than left",
    terminated>hired ~ "More left than hired",
    terminated==hired ~ "Same",
    TRUE ~ ""
  )) %>% 
  kable()
dept hired terminated category
Commercial 116 191 More left than hired
News 370 205 More hired than left
Other 30 67 More left than hired
#370/205 = 1.8
#However, that was not the case for Black employees. For every Black employee who left, only 1.2 Black employees were hired to replace them. 

df %>% 
  filter(data_year >=2016) %>% 
  count(dept, race_ethnicity, year_type) %>% 
  filter(year_type!="active") %>% 
  pivot_wider(names_from="year_type", values_from="n") %>% 
  filter(hired>=5 & terminated>=5) %>% 
  mutate(category=case_when(
    hired>terminated ~ "More hired than left",
    terminated>hired ~ "More left than hired",
    terminated==hired ~ "Same",
    TRUE ~ ""
  )) %>% 
  kable()
dept race_ethnicity hired terminated category
Commercial Asian (United States of America) 14 18 More left than hired
Commercial Black or African American (United States of America) 30 51 More left than hired
Commercial Hispanic or Latino (United States of America) 6 7 More left than hired
Commercial Two or More Races (United States of America) 5 8 More left than hired
Commercial White (United States of America) 59 102 More left than hired
News Asian (United States of America) 42 24 More hired than left
News Black or African American (United States of America) 37 31 More hired than left
News Hispanic or Latino (United States of America) 28 12 More hired than left
News Two or More Races (United States of America) 17 5 More hired than left
News White (United States of America) 221 127 More hired than left
Other Black or African American (United States of America) 6 37 More left than hired
Other White (United States of America) 19 23 More left than hired
#37/31 = 1.2

#A deeper analysis of the data shows that in 2020 alone more than 1 out of 3 workers who left the Newsroom were Black. Fewer than 1 in 5 of those hired that year were Black.
df %>% 
  filter(data_year ==2020) %>% 
  filter(dept=="News") %>% 
  count(dept, race_ethnicity, data_year, year_type) %>% 
  filter(year_type!="active") %>% 
  pivot_wider(names_from="year_type", values_from="n") %>% 
  #filter(hired>=5 & terminated>=5) %>% 
  mutate(category=case_when(
    hired>terminated ~ "More hired than left",
    terminated>hired ~ "More left than hired",
    terminated==hired ~ "Same",
    TRUE ~ ""
  )) %>% 
  kable()
dept race_ethnicity data_year hired terminated category
News Asian (United States of America) 2020 9 1 More hired than left
News Black or African American (United States of America) 2020 9 10 More left than hired
News Hispanic or Latino (United States of America) 2020 5 2 More hired than left
News Prefer Not to Disclose (United States of America) 2020 1 1 Same
News Two or More Races (United States of America) 2020 3 NA
News White (United States of America) 2020 21 14 More hired than left
News NA 2020 3 1 More hired than left
#10/29 = .34
#9/51 =.176 or 1/5=.2
#In 2017, only 17 percent of Newsroom hires were people of color. 
#In 2020 and in the first half of 2021, however, they made up 55 percent of new hires.  
view_this <-df %>% 
  filter(dept=="News") %>% 
  count(dept, race_grouping, data_year, year_type) %>% 
  filter(year_type!="active") %>% 
  pivot_wider(names_from="year_type", values_from="n") %>% 
  #filter(hired>=5 & terminated>=5) %>% 
   mutate(category=case_when(
    hired>terminated ~ "More hired than left",
    terminated>hired ~ "More left than hired",
    terminated==hired ~ "Same",
    TRUE ~ ""
  )) %>% 
  kable()

#12/(12+5+54)=.17


#Of the hires in 2020 and so far in 2021, 55 percent have been people of color. 

df %>% 
  filter(dept=="News") %>% 
  filter(data_year==2020 | data_year==2021) %>% 
  count(dept, race_grouping, year_type) %>% 
  filter(year_type!="active") %>% 
  pivot_wider(names_from="year_type", values_from="n") %>% 
  mutate(category=case_when(
    hired>terminated ~ "More hired than left",
    terminated>hired ~ "More left than hired",
    terminated==hired ~ "Same",
    TRUE ~ ""
  )) %>% 
  kable()
dept race_grouping hired terminated category
News person of color 43 18 More hired than left
News unknown 7 2 More hired than left
News white 32 20 More hired than left
#43/(43+7+32) = .52
#18/(18+2+20) = .45

Newsroom

#In the Newsroom, journalists of color were, on average, hired at a younger age than White workers. 
df %>% 
  filter(year_type=="hired") %>% 
  filter(data_year>=2016) %>% 
  mutate(hire_age=data_year-year(date_of_birth)) %>% 
  filter(dept=="News") %>% 
  group_by(race_ethnicity) %>% 
  summarize(count=n(),
            `average age`=round(mean(hire_age)),
            `median age`=round(median(hire_age))
  ) %>% 
  filter(count>=5) %>% 
  mutate(race_ethnicity=gsub(" \\(United States of America\\)", "", race_ethnicity)) %>% 
  filter(!is.na(race_ethnicity))%>% 
  kable()
race_ethnicity count average age median age
Asian 42 29 27
Black or African American 37 31 30
Hispanic or Latino 28 30 29
Prefer Not to Disclose 11 31 31
Two or More Races 17 28 27
White 221 33 31
#On the Commercial side, Black employees tend to be hired at an older age than employees of other races and ethnicities, with average and median hiring ages of 39 and 42, respectively. Asian employees, meanwhile, have the youngest average and median hiring ages at 27 and 26, respectively.
df %>% 
  filter(year_type=="hired") %>% 
  filter(data_year>=2016) %>% 
  mutate(hire_age=data_year-year(date_of_birth)) %>% 
  filter(dept=="Commercial") %>% 
  group_by(race_ethnicity) %>% 
  summarize(count=n(),
            `average age`=round(mean(hire_age)),
            `median age`=round(median(hire_age))
  ) %>% 
  filter(count>=5) %>% 
  mutate(race_ethnicity=gsub(" \\(United States of America\\)", "", race_ethnicity)) %>% 
  filter(!is.na(race_ethnicity))%>% 
  kable()
race_ethnicity count average age median age
Asian 14 27 26
Black or African American 30 39 42
Hispanic or Latino 6 34 28
Two or More Races 5 33 31
White 59 32 28
#Turnover by desk chart
turnoverbydesk <- df %>% 
  filter(dept=="News") %>% 
  filter(pay_rate_type_current=="Salaried") %>% 
  filter(data_year>=2016) %>% 
  count(dept, desk, year_type) %>% 
  filter(year_type!="active") %>% 
  pivot_wider(names_from="year_type", values_from="n") %>% 
  filter(hired>=5 & terminated>=5) %>% 
  mutate(category=case_when(
    hired>terminated ~ "More hired than left",
    terminated>hired ~ "More left than hired",
    terminated==hired ~ "Same",
    TRUE ~ ""
  )) 

kable(turnoverbydesk)
dept desk hired terminated category
News Audience Development and Engagement 24 8 More hired than left
News Emerging News Products 22 10 More hired than left
News Financial 28 19 More hired than left
News Graphics 16 6 More hired than left
News Local 21 10 More hired than left
News Multiplatform 16 8 More hired than left
News National 44 15 More hired than left
News Sports 18 12 More hired than left
News Video 36 12 More hired than left
#Within the Newsroom, White employees were often hired at a much higher rate than employees of color. In some sections, fewer than five employees of color were hired and therefore couldn’t even be counted within the data in order to protect workers’ privacy. This was the case for Local, Financial and Multiplatform, where 14, 22 and 13 White employees, respectively, were hired from 2016 to 2021.
#and chart
newsroom_desks <- df %>% 
  filter(dept=="News") %>% 
  filter(pay_rate_type_current=="Salaried") %>% 
  filter(data_year>=2016) %>% 
  count(race_grouping, desk, year_type) %>% 
  filter(year_type!="active") %>% 
  pivot_wider(names_from="year_type", values_from="n") %>% 
  filter(hired>=5 & terminated>=5) %>% 
  mutate(category=case_when(
    hired>terminated ~ "More hired than left",
    terminated>hired ~ "More left than hired",
    terminated==hired ~ "Same",
    TRUE ~ ""
  ))
kable(newsroom_desks)
race_grouping desk hired terminated category
person of color Emerging News Products 6 6 Same
person of color National 13 5 More hired than left
person of color Sports 9 6 More hired than left
person of color Video 15 5 More hired than left
white Financial 22 11 More hired than left
white Local 14 6 More hired than left
white Multiplatform 13 6 More hired than left
white National 28 10 More hired than left
white Sports 9 6 More hired than left
white Video 20 7 More hired than left
library(vroom)
df <-vroom("raw_data/active_wd_2021.csv")

colnames(df) <- c("department","employee_id","gender",   
"race_ethnicity","education", "military_status",
"date_of_birth", "original_hire_date","hire_date","pay_rate_type", "current_base_pay", 
"job_profile_current","time_type_current", "cost_center_current",  
"effective_date1", "business_process_type1", "business_process_reason1", "pay_rate_type1", "base_pay_change1", "job_profile1", "time_type1", "cost_center1", 
"effective_date2", "business_process_type2", "business_process_reason2", "pay_rate_type2", "base_pay_change2", "job_profile2", "time_type2", "cost_center2", 
"effective_date3", "business_process_type3", "business_process_reason3", "pay_rate_type3", "base_pay_change3", "job_profile3", "time_type3", "cost_center3",
"effective_date4", "business_process_type4", "business_process_reason4", "pay_rate_type4", "base_pay_change4", "job_profile4", "time_type4", "cost_center4", 
"effective_date5", "business_process_type5", "business_process_reason5", "pay_rate_type5", "base_pay_change5", "job_profile5", "time_type5", "cost_center5", 
"effective_date6", "business_process_type6", "business_process_reason6", "pay_rate_type6", "base_pay_change6", "job_profile6", "time_type6", "cost_center6",
"effective_date7", "business_process_type7", "business_process_reason7", "pay_rate_type7", "base_pay_change7", "job_profile7", "time_type7", "cost_center7", 
"effective_date8", "business_process_type8", "business_process_reason8", "pay_rate_type8", "base_pay_change8", "job_profile8", "time_type8", "cost_center8", 
"effective_date9", "business_process_type9", "business_process_reason9", "pay_rate_type9", "base_pay_change9", "job_profile9", "time_type9", "cost_center9",
"effective_date10", "business_process_type10", "business_process_reason10", "pay_rate_type10", "base_pay_change10", "job_profile10", "time_type10", "cost_center10",
"effective_date11", "business_process_type11", "business_process_reason11", "pay_rate_type11", "base_pay_change11", "job_profile11", "time_type11", "cost_center11",
"effective_date12", "business_process_type12", "business_process_reason12", "pay_rate_type12", "base_pay_change12", "job_profile12", "time_type12", "cost_center12", 
"effective_date13", "business_process_type13", "business_process_reason13", "pay_rate_type13", "base_pay_change13", "job_profile13", "time_type13", "cost_center13", 
"effective_date14", "business_process_type14", "business_process_reason14", "pay_rate_type14", "base_pay_change14", "job_profile14", "time_type14", "cost_center14",
"effective_date15", "business_process_type15", "business_process_reason15", "pay_rate_type15", "base_pay_change15", "job_profile15", "time_type15", "cost_center15", 
"effective_date16", "business_process_type16", "business_process_reason16", "pay_rate_type16", "base_pay_change16", "job_profile16", "time_type16", "cost_center16", 
"effective_date17", "business_process_type17", "business_process_reason17", "pay_rate_type17", "base_pay_change17", "job_profile17", "time_type17", "cost_center17", 
"effective_date18", "business_process_type18", "business_process_reason18", "pay_rate_type18", "base_pay_change18", "job_profile18", "time_type18", "cost_center18", 
"effective_date19", "business_process_type19", "business_process_reason19", "pay_rate_type19", "base_pay_change19", "job_profile19", "time_type19", "cost_center19", 
"effective_date20", "business_process_type20", "business_process_reason20", "pay_rate_type20", "base_pay_change20", "job_profile20", "time_type20", "cost_center20", 
"effective_date21", "business_process_type21", "business_process_reason21", "pay_rate_type21", "base_pay_change21", "job_profile21", "time_type21", "cost_center21", 
"effective_date22", "business_process_type22", "business_process_reason22", "pay_rate_type22", "base_pay_change22", "job_profile22", "time_type22", "cost_center22", 
"effective_date23", "business_process_type23", "business_process_reason23", "pay_rate_type23", "base_pay_change23", "job_profile23", "time_type23", "cost_center23", 
"effective_date24", "business_process_type24", "business_process_reason24", "pay_rate_type24", "base_pay_change24", "job_profile24", "time_type24", "cost_center24", 
"effective_date25", "business_process_type25", "business_process_reason25", #"pay_rate_type_25", "base_pay_change25", 
"job_profile25", #"time_type25", 
"cost_center25", 
"effective_date_26", "business_process_type_26", "business_process_reason_26", #"pay_rate_type_26", "base_pay_change_26", 
"job_profile_26", #"time_type_26", 
"cost_center_26", "2008_annual_performance_rating", "2009_annual_performance_rating", "2010_annual_performance_rating",
"2011_annual_performance_rating", "2012_annual_performance_rating", "2013_annual_performance_rating",
"2014_annual_performance_rating", "2015_annual_performance_rating", "2016_annual_performance_rating",
"2017_annual_performance_rating", "2018_annual_performance_rating", "2019_annual_performance_rating",
"2020_annual_performance_rating")
  
df2 <- vroom("raw_data/terminated_wd_2021.csv")


colnames(df2) <- c("department","employee_id","gender",   
                  "race_ethnicity","education", "military_status",
                  "date_of_birth", "original_hire_date","hire_date", "termination_date",
                  "pay_rate_type", "current_base_pay", 
                  "job_profile_current","time_type_current", "cost_center_current",  
                  "effective_date1", "business_process_type1", "business_process_reason1", "pay_rate_type1", "base_pay_change1", "job_profile1", "time_type1", "cost_center1", 
                  "effective_date2", "business_process_type2", "business_process_reason2", "pay_rate_type2", "base_pay_change2", "job_profile2", "time_type2", "cost_center2", 
                  "effective_date3", "business_process_type3", "business_process_reason3", "pay_rate_type3", "base_pay_change3", "job_profile3", "time_type3", "cost_center3",
                  "effective_date4", "business_process_type4", "business_process_reason4", "pay_rate_type_4", "base_pay_change4", "job_profile4", "time_type4", "cost_center4", 
                  "effective_date5", "business_process_type5", "business_process_reason5", "pay_rate_type_5", "base_pay_change5", "job_profile5", "time_type5", "cost_center5", 
                  "effective_date6", "business_process_type6", "business_process_reason6", "pay_rate_type_6", "base_pay_change6", "job_profile6", "time_type6", "cost_center6",
                  "effective_date7", "business_process_type7", "business_process_reason7", "pay_rate_type_7", "base_pay_change7", "job_profile7", "time_type7", "cost_center7", 
                  "effective_date8", "business_process_type8", "business_process_reason8", "pay_rate_type_8", "base_pay_change8", "job_profile8", "time_type8", "cost_center8", 
                  "effective_date9", "business_process_type9", "business_process_reason9", "pay_rate_type_9", "base_pay_change9", "job_profile9", "time_type9", "cost_center_9",
                  "effective_date10", "business_process_type10", "business_process_reason10", "pay_rate_type10", "base_pay_change10", "job_profile10", "time_type10", "cost_center10",
                  "effective_date11", "business_process_type11", "business_process_reason11", "pay_rate_type11", "base_pay_change11", "job_profile11", "time_type11", "cost_center11",
                  "effective_date12", "business_process_type12", "business_process_reason12", "pay_rate_type12", "base_pay_change12", "job_profile12", "time_type12", "cost_center12", 
                  "effective_date13", "business_process_type13", "business_process_reason13", "pay_rate_type13", "base_pay_change13", "job_profile13", "time_type13", "cost_center13", 
                  "effective_date14", "business_process_type14", "business_process_reason14", "pay_rate_type14", "base_pay_change14", "job_profile14", "time_type14", "cost_center14",
                  "effective_date15", "business_process_type15", "business_process_reason15", "pay_rate_type15", "base_pay_change15", "job_profile15", "time_type15", "cost_center15", 
                   "2008_annual_performance_rating", "2009_annual_performance_rating", "2010_annual_performance_rating",
                  "2011_annual_performance_rating", "2012_annual_performance_rating", "2013_annual_performance_rating",
                  "2014_annual_performance_rating", "2015_annual_performance_rating", "2016_annual_performance_rating",
                  "2017_annual_performance_rating", "2018_annual_performance_rating", "2019_annual_performance_rating",
                  "2020_annual_performance_rating")

data_date <- ymd("2021-05-28")

df <- df %>% mutate(date_of_birth = mdy(date_of_birth),
                    age = floor(decimal_date(data_date) - decimal_date(date_of_birth)))

df <- df %>% 
  mutate(date_of_birth = case_when(
    age < 0 ~ date_of_birth-years(100),
    TRUE ~ date_of_birth),
    age = floor(decimal_date(data_date) - decimal_date(date_of_birth)))


df2 <- df2 %>% mutate(date_of_birth = mdy(date_of_birth),
                    age = floor(decimal_date(data_date) - decimal_date(date_of_birth)))

df2 <- df2 %>% 
  mutate(date_of_birth = case_when(
    age < 0 ~ date_of_birth-years(100),
    TRUE ~ date_of_birth),
    age = floor(decimal_date(data_date) - decimal_date(date_of_birth)))



df <- df %>% mutate(hire_date = mdy(hire_date),
                    years_of_service = floor(decimal_date(data_date) - decimal_date(hire_date)))

df <- df %>% 
  mutate(original_hire_date=mdy(original_hire_date))#,
         #termination_date=ymd(termination_date),
         #effective_date=mdy(effective_date))


df2 <- df2 %>% mutate(hire_date = mdy(hire_date),
                      termination_date=mdy(termination_date),
                    years_of_service = floor(decimal_date(data_date) - decimal_date(hire_date)))

df2 <- df2 %>% 
  mutate(original_hire_date=mdy(original_hire_date),
         termination_date=ymd(termination_date)#,
         #effective_date=mdy(effective_date)
         )


df <- df %>% 
  mutate(age_group_5=case_when(
    age < 25 ~ "<25",
    age >= 25 & age < 30 ~ "25-29",
    age >= 30 & age < 35 ~ "30-34",
    age >= 35 & age < 40 ~ "35-39",
    age >= 40 & age < 45 ~ "40-44",
    age >= 45 & age < 50 ~ "45-49",
    age >= 50 & age < 55 ~ "50-54",
    age >= 55 & age < 60 ~ "55-59",
    age >= 60 & age < 65 ~ "60-64",
    age >= 65 ~ "65+",
    TRUE ~ NA_character_))


df2 <- df2 %>% 
  mutate(age_group_5=case_when(
    age < 25 ~ "<25",
    age >= 25 & age < 30 ~ "25-29",
    age >= 30 & age < 35 ~ "30-34",
    age >= 35 & age < 40 ~ "35-39",
    age >= 40 & age < 45 ~ "34-44",
    age >= 45 & age < 50 ~ "45-49",
    age >= 50 & age < 55 ~ "50-54",
    age >= 55 & age < 60 ~ "55-59",
    age >= 60 & age < 65 ~ "60-64",
    age >= 65 ~ "65+",
    TRUE ~ NA_character_))


df <- df %>% 
  mutate(age_group_10=case_when(
    age < 25 ~ "<25",
    age >= 25 & age < 35 ~ "25-34",
    age >= 35 & age < 45 ~ "35-44",
    age >= 45 & age < 55 ~ "45-55",
    age >= 55 & age < 65 ~ "55-64",
    age >= 65 ~ "65+",
    TRUE ~ NA_character_))


df2 <- df2 %>% 
  mutate(age_group_10=case_when(
    age < 25 ~ "<25",
    age >= 25 & age < 35 ~ "25-34",
    age >= 35 & age < 45 ~ "35-44",
    age >= 45 & age < 55 ~ "45-55",
    age >= 55 & age < 65 ~ "55-64",
    age >= 65 ~ "65+",
    TRUE ~ NA_character_))

df <- df %>% 
  mutate(years_of_service_grouped=case_when(
    years_of_service==0 ~ "0",
    years_of_service>=1  & years_of_service<3 ~ "1-2",
    years_of_service>=3  & years_of_service<3 ~ "3-5",
    years_of_service>=6  & years_of_service<3 ~ "6-10",
    years_of_service>=11  & years_of_service<3 ~ "11-15",
    years_of_service>=16  & years_of_service<3 ~ "16-20",
    years_of_service>=21  & years_of_service<3 ~ "21-25",
    years_of_service>=25 ~ "25+",
    TRUE ~ NA_character_))


df2 <- df2 %>% 
  mutate(years_of_service_grouped=case_when(
    years_of_service==0 ~ "0",
    years_of_service>=1  & years_of_service<3 ~ "1-2",
    years_of_service>=3  & years_of_service<3 ~ "3-5",
    years_of_service>=6  & years_of_service<3 ~ "6-10",
    years_of_service>=11  & years_of_service<3 ~ "11-15",
    years_of_service>=16  & years_of_service<3 ~ "16-20",
    years_of_service>=21  & years_of_service<3 ~ "21-25",
    years_of_service>=25 ~ "25+",
    TRUE ~ NA_character_))

df <- df %>%
  mutate(dept=case_when(
    department == "News" ~ "News", 
    department == "Editorial" ~ "News",
    department == "Client Solutions" ~ "Commercial",
    department == "Circulation" ~ "Commercial",
    department == "Finance" ~ "Commercial",
    department == "Marketing" ~ "Commercial",
    department == "WP News Media Services" ~ "Commercial",
    department == "Production" ~ "Commercial",
    department == "Public Relations" ~ "Commercial",
    department == "Administration" ~ "Commercial",
    department == "Product"~ "Commercial",
    TRUE ~ "Other"))

df2 <- df2 %>%
  mutate(dept=case_when(
    department == "News" ~ "News", 
    department == "Editorial" ~ "News",
    department == "News Service and Syndicate" ~ "News", 
    department == "Audience Development and Insights" ~ "Commercial",
    department == "Client Solutions" ~ "Commercial",
    department == "Customer Care and Logistics" ~ "Commercial",
    department == "Finance" ~ "Commercial",
    department == "Legal" ~ "Commercial",
    department == "Marketing" ~ "Commercial",
    department == "WP News Media Services" ~ "Commercial",
    department == "Production" ~ "Commercial",
    department == "Public Relations" ~ "Commercial",
    department == "Washington Post Live" ~ "Commercial",
    department == "Product" ~ "Commercial",
    TRUE ~ "Other"))

df <- df %>%
  mutate(desk=case_when(
    cost_center_current == "110000 News Operations" ~ "Operations",
    cost_center_current == "110001 News Digital Operations" ~ "Operations",
    cost_center_current == "110610 Audience Development and Engagement" ~ "Audience Development and Engagement",
    cost_center_current == "110620 News Audio" ~ "Audio",
    cost_center_current == "110604 Presentation Design" ~ "Design",
    cost_center_current == "110605 Presentation" ~ "Photography",
    cost_center_current == "110664 News National Apps" ~ "Emerging News Products",
    cost_center_current == "110665 News The Lily" ~ "Emerging News Products",
    cost_center_current == "110666 News Snapchat" ~ "Emerging News Products",
    cost_center_current == "110667 News By The Way" ~ "Emerging News Products",
    cost_center_current == "113210 Economy and Business" ~ "Financial",
    cost_center_current == "114000 Foreign Administration" ~ "Foreign",
    cost_center_current == "114095 News Foreign Brazil" ~ "Foreign",
    cost_center_current == "114100 Foreign Latam" ~ "Foreign",
    cost_center_current == "114220 News Foreign Istanbul" ~ "Foreign",
    cost_center_current == "114235 Foreign Western Europe" ~ "Foreign",
    cost_center_current == "114300 News Foreign West Africa" ~ "Foreign",
    cost_center_current == "114415 Foreign Hong Kong" ~ "Foreign",
    cost_center_current == "114405 Foreign Beijing Bureau" ~ "Foreign",
    cost_center_current == "114105 Foreign Mexico Bureau" ~ "Foreign",
    cost_center_current == "114005 Foreign Beirut Bureau" ~ "Foreign",
    cost_center_current == "114400 Foreign India Bureau" ~ "Foreign",
    cost_center_current == "114410 Foreign Tokyo Bureau" ~ "Foreign",
    cost_center_current == "114205 Foreign Islamabad Bureau" ~ "Foreign",
    cost_center_current == "114305 Foreign Nairobi Bureau" ~ "Foreign",
    cost_center_current == "114240 Foreign Rome Bureau" ~ "Foreign",
    cost_center_current == "114200 Foreign London Bureau" ~ "Foreign",
    cost_center_current == "114230 Foreign Moscow Bureau" ~ "Foreign",
    cost_center_current == "114225 Foreign Cairo Bureau" ~ "Foreign",
    cost_center_current == "114215 Foreign Berlin Bureau" ~ "Foreign",
    cost_center_current == "114310 Foreign Baghdad Bureau" ~ "Foreign",
    cost_center_current == "114315 Foreign Jerusalem Bureau" ~ "Foreign",
    cost_center_current == "110603 Presentation Graphics" ~ "Graphics",
    cost_center_current == "110450 Investigative" ~ "Investigative",
    cost_center_current == "112300 Local Politics and Government" ~ "Local",
    cost_center_current == "110601 Multiplatform Desk" ~ "Multiplatform",
    cost_center_current == "110500 Magazine" ~ "National",
    cost_center_current == "113200 National Politics and Government" ~ "National",
    cost_center_current == "113205 National Security" ~ "National",
    cost_center_current == "113215 News National Health & Science" ~ "National",
    cost_center_current == "113220 National Enterprise" ~ "National",
    cost_center_current == "113235 National America" ~ "National",
    cost_center_current == "113240 News National Environment" ~ "National",
    cost_center_current == "110006 News Content & Research" ~ "News Content and Research",
    cost_center_current == "110455 News Logistics" ~ "News Logistics",
    cost_center_current == "110410 Book World" ~ "Outlook",
    cost_center_current == "110460 Outlook" ~ "Outlook",
    cost_center_current == "110475 Polling" ~ "Polling",
    cost_center_current == "110015 Sports Main" ~ "Sports",
    cost_center_current == "110300 Style" ~ "Style",
    cost_center_current == "110435 Food" ~ "Style",
    cost_center_current == "110485 Travel" ~ "Style",
    cost_center_current == "110495 Local Living" ~ "Style",
    cost_center_current == "110505 Weekend" ~ "Style",
    cost_center_current == "110600 Universal Desk" ~ "Universal Desk",
    cost_center_current == "110652 News Video - General" ~ "Video",
    cost_center_current == "110663 Wake Up Report" ~ "Other",
    cost_center_current == "115000 Editorial Administration" ~ "Editorial",
    TRUE ~ "non-newsroom"))

df2 <- df2 %>%
  mutate(desk=case_when(
    cost_center_current == "110000 News Operations" ~ "Operations",
    cost_center_current == "110001 News Digital Operations" ~ "Operations",
    cost_center_current == "110610 Audience Development and Engagement" ~ "Audience Development and Engagement",
    cost_center_current == "110620 News Audio" ~ "Audio",
    cost_center_current == "110604 Presentation Design" ~ "Design",
    cost_center_current == "110605 Presentation" ~ "Photography",
    cost_center_current == "110664 News National Apps" ~ "Emerging News Products",
    cost_center_current == "110665 News The Lily" ~ "Emerging News Products",
    cost_center_current == "110666 News Snapchat" ~ "Emerging News Products",
    cost_center_current == "110667 News By The Way" ~ "Emerging News Products",
    cost_center_current == "113210 Economy and Business" ~ "Financial",
    cost_center_current == "114000 Foreign Administration" ~ "Foreign",
    cost_center_current == "114095 News Foreign Brazil" ~ "Foreign",
    cost_center_current == "114100 Foreign Latam" ~ "Foreign",
    cost_center_current == "114220 News Foreign Istanbul" ~ "Foreign",
    cost_center_current == "114235 Foreign Western Europe" ~ "Foreign",
    cost_center_current == "114300 News Foreign West Africa" ~ "Foreign",
    cost_center_current == "114415 Foreign Hong Kong" ~ "Foreign",
    cost_center_current == "114405 Foreign Beijing Bureau" ~ "Foreign",
    cost_center_current == "114105 Foreign Mexico Bureau" ~ "Foreign",
    cost_center_current == "114005 Foreign Beirut Bureau" ~ "Foreign",
    cost_center_current == "114400 Foreign India Bureau" ~ "Foreign",
    cost_center_current == "114410 Foreign Tokyo Bureau" ~ "Foreign",
    cost_center_current == "114205 Foreign Islamabad Bureau" ~ "Foreign",
    cost_center_current == "114305 Foreign Nairobi Bureau" ~ "Foreign",
    cost_center_current == "114240 Foreign Rome Bureau" ~ "Foreign",
    cost_center_current == "114200 Foreign London Bureau" ~ "Foreign",
    cost_center_current == "114230 Foreign Moscow Bureau" ~ "Foreign",
    cost_center_current == "114225 Foreign Cairo Bureau" ~ "Foreign",
    cost_center_current == "114215 Foreign Berlin Bureau" ~ "Foreign",
    cost_center_current == "114310 Foreign Baghdad Bureau" ~ "Foreign",
    cost_center_current == "114315 Foreign Jerusalem Bureau" ~ "Foreign",
    cost_center_current == "110603 Presentation Graphics" ~ "Graphics",
    cost_center_current == "110450 Investigative" ~ "Investigative",
    cost_center_current == "112300 Local Politics and Government" ~ "Local",
    cost_center_current == "110601 Multiplatform Desk" ~ "Multiplatform",
    cost_center_current == "110500 Magazine" ~ "National",
    cost_center_current == "113200 National Politics and Government" ~ "National",
    cost_center_current == "113205 National Security" ~ "National",
    cost_center_current == "113215 News National Health & Science" ~ "National",
    cost_center_current == "113220 National Enterprise" ~ "National",
    cost_center_current == "113235 National America" ~ "National",
    cost_center_current == "113240 News National Environment" ~ "National",
    cost_center_current == "110006 News Content & Research" ~ "News Content and Research",
    cost_center_current == "110455 News Logistics" ~ "News Logistics",
    cost_center_current == "110410 Book World" ~ "Outlook",
    cost_center_current == "110460 Outlook" ~ "Outlook",
    cost_center_current == "110475 Polling" ~ "Polling",
    cost_center_current == "110015 Sports Main" ~ "Sports",
    cost_center_current == "110300 Style" ~ "Style",
    cost_center_current == "110435 Food" ~ "Style",
    cost_center_current == "110485 Travel" ~ "Style",
    cost_center_current == "110495 Local Living" ~ "Style",
    cost_center_current == "110505 Weekend" ~ "Style",
    cost_center_current == "110600 Universal Desk" ~ "Universal Desk",
    cost_center_current == "110652 News Video - General" ~ "Video",
    cost_center_current == "110663 Wake Up Report" ~ "Other",
    cost_center_current == "115000 Editorial Administration" ~ "Editorial",
    TRUE ~ "non-newsroom"))

df <- df %>%
  mutate(tier=case_when(
    desk == "National" ~ "Tier 1",
    desk == "Foreign" ~ "Tier 1",
    desk == "Financial" ~ "Tier 1",
    desk == "Investigative" ~ "Tier 1",
    desk == "Style" ~ "Tier 2",
    desk == "Local" ~ "Tier 2",
    desk == "Graphics" ~ "Tier 2",
    desk == "Universal Desk" ~ "Tier 2",
    desk == "Sports" ~ "Tier 2",
    desk == "Outlook" ~ "Tier 2",
    desk == "Editorial" ~ "Tier 2",
    desk == "Audio" ~ "Tier 3",
    desk == "Polling" ~ "Tier 3",
    desk == "Design" ~ "Tier 3",
    desk == "Operations" ~ "Tier 3",
    desk == "Multiplatform" ~ "Tier 3",
    desk == "Video" ~ "Tier 3",
    desk == "Audience Development and Engagement" ~ "Tier 3",
    desk == "News Logistics" ~ "Tier 4",
    desk == "News Content and Research" ~ "Tier 4",
    desk == "Emerging News Products" ~ "Tier 4",
    desk == "Other" ~ "Tier 4",
    TRUE ~ "Other"))

df2 <- df2 %>%
  mutate(tier=case_when(
    desk == "National" ~ "Tier 1",
    desk == "Foreign" ~ "Tier 1",
    desk == "Financial" ~ "Tier 1",
    desk == "Investigative" ~ "Tier 1",
    desk == "Style" ~ "Tier 2",
    desk == "Local" ~ "Tier 2",
    desk == "Graphics" ~ "Tier 2",
    desk == "Universal Desk" ~ "Tier 2",
    desk == "Sports" ~ "Tier 2",
    desk == "Outlook" ~ "Tier 2",
    desk == "Editorial" ~ "Tier 2",
    desk == "Audio" ~ "Tier 3",
    desk == "Polling" ~ "Tier 3",
    desk == "Design" ~ "Tier 3",
    desk == "Operations" ~ "Tier 3",
    desk == "Multiplatform" ~ "Tier 3",
    desk == "Video" ~ "Tier 3",
    desk == "Audience Development and Engagement" ~ "Tier 3",
    desk == "News Logistics" ~ "Tier 4",
    desk == "News Content and Research" ~ "Tier 4",
    desk == "Emerging News Products" ~ "Tier 4",
    desk == "Other" ~ "Tier 4",
    TRUE ~ "Other"))

df <- df %>%
  mutate(race_grouping=case_when(
    race_ethnicity == "White (United States of America)" ~ "white",
    race_ethnicity == "Black or African American (United States of America)" ~ "person of color",
    race_ethnicity == "Asian (United States of America)" ~ "person of color",
    race_ethnicity == "Hispanic or Latino (United States of America)" ~ "person of color",
    race_ethnicity == "Two or More Races (United States of America)" ~ "person of color",
    race_ethnicity == "American Indian or Alaska Native (United States of America)" ~ "person of color",
    race_ethnicity == "Native Hawaiian or Other Pacific Islander (United States of America)" ~ "person of color",
    TRUE ~ "unknown"))

df2 <- df2 %>%
  mutate(race_grouping=case_when(
    race_ethnicity == "White (United States of America)" ~ "white",
    race_ethnicity == "Black or African American (United States of America)" ~ "person of color",
    race_ethnicity == "Asian (United States of America)" ~ "person of color",
    race_ethnicity == "Hispanic or Latino (United States of America)" ~ "person of color",
    race_ethnicity == "Two or More Races (United States of America)" ~ "person of color",
    race_ethnicity == "American Indian or Alaska Native (United States of America)" ~ "person of color",
    race_ethnicity == "Native Hawaiian or Other Pacific Islander (United States of America)" ~ "person of color",
    TRUE ~ "unknown"))


pay_change_function <- function(x=df, cycles=26, label="active") {
  for (i in 1:cycles) {
    if (i <24) {
      df_year <- df %>% 
        select(employee_id, paste0("business_process_reason", as.character(i)),
               paste0("base_pay_change", as.character(i)),
               paste0("effective_date", as.character(i)),
               paste0("pay_rate_type", as.character(i)),
               paste0("time_type", as.character(i)),
               gender, race_ethnicity, race_grouping,
               age_group_5, dept, tier,
               `2008_annual_performance_rating`,
               `2009_annual_performance_rating`,
               `2010_annual_performance_rating`,
               `2011_annual_performance_rating`,
               `2012_annual_performance_rating`,
               `2013_annual_performance_rating`,
               `2014_annual_performance_rating`,
               `2015_annual_performance_rating`,
               `2016_annual_performance_rating`,
               `2017_annual_performance_rating`,
               `2018_annual_performance_rating`,
               `2019_annual_performance_rating`,
               `2020_annual_performance_rating`
        ) 
    } else {
      df_year <- df %>% 
        select(employee_id, paste0("business_process_reason", as.character(i)),
               #paste0("base_pay_change", as.character(i)),
               paste0("effective_date", as.character(i)),
               #paste0("pay_rate_type", as.character(i)),
               gender, race_ethnicity, race_grouping,
               age_group_5, dept, tier,
               `2008_annual_performance_rating`,
               `2009_annual_performance_rating`,
               `2010_annual_performance_rating`,
               `2011_annual_performance_rating`,
               `2012_annual_performance_rating`,
               `2013_annual_performance_rating`,
               `2014_annual_performance_rating`,
               `2015_annual_performance_rating`,
               `2016_annual_performance_rating`,
               `2017_annual_performance_rating`,
               `2018_annual_performance_rating`,
               `2019_annual_performance_rating`,
               `2020_annual_performance_rating`
               ) 
      
      df_year$base_pay_change <- NA
      df_year$pay_rate_type <- NA
      df_year$time_type <- NA
      
    }
    
    colnames(df_year) <- gsub(paste0("n", as.character(i)), "n", colnames(df_year))
    colnames(df_year) <- gsub(paste0("e", as.character(i)), "e", colnames(df_year))
    #colnames(df_year) <- gsub(paste0("r", as.character(i)), "r", colnames(df_year))
    
    df_year <- df_year %>% 
      select(employee_id, business_process_reason,base_pay_change,effective_date,
             pay_rate_type,gender, race_ethnicity, race_grouping,
             age_group_5, dept, tier,
             `2008_annual_performance_rating`,
             `2009_annual_performance_rating`,
             `2010_annual_performance_rating`,
             `2011_annual_performance_rating`,
             `2012_annual_performance_rating`,
             `2013_annual_performance_rating`,
             `2014_annual_performance_rating`,
             `2015_annual_performance_rating`,
             `2016_annual_performance_rating`,
             `2017_annual_performance_rating`,
             `2018_annual_performance_rating`,
             `2019_annual_performance_rating`,
             `2020_annual_performance_rating`) %>% 
      mutate(workday = paste0(label, "-", i))
    
    if (i==1) {
      mega_df <- df_year
    } else {
      mega_df <- rbind(mega_df, df_year)
    }
  }
  return(mega_df)
}

reason_for_change1 <- pay_change_function(df, cycles=25, label="active")
reason_for_change2 <- pay_change_function(df2, cycles=15, label="terminated")
reason_for_change_combined <- rbind(reason_for_change1, reason_for_change2)

emp_perf_function <- function(x=df_name, year_start=2008, year_end=2020, label="active") {
  for (i in year_start:year_end) {
    df_name <- x
    df_year <- df_name %>% 
      select(employee_id, paste0(as.character(i), "_annual_performance_rating"),
             gender, race_ethnicity, race_grouping, dept)
    
    colnames(df_year) <- c("employee_id", "performance_rating","gender",
                           "race_ethnicity","race_grouping","dept")
    
    df_year$workday <- paste0(label, "-", i)
    
    if (i==year_start) {
      mega_df <- df_year
    } else {
      mega_df <- rbind(mega_df, df_year)  
      
    }
  }
  return(mega_df)
}  

ratings1 <- emp_perf_function(df, year_start=2008, year_end=2020, label="active")
ratings2 <- emp_perf_function(df2, year_start=2008, year_end=2020, label="terminated")

ratings_combined <- rbind(ratings1, ratings2) %>% 
  filter(!is.na(performance_rating))

news_salaried <- filter(df, dept == "News", pay_rate_type == "Salaried")
news_hourly <- filter(df, dept == "News", pay_rate_type == "Hourly")
commercial_salaried <- filter(df, dept == "Commercial", pay_rate_type == "Salaried")
commercial_hourly <- filter(df, dept == "Commercial", pay_rate_type == "Hourly")

news_salaried2 <-  filter(df2, dept == "News", pay_rate_type == "Salaried")
news_hourly2 <-  filter(df2, dept == "News", pay_rate_type == "Hourly")
commercial_salaried2 <-  filter(df2, dept == "Commercial", pay_rate_type == "Salaried")
commercial_hourly2 <-  filter(df2, dept == "Commercial", pay_rate_type == "Hourly")
#Within our Guild data, 67 percent of Guild-covered employees were White, and 28 percent were people of color.

df %>% 
  filter(pay_rate_type=="Salaried") %>% 
  group_by(race_grouping) %>% 
  summarize(n=n(), median=median(current_base_pay, na.rm=T),
            age=median(age, na.rm=T)) %>% 
  mutate(percent=round(n/sum(n)*100,0))%>% 
  kable()
race_grouping n median age percent
person of color 218 94530.0 35.5 28
unknown 40 129890.0 35.0 5
white 525 110453.4 39.0 67
# Salaried White male Newsroom employees outnumbered salaried White female Newsroom employees (229 to 207), but salaried women of color outnumbered salaried men of color in the Newsroom (109 to 72). 
df %>% 
  filter(dept=="News") %>% 
  filter(pay_rate_type=="Salaried") %>% 
  count(gender, race_grouping) %>% 
  filter(n>=5)%>% 
  kable()
gender race_grouping n
Female person of color 109
Female unknown 20
Female white 207
Male person of color 72
Male unknown 19
Male white 229
#The median salary for men was $120,977, compared with $102,700 for women, a gap of 18 percent. 
df %>% 
  filter(dept=="News") %>% 
  filter(pay_rate_type=="Salaried") %>% 
  group_by(gender) %>% 
  summarize(n=n(), median=median(current_base_pay, na.rm=T),
            age=median(age, na.rm=T)) %>% 
  mutate(percent=round(n/sum(n)*100,0))  %>% 
  filter(n>=5)%>% 
  kable()
gender n median age percent
Female 336 102700.0 35.5 51
Male 320 120976.6 42.0 49
#The median pay for women was $36 per hour, compared with $34 per hour for men. 
df %>% 
  filter(dept=="News") %>% 
  filter(pay_rate_type=="Hourly") %>% 
  group_by(gender) %>% 
  summarize(n=n(), median=median(current_base_pay, na.rm=T),
            age=median(age, na.rm=T)) %>% 
  mutate(percent=round(n/sum(n)*100,0))  %>% 
  filter(n>=5)%>% 
  kable()
gender n median age percent
Female 50 35.90 34 60
Male 33 33.77 34 40
#The disparity in median pay between men and women was starkest among employees under the age of 40, among whom there was a gap of 12.5 percent, amounting to nearly $13,000.
df %>% 
  filter(dept=="News") %>% 
  filter(pay_rate_type=="Salaried") %>%
  mutate(y40 =case_when(age >=40 ~ "Over 40",
                        age < 40 ~ "Under 40",
                        TRUE ~ "Unknown")) %>% 
  group_by(gender, y40) %>%
  summarize(n=n(), median=median(current_base_pay, na.rm=T),
            age=median(age, na.rm=T)) %>%
  mutate(percent=round(n/sum(n)*100,0))  %>%
  filter(n>=5)%>% 
  kable()
gender y40 n median age percent
Female Over 40 125 128280 52 37
Female Under 40 211 90440 31 63
Male Over 40 181 131410 52 57
Male Under 40 139 103400 34 43
#103400-90400
#(103400-90400)/90400
#White journalists made nearly 16 percent more than journalists of color. 
#White journalists made a median salary of $113,810, while journalists of color had a median salary of $98,435. 
df %>% 
  filter(dept=="News") %>% 
  filter(pay_rate_type=="Salaried") %>% 
  group_by(race_grouping) %>% 
  summarize(n=n(), median=median(current_base_pay, na.rm=T),
            age=median(age, na.rm=T)) %>% 
  mutate(percent=round(n/sum(n)*100,0))  %>% 
  filter(n>=5)%>% 
  kable()
race_grouping n median age percent
person of color 181 98435.0 35 28
unknown 39 130000.0 35 6
white 437 113809.7 41 67
#This gap was particularly stark when comparing the median salaries for White men and women of color. White men had a median salary of $123,797, compared with a median salary of $94,840 for women of color. That gap was nearly $29,000 — or over 28 percent.
df %>% 
  filter(dept=="News") %>% 
  filter(pay_rate_type=="Salaried") %>% 
  group_by(gender, race_grouping) %>% 
  summarize(n=n(), median=median(current_base_pay, na.rm=T),
            age=median(age, na.rm=T)) %>% 
  mutate(percent=round(n/sum(n)*100,0))  %>% 
  filter(n>=5)%>% 
  kable()
gender race_grouping n median age percent
Female person of color 109 94840.0 33.0 32
Female unknown 20 116890.0 33.5 6
Female white 207 105780.0 37.0 62
Male person of color 72 104420.0 40.0 22
Male unknown 19 133560.0 36.0 6
Male white 229 123796.9 43.0 72
#For the 320 salaried male Newsroom employees working at The Post, the median salary was $120,977. 
#For the 336 salaried female Newsroom employees, it was $102,700.
#The median age for salaried men working in the Newsroom was 42, compared with 36 for salaried women.

df %>% 
  filter(dept=="News") %>% 
  filter(pay_rate_type=="Salaried") %>% 
  group_by(gender) %>% 
  summarize(n=n(), median=median(current_base_pay, na.rm=T),
            age=median(age, na.rm=T)) %>% 
  mutate(percent=round(n/sum(n)*100,0))  %>% 
  filter(n>=5)%>% 
  kable()
gender n median age percent
Female 336 102700.0 35.5 51
Male 320 120976.6 42.0 49
#For the 83 hourly employees across the Newsroom, there was also a slight difference. 
#The median hourly wage for men was $34, compared with $36 for women. 
#Comparing by age is difficult because only 33 men worked hourly jobs in the Newsroom and their ages varied widely.
#That said, female hourly workers (50) made more than male hourly workers. 
#Under the age of 40, the gap was nearly $3, and over the age of 40, the gap was $10. 
#However, the sample size for hourly workers was low, meaning a few employees could lower or raise the median fairly drastically.

df %>% 
  filter(dept=="News") %>% 
  filter(pay_rate_type=="Hourly") %>% 
  group_by(gender) %>% 
  summarize(n=n(), median=median(current_base_pay, na.rm=T),
            age=median(age, na.rm=T)) %>% 
  mutate(percent=round(n/sum(n)*100,0))  %>% 
  filter(n>=5)%>% 
  kable()
gender n median age percent
Female 50 35.90 34 60
Male 33 33.77 34 40
#Over the age of 40: There were nearly 1.4 men for every woman in the Newsroom. 
#Their median salary was 2.4 percent more, a gap of over $3,000. 
#Under the age of 40: There were nearly 1.5 women for every man in the Newsroom. 
#Their median salary was 12.5 percent less, a gap of nearly $13,000.

df %>% 
  filter(dept=="News") %>% 
  filter(pay_rate_type=="Salaried") %>%
  mutate(y40 =case_when(age >=40 ~ "Over 40",
                        age < 40 ~ "Under 40",
                        TRUE ~ "Unknown")) %>% 
  group_by(gender, y40) %>%
  summarize(n=n(), median=median(current_base_pay, na.rm=T),
            age=median(age, na.rm=T)) %>%
  mutate(percent=round(n/sum(n)*100,0))  %>%
  filter(n>=5)%>% 
  kable()
gender y40 n median age percent
Female Over 40 125 128280 52 37
Female Under 40 211 90440 31 63
Male Over 40 181 131410 52 57
Male Under 40 139 103400 34 43
#Among those under 40, Newsroom employees of color made about 4 percent less than their White counterparts, with median salaries of $92,000 and $96,000, respectively.
#The race disparity widened for journalists 40 and over. 
#Newsroom employees of color had a median salary of $121,875, while their White colleagues had a median salary of $130,800 — a gap of more than 7 percent.

df %>% 
  filter(dept=="News") %>% 
  filter(pay_rate_type=="Salaried") %>%
  mutate(y40 =case_when(age >=40 ~ "Over 40",
                        age < 40 ~ "Under 40",
                        TRUE ~ "Unknown")) %>% 
  group_by(race_grouping, y40) %>%
  summarize(n=n(), median=median(current_base_pay, na.rm=T),
            age=median(age, na.rm=T)) %>%
  mutate(percent=round(n/sum(n)*100,0))  %>%
  filter(n>=5)%>% 
  kable()
race_grouping y40 n median age percent
person of color Over 40 68 121874.6 47.5 38
person of color Under 40 113 92000.0 30.0 62
unknown Over 40 10 151785.4 54.0 26
unknown Under 40 29 105000.0 33.0 74
white Over 40 228 130799.7 52.0 52
white Under 40 209 96000.0 32.0 48
#Below are the median salaries by race and ethnicity across the Newsroom:
#  White: $113,810
#Black: $102,700
#Asian: $103,970
#Hispanic or Latino: $95,780
#Two or more races: $91,090

df %>% 
  filter(dept=="News") %>% 
  filter(pay_rate_type=="Salaried") %>%
  mutate(y40 =case_when(age >=40 ~ "Over 40",
                        age < 40 ~ "Under 40",
                        TRUE ~ "Unknown")) %>% 
  group_by(race_ethnicity) %>%
  summarize(n=n(), median=median(current_base_pay, na.rm=T),
            age=median(age, na.rm=T)) %>%
  mutate(percent=round(n/sum(n)*100,0))  %>%
  filter(n>=5)%>% 
  kable()
race_ethnicity n median age percent
Asian (United States of America) 59 103970.05 35.0 9
Black or African American (United States of America) 60 102700.04 39.0 9
Hispanic or Latino (United States of America) 42 95780.04 35.5 6
Prefer Not to Disclose (United States of America) 12 94890.00 31.0 2
Two or More Races (United States of America) 18 91090.00 30.0 3
White (United States of America) 437 113809.68 41.0 67
NA 27 144340.00 36.0 4
#The gap between White journalists in the Newsroom and journalists of color was nearly 16 percent, with a median salary of $113,810 for 437 White journalists and $98,435 for 181 journalists of color. 
df %>% 
  filter(dept=="News") %>% 
  filter(pay_rate_type=="Salaried") %>%
  mutate(y40 =case_when(age >=40 ~ "Over 40",
                        age < 40 ~ "Under 40",
                        TRUE ~ "Unknown")) %>% 
  group_by(race_grouping) %>%
  summarize(n=n(), median=median(current_base_pay, na.rm=T),
            age=median(age, na.rm=T)) %>%
  mutate(percent=round(n/sum(n)*100,0))  %>%
  filter(n>=5)%>% 
  kable()
race_grouping n median age percent
person of color 181 98435.0 35 28
unknown 39 130000.0 35 6
white 437 113809.7 41 67
#About 58 of the hourly employees in the Newsroom were White — and they made a median wage of $35 an hour compared with $34 for hourly employees of color. 
df %>% 
  filter(dept=="News") %>% 
  filter(pay_rate_type=="Hourly") %>%
  mutate(y40 =case_when(age >=40 ~ "Over 40",
                        age < 40 ~ "Under 40",
                        TRUE ~ "Unknown")) %>% 
  group_by(race_grouping) %>%
  summarize(n=n(), median=median(current_base_pay, na.rm=T),
            age=median(age, na.rm=T)) %>%
  mutate(percent=round(n/sum(n)*100,0))  %>%
  filter(n>=5)%>% 
  kable()
race_grouping n median age percent
person of color 24 33.59 27.5 29
white 58 35.40 36.5 70
#The median salaries by group were as follows: 

#White men: $123,797
#Men of color: $104,420
#White women: $105,780
#Women of color: $94,840

#Median ages by group were as follows:

#White men: 43
#Men of color: 40
#White women: 37
#Women of color: 33

df %>% 
  filter(dept=="News") %>% 
  filter(pay_rate_type=="Salaried") %>%
  mutate(y40 =case_when(age >=40 ~ "Over 40",
                        age < 40 ~ "Under 40",
                        TRUE ~ "Unknown")) %>% 
  group_by(race_grouping, gender) %>%
  summarize(n=n(), median=median(current_base_pay, na.rm=T),
            age=median(age, na.rm=T)) %>%
  mutate(percent=round(n/sum(n)*100,0))  %>%
  filter(n>=5)%>% 
  kable()
race_grouping gender n median age percent
person of color Female 109 94840.0 33.0 60
person of color Male 72 104420.0 40.0 40
unknown Female 20 116890.0 33.5 51
unknown Male 19 133560.0 36.0 49
white Female 207 105780.0 37.0 47
white Male 229 123796.9 43.0 52
# For most sections the median salary for men surpasses women CHART
#Of the 15 sections that had at least five men and five women:
#In 10 sections, men had higher median pay than women (beyond 5 percent): Audience, Design, Foreign, Investigative, Local, Multiplatform, National, Opinion, Photography and Style.
#One section, Sports, had a median pay disparity that favors women by 18 percent.
#In four sections, there was approximate pay equity between genders (within 5 percent): Emerging News Products, Financial, Graphics and Video.

desk_analysis <- df %>% 
  filter(dept=="News") %>% 
  filter(pay_rate_type=="Salaried") %>%
  mutate(y40 =case_when(age >=40 ~ "Over 40",
                        age < 40 ~ "Under 40",
                        TRUE ~ "Unknown")) %>% 
  group_by(desk, gender) %>%
  summarize(n=n(), median=median(current_base_pay, na.rm=T),
            age=median(age, na.rm=T)) %>%
  mutate(percent=round(n/sum(n)*100,0))  %>%
  filter(n>=5)

kable(desk_analysis)
desk gender n median age percent
Audience Development and Engagement Female 19 88880.00 31.0 68
Audience Development and Engagement Male 9 94780.00 34.0 32
Audio Female 5 94280.00 31.0 62
Design Female 19 82201.52 29.0 58
Design Male 14 93640.00 37.0 42
Editorial Female 15 102692.61 33.0 45
Editorial Male 18 156476.68 45.5 55
Emerging News Products Female 19 73440.00 29.0 73
Emerging News Products Male 7 76840.01 28.0 27
Financial Female 16 133666.00 42.0 37
Financial Male 27 137340.00 41.0 63
Foreign Female 17 129780.00 31.0 52
Foreign Male 16 150780.00 37.5 48
Graphics Female 11 104320.14 31.0 52
Graphics Male 9 101640.00 34.0 43
Investigative Female 6 133200.00 38.0 35
Investigative Male 11 151780.00 43.0 65
Local Female 36 108075.80 39.5 53
Local Male 32 122011.18 51.5 47
Multiplatform Female 28 88560.00 42.5 64
Multiplatform Male 16 96356.89 51.5 36
National Female 54 145602.97 42.5 47
National Male 60 178737.50 41.0 53
Photography Female 10 91847.57 42.5 37
Photography Male 17 107940.95 49.0 63
Sports Female 10 125935.00 32.5 23
Sports Male 33 103685.00 41.0 77
Style Female 28 107060.00 41.0 56
Style Male 22 118280.00 46.5 44
Video Female 29 91560.00 30.0 63
Video Male 17 95000.00 35.0 37
# Workers of color have lower median salary in nearly all departments CHART
#Of 13 sections for which there were at least five White journalists and five journalists of color:
#Six sections had a median pay disparity favoring White journalists of more than 7 percent: the Newsroom, Emerging News Products, Video, Style, Photography and National.
#Seven sections had approximate pay equity (within 5 percent) between journalists of color and White journalists: Audience, Design, Financial, Graphics, Local, Multiplatform and Sports.

desk_analysis <- df %>% 
  filter(dept=="News") %>% 
  filter(pay_rate_type=="Salaried") %>%
  mutate(y40 =case_when(age >=40 ~ "Over 40",
                        age < 40 ~ "Under 40",
                        TRUE ~ "Unknown")) %>% 
  group_by(desk, race_grouping) %>%
  summarize(n=n(), median=median(current_base_pay, na.rm=T),
            age=median(age, na.rm=T)) %>%
  mutate(percent=round(n/sum(n)*100,0))  %>%
  filter(n>=5)

kable(desk_analysis)
desk race_grouping n median age percent
Audience Development and Engagement person of color 11 88280.00 34.0 39
Audience Development and Engagement white 17 89280.00 31.0 61
Design person of color 15 90009.88 32.0 45
Design white 17 85060.00 32.0 52
Editorial person of color 7 104560.00 33.0 21
Editorial white 25 134806.25 42.0 76
Emerging News Products person of color 7 71853.00 27.0 27
Emerging News Products white 18 76700.01 28.5 69
Financial person of color 8 131420.00 38.5 19
Financial white 33 137799.88 41.0 77
Foreign unknown 25 144340.00 36.0 76
Foreign white 6 124843.27 32.5 18
Graphics person of color 8 102980.07 29.5 38
Graphics white 11 106340.00 33.0 52
Investigative white 13 151780.00 43.0 76
Local person of color 19 113000.00 43.0 28
Local white 49 113280.00 46.0 72
Multiplatform person of color 8 90611.76 46.0 18
Multiplatform white 36 92547.18 49.0 82
National person of color 28 135595.00 39.5 25
National white 83 177838.45 44.0 73
Photography person of color 7 95000.00 49.0 26
Photography white 19 106340.00 48.0 70
Sports person of color 14 102780.00 38.5 33
Sports white 29 107560.00 41.0 67
Style person of color 12 104243.74 36.0 24
Style white 38 115471.31 47.5 76
Video person of color 18 87000.00 29.5 39
Video white 27 95000.00 35.0 59
# Race and gender pay disparities evident across sections CHART
#There were only four sections that had at least five White men, five White women, five men of color and five women of color: Design, Local, National and Sports.
#All four sections had racial and gender pay disparities. 
#For employees across these four sections, the disparity in median salaries between White men and women of color was 8 percent. 
#The starkest gaps in median salaries were between White men and men of color in National ($51,000) and between White women and women of color in Sports ($65,000).


desk_analysis <- df %>% 
  filter(dept=="News") %>% 
  filter(pay_rate_type=="Salaried") %>%
  mutate(y40 =case_when(age >=40 ~ "Over 40",
                        age < 40 ~ "Under 40",
                        TRUE ~ "Unknown")) %>% 
  group_by(desk, race_grouping, gender) %>%
  summarize(n=n(), median=median(current_base_pay, na.rm=T),
            age=median(age, na.rm=T)) %>%
  mutate(percent=round(n/sum(n)*100,0))  %>%
  filter(n>=5)

kable(desk_analysis)
desk race_grouping gender n median age percent
Audience Development and Engagement person of color Female 9 90000.00 35.0 82
Audience Development and Engagement white Female 10 86330.00 29.0 59
Audience Development and Engagement white Male 7 103060.00 35.0 41
Design person of color Female 8 85789.94 29.0 53
Design person of color Male 7 92500.00 41.0 47
Design white Female 11 82201.52 31.0 65
Design white Male 6 92500.00 37.0 35
Editorial white Female 10 107200.37 35.0 40
Editorial white Male 15 158673.36 46.0 60
Emerging News Products white Female 14 72940.00 29.0 78
Financial person of color Female 6 141070.00 43.0 75
Financial white Female 10 129876.00 40.5 30
Financial white Male 23 142027.50 41.0 70
Foreign unknown Female 13 136560.00 36.0 52
Foreign unknown Male 12 147670.00 37.5 48
Graphics person of color Female 5 104320.14 28.0 62
Graphics white Female 5 104780.00 31.0 45
Graphics white Male 5 117131.00 59.0 45
Investigative white Male 9 151840.00 43.0 69
Local person of color Female 12 106890.00 39.5 63
Local person of color Male 7 122909.15 50.0 37
Local white Female 24 108075.80 38.5 49
Local white Male 25 121113.20 54.0 51
Multiplatform person of color Female 5 84220.51 32.0 62
Multiplatform white Female 23 89840.00 43.0 64
Multiplatform white Male 13 96600.08 50.0 36
National person of color Female 16 137890.00 40.0 57
National person of color Male 12 135050.00 37.5 43
National white Female 36 148550.00 45.5 43
National white Male 47 186065.00 43.0 57
Photography person of color Male 7 95000.00 49.0 100
Photography white Female 10 91847.57 42.5 53
Photography white Male 9 119340.00 52.0 47
Sports person of color Female 5 95000.00 27.0 36
Sports person of color Male 9 110560.00 42.0 64
Sports white Female 5 160089.98 60.0 17
Sports white Male 24 102653.66 40.0 83
Style person of color Female 8 96353.74 36.0 67
Style white Female 20 111833.47 47.0 53
Style white Male 18 116175.06 49.0 47
Video person of color Female 15 84000.00 29.0 83
Video white Female 13 95000.00 35.0 48
Video white Male 14 94500.00 35.0 52
#Across sections in which employees’ median salary was higher than $125,000 (including National, Investigative, Foreign, Financial and Opinions): 
#67 percent of employees were White and 55 percent of employees were male. 


df %>% 
  filter(dept=="News") %>% 
  filter(pay_rate_type=="Salaried") %>% 
  mutate(t_desk=case_when(
    desk=="National" | 
      desk=="Investigative" |
      desk=="Foreign" |
      desk=="Financial" |
      desk=="Editorial" ~ "Higher",
    TRUE ~ "Lower")) %>% 
  group_by(t_desk, race_grouping) %>% 
  summarise(
    count = length(current_base_pay),
    median = median(current_base_pay, na.rm = FALSE)
  ) %>% 
  mutate(percent=count/sum(count)*100)%>% 
  kable()
t_desk race_grouping count median percent
Higher person of color 48 134455 20.000000
Higher unknown 32 136950 13.333333
Higher white 160 150485 66.666667
Lower person of color 133 92400 31.894484
Lower unknown 7 90000 1.678657
Lower white 277 100000 66.426858
df %>% 
  filter(dept=="News") %>% 
  filter(pay_rate_type=="Salaried") %>% 
  mutate(t_desk=case_when(
    desk=="National" | 
      desk=="Investigative" |
      desk=="Foreign" |
      desk=="Financial" |
      desk=="Editorial" ~ "Higher",
    TRUE ~ "Lower")) %>% 
  group_by(t_desk, gender) %>% 
  summarise(
    count = length(current_base_pay),
    median = median(current_base_pay, na.rm = FALSE)
  ) %>% 
  mutate(percent=count/sum(count)*100)%>% 
  kable()
t_desk gender count median percent
Higher Female 108 135780.00 45.0000000
Higher Male 132 154535.00 55.0000000
Lower Female 228 92214.84 54.6762590
Lower Male 188 101042.43 45.0839329
Lower Prefer not to disclose 1 106340.00 0.2398082
#40 percent of employees were White men, 27 percent were White women, 12 percent were women of color, and 8 percent were men of color (the rest did not indicate race). 

df %>% 
  filter(dept=="News") %>% 
  filter(pay_rate_type=="Salaried") %>% 
  mutate(t_desk=case_when(
    desk=="National" | 
      desk=="Investigative" |
      desk=="Foreign" |
      desk=="Financial" |
      desk=="Editorial" ~ "Higher",
    TRUE ~ "Lower")) %>% 
  group_by(t_desk, race_grouping, gender) %>% 
  summarise(
    count = length(current_base_pay),
    median = median(current_base_pay, na.rm = FALSE)
  ) %>% 
  mutate(percent=count/sum(count)*100)%>% 
  kable()
t_desk race_grouping gender count median percent
Higher person of color Female 28 132500.0 58.3333333
Higher person of color Male 20 135050.0 41.6666667
Higher unknown Female 16 133170.0 50.0000000
Higher unknown Male 16 140840.0 50.0000000
Higher white Female 64 137384.8 40.0000000
Higher white Male 96 166841.8 60.0000000
Lower person of color Female 81 91060.0 60.9022556
Lower person of color Male 52 94890.0 39.0977444
Lower unknown Female 4 87780.0 57.1428571
Lower unknown Male 3 98340.0 42.8571429
Lower white Female 143 95000.0 51.6245487
Lower white Male 133 103400.0 48.0144404
Lower white Prefer not to disclose 1 106340.0 0.3610108