Post Guild Pay Study: Retention and Newsroom

Retention

at20 <- read_csv("raw_data/active_wd_2020.csv") %>% 
  select(employee_id, hire_date) %>% 
  mutate(year20=T)

active_21 <- read_csv("raw_data/active_wd_2021.csv")  %>% 
  select(employee_id=`Employee ID`, hire_date=`Hire Date`) %>% 
  mutate(hire_date=mdy(hire_date))

terminated_21 <- read_csv("raw_data/terminated_wd_2021.csv") %>% 
  select(employee_id=`Employee ID`, hire_date=`Hire Date`) %>% 
  mutate(hire_date=mdy(hire_date))

at21 <- rbind(active_21, terminated_21)  %>% 
  mutate(year21=T)

at2122 <- anti_join(at20, at21)
cat(paste0("Missing employees between 2020 and 2021 data: ", nrow(at2122)))

## Missing employees between 2020 and 2021 data: 23

df <- read_csv("raw_data/active_wd_2021.csv")

colnames(df) <- c("department","employee_id","gender",   
                  "race_ethnicity","education", "military_status",
                  "date_of_birth", "original_hire_date","hire_date","pay_rate_type", "current_base_pay", 
                  "job_profile_current","time_type_current", "cost_center_current",  
                  "effective_date1", "business_process_type1", "business_process_reason1", "pay_rate_type1", "base_pay_change1", "job_profile1", "time_type1", "cost_center1", 
                  "effective_date2", "business_process_type2", "business_process_reason2", "pay_rate_type2", "base_pay_change2", "job_profile2", "time_type2", "cost_center2", 
                  "effective_date3", "business_process_type3", "business_process_reason3", "pay_rate_type3", "base_pay_change3", "job_profile3", "time_type3", "cost_center3",
                  "effective_date4", "business_process_type4", "business_process_reason4", "pay_rate_type4", "base_pay_change4", "job_profile4", "time_type4", "cost_center4", 
                  "effective_date5", "business_process_type5", "business_process_reason5", "pay_rate_type5", "base_pay_change5", "job_profile5", "time_type5", "cost_center5", 
                  "effective_date6", "business_process_type6", "business_process_reason6", "pay_rate_type6", "base_pay_change6", "job_profile6", "time_type6", "cost_center6",
                  "effective_date7", "business_process_type7", "business_process_reason7", "pay_rate_type7", "base_pay_change7", "job_profile7", "time_type7", "cost_center7", 
                  "effective_date8", "business_process_type8", "business_process_reason8", "pay_rate_type8", "base_pay_change8", "job_profile8", "time_type8", "cost_center8", 
                  "effective_date9", "business_process_type9", "business_process_reason9", "pay_rate_type9", "base_pay_change9", "job_profile9", "time_type9", "cost_center9",
                  "effective_date10", "business_process_type10", "business_process_reason10", "pay_rate_type10", "base_pay_change10", "job_profile10", "time_type10", "cost_center10",
                  "effective_date11", "business_process_type11", "business_process_reason11", "pay_rate_type11", "base_pay_change11", "job_profile11", "time_type11", "cost_center11",
                  "effective_date12", "business_process_type12", "business_process_reason12", "pay_rate_type12", "base_pay_change12", "job_profile12", "time_type12", "cost_center12", 
                  "effective_date13", "business_process_type13", "business_process_reason13", "pay_rate_type13", "base_pay_change13", "job_profile13", "time_type13", "cost_center13", 
                  "effective_date14", "business_process_type14", "business_process_reason14", "pay_rate_type14", "base_pay_change14", "job_profile14", "time_type14", "cost_center14",
                  "effective_date15", "business_process_type15", "business_process_reason15", "pay_rate_type15", "base_pay_change15", "job_profile15", "time_type15", "cost_center15", 
                  "effective_date16", "business_process_type16", "business_process_reason16", "pay_rate_type16", "base_pay_change16", "job_profile16", "time_type16", "cost_center16", 
                  "effective_date17", "business_process_type17", "business_process_reason17", "pay_rate_type17", "base_pay_change17", "job_profile17", "time_type17", "cost_center17", 
                  "effective_date18", "business_process_type18", "business_process_reason18", "pay_rate_type18", "base_pay_change18", "job_profile18", "time_type18", "cost_center18", 
                  "effective_date19", "business_process_type19", "business_process_reason19", "pay_rate_type19", "base_pay_change19", "job_profile19", "time_type19", "cost_center19", 
                  "effective_date20", "business_process_type20", "business_process_reason20", "pay_rate_type20", "base_pay_change20", "job_profile20", "time_type20", "cost_center20", 
                  "effective_date21", "business_process_type21", "business_process_reason21", "pay_rate_type21", "base_pay_change21", "job_profile21", "time_type21", "cost_center21", 
                  "effective_date22", "business_process_type22", "business_process_reason22", "pay_rate_type22", "base_pay_change22", "job_profile22", "time_type22", "cost_center22", 
                  "effective_date23", "business_process_type23", "business_process_reason23", "pay_rate_type23", "base_pay_change23", "job_profile23", "time_type23", "cost_center23", 
                  "effective_date24", "business_process_type24", "business_process_reason24", "pay_rate_type24", "base_pay_change24", "job_profile24", "time_type24", "cost_center24", 
                  "effective_date25", "business_process_type25", "business_process_reason25", #"pay_rate_type_25", "base_pay_change25", 
                  "job_profile25", #"time_type25", 
                  "cost_center25", 
                  "effective_date_26", "business_process_type_26", "business_process_reason_26", #"pay_rate_type_26", "base_pay_change_26", 
                  "job_profile_26", #"time_type_26", 
                  "cost_center_26", "2008_annual_performance_rating", "2009_annual_performance_rating", "2010_annual_performance_rating",
                  "2011_annual_performance_rating", "2012_annual_performance_rating", "2013_annual_performance_rating",
                  "2014_annual_performance_rating", "2015_annual_performance_rating", "2016_annual_performance_rating",
                  "2017_annual_performance_rating", "2018_annual_performance_rating", "2019_annual_performance_rating",
                  "2020_annual_performance_rating")

for (i in 1:26) {
  
  df_year <- df %>% 
    select(department, employee_id, gender, race_ethnicity,
           military_status, date_of_birth,
           original_hire_date, hire_date, pay_rate_type,
           current_base_pay,
           job_profile_current, time_type_current, cost_center_current,
           ends_with(paste0("e", as.character(i))),
           ends_with(paste0("n", as.character(i))),
           ends_with(paste0("r", as.character(i)))
    ) %>% 
    rename(pay_rate_type_current=pay_rate_type) %>% 
    mutate(termination_date=ymd("1900-01-01"))
  
  colnames(df_year) <- gsub(paste0("n", as.character(i)), "n", colnames(df_year))
  colnames(df_year) <- gsub(paste0("e", as.character(i)), "e", colnames(df_year))
  colnames(df_year) <- gsub(paste0("r", as.character(i)), "r", colnames(df_year))
  

  if(!"effective_date" %in% colnames(df_year)) {
    df_year$effective_date <- NA
  }
  
  
  if(!"business_process_type" %in% colnames(df_year)) {
    df_year$business_process_type <- NA
  }
  
  
  if(!"business_process_reason" %in% colnames(df_year)) {
    df_year$business_process_reason <- NA
  }
  
  
  if(!"pay_rate_type" %in% colnames(df_year)) {
    df_year$pay_rate_type <- NA
  }
  
  
  if(!"base_pay_change" %in% colnames(df_year)) {
    df_year$base_pay_change <- NA
  }
  
  
  if(!"job_profile" %in% colnames(df_year)) {
    df_year$job_profile <- NA
  }
  
  if(!"time_type" %in% colnames(df_year)) {
    df_year$time_type<- NA
  }
  
  
  if(!"cost_center" %in% colnames(df_year)) {
    df_year$cost_center<- NA
  }
  
  df_year <- df_year %>% 
    select(department, employee_id, gender, race_ethnicity,
           military_status,
           date_of_birth,
           original_hire_date, hire_date, termination_date,
           pay_rate_type_current,
           current_base_pay,
           job_profile_current, time_type_current, cost_center_current,
           effective_date,
           business_process_type, business_process_reason, pay_rate_type,
           base_pay_change, job_profile, time_type, cost_center)
  df_year$workday <- i
  
  if (i==1) {
    df_mega <- df_year
  } else {
    df_mega <- rbind(df_mega, df_year)
  }
  
}

df_mega <- filter(df_mega, !is.na(cost_center))

data_date <- ymd("2021-05-28")
 
df_mega <- df_mega %>% mutate(date_of_birth = mdy(date_of_birth),
                    age = floor(decimal_date(data_date) - decimal_date(date_of_birth)))


df_mega <- df_mega %>% 
  mutate(date_of_birth = case_when(
    age < 0 ~ date_of_birth-years(100),
    TRUE ~ date_of_birth),
    age = floor(decimal_date(data_date) - decimal_date(date_of_birth)))

df_mega <- df_mega %>% mutate(hire_date = mdy(hire_date),
                    years_of_service = floor(decimal_date(data_date) - decimal_date(hire_date)))

df_mega <- df_mega %>% 
  mutate(hire_date = case_when(
    years_of_service < 0 ~ hire_date-years(100),
    TRUE ~ hire_date),
    years_of_service = case_when(
      years_of_service < 0 ~ floor(decimal_date(data_date) - decimal_date(hire_date)),
      TRUE ~ years_of_service
    ))

df_mega <- df_mega %>% 
  mutate(original_hire_date=mdy(original_hire_date),
         termination_date=ymd(termination_date),
         effective_date=mdy(effective_date))


df_mega <- df_mega %>% 
  mutate(age_group_5=case_when(
    age < 25 ~ "<25",
    age >= 25 & age < 30 ~ "25-29",
    age >= 30 & age < 35 ~ "30-34",
    age >= 35 & age < 40 ~ "35-39",
    age >= 40 & age < 45 ~ "40-44",
    age >= 45 & age < 50 ~ "45-49",
    age >= 50 & age < 55 ~ "50-54",
    age >= 55 & age < 60 ~ "55-59",
    age >= 60 & age < 65 ~ "60-64",
    age >= 65 ~ "65+",
    TRUE ~ NA_character_))


df_mega <- df_mega %>% 
  mutate(age_group_10=case_when(
    age < 25 ~ "<25",
    age >= 25 & age < 35 ~ "25-34",
    age >= 35 & age < 45 ~ "35-44",
    age >= 45 & age < 55 ~ "45-55",
    age >= 55 & age < 65 ~ "55-64",
    age >= 65 ~ "65+",
    TRUE ~ NA_character_))


df_mega <- df_mega %>% 
  mutate(years_of_service_grouped=case_when(
    years_of_service==0 ~ "0",
    years_of_service>=1  & years_of_service<3 ~ "1-2",
    years_of_service>=3  & years_of_service<3 ~ "3-5",
    years_of_service>=6  & years_of_service<3 ~ "6-10",
    years_of_service>=11  & years_of_service<3 ~ "11-15",
    years_of_service>=16  & years_of_service<3 ~ "16-20",
    years_of_service>=21  & years_of_service<3 ~ "21-25",
    years_of_service>=25 ~ "25+",
    TRUE ~ NA_character_))


df_mega <- df_mega %>%
  mutate(dept=case_when(
    department == 'News' ~ 'News', 
    department == 'Editorial' ~ 'News',
    department == 'Client Solutions' ~ 'Commercial',
    department == 'Circulation' ~ 'Commercial',
    department == 'Finance' ~ 'Commercial',
    department == 'Marketing' ~ 'Commercial',
    department == 'WP News Media Services' ~ 'Commercial',
    department == 'Production' ~ 'Commercial',
    department == 'Public Relations' ~ 'Commercial',
    department == 'Administration' ~ 'Commercial',
    department == 'Product'~ 'Commercial',
    TRUE ~ 'Other'))

df_mega <- df_mega %>%
  mutate(desk=case_when(
    cost_center_current == '110000 News Operations' ~ 'Operations',
    cost_center_current == '110001 News Digital Operations' ~ 'Operations',
    cost_center_current == '110610 Audience Development and Engagement' ~ 'Audience Development and Engagement',
    cost_center_current == '110620 News Audio' ~ 'Audio',
    cost_center_current == '110604 Presentation Design' ~ 'Design',
    cost_center_current == '110605 Presentation' ~ 'Photography',
    cost_center_current == '110664 News National Apps' ~ 'Emerging News Products',
    cost_center_current == '110665 News The Lily' ~ 'Emerging News Products',
    cost_center_current == '110666 News Snapchat' ~ 'Emerging News Products',
    cost_center_current == '110667 News By The Way' ~ 'Emerging News Products',
    cost_center_current == '113210 Economy and Business' ~ 'Financial',
    cost_center_current == '114000 Foreign Administration' ~ 'Foreign',
    cost_center_current == '114095 News Foreign Brazil' ~ 'Foreign',
    cost_center_current == '114100 Foreign Latam' ~ 'Foreign',
    cost_center_current == '114220 News Foreign Istanbul' ~ 'Foreign',
    cost_center_current == '114235 Foreign Western Europe' ~ 'Foreign',
    cost_center_current == '114300 News Foreign West Africa' ~ 'Foreign',
    cost_center_current == '114415 Foreign Hong Kong' ~ 'Foreign',
    cost_center_current == '114405 Foreign Beijing Bureau' ~ 'Foreign',
    cost_center_current == '114105 Foreign Mexico Bureau' ~ 'Foreign',
    cost_center_current == '114005 Foreign Beirut Bureau' ~ 'Foreign',
    cost_center_current == '114400 Foreign India Bureau' ~ 'Foreign',
    cost_center_current == '114410 Foreign Tokyo Bureau' ~ 'Foreign',
    cost_center_current == '114205 Foreign Islamabad Bureau' ~ 'Foreign',
    cost_center_current == '114305 Foreign Nairobi Bureau' ~ 'Foreign',
    cost_center_current == '114240 Foreign Rome Bureau' ~ 'Foreign',
    cost_center_current == '114200 Foreign London Bureau' ~ 'Foreign',
    cost_center_current == '114230 Foreign Moscow Bureau' ~ 'Foreign',
    cost_center_current == '114225 Foreign Cairo Bureau' ~ 'Foreign',
    cost_center_current == '114215 Foreign Berlin Bureau' ~ 'Foreign',
    cost_center_current == "114310 Foreign Baghdad Bureau" ~ "Foreign",
    cost_center_current == "114315 Foreign Jerusalem Bureau" ~ "Foreign",
    cost_center_current == '110603 Presentation Graphics' ~ 'Graphics',
    cost_center_current == '110450 Investigative' ~ 'Investigative',
    cost_center_current == '112300 Local Politics and Government' ~ 'Local',
    cost_center_current == '110601 Multiplatform Desk' ~ 'Multiplatform',
    cost_center_current == '110500 Magazine' ~ 'National',
    cost_center_current == '113200 National Politics and Government' ~ 'National',
    cost_center_current == '113205 National Security' ~ 'National',
    cost_center_current == '113215 News National Health & Science' ~ 'National',
    cost_center_current == '113220 National Enterprise' ~ 'National',
    cost_center_current == '113235 National America' ~ 'National',
    cost_center_current == '113240 News National Environment' ~ 'National',
    cost_center_current == '110006 News Content & Research' ~ 'News Content and Research',
    cost_center_current == '110455 News Logistics' ~ 'News Logistics',
    cost_center_current == '110410 Book World' ~ 'Outlook',
    cost_center_current == '110460 Outlook' ~ 'Outlook',
    cost_center_current == '110475 Polling' ~ 'Polling',
    cost_center_current == '110015 Sports Main' ~ 'Sports',
    cost_center_current == '110300 Style' ~ 'Style',
    cost_center_current == '110435 Food' ~ 'Style',
    cost_center_current == '110485 Travel' ~ 'Style',
    cost_center_current == '110495 Local Living' ~ 'Style',
    cost_center_current == '110505 Weekend' ~ 'Style',
    cost_center_current == '110600 Universal Desk' ~ 'Universal Desk',
    cost_center_current == '110652 News Video - General' ~ 'Video',
    cost_center_current == '110663 Wake Up Report' ~ 'Other',
    cost_center_current == '115000 Editorial Administration' ~ 'Editorial',
    TRUE ~ 'non-newsroom'))


df_mega <- df_mega %>%
  mutate(tier=case_when(
    desk == 'National' ~ 'Tier 1',
    desk == 'Foreign' ~ 'Tier 1',
    desk == 'Financial' ~ 'Tier 1',
    desk == 'Investigative' ~ 'Tier 1',
    desk == 'Style' ~ 'Tier 2',
    desk == 'Local' ~ 'Tier 2',
    desk == 'Graphics' ~ 'Tier 2',
    desk == 'Universal Desk' ~ 'Tier 2',
    desk == 'Sports' ~ 'Tier 2',
    desk == 'Outlook' ~ 'Tier 2',
    desk == 'Editorial' ~ 'Tier 2',
    desk == 'Audio' ~ 'Tier 3',
    desk == 'Polling' ~ 'Tier 3',
    desk == 'Design' ~ 'Tier 3',
    desk == 'Operations' ~ 'Tier 3',
    desk == 'Multiplatform' ~ 'Tier 3',
    desk == 'Video' ~ 'Tier 3',
    desk == 'Audience Development and Engagement' ~ 'Tier 3',
    desk == 'News Logistics' ~ 'Tier 4',
    desk == 'News Content and Research' ~ 'Tier 4',
    desk == 'Emerging News Products' ~ 'Tier 4',
    desk == 'Other' ~ 'Tier 4',
    TRUE ~ 'Other'))


df_mega <- df_mega %>%
  mutate(race_grouping=case_when(
    race_ethnicity == 'White (United States of America)' ~ 'white',
    race_ethnicity == 'Black or African American (United States of America)' ~ 'person of color',
    race_ethnicity == 'Asian (United States of America)' ~ 'person of color',
    race_ethnicity == 'Hispanic or Latino (United States of America)' ~ 'person of color',
    race_ethnicity == 'Two or More Races (United States of America)' ~ 'person of color',
    race_ethnicity == 'American Indian or Alaska Native (United States of America)' ~ 'person of color',
    race_ethnicity == 'Native Hawaiian or Other Pacific Islander (United States of America)' ~ 'person of color',
    TRUE ~ 'unknown'))


mega_df_columns <- data.frame(colnames(df_mega))

df_columns <- data.frame(colnames(df))

df2 <- read_csv("raw_data/terminated_wd_2021.csv")

colnames(df2) <- c("department","employee_id","gender",   
                   "race_ethnicity","education", "military_status",
                   "date_of_birth", "original_hire_date","hire_date", "termination_date",
                   "pay_rate_type", "current_base_pay", 
                   "job_profile_current","time_type_current", "cost_center_current",  
                   "effective_date1", "business_process_type1", "business_process_reason1", "pay_rate_type1", "base_pay_change1", "job_profile1", "time_type1", "cost_center1", 
                   "effective_date2", "business_process_type2", "business_process_reason2", "pay_rate_type2", "base_pay_change2", "job_profile2", "time_type2", "cost_center2", 
                   "effective_date3", "business_process_type3", "business_process_reason3", "pay_rate_type3", "base_pay_change3", "job_profile3", "time_type3", "cost_center3",
                   "effective_date4", "business_process_type4", "business_process_reason4", "pay_rate_type_4", "base_pay_change4", "job_profile4", "time_type4", "cost_center4", 
                   "effective_date5", "business_process_type5", "business_process_reason5", "pay_rate_type_5", "base_pay_change5", "job_profile5", "time_type5", "cost_center5", 
                   "effective_date6", "business_process_type6", "business_process_reason6", "pay_rate_type_6", "base_pay_change6", "job_profile6", "time_type6", "cost_center6",
                   "effective_date7", "business_process_type7", "business_process_reason7", "pay_rate_type_7", "base_pay_change7", "job_profile7", "time_type7", "cost_center7", 
                   "effective_date8", "business_process_type8", "business_process_reason8", "pay_rate_type_8", "base_pay_change8", "job_profile8", "time_type8", "cost_center8", 
                   "effective_date9", "business_process_type9", "business_process_reason9", "pay_rate_type_9", "base_pay_change9", "job_profile9", "time_type9", "cost_center_9",
                   "effective_date10", "business_process_type10", "business_process_reason10", "pay_rate_type10", "base_pay_change10", "job_profile10", "time_type10", "cost_center10",
                   "effective_date11", "business_process_type11", "business_process_reason11", "pay_rate_type11", "base_pay_change11", "job_profile11", "time_type11", "cost_center11",
                   "effective_date12", "business_process_type12", "business_process_reason12", "pay_rate_type12", "base_pay_change12", "job_profile12", "time_type12", "cost_center12", 
                   "effective_date13", "business_process_type13", "business_process_reason13", "pay_rate_type13", "base_pay_change13", "job_profile13", "time_type13", "cost_center13", 
                   "effective_date14", "business_process_type14", "business_process_reason14", "pay_rate_type14", "base_pay_change14", "job_profile14", "time_type14", "cost_center14",
                   "effective_date15", "business_process_type15", "business_process_reason15", "pay_rate_type15", "base_pay_change15", "job_profile15", "time_type15", "cost_center15", 
                   "2008_annual_performance_rating", "2009_annual_performance_rating", "2010_annual_performance_rating",
                   "2011_annual_performance_rating", "2012_annual_performance_rating", "2013_annual_performance_rating",
                   "2014_annual_performance_rating", "2015_annual_performance_rating", "2016_annual_performance_rating",
                   "2017_annual_performance_rating", "2018_annual_performance_rating", "2019_annual_performance_rating",
                   "2020_annual_performance_rating")



for (i in 1:15) {
  
  df2_year <- df2 %>% 
    select(department, employee_id, gender, race_ethnicity,
           military_status, date_of_birth,
           original_hire_date, hire_date, termination_date, 
           pay_rate_type,
           current_base_pay,
           job_profile_current, time_type_current, cost_center_current,
           ends_with(paste0("e", as.character(i))),
           ends_with(paste0("n", as.character(i))),
           ends_with(paste0("r", as.character(i)))
    ) %>% 
    rename(pay_rate_type_current=pay_rate_type) 
  
  colnames(df2_year) <- gsub(paste0("n", as.character(i)), "n", colnames(df2_year))
  colnames(df2_year) <- gsub(paste0("e", as.character(i)), "e", colnames(df2_year))
  colnames(df2_year) <- gsub(paste0("r", as.character(i)), "r", colnames(df2_year))
  
  if(!"effective_date" %in% colnames(df2_year)) {
    df2_year$effective_date <- NA
  }
  
  
  if(!"business_process_type" %in% colnames(df2_year)) {
    df2_year$business_process_type <- NA
  }
  
  
  if(!"business_process_reason" %in% colnames(df2_year)) {
    df2_year$business_process_reason <- NA
  }
  
  
  if(!"pay_rate_type" %in% colnames(df2_year)) {
    df2_year$pay_rate_type <- NA
  }
  
  
  if(!"base_pay_change" %in% colnames(df2_year)) {
    df2_year$base_pay_change <- NA
  }
  
  
  if(!"job_profile" %in% colnames(df2_year)) {
    df2_year$job_profile <- NA
  }
  
  if(!"time_type" %in% colnames(df2_year)) {
    df2_year$time_type<- NA
  }
  
  
  if(!"cost_center" %in% colnames(df2_year)) {
    df2_year$cost_center<- NA
  }
  
  df2_year <- df2_year %>% 
    select(department, employee_id, gender, military_status,
           race_ethnicity,
           date_of_birth,
           original_hire_date, hire_date, termination_date,
           pay_rate_type_current,
           current_base_pay,
           job_profile_current, time_type_current, cost_center_current,
           effective_date,
           business_process_type, business_process_reason, pay_rate_type,
           base_pay_change, job_profile, time_type, cost_center)
  df2_year$workday <- i
  
  if (i==1) {
    df2_mega <- df2_year
  } else {
    df2_mega <- rbind(df2_mega, df2_year)
  }
  
}


data_date <- ymd("2021-05-28")

df2_mega <- df2_mega %>% mutate(date_of_birth = mdy(date_of_birth),
                              age = floor(decimal_date(data_date) - decimal_date(date_of_birth)))


df2_mega <- df2_mega %>% 
  mutate(date_of_birth = case_when(
    age < 0 ~ date_of_birth-years(100),
    TRUE ~ date_of_birth),
    age = floor(decimal_date(data_date) - decimal_date(date_of_birth)))



df2_mega <- df2_mega %>% mutate(hire_date = mdy(hire_date),
                              years_of_service = floor(decimal_date(data_date) - decimal_date(hire_date)))

df2_mega <- df2_mega %>% 
  mutate(hire_date = case_when(
    years_of_service < 0 ~ hire_date-years(100),
    TRUE ~ hire_date),
    years_of_service = case_when(
      years_of_service < 0 ~ floor(decimal_date(data_date) - decimal_date(hire_date)),
      TRUE ~ years_of_service
    ))

df2_mega <- df2_mega %>% 
  mutate(original_hire_date=mdy(original_hire_date),
         termination_date=mdy(termination_date),
         effective_date=mdy(effective_date))

df2_mega <- df2_mega %>% 
  mutate(age_group_5=case_when(
    age < 25 ~ "<25",
    age >= 25 & age < 30 ~ "25-29",
    age >= 30 & age < 35 ~ "30-34",
    age >= 35 & age < 40 ~ "35-39",
    age >= 40 & age < 45 ~ "40-44",
    age >= 45 & age < 50 ~ "45-49",
    age >= 50 & age < 55 ~ "50-54",
    age >= 55 & age < 60 ~ "55-59",
    age >= 60 & age < 65 ~ "60-64",
    age >= 65 ~ "65+",
    TRUE ~ NA_character_))

df2_mega <- df2_mega %>% 
  mutate(age_group_10=case_when(
    age < 25 ~ "<25",
    age >= 25 & age < 35 ~ "25-34",
    age >= 35 & age < 45 ~ "35-44",
    age >= 45 & age < 55 ~ "45-55",
    age >= 55 & age < 65 ~ "55-64",
    age >= 65 ~ "65+",
    TRUE ~ NA_character_))

df2_mega <- df2_mega %>% 
  mutate(years_of_service_grouped=case_when(
    years_of_service==0 ~ "0",
    years_of_service>=1  & years_of_service<3 ~ "1-2",
    years_of_service>=3  & years_of_service<3 ~ "3-5",
    years_of_service>=6  & years_of_service<3 ~ "6-10",
    years_of_service>=11  & years_of_service<3 ~ "11-15",
    years_of_service>=16  & years_of_service<3 ~ "16-20",
    years_of_service>=21  & years_of_service<3 ~ "21-25",
    years_of_service>=25 ~ "25+",
    TRUE ~ NA_character_))


df2_mega <- df2_mega %>%
  mutate(dept=case_when(
    department == 'News' ~ 'News', 
    department == 'Editorial' ~ 'News',
    department == 'Client Solutions' ~ 'Commercial',
    department == 'Circulation' ~ 'Commercial',
    department == 'Finance' ~ 'Commercial',
    department == 'Marketing' ~ 'Commercial',
    department == 'WP News Media Services' ~ 'Commercial',
    department == 'Production' ~ 'Commercial',
    department == 'Public Relations' ~ 'Commercial',
    department == 'Administration' ~ 'Commercial',
    department == 'Product'~ 'Commercial',
    TRUE ~ 'Other'))


df2_mega <- df2_mega %>%
  mutate(desk=case_when(
    cost_center_current == '110000 News Operations' ~ 'Operations',
    cost_center_current == '110001 News Digital Operations' ~ 'Operations',
    cost_center_current == '110610 Audience Development and Engagement' ~ 'Audience Development and Engagement',
    cost_center_current == '110620 News Audio' ~ 'Audio',
    cost_center_current == '110604 Presentation Design' ~ 'Design',
    cost_center_current == '110605 Presentation' ~ 'Photography',
    cost_center_current == '110664 News National Apps' ~ 'Emerging News Products',
    cost_center_current == '110665 News The Lily' ~ 'Emerging News Products',
    cost_center_current == '110666 News Snapchat' ~ 'Emerging News Products',
    cost_center_current == '110667 News By The Way' ~ 'Emerging News Products',
    cost_center_current == '113210 Economy and Business' ~ 'Financial',
    cost_center_current == '114000 Foreign Administration' ~ 'Foreign',
    cost_center_current == '114095 News Foreign Brazil' ~ 'Foreign',
    cost_center_current == '114100 Foreign Latam' ~ 'Foreign',
    cost_center_current == '114220 News Foreign Istanbul' ~ 'Foreign',
    cost_center_current == '114235 Foreign Western Europe' ~ 'Foreign',
    cost_center_current == '114300 News Foreign West Africa' ~ 'Foreign',
    cost_center_current == '114415 Foreign Hong Kong' ~ 'Foreign',
    cost_center_current == '114405 Foreign Beijing Bureau' ~ 'Foreign',
    cost_center_current == '114105 Foreign Mexico Bureau' ~ 'Foreign',
    cost_center_current == '114005 Foreign Beirut Bureau' ~ 'Foreign',
    cost_center_current == '114400 Foreign India Bureau' ~ 'Foreign',
    cost_center_current == '114410 Foreign Tokyo Bureau' ~ 'Foreign',
    cost_center_current == '114205 Foreign Islamabad Bureau' ~ 'Foreign',
    cost_center_current == '114305 Foreign Nairobi Bureau' ~ 'Foreign',
    cost_center_current == '114240 Foreign Rome Bureau' ~ 'Foreign',
    cost_center_current == '114200 Foreign London Bureau' ~ 'Foreign',
    cost_center_current == '114230 Foreign Moscow Bureau' ~ 'Foreign',
    cost_center_current == '114225 Foreign Cairo Bureau' ~ 'Foreign',
    cost_center_current == '114215 Foreign Berlin Bureau' ~ 'Foreign',
    cost_center_current == "114310 Foreign Baghdad Bureau" ~ "Foreign",
    cost_center_current == "114315 Foreign Jerusalem Bureau" ~ "Foreign",
    cost_center_current == '110603 Presentation Graphics' ~ 'Graphics',
    cost_center_current == '110450 Investigative' ~ 'Investigative',
    cost_center_current == '112300 Local Politics and Government' ~ 'Local',
    cost_center_current == '110601 Multiplatform Desk' ~ 'Multiplatform',
    cost_center_current == '110500 Magazine' ~ 'National',
    cost_center_current == '113200 National Politics and Government' ~ 'National',
    cost_center_current == '113205 National Security' ~ 'National',
    cost_center_current == '113215 News National Health & Science' ~ 'National',
    cost_center_current == '113220 National Enterprise' ~ 'National',
    cost_center_current == '113235 National America' ~ 'National',
    cost_center_current == '113240 News National Environment' ~ 'National',
    cost_center_current == '110006 News Content & Research' ~ 'News Content and Research',
    cost_center_current == '110455 News Logistics' ~ 'News Logistics',
    cost_center_current == '110410 Book World' ~ 'Outlook',
    cost_center_current == '110460 Outlook' ~ 'Outlook',
    cost_center_current == '110475 Polling' ~ 'Polling',
    cost_center_current == '110015 Sports Main' ~ 'Sports',
    cost_center_current == '110300 Style' ~ 'Style',
    cost_center_current == '110435 Food' ~ 'Style',
    cost_center_current == '110485 Travel' ~ 'Style',
    cost_center_current == '110495 Local Living' ~ 'Style',
    cost_center_current == '110505 Weekend' ~ 'Style',
    cost_center_current == '110600 Universal Desk' ~ 'Universal Desk',
    cost_center_current == '110652 News Video - General' ~ 'Video',
    cost_center_current == '110663 Wake Up Report' ~ 'Other',
    cost_center_current == '115000 Editorial Administration' ~ 'Editorial',
    TRUE ~ 'non-newsroom'))


df2_mega <- df2_mega %>%
  mutate(tier=case_when(
    desk == 'National' ~ 'Tier 1',
    desk == 'Foreign' ~ 'Tier 1',
    desk == 'Financial' ~ 'Tier 1',
    desk == 'Investigative' ~ 'Tier 1',
    desk == 'Style' ~ 'Tier 2',
    desk == 'Local' ~ 'Tier 2',
    desk == 'Graphics' ~ 'Tier 2',
    desk == 'Universal Desk' ~ 'Tier 2',
    desk == 'Sports' ~ 'Tier 2',
    desk == 'Outlook' ~ 'Tier 2',
    desk == 'Editorial' ~ 'Tier 2',
    desk == 'Audio' ~ 'Tier 3',
    desk == 'Polling' ~ 'Tier 3',
    desk == 'Design' ~ 'Tier 3',
    desk == 'Operations' ~ 'Tier 3',
    desk == 'Multiplatform' ~ 'Tier 3',
    desk == 'Video' ~ 'Tier 3',
    desk == 'Audience Development and Engagement' ~ 'Tier 3',
    desk == 'News Logistics' ~ 'Tier 4',
    desk == 'News Content and Research' ~ 'Tier 4',
    desk == 'Emerging News Products' ~ 'Tier 4',
    desk == 'Other' ~ 'Tier 4',
    TRUE ~ 'Other'))


df2_mega <- df2_mega %>%
  mutate(race_grouping=case_when(
    race_ethnicity == 'White (United States of America)' ~ 'white',
    race_ethnicity == 'Black or African American (United States of America)' ~ 'person of color',
    race_ethnicity == 'Asian (United States of America)' ~ 'person of color',
    race_ethnicity == 'Hispanic or Latino (United States of America)' ~ 'person of color',
    race_ethnicity == 'Two or More Races (United States of America)' ~ 'person of color',
    race_ethnicity == 'American Indian or Alaska Native (United States of America)' ~ 'person of color',
    race_ethnicity == 'Native Hawaiian or Other Pacific Islander (United States of America)' ~ 'person of color',
    TRUE ~ 'unknown'))

df2_mega %>% count(year(termination_date))


df_mega$data <- "active"
df2_mega$data <- "terminated"
df3_mega <- rbind(df_mega, df2_mega)

df <- df3_mega

comes_goes <- df %>% 
  rename(data_type=data) %>% 
  mutate(termination_year=year(termination_date))

comes_goes$termination_year <- ifelse(comes_goes$termination_year==1900, NA, comes_goes$termination_year)

comes_goes$eff_year = ifelse(comes_goes$data_type=="active", 2022-comes_goes$years_of_service, comes_goes$termination_year)

comes_goes$data_year= year(comes_goes$effective_date)
comes_goes_active <- comes_goes %>% filter(data_type=="active")
comes_goes_active$year_type <- ifelse(year(comes_goes_active$hire_date)==year(comes_goes_active$effective_date), "hired", "active")
comes_goes_term <- comes_goes %>% filter(data_type=="terminated")

comes_goes_term <- comes_goes_term %>% 
  group_by(employee_id) %>% 
  arrange(employee_id, effective_date) %>% 
  mutate(row=row_number()) 

comes_goes_term_solo <- comes_goes_term %>% 
  filter(row_number()==n()) %>% 
  mutate(year_type="terminated")

comes_goes_term <- left_join(comes_goes_term, comes_goes_term_solo)
comes_goes_term$year_type <- ifelse(is.na(comes_goes_term$year_type), "active", comes_goes_term$year_type)
comes_goes_term$row <- NULL

comes_goes <- rbind(comes_goes_active, comes_goes_term)

comes_goes2 <- comes_goes %>% 
  group_by(employee_id, data_year) %>%
  arrange(employee_id, data_year, desc(year_type)) %>% 
  slice(1)

uniques <- comes_goes2 %>% 
  ungroup() %>% 
  select(employee_id, hire_date, termination_date, data_type, termination_year, data_year, year_type) %>% 
  unique() %>% 
  filter(year_type=="hired")

ids <- comes_goes %>% 
  ungroup() %>% 
  select(employee_id, hire_date, termination_date) %>% 
  unique() %>% 
  mutate(start_year=year(hire_date), end_year=year(termination_date))

ids$end_year <- ifelse(ids$end_year==1900, 2021, ids$end_year)


for(i in 1:nrow(ids)) {
  
  data_year <- ids$start_year[i]:ids$end_year[i]
  df_solo <- data.frame(data_year)
  df_solo$employee_id <- ids$employee_id[i]
  

  df_solo <- left_join(df_solo, uniques) %>% 
    select(employee_id, data_year, year_type)
  
  if (i==1) {
    df <- df_solo
  } else {
    df <- rbind(df, df_solo)
  }
  print(i)
}


cg_narrow <- comes_goes %>% 
  group_by(employee_id) %>% 
  arrange(employee_id, effective_date) %>% 
  filter(row_number()==n()) %>% 
  select(department, employee_id, gender, race_ethnicity,
         date_of_birth, original_hire_date, hire_date,
         termination_date, pay_rate_type_current,
         current_base_pay, job_profile_current,
         time_type_current, cost_center_current,
         cost_center, age, years_of_service,
         age_group_5, age_group_10, years_of_service_grouped,
         dept, desk, tier, race_grouping) %>% 
  unique()

# 
df<- df %>% 
  left_join(cg_narrow) %>% 
  mutate(year_type=case_when(
    year(termination_date)==data_year~ "terminated",
    TRUE ~ year_type
  ))

df$year_type <- ifelse(is.na(df$year_type), "active", df$year_type)

#### hires versus terminations by year (total company)

#From 2016 to 2021, The Post hired more women than men across the company. Much of that growth happened in the Newsroom.
df %>%   
  filter(data_year >=2016) %>% 
  count(gender, year_type)  %>%
  filter(year_type!="active") %>% 
  pivot_wider(names_from="year_type", values_from="n") %>% 
  kable()

gender	hired	terminated
Female	312	261
Male	203	201
Prefer not to disclose	1	1

#The number of women in Commercial, on the other hand, dramatically shrank.
#More than twice as many women as men in Commercial have departed. Our analysis showed that 2019 was a standout year for turnover among female staffers in Commercial, with nearly 40 women leaving The Post.
df %>% 
  filter(data_year >=2016) %>% 
  count(dept, gender, year_type) %>% 
  filter(year_type!="active") %>% 
  pivot_wider(names_from="year_type", values_from="n") %>% 
  mutate(category=case_when(
    terminated==hired ~ "Same",
    hired > terminated ~ "More hired than left",
    terminated > hired ~ "More left than hired",
    TRUE ~ ""
  )) %>% 
  kable()

dept	gender	hired	terminated	category
Commercial	Female	81	127	More left than hired
Commercial	Male	35	64	More left than hired
News	Female	215	115	More hired than left
News	Male	154	89	More hired than left
News	Prefer not to disclose	1	1	Same
Other	Female	16	19	More left than hired
Other	Male	14	48	More left than hired

#From 2016 to 2021, more workers of color left in Commercial than were hired, with the exception of Hispanic or Latino workers, whose numbers remained consistent. Demographic groups of fewer than five were excluded to protect individuals’ privacy.
df %>% 
  filter(data_year >=2016) %>% 
  count(dept, race_ethnicity, year_type) %>% 
  filter(year_type!="active") %>% 
  pivot_wider(names_from="year_type", values_from="n") %>% 
  filter(hired>=5 & terminated>=5) %>% 
  mutate(category=case_when(
    hired>terminated ~ "More hired than left",
    terminated>hired ~ "More left than hired",
    terminated==hired ~ "Same",
    TRUE ~ ""
  )) %>% 
  kable()

dept	race_ethnicity	hired	terminated	category
Commercial	Asian (United States of America)	14	18	More left than hired
Commercial	Black or African American (United States of America)	30	51	More left than hired
Commercial	Hispanic or Latino (United States of America)	6	7	More left than hired
Commercial	Two or More Races (United States of America)	5	8	More left than hired
Commercial	White (United States of America)	59	102	More left than hired
News	Asian (United States of America)	42	24	More hired than left
News	Black or African American (United States of America)	37	31	More hired than left
News	Hispanic or Latino (United States of America)	28	12	More hired than left
News	Two or More Races (United States of America)	17	5	More hired than left
News	White (United States of America)	221	127	More hired than left
Other	Black or African American (United States of America)	6	37	More left than hired
Other	White (United States of America)	19	23	More left than hired

#In the Newsroom, on the other hand, more workers were hired than left. For every employee that left, 1.8 employees were hired. 
df %>% 
  filter(data_year >=2016) %>% 
  count(dept,year_type) %>% 
  filter(year_type!="active") %>% 
  pivot_wider(names_from="year_type", values_from="n") %>% 
  mutate(category=case_when(
    hired>terminated ~ "More hired than left",
    terminated>hired ~ "More left than hired",
    terminated==hired ~ "Same",
    TRUE ~ ""
  )) %>% 
  kable()

dept	hired	terminated	category
Commercial	116	191	More left than hired
News	370	205	More hired than left
Other	30	67	More left than hired

#370/205 = 1.8

#However, that was not the case for Black employees. For every Black employee who left, only 1.2 Black employees were hired to replace them. 

df %>% 
  filter(data_year >=2016) %>% 
  count(dept, race_ethnicity, year_type) %>% 
  filter(year_type!="active") %>% 
  pivot_wider(names_from="year_type", values_from="n") %>% 
  filter(hired>=5 & terminated>=5) %>% 
  mutate(category=case_when(
    hired>terminated ~ "More hired than left",
    terminated>hired ~ "More left than hired",
    terminated==hired ~ "Same",
    TRUE ~ ""
  )) %>% 
  kable()

dept	race_ethnicity	hired	terminated	category
Commercial	Asian (United States of America)	14	18	More left than hired
Commercial	Black or African American (United States of America)	30	51	More left than hired
Commercial	Hispanic or Latino (United States of America)	6	7	More left than hired
Commercial	Two or More Races (United States of America)	5	8	More left than hired
Commercial	White (United States of America)	59	102	More left than hired
News	Asian (United States of America)	42	24	More hired than left
News	Black or African American (United States of America)	37	31	More hired than left
News	Hispanic or Latino (United States of America)	28	12	More hired than left
News	Two or More Races (United States of America)	17	5	More hired than left
News	White (United States of America)	221	127	More hired than left
Other	Black or African American (United States of America)	6	37	More left than hired
Other	White (United States of America)	19	23	More left than hired

#37/31 = 1.2

#A deeper analysis of the data shows that in 2020 alone more than 1 out of 3 workers who left the Newsroom were Black. Fewer than 1 in 5 of those hired that year were Black.
df %>% 
  filter(data_year ==2020) %>% 
  filter(dept=="News") %>% 
  count(dept, race_ethnicity, data_year, year_type) %>% 
  filter(year_type!="active") %>% 
  pivot_wider(names_from="year_type", values_from="n") %>% 
  #filter(hired>=5 & terminated>=5) %>% 
  mutate(category=case_when(
    hired>terminated ~ "More hired than left",
    terminated>hired ~ "More left than hired",
    terminated==hired ~ "Same",
    TRUE ~ ""
  )) %>% 
  kable()

dept	race_ethnicity	data_year	hired	terminated	category
News	Asian (United States of America)	2020	9	1	More hired than left
News	Black or African American (United States of America)	2020	9	10	More left than hired
News	Hispanic or Latino (United States of America)	2020	5	2	More hired than left
News	Prefer Not to Disclose (United States of America)	2020	1	1	Same
News	Two or More Races (United States of America)	2020	3	NA
News	White (United States of America)	2020	21	14	More hired than left
News	NA	2020	3	1	More hired than left

#10/29 = .34
#9/51 =.176 or 1/5=.2

#In 2017, only 17 percent of Newsroom hires were people of color. 
#In 2020 and in the first half of 2021, however, they made up 55 percent of new hires.  
view_this <-df %>% 
  filter(dept=="News") %>% 
  count(dept, race_grouping, data_year, year_type) %>% 
  filter(year_type!="active") %>% 
  pivot_wider(names_from="year_type", values_from="n") %>% 
  #filter(hired>=5 & terminated>=5) %>% 
   mutate(category=case_when(
    hired>terminated ~ "More hired than left",
    terminated>hired ~ "More left than hired",
    terminated==hired ~ "Same",
    TRUE ~ ""
  )) %>% 
  kable()

#12/(12+5+54)=.17


#Of the hires in 2020 and so far in 2021, 55 percent have been people of color. 

df %>% 
  filter(dept=="News") %>% 
  filter(data_year==2020 | data_year==2021) %>% 
  count(dept, race_grouping, year_type) %>% 
  filter(year_type!="active") %>% 
  pivot_wider(names_from="year_type", values_from="n") %>% 
  mutate(category=case_when(
    hired>terminated ~ "More hired than left",
    terminated>hired ~ "More left than hired",
    terminated==hired ~ "Same",
    TRUE ~ ""
  )) %>% 
  kable()

dept	race_grouping	hired	terminated	category
News	person of color	43	18	More hired than left
News	unknown	7	2	More hired than left
News	white	32	20	More hired than left

#43/(43+7+32) = .52
#18/(18+2+20) = .45

Newsroom

#In the Newsroom, journalists of color were, on average, hired at a younger age than White workers. 
df %>% 
  filter(year_type=="hired") %>% 
  filter(data_year>=2016) %>% 
  mutate(hire_age=data_year-year(date_of_birth)) %>% 
  filter(dept=="News") %>% 
  group_by(race_ethnicity) %>% 
  summarize(count=n(),
            `average age`=round(mean(hire_age)),
            `median age`=round(median(hire_age))
  ) %>% 
  filter(count>=5) %>% 
  mutate(race_ethnicity=gsub(" \\(United States of America\\)", "", race_ethnicity)) %>% 
  filter(!is.na(race_ethnicity))%>% 
  kable()

race_ethnicity	count	average age	median age
Asian	42	29	27
Black or African American	37	31	30
Hispanic or Latino	28	30	29
Prefer Not to Disclose	11	31	31
Two or More Races	17	28	27
White	221	33	31

#On the Commercial side, Black employees tend to be hired at an older age than employees of other races and ethnicities, with average and median hiring ages of 39 and 42, respectively. Asian employees, meanwhile, have the youngest average and median hiring ages at 27 and 26, respectively.
df %>% 
  filter(year_type=="hired") %>% 
  filter(data_year>=2016) %>% 
  mutate(hire_age=data_year-year(date_of_birth)) %>% 
  filter(dept=="Commercial") %>% 
  group_by(race_ethnicity) %>% 
  summarize(count=n(),
            `average age`=round(mean(hire_age)),
            `median age`=round(median(hire_age))
  ) %>% 
  filter(count>=5) %>% 
  mutate(race_ethnicity=gsub(" \\(United States of America\\)", "", race_ethnicity)) %>% 
  filter(!is.na(race_ethnicity))%>% 
  kable()

race_ethnicity	count	average age	median age
Asian	14	27	26
Black or African American	30	39	42
Hispanic or Latino	6	34	28
Two or More Races	5	33	31
White	59	32	28

#Turnover by desk chart
turnoverbydesk <- df %>% 
  filter(dept=="News") %>% 
  filter(pay_rate_type_current=="Salaried") %>% 
  filter(data_year>=2016) %>% 
  count(dept, desk, year_type) %>% 
  filter(year_type!="active") %>% 
  pivot_wider(names_from="year_type", values_from="n") %>% 
  filter(hired>=5 & terminated>=5) %>% 
  mutate(category=case_when(
    hired>terminated ~ "More hired than left",
    terminated>hired ~ "More left than hired",
    terminated==hired ~ "Same",
    TRUE ~ ""
  )) 

kable(turnoverbydesk)

dept	desk	hired	terminated	category
News	Audience Development and Engagement	24	8	More hired than left
News	Emerging News Products	22	10	More hired than left
News	Financial	28	19	More hired than left
News	Graphics	16	6	More hired than left
News	Local	21	10	More hired than left
News	Multiplatform	16	8	More hired than left
News	National	44	15	More hired than left
News	Sports	18	12	More hired than left
News	Video	36	12	More hired than left

#Within the Newsroom, White employees were often hired at a much higher rate than employees of color. In some sections, fewer than five employees of color were hired and therefore couldn’t even be counted within the data in order to protect workers’ privacy. This was the case for Local, Financial and Multiplatform, where 14, 22 and 13 White employees, respectively, were hired from 2016 to 2021.
#and chart
newsroom_desks <- df %>% 
  filter(dept=="News") %>% 
  filter(pay_rate_type_current=="Salaried") %>% 
  filter(data_year>=2016) %>% 
  count(race_grouping, desk, year_type) %>% 
  filter(year_type!="active") %>% 
  pivot_wider(names_from="year_type", values_from="n") %>% 
  filter(hired>=5 & terminated>=5) %>% 
  mutate(category=case_when(
    hired>terminated ~ "More hired than left",
    terminated>hired ~ "More left than hired",
    terminated==hired ~ "Same",
    TRUE ~ ""
  ))
kable(newsroom_desks)

race_grouping	desk	hired	terminated	category
person of color	Emerging News Products	6	6	Same
person of color	National	13	5	More hired than left
person of color	Sports	9	6	More hired than left
person of color	Video	15	5	More hired than left
white	Financial	22	11	More hired than left
white	Local	14	6	More hired than left
white	Multiplatform	13	6	More hired than left
white	National	28	10	More hired than left
white	Sports	9	6	More hired than left
white	Video	20	7	More hired than left

library(vroom)
df <-vroom("raw_data/active_wd_2021.csv")

colnames(df) <- c("department","employee_id","gender",   
"race_ethnicity","education", "military_status",
"date_of_birth", "original_hire_date","hire_date","pay_rate_type", "current_base_pay", 
"job_profile_current","time_type_current", "cost_center_current",  
"effective_date1", "business_process_type1", "business_process_reason1", "pay_rate_type1", "base_pay_change1", "job_profile1", "time_type1", "cost_center1", 
"effective_date2", "business_process_type2", "business_process_reason2", "pay_rate_type2", "base_pay_change2", "job_profile2", "time_type2", "cost_center2", 
"effective_date3", "business_process_type3", "business_process_reason3", "pay_rate_type3", "base_pay_change3", "job_profile3", "time_type3", "cost_center3",
"effective_date4", "business_process_type4", "business_process_reason4", "pay_rate_type4", "base_pay_change4", "job_profile4", "time_type4", "cost_center4", 
"effective_date5", "business_process_type5", "business_process_reason5", "pay_rate_type5", "base_pay_change5", "job_profile5", "time_type5", "cost_center5", 
"effective_date6", "business_process_type6", "business_process_reason6", "pay_rate_type6", "base_pay_change6", "job_profile6", "time_type6", "cost_center6",
"effective_date7", "business_process_type7", "business_process_reason7", "pay_rate_type7", "base_pay_change7", "job_profile7", "time_type7", "cost_center7", 
"effective_date8", "business_process_type8", "business_process_reason8", "pay_rate_type8", "base_pay_change8", "job_profile8", "time_type8", "cost_center8", 
"effective_date9", "business_process_type9", "business_process_reason9", "pay_rate_type9", "base_pay_change9", "job_profile9", "time_type9", "cost_center9",
"effective_date10", "business_process_type10", "business_process_reason10", "pay_rate_type10", "base_pay_change10", "job_profile10", "time_type10", "cost_center10",
"effective_date11", "business_process_type11", "business_process_reason11", "pay_rate_type11", "base_pay_change11", "job_profile11", "time_type11", "cost_center11",
"effective_date12", "business_process_type12", "business_process_reason12", "pay_rate_type12", "base_pay_change12", "job_profile12", "time_type12", "cost_center12", 
"effective_date13", "business_process_type13", "business_process_reason13", "pay_rate_type13", "base_pay_change13", "job_profile13", "time_type13", "cost_center13", 
"effective_date14", "business_process_type14", "business_process_reason14", "pay_rate_type14", "base_pay_change14", "job_profile14", "time_type14", "cost_center14",
"effective_date15", "business_process_type15", "business_process_reason15", "pay_rate_type15", "base_pay_change15", "job_profile15", "time_type15", "cost_center15", 
"effective_date16", "business_process_type16", "business_process_reason16", "pay_rate_type16", "base_pay_change16", "job_profile16", "time_type16", "cost_center16", 
"effective_date17", "business_process_type17", "business_process_reason17", "pay_rate_type17", "base_pay_change17", "job_profile17", "time_type17", "cost_center17", 
"effective_date18", "business_process_type18", "business_process_reason18", "pay_rate_type18", "base_pay_change18", "job_profile18", "time_type18", "cost_center18", 
"effective_date19", "business_process_type19", "business_process_reason19", "pay_rate_type19", "base_pay_change19", "job_profile19", "time_type19", "cost_center19", 
"effective_date20", "business_process_type20", "business_process_reason20", "pay_rate_type20", "base_pay_change20", "job_profile20", "time_type20", "cost_center20", 
"effective_date21", "business_process_type21", "business_process_reason21", "pay_rate_type21", "base_pay_change21", "job_profile21", "time_type21", "cost_center21", 
"effective_date22", "business_process_type22", "business_process_reason22", "pay_rate_type22", "base_pay_change22", "job_profile22", "time_type22", "cost_center22", 
"effective_date23", "business_process_type23", "business_process_reason23", "pay_rate_type23", "base_pay_change23", "job_profile23", "time_type23", "cost_center23", 
"effective_date24", "business_process_type24", "business_process_reason24", "pay_rate_type24", "base_pay_change24", "job_profile24", "time_type24", "cost_center24", 
"effective_date25", "business_process_type25", "business_process_reason25", #"pay_rate_type_25", "base_pay_change25", 
"job_profile25", #"time_type25", 
"cost_center25", 
"effective_date_26", "business_process_type_26", "business_process_reason_26", #"pay_rate_type_26", "base_pay_change_26", 
"job_profile_26", #"time_type_26", 
"cost_center_26", "2008_annual_performance_rating", "2009_annual_performance_rating", "2010_annual_performance_rating",
"2011_annual_performance_rating", "2012_annual_performance_rating", "2013_annual_performance_rating",
"2014_annual_performance_rating", "2015_annual_performance_rating", "2016_annual_performance_rating",
"2017_annual_performance_rating", "2018_annual_performance_rating", "2019_annual_performance_rating",
"2020_annual_performance_rating")
  
df2 <- vroom("raw_data/terminated_wd_2021.csv")


colnames(df2) <- c("department","employee_id","gender",   
                  "race_ethnicity","education", "military_status",
                  "date_of_birth", "original_hire_date","hire_date", "termination_date",
                  "pay_rate_type", "current_base_pay", 
                  "job_profile_current","time_type_current", "cost_center_current",  
                  "effective_date1", "business_process_type1", "business_process_reason1", "pay_rate_type1", "base_pay_change1", "job_profile1", "time_type1", "cost_center1", 
                  "effective_date2", "business_process_type2", "business_process_reason2", "pay_rate_type2", "base_pay_change2", "job_profile2", "time_type2", "cost_center2", 
                  "effective_date3", "business_process_type3", "business_process_reason3", "pay_rate_type3", "base_pay_change3", "job_profile3", "time_type3", "cost_center3",
                  "effective_date4", "business_process_type4", "business_process_reason4", "pay_rate_type_4", "base_pay_change4", "job_profile4", "time_type4", "cost_center4", 
                  "effective_date5", "business_process_type5", "business_process_reason5", "pay_rate_type_5", "base_pay_change5", "job_profile5", "time_type5", "cost_center5", 
                  "effective_date6", "business_process_type6", "business_process_reason6", "pay_rate_type_6", "base_pay_change6", "job_profile6", "time_type6", "cost_center6",
                  "effective_date7", "business_process_type7", "business_process_reason7", "pay_rate_type_7", "base_pay_change7", "job_profile7", "time_type7", "cost_center7", 
                  "effective_date8", "business_process_type8", "business_process_reason8", "pay_rate_type_8", "base_pay_change8", "job_profile8", "time_type8", "cost_center8", 
                  "effective_date9", "business_process_type9", "business_process_reason9", "pay_rate_type_9", "base_pay_change9", "job_profile9", "time_type9", "cost_center_9",
                  "effective_date10", "business_process_type10", "business_process_reason10", "pay_rate_type10", "base_pay_change10", "job_profile10", "time_type10", "cost_center10",
                  "effective_date11", "business_process_type11", "business_process_reason11", "pay_rate_type11", "base_pay_change11", "job_profile11", "time_type11", "cost_center11",
                  "effective_date12", "business_process_type12", "business_process_reason12", "pay_rate_type12", "base_pay_change12", "job_profile12", "time_type12", "cost_center12", 
                  "effective_date13", "business_process_type13", "business_process_reason13", "pay_rate_type13", "base_pay_change13", "job_profile13", "time_type13", "cost_center13", 
                  "effective_date14", "business_process_type14", "business_process_reason14", "pay_rate_type14", "base_pay_change14", "job_profile14", "time_type14", "cost_center14",
                  "effective_date15", "business_process_type15", "business_process_reason15", "pay_rate_type15", "base_pay_change15", "job_profile15", "time_type15", "cost_center15", 
                   "2008_annual_performance_rating", "2009_annual_performance_rating", "2010_annual_performance_rating",
                  "2011_annual_performance_rating", "2012_annual_performance_rating", "2013_annual_performance_rating",
                  "2014_annual_performance_rating", "2015_annual_performance_rating", "2016_annual_performance_rating",
                  "2017_annual_performance_rating", "2018_annual_performance_rating", "2019_annual_performance_rating",
                  "2020_annual_performance_rating")

data_date <- ymd("2021-05-28")

df <- df %>% mutate(date_of_birth = mdy(date_of_birth),
                    age = floor(decimal_date(data_date) - decimal_date(date_of_birth)))

df <- df %>% 
  mutate(date_of_birth = case_when(
    age < 0 ~ date_of_birth-years(100),
    TRUE ~ date_of_birth),
    age = floor(decimal_date(data_date) - decimal_date(date_of_birth)))


df2 <- df2 %>% mutate(date_of_birth = mdy(date_of_birth),
                    age = floor(decimal_date(data_date) - decimal_date(date_of_birth)))

df2 <- df2 %>% 
  mutate(date_of_birth = case_when(
    age < 0 ~ date_of_birth-years(100),
    TRUE ~ date_of_birth),
    age = floor(decimal_date(data_date) - decimal_date(date_of_birth)))



df <- df %>% mutate(hire_date = mdy(hire_date),
                    years_of_service = floor(decimal_date(data_date) - decimal_date(hire_date)))

df <- df %>% 
  mutate(original_hire_date=mdy(original_hire_date))#,
         #termination_date=ymd(termination_date),
         #effective_date=mdy(effective_date))


df2 <- df2 %>% mutate(hire_date = mdy(hire_date),
                      termination_date=mdy(termination_date),
                    years_of_service = floor(decimal_date(data_date) - decimal_date(hire_date)))

df2 <- df2 %>% 
  mutate(original_hire_date=mdy(original_hire_date),
         termination_date=ymd(termination_date)#,
         #effective_date=mdy(effective_date)
         )


df <- df %>% 
  mutate(age_group_5=case_when(
    age < 25 ~ "<25",
    age >= 25 & age < 30 ~ "25-29",
    age >= 30 & age < 35 ~ "30-34",
    age >= 35 & age < 40 ~ "35-39",
    age >= 40 & age < 45 ~ "40-44",
    age >= 45 & age < 50 ~ "45-49",
    age >= 50 & age < 55 ~ "50-54",
    age >= 55 & age < 60 ~ "55-59",
    age >= 60 & age < 65 ~ "60-64",
    age >= 65 ~ "65+",
    TRUE ~ NA_character_))


df2 <- df2 %>% 
  mutate(age_group_5=case_when(
    age < 25 ~ "<25",
    age >= 25 & age < 30 ~ "25-29",
    age >= 30 & age < 35 ~ "30-34",
    age >= 35 & age < 40 ~ "35-39",
    age >= 40 & age < 45 ~ "34-44",
    age >= 45 & age < 50 ~ "45-49",
    age >= 50 & age < 55 ~ "50-54",
    age >= 55 & age < 60 ~ "55-59",
    age >= 60 & age < 65 ~ "60-64",
    age >= 65 ~ "65+",
    TRUE ~ NA_character_))


df <- df %>% 
  mutate(age_group_10=case_when(
    age < 25 ~ "<25",
    age >= 25 & age < 35 ~ "25-34",
    age >= 35 & age < 45 ~ "35-44",
    age >= 45 & age < 55 ~ "45-55",
    age >= 55 & age < 65 ~ "55-64",
    age >= 65 ~ "65+",
    TRUE ~ NA_character_))


df2 <- df2 %>% 
  mutate(age_group_10=case_when(
    age < 25 ~ "<25",
    age >= 25 & age < 35 ~ "25-34",
    age >= 35 & age < 45 ~ "35-44",
    age >= 45 & age < 55 ~ "45-55",
    age >= 55 & age < 65 ~ "55-64",
    age >= 65 ~ "65+",
    TRUE ~ NA_character_))

df <- df %>% 
  mutate(years_of_service_grouped=case_when(
    years_of_service==0 ~ "0",
    years_of_service>=1  & years_of_service<3 ~ "1-2",
    years_of_service>=3  & years_of_service<3 ~ "3-5",
    years_of_service>=6  & years_of_service<3 ~ "6-10",
    years_of_service>=11  & years_of_service<3 ~ "11-15",
    years_of_service>=16  & years_of_service<3 ~ "16-20",
    years_of_service>=21  & years_of_service<3 ~ "21-25",
    years_of_service>=25 ~ "25+",
    TRUE ~ NA_character_))


df2 <- df2 %>% 
  mutate(years_of_service_grouped=case_when(
    years_of_service==0 ~ "0",
    years_of_service>=1  & years_of_service<3 ~ "1-2",
    years_of_service>=3  & years_of_service<3 ~ "3-5",
    years_of_service>=6  & years_of_service<3 ~ "6-10",
    years_of_service>=11  & years_of_service<3 ~ "11-15",
    years_of_service>=16  & years_of_service<3 ~ "16-20",
    years_of_service>=21  & years_of_service<3 ~ "21-25",
    years_of_service>=25 ~ "25+",
    TRUE ~ NA_character_))

df <- df %>%
  mutate(dept=case_when(
    department == "News" ~ "News", 
    department == "Editorial" ~ "News",
    department == "Client Solutions" ~ "Commercial",
    department == "Circulation" ~ "Commercial",
    department == "Finance" ~ "Commercial",
    department == "Marketing" ~ "Commercial",
    department == "WP News Media Services" ~ "Commercial",
    department == "Production" ~ "Commercial",
    department == "Public Relations" ~ "Commercial",
    department == "Administration" ~ "Commercial",
    department == "Product"~ "Commercial",
    TRUE ~ "Other"))

df2 <- df2 %>%
  mutate(dept=case_when(
    department == "News" ~ "News", 
    department == "Editorial" ~ "News",
    department == "News Service and Syndicate" ~ "News", 
    department == "Audience Development and Insights" ~ "Commercial",
    department == "Client Solutions" ~ "Commercial",
    department == "Customer Care and Logistics" ~ "Commercial",
    department == "Finance" ~ "Commercial",
    department == "Legal" ~ "Commercial",
    department == "Marketing" ~ "Commercial",
    department == "WP News Media Services" ~ "Commercial",
    department == "Production" ~ "Commercial",
    department == "Public Relations" ~ "Commercial",
    department == "Washington Post Live" ~ "Commercial",
    department == "Product" ~ "Commercial",
    TRUE ~ "Other"))

df <- df %>%
  mutate(desk=case_when(
    cost_center_current == "110000 News Operations" ~ "Operations",
    cost_center_current == "110001 News Digital Operations" ~ "Operations",
    cost_center_current == "110610 Audience Development and Engagement" ~ "Audience Development and Engagement",
    cost_center_current == "110620 News Audio" ~ "Audio",
    cost_center_current == "110604 Presentation Design" ~ "Design",
    cost_center_current == "110605 Presentation" ~ "Photography",
    cost_center_current == "110664 News National Apps" ~ "Emerging News Products",
    cost_center_current == "110665 News The Lily" ~ "Emerging News Products",
    cost_center_current == "110666 News Snapchat" ~ "Emerging News Products",
    cost_center_current == "110667 News By The Way" ~ "Emerging News Products",
    cost_center_current == "113210 Economy and Business" ~ "Financial",
    cost_center_current == "114000 Foreign Administration" ~ "Foreign",
    cost_center_current == "114095 News Foreign Brazil" ~ "Foreign",
    cost_center_current == "114100 Foreign Latam" ~ "Foreign",
    cost_center_current == "114220 News Foreign Istanbul" ~ "Foreign",
    cost_center_current == "114235 Foreign Western Europe" ~ "Foreign",
    cost_center_current == "114300 News Foreign West Africa" ~ "Foreign",
    cost_center_current == "114415 Foreign Hong Kong" ~ "Foreign",
    cost_center_current == "114405 Foreign Beijing Bureau" ~ "Foreign",
    cost_center_current == "114105 Foreign Mexico Bureau" ~ "Foreign",
    cost_center_current == "114005 Foreign Beirut Bureau" ~ "Foreign",
    cost_center_current == "114400 Foreign India Bureau" ~ "Foreign",
    cost_center_current == "114410 Foreign Tokyo Bureau" ~ "Foreign",
    cost_center_current == "114205 Foreign Islamabad Bureau" ~ "Foreign",
    cost_center_current == "114305 Foreign Nairobi Bureau" ~ "Foreign",
    cost_center_current == "114240 Foreign Rome Bureau" ~ "Foreign",
    cost_center_current == "114200 Foreign London Bureau" ~ "Foreign",
    cost_center_current == "114230 Foreign Moscow Bureau" ~ "Foreign",
    cost_center_current == "114225 Foreign Cairo Bureau" ~ "Foreign",
    cost_center_current == "114215 Foreign Berlin Bureau" ~ "Foreign",
    cost_center_current == "114310 Foreign Baghdad Bureau" ~ "Foreign",
    cost_center_current == "114315 Foreign Jerusalem Bureau" ~ "Foreign",
    cost_center_current == "110603 Presentation Graphics" ~ "Graphics",
    cost_center_current == "110450 Investigative" ~ "Investigative",
    cost_center_current == "112300 Local Politics and Government" ~ "Local",
    cost_center_current == "110601 Multiplatform Desk" ~ "Multiplatform",
    cost_center_current == "110500 Magazine" ~ "National",
    cost_center_current == "113200 National Politics and Government" ~ "National",
    cost_center_current == "113205 National Security" ~ "National",
    cost_center_current == "113215 News National Health & Science" ~ "National",
    cost_center_current == "113220 National Enterprise" ~ "National",
    cost_center_current == "113235 National America" ~ "National",
    cost_center_current == "113240 News National Environment" ~ "National",
    cost_center_current == "110006 News Content & Research" ~ "News Content and Research",
    cost_center_current == "110455 News Logistics" ~ "News Logistics",
    cost_center_current == "110410 Book World" ~ "Outlook",
    cost_center_current == "110460 Outlook" ~ "Outlook",
    cost_center_current == "110475 Polling" ~ "Polling",
    cost_center_current == "110015 Sports Main" ~ "Sports",
    cost_center_current == "110300 Style" ~ "Style",
    cost_center_current == "110435 Food" ~ "Style",
    cost_center_current == "110485 Travel" ~ "Style",
    cost_center_current == "110495 Local Living" ~ "Style",
    cost_center_current == "110505 Weekend" ~ "Style",
    cost_center_current == "110600 Universal Desk" ~ "Universal Desk",
    cost_center_current == "110652 News Video - General" ~ "Video",
    cost_center_current == "110663 Wake Up Report" ~ "Other",
    cost_center_current == "115000 Editorial Administration" ~ "Editorial",
    TRUE ~ "non-newsroom"))

df2 <- df2 %>%
  mutate(desk=case_when(
    cost_center_current == "110000 News Operations" ~ "Operations",
    cost_center_current == "110001 News Digital Operations" ~ "Operations",
    cost_center_current == "110610 Audience Development and Engagement" ~ "Audience Development and Engagement",
    cost_center_current == "110620 News Audio" ~ "Audio",
    cost_center_current == "110604 Presentation Design" ~ "Design",
    cost_center_current == "110605 Presentation" ~ "Photography",
    cost_center_current == "110664 News National Apps" ~ "Emerging News Products",
    cost_center_current == "110665 News The Lily" ~ "Emerging News Products",
    cost_center_current == "110666 News Snapchat" ~ "Emerging News Products",
    cost_center_current == "110667 News By The Way" ~ "Emerging News Products",
    cost_center_current == "113210 Economy and Business" ~ "Financial",
    cost_center_current == "114000 Foreign Administration" ~ "Foreign",
    cost_center_current == "114095 News Foreign Brazil" ~ "Foreign",
    cost_center_current == "114100 Foreign Latam" ~ "Foreign",
    cost_center_current == "114220 News Foreign Istanbul" ~ "Foreign",
    cost_center_current == "114235 Foreign Western Europe" ~ "Foreign",
    cost_center_current == "114300 News Foreign West Africa" ~ "Foreign",
    cost_center_current == "114415 Foreign Hong Kong" ~ "Foreign",
    cost_center_current == "114405 Foreign Beijing Bureau" ~ "Foreign",
    cost_center_current == "114105 Foreign Mexico Bureau" ~ "Foreign",
    cost_center_current == "114005 Foreign Beirut Bureau" ~ "Foreign",
    cost_center_current == "114400 Foreign India Bureau" ~ "Foreign",
    cost_center_current == "114410 Foreign Tokyo Bureau" ~ "Foreign",
    cost_center_current == "114205 Foreign Islamabad Bureau" ~ "Foreign",
    cost_center_current == "114305 Foreign Nairobi Bureau" ~ "Foreign",
    cost_center_current == "114240 Foreign Rome Bureau" ~ "Foreign",
    cost_center_current == "114200 Foreign London Bureau" ~ "Foreign",
    cost_center_current == "114230 Foreign Moscow Bureau" ~ "Foreign",
    cost_center_current == "114225 Foreign Cairo Bureau" ~ "Foreign",
    cost_center_current == "114215 Foreign Berlin Bureau" ~ "Foreign",
    cost_center_current == "114310 Foreign Baghdad Bureau" ~ "Foreign",
    cost_center_current == "114315 Foreign Jerusalem Bureau" ~ "Foreign",
    cost_center_current == "110603 Presentation Graphics" ~ "Graphics",
    cost_center_current == "110450 Investigative" ~ "Investigative",
    cost_center_current == "112300 Local Politics and Government" ~ "Local",
    cost_center_current == "110601 Multiplatform Desk" ~ "Multiplatform",
    cost_center_current == "110500 Magazine" ~ "National",
    cost_center_current == "113200 National Politics and Government" ~ "National",
    cost_center_current == "113205 National Security" ~ "National",
    cost_center_current == "113215 News National Health & Science" ~ "National",
    cost_center_current == "113220 National Enterprise" ~ "National",
    cost_center_current == "113235 National America" ~ "National",
    cost_center_current == "113240 News National Environment" ~ "National",
    cost_center_current == "110006 News Content & Research" ~ "News Content and Research",
    cost_center_current == "110455 News Logistics" ~ "News Logistics",
    cost_center_current == "110410 Book World" ~ "Outlook",
    cost_center_current == "110460 Outlook" ~ "Outlook",
    cost_center_current == "110475 Polling" ~ "Polling",
    cost_center_current == "110015 Sports Main" ~ "Sports",
    cost_center_current == "110300 Style" ~ "Style",
    cost_center_current == "110435 Food" ~ "Style",
    cost_center_current == "110485 Travel" ~ "Style",
    cost_center_current == "110495 Local Living" ~ "Style",
    cost_center_current == "110505 Weekend" ~ "Style",
    cost_center_current == "110600 Universal Desk" ~ "Universal Desk",
    cost_center_current == "110652 News Video - General" ~ "Video",
    cost_center_current == "110663 Wake Up Report" ~ "Other",
    cost_center_current == "115000 Editorial Administration" ~ "Editorial",
    TRUE ~ "non-newsroom"))

df <- df %>%
  mutate(tier=case_when(
    desk == "National" ~ "Tier 1",
    desk == "Foreign" ~ "Tier 1",
    desk == "Financial" ~ "Tier 1",
    desk == "Investigative" ~ "Tier 1",
    desk == "Style" ~ "Tier 2",
    desk == "Local" ~ "Tier 2",
    desk == "Graphics" ~ "Tier 2",
    desk == "Universal Desk" ~ "Tier 2",
    desk == "Sports" ~ "Tier 2",
    desk == "Outlook" ~ "Tier 2",
    desk == "Editorial" ~ "Tier 2",
    desk == "Audio" ~ "Tier 3",
    desk == "Polling" ~ "Tier 3",
    desk == "Design" ~ "Tier 3",
    desk == "Operations" ~ "Tier 3",
    desk == "Multiplatform" ~ "Tier 3",
    desk == "Video" ~ "Tier 3",
    desk == "Audience Development and Engagement" ~ "Tier 3",
    desk == "News Logistics" ~ "Tier 4",
    desk == "News Content and Research" ~ "Tier 4",
    desk == "Emerging News Products" ~ "Tier 4",
    desk == "Other" ~ "Tier 4",
    TRUE ~ "Other"))

df2 <- df2 %>%
  mutate(tier=case_when(
    desk == "National" ~ "Tier 1",
    desk == "Foreign" ~ "Tier 1",
    desk == "Financial" ~ "Tier 1",
    desk == "Investigative" ~ "Tier 1",
    desk == "Style" ~ "Tier 2",
    desk == "Local" ~ "Tier 2",
    desk == "Graphics" ~ "Tier 2",
    desk == "Universal Desk" ~ "Tier 2",
    desk == "Sports" ~ "Tier 2",
    desk == "Outlook" ~ "Tier 2",
    desk == "Editorial" ~ "Tier 2",
    desk == "Audio" ~ "Tier 3",
    desk == "Polling" ~ "Tier 3",
    desk == "Design" ~ "Tier 3",
    desk == "Operations" ~ "Tier 3",
    desk == "Multiplatform" ~ "Tier 3",
    desk == "Video" ~ "Tier 3",
    desk == "Audience Development and Engagement" ~ "Tier 3",
    desk == "News Logistics" ~ "Tier 4",
    desk == "News Content and Research" ~ "Tier 4",
    desk == "Emerging News Products" ~ "Tier 4",
    desk == "Other" ~ "Tier 4",
    TRUE ~ "Other"))

df <- df %>%
  mutate(race_grouping=case_when(
    race_ethnicity == "White (United States of America)" ~ "white",
    race_ethnicity == "Black or African American (United States of America)" ~ "person of color",
    race_ethnicity == "Asian (United States of America)" ~ "person of color",
    race_ethnicity == "Hispanic or Latino (United States of America)" ~ "person of color",
    race_ethnicity == "Two or More Races (United States of America)" ~ "person of color",
    race_ethnicity == "American Indian or Alaska Native (United States of America)" ~ "person of color",
    race_ethnicity == "Native Hawaiian or Other Pacific Islander (United States of America)" ~ "person of color",
    TRUE ~ "unknown"))

df2 <- df2 %>%
  mutate(race_grouping=case_when(
    race_ethnicity == "White (United States of America)" ~ "white",
    race_ethnicity == "Black or African American (United States of America)" ~ "person of color",
    race_ethnicity == "Asian (United States of America)" ~ "person of color",
    race_ethnicity == "Hispanic or Latino (United States of America)" ~ "person of color",
    race_ethnicity == "Two or More Races (United States of America)" ~ "person of color",
    race_ethnicity == "American Indian or Alaska Native (United States of America)" ~ "person of color",
    race_ethnicity == "Native Hawaiian or Other Pacific Islander (United States of America)" ~ "person of color",
    TRUE ~ "unknown"))


pay_change_function <- function(x=df, cycles=26, label="active") {
  for (i in 1:cycles) {
    if (i <24) {
      df_year <- df %>% 
        select(employee_id, paste0("business_process_reason", as.character(i)),
               paste0("base_pay_change", as.character(i)),
               paste0("effective_date", as.character(i)),
               paste0("pay_rate_type", as.character(i)),
               paste0("time_type", as.character(i)),
               gender, race_ethnicity, race_grouping,
               age_group_5, dept, tier,
               `2008_annual_performance_rating`,
               `2009_annual_performance_rating`,
               `2010_annual_performance_rating`,
               `2011_annual_performance_rating`,
               `2012_annual_performance_rating`,
               `2013_annual_performance_rating`,
               `2014_annual_performance_rating`,
               `2015_annual_performance_rating`,
               `2016_annual_performance_rating`,
               `2017_annual_performance_rating`,
               `2018_annual_performance_rating`,
               `2019_annual_performance_rating`,
               `2020_annual_performance_rating`
        ) 
    } else {
      df_year <- df %>% 
        select(employee_id, paste0("business_process_reason", as.character(i)),
               #paste0("base_pay_change", as.character(i)),
               paste0("effective_date", as.character(i)),
               #paste0("pay_rate_type", as.character(i)),
               gender, race_ethnicity, race_grouping,
               age_group_5, dept, tier,
               `2008_annual_performance_rating`,
               `2009_annual_performance_rating`,
               `2010_annual_performance_rating`,
               `2011_annual_performance_rating`,
               `2012_annual_performance_rating`,
               `2013_annual_performance_rating`,
               `2014_annual_performance_rating`,
               `2015_annual_performance_rating`,
               `2016_annual_performance_rating`,
               `2017_annual_performance_rating`,
               `2018_annual_performance_rating`,
               `2019_annual_performance_rating`,
               `2020_annual_performance_rating`
               ) 
      
      df_year$base_pay_change <- NA
      df_year$pay_rate_type <- NA
      df_year$time_type <- NA
      
    }
    
    colnames(df_year) <- gsub(paste0("n", as.character(i)), "n", colnames(df_year))
    colnames(df_year) <- gsub(paste0("e", as.character(i)), "e", colnames(df_year))
    #colnames(df_year) <- gsub(paste0("r", as.character(i)), "r", colnames(df_year))
    
    df_year <- df_year %>% 
      select(employee_id, business_process_reason,base_pay_change,effective_date,
             pay_rate_type,gender, race_ethnicity, race_grouping,
             age_group_5, dept, tier,
             `2008_annual_performance_rating`,
             `2009_annual_performance_rating`,
             `2010_annual_performance_rating`,
             `2011_annual_performance_rating`,
             `2012_annual_performance_rating`,
             `2013_annual_performance_rating`,
             `2014_annual_performance_rating`,
             `2015_annual_performance_rating`,
             `2016_annual_performance_rating`,
             `2017_annual_performance_rating`,
             `2018_annual_performance_rating`,
             `2019_annual_performance_rating`,
             `2020_annual_performance_rating`) %>% 
      mutate(workday = paste0(label, "-", i))
    
    if (i==1) {
      mega_df <- df_year
    } else {
      mega_df <- rbind(mega_df, df_year)
    }
  }
  return(mega_df)
}

reason_for_change1 <- pay_change_function(df, cycles=25, label="active")
reason_for_change2 <- pay_change_function(df2, cycles=15, label="terminated")
reason_for_change_combined <- rbind(reason_for_change1, reason_for_change2)

emp_perf_function <- function(x=df_name, year_start=2008, year_end=2020, label="active") {
  for (i in year_start:year_end) {
    df_name <- x
    df_year <- df_name %>% 
      select(employee_id, paste0(as.character(i), "_annual_performance_rating"),
             gender, race_ethnicity, race_grouping, dept)
    
    colnames(df_year) <- c("employee_id", "performance_rating","gender",
                           "race_ethnicity","race_grouping","dept")
    
    df_year$workday <- paste0(label, "-", i)
    
    if (i==year_start) {
      mega_df <- df_year
    } else {
      mega_df <- rbind(mega_df, df_year)  
      
    }
  }
  return(mega_df)
}  

ratings1 <- emp_perf_function(df, year_start=2008, year_end=2020, label="active")
ratings2 <- emp_perf_function(df2, year_start=2008, year_end=2020, label="terminated")

ratings_combined <- rbind(ratings1, ratings2) %>% 
  filter(!is.na(performance_rating))

news_salaried <- filter(df, dept == "News", pay_rate_type == "Salaried")
news_hourly <- filter(df, dept == "News", pay_rate_type == "Hourly")
commercial_salaried <- filter(df, dept == "Commercial", pay_rate_type == "Salaried")
commercial_hourly <- filter(df, dept == "Commercial", pay_rate_type == "Hourly")

news_salaried2 <-  filter(df2, dept == "News", pay_rate_type == "Salaried")
news_hourly2 <-  filter(df2, dept == "News", pay_rate_type == "Hourly")
commercial_salaried2 <-  filter(df2, dept == "Commercial", pay_rate_type == "Salaried")
commercial_hourly2 <-  filter(df2, dept == "Commercial", pay_rate_type == "Hourly")

#Within our Guild data, 67 percent of Guild-covered employees were White, and 28 percent were people of color.

df %>% 
  filter(pay_rate_type=="Salaried") %>% 
  group_by(race_grouping) %>% 
  summarize(n=n(), median=median(current_base_pay, na.rm=T),
            age=median(age, na.rm=T)) %>% 
  mutate(percent=round(n/sum(n)*100,0))%>% 
  kable()

race_grouping	n	median	age	percent
person of color	218	94530.0	35.5	28
unknown	40	129890.0	35.0	5
white	525	110453.4	39.0	67

# Salaried White male Newsroom employees outnumbered salaried White female Newsroom employees (229 to 207), but salaried women of color outnumbered salaried men of color in the Newsroom (109 to 72). 
df %>% 
  filter(dept=="News") %>% 
  filter(pay_rate_type=="Salaried") %>% 
  count(gender, race_grouping) %>% 
  filter(n>=5)%>% 
  kable()

gender	race_grouping	n
Female	person of color	109
Female	unknown	20
Female	white	207
Male	person of color	72
Male	unknown	19
Male	white	229

#The median salary for men was $120,977, compared with $102,700 for women, a gap of 18 percent. 
df %>% 
  filter(dept=="News") %>% 
  filter(pay_rate_type=="Salaried") %>% 
  group_by(gender) %>% 
  summarize(n=n(), median=median(current_base_pay, na.rm=T),
            age=median(age, na.rm=T)) %>% 
  mutate(percent=round(n/sum(n)*100,0))  %>% 
  filter(n>=5)%>% 
  kable()

gender	n	median	age	percent
Female	336	102700.0	35.5	51
Male	320	120976.6	42.0	49

#The median pay for women was $36 per hour, compared with $34 per hour for men. 
df %>% 
  filter(dept=="News") %>% 
  filter(pay_rate_type=="Hourly") %>% 
  group_by(gender) %>% 
  summarize(n=n(), median=median(current_base_pay, na.rm=T),
            age=median(age, na.rm=T)) %>% 
  mutate(percent=round(n/sum(n)*100,0))  %>% 
  filter(n>=5)%>% 
  kable()

gender	n	median	age	percent
Female	50	35.90	34	60
Male	33	33.77	34	40

#The disparity in median pay between men and women was starkest among employees under the age of 40, among whom there was a gap of 12.5 percent, amounting to nearly $13,000.
df %>% 
  filter(dept=="News") %>% 
  filter(pay_rate_type=="Salaried") %>%
  mutate(y40 =case_when(age >=40 ~ "Over 40",
                        age < 40 ~ "Under 40",
                        TRUE ~ "Unknown")) %>% 
  group_by(gender, y40) %>%
  summarize(n=n(), median=median(current_base_pay, na.rm=T),
            age=median(age, na.rm=T)) %>%
  mutate(percent=round(n/sum(n)*100,0))  %>%
  filter(n>=5)%>% 
  kable()

gender	y40	n	median	age	percent
Female	Over 40	125	128280	52	37
Female	Under 40	211	90440	31	63
Male	Over 40	181	131410	52	57
Male	Under 40	139	103400	34	43

#103400-90400
#(103400-90400)/90400

#White journalists made nearly 16 percent more than journalists of color. 
#White journalists made a median salary of $113,810, while journalists of color had a median salary of $98,435. 
df %>% 
  filter(dept=="News") %>% 
  filter(pay_rate_type=="Salaried") %>% 
  group_by(race_grouping) %>% 
  summarize(n=n(), median=median(current_base_pay, na.rm=T),
            age=median(age, na.rm=T)) %>% 
  mutate(percent=round(n/sum(n)*100,0))  %>% 
  filter(n>=5)%>% 
  kable()

race_grouping	n	median	age	percent
person of color	181	98435.0	35	28
unknown	39	130000.0	35	6
white	437	113809.7	41	67

#This gap was particularly stark when comparing the median salaries for White men and women of color. White men had a median salary of $123,797, compared with a median salary of $94,840 for women of color. That gap was nearly $29,000 — or over 28 percent.
df %>% 
  filter(dept=="News") %>% 
  filter(pay_rate_type=="Salaried") %>% 
  group_by(gender, race_grouping) %>% 
  summarize(n=n(), median=median(current_base_pay, na.rm=T),
            age=median(age, na.rm=T)) %>% 
  mutate(percent=round(n/sum(n)*100,0))  %>% 
  filter(n>=5)%>% 
  kable()

gender	race_grouping	n	median	age	percent
Female	person of color	109	94840.0	33.0	32
Female	unknown	20	116890.0	33.5	6
Female	white	207	105780.0	37.0	62
Male	person of color	72	104420.0	40.0	22
Male	unknown	19	133560.0	36.0	6
Male	white	229	123796.9	43.0	72

#For the 320 salaried male Newsroom employees working at The Post, the median salary was $120,977. 
#For the 336 salaried female Newsroom employees, it was $102,700.
#The median age for salaried men working in the Newsroom was 42, compared with 36 for salaried women.

df %>% 
  filter(dept=="News") %>% 
  filter(pay_rate_type=="Salaried") %>% 
  group_by(gender) %>% 
  summarize(n=n(), median=median(current_base_pay, na.rm=T),
            age=median(age, na.rm=T)) %>% 
  mutate(percent=round(n/sum(n)*100,0))  %>% 
  filter(n>=5)%>% 
  kable()

gender	n	median	age	percent
Female	336	102700.0	35.5	51
Male	320	120976.6	42.0	49

#For the 83 hourly employees across the Newsroom, there was also a slight difference. 
#The median hourly wage for men was $34, compared with $36 for women. 
#Comparing by age is difficult because only 33 men worked hourly jobs in the Newsroom and their ages varied widely.
#That said, female hourly workers (50) made more than male hourly workers. 
#Under the age of 40, the gap was nearly $3, and over the age of 40, the gap was $10. 
#However, the sample size for hourly workers was low, meaning a few employees could lower or raise the median fairly drastically.

df %>% 
  filter(dept=="News") %>% 
  filter(pay_rate_type=="Hourly") %>% 
  group_by(gender) %>% 
  summarize(n=n(), median=median(current_base_pay, na.rm=T),
            age=median(age, na.rm=T)) %>% 
  mutate(percent=round(n/sum(n)*100,0))  %>% 
  filter(n>=5)%>% 
  kable()

gender	n	median	age	percent
Female	50	35.90	34	60
Male	33	33.77	34	40

#Over the age of 40: There were nearly 1.4 men for every woman in the Newsroom. 
#Their median salary was 2.4 percent more, a gap of over $3,000. 
#Under the age of 40: There were nearly 1.5 women for every man in the Newsroom. 
#Their median salary was 12.5 percent less, a gap of nearly $13,000.

df %>% 
  filter(dept=="News") %>% 
  filter(pay_rate_type=="Salaried") %>%
  mutate(y40 =case_when(age >=40 ~ "Over 40",
                        age < 40 ~ "Under 40",
                        TRUE ~ "Unknown")) %>% 
  group_by(gender, y40) %>%
  summarize(n=n(), median=median(current_base_pay, na.rm=T),
            age=median(age, na.rm=T)) %>%
  mutate(percent=round(n/sum(n)*100,0))  %>%
  filter(n>=5)%>% 
  kable()

gender	y40	n	median	age	percent
Female	Over 40	125	128280	52	37
Female	Under 40	211	90440	31	63
Male	Over 40	181	131410	52	57
Male	Under 40	139	103400	34	43

#Among those under 40, Newsroom employees of color made about 4 percent less than their White counterparts, with median salaries of $92,000 and $96,000, respectively.
#The race disparity widened for journalists 40 and over. 
#Newsroom employees of color had a median salary of $121,875, while their White colleagues had a median salary of $130,800 — a gap of more than 7 percent.

df %>% 
  filter(dept=="News") %>% 
  filter(pay_rate_type=="Salaried") %>%
  mutate(y40 =case_when(age >=40 ~ "Over 40",
                        age < 40 ~ "Under 40",
                        TRUE ~ "Unknown")) %>% 
  group_by(race_grouping, y40) %>%
  summarize(n=n(), median=median(current_base_pay, na.rm=T),
            age=median(age, na.rm=T)) %>%
  mutate(percent=round(n/sum(n)*100,0))  %>%
  filter(n>=5)%>% 
  kable()

race_grouping	y40	n	median	age	percent
person of color	Over 40	68	121874.6	47.5	38
person of color	Under 40	113	92000.0	30.0	62
unknown	Over 40	10	151785.4	54.0	26
unknown	Under 40	29	105000.0	33.0	74
white	Over 40	228	130799.7	52.0	52
white	Under 40	209	96000.0	32.0	48

#Below are the median salaries by race and ethnicity across the Newsroom:
#  White: $113,810
#Black: $102,700
#Asian: $103,970
#Hispanic or Latino: $95,780
#Two or more races: $91,090

df %>% 
  filter(dept=="News") %>% 
  filter(pay_rate_type=="Salaried") %>%
  mutate(y40 =case_when(age >=40 ~ "Over 40",
                        age < 40 ~ "Under 40",
                        TRUE ~ "Unknown")) %>% 
  group_by(race_ethnicity) %>%
  summarize(n=n(), median=median(current_base_pay, na.rm=T),
            age=median(age, na.rm=T)) %>%
  mutate(percent=round(n/sum(n)*100,0))  %>%
  filter(n>=5)%>% 
  kable()

race_ethnicity	n	median	age	percent
Asian (United States of America)	59	103970.05	35.0	9
Black or African American (United States of America)	60	102700.04	39.0	9
Hispanic or Latino (United States of America)	42	95780.04	35.5	6
Prefer Not to Disclose (United States of America)	12	94890.00	31.0	2
Two or More Races (United States of America)	18	91090.00	30.0	3
White (United States of America)	437	113809.68	41.0	67
NA	27	144340.00	36.0	4

#The gap between White journalists in the Newsroom and journalists of color was nearly 16 percent, with a median salary of $113,810 for 437 White journalists and $98,435 for 181 journalists of color. 
df %>% 
  filter(dept=="News") %>% 
  filter(pay_rate_type=="Salaried") %>%
  mutate(y40 =case_when(age >=40 ~ "Over 40",
                        age < 40 ~ "Under 40",
                        TRUE ~ "Unknown")) %>% 
  group_by(race_grouping) %>%
  summarize(n=n(), median=median(current_base_pay, na.rm=T),
            age=median(age, na.rm=T)) %>%
  mutate(percent=round(n/sum(n)*100,0))  %>%
  filter(n>=5)%>% 
  kable()

race_grouping	n	median	age	percent
person of color	181	98435.0	35	28
unknown	39	130000.0	35	6
white	437	113809.7	41	67

#About 58 of the hourly employees in the Newsroom were White — and they made a median wage of $35 an hour compared with $34 for hourly employees of color. 
df %>% 
  filter(dept=="News") %>% 
  filter(pay_rate_type=="Hourly") %>%
  mutate(y40 =case_when(age >=40 ~ "Over 40",
                        age < 40 ~ "Under 40",
                        TRUE ~ "Unknown")) %>% 
  group_by(race_grouping) %>%
  summarize(n=n(), median=median(current_base_pay, na.rm=T),
            age=median(age, na.rm=T)) %>%
  mutate(percent=round(n/sum(n)*100,0))  %>%
  filter(n>=5)%>% 
  kable()

race_grouping	n	median	age	percent
person of color	24	33.59	27.5	29
white	58	35.40	36.5	70

#The median salaries by group were as follows: 

#White men: $123,797
#Men of color: $104,420
#White women: $105,780
#Women of color: $94,840

#Median ages by group were as follows:

#White men: 43
#Men of color: 40
#White women: 37
#Women of color: 33

df %>% 
  filter(dept=="News") %>% 
  filter(pay_rate_type=="Salaried") %>%
  mutate(y40 =case_when(age >=40 ~ "Over 40",
                        age < 40 ~ "Under 40",
                        TRUE ~ "Unknown")) %>% 
  group_by(race_grouping, gender) %>%
  summarize(n=n(), median=median(current_base_pay, na.rm=T),
            age=median(age, na.rm=T)) %>%
  mutate(percent=round(n/sum(n)*100,0))  %>%
  filter(n>=5)%>% 
  kable()

race_grouping	gender	n	median	age	percent
person of color	Female	109	94840.0	33.0	60
person of color	Male	72	104420.0	40.0	40
unknown	Female	20	116890.0	33.5	51
unknown	Male	19	133560.0	36.0	49
white	Female	207	105780.0	37.0	47
white	Male	229	123796.9	43.0	52

# For most sections the median salary for men surpasses women CHART
#Of the 15 sections that had at least five men and five women:
#In 10 sections, men had higher median pay than women (beyond 5 percent): Audience, Design, Foreign, Investigative, Local, Multiplatform, National, Opinion, Photography and Style.
#One section, Sports, had a median pay disparity that favors women by 18 percent.
#In four sections, there was approximate pay equity between genders (within 5 percent): Emerging News Products, Financial, Graphics and Video.

desk_analysis <- df %>% 
  filter(dept=="News") %>% 
  filter(pay_rate_type=="Salaried") %>%
  mutate(y40 =case_when(age >=40 ~ "Over 40",
                        age < 40 ~ "Under 40",
                        TRUE ~ "Unknown")) %>% 
  group_by(desk, gender) %>%
  summarize(n=n(), median=median(current_base_pay, na.rm=T),
            age=median(age, na.rm=T)) %>%
  mutate(percent=round(n/sum(n)*100,0))  %>%
  filter(n>=5)

kable(desk_analysis)

desk	gender	n	median	age	percent
Audience Development and Engagement	Female	19	88880.00	31.0	68
Audience Development and Engagement	Male	9	94780.00	34.0	32
Audio	Female	5	94280.00	31.0	62
Design	Female	19	82201.52	29.0	58
Design	Male	14	93640.00	37.0	42
Editorial	Female	15	102692.61	33.0	45
Editorial	Male	18	156476.68	45.5	55
Emerging News Products	Female	19	73440.00	29.0	73
Emerging News Products	Male	7	76840.01	28.0	27
Financial	Female	16	133666.00	42.0	37
Financial	Male	27	137340.00	41.0	63
Foreign	Female	17	129780.00	31.0	52
Foreign	Male	16	150780.00	37.5	48
Graphics	Female	11	104320.14	31.0	52
Graphics	Male	9	101640.00	34.0	43
Investigative	Female	6	133200.00	38.0	35
Investigative	Male	11	151780.00	43.0	65
Local	Female	36	108075.80	39.5	53
Local	Male	32	122011.18	51.5	47
Multiplatform	Female	28	88560.00	42.5	64
Multiplatform	Male	16	96356.89	51.5	36
National	Female	54	145602.97	42.5	47
National	Male	60	178737.50	41.0	53
Photography	Female	10	91847.57	42.5	37
Photography	Male	17	107940.95	49.0	63
Sports	Female	10	125935.00	32.5	23
Sports	Male	33	103685.00	41.0	77
Style	Female	28	107060.00	41.0	56
Style	Male	22	118280.00	46.5	44
Video	Female	29	91560.00	30.0	63
Video	Male	17	95000.00	35.0	37

# Workers of color have lower median salary in nearly all departments CHART
#Of 13 sections for which there were at least five White journalists and five journalists of color:
#Six sections had a median pay disparity favoring White journalists of more than 7 percent: the Newsroom, Emerging News Products, Video, Style, Photography and National.
#Seven sections had approximate pay equity (within 5 percent) between journalists of color and White journalists: Audience, Design, Financial, Graphics, Local, Multiplatform and Sports.

desk_analysis <- df %>% 
  filter(dept=="News") %>% 
  filter(pay_rate_type=="Salaried") %>%
  mutate(y40 =case_when(age >=40 ~ "Over 40",
                        age < 40 ~ "Under 40",
                        TRUE ~ "Unknown")) %>% 
  group_by(desk, race_grouping) %>%
  summarize(n=n(), median=median(current_base_pay, na.rm=T),
            age=median(age, na.rm=T)) %>%
  mutate(percent=round(n/sum(n)*100,0))  %>%
  filter(n>=5)

kable(desk_analysis)

desk	race_grouping	n	median	age	percent
Audience Development and Engagement	person of color	11	88280.00	34.0	39
Audience Development and Engagement	white	17	89280.00	31.0	61
Design	person of color	15	90009.88	32.0	45
Design	white	17	85060.00	32.0	52
Editorial	person of color	7	104560.00	33.0	21
Editorial	white	25	134806.25	42.0	76
Emerging News Products	person of color	7	71853.00	27.0	27
Emerging News Products	white	18	76700.01	28.5	69
Financial	person of color	8	131420.00	38.5	19
Financial	white	33	137799.88	41.0	77
Foreign	unknown	25	144340.00	36.0	76
Foreign	white	6	124843.27	32.5	18
Graphics	person of color	8	102980.07	29.5	38
Graphics	white	11	106340.00	33.0	52
Investigative	white	13	151780.00	43.0	76
Local	person of color	19	113000.00	43.0	28
Local	white	49	113280.00	46.0	72
Multiplatform	person of color	8	90611.76	46.0	18
Multiplatform	white	36	92547.18	49.0	82
National	person of color	28	135595.00	39.5	25
National	white	83	177838.45	44.0	73
Photography	person of color	7	95000.00	49.0	26
Photography	white	19	106340.00	48.0	70
Sports	person of color	14	102780.00	38.5	33
Sports	white	29	107560.00	41.0	67
Style	person of color	12	104243.74	36.0	24
Style	white	38	115471.31	47.5	76
Video	person of color	18	87000.00	29.5	39
Video	white	27	95000.00	35.0	59

# Race and gender pay disparities evident across sections CHART
#There were only four sections that had at least five White men, five White women, five men of color and five women of color: Design, Local, National and Sports.
#All four sections had racial and gender pay disparities. 
#For employees across these four sections, the disparity in median salaries between White men and women of color was 8 percent. 
#The starkest gaps in median salaries were between White men and men of color in National ($51,000) and between White women and women of color in Sports ($65,000).


desk_analysis <- df %>% 
  filter(dept=="News") %>% 
  filter(pay_rate_type=="Salaried") %>%
  mutate(y40 =case_when(age >=40 ~ "Over 40",
                        age < 40 ~ "Under 40",
                        TRUE ~ "Unknown")) %>% 
  group_by(desk, race_grouping, gender) %>%
  summarize(n=n(), median=median(current_base_pay, na.rm=T),
            age=median(age, na.rm=T)) %>%
  mutate(percent=round(n/sum(n)*100,0))  %>%
  filter(n>=5)

kable(desk_analysis)

desk	race_grouping	gender	n	median	age	percent
Audience Development and Engagement	person of color	Female	9	90000.00	35.0	82
Audience Development and Engagement	white	Female	10	86330.00	29.0	59
Audience Development and Engagement	white	Male	7	103060.00	35.0	41
Design	person of color	Female	8	85789.94	29.0	53
Design	person of color	Male	7	92500.00	41.0	47
Design	white	Female	11	82201.52	31.0	65
Design	white	Male	6	92500.00	37.0	35
Editorial	white	Female	10	107200.37	35.0	40
Editorial	white	Male	15	158673.36	46.0	60
Emerging News Products	white	Female	14	72940.00	29.0	78
Financial	person of color	Female	6	141070.00	43.0	75
Financial	white	Female	10	129876.00	40.5	30
Financial	white	Male	23	142027.50	41.0	70
Foreign	unknown	Female	13	136560.00	36.0	52
Foreign	unknown	Male	12	147670.00	37.5	48
Graphics	person of color	Female	5	104320.14	28.0	62
Graphics	white	Female	5	104780.00	31.0	45
Graphics	white	Male	5	117131.00	59.0	45
Investigative	white	Male	9	151840.00	43.0	69
Local	person of color	Female	12	106890.00	39.5	63
Local	person of color	Male	7	122909.15	50.0	37
Local	white	Female	24	108075.80	38.5	49
Local	white	Male	25	121113.20	54.0	51
Multiplatform	person of color	Female	5	84220.51	32.0	62
Multiplatform	white	Female	23	89840.00	43.0	64
Multiplatform	white	Male	13	96600.08	50.0	36
National	person of color	Female	16	137890.00	40.0	57
National	person of color	Male	12	135050.00	37.5	43
National	white	Female	36	148550.00	45.5	43
National	white	Male	47	186065.00	43.0	57
Photography	person of color	Male	7	95000.00	49.0	100
Photography	white	Female	10	91847.57	42.5	53
Photography	white	Male	9	119340.00	52.0	47
Sports	person of color	Female	5	95000.00	27.0	36
Sports	person of color	Male	9	110560.00	42.0	64
Sports	white	Female	5	160089.98	60.0	17
Sports	white	Male	24	102653.66	40.0	83
Style	person of color	Female	8	96353.74	36.0	67
Style	white	Female	20	111833.47	47.0	53
Style	white	Male	18	116175.06	49.0	47
Video	person of color	Female	15	84000.00	29.0	83
Video	white	Female	13	95000.00	35.0	48
Video	white	Male	14	94500.00	35.0	52

#Across sections in which employees’ median salary was higher than $125,000 (including National, Investigative, Foreign, Financial and Opinions): 
#67 percent of employees were White and 55 percent of employees were male. 


df %>% 
  filter(dept=="News") %>% 
  filter(pay_rate_type=="Salaried") %>% 
  mutate(t_desk=case_when(
    desk=="National" | 
      desk=="Investigative" |
      desk=="Foreign" |
      desk=="Financial" |
      desk=="Editorial" ~ "Higher",
    TRUE ~ "Lower")) %>% 
  group_by(t_desk, race_grouping) %>% 
  summarise(
    count = length(current_base_pay),
    median = median(current_base_pay, na.rm = FALSE)
  ) %>% 
  mutate(percent=count/sum(count)*100)%>% 
  kable()

t_desk	race_grouping	count	median	percent
Higher	person of color	48	134455	20.000000
Higher	unknown	32	136950	13.333333
Higher	white	160	150485	66.666667
Lower	person of color	133	92400	31.894484
Lower	unknown	7	90000	1.678657
Lower	white	277	100000	66.426858

df %>% 
  filter(dept=="News") %>% 
  filter(pay_rate_type=="Salaried") %>% 
  mutate(t_desk=case_when(
    desk=="National" | 
      desk=="Investigative" |
      desk=="Foreign" |
      desk=="Financial" |
      desk=="Editorial" ~ "Higher",
    TRUE ~ "Lower")) %>% 
  group_by(t_desk, gender) %>% 
  summarise(
    count = length(current_base_pay),
    median = median(current_base_pay, na.rm = FALSE)
  ) %>% 
  mutate(percent=count/sum(count)*100)%>% 
  kable()

t_desk	gender	count	median	percent
Higher	Female	108	135780.00	45.0000000
Higher	Male	132	154535.00	55.0000000
Lower	Female	228	92214.84	54.6762590
Lower	Male	188	101042.43	45.0839329
Lower	Prefer not to disclose	1	106340.00	0.2398082

#40 percent of employees were White men, 27 percent were White women, 12 percent were women of color, and 8 percent were men of color (the rest did not indicate race). 

df %>% 
  filter(dept=="News") %>% 
  filter(pay_rate_type=="Salaried") %>% 
  mutate(t_desk=case_when(
    desk=="National" | 
      desk=="Investigative" |
      desk=="Foreign" |
      desk=="Financial" |
      desk=="Editorial" ~ "Higher",
    TRUE ~ "Lower")) %>% 
  group_by(t_desk, race_grouping, gender) %>% 
  summarise(
    count = length(current_base_pay),
    median = median(current_base_pay, na.rm = FALSE)
  ) %>% 
  mutate(percent=count/sum(count)*100)%>% 
  kable()

t_desk	race_grouping	gender	count	median	percent
Higher	person of color	Female	28	132500.0	58.3333333
Higher	person of color	Male	20	135050.0	41.6666667
Higher	unknown	Female	16	133170.0	50.0000000
Higher	unknown	Male	16	140840.0	50.0000000
Higher	white	Female	64	137384.8	40.0000000
Higher	white	Male	96	166841.8	60.0000000
Lower	person of color	Female	81	91060.0	60.9022556
Lower	person of color	Male	52	94890.0	39.0977444
Lower	unknown	Female	4	87780.0	57.1428571
Lower	unknown	Male	3	98340.0	42.8571429
Lower	white	Female	143	95000.0	51.6245487
Lower	white	Male	133	103400.0	48.0144404
Lower	white	Prefer not to disclose	1	106340.0	0.3610108