Using mutate
can sometimes feel like…
But soon you will be a mutate expert and everything will be…
awesome.
mutate(grade_group =
ifelse(Grade >= 5 & Grade <= 8 & Subject == "ELA", "5-8",
ifelse(Grade >= 5 & Grade <= 8 & Subject == "Math", "5-7",
ifelse(Grade < 5, "3-4", "Unknown")
)))
I find this works really well for small data, otherwise you’re adding time to the process.
mutate(grade_group = ifelse(Grade >= 5 & Grade <= 8 & Subject == "ELA", "5-8", NA),
grade_group = ifelse(Grade >= 5 & Grade <= 8 & Subject == "Math", "5-7", grade_group),
grade_group = ifelse(Grade < 5, "3-4", "grade_group"))
For larger data, joining against a translation table may be faster.
grade_group <- tibble(grade = 3:8,
ela_group = c("3-4", "3-4", rep("5-8", 4)),
math_group = c("3-4", "3-4", rep("5-7", 4))
)
grade_group
## # A tibble: 6 x 3
## grade ela_group math_group
## <int> <chr> <chr>
## 1 3 3-4 3-4
## 2 4 3-4 3-4
## 3 5 5-8 5-7
## 4 6 5-8 5-7
## 5 7 5-8 5-7
## 6 8 5-8 5-7
results <- results %>% left_join(grade_group, by="grade")
case_when()
THANK YOU DPLYR! Now that this is in full working order, this is the best solution to the nested if problem.
mutate(grade_group = case_when(
Grade < 5 ~ "3-4",
Grade <=8 & Subject == "ELA" ~ "5-8",
Grade <= 8 & Subject == "Math" ~ "5-7",
TRUE ~ NA
)
group_by
and mutate
A grouped data frame is really useful to work with mutate statements. You can do ranking, summarizing, filtering..etc on your groups without losing data.
Example: So I have this file with student profile info, but each student has one record per school year, so Ying has 9th grade, 10th grade, and 11th grade. I want to create a file with one row per student, taking only the latest year.
df %>%
group_by(student_id) %>%
mutate(max_year = max(school_year) %>%
filter(school_year == max_year) %>%
ungroup()
Example: I want to rank schools by their sccores on a particular subject.
schools %>%
group_by(subject) %>%
mutate(rank = min_rank(percent_correct))
mutate_at
/summarise_at
:as.numeric()
)mutate_at(vars(), funs())
results %>%
mutate_at(vars(score, grade), as.numeric)
mutate_if
/summarise_if
:inputs: test, function(s)
summarise_if(is.numeric, funs(mean, min, max, sd, n))
summarise_if(is.numeric, as.character)
summarise_if(is.numeric, as.factor)
Find the the mean of ALL the columns in the PARCC scores data set with “SCORE” in the name AND all columns that R knows are numbers.
parcc_scores <- read_rds("S:/Data Analytics/State Test Analysis/2016-2017/PARCC/Results Release/~Data/Processed/Uncommon Student-level results/Output/PARCC_SY1617_results_ESMS.rds")
## Warning: package 'bindrcpp' was built under R version 3.3.3
## # A tibble: 1 x 8
## score proficient voidscorecode voidscorereason testreadingscalescore
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 766.2965 0.70374 NaN NaN 53.25137
## # ... with 3 more variables: testwritingscalescore <dbl>,
## # tested_diff_school <dbl>, testscorecomplete <dbl>
summarise_at
or summarise_if
NOT summarise
.SPREAD
to create ELA and Math versions of BOTH proficient and score.parcc_scores <- read_rds("S:/Data Analytics/State Test Analysis/2016-2017/PARCC/Results Release/~Data/Processed/Uncommon Student-level results/Output/PARCC_SY1617_results_ESMS.rds")
## # A tibble: 6 x 4
## # Groups: grade [3]
## grade report_subject proficient score
## <chr> <chr> <dbl> <dbl>
## 1 3 ELA 0.7145877 774.9471
## 2 3 Math 0.7864693 770.5455
## 3 4 ELA 0.7816594 774.5917
## 4 4 Math 0.6965066 762.2009
## 5 5 ELA 0.7325349 769.6747
## 6 5 Math 0.5588822 753.7784
Clean up this set of code to use summarise_at and mutate_at where useful. Do not worry about all the column names being the same as they are now. :)
combinedSurveyTransp <- read_rds("S:/Data Analytics/Staff Analyses/Staff Survey/temp/combined_survey_tranporse.rds")
schoolSummary<-combinedSurveyTransp %>%
group_by(Region,School,Grade.Span, School.Year, Please.select.your.role, Q.Display, Q.Category,For.how.long.have.you.been.with.Uncommon.Schools, Principal, DOO) %>%
summarise(
LikertAvg=sum(Likert,na.rm=TRUE),
Likert5=sum(Likert5,na.rm=TRUE),
Likert4=sum(Likert4,na.rm=TRUE),
Likert3=sum(Likert3,na.rm=TRUE),
Likert2=sum(Likert2,na.rm=TRUE),
Likert1=sum(Likert1,na.rm=TRUE),
Likert54=sum(Likert54,na.rm=TRUE),
Likert12=sum(Likert12,na.rm=TRUE),
LikertN=sum(ifelse(is.na(Likert), 0, 1)),
NPSAvg=sum(NPS,na.rm=TRUE),
NPS9.10=sum(NPS9.10,na.rm=TRUE),
NPS7.8=sum(NPS7.8,na.rm=TRUE),
NPS0.6=sum(NPS0.6,na.rm=TRUE),
NPSN=sum(ifelse(is.na(NPS), 0, 1)),
workloadLow=sum(workloadLow,na.rm=TRUE),
workloadRight=sum(workloadRight, na.rm=TRUE),
workloadHigh=sum(workloadHigh, na.rm=TRUE),
workloadTooHigh=sum(workloadTooHigh, na.rm=TRUE),
workloadN=sum(ifelse(is.na(workloadN), 0, 1)),
leaveEOY=sum(leaveEOY,na.rm=TRUE),
leave1.2=sum(leave1.2,na.rm=TRUE),
leave3.4=sum(leave3.4,na.rm=TRUE),
leave5.6=sum(leave5.6,na.rm=TRUE),
leave.Many=sum(leave.Many,na.rm=TRUE),
leaveN=sum(ifelse(is.na(expectedYrsN), 0, 1)),
reasonCareer=sum(reasonCareer,na.rm=TRUE),
reasonLearn=sum(reasonLearn,na.rm=TRUE),
reasonLeader=sum(reasonLeader,na.rm=TRUE),
reasonLifestyle=sum(reasonLifestyle,na.rm=TRUE),
reasonMoving=sum(reasonMoving,na.rm=TRUE),
reasonAdvance=sum(reasonAdvance,na.rm=TRUE),
reasonSalary=sum(reasonSalary,na.rm=TRUE),
reasonStaffCulture=sum(reasonStaffCulture,na.rm=TRUE),
reasonStudentCulture=sum(reasonStudentCulture,na.rm=TRUE),
reasonOther=sum(reasonOther,na.rm=TRUE),
reasonN=sum(ifelse(is.na(whyLeaveN), 0, 1)),
desiredChangeAvg=sum(desiredChange, na.rm=TRUE),
desiredChangeN=sum(ifelse(is.na(desiredChange), 0, 1))
)