library(ggplot2)
library(readxl)
library(vembedr)
library(visdat)
library(tidyverse)
library(skimr)
library(palmerpenguins)
library(dplyr)
library(ggthemes)
library(ggridges)
library(janitor)
library(wesanderson)
library(here)
dplyr::glimpse() or skimr::skim() on the data. You should upload the data file into the data directory.
# Using read_excel here because this is what I am learning in this class; it is new to me. Previously I always converted xlsx files into csv and loaded the csv data.
# Read the first sheet of each workbook under the project's data folder,
# passing the string "NA" as the missing-value marker.
friends <- read_excel(
  path = here("data/friends_excel.xlsx"),
  sheet = 1,
  na = "NA"
)
friendsEMO <- read_excel(
  path = here("data/friends_emotions_excel.xlsx"),
  sheet = 1,
  na = "NA"
)
# Direct links from the provided website
#friends = readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-09-08/friends.csv')
#friends_emotions = readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-09-08/friends_emotions.csv')
#friendsCSV = read.csv("friends.csv")
#friendsEmoCSV = read.csv("friends_emotions.csv")
# In `friends`, text and speaker are character variables; season, episode,
# scene and utterance are numeric variables.
friends %>%
  dplyr::glimpse()
## Rows: 67,373
## Columns: 6
## $ text <chr> "There's nothing to tell! He's just some guy I work with!",…
## $ speaker <chr> "Monica Geller", "Joey Tribbiani", "Chandler Bing", "Phoebe…
## $ season <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ episode <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ scene <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ utterance <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, …
# Column-by-column summary of the emotions table.
skimr::skim(data = friendsEMO)
Name | friendsEMO |
Number of rows | 12606 |
Number of columns | 5 |
_______________________ | |
Column type frequency: | |
character | 1 |
numeric | 4 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
emotion | 0 | 1 | 3 | 8 | 0 | 7 | 0 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
season | 0 | 1 | 2.57 | 1.11 | 1 | 2 | 3 | 4 | 4 | ▆▆▁▇▇ |
episode | 0 | 1 | 13.19 | 7.20 | 1 | 7 | 14 | 20 | 25 | ▇▆▇▇▇ |
scene | 0 | 1 | 7.52 | 4.43 | 1 | 4 | 7 | 11 | 29 | ▇▇▂▁▁ |
utterance | 0 | 1 | 9.80 | 5.94 | 1 | 5 | 9 | 14 | 30 | ▇▇▅▂▁ |
# Preview the emotions table by printing the tibble itself. slice() without
# row indices is not a reliable preview: newer dplyr releases (1.1.0+) return
# zero rows for it, while printing shows the first ten rows in any version.
friendsEMO
## # A tibble: 12,606 x 5
## season episode scene utterance emotion
## <dbl> <dbl> <dbl> <dbl> <chr>
## 1 1 1 4 1 Mad
## 2 1 1 4 3 Neutral
## 3 1 1 4 4 Joyful
## 4 1 1 4 5 Neutral
## 5 1 1 4 6 Neutral
## 6 1 1 4 7 Neutral
## 7 1 1 4 8 Scared
## 8 1 1 4 10 Joyful
## 9 1 1 4 11 Joyful
## 10 1 1 4 12 Sad
## # … with 12,596 more rows
NA coded as something else, or it is multiple tables, please make some notes here about what you need to do before you start transforming the data in the next section.
case_when(), etc.
# I don't need to transform values, but I do need to merge the two main datasets because one has the emotional responses and the other has the character names.
# Attach the `friends` columns to every row of the emotions table. Rows are
# matched on season/episode/scene/utterance, and a left join keeps all rows
# of friendsEMO. Only the three columns used later are retained.
friendsMERGE <- friendsEMO %>%
  left_join(
    y = friends,
    by = c("season", "episode", "scene", "utterance")
  ) %>%
  select(season, emotion, speaker)
# Row-count comparison against an inner join, kept for reference:
#nrow(friendsMERGE)
#friendsMERGEinner = friendsEMO %>%
#  inner_join(y = friends,
#             by = c("season" = "season", "episode" = "episode",
#                    "scene" = "scene", "utterance" = "utterance"))
#nrow(friendsMERGEinner)
#!is.na(friendsMERGE) too many rows to check
# Visual overview of the merged table instead of checking is.na() row by row.
vis_dat(friendsMERGE)
left_join, inner_join, or right_join on these tables. No credit will be provided if you don’t.
glimpse(), skim() or head()
to illustrate your point.
# Preview the merged table by printing the tibble itself. slice() without row
# indices returns zero rows in newer dplyr releases (1.1.0+), while printing
# shows the first ten rows in any version.
friendsMERGE
## # A tibble: 12,606 x 3
## season emotion speaker
## <dbl> <chr> <chr>
## 1 1 Mad Ross Geller
## 2 1 Neutral Joey Tribbiani
## 3 1 Joyful Chandler Bing
## 4 1 Neutral Joey Tribbiani
## 5 1 Neutral Chandler Bing
## 6 1 Neutral Joey Tribbiani
## 7 1 Scared Chandler Bing
## 8 1 Joyful Joey Tribbiani
## 9 1 Joyful Chandler Bing
## 10 1 Sad Ross Geller
## # … with 12,596 more rows
group_by()/summarize() to make a summary of the data here. The summary should be relevant to your research question.
# Here I am filtering for the emotions I need and for the main six characters. We have emotions data only for four seasons.
# Count utterances per speaker, emotion and season, restricted to the four
# emotions of interest, the six main characters and seasons 1-4.
friendsFinal <- friendsMERGE %>%
  filter(
    emotion %in% c("Joyful", "Sad", "Powerful", "Mad"),
    speaker %in% c(
      "Chandler Bing", "Joey Tribbiani", "Ross Geller",
      "Phoebe Buffay", "Monica Geller", "Rachel Green"
    ),
    season %in% c(1, 2, 3, 4)
  ) %>%
  group_by(speaker, emotion, season) %>%
  summarize(EmoCount = n())
## `summarise()` has grouped output by 'speaker', 'emotion'. You can override using the `.groups` argument.
# Column types and leading values of the summary table.
friendsFinal %>%
  glimpse()
## Rows: 96
## Columns: 4
## Groups: speaker, emotion [24]
## $ speaker <chr> "Chandler Bing", "Chandler Bing", "Chandler Bing", "Chandler…
## $ emotion <chr> "Joyful", "Joyful", "Joyful", "Joyful", "Mad", "Mad", "Mad",…
## $ season <dbl> 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, …
## $ EmoCount <int> 96, 92, 103, 96, 57, 57, 63, 41, 20, 17, 39, 65, 17, 18, 18,…
# Preview the summary table by printing the tibble itself. slice() without
# row indices returns zero rows in newer dplyr releases (1.1.0+), while
# printing shows the first ten rows (and the grouping) in any version.
friendsFinal
## # A tibble: 96 x 4
## # Groups: speaker, emotion [24]
## speaker emotion season EmoCount
## <chr> <chr> <dbl> <int>
## 1 Chandler Bing Joyful 1 96
## 2 Chandler Bing Joyful 2 92
## 3 Chandler Bing Joyful 3 103
## 4 Chandler Bing Joyful 4 96
## 5 Chandler Bing Mad 1 57
## 6 Chandler Bing Mad 2 57
## 7 Chandler Bing Mad 3 63
## 8 Chandler Bing Mad 4 41
## 9 Chandler Bing Powerful 1 20
## 10 Chandler Bing Powerful 2 17
## # … with 86 more rows
# Bar plot of the four emotions of interest: one facet per emotion, one
# cluster of bars per character, and one bar per season within a cluster.
ggplot(
  data = friendsFinal,
  aes(
    x = speaker,
    y = EmoCount,
    fill = factor(season)
  )
) +
  # EmoCount already holds the counts, so draw the values as they are.
  geom_col(position = "dodge") +
  facet_wrap(vars(emotion)) +
  # Exactly one fill scale. An earlier scale_fill_discrete() was always
  # replaced by this one and only produced a "Scale for 'fill' is already
  # present" message. The colour scale was dropped as well because no colour
  # aesthetic is mapped, so it never affected the plot.
  scale_fill_viridis_d(name = "Seasons") +
  labs(
    x = "character Names",
    y = "Emotional Response Count",
    title = "Barplot of Emotions by Seasons"
  ) +
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = -30, hjust = 0),
    legend.position = "bottom"
  )
## Scale for 'fill' is already present. Adding another scale for 'fill', which
## will replace the existing scale.
# Bar plot of all emotional responses by season: one cluster of bars per
# season and one bar per character.
#
# friendsFinal has one row per speaker/emotion/season, so every
# season/speaker pair contributes four rows. Plotted directly with a dodged
# identity bar, those four rows share a dodge group and are drawn on top of
# each other instead of being added up. Summing them first makes each bar
# show the character's total count for the season.
friendsFinal %>%
  group_by(season, speaker) %>%
  summarize(EmoCount = sum(EmoCount), .groups = "drop") %>%
  ggplot(
    aes(
      x = season,
      y = EmoCount,
      fill = factor(speaker)
    )
  ) +
  geom_col(position = "dodge") +
  # Exactly one fill scale. An earlier scale_fill_discrete() was always
  # replaced by this one and only produced a "Scale for 'fill' is already
  # present" message. The colour scale was dropped as well because no colour
  # aesthetic is mapped, so it never affected the plot.
  scale_fill_viridis_d(name = "character Names") +
  labs(
    x = "Seasons",
    y = "Emotional Response Count",
    title = "Barplot of Emotions by Seasons"
  ) +
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = -30, hjust = 0),
    legend.position = "bottom"
  )
## Scale for 'fill' is already present. Adding another scale for 'fill', which
## will replace the existing scale.
# I followed the class notes and this website to write the functions https://www.statmethods.net/management/userfunctions.html
# gubla: keep the rows of `df` for the requested emotion(s) and season that
# were spoken by one of the six main characters.
#   df   - data frame with `emotion`, `season` and `speaker` columns
#   emo  - emotion label(s) to keep (matched with %in%)
#   seas - season to keep (compared with ==)
# Returns the filtered data frame.
gubla <- function(df, emo, seas) {
  df %>%
    filter(
      emotion %in% c(emo),
      season == seas,
      # Only the main six characters' emotional responses are wanted.
      speaker %in% c(
        "Chandler Bing", "Joey Tribbiani", "Ross Geller",
        "Phoebe Buffay", "Monica Geller", "Rachel Green"
      )
    )
}
# pipra: count the rows of `df` for each speaker.
#   df  - data frame with a `speaker` column
#   emo - single string used as the name of the count column. Defaults to
#         "emo", so calls that omit it get the same column name as before.
# Returns one row per speaker with the count stored in the column named by
# `emo`.
#
# Previously `emo` was ignored and the column was always literally called
# "emo", so joining two summaries (e.g. Joyful and Sad) produced clashing
# emo.x / emo.y columns. Naming the column from the argument lets each
# summary carry its own label.
pipra <- function(df, emo = "emo") {
  stopifnot(is.character(emo), length(emo) == 1L)
  df %>%
    group_by(speaker) %>%
    summarize("{emo}" := n())
}
# This is how I planned to use them
#JoyS1 = gubla(df = friendsMERGE, emo = "Joyful", seas = 1)
#SadS1 = gubla(df = friendsMERGE, emo = "Sad", seas = 1)
#JoyS1summary = pipra(df = JoyS1)
#SadS1summary = pipra(df = SadS1)
#MERGEemoS1 = JoyS1summary%>%
# left_join(y = SadS1summary,
# by = c("speaker" = "speaker"))
#slice(MERGEemoS1)