library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(stringr)
library(tidyr)
library(tidytext)
library(textdata)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ purrr     1.0.4
## ✔ ggplot2   4.0.0     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(stringr)
library(purrr)

# Load the raw lyrics file. Per the read_csv() summary it has 12 rows and the
# columns track_title, lyric (character) and track_n, line (numeric).
# NOTE(review): presumably one row per track -- confirm against the CSV.
df <- read_csv("songs_about_jane.csv")
## Rows: 12 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): track_title, lyric
## dbl (2): track_n, line
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#' Split a block of lyric text into individual lines.
#'
#' A new line is assumed to start at every capital letter that follows
#' whitespace. This is a heuristic: it also splits before mid-line capitals
#' such as "I" or proper nouns.
#'
#' @param txt A single string of lyric text. `NULL`, zero-length, `NA`, and
#'   empty-string inputs are all treated as "no lyrics".
#' @return A character vector of trimmed, non-empty lines; `character(0)` when
#'   there is nothing to split.
split_lyric_into_lines <- function(txt) {
  # Check the length first: is.na() and `==` both yield logical(0) for NULL or
  # zero-length input, which would make the `if` condition fail with an error.
  if (length(txt) == 0L || is.na(txt) || txt == "") {
    return(character(0))
  }

  # Split on whitespace that is immediately followed by a capital letter; the
  # lookahead leaves the capital at the start of the next piece.
  parts <- str_split(txt, "\\s+(?=[A-Z])")[[1]]

  parts <- str_trim(parts)
  parts[parts != ""]
}


# Expand each track's lyric text into one row per lyric line and number the
# lines within each track in order of appearance.
df_lines <- df %>%
  mutate(line_vec = map(lyric, split_lyric_into_lines)) %>%
  select(track_title, track_n, line_vec) %>%
  unnest_longer(line_vec, values_to = "lyric") %>%
  mutate(line = row_number(), .by = c(track_title, track_n)) %>%
  select(track_title, track_n, line, lyric)

# Write the line-level table out as a new CSV file.
write_csv(df_lines, "songs_about_jane_lines.csv")

Lyric Sentiment Analysis

  1. Utilize sentiment analysis to study a textual document in a manner you find suitable. Choose a lexicon library to assess the sentiment of the dataset (e.g., determining whether it is positive, negative, or joyous). Present your findings using suitable charts, and explain the results in 1-2 paragraphs within the knit HTML output, including relevant code and visualizations.

Overall, I think the sentiment analysis of Maroon 5’s Songs About Jane worked pretty well. One thing the analysis excelled at was recognizing which words contributed to negative sentiment and which contributed to positive sentiment. Looking at the word cloud, it’s clear that positive and negative words were separated well based on their sentiment. I also think the sentiment analysis worked well when looking at the plot that shows the sentiment over time for each song in the album. While each song in Songs About Jane is pretty doomy and gloomy, some songs, like Tangled and Not Coming Home, are more negative and register more negatively, while songs like She Will Be Loved are more positive and register more positively.

# Load the "_clean" line-level lyrics file used for the sentiment analysis.
songsaboutjane <- read.csv("songs_about_jane_lines_clean.csv")

# Keep only the NRC lexicon entries tagged with the "anger" sentiment.
nrc_anger <- filter(get_sentiments("nrc"), sentiment == "anger")
nrc_anger
## # A tibble: 1,245 × 2
##    word        sentiment
##    <chr>       <chr>    
##  1 abandoned   anger    
##  2 abandonment anger    
##  3 abhor       anger    
##  4 abhorrent   anger    
##  5 abolish     anger    
##  6 abomination anger    
##  7 abuse       anger    
##  8 accursed    anger    
##  9 accusation  anger    
## 10 accused     anger    
## # ℹ 1,235 more rows
library(stringr)
## we need to make sure that the lyrics are characters before tokenising
songsaboutjane$lyric <- as.character(songsaboutjane$lyric)

# Tokenise the lyrics into one row per word. No grouping is needed: the other
# columns (track_title, line) are used on the word rows in the steps below.
tidy_song <- songsaboutjane %>%
  unnest_tokens(word, lyric)

# Count the anger-tagged words that occur in "Harder to Breathe".
tidy_song %>%
  filter(track_title == "Harder to Breathe") %>%
  inner_join(nrc_anger) %>%
  count(word, sort = TRUE)
## Joining with `by = join_by(word)`
## # A tibble: 2 × 2
##   word        n
##   <chr>   <int>
## 1 painful     1
## 2 sting       1
# Net Bing sentiment (positive minus negative word count) for every line of
# every track; `index` is the `line` column.
song_sentiment <- tidy_song %>%
  inner_join(get_sentiments("bing")) %>%
  count(track_title, index = line, sentiment) %>%
  # One column per sentiment, 0 where a line has no words of that polarity.
  # names_sort = TRUE gives alphabetical columns (negative, positive).
  pivot_wider(
    names_from = sentiment,
    values_from = n,
    values_fill = 0,
    names_sort = TRUE
  ) %>%
  mutate(sentiment = positive - negative)
## Joining with `by = join_by(word)`
# One panel per track: net sentiment of each lyric line, by line index.
ggplot(
  data = song_sentiment,
  mapping = aes(x = index, y = sentiment, fill = track_title)
) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(track_title))

# AFINN sentiment for "Harder to Breathe": the score of a line is the sum of
# the `value` column over the words on that line.
afinn <- tidy_song %>%
  filter(track_title == "Harder to Breathe") %>%
  inner_join(get_sentiments("afinn")) %>%
  group_by(index = line) %>%
  summarise(sentiment = sum(value), .groups = "drop") %>%
  mutate(method = "AFINN")
## Joining with `by = join_by(word)`
# Net sentiment per line of "Harder to Breathe" under the Bing lexicon and
# under the positive/negative subset of the NRC lexicon.
bing_and_nrc <- bind_rows(
  tidy_song %>%
    filter(track_title == "Harder to Breathe") %>%
    inner_join(get_sentiments("bing")) %>%
    mutate(method = "Bing et al."),
  tidy_song %>%
    filter(track_title == "Harder to Breathe") %>%
    inner_join(
      get_sentiments("nrc") %>%
        filter(sentiment %in% c("positive", "negative"))
    ) %>%
    mutate(method = "NRC")
) %>%
  count(method, index = line, sentiment) %>%
  # One column per sentiment, 0 where a line has no words of that polarity.
  # names_sort = TRUE gives alphabetical columns (negative, positive).
  pivot_wider(
    names_from = sentiment,
    values_from = n,
    values_fill = 0,
    names_sort = TRUE
  ) %>%
  mutate(sentiment = positive - negative)
## Joining with `by = join_by(word)`
## Joining with `by = join_by(word)`
# Stack the AFINN, Bing and NRC results and draw one panel per method, each
# with its own y scale.
bind_rows(afinn, bing_and_nrc) %>%
  ggplot(mapping = aes(x = index, y = sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(method), ncol = 1, scales = "free_y")

# Number of NRC lexicon words tagged positive vs. negative.
count(
  filter(get_sentiments("nrc"), sentiment %in% c("positive", "negative")),
  sentiment
)
## # A tibble: 2 × 2
##   sentiment     n
##   <chr>     <int>
## 1 negative   3316
## 2 positive   2308
# Number of Bing lexicon words per sentiment.
count(get_sentiments("bing"), sentiment)
## # A tibble: 2 × 2
##   sentiment     n
##   <chr>     <int>
## 1 negative   4781
## 2 positive   2005
# Tally how often each Bing-lexicon word occurs across all tracks, most
# frequent first. tidy_song is ungrouped, so count() returns ungrouped data
# and no ungroup() is needed.
bing_word_counts <- tidy_song %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE)
## Joining with `by = join_by(word)`
bing_word_counts
## # A tibble: 101 × 3
##    word     sentiment     n
##    <chr>    <chr>     <int>
##  1 love     positive     20
##  2 hard     negative     11
##  3 loved    positive     10
##  4 bad      negative      9
##  5 like     positive      9
##  6 better   positive      7
##  7 crazy    negative      7
##  8 shameful negative      7
##  9 breaking negative      6
## 10 broken   negative      6
## # ℹ 91 more rows
# Bar chart of the ten most frequent words for each sentiment.
bing_word_counts %>%
  group_by(sentiment) %>%
  # Rank explicitly by the count column `n`; ties at the cut-off are kept
  # (with_ties defaults to TRUE), so a group can show more than ten words.
  slice_max(order_by = n, n = 10) %>%
  ungroup() %>%
  # Order the factor levels by count so the bars are sorted within each panel.
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(
    y = "Contribution to sentiment",
    x = NULL
  ) +
  coord_flip()
## Selecting by n

library(wordcloud)
## Loading required package: RColorBrewer
library(RColorBrewer)
# Word cloud of at most 100 words, after dropping the stop_words entries.
tidy_song %>%
  anti_join(stop_words) %>%
  count(word) %>%
  with(wordcloud(words = word, freq = n, max.words = 100))
## Joining with `by = join_by(word)`

library(reshape2)
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
# Comparison cloud of Bing-lexicon words, split by sentiment.
tidy_song %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  # Cast to a word-by-sentiment matrix of counts, 0 where a pair is absent.
  acast(formula = word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(
    colors = c("#ff5555", "#33DD33"),
    max.words = 100
  )
## Joining with `by = join_by(word)`

library(sentimentr)

# Sentence-level sentiment with sentimentr, averaged per track and line.
song_sentiment_sent <- songsaboutjane %>%
  get_sentences() %>%
  sentiment_by(by = c("track_title", "line")) %>%
  as.data.frame()

# One panel per track showing the average sentiment of each line.
ggplot(
  data = song_sentiment_sent,
  mapping = aes(x = line, y = ave_sentiment, fill = track_title)
) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(track_title))

AI for text classification and sentiment

Finally, we will conduct a structured, AI-based analysis of a TikTok dataset. The data include climate-related TikToks for which we aim to study not just sentiment, but also categories such as climate pessimism. While we can access an AI API from R, it requires a paid license. For accessibility, I designed this exercise in UMich ChatGPT, but it works similarly with other AIs. It does require copying and pasting prompts into the interface, and we are not able to fine-tune the model. In essence, we use a zero-shot model to classify content and return sentiment in a structured format (though a few examples help “nudge” performance and can be tweaked).

#head(read.csv("TikTok_Pessimism.csv"))
Label: A
Rationale: accurately describes urban heat island effect and benefits of tree cover
Evidence spans:
  • reduce the urban heat island effect
  • tree coverage is vital
  • reduce heat-related illnesses
Label: C
Rationale: frames climate change as exaggerated doom or bullshit
Evidence spans:
  • countdown to when global warming is going to kill us or some bullshit
Label: A
Rationale: affirms climate change and links agricultural emissions to greenhouse gases
Evidence spans:
  • important emitters of greenhouse gases
  • directly contribute to climate change
Label: A
Rationale: accepts climate change impacts and criticizes political rejection of climate action
Evidence spans:
  • DeSantis rejects climate change
  • war on climate change
Label: A
Rationale: describes climate change, melting ice caps, and environmental degradation as real threats
Evidence spans:
  • climate change are real, tangible threats
  • ice caps are melting
Label: A
Rationale: supports climate science and critiques carbon credit misuse enabling continued emissions
Evidence spans:
  • undermining the overall goal of reducing that carbon footprint
  • current issues of climate change
Label: A
Rationale: affirms livestock emissions and advocates plant-based solutions to climate change
Evidence spans:
  • not looking at the long-term impacts for methane
  • not a solution to the climate issue
Label: A
Rationale: states climate change has economic consequences such as rising insurance costs
Evidence spans:
  • Climate change is driving Texas home insurance through the roof
Label: A
Rationale: accurately describes climate tech as technologies addressing climate change
Evidence spans:
  • technological solutions for climate change
  • reduce greenhouse gas emissions
Label: A
Rationale: criticizes ecological harm and highlights impacts on ecosystems
Evidence spans:
  • ecological catastrophe
  • native vegetation
Label: A
Rationale: supports climate policy and ties worsening disasters to climate inaction
Evidence spans:
  • climate measures
  • problem is going to continue to get worse
Label: A
Rationale: endorses climate science and warns of severe warming from CO2 rise
Evidence spans:
  • CO2 emissions are cooking the planet
  • global overheating is not just a theory
Label: A
Rationale: explains biogas as a renewable energy alternative reducing emissions
Evidence spans:
  • biogas is produced
  • energy services
Label: A
Rationale: describes observable climate-related environmental changes
Evidence spans:
  • seeing changes to the planet
  • environment is super important
Label: A
Rationale: promotes climate solutions from Project Drawdown to reduce atmospheric greenhouse gases
Evidence spans:
  • start to solve climate change
  • reduce impacts of human activities
Label: E
Rationale: historical account of a volcanic eruption; not about modern climate change
Evidence spans:
  • 1815
  • volcanic eruption
Label: C
Rationale: minimizes ocean acidification by reframing it as merely reduced alkalinity
Evidence spans:
  • slight decrease in alkalinity
  • instead of saying the oceans are acidifying
Label: A
Rationale: supports emissions reductions and sustainable materials as climate solutions
Evidence spans:
  • reduce CO2 emissions
  • plastic waste
Label: A
Rationale: advocates for clean air, safe water, and reduced extreme-heat deaths
Evidence spans:
  • freedom to breathe clean air
  • extreme heat

AI Tik-Tok Sentiment Analysis

  1. Perform an AI-based analysis of your data and compare these results with those from the sentiment analysis. Reflect on how each method handles ambiguous text, providing examples of challenges related to subjectivity, tone, context, polarity, irony, sarcasm, comparisons, and neutral language (1-2 paragraphs; appropriate code and charts)

For the most part, the sentiment analysis worked. One thing it struggled with was that whenever someone accepts and recognizes climate change but talks about it with negative language, it tended to classify that sentiment and comment as denial. This makes sense, as irony, sarcasm, and context are hard for AI models to pick up on.

Scheme I followed

Below is a clear schema of classes, a sentiment category, and a few examples to guide the model, all of which we provide to the model. We also request a brief rationale, which helps us understand how the model made its decision. Copy and paste the full prompt into the UMich ChatGPT text box to start.

System: You are a rater that classifies short statements about climate change using a fixed schema. Follow the instructions exactly and return valid JSON only.

User: Classify the following statement.

Schema (JSON):

{
  "label": "A | B | C | D | E",
  "rationale": "brief reason citing key phrases",
  "evidence_spans": ["exact substrings that triggered the label"]
}

Definitions:
A = Accepts climate science …
B = Legitimate uncertainty …
C = Skeptical rhetoric …
D = Outright denial …
E = Off-topic/other …

Examples:
“CO₂ traps heat; we need to cut emissions.” → {"label":"A","rationale":"endorses greenhouse effect","evidence_spans":["cut emissions"]}
“I’m not sure how much is human-caused—any sources?” → {"label":"B","rationale":"good-faith uncertainty","evidence_spans":["any sources"]}
“It’s a hoax by scientists for grant money.” → {"label":"C","rationale":"conspiracy frame","evidence_spans":["hoax","grant money"]}
“Climate change isn’t real.” → {"label":"D","rationale":"explicit denial","evidence_spans":["isn’t real"]}

Now, copy one short statement or sentence from the TikTok dataset (from the text column in your CSV) and paste it directly into the UMich ChatGPT text box after the structured prompt. It should return a response in JSON format, similar to this:

{"label":"A","sentiment":"positive","rationale":"supports climate science and highlights positive impacts of urban tree cover on environment and health","evidence_spans":["reduce the urban heat island effect","tree coverage is vital in sheltering communities from heat waves","helps to reduce heat-related illnesses and mortality","reduce erosion and filter rainwater","reduce flooding"]}

You can also copy and paste multiple lines at once—the model will process them in a single run. When you are happy with the classifications, ask it to return a CSV of the results. Then copy the CSV text into Google Sheets or Excel and save it.