[R Text Mining] Sentiment Analysis and Word Cloud Plotting

Introduction

This post works through sentiment analysis of the Jane Austen novels using the sentiment lexicons available through the tidytext package, compares the lexicons, and finishes by drawing word clouds, following the examples in Text Mining with R.

1. Sentiment datasets
library(tidytext)
library(dplyr)

# The NRC lexicon tags each word with one or more emotion categories
get_sentiments("nrc") %>% head()
A tibble: 6 × 2
| word      | sentiment |
|---|---|
| <chr>     | <chr>     |
| abacus    | trust     |
| abandon   | fear      |
| abandon   | negative  |
| abandon   | sadness   |
| abandoned | anger     |
| abandoned | fear      |
get_sentiments("bing") %>% head()
A tibble: 6 × 2
| word       | sentiment |
|---|---|
| <chr>      | <chr>     |
| 2-faces    | negative  |
| abnormal   | negative  |
| abolish    | negative  |
| abominable | negative  |
| abominably | negative  |
| abominate  | negative  |
get_sentiments("afinn") %>% head()
A tibble: 6 × 2
| word       | value |
|---|---|
| <chr>      | <dbl> |
| abandon    | -2    |
| abandoned  | -2    |
| abandons   | -2    |
| abducted   | -2    |
| abduction  | -2    |
| abductions | -2    |
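The three lexicons encode sentiment differently: AFINN assigns each word an integer score (negative words get negative values), Bing simply labels words as positive or negative, and NRC tags words with one or more emotion categories. As a quick check of which categories NRC provides, a minimal sketch (note that recent tidytext versions may prompt to download the NRC and AFINN lexicons via the textdata package on first use):
# List the distinct NRC sentiment categories
get_sentiments("nrc") %>% distinct(sentiment)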
2. Sentiment analysis with inner join
library(janeaustenr)
library(dplyr)
library(stringr)
library(tidytext)

# Tidy the Austen novels: keep track of line numbers and chapters, then split
# the text into one word per row
tidy_books <- austen_books() %>%
  group_by(book) %>%
  mutate(
    linenumber = row_number(),
    chapter = cumsum(str_detect(text,
                                regex("^chapter [\\divxlc]",
                                      ignore_case = TRUE)))) %>%
  ungroup() %>%
  unnest_tokens(word, text)
tidy_books %>% head()
A tibble: 6 × 4
| book                | linenumber | chapter | word        |
|---|---|---|---|
| <fct>               | <int>      | <int>   | <chr>       |
| Sense & Sensibility | 1          | 0       | sense       |
| Sense & Sensibility | 1          | 0       | and         |
| Sense & Sensibility | 1          | 0       | sensibility |
| Sense & Sensibility | 3          | 0       | by          |
| Sense & Sensibility | 3          | 0       | jane        |
| Sense & Sensibility | 3          | 0       | austen      |
# NRC "joy" words that appear most often in Emma
nrc_joy <- get_sentiments("nrc") %>%
  filter(sentiment == "joy")

tidy_books %>%
  filter(book == "Emma") %>%
  inner_join(nrc_joy) %>%
  count(word, sort = TRUE) %>%
  head()
A tibble: 6 × 2
| word   | n     |
|---|---|
| <chr>  | <int> |
| good   | 359   |
| friend | 166   |
| hope   | 143   |
| happy  | 125   |
| love   | 117   |
| deal   | 92    |
library(tidyr)

# Net Bing sentiment (positive minus negative word counts) per 80-line block
jane_austen_sentiment <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(book, index = linenumber %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment = positive - negative)
jane_austen_sentiment %>% head()
A tibble: 6 × 5
| book                | index | negative | positive | sentiment |
|---|---|---|---|---|
| <fct>               | <dbl> | <int>    | <int>    | <int>     |
| Sense & Sensibility | 0     | 16       | 32       | 16        |
| Sense & Sensibility | 1     | 19       | 53       | 34        |
| Sense & Sensibility | 2     | 12       | 31       | 19        |
| Sense & Sensibility | 3     | 15       | 31       | 16        |
| Sense & Sensibility | 4     | 16       | 34       | 18        |
| Sense & Sensibility | 5     | 16       | 51       | 35        |
library(ggplot2)

# Sentiment trajectory through each novel
ggplot(jane_austen_sentiment, aes(index, sentiment, fill = book)) +
  geom_col() +
  facet_wrap(~book, ncol = 2, scales = "free_x")
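The 80-line index is just one convenient way to chunk the text; the chapter column computed earlier can be used the same way. A minimal sketch (not part of the original walkthrough) of net Bing sentiment per chapter:
# Net sentiment per chapter instead of per 80-line block
tidy_books %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(book, chapter, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment = positive - negative) %>%
  head()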

3. Comparing the three sentiment lexicons
pride_prejudice <- tidy_books %>%
  filter(book == "Pride & Prejudice")
pride_prejudice %>% head()
A tibble: 6 × 4
| book              | linenumber | chapter | word      |
|---|---|---|---|
| <fct>             | <int>      | <int>   | <chr>     |
| Pride & Prejudice | 1          | 0       | pride     |
| Pride & Prejudice | 1          | 0       | and       |
| Pride & Prejudice | 1          | 0       | prejudice |
| Pride & Prejudice | 3          | 0       | by        |
| Pride & Prejudice | 3          | 0       | jane      |
| Pride & Prejudice | 3          | 0       | austen    |
# AFINN: sum the numeric scores per 80-line block
afinn <- pride_prejudice %>%
  inner_join(get_sentiments("afinn")) %>%
  group_by(index = linenumber %/% 80) %>%
  summarise(sentiment = sum(value)) %>%
  mutate(method = "AFINN")

# Bing and NRC: count positive and negative words per block; listing method in
# count() keeps the label in the result
bing <- pride_prejudice %>%
  inner_join(get_sentiments("bing")) %>%
  mutate(method = "Bing") %>%
  count(method, index = linenumber %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment,
              values_from = n,
              values_fill = 0) %>%
  mutate(sentiment = positive - negative)

nrc <- pride_prejudice %>%
  inner_join(get_sentiments("nrc") %>%
               filter(sentiment %in% c("positive", "negative"))) %>%
  mutate(method = "NRC") %>%
  count(method, index = linenumber %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment,
              values_from = n,
              values_fill = 0) %>%
  mutate(sentiment = positive - negative)

# Combine the three estimates and plot them on a shared index
bind_rows(afinn, bing, nrc) %>%
  ggplot(aes(index, sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~method, ncol = 1, scales = "free_y")

get_sentiments("nrc") %>%
  filter(sentiment %in% c("positive", "negative")) %>%
  count(sentiment)
A tibble: 2 × 2
| sentiment | n     |
|---|---|
| <chr>     | <int> |
| negative  | 3316  |
| positive  | 2308  |
get_sentiments("bing") %>%
  count(sentiment)
A tibble: 2 × 2
| sentiment | n     |
|---|---|
| <chr>     | <int> |
| negative  | 4781  |
| positive  | 2005  |
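Both lexicons contain more negative than positive words, but the proportion of negative words is higher in Bing than in NRC, which is part of why the NRC trajectory tends to sit higher in the plot above. A minimal sketch comparing the shares:
# Share of negative vs. positive words in each lexicon
bind_rows(
  get_sentiments("nrc") %>%
    filter(sentiment %in% c("positive", "negative")) %>%
    count(sentiment) %>%
    mutate(lexicon = "nrc"),
  get_sentiments("bing") %>%
    count(sentiment) %>%
    mutate(lexicon = "bing")) %>%
  group_by(lexicon) %>%
  mutate(share = n / sum(n))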
4. Most common positive and negative words
bing_word_count <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()
bing_word_count %>% head()
A tibble: 6 × 3
| word   | sentiment | n     |
|---|---|---|
| <chr>  | <chr>     | <int> |
| miss   | negative  | 1855  |
| well   | positive  | 1523  |
| good   | positive  | 1380  |
| great  | positive  | 981   |
| like   | positive  | 725   |
| better | positive  | 639   |
# Top 10 words contributing to each sentiment
bing_word_count %>%
  group_by(sentiment) %>%
  slice_max(n, n = 10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(x = "Contribution to sentiment",
       y = NULL)

The word "miss" tops the negative list, but in Austen's novels it is mostly a title for young women rather than an expression of negative sentiment, so it can be added to a custom stop-word list:
custom_stop_words <- bind_rows(tibble(word = c("miss"),
                                      lexicon = c("custom")),
                               stop_words)
custom_stop_words %>% head()
A tibble: 6 × 2
| word  | lexicon |
|---|---|
| <chr> | <chr>   |
| miss  | custom  |
| a     | SMART   |
| a's   | SMART   |
| able  | SMART   |
| about | SMART   |
| above | SMART   |
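With the custom stop-word list in place, "miss" can be filtered out before rebuilding the contribution plot. A minimal sketch reusing the objects defined above:
# Re-plot word contributions after dropping the custom stop words
bing_word_count %>%
  anti_join(custom_stop_words, by = "word") %>%
  group_by(sentiment) %>%
  slice_max(n, n = 10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(x = "Contribution to sentiment", y = NULL)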
5. Word clouds
library(wordcloud)

# Word cloud of the most common words, after removing stop words
tidy_books %>%
  anti_join(stop_words) %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 100))
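The cloud layout has a random component; setting a seed and using random.order = FALSE (so the most frequent words sit in the centre) makes the result reproducible and easier to read. A minimal sketch of the same cloud with those options:
# Reproducible word cloud with the most frequent words in the centre
set.seed(1234)
tidy_books %>%
  anti_join(stop_words) %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 100, random.order = FALSE))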

library(reshape2)

# acast() reshapes the tidy counts into a matrix (words in rows, one column per
# sentiment), which comparison.cloud() expects; word sizes are comparable only
# within a sentiment, not across the two halves of the cloud
tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("blue", "red"),
                   max.words = 100)

6. Summary
This post covered the three tidytext sentiment lexicons, sentiment analysis of the Jane Austen novels via inner joins, a comparison of the lexicons on Pride & Prejudice, the most common positive and negative words, and word cloud visualization.
Reference: Text Mining with R (Julia Silge & David Robinson)