[R Text Mining] Sentiment Analysis and Word Cloud Plotting

Introduction

This post works through sentiment analysis of the Jane Austen novels using the sentiment lexicons available through the tidytext package, compares the lexicons, and finishes by drawing word clouds, following the examples in Text Mining with R.

1. Sentiment datasets
library(tidytext)
library(dplyr)

# The NRC lexicon tags each word with one or more emotion categories
get_sentiments("nrc") %>% head()
A tibble: 6 × 2
| word      | sentiment |
|---|---|
| <chr>     | <chr>     |
| abacus    | trust     |
| abandon   | fear      |
| abandon   | negative  |
| abandon   | sadness   |
| abandoned | anger     |
| abandoned | fear      |
get_sentiments("bing") %>% head()
A tibble: 6 × 2
| word       | sentiment |
|---|---|
| <chr>      | <chr>     |
| 2-faces    | negative  |
| abnormal   | negative  |
| abolish    | negative  |
| abominable | negative  |
| abominably | negative  |
| abominate  | negative  |
get_sentiments("afinn") %>% head()
A tibble: 6 × 2
| word       | value |
|---|---|
| <chr>      | <dbl> |
| abandon    | -2    |
| abandoned  | -2    |
| abandons   | -2    |
| abducted   | -2    |
| abduction  | -2    |
| abductions | -2    |
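The three lexicons encode sentiment differently: AFINN assigns each word an integer score (negative words get negative values), Bing simply labels words as positive or negative, and NRC tags words with one or more emotion categories. As a quick check of which categories NRC provides, a minimal sketch (note that recent tidytext versions may prompt to download the NRC and AFINN lexicons via the textdata package on first use):
# List the distinct NRC sentiment categories
get_sentiments("nrc") %>% distinct(sentiment)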
2. Sentiment analysis with inner join
library(janeaustenr)
library(dplyr)
library(stringr)
library(tidytext)

# Tidy the Austen novels: keep track of line numbers and chapters, then split
# the text into one word per row
tidy_books <- austen_books() %>%
  group_by(book) %>%
  mutate(
    linenumber = row_number(),
    chapter = cumsum(str_detect(text,
                                regex("^chapter [\\divxlc]",
                                      ignore_case = TRUE)))) %>%
  ungroup() %>%
  unnest_tokens(word, text)
tidy_books %>% head()
A tibble: 6 × 4
| book                | linenumber | chapter | word        |
|---|---|---|---|
| <fct>               | <int>      | <int>   | <chr>       |
| Sense & Sensibility | 1          | 0       | sense       |
| Sense & Sensibility | 1          | 0       | and         |
| Sense & Sensibility | 1          | 0       | sensibility |
| Sense & Sensibility | 3          | 0       | by          |
| Sense & Sensibility | 3          | 0       | jane        |
| Sense & Sensibility | 3          | 0       | austen      |
# NRC "joy" words that appear most often in Emma
nrc_joy <- get_sentiments("nrc") %>%
  filter(sentiment == "joy")

tidy_books %>%
  filter(book == "Emma") %>%
  inner_join(nrc_joy) %>%
  count(word, sort = TRUE) %>%
  head()
A tibble: 6 × 2
| word   | n     |
|---|---|
| <chr>  | <int> |
| good   | 359   |
| friend | 166   |
| hope   | 143   |
| happy  | 125   |
| love   | 117   |
| deal   | 92    |
library(tidyr)

# Net Bing sentiment (positive minus negative word counts) per 80-line block
jane_austen_sentiment <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(book, index = linenumber %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment = positive - negative)
jane_austen_sentiment %>% head()
A tibble: 6 × 5
| book                | index | negative | positive | sentiment |
|---|---|---|---|---|
| <fct>               | <dbl> | <int>    | <int>    | <int>     |
| Sense & Sensibility | 0     | 16       | 32       | 16        |
| Sense & Sensibility | 1     | 19       | 53       | 34        |
| Sense & Sensibility | 2     | 12       | 31       | 19        |
| Sense & Sensibility | 3     | 15       | 31       | 16        |
| Sense & Sensibility | 4     | 16       | 34       | 18        |
| Sense & Sensibility | 5     | 16       | 51       | 35        |
library(ggplot2)

# Sentiment trajectory through each novel
ggplot(jane_austen_sentiment, aes(index, sentiment, fill = book)) +
  geom_col() +
  facet_wrap(~book, ncol = 2, scales = "free_x")
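The 80-line index is just one convenient way to chunk the text; the chapter column computed earlier can be used the same way. A minimal sketch (not part of the original walkthrough) of net Bing sentiment per chapter:
# Net sentiment per chapter instead of per 80-line block
tidy_books %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(book, chapter, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment = positive - negative) %>%
  head()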

3. Comparing the three sentiment lexicons
pride_prejudice <- tidy_books %>%
  filter(book == "Pride & Prejudice")
pride_prejudice %>% head()
A tibble: 6 × 4
| book              | linenumber | chapter | word      |
|---|---|---|---|
| <fct>             | <int>      | <int>   | <chr>     |
| Pride & Prejudice | 1          | 0       | pride     |
| Pride & Prejudice | 1          | 0       | and       |
| Pride & Prejudice | 1          | 0       | prejudice |
| Pride & Prejudice | 3          | 0       | by        |
| Pride & Prejudice | 3          | 0       | jane      |
| Pride & Prejudice | 3          | 0       | austen    |
# AFINN: sum the numeric scores per 80-line block
afinn <- pride_prejudice %>%
  inner_join(get_sentiments("afinn")) %>%
  group_by(index = linenumber %/% 80) %>%
  summarise(sentiment = sum(value)) %>%
  mutate(method = "AFINN")

# Bing and NRC: count positive and negative words per block; listing method in
# count() keeps the label in the result
bing <- pride_prejudice %>%
  inner_join(get_sentiments("bing")) %>%
  mutate(method = "Bing") %>%
  count(method, index = linenumber %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment,
              values_from = n,
              values_fill = 0) %>%
  mutate(sentiment = positive - negative)

nrc <- pride_prejudice %>%
  inner_join(get_sentiments("nrc") %>%
               filter(sentiment %in% c("positive", "negative"))) %>%
  mutate(method = "NRC") %>%
  count(method, index = linenumber %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment,
              values_from = n,
              values_fill = 0) %>%
  mutate(sentiment = positive - negative)

# Combine the three estimates and plot them on a shared index
bind_rows(afinn, bing, nrc) %>%
  ggplot(aes(index, sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~method, ncol = 1, scales = "free_y")

get_sentiments("nrc") %>%
  filter(sentiment %in% c("positive", "negative")) %>%
  count(sentiment)
A tibble: 2 × 2
| sentiment | n     |
|---|---|
| <chr>     | <int> |
| negative  | 3316  |
| positive  | 2308  |
get_sentiments("bing") %>%
  count(sentiment)
A tibble: 2 × 2
| sentiment | n     |
|---|---|
| <chr>     | <int> |
| negative  | 4781  |
| positive  | 2005  |
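Both lexicons contain more negative than positive words, but the proportion of negative words is higher in Bing than in NRC, which is part of why the NRC trajectory tends to sit higher in the plot above. A minimal sketch comparing the shares:
# Share of negative vs. positive words in each lexicon
bind_rows(
  get_sentiments("nrc") %>%
    filter(sentiment %in% c("positive", "negative")) %>%
    count(sentiment) %>%
    mutate(lexicon = "nrc"),
  get_sentiments("bing") %>%
    count(sentiment) %>%
    mutate(lexicon = "bing")) %>%
  group_by(lexicon) %>%
  mutate(share = n / sum(n))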
4. Most common positive and negative words
bing_word_count <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()
bing_word_count %>% head()
A tibble: 6 × 3
| word   | sentiment | n     |
|---|---|---|
| <chr>  | <chr>     | <int> |
| miss   | negative  | 1855  |
| well   | positive  | 1523  |
| good   | positive  | 1380  |
| great  | positive  | 981   |
| like   | positive  | 725   |
| better | positive  | 639   |
# Top 10 words contributing to each sentiment
bing_word_count %>%
  group_by(sentiment) %>%
  slice_max(n, n = 10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(x = "Contribution to sentiment",
       y = NULL)

The word "miss" tops the negative list, but in Austen's novels it is mostly a title for young women rather than an expression of negative sentiment, so it can be added to a custom stop-word list:
custom_stop_words <- bind_rows(tibble(word = c("miss"),
                                      lexicon = c("custom")),
                               stop_words)
custom_stop_words %>% head()
A tibble: 6 × 2
| word  | lexicon |
|---|---|
| <chr> | <chr>   |
| miss  | custom  |
| a     | SMART   |
| a's   | SMART   |
| able  | SMART   |
| about | SMART   |
| above | SMART   |
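With the custom stop-word list in place, "miss" can be filtered out before rebuilding the contribution plot. A minimal sketch reusing the objects defined above:
# Re-plot word contributions after dropping the custom stop words
bing_word_count %>%
  anti_join(custom_stop_words, by = "word") %>%
  group_by(sentiment) %>%
  slice_max(n, n = 10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(x = "Contribution to sentiment", y = NULL)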
5. Word clouds
library(wordcloud)

# Word cloud of the most common words, after removing stop words
tidy_books %>%
  anti_join(stop_words) %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 100))
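The cloud layout has a random component; setting a seed and using random.order = FALSE (so the most frequent words sit in the centre) makes the result reproducible and easier to read. A minimal sketch of the same cloud with those options:
# Reproducible word cloud with the most frequent words in the centre
set.seed(1234)
tidy_books %>%
  anti_join(stop_words) %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 100, random.order = FALSE))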

library(reshape2)

# acast() reshapes the tidy counts into a matrix (words in rows, one column per
# sentiment), which comparison.cloud() expects; word sizes are comparable only
# within a sentiment, not across the two halves of the cloud
tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("blue", "red"),
                   max.words = 100)

6. Summary
This post covered the three tidytext sentiment lexicons, sentiment analysis of the Jane Austen novels via inner joins, a comparison of the lexicons on Pride & Prejudice, the most common positive and negative words, and word cloud visualization.
Reference: Text Mining with R (Julia Silge & David Robinson)