[R Text Mining]: Sentiment Analysis and Word Cloud Plotting


Introduction

This post works through the sentiment-analysis chapter of Text Mining with R using the tidytext package: scoring Jane Austen's novels with several sentiment lexicons and finishing with word cloud visualizations.

1. Sentiment Lexicons

  • AFINN: assigns each word an integer score from -5 (most negative) to +5 (most positive)
  • bing: labels words simply as positive or negative
  • nrc: tags words as positive/negative and with eight basic emotions (anger, anticipation, disgust, fear, joy, sadness, surprise, trust); see the setup note just after this list for how AFINN and NRC are fetched
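A quick setup note (my addition, not from the original post): in recent versions of tidytext, the AFINN and NRC lexicons are distributed through the textdata package and are downloaded the first time get_sentiments() requests them, so it helps to have both packages installed:

install.packages(c("tidytext", "textdata")) # textdata supplies the AFINN and NRC lexicons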
library(tidytext)
library(dplyr)
get_sentiments("nrc") %>% head()
A tibble: 6 × 2
  word       sentiment
  <chr>      <chr>
  abacus     trust
  abandon    fear
  abandon    negative
  abandon    sadness
  abandoned  anger
  abandoned  fear
get_sentiments("bing") %>% head()
A tibble: 6 × 2
  word        sentiment
  <chr>       <chr>
  2-faces     negative
  abnormal    negative
  abolish     negative
  abominable  negative
  abominably  negative
  abominate   negative
get_sentiments("afinn") %>% head()
A tibble: 6 × 2
  word        value
  <chr>       <dbl>
  abandon        -2
  abandoned      -2
  abandons       -2
  abducted       -2
  abduction      -2
  abductions     -2
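As a quick illustration (my addition, assuming the word "happy" is present in all three lexicons, which it is in the versions shipped via tidytext/textdata), you can look the same word up in each lexicon to see how differently they encode sentiment:

get_sentiments("afinn") %>% filter(word == "happy") # a single numeric score
get_sentiments("bing") %>% filter(word == "happy")  # a single positive/negative label
get_sentiments("nrc") %>% filter(word == "happy")   # one row per matching emotion category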

2. Sentiment Analysis with an Inner Join

library(janeaustenr)
library(dplyr)
library(stringr)

tidy_books <- austen_books() %>%
  group_by(book) %>%
  mutate(
    linenumber = row_number(),
    chapter = cumsum(str_detect(text, 
                                regex("^chapter [\\divxlc]", 
                                      ignore_case = TRUE)))) %>% # detect chapter headings with a regular expression
  ungroup() %>%
  unnest_tokens(word, text) # tokenize into one word per row
tidy_books %>% head()
A tibble: 6 × 4
  book                 linenumber  chapter  word
  <fct>                     <int>    <int>  <chr>
  Sense & Sensibility           1        0  sense
  Sense & Sensibility           1        0  and
  Sense & Sensibility           1        0  sensibility
  Sense & Sensibility           3        0  by
  Sense & Sensibility           3        0  jane
  Sense & Sensibility           3        0  austen
nrc_joy <- get_sentiments("nrc") %>% 
  filter(sentiment == "joy")

tidy_books %>%
    filter(book=='Emma') %>%
    inner_join(nrc_joy) %>%
    count(word, sort = TRUE) %>%
    head()
A tibble: 6 × 2
  word        n
  <chr>   <int>
  good      359
  friend    166
  hope      143
  happy     125
  love      117
  deal       92
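The same pattern works for any NRC category; as a small variation of my own (not in the original), the most common "fear" words in Emma:

nrc_fear <- get_sentiments("nrc") %>% 
  filter(sentiment == "fear")

tidy_books %>%
    filter(book == 'Emma') %>%
    inner_join(nrc_fear) %>%
    count(word, sort = TRUE) %>%
    head()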
library(tidyr)

jane_austen_sentiment <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>% # inner join with the Bing sentiment lexicon
  count(book, index = linenumber %/% 80, sentiment) %>% # count sentiment words in chunks of 80 lines
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>% # reshape to wide format
  mutate(sentiment = positive - negative) # net sentiment: > 0 means a mostly positive chunk, < 0 a mostly negative one
jane_austen_sentiment %>% head()
A tibble: 6 × 5
  book                 index  negative  positive  sentiment
  <fct>                <dbl>     <int>     <int>      <int>
  Sense & Sensibility      0        16        32         16
  Sense & Sensibility      1        19        53         34
  Sense & Sensibility      2        12        31         19
  Sense & Sensibility      3        15        31         16
  Sense & Sensibility      4        16        34         18
  Sense & Sensibility      5        16        51         35
library(ggplot2)

ggplot(jane_austen_sentiment, aes(index, sentiment, fill = book)) +
  geom_col() + # bar chart of net sentiment per 80-line chunk
  facet_wrap(~book, ncol = 2, scales = "free_x") # one panel per book, arranged in two columns


(Figure: net sentiment trajectories across Jane Austen's novels, one panel per book)
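Not part of the original post, but if you want the figure as a file, ggplot2's ggsave() writes the most recently drawn plot; the file name below is just an example:

ggsave("austen_sentiment.png", width = 9, height = 7) # saves the last plot to the working directory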

3. Comparing the Three Sentiment Lexicons

pride_prejudice <- tidy_books %>% 
    filter(book == 'Pride & Prejudice')
pride_prejudice %>% head()
A tibble: 6 × 4
  book               linenumber  chapter  word
  <fct>                   <int>    <int>  <chr>
  Pride & Prejudice           1        0  pride
  Pride & Prejudice           1        0  and
  Pride & Prejudice           1        0  prejudice
  Pride & Prejudice           3        0  by
  Pride & Prejudice           3        0  jane
  Pride & Prejudice           3        0  austen
# AFINN lexicon
afinn <- pride_prejudice %>% 
  inner_join(get_sentiments("afinn")) %>% # inner join to keep only words with a sentiment score
  group_by(index = linenumber %/% 80) %>% # chunks of 80 lines
  summarise(sentiment = sum(value)) %>% # AFINN is numeric, so sum the scores per chunk
  mutate(method = "AFINN")

# Bing lexicon
bing <- pride_prejudice %>% 
  inner_join(get_sentiments("bing")) %>% # inner join with the Bing lexicon
  mutate(method = "Bing") %>%
  count(method, index = linenumber %/% 80, sentiment) %>% # keep method so the facets work below
  pivot_wider(names_from = sentiment,
              values_from = n,
              values_fill = 0) %>% 
  mutate(sentiment = positive - negative)

# NRC lexicon (positive/negative categories only)
nrc <- pride_prejudice %>% 
  inner_join(get_sentiments("nrc") %>%
               filter(sentiment %in% c('positive', 'negative'))) %>%
  mutate(method = "NRC") %>%
  count(method, index = linenumber %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment,
              values_from = n,
              values_fill = 0) %>% 
  mutate(sentiment = positive - negative)

bind_rows(afinn, bing, nrc) %>%
  ggplot(aes(index, sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~method, ncol = 1, scales = "free_y") # free_y because AFINN sums scores while Bing/NRC count words


(Figure: net sentiment in Pride & Prejudice estimated with the AFINN, Bing, and NRC lexicons)

get_sentiments('nrc') %>%
  filter(sentiment %in% c('positive','negative')) %>%
  count(sentiment)
A tibble: 2 × 2
  sentiment      n
  <chr>      <int>
  negative    3316
  positive    2308
get_sentiments('bing') %>%
  count(sentiment)
A tibble: 2 × 2
  sentiment      n
  <chr>      <int>
  negative    4781
  positive    2005

Both lexicons contain more negative than positive words, but the negative-to-positive ratio is higher in Bing than in NRC, which contributes to the systematically different estimates the two lexicons produce in the plot above.

4. Most Common Positive and Negative Words

bing_word_count <- tidy_books %>%
  inner_join(get_sentiments('bing')) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()
Joining, by = "word"
bing_word_count %>% head()
A tibble: 6 × 3
  word    sentiment      n
  <chr>   <chr>      <int>
  miss    negative    1855
  well    positive    1523
  good    positive    1380
  great   positive     981
  like    positive     725
  better  positive     639
bing_word_count %>%
  group_by(sentiment) %>%
  slice_max(n, n = 10) %>% # keep the top 10 words for each sentiment
  ungroup() %>%
  mutate(word = reorder(word, n)) %>% # reorder words by count for plotting
  ggplot(aes(n, word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(x = "Contribution to sentiment",
       y = NULL)


(Figure: top 10 words contributing to positive and negative sentiment)

The word "miss" tops the negative list, but in Austen's novels it is mostly a title for young women rather than a negative term, so we add it to a custom stop-word list:

custom_stop_words <- bind_rows(tibble(word = c("miss"),  
                                      lexicon = c("custom")), 
                               stop_words)

custom_stop_words %>% head()
A tibble: 6 × 2
  word   lexicon
  <chr>  <chr>
  miss   custom
  a      SMART
  a's    SMART
  able   SMART
  about  SMART
  above  SMART
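The post defines custom_stop_words but does not apply it; here is a minimal sketch of how it would typically be used, re-counting sentiment words after dropping "miss" along with the standard stop words (my addition, not from the original):

tidy_books %>%
  anti_join(custom_stop_words, by = "word") %>% # removes "miss" and the standard stop words
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(word, sentiment, sort = TRUE) %>%
  head()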

5. Word Clouds

library(wordcloud)
tidy_books %>%
  anti_join(stop_words) %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 100)) # show at most the 100 most frequent words

(Figure: word cloud of the most frequent words, stop words removed)
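One practical detail not mentioned in the original: wordcloud() places words in a partly random order, so the layout changes between runs; fixing the random seed first makes the figure reproducible.

set.seed(2022) # any fixed seed; then re-run the wordcloud() call above for a repeatable layout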

library(reshape2)
tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>% # word-by-sentiment matrix for comparison.cloud()
  comparison.cloud(colors = c("blue", "red"),
                   max.words = 100)

(Figure: comparison cloud of the most common negative and positive words)

6. Summary

This post worked through lexicon-based sentiment analysis with tidytext: loading the AFINN, Bing, and NRC lexicons, scoring Jane Austen's novels with inner joins, comparing the three lexicons on Pride & Prejudice, inspecting the words that contribute most to each sentiment, and visualizing the results with word clouds.

Reference: Text Mining with R
