【R语言文本挖掘】:情感分析与词云图绘制

引言

1.情感数据集
# Load tidytext (sentiment lexicons) and dplyr (pipes / verbs).
# Fixed: no backticks needed around a plain package name in library();
# every other library() call in this article omits them.
library(tidytext)
library(dplyr)

# NRC lexicon: each word is tagged with an emotion or polarity category.
get_sentiments("nrc") %>% head()
A tibble: 6 × 2
word | sentiment |
---|
<chr> | <chr> |
---|
abacus | trust |
abandon | fear |
abandon | negative |
abandon | sadness |
abandoned | anger |
abandoned | fear |
# Bing lexicon: binary positive/negative word list.
head(get_sentiments("bing"))
A tibble: 6 × 2
word | sentiment |
---|
<chr> | <chr> |
---|
2-faces | negative |
abnormal | negative |
abolish | negative |
abominable | negative |
abominably | negative |
abominate | negative |
# AFINN lexicon: numeric sentiment score per word (-5 to +5).
head(get_sentiments("afinn"))
A tibble: 6 × 2
word | value |
---|
<chr> | <dbl> |
---|
abandon | -2 |
abandoned | -2 |
abandons | -2 |
abducted | -2 |
abduction | -2 |
abductions | -2 |
2.使用内连接进行情感分析
library(janeaustenr)
library(dplyr)
library(stringr)
# Tidy the Austen corpus into one-word-per-row form, annotating each
# word with the line number and the chapter it appears in.
tidy_books <- austen_books() %>%
  group_by(book) %>%
  mutate(
    linenumber = row_number(),
    # Lines like "Chapter 1" / "CHAPTER IV" mark chapter starts;
    # cumsum() over the logical hits yields a running chapter counter.
    chapter = cumsum(str_detect(text,
                                regex("^chapter [\\divxlc]",
                                      ignore_case = TRUE)))) %>%  # fixed: was TrUE (undefined object -> runtime error)
  ungroup() %>%
  unnest_tokens(word, text)
tidy_books %>% head()
A tibble: 6 × 4
book | linenumber | chapter | word |
---|
<fct> | <int> | <int> | <chr> |
---|
Sense & Sensibility | 1 | 0 | sense |
Sense & Sensibility | 1 | 0 | and |
Sense & Sensibility | 1 | 0 | sensibility |
Sense & Sensibility | 3 | 0 | by |
Sense & Sensibility | 3 | 0 | jane |
Sense & Sensibility | 3 | 0 | austen |
# Subset of the NRC lexicon: words tagged "joy".
nrc_joy <- get_sentiments("nrc") %>%
  filter(sentiment == "joy")

# Most frequent joy words in "Emma".
tidy_books %>%
  filter(book == 'Emma') %>%
  inner_join(nrc_joy, by = "word") %>%   # explicit key avoids the join message
  count(word, sort = TRUE) %>%           # fixed: was TrUE (undefined object -> runtime error)
  head()
A tibble: 6 × 2
word | n |
---|
<chr> | <int> |
---|
good | 359 |
friend | 166 |
hope | 143 |
happy | 125 |
love | 117 |
deal | 92 |
library(tidyr)
# Net sentiment (positive minus negative word counts) per 80-line
# chunk of each novel, using the Bing lexicon.
jane_austen_sentiment <- tidy_books %>%
  inner_join(get_sentiments("bing"), by = "word") %>%  # explicit key; same result, no join message
  count(book, index = linenumber %/% 80, sentiment) %>%
  # One column per sentiment; chunks with no hits get 0 instead of NA.
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment = positive - negative)
jane_austen_sentiment %>% head()
A tibble: 6 × 5
book | index | negative | positive | sentiment |
---|
<fct> | <dbl> | <int> | <int> | <int> |
---|
Sense & Sensibility | 0 | 16 | 32 | 16 |
Sense & Sensibility | 1 | 19 | 53 | 34 |
Sense & Sensibility | 2 | 12 | 31 | 19 |
Sense & Sensibility | 3 | 15 | 31 | 16 |
Sense & Sensibility | 4 | 16 | 34 | 18 |
Sense & Sensibility | 5 | 16 | 51 | 35 |
library(ggplot2)
# Sentiment trajectory across each novel, one panel per book.
ggplot(jane_austen_sentiment, aes(x = index, y = sentiment, fill = book)) +
  geom_col() +
  facet_wrap(~ book, ncol = 2, scales = "free_x")

3.对比三种情感字典
# Restrict the tidy corpus to a single novel for the lexicon comparison.
pride_prejudice <- tidy_books %>%
  filter(book == 'Pride & Prejudice')
head(pride_prejudice)
A tibble: 6 × 4
book | linenumber | chapter | word |
---|
<fct> | <int> | <int> | <chr> |
---|
Pride & Prejudice | 1 | 0 | pride |
Pride & Prejudice | 1 | 0 | and |
Pride & Prejudice | 1 | 0 | prejudice |
Pride & Prejudice | 3 | 0 | by |
Pride & Prejudice | 3 | 0 | jane |
Pride & Prejudice | 3 | 0 | austen |
# AFINN assigns numeric scores, so net sentiment per 80-line chunk is
# just the sum of the scores in that chunk.
afinn <- pride_prejudice %>%
  inner_join(get_sentiments("afinn"), by = "word") %>%
  group_by(index = linenumber %/% 80) %>%
  summarise(sentiment = sum(value)) %>%
  mutate(method = "AFINN")

# Bing and NRC are categorical (positive/negative): count words per
# category, then take the difference.
# Fixed: `method` must appear inside count() — count() keeps only its
# grouping columns plus n, so the method column created by mutate()
# was silently dropped, which would break the facet_wrap(~method) below.
bing <- pride_prejudice %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  mutate(method = "Bing") %>%
  count(method, index = linenumber %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment,
              values_from = n,
              values_fill = 0) %>%
  mutate(sentiment = positive - negative)

nrc <- pride_prejudice %>%
  inner_join(get_sentiments("nrc") %>%
               filter(sentiment %in% c('positive', 'negative')),
             by = "word") %>%
  mutate(method = "NRC") %>%
  count(method, index = linenumber %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment,
              values_from = n,
              values_fill = 0) %>%
  mutate(sentiment = positive - negative)

# Compare the three lexicons side by side.
# Fixed: the original called bind_rows(afinn, bing_and_nrc), but no
# object named bing_and_nrc is ever defined here — the objects created
# above are `bing` and `nrc`.
bind_rows(afinn, bing, nrc) %>%
  ggplot(aes(index, sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ method, ncol = 1, scales = "free_y")

# How many positive vs negative entries does the NRC lexicon hold?
get_sentiments('nrc') %>%
  filter(sentiment %in% c('positive', 'negative')) %>%
  count(sentiment)
A tibble: 2 × 2
sentiment | n |
---|
<chr> | <int> |
---|
negative | 3316 |
positive | 2308 |
# Same tally for Bing — note the heavier negative skew.
count(get_sentiments('bing'), sentiment)
A tibble: 2 × 2
sentiment | n |
---|
<chr> | <int> |
---|
negative | 4781 |
positive | 2005 |
4.最常见的积极和消极的单词
# Per-word contribution to each sentiment across all novels (Bing lexicon).
bing_word_count <- tidy_books %>%
  inner_join(get_sentiments('bing'), by = "word") %>%
  count(word, sentiment, sort = TRUE) %>%  # fixed: was TrUE (undefined object -> runtime error)
  ungroup()
Joining, by = "word"
head(bing_word_count)
A tibble: 6 × 3
word | sentiment | n |
---|
<chr> | <chr> | <int> |
---|
miss | negative | 1855 |
well | positive | 1523 |
good | positive | 1380 |
great | positive | 981 |
like | positive | 725 |
better | positive | 639 |
# Top 10 contributors per sentiment (slice_max keeps ties), ordered for
# plotting, then drawn as horizontal bars faceted by sentiment.
top_contributors <- bing_word_count %>%
  group_by(sentiment) %>%
  slice_max(n, n = 10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n))

ggplot(top_contributors, aes(x = n, y = word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ sentiment, scales = "free_y") +
  labs(x = "Contribution to sentiment", y = NULL)

# Add "miss" to the stop-word list: in Austen it is a title
# (Miss Bennet), not a negative word, yet Bing tags it negative.
custom_stop_words <- bind_rows(
  tibble(word = c("miss"), lexicon = c("custom")),
  stop_words
)
head(custom_stop_words)
A tibble: 6 × 2
word | lexicon |
---|
<chr> | <chr> |
---|
miss | custom |
a | SMART |
a's | SMART |
able | SMART |
about | SMART |
above | SMART |
5.词云绘制
library(wordcloud)
# Word cloud of the 100 most frequent non-stop words.
word_freqs <- tidy_books %>%
  anti_join(stop_words) %>%
  count(word)
with(word_freqs, wordcloud(word, n, max.words = 100))

library(reshape2)
# Comparison cloud: word frequencies reshaped into a word x sentiment
# matrix, then plotted with one color per sentiment column.
# NOTE(review): acast orders columns alphabetically, so "negative" maps
# to blue and "positive" to red — confirm this is the intended palette.
tidy_books %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(word, sentiment, sort = TRUE) %>%  # fixed: was TrUE (undefined object -> runtime error)
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("blue", "red"),
                   max.words = 100)

6.总结
参考资料:Text Mining with R