【R语言文本挖掘】:分析单词和文档频率——TF-IDF

文章目录
引言
1.简·奥斯汀小说集的频率
接下来我们导入相关库
library(dplyr)
library(janeaustenr)
library(tidytext)
分词处理
book_words <- austen_books() %>%
unnest_tokens(word, text) %>%
count(book, word, sort = TrUE)
book_words%>%head()
A tibble: 6 × 3
book | word | n |
---|
<fct> | <chr> | <int> |
---|
Mansfield Park | the | 6206 |
Mansfield Park | to | 5475 |
Mansfield Park | and | 5438 |
Emma | to | 5239 |
Emma | the | 5201 |
Emma | and | 4896 |
total_words <- book_words %>%
group_by(book) %>%
summarize(total = sum(n))
total_words%>%head()
A tibble: 6 × 2
book | total |
---|
<fct> | <int> |
---|
Sense & Sensibility | 119957 |
Pride & Prejudice | 122204 |
Mansfield Park | 160460 |
Emma | 160996 |
Northanger Abbey | 77780 |
Persuasion | 83658 |
book_words <- left_join(book_words, total_words)
book_words %>% head()
[1m[22mJoining, by = c("book", "total")
A tibble: 6 × 4
book | word | n | total |
---|
<fct> | <chr> | <int> | <int> |
---|
Mansfield Park | the | 6206 | 160460 |
Mansfield Park | to | 5475 | 160460 |
Mansfield Park | and | 5438 | 160460 |
Emma | to | 5239 | 160996 |
Emma | the | 5201 | 160996 |
Emma | and | 4896 | 160996 |
library(ggplot2)
ggplot(book_words, aes(n/total, fill = book)) +
geom_histogram(show.legend = FALSE) +
xlim(NA, 0.0009) +
facet_wrap(~book, ncol = 2, scales = "free_y")

2.齐夫定律
freq_by_rank <- book_words %>%
group_by(book) %>%
mutate(rank = row_number(),
`term frequency` = n/total)
freq_by_rank%>%head()
A grouped_df: 6 × 6
book | word | n | total | rank | term frequency |
---|
<fct> | <chr> | <int> | <int> | <int> | <dbl> |
---|
Mansfield Park | the | 6206 | 160460 | 1 | 0.03867631 |
Mansfield Park | to | 5475 | 160460 | 2 | 0.03412065 |
Mansfield Park | and | 5438 | 160460 | 3 | 0.03389007 |
Emma | to | 5239 | 160996 | 1 | 0.03254118 |
Emma | the | 5201 | 160996 | 2 | 0.03230515 |
Emma | and | 4896 | 160996 | 3 | 0.03041069 |
freq_by_rank %>%
ggplot(aes(rank, `term frequency`, color = book)) +
geom_line(size = 1.1, alpha = 0.8, show.legend = FALSE) +
scale_x_log10() +
scale_y_log10()

rank_subset <- freq_by_rank %>%
filter(rank < 500,
rank > 10)
lm(log10(`term frequency`) ~ log10(rank), data = rank_subset)
Call:
lm(formula = log10(`term frequency`) ~ log10(rank), data = rank_subset)
Coefficients:
(Intercept) log10(rank)
-0.6226 -1.1125
freq_by_rank %>%
ggplot(aes(rank, `term frequency`, color = book)) +
geom_abline(intercept = -0.62, slope = -1.1,
color = "gray50", linetype = 2) +
geom_line(size = 1.1, alpha = 0.8, show.legend = FALSE) +
scale_x_log10() +
scale_y_log10()

3.bind_tf_idf() 函数
book_tf_idf <- book_words %>%
bind_tf_idf(word, book, n)
book_tf_idf %>% head()
A tibble: 6 × 7
book | word | n | total | tf | idf | tf_idf |
---|
<fct> | <chr> | <int> | <int> | <dbl> | <dbl> | <dbl> |
---|
Mansfield Park | the | 6206 | 160460 | 0.03867631 | 0 | 0 |
Mansfield Park | to | 5475 | 160460 | 0.03412065 | 0 | 0 |
Mansfield Park | and | 5438 | 160460 | 0.03389007 | 0 | 0 |
Emma | to | 5239 | 160996 | 0.03254118 | 0 | 0 |
Emma | the | 5201 | 160996 | 0.03230515 | 0 | 0 |
Emma | and | 4896 | 160996 | 0.03041069 | 0 | 0 |
book_tf_idf %>%
select(-total) %>%
arrange(desc(tf_idf)) %>%head()
A tibble: 6 × 6
book | word | n | tf | idf | tf_idf |
---|
<fct> | <chr> | <int> | <dbl> | <dbl> | <dbl> |
---|
Sense & Sensibility | elinor | 623 | 0.005193528 | 1.791759 | 0.009305552 |
Sense & Sensibility | marianne | 492 | 0.004101470 | 1.791759 | 0.007348847 |
Mansfield Park | crawford | 493 | 0.003072417 | 1.791759 | 0.005505032 |
Pride & Prejudice | darcy | 373 | 0.003052273 | 1.791759 | 0.005468939 |
Persuasion | elliot | 254 | 0.003036171 | 1.791759 | 0.005440088 |
Emma | emma | 786 | 0.004882109 | 1.098612 | 0.005363545 |
library(forcats)
book_tf_idf %>%
group_by(book) %>%
slice_max(tf_idf, n = 15) %>%
ungroup() %>%
ggplot(aes(tf_idf, fct_reorder(word, tf_idf), fill = book)) +
geom_col(show.legend = FALSE) +
facet_wrap(~book, ncol = 2, scales = "free") +
labs(x = "tf-idf", y = NULL)

4.物理文档语料库
library(gutenbergr)
physics <- gutenberg_download(c(37729, 14725, 13476, 30155),
meta_fields = "author")
physics_words <- physics %>%
unnest_tokens(word,text)%>%
count(author,word, sort = TrUE)
physics_words %>% head()
A tibble: 6 × 3
author | word | n |
---|
<chr> | <chr> | <int> |
---|
Galilei, Galileo | the | 3760 |
Tesla, Nikola | the | 3604 |
Huygens, Christiaan | the | 3553 |
Einstein, Albert | the | 2993 |
Galilei, Galileo | of | 2049 |
Einstein, Albert | of | 2028 |
plot_physics <- physics_words %>%
bind_tf_idf(word, author, n) %>%
mutate(author = factor(author, levels = c("Galilei, Galileo",
"Huygens, Christiaan",
"Tesla, Nikola",
"Einstein, Albert")))
plot_physics %>%
group_by(author) %>%
slice_max(tf_idf, n = 15) %>%
ungroup() %>%
mutate(word = reorder(word, tf_idf)) %>%
ggplot(aes(tf_idf, word, fill = author)) +
geom_col(show.legend = FALSE) +
labs(x = "tf-idf", y = NULL) +
facet_wrap(~author, ncol = 2, scales = "free")

library(stringr)
physics %>%
filter(str_detect(text, "_k_"))
A tibble: 7 × 3
gutenberg_id | text | author |
---|
<int> | <chr> | <chr> |
---|
14725 | surface AB at the points AK_k_B. Then instead of the hemispherical | Huygens, Christiaan |
14725 | would needs be that from all the other points K_k_B there should | Huygens, Christiaan |
14725 | necessarily be equal to CD, because C_k_ is equal to CK, and C_g_ to | Huygens, Christiaan |
14725 | the crystal at K_k_, all the points of the wave CO_oc_ will have | Huygens, Christiaan |
14725 | O_o_ has reached K_k_. Which is easy to comprehend, since, of these | Huygens, Christiaan |
14725 | CO_oc_ in the crystal, when O_o_ has arrived at K_k_, because it forms | Huygens, Christiaan |
30155 | ρ is the average density of the matter and _k_ is a constant connected | Einstein, Albert |
physics %>%
filter(str_detect(text, "rC")) %>%
select(text) %>%
head()
A tibble: 6 × 1
text |
---|
<chr> |
---|
line rC, parallel and equal to AB, to be a portion of a wave of light, |
represents the partial wave coming from the point A, after the wave rC |
be the propagation of the wave rC which fell on AB, and would be the |
transparent body; seeing that the wave rC, having come to the aperture |
incident rays. Let there be such a ray rC falling upon the surface |
CK. Make CO perpendicular to rC, and across the angle KCO adjust OK, |
mystopwords <- tibble(word = c("eq", "co", "rc", "ac", "ak", "bn",
"fig", "file", "cg", "cb", "cm",
"ab", "_k", "_k_", "_x"))
physics_words <- anti_join(physics_words, mystopwords,
by = "word")
plot_physics <- physics_words %>%
bind_tf_idf(word, author, n) %>%
mutate(word = str_remove_all(word, "_")) %>%
group_by(author) %>%
slice_max(tf_idf, n = 15) %>%
ungroup() %>%
mutate(word = fct_reorder(word, tf_idf)) %>%
mutate(author = factor(author, levels = c("Galilei, Galileo",
"Huygens, Christiaan",
"Tesla, Nikola",
"Einstein, Albert")))
ggplot(plot_physics, aes(tf_idf, word, fill = author)) +
geom_col(show.legend = FALSE) +
facet_wrap(~author, ncol = 2, scales = "free") +
labs(x = "tf-idf", y = NULL)

5.总结