【R语言文本挖掘】：分析单词和文档频率——TF-IDF

请添加图片描述

文章目录

【R语言文本挖掘】：分析单词和文档频率——TF-IDF
引言
1.简·奥斯汀小说集的频率
2.齐夫定律
3.bind_tf_idf() 函数
4.物理文档语料库
5.总结

引言

1.简·奥斯汀小说集的频率

接下来我们导入相关库

# 导入相关库
library(dplyr)
library(janeaustenr)
library(tidytext)

分词处理

# 将文本分词成word
book_words <- austen_books() %>%
unnest_tokens(word, text) %>% #将文本分词
count(book, word, sort = TrUE)
book_words%>%head()

A tibble: 6 × 3
book	word	n
<fct>	<chr>	<int>
Mansfield Park	the	6206
Mansfield Park	to	5475
Mansfield Park	and	5438
Emma	to	5239
Emma	the	5201
Emma	and	4896

total_words <- book_words %>% 
  group_by(book) %>% 
  summarize(total = sum(n))

total_words%>%head()

A tibble: 6 × 2
book	total
<fct>	<int>
Sense & Sensibility	119957
Pride & Prejudice	122204
Mansfield Park	160460
Emma	160996
Northanger Abbey	77780
Persuasion	83658

book_words <- left_join(book_words, total_words)#左连接

book_words %>% head()

[1m[22mJoining, by = c("book", "total")

A tibble: 6 × 4
book	word	n	total
<fct>	<chr>	<int>	<int>
Mansfield Park	the	6206	160460
Mansfield Park	to	5475	160460
Mansfield Park	and	5438	160460
Emma	to	5239	160996
Emma	the	5201	160996
Emma	and	4896	160996

library(ggplot2)

ggplot(book_words, aes(n/total, fill = book)) +
  geom_histogram(show.legend = FALSE) +
  xlim(NA, 0.0009) +
  facet_wrap(~book, ncol = 2, scales = "free_y")#分面绘图

png

2.齐夫定律

freq_by_rank <- book_words %>% 
  group_by(book) %>% 
  mutate(rank = row_number(), 
         `term frequency` = n/total)#计算rank和频率
freq_by_rank%>%head()

A grouped_df: 6 × 6
book	word	n	total	rank	term frequency
<fct>	<chr>	<int>	<int>	<int>	<dbl>
Mansfield Park	the	6206	160460	1	0.03867631
Mansfield Park	to	5475	160460	2	0.03412065
Mansfield Park	and	5438	160460	3	0.03389007
Emma	to	5239	160996	1	0.03254118
Emma	the	5201	160996	2	0.03230515
Emma	and	4896	160996	3	0.03041069

freq_by_rank %>% 
  ggplot(aes(rank, `term frequency`, color = book)) + 
  geom_line(size = 1.1, alpha = 0.8, show.legend = FALSE) + 
  scale_x_log10() +
  scale_y_log10()

png

rank_subset <- freq_by_rank %>% 
  filter(rank < 500,
         rank > 10)

lm(log10(`term frequency`) ~ log10(rank), data = rank_subset)

    Call:
    lm(formula = log10(`term frequency`) ~ log10(rank), data = rank_subset)
    
    Coefficients:
    (Intercept)  log10(rank)  
        -0.6226      -1.1125

freq_by_rank %>% 
  ggplot(aes(rank, `term frequency`, color = book)) + 
  geom_abline(intercept = -0.62, slope = -1.1, 
              color = "gray50", linetype = 2) +
  geom_line(size = 1.1, alpha = 0.8, show.legend = FALSE) + 
  scale_x_log10() +
  scale_y_log10()

png

3.bind_tf_idf() 函数

book_tf_idf <- book_words %>%
  bind_tf_idf(word, book, n)

book_tf_idf %>% head()

A tibble: 6 × 7
book	word	n	total	tf	idf	tf_idf
<fct>	<chr>	<int>	<int>	<dbl>	<dbl>	<dbl>
Mansfield Park	the	6206	160460	0.03867631	0	0
Mansfield Park	to	5475	160460	0.03412065	0	0
Mansfield Park	and	5438	160460	0.03389007	0	0
Emma	to	5239	160996	0.03254118	0	0
Emma	the	5201	160996	0.03230515	0	0
Emma	and	4896	160996	0.03041069	0	0

book_tf_idf %>%
  select(-total) %>%
  arrange(desc(tf_idf)) %>%head()

A tibble: 6 × 6
book	word	n	tf	idf	tf_idf
<fct>	<chr>	<int>	<dbl>	<dbl>	<dbl>
Sense & Sensibility	elinor	623	0.005193528	1.791759	0.009305552
Sense & Sensibility	marianne	492	0.004101470	1.791759	0.007348847
Mansfield Park	crawford	493	0.003072417	1.791759	0.005505032
Pride & Prejudice	darcy	373	0.003052273	1.791759	0.005468939
Persuasion	elliot	254	0.003036171	1.791759	0.005440088
Emma	emma	786	0.004882109	1.098612	0.005363545

library(forcats)

book_tf_idf %>%
  group_by(book) %>%
  slice_max(tf_idf, n = 15) %>%
  ungroup() %>%
  ggplot(aes(tf_idf, fct_reorder(word, tf_idf), fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, ncol = 2, scales = "free") +
  labs(x = "tf-idf", y = NULL)

png

4.物理文档语料库

library(gutenbergr)
physics <- gutenberg_download(c(37729, 14725, 13476, 30155), 
                              meta_fields = "author")

physics_words <- physics %>%
    unnest_tokens(word,text)%>%
    count(author,word, sort = TrUE)
physics_words %>% head()

A tibble: 6 × 3
author	word	n
<chr>	<chr>	<int>
Galilei, Galileo	the	3760
Tesla, Nikola	the	3604
Huygens, Christiaan	the	3553
Einstein, Albert	the	2993
Galilei, Galileo	of	2049
Einstein, Albert	of	2028

plot_physics <- physics_words %>%
  bind_tf_idf(word, author, n) %>%
  mutate(author = factor(author, levels = c("Galilei, Galileo",
                                            "Huygens, Christiaan", 
                                            "Tesla, Nikola",
                                            "Einstein, Albert")))

plot_physics %>% 
  group_by(author) %>% 
  slice_max(tf_idf, n = 15) %>% 
  ungroup() %>%
  mutate(word = reorder(word, tf_idf)) %>%
  ggplot(aes(tf_idf, word, fill = author)) +
  geom_col(show.legend = FALSE) +
  labs(x = "tf-idf", y = NULL) +
  facet_wrap(~author, ncol = 2, scales = "free")

png

library(stringr)

physics %>% 
  filter(str_detect(text, "_k_")) #%>% 
  #select(text)

A tibble: 7 × 3
gutenberg_id	text	author
<int>	<chr>	<chr>
14725	surface AB at the points AK_k_B. Then instead of the hemispherical	Huygens, Christiaan
14725	would needs be that from all the other points K_k_B there should	Huygens, Christiaan
14725	necessarily be equal to CD, because C_k_ is equal to CK, and C_g_ to	Huygens, Christiaan
14725	the crystal at K_k_, all the points of the wave CO_oc_ will have	Huygens, Christiaan
14725	O_o_ has reached K_k_. Which is easy to comprehend, since, of these	Huygens, Christiaan
14725	CO_oc_ in the crystal, when O_o_ has arrived at K_k_, because it forms	Huygens, Christiaan
30155	ρ is the average density of the matter and _k_ is a constant connected	Einstein, Albert

physics %>% 
  filter(str_detect(text, "rC")) %>% 
  select(text) %>%
  head()

A tibble: 6 × 1
text
<chr>
line rC, parallel and equal to AB, to be a portion of a wave of light,
represents the partial wave coming from the point A, after the wave rC
be the propagation of the wave rC which fell on AB, and would be the
transparent body; seeing that the wave rC, having come to the aperture
incident rays. Let there be such a ray rC falling upon the surface
CK. Make CO perpendicular to rC, and across the angle KCO adjust OK,

mystopwords <- tibble(word = c("eq", "co", "rc", "ac", "ak", "bn", 
                                   "fig", "file", "cg", "cb", "cm",
                               "ab", "_k", "_k_", "_x"))

physics_words <- anti_join(physics_words, mystopwords, 
                           by = "word")

plot_physics <- physics_words %>%
  bind_tf_idf(word, author, n) %>%
  mutate(word = str_remove_all(word, "_")) %>%
  group_by(author) %>% 
  slice_max(tf_idf, n = 15) %>%
  ungroup() %>%
  mutate(word = fct_reorder(word, tf_idf)) %>%
  mutate(author = factor(author, levels = c("Galilei, Galileo",
                                            "Huygens, Christiaan",
                                            "Tesla, Nikola",
                                            "Einstein, Albert")))

ggplot(plot_physics, aes(tf_idf, word, fill = author)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~author, ncol = 2, scales = "free") +
  labs(x = "tf-idf", y = NULL)