0
点赞
收藏
分享

微信扫一扫

【R语言文本挖掘】:分析单词和文档频率——TF-IDF

【R语言文本挖掘】:分析单词和文档频率——TF-IDF


请添加图片描述

文章目录

引言

1.简·奥斯汀小说集的频率

接下来我们导入相关库

# 导入相关库
library(dplyr)
library(janeaustenr)
library(tidytext)

分词处理

# 将文本分词成word
book_words <- austen_books() %>%
unnest_tokens(word, text) %>% #将文本分词
count(book, word, sort = TrUE)
book_words%>%head()
A tibble: 6 × 3
bookwordn
<fct><chr><int>
Mansfield Parkthe6206
Mansfield Parkto 5475
Mansfield Parkand5438
Emma to 5239
Emma the5201
Emma and4896
total_words <- book_words %>% 
  group_by(book) %>% 
  summarize(total = sum(n))

total_words%>%head()
A tibble: 6 × 2
booktotal
<fct><int>
Sense & Sensibility119957
Pride & Prejudice 122204
Mansfield Park 160460
Emma 160996
Northanger Abbey 77780
Persuasion 83658
book_words <- left_join(book_words, total_words)#左连接

book_words %>% head()
[1m[22mJoining, by = c("book", "total")
A tibble: 6 × 4
bookwordntotal
<fct><chr><int><int>
Mansfield Parkthe6206160460
Mansfield Parkto 5475160460
Mansfield Parkand5438160460
Emma to 5239160996
Emma the5201160996
Emma and4896160996
library(ggplot2)

ggplot(book_words, aes(n/total, fill = book)) +
  geom_histogram(show.legend = FALSE) +
  xlim(NA, 0.0009) +
  facet_wrap(~book, ncol = 2, scales = "free_y")#分面绘图

png

2.齐夫定律

freq_by_rank <- book_words %>% 
  group_by(book) %>% 
  mutate(rank = row_number(), 
         `term frequency` = n/total)#计算rank和频率
freq_by_rank%>%head()
A grouped_df: 6 × 6
bookwordntotalrankterm frequency
<fct><chr><int><int><int><dbl>
Mansfield Parkthe620616046010.03867631
Mansfield Parkto 547516046020.03412065
Mansfield Parkand543816046030.03389007
Emma to 523916099610.03254118
Emma the520116099620.03230515
Emma and489616099630.03041069
freq_by_rank %>% 
  ggplot(aes(rank, `term frequency`, color = book)) + 
  geom_line(size = 1.1, alpha = 0.8, show.legend = FALSE) + 
  scale_x_log10() +
  scale_y_log10()

png

rank_subset <- freq_by_rank %>% 
  filter(rank < 500,
         rank > 10)

lm(log10(`term frequency`) ~ log10(rank), data = rank_subset)
​    Call:
​    lm(formula = log10(`term frequency`) ~ log10(rank), data = rank_subset)
​    
​    Coefficients:
​    (Intercept)  log10(rank)  
​        -0.6226      -1.1125  
freq_by_rank %>% 
  ggplot(aes(rank, `term frequency`, color = book)) + 
  geom_abline(intercept = -0.62, slope = -1.1, 
              color = "gray50", linetype = 2) +
  geom_line(size = 1.1, alpha = 0.8, show.legend = FALSE) + 
  scale_x_log10() +
  scale_y_log10()


png

3.bind_tf_idf() 函数

book_tf_idf <- book_words %>%
  bind_tf_idf(word, book, n)

book_tf_idf %>% head()
A tibble: 6 × 7
bookwordntotaltfidftf_idf
<fct><chr><int><int><dbl><dbl><dbl>
Mansfield Parkthe62061604600.0386763100
Mansfield Parkto 54751604600.0341206500
Mansfield Parkand54381604600.0338900700
Emma to 52391609960.0325411800
Emma the52011609960.0323051500
Emma and48961609960.0304106900
book_tf_idf %>%
  select(-total) %>%
  arrange(desc(tf_idf)) %>%head()
A tibble: 6 × 6
bookwordntfidftf_idf
<fct><chr><int><dbl><dbl><dbl>
Sense & Sensibilityelinor 6230.0051935281.7917590.009305552
Sense & Sensibilitymarianne4920.0041014701.7917590.007348847
Mansfield Park crawford4930.0030724171.7917590.005505032
Pride & Prejudice darcy 3730.0030522731.7917590.005468939
Persuasion elliot 2540.0030361711.7917590.005440088
Emma emma 7860.0048821091.0986120.005363545
library(forcats)

book_tf_idf %>%
  group_by(book) %>%
  slice_max(tf_idf, n = 15) %>%
  ungroup() %>%
  ggplot(aes(tf_idf, fct_reorder(word, tf_idf), fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, ncol = 2, scales = "free") +
  labs(x = "tf-idf", y = NULL)

png

4.物理文档语料库

library(gutenbergr)
physics <- gutenberg_download(c(37729, 14725, 13476, 30155), 
                              meta_fields = "author")
physics_words <- physics %>%
    unnest_tokens(word,text)%>%
    count(author,word, sort = TrUE)
physics_words %>% head()
A tibble: 6 × 3
authorwordn
<chr><chr><int>
Galilei, Galileo the3760
Tesla, Nikola the3604
Huygens, Christiaanthe3553
Einstein, Albert the2993
Galilei, Galileo of 2049
Einstein, Albert of 2028
plot_physics <- physics_words %>%
  bind_tf_idf(word, author, n) %>%
  mutate(author = factor(author, levels = c("Galilei, Galileo",
                                            "Huygens, Christiaan", 
                                            "Tesla, Nikola",
                                            "Einstein, Albert")))

plot_physics %>% 
  group_by(author) %>% 
  slice_max(tf_idf, n = 15) %>% 
  ungroup() %>%
  mutate(word = reorder(word, tf_idf)) %>%
  ggplot(aes(tf_idf, word, fill = author)) +
  geom_col(show.legend = FALSE) +
  labs(x = "tf-idf", y = NULL) +
  facet_wrap(~author, ncol = 2, scales = "free")


png

library(stringr)

physics %>% 
  filter(str_detect(text, "_k_")) #%>% 
  #select(text)
A tibble: 7 × 3
gutenberg_idtextauthor
<int><chr><chr>
14725surface AB at the points AK_k_B. Then instead of the hemispherical Huygens, Christiaan
14725would needs be that from all the other points K_k_B there should Huygens, Christiaan
14725necessarily be equal to CD, because C_k_ is equal to CK, and C_g_ to Huygens, Christiaan
14725the crystal at K_k_, all the points of the wave CO_oc_ will have Huygens, Christiaan
14725O_o_ has reached K_k_. Which is easy to comprehend, since, of these Huygens, Christiaan
14725CO_oc_ in the crystal, when O_o_ has arrived at K_k_, because it formsHuygens, Christiaan
30155ρ is the average density of the matter and _k_ is a constant connectedEinstein, Albert
physics %>% 
  filter(str_detect(text, "rC")) %>% 
  select(text) %>%
  head()
A tibble: 6 × 1
text
<chr>
line rC, parallel and equal to AB, to be a portion of a wave of light,
represents the partial wave coming from the point A, after the wave rC
be the propagation of the wave rC which fell on AB, and would be the
transparent body; seeing that the wave rC, having come to the aperture
incident rays. Let there be such a ray rC falling upon the surface
CK. Make CO perpendicular to rC, and across the angle KCO adjust OK,
mystopwords <- tibble(word = c("eq", "co", "rc", "ac", "ak", "bn", 
                                   "fig", "file", "cg", "cb", "cm",
                               "ab", "_k", "_k_", "_x"))

physics_words <- anti_join(physics_words, mystopwords, 
                           by = "word")

plot_physics <- physics_words %>%
  bind_tf_idf(word, author, n) %>%
  mutate(word = str_remove_all(word, "_")) %>%
  group_by(author) %>% 
  slice_max(tf_idf, n = 15) %>%
  ungroup() %>%
  mutate(word = fct_reorder(word, tf_idf)) %>%
  mutate(author = factor(author, levels = c("Galilei, Galileo",
                                            "Huygens, Christiaan",
                                            "Tesla, Nikola",
                                            "Einstein, Albert")))

ggplot(plot_physics, aes(tf_idf, word, fill = author)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~author, ncol = 2, scales = "free") +
  labs(x = "tf-idf", y = NULL)


png

5.总结

举报

相关推荐

0 条评论