# RPackage002---stringr-CFANZ编程社区

# 看了下stringr包的文档，常用的应该是正则匹配那一块。后面如果实际工作中遇到需要处理的情况，再补充一些case。
# **************** stringr study notes **************** ----------------------

# ***** Case conversion ***** -------------------------------------------------
library(stringr)
## Arguments: `string` is the text to transform; `locale` selects the language
## whose casing rules are applied (English, "en", by default).
dog <- "The quick brown dog"
str_to_upper(string = dog) # "THE QUICK BROWN DOG"
str_to_lower(string = dog) # "the quick brown dog"
str_to_title(string = dog) # "The Quick Brown Dog"


# ***** Inverting matches ***** -----------------------------------------------

numbers <- "1 and 2 and 4 and 456"
# Start/end positions of each run of digits (the input is a single string, so
# only the first list element is needed).
num_loc <- str_locate_all(string = numbers, pattern = "[0-9]+")[[1]]
# str_sub() accepts the two-column start/end matrix directly.
str_sub(numbers, num_loc) # "1"   "2"   "4"   "456"
# invert_match() converts the match positions into the positions of the
# non-matching stretches around them.
text_loc <- invert_match(loc = num_loc)
str_sub(numbers, text_loc) # ""      " and " " and " " and " ""

# ***** Controlling matching behaviour ***** ----------------------------------
## Modifier functions applied to the pattern argument; without one, the
## pattern is treated as a regular expression.
## fixed()    match a fixed (literal) string
## coll()     language-aware matching; the documentation has examples
## regex()    regular expression
## boundary() split at boundaries: character, word or sentence
## The documentation remains the best reference for the detailed options.
pattern <- "a.b"
strings <- c("abb", "a.b")
str_detect(string = strings, pattern = pattern) # TRUE  TRUE
str_detect(string = strings, pattern = fixed(pattern)) # literal: FALSE  TRUE
str_detect(string = strings, pattern = coll(pattern)) # FALSE  TRUE
# coll() is useful for locale-aware case-insensitive matching
i <- c("I", "\u0130", "i")
i
str_detect(string = i, pattern = fixed("i", ignore_case = TRUE))
str_detect(string = i, pattern = coll("i", ignore_case = TRUE))
# The locale must be named explicitly; not a case I have run into so far.
str_detect(string = i, pattern = coll("i", ignore_case = TRUE, locale = "tr"))
# Word boundaries
words <- "These are some words."
str_count(string = words, pattern = boundary("word")) # number of words
str_split(string = words, pattern = " ")[[1]] # split on spaces
str_split(string = words, pattern = boundary("word"))[[1]]
# Regular expression variations
sentence <- "The Cat in the Hat"
# Plain regex: runs of lowercase a-z only.
str_extract_all(string = sentence, pattern = "[a-z]+")
# Same pattern with case ignored.
str_extract_all(
  string = sentence,
  pattern = regex("[a-z]+", ignore_case = TRUE)
)
str_extract_all(string = "a\nb\nc", pattern = "^.")
str_extract_all(string = "a\nb\nc", pattern = regex("^.", multiline = TRUE))
str_extract_all(string = "a\nb\nc", pattern = "a.")
str_extract_all(string = "a\nb\nc", pattern = regex("a.", dotall = TRUE))
# ***** Joining strings ***** -------------------------------------------------
## Works much like paste(), so it is not covered in detail.
str_c("Letter: ", letters)
str_c("Letter", letters, sep = ": ")
str_c(letters, " is for", "...")
# Pair every letter except the last with its successor.
str_c(head(letters, -1), " comes before ", tail(letters, -1))
# `collapse` merges the whole vector into one string.
str_c(letters, collapse = "")
str_c(letters, collapse = ", ")

# ***** Specifying the character encoding ***** -------------------------------
### Encoding trouble most often shows up when connecting to a database from
### Windows...
x <- rawToChar(as.raw(0xB1)) # the single byte 177
x
str_conv(string = x, encoding = "ISO-8859-2") # Polish "a with ogonek"
str_conv(string = x, encoding = "ISO-8859-1") # Plus-minus

# ***** Counting matches in a string ***** ------------------------------------

fruit <- c("apple", "banana", "pear", "pineapple")
str_count(string = fruit, pattern = "a")
str_count(string = fruit, pattern = "p")
str_count(string = fruit, pattern = "e")
# The pattern is vectorised too: each string is paired with the pattern in the
# same position ("a" in apple, "b" in banana, "p" in pear, "p" in pineapple).
str_count(string = fruit, pattern = c("a", "b", "p", "p"))
# 2 3 4: in a regex, `.` matches any single character -- letters, digits,
# even a literal dot.
str_count(string = c("a.", "...", ".a.a"), pattern = ".")
# fixed() matches the dot literally.
str_count(string = c("a.", "...", ".a.a"), pattern = fixed("."))
# Missing inputs give missing outputs
str_c(c("a", NA, "b"), "-d")
# Use str_replace_na() to display literal NAs:
str_c(str_replace_na(string = c("a", NA, "b")), "-d")

# ***** Testing whether a pattern occurs ***** --------------------------------
fruit <- c("apple", "banana", "pear", "pinapple")
str_detect(string = fruit, pattern = "a")
str_detect(string = fruit, pattern = "^a") # starts with "a"
str_detect(string = fruit, pattern = "a$") # ends with "a"
str_detect(string = fruit, pattern = "b") # contains "b"
str_detect(string = fruit, pattern = "[aeiou]") # [] means "any one of these"
# Also vectorised over pattern
str_detect(string = "aecfg", pattern = letters)

# ***** Duplicating strings ***** ---------------------------------------------

fruit <- c("apple", "pear", "banana")
str_dup(string = fruit, times = 2)
# `times` is vectorised: one repeat count per element.
str_dup(string = fruit, times = 1:3)
str_c("ba", str_dup(string = "na", times = 0:5))

# ***** Extracting matches ***** ----------------------------------------------

shopping_list <- c("apples x4", "bag of flour", "bag of sugar", "milk x2")
# \\d matches a digit (roughly [0-9]); the first backslash escapes the second
# inside the R string literal.
str_extract(string = shopping_list, pattern = "\\d")
# `+` repeats the class, so this finds the first unbroken run of lowercase
# letters.
str_extract(string = shopping_list, pattern = "[a-z]+")
# At least 1 and at most 4 lowercase letters.
str_extract(string = shopping_list, pattern = "[a-z]{1,4}")
# \\b is a word boundary, so this finds a whole word of 1-4 letters.
str_extract(string = shopping_list, pattern = "\\b[a-z]{1,4}\\b")
# Extract all matches
str_extract_all(string = shopping_list, pattern = "[a-z]+")
str_extract_all(string = shopping_list, pattern = "\\b[a-z]+\\b")
str_extract_all(string = shopping_list, pattern = "\\d")
# Simplify results into a character matrix
str_extract_all(
  string = shopping_list,
  pattern = "\\b[a-z]+\\b",
  simplify = TRUE
)
str_extract_all(string = shopping_list, pattern = "\\d", simplify = TRUE)
# Extract all words
str_extract_all(
  string = "This is, suprisingly, a sentence.",
  pattern = boundary("word")
)

# ***** String interpolation ***** --------------------------------------------
## This is essentially string formatting; I have not needed it yet, so it is
## not explored further.
## Plain concatenation with paste0() seems to work as well; interpolation looks
## a bit handier when writing SQL.
# Using values from the environment, and some formats
user_name <- "smbache"
amount <- 6.656
account <- 1337
str_interp(
  string = "User ${user_name} (account $[08d]{account}) has $$[.2f]{amount}."
)
# Nested brace pairs work inside expressions too, and any braces can be
# placed outside the expressions.
str_interp(
  string = "Works with } nested { braces too: $[.2f]{{{2 + 2}*{amount}}}"
)
# Values can also come from a list
str_interp(
  string = "One value, ${value1}, and then another, ${value2*2}.",
  env = list(value1 = 10, value2 = 20)
)
# Or a data frame
str_interp(
  string = "Values are $[.2f]{max(Sepal.Width)} and $[.2f]{min(Sepal.Width)}.",
  env = iris
)

# ***** String length ***** ---------------------------------------------------

# Similar to nchar(), so not covered in detail.
str_length(string = c("a", "ab", "abc"))


# ***** Locating matches ***** ------------------------------------------------
## Arguments: `string` is the input; `pattern` is a regular expression
## [worth studying properly when time allows].
## Return values:
## str_locate      an integer matrix: column 1 is the start of the first
##                 match, column 2 its end
## str_locate_all  a list of such matrices, one per input string
fruit <- c("apple", "banana", "pear", "pineapple")
str_locate(string = fruit, pattern = "a")
#       start end
# [1,]     1   1
# [2,]     2   2
# [3,]     3   3
# [4,]     5   5
str_locate_all(string = fruit, pattern = "a")
# [[1]]
#         start end
# [1,]     1   1
#
# [[2]]
#         start end
# [1,]     2   2
# [2,]     4   4
# [3,]     6   6
#
# [[3]]
#         start end
# [1,]     3   3
#
# [[4]]
#         start end
# [1,]     5   5
numbers <- "1 and 2 and 4 and 456"
# Assigned without printing; the matrix it holds is shown below.
num_loc <- str_locate_all(string = numbers, pattern = "[0-9]+")[[1]]
#       start end
# [1,]     1   1
# [2,]     7   7
# [3,]    13  13
# [4,]    19  21


# ***** Sorting ***** ---------------------------------------------------------
## str_order() / str_sort()
str_order(x = letters, decreasing = TRUE) # ordering indices, descending
str_sort(x = letters) # the sorted strings themselves


# ***** Padding strings ***** -------------------------------------------------
## Arguments
## string  the input string
## width   length of the result after padding
## side    where the padding goes: left, right or both
## pad     the padding character (a space by default)
str_pad(string = "hadley", width = 10, side = "both", pad = "*") # "**hadley**"

# ***** Find and replace ***** ------------------------------------------------
## Arguments
## string       the input string
## pattern      a regular expression
## replacement  the text substituted for each match

# Only the first match is replaced: "-bccba"
str_replace(string = "abccba", pattern = "[ab]", replacement = "-")
# Every match is replaced: "--cc--"
str_replace_all(string = "abccba", pattern = "[ab]", replacement = "-")



# ***** Splitting strings ***** -----------------------------------------------

## Argument `n`: the number of pieces to split into
str_split("baacaad", "aa", n = 3) # "b" "c" "d"
str_split("baacaad", "aa", n = 2) # "b"    "caad"

# ***** Extracting and replacing substrings by position ***** -----------------

hw <- "Hadley Wickham"
str_sub(string = hw, start = 1, end = 6) # "Hadley"
# The replacement form overwrites characters 1-6 of `hw` in place.
str_sub(hw, start = 1, end = 6) <- "Wr"
hw # "Wr Wickham"

# ***** Subsetting, or returning match positions ***** ------------------------
fruit <- c("apple", "banana", "pear", "pinapple")
str_subset(string = fruit, pattern = "a") # the elements that match
str_which(string = fruit, pattern = "a") # the indices of those elements

# ***** Trimming whitespace ***** ---------------------------------------------
## Signature: str_trim(string, side = c("both", "left", "right")).
## `side` picks which end(s) lose their whitespace; "both" is the default.
## The signature itself is not runnable (`string` is not a defined object),
## so concrete inputs are used below.
str_trim("  stringr  ") # "stringr"
str_trim("  stringr  ", side = "left") # "stringr  "
str_trim("  stringr  ", side = "right") # "  stringr"

# ***** Truncating / masking strings ***** ------------------------------------
## str_trunc() shortens a string that is longer than `width` and marks the cut
## with `ellipsis`; it does not encrypt anything. With a masking string as the
## ellipsis it can hide the tail of a value.
## Arguments
## width     length of the displayed result, ellipsis included; there appears
##           to be no way to mask an arbitrary position
## side      which part is replaced by the ellipsis
## ellipsis  the text that stands in for the removed part
## A non-empty sample is used: an empty string is never longer than `width`,
## so it would be returned unchanged and demonstrate nothing.
x <- "13800138000"
# 11 characters > width 10, so 6 characters are kept and "****" is appended.
str_trunc(string = x, width = 10, side = "right", ellipsis = "****") # "138001****"