# Straight to the code; if something goes wrong, check the package
# documentation -- it resolves most problems.
# Installation notes -----------------------------------------------------------
## Rwordseg: download it from https://r-forge.r-project.org/R/?group_id=1054
## and install it manually.
## rJava: setup is not covered here; tutorials are available online.
library(rJava)
library(Rwordseg)
# Turn off person-name recognition. Spelled FALSE rather than F because F is an
# ordinary variable that can be reassigned.
segment.options(isNameRecognition = FALSE)
# segmentCN arguments ----------------------------------------------------------
#### Arguments:
###### 1. strwords: a string, or the path of a text file
######## A Chinese sentence in UTF-8 or the path of a text file.
###### 2. analyzer: the segmentation engine
######## A JAVA object of analyzer.
###### 3. nature: whether to output part-of-speech tags (off by default)
######## Whether to recognise the nature of the words.
###### 4. nosymbol: controls how punctuation/symbols are handled
######## Whether to keep symbols in the sentence.
###### 5. returnType: result format; a character vector by default, "tm" for
######    tm-compatible output
######## Default is a string vector but we also can choose 'tm' to output a
######## single string separated by space so that it can be used by Corpus
######## directly.
###### 6. isfast: whether to use the fast analyzer; per the original note,
######    part-of-speech tags cannot be output in this mode
######## Whether to run the fast analyzer.
###### 7. outfile: output path for the result when strwords is a file
######## The path of output if strwords is a file.
###### 8. blocklines: maximum number of lines read at a time when strwords is
######    a file
######## The (maximal) number of lines to read at one time when strwords is a
######## file.
#### Usage -- the function signature with its defaults, kept for reference.
#### It is deliberately commented out: `strwords` is a placeholder that the
#### caller must supply, so running it verbatim would fail. See the Demo
#### section for runnable calls.
# segmentCN(strwords,
#           analyzer = get("Analyzer", envir = .RwordsegEnv),
#           nature = FALSE, nosymbol = TRUE,
#           returnType = c("vector", "tm"), isfast = FALSE,
#           outfile = "", blocklines = 1000)
# Dictionary management --------------------------------------------------------
## List the installed dictionaries
listDict()
## Install a dictionary
#### Plain-text dictionaries and Sogou .scel cell dictionaries are supported.
#### Arguments:
###### 1. dictpath: absolute path of the dictionary file (note: the example
######    below passes a path relative to the working directory)
###### 2. dictname: name of the dictionary
###### 3. dicttype: dictionary type; plain text by default
###### 4. load: whether to load the dictionary immediately
installDict(dictpath = "./Rwordseg/Dict/chinese-surname.scel",
            dictname = "chinese-surname", dicttype = "scel", load = TRUE)
## Remove the dictionary again
uninstallDict(removedict = "chinese-surname")
# Custom text dictionaries -----------------------------------------------------
### Location: %R_HOME%\library\Rwordseg\dict
### Any file with the .dic extension can be added there; list the custom words
### one per line, separated by newlines.
### After editing, call loadDict() to import the dictionary.
# Other options ----------------------------------------------------------------
## Person-name recognition
segment.options(isNameRecognition = FALSE)
# Demo -------------------------------------------------------------------------
## Without the area dictionary
segmentCN("重庆市涪陵区皮家街14号1-1")
# [1] "重庆市" "涪陵" "区" "皮" "家" "街" "14号" "1" "1"
segmentCN("贵州省纳雍县曙光乡鼠场村大树脚组")
# [1] "贵州省" "纳" "雍" "县" "曙光" "乡" "鼠"
# [8] "场" "村" "大树" "脚" "组"
## With the area dictionary installed
## Registered as "chinese-area" so the name matches the file being installed
## (it was previously registered under the surname dictionary's name).
installDict(
  dictpath = "./Rwordseg/Dict/chinese-area.scel",
  dictname = "chinese-area",
  dicttype = "scel",
  load = TRUE
)
segmentCN("重庆市涪陵区皮家街14号1-1")
# [1] "重庆市" "涪陵区" "皮" "家" "街" "14号" "1" "1"
segmentCN("贵州省纳雍县曙光乡鼠场村大树脚组")
# [1] "贵州省" "纳雍县" "曙光乡" "鼠" "场" "村" "大树" "脚" "组"
## Clean up: remove only the dictionary this demo installed, by name, rather
## than relying on the no-argument default of uninstallDict().
uninstallDict(removedict = "chinese-area")