1. Displaying the tokens produced by an analyzer
import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public static void displayToken(String str, Analyzer analyzer) {
    try {
        /*
         * TokenStream tokenStream(String fieldName, Reader reader):
         * obtains a TokenStream (the stream of tokens).
         * Parameter 1: the field name (has no real significance here)
         * Parameter 2: the input Reader
         */
        TokenStream tokenStream = analyzer.tokenStream("content", new StringReader(str));
        // Add an attribute that exposes the current token's text; it is refreshed
        // each time the TokenStream advances to the next token.
        CharTermAttribute cta = tokenStream.addAttribute(CharTermAttribute.class);
        // The TokenStream contract requires reset() before the first incrementToken().
        tokenStream.reset();
        // Walk through the TokenStream, printing each token.
        while (tokenStream.incrementToken()) {
            System.out.print("[" + cta + "]");
        }
        tokenStream.end();
        tokenStream.close();
        System.out.println();
        System.out.println("--------------------------------");
    } catch (IOException e) {
        e.printStackTrace();
    }
}
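A minimal usage sketch for comparison purposes, assuming a reasonably recent Lucene version in which StandardAnalyzer, SimpleAnalyzer and WhitespaceAnalyzer have no-arg constructors (the exact package names and constructor signatures vary between Lucene releases):

import org.apache.lucene.analysis.core.SimpleAnalyzer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;

public static void main(String[] args) {
    String text = "how are you thank you";
    // Each call prints the tokens produced by that analyzer on its own line,
    // so the tokenization rules can be compared side by side.
    displayToken(text, new StandardAnalyzer());
    displayToken(text, new SimpleAnalyzer());
    displayToken(text, new WhitespaceAnalyzer());
}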
2. Displaying all token information
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

public static void displayAllTokenInfo(String str, Analyzer analyzer) {
    try {
        TokenStream tokenStream = analyzer.tokenStream("content", new StringReader(str));
        // Position increment: the distance between this token and the previous one.
        PositionIncrementAttribute pia = tokenStream.addAttribute(PositionIncrementAttribute.class);
        // Start and end character offsets of each token within the original text.
        OffsetAttribute oa = tokenStream.addAttribute(OffsetAttribute.class);
        // The token's type as reported by the tokenizer (e.g. <ALPHANUM>).
        TypeAttribute ta = tokenStream.addAttribute(TypeAttribute.class);
        // The token's text itself.
        CharTermAttribute ca = tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            System.out.println("position increment: " + pia.getPositionIncrement()
                    + "  offsets: " + oa.startOffset() + "--" + oa.endOffset()
                    + "  type: " + ta.type()
                    + "  token: " + ca);
        }
        tokenStream.end();
        tokenStream.close();
    } catch (IOException e) {
        e.printStackTrace();
    }
}
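A usage sketch along the same lines (again assuming a recent Lucene version with a no-arg StandardAnalyzer constructor), printing the full metadata for every token of a single analyzer:

// Print position increment, offsets, type and text for each token.
displayAllTokenInfo("how are you thank you", new StandardAnalyzer());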