该程序用于读取 Word 文档(.doc / .docx)中的文字内容;艺术字和图片中的文字无法读取。
首先在 IntelliJ IDEA 中创建一个 Maven 项目,
然后在 pom.xml 的 <dependencies> 节点中添加以下依赖:
<!-- https://mvnrepository.com/artifact/org.apache.poi/poi -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>3.17</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.poi/poi-ooxml -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>3.17</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.poi/poi-ooxml-schemas -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml-schemas</artifactId>
<version>3.17</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.poi/poi-scratchpad -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-scratchpad</artifactId>
<version>3.17</version>
</dependency>
代码:
package com.gong;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import org.apache.poi.POIXMLDocument;
import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.hwpf.extractor.WordExtractor;
//import org.apache.poi.ooxml.POIXMLDocument;
//import org.apache.poi.ooxml.extractor.POIXMLTextExtractor;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
/**
 * Extracts the plain text of a Microsoft Word document with Apache POI.
 *
 * <p>Binary {@code .doc} files are read through {@link WordExtractor} and
 * OOXML {@code .docx} files through {@link XWPFWordExtractor}.
 */
public class Word {

    /** File-name suffix of the legacy binary Word format. */
    private static final String DOC_SUFFIX = ".doc";

    /** File-name suffix of the OOXML Word format. */
    private static final String DOCX_SUFFIX = ".docx";

    /** Message printed when the path does not look like a Word file. */
    private static final String NOT_WORD_MESSAGE = "此文件不是word文件";

    /**
     * Reads the text of the Word document at {@code path}.
     *
     * <p>The format is chosen from the file-name suffix, compared without
     * regard to case. This method is best-effort: when the file cannot be
     * opened or parsed the stack trace is printed and an empty string is
     * returned, so callers never receive {@code null}.
     *
     * @param path file-system path of a {@code .doc} or {@code .docx} file;
     *             {@code null} is treated like a non-Word path
     * @return the extracted text, or {@code ""} if the path is not a Word
     *         file or extraction failed
     * @throws IOException kept in the signature for source compatibility with
     *                     existing callers; failures are currently reported
     *                     on stderr instead of being thrown
     */
    public static String ReadDoc(String path) throws IOException {
        if (path == null) {
            System.out.println(NOT_WORD_MESSAGE);
            return "";
        }
        try {
            // Check ".docx" via its own suffix; ".doc" does not match it
            // because the comparison is anchored at the end of the path.
            if (hasSuffix(path, DOC_SUFFIX)) {
                return readBinaryDoc(path);
            }
            if (hasSuffix(path, DOCX_SUFFIX)) {
                return readOoxmlDoc(path);
            }
            System.out.println(NOT_WORD_MESSAGE);
        } catch (Exception e) {
            // Deliberately broad: POI reports parse problems through several
            // checked and unchecked exception types, and this method promises
            // an empty result rather than a failure.
            e.printStackTrace();
        }
        return "";
    }

    /** Returns whether {@code path} ends with {@code suffix}, ignoring case. */
    private static boolean hasSuffix(String path, String suffix) {
        // A negative offset (path shorter than suffix) makes regionMatches return false.
        return path.regionMatches(true, path.length() - suffix.length(), suffix, 0, suffix.length());
    }

    /** Extracts the text of a binary {@code .doc} file, closing the stream and extractor. */
    private static String readBinaryDoc(String path) throws IOException {
        try (InputStream in = new FileInputStream(new File(path));
             WordExtractor extractor = new WordExtractor(in)) {
            return extractor.getText();
        }
    }

    /** Extracts the text of a {@code .docx} file, releasing the package even on failure. */
    private static String readOoxmlDoc(String path) throws Exception {
        OPCPackage opcPackage = POIXMLDocument.openPackage(path);
        boolean ownedByExtractor = false;
        try {
            try (POIXMLTextExtractor extractor = new XWPFWordExtractor(opcPackage)) {
                // From here on closing the extractor is what releases the package.
                ownedByExtractor = true;
                return extractor.getText();
            }
        } finally {
            if (!ownedByExtractor) {
                // The extractor was never built, so nothing else will release
                // the package. revert() is used instead of close() so that the
                // read/write package is discarded rather than written back to
                // the source file.
                opcPackage.revert();
            }
        }
    }

    /**
     * Prints the text of a Word document to standard output.
     *
     * @param args optional; {@code args[0]} is the document path. When absent
     *             the sample path {@code E:\datas\学习.docx} is used.
     * @throws IOException declared because {@link #ReadDoc(String)} declares it
     */
    public static void main(String[] args) throws IOException {
        String path = args.length > 0 ? args[0] : "E:\\datas\\学习.docx";
        String result = ReadDoc(path);
        System.out.println(result);
    }
}
运行程序后,Word 文档的文字内容会打印在终端(控制台)中。