-
JAVA读取word(doc)(docx)标题和内容----POI
JAVA 实现POI方式读取WORD文件内容
1、下载poi的jar包
下载地址:https://www.apache.org/dyn/closer.lua/poi/release/bin/poi-bin-3.17-20170915.tar.gz
下载解压后用到的jar包
maven:
<!-- https://mvnrepository.com/artifact/org.apache.poi/poi -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi</artifactId>
    <version>4.1.2</version>
</dependency>
<dependency>
    <groupId>cn.hutool</groupId>
    <artifactId>hutool-all</artifactId>
    <version>5.5.7</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.poi/poi-ooxml -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-ooxml</artifactId>
    <version>4.1.2</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.poi/poi-ooxml-schemas -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-ooxml-schemas</artifactId>
    <version>4.1.2</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>ooxml-schemas</artifactId>
    <version>1.1</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.poi/poi-scratchpad -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-scratchpad</artifactId>
    <version>4.1.2</version>
</dependency>
一、读取word全部内容(这个不区分doc和docx)
package com.wordcom;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Locale;
// NOTE(review): the two imports below are the POI 3.x locations. In POI 4.x these classes
// are published under org.apache.poi.ooxml / org.apache.poi.ooxml.extractor; adjust the
// imports to match the POI version actually on the classpath.
import org.apache.poi.POIXMLDocument;
import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;

/**
 * Reads the complete text of a Word document (.doc or .docx) using Apache POI.
 *
 * @author hp
 * @since 2021-11-04
 */
public class DocUtil {

    public static void main(String[] args) {
        String filePath = "C:\\Users\\hp\\Desktop\\新建文件夹 (2)\\忻州地调中心站11楼机房更换通信电源三措一案.docx";
        String content = readWord(filePath);
        System.out.println(content);
    }

    /**
     * Extracts all text from the Word file at {@code path}.
     *
     * <p>The format is chosen from the file extension, compared case-insensitively:
     * {@code .doc} is read with {@link WordExtractor}, {@code .docx} with
     * {@link XWPFWordExtractor}. Any other extension (or a {@code null} path) prints a
     * message to standard output. Failures while reading are printed as a stack trace.
     *
     * @param path file system path of the document
     * @return the extracted text, or an empty string when the file is not a Word file
     *         or could not be read
     */
    public static String readWord(String path) {
        String buffer = "";
        if (path == null) {
            System.out.println("此文件不是word文件!");
            return buffer;
        }
        // Locale.ROOT keeps the comparison independent of the default locale.
        String lowerPath = path.toLowerCase(Locale.ROOT);
        try {
            if (lowerPath.endsWith(".doc")) {
                buffer = readDoc(path);
            } else if (lowerPath.endsWith(".docx")) {
                buffer = readDocx(path);
            } else {
                System.out.println("此文件不是word文件!");
            }
        } catch (Exception e) {
            // Best-effort utility: report the problem and fall through to return "".
            e.printStackTrace();
        }
        return buffer;
    }

    /** Reads a binary .doc file; the stream and extractor are closed even if extraction fails. */
    private static String readDoc(String path) throws IOException {
        try (InputStream is = new FileInputStream(new File(path));
             WordExtractor ex = new WordExtractor(is)) {
            return ex.getText();
        }
    }

    /** Reads an OOXML .docx file, releasing the opened package on every path. */
    private static String readDocx(String path) throws Exception {
        OPCPackage opcPackage = POIXMLDocument.openPackage(path);
        POIXMLTextExtractor extractor;
        try {
            extractor = new XWPFWordExtractor(opcPackage);
        } catch (Exception e) {
            // No extractor took ownership of the package, so release it here
            // without writing anything back to the file.
            opcPackage.revert();
            throw e;
        }
        try {
            return extractor.getText();
        } finally {
            // As in the original code, closing the extractor is what releases the package.
            extractor.close();
        }
    }
}
二、获取word各级标题(doc格式)
注意：只有在 Word 文档中预先为标题段落设置了标题样式，下面的代码才能读取到标题。
package com.wordcom;

import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.model.StyleDescription;
import org.apache.poi.hwpf.model.StyleSheet;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.ParagraphProperties;
import org.apache.poi.hwpf.usermodel.Range;
import java.io.*;

/**
 * Prints the title paragraphs of a .doc document.
 *
 * @author hp
 */
public class WordTitle {

    /** Paragraph style name that marks a paragraph as a title. */
    private static final String TITLE_STYLE_NAME = "标题";

    public static void main(String[] args) throws Exception {
        String filePath = "C:\\Users\\hp\\Desktop\\新建文件夹 (2)\\正文查找.doc";
        printWord(filePath);
    }

    /**
     * Prints to standard output the text of every paragraph whose style name is
     * exactly {@code "标题"}.
     *
     * <p>Paragraphs whose style index is outside the document's style sheet, or whose
     * style has no description or name, are skipped.
     *
     * @param filePath path of the .doc file to read
     * @throws IOException if the file cannot be opened or parsed
     */
    public static void printWord(String filePath) throws IOException {
        // try-with-resources closes both the stream and the document, even on failure.
        try (InputStream is = new FileInputStream(filePath);
             HWPFDocument doc = new HWPFDocument(is)) {

            // The style sheet and its size do not change while iterating; look them up once.
            StyleSheet styleSheet = doc.getStyleSheet();
            int numStyles = styleSheet.numStyles();

            Range range = doc.getRange(); // whole-document range
            int paragraphCount = range.numParagraphs();

            for (int i = 0; i < paragraphCount; i++) {
                Paragraph paragraph = range.getParagraph(i);
                int styleIndex = paragraph.getStyleIndex();
                if (styleIndex >= numStyles) {
                    continue;
                }

                StyleDescription style = styleSheet.getStyleDescription(styleIndex);
                // Constant-first equals keeps a missing style name from throwing.
                if (style != null && TITLE_STYLE_NAME.equals(style.getName())) {
                    System.out.println(paragraph.text());
                }
            }
        }
    }
}
三、按段落读取word(doc)(docx)
可以按照自己的需求提取特定的内容
doc
package com.wordcom;

import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.model.StyleDescription;
import org.apache.poi.hwpf.model.StyleSheet;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.ParagraphProperties;
import org.apache.poi.hwpf.usermodel.Range;
import java.io.*;

/**
 * Prints the heading-like paragraphs of a .doc document, selected by their text
 * rather than by their style.
 *
 * @author hp
 */
public class WordTitledoc {

    public static void main(String[] args) throws Exception {
        String filePath = "C:\\Users\\hp\\Desktop\\新建文件夹 (2)\\一案 .doc";
        printWord(filePath);
    }

    /**
     * Reads the document paragraph by paragraph and prints to standard output every
     * paragraph that {@link #looksLikeHeading(String)} accepts.
     *
     * <p>Paragraphs whose style index is outside the document's style sheet are skipped.
     *
     * @param filePath path of the .doc file to read
     * @throws IOException if the file cannot be opened or parsed
     */
    public static void printWord(String filePath) throws IOException {
        // try-with-resources closes both the stream and the document, even on failure.
        try (InputStream is = new FileInputStream(filePath);
             HWPFDocument doc = new HWPFDocument(is)) {

            int numStyles = doc.getStyleSheet().numStyles();
            Range range = doc.getRange(); // whole-document range
            int paragraphCount = range.numParagraphs();

            for (int i = 0; i < paragraphCount; i++) {
                Paragraph paragraph = range.getParagraph(i);
                if (paragraph.getStyleIndex() >= numStyles) {
                    continue;
                }
                String text = paragraph.text();
                if (looksLikeHeading(text)) {
                    System.out.println(text);
                }
            }
        }
    }

    /**
     * Text heuristic for a heading: the paragraph contains "." or "、" and none of
     * ",", ";", "。" or "20".
     *
     * <p>The original condition also required {@code !text.contains("")}. Every string
     * contains the empty string, so that clause was always false and no paragraph was
     * ever printed. The character it was meant to test is not recoverable, so the
     * clause is omitted here.
     */
    private static boolean looksLikeHeading(String text) {
        boolean hasNumberingMark = text.contains(".") || text.contains("、");
        if (!hasNumberingMark) {
            return false;
        }
        // NOTE(review): "20" presumably filters out lines carrying a year/date — confirm.
        return !text.contains(",")
                && !text.contains(";")
                && !text.contains("。")
                && !text.contains("20");
    }
}
docx
package com.wordcom;

import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import java.io.*;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

/**
 * Prints the heading-like paragraphs of a .docx document, selected by their text.
 *
 * @author hp
 */
public class WordTitledocx {

    public static void main(String[] args) throws Exception {
        String filePath = "C:\\Users\\hp\\Desktop\\新建文件夹 (2)\\忻州地调中心站11楼机房更换通信电源三措一案.docx";
        printWord(filePath);
    }

    /**
     * Reads the document paragraph by paragraph and prints to standard output every
     * paragraph that {@link #looksLikeHeading(String)} accepts.
     *
     * @param filePath path of the .docx file to read
     * @throws IOException if the file cannot be opened or parsed
     */
    public static void printWord(String filePath) throws IOException {
        // try-with-resources closes both the stream and the document, even on failure.
        try (InputStream is = new FileInputStream(filePath);
             XWPFDocument doc = new XWPFDocument(is)) {

            List<XWPFParagraph> paragraphs = doc.getParagraphs();
            for (XWPFParagraph paragraph : paragraphs) {
                String text = paragraph.getParagraphText();
                if (looksLikeHeading(text)) {
                    System.out.println(text);
                }
            }
        }
    }

    /**
     * Text heuristic for a heading: the paragraph contains "." or "、" and none of
     * ",", ";", "。" or "20".
     *
     * <p>The original condition also required {@code !text.contains("")}. Every string
     * contains the empty string, so that clause was always false and no paragraph was
     * ever printed. The character it was meant to test is not recoverable, so the
     * clause is omitted here.
     */
    private static boolean looksLikeHeading(String text) {
        boolean hasNumberingMark = text.contains(".") || text.contains("、");
        if (!hasNumberingMark) {
            return false;
        }
        // NOTE(review): "20" presumably filters out lines carrying a year/date — confirm.
        return !text.contains(",")
                && !text.contains(";")
                && !text.contains("。")
                && !text.contains("20");
    }
}
来源:https://www.cnblogs.com/nntld/p/15527369.html
最新更新
python爬虫及其可视化
使用python爬取豆瓣电影短评评论内容
nodejs爬虫
Python正则表达式完全指南
爬取豆瓣Top250图书数据
shp 地图文件批量添加字段
爬虫小试牛刀(爬取学校通知公告)
【python基础】函数-初识函数
【python基础】函数-返回值
HTTP请求:requests模块基础使用必知必会
SQL SERVER中递归
2个场景实例讲解GaussDB(DWS)基表统计信息估
常用的 SQL Server 关键字及其含义
动手分析SQL Server中的事务中使用的锁
openGauss内核分析:SQL by pass & 经典执行
一招教你如何高效批量导入与更新数据
天天写SQL,这些神奇的特性你知道吗?
openGauss内核分析:执行计划生成
[IM002]Navicat ODBC驱动器管理器 未发现数据
初入Sql Server 之 存储过程的简单使用
uniapp/H5 获取手机桌面壁纸 (静态壁纸)
[前端] DNS解析与优化
为什么在js中需要添加addEventListener()?
JS模块化系统
js通过Object.defineProperty() 定义和控制对象
这是目前我见过最好的跨域解决方案!
减少回流与重绘
减少回流与重绘
如何使用KrpanoToolJS在浏览器切图
performance.now() 与 Date.now() 对比