当前位置:
首页 > temp > 简明python教程 >
-
C# Net 使用openxml提取word中的文本和图片并转为Html
C# Net Core openxml 提取 提出 取 word 文本 图片 Html Text Drawing
C# Net Core openxml 提取 提出 取 word 文本 图片 Html Text Drawing
注:只支持内嵌(inline)图片,不支持浮动(anchor)图片和公式
------------------------------------------------
---------------文章最后为效果------------
------------------------------------------------
加入 NuGet 包:DocumentFormat.OpenXml
创建文件:Read.cs
复制下面全部代码到文件 Read.cs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
|
using DocumentFormat.OpenXml;
using DocumentFormat.OpenXml.Packaging;
using DocumentFormat.OpenXml.Wordprocessing;
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
using System.Xml;
using System.Xml.Xsl;

namespace YCBX.Office.WordXml
{
    /// <summary>
    /// Extracts the text and inline images of a Word (.docx) document as HTML fragments,
    /// and offers a simple helper to concatenate several documents.
    /// </summary>
    /// <remarks>
    /// Only inline drawings are converted; anchored (floating) drawings and Office Math
    /// formulas are skipped. <c>ImageExtent.EMU_TO_PX</c> is not defined in this file and
    /// must be provided elsewhere in the project.
    /// </remarks>
    public class WordRead
    {
        /// <summary>
        /// Reads the Word document at <paramref name="wordPathStr"/> and converts it to HTML fragments.
        /// </summary>
        /// <param name="wordPathStr">Path of the .docx file to read.</param>
        /// <returns>One HTML fragment per paragraph (and per line break inside a paragraph).</returns>
        public static List<string> ReadToHtml(string wordPathStr)
        {
            // Open read-only and dispose the handle when done; the document object does not
            // take ownership of a stream handed to it, so the caller has to close the file.
            using (FileStream wordStream = new FileStream(wordPathStr, FileMode.Open, FileAccess.Read, FileShare.Read))
            {
                return ReadToHtml(wordStream);
            }
        }

        /// <summary>
        /// Reads a Word document from <paramref name="wordStream"/> and converts it to HTML fragments.
        /// </summary>
        /// <param name="wordStream">Stream containing the .docx package. It is not closed by this method.</param>
        /// <returns>
        /// One entry per body paragraph; a break element inside a run flushes the text collected
        /// so far as its own entry. Text is HTML-encoded, images are emitted as
        /// <c>&lt;img&gt;</c> tags with a base64 data URI.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="wordStream"/> is null.</exception>
        public static List<string> ReadToHtml(Stream wordStream)
        {
            if (wordStream == null)
            {
                throw new ArgumentNullException(nameof(wordStream));
            }

            using (WordprocessingDocument doc = WordprocessingDocument.Open(wordStream, false))
            {
                List<string> paragraphHtmls = new List<string>();
                MainDocumentPart mainPart = doc.MainDocumentPart;
                Body body = mainPart?.Document?.Body;
                if (body == null)
                {
                    // A package without a main document body has nothing to convert.
                    return paragraphHtmls;
                }

                // Paragraphs
                foreach (Paragraph paragraph in body.Elements<Paragraph>())
                {
                    StringBuilder paragraphHtml = new StringBuilder();

                    // Only direct Run children are converted. Office Math children are not
                    // supported yet; ConvertToMatchMl / ConvertToLatex below are kept for that.
                    foreach (OpenXmlElement child in paragraph.ChildElements)
                    {
                        if (!(child is Run))
                        {
                            continue;
                        }

                        foreach (OpenXmlElement element in child.Elements())
                        {
                            if (element is Break)
                            {
                                // Soft return: emit what has been collected and start a new fragment.
                                paragraphHtmls.Add(paragraphHtml.ToString());
                                paragraphHtml = new StringBuilder();
                            }
                            else if (element is Text text)
                            {
                                // Encode so that '<', '>' and '&' in the document cannot break the markup.
                                paragraphHtml.Append(System.Net.WebUtility.HtmlEncode(text.Text));
                            }
                            else if (element is Drawing drawing)
                            {
                                string imgHtml = BuildImageHtml(mainPart, drawing);
                                if (imgHtml != null)
                                {
                                    paragraphHtml.Append(imgHtml);
                                }
                            }
                        }
                    }

                    paragraphHtmls.Add(paragraphHtml.ToString());
                }

                return paragraphHtmls;
            }
        }

        /// <summary>
        /// Builds an <c>&lt;img&gt;</c> tag with an embedded base64 data URI for an inline drawing.
        /// </summary>
        /// <param name="mainPart">Main document part that owns the image relationship.</param>
        /// <param name="drawing">Drawing element found inside a run.</param>
        /// <returns>The HTML tag, or null when the drawing is not an inline embedded picture.</returns>
        private static string BuildImageHtml(MainDocumentPart mainPart, Drawing drawing)
        {
            // Resolve the embedded relationship id; anchored drawings, charts and linked
            // (non-embedded) pictures leave one of these links null and are skipped.
            var inline = drawing.Inline;
            var extent = inline?.Extent;
            var pic = inline?.Graphic?.GraphicData?.GetFirstChild<DocumentFormat.OpenXml.Drawing.Pictures.Picture>();
            string embed = pic?.BlipFill?.Blip?.Embed?.Value;
            if (extent?.Cx is null || extent.Cy is null || string.IsNullOrEmpty(embed))
            {
                return null;
            }

            var part = mainPart.GetPartById(embed);

            // Copy the whole part: a single Stream.Read call is allowed to return fewer
            // bytes than requested, which would truncate the image.
            byte[] bytes;
            using (Stream stream = part.GetStream())
            using (MemoryStream buffer = new MemoryStream())
            {
                stream.CopyTo(buffer);
                bytes = buffer.ToArray();
            }

            // "0." renders the pixel size without a fractional part.
            string width = ImageExtent.EMU_TO_PX((decimal)extent.Cx.Value).ToString("0.");
            string height = ImageExtent.EMU_TO_PX((decimal)extent.Cy.Value).ToString("0.");

            return $"<img width='{width}' height='{height}' src='data:{part.ContentType};base64," + Convert.ToBase64String(bytes) + "' />";
        }

        /// <summary>
        /// Merges documents: copies the first file to <paramref name="finalFile"/> and appends
        /// the body elements of every following file to it.
        /// </summary>
        /// <param name="finalFile">Path of the merged document; overwritten if it exists.</param>
        /// <param name="files">Source documents in order. Nothing is written when fewer than two are given.</param>
        /// <exception cref="ArgumentNullException"><paramref name="files"/> is null.</exception>
        /// <remarks>
        /// NOTE(review): only the body XML is cloned; parts referenced by relationship id
        /// (images etc.) in the appended documents are presumably not carried over - confirm
        /// before using this on documents that contain pictures.
        /// </remarks>
        public static void Combine(string finalFile, List<string> files)
        {
            if (files == null)
            {
                throw new ArgumentNullException(nameof(files));
            }

            if (files.Count < 2)
            {
                return;
            }

            File.Copy(files[0], finalFile, true);

            using (WordprocessingDocument target = WordprocessingDocument.Open(finalFile, true))
            {
                Body targetBody = target.MainDocumentPart.Document.Body;
                for (int i = 1; i < files.Count; i++)
                {
                    // Sources are only read, so open them read-only to avoid touching the files.
                    using (WordprocessingDocument source = WordprocessingDocument.Open(files[i], false))
                    {
                        foreach (OpenXmlElement element in source.MainDocumentPart.Document.Body.Elements())
                        {
                            // Clone: an element cannot belong to two document trees at once.
                            targetBody.Append(element.CloneNode(true));
                        }
                    }
                }
            }
        }

        /// <summary>
        /// Transforms MathML read from <paramref name="ms"/> with the <c>xsltml/mmltex.xsl</c>
        /// stylesheet located under the application base directory.
        /// </summary>
        /// <param name="settings">Writer settings used for the transformation output.</param>
        /// <param name="ms">Stream positioned at the start of the MathML input; it stays owned by the caller.</param>
        /// <returns>The transformation output decoded as UTF-8 text.</returns>
        private static string ConvertToLatex(XmlWriterSettings settings, Stream ms)
        {
            var latexTransform = new XslCompiledTransform();
            // NOTE(review): XsltSettings(true, true) enables document() and embedded scripts;
            // only load stylesheets that ship with the application.
            latexTransform.Load(
                Path.Combine(AppContext.BaseDirectory, "xsltml", "mmltex.xsl"),
                new XsltSettings(true, true),
                new XmlUrlResolver());

            using var la = new MemoryStream();
            XmlWriter writer = XmlWriter.Create(la, settings);
            // The reader is intentionally not disposed: closing it would close the caller's stream.
            latexTransform.Transform(new XmlTextReader(ms), writer);
            // Push buffered output into the memory stream before reading it back.
            writer.Flush();

            la.Seek(0, SeekOrigin.Begin);
            StreamReader sr = new StreamReader(la, Encoding.UTF8);
            return sr.ReadToEnd();
        }

        /// <summary>
        /// Transforms an XML document with the <c>xsltml/OMML2MML.XSL</c> stylesheet located
        /// under the application base directory.
        /// </summary>
        /// <param name="xmlDocument">Input document (the disabled caller passed the outer XML of an Office Math element).</param>
        /// <param name="settings">Writer settings used for the transformation output.</param>
        /// <returns>A stream rewound to the start of the output; the caller must dispose it.</returns>
        private static Stream ConvertToMatchMl(XmlDocument xmlDocument, XmlWriterSettings settings)
        {
            var ms = new MemoryStream();
            var xslTransform = new XslCompiledTransform();
            xslTransform.Load(Path.Combine(AppContext.BaseDirectory, "xsltml", "OMML2MML.XSL"));

            XmlWriter writer = XmlWriter.Create(ms, settings);
            xslTransform.Transform(xmlDocument, writer);
            // Push buffered output into the memory stream before handing it to the caller.
            writer.Flush();

            ms.Seek(0, SeekOrigin.Begin);
            return ms;
        }
    }
}
栏目列表
最新更新
nodejs爬虫
Python正则表达式完全指南
爬取豆瓣Top250图书数据
shp 地图文件批量添加字段
爬虫小试牛刀(爬取学校通知公告)
【python基础】函数-初识函数
【python基础】函数-返回值
HTTP请求:requests模块基础使用必知必会
Python初学者友好丨详解参数传递类型
如何有效管理爬虫流量?
2个场景实例讲解GaussDB(DWS)基表统计信息估
常用的 SQL Server 关键字及其含义
动手分析SQL Server中的事务中使用的锁
openGauss内核分析:SQL by pass & 经典执行
一招教你如何高效批量导入与更新数据
天天写SQL,这些神奇的特性你知道吗?
openGauss内核分析:执行计划生成
[IM002]Navicat ODBC驱动器管理器 未发现数据
初入Sql Server 之 存储过程的简单使用
SQL Server -- 解决存储过程传入参数作为s
关于JS定时器的整理
JS中使用Promise.all控制所有的异步请求都完
js中字符串的方法
import-local执行流程与node模块路径解析流程
检测数据类型的四种方法
js中数组的方法,32种方法
前端操作方法
数据类型
window.localStorage.setItem 和 localStorage.setIte
如何完美解决前端数字计算精度丢失与数