-
C# Net 使用 openxml 提取word中的文本和图片并转为Html
C# Net Core openxml 提取 提出 取 word 文本 图片 Html Text Drawing
C# Net Core openxml 提取 提出 取 word 文本 图片 Html Text Drawing
注:图片只支持内嵌(inline)方式,不支持浮动图片,也不支持公式
------------------------------------------------
---------------文章最后为效果------------
------------------------------------------------
加入 NuGet 包:DocumentFormat.OpenXml
创建文件:Read.cs
复制下面全部代码到文件 Read.cs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
|
using DocumentFormat.OpenXml;
using DocumentFormat.OpenXml.Packaging;
using DocumentFormat.OpenXml.Wordprocessing;
using System;
using System.Collections.Generic;
using System.Globalization;
using System.IO;
using System.Net;
using System.Text;
using System.Xml;
using System.Xml.Xsl;

namespace YCBX.Office.WordXml
{
    /// <summary>
    /// Extracts the text and inline images of a Word (.docx) document as HTML
    /// fragments, and merges several documents into one.
    /// </summary>
    /// <remarks>
    /// Only paragraphs that are direct children of the document body are read,
    /// and within them only runs that are direct children of the paragraph.
    /// Office Math elements are not converted.
    /// </remarks>
    public class WordRead
    {
        /// <summary>
        /// Reads the Word document at <paramref name="wordPathStr"/> and returns
        /// its content as a list of HTML fragments.
        /// </summary>
        /// <param name="wordPathStr">Path of the .docx file to read.</param>
        /// <returns>One HTML fragment per paragraph or soft-break-separated line.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="wordPathStr"/> is null.</exception>
        public static List<string> ReadToHtml(string wordPathStr)
        {
            if (wordPathStr == null)
            {
                throw new ArgumentNullException(nameof(wordPathStr));
            }

            // The stream is owned by this overload, so it must be disposed here;
            // read-only access also lets write-protected files be opened.
            using (var fileStream = new FileStream(wordPathStr, FileMode.Open, FileAccess.Read, FileShare.Read))
            {
                return ReadToHtml(fileStream);
            }
        }

        /// <summary>
        /// Reads a Word document from <paramref name="wordStream"/> and returns
        /// its content as a list of HTML fragments.
        /// </summary>
        /// <param name="wordStream">Stream containing the .docx package. It is not closed by this method.</param>
        /// <returns>
        /// One entry per body paragraph; a break element inside a run ends the
        /// current entry and starts a new one. Text is HTML-encoded and inline
        /// pictures are emitted as <c>&lt;img&gt;</c> tags with a base64 data URI.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="wordStream"/> is null.</exception>
        public static List<string> ReadToHtml(Stream wordStream)
        {
            if (wordStream == null)
            {
                throw new ArgumentNullException(nameof(wordStream));
            }

            using (WordprocessingDocument doc = WordprocessingDocument.Open(wordStream, false))
            {
                var paragraphHtmls = new List<string>();
                MainDocumentPart mainPart = doc.MainDocumentPart;
                Body body = mainPart?.Document?.Body;
                if (body == null)
                {
                    // Package without a main document body: nothing to extract.
                    return paragraphHtmls;
                }

                // Paragraphs
                foreach (Paragraph paragraph in body.Elements<Paragraph>())
                {
                    var paragraphHtml = new StringBuilder();

                    // Runs (other paragraph children, e.g. Office Math, are skipped)
                    foreach (OpenXmlElement child in paragraph.ChildElements)
                    {
                        if (!(child is Run))
                        {
                            continue;
                        }

                        foreach (OpenXmlElement element in child.Elements())
                        {
                            if (element is Break)
                            {
                                // Soft line break: flush what has been collected so far
                                // as its own entry and start a new one.
                                paragraphHtmls.Add(paragraphHtml.ToString());
                                paragraphHtml.Clear();
                            }
                            else if (element is Text text)
                            {
                                // Document text is not markup; encode it so characters
                                // such as '<' and '&' cannot break the generated HTML.
                                paragraphHtml.Append(WebUtility.HtmlEncode(text.Text));
                            }
                            else if (element is Drawing drawing)
                            {
                                AppendInlineImage(paragraphHtml, mainPart, drawing);
                            }
                        }
                    }

                    paragraphHtmls.Add(paragraphHtml.ToString());
                }

                return paragraphHtmls;
            }
        }

        /// <summary>
        /// Appends an <c>&lt;img&gt;</c> tag for an inline picture to <paramref name="html"/>.
        /// Drawings that are not inline pictures with an embedded image are skipped.
        /// </summary>
        private static void AppendInlineImage(StringBuilder html, MainDocumentPart mainPart, Drawing drawing)
        {
            // Only inline drawings are supported; for floating (anchored) drawings
            // Inline is null, so they are skipped instead of throwing.
            var inline = drawing.Inline;
            var extent = inline?.Extent;
            var picture = inline?.Graphic?.GraphicData?
                .GetFirstChild<DocumentFormat.OpenXml.Drawing.Pictures.Picture>();
            string embedId = picture?.BlipFill?.Blip?.Embed?.Value;
            long? widthEmu = extent?.Cx?.Value;
            long? heightEmu = extent?.Cy?.Value;

            if (string.IsNullOrEmpty(embedId) || widthEmu == null || heightEmu == null)
            {
                return;
            }

            // Resolve the relationship id to the image part and read all of its bytes.
            OpenXmlPart part = mainPart.GetPartById(embedId);
            byte[] bytes;
            using (Stream imageStream = part.GetStream())
            using (var buffer = new MemoryStream())
            {
                // CopyTo reads until end of stream; a single Read call is not
                // guaranteed to fill the buffer.
                imageStream.CopyTo(buffer);
                bytes = buffer.ToArray();
            }

            // Invariant culture keeps the attribute values plain ASCII digits
            // regardless of the machine's regional settings.
            string width = ImageExtent.EMU_TO_PX(widthEmu.Value).ToString("0.", CultureInfo.InvariantCulture);
            string height = ImageExtent.EMU_TO_PX(heightEmu.Value).ToString("0.", CultureInfo.InvariantCulture);

            html.Append($"<img width='{width}' height='{height}' src='data:{part.ContentType};base64,{Convert.ToBase64String(bytes)}' />");
        }

        /// <summary>
        /// Merges documents: copies the first file to <paramref name="finalFile"/>
        /// (overwriting it) and appends deep clones of the body elements of every
        /// further file to it.
        /// </summary>
        /// <param name="finalFile">Path of the merged output document.</param>
        /// <param name="files">Paths of the documents to merge, in order.</param>
        /// <remarks>
        /// Does nothing when fewer than two files are given. Only body XML is
        /// cloned; parts referenced from the appended content are not copied here.
        /// </remarks>
        /// <exception cref="ArgumentNullException"><paramref name="files"/> is null.</exception>
        public static void Combine(string finalFile, List<string> files)
        {
            if (files == null)
            {
                throw new ArgumentNullException(nameof(files));
            }

            if (files.Count < 2)
            {
                return;
            }

            File.Copy(files[0], finalFile, true);

            using (WordprocessingDocument target = WordprocessingDocument.Open(finalFile, true))
            {
                Body targetBody = target.MainDocumentPart.Document.Body;
                for (int i = 1; i < files.Count; i++)
                {
                    // Source documents are only read, so open them read-only.
                    using (WordprocessingDocument source = WordprocessingDocument.Open(files[i], false))
                    {
                        foreach (OpenXmlElement element in source.MainDocumentPart.Document.Body.Elements())
                        {
                            targetBody.Append(element.CloneNode(true));
                        }
                    }
                }
            }
        }

        /// <summary>
        /// Transforms the XML in <paramref name="ms"/> with the stylesheet
        /// <c>xsltml/mmltex.xsl</c> under the application base directory and
        /// returns the output as a string.
        /// </summary>
        /// <remarks>
        /// Currently unused: intended for Office Math support together with
        /// <see cref="ConvertToMatchMl"/>. Static so it can be called from the
        /// static read methods.
        /// </remarks>
        private static string ConvertToLatex(XmlWriterSettings settings, Stream ms)
        {
            var latexTransform = new XslCompiledTransform();

            // NOTE(review): XsltSettings(true, true) enables the document() function
            // and embedded script; only load trusted stylesheets this way.
            latexTransform.Load(
                Path.Combine(AppContext.BaseDirectory, "xsltml", "mmltex.xsl"),
                new XsltSettings(true, true),
                new XmlUrlResolver());

            using var output = new MemoryStream();
            latexTransform.Transform(new XmlTextReader(ms), XmlWriter.Create(output, settings));
            output.Seek(0, SeekOrigin.Begin);

            var reader = new StreamReader(output, Encoding.UTF8);
            return reader.ReadToEnd();
        }

        /// <summary>
        /// Transforms <paramref name="xmlDocument"/> with the stylesheet
        /// <c>xsltml/OMML2MML.XSL</c> under the application base directory and
        /// returns the output as a stream positioned at its start.
        /// </summary>
        /// <remarks>
        /// Currently unused. The caller owns the returned stream and must dispose it.
        /// </remarks>
        private static Stream ConvertToMatchMl(XmlDocument xmlDocument, XmlWriterSettings settings)
        {
            var ms = new MemoryStream();
            var xslTransform = new XslCompiledTransform();
            xslTransform.Load(Path.Combine(AppContext.BaseDirectory, "xsltml", "OMML2MML.XSL"));
            xslTransform.Transform(xmlDocument, XmlWriter.Create(ms, settings));
            ms.Seek(0, SeekOrigin.Begin);
            return ms;
        }
    }
}
创建文件:ImageExtent.cs
复制下面全部代码到文件 ImageExtent.cs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
|
using System;
using System.Collections.Generic;
using System.Text;

namespace YCBX.Office.WordXml
{
    /// <summary>
    /// Unit conversion for image dimensions.
    /// </summary>
    public class ImageExtent
    {
        /// <summary>
        /// Number of EMU (English Metric Units) in one centimetre.
        /// </summary>
        private const decimal EmuPerCentimetre = 360000M;

        /// <summary>
        /// Number of centimetres in one inch.
        /// </summary>
        private const decimal CentimetresPerInch = 2.54M;

        /// <summary>
        /// Number of pixels in one inch used by the conversion.
        /// </summary>
        private const decimal PixelsPerInch = 96M;

        /// <summary>
        /// Converts a length in EMU (English Metric Units) to pixels.
        /// </summary>
        /// <param name="EMU">Length in EMU.</param>
        /// <returns>The length in pixels; the result is not rounded.</returns>
        public static decimal EMU_TO_PX(decimal EMU)
        {
            // EMU -> centimetres -> inches -> pixels, evaluated in this order.
            decimal centimetres = EMU / EmuPerCentimetre;
            decimal inches = centimetres / CentimetresPerInch;
            return inches * PixelsPerInch;
        }
    }
}
调用方法:
1
|
// ReadToHtml is a static method of WordRead (namespace YCBX.Office.WordXml);
// it returns one HTML fragment per paragraph / soft-break-separated line.
var sss = WordRead.ReadToHtml("1.docx");
word文件中为:
创建一个 test.html,将返回的 HTML 字符串放在 <body></body> 中,查看效果为:
完成
栏目列表
最新更新
nodejs爬虫
Python正则表达式完全指南
爬取豆瓣Top250图书数据
shp 地图文件批量添加字段
爬虫小试牛刀(爬取学校通知公告)
【python基础】函数-初识函数
【python基础】函数-返回值
HTTP请求:requests模块基础使用必知必会
Python初学者友好丨详解参数传递类型
如何有效管理爬虫流量?
SQL SERVER中递归
2个场景实例讲解GaussDB(DWS)基表统计信息估
常用的 SQL Server 关键字及其含义
动手分析SQL Server中的事务中使用的锁
openGauss内核分析:SQL by pass & 经典执行
一招教你如何高效批量导入与更新数据
天天写SQL,这些神奇的特性你知道吗?
openGauss内核分析:执行计划生成
[IM002]Navicat ODBC驱动器管理器 未发现数据
初入Sql Server 之 存储过程的简单使用
这是目前我见过最好的跨域解决方案!
减少回流与重绘
减少回流与重绘
如何使用KrpanoToolJS在浏览器切图
performance.now() 与 Date.now() 对比
一款纯 JS 实现的轻量化图片编辑器
关于开发 VS Code 插件遇到的 workbench.scm.
前端设计模式——观察者模式
前端设计模式——中介者模式
创建型-原型模式