VB.net 2010 视频教程 VB.net 2010 视频教程 python基础视频教程
SQL Server 2008 视频教程 c#入门经典教程 Visual Basic从入门到精通视频教程
当前位置:
首页 > c#编程 >
  • C# Net 使用 openxml 提取word中的文本和图片并转为Html

C# Net Core openxml 提取 提出 取 word  文本  图片 Html Text Drawing

C# Net Core openxml 提取 提出 取 word  文本  图片 Html Text Drawing

注:只支持内嵌(inline)图片,不支持浮动(anchor)图片和公式

 

------------------------------------------------

---------------文章最后为效果------------

------------------------------------------------

 

加入 NuGet 包:DocumentFormat.OpenXml

创建文件:Read.cs

复制下面全部代码到文件 Read.cs

 

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
using DocumentFormat.OpenXml;
using DocumentFormat.OpenXml.Packaging;
using DocumentFormat.OpenXml.Wordprocessing;
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
using System.Xml;
using System.Xml.Xsl;
 
namespace YCBX.Office.WordXml
{
    public class WordRead
    {
        public static List<string> ReadToHtml(string wordPathStr)
        {
            return ReadToHtml(new FileStream(wordPathStr, FileMode.Open));
        }
 
        public static List<string> ReadToHtml(Stream wordStream)
        {
            using (WordprocessingDocument doc = WordprocessingDocument.Open(wordStream, false))
            {
                //XmlWriterSettings settings = new XmlWriterSettings() { OmitXmlDeclaration = true, ConformanceLevel = ConformanceLevel.Auto,DoNotEscapeUriAttributes=true};
                List<string> paragraphHtmls = new List<string>();
 
                MainDocumentPart mainPart = doc.MainDocumentPart;
                Body body = doc.MainDocumentPart.Document.Body;
 
                //段落
                foreach (var paragraph in body.Elements<Paragraph>())
                {
                    StringBuilder paragraphHtml = new StringBuilder();
                    //块
                    foreach (var run in paragraph.ChildElements)
                    {
                        if (run is Run)
                        {
                            foreach (OpenXmlElement openXmlElement in run.Elements())
                            {
                                //软回车
                                if (openXmlElement is Break br)
                                {
                                    paragraphHtmls.Add(paragraphHtml.ToString());
                                    paragraphHtml = new StringBuilder();
                                }
                                //文字块
                                else if (openXmlElement is Text text)
                                {
                                    paragraphHtml.Append(text.Text);
                                }
                                //图像块
                                else if (openXmlElement is Drawing drawing)
                                {
                                    //得到图像的内嵌ID(外嵌没做处理)
                                    var inline = drawing.Inline;
                                    var extent = inline.Extent;
                                    var pic = inline.Graphic.GraphicData.GetFirstChild<DocumentFormat.OpenXml.Drawing.Pictures.Picture>();
                                    var embed = pic.BlipFill.Blip.Embed.Value;
 
                                    //得到图像流
                                    var part = mainPart.GetPartById(embed);
                                    var stream = part.GetStream();
 
                                    //流转2进制
                                    byte[] bytes = new byte[stream.Length];
                                    stream.Read(bytes, 0, bytes.Length);
 
                                    //2进制转base64
                                    string imgHtml = $"<img width='{ImageExtent.EMU_TO_PX((decimal)extent.Cx.Value).ToString("0.")}' height='{ImageExtent.EMU_TO_PX((decimal)extent.Cy.Value).ToString("0.")}' src='data:{part.ContentType};base64," + Convert.ToBase64String(bytes) + "' />";
                                    paragraphHtml.Append(imgHtml);
                                }
                            }
                        }
                        //else if(run is DocumentFormat.OpenXml.Math.OfficeMath math)
                        //{
                        //    var x = new XmlDocument();
                        //    x.LoadXml(math.OuterXml);
                        //    using var ms = ConvertToMatchMl(x, settings);
                        //    paragraphHtml.Append(ConvertToLatex(settings, ms));
                        //}
                    }
 
                    paragraphHtmls.Add(paragraphHtml.ToString());
                }
 
                return paragraphHtmls;
            }
            
        }
 
        /// <summary>
        /// 合并文档
        /// </summary>
        /// <param name="finalFile"></param>
        /// <param name="files"></param>
        public static void Combine(string finalFile, List<string> files)
        {
            if (files.Count < 2)
            {
                return;
            }
            File.Copy(files[0], finalFile, true);
            using (WordprocessingDocument doc = WordprocessingDocument.Open(finalFile, true))
            {
                Body b = doc.MainDocumentPart.Document.Body;
                for (int i = 1; i < files.Count; i++)
                {
                    using (WordprocessingDocument doc1 = WordprocessingDocument.Open(files[i], true))
                    {
                        foreach (var inst in doc1.MainDocumentPart.Document.Body.Elements())
                        {
                            b.Append(inst.CloneNode(true));
                        }
                    }
                }
            }
        }
 
        private string ConvertToLatex(XmlWriterSettings settings, Stream ms)
        {
            var latexTransform = new XslCompiledTransform();
            latexTransform.Load(Path.Combine(AppContext.BaseDirectory, "xsltml""mmltex.xsl"), new XsltSettings(true,true),new XmlUrlResolver() );
            using var la = new MemoryStream();
            latexTransform.Transform(new XmlTextReader(ms), XmlWriter.Create(la, settings));
            la.Seek(0, SeekOrigin.Begin);
            StreamReader sr = new StreamReader(la, Encoding.UTF8);
            return sr.ReadToEnd();
        }
 
        private Stream ConvertToMatchMl(XmlDocument xmlDocument, XmlWriterSettings settings)
        {
            var ms = new MemoryStream();
            var xslTransform = new XslCompiledTransform();
            xslTransform.Load(Path.Combine(AppContext.BaseDirectory, "xsltml""OMML2MML.XSL"));
            xslTransform.Transform(xmlDocument, XmlWriter.Create(ms, settings));
            ms.Seek(0, SeekOrigin.Begin);
            return ms;
        }
    }
}

  

 

创建文件:ImageExtent.cs

复制下面全部代码到文件 ImageExtent.cs

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
using System;
using System.Collections.Generic;
using System.Text;
 
namespace YCBX.Office.WordXml
{
    /// <summary>
    /// 图像长度单位转换
    /// </summary>
    public class ImageExtent
    {
        private const decimal CM_TO_PX = 96M;
        private const decimal INCH_TO_CM = 2.54M;
        /// <summary>
        /// 厘米到EMU(English Metric Unit)
        /// </summary>
        private const decimal CM_TO_EMU = 360000M;
 
        /// <summary>
        /// EMU(English Metric Unit) 到像素(px)
        /// </summary>
        /// <param name="EMU"></param>
        public static decimal EMU_TO_PX(decimal EMU)
        {
            return EMU / CM_TO_EMU / INCH_TO_CM * CM_TO_PX;
        }
    }
}

  

调用方法:

1
// ReadToHtml is a static method of WordRead; each list entry is one HTML fragment.
var sss = WordRead.ReadToHtml("1.docx");

 

word文件中为:

 

 

创建一个test.html,将返回的各段 HTML 字符串放在<body></body>中,查看效果为:

 

 

完成

 


相关教程