I have a problem parsing a *.docx document with OpenXML (C#).
So, here are my steps:
1. Load a *.docx document
2. Get a list of items
3. Look for text, image and table elements in each paragraph
4. Create html tags for each text and image element
5. Save the output as a *.html file
I learned how to find an image file in a document and extract it. One step remains: finding where the table is positioned in the text (paragraph).
If anyone knows how to find a table in a .docx document using OpenXML, please help. Thanks.
Additionally: Well, maybe I didn't explain clearly what I mean. If we get the content of a paragraph, we can find child objects such as text blocks, images, and so on. So, if the paragraph contains a Run that contains a Picture, this means that an image is placed at that point in the Word document.
An example of my function:
/// <summary>
/// Converts every paragraph of a .docx file into a simple HTML fragment:
/// one <c>div</c> per paragraph, carrying the paragraph's alignment and
/// paragraph-mark formatting, with its text and VML pictures inside.
/// </summary>
/// <param name="pathToFile">Path of the .docx file to read.</param>
/// <returns>
/// The generated HTML, one <c>div</c> per line; an empty string when the
/// package has no main document.
/// </returns>
/// <remarks>
/// Paragraphs are enumerated with <c>Descendants</c>, so table structure is
/// not reproduced: paragraphs inside table cells are emitted as plain divs.
/// Only VML pictures (<c>w:pict</c>) are rendered as images.
/// </remarks>
public static string ParseDocxDocument(string pathToFile)
{
    StringBuilder result = new StringBuilder();

    // The document is only read, so open it read-only; the using block
    // releases the package even when an exception is thrown half-way.
    using (WordprocessingDocument wordProcessingDoc = WordprocessingDocument.Open(pathToFile, false))
    {
        MainDocumentPart mainPart = wordProcessingDoc.MainDocumentPart;
        if (mainPart == null || mainPart.Document == null)
        {
            return string.Empty;
        }

        foreach (Paragraph par in mainPart.Document.Descendants<Paragraph>())
        {
            var properties = par.ParagraphProperties;

            result.Append("<div style=\"width:100%; text-align:");
            var justification = properties != null ? properties.Justification : null;
            if (justification != null && justification.Val != null)
            {
                switch (justification.Val.Value)
                {
                    case JustificationValues.Left:
                        result.Append("left;");
                        break;
                    case JustificationValues.Center:
                        result.Append("center;");
                        break;
                    case JustificationValues.Both:
                        result.Append("justify;");
                        break;
                    case JustificationValues.Right:
                    default:
                        result.Append("right;");
                        break;
                }
            }
            else
            {
                result.Append("left;");
            }

            var markProperties = properties != null ? properties.ParagraphMarkRunProperties : null;
            if (markProperties != null && markProperties.HasChildren)
            {
                foreach (OpenXmlElement child in markProperties.ChildElements)
                {
                    switch (child.GetType().Name)
                    {
                        case "Bold":
                            result.Append("font-weight:bold;");
                            break;
                        case "Underline":
                            result.Append("text-decoration:underline;");
                            break;
                        case "Italic":
                            result.Append("font-style:italic;");
                            break;
                        case "FontSize":
                            // w:sz stores the size in half-points, so halve it and emit points.
                            FontSize fontSize = (FontSize)child;
                            double halfPoints;
                            if (fontSize.Val != null &&
                                double.TryParse(
                                    fontSize.Val.Value,
                                    System.Globalization.NumberStyles.Float,
                                    System.Globalization.CultureInfo.InvariantCulture,
                                    out halfPoints))
                            {
                                result.Append("font-size:");
                                result.Append((halfPoints / 2).ToString(System.Globalization.CultureInfo.InvariantCulture));
                                result.Append("pt;");
                            }
                            break;
                        default:
                            break;
                    }
                }
            }

            result.Append("\">");

            // Walk the paragraph content once, in document order, so pictures
            // end up between the text pieces they sit between in Word.
            bool hasText = false;
            foreach (OpenXmlElement element in par.Descendants())
            {
                Text text = element as Text;
                if (text != null)
                {
                    hasText = true;
                    // Encode so that '<', '>' and '&' in the document cannot break the markup.
                    result.Append(System.Net.WebUtility.HtmlEncode(text.Text));
                    continue;
                }

                var picture = element as DocumentFormat.OpenXml.Wordprocessing.Picture;
                if (picture == null)
                {
                    continue;
                }

                var shape = picture.GetFirstChild<DocumentFormat.OpenXml.Vml.Shape>();
                var imageData = shape != null ? shape.GetFirstChild<DocumentFormat.OpenXml.Vml.ImageData>() : null;
                if (imageData == null || imageData.RelationshipId == null)
                {
                    // No resolvable image reference: skip instead of failing.
                    continue;
                }

                // Resolve the picture through its relationship id; the order of
                // ImageParts is not tied to the order of pictures in the text.
                string relationshipId = imageData.RelationshipId.Value;
                ImagePart imagePart = mainPart.ImageParts.FirstOrDefault(p => mainPart.GetIdOfPart(p) == relationshipId);
                if (imagePart == null)
                {
                    continue;
                }

                string style = shape.Style != null ? shape.Style.Value : string.Empty;
                using (Stream imageStream = imagePart.GetStream())
                {
                    result.Append(string.Format(
                        "<img style=\"{1}\" src=\"data:{2};base64,{0}\" />",
                        GetBase64Image(imageStream),
                        System.Net.WebUtility.HtmlEncode(style),
                        imagePart.ContentType));
                }
            }

            // A paragraph without any text is rendered as an empty line.
            if (!hasText)
            {
                result.Append("<br />");
            }

            result.Append("</div>");
            result.Append(Environment.NewLine);
        }
    }

    return result.ToString();
}
Now I want to indicate the location of the table in the text (as it appears in Word).
The final:
OK, I figured it out. My example function has one big mistake: it enumerates only the Paragraph elements of the document. Tables are at the same level as paragraphs, so the function ignores tables. Therefore, we need to enumerate the child elements of the document's Body instead.
Here is my test function that creates the correct HTML from a docx file (this is just test code, so it has not been cleaned up):
/// <summary>
/// Converts the body of a .docx file into a simple HTML fragment. The direct
/// children of the document body are processed in order: each paragraph
/// becomes a <c>div</c> (alignment, paragraph-mark formatting, text and VML
/// pictures) and each table becomes a <c>table</c> with one <c>td</c> per cell.
/// </summary>
/// <param name="pathToFile">Path of the .docx file to read.</param>
/// <returns>
/// The generated HTML; an empty string when the package has no document body.
/// </returns>
/// <remarks>
/// Only VML pictures (<c>w:pict</c>) are rendered as images. Table cells are
/// rendered as their plain inner text.
/// </remarks>
public static string ParseDocxDocument(string pathToFile)
{
    StringBuilder result = new StringBuilder();

    // The document is only read, so open it read-only; the using block
    // releases the package even when an exception is thrown half-way.
    using (WordprocessingDocument wordProcessingDoc = WordprocessingDocument.Open(pathToFile, false))
    {
        MainDocumentPart mainPart = wordProcessingDoc.MainDocumentPart;
        if (mainPart == null || mainPart.Document == null || mainPart.Document.Body == null)
        {
            return string.Empty;
        }

        // Paragraphs and tables are siblings under the body, so walking the
        // body's direct children keeps each table at its position in the text.
        foreach (OpenXmlElement section in mainPart.Document.Body.ChildElements)
        {
            Paragraph par = section as Paragraph;
            if (par != null)
            {
                var properties = par.ParagraphProperties;

                result.Append("<div style=\"width:100%; text-align:");
                var justification = properties != null ? properties.Justification : null;
                if (justification != null && justification.Val != null)
                {
                    switch (justification.Val.Value)
                    {
                        case JustificationValues.Left:
                            result.Append("left;");
                            break;
                        case JustificationValues.Center:
                            result.Append("center;");
                            break;
                        case JustificationValues.Both:
                            result.Append("justify;");
                            break;
                        case JustificationValues.Right:
                        default:
                            result.Append("right;");
                            break;
                    }
                }
                else
                {
                    result.Append("left;");
                }

                var markProperties = properties != null ? properties.ParagraphMarkRunProperties : null;
                if (markProperties != null && markProperties.HasChildren)
                {
                    foreach (OpenXmlElement child in markProperties.ChildElements)
                    {
                        switch (child.GetType().Name)
                        {
                            case "Bold":
                                result.Append("font-weight:bold;");
                                break;
                            case "Underline":
                                result.Append("text-decoration:underline;");
                                break;
                            case "Italic":
                                result.Append("font-style:italic;");
                                break;
                            case "FontSize":
                                // w:sz stores the size in half-points, so halve it and emit points.
                                FontSize fontSize = (FontSize)child;
                                double halfPoints;
                                if (fontSize.Val != null &&
                                    double.TryParse(
                                        fontSize.Val.Value,
                                        System.Globalization.NumberStyles.Float,
                                        System.Globalization.CultureInfo.InvariantCulture,
                                        out halfPoints))
                                {
                                    result.Append("font-size:");
                                    result.Append((halfPoints / 2).ToString(System.Globalization.CultureInfo.InvariantCulture));
                                    result.Append("pt;");
                                }
                                break;
                            default:
                                break;
                        }
                    }
                }

                result.Append("\">");

                // Walk the paragraph content once, in document order, so pictures
                // end up between the text pieces they sit between in Word.
                bool hasText = false;
                foreach (OpenXmlElement element in par.Descendants())
                {
                    Text text = element as Text;
                    if (text != null)
                    {
                        hasText = true;
                        // Encode so that '<', '>' and '&' in the document cannot break the markup.
                        result.Append(System.Net.WebUtility.HtmlEncode(text.Text));
                        continue;
                    }

                    var picture = element as DocumentFormat.OpenXml.Wordprocessing.Picture;
                    if (picture == null)
                    {
                        continue;
                    }

                    var shape = picture.GetFirstChild<DocumentFormat.OpenXml.Vml.Shape>();
                    var imageData = shape != null ? shape.GetFirstChild<DocumentFormat.OpenXml.Vml.ImageData>() : null;
                    if (imageData == null || imageData.RelationshipId == null)
                    {
                        // No resolvable image reference: skip instead of failing.
                        continue;
                    }

                    // Resolve the picture through its relationship id; the order of
                    // ImageParts is not tied to the order of pictures in the text.
                    string relationshipId = imageData.RelationshipId.Value;
                    ImagePart imagePart = mainPart.ImageParts.FirstOrDefault(p => mainPart.GetIdOfPart(p) == relationshipId);
                    if (imagePart == null)
                    {
                        continue;
                    }

                    string style = shape.Style != null ? shape.Style.Value : string.Empty;
                    using (Stream imageStream = imagePart.GetStream())
                    {
                        result.Append(string.Format(
                            "<img style=\"{1}\" src=\"data:{2};base64,{0}\" />",
                            GetBase64Image(imageStream),
                            System.Net.WebUtility.HtmlEncode(style),
                            imagePart.ContentType));
                    }
                }

                // A paragraph without any text is rendered as an empty line.
                if (!hasText)
                {
                    result.Append("<br />");
                }

                result.Append("</div>");
                result.Append(Environment.NewLine);
                continue;
            }

            Table tab = section as Table;
            if (tab != null)
            {
                result.Append("<table>");

                // Direct children only: Descendants would also return the rows
                // and cells of nested tables and repeat their content.
                foreach (TableRow row in tab.Elements<TableRow>())
                {
                    result.Append("<tr>");
                    foreach (TableCell cell in row.Elements<TableCell>())
                    {
                        result.Append("<td>");
                        result.Append(System.Net.WebUtility.HtmlEncode(cell.InnerText));
                        result.Append("</td>");
                    }
                    result.Append("</tr>");
                }

                result.Append("</table>");
            }
        }
    }

    return result.ToString();
}
/// <summary>
/// Reads <paramref name="inputData"/> from its current position to the end
/// and returns the bytes encoded as a Base64 string.
/// </summary>
/// <param name="inputData">Stream to read; it is not closed by this method.</param>
/// <returns>Base64 text of the bytes read; an empty string for an empty stream.</returns>
/// <exception cref="ArgumentNullException"><paramref name="inputData"/> is null.</exception>
private static string GetBase64Image(Stream inputData)
{
    if (inputData == null)
    {
        throw new ArgumentNullException("inputData");
    }

    // A single Stream.Read call may return fewer bytes than requested, and
    // Length is not available on non-seekable streams, so copy until the end
    // of the stream instead of sizing a buffer up front.
    using (MemoryStream buffer = new MemoryStream())
    {
        inputData.CopyTo(buffer);
        return Convert.ToBase64String(buffer.ToArray());
    }
}