Extract table from docx

I have one problem parsing a * .docx document with OpenXML (C #).

So, here are my steps:
1. Download a * .docx document
2. Get a list of items
3. Look for text, image and table elements in each paragraph
4. Create html tags for each text and image element

5. Save the output as a * .html file

I learned how to find an image file in a document and extract it. Now one step - find where the position of the table is in the text (paragraph).

If anyone knows how to find a table in a .docx document using OpenXML, please help. Thank.

Additionally: Well, maybe I don’t quite understand what I mean. If we get the content of the paragraph, you can find chield objects like text blocks, images, and so on. So, if the paragraph contains Run, which contains the Picture, this means that an image is placed at this point in the Word document.

An example of my function:

public static string ParseDocxDocument(string pathToFile)
    {
        StringBuilder result = new StringBuilder();
        WordprocessingDocument wordProcessingDoc = WordprocessingDocument.Open(pathToFile, true);
        List<ImagePart> imgPart = wordProcessingDoc.MainDocumentPart.ImageParts.ToList();
        IEnumerable<Paragraph> paragraphElement = wordProcessingDoc.MainDocumentPart.Document.Descendants<Paragraph>();
        int imgCounter = 0;


        foreach (Paragraph par in paragraphElement)
        {

                //Add new paragraph tag
                result.Append("<div style=\"width:100%; text-align:");

                //Append anchor style
                if (par.ParagraphProperties != null && par.ParagraphProperties.Justification != null)
                    switch (par.ParagraphProperties.Justification.Val.Value)
                    {
                        case JustificationValues.Left:
                            result.Append("left;");
                            break;
                        case JustificationValues.Center:
                            result.Append("center;");
                            break;
                        case JustificationValues.Both:
                            result.Append("justify;");
                            break;
                        case JustificationValues.Right:
                        default:
                            result.Append("right;");
                            break;
                    }
                else
                    result.Append("left;");

                //Append text decoration style
                if (par.ParagraphProperties != null && par.ParagraphProperties.ParagraphMarkRunProperties != null && par.ParagraphProperties.ParagraphMarkRunProperties.HasChildren)
                    foreach (OpenXmlElement chield in par.ParagraphProperties.ParagraphMarkRunProperties.ChildElements)
                    {
                        switch (chield.GetType().Name)
                        {
                            case "Bold":
                                result.Append("font-weight:bold;");
                                break;
                            case "Underline":
                                result.Append("text-decoration:underline;");
                                break;
                            case "Italic":
                                result.Append("font-style:italic;");
                                break;
                            case "FontSize":
                                result.Append("font-size:" + ((FontSize)chield).Val.Value + "px;");
                                break;
                            default: break;
                        }
                    }

                result.Append("\">");

                //Add image tag
                IEnumerable<Run> runs = par.Descendants<Run>();
                foreach (Run run in runs)
                {
                    if (run.HasChildren)
                    {
                        foreach (OpenXmlElement chield in run.ChildElements.Where(o => o.GetType().Name == "Picture"))
                        {
                            result.Append(string.Format("<img style=\"{1}\" src=\"data:image/jpeg;base64,{0}\" />", GetBase64Image(imgPart[imgCounter].GetStream()),
                                           ((DocumentFormat.OpenXml.Vml.Shape)chield.ChildElements.Where(o => o.GetType().Name == "Shape").FirstOrDefault()).Style
                                ));
                            imgCounter++;
                        }
                    }
                }

                //Append inner text
                IEnumerable<Text> textElement = par.Descendants<Text>();
                if (par.Descendants<Text>().Count() == 0)
                    result.Append("<br />");

                foreach (Text t in textElement)
                {
                    result.Append(t.Text);
                }


                result.Append("</div>");
                result.Append(Environment.NewLine);

        }

        wordProcessingDoc.Close();

        return result.ToString();
    }

Now I want to indicate the location of the table in the text (as it appears in Word).

The final:

Ok, that's it, I found out. In my example function, one big mistake. I list the paragraph elements of a Body document. Tables are at the same level as Paragraph, so functions ignore tables. Therefore, we need to list the elements of the Body document.

Here is my test function to create the correct HTML from docx (this is just test code, so it doesn't get cleared)

public static string ParseDocxDocument(string pathToFile)
    {
        StringBuilder result = new StringBuilder();
        WordprocessingDocument wordProcessingDoc = WordprocessingDocument.Open(pathToFile, true);
        List<ImagePart> imgPart = wordProcessingDoc.MainDocumentPart.ImageParts.ToList();
        List<string> tableCellContent = new List<string>();
        IEnumerable<Paragraph> paragraphElement = wordProcessingDoc.MainDocumentPart.Document.Descendants<Paragraph>();
        int imgCounter = 0;

        foreach (OpenXmlElement section in wordProcessingDoc.MainDocumentPart.Document.Body.Elements<OpenXmlElement>())
        {
            if(section.GetType().Name == "Paragraph")
            {
              Paragraph par = (Paragraph)section;
                //Add new paragraph tag
                result.Append("<div style=\"width:100%; text-align:");

                //Append anchor style
                if (par.ParagraphProperties != null && par.ParagraphProperties.Justification != null)
                    switch (par.ParagraphProperties.Justification.Val.Value)
                    {
                        case JustificationValues.Left:
                            result.Append("left;");
                            break;
                        case JustificationValues.Center:
                            result.Append("center;");
                            break;
                        case JustificationValues.Both:
                            result.Append("justify;");
                            break;
                        case JustificationValues.Right:
                        default:
                            result.Append("right;");
                            break;
                    }
                else
                    result.Append("left;");

                //Append text decoration style
                if (par.ParagraphProperties != null && par.ParagraphProperties.ParagraphMarkRunProperties != null && par.ParagraphProperties.ParagraphMarkRunProperties.HasChildren)
                    foreach (OpenXmlElement chield in par.ParagraphProperties.ParagraphMarkRunProperties.ChildElements)
                    {
                        switch (chield.GetType().Name)
                        {
                            case "Bold":
                                result.Append("font-weight:bold;");
                                break;
                            case "Underline":
                                result.Append("text-decoration:underline;");
                                break;
                            case "Italic":
                                result.Append("font-style:italic;");
                                break;
                            case "FontSize":
                                result.Append("font-size:" + ((FontSize)chield).Val.Value + "px;");
                                break;
                            default: break;
                        }
                    }

                result.Append("\">");

                //Add image tag
                IEnumerable<Run> runs = par.Descendants<Run>();
                foreach (Run run in runs)
                {
                    if (run.HasChildren)
                    {
                        foreach (OpenXmlElement chield in run.ChildElements.Where(o => o.GetType().Name == "Picture"))
                        {
                            result.Append(string.Format("<img style=\"{1}\" src=\"data:image/jpeg;base64,{0}\" />", GetBase64Image(imgPart[imgCounter].GetStream()),
                                           ((DocumentFormat.OpenXml.Vml.Shape)chield.ChildElements.Where(o => o.GetType().Name == "Shape").FirstOrDefault()).Style
                                ));
                            imgCounter++;
                        }
                        foreach (OpenXmlElement table in run.ChildElements.Where(o => o.GetType().Name == "Table"))
                        {
                            result.Append("<strong>HERE TABLE</strong>");
                        }
                    }
                }

                //Append inner text
                IEnumerable<Text> textElement = par.Descendants<Text>();
                if (par.Descendants<Text>().Count() == 0)
                    result.Append("<br />");

                foreach (Text t in textElement.Where(o=>!tableCellContent.Contains(o.Text.Trim())))
                {
                    result.Append(t.Text);
                }


                result.Append("</div>");
                result.Append(Environment.NewLine);

            }
            else if (section.GetType().Name=="Table")
            {
                result.Append("<table>");
                Table tab = (Table)section;
                foreach (TableRow row in tab.Descendants<TableRow>())
                {
                    result.Append("<tr>");
                    foreach (TableCell cell in row.Descendants<TableCell>())
                    {
                        result.Append("<td>");
                        result.Append(cell.InnerText);
                        tableCellContent.Add(cell.InnerText.Trim());
                        result.Append("</td>");
                    }
                    result.Append("</tr>");
                }
                result.Append("</table>");
            }                
        }


        wordProcessingDoc.Close();

        return result.ToString();
    }

    private static string GetBase64Image(Stream inputData)
    {
        byte[] data = new byte[inputData.Length];
        inputData.Read(data, 0, data.Length);
        return Convert.ToBase64String(data);
    }
+5
source share
1

.

Table table = doc.MainDocumentPart.Document.Body.Elements<Table>().First();
+1

All Articles