I tested @Simon Mourier's answer on the Google homepage and got a lot of CSS and JavaScript in the output, so I added an additional filter to remove them:
/// <summary>
/// Extracts the text content of an HTML document, ignoring everything inside
/// <c>script</c> and <c>style</c> elements.
/// </summary>
/// <param name="html">The HTML markup to parse.</param>
/// <returns>
/// Each non-blank text node, trimmed and followed by a single space, appended
/// in the order <c>SelectNodes</c> yields them. Returns an empty string when
/// the document contains no non-blank text nodes.
/// </returns>
/// <remarks>
/// Entities in the text are returned as they appear in <c>InnerText</c>; no
/// additional decoding is applied here.
/// </remarks>
public string getBodyText(string html)
{
    HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
    doc.LoadHtml(html);

    // Materialise the matches before removing anything: Remove() mutates the
    // tree that Descendants() is walking.
    var nonContentNodes = doc.DocumentNode.Descendants()
        .Where(n => n.Name == "script" || n.Name == "style")
        .ToList();
    foreach (HtmlNode unwanted in nonContentNodes)
    {
        unwanted.Remove();
    }

    // SelectNodes yields null (not an empty collection) when nothing matches,
    // so check explicitly instead of relying on a swallowed exception.
    var textNodes = doc.DocumentNode.SelectNodes("//text()[normalize-space(.) != '']");
    if (textNodes == null)
    {
        return "";
    }

    // StringBuilder avoids re-copying the accumulated text on every node.
    System.Text.StringBuilder text = new System.Text.StringBuilder();
    foreach (HtmlNode node in textNodes)
    {
        text.Append(node.InnerText.Trim()).Append(' ');
    }
    return text.ToString();
}
source
share