public class HtmlUtilities
/// <summary>
/// Demo entry point: converts a fixed HTML sample to plain text and prints the result
/// to the console, prefixed with "Extracted:".
/// </summary>
static async System.Threading.Tasks.Task Main()
{
    const string sampleMarkup = "<html><body><h1>Title</h1><p>This is a <b>bold</b> text.</p><div>Another<br/> block.</div></body></html>";

    string plainText = ConvertToPlainText(sampleMarkup);
    Console.WriteLine("Extracted:" + plainText);
}
/// <summary>
/// Converts an HTML fragment or document to plain text by parsing it and writing the
/// text content of the resulting node tree.
/// </summary>
/// <param name="html">HTML markup to convert. May be null or empty.</param>
/// <returns>
/// The text produced by <c>ConvertTo</c> for the parsed document, or an empty string
/// when <paramref name="html"/> is null or empty.
/// </returns>
public static string ConvertToPlainText(string html)
{
    // Nothing to parse; mirrors the null/empty tolerance of CountWords and Cut.
    if (String.IsNullOrEmpty(html))
    {
        return String.Empty;
    }

    HtmlDocument doc = new HtmlDocument();
    // The markup has to be loaded first, otherwise DocumentNode has no children to walk.
    doc.LoadHtml(html);

    using (StringWriter sw = new StringWriter())
    {
        ConvertTo(doc.DocumentNode, sw);
        sw.Flush();
        return sw.ToString();
    }
}
/// <summary>
/// Counts the whitespace-separated words in a plain-text string.
/// </summary>
/// <param name="plainText">Text to examine. May be null or empty.</param>
/// <returns>
/// The number of non-empty tokens separated by spaces, tabs, carriage returns or line
/// feeds; 0 for null, empty or whitespace-only input.
/// </returns>
public static int CountWords(string plainText)
{
    if (String.IsNullOrEmpty(plainText))
    {
        return 0;
    }

    char[] separators = { ' ', '\t', '\r', '\n' };

    // RemoveEmptyEntries keeps runs of separators (double spaces, "\r\n",
    // leading/trailing blanks) from being counted as extra words.
    return plainText.Split(separators, StringSplitOptions.RemoveEmptyEntries).Length;
}
/// <summary>
/// Shortens <paramref name="text"/> to at most <paramref name="length"/> characters,
/// ending a shortened result with " ..." when there is room for it.
/// </summary>
/// <param name="text">Text to shorten. Null or empty input is returned unchanged.</param>
/// <param name="length">
/// Maximum length of the result, including the " ..." marker. Negative values are
/// treated as 0.
/// </param>
/// <returns>
/// <paramref name="text"/> itself when it already fits; otherwise a truncated copy that
/// is no longer than <paramref name="length"/> characters.
/// </returns>
public static string Cut(string text, int length)
{
    const string Ellipsis = " ...";

    if (String.IsNullOrEmpty(text) || text.Length <= length)
    {
        return text;
    }

    // The marker does not fit: truncate hard instead of handing Substring a negative
    // length, which would throw ArgumentOutOfRangeException.
    if (length < Ellipsis.Length)
    {
        return text.Substring(0, Math.Max(length, 0));
    }

    return text.Substring(0, length - Ellipsis.Length) + Ellipsis;
}
/// <summary>
/// Hands every child of <paramref name="node"/>, in <c>ChildNodes</c> order, to
/// <c>ConvertTo</c> using the same writer.
/// </summary>
/// <param name="node">Node whose children are converted.</param>
/// <param name="outText">Writer passed through to <c>ConvertTo</c>.</param>
private static void ConvertContentTo(HtmlNode node, TextWriter outText)
{
    foreach (HtmlNode child in node.ChildNodes)
    {
        ConvertTo(child, outText);
    }
}
// Writes the text content of `node` to `outText`, branching on the node kind
// (comment, document, text handling, element) as far as this excerpt shows.
// NOTE(review): the enclosing switch header, the case label for text nodes, the
// declaration of `html`, and the statements guarded by the bare `if` lines are not
// visible in this excerpt — confirm against the full file before changing control flow.
private static void ConvertTo(HtmlNode node, TextWriter outText)
// Comment nodes: presumably produce no output; the statement that ends this case is
// not visible here — confirm.
case HtmlNodeType.Comment:
// Document node: convert its children via ConvertContentTo.
case HtmlNodeType.Document:
ConvertContentTo(node, outText);
// Tag name of the parent, used below to recognise text inside <script>/<style>.
// Assumes node.ParentNode is non-null for the nodes reaching this point — TODO confirm.
string parentName = node.ParentNode.Name;
// NOTE(review): the statement guarded by this check is not visible; presumably the
// text of script/style elements is skipped — confirm.
if ((parentName == "script") || (parentName == "style"))
// Raw text of the node; the cast requires `node` to be an HtmlTextNode.
html = ((HtmlTextNode)node).Text;
// NOTE(review): guarded statement not visible; presumably text that is really an
// overlapped closing tag is skipped — confirm.
if (HtmlNode.IsOverlappedClosingElement(html))
// Only text containing something other than whitespace is written.
if (html.Trim().Length > 0)
// The text is passed through HtmlEntity.DeEntitize before being written
// (presumably entity decoding, e.g. "&amp;" to "&" — confirm with the library docs).
outText.Write(HtmlEntity.DeEntitize(html));
// Element nodes: convert their children via ConvertContentTo.
case HtmlNodeType.Element:
ConvertContentTo(node, outText);