using System.Collections.Generic;
using System.Text.RegularExpressions;
using System.Text.RegularExpressions;
/// <summary>
/// Demo entry point: runs both text-extraction helpers over one sample HTML
/// message and prints the two results separated by a dashed line.
/// </summary>
public static void Main()
{
    // Sample input. This is a verbatim string, so the continuation lines must
    // stay at column 0 to avoid adding leading whitespace to the markup.
    // The literal is terminated right after the last visible markup line.
    var html = @"<html><head><meta name=""Generator"" content=""Microsoft Exchange Server"">
<!-- converted from text -->
<style><!-- .EmailQuote { margin-left: 1pt; padding-left: 4pt; border-left: #800000 2px solid; } --></style></head>
<font size=""2""><span style=""font-size:11pt;""><div class=""PlainText""> <p>Hello<br/>world, how are you doing today?</p><p>This is a separate <b>paragraph</b></p></div><div>Please replace <b> tags with <i> tags</div></span></font>";

    // Node-walking extraction versus regex-based tag stripping.
    var texts = GetTextFromHtml(html);
    var regextexts = RemoveHtmlEntities(html);

    Console.WriteLine(texts);
    Console.WriteLine("----------------------------");
    Console.WriteLine(regextexts);
}
/// <summary>
/// Produces a plain-text version of <paramref name="input"/> by decoding HTML
/// entities, turning double quotes into single quotes, and stripping tags.
/// </summary>
/// <param name="input">HTML markup to clean; may be null or empty.</param>
/// <returns>
/// The trimmed text with tags removed, or an empty string when
/// <paramref name="input"/> is null or empty.
/// </returns>
private static string RemoveHtmlEntities(string input)
{
    // Regex.Replace throws on a null input, so bail out early instead.
    if (string.IsNullOrEmpty(input))
    {
        return string.Empty;
    }

    input = HttpUtility.HtmlDecode(input);

    // NOTE(review): the original pattern literal was malformed; it is read here
    // as a single double-quote character - confirm against the intended source.
    // A plain string replace is enough for a literal match.
    input = input.Replace("\"", "'");

    // Non-greedy match removes each tag individually. '.' does not match a
    // newline here, so a tag that spans several lines is left untouched.
    return Regex.Replace(input, "<.*?>", String.Empty).Trim();
}
/// <summary>
/// Parses <paramref name="html"/> into an <c>HtmlDocument</c> and returns the
/// text that <c>GetTextFromNodes</c> builds from the document's top-level nodes.
/// </summary>
/// <param name="html">HTML markup to convert; may be null or empty.</param>
/// <returns>
/// The extracted text, or an empty string when <paramref name="html"/> is
/// null or empty.
/// </returns>
private static string GetTextFromHtml(string html)
{
    // Nothing to parse: return early rather than building an empty document.
    if (string.IsNullOrEmpty(html))
    {
        return string.Empty;
    }

    var htmlDoc = new HtmlDocument();

    // The document must be loaded before its node tree is walked; a freshly
    // constructed document carries none of the caller's markup.
    htmlDoc.LoadHtml(html);

    return GetTextFromNodes(htmlDoc.DocumentNode.ChildNodes);
}
private static string GetTextFromNodes(HtmlNodeCollection nodes, int indent = 0)
StringBuilder texts = new StringBuilder();
string[] linebreaks = {"p", "br", "table", "th", "tr"};
string[] indentTag = {"ul", "li"};
foreach (var node in nodes)
if (node.Name.ToLowerInvariant() == "style")
if (indentTag.Contains(node.Name.ToLowerInvariant()))
texts.Append(GetTextFromNodes(node.ChildNodes, indent + 1));
texts.Append(GetTextFromNodes(node.ChildNodes, indent));
var innerText = node.InnerText;
if (!string.IsNullOrWhiteSpace(innerText))
innerText = HttpUtility.HtmlDecode(innerText);
texts.Append(new String(' ', indent) + innerText);
if (node.Name.ToLowerInvariant() == "a")
texts.Append("\r\n" + node.Attributes["href"].Value + "\r\n");
if (node.Name.ToLowerInvariant() == "img" && !node.Attributes["src"].Value.EndsWith("invis.gif"))
texts.Append("\r\n" + node.Attributes["src"].Value + "\r\n");
if (linebreaks.Contains(node.Name.ToLowerInvariant()))