using System.Collections.Generic;
/// <summary>
/// CSS class name that ConvertToPlainText checks with HasClass; nodes carrying it are passed to ConvertTable.
/// </summary>
private const string WhitelistedTableClass = "whitelisted-table";
/// <summary>
/// Demo entry point: reads the child nodes of an HtmlAgilityPack document, converts them with
/// ConvertToPlainText and writes the concatenated fragments to the console.
/// The HTML lines that follow the signature look like sample markup — presumably the contents of a
/// string literal whose delimiters are not visible in this excerpt (TODO confirm against the full file).
/// </summary>
public static void Main()
<p>A Paragraph of something...</p>
<a href='https://google.com'>A link to Google</a>
<table class='whitelisted-table'>
<td>Alfreds Futterkiste</td>
<td>Centro comercial Moctezuma</td>
HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
// NOTE(review): thColumns and rows are not read by any visible line of Main — likely leftovers; confirm and remove.
var thColumns = new List<string>();
var rows = new List<string>();
// NOTE(review): no visible line loads markup into doc before its child nodes are read — confirm against the full file.
var childNodes = doc.DocumentNode.ChildNodes;
var texts = ConvertToPlainText(childNodes);
// Joined with an empty separator; ConvertToPlainText itself appends "\n\n" to link and text fragments.
Console.WriteLine(string.Join("", texts));
/// <summary>
/// Walks a node collection and gathers plain-text fragments: ConvertTable output for nodes that carry
/// the whitelisted table class, LinkToTxt output for nodes with an href attribute, the recursive result
/// for a node's children, and the node's InnerText when it is not blank.
/// </summary>
/// <param name="nodes">Nodes to convert.</param>
/// <returns>Presumably the collected fragments (texts); no return statement is visible in this excerpt.</returns>
private static ICollection<string> ConvertToPlainText(HtmlNodeCollection nodes)
var texts = new List<string>();
foreach (var node in nodes)
// NOTE(review): nodeName is not read by any visible line below — confirm whether a name-based check was lost.
var nodeName = node.Name.ToLowerInvariant();
// NOTE(review): the braces/else keywords that relate the following checks are not visible here, so
// whether the branches are mutually exclusive cannot be confirmed from this excerpt.
// Nodes with the whitelisted class are rendered by ConvertTable.
if (node.HasClass(WhitelistedTableClass))
texts.Add(ConvertTable(node));
// Nodes with an href attribute become a "label: url" line followed by a blank line.
if (node.Attributes != null && node.Attributes.Count > 0 && node.Attributes.Contains("href"))
texts.Add(LinkToTxt(node) + "\n\n");
// Recurse into the node's children and append whatever they produce.
texts.AddRange(ConvertToPlainText(node.ChildNodes));
var innerText = node.InnerText;
// Whitespace-only text is skipped; other text is emitted followed by a blank line.
if (!string.IsNullOrWhiteSpace(innerText))
texts.Add(innerText + "\n\n");
/// <summary>
/// Renders a link node as plain text in the form "label: url".
/// </summary>
/// <param name="node">Node that carries an href attribute.</param>
/// <returns>The node's inner text, followed by ": " and the value of its href attribute.</returns>
private static string LinkToTxt(HtmlNode node)
{
    // Read the target first, then the visible label, and glue them together.
    string target = node.Attributes["href"].Value;
    string label = node.InnerText;

    return string.Concat(label, ": ", target);
}
/// <summary>
/// Renders a table node as text made of "column: value" lines, using header names taken from
/// ParseTHead and from th cells in the table's direct tr rows, and values taken from td cells in
/// those rows and from ParseTBody.
/// </summary>
/// <param name="node">The table node to convert.</param>
/// <returns>The joined row texts; see the review note on the two return statements below.</returns>
private static string ConvertTable(HtmlNode node)
var thColumns = new List<string>();
var rows = new List<string>();
// Rows that are direct children of the table node (relative XPath "tr").
var trs = node.SelectNodes("tr");
// Header names from a thead section come first, when ParseTHead returns any.
var theadColValues = ParseTHead(node);
if (theadColValues.Count > 0)
thColumns.AddRange(theadColValues);
if (trs != null && trs.Count > 0)
// NOTE(review): row is used below but its declaring loop is not visible in this excerpt —
// presumably a foreach over trs; confirm against the full file.
var ths = row.SelectNodes("th");
// NOTE(review): ths is enumerated without a visible null check — confirm rows without th cells are handled.
foreach (var cell in ths)
thColumns.Add(cell.InnerText);
var tds = row.SelectNodes("td");
var rowValues = new List<string>();
foreach (var cell in tds)
// NOTE(review): idx is not declared or advanced in any visible line — presumably the cell's
// position within the row; confirm. Indexing thColumns assumes a header exists for that position.
var columnValue = thColumns[idx];
rowValues.Add(columnValue + ": " + cell.InnerText);
// Each row becomes one fragment with one "column: value" pair per line.
rows.Add(string.Join("\n", rowValues));
// Rows inside a tbody section are produced by ParseTBody and appended as a single fragment.
var tBodyValues = ParseTBody(thColumns, node);
if (tBodyValues.Count > 0)
rows.Add(string.Join("\n\n", tBodyValues));
// NOTE(review): two return statements with different separators follow; the condition that
// selects between them is not visible in this excerpt.
return string.Join("", rows);
return string.Join("\n\n", rows);
/// <summary>
/// Collects the header cell texts from a table's thead section.
/// </summary>
/// <param name="node">The table node; only a thead element that is its direct child is inspected.</param>
/// <returns>
/// The inner text of every th cell in every thead row, in document order. The list is empty when
/// the table has no thead, the thead has no rows, or the rows have no th cells.
/// </returns>
private static List<string> ParseTHead(HtmlNode node)
{
    var thColumns = new List<string>();

    // SelectSingleNode / SelectNodes return null when nothing matches, so every lookup is
    // guarded; callers test the result's Count and expect an empty list rather than an exception.
    var thead = node.SelectSingleNode("thead");
    if (thead == null)
    {
        return thColumns;
    }

    var theadTrs = thead.SelectNodes("tr");
    if (theadTrs == null)
    {
        return thColumns;
    }

    foreach (var row in theadTrs)
    {
        var ths = row.SelectNodes("th");
        if (ths == null)
        {
            // A header row without th cells contributes no column names.
            continue;
        }

        foreach (var cell in ths)
        {
            thColumns.Add(cell.InnerText);
        }
    }

    return thColumns;
}
/// <summary>
/// Converts the rows of a table's tbody section into text blocks of "column: value" lines.
/// </summary>
/// <param name="thColumns">Column names; the cell at position i in a row is labelled with thColumns[i].</param>
/// <param name="node">The table node; only a tbody element that is its direct child is inspected.</param>
/// <returns>
/// One string per tbody row that has td cells, with that row's cells joined by "\n". The list is
/// empty when the table has no tbody or the tbody has no rows.
/// </returns>
private static List<string> ParseTBody(List<string> thColumns, HtmlNode node)
{
    var tBodyValues = new List<string>();

    // SelectSingleNode / SelectNodes return null when nothing matches, so every lookup is
    // guarded; callers test the result's Count and expect an empty list rather than an exception.
    var tbody = node.SelectSingleNode("tbody");
    if (tbody == null)
    {
        return tBodyValues;
    }

    var trs = tbody.SelectNodes("tr");
    if (trs == null)
    {
        return tBodyValues;
    }

    var headerCount = thColumns == null ? 0 : thColumns.Count;

    foreach (var row in trs)
    {
        var tds = row.SelectNodes("td");
        if (tds == null)
        {
            // A row without td cells has nothing to report.
            continue;
        }

        // A fresh list per row keeps one row's values from leaking into the next.
        var rowValues = new List<string>();
        var idx = 0;
        foreach (var cell in tds)
        {
            if (idx < headerCount)
            {
                var columnValue = thColumns[idx];
                rowValues.Add(columnValue + ": " + cell.InnerText);
            }
            else
            {
                // More cells than headers: emit the bare value instead of indexing out of range.
                rowValues.Add(cell.InnerText);
            }

            idx++;
        }

        tBodyValues.Add(string.Join("\n", rowValues));
    }

    return tBodyValues;
}