using System;
using System.Collections.Generic;
using System.Linq;
using System.Net;
using System.Text.RegularExpressions;
/// <summary>
/// Strips HTML markup from <paramref name="html"/> and returns the remaining
/// text with all white-space runs collapsed to single spaces.
/// </summary>
/// <param name="html">HTML source to clean; may be null or empty.</param>
/// <returns>
/// The visible text, trimmed, with words separated by exactly one space.
/// Returns an empty string when <paramref name="html"/> is null or empty.
/// </returns>
public static string RemoveHtmlTags(string html)
{
    if (string.IsNullOrEmpty(html))
    {
        return string.Empty;
    }

    // Order of alternatives matters: comments and script/style elements are
    // removed together with their content before the generic tag pattern can
    // match only their opening tag and leave the body behind as "text".
    // NOTE(review): the original last alternative was a lone white-space
    // character, presumably an HTML-decoded "&nbsp;" — the entity is matched
    // explicitly here; a literal non-breaking space is handled by \s below.
    string withoutMarkup = Regex.Replace(
        html,
        @"<!--[\s\S]*?-->|<(script|style)\b[^>]*>[\s\S]*?</\1\s*>|<[^>]+>|&nbsp;",
        " ",
        RegexOptions.IgnoreCase);

    // Collapse every white-space run (including single tabs/newlines) so that
    // callers can safely split the result on ' '.
    string normalised = Regex.Replace(withoutMarkup, @"\s+", " ").Trim();

    return normalised;
}
/// <summary>
/// Downloads the page at http://stackoverflow.com/, strips its markup with
/// <c>RemoveHtmlTags</c>, and writes every remaining purely alphabetic word
/// longer than two characters (minus a small stop-word list) to the console
/// together with its number of occurrences, most frequent first.
/// </summary>
public static void Main()
{
    string html;

    // NOTE(review): WebClient is flagged obsolete on .NET 6+ (SYSLIB0014); it is
    // kept so that Main stays synchronous and its signature is unchanged.
    using (WebClient client = new WebClient())
    {
        // Invariant lower-casing: the result must not depend on the machine's
        // culture (e.g. Turkish dotted/dotless 'i').
        html = client.DownloadString("http://stackoverflow.com/").ToLowerInvariant();
    }

    string text = RemoveHtmlTags(html);

    // A null separator array splits on any white-space character, and
    // RemoveEmptyEntries drops the empty tokens that consecutive separators
    // would otherwise produce.
    string[] tokens = text.Split((char[])null, StringSplitOptions.RemoveEmptyEntries);

    // The text is already lower-case, so [a-z] suffices. The former [A-z]
    // range also admitted the ASCII characters between 'Z' and 'a'
    // ('[', '\', ']', '^', '_', '`').
    var onlyAlphabetRegEx = new Regex(@"^[a-z]+$");

    // Hash set gives constant-time stop-word lookups instead of scanning an
    // array for every token.
    var blacklist = new HashSet<string>(StringComparer.Ordinal)
    {
        "a", "an", "on", "of", "or", "as", "i", "in", "is", "to", "the", "and", "for", "with", "not", "by"
    };

    // Count each group once and carry the count along, rather than calling
    // Count() again while printing.
    var keywords = tokens
        .Where(t => t.Length > 2 && onlyAlphabetRegEx.IsMatch(t) && !blacklist.Contains(t))
        .GroupBy(t => t)
        .Select(g => new { Word = g.Key, Count = g.Count() })
        .OrderByDescending(entry => entry.Count);

    foreach (var entry in keywords)
    {
        Console.WriteLine("{0} {1}", entry.Word, entry.Count);
    }
}