using System.Collections.Generic;
using System.Globalization;
using System.Text.RegularExpressions;
/// <summary>
/// URL canonicalisation helper that is handed out through a single shared instance (see GetInstance).
/// </summary>
/// <remarks>
/// NOTE(review): this view of the file shows no brace lines and appears to be missing statements;
/// the comments here describe only the lines that are visible.
/// </remarks>
public class Canonicalisation
// Regular-expression text for a dotted-quad IPv4 address: the first octet accepts 1-255,
// each of the three following octets accepts 0-255.
private const String IpValidation = "^([1-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])(\\.([0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])){3}$";
// Shared instance, created on first call to GetInstance().
private static Canonicalisation _instance;
// Private constructor: instances can only be created from inside this class.
private Canonicalisation(){}
/// <summary>Returns the shared <see cref="Canonicalisation"/> instance, creating it on first use.</summary>
/// <returns>The instance stored in the static _instance field.</returns>
public static Canonicalisation GetInstance()
// Constructs and caches the instance only when _instance is still null.
// NOTE(review): no locking is visible here, so concurrent first calls could each construct an instance.
return _instance ?? (_instance = new Canonicalisation());
/// <summary>Percent-decodes <paramref name="url"/> by calling Uri.UnescapeDataString in a fixed-count loop.</summary>
/// <param name="url">Text to decode.</param>
/// <remarks>NOTE(review): the return statement is not visible in this view.</remarks>
private static String Unescape(String url)
// Round-trips url through a StringBuilder; the visible lines do not modify the builder in between.
var stringBuilder = new StringBuilder(url);
url = stringBuilder.ToString();
// 50 iterations; presumably so that multiply-encoded input is decoded layer by layer — confirm intent.
for (var x = 0; x < 50; x++)
url = Uri.UnescapeDataString(url);
/// <summary>Parses <paramref name="host"/> as a Uri; the visible return statements yield its Host part lower-cased.</summary>
/// <param name="host">Text passed directly to the Uri constructor.</param>
/// <remarks>NOTE(review): some statements of this method are not visible in this view.</remarks>
private static String DecodeHost(String host)
var addr = new Uri(host);
// NOTE(review): == compares host with the IpValidation pattern text itself; it does not run a
// regular-expression match. Confirm whether a Regex match against IpValidation was intended.
if (host == IpValidation)
// True when host contains at least one '.'.
if (host.IndexOf('.') > -1)
return addr.Host.ToLower();
// NOTE(review): same literal comparison against the pattern text as above.
if (addr.Host != IpValidation)
return addr.Host.ToLower();
/// <summary>Builds and returns a new string through a StringBuilder named literal.</summary>
/// <param name="input">Source text. NOTE(review): the loop that reads it and defines c is not visible in this view.</param>
/// <returns>The accumulated contents of the builder.</returns>
public string RemoveBackSlashes(string input)
var literal = new StringBuilder();
// Tests whether the current character c is NOT a Unicode control character.
if (Char.GetUnicodeCategory(c) != UnicodeCategory.Control)
// Appends the character's 16-bit code as four lower-case hexadecimal digits ("x4").
// NOTE(review): which branch of the test above reaches this line is not visible.
literal.Append(((ushort)c).ToString("x4"));
return literal.ToString();
/// <summary>
/// Produces a canonical string form of <paramref name="queryUrl"/>.
/// </summary>
/// <param name="queryUrl">URL text to canonicalise; when it contains neither "http://" nor "https://", "http://" is prepended.</param>
/// <returns>The canonicalised URL string (one visible return goes through CreateUri and Uri.EscapeUriString).</returns>
/// <exception cref="Exception">Thrown with the message "Could not canonicalise URL: ..." wrapping the underlying error.</exception>
/// <remarks>
/// NOTE(review): several statements of this method (for example the declarations of host, port, val and uri,
/// and the try/catch that defines e) are not visible in this view; the comments cover only the lines shown.
/// </remarks>
public String CanonicalizeUrl(String queryUrl)
// Pre-process the raw input with RemoveBackSlashes, then apply one percent-decoding pass.
String url = RemoveBackSlashes(queryUrl);
url = Uri.UnescapeDataString(url);
// Collapse ".." to "." repeatedly until no ".." remains anywhere in the string.
while ((url.IndexOf("..", StringComparison.Ordinal) != -1))
url = url.Replace("..", ".");
// NOTE(review): String.Replace substitutes literal text, so this only matches the exact characters
// "(?i)%5C"; it is not a case-insensitive regex. Confirm whether Regex.Replace was intended.
url = url.Replace("(?i)%5C", "%");
// Literal replacements of dot-segments.
url = url.Replace("/./", "/");
url = url.Replace("/../", "/");
url = url.Replace("./", "/");
// IndexOf searches the whole string (ordinal, case-sensitive), so the scheme marker may appear
// anywhere, not only at the start, to suppress the "http://" prefix.
if (url.IndexOf("http://", StringComparison.Ordinal) <= -1 && url.IndexOf("https://", StringComparison.Ordinal) <= -1) url = "http://" + url;
// NOTE(review): literal replacement again — this looks for the text "[\t\n\r\f\e]*" itself
// rather than removing whitespace characters. Confirm intent.
url = url.Replace("[\\t\\n\\r\\f\\e]*", "");
// Parse the text and pull out the individual components.
var theUrl = new Uri(url);
var path = theUrl.AbsolutePath;
var query = theUrl.Query;
var protocol = GetProtocol(theUrl);
// String.IsNullOrEmpty already covers null, so the explicit null test is redundant.
if (protocol == null || String.IsNullOrEmpty(protocol))
var user = theUrl.UserInfo;
// Replaces each "//" in the path with "/" (single pass).
path = path.Replace("//", "/");
var sb = new StringBuilder();
// Walks the host one character at a time (host is declared outside the visible lines).
foreach (char urlChar in host)
// Digits, lower-case ASCII letters, '.' and '-'.
if ((urlChar >= '0' && urlChar <= '9') || (urlChar >= 'a' && urlChar <= 'z') || urlChar == '.' || urlChar == '-')
// Percent-escapes a single character. NOTE(review): which branch of the test above reaches this line is not visible.
sb.Append(Uri.EscapeDataString(urlChar.ToString(CultureInfo.InvariantCulture)));
// Loops while host begins with "." (loop body not visible).
while (host.StartsWith("."))
if (String.IsNullOrEmpty(path))
// Pattern: one lower-case letter followed by exactly two digits.
var pattern = new Regex("([a-z]{1})([0-9]{2})");
while (pattern.IsMatch(host))
// val is defined outside the visible lines.
host = pattern.Replace(host,val);
// Pattern: a whitespace character followed by any run of characters.
var regex = new Regex("[\\s].*");
// Host contains no '%' and no whitespace.
if (!host.Contains("%") && !regex.IsMatch(host))
// Escapes the assembled URI text, then turns every "%25" back into "%"
// (presumably to avoid double-encoding existing percent-escapes — confirm).
return Uri.EscapeUriString(CreateUri(protocol, host, port,path,query).ToString()).Replace("%25","%");
if (!host.Contains("%") && !regex.IsMatch(host))
uri = CreateUri(protocol,host, port,path,query);
if (uri != null && !String.IsNullOrEmpty(uri.AbsolutePath.Trim()))
// NOTE(review): literal replacement — only the exact text "[;]*" would be removed, not runs of ';'.
path = path.Replace("[;]*", "");
sb.Append(protocol + ":");
// Searches backwards from index host.Length for the last "/"; the meaning of the > 8 threshold
// is not evident from the visible code.
if (host.LastIndexOf("/", host.Length, StringComparison.Ordinal) > 8)
// Drops the final character of host.
host = host.Substring(0, host.Length - 1);
// Encodes a literal '#' in the path as %23.
path = path.Replace("#", "%23");
// Strips every trailing "//" from the text accumulated in sb and rebuilds sb from the result.
const string endtrim = "//";
if (sb.ToString().EndsWith(endtrim))
string result = sb.ToString();
while (result.EndsWith(endtrim))
result = result.Substring(0, result.Length - endtrim.Length);
sb = new StringBuilder(result);
// Wraps the caught exception e (the catch clause is not visible) together with the original input.
throw new Exception("Could not canonicalise URL: " + queryUrl, e);
// Reached only if nothing above returned or threw.
throw new ApplicationException("Application wasn't supposed to be here.");
/// <summary>Returns a Uri assembled from separate protocol, site, port, path and query parts.</summary>
/// <remarks>NOTE(review): the lines that declare url, use port/path and construct the Uri are not visible in this view.</remarks>
private Uri CreateUri(string protocol, string site, int port,string path,string query)
// Checks whether site already begins with the protocol text.
if (site.StartsWith(protocol))
// Appends "protocol://site" to url; which branch of the check above this belongs to is not visible.
url += protocol + "://" + site;
// Query that starts with '?'.
// NOTE(review): query.StartsWith runs before the IsNullOrWhiteSpace test, so a null query would throw
// here before the null check is evaluated; the second operand is also always true once StartsWith("?") holds.
if (query.StartsWith("?") && !String.IsNullOrWhiteSpace(query))
// Non-blank query that does not start with '?'.
else if (!String.IsNullOrWhiteSpace(query))
/// <summary>Extracts the text that precedes the first "://" in the Uri's original string.</summary>
/// <param name="theUrl">Uri whose OriginalString is inspected.</param>
/// <returns>The leading scheme text when "://" is present; the result for the other case is not visible in this view.</returns>
private static string GetProtocol(Uri theUrl)
if (theUrl.OriginalString.Contains("://"))
// Element 0 of the split is everything before the first "://".
return theUrl.OriginalString.Split(new[] {"://"}, StringSplitOptions.None)[0];
/// <summary>Walks <paramref name="url"/> character by character, percent-escaping selected characters into a new string.</summary>
/// <param name="url">Text to escape.</param>
/// <remarks>NOTE(review): the first branch of the if/else chain and the return statement are not visible in this view.</remarks>
private static String Escape(String url)
var sb = new StringBuilder();
foreach (char charUrl in url)
// Code points up to and including space (32), from 127 upwards, or a literal '%'.
else if (charUrl <= 32 || charUrl >= 127 || charUrl == '%')
// NOTE(review): uses CultureInfo.CurrentCulture, whereas the host escaping in CanonicalizeUrl uses
// CultureInfo.InvariantCulture — confirm whether the difference is intentional.
sb.Append(Uri.EscapeDataString(charUrl.ToString(CultureInfo.CurrentCulture)));
/// <summary>
/// Generates lookup strings by combining host variants with path variants of the canonicalised URL.
/// </summary>
/// <param name="queryUrl">URL text; it is first passed through CanonicalizeUrl.</param>
/// <returns>A list of host + path combinations (the return statement itself is not visible in this view).</returns>
/// <exception cref="Exception">Thrown with the message "Could not generate lookup URLs" wrapping the underlying error.</exception>
/// <remarks>
/// NOTE(review): the declarations of host, query and pathElement and the try/catch that defines e
/// are not visible in this view; the comments cover only the lines shown.
/// </remarks>
public List<string> GetLookupUrLs(String queryUrl)
var urls = new List<string>();
var canonicalizedUrl = CanonicalizeUrl(queryUrl);
if (canonicalizedUrl != null)
var url = new Uri(canonicalizedUrl);
var path = url.AbsolutePath;
// The host list starts with the full host.
var hosts = new List<String> {host};
// NOTE(review): the separator is the two-character literal "\." (backslash, dot). String.Split treats
// separators as literal text, so a dotted host is not split on '.' here — confirm whether "." was intended.
var hostArray = host.Split(new []{"\\."},StringSplitOptions.None);
var sb = new StringBuilder();
// With fewer than 6 elements start at index 1; otherwise start 5 elements from the end.
var start = (hostArray.Length < 6 ? 1 : hostArray.Length - 5);
var stop = hostArray.Length;
// For each start index i, join elements i..stop-1 with '.' to form a host suffix.
for (var i = start; i < stop - 1; i++)
for (var j = i; j < stop; j++)
sb.Append(hostArray[j] + ".");
// Remove the trailing '.' left by the join loop.
sb.Remove(sb.Length - 1, 1);
hosts.Add(sb.ToString());
// Path variants start with the path plus query and the bare path.
var paths = new List<String> {path + query, path};
if (!paths.Contains("/"))
var st = path.Split('/');
foreach (var thisToken in st)
// Extends pathElement with the next segment, adding a trailing "/" unless the segment contains a '.'.
pathElement = pathElement + thisToken + (thisToken.IndexOf(".", StringComparison.Ordinal) == -1 ? "/" : "");
if (!paths.Contains(pathElement))
// Every host variant is combined with every path variant.
foreach (string selectedHost in hosts)
urls.AddRange(paths.Select(t => selectedHost + t));
// Wraps the caught exception e (the catch clause is not visible).
throw new Exception("Could not generate lookup URLs", e);
/// <summary>
/// Unit tests that feed URL strings through Canonicalisation.CanonicalizeUrl (via the private
/// Canonicalize helper) and compare the result with a literal canonical form.
/// </summary>
/// <remarks>
/// NOTE(review): every assertion passes the computed value first and the literal second. The test
/// framework is not visible here; if Assert.AreEqual takes (expected, actual), failure messages will
/// report the two values swapped — confirm.
/// NOTE(review): some assertions below have no visible method header or opening Assert call; those
/// lines appear to be missing from this view.
/// </remarks>
public class CanonicalisationTest
// Percent-encoded "%25" sequences: repeated decoding should leave a single escaped percent sign.
Assert.AreEqual(Canonicalize("http://host/%25%32%35"), "http://host/%25");
public void DoubleTrimTest()
Assert.AreEqual(Canonicalize("http://host/%25%32%35%25%32%35"), "http://host/%25%25");
public void NoSlashtrimTest()
Assert.AreEqual(Canonicalize("http://host/%2525252525252525"), "http://host/%25");
public void MiddleUrlTrimTest()
Assert.AreEqual(Canonicalize("http://host/asdf%25%32%35asd"), "http://host/asdf%25asd");
public void PercentageRecognitionTestTest()
Assert.AreEqual(Canonicalize("http://host/%%%25%32%35asd%%"), "http://host/%25%25%25asd%25%25");
Assert.AreEqual(Canonicalize("http://www.google.com/"), "http://www.google.com/");
public void AdvancedIpTest()
// Continuation lines of an assertion whose opening call is not visible: fully percent-encoded host and path.
"http://%31%36%38%2e%31%38%38%2e%39%39%2e%32%36/%2E%73%65%63%75%72%65/%77%77%77%2E%65%62%61%79%2E%63%6F%6D/"),
"http://168.188.99.26/.secure/www.ebay.com/");
public void BasicIpTest()
// Continuation lines of an assertion whose opening call is not visible: input and expected value are identical.
"http://195.127.0.11/uploads/%20%20%20%20/.verify/.eBaysecure=updateuserdataxplimnbqmn-xplmvalidateinfoswqpcmlx=hgplmcx/"),
"http://195.127.0.11/uploads/%20%20%20%20/.verify/.eBaysecure=updateuserdataxplimnbqmn-xplmvalidateinfoswqpcmlx=hgplmcx/");
public void IpConversionTest()
// A host given as a single decimal number is expected to come back as a dotted-quad address.
Assert.AreEqual(Canonicalize("http://3279880203/blah"), "http://195.127.0.11/blah");
public void NoPrototcolTest()
Assert.AreEqual(Canonicalize("www.google.com/"), "http://www.google.com/");
public void NoProtocolAndEndingSlashTest()
Assert.AreEqual(Canonicalize("www.google.com"), "http://www.google.com/");
public void HashtagTrimTest()
// The fragment part is expected to be dropped.
Assert.AreEqual(Canonicalize("http://www.evil.com/blah#frag"), "http://www.evil.com/blah");
public void CapitalizationTest()
Assert.AreEqual(Canonicalize("http://www.GOOgle.com/"), "http://www.google.com/");
Assert.AreEqual(Canonicalize("http://www.google.com.../"), "http://www.google.com/");
public void BackSlashTrimTest()
// Tab, carriage return and newline characters inside the path are expected to be removed.
Assert.AreEqual(Canonicalize("http://www.google.com/foo\tbar\rbaz\n2"), "http://www.google.com/foobarbaz2");
public void UrlParameterBasicTest()
Assert.AreEqual(Canonicalize("http://www.google.com/q?"), "http://www.google.com/q?");
public void UrlparamaterOneParameterTest()
Assert.AreEqual(Canonicalize("http://www.google.com/q?r?"), "http://www.google.com/q?r?");
public void UrlParameterDoubleParameterTest()
Assert.AreEqual(Canonicalize("http://www.google.com/q?r?s"), "http://www.google.com/q?r?s");
public void AdvancedHashTagTestTest()
Assert.AreEqual(Canonicalize("http://evil.com/foo#bar#baz"), "http://evil.com/foo");
Assert.AreEqual(Canonicalize("http://evil.com/foo);"), "http://evil.com/foo);");
public void ParameterSlashTest()
Assert.AreEqual(Canonicalize("http://evil.com/foo?bar);"), "http://evil.com/foo?bar);");
public void NotrailingSlashTest()
Assert.AreEqual(Canonicalize("http://notrailingslash.com"), "http://notrailingslash.com/");
Assert.AreEqual(Canonicalize("http://www.gotaport.com:1234/"), "http://www.gotaport.com:1234/");
public void TrailingSpacesTest()
Assert.AreEqual(Canonicalize(" http://www.google.com/ "), "http://www.google.com/");
Assert.AreEqual(Canonicalize("https://www.securesite.com/"), "https://www.securesite.com/");
public void TwoSlashesTest()
// A doubled slash is expected to collapse in the path but stay untouched in the query.
Assert.AreEqual(Canonicalize("http://host.com//twoslashes?more//slashes") , "http://host.com/twoslashes?more//slashes");
// Helper shared by the tests: runs the input through the shared Canonicalisation instance.
private string Canonicalize(string url)
var canonizer = Canonicalisation.GetInstance();
return canonizer.CanonicalizeUrl(url);
public void AdvancedLeadingSpaceTest()
Assert.AreEqual(Canonicalize("http://%20leadingspace.com/"), "http://%20leadingspace.com/");
public void BasicLocationTest()
// An already-escaped '#' (%23) in the path is expected to stay escaped.
Assert.AreEqual(Canonicalize("http://host.com/ab%23cd") , "http://host.com/ab%23cd");
public void ConversionTest()
// Continuation lines of an assertion whose opening call is not visible: double-encoded punctuation in the path.
"http://host%23.com/%257Ea%2521b%2540c%2523d%2524e%25f%255E00%252611%252A22%252833%252944_55%252B"),
"http://host%23.com/~a!b@c%23d$e%25f^00&11*22(33)44_55+");
public void NoProtocolLeadingSpaceTest()
Assert.AreEqual(Canonicalize("%20leadingspace.com/"), "http://%20leadingspace.com/");
public void UnicodeTest()
// Characters \x01 and \x80 in the host are expected to come back percent-escaped.
Assert.AreEqual(Canonicalize("http://\x01\x80.com/"), "http://%01%80.com/");
public void DotTrimTest()
Assert.AreEqual(Canonicalize("http://www.google.com/blah/.."), "http://www.google.com/");
public void LeadingSpaceTest()
// A literal space in the host is expected to come back as %20.
Assert.AreEqual(Canonicalize("http:// leadingspace.com/"), "http://%20leadingspace.com/");