using System;
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Linq;
using System.Net.Http;
using System.Net.Http.Headers;
using System.Security.Authentication;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading;
using System.Threading.Tasks;

public class Program
{
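    // Fires one GET per integer from FirstIndexToRequest to LastIndexToRequest at
    // http://numbersapi.com/{n}, spread round-robin over a small pool of HttpClient
    // instances. Each response is classified as relevant, irrelevant (a stock
    // "boring number" placeholder) or failed; relevant URLs are appended to
    // z.CrawlerResult.txt and a summary is printed when the crawl ends.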
    // Timing for the final summary.
    private static Stopwatch CrawlerStopWatch = new Stopwatch();
    // These three HTTP/threading fields are not referenced in the code below.
    private static HttpClient HttpClientCrawler = null;
    private static HttpClientHandler HttpClientCrawlerHttpsHandler = new HttpClientHandler() { SslProtocols = SslProtocols.Tls12 | SslProtocols.Tls11 | SslProtocols.Tls };
    private static BlockingCollection<Thread> CrawlerThreads = new BlockingCollection<Thread>();
    // Thread-safe buckets used to classify every crawled URL.
    private static BlockingCollection<String> CrawlerFailuresBlockingCollection = new BlockingCollection<String>();
    private static BlockingCollection<String> CrawlerHitsBlockingCollection = new BlockingCollection<String>();
    private static BlockingCollection<String> CrawlerMissesBlockingCollection = new BlockingCollection<String>();
    // Target API and the URL template "http://numbersapi.com/{0}".
    private static String BaseUrl = "http://numbersapi.com/";
    private static String uriTemplate = String.Concat(BaseUrl, "{0}");
    // ThreadsCount * ThreadRange = 10000 items to process; Offset is not used below.
    private static int ThreadsCount = 200;
    private static int ThreadRange = 50;
    private static int Offset = 0;
    private static int ItemsToProcess = ThreadsCount * ThreadRange;
    private static int MaxUrlLength = String.Format(uriTemplate, ItemsToProcess).Length;
    // Guards appends to the result file.
    private static ReaderWriterLockSlim CrawlerReaderWriterLockSlim = new ReaderWriterLockSlim();
    private const int NumberOfSimultaneousConnections = 20;
    private const int MaxResponseContentBufferSize = 4096;
    private const int FirstIndexToRequest = 1;
    private const int LastIndexToRequest = 10000;
    private static String CrawlerResultantFileName = "z.CrawlerResult.txt";
    public static void Main(String[] args)
    {
        CrawlerStopWatch.Start();
        // A small pool of HttpClient instances; requests are spread across it round-robin.
        HttpClientHandler hch = new HttpClientHandler() { Proxy = null, UseProxy = false };
        HttpClient[] clients = Enumerable.Range(0, NumberOfSimultaneousConnections).Select(i =>
            new HttpClient(hch) { MaxResponseContentBufferSize = MaxResponseContentBufferSize }).ToArray();
        List<Task<string>> tasks = new List<Task<string>>();
        try
        {
            for (int i = FirstIndexToRequest; i <= LastIndexToRequest; ++i)
            {
                string uri = string.Format(uriTemplate, i);
                tasks.Add(ProcessURLAsync(uri, clients[i % NumberOfSimultaneousConnections]));
            }
            Task.WaitAll(tasks.ToArray());
            string[] results = tasks.Select(t => t.Result).ToArray();
        }
        catch (Exception) { /* individual request failures are handled in ProcessURLAsync */ }
        finally { FinalizeCrawler(); }
        Console.WriteLine("Wyck");
    }
    private async static Task<string> ProcessURLAsync(string uri, HttpClient client)
    {
        HttpResponseMessage response = null;
        try { response = await client.GetAsync(uri); response.EnsureSuccessStatusCode(); }
        catch (Exception e) { HandleCrawlerFailure(uri, new HttpResponseMessage() { ReasonPhrase = e.Message }); return String.Empty; } // early return: the response is unusable here
        byte[] content = await response.Content.ReadAsByteArrayAsync();
        String EncodedContent = Encoding.UTF8.GetString(content);
        if (isResultRelevant(EncodedContent)) { HandleCrawlerRelevantResult(uri, EncodedContent); } else { HandleCrawlerIrrelevantResult(uri, EncodedContent); }
        return EncodedContent;
    }
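    // Records a failed URL and logs the status code and reason phrase of the
    // (possibly synthesized) response describing the failure.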
    public static void HandleCrawlerFailure(String Url, HttpResponseMessage Response)
    {
        CrawlerFailuresBlockingCollection.Add(Url);
        int ProcessedItems = CrawlerHitsBlockingCollection.Count + CrawlerMissesBlockingCollection.Count + CrawlerFailuresBlockingCollection.Count;
        Console.WriteLine($"[Item #{ProcessedItems.ToString().PadRight(ItemsToProcess.ToString().Length)}] {Url.PadRight(MaxUrlLength)} returned {(int)Response.StatusCode} Code | {Response.ReasonPhrase}");
    }
    public static Boolean isResultRelevant(String Content)
    {
        Boolean IsRelevant = true;
        String[] RegularExpressionsArray = new string[]
        {
            @"^[\d]+ is a boring number\.$",
            @"^[\d]+ is an uninteresting number\.$",
            @"^[\d]+ is an unremarkable number\.$",
            @"^[\d]+ is a number for which we're missing a fact \(submit one to numbersapi at google mail!\)\.$",
        };
        foreach (String RegularExpression in RegularExpressionsArray) { if (Regex.IsMatch(Content, RegularExpression)) { IsRelevant = false; break; } }
        return IsRelevant;
    }
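    // Persists a relevant URL to the result file and logs progress.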
    public static void HandleCrawlerRelevantResult(String Url, String Content)
    {
        CrawlerResultantFileWriteLine(Url);
        CrawlerHitsBlockingCollection.Add(Url);
        int ProcessedItems = CrawlerHitsBlockingCollection.Count + CrawlerMissesBlockingCollection.Count + CrawlerFailuresBlockingCollection.Count;
        Console.WriteLine($"[Item #{ProcessedItems.ToString().PadRight(ItemsToProcess.ToString().Length)}] {Url.PadRight(MaxUrlLength)} is relevant");
    }
    public static void HandleCrawlerIrrelevantResult(String Url, String Content)
    {
        CrawlerMissesBlockingCollection.Add(Url);
        int ProcessedItems = CrawlerHitsBlockingCollection.Count + CrawlerMissesBlockingCollection.Count + CrawlerFailuresBlockingCollection.Count;
        Console.WriteLine($"[Item #{ProcessedItems.ToString().PadRight(ItemsToProcess.ToString().Length)}] {Url.PadRight(MaxUrlLength)} is irrelevant");
    }
    public static void FinalizeCrawler()
    {
        TimeSpan Elapsed = TimeSpan.FromMilliseconds(CrawlerStopWatch.ElapsedMilliseconds);
        String TimeLapseInformation = String.Format("{0:D2}h:{1:D2}m:{2:D2}s:{3:D3}ms",
            Elapsed.Hours, Elapsed.Minutes, Elapsed.Seconds, Elapsed.Milliseconds);
        Console.WriteLine($"Crawling finished in {TimeLapseInformation}.");
        Console.WriteLine($"<{CrawlerFailuresBlockingCollection.Count + CrawlerHitsBlockingCollection.Count + CrawlerMissesBlockingCollection.Count}> out of <{ItemsToProcess}> items have been crawled having <{CrawlerHitsBlockingCollection.Count}> relevant items, <{CrawlerMissesBlockingCollection.Count}> irrelevant items and <{CrawlerFailuresBlockingCollection.Count}> failures.");
        Console.WriteLine("###############################");
    }
    public static void CrawlerResultantFileWriteLine(String Line)
    {
        CrawlerReaderWriterLockSlim.EnterWriteLock();
        try { using (StreamWriter StreamWriter = File.AppendText(Path.Combine(AppDomain.CurrentDomain.BaseDirectory, CrawlerResultantFileName))) { StreamWriter.WriteLine(Line); } }
        finally { CrawlerReaderWriterLockSlim.ExitWriteLock(); }
    }
}