, . .
, , .
Hashmap map<url,count>
while(log file has nextline){
url = nextline in logfile
add url to map and update count
}
List list
foreach(m in map){
add m to list
}
sort the list by count value
take top n from the list
- O (n) + O (m * log (m)), n - m - .
# . .
.
The algorithm uses a hash map to store found links. After that, the sorting algorithm sets the top 100 links. The sorting algorithm uses a simple data container data structure.
The complexity of the memory depends on the expected individual links. The hashmap should be able to contain the found individual links, otherwise this algorithm will not work.
using System;
using System.Collections.Generic;
using System.Linq;
public class Program
{
public static void Main(string[] args)
{
RunLinkCount();
Console.WriteLine("press a key to exit");
Console.ReadKey();
}
class LinkData : IComparable
{
public string Url { get; set; }
public int Count { get; set; }
public int CompareTo(object obj)
{
var other = obj as LinkData;
int i = other == null ? 0 : other.Count;
return i.CompareTo(this.Count);
}
}
static void RunLinkCount()
{
var urls = new List<string>();
var rand = new Random();
const int loglength = 500000;
for (int i = 0; i < loglength; i++)
{
urls.Add(string.Format("http://{0}.com", rand.Next(1000)
.ToString("x")));
}
var lookup = new Dictionary<string, int>();
var stopwatch = new System.Diagnostics.Stopwatch();
stopwatch.Start();
foreach (var url in urls)
{
if (lookup.ContainsKey(url))
{
int i = lookup[url];
lookup[url] = i + 1;
}
else
{
lookup.Add(url, 1);
}
}
var list = lookup.Select(i => new LinkData
{ Url = i.Key, Count = i.Value }).ToList();
list.Sort();
var top = list.Take(100).ToList();
stopwatch.Stop();
foreach (var i in top)
{
Console.WriteLine("Url: {0}, Count: {1}", i.Url, i.Count);
}
Console.WriteLine(string.Format("Time elapsed msec: {0}",
stopwatch.ElapsedMilliseconds));
}
}
Edit: This answer has been updated based on comments.
- Added: runtime and memory complexity analysis.
- added: pseudocode
- added: explain how we manage a fairly large log file
source
share