The quickest solution is to do this completely in memory, restricting disk reading in just one pass. If you do not have enough memory to do this, I suggest dividing it into pieces.
O (n ^ 2) - . 1000 , . , , , , .
my $buffersize = 1000;
open FH1, '/arf/test.txt';
open FH2, '/arf/test.txt';
while (! eof(FH1)) {
my @buffer = ();
for (1..$buffersize) {
my $outer = <FH1>;
chomp $outer;
push @buffer, $outer;
last if eof(FH1);
}
while ($inner = <FH2>){
for (@buffer) {
[code to compare line];
}
}
}
, 25 -
, , test.txt, 1-10 . 4, script 4 11, .
use strict;
use warnings;
use autodie;
my $buffersize = 4;
my $file = 'test.txt';
open my $fh1, '<', $file;
open my $fh2, '<', $file;
while (! eof($fh1)) {
my $startofbuffer = tell($fh1);
seek($fh2, $startofbuffer, 0);
my @buffer = ();
for (1..$buffersize) {
chomp(my $outer = <$fh1>);
print "buffer, $.\n";
push @buffer, $outer;
last if eof($fh1);
}
for (my $offset = 0; !eof($fh2); $offset++) {
chomp(my $inner = <$fh2>);
for my $i (0..$#buffer) {
last if $i >= $offset;
my $outer = $buffer[$i];
print "Compare outer $outer to inner $inner\n";
}
}
}
buffer, 1
buffer, 2
buffer, 3
buffer, 4
Compare outer 1 to inner 2
Compare outer 1 to inner 3
Compare outer 2 to inner 3
Compare outer 1 to inner 4
Compare outer 2 to inner 4
Compare outer 3 to inner 4
Compare outer 1 to inner 5
Compare outer 2 to inner 5
Compare outer 3 to inner 5
Compare outer 4 to inner 5
Compare outer 1 to inner 6
Compare outer 2 to inner 6
Compare outer 3 to inner 6
Compare outer 4 to inner 6
Compare outer 1 to inner 7
Compare outer 2 to inner 7
Compare outer 3 to inner 7
Compare outer 4 to inner 7
Compare outer 1 to inner 8
Compare outer 2 to inner 8
Compare outer 3 to inner 8
Compare outer 4 to inner 8
Compare outer 1 to inner 9
Compare outer 2 to inner 9
Compare outer 3 to inner 9
Compare outer 4 to inner 9
Compare outer 1 to inner 10
Compare outer 2 to inner 10
Compare outer 3 to inner 10
Compare outer 4 to inner 10
buffer, 5
buffer, 6
buffer, 7
buffer, 8
Compare outer 5 to inner 6
Compare outer 5 to inner 7
Compare outer 6 to inner 7
Compare outer 5 to inner 8
Compare outer 6 to inner 8
Compare outer 7 to inner 8
Compare outer 5 to inner 9
Compare outer 6 to inner 9
Compare outer 7 to inner 9
Compare outer 8 to inner 9
Compare outer 5 to inner 10
Compare outer 6 to inner 10
Compare outer 7 to inner 10
Compare outer 8 to inner 10
buffer, 9
buffer, 10
Compare outer 9 to inner 10
, 2 -
, script, :
use strict;
use warnings;
use autodie;
my $lines = shift or die "Missing line count\n";
die "Lines outside of range 1 - 1,000,000" if $lines < 1 or $lines > 1_000_000;
my $fakedata = 70;
my $filename = 'fd' . sprintf("%06d", $lines) . '.txt';
open my $fh, '>', $filename;
for my $i (1..$lines) {
my $fake = join '', map {('a'..'z')[int rand 26]} (1..$fakedata);
$fh->print("$i fake${fake}fake\n");
}
close $fh;
print "Created $filename\n";
1;
__END__
, , . 1_000, 10_000 20_000.
For Buffer size, show Time in sec and (# of File Reads)
--------------------------------------------------------
File by lines b = 1 b = 10 b = 100 b = 1k b = 10k
------------- ----- ------ ------- ------ -------
Lines = 1k t = 1.54s t = 0.35s t = 0.22s t = 0.21
Size = 88k r = (1001) r = (101) r = (11) r = (2)
Tests = 500k
Lines = 10k t = 185s t = 35s t = 23s t = 21.7s t = 21.5s
Size = 899k r = (10k) r = (1k) r = (101) r = (11) r = (2)
Tests = 50m
Lines = 20k t = 593s t = 136s t = 90s t = 86s t = 85.5s
Size = 1.8m r = (20k) r = (2k) r = (201) r = (21) r = (3)
Tests = 200m
, 100 script 5 . - , N * N/2. 200 20k, , , 4 , 10k .
, 250 156,25 , 20- . 3,71 25,7 . , , , , , / .
, , . , , O (n log n) O (N ** 2). , , perl , . , , , .
, .