#!/usr/bin/perl -w
use FileHandle;
use Mail::Mbox::MessageParser;
# Mailbox to scan: the first command-line argument, or "mbox" in the current
# directory when none is given. Tested with defined/length rather than plain
# truth so that a file literally named "0" is still honoured.
my $file_name = ( defined $ARGV[0] && length $ARGV[0] ) ? $ARGV[0] : 'mbox';

# Open the mailbox read-only. The explicit 'r' mode keeps characters such as
# '>' or '|' in the supplied name from being interpreted as an open mode,
# which the one-argument form would allow. Stop at once if the open fails
# instead of handing the parser an undefined handle.
my $file_handle = FileHandle->new( $file_name, 'r' )
    or die "Cannot open mailbox '$file_name': $!\n";

# Build the folder reader with caching switched off and grep support
# requested.
my $folder_reader = Mail::Mbox::MessageParser->new( {
    'file_name'    => $file_name,
    'file_handle'  => $file_handle,
    'enable_cache' => 0,
    'enable_grep'  => 1,
} );

# On failure the constructor returns an error string rather than an object.
die "$folder_reader\n" unless ref $folder_reader;
# Any newlines or such before the start of the first email
$folder_reader->prologue();

my $msgcount  = 0;    # messages read from the mailbox
my $foundaddr = 0;    # messages whose From: header yielded an address
my %addrcount;        # lower-cased address => number of messages from it

# Address pattern, compiled once outside the loop. The domain group is
# non-capturing so the whole address can be taken from a single capture
# instead of $&, which slows every regex in the program on older perls.
my $address_re =
    qr/[\w\d\.\,\-\=\+\_\%\$\!\#\^\&]+\@(?:[\w\d\-]+\.)+[\w\d\-]+/;

# This is the main loop. It's executed once for each email
while ( !$folder_reader->end_of_file() ) {
    my $email = $folder_reader->read_next_email();

    # Stop rather than dereference a non-reference if the reader fails.
    last unless ref $email;

    # Progress indicator on STDERR, refreshed every 100 messages.
    print STDERR "\r$msgcount" if ++$msgcount % 100 == 0;

    # Restrict the search to the header block (everything before the first
    # blank line) so that a "From:" line quoted in the body of a message
    # lacking the header is not counted.
    my ($header) = split /\r?\n\r?\n/, $$email, 2;
    next unless defined $header;

    # Unfold continuation lines so an address placed on the line after
    # "From:" is still seen.
    $header =~ s/\r?\n[ \t]+/ /g;

    # Header names are case-insensitive and the space after the colon is
    # optional. The mbox "From " separator line has no colon, so it cannot
    # match here.
    next unless $header =~ /^From:[ \t]*(.*)$/mi;
    my $from_value = $1;

    next unless $from_value =~ /($address_re)/;
    $foundaddr++;
    $addrcount{ lc $1 }++;
}
# Move past the in-place progress counter on STDERR, then print the summary.
print STDERR "\n\n";
print "$msgcount messages processed, of which $foundaddr had legible email addresses.\n\n";

# Ranked list of senders, most frequent first. Equal counts are ordered by
# address so the output does not depend on hash ordering. $count is
# declared lexically like every other variable in the script.
my $count = 1;
foreach my $email (
    sort { $addrcount{$b} <=> $addrcount{$a} or $a cmp $b } keys %addrcount
) {
    printf( "%2d. %-60.60s %d\n",
        $count++, obfuscate($email), $addrcount{$email} );
}
# Return a copy of the given address with every "@" replaced by " AT " and
# every "." replaced by " dot ". The caller's string is not modified.
sub obfuscate {
    my ($address) = @_;

    # Alias $_ to the local copy so both rewrites read as plain s///.
    for ($address) {
        s{\@}{ AT }g;
        s{[.]}{ dot }g;
    }

    return $address;
}
/[\w\d\.\,\-\=\+\_\%\$\!\#\^\&]+\@([\w\d\-]+\.)+[\w\d\-]+/
Your regular expression scares me.
posted by Optimus Chyme at 3:02 PM on August 31, 2006