#!/usr/bin/perl -w

# Copyright 2002 Magnus Ihse, magnus@ihse.net
#
# You are allowed to copy, modify and redistribute this program
# according to the terms in the Gnu Public License (GPL).
# The GPL can be found here:
# http://www.gnu.org/licenses/gpl
#

# version 1.0 - 2002-01-08 - first deployed version
# version 1.1 - 2002-01-09 - trimmed parameters, added functionality
# version 1.11 - 2002-01-09 - lowered uppercase score to 0.5

#help
# formail -f -A "X-spam-value: 15" < spamsource

#my IN;

#Configuration
my $score_multiple_recipients = 2;
my $score_spam_id = 4;
my $score_html = 3;
my $score_keyword_match = 1;
my $score_url_match = 0.5;
my $score_line_uppercase = 0.5;
my $score_foreign_sender = 4;
my $score_foreign_charset = 4;
my $score_subject_uppercase = 5;
my $score_subject_contains_adv = 10;
my $score_feedback_form = 8;
my $score_remove_mail = 10;
my $score_mailinglist = 15; # substract if real mailinglist
my $score_personal_to = 8;

my $KW1 = $score_keyword_match;
my $KW2 = $KW1 + 5;

# Start score assuming no "from" header
my $score = $score_foreign_sender + $score_personal_to;
my $debug = 0;
my $print_resulting_score = 0;
my $score_threshold = 15;
my $add_headers = 1;

my @body;
my @saved_headers;

my %keywords = (
                #GENERAL SPAM KEYWORDS
                "click" => $KW2,
                "spammer" => $KW1,
                "removed" => $KW2,
                "removal" => $KW2,
                "remove" => $KW2,
                "register" => $KW1,
                "registration" => $KW1,
                "list" => $KW1,
                "opt-in" => $KW2,
                "301" => $KW1,
                "1618" => $KW1,
                "promotion" => $KW1,
                "feedback" => $KW1,
                "spam" => $KW1,
                "advertisement" => $KW1,
                "inadvertently" => $KW1,
                "future" => $KW1,
                "mailing" => $KW1,
                "visit" => $KW1,
                "100%" => $KW1,
                "guaranteed" => $KW1,
                "fee" => $KW1,
                "gift" => $KW1,
                "buy" => $KW1,
                "pay" => $KW1,
                "value" => $KW1,
                "future" => $KW1,
                "newsletter" => $KW1,
		"opportunity" => $KW1,
		"incredible" => $KW1,
		"welcome" => $KW1,

                #TYPICAL SELL SPAM
		"obligation" => $KW1,
                "today" => $KW1,
                "price" => $KW1,
                "order" => $KW1,
                "save" => $KW1,

                #TYPICAL PAYMENT SPAM
                "debt" => $KW1,
                "income" => $KW1,
		"bills" => $KW1,
		"invest" => $KW1,
		"investment" => $KW1,

                #TYPICAL WIN SPAM
                "congratulations" => $KW1,
                "contest" => $KW1,
                "vacation" => $KW1,
                "win" => $KW1,
                "won" => $KW1,
                "confirm" => $KW1,
                "entry" => $KW1,
		"invited" => $KW1,

                #TYPICAL SEX SPAM
		"hot" => $KW1,
                "free" => $KW1,
                "membership" => $KW1,
                "sex" => $KW2,
                "sexy" => $KW1,
                "porn" => $KW2,
                "nude" => $KW2,
                "teen" => $KW1,
                "credit" => $KW1,
                "webcam" => $KW1,
                "live" => $KW1,
                "adult" => $KW2,
                "pictures" => $KW1,
                "cum" => $KW1,
                "xxx" => $KW1,
                "pics" => $KW1,

                # TYPICAL HEATH SPAM
                "weight" => $KW1,
                "diet" => $KW1,
                "fat" => $KW1,
                "health" => $KW1,
                "aging" => $KW1,
                "dieting" => $KW1,

                # DOMAIN NAME SPAM
                "domain" => $KW1,
                "site" => $KW1,
                "transfer" => $KW1,
                "register" => $KW1,

                # SPANISH KEYWORDS
                "remover" => $KW1,
                "lista" => $KW1,
                "removido" => $KW1,
                
                # GERMAN KEYWORDS
                "seite" => $KW1 );


       


sub score_keywords {
  my $s = $_[0];

  my @words = split (/\s/, lc($s));
  # FIXME: remove punctuation chars

  foreach $_ (@words) {
    if (exists $keywords{$_}) {
      $score += $keywords{$_};
      print "Score (" . $keywords{$_} . "): Keyword match ($_)\n" if ($debug);
    }
    if (/^http:/) {
      $score += $score_url_match;
      print "Score ($score_url_match): URL match ($_)\n" if ($debug);
    }
    if (/^mailto:.*remove/) {
      $score += $score_remove_mail;
      print "Score ($score_remove_mail): Remove mail address ($_)\n" if ($debug);
    }
  }
}

sub score_from {
  my $s = $_[0];

  # If sender has from header, and is from Sweden, remove presupposed score
  if ($s =~ /\.se/ || $s =~ /\.nu/) {
    $score -= $score_foreign_sender;
    print "Score (-$score_foreign_sender): Sender is from Sweden ($s)\n" if ($debug);
  }
}

sub count_recipients {
  my $s = $_[0];

  $_ = $s;
  my $count = s/\@/!/g;
  return $count;
}

sub check_personal_to {
  my $s = $_[0];

  if (($s =~ /d95-mih\@.*\.kth\.se/) or ($s =~ /\@ihse\.net/) or
      ($s =~ /ihse\@stacken\.kth\.se/)) {
    $score -= $score_personal_to;
    print "Score (-$score_personal_to): Mail personally addressed to me
($s)\n" if ($debug);
  }  
}

sub score_to {
  my $s = $_[0];

  check_personal_to($s);
  $count = count_recipients($s);
  if ($count > 6) {
    $score += $score_multiple_recipients;
    print "Score ($score_multiple_recipients): Multiple to-addresses (count: $count)\n" if ($debug);
  }
}

sub score_cc {
  my $s = $_[0];

  check_personal_to($s);
  $count = count_recipients($s);
  if ($count > 6) {
    $score += $score_multiple_recipients;
    print "Score ($score_multiple_recipients): Multiple cc-addresses (count: $count)\n" if ($debug);
  }
}

sub is_mostly_uppercase {
  my $s = $_[0];

  $_ = $s;
  my $lccount = s/[a-z]/ /g;
  my $uccount = s/[A-Z]/ /g;

  return ((length($s) >= 10) and ($uccount > 10*$lccount));
}

sub score_subject {
  my $s = $_[0];

#  subject: [tal/spamID > 3000] // 3000 för att inte fastna på årtal
  if ($s =~ /(\d{4,})/) {
    if ($1 > 3000) {
      $score += $score_spam_id;
      print "Score ($score_spam_id): Spam ID in subject (ID: $1)\n" if ($debug);
    }
  }

#subject: A-Z ratio mot a-z > 90%
  if (is_mostly_uppercase($s)) {
    $score += $score_subject_uppercase;
    print "Score ($score_subject_uppercase): Subject is uppercase ($s)\n" if ($debug);
  }
  
#subject: ADV
  if ($s =~ /ADV/) {
    $score += $score_subject_contains_adv;
    print "Score ($score_subject_contains_adv): Subject contains 'ADV'\n" if ($debug);
  }

  if ($s =~ /=?windows-1251?/) {
    $score += $score_foreign_charset;
    print "Score ($score_foreign_charset): Subject is in foreign charset\n" if ($debug);
  }
  score_keywords($s);

}

sub score_contentType {
  my $s = $_[0];

  if ($s =~ /text\/html/i) {
    $score += $score_html;
    print "Score ($score_html): Content-type is HTML\n" if ($debug);
  }
}

sub score_mailinglist {
  $score -= $score_mailinglist;
  print "Score (-$score_mailinglist): Mail is real mailinglist\n" if
($debug);
}

sub handle_header {
  my $header = $_[0];
  ($type, $content) = split(":", $header, 2);
  $type = lc($type);

  if ($type eq "subject") { score_subject($content); }
  if ($type eq "from") { score_from($content); }
  if ($type eq "to") { score_to($content); } 
  if ($type eq "cc") { score_cc($content); }
  if ($type eq "content-type") { score_contentType($content); }
  if ($type eq "list-post") { score_mailinglist($content); }
}

sub read_headers {
  my $last_header = "";
  my $header;
  headers: while (<>) {
    my $original_header = $_;
    chomp;
#    chop;
    $header = $_;
    if (length($header) == 0) {
      if (length($last_header) > 0) {
        handle_header($last_header);
      }
      last headers;
    } else {
      push (@saved_headers, $original_header);
      if ($header =~ /^\s/) {
        $last_header .= $header;
      } else {
        if (length($last_header) > 0) {
          handle_header($last_header);
        }
        $last_header = $header;
      }
    }
  }
}

sub read_body {
  while (<>) {
    push (@body, $_);
    chomp;
#    chop;
    $s = $_;
    score_keywords($s);

    if (is_mostly_uppercase($s)) {
      $score += $score_line_uppercase;
      print "Score ($score_line_uppercase): Line is uppercase ($s)\n" if ($debug);
    }
    if ($s =~ /^Content-Type: text\/html/) {
      $score += $score_html;
      print "Score ($score_html): HTML in multipart\n" if ($debug);
    }
    if ($s =~ /Below is the result of your feedback form/) {
      $score += $score_feedback_form;
      print "Score ($score_feedback_form): Formmail hole used\n" if ($debug);
    }
  }
}
    
    
read_headers();
read_body();

if ($add_headers) {
  print @saved_headers;
  print "X-spam-score: $score\n";
  if ($score > $score_threshold) {
    print "X-spam-status: Probably\n";
  }
  print "\n";
  print @body;
}

print "Total score: " . $score . "\n" if ($print_resulting_score);
if ($add_headers) {
  exit 0;
} else {
  exit $score;
}


