#!/usr/bin/perl

use strict;
use Encode;
use MIME::Base64;
use MIME::QuotedPrint;
use POSIX qw(_exit uname);

# whitelists and blacklists are located in this directory.
my $DIR = "/home/vandry/var/quarantine.d";

my $env_from = $ENV{'MILTER_FROM'};
$env_from =~ s/^<(.*)>$/$1/;

# If it has an argument (was delivered to user+something@domain) then
# it is probably a mailing list or something and should not be subject
# to checks
acceptmsg() if ($ENV{'MILTER_ARGUMENT'} ne '');
# other mailing lists and such
acceptmsg() if ($env_from =~ /^owner-arin-discuss\@arin\.net$/);
# (others deleted for public version of script)

my $raw = "";

my $header = collect_header(\$raw);
my $fromaddr = get_unique_address($header, 'From');
if ((!defined($fromaddr)) || (!defined($fromaddr->{addr}))) {
	rejectmsg("Message must include a From header\n",
		"Messages without From headers are rejected\n");
}
check_whitelist($fromaddr);
check_blacklist($fromaddr);
from_sanity($fromaddr);
my $subject = get_free_header($header, 'subject');
if (defined($subject)) {
	check_subject_blacklist($subject);
	subject_sanity($subject);
} else {
	rejectmsg("Message must include a Subject header\n",
		"Messages without Subject headers are rejected\n");
}
my $msgid = get_unique_address($header, 'Message-ID');
msgid_sanity($header, $msgid);
received_sanity($header);

# Believe it or not, some spam is marked as such!
if (get_free_header($header, 'x-spam-flag') =~ /YES/i) {
	rejectmsg("Message marked as spam\n",
		"Messages marked as spam are rejected\n");
}

# Still cannot decide? Read the rest of the message

# collect body
$raw .= $_ while (<STDIN>);

my $result = fork();
if (defined($result)) {
	if ($result == 0) {
		if (open(STDIN, "-|") == 0) {
			print $raw;
			$| = 1;
			_exit(0);
		}
		# This thing does some checks on the bodies
		# It must not assume that it will receive the
		# entire message, it might be cut off if the
		# per-user-milter server makes a decision
		exec('/home/vandry/bin/emptycheck', 'partial', 'html');
		_exit(0);
	}
	waitpid $result, 0;
	rejectmsg() if ($? == 512);
}

# Default action
acceptmsg();

sub from_sanity {
	my ($addr) = @_;

	unless ($addr->{addr} =~ /^[^@]+@[^@]+$/) {
		rejectmsg("From header does not contain an email address in user\@host format\n",
			"Messages without email addresses in the from header are rejected\n");
	}

	if (exists($addr->{displayname})) {
		my $dn = $addr->{displayname};
		my $len = length($dn);

		$_ = $dn; s/\s//g;
		my $spaces = $len - length($_);

		$_ = $dn; s/\p{Unassigned}//g; s/\x{fffd}//g;
		my $badchar = $len - length($_);

		if ((($badchar / ($len - $spaces)) > .4) || ($badchar > 5)) {
			rejectmsg("Your name contains too many garbage or invalid characters\n",
				"Messages with very many garbage characters in From are rejected\n");
		}

		if (($spaces / $len) > .6) {
			rejectmsg("Your name contains too many space characters\n",
				"Messages with very many spaces in From are rejected\n");
		}

		if ($dn =~ /^\s*\(\d\d\d\) \d\d\d-\d\d\d\d/) {
			rejectmsg("Your name starts with a phone number\n",
				"Messages with names that start with phone numbers are rejected\n");
		}

		if ($dn =~ /^\s*$/) {
			rejectmsg("Your name is present in your email but blank\n",
				"A blank name is not acceptable\n");
		}

		my $any_nonblacklisted_words = 0;
		for my $word (split(/\s+/, $dn)) {
			# I just added this test recently. I expect to add more words here
			next if (lc($word) eq 'UK');
			next if (lc($word) eq 'national');
			next if (lc($word) eq 'lottery');
			next if (lc($word) eq 'Cialis');
			$any_nonblacklisted_words = 1;
			last;
		}
		if (!$any_nonblacklisted_words) {
			rejectmsg("Your name is made up exclusively of blacklisted words\n",
				"Please choose a better display name\n",
				"The one given is unlikely to be your name.\n");
		}
		if ($dn =~ /\p{Han}[ ,-]\p{Han}[ ,-]\p{Han}[ ,-]\p{Han}/) {
			rejectmsg("Your name follows a regular pattern of alternating Chinese characters\n",
				"and separator characters commonly used by spammers\n",
				"Messages with display names in From following this pattern are rejected\n");
		}
	}
}

sub msgid_sanity {
	my ($h, $addr) = @_;

	if ((!defined($addr)) || (!defined($addr->{addr}))) {
		return;
	}

	unless ($addr->{addr} =~ /^[^@]+@[^@]+$/) {
		rejectmsg("Message-ID header does not contain an ID in id\@host format\n",
			"Messages with malformed Message-ID headers are rejected\n");
	}

	my ($sysname, $nodename, $release, $version, $machine) = uname();
	my ($localname, $aliases) = gethostbyname($nodename);

	if (lc(substr($addr->{addr}, -length($localname)-1)) eq lc('@' . $localname)) {
		my $foundlocal = 0;
		for my $h (received_by_hosts($h)) {
			if (lc($h) eq lc($localname)) {
				$foundlocal = 1;
				last;
			}
		}
		if (!$foundlocal) {
			rejectmsg("Message-ID was locally generated but local system did not see it before\n",
				"Possibly forged Message-ID header, rejected\n");
		}
	}
}

sub received_sanity {
	my ($h) = @_;

	for my $h (received_by_hosts($h)) {
		if (lc($h) eq 'tzone.org') {
			# No TZoNE.ORG system identifies itself this way in Received headers!
			# but lots of spammers falsify such a system name
			rejectmsg("Message claims to be received by \"" . $h .
					"\" but no MTA uses this name\n",
				"Messages with fraudulent/falsified Received headers are rejected\n");
		}
	}
}

sub subject_sanity {
	my ($subj) = @_;

	my $len = length($subj);

	$_ = $subj; s/\s//g;
	my $spaces = $len - length($_);

	$_ = $subj; s/\p{Unassigned}//g; s/\x{fffd}//g;
	my $badchar = $len - length($_);

	if ((($badchar / ($len - $spaces)) > .4) || ($badchar > 5)) {
		rejectmsg("The message subject  contains too many garbage or invalid characters\n",
			"Messages with very many garbage characters in Subject are rejected\n");
	}

	if (($spaces / $len) > .6) {
		rejectmsg("The message subject contains too many space characters\n",
			"Messages with very many spaces in Subject are rejected\n");
	}

	if ($subj =~ /\p{Han}[ ,-]\p{Han}[ ,-]\p{Han}[ ,-]\p{Han}/) {
		rejectmsg("The message subject follows a regular pattern of alternating Chinese\n",
			"characters and separator characters commonly used by spammers\n",
			"Messages with such patterns in the Subject header are rejected\n");
	}
}

sub check_whitelist {
	my ($fromaddr) = @_;

	if (open(WHITELIST, "<", $DIR . "/whitelist")) {
		while (<WHITELIST>) {
			my $from_tokens = tokenize_address_header($_);
			my $fromlist = parse_addresses($from_tokens);
			for my $addr (@{parse_addresses(tokenize_address_header($_))}) {
				if (addr_match($addr, $fromaddr)) {
					close WHITELIST;
					print "Whitelisted\n";
					acceptmsg();
				}
			}
		}
		close WHITELIST;
	} else {
		print "Error opening whitelist: ", $!, "\n";
		acceptmsg();	# accept automatically on this kind of error
	}
}

sub check_blacklist {
	my ($fromaddr) = @_;

	if (open(BLACKLIST, "<", $DIR . "/blacklist")) {
		while (<BLACKLIST>) {
			my $from_tokens = tokenize_address_header($_);
			my $fromlist = parse_addresses($from_tokens);
			for my $addr (@{parse_addresses(tokenize_address_header($_))}) {
				if (addr_match($addr, $fromaddr)) {
					close BLACKLIST;
					rejectmsg("Message sender is on the blacklist.\n",
						"Messages with From headers on the blacklist are rejected.\n");
				}
			}
		}
		close BLACKLIST;
	}
}

sub check_subject_blacklist {
	my ($subject) = @_;

	if (open(BLACKLIST, "<", $DIR . "/subject_blacklist")) {
		while (<BLACKLIST>) {
			chomp;
			if ($_ eq $subject) {
				close BLACKLIST;
				rejectmsg("Message subject is on the blacklist.\n",
					"Messages with Subject headers on the blacklist are rejected.\n");
			}
		}
		close BLACKLIST;
	}
}

sub addr_match {
	my ($aa, $ab) = @_;

	return 0 if (ref($aa) ne 'HASH');
	return 0 if (ref($ab) ne 'HASH');
	return 0 unless ($aa->{addr} eq $ab->{addr});
	if (exists($aa->{displayname}) && exists($ab->{displayname})) {
		return 0 unless ($aa->{displayname} eq $ab->{displayname});
		return 1;
	} elsif ((!exists($aa->{displayname})) && (!exists($ab->{displayname}))) {
		return 1;
	}
	return 0;
}

sub rejectmsg {
	print @_;
	exit 67;
}

sub acceptmsg {
	exit 0;
}

sub get_unique_address {
	my ($h, $name) = @_;

	if (!exists($h->{lc $name})) {
		return undef;
	}
	if (scalar(@{$h->{lc $name}}) > 1) {
		rejectmsg("Message must not contain more than one " . $name . " header\n",
			"Messages with multiple " . $name . " headers are rejected\n");
	}
	my $tokens = tokenize_address_header($h->{lc $name}->[0]);
	my $list = parse_addresses($tokens);

	if (scalar(@$list) < 1) {
		rejectmsg($name . " header does not list any addresses\n",
			"Messages with an invalid or empty " . $name . " header are rejected\n");
	}

	if (scalar(@$list) > 1) {
		rejectmsg($name . " header has more than one address\n",
			"A message is only allowed to have one " . $name . " address or it is rejected\n");
	}

	if (ref($list->[0]) ne 'HASH') {
		rejectmsg($list->[0] . "\n",
			"Messages with unparseable " . $name . " headers are rejected\n");
	}

	$list->[0];
}

sub get_free_header {
	my ($h, $name) = @_;

	if (!exists($h->{$name})) {
		return undef;
	}
	my $tokens = [
		map { [ 'nextheader' ], @{tokenize_free_header($_)} } @{$h->{$name}}
	];
	shift @$tokens;

	as_text($tokens, 1);
}

sub parse_addresses {
	my ($t) = @_;
	my $list = [];
	my $result = [];
	my $new = 1;

	if (ref($t) ne 'ARRAY') {
		return [ $t ];
	}
	for my $x (@$t) {
		if ($x->[0] eq ',') {
			$new = 1;
		} elsif ($new) {
			push @$list, [ $x ];
			$new = 0;
		} else {
			push @{$list->[scalar(@$list)-1]}, $x;
		}
	}
	for my $addr (@$list) {
		my $have_left = 0;
		my $between = [];
		my $not_between = [];
		my $have_right = 0;
		my $have_quoted = 0;
		my $quoted_text = "";
		for my $x (@$addr) {
			if ($x->[0] eq '<') {
				$have_left = 1;
				next;
			} elsif ($x->[0] eq '>') {
				$have_right = 1;
				next;
			}
			if ($have_left && (!$have_right)) {
				push @$between, $x;
			} elsif ($x->[0] eq 'quoted-string') {
				if ($have_quoted == 0) {
					$have_quoted = 1;
				} else {
					$quoted_text .= " ";
				}
				$quoted_text .= $x->[1];
			} else {
				push @$not_between, $x;
			}
		}
		if (
			($have_left && (!$have_right)) ||
			($have_right && (!$have_left))
		) {
			push @$result, "Unbalanced < >";
		} elsif ($have_left && $have_right) {
			push @$result, {
				displayname => $have_quoted ?
					clean_text($quoted_text) :
					as_text($not_between, 1),
				addr => as_text($between, 0),
			};
		} elsif ($have_quoted) {
			push @$result, {
				displayname => rfc2047($quoted_text),
				addr => as_text($not_between, 0),
			};
		} else {
			push @$result, {
				addr => as_text($addr, 0),
			};
		}
	}
	$result;
}

sub as_text {
	my ($t, $rfc2047) = @_;
	my $result = "";
	my $last_is_text = 0;

	for my $x (@$t) {
		if (($x->[0] eq 'text') || ($x->[0] eq 'quoted-string')) {
			$result .= " " if ($last_is_text);
			$last_is_text = 1;
			$result .= $rfc2047 ? rfc2047($x->[1]) : clean_text($x->[1]);
		} elsif ($x->[0] eq 'whitespace') {
			$result .= $x->[1];
			$last_is_text = 0;
		} else {
			$last_is_text = 0;
			$result .= $x->[0];
		}
	}
	$result;
}

sub tokenize_address_header {
	my ($s) = @_;
	my $result = [];

	while ($s ne '') {
		next if ($s =~ s/^\s+//);
		if (substr($s, 0, 1) eq "\"") {
			my $end = index($s, "\"", 1);
			return "Unbalanced quotes" if ($end < 1);
			push @$result, [ 'quoted-string', substr($s, 1, $end-1) ];
			substr($s, 0, $end+1) = '';
		} elsif (
			(substr($s, 0, 1) eq '<') ||
			(substr($s, 0, 1) eq '>') ||
			(substr($s, 0, 1) eq ',') ||
			(substr($s, 0, 1) eq '@')
		) {
			push @$result, [ substr($s, 0, 1) ];
			substr($s, 0, 1) = '';
		} else {
			$s =~ s/^([^"<>,@\s]+)//;
			push @$result, [ 'text', $1 ];
		}
	}
	$result;
}

sub tokenize_free_header {
	my ($s) = @_;
	my $result = [];

	while ($s ne '') {
		if ($s =~ s/^(\s+)//) {
			push @$result, [ 'whitespace', $1 ];
		} elsif ($s =~ s/^(\S+)//) {
			push @$result, [ 'text', $1 ];
		}
	}
	$result;
}

sub received_by_hosts {
	my ($h) = @_;
	my @result = ();
	return @result unless (exists($h->{received}));
	for my $r (@{$h->{received}}) {
		my $state = 0;
		for my $t (@{tokenize_free_header($r)}) {
			if ($state == 1) {
				if (($t->[0] eq 'text') && ($t->[1] eq 'by')) {
					$state = 2
				} else {
					$state = 0;
				}
			} elsif ($state == 2) {
				$state = ($t->[0] eq 'whitespace') ? 3 : 0;
			} elsif ($state == 3) {
				if ($t->[0] eq 'text') {
					push @result, $t->[1];
					last;
				}
				$state = 0;
			}
			if ($state == 0) {
				$state = 1 if ($t->[0] eq 'whitespace');
			}
		}
	}
	@result;
}

sub collect_header {
	my ($rawp) = @_;

	my $h = {};
	my $last = undef;
	while (<STDIN>) {
		$$rawp .= $_;
		s/\s+$//;
		return $h if (/^$/);
		if (/^\s/) {
			if ($last) {
				s/^\s+//;
				$$last .= "\n" . $_;
			}
		} elsif (/^([a-zA-Z0-9-]+):\s*(.*)$/) {
			my $key = lc $1;
			my $value = $2;
			if (!exists($h->{$1})) {
				$h->{$key} = [];
			}
			push @{$h->{$key}}, $value;
			$last = \($h->{$key}->[scalar(@{$h->{$key}})-1]);
		} else {
			exit 0;
		}
	}
}

sub clean_text {
	my ($s) = @_;

	# plain text in the headers should be ASCII
	$s =~ s/[\x00-\x1f\x7f-\xff]/\x{fffd}/g;
	$s;
}

sub rfc2047 {
	my ($s) = @_;

	unless ($s =~ /^=\?([_a-zA-Z0-9-]+)\?([^\?\s]*)\?(\S*?)\?=$/) {
		return clean_text($s);
	}

	my ($charset, $coding, $c) = ($1, $2, $3);
	my $plain;

	if (($coding eq 'B') || ($coding eq 'b')) {
		$plain = MIME::Base64::decode_base64($c);
	} elsif (($coding eq 'q') || ($coding eq 'Q')) {
		$c =~ s/_/ /g;
		$plain = MIME::QuotedPrint::decode_qp($c);
	} else {
		return "\x{fffd}";
	}

	my $new;
	if (lc($charset) eq 'iso-2022-jp') {
		# Perl bug: iso-2022-jp encoding does not respect FB_DEFAULT mode.
		# It handles it as Encode::FB_PERLQQ
		$new = '';
		while ($plain ne '') {
			$new .= Encode::decode($charset, $plain, Encode::FB_QUIET);
			last if ($plain eq '');
			$new .= "\x{fffd}";
			substr($plain, 0, 1) = '';
		}
	} else {
		# Get replacement characters
		$new = Encode::decode($charset, $plain, Encode::FB_DEFAULT);
	}
	$new;
}
