package HTML::JFilter;

use strict;
use HTML::Parser;
use Tie::Scalar;
require Exporter;
use vars qw($tagname $paramname $AllowXHTML @EXPORT_OK @ISA $VERSION %strip_tag);
@EXPORT_OK = qw(html_tag arg_escape StripHTML PolishHTML);
@ISA = qw(Exporter);

$VERSION = '0.8.1';

$AllowXHTML = 1;

$tagname = '[\w!][\w\d-]*';
$paramname = '[\w!][\w\d-]*';

%strip_tag = (
	script => 1,
	style => 1,
	applet => 1,
	title => 1
);

sub parseFilter {
	my $filter = shift;
	my %allowed;
	while ($filter =~ /^(.*)$/mg) {
		my $line = $1;
		$line =~ s/^\s+//;
		$line =~ s/[#';].*$//;
		$line =~ s/\s+$//;
		next if $line eq '';

		die "Malformed filter definition line: $line\n"
			unless $line =~ /^$tagname(?:\s+$tagname)*(?:\s*:\s*$paramname(?:\s+$paramname)*)?$/o;

		$line = lc $line;
		my ($tags, $params) = split /\s*:\s*/, $line, 2;
		my @tags = split /\s+/, $tags;

		if ($params) {
			my @params = split /\s+/, $params;
			my %param_hash;
			@param_hash{@params} = ();
			@allowed{@tags} = (\%param_hash) x scalar(@tags) ;
		} else {
			@allowed{@tags} = () ;
		}

	}
	return \%allowed;
}

sub html_tag {
    my ( $tag, $args) = @_;
    my $html;
    $html = "<$tag";
	if ($args and ref($args)) {
		my $key;
		foreach $key (keys %$args) {
			if (defined $args->{$key}) {
				$html .= " $key=".arg_escape($args->{$key});
			} else {
				$html .= ' '.$key;
			}
		}
	}
    $html .= ">";
    return $html;
}

sub arg_escape {
	my $arg = shift;
	return qq{"$arg"} if ($arg !~ /"/);
	return qq{'$arg'} if ($arg !~ /'/);
	$arg =~ s/"/&dblquote;/g;
	return qq{"$arg"};
}

sub new {
	my $pkg = shift;
	my $allowed = parseFilter(shift());
	my $result = '';

	my $skip_comment = shift(); $skip_comment = 'yes' unless defined $skip_comment;
	if (!$skip_comment or lc($skip_comment) eq 'no') {
		$skip_comment = [];
	} elsif (lc($skip_comment) eq 'ssi') {
		my $skip_ssi = sub {
			if ($_[0] !~ /^<!--#/) {
				$result .=  $_[0];
			}
		};
		$skip_comment = [comment_h => [ $skip_ssi, "text"]];
	} else {
		$skip_comment = [comment_h => [""]];
	}

	my $skip_body_of;

	my $end_func = sub {
		my($tagname, $origtext) = @_;
		my $lc_tagname = lc $tagname;
		if (defined $skip_body_of and $lc_tagname eq $skip_body_of) {
			undef $skip_body_of;
		} elsif (exists $allowed->{lc $tagname}) {
			$result .= $origtext
		}
	};

	my $start_func = sub {
		return if defined $skip_body_of;
		my($tagname, $attrs, $origtext) = @_;
		my $lc_tagname = lc $tagname;
		unless (exists $allowed->{$lc_tagname}) {
			if ($strip_tag{$lc_tagname}) {
				$skip_body_of = $lc_tagname;
			}
			return;
		}
		my $allowed_attrs = $allowed->{$lc_tagname};
		unless (defined $allowed_attrs) {
			$result .= html_tag($tagname);
			return;
		}

		my $filtered_attrs;
		foreach my $attr (%$attrs) {
			$filtered_attrs->{$attr} = $attrs->{$attr}
				if exists $allowed_attrs->{lc $attr};
		}

		$result .= html_tag($tagname, $filtered_attrs);
	};

	my $parser = HTML::Parser->new( api_version => 3,
		start_h => [$start_func, "tagname, attr, text"],
		end_h   => [$end_func,   "tagname, text"],
		@$skip_comment,
		default_h	=> [sub {$result .= $_[0] unless defined $skip_body_of}, "text"],
		marked_sections => 1,
		boolean_attribute_value => undef,
	);
	$parser->xml_mode(1) if $AllowXHTML;

	my $self = {parser => $parser, result => \$result};
	bless $self, 'HTML::JFilter';
	return $self;
}

sub HTML::JFilter::doSTRING {
	my $self = shift();
	$self->{parser}->parse(shift())->eof;

	my $result = ${$self->{result}};
	${$self->{result}} = '';
	return $result;
}

@HTML::JFilter::IO::ISA = qw(Tie::StdScalar);
sub HTML::JFilter::IO::FETCH {};
sub HTML::JFilter::IO::STORE {print {(shift())->{FH}} @_};
sub HTML::JFilter::IO::TIESCALAR {my $self = {FH => $_[1]}; bless $self, 'HTML::JFilter::IO' };

sub HTML::JFilter::doFILE {
	my $self = shift();
	my ($source,$dest) = @_;
	my $FH;

    if (!ref($dest) && ref(\$dest) ne "GLOB") {
		# we got a file name
		open $FH, "> $dest" or die "Cannot create $dest : $!\n";
		tie ${$self->{result}}, 'HTML::JFilter::IO', $FH;
    } else {
		tie ${$self->{result}}, 'HTML::JFilter::IO', $dest;
	}

	$self->{parser}->parse_file($source)->eof;

	untie ${$self->{result}};
	close $FH if defined $FH;

	return 1;
}


{
my $stripHTML;
my $result = '';
my $skip;
my @listtype;

sub StripHTML {

	if (! defined $stripHTML) {
		$stripHTML = HTML::Parser->new( api_version => 3,
			start_h	=> [
				sub {
					my $tagname = lc $_[0];
					if ($strip_tag{$tagname}) {
						$skip = $tagname;
					} elsif ($tagname eq 'li') {
						if (scalar(@listtype)>1) {
							$result .= '  ' x (scalar(@listtype)-1);
						}
						if ($listtype[-1]) {
							$result .= "$listtype[-1]. ";
							$listtype[-1]++;
						} else {
							$result .= '- '
						}
					} elsif ($tagname eq 'ul') {
						push @listtype, 0;
					} elsif ($tagname eq 'ol') {
						push @listtype, 1;
					}
				}, "tagname"],
			end_h	=> [
				sub {
					my $tagname = lc($_[0]);
					if ($tagname eq $skip) {
						undef $skip;
					} elsif ($tagname eq 'ul' or $tagname eq 'ol') {
						pop @listtype;
					}
				}, "tagname"],
			text_h	=> [sub {$result .= $_[0] unless defined $skip}, "dtext"],
			marked_sections => 1,
			boolean_attribute_value => undef,
		);
		$stripHTML->xml_mode(1) if $AllowXHTML;
	}

	undef $skip;
	$stripHTML->parse(@_)->eof();
	undef $skip;

	my $tmp;
	($tmp, $result) = ( $result, '');
	$tmp =~ tr/\xA0/ /;
	$tmp =~ s/(?:\x0D\x0A?|\x0A)/\n/sg;
	$tmp =~ s/[ \t]+$//gm; # remove spaces at the end of lines
	return $tmp;
}
}

sub PolishHTML {
	my $str = shift;
	if ($AllowXHTML) {
		$str =~ s{(.*?)(&\w+;|&#\d+;|<\w[\w\d:\-]*(?:\s+\w[\w\d:\-]*(?:\s*=\s*(?:[^" '><\s]+|(?:'[^']*')+|(?:"[^"]*")+))?)*\s*/?>|</\w[\w\d:\-]*>|<!--.*?-->|$)}
		         {HTML::Entities::encode($1, '^\r\n\t !\#\$%\"\'-;=?-~').$2}gem;
	} else {
		$str =~ s{(.*?)(&\w+;|&#\d+;|<\w[\w\d:\-]*(?:\s+\w[\w\d:\-]*(?:\s*=\s*(?:[^" '><\s]+|(?:'[^']*')+|(?:"[^"]*")+))?)*\s*>|</\w[\w\d:\-]*>|<!--.*?-->|$)}
		         {HTML::Entities::encode($1, '^\r\n\t !\#\$%\"\'-;=?-~').$2}gem;
	}
	return $str;
}

1;

=head1 NAME

HTML::JFilter - module for filtering HTML

Version 0.8.1

based on HTML::Parser

=head1 SYNOPSIS

	use HTML::JFilter;
	$filter = new HTML::JFilter <<'*END*'
	b i code pre br
	a: href name
	font: color size style
	*END*

	$filteredHTML = $filter->doSTRING($enteredHTML);

=head1 DESCRIPTION


=head2 Methods

=over 4

=item new

	$filter = new HTML::JFilter $allowedTags [, $removeComments]

Creates a new HTML::JFilter object.

The $allowedTags is a string that contains all allowed HTML tags and their allowed attributes.
The format is:

	tagname tagname tagname ....
	tagname tagname : attribute attribute ...
	# comment
	tagname # comment
	tagname ; comment
	tagname ' comment
	...

The $removeComments specifies whether to remove the comments in the HTML.
The possible values are:

	no - leave them alone
	ssi - remove only the SSI ones
	yes (or any other true value) - remove all comments

The default is "yes".

Returns the created object in case of success, die()s otherwise.

=item doFILE

	$filter->doFILE( $fromfile, $tofile);

Reads the contents of $fromfile, processes them and writes the result into $tofile.

Both $fromfile and $tofile may be either filenames of FILEHANDLEs or IO objects.

=item doSTRING

	$result = $filter->doSTRING( $htmltext );

Processes the HTML in the variable and returns the result.

=back

=head2 Helper functions

You can import all following functions to your namespace by C<use HTML:JParser qw(function_name)>. Otherwise you
have to specify the package name like this: C<HTML::JParser::function_name(...>

=over 4

=item html_tag

	print html_tag( $tagname, \%parameters);

This function returns a string containing the tag $tagname with all the %parameters properly quoted.
Eg.

	print html_tag( 'a', {href => q<JavaScript:Foo( "bar", 'baz' )>, target => '_blank'});


=item arg_escape

	$tag = "<FOO bar=" . arg_escape( $argument ) . ">";

Returns the $argument quoted and escaped.

=back


=head1 AUTHOR

Jan Krynicky <Jenda@Krynicky.cz>
http://Jenda.Krynicky.cz

=head1 COPYRIGHT

Copyright (c) 2002 Jan Krynicky <Jenda@Krynicky.cz>. All rights reserved.

This program is free software; you can redistribute it and/or
modify it under the same terms as Perl itself.

=cut
