#!/usr/bin/perl -w

#
# copyright notice: copyright 2005 David A Thompson
#
# statement of copying permission: 
# this program is distributed under the terms of the GNU General Public License 
# (or the Lesser GPL) and can be distributed under the GNU GPL version 2 or 
# later
#

#
# tag-remove
#	- removes xml-style tags from an html, xhtml, xml, ... file
#	- "cleans the file up", removing (some) extra spaces and DOS-type control characters, etc.
#	- writes the new file to filename.out
#
#	"smart" features:
#		- merges paragraphs (stuff bracketed by <p> and </p>)
#			into single line
#		- preserves link information <a href="blabla" > tags
#


########################
#
# process command line
#
########################
#
# use @flag to hold all flags
# flagl = flag array length -1
# @filenames - used to hold all file names in command line
# fnl = number of elements in @filenames -1

$avlength=@ARGV-1;

# scan ARGV first for "--joebob" variables and put them in @flag
# and put other ARGV elements in @fn
$j=0;
$k=0;
for $i (0 .. $avlength)
{
	#print "argv-$i is ->$ARGV[$i]<-\n";
    if ( $ARGV[$i] =~ /--\w+?/ )
    {
	$flag[$j]=$ARGV[$i];
	$j++;
    }
    else
    {
	$filenames[$k]=$ARGV[$i];
	$k++;
    }
}
$flagl=$j-1;


#
# is --debug passed?
#
$db=0;
for $i (0 .. $flagl)
{
    if ( $flag[$i] && $flag[$i] =~ m/--debug/)
    { 
	$db=1;
	last;
    }
}
if ($db) {
        print "flags are @flag\nfilenames are @filenames\n";
	print "debugging on\n";
}


#
# is --help passed?
#
for $i (0 .. $flagl)
{
    if ( $flag[$i] && $flag[$i] =~ m/--help/)
    { 
	@help = "\ntag-remove [--help ][--debug] [--noplnk] [--nosmart] [filename1] [filename2] ... [filename n]\n\n\nsee documentation distributed with package for additional information\n\n";
	print @help;
	exit;
    }
}

#
# is --noplnk passed?
#	- preserving links is on by default
$plnk=1;
for $i (0 .. $flagl)
{
    if ( $flag[$i] =~ m/--noplnk/)
    { 
	$plnk=0;
	if ($db) { print "plnk is set to $plnk\n"; }
	last;
    }
}

#
# is --nosmart passed?
#	- tells tag-remove not to be smart:
#		- don't group lines from a paragraph
#
#	variables:
#
#	$smart - when defined as 1, tell tag-remove to be smart
$smart=1;
for $i (0 .. $flagl)
{
    if ( $flag[$i] =~ m/--nosmart/)
    { 
	$smart=0;
	if ($db) { print "smart is set to $smart\n"; }
	last;
    }
}

#
# process files
#
$fnl=@filenames;
unless ($filenames[0])
{
	print "You must at least supply a filename for tag-remove to process\n";
	die;
}
for $zz (1 .. $fnl)
{
	$filename=$filenames[$zz-1];
	PROCESS($filename);
}

####################################################
####################################################
#
# start of routine which is called for each file
#
####################################################
####################################################
#
# order of events:
#	- open file and put into array
#	- strip out unneeded control characters
#	- rewrite file to ensure all lines are separate array elements
#	- strip out whitespace at ends of lines
#	- remove tags
#	- look for tags that span multiple lines
#	- remove all blank lines at the start of the file

sub PROCESS {

# initialize all local variables
my @lines=();
my @linesout=();


#
# open file-to-process and put file content in array lines
#

# get info from @filein array now...
if ($db) { print "at sub is ->@_<-\n"; }

$filein=shift(@_);

# replace any suffix with .out
if ( $filein =~ /(.+)\..*/ )	
{
	$fileout="$1.out";
# 	print "match is --> $1 <--\n\n";
# 	print "new filename is $filein\n";
}

#
# open file-to-process and put file content in array lines
#
die "Can't open file name ->$filein<-" unless open(INFO,$filein);

if ($db) { print "\nOpening $filein\n"; }
open(INFO,$filein);
@lines=<INFO>;
close(INFO);

#
# figure out what filein looks like
#
$lineslength=@lines;							#

#
# create a duplicate array (linesout) that will be
# the output
#
for ( $i=0; $i < $lineslength; $i++)
{
	$linesout[$i]=$lines[$i];
}

if ($db)
{ print "\nGenerated linesout, linesout is: @linesout\n"; }


###################################
#
# PROCESS TEXT FILE-TO-PROCESS
#
###################################

$llength=@linesout;

#
# strip out ^M and other CR-ish stuff not needed with linux
#
foreach (@linesout)
{
	$_ =~ s/[\cM\r]/\n/g;
}
# need to rewrite and reread file so that \n's are now processed correctly
# and each line ends up as a separate array element
# -- write it to temp file
unlink("/tmp/tmp.otl");
open(FOUT, ">/tmp/tmp.otl");
flock(FOUT,2);
print FOUT @linesout;
close (FOUT);
# now read it
@linesout=();
open(INFO,"/tmp/tmp.otl");
@linesout=<INFO>;
close(INFO);

#
# $llength is length of linesout
#
$llength=@linesout;
#if ($db) 
#{	print "\nlinesout after rewrite is $llength lines long:\n";
#	for ($i=0;$i<$llength;$i++) 
#	{
#		print " line $i ->$linesout[$i]<-\n"; 
#	}
#}


#
foreach (@linesout)
{
	# strip out any whitespace at ends of lines
	chomp ($_);
	$_ = "$_\n";
	# remove any &nbsp; and replace w/space
	$_ =~ s/&nbsp;/ /g;
}

#
# start removing all of those tags
#
################
#
# variables
#	deletearray - contains line numbers to delete
#		example: 0,1,4,8 would indicate to delete lines 0,1,4,8
#	deleteindex - contains next position in deletearray
#	@parray
#		- array storing first and last line pairs for paragraphs
#		based on where <p> and </p> are found
#	$parrayindex
#		- points to location in array where next item should be stored
#
#
# order of events:
#
#	- loop through linesout and:
#		- check for paragraph tag locations
#		- check for and remove complete tags on single lines
#			- plnk on?
#		- check if only thing on line is a tag
#			- put line number in delete array if nothing except a tag
#	- after loop is finished
#		- merge paragraph lines
#			- transfer empty lines to delete array
#		- delete all lines in delete array
#
#	- delete tags that span multiple lines (< and > aren't on same line)


@deletearray=();
$deletearrayindex=0;
@parray=();
$parrayindex=0;

if ($db) { print "\nChecking line"; }
for ( $i=0; $i < $llength; $i++)
{
	if ($db) { print " $i"; }

	# paragraph checking
	# 	- do it unless --nosmart is set
	#	
	if ($smart)
	{
		if ( $db )
		{
			print "\n\tCheck for paragraph tags...\n";
		}
		# look for <p> or </p> but don't match <pre> or other 
		# similar stuff
		if ($linesout[$i] =~ m/<p[ >]/ || $linesout[$i] =~ m/<\/p[ >]/ )
		{
			if ($db) { print "\n\t\tfound paragraph start/end at line $i"; }
			$parray[$parrayindex]=$i;
			$parrayindex++;
			if ($db) { print "\n\t\tparray is now @parray and parrayindex is now $parrayindex"; }
			# check if </p> is on same line
			if ($linesout[$i] =~ m/<p[ >]/ && $linesout[$i] =~ m/<\/p/ )
			{
				if ($db) { print "\n\t\tfound paragraph start AND end at line $i"; }
				$parray[$parrayindex]=$i;
				$parrayindex++;
				if ($db) { print "\n\t\tparray is now @parray and parrayindex is now $parrayindex"; }
			}
		}
	}

	# does line have a complete tag (< and > on same line)?
	if ( $linesout[$i] =~ m/<[^>]*>/ )
	{
		if ($db) { print "\n\tCheck for complete tag on line"; }
		if ( $plnk && ($linesout[$i] =~ m/<a / || $linesout[$i] =~ m/<\\a/ ))
		{
			# remove anything that doesn't look like <>, <a
			# if $plnk is set
			$linesout[$i] =~ s/<[^a^(\\a)]*>//g;
					
			if ($db)
			{
				print "\t\tplnk on: OUT line ->$linesout[$i]<- \n";
			}
		}
		# if there isn't a <a AND plnk isn't set...
		else
		{
			$linesout[$i] =~ s/<[^>]*>//g;
			if ($db)
			{
				print "\n\t\tplnk off: OUT line ->$linesout[$i]<- \n";
			}
		}

		# check if there is anything on the line that isn't a tag
		# delete the line if the only thing present is a tag
		# - need to adjust llength and i if the line is deleted
		unless ($linesout[$i] =~ m/\S/ )
		{
			$deletearray[$deletearrayindex]=$i;
			$deletearrayindex++;
			if ($db)
			{
				print "\tflagging blank line $i for deletion\n";
			}
		}
		
		#
		# look for lines that have only blank spaces and get rid of spaces and tabs
		# ** doesn't above routine flag these lines for deletion?
		#
		if ($db) 
		{ 
			print "\tBlank space cleanup: checking line $i: $linesout[$i]<--"; 
		}
		if ( $linesout[$i] =~ m/^\s+$/ )	# look for a line with only empty space tags
		{
			if ($db) 
			{
				print "\t\tMATCHED line $i: $linesout[$i]"; 
			}
			$linesout[$i] = "\n"; # replace with carriage return
			if ($db) { print "\t\tOUT line $linesout[$i] \n"; }	# output file line
		}
	}
}

if ($db) { 
	print "parray is @parray\n";
	print "deletearray is @deletearray\n";
}

###
#
# group paragraph lines if --nosmart isn't defined
#
# variables
#
#	pstart = paragraph start line
#	pend = paragraph end line
#
#	newline = new line formed by adding lines together
#	addline = line to be added to newline
#
# move through parray a pair at a time
# and
#	- construct new lines from linesout
#	leave other lines 'as is' for the time being
if ($db) { print "\nGenerating merged paragraph lines...\n"; }
for ( $i=0; $i < $parrayindex; $i++)
{
	$pstart=$parray[$i];
	$pend=$parray[$i+1];
	if ($db) { print "\tmerging from line $pstart to line $pend\n"; }
	# start putting lines together unless no grouping needed
	unless ( $pstart==$pend ) 
	{
		$newline="";
		$firsttextline=0;	# set to zero until we hit first text line
		# merge lines pstart through pend
		for ( $j=$pstart; $j <= $pend; $j++)
		{
			$addline=$linesout[$j];
			# chomp line unless it's the last line in the paragraph
			if ($j < $pend) { chomp $addline; }
			# pretty spacing between paragraph lines
			#	- really pretty spacing: don't add space to first line
			#	--->>>> identify first line of TEXT in paragraph, not
			#		just first line ($pstart) of paragraph...
			
			if ( $addline =~ m/\S/ )
			{
				if ($firsttextline==0)
				{
					$firsttextline=1;
				}
				else
				{
					$addline=" $addline";
				}
			}
			$newline="$newline$addline";
			if ($db) 
			{
				print "\t\t\taddline is ->$addline<-\n\t\t\tnewline is $newline\n";
			}
		}
		# put newline in linesout
		if ($db) { print "\tNew paragraph line (line $pstart in linesout) is: $newline\n"; }
		$linesout[$pstart]=$newline;
	}
	$i++;
}

#######
#
# now delete lines from linesout
#	- those that are already in delete array
#	- those that were in paragraphs
#
# variables
#	deletestart - line to begin deleting at for paragraph range
#	deletenumber - number of lines to delete including deletestart for paragraph range
#	deleteline - single line number to delete (from deletearray)
#
#	i - index variable for parray
#	j - index variable for deletearray
#

# convert parray information to a second delete array, pdeletearray
# then sort and merge the two deletearray contents, removing any redundancies
#	firstline=first line to delete as indicated by parray
#	lastline=last line to delete as indicated by parray
#

# first need to remove paragraph start lines from deletearray since they are 
# converted into merged paragraph lines
#	
#	lookline - line we're looking for in deletearray
#	lastn	- used to drop from loop 1 or loop 2
$last0=0;
$i=0;
if ($db) { print "parrayindex is $parrayindex\n"; }
while ($i<$parrayindex && $last0==0)
{
	if ($db)
	{
		print "\ti is $i and parray[i] is $parray[$i]\n";	
	}
	$j=0;
	$last1=0;
	while ($last1==0 && $j<$deletearrayindex )
	{
		# if we've found it in deletearray, remove it
		if ($deletearray[$j]==$parray[$i])
		{
			if ($db)
			{
				print "deleted element $j from deletearray (@deletearray)\n";
			}
			splice @deletearray,$j,1;
			
			$deletearrayindex--;
			# we're done with this loop
			$last1=1;
		}
		$j++;
	}
	$i=$i+2;
}
$pdeletearrayindex=0;
@pdeletearray=();
for ($i=0; $i<$parrayindex; $i++)
{
	$firstline=$parray[$i]+1;
	$lastline=$parray[$i+1];
	for ($j=$firstline; $j<$lastline+1; $j++)
	{
		$pdeletearray[$pdeletearrayindex]=$j;
		$pdeletearrayindex++;
	}
	$i++;
}

if ($db)
{
	print "\nDeleting linesout lines in paragraphs or with only tags\n";
	print "\tpdeletearray is @pdeletearray\n";
	print "\tdeletearray is @deletearray\n";
}

# merge two arrays (pdeletearray and deletearray) into new array (finaldeletearray)
#	
#	point=variable pointing to insert location in deletearrayindex
if ($db) { print "merging arrays"; }
@finaldeletearray=();
$point=0;
$i=0;
while ($i<$pdeletearrayindex)
{
	#if ($db)
	# { print "\tpoint is $point and i is $i\n";
		#print "\t deletearray[point] is $deletearray[$point] and pdeletearray[i] is $pdeletearray[$i]\n"; }
		
	# is pdeletearray value greater than value at insertion point in deletearray?
	# if so 
	# - add deletearray value to finaldeletearray
	if ($pdeletearray[$i] > $deletearray[$point])
	{
		push(@finaldeletearray, $deletearray[$point]);
		$point++;
	}
	# if pdeletearray value is less than value at insertion point then add it
	elsif ($pdeletearray[$i] < $deletearray[$point])
	{	
		push(@finaldeletearray,$pdeletearray[$i]);
		$i++;
	}
	# if pdeletearray value equals the value at the insertion point then 
	# just push one value but bump both indices
	
	else
	{
		push(@finaldeletearray,$pdeletearray[$i]);
		$i++;
		$point++;
	}
}
if ($db) { print "Final deletearray is @finaldeletearray\nDeleting line(s) "; }


# delete lines in @deletearray (finally!)
$l=@finaldeletearray-1;
for ( $j=$l; $j > -1; $j-- )
{
	$deleteline=$finaldeletearray[$j];
	# remove element at position $deleteline
	splice @linesout, $deleteline, 1;
	if ($db) 
	{ 
		#print "\t\tj (deletearray pointer) is $j\n";
		print " $deleteline";
	}
}

#
# look for tags that span multiple lines (i.e., < and > aren't on same line)
#
#	variables:
#
#	$match	- string to look for start tag(s)
#	$i	- index variable to walk through linesout
#	$j	- index variable to walk through linesout looking for a tag end

if ( $plnk ) { $match='<[^a^>]*?[^>]*?$|<[^\/a^>]*?$' }
else { $match='<[^>]*?$'; }

# need to reset llength since we've deleted lines in above routines
$llength=@linesout;
if ($db) { print "\n\nMulti-line tag check ($llength lines): checking line"; }

for ( $i=0; $i < $llength; $i++)
{
	if ($db) 
	{ 
		print " $i ";
	}
	
	if ( $linesout[$i] =~ m/$match/ )
	{
		if ($db) { print "\n\tmatched start tag at line $i\n->$linesout[$i]<-\n"; }		
		
		# look for end of tag on a future line (tags with ends on the same line have already been processed)
		if ($db) { print "\t\t\tend tag check: checking line "; }
		$j=$i+1;
		while ( ($linesout[$j] !~ m/>/) && ($j < $llength) )
		{
			if ($db) { print " $j"; }
			$j++;
		}
		if ( $j >= $llength )
		{
			if ($db) { print "\t\tlinesout length is $llength\n";
			print "\t\tsearched all the way to element $j\n";
			print "\t\tno end tag\n"; }
		}
		# otherwise, we've found the end tag...
		else
		{
			if ($db) { print "\t\tmatched end tag at line $j\n->$linesout[$j]<-\n"; }
			# delete all intervening lines
			$start=$i+1;
			$stop=$j-1;
			$delta=$j-$i;	# this is used later to bump i
			if  ( ($stop - $start) >= 0 )
			{
				# make all intervening lines blank
				if ($db) { print "\tblanking intervening lines from $start to $stop\n\t\tblanked line "; }
				for ($k=$start; $k<=$stop; $k++ )
				{
					$linesout[$k]="\n";
					if ($db) { print "$k "; }
				}
			}
			if ($db) { print "\n\tediting start and end lines $i and $j\n"; }
			# preserve content on lines except for tag line
			$linesout[$j] =~ s/[^>]*>//;
			$linesout[$i] =~ s/<.*//;

## now need to check if anything else is on lines $i and $j
			
# delete the lines if they are now blank (prettifying stuff)
# - need to adjust llength and i if the line is deleted
			# is something on j?
			unless ($linesout[$j] =~ m/\S/ )
			{
				splice @linesout,$j,1;
				if ($db) { print "\t\tdeleted line j: $j"; }
				$llength--;
			}
			if ($db)
			{
				print "\t\tOUT line j->$linesout[$j]<- \n";	# output file line
			}
			unless ($linesout[$i] =~ m/\S/ )
			{
				splice @linesout,$i,1;
				if ($db) { print "\t\tdeleted line i: $i"; }
				$i--;
				$llength--;
			}
			if ($db)
			{
				print "\tOUT line i->$linesout[$i]<- \n";	# output file line
			}
			# since we found end tag, continue scanning from line after end tag
			$i=$i+$delta;
		}
	}
}

# remove any blank lines at end of document
$x=$llength-1;
if ($db) { print "checking for lines at end\n";
	print "linesout-llength-1 is $linesout[$x]\n";
}

while ($linesout[$x] !~ m/\w/ )
{
	pop @linesout;
	$x--;
	if ($db) 
	{
		print "linesout-llength-1 is now $linesout[$x]\n";
	}
}
# remove any blank lines at start of document
if ($db) { 
	print "\nchecking for blank lines at start...\n";
	print "\tline 0 is ->$linesout[0]<-\n"; 
	}

while ($linesout[0] !~ m/\w/ )
{
	shift @linesout;
	$llength--;
	if ($db) { print "\n\t removed line 0; ...line 0 is now ->$linesout[0]<-\n"; } 
}

#
# OUTPUT THE OUTPUT FILE WHEN FINISHED
#
#
open(FOUT, ">$fileout");
flock(FOUT,2);
print FOUT @linesout;
if ($db) { print "Output file is $fileout\n"; }
close (FOUT);
}
