#! /opt/local/bin/perl
####
##
## IMPORTANT!: The path of perl listed above MUST be the path
## used on your system!
###########################################################################
#
# html2text
#
# A perl script designed to read HTML files and convert them into
# some form of text representation. The output format is controlled
# by "tag" files that are included automatically based on the name of
# the executable (for example, naming this file "html2latex" would
# load the "html2latex.tag" file automatically). Users can provide
# their own tag files to customize the translation even further.
#
# Usage:
#
# html2text [options] [infile] [outfile]
#
# where "options" is one of
#
# -f formatfile specifies an additional .tag file to load
#
# (see also the .tag files for more options defined there)
#
# and "infile" is the name of the .html file to parse (or "-" for
# standard input), and "outfile" is the name of the file to write (or
# "-" for standard out). If "infile" is missing it defaults to "-".
# If "outfile" is missing, it defaults to the input file name with
# its extension replaced by ".txt" (or possibly some other string
# specified in the .tag file), or to "-" if "infile" is "-".
#
# The .tag files are perl scripts that define variables and
# subroutines that extend the functions of html2text. These are what
# make it possible to output special dialects like TeX, LaTeX, etc.
# They usually contain definitions of $htmlChar, $htmlTag and
# $htmlEntity variables, plus some associated subroutines.
# See the .tag files for more information on how they work.
#
# Environment Variables:
#
# HTML2FORMAT directory path where .tag files are stored
# HTML2TEXT pointer to user-specified customization file
# for html2text command
# XXX if this perl script is renamed to be some other
# name (like "html2latex") then the environment
# variable with that name (e.g. "HTML2LATEX")
# will point to the user-specified custom .tag file
# for that format.
#
###########################################################################
#
# Update History:
#
# who when what
# ------ -------- ------------------------------------------
# schaefer 7/97 Fixed bugs involving $htmlFormatPath and &htmlTag
#
# dpvc 1/96 Added script path to $htmlFormatPath automatically
#
# dpvc 11/95 Fixed a bug with removal of leading spaces.
# Convert TABs to SPACEs before the line is processed.
#
# dpvc 10/95 Wrote it.
#
#
# This is the path where .tag files can be found. You may need to
# customize this for your installation. This should be the only item
# that is installation-dependent. The directory where this script
# runs from is added automatically to the path.
#
$htmlFormatPath = ".";
{
  #
  #  Everything in $0 before its last "/" is the directory the script
  #  was started from.  It is appended to the search path unless it is
  #  empty or is just "." (which is already on the path).
  #
  my $slash = rindex($0,"/");
  my $dir = ($slash >= 0) ? substr($0,0,$slash) : "";
  $htmlFormatPath = join(":",$htmlFormatPath,$dir) if ($dir ne "" && $dir ne ".");
}
#
# The maximum width of a line of text for the output file
#
$htmlWidth = 78;
#
#  Extension given to the output file when its name is derived from
#  the input file name (a .tag file may change it).
#
$htmlExtension = ".txt";
#
#  Strings written before the first and after the last piece of
#  converted text.  They are empty here; a .tag file normally sets
#  them (a LaTeX format, for example, needs a preamble and a closing
#  command around the document).
#
($htmlInitialString,$htmlFinalString) = ("","");
#
#  When $htmlPrintInitFinStr is true (the default) &htmlMain prints
#  the two strings above.  The "-noinitstring" command-line flag sets
#  it to 0 to suppress them, which is useful when the output is to be
#  included in another file.
#
$htmlPrintInitFinStr = 1;
#
#  $htmlBreakChars lists characters before which an over-long line may
#  be broken when it contains no usable space.  The string is placed
#  inside a regular-expression character class by &htmlAddToLine, so
#  characters that are special there must be escaped.  Whenever a line
#  is broken at a non-space, $htmlBreakNL is appended to the end of
#  the broken line (for example "%" is appropriate for TeX output).
#  Both are empty by default.
#
($htmlBreakChars,$htmlBreakNL) = ("","");
#
# These are the special characters in HTML. You can add other ones
# by defining more entries in this associative array. For example,
# you can use $htmlChar{"{"} = '\{' to make "{" translate to "\{" in
# the output file (this is appropriate for TeX output).
#
# If the right-hand side begins with an ampersand (&) then the string
# is executed via an "eval" statement when that character appears in
# the input file; otherwise, the right-hand side is inserted as is
# into the output file in place of the input character.
#
# The two examples below call perl subroutines when their associated
# characters are found in the input file; the example above replaces
# the input character with a literal string.
@htmlChar{"<","&"} = ('&htmlTag','&htmlEntity');
#
#  What &htmlTag and &htmlEntity write when a "<" or "&" in the input
#  turns out to be a literal character rather than the start of a tag
#  or an entity.
#
($htmlLtString,$htmlAmpString) = ("<","&");
#
# The following associative array defines the translations for the
# different HTML entity names. As with $htmlChar above, if the
# string begins with an ampersand (&), then the string is executed
# when the entity appears in the input file, otherwise the string
# is sent to the output file as is.
#
# If an entity appears that is not named here, then its name is sent
# to the output file instead.
#
$htmlEntity{"amp"} = '&htmlPrint($htmlAmpString)';
$htmlEntity{"lt"} = '<';
$htmlEntity{"gt"} = '>';
$htmlEntity{"nbsp"} = ' ';
$htmlEntity{"iexcl"} = '';
$htmlEntity{"cent"} = '';
$htmlEntity{"pound"} = '#';
$htmlEntity{"curren"} = '';
$htmlEntity{"yen"} = '';
$htmlEntity{"brvbar"} = '|';
$htmlEntity{"brkbar"} = '|';
$htmlEntity{"sect"} = '';
$htmlEntity{"uml"} = '';
$htmlEntity{"copy"} = '(c)';
$htmlEntity{"ordf"} = '';
$htmlEntity{"laquo"} = '<<';
$htmlEntity{"not"} = '-';
$htmlEntity{"shy"} = '';
$htmlEntity{"reg"} = '(r)';
$htmlEntity{"hibar"} = '-';
$htmlEntity{"deg"} = '';
$htmlEntity{"plusmn"} = '+-';
$htmlEntity{"sup2"} = '^2';
$htmlEntity{"sup3"} = '^3';
$htmlEntity{"acute"} = "'";
$htmlEntity{"micro"} = '';
$htmlEntity{"para"} = '';
$htmlEntity{"middot"} = '.';
$htmlEntity{"cedil"} = '';
$htmlEntity{"sup1"} = '^1';
$htmlEntity{"ordm"} = '';
$htmlEntity{"raquo"} = '>>';
$htmlEntity{"frac14"} = '1/4';
$htmlEntity{"frac12"} = '1/2';
$htmlEntity{"frac34"} = '3/4';
$htmlEntity{"iquest"} = '';
$htmlEntity{"Agrave"} = 'A';
$htmlEntity{"Aacute"} = 'A';
$htmlEntity{"Acirc"} = 'A';
$htmlEntity{"Atilde"} = 'A';
$htmlEntity{"Auml"} = 'A';
$htmlEntity{"Aring"} = 'A';
$htmlEntity{"AElig"} = 'AE';
$htmlEntity{"Ccedil"} = 'C';
$htmlEntity{"Egrave"} = 'E';
$htmlEntity{"Eacute"} = 'E';
$htmlEntity{"Ecirc"} = 'E';
$htmlEntity{"Euml"} = 'E';
$htmlEntity{"Igrave"} = 'I';
$htmlEntity{"Iacute"} = 'I';
#
#  "Icirc" is the real entity name.  The misspelled "Icurc" key is
#  kept as well because the @htmlNumberEntity table refers to it by
#  that spelling for character 206.
#
$htmlEntity{"Icirc"} = 'I';
$htmlEntity{"Icurc"} = 'I';
$htmlEntity{"Iuml"} = 'I';
$htmlEntity{"ETH"} = '';
$htmlEntity{"Dstrok"} = '';
$htmlEntity{"Ntilde"} = 'N';
$htmlEntity{"Ograve"} = 'O';
$htmlEntity{"Oacute"} = 'O';
$htmlEntity{"Ocirc"} = 'O';
$htmlEntity{"Otilde"} = 'O';
$htmlEntity{"Ouml"} = 'O';
$htmlEntity{"times"} = 'x';
$htmlEntity{"Oslash"} = 'O';
$htmlEntity{"Ugrave"} = 'U';
$htmlEntity{"Uacute"} = 'U';
$htmlEntity{"Ucirc"} = 'U';
$htmlEntity{"Uuml"} = 'U';
$htmlEntity{"Yacute"} = 'Y';
$htmlEntity{"THORN"} = '';
$htmlEntity{"szlig"} = 'ss';
$htmlEntity{"agrave"} = 'a';
$htmlEntity{"aacute"} = "a";
$htmlEntity{"acirc"} = 'a';
$htmlEntity{"atilde"} = 'a';
$htmlEntity{"auml"} = 'a';
$htmlEntity{"aring"} = 'a';
$htmlEntity{"aelig"} = 'ae';
$htmlEntity{"ccedil"} = 'c';
$htmlEntity{"egrave"} = 'e';
$htmlEntity{"eacute"} = 'e';
$htmlEntity{"ecirc"} = 'e';
$htmlEntity{"euml"} = 'e';
$htmlEntity{"igrave"} = 'i';
$htmlEntity{"iacute"} = 'i';
$htmlEntity{"icirc"} = 'i';
$htmlEntity{"iuml"} = 'i';
$htmlEntity{"eth"} = '';
$htmlEntity{"ntilde"} = 'n';
$htmlEntity{"ograve"} = 'o';
$htmlEntity{"oacute"} = 'o';
$htmlEntity{"ocirc"} = 'o';
$htmlEntity{"otilde"} = 'o';
$htmlEntity{"ouml"} = 'o';
$htmlEntity{"divide"} = '/';
$htmlEntity{"oslash"} = 'o';
$htmlEntity{"ugrave"} = 'u';
$htmlEntity{"uacute"} = 'u';
$htmlEntity{"ucirc"} = 'u';
$htmlEntity{"uuml"} = 'u';
$htmlEntity{"yacute"} = 'y';
$htmlEntity{"thorn"} = '';
$htmlEntity{"yuml"} = 'y';
#
# The special entity form "&#nnn;" is handled by inserting the ASCII
# character with number nnn if nnn < 160, otherwise the correct
# entity from the list above is selected. The association of numbers
# to names is done with the following array.
#
#
#  Element 0 corresponds to character 160, so the name for character
#  nnn is $htmlNumberEntity[nnn-160].  The table has 96 entries and
#  ends at character 255.  The spelling "Icurc" matches the key used
#  in %htmlEntity.
#
@htmlNumberEntity =
(
  "nbsp",   "iexcl",  "cent",   "pound",  "curren", "yen",    "brvbar", "sect",     # 160 - 167
  "uml",    "copy",   "ordf",   "laquo",  "not",    "shy",    "reg",    "hibar",    # 168 - 175
  "deg",    "plusmn", "sup2",   "sup3",   "acute",  "micro",  "para",   "middot",   # 176 - 183
  "cedil",  "sup1",   "ordm",   "raquo",  "frac14", "frac12", "frac34", "iquest",   # 184 - 191
  "Agrave", "Aacute", "Acirc",  "Atilde", "Auml",   "Aring",  "AElig",  "Ccedil",   # 192 - 199
  "Egrave", "Eacute", "Ecirc",  "Euml",   "Igrave", "Iacute", "Icurc",  "Iuml",     # 200 - 207
  "ETH",    "Ntilde", "Ograve", "Oacute", "Ocirc",  "Otilde", "Ouml",   "times",    # 208 - 215
  "Oslash", "Ugrave", "Uacute", "Ucirc",  "Uuml",   "Yacute", "THORN",  "szlig",    # 216 - 223
  "agrave", "aacute", "acirc",  "atilde", "auml",   "aring",  "aelig",  "ccedil",   # 224 - 231
  "egrave", "eacute", "ecirc",  "euml",   "igrave", "iacute", "icirc",  "iuml",     # 232 - 239
  "eth",    "ntilde", "ograve", "oacute", "ocirc",  "otilde", "ouml",   "divide",   # 240 - 247
  "oslash", "ugrave", "uacute", "ucirc",  "uuml",   "yacute", "thorn",  "yuml"      # 248 - 255
);
#
# The following definitions determine the way that the different HTML
# tags will be handled. Tag names must appear below in all upper
# case (though they can be in mixed case in the input file). Tags
# that are not defined will be ignored (and will not appear in the
# output file).
#
# As with $htmlChar and $htmlEntity above, if a tag's value begins
# with an ampersand (&), then the string will be executed when the
# tag appears in the input file, otherwise the string will be sent to
# the output file. For example, the definitions below mean that the
# <P> paragraph tag will translate to a blank line in the output file,
# while the <IMG> tag will call the &htmlIMG subroutine for further
# processing. A tag such as <TABLE> will be removed from the file,
# since it has no definition below.
#
# The .tag files can augment the basic functionality of this file by
# defining translation strings for new tags, or different strings for
# some of the tags listed below.
#
$htmlTag{"P"} = "\n\n";            # paragraph: a blank line
$htmlTag{"BR"} = "\n";             # line break
$htmlTag{"HR"} =                   # horizontal rule on a line of its own
  "\n---------------------------------------------------------------\n";
$htmlTag{"IMG"} = '&htmlIMG';      # images get special processing
#
#  Text between <HEAD> and </HEAD>, or between <TITLE> and </TITLE>,
#  is ignored and does not appear in the output file.
#
@htmlTag{"HEAD","TITLE"} = ('&htmlSuspendOutput') x 2;
@htmlTag{"/HEAD","/TITLE"} = ('&htmlRestoreOutput') x 2;
#
#  Lists and their items simply start on a new line (see
#  html2text.tag for a more complete implementation of lists).
#
@htmlTag{"UL","/UL","OL","/OL","LI","DL","/DL","DT","DD"} = ("\n") x 9;
#
#  Blockquotes are separate paragraphs (see html2text.tag for a more
#  complete implementation).
#
@htmlTag{"BLOCKQUOTE","/BLOCKQUOTE"} = ("\n\n") x 2;
#
#  PRE formatted text is handled by a pair of subroutines.
#
@htmlTag{"PRE","/PRE"} = ('&htmlPRE','&htmlPREend');
#
#  Headers H1 to H6 (opening and closing tags) are separate paragraphs.
#
@htmlTag{map(("H$_","/H$_"),1..6)} = ("\n\n") x 12;
#
#  FORMs appear as separate paragraphs.  The input items themselves
#  produce nothing; they only cause leading spaces in the following
#  text to be ignored (see html2text.tag for a more complete
#  implementation).
#
@htmlTag{"FORM","/FORM"} = ("\n\n") x 2;
@htmlTag{"INPUT","TEXTAREA","SELECT"} = ('&htmlIgnoreSpaces') x 3;
#
# Ignore leading spaces in the text that follows
#
sub htmlIgnoreSpaces
{
  #
  #  Record that the last thing written was a space; &htmlOutputLine
  #  strips leading spaces from the next text while this flag is 1.
  #
  $htmlLineSpace = 1;
}
#
# Prevent text from going to the output file. Calls to these
# routines can be nested, and output will not restart until the
# outermost &htmlRestoreOutput call.
#
sub htmlSuspendOutput
{
  #
  #  One more level of suppression; &htmlOutput discards its text
  #  while this counter is non-zero.
  #
  return $htmlNoOutput++;
}
sub htmlRestoreOutput
{
  #
  #  Undo one &htmlSuspendOutput.  An unmatched call is ignored so
  #  the counter never becomes negative.
  #
  if ($htmlNoOutput > 0) {$htmlNoOutput--}
}
#
# Turn on and off PRE formatted mode
#
sub htmlPRE
{
  #
  #  Force a line break first, then switch to PRE mode so the text
  #  that follows is passed through line by line.
  #
  &htmlPrint("\n");
  $htmlPreMode = 1;
}
sub htmlPREend
{
  #
  #  Leave PRE mode before forcing the closing line break, so the
  #  break is handled by the normal (non-PRE) output path.
  #
  $htmlPreMode = 0;
  &htmlPrint("\n");
}
#
# For an image, parse the rest of the tag parameters
# If there is an ALT string, use it, otherwise print "[IMAGE]"
#
sub htmlIMG
{
  #
  #  First turn the remaining pieces of the tag (@tags, localized by
  #  &htmlTag) into %tag entries, then produce the text for the image.
  #  Both helpers are called in the "&name;" form and therefore share
  #  this routine's @_.
  #
  &htmlParseTags;
  &htmlTextIMG;
}
sub htmlTextIMG
{
  #
  #  An image without an ALT attribute is shown as "[IMAGE]"; otherwise
  #  its ALT text is written, with any entities inside it translated.
  #
  unless (defined($tag{"ALT"}))
  {
    &htmlOutput("[IMAGE]");
    return;
  }
  &htmlOutputHTML($tag{"ALT"});
}
#
# For INPUT tags, get the rest of the tag parameters
# Get the type of the tag (TEXT is the default)
# Call the appropriate subroutine to handle it
#
sub htmlInput
{
  #
  #  Parse the attributes of an INPUT tag and call the handler named
  #  after its TYPE (default "text"): TYPE=radio calls &htmlInputRADIO.
  #
  #  The TYPE value comes straight from the input document, so it is
  #  never pasted into a string eval.  Only names made of word
  #  characters are accepted, and only an already defined handler is
  #  called.  Unknown types are ignored silently; an error raised
  #  inside a handler is reported with a warning, as &htmlDoString does.
  #  The handler is called in the "&name" form so that it shares @_.
  #
  local ($type);
  &htmlParseTags;
  $type = &htmlGetTag("TYPE","text");
  $type =~ tr/a-z/A-Z/;
  if ($type =~ m/^\w+$/ && defined(&{"htmlInput$type"}))
  {
    eval {&{"htmlInput$type"}};
    warn $@ if ($@);
  }
}
#
# <INPUT TYPE=HIDDEN> has no function. The .tag files must supply
# other INPUT tag types.
#
sub htmlInputHIDDEN
{
  # Deliberately empty: a hidden form field produces no output.
}
###########################################################################
#
# The following routines are the core of the html2text processor, and
# should not be modified without great care. The variables defined
# here can be used by subroutines defined in .tag files, and can even
# be modified when the need arises.
#
###########################################################################
#
# This is the place where the unprocessed text from the input file is
# stored.
#
$htmlBuffer = "";
#
#  &htmlMain  -  the main loop
#
#    Build the character class that matches any special character
#      (each key of %htmlChar is escaped with quotemeta, so characters
#      such as "]", "^", "-" or "\" are safe to use as keys)
#    If the initial-final-string flag is set
#      Print the initial string
#    While there is more data on STDIN
#      Append the next line of input to the buffer
#      Process the buffer
#    If the initial-final-string flag is set
#      Print the final string
#    Flush any buffered output
#
#  $htmlCharList is declared with "local" on purpose: it has to be
#  visible inside &htmlHandleBuffer, which is called from here.
#
sub htmlMain
{
  local ($htmlCharList) =
    '['.join('',map(quotemeta($_),sort(keys(%htmlChar)))).']';
  &htmlPrint($htmlInitialString) if ($htmlPrintInitFinStr);
  while (!eof(STDIN))
  {
    $htmlBuffer .= <STDIN>;
    &htmlHandleBuffer;
  }
  &htmlPrint($htmlFinalString) if ($htmlPrintInitFinStr);
  &htmlPrint("\n") if ($htmlLine ne "");
}
#
# &htmlHandleBuffer - Process the characters in the input buffer
#
# Convert tabs to spaces (even in PRE mode)
# While there is a special character in the buffer
# Set the buffer to be whatever follows it
# Output the stuff that precedes it
# Process the special character
# Output any remaining text and clear the buffer
#
sub htmlHandleBuffer
{
  #
  #  Repeatedly cut the buffer at the first special character: the text
  #  in front of it is written out, the character's action is run, and
  #  whatever follows stays in the buffer (the action may consume more
  #  of it).  $htmlCharList comes from &htmlMain.
  #
  my ($text,$special);
  $htmlBuffer =~ s/\t/ /g;
  while ($htmlBuffer =~ m/$htmlCharList/o)
  {
    ($text,$special,$htmlBuffer) = ($`,$&,$');
    &htmlOutput($text);
    &htmlHandleChar($special);
  }
  &htmlOutput($htmlBuffer);
  $htmlBuffer = "";
}
#
# &htmlFindNext - Look for a pattern in the input file
#
# If we're allowed to read more data, read from the file until we
# find the pattern.
# If the buffer contains the pattern
# Save the stuff following it in the buffer
# Return the material up to the match, and the match itself
# Otherwise
# Return the entire buffer (with no match string), and clear the buffer
#
$htmlReadingSTDIN = 1;   # controls whether to read more from the
                         # input file or just use what's in the buffer
#
#  Arguments:  a regular expression (matched without regard to case)
#  Returns:    (text before the match, matched text); the buffer keeps
#              what follows the match.  If nothing matches, the whole
#              buffer is returned with an empty match string and the
#              buffer is cleared.
#
sub htmlFindNext
{
  local ($pattern) = @_;
  #
  #  $htmlReadingSTDIN is tested first so that STDIN is not even
  #  probed for end-of-file when reading is disallowed.
  #
  while ($htmlReadingSTDIN && $htmlBuffer !~ m/$pattern/i && !eof(STDIN))
    {$htmlBuffer .= <STDIN>}
  if ($htmlBuffer =~ m/$pattern/i) {
    my ($before,$match) = ($`,$&);
    $htmlBuffer = $';
    return ($before,$match);
  } else {
    local ($tmp) = $htmlBuffer;
    $htmlBuffer = "";
    return ($tmp,"");
  }
}
######################################################################
#
# Output routines
#
######################################################################
#
# &htmlPrint - send a string to the output buffer verbatim
#
# Don't compress spaces
# Change \n to \r (so we can tell that these are real newlines and
# not ones from the file)
# Output the string
#
$htmlForceSpaces = 0;    # true if output should not compress spaces
sub htmlPrint
{
  #
  #  Space compression is switched off for the duration of the call.
  #  Newlines in the string are turned into carriage returns, which
  #  the output routines treat as forced line breaks rather than as
  #  line ends that came from the input file.
  #
  local ($htmlForceSpaces,$string) = (1,@_);
  $string =~ tr/\n/\r/;
  &htmlOutput($string);
}
#
# &htmlOutputHTML - send a string to the output buffer, processing
# any embedded HTML commands
#
# Locally set the buffer to the output string
# Locally don't allow reading from the file
# Process the buffer (i.e., look for Entities or other translations)
# Warning: this allows for nested tags, which is not really legal,
# but it was the easiest way to do this.
#
sub htmlOutputHTML
{
  #
  #  Temporarily replace the input buffer by the given string and
  #  forbid reading from STDIN, so that only this string is processed.
  #  Both variables are restored when the routine returns.
  #
  local ($htmlBuffer,$htmlReadingSTDIN) = ($_[0],0);
  &htmlHandleBuffer;
}
#
# &htmlOutput - break a string into lines and print them
#
# If we have text to output, and we are allowing output
# Break the string into lines and send them to the output routine
# Output the last part with no line terminator
#
sub htmlOutput
{
  #
  #  Nothing happens for an empty string or while output is suspended.
  #  Otherwise the string is cut at each newline (a line end from the
  #  input) or run of carriage returns (forced breaks) and every piece
  #  is passed on with its terminator; a final piece without a
  #  terminator is passed on with an empty one.
  #
  local ($string) = @_;
  return if ($string eq "" || $htmlNoOutput);
  my ($text,$eol,$rest);
  while ($string =~ m/(\n|\r+)/)
  {
    ($text,$eol,$rest) = ($`,$&,$');
    &htmlOutputLine($text,$eol);
    $string = $rest;
  }
  &htmlOutputLine($string,"") unless ($string eq "");
}
$htmlLineNL = 0;      # 1 if we are at a line break in the input file
$htmlLinePar = 2;     # counts the number of \n we have printed
$htmlLineSpace = 1;   # 1 if the last character printed was a space,
                      # -1 if it was the $htmlBreakNL character
$htmlPreMode = 0;     # 1 if PRE mode in effect
$htmlIndent = "";     # indentation string
$htmlLine = "";       # text waiting to be output
#
#  &htmlOutputLine - cook a line and send it to the output buffer
#
#  Arguments: the text of the line and its terminator, which is "\n"
#  for a line end that came from the input file, one or more "\r" for
#  forced breaks (see &htmlPrint), or "" for none.
#
#    If we are in PRE mode
#      Add the indenting if this is the beginning of a line
#      Add the text to be printed and print it
#    Otherwise
#      Add a leading space if there was a NL in the file and
#        either the line is not empty and the previous thing wasn't a
#        space, or we just had a forced line break with $htmlBreakNL
#      If we are not preserving spaces
#        Make runs of spaces into single ones (except at sentence ends)
#        Eliminate leading spaces if we already have printed one
#        Eliminate trailing spaces if we have a newline to print
#      If we still have something to print, or we have a newline to print
#        Record whether there is a newline from the file
#        Add the data to the line buffer (may cause actual output)
#        If the newline contains \r, substitute \n for them and print
#      Otherwise
#        Check if there is a newline from the file
#
sub htmlOutputLine
{
  local ($line,$nl) = @_;
  if ($htmlPreMode)
  {
    $htmlLine = $htmlIndent if ($htmlLine eq "");
    $htmlLine .= $line;
    &htmlPrintLine($nl);
  } else {
    $line = " ".$line if ($htmlLineNL &&
       (($htmlLine ne "" && !$htmlLineSpace) || $htmlLineSpace == -1));
    if (!$htmlForceSpaces)
    {
      #
      #  The capture is referred to as $1 in the replacement; the
      #  sed-style \1 is only tolerated there and warns under -w.
      #
      $line =~ s/([^\.\:\!\?\"\']|^) +/$1 /g;
      $line =~ s/^ +// if ($htmlLineSpace == 1);
      $line =~ s/ +$// if ($nl ne "");
    }
    if ((($line ne " " || !$htmlLineNL) && $line ne "") || $nl =~ m/\r/)
    {
      $htmlLineNL = ($nl eq "\n");
      &htmlAddToLine("$line");
      if ($nl =~ s/\r/\n/g) {&htmlPrintLine($nl)}
    } else {
      $htmlLineNL |= ($nl eq "\n");
    }
  }
}
#
# &htmlPrintLine - print the output buffer and record new lines
#
# If the line has some data, print it, and clear the newline counter
# Set the ignore-initial-spaces flag
# Clear the line buffer
# If this is a paragraph break
# Print the correct number of newlines and count them
# Otherwise if this is a line break
# Print a newline if needed and count it
# Otherwise the last thing printed is not a new line
#
sub htmlPrintLine
{
  #
  #  Write the pending line (unless it holds nothing but the
  #  indentation) and then as many newlines as are needed to reach the
  #  requested break, taking into account how many have been written
  #  already ($htmlLinePar).  Argument: "\n\n" for a paragraph break,
  #  "\n" for a line break, anything else for no break at all.
  #
  local ($nl) = @_;
  unless ($htmlLine eq $htmlIndent)
  {
    print $htmlLine;
    $htmlLinePar = 0;
  }
  ($htmlLineSpace,$htmlLine) = (1,"");
  if ($nl eq "\n")
  {
    print "\n" unless ($htmlLinePar);
    $htmlLinePar = 1;
  }
  elsif ($nl eq "\n\n")
  {
    if ($htmlLinePar == 0) {print "\n\n"}
    elsif ($htmlLinePar == 1) {print "\n"}
    $htmlLinePar = 2;
  }
  else {$htmlLinePar = 0}
}
#
# &htmlAddToLine - Add some data to the output line buffer
#
# Add the indentation if this is the start of a line
# Add the new data to the line
# While the line is too long:
# Find the last space before the line gets too long
# If there is no such space
# If there are additional break characters and one of them is found
# If it is the first thing on the line
# Give up: simply break the line at the maximum length
# Add the break termination character (% for TeX)
# Otherwise
# Break the line before the given character and save the rest
# Otherwise (no other breakpoints)
# Give up: simply break at the maximum width and add the
# break NL character (% for TeX)
# Otherwise (a space was found)
# Break the line at the space and remove the space
# Print the portion of the line before the breakpoint
# If there is more data, insert the indentation
# Indicate that there is data after the last line break and that
# spaces are still important
# If the remaining line is not empty
# Record whether it ended with a space (so new spaces will be ignored)
# Record that we have data since the last line break
#
sub htmlAddToLine
{
  #
  #  Append the arguments to the pending line and, while that line is
  #  wider than $htmlWidth, print its leading part and keep the rest.
  #
  #  The break-character search is done on the first $htmlWidth
  #  characters of the pending line ($htmlLine).  A break character
  #  that is the first thing on the line (nothing but the indentation
  #  in front of it) is not usable; in that case, and when no break
  #  character is found at all, the line is simply cut at the maximum
  #  width.  Every break made at a non-space gets $htmlBreakNL appended.
  #
  local ($line,$i);
  my ($head,$prefix);
  $htmlLine = $htmlIndent if ($htmlLine eq "");
  $htmlLine .= "@_";
  while (length($htmlLine) > $htmlWidth)
  {
    $i = rindex($htmlLine," ",$htmlWidth);
    if ($i < length($htmlIndent))
    {
      $head = substr($htmlLine,0,$htmlWidth);
      if ($htmlBreakChars ne "" &&
          $head =~ m/[$htmlBreakChars][^$htmlBreakChars]*$/ &&
          length($`) > length($htmlIndent))
      {
        # Break just before the last break character that fits.
        $prefix = $`;
        $line = substr($htmlLine,length($prefix));
        $htmlLine = $prefix.$htmlBreakNL;
      } else {
        # Give up: cut the line at the maximum width.
        $line = substr($htmlLine,$htmlWidth);
        $htmlLine = $head.$htmlBreakNL;
      }
    } else {
      $line = substr($htmlLine,$i+1);
      $htmlLine = substr($htmlLine,0,$i);
      # NOTE(review): this also strips an indentation made of spaces
      # from the line about to be printed -- confirm that is intended.
      $htmlLine =~ s/^ +//;
    }
    &htmlPrintLine("\n");
    if ($line ne "") {$htmlLine = $htmlIndent.$line}
    $htmlLinePar = 0; $htmlLineSpace = 1;
  }
  if ($htmlLine ne $htmlIndent)
  {
    $htmlLineSpace = (substr($htmlLine,-1,1) eq " ");
    $htmlLinePar = 0;
  }
}
#
# &htmlHandleChar - perform a special character's action
#
# Get the string associated with the given character
# Do what the string asks
#
sub htmlHandleChar
{
  #
  #  Look up the action registered in %htmlChar for the character that
  #  was found and let &htmlDoString execute or print it.
  #
  local ($command);
  $command = $htmlChar{"@_"};
  &htmlDoString($command);
}
#
# &htmlDoString - execute or print a string
#
# If the string begins with "&"
# Evaluate the string as a perl command and report any errors
# Otherwise print the string
#
sub htmlDoString
{
  #
  #  A string that does not start with "&" is printed verbatim.  Any
  #  other string is evaluated as perl code, and an error raised by
  #  that code is reported as a warning.
  #
  local ($string) = @_;
  if (substr($string,0,1) ne "&")
  {
    &htmlPrint($string);
    return;
  }
  eval $string;
  warn $@ if ($@);
}
######################################################################
#
# Routines to handle Tags, Entities, etc.
#
######################################################################
#
# &htmlTag - look up and do an HTML tag
#
# Get the next character
# If it is "!" do the comments
# Otherwise if it is a space or newline or nothing, print a "<"
# Otherwise (a real tag)
# Put back the character
# Find the end of the tag
# Split the tag at equal-signs or spaces or new lines
# Get the name of the tag and put it in upper case
# If the tag is defined, do what it says, otherwise ignore it
#
sub htmlTag
{
  #
  #  Called after a "<" has been removed from the buffer.
  #
  #  The first pattern is "(.|\n)" rather than ".", because "." never
  #  matches a newline: with "." a "<" at the end of a line skipped the
  #  newline and treated the next line as a tag.
  #
  #  $name, @tags and %tag are declared with "local" so that the tag's
  #  handler (and &htmlParseTags, &htmlGetTag, ...) can use them.
  #
  local ($empty,$end) = &htmlFindNext("(.|\n)");
  local ($name,@tags,%tag);
  if ($end eq "!") {&htmlComment}
  elsif ($end eq " " || $end eq "\n" || $end eq "")
  {
    # Not a tag at all: write a literal "<" and what followed it.
    &htmlOutput($htmlLtString.$end);
  } else {
    # Put the character back, read up to ">" and split the tag into
    # its name, the attribute pieces and the separators between them.
    $htmlBuffer = $end.$htmlBuffer;
    ($name,$end) = &htmlFindNext(">");
    @tags = split("([ \n]*=[ \n]*|[ \n]+)",$name);
    $name = shift(@tags); $name =~ tr/a-z/A-Z/;
    if (defined($htmlTag{$name})) {&htmlDoString($htmlTag{$name})}
  }
}
#
# &htmlParseTags - get an array of tags and their values
#
# While there are more items in the tag list
# If the item is a name not an equal sign or a space
# Translate the name to upper case
# If the next item is an equal sign
# Remove it and parse the item's value
# Otherwise set the item's value to be empty
#
sub htmlParseTags
{
  #
  #  Consume @tags (the pieces produced by the split in &htmlTag,
  #  separators included) and fill %tag: NAME=VALUE stores the value,
  #  a name on its own stores "".  Names are put in upper case.
  #
  #  The loop runs until @tags is empty, so an item such as "0" does
  #  not end it early.  Separators contain a space, a newline or an
  #  equal sign (that is what &htmlTag splits on); they and empty
  #  items are skipped.  $id is "local" because &htmlParseValue uses it.
  #
  local ($id);
  while (@tags)
  {
    $id = shift(@tags);
    next if (!defined($id) || $id eq "" || $id =~ m/[ \n=]/);
    $id =~ tr/a-z/A-Z/;
    if (@tags && $tags[0] =~ m/=/) {shift(@tags); &htmlParseValue}
      else {$tag{$id} = ""}
  }
}
#
# &htmlParseValue - get the value for an item of the form ID=VALUE
# (takes quotes into account)
#
# Get the next item and use that as the value
# If the first character is a quote
# As long as the last character is not a quote
# If there are no more tags, add an end quote explicitly
# Add the next item to the value (the value may have been split at
# spaces, for example)
# Remove the quotation marks
# Set the item to its value
#
sub htmlParseValue
{
  #
  #  Take the next item of @tags as the value of attribute $id (set by
  #  &htmlParseTags) and store it in %tag.
  #
  #  A value that starts with a double or a single quote may have been
  #  split at spaces, so further items (separators included) are
  #  appended until one ends with the same quote; if @tags runs out
  #  first, the closing quote is supplied.  The surrounding quotes are
  #  then removed.
  #
  local ($value) = shift(@tags);
  my $quote = substr($value,0,1);
  if ($quote eq '"' || $quote eq "'")
  {
    while (substr($value,-1,1) ne $quote || length($value) == 1)
    {
      if ($#tags < 0) {@tags = ($quote)}
      $value .= shift(@tags);
    }
    $value = substr($value,1,length($value)-2);
  }
  $tag{$id} = $value;
}
sub htmlGetTag
{
  #
  #  Arguments: attribute name (upper case, as stored in %tag) and a
  #  default.  Returns the attribute's value if it is defined in %tag,
  #  the default otherwise.
  #
  local ($name,$value) = @_;
  return (defined($tag{$name}) ? $tag{$name} : $value);
}
#
# &htmlComment - handle a comment tag
#
# Get the next two characters
# If they are two dashes (i.e., a long comment "<!-- ... -->")
# Find the end of the comment ("-->")
# Otherwise
# Put back the two characters
# Find the end comment (">")
# If we have a comment handler, call it on the comment data
#
sub htmlComment
{
  #
  #  Called after "<!" has been read.  The comment text ends up in
  #  $com; if $htmlComment is defined, the routine it names is called
  #  with that text.
  #
  local ($com,$end) = &htmlFindNext("..");
  if ($end ne "--")
  {
    # Short form: give the two characters back and read up to ">".
    $htmlBuffer = $end.$htmlBuffer;
    ($com,$end) = &htmlFindNext(">");
  } else {
    # Long form: read up to the closing "-->".
    ($com,$end) = &htmlFindNext("-->");
  }
  eval $htmlComment."(\$com)" if (defined($htmlComment));
}
#
# &htmlEntity - handle an HTML entity name
#
# Find the end of the name (a space, semi-colon or end-of-line)
# If the name is blank, output an ampersand
# Otherwise
# If the first character is a number sign, do a number entity
# Otherwise if the entity is defined, do it
# Otherwise output the entity name as it appeared in the file
#
sub htmlEntity
{
  #
  #  Called after an "&" has been removed from the buffer.  The name
  #  runs up to the next run of spaces, semi-colon or end of line.
  #
  #    Empty name:       write an ampersand and the terminator
  #    "#...":           numeric entity, see &htmlNumberEntity
  #    Known name:       do what %htmlEntity says
  #    Anything else:    write the text as it appeared in the file
  #
  #  Only a semi-colon belongs to a recognized entity.  Spaces that
  #  ended the name are put back into the buffer first, so that
  #  "&copy 1995" does not lose the space after the entity.
  #
  #  $name and $end are "local" because &htmlNumberEntity uses them.
  #
  local ($name,$end) = &htmlFindNext("( +|\;|\$)");
  if ($name eq "") {&htmlOutput($htmlAmpString.$end); return}
  if (substr($name,0,1) ne "\#" && !defined($htmlEntity{$name}))
  {
    &htmlOutput($htmlAmpString.$name.$end);
    return;
  }
  $htmlBuffer = $end.$htmlBuffer if ($end ne ";");
  if (substr($name,0,1) eq "\#") {&htmlNumberEntity}
    else {&htmlDoString($htmlEntity{$name})}
}
#
# &htmlNumberEntity - Handle &#nnn; entities
#
# Remove the initial number sign
# If the "nnn" is less than 160, print that ASCII character
# Otherwise do the entity whose name is given in the htmlNumberEntity array
#
sub htmlNumberEntity
{
  #
  #  Works on $name and $end, which are set by &htmlEntity.
  #
  #  After the number sign is removed, the rest must be a decimal
  #  number or "x" followed by hexadecimal digits.  Codes below 160 are
  #  printed as that character; codes covered by @htmlNumberEntity are
  #  translated through %htmlEntity.  Anything else (not a number, or
  #  a code with no table entry) is written as it appeared in the
  #  file instead of being turned into a wrong or empty character.
  #
  my ($code,$entity);
  substr($name,0,1) = "";
  if ($name =~ m/^[0-9]+$/) {$code = $name + 0}
  elsif ($name =~ m/^[xX]([0-9a-fA-F]+)$/) {$code = hex($1)}
  if (defined($code) && $code < 160)
  {
    &htmlPrint(sprintf("%c",$code));
    return;
  }
  $entity = $htmlNumberEntity[$code-160]
    if (defined($code) && $code-160 <= $#htmlNumberEntity);
  if (defined($entity) && defined($htmlEntity{$entity}))
    {&htmlDoString($htmlEntity{$entity})}
  else
    {&htmlOutput($htmlAmpString."#".$name.($end eq ";" ? ";" : ""))}
}
######################################################################
#
# File I/O and Initialization routines
#
######################################################################
#
# &htmlFindFormat - locate a .tag file in the path
#
# If the name is an absolute one
# If the file is readable, return it
# Otherwise
# For each directory in the $htmlFormatPath string
# If the file is there, return its name
# Otherwise return nothing
#
sub htmlFindFormat
{
  #
  #  Argument: the name of a format, without the ".tag" extension.
  #  Returns the path of the readable .tag file, or "" if there is
  #  none.  A name that starts with "/" is checked as it is; any other
  #  name is looked for in each directory of the colon-separated
  #  $htmlFormatPath, in order.
  #
  local ($path);
  local ($name) = @_;
  if (substr($name,0,1) eq "/")
  {
    return "$name.tag" if (-r "$name.tag");
    return "";
  }
  foreach $path (split(":",$htmlFormatPath))
  {
    return "$path/$name.tag" if (-r "$path/$name.tag");
  }
  return "";
}
#
# &htmlRequire - load a required .tag file
#
# Find the format file
# If it exists, load it, otherwise print a warning
#
sub htmlRequire
{
  #
  #  Load the .tag file for the named format; if &htmlFindFormat cannot
  #  find it, only a warning is given.
  #
  local ($file) = &htmlFindFormat("@_");
  if ($file eq "")
  {
    warn "Can't locate required format file '@_'";
    return;
  }
  require $file;
}
#
# &htmlInitialize - load the header files and set up variables
#
# Get the .tag file for the current format, if any
# Get the name of the current format in upper case
# If there is an environment variable for this format, use it
# Otherwise look in the home directory for a dot file
# If a customization file exists for this format, load it
#
sub htmlInitialize
{
  #
  #  Load the .tag file that belongs to the name this script was
  #  started under, then the user's customization file, if any.
  #
  #  The customization file is the one named by the environment
  #  variable whose name is the command name in upper case.  Without
  #  that variable it is ".<command name>" in the home directory,
  #  which is taken from $ENV{"HOME"} (the perl variable $HOME is
  #  never set).  If HOME is not set either, no file is looked for.
  #
  local ($name);
  $name = &htmlFindFormat($htmlCommandName);
  if ($name ne "") {require $name}
  $name = $htmlCommandName;
  $name =~ tr/a-z/A-Z/;
  if (defined($ENV{$name})) {$name = $ENV{$name}}
  elsif (defined($ENV{"HOME"})) {$name = $ENV{"HOME"}."/.$htmlCommandName"}
  else {$name = ""}
  # Only a readable plain file is loaded; a directory is skipped.
  if ($name ne "" && (-f $name) && (-r $name)) {require $name}
}
#
# &htmlOpenFiles - open the input and output files
#
# If no input file was specified, read from stdin
# If no output file was specified
# If the input is stdin, use stdout
# Otherwise, use the input file but replace the extension
# If the input file is not stdin
# Try to open the input file, and error if not successful
# If the output file is not stdout
# Try to open the output file, and error if not successful
#
sub htmlOpenFiles
{
  #
  #  Works on $htmlInFile and $htmlOutFile; "" and "-" mean standard
  #  input/output.  Failures are reported through &cliError.
  #
  #  A missing output name is derived from the input name: the
  #  directory part and the last extension are removed and
  #  $htmlExtension is appended.  An input name without an extension
  #  therefore gets the extension added as well; it used to be taken
  #  unchanged, which made the output file overwrite the input file.
  #  A derived name equal to the input name is refused for the same
  #  reason.
  #
  if ($htmlInFile eq "") {$htmlInFile = "-"}
  if ($htmlOutFile eq "")
  {
    if ($htmlInFile eq "-") {$htmlOutFile = "-"} else
    {
      $htmlOutFile = $htmlInFile;
      $htmlOutFile =~ s#^.*/##;              # remove the directory
      $htmlOutFile =~ s#(.)\.[^.]*$#$1#;     # remove the extension
      $htmlOutFile .= $htmlExtension;
      if ($htmlOutFile eq $htmlInFile)
        {&cliError("Output file '$htmlOutFile' would overwrite the input file")}
    }
  }
  if ($htmlInFile ne "-")
  {
    # The explicit "<" makes sure the name is opened as a file to read.
    if (!open(STDIN,"<".$htmlInFile))
      {&cliError("Can't open '$htmlInFile' for reading")}
  }
  if ($htmlOutFile ne "-")
  {
    if (!open(STDOUT,">".$htmlOutFile))
      {&cliError("Can't open '$htmlOutFile' for writing")}
  }
}
######################################################################
#
# Command Line Argument Processing Routines
#
######################################################################
#
# &cliReadArgs - handle command line arguments
#
# While there are more arguments to process
# If the next argument is a dash followed by some flag
# Get the flag name
# If the flag is defined, do its associated routine
# Otherwise warn that no such flag is defined
# Otherwise
# If the input file is not specified, this is it
# Otherwise if the output file is not set, this is it
# Otherwise there are too many command line arguments
#
sub cliReadArgs
{
  #
  #  Go through @ARGV.  An argument that starts with "-" and is longer
  #  than one character is an option: its action from %cliArg is
  #  evaluated (the action may take further items from @ARGV), and an
  #  unknown option is an error.  An error raised inside an action is
  #  reported as a warning instead of being dropped.  Other arguments
  #  are the input file name and then the output file name; a third
  #  one is an error.
  #
  local ($arg);
  while ($#ARGV >= 0)
  {
    $arg = shift(@ARGV);
    if (substr($arg,0,1) eq "-" && length($arg) > 1)
    {
      $arg = substr($arg,1);
      if (defined($cliArg{$arg})) {eval $cliArg{$arg}; warn $@ if ($@)}
        else {&cliError("Undefined option '-$arg'")}
    } else {
      if ($htmlInFile eq "") {$htmlInFile = $arg}
      elsif ($htmlOutFile eq "") {$htmlOutFile = $arg}
      else {&cliError("Too many file names specified on command line")}
    }
  }
}
#
# &cliError - print an error and exit
#
sub cliError
{
  #
  #  Write "<command name>: <message>" to standard error and leave
  #  the program with exit status 1.
  #
  print STDERR $htmlCommandName,": @_\n";
  exit 1;
}
#
# The array below defines the valid command-line arguments. The value
# of each item in the associative array is the function to call when
# that flag is found in the command line.
#
# The .tag files can define additional flags if desired.
#
$cliArg{"f"} = '&cliFlagF';
#
#  &cliFlagF  -  implements the "-f format" command-line option
#
#    If no argument follows the flag, report an error and exit
#    Otherwise take the next argument and load that format file
#
sub cliFlagF
{
  &cliError("Option '-f' requires a format file name") if ($#ARGV < 0);
  &htmlRequire(shift(@ARGV));
}
#
# &cliFlagNoInitStr - Disables printing of the initial and final strings
#
# This sets the variable $htmlPrintInitFinStr, which is checked in
# &htmlMain before the initial and final strings are printed to the
# output files.
#
$cliArg{"noinitstring"} = '&cliFlagNoInitStr';
sub cliFlagNoInitStr
{
  # Tell &htmlMain not to print the initial and final strings.
  $htmlPrintInitFinStr = 0;
}
#
# This variable holds the name of the executing command
#
($htmlCommandName = $0) =~ s!^.*/!!s;
#
#  The input and output file names (filled in by &cliReadArgs)
#
($htmlInFile,$htmlOutFile) = ("","");
#
#  If HTML2FORMAT is set in the environment, it replaces the
#  format-file search path.
#
$htmlFormatPath = $ENV{"HTML2FORMAT"} if (defined($ENV{"HTML2FORMAT"}));
#
#  Run the program:
#    load the format and customization files
#    parse the command line
#    open the input and output files
#    process the input file
#
&htmlInitialize;
&cliReadArgs;
&htmlOpenFiles;
&htmlMain;