#!/usr/local/bin/perl # Change the above line to reflect the location of perl on your system # txt2html.pl # Convert raw text to something with a little HTML formatting # # Written by Seth Golub # http://www.cs.wustl.edu/~seth/txt2html/ # # $Revision: 1.3 $ # $Date: 1996/09/23 18:04:34 $ # $Author: lstein $ # # # $Log: txt2html.pl,v $ # Revision 1.3 1996/09/23 18:04:34 lstein # Bug fixes. # # Revision 1.2 1995/11/20 01:52:30 lstein # No changes! # # Revision 1.1.1.1 1995/11/19 22:16:02 lstein # This is the entire "htdocs" portion of the WICGR World Wide Web # site. # # Revision 1.12 1995/10/14 01:57:43 sgolub # * Fixed bug in heading regexp # * Changed underline tolerance parameters from min & max length # difference to length difference & offset difference # * Centralized line reading, added handling of DOS carriage returns # * Switched to heading style stack. Styles still very limited. # * Changed heading anchor names from a simple count to a hierarchical # section number. # # Revision 1.11 1995/09/20 23:13:21 seth # * Blank lines are never considered underlined # * Shortline breaking slightly more intelligent (or at least different) # * Paragraph breaks much more intelligent # * Lowercased tags. Style is so fickle. # * Added links dictionaries, link making, etc. # * Allow repeated bullet chars for unordered lists. (Tiny mod to regexp) # * switched order of caps & liststuff in main() # * improved untabify() so it converts the whole line, not just beginning # * split up all lines >79 characters to avoid common downloading error # (people would sometimes copy the script off the display, # inadvertantly adding a few newlines in bad places in the code) # * Handles option "--" now. # * Accepts named files as input as alternative to stdin # * Deals with stdin properly (no more extra EOFs needed) # * Improved mail handling # # Revision 1.10 1994/12/28 20:10:25 seth # * Added --extract, etc. # # Revision 1.9 94/12/13 15:16:23 15:16:23 seth (Seth Golub) # * Changed from #!/usr/local/bin/perl to the more clever version in # the man page. (How did I manage not to read this for so long?) # * Swapped hrule & header back to handle double lines. Why should # this order screw up headers? # # Revision 1.8 1994/11/30 21:07:03 seth # * put mail_anchor back in. (Why did I take this out?) # * Finally added handling of lettered lists (ordered lists marked with # letters) # * Added title option (--title, -t) # * Shortline now looks at how long the line was before txt2html # started adding tags. ($line_length) # * Changed list references to scalars where appropriate. (@foo[0] -> $foo[0]) # * Added untabify() to homogenize leading indentation for list # prefixes and functions that use line length # * Added "underline tolerance" for when underlines are not exactly the # same length as what they underline. # * Added error message for unrecognized options # * removed \w matching on --capstag # * Tagline now removes leading & trailing whitespace before tagging # * swapped order of caps & heading in main loop # * Cleaned up code for speed and to get rid of warnings # * Added more restrictions to something being a mail header # * Added indentation for lists, just to make the output more readable. # * Fixed major bug in lists: $OL and $UL were never set, so when a # list was ended "" was *always* used! # * swapped order of hrule & header to properly handle long underlines # # Revision 1.7 94/10/28 13:16:11 13:16:11 seth (Seth Golub) # * Added to comments in options section # * renamed blank to is_blank # * Page break is converted to horizontal rule
# * moved usage subroutine up top so people who look through code see # it sooner # # Revision 1.6 94/10/28 12:43:46 12:43:46 seth (Seth Golub) # * Creates anchors at each heading # # Revision 1.5 94/07/14 17:43:59 17:43:59 seth (Seth Golub) # * Fixed minor bug in Headers # * Preformatting can be set to only start/stop when TWO lines of # [non]formatted-looking-text are encountered. Old behavior is still # possible through command line options (-pb 1 -pe 1). # * Can preformat entire document (-pb 0) or disable preformatting # completely (-pe 0). # * Fixed minor bug in CAPS handling (paragraph breaks broke) # * Puts paragraph tags *before* paragraphs, not just between them. # # Revision 1.4 94/06/20 16:42:55 16:42:55 seth (Seth Golub) # * Allow ':' for numbered lists (e.g. "1: Figs") # * Whitespace at end of line will not start or end preformatting # * Mailmode is now off by default # * Doesn't break short lines if they are the first line in a list # item. It *should* break them anyway if the next line is a # continuation of the list item, but I haven't dealt with this yet. # * Added action on lines that are all capital letters. You can change # how these lines get tagged, as well as the mininum number of # consecutive capital letters required to fire off this action. # # Revision 1.3 94/05/17 15:58:58 15:58:58 seth (Seth Golub) # * Tiny bugfix in unhyphenation # # Revision 1.2 94/05/16 18:15:16 18:15:16 seth (Seth Golub) # * Added unhyphenation # # Revision 1.1 94/05/16 16:19:03 16:19:03 seth (Seth Golub) # Initial revision # # # 1.02 Allow '-' in mail headers # Added handling for multiline mail headers # ################################################################# # Some initializations that need to go before the configuration # @links_dictionaries = 0; # ######################### ######################### # Configurable options # # [-s ] | [--shortline ] $short_line_length = 40; # Lines this short (or shorter) must be # intentionally broken and are kept # that short.
# [-p ] | [--prewhite ] $preformat_whitespace_min = 5; # Minimum number of consecutive leading # whitespace characters to trigger # preformatting. # NOTE: Tabs are now expanded to # spaces before this check is made. # That means if $tab_width is 8 and # this is 5, then one tab may be # expanded to 8 spaces, which is # enough to trigger preformatting. $par_indent = 2; # Minumum number of spaces indented in # first lines of paragraphs. # Only used when there's no blank line # preceding the new paragraph. (Like in # this comment) # [-pb ] | [--prebegin ] $preformat_trigger_lines = 2; # How many lines of preformatted-looking # text are needed to switch to
                                # <= 0 : Preformat entire document
                                #    1 : one line triggers
                                # >= 2 : two lines trigger

# [-pe    ] | [--preend                     ]
$endpreformat_trigger_lines = 2; # How many lines of unpreformatted-looking
                                 # text are needed to switch from 
                                 # <= 0 : Never preformat within document
                                 #    1 : one line triggers
                                 # >= 2 : two lines trigger
# NOTE for --prebegin and --preend:
# A zero takes precedence.  If one is zero, the other is ignored.
# If both are zero, entire document is preformatted.


# [-r     ] | [--hrule                      ]
$hrule_min = 4;                 # Min number of ---s for an HRule.

# [-c     ] | [--caps                       ]
$min_caps_length = 3;           # min sequential CAPS for an all-caps line

# [-ct  ] | [--capstag                  ]
$caps_tag = "STRONG";           # Tag to put around all-caps lines

# [-m/+m     ] | [--mail        / --nomail        ]
$mailmode = 0;                  # Deal with mail headers & quoted text

# [-u/+u     ] | [--unhyphenate / --nounhyphenate ]
$unhyphenation = 1;             # Enables unhyphenation of text.

# [-a  ] | [--append                  ]
# [+a        ] | [--noappend                      ]
$append_file = 0;               # If you want something appended by
                                # default, put the filename here.
                                # The appended text will not be
                                # processed at all, so make sure it's
                                # plain text or decent HTML.  i.e. do
                                # not have things like:
                                #   Seth Golub 
                                # but instead, have:
                                #   Seth Golub <seth@cs.wustl.edu>

# [-t ] | [--title <title>                 ]
$title = 0;                     # You can specify a title.
                                # Otherwise it won't put one in.

# [-ul <n>   ] | [--ulength <n>             ]
$underline_length_tolerance = 1; # How much longer or shorter can
                                 # underlines be and still be underlines?

# [-uo <n>   ] | [--uoffset <n>            ]
$underline_offset_tolerance = 1; # How far offset can underlines
                                 # be and still be underlines?

# [-tw <n>   ] | [--tabwidth <n>                  ]
$tab_width = 8;                 # How many spaces equal a tab?


# [-iw <n>   ] | [--indent <n>                    ]
$indent_width = 2;              # Indents this many spaces for each
                                # level of a list

# [-/+e      ] | [--extract / --noextract         ]
$extract = 0;                   # Extract Mode (suitable for inserting)

# [-l <file> ] | [--link <dictfile>               ]
# [+l        ] | [--nolink                        ]
$make_links = 1;                # Should we try to link anything?

$tidy_links = 0;                # Put newlines around hrefs?

# [-db <n>   ] | [--debug <n>                     ]
$dict_debug = 0;                # Debug mode for link dictionaries
                                # Bitwise-Or what you want to see:
                                # 1: The parsing of the dictionary
                                # 2: The code that will make the links

$system_link_dict = "/usr/local/lib/txt2html-linkdict"; # after options
$default_link_dict = "$ENV{'HOME'}/.txt2html-linkdict"; # before options


# END OF CONFIGURABLE OPTIONS
########################################


########################################
# Definitions  (Don't change these)
#
$NONE       =   0;
$LIST       =   1;
$HRULE      =   2;
$PAR        =   4;
$PRE        =   8;
$END        =  16;
$BREAK      =  32;
$HEADER     =  64;
$MAILHEADER = 128;
$MAILQUOTE  = 256;
$CAPS       = 512;
$LINK       =1024;

$OL = 1;
$UL = 2;

$* = 1;                 # Turn on multiline searches

sub usage
{
    $0 =~ s#.*/##;
    print STDERR <<EOUsage;

Usage: $0 [options]

where options are:
     [-v        ] | [--version                       ]
     [-h        ] | [--help                          ]
     [-t <title>] | [--title <title>                 ]
     [-l <file> ] | [--link <dictfile>               ]
     [+l        ] | [--nolink                        ]
     [-a <file> ] | [--append <file>                 ]
     [+a        ] | [--noappend                      ]
     [-e/+e     ] | [--extract / --noextract         ]
     [-c <n>    ] | [--caps <n>                      ]
     [-ct <tag> ] | [--capstag <tag>                 ]
     [-m/+m     ] | [--mail     / --nomail           ]
     [-u/+u     ] | [--unhyphen / --nounhyphen       ]
     [-tw <n>   ] | [--tabwidth <n>                  ]
     [-iw <n>   ] | [--indent <n>                    ]
     [-ul <n>   ] | [--underlinelong <n>             ]
     [-us <n>   ] | [--underlineshort <n>            ]
     [-s <n>    ] | [--shortline <n>                 ]
     [-p <n>    ] | [--prewhite <n>                  ]
     [-pb <n>   ] | [--prebegin <n>                  ]
     [-pe <n>   ] | [--preend <n>                    ]
     [-r <n>    ] | [--hrule <n>                     ]
     [-db <n>   ] | [--debug <n>                     ]

  More complete explanations of these options can be found in
  comments near the beginning of the script.

EOUsage
}


sub deal_with_options
{
    while ($ARGV[0] =~ /^[-+].+/)
    {
        if (($ARGV[0] eq "-l" || $ARGV[0] eq "--link") &&
            $ARGV[1])
        {
            if (-r $ARGV[1]) {
                $make_links = 1;
                push(@links_dictionaries, $ARGV[1]);
            } else {
                print STDERR "Can't find or read link-file $ARGV[1].\n";
            }
            shift @ARGV;
            next;
        }

        if (($ARGV[0] eq "+l" || $ARGV[0] eq "--nolink") )
        {
            $system_link_dict = "";
            $make_links = 0;
            @links_dictionaries = 0;
            shift @ARGV;
            next;
        }

        if (($ARGV[0] eq "-r" || $ARGV[0] eq "--hrule") &&
            $ARGV[1] =~ /^%d+$/)
        {
            $hrule_min = $ARGV[1];
            shift @ARGV;
            next;
        }

        if (($ARGV[0] eq "-s" || $ARGV[0] eq "--shortline") &&
            $ARGV[1] =~ /^\d+$/)
        {
            $short_line_length = $ARGV[1];
            shift @ARGV;
            next;
        }

        if (($ARGV[0] eq "-p" || $ARGV[0] eq "--prewhite") &&
            $ARGV[1] =~ /^\d+$/)
        {
            $preformat_whitespace_min = $ARGV[1];
            shift @ARGV;
            next;
        }

        if (($ARGV[0] eq "-pb" || $ARGV[0] eq "--prebegin") &&
            $ARGV[1] =~ /^\d+$/)
        {
            $preformat_trigger_lines = $ARGV[1];
            shift @ARGV;
            next;
        }

        if (($ARGV[0] eq "-pe" || $ARGV[0] eq "--preend") &&
            $ARGV[1] =~ /^\d+$/)
        {
            $endpreformat_trigger_lines = $ARGV[1];
            shift @ARGV;
            next;
        }

        if (($ARGV[0] eq "-e" || $ARGV[0] eq "--extract"))
        {
            $extract = 1;
            shift @ARGV;
            next;
        }

        if (($ARGV[0] eq "+e" || $ARGV[0] eq "--noextract"))
        {
            $extract = 0;
            shift @ARGV;
            next;
        }

        if (($ARGV[0] eq "-c" || $ARGV[0] eq "--caps") &&
            $ARGV[1] =~ /^\d+$/)
        {
            $min_caps_length = $ARGV[1];
            shift @ARGV;
            next;
        }

        if (($ARGV[0] eq "-ct" || $ARGV[0] eq "--capstag") &&
            $ARGV[1])
        {
            $caps_tag = $ARGV[1];
            shift @ARGV;
            next;
        }

        if ($ARGV[0] eq "-m" || $ARGV[0] eq "--mail")
        {
            $mailmode = 1;
            next;
        }

        if ($ARGV[0] eq "+m" || $ARGV[0] eq "--nomail")
        {
            $mailmode = 0;
            next;
        }

        if ($ARGV[0] eq "-u" || $ARGV[0] eq "--unhyphen")
        {
            $unhyphenation = 1;
            next;
        }

        if ($ARGV[0] eq "+u" || $ARGV[0] eq "--nounhyphen")
        {
            $unhyphenation = 0;
            next;
        }

        if (($ARGV[0] eq "-a" || $ARGV[0] eq "--append") &&
            $ARGV[1])
        {
            if (-r $ARGV[1]) {
                $append_file = $ARGV[1];
            } else {
                print STDERR "Can't find or read $ARGV[1].\n";
            }
            shift @ARGV;
            next;
        }

        if ($ARGV[0] eq "+a" || $ARGV[0] eq "--noappend")
        {
            $append_file = 0;
            next;
        }

        if (($ARGV[0] eq "-t" || $ARGV[0] eq "--title") &&
            $ARGV[1])
        {
            $title = $ARGV[1];
            shift @ARGV;
            next;
        }

        if (($ARGV[0] eq "-ul" || $ARGV[0] eq "--ulength") &&
            $ARGV[1] =~ /^\d+$/)
        {
            $underline_length_tolerance = $ARGV[1];
            shift @ARGV;
            next;
        }

        if (($ARGV[0] eq "-uo" || $ARGV[0] eq "--uoffset") &&
            $ARGV[1] =~ /^\d+$/)
        {
            $underline_offset_tolerance = $ARGV[1];
            shift @ARGV;
            next;
        }

        if (($ARGV[0] eq "-tw" || $ARGV[0] eq "--tabwidth") &&
            $ARGV[1] =~ /^\d+$/)
        {
            $tab_width = $ARGV[1];
            shift @ARGV;
            next;
        }

        if (($ARGV[0] eq "-iw" || $ARGV[0] eq "--indentwidth") &&
            $ARGV[1] =~ /^\d+$/)
        {
            $indent_width = $ARGV[1];
            shift @ARGV;
            next;
        }

        if ($ARGV[0] eq "-v" || $ARGV[0] eq "--version")
        {
            print '$Header: /usr/local/db1/cvsroot/htdocs/WWW/tools/html_helpers/txt2html.pl,v 1.3 1996/09/23 18:04:34 lstein Exp $ ';
            print "\n";
            exit;
        }

        if ($ARGV[0] eq "-h" || $ARGV[0] eq "--help")
        {
            &usage;
            exit;
        }

        if (($ARGV[0] eq "-db" || $ARGV[0] eq "--debug") &&
            $ARGV[1] =~ /^\d+$/)
        {
            $dict_debug = $ARGV[1];
            shift @ARGV;
            next;
        }

        if ($ARGV[0] eq "--")
        {
            shift @ARGV;
            last;
        }

        print STDERR "Unrecognized option: $ARGV[0]\n";
        print STDERR " or bad paramater: $ARGV[1]\n" if($ARGV[1]);

        &usage;
        exit(1);

    } continue {

        shift @ARGV;
    }

    $preformat_trigger_lines = 0 if ($preformat_trigger_lines < 0);
    $preformat_trigger_lines = 2 if ($preformat_trigger_lines > 2);

    $endpreformat_trigger_lines = 1 if ($preformat_trigger_lines == 0);
    $endpreformat_trigger_lines = 0 if ($endpreformat_trigger_lines < 0);
    $endpreformat_trigger_lines = 2 if ($endpreformat_trigger_lines > 2);
}

sub is_blank
{
    return $_[0] =~ /^\s*$/;
}

sub escape
{
    $line =~ s/&/&/g;
    $line =~ s/>/>/g;
    $line =~ s/</</g;
    $line =~ s/\014/\n<hr>\n/g; # Linefeeds become horizontal rules
}

sub hrule
{
    if ($line =~ /^\s*([-_~=\*]\s*){$hrule_min,}$/)
    {
        $line = "<hr>\n";
        $prev =~ s/<p>//;
        $line_action |= $HRULE;
    }
}

sub shortline
{

    # Short lines should be broken even on list item lines iff the
    # following line is more text.  I haven't figured out how to do
    # that yet.  For now, I'll just not break on short lines in lists.
    # (sorry)

    if (!($mode & ($PRE | $LIST))
        && !&is_blank($line)
        && !&is_blank($prev)
        && ($prev_line_length < $short_line_length)
        && !($line_action & ($END | $HEADER | $HRULE | $LIST | $PAR))
        && !($prev_action & ($HEADER | $HRULE | $BREAK)))
    {
        $prev .= "<br>" . chop($prev);
        $prev_action |= $BREAK;
    }
}

sub mailstuff
{
    if ((($line =~ /^\w*>/)    # Handle "FF> Werewolves."
         || ($line =~ /^\w*\|/)) # Handle "Igor| There wolves."
        && !&is_blank($nextline))
    {
        $line =~ s/$/<br>/;
        $line_action |= ($BREAK | $MAILQUOTE);
        if(!($prev_action & ($BREAK | $PAR)))
        {
            $prev .= "<p>\n";
            $line_action |= $PAR;
        }
    } elsif (($line =~ /^From:? /)
             && &is_blank($prev))
    {
        &anchor_mail if !($prev_action & $MAILHEADER);
        $line =~ s/$/<br>/;
        $line_action |= ($BREAK | $MAILHEADER);
    } elsif (($line =~ /^[\w\-]*:/)  # Handle "Some-Header: blah"
             && ($prev_action & $MAILHEADER)
             && !&is_blank($nextline))
    {
        $line =~ s/$/<br>/;
        $line_action |= ($BREAK | $MAILHEADER);
    } elsif (($line =~ /^\s+\S/) &&   # Handle multi-line mail headers
             ($prev_action & $MAILHEADER) &&
             !&is_blank($nextline))
    {
        $line =~ s/$/<br>/;
        $line_action |= ($BREAK | $MAILHEADER);
    }
}

# Subtracts modes listed in $mask from $vector.
sub subtract_modes
{
    local($vector, $mask) = @_;
    ($vector | $mask) - $mask;
}

sub paragraph
{
    if(!&is_blank($line)
       && !($mode & $PRE)
       && !&subtract_modes($line_action, $END | $MAILQUOTE | $CAPS | $BREAK)
       && (&is_blank($prev)
           || ($line_action & $END)
           || ($line_indent > $prev_indent + $par_indent)))
    {
        $prev .= "<p>\n";
        $line_action |= $PAR;
    }
}

# If the line is blank, return the second argument.  Otherwise,
# return the number of spaces before any nonspaces on the line.
sub count_indent
{
    local($line, $prev_length) = @_;
    if(&is_blank($line))
    {
        return $prev_length;
    }
    local($ws) = $line =~ /^( *)[^ ]/;
    length($ws);
}

sub listprefix
{
    local($line) = @_;
    local($prefix, $number, $rawprefix);

    return (0,0,0) if (!($line =~ /^\s*[-=\*o]+\s+\S/ ) &&
                       !($line =~ /^\s*(\d+|[a-zA-Z])[\.\)\]:]\s+\S/ ));

    ($number) = $line =~ /^\s*(\d+|[a-zA-Z])/;

    # That slippery exception of "o" as a bullet
    # (This ought to be determined more through the context of what lists
    #  we have in progress, but this will probably work well enough.)
    if($line =~ /^\s*o\s/)
    {
        $number = 0;
    }

    if ($number)
    {
        ($rawprefix) = $line =~ /^(\s*(\d+|[a-zA-Z]).)/;
        $prefix = $rawprefix;
        $prefix =~ s/(\d+|[a-zA-Z])//;  # Take the number out
    } else {
        ($rawprefix) = $line =~ /^(\s*[-=o\*]+.)/;
        $prefix = $rawprefix;
    }
    ($prefix, $number, $rawprefix);
}

sub startlist
{
    local($prefix, $number, $rawprefix) = @_;

    $listprefix[$listnum] = $prefix;
    if($number)
    {
        # It doesn't start with 1,a,A.  Let's not screw with it.
        if (($number != 1) && ($number ne "a") && ($number ne "A"))
        {
            return;
        }
        $prev .= "$list_indent<ol>\n";
        $list[$listnum] = $OL;
    } else {
        $prev .= "$list_indent<ul>\n";
        $list[$listnum] = $UL;
    }
    $listnum++;
    $list_indent = " " x $listnum x $indent_width;
    $line_action |= $LIST;
    $mode |= $LIST;
}


sub endlist                     # End N lists
{
    local($n) = @_;
    for(; $n > 0; $n--, $listnum--)
    {
        $list_indent = " " x ($listnum-1) x $indent_width;
        if($list[$listnum-1] == $UL)
        {
            $prev .= "$list_indent</ul>\n";
        } elsif($list[$listnum-1] == $OL)
        {
            $prev .= "$list_indent</ol>\n";
        } else
        {
            print STDERR "Encountered list of unknown type\n";
        }
    }
    $line_action |= $END;
    $mode ^= ($LIST & $mode) if (!$listnum);
}

sub continuelist
{
    $line =~ s/^\s*[-=o\*]+\s*/$list_indent<li> / if $list[$listnum-1] == $UL;
    $line =~ s/^\s*(\d+|[a-zA-Z]).\s*/$list_indent<li> /
        if $list[$listnum-1] == $OL;
    $line_action |= $LIST;
}

sub liststuff
{
    local($i);

    local($prefix, $number, $rawprefix) = &listprefix($line);

    $i = $listnum;
    if (!$prefix)
    {
        return if !&is_blank($prev); # inside a list item

        # This ain't no list.  We'll want to end all of them.
        return if !($mode & $LIST);     # This just speeds up the inevitable
        $i = 0;
    } else
    {
        # Maybe we're going back up to a previous list
        $i-- while (($prefix ne $listprefix[$i-1]) && ($i >= 0));
    }

    if (($i >= 0) && ($i != $listnum))
    {
        &endlist($listnum - $i);
    } elsif (!$listnum || $i != $listnum)
    {
        &startlist($prefix, $number, $rawprefix);
    }

    &continuelist($prefix, $number, $rawprefix) if ($mode & $LIST);
}

sub endpreformat
{
    if(!($line =~ /\s{$preformat_whitespace_min,}\S+/) &&
       ($endpreformat_trigger_lines == 1 ||
        !($nextline =~ /\s{$preformat_whitespace_min,}\S+/)))
    {
        $prev =~ s#$#\n</pre>#;
        $mode ^= ($PRE & $mode);
        $line_action |= $END;
    }
}

sub preformat
{
    if($preformat_trigger_lines == 0 ||
       (($line =~ /\s{$preformat_whitespace_min,}\S+/) &&
        ($preformat_trigger_lines == 1 ||
         $nextline =~ /\s{$preformat_whitespace_min,}\S+/)))
    {
        $line =~ s/^/<pre>\n/;
        $prev =~ s/<p>//;
        $mode |= $PRE;
        $line_action |= $PRE;
    }
}

sub make_new_anchor
{
    local($anchor, $i);

    return "null" if(!$heading_level);
    $anchor = "section-";
    $heading_count[$heading_level-1]++;

    # Reset lower order counters
    for($i=$#heading_count + 1; $i > $heading_level; $i--)
    {
        $heading_count[$i-1] = 0;
    }

    for($i=0; $i < $heading_level; $i++)
    {
        $heading_count[$i] = 1 if !$heading_count[$i]; # In case they skip any
        $anchor .= sprintf("%d.", $heading_count[$i]);
    }
    chop($anchor);
    $anchor;
}

sub anchor_mail
{
    local($text) = $line =~ /\S+: *(.*) *$/;
    local($anchor) = &make_new_anchor($text);
    $line =~ s/(.*)/<a name="$anchor">$1<\/a>/;
}

sub anchor_heading
{
    local($heading) = @_;
    local($anchor) = &make_new_anchor($heading);
    $line =~ s/(<h.>.*<\/h.>)/<a name="$anchor">$1<\/a>/;
}

sub heading
{
    local($hoffset, $heading) = $line =~ /^(\s*)(.+)$/;
    local($uoffset, $underline) = $nextline =~ /^(\s*)(\S+)\s*$/;

    local($lendiff, $offsetdiff);
    $lendiff = length($heading) - length($underline);
    $lendiff *= -1 if $lendiff < 0;

    $offsetdiff = length($hoffset) - length($uoffset);
    $offsetdiff *= -1 if $offsetdiff < 0;

    if(&is_blank($line)
       ||($lendiff > $underline_length_tolerance)
       ||($offsetdiff > $underline_offset_tolerance))
    {
        return;
    }

    $underline = substr($underline,0,1);

    $underline .= "C" if &iscaps($line); # Call it a different style if the
                                         # heading is in all caps.
    $heading_styles{$underline} = ++$num_heading_styles
        if !$heading_styles{$underline};

    $heading_level = $heading_styles{$underline};

    $nextline = &getline;             # Eat the underline
    &tagline("h${heading_level}");
    &anchor_heading($heading);
    $line_action |= $HEADER;
}

sub unhyphenate
{
    local($second);

    # This looks hairy because of all the quoted characters.
    # All I'm doing is pulling out the word that begins the next line.
    # Along with it, I pull out any punctuation that follows.
    # Preceding whitespace is preserved.  We don't want to screw up
    # our own guessing systems that rely on indentation.
    ($second) = $nextline =~ /^\s*([a-zA-Z]+[\)\}\]\.,:;\'\"\>]*\s*)/; # "
    $nextline =~ s/^(\s*)[a-zA-Z]+[\)\}\]\.,:;\'\"\>]*\s*/$1/; # "
    # (The silly comments are for my less-than-perfect code hilighter)

    $line =~ s/\-\s*$/$second/;
    $line .= "\n";
}

sub untabify
{
    local($line) = @_;
    while($line =~ /\011/)
    {
        $line =~ s/\011/" " x ($tab_width - (length($`) % $tab_width))/e;
    }
    $line;
}

sub tagline
{
    local($tag) = @_;
    chop $line;                 # Drop newline
    $line =~ s/^\s*(.*)$/<$tag>$1<\/$tag>\n/;
}

sub iscaps
{
    local($_) = @_;
    /^[^a-z<]*[A-Z]{$min_caps_length,}[^a-z<]*$/;
}

sub caps
{
    if(&iscaps($line))
    {
        &tagline($caps_tag);
        $line_action |= $CAPS;
    }
}

# Convert very simple globs to regexps
sub glob2regexp
{
    local($glob) = @_;
    # Escape funky chars
    $glob =~ s/[^\w\[\]\*\?\|\\]/\\$&/g;
    local($regexp,$i,$len,$escaped) = ("",0,length($glob),0);

    for(;$i < $len; $i++)
    {
        $char = substr($glob,$i,1);
        if($escaped)
        {
            $escaped = 0;
            $regexp .= $char;
            next;
        }
        if ($char eq "\\") {
            $escaped = 1; next;
            $regexp .= $char;
        }
        if ($char eq "?") {
            $regexp .= "."; next;
        }
        if ($char eq "*") {
            $regexp .= ".*"; next;
        }
        $regexp .= $char;       # Normal character
    }
    "\\b" . $regexp . "\\b";
}

sub add_regexp_to_links_table
{
    local($key,$URL,$switches) = @_;
        # No sense adding a second one if it's already in there.
        # It would never get used.
        if(!$links_table{$key})
        {
            # Keep track of the order they were added so we can
            # look for matches in the same order
            push(@links_table_order, ($key));

            $links_table{$key} = $URL;        # Put it in The Table
            $links_switch_table{$key} = $switches;
            print STDERR
 " ($#links_table_order)\tKEY: $key\n\tVALUE: $URL\n\tSWITCHES: $switches\n\n"
                if ($dict_debug & 1);
        } else
        {
            if($dict_debug & 1) {
                print STDERR " Skipping entry.  Key already in table.\n";
                print STDERR "\tKEY: $key\n\tVALUE: $URL\n\n";
            }
        }
}

sub add_literal_to_links_table
{
    local($key,$URL,$switches) = @_;
    $key =~ s/(\W)/\\\1/g; # Escape non-alphanumeric chars
    $key = "\\b$key\\b"; # Make a regexp out of it
    &add_regexp_to_links_table($key,$URL,$switches);
}

sub add_glob_to_links_table
{
    local($key,$URL,$switches) = @_;
    &add_regexp_to_links_table(&glob2regexp($key),$URL,$switches);
}

# This is the only function you should need to change if you want to
# use a different dictionary file format.
sub parse_dict
{
    local($dictfile, $dict) = @_;

    print STDERR "Parsing dictionary file $dictfile\n" if ($dict_debug & 1);

    $dict =~ s/^\#.*$//g;        # Strip lines that start with '#'
    $dict =~ s/^.*[^\\]:\s*$//g; # Strip lines that end with unescaped ':'

    if($dict =~ /->\s*->/)
    {
        $message = "Two consecutive '->'s found in $dictfile\n";

        # Print out any useful context so they can find it.
        ($near) = $dict =~ /([\S ]*\s*->\s*->\s*\S*)/;
        $message .= "\n$near\n" if $near =~ /\S/;
        die $message;
    }

    while($dict =~ /\s*(.+)\s+\-+([ieh]+\-+)?\>\s*(.*\S+)\s*\n/ig)
    {
        local($key, $URL,$switches,$options);
        $key = $1;
        $options = $2;
        $URL = $3;
        $switches = 0;
        $switches += 1 if $options =~ /i/i; # Case insensitivity
        $switches += 2 if $options =~ /e/i; # This could be interesting
        $switches += 4 if $options =~ /h/i; # provides HTML, not just URL

        $key =~ s/\s*$//;       # Chop trailing whitespace

        if($key =~ m|^/|)       # Regexp
        {
            $key = substr($key,1);
            $key =~ s|/$||;     # Allow them to forget the closing /
            &add_regexp_to_links_table($key,$URL,$switches);
        } elsif($key =~ /^\|/)  # alternate regexp format
        {
            $key = substr($key,1);
            $key =~ s/\|$//;    # Allow them to forget the closing |
            $key =~ s|/|\\/|g;  # Escape all slashes
            &add_regexp_to_links_table($key,$URL,$switches);
        } elsif ($key =~ /\"/)
        {
            $key = substr($key,1);
            $key =~ s/\"$//;    # Allow them to forget the closing "
            &add_literal_to_links_table($key,$URL,$switches);
        } else
        {
            &add_glob_to_links_table($key,$URL,$switches);
        }
    }
}

sub in_link_context
{
    local($match, $before) = @_;
    return 1 if $match =~ m@</?a>@; # No links allowed inside match

    local($final_open, $final_close);
    $final_open = rindex($before, "<a ") - $[;
    $final_close = rindex($before, "</a>") - $[;

    ($final_open >= 0)          # Link opened
        && (($final_close < 0)  # and not closed    or
            || ($final_open > $final_close)); # one opened after last close
}

# This subroutine looks a little odd.  Rather than build up some code
# and keep "eval"ing later, I'm building a new subroutine.  This way I
# can declare local vars and not worry about the namespace in the
# calling context.  I don't know how much it really gains me, but I
# don't know of any real costs and it seems like it could be
# friendlier to optimization.  (Plus it's cool to define new
# subroutines at runtime.  :-)
sub make_dictionary_links_code
{
    local($i,$pattern,$switches,$options,$code,$href);
    $code = <<EOCode;
sub dynamic_make_dictionary_links
{
    local(\$line_link) = (\$line_action | \$LINK);
    local(\$before,\$linkme,\$line_with_links);
EOCode
    for($i=1; $i <= $#links_table_order; $i++)
    {
        $pattern = $links_table_order[$i];
        $key = $pattern;
        $switches = $links_switch_table{$key};
        
        $s_sw = "";             # Options for searching
        $s_sw .= "i" if($switches & 1);
        
        $r_sw = "";             # Options for replacing
        $r_sw .= "i" if($switches & 1);
        $r_sw .= "e" if($switches & 2);

        $href = $links_table{$key};

        $href =~ s@/@\\/@g;
        $href = '<a href="' . $href . '">$&<\\/a>'
            if !($switches & 4);

        $code .= <<EOCode;

    \$line_with_links = "";
    while(\$line =~ /$pattern/$s_sw)
    {
        \$link_line = $LINK if(!\$link_line);
        \$before = \$\`;
        \$linkme = \$&;

        \$line = substr(\$line, length(\$before) + length(\$linkme));
        
        \$linkme =~ s/$pattern/$href/$r_sw
            if(!&in_link_context(\$linkme,\$before));
        \$line_with_links .= \$before . \$linkme;
    }
    \$line = \$line_with_links . \$line;
EOCode
    }
    $code .= <<EOCode;

    \$line_action |= \$line_link; # Cheaper to only to do bitwise OR once.
}
EOCode
    print STDERR "$code" if ($dict_debug & 2);
    eval "$code";
    if($@)
    {
        print STDERR "Problem making dictionary eval code\n";
        die $@;
    }
}

sub load_dictionary_links
{
    local($dict, $contents);
    local($i);
    @links_table_order = 0;
    %links_table = 0;
    for($i=1; $i <= $#links_dictionaries; $i++)
    {
        $dict= $links_dictionaries[$i];

        (-r "$dict") || die "Dictionary file $dict not found or unreadable";
        open(DICT, "$dict") || die "Can't open Dictionary file $dict";

        $contents = "";
        $contents .= $_ while(<DICT>);
        close(DICT);
        &parse_dict($dict, $contents);
    }
    &make_dictionary_links_code;
}

sub make_dictionary_links
{
    eval "&dynamic_make_dictionary_links;";
    warn $@ if $@;
}

sub getline
{
    local($line);
    $line = <>;
    $line =~ s/[ \011]*\015$//; # Chop trailing whitespace and DOS CRs
    $line = &untabify($line);   # Change all tabs to spaces
    $line;
}

sub main
{
    push(@links_dictionaries,($default_link_dict))
        if ($make_links && (-f $default_link_dict));
    &deal_with_options;
    if($make_links)
    {
        push(@links_dictionaries,($system_link_dict)) if -f $system_link_dict;
        &load_dictionary_links;
    }
    if(!$extract)
    {
        print "<html>\n";
        print "<head>\n";

        # It'd be nice if we could guess a title from the first header,
        # but even that would be too late if we're doing this in one pass.
        print "<title>$title\n" if($title);

        print "\n";
                                
        print "\n";
    }

    $prev_line_length = 0;
    $prev_indent = 0;
    $prev     = "";
    $line     = &getline;
    $nextline = &getline if $line;
    $line = &untabify($line);
    do
    {
        $line_length = length($line); # Do this before tags go in
        $line_indent = &count_indent($line, $prev_indent);

        &escape;

        &endpreformat if (($mode & $PRE) && ($preformat_trigger_lines != 0));

        &hrule if !($mode & $PRE);


        &heading   if (!($mode & $PRE) &&
                       $nextline =~ /^\s*[=\-\*\.~\+]+\s*$/);

        &liststuff if (!($mode & $PRE) &&
                       !&is_blank($line));

        &mailstuff if ($mailmode &&
                       !($mode & $PRE) &&
                       !($line_action & $HEADER));

        &preformat if (!($line_action & ($HEADER | $LIST | $MAILHEADER)) &&
                       !($mode & ($LIST | $PRE)) &&
                       ($endpreformat_trigger_lines != 0));

        ¶graph;
        &shortline;

        &unhyphenate if ($unhyphenation &&
                         ($line =~ /[a-zA-Z]\-$/) && # ends in hyphen
                         # next line starts w/letters
                         ($nextline =~ /^\s*[a-zA-Z]/) &&
                         !($mode & ($PRE | $HEADER | $MAILHEADER | $BREAK)));

        &make_dictionary_links if ($make_links
                                   && !&is_blank($line)
                                   && $#links_table_order);

        &caps if  !($mode & $PRE);

        # Print it out and move on.

        print $prev;

        if (!&is_blank($nextline))
        {
            $prev_action = $line_action;
            $line_action     = $NONE;
            $prev_line_length = $line_length;
            $prev_indent = $line_indent;
        }

        $prev = $line;
        $line = $nextline;
        $nextline = &getline if $nextline;
    } until (!$nextline && !$line && !$prev);

    $prev = "";
    &endlist($listnum) if ($mode & $LIST); # End all lists
    print $prev;

    print "\n";

    print "
\n" if ($mode & $PRE); if ($append_file) { if(-r $append_file) { open(APPEND, $append_file); print while ; } else { print STDERR "Can't find or read file $append_file to append.\n"; } } if(!$extract) { print "\n"; print "\n"; } } &main();