#!/usr/local/bin/perl #print STDERR "ARGS:\n" . join("\n", @ARGV); $TABLE_SCRIPT = "/home/guru/ps2html/v2/table_parser.pl"; #$TABLE_SCRIPT = "./table_parser.pl"; $TABLE_FILE = "/home/guru/ps2html/v2/mln.table"; $VERBOSE = 1; $TXT2HTML = 1; # # These are the three special characters that must be escaped in # any HTML document. These values are used in the do_text subroutine # $NOT_HTML_CHARS{"<"} = "<"; $NOT_HTML_CHARS{">"} = ">"; $NOT_HTML_CHARS{"&"} = "&"; $NOT_HTML_CHARS_PATTERN = "[" . join("", keys %NOT_HTML_CHARS) . "]"; if(1 != do $TABLE_SCRIPT) { print STDERR "Unable to load the table parser from file:\n"; print STDERR "$TABLE_SCRIPT\n"; print STDERR "Please make sure that $0 is configured correctly.\n"; exit(-1); } # This is the newgetopt perl package which should have been installed # when perl was installed. It ought to reside in a directory with # a name like `/usr/lib/perl' or `/usr/local/lib/perl' # # If this doesn't load up, complain to your unix system administrator. require("newgetopt.pl"); if(0 == &NGetOpt("table=s", "input=s", "output=s", "pagenum=i", "d")) { print STDERR "ERROR: Unable to parse command line options.\n"; exit(-1); } # This next part is tricky, there are two ways to define # the table file, manually with "-table filename" on the # command line, or just "filename" if "filename" is the first # argument and "-table" is not any other argument. # # The reasoning behind this weirdness is that it makes it # easy to make table files executable with a magic number # of: # # #!/usr/local/bin/perl /somepath/txt2html.pl # # which causes the kernel to automagically make the filename # the first argument to the txt2html.pl. Most (if not all) # kernels only allow one argument in a #! call and we have to # use it up with "/somepath/txt2html.pl". Note, that the # "/somepath/" component is required in the table file header if(defined($opt_table)) { $TABLE_FILE = $opt_table; } if(defined($opt_input)) { close(STDIN); $result = open(STDIN, $opt_input); if($result == 0) { print STDERR "ERROR: Unable to open $opt_input for reading.\n"; exit(-1); } } if(defined($opt_output)) { close(STDOUT); $result = open(STDOUT, ">$opt_output"); if($result == 0) { print STDERR "ERROR: Unable to open $opt_output for writing.\n"; exit(-1); } } if(defined($opt_pagenum)) { $pagenum = $opt_pagenum; } else { $pagenum = 696; } &load_table($TABLE_FILE); $CHAR_TRANS_PATTERN = '(' . join("|", keys %char_trans) . ')'; $CHAR_TRANS_PATTERN =~ s/\\/\\\\/g; while() { if(/^S (\d+) (\d+) \((.+)\) (\d+)\s*$/) { &do_text($1, $2, $3); # Text string } elsif(/^F (\d+) (\d+) \((.+)\)\s*$/) { # Font Change &do_newfont($1, $2, $3); } elsif(/^P\s*$/) { # Page Break &do_pagebreak(*text, $pagenum); print stderr "$pagenum, "; $pagenum++; } else { print stderr "Bad line in text file:\n"; print stderr $_; exit(-1); } } print stderr "\n"; if(!defined $general{"spell_checker"}) { &info_msg( "** No hyphenation checker defined, assuming all", "** hyphenation is correct. Probably want to spell", "** check the final output for words stuck together." ); } else { &info_msg( "** Dehyphenating:"); } foreach(split(/,\s*/, $general{"line_break_sections"})) { $lb_sections{$_} = 1; } foreach(sort keys %texts) { if($_ eq "nada") { next; } &info_msg("\t$_"); *text = eval "*$_"; &char_trans(*text); print STDERR "char\n"; if($lb_sections{$_} == 1) { &add_line_breaks(*text); print STDERR "Line Breaks\n"; } elsif(defined $general{"spell_checker"}) { &dehyphenate(*text); print STDERR "dehyphenate\n"; } &clean_up(*text); print STDERR "cleaned\n"; @textA = &set_width(*text, 70); &warn_trans(*textA, $_); $text = join("\n", @textA); if(!defined($general{"page_break_section"})) { &fix_pagebreaks($general{"page_break_pattern"}, *text); } } if(defined($general{"page_break_section"})) { *text = eval '*' . $general{"page_break_section"}; &fix_pagebreaks($general{"page_break_pattern"}, *text); } print eval '"' . $output_format . '"'; # # Subroutine called on each line of new text # # Arguments are: x, y, string # sub do_text { local($x,$y,$cur_str) = @_; local(@current_style); # # Check for gutter margin adjustment # if(0 == $pagenum % 2) { $x -= $general{"gutter"}; } # # Check if text is outside the official margins # if( ($x < $general{"left_margin"}) || ($x > $general{"right_margin"}) ||($y > $general{"top_margin"}) || ($y < $general{"bottom_margin"})) { return; } $nice_str = $cur_str; $nice_str =~ s/[\[\]\(\)\$\\\|]/\\$&/g; @current_style = ( "^$x,$y,$nice_str\$", "^$x,$y,$nice_str\$", "^,$y,$nice_str\$", "^$x,,$nice_str\$", "^,,$nice_str\$", "^$x,$y,\$", "^,$y,\$", "^$x,,\$", "^,,\$" ); $current_style = join("|", @current_style); # This command translates only basic HTML related characters. # for example <, >, and &. It is called on each line of # text read in, so the quicker it is, the better. # # All the other translations (which are likely document and # document creator specific) are done by &char_trans after the # text is completely read in. $* = 1; $cur_str =~ s/$NOT_HTML_CHARS_PATTERN/$NOT_HTML_CHARS{$&}/g; $* = 0; # # If this is a new line, check the newline only # text section definitions first # if($y != $last_y) { $text .= "\r"; foreach $set ($current_section, "BaSe") { if(1 == &do_ts_match("newline", $set, $current_style, $cur_str)) { return; } } } foreach $set ($current_section, "BaSe") { if(1 == &do_ts_match("all", $set, $current_style, $cur_str)) { return; } } # # Unable to find a text section definition to match # the current text style # print stderr "\n** No type section definition found for:\n"; printf(stderr " Font: %s, %5.5s pts\n", $cur_font, $cur_size); printf(stderr " X: %-5.5s Y: %-5.5s\n", $x, $y); print stderr " Page #: $pagenum\n"; printf(stderr " Text: \"%s\"\n", $cur_str); print stderr " Current Section: $current_section\n"; printf(stderr " Previous text: %s\n", substr($text, -90)); exit(-1); } sub do_newfont { local($height, $width, $fname) = @_; $fname =~ s/^\|_+(.*)$/$1/; $cur_font = $fname; $cur_size = sprintf("%4.2f", $height / 12); } sub do_ts_match { local($where, $sec, $current_style, $cur_str) = @_; local($text_section, $start, $stop); local($ts_name) = &mktsvar($sec, $where, $cur_font, $cur_size); if(-1 == eval "\$#$ts_name") { return(0); } foreach $text_section (eval "@$ts_name") { if($text_section =~ /$current_style/) { # $text_section now has "$x,$y,$cur_str" $current_section = $sections{"$cur_font,$cur_size,$text_section,$ts_name"}; if (defined($opt_d)) { print STDERR "MATCHED: $cur_str\n$current_section\n"; print STDERR "x: $x y: $y $cur_font ($cur_size) Page: $pagenum\n"; } $current_section = &do_codes($current_section); if (defined($opt_d)) { print STDERR "CLOSE: |" . join("|", @close); print STDERR "\n$current_section\n\n"; } $text .= $cur_str; $last_x = $x; $last_y = $y; return 1; # Match found, everybody is happy } } return(0); # No match found } # # This routine takes care of managing the state stack and # entering the proper HTML codes when an old state is exited # and a new state entered. Note that section names are used # to represent states and each word in a section name indicates # a separate state with start and stop codes # sub do_codes { local($cur_section) = @_; local(@new_name, $i, $name); @new_name = split(/\s+/, $cur_section); if($#new_name < $#state_stack) { $min = $#new_name; } else { $min = $#state_stack; } # # Check for any differences in the `base' path # for($i = 0; $i <= $min; $i++) { if($new_name[$i] ne $state_stack[$i]) { last; } } # # If there are no differences, just add # any new stuff to the old stuff # if($i == $#state_stack+1) { *text = eval "*$new_name[0]"; for(;$i <= $#new_name; $i++) { push(@state_stack, $new_name[$i]); $name = join(" ", @new_name[0..$i]); push(@close, $codes_stop{$name}); $text .= eval $codes_start{$name}; } } else { while($#state_stack > $i-1) { $text . = eval pop(@close); pop(@state_stack); } *text = eval "*$new_name[0]"; while($#state_stack < $#new_name) { push(@state_stack, $new_name[$i]); $name = join(" ", @new_name[0..$i]); push(@close, $codes_stop{$name}); $text .= eval $codes_start{$name}; $i++; } } $texts{$new_name[0]}++; # Keep track of all text # sections so we can dehyphenate # Used in case the current_section has changed return join(" ", @state_stack); } # # This is an implementation of the spell check routine used to # verify the dehyphenization that ps2html does. It makes use # of an external program to verify all the words at once. # # Define $general{"spell_checker"} = "/bin/spell" to make this routine work. # Additionally, you don't need to use the hyphen_init # routine if you use this method of spell checking. sub hyphen_check { local(@words) = @_; local(%fixed_words, @rejects, @real_rejects, @broken); if(1 != open(HYPHENWORDS, "> /tmp/ps2html_hyphen.$$")) { print stderr "Unable to write to /tmp/ps2html_spell.$$, "; print stderr "please check configuration.\n"; exit(-1); } for ($i=0;$i <= $#words; $i++) { $_ = $words[$i]; # Get rid of any HTML tags s/<.*>//g; s/&#\d\d\d//g; s/<//g; s/&//g; # Get rid of any additional punctation, note that # we don't get rid of hyphens because they may legitimately # occur in a word, and the one hyphen we are checking has # already been removed before it got here. tr/!.,?()[]{}"';:%$^&|//d; $fixed_words{$_} = $words[$i]; } print HYPHENWORDS join("\n", keys %fixed_words); close(HYPHENWORDS); @rejects = `$general{"spell_checker"} /tmp/ps2html_hyphen.$$`; unlink("/tmp/ps2html_hyphen.$$"); chop(@rejects); foreach $word (@rejects) { # # This error checking is necessary because /bin/spell # is broken in that given a hyphenated word, it breaks # it into word parts to check the spelling of each. # When it can't match the spelling on a word, it doesn't # rehyphenate and this program can't realistically put # the hyphenation back in that case, so we report it # and let the user manually fix the errors. Hopefully # there will be relatively few such errors. # if(!defined $fixed_words{$word}) { push(@broken, $word); } else { push(@real_rejects, $fixed_words{$word}); } } if($#broken != -1) { &info_msg("-- Funky hyphenation for these word parts:"); foreach (&array2lines(*broken, 70, ", ")) { &info_msg("** $_"); } } return(@real_rejects); } sub dehyphenate { local(*text) = $_[0]; local(@words, $word, $i, @rejects, %check, @slots); local(@assumed_ok, @kept_hyphen); local($*) = 1; @words = split(/ /, $text); for ($i=0; $i <= $#words; $i++) { $_ = $words[$i]; if(/^\s*(.*--)(.*)\-\r(.*)\s*$/) { # EM dash before hyphen $word = $1 . $2 . $3; $words[$i] = $word; } elsif(/^\s*(.*)\-\r(.*)(--.*)\s*$/) { # EM dash after hyphen $word = $1 . $2 . $3; $words[$i] = $word . $3; } elsif(/^\s*(.*)\-\r(.*)\s*$/) { # No EM dash at all. $word = $1 . $2; if($word =~ /^[A-Z]/) { $words[$i] = $word; } } else { next; } if($word eq $words[$i]) { $word =~ tr/!.,?()[]{}";:%$^&|//d; push(@assumed_ok, $word); } else { if($check{$word} ne "") { $check{$word} .= ","; } $check{$word} .= $i; } } if($#assumed_ok != -1) { @rejects = &hyphen_check(@assumed_ok); if($#rejects != -1) { &info_msg("-- Assuming these words don't require hyphens:"); foreach (&array2lines(*rejects, 70, ", ")) { &info_msg("** $_"); } } } @rejects = &hyphen_check(keys %check); # For each rejected word, leave the hyphen in foreach $word (@rejects) { @slots = split(/,/, $check{$word}); foreach $i (@slots) { #$words[$i] =~ s/\r/#R#/g; $words[$i] =~ tr/\r//d; $_ = $words[$i]; } tr/!.,?()[]{}";:%$^&|//d; push(@kept_hyphen, $_); delete $check{$word}; } if($#kept_hyphen != -1) { &info_msg("-- Leaving hyphenation in for these words:"); foreach (&array2lines(*kept_hyphen, 70, ", ")) { &info_msg("** $_"); } } # All the rest of the words must be ok, so dehyphenate them foreach $word (keys %check) { @slots = split(/,/, $check{$word}); foreach $i (@slots) { $words[$i] = $word; } delete $check{$word}; } $text = join(" ", @words); $text =~ tr/\r/ /; # Take care of all the other linebreaks return(1); } # This routine takes a pointer to an array, a width and a seperator # string as arguments and returns an array of strings containing the # values in the specified array joined by the seperator, each line # not exceeding the specified width. # # Thus it is useful for printing arrays that are longer than a screen # line. # # Note this version of array2lines has been slightly modified with # the foreknowledge that we are dealing with \n separated lines, both # before and after the conversion. sub array2lines { local(*list, $max_width, $seperator) = @_[0,1,2]; local($width, $i, @answer, $sep_len, $line, @templines); $sep_len = length($seperator); for($width=0, $i=0; $i <= $#list; $i++) { if($list[$i] =~ /\n/) { @templines = split(/\n/, $list[$i]); if(defined($line)) { if(length($list[$i] . $templines[0]) > $max_width) { push(@answer, $line, @templines); $width = 0; undef $line; } else { push(@answer, $line . shift(@templines), @templines[0..$#templines-1]); $line = pop(@templines) . $seperator; $width = length($line); } } else { push(@answer, @templines[0..$#templines-1]); $line = pop(@templines) . $seperator; $width = length($line); } next; } if(length($list[$i]) > $max_width) { if(defined($line)) { push(@answer, $line); undef $line; $width = 0; } push(@answer, $list[$i]); next; } $width += length($list[$i]); if($width > $max_width) { $line = substr($line, 0, length($line)-$sep_len); push(@answer, $line); undef $line; $width = 0; redo; } elsif($width + $sep_len > $max_width) { $line .= $list[$i]; push(@answer, $line); undef $line; $width = 0; } else { $line .= $list[$i] . $seperator; $width += $sep_len; } } if(defined $line) { $line = substr($line, 0, length($line)-$sep_len); push(@answer, $line); } return(@answer); } # Do the non-HTML tag related character translations sub char_trans { local(*text) = $_[0]; local($*) = 1; local($key); $text =~ s/$CHAR_TRANS_PATTERN/$char_trans{$1}/g; } # Warn about non-ascii characters that weren't translated # Need to pass in an array of lines, and the name of the # section being checked. sub warn_trans { local(*text_list, $section) = @_[0,1]; local(@bad,$i); for ($i=0; $i <= $#text_list; $i++) { $_ = $text_list[$i]; if(/\\\d\d\d/) { push(@bad, "** $i: $_"); } } if($#bad > -1) { &info_msg( "-- Unable to translate these non-ASCII characters:", @bad ); } } # Replaces carriage returns '\r' with the HTML line break
sub add_line_breaks { local(*text) = $_[0]; local($*) = 1; $text =~ s/\r/
\n/g; } # Breaks the text up into an array of lines, each no longer than # $width. sub set_width { local(*text, $width) = @_[0,1]; local(@lines); @lines = split(/ /,$text); &array2lines(*lines, $width, " "); } sub clean_up { local(*text) = $_[0]; local($*) = 1; # # This changes the characteristic postscript \( and \) # strings to ( and ) respectively. It probably ought to # be handled by the general char_trans() routine, but I # can't figure out how to get a table file to nicely specify # them. As this script will always be reading the output # of the ps2txt program, it will always need to do this kind # of translation, so it is paradigmatically correct to put them # here anyway. # $text =~ s/\\\)/)/g; $text =~ s/\\\(/(/g; # These two deal with spaces that are caused by line breaks # in the postscript. The code replaces carriage returns (\r) # which are used to represent line breaks from the postscript # and would otherwise be converted to a spaces. $text =~ s/^\r//; $text =~ s/\n\r/\n/g; # Makes sure an open paren is preceded by whitespace $text =~ s/([^\s\>])\(/$1 \(/g; # Makes sure an open bracket is preceded by whitespace $text =~ s/([^\s\>])\[/$1 \[/g; # Deal with poor spacing around HTML tags (like italics and stuff) # " word5" - > "word5" $text =~ s/([^\s\>]<[^\/>]+>) +(\S)/$1$2/g; # "word6 " - > "word6" $text =~ s/(\S) +(<\/[^>]+>)/$1$2/g; # Open tags - "word1word2" -> "word1 word2" $text =~ s/([^\s\>])(<[^\/>]+>)\s*(\S)/$1 $2$3/g; # Close tags - "word3word4" -> "word3 word4" $text =~ s/(\S)\s*(<\/[^>]+>)([^\s\<])/$1$2 $3/g; # These are for extra spaces inside the text of a link # It should be ok as the above two commands guarantee that # there should be enough space outside of the link tags # Space before - Hello<\a> $text =~ s/(<[Aa]\s+[^>]+>)\s([^<]+)(<\/[Aa]>)/$1$2$3/g; # Space before - Goodbye <\a> $text =~ s/(<[Aa]\s+[^>]+>)(.+)\s(<\/[Aa]>)/$1$2$3/g; # These could probably be combined into one # substitute command to speed things up, but # I leave it this way to improve code understandability # Removes whitespace after an open paranthese "( " $text =~ s/\(\s+/(/g; # Removes whitespace before a close paranthese " )" $text =~ s/\s+\)/)/g; # Removes whitespace before punctuation marks: ,.!:; $text =~ s/\s+([\.\,\!\:\;])/$1/g; } sub info_msg { local(@message) = @_; if($VERBOSE) { print stderr join("\n", @message) . "\n"; } push(@info_messages, @message); } # This routine makes sure that the pagebreak marker is not # in the middle of a hyphenated word. Instead, it moves it # to follow the word, and dehyphenates the word. # # No particular warning is printed about the dehyphenization, # it just does it and then warns to spell check the final # document sub fix_pagebreaks { local($pat,*text) = @_[0,1]; local($*) = 1; if($pat eq "") { &info_mesg("ERROR: Empty page break pattern in &fix_pagebreaks()"); return; } # There is something broken with perl, if I don't (\s) at the begining # and just use $1$3$2 to substitute, $1 is always empty... if($text =~ s/\s(\S+)\-\s*($pat)\s*(\S+)/ $1$3$2/g) { &info_msg("-- Be sure to spell check the final document! --"); } $text =~ s/(\s*$pat\s*)<\/a>/<\/a>\n$1/g; } # # This routine prompts the user to # manually review and edit the html output. # If we are in batch mode, then no prompts are issued # sub manual_fix { local($response, $ret, $ppid); # Don't want to do this sort of thing if we are in batch mode if($VERBOSE == 0) { return; } if(!((defined @EDIT_CMD) && (defined @VIEW_CMD) && (defined @MOSAIC))) { return; } if(!defined $ENV{DISPLAY}) { print stderr "No DISPLAY variable found, assuming not an X-Terminal.\n"; return; } print stdout "Would you like to manually review the HTML file? (Y/n) "; $response = ; chop($response); $response =~ tr/A-Z/a-z/; if($response eq "n") { return; } if(($response ne "y") && ($response ne "")) { print stderr "Assuming you don't want to review.\n"; return; } $ppid = $$; if(0 == fork) { if($ppid == $$) { print stderr "\nError: unable to fork off editing commands: $!\n"; return; } exec(@VIEW_CMD, "$author_name.err"); } elsif(0 == fork) { exec(@EDIT_CMD, "$author_name.html"); } elsif(0 == fork) { exec(@MOSAIC, "$author_name.html"); } } sub do_pagebreak { local(*text, $pagenum) = @_; if(defined($general{"page_break_section"})) { *text = eval '*' . $general{"page_break_section"}; } if(defined($general{"page_break_command"})) { $text .= eval $general{"page_break_command"}; } } # These routines are useful for the perl fragments that # can be run as the start and stop lines in the text section # definitions # Takes care of popping $count levels of state sub pop_state { local($count) = $_[0]; local($i); for($i = 0; $i < $count; $i++) { pop(@state_stack); pop(@close); } } # Push multiple levels on to the state, arguments in the form of: # # state_name1, close_value1, state_name2, close_value2, ... # # e.g. &push_state("Body Quote", "\n"); sub push_state { local(@args) = @_; local($i); for($i = 0; $i <= $#args; $i += 2) { push(@state_stack, $args[$i]); push(@close, $args[$i+1]); } }