"Fossies" - the Fresh Open Source Software Archive

Member "html2sgml-0.3/html2sgml" (26 Aug 1997, 12011 Bytes) of package /linux/misc/old/html2sgml-0.3.tar.gz:


As a special service, "Fossies" has tried to format the requested source page into HTML format using (guessed) Perl source code syntax highlighting (style: standard) with prefixed line numbers and a code folding option. Alternatively, you can view or download the uninterpreted source code file here.

    1 #!/usr/bin/perl
    2 # html2sgml - converts html-files to sgml
    3 # v 0.3
    4 # Crude script to convert an html-file to an sgml-file in a format
    5 # suitable for linuxdoc-sgml. See html2sgml.1 for documentation.
    6 # (c) Peter Antman, 1997.
    7 # send bug reports to:
    8 # peter.antman@abc.se
    9 #
   10 #
   11 # This program is free software; you can redistribute it and/or modify
   12 # it under the terms of the GNU General Public License as published by
   13 # the Free Software Foundation; either version 2 of the License, or
   14 # (at your option) any later version.
   15 # 
   16 # This program is distributed in the hope that it will be useful,
   17 # but WITHOUT ANY WARRANTY; without even the implied warranty of
   18 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   19 # GNU General Public License for more details.
   20 # 
   21 # You should have received a copy of the GNU General Public License
   22 # along with this program; if not, write to the Free Software
   23 # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
   24 #
   25 
   26 
   27 
   28 $THISPROG = "html2sgml 0.3";
   29 
   30 # Printed when the script is started without an input file.
   31 $usage = "usage: html2sgml fil.html";
   32 
   33 die "$usage\n" if @ARGV < 1;
   34 $htmlfile = shift (@ARGV);
   35 # Output name = the non-blank text in front of ".html", plus ".sgml".
   36 $htmlfile =~ /([\S]*?)\.html/ or die "Could not figure out postfix for filename: $htmlfile\n";
   37 $filename = $1;
   38 $sgmlfile = "$filename.sgml";
   39 # Stop right away if either file cannot be opened, and say why.
   40 open (html, $htmlfile) or die "Could not open $htmlfile for reading: $!\n";
   41 open (sgml, ">$sgmlfile") or die "Could not open $sgmlfile for writing: $!\n";
   42 
   43 
   44 # Make sgml-header
   45 print sgml <<'End_off';
   46 <!--Converted to sgml with html2sgml-->
   47 <!doctype linuxdoc system>
   48 <article>
   49 End_off
   50 
   51 # Optional step: if an Applix file "$filename.aw" exists next to the
   52 # html file, collect the footnote texts from it into %footnotes.
   53 # nej/ja are barewords used as the strings "nej"/"ja" (Swedish no/yes).
   54 if (-e "$filename.aw") {
   55     open (aw, "$filename.aw");
   56     
   57     $nrfootnotes = 0;
   58     $start_note = nej;
   59     $inT = nej;
   60     while (<aw>) {
   61     
   62         # Footnote reference (<S_F ... footnote "N">): map the footnote
   63         # number N to its sequence number (1 for the first one seen, ...)
   64     if (/^<S_F.*?footnote\s\"(\d*)\">$/) {
   65         $number = $1;    
   66         $nrfootnotes++;
   67             $tmpftn{$number} = "$nrfootnotes";
   68     
   69     }
   70     
   71     # Start of a footnote: remember which footnote the content belongs to
   72     if (/^<start_footnote\s\"(\d*)\">$/) {
   73         $whichnote = $1;
   74         $start_note = ja;
   75     }
   76     
   77     # End of the footnote: stop collecting content
   78     if (/^<end_footnote>$/) {
   79         $start_note = nej;
   80     }
   81     
   82     # Grab the content; lines mentioning footnote_body, <T ... position
   83     if ($start_note eq ja) {
   84         if (!((/footnote_body/) or (/<T.*?position/) or (/<Symbol/))) {
   85             # ... or <Symbol are skipped
   86         # escaped double quotes (\") are turned into ''
   87         s/\\\"/''/g;
   88         
   89             # Special routine for long footnotes: only runs while a <T "...
   90             # element seen on an earlier line is still open ($inT is ja)
   91             if ($inT eq ja) {
   92                 # continuation lines start with whitespace and hold a \ or >
   93                 if (/(^[\s]+.*?\\)|(^[\s]+.*?>)/) {
   94                     s/\\//g;
   95                     s/^\s//g;
   96                     s/\n//g;
   97                     
   98                     # a ">" ends the element: finish the harboured text,
   99                     if (/.*?>/) {
  100                     # wrap it in <it>/<bf> if the line says italic/bold
  101                         $tmpharb = "$tmpharb" . "$_";
  102                         if ($tmpharb =~ /italic/) {
  103                             $tmpharb =~ s/\"\s.*?>//;
  104                             $footnotes{$tmpftn{$whichnote}} = "$footnotes{$tmpftn{$whichnote}}" . "<it>$tmpharb</it>";
  105                         } elsif ($tmpharb =~ /bold/) {
  106                             $tmpharb =~ s/\"\s.*?>//;
  107                             $footnotes{$tmpftn{$whichnote}} = "$footnotes{$tmpftn{$whichnote}}" . "<bf>$tmpharb</bf>";
  108                         } else {
  109                             $tmpharb =~ s/\".*?>$//;
  110                             $footnotes{$tmpftn{$whichnote}} = "$footnotes{$tmpftn{$whichnote}}" . "$tmpharb";
  111                         }
  112                         
  113                         $inT = nej;
  114                     } else {
  115                         $tmpharb = "$tmpharb" . "$_";
  116                     }
  117                 }
  118             }
  119             
  120             # The real footnote content
  121         
  122         if (/(<T\s\"(.*)\"(.*)$)|(<T\s\"(.*)[^\"]\\)/) {
  123             # Footnotes have three formats
  124             # 1) <T "CONTENT">
  125             # 2) <T "CONTENT" TAG>
  126             # 3) <T "CONTENT\
  127             # Hm, and
  128             # 4) <T "CONTENT"\ TAG>
  129             if (/<T\s\"([^\"]+)\"(.*)>$/) {
  130                 $cont = $1;
  131                 $cont =~ s/\\//g;
  132                 # text after the closing quote; not read again in this block
  133                 $it = $2;
  134                 
  135                 if (/italic/) {
  136                     $cont = "<it>$cont</it>";
  137                 }
  138         
  139                 if (/bold/) {
  140                 $cont = "<bf>$cont</bf>";
  141                 }
  142         
  143 
  144                 # append to whatever is already stored for this footnote
  145                 $footnotes{$tmpftn{$whichnote}} = "$footnotes{$tmpftn{$whichnote}}" . "$cont";
  146             }
  147             if (/<T\s\"(.*?)\\$/) {
  148             # Ok, we are in a long footnote, which is difficult.
  149             # It may not even be the whole footnote, just a
  150             # specially formatted piece of it.
  151                 $cont = $1;
  152                 $cont =~ s/\\//g;
  153             # Have to harbour it temporarily until the closing ">" shows up
  154                 $tmpharb = ();
  155                 $tmpharb = $cont;   
  156                 $inT = ja;
  157                 
  158             }
  159         
  160 
  161             }
  162     }
  163     }
  164     }
  165     
  166 }
  167 close aw;    # outside the "if": also executed when no .aw file was opened
  168 
  169 # The collected footnotes may still hold "^xx" escape codes.  The codes
  170 # listed here (the author's own selection; others have to be added for
  171 # more 8-bit chars) are turned into SGML entities, in the order given.
  172 @applixcodes = (
  173     'of', '&aring;',  'pg', '&ouml;',   'oe', '&auml;',
  174     'mf', '&Aring;',  'me', '&Auml;',   'ng', '&Ouml;',
  175     'mj', '&Eacute;', 'mi', '&Egrave;', 'oj', '&eacute;',
  176     'oi', '&egrave;', 'ni', '&Oslash;', 'pi', '&oslash;',
  177     'nm', '&Uuml;',   'pm', '&uuml;',   'kh', '',
  178 );
  179 $i = 1;
  180 while ($i <= $nrfootnotes) {
  181     $tmp = "$footnotes{$i}";
  182     $tmp =~ s/\&/&amp;/g;               # before any entity is inserted
  183     for ($k = 0; $k < @applixcodes; $k += 2) {
  184         ($code, $entity) = @applixcodes[$k, $k + 1];
  185         $tmp =~ s/\^[\s]*$code/$entity/g;
  186     }
  187     $tmp =~ s/\[/&lsqb;/g;
  188     $tmp =~ s/\]/&rsqb;/g;
  189     $tmp =~ s/\^//g;                    # drop any caret that is left
  190     $footnotes{$i} = $tmp;
  191     $i++;
  192 }
  193 
  194 
  195 
  196 
  197 
  198 
  199 
  200 # First pass over the html file: print the <title>, count the <H1>
  201 # headings to decide which heading level becomes <sect>, and collect one
  202 # "l" column specifier per cell of the first row of each table in %table.
  203 
  204 $hone = 0;
  205 $htwo = 0;
  206 
  207 $intable = nej;
  208 $nrtable = 0;
  209 $nrrow = 0;
  210 
  211 LOOP:
  212 while (<html>) {
  213     if ($_ =~/\<TITLE\>([^\<]*)/i) {
  214         print sgml "<title>$1\n";
  215     }
  216     if ($_ =~/\<H1.*?\>/i) {
  217         $hone++;
  218     }
  219 
  220     # We have to count tables and the number of columns in them.  Every
  221     # table tag is matched case-insensitively, the same way as <TABLE>.
  222     if ($_=~/<TABLE.*?>/i) {
  223         $intable = ja;
  224         $nrtable++;
  225     }
  226     if ($intable eq ja) {
  227         if ($_=~/<TR.*?>/i) {
  228             $nrrow++;
  229         }
  230         if ($nrrow == 1) {
  231             if (/<\/TD.*?>/i) {    # at most one cell counted per line
  232                 $table{$nrtable} = "$table{$nrtable}" . "l";
  233             }
  234         }
  235     }
  236 
  237     if ($_=~/<\/TABLE.*?>/i) {
  238         $intable = nej;
  239         $nrrow = 0;
  240     }
  241 }
  242 if ($hone > 1 ) {
  243     $sect = H1;    # several H1 headings: H1 becomes <sect>
  244 } else {
  245     $sect = H2;    # otherwise H2 becomes <sect>
  246 }
  247 
  248 close html;
  249 
  250 $nrtable = 0;
  251 $intable = nej;
  252 
  253 # Write placeholders for things that probably cannot be figured out
  254 # from the html.  It would be good to find the name of the writer;
  255 # perhaps ask for one?
  256 print sgml join("\n",
  257     '<author>You name',
  258     '<date>',
  259     '<abstract>',
  260     '</abstract>',
  261     '<toc>',
  262     '');
  263 
  264 # Second pass: read the html file again and rewrite it line by line into
  265 # linuxdoc-sgml.  The order of the substitutions is in some cases important.
  266 open (html, $htmlfile) or die "Could not reopen $htmlfile: $!\n";
  267 
  268 $inquote = nej;
  269 $inverb = nej;
  270 $nrfootnotes = 0;
  271 while (<html>) {
  272 
  273     # must be first for sake of footnotes
  274 
  275     s/<FONT.*?>|<\/FONT>//gi;
  276 
  277     # footnotes, as they are converted in Applix aw->html
  278     if (/<SUP>|<\/SUP>/) {
  279         # grab all footnotes on the current line
  280         while (/<SUP>[0-9]+/) {
  281             $nrfootnotes++;
  282             $putin = $footnotes{$nrfootnotes};
  283             s/<SUP>[0-9]+/<footnote>$putin<\/footnote>/;
  284         }
  285         # if the footnote number is not on the same line as <SUP>
  286         while (/[0-9]+<\/SUP>/) {
  287             $nrfootnotes++;
  288             $putin = $footnotes{$nrfootnotes};
  289             s/[0-9]+<\/SUP>/<footnote>$putin<\/footnote>/;
  290         }
  291     }
  292 
  293 
  294     # Things to remove
  295     s/<SUP>|<\/SUP>//gi;
  296     s/<HTML>|<\/HTML>//gi;
  297     s/<HEAD>|<\/HEAD>//gi;
  298     s/<TITLE>((.*<\/TITLE>)|(.*$))//gi;
  299     s/<\/TITLE>//gi;
  300     s/<HR>//gi;
  301     s/<U>|<\/U>//gi;
  302     s/<BODY.*?>|\<\/BODY>//gi;
  303     s/[0-9]*<\/SUP>//gi;
  304     s/<TT>|<\/TT>//gi;
  305     s/<SAMP>|<\/SAMP>//gi;
  306 
  307     s/<CENTER>|<\/CENTER>//gi;
  308     s/<ADDRESS>|<\/ADDRESS>//gi;
  309     s/<P\s.*?>//gi;
  310 
  311 
  312     # things to substitute
  313 
  314     # first the section levels;
  315     # empty headings have no meaning, remove them
  316     s/<H1><\/H1>//gi;
  317     s/<H2><\/H2>//gi;
  318     s/<H3><\/H3>//gi;
  319     s/<H4><\/H4>//gi;
  320     s/<H5><\/H5>//gi;
  321     if ($sect eq H1) {
  322         s/<H1.*?>/<sect>/gi;
  323         s/<\/H1>/\n<p>/gi;
  324         s/<H2.*?>/<sect1>/gi;
  325         s/<\/H2>/\n<p>/gi;
  326         s/<H3.*?>/<sect2>/gi;
  327         s/<\/H3>/\n<p>/gi;
  328         s/<H4.*?>/<sect3>/gi;
  329         s/<\/H4>/\n<p>/gi;
  330         s/<H5.*?>/<sect4>/gi;
  331         s/<\/H5>/\n<p>/gi;
  332     } else {
  333         s/<H2.*?>/<sect>/gi;
  334         s/<\/H2>/\n<p>/gi;
  335         s/<H3.*?>/<sect1>/gi;
  336         s/<\/H3>/\n<p>/gi;
  337         s/<H4.*?>/<sect2>/gi;
  338         s/<\/H4>/\n<p>/gi;
  339         s/<H5.*?>/<sect3>/gi;
  340         s/<\/H5>/\n<p>/gi;
  341     }
  342 
  343     # let's take all the one-to-one things
  344     s/<BR>//gi;
  345     s/<UL>/<itemize>/gi;
  346     s/<\/UL>/<\/itemize>/gi;
  347     s/<OL>/<enum>/gi;
  348     s/<\/OL>/<\/enum>/gi;
  349     s/<DIR>/<itemize>/gi;
  350     s/<\/DIR>/<\/itemize>/gi;
  351 
  352     # Does not generate good results
  353     s/<DL.*?>/<descrip>/gi;
  354     s/<\/DL>/<\/descrip>/gi;
  355     s/<DT>/<tag>/gi;
  356     s/<DD>/<\/tag>/gi;
  357 
  358     s/<LI>/<item>/gi;
  359     s/<\/LI>/<\/item>/gi;
  360     s/<LQ>/<lq>/gi;
  361     s/<\/LQ>/<\/lq>/gi;
  362     s/<EM>/<em>/g;
  363     s/<\/EM>/<\/em>/g;
  364     s/<CITE>/<em>/gi;
  365     s/<\/CITE>/<\/em>/gi;
  366     s/<B>/<bf>/gi;
  367     s/<\/B>/<\/bf>/gi;
  368     s/<STRONG>/<bf>/gi;
  369     s/<\/STRONG>/<\/bf>/gi;
  370     s/<I>/<it>/gi;
  371     s/<\/I>/<\/it>/gi;
  372     s/<SF>/<sf>/g;
  373     s/<\/SF>/<\/sf>/g;
  374     s/<CODE>/<tt>/gi;
  375     s/<\/CODE>/<\/tt>\n/gi;
  376     s/<PRE>/<tscreen><verb>/gi;
  377 
  378     # \n taken away from the end of tscreen
  379     s/<\/PRE>/<\/verb><\/tscreen>/gi;
  380     s/<MC>/<mc>/gi;
  381     s/<\/MC>/<\/mc>/gi;
  382     s/&quot;/''/gi;
  383     s/<DFN>/<tt>/gi;
  384     s/<\/DFN>/<\/tt>/gi;
  385 
  386     # To prevent naughty things from happening in latex
  387     s/``/''/g;
  388 
  389     s/&#60;/&lt;/g;
  390     s/&#62;/&gt;/g;
  391     s/&#38;/&amp;/gi;
  392 
  393 
  394     # we have to do special things inside verb and quote
  395     if (/\<tscreen\>\<verb>/) {
  396         $inverb = ja;
  397     }
  398 
  399     if (/\<\/verb\>\<\/tscreen\>/) {
  400         $inverb = nej;
  401     }
  402 
  403     # quote does not like empty rows
  404     if (/\<BLOCKQUOTE\>/i) {
  405         $inquote = ja;
  406         s/<BLOCKQUOTE>/<quote>/gi;
  407     }
  408     if (/\<\/BLOCKQUOTE\>/i) {
  409         $inquote = nej;
  410         s/<\/BLOCKQUOTE>/<\/quote>/gi;
  411     }
  412 
  413 
  414 
  415     # table - hm...probably only works for applix,
  416     # to work with sgml2html, change tabular to table in mapping
  417     if ($_=~/<TABLE.*?>/i) {
  418         $nrtable++;    # replace the tag only, keep the rest of the line
  419         s/<TABLE.*?>/<tabular ca=\"$table{$nrtable}\">/gi;
  420     }
  421     s/<TR.*?>//gi;
  422     s/<\/TD><\/TR>/<rowsep>/gi;
  423     s/<TD.*?>//gi;
  424     s/<\/TD>/<colsep>/gi;
  425     if ($_=~/<\/TABLE.*?>/i) {
  426         s/<\/TABLE>/<\/tabular>/gi;
  427     }
  428 
  429     # quoted NAME: stop at the closing quote and at the nearest </A>
  430     # urls - pure url does not produce good looking formats - use htmlurl for all
  431     s/<A\sNAME=\"([^\"]*)\">?(.*?)<\/A>/$2<label id=\"$1\">/gi;
  432     s/<A\sNAME=(.*?)>(.*?)<\/A>/$2<label id=\"$1\">/gi;
  433     s/<A\sHREF=\"#([^\"]*)\">?(.*?)<\/A>/<htmlurl url=\"$1\" name=\"$2\">/gi;
  434     s/<A\sHREF=\"([^\"]*)\">?(.*?)<\/A>/<htmlurl url=\"$1\" name=\"$2\">/gi;
  435     s/<A\sHREF=\"([^\"]*)\">?(.*?)$/<htmlurl url=\"$1\" name=\"$2\">/gi;
  436 
  437     # pure formatters don't use quotes
  438     s/<A\sHREF=#([^\"]*?)>(.*?)<\/A>/<htmlurl url=\"$1\" name=\"$2\">/gi;
  439     s/<A\sHREF=([^\"]*?)>(.*?)<\/A>/<htmlurl url=\"$1\" name=\"$2\">/gi;
  440     s/<A\sHREF=([^\"]*?)>(.*?)$/<htmlurl url=\"$1\" name=\"$2\">/gi;
  441     # img - makes an eps-img, will only convert gifs
  442     if ($_=~/\<IMG.*?SRC=([^>]*?)\.([^>]*?)>/i) {
  443         # $img is the SRC value up to its first dot, $ext the rest of the tag
  444         $img = $1;
  445         $img =~ s/\"//g;
  446         $ext = $2;
  447         $ext =~ s/\"//g;
  448         s/<IMG.*?SRC=[^>]*?>/<figure>\n<eps file=\"$img\">\n<\/figure>/gi;
  449         # save the pics
  450         push(@pics, "$img.$ext");
  451 
  452     }
  453 
  454 
  455     # remove stale things
  456     s/<\/A>//gi;
  457 
  458     # Keep sgml markup away from the verb-env, should be more...
  459     if ($inverb eq ja) {
  460 
  461         if (!/<quote>|<tscreen><verb>|<footnote>|<\/footnote>/) {
  462         s/<.*?>|<\/.*?>//gi;
  463         s/&gt;/>/gi;
  464         s/&lt;/</gi;
  465         s/&lsqb;/\[/g;
  466         s/&rsqb;/\]/g;
  467         s/&amp;/&ero;/gi;
  468 
  469         # iso-chars have to be iso-chars in verb (not all implemented);
  470         # the ISO-8859-1 bytes are written as \xNN to keep this source ASCII
  471         s/&Auml;/\xC4/g;
  472         s/&Aring;/\xC5/g;
  473         s/&Egrave;/\xC8/g;
  474         s/&Eacute;/\xC9/g;
  475         s/&Ouml;/\xD6/g;
  476         s/&auml;/\xE4/g;
  477         s/&aring;/\xE5/g;
  478         s/&egrave;/\xE8/g;
  479         s/&eacute;/\xE9/g;
  480         s/&ouml;/\xF6/g;
  481         }
  482 
  483 
  484         # Don't know if this should really be here, taken away for jargon
  485         s/(\n)\n/$1/g;
  486     }
  487 
  488     # special macro-conversion
  489     if ($inverb eq nej) {
  490         s/\$/&dollar;/gi;
  491         s/([^&])#/$1&num;/gi;    # leave the "#" of "&#nn;" references alone
  492         s/%/&percnt;/gi;
  493         s/~/&tilde;/gi;
  494         s/\\/&bsol;/gi;
  495         s/\[/&lsqb;/g;
  496         s/\]/&rsqb;/g;
  497         s/\^/&circ;/g;
  498         s/_/&lowbar;/g;
  499 
  500     }
  501 
  502     if ($inquote eq ja) {
  503         s/^\n//gi;
  504     }
  505 
  506     s/<P>|^<\/P>$//g;
  507 
  508     if ($inquote eq nej) {
  509 
  510         s/<\/P>/\n/gi;
  511     } else {
  512         s/<\/P>//gi;
  513     }
  514 
  515 
  516     if ($inquote eq ja) {
  517         if (/^\s*$/) {
  518             # Don't do anything, just remove empty lines from quotes
  519         } else {
  520             print sgml;
  521         }
  522     } else {
  523         print sgml;
  524     }
  525 
  526 }
  527 print sgml "</article>";
  528 close (sgml) or die "Could not finish writing $sgmlfile: $!\n";
  529 
  530 # convert any gifs (needs programs: giftopnm, ppmtopgm and pnmtops) to ps
  531 foreach $pic (@pics) {
  532     next unless -e $pic;
  533     # The .ps file is named after the part of the path before its first dot.
  534     ($name, $ex) = split(/\./, $pic);
  535     # Single-quote both paths so that the shell takes each as one word.
  536     ($qpic = $pic) =~ s/'/'\\''/g;
  537     ($qps = "$name.ps") =~ s/'/'\\''/g;
  538     system("giftopnm '$qpic' | ppmtopgm | pnmtops -noturn > '$qps'") == 0
  539         or warn "Could not convert $pic to $name.ps\n";
  540 }
  541 
  542 exit;
  543