"Fossies" - the Fresh Open Source Software Archive

Member "html2sgml-0.3/html2sgml.bak" (26 Aug 1997, 12000 Bytes) of package /linux/misc/old/html2sgml-0.3.tar.gz:


As a special service, "Fossies" has tried to format the requested text file into HTML (style: standard) with prefixed line numbers. Alternatively, you can view or download the uninterpreted source code file here.

    1 #!/usr/bin/perl
    2 # html2sgml - converts HTML files to SGML
    3 # v 0.3
    4 # Crude script to convert an HTML file to an SGML file in a format
    5 # suitable for linuxdoc-sgml. See html2sgml.1 for documentation.
    6 # (c) Peter Antman, 1997.
    7 # send bug reports to:
    8 # peter.antman@abc.se
    9 #
   10 #
   11 # This program is free software; you can redistribute it and/or modify
   12 # it under the terms of the GNU General Public License as published by
   13 # the Free Software Foundation; either version 2 of the License, or
   14 # (at your option) any later version.
   15 # 
   16 # This program is distributed in the hope that it will be useful,
   17 # but WITHOUT ANY WARRANTY; without even the implied warranty of
   18 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   19 # GNU General Public License for more details.
   20 # 
   21 # You should have received a copy of the GNU General Public License
   22 # along with this program; if not, write to the Free Software
   23 # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
   24 #
   25 
   26 
   27 
   28 $THISPROG = "html2sgml 0.3";
   29 
   30 
   31 $usage = "usage: html2sgml fil.html";
   32 
   33 die $usage if @ARGV < 1;
   34 $htmlfile = shift (@ARGV);
   35 
   36 $htmlfile =~ /([\S]*?)\.html/ or die "Could not figure out postfix for filename: $!\n";
   37 $filename = $1;
   38 $sgmlfile = "$filename.sgml";
   39 
   40 open (html, $htmlfile);
   41 open (sgml, ">$sgmlfile");
   42 
   43 
   44 # Make sgml-header
   45 print sgml <<'End_off';
   46 <!--Converted to sgml with html2sgml-->
   47 <!doctype linuxdoc system>
   48 <article>
   49 End_off
   50 
   51 # If we can find an applix-world file in the same directory with
   52 # the same name: try to get footnotes
   53 
   54 if (-e "$filename.aw") {
   55     open (aw, "$filename.aw");
   56     
   57     $nrfootnotes = 0;
   58     $start_note = nej;
   59     $inT = nej;
   60     while (<aw>) {
   61     
   62     	# Find all fotnotes and their number and sequensnumber
   63 	
   64 	if (/^<S_F.*?footnote\s\"(\d*)\">$/) {
   65 		$number = $1;    
   66 		$nrfootnotes++;
   67 	    	$tmpftn{$number} = "$nrfootnotes";
   68 	
   69 	}
   70 	
   71 	# What footnote are we taking the content from
   72 	if (/^<start_footnote\s\"(\d*)\">$/) {
   73 	    $whichnote = $1;
   74 	    $start_note = ja;
   75 	}
   76 	
   77 	# En open footnote
   78 	if (/^<end_footnote>$/) {
   79 	    $start_note = nej;
   80 	}
   81 	
   82 	# Grab the content
   83 	if ($start_note eq ja) {
   84 	    if (!((/footnote_body/) or (/<T.*?position/) or (/<Symbol/))) {
   85 	    	
   86 		# citations in footnotes are tricky in converted docs
   87 		s/\\\"/''/g;
   88 		
   89 	    	# special rutin to handle long footnotes
   90 	    	# this is done only when foonote was not ended in first round
   91 	    	if ($inT eq ja) {
   92 
   93 	    		if (/(^[\s]+.*?\\)|(^[\s]+.*?>)/) {
   94 	    			s/\\//g;
   95 	    			s/^\s//g;
   96 	    			s/\n//g;
   97 	    			
   98 	    			
   99 	    			if (/.*?>/) {
  100 	    			
  101 	    				$tmpharb = "$tmpharb" . "$_";
  102 	    				if ($tmpharb =~ /italic/) {
  103 	    					$tmpharb =~ s/\"\s.*?>//;
  104 	    					$footnotes{$tmpftn{$whichnote}} = "$footnotes{$tmpftn{$whichnote}}" . "<it>$tmpharb</it>";
  105 	    				} elsif ($tmpharb =~ /bold/) {
  106 	    					$tmpharb =~ s/\"\s.*?>//;
  107 	    					$footnotes{$tmpftn{$whichnote}} = "$footnotes{$tmpftn{$whichnote}}" . "<bf>$tmpharb</bf>";
  108 	    				} else {
  109 	    					$tmpharb =~ s/\".*?>$//;
  110 	    					$footnotes{$tmpftn{$whichnote}} = "$footnotes{$tmpftn{$whichnote}}" . "$tmpharb";
  111 	    				}
  112 	    				
  113 	    				$inT = nej;
  114 	    			} else {
  115 	    				$tmpharb = "$tmpharb" . "$_";
  116 	    			}
  117 	    		}
  118 	    	}
  119 	    	
  120 	    	# The real footnotecontent
  121 	    
  122 		if (/(<T\s\"(.*)\"(.*)$)|(<T\s\"(.*)[^\"]\\)/) {
  123 			# Footnotes have three formats
  124 			# 1) <T "CONTENT">
  125 			# 2) <T "CONTENT" TAG>
  126 			# 3) <T "CONTENT\
  127 			# Hm, and
  128 			# 4) <T "CONTENT"\ TAG>
  129 			if (/<T\s\"([^\"]+)\"(.*)>$/) {
  130 				$cont = $1;
  131 				$cont =~ s/\\//g;
  132 		
  133 				$it = $2;
  134 				
  135 				if (/italic/) {
  136 		    		$cont = "<it>$cont</it>";
  137 				}
  138 		
  139 				if (/bold/) {
  140 				$cont = "<bf>$cont</bf>";
  141 				}
  142 		
  143 
  144 		
  145 				$footnotes{$tmpftn{$whichnote}} = "$footnotes{$tmpftn{$whichnote}}" . "$cont";
  146 			}
  147 			if (/<T\s\"(.*?)\\$/) {
  148 			# Ok we are in a long footnote, wich is difficult
  149 			# It may not even be the hole footnote, just a
  150 			# special formated piece of it.
  151 				$cont = $1;
  152 				$cont =~ s/\\//g;
  153 			# Have to harbour it temporary
  154 				$tmpharb = ();
  155 				$tmpharb = $cont;	
  156 				$inT = ja;
  157 				
  158 			}
  159 		
  160 
  161 	    	}
  162 	}
  163 	}
  164 	}
  165 	
  166 }
  167 close aw;
  168 
  169 # Sorry we have to do this, but othervise we might get norty chars
  170 # in footnotes; I have only converted chars of interest to mee. 
  171 # Others have to fill in with other 8-bit chars
  172 for ($i = 1; $i <= $nrfootnotes; $i++) {
  173 		$tmp = "$footnotes{$i}";
  174 		$tmp =~ s/\&/&amp;/g;
  175 	    	$tmp =~ s/\^[\s]*of/&aring;/g;
  176 		$tmp =~ s/\^[\s]*pg/&ouml;/g;
  177 		$tmp =~ s/\^[\s]*oe/&auml;/g;
  178 		
  179 		$tmp =~ s/\^[\s]*mf/&Aring;/g;
  180 		$tmp =~ s/\^[\s]*me/&Auml;/g;
  181 		$tmp =~ s/\^[\s]*ng/&Ouml;/g;
  182 		$tmp =~ s/\^[\s]*mj/&Eacute;/g;
  183 		$tmp =~ s/\^[\s]*mi/&Egrave;/g;
  184 		$tmp =~ s/\^[\s]*oj/&eacute;/g;
  185 		$tmp =~ s/\^[\s]*oi/&egrave;/g;
  186 		$tmp =~ s/\^[\s]*ni/&Oslash;/g;
  187 		$tmp =~ s/\^[\s]*pi/&oslash;/g;
  188 		$tmp =~ s/\^[\s]*nm/&Uuml;/g;
  189 		$tmp =~ s/\^[\s]*pm/&uuml;/g;
  190 		$tmp =~ s/\^[\s]*kh//g;
  191 		$tmp =~ s/\[/&lsqb;/g;
  192 		$tmp =~ s/\]/&rsqb;/g;
  193 		$tmp =~ s/\^//g;
  194 
  195 		$footnotes{$i} = $tmp;
  196 		
  197 }
  198 
  199 
  200 # Now we must check a couple of things first. The title and where to start
  201 # section levels, and tables, Therefore we poure througe the whole file until we
  202 # find something usefull.
  203 
  204 $hone = 0;
  205 $htwo = 0;
  206 
  207 $intable = nej;
  208 $nrtable = 0;
  209 $nrrow = 0;
  210 
  211 LOOP:
  212 while (<html>) {
  213 	if ($_ =~/\<TITLE\>([^\<]*)/i) {
  214 		print sgml "<title>$1\n";
  215 	}
  216 	if ($_ =~/\<H1.*?\>/i) {
  217 		$hone++;
  218 	
  219 	    }
  220     
  221 	# We have to count tables and the amount of coloumns in them
  222 	if ($_=~/<TABLE.*?>/i) {
  223 	    $intable = ja;
  224 	    $nrtable++;
  225 	}
  226 	if ($intable eq ja) {
  227 	    if ($_=~/<TR.*?>/) {
  228 		$nrrow++;
  229 	    }
  230 	    if ($nrrow == 1) {
  231 		if (/<\/TD.*?>/) {
  232 		    $table{$nrtable} = "$table{$nrtable}" . "l";
  233 		}
  234 	    }
  235 	}
  236 	
  237 	if ($_=~/<\/TABLE.*?>/) {
  238 	    $intable = nej;
  239 	    $nrrow = 0;
  240 	}
  241     }
  242 if ($hone > 1 ) {
  243 	$sect = H1;
  244 } else {
  245 	$sect = H2;
  246 }
  247     
  248 close html;
  249 
  250 $nrtable = 0;
  251 $intable = nej;
  252 
  253 # Lets first write I couple of thing we probably cant figure out
  254 # Would be good if we could figure out the name of the writer
  255 # Perhaps ask for one?
  256 print sgml <<'End_off_sub';
  257 <author>You name
  258 <date>
  259 <abstract>
  260 </abstract>
  261 <toc>
  262 End_off_sub
  263 
  264 # Now we can start the real substitution, the order of the substitution is
  265 # in some cases realy important
  266 open (html, $htmlfile);
  267 
  268 $inquote = nej;
  269 $inverb = nej;
  270 $nrfootnotes = 0;
  271 while (<html>) {
  272 	
  273 	# must be first for sake of footnotes	
  274     
  275     	s/<FONT.*?>|<\/FONT>//gi;
  276     
  277     	# footnotes, as they are converted in Applix aw->html
  278 	if (/<SUP>|<\/SUP>/) {
  279 		# grab all footnotes on the current line
  280 		while (/<SUP>[0-9]+/) {
  281 			$nrfootnotes++;
  282 			$putin = $footnotes{$nrfootnotes};
  283 			s/<SUP>[0-9]+/<footnote>$putin<\/footnote>/;
  284 		}
  285 		# if foonotenumber noot on the same line as <SUP>
  286 		while (/[0-9]+<\/SUP>/) {
  287 			$nrfootnotes++;
  288 			$putin = $footnotes{$nrfootnotes};
  289 			s/[0-9]+<\/SUP>/<footnote>$putin<\/footnote>/;
  290 		}
  291 	}
  292 	
  293 	
  294 	# Things to remowe
  295 	s/<SUP>|<\/SUP>//gi;
  296 	s/<HTML>|<\/HTML>//gi;
  297 	s/<HEAD>|<\/HEAD>//gi;
  298 	s/<TITLE>((.*<\/TITLE>)|(.*$))//gi;
  299 	s/<\/TITLE>//gi;
  300 	s/<HR>//gi;
  301 	s/<U>|<\/U>//gi;
  302 	s/<BODY.*?>|\<\/BODY>//gi;
  303 	s/[0-9]*<\/SUP>//gi;
  304 	s/<TT>|<\/TT>//gi;
  305 	s/<SAMP>|<\/SAMP>//gi;
  306 	
  307 	s/<CENTER>|<\/CENTER>//gi;
  308 	s/<ADDRESS>|<\/ADDRESS>//gi;
  309 	s/<P\s.*?>//gi;
  310 	
  311 	
  312 	#things to substitute
  313 	
  314 	# fist sectionlevel
  315 	# first ones that has no meaning, remove them
  316 	 s/<H1><\/H1>//gi;
  317 	 s/<H2><\/H2>//gi;
  318 	 s/<H3><\/H3>//gi;
  319 	 s/<H4><\/H4>//gi;
  320 	 s/<H5><\/H5>//gi;
  321 	if ($sect eq H1) {
  322 	    s/<H1.*?>/<sect>/gi;
  323 	    s/<\/H1>/\n<p>/gi;
  324 	    s/<H2.*?>/<sect1>/gi;
  325 	    s/<\/H2>/\n<p>/gi;
  326 	    s/<H3.*?>/<sect2>/gi;
  327 	    s/<\/H3>/\n<p>/gi;
  328 	    s/<H4.*?>/<sect3>/gi;
  329 	    s/<\/H4>/\n<p>/gi;
  330 	    s/<H5.*?>/<sect4>/gi;
  331 	    s/<\/H5>/\n<p>/gi;
  332 	} else {
  333 	    s/<H2.*?>/<sect>/gi;
  334 	    s/<\/H2>/\n<p>/gi;
  335 	    s/<H3.*?>/<sect1>/gi;
  336 	    s/<\/H3>/\n<p>/gi;
  337 	    s/<H4.*?>/<sect2>/gi;
  338 	    s/<\/H4>/\n<p>/gi;
  339 	    s/<H5.*?>/<sect3>/gi;
  340 	    s/<\/H5>/\n<p>/gi;
  341 	    }
  342 	
  343 	# lets take al one to one things
  344 	s/<BR>//gi;
  345 	s/<UL>/<itemize>/gi;
  346 	s/<\/UL>/<\/itemize>/gi;
  347 	s/<OL>/<enum>/gi;
  348 	s/<\/OL>/<\/enum>/gi;
  349 	s/<DIR>/<itemize>/gi;
  350 	s/<\/DIR>/<\/itemize>/gi;
  351 
  352 	# Does not generate good results
  353 	s/<DL.*?>/<descrip>/gi;
  354 	s/<\/DL>/<\/descrip>/gi;
  355 	s/<DT>/<tag>/gi;
  356 	s/<DD>/<\/tag>/gi;
  357 	
  358 	s/<LI>/<item>/gi;
  359 	s/<\/LI>/<\/item>/gi;
  360 	s/<LQ>/<lq>/gi;
  361 	s/<\/LQ>/<\/lq>/gi;
  362 	s/<EM>/<em>/g;
  363 	s/<\/EM>/<\/em>/g;
  364 	s/<CITE>/<em>/gi;
  365 	s/<\/CITE>/<\/em>/gi;
  366 	s/<B>/<bf>/gi;
  367 	s/<\/B>/<\/bf>/gi;
  368 	s/<STRONG>/<bf>/gi;
  369 	s/<\/STRONG>/<\/bf>/gi;
  370 	s/<I>/<it>/gi;
  371 	s/<\/I>/<\/it>/gi;
  372 	s/<SF>/<sf>/g;
  373 	s/<\/SF>/<\/sf>/g;
  374 	s/<CODE>/<tt>/gi;
  375 	s/<\/CODE>/<\/tt>\n/gi;
  376 	s/<PRE>/<tscreen><verb>/gi;
  377 	
  378 	# \n taken away from ending if tscreen
  379 	s/<\/PRE>/<\/verb><\/tscreen>/gi;
  380 	s/<MC>/<mc>/gi;
  381 	s/<\/MC>/<\/mc>/gi;
  382 	s/&quot;/''/gi;
  383 	s/<DFN>/<tt>/gi;
  384 	s/<\/DFN>/<\/tt>/gi;
  385 	
  386 	# To prevent norty things to happend i latex
  387 	s/``/''/g;
  388 	
  389 	s/&#60;/&lt;/g;
  390 	s/&#62;/&gt;/g;
  391 	s/&#38;/&amp;/gi;
  392 	
  393 	
  394 	# we have to do special things inside verb and quote
  395 	if (/\<tscreen\>\<verb>/) {
  396 	    $inverb = ja;
  397 	}
  398 	
  399 	if (/\<\/verb\>\<\/tscreen\>/) {
  400 	    $inverb = nej;
  401 	}
  402 	
  403 	# quote does not like empty rows
  404 	if (/\<BLOCKQUOTE\>/i) {
  405 	    $inquote = ja;
  406 	    s/<BLOCKQUOTE>/<quote>/gi;
  407 	}
  408 	if (/\<\/BLOCKQUOTE\>/i) {
  409 	    $inquote = nej;
  410 	    s/<\/BLOCKQUOTE>/<\/quote>/gi;
  411 	}
  412 	
  413 
  414 	
  415 	#table - hm...probably only works for applix,
  416 	# to work with sgml2html, change tabular to table in mapping
  417 	if ($_=~/<TABLE.*?>/i) {
  418 	    $nrtable++;
  419 	    s/<TABLE.*>?/<tabular ca=\"$table{$nrtable}\">/gi;
  420 	}
  421 	s/<TR.*?>//gi;
  422 	s/<\/TD><\/TR>/<rowsep>/gi;
  423 	s/<TD.*?>//gi;
  424 	s/<\/TD>/<colsep>/gi;
  425 	if ($_=~/<\/TABLE.*?>/i) {
  426 	    s/<\/TABLE>/<\/tabular>/gi;
  427 	}
  428 
  429 
  430 	#urls - pure url does not produce god looking formats - use htmlurl for all
  431 	s/<A\sNAME=\"(.*)\">?(.*)<\/A>/\2<label id=\"\1\">/gi;
  432 	s/<A\sNAME=(.*?)>(.*?)<\/A>/\2<label id=\"\1\">/gi;
  433 	s/<A\sHREF=\"#([^\"]*)\">?(.*?)<\/A>/<htmlurl url=\"\1\" name=\"\2\">/gi;;
  434 	s/<A\sHREF=\"([^\"]*)\">?(.*?)<\/A>/<htmlurl url=\"\1\" name=\"\2\">/gi;
  435 	s/<A\sHREF=\"([^\"]*)\">?(.*?)$/<htmlurl url=\"\1\" name=\"\2\">/gi;
  436 	
  437 	# pure formaters dont use quotes
  438 	s/<A\sHREF=#([^\"]*?)>(.*?)<\/A>/<htmlurl url=\"\1\" name=\"\2\">/gi;;
  439 	s/<A\sHREF=([^\"]*?)>(.*?)<\/A>/<htmlurl url=\"\1\" name=\"\2\">/gi;
  440 	s/<A\sHREF=([^\"]*?)>(.*?)$/<htmlurl url=\"\1\" name=\"\2\">/gi;
  441 	# img - makes a eps-img, will only convert gifs
  442 	if ($_=~/\<IMG.*?SRC=([^>]*?)\.([^>]*?)>/i) {
  443 				      
  444 		$img = $1;
  445 		$img =~ s/\"//g;
  446 		$ext = $2;
  447 		$ext =~ s/\"//g;
  448 		s/<IMG.*?SRC=[^>]*?>/<figure>\n<eps file=\"$img\">\n<\/figure>/gi;
  449 		 # save the pics
  450 		push(@pics, "$img.$ext");
  451 				   
  452 	}
  453 	
  454 				  
  455 	# remove stale things
  456 	s/<\/A>//gi;	
  457 			   
  458 	# Fix smlish away from verb-env, should be more...
  459 	if ($inverb eq ja) {
  460 
  461 	    if (!/<quote>|<tscreen><verb>|<footnote>|<\/footnote>/) {
  462 		s/<.*?>|<\/.*?>//gi;
  463 		s/&gt;/>/gi;
  464 		s/&lt;/</gi;
  465 		s/&lsqb;/\[/g;
  466 		s/&rsqb;/\]/g;
  467 		s/&amp;/&ero;/gi;
  468 		
  469 		# iso-char have to be iso-char in verb
  470 		# not all implemented
  471 		s/&Auml;//g;
  472 		s/&Aring;//g;
  473 		s/&Egrave;//g;
  474 		s/&Eacute;//g;
  475 		s/&Ouml;//g;
  476 		s/&auml;//g;
  477 		s/&aring;//g;
  478 		s/&egrave;//g;
  479 		s/&eacute;//g;
  480 		s/&ouml;//g;    
  481 	    }
  482 	    
  483 				      
  484 # Dont know if this should realy be here, taken away for jargon
  485 	    s/(\n)\n/\1/g;
  486 	}
  487 
  488 	# special macro-conversion
  489 	if ($inverb eq nej) {
  490 		s/\$/&dollar;/gi;
  491 		s/([^&])#/\1&num;/gi;
  492 		s/%/&percnt;/gi;
  493 		s/~/&tilde;/gi;
  494 		s/\\/&bsol;/gi;
  495 		s/\[/&lsqb;/g;
  496 		s/\]/&rsqb;/g;
  497 		s/\^/&circ;/g;
  498 		s/_/&lowbar;/g;
  499 	
  500 			       }
  501 			       
  502 	if ($inquote eq ja) {
  503 		s/^\n//gi;
  504 	    }		       
  505 	
  506 	s/<P>|^<\/P>$//g;
  507 	
  508 	if ($inquote eq nej) {
  509 	
  510 		s/<\/P>/\n/gi;
  511 	} else {
  512 		s/<\/P>//gi;
  513 	}
  514 	
  515 	
  516 	if ($inquote eq ja) {
  517 		if (/^\s*$/) {
  518 			# Dont do anything, just remove empty lines from quotes
  519 		} else {
  520 	    		print sgml;
  521 	    	}
  522 	} else {	  
  523 		print sgml;
  524 	}
  525 			      
  526 }
  527 print sgml "</article>";
  528 close sgml;
  529 
  530 # convert any gifs (needs programs: giftopnm, ppptopgm and pnmtops) to ps
  531 while (<@pics>) {
  532 	if (-e "$_"){
  533 		($name, $ex) = split(/\./, $_);
  534 		`giftopnm $_ | ppmtopgm | pnmtops -noturn > $name.ps`;
  535 	}
  536 }
  537 
  538 exit
  539 
  540 
  541 
  542 
  543