"Fossies" - the Fresh Open Source Software Archive

Member "html2sgml-0.3/html2sgml.in" (26 Aug 1997, 11995 Bytes) of package /linux/misc/old/html2sgml-0.3.tar.gz:


As a special service, "Fossies" has tried to format the requested text file into HTML format (style: standard) with prefixed line numbers. Alternatively, you can view or download the uninterpreted source code file here.

    1 # html2sgml - converts html-files to sgml
    2 # v 0.3
    3 # Crude script to convert an html-file to an sgml-file in a format
    4 # suitable for linuxdoc-sgml. See html2sgml.1 for documentation.
    5 # (c) Peter Antman, 1997.
    6 # send bug reports to:
    7 # peter.antman@abc.se
    8 #
    9 #
   10 # This program is free software; you can redistribute it and/or modify
   11 # it under the terms of the GNU General Public License as published by
   12 # the Free Software Foundation; either version 2 of the License, or
   13 # (at your option) any later version.
   14 # 
   15 # This program is distributed in the hope that it will be useful,
   16 # but WITHOUT ANY WARRANTY; without even the implied warranty of
   17 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   18 # GNU General Public License for more details.
   19 # 
   20 # You should have received a copy of the GNU General Public License
   21 # along with this program; if not, write to the Free Software
   22 # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
   23 #
   24 
   25 
   26 
   27 $THISPROG = "html2sgml 0.3";
   28 
   29 # Message printed when the script is started without an input file.
   30 $usage = "usage: html2sgml fil.html";
   31 
   32 die "$usage\n" if @ARGV < 1;
   33 $htmlfile = shift (@ARGV);
   34 # Output name = input name up to ".html" plus ".sgml"; $! is not set by a failed match, so report the name.
   35 $htmlfile =~ /([\S]*?)\.html/ or die "Could not figure out postfix for filename: $htmlfile\n";
   36 $filename = $1;
   37 $sgmlfile = "$filename.sgml";
   38 # Three-argument open keeps characters in the name from being read as a mode; abort if a file cannot be opened.
   39 open (html, '<', $htmlfile) or die "Could not open $htmlfile for reading: $!\n";
   40 open (sgml, '>', $sgmlfile) or die "Could not open $sgmlfile for writing: $!\n";
   41 
   42 
   43 # Make sgml-header
   44 print sgml <<'End_off';
   45 <!--Converted to sgml with html2sgml-->
   46 <!doctype linuxdoc system>
   47 <article>
   48 End_off
   49 
   50 # If an Applix file with the same base name ("$filename.aw") exists,
   51 # read it and try to get the footnote texts out of it. They are collected
   52 # in %footnotes, keyed by the order in which the footnotes are declared.
   53 if (-e "$filename.aw") {
   54     open (aw, "$filename.aw");
   55     # ja/nej (Swedish for yes/no) are barewords used as string flags
   56     $nrfootnotes = 0;
   57     $start_note = nej;
   58     $inT = nej;
   59     while (<aw>) {
   60     
   61     	# Find all footnotes and map each footnote number to its sequence number
   62 	
   63 	if (/^<S_F.*?footnote\s\"(\d*)\">$/) {
   64 		$number = $1;    
   65 		$nrfootnotes++;
   66 	    	$tmpftn{$number} = "$nrfootnotes";
   67 	
   68 	}
   69 	
   70 	# Which footnote are we taking the content from
   71 	if (/^<start_footnote\s\"(\d*)\">$/) {
   72 	    $whichnote = $1;
   73 	    $start_note = ja;
   74 	}
   75 	
   76 	# End of an open footnote
   77 	if (/^<end_footnote>$/) {
   78 	    $start_note = nej;
   79 	}
   80 	
   81 	# Grab the content (lines with footnote_body, <T ... position or <Symbol are skipped)
   82 	if ($start_note eq ja) {
   83 	    if (!((/footnote_body/) or (/<T.*?position/) or (/<Symbol/))) {
   84 	    	
   85 		# citations in footnotes are tricky in converted docs: turn \" into ''
   86 		s/\\\"/''/g;
   87 		
   88 	    	# special routine to handle long footnotes;
   89 	    	# this is done only when the footnote was not ended in the first round
   90 	    	if ($inT eq ja) {
   91 
   92 	    		if (/(^[\s]+.*?\\)|(^[\s]+.*?>)/) {
   93 	    			s/\\//g;
   94 	    			s/^\s//g;
   95 	    			s/\n//g;
   96 	    			
   97 	    			# a ">" on the line ends the piece: flush $tmpharb into the footnote
   98 	    			if (/.*?>/) {
   99 	    			
  100 	    				$tmpharb = "$tmpharb" . "$_";
  101 	    				if ($tmpharb =~ /italic/) {
  102 	    					$tmpharb =~ s/\"\s.*?>//;
  103 	    					$footnotes{$tmpftn{$whichnote}} = "$footnotes{$tmpftn{$whichnote}}" . "<it>$tmpharb</it>";
  104 	    				} elsif ($tmpharb =~ /bold/) {
  105 	    					$tmpharb =~ s/\"\s.*?>//;
  106 	    					$footnotes{$tmpftn{$whichnote}} = "$footnotes{$tmpftn{$whichnote}}" . "<bf>$tmpharb</bf>";
  107 	    				} else {
  108 	    					$tmpharb =~ s/\".*?>$//;
  109 	    					$footnotes{$tmpftn{$whichnote}} = "$footnotes{$tmpftn{$whichnote}}" . "$tmpharb";
  110 	    				}
  111 	    				
  112 	    				$inT = nej;
  113 	    			} else {
  114 	    				$tmpharb = "$tmpharb" . "$_";
  115 	    			}
  116 	    		}
  117 	    	}
  118 	    	
  119 	    	# The real footnote content
  120 	    
  121 		if (/(<T\s\"(.*)\"(.*)$)|(<T\s\"(.*)[^\"]\\)/) {
  122 			# Footnotes have three formats
  123 			# 1) <T "CONTENT">
  124 			# 2) <T "CONTENT" TAG>
  125 			# 3) <T "CONTENT\
  126 			# Hm, and
  127 			# 4) <T "CONTENT"\ TAG>
  128 			if (/<T\s\"([^\"]+)\"(.*)>$/) {
  129 				$cont = $1;
  130 				$cont =~ s/\\//g;
  131 		
  132 				$it = $2;
  133 				# the italic/bold tests below look at the whole line, not at $it
  134 				if (/italic/) {
  135 		    		$cont = "<it>$cont</it>";
  136 				}
  137 		
  138 				if (/bold/) {
  139 				$cont = "<bf>$cont</bf>";
  140 				}
  141 		
  142 
  143 		
  144 				$footnotes{$tmpftn{$whichnote}} = "$footnotes{$tmpftn{$whichnote}}" . "$cont";
  145 			}
  146 			if (/<T\s\"(.*?)\\$/) {
  147 			# Ok, we are in a long footnote, which is difficult.
  148 			# It may not even be the whole footnote, just a
  149 			# specially formatted piece of it.
  150 				$cont = $1;
  151 				$cont =~ s/\\//g;
  152 			# Have to hold it temporarily until the piece is complete
  153 				$tmpharb = ();
  154 				$tmpharb = $cont;	
  155 				$inT = ja;
  156 				
  157 			}
  158 		
  159 
  160 	    	}
  161 	}
  162 	}
  163 	}
  164 	
  165 }
  166 close aw;
  167 
  168 # Sorry, we have to do this, otherwise we might get naughty chars
  169 # in footnotes; I have only converted the chars of interest to me.
  170 # Others have to fill in the other 8-bit chars.
  171 for ($i = 1; $i <= $nrfootnotes; $i++) {
  172 		$tmp = "$footnotes{$i}";
  173 		$tmp =~ s/\&/&amp;/g;	# done first, so the entities inserted below are not escaped again
  174 	    	$tmp =~ s/\^[\s]*of/&aring;/g;
  175 		$tmp =~ s/\^[\s]*pg/&ouml;/g;
  176 		$tmp =~ s/\^[\s]*oe/&auml;/g;
  177 		
  178 		$tmp =~ s/\^[\s]*mf/&Aring;/g;
  179 		$tmp =~ s/\^[\s]*me/&Auml;/g;
  180 		$tmp =~ s/\^[\s]*ng/&Ouml;/g;
  181 		$tmp =~ s/\^[\s]*mj/&Eacute;/g;
  182 		$tmp =~ s/\^[\s]*mi/&Egrave;/g;
  183 		$tmp =~ s/\^[\s]*oj/&eacute;/g;
  184 		$tmp =~ s/\^[\s]*oi/&egrave;/g;
  185 		$tmp =~ s/\^[\s]*ni/&Oslash;/g;
  186 		$tmp =~ s/\^[\s]*pi/&oslash;/g;
  187 		$tmp =~ s/\^[\s]*nm/&Uuml;/g;
  188 		$tmp =~ s/\^[\s]*pm/&uuml;/g;
  189 		$tmp =~ s/\^[\s]*kh//g;
  190 		$tmp =~ s/\[/&lsqb;/g;
  191 		$tmp =~ s/\]/&rsqb;/g;
  192 		$tmp =~ s/\^//g;	# done last: drop any caret that is still left
  193 
  194 		$footnotes{$i} = $tmp;
  195 		
  196 }
  197 
  198 
  199 # Now we must check a couple of things first: the title, where to start
  200 # the section levels, and the tables. Therefore we pour through the whole
  201 # file until we find something useful.
  202 
  203 $hone = 0;
  204 $htwo = 0;
  205 
  206 $intable = nej;
  207 $nrtable = 0;
  208 $nrrow = 0;
  209 
  210 LOOP:
  211 while (<html>) {
  212 	if ($_ =~/\<TITLE\>([^\<]*)/i) {
  213 		print sgml "<title>$1\n";
  214 	}
  215 	if ($_ =~/\<H1.*?\>/i) {
  216 		$hone++;
  217 	
  218 	    }
  219     
  220 	# We have to count tables and the amount of columns in them
  221 	if ($_=~/<TABLE.*?>/i) {
  222 	    $intable = ja;
  223 	    $nrtable++;
  224 	}
  225 	if ($intable eq ja) {
  226 	    if ($_=~/<TR.*?>/i) {
  227 		$nrrow++;
  228 	    }
  229 	    if ($nrrow == 1) {
  230 		while (/<\/TD.*?>/gi) {	# one "l" per cell closed on this line, in any letter case
  231 		    $table{$nrtable} = "$table{$nrtable}" . "l";
  232 		}
  233 	    }
  234 	}
  235 	# tags are matched case-insensitively, like in the second pass
  236 	if ($_=~/<\/TABLE.*?>/i) {
  237 	    $intable = nej;
  238 	    $nrrow = 0;
  239 	}
  240     }
  241 if ($hone > 1 ) {
  242 	$sect = H1;
  243 } else {
  244 	$sect = H2;
  245 }
  246     
  247 close html;
  248 
  249 $nrtable = 0;
  250 $intable = nej;
  251 
  252 # Let's first write a couple of things we probably can't figure out.
  253 # It would be good if we could figure out the name of the writer.
  254 # Perhaps ask for one?
  255 print sgml <<'End_off_sub';
  256 <author>You name
  257 <date>
  258 <abstract>
  259 </abstract>
  260 <toc>
  261 End_off_sub
  262 
  263 # Now we can start the real substitution; the order of the substitutions
  264 # is in some cases really important. Re-open the input for the second pass.
  265 open (html, '<', $htmlfile) or die "Could not open $htmlfile for reading: $!\n";
  266 
  267 $inquote = nej;
  268 $inverb = nej;
  269 $nrfootnotes = 0;
  270 while (<html>) {
  271 	# Second pass: rewrite each input line in $_ step by step, then print it to sgml.
  272 	# must be first for sake of footnotes	
  273     
  274     	s/<FONT.*?>|<\/FONT>//gi;
  275     
  276     	# footnotes, as they are converted in Applix aw->html
  277 	if (/<SUP>|<\/SUP>/) {
  278 		# grab all footnotes on the current line
  279 		while (/<SUP>[0-9]+/) {
  280 			$nrfootnotes++;
  281 			$putin = $footnotes{$nrfootnotes};
  282 			s/<SUP>[0-9]+/<footnote>$putin<\/footnote>/;
  283 		}
  284 		# if the footnote number is not on the same line as <SUP>
  285 		while (/[0-9]+<\/SUP>/) {
  286 			$nrfootnotes++;
  287 			$putin = $footnotes{$nrfootnotes};
  288 			s/[0-9]+<\/SUP>/<footnote>$putin<\/footnote>/;
  289 		}
  290 	}
  291 	
  292 	
  293 	# Things to remove
  294 	s/<SUP>|<\/SUP>//gi;
  295 	s/<HTML>|<\/HTML>//gi;
  296 	s/<HEAD>|<\/HEAD>//gi;
  297 	s/<TITLE>((.*<\/TITLE>)|(.*$))//gi;
  298 	s/<\/TITLE>//gi;
  299 	s/<HR>//gi;
  300 	s/<U>|<\/U>//gi;
  301 	s/<BODY.*?>|\<\/BODY>//gi;
  302 	s/[0-9]*<\/SUP>//gi;
  303 	s/<TT>|<\/TT>//gi;
  304 	s/<SAMP>|<\/SAMP>//gi;
  305 	
  306 	s/<CENTER>|<\/CENTER>//gi;
  307 	s/<ADDRESS>|<\/ADDRESS>//gi;
  308 	s/<P\s.*?>//gi;
  309 	
  310 	
  311 	# things to substitute
  312 	
  313 	# first the section levels
  314 	# first the ones that have no meaning, remove them
  315 	 s/<H1><\/H1>//gi;
  316 	 s/<H2><\/H2>//gi;
  317 	 s/<H3><\/H3>//gi;
  318 	 s/<H4><\/H4>//gi;
  319 	 s/<H5><\/H5>//gi;
  320 	if ($sect eq H1) {
  321 	    s/<H1.*?>/<sect>/gi;
  322 	    s/<\/H1>/\n<p>/gi;
  323 	    s/<H2.*?>/<sect1>/gi;
  324 	    s/<\/H2>/\n<p>/gi;
  325 	    s/<H3.*?>/<sect2>/gi;
  326 	    s/<\/H3>/\n<p>/gi;
  327 	    s/<H4.*?>/<sect3>/gi;
  328 	    s/<\/H4>/\n<p>/gi;
  329 	    s/<H5.*?>/<sect4>/gi;
  330 	    s/<\/H5>/\n<p>/gi;
  331 	} else {
  332 	    s/<H2.*?>/<sect>/gi;
  333 	    s/<\/H2>/\n<p>/gi;
  334 	    s/<H3.*?>/<sect1>/gi;
  335 	    s/<\/H3>/\n<p>/gi;
  336 	    s/<H4.*?>/<sect2>/gi;
  337 	    s/<\/H4>/\n<p>/gi;
  338 	    s/<H5.*?>/<sect3>/gi;
  339 	    s/<\/H5>/\n<p>/gi;
  340 	    }
  341 	
  342 	# let's take all the one-to-one things
  343 	s/<BR>//gi;
  344 	s/<UL>/<itemize>/gi;
  345 	s/<\/UL>/<\/itemize>/gi;
  346 	s/<OL>/<enum>/gi;
  347 	s/<\/OL>/<\/enum>/gi;
  348 	s/<DIR>/<itemize>/gi;
  349 	s/<\/DIR>/<\/itemize>/gi;
  350 
  351 	# Does not generate good results
  352 	s/<DL.*?>/<descrip>/gi;
  353 	s/<\/DL>/<\/descrip>/gi;
  354 	s/<DT>/<tag>/gi;
  355 	s/<DD>/<\/tag>/gi;
  356 	
  357 	s/<LI>/<item>/gi;
  358 	s/<\/LI>/<\/item>/gi;
  359 	s/<LQ>/<lq>/gi;
  360 	s/<\/LQ>/<\/lq>/gi;
  361 	s/<EM>/<em>/g;
  362 	s/<\/EM>/<\/em>/g;
  363 	s/<CITE>/<em>/gi;
  364 	s/<\/CITE>/<\/em>/gi;
  365 	s/<B>/<bf>/gi;
  366 	s/<\/B>/<\/bf>/gi;
  367 	s/<STRONG>/<bf>/gi;
  368 	s/<\/STRONG>/<\/bf>/gi;
  369 	s/<I>/<it>/gi;
  370 	s/<\/I>/<\/it>/gi;
  371 	s/<SF>/<sf>/g;
  372 	s/<\/SF>/<\/sf>/g;
  373 	s/<CODE>/<tt>/gi;
  374 	s/<\/CODE>/<\/tt>\n/gi;
  375 	s/<PRE>/<tscreen><verb>/gi;
  376 	
  377 	# \n taken away from the ending of tscreen
  378 	s/<\/PRE>/<\/verb><\/tscreen>/gi;
  379 	s/<MC>/<mc>/gi;
  380 	s/<\/MC>/<\/mc>/gi;
  381 	s/&quot;/''/gi;
  382 	s/<DFN>/<tt>/gi;
  383 	s/<\/DFN>/<\/tt>/gi;
  384 	
  385 	# To prevent naughty things from happening in LaTeX
  386 	s/``/''/g;
  387 	
  388 	s/&#60;/&lt;/g;
  389 	s/&#62;/&gt;/g;
  390 	s/&#38;/&amp;/gi;
  391 	
  392 	
  393 	# we have to do special things inside verb and quote
  394 	if (/\<tscreen\>\<verb>/) {
  395 	    $inverb = ja;
  396 	}
  397 	
  398 	if (/\<\/verb\>\<\/tscreen\>/) {
  399 	    $inverb = nej;
  400 	}
  401 	
  402 	# quote does not like empty rows
  403 	if (/\<BLOCKQUOTE\>/i) {
  404 	    $inquote = ja;
  405 	    s/<BLOCKQUOTE>/<quote>/gi;
  406 	}
  407 	if (/\<\/BLOCKQUOTE\>/i) {
  408 	    $inquote = nej;
  409 	    s/<\/BLOCKQUOTE>/<\/quote>/gi;
  410 	}
  411 	
  412 
  413 	
  414 	# table - hm...probably only works for applix,
  415 	# to work with sgml2html, change tabular to table in mapping
  416 	if ($_=~/<TABLE.*?>/i) {
  417 	    $nrtable++;
  418 	    s/<TABLE.*?>/<tabular ca=\"$table{$nrtable}\">/gi;	# replace only the tag, not the rest of the line
  419 	}
  420 	s/<TR.*?>//gi;
  421 	s/<\/TD><\/TR>/<rowsep>/gi;
  422 	s/<TD.*?>//gi;
  423 	s/<\/TD>/<colsep>/gi;
  424 	if ($_=~/<\/TABLE.*?>/i) {
  425 	    s/<\/TABLE>/<\/tabular>/gi;
  426 	}
  427 
  428 
  429 	# urls - a pure url does not produce good-looking formats - use htmlurl for all
  430 	s/<A\sNAME=\"(.*)\">?(.*)<\/A>/$2<label id=\"$1\">/gi;
  431 	s/<A\sNAME=(.*?)>(.*?)<\/A>/$2<label id=\"$1\">/gi;
  432 	s/<A\sHREF=\"#([^\"]*)\">?(.*?)<\/A>/<htmlurl url=\"$1\" name=\"$2\">/gi;
  433 	s/<A\sHREF=\"([^\"]*)\">?(.*?)<\/A>/<htmlurl url=\"$1\" name=\"$2\">/gi;
  434 	s/<A\sHREF=\"([^\"]*)\">?(.*?)$/<htmlurl url=\"$1\" name=\"$2\">/gi;
  435 	
  436 	# pure formatters don't use quotes
  437 	s/<A\sHREF=#([^\"]*?)>(.*?)<\/A>/<htmlurl url=\"$1\" name=\"$2\">/gi;
  438 	s/<A\sHREF=([^\"]*?)>(.*?)<\/A>/<htmlurl url=\"$1\" name=\"$2\">/gi;
  439 	s/<A\sHREF=([^\"]*?)>(.*?)$/<htmlurl url=\"$1\" name=\"$2\">/gi;
  440 	# img - makes an eps-img, will only convert gifs
  441 	if ($_=~/\<IMG.*?SRC=([^>]*?)\.([^>]*?)>/i) {
  442 				      
  443 		$img = $1;
  444 		$img =~ s/\"//g;
  445 		$ext = $2;
  446 		$ext =~ s/\"//g;
  447 		s/<IMG.*?SRC=[^>]*?>/<figure>\n<eps file=\"$img\">\n<\/figure>/gi;
  448 		 # save the pics
  449 		push(@pics, "$img.$ext");
  450 				   
  451 	}
  452 	
  453 				  
  454 	# remove stale things
  455 	s/<\/A>//gi;	
  456 			   
  457 	# Strip SGML-ish markup from the verb environment, should be more...
  458 	if ($inverb eq ja) {
  459 
  460 	    if (!/<quote>|<tscreen><verb>|<footnote>|<\/footnote>/) {
  461 		s/<.*?>|<\/.*?>//gi;
  462 		s/&gt;/>/gi;
  463 		s/&lt;/</gi;
  464 		s/&lsqb;/\[/g;
  465 		s/&rsqb;/\]/g;
  466 		s/&amp;/&ero;/gi;
  467 		
  468 		# iso-chars have to be iso-chars in verb (written as ISO 8859-1 byte escapes)
  469 		# not all implemented
  470 		s/&Auml;/\xC4/g;
  471 		s/&Aring;/\xC5/g;
  472 		s/&Egrave;/\xC8/g;
  473 		s/&Eacute;/\xC9/g;
  474 		s/&Ouml;/\xD6/g;
  475 		s/&auml;/\xE4/g;
  476 		s/&aring;/\xE5/g;
  477 		s/&egrave;/\xE8/g;
  478 		s/&eacute;/\xE9/g;
  479 		s/&ouml;/\xF6/g;    
  480 	    }
  481 	    
  482 				      
  483 # Don't know if this should really be here, taken away for jargon
  484 	    s/(\n)\n/$1/g;
  485 	}
  486 
  487 	# special macro-conversion
  488 	if ($inverb eq nej) {
  489 		s/\$/&dollar;/gi;
  490 		s/([^&])#/$1&num;/gi;
  491 		s/%/&percnt;/gi;
  492 		s/~/&tilde;/gi;
  493 		s/\\/&bsol;/gi;
  494 		s/\[/&lsqb;/g;
  495 		s/\]/&rsqb;/g;
  496 		s/\^/&circ;/g;
  497 		s/_/&lowbar;/g;
  498 	
  499 			       }
  500 			       
  501 	if ($inquote eq ja) {
  502 		s/^\n//gi;
  503 	    }		       
  504 	
  505 	s/<P>|^<\/P>$//g;
  506 	
  507 	if ($inquote eq nej) {
  508 	
  509 		s/<\/P>/\n/gi;
  510 	} else {
  511 		s/<\/P>//gi;
  512 	}
  513 	
  514 	
  515 	if ($inquote eq ja) {
  516 		if (/^\s*$/) {
  517 			# Don't do anything, just remove empty lines from quotes
  518 		} else {
  519 	    		print sgml;
  520 	    	}
  521 	} else {	  
  522 		print sgml;
  523 	}
  524 			      
  525 }
  526 print sgml "</article>";
  527 close sgml or die "Could not finish writing $sgmlfile: $!\n";
  528 
  529 # convert any gifs (needs programs: giftopnm, ppmtopgm and pnmtops) to ps
  530 foreach $pic (@pics) {	# plain list loop; <@pics> would glob the names
  531 	if (-e $pic) {
  532 		($name, $ex) = split(/\./, $pic);
  533 		system("giftopnm \Q$pic\E | ppmtopgm | pnmtops -noturn > \Q$name\E.ps");	# \Q..\E escapes shell metacharacters
  534 	}
  535 }
  536 
  537 exit
  538 
  539 
  540 
  541 
  542