# ---------------------------------------------------------------------------
# Reviewer overview (added documentation; the code below is unchanged).
#
# Purpose: convert one HTML file into a linuxdoc-sgml style .sgml file.
#
# Invocation: a single argument, the HTML file name.  The script dies with
# the usage string when @ARGV is empty.  The output name is the text that
# precedes ".html" in the argument, with ".sgml" appended.
#
# Processing steps, in the order they appear in the code:
#   1. Open the input and output handles and print a header heredoc.
#   2. If "<name>.aw" exists next to the input, scan it and fill %footnotes,
#      keyed by the footnote sequence number kept in %tmpftn.
#   3. First scan of the HTML: count H1 headings in $hone and, for the first
#      row of every table, append one "l" per </TD> to $table{$nrtable}.
#      When more than one H1 was seen, H1 becomes <sect>; otherwise H2 does.
#   4. Print the <author>/<date>/<abstract>/<toc> placeholder heredoc.
#   5. Second scan of the HTML: rewrite each line with an ordered series of
#      substitutions and print it to the sgml handle.  The comments in the
#      code state that the order matters.  "<SUP>digits" is replaced by
#      <footnote>...</footnote> using the next entry of %footnotes.
#      <TABLE> becomes <tabular ca="..."> using the column string collected
#      in step 3.  <IMG SRC=...> becomes a <figure><eps file=...> element
#      and the image file name is pushed onto @pics.
#   6. Print "</article>", close the output, and for every existing file in
#      @pics run "giftopnm | ppmtopgm | pnmtops -noturn" to write <name>.ps.
#
# State flags ($inverb, $inquote, $intable, $start_note, $inT) hold the
# barewords ja / nej (Swedish for yes / no) and are compared with eq.
#
# NOTE(review): this copy of the script looks damaged.  Line breaks have
# been collapsed, and text that resembled markup or character entities
# appears to be missing: the read loops are written "while ()", several
# patterns are just /^$/, the first heredoc has no body, and substitutions
# such as s/</</g or s/&/&/gi replace a string with itself.  Presumably the
# pristine source had filehandle reads, tag patterns and entity names in
# those places -- TODO confirm against an undamaged copy before running or
# refactoring.
#
# NOTE(review): things to revisit once a clean copy is available:
#   - no "use strict" / "use warnings"; every variable is a package global;
#   - every open() is the two-argument form and its result is not checked;
#   - the "Could not figure out postfix" message interpolates $!, which is
#     not set by a failed pattern match;
#   - replacement strings use \1 / \2 rather than $1 / $2;
#   - "while (<@pics>)" iterates through glob() rather than over the array;
#   - the image file name is interpolated into a backtick shell pipeline.
# ---------------------------------------------------------------------------
# html2sgml - converts html-filese to # v 0.3 # Rude script to convert a html-file to a sgml-file in a format # suitabel for linuxdoc-sgml. See html2sgml.1 for ducumentation. # (c) Peter Antman, 1997. # send bug reports to: # peter.antman@abc.se # # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. # $THISPROG = "html2sgml 0.3"; $usage = "usage: html2sgml fil.html"; die $usage if @ARGV < 1; $htmlfile = shift (@ARGV); $htmlfile =~ /([\S]*?)\.html/ or die "Could not figure out postfix for filename: $!\n"; $filename = $1; $sgmlfile = "$filename.sgml"; open (html, $htmlfile); open (sgml, ">$sgmlfile"); # Make sgml-header print sgml <<'End_off';
End_off # If we can find an applix-world file in the same directory with # the same name: try to get footnotes if (-e "$filename.aw") { open (aw, "$filename.aw"); $nrfootnotes = 0; $start_note = nej; $inT = nej; while () { # Find all fotnotes and their number and sequensnumber if (/^$/) { $number = $1; $nrfootnotes++; $tmpftn{$number} = "$nrfootnotes"; } # What footnote are we taking the content from if (/^$/) { $whichnote = $1; $start_note = ja; } # En open footnote if (/^$/) { $start_note = nej; } # Grab the content if ($start_note eq ja) { if (!((/footnote_body/) or (/)/) { s/\\//g; s/^\s//g; s/\n//g; if (/.*?>/) { $tmpharb = "$tmpharb" . "$_"; if ($tmpharb =~ /italic/) { $tmpharb =~ s/\"\s.*?>//; $footnotes{$tmpftn{$whichnote}} = "$footnotes{$tmpftn{$whichnote}}" . "$tmpharb"; } elsif ($tmpharb =~ /bold/) { $tmpharb =~ s/\"\s.*?>//; $footnotes{$tmpftn{$whichnote}} = "$footnotes{$tmpftn{$whichnote}}" . "$tmpharb"; } else { $tmpharb =~ s/\".*?>$//; $footnotes{$tmpftn{$whichnote}} = "$footnotes{$tmpftn{$whichnote}}" . "$tmpharb"; } $inT = nej; } else { $tmpharb = "$tmpharb" . "$_"; } } } # The real footnotecontent if (/( # 2) # 3) if (/$/) { $cont = $1; $cont =~ s/\\//g; $it = $2; if (/italic/) { $cont = "$cont"; } if (/bold/) { $cont = "$cont"; } $footnotes{$tmpftn{$whichnote}} = "$footnotes{$tmpftn{$whichnote}}" . "$cont"; } if (/) { if ($_ =~/\([^\<]*)/i) { print sgml "$1\n"; } if ($_ =~/\<H1.*?\>/i) { $hone++; } # We have to count tables and the amount of coloumns in them if ($_=~/<TABLE.*?>/i) { $intable = ja; $nrtable++; } if ($intable eq ja) { if ($_=~/<TR.*?>/) { $nrrow++; } if ($nrrow == 1) { if (/<\/TD.*?>/) { $table{$nrtable} = "$table{$nrtable}" . 
"l"; } } } if ($_=~/<\/TABLE.*?>/) { $intable = nej; $nrrow = 0; } } if ($hone > 1 ) { $sect = H1; } else { $sect = H2; } close html; $nrtable = 0; $intable = nej; # Lets first write I couple of thing we probably cant figure out # Would be good if we could figure out the name of the writer # Perhaps ask for one? print sgml <<'End_off_sub'; <author>You name <date> <abstract> </abstract> <toc> End_off_sub # Now we can start the real substitution, the order of the substitution is # in some cases realy important open (html, $htmlfile); $inquote = nej; $inverb = nej; $nrfootnotes = 0; while (<html>) { # must be first for sake of footnotes s/<FONT.*?>|<\/FONT>//gi; # footnotes, as they are converted in Applix aw->html if (/<SUP>|<\/SUP>/) { # grab all footnotes on the current line while (/<SUP>[0-9]+/) { $nrfootnotes++; $putin = $footnotes{$nrfootnotes}; s/<SUP>[0-9]+/<footnote>$putin<\/footnote>/; } # if foonotenumber noot on the same line as <SUP> while (/[0-9]+<\/SUP>/) { $nrfootnotes++; $putin = $footnotes{$nrfootnotes}; s/[0-9]+<\/SUP>/<footnote>$putin<\/footnote>/; } } # Things to remowe s/<SUP>|<\/SUP>//gi; s/<HTML>|<\/HTML>//gi; s/<HEAD>|<\/HEAD>//gi; s/<TITLE>((.*<\/TITLE>)|(.*$))//gi; s/<\/TITLE>//gi; s/<HR>//gi; s/<U>|<\/U>//gi; s/<BODY.*?>|\<\/BODY>//gi; s/[0-9]*<\/SUP>//gi; s/<TT>|<\/TT>//gi; s/<SAMP>|<\/SAMP>//gi; s/<CENTER>|<\/CENTER>//gi; s/<ADDRESS>|<\/ADDRESS>//gi; s/<P\s.*?>//gi; #things to substitute # fist sectionlevel # first ones that has no meaning, remove them s/<H1><\/H1>//gi; s/<H2><\/H2>//gi; s/<H3><\/H3>//gi; s/<H4><\/H4>//gi; s/<H5><\/H5>//gi; if ($sect eq H1) { s/<H1.*?>/<sect>/gi; s/<\/H1>/\n<p>/gi; s/<H2.*?>/<sect1>/gi; s/<\/H2>/\n<p>/gi; s/<H3.*?>/<sect2>/gi; s/<\/H3>/\n<p>/gi; s/<H4.*?>/<sect3>/gi; s/<\/H4>/\n<p>/gi; s/<H5.*?>/<sect4>/gi; s/<\/H5>/\n<p>/gi; } else { s/<H2.*?>/<sect>/gi; s/<\/H2>/\n<p>/gi; s/<H3.*?>/<sect1>/gi; s/<\/H3>/\n<p>/gi; s/<H4.*?>/<sect2>/gi; s/<\/H4>/\n<p>/gi; s/<H5.*?>/<sect3>/gi; s/<\/H5>/\n<p>/gi; } # lets 
take al one to one things s/<BR>//gi; s/<UL>/<itemize>/gi; s/<\/UL>/<\/itemize>/gi; s/<OL>/<enum>/gi; s/<\/OL>/<\/enum>/gi; s/<DIR>/<itemize>/gi; s/<\/DIR>/<\/itemize>/gi; # Does not generate good results s/<DL.*?>/<descrip>/gi; s/<\/DL>/<\/descrip>/gi; s/<DT>/<tag>/gi; s/<DD>/<\/tag>/gi; s/<LI>/<item>/gi; s/<\/LI>/<\/item>/gi; s/<LQ>/<lq>/gi; s/<\/LQ>/<\/lq>/gi; s/<EM>/<em>/g; s/<\/EM>/<\/em>/g; s/<CITE>/<em>/gi; s/<\/CITE>/<\/em>/gi; s/<B>/<bf>/gi; s/<\/B>/<\/bf>/gi; s/<STRONG>/<bf>/gi; s/<\/STRONG>/<\/bf>/gi; s/<I>/<it>/gi; s/<\/I>/<\/it>/gi; s/<SF>/<sf>/g; s/<\/SF>/<\/sf>/g; s/<CODE>/<tt>/gi; s/<\/CODE>/<\/tt>\n/gi; s/<PRE>/<tscreen><verb>/gi; # \n taken away from ending if tscreen s/<\/PRE>/<\/verb><\/tscreen>/gi; s/<MC>/<mc>/gi; s/<\/MC>/<\/mc>/gi; s/"/''/gi; s/<DFN>/<tt>/gi; s/<\/DFN>/<\/tt>/gi; # To prevent norty things to happend i latex s/``/''/g; s/</</g; s/>/>/g; s/&/&/gi; # we have to do special things inside verb and quote if (/\<tscreen\>\<verb>/) { $inverb = ja; } if (/\<\/verb\>\<\/tscreen\>/) { $inverb = nej; } # quote does not like empty rows if (/\<BLOCKQUOTE\>/i) { $inquote = ja; s/<BLOCKQUOTE>/<quote>/gi; } if (/\<\/BLOCKQUOTE\>/i) { $inquote = nej; s/<\/BLOCKQUOTE>/<\/quote>/gi; } #table - hm...probably only works for applix, # to work with sgml2html, change tabular to table in mapping if ($_=~/<TABLE.*?>/i) { $nrtable++; s/<TABLE.*>?/<tabular ca=\"$table{$nrtable}\">/gi; } s/<TR.*?>//gi; s/<\/TD><\/TR>/<rowsep>/gi; s/<TD.*?>//gi; s/<\/TD>/<colsep>/gi; if ($_=~/<\/TABLE.*?>/i) { s/<\/TABLE>/<\/tabular>/gi; } #urls - pure url does not produce god looking formats - use htmlurl for all s/<A\sNAME=\"(.*)\">?(.*)<\/A>/\2<label id=\"\1\">/gi; s/<A\sNAME=(.*?)>(.*?)<\/A>/\2<label id=\"\1\">/gi; s/<A\sHREF=\"#([^\"]*)\">?(.*?)<\/A>/<htmlurl url=\"\1\" name=\"\2\">/gi;; s/<A\sHREF=\"([^\"]*)\">?(.*?)<\/A>/<htmlurl url=\"\1\" name=\"\2\">/gi; s/<A\sHREF=\"([^\"]*)\">?(.*?)$/<htmlurl url=\"\1\" name=\"\2\">/gi; # pure formaters dont use quotes 
s/<A\sHREF=#([^\"]*?)>(.*?)<\/A>/<htmlurl url=\"\1\" name=\"\2\">/gi;; s/<A\sHREF=([^\"]*?)>(.*?)<\/A>/<htmlurl url=\"\1\" name=\"\2\">/gi; s/<A\sHREF=([^\"]*?)>(.*?)$/<htmlurl url=\"\1\" name=\"\2\">/gi; # img - makes a eps-img, will only convert gifs if ($_=~/\<IMG.*?SRC=([^>]*?)\.([^>]*?)>/i) { $img = $1; $img =~ s/\"//g; $ext = $2; $ext =~ s/\"//g; s/<IMG.*?SRC=[^>]*?>/<figure>\n<eps file=\"$img\">\n<\/figure>/gi; # save the pics push(@pics, "$img.$ext"); } # remove stale things s/<\/A>//gi; # Fix smlish away from verb-env, should be more... if ($inverb eq ja) { if (!/<quote>|<tscreen><verb>|<footnote>|<\/footnote>/) { s/<.*?>|<\/.*?>//gi; s/>/>/gi; s/</</gi; s/[/\[/g; s/]/\]/g; s/&/&ero;/gi; # iso-char have to be iso-char in verb # not all implemented s/Ä//g; s/Å//g; s/È//g; s/É//g; s/Ö//g; s/ä//g; s/å//g; s/è//g; s/é//g; s/ö//g; } # Dont know if this should realy be here, taken away for jargon s/(\n)\n/\1/g; } # special macro-conversion if ($inverb eq nej) { s/\$/$/gi; s/([^&])#/\1#/gi; s/%/%/gi; s/~/˜/gi; s/\\/\/gi; s/\[/[/g; s/\]/]/g; s/\^/ˆ/g; s/_/_/g; } if ($inquote eq ja) { s/^\n//gi; } s/<P>|^<\/P>$//g; if ($inquote eq nej) { s/<\/P>/\n/gi; } else { s/<\/P>//gi; } if ($inquote eq ja) { if (/^\s*$/) { # Dont do anything, just remove empty lines from quotes } else { print sgml; } } else { print sgml; } } print sgml "</article>"; close sgml; # convert any gifs (needs programs: giftopnm, ppptopgm and pnmtops) to ps while (<@pics>) { if (-e "$_"){ ($name, $ex) = split(/\./, $_); `giftopnm $_ | ppmtopgm | pnmtops -noturn > $name.ps`; } } exit