"Fossies" - the Fresh Open Source Software Archive

Member "roundup-2.0.0/roundup/dehtml.py" (29 Feb 2020, 6551 Bytes) of package /linux/www/roundup-2.0.0.tar.gz:


The requested HTML page contains a <FORM> tag that is unusable on "Fossies" in "automatic" (rendered) mode, so the page is shown as syntax-highlighted HTML source code (style: standard) with prefixed line numbers. Alternatively, you can view or download the uninterpreted source code file here. See also the latest Fossies "Diffs" side-by-side code changes report for "dehtml.py": 1.6.1_vs_2.0.0.

    1 
    2 from __future__ import print_function
    3 from roundup.anypy.strings import u2s, uchr
    4 
    5 import sys
# Major version number of the running Python interpreter (first field of
# sys.version_info).
_pyver = sys.version_info[0]
    7 
    8 class dehtml:
    9     def __init__(self, converter):
   10         if converter == "none":
   11             self.html2text = None
   12             return
   13 
   14         try:
   15             if converter == "beautifulsoup":
   16                 # Not as well tested as dehtml.
   17                 from bs4 import BeautifulSoup
   18 
   19                 def html2text(html):
   20                     soup = BeautifulSoup(html)
   21 
   22                     # kill all script and style elements
   23                     for script in soup(["script", "style"]):
   24                         script.extract()
   25 
   26                     return u2s(soup.get_text('\n', strip=True))
   27 
   28                 self.html2text = html2text
   29             else:
   30                 raise ImportError
   31         except ImportError:
   32             # use the fallback below if beautiful soup is not installed.
   33             try:
   34                 # Python 3+.
   35                 from html.parser import HTMLParser
   36                 from html.entities import name2codepoint
   37             except ImportError:
   38                 # Python 2.
   39                 from HTMLParser import HTMLParser
   40                 from htmlentitydefs import name2codepoint
   41 
   42             class DumbHTMLParser(HTMLParser):
   43                 # class attribute
   44                 text = ""
   45 
   46                 # internal state variable
   47                 _skip_data = False
   48                 _last_empty = False
   49 
   50                 def handle_data(self, data):
   51                     if self._skip_data:  # skip data in script or style block
   52                         return
   53 
   54                     if (data.strip() == ""):
   55                         # reduce multiple blank lines to 1
   56                         if (self._last_empty):
   57                             return
   58                         else:
   59                             self._last_empty = True
   60                     else:
   61                         self._last_empty = False
   62 
   63                     self.text = self.text + data
   64 
   65                 def handle_starttag(self, tag, attrs):
   66                     if (tag == "p"):
   67                         self.text = self.text + "\n"
   68                     if (tag in ("style", "script")):
   69                         self._skip_data = True
   70 
   71                 def handle_endtag(self, tag):
   72                     if (tag in ("style", "script")):
   73                         self._skip_data = False
   74 
   75                 def handle_entityref(self, name):
   76                     if self._skip_data:
   77                         return
   78                     c = uchr(name2codepoint[name])
   79                     try:
   80                         self.text = self.text + c
   81                     except UnicodeEncodeError:
   82                         # print a space as a placeholder
   83                         self.text = self.text + ' '
   84 
   85             def html2text(html):
   86                 if _pyver == 3:
   87                     parser = DumbHTMLParser(convert_charrefs=True)
   88                 else:
   89                     parser = DumbHTMLParser()
   90                 parser.feed(html)
   91                 parser.close()
   92                 return parser.text
   93 
   94             self.html2text = html2text
   95 
   96 
   97 if "__main__" == __name__:
   98     html = '''
   99 <body>
  100 <script>
  101 this must not be in output
  102 </script>
  103 <style>
  104 p {display:block}
  105 </style>
  106     <div class="header"><h1>Roundup</h1>
  107         <div id="searchbox" style="display: none">
  108           <form class="search" action="../search.html" method="get">
  109             <input type="text" name="q" size="18" />
  110             <input type="submit" value="Search" />
  111             <input type="hidden" name="check_keywords" value="yes" />
  112             <input type="hidden" name="area" value="default" />
  113           </form>
  114         </div>
  115         <script type="text/javascript">$('#searchbox').show(0);</script>
  116     </div>
  117        <ul class="current">
  118 <li class="toctree-l1"><a class="reference internal" href="../index.html">Home</a></li>
  119 <li class="toctree-l1"><a class="reference external" href="http://pypi.python.org/pypi/roundup">Download</a></li>
  120 <li class="toctree-l1 current"><a class="reference internal" href="../docs.html">Docs</a><ul class="current">
  121 <li class="toctree-l2"><a class="reference internal" href="features.html">Roundup Features</a></li>
  122 <li class="toctree-l2 current"><a class="current reference internal" href="">Installing Roundup</a></li>
  123 <li class="toctree-l2"><a class="reference internal" href="upgrading.html">Upgrading to newer versions of Roundup</a></li>
  124 <li class="toctree-l2"><a class="reference internal" href="FAQ.html">Roundup FAQ</a></li>
  125 <li class="toctree-l2"><a class="reference internal" href="user_guide.html">User Guide</a></li>
  126 <li class="toctree-l2"><a class="reference internal" href="customizing.html">Customising Roundup</a></li>
  127 <li class="toctree-l2"><a class="reference internal" href="admin_guide.html">Administration Guide</a></li>
  128 </ul>
  129 <div class="section" id="prerequisites">
  130 <h2><a class="toc-backref" href="#id5">Prerequisites</a></h2>
  131 <p>Roundup requires Python 2.5 or newer (but not Python 3) with a functioning
  132 anydbm module. Download the latest version from <a class="reference external" href="http://www.python.org/">http://www.python.org/</a>.
  133 It is highly recommended that users install the latest patch version
  134 of python as these contain many fixes to serious bugs.</p>
  135 <p>Some variants of Linux will need an additional &#8220;python dev&#8221; package
  136 installed for Roundup installation to work. Debian and derivatives, are
  137 known to require this.</p>
  138 <p>If you&#8217;re on windows, you will either need to be using the ActiveState python
  139 distribution (at <a class="reference external" href="http://www.activestate.com/Products/ActivePython/">http://www.activestate.com/Products/ActivePython/</a>), or you&#8217;ll
  140 have to install the win32all package separately (get it from
  141 <a class="reference external" href="http://starship.python.net/crew/mhammond/win32/">http://starship.python.net/crew/mhammond/win32/</a>).</p>
  142 <script>
  143   &lt; HELP &GT;
  144 </script>
  145 </div>
  146 </body>
  147 '''
  148 
  149     html2text = dehtml("dehtml").html2text
  150     if html2text:
  151         print(html2text(html))
  152 
  153     try:
  154         # trap error seen if N_TOKENS not defined when run.
  155         html2text = dehtml("beautifulsoup").html2text
  156         if html2text:
  157             print(html2text(html))
  158     except NameError as e:
  159         print("captured error %s" % e)
  160 
  161     html2text = dehtml("none").html2text
  162     if html2text:
  163         print("FAIL: Error, dehtml(none) is returning a function")
  164     else:
  165         print("PASS: dehtml(none) is returning None")