"Fossies" - the Fresh Open Source Software Archive

Member "OCRmyPDF-8.3.0/src/ocrmypdf/exec/unpaper.py" (13 May 2019, 4548 Bytes) of package /linux/privat/OCRmyPDF-8.3.0.tar.gz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) Python source code syntax highlighting (style: standard) with prefixed line numbers. Alternatively, you can view or download the uninterpreted source code file here. For more information about "unpaper.py", see the Fossies "Dox" file reference documentation and the last Fossies "Diffs" side-by-side code changes report: 8.0.1_vs_8.1.0.

    1 # © 2015 James R. Barlow: github.com/jbarlow83
    2 #
    3 # This file is part of OCRmyPDF.
    4 #
    5 # OCRmyPDF is free software: you can redistribute it and/or modify
    6 # it under the terms of the GNU General Public License as published by
    7 # the Free Software Foundation, either version 3 of the License, or
    8 # (at your option) any later version.
    9 #
   10 # OCRmyPDF is distributed in the hope that it will be useful,
   11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
   12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   13 # GNU General Public License for more details.
   14 #
   15 # You should have received a copy of the GNU General Public License
   16 # along with OCRmyPDF.  If not, see <http://www.gnu.org/licenses/>.
   17 
   18 # unpaper documentation:
   19 # https://github.com/Flameeyes/unpaper/blob/master/doc/basic-concepts.md
   20 
   21 import os
   22 import shlex
   23 import subprocess
   24 import sys
   25 from functools import lru_cache
   26 from subprocess import PIPE, STDOUT, CalledProcessError
   27 from tempfile import TemporaryDirectory
   28 
   29 from . import get_version
   30 from ..exceptions import MissingDependencyError, SubprocessOutputError
   31 
   32 try:
   33     from PIL import Image
   34 except ImportError:
   35     print("Could not find Python3 imaging library", file=sys.stderr)
   36     raise
   37 
   38 
   39 @lru_cache(maxsize=1)
   40 def version():
   41     return get_version('unpaper')
   42 
   43 
   44 def run(input_file, output_file, dpi, log, mode_args):
   45     args_unpaper = ['unpaper', '-v', '--dpi', str(dpi)] + mode_args
   46 
   47     SUFFIXES = {'1': '.pbm', 'L': '.pgm', 'RGB': '.ppm'}
   48 
   49     im = Image.open(input_file)
   50     if im.mode not in SUFFIXES.keys():
   51         log.info("Converting image to other colorspace")
   52         try:
   53             if im.mode == 'P' and len(im.getcolors()) == 2:
   54                 im = im.convert(mode='1')
   55             else:
   56                 im = im.convert(mode='RGB')
   57         except IOError as e:
   58             log.error("Could not convert image with type " + im.mode)
   59             im.close()
   60             raise MissingDependencyError() from e
   61 
   62     try:
   63         suffix = SUFFIXES[im.mode]
   64     except KeyError:
   65         log.error("Failed to convert image to a supported format.")
   66         im.close()
   67         raise MissingDependencyError() from e
   68 
   69     with TemporaryDirectory() as tmpdir:
   70         input_pnm = os.path.join(tmpdir, f'input{suffix}')
   71         output_pnm = os.path.join(tmpdir, f'output{suffix}')
   72         im.save(input_pnm, format='PPM')
   73         im.close()
   74 
   75         # To prevent any shenanigans from accepting arbitrary parameters in
   76         # --unpaper-args, we:
   77         # 1) run with cwd set to a tmpdir with only unpaper's files
   78         # 2) forbid the use of '/' in arguments, to prevent changing paths
   79         # 3) append absolute paths for the input and output file
   80         # This should ensure that a user cannot clobber some other file with
   81         # their unpaper arguments (whether intentionally or otherwise)
   82         args_unpaper.extend([input_pnm, output_pnm])
   83         try:
   84             proc = subprocess.run(
   85                 args_unpaper,
   86                 check=True,
   87                 close_fds=True,
   88                 universal_newlines=True,
   89                 stderr=STDOUT,
   90                 cwd=tmpdir,
   91                 stdout=PIPE,
   92             )
   93         except CalledProcessError as e:
   94             log.debug(e.output)
   95             raise e from e
   96         else:
   97             log.debug(proc.stdout)
   98             # unpaper sets dpi to 72; fix this
   99             try:
  100                 Image.open(output_pnm).save(output_file, dpi=(dpi, dpi))
  101             except (FileNotFoundError, OSError):
  102                 raise SubprocessOutputError(
  103                     "unpaper: failed to produce the expected output file. Called with: "
  104                     + str(args_unpaper)
  105                 ) from None
  106 
  107 
  108 def validate_custom_args(args: str):
  109     unpaper_args = shlex.split(args)
  110     if any('/' in arg for arg in unpaper_args):
  111         raise ValueError('No filenames allowed in --unpaper-args')
  112     return unpaper_args
  113 
  114 
  115 def clean(input_file, output_file, dpi, log, unpaper_args=None):
  116     default_args = [
  117         '--layout',
  118         'none',
  119         '--mask-scan-size',
  120         '100',  # don't blank out narrow columns
  121         '--no-border-align',  # don't align visible content to borders
  122         '--no-mask-center',  # don't center visible content within page
  123         '--no-grayfilter',  # don't remove light gray areas
  124         '--no-blackfilter',  # don't remove solid black areas
  125         '--no-deskew',  # don't deskew
  126     ]
  127     if not unpaper_args:
  128         unpaper_args = default_args
  129     run(input_file, output_file, dpi, log, unpaper_args)