"Fossies" - the Fresh Open Source Software Archive  

Source code changes of the file "src/ocrmypdf/exec/unpaper.py" between
OCRmyPDF-8.0.1.tar.gz and OCRmyPDF-8.1.0.tar.gz

About: OCRmyPDF adds an OCR text layer to scanned PDF files, allowing them to be searched.

unpaper.py  (OCRmyPDF-8.0.1):unpaper.py  (OCRmyPDF-8.1.0)
skipping to change at line 22 skipping to change at line 22
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details. # GNU General Public License for more details.
# #
# You should have received a copy of the GNU General Public License # You should have received a copy of the GNU General Public License
# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>. # along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
# unpaper documentation: # unpaper documentation:
# https://github.com/Flameeyes/unpaper/blob/master/doc/basic-concepts.md # https://github.com/Flameeyes/unpaper/blob/master/doc/basic-concepts.md
import os import os
import shlex
import subprocess
import sys import sys
from functools import lru_cache from functools import lru_cache
from subprocess import STDOUT, CalledProcessError, check_output from subprocess import PIPE, STDOUT, CalledProcessError
from tempfile import NamedTemporaryFile from tempfile import TemporaryDirectory
from . import get_version from . import get_version
from ..exceptions import MissingDependencyError from ..exceptions import MissingDependencyError, SubprocessOutputError
try: try:
from PIL import Image from PIL import Image
except ImportError: except ImportError:
print("Could not find Python3 imaging library", file=sys.stderr) print("Could not find Python3 imaging library", file=sys.stderr)
raise raise
@lru_cache(maxsize=1) @lru_cache(maxsize=1)
def version(): def version():
return get_version('unpaper') return get_version('unpaper')
skipping to change at line 65 skipping to change at line 67
im.close() im.close()
raise MissingDependencyError() from e raise MissingDependencyError() from e
try: try:
suffix = SUFFIXES[im.mode] suffix = SUFFIXES[im.mode]
except KeyError: except KeyError:
log.error("Failed to convert image to a supported format.") log.error("Failed to convert image to a supported format.")
im.close() im.close()
raise MissingDependencyError() from e raise MissingDependencyError() from e
with NamedTemporaryFile(suffix=suffix) as input_pnm, NamedTemporaryFile( with TemporaryDirectory() as tmpdir:
suffix=suffix, mode="r+b" input_pnm = os.path.join(tmpdir, f'input{suffix}')
) as output_pnm: output_pnm = os.path.join(tmpdir, f'output{suffix}')
im.save(input_pnm, format='PPM') im.save(input_pnm, format='PPM')
im.close() im.close()
os.unlink(output_pnm.name) # To prevent any shenanigans from accepting arbitrary parameters in
# --unpaper-args, we:
args_unpaper.extend([input_pnm.name, output_pnm.name]) # 1) run with cwd set to a tmpdir with only unpaper's files
# 2) forbid the use of '/' in arguments, to prevent changing paths
# 3) append absolute paths for the input and output file
# This should ensure that a user cannot clobber some other file with
# their unpaper arguments (whether intentionally or otherwise)
args_unpaper.extend([input_pnm, output_pnm])
try: try:
stdout = check_output( proc = subprocess.run(
args_unpaper, close_fds=True, universal_newlines=True, stderr=STDOUT args_unpaper,
check=True,
close_fds=True,
universal_newlines=True,
stderr=STDOUT,
cwd=tmpdir,
stdout=PIPE,
) )
except CalledProcessError as e: except CalledProcessError as e:
log.debug(e.output) log.debug(e.output)
raise e from e raise e from e
else: else:
log.debug(stdout) log.debug(proc.stdout)
# unpaper sets dpi to 72 # unpaper sets dpi to 72; fix this
Image.open(output_pnm.name).save(output_file, dpi=(dpi, dpi)) try:
Image.open(output_pnm).save(output_file, dpi=(dpi, dpi))
def clean(input_file, output_file, dpi, log): except (FileNotFoundError, OSError):
run( raise SubprocessOutputError(
input_file, "unpaper: failed to produce the expected output file. Called with: "
output_file,
dpi, + str(args_unpaper)
log, ) from None
[
'--layout', def validate_custom_args(args: str):
'none', unpaper_args = shlex.split(args)
'--mask-scan-size', if any('/' in arg for arg in unpaper_args):
'100', # don't blank out narrow columns raise ValueError('No filenames allowed in --unpaper-args')
'--no-border-align', # don't align visible content to borders return unpaper_args
'--no-mask-center', # don't center visible content within page
'--no-grayfilter', # don't remove light gray areas def clean(input_file, output_file, dpi, log, unpaper_args=None):
'--no-blackfilter', # don't remove solid black areas default_args = [
'--no-deskew', # don't deskew '--layout',
], 'none',
) '--mask-scan-size',
'100', # don't blank out narrow columns
'--no-border-align', # don't align visible content to borders
'--no-mask-center', # don't center visible content within page
'--no-grayfilter', # don't remove light gray areas
'--no-blackfilter', # don't remove solid black areas
'--no-deskew', # don't deskew
]
if not unpaper_args:
unpaper_args = default_args
run(input_file, output_file, dpi, log, unpaper_args)
 End of changes. 7 change blocks. 
12 lines changed or deleted 24 lines changed or added

Home  |  About  |  Features  |  All  |  Newest  |  Dox  |  Diffs  |  RSS Feeds  |  Screenshots  |  Comments  |  Imprint  |  Privacy  |  HTTP(S)