"Fossies" - the Fresh Open Source Software Archive  

Source code changes of the file "src/ocrmypdf/exec/tesseract.py" between
OCRmyPDF-8.0.1.tar.gz and OCRmyPDF-8.1.0.tar.gz

About: OCRmyPDF adds an OCR text layer to scanned PDF files, allowing them to be searched.

tesseract.py (OCRmyPDF-8.0.1) : tesseract.py (OCRmyPDF-8.1.0)
skipping to change at line 195 skipping to change at line 195
raise TesseractConfigError(problem) raise TesseractConfigError(problem)
elif 'error' in line.lower() or 'exception' in line.lower(): elif 'error' in line.lower() or 'exception' in line.lower():
log.error(prefix + line.strip()) log.error(prefix + line.strip())
elif 'warning' in line.lower(): elif 'warning' in line.lower():
log.warning(prefix + line.strip()) log.warning(prefix + line.strip())
elif 'read_params_file' in line.lower(): elif 'read_params_file' in line.lower():
log.error(prefix + line.strip()) log.error(prefix + line.strip())
else: else:
log.info(prefix + line.strip()) log.info(prefix + line.strip())
def page_timedout(log, input_file): def page_timedout(log, input_file, timeout):
if timeout == 0:
return
prefix = f"{(page_number(input_file)):4d}: [tesseract] " prefix = f"{(page_number(input_file)):4d}: [tesseract] "
log.warning(prefix + " took too long to OCR - skipping") log.warning(prefix + " took too long to OCR - skipping")
def _generate_null_hocr(output_hocr, output_sidecar, image): def _generate_null_hocr(output_hocr, output_sidecar, image):
"""Produce a .hocr file that reports no text detected on a page that is """Produce a .hocr file that reports no text detected on a page that is
the same size as the input image.""" the same size as the input image."""
from PIL import Image from PIL import Image
im = Image.open(image) im = Image.open(image)
w, h = im.size w, h = im.size
skipping to change at line 250 skipping to change at line 252
# Reminder: test suite tesseract spoofers will break after any changes # Reminder: test suite tesseract spoofers will break after any changes
# to the number of order parameters here # to the number of order parameters here
args_tesseract.extend([input_file, prefix, 'hocr', 'txt'] + tessconfig) args_tesseract.extend([input_file, prefix, 'hocr', 'txt'] + tessconfig)
try: try:
log.debug(args_tesseract) log.debug(args_tesseract)
stdout = check_output(args_tesseract, stderr=STDOUT, timeout=timeout) stdout = check_output(args_tesseract, stderr=STDOUT, timeout=timeout)
except TimeoutExpired: except TimeoutExpired:
# Generate a HOCR file with no recognized text if tesseract times out # Generate a HOCR file with no recognized text if tesseract times out
# Temporary workaround to hocrTransform not being able to function if # Temporary workaround to hocrTransform not being able to function if
# it does not have a valid hOCR file. # it does not have a valid hOCR file.
page_timedout(log, input_file) page_timedout(log, input_file, timeout)
_generate_null_hocr(output_hocr, output_sidecar, input_file) _generate_null_hocr(output_hocr, output_sidecar, input_file)
except CalledProcessError as e: except CalledProcessError as e:
tesseract_log_output(log, e.output, input_file) tesseract_log_output(log, e.output, input_file)
if b'Image too large' in e.output: if b'Image too large' in e.output:
_generate_null_hocr(output_hocr, output_sidecar, input_file) _generate_null_hocr(output_hocr, output_sidecar, input_file)
return return
raise e from e raise e from e
else: else:
tesseract_log_output(log, stdout, input_file) tesseract_log_output(log, stdout, input_file)
skipping to change at line 338 skipping to change at line 340
# to the number of order parameters here # to the number of order parameters here
args_tesseract.extend([input_image, prefix, 'pdf', 'txt'] + tessconfig) args_tesseract.extend([input_image, prefix, 'pdf', 'txt'] + tessconfig)
try: try:
log.debug(args_tesseract) log.debug(args_tesseract)
stdout = check_output(args_tesseract, stderr=STDOUT, timeout=timeout) stdout = check_output(args_tesseract, stderr=STDOUT, timeout=timeout)
if os.path.exists(prefix + '.txt'): if os.path.exists(prefix + '.txt'):
shutil.move(prefix + '.txt', output_text) shutil.move(prefix + '.txt', output_text)
except TimeoutExpired: except TimeoutExpired:
page_timedout(log, input_image) page_timedout(log, input_image, timeout)
use_skip_page(text_only, skip_pdf, output_pdf, output_text) use_skip_page(text_only, skip_pdf, output_pdf, output_text)
except CalledProcessError as e: except CalledProcessError as e:
tesseract_log_output(log, e.output, input_image) tesseract_log_output(log, e.output, input_image)
if b'Image too large' in e.output: if b'Image too large' in e.output:
use_skip_page(text_only, skip_pdf, output_pdf, output_text) use_skip_page(text_only, skip_pdf, output_pdf, output_text)
return return
raise e from e raise e from e
else: else:
tesseract_log_output(log, stdout, input_image) tesseract_log_output(log, stdout, input_image)
 End of changes. 3 change blocks. 
3 lines changed or deleted; 5 lines changed or added

Home  |  About  |  Features  |  All  |  Newest  |  Dox  |  Diffs  |  RSS Feeds  |  Screenshots  |  Comments  |  Imprint  |  Privacy  |  HTTP(S)