"Fossies" - the Fresh Open Source Software Archive  

Source code changes of the file "src/ocrmypdf/_weave.py" between
OCRmyPDF-8.0.1.tar.gz and OCRmyPDF-8.1.0.tar.gz

About: OCRmyPDF adds an OCR text layer to scanned PDF files, allowing them to be searched.

_weave.py  (OCRmyPDF-8.0.1):_weave.py  (OCRmyPDF-8.1.0)
skipping to change at page 1, line 21 skipping to change at page 1, line 21
# OCRmyPDF is distributed in the hope that it will be useful, # OCRmyPDF is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of # but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details. # GNU General Public License for more details.
# #
# You should have received a copy of the GNU General Public License # You should have received a copy of the GNU General Public License
# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>. # along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
from itertools import groupby from itertools import groupby
from pathlib import Path from pathlib import Path
import os
import pikepdf import pikepdf
from .exec import tesseract from .exec import tesseract
from .helpers import flatten_groups, page_number from .helpers import flatten_groups, page_number
MAX_OPEN_PAGE_PDFS = int(os.environ.get('_OCRMYPDF_MAX_OPEN_PAGE_PDFS', 100))
def _update_page_resources(*, page, font, font_key, procset): def _update_page_resources(*, page, font, font_key, procset):
"Update this page's fonts with a reference to the Glyphless font" """Update this page's fonts with a reference to the Glyphless font"""
if '/Resources' not in page: if '/Resources' not in page:
page['/Resources'] = pikepdf.Dictionary({}) page['/Resources'] = pikepdf.Dictionary({})
resources = page['/Resources'] resources = page['/Resources']
try: try:
fonts = resources['/Font'] fonts = resources['/Font']
except KeyError: except KeyError:
fonts = pikepdf.Dictionary({}) fonts = pikepdf.Dictionary({})
if font_key not in fonts: if font_key is not None and font_key not in fonts:
fonts[font_key] = font fonts[font_key] = font
resources['/Font'] = fonts resources['/Font'] = fonts
# Reassign /ProcSet to one that just lists everything - ProcSet is # Reassign /ProcSet to one that just lists everything - ProcSet is
# obsolete and doesn't matter but recommended for old viewer support # obsolete and doesn't matter but recommended for old viewer support
resources['/ProcSet'] = procset resources['/ProcSet'] = procset
def strip_invisible_text(pdf, page, log): def strip_invisible_text(pdf, page, log):
stream = [] stream = []
in_text_obj = False in_text_obj = False
skipping to change at page 1, line 160 skipping to change at page 1, line 163
if strip_old_text: if strip_old_text:
strip_invisible_text(pdf_base, base_page, log) strip_invisible_text(pdf_base, base_page, log)
base_page.page_contents_add(new_text_layer, prepend=True) base_page.page_contents_add(new_text_layer, prepend=True)
_update_page_resources( _update_page_resources(
page=base_page, font=font, font_key=font_key, procset=procset page=base_page, font=font, font_key=font_key, procset=procset
) )
def _find_font(text, pdf_base): def _find_font(text, pdf_base):
"Copy a font from the filename text into pdf_base" """Copy a font from the filename text into pdf_base"""
font, font_key = None, None font, font_key = None, None
possible_font_names = ('/f-0-0', '/F1') possible_font_names = ('/f-0-0', '/F1')
try: try:
pdf_text = pikepdf.open(text) pdf_text = pikepdf.open(text)
pdf_text_fonts = pdf_text.pages[0].Resources.get('/Font', {}) pdf_text_fonts = pdf_text.pages[0].Resources.get('/Font', {})
except Exception: except Exception:
return None, None return None, None
for f in possible_font_names: for f in possible_font_names:
pdf_text_font = pdf_text_fonts.get(f, None) pdf_text_font = pdf_text_fonts.get(f, None)
if pdf_text_font is not None: if pdf_text_font is not None:
font_key = f font_key = f
break break
if pdf_text_font: if pdf_text_font:
font = pdf_base.copy_foreign(pdf_text_font) font = pdf_base.copy_foreign(pdf_text_font)
if font_key is None:
print('font_key is None')
return font, font_key return font, font_key
def _traverse_toc(pdf_base, visitor_fn, log): def _traverse_toc(pdf_base, visitor_fn, log):
""" """
Walk the table of contents, calling visitor_fn() at each node Walk the table of contents, calling visitor_fn() at each node
The /Outlines data structure is a messy data structure, but rather than The /Outlines data structure is a messy data structure, but rather than
navigating hierarchically we just track unique nodes. Enqueue nodes when navigating hierarchically we just track unique nodes. Enqueue nodes when
we find them, and never visit them again. set() is awesome. We look for we find them, and never visit them again. set() is awesome. We look for
the two types of object in the table of contents that can be page bookmarks the two types of object in the table of contents that can be page bookmarks
skipping to change at page 1, line 243 skipping to change at page 1, line 248
""" """
if not pageref_remap: if not pageref_remap:
return return
def remap_dest(dest_node): def remap_dest(dest_node):
""" """
Inner helper function: change the objgen for any page from the old we Inner helper function: change the objgen for any page from the old we
invalidated to its new one. invalidated to its new one.
""" """
if not isinstance(dest_node, pikepdf.Array): try:
return pageref = dest_node[0]
pageref = dest_node[0] if pageref['/Type'] == '/Page' and pageref.objgen in pageref_remap:
if pageref['/Type'] == '/Page' and pageref.objgen in pageref_remap: new_objgen = pageref_remap[pageref.objgen]
new_objgen = pageref_remap[pageref.objgen] dest_node[0] = pdf_base.get_object(new_objgen)
dest_node[0] = pdf_base.get_object(new_objgen) except (IndexError, TypeError) as e:
log.warning("This file may contain invalid table of contents entries")
log.debug(e)
def visit_remap_dest(pdf_base, node, log): def visit_remap_dest(pdf_base, node, log):
""" """
Visitor function to fix ToC entries Visitor function to fix ToC entries
Test for the two types of references to pages that can occur in ToCs. Test for the two types of references to pages that can occur in ToCs.
Both types have the same final format (an indirect reference to the Both types have the same final format (an indirect reference to the
target page). target page).
""" """
if '/Dest' in node: if '/Dest' in node:
skipping to change at page 1, line 387 skipping to change at page 1, line 394
procset=procset, procset=procset,
strip_old_text=strip_old, strip_old_text=strip_old,
log=log, log=log,
) )
# Correct the rotation if applicable # Correct the rotation if applicable
pdf_base.pages[page_num - 1].Rotate = ( pdf_base.pages[page_num - 1].Rotate = (
content_rotation - autorotate_correction content_rotation - autorotate_correction
) % 360 ) % 360
if len(keep_open) > 100: if len(keep_open) > MAX_OPEN_PAGE_PDFS:
# qpdf limitations require us to keep files open when we intend # qpdf limitations require us to keep files open when we intend
# to copy content from them before saving. However, we want to keep # to copy content from them before saving. However, we want to keep
# a lid on file handles and memory usage, so for big files we're # a lid on file handles and memory usage, so for big files we're
# even if page 1 doesn't use it, so we have a way to get it back. # even if page 1 doesn't use it, so we have a way to get it back.
page0 = pdf_base.pages[0] page0 = pdf_base.pages[0]
_update_page_resources( _update_page_resources(
page=page0, font=font, font_key=font_key, procset=procset page=page0, font=font, font_key=font_key, procset=procset
) )
interim = output_file + f'_working{page_num}.pdf' interim = output_file + f'_working{page_num}.pdf'
pdf_base.save(interim) pdf_base.save(interim)
del pdf_base del pdf_base
keep_open = [] keep_open = []
pdf_base = pikepdf.open(interim) pdf_base = pikepdf.open(interim)
procset = pdf_base.pages[0].Resources.ProcSet procset = pdf_base.pages[0].Resources.ProcSet
font = pdf_base.pages[0].Resources.Font.get(font_key) font, font_key = None, None # Reacquire this information
_fix_toc(pdf_base, pagerefs, log) _fix_toc(pdf_base, pagerefs, log)
pdf_base.save(output_file) pdf_base.save(output_file)
 End of changes. 9 change blocks. 
11 lines changed or deleted 19 lines changed or added

Home  |  About  |  Features  |  All  |  Newest  |  Dox  |  Diffs  |  RSS Feeds  |  Screenshots  |  Comments  |  Imprint  |  Privacy  |  HTTP(S)