"Fossies" - the Fresh Open Source Software Archive

Member "bup-0.30/lib/bup/gc.py" (28 Sep 2019, 10219 Bytes) of package /linux/privat/bup-0.30.tar.gz:


As a special service, "Fossies" has tried to format the requested source page into HTML using (guessed) Python source code syntax highlighting (style: standard) with prefixed line numbers. Alternatively, you can view or download the uninterpreted source code file here. For more information about "gc.py", see the Fossies "Dox" file reference documentation and the latest Fossies "Diffs" side-by-side code changes report: 0.29.3_vs_0.30.

    1 
    2 from __future__ import absolute_import
    3 import glob, os, subprocess, sys, tempfile
    4 from bup import bloom, git, midx
    5 from bup.compat import range
    6 from bup.git import MissingObject, walk_object
    7 from bup.helpers import Nonlocal, log, progress, qprogress
    8 from os.path import basename
    9 
   10 # This garbage collector uses a Bloom filter to track the live objects
   11 # during the mark phase.  This means that the collection is
   12 # probabilistic; it may retain some (known) percentage of garbage, but
   13 # it can also work within a reasonable, fixed RAM budget for any
   14 # particular percentage and repository size.
   15 #
   16 # The collection proceeds as follows:
   17 #
   18 #   - Scan all live objects by walking all of the refs, and insert
   19 #     every hash encountered into a new Bloom "liveness" filter.
   20 #     Compute the size of the liveness filter based on the total
   21 #     number of objects in the repository.  This is the "mark phase".
   22 #
   23 #   - Clear the data that's dependent on the repository's object set,
   24 #     i.e. the reflog, the normal Bloom filter, and the midxes.
   25 #
   26 #   - Traverse all of the pack files, consulting the liveness filter
   27 #     to decide which objects to keep.
   28 #
   29 #     For each pack file, rewrite it iff it probably contains more
   30 #     than (currently) 10% garbage (computed by an initial traversal
   31 #     of the packfile in consultation with the liveness filter).  To
   32 #     rewrite, traverse the packfile (again) and write each hash that
   33 #     tests positive against the liveness filter to a packwriter.
   34 #
   35 #     During the traversal of all of the packfiles, delete redundant,
   36 #     old packfiles only after the packwriter has finished the pack
   37 #     that contains all of their live objects.
   38 #
   39 # The current code unconditionally tracks the set of tree hashes seen
   40 # during the mark phase, and skips any that have already been visited.
   41 # This should decrease the IO load at the cost of increased RAM use.
   42 
   43 # FIXME: add a bloom filter tuning parameter?
   44 
   45 
   46 def count_objects(dir, verbosity):
   47     # For now we'll just use open_idx(), but we could probably be much
   48     # more efficient since all we need is a single integer (the last
   49     # fanout entry) from each index.
   50     object_count = 0
   51     indexes = glob.glob(os.path.join(dir, '*.idx'))
   52     for i, idx_name in enumerate(indexes):
   53         if verbosity:
   54             log('found %d objects (%d/%d %s)\r'
   55                 % (object_count, i + 1, len(indexes), basename(idx_name)))
   56         idx = git.open_idx(idx_name)
   57         object_count += len(idx)
   58     return object_count
   59 
   60 
   61 def report_live_item(n, total, ref_name, ref_id, item, verbosity):
   62     status = 'scanned %02.2f%%' % (n * 100.0 / total)
   63     hex_id = ref_id.encode('hex')
   64     dirslash = '/' if item.type == 'tree' else ''
   65     chunk_path = item.chunk_path
   66 
   67     if chunk_path:
   68         if verbosity < 4:
   69             return
   70         ps = '/'.join(item.path)
   71         chunk_ps = '/'.join(chunk_path)
   72         log('%s %s:%s/%s%s\n' % (status, hex_id, ps, chunk_ps, dirslash))
   73         return
   74 
   75     # Top commit, for example has none.
   76     demangled = git.demangle_name(item.path[-1], item.mode)[0] if item.path \
   77                 else None
   78 
   79     # Don't print mangled paths unless the verbosity is over 3.
   80     if demangled:
   81         ps = '/'.join(item.path[:-1] + [demangled])
   82         if verbosity == 1:
   83             qprogress('%s %s:%s%s\r' % (status, hex_id, ps, dirslash))
   84         elif (verbosity > 1 and item.type == 'tree') \
   85              or (verbosity > 2 and item.type == 'blob'):
   86             log('%s %s:%s%s\n' % (status, hex_id, ps, dirslash))
   87     elif verbosity > 3:
   88         ps = '/'.join(item.path)
   89         log('%s %s:%s%s\n' % (status, hex_id, ps, dirslash))
   90 
   91 
   92 def find_live_objects(existing_count, cat_pipe, verbosity=0):
   93     prune_visited_trees = True # In case we want a command line option later
   94     pack_dir = git.repo('objects/pack')
   95     ffd, bloom_filename = tempfile.mkstemp('.bloom', 'tmp-gc-', pack_dir)
   96     os.close(ffd)
   97     # FIXME: allow selection of k?
   98     # FIXME: support ephemeral bloom filters (i.e. *never* written to disk)
   99     live_objs = bloom.create(bloom_filename, expected=existing_count, k=None)
  100     # live_objs will hold on to the fd until close or exit
  101     os.unlink(bloom_filename)
  102     stop_at, trees_visited = None, None
  103     if prune_visited_trees:
  104         trees_visited = set()
  105         stop_at = lambda x: x.decode('hex') in trees_visited
  106     approx_live_count = 0
  107     for ref_name, ref_id in git.list_refs():
  108         for item in walk_object(cat_pipe.get, ref_id.encode('hex'),
  109                                 stop_at=stop_at,
  110                                 include_data=None):
  111             # FIXME: batch ids
  112             if verbosity:
  113                 report_live_item(approx_live_count, existing_count,
  114                                  ref_name, ref_id, item, verbosity)
  115             if trees_visited is not None and item.type == 'tree':
  116                 trees_visited.add(item.oid)
  117             if verbosity:
  118                 if not live_objs.exists(item.oid):
  119                     live_objs.add(item.oid)
  120                     approx_live_count += 1
  121             else:
  122                 live_objs.add(item.oid)
  123     trees_visited = None
  124     if verbosity:
  125         log('expecting to retain about %.2f%% unnecessary objects\n'
  126             % live_objs.pfalse_positive())
  127     return live_objs
  128 
  129 
  130 def sweep(live_objects, existing_count, cat_pipe, threshold, compression,
  131           verbosity):
  132     # Traverse all the packs, saving the (probably) live data.
  133 
  134     ns = Nonlocal()
  135     ns.stale_files = []
  136     def remove_stale_files(new_pack_prefix):
  137         if verbosity and new_pack_prefix:
  138             log('created ' + basename(new_pack_prefix) + '\n')
  139         for p in ns.stale_files:
  140             if new_pack_prefix and p.startswith(new_pack_prefix):
  141                 continue  # Don't remove the new pack file
  142             if verbosity:
  143                 log('removing ' + basename(p) + '\n')
  144             os.unlink(p)
  145         if ns.stale_files:  # So git cat-pipe will close them
  146             cat_pipe.restart()
  147         ns.stale_files = []
  148 
  149     writer = git.PackWriter(objcache_maker=None,
  150                             compression_level=compression,
  151                             run_midx=False,
  152                             on_pack_finish=remove_stale_files)
  153 
  154     # FIXME: sanity check .idx names vs .pack names?
  155     collect_count = 0
  156     for idx_name in glob.glob(os.path.join(git.repo('objects/pack'), '*.idx')):
  157         if verbosity:
  158             qprogress('preserving live data (%d%% complete)\r'
  159                       % ((float(collect_count) / existing_count) * 100))
  160         idx = git.open_idx(idx_name)
  161 
  162         idx_live_count = 0
  163         for i in range(0, len(idx)):
  164             sha = idx.shatable[i * 20 : (i + 1) * 20]
  165             if live_objects.exists(sha):
  166                 idx_live_count += 1
  167 
  168         collect_count += idx_live_count
  169         if idx_live_count == 0:
  170             if verbosity:
  171                 log('deleting %s\n'
  172                     % git.repo_rel(basename(idx_name)))
  173             ns.stale_files.append(idx_name)
  174             ns.stale_files.append(idx_name[:-3] + 'pack')
  175             continue
  176 
  177         live_frac = idx_live_count / float(len(idx))
  178         if live_frac > ((100 - threshold) / 100.0):
  179             if verbosity:
  180                 log('keeping %s (%d%% live)\n' % (git.repo_rel(basename(idx_name)),
  181                                                   live_frac * 100))
  182             continue
  183 
  184         if verbosity:
  185             log('rewriting %s (%.2f%% live)\n' % (basename(idx_name),
  186                                                   live_frac * 100))
  187         for i in range(0, len(idx)):
  188             sha = idx.shatable[i * 20 : (i + 1) * 20]
  189             if live_objects.exists(sha):
  190                 item_it = cat_pipe.get(sha.encode('hex'))
  191                 _, typ, _ = next(item_it)
  192                 writer.just_write(sha, typ, ''.join(item_it))
  193 
  194         ns.stale_files.append(idx_name)
  195         ns.stale_files.append(idx_name[:-3] + 'pack')
  196 
  197     if verbosity:
  198         progress('preserving live data (%d%% complete)\n'
  199                  % ((float(collect_count) / existing_count) * 100))
  200 
  201     # Nothing should have recreated midx/bloom yet.
  202     pack_dir = git.repo('objects/pack')
  203     assert(not os.path.exists(os.path.join(pack_dir, 'bup.bloom')))
  204     assert(not glob.glob(os.path.join(pack_dir, '*.midx')))
  205 
  206     # try/catch should call writer.abort()?
  207     # This will finally run midx.
  208     writer.close()  # Can only change refs (if needed) after this.
  209     remove_stale_files(None)  # In case we didn't write to the writer.
  210 
  211     if verbosity:
  212         log('discarded %d%% of objects\n'
  213             % ((existing_count - count_objects(pack_dir, verbosity))
  214                / float(existing_count) * 100))
  215 
  216 
  217 def bup_gc(threshold=10, compression=1, verbosity=0):
  218     cat_pipe = git.cp()
  219     existing_count = count_objects(git.repo('objects/pack'), verbosity)
  220     if verbosity:
  221         log('found %d objects\n' % existing_count)
  222     if not existing_count:
  223         if verbosity:
  224             log('nothing to collect\n')
  225     else:
  226         try:
  227             live_objects = find_live_objects(existing_count, cat_pipe,
  228                                              verbosity=verbosity)
  229         except MissingObject as ex:
  230             log('bup: missing object %r \n' % ex.oid.encode('hex'))
  231             sys.exit(1)
  232         try:
  233             # FIXME: just rename midxes and bloom, and restore them at the end if
  234             # we didn't change any packs?
  235             packdir = git.repo('objects/pack')
  236             if verbosity: log('clearing midx files\n')
  237             midx.clear_midxes(packdir)
  238             if verbosity: log('clearing bloom filter\n')
  239             bloom.clear_bloom(packdir)
  240             if verbosity: log('clearing reflog\n')
  241             expirelog_cmd = ['git', 'reflog', 'expire', '--all', '--expire=all']
  242             expirelog = subprocess.Popen(expirelog_cmd, preexec_fn = git._gitenv())
  243             git._git_wait(' '.join(expirelog_cmd), expirelog)
  244             if verbosity: log('removing unreachable data\n')
  245             sweep(live_objects, existing_count, cat_pipe,
  246                   threshold, compression,
  247                   verbosity)
  248         finally:
  249             live_objects.close()