"Fossies" - the Fresh Open Source Software Archive

Member "eucalyptus-4.4.2/storage/blobstore.c" (4 Aug 2017, 217286 Bytes) of package /linux/misc/eucalyptus-4.4.2.tar.gz:


As a special service, "Fossies" has tried to format the requested source page into HTML using (guessed) C and C++ source code syntax highlighting (style: standard), with prefixed line numbers and a code folding option. Alternatively, you can view or download the uninterpreted source code file here. For more information about "blobstore.c", see the Fossies "Dox" file reference documentation and the latest Fossies "Diffs" side-by-side code changes report: 4.4.1_vs_4.4.2.

    1 // -*- mode: C; c-basic-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
    2 // vim: set softtabstop=4 shiftwidth=4 tabstop=4 expandtab:
    3 
    4 /*************************************************************************
    5  * Copyright 2009-2012 Eucalyptus Systems, Inc.
    6  *
    7  * This program is free software: you can redistribute it and/or modify
    8  * it under the terms of the GNU General Public License as published by
    9  * the Free Software Foundation; version 3 of the License.
   10  *
   11  * This program is distributed in the hope that it will be useful,
   12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
   13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   14  * GNU General Public License for more details.
   15  *
   16  * You should have received a copy of the GNU General Public License
   17  * along with this program.  If not, see http://www.gnu.org/licenses/.
   18  *
   19  * Please contact Eucalyptus Systems, Inc., 6755 Hollister Ave., Goleta
   20  * CA 93117, USA or visit http://www.eucalyptus.com/licenses/ if you need
   21  * additional information or have any questions.
   22  *
   23  * This file may incorporate work covered under the following copyright
   24  * and permission notice:
   25  *
   26  *   Software License Agreement (BSD License)
   27  *
   28  *   Copyright (c) 2008, Regents of the University of California
   29  *   All rights reserved.
   30  *
   31  *   Redistribution and use of this software in source and binary forms,
   32  *   with or without modification, are permitted provided that the
   33  *   following conditions are met:
   34  *
   35  *     Redistributions of source code must retain the above copyright
   36  *     notice, this list of conditions and the following disclaimer.
   37  *
   38  *     Redistributions in binary form must reproduce the above copyright
   39  *     notice, this list of conditions and the following disclaimer
   40  *     in the documentation and/or other materials provided with the
   41  *     distribution.
   42  *
   43  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   44  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   45  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
   46  *   FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
   47  *   COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
   48  *   INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
   49  *   BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
   50  *   LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
   51  *   CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   52  *   LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
   53  *   ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   54  *   POSSIBILITY OF SUCH DAMAGE. USERS OF THIS SOFTWARE ACKNOWLEDGE
   55  *   THE POSSIBLE PRESENCE OF OTHER OPEN SOURCE LICENSED MATERIAL,
   56  *   COPYRIGHTED MATERIAL OR PATENTED MATERIAL IN THIS SOFTWARE,
   57  *   AND IF ANY SUCH MATERIAL IS DISCOVERED THE PARTY DISCOVERING
   58  *   IT MAY INFORM DR. RICH WOLSKI AT THE UNIVERSITY OF CALIFORNIA,
   59  *   SANTA BARBARA WHO WILL THEN ASCERTAIN THE MOST APPROPRIATE REMEDY,
   60  *   WHICH IN THE REGENTS' DISCRETION MAY INCLUDE, WITHOUT LIMITATION,
   61  *   REPLACEMENT OF THE CODE SO IDENTIFIED, LICENSING OF THE CODE SO
   62  *   IDENTIFIED, OR WITHDRAWAL OF THE CODE CAPABILITY TO THE EXTENT
   63  *   NEEDED TO COMPLY WITH ANY SUCH LICENSES OR RIGHTS.
   64  ************************************************************************/
   65 
   66 //!
   67 //! @file storage/blobstore.c
   68 //! Implements blobstore storage
   69 //!
   70 
   71 /*----------------------------------------------------------------------------*\
   72  |                                                                            |
   73  |                                  INCLUDES                                  |
   74  |                                                                            |
   75 \*----------------------------------------------------------------------------*/
   76 
   77 #define _GNU_SOURCE
   78 #include <stdio.h>
   79 #include <stdlib.h>
   80 #include <string.h>
   81 #include <assert.h>
   82 #include <unistd.h>                    // close
   83 #include <time.h>                      // time
   84 #include <sys/time.h>                  // gettimeofday
   85 #include <sys/stat.h>                  // mkdir
   86 #include <errno.h>                     // errno
   87 #include <sys/types.h>                 // *dir, etc, wait
   88 #include <sys/file.h>                  // flock
   89 #include <dirent.h>
   90 #include <sys/wait.h>                  // wait
   91 #include <pthread.h>
   92 #include <sys/types.h>                 // gettid
   93 #include <regex.h>
   94 #include <libgen.h>                    // basename
   95 
   96 #include <eucalyptus.h>                // euca user
   97 #include <misc.h>                      // ensure_...
   98 #include <ipc.h>
   99 #include <euca_string.h>
  100 
  101 #include "blobstore.h"
  102 #include "diskutil.h"
  103 
  104 #ifdef _EUCA_BLOBS
  105 #include "map.h"
  106 #endif /* _EUCA_BLOBS */
  107 
  108 /*----------------------------------------------------------------------------*\
  109  |                                                                            |
  110  |                                  DEFINES                                   |
  111  |                                                                            |
  112 \*----------------------------------------------------------------------------*/
  113 
  114 #define BLOBSTORE_METADATA_FILE                  ".blobstore"
  115 #define BLOBSTORE_METADATA_TIMEOUT_USEC          (1000000LL * 60 * 2)   //!< two minutes, in microseconds: it may take dozens of seconds to open blobstore when others are LRU-purging it
  116 #define BLOBSTORE_LOCK_TIMEOUT_USEC               500000LL  //!< half a second, in microseconds
  117 #define BLOBSTORE_FIND_TIMEOUT_USEC                50000LL
  118 #define BLOBSTORE_DELETE_TIMEOUT_USEC              50000LL
  119 #define BLOBSTORE_SLEEP_INTERVAL_USEC              99999LL
  120 #define BLOBSTORE_DMSETUP_TIMEOUT_SEC                 60
  121 #define BLOBSTORE_MAX_CONCURRENT                      99  //!< sizes the fd[] and fd_status[] tables inside blobstore_filelock
  122 #define BLOBSTORE_NO_TIMEOUT                          -1L
  123 #define BLOBSTORE_SIG_MAX                         262144
  124 #define DM_PATH                                  "/dev/mapper/"
  125 #define DM_FORMAT                                DM_PATH "%s"   //!< expands to "/dev/mapper/%s" @TODO do not hardcode?
  126 #define MIN_BLOCKS_SNAPSHOT                      32 //!< otherwise dmsetup fails with device-mapper: reload ioctl failed: Cannot allocate memory OR device-mapper: reload ioctl failed: Input/output error
  127 #define EUCA_ZERO                                "euca-zero"
  128 #define EUCA_ZERO_SIZE                           "2199023255552"    //!< is one petabyte enough? NOTE(review): the value is 2^41, which is 1 PiB only when counted in 512-byte sectors (2 TiB if bytes) -- confirm the unit where it is used
  129 
  130 #define __INLINE__                               __inline__    //!< applied to the _err_on/_err_off/propagate_system_errno prototypes; NOTE(review): identifiers containing a double underscore are reserved for the implementation
  131 
  132 #ifdef _UNIT_TEST
  133 #define F1                                       "/tmp/blobstore_test_1"
  134 #define F2                                       "/tmp/blobstore_test_2"
  135 #define F3                                       "/tmp/blobstore_test_3"
  136 
  137 #define _R                                       BLOBSTORE_FLAG_RDONLY
  138 #define _W                                       BLOBSTORE_FLAG_RDWR
  139 #define _C                                      (BLOBSTORE_FLAG_CREAT | BLOBSTORE_FLAG_EXCL | BLOBSTORE_FLAG_RDWR)
  140 #define _CBB                                    (BLOBSTORE_FLAG_CREAT | BLOBSTORE_FLAG_EXCL)
  141 
  142 #define B1                                       "BLOCKBLOB-01"
  143 #define B2                                       "BLOCKBLOB-02"
  144 #define B3                                       "BLOCKBLOB-03"
  145 #define B4                                       "BLOCKBLOB-04"
  146 #define B5                                       "BLOCKBLOB-05"
  147 #define B6                                       "BLOCKBLOB-06"
  148 
  149 #define BS_SIZE                                      30
  150 #define BB_SIZE                                      10
  151 #define CBB_SIZE                                     32
  152 #define STRESS_BS_SIZE                           100000
  153 #define STRESS_MIN_BB                                64
  154 #define STRESS_BLOBS                                 10
  155 
  156 #define LOCK_CYCLES                                    3
  157 #define COMPETITIVE_PARTICIPANTS                       3
  158 #define COMPETITIVE_ITERATIONS                        30
  159 #define COMPETITIVE_PAUSE_USEC                         5
  160 #define COMPETITIVE_TIMEOUT_USEC                 3000000L
  161 #endif /* _UNIT_TEST */
  162 
  163 #ifdef _EUCA_BLOBS
  164 #define USAGE                                    "Usage: euca-blobs [cache=... work=...] command [param1] [param2]...\n"
  165 #define HELP                                     "\n"                                         \
  166                                                  "\thelp\t\t- print this help message\n"      \
  167                                                  "\tlist\t\t- list blobs in work and cache\n" \
  168                                                  "\tdelete [id]\t- delete blob with\n"
  169 #define MAX_ARGS                                 5
  170 #endif /* _EUCA_BLOBS */
  171 
  172 /*----------------------------------------------------------------------------*\
  173  |                                                                            |
  174  |                                  TYPEDEFS                                  |
  175  |                                                                            |
  176 \*----------------------------------------------------------------------------*/
  177 
  178 /*----------------------------------------------------------------------------*\
  179  |                                                                            |
  180  |                                ENUMERATIONS                                |
  181  |                                                                            |
  182 \*----------------------------------------------------------------------------*/
  183 
  184 // if changing, change the blobstore_metadata_suffixes[] array below and set_blockblob_metadata_path()
  185 typedef enum {                         //!< paths to files containing...
  186     BLOCKBLOB_PATH_NONE = 0,           //!< sentinel for identifying files that are not blockblob related
  187     BLOCKBLOB_PATH_BLOCKS,             //!< ...blocks, either in flat format or as a snapshot backing
  188     BLOCKBLOB_PATH_LOCK,               //!< ...nothing, but needed for safe locking of access to the blob
  189     BLOCKBLOB_PATH_DM,                 //!< ...device mapper devices created for this clone, if any
  190     BLOCKBLOB_PATH_DEPS,               //!< ...names of blockblobs that this blockblob depends on, if any
  191     BLOCKBLOB_PATH_LOOPBACK,           //!< ...name of the loopback device for this blob, when attached
  192     BLOCKBLOB_PATH_SIG,                //!< ...signature of the blob, if provided from outside
  193     BLOCKBLOB_PATH_REFS,               //!< ...names of blockblobs that depend on this blockblob, if any
  194     BLOCKBLOB_PATH_HOLLOW,             //!< ...nothing, but the file acts as a marker of 'hollow' blobs
  195     BLOCKBLOB_PATH_TOTAL,              //!< count of the values above (sentinel included); not a path type itself
  196 } blockblob_path_t;
  197 
  198 enum {                                 //!< indices into the helpers[] and helpers_path[] arrays
  199     DMSETUP,                           //!< slot of "dmsetup" in helpers[]
  200     ROOTWRAP,                          //!< slot of "euca_rootwrap" in helpers[]
  201     LASTHELPER,                        //!< number of helpers; used as the size of both arrays
  202 };
  203 
  204 /*----------------------------------------------------------------------------*\
  205  |                                                                            |
  206  |                                 STRUCTURES                                 |
  207  |                                                                            |
  208 \*----------------------------------------------------------------------------*/
  209 
  210 typedef struct _blobstore_filelock {   //!< per-path lock record; records are chained through 'next'
  211     char path[PATH_MAX];               //!< path that the file was opened with @TODO canonicalize?
  212     int refs;                          //!< number of open file descriptors (some holding the lock, some waiting) for this path in this process
  213     int next_fd;                       //!< next available file descriptor in the table below:
  214     int fd[BLOBSTORE_MAX_CONCURRENT];  //!< descriptor table; NOTE(review): presumably indexed in parallel with fd_status[] -- confirm
  215     int fd_status[BLOBSTORE_MAX_CONCURRENT];    //!< 0 = unused, 1 = open
  216 #ifdef _TEST_FILELOCK
  217     unsigned int thread_id[BLOBSTORE_MAX_CONCURRENT];   //!< only present in _TEST_FILELOCK builds; NOTE(review): path_to_sem_name() is guarded by _TEST_LOCKS instead -- confirm both macro names are intended
  218 #endif                                 /* _TEST_FILELOCK */
  219     pthread_rwlock_t lock;             //!< reader/writer lock for controlling intra-process access
  220     pthread_mutex_t mutex;             //!< for locking this specific struct during manipulations
  221     sem *sem;                          //!< semaphore for debugging
  222     struct _blobstore_filelock *next;  //!< next record in the singly linked list
  223 } blobstore_filelock;
  224 
  225 /*----------------------------------------------------------------------------*\
  226  |                                                                            |
  227  |                             EXTERNAL VARIABLES                             |
  228  |                                                                            |
  229 \*----------------------------------------------------------------------------*/
  230 
  231 /* Should preferably be handled in header file */
  232 
  233 /*----------------------------------------------------------------------------*\
  234  |                                                                            |
  235  |                              GLOBAL VARIABLES                              |
  236  |                                                                            |
  237 \*----------------------------------------------------------------------------*/
  238 
  239 //! Blobstore error strings, listed in blobstore_error_t order. NOTE(review): that enum is not defined in this file (presumably in blobstore.h) -- keep both lists in sync
  240 const char *_blobstore_error_strings[] = {
  241     "success",
  242     "general error",
  243 
  244     // system errno equivalents
  245     "no such entity",
  246     "bad file descriptor",
  247     "out of memory",
  248     "permission denied",
  249     "already exists",
  250     "invalid parameters",
  251     "no space left",
  252     "timeout",
  253     "too many files open",
  254 
  255     // blobstore-specific errors
  256     "wrong signature",
  257     "unknown error",
  258 };
  259 
  260 const char *blobstore_relation_type_name[] = {  //!< printable relation names; NOTE(review): the order presumably follows a relation-type enum declared elsewhere -- confirm
  261     "copy",
  262     "map",
  263     "snapshot",
  264 };
  265 
  266 __thread blobstore_error_t _blobstore_errno = BLOBSTORE_ERROR_OK;   //!< thread-local errno
  267 
  268 /*----------------------------------------------------------------------------*\
  269  |                                                                            |
  270  |                              STATIC VARIABLES                              |
  271  |                                                                            |
  272 \*----------------------------------------------------------------------------*/
  273 
  274 // entries must match, in order, the blockblob_path_t enum above: one string per value before BLOCKBLOB_PATH_TOTAL
  275 static const char *blobstore_metadata_suffixes[] = {
  276     "none",                            // sentinel entry so that all actual entries have indices > 0
  277     "blocks",                          // MUST be second so loop in check_metadata_name() works
  278     "lock",
  279     "dm",
  280     "deps",
  281     "loopback",
  282     "sig",
  283     "refs",
  284     "hollow",
  285 };
  286 
  287 static void (*err_fn) (const char *msg) = NULL;     //!< error-message callback pointer; starts out NULL
  288 static unsigned char _do_print_errors = 1;          //!< enabled by default; NOTE(review): presumably toggled by _err_on()/_err_off() -- confirm
  289 static unsigned char _do_print_trace = 1;           //!< enabled by default
  290 static pthread_mutex_t _blobstore_mutex = PTHREAD_MUTEX_INITIALIZER;    //!< process-global mutex
  291 static blobstore_filelock *locks_list = NULL;   //!< process-global LL head @TODO replace this with a hash table
  292 
  293 //! @{
  294 //! @name debugging counters
  295 //! @TODO remove these
  296 static long _locks_list_add_ctr = 0L;
  297 static long _locks_list_rem_ctr = 0L;
  298 static long _open_success_ctr = 0L;
  299 static long _close_success_ctr = 0L;
  300 static long _open_error_ctr = 0L;
  301 static long _open_timeout_ctr = 0L;
  302 static long _close_error_ctr = 0L;
  303 static char zero_buf[1] = "\0";                     //!< a single NUL byte; NOTE(review): not a counter, although it sits inside this group
  304 //! @}
  305 
  306 static __thread char _blobstore_last_msg[512] = "";     //!< per-thread text buffer, initially empty
  307 static __thread char _blobstore_last_trace[8172] = "";  //!< per-thread text buffer, initially empty; NOTE(review): 8172 may be a typo for 8192 -- confirm
  308 
  309 static char *helpers[LASTHELPER] = {               //!< helper program names, indexed by DMSETUP and ROOTWRAP
  310     "dmsetup",
  311     "euca_rootwrap",
  312 };
  313 
  314 static char *helpers_path[LASTHELPER];             //!< one slot per helpers[] entry, all NULL at start (static storage); NOTE(review): presumably filled with resolved paths during initialization -- confirm
  315 static int initialized = 0;                        //!< starts at 0; NOTE(review): presumably set once initialization has run -- confirm
  316 
  317 #ifdef _UNIT_TEST
  318 static char *_farray[] = { F1, F2, F3 };
  319 #endif /* _UNIT_TEST */
  320 
  321 #ifdef _EUCA_BLOBS
  322 static char show_debug = FALSE;
  323 static char show_extras = FALSE;
  324 static char show_children = FALSE;
  325 static char show_parents = FALSE;
  326 static char *euca_home = NULL;
  327 static char *work_path = NULL;
  328 static char *cache_path = NULL;
  329 static blobstore *work_bs = NULL;
  330 static blobstore *cache_bs = NULL;
  331 static map *blob_map;
  332 #endif /* _EUCA_BLOBS */
  333 
  334 /*----------------------------------------------------------------------------*\
  335  |                                                                            |
  336  |                              STATIC PROTOTYPES                             |
  337  |                                                                            |
  338 \*----------------------------------------------------------------------------*/
  339 
  340 static void myprintf(int loglevel, const char *format, ...);
  341 static __INLINE__ void _err_on(void);
  342 static __INLINE__ void _err_off(void);
  343 static void err(blobstore_error_t error, const char *custom_msg, const int src_line_no, const char *src_file_name);
  344 static __INLINE__ void propagate_system_errno(blobstore_error_t default_errno, const int src_line_no, const char *src_file_name);
  345 static void gen_id(char *str, unsigned int size);
  346 static void close_filelock(blobstore_filelock * l);
  347 static void free_filelock(blobstore_filelock * l);
  348 static int close_and_unlock(int fd);
  349 #ifdef _TEST_LOCKS
  350 static char *path_to_sem_name(const char *path, char *name, int name_size);
  351 #endif /* _TEST_LOCKS */
  352 static int open_and_lock(const char *path, int flags, long long timeout_usec, mode_t mode);
  353 static char *get_val(const char *buf, const char *key);
  354 static int fd_to_buf(int fd, char *buf, int size_buf);
  355 static int buf_to_fd(int fd, const char *buf, int size_buf);
  356 static int read_store_metadata(blobstore * bs);
  357 static int write_store_metadata(blobstore * bs);
  358 static int set_blockblob_metadata_path(blockblob_path_t path_t, const blobstore * bs, const char *bb_id, char *path, size_t path_size);
  359 static int write_blockblob_metadata_path(blockblob_path_t path_t, const blobstore * bs, const char *bb_id, const char *str);
  360 static int read_blockblob_metadata_path(blockblob_path_t path_t, const blobstore * bs, const char *bb_id, char *str, int str_size);
  361 static int write_array_blockblob_metadata_path(blockblob_path_t path_t, const blobstore * bs, const char *bb_id, char **array, int array_size);
  362 static int read_array_blockblob_metadata_path(blockblob_path_t path_t, const blobstore * bs, const char *bb_id, char ***array, int *array_size);
  363 static int update_entry_blockblob_metadata_path(blockblob_path_t path_t, const blobstore * bs, const char *bb_id, const char *entry, int removing);
  364 static int typeof_blockblob_metadata_path(const blobstore * bs, const char *path, char *bb_id, unsigned int bb_id_size);
  365 static int delete_blockblob_files(const blobstore * bs, const char *bb_id);
  366 static int ensure_blockblob_metadata_path(const blobstore * bs, const char *bb_id);
  367 static void free_bbs(blockblob * bbs);
  368 static unsigned int check_in_use(blobstore * bs, const char *bb_id, long long timeout_usec);
  369 static void set_device_path(blockblob * bb);
  370 static blockblob **walk_bs(blobstore * bs, const char *dir_path, blockblob ** tail_bb, const blockblob * bb_to_avoid);
  371 static blockblob *scan_blobstore(blobstore * bs, const blockblob * bb_to_avoid);
  372 static int compare_bbs(const void *bb1, const void *bb2);
  373 static long long purge_blockblobs_lru(blobstore * bs, blockblob * bb_list, long long need_blocks);
  374 static int get_stale_refs(const blockblob * bb, char ***refs);
  375 static int loop_remove(blobstore * bs, const char *bb_id);
  376 static int dm_suspend_resume(const char *dev_name);
  377 static int dm_check_device(const char *dev_name);
  378 static int dm_delete_device(const char *dev_name);
  379 static int dm_delete_devices(char *dev_names[], int size);
  380 static int dm_create_devices(char *dev_names[], char *dm_tables[], int size);
  381 static char *dm_get_zero(void);
  382 static int blockblob_check(const blockblob * bb);
  383 static int delete_blob_state(blockblob * bb, long long timeout_usec, char do_force);
  384 static int verify_bb(const blockblob * bb, unsigned long long min_size_bytes);
  385 
  386 #ifdef _UNIT_TEST
  387 static void _fill_blob(blockblob * bb, char c, int use_file);
  388 static blobstore *create_teststore(int size_blocks, const char *base, const char *name, blobstore_format_t format, blobstore_revocation_t revocation,
  389                                    blobstore_snapshot_t snapshot);
  390 static int write_byte(blockblob * bb, int seek, char c);
  391 static char read_byte(blockblob * bb, int seek);
  392 static int do_clone_stresstest(const char *base, const char *name, blobstore_format_t format, blobstore_revocation_t revocation, blobstore_snapshot_t snapshot);
  393 static int check_destination(blockblob * bb4, char *op);
  394 static int do_copy_test(const char *base, const char *name);
  395 static int do_clone_test(const char *base, const char *name, blobstore_format_t format, blobstore_revocation_t revocation, blobstore_snapshot_t snapshot, int copy_or_snapshot);
  396 static int do_metadata_test(const char *base, const char *name);
  397 static int do_blobstore_test(const char *base, const char *name, blobstore_format_t format, blobstore_revocation_t revocation);
  398 static void *competitor_function(void *ptr);
  399 static void *thread_function(void *ptr);
  400 static void dummy_err_fn(const char *msg);
  401 #endif /* _UNIT_TEST */
  402 
  403 #ifdef _EUCA_BLOBS
  404 static void bs_errors(const char *msg);
  405 static int open_blobstore(const char *path, blobstore ** bs, const char *name);
  406 static int open_blobstores();
  407 static void close_blobstores();
  408 static int do_list_bs(blobstore * bs, const char *regex);
  409 static void print_tree(const char *prefix, blockblob_meta * bm, blockblob_path_t type);
  410 static int do_list(const char *regex);
  411 static int do_delete(const char *bs_path, const char *bb_id);
  412 static void usage(const char *msg);
  413 static void set_global_parameter(char *key, char *val);
  414 #endif /* _EUCA_BLOBS */
  415 
  416 /*----------------------------------------------------------------------------*\
  417  |                                                                            |
  418  |                                   MACROS                                   |
  419  |                                                                            |
  420 \*----------------------------------------------------------------------------*/
  421 
  422 #define ERR(_ERRNO,_MSG)               err(_ERRNO, _MSG, __LINE__, __FILE__)    //!< calls err() with the error code, a custom message and the call site's line and file
  423 
  424 #define PROPAGATE_ERR(_ERRNO)          propagate_system_errno(_ERRNO, __LINE__, __FILE__)   //!< calls propagate_system_errno() with _ERRNO as the default code plus the call site's line and file
  425 
  426 #ifdef _UNIT_TEST
  427 #define _UNEXPECTED()                  printf ("======================> UNEXPECTED RESULT (errors=%d)!!!\n", ++errors);  //!< increments the caller's 'errors' and prints the new value; NOTE(review): the expansion already ends in ';', so "_UNEXPECTED();" yields an extra empty statement
  428 
  429 #define _CHKMETA(_ST, _RE)                                                               \
  430 {                                                                                        \
  431     snprintf(entry_path, sizeof(entry_path), "%s/%s", bs->path, _ST);                    \
  432     if (_RE != typeof_blockblob_metadata_path(bs, entry_path, blob_id, sizeof(blob_id))) \
  433         _UNEXPECTED();                                                                   \
  434 }
  435 
  436 #define _OPEN(_FD, _FI, _FL, _TI, _RE)                                                                \
  437 {                                                                                                     \
  438     _blobstore_errno = 0;                                                                             \
  439     printf("%d: open (" _FI " flags=%d timeout=%d)", getpid(), _FL, _TI);                             \
  440     _FD = open_and_lock(_FI, _FL, _TI, BLOBSTORE_FILE_PERM);                                          \
  441     printf("=%d errno=%d '%s'\n", _FD, _blobstore_errno, blobstore_get_error_str(_blobstore_errno));  \
  442     if ((_FD == -1) && (_blobstore_errno == 0))                                                       \
  443         printf("======================> UNSET errno ON ERROR (errors=%d)!!!\n", ++errors);            \
  444     else if (((_RE == -1) && (_FD != -1)) || ((_RE == 0) && (_FD < 0)))                               \
  445         _UNEXPECTED();                                                                                \
  446 }
  447 
      //! Closes _FD through close_and_unlock(), stores the result in the caller's 'ret' and prints it.
      //! _FI must be a string literal: it is pasted into the printf() format by literal concatenation.
  448 #define _CLOS(_FD, _FI)                                        \
  449 {                                                              \
  450     ret = close_and_unlock(_FD);                               \
  451     printf("%d: close (%d " _FI ")=%d\n", getpid(), _FD, ret); \
  452 }
  453 
      //! Blocks in wait() for a child, prints the pid that wait() returned together with the
      //! child's exit status, and adds that exit status to the caller's 'errors'.
      //! 'status' and 'ret' are local to the braces; 'pid' and 'errors' must exist in the caller's scope.
      //! NOTE(review): a -1 return from wait() is not checked before WEXITSTATUS() is applied.
  454 #define _PARENT_WAITS()                                                   \
  455 {                                                                         \
  456     int status = 0;                                                       \
  457     int ret = 0;                                                          \
  458     printf("waiting for child pid=%d\n", pid);                            \
  459     ret = wait(&status);                                                  \
  460     printf("waited for child pid=%d ret=%d\n", ret, WEXITSTATUS(status)); \
  461     errors += WEXITSTATUS(status);                                        \
  462 }
  463 
  464 #define _OPENBB(_BB, _ID, _SI, _SG, _FL, _TI, _RE)                                                                                   \
  465 {                                                                                                                                    \
  466     _blobstore_errno = 0;                                                                                                            \
  467     printf("%d: bb_open (%s size=%d flags=%d timeout=%d)", getpid(), SP(_ID), _SI, _FL, _TI);                                        \
  468     _BB = blockblob_open(bs, _ID, (_SI) * 512, _FL, _SG, _TI);                                                                       \
  469     printf("=%s errno=%d '%s'\n", ((_BB == NULL) ? ("NULL") : ("OK")), _blobstore_errno, blobstore_get_error_str(_blobstore_errno)); \
  470     if ((_BB == NULL) && (_blobstore_errno == 0))                                                                                    \
  471         printf("======================> UNSET errno ON ERROR (errors=%d)!!!\n", ++errors);                                           \
  472     else if (((_RE == -1) && (_BB != NULL)) || ((_RE == 0) && (_BB == NULL)))                                                        \
  473         _UNEXPECTED();                                                                                                               \
  474 }
  475 
  476 // same as _OPENBB but accepts bytes rather than blocks
  477 #define _OPENBBb(_BB, _ID, _SI, _SG, _FL, _TI, _RE)                                                                                  \
  478 {                                                                                                                                    \
  479     _blobstore_errno = 0;                                                                                                            \
  480     printf("%d: bb_open (%s size=%d flags=%d timeout=%d)", getpid(), SP(_ID), _SI, _FL, _TI);                                        \
  481     _BB = blockblob_open(bs, _ID, _SI, _FL, _SG, _TI);                                                                               \
  482     printf("=%s errno=%d '%s'\n", ((_BB == NULL) ? ("NULL") : ("OK")), _blobstore_errno, blobstore_get_error_str(_blobstore_errno)); \
  483     if ((_BB == NULL) && (_blobstore_errno == 0))                                                                                    \
  484         printf("======================> UNSET errno ON ERROR (errors=%d)!!!\n", ++errors);                                           \
  485     else if (((_RE == -1) && (_BB != NULL)) || ((_RE == 0) && (_BB == NULL)))                                                        \
  486         _UNEXPECTED();                                                                                                               \
  487 }
  488 
  489 #define _SEARCH(_PATTERN, _RE)                                                                                               \
  490 {                                                                                                                            \
  491     results = NULL;                                                                                                          \
  492     printf("%d: bs_search (pattern=%s)", getpid(), _PATTERN);                                                                \
  493     nresults = blobstore_search (bs, _PATTERN, &results);                                                                    \
  494     printf("=%d (expected %d) errno=%d '%s'\n", nresults, _RE, _blobstore_errno, blobstore_get_error_str(_blobstore_errno)); \
  495     if ((nresults < 0) && (_blobstore_errno == 0))                                                                           \
  496         printf("======================> UNSET errno ON ERROR (errors=%d)!!!\n", ++errors);                                   \
  497     else if (_RE != nresults)                                                                                                \
  498         _UNEXPECTED();                                                                                                       \
  499     for (blockblob_meta * bm = results; bm;) {                                                                               \
  500         blockblob_meta * next = bm->next;                                                                                    \
  501         EUCA_FREE(bm);                                                                                                       \
  502         bm = next;                                                                                                           \
  503     }                                                                                                                        \
  504 }
  505 
  506 #define _CLOSBB(_BB, _ID)                                                    \
  507 {                                                                            \
  508     ret = blockblob_close(_BB);                                              \
  509     printf("%d: bb_close (%lu %s)=%d errno=%d '%s'\n",                       \
  510             getpid(), ((unsigned long) _BB), SP(_ID), ret, _blobstore_errno, \
  511             blobstore_get_error_str(_blobstore_errno));                      \
  512 }
  513 
  514 #define _DELEBB(_BB, _ID, _RE)                                               \
  515 {                                                                            \
  516     ret = blockblob_delete(_BB, 3000, 0);                                    \
  517     printf("%d: bb_delete (%lu %s)=%d errno=%d '%s'\n",                      \
  518             getpid(), ((unsigned long) _BB), SP(_ID), ret, _blobstore_errno, \
  519             blobstore_get_error_str(_blobstore_errno));                      \
  520     if (ret != _RE)                                                          \
  521         _UNEXPECTED();                                                       \
  522 }
  523 
  524 #define _CLONBB(_BB, _ID, _MP, _RE)                                                                  \
  525 {                                                                                                    \
  526     _blobstore_errno = 0;                                                                            \
  527     printf("%d: bb_clone (%s map=%lu)", getpid(), SP(_ID), ((unsigned long) _MP));                   \
  528     ret = blockblob_clone(_BB, _MP, (sizeof(_MP) / sizeof(blockmap)));                               \
  529     printf("=%d errno=%d '%s'\n", ret, _blobstore_errno, blobstore_get_error_str(_blobstore_errno)); \
  530     if ((ret == -1) && (_blobstore_errno == 0))                                                      \
  531         printf("======================> UNSET errno ON ERROR (errors=%d)!!!\n", ++errors);           \
  532     else if (_RE != ret)                                                                             \
  533         _UNEXPECTED();                                                                               \
  534 }
  535 
  536 #define _COPYBB(_SBB, _SO, _DBB, _DO, _LEN, _RE)                                                     \
  537 {                                                                                                    \
  538     _blobstore_errno = 0;                                                                            \
  539     printf("%d: bb_copy (%s to %s)", getpid(), (_SBB)->id, (_DBB)->id);                              \
  540     ret = blockblob_copy(_SBB, _SO, _DBB, _DO, _LEN);                                                \
  541     printf("=%d errno=%d '%s'\n", ret, _blobstore_errno, blobstore_get_error_str(_blobstore_errno)); \
  542     if ((ret == -1) && (_blobstore_errno == 0))                                                      \
  543         printf("======================> UNSET errno ON ERROR (errors=%d)!!!\n", ++errors);           \
  544     else if (_RE != ret)                                                                             \
  545         _UNEXPECTED();                                                                               \
  546 }
  547 #endif /* _UNIT_TEST */
  548 
  549 /*----------------------------------------------------------------------------*\
  550  |                                                                            |
  551  |                               IMPLEMENTATION                               |
  552  |                                                                            |
  553 \*----------------------------------------------------------------------------*/
  554 
  555 //!
  556 //!
  557 //!
  558 //! @param[in] loglevel
  559 //! @param[in] format
  560 //!
  561 //! @pre
  562 //!
  563 //! @note
  564 //!
  565 static void myprintf(int loglevel, const char *format, ...)
  566 {
  567     char buf[1024];
  568 
  569     va_list ap;
  570     va_start(ap, format);
  571     vsnprintf(buf, sizeof(buf), format, ap);
  572     va_end(ap);
  573 
  574     if (err_fn)
  575         err_fn(buf);
  576     else
  577         puts(buf);
  578 }
  579 
  580 //!
  581 //!
  582 //!
  583 //! @param[in] error
  584 //!
  585 //! @return
  586 //!
  587 //! @pre
  588 //!
  589 //! @note
  590 //!
  591 const char *blobstore_get_error_str(blobstore_error_t error)
  592 {
  593     return _blobstore_error_strings[error];
  594 }
  595 
//!
//! Gives access to the message recorded by the most recent call to err(), which
//! formats it as "<source file>:<line> <message>".
//!
//! @return a pointer to the internal _blobstore_last_msg buffer
//!
//! @pre
//!
//! @note The buffer is overwritten by the next recorded error; copy the string if
//!       it must outlive that.
//!
const char *blobstore_get_last_msg(void)
{
    return _blobstore_last_msg;
}
  609 
//!
//! Gives access to the trace captured by the most recent call to err(), which
//! fills it in with log_dump_trace().
//!
//! @return a pointer to the internal _blobstore_last_trace buffer
//!
//! @pre
//!
//! @note The buffer is overwritten by the next recorded error; copy the string if
//!       it must outlive that.
//!
const char *blobstore_get_last_trace(void)
{
    return _blobstore_last_trace;
}
  623 
//!
//! Enables printing of errors: sets the _do_print_errors flag that err() consults
//! before printing the recorded message.
//!
//! @note Only the printing is affected; err() records the message and the error
//!       code regardless of this flag.
//!
static __INLINE__ void _err_on(void)
{
    _do_print_errors = 1;
}
  633 
//!
//! Disables printing of errors: clears the _do_print_errors flag that err()
//! consults before printing the recorded message.
//!
//! @note Only the printing is affected; err() records the message and the error
//!       code regardless of this flag.
//!
static __INLINE__ void _err_off(void)
{
    _do_print_errors = 0;
}
  643 
  644 //!
  645 //!
  646 //!
  647 //! @param[in] error
  648 //! @param[in] custom_msg
  649 //! @param[in] src_line_no
  650 //! @param[in] src_file_name
  651 //!
  652 //! @pre
  653 //!
  654 //! @note
  655 //!
  656 static void err(blobstore_error_t error, const char *custom_msg, const int src_line_no, const char *src_file_name)
  657 {
  658     const char *msg = custom_msg;
  659     if (msg == NULL) {
  660         msg = blobstore_get_error_str(error);
  661     }
  662     snprintf(_blobstore_last_msg, sizeof(_blobstore_last_msg), "%s:%d %s", src_file_name, src_line_no, msg);
  663     log_dump_trace(_blobstore_last_trace, sizeof(_blobstore_last_trace));
  664 
  665     if (_do_print_errors) {
  666         myprintf(EUCA_LOG_ERROR, "error: %s\n", _blobstore_last_msg);
  667         if (_do_print_trace)
  668             myprintf(EUCA_LOG_ERROR, "%s", _blobstore_last_trace);
  669     }
  670     _blobstore_errno = error;
  671 }
  672 
  673 //!
  674 //!
  675 //!
  676 //! @param[in] default_errno
  677 //! @param[in] src_line_no
  678 //! @param[in] src_file_name
  679 //!
  680 //! @pre
  681 //!
  682 //! @note
  683 //!
  684 static __INLINE__ void propagate_system_errno(blobstore_error_t default_errno, const int src_line_no, const char *src_file_name)
  685 {
  686     switch (errno) {
  687     case ENOENT:
  688         _blobstore_errno = BLOBSTORE_ERROR_NOENT;
  689         break;
  690     case ENOMEM:
  691         _blobstore_errno = BLOBSTORE_ERROR_NOMEM;
  692         break;
  693     case EACCES:
  694         _blobstore_errno = BLOBSTORE_ERROR_ACCES;
  695         break;
  696     case EEXIST:
  697         _blobstore_errno = BLOBSTORE_ERROR_EXIST;
  698         break;
  699     case EINVAL:
  700         _blobstore_errno = BLOBSTORE_ERROR_INVAL;
  701         break;
  702     case ENOSPC:
  703         _blobstore_errno = BLOBSTORE_ERROR_NOSPC;
  704         break;
  705     case EAGAIN:
  706         _blobstore_errno = BLOBSTORE_ERROR_AGAIN;
  707         break;
  708     default:
  709         perror("blobstore");
  710         _blobstore_errno = default_errno;
  711     }
  712     err(_blobstore_errno, NULL, src_line_no, src_file_name);
  713 }
  714 
//!
//! Registers the callback that receives formatted blobstore messages. Once set,
//! myprintf() passes each message to this function instead of printing it with
//! puts().
//!
//! @param[in] fn callback taking the formatted message; NULL restores printing with puts()
//!
//! @pre
//!
//! @note
//!
void blobstore_set_error_function(void (*fn) (const char *msg))
{
    err_fn = fn;
}
  728 
  729 //!
  730 //!
  731 //!
  732 //! @param[in] str
  733 //! @param[in] size
  734 //!
  735 //! @pre
  736 //!
  737 //! @note
  738 //!
  739 static void gen_id(char *str, unsigned int size)
  740 {
  741     snprintf(str, size, "%08lx%08lx%08lx", (unsigned long)random(), (unsigned long)random(), (unsigned long)random());
  742 }
  743 
  744 //!
  745 //!
  746 //!
  747 //! @param[in] l
  748 //! @param[in] type
  749 //!
  750 //! @return
  751 //!
  752 //! @pre
  753 //!
  754 //! @note
  755 //!
  756 struct flock *flock_whole_file(struct flock *l, short type)
  757 {
  758     l->l_type = type;
  759     l->l_pid = 0;
  760 
  761     // set params so as to lock the whole file
  762     l->l_start = 0;
  763     l->l_whence = SEEK_SET;
  764     l->l_len = 0;
  765 
  766     return l;
  767 }
  768 
  769 //!
  770 //!
  771 //!
  772 //! @param[in] l
  773 //!
  774 //! @pre \li MUST be called with _blobstore_mutex held.
  775 //!      \li The l parameter must not be NULL
  776 //!
  777 //! @note
  778 //!
  779 static void close_filelock(blobstore_filelock * l)
  780 {
  781     // close all file descriptors at once (we do this because
  782     // closing any one removes the lock for all descriptors
  783     // held by a process)
  784     for (int i = 0; i < l->next_fd; i++) {
  785         if (l->fd[i] > -1) {
  786             close(l->fd[i]);
  787             l->fd[i] = -1;
  788         }
  789     }
  790     l->next_fd = 0;                    // knock the open fd counter back to 0
  791 }
  792 
//!
//! Destroys the synchronization objects embedded in a file-lock structure (its
//! read-write lock and its mutex) and releases the structure's memory.
//!
//! @param[in] l the file-lock structure to destroy
//!
//! @pre \li MUST be called with _blobstore_mutex held
//!      \li The l parameter must not be NULL.
//!
//! @note This function does not unlink the structure from locks_list or close
//!       its descriptors; in this file the callers do both before calling it.
//!
static void free_filelock(blobstore_filelock * l)
{
    pthread_rwlock_destroy(&(l->lock));
    pthread_mutex_destroy(&(l->mutex));
    EUCA_FREE(l);
}
  809 
//!
//! This function must be used to close files opened with open_and_lock(). (Simply doing close() will
//! leave the file locked via pthreads and future open_and_lock() requests from the same process may
//! fail.)  Also, closing the file descriptor releases the OS file lock for the process, so any other
//! read-only descriptors held by the process are no longer guarded since other processes may open the
//! file for writing.
//!
//! The descriptor is looked up in the global list of file-lock structures, its slot is marked unused,
//! the structure's reference count is decremented and the pthreads read-write lock is released. The
//! descriptors of the structure are actually closed, and the structure is unlinked and freed, only
//! when no other slot is in use and the reference count has dropped to zero.
//!
//! @param[in] fd the file descriptor previously returned by open_and_lock()
//!
//! @return 0 on success; -1 if fd is negative, is not recorded in any file-lock structure, or was
//!         already closed (a BLOBSTORE_ERROR_BADF error is reported via ERR() in these cases)
//!
//! @pre The caller must not hold _blobstore_mutex, since this function acquires it.
//!
//! @note The counters _close_success_ctr and _close_error_ctr are updated under the global mutex,
//!       except for the early return on a negative fd, which updates neither.
//!
static int close_and_unlock(int fd)
{
    if (fd < 0) {
        ERR(BLOBSTORE_ERROR_BADF, NULL);
        return -1;
    }
    int ret = 0;
    {                                  // critical section
        pthread_mutex_lock(&_blobstore_mutex);  // grab global lock (we will not block below and we may be deallocating)
        LOGTRACE("{%u} close_and_unlock: obtained global lock for closing of fd=%d\n", (unsigned int)pthread_self(), fd);

        blobstore_filelock *path_lock = NULL;   // lock struct to which this fd belongs
        int index = -1;                // index of this fd entry in the lock struct

        // traverse all locks, looking for the one that has fd recorded
        // in a slot that is still in use; remember the slot index
        blobstore_filelock **next_ptr = &locks_list;
        for (blobstore_filelock * l = locks_list; l; l = l->next) { // look for the fd
            assert(l->next_fd >= 0 && l->next_fd <= BLOBSTORE_MAX_CONCURRENT);
            for (int i = 0; i < l->next_fd; i++) {
                if (l->fd_status[i] && l->fd[i] == fd) {
                    path_lock = l;     // found it!
                    index = i;
                    break;
                }
            }
            if (index != -1)
                break;
            next_ptr = &(l->next);     // list head or prev element
        }

        if (path_lock) {
            // next_ptr is the link that points at path_lock, which is what
            // must be rewritten if the struct gets removed from the list
            assert(*next_ptr == path_lock);
            assert(index >= 0 && index < BLOBSTORE_MAX_CONCURRENT);

            boolean did_close = FALSE;
            boolean do_free = FALSE;
            {                          // inner critical section to protect changes to 'path_lock', if any
                pthread_mutex_lock(&(path_lock->mutex));    // grab path-specific mutex
                if (path_lock->fd_status[index] == 1) { // has not been closed yet
                    path_lock->fd_status[index] = 0;    // set status to 'unused'
                    did_close = TRUE;
                    path_lock->refs--;

                    // count the slots that are still in use by other openers
                    int open_fds = 0;
                    for (int i = 0; i < path_lock->next_fd; i++) {
                        if (path_lock->fd_status[i]) {
                            assert(path_lock->fd[i] != fd);
                            open_fds++;
                        }
                    }

                    if (open_fds == 0 && path_lock->refs == 0) {    // no open blockblob file descriptors in this process
                        close_filelock(path_lock);
                        *next_ptr = path_lock->next;    // remove from LL
                        do_free = TRUE;     // the actual free happens after the path-specific mutex is released
                        _locks_list_rem_ctr++;
                        LOGTRACE("{%u} close_and_unlock: unlocked and freed fd=%d path=%s\n", (unsigned int)pthread_self(), fd, path_lock->path);

                    } else {
                        LOGTRACE("{%u} close_and_unlock: kept fd=%d path=%s open/refs=%d/%d\n", (unsigned int)pthread_self(), fd, path_lock->path, open_fds, path_lock->refs);
                    }
                    pthread_rwlock_unlock(&(path_lock->lock));  // give up the Posix lock
                    /* lock testing code
                       if (path_lock->sem) {
                       sem_v (path_lock->sem);
                       sem_free (path_lock->sem);
                       path_lock->sem = NULL;
                       }
                     */
                }
                pthread_mutex_unlock(&(path_lock->mutex));
            }                          // end of inner critical section

            if (do_free)
                free_filelock(path_lock);

            if (!did_close) {
                ERR(BLOBSTORE_ERROR_BADF, "file descriptor already closed");
                ret = -1;
            }
        } else {                       // no match
            ERR(BLOBSTORE_ERROR_BADF, "not an open file descriptor");
            ret = -1;
        }

        if (ret == 0)
            _close_success_ctr++;
        else
            _close_error_ctr++;

        LOGTRACE("{%u} close_and_unlock: releasing global lock for closing of fd=%d ret=%d\n", (unsigned int)pthread_self(), fd, ret);
        pthread_mutex_unlock(&_blobstore_mutex);
    }                                  // end of critical section

    return ret;
}
  922 
  923 #ifdef _TEST_LOCKS
  924 //!
  925 //!
  926 //!
  927 //! @param[in] path
  928 //! @param[in] name
  929 //! @param[in] name_size
  930 //!
  931 //! @return
  932 //!
  933 //! @pre
  934 //!
  935 //! @note
  936 //!
  937 static char *path_to_sem_name(const char *path, char *name, int name_size)
  938 {
  939     snprintf(name, name_size, "euca%s", path);
  940     for (int i = 0; i < name_size && name[i]; i++)
  941         if (name[i] == '/')
  942             name[i] = '-';
  943     return name;
  944 }
  945 #endif /* _TEST_LOCKS */
  946 
//!
//! This function creates or opens a file and locks it. The lock is:
//!
//! \li exclusive if the file is being created or written to, or a
//! \li non-exclusive readers' lock if the file was opened RDONLY.
//!
//! The lock works both across threads and processes.  File descriptors obtained from
//! this function should be released with close_and_unlock(). All locks held by a process
//! are released upon termination, whether normal or abnormal.
//!
//! Locking is done in two layers: a pthreads read-write lock kept in a per-path
//! 'blobstore_filelock' structure (for threads of this process) and a Posix file lock
//! set with fcntl() on the whole file (for other processes). Both are attempted without
//! blocking and retried every BLOBSTORE_SLEEP_INTERVAL_USEC until they succeed or the
//! timeout expires.
//!
//! @param[in] path path of the file to open
//! @param[in] flags \li BLOBSTORE_FLAG_RDONLY - open with O_RDONLY, reader lock
//!                  \li BLOBSTORE_FLAG_RDWR - open with O_RDWR, writer lock
//!                  \li BLOBSTORE_FLAG_CREAT - open with O_RDWR | O_CREAT | O_TRUNC, writer lock
//!                  \li BLOBSTORE_FLAG_EXCL - can be added to _CREAT, as with open()
//! @param[in] timeout_usec \li timeout in microseconds for waiting on a lock
//!                         \li BLOBSTORE_NO_TIMEOUT / -1 - wait forever
//!                         \li BLOBSTORE_NO_WAIT / 0 - do not wait at all
//! @param[in] mode gets passed to open() directly
//!
//! @return the locked file descriptor on success; -1 on failure, in which case an error
//!         has been reported through ERR() or PROPAGATE_ERR()
//!
//! @see close_and_unlock()
//!
//! @pre The caller must not hold _blobstore_mutex, since this function acquires it.
//!
//! @note NOTE(review): the BLOBSTORE_MAX_CONCURRENT check is made in the first critical
//!       section while the descriptor slot is filled in a later one; confirm that two
//!       threads opening the same path cannot both pass the check and overrun the
//!       descriptor table.
//!
static int open_and_lock(const char *path, int flags, long long timeout_usec, mode_t mode)
{
    short l_type;
    int o_flags = 0;
    long long started = time_usec();
    long long deadline = started + timeout_usec;

    // verify the flags and, based on them,
    // decide what type of lock to use
    if (flags & BLOBSTORE_FLAG_RDONLY) {
        l_type = F_RDLCK;              // use shared (read) lock
        o_flags |= O_RDONLY;           // required when using F_RDLCK

    } else if ((flags & BLOBSTORE_FLAG_RDWR) || (flags & BLOBSTORE_FLAG_CREAT)) {
        l_type = F_WRLCK;              // use exclusive (write) lock
        o_flags |= O_RDWR;             // required when using F_WRLCK
        if (flags & BLOBSTORE_FLAG_CREAT) {
            o_flags |= O_CREAT;
            // intentionally ignore _EXCL supplied without _CREAT
            if (flags & BLOBSTORE_FLAG_EXCL)
                o_flags |= O_EXCL;
        }

        // creating a file also truncates any existing content
        if (flags & BLOBSTORE_FLAG_CREAT)
            o_flags |= O_TRUNC;
    } else {
        ERR(BLOBSTORE_ERROR_INVAL, "flags to open_and_lock must include either _RDONLY or _RDWR or _CREAT");
        return -1;
    }

    // handle intra-process locking, with a pthreads read-write lock
    // either find in a global linked list 'locks_list' or
    // allocate and append to it a 'blobstore_filelock' struct
    blobstore_filelock *path_lock = NULL;
    {                                  // critical section
        pthread_mutex_lock(&_blobstore_mutex);  // grab the global mutex
        blobstore_filelock **next_ptr = &locks_list;
        for (blobstore_filelock * l = locks_list; l; l = l->next) { // look through existing locks
            if (strcmp(path, l->path) == 0) {
                path_lock = l;
                break;
            }
            next_ptr = &(l->next);
        }
        // next_ptr now points either to LL head or
        // to the last non-matching element's next pointer

        if (path_lock == NULL) {       // this path is not locked by any thread
            path_lock = EUCA_ZALLOC(1, sizeof(blobstore_filelock));
            if (path_lock == NULL) {
                pthread_mutex_unlock(&_blobstore_mutex);
                ERR(BLOBSTORE_ERROR_NOMEM, NULL);
                return -1;
            }
            euca_strncpy(path_lock->path, path, sizeof(path_lock->path));
            pthread_rwlock_init(&(path_lock->lock), NULL);
            pthread_mutex_init(&(path_lock->mutex), NULL);
            *next_ptr = path_lock;     // add at the end of LL
            _locks_list_add_ctr++;
        } else {
            assert(*next_ptr == path_lock);
            if (path_lock->next_fd == BLOBSTORE_MAX_CONCURRENT) {
                pthread_mutex_unlock(&_blobstore_mutex);
                ERR(BLOBSTORE_ERROR_MFILE, "too many open file descriptors");   // to be precise, this means too many file descriptors with overlapping lifetimes
                return -1;
            }
        }
        pthread_mutex_lock(&(path_lock->mutex));    // grab path-specific mutex
        {
            path_lock->refs++;         // increase the reference count while still under lock
        }
        pthread_mutex_unlock(&(path_lock->mutex));  // release path-specific mutex
        pthread_mutex_unlock(&_blobstore_mutex);    // release global mutex
    }                                  // end of critical section

    // open/create the file, using Posix file locks for inter-process locking
    int fd = open(path, o_flags, mode);
    LOGTRACE("{%u} open_and_lock: open fd=%d flags=%0x path=%s\n", (unsigned int)pthread_self(), fd, o_flags, path);
    if (fd == -1) {
        PROPAGATE_ERR(BLOBSTORE_ERROR_UNKNOWN);
        goto error;
    }

    {                                  // critical section
        pthread_mutex_lock(&_blobstore_mutex);  // grab the global mutex

        // ensure we do not have this file descriptor already in some other list
        // (the OS can hand out the same number again only if the old one was
        // closed behind our back, so any stale record of it is invalidated)
        for (blobstore_filelock * l = locks_list; l; l = l->next) {
            {                          // inner critical section
                pthread_mutex_lock(&(l->mutex));    // grab path-specific mutex for atomic update to the table of descriptors
                for (int i = 0; i < l->next_fd; i++) {
                    if (l->fd[i] == fd) {
                        LOGWARN("WARNING: blobstore lock closed outside close_and_unlock [fd=%d, index=%d, refs=%d]\n", fd, i, l->refs);
                        l->fd[i] = -1; // set to invalid so no one else closes our valid descriptor
                        l->fd_status[i] = 0;    // definitely unused.
                        l->refs--;
                    }
                }
                pthread_mutex_unlock(&(l->mutex));  // release path-specific mutex
            }                          // end of inner critical section
        }

        {                              // inner critical section
            pthread_mutex_lock(&(path_lock->mutex));    // grab path-specific mutex for atomic update to the table of descriptors

            // record the file descriptor in the array regardless of whether
            // we ultimately succeed in obtaining the lock or not -- we must
            // ensure we do not close this file descriptor until all users
            // of the lock are through
            path_lock->fd[path_lock->next_fd] = fd; // record file descriptor to enable future lookups
            path_lock->fd_status[path_lock->next_fd] = 1;   // mark the slot as in-use
#ifdef _TEST_FILELOCK
            path_lock->thread_id[path_lock->next_fd] = (unsigned int)pthread_self();
#endif
            path_lock->next_fd++;      // move the index up (it only goes up because we close all file descriptors together)

            pthread_mutex_unlock(&(path_lock->mutex));  // release path-specific mutex
        }                              // end of inner critical section

        pthread_mutex_unlock(&_blobstore_mutex);    // release global mutex
    }                                  // end of critical section

    // poll for both locks until they are obtained or the deadline passes
    for (;;) {
        // first try getting the Posix rwlock
        int ret;
        if (l_type == F_WRLCK)
            ret = pthread_rwlock_trywrlock(&(path_lock->lock));
        else
            ret = pthread_rwlock_tryrdlock(&(path_lock->lock));
        if (ret == 0) {
            // Posix rwlock succeeded, try the file lock
            errno = 0;
            struct flock l;
            if (fcntl(fd, F_SETLK, flock_whole_file(&l, l_type)) != -1)
                break;                 // success!
            pthread_rwlock_unlock(&(path_lock->lock));  // give up the Posix lock
            if (errno != EAGAIN) {     // any error other than inability to get the lock
                PROPAGATE_ERR(BLOBSTORE_ERROR_UNKNOWN);
                goto error;
            }
        }
        long long now = time_usec();
        if (timeout_usec != BLOBSTORE_NO_TIMEOUT && now >= deadline) {  // we timed out waiting for the lock
            ERR(BLOBSTORE_ERROR_AGAIN, NULL);
            pthread_mutex_lock(&_blobstore_mutex);
            _open_timeout_ctr++;
            pthread_mutex_unlock(&_blobstore_mutex);
            goto error;
        }
        LOGTRACE("{%u} open_and_lock: could not acquire %s lock, sleeping on %s\n", (unsigned int)pthread_self(), (ret == 0) ? ("file") : ("posix"), path);

        usleep(BLOBSTORE_SLEEP_INTERVAL_USEC);
    }

    // successfully acquired both file and Posix locks

#ifdef _TEST_LOCKS
    if (l_type == F_WRLCK) {
        char sem_name[512];
        path_lock->sem = sem_alloc(1, path_to_sem_name(path, sem_name, sizeof(sem_name)));
        sem_p(path_lock->sem);
    }
#endif // _TEST_LOCKS

    pthread_mutex_lock(&_blobstore_mutex);
    _open_success_ctr++;
    pthread_mutex_unlock(&_blobstore_mutex);
    {                                  // print out information about the newly acquired lock
        struct stat s;
        fstat(fd, &s);

        struct flock l;
        fcntl(fd, F_GETLK, flock_whole_file(&l, l_type));

        LOGTRACE("{%u} open_and_lock: locked fd=%d path=%s flags=%d ino=%ld mode=%0o [lock type=%d whence=%d start=%ld length=%ld]\n",
                 (unsigned int)pthread_self(), fd, path, o_flags, s.st_ino, s.st_mode, l.l_type, l.l_whence, l.l_start, l.l_len);
    }
    return fd;

error:
    // due to a problem above (inability to open the file or
    // to acquire Posix locks within the deadline), the
    // 'blobstore_filelock' struct will be removed from the
    // global linked list 'locks_list', its files closed,
    // and its memory freed -- but only if this is the last
    // thread using it

    {                                  // critical section
        pthread_mutex_lock(&_blobstore_mutex);  // grab the global lock to protect locks_list traversal

        // we must recalculate next_ptr since the element that it points to
        // may have been removed from the LL and freed while we were outside
        // the critical section
        blobstore_filelock **next_ptr = &locks_list;
        for (blobstore_filelock * l = locks_list; l; l = l->next) { // look through existing locks
            if (path_lock == l)
                break;
            next_ptr = &(l->next);
        }
        // next_ptr must point at the struct we are looking for,
        // which must be in the list
        assert(*next_ptr == path_lock);

        boolean do_free = FALSE;
        {                              // inner critical section
            pthread_mutex_lock(&(path_lock->mutex));    // grab path-specific mutex for atomic update to the table of descriptors
            path_lock->refs--;

            // release our own slot (if one was recorded) and count
            // the slots that are still in use by other openers
            int open_fds = 0;
            for (int i = 0; i < path_lock->next_fd; i++) {
                if (path_lock->fd_status[i]) {
                    if (path_lock->fd[i] == fd) {
                        path_lock->fd_status[i] = 0;    // mark as 'unused'
                    } else {
                        open_fds++;
                    }
                }
            }

            if (open_fds == 0 && path_lock->refs == 0) {    // no open blockblob file descriptors in this process
                close_filelock(path_lock);
                *next_ptr = path_lock->next;    // remove from LL
                do_free = TRUE;
                _locks_list_rem_ctr++;
                LOGTRACE("{%u} open_and_lock: freed fd=%d path=%s\n", (unsigned int)pthread_self(), fd, path_lock->path);

            } else {
                LOGTRACE("{%u} open_and_lock: kept fd=%d path=%s open/refs=%d/%d\n", (unsigned int)pthread_self(), fd, path_lock->path, open_fds, path_lock->refs);
            }

            pthread_mutex_unlock(&(path_lock->mutex));
        }                              // end of inner critical section

        if (do_free)
            free_filelock(path_lock);

        _open_error_ctr++;
        pthread_mutex_unlock(&_blobstore_mutex);
    }                                  // end of critical section

    return -1;
}
 1217 
 1218 //!
 1219 //!
 1220 //!
 1221 //! @param[in] buf
 1222 //! @param[in] key
 1223 //!
 1224 //! @return
 1225 //!
 1226 //! @pre
 1227 //!
 1228 //! @note
 1229 //!
 1230 static char *get_val(const char *buf, const char *key)
 1231 {
 1232     char *val = NULL;
 1233     char full_key[512];
 1234     snprintf(full_key, sizeof(full_key), "%s: ", key);
 1235     char *val_begin = strstr(buf, full_key);
 1236     if (val_begin) {
 1237         val_begin += strlen(full_key);
 1238         char *val_end = val_begin;
 1239         while (*val_end != '\n' && *val_end != '\0')
 1240             val_end++;
 1241         val = EUCA_ZALLOC(val_end - val_begin + 1, sizeof(char));   // +1 for the \0
 1242         if (val == NULL) {
 1243             ERR(BLOBSTORE_ERROR_NOMEM, NULL);
 1244             return NULL;
 1245         }
 1246         strncpy(val, val_begin, val_end - val_begin);
 1247     }
 1248 
 1249     return val;
 1250 }
 1251 
 1252 //!
 1253 //! Helper for reading a file into a buffer
 1254 //!
 1255 //! @param[in] fd
 1256 //! @param[in] buf
 1257 //! @param[in] size_buf
 1258 //!
 1259 //! @return The number of bytes read or -1 if error
 1260 //!
 1261 //! @pre
 1262 //!
 1263 //! @note
 1264 //!
 1265 static int fd_to_buf(int fd, char *buf, int size_buf)
 1266 {
 1267     if (lseek(fd, 0, SEEK_SET) == -1) {
 1268         ERR(BLOBSTORE_ERROR_ACCES, "failed to seek in metadata file");
 1269         return -1;
 1270     }
 1271 
 1272     struct stat sb;
 1273     if (fstat(fd, &sb) == -1) {
 1274         ERR(BLOBSTORE_ERROR_ACCES, "failed to stat metadata file");
 1275         return -1;
 1276     }
 1277 
 1278     if (read(fd, buf, size_buf) != sb.st_size)  //! @TODO do this in a loop?
 1279     {
 1280         ERR(BLOBSTORE_ERROR_NOENT, "failed to read metadata file");
 1281         return -1;
 1282     }
 1283 
 1284     return sb.st_size;
 1285 }
 1286 
 1287 //!
 1288 //! Helper for write buffer into a file at descriptor
 1289 //!
 1290 //! @param[in] fd
 1291 //! @param[in] buf
 1292 //! @param[in] size_buf
 1293 //!
 1294 //! @return The number of bytes written or -1 if error
 1295 //!
 1296 //! @pre
 1297 //!
 1298 //! @note
 1299 //!
 1300 static int buf_to_fd(int fd, const char *buf, int size_buf)
 1301 {
 1302     if (lseek(fd, 0, SEEK_SET) == -1) {
 1303         ERR(BLOBSTORE_ERROR_ACCES, "failed to seek in metadata file");
 1304         return -1;
 1305     }
 1306 
 1307     ssize_t size_wrote = write(fd, buf, size_buf);  //! @TODO do this in a loop?
 1308     if (size_wrote < size_buf) {
 1309         ERR(BLOBSTORE_ERROR_NOENT, "failed to write metadata file");
 1310         return -1;
 1311     }
 1312     // as a sanity check, stat the file and verify its size
 1313     struct stat sb;
 1314     if (fstat(fd, &sb) == -1) {
 1315         ERR(BLOBSTORE_ERROR_ACCES, "failed to stat metadata file");
 1316         return -1;
 1317     }
 1318 
 1319     if (sb.st_size != size_buf) {
 1320         ERR(BLOBSTORE_ERROR_NOENT, "failed to read back metadata file");
 1321         return -1;
 1322     }
 1323 
 1324     return sb.st_size;
 1325 }
 1326 
 1327 //!
 1328 //!
 1329 //!
 1330 //! @param[in] bs
 1331 //!
 1332 //! @return
 1333 //!
 1334 //! @pre
 1335 //!
 1336 //! @note
 1337 //!
 1338 static int read_store_metadata(blobstore * bs)
 1339 {
 1340     char *val = NULL;
 1341     char buf[1024] = "";
 1342     int size = fd_to_buf(bs->fd, buf, (sizeof(buf) - 1));
 1343 
 1344     if (size == -1)
 1345         return -1;
 1346     if (size < 30) {
 1347         ERR(BLOBSTORE_ERROR_NOENT, "metadata size is too small");
 1348         return -1;
 1349     }
 1350 
 1351     buf[size] = '\0';
 1352     if ((val = get_val(buf, "id")) == NULL)
 1353         return -1;
 1354     euca_strncpy(bs->id, val, sizeof(bs->id));
 1355     EUCA_FREE(val);
 1356 
 1357     if ((val = get_val(buf, "limit")) == NULL)
 1358         return -1;
 1359     errno = 0;
 1360     bs->limit_blocks = strtoll(val, NULL, 10);
 1361     EUCA_FREE(val);
 1362     if (errno != 0) {
 1363         ERR(BLOBSTORE_ERROR_NOENT, "invalid metadata file (limit is missing)");
 1364         return -1;
 1365     }
 1366 
 1367     if ((val = get_val(buf, "revocation")) == NULL)
 1368         return -1;
 1369     errno = 0;
 1370     bs->revocation_policy = strtoll(val, NULL, 10);
 1371     EUCA_FREE(val);
 1372     if (errno != 0) {
 1373         ERR(BLOBSTORE_ERROR_NOENT, "invalid metadata file (revocation is missing)");
 1374         return -1;
 1375     }
 1376 
 1377     if ((val = get_val(buf, "snapshot")) == NULL)
 1378         return -1;
 1379     errno = 0;
 1380     bs->snapshot_policy = strtoll(val, NULL, 10);
 1381     EUCA_FREE(val);
 1382     if (errno != 0) {
 1383         ERR(BLOBSTORE_ERROR_NOENT, "invalid metadata file (snapshot is missing)");
 1384         return -1;
 1385     }
 1386 
 1387     if ((val = get_val(buf, "format")) == NULL)
 1388         return -1;
 1389     errno = 0;
 1390     bs->format = strtoll(val, NULL, 10);
 1391     EUCA_FREE(val);
 1392     if (errno != 0) {
 1393         ERR(BLOBSTORE_ERROR_NOENT, "invalid metadata file (format is missing)");
 1394         return -1;
 1395     }
 1396     return 0;
 1397 }
 1398 
 1399 //!
 1400 //!
 1401 //!
 1402 //! @param[in] bs
 1403 //!
 1404 //! @return
 1405 //!
 1406 //! @pre
 1407 //!
 1408 //! @note
 1409 //!
 1410 static int write_store_metadata(blobstore * bs)
 1411 {
 1412     if (ftruncate(bs->fd, 0) == -1) {
 1413         ERR(BLOBSTORE_ERROR_NOENT, "failed to truncate the metadata file");
 1414         return -1;
 1415     }
 1416     if (lseek(bs->fd, 0, SEEK_SET) == -1) {
 1417         ERR(BLOBSTORE_ERROR_ACCES, "failed to seek in metadata file");
 1418         return -1;
 1419     }
 1420     char buf[1024];
 1421     snprintf(buf, sizeof(buf), "id: %s\n" "limit: %lld\n" "revocation: %d\n" "snapshot: %d\n" "format: %d\n", bs->id, bs->limit_blocks,
 1422              bs->revocation_policy, bs->snapshot_policy, bs->format);
 1423     int slen = strlen(buf);
 1424     int len = write(bs->fd, buf, slen);
 1425     if (len != slen) {
 1426         ERR(BLOBSTORE_ERROR_NOENT, "failed to write to the metadata file");
 1427         return -1;
 1428     }
 1429 
 1430     return 0;
 1431 }
 1432 
 1433 //!
 1434 //!
 1435 //!
 1436 //! @return
 1437 //!
 1438 //! @pre
 1439 //!
 1440 //! @note
 1441 //!
 1442 int blobstore_init(void)
 1443 {
 1444     int ret = 0;
 1445 
 1446     if (!initialized) {
 1447         ret = diskutil_init(0);
 1448         if (ret) {
 1449             ERR(BLOBSTORE_ERROR_UNKNOWN, "failed to initialize diskutil library");
 1450         } else {
 1451             ret = verify_helpers(helpers, helpers_path, LASTHELPER);
 1452             if (ret) {
 1453                 for (int i = 0; i < LASTHELPER; i++) {
 1454                     if (helpers_path[i] == NULL)
 1455                         LOGERROR("ERROR: missing a required handler: %s\n", helpers[i]);
 1456                 }
 1457                 ERR(BLOBSTORE_ERROR_UNKNOWN, "failed to initialize blobstore library");
 1458             } else {
 1459                 initialized = 1;
 1460             }
 1461         }
 1462         euca_srand();                  // seed the random number generator
 1463     }
 1464 
 1465     return ret;
 1466 }
 1467 
 1468 //!
 1469 //!
 1470 //!
 1471 //! @return
 1472 //!
 1473 //! @pre
 1474 //!
 1475 //! @note
 1476 //!
 1477 int blobstore_cleanup(void)
 1478 {
 1479     diskutil_cleanup();
 1480     return 0;
 1481 }
 1482 
 1483 //!
 1484 //!
 1485 //!
 1486 //! @param[in] path
 1487 //! @param[in] limit_blocks
 1488 //! @param[in] flags
 1489 //! @param[in] format
 1490 //! @param[in] revocation_policy
 1491 //! @param[in] snapshot_policy
 1492 //!
 1493 //! @return
 1494 //!
 1495 //! @pre
 1496 //!
 1497 //! @note
 1498 //!
 1499 blobstore *blobstore_open(const char *path, unsigned long long limit_blocks, unsigned int flags,    // BLOBSTORE_FLAG_CREAT - same semantcs as for open() flags
 1500                           blobstore_format_t format, blobstore_revocation_t revocation_policy, blobstore_snapshot_t snapshot_policy)
 1501 {
 1502     int saved_errno;
 1503 
 1504     if (blobstore_init())
 1505         return NULL;
 1506 
 1507     blobstore *bs = EUCA_ZALLOC(1, sizeof(blobstore));
 1508     if (bs == NULL) {
 1509         ERR(BLOBSTORE_ERROR_NOMEM, NULL);
 1510         goto out;
 1511     }
 1512     euca_strncpy(bs->path, path, sizeof(bs->path)); //! @TODO canonicalize path
 1513     char meta_path[PATH_MAX];
 1514     snprintf(meta_path, sizeof(meta_path), "%s/%s", bs->path, BLOBSTORE_METADATA_FILE);
 1515 
 1516     int write_flags = 0;
 1517     if (flags & BLOBSTORE_FLAG_CREAT) {
 1518         write_flags = BLOBSTORE_FLAG_CREAT | BLOBSTORE_FLAG_EXCL;
 1519     };
 1520 
 1521 write_metadata:
 1522 
 1523     if (write_flags) {
 1524         _blobstore_errno = BLOBSTORE_ERROR_OK;
 1525         _err_off();
 1526         bs->fd = open_and_lock(meta_path, write_flags, 0, BLOBSTORE_FILE_PERM);
 1527         _err_on();
 1528         if (bs->fd != -1) {            // managed to create or open blobstore metadata file and got exclusive lock
 1529 
 1530             // the intention is to create the blobstore for the first time
 1531             if (write_flags & BLOBSTORE_FLAG_CREAT) {
 1532                 gen_id(bs->id, sizeof(bs->id));
 1533                 bs->limit_blocks = limit_blocks;
 1534                 bs->revocation_policy = (revocation_policy == BLOBSTORE_REVOCATION_ANY) ? BLOBSTORE_REVOCATION_NONE : revocation_policy;
 1535                 bs->snapshot_policy = (snapshot_policy == BLOBSTORE_SNAPSHOT_ANY) ? BLOBSTORE_SNAPSHOT_DM : snapshot_policy;    //! @TODO verify that DM is available?
 1536                 bs->format = (format == BLOBSTORE_FORMAT_ANY) ? BLOBSTORE_FORMAT_FILES : format;
 1537 
 1538                 // write metadata to disk
 1539                 write_store_metadata(bs);
 1540 
 1541             } else if (write_flags & BLOBSTORE_FLAG_RDWR) { // the intention is to adjust metadata
 1542                 if (read_store_metadata(bs))
 1543                     goto free;
 1544                 assert(bs->id);
 1545                 if (limit_blocks)
 1546                     bs->limit_blocks = limit_blocks;
 1547                 if (revocation_policy != BLOBSTORE_REVOCATION_ANY)
 1548                     bs->revocation_policy = revocation_policy;
 1549                 write_store_metadata(bs);
 1550             }
 1551             close_and_unlock(bs->fd);  // try to close, thus giving up the exclusive lock
 1552         }
 1553         if (_blobstore_errno != BLOBSTORE_ERROR_OK &&   // either open or write failed
 1554             _blobstore_errno != BLOBSTORE_ERROR_EXIST &&    // it is OK if file already exists
 1555             _blobstore_errno != BLOBSTORE_ERROR_AGAIN) {    // it is OK if we lost the race for the write lock
 1556             ERR(_blobstore_errno, "failed to open or create blobstore");
 1557             goto free;
 1558         }
 1559     }
 1560     // now (re)open, with a shared read lock
 1561     bs->fd = open_and_lock(meta_path, BLOBSTORE_FLAG_RDONLY, BLOBSTORE_METADATA_TIMEOUT_USEC, BLOBSTORE_FILE_PERM);
 1562     if (bs->fd == -1) {
 1563         goto free;
 1564     }
 1565     if (read_store_metadata(bs)) {     // try reading metadata
 1566         goto free;
 1567     }
 1568     // verify that parameters are not being changed
 1569     if (limit_blocks && limit_blocks != bs->limit_blocks) {
 1570         if (flags & BLOBSTORE_FLAG_STRICT) {
 1571             ERR(BLOBSTORE_ERROR_INVAL, "'limit_blocks' does not match existing blobstore");
 1572             goto free;
 1573         } else {
 1574             LOGINFO("adjusting blobstore limit from %lld to %lld\n", bs->limit_blocks, limit_blocks);
 1575             write_flags = BLOBSTORE_FLAG_RDWR;
 1576             close_and_unlock(bs->fd);
 1577             goto write_metadata;
 1578         }
 1579     }
 1580     if (snapshot_policy != BLOBSTORE_SNAPSHOT_ANY && snapshot_policy != bs->snapshot_policy) {
 1581         ERR(BLOBSTORE_ERROR_INVAL, "'snapshot_policy' does not match existing blobstore");
 1582         goto free;
 1583     }
 1584     if (format != BLOBSTORE_FORMAT_ANY && format != bs->format) {
 1585         ERR(BLOBSTORE_ERROR_INVAL, "'format' does not match existing blobstore");
 1586         goto free;
 1587     }
 1588     if (revocation_policy != BLOBSTORE_REVOCATION_ANY && revocation_policy != bs->revocation_policy) {
 1589         if (flags & BLOBSTORE_FLAG_STRICT) {
 1590             ERR(BLOBSTORE_ERROR_INVAL, "'revocation_policy' does not match existing blobstore");    //! @TODO maybe make revocation_policy changeable after creation
 1591             goto free;
 1592         } else {
 1593             write_flags = BLOBSTORE_FLAG_RDWR;
 1594             close_and_unlock(bs->fd);
 1595             goto write_metadata;
 1596         }
 1597     }
 1598     int fd = bs->fd;
 1599     bs->fd = -1;
 1600     close_and_unlock(fd);
 1601     goto out;
 1602 
 1603 free:
 1604     saved_errno = _blobstore_errno;
 1605     close_and_unlock(bs->fd);
 1606     EUCA_FREE(bs);
 1607     _blobstore_errno = saved_errno;
 1608 
 1609 out:
 1610     return bs;
 1611 }
 1612 
 1613 //!
 1614 //! Frees the blobstore handle
 1615 //!
 1616 //! @param[in] bs
 1617 //!
 1618 //! @return
 1619 //!
 1620 //! @pre
 1621 //!
 1622 //! @note
 1623 //!
 1624 int blobstore_close(blobstore * bs)
 1625 {
 1626     EUCA_FREE(bs);
 1627     return 0;
 1628 }
 1629 
 1630 //!
 1631 //! Locks the blobstore
 1632 //!
 1633 //! @param[in] bs
 1634 //! @param[in] timeout_usec
 1635 //!
 1636 //! @return
 1637 //!
 1638 //! @pre
 1639 //!
 1640 //! @note
 1641 //!
 1642 int blobstore_lock(blobstore * bs, long long timeout_usec)
 1643 {
 1644     char meta_path[PATH_MAX];
 1645     snprintf(meta_path, sizeof(meta_path), "%s/%s", bs->path, BLOBSTORE_METADATA_FILE);
 1646 
 1647     LOGTRACE("{%u} blobstore_lock: called for %s\n", (unsigned int)pthread_self(), bs->path);
 1648     int fd = open_and_lock(meta_path, BLOBSTORE_FLAG_RDWR, timeout_usec, BLOBSTORE_FILE_PERM);
 1649     if (fd != -1)
 1650         bs->fd = fd;
 1651     return fd;
 1652 }
 1653 
 1654 //!
 1655 //! Unlocks the blobstore
 1656 //!
 1657 //! @param[in] bs
 1658 //!
 1659 //! @return
 1660 //!
 1661 //! @pre
 1662 //!
 1663 //! @note
 1664 //!
 1665 int blobstore_unlock(blobstore * bs)
 1666 {
 1667     int fd = bs->fd;
 1668     bs->fd = -1;
 1669     LOGTRACE("{%u} blobstore_unlock: called for %s\n", (unsigned int)pthread_self(), bs->path);
 1670     return close_and_unlock(fd);
 1671 }
 1672 
 1673 //!
 1674 //! If no outside references to store or blobs exist, and
 1675 //! no blobs are protected, deletes the blobs, the store metadata,
 1676 //! and frees the blobstore handle
 1677 //!
 1678 //! @param[in] bs
 1679 //!
 1680 //! @return
 1681 //!
 1682 //! @pre
 1683 //!
 1684 //! @note
 1685 //!
 1686 int blobstore_delete(blobstore * bs)
 1687 {
 1688     LOGINFO("creating the baloon blob\n");
 1689     blockblob *bb = blockblob_open(bs, "__baloon_blob__",
 1690                                    bs->limit_blocks * 512,  // biggest possible blob
 1691                                    (BLOBSTORE_FLAG_CREAT | BLOBSTORE_FLAG_EXCL),
 1692                                    NULL,    // do not care for signature
 1693                                    BLOBSTORE_METADATA_TIMEOUT_USEC);    // give a generous timeout
 1694     if (bb == NULL) {
 1695         LOGINFO("failed to purge blobstore: %s: %s\n", blobstore_get_error_str(blobstore_get_error()), blobstore_get_last_msg());
 1696         ERR(BLOBSTORE_ERROR_INVAL, "failed to purge blobstore with a baloon blob");
 1697         return EUCA_ERROR;
 1698     }
 1699     blockblob_delete(bb, BLOBSTORE_DELETE_TIMEOUT_USEC, TRUE);  // get rid of the last blob
 1700 
 1701     char meta_path[PATH_MAX];
 1702     snprintf(meta_path, sizeof(meta_path), "%s/%s", bs->path, BLOBSTORE_METADATA_FILE);
 1703     LOGINFO("removing blobstore metadata '%s'\n", meta_path);
 1704     unlink(meta_path);
 1705     EUCA_FREE(bs);
 1706 
 1707     return EUCA_OK;
 1708 }
 1709 
 1710 //!
 1711 //!
 1712 //!
 1713 //! @return
 1714 //!
 1715 //! @pre
 1716 //!
 1717 //! @note
 1718 //!
 1719 int blobstore_get_error(void)
 1720 {
 1721     return _blobstore_errno;
 1722 }
 1723 
 1724 //!
 1725 //! Helper for setting paths, depending on blockblob_path_t given BLOCKBLOB_PATH_X: x = tolower(X)
 1726 //!
 1727 //!  for BLOBSTORE_FORMAT_FILES:     BS/BB.x
 1728 //!  for BLOBSTORE_FORMAT_DIRECTORY: BS/BB/x
 1729 //!
 1730 //!  where BS is blobstore path and BB is a blockblob id.
 1731 //!  BB may have '/' in it, thus placing all blob-related
 1732 //!  files in a deeper dir hierarchy
 1733 //!
 1734 //! @param[in]  path_t
 1735 //! @param[in]  bs
 1736 //! @param[in]  bb_id
 1737 //! @param[out] path
 1738 //! @param[in]  path_size
 1739 //!
 1740 //! @return
 1741 //!
 1742 //! @pre
 1743 //!
 1744 //! @note
 1745 //!
 1746 static int set_blockblob_metadata_path(blockblob_path_t path_t, const blobstore * bs, const char *bb_id, char *path, size_t path_size)
 1747 {
 1748     char base[PATH_MAX];
 1749     snprintf(base, sizeof(base), "%s/%s", bs->path, bb_id);
 1750 
 1751     char name[32];
 1752     switch (path_t) {
 1753     case BLOCKBLOB_PATH_BLOCKS:
 1754         euca_strncpy(name, blobstore_metadata_suffixes[BLOCKBLOB_PATH_BLOCKS], sizeof(name));
 1755         break;
 1756     case BLOCKBLOB_PATH_LOCK:
 1757         euca_strncpy(name, blobstore_metadata_suffixes[BLOCKBLOB_PATH_LOCK], sizeof(name));
 1758         break;
 1759     case BLOCKBLOB_PATH_DM:
 1760         euca_strncpy(name, blobstore_metadata_suffixes[BLOCKBLOB_PATH_DM], sizeof(name));
 1761         break;
 1762     case BLOCKBLOB_PATH_DEPS:
 1763         euca_strncpy(name, blobstore_metadata_suffixes[BLOCKBLOB_PATH_DEPS], sizeof(name));
 1764         break;
 1765     case BLOCKBLOB_PATH_LOOPBACK:
 1766         euca_strncpy(name, blobstore_metadata_suffixes[BLOCKBLOB_PATH_LOOPBACK], sizeof(name));
 1767         break;
 1768     case BLOCKBLOB_PATH_SIG:
 1769         euca_strncpy(name, blobstore_metadata_suffixes[BLOCKBLOB_PATH_SIG], sizeof(name));
 1770         break;
 1771     case BLOCKBLOB_PATH_REFS:
 1772         euca_strncpy(name, blobstore_metadata_suffixes[BLOCKBLOB_PATH_REFS], sizeof(name));
 1773         break;
 1774     case BLOCKBLOB_PATH_HOLLOW:
 1775         euca_strncpy(name, blobstore_metadata_suffixes[BLOCKBLOB_PATH_HOLLOW], sizeof(name));
 1776         break;
 1777     default:
 1778         ERR(BLOBSTORE_ERROR_INVAL, "invalid path_t");
 1779         return -1;
 1780     }
 1781 
 1782     switch (bs->format) {
 1783     case BLOBSTORE_FORMAT_FILES:
 1784         snprintf(path, path_size, "%s.%s", base, name);
 1785         break;
 1786     case BLOBSTORE_FORMAT_DIRECTORY:
 1787         snprintf(path, path_size, "%s/%s", base, name);
 1788         break;
 1789     default:
 1790         ERR(BLOBSTORE_ERROR_INVAL, "invalid bs->format");
 1791         return -1;
 1792     }
 1793 
 1794     return 0;
 1795 }
 1796 
 1797 //!
 1798 //! Write string 'str' into a specific metadata file (based on 'path_t') of blob 'bb_id'
 1799 //!
 1800 //! @param[in] path_t
 1801 //! @param[in] bs
 1802 //! @param[in] bb_id
 1803 //! @param[in] str
 1804 //!
 1805 //! @return 0 for success or -1 for error
 1806 //!
 1807 //! @pre
 1808 //!
 1809 //! @note
 1810 //!
 1811 static int write_blockblob_metadata_path(blockblob_path_t path_t, const blobstore * bs, const char *bb_id, const char *str)
 1812 {
 1813     int ret = 0;
 1814     char path[PATH_MAX];
 1815     set_blockblob_metadata_path(path_t, bs, bb_id, path, sizeof(path));
 1816 
 1817     int fd = open_and_lock(path,
 1818                            BLOBSTORE_FLAG_CREAT | BLOBSTORE_FLAG_RDWR,
 1819                            BLOBSTORE_METADATA_TIMEOUT_USEC,
 1820                            BLOBSTORE_FILE_PERM);
 1821     if (fd == -1)
 1822         return -1;
 1823     int size = buf_to_fd(fd, str, strlen(str));
 1824     int ret_close = close_and_unlock(fd);
 1825     if (size != strlen(str)) {
 1826         // set the error code, possibly overriding one set by close_and_unlock
 1827         ERR(BLOBSTORE_ERROR_UNKNOWN, "failed to write desired number of characters to metadata file");
 1828         ret = -1;
 1829     } else if (ret_close != 0) {
 1830         ret = -1;                      // close_and_unlock should have set the error code
 1831     }
 1832 
 1833     return ret;
 1834 }
 1835 
 1836 //!
 1837 //! Reads contents of a specific metadata file (based on 'path_t') of blob 'bb_id' into string 'str' up to 'str_size'
 1838 //!
 1839 //! @param[in]  path_t
 1840 //! @param[in]  bs
 1841 //! @param[in]  bb_id
 1842 //! @param[out] str
 1843 //! @param[in]  str_size
 1844 //!
 1845 //! @return The number of bytes read or -1 in case of error
 1846 //!
 1847 //! @pre
 1848 //!
 1849 //! @note
 1850 //!
 1851 static int read_blockblob_metadata_path(blockblob_path_t path_t, const blobstore * bs, const char *bb_id, char *str, int str_size)
 1852 {
 1853     char path[PATH_MAX];
 1854     set_blockblob_metadata_path(path_t, bs, bb_id, path, sizeof(path));
 1855 
 1856     int fd = open_and_lock(path,
 1857                            BLOBSTORE_FLAG_RDONLY,
 1858                            BLOBSTORE_METADATA_TIMEOUT_USEC,
 1859                            BLOBSTORE_FILE_PERM);
 1860     if (fd == -1)
 1861         return -1;
 1862     int size = fd_to_buf(fd, str, str_size);
 1863     int ret_close = close_and_unlock(fd);
 1864     if (size < 1) {
 1865         // set the error code, possibly overriding one set by close_and_unlock
 1866         ERR(BLOBSTORE_ERROR_NOENT, "blockblob metadata size is too small");
 1867         size = -1;
 1868     } else if (ret_close != 0) {
 1869         size = -1;                     // close_and_unlock should have set the error code
 1870     }
 1871 
 1872     return size;
 1873 }
 1874 
 1875 //!
 1876 //! Writes strings from 'array' of size 'array_size' (which can be 0) line-by-line
 1877 //! into a specific metadata file (based on 'path_t') of blob 'bb_id'
 1878 //!
 1879 //! @param[in]  path_t
 1880 //! @param[in]  bs
 1881 //! @param[in]  bb_id
 1882 //! @param[out] array
 1883 //! @param[out] array_size
 1884 //!
 1885 //! @return 0 for success and -1 for error
 1886 //!
 1887 //! @pre
 1888 //!
 1889 //! @note
 1890 //!
 1891 static int write_array_blockblob_metadata_path(blockblob_path_t path_t, const blobstore * bs, const char *bb_id, char **array, int array_size)
 1892 {
 1893     int i = 0;
 1894     int fd = 0;
 1895     int ret = 0;
 1896     int dataLen = 0;
 1897     unsigned int openFlags = (BLOBSTORE_FLAG_CREAT | BLOBSTORE_FLAG_TRUNC | BLOBSTORE_FLAG_RDWR);
 1898     char path[EUCA_MAX_PATH] = "";
 1899 
 1900     set_blockblob_metadata_path(path_t, bs, bb_id, path, sizeof(path));
 1901     if ((fd = open_and_lock(path, openFlags, BLOBSTORE_METADATA_TIMEOUT_USEC, BLOBSTORE_FILE_PERM)) == -1) {
 1902         PROPAGATE_ERR(BLOBSTORE_ERROR_UNKNOWN);
 1903         return (-1);
 1904     }
 1905 
 1906     for (i = 0; i < array_size; i++) {
 1907         dataLen = strlen(array[i]);
 1908         if (write(fd, array[i], dataLen) != dataLen) {
 1909             PROPAGATE_ERR(BLOBSTORE_ERROR_UNKNOWN);
 1910             ret = -1;
 1911             break;
 1912         }
 1913 
 1914         if (write(fd, "\n", 1) != 1) {
 1915             PROPAGATE_ERR(BLOBSTORE_ERROR_UNKNOWN);
 1916             ret = -1;
 1917             break;
 1918         }
 1919     }
 1920 
 1921     if (close_and_unlock(fd) != 0) {
 1922         PROPAGATE_ERR(BLOBSTORE_ERROR_UNKNOWN);
 1923         ret = -1;
 1924     }
 1925 
 1926     return (ret);
 1927 }
 1928 
 1929 //!
 1930 //! The equivalent of getline for file descriptor.
 1931 //!
 1932 //! @param[in,out] ppLine pointer to the character array to read into
 1933 //! @param[in,out] n amount of memory currently allocated for (*ppLine), if any
 1934 //! @param[in]     fd file descriptor to read from
 1935 //!
 1936 //! @return On success, number of characters read excluding the '\n' character is returned. A
 1937 //!         value or 0 indicates we reached the end of the file. A returned value of -1 indicates
 1938 //!         an error and the errno is set appropriately. On error, the original allocated memory
 1939 //!         is left untouched.
 1940 //!
 1941 //! @pre
 1942 //!
 1943 //! @note
 1944 //!
 1945 ssize_t get_line_desc(char **ppLine, size_t * n, int fd)
 1946 {
 1947     char c = '\0';
 1948     size_t length = 0;
 1949     size_t newSize = (*n);
 1950     ssize_t error = 0;
 1951     char *pLine = *ppLine;
 1952     char *pNewBlock = *ppLine;
 1953 
 1954     do {
 1955         // Read one character.. If 0, then EOF, if less then error!
 1956         if ((error = read(fd, &c, 1)) <= 0)
 1957             break;
 1958 
 1959         // If we're going over, re-allocate memory
 1960         if ((length + 1) >= newSize) {
 1961             newSize += 64;
 1962 
 1963             if ((pNewBlock = EUCA_REALLOC(pLine, newSize, sizeof(char))) == NULL) {
 1964                 error = -1;
 1965                 break;
 1966             }
 1967 
 1968             pLine = pNewBlock;
 1969         }
 1970 
 1971         pLine[length++] = c;
 1972     } while (c != '\n');
 1973 
 1974     // Did we have an error?
 1975     if (error < 0) {
 1976         // If (*n) was originally 0 we should free pLine since we allocated that memory.
 1977         if (((*n) == 0) && (pLine != NULL)) {
 1978             EUCA_FREE(pLine);
 1979         }
 1980         return (-1);
 1981     }
 1982     // Now strip the '\n' character
 1983     if (pLine != NULL) {
 1984         (*ppLine) = pLine;
 1985         pLine[length] = '\0';          // Safety
 1986 
 1987         // Now strip '\n' if present. We could have reached EOF and no '\n' was present
 1988         if (pLine[length - 1] == '\n')
 1989             pLine[--length] = '\0';
 1990 
 1991         // Update the (*n) value
 1992         (*n) = newSize;
 1993     }
 1994 
 1995     return (length);
 1996 }
 1997 
 1998 //!
 1999 //! Reads lines from a specific metadata file (based on 'path_t') of blob 'bb_id',
 2000 //! places each line into a newly allocated string, arranges pointers to these
 2001 //! strings into a newly allocated array of pointers, and places the size into 'array_size'
 2002 //!
 2003 //! @param[in]  path_t
 2004 //! @param[in]  bs
 2005 //! @param[in]  bb_id
 2006 //! @param[out] array
 2007 //! @param[out] array_size
 2008 //!
 2009 //! @return 0 for success and -1 for error
 2010 //!
 2011 //! @pre
 2012 //!
 2013 //! @note Caller must deallocate the array and the strings pointed to by the array
 2014 //!
 2015 static int read_array_blockblob_metadata_path(blockblob_path_t path_t, const blobstore * bs, const char *bb_id, char ***array, int *array_size)
 2016 {
 2017     int fd = -1;
 2018     int ret = 0;
 2019     int i = 0;
 2020     int j = 0;
 2021     size_t n = 0;
 2022     ssize_t rdLen = 1;
 2023     char **lines = NULL;
 2024     char *line = NULL;
 2025     char **bigger_lines = NULL;
 2026     char path[EUCA_MAX_PATH] = "";
 2027 
 2028     set_blockblob_metadata_path(path_t, bs, bb_id, path, sizeof(path));
 2029 
 2030     // Acquire the metadata file descriptor
 2031     if ((fd = open_and_lock(path, BLOBSTORE_FLAG_RDONLY, BLOBSTORE_METADATA_TIMEOUT_USEC, BLOBSTORE_FILE_PERM)) == -1) {
 2032         PROPAGATE_ERR(BLOBSTORE_ERROR_UNKNOWN);
 2033         *array = NULL;
 2034         *array_size = 0;
 2035         return 0;
 2036     }
 2037     // Read each line and fill our array
 2038     for (i = 0, rdLen = 1; rdLen > 0; i++) {
 2039         n = 0;
 2040         line = NULL;
 2041 
 2042         // Read the file. 0 means EOF, < 0 means error...
 2043         if ((rdLen = get_line_desc(&line, &n, fd)) < 0) {
 2044             EUCA_FREE(line);
 2045             PROPAGATE_ERR(BLOBSTORE_ERROR_UNKNOWN);
 2046             ret = -1;
 2047             break;
 2048         } else if (rdLen == 0) {
 2049             // EOF, no more data
 2050             break;
 2051         }
 2052 
 2053         LOGEXTREME("%s => [%d] READ LINE %s rdLen %lu, n %ld\n", __func__, fd, line, rdLen, n);
 2054 
 2055         // Add one more entry to our metadata array
 2056         if ((bigger_lines = EUCA_REALLOC(lines, (i + 1), sizeof(char *))) == NULL) {
 2057             ERR(BLOBSTORE_ERROR_NOMEM, NULL);
 2058             EUCA_FREE(line);
 2059             ret = -1;
 2060             break;
 2061         }
 2062 
 2063         lines = bigger_lines;
 2064         lines[i] = line;
 2065     }
 2066 
 2067     // Release the metadata file descriptor
 2068     if (close_and_unlock(fd) != 0) {
 2069         PROPAGATE_ERR(BLOBSTORE_ERROR_UNKNOWN);
 2070         ret = -1;
 2071     }
 2072     // if something failed, lets do some house cleanup before we bail
 2073     if (ret == -1) {
 2074         if (lines != NULL) {
 2075             for (j = 0; j < i; j++)
 2076                 EUCA_FREE(lines[j]);
 2077             EUCA_FREE(lines);
 2078         }
 2079         return (ret);
 2080     }
 2081 
 2082     *array = lines;
 2083     *array_size = i;
 2084     return (0);
 2085 }
 2086 
 2087 //!
 2088 //!
 2089 //!
 2090 //! @param[in] path_t
 2091 //! @param[in] bs
 2092 //! @param[in] bb_id
 2093 //! @param[in] entry
 2094 //! @param[in] removing
 2095 //!
 2096 //! @return
 2097 //!
 2098 //! @pre
 2099 //!
 2100 //! @note
 2101 //!
 2102 static int update_entry_blockblob_metadata_path(blockblob_path_t path_t, const blobstore * bs, const char *bb_id, const char *entry, int removing)
 2103 {
 2104     int ret = 0;
 2105 
 2106     // read in current entries from a metadata file
 2107     char **entries;
 2108     int entries_size;
 2109     if (read_array_blockblob_metadata_path(path_t, bs, bb_id, &entries, &entries_size) == -1) {
 2110         return -1;
 2111     }
 2112     // see if this entry is already in the metadata file
 2113     int found = -1;
 2114     for (int j = 0; j < entries_size; j++) {
 2115         if (!strcmp(entry, entries[j])) {
 2116             found = j;
 2117             break;
 2118         }
 2119     }
 2120 
 2121     if (found == -1 && !removing) {    // not in the file and adding
 2122         entries_size++;
 2123         char **bigger_entries = EUCA_ZALLOC(entries_size, sizeof(char *));
 2124         if (bigger_entries == NULL) {
 2125             ret = -1;
 2126             goto cleanup;
 2127         }
 2128         for (int i = 0; i < entries_size - 1; i++) {    // we do not trust realloc
 2129             bigger_entries[i] = entries[i];
 2130         }
 2131         EUCA_FREE(entries);
 2132         entries = bigger_entries;
 2133         entries[entries_size - 1] = strdup(entry);
 2134 
 2135     } else if (found != -1 && removing) {   // in the file and deleting
 2136         EUCA_FREE(entries[found]);
 2137         entries_size--;
 2138         if (entries_size && found != entries_size) {    // still entries left and not deleting last one
 2139             entries[found] = entries[entries_size]; // move the last one over the one we're deleting
 2140         }
 2141 
 2142     } else {                           // nothing to do
 2143         goto cleanup;
 2144     }
 2145 
 2146     // save new entries into the metadata file
 2147     if (write_array_blockblob_metadata_path(path_t, bs, bb_id, entries, entries_size) == -1) {
 2148         ret = -1;
 2149     }
 2150 
 2151 cleanup:
 2152     if (entries != NULL) {
 2153         for (int j = 0; j < entries_size; j++) {
 2154             EUCA_FREE(entries[j]);
 2155         }
 2156         EUCA_FREE(entries);
 2157     }
 2158     return ret;
 2159 }
 2160 
 2161 //!
 2162 //! Retrieves the type of the blockblob metadata path we have.
 2163 //!
 2164 //! @param[in] bs
 2165 //! @param[in] path
 2166 //! @param[in] bb_id
 2167 //! @param[in] bb_id_size
 2168 //!
 2169 //! @return If 'path' looks like a blockblob metadata file (based on the suffix), return the type of the file and
 2170 //!         set bb_id appropriately, else return 0 if it is an unrecognized file, else return -1 for error
 2171 //!
 2172 //! @pre
 2173 //!
 2174 //! @note
 2175 //!
 2176 static int typeof_blockblob_metadata_path(const blobstore * bs, const char *path, char *bb_id, unsigned int bb_id_size)
 2177 {
 2178     assert(path);
 2179     assert(bs->path);
 2180     assert(strstr(path, bs->path) == path);
 2181 
 2182     const char *rel_path = path + strlen(bs->path) + 1; // +1 for '/'
 2183     int p_len = strlen(rel_path);
 2184 
 2185     for (int i = 1; i < BLOCKBLOB_PATH_TOTAL; i++) {    // start at 1 to avoid BLOCKBLOB_PATH_NONE
 2186         char suffix[1024];
 2187         if (bs->format == BLOBSTORE_FORMAT_DIRECTORY) {
 2188             snprintf(suffix, sizeof(suffix), "/%s", blobstore_metadata_suffixes[i]);
 2189         } else {
 2190             snprintf(suffix, sizeof(suffix), ".%s", blobstore_metadata_suffixes[i]);
 2191         }
 2192         unsigned int s_len = strlen(suffix);
 2193         const char *sp = suffix + s_len - 1;    // last char of suffix
 2194         const char *pp = rel_path + p_len - 1;  // last char of (relative) path
 2195         unsigned int matched;
 2196         for (matched = 0; *sp == *pp; sp--, pp--) {
 2197             matched++;
 2198             if (sp == suffix)
 2199                 break;
 2200             if (pp == rel_path)
 2201                 break;
 2202         }
 2203         if (matched == s_len           // whole suffix matched
 2204             && matched < p_len) {      // there is more than the suffix
 2205             if ((bb_id_size - 1) < (p_len - s_len)) // not enough room in bb_id
 2206                 return -1;
 2207             strncpy(bb_id, rel_path, p_len - s_len);    // extract the name, without the suffix
 2208             bb_id[p_len - s_len] = '\0';    // terminate the string
 2209             return i;
 2210         }
 2211     }
 2212     return 0;
 2213 }
 2214 
 2215 //!
 2216 //!
 2217 //!
 2218 //! @param[in] bs
 2219 //! @param[in] bb_id
 2220 //!
 2221 //! @return the number of files and directories deleted as part of removing the
 2222 //!         blob (thus, 0 means there was nothing to delete)
 2223 //!
 2224 //! @pre
 2225 //!
 2226 //! @note
 2227 //!
 2228 static int delete_blockblob_files(const blobstore * bs, const char *bb_id)
 2229 {
 2230     int count = 0;
 2231 
 2232     for (int path_t = 1; path_t < BLOCKBLOB_PATH_TOTAL; path_t++) { // go through all types of blob-related files...
 2233         char path[PATH_MAX];
 2234         set_blockblob_metadata_path((blockblob_path_t) path_t, bs, bb_id, path, sizeof(path));
 2235         if (unlink(path) == 0)         // ...and try deleting them
 2236             count++;
 2237     }
 2238 
 2239     // delete blob's subdirectories if there are any
 2240     char path[PATH_MAX];
 2241     snprintf(path, sizeof(path), "%s/%s%s", bs->path, bb_id, bs->format == BLOBSTORE_FORMAT_DIRECTORY ? "/" : "");
 2242     for (int i = strlen(path) - 1; i > 0; i--) {
 2243         if (path[i] == '/') {
 2244             path[i] = '\0';
 2245             if (rmdir(path) == 0) {
 2246                 count++;
 2247             } else {
 2248                 break;
 2249             }
 2250         }
 2251     }
 2252 
 2253     return count;
 2254 }
 2255 
 2256 //!
 2257 //! Helper for ensuring a directory required by blob exists
 2258 //!
 2259 //! @param[in] bs
 2260 //! @param[in] bb_id
 2261 //!
 2262 //! @return 0 = already existed, 1 = created OK, -1 = error
 2263 //!
 2264 //! @pre
 2265 //!
 2266 //! @note
 2267 //!
 2268 static int ensure_blockblob_metadata_path(const blobstore * bs, const char *bb_id)
 2269 {
 2270     char base[PATH_MAX];
 2271     snprintf(base, sizeof(base), "%s/%s", bs->path, bb_id);
 2272     return ensure_directories_exist(base, !(bs->format == BLOBSTORE_FORMAT_DIRECTORY), NULL, NULL, BLOBSTORE_DIRECTORY_PERM);
 2273 }
 2274 
 2275 //!
 2276 //!
 2277 //!
 2278 //! @param[in] bbs
 2279 //!
 2280 //! @pre
 2281 //!
 2282 //! @note
 2283 //!
 2284 static void free_bbs(blockblob * bbs)
 2285 {
 2286     while (bbs) {
 2287         blockblob *next_bb = bbs->next;
 2288         EUCA_FREE(bbs);
 2289         bbs = next_bb;
 2290     }
 2291 }
 2292 
 2293 //!
 2294 //!
 2295 //!
 2296 //! @param[in] bs
 2297 //! @param[in] bb_id
 2298 //! @param[in] timeout_usec
 2299 //!
 2300 //! @return
 2301 //!
 2302 //! @pre
 2303 //!
 2304 //! @note
 2305 //!
 2306 static unsigned int check_in_use(blobstore * bs, const char *bb_id, long long timeout_usec)
 2307 {
 2308     unsigned int in_use = 0;
 2309     char path[PATH_MAX];
 2310 
 2311     // determine the path of the .lock file for this blob
 2312     set_blockblob_metadata_path(BLOCKBLOB_PATH_LOCK, bs, bb_id, path, sizeof(path));
 2313 
 2314     _err_off();                        // do not complain if metadata files do not exist
 2315     int fd = open_and_lock(path, BLOBSTORE_FLAG_RDWR, timeout_usec, BLOBSTORE_FILE_PERM);   // try opening to see what happens
 2316     if (fd != -1) {
 2317         struct stat s;
 2318         if (fstat(fd, &s) == 0) {
 2319             if (s.st_size > 0) {       // lock file was not truncated before being released => file not properly closed
 2320                 in_use |= BLOCKBLOB_STATUS_ABANDONED;
 2321             }
 2322         }
 2323         close_and_unlock(fd);
 2324     } else {
 2325         in_use |= BLOCKBLOB_STATUS_OPENED;  //! @TODO check if open failed for other reason?
 2326     }
 2327 
 2328     if (read_blockblob_metadata_path(BLOCKBLOB_PATH_REFS, bs, bb_id, path, sizeof(path)) > 0) {
 2329         in_use |= BLOCKBLOB_STATUS_MAPPED;
 2330     }
 2331 
 2332     if (read_blockblob_metadata_path(BLOCKBLOB_PATH_DEPS, bs, bb_id, path, sizeof(path)) > 0) {
 2333         in_use |= BLOCKBLOB_STATUS_BACKED;
 2334     }
 2335 
 2336     if (read_blockblob_metadata_path(BLOCKBLOB_PATH_DM, bs, bb_id, path, sizeof(path)) > 0) {
 2337         in_use |= BLOCKBLOB_STATUS_BACKED;
 2338     }
 2339     _err_on();
 2340 
 2341     return in_use;
 2342 }
 2343 
 2344 //!
 2345 //!
 2346 //!
 2347 //! @param[in] bb
 2348 //!
 2349 //! @pre
 2350 //!
 2351 //! @note
 2352 //!
 2353 static void set_device_path(blockblob * bb)
 2354 {
 2355     char **dm_devs = NULL;
 2356     int dm_devs_size = 0;
 2357 
 2358     _err_off();                        // do not care if .dm file does not exist
 2359     read_array_blockblob_metadata_path(BLOCKBLOB_PATH_DM, bb->store, bb->id, &dm_devs, &dm_devs_size);
 2360     _err_on();
 2361 
 2362     if (dm_devs_size > 0) {            // .dm is there => set device_path to the device-mapper path
 2363         snprintf(bb->device_path, sizeof(bb->device_path), DM_FORMAT, dm_devs[dm_devs_size - 1]);   // main device is the last one
 2364         euca_strncpy(bb->dm_name, dm_devs[dm_devs_size - 1], sizeof(bb->dm_name));
 2365         for (int i = 0; i < dm_devs_size; i++) {
 2366             EUCA_FREE(dm_devs[i]);
 2367         }
 2368         EUCA_FREE(dm_devs);
 2369     } else {                           // .dm is not there => set device_path to loopback
 2370         char lo_dev[PATH_MAX] = "";
 2371         _err_off();                    // do not care if loopback file does not exist
 2372         read_blockblob_metadata_path(BLOCKBLOB_PATH_LOOPBACK, bb->store, bb->id, lo_dev, sizeof(lo_dev));
 2373         _err_on();
 2374         euca_strncpy(bb->device_path, lo_dev, sizeof(bb->device_path));
 2375     }
 2376 }
 2377 
 2378 //!
 2379 //! Given a directory that may contain both blobstore files and
 2380 //! non-blobstore files (e.g., instance metadata and soft-links),
 2381 //! this deletes all files not managed by the blobstore.
 2382 //!
 2383 //! @param[in] bs blobstore that may contains blobs under dir_path
 2384 //! @param[in] dir_path directory in which to delete non-blob files
 2385 //!
 2386 //! @return count of files that the function tried to delete or -1 on error
 2387 //!
 2388 //!
 2389 int blobstore_delete_nonblobs(blobstore * bs, const char *dir_path)
 2390 {
 2391     int ndeleted = 0;
 2392 
 2393     DIR *dir;
 2394     if ((dir = opendir(dir_path)) == NULL) {
 2395         return -1;
 2396     }
 2397 
 2398     struct dirent *dir_entry;
 2399     while ((dir_entry = readdir(dir)) != NULL) {
 2400         char *entry_name = dir_entry->d_name;
 2401 
 2402         if (!strcmp(".", entry_name) || !strcmp("..", entry_name) || !strcmp(BLOBSTORE_METADATA_FILE, entry_name))
 2403             continue;                  // ignore known unrelated files
 2404 
 2405         // get the path of the directory item
 2406         char entry_path[BLOBSTORE_MAX_PATH];
 2407         snprintf(entry_path, sizeof(entry_path), "%s/%s", dir_path, entry_name);
 2408 
 2409         char blob_id[BLOBSTORE_MAX_PATH];
 2410         if (typeof_blockblob_metadata_path(bs, entry_path, blob_id, sizeof(blob_id)) > 0)
 2411             continue;                  // ignore all blobstore files
 2412 
 2413         char *base_name = strdup(dir_path);
 2414         LOGDEBUG("[%s] removing %s\n", basename(base_name), entry_name);
 2415         free(base_name);
 2416         unlink(entry_path);
 2417         ndeleted++;
 2418     }
 2419 
 2420     closedir(dir);
 2421     return ndeleted;
 2422 }
 2423 
 2424 //!
 2425 //!
 2426 //!
 2427 //! @param[in] bs
 2428 //! @param[in] dir_path
 2429 //! @param[in] tail_bb
 2430 //! @param[in] bb_to_avoid
 2431 //!
 2432 //! @return
 2433 //!
 2434 //! @pre
 2435 //!
 2436 //! @note
 2437 //!
 2438 static blockblob **walk_bs(blobstore * bs, const char *dir_path, blockblob ** tail_bb, const blockblob * bb_to_avoid)
 2439 {
 2440     DIR *dir;
 2441     if ((dir = opendir(dir_path)) == NULL) {
 2442         return tail_bb;                // ignore access errors in blobstore directory
 2443     }
 2444 
 2445     struct dirent *dir_entry;
 2446     while ((dir_entry = readdir(dir)) != NULL) {
 2447         char *entry_name = dir_entry->d_name;
 2448 
 2449         if (!strcmp(".", entry_name) || !strcmp("..", entry_name) || !strcmp(BLOBSTORE_METADATA_FILE, entry_name))
 2450             continue;                  // ignore known unrelated files
 2451 
 2452         // get the path of the directory item
 2453         char entry_path[BLOBSTORE_MAX_PATH];
 2454         snprintf(entry_path, sizeof(entry_path), "%s/%s", dir_path, entry_name);
 2455         struct stat sb;
 2456         if (stat(entry_path, &sb) == -1) {
 2457             // ignore access errors in the blobstore directory
 2458             //! @TODO is this wise?
 2459             continue;
 2460         }
 2461         // recurse if this is a directory
 2462         if (S_ISDIR(sb.st_mode)) {
 2463             tail_bb = walk_bs(bs, entry_path, tail_bb, bb_to_avoid);
 2464             if (tail_bb == NULL) {
 2465                 closedir(dir);
 2466                 return NULL;
 2467             }
 2468             continue;
 2469         }
 2470 
 2471         char blob_id[BLOBSTORE_MAX_PATH];
 2472         if (typeof_blockblob_metadata_path(bs, entry_path, blob_id, sizeof(blob_id)) != BLOCKBLOB_PATH_BLOCKS)
 2473             continue;                  // ignore all files except .blocks file
 2474 
 2475         if (bb_to_avoid != NULL && strncmp(blob_id, bb_to_avoid->id, sizeof(blob_id)) == 0)
 2476             continue;                  // avoid that particular blockblob
 2477 
 2478         blockblob *bb = EUCA_ZALLOC(1, sizeof(blockblob));
 2479         if (bb == NULL) {
 2480             goto free;
 2481         }
 2482         *tail_bb = bb;                 // add to LL
 2483         tail_bb = &(bb->next);
 2484 
 2485         // fill out the struct
 2486         bb->store = bs;
 2487         euca_strncpy(bb->id, blob_id, sizeof(bb->id));
 2488         euca_strncpy(bb->blocks_path, entry_path, sizeof(bb->blocks_path));
 2489         set_device_path(bb);           // read .dm and .loopback and set bb->device_path accordingly
 2490         bb->size_bytes = sb.st_size;
 2491         bb->blocks_allocated = sb.st_blocks;
 2492         bb->last_accessed = sb.st_atime;
 2493         bb->last_modified = sb.st_mtime;
 2494         bb->snapshot_type = BLOBSTORE_FORMAT_ANY;   // it is not necessary to know whether this is a snapshot
 2495         bb->in_use = check_in_use(bs, bb->id, 0);
 2496 
 2497         // see if it's hollow
 2498         char buf[64];
 2499         if (read_blockblob_metadata_path(BLOCKBLOB_PATH_HOLLOW, bb->store, bb->id, buf, sizeof(buf)) != -1) {
 2500             bb->is_hollow = TRUE;
 2501         }
 2502         // if there is a .refs file, subtract the mapped blocks, if any, from the size
 2503         char **array = NULL;
 2504         int array_size = 0;
 2505         if (read_array_blockblob_metadata_path(BLOCKBLOB_PATH_DEPS, bb->store, bb->id, &array, &array_size) != -1) {
 2506             for (int i = 0; i < array_size; i++) {
 2507                 char *store_path = NULL;
 2508                 char *blob_id = NULL;
 2509                 char *rel_type = NULL;
 2510                 char *start_block = NULL;
 2511                 char *len_blocks = NULL;
 2512 
 2513                 store_path = strtok(array[i], " ");
 2514                 blob_id = strtok(NULL, " ");
 2515                 rel_type = strtok(NULL, " ");
 2516                 start_block = strtok(NULL, " ");
 2517                 len_blocks = strtok(NULL, " ");
 2518                 if (rel_type && len_blocks && strcmp(rel_type, blobstore_relation_type_name[BLOBSTORE_MAP]) == 0) {
 2519                     bb->size_bytes -= strtoull(len_blocks, NULL, 0) * 512LL;
 2520                 }
 2521             }
 2522         }
 2523 
 2524         if (array) {
 2525             for (int i = 0; i < array_size; i++)
 2526                 EUCA_FREE(array[i]);
 2527             EUCA_FREE(array);
 2528         }
 2529     }
 2530 
 2531 free:
 2532     closedir(dir);
 2533     return tail_bb;
 2534 }
 2535 
 2536 //!
 2537 //! Runs through the blobstore and puts all found blockblobs into a linked list, returning its head
 2538 //!
 2539 //! @param[in] bs
 2540 //! @param[in] bb_to_avoid
 2541 //!
 2542 //! @return A pointer to the head of a linked list containing all found blockblobs
 2543 //!
 2544 //! @pre
 2545 //!
 2546 //! @note
 2547 //!
 2548 static blockblob *scan_blobstore(blobstore * bs, const blockblob * bb_to_avoid)
 2549 {
 2550     blockblob *bbs = NULL;
 2551     if (walk_bs(bs, bs->path, &bbs, bb_to_avoid) == NULL) {
 2552         if (bbs)
 2553             free_bbs(bbs);
 2554         bbs = NULL;
 2555     }
 2556 
 2557     return bbs;
 2558 }
 2559 
 2560 //!
 2561 //!
 2562 //!
 2563 //! @param[in] bb1
 2564 //! @param[in] bb2
 2565 //!
 2566 //! @return
 2567 //!
 2568 //! @pre
 2569 //!
 2570 //! @note
 2571 //!
 2572 static int compare_bbs(const void *bb1, const void *bb2)
 2573 {
 2574     return (int)((*(blockblob **) bb1)->last_modified - (*(blockblob **) bb2)->last_modified);
 2575 }
 2576 
 2577 //!
 2578 //!
 2579 //!
 2580 //! @param[in] bs
 2581 //! @param[in] bb_list
 2582 //! @param[in] need_blocks
 2583 //!
 2584 //! @return
 2585 //!
 2586 //! @pre
 2587 //!
 2588 //! @note
 2589 //!
 2590 static long long purge_blockblobs_lru(blobstore * bs, blockblob * bb_list, long long need_blocks)
 2591 {
 2592     int list_length = 0;
 2593     long long purged = 0;
 2594 
 2595     for (blockblob * bb = bb_list; bb; bb = bb->next) {
 2596         list_length++;
 2597     }
 2598 
 2599     if (list_length) {
 2600         blockblob *bb;
 2601         int i;
 2602 
 2603         blockblob **bb_array = (blockblob **) EUCA_ZALLOC(list_length, sizeof(blockblob *));
 2604         if (!bb_array)
 2605             return purged;
 2606 
 2607         for (i = 0, bb = bb_list; bb; bb = bb->next, i++) {
 2608             bb_array[i] = bb;
 2609         }
 2610 
 2611         qsort(bb_array, list_length, sizeof(blockblob *), compare_bbs);
 2612 
 2613         int iteration = 0;
 2614         int deleted;
 2615         do {
 2616             // iterate multiple times in case there are dependencies
 2617             //! @TODO unify with _fsck's iteration code?
 2618             deleted = 0;               // deleted in this round
 2619             for (i = 0; i < list_length; i++) {
 2620                 bb = bb_array[i];
 2621                 if (bb == NULL)        // was either deleted or deemed undeletable on previous iteration
 2622                     continue;
 2623                 bb->in_use = check_in_use(bs, bb->id, 0);   // record in-use status
 2624 
 2625                 char code = '?';
 2626                 if (bb->in_use & BLOCKBLOB_STATUS_MAPPED) {
 2627                     // mapped blobs have children, thus cannot be deleted at this iteration
 2628                     code = 'C';
 2629 
 2630                 } else if (bb->in_use & BLOCKBLOB_STATUS_OPENED) {
 2631                     bb_array[i] = NULL; // mark it to skip in the future
 2632                     code = 'O';
 2633 
 2634                 } else if (delete_blob_state(bb, BLOBSTORE_DELETE_TIMEOUT_USEC, 1) == -1) {
 2635                     bb_array[i] = NULL; // mark it to skip in the future
 2636                     code = '!';
 2637 
 2638                 } else {
 2639                     purged += round_up_sec(bb->size_bytes) / 512;
 2640                     bb_array[i] = NULL; // mark it to skip in the future
 2641                     code = 'D';
 2642                     deleted++;
 2643                 }
 2644                 LOGDEBUG("LRU %d %08lld: %29s %c%c%c%c %c %9llu %s", iteration, purged, bb->id, (bb->in_use & BLOCKBLOB_STATUS_OPENED) ? ('o') : ('-'), // o = open
 2645                          (bb->in_use & BLOCKBLOB_STATUS_BACKED) ? ('p') : ('-'),    // p = has parents
 2646                          (bb->in_use & BLOCKBLOB_STATUS_MAPPED) ? ('c') : ('-'),    // c = has children
 2647                          (bb->in_use & BLOCKBLOB_STATUS_ABANDONED) ? ('a') : ('-'), // a = was abandoned
 2648                          code,         // outcome codes: D=deleted, else C=children, !=undeletable, O=open
 2649                          bb->size_bytes / 512L, // size is in sectors
 2650                          ctime(&(bb->last_modified)));  // ctime adds a newline
 2651                 if (purged >= need_blocks)
 2652                     break;
 2653             }
 2654             iteration++;
 2655         } while (deleted && (purged < need_blocks));
 2656         EUCA_FREE(bb_array);
 2657     }
 2658 
 2659     return purged;
 2660 }
 2661 
 2662 //!
 2663 //!
 2664 //!
 2665 //! @param[in] bs
 2666 //! @param[in] meta
 2667 //!
 2668 //! @return
 2669 //!
 2670 //! @pre
 2671 //!
 2672 //! @note
 2673 //!
 2674 int blobstore_stat(blobstore * bs, blobstore_meta * meta)
 2675 {
 2676     int ret = 0;
 2677 
 2678     if (blobstore_lock(bs, BLOBSTORE_LOCK_TIMEOUT_USEC) == -1) {    // lock it so we can traverse blobstore safely
 2679         return EUCA_ERROR;
 2680     }
 2681     // put existing items in the blobstore into a LL
 2682     _blobstore_errno = BLOBSTORE_ERROR_OK;
 2683     blockblob *bbs = scan_blobstore(bs, NULL);
 2684     if (bbs == NULL) {
 2685         if (_blobstore_errno != BLOBSTORE_ERROR_OK) {
 2686             goto unlock;
 2687         }
 2688     }
 2689     // analyze the LL, calculating sizes
 2690     meta->blocks_allocated = 0;
 2691     meta->blocks_unlocked = 0;
 2692     meta->blocks_locked = 0;
 2693     meta->num_blobs = 0;
 2694     for (blockblob * abb = bbs; abb;) {
 2695         //! @TODO unify this with locked/unlocked calculation in open()
 2696         long long abb_size_blocks = round_up_sec(abb->size_bytes) / 512;
 2697         if (abb->in_use & BLOCKBLOB_STATUS_OPENED) {
 2698             // these can't be purged if we need space
 2699             //! @TODO look into recursive purging of unused references?
 2700             meta->blocks_locked += abb_size_blocks;
 2701         } else {
 2702             // these potentially can be purged, unless they are depended on by locked ones
 2703             meta->blocks_unlocked += abb_size_blocks;
 2704         }
 2705         meta->blocks_allocated += abb->blocks_allocated;
 2706         meta->num_blobs++;
 2707 
 2708         // free this node and move the pointer
 2709         blockblob *old_bb = abb;
 2710         abb = abb->next;
 2711         EUCA_FREE(old_bb);
 2712     }
 2713 
 2714 unlock:
 2715 
 2716     if (blobstore_unlock(bs) == -1) {
 2717         ERR(BLOBSTORE_ERROR_UNKNOWN, "failed to unlock the blobstore");
 2718     }
 2719 
 2720     euca_strncpy(meta->id, bs->id, sizeof(meta->id));
 2721     meta->revocation_policy = bs->revocation_policy;
 2722     meta->snapshot_policy = bs->snapshot_policy;
 2723     meta->format = bs->format;
 2724     meta->blocks_limit = bs->limit_blocks;
 2725     if (realpath(bs->path, meta->path) == NULL) {
 2726         LOGERROR("failed to resolve the blobstore path %s\n", bs->path);
 2727         ret = EUCA_ERROR;
 2728     }
 2729 
 2730     return ret;
 2731 }
 2732 
 2733 //!
 2734 //! Read .refs file content and return any entries that point to blobs that no longer exist
 2735 //!
 2736 //! @param[in]  bb
 2737 //! @param[out] refs
 2738 //!
 2739 //! @return size of the array placed into *refs, which caller must free, or -1 on error
 2740 //!
 2741 //! @pre
 2742 //!
 2743 //! @note
 2744 //!
 2745 static int get_stale_refs(const blockblob * bb, char ***refs)
 2746 {
 2747     blobstore *bs = bb->store;
 2748     char **array = NULL;
 2749     int array_size = 0;
 2750     int stale_refs = 0;
 2751 
 2752     if (read_array_blockblob_metadata_path(BLOCKBLOB_PATH_REFS, bb->store, bb->id, &array, &array_size) != -1) {
 2753         for (int i = 0; i < array_size; i++) {
 2754             char ref[BLOBSTORE_MAX_PATH + MAX_DM_NAME + 1];
 2755             euca_strncpy(ref, array[i], sizeof(ref));
 2756 
 2757             char *store_path = strtok(array[i], " ");
 2758             char *blob_id = strtok(NULL, " ");  // the remaining entries in array[i] are ignored
 2759             char ref_exists = 0;
 2760 
 2761             if (strlen(store_path) < 1 || strlen(blob_id) < 1)
 2762                 goto stale_ref;
 2763 
 2764             blobstore *ref_bs = bs;
 2765             if (strcmp(bs->path, store_path)) { // if deleting reference in a different blobstore
 2766                 // need to open it
 2767                 ref_bs = blobstore_open(store_path, 0, BLOBSTORE_FLAG_CREAT, BLOBSTORE_FORMAT_ANY, BLOBSTORE_REVOCATION_ANY, BLOBSTORE_SNAPSHOT_ANY);
 2768                 if (ref_bs == NULL)    // blobstore with a child blob does not exist
 2769                     goto stale_ref;
 2770             }
 2771 
 2772             blockblob *ref_bb = blockblob_open(ref_bs, blob_id, 0, 0, NULL, BLOBSTORE_FIND_TIMEOUT_USEC);
 2773             if (ref_bb) {
 2774                 blockblob_close(ref_bb);
 2775                 ref_exists = 1;
 2776             } else {
 2777                 if (_blobstore_errno != BLOBSTORE_ERROR_NOENT)  // conservatively assume that unless the error says otherwise, the blob exists
 2778                     ref_exists = 1;
 2779             }
 2780             if (ref_bs != bs) {
 2781                 blobstore_close(ref_bs);
 2782             }
 2783 
 2784 stale_ref:
 2785 
 2786             if (ref_exists) {
 2787                 EUCA_FREE(array[i]);   // free names of refs that exist
 2788             } else {
 2789                 strcpy(array[i], ref); // since strtok() clobbered the original value
 2790                 stale_refs++;
 2791             }
 2792         }
 2793     }
 2794 
 2795     if (stale_refs > 0) {
 2796         if (refs) {
 2797             *refs = EUCA_ZALLOC(stale_refs, sizeof(char *));
 2798             if (*refs == NULL) {
 2799                 stale_refs = -1;       // OOM error
 2800             }
 2801         }
 2802         for (int i = 0, j = 0; i < array_size; i++) {
 2803             if (array[i]) {            // ref does not exist
 2804                 if (refs && *refs) {
 2805                     (*refs)[j++] = array[i];
 2806                     assert(j <= stale_refs);
 2807                 } else {
 2808                     EUCA_FREE(array[i]);
 2809                 }
 2810             }
 2811         }
 2812     }
 2813 
 2814     if (array_size > 0)
 2815         EUCA_FREE(array);
 2816 
 2817     return stale_refs;
 2818 }
 2819 
 2820 //!
 2821 //! Checks the integrity check of the blobstore. With a non-NULL examiner(), each found
 2822 //! blob is passed to it for examination and the blob is deleted if function returns non-zero
 2823 //!
 2824 //! @param[in] bs
 2825 //! @param[in] examiner
 2826 //!
 2827 //! @return
 2828 //!
 2829 //! @pre
 2830 //!
 2831 //! @note
 2832 //!
 2833 int blobstore_fsck(blobstore * bs, int (*examiner) (const blockblob * bb))
 2834 {
 2835     int ret = 0;
 2836 
 2837     if (blobstore_lock(bs, BLOBSTORE_LOCK_TIMEOUT_USEC) == -1) {    // lock it so we can traverse blobstore safely
 2838         ERR(BLOBSTORE_ERROR_UNKNOWN, "failed to lock the blobstore");
 2839         return -1;
 2840     }
 2841     // put existing items in the blobstore into a LL
 2842     _blobstore_errno = BLOBSTORE_ERROR_OK;
 2843     blockblob *bbs = scan_blobstore(bs, NULL);
 2844 
 2845     if (blobstore_unlock(bs) == -1) {
 2846         ERR(BLOBSTORE_ERROR_UNKNOWN, "failed to unlock the blobstore");
 2847         ret = -1;
 2848         goto free;
 2849     }
 2850 
 2851     if (bbs == NULL) {
 2852         if (_blobstore_errno != BLOBSTORE_ERROR_OK) {
 2853             ret = -1;
 2854         }
 2855         goto free;
 2856     }
 2857 
 2858     {                                  // check objects in the blobstore
 2859 
 2860         unsigned int num_blobs = 0;
 2861         unsigned int blobs_deleted = 0;
 2862         unsigned int blobs_undeletable = 0;
 2863         unsigned int blobs_unopenable = 0;
 2864         unsigned int to_delete_prev = 0;
 2865         unsigned int iterations = 1;
 2866         for (; iterations < 10; iterations++) { // outer loop for multiple iterations over the list
 2867             unsigned int to_delete = 0;
 2868 
 2869             // run through LL, examining each blockblob
 2870             for (blockblob * abb = bbs; abb; abb = abb->next) {
 2871                 if (iterations == 1)
 2872                     num_blobs++;       // count all blobs on the first iteration
 2873 
 2874                 if (abb->store == NULL) // these were cleared or condemned on a previous iteration
 2875                     continue;
 2876 
 2877                 // examiner(), if specified, tell us whether to delete the blob
 2878                 if (blockblob_check(abb) || // blob state is inconsistent
 2879                     (examiner && examiner(abb))) {  // blobstore user condemned the blob
 2880 
 2881                     blockblob *bb = blockblob_open(bs, abb->id, 0, 0, NULL, BLOBSTORE_FIND_TIMEOUT_USEC);
 2882                     if (bb != NULL) {
 2883                         if (bb->in_use & BLOCKBLOB_STATUS_MAPPED) {
 2884 
 2885                             // Since we are checking integrity, do not trust .refs file blindly,
 2886                             // but ensure that the entries -- blobs depending on this one -- exist
 2887 
 2888                             char **stale_refs;
 2889                             int num_stale_refs = get_stale_refs(bb, &stale_refs);
 2890                             if (num_stale_refs > 0) {
 2891                                 for (int i = 0; i < num_stale_refs; i++) {
 2892                                     // update the .refs file to remove this entry
 2893                                     LOGINFO("removing stale/corrupted reference in blob %s to %s\n", bb->id, stale_refs[i]);
 2894                                     update_entry_blockblob_metadata_path(BLOCKBLOB_PATH_REFS, bb->store, bb->id, stale_refs[i], 1);
 2895                                     EUCA_FREE(stale_refs[i]);
 2896                                 }
 2897                                 EUCA_FREE(stale_refs);
 2898                             }
 2899                             // mapped blobs have children, thus cannot be deleted at this iteration
 2900                             blockblob_close(bb);
 2901                             to_delete++;
 2902 
 2903                         } else if (blockblob_delete(bb, BLOBSTORE_DELETE_TIMEOUT_USEC, 1) == -1) {
 2904                             LOGWARN("WARNING: failed to delete blockblob %s\n", abb->id);
 2905                             blockblob_close(bb);
 2906                             abb->store = NULL;  // so it will get skipped on next iteration
 2907                             blobs_undeletable++;
 2908 
 2909                         } else {
 2910                             LOGINFO("deleted stale/corrupted blob %s\n", abb->id);
 2911                             abb->store = NULL;  // so it will get skipped on next iteration
 2912                             blobs_deleted++;
 2913                         }
 2914                     } else {
 2915                         LOGWARN("could not open blockblob %s (it may be in use)\n", abb->id);
 2916                         abb->store = NULL;  // so it will get skipped on next iteration
 2917                         blobs_unopenable++;
 2918                     }
 2919                 }
 2920             }
 2921             assert(iterations < 11);
 2922 
 2923             if (to_delete == to_delete_prev)    // could not delete anything new this iteration
 2924                 break;
 2925             to_delete_prev = to_delete;
 2926             if (to_delete == 0)
 2927                 break;
 2928         }
 2929 
 2930         if (num_blobs > 0)
 2931             LOGINFO("%s: examined %d blob(s) in %d iteration(s): "
 2932                     "deleted %d, failed on %d + %d, failed to open %d\n", bs->path, num_blobs, iterations, blobs_deleted, to_delete_prev, blobs_undeletable, blobs_unopenable);
 2933     }
 2934 free:
 2935     if (bbs) {
 2936         free_bbs(bbs);
 2937     }
 2938 
 2939     return ret;
 2940 }
 2941 
 2942 //!
 2943 //!
 2944 //!
 2945 //! @param[in]  bs
 2946 //! @param[in]  regex
 2947 //! @param[out] results
 2948 //!
 2949 //! @return
 2950 //!
 2951 //! @pre
 2952 //!
 2953 //! @note
 2954 //!
 2955 int blobstore_search(blobstore * bs, const char *regex, blockblob_meta ** results)
 2956 {
 2957     blockblob_meta *head = NULL;
 2958     blockblob *bbs = NULL;
 2959     int ret = 0;
 2960     regex_t re;
 2961 
 2962     if (regcomp(&re, regex, REG_NOSUB) != 0) {
 2963         ERR(BLOBSTORE_ERROR_UNKNOWN, "failed to parse search regular expression");
 2964         return -1;
 2965     }
 2966 
 2967     int blobstore_locked = 0;
 2968     if (blobstore_lock(bs, BLOBSTORE_LOCK_TIMEOUT_USEC) == -1) {    // lock it so we can traverse blobstore safely
 2969         ERR(BLOBSTORE_ERROR_UNKNOWN, "failed to lock the blobstore");
 2970         ret = -1;
 2971         goto free;
 2972     } else {
 2973         blobstore_locked = 1;
 2974     }
 2975     // put existing items in the blobstore into a LL
 2976     _blobstore_errno = BLOBSTORE_ERROR_OK;
 2977     bbs = scan_blobstore(bs, NULL);
 2978     if (bbs == NULL) {
 2979         if (_blobstore_errno != BLOBSTORE_ERROR_OK) {
 2980             ret = -1;
 2981             goto free;
 2982         }
 2983     }
 2984     // run through LL, looking for matches
 2985     unsigned int num_blobs = 0;
 2986     unsigned int blobs_matched = 0;
 2987     blockblob_meta *prev = NULL;
 2988     for (blockblob * abb = bbs; abb; abb = abb->next) {
 2989         num_blobs++;
 2990         if (regexec(&re, abb->id, 0, NULL, 0) != 0)
 2991             continue;
 2992         blobs_matched++;
 2993 
 2994         blockblob_meta *bm = EUCA_ZALLOC(1, sizeof(blockblob_meta));
 2995         if (bm == NULL) {
 2996             ERR(BLOBSTORE_ERROR_NOMEM, NULL);
 2997             ret = -1;
 2998             goto free;
 2999         }
 3000 
 3001         euca_strncpy(bm->id, abb->id, sizeof(bm->id));
 3002         bm->bs = bs;
 3003         bm->size_bytes = abb->size_bytes;
 3004         bm->in_use = abb->in_use;
 3005         bm->is_hollow = abb->is_hollow;
 3006         bm->last_accessed = abb->last_accessed;
 3007         bm->last_modified = abb->last_modified;
 3008         if (head == NULL) {
 3009             head = bm;
 3010         } else {
 3011             prev->next = bm;
 3012             bm->prev = prev;
 3013         }
 3014         prev = bm;
 3015     }
 3016 
 3017     ret = blobs_matched;
 3018 
 3019 free:
 3020     regfree(&re);                      // free the regular expression
 3021     if (bbs)
 3022         free_bbs(bbs);                 // free the blockblobs LL returned by the search function
 3023 
 3024     if (blobstore_locked && blobstore_unlock(bs) == -1) {
 3025         ERR(BLOBSTORE_ERROR_UNKNOWN, "failed to unlock the blobstore");
 3026         ret = -1;
 3027     }
 3028 
 3029     if (ret < 0) {                     // there were problems, so free the partial linked list, if any
 3030         for (blockblob_meta * bm = head; bm;) {
 3031             blockblob_meta *next = bm->next;
 3032             EUCA_FREE(bm);
 3033             bm = next;
 3034         }
 3035     } else {
 3036         *results = head;
 3037     }
 3038 
 3039     return ret;
 3040 }
 3041 
 3042 //!
 3043 //!
 3044 //!
 3045 //! @param[in] bs
 3046 //! @param[in] regex
 3047 //!
 3048 //! @return
 3049 //!
 3050 //! @pre
 3051 //!
 3052 //! @note
 3053 //!
 3054 int blobstore_delete_regex(blobstore * bs, const char *regex)
 3055 {
 3056     blockblob_meta *matches = NULL;
 3057     int found = blobstore_search(bs, regex, &matches);
 3058     int left_to_delete = found;
 3059     int deleted;
 3060     do {
 3061         // iterate multiple times in case there are dependencies
 3062         //! @TODO unify with _fsck's iteration code?
 3063         deleted = 0;                   // deleted in this round
 3064         for (blockblob_meta * bm = matches; bm; bm = bm->next) {
 3065             blockblob *bb = blockblob_open(bs, bm->id, 0, 0, NULL, BLOBSTORE_FIND_TIMEOUT_USEC);
 3066             if (bb != NULL) {
 3067                 if (bb->in_use & BLOCKBLOB_STATUS_MAPPED) {
 3068                     // mapped blobs have children, thus cannot be deleted at this iteration
 3069                     blockblob_close(bb);
 3070                     continue;
 3071                 }
 3072                 if (blockblob_delete(bb, BLOBSTORE_DELETE_TIMEOUT_USEC, 0) == -1) {
 3073                     blockblob_close(bb);
 3074                 } else {
 3075                     deleted++;
 3076                 }
 3077             }
 3078         }
 3079     } while (deleted && (left_to_delete -= deleted));
 3080 
 3081     // free the search results
 3082     for (blockblob_meta * bm = matches; bm;) {
 3083         blockblob_meta *next = bm->next;
 3084         EUCA_FREE(bm);
 3085         bm = next;
 3086     }
 3087 
 3088     return (left_to_delete == 0) ? (found) : (-1);
 3089 }
 3090 
 3091 //!
 3092 //!
 3093 //!
 3094 //! @param[in] bs
 3095 //! @param[in] id can be NULL if creating, in which case blobstore will pick a random ID
 3096 //! @param[in] size_bytes on create: reserve this size; on open: verify the size, unless set to 0
 3097 //! @param[in] flags BLOBSTORE_FLAG_CREAT | BLOBSTORE_FLAG_EXCL - same semantcs as for open() flags, BLOBSTORE_FLAG_HOLLOW - when creating
 3098 //! @param[in] sig if non-NULL, on create sig is recorded, on open it is verified
 3099 //! @param[in] timeout_usec maximum wait, in microseconds
 3100 //!
 3101 //! @return
 3102 //!
 3103 //! @pre
 3104 //!
 3105 //! @note
 3106 //!
 3107 blockblob *blockblob_open(blobstore * bs, const char *id, unsigned long long size_bytes, unsigned int flags, const char *sig, unsigned long long timeout_usec)
 3108 {
 3109     long long size_blocks = round_up_sec(size_bytes) / 512;
 3110     if (flags & ~(BLOBSTORE_FLAG_CREAT | BLOBSTORE_FLAG_EXCL | BLOBSTORE_FLAG_HOLLOW)) {
 3111         ERR(BLOBSTORE_ERROR_INVAL, "only _CREAT, _EXCL, & _HOLLOW flags are allowed");
 3112         return NULL;
 3113     }
 3114     if (id == NULL && !(flags & BLOBSTORE_FLAG_CREAT)) {
 3115         ERR(BLOBSTORE_ERROR_INVAL, "NULL id is only allowed with _CREAT");
 3116         return NULL;
 3117     }
 3118     if (size_blocks == 0 && (flags & BLOBSTORE_FLAG_CREAT)) {
 3119         ERR(BLOBSTORE_ERROR_INVAL, "size_blocks can be 0 only without _CREAT");
 3120         return NULL;
 3121     }
 3122     if (size_blocks != 0 && (flags & BLOBSTORE_FLAG_CREAT) && (size_blocks > bs->limit_blocks) && !(flags && BLOBSTORE_FLAG_HOLLOW)) {
 3123         ERR(BLOBSTORE_ERROR_NOSPC, NULL);
 3124         return NULL;
 3125     }
 3126 
 3127     LOGTRACE("{%u} blockblob_open: opening blob id=%s flags=%d timeout=%lld\n", (unsigned int)pthread_self(), id, flags, timeout_usec);
 3128 
 3129     blockblob *bbs = NULL;             // a temp LL of blockblobs, used for computing free space and for purging
 3130     blockblob *bb = EUCA_ZALLOC(1, sizeof(blockblob));
 3131     if (bb == NULL) {
 3132         ERR(BLOBSTORE_ERROR_NOMEM, NULL);
 3133         goto out;
 3134     }
 3135 
 3136     bb->store = bs;
 3137     if (id) {
 3138         euca_strncpy(bb->id, id, sizeof(bb->id));
 3139     } else {
 3140         gen_id(bb->id, sizeof(bb->id));
 3141     }
 3142     bb->fd_lock = -1;
 3143     bb->fd_blocks = -1;
 3144     bb->size_bytes = size_bytes;
 3145     set_blockblob_metadata_path(BLOCKBLOB_PATH_BLOCKS, bs, bb->id, bb->blocks_path, sizeof(bb->blocks_path));
 3146 
 3147     int blobstore_locked = 0;
 3148     if (blobstore_lock(bs, timeout_usec) == -1) {   // lock it so we can create blob's file atomically
 3149         goto free;                     // failed to obtain a lock on the blobstore
 3150     } else {
 3151         blobstore_locked = 1;
 3152     }
 3153 
 3154     //! @TODO maybe don't create directories needlessly if flags==0?
 3155     int created_directory = ensure_blockblob_metadata_path(bs, bb->id);
 3156     if (created_directory == -1) {
 3157         PROPAGATE_ERR(BLOBSTORE_ERROR_UNKNOWN);
 3158         goto unlock;
 3159     }
 3160     if (blobstore_unlock(bs) == -1) {
 3161         ERR(BLOBSTORE_ERROR_UNKNOWN, "failed to unlock the blobstore");
 3162         goto free;
 3163     }
 3164     blobstore_locked = 0;
 3165 
 3166     int created_blob = 0;
 3167     char lpath[PATH_MAX];
 3168     set_blockblob_metadata_path(BLOCKBLOB_PATH_LOCK, bs, bb->id, lpath, sizeof(lpath));
 3169     bb->fd_lock = open_and_lock(lpath, flags | BLOBSTORE_FLAG_RDWR, timeout_usec, BLOBSTORE_FILE_PERM); // blobs are always opened with exclusive write access
 3170     if (bb->fd_lock == -1) {
 3171         // failed to open/create and lock the blockblob
 3172         goto clean;
 3173     }
 3174     char thread_id[512];
 3175     int thread_id_len = 0;
 3176     snprintf(thread_id, sizeof(thread_id), "%d/%u", getpid(), (unsigned int)pthread_self());
 3177     thread_id_len = strlen(thread_id);
 3178     if (write(bb->fd_lock, thread_id, thread_id_len) != thread_id_len) {
 3179         // Fail to write our thread indentifier in the lock file.
 3180         ERR(BLOBSTORE_ERROR_UNKNOWN, "failed to write to the blobstore");
 3181         goto clean;
 3182     }
 3183     // convert BLOBSTORE_* flags into standard Posix open() flags and open/create the blocks file
 3184     int o_flags = 0;
 3185     if (flags & BLOBSTORE_FLAG_RDONLY) {
 3186         o_flags |= O_RDONLY;
 3187     } else if ((flags & BLOBSTORE_FLAG_RDWR) || (flags & BLOBSTORE_FLAG_CREAT)) {
 3188         o_flags |= O_RDWR;
 3189         if (flags & BLOBSTORE_FLAG_CREAT) {
 3190             o_flags |= O_CREAT;
 3191             // intentionally ignore _EXCL supplied without _CREAT
 3192             if (flags & BLOBSTORE_FLAG_EXCL)
 3193                 o_flags |= O_EXCL;
 3194         }
 3195     }
 3196     bb->fd_blocks = open(bb->blocks_path, o_flags, BLOBSTORE_FILE_PERM);
 3197     if (bb->fd_blocks == -1) {         // failed to open/create the content file
 3198         PROPAGATE_ERR(BLOBSTORE_ERROR_UNKNOWN);
 3199         goto clean;
 3200     }
 3201 
 3202     struct stat sb;
 3203     if (fstat(bb->fd_blocks, &sb) == -1) {
 3204         goto clean;
 3205     }
 3206 
 3207     if (sb.st_size == 0) {             // new blob
 3208         created_blob = 1;
 3209 
 3210         if (blobstore_lock(bs, timeout_usec) == -1) {   // lock it so we can traverse blobstore safely
 3211             goto clean;                // failed to obtain a lock on the blobstore
 3212         } else {
 3213             blobstore_locked = 1;
 3214         }
 3215 
 3216         // put existing items in the blobstore into a LL
 3217         _blobstore_errno = BLOBSTORE_ERROR_OK;
 3218         bbs = scan_blobstore(bs, bb);
 3219         if (bbs == NULL) {
 3220             if (_blobstore_errno != BLOBSTORE_ERROR_OK) {
 3221                 goto clean;
 3222             }
 3223         }
 3224         // a bit of a hack: HOLLOW blobs skip the blobstore limit check upon creation
 3225         if (flags & BLOBSTORE_FLAG_HOLLOW) {
 3226             bb->is_hollow = TRUE;
 3227             if (write_blockblob_metadata_path(BLOCKBLOB_PATH_HOLLOW, bs, bb->id, "this blob is hollow\n"))
 3228                 goto clean;
 3229 
 3230         } else {                       // enforce blobstore limits
 3231 
 3232             // analyze the LL, calculating sizes
 3233             long long blocks_unlocked = 0;
 3234             long long blocks_locked = 0;
 3235             unsigned int num_blobs = 0;
 3236             for (blockblob * abb = bbs; abb; abb = abb->next) {
 3237                 long long abb_size_blocks = round_up_sec(abb->size_bytes) / 512;
 3238                 if (abb->is_hollow)
 3239                     abb_size_blocks = 0;
 3240                 if (abb->in_use & BLOCKBLOB_STATUS_OPENED) {
 3241                     // these can't be purged if we need space
 3242                     //! @TODO look into recursive purging of unused references?
 3243                     blocks_locked += abb_size_blocks;
 3244                 } else {
 3245                     blocks_unlocked += abb_size_blocks; // these potentially can be purged, unless they are depended on by locked ones
 3246                 }
 3247                 num_blobs++;
 3248             }
 3249 
 3250             long long blocks_free = bs->limit_blocks - (blocks_unlocked + blocks_locked);
 3251             if (blocks_free < size_blocks) {
 3252                 if (!(bs->revocation_policy == BLOBSTORE_REVOCATION_LRU)    // not allowed to purge
 3253                     || (blocks_free + blocks_unlocked) < size_blocks) { // not enough purgeable material
 3254                     ERR(BLOBSTORE_ERROR_NOSPC, NULL);
 3255                     goto clean;
 3256                 }
 3257                 long long blocks_needed = size_blocks - blocks_free;
 3258                 _err_off();            // do not care about errors duing purging
 3259                 long long blocks_freed = purge_blockblobs_lru(bs, bbs, blocks_needed);
 3260                 _err_on();
 3261                 if (blocks_freed < blocks_needed) {
 3262                     ERR(BLOBSTORE_ERROR_NOSPC, "could not purge enough from cache");
 3263                     goto clean;
 3264                 }
 3265             }
 3266         }
 3267 
 3268         if (lseek(bb->fd_blocks, size_bytes - 1, SEEK_CUR) == (off_t) - 1) {    // create a file with a hole
 3269             PROPAGATE_ERR(BLOBSTORE_ERROR_UNKNOWN);
 3270             goto clean;
 3271         }
 3272         if (write(bb->fd_blocks, zero_buf, 1) != (ssize_t) 1) {
 3273             PROPAGATE_ERR(BLOBSTORE_ERROR_UNKNOWN);
 3274             goto clean;
 3275         }
 3276         if (sig)
 3277             if (write_blockblob_metadata_path(BLOCKBLOB_PATH_SIG, bs, bb->id, sig)) {
 3278                 goto clean;
 3279             }
 3280         bb->snapshot_type = BLOBSTORE_SNAPSHOT_NONE;    // just created, so not a snapshot
 3281 
 3282         if (blobstore_unlock(bs) == -1) {
 3283             ERR(BLOBSTORE_ERROR_UNKNOWN, "failed to unlock the blobstore");
 3284         }
 3285         blobstore_locked = 0;
 3286 
 3287     } else {                           // blob existed
 3288 
 3289         char buf[BLOBSTORE_SIG_MAX];
 3290 
 3291         if (bb->size_bytes == 0) {     // find out the size from the file size
 3292             bb->size_bytes = sb.st_size;
 3293         } else if (bb->size_bytes != sb.st_size) {  // verify the size specified by the user
 3294             LOGERROR("{%u} encountered a size mismatch when opening a blob (requested %lld, found %ld)\n", (unsigned int)pthread_self(), bb->size_bytes, sb.st_size);
 3295             ERR(BLOBSTORE_ERROR_SIGNATURE, "size of the existing blockblob does not match");
 3296             goto clean;
 3297         }
 3298         // determine whether this blob is a map of another,
 3299         // in which case the blocks are backing and should
 3300         // not be accessed directly
 3301         if (read_blockblob_metadata_path(BLOCKBLOB_PATH_DM, bs, bb->id, buf, sizeof(buf)) > 0) {
 3302             bb->snapshot_type = BLOBSTORE_SNAPSHOT_DM;
 3303         } else {
 3304             bb->snapshot_type = BLOBSTORE_SNAPSHOT_NONE;
 3305         }
 3306 
 3307         // check if its hollow
 3308         if (read_blockblob_metadata_path(BLOCKBLOB_PATH_HOLLOW, bs, bb->id, buf, sizeof(buf)) != -1) {
 3309             bb->is_hollow = TRUE;
 3310         }
 3311 
 3312         if (sig && (strlen(sig) > 0)) { // check the signature, if there
 3313             int sig_size;
 3314             if ((sig_size = read_blockblob_metadata_path(BLOCKBLOB_PATH_SIG, bs, bb->id, buf, sizeof(buf))) != strlen(sig)
 3315                 || (strncmp(sig, buf, sig_size) != 0)) {
 3316                 LOGERROR("{%u} encountered signature mismatch when opening a blob (requested size [%ld], found [%d])\n", (unsigned int)pthread_self(), strlen(sig), sig_size);
 3317                 ERR(BLOBSTORE_ERROR_SIGNATURE, NULL);
 3318                 goto clean;
 3319             }
 3320         }
 3321         // check its in-use status
 3322         bb->in_use = check_in_use(bs, bb->id, 0);
 3323     }
 3324 
 3325     {                                  // create a loopback device, if there isn't a valid one already (this may happen whether the blob is new or old)
 3326         char lo_dev[PATH_MAX] = "";
 3327         struct stat sb;
 3328 
 3329         _err_off();                    // do not care if loopback file does not exist
 3330         read_blockblob_metadata_path(BLOCKBLOB_PATH_LOOPBACK, bs, bb->id, lo_dev, sizeof(lo_dev));
 3331         _err_on();
 3332         if ((strlen(lo_dev) < 1)       // nothing in .loopback file
 3333             || (stat(lo_dev, &sb) == -1)    // something in .loopback that does not exist
 3334             || (!S_ISBLK(sb.st_mode))) {    // something in .loopback that is not block device
 3335 
 3336             if (diskutil_loop(bb->blocks_path, 0, lo_dev, sizeof(lo_dev))) {
 3337                 ERR(BLOBSTORE_ERROR_UNKNOWN, "failed to obtain a loopback device for a blockblob");
 3338                 goto clean;
 3339             }
 3340             write_blockblob_metadata_path(BLOCKBLOB_PATH_LOOPBACK, bs, bb->id, lo_dev);
 3341         }
 3342     }
 3343 
 3344     set_device_path(bb);               // read .dm and .loopback and set bb->device_path accordingly
 3345 
 3346     goto out;                          // all is well
 3347 
 3348 clean:
 3349     {
 3350         int saved_errno = _blobstore_errno; // save it because close_and_unlock() or delete_blockblob_files() may reset it
 3351         if (bb->fd_lock != -1) {
 3352             if (ftruncate(bb->fd_lock, 0) != 0) {
 3353                 ERR(BLOBSTORE_ERROR_UNKNOWN, "failed to truncate the blobstore lock file.");
 3354             }
 3355             close_and_unlock(bb->fd_lock);
 3356         }
 3357         if (bb->fd_blocks != -1) {
 3358             close(bb->fd_blocks);
 3359         }
 3360         if (created_directory || created_blob) {    // only delete disk state if we created it
 3361             delete_blockblob_files(bs, bb->id);
 3362         }
 3363         if (saved_errno) {
 3364             _blobstore_errno = saved_errno;
 3365         }
 3366     }
 3367 
 3368 unlock:
 3369     {
 3370         int saved_errno = _blobstore_errno;
 3371         if (blobstore_locked && blobstore_unlock(bs) == -1) {
 3372             ERR(BLOBSTORE_ERROR_UNKNOWN, "failed to unlock the blobstore");
 3373             if (saved_errno) {
 3374                 _blobstore_errno = saved_errno;
 3375             }
 3376         }
 3377     }
 3378 
 3379 free:
 3380     EUCA_FREE(bb);
 3381 
 3382 out:
 3383     LOGTRACE("{%u} blockblob_open: done with blob id=%s ret=%p\n", (unsigned int)pthread_self(), id, bb);
 3384     if (bb == NULL) {
 3385         LOGTRACE("{%u} blockblob_open: errno=%d msg=%s\n", (unsigned int)pthread_self(), _blobstore_errno, blobstore_get_last_msg());
 3386     }
 3387 
 3388     free_bbs(bbs);
 3389     return bb;
 3390 }
 3391 
 3392 //!
 3393 //!
 3394 //!
 3395 //! @param[in] bs
 3396 //! @param[in] bb_id
 3397 //!
 3398 //! @return
 3399 //!
 3400 //! @pre
 3401 //!
 3402 //! @note
 3403 //!
 3404 static int loop_remove(blobstore * bs, const char *bb_id)
 3405 {
 3406     char path[PATH_MAX] = "";
 3407     int ret = 0;
 3408 
 3409     _err_off();                        // do not care if loopback file does not exist
 3410     read_blockblob_metadata_path(BLOCKBLOB_PATH_LOOPBACK, bs, bb_id, path, sizeof(path));   // loads path of /dev/loop?
 3411     _err_on();
 3412 
 3413     if (strlen(path)) {
 3414         if (diskutil_unloop(path)) {
 3415             ERR(BLOBSTORE_ERROR_UNKNOWN, "failed to remove loopback device for blockblob");
 3416             ret = -1;
 3417         } else {
 3418             set_blockblob_metadata_path(BLOCKBLOB_PATH_LOOPBACK, bs, bb_id, path, sizeof(path));    // load path of .../loopback file itself
 3419             unlink(path);
 3420         }
 3421     }
 3422 
 3423     return ret;
 3424 }
 3425 
 3426 //!
 3427 //! releases the blob locks, allowing others to open() it, and frees the blockblob handle
 3428 //!
 3429 //! @param[in] bb
 3430 //!
 3431 //! @return
 3432 //!
 3433 //! @pre
 3434 //!
 3435 //! @note
 3436 //!
 3437 int blockblob_close(blockblob * bb)
 3438 {
 3439     if (bb == NULL) {
 3440         ERR(BLOBSTORE_ERROR_INVAL, NULL);
 3441         return -1;
 3442     }
 3443     int ret = 0;
 3444     LOGTRACE("{%u} blockblob_close: closing blob id=%s\n", (unsigned int)pthread_self(), bb->id);
 3445 
 3446     // do not remove /dev/loop* if it is used by device mapper
 3447     // (we do not care about BLOCKBLOB_STATUS_OPENED because
 3448     // it should be only this thread that has the blob open)
 3449     int in_use = check_in_use(bb->store, bb->id, 0);
 3450     if (!(in_use & (BLOCKBLOB_STATUS_MAPPED | BLOCKBLOB_STATUS_BACKED))) {
 3451         ret = loop_remove(bb->store, bb->id);
 3452     }
 3453     ret |= close(bb->fd_blocks);
 3454     if (ftruncate(bb->fd_lock, 0) != 0) {
 3455         ERR(BLOBSTORE_ERROR_UNKNOWN, "failed to truncate the blobstore lock file.");
 3456     }
 3457     ret |= close_and_unlock(bb->fd_lock);
 3458     EUCA_FREE(bb);                     // we free the blob regardless of whether closing succeeds or not
 3459     return ret;
 3460 }
 3461 
 3462 //!
 3463 //!
 3464 //!
 3465 //! @param[in] dev_name
 3466 //!
 3467 //! @return
 3468 //!
 3469 //! @pre
 3470 //!
 3471 //! @note
 3472 //!
 3473 static int dm_suspend_resume(const char *dev_name)
 3474 {
 3475     int ret = EUCA_OK;
 3476 
 3477     if ((ret = euca_execlp(NULL, helpers_path[ROOTWRAP], helpers_path[DMSETUP], "suspend", dev_name, NULL)) != EUCA_OK) {
 3478         ERR(BLOBSTORE_ERROR_UNKNOWN, "failed to suspend device with 'dmsetup'");
 3479         return (-1);
 3480     }
 3481 
 3482     if ((ret = euca_execlp(NULL, helpers_path[ROOTWRAP], helpers_path[DMSETUP], "resume", dev_name, NULL)) != EUCA_OK) {
 3483         ERR(BLOBSTORE_ERROR_UNKNOWN, "failed to resume device with 'dmsetup'");
 3484         return (-1);
 3485     }
 3486 
 3487     return (0);
 3488 }
 3489 
 3490 //!
 3491 //!
 3492 //!
 3493 //! @param[in] dev_name
 3494 //!
 3495 //! @return
 3496 //!
 3497 //! @pre
 3498 //!
 3499 //! @note
 3500 //!
 3501 static int dm_check_device(const char *dev_name)
 3502 {
 3503     // see if the device exists
 3504     char dm_path[MAX_DM_PATH];
 3505     snprintf(dm_path, sizeof(dm_path), DM_PATH "%s", dev_name);
 3506     return check_path(dm_path);        // we do not use check_block() because /dev/mapper/... entries can be sym links
 3507 }
 3508 
 3509 //!
 3510 //!
 3511 //!
 3512 //! @param[in] dev_name
 3513 //!
 3514 //! @return
 3515 //!
 3516 //! @pre
 3517 //!
 3518 //! @note
 3519 //!
 3520 static int dm_delete_device(const char *dev_name)
 3521 {
 3522     int ret = 0;
 3523     int retries = 1;
 3524     char dm_path[MAX_DM_PATH] = "";
 3525 
 3526     // see if the device to delete exists
 3527     snprintf(dm_path, sizeof(dm_path), DM_PATH "%s", dev_name);
 3528     errno = 0;
 3529     if (check_path(dm_path) && (errno == ENOENT))   // we do not use check_block() because /dev/mapper/... entries can be sym links
 3530         return (0);
 3531 
 3532 try_again:
 3533     myprintf(EUCA_LOG_INFO, "removing device %s (retries=%d)\n", dev_name, retries);
 3534     if ((euca_execlp(NULL, helpers_path[ROOTWRAP], helpers_path[DMSETUP], "remove", dev_name, NULL)) != EUCA_OK) {
 3535         if (retries--) {
 3536             usleep(100);
 3537             goto try_again;
 3538         }
 3539         ERR(BLOBSTORE_ERROR_UNKNOWN, "failed to remove device mapper device with 'dmsetup'");
 3540         ret = -1;
 3541     }
 3542     return (ret);
 3543 }
 3544 
 3545 //!
 3546 //!
 3547 //!
 3548 //! @param[in] dev_names
 3549 //! @param[in] size
 3550 //!
 3551 //! @return
 3552 //!
 3553 //! @pre
 3554 //!
 3555 //! @note
 3556 //!
 3557 static int dm_delete_devices(char *dev_names[], int size)
 3558 {
 3559     if (size < 1)
 3560         return 0;
 3561     int ret = 0;
 3562 
 3563     // construct list of device names in the order that they should be removed
 3564     int devices = 0;
 3565     char **dev_names_removable = EUCA_ZALLOC(size, sizeof(char *));
 3566     if (dev_names_removable == NULL) {
 3567         ERR(BLOBSTORE_ERROR_NOMEM, NULL);
 3568         return -1;
 3569     }
 3570     for (int i = size - 1; i >= 0; i--) {
 3571         char *name = dev_names[i];
 3572         int seen = 0;
 3573         for (int j = i + 1; j < size; j++) {
 3574             if (!strcmp(name, dev_names[j])) {
 3575                 seen = 1;
 3576                 break;
 3577             }
 3578         }
 3579         if (!seen) {
 3580             dev_names_removable[devices++] = name;
 3581         }
 3582     }
 3583 
 3584     // run through devices and remove them
 3585     for (int i = 0; i < devices; i++) {
 3586 
 3587         // some of these devices may have children devices that were created
 3588         // by GNU parted for each of the partitions inside; here we look for
 3589         // those devices and remove them so the main device is not 'busy'.
 3590         for (int j = 1; j < 10; j++) {
 3591             char name_p[1024];         // device mapper name of a potential partition entry
 3592             char path_p[1024];         // path to the device mapper file
 3593             // just append 'pN' to the name, e.g., sda -> sdap1
 3594             snprintf(name_p, sizeof(name_p), "%sp%d", dev_names_removable[i], j);
 3595             snprintf(path_p, sizeof(path_p), DM_FORMAT, name_p);
 3596             if (check_path(path_p) == 0) {
 3597                 dm_delete_device(name_p);
 3598             }
 3599             // also try appending just 'N', since that may be the name format, too
 3600             snprintf(name_p, sizeof(name_p), "%s%d", dev_names_removable[i], j);
 3601             snprintf(path_p, sizeof(path_p), DM_FORMAT, name_p);
 3602             if (check_path(path_p) == 0) {
 3603                 dm_delete_device(name_p);
 3604             }
 3605         }
 3606         ret = dm_delete_device(dev_names_removable[i]);
 3607     }
 3608     EUCA_FREE(dev_names_removable);
 3609 
 3610     return ret;
 3611 }
 3612 
 3613 //!
 3614 //!
 3615 //!
 3616 //! @param[in] dev_names
 3617 //! @param[in] dm_tables
 3618 //! @param[in] size
 3619 //!
 3620 //! @return
 3621 //!
 3622 //! @pre
 3623 //!
 3624 //! @note
 3625 //!
 3626 static int dm_create_devices(char *dev_names[], char *dm_tables[], int size)
 3627 {
 3628     int i = 0;
 3629     int fd = 0;
 3630     int status = 0;
 3631     int rc = EUCA_OK;
 3632     int rbytes = 0;
 3633     pid_t cpid = 0;
 3634     char tmpfile[EUCA_MAX_PATH] = "";
 3635     char dm_path[MAX_DM_PATH] = "";
 3636 
 3637     for (i = 0; i < size; i++) {
 3638         // create devices one by one
 3639         myprintf(EUCA_LOG_INFO, "creating device %s\n", dev_names[i]);
 3640 
 3641         if ((cpid = fork()) < 0) {
 3642             // fork error
 3643             PROPAGATE_ERR(BLOBSTORE_ERROR_UNKNOWN);
 3644             goto cleanup;
 3645         } else if (cpid == 0) {
 3646             // child process - runs `dmsetup` using system()
 3647             bzero(tmpfile, sizeof(tmpfile));
 3648             snprintf(tmpfile, sizeof(tmpfile) - 1, "/tmp/dmsetup.XXXXXX");
 3649             if ((fd = safe_mkstemp(tmpfile)) >= 0) {
 3650                 if ((rbytes = write(fd, dm_tables[i], strlen(dm_tables[i]))) != strlen(dm_tables[i])) {
 3651                     // if write error
 3652                     LOGERROR("{%u} error: dm_create_devices: write returned number of bytes != write buffer: %d/%ld\n", (unsigned int)pthread_self(), rbytes, strlen(dm_tables[i]));
 3653                     unlink(tmpfile);
 3654                     exit(1);
 3655                 }
 3656                 close(fd);
 3657             } else {
 3658                 // couldn't get fd
 3659                 LOGERROR("{%u} error: dm_create_devices: couldn't open temporary file %s: %s\n", (unsigned int)pthread_self(), tmpfile, strerror(errno));
 3660                 unlink(tmpfile);
 3661                 exit(1);
 3662             }
 3663 
 3664             // invoke `dmsetup create ...`
 3665             rc = euca_execlp(&status, helpers_path[ROOTWRAP], helpers_path[DMSETUP], "create", dev_names[i], tmpfile, NULL);
 3666 
 3667             // free out temp file
 3668             unlink(tmpfile);
 3669 
 3670             // pass back dmsetup's return code
 3671             exit(WEXITSTATUS(status));
 3672         }
 3673         // parent - waits for child, reacts to status
 3674         if ((rc = timewait(cpid, &status, BLOBSTORE_DMSETUP_TIMEOUT_SEC)) <= 0) {
 3675             LOGERROR("{%u} error: dm_create_devices: bad exit from dmsetup child: %d\n", (unsigned int)pthread_self(), rc);
 3676             PROPAGATE_ERR(BLOBSTORE_ERROR_UNKNOWN);
 3677             goto cleanup;
 3678         }
 3679 
 3680         if (WEXITSTATUS(status) != 0) {
 3681             ERR(BLOBSTORE_ERROR_UNKNOWN, "failed to set up device mapper table with 'dmsetup'");
 3682             myprintf(EUCA_LOG_INFO, "{%u} command: %s %s create %s\n", (unsigned int)pthread_self(), helpers_path[ROOTWRAP], helpers_path[DMSETUP], dev_names[i]);
 3683             myprintf(EUCA_LOG_INFO, "{%u} input: %s", (unsigned int)pthread_self(), dm_tables[i]);
 3684             goto cleanup;
 3685         }
 3686 
 3687         snprintf(dm_path, sizeof(dm_path), DM_PATH "%s", dev_names[i]);
 3688         if (diskutil_ch(dm_path, get_username(), NULL, BLOBSTORE_FILE_PERM) != EUCA_OK) {
 3689             ERR(BLOBSTORE_ERROR_UNKNOWN, "failed to change permissions on the device mapper file\n");
 3690             goto cleanup;
 3691         }
 3692     }
 3693 
 3694     return (0);
 3695 cleanup:
 3696     _err_off();
 3697     dm_delete_devices(dev_names, i + 1);
 3698     _err_on();
 3699     return (-1);
 3700 }
 3701 
 3702 //!
 3703 //!
 3704 //!
 3705 //! @return
 3706 //!
 3707 //! @pre
 3708 //!
 3709 //! @note
 3710 //!
 3711 static char *dm_get_zero(void)
 3712 {
 3713     static char dev_zero[] = DM_PATH EUCA_ZERO;
 3714 
 3715     struct stat sb;
 3716     int tried = 0;
 3717     while (stat(dev_zero, &sb) == -1) {
 3718         if (tried) {
 3719             ERR(BLOBSTORE_ERROR_UNKNOWN, "failed to create blockblob zero block device");
 3720             return NULL;
 3721         }
 3722 
 3723         char *dm_tables[1] = { "0 " EUCA_ZERO_SIZE " zero" };
 3724         char *dm_names[1] = { EUCA_ZERO };
 3725         dm_create_devices(dm_names, dm_tables, 1);
 3726 
 3727         tried = 1;
 3728     }
 3729 
 3730     if (!S_ISBLK(sb.st_mode)) {
 3731         ERR(BLOBSTORE_ERROR_UNKNOWN, "blockblob zero is not a block device");
 3732         return NULL;
 3733     }
 3734 
 3735     return dev_zero;
 3736 }
 3737 
 3738 //!
 3739 //!
 3740 //!
 3741 //! @param[in] bb
 3742 //!
 3743 //! @return
 3744 //!
 3745 //! @pre
 3746 //!
 3747 //! @note
 3748 //!
 3749 static int blockblob_check(const blockblob * bb)
 3750 {
 3751     char **array = NULL;
 3752     int array_size = 0;
 3753     int err = 0;
 3754     _err_off();                        // do not care if metadata files do not exist
 3755 
 3756     // check on dm devices listed in .dm of this blob, if any
 3757     if (read_array_blockblob_metadata_path(BLOCKBLOB_PATH_DM, bb->store, bb->id, &array, &array_size) != -1) {
 3758         for (int i = 0; i < array_size; i++) {
 3759             if (dm_check_device(array[i]))
 3760                 err++;
 3761             EUCA_FREE(array[i]);
 3762         }
 3763         EUCA_FREE(array);
 3764     }
 3765     // check on the loop device listed in .loopback of the blob, if any
 3766     char lo_dev[PATH_MAX] = "";
 3767     read_blockblob_metadata_path(BLOCKBLOB_PATH_LOOPBACK, bb->store, bb->id, lo_dev, sizeof(lo_dev));
 3768     if (strlen(lo_dev) > 0) {
 3769         struct stat sb;
 3770         if (stat(lo_dev, &sb) == -1) {
 3771             err++;
 3772         } else if (!S_ISBLK(sb.st_mode)) {
 3773             err++;
 3774         } else if (diskutil_loop_check(bb->blocks_path, lo_dev)) {
 3775             err++;
 3776         }
 3777     }
 3778     // check on .refs that point to blobs that no longer exist
 3779     if (get_stale_refs(bb, NULL) > 0)
 3780         err++;
 3781 
 3782     // check on .lock files that are non-zero => blobs that were not closed properly
 3783     if (bb->in_use & BLOCKBLOB_STATUS_ABANDONED)
 3784         err++;
 3785 
 3786     _err_on();
 3787     return err;
 3788 }
 3789 
 3790 //!
 3791 //!
 3792 //!
 3793 //! @param[in] bb
 3794 //! @param[in] timeout_usec
 3795 //! @param[in] do_force
 3796 //!
 3797 //! @return
 3798 //!
 3799 //! @pre
 3800 //!
 3801 //! @note
 3802 //!
 3803 static int delete_blob_state(blockblob * bb, long long timeout_usec, char do_force)
 3804 {
 3805     blobstore *bs = bb->store;
 3806     char **array = NULL;
 3807     int array_size = 0;
 3808     int ret = 0;
 3809 
 3810     // delete dm devices listed in .dm of this blob
 3811     if (read_array_blockblob_metadata_path(BLOCKBLOB_PATH_DM, bb->store, bb->id, &array, &array_size) == -1 || dm_delete_devices(array, array_size) == -1) {
 3812         if (!do_force) {
 3813             ret = -1;
 3814             goto free;
 3815         }
 3816     }
 3817     for (int i = 0; i < array_size; i++) {
 3818         EUCA_FREE(array[i]);
 3819     }
 3820     EUCA_FREE(array);
 3821     array_size = 0;
 3822     array = NULL;
 3823 
 3824     // Read in .deps (blobs that this blob depends on),
 3825     // so as to update their .refs (blobs depending on them).
 3826     if (read_array_blockblob_metadata_path(BLOCKBLOB_PATH_DEPS, bb->store, bb->id, &array, &array_size) == -1) {
 3827         ret = -1;
 3828         if (!do_force) {
 3829             ret = -1;
 3830             goto free;
 3831         }
 3832     }
 3833     char my_ref[BLOBSTORE_MAX_PATH + MAX_DM_NAME + 1];
 3834     snprintf(my_ref, sizeof(my_ref), "%s %s", bb->store->path, bb->id);
 3835     for (int i = 0; i < array_size; i++) {
 3836         char *store_path = strtok(array[i], " ");
 3837         char *blob_id = strtok(NULL, " ");  // the remaining entries in array[i] are ignored
 3838 
 3839         if (strlen(store_path) < 1 || strlen(blob_id) < 1) {
 3840             continue;                  //! @TODO print a warning about store/blob corruption?
 3841         }
 3842 
 3843         blobstore *dep_bs = bs;
 3844         if (strcmp(bs->path, store_path)) { // if deleting reference in a different blobstore
 3845             // need to open it
 3846             dep_bs = blobstore_open(store_path, 0, BLOBSTORE_FLAG_CREAT, BLOBSTORE_FORMAT_ANY, BLOBSTORE_REVOCATION_ANY, BLOBSTORE_SNAPSHOT_ANY);
 3847             if (dep_bs == NULL)
 3848                 continue;              //! @TODO print a warning about store/blob corruption?
 3849             if (blobstore_lock(dep_bs, timeout_usec) == -1) {   // lock this (different) blobstore, too, so .refs are updated atomically
 3850                 blobstore_close(dep_bs);
 3851                 continue;              //! @TODO print a warning about store/blob corruption?
 3852             }
 3853         }
 3854         // update .refs file on each of the dependencies
 3855         if (update_entry_blockblob_metadata_path(BLOCKBLOB_PATH_REFS, dep_bs, blob_id, my_ref, 1) == -1) {
 3856             //! @TODO print a warning about store/blob corruption?
 3857         }
 3858 
 3859         if (!(check_in_use(dep_bs, blob_id, 0) & ~(BLOCKBLOB_STATUS_ABANDONED))) {  // in use except abandoned
 3860             loop_remove(dep_bs, blob_id);   //! @TODO do we care about errors?
 3861         }
 3862         if (dep_bs != bs) {
 3863             blobstore_unlock(dep_bs);
 3864             blobstore_close(dep_bs);
 3865         }
 3866     }
 3867 
 3868     // remove the loopback entry for this blob
 3869     if (loop_remove(bs, bb->id) == -1) {
 3870         ret = -1;
 3871     }
 3872     // remove the files, data and metadata, for of this blob
 3873     if (delete_blockblob_files(bs, bb->id) < 1) {
 3874         ret = -1;
 3875     }
 3876 
 3877 free:
 3878     for (int i = 0; i < array_size; i++) {
 3879         EUCA_FREE(array[i]);
 3880     }
 3881     EUCA_FREE(array);
 3882 
 3883     return ret;
 3884 }
 3885 
 3886 //!
 3887 //! If no outside references to the blob exist, and blob is not protected,
 3888 //! deletes the blob and its metadata
 3889 //!
 3890 //! @param[in] bb
 3891 //! @param[in] timeout_usec
 3892 //! @param[in] do_force
 3893 //!
 3894 //! @return 0 if cleanup was successful and frees the blockblob handle, -1 otherwise,
 3895 //!         and DOES NOT free the blockblob handle (so that it can be closed and freed
 3896 //!         with blockblob_close)
 3897 //!
 3898 //! @pre
 3899 //!
 3900 //! @note
 3901 //!
 3902 int blockblob_delete(blockblob * bb, long long timeout_usec, char do_force)
 3903 {
 3904     if (bb == NULL) {
 3905         ERR(BLOBSTORE_ERROR_INVAL, NULL);
 3906         return -1;
 3907     }
 3908     blobstore *bs = bb->store;
 3909     int ret = 0;
 3910     if (blobstore_lock(bs, timeout_usec) == -1) {   // lock it so we can traverse it
 3911         return -1;                     // failed to obtain a lock on the blobstore
 3912     }
 3913     // do not delete the blob if it is used by another one
 3914     bb->in_use = check_in_use(bs, bb->id, 0);   // update in_use status
 3915     // if in use other than opened (by this thread), backed, or abandoned
 3916     if (!do_force && (bb->in_use & ~(BLOCKBLOB_STATUS_OPENED | BLOCKBLOB_STATUS_BACKED | BLOCKBLOB_STATUS_ABANDONED))) {
 3917         ERR(BLOBSTORE_ERROR_AGAIN, NULL);
 3918         ret = -1;
 3919     } else {
 3920         ret = delete_blob_state(bb, timeout_usec, do_force);    // do the bulk of the cleanup
 3921 
 3922         // close the open file descriptors
 3923         if (ftruncate(bb->fd_lock, 0) != 0) {
 3924             ERR(BLOBSTORE_ERROR_UNKNOWN, "failed to truncate the blobstore lock file.");
 3925         }
 3926 
 3927         if (close_and_unlock(bb->fd_lock) == -1) {
 3928             ret = -1;
 3929         } else {
 3930             bb->fd_lock = 0;           //! @TODO needed? maybe -1?
 3931         }
 3932 
 3933         if (close(bb->fd_blocks) == -1) {
 3934             ret = -1;
 3935         } else {
 3936             bb->fd_blocks = 0;         //! @TODO needed? maybe -1?
 3937         }
 3938 
 3939         // free the blob struct if everything above was OK
 3940         if (ret == 0) {
 3941             EUCA_FREE(bb);
 3942         }
 3943     }
 3944 
 3945     int saved_errno = 0;
 3946     saved_errno = _blobstore_errno;    // save it because blobstore_unlock may overwrite it
 3947     if (blobstore_unlock(bs) == -1) {
 3948         ERR(BLOBSTORE_ERROR_UNKNOWN, "failed to unlock the blobstore");
 3949     }
 3950     if (saved_errno) {
 3951         _blobstore_errno = saved_errno;
 3952     }
 3953 
 3954     return ret;
 3955 }
 3956 
 3957 //!
 3958 //!
 3959 //!
 3960 //! @param[in] bb
 3961 //! @param[in] min_size_bytes
 3962 //!
 3963 //! @return
 3964 //!
 3965 //! @pre
 3966 //!
 3967 //! @note
 3968 //!
 3969 static int verify_bb(const blockblob * bb, unsigned long long min_size_bytes)
 3970 {
 3971     if (bb->fd_lock == -1) {
 3972         ERR(BLOBSTORE_ERROR_INVAL, "blockblob lock involved in operation is not open");
 3973         return -1;
 3974     }
 3975     if (bb->fd_blocks == -1) {
 3976         ERR(BLOBSTORE_ERROR_INVAL, "blockblob involved in operation is not open");
 3977         return -1;
 3978     }
 3979     struct stat sb;
 3980     if (fstat(bb->fd_blocks, &sb) == -1) {
 3981         PROPAGATE_ERR(BLOBSTORE_ERROR_NOENT);
 3982         return -1;
 3983     }
 3984     if (sb.st_size < bb->size_bytes) {
 3985         ERR(BLOBSTORE_ERROR_UNKNOWN, "blockblob involved in operation has backing of unexpected size");
 3986         LOGERROR("sb.st_size=%ld bb->size_bytes=%lld\n", sb.st_size, bb->size_bytes);
 3987         return -1;
 3988     }
 3989     if (sb.st_size < min_size_bytes) {
 3990         ERR(BLOBSTORE_ERROR_INVAL, "blockblob involved in operation has backing that is too small");
 3991         return -1;
 3992     }
 3993     if (stat(bb->device_path, &sb) == -1) {
 3994         PROPAGATE_ERR(BLOBSTORE_ERROR_NOENT);
 3995         return -1;
 3996     }
 3997     if (!S_ISBLK(sb.st_mode)) {
 3998         ERR(BLOBSTORE_ERROR_INVAL, "blockblob involved in operation is missing a loopback block device");
 3999         return -1;
 4000     }
 4001     return 0;
 4002 }
 4003 
 4004 //!
 4005 //!
 4006 //!
 4007 //! @param[in] src_bb pointer to source blob to copy data from
 4008 //! @param[in] src_offset_bytes start offset in source
 4009 //! @param[in] dst_bb pointer to destination blob to copy data to
 4010 //! @param[in] dst_offset_bytes start offset in destination
 4011 //! @param[in] len_bytes 0 = copy until EOF of source
 4012 //!
 4013 //! @return
 4014 //!
 4015 //! @pre
 4016 //!
 4017 //! @note
 4018 //!
 4019 int blockblob_copy(blockblob * src_bb, unsigned long long src_offset_bytes, blockblob * dst_bb, unsigned long long dst_offset_bytes, unsigned long long len_bytes)  //
 4020 {
 4021     int ret = 0;
 4022 
 4023     if (src_bb == NULL || dst_bb == NULL) {
 4024         ERR(BLOBSTORE_ERROR_INVAL, "blockblob pointer is NULL");
 4025         return -1;
 4026     }
 4027 
 4028     long long copy_len_bytes = len_bytes;
 4029     if (copy_len_bytes == 0) {
 4030         copy_len_bytes = src_bb->size_bytes - src_offset_bytes;
 4031     }
 4032     if (copy_len_bytes < 1) {
 4033         ERR(BLOBSTORE_ERROR_INVAL, "copy source offset outside of range");
 4034         return -1;
 4035     }
 4036     // make sure both source and destination blobs are in good shape and big enough
 4037     if (verify_bb(src_bb, src_offset_bytes + copy_len_bytes) || verify_bb(dst_bb, dst_offset_bytes + copy_len_bytes)) {
 4038         return -1;
 4039     }
 4040     // determine the largest acceptable block size for dd, all the way down to a byte possibly
 4041     int granularity = 4096;
 4042     while (src_offset_bytes % granularity || dst_offset_bytes % granularity || copy_len_bytes % granularity) {
 4043         granularity /= 2;
 4044     }
 4045 
 4046     // do the copy (with block devices dd will silently omit to copy bytes outside the block boundary, so we use paths for uncloned blobs)
 4047     const char *src_path = (src_bb->snapshot_type == BLOBSTORE_SNAPSHOT_DM) ? (blockblob_get_dev(src_bb)) : (blockblob_get_file(src_bb));
 4048     const char *dst_path = (dst_bb->snapshot_type == BLOBSTORE_SNAPSHOT_DM) ? (blockblob_get_dev(dst_bb)) : (blockblob_get_file(dst_bb));
 4049     mode_t old_umask = umask(~BLOBSTORE_FILE_PERM);
 4050     int error = diskutil_dd2(src_path, dst_path, granularity, copy_len_bytes / granularity, dst_offset_bytes / granularity, src_offset_bytes / granularity);
 4051     umask(old_umask);
 4052     if (error) {
 4053         ERR(BLOBSTORE_ERROR_INVAL, "failed to copy a section");
 4054         return -1;
 4055     }
 4056 
 4057     return ret;
 4058 }
 4059 
 4060 //!
 4061 //! Sorts the device mapper table string sent to dmsetup. In some case, the table is
 4062 //! sent in partition ordering rather than start block ordering. This cause dmsetup to
 4063 //! get sick and puke some errors. For example, the following table will cause some
 4064 //! errors:
 4065 //! \li 0 63 linear /dev/mapper/euca-dsk-3AE63D3B-d6320e89-p0-snap 0
 4066 //! \li 204863 2764800 linear /dev/loop0 0
 4067 //! \li 2969663 6516 linear /dev/loop1 0
 4068 //! \li 2976179 1024 linear /dev/loop2 0
 4069 //! \li 63 204800 linear /dev/loop3 0
 4070 //! This function will take the previous table and re-order it in starting block order
 4071 //! as in the following:
 4072 //! \li 0 63 linear /dev/mapper/euca-dsk-3AE63D3B-d6320e89-p0-snap 0
 4073 //! \li 63 204800 linear /dev/loop3 0
 4074 //! \li 204863 2764800 linear /dev/loop0 0
 4075 //! \li 2969663 6516 linear /dev/loop1 0
 4076 //! \li 2976179 1024 linear /dev/loop2 0
 4077 //!
 4078 //! @param[in,out] pOldTable the table string to sort
 4079 //!
 4080 //! @return a pointer to the newly allocated table string if successful or NULL if any
 4081 //!         error occured.
 4082 //!
 4083 //! @pre The provided table field must not be NULL and must contain more than 1 entry
 4084 //!      separated by the newline character.
 4085 //!
 4086 //! @post On success the given table will be freed and a newly constructed table will be
 4087 //!       returned. The original table pointer will be set to the newly returned table too.
 4088 //!
 4089 static char *dm_sort_table(char **pOldTable)
 4090 {
 4091 #define DM_MAX_LINES          32
 4092 #define DM_LINE_LENGTH       256
 4093 
 4094     unsigned int i = 0;
 4095     unsigned int lineId = UINT32_MAX;
 4096     unsigned long long minVal = UINT64_MAX;
 4097     unsigned long long curVal = 0;
 4098     char *aLines[DM_MAX_LINES] = { NULL };  //!< TODO: Turn this into a dynamic re-alloc'ed array?
 4099     char sLine[DM_LINE_LENGTH] = "";
 4100     char *pNewTable = NULL;
 4101     char *pDupTable = NULL;
 4102     register unsigned int j = 0;
 4103     register unsigned int count = 0;
 4104 
 4105     if (pOldTable == NULL)
 4106         return (NULL);
 4107 
 4108     // Make sure our given table isn't NULL.
 4109     if ((*pOldTable) != NULL) {
 4110         // Duplicate the original table in case we need it later. strtok() will mess it up
 4111         pDupTable = strdup((*pOldTable));
 4112 
 4113         // Split in lines and count
 4114         aLines[count] = strtok((*pOldTable), "\n");
 4115         while ((aLines[count] != NULL) && (count < (DM_MAX_LINES - 1))) {
 4116             count++;
 4117             aLines[count] = strtok(NULL, "\n");
 4118         }
 4119 
 4120         // Will we need to sort?
 4121         if (aLines[count] != NULL) {
 4122             // hmmm. This sounds list we has more than DM_MAX_LINES... Just return the table as is
 4123             pNewTable = pDupTable;
 4124         } else if (count == 1) {
 4125             // So we have 1 line. Because strtok() messed up the original table
 4126             // lets return the duplicate version of the original
 4127             pNewTable = pDupTable;
 4128         } else {
 4129             // we need more than 1 line in this table to sort. At this point we know
 4130             // we have less than DM_MAX_LINES so we don't have to worry 'bout it.
 4131             if (count > 1) {
 4132                 // Sort every lines in the 'lines' array
 4133                 for (i = 0; i < count; i++) {
 4134                     // Search for the smaller starting block value in the lefover lines
 4135                     lineId = UINT32_MAX;
 4136                     minVal = UINT64_MAX;
 4137                     for (j = 0; j < count; j++) {
 4138                         // As we pick lines from the array, they become NULLs
 4139                         if (aLines[j] != NULL) {
 4140                             // Retrieve the starting block number which is the first item on the line
 4141                             if (sscanf(aLines[j], "%llu", &curVal) == 1) {
 4142                                 // Is this a newest low?
 4143                                 if (curVal < minVal) {
 4144                                     lineId = j;
 4145                                     minVal = curVal;
 4146                                 }
 4147                             }
 4148                         }
 4149                     }
 4150 
 4151                     // Since we set line ID to UINT32_MAX, its safe to assume its valid if less than count
 4152                     if (lineId < count) {
 4153                         // Re-add the newline character at the end of this string.
 4154                         if (snprintf(sLine, DM_LINE_LENGTH, "%s\n", aLines[lineId]) > 0) {
 4155                             // Add it to our new table.
 4156                             if ((pNewTable = euca_strdupcat(pNewTable, sLine)) == NULL) {
 4157                                 EUCA_FREE(pDupTable);
 4158                                 EUCA_FREE((*pOldTable));
 4159                                 return (NULL);
 4160                             }
 4161                         }
 4162                         // Lets no longer consider this line.
 4163                         aLines[lineId] = NULL;
 4164                     }
 4165                 }
 4166             }
 4167             // If count is anything else than 1, we no longer need pDupTable
 4168             EUCA_FREE(pDupTable);
 4169         }
 4170     }
 4171     // Free our given table and return the new one.
 4172     EUCA_FREE((*pOldTable));
 4173 
 4174     // Set our in/out parameter properly on our way out
 4175     (*pOldTable) = pNewTable;
 4176     return (pNewTable);
 4177 
 4178 #undef DM_MAX_LINES
 4179 #undef DM_LINE_LENGTH
 4180 }
 4181 
 4182 //!
 4183 //!
 4184 //!
 4185 //! @param[in] bb pointer to destination blob, which blocks may be used as backing
 4186 //! @param[in] map pointer to map of blocks from other blobs/devices to be copied/mapped/snapshotted
 4187 //! @param[in] map_size size of the map[]
 4188 //!
 4189 //! @return
 4190 //!
 4191 //! @pre
 4192 //!
 4193 //! @note
 4194 //!
 4195 int blockblob_clone(blockblob * bb, const blockmap * map, unsigned int map_size)
 4196 {
 4197     int ret = 0;
 4198     if (bb == NULL) {
 4199         ERR(BLOBSTORE_ERROR_INVAL, "blockblob pointer is NULL");
 4200         return -1;
 4201     }
 4202 
 4203     if (map == NULL || map_size < 1 || map_size > MAX_BLOCKMAP_SIZE) {
 4204         ERR(BLOBSTORE_ERROR_INVAL, "invalid blockbmap or its size");
 4205         return -1;
 4206     }
 4207     long long bb_size_blocks = round_down_sec(bb->size_bytes) / 512;    // dmsetup will not map partial blocks, so we conservatively round down
 4208 
 4209     // verify dependencies (block devices present, blob sizes make sense, zero device present)
 4210     char *zero_dev = NULL;
 4211     for (int i = 0; i < map_size; i++) {
 4212         const blockmap *m = map + i;
 4213         if (m->relation_type != BLOBSTORE_COPY && bb->store->snapshot_policy != BLOBSTORE_SNAPSHOT_DM) {
 4214             ERR(BLOBSTORE_ERROR_INVAL, "relation type is incompatible with snapshot policy");
 4215             return -1;
 4216         }
 4217 
 4218         switch (m->source_type) {
 4219         case BLOBSTORE_DEVICE:{
 4220                 const char *path = m->source.device_path;
 4221                 if (path == NULL) {
 4222                     ERR(BLOBSTORE_ERROR_INVAL, "one of the device paths is NULL");
 4223                     return -1;
 4224                 }
 4225                 struct stat sb;
 4226                 if (stat(path, &sb) == -1) {
 4227                     PROPAGATE_ERR(BLOBSTORE_ERROR_NOENT);
 4228                     return -1;
 4229                 }
 4230                 if (!S_ISBLK(sb.st_mode)) {
 4231                     ERR(BLOBSTORE_ERROR_INVAL, "one of the device paths is not a block device");
 4232                     return -1;
 4233                 }
 4234                 break;
 4235             }
 4236         case BLOBSTORE_BLOCKBLOB:{
 4237                 const blockblob *sbb = m->source.blob;
 4238                 if (sbb == NULL) {
 4239                     ERR(BLOBSTORE_ERROR_INVAL, "one of the source blockblob pointers is NULL");
 4240                     return -1;
 4241                 }
 4242                 long long sbb_size_blocks = round_down_sec(sbb->size_bytes) / 512;  // dmsetup will not map partial blocks, so we conservatively round down
 4243                 if (verify_bb(sbb, sbb_size_blocks)) {
 4244                     return -1;
 4245                 }
 4246                 if (sbb_size_blocks < (m->first_block_src + m->len_blocks)) {
 4247                     LOGWARN("source size = %lld mappped size = %lld\n", sbb_size_blocks, (m->first_block_src + m->len_blocks));
 4248                     ERR(BLOBSTORE_ERROR_INVAL, "one of the source blockblobs is too small for the map");
 4249                     return -1;
 4250                 }
 4251                 if (bb_size_blocks < (m->first_block_dst + m->len_blocks)) {
 4252                     ERR(BLOBSTORE_ERROR_INVAL, "the destination blockblob is too small for the map");
 4253                     return -1;
 4254                 }
 4255                 if (m->relation_type == BLOBSTORE_SNAPSHOT && m->len_blocks < MIN_BLOCKS_SNAPSHOT) {
 4256                     ERR(BLOBSTORE_ERROR_INVAL, "snapshot size is too small");
 4257                     return -1;
 4258                 }
 4259                 break;
 4260             }
 4261         case BLOBSTORE_ZERO:
 4262             zero_dev = dm_get_zero();
 4263             if (zero_dev == NULL) {
 4264                 return -1;
 4265             }
 4266 
 4267             break;
 4268         default:
 4269             ERR(BLOBSTORE_ERROR_INVAL, "invalid map entry type");
 4270             return -1;
 4271         }
 4272     }
 4273 
 4274     // compute the base name of the device mapper device
 4275     char dm_base[MAX_DM_LINE];
 4276     snprintf(dm_base, sizeof(dm_base), "euca-%s", bb->id);
 4277     for (char *c = dm_base; *c != '\0'; c++) {
 4278         if (*c == '/')                 // if the ID has slashes,
 4279             *c = '-';                  // replace them with hyphens
 4280     }
 4281 
 4282     int devices = 0;
 4283     int mapped_or_snapshotted = 0;
 4284     char buf[MAX_DM_LINE];
 4285     char *main_dm_table = NULL;
 4286     char **dev_names = EUCA_ZALLOC(map_size * 4 + 1, sizeof(char *));   // for device mapper dev names we will create
 4287     if (dev_names == NULL) {
 4288         ERR(BLOBSTORE_ERROR_NOMEM, NULL);
 4289         return -1;
 4290     }
 4291     char **dm_tables = EUCA_ZALLOC(map_size * 4 + 1, sizeof(char *));   // for device mapper tables
 4292     if (dm_tables == NULL) {
 4293         ERR(BLOBSTORE_ERROR_NOMEM, NULL);
 4294         EUCA_FREE(dev_names);
 4295         return -1;
 4296     }
 4297     // either does copies or computes the device mapper tables
 4298     for (int i = 0; i < map_size; i++) {
 4299         const blockmap *m = map + i;
 4300         const char *dev;
 4301 
 4302         switch (m->source_type) {
 4303         case BLOBSTORE_DEVICE:
 4304             dev = m->source.device_path;
 4305             break;
 4306         case BLOBSTORE_BLOCKBLOB:
 4307             dev = m->source.blob->device_path;
 4308             break;
 4309         case BLOBSTORE_ZERO:
 4310             dev = zero_dev;
 4311             break;
 4312         default:
 4313             ERR(BLOBSTORE_ERROR_INVAL, "invalid device map source type");
 4314             ret = -1;
 4315             goto free;
 4316         }
 4317 
 4318         long long first_block_src = m->first_block_src;
 4319         switch (m->relation_type) {
 4320         case BLOBSTORE_COPY:
 4321             // do the copy
 4322             if (diskutil_dd2(dev, bb->device_path, 512, m->len_blocks, m->first_block_dst, m->first_block_src)) {
 4323                 ERR(BLOBSTORE_ERROR_INVAL, "failed to copy a section");
 4324                 ret = -1;
 4325                 goto free;
 4326             }
 4327             // append to the main dm table (we do this here even if we never end up using the device mapper because all segments were copied)
 4328             snprintf(buf, sizeof(buf), "%lld %lld linear %s %lld\n", m->first_block_dst, m->len_blocks, bb->device_path, m->first_block_dst);
 4329             main_dm_table = euca_strdupcat(main_dm_table, buf);
 4330             break;
 4331 
 4332         case BLOBSTORE_SNAPSHOT:{
 4333                 int granularity = 16;  // coarser granularity does not work
 4334                 while (m->len_blocks % granularity) {   // do we need to do this?
 4335                     granularity /= 2;
 4336                 }
 4337 
 4338                 // with a linear map, create a backing device for the snapshot
 4339                 snprintf(buf, sizeof(buf), "%s-p%d-back", dm_base, i);
 4340                 dev_names[devices] = strdup(buf);
 4341                 char *backing_dev = dev_names[devices];
 4342                 snprintf(buf, sizeof(buf), "0 %lld linear %s %lld\n", m->len_blocks, bb->device_path, m->first_block_dst);
 4343                 dm_tables[devices] = strdup(buf);
 4344                 devices++;
 4345 
 4346                 // if there is an offset in the source device, create another map (since snapshots cannot be done at offsets)
 4347                 const char *snapshotted_dev = dev;
 4348                 if (m->first_block_src > 0 && m->source_type != BLOBSTORE_ZERO) {
 4349                     snprintf(buf, sizeof(buf), "%s-p%d-real", dm_base, i);
 4350                     dev_names[devices] = strdup(buf);
 4351                     snapshotted_dev = dev_names[devices];
 4352                     snprintf(buf, sizeof(buf), "0 %lld linear %s %lld\n", m->len_blocks, ((dev) ? dev : 0), m->first_block_src);
 4353                     dm_tables[devices] = strdup(buf);
 4354                     devices++;
 4355                 }
 4356                 // take a snapshot of the source
 4357                 snprintf(buf, sizeof(buf), "%s-p%d-snap", dm_base, i);
 4358                 dev_names[devices] = strdup(buf);
 4359                 dev = dev_names[devices];
 4360                 // We use 'n' for a non-persistent snapshot, which will not persist across a reboot.
 4361                 // With 'p' we could get a persistent snapshot at the cost of 0.2-3.0% overhead in
 4362                 // disk space, depending on chunksize [1-16], but then we would need to rebuild
 4363                 // device mapper entries and change space management to accommodate the overhead.
 4364                 snprintf(buf, sizeof(buf), "0 %lld snapshot %s%s " DM_PATH "%s n %d\n", m->len_blocks, snapshotted_dev[0] == 'e' ? DM_PATH : "",
 4365                          snapshotted_dev, backing_dev, granularity);
 4366                 dm_tables[devices] = strdup(buf);
 4367                 devices++;
 4368 
 4369                 first_block_src = 0;   // for snapshots the mapping goes from the -snap device at offset 0
 4370                 // yes, fall through
 4371             }
 4372 
 4373         case BLOBSTORE_MAP:
 4374             // append to the main dm table
 4375             snprintf(buf, sizeof(buf), "%lld %lld linear %s%s %lld\n", m->first_block_dst, m->len_blocks, dev[0] == 'e' ? DM_PATH : "", dev, first_block_src);
 4376             main_dm_table = euca_strdupcat(main_dm_table, buf);
 4377             mapped_or_snapshotted++;
 4378             break;
 4379 
 4380         default:
 4381             ERR(BLOBSTORE_ERROR_INVAL, "invalid device map source type");
 4382             ret = -1;
 4383             goto free;
 4384         }
 4385     }
 4386 
 4387     if (mapped_or_snapshotted) {       // we must use the device mapper
 4388         if ((main_dm_table = dm_sort_table(&main_dm_table)) == NULL) {
 4389             ret = -1;
 4390             goto free;
 4391         }
 4392 
 4393         euca_strncpy(bb->dm_name, dm_base, sizeof(bb->dm_name));
 4394         dev_names[devices] = strdup(dm_base);
 4395         dm_tables[devices] = main_dm_table;
 4396         devices++;
 4397 
 4398         // change device_path from loopback to the device-mapper path
 4399         snprintf(bb->device_path, sizeof(bb->device_path), DM_FORMAT, dm_base);
 4400 
 4401         if (dm_create_devices(dev_names, dm_tables, devices)) {
 4402             ret = -1;
 4403             goto free;
 4404         }
 4405         // record new devices in .dm of this blob
 4406         if (write_array_blockblob_metadata_path(BLOCKBLOB_PATH_DM, bb->store, bb->id, dev_names, devices) == -1) {
 4407             ret = -1;
 4408             goto cleanup;
 4409         }
 4410         bb->snapshot_type = BLOBSTORE_SNAPSHOT_DM;  // remember that blobstore uses device mapper
 4411 
 4412         // update .refs on dependencies and create .deps for this blob
 4413         char my_ref[BLOBSTORE_MAX_PATH + MAX_DM_NAME + 1];
 4414         snprintf(my_ref, sizeof(my_ref), "%s %s", bb->store->path, bb->id); //! @TODO use store ID to proof against moving blobstore?
 4415         for (int i = 0; i < map_size; i++) {
 4416             const blockmap *m = map + i;
 4417             const blockblob *sbb = m->source.blob;
 4418 
 4419             if (m->source_type != BLOBSTORE_BLOCKBLOB)  // only blobstores have references
 4420                 continue;
 4421 
 4422             if (m->relation_type == BLOBSTORE_COPY) // copies do not create references
 4423                 continue;
 4424 
 4425             if (blobstore_lock(sbb->store, BLOBSTORE_LOCK_TIMEOUT_USEC) == -1) {    // lock the source blobstore so the .refs are updated atomically
 4426                 LOGERROR("{%u} error: timed out on a blobstore lock while attempting to update .refs\n", (unsigned int)pthread_self());
 4427                 ret = -1;
 4428                 goto cleanup;          //! @TODO remove .refs entries from this batch that succeeded, if any?
 4429             }
 4430             // update .refs
 4431             if (update_entry_blockblob_metadata_path(BLOCKBLOB_PATH_REFS, sbb->store, sbb->id, my_ref, 0) == -1) {
 4432                 ret = -1;
 4433                 goto cleanup;          //! @TODO remove .refs entries from this batch that succeeded, if any?
 4434             }
 4435 
 4436             if (blobstore_unlock(sbb->store) == -1) {
 4437                 ret = -1;
 4438                 goto cleanup;          //! @TODO remove .refs entries from this batch that succeeded, if any?
 4439             }
 4440             // record the dependency in .deps (redundant entries will be filtered out)
 4441             char dep_ref[BLOBSTORE_MAX_PATH + MAX_DM_NAME + 1];
 4442             snprintf(dep_ref, sizeof(dep_ref), "%s %s %s %llu %llu", sbb->store->path, sbb->id, blobstore_relation_type_name[m->relation_type], m->first_block_dst, m->len_blocks);
 4443             if (update_entry_blockblob_metadata_path(BLOCKBLOB_PATH_DEPS, bb->store, bb->id, dep_ref, 0) == -1) {
 4444                 ret = -1;
 4445                 goto cleanup;          // ditto
 4446             }
 4447         }
 4448     } else {
 4449         EUCA_FREE(main_dm_table);
 4450     }
 4451 
 4452     goto free;
 4453 
 4454 cleanup:                              // this is failure cleanup code path
 4455     {
 4456         int saved_errno;
 4457 
 4458         saved_errno = _blobstore_errno; // save it because dm_delete_devices may overwrite it
 4459         LOGERROR("error: blockblob_clone: %s (%d)\n", blobstore_get_last_msg(), _blobstore_errno);
 4460 
 4461         // remove dm devices that may have been created
 4462         if (dm_delete_devices(dev_names, devices) == 0) {
 4463 
 4464             // remove the .dm file so that others do not
 4465             // needlessly attempt to remove dm devices later
 4466             char path[PATH_MAX];
 4467             set_blockblob_metadata_path(BLOCKBLOB_PATH_DM, bb->store, bb->id, path, sizeof(path));
 4468             unlink(path);
 4469         }
 4470         _blobstore_errno = saved_errno;
 4471     }
 4472 
 4473 free:
 4474     // Only free main_dm_table if mapped_or_snapshotted is 0. If its greater than
 4475     // 0, it would be assigned to the dm_tables array.
 4476     if (mapped_or_snapshotted == 0) {
 4477         EUCA_FREE(main_dm_table);
 4478     }
 4479 
 4480     for (int i = 0; i < devices; i++) {
 4481         EUCA_FREE(dev_names[i]);
 4482         EUCA_FREE(dm_tables[i]);
 4483     }
 4484     EUCA_FREE(dev_names);
 4485     EUCA_FREE(dm_tables);
 4486 
 4487     return ret;
 4488 }
 4489 
 4490 //!
 4491 //! Retrieces a block device pointing to the blob
 4492 //!
 4493 //! @param[in] bb
 4494 //!
 4495 //! @return a block device pointing to the blob
 4496 //!
 4497 //! @pre
 4498 //!
 4499 //! @note
 4500 //!
 4501 const char *blockblob_get_dev(blockblob * bb)
 4502 {
 4503     if (bb == NULL) {
 4504         ERR(BLOBSTORE_ERROR_INVAL, NULL);
 4505         return NULL;
 4506     }
 4507     return bb->device_path;
 4508 }
 4509 
 4510 //!
 4511 //! Retrieves a path to the file containg the blob, but only if snapshot_type is not DM
 4512 //!
 4513 //! @param[in] bb
 4514 //!
 4515 //! @return a path to the file containg the blob
 4516 //!
 4517 //! @pre
 4518 //!
 4519 //! @note
 4520 //!
 4521 const char *blockblob_get_file(blockblob * bb)
 4522 {
 4523     if (bb == NULL) {
 4524         ERR(BLOBSTORE_ERROR_INVAL, NULL);
 4525         return NULL;
 4526     }
 4527     if (bb->snapshot_type == BLOBSTORE_SNAPSHOT_DM) {
 4528         ERR(BLOBSTORE_ERROR_INVAL, "file access only supported for uncloned blockblobs");
 4529         return NULL;
 4530     }
 4531     return bb->blocks_path;
 4532 }
 4533 
 4534 //!
 4535 //! Returns the blobstore of the blob
 4536 //! @param[in] bb
 4537 //!
 4538 //! @return pointer to the blobstore
 4539 //!
 4540 
 4541 blobstore *blockblob_get_blobstore(blockblob * bb)
 4542 {
 4543     if (bb == NULL) {
 4544         ERR(BLOBSTORE_ERROR_INVAL, NULL);
 4545         return NULL;
 4546     }
 4547     return bb->store;
 4548 }
 4549 
 4550 //!
 4551 //! Returns the directory in which the blob files are located
 4552 //!
 4553 //! @param[in] bb
 4554 //! @param[in] buf
 4555 //! @param[in] buflen
 4556 //!
 4557 //! @return success (0) or failure (-1)
 4558 //!
 4559 int blockblob_get_dir(blockblob * bb, char *buf, int buflen)
 4560 {
 4561     if (bb == NULL) {
 4562         ERR(BLOBSTORE_ERROR_INVAL, NULL);
 4563         return -1;
 4564     }
 4565     euca_strncpy(buf, bb->blocks_path, buflen);
 4566     for (int i = (strlen(buf) - 1); i > 1; i--) {
 4567         if (buf[i] == '/') {
 4568             buf[i] = '\0';
 4569             return 0;
 4570         }
 4571     }
 4572     ERR(BLOBSTORE_ERROR_INVAL, NULL);
 4573     return -1;
 4574 }
 4575 
 4576 //!
 4577 //!
 4578 //!
 4579 //! @param[in] bb
 4580 //!
 4581 //! @return size of blob in blocks
 4582 //!
 4583 //! @pre
 4584 //!
 4585 //! @note
 4586 //!
 4587 unsigned long long blockblob_get_size_blocks(blockblob * bb)
 4588 {
 4589     if (bb == NULL) {
 4590         ERR(BLOBSTORE_ERROR_INVAL, NULL);
 4591         return 0;
 4592     }
 4593     return round_up_sec(bb->size_bytes) / 512;
 4594 }
 4595 
 4596 //!
 4597 //!
 4598 //!
 4599 //! @param[in] bb
 4600 //!
 4601 //! @return size of blob in bytes
 4602 //!
 4603 //! @pre
 4604 //!
 4605 //! @note
 4606 //!
 4607 unsigned long long blockblob_get_size_bytes(blockblob * bb)
 4608 {
 4609     if (bb == NULL) {
 4610         ERR(BLOBSTORE_ERROR_INVAL, NULL);
 4611         return 0;
 4612     }
 4613     return bb->size_bytes;
 4614 }
 4615 
 4616 //!
 4617 //! flushes outstanding I/O on:
 4618 //! \li system's buffer cache
 4619 //! \li dm device at dev_path (if specified)
 4620 //! \li dm device pointing to the blob (if bb is specified)
 4621 //!
 4622 //! @param[in] dev_path
 4623 //! @param[in] bb
 4624 //!
 4625 //! @return
 4626 //!
 4627 int blockblob_sync(const char *dev_path, const blockblob * bb)
 4628 {
 4629     int err = 0;
 4630 
 4631     sync();                            // ensure the whole buffer cache is flushed
 4632 
 4633     if ((err == 0) && (dev_path != NULL)) {
 4634         err = dm_suspend_resume(dev_path);
 4635     }
 4636 
 4637     if ((err == 0) && (bb != NULL)) {
 4638         err = dm_suspend_resume(bb->device_path);
 4639     }
 4640 
 4641     return (err);
 4642 }
 4643 
 4644 #ifdef _UNIT_TEST
 4645 //!
 4646 //!
 4647 //!
 4648 //! @param[in] bb
 4649 //! @param[in] c
 4650 //! @param[in] use_file
 4651 //!
 4652 //! @return
 4653 //!
 4654 //! @pre
 4655 //!
 4656 //! @note
 4657 //!
 4658 static void _fill_blob(blockblob * bb, char c, int use_file)
 4659 {
 4660     const char *path;
 4661     if (use_file) {
 4662         path = blockblob_get_file(bb);
 4663     } else {
 4664         path = blockblob_get_dev(bb);
 4665     }
 4666 
 4667     char buf[1];
 4668     buf[0] = c;
 4669 
 4670     printf("filling out with dummy data %s\n", path);
 4671     int fd = open(path, O_WRONLY);
 4672     int failed_bytes = 0;
 4673     if (fd != -1) {
 4674         for (int i = 0; i < bb->size_bytes; i++) {
 4675             if (write(fd, buf, 1) != 1)
 4676                 failed_bytes++;
 4677         }
 4678     }
 4679     if (failed_bytes) {
 4680         printf("WARNING: failed to fill %d byte(s) to path %s\n", failed_bytes, path);
 4681     }
 4682     if (fd >= 0) {
 4683         fsync(fd);
 4684         close(fd);
 4685     }
 4686 }
 4687 
 4688 //!
 4689 //!
 4690 //!
 4691 //! @param[in] size_blocks
 4692 //! @param[in] base
 4693 //! @param[in] name
 4694 //! @param[in] format
 4695 //! @param[in] revocation
 4696 //! @param[in] snapshot
 4697 //!
 4698 //! @return
 4699 //!
 4700 //! @pre
 4701 //!
 4702 //! @note
 4703 //!
 4704 static blobstore *create_teststore(int size_blocks, const char *base, const char *name, blobstore_format_t format, blobstore_revocation_t revocation, blobstore_snapshot_t snapshot)
 4705 {
 4706     static int ts = 0;
 4707     static int counter = 0;
 4708 
 4709     if (ts == 0) {
 4710         ts = ((int)time(NULL)) - 1292630988;
 4711         //ts = (((int)time(NULL))<<24)>>24;
 4712     }
 4713 
 4714     char bs_path[PATH_MAX];
 4715     snprintf(bs_path, sizeof(bs_path), "%s/test_blobstore_%05d_%s_%03d", base, ts, name, counter++);
 4716     if (mkdir(bs_path, BLOBSTORE_DIRECTORY_PERM) == -1) {
 4717         printf("failed to create %s\n", bs_path);
 4718         return NULL;
 4719     }
 4720     printf("created %s\n", bs_path);
 4721     blobstore *bs = blobstore_open(bs_path, size_blocks, BLOBSTORE_FLAG_CREAT, format, revocation, snapshot);
 4722     if (bs == NULL) {
 4723         printf("ERROR: %s\n", blobstore_get_error_str(blobstore_get_error()));
 4724         return NULL;
 4725     }
 4726     return bs;
 4727 }
 4728 
 4729 //!
 4730 //!
 4731 //!
 4732 //! @param[in] bb
 4733 //! @param[in] seek
 4734 //! @param[in] c
 4735 //!
 4736 //! @return
 4737 //!
 4738 //! @pre
 4739 //!
 4740 //! @note
 4741 //!
 4742 static int write_byte(blockblob * bb, int seek, char c)
 4743 {
 4744     const char *dev = blockblob_get_dev(bb);
 4745     int fd = open(dev, O_WRONLY);
 4746     if (fd == -1) {
 4747         printf("ERROR: failed to open the blockblob dev %s\n", dev);
 4748         return -1;
 4749     }
 4750     if (lseek(fd, seek, SEEK_SET) == -1) {
 4751         printf("ERROR: failed to lseek in blockblob dev %s\n", dev);
 4752         close(fd);
 4753         return -1;
 4754     }
 4755     if (write(fd, &c, 1) != 1) {
 4756         printf("ERROR: failed to write to blockblob dev %s\n", dev);
 4757         close(fd);
 4758         return -1;
 4759     }
 4760     fsync(fd);
 4761     close(fd);
 4762 
 4763     return 0;
 4764 }
 4765 
 4766 //!
 4767 //!
 4768 //!
 4769 //! @param[in] bb
 4770 //! @param[in] seek
 4771 //!
 4772 //! @return
 4773 //!
 4774 //! @pre
 4775 //!
 4776 //! @note
 4777 //!
 4778 static char read_byte(blockblob * bb, int seek)
 4779 {
 4780     const char *dev = blockblob_get_dev(bb);
 4781     int fd = open(dev, O_RDONLY);
 4782     if (fd == -1) {
 4783         printf("ERROR: failed to open the blockblob dev %s\n", dev);
 4784         return -1;
 4785     }
 4786     if (lseek(fd, seek, SEEK_SET) == -1) {
 4787         printf("ERROR: failed to lseek in blockblob dev %s\n", dev);
 4788         close(fd);
 4789         return -1;
 4790     }
 4791     char buf[1];
 4792     if (read(fd, buf, 1) != 1) {
 4793         printf("ERROR: failed to write to blockblob dev %s\n", dev);
 4794         close(fd);
 4795         return -1;
 4796     }
 4797     close(fd);
 4798 
 4799     return buf[0];
 4800 }
 4801 
 4802 //!
 4803 //!
 4804 //!
 4805 //! @param[in] base
 4806 //! @param[in] name
 4807 //! @param[in] format
 4808 //! @param[in] revocation
 4809 //! @param[in] snapshot
 4810 //!
 4811 //! @return
 4812 //!
 4813 //! @pre
 4814 //!
 4815 //! @note
 4816 //!
 4817 static int do_clone_stresstest(const char *base, const char *name, blobstore_format_t format, blobstore_revocation_t revocation, blobstore_snapshot_t snapshot)
 4818 {
 4819     int errors = 0;
 4820     blobstore *bs1 = NULL;
 4821     blobstore *bs2 = NULL;
 4822 
 4823     printf("commencing cloning stress-test...\n");
 4824 
 4825     if ((bs1 = create_teststore(STRESS_BS_SIZE, base, name, BLOBSTORE_FORMAT_DIRECTORY, BLOBSTORE_REVOCATION_NONE, BLOBSTORE_SNAPSHOT_DM)) == NULL) {
 4826         errors++;
 4827         goto done;
 4828     }
 4829 
 4830     if ((bs2 = create_teststore(STRESS_BS_SIZE, base, name, BLOBSTORE_FORMAT_DIRECTORY, BLOBSTORE_REVOCATION_LRU, BLOBSTORE_SNAPSHOT_DM)) == NULL) {
 4831         errors++;
 4832         goto done;
 4833     }
 4834 
 4835     blockblob *bbs1[STRESS_BLOBS];
 4836     long long bbs1_sizes[STRESS_BLOBS];
 4837     blockblob *bbs2[STRESS_BLOBS * 2];
 4838     long long bbs2_sizes[STRESS_BLOBS * 2];
 4839 
 4840     // calculate sizes
 4841     long long avg = STRESS_BS_SIZE / STRESS_BLOBS;
 4842     if (avg < STRESS_MIN_BB * 2) {
 4843         printf("ERROR: average blob size %lld for stress test is too small (<%d)\n", avg, STRESS_MIN_BB * 2);
 4844         errors++;
 4845         goto done;
 4846     }
 4847     for (int i = 0; i < STRESS_BLOBS; i++) {
 4848         bbs1_sizes[i] = avg;
 4849         bbs1[i] = NULL;
 4850         bbs2[i] = NULL;
 4851         bbs2[i + STRESS_BLOBS] = NULL;
 4852     }
 4853     for (int i = 0; i < STRESS_BLOBS * 3; i++) {    // run over the array a few times
 4854         int j = i % (STRESS_BLOBS / 2); // modify pairs from array
 4855         int k = j + (STRESS_BLOBS / 2);
 4856         long long max_delta = MIN(bbs1_sizes[j] - STRESS_MIN_BB, bbs1_sizes[k] - STRESS_MIN_BB);
 4857         long long delta = max_delta * (((double)random() / RAND_MAX) - 0.5);
 4858         bbs1_sizes[j] -= delta;
 4859         bbs2_sizes[j] = bbs1_sizes[j] / 2;
 4860         bbs2_sizes[j + STRESS_BLOBS] = bbs1_sizes[j] - bbs1_sizes[j] / 2;
 4861 
 4862         bbs1_sizes[k] += delta;
 4863         bbs2_sizes[k] = bbs1_sizes[k] / 2;
 4864         bbs2_sizes[k + STRESS_BLOBS] = bbs1_sizes[k] - bbs1_sizes[k] / 2;
 4865     }
 4866     long long bbs1_totals = 0;
 4867     for (int i = 0; i < STRESS_BLOBS; i++) {
 4868         bbs1_totals += bbs1_sizes[i];
 4869         long long pair = bbs2_sizes[i] + bbs2_sizes[i + STRESS_BLOBS];
 4870         assert(pair == bbs1_sizes[i]);
 4871         printf("%lld ", bbs1_sizes[i]);
 4872     }
 4873     assert(bbs1_totals == STRESS_BS_SIZE);
 4874     printf("\n");
 4875 
 4876     // fill the stores
 4877     for (int i = 0; i < STRESS_BLOBS; i++) {
 4878 #define _OPENERR(BS,BB,BBSIZE)                                          \
 4879         BB = blockblob_open (BS, NULL, BBSIZE*512, BLOBSTORE_FLAG_CREAT | BLOBSTORE_FLAG_EXCL, NULL, 1000); \
 4880         if (BB == NULL) {                                               \
 4881             printf ("ERROR: failed to create blockblob i=%d\n", i);       \
 4882             errors++;                                                   \
 4883             goto drain;                                                 \
 4884         }
 4885         printf("allocating slot %d\n", i);
 4886         _OPENERR(bs1, bbs1[i], bbs1_sizes[i]);
 4887         _OPENERR(bs2, bbs2[i], bbs2_sizes[i]);
 4888         _OPENERR(bs2, bbs2[i + STRESS_BLOBS], bbs2_sizes[i + STRESS_BLOBS]);
 4889         write_byte(bbs2[i + STRESS_BLOBS], 0, 'b'); // write a byte into beginning of blob that will be snapshotted
 4890         blockmap map[] = {
 4891 {BLOBSTORE_MAP, BLOBSTORE_BLOCKBLOB, {blob:bbs2[i]}, 0, 0, bbs2_sizes[i]},
 4892 {BLOBSTORE_SNAPSHOT, BLOBSTORE_BLOCKBLOB, {blob:bbs2[i + STRESS_BLOBS]}, 0, bbs2_sizes[i], bbs2_sizes[i + STRESS_BLOBS]},
 4893         };
 4894         if (blockblob_clone(bbs1[i], map, 2) == -1) {
 4895             printf("ERROR: failed to clone on iteration %i\n", i);
 4896             errors++;
 4897             goto drain;
 4898         }
 4899         // verify that mapping works
 4900         write_byte(bbs2[i], bbs2_sizes[i] * 512 - 1, 'a');  // write a byte into the end of the blob that is being mapped
 4901         dm_suspend_resume(bbs1[i]->dm_name);
 4902         char c1 = read_byte(bbs1[i], bbs2_sizes[i] * 512 - 1);  // read that byte back via bbs1
 4903         char c2 = read_byte(bbs1[i], bbs2_sizes[i] * 512);  // read the byte written before the snapshot
 4904         if (c1 != 'a' || c2 != 'b') {
 4905             printf("ERROR: clone verification failed (c1=='%c', c2=='%c')\n", c1, c2);
 4906             errors++;
 4907             goto drain;
 4908         }
 4909     }
 4910 
 4911     // induce churn in stores
 4912     for (int k = 0; k < STRESS_BLOBS * 1; k++) {
 4913         usleep(100);
 4914         // randomly free a few random blobs
 4915         int to_free = (int)((STRESS_BLOBS / 2) * ((double)random() / RAND_MAX));
 4916         printf("will free %d random blobs\n", to_free);
 4917         for (int j = 0; j < to_free; j++) {
 4918             int i = (int)((STRESS_BLOBS - 1) * ((double)random() / RAND_MAX));
 4919             if (bbs1[i] != NULL) {
 4920                 printf("freeing slot %d\n", i);
 4921 #define _DELWARN(BB) if (BB && blockblob_delete (BB, 1000, 0) == -1) { printf ("WARNING: failed to delete blockblob %s i=%d\n", BB->id, i); } BB=NULL
 4922                 _DELWARN(bbs1[i]);
 4923                 blockblob_close(bbs2[i]);   // so it can be purged with LRU
 4924                 bbs2[i] = NULL;
 4925                 blockblob_close(bbs2[i + STRESS_BLOBS]);    // so it can be purged with LRU
 4926                 bbs2[i + STRESS_BLOBS] = NULL;
 4927             }
 4928         }
 4929 
 4930         // re-allocate those sizes
 4931         for (int i = 0; i < STRESS_BLOBS; i++) {
 4932             if (bbs1[i] != NULL)
 4933                 continue;
 4934             printf("allocating slot %d\n", i);
 4935             _OPENERR(bs1, bbs1[i], bbs1_sizes[i]);
 4936             _OPENERR(bs2, bbs2[i], bbs2_sizes[i]);
 4937             _OPENERR(bs2, bbs2[i + STRESS_BLOBS], bbs2_sizes[i + STRESS_BLOBS]);
 4938             write_byte(bbs2[i + STRESS_BLOBS], 0, 'b'); // write a byte into beginning of blob that will be snapshotted
 4939             blockmap map[] = {
 4940 {BLOBSTORE_MAP, BLOBSTORE_BLOCKBLOB, {blob:bbs2[i]}
 4941                  , 0, 0, bbs2_sizes[i]}
 4942                 ,
 4943 {BLOBSTORE_SNAPSHOT, BLOBSTORE_BLOCKBLOB, {blob:bbs2[i + STRESS_BLOBS]}
 4944                  , 0, bbs2_sizes[i], bbs2_sizes[i + STRESS_BLOBS]}
 4945                 ,
 4946             };
 4947             if (blockblob_clone(bbs1[i], map, 2) == -1) {
 4948                 printf("ERROR: failed to clone on iteration %i\n", i);
 4949                 errors++;
 4950                 goto drain;
 4951             }
 4952             // verify that mapping works
 4953             write_byte(bbs2[i], bbs2_sizes[i] * 512 - 1, 'a');  // write a byte into the end of the blob that is being mapped
 4954             dm_suspend_resume(bbs1[i]->dm_name);
 4955             char c1 = read_byte(bbs1[i], bbs2_sizes[i] * 512 - 1);  // read that byte back via bbs1
 4956             char c2 = read_byte(bbs1[i], bbs2_sizes[i] * 512);  // read the byte written before the snapshot
 4957             if (c1 != 'a' || c2 != 'b') {
 4958                 printf("ERROR: clone verification failed (c1=='%c', c2=='%c')\n", c1, c2);
 4959                 errors++;
 4960                 goto drain;
 4961             }
 4962         }
 4963     }
 4964 
 4965 drain:
 4966     // drain the stores
 4967     printf("resting before draining...\n");
 4968     sleep(1);
 4969     for (int i = 0; i < STRESS_BLOBS; i++) {
 4970         printf("freeing slot %d\n", i);
 4971         _DELWARN(bbs1[i]);
 4972         _DELWARN(bbs2[i]);
 4973         _DELWARN(bbs2[i + STRESS_BLOBS]);
 4974     }
 4975 
 4976     printf("completed cloning stress-test\n");
 4977 done:
 4978     if (bs1 != NULL)
 4979         blobstore_close(bs1);
 4980     if (bs2 != NULL)
 4981         blobstore_close(bs2);
 4982     return errors;
 4983 }
 4984 
 4985 //!
 4986 //!
 4987 //!
 4988 //! @param[in] bb4
 4989 //! @param[in] op
 4990 //!
 4991 //! @return
 4992 //!
 4993 //! @pre
 4994 //!
 4995 //! @note
 4996 //!
 4997 static int check_destination(blockblob * bb4, char *op)
 4998 {
 4999     int errors = 0;
 5000     const char *dev = blockblob_get_dev(bb4);
 5001     if (dev != NULL) {
 5002         int fd = open(dev, O_RDONLY);
 5003         if (fd != -1) {
 5004             for (int i = 1; i < 4; i++) {
 5005                 for (int j = 0; j < CBB_SIZE; j++) {
 5006                     char buf[512];
 5007                     int r = read(fd, buf, sizeof(buf));
 5008                     if (r < 1) {
 5009                         printf("ERROR: failed to read bock device %s\n", dev);
 5010                         errors++;
 5011                         goto stop_comparing;
 5012                     }
 5013                     if (buf[0] != '0' + i) {
 5014                         printf("ERROR: block device %s has unexpected data ('%c' (%d) != '%c')\n", dev, buf[0], buf[0], '0' + i);
 5015                         errors++;
 5016                         goto stop_comparing;
 5017                     }
 5018                 }
 5019             }
 5020 stop_comparing:
 5021             close(fd);
 5022         } else {
 5023             printf("ERROR: failed to open block device %s for the %s\n", dev, op);
 5024             errors++;
 5025         }
 5026     } else {
 5027         printf("ERROR: failed to get a block device for the %s\n", op);
 5028         errors++;
 5029     }
 5030 
 5031     return errors;
 5032 }
 5033 
 5034 //!
 5035 //!
 5036 //!
 5037 //! @param[in] base
 5038 //! @param[in] name
 5039 //!
 5040 //! @return
 5041 //!
 5042 //! @pre
 5043 //!
 5044 //! @note
 5045 //!
 5046 static int do_copy_test(const char *base, const char *name)
 5047 {
 5048     int ret;
 5049     int errors = 0;
 5050     printf("commencing copy test\n");
 5051 
 5052     blobstore *bs = create_teststore(CBB_SIZE * 7, base, name, BLOBSTORE_FORMAT_DIRECTORY, BLOBSTORE_REVOCATION_ANY, BLOBSTORE_SNAPSHOT_ANY);
 5053     if (bs == NULL) {
 5054         errors++;
 5055         goto done;
 5056     }
 5057 
 5058     blockblob *bb1, *bb2, *bb3, *bb4;
 5059 
 5060     // these are to be copied to another
 5061     _OPENBBb(bb1, B1, CBB_SIZE * 512 * 7 + 1, NULL, _CBB, 0, -1);   // too big for bs
 5062     if (errors)
 5063         goto done;
 5064     _OPENBBb(bb1, B1, CBB_SIZE * 512, NULL, _CBB, 0, 0);    // bs size: 1
 5065     _fill_blob(bb1, '1', TRUE);
 5066     _OPENBBb(bb2, B2, CBB_SIZE * 512 + 1, NULL, _CBB, 0, 0);    // bs size: 3
 5067     _fill_blob(bb2, '2', TRUE);
 5068     _OPENBBb(bb3, B3, CBB_SIZE * 512 - 2, NULL, _CBB, 0, 0);    // bs size: 4
 5069     _fill_blob(bb3, '3', TRUE);
 5070 
 5071     // this is to be the destination of the copy
 5072     _OPENBB(bb4, B4, CBB_SIZE * 3, NULL, _CBB, 0, 0);   // bs size: 7
 5073     _COPYBB(bb1, 0, bb4, 0, 0, 0);     // check that len=0 works and that right block size is chosen
 5074     _COPYBB(bb2, 0, bb4, CBB_SIZE * 512, CBB_SIZE * 512 + 1, 0);
 5075     _COPYBB(bb3, 0, bb4, CBB_SIZE * 512 * 2, CBB_SIZE * 512 - 2, 0);
 5076     _COPYBB(bb3, 0, bb4, CBB_SIZE * 512 * 3 - 2, 2, 0);
 5077     _COPYBB(bb3, 0, bb4, CBB_SIZE * 512 * 2, CBB_SIZE * 512, -1);   // source is too small
 5078     _COPYBB(bb3, 2, bb4, CBB_SIZE * 512 * 2, CBB_SIZE * 512, -1);   // source is too small
 5079     _COPYBB(bb3, 0, bb4, CBB_SIZE * 512 * 3 - 1, 2, -1);    // destination is too small
 5080 
 5081     // see if copy worked
 5082     errors += check_destination(bb4, "copy");
 5083 
 5084     _DELEBB(bb1, B1, 0);
 5085     _DELEBB(bb2, B2, 0);
 5086     _DELEBB(bb3, B3, 0);
 5087     _DELEBB(bb4, B4, 0);
 5088     blobstore_close(bs);
 5089 
 5090     printf("completed copy test\n");
 5091 done:
 5092     return errors;
 5093 }
 5094 
 5095 //!
 5096 //!
 5097 //!
 5098 //! @param[in] base
 5099 //! @param[in] name
 5100 //! @param[in] format
 5101 //! @param[in] revocation
 5102 //! @param[in] snapshot
 5103 //! @param[in] copy_or_snapshot
 5104 //!
 5105 //! @return
 5106 //!
 5107 //! @pre
 5108 //!
 5109 //! @note
 5110 //!
 5111 static int do_clone_test(const char *base, const char *name, blobstore_format_t format, blobstore_revocation_t revocation, blobstore_snapshot_t snapshot, int copy_or_snapshot)
 5112 {
 5113     int ret;
 5114     int errors = 0;
 5115     printf("commencing cloning test\n");
 5116 
 5117     blobstore *bs = create_teststore(CBB_SIZE * 6, base, name, BLOBSTORE_FORMAT_DIRECTORY, BLOBSTORE_REVOCATION_ANY, BLOBSTORE_SNAPSHOT_ANY);
 5118     if (bs == NULL) {
 5119         errors++;
 5120         goto done;
 5121     }
 5122 
 5123     blockblob *bb1, *bb2, *bb3, *bb4, *bb5;
 5124 
 5125     // these are to be mapped to others
 5126     _OPENBB(bb1, B1, CBB_SIZE, NULL, _CBB, 0, 0);   // bs size: 1
 5127     _fill_blob(bb1, '1', FALSE