"Fossies" - the Fresh Open Source Software Archive

Member "glusterfs-6.9/xlators/cluster/afr/src/afr-self-heal-data.c" (23 Apr 2020, 26318 Bytes) of package /linux/misc/glusterfs-6.9.tar.gz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style: standard) with prefixed line numbers and code folding option. Alternatively you can here view or download the uninterpreted source code file. For more information about "afr-self-heal-data.c" see the Fossies "Dox" file reference documentation.

    1 /*
    2   Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
    3   This file is part of GlusterFS.
    4 
    5   This file is licensed to you under your choice of the GNU Lesser
    6   General Public License, version 3 or any later version (LGPLv3 or
    7   later), or the GNU General Public License, version 2 (GPLv2), in all
    8   cases as published by the Free Software Foundation.
    9 */
   10 
   11 #include "afr.h"
   12 #include "afr-self-heal.h"
   13 #include <glusterfs/byte-order.h>
   14 #include "protocol-common.h"
   15 #include "afr-messages.h"
   16 #include <glusterfs/events.h>
   17 
   18 #define HAS_HOLES(i) ((i->ia_blocks * 512) < (i->ia_size))
   19 static int
   20 __checksum_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
   21                int op_errno, uint32_t weak, uint8_t *strong, dict_t *xdata)
   22 {
   23     afr_local_t *local = NULL;
   24     struct afr_reply *replies = NULL;
   25     int i = (long)cookie;
   26 
   27     local = frame->local;
   28     replies = local->replies;
   29 
   30     replies[i].valid = 1;
   31     replies[i].op_ret = op_ret;
   32     replies[i].op_errno = op_errno;
   33     if (xdata) {
   34         replies[i].buf_has_zeroes = dict_get_str_boolean(
   35             xdata, "buf-has-zeroes", _gf_false);
   36         replies[i].fips_mode_rchecksum = dict_get_str_boolean(
   37             xdata, "fips-mode-rchecksum", _gf_false);
   38     }
   39     if (strong) {
   40         if (replies[i].fips_mode_rchecksum) {
   41             memcpy(local->replies[i].checksum, strong, SHA256_DIGEST_LENGTH);
   42         } else {
   43             memcpy(local->replies[i].checksum, strong, MD5_DIGEST_LENGTH);
   44         }
   45     }
   46 
   47     syncbarrier_wake(&local->barrier);
   48     return 0;
   49 }
   50 
/* Decide whether a DIFF self-heal may skip writing this block.
 *
 * Sends rchecksum to the source and every healed sink (with
 * "check-zero-filled" set so bricks also report whether the block is all
 * zeroes).  If every sink checksum matches the source's, the block can be
 * skipped — except for non-sparse files whose block is zero-filled, which
 * is still rewritten to keep disk usage consistent across bricks.
 *
 * Returns _gf_true when the caller may skip the read/write for this block;
 * _gf_false otherwise (including on any allocation or fop failure). */
static gf_boolean_t
__afr_can_skip_data_block_heal(call_frame_t *frame, xlator_t *this, fd_t *fd,
                               int source, unsigned char *healed_sinks,
                               off_t offset, size_t size, struct iatt *poststat)
{
    afr_private_t *priv = NULL;
    afr_local_t *local = NULL;
    unsigned char *wind_subvols = NULL;
    gf_boolean_t checksum_match = _gf_true;
    struct afr_reply *replies = NULL;
    dict_t *xdata = NULL;
    int i = 0;

    priv = this->private;
    local = frame->local;
    replies = local->replies;

    xdata = dict_new();
    if (!xdata)
        goto out;
    /* Ask the brick to also report "buf-has-zeroes" in its reply xdata
       (consumed in __checksum_cbk). */
    if (dict_set_int32_sizen(xdata, "check-zero-filled", 1)) {
        dict_unref(xdata);
        goto out;
    }

    /* Checksum only the participants of this heal: the source plus the
       healed sinks. */
    wind_subvols = alloca0(priv->child_count);
    for (i = 0; i < priv->child_count; i++) {
        if (i == source || healed_sinks[i])
            wind_subvols[i] = 1;
    }

    AFR_ONLIST(wind_subvols, frame, __checksum_cbk, rchecksum, fd, offset, size,
               xdata);
    if (xdata)
        dict_unref(xdata);

    /* Without a valid checksum from the source there is nothing to
       compare against — be conservative and heal the block. */
    if (!replies[source].valid || replies[source].op_ret != 0)
        return _gf_false;

    for (i = 0; i < priv->child_count; i++) {
        if (i == source)
            continue;
        if (replies[i].valid) {
            /* Compare using the digest length matching the checksum
               algorithm the source used (SHA256 in FIPS mode, else MD5). */
            if (memcmp(replies[source].checksum, replies[i].checksum,
                       replies[source].fips_mode_rchecksum
                           ? SHA256_DIGEST_LENGTH
                           : MD5_DIGEST_LENGTH)) {
                checksum_match = _gf_false;
                break;
            }
        }
    }

    if (checksum_match) {
        if (HAS_HOLES(poststat))
            return _gf_true;

        /* For non-sparse files, we might be better off writing the
         * zeroes to sinks to avoid mismatch of disk-usage in bricks. */
        if (local->replies[source].buf_has_zeroes)
            return _gf_false;
        else
            return _gf_true;
    }
out:
    return _gf_false;
}
  118 
  119 static gf_boolean_t
  120 __afr_is_sink_zero_filled(xlator_t *this, fd_t *fd, size_t size, off_t offset,
  121                           int sink)
  122 {
  123     afr_private_t *priv = NULL;
  124     struct iobref *iobref = NULL;
  125     struct iovec *iovec = NULL;
  126     int count = 0;
  127     int ret = 0;
  128     gf_boolean_t zero_filled = _gf_false;
  129 
  130     priv = this->private;
  131     ret = syncop_readv(priv->children[sink], fd, size, offset, 0, &iovec,
  132                        &count, &iobref, NULL, NULL, NULL);
  133     if (ret < 0)
  134         goto out;
  135     ret = iov_0filled(iovec, count);
  136     if (!ret)
  137         zero_filled = _gf_true;
  138 out:
  139     if (iovec)
  140         GF_FREE(iovec);
  141     if (iobref)
  142         iobref_unref(iobref);
  143     return zero_filled;
  144 }
  145 
/* Heal one block: read it from the source and write it to every healed
 * sink.  Two sparse-file optimizations may skip the write for a given
 * sink (see the comments inside the loop).  A failed or short write
 * removes that sink from healed_sinks[] so it is not reported as
 * successfully healed.
 *
 * Returns <= 0 if the source read failed; otherwise the result of the
 * last syncop performed. */
static int
__afr_selfheal_data_read_write(call_frame_t *frame, xlator_t *this, fd_t *fd,
                               int source, unsigned char *healed_sinks,
                               off_t offset, size_t size,
                               struct afr_reply *replies, int type)
{
    struct iovec *iovec = NULL;
    int count = 0;
    struct iobref *iobref = NULL;
    int ret = 0;
    int i = 0;
    afr_private_t *priv = NULL;

    priv = this->private;

    ret = syncop_readv(priv->children[source], fd, size, offset, 0, &iovec,
                       &count, &iobref, NULL, NULL, NULL);
    if (ret <= 0)
        return ret;

    for (i = 0; i < priv->child_count; i++) {
        if (!healed_sinks[i])
            continue;

            /*
             * TODO: Use fiemap() and discard() to heal holes
             * in the future.
             *
             * For now,
             *
             * - if the source had any holes at all,
             * AND
             * - if we are writing past the original file size
             *   of the sink
             * AND
             * - is NOT the last block of the source file. if
             *   the block contains EOF, it has to be written
             *   in order to set the file size even if the
             *   last block is 0-filled.
             * AND
             * - if the read buffer is filled with only 0's
             *
             * then, skip writing to this source. We don't depend
             * on the write to happen to update the size as we
             * have performed an ftruncate() upfront anyways.
             */
#define is_last_block(o, b, s) ((s >= o) && (s <= (o + b)))
        if (HAS_HOLES((&replies[source].poststat)) &&
            offset >= replies[i].poststat.ia_size &&
            !is_last_block(offset, size, replies[source].poststat.ia_size) &&
            (iov_0filled(iovec, count) == 0))
            continue;

        /* Avoid filling up sparse regions of the sink with 0-filled
         * writes.*/
        if (type == AFR_SELFHEAL_DATA_FULL &&
            HAS_HOLES((&replies[source].poststat)) &&
            ((offset + size) <= replies[i].poststat.ia_size) &&
            (iov_0filled(iovec, count) == 0) &&
            __afr_is_sink_zero_filled(this, fd, size, offset, i)) {
            continue;
        }

        ret = syncop_writev(priv->children[i], fd, iovec, count, offset, iobref,
                            0, NULL, NULL, NULL, NULL);
        if (ret != iov_length(iovec, count)) {
            /* write() failed on this sink. unset the corresponding
               member in sinks[] (which is healed_sinks[] in the
               caller) so that this server does NOT get considered
               as successfully healed.
            */
            healed_sinks[i] = 0;
        }
    }
    if (iovec)
        GF_FREE(iovec);
    if (iobref)
        iobref_unref(iobref);

    return ret;
}
  227 
/* Heal one block under a range inodelk (domain: this->name) covering
 * [offset, offset+size).  Fails with -ENOTCONN unless the lock was
 * acquired on at least as many bricks as there are sinks.  For DIFF
 * heals, first checks whether the block already matches everywhere and
 * can be skipped.  The lock is always released before returning. */
static int
afr_selfheal_data_block(call_frame_t *frame, xlator_t *this, fd_t *fd,
                        int source, unsigned char *healed_sinks, off_t offset,
                        size_t size, int type, struct afr_reply *replies)
{
    int ret = -1;
    int sink_count = 0;
    afr_private_t *priv = NULL;
    unsigned char *data_lock = NULL;

    priv = this->private;
    sink_count = AFR_COUNT(healed_sinks, priv->child_count);
    data_lock = alloca0(priv->child_count);

    ret = afr_selfheal_inodelk(frame, this, fd->inode, this->name, offset, size,
                               data_lock);
    {
        /* afr_selfheal_inodelk() returns the number of bricks locked;
           healing with fewer locks than sinks risks inconsistency. */
        if (ret < sink_count) {
            ret = -ENOTCONN;
            goto unlock;
        }

        if (type == AFR_SELFHEAL_DATA_DIFF &&
            __afr_can_skip_data_block_heal(frame, this, fd, source,
                                           healed_sinks, offset, size,
                                           &replies[source].poststat)) {
            ret = 0;
            goto unlock;
        }

        ret = __afr_selfheal_data_read_write(
            frame, this, fd, source, healed_sinks, offset, size, replies, type);
    }
unlock:
    afr_selfheal_uninodelk(frame, this, fd->inode, this->name, offset, size,
                           data_lock);
    return ret;
}
  266 
  267 static int
  268 afr_selfheal_data_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd,
  269                         unsigned char *healed_sinks)
  270 {
  271     afr_local_t *local = NULL;
  272     afr_private_t *priv = NULL;
  273     int i = 0;
  274 
  275     local = frame->local;
  276     priv = this->private;
  277 
  278     if (!priv->ensure_durability)
  279         return 0;
  280 
  281     AFR_ONLIST(healed_sinks, frame, afr_sh_generic_fop_cbk, fsync, fd, 0, NULL);
  282 
  283     for (i = 0; i < priv->child_count; i++)
  284         if (healed_sinks[i] && local->replies[i].op_ret != 0)
  285             /* fsync() failed. Do NOT consider this server
  286                as successfully healed. Mark it so.
  287             */
  288             healed_sinks[i] = 0;
  289     return 0;
  290 }
  291 
  292 static int
  293 afr_data_self_heal_type_get(afr_private_t *priv, unsigned char *healed_sinks,
  294                             int source, struct afr_reply *replies)
  295 {
  296     int type = AFR_SELFHEAL_DATA_FULL;
  297     int i = 0;
  298 
  299     if (priv->data_self_heal_algorithm == AFR_SELFHEAL_DATA_DYNAMIC) {
  300         type = AFR_SELFHEAL_DATA_FULL;
  301         for (i = 0; i < priv->child_count; i++) {
  302             if (!healed_sinks[i] && i != source)
  303                 continue;
  304             if (replies[i].poststat.ia_size) {
  305                 type = AFR_SELFHEAL_DATA_DIFF;
  306                 break;
  307             }
  308         }
  309     } else {
  310         type = priv->data_self_heal_algorithm;
  311     }
  312     return type;
  313 }
  314 
/* Drive the actual data heal: walk the source file in windows of
 * (128KB * data-self-heal-window-size) and heal each block with
 * afr_selfheal_data_block(), then fsync the sinks.  The arbiter brick
 * never holds data, so it is temporarily masked out of healed_sinks[]
 * for the duration and restored on exit.  Each iteration runs on a
 * copied frame that is reset between blocks. */
static int
afr_selfheal_data_do(call_frame_t *frame, xlator_t *this, fd_t *fd, int source,
                     unsigned char *healed_sinks, struct afr_reply *replies)
{
    afr_private_t *priv = NULL;
    off_t off = 0;
    size_t block = 0;
    int type = AFR_SELFHEAL_DATA_FULL;
    int ret = -1;
    call_frame_t *iter_frame = NULL;
    unsigned char arbiter_sink_status = 0;

    priv = this->private;
    if (priv->arbiter_count) {
        /* Remember whether the arbiter was a sink; data writes must not
           go to it, but the caller still needs the original mask. */
        arbiter_sink_status = healed_sinks[ARBITER_BRICK_INDEX];
        healed_sinks[ARBITER_BRICK_INDEX] = 0;
    }

    block = 128 * 1024 * priv->data_self_heal_window_size;

    type = afr_data_self_heal_type_get(priv, healed_sinks, source, replies);

    iter_frame = afr_copy_frame(frame);
    if (!iter_frame) {
        ret = -ENOMEM;
        goto out;
    }

    for (off = 0; off < replies[source].poststat.ia_size; off += block) {
        /* All sinks may have dropped out due to failed writes; nothing
           left to heal to. */
        if (AFR_COUNT(healed_sinks, priv->child_count) == 0) {
            ret = -ENOTCONN;
            goto out;
        }

        ret = afr_selfheal_data_block(iter_frame, this, fd, source,
                                      healed_sinks, off, block, type, replies);
        if (ret < 0)
            goto out;

        /* Reset the copied frame for the next block; a NULL local means
           the reset failed to reallocate state. */
        AFR_STACK_RESET(iter_frame);
        if (iter_frame->local == NULL) {
            ret = -ENOTCONN;
            goto out;
        }
    }

    ret = afr_selfheal_data_fsync(frame, this, fd, healed_sinks);

out:
    if (arbiter_sink_status)
        healed_sinks[ARBITER_BRICK_INDEX] = arbiter_sink_status;

    if (iter_frame)
        AFR_STACK_DESTROY(iter_frame);
    return ret;
}
  371 
  372 static int
  373 __afr_selfheal_truncate_sinks(call_frame_t *frame, xlator_t *this, fd_t *fd,
  374                               unsigned char *healed_sinks, uint64_t size)
  375 {
  376     afr_local_t *local = NULL;
  377     afr_private_t *priv = NULL;
  378     int i = 0;
  379 
  380     local = frame->local;
  381     priv = this->private;
  382 
  383     /* This will send truncate on the arbiter brick as well if it is marked as
  384      * sink. If changelog is enabled on the volume it captures truncate as a
  385      * data transactions on the arbiter brick. This will help geo-rep to
  386      * properly sync the data from master to slave if arbiter is the ACTIVE
  387      * brick during syncing and which had got some entries healed for data as
  388      * part of self heal.
  389      */
  390     AFR_ONLIST(healed_sinks, frame, afr_sh_generic_fop_cbk, ftruncate, fd, size,
  391                NULL);
  392 
  393     for (i = 0; i < priv->child_count; i++)
  394         if (healed_sinks[i] && local->replies[i].op_ret == -1)
  395             /* truncate() failed. Do NOT consider this server
  396                as successfully healed. Mark it so.
  397             */
  398             healed_sinks[i] = 0;
  399 
  400     return 0;
  401 }
  402 
  403 gf_boolean_t
  404 afr_has_source_witnesses(xlator_t *this, unsigned char *sources,
  405                          uint64_t *witness)
  406 {
  407     int i = 0;
  408     afr_private_t *priv = NULL;
  409 
  410     priv = this->private;
  411 
  412     for (i = 0; i < priv->child_count; i++) {
  413         if (sources[i] && witness[i])
  414             return _gf_true;
  415     }
  416     return _gf_false;
  417 }
  418 
  419 static gf_boolean_t
  420 afr_does_size_mismatch(xlator_t *this, unsigned char *sources,
  421                        struct afr_reply *replies)
  422 {
  423     int i = 0;
  424     afr_private_t *priv = NULL;
  425     struct iatt *min = NULL;
  426     struct iatt *max = NULL;
  427 
  428     priv = this->private;
  429 
  430     for (i = 0; i < priv->child_count; i++) {
  431         if (!replies[i].valid)
  432             continue;
  433 
  434         if (replies[i].op_ret < 0)
  435             continue;
  436 
  437         if (!sources[i])
  438             continue;
  439 
  440         if (AFR_IS_ARBITER_BRICK(priv, i) && (replies[i].poststat.ia_size == 0))
  441             continue;
  442 
  443         if (!min)
  444             min = &replies[i].poststat;
  445 
  446         if (!max)
  447             max = &replies[i].poststat;
  448 
  449         if (min->ia_size > replies[i].poststat.ia_size)
  450             min = &replies[i].poststat;
  451 
  452         if (max->ia_size < replies[i].poststat.ia_size)
  453             max = &replies[i].poststat;
  454     }
  455 
  456     if (min && max) {
  457         if (min->ia_size != max->ia_size)
  458             return _gf_true;
  459     }
  460 
  461     return _gf_false;
  462 }
  463 
  464 static void
  465 afr_mark_biggest_witness_as_source(xlator_t *this, unsigned char *sources,
  466                                    uint64_t *witness)
  467 {
  468     int i = 0;
  469     afr_private_t *priv = NULL;
  470     uint64_t biggest_witness = 0;
  471 
  472     priv = this->private;
  473     /* Find source with biggest witness count */
  474     for (i = 0; i < priv->child_count; i++) {
  475         if (!sources[i])
  476             continue;
  477         if (biggest_witness < witness[i])
  478             biggest_witness = witness[i];
  479     }
  480 
  481     /* Mark files with less witness count as not source */
  482     for (i = 0; i < priv->child_count; i++) {
  483         if (!sources[i])
  484             continue;
  485         if (witness[i] < biggest_witness)
  486             sources[i] = 0;
  487     }
  488 
  489     return;
  490 }
  491 
  492 /* This is a tie breaker function. Only one source be assigned here */
  493 static void
  494 afr_mark_newest_file_as_source(xlator_t *this, unsigned char *sources,
  495                                struct afr_reply *replies)
  496 {
  497     int i = 0;
  498     afr_private_t *priv = NULL;
  499     int source = -1;
  500     uint32_t max_ctime = 0;
  501 
  502     priv = this->private;
  503     /* Find source with latest ctime */
  504     for (i = 0; i < priv->child_count; i++) {
  505         if (!sources[i])
  506             continue;
  507 
  508         if (max_ctime <= replies[i].poststat.ia_ctime) {
  509             source = i;
  510             max_ctime = replies[i].poststat.ia_ctime;
  511         }
  512     }
  513 
  514     /* Only mark one of the files as source to break ties */
  515     memset(sources, 0, sizeof(*sources) * priv->child_count);
  516     sources[source] = 1;
  517 }
  518 
/* Narrow down the sources[] set to the final heal source.
 *
 * Handles split brain first (no sources, or no sinks left among the
 * locked bricks), delegating to the split-brain policy and emitting an
 * event on failure.  Otherwise, when sizes or witnesses disagree, the
 * source set is successively filtered: largest file, biggest witness
 * count, then newest ctime as the final tie breaker.  Also fills in
 * healed_sinks[] via afr_mark_active_sinks().
 *
 * Returns the chosen source index, or -EIO on (unresolvable) split
 * brain / abort. */
static int
__afr_selfheal_data_finalize_source(
    call_frame_t *frame, xlator_t *this, inode_t *inode, unsigned char *sources,
    unsigned char *sinks, unsigned char *healed_sinks, unsigned char *locked_on,
    unsigned char *undid_pending, struct afr_reply *replies, uint64_t *witness)
{
    afr_private_t *priv = NULL;
    int source = -1;
    int sources_count = 0;
    priv = this->private;

    sources_count = AFR_COUNT(sources, priv->child_count);

    if ((AFR_CMP(locked_on, healed_sinks, priv->child_count) == 0) ||
        !sources_count) {
        /* split brain */
        source = afr_mark_split_brain_source_sinks(
            frame, this, inode, sources, sinks, healed_sinks, locked_on,
            replies, AFR_DATA_TRANSACTION);
        if (source < 0) {
            gf_event(EVENT_AFR_SPLIT_BRAIN,
                     "client-pid=%d;"
                     "subvol=%s;type=data;"
                     "file=%s",
                     this->ctx->cmd_args.client_pid, this->name,
                     uuid_utoa(inode->gfid));
            return -EIO;
        }

        /* A favorite-child policy resolved the split brain; reset the
           xattrs on the sinks accordingly. */
        _afr_fav_child_reset_sink_xattrs(
            frame, this, inode, source, healed_sinks, undid_pending,
            AFR_DATA_TRANSACTION, locked_on, replies);
        goto out;
    }

    /* No split brain at this point. If we were called from
     * afr_heal_splitbrain_file(), abort.*/
    if (afr_dict_contains_heal_op(frame))
        return -EIO;

    /* If there are no witnesses/size-mismatches on sources we are done*/
    if (!afr_does_size_mismatch(this, sources, replies) &&
        !afr_has_source_witnesses(this, sources, witness))
        goto out;

    afr_mark_largest_file_as_source(this, sources, replies);
    afr_mark_biggest_witness_as_source(this, sources, witness);
    afr_mark_newest_file_as_source(this, sources, replies);
    if (priv->arbiter_count)
        /* Choose non-arbiter brick as source for empty files. */
        afr_mark_source_sinks_if_file_empty(this, sources, sinks, healed_sinks,
                                            locked_on, replies,
                                            AFR_DATA_TRANSACTION);

out:
    afr_mark_active_sinks(this, sources, locked_on, healed_sinks);
    source = afr_choose_source_by_policy(priv, sources, AFR_DATA_TRANSACTION);

    return source;
}
  579 
/*
 * __afr_selfheal_data_prepare:
 *
 * This function inspects the on-disk xattrs and determines which subvols
 * are sources and sinks.
 *
 * The return value is the index of the subvolume to be used as the source
 * for self-healing, or -1 if no healing is necessary/split brain.
 *
 * Fills in sources[], sinks[], healed_sinks[] (sinks intersected with
 * locked_on[]) and replies[]; witness counts are gathered locally and
 * fed into the source-finalization logic.
 */
int
__afr_selfheal_data_prepare(call_frame_t *frame, xlator_t *this, inode_t *inode,
                            unsigned char *locked_on, unsigned char *sources,
                            unsigned char *sinks, unsigned char *healed_sinks,
                            unsigned char *undid_pending,
                            struct afr_reply *replies, unsigned char *pflag)
{
    int ret = -1;
    int source = -1;
    afr_private_t *priv = NULL;
    uint64_t *witness = NULL;

    priv = this->private;

    /* Lookup on all children to gather fresh iatts and xattrs. */
    ret = afr_selfheal_unlocked_discover(frame, inode, inode->gfid, replies);

    if (ret)
        return ret;

    witness = alloca0(priv->child_count * sizeof(*witness));
    ret = afr_selfheal_find_direction(frame, this, replies,
                                      AFR_DATA_TRANSACTION, locked_on, sources,
                                      sinks, witness, pflag);
    if (ret)
        return ret;

    /* Initialize the healed_sinks[] array optimistically to
       the intersection of to-be-healed (i.e sinks[]) and
       the list of servers which are up (i.e locked_on[]).
       As we encounter failures in the healing process, we
       will unmark the respective servers in the healed_sinks[]
       array.
    */
    AFR_INTERSECT(healed_sinks, sinks, locked_on, priv->child_count);

    source = __afr_selfheal_data_finalize_source(
        frame, this, inode, sources, sinks, healed_sinks, locked_on,
        undid_pending, replies, witness);
    if (source < 0)
        return -EIO;

    return source;
}
  632 
/* Core data self-heal, called with the tie-breaker lock already held.
 *
 * Flow: take a full-file inodelk on all children (domain this->name),
 * determine source/sinks, truncate the sinks, release the lock, copy
 * the data block-by-block (blocks take their own range locks), restore
 * timestamps, then re-acquire the full lock to undo the pending-xattr
 * changelog.  An arbiter chosen as source is only acceptable for files
 * empty everywhere; an arbiter as the only sink skips the data copy.
 *
 * Returns 0 on successful heal, 1 when no heal was needed/performed,
 * negative errno on failure. */
static int
__afr_selfheal_data(call_frame_t *frame, xlator_t *this, fd_t *fd,
                    unsigned char *locked_on)
{
    afr_private_t *priv = NULL;
    int ret = -1;
    unsigned char *sources = NULL;
    unsigned char *sinks = NULL;
    unsigned char *data_lock = NULL;
    unsigned char *healed_sinks = NULL;
    unsigned char *undid_pending = NULL;
    struct afr_reply *locked_replies = NULL;
    int source = -1;
    gf_boolean_t did_sh = _gf_true;
    gf_boolean_t is_arbiter_the_only_sink = _gf_false;
    gf_boolean_t empty_file = _gf_false;

    priv = this->private;

    sources = alloca0(priv->child_count);
    sinks = alloca0(priv->child_count);
    healed_sinks = alloca0(priv->child_count);
    data_lock = alloca0(priv->child_count);
    undid_pending = alloca0(priv->child_count);

    locked_replies = alloca0(sizeof(*locked_replies) * priv->child_count);

    /* Full-file lock: offset 0, length 0 means "whole file". */
    ret = afr_selfheal_inodelk(frame, this, fd->inode, this->name, 0, 0,
                               data_lock);
    {
        if (ret < priv->child_count) {
            gf_msg_debug(this->name, 0,
                         "%s: Skipping "
                         "self-heal as only %d number "
                         "of subvolumes "
                         "could be locked",
                         uuid_utoa(fd->inode->gfid), ret);
            ret = -ENOTCONN;
            goto unlock;
        }

        ret = __afr_selfheal_data_prepare(frame, this, fd->inode, data_lock,
                                          sources, sinks, healed_sinks,
                                          undid_pending, locked_replies, NULL);
        if (ret < 0)
            goto unlock;

        if (AFR_COUNT(healed_sinks, priv->child_count) == 0) {
            did_sh = _gf_false;
            goto unlock;
        }

        /* On success, __afr_selfheal_data_prepare() returns the source
           index. */
        source = ret;

        if (AFR_IS_ARBITER_BRICK(priv, source)) {
            /* An arbiter can only be "source" when the file is empty on
               every brick; then only timestamps need restoring. */
            empty_file = afr_is_file_empty_on_all_children(priv,
                                                           locked_replies);
            if (empty_file)
                goto restore_time;

            did_sh = _gf_false;
            goto unlock;
        }

        ret = __afr_selfheal_truncate_sinks(
            frame, this, fd, healed_sinks,
            locked_replies[source].poststat.ia_size);
        if (ret < 0)
            goto unlock;

        /* If the arbiter is the only remaining sink, there is no data
           to copy — go straight to restoring timestamps. */
        if (priv->arbiter_count &&
            AFR_COUNT(healed_sinks, priv->child_count) == 1 &&
            healed_sinks[ARBITER_BRICK_INDEX]) {
            is_arbiter_the_only_sink = _gf_true;
            goto restore_time;
        }
        ret = 0;
    }
unlock:
    afr_selfheal_uninodelk(frame, this, fd->inode, this->name, 0, 0, data_lock);
    if (ret < 0)
        goto out;

    if (!did_sh)
        goto out;

    /* Data copy happens outside the full-file lock; each block takes
       its own range lock in afr_selfheal_data_block(). */
    ret = afr_selfheal_data_do(frame, this, fd, source, healed_sinks,
                               locked_replies);
    if (ret)
        goto out;
restore_time:
    afr_selfheal_restore_time(frame, this, fd->inode, source, healed_sinks,
                              locked_replies);

    /* Re-acquire the full lock before undoing the pending xattrs —
       unless we still hold it (arbiter-only-sink / empty-file paths
       jumped here without unlocking). */
    if (!is_arbiter_the_only_sink && !empty_file) {
        ret = afr_selfheal_inodelk(frame, this, fd->inode, this->name, 0, 0,
                                   data_lock);
        if (ret < priv->child_count) {
            ret = -ENOTCONN;
            did_sh = _gf_false;
            goto skip_undo_pending;
        }
    }
    ret = afr_selfheal_undo_pending(
        frame, this, fd->inode, sources, sinks, healed_sinks, undid_pending,
        AFR_DATA_TRANSACTION, locked_replies, data_lock);
skip_undo_pending:
    afr_selfheal_uninodelk(frame, this, fd->inode, this->name, 0, 0, data_lock);
out:

    if (did_sh)
        afr_log_selfheal(fd->inode->gfid, this, ret, "data", source, sources,
                         healed_sinks);
    else
        ret = 1;

    if (locked_replies)
        afr_replies_wipe(locked_replies, priv->child_count);

    return ret;
}
  754 
  755 int
  756 afr_selfheal_data_open_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
  757                            int32_t op_ret, int32_t op_errno, fd_t *fd,
  758                            dict_t *xdata)
  759 {
  760     afr_local_t *local = NULL;
  761     int i = (long)cookie;
  762 
  763     local = frame->local;
  764 
  765     local->replies[i].valid = 1;
  766     local->replies[i].op_ret = op_ret;
  767     local->replies[i].op_errno = op_errno;
  768 
  769     syncbarrier_wake(&local->barrier);
  770 
  771     return 0;
  772 }
  773 
  774 int
  775 afr_selfheal_data_open(xlator_t *this, inode_t *inode, fd_t **fd)
  776 {
  777     int ret = 0;
  778     fd_t *fd_tmp = NULL;
  779     loc_t loc = {
  780         0,
  781     };
  782     call_frame_t *frame = NULL;
  783     afr_local_t *local = NULL;
  784     afr_private_t *priv = NULL;
  785     int i = 0;
  786 
  787     priv = this->private;
  788 
  789     fd_tmp = fd_create(inode, 0);
  790     if (!fd_tmp)
  791         return -ENOMEM;
  792 
  793     loc.inode = inode_ref(inode);
  794     gf_uuid_copy(loc.gfid, inode->gfid);
  795 
  796     frame = afr_frame_create(this, &ret);
  797     if (!frame) {
  798         ret = -ret;
  799         fd_unref(fd_tmp);
  800         goto out;
  801     }
  802     local = frame->local;
  803 
  804     AFR_ONLIST(local->child_up, frame, afr_selfheal_data_open_cbk, open, &loc,
  805                O_RDWR | O_LARGEFILE, fd_tmp, NULL);
  806 
  807     ret = -ENOTCONN;
  808     for (i = 0; i < priv->child_count; i++) {
  809         if (!local->replies[i].valid)
  810             continue;
  811 
  812         if (local->replies[i].op_ret < 0) {
  813             ret = -local->replies[i].op_errno;
  814             continue;
  815         }
  816 
  817         ret = 0;
  818         break;
  819     }
  820 
  821     if (ret < 0) {
  822         fd_unref(fd_tmp);
  823         goto out;
  824     } else {
  825         fd_bind(fd_tmp);
  826     }
  827 
  828     *fd = fd_tmp;
  829 out:
  830     loc_wipe(&loc);
  831     if (frame)
  832         AFR_STACK_DESTROY(frame);
  833     return ret;
  834 }
  835 
/* Entry point for data self-heal on an open fd.
 *
 * Takes the tie-breaker inodelk in the self-heal domain (priv->sh_domain)
 * on all children; this serializes concurrent self-heal attempts from
 * different clients/shd instances.  If fewer than all children could be
 * locked, either bricks are down or another heal is in progress — skip
 * with -ENOTCONN.  Otherwise delegates to __afr_selfheal_data(). */
int
afr_selfheal_data(call_frame_t *frame, xlator_t *this, fd_t *fd)
{
    afr_private_t *priv = NULL;
    unsigned char *locked_on = NULL;
    int ret = 0;
    inode_t *inode = fd->inode;

    priv = this->private;

    locked_on = alloca0(priv->child_count);

    ret = afr_selfheal_tie_breaker_inodelk(frame, this, inode, priv->sh_domain,
                                           0, 0, locked_on);
    {
        if (ret < priv->child_count) {
            gf_msg_debug(this->name, 0,
                         "%s: Skipping "
                         "self-heal as only %d number of "
                         "subvolumes could be locked",
                         uuid_utoa(fd->inode->gfid), ret);
            /* Either less than two subvols available, or another
               selfheal (from another server) is in progress. Skip
               for now in any case there isn't anything to do.
            */
            ret = -ENOTCONN;
            goto unlock;
        }

        ret = __afr_selfheal_data(frame, this, fd, locked_on);
    }
unlock:
    afr_selfheal_uninodelk(frame, this, inode, priv->sh_domain, 0, 0,
                           locked_on);

    return ret;
}