"Fossies" - the Fresh Open Source Software Archive

Member "glusterfs-6.9/xlators/cluster/afr/src/afr-self-heal-common.c" (23 Apr 2020, 79094 Bytes) of package /linux/misc/glusterfs-6.9.tar.gz:


As a special service, "Fossies" has tried to format the requested source page into HTML using (guessed) C and C++ source code syntax highlighting (style: standard), with prefixed line numbers and a code folding option. Alternatively, you can view or download the uninterpreted source code file here. For more information about "afr-self-heal-common.c", see the Fossies "Dox" file reference documentation and the latest Fossies "Diffs" side-by-side code changes report: 6.8_vs_6.9.

    1 /*
    2   Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
    3   This file is part of GlusterFS.
    4 
    5   This file is licensed to you under your choice of the GNU Lesser
    6   General Public License, version 3 or any later version (LGPLv3 or
    7   later), or the GNU General Public License, version 2 (GPLv2), in all
    8   cases as published by the Free Software Foundation.
    9 */
   10 
   11 #include "afr.h"
   12 #include "afr-self-heal.h"
   13 #include <glusterfs/byte-order.h>
   14 #include "protocol-common.h"
   15 #include "afr-messages.h"
   16 #include <glusterfs/events.h>
   17 
   18 void
   19 afr_heal_synctask(xlator_t *this, afr_local_t *local);
   20 
   21 int
   22 afr_lookup_and_heal_gfid(xlator_t *this, inode_t *parent, const char *name,
   23                          inode_t *inode, struct afr_reply *replies, int source,
   24                          unsigned char *sources, void *gfid, int *gfid_idx)
   25 {
   26     afr_private_t *priv = NULL;
   27     call_frame_t *frame = NULL;
   28     afr_local_t *local = NULL;
   29     unsigned char *wind_on = NULL;
   30     ia_type_t ia_type = IA_INVAL;
   31     dict_t *xdata = NULL;
   32     loc_t loc = {
   33         0,
   34     };
   35     int ret = 0;
   36     int i = 0;
   37 
   38     priv = this->private;
   39     wind_on = alloca0(priv->child_count);
   40     if (source >= 0 && replies[source].valid && replies[source].op_ret == 0)
   41         ia_type = replies[source].poststat.ia_type;
   42 
   43     if (ia_type != IA_INVAL)
   44         goto heal;
   45 
   46     /* If ia_type is still invalid, it means either
   47      * (a)'source' was -1, i.e. parent dir pending xattrs are in split-brain
   48      * (or) (b) The parent dir pending xattrs are all zeroes (i.e. all bricks
   49      * are sources) and the 'source' we selected earlier might be the one where
   50      * the file is not actually present.
   51      *
   52      * In both cases, let us pick a brick with a successful reply and use its
   53      * ia_type.
   54      * */
   55     for (i = 0; i < priv->child_count; i++) {
   56         if (source == -1) {
   57             /* case (a) above. */
   58             if (replies[i].valid && replies[i].op_ret == 0) {
   59                 ia_type = replies[i].poststat.ia_type;
   60                 break;
   61             }
   62         } else {
   63             /* case (b) above. */
   64             if (i == source)
   65                 continue;
   66             if (sources[i] && replies[i].valid && replies[i].op_ret == 0) {
   67                 ia_type = replies[i].poststat.ia_type;
   68                 break;
   69             }
   70         }
   71     }
   72 
   73 heal:
   74     /* gfid heal on those subvolumes that do not have gfid associated
   75      * with the inode and update those replies.
   76      */
   77     for (i = 0; i < priv->child_count; i++) {
   78         if (!replies[i].valid || replies[i].op_ret != 0)
   79             continue;
   80         if (!gf_uuid_is_null(replies[i].poststat.ia_gfid) ||
   81             replies[i].poststat.ia_type != ia_type)
   82             continue;
   83 
   84         wind_on[i] = 1;
   85     }
   86 
   87     if (AFR_COUNT(wind_on, priv->child_count) == 0)
   88         return 0;
   89 
   90     xdata = dict_new();
   91     if (!xdata) {
   92         ret = -ENOMEM;
   93         goto out;
   94     }
   95 
   96     ret = dict_set_gfuuid(xdata, "gfid-req", gfid, true);
   97     if (ret) {
   98         ret = -ENOMEM;
   99         goto out;
  100     }
  101 
  102     frame = afr_frame_create(this, &ret);
  103     if (!frame) {
  104         ret = -ret;
  105         goto out;
  106     }
  107 
  108     local = frame->local;
  109     loc.parent = inode_ref(parent);
  110     gf_uuid_copy(loc.pargfid, parent->gfid);
  111     loc.name = name;
  112     loc.inode = inode_ref(inode);
  113 
  114     AFR_ONLIST(wind_on, frame, afr_selfheal_discover_cbk, lookup, &loc, xdata);
  115 
  116     for (i = 0; i < priv->child_count; i++) {
  117         if (!wind_on[i])
  118             continue;
  119         afr_reply_wipe(&replies[i]);
  120         afr_reply_copy(&replies[i], &local->replies[i]);
  121     }
  122     if (gfid_idx && (*gfid_idx == -1)) {
  123         /*Pick a brick where the gifd heal was successful.*/
  124         for (i = 0; i < priv->child_count; i++) {
  125             if (!wind_on[i])
  126                 continue;
  127             if (replies[i].valid && replies[i].op_ret == 0 &&
  128                 !gf_uuid_is_null(replies[i].poststat.ia_gfid)) {
  129                 *gfid_idx = i;
  130                 break;
  131             }
  132         }
  133     }
  134 out:
  135     if (gfid_idx && (*gfid_idx == -1) && (ret == 0)) {
  136         ret = -afr_final_errno(local, priv);
  137     }
  138     loc_wipe(&loc);
  139     if (frame)
  140         AFR_STACK_DESTROY(frame);
  141     if (xdata)
  142         dict_unref(xdata);
  143 
  144     return ret;
  145 }
  146 
  147 int
  148 afr_gfid_sbrain_source_from_src_brick(xlator_t *this, struct afr_reply *replies,
  149                                       char *src_brick)
  150 {
  151     int i = 0;
  152     afr_private_t *priv = NULL;
  153 
  154     priv = this->private;
  155     for (i = 0; i < priv->child_count; i++) {
  156         if (!replies[i].valid || replies[i].op_ret == -1)
  157             continue;
  158         if (strcmp(priv->children[i]->name, src_brick) == 0)
  159             return i;
  160     }
  161     return -1;
  162 }
  163 
  164 int
  165 afr_selfheal_gfid_mismatch_by_majority(struct afr_reply *replies,
  166                                        int child_count)
  167 {
  168     int j = 0;
  169     int i = 0;
  170     int votes;
  171 
  172     for (i = 0; i < child_count; i++) {
  173         if (!replies[i].valid || replies[i].op_ret == -1)
  174             continue;
  175 
  176         votes = 1;
  177         for (j = i + 1; j < child_count; j++) {
  178             if ((!gf_uuid_compare(replies[i].poststat.ia_gfid,
  179                                   replies[j].poststat.ia_gfid)))
  180                 votes++;
  181             if (votes > child_count / 2)
  182                 return i;
  183         }
  184     }
  185 
  186     return -1;
  187 }
  188 
  189 int
  190 afr_gfid_sbrain_source_from_bigger_file(struct afr_reply *replies,
  191                                         int child_count)
  192 {
  193     int i = 0;
  194     int src = -1;
  195     uint64_t size = 0;
  196 
  197     for (i = 0; i < child_count; i++) {
  198         if (!replies[i].valid || replies[i].op_ret == -1)
  199             continue;
  200         if (size < replies[i].poststat.ia_size) {
  201             src = i;
  202             size = replies[i].poststat.ia_size;
  203         } else if (replies[i].poststat.ia_size == size) {
  204             src = -1;
  205         }
  206     }
  207     return src;
  208 }
  209 
  210 int
  211 afr_gfid_sbrain_source_from_latest_mtime(struct afr_reply *replies,
  212                                          int child_count)
  213 {
  214     int i = 0;
  215     int src = -1;
  216     uint32_t mtime = 0;
  217     uint32_t mtime_nsec = 0;
  218 
  219     for (i = 0; i < child_count; i++) {
  220         if (!replies[i].valid || replies[i].op_ret != 0)
  221             continue;
  222         if ((mtime < replies[i].poststat.ia_mtime) ||
  223             ((mtime == replies[i].poststat.ia_mtime) &&
  224              (mtime_nsec < replies[i].poststat.ia_mtime_nsec))) {
  225             src = i;
  226             mtime = replies[i].poststat.ia_mtime;
  227             mtime_nsec = replies[i].poststat.ia_mtime_nsec;
  228         } else if ((mtime == replies[i].poststat.ia_mtime) &&
  229                    (mtime_nsec == replies[i].poststat.ia_mtime_nsec)) {
  230             src = -1;
  231         }
  232     }
  233     return src;
  234 }
  235 
  236 int
  237 afr_gfid_split_brain_source(xlator_t *this, struct afr_reply *replies,
  238                             inode_t *inode, uuid_t pargfid, const char *bname,
  239                             int src_idx, int child_idx,
  240                             unsigned char *locked_on, int *src, dict_t *xdata)
  241 {
  242     afr_private_t *priv = NULL;
  243     char g1[64] = {
  244         0,
  245     };
  246     char g2[64] = {
  247         0,
  248     };
  249     int up_count = 0;
  250     int heal_op = -1;
  251     int ret = -1;
  252     char *src_brick = NULL;
  253 
  254     *src = -1;
  255     priv = this->private;
  256     up_count = AFR_COUNT(locked_on, priv->child_count);
  257     if (up_count != priv->child_count) {
  258         gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN,
  259                "All the bricks should be up to resolve the gfid split "
  260                "barin");
  261         if (xdata) {
  262             ret = dict_set_sizen_str_sizen(xdata, "gfid-heal-msg",
  263                                            SALL_BRICKS_UP_TO_RESOLVE);
  264             if (ret)
  265                 gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_DICT_SET_FAILED,
  266                        "Error setting"
  267                        " gfid-heal-msg dict");
  268         }
  269         goto out;
  270     }
  271 
  272     if (xdata) {
  273         ret = dict_get_int32_sizen(xdata, "heal-op", &heal_op);
  274         if (ret)
  275             goto fav_child;
  276     } else {
  277         goto fav_child;
  278     }
  279 
  280     switch (heal_op) {
  281         case GF_SHD_OP_SBRAIN_HEAL_FROM_BIGGER_FILE:
  282             *src = afr_gfid_sbrain_source_from_bigger_file(replies,
  283                                                            priv->child_count);
  284             if (*src == -1) {
  285                 gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN,
  286                        SNO_BIGGER_FILE);
  287                 if (xdata) {
  288                     ret = dict_set_sizen_str_sizen(xdata, "gfid-heal-msg",
  289                                                    SNO_BIGGER_FILE);
  290                     if (ret)
  291                         gf_msg(this->name, GF_LOG_ERROR, 0,
  292                                AFR_MSG_DICT_SET_FAILED,
  293                                "Error"
  294                                " setting gfid-heal-msg dict");
  295                 }
  296             }
  297             break;
  298 
  299         case GF_SHD_OP_SBRAIN_HEAL_FROM_LATEST_MTIME:
  300             *src = afr_gfid_sbrain_source_from_latest_mtime(replies,
  301                                                             priv->child_count);
  302             if (*src == -1) {
  303                 gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN,
  304                        SNO_DIFF_IN_MTIME);
  305                 if (xdata) {
  306                     ret = dict_set_sizen_str_sizen(xdata, "gfid-heal-msg",
  307                                                    SNO_DIFF_IN_MTIME);
  308                     if (ret)
  309                         gf_msg(this->name, GF_LOG_ERROR, 0,
  310                                AFR_MSG_DICT_SET_FAILED,
  311                                "Error"
  312                                "setting gfid-heal-msg dict");
  313                 }
  314             }
  315             break;
  316 
  317         case GF_SHD_OP_SBRAIN_HEAL_FROM_BRICK:
  318             ret = dict_get_str_sizen(xdata, "child-name", &src_brick);
  319             if (ret) {
  320                 gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN,
  321                        "Error getting the source "
  322                        "brick");
  323                 break;
  324             }
  325             *src = afr_gfid_sbrain_source_from_src_brick(this, replies,
  326                                                          src_brick);
  327             if (*src == -1) {
  328                 gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN,
  329                        SERROR_GETTING_SRC_BRICK);
  330                 if (xdata) {
  331                     ret = dict_set_sizen_str_sizen(xdata, "gfid-heal-msg",
  332                                                    SERROR_GETTING_SRC_BRICK);
  333                     if (ret)
  334                         gf_msg(this->name, GF_LOG_ERROR, 0,
  335                                AFR_MSG_DICT_SET_FAILED,
  336                                "Error"
  337                                " setting gfid-heal-msg dict");
  338                 }
  339             }
  340             break;
  341 
  342         default:
  343             break;
  344     }
  345     goto out;
  346 
  347 fav_child:
  348     switch (priv->fav_child_policy) {
  349         case AFR_FAV_CHILD_BY_SIZE:
  350             *src = afr_sh_fav_by_size(this, replies, inode);
  351             break;
  352         case AFR_FAV_CHILD_BY_MTIME:
  353             *src = afr_sh_fav_by_mtime(this, replies, inode);
  354             break;
  355         case AFR_FAV_CHILD_BY_CTIME:
  356             *src = afr_sh_fav_by_ctime(this, replies, inode);
  357             break;
  358         case AFR_FAV_CHILD_BY_MAJORITY:
  359             if (priv->child_count != 2)
  360                 *src = afr_selfheal_gfid_mismatch_by_majority(
  361                     replies, priv->child_count);
  362             else
  363                 *src = -1;
  364 
  365             if (*src == -1) {
  366                 gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN,
  367                        "No majority to resolve "
  368                        "gfid split brain");
  369             }
  370             break;
  371         default:
  372             break;
  373     }
  374 
  375 out:
  376     if (*src == -1) {
  377         gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN,
  378                "Gfid mismatch detected for <gfid:%s>/%s>, %s on %s and"
  379                " %s on %s.",
  380                uuid_utoa(pargfid), bname,
  381                uuid_utoa_r(replies[child_idx].poststat.ia_gfid, g1),
  382                priv->children[child_idx]->name,
  383                uuid_utoa_r(replies[src_idx].poststat.ia_gfid, g2),
  384                priv->children[src_idx]->name);
  385         gf_event(EVENT_AFR_SPLIT_BRAIN,
  386                  "client-pid=%d;"
  387                  "subvol=%s;type=gfid;file="
  388                  "<gfid:%s>/%s>;count=2;child-%d=%s;gfid-%d=%s;"
  389                  "child-%d=%s;gfid-%d=%s",
  390                  this->ctx->cmd_args.client_pid, this->name, uuid_utoa(pargfid),
  391                  bname, child_idx, priv->children[child_idx]->name, child_idx,
  392                  uuid_utoa_r(replies[child_idx].poststat.ia_gfid, g1), src_idx,
  393                  priv->children[src_idx]->name, src_idx,
  394                  uuid_utoa_r(replies[src_idx].poststat.ia_gfid, g2));
  395         return -1;
  396     }
  397     return 0;
  398 }
  399 
  400 int
  401 afr_selfheal_post_op_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
  402                          int op_ret, int op_errno, dict_t *xattr, dict_t *xdata)
  403 {
  404     afr_local_t *local = NULL;
  405 
  406     local = frame->local;
  407 
  408     local->op_ret = op_ret;
  409     local->op_errno = op_errno;
  410     syncbarrier_wake(&local->barrier);
  411 
  412     return 0;
  413 }
  414 
  415 int
  416 afr_selfheal_post_op(call_frame_t *frame, xlator_t *this, inode_t *inode,
  417                      int subvol, dict_t *xattr, dict_t *xdata)
  418 {
  419     afr_private_t *priv = NULL;
  420     afr_local_t *local = NULL;
  421     loc_t loc = {
  422         0,
  423     };
  424     int ret = 0;
  425 
  426     priv = this->private;
  427     local = frame->local;
  428 
  429     loc.inode = inode_ref(inode);
  430     gf_uuid_copy(loc.gfid, inode->gfid);
  431 
  432     local->op_ret = 0;
  433 
  434     STACK_WIND(frame, afr_selfheal_post_op_cbk, priv->children[subvol],
  435                priv->children[subvol]->fops->xattrop, &loc,
  436                GF_XATTROP_ADD_ARRAY, xattr, xdata);
  437 
  438     syncbarrier_wait(&local->barrier, 1);
  439     if (local->op_ret < 0)
  440         ret = -local->op_errno;
  441 
  442     loc_wipe(&loc);
  443     local->op_ret = 0;
  444 
  445     return ret;
  446 }
  447 
  448 int
  449 afr_check_stale_error(struct afr_reply *replies, afr_private_t *priv)
  450 {
  451     int i = 0;
  452     int op_errno = 0;
  453     int tmp_errno = 0;
  454     int stale_count = 0;
  455 
  456     for (i = 0; i < priv->child_count; i++) {
  457         tmp_errno = replies[i].op_errno;
  458         if (tmp_errno == ENOENT || tmp_errno == ESTALE) {
  459             op_errno = afr_higher_errno(op_errno, tmp_errno);
  460             stale_count++;
  461         }
  462     }
  463     if (stale_count != priv->child_count)
  464         return -ENOTCONN;
  465     else
  466         return -op_errno;
  467 }
  468 
  469 int
  470 afr_sh_generic_fop_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
  471                        int op_ret, int op_errno, struct iatt *pre,
  472                        struct iatt *post, dict_t *xdata)
  473 {
  474     int i = (long)cookie;
  475     afr_local_t *local = NULL;
  476 
  477     local = frame->local;
  478 
  479     local->replies[i].valid = 1;
  480     local->replies[i].op_ret = op_ret;
  481     local->replies[i].op_errno = op_errno;
  482     if (pre)
  483         local->replies[i].prestat = *pre;
  484     if (post)
  485         local->replies[i].poststat = *post;
  486     if (xdata)
  487         local->replies[i].xdata = dict_ref(xdata);
  488 
  489     syncbarrier_wake(&local->barrier);
  490 
  491     return 0;
  492 }
  493 
  494 int
  495 afr_selfheal_restore_time(call_frame_t *frame, xlator_t *this, inode_t *inode,
  496                           int source, unsigned char *healed_sinks,
  497                           struct afr_reply *replies)
  498 {
  499     loc_t loc = {
  500         0,
  501     };
  502 
  503     loc.inode = inode_ref(inode);
  504     gf_uuid_copy(loc.gfid, inode->gfid);
  505 
  506     AFR_ONLIST(healed_sinks, frame, afr_sh_generic_fop_cbk, setattr, &loc,
  507                &replies[source].poststat,
  508                (GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME | GF_SET_ATTR_CTIME),
  509                NULL);
  510 
  511     loc_wipe(&loc);
  512 
  513     return 0;
  514 }
  515 
  516 dict_t *
  517 afr_selfheal_output_xattr(xlator_t *this, gf_boolean_t is_full_crawl,
  518                           afr_transaction_type type, int *output_dirty,
  519                           int **output_matrix, int subvol,
  520                           int **full_heal_mtx_out)
  521 {
  522     int j = 0;
  523     int idx = 0;
  524     int d_idx = 0;
  525     int ret = 0;
  526     int *raw = 0;
  527     dict_t *xattr = NULL;
  528     afr_private_t *priv = NULL;
  529 
  530     priv = this->private;
  531     idx = afr_index_for_transaction_type(type);
  532     d_idx = afr_index_for_transaction_type(AFR_DATA_TRANSACTION);
  533 
  534     xattr = dict_new();
  535     if (!xattr)
  536         return NULL;
  537 
  538     /* clear dirty */
  539     raw = GF_CALLOC(sizeof(int), AFR_NUM_CHANGE_LOGS, gf_afr_mt_int32_t);
  540     if (!raw)
  541         goto err;
  542 
  543     raw[idx] = hton32(output_dirty[subvol]);
  544     ret = dict_set_bin(xattr, AFR_DIRTY, raw,
  545                        sizeof(int) * AFR_NUM_CHANGE_LOGS);
  546     if (ret) {
  547         GF_FREE(raw);
  548         goto err;
  549     }
  550 
  551     /* clear/set pending */
  552     for (j = 0; j < priv->child_count; j++) {
  553         raw = GF_CALLOC(sizeof(int), AFR_NUM_CHANGE_LOGS, gf_afr_mt_int32_t);
  554         if (!raw)
  555             goto err;
  556 
  557         raw[idx] = hton32(output_matrix[subvol][j]);
  558         if (is_full_crawl)
  559             raw[d_idx] = hton32(full_heal_mtx_out[subvol][j]);
  560 
  561         ret = dict_set_bin(xattr, priv->pending_key[j], raw,
  562                            sizeof(int) * AFR_NUM_CHANGE_LOGS);
  563         if (ret) {
  564             GF_FREE(raw);
  565             goto err;
  566         }
  567     }
  568 
  569     return xattr;
  570 err:
  571     if (xattr)
  572         dict_unref(xattr);
  573     return NULL;
  574 }
  575 
  576 int
  577 afr_selfheal_undo_pending(call_frame_t *frame, xlator_t *this, inode_t *inode,
  578                           unsigned char *sources, unsigned char *sinks,
  579                           unsigned char *healed_sinks,
  580                           unsigned char *undid_pending,
  581                           afr_transaction_type type, struct afr_reply *replies,
  582                           unsigned char *locked_on)
  583 {
  584     afr_private_t *priv = NULL;
  585     afr_local_t *local = NULL;
  586     int i = 0;
  587     int j = 0;
  588     unsigned char *pending = NULL;
  589     int *input_dirty = NULL;
  590     int **input_matrix = NULL;
  591     int **full_heal_mtx_in = NULL;
  592     int **full_heal_mtx_out = NULL;
  593     int *output_dirty = NULL;
  594     int **output_matrix = NULL;
  595     dict_t *xattr = NULL;
  596     dict_t *xdata = NULL;
  597 
  598     priv = this->private;
  599     local = frame->local;
  600 
  601     pending = alloca0(priv->child_count);
  602 
  603     input_dirty = alloca0(priv->child_count * sizeof(int));
  604     input_matrix = ALLOC_MATRIX(priv->child_count, int);
  605     full_heal_mtx_in = ALLOC_MATRIX(priv->child_count, int);
  606     full_heal_mtx_out = ALLOC_MATRIX(priv->child_count, int);
  607     output_dirty = alloca0(priv->child_count * sizeof(int));
  608     output_matrix = ALLOC_MATRIX(priv->child_count, int);
  609 
  610     xdata = dict_new();
  611     if (!xdata)
  612         return -1;
  613 
  614     afr_selfheal_extract_xattr(this, replies, type, input_dirty, input_matrix);
  615 
  616     if (local->need_full_crawl)
  617         afr_selfheal_extract_xattr(this, replies, AFR_DATA_TRANSACTION, NULL,
  618                                    full_heal_mtx_in);
  619 
  620     for (i = 0; i < priv->child_count; i++)
  621         if (sinks[i] && !healed_sinks[i])
  622             pending[i] = 1;
  623 
  624     for (i = 0; i < priv->child_count; i++) {
  625         for (j = 0; j < priv->child_count; j++) {
  626             if (pending[j]) {
  627                 output_matrix[i][j] = 1;
  628                 if (type == AFR_ENTRY_TRANSACTION)
  629                     full_heal_mtx_out[i][j] = 1;
  630             } else if (locked_on[j]) {
  631                 output_matrix[i][j] = -input_matrix[i][j];
  632                 if (type == AFR_ENTRY_TRANSACTION)
  633                     full_heal_mtx_out[i][j] = -full_heal_mtx_in[i][j];
  634             }
  635         }
  636     }
  637 
  638     for (i = 0; i < priv->child_count; i++) {
  639         if (!pending[i])
  640             output_dirty[i] = -input_dirty[i];
  641     }
  642 
  643     for (i = 0; i < priv->child_count; i++) {
  644         if (!locked_on[i])
  645             /* perform post-op only on subvols we had locked
  646                and inspected on.
  647             */
  648             continue;
  649         if (undid_pending[i])
  650             /* We already unset the pending xattrs in
  651              * _afr_fav_child_reset_sink_xattrs(). */
  652             continue;
  653 
  654         xattr = afr_selfheal_output_xattr(this, local->need_full_crawl, type,
  655                                           output_dirty, output_matrix, i,
  656                                           full_heal_mtx_out);
  657         if (!xattr) {
  658             continue;
  659         }
  660 
  661         if ((type == AFR_ENTRY_TRANSACTION) && (priv->esh_granular)) {
  662             if (xdata && dict_set_int8(xdata, GF_XATTROP_PURGE_INDEX, 1))
  663                 gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_DICT_SET_FAILED,
  664                        "Failed to set"
  665                        " dict value for %s",
  666                        GF_XATTROP_PURGE_INDEX);
  667         }
  668 
  669         afr_selfheal_post_op(frame, this, inode, i, xattr, xdata);
  670         dict_unref(xattr);
  671     }
  672 
  673     if (xdata)
  674         dict_unref(xdata);
  675 
  676     return 0;
  677 }
  678 
  679 void
  680 afr_reply_copy(struct afr_reply *dst, struct afr_reply *src)
  681 {
  682     dict_t *xdata = NULL;
  683 
  684     dst->valid = src->valid;
  685     dst->op_ret = src->op_ret;
  686     dst->op_errno = src->op_errno;
  687     dst->prestat = src->prestat;
  688     dst->poststat = src->poststat;
  689     dst->preparent = src->preparent;
  690     dst->postparent = src->postparent;
  691     dst->preparent2 = src->preparent2;
  692     dst->postparent2 = src->postparent2;
  693     if (src->xdata)
  694         xdata = dict_ref(src->xdata);
  695     else
  696         xdata = NULL;
  697     if (dst->xdata)
  698         dict_unref(dst->xdata);
  699     dst->xdata = xdata;
  700     if (xdata && dict_get_str_boolean(xdata, "fips-mode-rchecksum",
  701                                       _gf_false) == _gf_true) {
  702         memcpy(dst->checksum, src->checksum, SHA256_DIGEST_LENGTH);
  703     } else {
  704         memcpy(dst->checksum, src->checksum, MD5_DIGEST_LENGTH);
  705     }
  706     dst->fips_mode_rchecksum = src->fips_mode_rchecksum;
  707 }
  708 
  709 void
  710 afr_replies_copy(struct afr_reply *dst, struct afr_reply *src, int count)
  711 {
  712     int i = 0;
  713 
  714     if (dst == src)
  715         return;
  716 
  717     for (i = 0; i < count; i++) {
  718         afr_reply_copy(&dst[i], &src[i]);
  719     }
  720 }
  721 
  722 int
  723 afr_selfheal_fill_dirty(xlator_t *this, int *dirty, int subvol, int idx,
  724                         dict_t *xdata)
  725 {
  726     void *pending_raw = NULL;
  727     int pending[3] = {
  728         0,
  729     };
  730 
  731     if (!dirty)
  732         return 0;
  733 
  734     if (dict_get_ptr(xdata, AFR_DIRTY, &pending_raw))
  735         return -1;
  736 
  737     if (!pending_raw)
  738         return -1;
  739 
  740     memcpy(pending, pending_raw, sizeof(pending));
  741 
  742     dirty[subvol] = ntoh32(pending[idx]);
  743 
  744     return 0;
  745 }
  746 
  747 int
  748 afr_selfheal_fill_matrix(xlator_t *this, int **matrix, int subvol, int idx,
  749                          dict_t *xdata)
  750 {
  751     int i = 0;
  752     void *pending_raw = NULL;
  753     int pending[3] = {
  754         0,
  755     };
  756     afr_private_t *priv = NULL;
  757 
  758     priv = this->private;
  759 
  760     if (!matrix)
  761         return 0;
  762 
  763     for (i = 0; i < priv->child_count; i++) {
  764         if (dict_get_ptr(xdata, priv->pending_key[i], &pending_raw))
  765             continue;
  766 
  767         if (!pending_raw)
  768             continue;
  769 
  770         memcpy(pending, pending_raw, sizeof(pending));
  771 
  772         matrix[subvol][i] = ntoh32(pending[idx]);
  773     }
  774 
  775     return 0;
  776 }
  777 
  778 int
  779 afr_selfheal_extract_xattr(xlator_t *this, struct afr_reply *replies,
  780                            afr_transaction_type type, int *dirty, int **matrix)
  781 {
  782     afr_private_t *priv = NULL;
  783     int i = 0;
  784     dict_t *xdata = NULL;
  785     int idx = -1;
  786 
  787     idx = afr_index_for_transaction_type(type);
  788 
  789     priv = this->private;
  790 
  791     for (i = 0; i < priv->child_count; i++) {
  792         if (!replies[i].valid || replies[i].op_ret != 0)
  793             continue;
  794 
  795         if (!replies[i].xdata)
  796             continue;
  797 
  798         xdata = replies[i].xdata;
  799 
  800         afr_selfheal_fill_dirty(this, dirty, i, idx, xdata);
  801         afr_selfheal_fill_matrix(this, matrix, i, idx, xdata);
  802     }
  803 
  804     return 0;
  805 }
  806 
  807 /*
  808  * If by chance there are multiple sources with differing sizes, select
  809  * the largest file as the source.
  810  *
  811  * This can happen if data was directly modified in the backend or for snapshots
  812  */
  813 void
  814 afr_mark_largest_file_as_source(xlator_t *this, unsigned char *sources,
  815                                 struct afr_reply *replies)
  816 {
  817     int i = 0;
  818     afr_private_t *priv = NULL;
  819     uint64_t size = 0;
  820 
  821     /* Find source with biggest file size */
  822     priv = this->private;
  823     for (i = 0; i < priv->child_count; i++) {
  824         if (!sources[i])
  825             continue;
  826         if (!replies[i].valid || replies[i].op_ret != 0) {
  827             sources[i] = 0;
  828             continue;
  829         }
  830         if (size <= replies[i].poststat.ia_size) {
  831             size = replies[i].poststat.ia_size;
  832         }
  833     }
  834 
  835     /* Mark sources with less size as not source */
  836     for (i = 0; i < priv->child_count; i++) {
  837         if (!sources[i])
  838             continue;
  839         if (size > replies[i].poststat.ia_size)
  840             sources[i] = 0;
  841     }
  842 }
  843 
  844 void
  845 afr_mark_latest_mtime_file_as_source(xlator_t *this, unsigned char *sources,
  846                                      struct afr_reply *replies)
  847 {
  848     int i = 0;
  849     afr_private_t *priv = NULL;
  850     uint32_t mtime = 0;
  851     uint32_t mtime_nsec = 0;
  852 
  853     priv = this->private;
  854     for (i = 0; i < priv->child_count; i++) {
  855         if (!sources[i])
  856             continue;
  857         if (!replies[i].valid || replies[i].op_ret != 0) {
  858             sources[i] = 0;
  859             continue;
  860         }
  861         if ((mtime < replies[i].poststat.ia_mtime) ||
  862             ((mtime == replies[i].poststat.ia_mtime) &&
  863              (mtime_nsec < replies[i].poststat.ia_mtime_nsec))) {
  864             mtime = replies[i].poststat.ia_mtime;
  865             mtime_nsec = replies[i].poststat.ia_mtime_nsec;
  866         }
  867     }
  868     for (i = 0; i < priv->child_count; i++) {
  869         if (!sources[i])
  870             continue;
  871         if ((mtime > replies[i].poststat.ia_mtime) ||
  872             ((mtime == replies[i].poststat.ia_mtime) &&
  873              (mtime_nsec > replies[i].poststat.ia_mtime_nsec))) {
  874             sources[i] = 0;
  875         }
  876     }
  877 }
  878 
  879 void
  880 afr_mark_active_sinks(xlator_t *this, unsigned char *sources,
  881                       unsigned char *locked_on, unsigned char *sinks)
  882 {
  883     int i = 0;
  884     afr_private_t *priv = NULL;
  885 
  886     priv = this->private;
  887 
  888     for (i = 0; i < priv->child_count; i++) {
  889         if (!sources[i] && locked_on[i])
  890             sinks[i] = 1;
  891         else
  892             sinks[i] = 0;
  893     }
  894 }
  895 
  896 gf_boolean_t
  897 afr_dict_contains_heal_op(call_frame_t *frame)
  898 {
  899     afr_local_t *local = NULL;
  900     dict_t *xdata_req = NULL;
  901     int ret = 0;
  902     int heal_op = -1;
  903 
  904     local = frame->local;
  905     xdata_req = local->xdata_req;
  906     ret = dict_get_int32_sizen(xdata_req, "heal-op", &heal_op);
  907     if (ret)
  908         return _gf_false;
  909     if (local->xdata_rsp == NULL) {
  910         local->xdata_rsp = dict_new();
  911         if (!local->xdata_rsp)
  912             return _gf_true;
  913     }
  914     ret = dict_set_sizen_str_sizen(local->xdata_rsp, "sh-fail-msg",
  915                                    SFILE_NOT_IN_SPLIT_BRAIN);
  916 
  917     return _gf_true;
  918 }
  919 
  920 gf_boolean_t
  921 afr_can_decide_split_brain_source_sinks(struct afr_reply *replies,
  922                                         int child_count)
  923 {
  924     int i = 0;
  925 
  926     for (i = 0; i < child_count; i++)
  927         if (replies[i].valid != 1 || replies[i].op_ret != 0)
  928             return _gf_false;
  929 
  930     return _gf_true;
  931 }
  932 
  933 int
  934 afr_mark_split_brain_source_sinks_by_heal_op(
  935     call_frame_t *frame, xlator_t *this, unsigned char *sources,
  936     unsigned char *sinks, unsigned char *healed_sinks, unsigned char *locked_on,
  937     struct afr_reply *replies, afr_transaction_type type, int heal_op)
  938 {
  939     afr_local_t *local = NULL;
  940     afr_private_t *priv = NULL;
  941     dict_t *xdata_req = NULL;
  942     dict_t *xdata_rsp = NULL;
  943     int ret = 0;
  944     int i = 0;
  945     char *name = NULL;
  946     int source = -1;
  947 
  948     local = frame->local;
  949     priv = this->private;
  950     xdata_req = local->xdata_req;
  951 
  952     for (i = 0; i < priv->child_count; i++) {
  953         if (locked_on[i])
  954             if (sources[i] || !sinks[i] || !healed_sinks[i]) {
  955                 ret = -1;
  956                 goto out;
  957             }
  958     }
  959     if (local->xdata_rsp == NULL) {
  960         local->xdata_rsp = dict_new();
  961         if (!local->xdata_rsp) {
  962             ret = -1;
  963             goto out;
  964         }
  965     }
  966     xdata_rsp = local->xdata_rsp;
  967 
  968     if (!afr_can_decide_split_brain_source_sinks(replies, priv->child_count)) {
  969         ret = dict_set_sizen_str_sizen(xdata_rsp, "sh-fail-msg",
  970                                        SBRAIN_HEAL_NO_GO_MSG);
  971         ret = -1;
  972         goto out;
  973     }
  974 
  975     for (i = 0; i < priv->child_count; i++)
  976         if (locked_on[i])
  977             sources[i] = 1;
  978     switch (heal_op) {
  979         case GF_SHD_OP_SBRAIN_HEAL_FROM_BIGGER_FILE:
  980             if (type == AFR_METADATA_TRANSACTION) {
  981                 ret = dict_set_sizen_str_sizen(xdata_rsp, "sh-fail-msg",
  982                                                SUSE_SOURCE_BRICK_TO_HEAL);
  983                 if (!ret)
  984                     ret = -1;
  985                 goto out;
  986             }
  987             afr_mark_largest_file_as_source(this, sources, replies);
  988             if (AFR_COUNT(sources, priv->child_count) != 1) {
  989                 ret = dict_set_sizen_str_sizen(xdata_rsp, "sh-fail-msg",
  990                                                SNO_BIGGER_FILE);
  991                 if (!ret)
  992                     ret = -1;
  993                 goto out;
  994             }
  995             break;
  996         case GF_SHD_OP_SBRAIN_HEAL_FROM_LATEST_MTIME:
  997             if (type == AFR_METADATA_TRANSACTION) {
  998                 ret = dict_set_sizen_str_sizen(xdata_rsp, "sh-fail-msg",
  999                                                SUSE_SOURCE_BRICK_TO_HEAL);
 1000                 if (!ret)
 1001                     ret = -1;
 1002                 goto out;
 1003             }
 1004             afr_mark_latest_mtime_file_as_source(this, sources, replies);
 1005             if (AFR_COUNT(sources, priv->child_count) != 1) {
 1006                 ret = dict_set_sizen_str_sizen(xdata_rsp, "sh-fail-msg",
 1007                                                SNO_DIFF_IN_MTIME);
 1008                 if (!ret)
 1009                     ret = -1;
 1010                 goto out;
 1011             }
 1012             break;
 1013         case GF_SHD_OP_SBRAIN_HEAL_FROM_BRICK:
 1014             ret = dict_get_str_sizen(xdata_req, "child-name", &name);
 1015             if (ret)
 1016                 goto out;
 1017             source = afr_get_child_index_from_name(this, name);
 1018             if (source < 0) {
 1019                 ret = dict_set_sizen_str_sizen(xdata_rsp, "sh-fail-msg",
 1020                                                SINVALID_BRICK_NAME);
 1021                 if (!ret)
 1022                     ret = -1;
 1023                 goto out;
 1024             }
 1025             if (locked_on[source] != 1) {
 1026                 ret = dict_set_sizen_str_sizen(xdata_rsp, "sh-fail-msg",
 1027                                                SBRICK_IS_NOT_UP);
 1028                 if (!ret)
 1029                     ret = -1;
 1030                 goto out;
 1031             }
 1032             memset(sources, 0, sizeof(*sources) * priv->child_count);
 1033             sources[source] = 1;
 1034             break;
 1035         default:
 1036             ret = -1;
 1037             goto out;
 1038     }
 1039     for (i = 0; i < priv->child_count; i++) {
 1040         if (sources[i]) {
 1041             source = i;
 1042             break;
 1043         }
 1044     }
 1045     sinks[source] = 0;
 1046     healed_sinks[source] = 0;
 1047     ret = source;
 1048 out:
 1049     if (ret < 0)
 1050         memset(sources, 0, sizeof(*sources) * priv->child_count);
 1051     return ret;
 1052 }
 1053 
 1054 int
 1055 afr_sh_fav_by_majority(xlator_t *this, struct afr_reply *replies,
 1056                        inode_t *inode)
 1057 {
 1058     afr_private_t *priv;
 1059     int vote_count = -1;
 1060     int fav_child = -1;
 1061     int i = 0;
 1062     int k = 0;
 1063 
 1064     priv = this->private;
 1065 
 1066     for (i = 0; i < priv->child_count; i++) {
 1067         if (replies[i].valid == 1) {
 1068             gf_msg_debug(this->name, 0,
 1069                          "Child:%s mtime_sec = %" PRId64 ", size = %" PRIu64
 1070                          " for gfid %s",
 1071                          priv->children[i]->name, replies[i].poststat.ia_mtime,
 1072                          replies[i].poststat.ia_size, uuid_utoa(inode->gfid));
 1073             vote_count = 0;
 1074             for (k = 0; k < priv->child_count; k++) {
 1075                 if ((replies[k].poststat.ia_mtime ==
 1076                      replies[i].poststat.ia_mtime) &&
 1077                     (replies[k].poststat.ia_size ==
 1078                      replies[i].poststat.ia_size)) {
 1079                     vote_count++;
 1080                 }
 1081             }
 1082             if (vote_count > priv->child_count / 2) {
 1083                 fav_child = i;
 1084                 break;
 1085             }
 1086         }
 1087     }
 1088     return fav_child;
 1089 }
 1090 
 1091 /*
 1092  * afr_sh_fav_by_mtime: Choose favorite child by mtime.
 1093  */
 1094 int
 1095 afr_sh_fav_by_mtime(xlator_t *this, struct afr_reply *replies, inode_t *inode)
 1096 {
 1097     afr_private_t *priv;
 1098     int fav_child = -1;
 1099     int i = 0;
 1100     uint32_t cmp_mtime = 0;
 1101     uint32_t cmp_mtime_nsec = 0;
 1102 
 1103     priv = this->private;
 1104 
 1105     for (i = 0; i < priv->child_count; i++) {
 1106         if (replies[i].valid == 1) {
 1107             gf_msg_debug(this->name, 0,
 1108                          "Child:%s mtime = %" PRId64
 1109                          ", mtime_nsec = %d for "
 1110                          "gfid %s",
 1111                          priv->children[i]->name, replies[i].poststat.ia_mtime,
 1112                          replies[i].poststat.ia_mtime_nsec,
 1113                          uuid_utoa(inode->gfid));
 1114             if (replies[i].poststat.ia_mtime > cmp_mtime) {
 1115                 cmp_mtime = replies[i].poststat.ia_mtime;
 1116                 cmp_mtime_nsec = replies[i].poststat.ia_mtime_nsec;
 1117                 fav_child = i;
 1118             } else if ((replies[i].poststat.ia_mtime == cmp_mtime) &&
 1119                        (replies[i].poststat.ia_mtime_nsec > cmp_mtime_nsec)) {
 1120                 cmp_mtime = replies[i].poststat.ia_mtime;
 1121                 cmp_mtime_nsec = replies[i].poststat.ia_mtime_nsec;
 1122                 fav_child = i;
 1123             }
 1124         }
 1125     }
 1126     return fav_child;
 1127 }
 1128 
 1129 /*
 1130  * afr_sh_fav_by_ctime: Choose favorite child by ctime.
 1131  */
 1132 int
 1133 afr_sh_fav_by_ctime(xlator_t *this, struct afr_reply *replies, inode_t *inode)
 1134 {
 1135     afr_private_t *priv;
 1136     int fav_child = -1;
 1137     int i = 0;
 1138     uint32_t cmp_ctime = 0;
 1139     uint32_t cmp_ctime_nsec = 0;
 1140 
 1141     priv = this->private;
 1142 
 1143     for (i = 0; i < priv->child_count; i++) {
 1144         if (replies[i].valid == 1) {
 1145             gf_msg_debug(this->name, 0,
 1146                          "Child:%s ctime = %" PRId64
 1147                          ", ctime_nsec = %d for "
 1148                          "gfid %s",
 1149                          priv->children[i]->name, replies[i].poststat.ia_ctime,
 1150                          replies[i].poststat.ia_ctime_nsec,
 1151                          uuid_utoa(inode->gfid));
 1152             if (replies[i].poststat.ia_ctime > cmp_ctime) {
 1153                 cmp_ctime = replies[i].poststat.ia_ctime;
 1154                 cmp_ctime_nsec = replies[i].poststat.ia_ctime_nsec;
 1155                 fav_child = i;
 1156             } else if ((replies[i].poststat.ia_ctime == cmp_ctime) &&
 1157                        (replies[i].poststat.ia_ctime_nsec > cmp_ctime_nsec)) {
 1158                 cmp_ctime = replies[i].poststat.ia_ctime;
 1159                 cmp_ctime_nsec = replies[i].poststat.ia_ctime_nsec;
 1160                 fav_child = i;
 1161             }
 1162         }
 1163     }
 1164     return fav_child;
 1165 }
 1166 
 1167 /*
 1168  * afr_sh_fav_by_size: Choose favorite child by size
 1169  * when not all files are of zero size.
 1170  */
 1171 int
 1172 afr_sh_fav_by_size(xlator_t *this, struct afr_reply *replies, inode_t *inode)
 1173 {
 1174     afr_private_t *priv;
 1175     int fav_child = -1;
 1176     int i = 0;
 1177     uint64_t cmp_sz = 0;
 1178 
 1179     priv = this->private;
 1180     for (i = 0; i < priv->child_count; i++) {
 1181         if (!replies[i].valid) {
 1182             continue;
 1183         }
 1184         gf_msg_debug(this->name, 0,
 1185                      "Child:%s file size = %" PRIu64 " for gfid %s",
 1186                      priv->children[i]->name, replies[i].poststat.ia_size,
 1187                      uuid_utoa(inode->gfid));
 1188         if (replies[i].poststat.ia_type == IA_IFDIR) {
 1189             gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SBRAIN_FAV_CHILD_POLICY,
 1190                    "Cannot perform selfheal on %s. "
 1191                    "Size policy is not applicable to directories.",
 1192                    uuid_utoa(inode->gfid));
 1193             break;
 1194         }
 1195         if (replies[i].poststat.ia_size > cmp_sz) {
 1196             cmp_sz = replies[i].poststat.ia_size;
 1197             fav_child = i;
 1198         } else if (replies[i].poststat.ia_size == cmp_sz) {
 1199             fav_child = -1;
 1200         }
 1201     }
 1202     if (fav_child == -1) {
 1203         gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN,
 1204                "No bigger file");
 1205     }
 1206     return fav_child;
 1207 }
 1208 
 1209 int
 1210 afr_sh_get_fav_by_policy(xlator_t *this, struct afr_reply *replies,
 1211                          inode_t *inode, char **policy_str)
 1212 {
 1213     afr_private_t *priv = NULL;
 1214     int fav_child = -1;
 1215 
 1216     priv = this->private;
 1217     if (!afr_can_decide_split_brain_source_sinks(replies, priv->child_count)) {
 1218         return -1;
 1219     }
 1220 
 1221     switch (priv->fav_child_policy) {
 1222         case AFR_FAV_CHILD_BY_SIZE:
 1223             fav_child = afr_sh_fav_by_size(this, replies, inode);
 1224             if (policy_str && fav_child >= 0) {
 1225                 *policy_str = "SIZE";
 1226             }
 1227             break;
 1228         case AFR_FAV_CHILD_BY_CTIME:
 1229             fav_child = afr_sh_fav_by_ctime(this, replies, inode);
 1230             if (policy_str && fav_child >= 0) {
 1231                 *policy_str = "CTIME";
 1232             }
 1233             break;
 1234         case AFR_FAV_CHILD_BY_MTIME:
 1235             fav_child = afr_sh_fav_by_mtime(this, replies, inode);
 1236             if (policy_str && fav_child >= 0) {
 1237                 *policy_str = "MTIME";
 1238             }
 1239             break;
 1240         case AFR_FAV_CHILD_BY_MAJORITY:
 1241             fav_child = afr_sh_fav_by_majority(this, replies, inode);
 1242             if (policy_str && fav_child >= 0) {
 1243                 *policy_str = "MAJORITY";
 1244             }
 1245             break;
 1246         case AFR_FAV_CHILD_NONE:
 1247         default:
 1248             break;
 1249     }
 1250 
 1251     return fav_child;
 1252 }
 1253 
 1254 int
 1255 afr_mark_split_brain_source_sinks_by_policy(
 1256     call_frame_t *frame, xlator_t *this, inode_t *inode, unsigned char *sources,
 1257     unsigned char *sinks, unsigned char *healed_sinks, unsigned char *locked_on,
 1258     struct afr_reply *replies, afr_transaction_type type)
 1259 {
 1260     afr_private_t *priv = NULL;
 1261     int fav_child = -1;
 1262     char mtime_str[256];
 1263     char ctime_str[256];
 1264     char *policy_str = NULL;
 1265     struct tm *tm_ptr;
 1266     time_t time;
 1267 
 1268     priv = this->private;
 1269 
 1270     fav_child = afr_sh_get_fav_by_policy(this, replies, inode, &policy_str);
 1271     if (fav_child == -1) {
 1272         gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SBRAIN_FAV_CHILD_POLICY,
 1273                "No child selected by favorite-child policy.");
 1274     } else if (fav_child > priv->child_count - 1) {
 1275         gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SBRAIN_FAV_CHILD_POLICY,
 1276                "Invalid child (%d) "
 1277                "selected by policy %s.",
 1278                fav_child, policy_str);
 1279     } else if (fav_child >= 0) {
 1280         time = replies[fav_child].poststat.ia_mtime;
 1281         tm_ptr = localtime(&time);
 1282         strftime(mtime_str, sizeof(mtime_str), "%Y-%m-%d %H:%M:%S", tm_ptr);
 1283         time = replies[fav_child].poststat.ia_ctime;
 1284         tm_ptr = localtime(&time);
 1285         strftime(ctime_str, sizeof(ctime_str), "%Y-%m-%d %H:%M:%S", tm_ptr);
 1286 
 1287         gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_SBRAIN_FAV_CHILD_POLICY,
 1288                "Source %s selected as authentic to resolve conflicting data "
 1289                "in file (gfid:%s) by %s (%" PRIu64
 1290                " bytes @ %s mtime, %s "
 1291                "ctime).",
 1292                priv->children[fav_child]->name, uuid_utoa(inode->gfid),
 1293                policy_str, replies[fav_child].poststat.ia_size, mtime_str,
 1294                ctime_str);
 1295 
 1296         sources[fav_child] = 1;
 1297         sinks[fav_child] = 0;
 1298         healed_sinks[fav_child] = 0;
 1299     }
 1300     return fav_child;
 1301 }
 1302 
 1303 gf_boolean_t
 1304 afr_is_file_empty_on_all_children(afr_private_t *priv,
 1305                                   struct afr_reply *replies)
 1306 {
 1307     int i = 0;
 1308 
 1309     for (i = 0; i < priv->child_count; i++) {
 1310         if ((!replies[i].valid) || (replies[i].op_ret != 0) ||
 1311             (replies[i].poststat.ia_size != 0))
 1312             return _gf_false;
 1313     }
 1314 
 1315     return _gf_true;
 1316 }
 1317 
 1318 int
 1319 afr_mark_source_sinks_if_file_empty(xlator_t *this, unsigned char *sources,
 1320                                     unsigned char *sinks,
 1321                                     unsigned char *healed_sinks,
 1322                                     unsigned char *locked_on,
 1323                                     struct afr_reply *replies,
 1324                                     afr_transaction_type type)
 1325 {
 1326     int source = -1;
 1327     int i = 0;
 1328     afr_private_t *priv = this->private;
 1329     struct iatt stbuf = {
 1330         0,
 1331     };
 1332 
 1333     if ((AFR_COUNT(locked_on, priv->child_count) < priv->child_count) ||
 1334         (afr_success_count(replies, priv->child_count) < priv->child_count))
 1335         return -1;
 1336 
 1337     if (type == AFR_DATA_TRANSACTION) {
 1338         if (!afr_is_file_empty_on_all_children(priv, replies))
 1339             return -1;
 1340         goto mark;
 1341     }
 1342 
 1343     /*For AFR_METADATA_TRANSACTION, metadata must be same on all bricks.*/
 1344     stbuf = replies[0].poststat;
 1345     for (i = 1; i < priv->child_count; i++) {
 1346         if ((!IA_EQUAL(stbuf, replies[i].poststat, type)) ||
 1347             (!IA_EQUAL(stbuf, replies[i].poststat, uid)) ||
 1348             (!IA_EQUAL(stbuf, replies[i].poststat, gid)) ||
 1349             (!IA_EQUAL(stbuf, replies[i].poststat, prot)))
 1350             return -1;
 1351     }
 1352     for (i = 1; i < priv->child_count; i++) {
 1353         if (!afr_xattrs_are_equal(replies[0].xdata, replies[i].xdata))
 1354             return -1;
 1355     }
 1356 
 1357 mark:
 1358     /* data/metadata is same on all bricks. Pick one of them as source. Rest
 1359      * are sinks.*/
 1360     for (i = 0; i < priv->child_count; i++) {
 1361         if (source == -1) {
 1362             source = i;
 1363             sources[i] = 1;
 1364             sinks[i] = 0;
 1365             healed_sinks[i] = 0;
 1366             continue;
 1367         }
 1368         sources[i] = 0;
 1369         sinks[i] = 1;
 1370         healed_sinks[i] = 1;
 1371     }
 1372 
 1373     return source;
 1374 }
 1375 
 1376 /* Return a source depending on the type of heal_op, and set sources[source],
 1377  * sinks[source] and healed_sinks[source] to 1, 0 and 0 respectively. Do so
 1378  * only if the following condition is met:
 1379  * ∀i((i ∈ locked_on[] ∧ i=1)==>(sources[i]=0 ∧ sinks[i]=1 ∧ healed_sinks[i]=1))
 1380  * i.e. for each locked node, sources[node] is 0; healed_sinks[node] and
 1381  * sinks[node] are 1. This should be the case if the file is in split-brain.
 1382  */
 1383 int
 1384 afr_mark_split_brain_source_sinks(
 1385     call_frame_t *frame, xlator_t *this, inode_t *inode, unsigned char *sources,
 1386     unsigned char *sinks, unsigned char *healed_sinks, unsigned char *locked_on,
 1387     struct afr_reply *replies, afr_transaction_type type)
 1388 {
 1389     afr_local_t *local = NULL;
 1390     afr_private_t *priv = NULL;
 1391     dict_t *xdata_req = NULL;
 1392     int heal_op = -1;
 1393     int ret = -1;
 1394     int source = -1;
 1395 
 1396     local = frame->local;
 1397     priv = this->private;
 1398     xdata_req = local->xdata_req;
 1399 
 1400     source = afr_mark_source_sinks_if_file_empty(
 1401         this, sources, sinks, healed_sinks, locked_on, replies, type);
 1402     if (source >= 0)
 1403         return source;
 1404 
 1405     ret = dict_get_int32_sizen(xdata_req, "heal-op", &heal_op);
 1406     if (ret)
 1407         goto autoheal;
 1408 
 1409     source = afr_mark_split_brain_source_sinks_by_heal_op(
 1410         frame, this, sources, sinks, healed_sinks, locked_on, replies, type,
 1411         heal_op);
 1412     return source;
 1413 
 1414 autoheal:
 1415     /* Automatically heal if fav_child_policy is set. */
 1416     if (priv->fav_child_policy != AFR_FAV_CHILD_NONE) {
 1417         source = afr_mark_split_brain_source_sinks_by_policy(
 1418             frame, this, inode, sources, sinks, healed_sinks, locked_on,
 1419             replies, type);
 1420         if (source != -1) {
 1421             ret = dict_set_int32_sizen(xdata_req, "fav-child-policy", 1);
 1422             if (ret)
 1423                 return -1;
 1424         }
 1425     }
 1426 
 1427     return source;
 1428 }
 1429 
 1430 int
 1431 _afr_fav_child_reset_sink_xattrs(call_frame_t *frame, xlator_t *this,
 1432                                  inode_t *inode, int source,
 1433                                  unsigned char *healed_sinks,
 1434                                  unsigned char *undid_pending,
 1435                                  afr_transaction_type type,
 1436                                  unsigned char *locked_on,
 1437                                  struct afr_reply *replies)
 1438 {
 1439     afr_private_t *priv = NULL;
 1440     afr_local_t *local = NULL;
 1441     int *input_dirty = NULL;
 1442     int **input_matrix = NULL;
 1443     int *output_dirty = NULL;
 1444     int **output_matrix = NULL;
 1445     dict_t *xattr = NULL;
 1446     dict_t *xdata = NULL;
 1447     int i = 0;
 1448 
 1449     priv = this->private;
 1450     local = frame->local;
 1451 
 1452     if (!dict_get_sizen(local->xdata_req, "fav-child-policy"))
 1453         return 0;
 1454 
 1455     xdata = dict_new();
 1456     if (!xdata)
 1457         return -1;
 1458 
 1459     input_dirty = alloca0(priv->child_count * sizeof(int));
 1460     input_matrix = ALLOC_MATRIX(priv->child_count, int);
 1461     output_dirty = alloca0(priv->child_count * sizeof(int));
 1462     output_matrix = ALLOC_MATRIX(priv->child_count, int);
 1463 
 1464     afr_selfheal_extract_xattr(this, replies, type, input_dirty, input_matrix);
 1465 
 1466     for (i = 0; i < priv->child_count; i++) {
 1467         if (i == source || !healed_sinks[i])
 1468             continue;
 1469         output_dirty[i] = -input_dirty[i];
 1470         output_matrix[i][source] = -input_matrix[i][source];
 1471     }
 1472 
 1473     for (i = 0; i < priv->child_count; i++) {
 1474         if (!healed_sinks[i] || !locked_on[i])
 1475             continue;
 1476         xattr = afr_selfheal_output_xattr(this, _gf_false, type, output_dirty,
 1477                                           output_matrix, i, NULL);
 1478 
 1479         afr_selfheal_post_op(frame, this, inode, i, xattr, xdata);
 1480 
 1481         undid_pending[i] = 1;
 1482         dict_unref(xattr);
 1483     }
 1484 
 1485     if (xdata)
 1486         dict_unref(xdata);
 1487 
 1488     return 0;
 1489 }
 1490 
 1491 gf_boolean_t
 1492 afr_does_witness_exist(xlator_t *this, uint64_t *witness)
 1493 {
 1494     int i = 0;
 1495     afr_private_t *priv = NULL;
 1496 
 1497     priv = this->private;
 1498 
 1499     for (i = 0; i < priv->child_count; i++) {
 1500         if (witness[i])
 1501             return _gf_true;
 1502     }
 1503     return _gf_false;
 1504 }
 1505 
 1506 unsigned int
 1507 afr_get_quorum_count(afr_private_t *priv)
 1508 {
 1509     if (priv->quorum_count == AFR_QUORUM_AUTO) {
 1510         return priv->child_count / 2 + 1;
 1511     } else {
 1512         return priv->quorum_count;
 1513     }
 1514 }
 1515 
 1516 void
 1517 afr_selfheal_post_op_failure_accounting(afr_private_t *priv, char *accused,
 1518                                         unsigned char *sources,
 1519                                         unsigned char *locked_on)
 1520 {
 1521     int i = 0;
 1522     unsigned int quorum_count = 0;
 1523 
 1524     if (AFR_COUNT(sources, priv->child_count) != 0)
 1525         return;
 1526 
 1527     quorum_count = afr_get_quorum_count(priv);
 1528     for (i = 0; i < priv->child_count; i++) {
 1529         if ((accused[i] < quorum_count) && locked_on[i]) {
 1530             sources[i] = 1;
 1531         }
 1532     }
 1533     return;
 1534 }
 1535 
 1536 /*
 1537  * This function determines if a self-heal is required for a given inode,
 1538  * and if needed, in what direction.
 1539  *
 1540  * locked_on[] is the array representing servers which have been locked and
 1541  * from which xattrs have been fetched for analysis.
 1542  *
 1543  * The output of the function is by filling the arrays sources[] and sinks[].
 1544  *
 1545  * sources[i] is set if i'th server is an eligible source for a selfheal.
 1546  *
 1547  * sinks[i] is set if i'th server needs to be healed.
 1548  *
 1549  * if sources[0..N] are all set, there is no need for a selfheal.
 1550  *
 1551  * if sinks[0..N] are all set, the inode is in split brain.
 1552  *
 1553  */
 1554 
 1555 int
 1556 afr_selfheal_find_direction(call_frame_t *frame, xlator_t *this,
 1557                             struct afr_reply *replies,
 1558                             afr_transaction_type type, unsigned char *locked_on,
 1559                             unsigned char *sources, unsigned char *sinks,
 1560                             uint64_t *witness, unsigned char *pflag)
 1561 {
 1562     afr_private_t *priv = NULL;
 1563     int i = 0;
 1564     int j = 0;
 1565     int *dirty = NULL;         /* Denotes if dirty xattr is set */
 1566     int **matrix = NULL;       /* Changelog matrix */
 1567     char *accused = NULL;      /* Accused others without any self-accusal */
 1568     char *pending = NULL;      /* Have pending operations on others */
 1569     char *self_accused = NULL; /* Accused itself */
 1570 
 1571     priv = this->private;
 1572 
 1573     dirty = alloca0(priv->child_count * sizeof(int));
 1574     accused = alloca0(priv->child_count);
 1575     pending = alloca0(priv->child_count);
 1576     self_accused = alloca0(priv->child_count);
 1577     matrix = ALLOC_MATRIX(priv->child_count, int);
 1578     memset(witness, 0, sizeof(*witness) * priv->child_count);
 1579 
 1580     /* First construct the pending matrix for further analysis */
 1581     afr_selfheal_extract_xattr(this, replies, type, dirty, matrix);
 1582 
 1583     if (pflag) {
 1584         for (i = 0; i < priv->child_count; i++) {
 1585             for (j = 0; j < priv->child_count; j++)
 1586                 if (matrix[i][j])
 1587                     *pflag |= PFLAG_PENDING;
 1588             if (*pflag)
 1589                 break;
 1590         }
 1591     }
 1592 
 1593     if (afr_success_count(replies, priv->child_count) < priv->child_count) {
 1594         /* Treat this just like locks not being acquired */
 1595         return -ENOTCONN;
 1596     }
 1597 
 1598     /* short list all self-accused */
 1599     for (i = 0; i < priv->child_count; i++) {
 1600         if (matrix[i][i])
 1601             self_accused[i] = 1;
 1602     }
 1603 
 1604     /* Next short list all accused to exclude them from being sources */
 1605     /* Self-accused can't accuse others as they are FOOLs */
 1606     for (i = 0; i < priv->child_count; i++) {
 1607         for (j = 0; j < priv->child_count; j++) {
 1608             if (matrix[i][j]) {
 1609                 if (!self_accused[i])
 1610                     accused[j] += 1;
 1611                 if (i != j)
 1612                     pending[i] += 1;
 1613             }
 1614         }
 1615     }
 1616 
 1617     /* Short list all non-accused as sources */
 1618     for (i = 0; i < priv->child_count; i++) {
 1619         if (!accused[i] && locked_on[i])
 1620             sources[i] = 1;
 1621         else
 1622             sources[i] = 0;
 1623     }
 1624 
 1625     /* Everyone accused by non-self-accused sources are sinks */
 1626     memset(sinks, 0, priv->child_count);
 1627     for (i = 0; i < priv->child_count; i++) {
 1628         if (!sources[i])
 1629             continue;
 1630         if (self_accused[i])
 1631             continue;
 1632         for (j = 0; j < priv->child_count; j++) {
 1633             if (matrix[i][j])
 1634                 sinks[j] = 1;
 1635         }
 1636     }
 1637 
 1638     /* For breaking ties provide with number of fops they witnessed */
 1639 
 1640     /*
 1641      * count the pending fops witnessed from itself to others when it is
 1642      * self-accused
 1643      */
 1644     for (i = 0; i < priv->child_count; i++) {
 1645         if (!self_accused[i])
 1646             continue;
 1647         for (j = 0; j < priv->child_count; j++) {
 1648             if (i == j)
 1649                 continue;
 1650             witness[i] += matrix[i][j];
 1651         }
 1652     }
 1653 
 1654     if (type == AFR_DATA_TRANSACTION || type == AFR_METADATA_TRANSACTION)
 1655         afr_selfheal_post_op_failure_accounting(priv, accused, sources,
 1656                                                 locked_on);
 1657 
 1658     /* If no sources, all locked nodes are sinks - split brain */
 1659     if (AFR_COUNT(sources, priv->child_count) == 0) {
 1660         for (i = 0; i < priv->child_count; i++) {
 1661             if (locked_on[i])
 1662                 sinks[i] = 1;
 1663         }
 1664         if (pflag)
 1665             *pflag |= PFLAG_SBRAIN;
 1666     }
 1667 
 1668     /* One more class of witness similar to dirty in v2 is where no pending
 1669      * exists but we have self-accusing markers. This can happen in afr-v1
 1670      * if the brick crashes just after doing xattrop on self but
 1671      * before xattrop on the other xattrs on the brick in pre-op. */
 1672     if (AFR_COUNT(pending, priv->child_count) == 0) {
 1673         for (i = 0; i < priv->child_count; i++) {
 1674             if (self_accused[i])
 1675                 witness[i] += matrix[i][i];
 1676         }
 1677     } else {
 1678         /* In afr-v1 if a file is self-accused and has pending
 1679          * operations on others then it is similar to 'dirty' in afr-v2.
 1680          * Consider such cases as witness.
 1681          */
 1682         for (i = 0; i < priv->child_count; i++) {
 1683             if (self_accused[i] && pending[i])
 1684                 witness[i] += matrix[i][i];
 1685         }
 1686     }
 1687 
 1688     /* count the number of dirty fops witnessed */
 1689     for (i = 0; i < priv->child_count; i++)
 1690         witness[i] += dirty[i];
 1691 
 1692     return 0;
 1693 }
 1694 
 1695 void
 1696 afr_log_selfheal(uuid_t gfid, xlator_t *this, int ret, char *type, int source,
 1697                  unsigned char *sources, unsigned char *healed_sinks)
 1698 {
 1699     char *status = NULL;
 1700     char *sinks_str = NULL;
 1701     char *p = NULL;
 1702     char *sources_str = NULL;
 1703     char *q = NULL;
 1704     afr_private_t *priv = NULL;
 1705     gf_loglevel_t loglevel = GF_LOG_NONE;
 1706     int i = 0;
 1707 
 1708     priv = this->private;
 1709     sinks_str = alloca0(priv->child_count * 8);
 1710     p = sinks_str;
 1711     sources_str = alloca0(priv->child_count * 8);
 1712     q = sources_str;
 1713     for (i = 0; i < priv->child_count; i++) {
 1714         if (healed_sinks[i])
 1715             p += sprintf(p, "%d ", i);
 1716         if (sources[i]) {
 1717             if (source == i) {
 1718                 q += sprintf(q, "[%d] ", i);
 1719             } else {
 1720                 q += sprintf(q, "%d ", i);
 1721             }
 1722         }
 1723     }
 1724 
 1725     if (ret < 0) {
 1726         status = "Failed";
 1727         loglevel = GF_LOG_DEBUG;
 1728     } else {
 1729         status = "Completed";
 1730         loglevel = GF_LOG_INFO;
 1731     }
 1732 
 1733     gf_msg(this->name, loglevel, 0, AFR_MSG_SELF_HEAL_INFO,
 1734            "%s %s selfheal on %s. "
 1735            "sources=%s sinks=%s",
 1736            status, type, uuid_utoa(gfid), sources_str, sinks_str);
 1737 }
 1738 
 1739 int
 1740 afr_selfheal_discover_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
 1741                           int op_ret, int op_errno, inode_t *inode,
 1742                           struct iatt *buf, dict_t *xdata, struct iatt *parbuf)
 1743 {
 1744     afr_local_t *local = NULL;
 1745     int i = -1;
 1746     GF_UNUSED int ret = -1;
 1747     int8_t need_heal = 1;
 1748 
 1749     local = frame->local;
 1750     i = (long)cookie;
 1751 
 1752     local->replies[i].valid = 1;
 1753     local->replies[i].op_ret = op_ret;
 1754     local->replies[i].op_errno = op_errno;
 1755     if (buf)
 1756         local->replies[i].poststat = *buf;
 1757     if (parbuf)
 1758         local->replies[i].postparent = *parbuf;
 1759     if (xdata) {
 1760         local->replies[i].xdata = dict_ref(xdata);
 1761         ret = dict_get_int8(xdata, "link-count", &need_heal);
 1762         local->replies[i].need_heal = need_heal;
 1763     } else {
 1764         local->replies[i].need_heal = need_heal;
 1765     }
 1766 
 1767     syncbarrier_wake(&local->barrier);
 1768 
 1769     return 0;
 1770 }
 1771 
 1772 inode_t *
 1773 afr_selfheal_unlocked_lookup_on(call_frame_t *frame, inode_t *parent,
 1774                                 const char *name, struct afr_reply *replies,
 1775                                 unsigned char *lookup_on, dict_t *xattr)
 1776 {
 1777     loc_t loc = {
 1778         0,
 1779     };
 1780     dict_t *xattr_req = NULL;
 1781     afr_local_t *local = NULL;
 1782     afr_private_t *priv = NULL;
 1783     inode_t *inode = NULL;
 1784 
 1785     local = frame->local;
 1786     priv = frame->this->private;
 1787 
 1788     xattr_req = dict_new();
 1789     if (!xattr_req)
 1790         return NULL;
 1791 
 1792     if (xattr)
 1793         dict_copy(xattr, xattr_req);
 1794 
 1795     if (afr_xattr_req_prepare(frame->this, xattr_req) != 0) {
 1796         dict_unref(xattr_req);
 1797         return NULL;
 1798     }
 1799 
 1800     inode = inode_new(parent->table);
 1801     if (!inode) {
 1802         dict_unref(xattr_req);
 1803         return NULL;
 1804     }
 1805 
 1806     loc.parent = inode_ref(parent);
 1807     gf_uuid_copy(loc.pargfid, parent->gfid);
 1808     loc.name = name;
 1809     loc.inode = inode_ref(inode);
 1810 
 1811     AFR_ONLIST(lookup_on, frame, afr_selfheal_discover_cbk, lookup, &loc,
 1812                xattr_req);
 1813 
 1814     afr_replies_copy(replies, local->replies, priv->child_count);
 1815 
 1816     loc_wipe(&loc);
 1817     dict_unref(xattr_req);
 1818 
 1819     return inode;
 1820 }
 1821 
 1822 int
 1823 afr_selfheal_unlocked_discover_on(call_frame_t *frame, inode_t *inode,
 1824                                   uuid_t gfid, struct afr_reply *replies,
 1825                                   unsigned char *discover_on, dict_t *dict)
 1826 {
 1827     loc_t loc = {
 1828         0,
 1829     };
 1830     dict_t *xattr_req = NULL;
 1831     afr_local_t *local = NULL;
 1832     afr_private_t *priv = NULL;
 1833 
 1834     local = frame->local;
 1835     priv = frame->this->private;
 1836 
 1837     xattr_req = dict_new();
 1838     if (!xattr_req)
 1839         return -ENOMEM;
 1840     if (dict)
 1841         dict_copy(dict, xattr_req);
 1842 
 1843     if (afr_xattr_req_prepare(frame->this, xattr_req) != 0) {
 1844         dict_unref(xattr_req);
 1845         return -ENOMEM;
 1846     }
 1847 
 1848     loc.inode = inode_ref(inode);
 1849     gf_uuid_copy(loc.gfid, gfid);
 1850 
 1851     AFR_ONLIST(discover_on, frame, afr_selfheal_discover_cbk, lookup, &loc,
 1852                xattr_req);
 1853 
 1854     afr_replies_copy(replies, local->replies, priv->child_count);
 1855 
 1856     loc_wipe(&loc);
 1857     dict_unref(xattr_req);
 1858 
 1859     return 0;
 1860 }
 1861 
 1862 int
 1863 afr_selfheal_unlocked_discover(call_frame_t *frame, inode_t *inode, uuid_t gfid,
 1864                                struct afr_reply *replies)
 1865 {
 1866     afr_local_t *local = NULL;
 1867     dict_t *dict = NULL;
 1868 
 1869     local = frame->local;
 1870     if (local && local->xattr_req)
 1871         dict = local->xattr_req;
 1872 
 1873     return afr_selfheal_unlocked_discover_on(frame, inode, gfid, replies,
 1874                                              local->child_up, dict);
 1875 }
 1876 
 1877 unsigned int
 1878 afr_success_count(struct afr_reply *replies, unsigned int count)
 1879 {
 1880     int i = 0;
 1881     unsigned int success = 0;
 1882 
 1883     for (i = 0; i < count; i++)
 1884         if (replies[i].valid && replies[i].op_ret == 0)
 1885             success++;
 1886     return success;
 1887 }
 1888 
 1889 int
 1890 afr_selfheal_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
 1891                       int op_ret, int op_errno, dict_t *xdata)
 1892 {
 1893     afr_local_t *local = NULL;
 1894     int i = 0;
 1895 
 1896     local = frame->local;
 1897     i = (long)cookie;
 1898 
 1899     local->replies[i].valid = 1;
 1900     local->replies[i].op_ret = op_ret;
 1901     local->replies[i].op_errno = op_errno;
 1902 
 1903     syncbarrier_wake(&local->barrier);
 1904 
 1905     return 0;
 1906 }
 1907 
 1908 int
 1909 afr_locked_fill(call_frame_t *frame, xlator_t *this, unsigned char *locked_on)
 1910 {
 1911     int i = 0;
 1912     afr_private_t *priv = NULL;
 1913     afr_local_t *local = NULL;
 1914     int count = 0;
 1915 
 1916     local = frame->local;
 1917     priv = this->private;
 1918 
 1919     for (i = 0; i < priv->child_count; i++) {
 1920         if (local->replies[i].valid && local->replies[i].op_ret == 0) {
 1921             locked_on[i] = 1;
 1922             count++;
 1923         } else {
 1924             locked_on[i] = 0;
 1925         }
 1926     }
 1927 
 1928     return count;
 1929 }
 1930 
 1931 int
 1932 afr_selfheal_tryinodelk(call_frame_t *frame, xlator_t *this, inode_t *inode,
 1933                         char *dom, off_t off, size_t size,
 1934                         unsigned char *locked_on)
 1935 {
 1936     loc_t loc = {
 1937         0,
 1938     };
 1939     struct gf_flock flock = {
 1940         0,
 1941     };
 1942 
 1943     loc.inode = inode_ref(inode);
 1944     gf_uuid_copy(loc.gfid, inode->gfid);
 1945 
 1946     flock.l_type = F_WRLCK;
 1947     flock.l_start = off;
 1948     flock.l_len = size;
 1949 
 1950     AFR_ONALL(frame, afr_selfheal_lock_cbk, inodelk, dom, &loc, F_SETLK, &flock,
 1951               NULL);
 1952 
 1953     loc_wipe(&loc);
 1954 
 1955     return afr_locked_fill(frame, this, locked_on);
 1956 }
 1957 
 1958 int
 1959 afr_selfheal_inodelk(call_frame_t *frame, xlator_t *this, inode_t *inode,
 1960                      char *dom, off_t off, size_t size,
 1961                      unsigned char *locked_on)
 1962 {
 1963     loc_t loc = {
 1964         0,
 1965     };
 1966     struct gf_flock flock = {
 1967         0,
 1968     };
 1969     afr_local_t *local = NULL;
 1970     int i = 0;
 1971     afr_private_t *priv = NULL;
 1972 
 1973     priv = this->private;
 1974     local = frame->local;
 1975 
 1976     loc.inode = inode_ref(inode);
 1977     gf_uuid_copy(loc.gfid, inode->gfid);
 1978 
 1979     flock.l_type = F_WRLCK;
 1980     flock.l_start = off;
 1981     flock.l_len = size;
 1982 
 1983     AFR_ONALL(frame, afr_selfheal_lock_cbk, inodelk, dom, &loc, F_SETLK, &flock,
 1984               NULL);
 1985 
 1986     for (i = 0; i < priv->child_count; i++) {
 1987         if (local->replies[i].op_ret == -1 &&
 1988             local->replies[i].op_errno == EAGAIN) {
 1989             afr_locked_fill(frame, this, locked_on);
 1990             afr_selfheal_uninodelk(frame, this, inode, dom, off, size,
 1991                                    locked_on);
 1992 
 1993             AFR_SEQ(frame, afr_selfheal_lock_cbk, inodelk, dom, &loc, F_SETLKW,
 1994                     &flock, NULL);
 1995             break;
 1996         }
 1997     }
 1998 
 1999     loc_wipe(&loc);
 2000 
 2001     return afr_locked_fill(frame, this, locked_on);
 2002 }
 2003 
 2004 static void
 2005 afr_get_lock_and_eagain_counts(afr_private_t *priv, struct afr_reply *replies,
 2006                                int *lock_count, int *eagain_count)
 2007 {
 2008     int i = 0;
 2009 
 2010     for (i = 0; i < priv->child_count; i++) {
 2011         if (!replies[i].valid)
 2012             continue;
 2013         if (replies[i].op_ret == 0) {
 2014             (*lock_count)++;
 2015         } else if (replies[i].op_ret == -1 && replies[i].op_errno == EAGAIN) {
 2016             (*eagain_count)++;
 2017         }
 2018     }
 2019 }
 2020 
 2021 /*Do blocking locks if number of locks acquired is majority and there were some
 2022  * EAGAINs. Useful for odd-way replication*/
 2023 int
 2024 afr_selfheal_tie_breaker_inodelk(call_frame_t *frame, xlator_t *this,
 2025                                  inode_t *inode, char *dom, off_t off,
 2026                                  size_t size, unsigned char *locked_on)
 2027 {
 2028     loc_t loc = {
 2029         0,
 2030     };
 2031     struct gf_flock flock = {
 2032         0,
 2033     };
 2034     afr_local_t *local = NULL;
 2035     afr_private_t *priv = NULL;
 2036     int lock_count = 0;
 2037     int eagain_count = 0;
 2038 
 2039     priv = this->private;
 2040     local = frame->local;
 2041 
 2042     loc.inode = inode_ref(inode);
 2043     gf_uuid_copy(loc.gfid, inode->gfid);
 2044 
 2045     flock.l_type = F_WRLCK;
 2046     flock.l_start = off;
 2047     flock.l_len = size;
 2048 
 2049     AFR_ONALL(frame, afr_selfheal_lock_cbk, inodelk, dom, &loc, F_SETLK, &flock,
 2050               NULL);
 2051 
 2052     afr_get_lock_and_eagain_counts(priv, local->replies, &lock_count,
 2053                                    &eagain_count);
 2054 
 2055     if (lock_count > priv->child_count / 2 && eagain_count) {
 2056         afr_locked_fill(frame, this, locked_on);
 2057         afr_selfheal_uninodelk(frame, this, inode, dom, off, size, locked_on);
 2058 
 2059         AFR_SEQ(frame, afr_selfheal_lock_cbk, inodelk, dom, &loc, F_SETLKW,
 2060                 &flock, NULL);
 2061     }
 2062 
 2063     loc_wipe(&loc);
 2064 
 2065     return afr_locked_fill(frame, this, locked_on);
 2066 }
 2067 
 2068 int
 2069 afr_selfheal_uninodelk(call_frame_t *frame, xlator_t *this, inode_t *inode,
 2070                        char *dom, off_t off, size_t size,
 2071                        const unsigned char *locked_on)
 2072 {
 2073     loc_t loc = {
 2074         0,
 2075     };
 2076     struct gf_flock flock = {
 2077         0,
 2078     };
 2079 
 2080     loc.inode = inode_ref(inode);
 2081     gf_uuid_copy(loc.gfid, inode->gfid);
 2082 
 2083     flock.l_type = F_UNLCK;
 2084     flock.l_start = off;
 2085     flock.l_len = size;
 2086 
 2087     AFR_ONLIST(locked_on, frame, afr_selfheal_lock_cbk, inodelk, dom, &loc,
 2088                F_SETLK, &flock, NULL);
 2089 
 2090     loc_wipe(&loc);
 2091 
 2092     return 0;
 2093 }
 2094 
 2095 int
 2096 afr_selfheal_tryentrylk(call_frame_t *frame, xlator_t *this, inode_t *inode,
 2097                         char *dom, const char *name, unsigned char *locked_on)
 2098 {
 2099     loc_t loc = {
 2100         0,
 2101     };
 2102 
 2103     loc.inode = inode_ref(inode);
 2104     gf_uuid_copy(loc.gfid, inode->gfid);
 2105 
 2106     AFR_ONALL(frame, afr_selfheal_lock_cbk, entrylk, dom, &loc, name,
 2107               ENTRYLK_LOCK_NB, ENTRYLK_WRLCK, NULL);
 2108 
 2109     loc_wipe(&loc);
 2110 
 2111     return afr_locked_fill(frame, this, locked_on);
 2112 }
 2113 
 2114 int
 2115 afr_selfheal_entrylk(call_frame_t *frame, xlator_t *this, inode_t *inode,
 2116                      char *dom, const char *name, unsigned char *locked_on)
 2117 {
 2118     loc_t loc = {
 2119         0,
 2120     };
 2121     afr_local_t *local = NULL;
 2122     int i = 0;
 2123     afr_private_t *priv = NULL;
 2124 
 2125     priv = this->private;
 2126     local = frame->local;
 2127 
 2128     loc.inode = inode_ref(inode);
 2129     gf_uuid_copy(loc.gfid, inode->gfid);
 2130 
 2131     AFR_ONALL(frame, afr_selfheal_lock_cbk, entrylk, dom, &loc, name,
 2132               ENTRYLK_LOCK_NB, ENTRYLK_WRLCK, NULL);
 2133 
 2134     for (i = 0; i < priv->child_count; i++) {
 2135         if (local->replies[i].op_ret == -1 &&
 2136             local->replies[i].op_errno == EAGAIN) {
 2137             afr_locked_fill(frame, this, locked_on);
 2138             afr_selfheal_unentrylk(frame, this, inode, dom, name, locked_on,
 2139                                    NULL);
 2140 
 2141             AFR_SEQ(frame, afr_selfheal_lock_cbk, entrylk, dom, &loc, name,
 2142                     ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL);
 2143             break;
 2144         }
 2145     }
 2146 
 2147     loc_wipe(&loc);
 2148 
 2149     return afr_locked_fill(frame, this, locked_on);
 2150 }
 2151 
 2152 int
 2153 afr_selfheal_tie_breaker_entrylk(call_frame_t *frame, xlator_t *this,
 2154                                  inode_t *inode, char *dom, const char *name,
 2155                                  unsigned char *locked_on)
 2156 {
 2157     loc_t loc = {
 2158         0,
 2159     };
 2160     afr_local_t *local = NULL;
 2161     afr_private_t *priv = NULL;
 2162     int lock_count = 0;
 2163     int eagain_count = 0;
 2164 
 2165     priv = this->private;
 2166     local = frame->local;
 2167 
 2168     loc.inode = inode_ref(inode);
 2169     gf_uuid_copy(loc.gfid, inode->gfid);
 2170 
 2171     AFR_ONALL(frame, afr_selfheal_lock_cbk, entrylk, dom, &loc, name,
 2172               ENTRYLK_LOCK_NB, ENTRYLK_WRLCK, NULL);
 2173 
 2174     afr_get_lock_and_eagain_counts(priv, local->replies, &lock_count,
 2175                                    &eagain_count);
 2176 
 2177     if (lock_count > priv->child_count / 2 && eagain_count) {
 2178         afr_locked_fill(frame, this, locked_on);
 2179         afr_selfheal_unentrylk(frame, this, inode, dom, name, locked_on, NULL);
 2180 
 2181         AFR_SEQ(frame, afr_selfheal_lock_cbk, entrylk, dom, &loc, name,
 2182                 ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL);
 2183     }
 2184 
 2185     loc_wipe(&loc);
 2186 
 2187     return afr_locked_fill(frame, this, locked_on);
 2188 }
 2189 
 2190 int
 2191 afr_selfheal_unentrylk(call_frame_t *frame, xlator_t *this, inode_t *inode,
 2192                        char *dom, const char *name, unsigned char *locked_on,
 2193                        dict_t *xdata)
 2194 {
 2195     loc_t loc = {
 2196         0,
 2197     };
 2198 
 2199     loc.inode = inode_ref(inode);
 2200     gf_uuid_copy(loc.gfid, inode->gfid);
 2201 
 2202     AFR_ONLIST(locked_on, frame, afr_selfheal_lock_cbk, entrylk, dom, &loc,
 2203                name, ENTRYLK_UNLOCK, ENTRYLK_WRLCK, xdata);
 2204 
 2205     loc_wipe(&loc);
 2206 
 2207     return 0;
 2208 }
 2209 
 2210 gf_boolean_t
 2211 afr_is_data_set(xlator_t *this, dict_t *xdata)
 2212 {
 2213     return afr_is_pending_set(this, xdata, AFR_DATA_TRANSACTION);
 2214 }
 2215 
 2216 gf_boolean_t
 2217 afr_is_metadata_set(xlator_t *this, dict_t *xdata)
 2218 {
 2219     return afr_is_pending_set(this, xdata, AFR_METADATA_TRANSACTION);
 2220 }
 2221 
 2222 gf_boolean_t
 2223 afr_is_entry_set(xlator_t *this, dict_t *xdata)
 2224 {
 2225     return afr_is_pending_set(this, xdata, AFR_ENTRY_TRANSACTION);
 2226 }
 2227 
 2228 /*
 2229  * This function inspects the looked up replies (in an unlocked manner)
 2230  * and decides whether a locked verification and possible healing is
 2231  * required or not. It updates the three booleans for each type
 2232  * of healing. If the boolean flag gets set to FALSE, then we are sure
 2233  * no healing is required. If the boolean flag gets set to TRUE then
 2234  * we have to proceed with locked reinspection.
 2235  */
 2236 
 2237 int
 2238 afr_selfheal_unlocked_inspect(call_frame_t *frame, xlator_t *this, uuid_t gfid,
 2239                               inode_t **link_inode, gf_boolean_t *data_selfheal,
 2240                               gf_boolean_t *metadata_selfheal,
 2241                               gf_boolean_t *entry_selfheal)
 2242 {
 2243     afr_private_t *priv = NULL;
 2244     inode_t *inode = NULL;
 2245     int i = 0;
 2246     int valid_cnt = 0;
 2247     struct iatt first = {
 2248         0,
 2249     };
 2250     int first_idx = 0;
 2251     struct afr_reply *replies = NULL;
 2252     int ret = -1;
 2253 
 2254     priv = this->private;
 2255 
 2256     inode = afr_inode_find(this, gfid);
 2257     if (!inode)
 2258         goto out;
 2259 
 2260     replies = alloca0(sizeof(*replies) * priv->child_count);
 2261 
 2262     ret = afr_selfheal_unlocked_discover(frame, inode, gfid, replies);
 2263     if (ret)
 2264         goto out;
 2265 
 2266     for (i = 0; i < priv->child_count; i++) {
 2267         if (!replies[i].valid)
 2268             continue;
 2269         if (replies[i].op_ret == -1)
 2270             continue;
 2271 
 2272         /* The data segment of the changelog can be non-zero to indicate
 2273          * the directory needs a full heal. So the check below ensures
 2274          * it's not a directory before setting the data_selfheal boolean.
 2275          */
 2276         if (data_selfheal && !IA_ISDIR(replies[i].poststat.ia_type) &&
 2277             afr_is_data_set(this, replies[i].xdata))
 2278             *data_selfheal = _gf_true;
 2279 
 2280         if (metadata_selfheal && afr_is_metadata_set(this, replies[i].xdata))
 2281             *metadata_selfheal = _gf_true;
 2282 
 2283         if (entry_selfheal && afr_is_entry_set(this, replies[i].xdata))
 2284             *entry_selfheal = _gf_true;
 2285 
 2286         valid_cnt++;
 2287         if (valid_cnt == 1) {
 2288             first = replies[i].poststat;
 2289             first_idx = i;
 2290             continue;
 2291         }
 2292 
 2293         if (!IA_EQUAL(first, replies[i].poststat, type)) {
 2294             gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN,
 2295                    "TYPE mismatch %d vs %d on %s for gfid:%s",
 2296                    (int)first.ia_type, (int)replies[i].poststat.ia_type,
 2297                    priv->children[i]->name,
 2298                    uuid_utoa(replies[i].poststat.ia_gfid));
 2299             gf_event(EVENT_AFR_SPLIT_BRAIN,
 2300                      "client-pid=%d;"
 2301                      "subvol=%s;"
 2302                      "type=file;gfid=%s;"
 2303                      "ia_type-%d=%s;ia_type-%d=%s",
 2304                      this->ctx->cmd_args.client_pid, this->name,
 2305                      uuid_utoa(replies[i].poststat.ia_gfid), first_idx,
 2306                      gf_inode_type_to_str(first.ia_type), i,
 2307                      gf_inode_type_to_str(replies[i].poststat.ia_type));
 2308             ret = -EIO;
 2309             goto out;
 2310         }
 2311 
 2312         if (!IA_EQUAL(first, replies[i].poststat, uid)) {
 2313             gf_msg_debug(this->name, 0,
 2314                          "UID mismatch "
 2315                          "%d vs %d on %s for gfid:%s",
 2316                          (int)first.ia_uid, (int)replies[i].poststat.ia_uid,
 2317                          priv->children[i]->name,
 2318                          uuid_utoa(replies[i].poststat.ia_gfid));
 2319 
 2320             if (metadata_selfheal)
 2321                 *metadata_selfheal = _gf_true;
 2322         }
 2323 
 2324         if (!IA_EQUAL(first, replies[i].poststat, gid)) {
 2325             gf_msg_debug(this->name, 0,
 2326                          "GID mismatch "
 2327                          "%d vs %d on %s for gfid:%s",
 2328                          (int)first.ia_uid, (int)replies[i].poststat.ia_uid,
 2329                          priv->children[i]->name,
 2330                          uuid_utoa(replies[i].poststat.ia_gfid));
 2331 
 2332             if (metadata_selfheal)
 2333                 *metadata_selfheal = _gf_true;
 2334         }
 2335 
 2336         if (!IA_EQUAL(first, replies[i].poststat, prot)) {
 2337             gf_msg_debug(this->name, 0,
 2338                          "MODE mismatch "
 2339                          "%d vs %d on %s for gfid:%s",
 2340                          (int)st_mode_from_ia(first.ia_prot, 0),
 2341                          (int)st_mode_from_ia(replies[i].poststat.ia_prot, 0),
 2342                          priv->children[i]->name,
 2343                          uuid_utoa(replies[i].poststat.ia_gfid));
 2344 
 2345             if (metadata_selfheal)
 2346                 *metadata_selfheal = _gf_true;
 2347         }
 2348 
 2349         if (IA_ISREG(first.ia_type) &&
 2350             !IA_EQUAL(first, replies[i].poststat, size)) {
 2351             gf_msg_debug(this->name, 0,
 2352                          "SIZE mismatch "
 2353                          "%lld vs %lld on %s for gfid:%s",
 2354                          (long long)first.ia_size,
 2355                          (long long)replies[i].poststat.ia_size,
 2356                          priv->children[i]->name,
 2357                          uuid_utoa(replies[i].poststat.ia_gfid));
 2358 
 2359             if (data_selfheal)
 2360                 *data_selfheal = _gf_true;
 2361         }
 2362     }
 2363 
 2364     if (valid_cnt > 0 && link_inode) {
 2365         *link_inode = inode_link(inode, NULL, NULL, &first);
 2366         if (!*link_inode) {
 2367             ret = -EINVAL;
 2368             goto out;
 2369         }
 2370     } else if (valid_cnt < 2) {
 2371         ret = afr_check_stale_error(replies, priv);
 2372         goto out;
 2373     }
 2374 
 2375     ret = 0;
 2376 out:
 2377     if (inode)
 2378         inode_unref(inode);
 2379     if (replies)
 2380         afr_replies_wipe(replies, priv->child_count);
 2381 
 2382     return ret;
 2383 }
 2384 
 2385 inode_t *
 2386 afr_inode_find(xlator_t *this, uuid_t gfid)
 2387 {
 2388     inode_table_t *table = NULL;
 2389     inode_t *inode = NULL;
 2390 
 2391     table = this->itable;
 2392     if (!table)
 2393         return NULL;
 2394 
 2395     inode = inode_find(table, gfid);
 2396     if (inode)
 2397         return inode;
 2398 
 2399     inode = inode_new(table);
 2400     if (!inode)
 2401         return NULL;
 2402 
 2403     gf_uuid_copy(inode->gfid, gfid);
 2404 
 2405     return inode;
 2406 }
 2407 
 2408 call_frame_t *
 2409 afr_frame_create(xlator_t *this, int32_t *op_errno)
 2410 {
 2411     call_frame_t *frame = NULL;
 2412     afr_local_t *local = NULL;
 2413     pid_t pid = GF_CLIENT_PID_SELF_HEALD;
 2414 
 2415     frame = create_frame(this, this->ctx->pool);
 2416     if (!frame)
 2417         return NULL;
 2418 
 2419     local = AFR_FRAME_INIT(frame, (*op_errno));
 2420     if (!local) {
 2421         STACK_DESTROY(frame->root);
 2422         return NULL;
 2423     }
 2424 
 2425     syncopctx_setfspid(&pid);
 2426 
 2427     frame->root->pid = pid;
 2428 
 2429     afr_set_lk_owner(frame, this, frame->root);
 2430 
 2431     return frame;
 2432 }
 2433 
 2434 int
 2435 afr_selfheal_newentry_mark(call_frame_t *frame, xlator_t *this, inode_t *inode,
 2436                            int source, struct afr_reply *replies,
 2437                            unsigned char *sources, unsigned char *newentry)
 2438 {
 2439     int ret = 0;
 2440     int i = 0;
 2441     afr_private_t *priv = NULL;
 2442     dict_t *xattr = NULL;
 2443     int **changelog = NULL;
 2444 
 2445     priv = this->private;
 2446 
 2447     gf_uuid_copy(inode->gfid, replies[source].poststat.ia_gfid);
 2448 
 2449     xattr = dict_new();
 2450     if (!xattr)
 2451         return -ENOMEM;
 2452 
 2453     changelog = afr_mark_pending_changelog(priv, newentry, xattr,
 2454                                            replies[source].poststat.ia_type);
 2455 
 2456     if (!changelog) {
 2457         ret = -ENOMEM;
 2458         goto out;
 2459     }
 2460 
 2461     for (i = 0; i < priv->child_count; i++) {
 2462         if (!sources[i])
 2463             continue;
 2464         ret |= afr_selfheal_post_op(frame, this, inode, i, xattr, NULL);
 2465     }
 2466 out:
 2467     if (changelog)
 2468         afr_matrix_cleanup(changelog, priv->child_count);
 2469     if (xattr)
 2470         dict_unref(xattr);
 2471     return ret;
 2472 }
 2473 
 2474 int
 2475 afr_selfheal_do(call_frame_t *frame, xlator_t *this, uuid_t gfid)
 2476 {
 2477     int ret = -1;
 2478     int entry_ret = 1;
 2479     int metadata_ret = 1;
 2480     int data_ret = 1;
 2481     int or_ret = 0;
 2482     inode_t *inode = NULL;
 2483     fd_t *fd = NULL;
 2484     gf_boolean_t data_selfheal = _gf_false;
 2485     gf_boolean_t metadata_selfheal = _gf_false;
 2486     gf_boolean_t entry_selfheal = _gf_false;
 2487     afr_private_t *priv = NULL;
 2488 
 2489     priv = this->private;
 2490 
 2491     ret = afr_selfheal_unlocked_inspect(frame, this, gfid, &inode,
 2492                                         &data_selfheal, &metadata_selfheal,
 2493                                         &entry_selfheal);
 2494     if (ret)
 2495         goto out;
 2496 
 2497     if (!(data_selfheal || metadata_selfheal || entry_selfheal)) {
 2498         ret = 2;
 2499         goto out;
 2500     }
 2501 
 2502     if (inode->ia_type == IA_IFREG) {
 2503         ret = afr_selfheal_data_open(this, inode, &fd);
 2504         if (!fd) {
 2505             ret = -EIO;
 2506             goto out;
 2507         }
 2508     }
 2509 
 2510     if (data_selfheal && priv->data_self_heal)
 2511         data_ret = afr_selfheal_data(frame, this, fd);
 2512 
 2513     if (metadata_selfheal && priv->metadata_self_heal)
 2514         metadata_ret = afr_selfheal_metadata(frame, this, inode);
 2515 
 2516     if (entry_selfheal && priv->entry_self_heal)
 2517         entry_ret = afr_selfheal_entry(frame, this, inode);
 2518 
 2519     or_ret = (data_ret | metadata_ret | entry_ret);
 2520 
 2521     if (data_ret == -EIO || metadata_ret == -EIO || entry_ret == -EIO)
 2522         ret = -EIO;
 2523     else if (data_ret == 1 && metadata_ret == 1 && entry_ret == 1)
 2524         ret = 1;
 2525     else if (or_ret < 0)
 2526         ret = or_ret;
 2527     else
 2528         ret = 0;
 2529 
 2530 out:
 2531     if (inode)
 2532         inode_unref(inode);
 2533     if (fd)
 2534         fd_unref(fd);
 2535     return ret;
 2536 }
 2537 /*
 2538  * This is the entry point for healing a given GFID. The return values for this
 2539  * function are as follows:
 2540  * '0' if the self-heal is successful
 2541  * '1' if the afr-xattrs are non-zero (due to on-going IO) and no heal is needed
 2542  * '2' if the afr-xattrs are all-zero and no heal is needed
 2543  * $errno if the heal on the gfid failed.
 2544  */
 2545 
 2546 int
 2547 afr_selfheal(xlator_t *this, uuid_t gfid)
 2548 {
 2549     int ret = -1;
 2550     call_frame_t *frame = NULL;
 2551     afr_local_t *local = NULL;
 2552 
 2553     frame = afr_frame_create(this, NULL);
 2554     if (!frame)
 2555         return ret;
 2556 
 2557     local = frame->local;
 2558     local->xdata_req = dict_new();
 2559 
 2560     ret = afr_selfheal_do(frame, this, gfid);
 2561 
 2562     if (frame)
 2563         AFR_STACK_DESTROY(frame);
 2564 
 2565     return ret;
 2566 }
 2567 
 2568 afr_local_t *
 2569 __afr_dequeue_heals(afr_private_t *priv)
 2570 {
 2571     afr_local_t *local = NULL;
 2572 
 2573     if (list_empty(&priv->heal_waiting))
 2574         goto none;
 2575     if ((priv->background_self_heal_count > 0) &&
 2576         (priv->healers >= priv->background_self_heal_count))
 2577         goto none;
 2578 
 2579     local = list_entry(priv->heal_waiting.next, afr_local_t, healer);
 2580     priv->heal_waiters--;
 2581     GF_ASSERT(priv->heal_waiters >= 0);
 2582     list_del_init(&local->healer);
 2583     list_add(&local->healer, &priv->healing);
 2584     priv->healers++;
 2585     return local;
 2586 none:
 2587     gf_msg_debug(THIS->name, 0,
 2588                  "Nothing dequeued. "
 2589                  "Num healers: %d, Num Waiters: %d",
 2590                  priv->healers, priv->heal_waiters);
 2591     return NULL;
 2592 }
 2593 
 2594 int
 2595 afr_refresh_selfheal_wrap(void *opaque)
 2596 {
 2597     call_frame_t *heal_frame = opaque;
 2598     afr_local_t *local = heal_frame->local;
 2599     int ret = 0;
 2600 
 2601     ret = afr_selfheal(heal_frame->this, local->refreshinode->gfid);
 2602     return ret;
 2603 }
 2604 
 2605 int
 2606 afr_refresh_heal_done(int ret, call_frame_t *frame, void *opaque)
 2607 {
 2608     call_frame_t *heal_frame = opaque;
 2609     xlator_t *this = heal_frame->this;
 2610     afr_private_t *priv = this->private;
 2611     afr_local_t *local = heal_frame->local;
 2612 
 2613     LOCK(&priv->lock);
 2614     {
 2615         list_del_init(&local->healer);
 2616         priv->healers--;
 2617         GF_ASSERT(priv->healers >= 0);
 2618         local = __afr_dequeue_heals(priv);
 2619     }
 2620     UNLOCK(&priv->lock);
 2621 
 2622     AFR_STACK_DESTROY(heal_frame);
 2623 
 2624     if (local)
 2625         afr_heal_synctask(this, local);
 2626     return 0;
 2627 }
 2628 
 2629 void
 2630 afr_heal_synctask(xlator_t *this, afr_local_t *local)
 2631 {
 2632     int ret = 0;
 2633     call_frame_t *heal_frame = NULL;
 2634 
 2635     heal_frame = local->heal_frame;
 2636     ret = synctask_new(this->ctx->env, afr_refresh_selfheal_wrap,
 2637                        afr_refresh_heal_done, heal_frame, heal_frame);
 2638     if (ret < 0)
 2639         /* Heal not launched. Will be queued when the next inode
 2640          * refresh happens and shd hasn't healed it yet. */
 2641         afr_refresh_heal_done(ret, heal_frame, heal_frame);
 2642 }
 2643 
 2644 gf_boolean_t
 2645 afr_throttled_selfheal(call_frame_t *frame, xlator_t *this)
 2646 {
 2647     gf_boolean_t can_heal = _gf_true;
 2648     afr_private_t *priv = this->private;
 2649     afr_local_t *local = frame->local;
 2650 
 2651     LOCK(&priv->lock);
 2652     {
 2653         if ((priv->background_self_heal_count > 0) &&
 2654             (priv->heal_wait_qlen + priv->background_self_heal_count) >
 2655                 (priv->heal_waiters + priv->healers)) {
 2656             list_add_tail(&local->healer, &priv->heal_waiting);
 2657             priv->heal_waiters++;
 2658             local = __afr_dequeue_heals(priv);
 2659         } else {
 2660             can_heal = _gf_false;
 2661         }
 2662     }
 2663     UNLOCK(&priv->lock);
 2664 
 2665     if (can_heal) {
 2666         if (local)
 2667             afr_heal_synctask(this, local);
 2668         else
 2669             gf_msg_debug(this->name, 0,
 2670                          "Max number of heals are "
 2671                          "pending, background self-heal rejected.");
 2672     }
 2673 
 2674     return can_heal;
 2675 }
 2676 
 2677 int
 2678 afr_choose_source_by_policy(afr_private_t *priv, unsigned char *sources,
 2679                             afr_transaction_type type)
 2680 {
 2681     int source = -1;
 2682     int i = 0;
 2683 
 2684     /* Give preference to local child to save on bandwidth */
 2685     for (i = 0; i < priv->child_count; i++) {
 2686         if (priv->local[i] && sources[i]) {
 2687             if ((type == AFR_DATA_TRANSACTION) && AFR_IS_ARBITER_BRICK(priv, i))
 2688                 continue;
 2689 
 2690             source = i;
 2691             goto out;
 2692         }
 2693     }
 2694 
 2695     for (i = 0; i < priv->child_count; i++) {
 2696         if (sources[i]) {
 2697             source = i;
 2698             goto out;
 2699         }
 2700     }
 2701 out:
 2702     return source;
 2703 }