"Fossies" - the Fresh Open Source Software Archive

Member "glusterfs-6.9/xlators/cluster/afr/src/afr.h" (23 Apr 2020, 41525 Bytes) of package /linux/misc/glusterfs-6.9.tar.gz:


As a special service, "Fossies" has tried to format the requested source page into HTML using (guessed) C and C++ source code syntax highlighting (style: standard), with prefixed line numbers and a code-folding option. Alternatively, you can view or download the uninterpreted source code file here. For more information about "afr.h", see the Fossies "Dox" file reference documentation and the latest Fossies "Diffs" side-by-side code changes report: 7.5_vs_7.6.

    1 /*
    2   Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
    3   This file is part of GlusterFS.
    4 
    5   This file is licensed to you under your choice of the GNU Lesser
    6   General Public License, version 3 or any later version (LGPLv3 or
    7   later), or the GNU General Public License, version 2 (GPLv2), in all
    8   cases as published by the Free Software Foundation.
    9 */
   10 
   11 #ifndef __AFR_H__
   12 #define __AFR_H__
   13 
   14 #include <glusterfs/call-stub.h>
   15 #include <glusterfs/compat-errno.h>
   16 #include "afr-mem-types.h"
   17 
   18 #include "libxlator.h"
   19 #include <glusterfs/timer.h>
   20 #include <glusterfs/syncop.h>
   21 
   22 #include "afr-self-heald.h"
   23 #include "afr-messages.h"
   24 
   /* Compile-time constants and key/format strings used by AFR. */
   25 #define SHD_INODE_LRU_LIMIT 1
   26 #define AFR_PATHINFO_HEADER "REPLICATE:"
   27 #define AFR_SH_READDIR_SIZE_KEY "self-heal-readdir-size"
   28 #define AFR_SH_DATA_DOMAIN_FMT "%s:self-heal"
   29 #define AFR_DIRTY_DEFAULT AFR_XATTR_PREFIX ".dirty"
   /* Evaluates to the afr_dirty string stored in the current translator's
    * private data (THIS->private viewed as afr_private_t). */
   30 #define AFR_DIRTY (((afr_private_t *)(THIS->private))->afr_dirty)
   31 
   /* Capacity of the lockee[] array in afr_internal_lock_t. */
   32 #define AFR_LOCKEE_COUNT_MAX 3
   33 #define AFR_DOM_COUNT_MAX 3
   34 #define AFR_NUM_CHANGE_LOGS 3              /*data + metadata + entry*/
   35 #define AFR_DEFAULT_SPB_CHOICE_TIMEOUT 300 /*in seconds*/
   36 
   /* Child index that AFR_IS_ARBITER_BRICK() compares against. */
   37 #define ARBITER_BRICK_INDEX 2
   38 #define THIN_ARBITER_BRICK_INDEX 2
   /* NOTE(review): presumably lock-domain names used for thin-arbiter
    * notify/modify locking - confirm against the users of these strings. */
   39 #define AFR_TA_DOM_NOTIFY "afr.ta.dom-notify"
   40 #define AFR_TA_DOM_MODIFY "afr.ta.dom-modify"
   41 
   42 #define AFR_HALO_MAX_LATENCY 99999
   43 
   /* Single-bit flags (bit 0 and bit 1); they can be OR-ed together. */
   44 #define PFLAG_PENDING (1 << 0)
   45 #define PFLAG_SBRAIN (1 << 1)
   46 
   /* Callback signatures used by the structures declared in this header. */
   /* Type of afr_internal_lock_t.lock_cbk. */
   47 typedef int (*afr_lock_cbk_t)(call_frame_t *frame, xlator_t *this);
   48 
   /* Type of afr_local_t.readfn: performs the read operation on the given
    * subvolume; used in read transactions. */
   49 typedef int (*afr_read_txn_wind_t)(call_frame_t *frame, xlator_t *this,
   50                                    int subvol);
   51 
   /* Type of afr_local_t.refreshfn. NOTE(review): presumably called when an
    * inode refresh completes, with err carrying the outcome - confirm. */
   52 typedef int (*afr_inode_refresh_cbk_t)(call_frame_t *frame, xlator_t *this,
   53                                        int err);
   54 
   /* Type of afr_local_t.transaction.changelog_resume: called after
    * changelogging (either pre-op or post-op) is done. */
   55 typedef int (*afr_changelog_resume_t)(call_frame_t *frame, xlator_t *this);
   56 
   57 #define AFR_COUNT(array, max)                                                  \
   58     ({                                                                         \
   59         int __i;                                                               \
   60         int __res = 0;                                                         \
   61         for (__i = 0; __i < max; __i++)                                        \
   62             if (array[__i])                                                    \
   63                 __res++;                                                       \
   64         __res;                                                                 \
   65     })
   66 #define AFR_INTERSECT(dst, src1, src2, max)                                    \
   67     ({                                                                         \
   68         int __i;                                                               \
   69         for (__i = 0; __i < max; __i++)                                        \
   70             dst[__i] = src1[__i] && src2[__i];                                 \
   71     })
   72 #define AFR_CMP(a1, a2, len)                                                   \
   73     ({                                                                         \
   74         int __cmp = 0;                                                         \
   75         int __i;                                                               \
   76         for (__i = 0; __i < len; __i++)                                        \
   77             if (a1[__i] != a2[__i]) {                                          \
   78                 __cmp = 1;                                                     \
   79                 break;                                                         \
   80             }                                                                  \
   81         __cmp;                                                                 \
   82     })
   83 #define AFR_IS_ARBITER_BRICK(priv, index)                                      \
   84     ((priv->arbiter_count == 1) && (index == ARBITER_BRICK_INDEX))
   85 
   86 #define AFR_SET_ERROR_AND_CHECK_SPLIT_BRAIN(ret, errnum)                       \
   87     do {                                                                       \
   88         local->op_ret = ret;                                                   \
   89         local->op_errno = errnum;                                              \
   90         if (local->op_errno == EIO)                                            \
   91             gf_msg(this->name, GF_LOG_ERROR, local->op_errno,                  \
   92                    AFR_MSG_SPLIT_BRAIN,                                        \
   93                    "Failing %s on gfid %s: "                                   \
   94                    "split-brain observed.",                                    \
   95                    gf_fop_list[local->op], uuid_utoa(local->inode->gfid));     \
   96     } while (0)
   97 
   /* Policy to use for automatic resolution of split-brains (type of
    * afr_private_t.fav_child_policy). Values start at 0 and increase by one;
    * AFR_FAV_CHILD_POLICY_MAX is the last enumerator. */
   98 typedef enum {
   99     AFR_FAV_CHILD_NONE,
  100     AFR_FAV_CHILD_BY_SIZE,
  101     AFR_FAV_CHILD_BY_CTIME,
  102     AFR_FAV_CHILD_BY_MTIME,
  103     AFR_FAV_CHILD_BY_MAJORITY,
  104     AFR_FAV_CHILD_POLICY_MAX,
  105 } afr_favorite_child_policy;
  106 
  /* Data self-heal algorithm selector (type of
   * afr_private_t.data_self_heal_algorithm). */
  107 typedef enum {
  108     AFR_SELFHEAL_DATA_FULL = 0,
  109     AFR_SELFHEAL_DATA_DIFF,
  110     AFR_SELFHEAL_DATA_DYNAMIC,
  111 } afr_data_self_heal_type_t;
  112 
  /* Symbolic child indices: UNKNOWN is -1, ZERO is 0, ONE is 1 and
   * THIN_ARBITER is 2 (the same value as THIN_ARBITER_BRICK_INDEX). */
  113 typedef enum {
  114     AFR_CHILD_UNKNOWN = -1,
  115     AFR_CHILD_ZERO,
  116     AFR_CHILD_ONE,
  117     AFR_CHILD_THIN_ARBITER,
  118 } afr_child_index;
  119 
  /* State of a fop with respect to thin-arbiter handling (type of
   * afr_local_t.fop_state). */
  120 typedef enum {
  121     TA_WAIT_FOR_NOTIFY_LOCK_REL, /*FOP came after notify domain lock upcall
  122                                    notification and waiting for its release.*/
  123     TA_GET_INFO_FROM_TA_FILE,    /*FOP needs post-op on ta file to get
  124                                   *info about which brick is bad.*/
  125     TA_INFO_IN_MEMORY_SUCCESS,   /*Bad brick info is in memory and fop failed
  126                                   *on BAD brick - Success*/
  127     TA_INFO_IN_MEMORY_FAILED,    /*Bad brick info is in memory and fop failed
  128                                   *on GOOD brick - Failed*/
  129     TA_SUCCESS,                  /*FOP succeeded on both data bricks.*/
  130 } afr_ta_fop_state_t;
  131 
  /* NFS-daemon related settings, embedded in afr_private_t as `nfsd`.
   * NOTE(review): iamnfsd presumably marks that this instance runs inside the
   * NFS server process - confirm against where it is set. */
  132 struct afr_nfsd {
  133     gf_boolean_t iamnfsd;
  134     uint32_t halo_max_latency_msec;
  135 };
  136 
  /*
   * Private state of an AFR translator instance. The AFR_DIRTY macro in this
   * header reads it through THIS->private.
   * NOTE(review): the per-child arrays (child_up, child_latency, pending_key,
   * last_event, ...) are presumably sized by child_count - confirm at the
   * allocation sites.
   */
  137 typedef struct _afr_private {
  138     gf_lock_t lock;             /* to guard access to child_count, etc */
  139     unsigned int child_count;   /* total number of children   */
  140     unsigned int arbiter_count; /*subset of child_count.
  141                                   Has to be 0 or 1.*/
  142 
  143     xlator_t **children;
  144 
  145     inode_t *root_inode;
  146 
  147     /* For thin-arbiter. */
  148     unsigned int thin_arbiter_count; /* 0 or 1 at the moment.*/
  149     uuid_t ta_gfid;
  150     unsigned char ta_child_up;
  151     int ta_bad_child_index;
  152     int ta_event_gen;
  153     off_t ta_notify_dom_lock_offset;
  154     gf_boolean_t release_ta_notify_dom_lock;
  155     unsigned int ta_in_mem_txn_count;
  156     unsigned int ta_on_wire_txn_count;
  157     struct list_head ta_waitq;
  158     struct list_head ta_onwireq;
  159 
  160     unsigned char *child_up;
  161     int64_t *child_latency;
  162     unsigned char *local;
  163 
  164     char **pending_key;
  165 
  166     afr_data_self_heal_type_t data_self_heal_algorithm;
  167     unsigned int data_self_heal_window_size; /* max number of pipelined
  168                                                 read/writes */
  169 
  170     struct list_head heal_waiting; /*queue for files that need heal*/
  171     uint32_t heal_wait_qlen; /*configurable queue length for heal_waiting*/
  172     int32_t heal_waiters;    /* No. of elements currently in wait queue.*/
  173 
  174     struct list_head healing;            /* queue for files that are undergoing
  175                                             background heal*/
  176     uint32_t background_self_heal_count; /*configurable queue length for
  177                                            healing queue*/
  178     int32_t healers; /* No. of elements currently undergoing background
  179                       heal*/
  180 
  181     gf_boolean_t metadata_self_heal; /* on/off */
  182     gf_boolean_t entry_self_heal;    /* on/off */
  183 
  184     gf_boolean_t metadata_splitbrain_forced_heal; /* on/off */
  185     int read_child;                               /* read-subvolume */
  186     unsigned int hash_mode;     /* for when read_child is not set */
  187     gf_atomic_t *pending_reads; /*No. of pending read cbks per child.*/
  188     int favorite_child;         /* subvolume to be preferred in resolving
  189                                             split-brain cases */
  190 
  191     afr_favorite_child_policy fav_child_policy; /*Policy to use for automatic
  192                                               resolution of split-brains.*/
  193 
  194     unsigned int wait_count; /* # of servers to wait for success */
  195 
  196     gf_timer_t *timer; /* launched when parent up is received */
  197 
  198     gf_boolean_t optimistic_change_log;
  199     gf_boolean_t eager_lock;
  200     gf_boolean_t pre_op_compat; /* on/off */
  201     uint32_t post_op_delay_secs;
  202     unsigned int quorum_count;
  203 
  204     char vol_uuid[UUID_SIZE + 1];
  205     int32_t *last_event;
  206 
  207     /* @event_generation: Keeps count of number of events received which can
  208        potentially impact consistency decisions. The events are CHILD_UP
  209        and CHILD_DOWN, when we have to recalculate the freshness/staleness
  210        of copies to detect if changes had happened while the other server
  211        was down. CHILD_DOWN and CHILD_UP can also be received on network
  212        disconnect/reconnects and not necessarily server going down/up.
  213        Recalculating freshness/staleness on network events is equally
  214        important as we might have had a network split brain.
  215     */
  216     uint32_t event_generation;
  217 
  218     gf_boolean_t choose_local;
  219     gf_boolean_t did_discovery;
  220     uint64_t sh_readdir_size;
  221     gf_boolean_t ensure_durability;
  222     char *sh_domain;
  223     char *afr_dirty; /* string that the AFR_DIRTY macro evaluates to */
  224     gf_boolean_t halo_enabled;
  225 
  226     uint32_t halo_max_latency_msec;
  227     uint32_t halo_max_replicas;
  228     uint32_t halo_min_replicas;
  229 
  230     afr_self_heald_t shd;
  231     struct afr_nfsd nfsd;
  232 
  233     gf_boolean_t consistent_metadata;
  234     uint64_t spb_choice_timeout;
  235     gf_boolean_t need_heal;
  236 
  237     /* pump dependencies */
  238     void *pump_private;
  239     gf_boolean_t use_afr_in_pump;
  240     gf_boolean_t granular_locks;
  241     gf_boolean_t full_lock;
  242     gf_boolean_t esh_granular;
  243     gf_boolean_t consistent_io;
  244     gf_boolean_t data_self_heal; /* on/off */
  245 } afr_private_t;
  246 
  /* Kind of transaction being performed. afr_index_for_transaction_type()
   * maps these to a changelog index: data -> 0, metadata -> 1, entry and
   * entry-rename -> 2. */
  247 typedef enum {
  248     AFR_DATA_TRANSACTION,         /* truncate, write, ... */
  249     AFR_METADATA_TRANSACTION,     /* chmod, chown, ... */
  250     AFR_ENTRY_TRANSACTION,        /* create, rmdir, ... */
  251     AFR_ENTRY_RENAME_TRANSACTION, /* rename */
  252 } afr_transaction_type;
  253 
  254 /*
  255   xattr format: trusted.afr.volume = [x y z]
  256   x - data pending
  257   y - metadata pending
  258   z - entry pending
  259 */
  260 
  261 static inline int
  262 afr_index_for_transaction_type(afr_transaction_type type)
  263 {
  264     switch (type) {
  265         case AFR_DATA_TRANSACTION:
  266             return 0;
  267 
  268         case AFR_METADATA_TRANSACTION:
  269             return 1;
  270 
  271         case AFR_ENTRY_TRANSACTION:
  272         case AFR_ENTRY_RENAME_TRANSACTION:
  273             return 2;
  274     }
  275 
  276     return -1; /* make gcc happy */
  277 }
  278 
  279 static inline int
  280 afr_index_from_ia_type(ia_type_t type)
  281 {
  282     switch (type) {
  283         case IA_IFDIR:
  284             return afr_index_for_transaction_type(AFR_ENTRY_TRANSACTION);
  285         case IA_IFREG:
  286             return afr_index_for_transaction_type(AFR_DATA_TRANSACTION);
  287         default:
  288             return -1;
  289     }
  290 }
  291 
  /* One lock target; afr_internal_lock_t keeps up to AFR_LOCKEE_COUNT_MAX of
   * these in lockee[].
   * NOTE(review): locked_nodes is presumably a per-child flag array and
   * locked_count the number of set flags - confirm against the lock code. */
  292 typedef struct {
  293     struct gf_flock flock;
  294     loc_t loc;
  295     fd_t *fd;
  296     char *basename;
  297     unsigned char *locked_nodes;
  298     int locked_count;
  299 
  300 } afr_lockee_t;
  301 
  /* NOTE(review): the (const void *, const void *) -> int signature matches
   * a qsort()-style comparator; presumably used to order afr_lockee_t
   * entries - confirm against the definition. */
  302 int
  303 afr_entry_lockee_cmp(const void *l1, const void *l2);
  304 
  /* Locking state carried by a fop; embedded in afr_local_t as
   * `internal_lock`. Holds up to AFR_LOCKEE_COUNT_MAX lock targets, call
   * counters, the aggregated lock result and the callback (lock_cbk). */
  305 typedef struct {
  306     loc_t *lk_loc;
  307 
  308     int lockee_count; /* NOTE(review): presumably number of used lockee[] slots */
  309     afr_lockee_t lockee[AFR_LOCKEE_COUNT_MAX];
  310 
  311     const char *lk_basename;
  312     const char *lower_basename;
  313     const char *higher_basename;
  314     char lower_locked;
  315     char higher_locked;
  316 
  317     unsigned char *lower_locked_nodes;
  318 
  319     int32_t lock_count;
  320 
  321     int32_t lk_call_count;
  322     int32_t lk_expected_count;
  323     int32_t lk_attempted_count;
  324 
  325     int32_t lock_op_ret;
  326     int32_t lock_op_errno;
  327     afr_lock_cbk_t lock_cbk;
  328     char *domain; /* Domain on which inode/entry lock/unlock in progress.*/
  329 } afr_internal_lock_t;
  330 
  /* Result of one fop reply: status, returned dicts and iatt buffers.
   * afr_local_t references an array of these through `replies`.
   * NOTE(review): presumably one entry per child, with `valid` marking that a
   * reply was received - confirm against the reply-handling code. */
  331 struct afr_reply {
  332     int valid;
  333     int32_t op_ret;
  334     int32_t op_errno;
  335     dict_t *xattr; /*For xattrop*/
  336     dict_t *xdata;
  337     struct iatt poststat;
  338     struct iatt postparent;
  339     struct iatt prestat;
  340     struct iatt preparent;
  341     struct iatt preparent2;
  342     struct iatt postparent2;
  343     /* For rchecksum */
  344     uint8_t checksum[SHA256_DIGEST_LENGTH];
  345     gf_boolean_t buf_has_zeroes;
  346     gf_boolean_t fips_mode_rchecksum;
  347     /* For lookup */
  348     int8_t need_heal;
  349 };
  350 
  /* Open state of an fd on one subvolume (element type of
   * afr_fd_ctx_t.opened_on). */
  351 typedef enum {
  352     AFR_FD_NOT_OPENED,
  353     AFR_FD_OPENED,
  354     AFR_FD_OPENING
  355 } afr_fd_open_status_t;
  356 
  /* Per-fd context kept by AFR (referenced from afr_local_t.fd_ctx). */
  357 typedef struct {
  358     afr_fd_open_status_t *opened_on; /* which subvolumes the fd is open on */
  359     int flags;
  360 
  361     /* the subvolume on which the latest sequence of readdirs (starting
  362        at offset 0) has begun. Till the next readdir request with 0 offset
  363        arrives, we continue to read off this subvol.
  364     */
  365     int readdir_subvol;
  366 } afr_fd_ctx_t;
  367 
  /* Lock state of a fop (type of afr_local_t.fop_lock_state).
   * NOTE(review): PARALLEL/SERIAL presumably describe how locks are attempted
   * across children - confirm against the locking code. */
  368 typedef enum {
  369     AFR_FOP_LOCK_PARALLEL,
  370     AFR_FOP_LOCK_SERIAL,
  371     AFR_FOP_LOCK_QUORUM_FAILED,
  372 } afr_fop_lock_state_t;
  373 
  /* Eager-lock bookkeeping for one transaction type on an inode;
   * afr_inode_ctx_t holds two of these in lock[2] (DATA/METADATA). Tracks
   * the transactions in each phase through four lists. */
  374 typedef struct _afr_inode_lock_t {
  375     /* @num_inodelks:
  376        Number of inodelks queried from the server, as queried through
  377        xdata in FOPs. Currently, used to decide if eager-locking must be
  378        temporarily disabled.
  379     */
  380     int32_t num_inodelks;
  381     unsigned int event_generation;
  382     gf_boolean_t release;
  383     gf_boolean_t acquired;
  384     gf_timer_t *delay_timer;
  385     struct list_head owners;  /*Transactions that are performing fop*/
  386     struct list_head post_op; /*Transactions that are done with the fop
  387                                *So can not conflict with the fops*/
  388     struct list_head waiting; /*Transaction that are waiting for
  389                                *conflicting transactions to complete*/
  390     struct list_head frozen;  /*Transactions that need to go as part of
  391                                * next batch of eager-lock*/
  392 } afr_lock_t;
  393 
  /* Per-inode context kept by AFR (referenced from afr_local_t.inode_ctx).
   * The three [AFR_NUM_CHANGE_LOGS] arrays have one slot per changelog type
   * (data, metadata, entry). */
  394 typedef struct _afr_inode_ctx {
  395     uint64_t read_subvol;
  396     uint64_t write_subvol;
  397     int lock_count;
  398     int spb_choice;
  399     gf_timer_t *timer;
  400     gf_boolean_t need_refresh;
  401     unsigned int *pre_op_done[AFR_NUM_CHANGE_LOGS];
  402     int inherited[AFR_NUM_CHANGE_LOGS];
  403     int on_disk[AFR_NUM_CHANGE_LOGS];
  404 
  405     /* set if any write on this fd was a non stable write
  406        (i.e, without O_SYNC or O_DSYNC)
  407     */
  408     gf_boolean_t witnessed_unstable_write;
  409 
  410     /* @open_fd_count:
  411        Number of open FDs queried from the server, as queried through
  412        xdata in FOPs. Currently, used to decide if eager-locking must be
  413        temporarily disabled.
  414     */
  415     uint32_t open_fd_count;
  416     /*Only 2 types of transactions support eager-locks now. DATA/METADATA*/
  417     afr_lock_t lock[2];
  418 } afr_inode_ctx_t;
  419 
  /*
   * Per-fop state. Groups: the fop identity and aggregated result (op,
   * op_ret, op_errno), snapshots taken at transaction start
   * (event_generation, child_up), read-transaction state, the per-fop
   * argument stash (`cont`), write-transaction state (`transaction`), reply
   * storage (`replies`) and thin-arbiter bookkeeping.
   * The macro AFR_SET_ERROR_AND_CHECK_SPLIT_BRAIN in this header expects a
   * variable named `local` with the op_ret/op_errno/op/inode members below.
   */
  420 typedef struct _afr_local {
  421     glusterfs_fop_t op;
  422     unsigned int call_count;
  423 
  424     /* @event_generation: copy of priv->event_generation taken at the
  425        time of starting the transaction. The copy is made so that we
  426        have a stable value through the various phases of the transaction.
  427     */
  428     unsigned int event_generation;
  429 
  430     uint32_t open_fd_count;
  431     gf_boolean_t update_open_fd_count;
  432     int32_t num_inodelks;
  433     gf_boolean_t update_num_inodelks;
  434 
  435     gf_lkowner_t saved_lk_owner;
  436 
  437     int32_t op_ret;
  438     int32_t op_errno;
  439 
  440     int32_t **pending;
  441 
  442     int dirty[AFR_NUM_CHANGE_LOGS];
  443 
  444     loc_t loc;
  445     loc_t newloc;
  446 
  447     fd_t *fd;
  448     afr_fd_ctx_t *fd_ctx;
  449 
  450     /* @child_up: copy of priv->child_up taken at the time of transaction
  451        start. The copy is taken so that we have a stable child_up array
  452        through the phases of the transaction as priv->child_up[i] can keep
  453        changing through time.
  454     */
  455     unsigned char *child_up;
  456 
  457     /* @read_attempted:
  458        array of flags representing subvolumes where read operations of
  459        the read transaction have already been attempted. The array is
  460        first pre-filled with down subvolumes, and as reads are performed
  461        on other subvolumes, those are set as well. This way if the read
  462        operation fails we do not retry on that subvolume again.
  463     */
  464     unsigned char *read_attempted;
  465 
  466     /* @readfn:
  467 
  468        pointer to function which will perform the read operation on a given
  469        subvolume. Used in read transactions.
  470     */
  471 
  472     afr_read_txn_wind_t readfn;
  473 
  474     /* @refreshed:
  475 
  476        the inode was "refreshed" (i.e, pending xattrs from all subvols
  477        freshly inspected and inode ctx updated accordingly) as part of
  478        this transaction already.
  479     */
  480     gf_boolean_t refreshed;
  481 
  482     /* @inode:
  483 
  484        the inode on which the read txn is performed on. ref'ed and copied
  485        from either fd->inode or loc.inode
  486     */
  487 
  488     inode_t *inode;
  489 
  490     /* @parent[2]:
  491 
  492        parent inode[s] on which directory transactions are performed.
  493     */
  494 
  495     inode_t *parent;
  496     inode_t *parent2;
  497 
  498     /* @readable:
  499 
  500        array of flags representing servers from which a read can be
  501        performed. This is the output of afr_inode_refresh()
  502     */
  503     unsigned char *readable;
  504     unsigned char *readable2; /*For rename transaction*/
  505 
  506     int read_subvol; /* Current read subvolume */
  507 
  508     afr_inode_refresh_cbk_t refreshfn;
  509 
  510     /* @refreshinode:
  511 
  512        Inode currently getting refreshed.
  513     */
  514     inode_t *refreshinode;
  515 
  516     /*To handle setattr/setxattr on yet to be linked inode from dht*/
  517     uuid_t refreshgfid;
  518 
  519     /*
  520       @pre_op_compat:
  521 
  522       compatibility mode of pre-op. send a separate pre-op and
  523       op operations as part of transaction, rather than combining
  524     */
  525 
  526     gf_boolean_t pre_op_compat;
  527 
  528     dict_t *xattr_req;
  529 
  530     afr_internal_lock_t internal_lock;
  531 
  532     dict_t *dict;
  533 
  534     int optimistic_change_log;
  535 
  536     /* Is the current writev() going to perform a stable write?
  537        i.e, is fd->flags or @flags writev param have O_SYNC or
  538        O_DSYNC?
  539     */
  540     gf_boolean_t stable_write;
  541 
  542     /* This write appended to the file. Not necessarily O_APPEND,
  543        just means the offset of write was at the end of file.
  544     */
  545     gf_boolean_t append_write;
  546 
  547     /*
  548       This struct contains the arguments for the "continuation"
  549       (scheme-like) of fops
  550     */
  551 
  552     struct {
  553         struct {
  554             gf_boolean_t needs_fresh_lookup;
  555             uuid_t gfid_req;
  556         } lookup;
  557 
  558         struct {
  559             unsigned char buf_set;
  560             struct statvfs buf;
  561         } statfs;
  562 
  563         struct {
  564             int32_t flags;
  565             fd_t *fd;
  566         } open;
  567 
  568         struct {
  569             int32_t cmd;
  570             struct gf_flock user_flock;
  571             struct gf_flock ret_flock;
  572             unsigned char *locked_nodes;
  573         } lk;
  574 
  575         /* inode read */
  576 
  577         struct {
  578             int32_t mask;
  579             int last_index; /* index of the child we tried previously */
  580         } access;
  581 
  582         struct {
  583             int last_index;
  584         } stat;
  585 
  586         struct {
  587             int last_index;
  588         } fstat;
  589 
  590         struct {
  591             size_t size;
  592             int last_index;
  593         } readlink;
  594 
  595         struct {
  596             char *name;
  597             int last_index;
  598             long xattr_len;
  599         } getxattr;
  600 
  601         struct {
  602             size_t size;
  603             off_t offset;
  604             int last_index;
  605             uint32_t flags;
  606         } readv;
  607 
  608         /* dir read */
  609 
  610         struct {
  611             int success_count;
  612             int32_t op_ret;
  613             int32_t op_errno;
  614 
  615             uint32_t *checksum;
  616         } opendir;
  617 
  618         struct {
  619             int32_t op_ret;
  620             int32_t op_errno;
  621             size_t size;
  622             off_t offset;
  623             dict_t *dict;
  624             gf_boolean_t failed;
  625             int last_index;
  626         } readdir;
  627         /* inode write */
  628 
  629         struct {
  630             struct iatt prebuf;
  631             struct iatt postbuf;
  632         } inode_wfop;  // common structure for all inode-write-fops
  633 
  634         struct {
  635             int32_t op_ret;
  636 
  637             struct iovec *vector;
  638             struct iobref *iobref;
  639             int32_t count;
  640             off_t offset;
  641             uint32_t flags;
  642         } writev;
  643 
  644         struct {
  645             off_t offset;
  646         } truncate;
  647 
  648         struct {
  649             off_t offset;
  650         } ftruncate;
  651 
  652         struct {
  653             struct iatt in_buf;
  654             int32_t valid;
  655         } setattr;
  656 
  657         struct {
  658             struct iatt in_buf;
  659             int32_t valid;
  660         } fsetattr;
  661 
  662         struct {
  663             dict_t *dict;
  664             int32_t flags;
  665         } setxattr;
  666 
  667         struct {
  668             dict_t *dict;
  669             int32_t flags;
  670         } fsetxattr;
  671 
  672         struct {
  673             char *name;
  674         } removexattr;
  675 
  676         struct {
  677             dict_t *xattr;
  678             gf_xattrop_flags_t optype;
  679         } xattrop;
  680 
  681         /* dir write */
  682 
  683         struct {
  684             inode_t *inode;
  685             struct iatt buf;
  686             struct iatt preparent;
  687             struct iatt postparent;
  688             struct iatt prenewparent;
  689             struct iatt postnewparent;
  690         } dir_fop;  // common structure for all dir fops
  691 
  692         struct {
  693             fd_t *fd;
  694             dict_t *params;
  695             int32_t flags;
  696             mode_t mode;
  697         } create;
  698 
  699         struct {
  700             dev_t dev;
  701             mode_t mode;
  702             dict_t *params;
  703         } mknod;
  704 
  705         struct {
  706             int32_t mode;
  707             dict_t *params;
  708         } mkdir;
  709 
  710         struct {
  711             int flags;
  712         } rmdir;
  713 
  714         struct {
  715             dict_t *params;
  716             char *linkpath;
  717         } symlink;
  718 
  719         struct {
  720             int32_t mode;
  721             off_t offset;
  722             size_t len;
  723         } fallocate;
  724 
  725         struct {
  726             off_t offset;
  727             size_t len;
  728         } discard;
  729 
  730         struct {
  731             off_t offset;
  732             off_t len;
  733             struct iatt prebuf;
  734             struct iatt postbuf;
  735         } zerofill;
  736 
  737         struct {
  738             char *volume;
  739             int32_t cmd;
  740             int32_t in_cmd;
  741             struct gf_flock in_flock;
  742             struct gf_flock flock;
  743             void *xdata;
  744         } inodelk;
  745 
  746         struct {
  747             char *volume;
  748             char *basename;
  749             entrylk_cmd in_cmd;
  750             entrylk_cmd cmd;
  751             entrylk_type type;
  752             void *xdata;
  753         } entrylk;
  754 
  755         struct {
  756             off_t offset;
  757             gf_seek_what_t what;
  758         } seek;
  759 
  760         struct {
  761             int32_t datasync;
  762         } fsync;
  763 
  764         struct {
  765             struct gf_lease user_lease;
  766             struct gf_lease ret_lease;
  767             unsigned char *locked_nodes;
  768         } lease;
  769 
  770     } cont;
  771 
  772     struct {
  773         off_t start, len;
  774 
  775         gf_boolean_t eager_lock_on;
  776         gf_boolean_t do_eager_unlock;
  777 
  778         char *basename;
  779         char *new_basename;
  780 
  781         loc_t parent_loc;
  782         loc_t new_parent_loc;
  783 
  784         afr_transaction_type type;
  785 
  786         /* stub to resume on destruction
  787            of the transaction frame */
  788         call_stub_t *resume_stub;
  789 
  790         struct list_head owner_list;
  791         struct list_head wait_list;
  792 
  793         unsigned char *pre_op;
  794 
  795         /* Changelog xattr dict for [f]xattrop*/
  796         dict_t **changelog_xdata;
  797         unsigned char *pre_op_sources;
  798 
  799         /* @failed_subvols: subvolumes on which a pre-op or a
  800             FOP failed. */
  801         unsigned char *failed_subvols;
  802 
  803         /* @dirtied: flag which indicates whether we set dirty flag
  804            in the OP. Typically true when we are performing operation
  805            on more than one subvol and optimistic changelog is disabled
  806 
  807            A 'true' value set in @dirtied flag means an 'undirtying'
  808            has to be done in POST-OP phase.
  809         */
  810         gf_boolean_t dirtied;
  811 
  812         /* @inherited: flag which indicates that the dirty flags
  813            of the previous transaction were inherited
  814         */
  815         gf_boolean_t inherited;
  816 
  817         /*
  818           @no_uninherit: flag which indicates that a pre_op_uninherit()
  819           must _not_ be attempted (and returned as failure) always. This
  820           flag is set when a hard pre-op is performed, but not accounted
  821           for it in fd_ctx->on_disk[]. Such transactions are "isolated"
  822           from the pre-op piggybacking entirely and therefore uninherit
  823           must not be attempted.
  824         */
          /* NOTE(review): in this header on_disk[] is a member of
             afr_inode_ctx_t, not afr_fd_ctx_t, so the "fd_ctx->on_disk[]"
             reference above looks stale - confirm. */
  825         gf_boolean_t no_uninherit;
  826 
  827         /* @uninherit_done:
  828            @uninherit_value:
  829 
  830            The above pair variables make pre_op_uninherit() idempotent.
  831            Both are FALSE initially. The first call to pre_op_uninherit
  832            sets @uninherit_done to TRUE and the return value to
  833            @uninherit_value. Further calls will check for @uninherit_done
  834            to be TRUE and if so will simply return @uninherit_value.
  835         */
  836         gf_boolean_t uninherit_done;
  837         gf_boolean_t uninherit_value;
  838 
  839         gf_boolean_t in_flight_sb;  /* Indicator for occurrence of
  840                                        split-brain while in the middle of
  841                                        a txn. */
  842         int32_t in_flight_sb_errno; /* This is where the cause of the
  843                                        failure on the last good copy of
  844                                        the file is stored.
  845                                        */
  846 
  847         /* @changelog_resume: function to be called after changelogging
  848            (either pre-op or post-op) is done
  849         */
  850         afr_changelog_resume_t changelog_resume;
  851 
  852         call_frame_t *main_frame; /*Fop frame*/
  853         call_frame_t *frame;      /*Transaction frame*/
  854 
  855         int (*wind)(call_frame_t *frame, xlator_t *this, int subvol);
  856 
  857         int (*unwind)(call_frame_t *frame, xlator_t *this);
  858 
  859         /* post-op hook */
  860     } transaction;
  861 
  862     syncbarrier_t barrier;
  863 
  864     /* extra data for fops */
  865     dict_t *xdata_req;
  866     dict_t *xdata_rsp;
  867 
  868     dict_t *xattr_rsp; /*for [f]xattrop*/
  869 
  870     mode_t umask;
  871     int xflag;
  872     gf_boolean_t do_discovery;
  873     struct afr_reply *replies;
  874 
  875     /* For  client side background heals. */
  876     struct list_head healer;
  877     call_frame_t *heal_frame;
  878 
  879     gf_boolean_t need_full_crawl;
  880     afr_fop_lock_state_t fop_lock_state;
  881 
  882     gf_boolean_t is_read_txn;
  883     afr_inode_ctx_t *inode_ctx;
  884 
  885     /*For thin-arbiter transactions.*/
  886     unsigned char read_txn_query_child;
  887     unsigned char ta_child_up;
  888     struct list_head ta_waitq;
  889     struct list_head ta_onwireq;
  890     afr_ta_fop_state_t fop_state;
  891     int ta_failed_subvol;
  892     int ta_event_gen;
  893     gf_boolean_t is_new_entry;
  894 } afr_local_t;
  895 
  /* Bundle of a frame, two split-brain flags, a location and a child index.
   * NOTE(review): the field meanings noted below are inferred from the names
   * only - confirm against the code that fills and consumes this struct. */
  896 typedef struct afr_spbc_timeout {
  897     call_frame_t *frame;
  898     gf_boolean_t d_spb; /* presumably "data split-brain" - TODO confirm */
  899     gf_boolean_t m_spb; /* presumably "metadata split-brain" - TODO confirm */
  900     loc_t *loc;
  901     int spb_child_index; /* presumably index of the chosen child - TODO confirm */
  902 } afr_spbc_timeout_t;
  903 
  /* Frame/location pair. NOTE(review): presumably the opaque argument of the
   * split-brain status query declared later in this header
   * (afr_get_split_brain_status(void *opaque)) - TODO confirm. */
  904 typedef struct afr_spb_status {
  905     call_frame_t *frame;
  906     loc_t *loc; /* held by pointer, unlike afr_empty_brick_args.loc */
  907 } afr_spb_status_t;
  908 
  /* Argument bundle for an "empty brick" operation.
   * NOTE(review): consumer is not visible in this section - confirm usage. */
  909 typedef struct afr_empty_brick_args {
  910     call_frame_t *frame;
  911     loc_t loc;       /* embedded by value, not a pointer */
  912     int empty_index; /* presumably the child index of the empty brick - TODO confirm */
  913     char *op_type;   /* same type as the argument of afr_get_msg_id(char *op_type) */
  914 } afr_empty_brick_args_t;
  915 
  /* Optional extra input accepted by afr_read_subvol_select_by_policy() and
   * afr_read_subvol_get(), both of which take an afr_read_subvol_args_t *. */
  916 typedef struct afr_read_subvol_args {
  917     ia_type_t ia_type; /* inode type */
  918     uuid_t gfid;
  919 } afr_read_subvol_args_t;
  920 
  /* Argument bundle whose name suggests granular entry self-heal.
   * NOTE(review): consumer is not visible in this section - confirm usage. */
  921 typedef struct afr_granular_esh_args {
  922     fd_t *heal_fd;
  923     xlator_t *xl;
  924     call_frame_t *frame;
  925     gf_boolean_t mismatch; /* flag to represent occurrence of type/gfid
  926                               mismatch */
  927 } afr_granular_esh_args_t;
  928 
  /* Prototypes dealing with the per-inode "read subvol" state (readable
   * children plus an event generation). Only declarations are visible here;
   * the notes below are inferred from signatures - verify against the
   * definitions. */
  929 int
  930 afr_inode_get_readable(call_frame_t *frame, inode_t *inode, xlator_t *this,
  931                        unsigned char *readable, int *event_p, int type);
  932 int
  933 afr_inode_read_subvol_get(inode_t *inode, xlator_t *this,
  934                           unsigned char *data_subvols,
  935                           unsigned char *metadata_subvols,
  936                           int *event_generation);
  /* NOTE(review): the __-prefixed variants presumably expect the caller to
   * already hold the relevant lock - TODO confirm. */
  937 int
  938 __afr_inode_read_subvol_get(inode_t *inode, xlator_t *this,
  939                             unsigned char *data_subvols,
  940                             unsigned char *metadata_subvols,
  941                             int *event_generation);
  942 
  /* NOTE(review): the parameter is spelled metadata_subvol here but
   * metadata_subvols in the three sibling prototypes. */
  943 int
  944 __afr_inode_read_subvol_set(inode_t *inode, xlator_t *this,
  945                             unsigned char *data_subvols,
  946                             unsigned char *metadata_subvol,
  947                             int event_generation);
  948 int
  949 afr_inode_read_subvol_set(inode_t *inode, xlator_t *this,
  950                           unsigned char *data_subvols,
  951                           unsigned char *metadata_subvols,
  952                           int event_generation);
  953 
  954 int
  955 afr_inode_event_gen_reset(inode_t *inode, xlator_t *this);
  956 
  957 int
  958 afr_read_subvol_select_by_policy(inode_t *inode, xlator_t *this,
  959                                  unsigned char *readable,
  960                                  afr_read_subvol_args_t *args);
  961 
  962 int
  963 afr_inode_read_subvol_type_get(inode_t *inode, xlator_t *this,
  964                                unsigned char *readable, int *event_p, int type);
  /* Takes the transaction type explicitly; the afr_data_subvol_get() and
   * afr_metadata_subvol_get() macros in this header call it with a fixed
   * type. */
  965 int
  966 afr_read_subvol_get(inode_t *inode, xlator_t *this, int *subvol_p,
  967                     unsigned char *readables, int *event_p,
  968                     afr_transaction_type type, afr_read_subvol_args_t *args);
  969 
  /* Shorthand for afr_read_subvol_get() with the transaction type fixed to
   * AFR_DATA_TRANSACTION; all other arguments are forwarded unchanged. */
  #define afr_data_subvol_get(inode, this, subvol_p, readables, event_p, args)  \
      afr_read_subvol_get(inode, this, subvol_p, readables, event_p,            \
                          AFR_DATA_TRANSACTION, args)
  972 
  /* Shorthand for afr_read_subvol_get() with the transaction type fixed to
   * AFR_METADATA_TRANSACTION; all other arguments are forwarded unchanged. */
  #define afr_metadata_subvol_get(inode, this, subvol_p, readables, event_p,    \
                                  args)                                         \
      afr_read_subvol_get(inode, this, subvol_p, readables, event_p,            \
                          AFR_METADATA_TRANSACTION, args)
  975 
  /* Inode refresh entry point; the result is delivered through @cbk
   * (afr_inode_refresh_cbk_t, declared outside this section). */
  976 int
  977 afr_inode_refresh(call_frame_t *frame, xlator_t *this, inode_t *inode,
  978                   uuid_t gfid, afr_inode_refresh_cbk_t cbk);
  979 
  /* NOTE(review): presumably the xlator notify handler - TODO confirm. */
  980 int32_t
  981 afr_notify(xlator_t *this, int32_t event, void *data, void *data2);
  982 
  /* NOTE(review): the (dict, key, value, data) shape looks like a dict
   * iteration callback - TODO confirm. Here @this is a dict_t, not an
   * xlator_t as in the neighbouring prototypes. */
  983 int
  984 xattr_is_equal(dict_t *this, char *key1, data_t *value1, void *data);
  985 
  /* Internal-lock helpers. Declarations only; behaviour is defined outside
   * this header. */

  /* NOTE(review): the parameter name `basename` is also the name of the libc
   * function called by AFR_BASENAME() later in this header; harmless in a
   * prototype, but worth keeping in mind in the definition. */
  986 int
  987 afr_add_entry_lockee(afr_local_t *local, loc_t *loc, char *basename,
  988                      int child_count);
  989 
  990 int
  991 afr_add_inode_lockee(afr_local_t *local, int child_count);
  992 
  993 void
  994 afr_lockees_cleanup(afr_internal_lock_t *int_lock);
  995 
  996 int
  997 afr_attempt_lock_recovery(xlator_t *this, int32_t child_index);
  998 
  999 int
 1000 afr_mark_locked_nodes(xlator_t *this, fd_t *fd, unsigned char *locked_nodes);
 1001 
 1002 void
 1003 afr_set_lk_owner(call_frame_t *frame, xlator_t *this, void *lk_owner);
 1004 
 1005 int
 1006 afr_set_lock_number(call_frame_t *frame, xlator_t *this);
 1007 
 1008 int32_t
 1009 afr_unlock(call_frame_t *frame, xlator_t *this);
 1010 
 1011 int
 1012 afr_lock_nonblocking(call_frame_t *frame, xlator_t *this);
 1013 
 1014 int
 1015 afr_blocking_lock(call_frame_t *frame, xlator_t *this);
 1016 
 1017 int
 1018 afr_internal_lock_finish(call_frame_t *frame, xlator_t *this);
 1019 
 /* fd-context, reply and local-cleanup helpers. Declarations only. */

 /* NOTE(review): the __ prefix presumably means the caller must already hold
  * the fd lock - TODO confirm. */
 1020 int
 1021 __afr_fd_ctx_set(xlator_t *this, fd_t *fd);
 1022 
 /* This prototype is declared a second time further down in this header. */
 1023 afr_fd_ctx_t *
 1024 afr_fd_ctx_get(fd_t *fd, xlator_t *this);
 1025 
 1026 int
 1027 afr_build_parent_loc(loc_t *parent, loc_t *child, int32_t *op_errno);
 1028 
 1029 int
 1030 afr_locked_nodes_count(unsigned char *locked_nodes, int child_count);
 1031 
 1032 int
 1033 afr_replies_interpret(call_frame_t *frame, xlator_t *this, inode_t *inode,
 1034                       gf_boolean_t *start_heal);
 1035 
 1036 void
 1037 afr_local_replies_wipe(afr_local_t *local, afr_private_t *priv);
 1038 
 /* Called by the AFR_STACK_* / AFR_FRAME_INIT macros in this header before
  * the local is handed to mem_put(). */
 1039 void
 1040 afr_local_cleanup(afr_local_t *local, xlator_t *this);
 1041 
 1042 int
 1043 afr_frame_return(call_frame_t *frame);
 1044 
 1045 int
 1046 afr_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
 1047          fd_t *fd, dict_t *xdata);
 1048 
 1049 void
 1050 afr_local_transaction_cleanup(afr_local_t *local, xlator_t *this);
 1051 
 1052 int
 1053 afr_cleanup_fd_ctx(xlator_t *this, fd_t *fd);
 1054 
 /*
  * Unwind @frame for @fop with (@op_ret, @op_errno, @params...) and release
  * the AFR local attached to the frame.
  *
  * Sequence, when @frame is non-NULL:
  *   1. afr_handle_inconsistent_fop() receives pointers to the result pair
  *      and so may adjust it before it is sent upwards;
  *   2. for a read transaction (local->is_read_txn) the pending-read counter
  *      of local->read_subvol is decremented;
  *   3. frame->local is detached before the unwind;
  *   4. after STACK_UNWIND_STRICT the detached local is cleaned up and
  *      returned with mem_put().
  * A NULL @frame skips steps 1-3 but is still handed to STACK_UNWIND_STRICT.
  */
 #define AFR_STACK_UNWIND(fop, frame, op_ret, op_errno, params...)             \
     do {                                                                      \
         afr_local_t *__local = NULL;                                          \
         xlator_t *__this = NULL;                                              \
         int32_t __op_ret = op_ret;                                            \
         int32_t __op_errno = op_errno;                                        \
                                                                               \
         if (frame) {                                                          \
             __local = frame->local;                                           \
             __this = frame->this;                                             \
             afr_handle_inconsistent_fop(frame, &__op_ret, &__op_errno);       \
             if (__local != NULL && __local->is_read_txn) {                    \
                 afr_pending_read_decrement(__this->private,                   \
                                            __local->read_subvol);             \
             }                                                                 \
             frame->local = NULL;                                              \
         }                                                                     \
                                                                               \
         STACK_UNWIND_STRICT(fop, frame, __op_ret, __op_errno, params);        \
         if (__local != NULL) {                                                \
             afr_local_cleanup(__local, __this);                               \
             mem_put(__local);                                                 \
         }                                                                     \
     } while (0)
 1080 
 /*
  * Destroy the call stack that @frame belongs to and release the AFR local
  * attached to the frame.
  *
  * The local and the xlator pointer are read out, and frame->local is
  * cleared, before STACK_DESTROY() runs; only afterwards is the local cleaned
  * up and returned with mem_put().
  *
  * The macro deliberately ends in `while (0)` without a trailing semicolon so
  * that `AFR_STACK_DESTROY(frame);` is exactly one statement and can be used
  * as the body of an unbraced if/else. @frame is evaluated more than once.
  */
 #define AFR_STACK_DESTROY(frame)                                              \
     do {                                                                      \
         afr_local_t *__local = (frame)->local;                                \
         xlator_t *__this = (frame)->this;                                     \
                                                                               \
         (frame)->local = NULL;                                                \
         STACK_DESTROY((frame)->root);                                         \
         if (__local) {                                                        \
             afr_local_cleanup(__local, __this);                               \
             mem_put(__local);                                                 \
         }                                                                     \
     } while (0)
 1094 
 /*
  * Allocate a zeroed AFR local from THIS->local_pool, attach it to @frame and
  * initialise it with afr_local_init().
  *
  * Evaluates to frame->local: the new local on success, NULL on failure.
  *   - If the allocation itself fails, @op_errno is set to ENOMEM and the
  *     initialiser is never called with a NULL local.
  *   - If afr_local_init() returns non-zero, the local is cleaned up, returned
  *     with mem_put() and frame->local is reset to NULL; @op_errno is whatever
  *     afr_local_init() stored through the pointer it was given.
  *
  * @op_errno must be an lvalue. @frame is evaluated more than once. This is a
  * GNU statement expression, as in the original.
  */
 #define AFR_FRAME_INIT(frame, op_errno)                                       \
     ({                                                                        \
         (frame)->local = mem_get0(THIS->local_pool);                          \
         if (!(frame)->local) {                                                \
             (op_errno) = ENOMEM;                                              \
         } else if (afr_local_init((frame)->local, (frame)->this->private,     \
                                   &(op_errno))) {                             \
             afr_local_cleanup((frame)->local, (frame)->this);                 \
             mem_put((frame)->local);                                          \
             (frame)->local = NULL;                                            \
         }                                                                     \
         (frame)->local;                                                       \
     })
 1105 
 /*
  * Reset the call stack of @frame and give the frame a fresh AFR local.
  *
  * The current local is detached from the frame, STACK_RESET() is applied to
  * frame->root, the old local (if any) is cleaned up and returned with
  * mem_put(), and finally AFR_FRAME_INIT() attaches a newly initialised local.
  * The errno reported by AFR_FRAME_INIT() lands in a scratch variable and is
  * not propagated; callers that care must inspect frame->local afterwards.
  */
 #define AFR_STACK_RESET(frame)                                                \
     do {                                                                      \
         int __opr;                                                            \
         afr_local_t *__local = frame->local;                                  \
         xlator_t *__this = frame->this;                                       \
                                                                               \
         frame->local = NULL;                                                  \
         STACK_RESET(frame->root);                                             \
         if (__local != NULL) {                                                \
             afr_local_cleanup(__local, __this);                               \
             mem_put(__local);                                                 \
         }                                                                     \
         AFR_FRAME_INIT(frame, __opr);                                         \
     } while (0)
 1121 
 /*
  * Allocate and return a string holding the basename of @str.
  *
  * basename() is allowed to modify its argument, so it is run on a private
  * copy of @str which is freed before returning.
  *
  * Returns NULL if either copy cannot be allocated. Previously a failed first
  * copy was passed straight to basename(), which for a NULL argument yields
  * "." under POSIX, so the caller received a bogus "." instead of an error.
  * The caller owns the returned string and must release it with GF_FREE().
  */
 static inline char *
 AFR_BASENAME(const char *str)
 {
     char *path_copy = NULL;
     char *base = NULL;

     path_copy = gf_strdup(str);
     if (!path_copy)
         return NULL;

     base = gf_strdup(basename(path_copy));
     GF_FREE(path_copy);
     return base;
 }
 1133 
 /* Frame/local initialisation, errno helpers and changelog-matrix helpers.
  * Declarations only; behaviour is defined outside this header. */
 1134 call_frame_t *
 1135 afr_copy_frame(call_frame_t *base);
 1136 
 1137 int
 1138 afr_transaction_local_init(afr_local_t *local, xlator_t *this);
 1139 
 1140 int32_t
 1141 afr_marker_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
 1142                     const char *name, afr_local_t *local, afr_private_t *priv);
 1143 
 /* Used by AFR_FRAME_INIT(): a non-zero return is treated there as failure
  * and the local is torn down again. */
 1144 int
 1145 afr_local_init(afr_local_t *local, afr_private_t *priv, int32_t *op_errno);
 1146 
 1147 int
 1148 afr_internal_lock_init(afr_internal_lock_t *lk, size_t child_count);
 1149 
 1150 int
 1151 afr_higher_errno(int32_t old_errno, int32_t new_errno);
 1152 
 1153 int
 1154 afr_final_errno(afr_local_t *local, afr_private_t *priv);
 1155 
 1156 int
 1157 afr_xattr_req_prepare(xlator_t *this, dict_t *xattr_req);
 1158 
 1159 void
 1160 afr_fix_open(fd_t *fd, xlator_t *this);
 1161 
 /* NOTE(review): duplicate of the afr_fd_ctx_get() prototype declared earlier
  * in this header; redundant but harmless. */
 1162 afr_fd_ctx_t *
 1163 afr_fd_ctx_get(fd_t *fd, xlator_t *this);
 1164 
 1165 void
 1166 afr_set_low_priority(call_frame_t *frame);
 1167 int
 1168 afr_child_fd_ctx_set(xlator_t *this, fd_t *fd, int32_t child, int flags);
 1169 
 /* NOTE(review): presumably frees a matrix obtained from afr_matrix_create(),
  * with @m the number of rows - TODO confirm. */
 1170 void
 1171 afr_matrix_cleanup(int32_t **pending, unsigned int m);
 1172 
 1173 int32_t **
 1174 afr_matrix_create(unsigned int m, unsigned int n);
 1175 
 /* NOTE(review): returns int ** while the matrix helpers above use
  * int32_t ** - confirm the two are meant to be interchangeable. */
 1176 int **
 1177 afr_mark_pending_changelog(afr_private_t *priv, unsigned char *pending,
 1178                            dict_t *xattr, ia_type_t iat);
 1179 
 1180 void
 1181 afr_filter_xattrs(dict_t *xattr);
 1182 
 1183 /*
 1184  * Special value indicating we should use the "auto" quorum method instead of
 1185  * a fixed value (including zero to turn off quorum enforcement).
 1186  */
 1187 #define AFR_QUORUM_AUTO INT_MAX
 1188 
 /* Unstable-write tracking, reply wiping and xattr comparison helpers.
  * Declarations only. */
 1189 int
 1190 afr_fd_report_unstable_write(xlator_t *this, afr_local_t *local);
 1191 
 /* NOTE(review): despite the "fd" in the name this takes an inode_t *. */
 1192 gf_boolean_t
 1193 afr_fd_has_witnessed_unstable_write(xlator_t *this, inode_t *inode);
 1194 
 1195 void
 1196 afr_reply_wipe(struct afr_reply *reply);
 1197 
 /* Array counterpart of afr_reply_wipe(): takes @replies plus a @count. */
 1198 void
 1199 afr_replies_wipe(struct afr_reply *replies, int count);
 1200 
 1201 gf_boolean_t
 1202 afr_xattrs_are_equal(dict_t *dict1, dict_t *dict2);
 1203 
 1204 gf_boolean_t
 1205 afr_is_xattr_ignorable(char *key);
 1206 
 /* Heal-info and split-brain related prototypes. Declarations only. */
 1207 int
 1208 afr_get_heal_info(call_frame_t *frame, xlator_t *this, loc_t *loc);
 1209 
 1210 int
 1211 afr_heal_splitbrain_file(call_frame_t *frame, xlator_t *this, loc_t *loc);
 1212 
 /* NOTE(review): the (void *opaque) / (int ret, frame, opaque) shapes of the
  * next two prototypes look like a synctask function and its completion
  * callback - TODO confirm. */
 1213 int
 1214 afr_get_split_brain_status(void *opaque);
 1215 
 1216 int
 1217 afr_get_split_brain_status_cbk(int ret, call_frame_t *frame, void *opaque);
 1218 
 1219 int
 1220 afr_inode_split_brain_choice_set(inode_t *inode, xlator_t *this,
 1221                                  int spb_choice);
 1222 int
 1223 afr_inode_split_brain_choice_get(inode_t *inode, xlator_t *this,
 1224                                  int *spb_choice);
 1225 int
 1226 afr_get_child_index_from_name(xlator_t *this, char *name);
 1227 
 /* Results are returned through the @d_spb and @m_spb out-parameters. */
 1228 int
 1229 afr_is_split_brain(call_frame_t *frame, xlator_t *this, inode_t *inode,
 1230                    uuid_t gfid, gf_boolean_t *d_spb, gf_boolean_t *m_spb);
 1231 int
 1232 afr_spb_choice_timeout_cancel(xlator_t *this, inode_t *inode);
 1233 
 1234 int
 1235 afr_set_split_brain_choice(int ret, call_frame_t *frame, void *opaque);
 1236 
 1237 gf_boolean_t
 1238 afr_get_need_heal(xlator_t *this);
 1239 
 1240 void
 1241 afr_set_need_heal(xlator_t *this, afr_local_t *local);
 1242 
 /* The opened fd is returned through the @fd out-parameter. */
 1243 int
 1244 afr_selfheal_data_open(xlator_t *this, inode_t *inode, fd_t **fd);
 1245 
 1246 int
 1247 afr_get_msg_id(char *op_type);
 1248 
 1249 int
 1250 afr_set_in_flight_sb_status(xlator_t *this, call_frame_t *frame,
 1251                             inode_t *inode);
 1252 
 1253 int32_t
 1254 afr_quorum_errno(afr_private_t *priv);
 1255 
 /* Consistency checks and write-path helpers. Declarations only. */
 1256 gf_boolean_t
 1257 afr_is_consistent_io_possible(afr_local_t *local, afr_private_t *priv,
 1258                               int32_t *op_errno);
 /* Called from AFR_STACK_UNWIND() with pointers to the result pair, so it can
  * rewrite @op_ret / @op_errno before the unwind. */
 1259 void
 1260 afr_handle_inconsistent_fop(call_frame_t *frame, int32_t *op_ret,
 1261                             int32_t *op_errno);
 1262 
 1263 void
 1264 afr_inode_write_fill(call_frame_t *frame, xlator_t *this, int child_index,
 1265                      int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
 1266                      struct iatt *postbuf, dict_t *xdata);
 1267 void
 1268 afr_process_post_writev(call_frame_t *frame, xlator_t *this);
 1269 
 1270 void
 1271 afr_writev_unwind(call_frame_t *frame, xlator_t *this);
 1272 
 1273 void
 1274 afr_writev_copy_outvars(call_frame_t *src_frame, call_frame_t *dst_frame);
 1275 
 1276 void
 1277 afr_update_uninodelk(afr_local_t *local, afr_internal_lock_t *int_lock,
 1278                      int32_t child_index);
 /* NOTE(review): __-prefixed counterpart of afr_fd_ctx_get(); presumably
  * expects the caller to hold the fd lock - TODO confirm. */
 1279 afr_fd_ctx_t *
 1280 __afr_fd_ctx_get(fd_t *fd, xlator_t *this);
 1281 
 1282 gf_boolean_t
 1283 afr_is_inode_refresh_reqd(inode_t *inode, xlator_t *this, int event_gen1,
 1284                           int event_gen2);
 1285 
 /* NOTE(review): @buf is written by the callee and @serz_len is an
  * out-parameter judging by the pointer types; the required size of @buf is
  * not visible here - confirm against the definition. */
 1286 int
 1287 afr_serialize_xattrs_with_delimiter(call_frame_t *frame, xlator_t *this,
 1288                                     char *buf, const char *default_str,
 1289                                     int32_t *serz_len, char delimiter);
 1290 gf_boolean_t
 1291 afr_is_symmetric_error(call_frame_t *frame, xlator_t *this);
 1292 
 /* Inode-context, write-subvol and thin-arbiter ("ta") prototypes.
  * Declarations only; behaviour is defined outside this header. */

 /* The context is returned through the @ctx out-parameter.
  * NOTE(review): the __ prefix presumably means the caller must already hold
  * the inode lock - TODO confirm. */
 1293 int
 1294 __afr_inode_ctx_get(xlator_t *this, inode_t *inode, afr_inode_ctx_t **ctx);
 1295 
 1296 uint64_t
 1297 afr_write_subvol_get(call_frame_t *frame, xlator_t *this);
 1298 
 1299 int
 1300 afr_write_subvol_set(call_frame_t *frame, xlator_t *this);
 1301 
 1302 int
 1303 afr_write_subvol_reset(call_frame_t *frame, xlator_t *this);
 1304 
 1305 int
 1306 afr_set_inode_local(xlator_t *this, afr_local_t *local, inode_t *inode);
 1307 
 1308 int
 1309 afr_fill_ta_loc(xlator_t *this, loc_t *loc);
 1310 
 1311 int
 1312 afr_ta_post_op_lock(xlator_t *this, loc_t *loc);
 1313 
 1314 int
 1315 afr_ta_post_op_unlock(xlator_t *this, loc_t *loc);
 1316 
 1317 gf_boolean_t
 1318 afr_is_pending_set(xlator_t *this, dict_t *xdata, int type);
 1319 
 /* NOTE(review): the __ prefix presumably means the caller must already hold
  * the lock protecting @priv - TODO confirm. */
 1320 int
 1321 __afr_get_up_children_count(afr_private_t *priv);
 1322 
 1323 call_frame_t *
 1324 afr_ta_frame_create(xlator_t *this);
 1325 
 1326 gf_boolean_t
 1327 afr_ta_has_quorum(afr_private_t *priv, afr_local_t *local);
 1328 
 1329 void
 1330 afr_ta_lock_release_synctask(xlator_t *this);
 1331 
 1332 void
 1333 afr_ta_locked_priv_invalidate(afr_private_t *priv);
 1334 
 1335 gf_boolean_t
 1336 afr_lookup_has_quorum(call_frame_t *frame, xlator_t *this,
 1337                       unsigned char *subvols);
 1338 
 1339 void
 1340 afr_mark_new_entry_changelog(call_frame_t *frame, xlator_t *this);
 1341 
 1342 void
 1343 afr_selfheal_childup(xlator_t *this, afr_private_t *priv);
 1344 #endif /* __AFR_H__ */