"Fossies" - the Fresh Open Source Software Archive

Member "btrfs-progs-v5.4.1/volumes.c" (9 Jan 2020, 72995 Bytes) of package /linux/misc/btrfs-progs-v5.4.1.tar.xz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style: standard) with prefixed line numbers and code folding option. Alternatively you can here view or download the uninterpreted source code file. For more information about "volumes.c" see the Fossies "Dox" file reference documentation and the latest Fossies "Diffs" side-by-side code changes report: v5.4_vs_v5.4.1.

/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <uuid/uuid.h>
#include <fcntl.h>
#include <unistd.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "volumes.h"
#include "common/utils.h"
#include "kernel-lib/raid56.h"

const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
    [BTRFS_RAID_RAID10] = {
        .sub_stripes    = 2,
        .dev_stripes    = 1,
        .devs_max   = 0,    /* 0 == as many as possible */
        .devs_min   = 4,
        .tolerated_failures = 1,
        .devs_increment = 2,
        .ncopies    = 2,
        .nparity        = 0,
        .raid_name  = "raid10",
        .bg_flag    = BTRFS_BLOCK_GROUP_RAID10,
        .mindev_error   = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
    },
    [BTRFS_RAID_RAID1] = {
        .sub_stripes    = 1,
        .dev_stripes    = 1,
        .devs_max   = 2,
        .devs_min   = 2,
        .tolerated_failures = 1,
        .devs_increment = 2,
        .ncopies    = 2,
        .nparity        = 0,
        .raid_name  = "raid1",
        .bg_flag    = BTRFS_BLOCK_GROUP_RAID1,
        .mindev_error   = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
    },
    [BTRFS_RAID_RAID1C3] = {
        .sub_stripes    = 1,
        .dev_stripes    = 1,
        .devs_max   = 3,
        .devs_min   = 3,
        .tolerated_failures = 2,
        .devs_increment = 3,
        .ncopies    = 3,
    },
    [BTRFS_RAID_RAID1C4] = {
        .sub_stripes    = 1,
        .dev_stripes    = 1,
        .devs_max   = 4,
        .devs_min   = 4,
        .tolerated_failures = 3,
        .devs_increment = 4,
        .ncopies    = 4,
    },
    [BTRFS_RAID_DUP] = {
        .sub_stripes    = 1,
        .dev_stripes    = 2,
        .devs_max   = 1,
        .devs_min   = 1,
        .tolerated_failures = 0,
        .devs_increment = 1,
        .ncopies    = 2,
        .nparity        = 0,
        .raid_name  = "dup",
        .bg_flag    = BTRFS_BLOCK_GROUP_DUP,
        .mindev_error   = 0,
    },
    [BTRFS_RAID_RAID0] = {
        .sub_stripes    = 1,
        .dev_stripes    = 1,
        .devs_max   = 0,
        .devs_min   = 2,
        .tolerated_failures = 0,
        .devs_increment = 1,
        .ncopies    = 1,
        .nparity        = 0,
        .raid_name  = "raid0",
        .bg_flag    = BTRFS_BLOCK_GROUP_RAID0,
        .mindev_error   = 0,
    },
    [BTRFS_RAID_SINGLE] = {
        .sub_stripes    = 1,
        .dev_stripes    = 1,
        .devs_max   = 1,
        .devs_min   = 1,
        .tolerated_failures = 0,
        .devs_increment = 1,
        .ncopies    = 1,
        .nparity        = 0,
        .raid_name  = "single",
        .bg_flag    = 0,
        .mindev_error   = 0,
    },
    [BTRFS_RAID_RAID5] = {
        .sub_stripes    = 1,
        .dev_stripes    = 1,
        .devs_max   = 0,
        .devs_min   = 2,
        .tolerated_failures = 1,
        .devs_increment = 1,
        .ncopies    = 1,
        .nparity        = 1,
        .raid_name  = "raid5",
        .bg_flag    = BTRFS_BLOCK_GROUP_RAID5,
        .mindev_error   = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
    },
    [BTRFS_RAID_RAID6] = {
        .sub_stripes    = 1,
        .dev_stripes    = 1,
        .devs_max   = 0,
        .devs_min   = 3,
        .tolerated_failures = 2,
        .devs_increment = 1,
        .ncopies    = 1,
        .nparity        = 2,
        .raid_name  = "raid6",
        .bg_flag    = BTRFS_BLOCK_GROUP_RAID6,
        .mindev_error   = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
    },
};
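
/*
 * A minimal usage sketch (hypothetical helper, assuming the
 * btrfs_raid_types enum from volumes.h): the table is indexed by the
 * BTRFS_RAID_* values, so a profile's attributes can be read directly,
 * e.g. how many missing devices it tolerates.
 */
static inline int raid_attr_tolerated_failures(enum btrfs_raid_types type)
{
    return btrfs_raid_array[type].tolerated_failures;
}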

struct stripe {
    struct btrfs_device *dev;
    u64 physical;
};

static inline int nr_parity_stripes(struct map_lookup *map)
{
    if (map->type & BTRFS_BLOCK_GROUP_RAID5)
        return 1;
    else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
        return 2;
    else
        return 0;
}

static inline int nr_data_stripes(struct map_lookup *map)
{
    return map->num_stripes - nr_parity_stripes(map);
}

#define is_parity_stripe(x) ( ((x) == BTRFS_RAID5_P_STRIPE) || ((x) == BTRFS_RAID6_Q_STRIPE) )
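
/*
 * Worked example: a RAID6 chunk striped across 6 devices has
 * nr_parity_stripes() == 2 (the P and Q stripes), so nr_data_stripes()
 * == 4 and a full stripe carries 4 * stripe_len bytes of data.
 */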

static LIST_HEAD(fs_uuids);

/*
 * Find a device specified by @devid or @uuid in the list of @fs_devices, or
 * return NULL.
 *
 * If devid and uuid are both specified, the match must be exact, otherwise
 * only devid is used.
 */
static struct btrfs_device *find_device(struct btrfs_fs_devices *fs_devices,
        u64 devid, u8 *uuid)
{
    struct list_head *head = &fs_devices->devices;
    struct btrfs_device *dev;

    list_for_each_entry(dev, head, dev_list) {
        if (dev->devid == devid &&
            (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
            return dev;
        }
    }
    return NULL;
}

static struct btrfs_fs_devices *find_fsid(u8 *fsid, u8 *metadata_uuid)
{
    struct btrfs_fs_devices *fs_devices;

    list_for_each_entry(fs_devices, &fs_uuids, list) {
        if (metadata_uuid && (memcmp(fsid, fs_devices->fsid,
                         BTRFS_FSID_SIZE) == 0) &&
            (memcmp(metadata_uuid, fs_devices->metadata_uuid,
                BTRFS_FSID_SIZE) == 0)) {
            return fs_devices;
        } else if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0) {
            return fs_devices;
        }
    }
    return NULL;
}

static int device_list_add(const char *path,
               struct btrfs_super_block *disk_super,
               u64 devid, struct btrfs_fs_devices **fs_devices_ret)
{
    struct btrfs_device *device;
    struct btrfs_fs_devices *fs_devices;
    u64 found_transid = btrfs_super_generation(disk_super);
    bool metadata_uuid = (btrfs_super_incompat_flags(disk_super) &
        BTRFS_FEATURE_INCOMPAT_METADATA_UUID);

    if (metadata_uuid)
        fs_devices = find_fsid(disk_super->fsid,
                       disk_super->metadata_uuid);
    else
        fs_devices = find_fsid(disk_super->fsid, NULL);

    if (!fs_devices) {
        fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
        if (!fs_devices)
            return -ENOMEM;
        INIT_LIST_HEAD(&fs_devices->devices);
        list_add(&fs_devices->list, &fs_uuids);
        memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
        if (metadata_uuid)
            memcpy(fs_devices->metadata_uuid,
                   disk_super->metadata_uuid, BTRFS_FSID_SIZE);
        else
            memcpy(fs_devices->metadata_uuid, fs_devices->fsid,
                   BTRFS_FSID_SIZE);

        fs_devices->latest_devid = devid;
        fs_devices->latest_trans = found_transid;
        fs_devices->lowest_devid = (u64)-1;
        device = NULL;
    } else {
        device = find_device(fs_devices, devid,
                       disk_super->dev_item.uuid);
    }
    if (!device) {
        device = kzalloc(sizeof(*device), GFP_NOFS);
        if (!device) {
            /* we can safely leave the fs_devices entry around */
            return -ENOMEM;
        }
        device->fd = -1;
        device->devid = devid;
        device->generation = found_transid;
        memcpy(device->uuid, disk_super->dev_item.uuid,
               BTRFS_UUID_SIZE);
        device->name = kstrdup(path, GFP_NOFS);
        if (!device->name) {
            kfree(device);
            return -ENOMEM;
        }
        device->label = kstrdup(disk_super->label, GFP_NOFS);
        if (!device->label) {
            kfree(device->name);
            kfree(device);
            return -ENOMEM;
        }
        device->total_devs = btrfs_super_num_devices(disk_super);
        device->super_bytes_used = btrfs_super_bytes_used(disk_super);
        device->total_bytes =
            btrfs_stack_device_total_bytes(&disk_super->dev_item);
        device->bytes_used =
            btrfs_stack_device_bytes_used(&disk_super->dev_item);
        list_add(&device->dev_list, &fs_devices->devices);
        device->fs_devices = fs_devices;
    } else if (!device->name || strcmp(device->name, path)) {
        char *name;

        /*
         * The existing device has newer generation, so this one could
         * be a stale one, don't add it.
         */
        if (found_transid < device->generation) {
            warning(
    "adding device %s gen %llu but found an existing device %s gen %llu",
                path, found_transid, device->name,
                device->generation);
            return -EEXIST;
        }

        name = strdup(path);
        if (!name)
            return -ENOMEM;
        kfree(device->name);
        device->name = name;
    }

    if (found_transid > fs_devices->latest_trans) {
        fs_devices->latest_devid = devid;
        fs_devices->latest_trans = found_transid;
    }
    if (fs_devices->lowest_devid > devid) {
        fs_devices->lowest_devid = devid;
    }
    *fs_devices_ret = fs_devices;
    return 0;
}

int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
    struct btrfs_fs_devices *seed_devices;
    struct btrfs_device *device;
    int ret = 0;

again:
    if (!fs_devices)
        return 0;
    while (!list_empty(&fs_devices->devices)) {
        device = list_entry(fs_devices->devices.next,
                    struct btrfs_device, dev_list);
        if (device->fd != -1) {
            if (device->writeable && fsync(device->fd) == -1) {
                warning("fsync on device %llu failed: %m",
                    device->devid);
                ret = -errno;
            }
            if (posix_fadvise(device->fd, 0, 0, POSIX_FADV_DONTNEED))
                fprintf(stderr, "Warning, could not drop caches\n");
            close(device->fd);
            device->fd = -1;
        }
        device->writeable = 0;
        list_del(&device->dev_list);
        /* free the memory */
        free(device->name);
        free(device->label);
        free(device);
    }

    seed_devices = fs_devices->seed;
    fs_devices->seed = NULL;
    if (seed_devices) {
        struct btrfs_fs_devices *orig;

        orig = fs_devices;
        fs_devices = seed_devices;
        list_del(&orig->list);
        free(orig);
        goto again;
    } else {
        list_del(&fs_devices->list);
        free(fs_devices);
    }

    return ret;
}

void btrfs_close_all_devices(void)
{
    struct btrfs_fs_devices *fs_devices;

    while (!list_empty(&fs_uuids)) {
        fs_devices = list_entry(fs_uuids.next, struct btrfs_fs_devices,
                    list);
        btrfs_close_devices(fs_devices);
    }
}

int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, int flags)
{
    int fd;
    struct btrfs_device *device;
    int ret;

    list_for_each_entry(device, &fs_devices->devices, dev_list) {
        if (!device->name) {
            printk("no name for device %llu, skip it now\n", device->devid);
            continue;
        }

        fd = open(device->name, flags);
        if (fd < 0) {
            ret = -errno;
            error("cannot open device '%s': %m", device->name);
            goto fail;
        }

        if (posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED))
            fprintf(stderr, "Warning, could not drop caches\n");

        if (device->devid == fs_devices->latest_devid)
            fs_devices->latest_bdev = fd;
        if (device->devid == fs_devices->lowest_devid)
            fs_devices->lowest_bdev = fd;
        device->fd = fd;
        if (flags & O_RDWR)
            device->writeable = 1;
    }
    return 0;
fail:
    btrfs_close_devices(fs_devices);
    return ret;
}

int btrfs_scan_one_device(int fd, const char *path,
              struct btrfs_fs_devices **fs_devices_ret,
              u64 *total_devs, u64 super_offset, unsigned sbflags)
{
    struct btrfs_super_block *disk_super;
    char buf[BTRFS_SUPER_INFO_SIZE];
    int ret;
    u64 devid;

    disk_super = (struct btrfs_super_block *)buf;
    ret = btrfs_read_dev_super(fd, disk_super, super_offset, sbflags);
    if (ret < 0)
        return -EIO;
    devid = btrfs_stack_device_id(&disk_super->dev_item);
    if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_METADUMP)
        *total_devs = 1;
    else
        *total_devs = btrfs_super_num_devices(disk_super);

    ret = device_list_add(path, disk_super, devid, fs_devices_ret);

    return ret;
}
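
/*
 * Usage sketch (illustrative only; assumes BTRFS_SUPER_INFO_OFFSET and
 * SBREAD_DEFAULT from disk-io.h): register a single device with the
 * global fs_uuids list by reading its primary superblock.
 */
static int example_scan_device(const char *path)
{
    struct btrfs_fs_devices *fs_devices;
    u64 total_devs;
    int fd;
    int ret;

    fd = open(path, O_RDONLY);
    if (fd < 0)
        return -errno;
    ret = btrfs_scan_one_device(fd, path, &fs_devices, &total_devs,
                    BTRFS_SUPER_INFO_OFFSET, SBREAD_DEFAULT);
    close(fd);
    return ret;
}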

/*
 * find_free_dev_extent_start - find free space in the specified device
 * @device:       the device in which we search for the free space
 * @num_bytes:    the size of the free space that we need
 * @search_start: the position from which to begin the search
 * @start:        stores the start of the free space we find; if no
 *                suitable free space is found, it stores the start of
 *                the largest hole instead
 * @len:          stores the size of the free space we find; if no
 *                suitable free space is found, it stores the size of
 *                the largest hole instead
 *
 * This uses a pretty simple search; the expectation is that it is
 * called very infrequently and that a given device has a small number
 * of extents.
 */
static int find_free_dev_extent_start(struct btrfs_device *device,
                      u64 num_bytes, u64 search_start,
                      u64 *start, u64 *len)
{
    struct btrfs_key key;
    struct btrfs_root *root = device->dev_root;
    struct btrfs_dev_extent *dev_extent;
    struct btrfs_path *path;
    u64 hole_size;
    u64 max_hole_start;
    u64 max_hole_size;
    u64 extent_end;
    u64 search_end = device->total_bytes;
    int ret;
    int slot;
    struct extent_buffer *l;
    u64 min_search_start;

    /*
     * We don't want to overwrite the superblock on the drive nor any area
     * used by the boot loader (grub for example), so we make sure to start
     * at an offset of at least 1MB.
     */
    min_search_start = max(root->fs_info->alloc_start, (u64)SZ_1M);
    search_start = max(search_start, min_search_start);

    path = btrfs_alloc_path();
    if (!path)
        return -ENOMEM;

    max_hole_start = search_start;
    max_hole_size = 0;

    if (search_start >= search_end) {
        ret = -ENOSPC;
        goto out;
    }

    path->reada = READA_FORWARD;

    key.objectid = device->devid;
    key.offset = search_start;
    key.type = BTRFS_DEV_EXTENT_KEY;

    ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
    if (ret < 0)
        goto out;
    if (ret > 0) {
        ret = btrfs_previous_item(root, path, key.objectid, key.type);
        if (ret < 0)
            goto out;
    }

    while (1) {
        l = path->nodes[0];
        slot = path->slots[0];
        if (slot >= btrfs_header_nritems(l)) {
            ret = btrfs_next_leaf(root, path);
            if (ret == 0)
                continue;
            if (ret < 0)
                goto out;

            break;
        }
        btrfs_item_key_to_cpu(l, &key, slot);

        if (key.objectid < device->devid)
            goto next;

        if (key.objectid > device->devid)
            break;

        if (key.type != BTRFS_DEV_EXTENT_KEY)
            goto next;

        if (key.offset > search_start) {
            hole_size = key.offset - search_start;

            /*
             * Have to check before we set max_hole_start, otherwise
             * we could end up sending back this offset anyway.
             */
            if (hole_size > max_hole_size) {
                max_hole_start = search_start;
                max_hole_size = hole_size;
            }

            /*
             * If this free space is greater than what we need, it
             * must be the max free space that we have found until
             * now, so max_hole_start points to the start of this
             * free space and max_hole_size holds its length. Thus,
             * we return max_hole_start and max_hole_size and go
             * back to the caller.
             */
            if (hole_size >= num_bytes) {
                ret = 0;
                goto out;
            }
        }

        dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
        extent_end = key.offset + btrfs_dev_extent_length(l,
                                  dev_extent);
        if (extent_end > search_start)
            search_start = extent_end;
next:
        path->slots[0]++;
        cond_resched();
    }

    /*
     * At this point, search_start should be the end of
     * allocated dev extents, and when shrinking the device,
     * search_end may be smaller than search_start.
     */
    if (search_end > search_start) {
        hole_size = search_end - search_start;

        if (hole_size > max_hole_size) {
            max_hole_start = search_start;
            max_hole_size = hole_size;
        }
    }

    /* See above. */
    if (max_hole_size < num_bytes)
        ret = -ENOSPC;
    else
        ret = 0;

out:
    btrfs_free_path(path);
    *start = max_hole_start;
    if (len)
        *len = max_hole_size;
    return ret;
}

static int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
                u64 *start, u64 *len)
{
    /* FIXME use last free of some kind */
    return find_free_dev_extent_start(device, num_bytes, 0, start, len);
}

/*
 * Insert one device extent into the fs.
 */
int btrfs_insert_dev_extent(struct btrfs_trans_handle *trans,
                struct btrfs_device *device,
                u64 chunk_offset, u64 num_bytes, u64 start)
{
    int ret;
    struct btrfs_path *path;
    struct btrfs_root *root = device->dev_root;
    struct btrfs_dev_extent *extent;
    struct extent_buffer *leaf;
    struct btrfs_key key;

    path = btrfs_alloc_path();
    if (!path)
        return -ENOMEM;

    key.objectid = device->devid;
    key.offset = start;
    key.type = BTRFS_DEV_EXTENT_KEY;
    ret = btrfs_insert_empty_item(trans, root, path, &key,
                      sizeof(*extent));
    if (ret < 0)
        goto err;

    leaf = path->nodes[0];
    extent = btrfs_item_ptr(leaf, path->slots[0],
                struct btrfs_dev_extent);
    btrfs_set_dev_extent_chunk_tree(leaf, extent, BTRFS_CHUNK_TREE_OBJECTID);
    btrfs_set_dev_extent_chunk_objectid(leaf, extent,
                        BTRFS_FIRST_CHUNK_TREE_OBJECTID);
    btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);

    write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid,
            (unsigned long)btrfs_dev_extent_chunk_tree_uuid(extent),
            BTRFS_UUID_SIZE);

    btrfs_set_dev_extent_length(leaf, extent, num_bytes);
    btrfs_mark_buffer_dirty(leaf);
err:
    btrfs_free_path(path);
    return ret;
}

/*
 * Allocate one free dev extent and insert it into the fs.
 */
static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
                  struct btrfs_device *device,
                  u64 chunk_offset, u64 num_bytes, u64 *start)
{
    int ret;

    ret = find_free_dev_extent(device, num_bytes, start, NULL);
    if (ret)
        return ret;
    return btrfs_insert_dev_extent(trans, device, chunk_offset, num_bytes,
                    *start);
}

static int find_next_chunk(struct btrfs_fs_info *fs_info, u64 *offset)
{
    struct btrfs_root *root = fs_info->chunk_root;
    struct btrfs_path *path;
    int ret;
    struct btrfs_key key;
    struct btrfs_chunk *chunk;
    struct btrfs_key found_key;

    path = btrfs_alloc_path();
    if (!path)
        return -ENOMEM;

    key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
    key.offset = (u64)-1;
    key.type = BTRFS_CHUNK_ITEM_KEY;

    ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
    if (ret < 0)
        goto error;

    BUG_ON(ret == 0);

    ret = btrfs_previous_item(root, path, 0, BTRFS_CHUNK_ITEM_KEY);
    if (ret) {
        *offset = 0;
    } else {
        btrfs_item_key_to_cpu(path->nodes[0], &found_key,
                      path->slots[0]);
        if (found_key.objectid != BTRFS_FIRST_CHUNK_TREE_OBJECTID)
            *offset = 0;
        else {
            chunk = btrfs_item_ptr(path->nodes[0], path->slots[0],
                           struct btrfs_chunk);
            *offset = found_key.offset +
                btrfs_chunk_length(path->nodes[0], chunk);
        }
    }
    ret = 0;
error:
    btrfs_free_path(path);
    return ret;
}

static int find_next_devid(struct btrfs_root *root, struct btrfs_path *path,
               u64 *objectid)
{
    int ret;
    struct btrfs_key key;
    struct btrfs_key found_key;

    key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
    key.type = BTRFS_DEV_ITEM_KEY;
    key.offset = (u64)-1;

    ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
    if (ret < 0)
        goto error;

    BUG_ON(ret == 0);

    ret = btrfs_previous_item(root, path, BTRFS_DEV_ITEMS_OBJECTID,
                  BTRFS_DEV_ITEM_KEY);
    if (ret) {
        *objectid = 1;
    } else {
        btrfs_item_key_to_cpu(path->nodes[0], &found_key,
                      path->slots[0]);
        *objectid = found_key.offset + 1;
    }
    ret = 0;
error:
    btrfs_release_path(path);
    return ret;
}

/*
 * The device information is stored in the chunk root.
 * The btrfs_device struct should be fully filled in.
 */
int btrfs_add_device(struct btrfs_trans_handle *trans,
             struct btrfs_fs_info *fs_info,
             struct btrfs_device *device)
{
    int ret;
    struct btrfs_path *path;
    struct btrfs_dev_item *dev_item;
    struct extent_buffer *leaf;
    struct btrfs_key key;
    struct btrfs_root *root = fs_info->chunk_root;
    unsigned long ptr;
    u64 free_devid = 0;

    path = btrfs_alloc_path();
    if (!path)
        return -ENOMEM;

    ret = find_next_devid(root, path, &free_devid);
    if (ret)
        goto out;

    key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
    key.type = BTRFS_DEV_ITEM_KEY;
    key.offset = free_devid;

    ret = btrfs_insert_empty_item(trans, root, path, &key,
                      sizeof(*dev_item));
    if (ret)
        goto out;

    leaf = path->nodes[0];
    dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);

    device->devid = free_devid;
    btrfs_set_device_id(leaf, dev_item, device->devid);
    btrfs_set_device_generation(leaf, dev_item, 0);
    btrfs_set_device_type(leaf, dev_item, device->type);
    btrfs_set_device_io_align(leaf, dev_item, device->io_align);
    btrfs_set_device_io_width(leaf, dev_item, device->io_width);
    btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
    btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes);
    btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
    btrfs_set_device_group(leaf, dev_item, 0);
    btrfs_set_device_seek_speed(leaf, dev_item, 0);
    btrfs_set_device_bandwidth(leaf, dev_item, 0);
    btrfs_set_device_start_offset(leaf, dev_item, 0);

    ptr = (unsigned long)btrfs_device_uuid(dev_item);
    write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
    ptr = (unsigned long)btrfs_device_fsid(dev_item);
    write_extent_buffer(leaf, fs_info->fs_devices->metadata_uuid, ptr,
                BTRFS_UUID_SIZE);
    btrfs_mark_buffer_dirty(leaf);
    fs_info->fs_devices->total_rw_bytes += device->total_bytes;
    ret = 0;

out:
    btrfs_free_path(path);
    return ret;
}

int btrfs_update_device(struct btrfs_trans_handle *trans,
            struct btrfs_device *device)
{
    int ret;
    struct btrfs_path *path;
    struct btrfs_root *root;
    struct btrfs_dev_item *dev_item;
    struct extent_buffer *leaf;
    struct btrfs_key key;

    root = device->dev_root->fs_info->chunk_root;

    path = btrfs_alloc_path();
    if (!path)
        return -ENOMEM;

    key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
    key.type = BTRFS_DEV_ITEM_KEY;
    key.offset = device->devid;

    ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
    if (ret < 0)
        goto out;

    if (ret > 0) {
        ret = -ENOENT;
        goto out;
    }

    leaf = path->nodes[0];
    dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);

    btrfs_set_device_id(leaf, dev_item, device->devid);
    btrfs_set_device_type(leaf, dev_item, device->type);
    btrfs_set_device_io_align(leaf, dev_item, device->io_align);
    btrfs_set_device_io_width(leaf, dev_item, device->io_width);
    btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
    btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes);
    btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
    btrfs_mark_buffer_dirty(leaf);

out:
    btrfs_free_path(path);
    return ret;
}

int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
               struct btrfs_chunk *chunk, int item_size)
{
    struct btrfs_super_block *super_copy = fs_info->super_copy;
    struct btrfs_disk_key disk_key;
    u32 array_size;
    u8 *ptr;

    array_size = btrfs_super_sys_array_size(super_copy);
    if (array_size + item_size + sizeof(disk_key)
            > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)
        return -EFBIG;

    ptr = super_copy->sys_chunk_array + array_size;
    btrfs_cpu_key_to_disk(&disk_key, key);
    memcpy(ptr, &disk_key, sizeof(disk_key));
    ptr += sizeof(disk_key);
    memcpy(ptr, chunk, item_size);
    item_size += sizeof(disk_key);
    btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
    return 0;
}
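
/*
 * The resulting sys_chunk_array is a packed sequence of
 * (struct btrfs_disk_key, struct btrfs_chunk incl. its stripes) pairs:
 *
 *   [disk_key 0][chunk 0][disk_key 1][chunk 1]...
 *
 * btrfs_super_sys_array_size() tracks the number of bytes in use, capped
 * at BTRFS_SYSTEM_CHUNK_ARRAY_SIZE.
 */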

static u64 chunk_bytes_by_type(u64 type, u64 calc_size, int num_stripes,
                   int sub_stripes)
{
    if (type & (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_DUP))
        return calc_size;
    else if (type & (BTRFS_BLOCK_GROUP_RAID1C3 | BTRFS_BLOCK_GROUP_RAID1C4))
        return calc_size;
    else if (type & BTRFS_BLOCK_GROUP_RAID10)
        return calc_size * (num_stripes / sub_stripes);
    else if (type & BTRFS_BLOCK_GROUP_RAID5)
        return calc_size * (num_stripes - 1);
    else if (type & BTRFS_BLOCK_GROUP_RAID6)
        return calc_size * (num_stripes - 2);
    else
        return calc_size * num_stripes;
}
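
/*
 * Worked example: with calc_size = 1G per stripe, a 4-device RAID10 chunk
 * (sub_stripes == 2) maps 1G * (4 / 2) = 2G of logical space and a
 * 4-device RAID6 chunk maps 1G * (4 - 2) = 2G, while the mirrored
 * profiles (RAID1*, DUP) always map exactly calc_size.
 */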

static u32 find_raid56_stripe_len(u32 data_devices, u32 dev_stripe_target)
{
    /* TODO, add a way to store the preferred stripe size */
    return BTRFS_STRIPE_LEN;
}

/*
 * btrfs_device_avail_bytes - count bytes available for alloc_chunk
 *
 * It is not equal to "device->total_bytes - device->bytes_used".
 * We do not allocate any chunk in the first 1M of the device, and we are
 * not allowed to allocate any chunk before alloc_start if it is specified.
 * So search for holes from max(1M, alloc_start) to device->total_bytes.
 */
static int btrfs_device_avail_bytes(struct btrfs_trans_handle *trans,
                    struct btrfs_device *device,
                    u64 *avail_bytes)
{
    struct btrfs_path *path;
    struct btrfs_root *root = device->dev_root;
    struct btrfs_key key;
    struct btrfs_dev_extent *dev_extent = NULL;
    struct extent_buffer *l;
    u64 search_start = root->fs_info->alloc_start;
    u64 search_end = device->total_bytes;
    u64 extent_end = 0;
    u64 free_bytes = 0;
    int ret;
    int slot = 0;

    search_start = max(BTRFS_BLOCK_RESERVED_1M_FOR_SUPER, search_start);

    path = btrfs_alloc_path();
    if (!path)
        return -ENOMEM;

    key.objectid = device->devid;
    key.offset = root->fs_info->alloc_start;
    key.type = BTRFS_DEV_EXTENT_KEY;

    path->reada = READA_FORWARD;
    ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
    if (ret < 0)
        goto error;
    ret = btrfs_previous_item(root, path, 0, key.type);
    if (ret < 0)
        goto error;

    while (1) {
        l = path->nodes[0];
        slot = path->slots[0];
        if (slot >= btrfs_header_nritems(l)) {
            ret = btrfs_next_leaf(root, path);
            if (ret == 0)
                continue;
            if (ret < 0)
                goto error;
            break;
        }
        btrfs_item_key_to_cpu(l, &key, slot);

        if (key.objectid < device->devid)
            goto next;
        if (key.objectid > device->devid)
            break;
        if (key.type != BTRFS_DEV_EXTENT_KEY)
            goto next;
        if (key.offset > search_end)
            break;
        if (key.offset > search_start)
            free_bytes += key.offset - search_start;

        dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
        extent_end = key.offset + btrfs_dev_extent_length(l,
                                  dev_extent);
        if (extent_end > search_start)
            search_start = extent_end;
        if (search_start > search_end)
            break;
next:
        path->slots[0]++;
        cond_resched();
    }

    if (search_start < search_end)
        free_bytes += search_end - search_start;

    *avail_bytes = free_bytes;
    ret = 0;
error:
    btrfs_free_path(path);
    return ret;
}

#define BTRFS_MAX_DEVS(info) ((BTRFS_LEAF_DATA_SIZE(info)   \
            - sizeof(struct btrfs_item)     \
            - sizeof(struct btrfs_chunk))       \
            / sizeof(struct btrfs_stripe) + 1)

#define BTRFS_MAX_DEVS_SYS_CHUNK ((BTRFS_SYSTEM_CHUNK_ARRAY_SIZE    \
                - 2 * sizeof(struct btrfs_disk_key) \
                - 2 * sizeof(struct btrfs_chunk))   \
                / sizeof(struct btrfs_stripe) + 1)
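
/*
 * Back-of-the-envelope (assuming a 16K nodesize and the usual on-disk
 * sizes: 101-byte header, 25-byte item, 17-byte disk key, 32-byte stripe,
 * 80-byte chunk including its first stripe): BTRFS_MAX_DEVS comes out to
 * (16283 - 25 - 80) / 32 + 1 = 506 stripes per chunk item, and
 * BTRFS_MAX_DEVS_SYS_CHUNK to (2048 - 34 - 160) / 32 + 1 = 58 for the
 * superblock's system chunk array.
 */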

int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
              struct btrfs_fs_info *info, u64 *start,
              u64 *num_bytes, u64 type)
{
    u64 dev_offset;
    struct btrfs_root *extent_root = info->extent_root;
    struct btrfs_root *chunk_root = info->chunk_root;
    struct btrfs_stripe *stripes;
    struct btrfs_device *device = NULL;
    struct btrfs_chunk *chunk;
    struct list_head private_devs;
    struct list_head *dev_list = &info->fs_devices->devices;
    struct list_head *cur;
    struct map_lookup *map;
    int min_stripe_size = SZ_1M;
    u64 calc_size = SZ_8M;
    u64 min_free;
    u64 max_chunk_size = 4 * calc_size;
    u64 avail = 0;
    u64 max_avail = 0;
    u64 percent_max;
    int num_stripes = 1;
    int max_stripes = 0;
    int min_stripes = 1;
    int sub_stripes = 1;
    int looped = 0;
    int ret;
    int index;
    int stripe_len = BTRFS_STRIPE_LEN;
    struct btrfs_key key;
    u64 offset;

    if (list_empty(dev_list)) {
        return -ENOSPC;
    }

    if (type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
        if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
            calc_size = SZ_8M;
            max_chunk_size = calc_size * 2;
            min_stripe_size = SZ_1M;
            max_stripes = BTRFS_MAX_DEVS_SYS_CHUNK;
        } else if (type & BTRFS_BLOCK_GROUP_DATA) {
            calc_size = SZ_1G;
            max_chunk_size = 10 * calc_size;
            min_stripe_size = SZ_64M;
            max_stripes = BTRFS_MAX_DEVS(info);
        } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
            /* for larger filesystems, use larger metadata chunks */
            if (info->fs_devices->total_rw_bytes > 50ULL * SZ_1G)
                max_chunk_size = SZ_1G;
            else
                max_chunk_size = SZ_256M;
            calc_size = max_chunk_size;
            min_stripe_size = SZ_32M;
            max_stripes = BTRFS_MAX_DEVS(info);
        }
    }
    if (type & BTRFS_BLOCK_GROUP_RAID1) {
        num_stripes = min_t(u64, 2,
                  btrfs_super_num_devices(info->super_copy));
        if (num_stripes < 2)
            return -ENOSPC;
        min_stripes = 2;
    }
    if (type & BTRFS_BLOCK_GROUP_RAID1C3) {
        num_stripes = min_t(u64, 3,
                  btrfs_super_num_devices(info->super_copy));
        if (num_stripes < 3)
            return -ENOSPC;
        min_stripes = 3;
    }
    if (type & BTRFS_BLOCK_GROUP_RAID1C4) {
        num_stripes = min_t(u64, 4,
                  btrfs_super_num_devices(info->super_copy));
        if (num_stripes < 4)
            return -ENOSPC;
        min_stripes = 4;
    }
    if (type & BTRFS_BLOCK_GROUP_DUP) {
        num_stripes = 2;
        min_stripes = 2;
    }
    if (type & (BTRFS_BLOCK_GROUP_RAID0)) {
        num_stripes = btrfs_super_num_devices(info->super_copy);
        if (num_stripes > max_stripes)
            num_stripes = max_stripes;
        min_stripes = 2;
    }
    if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
        num_stripes = btrfs_super_num_devices(info->super_copy);
        if (num_stripes > max_stripes)
            num_stripes = max_stripes;
        if (num_stripes < 4)
            return -ENOSPC;
        num_stripes &= ~(u32)1;
        sub_stripes = 2;
        min_stripes = 4;
    }
    if (type & (BTRFS_BLOCK_GROUP_RAID5)) {
        num_stripes = btrfs_super_num_devices(info->super_copy);
        if (num_stripes > max_stripes)
            num_stripes = max_stripes;
        if (num_stripes < 2)
            return -ENOSPC;
        min_stripes = 2;
        stripe_len = find_raid56_stripe_len(num_stripes - 1,
                    btrfs_super_stripesize(info->super_copy));
    }
    if (type & (BTRFS_BLOCK_GROUP_RAID6)) {
        num_stripes = btrfs_super_num_devices(info->super_copy);
        if (num_stripes > max_stripes)
            num_stripes = max_stripes;
        if (num_stripes < 3)
            return -ENOSPC;
        min_stripes = 3;
        stripe_len = find_raid56_stripe_len(num_stripes - 2,
                    btrfs_super_stripesize(info->super_copy));
    }

    /* we don't want a chunk larger than 10% of the FS */
    percent_max = div_factor(btrfs_super_total_bytes(info->super_copy), 1);
    max_chunk_size = min(percent_max, max_chunk_size);

again:
    if (chunk_bytes_by_type(type, calc_size, num_stripes, sub_stripes) >
        max_chunk_size) {
        calc_size = max_chunk_size;
        calc_size /= num_stripes;
        calc_size /= stripe_len;
        calc_size *= stripe_len;
    }
    /* we don't want tiny stripes */
    calc_size = max_t(u64, calc_size, min_stripe_size);

    calc_size /= stripe_len;
    calc_size *= stripe_len;
    INIT_LIST_HEAD(&private_devs);
    cur = dev_list->next;
    index = 0;

    if (type & BTRFS_BLOCK_GROUP_DUP)
        min_free = calc_size * 2;
    else
        min_free = calc_size;

    /* build a private list of devices we will allocate from */
    while (index < num_stripes) {
        device = list_entry(cur, struct btrfs_device, dev_list);
        ret = btrfs_device_avail_bytes(trans, device, &avail);
        if (ret)
            return ret;
        cur = cur->next;
        if (avail >= min_free) {
            list_move(&device->dev_list, &private_devs);
            index++;
            if (type & BTRFS_BLOCK_GROUP_DUP)
                index++;
        } else if (avail > max_avail)
            max_avail = avail;
        if (cur == dev_list)
            break;
    }
    if (index < num_stripes) {
        list_splice(&private_devs, dev_list);
        if (index >= min_stripes) {
            num_stripes = index;
            if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
                num_stripes /= sub_stripes;
                num_stripes *= sub_stripes;
            }
            looped = 1;
            goto again;
        }
        if (!looped && max_avail > 0) {
            looped = 1;
            calc_size = max_avail;
            goto again;
        }
        return -ENOSPC;
    }
    ret = find_next_chunk(info, &offset);
    if (ret)
        return ret;
    key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
    key.type = BTRFS_CHUNK_ITEM_KEY;
    key.offset = offset;

    chunk = kmalloc(btrfs_chunk_item_size(num_stripes), GFP_NOFS);
    if (!chunk)
        return -ENOMEM;

    map = kmalloc(btrfs_map_lookup_size(num_stripes), GFP_NOFS);
    if (!map) {
        kfree(chunk);
        return -ENOMEM;
    }

    stripes = &chunk->stripe;
    *num_bytes = chunk_bytes_by_type(type, calc_size,
                     num_stripes, sub_stripes);
    index = 0;
    while (index < num_stripes) {
        struct btrfs_stripe *stripe;
        BUG_ON(list_empty(&private_devs));
        cur = private_devs.next;
        device = list_entry(cur, struct btrfs_device, dev_list);

        /* loop over this device again if we're doing a dup group */
        if (!(type & BTRFS_BLOCK_GROUP_DUP) ||
            (index == num_stripes - 1))
            list_move(&device->dev_list, dev_list);

        ret = btrfs_alloc_dev_extent(trans, device, key.offset,
                 calc_size, &dev_offset);
        if (ret < 0)
            goto out_chunk_map;

        device->bytes_used += calc_size;
        ret = btrfs_update_device(trans, device);
        if (ret < 0)
            goto out_chunk_map;

        map->stripes[index].dev = device;
        map->stripes[index].physical = dev_offset;
        stripe = stripes + index;
        btrfs_set_stack_stripe_devid(stripe, device->devid);
        btrfs_set_stack_stripe_offset(stripe, dev_offset);
        memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
        index++;
    }
    BUG_ON(!list_empty(&private_devs));

    /* key was set above */
    btrfs_set_stack_chunk_length(chunk, *num_bytes);
    btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
    btrfs_set_stack_chunk_stripe_len(chunk, stripe_len);
    btrfs_set_stack_chunk_type(chunk, type);
    btrfs_set_stack_chunk_num_stripes(chunk, num_stripes);
    btrfs_set_stack_chunk_io_align(chunk, stripe_len);
    btrfs_set_stack_chunk_io_width(chunk, stripe_len);
    btrfs_set_stack_chunk_sector_size(chunk, info->sectorsize);
    btrfs_set_stack_chunk_sub_stripes(chunk, sub_stripes);
    map->sector_size = info->sectorsize;
    map->stripe_len = stripe_len;
    map->io_align = stripe_len;
    map->io_width = stripe_len;
    map->type = type;
    map->num_stripes = num_stripes;
    map->sub_stripes = sub_stripes;

    ret = btrfs_insert_item(trans, chunk_root, &key, chunk,
                btrfs_chunk_item_size(num_stripes));
    BUG_ON(ret);
    *start = key.offset;

    map->ce.start = key.offset;
    map->ce.size = *num_bytes;

    ret = insert_cache_extent(&info->mapping_tree.cache_tree, &map->ce);
    if (ret < 0)
        goto out_chunk_map;

    if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
        ret = btrfs_add_system_chunk(info, &key,
                    chunk, btrfs_chunk_item_size(num_stripes));
        if (ret < 0)
            goto out_chunk;
    }

    kfree(chunk);
    return ret;

out_chunk_map:
    kfree(map);
out_chunk:
    kfree(chunk);
    return ret;
}
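
/*
 * Usage sketch (a minimal call, assuming an open transaction handle):
 * allocate a RAID1 metadata chunk and receive its logical start and size:
 *
 *   u64 start, num_bytes;
 *   ret = btrfs_alloc_chunk(trans, info, &start, &num_bytes,
 *                           BTRFS_BLOCK_GROUP_METADATA |
 *                           BTRFS_BLOCK_GROUP_RAID1);
 */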

/*
 * Alloc a DATA chunk with SINGLE profile.
 *
 * It allocates a chunk with 1:1 mapping (btrfs logical bytenr == on-disk bytenr).
 * Caller must make sure the chunk and dev_extent are not occupied.
 */
int btrfs_alloc_data_chunk(struct btrfs_trans_handle *trans,
               struct btrfs_fs_info *info, u64 *start, u64 num_bytes)
{
    u64 dev_offset;
    struct btrfs_root *extent_root = info->extent_root;
    struct btrfs_root *chunk_root = info->chunk_root;
    struct btrfs_stripe *stripes;
    struct btrfs_device *device = NULL;
    struct btrfs_chunk *chunk;
    struct list_head *dev_list = &info->fs_devices->devices;
    struct list_head *cur;
    struct map_lookup *map;
    u64 calc_size = SZ_8M;
    int num_stripes = 1;
    int sub_stripes = 1;
    int ret;
    int index;
    int stripe_len = BTRFS_STRIPE_LEN;
    struct btrfs_key key;

    if (*start != round_down(*start, info->sectorsize)) {
        error("DATA chunk start not sectorsize aligned: %llu",
                (unsigned long long)*start);
        return -EINVAL;
    }

    key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
    key.type = BTRFS_CHUNK_ITEM_KEY;
    key.offset = *start;
    dev_offset = *start;

    chunk = kmalloc(btrfs_chunk_item_size(num_stripes), GFP_NOFS);
    if (!chunk)
        return -ENOMEM;

    map = kmalloc(btrfs_map_lookup_size(num_stripes), GFP_NOFS);
    if (!map) {
        kfree(chunk);
        return -ENOMEM;
    }

    stripes = &chunk->stripe;
    calc_size = num_bytes;

    index = 0;
    cur = dev_list->next;
    device = list_entry(cur, struct btrfs_device, dev_list);

    while (index < num_stripes) {
        struct btrfs_stripe *stripe;

        ret = btrfs_insert_dev_extent(trans, device, key.offset, calc_size,
                dev_offset);
        BUG_ON(ret);

        device->bytes_used += calc_size;
        ret = btrfs_update_device(trans, device);
        BUG_ON(ret);

        map->stripes[index].dev = device;
        map->stripes[index].physical = dev_offset;
        stripe = stripes + index;
        btrfs_set_stack_stripe_devid(stripe, device->devid);
        btrfs_set_stack_stripe_offset(stripe, dev_offset);
        memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
        index++;
    }

    /* key was set above */
    btrfs_set_stack_chunk_length(chunk, num_bytes);
    btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
    btrfs_set_stack_chunk_stripe_len(chunk, stripe_len);
    btrfs_set_stack_chunk_type(chunk, BTRFS_BLOCK_GROUP_DATA);
    btrfs_set_stack_chunk_num_stripes(chunk, num_stripes);
    btrfs_set_stack_chunk_io_align(chunk, stripe_len);
    btrfs_set_stack_chunk_io_width(chunk, stripe_len);
    btrfs_set_stack_chunk_sector_size(chunk, info->sectorsize);
    btrfs_set_stack_chunk_sub_stripes(chunk, sub_stripes);
    map->sector_size = info->sectorsize;
    map->stripe_len = stripe_len;
    map->io_align = stripe_len;
    map->io_width = stripe_len;
    map->type = BTRFS_BLOCK_GROUP_DATA;
    map->num_stripes = num_stripes;
    map->sub_stripes = sub_stripes;

    ret = btrfs_insert_item(trans, chunk_root, &key, chunk,
                btrfs_chunk_item_size(num_stripes));
    BUG_ON(ret);

    map->ce.start = key.offset;
    map->ce.size = num_bytes;

    ret = insert_cache_extent(&info->mapping_tree.cache_tree, &map->ce);
    BUG_ON(ret);

    kfree(chunk);
    return ret;
}

int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
{
    struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
    struct cache_extent *ce;
    struct map_lookup *map;
    int ret;

    ce = search_cache_extent(&map_tree->cache_tree, logical);
    if (!ce) {
        fprintf(stderr, "No mapping for %llu-%llu\n",
            (unsigned long long)logical,
            (unsigned long long)logical+len);
        return 1;
    }
    if (ce->start > logical || ce->start + ce->size < logical) {
        fprintf(stderr, "Invalid mapping for %llu-%llu, got "
            "%llu-%llu\n", (unsigned long long)logical,
            (unsigned long long)logical+len,
            (unsigned long long)ce->start,
            (unsigned long long)ce->start + ce->size);
        return 1;
    }
    map = container_of(ce, struct map_lookup, ce);

    if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
             BTRFS_BLOCK_GROUP_RAID1C3 | BTRFS_BLOCK_GROUP_RAID1C4))
        ret = map->num_stripes;
    else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
        ret = map->sub_stripes;
    else if (map->type & BTRFS_BLOCK_GROUP_RAID5)
        ret = 2;
    else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
        ret = 3;
    else
        ret = 1;
    return ret;
}
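
/*
 * Example: a raid1c3 chunk reports 3 copies (its num_stripes). The RAID5
 * and RAID6 values of 2 and 3 are not literal copies: they count the data
 * itself plus the parity-based reconstructions that can regenerate it.
 */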

int btrfs_next_bg(struct btrfs_fs_info *fs_info, u64 *logical,
          u64 *size, u64 type)
{
    struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
    struct cache_extent *ce;
    struct map_lookup *map;
    u64 cur = *logical;

    ce = search_cache_extent(&map_tree->cache_tree, cur);

    while (ce) {
        /*
         * Only jump to the next bg if our cur is not 0: the initial
         * logical for btrfs_next_bg() is 0, and jumping ahead right
         * away would skip a valid bg.
         */
        if (cur) {
            ce = next_cache_extent(ce);
            if (!ce)
                return -ENOENT;
        }

        cur = ce->start;
        map = container_of(ce, struct map_lookup, ce);
        if (map->type & type) {
            *logical = ce->start;
            *size = ce->size;
            return 0;
        }
        if (!cur)
            ce = next_cache_extent(ce);
    }

    return -ENOENT;
}

int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
             u64 physical, u64 **logical, int *naddrs, int *stripe_len)
{
    struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
    struct cache_extent *ce;
    struct map_lookup *map;
    u64 *buf;
    u64 bytenr;
    u64 length;
    u64 stripe_nr;
    u64 rmap_len;
    int i, j, nr = 0;

    ce = search_cache_extent(&map_tree->cache_tree, chunk_start);
    BUG_ON(!ce);
    map = container_of(ce, struct map_lookup, ce);

    length = ce->size;
    rmap_len = map->stripe_len;
    if (map->type & BTRFS_BLOCK_GROUP_RAID10)
        length = ce->size / (map->num_stripes / map->sub_stripes);
    else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
        length = ce->size / map->num_stripes;
    else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
                  BTRFS_BLOCK_GROUP_RAID6)) {
        length = ce->size / nr_data_stripes(map);
        rmap_len = map->stripe_len * nr_data_stripes(map);
    }

    buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS);
    if (!buf)
        return -ENOMEM;

    for (i = 0; i < map->num_stripes; i++) {
        if (map->stripes[i].physical > physical ||
            map->stripes[i].physical + length <= physical)
            continue;

        stripe_nr = (physical - map->stripes[i].physical) /
                map->stripe_len;

        if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
            stripe_nr = (stripe_nr * map->num_stripes + i) /
                    map->sub_stripes;
        } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
            stripe_nr = stripe_nr * map->num_stripes + i;
        } /* else if RAID[56], multiply by nr_data_stripes().
           * Alternatively, just use rmap_len below instead of
           * map->stripe_len */

        bytenr = ce->start + stripe_nr * rmap_len;
        for (j = 0; j < nr; j++) {
            if (buf[j] == bytenr)
                break;
        }
        if (j == nr)
            buf[nr++] = bytenr;
    }

    *logical = buf;
    *naddrs = nr;
    *stripe_len = rmap_len;

    return 0;
}

static inline int parity_smaller(u64 a, u64 b)
{
    return a > b;
}
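
/*
 * Sorting by raid_map value works because BTRFS_RAID5_P_STRIPE and
 * BTRFS_RAID6_Q_STRIPE are defined as the two largest u64 values
 * ((u64)-2 and (u64)-1), so an ascending sort naturally places the
 * parity stripes after all of the data stripes.
 */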
 1521 
 1522 /* Bubble-sort the stripe set to put the parity/syndrome stripes last */
 1523 static void sort_parity_stripes(struct btrfs_multi_bio *bbio, u64 *raid_map)
 1524 {
 1525     struct btrfs_bio_stripe s;
 1526     int i;
 1527     u64 l;
 1528     int again = 1;
 1529 
 1530     while (again) {
 1531         again = 0;
 1532         for (i = 0; i < bbio->num_stripes - 1; i++) {
 1533             if (parity_smaller(raid_map[i], raid_map[i+1])) {
 1534                 s = bbio->stripes[i];
 1535                 l = raid_map[i];
 1536                 bbio->stripes[i] = bbio->stripes[i+1];
 1537                 raid_map[i] = raid_map[i+1];
 1538                 bbio->stripes[i+1] = s;
 1539                 raid_map[i+1] = l;
 1540                 again = 1;
 1541             }
 1542         }
 1543     }
 1544 }
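
      /*
       * The parity markers sort last because they are the two largest
       * u64 values: BTRFS_RAID5_P_STRIPE is (u64)-2 and
       * BTRFS_RAID6_Q_STRIPE is (u64)-1.  E.g. a RAID6 raid_map of
       * { P, Q, 262144, 327680 } sorts to { 262144, 327680, P, Q },
       * with the matching bbio->stripes entries swapped in lockstep, so
       * the data stripes come first and P/Q occupy the last two slots.
       */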
 1545 
 1546 int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 1547             u64 logical, u64 *length,
 1548             struct btrfs_multi_bio **multi_ret, int mirror_num,
 1549             u64 **raid_map_ret)
 1550 {
 1551     return __btrfs_map_block(fs_info, rw, logical, length, NULL,
 1552                  multi_ret, mirror_num, raid_map_ret);
 1553 }
 1554 
 1555 int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 1556               u64 logical, u64 *length, u64 *type,
 1557               struct btrfs_multi_bio **multi_ret, int mirror_num,
 1558               u64 **raid_map_ret)
 1559 {
 1560     struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
 1561     struct cache_extent *ce;
 1562     struct map_lookup *map;
 1563     u64 offset;
 1564     u64 stripe_offset;
 1565     u64 stripe_nr;
 1566     u64 *raid_map = NULL;
 1567     int stripes_allocated = 8;
 1568     int stripes_required = 1;
 1569     int stripe_index;
 1570     int i;
 1571     struct btrfs_multi_bio *multi = NULL;
 1572 
 1573     if (multi_ret && rw == READ) {
 1574         stripes_allocated = 1;
 1575     }
 1576 again:
 1577     ce = search_cache_extent(&map_tree->cache_tree, logical);
 1578     if (!ce) {
 1579         kfree(multi);
 1580         *length = (u64)-1;
 1581         return -ENOENT;
 1582     }
 1583     if (ce->start > logical) {
 1584         kfree(multi);
 1585         *length = ce->start - logical;
 1586         return -ENOENT;
 1587     }
 1588 
 1589     if (multi_ret) {
 1590         multi = kzalloc(btrfs_multi_bio_size(stripes_allocated),
 1591                 GFP_NOFS);
 1592         if (!multi)
 1593             return -ENOMEM;
 1594     }
 1595     map = container_of(ce, struct map_lookup, ce);
 1596     offset = logical - ce->start;
 1597 
 1598     if (rw == WRITE) {
 1599         if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
 1600                  BTRFS_BLOCK_GROUP_RAID1C3 |
 1601                  BTRFS_BLOCK_GROUP_RAID1C4 |
 1602                  BTRFS_BLOCK_GROUP_DUP)) {
 1603             stripes_required = map->num_stripes;
 1604         } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
 1605             stripes_required = map->sub_stripes;
 1606         }
 1607     }
 1608     if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)
 1609         && multi_ret && ((rw & WRITE) || mirror_num > 1) && raid_map_ret) {
 1610             /* RAID[56] write or recovery. Return all stripes */
 1611             stripes_required = map->num_stripes;
 1612 
 1613             /* Only allocate the map if we've already got a large enough multi_ret */
 1614             if (stripes_allocated >= stripes_required) {
 1615                 raid_map = kmalloc(sizeof(u64) * map->num_stripes, GFP_NOFS);
 1616                 if (!raid_map) {
 1617                     kfree(multi);
 1618                     return -ENOMEM;
 1619                 }
 1620             }
 1621     }
 1622 
 1623     /* if our multi bio struct is too small, back off and try again */
 1624     if (multi_ret && stripes_allocated < stripes_required) {
 1625         stripes_allocated = stripes_required;
 1626         kfree(multi);
 1627         multi = NULL;
 1628         goto again;
 1629     }
 1630     stripe_nr = offset;
 1631     /*
 1632      * stripe_nr counts the total number of stripes we have to stride
 1633      * to get to this block
 1634      */
 1635     stripe_nr = stripe_nr / map->stripe_len;
 1636 
 1637     stripe_offset = stripe_nr * map->stripe_len;
 1638     BUG_ON(offset < stripe_offset);
 1639 
 1640     /* stripe_offset is the offset of this block in its stripe */
 1641     stripe_offset = offset - stripe_offset;
 1642 
 1643     if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
 1644              BTRFS_BLOCK_GROUP_RAID1C3 | BTRFS_BLOCK_GROUP_RAID1C4 |
 1645              BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 |
 1646              BTRFS_BLOCK_GROUP_RAID10 |
 1647              BTRFS_BLOCK_GROUP_DUP)) {
 1648         /* we limit the length of each bio to what fits in a stripe */
 1649         *length = min_t(u64, ce->size - offset,
 1650                   map->stripe_len - stripe_offset);
 1651     } else {
 1652         *length = ce->size - offset;
 1653     }
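          /*
           * For illustration (values assumed): with stripe_len = 65536
           * and offset = 153600, stripe_nr = 153600 / 65536 = 2 and
           * stripe_offset = 153600 - 2 * 65536 = 22528, so for the
           * striped profiles above *length is clamped to at most
           * 65536 - 22528 = 43008 bytes.
           */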
 1654 
 1655     if (!multi_ret)
 1656         goto out;
 1657 
 1658     multi->num_stripes = 1;
 1659     stripe_index = 0;
 1660     if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
 1661              BTRFS_BLOCK_GROUP_RAID1C3 |
 1662              BTRFS_BLOCK_GROUP_RAID1C4)) {
 1663         if (rw == WRITE)
 1664             multi->num_stripes = map->num_stripes;
 1665         else if (mirror_num)
 1666             stripe_index = mirror_num - 1;
 1667         else
 1668             stripe_index = stripe_nr % map->num_stripes;
 1669     } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
 1670         int factor = map->num_stripes / map->sub_stripes;
 1671 
 1672         stripe_index = stripe_nr % factor;
 1673         stripe_index *= map->sub_stripes;
 1674 
 1675         if (rw == WRITE)
 1676             multi->num_stripes = map->sub_stripes;
 1677         else if (mirror_num)
 1678             stripe_index += mirror_num - 1;
 1679 
 1680         stripe_nr = stripe_nr / factor;
 1681     } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
 1682         if (rw == WRITE)
 1683             multi->num_stripes = map->num_stripes;
 1684         else if (mirror_num)
 1685             stripe_index = mirror_num - 1;
 1686     } else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
 1687                 BTRFS_BLOCK_GROUP_RAID6)) {
 1688 
 1689         if (raid_map) {
 1690             int rot;
 1691             u64 tmp;
 1692             u64 raid56_full_stripe_start;
 1693             u64 full_stripe_len = nr_data_stripes(map) * map->stripe_len;
 1694 
 1695             /*
 1696              * align the start of our data stripe in the logical
 1697              * address space
 1698              */
 1699             raid56_full_stripe_start = offset / full_stripe_len;
 1700             raid56_full_stripe_start *= full_stripe_len;
 1701 
 1702             /* get the data stripe number */
 1703             stripe_nr = raid56_full_stripe_start / map->stripe_len;
 1704             stripe_nr = stripe_nr / nr_data_stripes(map);
 1705 
 1706             /* Work out the disk rotation on this stripe-set */
 1707             rot = stripe_nr % map->num_stripes;
 1708 
 1709             /* Fill in the logical address of each stripe */
 1710             tmp = stripe_nr * nr_data_stripes(map);
 1711 
 1712             for (i = 0; i < nr_data_stripes(map); i++)
 1713                 raid_map[(i+rot) % map->num_stripes] =
 1714                     ce->start + (tmp + i) * map->stripe_len;
 1715 
 1716             raid_map[(i+rot) % map->num_stripes] = BTRFS_RAID5_P_STRIPE;
 1717             if (map->type & BTRFS_BLOCK_GROUP_RAID6)
 1718                 raid_map[(i+rot+1) % map->num_stripes] = BTRFS_RAID6_Q_STRIPE;
 1719 
 1720             *length = map->stripe_len;
 1721             stripe_index = 0;
 1722             stripe_offset = 0;
 1723             multi->num_stripes = map->num_stripes;
 1724         } else {
 1725             stripe_index = stripe_nr % nr_data_stripes(map);
 1726             stripe_nr = stripe_nr / nr_data_stripes(map);
 1727 
 1728             /*
 1729              * Mirror #0 or #1 means the original data block.
 1730              * Mirror #2 is RAID5 parity block.
 1731              * Mirror #3 is RAID6 Q block.
 1732              */
 1733             if (mirror_num > 1)
 1734                 stripe_index = nr_data_stripes(map) + mirror_num - 2;
 1735 
 1736             /* We distribute the parity blocks across stripes */
 1737             stripe_index = (stripe_nr + stripe_index) % map->num_stripes;
 1738         }
 1739     } else {
 1740         /*
 1741          * after this division, stripe_nr is the number of stripes
 1742          * on this device we have to walk to find the data, and
 1743          * stripe_index is the number of our device in the stripe array
 1744          */
 1745         stripe_index = stripe_nr % map->num_stripes;
 1746         stripe_nr = stripe_nr / map->num_stripes;
 1747     }
 1748     BUG_ON(stripe_index >= map->num_stripes);
 1749 
 1750     for (i = 0; i < multi->num_stripes; i++) {
 1751         multi->stripes[i].physical =
 1752             map->stripes[stripe_index].physical + stripe_offset +
 1753             stripe_nr * map->stripe_len;
 1754         multi->stripes[i].dev = map->stripes[stripe_index].dev;
 1755         stripe_index++;
 1756     }
 1757     *multi_ret = multi;
 1758 
 1759     if (type)
 1760         *type = map->type;
 1761 
 1762     if (raid_map) {
 1763         sort_parity_stripes(multi, raid_map);
 1764         *raid_map_ret = raid_map;
 1765     }
 1766 out:
 1767     return 0;
 1768 }
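
      /*
       * A worked example of the RAID[56] full-stripe mapping above
       * (values assumed for illustration): a RAID6 chunk at
       * ce->start = 0 with num_stripes = 4, nr_data_stripes = 2 and
       * stripe_len = 65536, mapped at logical offset 327680 with
       * raid_map_ret set:
       *
       *   full_stripe_len          = 2 * 65536 = 131072
       *   raid56_full_stripe_start = rounddown(327680, 131072) = 262144
       *   stripe_nr                = 262144 / 65536 / 2 = 2
       *   rot                      = 2 % 4 = 2
       *
       * which yields raid_map = { P, Q, 262144, 327680 }; after
       * sort_parity_stripes() the caller sees the two data stripes
       * first and P/Q last, with *length set to one stripe_len.
       */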
 1769 
 1770 struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,
 1771                        u8 *uuid, u8 *fsid)
 1772 {
 1773     struct btrfs_device *device;
 1774     struct btrfs_fs_devices *cur_devices;
 1775 
 1776     cur_devices = fs_info->fs_devices;
 1777     while (cur_devices) {
 1778         if (!fsid ||
 1779             (!memcmp(cur_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE) ||
 1780              fs_info->ignore_fsid_mismatch)) {
 1781             device = find_device(cur_devices, devid, uuid);
 1782             if (device)
 1783                 return device;
 1784         }
 1785         cur_devices = cur_devices->seed;
 1786     }
 1787     return NULL;
 1788 }
 1789 
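      /*
       * Return the @instance-th (0-based) device matching @devid on
       * @fs_devices, or NULL if there are not that many matches.
       * Several list entries can share a devid, e.g. when a device of
       * the filesystem has been cloned.
       */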
 1790 struct btrfs_device *
 1791 btrfs_find_device_by_devid(struct btrfs_fs_devices *fs_devices,
 1792                u64 devid, int instance)
 1793 {
 1794     struct list_head *head = &fs_devices->devices;
 1795     struct btrfs_device *dev;
 1796     int num_found = 0;
 1797 
 1798     list_for_each_entry(dev, head, dev_list) {
 1799         if (dev->devid == devid && num_found++ == instance)
 1800             return dev;
 1801     }
 1802     return NULL;
 1803 }
 1804 
 1805 /*
 1806  * Return 0 if the chunk at @chunk_offset exists and is not read-only.
 1807  * Return 1 if the chunk at @chunk_offset exists and is read-only.
 1808  * Return <0 if we can't find chunk at @chunk_offset.
 1809  */
 1810 int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset)
 1811 {
 1812     struct cache_extent *ce;
 1813     struct map_lookup *map;
 1814     struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
 1815     int readonly = 0;
 1816     int i;
 1817 
 1818     /*
 1819      * During chunk recovery, we may fail to find a block group's
 1820      * corresponding chunk; it will be rebuilt later.
 1821      */
 1822     if (fs_info->is_chunk_recover)
 1823         return 0;
 1824 
 1825     ce = search_cache_extent(&map_tree->cache_tree, chunk_offset);
 1826     if (!ce)
 1827         return -ENOENT;
 1828 
 1829     map = container_of(ce, struct map_lookup, ce);
 1830     for (i = 0; i < map->num_stripes; i++) {
 1831         if (!map->stripes[i].dev->writeable) {
 1832             readonly = 1;
 1833             break;
 1834         }
 1835     }
 1836 
 1837     return readonly;
 1838 }
 1839 
 1840 static struct btrfs_device *fill_missing_device(u64 devid)
 1841 {
 1842     struct btrfs_device *device;
 1843 
 1844     device = kzalloc(sizeof(*device), GFP_NOFS);
          if (!device)
              return NULL;
 1845     device->devid = devid;
 1846     device->fd = -1;
 1847     return device;
 1848 }
 1849 
 1850 /*
 1851  * slot == -1: SYSTEM chunk
 1852  * return <0 (-EUCLEAN or -EIO) on error, otherwise return 0
 1853  */
 1854 int btrfs_check_chunk_valid(struct btrfs_fs_info *fs_info,
 1855                 struct extent_buffer *leaf,
 1856                 struct btrfs_chunk *chunk,
 1857                 int slot, u64 logical)
 1858 {
 1859     u64 length;
 1860     u64 stripe_len;
 1861     u16 num_stripes;
 1862     u16 sub_stripes;
 1863     u64 type;
 1864     u32 chunk_ondisk_size;
 1865     u32 sectorsize = fs_info->sectorsize;
 1866 
 1867     /*
 1868      * Basic chunk item size check.  Note that btrfs_chunk already contains
 1869      * one stripe, so the size is checked with "<" rather than "==".
 1870      */
 1871     if (slot >= 0 &&
 1872         btrfs_item_size_nr(leaf, slot) < sizeof(struct btrfs_chunk)) {
 1873         error("invalid chunk item size, have %u expect [%zu, %lu)",
 1874             btrfs_item_size_nr(leaf, slot),
 1875             sizeof(struct btrfs_chunk),
 1876             BTRFS_LEAF_DATA_SIZE(fs_info));
 1877         return -EUCLEAN;
 1878     }
 1879     length = btrfs_chunk_length(leaf, chunk);
 1880     stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
 1881     num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
 1882     sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
 1883     type = btrfs_chunk_type(leaf, chunk);
 1884 
 1885     if (num_stripes == 0) {
 1886         error("invalid num_stripes, have %u expect non-zero",
 1887             num_stripes);
 1888         return -EUCLEAN;
 1889     }
 1890     if (slot >= 0 && btrfs_chunk_item_size(num_stripes) !=
 1891         btrfs_item_size_nr(leaf, slot)) {
 1892         error("invalid chunk item size, have %u expect %lu",
 1893             btrfs_item_size_nr(leaf, slot),
 1894             btrfs_chunk_item_size(num_stripes));
 1895         return -EUCLEAN;
 1896     }
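
          /*
           * For reference: btrfs_chunk_item_size(n) expands to
           * sizeof(struct btrfs_chunk) +
           * (n - 1) * sizeof(struct btrfs_stripe), i.e.
           * 80 + (n - 1) * 32 bytes in the on-disk format, so e.g. a
           * 3-stripe chunk item must be exactly 144 bytes.
           */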
 1897 
 1898     /*
 1899      * These validity checks may be insufficient to cover every corner case.
 1900      */
 1901     if (!IS_ALIGNED(logical, sectorsize)) {
 1902         error("invalid chunk logical %llu", logical);
 1903         return -EIO;
 1904     }
 1905     if (btrfs_chunk_sector_size(leaf, chunk) != sectorsize) {
 1906         error("invalid chunk sectorsize %llu",
 1907               (unsigned long long)btrfs_chunk_sector_size(leaf, chunk));
 1908         return -EIO;
 1909     }
 1910     if (!length || !IS_ALIGNED(length, sectorsize)) {
 1911         error("invalid chunk length %llu", length);
 1912         return -EIO;
 1913     }
 1914     if (stripe_len != BTRFS_STRIPE_LEN) {
 1915         error("invalid chunk stripe length: %llu", stripe_len);
 1916         return -EIO;
 1917     }
 1918     /* Check on chunk item type */
 1919     if (slot == -1 && (type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) {
 1920         error("invalid chunk type %llu", type);
 1921         return -EIO;
 1922     }
 1923     if (type & ~(BTRFS_BLOCK_GROUP_TYPE_MASK |
 1924              BTRFS_BLOCK_GROUP_PROFILE_MASK)) {
 1925         error("unrecognized chunk type: %llu",
 1926               ~(BTRFS_BLOCK_GROUP_TYPE_MASK |
 1927             BTRFS_BLOCK_GROUP_PROFILE_MASK) & type);
 1928         return -EIO;
 1929     }
 1930     if (!(type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
 1931         error("missing chunk type flag: %llu", type);
 1932         return -EIO;
 1933     }
 1934     if (!(is_power_of_2(type & BTRFS_BLOCK_GROUP_PROFILE_MASK) ||
 1935           (type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0)) {
 1936         error("conflicting chunk type detected: %llu", type);
 1937         return -EIO;
 1938     }
 1939     if ((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) &&
 1940         !is_power_of_2(type & BTRFS_BLOCK_GROUP_PROFILE_MASK)) {
 1941         error("conflicting chunk profile detected: %llu", type);
 1942         return -EIO;
 1943     }
 1944 
 1945     chunk_ondisk_size = btrfs_chunk_item_size(num_stripes);
 1946     /*
 1947      * A btrfs_chunk contains at least one stripe; a sys chunk can't
 1948      * exceed the system chunk array size, while a normal chunk must
 1949      * match its chunk item size.
 1950      */
 1951     if (num_stripes < 1 ||
 1952         (slot == -1 && chunk_ondisk_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) ||
 1953         (slot >= 0 && chunk_ondisk_size > btrfs_item_size_nr(leaf, slot))) {
 1954         error("invalid num_stripes: %u", num_stripes);
 1955         return -EIO;
 1956     }
 1957     /*
 1958      * Device number check against profile
 1959      */
 1960     if ((type & BTRFS_BLOCK_GROUP_RAID10 && (sub_stripes != 2 ||
 1961           !IS_ALIGNED(num_stripes, sub_stripes))) ||
 1962         (type & BTRFS_BLOCK_GROUP_RAID1 && num_stripes < 1) ||
 1963         (type & BTRFS_BLOCK_GROUP_RAID1C3 && num_stripes < 3) ||
 1964         (type & BTRFS_BLOCK_GROUP_RAID1C4 && num_stripes < 4) ||
 1965         (type & BTRFS_BLOCK_GROUP_RAID5 && num_stripes < 2) ||
 1966         (type & BTRFS_BLOCK_GROUP_RAID6 && num_stripes < 3) ||
 1967         (type & BTRFS_BLOCK_GROUP_DUP && num_stripes > 2) ||
 1968         ((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 &&
 1969          num_stripes != 1)) {
 1970         error("Invalid num_stripes:sub_stripes %u:%u for profile %llu",
 1971               num_stripes, sub_stripes,
 1972               type & BTRFS_BLOCK_GROUP_PROFILE_MASK);
 1973         return -EIO;
 1974     }
 1975 
 1976     return 0;
 1977 }
 1978 
 1979 /*
 1980  * Slot is used to verify the chunk item is valid
 1981  *
 1982  * For sys chunk in superblock, pass -1 to indicate sys chunk.
 1983  */
 1984 static int read_one_chunk(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
 1985               struct extent_buffer *leaf,
 1986               struct btrfs_chunk *chunk, int slot)
 1987 {
 1988     struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
 1989     struct map_lookup *map;
 1990     struct cache_extent *ce;
 1991     u64 logical;
 1992     u64 length;
 1993     u64 devid;
 1994     u8 uuid[BTRFS_UUID_SIZE];
 1995     int num_stripes;
 1996     int ret;
 1997     int i;
 1998 
 1999     logical = key->offset;
 2000     length = btrfs_chunk_length(leaf, chunk);
 2001     num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
 2002     /* Validation check */
 2003     ret = btrfs_check_chunk_valid(fs_info, leaf, chunk, slot, logical);
 2004     if (ret) {
 2005         error("%s checksums match, but it has an invalid chunk, %s",
 2006               (slot == -1) ? "Superblock" : "Metadata",
 2007               (slot == -1) ? "try btrfsck --repair -s <superblock>, i.e. 0, 1 or 2" : "");
 2008         return ret;
 2009     }
 2010 
 2011     ce = search_cache_extent(&map_tree->cache_tree, logical);
 2012 
 2013     /* already mapped? */
 2014     if (ce && ce->start <= logical && ce->start + ce->size > logical) {
 2015         return 0;
 2016     }
 2017 
 2018     map = kmalloc(btrfs_map_lookup_size(num_stripes), GFP_NOFS);
 2019     if (!map)
 2020         return -ENOMEM;
 2021 
 2022     map->ce.start = logical;
 2023     map->ce.size = length;
 2024     map->num_stripes = num_stripes;
 2025     map->io_width = btrfs_chunk_io_width(leaf, chunk);
 2026     map->io_align = btrfs_chunk_io_align(leaf, chunk);
 2027     map->sector_size = btrfs_chunk_sector_size(leaf, chunk);
 2028     map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
 2029     map->type = btrfs_chunk_type(leaf, chunk);
 2030     map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
 2031 
 2032     for (i = 0; i < num_stripes; i++) {
 2033         map->stripes[i].physical =
 2034             btrfs_stripe_offset_nr(leaf, chunk, i);
 2035         devid = btrfs_stripe_devid_nr(leaf, chunk, i);
 2036         read_extent_buffer(leaf, uuid, (unsigned long)
 2037                    btrfs_stripe_dev_uuid_nr(chunk, i),
 2038                    BTRFS_UUID_SIZE);
 2039         map->stripes[i].dev = btrfs_find_device(fs_info, devid, uuid,
 2040                             NULL);
 2041         if (!map->stripes[i].dev) {
 2042             map->stripes[i].dev = fill_missing_device(devid);
                  if (!map->stripes[i].dev) {
                      kfree(map);
                      return -ENOMEM;
                  }
 2043             printf("warning, device %llu is missing\n",
 2044                    (unsigned long long)devid);
 2045             list_add(&map->stripes[i].dev->dev_list,
 2046                  &fs_info->fs_devices->devices);
 2047         }
 2048 
 2049     }
 2050     ret = insert_cache_extent(&map_tree->cache_tree, &map->ce);
 2051     if (ret < 0) {
 2052         errno = -ret;
 2053         error("failed to add chunk map start=%llu len=%llu: %d (%m)",
 2054               map->ce.start, map->ce.size, ret);
 2055     }
 2056 
 2057     return ret;
 2058 }
 2059 
 2060 static int fill_device_from_item(struct extent_buffer *leaf,
 2061                  struct btrfs_dev_item *dev_item,
 2062                  struct btrfs_device *device)
 2063 {
 2064     unsigned long ptr;
 2065 
 2066     device->devid = btrfs_device_id(leaf, dev_item);
 2067     device->total_bytes = btrfs_device_total_bytes(leaf, dev_item);
 2068     device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
 2069     device->type = btrfs_device_type(leaf, dev_item);
 2070     device->io_align = btrfs_device_io_align(leaf, dev_item);
 2071     device->io_width = btrfs_device_io_width(leaf, dev_item);
 2072     device->sector_size = btrfs_device_sector_size(leaf, dev_item);
 2073 
 2074     ptr = (unsigned long)btrfs_device_uuid(dev_item);
 2075     read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
 2076 
 2077     return 0;
 2078 }
 2079 
 2080 static int open_seed_devices(struct btrfs_fs_info *fs_info, u8 *fsid)
 2081 {
 2082     struct btrfs_fs_devices *fs_devices;
 2083     int ret;
 2084 
 2085     fs_devices = fs_info->fs_devices->seed;
 2086     while (fs_devices) {
 2087         if (!memcmp(fs_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
 2088             ret = 0;
 2089             goto out;
 2090         }
 2091         fs_devices = fs_devices->seed;
 2092     }
 2093 
 2094     fs_devices = find_fsid(fsid, NULL);
 2095     if (!fs_devices) {
 2096         /* all seed devices are missing */
 2097         fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
 2098         if (!fs_devices) {
 2099             ret = -ENOMEM;
 2100             goto out;
 2101         }
 2102         INIT_LIST_HEAD(&fs_devices->devices);
 2103         list_add(&fs_devices->list, &fs_uuids);
 2104         memcpy(fs_devices->fsid, fsid, BTRFS_FSID_SIZE);
 2105     }
 2106 
 2107     ret = btrfs_open_devices(fs_devices, O_RDONLY);
 2108     if (ret)
 2109         goto out;
 2110 
 2111     fs_devices->seed = fs_info->fs_devices->seed;
 2112     fs_info->fs_devices->seed = fs_devices;
 2113 out:
 2114     return ret;
 2115 }
 2116 
 2117 static int read_one_dev(struct btrfs_fs_info *fs_info,
 2118             struct extent_buffer *leaf,
 2119             struct btrfs_dev_item *dev_item)
 2120 {
 2121     struct btrfs_device *device;
 2122     u64 devid;
 2123     int ret = 0;
 2124     u8 fs_uuid[BTRFS_UUID_SIZE];
 2125     u8 dev_uuid[BTRFS_UUID_SIZE];
 2126 
 2127     devid = btrfs_device_id(leaf, dev_item);
 2128     read_extent_buffer(leaf, dev_uuid,
 2129                (unsigned long)btrfs_device_uuid(dev_item),
 2130                BTRFS_UUID_SIZE);
 2131     read_extent_buffer(leaf, fs_uuid,
 2132                (unsigned long)btrfs_device_fsid(dev_item),
 2133                BTRFS_FSID_SIZE);
 2134 
 2135     if (memcmp(fs_uuid, fs_info->fs_devices->fsid, BTRFS_UUID_SIZE)) {
 2136         ret = open_seed_devices(fs_info, fs_uuid);
 2137         if (ret)
 2138             return ret;
 2139     }
 2140 
 2141     device = btrfs_find_device(fs_info, devid, dev_uuid, fs_uuid);
 2142     if (!device) {
 2143         device = kzalloc(sizeof(*device), GFP_NOFS);
 2144         if (!device)
 2145             return -ENOMEM;
 2146         device->fd = -1;
 2147         list_add(&device->dev_list,
 2148              &fs_info->fs_devices->devices);
 2149     }
 2150 
 2151     fill_device_from_item(leaf, dev_item, device);
 2152     device->dev_root = fs_info->dev_root;
 2153     fs_info->fs_devices->total_rw_bytes +=
 2154         btrfs_device_total_bytes(leaf, dev_item);
 2155     return ret;
 2156 }
 2157 
 2158 int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)
 2159 {
 2160     struct btrfs_super_block *super_copy = fs_info->super_copy;
 2161     struct extent_buffer *sb;
 2162     struct btrfs_disk_key *disk_key;
 2163     struct btrfs_chunk *chunk;
 2164     u8 *array_ptr;
 2165     unsigned long sb_array_offset;
 2166     int ret = 0;
 2167     u32 num_stripes;
 2168     u32 array_size;
 2169     u32 len = 0;
 2170     u32 cur_offset;
 2171     struct btrfs_key key;
 2172 
 2173     if (fs_info->nodesize < BTRFS_SUPER_INFO_SIZE) {
 2174         printf("ERROR: nodesize %u too small to read superblock\n",
 2175                 fs_info->nodesize);
 2176         return -EINVAL;
 2177     }
 2178     sb = alloc_dummy_extent_buffer(fs_info, BTRFS_SUPER_INFO_OFFSET,
 2179                        BTRFS_SUPER_INFO_SIZE);
 2180     if (!sb)
 2181         return -ENOMEM;
 2182     btrfs_set_buffer_uptodate(sb);
 2183     write_extent_buffer(sb, super_copy, 0, sizeof(*super_copy));
 2184     array_size = btrfs_super_sys_array_size(super_copy);
 2185 
 2186     array_ptr = super_copy->sys_chunk_array;
 2187     sb_array_offset = offsetof(struct btrfs_super_block, sys_chunk_array);
 2188     cur_offset = 0;
 2189 
 2190     while (cur_offset < array_size) {
 2191         disk_key = (struct btrfs_disk_key *)array_ptr;
 2192         len = sizeof(*disk_key);
 2193         if (cur_offset + len > array_size)
 2194             goto out_short_read;
 2195 
 2196         btrfs_disk_key_to_cpu(&key, disk_key);
 2197 
 2198         array_ptr += len;
 2199         sb_array_offset += len;
 2200         cur_offset += len;
 2201 
 2202         if (key.type == BTRFS_CHUNK_ITEM_KEY) {
 2203             chunk = (struct btrfs_chunk *)sb_array_offset;
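                  /*
                   * Note that "chunk" is really the byte offset of the
                   * item inside the sb extent buffer, not a directly
                   * dereferenceable pointer; the btrfs_chunk_*()
                   * accessors below take (sb, chunk) and read through
                   * the extent buffer.
                   */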
 2204             /*
 2205              * At least one btrfs_chunk with one stripe must be
 2206              * present; the exact stripe count check comes afterwards.
 2207              */
 2208             len = btrfs_chunk_item_size(1);
 2209             if (cur_offset + len > array_size)
 2210                 goto out_short_read;
 2211 
 2212             num_stripes = btrfs_chunk_num_stripes(sb, chunk);
 2213             if (!num_stripes) {
 2214                 printk(
 2215         "ERROR: invalid number of stripes %u in sys_array at offset %u\n",
 2216                     num_stripes, cur_offset);
 2217                 ret = -EIO;
 2218                 break;
 2219             }
 2220 
 2221             len = btrfs_chunk_item_size(num_stripes);
 2222             if (cur_offset + len > array_size)
 2223                 goto out_short_read;
 2224 
 2225             ret = read_one_chunk(fs_info, &key, sb, chunk, -1);
 2226             if (ret)
 2227                 break;
 2228         } else {
 2229             printk(
 2230         "ERROR: unexpected item type %u in sys_array at offset %u\n",
 2231                 (u32)key.type, cur_offset);
 2232             ret = -EIO;
 2233             break;
 2234         }
 2235         array_ptr += len;
 2236         sb_array_offset += len;
 2237         cur_offset += len;
 2238     }
 2239     free_extent_buffer(sb);
 2240     return ret;
 2241 
 2242 out_short_read:
 2243     printk("ERROR: sys_array too short to read %u bytes at offset %u\n",
 2244             len, cur_offset);
 2245     free_extent_buffer(sb);
 2246     return -EIO;
 2247 }
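
      /*
       * For reference, the sys_chunk_array parsed above is a packed
       * sequence of entries, each a 17-byte btrfs_disk_key followed by
       * a chunk item of btrfs_chunk_item_size(num_stripes) bytes; e.g.
       * a single SYSTEM chunk with two stripes consumes
       * 17 + 80 + 32 = 129 bytes of array_size.
       */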
 2248 
 2249 int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
 2250 {
 2251     struct btrfs_path *path;
 2252     struct extent_buffer *leaf;
 2253     struct btrfs_key key;
 2254     struct btrfs_key found_key;
 2255     struct btrfs_root *root = fs_info->chunk_root;
 2256     int ret;
 2257     int slot;
 2258 
 2259     path = btrfs_alloc_path();
 2260     if (!path)
 2261         return -ENOMEM;
 2262 
 2263     /*
 2264      * Read all device items, and then all the chunk items. All
 2265      * device items are found before any chunk item (their object id
 2266      * is smaller than the lowest possible object id for a chunk
 2267      * item - BTRFS_FIRST_CHUNK_TREE_OBJECTID).
 2268      */
 2269     key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
 2270     key.offset = 0;
 2271     key.type = 0;
 2272     ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 2273     if (ret < 0)
 2274         goto error;
 2275     while (1) {
 2276         leaf = path->nodes[0];
 2277         slot = path->slots[0];
 2278         if (slot >= btrfs_header_nritems(leaf)) {
 2279             ret = btrfs_next_leaf(root, path);
 2280             if (ret == 0)
 2281                 continue;
 2282             if (ret < 0)
 2283                 goto error;
 2284             break;
 2285         }
 2286         btrfs_item_key_to_cpu(leaf, &found_key, slot);
 2287         if (found_key.type == BTRFS_DEV_ITEM_KEY) {
 2288             struct btrfs_dev_item *dev_item;
 2289             dev_item = btrfs_item_ptr(leaf, slot,
 2290                           struct btrfs_dev_item);
 2291             ret = read_one_dev(fs_info, leaf, dev_item);
 2292             if (ret < 0)
 2293                 goto error;
 2294         } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
 2295             struct btrfs_chunk *chunk;
 2296             chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
 2297             ret = read_one_chunk(fs_info, &found_key, leaf, chunk,
 2298                          slot);
 2299             if (ret < 0)
 2300                 goto error;
 2301         }
 2302         path->slots[0]++;
 2303     }
 2304 
 2305     ret = 0;
 2306 error:
 2307     btrfs_free_path(path);
 2308     return ret;
 2309 }
 2310 
 2311 struct list_head *btrfs_scanned_uuids(void)
 2312 {
 2313     return &fs_uuids;
 2314 }
 2315 
 2316 static int rmw_eb(struct btrfs_fs_info *info,
 2317           struct extent_buffer *eb, struct extent_buffer *orig_eb)
 2318 {
 2319     int ret;
 2320     unsigned long orig_off = 0;
 2321     unsigned long dest_off = 0;
 2322     unsigned long copy_len = eb->len;
 2323 
 2324     ret = read_whole_eb(info, eb, 0);
 2325     if (ret)
 2326         return ret;
 2327 
 2328     if (eb->start + eb->len <= orig_eb->start ||
 2329         eb->start >= orig_eb->start + orig_eb->len)
 2330         return 0;
 2331     /*
           * The stripe buffer (eb) can overlap the original buffer
           * (orig_eb) in any of the following ways:
           *
 2332      * | ----- orig_eb ------- |
 2333      *         | ----- stripe -------  |
 2334      *         | ----- orig_eb ------- |
 2335      *              | ----- orig_eb ------- |
 2336      */
 2337     if (eb->start > orig_eb->start)
 2338         orig_off = eb->start - orig_eb->start;
 2339     if (orig_eb->start > eb->start)
 2340         dest_off = orig_eb->start - eb->start;
 2341 
 2342     if (copy_len > orig_eb->len - orig_off)
 2343         copy_len = orig_eb->len - orig_off;
 2344     if (copy_len > eb->len - dest_off)
 2345         copy_len = eb->len - dest_off;
 2346 
 2347     memcpy(eb->data + dest_off, orig_eb->data + orig_off, copy_len);
 2348     return 0;
 2349 }
 2350 
 2351 static int split_eb_for_raid56(struct btrfs_fs_info *info,
 2352                    struct extent_buffer *orig_eb,
 2353                    struct extent_buffer **ebs,
 2354                    u64 stripe_len, u64 *raid_map,
 2355                    int num_stripes)
 2356 {
 2357     struct extent_buffer **tmp_ebs;
 2358     u64 start = orig_eb->start;
 2359     u64 this_eb_start;
 2360     int i;
 2361     int ret = 0;
 2362 
 2363     tmp_ebs = calloc(num_stripes, sizeof(*tmp_ebs));
 2364     if (!tmp_ebs)
 2365         return -ENOMEM;
 2366 
 2367     /* Allocate one buffer for each data stripe */
 2368     for (i = 0; i < num_stripes; i++) {
 2369         if (raid_map[i] >= BTRFS_RAID5_P_STRIPE)
 2370             break;
 2371 
 2372         tmp_ebs[i] = calloc(1, sizeof(**tmp_ebs) + stripe_len);
 2373         if (!tmp_ebs[i]) {
 2374             ret = -ENOMEM;
 2375             goto clean_up;
 2376         }
 2377     }
 2378 
 2379     for (i = 0; i < num_stripes; i++) {
 2380         struct extent_buffer *eb = tmp_ebs[i];
 2381 
 2382         if (raid_map[i] >= BTRFS_RAID5_P_STRIPE)
 2383             break;
 2384 
 2385         eb->start = raid_map[i];
 2386         eb->len = stripe_len;
 2387         eb->refs = 1;
 2388         eb->flags = 0;
 2389         eb->fd = -1;
 2390         eb->dev_bytenr = (u64)-1;
 2391 
 2392         this_eb_start = raid_map[i];
 2393 
 2394         if (start > this_eb_start ||
 2395             start + orig_eb->len < this_eb_start + stripe_len) {
 2396             ret = rmw_eb(info, eb, orig_eb);
 2397             if (ret)
 2398                 goto clean_up;
 2399         } else {
 2400             memcpy(eb->data, orig_eb->data + eb->start - start,
 2401                    stripe_len);
 2402         }
 2403         ebs[i] = eb;
 2404     }
 2405     free(tmp_ebs);
 2406     return ret;
 2407 clean_up:
 2408     for (i = 0; i < num_stripes; i++)
 2409         free(tmp_ebs[i]);
 2410     free(tmp_ebs);
 2411     return ret;
 2412 }
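
      /*
       * For illustration (values assumed): with stripe_len = 65536 and
       * two data stripes, an orig_eb starting exactly at raid_map[0]
       * and 131072 bytes long covers the whole full stripe, so both
       * data ebs are filled by the plain memcpy() path; a shorter or
       * misaligned orig_eb instead goes through rmw_eb(), which reads
       * the stripe from disk first and then merges the overlapping
       * bytes.
       */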
 2413 
 2414 int write_raid56_with_parity(struct btrfs_fs_info *info,
 2415                  struct extent_buffer *eb,
 2416                  struct btrfs_multi_bio *multi,
 2417                  u64 stripe_len, u64 *raid_map)
 2418 {
 2419     struct extent_buffer **ebs, *p_eb = NULL, *q_eb = NULL;
 2420     int i;
 2421     int ret;
 2422     int alloc_size = eb->len;
 2423     void **pointers;
 2424 
 2425     ebs = malloc(sizeof(*ebs) * multi->num_stripes);
 2426     pointers = malloc(sizeof(*pointers) * multi->num_stripes);
 2427     if (!ebs || !pointers) {
 2428         free(ebs);
 2429         free(pointers);
 2430         return -ENOMEM;
 2431     }
 2432 
 2433     if (stripe_len > alloc_size)
 2434         alloc_size = stripe_len;
 2435 
 2436     ret = split_eb_for_raid56(info, eb, ebs, stripe_len, raid_map,
 2437                   multi->num_stripes);
 2438     if (ret)
 2439         goto out;
 2440 
 2441     for (i = 0; i < multi->num_stripes; i++) {
 2442         struct extent_buffer *new_eb;
 2443         if (raid_map[i] < BTRFS_RAID5_P_STRIPE) {
 2444             ebs[i]->dev_bytenr = multi->stripes[i].physical;
 2445             ebs[i]->fd = multi->stripes[i].dev->fd;
 2446             multi->stripes[i].dev->total_ios++;
 2447             if (ebs[i]->start != raid_map[i]) {
 2448                 ret = -EINVAL;
 2449                 goto out_free_split;
 2450             }
 2451             continue;
 2452         }
 2453         new_eb = malloc(sizeof(*eb) + alloc_size);
 2454         if (!new_eb) {
 2455             ret = -ENOMEM;
 2456             goto out_free_split;
 2457         }
 2458         new_eb->dev_bytenr = multi->stripes[i].physical;
 2459         new_eb->fd = multi->stripes[i].dev->fd;
 2460         multi->stripes[i].dev->total_ios++;
 2461         new_eb->len = stripe_len;
 2462 
 2463         if (raid_map[i] == BTRFS_RAID5_P_STRIPE)
 2464             p_eb = new_eb;
 2465         else if (raid_map[i] == BTRFS_RAID6_Q_STRIPE)
 2466             q_eb = new_eb;
 2467     }
 2468     if (q_eb) {
 2469         ebs[multi->num_stripes - 2] = p_eb;
 2470         ebs[multi->num_stripes - 1] = q_eb;
 2471 
 2472         for (i = 0; i < multi->num_stripes; i++)
 2473             pointers[i] = ebs[i]->data;
 2474 
 2475         raid6_gen_syndrome(multi->num_stripes, stripe_len, pointers);
 2476     } else {
 2477         ebs[multi->num_stripes - 1] = p_eb;
 2478         for (i = 0; i < multi->num_stripes; i++)
 2479             pointers[i] = ebs[i]->data;
 2480         ret = raid5_gen_result(multi->num_stripes, stripe_len,
 2481                        multi->num_stripes - 1, pointers);
 2482         if (ret < 0)
 2483             goto out_free_split;
 2484     }
 2485 
 2486     for (i = 0; i < multi->num_stripes; i++) {
 2487         ret = write_extent_to_disk(ebs[i]);
 2488         if (ret < 0)
 2489             goto out_free_split;
 2490     }
 2491 
 2492 out_free_split:
 2493     for (i = 0; i < multi->num_stripes; i++) {
 2494         if (ebs[i] != eb)
 2495             free(ebs[i]);
 2496     }
 2497 out:
 2498     free(ebs);
 2499     free(pointers);
 2500 
 2501     return ret;
 2502 }
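
      /*
       * For reference, the buffer layout handed to the parity helpers
       * above: ebs[] ends up holding the data stripes (in sorted
       * raid_map order) followed by P, and by Q for RAID6, with
       * pointers[] aliasing their data buffers.  raid6_gen_syndrome(n,
       * len, ptrs) computes P and Q into the last two buffers from the
       * first n - 2, while raid5_gen_result() with dest = n - 1 XORs
       * the first n - 1 buffers into the last one.
       */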
 2503 
 2504 /*
 2505  * Get stripe length from chunk item and its stripe items
 2506  *
 2507  * Caller should only call this function after validating the chunk item
 2508  * by using btrfs_check_chunk_valid().
 2509  */
 2510 u64 btrfs_stripe_length(struct btrfs_fs_info *fs_info,
 2511             struct extent_buffer *leaf,
 2512             struct btrfs_chunk *chunk)
 2513 {
 2514     u64 stripe_len;
 2515     u64 chunk_len;
 2516     u32 num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
 2517     u64 profile = btrfs_chunk_type(leaf, chunk) &
 2518               BTRFS_BLOCK_GROUP_PROFILE_MASK;
 2519 
 2520     chunk_len = btrfs_chunk_length(leaf, chunk);
 2521 
 2522     switch (profile) {
 2523     case 0: /* Single profile */
 2524     case BTRFS_BLOCK_GROUP_RAID1:
 2525     case BTRFS_BLOCK_GROUP_RAID1C3:
 2526     case BTRFS_BLOCK_GROUP_RAID1C4:
 2527     case BTRFS_BLOCK_GROUP_DUP:
 2528         stripe_len = chunk_len;
 2529         break;
 2530     case BTRFS_BLOCK_GROUP_RAID0:
 2531         stripe_len = chunk_len / num_stripes;
 2532         break;
 2533     case BTRFS_BLOCK_GROUP_RAID5:
 2534         stripe_len = chunk_len / (num_stripes - 1);
 2535         break;
 2536     case BTRFS_BLOCK_GROUP_RAID6:
 2537         stripe_len = chunk_len / (num_stripes - 2);
 2538         break;
 2539     case BTRFS_BLOCK_GROUP_RAID10:
 2540         stripe_len = chunk_len / (num_stripes /
 2541                 btrfs_chunk_sub_stripes(leaf, chunk));
 2542         break;
 2543     default:
 2544         /* Invalid chunk profile found */
 2545         BUG_ON(1);
 2546     }
 2547     return stripe_len;
 2548 }
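
      /*
       * Worked examples of the arithmetic above (a chunk_len of 8GiB
       * with num_stripes = 4 and sub_stripes = 2 is assumed): RAID0
       * gives 8GiB / 4 = 2GiB per device, RAID5 gives 8GiB / 3, RAID6
       * gives 8GiB / 2 = 4GiB and RAID10 gives 8GiB / (4 / 2) = 4GiB,
       * while the single and mirrored profiles keep the full 8GiB.
       */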
 2549 
 2550 /*
 2551  * Return 0 if size of @device is already good
 2552  * Return >0 if size of @device is not aligned but fixed without problems
 2553  * Return <0 if something wrong happened when aligning the size of @device
 2554  */
 2555 int btrfs_fix_device_size(struct btrfs_fs_info *fs_info,
 2556               struct btrfs_device *device)
 2557 {
 2558     struct btrfs_trans_handle *trans;
 2559     struct btrfs_key key;
 2560     struct btrfs_path path;
 2561     struct btrfs_root *chunk_root = fs_info->chunk_root;
 2562     struct btrfs_dev_item *di;
 2563     u64 old_bytes = device->total_bytes;
 2564     int ret;
 2565 
 2566     if (IS_ALIGNED(old_bytes, fs_info->sectorsize))
 2567         return 0;
 2568 
 2569     /* Align the in-memory total_bytes first, and use it as correct size */
 2570     device->total_bytes = round_down(device->total_bytes,
 2571                      fs_info->sectorsize);
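          /*
           * round_down() clears the low bits, e.g. with a 4096 byte
           * sectorsize a total_bytes of 5368709127 becomes
           * 5368709127 & ~4095ULL = 5368709120.
           */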
 2572 
 2573     key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
 2574     key.type = BTRFS_DEV_ITEM_KEY;
 2575     key.offset = device->devid;
 2576 
 2577     trans = btrfs_start_transaction(chunk_root, 1);
 2578     if (IS_ERR(trans)) {
 2579         ret = PTR_ERR(trans);
 2580         errno = -ret;
 2581         error("error starting transaction: %d (%m)", ret);
 2582         return ret;
 2583     }
 2584 
 2585     btrfs_init_path(&path);
 2586     ret = btrfs_search_slot(trans, chunk_root, &key, &path, 0, 1);
 2587     if (ret > 0) {
 2588         error("failed to find DEV_ITEM for devid %llu", device->devid);
 2589         ret = -ENOENT;
 2590         goto err;
 2591     }
 2592     if (ret < 0) {
 2593         errno = -ret;
 2594         error("failed to search chunk root: %d (%m)", ret);
 2595         goto err;
 2596     }
 2597     di = btrfs_item_ptr(path.nodes[0], path.slots[0], struct btrfs_dev_item);
 2598     btrfs_set_device_total_bytes(path.nodes[0], di, device->total_bytes);
 2599     btrfs_mark_buffer_dirty(path.nodes[0]);
 2600     ret = btrfs_commit_transaction(trans, chunk_root);
 2601     if (ret < 0) {
 2602         errno = -ret;
 2603         error("failed to commit current transaction: %d (%m)", ret);
 2604         btrfs_release_path(&path);
 2605         return ret;
 2606     }
 2607     btrfs_release_path(&path);
 2608     printf("Fixed device size for devid %llu, old size: %llu, new size: %llu\n",
 2609         device->devid, old_bytes, device->total_bytes);
 2610     return 1;
 2611 
 2612 err:
 2613     /* We haven't modified anything, it's OK to commit current trans */
 2614     btrfs_commit_transaction(trans, chunk_root);
 2615     btrfs_release_path(&path);
 2616     return ret;
 2617 }
 2618 
 2619 /*
 2620  * Return 0 if super block total_bytes matches all devices' total_bytes
 2621  * Return >0 if super block total_bytes mismatch but fixed without problem
 2622  * Return <0 if we failed to fix super block total_bytes
 2623  */
 2624 int btrfs_fix_super_size(struct btrfs_fs_info *fs_info)
 2625 {
 2626     struct btrfs_trans_handle *trans;
 2627     struct btrfs_device *device;
 2628     struct list_head *dev_list = &fs_info->fs_devices->devices;
 2629     u64 total_bytes = 0;
 2630     u64 old_bytes = btrfs_super_total_bytes(fs_info->super_copy);
 2631     int ret;
 2632 
 2633     list_for_each_entry(device, dev_list, dev_list) {
 2634         /*
 2635          * Caller should ensure this function is called after aligning
 2636          * all devices' total_bytes.
 2637          */
 2638         if (!IS_ALIGNED(device->total_bytes, fs_info->sectorsize)) {
 2639             error("device %llu total_bytes %llu not aligned to %u",
 2640                 device->devid, device->total_bytes,
 2641                 fs_info->sectorsize);
 2642             return -EUCLEAN;
 2643         }
 2644         total_bytes += device->total_bytes;
 2645     }
 2646 
 2647     if (total_bytes == old_bytes)
 2648         return 0;
 2649 
 2650     btrfs_set_super_total_bytes(fs_info->super_copy, total_bytes);
 2651 
 2652     /* Commit transaction to update all super blocks */
 2653     trans = btrfs_start_transaction(fs_info->tree_root, 1);
 2654     if (IS_ERR(trans)) {
 2655         ret = PTR_ERR(trans);
 2656         errno = -ret;
 2657         error("error starting transaction: %d (%m)", ret);
 2658         return ret;
 2659     }
 2660     ret = btrfs_commit_transaction(trans, fs_info->tree_root);
 2661     if (ret < 0) {
 2662         errno = -ret;
 2663         error("failed to commit current transaction: %d (%m)", ret);
 2664         return ret;
 2665     }
 2666     printf("Fixed super total bytes, old size: %llu, new size: %llu\n",
 2667         old_bytes, total_bytes);
 2668     return 1;
 2669 }
 2670 
 2671 /*
 2672  * Return 0 if all devices and super block sizes are good
 2673  * Return >0 if any device/super size problem was found, but fixed
 2674  * Return <0 if something wrong happened during fixing
 2675  */
 2676 int btrfs_fix_device_and_super_size(struct btrfs_fs_info *fs_info)
 2677 {
 2678     struct btrfs_device *device;
 2679     struct list_head *dev_list = &fs_info->fs_devices->devices;
 2680     bool have_bad_value = false;
 2681     int ret;
 2682 
 2683     /* Seed device is not supported yet */
 2684     if (fs_info->fs_devices->seed) {
 2685         error("fixing device size with seed device is not supported yet");
 2686         return -EOPNOTSUPP;
 2687     }
 2688 
 2689     /* All devices must be set up before repairing */
 2690     if (list_empty(dev_list)) {
 2691         error("no device found");
 2692         return -ENODEV;
 2693     }
 2694     list_for_each_entry(device, dev_list, dev_list) {
 2695         if (device->fd == -1 || !device->writeable) {
 2696             error("devid %llu is missing or not writeable",
 2697                   device->devid);
 2698             error(
 2699     "fixing device size needs all device(s) to be present and writeable");
 2700             return -ENODEV;
 2701         }
 2702     }
 2703 
 2704     /* Repair total_bytes of each device */
 2705     list_for_each_entry(device, dev_list, dev_list) {
 2706         ret = btrfs_fix_device_size(fs_info, device);
 2707         if (ret < 0)
 2708             return ret;
 2709         if (ret > 0)
 2710             have_bad_value = true;
 2711     }
 2712 
 2713     /* Repair super total_bytes */
 2714     ret = btrfs_fix_super_size(fs_info);
 2715     if (ret > 0)
 2716         have_bad_value = true;
 2717     if (have_bad_value) {
 2718         printf(
 2719     "Fixed unaligned/mismatched total_bytes for super block and device items\n");
 2720         ret = 1;
 2721     } else {
 2722         printf("No device size related problem found\n");
 2723         ret = 0;
 2724     }
 2725     return ret;
 2726 }