drbd_req.c (drbd-9.1.8) | : | drbd_req.c (drbd-9.1.9) | ||
---|---|---|---|---|
skipping to change at line 607 | skipping to change at line 607 | |||
if (connection->todo.req_next != req) | if (connection->todo.req_next != req) | |||
return; | return; | |||
rcu_read_lock(); | rcu_read_lock(); | |||
list_for_each_entry_continue_rcu(req, &connection->resource->transfer_log, tl_requests) { | list_for_each_entry_continue_rcu(req, &connection->resource->transfer_log, tl_requests) { | |||
const unsigned s = req->net_rq_state[connection->peer_node_id]; | const unsigned s = req->net_rq_state[connection->peer_node_id]; | |||
connection->send.seen_dagtag_sector = req->dagtag_sector; | connection->send.seen_dagtag_sector = req->dagtag_sector; | |||
if (s & RQ_NET_QUEUED) { | if (s & RQ_NET_QUEUED) { | |||
found_req = req; | found_req = req; | |||
break; | break; | |||
} | } | |||
/* Found a request which is for this peer but not yet queued. | ||||
* Do not skip past it. */ | ||||
if (s & RQ_NET_PENDING && !(s & RQ_NET_SENT)) | ||||
break; | ||||
} | } | |||
rcu_read_unlock(); | rcu_read_unlock(); | |||
connection->todo.req_next = found_req; | connection->todo.req_next = found_req; | |||
} | } | |||
/** | ||||
* set_cache_ptr_if_null() - Set caching pointer to given request if not currently set. | ||||
* @cache_ptr: Pointer to set. | ||||
* @req: Request to potentially set the pointer to. | ||||
* | ||||
* The caching pointer system is designed to track the oldest request in the | ||||
* transfer log fulfilling some condition. In particular, a combination of | ||||
* flags towards a given peer. This condition must guarantee that the request | ||||
* will not be destroyed. | ||||
* | ||||
* This system is implemented by set_cache_ptr_if_null() and | ||||
* advance_cache_ptr(). A request must be in the transfer log and fulfil the | ||||
* condition before set_cache_ptr_if_null() is called. If | ||||
* set_cache_ptr_if_null() is called before this request is in the transfer log | ||||
* or before it fulfils the condition, the pointer may be advanced past this | ||||
* request, or unset, which also has the effect of skipping the request. | ||||
* | ||||
* Once the condition is no longer fulfilled for a request, advance_cache_ptr() | ||||
* must be called. If the caching pointer currently points to this request, | ||||
* this will advance it to the next request fulfilling the condition. | ||||
* | ||||
* set_cache_ptr_if_null() may be called concurrently with itself and with | ||||
* advance_cache_ptr(). However, advance_cache_ptr() must not be called | ||||
* concurrently for a given caching pointer. If it were, the call for the older | ||||
* request may advance the pointer to the newer request, although the newer | ||||
* request has concurrently been modified such that it no longer fulfils the | ||||
* condition. | ||||
*/ | ||||
static void set_cache_ptr_if_null(struct drbd_request **cache_ptr, struct drbd_request *req) | static void set_cache_ptr_if_null(struct drbd_request **cache_ptr, struct drbd_request *req) | |||
{ | { | |||
struct drbd_request *prev_req, *old_req = NULL; | struct drbd_request *prev_req, *old_req = NULL; | |||
rcu_read_lock(); | rcu_read_lock(); | |||
prev_req = cmpxchg(cache_ptr, old_req, req); | prev_req = cmpxchg(cache_ptr, old_req, req); | |||
while (prev_req != old_req) { | while (prev_req != old_req) { | |||
if (prev_req && req->dagtag_sector > prev_req->dagtag_sector) | if (prev_req && req->dagtag_sector > prev_req->dagtag_sector) | |||
break; | break; | |||
old_req = prev_req; | old_req = prev_req; | |||
prev_req = cmpxchg(cache_ptr, old_req, req); | prev_req = cmpxchg(cache_ptr, old_req, req); | |||
} | } | |||
rcu_read_unlock(); | rcu_read_unlock(); | |||
} | } | |||
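
The contract documented above (oldest matching request wins, and set-if-null may also replace a newer cached entry with an older one) is easiest to see in isolation. Below is a minimal user-space sketch of the same loop, assuming C11 atomics in place of the kernel's cmpxchg() and leaving out the RCU protection; `example_req`, `example_set_cache_ptr_if_null` and the `dagtag_sector` stand-in field are invented for illustration and are not DRBD code.

```c
#include <stdatomic.h>
#include <stdio.h>

struct example_req {
	unsigned long dagtag_sector;		/* stand-in for req->dagtag_sector */
};

/* Same idea as set_cache_ptr_if_null(): keep the slot pointing at the
 * oldest request offered so far; an empty slot always accepts. */
static void example_set_cache_ptr_if_null(struct example_req *_Atomic *cache_ptr,
					  struct example_req *req)
{
	struct example_req *expected = NULL;

	/* On failure, compare_exchange writes the slot's current value back
	 * into "expected", so each retry works against fresh data. */
	while (!atomic_compare_exchange_strong(cache_ptr, &expected, req)) {
		/* A strictly older request is already cached: leave it,
		 * it still fulfils the condition. */
		if (expected && req->dagtag_sector > expected->dagtag_sector)
			break;
		/* Slot is empty again or holds a newer request: retry,
		 * trying to replace the value we just observed. */
	}
}

int main(void)
{
	struct example_req older = { .dagtag_sector = 10 };
	struct example_req newer = { .dagtag_sector = 20 };
	struct example_req *_Atomic cache = NULL;

	example_set_cache_ptr_if_null(&cache, &newer);	/* empty slot: cached */
	example_set_cache_ptr_if_null(&cache, &older);	/* older: replaces */
	example_set_cache_ptr_if_null(&cache, &newer);	/* newer: left alone */
	printf("cached dagtag: %lu\n", atomic_load(&cache)->dagtag_sector);	/* 10 */
	return 0;
}
```

The "if_null" name undersells it slightly: as the new doc comment notes, a caller offering an older dagtag also wins the slot, which is what keeps the pointer at the oldest qualifying request.
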
/* See set_cache_ptr_if_null(). */ | ||||
static void advance_cache_ptr(struct drbd_connection *connection, | static void advance_cache_ptr(struct drbd_connection *connection, | |||
struct drbd_request __rcu **cache_ptr, struct drbd_request *req, | struct drbd_request __rcu **cache_ptr, struct drbd_request *req, | |||
unsigned int is_set, unsigned int is_clear) | unsigned int is_set, unsigned int is_clear) | |||
{ | { | |||
struct drbd_request *old_req; | struct drbd_request *old_req; | |||
struct drbd_request *found_req = NULL; | struct drbd_request *found_req = NULL; | |||
rcu_read_lock(); | rcu_read_lock(); | |||
old_req = rcu_dereference(*cache_ptr); | old_req = rcu_dereference(*cache_ptr); | |||
if (old_req != req) { | if (old_req != req) { | |||
skipping to change at line 702 | skipping to change at line 735 | |||
/* apply */ | /* apply */ | |||
spin_lock(&req->rq_lock); /* local IRQ already disabled */ | spin_lock(&req->rq_lock); /* local IRQ already disabled */ | |||
old_local = req->local_rq_state; | old_local = req->local_rq_state; | |||
req->local_rq_state &= ~clear_local; | req->local_rq_state &= ~clear_local; | |||
req->local_rq_state |= set_local; | req->local_rq_state |= set_local; | |||
if (idx != -1) { | if (idx != -1) { | |||
old_net = req->net_rq_state[idx]; | old_net = req->net_rq_state[idx]; | |||
req->net_rq_state[idx] &= ~clear; | WRITE_ONCE(req->net_rq_state[idx], | |||
req->net_rq_state[idx] |= set; | (req->net_rq_state[idx] & ~clear) | set); | |||
connection = peer_device->connection; | connection = peer_device->connection; | |||
} | } | |||
/* no change? */ | /* no change? */ | |||
unchanged = req->local_rq_state == old_local && | unchanged = req->local_rq_state == old_local && | |||
(idx == -1 || req->net_rq_state[idx] == old_net); | (idx == -1 || req->net_rq_state[idx] == old_net); | |||
spin_unlock(&req->rq_lock); | spin_unlock(&req->rq_lock); | |||
if (unchanged) | if (unchanged) | |||
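
The single WRITE_ONCE() in the hunk above (replacing the earlier clear-then-set pair) pairs with lockless readers elsewhere in this release: the req_next scan at the top of this diff appears to read net_rq_state while walking the transfer log under RCU, without taking each request's rq_lock. A rough user-space sketch of that pairing follows, with simplified single-type macros and made-up flag bit values (the RQ_NET_* numbers below are not DRBD's).

```c
#include <pthread.h>
#include <stdio.h>

/* Simplified, unsigned-int-only stand-ins; the kernel's WRITE_ONCE/READ_ONCE
 * are type-generic but serve the same purpose: one untorn, compiler-visible
 * access per use. */
#define WRITE_ONCE_UINT(x, val)	(*(volatile unsigned int *)&(x) = (val))
#define READ_ONCE_UINT(x)	(*(volatile const unsigned int *)&(x))

#define RQ_NET_PENDING	(1U << 0)	/* illustrative bit assignments */
#define RQ_NET_QUEUED	(1U << 1)

static unsigned int net_state;		/* stands in for req->net_rq_state[idx] */
static pthread_mutex_t rq_lock = PTHREAD_MUTEX_INITIALIZER;

/* Writer, modelled on mod_rq_state(): serialized by the lock, but the new
 * flag word is published in a single store so lockless readers never see
 * the intermediate "cleared but not yet set" value. */
static void mod_state(unsigned int set, unsigned int clear)
{
	pthread_mutex_lock(&rq_lock);
	WRITE_ONCE_UINT(net_state, (net_state & ~clear) | set);
	pthread_mutex_unlock(&rq_lock);
}

/* Reader, modelled on the lockless transfer-log scans: no lock taken. */
static int is_queued(void)
{
	return READ_ONCE_UINT(net_state) & RQ_NET_QUEUED;
}

int main(void)
{
	mod_state(RQ_NET_PENDING, 0);
	mod_state(RQ_NET_QUEUED, RQ_NET_PENDING);
	printf("queued: %d\n", !!is_queued());	/* 1 */
	return 0;
}
```
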
skipping to change at line 950 | skipping to change at line 982 | |||
mod_rq_state(req, m, peer_device, RQ_LOCAL_PENDING, RQ_LOCAL_COMPLETED); | mod_rq_state(req, m, peer_device, RQ_LOCAL_PENDING, RQ_LOCAL_COMPLETED); | |||
break; | break; | |||
case DISCARD_COMPLETED_NOTSUPP: | case DISCARD_COMPLETED_NOTSUPP: | |||
case DISCARD_COMPLETED_WITH_ERROR: | case DISCARD_COMPLETED_WITH_ERROR: | |||
/* I'd rather not detach from local disk just because it | /* I'd rather not detach from local disk just because it | |||
* failed a REQ_OP_DISCARD. */ | * failed a REQ_OP_DISCARD. */ | |||
mod_rq_state(req, m, peer_device, RQ_LOCAL_PENDING, RQ_LOCAL_COMPLETED); | mod_rq_state(req, m, peer_device, RQ_LOCAL_PENDING, RQ_LOCAL_COMPLETED); | |||
break; | break; | |||
case QUEUE_FOR_NET_READ: | case NEW_NET_READ: | |||
/* READ, and | /* READ, and | |||
* no local disk, | * no local disk, | |||
* or target area marked as invalid, | * or target area marked as invalid, | |||
* or just got an io-error. */ | * or just got an io-error. */ | |||
/* from __drbd_make_request | /* from __drbd_make_request | |||
* or from bio_endio during read io-error recovery */ | * or from bio_endio during read io-error recovery */ | |||
/* So we can verify the handle in the answer packet. | /* So we can verify the handle in the answer packet. | |||
* Corresponding drbd_remove_request_interval is in | * Corresponding drbd_remove_request_interval is in | |||
* drbd_req_complete() */ | * drbd_req_complete() */ | |||
D_ASSERT(device, drbd_interval_empty(&req->i)); | D_ASSERT(device, drbd_interval_empty(&req->i)); | |||
spin_lock_irqsave(&device->interval_lock, flags); | spin_lock_irqsave(&device->interval_lock, flags); | |||
drbd_insert_interval(&device->read_requests, &req->i); | drbd_insert_interval(&device->read_requests, &req->i); | |||
spin_unlock_irqrestore(&device->interval_lock, flags); | spin_unlock_irqrestore(&device->interval_lock, flags); | |||
D_ASSERT(device, !(req->net_rq_state[idx] & RQ_NET_MASK)); | D_ASSERT(device, !(req->net_rq_state[idx] & RQ_NET_MASK)); | |||
D_ASSERT(device, !(req->local_rq_state & RQ_LOCAL_MASK)); | D_ASSERT(device, !(req->local_rq_state & RQ_LOCAL_MASK)); | |||
mod_rq_state(req, m, peer_device, 0, RQ_NET_PENDING|RQ_NET_QUEUED); | mod_rq_state(req, m, peer_device, 0, RQ_NET_PENDING); | |||
break; | break; | |||
case QUEUE_FOR_NET_WRITE: | case NEW_NET_WRITE: | |||
/* assert something? */ | /* assert something? */ | |||
/* from __drbd_make_request only */ | /* from __drbd_make_request only */ | |||
/* NOTE | /* NOTE | |||
* In case the req ended up on the transfer log before being | * In case the req ended up on the transfer log before being | |||
* queued on the worker, it could lead to this request being | * queued on the worker, it could lead to this request being | |||
* missed during cleanup after connection loss. | * missed during cleanup after connection loss. | |||
* So we have to do both operations here, | * So we have to do both operations here, | |||
* within the same lock that protects the transfer log. | * within the same lock that protects the transfer log. | |||
* | * | |||
* _req_add_to_epoch(req); this has to be after the | * _req_add_to_epoch(req); this has to be after the | |||
* _maybe_start_new_epoch(req); which happened in | * _maybe_start_new_epoch(req); which happened in | |||
* __drbd_make_request, because we now may set the bit | * __drbd_make_request, because we now may set the bit | |||
* again ourselves to close the current epoch. | * again ourselves to close the current epoch. | |||
* | * | |||
* Add req to the (now) current epoch (barrier). */ | * Add req to the (now) current epoch (barrier). */ | |||
D_ASSERT(device, !(req->net_rq_state[idx] & RQ_NET_MASK)); | D_ASSERT(device, !(req->net_rq_state[idx] & RQ_NET_MASK)); | |||
/* queue work item to send data */ | /* queue work item to send data */ | |||
mod_rq_state(req, m, peer_device, 0, RQ_NET_PENDING|RQ_NET_QUEUED|RQ_EXP_BARR_ACK| | mod_rq_state(req, m, peer_device, 0, RQ_NET_PENDING|RQ_EXP_BARR_ACK| | |||
drbd_protocol_state_bits(peer_device->connection)); | drbd_protocol_state_bits(peer_device->connection)); | |||
/* Close the epoch, in case it outgrew the limit. | /* Close the epoch, in case it outgrew the limit. | |||
* Or if this is a "batch bio", and some of our peers is "old", | * Or if this is a "batch bio", and some of our peers is "old", | |||
* because a batch bio "storm" (like, large scale discarding | * because a batch bio "storm" (like, large scale discarding | |||
* during mkfs time) would be likely to starve out the peers | * during mkfs time) would be likely to starve out the peers | |||
* activity log, if it is smaller than ours (or we don't have | * activity log, if it is smaller than ours (or we don't have | |||
* any). And a fix for the resulting potential distributed | * any). And a fix for the resulting potential distributed | |||
* deadlock was only implemented with P_CONFIRM_STABLE with | * deadlock was only implemented with P_CONFIRM_STABLE with | |||
* protocol version 114. | * protocol version 114. | |||
skipping to change at line 1017 | skipping to change at line 1049 | |||
else { | else { | |||
rcu_read_lock(); | rcu_read_lock(); | |||
nc = rcu_dereference(peer_device->connection->transport.net_conf); | nc = rcu_dereference(peer_device->connection->transport.net_conf); | |||
p = nc->max_epoch_size; | p = nc->max_epoch_size; | |||
rcu_read_unlock(); | rcu_read_unlock(); | |||
} | } | |||
if (device->resource->current_tle_writes >= p) | if (device->resource->current_tle_writes >= p) | |||
start_new_tl_epoch(device->resource); | start_new_tl_epoch(device->resource); | |||
break; | break; | |||
case QUEUE_FOR_SEND_OOS: | case NEW_NET_OOS: | |||
mod_rq_state(req, m, peer_device, 0, RQ_NET_PENDING); | ||||
break; | ||||
case ADDED_TO_TRANSFER_LOG: | ||||
mod_rq_state(req, m, peer_device, 0, RQ_NET_QUEUED); | mod_rq_state(req, m, peer_device, 0, RQ_NET_QUEUED); | |||
break; | break; | |||
case SEND_CANCELED: | case SEND_CANCELED: | |||
case SEND_FAILED: | case SEND_FAILED: | |||
/* Just update flags so it is no longer marked as on the sender | /* Just update flags so it is no longer marked as on the sender | |||
* queue; real cleanup will be done from | * queue; real cleanup will be done from | |||
* tl_walk(,CONNECTION_LOST*). */ | * tl_walk(,CONNECTION_LOST*). */ | |||
mod_rq_state(req, m, peer_device, RQ_NET_QUEUED, 0); | mod_rq_state(req, m, peer_device, RQ_NET_QUEUED, 0); | |||
break; | break; | |||
skipping to change at line 1044 | skipping to change at line 1080 | |||
mod_rq_state(req, m, peer_device, RQ_NET_QUEUED|RQ_NET_PENDING, | mod_rq_state(req, m, peer_device, RQ_NET_QUEUED|RQ_NET_PENDING, | |||
RQ_NET_SENT|RQ_NET_OK); | RQ_NET_SENT|RQ_NET_OK); | |||
else | else | |||
mod_rq_state(req, m, peer_device, RQ_NET_QUEUED, RQ_NET_SENT); | mod_rq_state(req, m, peer_device, RQ_NET_QUEUED, RQ_NET_SENT); | |||
/* It is still not yet RQ_NET_DONE until the | /* It is still not yet RQ_NET_DONE until the | |||
* corresponding epoch barrier got acked as well, | * corresponding epoch barrier got acked as well, | |||
* so we know what to dirty on connection loss. */ | * so we know what to dirty on connection loss. */ | |||
break; | break; | |||
case OOS_HANDED_TO_NETWORK: | case OOS_HANDED_TO_NETWORK: | |||
/* Was not set PENDING, no longer QUEUED, so is now DONE | /* No longer PENDING or QUEUED, so is now DONE | |||
* as far as this connection is concerned. */ | * as far as this connection is concerned. */ | |||
mod_rq_state(req, m, peer_device, RQ_NET_QUEUED, RQ_NET_DONE); | mod_rq_state(req, m, peer_device, RQ_NET_PENDING|RQ_NET_QUEUED, RQ_NET_DONE); | |||
break; | break; | |||
case CONNECTION_LOST: | case CONNECTION_LOST: | |||
case CONNECTION_LOST_WHILE_SUSPENDED: | case CONNECTION_LOST_WHILE_SUSPENDED: | |||
/* Only apply to requests that were for this peer but not done. */ | /* Only apply to requests that were for this peer but not done. */ | |||
if (!(req->net_rq_state[idx] & RQ_NET_MASK) || req->net_rq_state[idx] & RQ_NET_DONE) | if (!(req->net_rq_state[idx] & RQ_NET_MASK) || req->net_rq_state[idx] & RQ_NET_DONE) | |||
break; | break; | |||
/* For protocol A, or when not suspended, we consider the | /* For protocol A, or when not suspended, we consider the | |||
* request to be lost towards this peer. | * request to be lost towards this peer. | |||
skipping to change at line 1392 | skipping to change at line 1428 | |||
(peer_disk_state == D_INCONSISTENT && | (peer_disk_state == D_INCONSISTENT && | |||
(repl_state == L_ESTABLISHED || | (repl_state == L_ESTABLISHED || | |||
(repl_state >= L_WF_BITMAP_T && repl_state < L_AHEAD))); | (repl_state >= L_WF_BITMAP_T && repl_state < L_AHEAD))); | |||
/* Before proto 96 that was >= CONNECTED instead of >= L_WF_BITMAP_T. | /* Before proto 96 that was >= CONNECTED instead of >= L_WF_BITMAP_T. | |||
That is equivalent since before 96 IO was frozen in the L_WF_BITMAP* | That is equivalent since before 96 IO was frozen in the L_WF_BITMAP* | |||
states. */ | states. */ | |||
} | } | |||
static bool drbd_should_send_out_of_sync(struct drbd_peer_device *peer_device) | static bool drbd_should_send_out_of_sync(struct drbd_peer_device *peer_device) | |||
{ | { | |||
return peer_device->repl_state[NOW] == L_AHEAD || | enum drbd_disk_state peer_disk_state = peer_device->disk_state[NOW]; | |||
peer_device->repl_state[NOW] == L_WF_BITMAP_S; | enum drbd_repl_state repl_state = peer_device->repl_state[NOW]; | |||
/* pdsk = D_INCONSISTENT as a consequence. Protocol 96 check not necessary | ||||
since we enter state L_AHEAD only if proto >= 96 */ | ||||
return repl_state == L_AHEAD || | ||||
repl_state == L_WF_BITMAP_S || | ||||
(peer_disk_state == D_OUTDATED && repl_state >= L_ESTABLISHED); | ||||
/* proto 96 check omitted, there was no L_AHEAD back then, | ||||
* peer disk was never Outdated while connection was established, | ||||
* and IO was frozen during bitmap exchange */ | ||||
} | } | |||
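
For the drbd_should_send_out_of_sync() rewrite just above, a tiny self-contained check makes the behavioural difference explicit. The enums below are invented stand-ins whose ordering merely mimics the comparisons in the real code (DRBD's drbd_repl_state and drbd_disk_state have many more values); only the asserted combinations are taken from the diff.

```c
#include <assert.h>
#include <stdbool.h>

/* Stand-in enums; ordering chosen only so the >= comparison reads the same. */
enum repl { L_OFF, L_ESTABLISHED, L_WF_BITMAP_S, L_AHEAD };
enum disk { D_DISKLESS, D_OUTDATED, D_UP_TO_DATE };

/* Mirror of the 9.1.9 predicate body shown above. */
static bool should_send_oos(enum disk peer_disk_state, enum repl repl_state)
{
	return repl_state == L_AHEAD ||
	       repl_state == L_WF_BITMAP_S ||
	       (peer_disk_state == D_OUTDATED && repl_state >= L_ESTABLISHED);
}

int main(void)
{
	assert(should_send_oos(D_UP_TO_DATE, L_AHEAD));		/* true in 9.1.8 too */
	assert(should_send_oos(D_UP_TO_DATE, L_WF_BITMAP_S));	/* true in 9.1.8 too */
	assert(should_send_oos(D_OUTDATED, L_ESTABLISHED));	/* newly true in 9.1.9 */
	assert(!should_send_oos(D_UP_TO_DATE, L_ESTABLISHED));	/* still plain replication */
	return 0;
}
```
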
/* Prefer to read from protocol C peers, then B, last A */ | /* Prefer to read from protocol C peers, then B, last A */ | |||
static u64 calc_nodes_to_read_from(struct drbd_device *device) | static u64 calc_nodes_to_read_from(struct drbd_device *device) | |||
{ | { | |||
struct drbd_peer_device *peer_device; | struct drbd_peer_device *peer_device; | |||
u64 candidates[DRBD_PROT_C] = {}; | u64 candidates[DRBD_PROT_C] = {}; | |||
int wp; | int wp; | |||
rcu_read_lock(); | rcu_read_lock(); | |||
skipping to change at line 1547 | skipping to change at line 1590 | |||
remote = drbd_should_do_remote(peer_device, NOW); | remote = drbd_should_do_remote(peer_device, NOW); | |||
send_oos = drbd_should_send_out_of_sync(peer_device); | send_oos = drbd_should_send_out_of_sync(peer_device); | |||
if (!remote && !send_oos) | if (!remote && !send_oos) | |||
continue; | continue; | |||
D_ASSERT(device, !(remote && send_oos)); | D_ASSERT(device, !(remote && send_oos)); | |||
if (remote) { | if (remote) { | |||
++count; | ++count; | |||
_req_mod(req, QUEUE_FOR_NET_WRITE, peer_device); | _req_mod(req, NEW_NET_WRITE, peer_device); | |||
} else | } else | |||
_req_mod(req, QUEUE_FOR_SEND_OOS, peer_device); | _req_mod(req, NEW_NET_OOS, peer_device); | |||
} | } | |||
return count; | return count; | |||
} | } | |||
static void drbd_queue_request(struct drbd_request *req) | ||||
{ | ||||
struct drbd_device *device = req->device; | ||||
struct drbd_peer_device *peer_device; | ||||
for_each_peer_device(peer_device, device) { | ||||
if (req->net_rq_state[peer_device->node_id] & RQ_NET_PENDING) | ||||
_req_mod(req, ADDED_TO_TRANSFER_LOG, peer_device); | ||||
} | ||||
} | ||||
static void drbd_process_discard_or_zeroes_req(struct drbd_request *req, int flags) | static void drbd_process_discard_or_zeroes_req(struct drbd_request *req, int flags) | |||
{ | { | |||
int err = drbd_issue_discard_or_zero_out(req->device, | int err = drbd_issue_discard_or_zero_out(req->device, | |||
req->i.sector, req->i.size >> 9, flags); | req->i.sector, req->i.size >> 9, flags); | |||
if (err) | if (err) | |||
req->private_bio->bi_status = BLK_STS_IOERR; | req->private_bio->bi_status = BLK_STS_IOERR; | |||
bio_endio(req->private_bio); | bio_endio(req->private_bio); | |||
} | } | |||
static void | static void | |||
skipping to change at line 1888 | skipping to change at line 1942 | |||
if (prev_write) { | if (prev_write) { | |||
kref_get(&req->kref); | kref_get(&req->kref); | |||
prev_write->destroy_next = req; | prev_write->destroy_next = req; | |||
} | } | |||
if (!drbd_process_write_request(req)) | if (!drbd_process_write_request(req)) | |||
no_remote = true; | no_remote = true; | |||
} else { | } else { | |||
if (peer_device) | if (peer_device) | |||
_req_mod(req, QUEUE_FOR_NET_READ, peer_device); | _req_mod(req, NEW_NET_READ, peer_device); | |||
else | else | |||
no_remote = true; | no_remote = true; | |||
} | } | |||
/* req may now be accessed by other threads - do not modify | /* req may now be accessed by other threads - do not modify | |||
* "immutable" fields after this point */ | * "immutable" fields after this point */ | |||
list_add_tail_rcu(&req->tl_requests, &resource->transfer_log); | list_add_tail_rcu(&req->tl_requests, &resource->transfer_log); | |||
/* Do this after adding to the transfer log so that the | ||||
* caching pointer req_not_net_done is set if | ||||
* necessary. */ | ||||
drbd_queue_request(req); | ||||
} | } | |||
spin_unlock(&resource->tl_update_lock); | spin_unlock(&resource->tl_update_lock); | |||
if (rw == WRITE) | if (rw == WRITE) | |||
wake_all_senders(resource); | wake_all_senders(resource); | |||
else if (peer_device) | else if (peer_device) | |||
wake_up(&peer_device->connection->sender_work.q_wait); | wake_up(&peer_device->connection->sender_work.q_wait); | |||
if (no_remote == false) { | if (no_remote == false) { | |||
struct drbd_plug_cb *plug = drbd_check_plugged(resource); | struct drbd_plug_cb *plug = drbd_check_plugged(resource); | |||
End of changes. 17 change blocks. | ||||
17 lines changed or deleted | 76 lines changed or added |