"Fossies" - the Fresh Open Source Software Archive 
Member "nova-22.0.1/nova/compute/manager.py" (19 Nov 2020, 519242 Bytes) of package /linux/misc/openstack/nova-22.0.1.tar.gz:
As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) Python source code syntax highlighting (style:
standard) with prefixed line numbers.
Alternatively you can here
view or
download the uninterpreted source code file.
For more information about "manager.py" see the
Fossies "Dox" file reference documentation and the latest
Fossies "Diffs" side-by-side code changes report:
22.0.0_vs_22.0.1.
# Copyright 2010 United States Government as represented by the
# Administrator of the National Aeronautics and Space Administration.
# Copyright 2011 Justin Santa Barbara
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

"""Handles all processes relating to instances (guest vms).

The :py:class:`ComputeManager` class is a :py:class:`nova.manager.Manager` that
handles RPC calls relating to creating instances. It is responsible for
building a disk image, launching it via the underlying virtualization driver,
responding to calls to check its state, attaching persistent storage, and
terminating it.

"""

import base64
import binascii
import contextlib
import copy
import functools
import inspect
import sys
import time
import traceback
import typing as ty

from cinderclient import exceptions as cinder_exception
from cursive import exception as cursive_exception
import eventlet.event
from eventlet import greenthread
import eventlet.semaphore
import eventlet.timeout
import futurist
from keystoneauth1 import exceptions as keystone_exception
import os_traits
from oslo_log import log as logging
import oslo_messaging as messaging
from oslo_serialization import jsonutils
from oslo_service import loopingcall
from oslo_service import periodic_task
from oslo_utils import excutils
from oslo_utils import strutils
from oslo_utils import timeutils
from oslo_utils import units
import six
from six.moves import range

from nova.accelerator import cyborg
from nova import block_device
from nova.compute import api as compute
from nova.compute import build_results
from nova.compute import claims
from nova.compute import power_state
from nova.compute import resource_tracker
from nova.compute import rpcapi as compute_rpcapi
from nova.compute import task_states
from nova.compute import utils as compute_utils
from nova.compute.utils import wrap_instance_event
from nova.compute import vm_states
from nova import conductor
import nova.conf
import nova.context
from nova import exception
from nova import exception_wrapper
from nova.i18n import _
from nova.image import glance
from nova import manager
from nova.network import model as network_model
from nova.network import neutron
from nova import objects
from nova.objects import base as obj_base
from nova.objects import external_event as external_event_obj
from nova.objects import fields
from nova.objects import instance as obj_instance
from nova.objects import migrate_data as migrate_data_obj
from nova.pci import request as pci_req_module
from nova.pci import whitelist
from nova import rpc
from nova import safe_utils
from nova.scheduler.client import query
from nova.scheduler.client import report
from nova.scheduler import utils as scheduler_utils
from nova import utils
from nova.virt import block_device as driver_block_device
from nova.virt import configdrive
from nova.virt import driver
from nova.virt import event as virtevent
from nova.virt import hardware
from nova.virt import storage_users
from nova.virt import virtapi
from nova.volume import cinder

CONF = nova.conf.CONF

LOG = logging.getLogger(__name__)

get_notifier = functools.partial(rpc.get_notifier, service='compute')
wrap_exception = functools.partial(exception_wrapper.wrap_exception,
                                   get_notifier=get_notifier,
                                   binary='nova-compute')


@contextlib.contextmanager
def errors_out_migration_ctxt(migration):
    """Context manager to error out migration on failure."""

    try:
        yield
    except Exception:
        with excutils.save_and_reraise_exception():
            if migration:
                # We may have been passed None for our migration if we're
                # receiving from an older client. The migration will be
                # errored via the legacy path.
                migration.status = 'error'
                try:
                    migration.save()
                except Exception:
                    LOG.debug(
                        'Error setting migration status for instance %s.',
                        migration.instance_uuid, exc_info=True)

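# A minimal usage sketch for errors_out_migration_ctxt() (the work
# function is hypothetical, shown for illustration only):
#
#     with errors_out_migration_ctxt(migration):
#         do_migration_work()  # any exception marks the migration as
#                              # 'error' and is then re-raised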

@utils.expects_func_args('migration')
def errors_out_migration(function):
    """Decorator to error out migration on failure."""

    @functools.wraps(function)
    def decorated_function(self, context, *args, **kwargs):
        wrapped_func = safe_utils.get_wrapped_function(function)
        keyed_args = inspect.getcallargs(wrapped_func, self, context,
                                         *args, **kwargs)
        migration = keyed_args['migration']
        with errors_out_migration_ctxt(migration):
            return function(self, context, *args, **kwargs)

    return decorated_function

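# Sketch of how the decorator above is applied to a manager method (the
# method name is illustrative only); the wrapped function must accept a
# 'migration' argument, which is located by name via getcallargs:
#
#     @errors_out_migration
#     def confirm_resize(self, context, instance, migration):
#         ...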

@utils.expects_func_args('instance')
def reverts_task_state(function):
    """Decorator to revert task_state on failure."""

    @functools.wraps(function)
    def decorated_function(self, context, *args, **kwargs):
        try:
            return function(self, context, *args, **kwargs)
        except exception.UnexpectedTaskStateError as e:
            # Note(maoy): unexpected task state means the current
            # task is preempted. Do not clear task state in this
            # case.
            with excutils.save_and_reraise_exception():
                LOG.info("Task possibly preempted: %s",
                         e.format_message())
        except Exception:
            with excutils.save_and_reraise_exception():
                wrapped_func = safe_utils.get_wrapped_function(function)
                keyed_args = inspect.getcallargs(wrapped_func, self, context,
                                                 *args, **kwargs)
                # NOTE(mriedem): 'instance' must be in keyed_args because we
                # have utils.expects_func_args('instance') decorating this
                # method.
                instance = keyed_args['instance']
                original_task_state = instance.task_state
                try:
                    self._instance_update(context, instance, task_state=None)
                    LOG.info("Successfully reverted task state from %s on "
                             "failure for instance.",
                             original_task_state, instance=instance)
                except exception.InstanceNotFound:
                    # We might delete an instance that failed to build
                    # shortly after it errored out; this is an expected case
                    # and we should not trace on it.
                    pass
                except Exception as e:
                    LOG.warning("Failed to revert task state for instance. "
                                "Error: %s", e, instance=instance)

    return decorated_function


@utils.expects_func_args('instance')
def wrap_instance_fault(function):
    """Wraps a method to catch exceptions related to instances.

    This decorator wraps a method to catch any exceptions having to do with
    an instance that may get thrown. It then logs an instance fault in the db.
    """

    @functools.wraps(function)
    def decorated_function(self, context, *args, **kwargs):
        try:
            return function(self, context, *args, **kwargs)
        except exception.InstanceNotFound:
            raise
        except Exception as e:
            # NOTE(gtt): If argument 'instance' is in args rather than kwargs,
            # we will get a KeyError exception which will cover up the real
            # exception. So, we update kwargs with the values from args first.
            # Then, we can get 'instance' from kwargs easily.
            kwargs.update(dict(zip(function.__code__.co_varnames[2:], args)))

            with excutils.save_and_reraise_exception():
                compute_utils.add_instance_fault_from_exc(context,
                        kwargs['instance'], e, sys.exc_info())

    return decorated_function


@utils.expects_func_args('image_id', 'instance')
def delete_image_on_error(function):
    """Used for snapshot related methods to ensure the image created in
    compute.api is deleted when an error occurs.
    """

    @functools.wraps(function)
    def decorated_function(self, context, image_id, instance,
                           *args, **kwargs):
        try:
            return function(self, context, image_id, instance,
                            *args, **kwargs)
        except Exception:
            with excutils.save_and_reraise_exception():
                compute_utils.delete_image(
                    context, instance, self.image_api, image_id,
                    log_exc_info=True)

    return decorated_function


# Each collection of events is a dict of eventlet Events keyed by a tuple of
# event name and associated tag
_InstanceEvents = ty.Dict[ty.Tuple[str, str], eventlet.event.Event]

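# For illustration, one populated _InstanceEvents mapping might look like
# this (the event name and tag values are made up):
#
#     {('network-vif-plugged', '<port uuid>'): eventlet.event.Event()}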

class InstanceEvents(object):
    def __init__(self):
        self._events: ty.Optional[ty.Dict[str, _InstanceEvents]] = {}

    @staticmethod
    def _lock_name(instance) -> str:
        return '%s-%s' % (instance.uuid, 'events')

    def prepare_for_instance_event(
        self,
        instance: 'objects.Instance',
        name: str,
        tag: str,
    ) -> eventlet.event.Event:
        """Prepare to receive an event for an instance.

        This will register an event for the given instance that we will
        wait on later. This should be called before initiating whatever
        action will trigger the event. The resulting eventlet.event.Event
        object should be wait()'d on to ensure completion.

        :param instance: the instance for which the event will be generated
        :param name: the name of the event we're expecting
        :param tag: the tag associated with the event we're expecting
        :returns: an event object that should be wait()'d on
        """
        @utils.synchronized(self._lock_name(instance))
        def _create_or_get_event():
            if self._events is None:
                # NOTE(danms): We really should have a more specific error
                # here, but this is what we use for our default error case
                raise exception.NovaException(
                    'In shutdown, no new events can be scheduled')

            instance_events = self._events.setdefault(instance.uuid, {})
            return instance_events.setdefault((name, tag),
                                              eventlet.event.Event())
        LOG.debug('Preparing to wait for external event %(name)s-%(tag)s',
                  {'name': name, 'tag': tag}, instance=instance)
        return _create_or_get_event()

    def pop_instance_event(self, instance, event):
        """Remove a pending event from the wait list.

        This will remove a pending event from the wait list so that it
        can be used to signal the waiters to wake up.

        :param instance: the instance for which the event was generated
        :param event: the nova.objects.external_event.InstanceExternalEvent
                      that describes the event
        :returns: the eventlet.event.Event object on which the waiters
                  are blocked
        """
        no_events_sentinel = object()
        no_matching_event_sentinel = object()

        @utils.synchronized(self._lock_name(instance))
        def _pop_event():
            if self._events is None:
                LOG.debug('Unexpected attempt to pop events during shutdown',
                          instance=instance)
                return no_events_sentinel
            events = self._events.get(instance.uuid)
            if not events:
                return no_events_sentinel
            _event = events.pop((event.name, event.tag), None)
            if not events:
                del self._events[instance.uuid]
            if _event is None:
                return no_matching_event_sentinel
            return _event

        result = _pop_event()
        if result is no_events_sentinel:
            LOG.debug('No waiting events found dispatching %(event)s',
                      {'event': event.key},
                      instance=instance)
            return None
        elif result is no_matching_event_sentinel:
            LOG.debug(
                'No event matching %(event)s in %(events)s',
                {
                    'event': event.key,
                    # mypy can't identify the none check in _pop_event
                    'events': self._events.get(  # type: ignore
                        instance.uuid, {}).keys(),
                },
                instance=instance,
            )
            return None
        else:
            return result

    def clear_events_for_instance(self, instance):
        """Remove all pending events for an instance.

        This will remove all events currently pending for an instance
        and return them (indexed by event name).

        :param instance: the instance for which events should be purged
        :returns: a dictionary of {event_name: eventlet.event.Event}
        """
        @utils.synchronized(self._lock_name(instance))
        def _clear_events():
            if self._events is None:
                LOG.debug('Unexpected attempt to clear events during shutdown',
                          instance=instance)
                return dict()
            # NOTE(danms): We have historically returned the raw internal
            # format here, which is {event.key: [events, ...]} so just
            # trivially convert it here.
            return {'%s-%s' % k: e
                    for k, e in self._events.pop(instance.uuid, {}).items()}
        return _clear_events()

    def cancel_all_events(self):
        if self._events is None:
            LOG.debug('Unexpected attempt to cancel events during shutdown.')
            return
        our_events = self._events
        # NOTE(danms): Block new events
        self._events = None

        for instance_uuid, events in our_events.items():
            for (name, tag), eventlet_event in events.items():
                LOG.debug('Canceling in-flight event %(name)s-%(tag)s for '
                          'instance %(instance_uuid)s',
                          {'name': name,
                           'tag': tag,
                           'instance_uuid': instance_uuid})
                event = objects.InstanceExternalEvent(
                    instance_uuid=instance_uuid,
                    name=name, status='failed',
                    tag=tag, data={})
                eventlet_event.send(event)

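# Typical flow through InstanceEvents, sketched with hypothetical callers
# (delivery normally happens via the external events API):
#
#     events = compute_manager.instance_events
#     ev = events.prepare_for_instance_event(
#         instance, 'network-vif-plugged', tag=port_id)
#     trigger_the_action()  # e.g. ask Neutron to plug the VIF
#     ev.wait()             # unblocked when pop_instance_event() finds the
#                           # matching event and send() is called on it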

class ComputeVirtAPI(virtapi.VirtAPI):
    def __init__(self, compute):
        super(ComputeVirtAPI, self).__init__()
        self._compute = compute
        self.reportclient = compute.reportclient

        class ExitEarly(Exception):
            def __init__(self, events):
                super(Exception, self).__init__()
                self.events = events

        self._exit_early_exc = ExitEarly

    def exit_wait_early(self, events):
        """Exit a wait_for_instance_event() immediately and avoid
        waiting for some events.

        :param events: A list of (name, tag) tuples for events that we should
                       skip waiting for during a wait_for_instance_event().
        """
        raise self._exit_early_exc(events=events)

    def _default_error_callback(self, event_name, instance):
        raise exception.NovaException(_('Instance event failed'))

    @contextlib.contextmanager
    def wait_for_instance_event(self, instance, event_names, deadline=300,
                                error_callback=None):
        """Plan to wait for some events, run some code, then wait.

        This context manager will first create plans to wait for the
        provided event_names, yield, and then wait for all the scheduled
        events to complete.

        Note that this uses an eventlet.timeout.Timeout to bound the
        operation, so callers should be prepared to catch that
        failure and handle that situation appropriately.

        If the event is not received by the specified timeout deadline,
        eventlet.timeout.Timeout is raised.

        If the event is received but did not have a 'completed'
        status, a NovaException is raised. If an error_callback is
        provided, instead of raising an exception as detailed above
        for the failure case, the callback will be called with the
        event_name and instance, and can return True to continue
        waiting for the rest of the events, False to stop processing,
        or raise an exception which will bubble up to the waiter.

        If the inner code wishes to abort waiting for one or more
        events because it knows some state to be finished or condition
        to be satisfied, it can use VirtAPI.exit_wait_early() with a
        list of event (name, tag) items to avoid waiting for those
        events upon context exit. Note that exit_wait_early() exits
        the context immediately and should be used to signal that all
        work has been completed and provide the unified list of events
        that need not be waited for. Waiting for the remaining events
        will begin immediately upon early exit as if the context was
        exited normally.

        :param instance: The instance for which an event is expected
        :param event_names: A list of event names. Each element is a
                            tuple of strings to indicate (name, tag),
                            where name is required, but tag may be None.
        :param deadline: Maximum number of seconds we should wait for all
                         of the specified events to arrive.
        :param error_callback: A function to be called if an event arrives
                               that does not have a 'completed' status.

        """

        if error_callback is None:
            error_callback = self._default_error_callback
        events = {}
        for event_name in event_names:
            name, tag = event_name
            event_name = objects.InstanceExternalEvent.make_key(name, tag)
            try:
                events[event_name] = (
                    self._compute.instance_events.prepare_for_instance_event(
                        instance, name, tag))
            except exception.NovaException:
                error_callback(event_name, instance)
                # NOTE(danms): Don't wait for any of the events. They
                # should all be canceled and fired immediately below,
                # but don't stick around if not.
                deadline = 0
        try:
            yield
        except self._exit_early_exc as e:
            early_events = set([objects.InstanceExternalEvent.make_key(n, t)
                                for n, t in e.events])
        else:
            early_events = set([])

        with eventlet.timeout.Timeout(deadline):
            for event_name, event in events.items():
                if event_name in early_events:
                    continue
                else:
                    actual_event = event.wait()
                    if actual_event.status == 'completed':
                        continue
                # If we get here, we have an event that was not completed,
                # nor skipped via exit_wait_early(). Decide whether to
                # keep waiting by calling the error_callback() hook.
                decision = error_callback(event_name, instance)
                if decision is False:
                    break

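    # A condensed usage sketch for wait_for_instance_event() (the tag and
    # triggering call are hypothetical):
    #
    #     events = [('network-vif-plugged', port_id)]
    #     with self.virtapi.wait_for_instance_event(instance, events,
    #                                               deadline=300):
    #         plug_the_vifs()  # the action expected to emit the events
    #
    # On context exit, execution blocks until every event arrives with
    # 'completed' status, error_callback decides on any failed event, or
    # the deadline passes and eventlet.timeout.Timeout is raised.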
    def update_compute_provider_status(self, context, rp_uuid, enabled):
        """Used to add/remove the COMPUTE_STATUS_DISABLED trait on the provider

        :param context: nova auth RequestContext
        :param rp_uuid: UUID of a compute node resource provider in Placement
        :param enabled: True if the node is enabled in which case the trait
            would be removed, False if the node is disabled in which case
            the trait would be added.
        :raises: ResourceProviderTraitRetrievalFailed
        :raises: ResourceProviderUpdateConflict
        :raises: ResourceProviderUpdateFailed
        :raises: TraitRetrievalFailed
        :raises: keystoneauth1.exceptions.ClientException
        """
        trait_name = os_traits.COMPUTE_STATUS_DISABLED
        # Get the current traits (and generation) for the provider.
        # TODO(mriedem): Leverage the ProviderTree cache in get_provider_traits
        trait_info = self.reportclient.get_provider_traits(context, rp_uuid)
        # If the host is enabled, remove the trait (if set), else add
        # the trait if it doesn't already exist.
        original_traits = trait_info.traits
        new_traits = None
        if enabled and trait_name in original_traits:
            new_traits = original_traits - {trait_name}
            LOG.debug('Removing trait %s from compute node resource '
                      'provider %s in placement.', trait_name, rp_uuid)
        elif not enabled and trait_name not in original_traits:
            new_traits = original_traits | {trait_name}
            LOG.debug('Adding trait %s to compute node resource '
                      'provider %s in placement.', trait_name, rp_uuid)

        if new_traits is not None:
            self.reportclient.set_traits_for_provider(
                context, rp_uuid, new_traits, generation=trait_info.generation)
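
    # Example invocation, e.g. when an operator disables the compute
    # service (the call site shown is illustrative):
    #
    #     virtapi.update_compute_provider_status(
    #         context, rp_uuid, enabled=False)  # adds COMPUTE_STATUS_DISABLED
    #     virtapi.update_compute_provider_status(
    #         context, rp_uuid, enabled=True)   # removes the trait again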


class ComputeManager(manager.Manager):
    """Manages the running instances from creation to destruction."""

    target = messaging.Target(version='5.12')

    def __init__(self, compute_driver=None, *args, **kwargs):
        """Load configuration options and connect to the hypervisor."""
        # We want the ComputeManager, ResourceTracker and ComputeVirtAPI all
        # using the same instance of SchedulerReportClient which has the
        # ProviderTree cache for this compute service.
        self.reportclient = report.SchedulerReportClient()
        self.virtapi = ComputeVirtAPI(self)
        self.network_api = neutron.API()
        self.volume_api = cinder.API()
        self.image_api = glance.API()
        self._last_bw_usage_poll = 0.0
        self._bw_usage_supported = True
        self.compute_api = compute.API()
        self.compute_rpcapi = compute_rpcapi.ComputeAPI()
        self.compute_task_api = conductor.ComputeTaskAPI()
        self.query_client = query.SchedulerQueryClient()
        self.instance_events = InstanceEvents()
        self._sync_power_pool = eventlet.GreenPool(
            size=CONF.sync_power_state_pool_size)
        self._syncs_in_progress = {}
        self.send_instance_updates = (
            CONF.filter_scheduler.track_instance_changes)
        if CONF.max_concurrent_builds != 0:
            self._build_semaphore = eventlet.semaphore.Semaphore(
                CONF.max_concurrent_builds)
        else:
            self._build_semaphore = compute_utils.UnlimitedSemaphore()
        if CONF.max_concurrent_snapshots > 0:
            self._snapshot_semaphore = eventlet.semaphore.Semaphore(
                CONF.max_concurrent_snapshots)
        else:
            self._snapshot_semaphore = compute_utils.UnlimitedSemaphore()
        if CONF.max_concurrent_live_migrations > 0:
            self._live_migration_executor = futurist.GreenThreadPoolExecutor(
                max_workers=CONF.max_concurrent_live_migrations)
        else:
            # CONF.max_concurrent_live_migrations is 0 (unlimited)
            self._live_migration_executor = futurist.GreenThreadPoolExecutor()
        # This is a dict, keyed by instance uuid, to a two-item tuple of
        # migration object and Future for the queued live migration.
        self._waiting_live_migrations = {}

        super(ComputeManager, self).__init__(service_name="compute",
                                             *args, **kwargs)

        # NOTE(russellb) Load the driver last. It may call back into the
        # compute manager via the virtapi, so we want it to be fully
        # initialized before that happens.
        self.driver = driver.load_compute_driver(self.virtapi, compute_driver)
        self.use_legacy_block_device_info = \
            self.driver.need_legacy_block_device_info
        self.rt = resource_tracker.ResourceTracker(
            self.host, self.driver, reportclient=self.reportclient)

    def reset(self):
        LOG.info('Reloading compute RPC API')
        compute_rpcapi.reset_globals()
        self.compute_rpcapi = compute_rpcapi.ComputeAPI()
        self.reportclient.clear_provider_cache()

    def _update_resource_tracker(self, context, instance):
        """Let the resource tracker know that an instance has changed state."""

        if instance.host == self.host:
            self.rt.update_usage(context, instance, instance.node)

    def _instance_update(self, context, instance, **kwargs):
        """Update an instance in the database using kwargs as value."""

        for k, v in kwargs.items():
            setattr(instance, k, v)
        instance.save()
        self._update_resource_tracker(context, instance)

    def _nil_out_instance_obj_host_and_node(self, instance):
        # NOTE(jwcroppe): We don't do instance.save() here for performance
        # reasons; a call to this is expected to be immediately followed by
        # another call that does instance.save(), thus avoiding two writes
        # to the database layer.
        instance.host = None
        instance.node = None
        # ResourceTracker._set_instance_host_and_node also sets launched_on
        # to the same value as host and is really only ever used by legacy
        # nova-network code, but we should also null it out to avoid confusion
        # if there is an instance in the database with no host set but
        # launched_on is set. Note that we do not care about using launched_on
        # as some kind of debug helper if diagnosing a build failure, that is
        # what instance action events are for.
        instance.launched_on = None
        # If the instance is not on a host, it's not in an aggregate and
        # therefore is not in an availability zone.
        instance.availability_zone = None

    def _set_instance_obj_error_state(self, instance, clean_task_state=False):
        try:
            instance.vm_state = vm_states.ERROR
            if clean_task_state:
                instance.task_state = None
            instance.save()
        except exception.InstanceNotFound:
            LOG.debug('Instance has been destroyed from under us while '
                      'trying to set it to ERROR', instance=instance)

    def _get_instances_on_driver(self, context, filters=None):
        """Return a list of instance records for the instances found
        on the hypervisor which satisfy the specified filters. If filters=None
        return a list of instance records for all the instances found on the
        hypervisor.
        """
        if not filters:
            filters = {}
        try:
            driver_uuids = self.driver.list_instance_uuids()
            if len(driver_uuids) == 0:
                # Short circuit, don't waste a DB call
                return objects.InstanceList()
            filters['uuid'] = driver_uuids
            local_instances = objects.InstanceList.get_by_filters(
                context, filters, use_slave=True)
            return local_instances
        except NotImplementedError:
            pass

        # The driver doesn't support uuids listing, so we'll have
        # to brute force.
        driver_instances = self.driver.list_instances()
        # NOTE(mjozefcz): In this case we need to apply the host filter.
        # Without it, all instance data would be fetched from the DB.
        filters['host'] = self.host
        instances = objects.InstanceList.get_by_filters(context, filters,
                                                        use_slave=True)
        name_map = {instance.name: instance for instance in instances}
        local_instances = []
        for driver_instance in driver_instances:
            instance = name_map.get(driver_instance)
            if not instance:
                continue
            local_instances.append(instance)
        return local_instances

    def _destroy_evacuated_instances(self, context, node_cache):
        """Destroys evacuated instances.

        While nova-compute was down, the instances running on it could be
        evacuated to another host. This method looks for evacuation migration
        records where this is the source host and which were either started
        (accepted), in-progress (pre-migrating) or migrated (done). From those
        migration records, local instances reported by the hypervisor are
        compared to the instances for the migration records and those local
        guests are destroyed, along with instance allocation records in
        Placement for this node.
        Then allocations are removed from Placement for every instance that is
        evacuated from this host regardless if the instance is reported by the
        hypervisor or not.

        :param context: The request context
        :param node_cache: A dict of ComputeNode objects keyed by the UUID of
            the compute node
        :return: A dict keyed by instance uuid mapped to Migration objects
            for instances that were migrated away from this host
        """
        filters = {
            'source_compute': self.host,
            # NOTE(mriedem): Migration records that have been accepted are
            # included in case the source node comes back up while instances
            # are being evacuated to another host. We don't want the same
            # instance being reported from multiple hosts.
            # NOTE(lyarwood): pre-migrating is also included here as the
            # source compute can come back online shortly after the RT
            # claims on the destination that in-turn moves the migration to
            # pre-migrating. If the evacuate fails on the destination host,
            # the user can rebuild the instance (in ERROR state) on the source
            # host.
            'status': ['accepted', 'pre-migrating', 'done'],
            'migration_type': fields.MigrationType.EVACUATION,
        }
        with utils.temporary_mutation(context, read_deleted='yes'):
            evacuations = objects.MigrationList.get_by_filters(context,
                                                               filters)
        if not evacuations:
            return {}
        evacuations = {mig.instance_uuid: mig for mig in evacuations}

        # TODO(mriedem): We could optimize by pre-loading the joined fields
        # we know we'll use, like info_cache and flavor.
        local_instances = self._get_instances_on_driver(context)
        evacuated_local_instances = {inst.uuid: inst
                                     for inst in local_instances
                                     if inst.uuid in evacuations}

        for instance in evacuated_local_instances.values():
            LOG.info('Destroying instance as it has been evacuated from '
                     'this host but still exists in the hypervisor',
                     instance=instance)
            try:
                network_info = self.network_api.get_instance_nw_info(
                    context, instance)
                bdi = self._get_instance_block_device_info(context,
                                                           instance)
                destroy_disks = not (self._is_instance_storage_shared(
                    context, instance))
            except exception.InstanceNotFound:
                network_info = network_model.NetworkInfo()
                bdi = {}
                LOG.info('Instance has been marked deleted already, '
                         'removing it from the hypervisor.',
                         instance=instance)
                # always destroy disks if the instance was deleted
                destroy_disks = True
            self.driver.destroy(context, instance,
                                network_info,
                                bdi, destroy_disks)

        hostname_to_cn_uuid = {
            cn.hypervisor_hostname: cn.uuid
            for cn in node_cache.values()}

        for instance_uuid, migration in evacuations.items():
            try:
                if instance_uuid in evacuated_local_instances:
                    # Avoid the db call if we already have the instance loaded
                    # above
                    instance = evacuated_local_instances[instance_uuid]
                else:
                    instance = objects.Instance.get_by_uuid(
                        context, instance_uuid)
            except exception.InstanceNotFound:
                # The instance was already deleted, so we expect that every
                # allocation of that instance has already been cleaned up
                continue

            LOG.info('Cleaning up allocations of the instance as it has been '
                     'evacuated from this host',
                     instance=instance)
            if migration.source_node not in hostname_to_cn_uuid:
                LOG.error("Failed to clean allocation of evacuated "
                          "instance as the source node %s is not found",
                          migration.source_node, instance=instance)
                continue
            cn_uuid = hostname_to_cn_uuid[migration.source_node]

            # If the instance was deleted in the interim, assume its
            # allocations were properly cleaned up (either by its hosting
            # compute service or the API).
            if (not instance.deleted and
                    not self.reportclient.
                        remove_provider_tree_from_instance_allocation(
                            context, instance.uuid, cn_uuid)):
                LOG.error("Failed to clean allocation of evacuated instance "
                          "on the source node %s",
                          cn_uuid, instance=instance)

            migration.status = 'completed'
            migration.save()
        return evacuations

    def _is_instance_storage_shared(self, context, instance, host=None):
        shared_storage = True
        data = None
        try:
            data = self.driver.check_instance_shared_storage_local(context,
                                                                   instance)
            if data:
                shared_storage = (self.compute_rpcapi.
                                  check_instance_shared_storage(context,
                                      instance, data, host=host))
        except NotImplementedError:
            LOG.debug('Hypervisor driver does not support '
                      'instance shared storage check, '
                      'assuming it\'s not on shared storage',
                      instance=instance)
            shared_storage = False
        except Exception:
            LOG.exception('Failed to check if instance storage is shared',
                          instance=instance)
        finally:
            if data:
                self.driver.check_instance_shared_storage_cleanup(context,
                                                                  data)
        return shared_storage

    def _complete_partial_deletion(self, context, instance):
        """Complete deletion for instances in DELETED status but not marked as
        deleted in the DB
        """
        instance.destroy()
        bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
            context, instance.uuid)
        self._complete_deletion(context,
                                instance)
        self._notify_about_instance_usage(context, instance, "delete.end")
        compute_utils.notify_about_instance_action(context, instance,
            self.host, action=fields.NotificationAction.DELETE,
            phase=fields.NotificationPhase.END, bdms=bdms)

    def _complete_deletion(self, context, instance):
        self._update_resource_tracker(context, instance)

        self.reportclient.delete_allocation_for_instance(context,
                                                         instance.uuid)

        self._clean_instance_console_tokens(context, instance)
        self._delete_scheduler_instance_info(context, instance.uuid)

    def _validate_pinning_configuration(self, instances):
        if not self.driver.capabilities.get('supports_pcpus', False):
            return

        for instance in instances:
            # ignore deleted instances
            if instance.deleted:
                continue

            # if this is an unpinned instance and the host only has
            # 'cpu_dedicated_set' configured, we need to tell the operator to
            # correct their configuration
            if not instance.numa_topology or (
                instance.numa_topology.cpu_policy in (
                    None, fields.CPUAllocationPolicy.SHARED
                )
            ):
                # we don't need to check 'vcpu_pin_set' since it can't coexist
                # alongside 'cpu_dedicated_set'
                if (CONF.compute.cpu_dedicated_set and
                        not CONF.compute.cpu_shared_set):
                    msg = _("This host has unpinned instances but has no CPUs "
                            "set aside for this purpose; configure '[compute] "
                            "cpu_shared_set' instead of, or in addition to, "
                            "'[compute] cpu_dedicated_set'.")
                    raise exception.InvalidConfiguration(msg)

                continue

            # ditto for pinned instances if only 'cpu_shared_set' is
            # configured
            if (CONF.compute.cpu_shared_set and
                    not CONF.compute.cpu_dedicated_set and
                    not CONF.vcpu_pin_set):
                msg = _("This host has pinned instances but has no CPUs "
                        "set aside for this purpose; configure '[compute] "
                        "cpu_dedicated_set' instead of, or in addition to, "
                        "'[compute] cpu_shared_set'.")
                raise exception.InvalidConfiguration(msg)

            # if this is a mixed instance with both pinned and unpinned CPUs,
            # the host must have both 'cpu_dedicated_set' and 'cpu_shared_set'
            # configured. check if 'cpu_shared_set' is set.
            if (instance.numa_topology.cpu_policy ==
                    fields.CPUAllocationPolicy.MIXED and
                    not CONF.compute.cpu_shared_set):
                msg = _("This host has a mixed instance requesting both "
                        "pinned and unpinned CPUs but has not set aside "
                        "unpinned CPUs for this purpose; configure "
                        "'[compute] cpu_shared_set'.")
                raise exception.InvalidConfiguration(msg)

            # for mixed instances, check if 'cpu_dedicated_set' is set.
            if (instance.numa_topology.cpu_policy ==
                    fields.CPUAllocationPolicy.MIXED and
                    not CONF.compute.cpu_dedicated_set):
                msg = _("This host has a mixed instance requesting both "
                        "pinned and unpinned CPUs but has not set aside "
                        "pinned CPUs for this purpose; configure "
                        "'[compute] cpu_dedicated_set'.")
                raise exception.InvalidConfiguration(msg)

            # also check to make sure the operator hasn't accidentally
            # dropped some cores that instances are currently using
            available_dedicated_cpus = (hardware.get_vcpu_pin_set() or
                                        hardware.get_cpu_dedicated_set())
            pinned_cpus = instance.numa_topology.cpu_pinning
            if available_dedicated_cpus and (
                    pinned_cpus - available_dedicated_cpus):
                # we can't raise an exception because of bug #1289064,
                # which meant we didn't recalculate CPU pinning information
                # when we live migrated a pinned instance
                LOG.warning(
                    "Instance is pinned to host CPUs %(cpus)s "
                    "but one or more of these CPUs are not included in "
                    "either '[compute] cpu_dedicated_set' or "
                    "'vcpu_pin_set'; you should update these "
                    "configuration options to include the missing CPUs "
                    "or rebuild or cold migrate this instance.",
                    {'cpus': list(pinned_cpus)},
                    instance=instance)

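    # An illustrative nova.conf layout that satisfies the checks above on
    # a host running both pinned and unpinned instances (the CPU ranges
    # are examples only):
    #
    #     [compute]
    #     cpu_dedicated_set = 2-7   # host CPUs reserved for pinned vCPUs
    #     cpu_shared_set = 0-1      # host CPUs shared by unpinned vCPUs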
    def _validate_vtpm_configuration(self, instances):
        if self.driver.capabilities.get('supports_vtpm', False):
            return

        for instance in instances:
            if instance.deleted:
                continue

            # NOTE(stephenfin): We don't have an attribute on the instance to
            # check for this, so we need to inspect the flavor/image metadata
            if hardware.get_vtpm_constraint(
                instance.flavor, instance.image_meta,
            ):
                msg = _(
                    'This host has instances with the vTPM feature enabled, '
                    'but the host is not correctly configured; enable '
                    'vTPM support.'
                )
                raise exception.InvalidConfiguration(msg)

    def _reset_live_migration(self, context, instance):
        migration = None
        try:
            migration = objects.Migration.get_by_instance_and_status(
                context, instance.uuid, 'running')
            if migration:
                self.live_migration_abort(context, instance, migration.id)
        except Exception:
            LOG.exception('Failed to abort live-migration',
                          instance=instance)
        finally:
            if migration:
                self._set_migration_status(migration, 'error')
            LOG.info('Instance found in migrating state during '
                     'startup. Resetting task_state',
                     instance=instance)
            instance.task_state = None
            instance.save(expected_task_state=[task_states.MIGRATING])

    def _init_instance(self, context, instance):
        """Initialize this instance during service init."""

        # NOTE(danms): If the instance appears to not be owned by this
        # host, it may have been evacuated away, but skipped by the
        # evacuation cleanup code due to configuration. Thus, if that
        # is a possibility, don't touch the instance in any way, but
        # log the concern. This will help avoid potential issues on
        # startup due to misconfiguration.
        if instance.host != self.host:
            LOG.warning('Instance %(uuid)s appears to not be owned '
                        'by this host, but by %(host)s. Startup '
                        'processing is being skipped.',
                        {'uuid': instance.uuid,
                         'host': instance.host})
            return

        # Instances that are shut down, or in an error state can not be
        # initialized and are not attempted to be recovered. The exception
        # to this are instances that are in RESIZE_MIGRATING or DELETING,
        # which are dealt with further down.
        if (instance.vm_state == vm_states.SOFT_DELETED or
            (instance.vm_state == vm_states.ERROR and
             instance.task_state not in
             (task_states.RESIZE_MIGRATING, task_states.DELETING))):
            LOG.debug("Instance is in %s state.",
                      instance.vm_state, instance=instance)
            return

        if instance.vm_state == vm_states.DELETED:
            try:
                self._complete_partial_deletion(context, instance)
            except Exception:
                # we don't want an exception to block the init_host
                LOG.exception('Failed to complete a deletion',
                              instance=instance)
            return

        if (instance.vm_state == vm_states.BUILDING or
            instance.task_state in [task_states.SCHEDULING,
                                    task_states.BLOCK_DEVICE_MAPPING,
                                    task_states.NETWORKING,
                                    task_states.SPAWNING]):
            # NOTE(dave-mcnally) compute stopped before instance was fully
            # spawned so set to ERROR state. This is safe to do as the state
            # may be set by the api but the host is not, so if we get here
            # the instance has already been scheduled to this particular host.
            LOG.debug("Instance failed to spawn correctly, "
                      "setting to ERROR state", instance=instance)
            self._set_instance_obj_error_state(instance, clean_task_state=True)
            return

        if (instance.vm_state in [vm_states.ACTIVE, vm_states.STOPPED] and
            instance.task_state in [task_states.REBUILDING,
                                    task_states.REBUILD_BLOCK_DEVICE_MAPPING,
                                    task_states.REBUILD_SPAWNING]):
            # NOTE(jichenjc) compute stopped before instance was fully
            # spawned so set to ERROR state. This is consistent with BUILD
            LOG.debug("Instance failed to rebuild correctly, "
                      "setting to ERROR state", instance=instance)
            self._set_instance_obj_error_state(instance, clean_task_state=True)
            return

        if (instance.vm_state != vm_states.ERROR and
            instance.task_state in [task_states.IMAGE_SNAPSHOT_PENDING,
                                    task_states.IMAGE_PENDING_UPLOAD,
                                    task_states.IMAGE_UPLOADING,
                                    task_states.IMAGE_SNAPSHOT]):
            LOG.debug("Instance in transitional state %s at start-up "
                      "clearing task state",
                      instance.task_state, instance=instance)
            try:
                self._post_interrupted_snapshot_cleanup(context, instance)
            except Exception:
                # we don't want an exception to block the init_host
                LOG.exception('Failed to cleanup snapshot.', instance=instance)
            instance.task_state = None
            instance.save()

        if (instance.vm_state != vm_states.ERROR and
            instance.task_state in [task_states.RESIZE_PREP]):
            LOG.debug("Instance in transitional state %s at start-up "
                      "clearing task state",
                      instance['task_state'], instance=instance)
            instance.task_state = None
            instance.save()

        if instance.task_state == task_states.DELETING:
            try:
                LOG.info('Service started deleting the instance during '
                         'the previous run, but did not finish. Restarting'
                         ' the deletion now.', instance=instance)
                instance.obj_load_attr('metadata')
                instance.obj_load_attr('system_metadata')
                bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
                    context, instance.uuid)
                self._delete_instance(context, instance, bdms)
            except Exception:
                # we don't want an exception to block the init_host
                LOG.exception('Failed to complete a deletion',
                              instance=instance)
                self._set_instance_obj_error_state(instance)
            return

        current_power_state = self._get_power_state(instance)
        try_reboot, reboot_type = self._retry_reboot(
            instance, current_power_state)

        if try_reboot:
            LOG.debug("Instance in transitional state (%(task_state)s) at "
                      "start-up and power state is (%(power_state)s), "
                      "triggering reboot",
                      {'task_state': instance.task_state,
                       'power_state': current_power_state},
                      instance=instance)

            # NOTE(mikal): if the instance was doing a soft reboot that got as
            # far as shutting down the instance but not as far as starting it
            # again, then we've just become a hard reboot. That means the
            # task state for the instance needs to change so that we're in one
            # of the expected task states for a hard reboot.
            if (instance.task_state in task_states.soft_reboot_states and
                    reboot_type == 'HARD'):
                instance.task_state = task_states.REBOOT_PENDING_HARD
                instance.save()

            self.reboot_instance(context, instance, block_device_info=None,
                                 reboot_type=reboot_type)
            return

        elif (current_power_state == power_state.RUNNING and
              instance.task_state in [task_states.REBOOT_STARTED,
                                      task_states.REBOOT_STARTED_HARD,
                                      task_states.PAUSING,
                                      task_states.UNPAUSING]):
            LOG.warning("Instance in transitional state "
                        "(%(task_state)s) at start-up and power state "
                        "is (%(power_state)s), clearing task state",
                        {'task_state': instance.task_state,
                         'power_state': current_power_state},
                        instance=instance)
            instance.task_state = None
            instance.vm_state = vm_states.ACTIVE
            instance.save()
        elif (current_power_state == power_state.PAUSED and
              instance.task_state == task_states.UNPAUSING):
            LOG.warning("Instance in transitional state "
                        "(%(task_state)s) at start-up and power state "
                        "is (%(power_state)s), clearing task state "
                        "and unpausing the instance",
                        {'task_state': instance.task_state,
                         'power_state': current_power_state},
                        instance=instance)
            try:
                self.unpause_instance(context, instance)
            except NotImplementedError:
                # Some virt drivers don't support pause and unpause
                pass
            except Exception:
                LOG.exception('Failed to unpause instance', instance=instance)
            return

        if instance.task_state == task_states.POWERING_OFF:
            try:
                LOG.debug("Instance in transitional state %s at start-up "
                          "retrying stop request",
                          instance.task_state, instance=instance)
                self.stop_instance(context, instance, True)
            except Exception:
                # we don't want an exception to block the init_host
                LOG.exception('Failed to stop instance', instance=instance)
            return

        if instance.task_state == task_states.POWERING_ON:
            try:
                LOG.debug("Instance in transitional state %s at start-up "
                          "retrying start request",
                          instance.task_state, instance=instance)
                self.start_instance(context, instance)
            except Exception:
                # we don't want an exception to block the init_host
                LOG.exception('Failed to start instance', instance=instance)
            return

        net_info = instance.get_network_info()
        try:
            self.driver.plug_vifs(instance, net_info)
        except NotImplementedError as e:
            LOG.debug(e, instance=instance)
        except exception.VirtualInterfacePlugException:
            # NOTE(mriedem): If we get here, it could be because the vif_type
            # in the cache is "binding_failed" or "unbound".
            # The periodic task _heal_instance_info_cache checks for this
            # condition. It should fix this by binding the ports again when
            # it gets to this instance.
            LOG.exception('Virtual interface plugging failed for instance. '
                          'The port binding:host_id may need to be manually '
                          'updated.', instance=instance)
            self._set_instance_obj_error_state(instance)
            return

        if instance.task_state == task_states.RESIZE_MIGRATING:
            # We crashed during resize/migration, so roll back for safety
            try:
                # NOTE(mriedem): check old_vm_state for STOPPED here, if it's
                # not in system_metadata we default to True for backwards
                # compatibility
                power_on = (instance.system_metadata.get('old_vm_state') !=
                            vm_states.STOPPED)

                block_dev_info = self._get_instance_block_device_info(context,
                                                                      instance)

                migration = objects.Migration.get_by_id_and_instance(
                    context, instance.migration_context.migration_id,
                    instance.uuid)
                self.driver.finish_revert_migration(context, instance,
                    net_info, migration, block_dev_info, power_on)

            except Exception:
                LOG.exception('Failed to revert crashed migration',
                              instance=instance)
            finally:
                LOG.info('Instance found in migrating state during '
                         'startup. Resetting task_state',
                         instance=instance)
                instance.task_state = None
                instance.save()
        if instance.task_state == task_states.MIGRATING:
            # Live migration did not complete, but instance is on this
            # host. Abort ongoing migration if still running and reset state.
            self._reset_live_migration(context, instance)

        db_state = instance.power_state
        drv_state = self._get_power_state(instance)
        expect_running = (db_state == power_state.RUNNING and
                          drv_state != db_state)

        LOG.debug('Current state is %(drv_state)s, state in DB is '
                  '%(db_state)s.',
                  {'drv_state': drv_state, 'db_state': db_state},
                  instance=instance)

        if expect_running and CONF.resume_guests_state_on_host_boot:
            self._resume_guests_state(context, instance, net_info)

    def _resume_guests_state(self, context, instance, net_info):
        LOG.info('Rebooting instance after nova-compute restart.',
                 instance=instance)
        block_device_info = \
            self._get_instance_block_device_info(context, instance)

        try:
            self.driver.resume_state_on_host_boot(
                context, instance, net_info, block_device_info)
        except NotImplementedError:
            LOG.warning('Hypervisor driver does not support '
                        'resume guests', instance=instance)
        except Exception:
            # NOTE(vish): The instance failed to resume, so we set the
            # instance to error and attempt to continue.
            LOG.warning('Failed to resume instance',
                        instance=instance)
            self._set_instance_obj_error_state(instance)

    def _retry_reboot(self, instance, current_power_state):
        current_task_state = instance.task_state
        retry_reboot = False
        reboot_type = compute_utils.get_reboot_type(current_task_state,
                                                    current_power_state)

        pending_soft = (
            current_task_state == task_states.REBOOT_PENDING and
            instance.vm_state in vm_states.ALLOW_SOFT_REBOOT)
        pending_hard = (
            current_task_state == task_states.REBOOT_PENDING_HARD and
            instance.vm_state in vm_states.ALLOW_HARD_REBOOT)
        started_not_running = (current_task_state in
                               [task_states.REBOOT_STARTED,
                                task_states.REBOOT_STARTED_HARD] and
                               current_power_state != power_state.RUNNING)

        if pending_soft or pending_hard or started_not_running:
            retry_reboot = True

        return retry_reboot, reboot_type

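    # Worked example of the decision above: an instance left in
    # REBOOT_STARTED with a power state other than RUNNING (say, SHUTDOWN
    # after a host crash) matches started_not_running, and
    # get_reboot_type() only returns 'SOFT' for a running soft reboot, so
    # the result is (True, 'HARD') and _init_instance() retries a hard
    # reboot.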
    def handle_lifecycle_event(self, event):
        LOG.info("VM %(state)s (Lifecycle Event)",
                 {'state': event.get_name()},
                 instance_uuid=event.get_instance_uuid())
        context = nova.context.get_admin_context(read_deleted='yes')
        vm_power_state = None
        event_transition = event.get_transition()
        if event_transition == virtevent.EVENT_LIFECYCLE_STOPPED:
            vm_power_state = power_state.SHUTDOWN
        elif event_transition == virtevent.EVENT_LIFECYCLE_STARTED:
            vm_power_state = power_state.RUNNING
        elif event_transition in (
                virtevent.EVENT_LIFECYCLE_PAUSED,
                virtevent.EVENT_LIFECYCLE_POSTCOPY_STARTED,
                virtevent.EVENT_LIFECYCLE_MIGRATION_COMPLETED):
            vm_power_state = power_state.PAUSED
        elif event_transition == virtevent.EVENT_LIFECYCLE_RESUMED:
            vm_power_state = power_state.RUNNING
        elif event_transition == virtevent.EVENT_LIFECYCLE_SUSPENDED:
            vm_power_state = power_state.SUSPENDED
        else:
            LOG.warning("Unexpected lifecycle event: %d", event_transition)

        migrate_finish_statuses = {
            # This happens on the source node and indicates live migration
            # entered post-copy mode.
            virtevent.EVENT_LIFECYCLE_POSTCOPY_STARTED: 'running (post-copy)',
            # Suspended for offline migration.
            virtevent.EVENT_LIFECYCLE_MIGRATION_COMPLETED: 'running'
        }

        expected_attrs = []
        if event_transition in migrate_finish_statuses:
            # Join on info_cache since that's needed in migrate_instance_start.
            expected_attrs.append('info_cache')
        instance = objects.Instance.get_by_uuid(context,
                                                event.get_instance_uuid(),
                                                expected_attrs=expected_attrs)

        # Note(lpetrut): The event may be delayed, thus not reflecting
        # the current instance power state. In that case, ignore the event.
        current_power_state = self._get_power_state(instance)
        if current_power_state == vm_power_state:
            LOG.debug('Synchronizing instance power state after lifecycle '
                      'event "%(event)s"; current vm_state: %(vm_state)s, '
                      'current task_state: %(task_state)s, current DB '
                      'power_state: %(db_power_state)s, VM power_state: '
                      '%(vm_power_state)s',
                      {'event': event.get_name(),
                       'vm_state': instance.vm_state,
                       'task_state': instance.task_state,
                       'db_power_state': instance.power_state,
                       'vm_power_state': vm_power_state},
                      instance_uuid=instance.uuid)
            self._sync_instance_power_state(context,
                                            instance,
                                            vm_power_state)

        # The following checks are for live migration. We want to activate
        # the port binding for the destination host before the live migration
        # is resumed on the destination host in order to reduce network
        # downtime. Otherwise the ports are bound to the destination host
        # in post_live_migration_at_destination.
        # TODO(danms): Explore options for using a different live migration
        # specific callback for this instead of piggy-backing on the
        # handle_lifecycle_event callback.
        if (instance.task_state == task_states.MIGRATING and
                event_transition in migrate_finish_statuses):
            status = migrate_finish_statuses[event_transition]
            try:
                migration = objects.Migration.get_by_instance_and_status(
                    context, instance.uuid, status)
                LOG.debug('Binding ports to destination host: %s',
                          migration.dest_compute, instance=instance)
                # For neutron, migrate_instance_start will activate the
                # destination host port bindings, if there are any created by
                # conductor before live migration started.
                self.network_api.migrate_instance_start(
                    context, instance, migration)
            except exception.MigrationNotFoundByStatus:
                LOG.warning("Unable to find migration record with status "
                            "'%s' for instance. Port binding will happen in "
                            "post live migration.", status, instance=instance)

    def handle_events(self, event):
        if isinstance(event, virtevent.LifecycleEvent):
            try:
                self.handle_lifecycle_event(event)
            except exception.InstanceNotFound:
                LOG.debug("Event %s arrived for non-existent instance. The "
                          "instance was probably deleted.", event)
        else:
            LOG.debug("Ignoring event %s", event)

    def init_virt_events(self):
        if CONF.workarounds.handle_virt_lifecycle_events:
            self.driver.register_event_listener(self.handle_events)
        else:
            # NOTE(mriedem): If the _sync_power_states periodic task is
            # disabled we should emit a warning in the logs.
            if CONF.sync_power_state_interval < 0:
                LOG.warning('Instance lifecycle events from the compute '
                            'driver have been disabled. Note that lifecycle '
                            'changes to an instance outside of the compute '
                            'service will not be synchronized '
                            'automatically since the _sync_power_states '
                            'periodic task is also disabled.')
            else:
                LOG.info('Instance lifecycle events from the compute '
                         'driver have been disabled. Note that lifecycle '
                         'changes to an instance outside of the compute '
                         'service will only be synchronized by the '
                         '_sync_power_states periodic task.')

    def _get_nodes(self, context):
        """Query the ComputeNode objects from the DB that are reported by the
        hypervisor.

        :param context: the request context
        :return: a dict of ComputeNode objects keyed by the UUID of the given
            node.
        """
        nodes_by_uuid = {}
        try:
            node_names = self.driver.get_available_nodes()
        except exception.VirtDriverNotReady:
            LOG.warning(
                "Virt driver is not ready. If this is the first time this "
                "service is starting on this host, then you can ignore this "
                "warning.")
            return {}

        for node_name in node_names:
            try:
                node = objects.ComputeNode.get_by_host_and_nodename(
                    context, self.host, node_name)
                nodes_by_uuid[node.uuid] = node
            except exception.ComputeHostNotFound:
                LOG.warning(
                    "Compute node %s not found in the database. If this is "
                    "the first time this service is starting on this host, "
                    "then you can ignore this warning.", node_name)
        return nodes_by_uuid

1390 def init_host(self):
1391 """Initialization for a standalone compute service."""
1392
1393 if CONF.pci.passthrough_whitelist:
1394 # Simply loading the PCI passthrough whitelist will do a bunch of
1395 # validation that would otherwise wait until the PciDevTracker is
1396 # constructed when updating available resources for the compute
1397 # node(s) in the resource tracker, effectively killing that task.
1398 # So load up the whitelist when starting the compute service to
1399 # flush any invalid configuration early so we can kill the service
1400 # if the configuration is wrong.
1401 whitelist.Whitelist(CONF.pci.passthrough_whitelist)
1402
1403 nova.conf.neutron.register_dynamic_opts(CONF)
1404 # Even if only libvirt uses them, make it available for all drivers
1405 nova.conf.devices.register_dynamic_opts(CONF)
1406
1407 # Override the number of concurrent disk operations allowed if the
1408 # user has specified a limit.
1409 if CONF.compute.max_concurrent_disk_ops != 0:
1410 compute_utils.disk_ops_semaphore = \
1411 eventlet.semaphore.BoundedSemaphore(
1412 CONF.compute.max_concurrent_disk_ops)
1413
1414 self.driver.init_host(host=self.host)
1415 context = nova.context.get_admin_context()
1416 instances = objects.InstanceList.get_by_host(
1417 context, self.host,
1418 expected_attrs=['info_cache', 'metadata', 'numa_topology'])
1419
1420 self.init_virt_events()
1421
1422 self._validate_pinning_configuration(instances)
1423 self._validate_vtpm_configuration(instances)
1424
1425 # NOTE(gibi): At this point the compute_nodes of the resource tracker
1426 # has not been populated yet so we cannot rely on the resource tracker
1427 # here.
1428         # NOTE(gibi): If the slow start time of the ironic or vcenter virt
1429         # drivers becomes problematic here then we should consider adding a
1430         # config option or a driver flag to tell us if we should run
1431         # _destroy_evacuated_instances and
1432         # _error_out_instances_whose_build_was_interrupted in a background
1433         # thread on startup
1434 nodes_by_uuid = self._get_nodes(context)
1435
1436 try:
1437             # Check for instances that were already evacuated to another host
1438 evacuated_instances = self._destroy_evacuated_instances(
1439 context, nodes_by_uuid)
1440
1441 # Initialise instances on the host that are not evacuating
1442 for instance in instances:
1443 if instance.uuid not in evacuated_instances:
1444 self._init_instance(context, instance)
1445
1446             # NOTE(gibi): collect all the instance uuids that were already
1447             # handled above in some way, either by _init_instance or by
1448             # _destroy_evacuated_instances. This way we can limit the scope of
1449             # the _error_out_instances_whose_build_was_interrupted call to look
1450             # only for instances that have allocations on this node and were
1451             # not handled by the above calls.
1452 already_handled = {instance.uuid for instance in instances}.union(
1453 evacuated_instances)
1454 self._error_out_instances_whose_build_was_interrupted(
1455 context, already_handled, nodes_by_uuid.keys())
1456
1457 finally:
1458 if instances:
1459 # We only send the instance info to the scheduler on startup
1460 # if there is anything to send, otherwise this host might
1461 # not be mapped yet in a cell and the scheduler may have
1462 # issues dealing with the information. Later changes to
1463 # instances on this host will update the scheduler, or the
1464 # _sync_scheduler_instance_info periodic task will.
1465 self._update_scheduler_instance_info(context, instances)
1466
1467 def _error_out_instances_whose_build_was_interrupted(
1468 self, context, already_handled_instances, node_uuids):
1469         """If there are instances in BUILDING state that are not
1470         assigned to this host but have allocations in placement towards
1471         this compute, then the nova-compute service was restarted while
1472         those instances waited for the resource claim to finish and for
1473         _set_instance_host_and_node() to update the instance.host field.
1474         We need to push them to ERROR state here to prevent them from
1475         being stuck in BUILDING state forever.
1476
1477 :param context: The request context
1478 :param already_handled_instances: The set of instance UUIDs that the
1479 host initialization process already handled in some way.
1480 :param node_uuids: The list of compute node uuids handled by this
1481 service
1482 """
1483
1484 # Strategy:
1485 # 1) Get the allocations from placement for our compute node(s)
1486 # 2) Remove the already handled instances from the consumer list;
1487 # they are either already initialized or need to be skipped.
1488 # 3) Check which remaining consumer is an instance in BUILDING state
1489 # and push it to ERROR state.
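        # Placement returns allocations keyed by consumer UUID (here the
        # consumer is an instance), e.g., hypothetically:
        #     {'<instance uuid>': {'resources': {'VCPU': 1, 'MEMORY_MB': 512}}}
        # so set(allocations) below yields the set of consumer UUIDs.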
1490
1491 LOG.info(
1492 "Looking for unclaimed instances stuck in BUILDING status for "
1493 "nodes managed by this host")
1494 for cn_uuid in node_uuids:
1495 try:
1496 f = self.reportclient.get_allocations_for_resource_provider
1497 allocations = f(context, cn_uuid).allocations
1498 except (exception.ResourceProviderAllocationRetrievalFailed,
1499 keystone_exception.ClientException) as e:
1500 LOG.error(
1501 "Could not retrieve compute node resource provider %s and "
1502 "therefore unable to error out any instances stuck in "
1503 "BUILDING state. Error: %s", cn_uuid, six.text_type(e))
1504 continue
1505
1506 not_handled_consumers = (set(allocations) -
1507 already_handled_instances)
1508
1509 if not not_handled_consumers:
1510 continue
1511
1512 filters = {
1513 'vm_state': vm_states.BUILDING,
1514 'uuid': not_handled_consumers
1515 }
1516
1517 instances = objects.InstanceList.get_by_filters(
1518 context, filters, expected_attrs=[])
1519
1520 for instance in instances:
1521 LOG.debug(
1522 "Instance spawn was interrupted before instance_claim, "
1523 "setting instance to ERROR state", instance=instance)
1524 self._set_instance_obj_error_state(
1525 instance, clean_task_state=True)
1526
1527 def cleanup_host(self):
1528 self.driver.register_event_listener(None)
1529 self.instance_events.cancel_all_events()
1530 self.driver.cleanup_host(host=self.host)
1531 self._cleanup_live_migrations_in_pool()
1532
1533 def _cleanup_live_migrations_in_pool(self):
1534 # Shutdown the pool so we don't get new requests.
1535 self._live_migration_executor.shutdown(wait=False)
1536 # For any queued migrations, cancel the migration and update
1537 # its status.
1538 for migration, future in self._waiting_live_migrations.values():
1539 # If we got here before the Future was submitted then we need
1540 # to move on since there isn't anything we can do.
1541 if future is None:
1542 continue
1543 if future.cancel():
1544 self._set_migration_status(migration, 'cancelled')
1545 LOG.info('Successfully cancelled queued live migration.',
1546 instance_uuid=migration.instance_uuid)
1547 else:
1548 LOG.warning('Unable to cancel live migration.',
1549 instance_uuid=migration.instance_uuid)
1550 self._waiting_live_migrations.clear()
1551
1552 def pre_start_hook(self):
1553 """After the service is initialized, but before we fully bring
1554 the service up by listening on RPC queues, make sure to update
1555 our available resources (and indirectly our available nodes).
1556 """
1557 self.update_available_resource(nova.context.get_admin_context(),
1558 startup=True)
1559
1560 def _get_power_state(self, instance):
1561 """Retrieve the power state for the given instance."""
1562 LOG.debug('Checking state', instance=instance)
1563 try:
1564 return self.driver.get_info(instance, use_cache=False).state
1565 except exception.InstanceNotFound:
1566 return power_state.NOSTATE
1567
1568 # TODO(stephenfin): Remove this once we bump the compute API to v6.0
1569 def get_console_topic(self, context):
1570 """Retrieves the console host for a project on this host.
1571
1572 Currently this is just set in the flags for each compute host.
1573
1574 """
1575 # TODO(mdragon): perhaps make this variable by console_type?
1576 return 'console.%s' % CONF.console_host
1577
1578 # TODO(stephenfin): Remove this once we bump the compute API to v6.0
1579 @wrap_exception()
1580 def get_console_pool_info(self, context, console_type):
1581 return self.driver.get_console_pool_info(console_type)
1582
1583 # TODO(stephenfin): Remove this as it's nova-network only
1584 @wrap_exception()
1585 def refresh_instance_security_rules(self, context, instance):
1586         """Refresh security rules for an instance.
1587
1588         This is nova-network only; it is now a no-op that is kept solely
1589         so the RPC API remains compatible until the method can be removed
1590         (see the TODO above).
1591         """
1592         pass
1595
1596 def _await_block_device_map_created(self, context, vol_id):
1597 # TODO(yamahata): creating volume simultaneously
1598 # reduces creation time?
1599 # TODO(yamahata): eliminate dumb polling
1600 start = time.time()
1601 retries = CONF.block_device_allocate_retries
1602 # (1) if the configured value is 0, one attempt should be made
1603         # (2) if the configured value is > 0, then the total number of
1604         #     attempts is (retries + 1)
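        # e.g. block_device_allocate_retries = 0 gives 1 attempt in total,
        # while block_device_allocate_retries = 3 gives 4 attempts in total.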
1605 attempts = 1
1606 if retries >= 1:
1607 attempts = retries + 1
1608 for attempt in range(1, attempts + 1):
1609 volume = self.volume_api.get(context, vol_id)
1610 volume_status = volume['status']
1611 if volume_status not in ['creating', 'downloading']:
1612 if volume_status == 'available':
1613 return attempt
1614 LOG.warning("Volume id: %(vol_id)s finished being "
1615 "created but its status is %(vol_status)s.",
1616 {'vol_id': vol_id,
1617 'vol_status': volume_status})
1618 break
1619 greenthread.sleep(CONF.block_device_allocate_retries_interval)
1620 raise exception.VolumeNotCreated(volume_id=vol_id,
1621 seconds=int(time.time() - start),
1622 attempts=attempt,
1623 volume_status=volume_status)
1624
1625 def _decode_files(self, injected_files):
1626 """Base64 decode the list of files to inject."""
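        # Illustrative input/output (path and payload are hypothetical):
        #     [('/etc/banner', 'aGVsbG8=')] -> [('/etc/banner', b'hello')]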
1627 if not injected_files:
1628 return []
1629
1630 def _decode(f):
1631 path, contents = f
1632 # Py3 raises binascii.Error instead of TypeError as in Py27
1633 try:
1634 decoded = base64.b64decode(contents)
1635 return path, decoded
1636 except (TypeError, binascii.Error):
1637 raise exception.Base64Exception(path=path)
1638
1639 return [_decode(f) for f in injected_files]
1640
1641 def _validate_instance_group_policy(self, context, instance,
1642 scheduler_hints):
1643 # NOTE(russellb) Instance group policy is enforced by the scheduler.
1644 # However, there is a race condition with the enforcement of
1645 # the policy. Since more than one instance may be scheduled at the
1646 # same time, it's possible that more than one instance with an
1647 # anti-affinity policy may end up here. It's also possible that
1648 # multiple instances with an affinity policy could end up on different
1649 # hosts. This is a validation step to make sure that starting the
1650 # instance here doesn't violate the policy.
1651 group_hint = scheduler_hints.get('group')
1652 if not group_hint:
1653 return
1654
1655 # The RequestSpec stores scheduler_hints as key=list pairs so we need
1656 # to check the type on the value and pull the single entry out. The
1657 # API request schema validates that the 'group' hint is a single value.
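        # e.g. scheduler_hints = {'group': ['<group uuid>']} -> '<group uuid>'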
1658 if isinstance(group_hint, list):
1659 group_hint = group_hint[0]
1660
1661 @utils.synchronized(group_hint)
1662 def _do_validation(context, instance, group_hint):
1663 group = objects.InstanceGroup.get_by_hint(context, group_hint)
1664 if group.policy and 'anti-affinity' == group.policy:
1665 instances_uuids = objects.InstanceList.get_uuids_by_host(
1666 context, self.host)
1667 ins_on_host = set(instances_uuids)
1668 members = set(group.members)
1669 # Determine the set of instance group members on this host
1670 # which are not the instance in question. This is used to
1671 # determine how many other members from the same anti-affinity
1672 # group can be on this host.
1673                 members_on_host = ins_on_host & (members - set([instance.uuid]))
1674 rules = group.rules
1675 if rules and 'max_server_per_host' in rules:
1676 max_server = rules['max_server_per_host']
1677 else:
1678 max_server = 1
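                # Illustration: with max_server_per_host=2, two *other* group
                # members already on this host mean this instance (a third
                # member) violates the rule and must be rescheduled.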
1679 if len(members_on_host) >= max_server:
1680 msg = _("Anti-affinity instance group policy "
1681 "was violated.")
1682 raise exception.RescheduledException(
1683 instance_uuid=instance.uuid,
1684 reason=msg)
1685 elif group.policy and 'affinity' == group.policy:
1686 group_hosts = group.get_hosts(exclude=[instance.uuid])
1687 if group_hosts and self.host not in group_hosts:
1688 msg = _("Affinity instance group policy was violated.")
1689 raise exception.RescheduledException(
1690 instance_uuid=instance.uuid,
1691 reason=msg)
1692
1693 if not CONF.workarounds.disable_group_policy_check_upcall:
1694 _do_validation(context, instance, group_hint)
1695
1696 def _log_original_error(self, exc_info, instance_uuid):
1697 LOG.error('Error: %s', exc_info[1], instance_uuid=instance_uuid,
1698 exc_info=exc_info)
1699
1700 @periodic_task.periodic_task
1701 def _check_instance_build_time(self, context):
1702 """Ensure that instances are not stuck in build."""
1703 timeout = CONF.instance_build_timeout
1704 if timeout == 0:
1705 return
1706
1707 filters = {'vm_state': vm_states.BUILDING,
1708 'host': self.host}
1709
1710 building_insts = objects.InstanceList.get_by_filters(context,
1711 filters, expected_attrs=[], use_slave=True)
1712
1713 for instance in building_insts:
1714 if timeutils.is_older_than(instance.created_at, timeout):
1715 self._set_instance_obj_error_state(instance)
1716 LOG.warning("Instance build timed out. Set to error "
1717 "state.", instance=instance)
1718
1719 def _check_instance_exists(self, instance):
1720 """Ensure an instance with the same name is not already present."""
1721 if self.driver.instance_exists(instance):
1722 raise exception.InstanceExists(name=instance.name)
1723
1724 def _allocate_network_async(self, context, instance, requested_networks,
1725 security_groups, resource_provider_mapping):
1726 """Method used to allocate networks in the background.
1727
1728 Broken out for testing.
1729 """
1730 # First check to see if we're specifically not supposed to allocate
1731 # networks because if so, we can exit early.
1732 if requested_networks and requested_networks.no_allocate:
1733 LOG.debug("Not allocating networking since 'none' was specified.",
1734 instance=instance)
1735 return network_model.NetworkInfo([])
1736
1737 LOG.debug("Allocating IP information in the background.",
1738 instance=instance)
1739 retries = CONF.network_allocate_retries
1740 attempts = retries + 1
1741 retry_time = 1
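        # Illustrative backoff: retry_time doubles after each failed attempt
        # (1s, 2s, 4s, 8s, ...) and is capped at 30s below.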
1742 bind_host_id = self.driver.network_binding_host_id(context, instance)
1743 for attempt in range(1, attempts + 1):
1744 try:
1745 nwinfo = self.network_api.allocate_for_instance(
1746 context, instance,
1747 requested_networks=requested_networks,
1748 security_groups=security_groups,
1749 bind_host_id=bind_host_id,
1750 resource_provider_mapping=resource_provider_mapping)
1751 LOG.debug('Instance network_info: |%s|', nwinfo,
1752 instance=instance)
1753 instance.system_metadata['network_allocated'] = 'True'
1754 # NOTE(JoshNang) do not save the instance here, as it can cause
1755 # races. The caller shares a reference to instance and waits
1756 # for this async greenthread to finish before calling
1757 # instance.save().
1758 return nwinfo
1759 except Exception as e:
1760 log_info = {'attempt': attempt,
1761 'attempts': attempts}
1762 if attempt == attempts:
1763 LOG.exception('Instance failed network setup '
1764 'after %(attempts)d attempt(s)',
1765 log_info)
1766 raise e
1767 LOG.warning('Instance failed network setup '
1768 '(attempt %(attempt)d of %(attempts)d)',
1769 log_info, instance=instance)
1770 time.sleep(retry_time)
1771 retry_time *= 2
1772 if retry_time > 30:
1773 retry_time = 30
1774 # Not reached.
1775
1776 def _build_networks_for_instance(self, context, instance,
1777 requested_networks, security_groups, resource_provider_mapping):
1778
1779 # If we're here from a reschedule the network may already be allocated.
1780 if strutils.bool_from_string(
1781 instance.system_metadata.get('network_allocated', 'False')):
1782             # NOTE(alex_xu): network_allocated being True means the network
1783             # resources were already allocated by a previous scheduling
1784             # attempt and the network setup was cleaned up on the previous
1785             # host. After rescheduling, the network needs to be set up on
1786             # the new host.
1786 self.network_api.setup_instance_network_on_host(
1787 context, instance, instance.host)
1788 return self.network_api.get_instance_nw_info(context, instance)
1789
1790 network_info = self._allocate_network(context, instance,
1791 requested_networks, security_groups,
1792 resource_provider_mapping)
1793
1794 return network_info
1795
1796 def _allocate_network(self, context, instance, requested_networks,
1797 security_groups, resource_provider_mapping):
1798 """Start network allocation asynchronously. Return an instance
1799 of NetworkInfoAsyncWrapper that can be used to retrieve the
1800 allocated networks when the operation has finished.
1801 """
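        # The returned NetworkInfoAsyncWrapper runs _allocate_network_async in
        # a greenthread; callers block on first access of the wrapped
        # network_info (or via wait(), as used elsewhere in this module) until
        # the allocation finishes.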
1802 # NOTE(comstud): Since we're allocating networks asynchronously,
1803 # this task state has little meaning, as we won't be in this
1804 # state for very long.
1805 instance.vm_state = vm_states.BUILDING
1806 instance.task_state = task_states.NETWORKING
1807 instance.save(expected_task_state=[None])
1808
1809 return network_model.NetworkInfoAsyncWrapper(
1810 self._allocate_network_async, context, instance,
1811 requested_networks, security_groups, resource_provider_mapping)
1812
1813 def _default_root_device_name(self, instance, image_meta, root_bdm):
1814 """Gets a default root device name from the driver.
1815
1816 :param nova.objects.Instance instance:
1817 The instance for which to get the root device name.
1818 :param nova.objects.ImageMeta image_meta:
1819 The metadata of the image of the instance.
1820 :param nova.objects.BlockDeviceMapping root_bdm:
1821 The description of the root device.
1822 :returns: str -- The default root device name.
1823 :raises: InternalError, TooManyDiskDevices
1824 """
1825 try:
1826 return self.driver.default_root_device_name(instance,
1827 image_meta,
1828 root_bdm)
1829 except NotImplementedError:
1830 return compute_utils.get_next_device_name(instance, [])
1831
1832 def _default_device_names_for_instance(self, instance,
1833 root_device_name,
1834 *block_device_lists):
1835 """Default the missing device names in the BDM from the driver.
1836
1837 :param nova.objects.Instance instance:
1838 The instance for which to get default device names.
1839 :param str root_device_name: The root device name.
1840 :param list block_device_lists: List of block device mappings.
1841 :returns: None
1842 :raises: InternalError, TooManyDiskDevices
1843 """
1844 try:
1845 self.driver.default_device_names_for_instance(instance,
1846 root_device_name,
1847 *block_device_lists)
1848 except NotImplementedError:
1849 compute_utils.default_device_names_for_instance(
1850 instance, root_device_name, *block_device_lists)
1851
1852 def _get_device_name_for_instance(self, instance, bdms, block_device_obj):
1853 """Get the next device name from the driver, based on the BDM.
1854
1855 :param nova.objects.Instance instance:
1856 The instance whose volume is requesting a device name.
1857 :param nova.objects.BlockDeviceMappingList bdms:
1858 The block device mappings for the instance.
1859 :param nova.objects.BlockDeviceMapping block_device_obj:
1860 A block device mapping containing info about the requested block
1861 device.
1862 :returns: The next device name.
1863 :raises: InternalError, TooManyDiskDevices
1864 """
1865 # NOTE(ndipanov): Copy obj to avoid changing the original
1866 block_device_obj = block_device_obj.obj_clone()
1867 try:
1868 return self.driver.get_device_name_for_instance(
1869 instance, bdms, block_device_obj)
1870 except NotImplementedError:
1871 return compute_utils.get_device_name_for_instance(
1872 instance, bdms, block_device_obj.get("device_name"))
1873
1874 def _default_block_device_names(self, instance, image_meta, block_devices):
1875 """Verify that all the devices have the device_name set. If not,
1876 provide a default name.
1877
1878         It also ensures that there is a root_device_name and that it is set
1879         to the first block device in the boot sequence (boot_index=0).
1880 """
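        # For example (hypothetical and driver dependent): a libvirt guest
        # with no device names set typically ends up with root_device_name
        # '/dev/vda' and subsequent devices '/dev/vdb', '/dev/vdc', ...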
1881 root_bdm = block_device.get_root_bdm(block_devices)
1882 if not root_bdm:
1883 return
1884
1885 # Get the root_device_name from the root BDM or the instance
1886 root_device_name = None
1887 update_root_bdm = False
1888
1889 if root_bdm.device_name:
1890 root_device_name = root_bdm.device_name
1891 instance.root_device_name = root_device_name
1892 elif instance.root_device_name:
1893 root_device_name = instance.root_device_name
1894 root_bdm.device_name = root_device_name
1895 update_root_bdm = True
1896 else:
1897 root_device_name = self._default_root_device_name(instance,
1898 image_meta,
1899 root_bdm)
1900
1901 instance.root_device_name = root_device_name
1902 root_bdm.device_name = root_device_name
1903 update_root_bdm = True
1904
1905 if update_root_bdm:
1906 root_bdm.save()
1907
1908 ephemerals = []
1909 swap = []
1910 block_device_mapping = []
1911
1912 for device in block_devices:
1913 if block_device.new_format_is_ephemeral(device):
1914 ephemerals.append(device)
1915
1916 if block_device.new_format_is_swap(device):
1917 swap.append(device)
1918
1919 if driver_block_device.is_block_device_mapping(device):
1920 block_device_mapping.append(device)
1921
1922 self._default_device_names_for_instance(instance,
1923 root_device_name,
1924 ephemerals,
1925 swap,
1926 block_device_mapping)
1927
1928 def _block_device_info_to_legacy(self, block_device_info):
1929 """Convert BDI to the old format for drivers that need it."""
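        # Sketch of the conversion below: the 'swap' entry is replaced with
        # its legacy dict form (via swap.legacy()), and 'ephemerals' and
        # 'block_device_mapping' are replaced with their legacy list forms
        # expected by older virt drivers.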
1930
1931 if self.use_legacy_block_device_info:
1932 ephemerals = driver_block_device.legacy_block_devices(
1933 driver.block_device_info_get_ephemerals(block_device_info))
1934 mapping = driver_block_device.legacy_block_devices(
1935 driver.block_device_info_get_mapping(block_device_info))
1936 swap = block_device_info['swap']
1937 if swap:
1938 swap = swap.legacy()
1939
1940 block_device_info.update({
1941 'ephemerals': ephemerals,
1942 'swap': swap,
1943 'block_device_mapping': mapping})
1944
1945 def _add_missing_dev_names(self, bdms, instance):
1946 for bdm in bdms:
1947 if bdm.device_name is not None:
1948 continue
1949
1950 device_name = self._get_device_name_for_instance(instance,
1951 bdms, bdm)
1952 values = {'device_name': device_name}
1953 bdm.update(values)
1954 bdm.save()
1955
1956 def _prep_block_device(self, context, instance, bdms):
1957 """Set up the block device for an instance with error logging."""
1958 try:
1959 self._add_missing_dev_names(bdms, instance)
1960 block_device_info = driver.get_block_device_info(instance, bdms)
1961 mapping = driver.block_device_info_get_mapping(block_device_info)
1962 driver_block_device.attach_block_devices(
1963 mapping, context, instance, self.volume_api, self.driver,
1964 wait_func=self._await_block_device_map_created)
1965
1966 self._block_device_info_to_legacy(block_device_info)
1967 return block_device_info
1968
1969 except exception.OverQuota as e:
1970 LOG.warning('Failed to create block device for instance due'
1971 ' to exceeding volume related resource quota.'
1972 ' Error: %s', e.message, instance=instance)
1973 raise
1974
1975 except Exception as ex:
1976 LOG.exception('Instance failed block device setup',
1977 instance=instance)
1978 # InvalidBDM will eventually result in a BuildAbortException when
1979 # booting from volume, and will be recorded as an instance fault.
1980 # Maintain the original exception message which most likely has
1981 # useful details which the standard InvalidBDM error message lacks.
1982 raise exception.InvalidBDM(six.text_type(ex))
1983
1984 def _update_instance_after_spawn(self, instance,
1985 vm_state=vm_states.ACTIVE):
1986 instance.power_state = self._get_power_state(instance)
1987 instance.vm_state = vm_state
1988 instance.task_state = None
1989 # NOTE(sean-k-mooney): configdrive.update_instance checks
1990 # instance.launched_at to determine if it is the first or
1991 # subsequent spawn of an instance. We need to call update_instance
1992 # first before setting instance.launched_at or instance.config_drive
1993 # will never be set to true based on the value of force_config_drive.
1994 # As a result the config drive will be lost on a hard reboot of the
1995 # instance even when force_config_drive=true. see bug #1835822.
1996 configdrive.update_instance(instance)
1997 instance.launched_at = timeutils.utcnow()
1998
1999 def _update_scheduler_instance_info(self, context, instance):
2000 """Sends an InstanceList with created or updated Instance objects to
2001 the Scheduler client.
2002
2003 In the case of init_host, the value passed will already be an
2004 InstanceList. Other calls will send individual Instance objects that
2005 have been created or resized. In this case, we create an InstanceList
2006 object containing that Instance.
2007 """
2008 if not self.send_instance_updates:
2009 return
2010 if isinstance(instance, obj_instance.Instance):
2011 instance = objects.InstanceList(objects=[instance])
2012 context = context.elevated()
2013 self.query_client.update_instance_info(context, self.host,
2014 instance)
2015
2016 def _delete_scheduler_instance_info(self, context, instance_uuid):
2017 """Sends the uuid of the deleted Instance to the Scheduler client."""
2018 if not self.send_instance_updates:
2019 return
2020 context = context.elevated()
2021 self.query_client.delete_instance_info(context, self.host,
2022 instance_uuid)
2023
2024 @periodic_task.periodic_task(spacing=CONF.scheduler_instance_sync_interval)
2025 def _sync_scheduler_instance_info(self, context):
2026 if not self.send_instance_updates:
2027 return
2028 context = context.elevated()
2029 instances = objects.InstanceList.get_by_host(context, self.host,
2030 expected_attrs=[],
2031 use_slave=True)
2032 uuids = [instance.uuid for instance in instances]
2033 self.query_client.sync_instance_info(context, self.host, uuids)
2034
2035 def _notify_about_instance_usage(self, context, instance, event_suffix,
2036 network_info=None, extra_usage_info=None,
2037 fault=None):
2038 compute_utils.notify_about_instance_usage(
2039 self.notifier, context, instance, event_suffix,
2040 network_info=network_info,
2041 extra_usage_info=extra_usage_info, fault=fault)
2042
2043 def _deallocate_network(self, context, instance,
2044 requested_networks=None):
2045 # If we were told not to allocate networks let's save ourselves
2046 # the trouble of calling the network API.
2047 if requested_networks and requested_networks.no_allocate:
2048 LOG.debug("Skipping network deallocation for instance since "
2049 "networking was not requested.", instance=instance)
2050 return
2051
2052 LOG.debug('Deallocating network for instance', instance=instance)
2053 with timeutils.StopWatch() as timer:
2054 self.network_api.deallocate_for_instance(
2055 context, instance, requested_networks=requested_networks)
2056 # nova-network does an rpc call so we're OK tracking time spent here
2057 LOG.info('Took %0.2f seconds to deallocate network for instance.',
2058 timer.elapsed(), instance=instance)
2059
2060 def _get_instance_block_device_info(self, context, instance,
2061 refresh_conn_info=False,
2062 bdms=None):
2063 """Transform block devices to the driver block_device format."""
2064
2065 if bdms is None:
2066 bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
2067 context, instance.uuid)
2068 block_device_info = driver.get_block_device_info(instance, bdms)
2069
2070 if not refresh_conn_info:
2071 # if the block_device_mapping has no value in connection_info
2072 # (returned as None), don't include in the mapping
2073 block_device_info['block_device_mapping'] = [
2074 bdm for bdm in driver.block_device_info_get_mapping(
2075 block_device_info)
2076 if bdm.get('connection_info')]
2077 else:
2078 driver_block_device.refresh_conn_infos(
2079 driver.block_device_info_get_mapping(block_device_info),
2080 context, instance, self.volume_api, self.driver)
2081
2082 self._block_device_info_to_legacy(block_device_info)
2083
2084 return block_device_info
2085
2086 def _build_failed(self, node):
2087 if CONF.compute.consecutive_build_service_disable_threshold:
2088 # NOTE(danms): Update our counter, but wait for the next
2089 # update_available_resource() periodic to flush it to the DB
2090 self.rt.build_failed(node)
2091
2092 def _build_succeeded(self, node):
2093 self.rt.build_succeeded(node)
2094
2095 @wrap_exception()
2096 @reverts_task_state
2097 @wrap_instance_fault
2098 def build_and_run_instance(self, context, instance, image, request_spec,
2099 filter_properties, admin_password=None,
2100 injected_files=None, requested_networks=None,
2101 security_groups=None, block_device_mapping=None,
2102 node=None, limits=None, host_list=None, accel_uuids=None):
2103
2104 @utils.synchronized(instance.uuid)
2105 def _locked_do_build_and_run_instance(*args, **kwargs):
2106 # NOTE(danms): We grab the semaphore with the instance uuid
2107 # locked because we could wait in line to build this instance
2108 # for a while and we want to make sure that nothing else tries
2109 # to do anything with this instance while we wait.
2110 with self._build_semaphore:
2111 try:
2112 result = self._do_build_and_run_instance(*args, **kwargs)
2113 except Exception:
2114 # NOTE(mriedem): This should really only happen if
2115 # _decode_files in _do_build_and_run_instance fails, and
2116 # that's before a guest is spawned so it's OK to remove
2117 # allocations for the instance for this node from Placement
2118 # below as there is no guest consuming resources anyway.
2119 # The _decode_files case could be handled more specifically
2120 # but that's left for another day.
2121 result = build_results.FAILED
2122 raise
2123 finally:
2124 if result == build_results.FAILED:
2125 # Remove the allocation records from Placement for the
2126 # instance if the build failed. The instance.host is
2127 # likely set to None in _do_build_and_run_instance
2128 # which means if the user deletes the instance, it
2129 # will be deleted in the API, not the compute service.
2130 # Setting the instance.host to None in
2131 # _do_build_and_run_instance means that the
2132 # ResourceTracker will no longer consider this instance
2133 # to be claiming resources against it, so we want to
2134 # reflect that same thing in Placement. No need to
2135 # call this for a reschedule, as the allocations will
2136 # have already been removed in
2137 # self._do_build_and_run_instance().
2138 self.reportclient.delete_allocation_for_instance(
2139 context, instance.uuid)
2140
2141 if result in (build_results.FAILED,
2142 build_results.RESCHEDULED):
2143 self._build_failed(node)
2144 else:
2145 self._build_succeeded(node)
2146
2147 # NOTE(danms): We spawn here to return the RPC worker thread back to
2148 # the pool. Since what follows could take a really long time, we don't
2149 # want to tie up RPC workers.
2150 utils.spawn_n(_locked_do_build_and_run_instance,
2151 context, instance, image, request_spec,
2152 filter_properties, admin_password, injected_files,
2153 requested_networks, security_groups,
2154 block_device_mapping, node, limits, host_list,
2155 accel_uuids)
2156
2157 def _check_device_tagging(self, requested_networks, block_device_mapping):
2158 tagging_requested = False
2159 if requested_networks:
2160 for net in requested_networks:
2161 if 'tag' in net and net.tag is not None:
2162 tagging_requested = True
2163 break
2164 if block_device_mapping and not tagging_requested:
2165 for bdm in block_device_mapping:
2166 if 'tag' in bdm and bdm.tag is not None:
2167 tagging_requested = True
2168 break
2169 if (tagging_requested and
2170 not self.driver.capabilities.get('supports_device_tagging',
2171 False)):
2172 raise exception.BuildAbortException('Attempt to boot guest with '
2173 'tagged devices on host that '
2174 'does not support tagging.')
2175
2176 def _check_trusted_certs(self, instance):
2177 if (instance.trusted_certs and
2178 not self.driver.capabilities.get('supports_trusted_certs',
2179 False)):
2180 raise exception.BuildAbortException(
2181 'Trusted image certificates provided on host that does not '
2182 'support certificate validation.')
2183
2184 @wrap_exception()
2185 @reverts_task_state
2186 @wrap_instance_event(prefix='compute')
2187 @wrap_instance_fault
2188 def _do_build_and_run_instance(self, context, instance, image,
2189 request_spec, filter_properties, admin_password, injected_files,
2190 requested_networks, security_groups, block_device_mapping,
2191 node=None, limits=None, host_list=None, accel_uuids=None):
2192
2193 try:
2194 LOG.debug('Starting instance...', instance=instance)
2195 instance.vm_state = vm_states.BUILDING
2196 instance.task_state = None
2197 instance.save(expected_task_state=
2198 (task_states.SCHEDULING, None))
2199 except exception.InstanceNotFound:
2200 msg = 'Instance disappeared before build.'
2201 LOG.debug(msg, instance=instance)
2202 return build_results.FAILED
2203 except exception.UnexpectedTaskStateError as e:
2204 LOG.debug(e.format_message(), instance=instance)
2205 return build_results.FAILED
2206
2207 # b64 decode the files to inject:
2208 decoded_files = self._decode_files(injected_files)
2209
2210 if limits is None:
2211 limits = {}
2212
2213 if node is None:
2214 node = self._get_nodename(instance, refresh=True)
2215
2216 try:
2217 with timeutils.StopWatch() as timer:
2218 self._build_and_run_instance(context, instance, image,
2219 decoded_files, admin_password, requested_networks,
2220 security_groups, block_device_mapping, node, limits,
2221 filter_properties, request_spec, accel_uuids)
2222 LOG.info('Took %0.2f seconds to build instance.',
2223 timer.elapsed(), instance=instance)
2224 return build_results.ACTIVE
2225 except exception.RescheduledException as e:
2226 retry = filter_properties.get('retry')
2227 if not retry:
2228 # no retry information, do not reschedule.
2229 LOG.debug("Retry info not present, will not reschedule",
2230 instance=instance)
2231 self._cleanup_allocated_networks(context, instance,
2232 requested_networks)
2233 self._cleanup_volumes(context, instance,
2234 block_device_mapping, raise_exc=False)
2235 compute_utils.add_instance_fault_from_exc(context,
2236 instance, e, sys.exc_info(),
2237 fault_message=e.kwargs['reason'])
2238 self._nil_out_instance_obj_host_and_node(instance)
2239 self._set_instance_obj_error_state(instance,
2240 clean_task_state=True)
2241 return build_results.FAILED
2242 LOG.debug(e.format_message(), instance=instance)
2243 # This will be used for logging the exception
2244 retry['exc'] = traceback.format_exception(*sys.exc_info())
2245 # This will be used for setting the instance fault message
2246 retry['exc_reason'] = e.kwargs['reason']
2247
2248 self._cleanup_allocated_networks(context, instance,
2249 requested_networks)
2250
2251 self._nil_out_instance_obj_host_and_node(instance)
2252 instance.task_state = task_states.SCHEDULING
2253 instance.save()
2254 # The instance will have already claimed resources from this host
2255 # before this build was attempted. Now that it has failed, we need
2256 # to unclaim those resources before casting to the conductor, so
2257 # that if there are alternate hosts available for a retry, it can
2258 # claim resources on that new host for the instance.
2259 self.reportclient.delete_allocation_for_instance(context,
2260 instance.uuid)
2261
2262 self.compute_task_api.build_instances(context, [instance],
2263 image, filter_properties, admin_password,
2264 injected_files, requested_networks, security_groups,
2265 block_device_mapping, request_spec=request_spec,
2266 host_lists=[host_list])
2267 return build_results.RESCHEDULED
2268 except (exception.InstanceNotFound,
2269 exception.UnexpectedDeletingTaskStateError):
2270 msg = 'Instance disappeared during build.'
2271 LOG.debug(msg, instance=instance)
2272 self._cleanup_allocated_networks(context, instance,
2273 requested_networks)
2274 return build_results.FAILED
2275 except Exception as e:
2276 if isinstance(e, exception.BuildAbortException):
2277 LOG.error(e.format_message(), instance=instance)
2278 else:
2279 # Should not reach here.
2280 LOG.exception('Unexpected build failure, not rescheduling '
2281 'build.', instance=instance)
2282 self._cleanup_allocated_networks(context, instance,
2283 requested_networks)
2284 self._cleanup_volumes(context, instance,
2285 block_device_mapping, raise_exc=False)
2286 compute_utils.add_instance_fault_from_exc(context, instance,
2287 e, sys.exc_info())
2288 self._nil_out_instance_obj_host_and_node(instance)
2289 self._set_instance_obj_error_state(instance, clean_task_state=True)
2290 return build_results.FAILED
2291
2292 @staticmethod
2293 def _get_scheduler_hints(filter_properties, request_spec=None):
2294 """Helper method to get scheduler hints.
2295
2296 This method prefers to get the hints out of the request spec, but that
2297 might not be provided. Conductor will pass request_spec down to the
2298 first compute chosen for a build but older computes will not pass
2299 the request_spec to conductor's build_instances method for a
2300         reschedule, so if we're on this host via a retry, request_spec may
2301         not be provided and we need to fall back to the filter_properties
2302         to get the scheduler hints.
2303 """
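        # e.g. request_spec.scheduler_hints = {'group': ['<uuid>']} wins over
        # filter_properties['scheduler_hints'] when both are present; the
        # fallback is only used when the request_spec hints are missing/empty.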
2304 hints = {}
2305 if request_spec is not None and 'scheduler_hints' in request_spec:
2306 hints = request_spec.scheduler_hints
2307 if not hints:
2308 hints = filter_properties.get('scheduler_hints') or {}
2309 return hints
2310
2311 @staticmethod
2312 def _get_request_group_mapping(request_spec):
2313         """Return the request group to resource provider mapping. This is
2314         currently used for Neutron ports that have a resource request due to
2315         a QoS minimum bandwidth policy rule attached to the port.
2316
2317 :param request_spec: A RequestSpec object or None
2318 :returns: A dict keyed by RequestGroup requester_id, currently Neutron
2319 port_id, to resource provider UUID that provides resource for that
2320 RequestGroup. Or None if the request_spec was None.
2321 """
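        # Illustrative shape (UUIDs hypothetical):
        #     {'<neutron port uuid>': ['<resource provider uuid>']}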
2322 if request_spec:
2323 return request_spec.get_request_group_mapping()
2324 else:
2325 return None
2326
2327 def _build_and_run_instance(self, context, instance, image, injected_files,
2328 admin_password, requested_networks, security_groups,
2329 block_device_mapping, node, limits, filter_properties,
2330 request_spec=None, accel_uuids=None):
2331
2332 image_name = image.get('name')
2333 self._notify_about_instance_usage(context, instance, 'create.start',
2334 extra_usage_info={'image_name': image_name})
2335 compute_utils.notify_about_instance_create(
2336 context, instance, self.host,
2337 phase=fields.NotificationPhase.START,
2338 bdms=block_device_mapping)
2339
2340 # NOTE(mikal): cache the keystone roles associated with the instance
2341 # at boot time for later reference
2342 instance.system_metadata.update(
2343 {'boot_roles': ','.join(context.roles)})
2344
2345 self._check_device_tagging(requested_networks, block_device_mapping)
2346 self._check_trusted_certs(instance)
2347
2348 provider_mapping = self._get_request_group_mapping(request_spec)
2349
2350 if provider_mapping:
2351 try:
2352 compute_utils\
2353 .update_pci_request_spec_with_allocated_interface_name(
2354 context, self.reportclient, instance, provider_mapping)
2355 except (exception.AmbiguousResourceProviderForPCIRequest,
2356 exception.UnexpectedResourceProviderNameForPCIRequest
2357 ) as e:
2358 raise exception.BuildAbortException(
2359 reason=six.text_type(e), instance_uuid=instance.uuid)
2360
2361 # TODO(Luyao) cut over to get_allocs_for_consumer
2362 allocs = self.reportclient.get_allocations_for_consumer(
2363 context, instance.uuid)
2364
2365 try:
2366 scheduler_hints = self._get_scheduler_hints(filter_properties,
2367 request_spec)
2368 with self.rt.instance_claim(context, instance, node, allocs,
2369 limits):
2370 # NOTE(russellb) It's important that this validation be done
2371 # *after* the resource tracker instance claim, as that is where
2372 # the host is set on the instance.
2373 self._validate_instance_group_policy(context, instance,
2374 scheduler_hints)
2375 image_meta = objects.ImageMeta.from_dict(image)
2376
2377 with self._build_resources(context, instance,
2378 requested_networks, security_groups, image_meta,
2379 block_device_mapping, provider_mapping,
2380 accel_uuids) as resources:
2381 instance.vm_state = vm_states.BUILDING
2382 instance.task_state = task_states.SPAWNING
2383 # NOTE(JoshNang) This also saves the changes to the
2384 # instance from _allocate_network_async, as they aren't
2385 # saved in that function to prevent races.
2386 instance.save(expected_task_state=
2387 task_states.BLOCK_DEVICE_MAPPING)
2388 block_device_info = resources['block_device_info']
2389 network_info = resources['network_info']
2390 accel_info = resources['accel_info']
2391 LOG.debug('Start spawning the instance on the hypervisor.',
2392 instance=instance)
2393 with timeutils.StopWatch() as timer:
2394 self.driver.spawn(context, instance, image_meta,
2395 injected_files, admin_password,
2396 allocs, network_info=network_info,
2397 block_device_info=block_device_info,
2398 accel_info=accel_info)
2399 LOG.info('Took %0.2f seconds to spawn the instance on '
2400 'the hypervisor.', timer.elapsed(),
2401 instance=instance)
2402 except (exception.InstanceNotFound,
2403 exception.UnexpectedDeletingTaskStateError) as e:
2404 with excutils.save_and_reraise_exception():
2405 self._notify_about_instance_usage(context, instance,
2406 'create.error', fault=e)
2407 compute_utils.notify_about_instance_create(
2408 context, instance, self.host,
2409 phase=fields.NotificationPhase.ERROR, exception=e,
2410 bdms=block_device_mapping)
2411 except exception.ComputeResourcesUnavailable as e:
2412 LOG.debug(e.format_message(), instance=instance)
2413 self._notify_about_instance_usage(context, instance,
2414 'create.error', fault=e)
2415 compute_utils.notify_about_instance_create(
2416 context, instance, self.host,
2417 phase=fields.NotificationPhase.ERROR, exception=e,
2418 bdms=block_device_mapping)
2419 raise exception.RescheduledException(
2420 instance_uuid=instance.uuid, reason=e.format_message())
2421 except exception.BuildAbortException as e:
2422 with excutils.save_and_reraise_exception():
2423 LOG.debug(e.format_message(), instance=instance)
2424 self._notify_about_instance_usage(context, instance,
2425 'create.error', fault=e)
2426 compute_utils.notify_about_instance_create(
2427 context, instance, self.host,
2428 phase=fields.NotificationPhase.ERROR, exception=e,
2429 bdms=block_device_mapping)
2430 except exception.NoMoreFixedIps as e:
2431 LOG.warning('No more fixed IP to be allocated',
2432 instance=instance)
2433 self._notify_about_instance_usage(context, instance,
2434 'create.error', fault=e)
2435 compute_utils.notify_about_instance_create(
2436 context, instance, self.host,
2437 phase=fields.NotificationPhase.ERROR, exception=e,
2438 bdms=block_device_mapping)
2439 msg = _('Failed to allocate the network(s) with error %s, '
2440 'not rescheduling.') % e.format_message()
2441 raise exception.BuildAbortException(instance_uuid=instance.uuid,
2442 reason=msg)
2443 except (exception.ExternalNetworkAttachForbidden,
2444 exception.VirtualInterfaceCreateException,
2445 exception.VirtualInterfaceMacAddressException,
2446 exception.FixedIpInvalidOnHost,
2447 exception.UnableToAutoAllocateNetwork,
2448 exception.NetworksWithQoSPolicyNotSupported) as e:
2449 LOG.exception('Failed to allocate network(s)',
2450 instance=instance)
2451 self._notify_about_instance_usage(context, instance,
2452 'create.error', fault=e)
2453 compute_utils.notify_about_instance_create(
2454 context, instance, self.host,
2455 phase=fields.NotificationPhase.ERROR, exception=e,
2456 bdms=block_device_mapping)
2457 msg = _('Failed to allocate the network(s), not rescheduling.')
2458 raise exception.BuildAbortException(instance_uuid=instance.uuid,
2459 reason=msg)
2460 except (exception.FlavorDiskTooSmall,
2461 exception.FlavorMemoryTooSmall,
2462 exception.ImageNotActive,
2463 exception.ImageUnacceptable,
2464 exception.InvalidDiskInfo,
2465 exception.InvalidDiskFormat,
2466 cursive_exception.SignatureVerificationError,
2467 exception.CertificateValidationFailed,
2468 exception.VolumeEncryptionNotSupported,
2469 exception.InvalidInput,
2470 # TODO(mriedem): We should be validating RequestedVRamTooHigh
2471 # in the API during server create and rebuild.
2472 exception.RequestedVRamTooHigh) as e:
2473 self._notify_about_instance_usage(context, instance,
2474 'create.error', fault=e)
2475 compute_utils.notify_about_instance_create(
2476 context, instance, self.host,
2477 phase=fields.NotificationPhase.ERROR, exception=e,
2478 bdms=block_device_mapping)
2479 raise exception.BuildAbortException(instance_uuid=instance.uuid,
2480 reason=e.format_message())
2481 except Exception as e:
2482 LOG.exception('Failed to build and run instance',
2483 instance=instance)
2484 self._notify_about_instance_usage(context, instance,
2485 'create.error', fault=e)
2486 compute_utils.notify_about_instance_create(
2487 context, instance, self.host,
2488 phase=fields.NotificationPhase.ERROR, exception=e,
2489 bdms=block_device_mapping)
2490 raise exception.RescheduledException(
2491 instance_uuid=instance.uuid, reason=six.text_type(e))
2492
2493 # NOTE(alaski): This is only useful during reschedules, remove it now.
2494 instance.system_metadata.pop('network_allocated', None)
2495
2496 # If CONF.default_access_ip_network_name is set, grab the
2497 # corresponding network and set the access ip values accordingly.
2498 network_name = CONF.default_access_ip_network_name
2499 if (network_name and not instance.access_ip_v4 and
2500 not instance.access_ip_v6):
2501 # Note that when there are multiple ips to choose from, an
2502 # arbitrary one will be chosen.
2503 for vif in network_info:
2504 if vif['network']['label'] == network_name:
2505 for ip in vif.fixed_ips():
2506 if not instance.access_ip_v4 and ip['version'] == 4:
2507 instance.access_ip_v4 = ip['address']
2508 if not instance.access_ip_v6 and ip['version'] == 6:
2509 instance.access_ip_v6 = ip['address']
2510 break
2511
2512 self._update_instance_after_spawn(instance)
2513
2514 try:
2515 instance.save(expected_task_state=task_states.SPAWNING)
2516 except (exception.InstanceNotFound,
2517 exception.UnexpectedDeletingTaskStateError) as e:
2518 with excutils.save_and_reraise_exception():
2519 self._notify_about_instance_usage(context, instance,
2520 'create.error', fault=e)
2521 compute_utils.notify_about_instance_create(
2522 context, instance, self.host,
2523 phase=fields.NotificationPhase.ERROR, exception=e,
2524 bdms=block_device_mapping)
2525
2526 self._update_scheduler_instance_info(context, instance)
2527 self._notify_about_instance_usage(context, instance, 'create.end',
2528 extra_usage_info={'message': _('Success')},
2529 network_info=network_info)
2530 compute_utils.notify_about_instance_create(context, instance,
2531 self.host, phase=fields.NotificationPhase.END,
2532 bdms=block_device_mapping)
2533
2534 def _build_resources_cleanup(self, instance, network_info):
2535 # Make sure the async call finishes
2536 if network_info is not None:
2537 network_info.wait(do_raise=False)
2538 self.driver.clean_networks_preparation(instance,
2539 network_info)
2540 self.driver.failed_spawn_cleanup(instance)
2541
2542 @contextlib.contextmanager
2543 def _build_resources(self, context, instance, requested_networks,
2544 security_groups, image_meta, block_device_mapping,
2545 resource_provider_mapping, accel_uuids):
2546 resources = {}
2547 network_info = None
2548 try:
2549 LOG.debug('Start building networks asynchronously for instance.',
2550 instance=instance)
2551 network_info = self._build_networks_for_instance(context, instance,
2552 requested_networks, security_groups,
2553 resource_provider_mapping)
2554 resources['network_info'] = network_info
2555 except (exception.InstanceNotFound,
2556 exception.UnexpectedDeletingTaskStateError):
2557 raise
2558 except exception.UnexpectedTaskStateError as e:
2559 raise exception.BuildAbortException(instance_uuid=instance.uuid,
2560 reason=e.format_message())
2561 except Exception:
2562 # Because this allocation is async any failures are likely to occur
2563 # when the driver accesses network_info during spawn().
2564 LOG.exception('Failed to allocate network(s)',
2565 instance=instance)
2566 msg = _('Failed to allocate the network(s), not rescheduling.')
2567 raise exception.BuildAbortException(instance_uuid=instance.uuid,
2568 reason=msg)
2569
2570 try:
2571 # Perform any driver preparation work for the driver.
2572 self.driver.prepare_for_spawn(instance)
2573
2574 # Depending on a virt driver, some network configuration is
2575 # necessary before preparing block devices.
2576 self.driver.prepare_networks_before_block_device_mapping(
2577 instance, network_info)
2578
2579 # Verify that all the BDMs have a device_name set and assign a
2580 # default to the ones missing it with the help of the driver.
2581 self._default_block_device_names(instance, image_meta,
2582 block_device_mapping)
2583
2584 LOG.debug('Start building block device mappings for instance.',
2585 instance=instance)
2586 instance.vm_state = vm_states.BUILDING
2587 instance.task_state = task_states.BLOCK_DEVICE_MAPPING
2588 instance.save()
2589
2590 block_device_info = self._prep_block_device(context, instance,
2591 block_device_mapping)
2592 resources['block_device_info'] = block_device_info
2593 except (exception.InstanceNotFound,
2594 exception.UnexpectedDeletingTaskStateError):
2595 with excutils.save_and_reraise_exception():
2596 self._build_resources_cleanup(instance, network_info)
2597 except (exception.UnexpectedTaskStateError,
2598 exception.OverQuota, exception.InvalidBDM) as e:
2599 self._build_resources_cleanup(instance, network_info)
2600 raise exception.BuildAbortException(instance_uuid=instance.uuid,
2601 reason=e.format_message())
2602 except Exception:
2603 LOG.exception('Failure prepping block device',
2604 instance=instance)
2605 self._build_resources_cleanup(instance, network_info)
2606 msg = _('Failure prepping block device.')
2607 raise exception.BuildAbortException(instance_uuid=instance.uuid,
2608 reason=msg)
2609
2610 arqs = []
2611 if instance.flavor.extra_specs.get('accel:device_profile'):
2612 try:
2613 arqs = self._get_bound_arq_resources(
2614 context, instance, accel_uuids)
2615 except (Exception, eventlet.timeout.Timeout) as exc:
2616 LOG.exception(exc)
2617 self._build_resources_cleanup(instance, network_info)
2618 compute_utils.delete_arqs_if_needed(context, instance)
2619 msg = _('Failure getting accelerator requests.')
2620 raise exception.BuildAbortException(
2621 reason=msg, instance_uuid=instance.uuid)
2622
2623 resources['accel_info'] = arqs
2624 try:
2625 yield resources
2626 except Exception as exc:
2627 with excutils.save_and_reraise_exception() as ctxt:
2628 if not isinstance(exc, (
2629 exception.InstanceNotFound,
2630 exception.UnexpectedDeletingTaskStateError)):
2631 LOG.exception('Instance failed to spawn',
2632 instance=instance)
2633 # Make sure the async call finishes
2634 if network_info is not None:
2635 network_info.wait(do_raise=False)
2636                 # If network_info is empty we're likely here because of a
2637                 # network allocation failure. Since nothing can be reused on
2638                 # rescheduling, it's better to deallocate the network to
2639                 # eliminate the chance of orphaned ports in neutron.
2640                 deallocate_networks = not network_info
2641 try:
2642 self._shutdown_instance(context, instance,
2643 block_device_mapping, requested_networks,
2644 try_deallocate_networks=deallocate_networks)
2645 except Exception as exc2:
2646 ctxt.reraise = False
2647 LOG.warning('Could not clean up failed build,'
2648 ' not rescheduling. Error: %s',
2649 six.text_type(exc2))
2650 raise exception.BuildAbortException(
2651 instance_uuid=instance.uuid,
2652 reason=six.text_type(exc))
2653 finally:
2654 # Call Cyborg to delete accelerator requests
2655 compute_utils.delete_arqs_if_needed(context, instance)
2656
2657 def _get_bound_arq_resources(self, context, instance, arq_uuids):
2658 """Get bound accelerator requests.
2659
2660 The ARQ binding was kicked off in the conductor as an async
2661 operation. Here we wait for the notification from Cyborg.
2662
2663 If the notification arrived before this point, which can happen
2664 in many/most cases (see [1]), it will be lost. To handle that,
2665 we use exit_wait_early.
2666 [1] https://review.opendev.org/#/c/631244/46/nova/compute/
2667 manager.py@2627
2668
2669         :param context: the request context
2670         :param instance: instance object
2671         :param arq_uuids: List of accelerator request (ARQ) UUIDs.
2671 :returns: List of ARQs for which bindings have completed,
2672 successfully or otherwise
2673 """
2674
2675 cyclient = cyborg.get_client(context)
2676 if arq_uuids is None:
2677 arqs = cyclient.get_arqs_for_instance(instance.uuid)
2678 arq_uuids = [arq['uuid'] for arq in arqs]
2679 events = [('accelerator-request-bound', arq_uuid)
2680 for arq_uuid in arq_uuids]
2681
2682 timeout = CONF.arq_binding_timeout
2683 with self.virtapi.wait_for_instance_event(
2684 instance, events, deadline=timeout):
2685 resolved_arqs = cyclient.get_arqs_for_instance(
2686 instance.uuid, only_resolved=True)
2687 # Events for these resolved ARQs may have already arrived.
2688 # Such 'early' events need to be ignored.
2689 early_events = [('accelerator-request-bound', arq['uuid'])
2690 for arq in resolved_arqs]
2691 if early_events:
2692 self.virtapi.exit_wait_early(early_events)
2693
2694 # Since a timeout in wait_for_instance_event will raise, we get
2695 # here only if all binding events have been received.
2696 resolved_uuids = [arq['uuid'] for arq in resolved_arqs]
2697 if sorted(resolved_uuids) != sorted(arq_uuids):
2698 # Query Cyborg to get all.
2699 arqs = cyclient.get_arqs_for_instance(instance.uuid)
2700 else:
2701 arqs = resolved_arqs
2702 return arqs
2703
2704 def _cleanup_allocated_networks(self, context, instance,
2705 requested_networks):
2706         """Clean up networks allocated for the instance.
2707
2708 :param context: nova request context
2709 :param instance: nova.objects.instance.Instance object
2710 :param requested_networks: nova.objects.NetworkRequestList
2711 """
2712 LOG.debug('Unplugging VIFs for instance', instance=instance)
2713
2714 network_info = instance.get_network_info()
2715
2716 # NOTE(stephenfin) to avoid nova destroying the instance without
2717 # unplugging the interface, refresh network_info if it is empty.
2718 if not network_info:
2719 try:
2720 network_info = self.network_api.get_instance_nw_info(
2721 context, instance,
2722 )
2723 except Exception as exc:
2724 LOG.warning(
2725 'Failed to update network info cache when cleaning up '
2726                     'allocated networks. Stale VIFs may be left on this host. '
2727 'Error: %s', six.text_type(exc)
2728 )
2729 return
2730
2731 try:
2732 self.driver.unplug_vifs(instance, network_info)
2733 except NotImplementedError:
2734 # This is an optional method so ignore things if it doesn't exist
2735 LOG.debug(
2736 'Virt driver does not provide unplug_vifs method, so it '
2737                 'is not possible to determine whether VIFs should be unplugged.'
2738 )
2739 except exception.NovaException as exc:
2740 # It's possible that the instance never got as far as plugging
2741 # VIFs, in which case we would see an exception which can be
2742 # mostly ignored
2743 LOG.warning(
2744 'Cleaning up VIFs failed for instance. Error: %s',
2745 six.text_type(exc), instance=instance,
2746 )
2747 else:
2748 LOG.debug('Unplugged VIFs for instance', instance=instance)
2749
2750 try:
2751 self._deallocate_network(context, instance, requested_networks)
2752 except Exception:
2753 LOG.exception('Failed to deallocate networks', instance=instance)
2754 return
2755
2756 instance.system_metadata['network_allocated'] = 'False'
2757 try:
2758 instance.save()
2759 except exception.InstanceNotFound:
2760 # NOTE(alaski): It's possible that we're cleaning up the networks
2761 # because the instance was deleted. If that's the case then this
2762 # exception will be raised by instance.save()
2763 pass
2764
2765 def _try_deallocate_network(self, context, instance,
2766 requested_networks=None):
2767
2768 # During auto-scale cleanup, we could be deleting a large number
2769 # of servers at the same time and overloading parts of the system,
2770 # so we retry a few times in case of connection failures to the
2771 # networking service.
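        # Per oslo.service's RetryDecorator semantics (assumption: the sleep
        # grows by inc_sleep_time on each retry), the settings below retry up
        # to 3 times, sleeping roughly 2s, 4s and 6s, never more than the
        # max_sleep_time of 12s per sleep.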
2772 @loopingcall.RetryDecorator(
2773 max_retry_count=3, inc_sleep_time=2, max_sleep_time=12,
2774 exceptions=(keystone_exception.connection.ConnectFailure,))
2775 def _deallocate_network_with_retries():
2776 try:
2777 self._deallocate_network(
2778 context, instance, requested_networks)
2779 except keystone_exception.connection.ConnectFailure as e:
2780 # Provide a warning that something is amiss.
2781 with excutils.save_and_reraise_exception():
2782 LOG.warning('Failed to deallocate network for instance; '
2783 'retrying. Error: %s', six.text_type(e),
2784 instance=instance)
2785
2786 try:
2787 # tear down allocated network structure
2788 _deallocate_network_with_retries()
2789 except Exception as ex:
2790 with excutils.save_and_reraise_exception():
2791 LOG.error('Failed to deallocate network for instance. '
2792 'Error: %s', ex, instance=instance)
2793 self._set_instance_obj_error_state(instance)
2794
2795 def _get_power_off_values(self, instance, clean_shutdown):
2796 """Get the timing configuration for powering down this instance."""
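        # e.g. clean_shutdown=True with the image property
        # image_os_shutdown_timeout=60 yields (60, shutdown_retry_interval),
        # while clean_shutdown=False always yields (0, 0), i.e. an immediate
        # hard power off.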
2797 if clean_shutdown:
2798 timeout = compute_utils.get_value_from_system_metadata(instance,
2799 key='image_os_shutdown_timeout', type=int,
2800 default=CONF.shutdown_timeout)
2801 retry_interval = CONF.compute.shutdown_retry_interval
2802 else:
2803 timeout = 0
2804 retry_interval = 0
2805
2806 return timeout, retry_interval
2807
2808 def _power_off_instance(self, instance, clean_shutdown=True):
2809 """Power off an instance on this host."""
2810 timeout, retry_interval = self._get_power_off_values(
2811 instance, clean_shutdown)
2812 self.driver.power_off(instance, timeout, retry_interval)
2813
2814 def _shutdown_instance(self, context, instance,
2815 bdms, requested_networks=None, notify=True,
2816 try_deallocate_networks=True):
2817 """Shutdown an instance on this host.
2818
2819         :param context: security context
2820         :param instance: a nova.objects.Instance object
2821         :param bdms: the block devices for the instance to be torn
2822                      down
2823         :param requested_networks: the networks on which the instance
2824                                    has ports
2825         :param notify: true if a final usage notification should be
2826                        emitted
2827         :param try_deallocate_networks: false if we should avoid
2828                                         trying to teardown networking
2829 """
        context = context.elevated()
        LOG.info('Terminating instance', instance=instance)

        if notify:
            self._notify_about_instance_usage(context, instance,
                                              "shutdown.start")
            compute_utils.notify_about_instance_action(context, instance,
                    self.host, action=fields.NotificationAction.SHUTDOWN,
                    phase=fields.NotificationPhase.START, bdms=bdms)

        network_info = instance.get_network_info()

        # NOTE(arnaudmorin) to avoid nova destroying the instance without
        # unplugging the interface, refresh network_info if it is empty.
        if not network_info:
            network_info = self.network_api.get_instance_nw_info(
                context, instance)

        # NOTE(vish) get bdms before destroying the instance
        vol_bdms = [bdm for bdm in bdms if bdm.is_volume]
        block_device_info = self._get_instance_block_device_info(
            context, instance, bdms=bdms)

        # NOTE(melwitt): attempt driver destroy before releasing ip, may
        # want to keep ip allocated for certain failures
        try:
            LOG.debug('Start destroying the instance on the hypervisor.',
                      instance=instance)
            with timeutils.StopWatch() as timer:
                self.driver.destroy(context, instance, network_info,
                                    block_device_info)
            LOG.info('Took %0.2f seconds to destroy the instance on the '
                     'hypervisor.', timer.elapsed(), instance=instance)
        except exception.InstancePowerOffFailure:
            # if the instance can't power off, don't release the ip
            with excutils.save_and_reraise_exception():
                pass
        except Exception:
            with excutils.save_and_reraise_exception():
                # deallocate ip and fail without proceeding to
                # volume api calls, preserving current behavior
                if try_deallocate_networks:
                    self._try_deallocate_network(context, instance,
                                                 requested_networks)

        if try_deallocate_networks:
            self._try_deallocate_network(context, instance,
                                         requested_networks)

        timer.restart()
        for bdm in vol_bdms:
            try:
                if bdm.attachment_id:
                    self.volume_api.attachment_delete(context,
                                                      bdm.attachment_id)
                else:
                    # NOTE(vish): actual driver detach done in
                    # driver.destroy, so just tell cinder that we are done
                    # with it.
                    connector = self.driver.get_volume_connector(instance)
                    self.volume_api.terminate_connection(context,
                                                         bdm.volume_id,
                                                         connector)
                    self.volume_api.detach(context, bdm.volume_id,
                                           instance.uuid)

            except exception.VolumeAttachmentNotFound as exc:
                LOG.debug('Ignoring VolumeAttachmentNotFound: %s', exc,
                          instance=instance)
            except exception.DiskNotFound as exc:
                LOG.debug('Ignoring DiskNotFound: %s', exc,
                          instance=instance)
            except exception.VolumeNotFound as exc:
                LOG.debug('Ignoring VolumeNotFound: %s', exc,
                          instance=instance)
            except (cinder_exception.EndpointNotFound,
                    keystone_exception.EndpointNotFound) as exc:
                LOG.warning('Ignoring EndpointNotFound for '
                            'volume %(volume_id)s: %(exc)s',
                            {'exc': exc, 'volume_id': bdm.volume_id},
                            instance=instance)
            except cinder_exception.ClientException as exc:
                LOG.warning('Ignoring unknown cinder exception for '
                            'volume %(volume_id)s: %(exc)s',
                            {'exc': exc, 'volume_id': bdm.volume_id},
                            instance=instance)
            except Exception as exc:
                LOG.warning('Ignoring unknown exception for '
                            'volume %(volume_id)s: %(exc)s',
                            {'exc': exc, 'volume_id': bdm.volume_id},
                            instance=instance)
        if vol_bdms:
            LOG.info('Took %(time).2f seconds to detach %(num)s volumes '
                     'for instance.',
                     {'time': timer.elapsed(), 'num': len(vol_bdms)},
                     instance=instance)

        if notify:
            self._notify_about_instance_usage(context, instance,
                                              "shutdown.end")
            compute_utils.notify_about_instance_action(context, instance,
                    self.host, action=fields.NotificationAction.SHUTDOWN,
                    phase=fields.NotificationPhase.END, bdms=bdms)

    def _cleanup_volumes(self, context, instance, bdms, raise_exc=True,
                         detach=True):
        original_exception = None
        for bdm in bdms:
            if detach and bdm.volume_id:
                try:
                    LOG.debug("Detaching volume: %s", bdm.volume_id,
                              instance_uuid=instance.uuid)
                    destroy = bdm.delete_on_termination
                    self._detach_volume(context, bdm, instance,
                                        destroy_bdm=destroy)
                except Exception as exc:
                    original_exception = exc
                    LOG.warning('Failed to detach volume: %(volume_id)s '
                                'due to %(exc)s',
                                {'volume_id': bdm.volume_id, 'exc': exc})

            if bdm.volume_id and bdm.delete_on_termination:
                try:
                    LOG.debug("Deleting volume: %s", bdm.volume_id,
                              instance_uuid=instance.uuid)
                    self.volume_api.delete(context, bdm.volume_id)
                except Exception as exc:
                    original_exception = exc
                    LOG.warning('Failed to delete volume: %(volume_id)s '
                                'due to %(exc)s',
                                {'volume_id': bdm.volume_id, 'exc': exc})
        if original_exception is not None and raise_exc:
            raise original_exception

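    # The loop above is best-effort: every BDM is processed even when an
    # earlier one fails, and (despite the variable name) the *most recent*
    # failure is what gets re-raised when raise_exc is True. A minimal
    # sketch of the same pattern:
    #
    #   last_exc = None
    #   for item in items:
    #       try:
    #           cleanup(item)            # hypothetical per-item cleanup
    #       except Exception as exc:
    #           last_exc = exc           # remember, but keep going
    #   if last_exc is not None and raise_exc:
    #       raise last_exc
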
    def _delete_instance(self, context, instance, bdms):
        """Delete an instance on this host.

        :param context: nova request context
        :param instance: nova.objects.instance.Instance object
        :param bdms: nova.objects.block_device.BlockDeviceMappingList object
        """
        events = self.instance_events.clear_events_for_instance(instance)
        if events:
            LOG.debug('Events pending at deletion: %(events)s',
                      {'events': ','.join(events.keys())},
                      instance=instance)
        self._notify_about_instance_usage(context, instance,
                                          "delete.start")
        compute_utils.notify_about_instance_action(context, instance,
                self.host, action=fields.NotificationAction.DELETE,
                phase=fields.NotificationPhase.START, bdms=bdms)

        self._shutdown_instance(context, instance, bdms)

        # NOTE(vish): We have already deleted the instance, so we have
        #             to ignore problems cleaning up the volumes. It
        #             would be nice to let the user know somehow that
        #             the volume deletion failed, but it is not
        #             acceptable to have an instance that can not be
        #             deleted. Perhaps this could be reworked in the
        #             future to set an instance fault the first time
        #             and to only ignore the failure if the instance
        #             is already in ERROR.

        # NOTE(ameeda): The volumes have already been detached during
        #               the above _shutdown_instance() call and this is
        #               why detach is not requested from
        #               _cleanup_volumes() in this case

        self._cleanup_volumes(context, instance, bdms,
                              raise_exc=False, detach=False)
        # Delete Cyborg ARQs if the instance has a device profile.
        compute_utils.delete_arqs_if_needed(context, instance)
        # if a delete task succeeded, always update vm state and task
        # state without expecting task state to be DELETING
        instance.vm_state = vm_states.DELETED
        instance.task_state = None
        instance.power_state = power_state.NOSTATE
        instance.terminated_at = timeutils.utcnow()
        instance.save()

        self._complete_deletion(context, instance)
        # only destroy the instance in the db if the _complete_deletion
        # doesn't raise and therefore allocation is successfully
        # deleted in placement
        instance.destroy()

        self._notify_about_instance_usage(context, instance, "delete.end")
        compute_utils.notify_about_instance_action(context, instance,
                self.host, action=fields.NotificationAction.DELETE,
                phase=fields.NotificationPhase.END, bdms=bdms)

    @wrap_exception()
    @reverts_task_state
    @wrap_instance_event(prefix='compute')
    @wrap_instance_fault
    def terminate_instance(self, context, instance, bdms):
        """Terminate an instance on this host."""
        @utils.synchronized(instance.uuid)
        def do_terminate_instance(instance, bdms):
            # NOTE(mriedem): If we are deleting the instance while it was
            # booting from volume, we could be racing with a database update
            # of the BDM volume_id. Since the compute API passes the BDMs
            # over RPC to compute here, the BDMs may be stale at this point.
            # So check for any volume BDMs that don't have volume_id set and
            # if we detect that, we need to refresh the BDM list before
            # proceeding.
            # TODO(mriedem): Move this into _delete_instance and make the
            # bdms parameter optional.
            for bdm in list(bdms):
                if bdm.is_volume and not bdm.volume_id:
                    LOG.debug('There are potentially stale BDMs during '
                              'delete, refreshing the '
                              'BlockDeviceMappingList.', instance=instance)
                    bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
                        context, instance.uuid)
                    break
            try:
                self._delete_instance(context, instance, bdms)
            except exception.InstanceNotFound:
                LOG.info("Instance disappeared during terminate",
                         instance=instance)
            except Exception:
                # As we're trying to delete always go to Error if something
                # goes wrong that _delete_instance can't handle.
                with excutils.save_and_reraise_exception():
                    LOG.exception('Setting instance vm_state to ERROR',
                                  instance=instance)
                    self._set_instance_obj_error_state(instance)

        do_terminate_instance(instance, bdms)

    # NOTE(johannes): This is probably better named power_off_instance
    # so it matches the driver method, but because of other issues, we
    # can't use that name in grizzly.
    @wrap_exception()
    @reverts_task_state
    @wrap_instance_event(prefix='compute')
    @wrap_instance_fault
    def stop_instance(self, context, instance, clean_shutdown):
        """Stop an instance on this host."""

        @utils.synchronized(instance.uuid)
        def do_stop_instance():
            current_power_state = self._get_power_state(instance)
            LOG.debug('Stopping instance; current vm_state: %(vm_state)s, '
                      'current task_state: %(task_state)s, current DB '
                      'power_state: %(db_power_state)s, current VM '
                      'power_state: %(current_power_state)s',
                      {'vm_state': instance.vm_state,
                       'task_state': instance.task_state,
                       'db_power_state': instance.power_state,
                       'current_power_state': current_power_state},
                      instance_uuid=instance.uuid)

            # NOTE(mriedem): If the instance is already powered off, we are
            # possibly tearing down and racing with other operations, so we
            # can expect the task_state to be None if something else updates
            # the instance and we're not locking it.
            expected_task_state = [task_states.POWERING_OFF]
            # The list of power states is from _sync_instance_power_state.
            if current_power_state in (power_state.NOSTATE,
                                       power_state.SHUTDOWN,
                                       power_state.CRASHED):
                LOG.info('Instance is already powered off in the '
                         'hypervisor when stop is called.',
                         instance=instance)
                expected_task_state.append(None)

            self._notify_about_instance_usage(context, instance,
                                              "power_off.start")

            compute_utils.notify_about_instance_action(context, instance,
                    self.host, action=fields.NotificationAction.POWER_OFF,
                    phase=fields.NotificationPhase.START)

            self._power_off_instance(instance, clean_shutdown)
            instance.power_state = self._get_power_state(instance)
            instance.vm_state = vm_states.STOPPED
            instance.task_state = None
            instance.save(expected_task_state=expected_task_state)
            self._notify_about_instance_usage(context, instance,
                                              "power_off.end")

            compute_utils.notify_about_instance_action(context, instance,
                    self.host, action=fields.NotificationAction.POWER_OFF,
                    phase=fields.NotificationPhase.END)

        do_stop_instance()

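    # instance.save(expected_task_state=...) above acts as a
    # compare-and-swap guard at the database layer: the update only lands
    # if the row still holds one of the expected task states. A hedged
    # sketch of the race it closes:
    #
    #   instance.task_state = None
    #   instance.save(expected_task_state=[task_states.POWERING_OFF, None])
    #   # raises exception.UnexpectedTaskStateError if a concurrent request
    #   # moved task_state to something else (e.g. DELETING) in the interim.
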
    def _power_on(self, context, instance):
        network_info = self.network_api.get_instance_nw_info(
            context, instance)
        block_device_info = self._get_instance_block_device_info(context,
                                                                 instance)
        accel_info = self._get_accel_info(context, instance)
        self.driver.power_on(context, instance,
                             network_info,
                             block_device_info, accel_info)

    def _delete_snapshot_of_shelved_instance(self, context, instance,
                                             snapshot_id):
        """Delete snapshot of shelved instance."""
        try:
            self.image_api.delete(context, snapshot_id)
        except (exception.ImageNotFound,
                exception.ImageNotAuthorized) as exc:
            LOG.warning("Failed to delete snapshot "
                        "from shelved instance (%s).",
                        exc.format_message(), instance=instance)
        except Exception:
            LOG.exception("Something went wrong when trying to "
                          "delete the snapshot from the shelved instance.",
                          instance=instance)

    # NOTE(johannes): This is probably better named power_on_instance
    # so it matches the driver method, but because of other issues, we
    # can't use that name in grizzly.
    @wrap_exception()
    @reverts_task_state
    @wrap_instance_event(prefix='compute')
    @wrap_instance_fault
    def start_instance(self, context, instance):
        """Start an instance on this host."""
        self._notify_about_instance_usage(context, instance,
                                          "power_on.start")
        compute_utils.notify_about_instance_action(context, instance,
            self.host, action=fields.NotificationAction.POWER_ON,
            phase=fields.NotificationPhase.START)
        self._power_on(context, instance)
        instance.power_state = self._get_power_state(instance)
        instance.vm_state = vm_states.ACTIVE
        instance.task_state = None

        # Delete an image (VM snapshot) for a shelved instance
        snapshot_id = instance.system_metadata.get('shelved_image_id')
        if snapshot_id:
            self._delete_snapshot_of_shelved_instance(context, instance,
                                                      snapshot_id)

        # Delete system_metadata for a shelved instance
        compute_utils.remove_shelved_keys_from_system_metadata(instance)

        instance.save(expected_task_state=task_states.POWERING_ON)
        self._notify_about_instance_usage(context, instance, "power_on.end")
        compute_utils.notify_about_instance_action(context, instance,
            self.host, action=fields.NotificationAction.POWER_ON,
            phase=fields.NotificationPhase.END)

    @messaging.expected_exceptions(NotImplementedError,
                                   exception.TriggerCrashDumpNotSupported,
                                   exception.InstanceNotRunning)
    @wrap_exception()
    @wrap_instance_event(prefix='compute')
    @wrap_instance_fault
    def trigger_crash_dump(self, context, instance):
        """Trigger crash dump in an instance."""

        self._notify_about_instance_usage(context, instance,
                                          "trigger_crash_dump.start")
        compute_utils.notify_about_instance_action(context, instance,
            self.host, action=fields.NotificationAction.TRIGGER_CRASH_DUMP,
            phase=fields.NotificationPhase.START)

        # This method does not change task_state and power_state because the
        # effect of a trigger depends on user's configuration.
        self.driver.trigger_crash_dump(instance)

        self._notify_about_instance_usage(context, instance,
                                          "trigger_crash_dump.end")
        compute_utils.notify_about_instance_action(context, instance,
            self.host, action=fields.NotificationAction.TRIGGER_CRASH_DUMP,
            phase=fields.NotificationPhase.END)

    @wrap_exception()
    @reverts_task_state
    @wrap_instance_event(prefix='compute')
    @wrap_instance_fault
    def soft_delete_instance(self, context, instance):
        """Soft delete an instance on this host."""
        with compute_utils.notify_about_instance_delete(
                self.notifier, context, instance, 'soft_delete',
                source=fields.NotificationSource.COMPUTE):
            try:
                self.driver.soft_delete(instance)
            except NotImplementedError:
                # Fallback to just powering off the instance if the
                # hypervisor doesn't implement the soft_delete method
                self.driver.power_off(instance)
            instance.power_state = self._get_power_state(instance)
            instance.vm_state = vm_states.SOFT_DELETED
            instance.task_state = None
            instance.save(expected_task_state=[task_states.SOFT_DELETING])

    @wrap_exception()
    @reverts_task_state
    @wrap_instance_event(prefix='compute')
    @wrap_instance_fault
    def restore_instance(self, context, instance):
        """Restore a soft-deleted instance on this host."""
        self._notify_about_instance_usage(context, instance, "restore.start")
        compute_utils.notify_about_instance_action(context, instance,
            self.host, action=fields.NotificationAction.RESTORE,
            phase=fields.NotificationPhase.START)
        try:
            self.driver.restore(instance)
        except NotImplementedError:
            # Fallback to just powering on the instance if the hypervisor
            # doesn't implement the restore method
            self._power_on(context, instance)
        instance.power_state = self._get_power_state(instance)
        instance.vm_state = vm_states.ACTIVE
        instance.task_state = None
        instance.save(expected_task_state=task_states.RESTORING)
        self._notify_about_instance_usage(context, instance, "restore.end")
        compute_utils.notify_about_instance_action(context, instance,
            self.host, action=fields.NotificationAction.RESTORE,
            phase=fields.NotificationPhase.END)

    @staticmethod
    def _set_migration_status(migration, status):
        """Set the status, and guard against a None being passed in.

        This is useful as some of the compute RPC calls will not pass
        a migration object in older versions. The check can be removed when
        we move past 4.x major version of the RPC API.
        """
        if migration:
            migration.status = status
            migration.save()

    def _rebuild_default_impl(
            self, context, instance, image_meta, injected_files,
            admin_password, allocations, bdms, detach_block_devices,
            attach_block_devices, network_info=None, evacuate=False,
            block_device_info=None, preserve_ephemeral=False,
            accel_uuids=None):
        if preserve_ephemeral:
            # The default code path does not support preserving ephemeral
            # partitions.
            raise exception.PreserveEphemeralNotSupported()

        accel_info = []
        if evacuate:
            if instance.flavor.extra_specs.get('accel:device_profile'):
                try:
                    accel_info = self._get_bound_arq_resources(
                        context, instance, accel_uuids or [])
                except (Exception, eventlet.timeout.Timeout) as exc:
                    LOG.exception(exc)
                    self._build_resources_cleanup(instance, network_info)
                    msg = _('Failure getting accelerator resources.')
                    raise exception.BuildAbortException(
                        instance_uuid=instance.uuid, reason=msg)
            detach_block_devices(context, bdms)
        else:
            self._power_off_instance(instance, clean_shutdown=True)
            detach_block_devices(context, bdms)
            self.driver.destroy(context, instance,
                                network_info=network_info,
                                block_device_info=block_device_info)
            try:
                accel_info = self._get_accel_info(context, instance)
            except Exception as exc:
                LOG.exception(exc)
                self._build_resources_cleanup(instance, network_info)
                msg = _('Failure getting accelerator resources.')
                raise exception.BuildAbortException(
                    instance_uuid=instance.uuid, reason=msg)

        instance.task_state = task_states.REBUILD_BLOCK_DEVICE_MAPPING
        instance.save(expected_task_state=[task_states.REBUILDING])

        new_block_device_info = attach_block_devices(context, instance, bdms)

        instance.task_state = task_states.REBUILD_SPAWNING
        instance.save(
            expected_task_state=[task_states.REBUILD_BLOCK_DEVICE_MAPPING])

        with instance.mutated_migration_context():
            self.driver.spawn(context, instance, image_meta, injected_files,
                              admin_password, allocations,
                              network_info=network_info,
                              block_device_info=new_block_device_info,
                              accel_info=accel_info)

    def _notify_instance_rebuild_error(self, context, instance, error, bdms):
        self._notify_about_instance_usage(context, instance,
                                          'rebuild.error', fault=error)
        compute_utils.notify_about_instance_rebuild(
            context, instance, self.host,
            phase=fields.NotificationPhase.ERROR, exception=error, bdms=bdms)

    @messaging.expected_exceptions(exception.PreserveEphemeralNotSupported,
                                   exception.BuildAbortException)
    @wrap_exception()
    @reverts_task_state
    @wrap_instance_event(prefix='compute')
    @wrap_instance_fault
    def rebuild_instance(self, context, instance, orig_image_ref, image_ref,
                         injected_files, new_pass, orig_sys_metadata,
                         bdms, recreate, on_shared_storage,
                         preserve_ephemeral, migration,
                         scheduled_node, limits, request_spec,
                         accel_uuids=None):
        """Destroy and re-make this instance.

        A 'rebuild' effectively purges all existing data from the system and
        remakes the VM with given 'metadata' and 'personalities'.

        :param context: `nova.RequestContext` object
        :param instance: Instance object
        :param orig_image_ref: Original image_ref before rebuild
        :param image_ref: New image_ref for rebuild
        :param injected_files: Files to inject
        :param new_pass: password to set on rebuilt instance
        :param orig_sys_metadata: instance system metadata from pre-rebuild
        :param bdms: block-device-mappings to use for rebuild
        :param recreate: True if the instance is being evacuated (e.g. the
            hypervisor it was on failed) - cleanup of old state will be
            skipped.
        :param on_shared_storage: True if instance files on shared storage.
                                  If not provided then information from the
                                  driver will be used to decide if the
                                  instance files are available or not on the
                                  target host
        :param preserve_ephemeral: True if the default ephemeral storage
                                   partition must be preserved on rebuild
        :param migration: a Migration object if one was created for this
                          rebuild operation (if it's a part of evacuate)
        :param scheduled_node: A node of the host chosen by the scheduler.
                               If a host was specified by the user, this
                               will be None
        :param limits: Overcommit limits set by the scheduler. If a host was
                       specified by the user, this will be None
        :param request_spec: a RequestSpec object used to schedule the
                             instance
        :param accel_uuids: a list of cyborg ARQ uuids or None if the RPC
                            API is <=5.11

        """
        # recreate=True means the instance is being evacuated from a failed
        # host to a new destination host (this host). The 'recreate'
        # variable name is confusing, so rename it to evacuate here at the
        # top, which is simpler than renaming a parameter in an RPC
        # versioned method.
        evacuate = recreate
        context = context.elevated()

        if evacuate:
            LOG.info("Evacuating instance", instance=instance)
        else:
            LOG.info("Rebuilding instance", instance=instance)

        if evacuate:
            # This is an evacuation to a new host, so we need to perform a
            # resource claim.
            rebuild_claim = self.rt.rebuild_claim
        else:
            # This is a rebuild to the same host, so we don't need to make
            # a claim since the instance is already on this host.
            rebuild_claim = claims.NopClaim

        if image_ref:
            image_meta = objects.ImageMeta.from_image_ref(
                context, self.image_api, image_ref)
        elif evacuate:
            # For evacuate the API does not send down the image_ref since
            # the image does not change so just get it from what was stashed
            # in the instance system_metadata when the instance was created
            # (or last rebuilt). This also works for volume-backed instances.
            image_meta = instance.image_meta
        else:
            image_meta = objects.ImageMeta()

        # NOTE(mriedem): On an evacuate, we need to update
        # the instance's host and node properties to reflect its
        # destination node for the evacuate.
        if not scheduled_node:
            if evacuate:
                try:
                    compute_node = self._get_compute_info(context, self.host)
                    scheduled_node = compute_node.hypervisor_hostname
                except exception.ComputeHostNotFound:
                    LOG.exception('Failed to get compute_info for %s',
                                  self.host)
            else:
                scheduled_node = instance.node

        allocs = self.reportclient.get_allocations_for_consumer(
            context, instance.uuid)

        # If the resource claim or group policy validation fails before we
        # do anything to the guest or its networking/volumes we want to keep
        # the current status rather than put the instance into ERROR status.
        instance_state = instance.vm_state
        with self._error_out_instance_on_exception(
                context, instance, instance_state=instance_state):
            try:
                self._do_rebuild_instance_with_claim(
                    context, instance, orig_image_ref,
                    image_meta, injected_files, new_pass, orig_sys_metadata,
                    bdms, evacuate, on_shared_storage, preserve_ephemeral,
                    migration, request_spec, allocs, rebuild_claim,
                    scheduled_node, limits, accel_uuids)
            except (exception.ComputeResourcesUnavailable,
                    exception.RescheduledException) as e:
                if isinstance(e, exception.ComputeResourcesUnavailable):
                    LOG.debug("Could not rebuild instance on this host, not "
                              "enough resources available.",
                              instance=instance)
                else:
                    # RescheduledException is raised by the late server
                    # group policy check during evacuation if a parallel
                    # scheduling violated the policy.
                    # We catch the RescheduledException here but we don't
                    # have the plumbing to do an actual reschedule so we
                    # abort the operation.
                    LOG.debug("Could not rebuild instance on this host, "
                              "late server group check failed.",
                              instance=instance)
                # NOTE(ndipanov): We just abort the build for now and leave
                # a migration record for potential cleanup later
                self._set_migration_status(migration, 'failed')
                # Since the claim failed, we need to remove the allocation
                # created against the destination node. Note that we can
                # only get here when evacuating to a destination node.
                # Rebuilding on the same host (not evacuate) uses the
                # NopClaim which will not raise
                # ComputeResourcesUnavailable.
                self.rt.delete_allocation_for_evacuated_instance(
                    context, instance, scheduled_node,
                    node_type='destination')
                self._notify_instance_rebuild_error(context, instance, e,
                                                    bdms)
                # Wrap this in InstanceFaultRollback so that the
                # _error_out_instance_on_exception context manager keeps the
                # vm_state unchanged.
                raise exception.InstanceFaultRollback(
                    inner_exception=exception.BuildAbortException(
                        instance_uuid=instance.uuid,
                        reason=e.format_message()))
            except (exception.InstanceNotFound,
                    exception.UnexpectedDeletingTaskStateError) as e:
                LOG.debug('Instance was deleted while rebuilding',
                          instance=instance)
                self._set_migration_status(migration, 'failed')
                self._notify_instance_rebuild_error(context, instance, e,
                                                    bdms)
            except Exception as e:
                self._set_migration_status(migration, 'failed')
                if evacuate or scheduled_node is not None:
                    self.rt.delete_allocation_for_evacuated_instance(
                        context, instance, scheduled_node,
                        node_type='destination')
                self._notify_instance_rebuild_error(context, instance, e,
                                                    bdms)
                raise
            else:
                instance.apply_migration_context()
                # NOTE(ndipanov): This save will now update the host and
                # node attributes making sure that next RT pass is
                # consistent since it will be based on the instance and not
                # the migration DB entry.
                instance.host = self.host
                instance.node = scheduled_node
                instance.save()
                instance.drop_migration_context()

                # NOTE(ndipanov): Mark the migration as done only after we
                # mark the instance as belonging to this host.
                self._set_migration_status(migration, 'done')

    def _do_rebuild_instance_with_claim(
            self, context, instance, orig_image_ref, image_meta,
            injected_files, new_pass, orig_sys_metadata, bdms, evacuate,
            on_shared_storage, preserve_ephemeral, migration, request_spec,
            allocations, rebuild_claim, scheduled_node, limits, accel_uuids):
        """Helper to avoid deep nesting in the top-level method."""

        provider_mapping = None
        if evacuate:
            provider_mapping = self._get_request_group_mapping(request_spec)

            if provider_mapping:
                compute_utils.\
                    update_pci_request_spec_with_allocated_interface_name(
                        context, self.reportclient, instance,
                        provider_mapping)

        claim_context = rebuild_claim(
            context, instance, scheduled_node, allocations,
            limits=limits, image_meta=image_meta, migration=migration)

        with claim_context:
            self._do_rebuild_instance(
                context, instance, orig_image_ref, image_meta,
                injected_files, new_pass, orig_sys_metadata, bdms, evacuate,
                on_shared_storage, preserve_ephemeral, migration,
                request_spec, allocations, provider_mapping, accel_uuids)

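    # Claim selection for the two methods above, in brief: an evacuate
    # performs a real resource claim on this (destination) host, while a
    # same-host rebuild uses a no-op claim because the resources are
    # already accounted here. Illustrative sketch only:
    #
    #   rebuild_claim = (self.rt.rebuild_claim if evacuate
    #                    else claims.NopClaim)
    #   with rebuild_claim(context, instance, scheduled_node, allocations,
    #                      limits=limits, image_meta=image_meta,
    #                      migration=migration):
    #       ...  # the rebuild only proceeds if the claim succeeds
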
    @staticmethod
    def _get_image_name(image_meta):
        if image_meta.obj_attr_is_set("name"):
            return image_meta.name
        else:
            return ''

    def _do_rebuild_instance(
            self, context, instance, orig_image_ref, image_meta,
            injected_files, new_pass, orig_sys_metadata, bdms, evacuate,
            on_shared_storage, preserve_ephemeral, migration, request_spec,
            allocations, request_group_resource_providers_mapping,
            accel_uuids):
        orig_vm_state = instance.vm_state

        if evacuate:
            if request_spec:
                # NOTE(gibi): Do a late check of server group policy as
                # parallel scheduling could violate such policy. This will
                # cause the evacuate to fail as rebuild does not implement
                # reschedule.
                hints = self._get_scheduler_hints({}, request_spec)
                self._validate_instance_group_policy(context, instance,
                                                     hints)

            if not self.driver.capabilities.get("supports_evacuate", False):
                raise exception.InstanceEvacuateNotSupported

            self._check_instance_exists(instance)

            if on_shared_storage is None:
                LOG.debug('on_shared_storage is not provided, using driver '
                          'information to decide if the instance needs to '
                          'be evacuated')
                on_shared_storage = self.driver.instance_on_disk(instance)

            elif (on_shared_storage !=
                    self.driver.instance_on_disk(instance)):
                # To cover case when admin expects that instance files are
                # on shared storage, but not accessible and vice versa
                raise exception.InvalidSharedStorage(
                    _("Invalid state of instance files on shared"
                      " storage"))

            if on_shared_storage:
                LOG.info('disk on shared storage, evacuating using'
                         ' existing disk')
            elif instance.image_ref:
                orig_image_ref = instance.image_ref
                LOG.info("disk not on shared storage, evacuating from "
                         "image: '%s'", str(orig_image_ref))
            else:
                LOG.info('disk on volume, evacuating using existing '
                         'volume')

        # We check trusted certs capabilities for both evacuate (rebuild on
        # another host) and rebuild (rebuild on the same host) because for
        # evacuate we need to make sure an instance with trusted certs can
        # have the image verified with those certs during rebuild, and for
        # rebuild we could be rebuilding a server that started out with no
        # trusted certs on this host, and then was rebuilt with trusted
        # certs for a new image, in which case we need to validate that new
        # image with the trusted certs during the rebuild.
        self._check_trusted_certs(instance)

        # This instance.exists message should contain the original
        # image_ref, not the new one. Since the DB has been updated
        # to point to the new one... we have to override it.
        orig_image_ref_url = self.image_api.generate_image_url(
            orig_image_ref, context)
        extra_usage_info = {'image_ref_url': orig_image_ref_url}
        compute_utils.notify_usage_exists(
            self.notifier, context, instance, self.host,
            current_period=True, system_metadata=orig_sys_metadata,
            extra_usage_info=extra_usage_info)

        # This message should contain the new image_ref
        extra_usage_info = {'image_name': self._get_image_name(image_meta)}
        self._notify_about_instance_usage(context, instance,
                "rebuild.start", extra_usage_info=extra_usage_info)
        # NOTE: image_name is not included in the versioned notification
        # because we already provide the image_uuid in the notification
        # payload and the image details can be looked up via the uuid.
        compute_utils.notify_about_instance_rebuild(
            context, instance, self.host,
            phase=fields.NotificationPhase.START,
            bdms=bdms)

        instance.power_state = self._get_power_state(instance)
        instance.task_state = task_states.REBUILDING
        instance.save(expected_task_state=[task_states.REBUILDING])

        if evacuate:
            self.network_api.setup_networks_on_host(
                context, instance, self.host)
            # For nova-network this is needed to move floating IPs
            # For neutron this updates the host in the port binding
            # TODO(cfriesen): this network_api call and the one above
            # are so similar, we should really try to unify them.
            self.network_api.setup_instance_network_on_host(
                context, instance, self.host, migration,
                provider_mappings=request_group_resource_providers_mapping)
            # TODO(mriedem): Consider decorating
            # setup_instance_network_on_host with @api.refresh_cache and
            # then we wouldn't need this explicit call to
            # get_instance_nw_info.
            network_info = self.network_api.get_instance_nw_info(context,
                                                                 instance)
        else:
            network_info = instance.get_network_info()

        if bdms is None:
            bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
                context, instance.uuid)

        block_device_info = \
            self._get_instance_block_device_info(
                context, instance, bdms=bdms)

        def detach_block_devices(context, bdms):
            for bdm in bdms:
                if bdm.is_volume:
                    # NOTE(ildikov): Having the attachment_id set in the BDM
                    # means that it's the new Cinder attach/detach flow
                    # (available from v3.44). In that case we explicitly
                    # attach and detach the volumes through attachment level
                    # operations. In this scenario _detach_volume will
                    # delete the existing attachment which would make the
                    # volume status change to 'available' if we don't
                    # pre-create another empty attachment before deleting
                    # the old one.
                    attachment_id = None
                    if bdm.attachment_id:
                        attachment_id = self.volume_api.attachment_create(
                            context, bdm['volume_id'], instance.uuid)['id']
                    self._detach_volume(context, bdm, instance,
                                        destroy_bdm=False)
                    if attachment_id:
                        bdm.attachment_id = attachment_id
                        bdm.save()

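        # Hedged illustration of the attachment juggling above: the empty
        # attachment created first keeps the volume reserved while the old
        # attachment goes away, instead of letting the volume flip to
        # 'available' mid-rebuild:
        #
        #   new_id = self.volume_api.attachment_create(
        #       context, bdm.volume_id, instance.uuid)['id']
        #   self._detach_volume(context, bdm, instance,
        #                       destroy_bdm=False)  # deletes old attachment
        #   bdm.attachment_id = new_id              # adopt the placeholder
        #   bdm.save()
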
        files = self._decode_files(injected_files)

        kwargs = dict(
            context=context,
            instance=instance,
            image_meta=image_meta,
            injected_files=files,
            admin_password=new_pass,
            allocations=allocations,
            bdms=bdms,
            detach_block_devices=detach_block_devices,
            attach_block_devices=self._prep_block_device,
            block_device_info=block_device_info,
            network_info=network_info,
            preserve_ephemeral=preserve_ephemeral,
            evacuate=evacuate,
            accel_uuids=accel_uuids)
        try:
            with instance.mutated_migration_context():
                self.driver.rebuild(**kwargs)
        except NotImplementedError:
            # NOTE(rpodolyaka): driver doesn't provide specialized version
            # of rebuild, fall back to the default implementation
            self._rebuild_default_impl(**kwargs)
        self._update_instance_after_spawn(instance)
        instance.save(expected_task_state=[task_states.REBUILD_SPAWNING])

        if orig_vm_state == vm_states.STOPPED:
            LOG.info("bringing vm to original state: '%s'",
                     orig_vm_state, instance=instance)
            instance.vm_state = vm_states.ACTIVE
            instance.task_state = task_states.POWERING_OFF
            instance.progress = 0
            instance.save()
            self.stop_instance(context, instance, False)
        # TODO(melwitt): We should clean up instance console tokens here in
        # the case of evacuate. The instance is on a new host and will need
        # to establish a new console connection.
        self._update_scheduler_instance_info(context, instance)
        self._notify_about_instance_usage(
            context, instance, "rebuild.end",
            network_info=network_info,
            extra_usage_info=extra_usage_info)
        compute_utils.notify_about_instance_rebuild(
            context, instance, self.host,
            phase=fields.NotificationPhase.END,
            bdms=bdms)

    def _handle_bad_volumes_detached(self, context, instance, bad_devices,
                                     block_device_info):
        """Handle cases where the virt-layer had to detach non-working
        volumes in order to complete an operation.
        """
        for bdm in block_device_info['block_device_mapping']:
            if bdm.get('mount_device') in bad_devices:
                try:
                    volume_id = bdm['connection_info']['data']['volume_id']
                except KeyError:
                    continue

                # NOTE(sirp): ideally we'd just call
                # `compute_api.detach_volume` here but since that hits the
                # DB directly, that's off limits from within the
                # compute-manager.
                #
                # API-detach
                LOG.info("Detaching from volume api: %s", volume_id)
                self.volume_api.begin_detaching(context, volume_id)

                # Manager-detach
                self.detach_volume(context, volume_id, instance)

    def _get_accel_info(self, context, instance):
        dp_name = instance.flavor.extra_specs.get('accel:device_profile')
        if dp_name:
            cyclient = cyborg.get_client(context)
            accel_info = cyclient.get_arqs_for_instance(instance.uuid)
        else:
            accel_info = []
        return accel_info

    @wrap_exception()
    @reverts_task_state
    @wrap_instance_event(prefix='compute')
    @wrap_instance_fault
    def reboot_instance(self, context, instance, block_device_info,
                        reboot_type):
        @utils.synchronized(instance.uuid)
        def do_reboot_instance(context, instance, block_device_info,
                               reboot_type):
            self._reboot_instance(context, instance, block_device_info,
                                  reboot_type)
        do_reboot_instance(context, instance, block_device_info, reboot_type)

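    # utils.synchronized(instance.uuid) above serializes operations per
    # instance within this compute service: concurrent RPC handlers for the
    # same UUID run one at a time, while different instances proceed in
    # parallel. Sketch of the recurring pattern:
    #
    #   @utils.synchronized(instance.uuid)
    #   def _locked_op():
    #       ...  # critical section keyed by the instance UUID
    #   _locked_op()
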
    def _reboot_instance(self, context, instance, block_device_info,
                         reboot_type):
        """Reboot an instance on this host."""
        # acknowledge the request made it to the manager
        if reboot_type == "SOFT":
            instance.task_state = task_states.REBOOT_PENDING
            expected_states = task_states.soft_reboot_states
        else:
            instance.task_state = task_states.REBOOT_PENDING_HARD
            expected_states = task_states.hard_reboot_states

        context = context.elevated()
        LOG.info("Rebooting instance", instance=instance)

        bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
            context, instance.uuid)
        block_device_info = self._get_instance_block_device_info(
            context, instance, bdms=bdms)

        network_info = self.network_api.get_instance_nw_info(context,
                                                             instance)

        accel_info = self._get_accel_info(context, instance)

        self._notify_about_instance_usage(context, instance, "reboot.start")
        compute_utils.notify_about_instance_action(
            context, instance, self.host,
            action=fields.NotificationAction.REBOOT,
            phase=fields.NotificationPhase.START,
            bdms=bdms
        )

        instance.power_state = self._get_power_state(instance)
        instance.save(expected_task_state=expected_states)

        if instance.power_state != power_state.RUNNING:
            state = instance.power_state
            running = power_state.RUNNING
            LOG.warning('trying to reboot a non-running instance:'
                        ' (state: %(state)s expected: %(running)s)',
                        {'state': state, 'running': running},
                        instance=instance)

        def bad_volumes_callback(bad_devices):
            self._handle_bad_volumes_detached(
                context, instance, bad_devices, block_device_info)

        try:
            # Don't change it out of rescue mode
            if instance.vm_state == vm_states.RESCUED:
                new_vm_state = vm_states.RESCUED
            else:
                new_vm_state = vm_states.ACTIVE
            new_power_state = None
            if reboot_type == "SOFT":
                instance.task_state = task_states.REBOOT_STARTED
                expected_state = task_states.REBOOT_PENDING
            else:
                instance.task_state = task_states.REBOOT_STARTED_HARD
                expected_state = task_states.REBOOT_PENDING_HARD
            instance.save(expected_task_state=expected_state)
            self.driver.reboot(context, instance,
                               network_info,
                               reboot_type,
                               block_device_info=block_device_info,
                               accel_info=accel_info,
                               bad_volumes_callback=bad_volumes_callback)

        except Exception as error:
            with excutils.save_and_reraise_exception() as ctxt:
                exc_info = sys.exc_info()
                # if the reboot failed but the VM is running don't
                # put it into an error state
                new_power_state = self._get_power_state(instance)
                if new_power_state == power_state.RUNNING:
                    LOG.warning('Reboot failed but instance is running',
                                instance=instance)
                    compute_utils.add_instance_fault_from_exc(context,
                            instance, error, exc_info)
                    self._notify_about_instance_usage(context, instance,
                            'reboot.error', fault=error)
                    compute_utils.notify_about_instance_action(
                        context, instance, self.host,
                        action=fields.NotificationAction.REBOOT,
                        phase=fields.NotificationPhase.ERROR,
                        exception=error, bdms=bdms
                    )
                    ctxt.reraise = False
                else:
                    LOG.error('Cannot reboot instance: %s', error,
                              instance=instance)
                    self._set_instance_obj_error_state(instance)

        if not new_power_state:
            new_power_state = self._get_power_state(instance)
        try:
            instance.power_state = new_power_state
            instance.vm_state = new_vm_state
            instance.task_state = None
            instance.save()
        except exception.InstanceNotFound:
            LOG.warning("Instance disappeared during reboot",
                        instance=instance)

        self._notify_about_instance_usage(context, instance, "reboot.end")
        compute_utils.notify_about_instance_action(
            context, instance, self.host,
            action=fields.NotificationAction.REBOOT,
            phase=fields.NotificationPhase.END,
            bdms=bdms
        )

    @delete_image_on_error
    def _do_snapshot_instance(self, context, image_id, instance):
        self._snapshot_instance(context, image_id, instance,
                                task_states.IMAGE_BACKUP)

    @wrap_exception()
    @reverts_task_state
    @wrap_instance_event(prefix='compute')
    @wrap_instance_fault
    def backup_instance(self, context, image_id, instance, backup_type,
                        rotation):
        """Backup an instance on this host.

        :param backup_type: daily | weekly
        :param rotation: int representing how many backups to keep around
        """
        self._do_snapshot_instance(context, image_id, instance)
        self._rotate_backups(context, instance, backup_type, rotation)

    @wrap_exception()
    @reverts_task_state
    @wrap_instance_event(prefix='compute')
    @wrap_instance_fault
    @delete_image_on_error
    def snapshot_instance(self, context, image_id, instance):
        """Snapshot an instance on this host.

        :param context: security context
        :param image_id: glance.db.sqlalchemy.models.Image.Id
        :param instance: a nova.objects.instance.Instance object
        """
        # NOTE(dave-mcnally) the task state will already be set by the api
        # but if the compute manager has crashed/been restarted prior to the
        # request getting here the task state may have been cleared so we
        # set it again and things continue normally
        try:
            instance.task_state = task_states.IMAGE_SNAPSHOT
            instance.save(
                expected_task_state=task_states.IMAGE_SNAPSHOT_PENDING)
        except exception.InstanceNotFound:
            # possibly the instance no longer exists; no point in continuing
            LOG.debug("Instance not found, could not set state %s "
                      "for instance.",
                      task_states.IMAGE_SNAPSHOT, instance=instance)
            return

        except exception.UnexpectedDeletingTaskStateError:
            LOG.debug("Instance being deleted, snapshot cannot continue",
                      instance=instance)
            return

        with self._snapshot_semaphore:
            self._snapshot_instance(context, image_id, instance,
                                    task_states.IMAGE_SNAPSHOT)

    def _snapshot_instance(self, context, image_id, instance,
                           expected_task_state):
        context = context.elevated()

        instance.power_state = self._get_power_state(instance)
        try:
            instance.save()

            LOG.info('instance snapshotting', instance=instance)

            if instance.power_state != power_state.RUNNING:
                state = instance.power_state
                running = power_state.RUNNING
                LOG.warning('trying to snapshot a non-running instance: '
                            '(state: %(state)s expected: %(running)s)',
                            {'state': state, 'running': running},
                            instance=instance)

            self._notify_about_instance_usage(
                context, instance, "snapshot.start")
            compute_utils.notify_about_instance_snapshot(context, instance,
                self.host, phase=fields.NotificationPhase.START,
                snapshot_image_id=image_id)

            def update_task_state(task_state,
                                  expected_state=expected_task_state):
                instance.task_state = task_state
                instance.save(expected_task_state=expected_state)

            with timeutils.StopWatch() as timer:
                self.driver.snapshot(context, instance, image_id,
                                     update_task_state)
            LOG.info('Took %0.2f seconds to snapshot the instance on '
                     'the hypervisor.', timer.elapsed(), instance=instance)

            instance.task_state = None
            instance.save(expected_task_state=task_states.IMAGE_UPLOADING)

            self._notify_about_instance_usage(context, instance,
                                              "snapshot.end")
            compute_utils.notify_about_instance_snapshot(context, instance,
                self.host, phase=fields.NotificationPhase.END,
                snapshot_image_id=image_id)
        except (exception.InstanceNotFound,
                exception.InstanceNotRunning,
                exception.UnexpectedDeletingTaskStateError):
            # the instance got deleted during the snapshot
            # Quickly bail out of here
            msg = 'Instance disappeared during snapshot'
            LOG.debug(msg, instance=instance)
            try:
                image = self.image_api.get(context, image_id)
                if image['status'] != 'active':
                    self.image_api.delete(context, image_id)
            except exception.ImageNotFound:
                LOG.debug('Image not found during clean up %s', image_id)
            except Exception:
                LOG.warning("Error while trying to clean up image %s",
                            image_id, instance=instance)
        except exception.ImageNotFound:
            instance.task_state = None
            instance.save()
            LOG.warning("Image not found during snapshot",
                        instance=instance)

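    # The update_task_state callback above lets the driver checkpoint the
    # snapshot with guarded task-state transitions. A typical sequence
    # (driver-dependent; shown purely as an assumed illustration):
    #
    #   update_task_state(task_states.IMAGE_PENDING_UPLOAD)
    #   ...  # driver captures the guest disk
    #   update_task_state(task_states.IMAGE_UPLOADING,
    #                     expected_state=task_states.IMAGE_PENDING_UPLOAD)
    #   ...  # driver streams the snapshot to the image service
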
    def _post_interrupted_snapshot_cleanup(self, context, instance):
        self.driver.post_interrupted_snapshot_cleanup(context, instance)

    @messaging.expected_exceptions(NotImplementedError)
    @wrap_exception()
    def volume_snapshot_create(self, context, instance, volume_id,
                               create_info):
        try:
            self.driver.volume_snapshot_create(context, instance, volume_id,
                                               create_info)
        except exception.InstanceNotRunning:
            # Libvirt driver can raise this exception
            LOG.debug('Instance disappeared during volume snapshot create',
                      instance=instance)

    @messaging.expected_exceptions(NotImplementedError)
    @wrap_exception()
    def volume_snapshot_delete(self, context, instance, volume_id,
                               snapshot_id, delete_info):
        try:
            self.driver.volume_snapshot_delete(context, instance, volume_id,
                                               snapshot_id, delete_info)
        except exception.InstanceNotRunning:
            # Libvirt driver can raise this exception
            LOG.debug('Instance disappeared during volume snapshot delete',
                      instance=instance)

    @wrap_instance_fault
    def _rotate_backups(self, context, instance, backup_type, rotation):
        """Delete excess backups associated with an instance.

        Instances are allowed a fixed number of backups (the rotation
        number); this method deletes the oldest backups that exceed the
        rotation threshold.

        :param context: security context
        :param instance: Instance dict
        :param backup_type: a user-defined type, like "daily" or "weekly"
            etc.
        :param rotation: int representing how many backups to keep around;
            None if rotation shouldn't be used (as in the case of snapshots)
        """
        filters = {'property-image_type': 'backup',
                   'property-backup_type': backup_type,
                   'property-instance_uuid': instance.uuid}

        images = self.image_api.get_all(context, filters=filters,
                                        sort_key='created_at',
                                        sort_dir='desc')
        num_images = len(images)
        LOG.debug("Found %(num_images)d images (rotation: %(rotation)d)",
                  {'num_images': num_images, 'rotation': rotation},
                  instance=instance)

        if num_images > rotation:
            # NOTE(sirp): this deletes all backups that exceed the rotation
            # limit
            excess = len(images) - rotation
            LOG.debug("Rotating out %d backups", excess,
                      instance=instance)
            for i in range(excess):
                image = images.pop()
                image_id = image['id']
                LOG.debug("Deleting image %s", image_id,
                          instance=instance)
                try:
                    self.image_api.delete(context, image_id)
                except exception.ImageNotFound:
                    LOG.info("Failed to find image %(image_id)s to "
                             "delete", {'image_id': image_id},
                             instance=instance)
                except (exception.ImageDeleteConflict, Exception) as exc:
                    LOG.info("Failed to delete image %(image_id)s during "
                             "deleting excess backups. "
                             "Continuing with next image... %(exc)s",
                             {'image_id': image_id, 'exc': exc},
                             instance=instance)

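    # Worked example of the rotation above: get_all() returns the images
    # sorted by created_at descending (newest first), so images.pop()
    # removes from the oldest end. With five backups and rotation=3:
    #
    #   images = [img5, img4, img3, img2, img1]   # newest ... oldest
    #   excess = 5 - 3                            # == 2
    #   # two pop() calls delete img1 then img2; img5..img3 survive
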
    @wrap_exception()
    @reverts_task_state
    @wrap_instance_event(prefix='compute')
    @wrap_instance_fault
    def set_admin_password(self, context, instance, new_pass):
        """Set the root/admin password for an instance on this host.

        This is generally only called by API password resets after an
        image has been built.

        :param context: Nova auth context.
        :param instance: Nova instance object.
        :param new_pass: The admin password for the instance.
        """

        context = context.elevated()
        current_power_state = self._get_power_state(instance)
        expected_state = power_state.RUNNING

        if current_power_state != expected_state:
            instance.task_state = None
            instance.save(expected_task_state=task_states.UPDATING_PASSWORD)
            _msg = _('instance %s is not running') % instance.uuid
            raise exception.InstancePasswordSetFailed(
                instance=instance.uuid, reason=_msg)

        try:
            self.driver.set_admin_password(instance, new_pass)
            LOG.info("Admin password set", instance=instance)
            instance.task_state = None
            instance.save(
                expected_task_state=task_states.UPDATING_PASSWORD)
        except exception.InstanceAgentNotEnabled:
            with excutils.save_and_reraise_exception():
                LOG.debug('Guest agent is not enabled for the instance.',
                          instance=instance)
                instance.task_state = None
                instance.save(
                    expected_task_state=task_states.UPDATING_PASSWORD)
        except exception.SetAdminPasswdNotSupported:
            with excutils.save_and_reraise_exception():
                LOG.info('set_admin_password is not supported '
                         'by this driver or guest instance.',
                         instance=instance)
                instance.task_state = None
                instance.save(
                    expected_task_state=task_states.UPDATING_PASSWORD)
        except NotImplementedError:
            LOG.warning('set_admin_password is not implemented '
                        'by this driver or guest instance.',
                        instance=instance)
            instance.task_state = None
            instance.save(
                expected_task_state=task_states.UPDATING_PASSWORD)
            raise NotImplementedError(_('set_admin_password is not '
                                        'implemented by this driver or '
                                        'guest instance.'))
        except exception.UnexpectedTaskStateError:
            # interrupted by another (most likely delete) task
            # do not retry
            raise
        except Exception:
            # Catch all here because this could be anything.
            LOG.exception('set_admin_password failed', instance=instance)
            # We create a new exception here so that we won't
            # potentially reveal password information to the
            # API caller. The real exception is logged above
            _msg = _('error setting admin password')
            raise exception.InstancePasswordSetFailed(
                instance=instance.uuid, reason=_msg)

    def _get_rescue_image(self, context, instance, rescue_image_ref=None):
        """Determine what image should be used to boot the rescue VM."""
        # 1. If rescue_image_ref is passed in, use that for rescue.
        # 2. Else, use the base image associated with instance's current
        #    image. The idea here is to provide the customer with a rescue
        #    environment which they are familiar with. So, if they built
        #    their instance off of a Debian image, their rescue VM will
        #    also be Debian.
        # 3. As a last resort, use instance's current image.
        if not rescue_image_ref:
            system_meta = utils.instance_sys_meta(instance)
            rescue_image_ref = system_meta.get('image_base_image_ref')

        if not rescue_image_ref:
            LOG.warning('Unable to find a different image to use for '
                        'rescue VM, using instance\'s current image',
                        instance=instance)
            rescue_image_ref = instance.image_ref

        return objects.ImageMeta.from_image_ref(
            context, self.image_api, rescue_image_ref)

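    # The precedence implemented above, condensed (illustrative only):
    #
    #   rescue_image_ref = (rescue_image_ref or
    #                       system_meta.get('image_base_image_ref') or
    #                       instance.image_ref)
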
    @wrap_exception()
    @reverts_task_state
    @wrap_instance_event(prefix='compute')
    @wrap_instance_fault
    def rescue_instance(self, context, instance, rescue_password,
                        rescue_image_ref, clean_shutdown):
        context = context.elevated()
        LOG.info('Rescuing', instance=instance)

        admin_password = (rescue_password if rescue_password else
                          utils.generate_password())

        network_info = self.network_api.get_instance_nw_info(context,
                                                             instance)

        rescue_image_meta = self._get_rescue_image(context, instance,
                                                   rescue_image_ref)

        bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
            context, instance.uuid)
        block_device_info = self._get_instance_block_device_info(
            context, instance, bdms=bdms)

        extra_usage_info = {'rescue_image_name':
                            self._get_image_name(rescue_image_meta)}
        self._notify_about_instance_usage(context, instance,
            "rescue.start", extra_usage_info=extra_usage_info,
            network_info=network_info)
        compute_utils.notify_about_instance_rescue_action(
            context, instance, self.host, rescue_image_ref,
            phase=fields.NotificationPhase.START)

        try:
            self._power_off_instance(instance, clean_shutdown)

            self.driver.rescue(context, instance, network_info,
                               rescue_image_meta, admin_password,
                               block_device_info)
        except Exception as e:
            LOG.exception("Error trying to Rescue Instance",
                          instance=instance)
            self._set_instance_obj_error_state(instance)
            raise exception.InstanceNotRescuable(
                instance_id=instance.uuid,
                reason=_("Driver Error: %s") % e)

        compute_utils.notify_usage_exists(self.notifier, context, instance,
                                          self.host, current_period=True)

        instance.vm_state = vm_states.RESCUED
        instance.task_state = None
        instance.power_state = self._get_power_state(instance)
        instance.launched_at = timeutils.utcnow()
        instance.save(expected_task_state=task_states.RESCUING)

        self._notify_about_instance_usage(context, instance,
            "rescue.end", extra_usage_info=extra_usage_info,
            network_info=network_info)
        compute_utils.notify_about_instance_rescue_action(
            context, instance, self.host, rescue_image_ref,
            phase=fields.NotificationPhase.END)

    @wrap_exception()
    @reverts_task_state
    @wrap_instance_event(prefix='compute')
    @wrap_instance_fault
    def unrescue_instance(self, context, instance):
        orig_context = context
        context = context.elevated()
        LOG.info('Unrescuing', instance=instance)

        network_info = self.network_api.get_instance_nw_info(context,
                                                             instance)
        self._notify_about_instance_usage(context, instance,
            "unrescue.start", network_info=network_info)
        compute_utils.notify_about_instance_action(context, instance,
            self.host, action=fields.NotificationAction.UNRESCUE,
            phase=fields.NotificationPhase.START)

        with self._error_out_instance_on_exception(context, instance):
            self.driver.unrescue(orig_context, instance)

        instance.vm_state = vm_states.ACTIVE
        instance.task_state = None
        instance.power_state = self._get_power_state(instance)
        instance.save(expected_task_state=task_states.UNRESCUING)

        self._notify_about_instance_usage(context,
                                          instance,
                                          "unrescue.end",
                                          network_info=network_info)
        compute_utils.notify_about_instance_action(context, instance,
            self.host, action=fields.NotificationAction.UNRESCUE,
            phase=fields.NotificationPhase.END)

4240 @wrap_exception()
4241 @wrap_instance_fault
4242 def change_instance_metadata(self, context, diff, instance):
4243 """Update the metadata published to the instance."""
4244 LOG.debug("Changing instance metadata according to %r",
4245 diff, instance=instance)
4246 self.driver.change_instance_metadata(context, instance, diff)
4247
4248 @wrap_exception()
4249 @wrap_instance_event(prefix='compute')
4250 @errors_out_migration
4251 @wrap_instance_fault
4252 def confirm_resize(self, context, instance, migration):
4253 """Confirms a migration/resize and deletes the 'old' instance.
4254
4255 This is called from the API and runs on the source host.
4256
4257 Nothing needs to happen on the destination host at this point since
4258 the instance is already running there. This routine just cleans up the
4259 source host.
4260 """
4261 @utils.synchronized(instance.uuid)
4262 def do_confirm_resize(context, instance, migration):
4263 LOG.debug("Going to confirm migration %s", migration.id,
4264 instance=instance)
4265
4266 if migration.status == 'confirmed':
4267 LOG.info("Migration %s is already confirmed",
4268 migration.id, instance=instance)
4269 return
4270
4271 if migration.status not in ('finished', 'confirming'):
4272 LOG.warning("Unexpected confirmation status '%(status)s' "
4273 "of migration %(id)s, exit confirmation process",
4274 {"status": migration.status, "id": migration.id},
4275 instance=instance)
4276 return
4277
4278 # NOTE(wangpan): Get the instance from the DB; if it has been
4279 # deleted, we do nothing and return here.
4280 expected_attrs = ['metadata', 'system_metadata', 'flavor']
4281 try:
4282 instance = objects.Instance.get_by_uuid(
4283 context, instance.uuid,
4284 expected_attrs=expected_attrs)
4285 except exception.InstanceNotFound:
4286 LOG.info("Instance is not found during confirmation",
4287 instance=instance)
4288 return
4289
4290 with self._error_out_instance_on_exception(context, instance):
4291 try:
4292 self._confirm_resize(
4293 context, instance, migration=migration)
4294 except Exception:
4295 # Something failed when cleaning up the source host so
4296 # log a traceback and leave a hint about hard rebooting
4297 # the server to correct its state in the DB.
4298 with excutils.save_and_reraise_exception(logger=LOG):
4299 LOG.exception(
4300 'Confirm resize failed on source host %s. '
4301 'Resource allocations in the placement service '
4302 'will be removed regardless because the instance '
4303 'is now on the destination host %s. You can try '
4304 'hard rebooting the instance to correct its '
4305 'state.', self.host, migration.dest_compute,
4306 instance=instance)
4307 finally:
4308 # Whether an error occurred or not, at this point the
4309 # instance is on the dest host. Avoid leaking allocations
4310 # in placement by deleting them here...
4311 self._delete_allocation_after_move(
4312 context, instance, migration)
4313 # ...inform the scheduler about the move...
4314 self._delete_scheduler_instance_info(
4315 context, instance.uuid)
4316 # ...and unset the cached flavor information (this is done
4317 # last since the resource tracker relies on it for its
4318 # periodic tasks)
4319 self._delete_stashed_flavor_info(instance)
4320
4321 do_confirm_resize(context, instance, migration)
4322
4323 def _get_updated_nw_info_with_pci_mapping(self, nw_info, pci_mapping):
4324 # NOTE(adrianc): This method returns a copy of nw_info if modifications
4325 # are made; otherwise it returns the original nw_info.
4326 updated_nw_info = nw_info
4327 if nw_info and pci_mapping:
4328 updated_nw_info = copy.deepcopy(nw_info)
4329 for vif in updated_nw_info:
4330 if vif['vnic_type'] in network_model.VNIC_TYPES_SRIOV:
4331 try:
4332 vif_pci_addr = vif['profile']['pci_slot']
4333 new_addr = pci_mapping[vif_pci_addr].address
4334 vif['profile']['pci_slot'] = new_addr
4335 LOG.debug("Updating VIF's PCI address for VIF %(id)s. "
4336 "Original value %(orig_val)s, "
4337 "new value %(new_val)s",
4338 {'id': vif['id'],
4339 'orig_val': vif_pci_addr,
4340 'new_val': new_addr})
4341 except (KeyError, AttributeError):
4342 with excutils.save_and_reraise_exception():
4343 # NOTE(adrianc): This should never happen. If we
4344 # get here it means there is some inconsistency
4345 # with either 'nw_info' or 'pci_mapping'.
4346 LOG.error("Unexpected error when updating network "
4347 "information with PCI mapping.")
4348 return updated_nw_info
4349
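# Editorial sketch (illustrative only, not part of nova): the copy-on-write
# pattern used above, reduced to its core. The input is returned untouched
# unless a modification is actually required, so callers can compare the
# result against the original. All names here are hypothetical.
def _example_copy_on_write_update(items, mapping):
    updated = items
    if items and mapping:
        updated = copy.deepcopy(items)
        for item in updated:
            if item.get('key') in mapping:
                # Mutate only the deep copy, never the caller's data.
                item['key'] = mapping[item['key']]
    return updated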
4350 def _confirm_resize(self, context, instance, migration=None):
4351 """Destroys the source instance."""
4352 self._notify_about_instance_usage(context, instance,
4353 "resize.confirm.start")
4354 compute_utils.notify_about_instance_action(context, instance,
4355 self.host, action=fields.NotificationAction.RESIZE_CONFIRM,
4356 phase=fields.NotificationPhase.START)
4357
4358 # NOTE(tr3buchet): tear down networks on source host
4359 self.network_api.setup_networks_on_host(context, instance,
4360 migration.source_compute, teardown=True)
4361
4362 # TODO(stephenfin): These next three calls should be bundled
4363 network_info = self.network_api.get_instance_nw_info(context,
4364 instance)
4365
4366 # NOTE(adrianc): Populate old PCI device in VIF profile
4367 # to allow virt driver to properly unplug it from Hypervisor.
4368 pci_mapping = (instance.migration_context.
4369 get_pci_mapping_for_migration(True))
4370 network_info = self._get_updated_nw_info_with_pci_mapping(
4371 network_info, pci_mapping)
4372
4373 self.driver.confirm_migration(context, migration, instance,
4374 network_info)
4375
4376 # Free up the old_flavor usage from the resource tracker for this host.
4377 self.rt.drop_move_claim_at_source(context, instance, migration)
4378
4379 # NOTE(mriedem): The old_vm_state could be STOPPED but the user
4380 # might have manually powered up the instance to confirm the
4381 # resize/migrate, so we need to check the current power state
4382 # on the instance and set the vm_state appropriately. We default
4383 # to ACTIVE because if the power state is not SHUTDOWN, we
4384 # assume _sync_instance_power_state will clean it up.
4385 p_state = instance.power_state
4386 vm_state = None
4387 if p_state == power_state.SHUTDOWN:
4388 vm_state = vm_states.STOPPED
4389 LOG.debug("Resized/migrated instance is powered off. "
4390 "Setting vm_state to '%s'.", vm_state,
4391 instance=instance)
4392 else:
4393 vm_state = vm_states.ACTIVE
4394
4395 instance.vm_state = vm_state
4396 instance.task_state = None
4397 instance.save(expected_task_state=[None, task_states.DELETING,
4398 task_states.SOFT_DELETING])
4399
4400 self._notify_about_instance_usage(
4401 context, instance, "resize.confirm.end",
4402 network_info=network_info)
4403 compute_utils.notify_about_instance_action(context, instance,
4404 self.host, action=fields.NotificationAction.RESIZE_CONFIRM,
4405 phase=fields.NotificationPhase.END)
4406
4407 def _delete_allocation_after_move(self, context, instance, migration):
4408 """Deletes resource allocations held by the migration record against
4409 the source compute node resource provider after a confirmed cold /
4410 successful live migration.
4411 """
4412 try:
4413 # NOTE(danms): We're finishing on the source node, so try
4414 # to delete the allocation based on the migration uuid
4415 self.reportclient.delete_allocation_for_instance(
4416 context, migration.uuid, consumer_type='migration')
4417 except exception.AllocationDeleteFailed:
4418 LOG.error('Deleting allocation in placement for migration '
4419 '%(migration_uuid)s failed. The instance '
4420 '%(instance_uuid)s will be put into ERROR state '
4421 'but the allocation held by the migration is '
4422 'leaked.',
4423 {'instance_uuid': instance.uuid,
4424 'migration_uuid': migration.uuid})
4425 raise
4426
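# Editorial note (illustrative, not part of nova): while a move operation
# is in flight, placement holds two sets of allocations for the server,
# roughly:
#
#     migration.uuid -> source node provider (old flavor)
#     instance.uuid  -> dest node provider (new flavor)
#
# _delete_allocation_after_move removes the migration-held half once the
# move has been confirmed or the live migration has succeeded.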
4427 def _delete_stashed_flavor_info(self, instance):
4428 """Remove information about the flavor change after a resize."""
4429 instance.old_flavor = None
4430 instance.new_flavor = None
4431 instance.system_metadata.pop('old_vm_state', None)
4432 instance.save()
4433
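# Editorial note (illustrative, not part of nova): old_flavor, new_flavor
# and the 'old_vm_state' system_metadata key are stashed during the resize
# flow (see _prep_resize below) and survive until the resize is confirmed
# or reverted, at which point _delete_stashed_flavor_info clears them.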
4434 @wrap_exception()
4435 @wrap_instance_event(prefix='compute')
4436 @errors_out_migration
4437 @wrap_instance_fault
4438 def confirm_snapshot_based_resize_at_source(
4439 self, ctxt, instance, migration):
4440 """Confirms a snapshot-based resize on the source host.
4441
4442 Cleans the guest, including disks, from the source hypervisor and
4443 drops the MoveClaim, which frees up "old_flavor" usage from the
4444 ResourceTracker.
4445
4446 Deletes the allocations held by the migration consumer against the
4447 source compute node resource provider.
4448
4449 :param ctxt: nova auth request context targeted at the source cell
4450 :param instance: Instance object being resized which should have the
4451 "old_flavor" attribute set
4452 :param migration: Migration object for the resize operation
4453 """
4454
4455 @utils.synchronized(instance.uuid)
4456 def do_confirm():
4457 LOG.info('Confirming resize on source host.', instance=instance)
4458 with self._error_out_instance_on_exception(ctxt, instance):
4459 # TODO(mriedem): Could probably make this try/except/finally
4460 # a context manager to share with confirm_resize().
4461 try:
4462 self._confirm_snapshot_based_resize_at_source(
4463 ctxt, instance, migration)
4464 except Exception:
4465 # Something failed when cleaning up the source host so
4466 # log a traceback and leave a hint about hard rebooting
4467 # the server to correct its state in the DB.
4468 with excutils.save_and_reraise_exception(logger=LOG):
4469 LOG.exception(
4470 'Confirm resize failed on source host %s. '
4471 'Resource allocations in the placement service '
4472 'will be removed regardless because the instance '
4473 'is now on the destination host %s. You can try '
4474 'hard rebooting the instance to correct its '
4475 'state.', self.host, migration.dest_compute,
4476 instance=instance)
4477 finally:
4478 # Whether an error occurred or not, at this point the
4479 # instance is on the dest host so to avoid leaking
4480 # allocations in placement, delete them here.
4481 # TODO(mriedem): Should we catch and just log
4482 # AllocationDeleteFailed? What is the user's recourse if
4483 # we got this far but this fails? At this point the
4484 # instance is on the target host and the allocations
4485 # could just be manually cleaned up by the operator.
4486 self._delete_allocation_after_move(ctxt, instance,
4487 migration)
4488 do_confirm()
4489
4490 def _confirm_snapshot_based_resize_at_source(
4491 self, ctxt, instance, migration):
4492 """Private version of confirm_snapshot_based_resize_at_source
4493
4494 This allows the main method to be decorated with error handlers.
4495
4496 :param ctxt: nova auth request context targeted at the source cell
4497 :param instance: Instance object being resized which should have the
4498 "old_flavor" attribute set
4499 :param migration: Migration object for the resize operation
4500 """
4501 # Clean up the guest from the hypervisor, including local disks.
4502 network_info = self.network_api.get_instance_nw_info(ctxt, instance)
4503 LOG.debug('Cleaning up guest from source hypervisor including disks.',
4504 instance=instance)
4505
4506 # FIXME(mriedem): Per bug 1809095, _confirm_resize calls
4507 # _get_updated_nw_info_with_pci_mapping here prior to unplugging
4508 # VIFs on the source, but in our case we have already unplugged
4509 # VIFs during prep_snapshot_based_resize_at_source, so what do we
4510 # need to do about those kinds of ports? Do we need to wait to unplug
4511 # VIFs until confirm like normal resize?
4512
4513 # Note that prep_snapshot_based_resize_at_source already destroyed the
4514 # guest, which disconnected volumes and unplugged VIFs, but did not
4515 # destroy disks in case something failed during the resize and the
4516 # instance needed to be rebooted or rebuilt on the source host. Now
4517 # that we are confirming the resize we want to clean up the disks left
4518 # on the source host. We call cleanup() instead of destroy() to avoid
4519 # any InstanceNotFound confusion from the driver since the guest was
4520 # already destroyed on this host. Passing block_device_info=None and
4521 # destroy_vifs=False means cleanup() will not try to disconnect volumes
4522 # or unplug VIFs.
4523 self.driver.cleanup(
4524 ctxt, instance, network_info, block_device_info=None,
4525 destroy_disks=True, destroy_vifs=False)
4526
4527 # Delete port bindings for the source host.
4528 self._confirm_snapshot_based_resize_delete_port_bindings(
4529 ctxt, instance)
4530
4531 # Delete volume attachments for the source host.
4532 self._delete_volume_attachments(ctxt, instance.get_bdms())
4533
4534 # Free up the old_flavor usage from the resource tracker for this host.
4535 self.rt.drop_move_claim_at_source(ctxt, instance, migration)
4536
4537 def _confirm_snapshot_based_resize_delete_port_bindings(
4538 self, ctxt, instance):
4539 """Delete port bindings for the source host when confirming
4540 snapshot-based resize on the source host.
4541
4542 :param ctxt: nova auth RequestContext
4543 :param instance: Instance object that was resized/cold migrated
4544 """
4545 LOG.debug('Deleting port bindings for source host.',
4546 instance=instance)
4547 try:
4548 self.network_api.cleanup_instance_network_on_host(
4549 ctxt, instance, self.host)
4550 except exception.PortBindingDeletionFailed as e:
4551 # Do not let this stop us from cleaning up since the guest
4552 # is already gone.
4553 LOG.error('Failed to delete port bindings from source host. '
4554 'Error: %s', six.text_type(e), instance=instance)
4555
4556 def _delete_volume_attachments(self, ctxt, bdms):
4557 """Deletes volume attachment records for the given bdms.
4558
4559 This method will log but not re-raise any exceptions if the volume
4560 attachment delete fails.
4561
4562 :param ctxt: nova auth request context used to make
4563 DELETE /attachments/{attachment_id} requests to cinder.
4564 :param bdms: objects.BlockDeviceMappingList representing volume
4565 attachments to delete based on BlockDeviceMapping.attachment_id.
4566 """
4567 for bdm in bdms:
4568 if bdm.attachment_id:
4569 try:
4570 self.volume_api.attachment_delete(ctxt, bdm.attachment_id)
4571 except Exception as e:
4572 LOG.error('Failed to delete volume attachment with ID %s. '
4573 'Error: %s', bdm.attachment_id, six.text_type(e),
4574 instance_uuid=bdm.instance_uuid)
4575
4576 @wrap_exception()
4577 @reverts_task_state
4578 @wrap_instance_event(prefix='compute')
4579 @errors_out_migration
4580 @wrap_instance_fault
4581 def revert_snapshot_based_resize_at_dest(self, ctxt, instance, migration):
4582 """Reverts a snapshot-based resize at the destination host.
4583
4584 Cleans the guest and related resources (ports, volumes) from the
4585 destination host's hypervisor and frees resource usage from the
4586 compute service on that host.
4587
4588 :param ctxt: nova auth request context targeted at the target cell
4589 :param instance: Instance object whose vm_state is "resized" and
4590 task_state is "resize_reverting".
4591 :param migration: Migration object whose status is "reverting".
4592 """
4593 # A resize revert is essentially a resize back to the old size, so we
4594 # need to send a usage event here.
4595 compute_utils.notify_usage_exists(
4596 self.notifier, ctxt, instance, self.host, current_period=True)
4597
4598 @utils.synchronized(instance.uuid)
4599 def do_revert():
4600 LOG.info('Reverting resize on destination host.',
4601 instance=instance)
4602 with self._error_out_instance_on_exception(ctxt, instance):
4603 self._revert_snapshot_based_resize_at_dest(
4604 ctxt, instance, migration)
4605 do_revert()
4606
4607 # Broadcast to all schedulers that the instance is no longer on
4608 # this host and clear any waiting callback events. This is best effort
4609 # so if anything fails just log it.
4610 try:
4611 self._delete_scheduler_instance_info(ctxt, instance.uuid)
4612 self.instance_events.clear_events_for_instance(instance)
4613 except Exception as e:
4614 LOG.warning('revert_snapshot_based_resize_at_dest failed during '
4615 'post-processing. Error: %s', e, instance=instance)
4616
4617 def _revert_snapshot_based_resize_at_dest(
4618 self, ctxt, instance, migration):
4619 """Private version of revert_snapshot_based_resize_at_dest.
4620
4621 This allows the main method to be decorated with error handlers.
4622
4623 :param ctxt: nova auth request context targeted at the target cell
4624 :param instance: Instance object whose vm_state is "resized" and
4625 task_state is "resize_reverting".
4626 :param migration: Migration object whose status is "reverting".
4627 """
4628 # Clean up the guest from the hypervisor, including local disks.
4629 network_info = self.network_api.get_instance_nw_info(ctxt, instance)
4630 bdms = instance.get_bdms()
4631 block_device_info = self._get_instance_block_device_info(
4632 ctxt, instance, bdms=bdms)
4633 LOG.debug('Destroying guest from destination hypervisor including '
4634 'disks.', instance=instance)
4635 self.driver.destroy(
4636 ctxt, instance, network_info, block_device_info=block_device_info)
4637
4638 # Activate source host port bindings. We need to do this before
4639 # deleting the (active) dest host port bindings in
4640 # setup_networks_on_host, otherwise the ports will be unbound and
4641 # the finish operation on the source host will fail.
4642 # migrate_instance_start uses migration.dest_compute for the port
4643 # binding host and since we want to activate the source host port
4644 # bindings, we need to temporarily mutate the migration object.
4645 with utils.temporary_mutation(
4646 migration, dest_compute=migration.source_compute):
4647 LOG.debug('Activating port bindings for source host %s.',
4648 migration.source_compute, instance=instance)
4649 # TODO(mriedem): https://review.opendev.org/#/c/594139/ would allow
4650 # us to remove this and make setup_networks_on_host do it.
4651 # TODO(mriedem): Should we try/except/log any errors but continue?
4652 self.network_api.migrate_instance_start(
4653 ctxt, instance, migration)
4654
4655 # Delete port bindings for the target host.
4656 LOG.debug('Deleting port bindings for target host %s.',
4657 self.host, instance=instance)
4658 try:
4659 # Note that deleting the destination host port bindings does
4660 # not automatically activate the source host port bindings.
4661 self.network_api.cleanup_instance_network_on_host(
4662 ctxt, instance, self.host)
4663 except exception.PortBindingDeletionFailed as e:
4664 # Do not let this stop us from cleaning up since the guest
4665 # is already gone.
4666 LOG.error('Failed to delete port bindings from target host. '
4667 'Error: %s', six.text_type(e), instance=instance)
4668
4669 # Delete any volume attachments remaining for this target host.
4670 LOG.debug('Deleting volume attachments for target host.',
4671 instance=instance)
4672 self._delete_volume_attachments(ctxt, bdms)
4673
4674 # Free up the new_flavor usage from the resource tracker for this host.
4675 self.rt.drop_move_claim_at_dest(ctxt, instance, migration)
4676
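# Editorial sketch (illustrative only, not part of nova): roughly how a
# temporary-mutation helper like utils.temporary_mutation, used above, can
# be implemented; contextlib is already imported at the top of this module.
# The function name is hypothetical.
@contextlib.contextmanager
def _example_temporary_mutation(obj, **kwargs):
    # Stash the attributes we are about to overwrite.
    saved = {attr: getattr(obj, attr) for attr in kwargs}
    for attr, value in kwargs.items():
        setattr(obj, attr, value)
    try:
        yield obj
    finally:
        # Restore the original values even if the body raised.
        for attr, value in saved.items():
            setattr(obj, attr, value)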
4677 def _revert_instance_flavor_host_node(self, instance, migration):
4678 """Revert host, node and flavor fields after a resize-revert."""
4679 self._set_instance_info(instance, instance.old_flavor)
4680 instance.host = migration.source_compute
4681 instance.node = migration.source_node
4682 instance.save(expected_task_state=[task_states.RESIZE_REVERTING])
4683
4684 @wrap_exception()
4685 @reverts_task_state
4686 @wrap_instance_event(prefix='compute')
4687 @errors_out_migration
4688 @wrap_instance_fault
4689 def finish_revert_snapshot_based_resize_at_source(
4690 self, ctxt, instance, migration):
4691 """Reverts a snapshot-based resize at the source host.
4692
4693 Spawns the guest, re-connects volumes/VIFs on the source host, and
4694 reverts the instance to use the old_flavor for resource usage reporting.
4695
4696 Updates allocations in the placement service to move the source node
4697 allocations, held by the migration record, to the instance and drop
4698 the allocations held by the instance on the destination node.
4699
4700 :param ctxt: nova auth request context targeted at the target cell
4701 :param instance: Instance object whose vm_state is "resized" and
4702 task_state is "resize_reverting".
4703 :param migration: Migration object whose status is "reverting".
4704 """
4705
4706 @utils.synchronized(instance.uuid)
4707 def do_revert():
4708 LOG.info('Reverting resize on source host.', instance=instance)
4709 with self._error_out_instance_on_exception(ctxt, instance):
4710 self._finish_revert_snapshot_based_resize_at_source(
4711 ctxt, instance, migration)
4712
4713 try:
4714 do_revert()
4715 finally:
4716 self._delete_stashed_flavor_info(instance)
4717
4718 # Broadcast to all schedulers that the instance is on this host.
4719 # This is best effort so if anything fails just log it.
4720 try:
4721 self._update_scheduler_instance_info(ctxt, instance)
4722 except Exception as e:
4723 LOG.warning('finish_revert_snapshot_based_resize_at_source failed '
4724 'during post-processing. Error: %s', e,
4725 instance=instance)
4726
4727 def _finish_revert_snapshot_based_resize_at_source(
4728 self, ctxt, instance, migration):
4729 """Private version of finish_revert_snapshot_based_resize_at_source.
4730
4731 This allows the main method to be decorated with error handlers.
4732
4733 :param ctxt: nova auth request context targeted at the source cell
4734 :param instance: Instance object whose vm_state is "resized" and
4735 task_state is "resize_reverting".
4736 :param migration: Migration object whose status is "reverting".
4737 """
4738 # Get stashed old_vm_state information to determine if guest should
4739 # be powered on after spawn; we default to ACTIVE for backwards
4740 # compatibility if old_vm_state is not set
4741 old_vm_state = instance.system_metadata.get(
4742 'old_vm_state', vm_states.ACTIVE)
4743
4744 # Revert the flavor and host/node fields to their previous values
4745 self._revert_instance_flavor_host_node(instance, migration)
4746
4747 # Move the allocations against the source compute node resource
4748 # provider, held by the migration, to the instance which will drop
4749 # the destination compute node resource provider allocations held by
4750 # the instance. This puts the allocations against the source node
4751 # back to the old_flavor and owned by the instance.
4752 try:
4753 self._revert_allocation(ctxt, instance, migration)
4754 except exception.AllocationMoveFailed:
4755 # Log the error but do not re-raise because we want to continue to
4756 # process ports and volumes below.
4757 LOG.error('Reverting allocation in placement for migration '
4758 '%(migration_uuid)s failed. You may need to manually '
4759 'remove the allocations for the migration consumer '
4760 'against the source node resource provider '
4761 '%(source_provider)s and the allocations for the '
4762 'instance consumer against the destination node '
4763 'resource provider %(dest_provider)s and then run the '
4764 '"nova-manage placement heal_allocations" command.',
4765 {'instance_uuid': instance.uuid,
4766 'migration_uuid': migration.uuid,
4767 'source_provider': migration.source_node,
4768 'dest_provider': migration.dest_node},
4769 instance=instance)
4770
4771 bdms = instance.get_bdms()
4772 # prep_snapshot_based_resize_at_source created empty volume attachments
4773 # that we need to update here to get the connection_info before calling
4774 # driver.finish_revert_migration, which will connect the volumes to
4775 # this host.
4776 LOG.debug('Updating volume attachments for source host %s.',
4777 self.host, instance=instance)
4778 # TODO(mriedem): We should probably make _update_volume_attachments
4779 # (optionally) graceful to errors so we (1) try to process all
4780 # attachments and (2) continue to process networking below.
4781 self._update_volume_attachments(ctxt, instance, bdms)
4782
4783 LOG.debug('Updating port bindings for source host %s.',
4784 self.host, instance=instance)
4785 # TODO(mriedem): Calculate provider mappings when we support
4786 # cross-cell resize/migrate with ports having resource requests.
4787 self._finish_revert_resize_network_migrate_finish(
4788 ctxt, instance, migration, provider_mappings=None)
4789 network_info = self.network_api.get_instance_nw_info(ctxt, instance)
4790
4791 # Remember that prep_snapshot_based_resize_at_source destroyed the
4792 # guest but left the disks intact, so we cannot call spawn() here;
4793 # finish_revert_migration should do the job.
4794 block_device_info = self._get_instance_block_device_info(
4795 ctxt, instance, bdms=bdms)
4796 power_on = old_vm_state == vm_states.ACTIVE
4797 driver_error = None
4798 try:
4799 self.driver.finish_revert_migration(
4800 ctxt, instance, network_info, migration,
4801 block_device_info=block_device_info, power_on=power_on)
4802 except Exception as e:
4803 driver_error = e
4804 # Leave a hint about hard rebooting the guest and reraise so the
4805 # instance is put into ERROR state.
4806 with excutils.save_and_reraise_exception(logger=LOG):
4807 LOG.error('An error occurred during finish_revert_migration. '
4808 'The instance may need to be hard rebooted. Error: '
4809 '%s', driver_error, instance=instance)
4810 else:
4811 # Perform final cleanup of the instance in the database.
4812 instance.drop_migration_context()
4813 # If the original vm_state was STOPPED, set it back to STOPPED.
4814 vm_state = vm_states.ACTIVE if power_on else vm_states.STOPPED
4815 self._update_instance_after_spawn(instance, vm_state=vm_state)
4816 instance.save(expected_task_state=[task_states.RESIZE_REVERTING])
4817 finally:
4818 # Complete any volume attachments so the volumes are in-use. We
4819 # do this regardless of finish_revert_migration failing because
4820 # the instance is back on this host now and we do not want to leave
4821 # the volumes in a pending state in case the instance is hard
4822 # rebooted.
4823 LOG.debug('Completing volume attachments for instance on source '
4824 'host.', instance=instance)
4825 with excutils.save_and_reraise_exception(
4826 reraise=driver_error is not None, logger=LOG):
4827 self._complete_volume_attachments(ctxt, bdms)
4828
4829 migration.status = 'reverted'
4830 migration.save()
4831
4832 @wrap_exception()
4833 @reverts_task_state
4834 @wrap_instance_event(prefix='compute')
4835 @errors_out_migration
4836 @wrap_instance_fault
4837 def revert_resize(self, context, instance, migration, request_spec=None):
4838 """Destroys the new instance on the destination machine.
4839
4840 Reverts the model changes, and powers on the old instance on the
4841 source machine.
4842
4843 """
4844 # NOTE(comstud): A revert_resize is essentially a resize back to
4845 # the old size, so we need to send a usage event here.
4846 compute_utils.notify_usage_exists(self.notifier, context, instance,
4847 self.host, current_period=True)
4848
4849 with self._error_out_instance_on_exception(context, instance):
4850 # NOTE(tr3buchet): tear down networks on destination host
4851 self.network_api.setup_networks_on_host(context, instance,
4852 teardown=True)
4853
4854 self.network_api.migrate_instance_start(context,
4855 instance,
4856 migration)
4857
4858 network_info = self.network_api.get_instance_nw_info(context,
4859 instance)
4860 bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
4861 context, instance.uuid)
4862 block_device_info = self._get_instance_block_device_info(
4863 context, instance, bdms=bdms)
4864
4865 destroy_disks = not self._is_instance_storage_shared(
4866 context, instance, host=migration.source_compute)
4867 self.driver.destroy(context, instance, network_info,
4868 block_device_info, destroy_disks)
4869
4870 self._terminate_volume_connections(context, instance, bdms)
4871
4872 # Free up the new_flavor usage from the resource tracker for this
4873 # host.
4874 self.rt.drop_move_claim_at_dest(context, instance, migration)
4875
4876 # RPC cast back to the source host to finish the revert there.
4877 self.compute_rpcapi.finish_revert_resize(context, instance,
4878 migration, migration.source_compute, request_spec)
4879
4880 def _finish_revert_resize_network_migrate_finish(
4881 self, context, instance, migration, provider_mappings):
4882 """Causes port binding to be updated. In some Neutron or port
4883 configurations - see NetworkModel.get_bind_time_events() - we
4884 expect the vif-plugged event from Neutron immediately and wait for it.
4885 The rest of the time, the event is expected further along in the
4886 virt driver, so we don't wait here.
4887
4888 :param context: The request context.
4889 :param instance: The instance undergoing the revert resize.
4890 :param migration: The Migration object of the resize being reverted.
4891 :param provider_mappings: a dict of list of resource provider uuids
4892 keyed by port uuid
4893 :raises: eventlet.timeout.Timeout or
4894 exception.VirtualInterfacePlugException.
4895 """
4896 network_info = instance.get_network_info()
4897 events = []
4898 deadline = CONF.vif_plugging_timeout
4899 if deadline and network_info:
4900 events = network_info.get_bind_time_events(migration)
4901 if events:
4902 LOG.debug('Will wait for bind-time events: %s', events)
4903 error_cb = self._neutron_failed_migration_callback
4904 try:
4905 with self.virtapi.wait_for_instance_event(instance, events,
4906 deadline=deadline,
4907 error_callback=error_cb):
4908 # NOTE(hanrong): we need to change migration.dest_compute to
4909 # the source host temporarily.
4910 # "network_api.migrate_instance_finish" sets up the network
4911 # for the instance on the destination host. For a revert resize,
4912 # the instance is going back to the source host, so the network
4913 # setup for the instance should happen on the source host. So we
4914 # set migration.dest_compute to the source host here.
4915 with utils.temporary_mutation(
4916 migration, dest_compute=migration.source_compute):
4917 self.network_api.migrate_instance_finish(
4918 context, instance, migration, provider_mappings)
4919 except eventlet.timeout.Timeout:
4920 with excutils.save_and_reraise_exception():
4921 LOG.error('Timeout waiting for Neutron events: %s', events,
4922 instance=instance)
4923
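# Editorial note (illustrative, not part of nova): "bind-time" events are
# the network-vif-plugged notifications some Neutron backends send as soon
# as a port binding is activated, as opposed to backends that send them
# when the virt driver actually plugs the VIF; only the former can be
# awaited around migrate_instance_finish above.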
4924 @wrap_exception()
4925 @reverts_task_state
4926 @wrap_instance_event(prefix='compute')
4927 @errors_out_migration
4928 @wrap_instance_fault
4929 def finish_revert_resize(
4930 self, context, instance, migration, request_spec=None):
4931 """Finishes the second half of reverting a resize on the source host.
4932
4933 Brings the original source instance state back (active/shutoff) and
4934 reverts the resized attributes in the database.
4935
4936 """
4937 try:
4938 self._finish_revert_resize(
4939 context, instance, migration, request_spec)
4940 finally:
4941 self._delete_stashed_flavor_info(instance)
4942
4943 def _finish_revert_resize(
4944 self, context, instance, migration, request_spec=None,
4945 ):
4946 """Inner version of finish_revert_resize."""
4947 with self._error_out_instance_on_exception(context, instance):
4948 bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
4949 context, instance.uuid)
4950 self._notify_about_instance_usage(
4951 context, instance, "resize.revert.start")
4952 compute_utils.notify_about_instance_action(context, instance,
4953 self.host, action=fields.NotificationAction.RESIZE_REVERT,
4954 phase=fields.NotificationPhase.START, bdms=bdms)
4955
4956 # Get stashed old_vm_state information to determine if guest should
4957 # be powered on after spawn; we default to ACTIVE for backwards
4958 # compatibility if old_vm_state is not set
4959 old_vm_state = instance.system_metadata.get(
4960 'old_vm_state', vm_states.ACTIVE)
4961
4962 # Revert the flavor and host/node fields to their previous values
4963 self._revert_instance_flavor_host_node(instance, migration)
4964
4965 try:
4966 source_allocations = self._revert_allocation(
4967 context, instance, migration)
4968 except exception.AllocationMoveFailed:
4969 LOG.error('Reverting allocation in placement for migration '
4970 '%(migration_uuid)s failed. The instance '
4971 '%(instance_uuid)s will be put into ERROR state but '
4972 'the allocation held by the migration is leaked.',
4973 {'instance_uuid': instance.uuid,
4974 'migration_uuid': migration.uuid})
4975 raise
4976
4977 provider_mappings = self._fill_provider_mapping_based_on_allocs(
4978 context, source_allocations, request_spec)
4979
4980 self.network_api.setup_networks_on_host(context, instance,
4981 migration.source_compute)
4982 self._finish_revert_resize_network_migrate_finish(
4983 context, instance, migration, provider_mappings)
4984 network_info = self.network_api.get_instance_nw_info(context,
4985 instance)
4986
4987 # revert_resize deleted any volume attachments for the instance
4988 # and created new ones to be used on this host, but we
4989 # have to update those attachments with the host connector so that
4990 # BDM.connection_info gets set in the call to
4991 # _get_instance_block_device_info below (with refresh_conn_info=True)
4992 # and the volumes can then be re-connected via the driver on this
4993 # host.
4994 self._update_volume_attachments(context, instance, bdms)
4995
4996 block_device_info = self._get_instance_block_device_info(
4997 context, instance, refresh_conn_info=True, bdms=bdms)
4998
4999 power_on = old_vm_state != vm_states.STOPPED
5000 self.driver.finish_revert_migration(
5001 context, instance, network_info, migration, block_device_info,
5002 power_on)
5003
5004 instance.drop_migration_context()
5005 instance.launched_at = timeutils.utcnow()
5006 instance.save(expected_task_state=task_states.RESIZE_REVERTING)
5007
5008 # Complete any volume attachments so the volumes are in-use.
5009 self._complete_volume_attachments(context, bdms)
5010
5011 # if the original vm state was STOPPED, set it back to STOPPED
5012 LOG.info("Updating instance to original state: '%s'",
5013 old_vm_state, instance=instance)
5014 if power_on:
5015 instance.vm_state = vm_states.ACTIVE
5016 instance.task_state = None
5017 instance.save()
5018 else:
5019 instance.task_state = task_states.POWERING_OFF
5020 instance.save()
5021 self.stop_instance(context, instance=instance,
5022 clean_shutdown=True)
5023
5024 self._notify_about_instance_usage(
5025 context, instance, "resize.revert.end")
5026 compute_utils.notify_about_instance_action(context, instance,
5027 self.host, action=fields.NotificationAction.RESIZE_REVERT,
5028 phase=fields.NotificationPhase.END, bdms=bdms)
5029
5030 def _fill_provider_mapping_based_on_allocs(
5031 self, context, allocations, request_spec):
5032 """Fills and returns the request group - resource provider mapping
5033 based on the allocations passed in.
5034
5035 :param context: The security context
5036 :param allocations: allocation dict keyed by RP UUID.
5037 :param request_spec: The RequestSpec object associated with the
5038 operation
5039 :returns: None if the request_spec is None. Otherwise a mapping
5040 between RequestGroup requester_id, currently Neutron port_id,
5041 and a list of resource provider UUIDs providing resource for
5042 that RequestGroup.
5043 """
5044 if request_spec:
5045 # NOTE(gibi): We need to re-calculate the resource provider -
5046 # port mapping, as the neutron ports have to allocate from the
5047 # source compute after the revert.
5048 scheduler_utils.fill_provider_mapping_based_on_allocation(
5049 context, self.reportclient, request_spec, allocations)
5050 provider_mappings = self._get_request_group_mapping(
5051 request_spec)
5052 else:
5053 # NOTE(gibi): The compute RPC is pinned to be older than 5.2
5054 # and therefore request_spec is not sent. We cannot calculate
5055 # the provider mappings. If the instance has ports with
5056 # resource request then the port update will fail in
5057 # _update_port_binding_for_instance() called via
5058 # _finish_revert_resize_network_migrate_finish() in
5059 # finish_revert_resize.
5060 provider_mappings = None
5061 return provider_mappings
5062
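# Editorial sketch (illustrative only, not part of nova): per the docstring
# above, the returned mapping has the shape
#
#     {<port uuid>: [<resource provider uuid>, ...]}
#
# i.e. one list of providers per RequestGroup requester_id.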
5063 def _revert_allocation(self, context, instance, migration):
5064 """Revert an allocation that is held by migration to our instance."""
5065
5066 # Fetch the original allocations that the instance had on the source
5067 # node, which are now held by the migration.
5068 orig_alloc = self.reportclient.get_allocations_for_consumer(
5069 context, migration.uuid)
5070 if not orig_alloc:
5071 LOG.error('Did not find resource allocations for migration '
5072 '%s on source node %s. Unable to revert source node '
5073 'allocations back to the instance.',
5074 migration.uuid, migration.source_node, instance=instance)
5075 return False
5076
5077 LOG.info('Swapping old allocation on %(rp_uuids)s held by migration '
5078 '%(mig)s for instance',
5079 {'rp_uuids': orig_alloc.keys(), 'mig': migration.uuid},
5080 instance=instance)
5081 # FIXME(gibi): This method is flawed in that it does not handle
5082 # allocations against sharing providers in any special way. This leads
5083 # to duplicate allocations against the sharing provider during
5084 # migration.
5085 # TODO(cdent): Should we be doing anything with return values here?
5086 self.reportclient.move_allocations(context, migration.uuid,
5087 instance.uuid)
5088 return orig_alloc
5089
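# Editorial sketch (illustrative only, not part of nova): the effect of
# move_allocations above, expressed per consumer:
#
#     before: {migration.uuid: <source node usage>,
#              instance.uuid:  <dest node usage>}
#     after:  {migration.uuid: {},
#              instance.uuid:  <source node usage>}
#
# i.e. the instance takes back the allocations the migration held, and the
# destination-node usage it held is dropped.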
5090 def _prep_resize(self, context, image, instance, instance_type,
5091 filter_properties, node, migration, request_spec,
5092 clean_shutdown=True):
5093
5094 if not filter_properties:
5095 filter_properties = {}
5096
5097 if not instance.host:
5098 self._set_instance_obj_error_state(instance)
5099 msg = _('Instance has no source host')
5100 raise exception.MigrationError(reason=msg)
5101
5102 same_host = instance.host == self.host
5103 # If the flavor IDs match, this is a migrate; otherwise it is a resize.
5104 if same_host and instance_type.id == instance['instance_type_id']:
5105 # Check whether the driver supports migration to the same host.
5106 if not self.driver.capabilities.get(
5107 'supports_migrate_to_same_host', False):
5108 # Raise InstanceFaultRollback so that the
5109 # _error_out_instance_on_exception context manager in
5110 # prep_resize will set the instance.vm_state properly.
5111 raise exception.InstanceFaultRollback(
5112 inner_exception=exception.UnableToMigrateToSelf(
5113 instance_id=instance.uuid, host=self.host))
5114
5115 # NOTE(danms): Stash the new instance_type to avoid having to
5116 # look it up in the database later
5117 instance.new_flavor = instance_type
5118 # NOTE(mriedem): Stash the old vm_state so we can set the
5119 # resized/reverted instance back to the same state later.
5120 vm_state = instance.vm_state
5121 LOG.debug('Stashing vm_state: %s', vm_state, instance=instance)
5122 instance.system_metadata['old_vm_state'] = vm_state
5123 instance.save()
5124
5125 if not isinstance(request_spec, objects.RequestSpec):
5126 # Prior to compute RPC API 5.1, conductor would pass a legacy dict
5127 # version of the request spec to compute, and since Stein, compute
5128 # could be sending that dict back to conductor on reschedule. If we
5129 # got a dict, convert it to an object.
5130 # TODO(mriedem): We can drop this compat code when we only support
5131 # compute RPC API >=6.0.
5132 request_spec = objects.RequestSpec.from_primitives(
5133 context, request_spec, filter_properties)
5134 # We don't have to set the new flavor on the request spec because
5135 # if we got here it was due to a reschedule from the compute and
5136 # the request spec would already have the new flavor in it from the
5137 # else block below.
5138
5139 provider_mapping = self._get_request_group_mapping(request_spec)
5140
5141 if provider_mapping:
5142 try:
5143 compute_utils.\
5144 update_pci_request_spec_with_allocated_interface_name(
5145 context, self.reportclient, instance