"Fossies" - the Fresh Open Source Software Archive

Member "nova-22.0.1/nova/compute/manager.py" (19 Nov 2020, 519242 Bytes) of package /linux/misc/openstack/nova-22.0.1.tar.gz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) Python source code syntax highlighting (style: standard) with prefixed line numbers. Alternatively you can here view or download the uninterpreted source code file. For more information about "manager.py" see the Fossies "Dox" file reference documentation and the latest Fossies "Diffs" side-by-side code changes report: 22.0.0_vs_22.0.1.

    1 # Copyright 2010 United States Government as represented by the
    2 # Administrator of the National Aeronautics and Space Administration.
    3 # Copyright 2011 Justin Santa Barbara
    4 # All Rights Reserved.
    5 #
    6 #    Licensed under the Apache License, Version 2.0 (the "License"); you may
    7 #    not use this file except in compliance with the License. You may obtain
    8 #    a copy of the License at
    9 #
   10 #         http://www.apache.org/licenses/LICENSE-2.0
   11 #
   12 #    Unless required by applicable law or agreed to in writing, software
   13 #    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
   14 #    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
   15 #    License for the specific language governing permissions and limitations
   16 #    under the License.
   17 
   18 """Handles all processes relating to instances (guest vms).
   19 
   20 The :py:class:`ComputeManager` class is a :py:class:`nova.manager.Manager` that
   21 handles RPC calls relating to creating instances.  It is responsible for
   22 building a disk image, launching it via the underlying virtualization driver,
   23 responding to calls to check its state, attaching persistent storage, and
   24 terminating it.
   25 
   26 """
   27 
   28 import base64
   29 import binascii
   30 import contextlib
   31 import copy
   32 import functools
   33 import inspect
   34 import sys
   35 import time
   36 import traceback
   37 import typing as ty
   38 
   39 from cinderclient import exceptions as cinder_exception
   40 from cursive import exception as cursive_exception
   41 import eventlet.event
   42 from eventlet import greenthread
   43 import eventlet.semaphore
   44 import eventlet.timeout
   45 import futurist
   46 from keystoneauth1 import exceptions as keystone_exception
   47 import os_traits
   48 from oslo_log import log as logging
   49 import oslo_messaging as messaging
   50 from oslo_serialization import jsonutils
   51 from oslo_service import loopingcall
   52 from oslo_service import periodic_task
   53 from oslo_utils import excutils
   54 from oslo_utils import strutils
   55 from oslo_utils import timeutils
   56 from oslo_utils import units
   57 import six
   58 from six.moves import range
   59 
   60 from nova.accelerator import cyborg
   61 from nova import block_device
   62 from nova.compute import api as compute
   63 from nova.compute import build_results
   64 from nova.compute import claims
   65 from nova.compute import power_state
   66 from nova.compute import resource_tracker
   67 from nova.compute import rpcapi as compute_rpcapi
   68 from nova.compute import task_states
   69 from nova.compute import utils as compute_utils
   70 from nova.compute.utils import wrap_instance_event
   71 from nova.compute import vm_states
   72 from nova import conductor
   73 import nova.conf
   74 import nova.context
   75 from nova import exception
   76 from nova import exception_wrapper
   77 from nova.i18n import _
   78 from nova.image import glance
   79 from nova import manager
   80 from nova.network import model as network_model
   81 from nova.network import neutron
   82 from nova import objects
   83 from nova.objects import base as obj_base
   84 from nova.objects import external_event as external_event_obj
   85 from nova.objects import fields
   86 from nova.objects import instance as obj_instance
   87 from nova.objects import migrate_data as migrate_data_obj
   88 from nova.pci import request as pci_req_module
   89 from nova.pci import whitelist
   90 from nova import rpc
   91 from nova import safe_utils
   92 from nova.scheduler.client import query
   93 from nova.scheduler.client import report
   94 from nova.scheduler import utils as scheduler_utils
   95 from nova import utils
   96 from nova.virt import block_device as driver_block_device
   97 from nova.virt import configdrive
   98 from nova.virt import driver
   99 from nova.virt import event as virtevent
  100 from nova.virt import hardware
  101 from nova.virt import storage_users
  102 from nova.virt import virtapi
  103 from nova.volume import cinder
  104 
# Global nova configuration object.
CONF = nova.conf.CONF

LOG = logging.getLogger(__name__)

# Notifier and exception-wrapper helpers pre-bound to this service's
# identity ('compute' / 'nova-compute').
get_notifier = functools.partial(rpc.get_notifier, service='compute')
wrap_exception = functools.partial(exception_wrapper.wrap_exception,
                                   get_notifier=get_notifier,
                                   binary='nova-compute')
  113 
  114 
  115 @contextlib.contextmanager
  116 def errors_out_migration_ctxt(migration):
  117     """Context manager to error out migration on failure."""
  118 
  119     try:
  120         yield
  121     except Exception:
  122         with excutils.save_and_reraise_exception():
  123             if migration:
  124                 # We may have been passed None for our migration if we're
  125                 # receiving from an older client. The migration will be
  126                 # errored via the legacy path.
  127                 migration.status = 'error'
  128                 try:
  129                     migration.save()
  130                 except Exception:
  131                     LOG.debug(
  132                         'Error setting migration status for instance %s.',
  133                         migration.instance_uuid, exc_info=True)
  134 
  135 
  136 @utils.expects_func_args('migration')
  137 def errors_out_migration(function):
  138     """Decorator to error out migration on failure."""
  139 
  140     @functools.wraps(function)
  141     def decorated_function(self, context, *args, **kwargs):
  142         wrapped_func = safe_utils.get_wrapped_function(function)
  143         keyed_args = inspect.getcallargs(wrapped_func, self, context,
  144                                          *args, **kwargs)
  145         migration = keyed_args['migration']
  146         with errors_out_migration_ctxt(migration):
  147             return function(self, context, *args, **kwargs)
  148 
  149     return decorated_function
  150 
  151 
  152 @utils.expects_func_args('instance')
  153 def reverts_task_state(function):
  154     """Decorator to revert task_state on failure."""
  155 
  156     @functools.wraps(function)
  157     def decorated_function(self, context, *args, **kwargs):
  158         try:
  159             return function(self, context, *args, **kwargs)
  160         except exception.UnexpectedTaskStateError as e:
  161             # Note(maoy): unexpected task state means the current
  162             # task is preempted. Do not clear task state in this
  163             # case.
  164             with excutils.save_and_reraise_exception():
  165                 LOG.info("Task possibly preempted: %s",
  166                          e.format_message())
  167         except Exception:
  168             with excutils.save_and_reraise_exception():
  169                 wrapped_func = safe_utils.get_wrapped_function(function)
  170                 keyed_args = inspect.getcallargs(wrapped_func, self, context,
  171                                                  *args, **kwargs)
  172                 # NOTE(mriedem): 'instance' must be in keyed_args because we
  173                 # have utils.expects_func_args('instance') decorating this
  174                 # method.
  175                 instance = keyed_args['instance']
  176                 original_task_state = instance.task_state
  177                 try:
  178                     self._instance_update(context, instance, task_state=None)
  179                     LOG.info("Successfully reverted task state from %s on "
  180                              "failure for instance.",
  181                              original_task_state, instance=instance)
  182                 except exception.InstanceNotFound:
  183                     # We might delete an instance that failed to build shortly
  184                     # after it errored out this is an expected case and we
  185                     # should not trace on it.
  186                     pass
  187                 except Exception as e:
  188                     LOG.warning("Failed to revert task state for instance. "
  189                                 "Error: %s", e, instance=instance)
  190 
  191     return decorated_function
  192 
  193 
  194 @utils.expects_func_args('instance')
  195 def wrap_instance_fault(function):
  196     """Wraps a method to catch exceptions related to instances.
  197 
  198     This decorator wraps a method to catch any exceptions having to do with
  199     an instance that may get thrown. It then logs an instance fault in the db.
  200     """
  201 
  202     @functools.wraps(function)
  203     def decorated_function(self, context, *args, **kwargs):
  204         try:
  205             return function(self, context, *args, **kwargs)
  206         except exception.InstanceNotFound:
  207             raise
  208         except Exception as e:
  209             # NOTE(gtt): If argument 'instance' is in args rather than kwargs,
  210             # we will get a KeyError exception which will cover up the real
  211             # exception. So, we update kwargs with the values from args first.
  212             # then, we can get 'instance' from kwargs easily.
  213             kwargs.update(dict(zip(function.__code__.co_varnames[2:], args)))
  214 
  215             with excutils.save_and_reraise_exception():
  216                 compute_utils.add_instance_fault_from_exc(context,
  217                         kwargs['instance'], e, sys.exc_info())
  218 
  219     return decorated_function
  220 
  221 
  222 @utils.expects_func_args('image_id', 'instance')
  223 def delete_image_on_error(function):
  224     """Used for snapshot related method to ensure the image created in
  225     compute.api is deleted when an error occurs.
  226     """
  227 
  228     @functools.wraps(function)
  229     def decorated_function(self, context, image_id, instance,
  230                            *args, **kwargs):
  231         try:
  232             return function(self, context, image_id, instance,
  233                             *args, **kwargs)
  234         except Exception:
  235             with excutils.save_and_reraise_exception():
  236                 compute_utils.delete_image(
  237                     context, instance, self.image_api, image_id,
  238                     log_exc_info=True)
  239 
  240     return decorated_function
  241 
  242 
# Each collection of events is a dict of eventlet Events keyed by a tuple of
# (event name, associated tag).
_InstanceEvents = ty.Dict[ty.Tuple[str, str], eventlet.event.Event]
  246 
  247 
class InstanceEvents(object):
    """Tracks pending external events, keyed by instance and (name, tag).

    All mutation of the internal mapping happens under a per-instance
    lock. Setting ``self._events`` to None (see cancel_all_events) marks
    shutdown and blocks any new events from being scheduled.
    """

    def __init__(self):
        # {instance_uuid: {(name, tag): eventlet.event.Event}}; becomes
        # None once cancel_all_events() has run.
        self._events: ty.Optional[ty.Dict[str, _InstanceEvents]] = {}

    @staticmethod
    def _lock_name(instance) -> str:
        # Name of the synchronized() lock guarding this instance's events.
        return '%s-%s' % (instance.uuid, 'events')

    def prepare_for_instance_event(
        self,
        instance: 'objects.Instance',
        name: str,
        tag: str,
    ) -> eventlet.event.Event:
        """Prepare to receive an event for an instance.

        This will register an event for the given instance that we will
        wait on later. This should be called before initiating whatever
        action will trigger the event. The resulting eventlet.event.Event
        object should be wait()'d on to ensure completion.

        :param instance: the instance for which the event will be generated
        :param name: the name of the event we're expecting
        :param tag: the tag associated with the event we're expecting
        :returns: an event object that should be wait()'d on
        :raises: NovaException if we are shutting down and can no longer
                 schedule new events
        """
        @utils.synchronized(self._lock_name(instance))
        def _create_or_get_event():
            if self._events is None:
                # NOTE(danms): We really should have a more specific error
                # here, but this is what we use for our default error case
                raise exception.NovaException(
                    'In shutdown, no new events can be scheduled')

            instance_events = self._events.setdefault(instance.uuid, {})
            # Reuse an already-registered Event for the same (name, tag)
            # so multiple preparations share one waiter object.
            return instance_events.setdefault((name, tag),
                                              eventlet.event.Event())
        LOG.debug('Preparing to wait for external event %(name)s-%(tag)s',
                  {'name': name, 'tag': tag}, instance=instance)
        return _create_or_get_event()

    def pop_instance_event(self, instance, event):
        """Remove a pending event from the wait list.

        This will remove a pending event from the wait list so that it
        can be used to signal the waiters to wake up.

        :param instance: the instance for which the event was generated
        :param event: the nova.objects.external_event.InstanceExternalEvent
                      that describes the event
        :returns: the eventlet.event.Event object on which the waiters
                  are blocked, or None if no matching event was pending
        """
        # Two distinct sentinels so we can log differently for "no events
        # registered for this instance at all" vs "events registered, but
        # none matching this (name, tag)".
        no_events_sentinel = object()
        no_matching_event_sentinel = object()

        @utils.synchronized(self._lock_name(instance))
        def _pop_event():
            if self._events is None:
                LOG.debug('Unexpected attempt to pop events during shutdown',
                          instance=instance)
                return no_events_sentinel
            events = self._events.get(instance.uuid)
            if not events:
                return no_events_sentinel
            _event = events.pop((event.name, event.tag), None)
            # Drop the per-instance dict entirely once its last event has
            # been popped.
            if not events:
                del self._events[instance.uuid]
            if _event is None:
                return no_matching_event_sentinel
            return _event

        result = _pop_event()
        if result is no_events_sentinel:
            LOG.debug('No waiting events found dispatching %(event)s',
                      {'event': event.key},
                      instance=instance)
            return None
        elif result is no_matching_event_sentinel:
            LOG.debug(
                'No event matching %(event)s in %(events)s',
                {
                    'event': event.key,
                    # mypy can't identify the none check in _pop_event
                    'events': self._events.get(  # type: ignore
                        instance.uuid, {}).keys(),
                },
                instance=instance,
            )
            return None
        else:
            return result

    def clear_events_for_instance(self, instance):
        """Remove all pending events for an instance.

        This will remove all events currently pending for an instance
        and return them (indexed by event name).

        :param instance: the instance for which events should be purged
        :returns: a dictionary of {event_name: eventlet.event.Event}
        """
        @utils.synchronized(self._lock_name(instance))
        def _clear_events():
            if self._events is None:
                LOG.debug('Unexpected attempt to clear events during shutdown',
                          instance=instance)
                return dict()
            # NOTE(danms): We have historically returned the raw internal
            # format here, which is {event.key: [events, ...])} so just
            # trivially convert it here.
            return {'%s-%s' % k: e
                    for k, e in self._events.pop(instance.uuid, {}).items()}
        return _clear_events()

    def cancel_all_events(self):
        """Fail all in-flight events and block scheduling of new ones.

        Sends a 'failed' InstanceExternalEvent to every waiter and sets
        the internal mapping to None so prepare_for_instance_event()
        raises from now on.
        """
        if self._events is None:
            LOG.debug('Unexpected attempt to cancel events during shutdown.')
            return
        our_events = self._events
        # NOTE(danms): Block new events
        self._events = None

        for instance_uuid, events in our_events.items():
            for (name, tag), eventlet_event in events.items():
                LOG.debug('Canceling in-flight event %(name)s-%(tag)s for '
                          'instance %(instance_uuid)s',
                          {'name': name,
                           'tag': tag,
                           'instance_uuid': instance_uuid})
                event = objects.InstanceExternalEvent(
                    instance_uuid=instance_uuid,
                    name=name, status='failed',
                    tag=tag, data={})
                eventlet_event.send(event)
  383 
  384 
class ComputeVirtAPI(virtapi.VirtAPI):
    """VirtAPI implementation backed by the compute manager.

    Lets the virt driver wait on externally-delivered instance events
    (via the manager's instance_events collection) and toggle the
    COMPUTE_STATUS_DISABLED trait on the compute node's resource
    provider in placement.
    """

    def __init__(self, compute):
        super(ComputeVirtAPI, self).__init__()
        # The owning ComputeManager; used to reach its instance_events.
        self._compute = compute
        # Shared SchedulerReportClient (same instance the manager uses).
        self.reportclient = compute.reportclient

        # Control-flow exception raised by exit_wait_early() to leave the
        # wait_for_instance_event() context immediately; carries the
        # (name, tag) events that need not be waited for.
        class ExitEarly(Exception):
            def __init__(self, events):
                super(Exception, self).__init__()
                self.events = events

        self._exit_early_exc = ExitEarly

    def exit_wait_early(self, events):
        """Exit a wait_for_instance_event() immediately and avoid
        waiting for some events.

        :param: events: A list of (name, tag) tuples for events that we should
                        skip waiting for during a wait_for_instance_event().
        """
        raise self._exit_early_exc(events=events)

    def _default_error_callback(self, event_name, instance):
        # Default error_callback for wait_for_instance_event(): treat any
        # non-completed event as fatal.
        raise exception.NovaException(_('Instance event failed'))

    @contextlib.contextmanager
    def wait_for_instance_event(self, instance, event_names, deadline=300,
                                error_callback=None):
        """Plan to wait for some events, run some code, then wait.

        This context manager will first create plans to wait for the
        provided event_names, yield, and then wait for all the scheduled
        events to complete.

        Note that this uses an eventlet.timeout.Timeout to bound the
        operation, so callers should be prepared to catch that
        failure and handle that situation appropriately.

        If the event is not received by the specified timeout deadline,
        eventlet.timeout.Timeout is raised.

        If the event is received but did not have a 'completed'
        status, a NovaException is raised.  If an error_callback is
        provided, instead of raising an exception as detailed above
        for the failure case, the callback will be called with the
        event_name and instance, and can return True to continue
        waiting for the rest of the events, False to stop processing,
        or raise an exception which will bubble up to the waiter.

        If the inner code wishes to abort waiting for one or more
        events because it knows some state to be finished or condition
        to be satisfied, it can use VirtAPI.exit_wait_early() with a
        list of event (name,tag) items to avoid waiting for those
        events upon context exit. Note that exit_wait_early() exits
        the context immediately and should be used to signal that all
        work has been completed and provide the unified list of events
        that need not be waited for. Waiting for the remaining events
        will begin immediately upon early exit as if the context was
        exited normally.

        :param instance: The instance for which an event is expected
        :param event_names: A list of event names. Each element is a
                            tuple of strings to indicate (name, tag),
                            where name is required, but tag may be None.
        :param deadline: Maximum number of seconds we should wait for all
                         of the specified events to arrive.
        :param error_callback: A function to be called if an event arrives
                               with a non-'completed' status; see above for
                               the expected semantics.

        """

        if error_callback is None:
            error_callback = self._default_error_callback
        events = {}
        for event_name in event_names:
            name, tag = event_name
            event_name = objects.InstanceExternalEvent.make_key(name, tag)
            try:
                events[event_name] = (
                    self._compute.instance_events.prepare_for_instance_event(
                        instance, name, tag))
            except exception.NovaException:
                error_callback(event_name, instance)
                # NOTE(danms): Don't wait for any of the events. They
                # should all be canceled and fired immediately below,
                # but don't stick around if not.
                deadline = 0
        try:
            yield
        except self._exit_early_exc as e:
            early_events = set([objects.InstanceExternalEvent.make_key(n, t)
                                for n, t in e.events])
        else:
            early_events = set([])

        with eventlet.timeout.Timeout(deadline):
            for event_name, event in events.items():
                if event_name in early_events:
                    continue
                else:
                    actual_event = event.wait()
                    if actual_event.status == 'completed':
                        continue
                # If we get here, we have an event that was not completed,
                # nor skipped via exit_wait_early(). Decide whether to
                # keep waiting by calling the error_callback() hook.
                decision = error_callback(event_name, instance)
                if decision is False:
                    break

    def update_compute_provider_status(self, context, rp_uuid, enabled):
        """Used to add/remove the COMPUTE_STATUS_DISABLED trait on the provider

        :param context: nova auth RequestContext
        :param rp_uuid: UUID of a compute node resource provider in Placement
        :param enabled: True if the node is enabled in which case the trait
            would be removed, False if the node is disabled in which case
            the trait would be added.
        :raises: ResourceProviderTraitRetrievalFailed
        :raises: ResourceProviderUpdateConflict
        :raises: ResourceProviderUpdateFailed
        :raises: TraitRetrievalFailed
        :raises: keystoneauth1.exceptions.ClientException
        """
        trait_name = os_traits.COMPUTE_STATUS_DISABLED
        # Get the current traits (and generation) for the provider.
        # TODO(mriedem): Leverage the ProviderTree cache in get_provider_traits
        trait_info = self.reportclient.get_provider_traits(context, rp_uuid)
        # If the host is enabled, remove the trait (if set), else add
        # the trait if it doesn't already exist.
        original_traits = trait_info.traits
        new_traits = None
        if enabled and trait_name in original_traits:
            new_traits = original_traits - {trait_name}
            LOG.debug('Removing trait %s from compute node resource '
                      'provider %s in placement.', trait_name, rp_uuid)
        elif not enabled and trait_name not in original_traits:
            new_traits = original_traits | {trait_name}
            LOG.debug('Adding trait %s to compute node resource '
                      'provider %s in placement.', trait_name, rp_uuid)

        # Only hit placement when a change is actually needed; pass the
        # generation for conflict detection.
        if new_traits is not None:
            self.reportclient.set_traits_for_provider(
                context, rp_uuid, new_traits, generation=trait_info.generation)
  528 
  529 
  530 class ComputeManager(manager.Manager):
  531     """Manages the running instances from creation to destruction."""
  532 
  533     target = messaging.Target(version='5.12')
  534 
    def __init__(self, compute_driver=None, *args, **kwargs):
        """Load configuration options and connect to the hypervisor."""
        # We want the ComputeManager, ResourceTracker and ComputeVirtAPI all
        # using the same instance of SchedulerReportClient which has the
        # ProviderTree cache for this compute service.
        self.reportclient = report.SchedulerReportClient()
        self.virtapi = ComputeVirtAPI(self)
        # Clients for the other services this manager talks to.
        self.network_api = neutron.API()
        self.volume_api = cinder.API()
        self.image_api = glance.API()
        # Bandwidth-usage polling bookkeeping.
        self._last_bw_usage_poll = 0.0
        self._bw_usage_supported = True
        self.compute_api = compute.API()
        self.compute_rpcapi = compute_rpcapi.ComputeAPI()
        self.compute_task_api = conductor.ComputeTaskAPI()
        self.query_client = query.SchedulerQueryClient()
        # Registry of pending external events (see InstanceEvents).
        self.instance_events = InstanceEvents()
        # Green pool bounding concurrent power-state syncs.
        self._sync_power_pool = eventlet.GreenPool(
            size=CONF.sync_power_state_pool_size)
        self._syncs_in_progress = {}
        self.send_instance_updates = (
            CONF.filter_scheduler.track_instance_changes)
        # Concurrency limits below: a zero value means unlimited.
        # NOTE(review): builds are compared with '!= 0' while snapshots and
        # live migrations use '> 0'; presumably the option cannot be
        # negative - confirm against the config option definition.
        if CONF.max_concurrent_builds != 0:
            self._build_semaphore = eventlet.semaphore.Semaphore(
                CONF.max_concurrent_builds)
        else:
            self._build_semaphore = compute_utils.UnlimitedSemaphore()
        if CONF.max_concurrent_snapshots > 0:
            self._snapshot_semaphore = eventlet.semaphore.Semaphore(
                CONF.max_concurrent_snapshots)
        else:
            self._snapshot_semaphore = compute_utils.UnlimitedSemaphore()
        if CONF.max_concurrent_live_migrations > 0:
            self._live_migration_executor = futurist.GreenThreadPoolExecutor(
                max_workers=CONF.max_concurrent_live_migrations)
        else:
            # CONF.max_concurrent_live_migrations is 0 (unlimited)
            self._live_migration_executor = futurist.GreenThreadPoolExecutor()
        # This is a dict, keyed by instance uuid, to a two-item tuple of
        # migration object and Future for the queued live migration.
        self._waiting_live_migrations = {}

        super(ComputeManager, self).__init__(service_name="compute",
                                             *args, **kwargs)

        # NOTE(russellb) Load the driver last.  It may call back into the
        # compute manager via the virtapi, so we want it to be fully
        # initialized before that happens.
        self.driver = driver.load_compute_driver(self.virtapi, compute_driver)
        self.use_legacy_block_device_info = \
                            self.driver.need_legacy_block_device_info
        self.rt = resource_tracker.ResourceTracker(
            self.host, self.driver, reportclient=self.reportclient)
  588 
    def reset(self):
        """Reload the compute RPC API and clear the placement cache.

        Rebuilds the compute RPC client after resetting its module-level
        globals, and drops the report client's cached provider tree so it
        is re-fetched on next use.
        """
        LOG.info('Reloading compute RPC API')
        compute_rpcapi.reset_globals()
        self.compute_rpcapi = compute_rpcapi.ComputeAPI()
        self.reportclient.clear_provider_cache()
  594 
  595     def _update_resource_tracker(self, context, instance):
  596         """Let the resource tracker know that an instance has changed state."""
  597 
  598         if instance.host == self.host:
  599             self.rt.update_usage(context, instance, instance.node)
  600 
  601     def _instance_update(self, context, instance, **kwargs):
  602         """Update an instance in the database using kwargs as value."""
  603 
  604         for k, v in kwargs.items():
  605             setattr(instance, k, v)
  606         instance.save()
  607         self._update_resource_tracker(context, instance)
  608 
  609     def _nil_out_instance_obj_host_and_node(self, instance):
  610         # NOTE(jwcroppe): We don't do instance.save() here for performance
  611         # reasons; a call to this is expected to be immediately followed by
  612         # another call that does instance.save(), thus avoiding two writes
  613         # to the database layer.
  614         instance.host = None
  615         instance.node = None
  616         # ResourceTracker._set_instance_host_and_node also sets launched_on
  617         # to the same value as host and is really only ever used by legacy
  618         # nova-network code, but we should also null it out to avoid confusion
  619         # if there is an instance in the database with no host set but
  620         # launched_on is set. Note that we do not care about using launched_on
  621         # as some kind of debug helper if diagnosing a build failure, that is
  622         # what instance action events are for.
  623         instance.launched_on = None
  624         # If the instance is not on a host, it's not in an aggregate and
  625         # therefore is not in an availability zone.
  626         instance.availability_zone = None
  627 
  628     def _set_instance_obj_error_state(self, instance, clean_task_state=False):
  629         try:
  630             instance.vm_state = vm_states.ERROR
  631             if clean_task_state:
  632                 instance.task_state = None
  633             instance.save()
  634         except exception.InstanceNotFound:
  635             LOG.debug('Instance has been destroyed from under us while '
  636                       'trying to set it to ERROR', instance=instance)
  637 
  638     def _get_instances_on_driver(self, context, filters=None):
  639         """Return a list of instance records for the instances found
  640         on the hypervisor which satisfy the specified filters. If filters=None
  641         return a list of instance records for all the instances found on the
  642         hypervisor.
  643         """
  644         if not filters:
  645             filters = {}
  646         try:
  647             driver_uuids = self.driver.list_instance_uuids()
  648             if len(driver_uuids) == 0:
  649                 # Short circuit, don't waste a DB call
  650                 return objects.InstanceList()
  651             filters['uuid'] = driver_uuids
  652             local_instances = objects.InstanceList.get_by_filters(
  653                 context, filters, use_slave=True)
  654             return local_instances
  655         except NotImplementedError:
  656             pass
  657 
  658         # The driver doesn't support uuids listing, so we'll have
  659         # to brute force.
  660         driver_instances = self.driver.list_instances()
  661         # NOTE(mjozefcz): In this case we need to apply host filter.
  662         # Without this all instance data would be fetched from db.
  663         filters['host'] = self.host
  664         instances = objects.InstanceList.get_by_filters(context, filters,
  665                                                         use_slave=True)
  666         name_map = {instance.name: instance for instance in instances}
  667         local_instances = []
  668         for driver_instance in driver_instances:
  669             instance = name_map.get(driver_instance)
  670             if not instance:
  671                 continue
  672             local_instances.append(instance)
  673         return local_instances
  674 
    def _destroy_evacuated_instances(self, context, node_cache):
        """Destroys evacuated instances.

        While nova-compute was down, the instances running on it could be
        evacuated to another host. This method looks for evacuation migration
        records where this is the source host and which were either started
        (accepted), in-progress (pre-migrating) or migrated (done). From those
        migration records, local instances reported by the hypervisor are
        compared to the instances for the migration records and those local
        guests are destroyed, along with instance allocation records in
        Placement for this node.
        Then allocations are removed from Placement for every instance that is
        evacuated from this host regardless if the instance is reported by the
        hypervisor or not.

        :param context: The request context
        :param node_cache: A dict of ComputeNode objects keyed by the UUID of
            the compute node
        :return: A dict keyed by instance uuid mapped to Migration objects
            for instances that were migrated away from this host
        """
        filters = {
            'source_compute': self.host,
            # NOTE(mriedem): Migration records that have been accepted are
            # included in case the source node comes back up while instances
            # are being evacuated to another host. We don't want the same
            # instance being reported from multiple hosts.
            # NOTE(lyarwood): pre-migrating is also included here as the
            # source compute can come back online shortly after the RT
            # claims on the destination that in-turn moves the migration to
            # pre-migrating. If the evacuate fails on the destination host,
            # the user can rebuild the instance (in ERROR state) on the source
            # host.
            'status': ['accepted', 'pre-migrating', 'done'],
            'migration_type': fields.MigrationType.EVACUATION,
        }
        # Include deleted records: the instance may have been deleted after
        # the evacuation but its local guest/allocations still need cleanup.
        with utils.temporary_mutation(context, read_deleted='yes'):
            evacuations = objects.MigrationList.get_by_filters(context,
                                                               filters)
        if not evacuations:
            return {}
        # Re-key the migration records by instance uuid for the lookups below.
        evacuations = {mig.instance_uuid: mig for mig in evacuations}

        # TODO(mriedem): We could optimize by pre-loading the joined fields
        # we know we'll use, like info_cache and flavor.
        local_instances = self._get_instances_on_driver(context)
        # Guests still present on this hypervisor that were evacuated away.
        evacuated_local_instances = {inst.uuid: inst
                                     for inst in local_instances
                                     if inst.uuid in evacuations}

        for instance in evacuated_local_instances.values():
            LOG.info('Destroying instance as it has been evacuated from '
                     'this host but still exists in the hypervisor',
                     instance=instance)
            try:
                network_info = self.network_api.get_instance_nw_info(
                    context, instance)
                bdi = self._get_instance_block_device_info(context,
                                                           instance)
                # Only destroy local disks when storage is not shared with
                # the host the instance was evacuated to.
                destroy_disks = not (self._is_instance_storage_shared(
                    context, instance))
            except exception.InstanceNotFound:
                network_info = network_model.NetworkInfo()
                bdi = {}
                LOG.info('Instance has been marked deleted already, '
                         'removing it from the hypervisor.',
                         instance=instance)
                # always destroy disks if the instance was deleted
                destroy_disks = True
            self.driver.destroy(context, instance,
                                network_info,
                                bdi, destroy_disks)

        # Map hypervisor hostname -> compute node uuid so a migration's
        # source_node can be translated to a Placement provider uuid.
        hostname_to_cn_uuid = {
            cn.hypervisor_hostname: cn.uuid
            for cn in node_cache.values()}

        for instance_uuid, migration in evacuations.items():
            try:
                if instance_uuid in evacuated_local_instances:
                    # Avoid the db call if we already have the instance loaded
                    # above
                    instance = evacuated_local_instances[instance_uuid]
                else:
                    instance = objects.Instance.get_by_uuid(
                        context, instance_uuid)
            except exception.InstanceNotFound:
                # The instance already deleted so we expect that every
                # allocation of that instance has already been cleaned up
                continue

            LOG.info('Cleaning up allocations of the instance as it has been '
                     'evacuated from this host',
                     instance=instance)
            if migration.source_node not in hostname_to_cn_uuid:
                LOG.error("Failed to clean allocation of evacuated "
                          "instance as the source node %s is not found",
                          migration.source_node, instance=instance)
                continue
            cn_uuid = hostname_to_cn_uuid[migration.source_node]

            # If the instance was deleted in the interim, assume its
            # allocations were properly cleaned up (either by its hosting
            # compute service or the API).
            if (not instance.deleted and
                    not self.reportclient.
                        remove_provider_tree_from_instance_allocation(
                            context, instance.uuid, cn_uuid)):
                LOG.error("Failed to clean allocation of evacuated instance "
                          "on the source node %s",
                          cn_uuid, instance=instance)

            # Mark the evacuation migration complete so it is not processed
            # again on a subsequent restart.
            migration.status = 'completed'
            migration.save()
        return evacuations
  790 
  791     def _is_instance_storage_shared(self, context, instance, host=None):
  792         shared_storage = True
  793         data = None
  794         try:
  795             data = self.driver.check_instance_shared_storage_local(context,
  796                                                        instance)
  797             if data:
  798                 shared_storage = (self.compute_rpcapi.
  799                                   check_instance_shared_storage(context,
  800                                   instance, data, host=host))
  801         except NotImplementedError:
  802             LOG.debug('Hypervisor driver does not support '
  803                       'instance shared storage check, '
  804                       'assuming it\'s not on shared storage',
  805                       instance=instance)
  806             shared_storage = False
  807         except Exception:
  808             LOG.exception('Failed to check if instance shared',
  809                           instance=instance)
  810         finally:
  811             if data:
  812                 self.driver.check_instance_shared_storage_cleanup(context,
  813                                                                   data)
  814         return shared_storage
  815 
  816     def _complete_partial_deletion(self, context, instance):
  817         """Complete deletion for instances in DELETED status but not marked as
  818         deleted in the DB
  819         """
  820         instance.destroy()
  821         bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
  822                 context, instance.uuid)
  823         self._complete_deletion(context,
  824                                 instance)
  825         self._notify_about_instance_usage(context, instance, "delete.end")
  826         compute_utils.notify_about_instance_action(context, instance,
  827                 self.host, action=fields.NotificationAction.DELETE,
  828                 phase=fields.NotificationPhase.END, bdms=bdms)
  829 
  830     def _complete_deletion(self, context, instance):
  831         self._update_resource_tracker(context, instance)
  832 
  833         self.reportclient.delete_allocation_for_instance(context,
  834                                                          instance.uuid)
  835 
  836         self._clean_instance_console_tokens(context, instance)
  837         self._delete_scheduler_instance_info(context, instance.uuid)
  838 
  839     def _validate_pinning_configuration(self, instances):
  840         if not self.driver.capabilities.get('supports_pcpus', False):
  841             return
  842 
  843         for instance in instances:
  844             # ignore deleted instances
  845             if instance.deleted:
  846                 continue
  847 
  848             # if this is an unpinned instance and the host only has
  849             # 'cpu_dedicated_set' configured, we need to tell the operator to
  850             # correct their configuration
  851             if not instance.numa_topology or (
  852                 instance.numa_topology.cpu_policy in (
  853                     None, fields.CPUAllocationPolicy.SHARED
  854                 )
  855             ):
  856                 # we don't need to check 'vcpu_pin_set' since it can't coexist
  857                 # alongside 'cpu_dedicated_set'
  858                 if (CONF.compute.cpu_dedicated_set and
  859                         not CONF.compute.cpu_shared_set):
  860                     msg = _("This host has unpinned instances but has no CPUs "
  861                             "set aside for this purpose; configure '[compute] "
  862                             "cpu_shared_set' instead of, or in addition to, "
  863                             "'[compute] cpu_dedicated_set'")
  864                     raise exception.InvalidConfiguration(msg)
  865 
  866                 continue
  867 
  868             # ditto for pinned instances if only 'cpu_shared_set' is configured
  869             if (CONF.compute.cpu_shared_set and
  870                     not CONF.compute.cpu_dedicated_set and
  871                     not CONF.vcpu_pin_set):
  872                 msg = _("This host has pinned instances but has no CPUs "
  873                         "set aside for this purpose; configure '[compute] "
  874                         "cpu_dedicated_set' instead of, or in addition to, "
  875                         "'[compute] cpu_shared_set'.")
  876                 raise exception.InvalidConfiguration(msg)
  877 
  878             # if this is a mixed instance with both pinned and unpinned CPUs,
  879             # the host must have both 'cpu_dedicated_set' and 'cpu_shared_set'
  880             # configured. check if 'cpu_shared_set' is set.
  881             if (instance.numa_topology.cpu_policy ==
  882                     fields.CPUAllocationPolicy.MIXED and
  883                     not CONF.compute.cpu_shared_set):
  884                 msg = _("This host has mixed instance requesting both pinned "
  885                         "and unpinned CPUs but hasn't set aside unpinned CPUs "
  886                         "for this purpose; Configure "
  887                         "'[compute] cpu_shared_set'.")
  888                 raise exception.InvalidConfiguration(msg)
  889 
  890             # for mixed instance check if 'cpu_dedicated_set' is set.
  891             if (instance.numa_topology.cpu_policy ==
  892                     fields.CPUAllocationPolicy.MIXED and
  893                     not CONF.compute.cpu_dedicated_set):
  894                 msg = _("This host has mixed instance requesting both pinned "
  895                         "and unpinned CPUs but hasn't set aside pinned CPUs "
  896                         "for this purpose; Configure "
  897                         "'[compute] cpu_dedicated_set'")
  898                 raise exception.InvalidConfiguration(msg)
  899 
  900             # also check to make sure the operator hasn't accidentally
  901             # dropped some cores that instances are currently using
  902             available_dedicated_cpus = (hardware.get_vcpu_pin_set() or
  903                                         hardware.get_cpu_dedicated_set())
  904             pinned_cpus = instance.numa_topology.cpu_pinning
  905             if available_dedicated_cpus and (
  906                     pinned_cpus - available_dedicated_cpus):
  907                 # we can't raise an exception because of bug #1289064,
  908                 # which meant we didn't recalculate CPU pinning information
  909                 # when we live migrated a pinned instance
  910                 LOG.warning(
  911                     "Instance is pinned to host CPUs %(cpus)s "
  912                     "but one or more of these CPUs are not included in "
  913                     "either '[compute] cpu_dedicated_set' or "
  914                     "'vcpu_pin_set'; you should update these "
  915                     "configuration options to include the missing CPUs "
  916                     "or rebuild or cold migrate this instance.",
  917                     {'cpus': list(pinned_cpus)},
  918                     instance=instance)
  919 
  920     def _validate_vtpm_configuration(self, instances):
  921         if self.driver.capabilities.get('supports_vtpm', False):
  922             return
  923 
  924         for instance in instances:
  925             if instance.deleted:
  926                 continue
  927 
  928             # NOTE(stephenfin): We don't have an attribute on the instance to
  929             # check for this, so we need to inspect the flavor/image metadata
  930             if hardware.get_vtpm_constraint(
  931                 instance.flavor, instance.image_meta,
  932             ):
  933                 msg = _(
  934                     'This host has instances with the vTPM feature enabled, '
  935                     'but the host is not correctly configured; enable '
  936                     'vTPM support.'
  937                 )
  938                 raise exception.InvalidConfiguration(msg)
  939 
  940     def _reset_live_migration(self, context, instance):
  941         migration = None
  942         try:
  943             migration = objects.Migration.get_by_instance_and_status(
  944                                       context, instance.uuid, 'running')
  945             if migration:
  946                 self.live_migration_abort(context, instance, migration.id)
  947         except Exception:
  948             LOG.exception('Failed to abort live-migration',
  949                           instance=instance)
  950         finally:
  951             if migration:
  952                 self._set_migration_status(migration, 'error')
  953             LOG.info('Instance found in migrating state during '
  954                      'startup. Resetting task_state',
  955                      instance=instance)
  956             instance.task_state = None
  957             instance.save(expected_task_state=[task_states.MIGRATING])
  958 
    def _init_instance(self, context, instance):
        """Initialize this instance during service init.

        Reconciles the instance's DB state with the hypervisor after a
        nova-compute restart: completes interrupted deletions, errors out
        half-built or half-rebuilt instances, clears stale transitional
        task states, retries interrupted reboot/stop/start requests, rolls
        back crashed resizes/migrations and, when configured, resumes
        guests that should be running.
        """

        # NOTE(danms): If the instance appears to not be owned by this
        # host, it may have been evacuated away, but skipped by the
        # evacuation cleanup code due to configuration. Thus, if that
        # is a possibility, don't touch the instance in any way, but
        # log the concern. This will help avoid potential issues on
        # startup due to misconfiguration.
        if instance.host != self.host:
            LOG.warning('Instance %(uuid)s appears to not be owned '
                        'by this host, but by %(host)s. Startup '
                        'processing is being skipped.',
                        {'uuid': instance.uuid,
                         'host': instance.host})
            return

        # Instances that are shut down, or in an error state can not be
        # initialized and are not attempted to be recovered. The exception
        # to this are instances that are in RESIZE_MIGRATING or DELETING,
        # which are dealt with further down.
        if (instance.vm_state == vm_states.SOFT_DELETED or
            (instance.vm_state == vm_states.ERROR and
            instance.task_state not in
            (task_states.RESIZE_MIGRATING, task_states.DELETING))):
            LOG.debug("Instance is in %s state.",
                      instance.vm_state, instance=instance)
            return

        if instance.vm_state == vm_states.DELETED:
            # vm_state is DELETED but the DB record was never finalized;
            # finish the deletion now.
            try:
                self._complete_partial_deletion(context, instance)
            except Exception:
                # we don't want that an exception blocks the init_host
                LOG.exception('Failed to complete a deletion',
                              instance=instance)
            return

        if (instance.vm_state == vm_states.BUILDING or
            instance.task_state in [task_states.SCHEDULING,
                                    task_states.BLOCK_DEVICE_MAPPING,
                                    task_states.NETWORKING,
                                    task_states.SPAWNING]):
            # NOTE(dave-mcnally) compute stopped before instance was fully
            # spawned so set to ERROR state. This is safe to do as the state
            # may be set by the api but the host is not so if we get here the
            # instance has already been scheduled to this particular host.
            LOG.debug("Instance failed to spawn correctly, "
                      "setting to ERROR state", instance=instance)
            self._set_instance_obj_error_state(instance, clean_task_state=True)
            return

        if (instance.vm_state in [vm_states.ACTIVE, vm_states.STOPPED] and
            instance.task_state in [task_states.REBUILDING,
                                    task_states.REBUILD_BLOCK_DEVICE_MAPPING,
                                    task_states.REBUILD_SPAWNING]):
            # NOTE(jichenjc) compute stopped before instance was fully
            # spawned so set to ERROR state. This is consistent to BUILD
            LOG.debug("Instance failed to rebuild correctly, "
                      "setting to ERROR state", instance=instance)
            self._set_instance_obj_error_state(instance, clean_task_state=True)
            return

        if (instance.vm_state != vm_states.ERROR and
            instance.task_state in [task_states.IMAGE_SNAPSHOT_PENDING,
                                    task_states.IMAGE_PENDING_UPLOAD,
                                    task_states.IMAGE_UPLOADING,
                                    task_states.IMAGE_SNAPSHOT]):
            # An interrupted snapshot: clean up and clear the task state,
            # then continue with the rest of the startup checks below.
            LOG.debug("Instance in transitional state %s at start-up "
                      "clearing task state",
                      instance.task_state, instance=instance)
            try:
                self._post_interrupted_snapshot_cleanup(context, instance)
            except Exception:
                # we don't want that an exception blocks the init_host
                LOG.exception('Failed to cleanup snapshot.', instance=instance)
            instance.task_state = None
            instance.save()

        if (instance.vm_state != vm_states.ERROR and
            instance.task_state in [task_states.RESIZE_PREP]):
            # Resize never got past the prep stage; just clear the state.
            LOG.debug("Instance in transitional state %s at start-up "
                      "clearing task state",
                      instance['task_state'], instance=instance)
            instance.task_state = None
            instance.save()

        if instance.task_state == task_states.DELETING:
            try:
                LOG.info('Service started deleting the instance during '
                         'the previous run, but did not finish. Restarting'
                         ' the deletion now.', instance=instance)
                # Ensure lazy-loaded fields needed by delete are present.
                instance.obj_load_attr('metadata')
                instance.obj_load_attr('system_metadata')
                bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
                        context, instance.uuid)
                self._delete_instance(context, instance, bdms)
            except Exception:
                # we don't want that an exception blocks the init_host
                LOG.exception('Failed to complete a deletion',
                              instance=instance)
                self._set_instance_obj_error_state(instance)
            return

        current_power_state = self._get_power_state(instance)
        try_reboot, reboot_type = self._retry_reboot(
            instance, current_power_state)

        if try_reboot:
            LOG.debug("Instance in transitional state (%(task_state)s) at "
                      "start-up and power state is (%(power_state)s), "
                      "triggering reboot",
                      {'task_state': instance.task_state,
                       'power_state': current_power_state},
                      instance=instance)

            # NOTE(mikal): if the instance was doing a soft reboot that got as
            # far as shutting down the instance but not as far as starting it
            # again, then we've just become a hard reboot. That means the
            # task state for the instance needs to change so that we're in one
            # of the expected task states for a hard reboot.
            if (instance.task_state in task_states.soft_reboot_states and
                reboot_type == 'HARD'):
                instance.task_state = task_states.REBOOT_PENDING_HARD
                instance.save()

            self.reboot_instance(context, instance, block_device_info=None,
                                 reboot_type=reboot_type)
            return

        elif (current_power_state == power_state.RUNNING and
              instance.task_state in [task_states.REBOOT_STARTED,
                                      task_states.REBOOT_STARTED_HARD,
                                      task_states.PAUSING,
                                      task_states.UNPAUSING]):
            # Guest is already running; the pending operation effectively
            # completed, so just clear the transitional state.
            LOG.warning("Instance in transitional state "
                        "(%(task_state)s) at start-up and power state "
                        "is (%(power_state)s), clearing task state",
                        {'task_state': instance.task_state,
                         'power_state': current_power_state},
                        instance=instance)
            instance.task_state = None
            instance.vm_state = vm_states.ACTIVE
            instance.save()
        elif (current_power_state == power_state.PAUSED and
              instance.task_state == task_states.UNPAUSING):
            LOG.warning("Instance in transitional state "
                        "(%(task_state)s) at start-up and power state "
                        "is (%(power_state)s), clearing task state "
                        "and unpausing the instance",
                        {'task_state': instance.task_state,
                         'power_state': current_power_state},
                        instance=instance)
            try:
                self.unpause_instance(context, instance)
            except NotImplementedError:
                # Some virt driver didn't support pause and unpause
                pass
            except Exception:
                LOG.exception('Failed to unpause instance', instance=instance)
            return

        if instance.task_state == task_states.POWERING_OFF:
            try:
                LOG.debug("Instance in transitional state %s at start-up "
                          "retrying stop request",
                          instance.task_state, instance=instance)
                self.stop_instance(context, instance, True)
            except Exception:
                # we don't want that an exception blocks the init_host
                LOG.exception('Failed to stop instance', instance=instance)
            return

        if instance.task_state == task_states.POWERING_ON:
            try:
                LOG.debug("Instance in transitional state %s at start-up "
                          "retrying start request",
                          instance.task_state, instance=instance)
                self.start_instance(context, instance)
            except Exception:
                # we don't want that an exception blocks the init_host
                LOG.exception('Failed to start instance', instance=instance)
            return

        net_info = instance.get_network_info()
        try:
            self.driver.plug_vifs(instance, net_info)
        except NotImplementedError as e:
            LOG.debug(e, instance=instance)
        except exception.VirtualInterfacePlugException:
            # NOTE(mriedem): If we get here, it could be because the vif_type
            # in the cache is "binding_failed" or "unbound".
            # The periodic task _heal_instance_info_cache checks for this
            # condition. It should fix this by binding the ports again when
            # it gets to this instance.
            LOG.exception('Virtual interface plugging failed for instance. '
                          'The port binding:host_id may need to be manually '
                          'updated.', instance=instance)
            self._set_instance_obj_error_state(instance)
            return

        if instance.task_state == task_states.RESIZE_MIGRATING:
            # We crashed during resize/migration, so roll back for safety
            try:
                # NOTE(mriedem): check old_vm_state for STOPPED here, if it's
                # not in system_metadata we default to True for backwards
                # compatibility
                power_on = (instance.system_metadata.get('old_vm_state') !=
                            vm_states.STOPPED)

                block_dev_info = self._get_instance_block_device_info(context,
                                                                      instance)

                migration = objects.Migration.get_by_id_and_instance(
                    context, instance.migration_context.migration_id,
                    instance.uuid)
                self.driver.finish_revert_migration(context, instance,
                    net_info, migration, block_dev_info, power_on)

            except Exception:
                LOG.exception('Failed to revert crashed migration',
                              instance=instance)
            finally:
                LOG.info('Instance found in migrating state during '
                         'startup. Resetting task_state',
                         instance=instance)
                instance.task_state = None
                instance.save()
        if instance.task_state == task_states.MIGRATING:
            # Live migration did not complete, but instance is on this
            # host. Abort ongoing migration if still running and reset state.
            self._reset_live_migration(context, instance)

        db_state = instance.power_state
        drv_state = self._get_power_state(instance)
        # True when the DB says RUNNING but the hypervisor disagrees.
        expect_running = (db_state == power_state.RUNNING and
                          drv_state != db_state)

        LOG.debug('Current state is %(drv_state)s, state in DB is '
                  '%(db_state)s.',
                  {'drv_state': drv_state, 'db_state': db_state},
                  instance=instance)

        if expect_running and CONF.resume_guests_state_on_host_boot:
            self._resume_guests_state(context, instance, net_info)
 1204 
 1205     def _resume_guests_state(self, context, instance, net_info):
 1206         LOG.info('Rebooting instance after nova-compute restart.',
 1207                  instance=instance)
 1208         block_device_info = \
 1209             self._get_instance_block_device_info(context, instance)
 1210 
 1211         try:
 1212             self.driver.resume_state_on_host_boot(
 1213                 context, instance, net_info, block_device_info)
 1214         except NotImplementedError:
 1215             LOG.warning('Hypervisor driver does not support '
 1216                         'resume guests', instance=instance)
 1217         except Exception:
 1218             # NOTE(vish): The instance failed to resume, so we set the
 1219             #             instance to error and attempt to continue.
 1220             LOG.warning('Failed to resume instance',
 1221                         instance=instance)
 1222             self._set_instance_obj_error_state(instance)
 1223 
 1224     def _retry_reboot(self, instance, current_power_state):
 1225         current_task_state = instance.task_state
 1226         retry_reboot = False
 1227         reboot_type = compute_utils.get_reboot_type(current_task_state,
 1228                                                     current_power_state)
 1229 
 1230         pending_soft = (
 1231             current_task_state == task_states.REBOOT_PENDING and
 1232             instance.vm_state in vm_states.ALLOW_SOFT_REBOOT)
 1233         pending_hard = (
 1234             current_task_state == task_states.REBOOT_PENDING_HARD and
 1235             instance.vm_state in vm_states.ALLOW_HARD_REBOOT)
 1236         started_not_running = (current_task_state in
 1237                                [task_states.REBOOT_STARTED,
 1238                                 task_states.REBOOT_STARTED_HARD] and
 1239                                current_power_state != power_state.RUNNING)
 1240 
 1241         if pending_soft or pending_hard or started_not_running:
 1242             retry_reboot = True
 1243 
 1244         return retry_reboot, reboot_type
 1245 
    def handle_lifecycle_event(self, event):
        """Synchronize instance state after a virt driver lifecycle event.

        Maps the event transition to a nova power_state and, if the
        driver's current power state still matches it (the event may be
        stale), syncs the instance power state in the database. For
        live-migration related events this also activates the destination
        host port bindings early to reduce network downtime.

        :param event: the lifecycle event emitted by the virt driver
        """
        LOG.info("VM %(state)s (Lifecycle Event)",
                 {'state': event.get_name()},
                 instance_uuid=event.get_instance_uuid())
        context = nova.context.get_admin_context(read_deleted='yes')
        vm_power_state = None
        event_transition = event.get_transition()
        # Translate the driver transition into a nova power_state; an
        # unexpected transition leaves vm_power_state as None and is only
        # logged.
        if event_transition == virtevent.EVENT_LIFECYCLE_STOPPED:
            vm_power_state = power_state.SHUTDOWN
        elif event_transition == virtevent.EVENT_LIFECYCLE_STARTED:
            vm_power_state = power_state.RUNNING
        elif event_transition in (
                virtevent.EVENT_LIFECYCLE_PAUSED,
                virtevent.EVENT_LIFECYCLE_POSTCOPY_STARTED,
                virtevent.EVENT_LIFECYCLE_MIGRATION_COMPLETED):
            vm_power_state = power_state.PAUSED
        elif event_transition == virtevent.EVENT_LIFECYCLE_RESUMED:
            vm_power_state = power_state.RUNNING
        elif event_transition == virtevent.EVENT_LIFECYCLE_SUSPENDED:
            vm_power_state = power_state.SUSPENDED
        else:
            LOG.warning("Unexpected lifecycle event: %d", event_transition)

        # Transitions that indicate a live migration is finishing, mapped
        # to the migration status to look up below.
        migrate_finish_statuses = {
            # This happens on the source node and indicates live migration
            # entered post-copy mode.
            virtevent.EVENT_LIFECYCLE_POSTCOPY_STARTED: 'running (post-copy)',
            # Suspended for offline migration.
            virtevent.EVENT_LIFECYCLE_MIGRATION_COMPLETED: 'running'
        }

        expected_attrs = []
        if event_transition in migrate_finish_statuses:
            # Join on info_cache since that's needed in migrate_instance_start.
            expected_attrs.append('info_cache')
        instance = objects.Instance.get_by_uuid(context,
                                                event.get_instance_uuid(),
                                                expected_attrs=expected_attrs)

        # Note(lpetrut): The event may be delayed, thus not reflecting
        # the current instance power state. In that case, ignore the event.
        current_power_state = self._get_power_state(instance)
        if current_power_state == vm_power_state:
            LOG.debug('Synchronizing instance power state after lifecycle '
                      'event "%(event)s"; current vm_state: %(vm_state)s, '
                      'current task_state: %(task_state)s, current DB '
                      'power_state: %(db_power_state)s, VM power_state: '
                      '%(vm_power_state)s',
                      {'event': event.get_name(),
                       'vm_state': instance.vm_state,
                       'task_state': instance.task_state,
                       'db_power_state': instance.power_state,
                       'vm_power_state': vm_power_state},
                      instance_uuid=instance.uuid)
            self._sync_instance_power_state(context,
                                            instance,
                                            vm_power_state)

        # The following checks are for live migration. We want to activate
        # the port binding for the destination host before the live migration
        # is resumed on the destination host in order to reduce network
        # downtime. Otherwise the ports are bound to the destination host
        # in post_live_migration_at_destination.
        # TODO(danms): Explore options for using a different live migration
        # specific callback for this instead of piggy-backing on the
        # handle_lifecycle_event callback.
        if (instance.task_state == task_states.MIGRATING and
                event_transition in migrate_finish_statuses):
            status = migrate_finish_statuses[event_transition]
            try:
                migration = objects.Migration.get_by_instance_and_status(
                            context, instance.uuid, status)
                LOG.debug('Binding ports to destination host: %s',
                          migration.dest_compute, instance=instance)
                # For neutron, migrate_instance_start will activate the
                # destination host port bindings, if there are any created by
                # conductor before live migration started.
                self.network_api.migrate_instance_start(
                    context, instance, migration)
            except exception.MigrationNotFoundByStatus:
                LOG.warning("Unable to find migration record with status "
                            "'%s' for instance. Port binding will happen in "
                            "post live migration.", status, instance=instance)
 1329 
 1330     def handle_events(self, event):
 1331         if isinstance(event, virtevent.LifecycleEvent):
 1332             try:
 1333                 self.handle_lifecycle_event(event)
 1334             except exception.InstanceNotFound:
 1335                 LOG.debug("Event %s arrived for non-existent instance. The "
 1336                           "instance was probably deleted.", event)
 1337         else:
 1338             LOG.debug("Ignoring event %s", event)
 1339 
 1340     def init_virt_events(self):
 1341         if CONF.workarounds.handle_virt_lifecycle_events:
 1342             self.driver.register_event_listener(self.handle_events)
 1343         else:
 1344             # NOTE(mriedem): If the _sync_power_states periodic task is
 1345             # disabled we should emit a warning in the logs.
 1346             if CONF.sync_power_state_interval < 0:
 1347                 LOG.warning('Instance lifecycle events from the compute '
 1348                             'driver have been disabled. Note that lifecycle '
 1349                             'changes to an instance outside of the compute '
 1350                             'service will not be synchronized '
 1351                             'automatically since the _sync_power_states '
 1352                             'periodic task is also disabled.')
 1353             else:
 1354                 LOG.info('Instance lifecycle events from the compute '
 1355                          'driver have been disabled. Note that lifecycle '
 1356                          'changes to an instance outside of the compute '
 1357                          'service will only be synchronized by the '
 1358                          '_sync_power_states periodic task.')
 1359 
 1360     def _get_nodes(self, context):
 1361         """Queried the ComputeNode objects from the DB that are reported by the
 1362         hypervisor.
 1363 
 1364         :param context: the request context
 1365         :return: a dict of ComputeNode objects keyed by the UUID of the given
 1366             node.
 1367         """
 1368         nodes_by_uuid = {}
 1369         try:
 1370             node_names = self.driver.get_available_nodes()
 1371         except exception.VirtDriverNotReady:
 1372             LOG.warning(
 1373                 "Virt driver is not ready. If this is the first time this "
 1374                 "service is starting on this host, then you can ignore this "
 1375                 "warning.")
 1376             return {}
 1377 
 1378         for node_name in node_names:
 1379             try:
 1380                 node = objects.ComputeNode.get_by_host_and_nodename(
 1381                     context, self.host, node_name)
 1382                 nodes_by_uuid[node.uuid] = node
 1383             except exception.ComputeHostNotFound:
 1384                 LOG.warning(
 1385                     "Compute node %s not found in the database. If this is "
 1386                     "the first time this service is starting on this host, "
 1387                     "then you can ignore this warning.", node_name)
 1388         return nodes_by_uuid
 1389 
    def init_host(self):
        """Initialization for a standalone compute service.

        Validates configuration, initializes the virt driver, registers
        for lifecycle events, and reconciles the state of instances on
        this host (evacuations, interrupted builds, scheduler info)
        before the service starts handling RPC requests.
        """

        if CONF.pci.passthrough_whitelist:
            # Simply loading the PCI passthrough whitelist will do a bunch of
            # validation that would otherwise wait until the PciDevTracker is
            # constructed when updating available resources for the compute
            # node(s) in the resource tracker, effectively killing that task.
            # So load up the whitelist when starting the compute service to
            # flush any invalid configuration early so we can kill the service
            # if the configuration is wrong.
            whitelist.Whitelist(CONF.pci.passthrough_whitelist)

        nova.conf.neutron.register_dynamic_opts(CONF)
        # Even if only libvirt uses them, make it available for all drivers
        nova.conf.devices.register_dynamic_opts(CONF)

        # Override the number of concurrent disk operations allowed if the
        # user has specified a limit.
        if CONF.compute.max_concurrent_disk_ops != 0:
            compute_utils.disk_ops_semaphore = \
                eventlet.semaphore.BoundedSemaphore(
                    CONF.compute.max_concurrent_disk_ops)

        self.driver.init_host(host=self.host)
        context = nova.context.get_admin_context()
        instances = objects.InstanceList.get_by_host(
            context, self.host,
            expected_attrs=['info_cache', 'metadata', 'numa_topology'])

        # Start receiving lifecycle events from the virt driver.
        self.init_virt_events()

        self._validate_pinning_configuration(instances)
        self._validate_vtpm_configuration(instances)

        # NOTE(gibi): At this point the compute_nodes of the resource tracker
        # has not been populated yet so we cannot rely on the resource tracker
        # here.
        # NOTE(gibi): If ironic and vcenter virt driver slow start time
        # becomes problematic here then we should consider adding a config
        # option or a driver flag to tell us if we should thread
        # _destroy_evacuated_instances and
        # _error_out_instances_whose_build_was_interrupted out in the
        # background on startup
        nodes_by_uuid = self._get_nodes(context)

        try:
            # checking that instance was not already evacuated to other host
            evacuated_instances = self._destroy_evacuated_instances(
                context, nodes_by_uuid)

            # Initialise instances on the host that are not evacuating
            for instance in instances:
                if instance.uuid not in evacuated_instances:
                    self._init_instance(context, instance)

            # NOTE(gibi): collect all the instance uuids that is in some way
            # was already handled above. Either by init_instance or by
            # _destroy_evacuated_instances. This way we can limit the scope of
            # the _error_out_instances_whose_build_was_interrupted call to look
            # only for instances that have allocations on this node and not
            # handled by the above calls.
            already_handled = {instance.uuid for instance in instances}.union(
                evacuated_instances)
            self._error_out_instances_whose_build_was_interrupted(
                context, already_handled, nodes_by_uuid.keys())

        finally:
            if instances:
                # We only send the instance info to the scheduler on startup
                # if there is anything to send, otherwise this host might
                # not be mapped yet in a cell and the scheduler may have
                # issues dealing with the information. Later changes to
                # instances on this host will update the scheduler, or the
                # _sync_scheduler_instance_info periodic task will.
                self._update_scheduler_instance_info(context, instances)
 1466 
 1467     def _error_out_instances_whose_build_was_interrupted(
 1468             self, context, already_handled_instances, node_uuids):
 1469         """If there are instances in BUILDING state that are not
 1470         assigned to this host but have allocations in placement towards
 1471         this compute that means the nova-compute service was
 1472         restarted while those instances waited for the resource claim
 1473         to finish and the _set_instance_host_and_node() to update the
 1474         instance.host field. We need to push them to ERROR state here to
 1475         prevent keeping them in BUILDING state forever.
 1476 
 1477         :param context: The request context
 1478         :param already_handled_instances: The set of instance UUIDs that the
 1479             host initialization process already handled in some way.
 1480         :param node_uuids: The list of compute node uuids handled by this
 1481             service
 1482         """
 1483 
 1484         # Strategy:
 1485         # 1) Get the allocations from placement for our compute node(s)
 1486         # 2) Remove the already handled instances from the consumer list;
 1487         #    they are either already initialized or need to be skipped.
 1488         # 3) Check which remaining consumer is an instance in BUILDING state
 1489         #    and push it to ERROR state.
 1490 
 1491         LOG.info(
 1492             "Looking for unclaimed instances stuck in BUILDING status for "
 1493             "nodes managed by this host")
 1494         for cn_uuid in node_uuids:
 1495             try:
 1496                 f = self.reportclient.get_allocations_for_resource_provider
 1497                 allocations = f(context, cn_uuid).allocations
 1498             except (exception.ResourceProviderAllocationRetrievalFailed,
 1499                     keystone_exception.ClientException) as e:
 1500                 LOG.error(
 1501                     "Could not retrieve compute node resource provider %s and "
 1502                     "therefore unable to error out any instances stuck in "
 1503                     "BUILDING state. Error: %s", cn_uuid, six.text_type(e))
 1504                 continue
 1505 
 1506             not_handled_consumers = (set(allocations) -
 1507                                      already_handled_instances)
 1508 
 1509             if not not_handled_consumers:
 1510                 continue
 1511 
 1512             filters = {
 1513                 'vm_state': vm_states.BUILDING,
 1514                 'uuid': not_handled_consumers
 1515             }
 1516 
 1517             instances = objects.InstanceList.get_by_filters(
 1518                 context, filters, expected_attrs=[])
 1519 
 1520             for instance in instances:
 1521                 LOG.debug(
 1522                     "Instance spawn was interrupted before instance_claim, "
 1523                     "setting instance to ERROR state", instance=instance)
 1524                 self._set_instance_obj_error_state(
 1525                     instance, clean_task_state=True)
 1526 
    def cleanup_host(self):
        """Tear down compute service state when the service stops."""
        # Stop receiving lifecycle events from the virt driver.
        self.driver.register_event_listener(None)
        # Unblock anything still waiting on external instance events.
        self.instance_events.cancel_all_events()
        self.driver.cleanup_host(host=self.host)
        # Cancel live migrations still queued in the executor pool.
        self._cleanup_live_migrations_in_pool()
 1532 
 1533     def _cleanup_live_migrations_in_pool(self):
 1534         # Shutdown the pool so we don't get new requests.
 1535         self._live_migration_executor.shutdown(wait=False)
 1536         # For any queued migrations, cancel the migration and update
 1537         # its status.
 1538         for migration, future in self._waiting_live_migrations.values():
 1539             # If we got here before the Future was submitted then we need
 1540             # to move on since there isn't anything we can do.
 1541             if future is None:
 1542                 continue
 1543             if future.cancel():
 1544                 self._set_migration_status(migration, 'cancelled')
 1545                 LOG.info('Successfully cancelled queued live migration.',
 1546                          instance_uuid=migration.instance_uuid)
 1547             else:
 1548                 LOG.warning('Unable to cancel live migration.',
 1549                             instance_uuid=migration.instance_uuid)
 1550         self._waiting_live_migrations.clear()
 1551 
 1552     def pre_start_hook(self):
 1553         """After the service is initialized, but before we fully bring
 1554         the service up by listening on RPC queues, make sure to update
 1555         our available resources (and indirectly our available nodes).
 1556         """
 1557         self.update_available_resource(nova.context.get_admin_context(),
 1558                                        startup=True)
 1559 
 1560     def _get_power_state(self, instance):
 1561         """Retrieve the power state for the given instance."""
 1562         LOG.debug('Checking state', instance=instance)
 1563         try:
 1564             return self.driver.get_info(instance, use_cache=False).state
 1565         except exception.InstanceNotFound:
 1566             return power_state.NOSTATE
 1567 
 1568     # TODO(stephenfin): Remove this once we bump the compute API to v6.0
 1569     def get_console_topic(self, context):
 1570         """Retrieves the console host for a project on this host.
 1571 
 1572         Currently this is just set in the flags for each compute host.
 1573 
 1574         """
 1575         # TODO(mdragon): perhaps make this variable by console_type?
 1576         return 'console.%s' % CONF.console_host
 1577 
    # TODO(stephenfin): Remove this once we bump the compute API to v6.0
    @wrap_exception()
    def get_console_pool_info(self, context, console_type):
        """Proxy a console pool info request straight to the virt driver."""
        return self.driver.get_console_pool_info(console_type)
 1582 
    # TODO(stephenfin): Remove this as it's nova-network only
    @wrap_exception()
    def refresh_instance_security_rules(self, context, instance):
        """No-op stub kept for RPC API compatibility.

        This was a nova-network-only hook that told the virtualization
        driver to refresh security rules for an instance; it now does
        nothing and exists only so older callers don't break.
        """
        pass
 1595 
 1596     def _await_block_device_map_created(self, context, vol_id):
 1597         # TODO(yamahata): creating volume simultaneously
 1598         #                 reduces creation time?
 1599         # TODO(yamahata): eliminate dumb polling
 1600         start = time.time()
 1601         retries = CONF.block_device_allocate_retries
 1602         # (1) if the configured value is 0, one attempt should be made
 1603         # (2) if the configured value is > 0, then the total number attempts
 1604         #      is (retries + 1)
 1605         attempts = 1
 1606         if retries >= 1:
 1607             attempts = retries + 1
 1608         for attempt in range(1, attempts + 1):
 1609             volume = self.volume_api.get(context, vol_id)
 1610             volume_status = volume['status']
 1611             if volume_status not in ['creating', 'downloading']:
 1612                 if volume_status == 'available':
 1613                     return attempt
 1614                 LOG.warning("Volume id: %(vol_id)s finished being "
 1615                             "created but its status is %(vol_status)s.",
 1616                             {'vol_id': vol_id,
 1617                              'vol_status': volume_status})
 1618                 break
 1619             greenthread.sleep(CONF.block_device_allocate_retries_interval)
 1620         raise exception.VolumeNotCreated(volume_id=vol_id,
 1621                                          seconds=int(time.time() - start),
 1622                                          attempts=attempt,
 1623                                          volume_status=volume_status)
 1624 
 1625     def _decode_files(self, injected_files):
 1626         """Base64 decode the list of files to inject."""
 1627         if not injected_files:
 1628             return []
 1629 
 1630         def _decode(f):
 1631             path, contents = f
 1632             # Py3 raises binascii.Error instead of TypeError as in Py27
 1633             try:
 1634                 decoded = base64.b64decode(contents)
 1635                 return path, decoded
 1636             except (TypeError, binascii.Error):
 1637                 raise exception.Base64Exception(path=path)
 1638 
 1639         return [_decode(f) for f in injected_files]
 1640 
    def _validate_instance_group_policy(self, context, instance,
                                        scheduler_hints):
        """Re-check server group affinity/anti-affinity on this host.

        :param context: the request context
        :param instance: the instance about to be started here
        :param scheduler_hints: scheduler hints dict; only the 'group'
            hint is examined
        :raises: exception.RescheduledException if starting the instance
            on this host would violate the group's policy
        """
        # NOTE(russellb) Instance group policy is enforced by the scheduler.
        # However, there is a race condition with the enforcement of
        # the policy.  Since more than one instance may be scheduled at the
        # same time, it's possible that more than one instance with an
        # anti-affinity policy may end up here.  It's also possible that
        # multiple instances with an affinity policy could end up on different
        # hosts.  This is a validation step to make sure that starting the
        # instance here doesn't violate the policy.
        group_hint = scheduler_hints.get('group')
        if not group_hint:
            return

        # The RequestSpec stores scheduler_hints as key=list pairs so we need
        # to check the type on the value and pull the single entry out. The
        # API request schema validates that the 'group' hint is a single value.
        if isinstance(group_hint, list):
            group_hint = group_hint[0]

        # Serialize the check per group hint so concurrent builds on this
        # host can't race each other past the validation.
        @utils.synchronized(group_hint)
        def _do_validation(context, instance, group_hint):
            group = objects.InstanceGroup.get_by_hint(context, group_hint)
            if group.policy and 'anti-affinity' == group.policy:
                instances_uuids = objects.InstanceList.get_uuids_by_host(
                    context, self.host)
                ins_on_host = set(instances_uuids)
                members = set(group.members)
                # Determine the set of instance group members on this host
                # which are not the instance in question. This is used to
                # determine how many other members from the same anti-affinity
                # group can be on this host.
                members_on_host = ins_on_host & members - set([instance.uuid])
                rules = group.rules
                if rules and 'max_server_per_host' in rules:
                    max_server = rules['max_server_per_host']
                else:
                    max_server = 1
                if len(members_on_host) >= max_server:
                    msg = _("Anti-affinity instance group policy "
                            "was violated.")
                    raise exception.RescheduledException(
                            instance_uuid=instance.uuid,
                            reason=msg)
            elif group.policy and 'affinity' == group.policy:
                group_hosts = group.get_hosts(exclude=[instance.uuid])
                if group_hosts and self.host not in group_hosts:
                    msg = _("Affinity instance group policy was violated.")
                    raise exception.RescheduledException(
                            instance_uuid=instance.uuid,
                            reason=msg)

        # This validation calls up to the API DB; skip it when that
        # upcall is disabled by configuration.
        if not CONF.workarounds.disable_group_policy_check_upcall:
            _do_validation(context, instance, group_hint)
 1695 
    def _log_original_error(self, exc_info, instance_uuid):
        # exc_info is a sys.exc_info()-style triple; exc_info[1] is the
        # exception instance and the full triple is attached for the
        # traceback.
        LOG.error('Error: %s', exc_info[1], instance_uuid=instance_uuid,
                  exc_info=exc_info)
 1699 
 1700     @periodic_task.periodic_task
 1701     def _check_instance_build_time(self, context):
 1702         """Ensure that instances are not stuck in build."""
 1703         timeout = CONF.instance_build_timeout
 1704         if timeout == 0:
 1705             return
 1706 
 1707         filters = {'vm_state': vm_states.BUILDING,
 1708                    'host': self.host}
 1709 
 1710         building_insts = objects.InstanceList.get_by_filters(context,
 1711                            filters, expected_attrs=[], use_slave=True)
 1712 
 1713         for instance in building_insts:
 1714             if timeutils.is_older_than(instance.created_at, timeout):
 1715                 self._set_instance_obj_error_state(instance)
 1716                 LOG.warning("Instance build timed out. Set to error "
 1717                             "state.", instance=instance)
 1718 
 1719     def _check_instance_exists(self, instance):
 1720         """Ensure an instance with the same name is not already present."""
 1721         if self.driver.instance_exists(instance):
 1722             raise exception.InstanceExists(name=instance.name)
 1723 
    def _allocate_network_async(self, context, instance, requested_networks,
                                security_groups, resource_provider_mapping):
        """Method used to allocate networks in the background.

        Broken out for testing.

        Retries the allocation up to CONF.network_allocate_retries extra
        times with an exponential backoff (starting at 1 second, capped
        at 30 seconds). On success the instance's system_metadata is
        tagged with 'network_allocated' but the instance is deliberately
        not saved here (see NOTE below).
        """
        # First check to see if we're specifically not supposed to allocate
        # networks because if so, we can exit early.
        if requested_networks and requested_networks.no_allocate:
            LOG.debug("Not allocating networking since 'none' was specified.",
                      instance=instance)
            return network_model.NetworkInfo([])

        LOG.debug("Allocating IP information in the background.",
                  instance=instance)
        retries = CONF.network_allocate_retries
        attempts = retries + 1
        retry_time = 1
        bind_host_id = self.driver.network_binding_host_id(context, instance)
        for attempt in range(1, attempts + 1):
            try:
                nwinfo = self.network_api.allocate_for_instance(
                        context, instance,
                        requested_networks=requested_networks,
                        security_groups=security_groups,
                        bind_host_id=bind_host_id,
                        resource_provider_mapping=resource_provider_mapping)
                LOG.debug('Instance network_info: |%s|', nwinfo,
                          instance=instance)
                instance.system_metadata['network_allocated'] = 'True'
                # NOTE(JoshNang) do not save the instance here, as it can cause
                # races. The caller shares a reference to instance and waits
                # for this async greenthread to finish before calling
                # instance.save().
                return nwinfo
            except Exception as e:
                log_info = {'attempt': attempt,
                            'attempts': attempts}
                if attempt == attempts:
                    # Retry budget exhausted; propagate the last failure.
                    LOG.exception('Instance failed network setup '
                                  'after %(attempts)d attempt(s)',
                                  log_info)
                    raise e
                LOG.warning('Instance failed network setup '
                            '(attempt %(attempt)d of %(attempts)d)',
                            log_info, instance=instance)
                time.sleep(retry_time)
                # Exponential backoff, capped at 30 seconds.
                retry_time *= 2
                if retry_time > 30:
                    retry_time = 30
        # Not reached.
 1775 
 1776     def _build_networks_for_instance(self, context, instance,
 1777             requested_networks, security_groups, resource_provider_mapping):
 1778 
 1779         # If we're here from a reschedule the network may already be allocated.
 1780         if strutils.bool_from_string(
 1781                 instance.system_metadata.get('network_allocated', 'False')):
 1782             # NOTE(alex_xu): The network_allocated is True means the network
 1783             # resource already allocated at previous scheduling, and the
 1784             # network setup is cleanup at previous. After rescheduling, the
 1785             # network resource need setup on the new host.
 1786             self.network_api.setup_instance_network_on_host(
 1787                 context, instance, instance.host)
 1788             return self.network_api.get_instance_nw_info(context, instance)
 1789 
 1790         network_info = self._allocate_network(context, instance,
 1791                 requested_networks, security_groups,
 1792                 resource_provider_mapping)
 1793 
 1794         return network_info
 1795 
 1796     def _allocate_network(self, context, instance, requested_networks,
 1797                           security_groups, resource_provider_mapping):
 1798         """Start network allocation asynchronously.  Return an instance
 1799         of NetworkInfoAsyncWrapper that can be used to retrieve the
 1800         allocated networks when the operation has finished.
 1801         """
 1802         # NOTE(comstud): Since we're allocating networks asynchronously,
 1803         # this task state has little meaning, as we won't be in this
 1804         # state for very long.
 1805         instance.vm_state = vm_states.BUILDING
 1806         instance.task_state = task_states.NETWORKING
 1807         instance.save(expected_task_state=[None])
 1808 
 1809         return network_model.NetworkInfoAsyncWrapper(
 1810                 self._allocate_network_async, context, instance,
 1811                 requested_networks, security_groups, resource_provider_mapping)
 1812 
 1813     def _default_root_device_name(self, instance, image_meta, root_bdm):
 1814         """Gets a default root device name from the driver.
 1815 
 1816         :param nova.objects.Instance instance:
 1817             The instance for which to get the root device name.
 1818         :param nova.objects.ImageMeta image_meta:
 1819             The metadata of the image of the instance.
 1820         :param nova.objects.BlockDeviceMapping root_bdm:
 1821             The description of the root device.
 1822         :returns: str -- The default root device name.
 1823         :raises: InternalError, TooManyDiskDevices
 1824         """
 1825         try:
 1826             return self.driver.default_root_device_name(instance,
 1827                                                         image_meta,
 1828                                                         root_bdm)
 1829         except NotImplementedError:
 1830             return compute_utils.get_next_device_name(instance, [])
 1831 
 1832     def _default_device_names_for_instance(self, instance,
 1833                                            root_device_name,
 1834                                            *block_device_lists):
 1835         """Default the missing device names in the BDM from the driver.
 1836 
 1837         :param nova.objects.Instance instance:
 1838             The instance for which to get default device names.
 1839         :param str root_device_name: The root device name.
 1840         :param list block_device_lists: List of block device mappings.
 1841         :returns: None
 1842         :raises: InternalError, TooManyDiskDevices
 1843         """
 1844         try:
 1845             self.driver.default_device_names_for_instance(instance,
 1846                                                           root_device_name,
 1847                                                           *block_device_lists)
 1848         except NotImplementedError:
 1849             compute_utils.default_device_names_for_instance(
 1850                 instance, root_device_name, *block_device_lists)
 1851 
 1852     def _get_device_name_for_instance(self, instance, bdms, block_device_obj):
 1853         """Get the next device name from the driver, based on the BDM.
 1854 
 1855         :param nova.objects.Instance instance:
 1856             The instance whose volume is requesting a device name.
 1857         :param nova.objects.BlockDeviceMappingList bdms:
 1858             The block device mappings for the instance.
 1859         :param nova.objects.BlockDeviceMapping block_device_obj:
 1860             A block device mapping containing info about the requested block
 1861             device.
 1862         :returns: The next device name.
 1863         :raises: InternalError, TooManyDiskDevices
 1864         """
 1865         # NOTE(ndipanov): Copy obj to avoid changing the original
 1866         block_device_obj = block_device_obj.obj_clone()
 1867         try:
 1868             return self.driver.get_device_name_for_instance(
 1869                 instance, bdms, block_device_obj)
 1870         except NotImplementedError:
 1871             return compute_utils.get_device_name_for_instance(
 1872                 instance, bdms, block_device_obj.get("device_name"))
 1873 
 1874     def _default_block_device_names(self, instance, image_meta, block_devices):
 1875         """Verify that all the devices have the device_name set. If not,
 1876         provide a default name.
 1877 
 1878         It also ensures that there is a root_device_name and is set to the
 1879         first block device in the boot sequence (boot_index=0).
 1880         """
 1881         root_bdm = block_device.get_root_bdm(block_devices)
 1882         if not root_bdm:
 1883             return
 1884 
 1885         # Get the root_device_name from the root BDM or the instance
 1886         root_device_name = None
 1887         update_root_bdm = False
 1888 
 1889         if root_bdm.device_name:
 1890             root_device_name = root_bdm.device_name
 1891             instance.root_device_name = root_device_name
 1892         elif instance.root_device_name:
 1893             root_device_name = instance.root_device_name
 1894             root_bdm.device_name = root_device_name
 1895             update_root_bdm = True
 1896         else:
 1897             root_device_name = self._default_root_device_name(instance,
 1898                                                               image_meta,
 1899                                                               root_bdm)
 1900 
 1901             instance.root_device_name = root_device_name
 1902             root_bdm.device_name = root_device_name
 1903             update_root_bdm = True
 1904 
 1905         if update_root_bdm:
 1906             root_bdm.save()
 1907 
 1908         ephemerals = []
 1909         swap = []
 1910         block_device_mapping = []
 1911 
 1912         for device in block_devices:
 1913             if block_device.new_format_is_ephemeral(device):
 1914                 ephemerals.append(device)
 1915 
 1916             if block_device.new_format_is_swap(device):
 1917                 swap.append(device)
 1918 
 1919             if driver_block_device.is_block_device_mapping(device):
 1920                 block_device_mapping.append(device)
 1921 
 1922         self._default_device_names_for_instance(instance,
 1923                                                 root_device_name,
 1924                                                 ephemerals,
 1925                                                 swap,
 1926                                                 block_device_mapping)
 1927 
 1928     def _block_device_info_to_legacy(self, block_device_info):
 1929         """Convert BDI to the old format for drivers that need it."""
 1930 
 1931         if self.use_legacy_block_device_info:
 1932             ephemerals = driver_block_device.legacy_block_devices(
 1933                 driver.block_device_info_get_ephemerals(block_device_info))
 1934             mapping = driver_block_device.legacy_block_devices(
 1935                 driver.block_device_info_get_mapping(block_device_info))
 1936             swap = block_device_info['swap']
 1937             if swap:
 1938                 swap = swap.legacy()
 1939 
 1940             block_device_info.update({
 1941                 'ephemerals': ephemerals,
 1942                 'swap': swap,
 1943                 'block_device_mapping': mapping})
 1944 
 1945     def _add_missing_dev_names(self, bdms, instance):
 1946         for bdm in bdms:
 1947             if bdm.device_name is not None:
 1948                 continue
 1949 
 1950             device_name = self._get_device_name_for_instance(instance,
 1951                                                              bdms, bdm)
 1952             values = {'device_name': device_name}
 1953             bdm.update(values)
 1954             bdm.save()
 1955 
 1956     def _prep_block_device(self, context, instance, bdms):
 1957         """Set up the block device for an instance with error logging."""
 1958         try:
 1959             self._add_missing_dev_names(bdms, instance)
 1960             block_device_info = driver.get_block_device_info(instance, bdms)
 1961             mapping = driver.block_device_info_get_mapping(block_device_info)
 1962             driver_block_device.attach_block_devices(
 1963                 mapping, context, instance, self.volume_api, self.driver,
 1964                 wait_func=self._await_block_device_map_created)
 1965 
 1966             self._block_device_info_to_legacy(block_device_info)
 1967             return block_device_info
 1968 
 1969         except exception.OverQuota as e:
 1970             LOG.warning('Failed to create block device for instance due'
 1971                         ' to exceeding volume related resource quota.'
 1972                         ' Error: %s', e.message, instance=instance)
 1973             raise
 1974 
 1975         except Exception as ex:
 1976             LOG.exception('Instance failed block device setup',
 1977                           instance=instance)
 1978             # InvalidBDM will eventually result in a BuildAbortException when
 1979             # booting from volume, and will be recorded as an instance fault.
 1980             # Maintain the original exception message which most likely has
 1981             # useful details which the standard InvalidBDM error message lacks.
 1982             raise exception.InvalidBDM(six.text_type(ex))
 1983 
 1984     def _update_instance_after_spawn(self, instance,
 1985                                      vm_state=vm_states.ACTIVE):
 1986         instance.power_state = self._get_power_state(instance)
 1987         instance.vm_state = vm_state
 1988         instance.task_state = None
 1989         # NOTE(sean-k-mooney): configdrive.update_instance checks
 1990         # instance.launched_at to determine if it is the first or
 1991         # subsequent spawn of an instance. We need to call update_instance
 1992         # first before setting instance.launched_at or instance.config_drive
 1993         # will never be set to true based on the value of force_config_drive.
 1994         # As a result the config drive will be lost on a hard reboot of the
 1995         # instance even when force_config_drive=true. see bug #1835822.
 1996         configdrive.update_instance(instance)
 1997         instance.launched_at = timeutils.utcnow()
 1998 
 1999     def _update_scheduler_instance_info(self, context, instance):
 2000         """Sends an InstanceList with created or updated Instance objects to
 2001         the Scheduler client.
 2002 
 2003         In the case of init_host, the value passed will already be an
 2004         InstanceList. Other calls will send individual Instance objects that
 2005         have been created or resized. In this case, we create an InstanceList
 2006         object containing that Instance.
 2007         """
 2008         if not self.send_instance_updates:
 2009             return
 2010         if isinstance(instance, obj_instance.Instance):
 2011             instance = objects.InstanceList(objects=[instance])
 2012         context = context.elevated()
 2013         self.query_client.update_instance_info(context, self.host,
 2014                                                instance)
 2015 
 2016     def _delete_scheduler_instance_info(self, context, instance_uuid):
 2017         """Sends the uuid of the deleted Instance to the Scheduler client."""
 2018         if not self.send_instance_updates:
 2019             return
 2020         context = context.elevated()
 2021         self.query_client.delete_instance_info(context, self.host,
 2022                                                instance_uuid)
 2023 
 2024     @periodic_task.periodic_task(spacing=CONF.scheduler_instance_sync_interval)
 2025     def _sync_scheduler_instance_info(self, context):
 2026         if not self.send_instance_updates:
 2027             return
 2028         context = context.elevated()
 2029         instances = objects.InstanceList.get_by_host(context, self.host,
 2030                                                      expected_attrs=[],
 2031                                                      use_slave=True)
 2032         uuids = [instance.uuid for instance in instances]
 2033         self.query_client.sync_instance_info(context, self.host, uuids)
 2034 
 2035     def _notify_about_instance_usage(self, context, instance, event_suffix,
 2036                                      network_info=None, extra_usage_info=None,
 2037                                      fault=None):
 2038         compute_utils.notify_about_instance_usage(
 2039             self.notifier, context, instance, event_suffix,
 2040             network_info=network_info,
 2041             extra_usage_info=extra_usage_info, fault=fault)
 2042 
 2043     def _deallocate_network(self, context, instance,
 2044                             requested_networks=None):
 2045         # If we were told not to allocate networks let's save ourselves
 2046         # the trouble of calling the network API.
 2047         if requested_networks and requested_networks.no_allocate:
 2048             LOG.debug("Skipping network deallocation for instance since "
 2049                       "networking was not requested.", instance=instance)
 2050             return
 2051 
 2052         LOG.debug('Deallocating network for instance', instance=instance)
 2053         with timeutils.StopWatch() as timer:
 2054             self.network_api.deallocate_for_instance(
 2055                 context, instance, requested_networks=requested_networks)
 2056         # nova-network does an rpc call so we're OK tracking time spent here
 2057         LOG.info('Took %0.2f seconds to deallocate network for instance.',
 2058                  timer.elapsed(), instance=instance)
 2059 
 2060     def _get_instance_block_device_info(self, context, instance,
 2061                                         refresh_conn_info=False,
 2062                                         bdms=None):
 2063         """Transform block devices to the driver block_device format."""
 2064 
 2065         if bdms is None:
 2066             bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
 2067                     context, instance.uuid)
 2068         block_device_info = driver.get_block_device_info(instance, bdms)
 2069 
 2070         if not refresh_conn_info:
 2071             # if the block_device_mapping has no value in connection_info
 2072             # (returned as None), don't include in the mapping
 2073             block_device_info['block_device_mapping'] = [
 2074                 bdm for bdm in driver.block_device_info_get_mapping(
 2075                                     block_device_info)
 2076                 if bdm.get('connection_info')]
 2077         else:
 2078             driver_block_device.refresh_conn_infos(
 2079                 driver.block_device_info_get_mapping(block_device_info),
 2080                 context, instance, self.volume_api, self.driver)
 2081 
 2082         self._block_device_info_to_legacy(block_device_info)
 2083 
 2084         return block_device_info
 2085 
 2086     def _build_failed(self, node):
 2087         if CONF.compute.consecutive_build_service_disable_threshold:
 2088             # NOTE(danms): Update our counter, but wait for the next
 2089             # update_available_resource() periodic to flush it to the DB
 2090             self.rt.build_failed(node)
 2091 
 2092     def _build_succeeded(self, node):
 2093         self.rt.build_succeeded(node)
 2094 
    @wrap_exception()
    @reverts_task_state
    @wrap_instance_fault
    def build_and_run_instance(self, context, instance, image, request_spec,
                     filter_properties, admin_password=None,
                     injected_files=None, requested_networks=None,
                     security_groups=None, block_device_mapping=None,
                     node=None, limits=None, host_list=None, accel_uuids=None):
        """RPC entry point for building and booting an instance on this host.

        The heavy lifting happens in _do_build_and_run_instance; this
        wrapper serializes builds per instance uuid, throttles concurrent
        builds host-wide via self._build_semaphore, and runs the work in a
        spawned greenthread so the RPC worker returns immediately.
        """

        @utils.synchronized(instance.uuid)
        def _locked_do_build_and_run_instance(*args, **kwargs):
            # NOTE(danms): We grab the semaphore with the instance uuid
            # locked because we could wait in line to build this instance
            # for a while and we want to make sure that nothing else tries
            # to do anything with this instance while we wait.
            with self._build_semaphore:
                try:
                    result = self._do_build_and_run_instance(*args, **kwargs)
                except Exception:
                    # NOTE(mriedem): This should really only happen if
                    # _decode_files in _do_build_and_run_instance fails, and
                    # that's before a guest is spawned so it's OK to remove
                    # allocations for the instance for this node from Placement
                    # below as there is no guest consuming resources anyway.
                    # The _decode_files case could be handled more specifically
                    # but that's left for another day.
                    result = build_results.FAILED
                    raise
                finally:
                    if result == build_results.FAILED:
                        # Remove the allocation records from Placement for the
                        # instance if the build failed. The instance.host is
                        # likely set to None in _do_build_and_run_instance
                        # which means if the user deletes the instance, it
                        # will be deleted in the API, not the compute service.
                        # Setting the instance.host to None in
                        # _do_build_and_run_instance means that the
                        # ResourceTracker will no longer consider this instance
                        # to be claiming resources against it, so we want to
                        # reflect that same thing in Placement.  No need to
                        # call this for a reschedule, as the allocations will
                        # have already been removed in
                        # self._do_build_and_run_instance().
                        self.reportclient.delete_allocation_for_instance(
                            context, instance.uuid)

                    # Both FAILED and RESCHEDULED count as a failed build on
                    # this node for the consecutive-build-failure tracking.
                    if result in (build_results.FAILED,
                                  build_results.RESCHEDULED):
                        self._build_failed(node)
                    else:
                        self._build_succeeded(node)

        # NOTE(danms): We spawn here to return the RPC worker thread back to
        # the pool. Since what follows could take a really long time, we don't
        # want to tie up RPC workers.
        utils.spawn_n(_locked_do_build_and_run_instance,
                      context, instance, image, request_spec,
                      filter_properties, admin_password, injected_files,
                      requested_networks, security_groups,
                      block_device_mapping, node, limits, host_list,
                      accel_uuids)
 2156 
 2157     def _check_device_tagging(self, requested_networks, block_device_mapping):
 2158         tagging_requested = False
 2159         if requested_networks:
 2160             for net in requested_networks:
 2161                 if 'tag' in net and net.tag is not None:
 2162                     tagging_requested = True
 2163                     break
 2164         if block_device_mapping and not tagging_requested:
 2165             for bdm in block_device_mapping:
 2166                 if 'tag' in bdm and bdm.tag is not None:
 2167                     tagging_requested = True
 2168                     break
 2169         if (tagging_requested and
 2170                 not self.driver.capabilities.get('supports_device_tagging',
 2171                                                  False)):
 2172             raise exception.BuildAbortException('Attempt to boot guest with '
 2173                                                 'tagged devices on host that '
 2174                                                 'does not support tagging.')
 2175 
 2176     def _check_trusted_certs(self, instance):
 2177         if (instance.trusted_certs and
 2178                 not self.driver.capabilities.get('supports_trusted_certs',
 2179                                                  False)):
 2180             raise exception.BuildAbortException(
 2181                 'Trusted image certificates provided on host that does not '
 2182                 'support certificate validation.')
 2183 
    @wrap_exception()
    @reverts_task_state
    @wrap_instance_event(prefix='compute')
    @wrap_instance_fault
    def _do_build_and_run_instance(self, context, instance, image,
            request_spec, filter_properties, admin_password, injected_files,
            requested_networks, security_groups, block_device_mapping,
            node=None, limits=None, host_list=None, accel_uuids=None):
        """Run a single build attempt and map the outcome to a result code.

        :returns: build_results.ACTIVE on success,
            build_results.RESCHEDULED when the build was re-cast to
            conductor for a retry on another host, or
            build_results.FAILED on unrecoverable failure.
        """
        try:
            LOG.debug('Starting instance...', instance=instance)
            instance.vm_state = vm_states.BUILDING
            instance.task_state = None
            instance.save(expected_task_state=
                    (task_states.SCHEDULING, None))
        except exception.InstanceNotFound:
            msg = 'Instance disappeared before build.'
            LOG.debug(msg, instance=instance)
            return build_results.FAILED
        except exception.UnexpectedTaskStateError as e:
            # Something else (e.g. a delete) changed the task state while we
            # were waiting to start; give up quietly.
            LOG.debug(e.format_message(), instance=instance)
            return build_results.FAILED

        # b64 decode the files to inject:
        decoded_files = self._decode_files(injected_files)

        if limits is None:
            limits = {}

        if node is None:
            node = self._get_nodename(instance, refresh=True)

        try:
            with timeutils.StopWatch() as timer:
                self._build_and_run_instance(context, instance, image,
                        decoded_files, admin_password, requested_networks,
                        security_groups, block_device_mapping, node, limits,
                        filter_properties, request_spec, accel_uuids)
            LOG.info('Took %0.2f seconds to build instance.',
                     timer.elapsed(), instance=instance)
            return build_results.ACTIVE
        except exception.RescheduledException as e:
            # The build failed on this host in a way that may succeed on
            # another one, so attempt a reschedule if retry info is present.
            retry = filter_properties.get('retry')
            if not retry:
                # no retry information, do not reschedule.
                LOG.debug("Retry info not present, will not reschedule",
                    instance=instance)
                self._cleanup_allocated_networks(context, instance,
                    requested_networks)
                self._cleanup_volumes(context, instance,
                    block_device_mapping, raise_exc=False)
                compute_utils.add_instance_fault_from_exc(context,
                        instance, e, sys.exc_info(),
                        fault_message=e.kwargs['reason'])
                self._nil_out_instance_obj_host_and_node(instance)
                self._set_instance_obj_error_state(instance,
                                                   clean_task_state=True)
                return build_results.FAILED
            LOG.debug(e.format_message(), instance=instance)
            # This will be used for logging the exception
            retry['exc'] = traceback.format_exception(*sys.exc_info())
            # This will be used for setting the instance fault message
            retry['exc_reason'] = e.kwargs['reason']

            self._cleanup_allocated_networks(context, instance,
                                             requested_networks)

            self._nil_out_instance_obj_host_and_node(instance)
            instance.task_state = task_states.SCHEDULING
            instance.save()
            # The instance will have already claimed resources from this host
            # before this build was attempted. Now that it has failed, we need
            # to unclaim those resources before casting to the conductor, so
            # that if there are alternate hosts available for a retry, it can
            # claim resources on that new host for the instance.
            self.reportclient.delete_allocation_for_instance(context,
                                                             instance.uuid)

            # Re-cast to conductor, which picks the next alternate host from
            # host_list if one is available.
            self.compute_task_api.build_instances(context, [instance],
                    image, filter_properties, admin_password,
                    injected_files, requested_networks, security_groups,
                    block_device_mapping, request_spec=request_spec,
                    host_lists=[host_list])
            return build_results.RESCHEDULED
        except (exception.InstanceNotFound,
                exception.UnexpectedDeletingTaskStateError):
            # The instance was deleted out from under the build.
            msg = 'Instance disappeared during build.'
            LOG.debug(msg, instance=instance)
            self._cleanup_allocated_networks(context, instance,
                    requested_networks)
            return build_results.FAILED
        except Exception as e:
            if isinstance(e, exception.BuildAbortException):
                LOG.error(e.format_message(), instance=instance)
            else:
                # Should not reach here.
                LOG.exception('Unexpected build failure, not rescheduling '
                              'build.', instance=instance)
            self._cleanup_allocated_networks(context, instance,
                    requested_networks)
            self._cleanup_volumes(context, instance,
                    block_device_mapping, raise_exc=False)
            compute_utils.add_instance_fault_from_exc(context, instance,
                    e, sys.exc_info())
            self._nil_out_instance_obj_host_and_node(instance)
            self._set_instance_obj_error_state(instance, clean_task_state=True)
            return build_results.FAILED
 2291 
 2292     @staticmethod
 2293     def _get_scheduler_hints(filter_properties, request_spec=None):
 2294         """Helper method to get scheduler hints.
 2295 
 2296         This method prefers to get the hints out of the request spec, but that
 2297         might not be provided. Conductor will pass request_spec down to the
 2298         first compute chosen for a build but older computes will not pass
 2299         the request_spec to conductor's build_instances method for a
 2300         a reschedule, so if we're on a host via a retry, request_spec may not
 2301         be provided so we need to fallback to use the filter_properties
 2302         to get scheduler hints.
 2303         """
 2304         hints = {}
 2305         if request_spec is not None and 'scheduler_hints' in request_spec:
 2306             hints = request_spec.scheduler_hints
 2307         if not hints:
 2308             hints = filter_properties.get('scheduler_hints') or {}
 2309         return hints
 2310 
 2311     @staticmethod
 2312     def _get_request_group_mapping(request_spec):
 2313         """Return request group resource - provider mapping. This is currently
 2314         used for Neutron ports that have resource request due to the port
 2315         having QoS minimum bandwidth policy rule attached.
 2316 
 2317         :param request_spec: A RequestSpec object or None
 2318         :returns: A dict keyed by RequestGroup requester_id, currently Neutron
 2319         port_id, to resource provider UUID that provides resource for that
 2320         RequestGroup. Or None if the request_spec was None.
 2321         """
 2322         if request_spec:
 2323             return request_spec.get_request_group_mapping()
 2324         else:
 2325             return None
 2326 
 2327     def _build_and_run_instance(self, context, instance, image, injected_files,
 2328             admin_password, requested_networks, security_groups,
 2329             block_device_mapping, node, limits, filter_properties,
 2330             request_spec=None, accel_uuids=None):
 2331 
 2332         image_name = image.get('name')
 2333         self._notify_about_instance_usage(context, instance, 'create.start',
 2334                 extra_usage_info={'image_name': image_name})
 2335         compute_utils.notify_about_instance_create(
 2336             context, instance, self.host,
 2337             phase=fields.NotificationPhase.START,
 2338             bdms=block_device_mapping)
 2339 
 2340         # NOTE(mikal): cache the keystone roles associated with the instance
 2341         # at boot time for later reference
 2342         instance.system_metadata.update(
 2343             {'boot_roles': ','.join(context.roles)})
 2344 
 2345         self._check_device_tagging(requested_networks, block_device_mapping)
 2346         self._check_trusted_certs(instance)
 2347 
 2348         provider_mapping = self._get_request_group_mapping(request_spec)
 2349 
 2350         if provider_mapping:
 2351             try:
 2352                 compute_utils\
 2353                     .update_pci_request_spec_with_allocated_interface_name(
 2354                         context, self.reportclient, instance, provider_mapping)
 2355             except (exception.AmbiguousResourceProviderForPCIRequest,
 2356                     exception.UnexpectedResourceProviderNameForPCIRequest
 2357                     ) as e:
 2358                 raise exception.BuildAbortException(
 2359                     reason=six.text_type(e), instance_uuid=instance.uuid)
 2360 
 2361         # TODO(Luyao) cut over to get_allocs_for_consumer
 2362         allocs = self.reportclient.get_allocations_for_consumer(
 2363                 context, instance.uuid)
 2364 
 2365         try:
 2366             scheduler_hints = self._get_scheduler_hints(filter_properties,
 2367                                                         request_spec)
 2368             with self.rt.instance_claim(context, instance, node, allocs,
 2369                                         limits):
 2370                 # NOTE(russellb) It's important that this validation be done
 2371                 # *after* the resource tracker instance claim, as that is where
 2372                 # the host is set on the instance.
 2373                 self._validate_instance_group_policy(context, instance,
 2374                                                      scheduler_hints)
 2375                 image_meta = objects.ImageMeta.from_dict(image)
 2376 
 2377                 with self._build_resources(context, instance,
 2378                         requested_networks, security_groups, image_meta,
 2379                         block_device_mapping, provider_mapping,
 2380                         accel_uuids) as resources:
 2381                     instance.vm_state = vm_states.BUILDING
 2382                     instance.task_state = task_states.SPAWNING
 2383                     # NOTE(JoshNang) This also saves the changes to the
 2384                     # instance from _allocate_network_async, as they aren't
 2385                     # saved in that function to prevent races.
 2386                     instance.save(expected_task_state=
 2387                             task_states.BLOCK_DEVICE_MAPPING)
 2388                     block_device_info = resources['block_device_info']
 2389                     network_info = resources['network_info']
 2390                     accel_info = resources['accel_info']
 2391                     LOG.debug('Start spawning the instance on the hypervisor.',
 2392                               instance=instance)
 2393                     with timeutils.StopWatch() as timer:
 2394                         self.driver.spawn(context, instance, image_meta,
 2395                                           injected_files, admin_password,
 2396                                           allocs, network_info=network_info,
 2397                                           block_device_info=block_device_info,
 2398                                           accel_info=accel_info)
 2399                     LOG.info('Took %0.2f seconds to spawn the instance on '
 2400                              'the hypervisor.', timer.elapsed(),
 2401                              instance=instance)
 2402         except (exception.InstanceNotFound,
 2403                 exception.UnexpectedDeletingTaskStateError) as e:
 2404             with excutils.save_and_reraise_exception():
 2405                 self._notify_about_instance_usage(context, instance,
 2406                     'create.error', fault=e)
 2407                 compute_utils.notify_about_instance_create(
 2408                     context, instance, self.host,
 2409                     phase=fields.NotificationPhase.ERROR, exception=e,
 2410                     bdms=block_device_mapping)
 2411         except exception.ComputeResourcesUnavailable as e:
 2412             LOG.debug(e.format_message(), instance=instance)
 2413             self._notify_about_instance_usage(context, instance,
 2414                     'create.error', fault=e)
 2415             compute_utils.notify_about_instance_create(
 2416                     context, instance, self.host,
 2417                     phase=fields.NotificationPhase.ERROR, exception=e,
 2418                     bdms=block_device_mapping)
 2419             raise exception.RescheduledException(
 2420                     instance_uuid=instance.uuid, reason=e.format_message())
 2421         except exception.BuildAbortException as e:
 2422             with excutils.save_and_reraise_exception():
 2423                 LOG.debug(e.format_message(), instance=instance)
 2424                 self._notify_about_instance_usage(context, instance,
 2425                     'create.error', fault=e)
 2426                 compute_utils.notify_about_instance_create(
 2427                     context, instance, self.host,
 2428                     phase=fields.NotificationPhase.ERROR, exception=e,
 2429                     bdms=block_device_mapping)
 2430         except exception.NoMoreFixedIps as e:
 2431             LOG.warning('No more fixed IP to be allocated',
 2432                         instance=instance)
 2433             self._notify_about_instance_usage(context, instance,
 2434                     'create.error', fault=e)
 2435             compute_utils.notify_about_instance_create(
 2436                     context, instance, self.host,
 2437                     phase=fields.NotificationPhase.ERROR, exception=e,
 2438                     bdms=block_device_mapping)
 2439             msg = _('Failed to allocate the network(s) with error %s, '
 2440                     'not rescheduling.') % e.format_message()
 2441             raise exception.BuildAbortException(instance_uuid=instance.uuid,
 2442                     reason=msg)
 2443         except (exception.ExternalNetworkAttachForbidden,
 2444                 exception.VirtualInterfaceCreateException,
 2445                 exception.VirtualInterfaceMacAddressException,
 2446                 exception.FixedIpInvalidOnHost,
 2447                 exception.UnableToAutoAllocateNetwork,
 2448                 exception.NetworksWithQoSPolicyNotSupported) as e:
 2449             LOG.exception('Failed to allocate network(s)',
 2450                           instance=instance)
 2451             self._notify_about_instance_usage(context, instance,
 2452                     'create.error', fault=e)
 2453             compute_utils.notify_about_instance_create(
 2454                     context, instance, self.host,
 2455                     phase=fields.NotificationPhase.ERROR, exception=e,
 2456                     bdms=block_device_mapping)
 2457             msg = _('Failed to allocate the network(s), not rescheduling.')
 2458             raise exception.BuildAbortException(instance_uuid=instance.uuid,
 2459                     reason=msg)
 2460         except (exception.FlavorDiskTooSmall,
 2461                 exception.FlavorMemoryTooSmall,
 2462                 exception.ImageNotActive,
 2463                 exception.ImageUnacceptable,
 2464                 exception.InvalidDiskInfo,
 2465                 exception.InvalidDiskFormat,
 2466                 cursive_exception.SignatureVerificationError,
 2467                 exception.CertificateValidationFailed,
 2468                 exception.VolumeEncryptionNotSupported,
 2469                 exception.InvalidInput,
 2470                 # TODO(mriedem): We should be validating RequestedVRamTooHigh
 2471                 # in the API during server create and rebuild.
 2472                 exception.RequestedVRamTooHigh) as e:
 2473             self._notify_about_instance_usage(context, instance,
 2474                     'create.error', fault=e)
 2475             compute_utils.notify_about_instance_create(
 2476                     context, instance, self.host,
 2477                     phase=fields.NotificationPhase.ERROR, exception=e,
 2478                     bdms=block_device_mapping)
 2479             raise exception.BuildAbortException(instance_uuid=instance.uuid,
 2480                     reason=e.format_message())
 2481         except Exception as e:
 2482             LOG.exception('Failed to build and run instance',
 2483                           instance=instance)
 2484             self._notify_about_instance_usage(context, instance,
 2485                     'create.error', fault=e)
 2486             compute_utils.notify_about_instance_create(
 2487                     context, instance, self.host,
 2488                     phase=fields.NotificationPhase.ERROR, exception=e,
 2489                     bdms=block_device_mapping)
 2490             raise exception.RescheduledException(
 2491                     instance_uuid=instance.uuid, reason=six.text_type(e))
 2492 
 2493         # NOTE(alaski): This is only useful during reschedules, remove it now.
 2494         instance.system_metadata.pop('network_allocated', None)
 2495 
 2496         # If CONF.default_access_ip_network_name is set, grab the
 2497         # corresponding network and set the access ip values accordingly.
 2498         network_name = CONF.default_access_ip_network_name
 2499         if (network_name and not instance.access_ip_v4 and
 2500                 not instance.access_ip_v6):
 2501             # Note that when there are multiple ips to choose from, an
 2502             # arbitrary one will be chosen.
 2503             for vif in network_info:
 2504                 if vif['network']['label'] == network_name:
 2505                     for ip in vif.fixed_ips():
 2506                         if not instance.access_ip_v4 and ip['version'] == 4:
 2507                             instance.access_ip_v4 = ip['address']
 2508                         if not instance.access_ip_v6 and ip['version'] == 6:
 2509                             instance.access_ip_v6 = ip['address']
 2510                     break
 2511 
 2512         self._update_instance_after_spawn(instance)
 2513 
 2514         try:
 2515             instance.save(expected_task_state=task_states.SPAWNING)
 2516         except (exception.InstanceNotFound,
 2517                 exception.UnexpectedDeletingTaskStateError) as e:
 2518             with excutils.save_and_reraise_exception():
 2519                 self._notify_about_instance_usage(context, instance,
 2520                     'create.error', fault=e)
 2521                 compute_utils.notify_about_instance_create(
 2522                     context, instance, self.host,
 2523                     phase=fields.NotificationPhase.ERROR, exception=e,
 2524                     bdms=block_device_mapping)
 2525 
 2526         self._update_scheduler_instance_info(context, instance)
 2527         self._notify_about_instance_usage(context, instance, 'create.end',
 2528                 extra_usage_info={'message': _('Success')},
 2529                 network_info=network_info)
 2530         compute_utils.notify_about_instance_create(context, instance,
 2531                 self.host, phase=fields.NotificationPhase.END,
 2532                 bdms=block_device_mapping)
 2533 
 2534     def _build_resources_cleanup(self, instance, network_info):
 2535         # Make sure the async call finishes
 2536         if network_info is not None:
 2537             network_info.wait(do_raise=False)
 2538             self.driver.clean_networks_preparation(instance,
 2539                                                    network_info)
 2540         self.driver.failed_spawn_cleanup(instance)
 2541 
    @contextlib.contextmanager
    def _build_resources(self, context, instance, requested_networks,
                         security_groups, image_meta, block_device_mapping,
                         resource_provider_mapping, accel_uuids):
        """Context manager that sets up the resources an instance build
        needs and tears them down again if the managed block fails.

        Setup happens in three stages, each converting its failures to
        ``BuildAbortException`` (except delete/not-found races, which are
        re-raised as-is):

        1. asynchronous network allocation (``network_info``);
        2. driver spawn preparation and block device mapping
           (``block_device_info``) — this also moves the instance to
           BLOCK_DEVICE_MAPPING task state and saves it;
        3. bound Cyborg accelerator requests (``accel_info``), only when
           the flavor requests a device profile.

        The yielded dict contains the keys ``network_info``,
        ``block_device_info`` and ``accel_info``. If the body (the actual
        spawn) raises, the instance is shut down, networking may be
        deallocated and any accelerator requests are deleted before the
        original exception is re-raised (or replaced by
        ``BuildAbortException`` if that cleanup itself fails).

        :param context: nova request context
        :param instance: nova.objects.Instance being built
        :param requested_networks: nova.objects.NetworkRequestList
        :param security_groups: security groups to associate with the
            instance's ports
        :param image_meta: nova.objects.ImageMeta for the boot image
        :param block_device_mapping: BlockDeviceMappingList for the build
        :param resource_provider_mapping: request-group to resource
            provider mapping used during network allocation
        :param accel_uuids: list of accelerator request (ARQ) UUIDs
        :raises BuildAbortException: when setup or cleanup fails in a way
            that should not be rescheduled
        """
        resources = {}
        network_info = None
        try:
            LOG.debug('Start building networks asynchronously for instance.',
                      instance=instance)
            network_info = self._build_networks_for_instance(context, instance,
                    requested_networks, security_groups,
                    resource_provider_mapping)
            resources['network_info'] = network_info
        except (exception.InstanceNotFound,
                exception.UnexpectedDeletingTaskStateError):
            # The instance is going away; let the caller handle that
            # directly rather than wrapping it as a build abort.
            raise
        except exception.UnexpectedTaskStateError as e:
            raise exception.BuildAbortException(instance_uuid=instance.uuid,
                    reason=e.format_message())
        except Exception:
            # Because this allocation is async any failures are likely to occur
            # when the driver accesses network_info during spawn().
            LOG.exception('Failed to allocate network(s)',
                          instance=instance)
            msg = _('Failed to allocate the network(s), not rescheduling.')
            raise exception.BuildAbortException(instance_uuid=instance.uuid,
                    reason=msg)

        try:
            # Perform any driver preparation work for the driver.
            self.driver.prepare_for_spawn(instance)

            # Depending on a virt driver, some network configuration is
            # necessary before preparing block devices.
            self.driver.prepare_networks_before_block_device_mapping(
                instance, network_info)

            # Verify that all the BDMs have a device_name set and assign a
            # default to the ones missing it with the help of the driver.
            self._default_block_device_names(instance, image_meta,
                                             block_device_mapping)

            LOG.debug('Start building block device mappings for instance.',
                      instance=instance)
            instance.vm_state = vm_states.BUILDING
            instance.task_state = task_states.BLOCK_DEVICE_MAPPING
            instance.save()

            block_device_info = self._prep_block_device(context, instance,
                    block_device_mapping)
            resources['block_device_info'] = block_device_info
        except (exception.InstanceNotFound,
                exception.UnexpectedDeletingTaskStateError):
            # Instance was deleted out from under us; clean up what we
            # built so far and re-raise the original error.
            with excutils.save_and_reraise_exception():
                self._build_resources_cleanup(instance, network_info)
        except (exception.UnexpectedTaskStateError,
                exception.OverQuota, exception.InvalidBDM) as e:
            self._build_resources_cleanup(instance, network_info)
            raise exception.BuildAbortException(instance_uuid=instance.uuid,
                    reason=e.format_message())
        except Exception:
            LOG.exception('Failure prepping block device',
                          instance=instance)
            self._build_resources_cleanup(instance, network_info)
            msg = _('Failure prepping block device.')
            raise exception.BuildAbortException(instance_uuid=instance.uuid,
                    reason=msg)

        arqs = []
        # Accelerators are only fetched when the flavor asks for a Cyborg
        # device profile; any failure (including a binding timeout) aborts
        # the build and releases the ARQs.
        if instance.flavor.extra_specs.get('accel:device_profile'):
            try:
                arqs = self._get_bound_arq_resources(
                    context, instance, accel_uuids)
            except (Exception, eventlet.timeout.Timeout) as exc:
                LOG.exception(exc)
                self._build_resources_cleanup(instance, network_info)
                compute_utils.delete_arqs_if_needed(context, instance)
                msg = _('Failure getting accelerator requests.')
                raise exception.BuildAbortException(
                    reason=msg, instance_uuid=instance.uuid)

        resources['accel_info'] = arqs
        try:
            yield resources
        except Exception as exc:
            with excutils.save_and_reraise_exception() as ctxt:
                if not isinstance(exc, (
                        exception.InstanceNotFound,
                        exception.UnexpectedDeletingTaskStateError)):
                    LOG.exception('Instance failed to spawn',
                                  instance=instance)
                # Make sure the async call finishes
                if network_info is not None:
                    network_info.wait(do_raise=False)
                # if network_info is empty we're likely here because of
                # network allocation failure. Since nothing can be reused on
                # rescheduling it's better to deallocate network to eliminate
                # the chance of orphaned ports in neutron
                deallocate_networks = False if network_info else True
                try:
                    self._shutdown_instance(context, instance,
                            block_device_mapping, requested_networks,
                            try_deallocate_networks=deallocate_networks)
                except Exception as exc2:
                    # Cleanup failed: replace the original exception with a
                    # non-reschedulable abort so we don't retry on a host in
                    # an unknown state.
                    ctxt.reraise = False
                    LOG.warning('Could not clean up failed build,'
                                ' not rescheduling. Error: %s',
                                six.text_type(exc2))
                    raise exception.BuildAbortException(
                            instance_uuid=instance.uuid,
                            reason=six.text_type(exc))
                finally:
                    # Call Cyborg to delete accelerator requests
                    compute_utils.delete_arqs_if_needed(context, instance)
 2656 
    def _get_bound_arq_resources(self, context, instance, arq_uuids):
        """Get bound accelerator requests.

        The ARQ binding was kicked off in the conductor as an async
        operation. Here we wait for the notification from Cyborg.

        If the notification arrived before this point, which can happen
        in many/most cases (see [1]), it will be lost. To handle that,
        we use exit_wait_early.
        [1] https://review.opendev.org/#/c/631244/46/nova/compute/
            manager.py@2627

        :param context: nova request context, used to build the Cyborg
            client
        :param instance: instance object
        :param arq_uuids: List of accelerator request (ARQ) UUIDs, or None
            to look the instance's ARQs up in Cyborg first.
        :returns: List of ARQs for which bindings have completed,
                  successfully or otherwise
        :raises: whatever wait_for_instance_event raises on timeout after
            CONF.arq_binding_timeout seconds (the caller catches
            eventlet.timeout.Timeout)
        """

        cyclient = cyborg.get_client(context)
        if arq_uuids is None:
            arqs = cyclient.get_arqs_for_instance(instance.uuid)
            arq_uuids = [arq['uuid'] for arq in arqs]
        # One "accelerator-request-bound" external event is expected per ARQ.
        events = [('accelerator-request-bound', arq_uuid)
                  for arq_uuid in arq_uuids]

        timeout = CONF.arq_binding_timeout
        with self.virtapi.wait_for_instance_event(
                instance, events, deadline=timeout):
            # NOTE: this query must happen inside the wait context so that
            # events resolved before we started waiting can be excused.
            resolved_arqs = cyclient.get_arqs_for_instance(
                    instance.uuid, only_resolved=True)
            # Events for these resolved ARQs may have already arrived.
            # Such 'early' events need to be ignored.
            early_events = [('accelerator-request-bound', arq['uuid'])
                             for arq in resolved_arqs]
            if early_events:
                self.virtapi.exit_wait_early(early_events)

        # Since a timeout in wait_for_instance_event will raise, we get
        # here only if all binding events have been received.
        resolved_uuids = [arq['uuid'] for arq in resolved_arqs]
        if sorted(resolved_uuids) != sorted(arq_uuids):
            # Query Cyborg to get all.
            arqs = cyclient.get_arqs_for_instance(instance.uuid)
        else:
            arqs = resolved_arqs
        return arqs
 2703 
 2704     def _cleanup_allocated_networks(self, context, instance,
 2705             requested_networks):
 2706         """Cleanup networks allocated for instance.
 2707 
 2708         :param context: nova request context
 2709         :param instance: nova.objects.instance.Instance object
 2710         :param requested_networks: nova.objects.NetworkRequestList
 2711         """
 2712         LOG.debug('Unplugging VIFs for instance', instance=instance)
 2713 
 2714         network_info = instance.get_network_info()
 2715 
 2716         # NOTE(stephenfin) to avoid nova destroying the instance without
 2717         # unplugging the interface, refresh network_info if it is empty.
 2718         if not network_info:
 2719             try:
 2720                 network_info = self.network_api.get_instance_nw_info(
 2721                     context, instance,
 2722                 )
 2723             except Exception as exc:
 2724                 LOG.warning(
 2725                     'Failed to update network info cache when cleaning up '
 2726                     'allocated networks. Stale VIFs may be left on this host.'
 2727                     'Error: %s', six.text_type(exc)
 2728                 )
 2729                 return
 2730 
 2731         try:
 2732             self.driver.unplug_vifs(instance, network_info)
 2733         except NotImplementedError:
 2734             # This is an optional method so ignore things if it doesn't exist
 2735             LOG.debug(
 2736                 'Virt driver does not provide unplug_vifs method, so it '
 2737                 'is not possible determine if VIFs should be unplugged.'
 2738             )
 2739         except exception.NovaException as exc:
 2740             # It's possible that the instance never got as far as plugging
 2741             # VIFs, in which case we would see an exception which can be
 2742             # mostly ignored
 2743             LOG.warning(
 2744                 'Cleaning up VIFs failed for instance. Error: %s',
 2745                 six.text_type(exc), instance=instance,
 2746             )
 2747         else:
 2748             LOG.debug('Unplugged VIFs for instance', instance=instance)
 2749 
 2750         try:
 2751             self._deallocate_network(context, instance, requested_networks)
 2752         except Exception:
 2753             LOG.exception('Failed to deallocate networks', instance=instance)
 2754             return
 2755 
 2756         instance.system_metadata['network_allocated'] = 'False'
 2757         try:
 2758             instance.save()
 2759         except exception.InstanceNotFound:
 2760             # NOTE(alaski): It's possible that we're cleaning up the networks
 2761             # because the instance was deleted.  If that's the case then this
 2762             # exception will be raised by instance.save()
 2763             pass
 2764 
 2765     def _try_deallocate_network(self, context, instance,
 2766                                 requested_networks=None):
 2767 
 2768         # During auto-scale cleanup, we could be deleting a large number
 2769         # of servers at the same time and overloading parts of the system,
 2770         # so we retry a few times in case of connection failures to the
 2771         # networking service.
 2772         @loopingcall.RetryDecorator(
 2773             max_retry_count=3, inc_sleep_time=2, max_sleep_time=12,
 2774             exceptions=(keystone_exception.connection.ConnectFailure,))
 2775         def _deallocate_network_with_retries():
 2776             try:
 2777                 self._deallocate_network(
 2778                     context, instance, requested_networks)
 2779             except keystone_exception.connection.ConnectFailure as e:
 2780                 # Provide a warning that something is amiss.
 2781                 with excutils.save_and_reraise_exception():
 2782                     LOG.warning('Failed to deallocate network for instance; '
 2783                                 'retrying. Error: %s', six.text_type(e),
 2784                                 instance=instance)
 2785 
 2786         try:
 2787             # tear down allocated network structure
 2788             _deallocate_network_with_retries()
 2789         except Exception as ex:
 2790             with excutils.save_and_reraise_exception():
 2791                 LOG.error('Failed to deallocate network for instance. '
 2792                           'Error: %s', ex, instance=instance)
 2793                 self._set_instance_obj_error_state(instance)
 2794 
 2795     def _get_power_off_values(self, instance, clean_shutdown):
 2796         """Get the timing configuration for powering down this instance."""
 2797         if clean_shutdown:
 2798             timeout = compute_utils.get_value_from_system_metadata(instance,
 2799                           key='image_os_shutdown_timeout', type=int,
 2800                           default=CONF.shutdown_timeout)
 2801             retry_interval = CONF.compute.shutdown_retry_interval
 2802         else:
 2803             timeout = 0
 2804             retry_interval = 0
 2805 
 2806         return timeout, retry_interval
 2807 
 2808     def _power_off_instance(self, instance, clean_shutdown=True):
 2809         """Power off an instance on this host."""
 2810         timeout, retry_interval = self._get_power_off_values(
 2811             instance, clean_shutdown)
 2812         self.driver.power_off(instance, timeout, retry_interval)
 2813 
    def _shutdown_instance(self, context, instance,
                           bdms, requested_networks=None, notify=True,
                           try_deallocate_networks=True):
        """Shutdown an instance on this host.

        Destroys the guest via the virt driver, optionally tears down its
        networking, then tells cinder the volume attachments are done
        with, emitting start/end shutdown notifications around the whole
        operation.

        :param:context: security context
        :param:instance: a nova.objects.Instance object
        :param:bdms: the block devices for the instance to be torn
                     down
        :param:requested_networks: the networks on which the instance
                                   has ports
        :param:notify: true if a final usage notification should be
                       emitted
        :param:try_deallocate_networks: false if we should avoid
                                        trying to teardown networking
        """
        # Work with an elevated (admin) context so cleanup is not limited
        # by the calling user's privileges.
        context = context.elevated()
        LOG.info('Terminating instance', instance=instance)

        if notify:
            self._notify_about_instance_usage(context, instance,
                                              "shutdown.start")
            compute_utils.notify_about_instance_action(context, instance,
                    self.host, action=fields.NotificationAction.SHUTDOWN,
                    phase=fields.NotificationPhase.START, bdms=bdms)

        network_info = instance.get_network_info()

        # NOTE(arnaudmorin) to avoid nova destroying the instance without
        # unplugging the interface, refresh network_info if it is empty.
        if not network_info:
            network_info = self.network_api.get_instance_nw_info(
                context, instance)

        # NOTE(vish) get bdms before destroying the instance
        vol_bdms = [bdm for bdm in bdms if bdm.is_volume]
        block_device_info = self._get_instance_block_device_info(
            context, instance, bdms=bdms)

        # NOTE(melwitt): attempt driver destroy before releasing ip, may
        #                want to keep ip allocated for certain failures
        try:
            LOG.debug('Start destroying the instance on the hypervisor.',
                      instance=instance)
            with timeutils.StopWatch() as timer:
                self.driver.destroy(context, instance, network_info,
                                    block_device_info)
            LOG.info('Took %0.2f seconds to destroy the instance on the '
                     'hypervisor.', timer.elapsed(), instance=instance)
        except exception.InstancePowerOffFailure:
            # if the instance can't power off, don't release the ip
            with excutils.save_and_reraise_exception():
                pass
        except Exception:
            with excutils.save_and_reraise_exception():
                # deallocate ip and fail without proceeding to
                # volume api calls, preserving current behavior
                if try_deallocate_networks:
                    self._try_deallocate_network(context, instance,
                                                 requested_networks)

        if try_deallocate_networks:
            self._try_deallocate_network(context, instance, requested_networks)

        # Reuse the StopWatch from the destroy above (we only reach here
        # when driver.destroy completed, so ``timer`` is always bound) to
        # time the volume detach work.
        timer.restart()
        for bdm in vol_bdms:
            try:
                if bdm.attachment_id:
                    # Attachment-based flow: a single delete call tells
                    # cinder the attachment is gone.
                    self.volume_api.attachment_delete(context,
                                                      bdm.attachment_id)
                else:
                    # NOTE(vish): actual driver detach done in driver.destroy,
                    #             so just tell cinder that we are done with it.
                    connector = self.driver.get_volume_connector(instance)
                    self.volume_api.terminate_connection(context,
                                                         bdm.volume_id,
                                                         connector)
                    self.volume_api.detach(context, bdm.volume_id,
                                           instance.uuid)

            except exception.VolumeAttachmentNotFound as exc:
                LOG.debug('Ignoring VolumeAttachmentNotFound: %s', exc,
                          instance=instance)
            except exception.DiskNotFound as exc:
                LOG.debug('Ignoring DiskNotFound: %s', exc,
                          instance=instance)
            except exception.VolumeNotFound as exc:
                LOG.debug('Ignoring VolumeNotFound: %s', exc,
                          instance=instance)
            except (cinder_exception.EndpointNotFound,
                    keystone_exception.EndpointNotFound) as exc:
                LOG.warning('Ignoring EndpointNotFound for '
                            'volume %(volume_id)s: %(exc)s',
                            {'exc': exc, 'volume_id': bdm.volume_id},
                            instance=instance)
            except cinder_exception.ClientException as exc:
                LOG.warning('Ignoring unknown cinder exception for '
                            'volume %(volume_id)s: %(exc)s',
                            {'exc': exc, 'volume_id': bdm.volume_id},
                            instance=instance)
            except Exception as exc:
                # Volume teardown is best-effort: the guest is already
                # destroyed, so log and continue with the remaining BDMs.
                LOG.warning('Ignoring unknown exception for '
                            'volume %(volume_id)s: %(exc)s',
                            {'exc': exc, 'volume_id': bdm.volume_id},
                            instance=instance)
        if vol_bdms:
            LOG.info('Took %(time).2f seconds to detach %(num)s volumes '
                     'for instance.',
                     {'time': timer.elapsed(), 'num': len(vol_bdms)},
                     instance=instance)

        if notify:
            self._notify_about_instance_usage(context, instance,
                                              "shutdown.end")
            compute_utils.notify_about_instance_action(context, instance,
                    self.host, action=fields.NotificationAction.SHUTDOWN,
                    phase=fields.NotificationPhase.END, bdms=bdms)
 2931 
 2932     def _cleanup_volumes(self, context, instance, bdms, raise_exc=True,
 2933                          detach=True):
 2934         original_exception = None
 2935         for bdm in bdms:
 2936             if detach and bdm.volume_id:
 2937                 try:
 2938                     LOG.debug("Detaching volume: %s", bdm.volume_id,
 2939                               instance_uuid=instance.uuid)
 2940                     destroy = bdm.delete_on_termination
 2941                     self._detach_volume(context, bdm, instance,
 2942                                         destroy_bdm=destroy)
 2943                 except Exception as exc:
 2944                     original_exception = exc
 2945                     LOG.warning('Failed to detach volume: %(volume_id)s '
 2946                                 'due to %(exc)s',
 2947                                 {'volume_id': bdm.volume_id, 'exc': exc})
 2948 
 2949             if bdm.volume_id and bdm.delete_on_termination:
 2950                 try:
 2951                     LOG.debug("Deleting volume: %s", bdm.volume_id,
 2952                               instance_uuid=instance.uuid)
 2953                     self.volume_api.delete(context, bdm.volume_id)
 2954                 except Exception as exc:
 2955                     original_exception = exc
 2956                     LOG.warning('Failed to delete volume: %(volume_id)s '
 2957                                 'due to %(exc)s',
 2958                                 {'volume_id': bdm.volume_id, 'exc': exc})
 2959         if original_exception is not None and raise_exc:
 2960             raise original_exception
 2961 
    def _delete_instance(self, context, instance, bdms):
        """Delete an instance on this host.

        Emits delete.start/end notifications, shuts the guest down,
        cleans up volumes and Cyborg ARQs, marks the instance DELETED
        and finally destroys the DB record once the placement
        allocation has been removed by _complete_deletion.

        :param context: nova request context
        :param instance: nova.objects.instance.Instance object
        :param bdms: nova.objects.block_device.BlockDeviceMappingList object
        """
        # Any lifecycle events still pending for this instance are moot now
        # that it is being deleted; drop them (logged for debugging only).
        events = self.instance_events.clear_events_for_instance(instance)
        if events:
            LOG.debug('Events pending at deletion: %(events)s',
                      {'events': ','.join(events.keys())},
                      instance=instance)
        self._notify_about_instance_usage(context, instance,
                                          "delete.start")
        compute_utils.notify_about_instance_action(context, instance,
                self.host, action=fields.NotificationAction.DELETE,
                phase=fields.NotificationPhase.START, bdms=bdms)

        self._shutdown_instance(context, instance, bdms)

        # NOTE(vish): We have already deleted the instance, so we have
        #             to ignore problems cleaning up the volumes. It
        #             would be nice to let the user know somehow that
        #             the volume deletion failed, but it is not
        #             acceptable to have an instance that can not be
        #             deleted. Perhaps this could be reworked in the
        #             future to set an instance fault the first time
        #             and to only ignore the failure if the instance
        #             is already in ERROR.

        # NOTE(ameeda): The volumes have already been detached during
        #               the above _shutdown_instance() call and this is
        #               why detach is not requested from
        #               _cleanup_volumes() in this case

        self._cleanup_volumes(context, instance, bdms,
                raise_exc=False, detach=False)
        # Delete Cyborg ARQs if the instance has a device profile.
        compute_utils.delete_arqs_if_needed(context, instance)
        # if a delete task succeeded, always update vm state and task
        # state without expecting task state to be DELETING
        instance.vm_state = vm_states.DELETED
        instance.task_state = None
        instance.power_state = power_state.NOSTATE
        instance.terminated_at = timeutils.utcnow()
        instance.save()

        self._complete_deletion(context, instance)
        # only destroy the instance in the db if the _complete_deletion
        # doesn't raise and therefore allocation is successfully
        # deleted in placement
        instance.destroy()

        self._notify_about_instance_usage(context, instance, "delete.end")
        compute_utils.notify_about_instance_action(context, instance,
                self.host, action=fields.NotificationAction.DELETE,
                phase=fields.NotificationPhase.END, bdms=bdms)
 3019 
 3020     @wrap_exception()
 3021     @reverts_task_state
 3022     @wrap_instance_event(prefix='compute')
 3023     @wrap_instance_fault
 3024     def terminate_instance(self, context, instance, bdms):
 3025         """Terminate an instance on this host."""
 3026         @utils.synchronized(instance.uuid)
 3027         def do_terminate_instance(instance, bdms):
 3028             # NOTE(mriedem): If we are deleting the instance while it was
 3029             # booting from volume, we could be racing with a database update of
 3030             # the BDM volume_id. Since the compute API passes the BDMs over RPC
 3031             # to compute here, the BDMs may be stale at this point. So check
 3032             # for any volume BDMs that don't have volume_id set and if we
 3033             # detect that, we need to refresh the BDM list before proceeding.
 3034             # TODO(mriedem): Move this into _delete_instance and make the bdms
 3035             # parameter optional.
 3036             for bdm in list(bdms):
 3037                 if bdm.is_volume and not bdm.volume_id:
 3038                     LOG.debug('There are potentially stale BDMs during '
 3039                               'delete, refreshing the BlockDeviceMappingList.',
 3040                               instance=instance)
 3041                     bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
 3042                         context, instance.uuid)
 3043                     break
 3044             try:
 3045                 self._delete_instance(context, instance, bdms)
 3046             except exception.InstanceNotFound:
 3047                 LOG.info("Instance disappeared during terminate",
 3048                          instance=instance)
 3049             except Exception:
 3050                 # As we're trying to delete always go to Error if something
 3051                 # goes wrong that _delete_instance can't handle.
 3052                 with excutils.save_and_reraise_exception():
 3053                     LOG.exception('Setting instance vm_state to ERROR',
 3054                                   instance=instance)
 3055                     self._set_instance_obj_error_state(instance)
 3056 
 3057         do_terminate_instance(instance, bdms)
 3058 
 3059     # NOTE(johannes): This is probably better named power_off_instance
 3060     # so it matches the driver method, but because of other issues, we
 3061     # can't use that name in grizzly.
 3062     @wrap_exception()
 3063     @reverts_task_state
 3064     @wrap_instance_event(prefix='compute')
 3065     @wrap_instance_fault
 3066     def stop_instance(self, context, instance, clean_shutdown):
 3067         """Stopping an instance on this host."""
 3068 
 3069         @utils.synchronized(instance.uuid)
 3070         def do_stop_instance():
 3071             current_power_state = self._get_power_state(instance)
 3072             LOG.debug('Stopping instance; current vm_state: %(vm_state)s, '
 3073                       'current task_state: %(task_state)s, current DB '
 3074                       'power_state: %(db_power_state)s, current VM '
 3075                       'power_state: %(current_power_state)s',
 3076                       {'vm_state': instance.vm_state,
 3077                        'task_state': instance.task_state,
 3078                        'db_power_state': instance.power_state,
 3079                        'current_power_state': current_power_state},
 3080                       instance_uuid=instance.uuid)
 3081 
 3082             # NOTE(mriedem): If the instance is already powered off, we are
 3083             # possibly tearing down and racing with other operations, so we can
 3084             # expect the task_state to be None if something else updates the
 3085             # instance and we're not locking it.
 3086             expected_task_state = [task_states.POWERING_OFF]
 3087             # The list of power states is from _sync_instance_power_state.
 3088             if current_power_state in (power_state.NOSTATE,
 3089                                        power_state.SHUTDOWN,
 3090                                        power_state.CRASHED):
 3091                 LOG.info('Instance is already powered off in the '
 3092                          'hypervisor when stop is called.',
 3093                          instance=instance)
 3094                 expected_task_state.append(None)
 3095 
 3096             self._notify_about_instance_usage(context, instance,
 3097                                               "power_off.start")
 3098 
 3099             compute_utils.notify_about_instance_action(context, instance,
 3100                         self.host, action=fields.NotificationAction.POWER_OFF,
 3101                         phase=fields.NotificationPhase.START)
 3102 
 3103             self._power_off_instance(instance, clean_shutdown)
 3104             instance.power_state = self._get_power_state(instance)
 3105             instance.vm_state = vm_states.STOPPED
 3106             instance.task_state = None
 3107             instance.save(expected_task_state=expected_task_state)
 3108             self._notify_about_instance_usage(context, instance,
 3109                                               "power_off.end")
 3110 
 3111             compute_utils.notify_about_instance_action(context, instance,
 3112                         self.host, action=fields.NotificationAction.POWER_OFF,
 3113                         phase=fields.NotificationPhase.END)
 3114 
 3115         do_stop_instance()
 3116 
 3117     def _power_on(self, context, instance):
 3118         network_info = self.network_api.get_instance_nw_info(context, instance)
 3119         block_device_info = self._get_instance_block_device_info(context,
 3120                                                                  instance)
 3121         accel_info = self._get_accel_info(context, instance)
 3122         self.driver.power_on(context, instance,
 3123                              network_info,
 3124                              block_device_info, accel_info)
 3125 
 3126     def _delete_snapshot_of_shelved_instance(self, context, instance,
 3127                                              snapshot_id):
 3128         """Delete snapshot of shelved instance."""
 3129         try:
 3130             self.image_api.delete(context, snapshot_id)
 3131         except (exception.ImageNotFound,
 3132                 exception.ImageNotAuthorized) as exc:
 3133             LOG.warning("Failed to delete snapshot "
 3134                         "from shelved instance (%s).",
 3135                         exc.format_message(), instance=instance)
 3136         except Exception:
 3137             LOG.exception("Something wrong happened when trying to "
 3138                           "delete snapshot from shelved instance.",
 3139                           instance=instance)
 3140 
 3141     # NOTE(johannes): This is probably better named power_on_instance
 3142     # so it matches the driver method, but because of other issues, we
 3143     # can't use that name in grizzly.
 3144     @wrap_exception()
 3145     @reverts_task_state
 3146     @wrap_instance_event(prefix='compute')
 3147     @wrap_instance_fault
 3148     def start_instance(self, context, instance):
 3149         """Starting an instance on this host."""
 3150         self._notify_about_instance_usage(context, instance, "power_on.start")
 3151         compute_utils.notify_about_instance_action(context, instance,
 3152             self.host, action=fields.NotificationAction.POWER_ON,
 3153             phase=fields.NotificationPhase.START)
 3154         self._power_on(context, instance)
 3155         instance.power_state = self._get_power_state(instance)
 3156         instance.vm_state = vm_states.ACTIVE
 3157         instance.task_state = None
 3158 
 3159         # Delete an image(VM snapshot) for a shelved instance
 3160         snapshot_id = instance.system_metadata.get('shelved_image_id')
 3161         if snapshot_id:
 3162             self._delete_snapshot_of_shelved_instance(context, instance,
 3163                                                       snapshot_id)
 3164 
 3165         # Delete system_metadata for a shelved instance
 3166         compute_utils.remove_shelved_keys_from_system_metadata(instance)
 3167 
 3168         instance.save(expected_task_state=task_states.POWERING_ON)
 3169         self._notify_about_instance_usage(context, instance, "power_on.end")
 3170         compute_utils.notify_about_instance_action(context, instance,
 3171             self.host, action=fields.NotificationAction.POWER_ON,
 3172             phase=fields.NotificationPhase.END)
 3173 
 3174     @messaging.expected_exceptions(NotImplementedError,
 3175                                    exception.TriggerCrashDumpNotSupported,
 3176                                    exception.InstanceNotRunning)
 3177     @wrap_exception()
 3178     @wrap_instance_event(prefix='compute')
 3179     @wrap_instance_fault
 3180     def trigger_crash_dump(self, context, instance):
 3181         """Trigger crash dump in an instance."""
 3182 
 3183         self._notify_about_instance_usage(context, instance,
 3184                                           "trigger_crash_dump.start")
 3185         compute_utils.notify_about_instance_action(context, instance,
 3186                 self.host, action=fields.NotificationAction.TRIGGER_CRASH_DUMP,
 3187                 phase=fields.NotificationPhase.START)
 3188 
 3189         # This method does not change task_state and power_state because the
 3190         # effect of a trigger depends on user's configuration.
 3191         self.driver.trigger_crash_dump(instance)
 3192 
 3193         self._notify_about_instance_usage(context, instance,
 3194                                           "trigger_crash_dump.end")
 3195         compute_utils.notify_about_instance_action(context, instance,
 3196                 self.host, action=fields.NotificationAction.TRIGGER_CRASH_DUMP,
 3197                 phase=fields.NotificationPhase.END)
 3198 
 3199     @wrap_exception()
 3200     @reverts_task_state
 3201     @wrap_instance_event(prefix='compute')
 3202     @wrap_instance_fault
 3203     def soft_delete_instance(self, context, instance):
 3204         """Soft delete an instance on this host."""
 3205         with compute_utils.notify_about_instance_delete(
 3206                 self.notifier, context, instance, 'soft_delete',
 3207                 source=fields.NotificationSource.COMPUTE):
 3208             try:
 3209                 self.driver.soft_delete(instance)
 3210             except NotImplementedError:
 3211                 # Fallback to just powering off the instance if the
 3212                 # hypervisor doesn't implement the soft_delete method
 3213                 self.driver.power_off(instance)
 3214             instance.power_state = self._get_power_state(instance)
 3215             instance.vm_state = vm_states.SOFT_DELETED
 3216             instance.task_state = None
 3217             instance.save(expected_task_state=[task_states.SOFT_DELETING])
 3218 
 3219     @wrap_exception()
 3220     @reverts_task_state
 3221     @wrap_instance_event(prefix='compute')
 3222     @wrap_instance_fault
 3223     def restore_instance(self, context, instance):
 3224         """Restore a soft-deleted instance on this host."""
 3225         self._notify_about_instance_usage(context, instance, "restore.start")
 3226         compute_utils.notify_about_instance_action(context, instance,
 3227             self.host, action=fields.NotificationAction.RESTORE,
 3228             phase=fields.NotificationPhase.START)
 3229         try:
 3230             self.driver.restore(instance)
 3231         except NotImplementedError:
 3232             # Fallback to just powering on the instance if the hypervisor
 3233             # doesn't implement the restore method
 3234             self._power_on(context, instance)
 3235         instance.power_state = self._get_power_state(instance)
 3236         instance.vm_state = vm_states.ACTIVE
 3237         instance.task_state = None
 3238         instance.save(expected_task_state=task_states.RESTORING)
 3239         self._notify_about_instance_usage(context, instance, "restore.end")
 3240         compute_utils.notify_about_instance_action(context, instance,
 3241             self.host, action=fields.NotificationAction.RESTORE,
 3242             phase=fields.NotificationPhase.END)
 3243 
 3244     @staticmethod
 3245     def _set_migration_status(migration, status):
 3246         """Set the status, and guard against a None being passed in.
 3247 
 3248         This is useful as some of the compute RPC calls will not pass
 3249         a migration object in older versions. The check can be removed when
 3250         we move past 4.x major version of the RPC API.
 3251         """
 3252         if migration:
 3253             migration.status = status
 3254             migration.save()
 3255 
    def _rebuild_default_impl(
            self, context, instance, image_meta, injected_files,
            admin_password, allocations, bdms, detach_block_devices,
            attach_block_devices, network_info=None, evacuate=False,
            block_device_info=None, preserve_ephemeral=False,
            accel_uuids=None):
        """Default rebuild implementation used when the virt driver does
        not provide its own rebuild method.

        For evacuate, accelerator (Cyborg ARQ) bindings are resolved and the
        block devices are detached; for an in-place rebuild the guest is
        first powered off and destroyed on this host. In both cases the
        block devices are then re-attached and the guest is spawned again.

        :raises exception.PreserveEphemeralNotSupported: if
            preserve_ephemeral is requested (unsupported here)
        :raises exception.BuildAbortException: if accelerator resources
            cannot be obtained
        """
        if preserve_ephemeral:
            # The default code path does not support preserving ephemeral
            # partitions.
            raise exception.PreserveEphemeralNotSupported()

        accel_info = []
        if evacuate:
            # Only look up bound ARQs if the flavor actually requests an
            # accelerator device profile.
            if instance.flavor.extra_specs.get('accel:device_profile'):
                try:
                    accel_info = self._get_bound_arq_resources(
                        context, instance, accel_uuids or [])
                except (Exception, eventlet.timeout.Timeout) as exc:
                    LOG.exception(exc)
                    self._build_resources_cleanup(instance, network_info)
                    msg = _('Failure getting accelerator resources.')
                    raise exception.BuildAbortException(
                        instance_uuid=instance.uuid, reason=msg)
            detach_block_devices(context, bdms)
        else:
            # In-place rebuild: stop and destroy the existing guest on this
            # host before re-attaching block devices and spawning anew.
            self._power_off_instance(instance, clean_shutdown=True)
            detach_block_devices(context, bdms)
            self.driver.destroy(context, instance,
                                network_info=network_info,
                                block_device_info=block_device_info)
            try:
                accel_info = self._get_accel_info(context, instance)
            except Exception as exc:
                LOG.exception(exc)
                self._build_resources_cleanup(instance, network_info)
                msg = _('Failure getting accelerator resources.')
                raise exception.BuildAbortException(
                    instance_uuid=instance.uuid, reason=msg)

        instance.task_state = task_states.REBUILD_BLOCK_DEVICE_MAPPING
        instance.save(expected_task_state=[task_states.REBUILDING])

        new_block_device_info = attach_block_devices(context, instance, bdms)

        instance.task_state = task_states.REBUILD_SPAWNING
        instance.save(
            expected_task_state=[task_states.REBUILD_BLOCK_DEVICE_MAPPING])

        # Spawn with the migration context applied so any resource claims
        # made for the rebuild are visible to the driver.
        with instance.mutated_migration_context():
            self.driver.spawn(context, instance, image_meta, injected_files,
                              admin_password, allocations,
                              network_info=network_info,
                              block_device_info=new_block_device_info,
                              accel_info=accel_info)
 3310 
 3311     def _notify_instance_rebuild_error(self, context, instance, error, bdms):
 3312         self._notify_about_instance_usage(context, instance,
 3313                                           'rebuild.error', fault=error)
 3314         compute_utils.notify_about_instance_rebuild(
 3315             context, instance, self.host,
 3316             phase=fields.NotificationPhase.ERROR, exception=error, bdms=bdms)
 3317 
    @messaging.expected_exceptions(exception.PreserveEphemeralNotSupported,
                                   exception.BuildAbortException)
    @wrap_exception()
    @reverts_task_state
    @wrap_instance_event(prefix='compute')
    @wrap_instance_fault
    def rebuild_instance(self, context, instance, orig_image_ref, image_ref,
                         injected_files, new_pass, orig_sys_metadata,
                         bdms, recreate, on_shared_storage,
                         preserve_ephemeral, migration,
                         scheduled_node, limits, request_spec,
                         accel_uuids=None):
        """Destroy and re-make this instance.

        A 'rebuild' effectively purges all existing data from the system and
        remakes the VM with given 'metadata' and 'personalities'.

        :param context: `nova.RequestContext` object
        :param instance: Instance object
        :param orig_image_ref: Original image_ref before rebuild
        :param image_ref: New image_ref for rebuild
        :param injected_files: Files to inject
        :param new_pass: password to set on rebuilt instance
        :param orig_sys_metadata: instance system metadata from pre-rebuild
        :param bdms: block-device-mappings to use for rebuild
        :param recreate: True if the instance is being evacuated (e.g. the
            hypervisor it was on failed) - cleanup of old state will be
            skipped.
        :param on_shared_storage: True if instance files on shared storage.
                                  If not provided then information from the
                                  driver will be used to decide if the instance
                                  files are available or not on the target host
        :param preserve_ephemeral: True if the default ephemeral storage
                                   partition must be preserved on rebuild
        :param migration: a Migration object if one was created for this
                          rebuild operation (if it's a part of evacuate)
        :param scheduled_node: A node of the host chosen by the scheduler. If a
                               host was specified by the user, this will be
                               None
        :param limits: Overcommit limits set by the scheduler. If a host was
                       specified by the user, this will be None
        :param request_spec: a RequestSpec object used to schedule the instance
        :param accel_uuids: a list of cyborg ARQ uuids or None if the RPC API
                            is <=5.11

        """
        # recreate=True means the instance is being evacuated from a failed
        # host to a new destination host (this host). The 'recreate' variable
        # name is confusing, so rename it to evacuate here at the top, which
        # is simpler than renaming a parameter in an RPC versioned method.
        evacuate = recreate
        context = context.elevated()

        if evacuate:
            LOG.info("Evacuating instance", instance=instance)
        else:
            LOG.info("Rebuilding instance", instance=instance)

        if evacuate:
            # This is an evacuation to a new host, so we need to perform a
            # resource claim.
            rebuild_claim = self.rt.rebuild_claim
        else:
            # This is a rebuild to the same host, so we don't need to make
            # a claim since the instance is already on this host.
            rebuild_claim = claims.NopClaim

        if image_ref:
            image_meta = objects.ImageMeta.from_image_ref(
                context, self.image_api, image_ref)
        elif evacuate:
            # For evacuate the API does not send down the image_ref since the
            # image does not change so just get it from what was stashed in
            # the instance system_metadata when the instance was created (or
            # last rebuilt). This also works for volume-backed instances.
            image_meta = instance.image_meta
        else:
            # No image_ref and not an evacuation: fall back to an empty
            # ImageMeta object.
            image_meta = objects.ImageMeta()

        # NOTE(mriedem): On an evacuate, we need to update
        # the instance's host and node properties to reflect it's
        # destination node for the evacuate.
        if not scheduled_node:
            if evacuate:
                try:
                    compute_node = self._get_compute_info(context, self.host)
                    scheduled_node = compute_node.hypervisor_hostname
                except exception.ComputeHostNotFound:
                    LOG.exception('Failed to get compute_info for %s',
                                  self.host)
            else:
                scheduled_node = instance.node

        # Existing placement allocations for this instance; passed down so
        # the (rebuild) claim can be made against them.
        allocs = self.reportclient.get_allocations_for_consumer(
                    context, instance.uuid)

        # If the resource claim or group policy validation fails before we
        # do anything to the guest or its networking/volumes we want to keep
        # the current status rather than put the instance into ERROR status.
        instance_state = instance.vm_state
        with self._error_out_instance_on_exception(
                context, instance, instance_state=instance_state):
            try:
                self._do_rebuild_instance_with_claim(
                    context, instance, orig_image_ref,
                    image_meta, injected_files, new_pass, orig_sys_metadata,
                    bdms, evacuate, on_shared_storage, preserve_ephemeral,
                    migration, request_spec, allocs, rebuild_claim,
                    scheduled_node, limits, accel_uuids)
            except (exception.ComputeResourcesUnavailable,
                    exception.RescheduledException) as e:
                if isinstance(e, exception.ComputeResourcesUnavailable):
                    LOG.debug("Could not rebuild instance on this host, not "
                              "enough resources available.", instance=instance)
                else:
                    # RescheduledException is raised by the late server group
                    # policy check during evacuation if a parallel scheduling
                    # violated the policy.
                    # We catch the RescheduledException here but we don't have
                    # the plumbing to do an actual reschedule so we abort the
                    # operation.
                    LOG.debug("Could not rebuild instance on this host, "
                              "late server group check failed.",
                              instance=instance)
                # NOTE(ndipanov): We just abort the build for now and leave a
                # migration record for potential cleanup later
                self._set_migration_status(migration, 'failed')
                # Since the claim failed, we need to remove the allocation
                # created against the destination node. Note that we can only
                # get here when evacuating to a destination node. Rebuilding
                # on the same host (not evacuate) uses the NopClaim which will
                # not raise ComputeResourcesUnavailable.
                self.rt.delete_allocation_for_evacuated_instance(
                    context, instance, scheduled_node, node_type='destination')
                self._notify_instance_rebuild_error(context, instance, e, bdms)
                # Wrap this in InstanceFaultRollback so that the
                # _error_out_instance_on_exception context manager keeps the
                # vm_state unchanged.
                raise exception.InstanceFaultRollback(
                    inner_exception=exception.BuildAbortException(
                        instance_uuid=instance.uuid,
                        reason=e.format_message()))
            except (exception.InstanceNotFound,
                    exception.UnexpectedDeletingTaskStateError) as e:
                # The instance was deleted out from under us; nothing to roll
                # back on the hypervisor side, just mark the migration failed.
                LOG.debug('Instance was deleted while rebuilding',
                          instance=instance)
                self._set_migration_status(migration, 'failed')
                self._notify_instance_rebuild_error(context, instance, e, bdms)
            except Exception as e:
                self._set_migration_status(migration, 'failed')
                if evacuate or scheduled_node is not None:
                    self.rt.delete_allocation_for_evacuated_instance(
                        context, instance, scheduled_node,
                        node_type='destination')
                self._notify_instance_rebuild_error(context, instance, e, bdms)
                raise
            else:
                instance.apply_migration_context()
                # NOTE (ndipanov): This save will now update the host and node
                # attributes making sure that next RT pass is consistent since
                # it will be based on the instance and not the migration DB
                # entry.
                instance.host = self.host
                instance.node = scheduled_node
                instance.save()
                instance.drop_migration_context()

                # NOTE (ndipanov): Mark the migration as done only after we
                # mark the instance as belonging to this host.
                self._set_migration_status(migration, 'done')
 3488 
 3489     def _do_rebuild_instance_with_claim(
 3490             self, context, instance, orig_image_ref, image_meta,
 3491             injected_files, new_pass, orig_sys_metadata, bdms, evacuate,
 3492             on_shared_storage, preserve_ephemeral, migration, request_spec,
 3493             allocations, rebuild_claim, scheduled_node, limits, accel_uuids):
 3494         """Helper to avoid deep nesting in the top-level method."""
 3495 
 3496         provider_mapping = None
 3497         if evacuate:
 3498             provider_mapping = self._get_request_group_mapping(request_spec)
 3499 
 3500             if provider_mapping:
 3501                 compute_utils.\
 3502                     update_pci_request_spec_with_allocated_interface_name(
 3503                         context, self.reportclient, instance, provider_mapping)
 3504 
 3505         claim_context = rebuild_claim(
 3506             context, instance, scheduled_node, allocations,
 3507             limits=limits, image_meta=image_meta, migration=migration)
 3508 
 3509         with claim_context:
 3510             self._do_rebuild_instance(
 3511                 context, instance, orig_image_ref, image_meta, injected_files,
 3512                 new_pass, orig_sys_metadata, bdms, evacuate, on_shared_storage,
 3513                 preserve_ephemeral, migration, request_spec, allocations,
 3514                 provider_mapping, accel_uuids)
 3515 
 3516     @staticmethod
 3517     def _get_image_name(image_meta):
 3518         if image_meta.obj_attr_is_set("name"):
 3519             return image_meta.name
 3520         else:
 3521             return ''
 3522 
    def _do_rebuild_instance(
            self, context, instance, orig_image_ref, image_meta,
            injected_files, new_pass, orig_sys_metadata, bdms, evacuate,
            on_shared_storage, preserve_ephemeral, migration, request_spec,
            allocations, request_group_resource_providers_mapping,
            accel_uuids):
        """Rebuild the instance on this host, optionally as an evacuation.

        Emits rebuild.start/rebuild.end notifications, sets up networking
        (for evacuations), provides a block-device detach callback, and
        delegates the actual rebuild to the virt driver, falling back to
        ``_rebuild_default_impl`` if the driver does not implement
        ``rebuild``.

        :param context: request context
        :param instance: the instance being rebuilt
        :param orig_image_ref: the original image reference; the DB already
            points at the new image, so this is used for the instance.exists
            notification
        :param image_meta: metadata of the image to rebuild to
        :param injected_files: encoded files to inject, decoded below via
            ``_decode_files``
        :param new_pass: admin password for the rebuilt instance
        :param orig_sys_metadata: system metadata captured before the rebuild
        :param bdms: BlockDeviceMappingList, or None to load it from the DB
        :param evacuate: True when rebuilding on a different (target) host
        :param on_shared_storage: whether instance files are on shared
            storage; None means "ask the driver"
        :param preserve_ephemeral: True to keep ephemeral disk contents
        :param migration: Migration object (used for evacuate port binding)
        :param request_spec: RequestSpec; used for the late server group
            policy re-check on evacuate
        :param allocations: placement allocations passed to the driver
        :param request_group_resource_providers_mapping: request-group to
            resource-provider mapping used when updating port bindings
        :param accel_uuids: accelerator request UUIDs passed to the driver
        :raises exception.InstanceEvacuateNotSupported: if the driver does
            not support evacuate
        :raises exception.InvalidSharedStorage: if the caller's
            on_shared_storage claim contradicts what the driver reports
        """
        # Remembered so a previously-stopped instance can be returned to the
        # stopped state after the rebuild completes (see bottom of method).
        orig_vm_state = instance.vm_state

        if evacuate:
            if request_spec:
                # NOTE(gibi): Do a late check of server group policy as
                # parallel scheduling could violate such policy. This will
                # cause the evacuate to fail as rebuild does not implement
                # reschedule.
                hints = self._get_scheduler_hints({}, request_spec)
                self._validate_instance_group_policy(context, instance, hints)

            if not self.driver.capabilities.get("supports_evacuate", False):
                raise exception.InstanceEvacuateNotSupported

            self._check_instance_exists(instance)

            if on_shared_storage is None:
                LOG.debug('on_shared_storage is not provided, using driver '
                          'information to decide if the instance needs to '
                          'be evacuated')
                on_shared_storage = self.driver.instance_on_disk(instance)

            elif (on_shared_storage !=
                    self.driver.instance_on_disk(instance)):
                # To cover case when admin expects that instance files are
                # on shared storage, but not accessible and vice versa
                raise exception.InvalidSharedStorage(
                        _("Invalid state of instance files on shared"
                            " storage"))

            if on_shared_storage:
                LOG.info('disk on shared storage, evacuating using'
                         ' existing disk')
            elif instance.image_ref:
                orig_image_ref = instance.image_ref
                LOG.info("disk not on shared storage, evacuating from "
                         "image: '%s'", str(orig_image_ref))
            else:
                LOG.info('disk on volume, evacuating using existing '
                         'volume')

        # We check trusted certs capabilities for both evacuate (rebuild on
        # another host) and rebuild (rebuild on the same host) because for
        # evacuate we need to make sure an instance with trusted certs can
        # have the image verified with those certs during rebuild, and for
        # rebuild we could be rebuilding a server that started out with no
        # trusted certs on this host, and then was rebuilt with trusted certs
        # for a new image, in which case we need to validate that new image
        # with the trusted certs during the rebuild.
        self._check_trusted_certs(instance)

        # This instance.exists message should contain the original
        # image_ref, not the new one.  Since the DB has been updated
        # to point to the new one... we have to override it.
        orig_image_ref_url = self.image_api.generate_image_url(orig_image_ref,
                                                               context)
        extra_usage_info = {'image_ref_url': orig_image_ref_url}
        compute_utils.notify_usage_exists(
                self.notifier, context, instance, self.host,
                current_period=True, system_metadata=orig_sys_metadata,
                extra_usage_info=extra_usage_info)

        # This message should contain the new image_ref
        extra_usage_info = {'image_name': self._get_image_name(image_meta)}
        self._notify_about_instance_usage(context, instance,
                "rebuild.start", extra_usage_info=extra_usage_info)
        # NOTE: image_name is not included in the versioned notification
        # because we already provide the image_uuid in the notification
        # payload and the image details can be looked up via the uuid.
        compute_utils.notify_about_instance_rebuild(
            context, instance, self.host,
            phase=fields.NotificationPhase.START,
            bdms=bdms)

        instance.power_state = self._get_power_state(instance)
        instance.task_state = task_states.REBUILDING
        instance.save(expected_task_state=[task_states.REBUILDING])

        if evacuate:
            self.network_api.setup_networks_on_host(
                    context, instance, self.host)
            # For nova-network this is needed to move floating IPs
            # For neutron this updates the host in the port binding
            # TODO(cfriesen): this network_api call and the one above
            # are so similar, we should really try to unify them.
            self.network_api.setup_instance_network_on_host(
                context, instance, self.host, migration,
                provider_mappings=request_group_resource_providers_mapping)
            # TODO(mriedem): Consider decorating setup_instance_network_on_host
            # with @api.refresh_cache and then we wouldn't need this explicit
            # call to get_instance_nw_info.
            network_info = self.network_api.get_instance_nw_info(context,
                                                                 instance)
        else:
            network_info = instance.get_network_info()

        if bdms is None:
            bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
                    context, instance.uuid)

        block_device_info = \
            self._get_instance_block_device_info(
                    context, instance, bdms=bdms)

        # Passed to the driver (or default impl) so volumes can be detached
        # lazily at the point the rebuild actually needs them gone.
        def detach_block_devices(context, bdms):
            for bdm in bdms:
                if bdm.is_volume:
                    # NOTE (ildikov): Having the attachment_id set in the BDM
                    # means that it's the new Cinder attach/detach flow
                    # (available from v3.44). In that case we explicitly
                    # attach and detach the volumes through attachment level
                    # operations. In this scenario _detach_volume will delete
                    # the existing attachment which would make the volume
                    # status change to 'available' if we don't pre-create
                    # another empty attachment before deleting the old one.
                    attachment_id = None
                    if bdm.attachment_id:
                        attachment_id = self.volume_api.attachment_create(
                            context, bdm['volume_id'], instance.uuid)['id']
                    self._detach_volume(context, bdm, instance,
                                        destroy_bdm=False)
                    if attachment_id:
                        bdm.attachment_id = attachment_id
                        bdm.save()

        files = self._decode_files(injected_files)

        kwargs = dict(
            context=context,
            instance=instance,
            image_meta=image_meta,
            injected_files=files,
            admin_password=new_pass,
            allocations=allocations,
            bdms=bdms,
            detach_block_devices=detach_block_devices,
            attach_block_devices=self._prep_block_device,
            block_device_info=block_device_info,
            network_info=network_info,
            preserve_ephemeral=preserve_ephemeral,
            evacuate=evacuate,
            accel_uuids=accel_uuids)
        try:
            with instance.mutated_migration_context():
                self.driver.rebuild(**kwargs)
        except NotImplementedError:
            # NOTE(rpodolyaka): driver doesn't provide specialized version
            # of rebuild, fall back to the default implementation
            self._rebuild_default_impl(**kwargs)
        self._update_instance_after_spawn(instance)
        instance.save(expected_task_state=[task_states.REBUILD_SPAWNING])

        if orig_vm_state == vm_states.STOPPED:
            LOG.info("bringing vm to original state: '%s'",
                     orig_vm_state, instance=instance)
            instance.vm_state = vm_states.ACTIVE
            instance.task_state = task_states.POWERING_OFF
            instance.progress = 0
            instance.save()
            self.stop_instance(context, instance, False)
        # TODO(melwitt): We should clean up instance console tokens here in the
        # case of evacuate. The instance is on a new host and will need to
        # establish a new console connection.
        self._update_scheduler_instance_info(context, instance)
        self._notify_about_instance_usage(
                context, instance, "rebuild.end",
                network_info=network_info,
                extra_usage_info=extra_usage_info)
        compute_utils.notify_about_instance_rebuild(
            context, instance, self.host,
            phase=fields.NotificationPhase.END,
            bdms=bdms)
 3701 
 3702     def _handle_bad_volumes_detached(self, context, instance, bad_devices,
 3703                                      block_device_info):
 3704         """Handle cases where the virt-layer had to detach non-working volumes
 3705         in order to complete an operation.
 3706         """
 3707         for bdm in block_device_info['block_device_mapping']:
 3708             if bdm.get('mount_device') in bad_devices:
 3709                 try:
 3710                     volume_id = bdm['connection_info']['data']['volume_id']
 3711                 except KeyError:
 3712                     continue
 3713 
 3714                 # NOTE(sirp): ideally we'd just call
 3715                 # `compute_api.detach_volume` here but since that hits the
 3716                 # DB directly, that's off limits from within the
 3717                 # compute-manager.
 3718                 #
 3719                 # API-detach
 3720                 LOG.info("Detaching from volume api: %s", volume_id)
 3721                 self.volume_api.begin_detaching(context, volume_id)
 3722 
 3723                 # Manager-detach
 3724                 self.detach_volume(context, volume_id, instance)
 3725 
 3726     def _get_accel_info(self, context, instance):
 3727         dp_name = instance.flavor.extra_specs.get('accel:device_profile')
 3728         if dp_name:
 3729             cyclient = cyborg.get_client(context)
 3730             accel_info = cyclient.get_arqs_for_instance(instance.uuid)
 3731         else:
 3732             accel_info = []
 3733         return accel_info
 3734 
 3735     @wrap_exception()
 3736     @reverts_task_state
 3737     @wrap_instance_event(prefix='compute')
 3738     @wrap_instance_fault
 3739     def reboot_instance(self, context, instance, block_device_info,
 3740                         reboot_type):
 3741         @utils.synchronized(instance.uuid)
 3742         def do_reboot_instance(context, instance, block_device_info,
 3743                                reboot_type):
 3744             self._reboot_instance(context, instance, block_device_info,
 3745                                   reboot_type)
 3746         do_reboot_instance(context, instance, block_device_info, reboot_type)
 3747 
    def _reboot_instance(self, context, instance, block_device_info,
                         reboot_type):
        """Reboot an instance on this host.

        Walks the task state through REBOOT_PENDING[_HARD] ->
        REBOOT_STARTED[_HARD] -> None, notifying reboot.start/reboot.end, and
        delegates the actual reboot to the virt driver.  A reboot failure
        that leaves the guest running is logged but not treated as an error
        state.

        :param context: request context; elevated below
        :param instance: the instance to reboot
        :param block_device_info: ignored — re-computed from the instance's
            BDMs below
        :param reboot_type: "SOFT" for a soft reboot; any other value is
            treated as a hard reboot
        """
        # acknowledge the request made it to the manager
        if reboot_type == "SOFT":
            instance.task_state = task_states.REBOOT_PENDING
            expected_states = task_states.soft_reboot_states
        else:
            instance.task_state = task_states.REBOOT_PENDING_HARD
            expected_states = task_states.hard_reboot_states

        context = context.elevated()
        LOG.info("Rebooting instance", instance=instance)

        bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
            context, instance.uuid)
        block_device_info = self._get_instance_block_device_info(
            context, instance, bdms=bdms)

        network_info = self.network_api.get_instance_nw_info(context, instance)

        accel_info = self._get_accel_info(context, instance)

        self._notify_about_instance_usage(context, instance, "reboot.start")
        compute_utils.notify_about_instance_action(
            context, instance, self.host,
            action=fields.NotificationAction.REBOOT,
            phase=fields.NotificationPhase.START,
            bdms=bdms
        )

        instance.power_state = self._get_power_state(instance)
        instance.save(expected_task_state=expected_states)

        if instance.power_state != power_state.RUNNING:
            state = instance.power_state
            running = power_state.RUNNING
            LOG.warning('trying to reboot a non-running instance:'
                        ' (state: %(state)s expected: %(running)s)',
                        {'state': state, 'running': running},
                        instance=instance)

        # Passed to the driver so it can report volumes it had to drop.
        def bad_volumes_callback(bad_devices):
            self._handle_bad_volumes_detached(
                    context, instance, bad_devices, block_device_info)

        try:
            # Don't change it out of rescue mode
            if instance.vm_state == vm_states.RESCUED:
                new_vm_state = vm_states.RESCUED
            else:
                new_vm_state = vm_states.ACTIVE
            # Filled in by the error handler below when the reboot fails;
            # otherwise re-read from the driver after the reboot.
            new_power_state = None
            if reboot_type == "SOFT":
                instance.task_state = task_states.REBOOT_STARTED
                expected_state = task_states.REBOOT_PENDING
            else:
                instance.task_state = task_states.REBOOT_STARTED_HARD
                expected_state = task_states.REBOOT_PENDING_HARD
            instance.save(expected_task_state=expected_state)
            self.driver.reboot(context, instance,
                               network_info,
                               reboot_type,
                               block_device_info=block_device_info,
                               accel_info=accel_info,
                               bad_volumes_callback=bad_volumes_callback)

        except Exception as error:
            with excutils.save_and_reraise_exception() as ctxt:
                exc_info = sys.exc_info()
                # if the reboot failed but the VM is running don't
                # put it into an error state
                new_power_state = self._get_power_state(instance)
                if new_power_state == power_state.RUNNING:
                    LOG.warning('Reboot failed but instance is running',
                                instance=instance)
                    compute_utils.add_instance_fault_from_exc(context,
                            instance, error, exc_info)
                    self._notify_about_instance_usage(context, instance,
                            'reboot.error', fault=error)
                    compute_utils.notify_about_instance_action(
                        context, instance, self.host,
                        action=fields.NotificationAction.REBOOT,
                        phase=fields.NotificationPhase.ERROR,
                        exception=error, bdms=bdms
                    )
                    ctxt.reraise = False
                else:
                    LOG.error('Cannot reboot instance: %s', error,
                              instance=instance)
                    self._set_instance_obj_error_state(instance)

        if not new_power_state:
            new_power_state = self._get_power_state(instance)
        try:
            instance.power_state = new_power_state
            instance.vm_state = new_vm_state
            instance.task_state = None
            instance.save()
        except exception.InstanceNotFound:
            LOG.warning("Instance disappeared during reboot",
                        instance=instance)

        self._notify_about_instance_usage(context, instance, "reboot.end")
        compute_utils.notify_about_instance_action(
            context, instance, self.host,
            action=fields.NotificationAction.REBOOT,
            phase=fields.NotificationPhase.END,
            bdms=bdms
        )
 3858 
    @delete_image_on_error
    def _do_snapshot_instance(self, context, image_id, instance):
        # Snapshot for the backup flow: expect the IMAGE_BACKUP task state.
        # The @delete_image_on_error decorator (defined elsewhere in this
        # module) presumably cleans up the image if the snapshot raises.
        self._snapshot_instance(context, image_id, instance,
                                task_states.IMAGE_BACKUP)
 3863 
    @wrap_exception()
    @reverts_task_state
    @wrap_instance_event(prefix='compute')
    @wrap_instance_fault
    def backup_instance(self, context, image_id, instance, backup_type,
                        rotation):
        """Backup an instance on this host.

        Snapshots the instance into the given image, then deletes any
        backups beyond the rotation count.

        :param context: security context
        :param image_id: id of the glance image the backup is written to
        :param instance: the instance to back up
        :param backup_type: daily | weekly
        :param rotation: int representing how many backups to keep around
        """
        self._do_snapshot_instance(context, image_id, instance)
        self._rotate_backups(context, instance, backup_type, rotation)
 3877 
 3878     @wrap_exception()
 3879     @reverts_task_state
 3880     @wrap_instance_event(prefix='compute')
 3881     @wrap_instance_fault
 3882     @delete_image_on_error
 3883     def snapshot_instance(self, context, image_id, instance):
 3884         """Snapshot an instance on this host.
 3885 
 3886         :param context: security context
 3887         :param image_id: glance.db.sqlalchemy.models.Image.Id
 3888         :param instance: a nova.objects.instance.Instance object
 3889         """
 3890         # NOTE(dave-mcnally) the task state will already be set by the api
 3891         # but if the compute manager has crashed/been restarted prior to the
 3892         # request getting here the task state may have been cleared so we set
 3893         # it again and things continue normally
 3894         try:
 3895             instance.task_state = task_states.IMAGE_SNAPSHOT
 3896             instance.save(
 3897                         expected_task_state=task_states.IMAGE_SNAPSHOT_PENDING)
 3898         except exception.InstanceNotFound:
 3899             # possibility instance no longer exists, no point in continuing
 3900             LOG.debug("Instance not found, could not set state %s "
 3901                       "for instance.",
 3902                       task_states.IMAGE_SNAPSHOT, instance=instance)
 3903             return
 3904 
 3905         except exception.UnexpectedDeletingTaskStateError:
 3906             LOG.debug("Instance being deleted, snapshot cannot continue",
 3907                       instance=instance)
 3908             return
 3909 
 3910         with self._snapshot_semaphore:
 3911             self._snapshot_instance(context, image_id, instance,
 3912                                     task_states.IMAGE_SNAPSHOT)
 3913 
    def _snapshot_instance(self, context, image_id, instance,
                           expected_task_state):
        """Snapshot the instance via the virt driver into a glance image.

        Emits snapshot.start/snapshot.end notifications around the driver
        call.  If the instance disappears mid-snapshot, the partially
        uploaded image is deleted on a best-effort basis.

        :param context: security context; elevated below
        :param image_id: id of the glance image to upload the snapshot to
        :param instance: the instance to snapshot
        :param expected_task_state: the task state the driver's first
            ``update_task_state`` call should expect (e.g. IMAGE_SNAPSHOT
            for snapshots, IMAGE_BACKUP for backups)
        """
        context = context.elevated()

        # Record the current power state so the DB reflects reality before
        # we start the (potentially long) snapshot.
        instance.power_state = self._get_power_state(instance)
        try:
            instance.save()

            LOG.info('instance snapshotting', instance=instance)

            if instance.power_state != power_state.RUNNING:
                state = instance.power_state
                running = power_state.RUNNING
                LOG.warning('trying to snapshot a non-running instance: '
                            '(state: %(state)s expected: %(running)s)',
                            {'state': state, 'running': running},
                            instance=instance)

            self._notify_about_instance_usage(
                context, instance, "snapshot.start")
            compute_utils.notify_about_instance_snapshot(context, instance,
                self.host, phase=fields.NotificationPhase.START,
                snapshot_image_id=image_id)

            # Callback the driver uses to advance the instance's task state
            # as the snapshot progresses.
            def update_task_state(task_state,
                                  expected_state=expected_task_state):
                instance.task_state = task_state
                instance.save(expected_task_state=expected_state)

            with timeutils.StopWatch() as timer:
                self.driver.snapshot(context, instance, image_id,
                                     update_task_state)
            LOG.info('Took %0.2f seconds to snapshot the instance on '
                     'the hypervisor.', timer.elapsed(), instance=instance)

            instance.task_state = None
            instance.save(expected_task_state=task_states.IMAGE_UPLOADING)

            self._notify_about_instance_usage(context, instance,
                                              "snapshot.end")
            compute_utils.notify_about_instance_snapshot(context, instance,
                self.host, phase=fields.NotificationPhase.END,
                snapshot_image_id=image_id)
        except (exception.InstanceNotFound,
                exception.InstanceNotRunning,
                exception.UnexpectedDeletingTaskStateError):
            # the instance got deleted during the snapshot
            # Quickly bail out of here
            msg = 'Instance disappeared during snapshot'
            LOG.debug(msg, instance=instance)
            try:
                image = self.image_api.get(context, image_id)
                if image['status'] != 'active':
                    self.image_api.delete(context, image_id)
            except exception.ImageNotFound:
                LOG.debug('Image not found during clean up %s', image_id)
            except Exception:
                LOG.warning("Error while trying to clean up image %s",
                            image_id, instance=instance)
        except exception.ImageNotFound:
            instance.task_state = None
            instance.save()
            LOG.warning("Image not found during snapshot", instance=instance)
 3977 
    def _post_interrupted_snapshot_cleanup(self, context, instance):
        """Let the virt driver clean up after an interrupted snapshot."""
        self.driver.post_interrupted_snapshot_cleanup(context, instance)
 3980 
 3981     @messaging.expected_exceptions(NotImplementedError)
 3982     @wrap_exception()
 3983     def volume_snapshot_create(self, context, instance, volume_id,
 3984                                create_info):
 3985         try:
 3986             self.driver.volume_snapshot_create(context, instance, volume_id,
 3987                                                create_info)
 3988         except exception.InstanceNotRunning:
 3989             # Libvirt driver can raise this exception
 3990             LOG.debug('Instance disappeared during volume snapshot create',
 3991                       instance=instance)
 3992 
 3993     @messaging.expected_exceptions(NotImplementedError)
 3994     @wrap_exception()
 3995     def volume_snapshot_delete(self, context, instance, volume_id,
 3996                                snapshot_id, delete_info):
 3997         try:
 3998             self.driver.volume_snapshot_delete(context, instance, volume_id,
 3999                                                snapshot_id, delete_info)
 4000         except exception.InstanceNotRunning:
 4001             # Libvirt driver can raise this exception
 4002             LOG.debug('Instance disappeared during volume snapshot delete',
 4003                       instance=instance)
 4004 
 4005     @wrap_instance_fault
 4006     def _rotate_backups(self, context, instance, backup_type, rotation):
 4007         """Delete excess backups associated to an instance.
 4008 
 4009         Instances are allowed a fixed number of backups (the rotation number);
 4010         this method deletes the oldest backups that exceed the rotation
 4011         threshold.
 4012 
 4013         :param context: security context
 4014         :param instance: Instance dict
 4015         :param backup_type: a user-defined type, like "daily" or "weekly" etc.
 4016         :param rotation: int representing how many backups to keep around;
 4017             None if rotation shouldn't be used (as in the case of snapshots)
 4018         """
 4019         filters = {'property-image_type': 'backup',
 4020                    'property-backup_type': backup_type,
 4021                    'property-instance_uuid': instance.uuid}
 4022 
 4023         images = self.image_api.get_all(context, filters=filters,
 4024                                         sort_key='created_at', sort_dir='desc')
 4025         num_images = len(images)
 4026         LOG.debug("Found %(num_images)d images (rotation: %(rotation)d)",
 4027                   {'num_images': num_images, 'rotation': rotation},
 4028                   instance=instance)
 4029 
 4030         if num_images > rotation:
 4031             # NOTE(sirp): this deletes all backups that exceed the rotation
 4032             # limit
 4033             excess = len(images) - rotation
 4034             LOG.debug("Rotating out %d backups", excess,
 4035                       instance=instance)
 4036             for i in range(excess):
 4037                 image = images.pop()
 4038                 image_id = image['id']
 4039                 LOG.debug("Deleting image %s", image_id,
 4040                           instance=instance)
 4041                 try:
 4042                     self.image_api.delete(context, image_id)
 4043                 except exception.ImageNotFound:
 4044                     LOG.info("Failed to find image %(image_id)s to "
 4045                              "delete", {'image_id': image_id},
 4046                              instance=instance)
 4047                 except (exception.ImageDeleteConflict, Exception) as exc:
 4048                     LOG.info("Failed to delete image %(image_id)s during "
 4049                              "deleting excess backups. "
 4050                              "Continuing for next image.. %(exc)s",
 4051                              {'image_id': image_id, 'exc': exc},
 4052                              instance=instance)
 4053 
    @wrap_exception()
    @reverts_task_state
    @wrap_instance_event(prefix='compute')
    @wrap_instance_fault
    def set_admin_password(self, context, instance, new_pass):
        """Set the root/admin password for an instance on this host.

        This is generally only called by API password resets after an
        image has been built.

        @param context: Nova auth context.
        @param instance: Nova instance object.
        @param new_pass: The admin password for the instance.
        """

        context = context.elevated()
        current_power_state = self._get_power_state(instance)
        expected_state = power_state.RUNNING

        # The password can only be set while the guest is running; clear the
        # task state and fail fast otherwise.
        if current_power_state != expected_state:
            instance.task_state = None
            instance.save(expected_task_state=task_states.UPDATING_PASSWORD)
            _msg = _('instance %s is not running') % instance.uuid
            raise exception.InstancePasswordSetFailed(
                instance=instance.uuid, reason=_msg)

        try:
            self.driver.set_admin_password(instance, new_pass)
            LOG.info("Admin password set", instance=instance)
            instance.task_state = None
            instance.save(
                expected_task_state=task_states.UPDATING_PASSWORD)
        except exception.InstanceAgentNotEnabled:
            # Re-raise after clearing the task state so the caller sees the
            # specific failure but the instance isn't stuck in
            # UPDATING_PASSWORD.
            with excutils.save_and_reraise_exception():
                LOG.debug('Guest agent is not enabled for the instance.',
                          instance=instance)
                instance.task_state = None
                instance.save(
                    expected_task_state=task_states.UPDATING_PASSWORD)
        except exception.SetAdminPasswdNotSupported:
            with excutils.save_and_reraise_exception():
                LOG.info('set_admin_password is not supported '
                         'by this driver or guest instance.',
                         instance=instance)
                instance.task_state = None
                instance.save(
                    expected_task_state=task_states.UPDATING_PASSWORD)
        except NotImplementedError:
            LOG.warning('set_admin_password is not implemented '
                        'by this driver or guest instance.',
                        instance=instance)
            instance.task_state = None
            instance.save(
                expected_task_state=task_states.UPDATING_PASSWORD)
            raise NotImplementedError(_('set_admin_password is not '
                                        'implemented by this driver or guest '
                                        'instance.'))
        except exception.UnexpectedTaskStateError:
            # interrupted by another (most likely delete) task
            # do not retry
            raise
        except Exception:
            # Catch all here because this could be anything.
            LOG.exception('set_admin_password failed', instance=instance)
            # We create a new exception here so that we won't
            # potentially reveal password information to the
            # API caller.  The real exception is logged above
            _msg = _('error setting admin password')
            raise exception.InstancePasswordSetFailed(
                instance=instance.uuid, reason=_msg)
 4124 
 4125     def _get_rescue_image(self, context, instance, rescue_image_ref=None):
 4126         """Determine what image should be used to boot the rescue VM."""
 4127         # 1. If rescue_image_ref is passed in, use that for rescue.
 4128         # 2. Else, use the base image associated with instance's current image.
 4129         #       The idea here is to provide the customer with a rescue
 4130         #       environment which they are familiar with.
 4131         #       So, if they built their instance off of a Debian image,
 4132         #       their rescue VM will also be Debian.
 4133         # 3. As a last resort, use instance's current image.
 4134         if not rescue_image_ref:
 4135             system_meta = utils.instance_sys_meta(instance)
 4136             rescue_image_ref = system_meta.get('image_base_image_ref')
 4137 
 4138         if not rescue_image_ref:
 4139             LOG.warning('Unable to find a different image to use for '
 4140                         'rescue VM, using instance\'s current image',
 4141                         instance=instance)
 4142             rescue_image_ref = instance.image_ref
 4143 
 4144         return objects.ImageMeta.from_image_ref(
 4145             context, self.image_api, rescue_image_ref)
 4146 
    @wrap_exception()
    @reverts_task_state
    @wrap_instance_event(prefix='compute')
    @wrap_instance_fault
    def rescue_instance(self, context, instance, rescue_password,
                        rescue_image_ref, clean_shutdown):
        """Power off the instance and boot it into a rescue environment.

        :param context: nova auth request context; elevated below
        :param instance: Instance object to rescue
        :param rescue_password: admin password for the rescue VM; a password
            is generated if this is falsy
        :param rescue_image_ref: image ref to boot the rescue VM from; if
            falsy, _get_rescue_image picks a suitable image
        :param clean_shutdown: whether to attempt a graceful guest shutdown
            when powering off before rescue
        :raises: InstanceNotRescuable if the driver fails to rescue
        """
        context = context.elevated()
        LOG.info('Rescuing', instance=instance)

        # Use the caller-supplied password, or generate one if none given.
        admin_password = (rescue_password if rescue_password else
                      utils.generate_password())

        network_info = self.network_api.get_instance_nw_info(context, instance)

        rescue_image_meta = self._get_rescue_image(context, instance,
                                                   rescue_image_ref)

        bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
                                              context, instance.uuid)
        block_device_info = self._get_instance_block_device_info(
                                context, instance, bdms=bdms)

        # Send the "start" notification on both the legacy and the
        # versioned notification paths.
        extra_usage_info = {'rescue_image_name':
                            self._get_image_name(rescue_image_meta)}
        self._notify_about_instance_usage(context, instance,
                "rescue.start", extra_usage_info=extra_usage_info,
                network_info=network_info)
        compute_utils.notify_about_instance_rescue_action(
            context, instance, self.host, rescue_image_ref,
            phase=fields.NotificationPhase.START)

        try:
            # Power off the guest, then let the driver boot the rescue image.
            self._power_off_instance(instance, clean_shutdown)

            self.driver.rescue(context, instance, network_info,
                               rescue_image_meta, admin_password,
                               block_device_info)
        except Exception as e:
            LOG.exception("Error trying to Rescue Instance",
                          instance=instance)
            # Put the instance in ERROR and wrap the driver error in
            # InstanceNotRescuable for the caller.
            self._set_instance_obj_error_state(instance)
            raise exception.InstanceNotRescuable(
                instance_id=instance.uuid,
                reason=_("Driver Error: %s") % e)

        compute_utils.notify_usage_exists(self.notifier, context, instance,
                                          self.host, current_period=True)

        # Record the new state; save() asserts we are still mid-rescue.
        instance.vm_state = vm_states.RESCUED
        instance.task_state = None
        instance.power_state = self._get_power_state(instance)
        instance.launched_at = timeutils.utcnow()
        instance.save(expected_task_state=task_states.RESCUING)

        self._notify_about_instance_usage(context, instance,
                "rescue.end", extra_usage_info=extra_usage_info,
                network_info=network_info)
        compute_utils.notify_about_instance_rescue_action(
            context, instance, self.host, rescue_image_ref,
            phase=fields.NotificationPhase.END)
 4207 
 4208     @wrap_exception()
 4209     @reverts_task_state
 4210     @wrap_instance_event(prefix='compute')
 4211     @wrap_instance_fault
 4212     def unrescue_instance(self, context, instance):
 4213         orig_context = context
 4214         context = context.elevated()
 4215         LOG.info('Unrescuing', instance=instance)
 4216 
 4217         network_info = self.network_api.get_instance_nw_info(context, instance)
 4218         self._notify_about_instance_usage(context, instance,
 4219                 "unrescue.start", network_info=network_info)
 4220         compute_utils.notify_about_instance_action(context, instance,
 4221             self.host, action=fields.NotificationAction.UNRESCUE,
 4222             phase=fields.NotificationPhase.START)
 4223 
 4224         with self._error_out_instance_on_exception(context, instance):
 4225             self.driver.unrescue(orig_context, instance)
 4226 
 4227         instance.vm_state = vm_states.ACTIVE
 4228         instance.task_state = None
 4229         instance.power_state = self._get_power_state(instance)
 4230         instance.save(expected_task_state=task_states.UNRESCUING)
 4231 
 4232         self._notify_about_instance_usage(context,
 4233                                           instance,
 4234                                           "unrescue.end",
 4235                                           network_info=network_info)
 4236         compute_utils.notify_about_instance_action(context, instance,
 4237             self.host, action=fields.NotificationAction.UNRESCUE,
 4238             phase=fields.NotificationPhase.END)
 4239 
 4240     @wrap_exception()
 4241     @wrap_instance_fault
 4242     def change_instance_metadata(self, context, diff, instance):
 4243         """Update the metadata published to the instance."""
 4244         LOG.debug("Changing instance metadata according to %r",
 4245                   diff, instance=instance)
 4246         self.driver.change_instance_metadata(context, instance, diff)
 4247 
    @wrap_exception()
    @wrap_instance_event(prefix='compute')
    @errors_out_migration
    @wrap_instance_fault
    def confirm_resize(self, context, instance, migration):
        """Confirms a migration/resize and deletes the 'old' instance.

        This is called from the API and runs on the source host.

        Nothing needs to happen on the destination host at this point since
        the instance is already running there. This routine just cleans up the
        source host.

        :param context: nova auth request context
        :param instance: Instance object being confirmed; re-fetched from the
            DB below, so a stale copy from the API is tolerated
        :param migration: Migration object for the resize; only the
            'finished' and 'confirming' statuses are processed
        """
        # Serialize with other operations locked on this instance uuid.
        @utils.synchronized(instance.uuid)
        def do_confirm_resize(context, instance, migration):
            LOG.debug("Going to confirm migration %s", migration.id,
                      instance=instance)

            # Idempotency: a repeated confirm request is a no-op.
            if migration.status == 'confirmed':
                LOG.info("Migration %s is already confirmed",
                         migration.id, instance=instance)
                return

            if migration.status not in ('finished', 'confirming'):
                LOG.warning("Unexpected confirmation status '%(status)s' "
                            "of migration %(id)s, exit confirmation process",
                            {"status": migration.status, "id": migration.id},
                            instance=instance)
                return

            # NOTE(wangpan): Get the instance from db, if it has been
            #                deleted, we do nothing and return here
            expected_attrs = ['metadata', 'system_metadata', 'flavor']
            try:
                instance = objects.Instance.get_by_uuid(
                        context, instance.uuid,
                        expected_attrs=expected_attrs)
            except exception.InstanceNotFound:
                LOG.info("Instance is not found during confirmation",
                         instance=instance)
                return

            with self._error_out_instance_on_exception(context, instance):
                try:
                    self._confirm_resize(
                        context, instance, migration=migration)
                except Exception:
                    # Something failed when cleaning up the source host so
                    # log a traceback and leave a hint about hard rebooting
                    # the server to correct its state in the DB.
                    with excutils.save_and_reraise_exception(logger=LOG):
                        LOG.exception(
                            'Confirm resize failed on source host %s. '
                            'Resource allocations in the placement service '
                            'will be removed regardless because the instance '
                            'is now on the destination host %s. You can try '
                            'hard rebooting the instance to correct its '
                            'state.', self.host, migration.dest_compute,
                            instance=instance)
                finally:
                    # Whether an error occurred or not, at this point the
                    # instance is on the dest host. Avoid leaking allocations
                    # in placement by deleting them here...
                    self._delete_allocation_after_move(
                        context, instance, migration)
                    # ...inform the scheduler about the move...
                    self._delete_scheduler_instance_info(
                        context, instance.uuid)
                    # ...and unset the cached flavor information (this is done
                    # last since the resource tracker relies on it for its
                    # periodic tasks)
                    self._delete_stashed_flavor_info(instance)

        do_confirm_resize(context, instance, migration)
 4322 
 4323     def _get_updated_nw_info_with_pci_mapping(self, nw_info, pci_mapping):
 4324         # NOTE(adrianc): This method returns a copy of nw_info if modifications
 4325         # are made else it returns the original nw_info.
 4326         updated_nw_info = nw_info
 4327         if nw_info and pci_mapping:
 4328             updated_nw_info = copy.deepcopy(nw_info)
 4329             for vif in updated_nw_info:
 4330                 if vif['vnic_type'] in network_model.VNIC_TYPES_SRIOV:
 4331                     try:
 4332                         vif_pci_addr = vif['profile']['pci_slot']
 4333                         new_addr = pci_mapping[vif_pci_addr].address
 4334                         vif['profile']['pci_slot'] = new_addr
 4335                         LOG.debug("Updating VIF's PCI address for VIF %(id)s. "
 4336                                   "Original value %(orig_val)s, "
 4337                                   "new value %(new_val)s",
 4338                                   {'id': vif['id'],
 4339                                    'orig_val': vif_pci_addr,
 4340                                    'new_val': new_addr})
 4341                     except (KeyError, AttributeError):
 4342                         with excutils.save_and_reraise_exception():
 4343                             # NOTE(adrianc): This should never happen. If we
 4344                             # get here it means there is some inconsistency
 4345                             # with either 'nw_info' or 'pci_mapping'.
 4346                             LOG.error("Unexpected error when updating network "
 4347                                       "information with PCI mapping.")
 4348         return updated_nw_info
 4349 
    def _confirm_resize(self, context, instance, migration=None):
        """Destroys the source instance.

        Tears down networking on the source host, unplugs the guest from the
        source hypervisor, drops the source-host move claim and moves the
        instance out of the RESIZED vm_state.

        :param context: nova auth request context (elevated by the caller)
        :param instance: Instance object being confirmed
        :param migration: Migration object for the resize; despite the
            default this is dereferenced below so it is effectively required
        """
        self._notify_about_instance_usage(context, instance,
                                          "resize.confirm.start")
        compute_utils.notify_about_instance_action(context, instance,
            self.host, action=fields.NotificationAction.RESIZE_CONFIRM,
            phase=fields.NotificationPhase.START)

        # NOTE(tr3buchet): tear down networks on source host
        self.network_api.setup_networks_on_host(context, instance,
                           migration.source_compute, teardown=True)

        # TODO(stephenfin): These next three calls should be bundled
        network_info = self.network_api.get_instance_nw_info(context,
                                                             instance)

        # NOTE(adrianc): Populate old PCI device in VIF profile
        # to allow virt driver to properly unplug it from Hypervisor.
        pci_mapping = (instance.migration_context.
                       get_pci_mapping_for_migration(True))
        network_info = self._get_updated_nw_info_with_pci_mapping(
            network_info, pci_mapping)

        self.driver.confirm_migration(context, migration, instance,
                                      network_info)

        # Free up the old_flavor usage from the resource tracker for this host.
        self.rt.drop_move_claim_at_source(context, instance, migration)

        # NOTE(mriedem): The old_vm_state could be STOPPED but the user
        # might have manually powered up the instance to confirm the
        # resize/migrate, so we need to check the current power state
        # on the instance and set the vm_state appropriately. We default
        # to ACTIVE because if the power state is not SHUTDOWN, we
        # assume _sync_instance_power_state will clean it up.
        p_state = instance.power_state
        vm_state = None
        if p_state == power_state.SHUTDOWN:
            vm_state = vm_states.STOPPED
            LOG.debug("Resized/migrated instance is powered off. "
                      "Setting vm_state to '%s'.", vm_state,
                      instance=instance)
        else:
            vm_state = vm_states.ACTIVE

        instance.vm_state = vm_state
        instance.task_state = None
        instance.save(expected_task_state=[None, task_states.DELETING,
                                           task_states.SOFT_DELETING])

        self._notify_about_instance_usage(
            context, instance, "resize.confirm.end",
            network_info=network_info)
        compute_utils.notify_about_instance_action(context, instance,
               self.host, action=fields.NotificationAction.RESIZE_CONFIRM,
               phase=fields.NotificationPhase.END)
 4406 
 4407     def _delete_allocation_after_move(self, context, instance, migration):
 4408         """Deletes resource allocations held by the migration record against
 4409         the source compute node resource provider after a confirmed cold /
 4410         successful live migration.
 4411         """
 4412         try:
 4413             # NOTE(danms): We're finishing on the source node, so try
 4414             # to delete the allocation based on the migration uuid
 4415             self.reportclient.delete_allocation_for_instance(
 4416                 context, migration.uuid, consumer_type='migration')
 4417         except exception.AllocationDeleteFailed:
 4418             LOG.error('Deleting allocation in placement for migration '
 4419                       '%(migration_uuid)s failed. The instance '
 4420                       '%(instance_uuid)s will be put to ERROR state '
 4421                       'but the allocation held by the migration is '
 4422                       'leaked.',
 4423                       {'instance_uuid': instance.uuid,
 4424                        'migration_uuid': migration.uuid})
 4425             raise
 4426 
 4427     def _delete_stashed_flavor_info(self, instance):
 4428         """Remove information about the flavor change after a resize."""
 4429         instance.old_flavor = None
 4430         instance.new_flavor = None
 4431         instance.system_metadata.pop('old_vm_state', None)
 4432         instance.save()
 4433 
    @wrap_exception()
    @wrap_instance_event(prefix='compute')
    @errors_out_migration
    @wrap_instance_fault
    def confirm_snapshot_based_resize_at_source(
            self, ctxt, instance, migration):
        """Confirms a snapshot-based resize on the source host.

        Cleans the guest from the source hypervisor including disks and drops
        the MoveClaim which will free up "old_flavor" usage from the
        ResourceTracker.

        Deletes the allocations held by the migration consumer against the
        source compute node resource provider.

        :param ctxt: nova auth request context targeted at the source cell
        :param instance: Instance object being resized which should have the
            "old_flavor" attribute set
        :param migration: Migration object for the resize operation
        """

        # Serialize with other operations locked on this instance uuid.
        @utils.synchronized(instance.uuid)
        def do_confirm():
            LOG.info('Confirming resize on source host.', instance=instance)
            with self._error_out_instance_on_exception(ctxt, instance):
                # TODO(mriedem): Could probably make this try/except/finally
                # a context manager to share with confirm_resize().
                try:
                    self._confirm_snapshot_based_resize_at_source(
                        ctxt, instance, migration)
                except Exception:
                    # Something failed when cleaning up the source host so
                    # log a traceback and leave a hint about hard rebooting
                    # the server to correct its state in the DB.
                    with excutils.save_and_reraise_exception(logger=LOG):
                        LOG.exception(
                            'Confirm resize failed on source host %s. '
                            'Resource allocations in the placement service '
                            'will be removed regardless because the instance '
                            'is now on the destination host %s. You can try '
                            'hard rebooting the instance to correct its '
                            'state.', self.host, migration.dest_compute,
                            instance=instance)
                finally:
                    # Whether an error occurred or not, at this point the
                    # instance is on the dest host so to avoid leaking
                    # allocations in placement, delete them here.
                    # TODO(mriedem): Should we catch and just log
                    # AllocationDeleteFailed? What is the user's recourse if
                    # we got this far but this fails? At this point the
                    # instance is on the target host and the allocations
                    # could just be manually cleaned up by the operator.
                    self._delete_allocation_after_move(ctxt, instance,
                                                       migration)
        do_confirm()
 4489 
 4490     def _confirm_snapshot_based_resize_at_source(
 4491             self, ctxt, instance, migration):
 4492         """Private version of confirm_snapshot_based_resize_at_source
 4493 
 4494         This allows the main method to be decorated with error handlers.
 4495 
 4496         :param ctxt: nova auth request context targeted at the source cell
 4497         :param instance: Instance object being resized which should have the
 4498             "old_flavor" attribute set
 4499         :param migration: Migration object for the resize operation
 4500         """
 4501         # Cleanup the guest from the hypervisor including local disks.
 4502         network_info = self.network_api.get_instance_nw_info(ctxt, instance)
 4503         LOG.debug('Cleaning up guest from source hypervisor including disks.',
 4504                   instance=instance)
 4505 
 4506         # FIXME(mriedem): Per bug 1809095, _confirm_resize calls
 4507         # _get_updated_nw_info_with_pci_mapping here prior to unplugging
 4508         # VIFs on the source, but in our case we have already unplugged
 4509         # VIFs during prep_snapshot_based_resize_at_source, so what do we
 4510         # need to do about those kinds of ports? Do we need to wait to unplug
 4511         # VIFs until confirm like normal resize?
 4512 
 4513         # Note that prep_snapshot_based_resize_at_source already destroyed the
 4514         # guest which disconnected volumes and unplugged VIFs but did not
 4515         # destroy disks in case something failed during the resize and the
 4516         # instance needed to be rebooted or rebuilt on the source host. Now
 4517         # that we are confirming the resize we want to cleanup the disks left
 4518         # on the source host. We call cleanup() instead of destroy() to avoid
 4519         # any InstanceNotFound confusion from the driver since the guest was
 4520         # already destroyed on this host. block_device_info=None and
 4521         # destroy_vifs=False means cleanup() will not try to disconnect volumes
 4522         # or unplug VIFs.
 4523         self.driver.cleanup(
 4524             ctxt, instance, network_info, block_device_info=None,
 4525             destroy_disks=True, destroy_vifs=False)
 4526 
 4527         # Delete port bindings for the source host.
 4528         self._confirm_snapshot_based_resize_delete_port_bindings(
 4529             ctxt, instance)
 4530 
 4531         # Delete volume attachments for the source host.
 4532         self._delete_volume_attachments(ctxt, instance.get_bdms())
 4533 
 4534         # Free up the old_flavor usage from the resource tracker for this host.
 4535         self.rt.drop_move_claim_at_source(ctxt, instance, migration)
 4536 
 4537     def _confirm_snapshot_based_resize_delete_port_bindings(
 4538             self, ctxt, instance):
 4539         """Delete port bindings for the source host when confirming
 4540         snapshot-based resize on the source host."
 4541 
 4542         :param ctxt: nova auth RequestContext
 4543         :param instance: Instance object that was resized/cold migrated
 4544         """
 4545         LOG.debug('Deleting port bindings for source host.',
 4546                   instance=instance)
 4547         try:
 4548             self.network_api.cleanup_instance_network_on_host(
 4549                 ctxt, instance, self.host)
 4550         except exception.PortBindingDeletionFailed as e:
 4551             # Do not let this stop us from cleaning up since the guest
 4552             # is already gone.
 4553             LOG.error('Failed to delete port bindings from source host. '
 4554                       'Error: %s', six.text_type(e), instance=instance)
 4555 
 4556     def _delete_volume_attachments(self, ctxt, bdms):
 4557         """Deletes volume attachment records for the given bdms.
 4558 
 4559         This method will log but not re-raise any exceptions if the volume
 4560         attachment delete fails.
 4561 
 4562         :param ctxt: nova auth request context used to make
 4563             DELETE /attachments/{attachment_id} requests to cinder.
 4564         :param bdms: objects.BlockDeviceMappingList representing volume
 4565             attachments to delete based on BlockDeviceMapping.attachment_id.
 4566         """
 4567         for bdm in bdms:
 4568             if bdm.attachment_id:
 4569                 try:
 4570                     self.volume_api.attachment_delete(ctxt, bdm.attachment_id)
 4571                 except Exception as e:
 4572                     LOG.error('Failed to delete volume attachment with ID %s. '
 4573                               'Error: %s', bdm.attachment_id, six.text_type(e),
 4574                               instance_uuid=bdm.instance_uuid)
 4575 
    @wrap_exception()
    @reverts_task_state
    @wrap_instance_event(prefix='compute')
    @errors_out_migration
    @wrap_instance_fault
    def revert_snapshot_based_resize_at_dest(self, ctxt, instance, migration):
        """Reverts a snapshot-based resize at the destination host.

        Cleans the guest from the destination compute service host hypervisor
        and related resources (ports, volumes) and frees resource usage from
        the compute service on that host.

        Errors raised while cleaning up error-out the instance via
        _error_out_instance_on_exception; the post-processing (scheduler
        update and event cleanup) is best effort and only logged on failure.

        :param ctxt: nova auth request context targeted at the target cell
        :param instance: Instance object whose vm_state is "resized" and
            task_state is "resize_reverting".
        :param migration: Migration object whose status is "reverting".
        """
        # A resize revert is essentially a resize back to the old size, so we
        # need to send a usage event here.
        compute_utils.notify_usage_exists(
            self.notifier, ctxt, instance, self.host, current_period=True)

        # Serialize with other operations locked on this instance uuid.
        @utils.synchronized(instance.uuid)
        def do_revert():
            LOG.info('Reverting resize on destination host.',
                     instance=instance)
            with self._error_out_instance_on_exception(ctxt, instance):
                self._revert_snapshot_based_resize_at_dest(
                    ctxt, instance, migration)
        do_revert()

        # Broadcast to all schedulers that the instance is no longer on
        # this host and clear any waiting callback events. This is best effort
        # so if anything fails just log it.
        try:
            self._delete_scheduler_instance_info(ctxt, instance.uuid)
            self.instance_events.clear_events_for_instance(instance)
        except Exception as e:
            LOG.warning('revert_snapshot_based_resize_at_dest failed during '
                        'post-processing. Error: %s', e, instance=instance)
 4616 
    def _revert_snapshot_based_resize_at_dest(
            self, ctxt, instance, migration):
        """Private version of revert_snapshot_based_resize_at_dest.

        This allows the main method to be decorated with error handlers.

        Steps performed here, in order:

        1. Destroy the guest (including disks) on this destination
           hypervisor.
        2. Activate the source host port bindings, then delete the
           destination host port bindings.
        3. Delete any volume attachments remaining for this host.
        4. Drop the new_flavor move claim from the resource tracker.

        :param ctxt: nova auth request context targeted at the target cell
        :param instance: Instance object whose vm_state is "resized" and
            task_state is "resize_reverting".
        :param migration: Migration object whose status is "reverting".
        """
        # Cleanup the guest from the hypervisor including local disks.
        network_info = self.network_api.get_instance_nw_info(ctxt, instance)
        bdms = instance.get_bdms()
        block_device_info = self._get_instance_block_device_info(
            ctxt, instance, bdms=bdms)
        LOG.debug('Destroying guest from destination hypervisor including '
                  'disks.', instance=instance)
        self.driver.destroy(
            ctxt, instance, network_info, block_device_info=block_device_info)

        # Activate source host port bindings. We need to do this before
        # deleting the (active) dest host port bindings in
        # setup_networks_on_host otherwise the ports will be unbound and
        # finish on the source will fail.
        # migrate_instance_start uses migration.dest_compute for the port
        # binding host and since we want to activate the source host port
        # bindings, we need to temporarily mutate the migration object.
        with utils.temporary_mutation(
                migration, dest_compute=migration.source_compute):
            LOG.debug('Activating port bindings for source host %s.',
                      migration.source_compute, instance=instance)
            # TODO(mriedem): https://review.opendev.org/#/c/594139/ would allow
            # us to remove this and make setup_networks_on_host do it.
            # TODO(mriedem): Should we try/except/log any errors but continue?
            self.network_api.migrate_instance_start(
                ctxt, instance, migration)

        # Delete port bindings for the target host.
        LOG.debug('Deleting port bindings for target host %s.',
                  self.host, instance=instance)
        try:
            # Note that deleting the destination host port bindings does
            # not automatically activate the source host port bindings.
            self.network_api.cleanup_instance_network_on_host(
                ctxt, instance, self.host)
        except exception.PortBindingDeletionFailed as e:
            # Do not let this stop us from cleaning up since the guest
            # is already gone.
            LOG.error('Failed to delete port bindings from target host. '
                      'Error: %s', six.text_type(e), instance=instance)

        # Delete any volume attachments remaining for this target host.
        LOG.debug('Deleting volume attachments for target host.',
                  instance=instance)
        self._delete_volume_attachments(ctxt, bdms)

        # Free up the new_flavor usage from the resource tracker for this host.
        self.rt.drop_move_claim_at_dest(ctxt, instance, migration)
 4676 
 4677     def _revert_instance_flavor_host_node(self, instance, migration):
 4678         """Revert host, node and flavor fields after a resize-revert."""
 4679         self._set_instance_info(instance, instance.old_flavor)
 4680         instance.host = migration.source_compute
 4681         instance.node = migration.source_node
 4682         instance.save(expected_task_state=[task_states.RESIZE_REVERTING])
 4683 
    @wrap_exception()
    @reverts_task_state
    @wrap_instance_event(prefix='compute')
    @errors_out_migration
    @wrap_instance_fault
    def finish_revert_snapshot_based_resize_at_source(
            self, ctxt, instance, migration):
        """Reverts a snapshot-based resize at the source host.

        Spawn the guest and re-connect volumes/VIFs on the source host and
        revert the instance to use the old_flavor for resource usage reporting.

        Updates allocations in the placement service to move the source node
        allocations, held by the migration record, to the instance and drop
        the allocations held by the instance on the destination node.

        :param ctxt: nova auth request context targeted at the target cell
        :param instance: Instance object whose vm_state is "resized" and
            task_state is "resize_reverting".
        :param migration: Migration object whose status is "reverting".
        """

        # Serialize with other operations locked on this instance uuid.
        @utils.synchronized(instance.uuid)
        def do_revert():
            LOG.info('Reverting resize on source host.', instance=instance)
            with self._error_out_instance_on_exception(ctxt, instance):
                self._finish_revert_snapshot_based_resize_at_source(
                    ctxt, instance, migration)

        try:
            do_revert()
        finally:
            # Drop the stashed flavor info whether or not the revert
            # succeeded.
            self._delete_stashed_flavor_info(instance)

        # Broadcast to all schedulers that the instance is on this host.
        # This is best effort so if anything fails just log it.
        try:
            self._update_scheduler_instance_info(ctxt, instance)
        except Exception as e:
            LOG.warning('finish_revert_snapshot_based_resize_at_source failed '
                        'during post-processing. Error: %s', e,
                        instance=instance)
 4726 
    def _finish_revert_snapshot_based_resize_at_source(
            self, ctxt, instance, migration):
        """Private version of finish_revert_snapshot_based_resize_at_source.

        This allows the main method to be decorated with error handlers.

        Ordering matters here: allocations are reverted first, then volume
        attachments and port bindings are moved back to this host, and only
        then is the guest re-defined via the driver.

        :param ctxt: nova auth request context targeted at the source cell
        :param instance: Instance object whose vm_state is "resized" and
            task_state is "resize_reverting".
        :param migration: Migration object whose status is "reverting".
        """
        # Get stashed old_vm_state information to determine if guest should
        # be powered on after spawn; we default to ACTIVE for backwards
        # compatibility if old_vm_state is not set
        old_vm_state = instance.system_metadata.get(
            'old_vm_state', vm_states.ACTIVE)

        # Revert the flavor and host/node fields to their previous values
        self._revert_instance_flavor_host_node(instance, migration)

        # Move the allocations against the source compute node resource
        # provider, held by the migration, to the instance which will drop
        # the destination compute node resource provider allocations held by
        # the instance. This puts the allocations against the source node
        # back to the old_flavor and owned by the instance.
        try:
            self._revert_allocation(ctxt, instance, migration)
        except exception.AllocationMoveFailed:
            # Log the error but do not re-raise because we want to continue to
            # process ports and volumes below.
            # NOTE(review): the 'instance_uuid' key passed below is not
            # referenced by the format string; harmless but unused.
            LOG.error('Reverting allocation in placement for migration '
                      '%(migration_uuid)s failed. You may need to manually '
                      'remove the allocations for the migration consumer '
                      'against the source node resource provider '
                      '%(source_provider)s and the allocations for the '
                      'instance consumer against the destination node '
                      'resource provider %(dest_provider)s and then run the '
                      '"nova-manage placement heal_allocations" command.',
                      {'instance_uuid': instance.uuid,
                       'migration_uuid': migration.uuid,
                       'source_provider': migration.source_node,
                       'dest_provider': migration.dest_node},
                      instance=instance)

        bdms = instance.get_bdms()
        # prep_snapshot_based_resize_at_source created empty volume attachments
        # that we need to update here to get the connection_info before calling
        # driver.finish_revert_migration which will connect the volumes to this
        # host.
        LOG.debug('Updating volume attachments for target host %s.',
                  self.host, instance=instance)
        # TODO(mriedem): We should probably make _update_volume_attachments
        # (optionally) graceful to errors so we (1) try to process all
        # attachments and (2) continue to process networking below.
        self._update_volume_attachments(ctxt, instance, bdms)

        LOG.debug('Updating port bindings for source host %s.',
                  self.host, instance=instance)
        # TODO(mriedem): Calculate provider mappings when we support
        # cross-cell resize/migrate with ports having resource requests.
        self._finish_revert_resize_network_migrate_finish(
            ctxt, instance, migration, provider_mappings=None)
        network_info = self.network_api.get_instance_nw_info(ctxt, instance)

        # Remember that prep_snapshot_based_resize_at_source destroyed the
        # guest but left the disks intact so we cannot call spawn() here but
        # finish_revert_migration should do the job.
        block_device_info = self._get_instance_block_device_info(
            ctxt, instance, bdms=bdms)
        power_on = old_vm_state == vm_states.ACTIVE
        # driver_error doubles as a flag for the finally block below: when the
        # driver raised we must not mask that exception with a volume error.
        driver_error = None
        try:
            self.driver.finish_revert_migration(
                ctxt, instance, network_info, migration,
                block_device_info=block_device_info, power_on=power_on)
        except Exception as e:
            driver_error = e
            # Leave a hint about hard rebooting the guest and reraise so the
            # instance is put into ERROR state.
            with excutils.save_and_reraise_exception(logger=LOG):
                LOG.error('An error occurred during finish_revert_migration. '
                          'The instance may need to be hard rebooted. Error: '
                          '%s', driver_error, instance=instance)
        else:
            # Perform final cleanup of the instance in the database.
            instance.drop_migration_context()
            # If the original vm_state was STOPPED, set it back to STOPPED.
            vm_state = vm_states.ACTIVE if power_on else vm_states.STOPPED
            self._update_instance_after_spawn(instance, vm_state=vm_state)
            instance.save(expected_task_state=[task_states.RESIZE_REVERTING])
        finally:
            # Complete any volume attachments so the volumes are in-use. We
            # do this regardless of finish_revert_migration failing because
            # the instance is back on this host now and we do not want to leave
            # the volumes in a pending state in case the instance is hard
            # rebooted.
            LOG.debug('Completing volume attachments for instance on source '
                      'host.', instance=instance)
            with excutils.save_and_reraise_exception(
                    reraise=driver_error is not None, logger=LOG):
                self._complete_volume_attachments(ctxt, bdms)

        # Only reached when the driver call succeeded: mark the migration
        # record as fully reverted.
        migration.status = 'reverted'
        migration.save()
 4831 
 4832     @wrap_exception()
 4833     @reverts_task_state
 4834     @wrap_instance_event(prefix='compute')
 4835     @errors_out_migration
 4836     @wrap_instance_fault
 4837     def revert_resize(self, context, instance, migration, request_spec=None):
 4838         """Destroys the new instance on the destination machine.
 4839 
 4840         Reverts the model changes, and powers on the old instance on the
 4841         source machine.
 4842 
 4843         """
 4844         # NOTE(comstud): A revert_resize is essentially a resize back to
 4845         # the old size, so we need to send a usage event here.
 4846         compute_utils.notify_usage_exists(self.notifier, context, instance,
 4847                                           self.host, current_period=True)
 4848 
 4849         with self._error_out_instance_on_exception(context, instance):
 4850             # NOTE(tr3buchet): tear down networks on destination host
 4851             self.network_api.setup_networks_on_host(context, instance,
 4852                                                     teardown=True)
 4853 
 4854             self.network_api.migrate_instance_start(context,
 4855                                                     instance,
 4856                                                     migration)
 4857 
 4858             network_info = self.network_api.get_instance_nw_info(context,
 4859                                                                  instance)
 4860             bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
 4861                     context, instance.uuid)
 4862             block_device_info = self._get_instance_block_device_info(
 4863                                 context, instance, bdms=bdms)
 4864 
 4865             destroy_disks = not self._is_instance_storage_shared(
 4866                 context, instance, host=migration.source_compute)
 4867             self.driver.destroy(context, instance, network_info,
 4868                                 block_device_info, destroy_disks)
 4869 
 4870             self._terminate_volume_connections(context, instance, bdms)
 4871 
 4872             # Free up the new_flavor usage from the resource tracker for this
 4873             # host.
 4874             self.rt.drop_move_claim_at_dest(context, instance, migration)
 4875 
 4876             # RPC cast back to the source host to finish the revert there.
 4877             self.compute_rpcapi.finish_revert_resize(context, instance,
 4878                     migration, migration.source_compute, request_spec)
 4879 
 4880     def _finish_revert_resize_network_migrate_finish(
 4881             self, context, instance, migration, provider_mappings):
 4882         """Causes port binding to be updated. In some Neutron or port
 4883         configurations - see NetworkModel.get_bind_time_events() - we
 4884         expect the vif-plugged event from Neutron immediately and wait for it.
 4885         The rest of the time, the event is expected further along in the
 4886         virt driver, so we don't wait here.
 4887 
 4888         :param context: The request context.
 4889         :param instance: The instance undergoing the revert resize.
 4890         :param migration: The Migration object of the resize being reverted.
 4891         :param provider_mappings: a dict of list of resource provider uuids
 4892             keyed by port uuid
 4893         :raises: eventlet.timeout.Timeout or
 4894                  exception.VirtualInterfacePlugException.
 4895         """
 4896         network_info = instance.get_network_info()
 4897         events = []
 4898         deadline = CONF.vif_plugging_timeout
 4899         if deadline and network_info:
 4900             events = network_info.get_bind_time_events(migration)
 4901             if events:
 4902                 LOG.debug('Will wait for bind-time events: %s', events)
 4903         error_cb = self._neutron_failed_migration_callback
 4904         try:
 4905             with self.virtapi.wait_for_instance_event(instance, events,
 4906                                                       deadline=deadline,
 4907                                                       error_callback=error_cb):
 4908                 # NOTE(hanrong): we need to change migration.dest_compute to
 4909                 # source host temporarily.
 4910                 # "network_api.migrate_instance_finish" will setup the network
 4911                 # for the instance on the destination host. For revert resize,
 4912                 # the instance will back to the source host, the setup of the
 4913                 # network for instance should be on the source host. So set
 4914                 # the migration.dest_compute to source host at here.
 4915                 with utils.temporary_mutation(
 4916                         migration, dest_compute=migration.source_compute):
 4917                     self.network_api.migrate_instance_finish(
 4918                         context, instance, migration, provider_mappings)
 4919         except eventlet.timeout.Timeout:
 4920             with excutils.save_and_reraise_exception():
 4921                 LOG.error('Timeout waiting for Neutron events: %s', events,
 4922                           instance=instance)
 4923 
 4924     @wrap_exception()
 4925     @reverts_task_state
 4926     @wrap_instance_event(prefix='compute')
 4927     @errors_out_migration
 4928     @wrap_instance_fault
 4929     def finish_revert_resize(
 4930             self, context, instance, migration, request_spec=None):
 4931         """Finishes the second half of reverting a resize on the source host.
 4932 
 4933         Bring the original source instance state back (active/shutoff) and
 4934         revert the resized attributes in the database.
 4935 
 4936         """
 4937         try:
 4938             self._finish_revert_resize(
 4939                 context, instance, migration, request_spec)
 4940         finally:
 4941             self._delete_stashed_flavor_info(instance)
 4942 
    def _finish_revert_resize(
        self, context, instance, migration, request_spec=None,
    ):
        """Inner version of finish_revert_resize.

        Sends start/end notifications, reverts the flavor/host/node and
        placement allocations, moves port bindings and volume attachments
        back to this (source) host, re-defines the guest via the driver and
        restores the stashed pre-resize power state.

        :param context: nova auth request context
        :param instance: Instance object being reverted to this host
        :param migration: Migration object of the resize being reverted
        :param request_spec: RequestSpec object, or None when the compute RPC
            API is pinned older than 5.2
        """
        with self._error_out_instance_on_exception(context, instance):
            bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
                context, instance.uuid)
            self._notify_about_instance_usage(
                    context, instance, "resize.revert.start")
            compute_utils.notify_about_instance_action(context, instance,
                self.host, action=fields.NotificationAction.RESIZE_REVERT,
                    phase=fields.NotificationPhase.START, bdms=bdms)

            # Get stashed old_vm_state information to determine if guest should
            # be powered on after spawn; we default to ACTIVE for backwards
            # compatibility if old_vm_state is not set
            old_vm_state = instance.system_metadata.get(
                'old_vm_state', vm_states.ACTIVE)

            # Revert the flavor and host/node fields to their previous values
            self._revert_instance_flavor_host_node(instance, migration)

            try:
                source_allocations = self._revert_allocation(
                    context, instance, migration)
            except exception.AllocationMoveFailed:
                # Unlike the snapshot-based path, this failure is fatal here
                # because the allocations are needed for provider mappings
                # below.
                LOG.error('Reverting allocation in placement for migration '
                          '%(migration_uuid)s failed. The instance '
                          '%(instance_uuid)s will be put into ERROR state but '
                          'the allocation held by the migration is leaked.',
                          {'instance_uuid': instance.uuid,
                           'migration_uuid': migration.uuid})
                raise

            provider_mappings = self._fill_provider_mapping_based_on_allocs(
                context, source_allocations, request_spec)

            self.network_api.setup_networks_on_host(context, instance,
                                                    migration.source_compute)
            self._finish_revert_resize_network_migrate_finish(
                context, instance, migration, provider_mappings)
            network_info = self.network_api.get_instance_nw_info(context,
                                                                 instance)

            # revert_resize deleted any volume attachments for the instance
            # and created new ones to be used on this host, but we
            # have to update those attachments with the host connector so the
            # BDM.connection_info will get set in the call to
            # _get_instance_block_device_info below with refresh_conn_info=True
            # and then the volumes can be re-connected via the driver on this
            # host.
            self._update_volume_attachments(context, instance, bdms)

            block_device_info = self._get_instance_block_device_info(
                    context, instance, refresh_conn_info=True, bdms=bdms)

            power_on = old_vm_state != vm_states.STOPPED
            self.driver.finish_revert_migration(
                context, instance, network_info, migration, block_device_info,
                power_on)

            instance.drop_migration_context()
            instance.launched_at = timeutils.utcnow()
            instance.save(expected_task_state=task_states.RESIZE_REVERTING)

            # Complete any volume attachments so the volumes are in-use.
            self._complete_volume_attachments(context, bdms)

            # if the original vm state was STOPPED, set it back to STOPPED
            LOG.info("Updating instance to original state: '%s'",
                     old_vm_state, instance=instance)
            if power_on:
                instance.vm_state = vm_states.ACTIVE
                instance.task_state = None
                instance.save()
            else:
                # The guest was started powered on by the driver above, so
                # issue a clean shutdown to restore the STOPPED state.
                instance.task_state = task_states.POWERING_OFF
                instance.save()
                self.stop_instance(context, instance=instance,
                                   clean_shutdown=True)

            self._notify_about_instance_usage(
                    context, instance, "resize.revert.end")
            compute_utils.notify_about_instance_action(context, instance,
                self.host, action=fields.NotificationAction.RESIZE_REVERT,
                    phase=fields.NotificationPhase.END, bdms=bdms)
 5029 
 5030     def _fill_provider_mapping_based_on_allocs(
 5031             self, context, allocations, request_spec):
 5032         """Fills and returns the request group - resource provider mapping
 5033         based on the allocation passed in.
 5034 
 5035         :param context: The security context
 5036         :param allocation: allocation dict keyed by RP UUID.
 5037         :param request_spec: The RequestSpec object associated with the
 5038             operation
 5039         :returns: None if the request_spec is None. Otherwise a mapping
 5040             between RequestGroup requester_id, currently Neutron port_id,
 5041             and a list of resource provider UUIDs providing resource for
 5042             that RequestGroup.
 5043         """
 5044         if request_spec:
 5045             # NOTE(gibi): We need to re-calculate the resource provider -
 5046             # port mapping as we have to have the neutron ports allocate
 5047             # from the source compute after revert.
 5048             scheduler_utils.fill_provider_mapping_based_on_allocation(
 5049                 context, self.reportclient, request_spec, allocations)
 5050             provider_mappings = self._get_request_group_mapping(
 5051                 request_spec)
 5052         else:
 5053             # NOTE(gibi): The compute RPC is pinned to be older than 5.2
 5054             # and therefore request_spec is not sent. We cannot calculate
 5055             # the provider mappings. If the instance has ports with
 5056             # resource request then the port update will fail in
 5057             # _update_port_binding_for_instance() called via
 5058             # _finish_revert_resize_network_migrate_finish() in
 5059             # finish_revert_resize.
 5060             provider_mappings = None
 5061         return provider_mappings
 5062 
 5063     def _revert_allocation(self, context, instance, migration):
 5064         """Revert an allocation that is held by migration to our instance."""
 5065 
 5066         # Fetch the original allocation that the instance had on the source
 5067         # node, which are now held by the migration
 5068         orig_alloc = self.reportclient.get_allocations_for_consumer(
 5069             context, migration.uuid)
 5070         if not orig_alloc:
 5071             LOG.error('Did not find resource allocations for migration '
 5072                       '%s on source node %s. Unable to revert source node '
 5073                       'allocations back to the instance.',
 5074                       migration.uuid, migration.source_node, instance=instance)
 5075             return False
 5076 
 5077         LOG.info('Swapping old allocation on %(rp_uuids)s held by migration '
 5078                  '%(mig)s for instance',
 5079                  {'rp_uuids': orig_alloc.keys(), 'mig': migration.uuid},
 5080                  instance=instance)
 5081         # FIXME(gibi): This method is flawed in that it does not handle
 5082         # allocations against sharing providers in any special way. This leads
 5083         # to duplicate allocations against the sharing provider during
 5084         # migration.
 5085         # TODO(cdent): Should we be doing anything with return values here?
 5086         self.reportclient.move_allocations(context, migration.uuid,
 5087                                            instance.uuid)
 5088         return orig_alloc
 5089 
 5090     def _prep_resize(self, context, image, instance, instance_type,
 5091                      filter_properties, node, migration, request_spec,
 5092                      clean_shutdown=True):
 5093 
 5094         if not filter_properties:
 5095             filter_properties = {}
 5096 
 5097         if not instance.host:
 5098             self._set_instance_obj_error_state(instance)
 5099             msg = _('Instance has no source host')
 5100             raise exception.MigrationError(reason=msg)
 5101 
 5102         same_host = instance.host == self.host
 5103         # if the flavor IDs match, it's migrate; otherwise resize
 5104         if same_host and instance_type.id == instance['instance_type_id']:
 5105             # check driver whether support migrate to same host
 5106             if not self.driver.capabilities.get(
 5107                     'supports_migrate_to_same_host', False):
 5108                 # Raise InstanceFaultRollback so that the
 5109                 # _error_out_instance_on_exception context manager in
 5110                 # prep_resize will set the instance.vm_state properly.
 5111                 raise exception.InstanceFaultRollback(
 5112                     inner_exception=exception.UnableToMigrateToSelf(
 5113                         instance_id=instance.uuid, host=self.host))
 5114 
 5115         # NOTE(danms): Stash the new instance_type to avoid having to
 5116         # look it up in the database later
 5117         instance.new_flavor = instance_type
 5118         # NOTE(mriedem): Stash the old vm_state so we can set the
 5119         # resized/reverted instance back to the same state later.
 5120         vm_state = instance.vm_state
 5121         LOG.debug('Stashing vm_state: %s', vm_state, instance=instance)
 5122         instance.system_metadata['old_vm_state'] = vm_state
 5123         instance.save()
 5124 
 5125         if not isinstance(request_spec, objects.RequestSpec):
 5126             # Prior to compute RPC API 5.1 conductor would pass a legacy dict
 5127             # version of the request spec to compute and since Stein compute
 5128             # could be sending that back to conductor on reschedule, so if we
 5129             # got a dict convert it to an object.
 5130             # TODO(mriedem): We can drop this compat code when we only support
 5131             # compute RPC API >=6.0.
 5132             request_spec = objects.RequestSpec.from_primitives(
 5133                 context, request_spec, filter_properties)
 5134             # We don't have to set the new flavor on the request spec because
 5135             # if we got here it was due to a reschedule from the compute and
 5136             # the request spec would already have the new flavor in it from the
 5137             # else block below.
 5138 
 5139         provider_mapping = self._get_request_group_mapping(request_spec)
 5140 
 5141         if provider_mapping:
 5142             try:
 5143                 compute_utils.\
 5144                     update_pci_request_spec_with_allocated_interface_name(
 5145                         context, self.reportclient, instance